// SGMLSplitter.java
// (c) 1997 A.L. Borchers

package SGMLKit;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PushbackReader;
import java.io.Writer;

import java.util.Enumeration;
import java.util.Vector;

// SGMLSplitter separates an SGML stream into its markup (a Vector of Tag
// objects) and its character data (a StringBuffer), and can interleave
// the two again with join(). Each Tag is constructed with the offset into
// the text at which it occurred, which is what join() uses to put it back.
public class SGMLSplitter {

  // Flags handed to PeekabooReader.readToAny() when reading an entity
  public static final boolean includeDelimiter= true;
  public static final boolean expandWhitespace= true;

  // Reader over the stream currently being split; replaced by every call
  // to split()
  private transient PeekabooReader reader= null;

  // Entity table for mappings. Held per instance so that constructing a
  // second splitter (possibly from a different entity file) cannot replace
  // the table used by an existing one. Stays null when the table could not
  // be loaded, in which case entities are passed through as plain text.
  private EntityTable entityTable= null;

  // Create a splitter that uses the default entity table.
  // A failure to build the table is reported on System.err and leaves
  // entity processing disabled rather than failing construction.
  public SGMLSplitter() {
    try {
      entityTable= new EntityTable();
    }
    catch (Exception e) {
      System.err.println("Unable to initialize entity table - " + e.toString());
    }
  }

  // Create a splitter whose entity table is built from entityFile.
  // A failure to build the table is reported on System.err and leaves
  // entity processing disabled rather than failing construction.
  public SGMLSplitter(String entityFile) {
    try {
      entityTable= new EntityTable(entityFile);
    }
    catch (Exception e) {
      System.err.println("Unable to initialize entity table - " + e.toString());
    }
  }

  // Return the tags of a document in a vector.
  // If split() detects a markup error, the tags found up to that point
  // are returned. InvalidSGMLException remains in the throws clause so
  // existing callers keep compiling; split() itself does not propagate it.
  public Vector getTags(InputStream inSGML)
    throws IOException, InvalidSGMLException {
      Vector tags= new Vector();
      split(inSGML, tags, null);
      return tags;
  }

  // Return the text portion of a document in a string.
  // If split() detects a markup error, the text found up to that point
  // is returned. InvalidSGMLException remains in the throws clause so
  // existing callers keep compiling; split() itself does not propagate it.
  public String getText(InputStream inSGML)
    throws IOException, InvalidSGMLException {
      StringBuffer textBuffer= new StringBuffer();
      split(inSGML, null, textBuffer);
      return textBuffer.toString();
  }


  // Split the stream, storing tags in a vector and text in string buffer.
  // Either tags or text may be null, in which case that half of the
  // document is parsed but discarded.
  // Return true if no errors were detected in the markup and false if
  // errors were present. In the event of a markup error, the contents
  // of the stream up to the point where the error occurred are stored.
  public boolean split(InputStream inSGML, Vector tags, StringBuffer text)
    throws IOException {
      reader= new PeekabooReader(new InputStreamReader(inSGML));
      // Offset into the text portion of the document. It advances whether
      // or not the caller asked for the text, so tag positions are the
      // same for getTags() as for a full split.
      int textOffset= 0;
      // Read SGML a character at a time, appending text to its StringBuffer
      // and tags to their Vector
      while (!reader.consumed()) {
        if (reader.nextCharIs('<')) {
          // Beginning of tag detected
          Tag tOut;
          try {
            tOut= new Tag(reader, textOffset);
          }
          catch (InvalidSGMLException e) {
            // Malformed markup: stop here, keeping what was collected
            return false;
          }
          if (tags != null) {
            tags.addElement(tOut);
          }
        }
        else if (reader.nextCharIs('&') && entityTable != null) {
          // Entity open discovered. Read and map the entity
          // TODO: Some entities that may be safe to store in the text
          // buffer (e.g. STAGO, TAGC) will not be safe to inline
          // back into a markup version, ergo a complementary process
          // will be required in join()
          String entity= readEntity(reader);
          if (text != null) {
            text.append(entity);
          }
          textOffset+= entity.length();
        }
        else {
          // Plain text (or we aren't processing entities)
          int next= reader.read();
          if (next < 0) {
            // Defensive: a negative value is taken to mean end of stream
            // (assumes a java.io.Reader style contract - TODO confirm)
            break;
          }
          if (text != null) {
            text.append((char)next);
          }
          textOffset++;
        }
      }
      // If we made it here, there were no errors in the markup
      return true;
  }

  // readEntity
  // PRE: Stream at entity start char (&)
  // POST: return the mapped entity value, or the entity exactly as it
  //       appeared when the table has no mapping for it; stream set at
  //       next char after entity
  private String readEntity(PeekabooReader reader)
    throws IOException {
      // read the &
      reader.read();
      // check if this was a wayward & (followed by whitespace or by the
      // end of the stream)
      if (reader.consumed() || reader.nextCharIsWhitespace()) {
        return "&";
      }
      // Else it really is an entity
      // TODO: Catch any possible illegal entity chars
      char[] entityBreaks= {';',' '};
      String entity= reader.readToAny(entityBreaks,includeDelimiter,expandWhitespace);
      if (entity == null || entity.length() == 0) {
        // Nothing could be read after the &, so treat it as literal text
        return "&";
      }
      // check that the entity was correctly terminated, as this is often done incorrectly...
      char lastChar= entity.charAt(entity.length()-1);
      boolean terminated= (lastChar == ';');
      entity= entity.substring(0,entity.length()-1);
      if (!terminated) {
        // improperly terminated so put the last char back on the reader
        reader.unread(lastChar);
      }
      String entityValue= entityTable.map(entity);
      // If the entity was successfully mapped, return the mapped value
      // else return the uninterpreted entity as it was discovered
      if (entityValue != null) {
        return entityValue;
      }
      return "&" + entity + (terminated ? ";" : "");
  }

  // Join a vector of tags and a string of text together into an SGML
  // document on the output stream.
  // Characters are encoded with the platform default charset, mirroring
  // the InputStreamReader used by split(); writing chars straight to the
  // OutputStream would keep only their low eight bits.
  // A null tags vector or text buffer is treated as empty, and a tag
  // position beyond the end of the text is clamped to the end of the text.
  // The stream is flushed but not closed.
  public void join(Vector tags, StringBuffer textBuffer, OutputStream out)
    throws IOException {
      Writer writer= new OutputStreamWriter(out);
      int textLength= (textBuffer == null) ? 0 : textBuffer.length();
      // Index into the text string
      int textIndex= 0;
      // While there are more tags, interleave them where they belong in
      // the text
      if (tags != null) {
        Enumeration e= tags.elements();
        while (e.hasMoreElements()) {
          Tag nextTag= (Tag)e.nextElement();
          int tagPosition= Math.min(nextTag.getPosition(), textLength);
          if (textIndex < tagPosition) {
            // Text that precedes this tag
            writer.write(textBuffer.substring(textIndex, tagPosition));
            textIndex= tagPosition;
          }
          writer.write(nextTag.toString());
        }
      }
      // write the remaining text
      if (textIndex < textLength) {
        writer.write(textBuffer.substring(textIndex));
      }
      // Push the encoded characters through to the caller's stream
      writer.flush();
  }

}














