// WordCounterRule.java
// A.L. Borchers, 1997 November
// University of Kentucky Department of Computer Science

// Rule for counting words in the text component of a document
//
// ALLOWABLE ARGUMENTS:
// STOPLIST   (LIST) Comma delimited list of stop words 
// STEM       (FLAG) Whether to treat stoplist as word stems 
// SQUEEZEDOC (FLAG) same as RegExpRule - squeeze spaces
// MATCHCASE  (FLAG) Enforce strict case matching in word comparisons
// ACCUMULATE (FLAG) Collect last document data and append/revise to it
// VALIDATE   (FLAG) This rule does its own validation, so the VALIDATE flag, though
//                   acceptable, should generally be omitted from templates referencing this Rule
//
// RESULT:
// A vector of (word,number) tuples


package Scout;

import java.util.StringTokenizer;
import java.util.Vector;

// Rule that counts the words found in the text of a document.
//
// The text is split on white space; each token has its leading and trailing
// non-alphanumeric characters removed.  Tokens that are non-empty and not on
// the stop list are tallied as WordCounterRuleWord entries in `results`
// (not declared in this class; presumably inherited from Rule), which this
// class keeps ordered by word so a lookup and an insert use the same index.
public class WordCounterRule extends Rule {

  // Words to ignore; null when no "stoplist" argument was supplied
  private String[] stopList= null;

  // Flag whether to treat elements of stopList as stems (prefixes)
  private boolean stem= false;

  // Flag whether to squeeze runs of white space before tokenizing
  private boolean squeezeDoc= false;

  // Flag whether to match case in word comparisons
  private boolean matchCase= false;

  // Flag whether to start from the results of the previous document
  private boolean accumulate= false;

  // Build the rule from its argument hash: parse the optional comma
  // delimited "stoplist" and read the "stem", "squeezedoc", "matchcase"
  // and "accumulate" flags (a flag is on when its key is present).
  public WordCounterRule(Scout scout, RuleHash h)
    throws RuleFormatException {
      super(scout,h);
      // Load the stop list if provided
      String stopWords= attr.get("stoplist");
      if (stopWords != null) {
        StringTokenizer st= new StringTokenizer(stopWords,",");
        String[] parsed= new String[st.countTokens()];
        int count= 0;
        while (st.hasMoreTokens()) {
          // Trim so that "a, the" yields "the" rather than " the", which
          // could never equal a white space delimited token.
          String word= st.nextToken().trim();
          // Drop blank entries: an empty stem would be a prefix of every word.
          if (word.length() > 0) {
            parsed[count++]= word;
          }
        }
        stopList= new String[count];
        System.arraycopy(parsed,0,stopList,0,count);
      }
      // Set the flags
      stem= attr.containsKey("stem");
      squeezeDoc= attr.containsKey("squeezedoc");
      matchCase= attr.containsKey("matchcase");
      accumulate= attr.containsKey("accumulate");
      // Inform the log of startup state
      scout.logger.log(getName() + ".WordCounterRule - Ignoring " + stopWords + "; Stemming == " +
                       (stem ? "on" : "off") + "; Squeezing == " + (squeezeDoc ? "on" : "off"));
  }

  // Count the words of the current document into `results`.
  // When accumulating (and this is not the first document) the results of
  // the previous document are loaded first and then added to.
  // If the document text cannot be read the failure is logged and the
  // method returns without counting anything.
  public void processDoc() {
    if (accumulate && sequenceNumber > 0) {
      // preload results with results of last document
      scout.logger.log(getName() + " requesting results for document " + (sequenceNumber-1));
      results= scout.ruleResults.get(this,getName(),(sequenceNumber-1));
      scout.logger.log(getName() + " acquired results for document " + (sequenceNumber-1));
    }
    String text= null;
    try {
      text= squeezeDoc ? squeeze(doc.getText()) : doc.getText();
    }
    catch (DocumentAccessException dae) {
      scout.logger.log(getName() + ".WordCounterRule - Exception getting document text. Cannot continue. " + dae.toString());
      return;
    }
    StringTokenizer st= new StringTokenizer(text);
    while (st.hasMoreTokens()) {
      String nextWord= stripPunctuation(st.nextToken());
      // A token made only of punctuation (e.g. "--") strips to "" and is
      // not a word; stopped words are ignored as well.
      if (nextWord.length() == 0 || stopWord(nextWord)) {
        continue;
      }
      int wordIndex= findWordIndex(nextWord);
      if (wordIndex < results.size()) {
        // Either the entry at wordIndex is this word (recurring) or it is
        // the first entry ordered after it, i.e. the insertion point.
        WordCounterRuleWord w= (WordCounterRuleWord)results.elementAt(wordIndex);
        if (w.equals(nextWord,matchCase)) {
          // Recurring
          w.incrementCount();
        }
        else {
          // New word
          results.insertElementAt(new WordCounterRuleWord(nextWord),wordIndex);
        }
      }
      else {
        // New word appending at end
        results.addElement(new WordCounterRuleWord(nextWord));
      }
    }
  }

  // Return the index at which the word is or should be: the index of the
  // first entry that does not order before s, or results.size() if none.
  // The ordering follows matchCase so that it agrees with the match test in
  // processDoc; a case-sensitive ordering here would let "Apple" and "apple"
  // land at different indexes and be counted separately or together
  // depending on which one was seen first.
  // NOTE(review): assumes WordCounterRuleWord.equals(String, boolean)
  // ignores case when its flag is false - confirm against that class.
  private int findWordIndex(String s) {
    int size= results.size();
    for (int i= 0; i < size; i++) {
      WordCounterRuleWord w= (WordCounterRuleWord)results.elementAt(i);
      int order= matchCase ? w.word.compareTo(s) : w.word.compareToIgnoreCase(s);
      if (order >= 0) {
        return i;
      }
    }
    return size;
  }


  // Return true if word is stopped: it equals a stop list entry, or, when
  // stemming, it starts with one.  Case is significant only when matchCase
  // is set, in line with the other word comparisons of this rule.
  private boolean stopWord(String s) {
    if (stopList == null) {
      return false; // no stop list, nothing is stopped
    }
    for (int i= 0; i < stopList.length; i++) {
      String stop= stopList[i];
      boolean stopped;
      if (stem) {
        // Prefix test; regionMatches is false when stop is longer than s
        stopped= s.regionMatches(!matchCase,0,stop,0,stop.length());
      }
      else {
        stopped= matchCase ? s.equals(stop) : s.equalsIgnoreCase(stop);
      }
      if (stopped) {
        return true;
      }
    }
    return false;
  }

  // Remove any leading or trailing non-alphanumeric characters.
  // Interior characters are left alone; a string with no letter or digit
  // at all comes back empty.
  private String stripPunctuation(String s) {
    int start= 0;
    int end= s.length();
    while (start < end && !Character.isLetterOrDigit(s.charAt(start))) {
      start++;
    }
    while (end > start && !Character.isLetterOrDigit(s.charAt(end-1))) {
      end--;
    }
    return s.substring(start,end);
  }

  // Squeeze each run of white space down to its first character, after
  // converting any newline or carriage return to a space.  (A run that
  // starts with another white space character, e.g. a tab, keeps that
  // character rather than becoming a space.)
  private String squeeze(String in) {
    StringBuffer out= new StringBuffer(in.length());
    boolean lastWasSpace= false;
    for (int i= 0; i < in.length(); i++) {
      char c= in.charAt(i);
      if (c == '\n' || c == '\r') {
        c= ' ';
      }
      boolean isSpace= Character.isWhitespace(c);
      if (isSpace && lastWasSpace) {
        continue; // already emitted the first character of this run
      }
      out.append(c);
      lastWasSpace= isSpace;
    }
    return out.toString();
  }


}
