// RegExpRule.java
// A.L. Borchers, 1997 November
// University of Kentucky Department of Computer Science

package Scout;

import pat.Regex;

import java.util.Vector;

public class RegExpRule extends Rule {

  // The pattern to be searched. The Regex used for searching is rebuilt
  // on each pass through processDoc so that VAL/Name/ data from previous
  // rules can be interpolated into it.
  String pattern= null;

  // Builds the rule from its attribute hash. The "pattern" attribute is
  // required: a RuleFormatException is thrown when it is missing.
  public RegExpRule(Scout scout, RuleHash h) 
    throws RuleFormatException {
      super(scout,h);
      pattern= attr.get("pattern");
      if (pattern == null) {
        throw new RuleFormatException("Missing required attribute for " + getName());
      }
      scout.logger.log(getName() + ".RegExpRule - ready to search on pattern " + 
        pattern);
  }

  // Overrides processDoc in class Rule.
  //
  // Procedure is to get the first match, store it in results, keep the
  // part of the document string that follows the match, and repeat on
  // that remainder until the pattern no longer matches.
  //
  // Attributes consulted (presence of the key is all that matters):
  //   squeezedoc   - squeeze the document text before searching
  //   squeezematch - squeeze each match before storing it
  //   trim         - trim each match before storing it
  //
  // On failure (interpolation returns null, the document text cannot be
  // accessed or is null) the problem is logged and the method returns
  // without adding any results.
  public synchronized void processDoc() {
    super.processDoc();
    // interpolate VAL/RuleName/ arguments for the pattern
    String searchPattern= interpolateDataValues(pattern);
    if (searchPattern == null) {
      scout.logger.log(getName() + ".processDoc - Data interpolation failed. Cannot continue...");
      return;
    }
    scout.logger.log(getName() + ".processDoc - searching on pattern " + searchPattern);
    // init the regexp for searching documents
    Regex rEx= new Regex(searchPattern);
    String text= null;
    // pre squeeze the document if requested
    try {
      text= doc.getText();
      if (text != null && attr.containsKey("squeezedoc")) {
        text= squeeze(text);
      }
    }
    catch (DocumentAccessException dae) {
      scout.logger.log(getName() + ".processDoc - Exception accessing document text. Cannot continue..." + dae.toString());
      return;
    }
    // Guard against a missing document body instead of failing with a
    // NullPointerException further down.
    if (text == null) {
      scout.logger.log(getName() + ".processDoc - Document text is null. Cannot continue...");
      return;
    }
    // These options do not change while searching, so look them up once.
    boolean squeezeMatch= attr.containsKey("squeezematch");
    boolean trimMatch= attr.containsKey("trim");
    while (rEx.search(text)) {
      String match= rEx.substring();
      String rest= rEx.right();
      // squeeze spaces and/or trim as requested
      if (squeezeMatch) {
        match= squeeze(match);
      }
      if (trimMatch) {
        match= match.trim();
      }
      results.addElement(match);
      if (rest.length() < text.length()) {
        // Normal case: continue with the remainder of the document.
        text= rest;
      }
      else if (text.length() == 0) {
        // Empty match on empty text: nothing left to consume.
        break;
      }
      else {
        // A zero-length match at the very start leaves the remainder
        // equal to the text. Skip one character so the search always
        // makes progress; otherwise this loop would never terminate.
        text= text.substring(1);
      }
    }
  }

  // Squeeze every run of white space (as defined by
  // Character.isWhitespace, which includes newlines, carriage returns
  // and tabs) into a single space char.
  private String squeeze(String in) {
    StringBuffer out= new StringBuffer(in.length());
    int i= 0;
    while (i < in.length()) {
      char c= in.charAt(i);
      i++;
      if (Character.isWhitespace(c)) {
        // Emit one space for the whole run, whatever char started it.
        out.append(' ');
        while (i < in.length() && Character.isWhitespace(in.charAt(i))) {
          i++;
        }
      }
      else {
        out.append(c);
      }
    }
    return out.toString();
  }

}
