// BreadthFirstSearch.java
// A.L. Borchers, December 1997
// University of Kentucky Department of Computer Science
// Scout rule for implementing breadth first search of Web documents

package Scout;

import SGMLKit.*;

import java.net.URL;
import java.net.MalformedURLException;
import java.util.Vector;

// -----------------------------------------------------------------------------
// BreadthFirstSearch
// -----------------------------------------------------------------------------
// Scout rule that collects the links carried by the tags of the current
// document and appends every link not yet in scout's visit record to the end
// of scout's URL queue.
// -----------------------------------------------------------------------------
public class BreadthFirstSearch extends Rule {

  // Value the RestrictHost / RestrictDomain settings are compared against to
  // mean "no restriction"
  private static final String UNSET= "null";

  // Scheme prefix of links that are never followed
  private static final String MAILTO= "mailto:";

  // The obligatory constructor; simply hands the scout and rule hash to Rule
  public BreadthFirstSearch(Scout s, RuleHash h) 
    throws RuleFormatException {
      super(s,h);
  }

  // -----------------------------------------------------------------------------
  // processDoc
  // -----------------------------------------------------------------------------
  // Extract the URLs referenced by the current document's tags and append those
  // that do not show in scout's visit record to scout's URL queue.  Logs and
  // returns without enqueuing anything when the document has no tags.
  // -----------------------------------------------------------------------------
  public synchronized void processDoc() {
    Vector tags= doc.getTags();
    if (tags == null || tags.size() == 0) {
      scout.logger.log(getName() + ".processDoc - No tags found at " + doc.getURL().toString() + ". Returning...");
      return;
    }
    Vector urlsFound= extractLinks(doc.getURL(),tags);
    int enqueued= 0;
    for (int i= 0; i < urlsFound.size(); i++) {
      URL candidate= (URL)urlsFound.elementAt(i);
      // only enqueue the url if it doesn't show in scout's visit record
      if (!scout.visitedURL(candidate)) {
        scout.urls.append(candidate);
        enqueued++;
      }
    }
    scout.logger.log(getName() + " - Enqueued " + enqueued + " URLs");
  }

  // -----------------------------------------------------------------------------
  // extractLinks
  // -----------------------------------------------------------------------------
  // Return a vector of URLs referenced in a set of tags, interpreting the URLs 
  // as necessary in context of the current URL.
  //
  //   currentURL - URL of the document the tags came from; relative links are
  //                resolved against it
  //   tags       - vector of Tag objects to scan
  //
  // A link is dropped when it is malformed, already visited, excluded, or falls
  // outside the configured host / domain restriction.  The host restriction
  // takes precedence over the domain restriction when both are set.
  // -----------------------------------------------------------------------------
  private Vector extractLinks(URL currentURL, Vector tags) {
    scout.logger.log(getName() + " - Extracting links from URL " + currentURL.toString());
    // buffer relevant config information
    String restrictHost= scout.config.get("SCOUT","RestrictHost");
    String restrictDomain= scout.config.get("SCOUT","RestrictDomain");
    // NOTE(review): a Java null from the config is treated like the "null"
    // sentinel (no restriction) instead of raising a NullPointerException -
    // confirm this is the desired default
    boolean hostRestricted= isSet(restrictHost);
    boolean domainRestricted= isSet(restrictDomain);
    Vector out= new Vector();
    for (int i= 0; i < tags.size(); i++) {
      // the whole per-tag body stays inside the try, as the scout helpers
      // called below may themselves report a malformed URL
      try {
        Tag nextTag= (Tag)tags.elementAt(i);
        // if the tag bears a link, get the attribute that it's stored under
        String linkAttribute= getLinkAttribute(nextTag);
        if (linkAttribute == null) {
          continue;
        }
        // extract the link; check for null BEFORE touching the string
        String urlStr= (String)nextTag.get(linkAttribute);
        if (urlStr == null) {
          continue;
        }
        // remove surrounding quote chars and any anchor
        urlStr= stripAnchor(stripQuotes(urlStr));
        // construct absolute URL relative to current URL
        URL url= new URL(currentURL,urlStr);
        urlStr= url.toString();
        // already seen or excluded: nothing to store
        if (scout.visitedURL(urlStr) || scout.exclusions.exclude(url)) {
          continue;
        }
        if (hostRestricted) {
          // expand on host only
          if (url.getHost().equals(restrictHost)) {
            out.addElement(url);
          }
        }
        else if (domainRestricted) {
          // expand on domain only
          if (url.getHost().endsWith(restrictDomain)) {
            out.addElement(url);
          }
        }
        else {
          // expand anywhere
          out.addElement(url);
        }
      }
      catch (MalformedURLException ignored) {
        // deliberately skipped: a malformed link is not worth aborting the
        // extraction of the remaining tags
      }
    }
    return out;
  }

  // -----------------------------------------------------------------------------
  // getLinkAttribute
  // -----------------------------------------------------------------------------
  // If this is a link type tag, return the attribute (key) that refers to a 
  // link, else return null.  Recognized: "href" on "a" and "area" tags (unless
  // the value is a mailto link or missing) and "src" on "frame" tags.
  // -----------------------------------------------------------------------------
  private String getLinkAttribute(Tag tag) {
    String identifier= tag.getIdentifier();
    // garden variety hyperlinks and client side map anchors
    if (("a".equals(identifier) || "area".equals(identifier)) && tag.containsKey("href")) {
      String href= (String)tag.get("href");
      // we have no use for mailto, nor for an href key without a value
      if (href == null || isMailto(href)) {
        return null;
      }
      return "href";
    }
    // frame reference
    if ("frame".equals(identifier) && tag.containsKey("src")) {
      return "src";
    }
    return null;
  }

  // -----------------------------------------------------------------------------
  // isSet
  // -----------------------------------------------------------------------------
  // True if a restriction setting holds an actual value, i.e. is neither a
  // Java null nor the "null" sentinel string
  // -----------------------------------------------------------------------------
  private static boolean isSet(String setting) {
    return setting != null && !setting.equals(UNSET);
  }

  // -----------------------------------------------------------------------------
  // isMailto
  // -----------------------------------------------------------------------------
  // True if an attribute value (possibly quoted as stored in the tag) is a
  // mailto link.  Only the start of the value is examined, ignoring case, so
  // that ordinary links merely containing "mailto:" are still followed
  // -----------------------------------------------------------------------------
  private static boolean isMailto(String href) {
    String target= stripQuotes(href.trim()).trim();
    return target.regionMatches(true,0,MAILTO,0,MAILTO.length());
  }

  // -----------------------------------------------------------------------------
  // stripQuotes
  // -----------------------------------------------------------------------------
  // Remove a leading single or double quote char and, if present, the matching
  // trailing one.  Unquoted values are returned unchanged; a value consisting
  // of a lone quote char yields the empty string
  // -----------------------------------------------------------------------------
  private static String stripQuotes(String value) {
    if (value.length() == 0) {
      return value;
    }
    char quote= value.charAt(0);
    if (quote != '"' && quote != '\'') {
      return value;
    }
    int end= value.length();
    // only drop the last char when it really is the closing quote
    if (end > 1 && value.charAt(end - 1) == quote) {
      end--;
    }
    return value.substring(1,end);
  }

  // -----------------------------------------------------------------------------
  // stripAnchor
  // -----------------------------------------------------------------------------
  // Cut a value off at its first '#', if any
  // -----------------------------------------------------------------------------
  private static String stripAnchor(String value) {
    int anchorStart= value.indexOf('#');
    return anchorStart < 0 ? value : value.substring(0,anchorStart);
  }

}
