/* 

Nobots.java
A.L. Borchers, 1997 November
University of Kentucky Department of Computer Science

Implementation of Robots Exclusion Protocol used to keep robots from going 
where webmasters (and robot operators) don't want them going. Also implements 
avoidance of anathema file extensions, which is not a part of REP.

A basic set of path and type exclusions, defined by the static arrays
defaultPathExclusions and defaultTypeExclusions respectively, is used unless
disabled through the corresponding constructor flag. The enabled defaults are
loaded automatically when a Nobots object is created, and the default paths are
added again alongside the exclusions obtained for each host.

Other methods of interest are

getHostExclusions(hostName)

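Looks for a locally stored copy of the exclusions for the named host in the
Nobots data directory and, if there is none and Scout allows remote requests,
contacts the host and requests /robots.txt. Any paths disallowed therein for us
specifically (our Scout agent name) or for all robots (*) are added to the
pathExclusions vector.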

getCurrentHost()

Returns the name of the host to which the current exclusion set applies

addPathExclusion(path) and addTypeExclusion(type)

Do what their names imply. Be aware that path exclusions added by these
methods are erased the next time getHostExclusions() is called; type
exclusions are not part of REP and are kept. Also available are

addPathExclusions(paths) and addTypeExclusions(types)

which take arrays rather than single strings

*/

package Scout;

import HTTPClient.*;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.Serializable;

import java.io.IOException;

import java.net.URL;

import java.util.Hashtable;
import java.util.Vector;

/**
 * Robots Exclusion Protocol (REP) support for Scout, plus avoidance of
 * unwanted file extensions (which is not part of REP).
 *
 * <p>A Nobots object holds two lists: path exclusions for the current host
 * (rebuilt by every call to {@link #getHostExclusions(String)}) and file type
 * exclusions (kept for the lifetime of the object). {@link #exclude(URL)}
 * checks a URL against both.
 *
 * <p>Only private members may be added to this class without changing its
 * default serialVersionUID; keep that in mind when extending it.
 */
public class Nobots implements Serializable {

  // the scout we work for; it is transient, so it is null after
  // deserialization until setScout() is called again
  private transient Scout scout= null;

  // directory in which to store host information
  // a copy of the robots.txt file for each site
  // visited is stored here so we don't have to go
  // back to the host on each revisit
  public static final String dataDir= "Nobots";

  // Flags to indicate whether to use built in path and type default exclusions
  private boolean useDefaultPathExclusions= true;
  private boolean useDefaultTypeExclusions= true;

  // paths to ignore on any host
  public static final String[] defaultPathExclusions= {
    // scripts
    "/cgi-bin/",
    "/htbin/",
    "/cgi-win/"
  };

  // file types to ignore (generally binary types)
  public static final String[] defaultTypeExclusions= {
    // java and javascript types
    ".java", ".class", ".js",
    // image types
    ".gif",".jpeg",".jpg",".png",".bmp",
    // audio types
    ".wav",".mid",".au",".ra",".ram",".mod",".mp2",
    // video types
    ".mpg",".avi",".mov",
    // other multimedia types
    ".wrl"
  };

  // current host
  private String currentHost= null;

  // paths excluded on the current host
  private Vector pathExclusions= null;

  // file types excluded
  private Vector typeExclusions= null;

  /**
   * Creates a Nobots object and makes sure the data directory exists.
   *
   * @param useDefaultPathExclusions whether defaultPathExclusions are applied
   * @param useDefaultTypeExclusions whether defaultTypeExclusions are applied
   * @throws IOException if the data directory cannot be created or its name
   *         is taken by something that is not a directory
   */
  public Nobots(boolean useDefaultPathExclusions, boolean useDefaultTypeExclusions)
    throws IOException {
    // set our internal flags
    this.useDefaultPathExclusions= useDefaultPathExclusions;
    this.useDefaultTypeExclusions= useDefaultTypeExclusions;
    // verify the directory for nobots data
    File d= new File(dataDir);
    if (!d.exists() && !d.mkdirs()) {
      throw new IOException("Couldn't create directory " + dataDir);
    }
    else if (d.exists() && !d.isDirectory()) {
      throw new IOException(dataDir + " exists as file");
    }
    // init the exclusion lists; the private helper is used rather than the
    // public add methods so that a subclass override cannot run against a
    // partially constructed object
    pathExclusions= new Vector();
    if (useDefaultPathExclusions) {
      appendAll(pathExclusions, defaultPathExclusions);
    }
    typeExclusions= new Vector();
    if (useDefaultTypeExclusions) {
      appendAll(typeExclusions, defaultTypeExclusions);
    }
  }

  /**
   * Sets the scout member, which supplies the logger, the agent name and the
   * permission to request robots files remotely.
   */
  public void setScout(Scout s) {
    scout= s;
  }

  /**
   * Reports whether a URL should be excluded based on the current exclusion
   * vectors.
   *
   * <p>Only the file part of the URL (path plus query, as returned by
   * URL.getFile()) is examined, never the protocol, host or fragment, so a
   * host-only URL such as http://www.abc.net.au is not mistaken for an ".au"
   * audio file. Type exclusions are compared, ignoring case, with the end of
   * the path (query removed). Path exclusions match when they occur anywhere
   * in the path plus query; this is deliberately broader than the prefix
   * matching REP asks for, so that defaults such as "/cgi-bin/" also catch
   * script directories below the server root.
   *
   * @param url the URL to check
   * @return true if the URL matches a type or path exclusion
   */
  public boolean exclude(URL url) {
    String file= url.getFile();
    if (file == null || file.length() == 0) {
      // a URL without a path addresses the server root
      file= "/";
    }
    String path= file;
    int query= path.indexOf('?');
    if (query >= 0) {
      path= path.substring(0, query);
    }
    // check type extension exclusions
    for (int i= 0; i < typeExclusions.size(); i++) {
      if (endsWithIgnoreCase(path, (String)typeExclusions.elementAt(i))) {
        return true;
      }
    }
    // check path exclusions
    for (int i= 0; i < pathExclusions.size(); i++) {
      if (file.indexOf((String)pathExclusions.elementAt(i)) >= 0) {
        return true;
      }
    }
    // return false if passed all exclusions
    return false;
  }

  /**
   * Returns the name of the current host, i.e. the host that the current
   * path exclusions apply to (null before the first getHostExclusions call).
   */
  public String getCurrentHost() {
    return currentHost;
  }

  /**
   * Makes hostName the current host and rebuilds the path exclusions for it
   * from the local data directory or, failing that, from the host itself.
   * Path exclusions added earlier are discarded; type exclusions are left
   * alone since types are not part of REP.
   *
   * @param hostName name of the host whose exclusions are wanted
   */
  public void getHostExclusions(String hostName) {
    log("Nobots.getHostExclusions - Getting exclusions for host " + hostName);
    currentHost= hostName;
    pathExclusions= new Vector();
    loadExclusions();
    if (useDefaultPathExclusions) {
      appendAll(pathExclusions, defaultPathExclusions);
    }
    log("Nobots.getHostExclusions - Stored " + pathExclusions.size() +
        " excluded paths and " + typeExclusions.size() + " excluded types for " + currentHost);
  }

  /** Adds a path to the exclusions for the current host; null is ignored. */
  public void addPathExclusion(String path) {
    if (path != null) {
      pathExclusions.addElement(path);
    }
  }

  /** Adds an array of path exclusions; a null array or null entries are ignored. */
  public void addPathExclusions(String[] paths) {
    appendAll(pathExclusions, paths);
  }

  /** Adds a file type exclusion such as ".gif"; null is ignored. */
  public void addTypeExclusion(String type) {
    if (type != null) {
      typeExclusions.addElement(type);
    }
  }

  /** Adds an array of type exclusions; a null array or null entries are ignored. */
  public void addTypeExclusions(String[] types) {
    appendAll(typeExclusions, types);
  }

  // append every non-null entry of items to target
  private static void appendAll(Vector target, String[] items) {
    if (items == null) {
      return;
    }
    for (int i= 0; i < items.length; i++) {
      if (items[i] != null) {
        target.addElement(items[i]);
      }
    }
  }

  // case-insensitive, locale-independent String.endsWith
  private static boolean endsWithIgnoreCase(String s, String suffix) {
    // a negative offset (suffix longer than s) makes regionMatches return false
    return s.regionMatches(true, s.length() - suffix.length(), suffix, 0, suffix.length());
  }

  // case-insensitive, locale-independent String.startsWith
  private static boolean startsWithIgnoreCase(String s, String prefix) {
    return s.regionMatches(true, 0, prefix, 0, prefix.length());
  }

  // a host name is used as a file name inside dataDir, so it must not be
  // able to address anything outside (or the directory itself)
  private static boolean isSafeHostName(String host) {
    return host != null
      && host.length() > 0
      && !host.equals(".")
      && !host.equals("..")
      && host.indexOf('/') < 0
      && host.indexOf('\\') < 0
      && host.indexOf(File.separatorChar) < 0;
  }

  // load the exclusions for the current host, from the local copy if there
  // is one, otherwise from the host itself (when scout allows it)
  private void loadExclusions() {
    if (!isSafeHostName(currentHost)) {
      log("Nobots.loadExclusions - Host name " + currentHost +
          " is not usable as a data file name, no exclusions loaded");
      return;
    }
    String hostFile= dataDir + "/" + currentHost;
    InputStream in= null;
    File cached= new File(hostFile);
    if (cached.isFile()) {
      try {
        in= new FileInputStream(cached);
      }
      catch (Exception e) {
        // fall back to the remote host, but say why the local copy was skipped
        log("Nobots.loadExclusions - Couldn't open local data " + hostFile + " - " + e.toString());
      }
    }
    // a copy is only written when the data comes from the remote host
    String outFile= (in == null) ? hostFile : null;
    if (in == null) {
      log("Nobots.loadExclusions - No local data found for host " + currentHost);
      // load it from the remote host only if we are instructed to by scout.
      // This is useful when we're doing analysis out of the cache and don't
      // want to waste time on the network...
      if (scout == null) {
        log("Nobots.loadExclusions - No scout set, remote request for robots file skipped");
      }
      else if (scout.requestRobotsFile) {
        in= getHostInputStream();
      }
      else {
        log("Nobots.loadExclusions - Remote request for robots file disabled in Scout");
      }
    }
    if (in != null) {
      readExclusions(in,outFile);
    }
    log("Nobots.loadExclusions - Read " + pathExclusions.size() + " path exclusions for host " +
        currentHost);
  }

  // get a stream for reading exclusions from a host; returns null when the
  // host can't be reached or doesn't answer with status 200
  // TODO: implement port awareness for Web server
  // currently only works on port 80 servers
  private InputStream getHostInputStream() {
    InputStream in= null;
    try {
      log("Nobots.getHostInputStream - Checking for robots exclusion file on " + currentHost);
      HTTPConnection connection= new HTTPConnection(currentHost);
      HTTPResponse response= connection.Get("/robots.txt");
      if (response.getStatusCode() != 200) {
        log("Nobots.getHostInputStream - HTTP response indicates no robots.txt file");
      }
      else {
        in= response.getInputStream();
      }
    }
    catch (Exception e) {
      log("Nobots.getHostInputStream - Error getting robots exclusion data from host " +
          currentHost + " - " + e.toString());
    }
    return in;
  }

  // read exclusions from input stream, writing a comment-stripped copy to
  // outFile if given. The stream is always closed. Disallow rules are added
  // for records whose User-agent is "*" or starts with our agent name
  // (ignoring case); consecutive User-agent lines belong to the same record,
  // so the record applies if any one of them matches.
  private void readExclusions(InputStream in, String outFile) {
    BufferedReader reader= null;
    BufferedWriter writer= null;
    boolean failed= false;
    try {
      reader= new BufferedReader(new InputStreamReader(in));
      if (outFile != null) {
        try {
          writer= new BufferedWriter(new FileWriter(outFile));
        }
        catch (IOException e) {
          // not being able to keep a local copy is no reason to ignore the rules
          log("Nobots.readExclusions - Couldn't open output file " + outFile + " - " + e.toString());
        }
      }
      // without a scout we only know the rules meant for all robots
      String agentName= (scout == null) ? null : scout.agentName;
      // track whether rules apply to us
      boolean ruleApplies= false;
      // track whether the previous record line was a User-agent line
      boolean lastWasAgent= false;
      String line= null;
      while ((line= reader.readLine()) != null) {
        line= removeComments(line,'#');
        if (line.equals("")) {
          continue;
        }
        if (writer != null) {
          writer.write(line + "\n");
        }
        // determine if this is an agent field or disallow rule
        int split= line.indexOf(':');
        if (split <= 0) {
          log("Nobots.readExclusions - Invalid record in robots file");
          continue;
        }
        String key= line.substring(0,split).trim();
        String val= line.substring(split+1).trim();
        // equalsIgnoreCase keeps the key match independent of the default locale
        boolean isDisallow= key.equalsIgnoreCase("disallow");
        if (val.equals("")) {
          if (isDisallow) {
            // an empty Disallow is valid REP and excludes nothing
            lastWasAgent= false;
          }
          else {
            log("Nobots.readExclusions - Invalid record in robots file");
          }
          continue;
        }
        if (key.equalsIgnoreCase("user-agent")) {
          boolean matches= val.equals("*")
            || (agentName != null && startsWithIgnoreCase(val, agentName));
          ruleApplies= lastWasAgent ? (ruleApplies || matches) : matches;
          lastWasAgent= true;
        }
        else {
          lastWasAgent= false;
          if (isDisallow && ruleApplies) {
            addPathExclusion(val);
          }
        }
      }
    }
    catch (Exception e) {
      failed= true;
      log("Nobots.readExclusions - Error reading robots exclusion data " + e.toString());
      e.printStackTrace();
    }
    finally {
      // release both ends whatever happened above
      try {
        if (reader != null) {
          reader.close();
        }
        else {
          in.close();
        }
      }
      catch (IOException e) {
        log("Nobots.readExclusions - Error closing robots exclusion data " + e.toString());
      }
      if (writer != null) {
        try {
          writer.close();
        }
        catch (IOException e) {
          // the copy may be incomplete if the final flush failed
          failed= true;
          log("Nobots.readExclusions - Error closing output file " + outFile + " - " + e.toString());
        }
      }
    }
    // don't preserve the output file if there was an error
    if (failed && outFile != null) {
      File partial= new File(outFile);
      if (partial.exists() && !partial.delete()) {
        log("Nobots.readExclusions - Error deleting output file " + outFile);
      }
    }
  }

  // remove char delimited comments from a line; the line is trimmed first
  // and everything from the first commentChar on is dropped
  private String removeComments(String line, char commentChar) {
    String trimmed= line.trim();
    int commentStart= trimmed.indexOf(commentChar);
    if (commentStart == -1) {
      return trimmed;
    }
    return trimmed.substring(0,commentStart);
  }

  // Print a message to the logger in scout or stdout if no scout defined
  private void log(String logMessage) {
    if (scout != null) {
      scout.logger.log(logMessage);
    }
    else {
      System.out.println(logMessage);
    }
  }


}
