package Scout;

import HTTPClient.*;

import SGMLKit.*;

import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;

import java.net.MalformedURLException;
import java.net.URL;

import java.util.StringTokenizer;
import java.util.Vector;


//
//
// ImageGrabber - download the images 
//
//
/**
 * Rule that scans the tags of each processed document for image references
 * (<code>img src</code>, <code>body background</code> and <code>a href</code>
 * ending in a configured image extension) and downloads every image not
 * seen before into a local directory.
 *
 * Rule attributes read by this class:
 *   "types"  - comma separated list of image file extensions
 *   "outdir" - directory the images are written to (default "Images")
 */
class ImageGrabber extends Rule {

  // Default Directory for storing images collected
  public static final String defaultImageDir= "Images";

  // Extension given to downloaded files whose URL carries no extension
  private static final String DEFAULT_EXTENSION= "img";

  // Base name given to downloaded files whose URL carries no file name
  private static final String DEFAULT_BASENAME= "image";

  // Size of the buffer used when copying image bytes to disk
  private static final int COPY_BUFFER_SIZE= 4096;

  // Directory for storing images collected
  String imageDir= null;

  // List of extensions to accept as images (lower case, leading '.')
  Vector imageTypes= null;

  // Hostname the current httpConnection was created for
  String documentHost= null;

  // Use httpConnection to load images
  HTTPConnection httpConnection= null;

  // Use VisitRecord to remember images previously loaded
  VisitRecord visits= null;


  /**
   * Creates the rule and reads its configuration.
   *
   * @throws RuleFormatException if the image directory cannot be created
   *         or is not a writable directory
   */
  public ImageGrabber(Scout s, RuleHash rh)
    throws RuleFormatException {
      super(s,rh);
      configure();
  }

  /**
   * Reads the "types" and "outdir" rule attributes, verifies (creating it
   * if necessary) the image directory and initialises the visit record.
   *
   * @throws RuleFormatException if the image directory cannot be created,
   *         exists as a plain file, or is not writable
   */
  private void configure()
    throws RuleFormatException {
    // Rule specific arguments are "types" (image file extensions) and
    // "outdir" (directory to store collected images in)
    String confItem= attr.get("types");
    imageTypes= new Vector();
    if (confItem != null) {
      StringTokenizer st= new StringTokenizer(confItem,",");
      while (st.hasMoreTokens()) {
        // tolerate blanks around the commas, e.g. "gif, jpg"
        String nextType= st.nextToken().trim().toLowerCase();
        if (nextType.length() == 0) {
          continue;
        }
        // extensions are stored with leading '.' assured
        if (!nextType.startsWith(".")) {
          nextType= "." + nextType;
        }
        imageTypes.addElement(nextType);
      }
    }
    scout.logger.log(getName() + ".configure() - Matching " + imageTypes.size() + " image types");

    // Assign and verify image directory; fall back to the default when the
    // attribute is missing or blank
    String outDir= attr.get("outdir");
    imageDir= (outDir == null || outDir.trim().length() == 0) ? defaultImageDir : outDir;
    // drop a trailing '/', but never reduce "/" itself to the empty string
    if (imageDir.length() > 1 && imageDir.endsWith("/")) {
      imageDir= imageDir.substring(0,imageDir.length()-1);
    }
    File d= new File(imageDir);
    if (!d.exists()) {
      if (!d.mkdirs()) {
        throw new RuleFormatException("Couldn't create image directory");
      }
    }
    if (!d.isDirectory()) {
      throw new RuleFormatException("Image directory exists as file");
    }
    if (!d.canWrite()) {
      throw new RuleFormatException("Cannot write to image directory");
    }
    scout.logger.log(getName() + ".configure() - Writing images to " + imageDir);

    // Init the list of image URLs visited
    visits= new VisitRecord();
  }

  /**
   * Scans the tags of the current document and downloads every referenced
   * image that has not been recorded as visited yet. An image is recorded
   * after the download attempt whether or not the attempt succeeded.
   */
  public synchronized void processDoc() {
    super.processDoc();
    // Scan links for images and download those found
    Vector tags= doc.getTags();
    for (int i= 0; i < tags.size(); i++) {
      Tag t= (Tag)tags.elementAt(i);
      String imgURL= getImageURL(t);
      if (imgURL == null) {
        continue;
      }
      // Remember images by their resolved location: the same relative path
      // found in two different documents may name two different images
      String visitKey= visitKeyFor(imgURL);
      if (!visits.visited(visitKey)) {
        scout.logger.log(getName() + ".processDoc() - Retrieving image " + imgURL);
        loadImageURL(imgURL);
        visits.add(visitKey);
      }
      else {
        scout.logger.log(getName() + ".processDoc() - Ignoring image " + imgURL + " as already seen");
      }
    }
  }

  /**
   * Returns the key under which an image reference is recorded in the visit
   * record: the reference resolved against the current document URL, or the
   * reference itself when it cannot be resolved.
   */
  private String visitKeyFor(String imgURL) {
    try {
      return new URL(doc.getURL(),imgURL).toString();
    }
    catch (MalformedURLException mue) {
      // loadImageURL() reports the malformed reference; key on the raw text
      return imgURL;
    }
  }

  /**
   * Extracts the image reference carried by a tag.
   *
   * @param t the tag to inspect
   * @return the (unquoted) value of <code>img src</code>, or of
   *         <code>body background</code> / <code>a href</code> when it ends
   *         in one of the configured image extensions; null otherwise
   */
  private String getImageURL(Tag t) {
    String out= null;
    // Flag whether to accept image URL
    boolean accept= false;
    // Get the image url according to tag identifier
    String identifier= t.getIdentifier();
    if (identifier.equals("img")) {
      // imgs are sometimes generated via scripts or otherwise fail to have a known
      // image extension, so we go ahead and set accept true
      out= (String)t.get("src");
      accept= true;
    }
    else if (identifier.equals("body")) {
      out= (String)t.get("background");
    }
    else if (identifier.equals("a")) {
      out= (String)t.get("href");
    }
    if (out == null) {
      return null;
    }
    out= stripQuotes(out);
    if (out.length() == 0) {
      // an empty reference would resolve to the document itself
      return null;
    }
    // If url did not come from an img tag, verify that an image type is specified
    if (!accept) {
      String lower= out.toLowerCase();
      for (int i= 0; i < imageTypes.size() && !accept; i++) {
        // imageTypes entries are stored in lower case by configure()
        accept= lower.endsWith((String)imageTypes.elementAt(i));
      }
    }
    return accept ? out : null;
  }

  /**
   * Removes a leading quote character (' or ") and, when present, the
   * matching trailing one. Values that do not start with a quote are
   * returned unchanged.
   */
  private static String stripQuotes(String value) {
    if (value.length() == 0) {
      return value;
    }
    char quote= value.charAt(0);
    if (quote != '"' && quote != '\'') {
      return value;
    }
    int end= value.length();
    if (end > 1 && value.charAt(end-1) == quote) {
      end--;
    }
    return value.substring(1,end);
  }

  /**
   * Downloads one image and stores it in the image directory. Any failure
   * is logged and otherwise ignored.
   *
   * Must be called while holding this object's monitor (processDoc() is
   * synchronized) because the network delay is implemented with wait().
   *
   * @param imgURL image reference, resolved against the current document URL
   */
  private void loadImageURL(String imgURL) {
    try {
      // Construct URL object
      URL url= new URL(doc.getURL(),imgURL);

      // Generate new HTTPConnection if host changes. Host names are compared
      // by value; the cached host is only updated once the connection exists
      // so that the two fields can never disagree.
      // NOTE(review): only the host name is used, so the port and protocol of
      // the URL are not taken into account - confirm whether that is intended.
      String host= url.getHost();
      if (httpConnection == null || documentHost == null || !documentHost.equalsIgnoreCase(host)) {
        HTTPConnection connection= new HTTPConnection(host);
        NVPair[] headers= new NVPair[1];
        headers[0]= new NVPair("Accept","*");
        connection.setDefaultHeaders(headers);
        httpConnection= connection;
        documentHost= host;
      }

      // Obtain a local file to store image to
      File f= localFileFor(url);

      // Be polite to the remote host
      delayBeforeFetch();

      // Get the URL
      HTTPResponse response= httpConnection.Get(url.getFile());
      int status= response.getStatusCode();
      if (status/100 != 2) {
        throw new IOException("Non 2XX HTTP Response: " + status);
      }

      // read URL from remote and write to local
      copyToFile(response.getInputStream(),f);

      // log it
      scout.logger.log(getName() + ".loadImageURL() - " + imgURL + "->" + f.toString());
    }
    catch (Exception e) {
      scout.logger.log(getName() + ".loadImageURL() - Unable to load URL " + imgURL + " " + e.toString());
    }
  }

  /**
   * Chooses an unused file in the image directory for the given URL. The
   * name is the last path segment of the URL; '~' characters are appended
   * to the base name until no file of that name exists.
   */
  private File localFileFor(URL url) {
    // Separate file name and extension in case renaming is required
    String remoteFile= url.getFile();
    String fileName= remoteFile.substring(remoteFile.lastIndexOf('/')+1);
    String imageFile, imageExtn;
    int extnStart= fileName.lastIndexOf('.');
    if (extnStart > -1) {
      imageExtn= fileName.substring(extnStart+1);
      imageFile= fileName.substring(0,extnStart);
    }
    else {
      // no extension found! use ".img"
      imageFile= fileName;
      imageExtn= DEFAULT_EXTENSION;
    }
    if (imageFile.length() == 0) {
      // URL ended in '/' or the name consisted of an extension only
      imageFile= DEFAULT_BASENAME;
    }
    // Guarantee unused file
    File f= new File(imageDir + "/" + imageFile + "." + imageExtn);
    while (f.exists()) {
      imageFile+= '~';
      f= new File(imageDir + "/" + imageFile + "." + imageExtn);
    }
    return f;
  }

  /**
   * Waits scout.netDelay before a request is issued. A delay that is not
   * positive is skipped, since wait(0) would block until notified.
   */
  private void delayBeforeFetch() {
    if (scout.netDelay > 0) {
      try {
        wait(scout.netDelay);
      }
      catch (InterruptedException ie) {
        // keep the interruption visible to the code that owns this thread
        Thread.currentThread().interrupt();
      }
    }
  }

  /**
   * Copies all bytes of a stream into a file. Image data is binary, so it
   * is copied as bytes rather than through character readers/writers. Both
   * streams are closed whether or not the copy succeeds.
   *
   * @throws IOException if reading, writing or closing fails
   */
  private static void copyToFile(InputStream in, File target) throws IOException {
    try {
      OutputStream out= new FileOutputStream(target);
      try {
        byte[] buffer= new byte[COPY_BUFFER_SIZE];
        int n;
        while ((n= in.read(buffer)) != -1) {
          out.write(buffer,0,n);
        }
      }
      finally {
        out.close();
      }
    }
    finally {
      in.close();
    }
  }

}













