import au.id.jericho.lib.html.*;
import java.util.*;
import java.io.*;
import java.net.*;

public class ExtractText {
	public static void main(String[] args) throws Exception {
		String sourceUrlString="data/test.html";
		if (args.length==0)
		  System.err.println("Using default argument of \""+sourceUrlString+'"');
		else
			sourceUrlString=args[0];
		if (sourceUrlString.indexOf(':')==-1) sourceUrlString="file:"+sourceUrlString;
		Source source=new Source(new URL(sourceUrlString));
		source.setLogWriter(new OutputStreamWriter(System.err)); // send log messages to stderr

		source.fullSequentialParse();

		System.out.println("Document title:");
		String title=getTitle(source);
		System.out.println(title==null ? "(none)" : title);

		System.out.println("\nDocument description:");
		String description=getMetaValue(source,"description");
		System.out.println(description==null ? "(none)" : description);

		System.out.println("\nDocument keywords:");
		String keywords=getMetaValue(source,"keywords");
		System.out.println(keywords==null ? "(none)" : keywords);
	
		System.out.println("\nLinks to other documents:");
		List linkElements=source.findAllElements(HTMLElementName.A);
		for (Iterator i=linkElements.iterator(); i.hasNext();) {
			Element linkElement=(Element)i.next();
			String href=linkElement.getAttributeValue("href");
			if (href==null) continue;
			// A element can contain other tags so need to extract the text from it:
			String label=linkElement.getContent().extractText();
			System.out.println(href+" ("+label+")");
		} 

		System.out.println("\nAll text from BODY (exluding content inside SCRIPT and STYLE elements):");
		Element bodyElement=source.findNextElement(0,HTMLElementName.BODY);
		Segment contentSegment=(bodyElement==null) ? source : bodyElement.getContent();
		System.out.println(contentSegment.extractText(true));
  }

	private static String getTitle(Source source) {
		Element titleElement=source.findNextElement(0,HTMLElementName.TITLE);
		if (titleElement==null) return null;
		// TITLE element never contains other tags so just decode it collapsing whitespace:
		return CharacterReference.decodeCollapseWhiteSpace(titleElement.getContent());
	}

	private static String getMetaValue(Source source, String key) {
		for (int pos=0; pos<source.length();) {
			StartTag startTag=source.findNextStartTag(pos,"name",key,false);
			if (startTag==null) return null;
			if (startTag.getName()==HTMLElementName.META)
				return startTag.getAttributeValue("content"); // Attribute values are automatically decoded
			pos=startTag.getEnd();
		}
		return null;
	}
}
