/*
 * Copyright (c) Tomas Znamenacek
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 */

package net.sf.turkey;

import java.util.Map;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.Vector;
import java.util.Random;

import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.BufferedReader;
import java.io.IOException;

/**
 * Signals that a prefix has no entry in a {@link ProbabilityTable}.
 */
class UnknownPrefix extends Exception {}

/**
 * Text analyzer and generator.
 * This class is able to analyze given sample of text and
 * generate some text based on the sample's characteristics.
 * The real work gets done in this class.
 *
 * <p>The sample is reduced to a table that maps every character sequence of
 * length 1 to {@link #PREFIX_LENGTH} to the list of characters that followed
 * it in the sample. Generation repeatedly draws a random follower of the
 * most recently generated characters.
 */
public class ProbabilityTable {

	/** 
	 * The prefix length.
	 * Larger values mean better (less random) output, best are values
	 * around 3.
	 */
	public static final int PREFIX_LENGTH = 3;

	/** Number of words generated between the first and the last word of a sentence. */
	private static final int SENTENCE_MIDDLE_WORDS = 10;
	
	/**
	 * Average word length (in letters) of all text analyzed so far,
	 * rounded to the nearest integer; 0 until a word has been seen.
	 */
	protected int avgWordLength = 0;

	/** The name of the table. */
	protected String name;
	
	/** 
	 * Probability table.
	 * Maps a prefix (String) to a Vector of the Characters that followed
	 * it in the analyzed text; a character appears once per occurrence.
	 */
	protected Map table;

	/** Random number generator used to pick among the possible followers. */
	protected Random rnd = new Random();

	/** Letters counted inside words over all analyzed strings. */
	private int totalLetters = 0;

	/** Completed words counted over all analyzed strings. */
	private int totalWords = 0;

	/**
	 * Builds the probability table from an input stream.
	 * The stream is decoded as UTF-8, analyzed line by line and closed
	 * before the constructor returns, whether or not reading succeeded.
	 *
	 * @param in the sample text
	 * @param n the name of the table, returned by {@link #toString()}
	 * @throws IOException if the stream cannot be read
	 */
	public ProbabilityTable(InputStream in, String n) throws IOException {

		table = new Hashtable();
		BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"));

		try {
			String line;
			while ((line = reader.readLine()) != null)
				analyzeString(line);
		} finally {
			// Release the stream even when reading or analysis fails.
			reader.close();
		}

		name = n;
	}

	/**
	 * Collects character characteristics of a single string.
	 * Empty strings are ignored. The string is lower-cased and padded with
	 * one space on each side, so the beginning and the end of the text are
	 * recorded as transitions from and to a space.
	 *
	 * @param s one line of the sample text
	 */
	protected void analyzeString(String s) {

		if ("".equals(s)) return;
		s = " " + s.toLowerCase() + " ";

		recordFollowers(s);
		updateAverageWordLength(s);
	}

	/** Records, for every prefix of length 1 to PREFIX_LENGTH in the string, the character following it. */
	private void recordFollowers(String s) {

		for (int p = 1; p <= PREFIX_LENGTH; p++) {
			for (int i = p; i < s.length(); i++) {

				String prefix = s.substring(i - p, i);
				Vector followers = (Vector)table.get(prefix);

				if (followers == null) {
					followers = new Vector();
					table.put(prefix, followers);
				}

				followers.add(new Character(s.charAt(i)));
			}
		}
	}

	/** Adds the words of the string to the running totals and refreshes avgWordLength. */
	private void updateAverageWordLength(String s) {

		boolean inWord = false;

		for (int i = 0; i < s.length(); i++) {

			char c = s.charAt(i);

			if (Character.isLetter(c)) {
				// A letter either starts a word or extends the current one.
				inWord = true;
				totalLetters++;

			} else if (inWord && Character.isWhitespace(c)) {
				// Whitespace terminates the current word.
				inWord = false;
				totalWords++;
			}

			// Other characters are skipped without ending the word.
		}

		// Divide in floating point; integer division would truncate before rounding.
		avgWordLength = (totalWords != 0)
			? Math.round((float)totalLetters / totalWords)
			: 0;
	}
	
	/**
	 * Returns character that is likely to follow given prefix.
	 * The character is drawn at random from the recorded followers, so
	 * frequent followers are proportionally more likely.
	 *
	 * @param prefix the preceding characters
	 * @return one of the characters recorded after the prefix
	 * @throws UnknownPrefix if the prefix does not occur in the table
	 */
	public Character getNext(String prefix) throws UnknownPrefix {

		Vector nextChars = (Vector)table.get(prefix);

		if (nextChars == null)
			throw new UnknownPrefix();

		return (Character)nextChars.elementAt(rnd.nextInt(nextChars.size()));
	}

	/**
	 * Generates a word.
	 * The word starts with a character that followed a space in the sample
	 * and grows until a whitespace character is drawn.
	 *
	 * @return the generated word, or a string of the form
	 *         "&lt;unable to finish: PREFIX&gt;" when no follower is known
	 *         for any suffix of the current prefix
	 * @throws IllegalStateException if no text has been analyzed, so there
	 *         is no character to start a word with
	 */
	public String getWord() {

		Vector startChars = (Vector)table.get(" ");

		if (startChars == null || startChars.isEmpty())
			throw new IllegalStateException("probability table is empty, cannot generate a word");

		Character next = (Character)startChars.elementAt(rnd.nextInt(startChars.size()));
		StringBuffer prefix = new StringBuffer(" ").append(next.charValue());
		StringBuffer word = new StringBuffer().append(next.charValue());

		while (true) {

			next = nextForLongestKnownSuffix(prefix);

			if (next == null)
				return ("<unable to finish: " + prefix.toString() + ">");

			if (Character.isWhitespace(next.charValue()))
				return word.toString();

			word.append(next.charValue());
			prefix.append(next.charValue());

			// Keep only the most recent PREFIX_LENGTH characters as context.
			if (prefix.length() > PREFIX_LENGTH)
				prefix.delete(0, prefix.length() - PREFIX_LENGTH);
		}
	}

	/**
	 * Draws a follower for the longest suffix of the prefix that the table knows.
	 * Suffixes are used because the next character depends on the most
	 * recent characters, not on the oldest ones. Returns null if even the
	 * last single character is unknown.
	 */
	private Character nextForLongestKnownSuffix(StringBuffer prefix) {

		for (int start = 0; start < prefix.length(); start++) {
			try {
				return getNext(prefix.substring(start));
			} catch (UnknownPrefix ignored) {
				// Not in the table; retry with a shorter suffix.
			}
		}

		return null;
	}

	/**
	 * Generates a sentence.
	 * The sentence consists of twelve generated words separated by single
	 * spaces; the first character is upper-cased and a full stop is appended.
	 *
	 * @return the generated sentence
	 */
	public String getSentence() {

		StringBuffer s = new StringBuffer();
		String firstWord = getWord();

		s.append(firstWord.substring(0, 1).toUpperCase());
		s.append(firstWord.substring(1));
		s.append(' ');
		
		for (int i = 0; i < SENTENCE_MIDDLE_WORDS; i++)
			s.append(getWord()).append(' ');
	
		s.append(getWord()).append('.');
		return s.toString();
	}
	
	/**
	 * Returns string representation of the probability table.
	 * Each line has the form <code>'PREFIX':&lt;TAB&gt;[followers]</code>;
	 * the order of the lines is the iteration order of the table.
	 *
	 * @return one line per prefix
	 */
	public String dump() {

		StringBuffer s = new StringBuffer();
		Iterator entries = table.entrySet().iterator();

		while (entries.hasNext()) {
			Map.Entry entry = (Map.Entry)entries.next();
			s.append("'").append(entry.getKey()).append("':\t");
			s.append(entry.getValue()).append("\n");
		}
		
		return s.toString();
	}

	/** Returns the name of the table. */
	public String toString() {

		return name;
	}
}
