
# ----------------------------------------------------------
# advas
# advanced search algorithms implemented as a python module
#
# (C) 2002-03 Frank Hofmann, Chemnitz, Germany
# email fh@efho.de
#
# example for testing the n-gram functions
#
# ----------------------------------------------------------

# changed 2003-11-23

# import the advas, advas017 and time modules
import advas, time, advas017

# test unit taken from: 
# Python Cookbook by Alex Martelli/David Ascher, O'Reilly, 2002

def timeo(fun, n=10000):
	"""Time n calls of fun and return (fun.__name__, elapsed seconds).

	The cost of n calls to an empty function is measured first and
	subtracted from the measurement of fun, so the reported value
	approximates the time spent inside fun itself. Both measurements
	are noisy, so the result can be slightly negative for very cheap
	functions.

	fun -- callable taking no arguments; must have a __name__ attribute
	n   -- number of calls to time (default 10000)
	"""
	# time.clock() is platform dependent and was removed in Python 3.8;
	# timeit.default_timer is the benchmark clock recommended by the
	# standard library and exists in Python 2 and Python 3 alike.
	from timeit import default_timer as clock

	def void():
		pass

	# measure the overhead of the loop plus an empty function call
	start = clock()
	for i in range(n):
		void()
	overhead = clock() - start

	# measure the same loop calling the function under test
	start = clock()
	for i in range(n):
		fun()
	thetime = clock() - start

	return fun.__name__, thetime - overhead

def get_ngrams (term, size):
	"""Return the distinct n-grams of length size found in term.

	Old advas n-gram function, included for test purposes.

	term -- sequence to slice (the callers in this file pass a string)
	size -- length of each n-gram

	If size is larger than len(term) or smaller than 2, no n-grams can
	be formed and term itself is returned unchanged (not a list).
	Otherwise a list of slices is returned in order of first occurrence;
	a slice is skipped when advas.cmp_strings() returns 0 for it against
	an entry that is already in the list.
	"""
	term_length = len(term)

	# we can't form any n-grams: term too small for the given size,
	# or size below the minimum of 2
	if size > term_length or size < 2:
		return term

	ngrams = []

	# slide a window of width size across the term, one step at a time
	for left in range(term_length - size + 1):
		item = term[left:left + size]

		# does this n-gram already exist? a cmp_strings() result of 0 is
		# treated as "already in the list" (presumably a cmp()-style
		# result where 0 means equal -- verify against advas)
		for known in ngrams:
			if advas.cmp_strings(item, known) == 0:
				break
		else:
			# no entry matched: first occurrence, so keep it
			ngrams.append(item)

	return ngrams

def ngr(term, size):
	"""Return the keys of advas.count_words() applied to the n-grams of term.

	term -- sequence to slice (the callers in this file pass a string)
	size -- length of each n-gram

	When size exceeds len(term) or is below 2, term itself is handed
	back unchanged. Otherwise every slice of width size is collected,
	left to right, and passed to advas.count_words(); the keys() of its
	result are returned.
	"""
	span = len(term)

	# no n-grams possible: window wider than the term, or size below 2
	if size > span or size < 2:
		return term

	# collect every window of width size, moving one position at a time
	windows = [term[start:start + size] for start in range(span - size + 1)]

	# calculate term frequency; its keys are returned as the n-grams
	frequencies = advas.count_words(windows)
	return frequencies.keys()

def comp_ngrams_new(term1, term2, size):
	"""Return the n-grams that term1 and term2 have in common.

	Both terms are split with advas.get_ngrams() and counted with
	advas.count_words(). Every key of the second count that is also a
	key of the first is kept; the kept keys are passed through
	advas.convert_list_into_dictionary(..., 0) and the keys() of that
	result are returned.

	term1, term2 -- the terms to compare
	size         -- n-gram length handed to advas.get_ngrams()
	"""
	# get n-grams for term1 and term2
	ngrams1 = advas.get_ngrams(term1, size)
	ngrams2 = advas.get_ngrams(term2, size)

	counts1 = advas.count_words(ngrams1)
	counts2 = advas.count_words(ngrams2)

	# "in" replaces the deprecated dict.has_key(), which no longer exists
	# in Python 3 (assumes count_words() returns a dict -- TODO confirm)
	shared = [key for key in counts2.keys() if key in counts1]
	lookup = advas.convert_list_into_dictionary(shared, 0)

	# return value
	return lookup.keys()


def old_n():
	"""Run the local get_ngrams() on a fixed sample term with size 2.

	Takes no arguments and discards the result.
	"""
	get_ngrams("alphabetanalphabetalphabetization", 2)

def new_n():
	"""Run the local ngr() on a fixed sample term with size 2.

	Takes no arguments and discards the result.
	"""
	ngr("alphabetanalphabetalphabetization", 2)

def new_n2():
	"""Run advas.get_ngrams() on a fixed sample term with size 2.

	Takes no arguments and discards the result.
	"""
	advas.get_ngrams("alphabetanalphabetalphabetization", 2)

def cmp_ngrams_o():
	"""Run advas.comp_ngrams() on two fixed sample terms with size 2.

	Takes no arguments and discards the result.
	"""
	long_term = "alphabetanalphabetalphabetization"
	short_term = "analphabet"
	advas.comp_ngrams(long_term, short_term, 2)

def cmp_ngrams_n():
	"""Run the local comp_ngrams_new() on two fixed sample terms with size 2.

	Takes no arguments and discards the result.
	"""
	long_term = "alphabetanalphabetalphabetization"
	short_term = "analphabet"
	comp_ngrams_new(long_term, short_term, 2)

def ngr_st_new():
	"""Split a fixed comma-separated word list and stem it with advas.

	The line is split with advas.split_line() and the words are passed
	to advas.ngram_stemmer(words, 2, 0.8); the result is discarded.
	"""
	sample = "foreign, foreigner, foreleg, forelock, foreman, foremost, forename, forenoon, forensic, foresee, foreseeable, forest, forestry, forever, forge, forger, forgery, forget, forgot"

	# split the text into words, then run the stemmer of the advas module
	tokens = advas.split_line(sample)
	advas.ngram_stemmer(tokens, 2, 0.8)

def ngr_st_old():
	"""Split a fixed comma-separated word list and stem it with advas017.

	The line is split with advas.split_line() and the words are passed
	to advas017.ngram_stemmer(words, 2, 0.8); the result is discarded.
	"""
	sample = "foreign, foreigner, foreleg, forelock, foreman, foremost, forename, forenoon, forensic, foresee, foreseeable, forest, forestry, forever, forge, forger, forgery, forget, forgot"

	# split the text into words, then run the stemmer of the advas017 module
	tokens = advas.split_line(sample)
	advas017.ngram_stemmer(tokens, 2, 0.8)

def comp_descr_old():
	"""Compare two fixed descriptor lists with advas017.comp_descriptors().

	The returned value is not used; uncomment the print to display it.
	"""
	wanted = ["footnote", "footpath", "footman"]
	found = ["footnote", "footpath", "footman", "footprint"]
	score = advas017.comp_descriptors(wanted, found)
	#print '%s : %f' % ("equality", score)

def comp_descr_new():
	"""Compare two fixed descriptor lists with advas.comp_descriptors().

	The returned value is not used; uncomment the print to display it.
	"""
	wanted = ["footnote", "footpath", "footman"]
	found = ["footnote", "footpath", "footman", "footprint"]
	score = advas.comp_descriptors(wanted, found)
	#print '%s : %f' % ("equality", score)


#for f in old_n, new_n, new_n2, cmp_ngrams_o, cmp_ngrams_n:
#for f in cmp_ngrams_o, cmp_ngrams_n, ngr_st_old, ngr_st_new:
# time the currently selected functions and report
# "<function name>: <seconds>" for each of them
for f in comp_descr_old, comp_descr_new:
	# the parenthesised single-argument form prints the same text under
	# the Python 2 print statement and the Python 3 print() function
	print("%s: %.2f" % timeo(f))

# show the metaphone() result for "knuth" from both modules on one line,
# separated by a space
print("%s %s" % (advas017.metaphone("knuth"), advas.metaphone("knuth")))

#term = "alphabetanalphabet"
#size = 2

#n_ngrams = ngr (term, size)
#o_ngrams = advas.get_ngrams (term, size)
#print 'The term %s consists of the following n-grams (n=%d):' % (term, size)
#for i in n_ngrams:
#	print i

#print "\n"
#for j in o_ngrams:
#	print j

#term = "alphabetanalphabetalphabetization"
#size = 2
#term2 = "analphabet"
#value = advas.comp_ngrams (term, term2, 2) * 100
#print '%s and %s have a similarity of %f' % (term, term2, value) + "%."
