
##########################################################################
#                                                                        #
#  copyright:          (c) 2003 by Konrad Wojas <wojas@vvtp.tudelft.nl>  #
#                                                                        #
#  This program is free software; you can redistribute it and/or modify  #
#  it under the terms of the GNU General Public License as published by  #
#  the Free Software Foundation; either version 2 of the License, or     #
#  (at your option) any later version.                                   #
#                                                                        #
##########################################################################

"""
Parser classes for the IMDb result pages
"""

from htmllib import HTMLParser
import formatter
import re

class IMDbSearchParser(HTMLParser):

	"""
	Parser for the search result page

	Text is collected from links inside an <ol> list, but only while
	exactly one <ol> has been opened so far and only after a paragraph
	with the text 'Most popular title searches:' has been seen.
	Call getResults() after feeding the page to obtain the matches.
	"""

	def __init__(self):
		null_formatter = formatter.AbstractFormatter(formatter.NullWriter())
		HTMLParser.__init__(self, null_formatter)

		# Flags telling which tags are currently open
		self.in_ol = 0
		self.in_a = 0
		self.in_p = 0

		# Number of <ol> tags opened so far
		self.ol_count = 0
		# Becomes 1 once the marker paragraph has been seen
		self.has_results = 0
		# Attribute list of the most recently opened <a>
		self.last_a = None

		# One list per collected link: [href, text, text, ...];
		# href stays None when the link has no href attribute
		self.results = []
		# Not read anywhere in this class
		self.result_index = 0
		# Attribute list of the link that created the newest entry
		self.prev_result_a = 0
		# Index of the newest entry in self.results (-1: none yet)
		self.prev_result_idx = -1

	# --- tag handlers: only keep track of what is open -----------

	def start_ol(self,attr):
		self.ol_count += 1
		self.in_ol = 1

	def end_ol(self):
		self.in_ol = 0

	def start_a(self,attr):
		self.last_a = attr
		self.in_a = 1

	def end_a(self):
		self.in_a = 0

	def start_p(self,attr):
		self.in_p = 1

	def end_p(self):
		self.in_p = 0

	# --- helpers --------------------------------------------------

	def split_title_year(self,s):
		"""
		Split 'Title (YYYY)' into the tuple (title, year).
		Returns (s, '') when s does not contain a four digit year
		in parentheses preceded by a space.
		"""
		match = re.match( r'^(.*) \(([0-9]{4})\)', s )
		if match is None:
			return (s,'')
		return match.groups()

	def getResults(self):
		"""
		Return a list of (title, year, url) tuples, one for every
		collected link that had an href attribute.
		"""
		found = []
		for entry in self.results:
			url = entry[0]
			if not url:
				# placeholder of a link without href
				continue
			title, year = self.split_title_year( ''.join(entry[1:]) )
			found.append( (title,year,url) )
		return found

	def handle_data(self,text):
		"""
		Collect link text inside the result list, or detect the
		paragraph that announces the results.
		"""
		inside_result_link = (self.has_results and self.in_ol
		  and self.in_a and self.ol_count==1)

		if not inside_result_link:
			if self.in_p and text=='Most popular title searches:':
				self.has_results = 1
			return

		# The text of one link may arrive in several pieces, so a new
		# entry is started only when the link attributes differ from
		# those of the previous entry.
		# NOTE(review): this compares by equality, so adjacent links
		# with identical attributes end up in the same entry.
		is_new_anchor = not (self.last_a==self.prev_result_a)
		if is_new_anchor:
			self.results.append([None])
			self.prev_result_idx += 1
			self.prev_result_a = self.last_a

		# Only the first href attribute is used
		for attr_name, attr_value in self.last_a:
			if attr_name.lower()=='href':
				entry = self.results[self.prev_result_idx]
				entry[0] = "http://imdb.com" + attr_value
				entry.append(text)
				break

# ---------------------------------------------------------------------

class IMDbDetailsParser(HTMLParser):

	"""
	Parser for the movie details pages

	The values found on the page are collected in self.data:
	  "title", "year" - set when the closing </title> tag is reached
	  "rating"        - set when bold text of the form 'N.N/10' is seen
	  "genres"        - texts of the links that follow a 'Genre:' heading
	Only "genres" is guaranteed to be present; the other keys are
	missing when the corresponding text was not found.
	"""
	
	def __init__(self):
		HTMLParser.__init__(self,\
		  formatter.AbstractFormatter(formatter.NullWriter()) )
		# Flags telling which tags are currently open
		self.in_title	= 0
		self.in_b_ch	= 0	# inside <b class="ch"> (a heading)
		self.in_b	= 0	# inside any other <b>
		self.in_a_genre	= 0	# inside a link below the 'Genre:' heading
		# Text of <title> collected so far
		self.tmp_title	= ""
		# Last text seen inside <b class="ch">
		self.b_ch_last	= ""
		self.data = { "genres" : [] }

	def split_title_year(self,s):
		"""
		Split 'Title (YYYY)' into the tuple (title, year).
		Returns (s, '') when s does not contain a four digit year
		in parentheses preceded by a space.
		"""
		m = re.match( r'^(.*) \(([0-9]{4})\)', s )
		if m:
			return (m.group(1),m.group(2))
		else:
			return (s,'')

	def start_title(self,attr):
		self.in_title = 1
		
	def end_title(self):
		# The complete title text is known now: split off the year
		self.in_title = 0
		ty = self.split_title_year(self.tmp_title)
		self.data['title'] = ty[0]
		self.data['year'] = ty[1]
		
	def start_b(self,attr):
		# <b class="ch"> marks a heading, every other <b> is plain
		# bold text (which may contain the rating)
		for a in attr:
			if a[0]=="class" and a[1]=="ch":
				self.in_b_ch = 1
				return
		self.in_b = 1
		
	def end_b(self):
		self.in_b_ch = 0
		self.in_b = 0
	
	def start_a(self,attr):
		# Links count as genre links as long as the most recent
		# heading is 'Genre:'
		if self.b_ch_last=='Genre:':
			self.in_a_genre = 1
		
	def end_a(self):
		self.in_a_genre = 0
	
	def handle_data(self,text):
		"""
		Store the text according to the tag it appears in.
		"""
		if self.in_title:
			self.tmp_title += text
		elif self.in_b_ch:
			self.b_ch_last = text
		elif self.in_b:
			# Accept '0.0' up to '9.9' and also the maximum '10.0',
			# which a single digit pattern would silently skip
			m = re.match( r"^(10\.0|[0-9]\.[0-9])/10$", text )
			if m: self.data["rating"] = m.group(1)
		elif self.in_a_genre and text!="(more)":
			# '(more)' is the text of a link that is not a genre
			self.data["genres"].append(text)
			
