# url snarfer
# (c) Wijnand 'tehmaze' Modderman - http://tehmaze.com
# BSD License

import copy
import re
import urllib
import urllib2
import urlparse
from gozerbot.callbacks import callbacks
from gozerbot.aliases import aliases
from gozerbot.commands import cmnds
from gozerbot.config import config
from gozerbot.examples import examples
from gozerbot.generic import decode_html_entities, get_encoding, geturl, geturl2, rlog
from gozerbot.persist import Persist

# persistent snarf settings, used in this file as cfg.data[bot name][target] -> bool
cfg           = Persist('snarf', {})
# The title group is non-greedy: with a greedy '.*' a single line carrying
# several </title> tags (minified pages, inline SVG) would be captured up to
# the LAST closing tag instead of the first.
re_html_title = re.compile(u'<title>(.*?)</title>', re.I | re.M)
# finds the first http/https URL in a piece of text; the backslash is written
# escaped so the literal does not rely on Python passing unknown escapes through
re_url_match  = re.compile(u'((?:http|https)://\\S+)')
# patterns that scrape individual fields out of the validator result page
# fetched through `urlvalidate`; each one captures its value in group 1
re_html_valid = {
    'result':   re.compile(r'(Failed validation, \d+ errors?|Passed validation)', re.I | re.M),
    'modified': re.compile('<th>Modified:</th><td colspan="2">([^<]+)</td>', re.I | re.M),
    'server':   re.compile('<th>Server:</th><td colspan="2">([^<]+)</td>', re.I | re.M),
    'size':     re.compile('<th>Size:</th><td colspan="2">([^<]+)</td>', re.I | re.M),
    'content':  re.compile('<th>Content-Type:</th><td colspan="2">([^<]+)</td>', re.I | re.M),
    'encoding': re.compile('<td>([^<]+)</td><td><select name="charset" id="charset">', re.I | re.M),
    'doctype':  re.compile('<td>([^<]+)</td><td><select id="doctype" name="doctype">', re.I | re.M)
    }
# last URL seen, stored as urlcache[bot name][target]; filled by privmsgcb
urlcache      = {}
# path "extensions" (text after the last dot) that valid_url refuses
urlbad        = ['swf', 'flv', 'fla', 'jpg', 'png', 'bmp', 'rar', 'zip', 'bz2', 'gz', 'arj', 'arc', 'exe', 'com']
# validator request template; '%%' is a literal percent sign and the single
# '%s' receives the urlencoded 'uri=...' pair built in geturl_validate
urlvalidate   = 'http://validator.w3.org/check?charset=%%28detect+automatically%%29&doctype=Inline&verbose=1&%s'

def geturl_title(url):
    """Fetch `url` and return the text of its <title> element.

    The page is retrieved with geturl2(), the title is decoded using the
    encoding reported by get_encoding(), HTML entities are decoded and runs
    of whitespace are collapsed to single spaces.

    Returns the title string, or False when the fetch fails, the response is
    empty, no <title> is found, or the title is blank.
    """
    try:
        result = geturl2(url)
    except IOError:
        # urllib2.HTTPError/URLError derive from IOError, so this one clause
        # covers HTTP errors as well as plain socket/IO failures.  The old
        # code called an undefined handle_exception(ievent=ievent) here,
        # which could only ever raise NameError; a failed fetch is simply
        # reported as "no title".
        return False
    if not result:
        return False
    test_title = re_html_title.search(result)
    if not test_title:
        return False
    # try to find an encoding and standardize the title to unicode
    encoding = get_encoding(result)
    title = test_title.group(1).decode(encoding, 'replace')
    # presumably decode_html_entities returns a string -- the callers format
    # it with '%s'; verify against gozerbot.generic
    title = decode_html_entities(title)
    # The title is remote, untrusted text that ends up in a chat reply:
    # collapse every whitespace run (including CR and tabs, possibly produced
    # by entity decoding) so it can never span or break an output line.
    title = ' '.join(title.split())
    return title or False

def geturl_validate(url):
    """Run `url` through the validator service and scrape the result page.

    The request URL is built from the `urlvalidate` template with `url`
    urlencoded as the 'uri' parameter, and fetched with geturl().

    Returns a dict with one entry per key of `re_html_valid`; a field that
    could not be found on the page is set to '(unknown)'.  Returns False when
    the request fails or the response is empty.
    """
    url = urlvalidate % urllib.urlencode({'uri': url})
    try:
        result = geturl(url)
    except IOError:
        # The old code called an undefined handle_exception(ievent=ievent)
        # here, which could only ever raise NameError; a failed request is
        # reported to the caller as False, like an empty response.
        return False
    if not result:
        return False
    results = {}
    for key, pattern in re_html_valid.items():
        found = pattern.search(result)
        if found:
            results[key] = found.group(1)
        else:
            results[key] = '(unknown)'
    return results

def valid_url(url):
    """Extract, check and normalize the first http(s) URL found in `url`.

    Returns False when no http/https URL is present or when the text after
    the last dot of the URL path is listed in `urlbad`.  Otherwise returns
    the URL rebuilt from scheme, host, path, params and query; the fragment
    is dropped.
    """
    found = re_url_match.search(url)
    if not found:
        return False
    # Parse the URL that actually matched, not the whole input: the pattern
    # is applied with search(), so input such as 'look at http://host/' used
    # to pass the check and then be parsed as a scheme-less path, producing
    # a garbage '://look at http://host/' result.
    parts = urlparse.urlparse(found.group(1))
    if parts[2] and parts[2].split('.')[-1].lower() in urlbad:
        return False
    cleanurl = '%s://%s' % (parts[0], parts[1])
    if parts[2]:
        cleanurl = '%s%s' % (cleanurl, parts[2])
    if parts[3]:
        cleanurl = '%s;%s' % (cleanurl, parts[3])
    if parts[4]:
        cleanurl = '%s?%s' % (cleanurl, parts[4])
    return cleanurl

def handle_snarf(bot, ievent):
    """Reply with the page title of a URL.

    The URL is taken from the command arguments (ievent.rest); when none is
    given, the URL cached for this bot and target in `urlcache` is used.
    Without either, the user is told that <url> is missing.
    """
    candidate = ievent.rest
    if not candidate:
        # no argument given: fall back to the last URL seen on this target
        if bot.name not in urlcache or ievent.target not in urlcache[bot.name]:
            ievent.missing('<url>')
            return
        candidate = urlcache[bot.name][ievent.target]
    url = valid_url(candidate)
    if not url:
        ievent.reply('invalid or bad URL')
        return
    title = geturl_title(url)
    host = urlparse.urlparse(url)[1]
    if not title:
        ievent.reply('no title found at %s' % host)
        return
    # keep the reply short: long host names are cut to 20 characters
    if len(host) > 20:
        host = host[:20] + '...'
    ievent.reply('%s: %s' % (host, title))

# register the command, its usage example and its short aliases
cmnds.add('snarf', handle_snarf, 'USER')
examples.add('snarf', 'fetch the title from an URL', 'snarf http://gozerbot.org')
for _snarf_alias in ('@', 'title'):
    aliases.data[_snarf_alias] = 'snarf'
del _snarf_alias

def handle_snarf_enable(bot, ievent):
    """Switch snarfing on for the target of this event, persist it, reply 'ok'."""
    # create the per-bot settings dict on first use, then flag the target
    per_bot = cfg.data.setdefault(bot.name, {})
    per_bot[ievent.target] = True
    cfg.save()
    ievent.reply('ok')

# register the command and its alias
cmnds.add('snarf-enable', handle_snarf_enable, 'OPER')
aliases.data['snarf-on'] = 'snarf-enable'

def handle_snarf_disable(bot, ievent):
    """Switch snarfing off for the target of this event and reply 'ok'.

    Nothing is stored or saved when this bot has no snarf settings yet.
    """
    if bot.name in cfg.data:
        # the target is kept in the settings, flagged False
        cfg.data[bot.name][ievent.target] = False
        cfg.save()
    ievent.reply('ok')

# register the command and its alias
cmnds.add('snarf-disable', handle_snarf_disable, 'OPER')
aliases.data['snarf-off'] = 'snarf-disable'

def handle_snarf_list(bot, ievent):
    """Reply with the targets on which snarfing is enabled, grouped per bot.

    Only targets whose stored flag is true are listed: the disable command
    keeps a target in the settings with a False flag, and such targets used
    to be reported as enabled.  Bots without any enabled target are left
    out; when nothing is enabled at all the reply is 'none'.
    """
    snarfs = []
    for name in sorted(cfg.data.keys()):
        targets = sorted([target for target, enabled in cfg.data[name].items() if enabled])
        if targets:
            snarfs.append('%s: %s' % (name, ' '.join(targets)))
    if not snarfs:
        ievent.reply('none')
    else:
        ievent.reply('snarfers enable on: %s' % ', '.join(snarfs))

cmnds.add('snarf-list', handle_snarf_list, 'OPER')

def handle_validate(bot, ievent):
    """Reply with the validation summary of a URL.

    The URL is taken from the command arguments (ievent.rest); when none is
    given, the URL cached for this bot and target in `urlcache` is used.
    Without either, the user is told that <url> is missing.
    """
    if ievent.rest:
        url = ievent.rest
    elif bot.name in urlcache and ievent.target in urlcache[bot.name]:
        url = urlcache[bot.name][ievent.target]
    else:
        # The old guard chained its three tests with 'and', so a missing
        # cache entry was never reported and ended in a KeyError instead.
        ievent.missing('<url>')
        return
    url = valid_url(url)
    if not url:
        ievent.reply('invalid or bad URL')
        return
    host = urlparse.urlparse(url)[1]
    # keep the reply short: long host names are cut to 20 characters
    if len(host) > 20:
        host = host[0:20] + '...'
    result = geturl_validate(url)
    if not result:
        # geturl_validate gives a false value when the validator could not
        # be reached or returned nothing; tell the user instead of staying
        # silent
        ievent.reply('could not validate %s' % host)
        return
    fields = ['result', 'modified', 'server', 'size', 'content', 'encoding', 'doctype']
    ievent.reply('%s: %s | modified: %s | server: %s | size: %s | content-type: %s | encoding: %s | doctype: %s' % \
        tuple([host] + [result[x] for x in fields]))

cmnds.add('validate', handle_validate, 'USER')
examples.add('validate', 'validate an URL', 'validate http://gozerbot.org')
aliases.data['valid'] = 'validate'

def privmsgcb(bot, ievent):
    """PRIVMSG callback: remember the first URL in a message and, when
    snarfing is enabled for the target, re-dispatch the message as a
    'snarf <url>' command.
    """
    found = re_url_match.search(ievent.txt)
    if not found:
        return
    url = found.group(1)
    # remember the URL so snarf/validate can be used without an argument
    urlcache.setdefault(bot.name, {})[ievent.target] = url
    rlog(10, bot.name, 'cached url %s on %s' % (url, ievent.target))
    try:
        enabled = cfg.data[bot.name][ievent.target]
    except KeyError:
        enabled = False
    if not enabled:
        return
    # see if we can get channel control character
    try:
        cchar = bot.channels[ievent.channel]['cc']
    except (LookupError, TypeError):
        cchar = config['defaultcc'] or '!'
    # hand a copy of the event, rewritten into a snarf command, back to the bot
    nevent = copy.copy(ievent)
    nevent.txt = '%ssnarf %s' % (cchar, url)
    bot.handle_privmsg(nevent)

callbacks.add('PRIVMSG', privmsgcb)

