import straw, error, re, string, sgmllib

# Any run of whitespace; unicode_field() uses it to collapse such runs into
# a single space.
ws_re = re.compile(r"\s+")
# An XML declaration carrying an encoding pseudo-attribute, e.g.
# <?xml version="1.0" encoding="utf-8"?>.  The quote character is captured
# so that the closing quote must be the same one; the encoding name itself
# is available as the named group 'encoding'.
xmlheaderRe = re.compile(r'<\?.*encoding=(["\'])(?P<encoding>.*?)\1.*\?>')

class ImageParser(sgmllib.SGMLParser):
    def __init__(self, feed):
        sgmllib.SGMLParser.__init__(self)
        self._feed = feed
        self._image_urls = []

    def do_img(self, attrs):
        for name, value in attrs:
            if name == 'src':
                url = straw.utils.complete_url(value, self._feed.location)
                self._image_urls.append(url)

    def get_image_urls(self):
        return self._image_urls

    def close(self):
        sgmllib.SGMLParser.close(self)


def unicode_field(dict, key, enc, default='', nowhitespace=False):
    """Look up *key* in *dict* and return its value as a unicode string.

    Arguments:
    dict -- mapping to read from (only its ``get`` method is used)
    key -- name of the field to fetch
    enc -- name of the encoding used to decode byte strings
    default -- value used when the key is missing or its value is None
    nowhitespace -- when true, strip the value and collapse every run of
                    whitespace into a single space

    Decoding falls back to ISO-8859-1 when the value is not valid in *enc*
    or when *enc* is not an encoding name Python knows.  Values that are
    already unicode are returned undecoded.
    """
    v = dict.get(key, default)
    # on patrick logan's feed feedparser was, for an unknown reason,
    # returning for one item (not the others) content, creator, description,
    # link, modified and title as a list containing a dict containing keys
    # base, language, mode, type, value
    # this was with feedparser 2.7.5, on 2004-04-25
    # (type({}) is used because the parameter name shadows the dict builtin)
    if isinstance(v, list) and len(v) == 1 and isinstance(v[0], type({})):
        v = v[0]['value']
    if v is None:
        # an explicit None would break both the decoding and the
        # whitespace handling below; treat it like a missing key
        v = default
    if v:
        try:
            v = unicode(v, enc)
        except TypeError:
            # string is already unicoded
            pass
        except (LookupError, ValueError, UnicodeError):
            # LookupError: enc is not a known codec name;
            # ValueError/UnicodeError: the bytes are not valid in enc.
            # try with iso-8859-1, usually we get at least something
            v = unicode(v, "iso-8859-1")

    if nowhitespace and v:
        v = ws_re.sub(" ", v.strip())

    return v

def parse(content, feed):
    parser = straw.feedparser.FeedParser()
    parsed = straw.ParsedSummary()
    imp = ImageParser(feed)
    parser.feed(content)

    enc = parser.channel.get('encoding', '')

    if enc == '':
        match = xmlheaderRe.match(content)
        if match:
            enc =  parser.channel['encoding'] = match.group('encoding').lower()
        else:
            enc = straw.utils.get_locale_encoding()

    parsed.title = unicode_field(parser.channel, "title", enc, nowhitespace=True)
    parsed.description = unicode_field(parser.channel, "description", enc, nowhitespace=True)
    parsed.link = unicode_field(parser.channel, "link", enc, nowhitespace=True)
    parsed.copyright = unicode_field(parser.channel, "rights", enc, nowhitespace=True)
    parsed.last_build_date = parser.channel.get('date_parsed',"")

    if parser.channel.has_key('creator'):
        parsed.creator = unicode_field(parser.channel, "creator", enc, nowhitespace=True)

    # item properties
    for idict in parser.items:
        item = straw.SummaryItem()
        description = ''
        item.feed = feed

        if idict.has_key('content'):
            value = idict.get('content', None)[0]
            description = unicode_field(value, 'value', enc)
        elif idict.has_key('description'):
            description = unicode_field(idict, 'description', enc)


        title = unicode_field(idict, 'title', enc, nowhitespace=True)
        item.title = straw.utils.convert_title(title, description)
        item.title_converted = True

        if idict.has_key('guid'):
            item.guid = unicode_field(idict, 'guid', enc, nowhitespace=True)
            item.guidislink = idict.get('guidislink', False)
        item.link = unicode_field(idict, 'link', enc, nowhitespace=True)

        if len(description):
            item.description = description
            try:
                imp.feed(description)
                image_urls = imp.get_image_urls()
                if len(image_urls):
                    for im in image_urls:
                        item.add_image(im)
                imp.close()
            except Exception, ex:
                error.log("error while parsing description of %s: %s" % (item.feed.title, ex))
                item.feed.router.set_error((_("Error parsing item %s: %s") % (item.title, ex)))

        # dc:creator
        if idict.has_key('creator'):
            item.creator = unicode_field(idict, 'creator', enc, nowhitespace=True)

        if idict.has_key('date_parsed'):
            item.pub_date = idict.get('date_parsed',"")

        if idict.has_key('source'):
            st = unicode_field(idict, 'source', enc, nowhitespace=True)
            # source format: "url,name"   e.g. 'http://foo.com,Foo'
            url, text = string.split(st, ',', 1 )
            item.source = {'url': url.strip(), 'text': text}

        if idict.has_key('license'):
            # freshmeat
            if parser.namespacemap.has_key('fm'):
                item.fm_license = unicode_field(idict, 'license', enc)
                item.fm_changes = unicode_field(idict, 'changes', enc)
            else:
                license = unicode_field(idict, "license", enc)
                item.license_urls.append(license)

        # prism
        if idict.has_key('publicationName'):
            item.publication_name = unicode_field(idict,'publicationName', enc)
            item.publication_volume = unicode_field(idict, 'volume', enc)
            item.publication_number = unicode_field(idict, 'number', enc)
            item.publication_section = unicode_field(idict, 'section', enc)
            item.publication_starting_page = unicode_field(idict, 'startingPage', enc)

        parsed.addItem(item)

    parser.reset()

    return parsed

