///###////////////////////////////////////////////////////////////////////////
//
// Burton Computer Corporation
// http://www.burton-computer.com
// http://www.cooldevtools.com
// $Id: HtmlTokenizer.cc 86 2004-11-11 14:48:57Z brian $
//
// Copyright (C) 2000 Burton Computer Corporation
// ALL RIGHTS RESERVED
//
// This program is open source software; you can redistribute it
// and/or modify it under the terms of the Q Public License (QPL)
// version 1.0. Use of this software in whole or in part, including
// linking it (modified or unmodified) into other programs is
// subject to the terms of the QPL.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// Q Public License for more details.
//
// You should have received a copy of the Q Public License
// along with this program; see the file LICENSE.txt.  If not, visit
// the Burton Computer Corporation or CoolDevTools web site
// QPL pages at:
//
//    http://www.burton-computer.com/qpl.html
//    http://www.cooldevtools.com/qpl.html
//

#include "AbstractTokenReceiver.h"
#include "StringReader.h"
#include "HtmlTokenizer.h"

// A single space character.
// NOTE(review): not referenced by any code visible in this file; confirm
// whether it is still needed before removing it.
static const char SEPARATOR = ' ';
// Bound on the number of characters collected after '&' while
// processedEntity() looks for the terminating ';' of an HTML entity.
static const int MAX_ENTITY_LENGTH = 6;

// Stores the collaborators used while tokenizing HTML.
//
//   textTokenizer - tokenizer that tokenize() runs over the decoded text.
//   tagTokenizer  - tokenizer that processTagBody() runs over each tag body.
//   maxTagLength  - length bound applied by processedTag() to tags that do
//                   not start a comment.
//   tagReceiver   - receiver handed to tagTokenizer by processTagBody().
//
// m_reader is not set here; it is assigned by tokenize().
HtmlTokenizer::HtmlTokenizer(AbstractTokenizer *textTokenizer,
                             AbstractTokenizer *tagTokenizer,
                             int maxTagLength,
                             AbstractTokenReceiver *tagReceiver)
    : m_textTokenizer(textTokenizer),
      m_tagTokenizer(tagTokenizer),
      m_tagReceiver(tagReceiver),
      m_maxTagLength(maxTagLength)
{
}

// The destructor body is empty; any cleanup is left to the destructors of
// the members themselves.
HtmlTokenizer::~HtmlTokenizer()
{
}

// Tokenizes the characters supplied by reader.
//
// reader is remembered in m_reader as the underlying character source, then
// the text tokenizer is run with receiver as its token receiver and with
// this object passed as its reader argument.
void HtmlTokenizer::tokenize(AbstractTokenReceiver *receiver,
                             AbstractCharReader *reader)
{
    // m_reader must be set before the text tokenizer starts, because the
    // reader methods of this class (forward(), hasChar(), atEnd()) use it.
    m_reader = reader;
    m_textTokenizer->tokenize(receiver, this);
}

// Runs the tag tokenizer over the text of one tag.
//
// tag is wrapped in a StringReader and passed to m_tagTokenizer together
// with m_tagReceiver.
void HtmlTokenizer::processTagBody(const string &tag)
{
    StringReader reader(tag);
    m_tagTokenizer->tokenize(m_tagReceiver, &reader);
}

// Returns true when tag (the text collected between '<' and '>') both
// starts with "!--" and ends with "--".
bool HtmlTokenizer::isCommentTag(const string &tag)
{
    if (!starts_with(tag, "!--")) {
        return false;
    }
    return ends_with(tag, "--");
}

// Returns true for tags that forward() drops without emitting a replacement
// character.  Currently only comment tags qualify.
bool HtmlTokenizer::isInvisibleTag(const string &tag)
{
    return isCommentTag(tag);
}

// Returns true when tag starts with "!--", whether or not the closing "--"
// has been collected yet.  processedTag() uses this to let comments grow
// past m_maxTagLength.
bool HtmlTokenizer::isOpenCommentTag(const string &tag)
{
    return starts_with(tag, "!--");
}

// Returns true when tag starts with "!--" but does not (yet) end with "--".
// processedTag() uses this to ignore a '>' that appears inside a comment.
bool HtmlTokenizer::isIncompleteCommentTag(const string &tag)
{
    if (!starts_with(tag, "!--")) {
        return false;
    }
    return !ends_with(tag, "--");
}

// Tries to consume a complete tag starting at the reader's current '<'.
//
// On success the text between '<' and '>' (with recognized entities decoded)
// is left in tag, the text is passed to processTagBody(), the reader is left
// on the closing '>' and true is returned.
//
// Returns false when the current character is not '<'.  Also returns false,
// after moving the reader back to the '<', when the input ends or the
// collected text reaches m_maxTagLength before a closing '>' is found.  The
// length bound is not applied while the text starts with "!--", so comments
// may be arbitrarily long.  tag then holds whatever was collected.
bool HtmlTokenizer::processedTag(string &tag)
{
    if (m_reader->currentChar() != '<') {
        return false;
    }

    tag.erase();
    NewPtr<AbstractCharReaderPosition> tagStart(m_reader->createMark());
    for (;;) {
        if (!m_reader->forward()) {
            break;
        }

        // The length bound is checked before the character just read is
        // examined; only an open comment is allowed to exceed it.
        const bool hasRoom = static_cast<int>(tag.length()) < m_maxTagLength;
        if (!hasRoom && !isOpenCommentTag(tag)) {
            break;
        }

        // A '>' ends the tag unless we are inside an unterminated comment.
        if (m_reader->currentChar() == '>' && !isIncompleteCommentTag(tag)) {
            processTagBody(tag);
            return true;
        }

        if (processedEntity()) {
            // Entity decoded: take the character stored by setCurrentChar().
            tag += currentChar();
        } else {
            tag += m_reader->currentChar();
        }
    }

    // No complete tag: rewind so the '<' is treated as ordinary text.
    m_reader->returnToMark(tagStart.get());
    return false;
}

// Converts the body of a numeric character reference to a character.
//
// entity is the text between '&' and ';'.  processEntity() passes it with
// the leading '#' still attached, e.g. "#65" (decimal) or "#x41" / "#X41"
// (hexadecimal); a body without the '#' is accepted as well.
//
// The numeric value is narrowed to char, so values that do not fit in a
// char are truncated.  A body without any digits yields whatever the
// conversion function returns for an empty digit string (0 for atoi).
char HtmlTokenizer::parseEntityInteger(const string &entity)
{
    // c_str() is NUL terminated, so peeking at *digits is safe even for an
    // empty entity.
    const char *digits = entity.c_str();

    // Skip the '#' marker.  atoi stops at the first character that is not
    // part of a number, so leaving the '#' in place made every decimal
    // reference evaluate to 0.
    if (*digits == '#') {
        ++digits;
    }

    if (*digits == 'x' || *digits == 'X') {
        // Skip the radix marker too, so that hex_to_int receives only the
        // hex digits and the result does not depend on how that helper
        // treats a leading 'x'.
        return static_cast<char>(hex_to_int(digits + 1));
    }

    return static_cast<char>(atoi(digits));
}

// Translates the text of an HTML entity (the characters between '&' and
// ';') into a single character and stores it with setCurrentChar().
//
// Recognized names are amp, apos, quot, lt, gt and nbsp (mapped to a plain
// space).  Any entity whose first character is '#' is decoded by
// parseEntityInteger().
//
// Returns true when the entity was recognized; returns false, without
// calling setCurrentChar(), otherwise.
bool HtmlTokenizer::processEntity(const string &entity)
{
    static const struct {
        const char *name;
        char replacement;
    } NAMED_ENTITIES[] = {
        { "amp",  '&'  },
        { "apos", '\'' },
        { "quot", '"'  },
        { "lt",   '<'  },
        { "gt",   '>'  },
        { "nbsp", ' '  },
    };
    const int entityCount =
        static_cast<int>(sizeof(NAMED_ENTITIES) / sizeof(NAMED_ENTITIES[0]));

    for (int i = 0; i < entityCount; ++i) {
        if (entity == NAMED_ENTITIES[i].name) {
            setCurrentChar(NAMED_ENTITIES[i].replacement);
            return true;
        }
    }

    // Not a known name: only numeric references remain.
    if (entity[0] != '#') {
        return false;
    }

    setCurrentChar(parseEntityInteger(entity));
    return true;
}

bool HtmlTokenizer::processedEntity()
{
    if (m_reader->currentChar() != '&') {
        return false;
    }

    NewPtr<AbstractCharReaderPosition> startPos(m_reader->createMark());

    string entity;
    while (m_reader->forward() && static_cast<int>(entity.length()) < MAX_ENTITY_LENGTH) {
        if (m_reader->currentChar() == ';') {
            if (!processEntity(entity)) {
                break;
            }
            return true;
        }
        entity += m_reader->currentChar();
    }

    m_reader->returnToMark(startPos.get());
    return false;
}

// Advances to the next character of the decoded text.
//
// Each step moves the underlying reader forward and then:
//   - if an entity starts there, the decoded character becomes current;
//   - if a complete tag starts there, the tag is consumed; a comment tag
//     produces nothing and the loop continues, any other tag produces a
//     single space;
//   - otherwise the reader's character becomes current unchanged.
//
// Returns true when a current character was set, false once the underlying
// reader cannot advance any further.
bool HtmlTokenizer::forward()
{
    while (m_reader->forward()) {
        if (processedEntity()) {
            return true;
        }

        string tagText;
        const bool consumedTag = processedTag(tagText);
        if (!consumedTag) {
            setCurrentChar(m_reader->currentChar());
            return true;
        }

        if (!isInvisibleTag(tagText)) {
            setCurrentChar(' ');
            return true;
        }

        // Invisible tag: emit nothing and keep scanning.
    }

    return false;
}

// Forwards to hasChar() of the underlying reader set by tokenize().
bool HtmlTokenizer::hasChar()
{
    return m_reader->hasChar();
}

// Forwards to atEnd() of the underlying reader set by tokenize().
bool HtmlTokenizer::atEnd()
{
    return m_reader->atEnd();
}

// Calls forward() up to nchars times.
//
// Returns false as soon as a forward() call fails (no further calls are
// made), and true when every call succeeded or when nchars is not positive.
bool HtmlTokenizer::skip(int nchars)
{
    for (int remaining = nchars; remaining > 0; --remaining) {
        if (!forward()) {
            return false;
        }
    }
    return true;
}

