/* Copyright (C) 2000-2002 Lavtech.com corp. All rights reserved.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
*/

#include <udm_config.h>

#include <stdlib.h>
#include <string.h>
#include <errno.h>

#include <sys/types.h>
#ifdef   HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_ZLIB
#include <zlib.h>
#endif

#include "udm_store.h"
#include "udm_services.h"
#include "udm_xmalloc.h"
#include "udm_hash.h"
#include "udm_utils.h"
#include "udm_log.h"
#include "udm_vars.h"
#include "udm_parsehtml.h"
#include "udm_unicode.h"
#include "udm_unidata.h"
#include "udm_searchtool.h"
#include "udm_sgml.h"
#include "udm_sqldbms.h"
#include "udm_mutex.h"
#include "udm_doc.h"
#include "udm_db.h"
#include "udm_guesser.h"


/*
  Tell whether a Unicode code point is a NON-space character.

  Returns 0 for the code points this file treats as whitespace
  (TAB, LF, CR, SPACE, U+00A0, U+1680, U+2000..U+200B, U+202F, U+3000)
  and 1 for every other value, including negative ones.
*/
static int UdmUniNSpace(int c)
{
  switch (c)
  {
    case 0x0009:
    case 0x000A:
    case 0x000D:
    case 0x0020:
    case 0x00A0:
    case 0x1680:
    case 0x202F:
    case 0x3000:
      return 0;
    default:
      break;
  }

  /* The only contiguous whitespace range: U+2000 .. U+200B */
  return (c >= 0x2000 && c <= 0x200B) ? 0 : 1;
}


/*
  Build the excerpt source text from the document's "CachedCopy" section.

  The section value is base64-decoded and inflated with zlib into a buffer
  of UDM_MAXDOCSIZE bytes.  If the "Content-Type" section is missing or is
  not one of text/plain, text/xml, text/vnd.wap.wml or text/html, the type
  is obtained from UdmGuessContentType() instead (with the configured
  "DefaultContentType" passed as its default).  For XML/WML/HTML the markup
  is tokenized and every token that is not visible text is replaced by a
  single space; for HTML the text must additionally be inside the body.
  Any other type is taken as is.  The collected text is finally passed
  through UdmHlConvertExt() together with the word list of Res.

  Returns the pointer produced by UdmHlConvertExt() cast to int *, or NULL
  when there is no cached copy, a charset is unavailable, an allocation
  fails or zlib cannot be initialized.  Always returns NULL when the file
  is built without HAVE_ZLIB.
*/
static int *UdmGetExcerptSourceCachedCopy (UDM_AGENT *Agent,
                                           UDM_RESULT *Res, UDM_DOCUMENT *Doc)
{
#ifdef HAVE_ZLIB
  UDM_CHARSET *unicode_cs= &udm_charset_sys_int;
  UDM_CHARSET *doc_cs= UdmGetCharSet(UdmVarListFindStr(&Doc->Sections, "Charset", "iso-8859-1"));
  const char *packed_b64= UdmVarListFindStr(&Doc->Sections, "CachedCopy", NULL);
  const char *mime= UdmVarListFindStr(&Doc->Sections, "Content-Type", NULL);
  const char *fallback_mime= UdmVarListFindStr(&Agent->Conf->Vars,"DefaultContentType",NULL);
  int hl_stopwords= UdmVarListFindBool(&Agent->Conf->Vars, "ExcerptStopword", 1);
  char *plain, *packed, *converted;
  size_t b64_len;
  z_stream zs;
  UDM_DSTR text;
  int known_type, need_body, is_markup;

  if (!packed_b64 || !unicode_cs || !doc_cs)
    return NULL;

  plain= UdmMalloc(UDM_MAXDOCSIZE);
  if (!plain)
    return NULL;

  /* The decoded data is written into a buffer as long as the base64 text */
  b64_len= strlen(packed_b64);
  packed= UdmMalloc(b64_len);
  if (!packed)
  {
    UdmFree(plain);
    return NULL;
  }

  zs.zalloc= Z_NULL;
  zs.zfree= Z_NULL;
  zs.opaque= Z_NULL;
  zs.next_in= (Byte *)packed;
  zs.avail_in= udm_base64_decode(packed, packed_b64, b64_len);
  zs.next_out= (Byte *)plain;
  /* One byte is kept back for the terminating zero written below */
  zs.avail_out= UDM_MAXDOCSIZE - 1;

  if (inflateInit2(&zs, 15) != Z_OK)
  {
    UdmFree(plain);
    UdmFree(packed);
    return NULL;
  }

  /* The inflate() status is not examined; whatever was produced is used */
  inflate(&zs, Z_FINISH);
  inflateEnd(&zs);
  plain[zs.total_out]= 0;
  UdmFree(packed);

  known_type= mime && (!strcmp(mime, "text/plain") ||
                       !strcmp(mime, "text/xml") ||
                       !strcmp(mime, "text/vnd.wap.wml") ||
                       !strcmp(mime, "text/html"));
  if (!known_type)
    mime= UdmGuessContentType(plain, zs.total_out, fallback_mime);

  UdmDSTRInit(&text, 1024);

  need_body= !strcmp(mime, "text/html");
  is_markup= need_body ||
             !strcmp(mime, "text/xml") ||
             !strcmp(mime, "text/vnd.wap.wml");

  if (is_markup)
  {
    UDM_HTMLTOK tok;
    const char *piece, *piece_end;

    UdmHTMLTOKInit(&tok);
    piece= UdmHTMLToken(plain, &piece_end, &tok);
    for (;;)
    {
      /* Text outside script, comment and style (and, for HTML, in the body) */
      int visible= tok.type == UDM_HTML_TXT &&
                   !tok.script && !tok.comment && !tok.style &&
                   (!need_body || tok.body);

      if (visible)
        UdmDSTRAppend(&text, piece, piece_end - piece);
      else
        UdmDSTRAppend(&text, " ", 1);

      piece= UdmHTMLToken(NULL, &piece_end, &tok);
      if (!piece)
        break;
    }
  }
  else
  {
    UdmDSTRAppend(&text, plain, zs.total_out);
  }

  UdmFree(plain);

  converted= UdmHlConvertExt(&Res->WWList, text.data, doc_cs, unicode_cs, hl_stopwords);
  UdmDSTRFree(&text);
  return (int*) converted;

#else
  return NULL;
#endif
}

/*
  Build the excerpt source text from the document's "body" section.

  The section value is converted from the configured "BrowserCharset"
  (default "iso-8859-1") to the internal int-per-character representation
  using UDM_RECODE_HTML, and terminated with a zero element.

  Res is not used here; it is kept so the signature matches
  UdmGetExcerptSourceCachedCopy().

  Returns a buffer allocated with UdmMalloc() (the caller in this file
  releases it with UdmFree()), or NULL when there is no "body" section,
  a charset is unavailable, the allocation fails or the conversion fails.
*/
static int *UdmGetExcerptSourceBody(UDM_AGENT *Agent,
                                    UDM_RESULT *Res, UDM_DOCUMENT *Doc)
{
  int *res;
  size_t l, ul;
  const char *Source= UdmVarListFindStr(&Doc->Sections, "body", NULL);
  UDM_CHARSET *sys_int= &udm_charset_sys_int;
  UDM_CHARSET *cs= UdmGetCharSet(UdmVarListFindStr(&Agent->Conf->Vars, "BrowserCharset", "iso-8859-1"));
  UDM_CONV conv;
  
  if (!Source || !sys_int || !cs)
    return NULL;
  
  /* Room for one int per source byte plus the terminator */
  l= strlen(Source);
  if (!(res = UdmMalloc(sizeof(int) * (l + 1))))
    return NULL;
  
  UdmConvInit(&conv, cs, sys_int, UDM_RECODE_HTML);
  ul= UdmConv(&conv, (char *)res, sizeof(int) * (l + 1), Source, l);
  /*
    ul is unsigned, so a negative error result from UdmConv() shows up as
    a huge value and a "< 0" test can never fire.  Reject anything that
    would put the terminator outside the l + 1 allocated elements.
  */
  if (ul / sizeof(int) > l)
  {
    UdmFree(res);
    return(NULL);
  }
  res[ul / sizeof(int)]= 0;
  return res;
}

/*
  Produce a text excerpt for a document of a search result.

  The source text is taken from the cached copy of the document or, if
  that is not available, from its "body" section.  Runs of whitespace are
  collapsed to a single space, then fragments surrounding the marked
  words are collected, separated by "..." where they are not adjacent.
  When no fragment could be collected, the beginning of the text is used.

  Agent           supplies the configuration ("BrowserCharset" etc.)
  Res             search result, passed on to the source builders
  Doc             document whose sections are read
  ExcerptSize     maximum excerpt length, counted in characters of the
                  internal (int-per-character) representation
  ExcerptPadding  number of characters of context wanted on each side of
                  a marked word

  Returns a zero-terminated string in "BrowserCharset" (converted with
  UDM_RECODE_HTML) allocated with UdmMalloc(), or NULL when no source
  text or charset is available, the excerpt is empty, an allocation
  fails or the final conversion fails.
  NOTE(review): the caller presumably releases the result with UdmFree();
  confirm against callers.
*/
__C_LINK char * __UDMCALL UdmExcerptDoc (UDM_AGENT *Agent,
                                         UDM_RESULT *Res, UDM_DOCUMENT *Doc,
                                         size_t ExcerptSize, size_t ExcerptPadding)
{
  char *res;
  size_t ul,l, i, j, left, right;
  size_t prev_right= 0;
  int *Source;
  UDM_CONV conv;
  UDM_CHARSET *sys_int= &udm_charset_sys_int;
  UDM_CHARSET *cs= UdmGetCharSet(UdmVarListFindStr(&Agent->Conf->Vars, "BrowserCharset", "iso-8859-1"));
  UDM_DSTR buf;
  int dots[]= { 0x2E, 0x2E, 0x2E };
  
  if (!sys_int || !cs)
    return NULL;

  /* Prefer the cached copy, fall back to the stored body */
  if (!(Source= UdmGetExcerptSourceCachedCopy(Agent, Res, Doc)) &&
      !(Source= UdmGetExcerptSourceBody(Agent, Res, Doc)))
    return(NULL);

  ul= UdmUniLen(Source);

  /*
    Strip whitespaces: compact in place, turning every run of space
    characters into one 0x20 and dropping leading and trailing ones.
  */
  for (i= 0, j= 0; i < ul; i++)
  {
    if (UdmUniNSpace(Source[i]))
      Source[j++] = Source[i];
    else if (j && UdmUniNSpace(Source[j - 1]))
      Source[j++] = 0x20;
  }
  if (j && !UdmUniNSpace(Source[j-1]))
    j--;
  Source[j] = 0;
  ul = j;

  /*
    Get excerpt.
    NOTE(review): the values 2 and 3 are presumably the begin/end
    highlight markers put into the text by the source builders; confirm
    against UdmHlConvertExt().
  */
  UdmDSTRInit(&buf, 1024);
  for (i= 0; i < ul; i++)
  {
    if (Source[i] == 2)
    {
      /* Find the matching end marker; j == ul when there is none */
      for (j= i + 1; j < ul; j++)
        if (Source[j] == 3)
          break;
      
      /*
        Left edge: ExcerptPadding characters before the marker, but not
        before the end of the previous fragment; then moved forward to
        the next space (or to the marker itself).
      */
      left= ExcerptPadding < i ? i - ExcerptPadding : 0;
      if (left < prev_right)
        left= prev_right;
      if (left) 
        while (left < i && Source[left] != 0x20)
          left++;
      
      /*
        Right edge: ExcerptPadding characters after the end marker,
        clamped to the last character, otherwise moved back to the
        previous space (or to the end marker itself).
      */
      right= ExcerptPadding + j;
      if (right >= ul)
        right= ul - 1;
      else
        while (right > j && Source[right] != 0x20)
          right--;
      
      /* Stop as soon as the next fragment would exceed ExcerptSize */
      if (ExcerptSize < buf.size_data / sizeof(int) + right - left + 1)
        break;
      if (left != prev_right)
        UdmDSTRAppend(&buf, (char *)dots, sizeof(dots));
      UdmDSTRAppend(&buf, (char *)&Source[left], (right - left + 1) * sizeof(int));
      i= right;
      prev_right= right;
    }
  }
  /* Nothing collected: use the beginning of the text instead */
  if (!buf.size_data)
  {
    ul= ExcerptSize > ul ? ul : ExcerptSize;
    UdmDSTRAppend(&buf, (char *)Source, ul * sizeof(int));
  }
  UdmFree(Source);
  
  /*
    20 output bytes are reserved per collected character; one extra byte
    is allocated so the terminator always fits, even if the conversion
    fills the whole buffer.
  */
  ul= buf.size_data / sizeof(int) * 20;
  if (!ul || !(res= UdmMalloc(ul + 1))) 
  {
    UdmDSTRFree(&buf);
    return NULL;
  }

  UdmConvInit(&conv, sys_int, cs, UDM_RECODE_HTML);
  l= UdmConv(&conv, res, ul, buf.data, buf.size_data);
  UdmDSTRFree(&buf);
  /*
    l is unsigned: a negative error result from UdmConv() appears as a
    huge value.  Never index outside the allocation.
  */
  if (l > ul)
  {
    UdmFree(res);
    return NULL;
  }
  res[l]= 0;
  return res;
}
