#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <string.h> 
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <ctype.h>
#include <regex.h>

/*
#define UDM_OK     0
#define UDM_ERROR  1
#define UDM_BINARY 0
#define UdmMalloc  malloc
#define UdmRealloc realloc
#define UdmFree    free
#define UdmStrdup  strdup
#define udm_snprintf snprintf
*/

#include "udm_common.h"
#include "udm_unidata.h"
#include "udm_spell.h"
#include "udm_utils.h"


/*
  Put a spell list descriptor into the all-zero state.
*/
static void
UdmSpellListInit(UDM_SPELLLIST *Lst)
{
  memset(Lst, 0, sizeof(*Lst));
}


/*
  Release the two buffers attached to a spell list: the file
  body and the item array. The descriptor itself is not freed.
  NOTE(review): UDM_FREE is defined outside this file; presumably
  it accepts NULL pointers - confirm.
*/
static void
stUdmSpellListFree(UDM_SPELLLIST *Lst)
{
  UDM_FREE(Lst->fbody);
  UDM_FREE(Lst->Item);
}


static int
cmpspell(const void *a, const void *b)
{
  return strcmp(((const UDM_SPELL *)a)->word, ((const UDM_SPELL *)b)->word);
}


static int
cmpspell_word_and_flag(const void *a, const void *b)
{
  int res= strcmp(((const UDM_SPELL *)a)->word, ((const UDM_SPELL *)b)->word);
  if (!res)
    res= strcmp(((const UDM_SPELL *)a)->flags, ((const UDM_SPELL *)b)->flags);
  return res;
}


/*
  Load one dictionary file into memory.

  The whole file is read into L->fbody and split in place. Every
  line becomes one UDM_SPELL item: 'word' is the text before the
  first '/' (converted through the charset's lcase handler),
  'flags' is the text after it. Both point into L->fbody.
  Lines without '/' get an empty flag string.
  The items are finally sorted by word, then by flags.

  L      - list descriptor with fname and cset already set
  err    - buffer receiving an error message on failure
  errlen - size of 'err'

  Returns UDM_OK on success or when the list is already loaded,
  UDM_ERROR otherwise. After a failure no buffers stay attached
  to the list, so a later call starts from scratch.
*/
static int
UdmSpellListLoad(UDM_SPELLLIST *L, char *err, size_t errlen)
{
  int rc= UDM_OK;
  struct stat sb;
  int fd= -1;             /* -1 means "nothing to close" at the exit label */
  ssize_t nbytes;
  char *tok;
  const char *filename= L->fname;
  static char noflag[]="";
  char tolowermap[256];
  size_t i;
  
  if (L->fbody)
    return UDM_OK; /* Already loaded */
  
  L->cs= UdmGetCharSet(L->cset);
  if (!L->cs)
  {
    rc= UDM_ERROR;
    udm_snprintf(err, errlen, "Unknown charset '%s'", L->cset);
    goto ex;
  }
  
  if (stat(filename, &sb))
  {
    rc= UDM_ERROR;
    udm_snprintf(err, errlen, "Can't stat '%s'", filename);
    goto ex;
  }
  
  /* 0 is a valid descriptor, only negative values mean failure */
  if ((fd= open(filename, O_RDONLY|UDM_BINARY)) < 0)
  {
    rc= UDM_ERROR;
    udm_snprintf(err, errlen, "Can't open '%s'", filename);
    goto ex;
  }

  if (!(L->fbody= (char*) UdmMalloc((size_t) sb.st_size + 1)))
  {
    rc= UDM_ERROR;
    udm_snprintf(err, errlen, "Can't allocate memory for '%s'", filename);
    goto ex;
  }
  
  if ((nbytes= read(fd, L->fbody, sb.st_size)) != sb.st_size)
  {
    rc= UDM_ERROR;
    udm_snprintf(err, errlen, "Read error");
    goto ex; 
  }
  
  L->fbody[nbytes]= '\0';

  /* Build a byte-to-byte conversion table once for the whole file */
  for (i=0 ; i < 256; i++)
    tolowermap[i]= (char) i;
  L->cs->lcase(L->cs, tolowermap, 256);

  for (tok= L->fbody; *tok; )
  {
    UDM_SPELL *S;
    
    if (L->mitems <= L->nitems)
    {
      /* Grow through a temporary, so the old array is not lost on failure */
      size_t mitems= L->mitems + 32*1024;
      UDM_SPELL *Item= (UDM_SPELL*) UdmRealloc(L->Item,
                                               mitems * sizeof(L->Item[0]));
      if (!Item)
      {
        rc= UDM_ERROR;
        udm_snprintf(err, errlen, "Can't allocate memory for '%s'", filename);
        goto ex;
      }
      L->Item= Item;
      L->mitems= mitems;
    }
    
    S= &L->Item[L->nitems];
    S->word= tok;
    S->flags= NULL;
    
    for (; *tok; tok++)
    {
      /* 
        This kind of tolower conversion is valid for 8bit charsets
        only, but I haven't seen non-8bit ispell files yet.
        Converting every word separately through lcase() would
        also work, but is slow for 8bit character sets.
      */
      *tok= tolowermap[(unsigned char) *tok];
      if (*tok == '/')
      {
        /* Terminate the word, the rest of the line is the flag string */
        *tok++= '\0';
        S->flags= tok;
        for ( ; *tok && *tok != '\r' && *tok != '\n' ; tok++);
        break;
      }
      else if (*tok == '\r' || *tok == '\n')
      {
        break;
      }  
    }
    /* Overwrite the line ending with terminators and step to the next line */
    for ( ; *tok == '\r' || *tok == '\n' ; *tok++= '\0');

    /*
      Words without flags are kept with an empty flag string.
      The same word can therefore appear several times, with
      and without flags. For example, 'Cook' presents as a name
      as a separate word in the dictionary, next to 'cook'.
    */
    if (!S->flags)
      S->flags= noflag;

    L->nitems++;
  }

  qsort((void*)L->Item, L->nitems, sizeof(L->Item[0]), cmpspell_word_and_flag);

ex:

  if (fd >= 0)
    close(fd);

  if (rc != UDM_OK)
  {
    /*
      Drop everything loaded so far. A non-NULL fbody would
      otherwise make the next call report "already loaded",
      and the items point into fbody.
    */
    if (L->Item)
    {
      UdmFree(L->Item);
      L->Item= NULL;
    }
    if (L->fbody)
    {
      UdmFree(L->fbody);
      L->fbody= NULL;
    }
    L->nitems= 0;
    L->mitems= 0;
  }
  return rc;
}



/*
  Put a list of spell lists into the all-zero (empty) state.
*/
void
UdmSpellListListInit(UDM_SPELLLISTLIST *L)
{
  memset(L, 0, sizeof(L[0]));
}

/*
  Free the buffers of every spell list in L,
  then the array of descriptors itself.
*/
void
UdmSpellListListFree(UDM_SPELLLISTLIST *L)
{
  size_t idx= 0;

  while (idx < L->nitems)
  {
    stUdmSpellListFree(&L->Item[idx]);
    idx++;
  }

  if (L->Item != NULL)
    UdmFree(L->Item);
}

/*
  Append a new, not yet loaded dictionary descriptor to the list.

  L    - list of spell lists
  lang - language name, copied into the descriptor
  cset - character set name, copied into the descriptor
  name - dictionary file name, copied into the descriptor

  Returns UDM_OK on success. Returns UDM_ERROR, leaving the list
  unchanged, when the array can't be extended or when one of the
  strings does not fit into its destination field.

  lang, cset and fname are taken to be fixed-size char arrays
  inside UDM_SPELLLIST: they are filled with strcpy() right
  after the structure has been zeroed.
*/
int
UdmSpellListListAdd(UDM_SPELLLISTLIST *L,
                    const char *lang, const char *cset, const char *name)
{
  UDM_SPELLLIST *I;

  /* sizeof does not evaluate its operand, so a NULL Item is fine here */
  if (strlen(lang) >= sizeof(L->Item->lang) ||
      strlen(cset) >= sizeof(L->Item->cset) ||
      strlen(name) >= sizeof(L->Item->fname))
    return UDM_ERROR;

  if (L->mitems <= L->nitems)
  {
    /* Grow through a temporary, so the old array survives a failure */
    size_t mitems= L->mitems + 16;
    UDM_SPELLLIST *Item= (UDM_SPELLLIST*) UdmRealloc((void*) L->Item,
                                                     mitems * sizeof(L->Item[0]));
    if (!Item)
      return UDM_ERROR;
    L->Item= Item;
    L->mitems= mitems;
  }
  I= &L->Item[L->nitems++];
  UdmSpellListInit(I);
  strcpy(I->lang, lang);
  strcpy(I->cset, cset);
  strcpy(I->fname, name);
  return UDM_OK;
}


/*
  Load every dictionary of the list, in order.
  Stops at the first failure and returns UDM_ERROR; 'err' is
  passed down to the loader of the individual dictionary.
  Returns UDM_OK when all dictionaries have been loaded.
*/
int
UdmSpellListListLoad(UDM_SPELLLISTLIST *L, char *err, size_t errlen)
{
  size_t idx= 0;

  while (idx < L->nitems)
  {
    int failed= UdmSpellListLoad(&L->Item[idx], err, errlen);
    if (failed)
      return UDM_ERROR;
    idx++;
  }
  return UDM_OK;
}



/*
  Put an affix list descriptor into the all-zero state.
*/
static void
UdmAffixListInit(UDM_AFFIXLIST *L)
{
  memset(L, 0, sizeof(L[0]));
}


/*
  Free every affix rule of the list (its three strings and
  its compiled regex), then the rule array itself.
*/
static void
stUdmAffixListFree(UDM_AFFIXLIST *L)
{
  size_t idx= 0;

  while (idx < L->nitems)
  {
    UDM_AFFIX *Affix= &L->Item[idx];
    UdmFree(Affix->find);
    UdmFree(Affix->repl);
    UdmFree(Affix->mask);
    regfree(&Affix->regex);
    idx++;
  }

  if (L->Item != NULL)
    UdmFree(L->Item);
}


/*
  Remove all spaces, tabs and '-' characters from
  a string, in place. Other characters keep their order.
*/
static void rmsp(char *s)
{
  size_t rd;
  size_t wr= 0;

  for (rd= 0; s[rd] != '\0'; rd++)
  {
    char ch= s[rd];
    if (ch == ' ' || ch == '-' || ch == '\t')
      continue;
    s[wr++]= ch;
  }
  s[wr]= '\0';
}


/*
  Load one ispell-style affix file.

  The file is read line by line:
    - "suffixes" / "prefixes" lines select the kind of the rules
      that follow;
    - "flag X" lines select the flag of the rules that follow
      (leading '*' and spaces before the flag are skipped);
    - everything after '#' is ignored;
    - remaining lines are parsed as "mask > find, repl" or
      "mask > repl". Spaces, tabs and '-' are removed from all
      three parts and the parts are lower-cased.
  The mask is anchored with '$' (suffix) or '^' (prefix) and
  compiled as an extended regular expression.

  L      - list descriptor with fname and cset already set
  flags  - UDM_SPELL_NOPREFIX makes the loader ignore prefix rules
  err    - buffer receiving an error message on failure
  errlen - size of 'err'

  Returns UDM_OK on success or when the list already has rules,
  UDM_ERROR otherwise. Rules stored before a failure stay in the
  list; the rule that failed is not counted and owns no memory.
*/
static
int UdmAffixListLoad(UDM_AFFIXLIST *L, int flags, char *err, size_t errlen)
{
  char str[BUFSIZ];
  char flag= 0;
  char mask[8*BUFSIZ]="";
  char find[8*BUFSIZ]="";
  char repl[8*BUFSIZ]="";
  char *s;
  int i, rc= UDM_OK;
  int suffixes= 0;
  int prefixes= 0;
  FILE *affix;
  char *filename= L->fname;

  if (L->Item)
    return UDM_OK; /* Already loaded */

  L->cs= UdmGetCharSet(L->cset);
  if (!L->cs)
  {
    /* The file is not open yet, so there is nothing to clean up */
    udm_snprintf(err, errlen, "Unknown charset '%s'", L->cset);
    return UDM_ERROR;
  }

  if(!(affix=fopen(filename,"r")))
  {
    udm_snprintf(err, errlen, "Can't open file '%s'", filename);
    return UDM_ERROR;
  }

  while(fgets(str,sizeof(str),affix))
  {
    UDM_AFFIX *A;
    size_t masklen;

    if(!strncasecmp(str,"suffixes",8))
    {
      suffixes=1;
      prefixes=0;
      continue;
    }
    if(!strncasecmp(str,"prefixes",8))
    {
      suffixes=0;
      prefixes=1;
      continue;
    }
    if(!strncasecmp(str,"flag ",5))
    {
      s=str+5;
      /*
        Explicit comparisons instead of strchr("* ", *s):
        strchr() also matches the terminating '\0' and
        would walk past the end of the line.
      */
      while (*s == '*' || *s == ' ')
        s++;
      flag= *s;
      continue;
    }
    if((!suffixes)&&(!prefixes))continue;
    if((prefixes)&&(flags & UDM_SPELL_NOPREFIX)) continue;

    if((s=strchr(str,'#')))*s=0;
    if(!*str)continue;

    mask[0]= '\0';
    find[0]= '\0';
    repl[0]= '\0';

    i= sscanf(str,"%[^>\n]>%[^,\n],%[^\n]",mask,find,repl);

    rmsp(repl);
    UdmStrToLower(L->cs, repl, strlen(repl));
    
    rmsp(find);
    UdmStrToLower(L->cs, find, strlen(find));
    
    rmsp(mask);
    UdmStrToLower(L->cs, mask, strlen(mask));

    switch(i)
    {
      case 3:break;
      case 2:
        /* "mask > repl": the only field after '>' is the replacement */
        if(*find != '\0')
        {
          strcpy(repl,find);
          find[0]= '\0';
        }
        break;
      default:
        continue;
    }

    /*
      Anchor the mask in place. The mask is never longer than
      one input line, so there is always room for one more
      character in the 8*BUFSIZ buffer.
    */
    masklen= strlen(mask);
    if (suffixes)
    {
      mask[masklen]= '$';
      mask[masklen + 1]= '\0';
    }
    else
    {
      memmove(mask + 1, mask, masklen + 1);
      mask[0]= '^';
    }

    if (L->mitems <= L->nitems)
    {
      /* Grow through a temporary, so the old array survives a failure */
      size_t mitems= L->mitems + 256;
      UDM_AFFIX *Item= (UDM_AFFIX*) UdmRealloc(L->Item,
                                               mitems * sizeof(L->Item[0]));
      if (!Item)
      {
        udm_snprintf(err, errlen, "Can't allocate memory for '%s'", filename);
        rc= UDM_ERROR;
        goto ex;
      }
      L->Item= Item;
      L->mitems= mitems;
    }

    /* The rule is counted only after it has been built completely */
    A= &L->Item[L->nitems];
    A->find= UdmStrdup(find);
    A->findlen= strlen(find);
    A->repl= UdmStrdup(repl);
    A->replen= strlen(repl);
    A->mask= UdmStrdup(mask);
    A->type= suffixes ? 's' : 'p';
    A->flag= flag;

    if (!A->find || !A->repl || !A->mask)
    {
      udm_snprintf(err, errlen, "Can't allocate memory for '%s'", filename);
      rc= UDM_ERROR;
    }
    else if (regcomp(&A->regex, A->mask, REG_EXTENDED|REG_NOSUB))
    {
      /* No regfree() here: the regex was not compiled */
      udm_snprintf(err, errlen, "Can't regcomp: '%s'", A->mask);
      rc= UDM_ERROR;
    }

    if (rc != UDM_OK)
    {
      if (A->find)
        UdmFree(A->find);
      if (A->repl)
        UdmFree(A->repl);
      if (A->mask)
        UdmFree(A->mask);
      goto ex;
    }
    L->nitems++;
  }

ex:
  fclose(affix);
  return rc;
}


/*
  Put a list of affix lists into the all-zero (empty) state.
*/
void
UdmAffixListListInit(UDM_AFFIXLISTLIST *L)
{
  memset(L, 0, sizeof(L[0]));
}


/*
  Free the rules of every affix list in L,
  then the array of descriptors itself.
*/
void
UdmAffixListListFree(UDM_AFFIXLISTLIST *L)
{
  size_t idx= 0;

  while (idx < L->nitems)
  {
    stUdmAffixListFree(&L->Item[idx]);
    idx++;
  }
  UDM_FREE(L->Item);
}

/*
  Append a new, not yet loaded affix file descriptor to the list.

  L    - list of affix lists
  lang - language name, copied into the descriptor
  cset - character set name, copied into the descriptor
  name - affix file name, copied into the descriptor

  Returns UDM_OK on success. Returns UDM_ERROR, leaving the list
  unchanged, when the array can't be extended or when one of the
  strings does not fit into its destination field.

  lang, cset and fname are taken to be fixed-size char arrays
  inside UDM_AFFIXLIST: they are filled with strcpy() right
  after the structure has been zeroed.
*/
int
UdmAffixListListAdd(UDM_AFFIXLISTLIST *L,
                    const char *lang, const char *cset, const char *name)
{
  UDM_AFFIXLIST *I;

  /* sizeof does not evaluate its operand, so a NULL Item is fine here */
  if (strlen(lang) >= sizeof(L->Item->lang) ||
      strlen(cset) >= sizeof(L->Item->cset) ||
      strlen(name) >= sizeof(L->Item->fname))
    return UDM_ERROR;

  if (L->mitems <= L->nitems)
  {
    /* Grow through a temporary, so the old array survives a failure */
    size_t mitems= L->mitems + 16;
    UDM_AFFIXLIST *Item= (UDM_AFFIXLIST*) UdmRealloc((void*) L->Item,
                                                     mitems * sizeof(L->Item[0]));
    if (!Item)
      return UDM_ERROR;
    L->Item= Item;
    L->mitems= mitems;
  }
  I= &L->Item[L->nitems++];
  UdmAffixListInit(I);
  strcpy(I->lang, lang);
  strcpy(I->cset, cset);
  strcpy(I->fname, name);
  return UDM_OK;
}


/*
  Load every affix file of the list, in order.
  'flags', 'err' and 'errlen' are passed down unchanged to the
  loader of the individual file. Stops at the first failure and
  returns UDM_ERROR; returns UDM_OK when all files have been loaded.
*/
int
UdmAffixListListLoad(UDM_AFFIXLISTLIST *L, int flags, char *err, size_t errlen)
{
  size_t idx= 0;

  while (idx < L->nitems)
  {
    int failed= UdmAffixListLoad(&L->Item[idx], flags, err, errlen);
    if (failed)
      return UDM_ERROR;
    idx++;
  }
  return UDM_OK;
}


/*
  Produce the word forms which the affix rules of Al
  generate from the dictionary entry S.

  A rule is applied when its flag is present in S->flags, the
  word ends with (suffix rule) or starts with (prefix rule) the
  rule's 'find' string, and the word matches the rule's regex.
  The 'find' part is then replaced with 'repl'.

  Sl   - not used
  Al   - loaded affix list
  S    - dictionary entry (word and flags)
  Res  - receives up to 'mres' strings allocated with UdmStrdup();
         the caller has to free them
  mres - capacity of Res

  Returns the number of strings stored into Res.
  Rules whose 'find' string is longer than the word, and forms
  which would not fit into the internal 128 byte buffer,
  are skipped.
*/
size_t
UdmSpellDenormalize(UDM_SPELLLIST *Sl,
                    UDM_AFFIXLIST *Al,
                    UDM_SPELL *S,
                    char **Res, size_t mres)
{
  UDM_AFFIX *Ab, *Ae;
  size_t nres= 0;
  size_t len= strlen(S->word);

  (void) Sl;

  if (!S->flags)
    return 0;

  /* Nothing more can be stored once Res is full, so stop there */
  for (Ab= &Al->Item[0], Ae= &Al->Item[Al->nitems];
       Ab < Ae && nres < mres; Ab ++)
  {
    char wrd[128];
    char *form;
    size_t restlen;

    /* Without this check "len - findlen" below would wrap around */
    if (Ab->findlen > len)
      continue;

    /* Length of the word part which is kept as is */
    restlen= len - Ab->findlen;

    /* Both kinds of rules produce restlen + replen characters */
    if (restlen + Ab->replen >= sizeof(wrd))
      continue;

    if (Ab->type == 's' && strchr(S->flags, Ab->flag) &&
        !strcmp(S->word + restlen, Ab->find) &&
        !regexec(&Ab->regex, S->word, 0, NULL, 0))
    {
      memcpy(wrd, S->word, restlen);
      strcpy(wrd + restlen, Ab->repl);
    } else if (Ab->type == 'p' && strchr(S->flags, Ab->flag) &&
               !memcmp(S->word, Ab->find, Ab->findlen) &&
               !regexec(&Ab->regex, S->word, 0, NULL, 0))
    {
      memcpy(wrd, Ab->repl, Ab->replen);
      strcpy(wrd + Ab->replen, S->word + Ab->findlen);
    }
    else
      continue;
    
    /* A failed allocation is not reported as a result */
    if ((form= UdmStrdup(wrd)))
      Res[nres++]= form;
  }
  return nres;
}


/*
  Find all entries of the dictionary Sl whose word is equal
  to 'word'. Sl->Item has to be sorted by word.

  On success *Beg points to the first matching entry and *End
  points right after the last one. Returns the number of
  matching entries, 0 when the word is not in the dictionary.
*/
static size_t
findspellrange(UDM_SPELLLIST *Sl, const char *word,
               UDM_SPELL **Beg, UDM_SPELL **End)
{
  UDM_SPELL Find, *N, *First, *Last, *Limit;
  char noflag[]= "";

  if (!Sl->nitems)
    return 0;

  /* The key is only read by the comparator, never modified */
  Find.word= (char *) word;
  Find.flags= noflag;
  N= (UDM_SPELL*) bsearch((const void *) &Find,
                          (const void *) Sl->Item,
                          Sl->nitems, sizeof(Sl->Item[0]), cmpspell);
  if (!N)
    return 0;

  /* bsearch() returns any of the equal entries: widen to both sides */
  Limit= &Sl->Item[Sl->nitems];
  for (First= N; First > Sl->Item && !strcmp(word, First[-1].word); First--);
  for (Last= N + 1; Last < Limit && !strcmp(word, Last->word); Last++);

  *Beg= First;
  *End= Last;
  return (size_t) (Last - First);
}


/*
  Find the dictionary entries which 'word' can be derived from.

  Every affix rule of Al is applied backwards: when the word
  ends with (suffix rule) or starts with (prefix rule) the rule's
  'repl' string, that part is replaced with 'find' and the result
  is looked up in Sl. A found entry is accepted when it carries
  the rule's flag and the restored word matches the rule's regex.
  Finally the word itself is looked up, and all its dictionary
  entries are added.

  Sl   - loaded dictionary
  Al   - loaded affix list
  word - word to normalize; the dictionary and the affix rules are
         lower-cased when loaded, so a lower-case word is expected
  Res  - receives copies of the found entries; their strings
         are not copied, they are the dictionary's own
  nres - capacity of Res

  Returns the number of entries stored into Res.
  Restored words which would not fit into the internal
  128 byte buffer are skipped.
*/
size_t
UdmSpellNormalize(UDM_SPELLLIST *Sl, UDM_AFFIXLIST *Al,
                  const char *word, UDM_SPELL *Res, size_t nres)
{
  UDM_AFFIX *Ab, *Ae;
  UDM_SPELL *N, *Beg, *End;
  char wrd[128];
  size_t len= strlen(word);
  size_t cres= 0;

  for (Ab= &Al->Item[0], Ae= &Al->Item[Al->nitems]; Ab < Ae; Ab ++)
  {
    size_t rootlen;

    if (len < Ab->replen)
      continue;

    rootlen= len - Ab->replen;

    /* Both kinds of rules restore rootlen + findlen characters */
    if (rootlen + Ab->findlen >= sizeof(wrd))
      continue;

    if (Ab->type == 's' && !memcmp(word + rootlen, Ab->repl, Ab->replen))
    {
      memcpy(wrd, word, rootlen);
      strcpy(wrd + rootlen, Ab->find);
    }
    else if (Ab->type == 'p' && !memcmp(word, Ab->repl, Ab->replen))
    {
      memcpy(wrd, Ab->find, Ab->findlen);
      strcpy(wrd + Ab->findlen, word + Ab->replen);
    }
    else
      continue;

    if (!findspellrange(Sl, wrd, &Beg, &End))
      continue;

    for (N= Beg; N < End; N++)
    {
      if (N->flags[0] && strchr(N->flags, Ab->flag) &&
          !regexec(&Ab->regex, wrd, 0, NULL, 0))
      {
        if (cres < nres)
          Res[cres++]= N[0];
      }
    }
  }

  /*
    Check that the word itself is a normal form.
    The lookup uses 'word' directly, no copy is made,
    so its length is not limited here.
  */
  if (findspellrange(Sl, word, &Beg, &End))
  {
    for (N= Beg; N < End && cres < nres; N++)
      Res[cres++]= N[0];
  }
  
  return cres;
}


#ifdef UDM_SPELL_DEMO

/*
  Collect all known forms of a word.

  For every affix list and every dictionary with the same
  language and character set, the word is normalized; each
  found normal form is stored, followed by the forms which
  UdmSpellDenormalize() generates from it.

  Res receives up to 'mres' strings, the normal forms being
  allocated with UdmStrdup(). Returns the number of strings stored.
*/
static size_t UdmSpellAllForms(UDM_SPELLLISTLIST *SLL,
                            UDM_AFFIXLISTLIST *ALL,
                            const char *word,
                            char **Res, size_t mres)
{
  size_t total= 0;
  size_t ai, si, ni;

  for (ai= 0; ai < ALL->nitems; ai++)
  {
    UDM_AFFIXLIST *Al= &ALL->Item[ai];

    for (si= 0; si < SLL->nitems; si++)
    {
      UDM_SPELLLIST *Sl= &SLL->Item[si];
      UDM_SPELL Norm[128];
      size_t nnorm;

      /* Only a dictionary and an affix file of the same kind go together */
      if (strcmp(Al->lang, Sl->lang) || strcmp(Al->cset, Sl->cset))
        continue;

      nnorm= UdmSpellNormalize(Sl, Al, word, Norm, 128);
      for (ni= 0; ni < nnorm; ni++)
      {
        size_t added;

        if (mres)
        {
          *Res++= UdmStrdup(Norm[ni].word);
          total++;
          mres--;
        }
        added= UdmSpellDenormalize(Sl, Al, &Norm[ni], Res, mres);
        total+= added;
        mres-= added;
        Res+= added;
      }
    }
  }
  return total;
}


#include <locale.h>
/*
  Demo driver: loads the Russian and English dictionaries and
  affix files, then reads words from stdin, one per line, and
  prints all forms found for every word.

  Returns EXIT_SUCCESS, or EXIT_FAILURE when
  a dictionary or an affix file can't be loaded.
*/
int main(void)
{
  UDM_AFFIXLISTLIST ALL;
  UDM_SPELLLISTLIST SLL;
  size_t n;
  char str[128];
  char err[128]= "";
  char *forms[128];
  int flags= UDM_SPELL_NOPREFIX;
  int rc= EXIT_SUCCESS;

  setlocale(LC_ALL, "ru_RU.KOI8-R");

  UdmSpellListListInit(&SLL);
  UdmSpellListListAdd(&SLL, "ru", "koi8-r", "russian.dict");
  UdmSpellListListAdd(&SLL, "en", "latin1", "british.xlg");
  UdmSpellListListAdd(&SLL, "en", "latin1", "american.xlg");

  UdmAffixListListInit(&ALL);
  UdmAffixListListAdd(&ALL, "ru", "koi8-r", "russian.aff");
  UdmAffixListListAdd(&ALL, "en", "latin1", "english.aff");


  if (UdmSpellListListLoad(&SLL, err, sizeof(err)) ||  
      UdmAffixListListLoad(&ALL, flags, err, sizeof(err)))
  {
    printf("error: %s\n", err);
    rc= EXIT_FAILURE;
    goto ex;
  }
  while (fgets(str, sizeof(str), stdin))
  {
    size_t i;

    /* Cut the line at the first CR or LF */
    str[strcspn(str, "\r\n")]= '\0';

    UdmTolower(str, strlen(str));
    n= UdmSpellAllForms(&SLL, &ALL, str, forms, 128);
    /* size_t has no portable pre-C99 conversion: print as unsigned long */
    printf("total: %lu word: '%s'\n", (unsigned long) n, str);
    for (i=0 ; i < n; i++)
    {
      /* A form is NULL when its allocation has failed */
      if (!forms[i])
        continue;
      printf("[%lu] %s\n", (unsigned long) i, forms[i]);
      UdmFree(forms[i]);
    }
  }

ex:

  UdmAffixListListFree(&ALL);
  UdmSpellListListFree(&SLL);

  return rc;
}

#endif
