/*
 *  main.c - Thesaurus coder for OpenOffice.org
 *
 *  Copyright (C) 2003 Giuseppe Modugno
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
 
/*  thescoder is a program that converts an input thesaurus plain-text file
 *  into two files, index file (.idx extension) and data file (.dat extension).
 *  
 *  Plain-text input file must have the following format:
 *  <characters accepted>
 *  <word1>,<syn11>,<syn12>...\n
 *  <word2>,<syn21>,<syn22>...\n
 *  ...
 *  <wordN>,<synN1>,<synN2>...\n
 *  The first line lists the accepted characters: lowercase
 *  and uppercase letters, dash ('-'), space (' ') and so on.
 *  Each subsequent line holds a word and all its synonyms, separated by
 *  commas. Spaces are optional before and after the comma separator.
 *  A word or synonym is a sequence of characters listed in the first
 *  line. Spaces at the beginning and end of words are ignored by thescoder.
 *  Note that the words need not be sorted alphabetically.
 *
 *  Index file (.idx) generated is a plain-text file with the following format:
 *  <word1>,<offset1>
 *  <word2>,<offset2>
 *  ...
 *  <wordN>,<offsetN>
 *  It's a **sorted** list of all the words and synonyms. The synonym list
 *  for every word is stored in the data file. So, in the index file, every
 *  word is followed by the offset in the data file where its synonym list is.
 *
 *  Data file is a binary file with the following format:
 *  <n1><idx_11><idx_12>...<idx_1n1>
 *  <n2><idx_21><idx_22>...<idx_2n2>
 *  ...
 *  <nM><idx_M1><idx_M2>...<idx_MnM>
 *  where <ni> is the number of synonyms of the i-th word (as in the index file),
 *        <idx_ij> is the index (as in the index file) of the j-th synonym of the i-th word.
 *  Every number is stored as a 16-bit unsigned integer in big-endian format.
 *  OpenOffice.org reads these files treating numbers as big-endian
 *  independently of the platform on which it is running. In this way, the same
 *  .idx and .dat files can be distributed for all platforms without
 *  conversion.
 */
  
#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>
#include <string.h>
#include "wordtree.h"

#define RELEASE_MAJOR 0
#define RELEASE_MINOR 5

#define VERBOSE
/* define DEBUG*/

#define APPNAME          "thescoder"
#define WORD_MAXLENGTH   128        /* Size of the word/synonym buffer. */
#define SYN_SEP          ','        /* Synonym separator character. */
#define MAX_WORDCHARS	 128        /* Maximum number of characters accepted. */

/* Useful macros. */
/* def2string(x) expands x first and then turns the result into a string
 * literal, e.g. def2string(MAX_WORDCHARS) gives "128". */
#define make_string(x)			#x
#define def2string(x)			make_string(x)

/* Print the usage message on stderr.  Wrapped in do { } while( 0 ) so that
 * "syntax_error();" is a single statement and can also be used as the
 * unbraced body of an if/else. */
#define syntax_error()      do {fprintf( stderr, APPNAME ": Syntax error\n" );\
                                fprintf( stderr, "usage: " APPNAME " <input file> <output prefix file>\n" );\
                               } while( 0 )

/* Internal function prototypes. */
int read_word( FILE *f, char *word );
int iswordchar( char c );

/* Global variables. */
/* Characters accepted in a word, as a NUL-terminated string: room for
 * MAX_WORDCHARS characters plus the terminator. */
char wordchar[MAX_WORDCHARS+1];


int
main( int argc, char *argv[] )
{
  FILE *in,*idx,*dat;	/* Input and output files. */
  WTelem *wordtree = NULL, /* Wordtree home address. */
    *syn,		/* Synonimous pointer (tree element). */
    *word;		/* Word pointer (tree element). */
  char word_str[WORD_MAXLENGTH]; /* Buffer for word. */
  int sep;		/* Synonimous separator. */
  char *filename;		/* String for output filename. */
  char *out_prefix;	/* Output file name (without extension). */
  unsigned int nw=0;	/* Word counter. */
  

  /* Read input file name (first parameter)... */
  if( !--argc ) {
    syntax_error();
    exit(1);
  }
  /* ...and open it. */
  if( !strcmp("-v",*++argv) ) {
    fprintf( stderr, APPNAME " " def2string(RELEASE_MAJOR) "." def2string(RELEASE_MINOR) "\n" );
    exit(0);
  }

  if( (in=fopen(*argv,"rt"))==NULL ) {
    fprintf( stderr, APPNAME ": Error opening %s file for reading\n", *argv );
    exit(1);
  }
  
  /* Read output prefix file name (second parameter). */
  if( !--argc ) {
    syntax_error();
    fclose(in);
    exit(1);
  }
  out_prefix = *++argv;
  
  /* If more arguments there are, it's a syntax error. */
  if( --argc ) {
    syntax_error();
    fclose(in);
    exit(1);
  }
  
  
  
#ifdef VERBOSE
  fprintf( stderr, "Reading words...\n" );
#endif
  /* Read word characters. */
  if( fscanf(in, "%" def2string(MAX_WORDCHARS) "s\n",wordchar)!=1 ) {
    fprintf( stderr, APPNAME ": error reading word characters line.\n" );
    fclose(in);
    exit(1);
  }
  
  /* Now we can read input file and add words to the tree. */
  while( (sep=read_word(in,word_str))!=-1 ) {
    /* Increment number of words. */
    ++nw;
#ifdef DEBUG
    fprintf( stderr, "Word %4u: .%s. read\n", nw, word_str );
#endif
    /* Add word to the tree (as a word). */
    if( (word=wordtree_add( word_str, &wordtree ))==NULL ) {
      fprintf( stderr, APPNAME ": out of memory\n" );
      fclose( in );
      exit(1);
    }
    if( word->isword )
      fprintf( stderr, "Warning: Two or more lines for the word .%s.\n", word->word );
    else
      word->isword = 1;
    
    /* Add synonimous to the word. */
    while( (char)sep==SYN_SEP ) {
      sep=read_word(in,word_str);
#ifdef DEBUG
      fprintf( stderr, "Adding synonimous %s to word %s\n", word_str, word->word );
#endif
      /* Add synonimous to the tree. */
      if( (syn=wordtree_add( word_str, &wordtree ))==NULL ) {
	fprintf( stderr, APPNAME ": out of memory\n" );
	fclose(in);
	exit(1);
      }
      /* Add synonimous to the word synonimous list. */
      if( synlist_add(word,syn)==NULL ) {
	fprintf( stderr, APPNAME ": out of memory\n" );
	fclose(in);
	exit(1);
      }
    }
  }
#ifdef VERBOSE
  fprintf( stderr, "%u words read from input file\n", nw );
#endif
  /* Close input file. */
  fclose( in );
  
  
  /* Open output files. */
#ifdef VERBOSE
  fprintf( stderr, "Writing output files...\n" );
#endif
  if( (filename=(char *)malloc(strlen(out_prefix)+5))==NULL ) {
    fprintf( stderr, APPNAME ": out of memory\n" );
    exit(1);
  }
  sprintf( filename, "%s.idx", out_prefix );
  if( (idx=fopen(filename,"wt"))==NULL ) {
    fprintf( stderr, APPNAME ": error opening file %s for writing.\n", filename );
    free(filename);
    exit(1);
  }
  free(filename);
  
  if( (filename=(char *)malloc(strlen(out_prefix)+5))==NULL ) {
    fprintf( stderr, APPNAME ": out of memory\n" );
    fclose(idx);
    exit(1);
  }
  sprintf( filename, "%s.dat", out_prefix );
  if( (dat=fopen(filename,"wb"))==NULL ) {
    fprintf( stderr, APPNAME ": error opening file %s for writing.\n", filename );
    free(filename);
    fclose(idx);
    exit(1);
  }
  free(filename);
  
  /* Write output files. */
  wordtree_output(wordtree, idx, dat);
  
  /* Exit without errors. */
  wordtree_free(wordtree);
  fclose( idx );
  fclose( dat );
  return(0);
}



/* WORD_MAXLENGTH is normally defined at the top of this file; the fallback
 * only keeps this function usable on its own. */
#ifndef WORD_MAXLENGTH
#define WORD_MAXLENGTH 128
#endif

int
read_word( FILE *f, char *word )
{
  /* Read from file f the next word and fill word buffer, which must have
   * room for WORD_MAXLENGTH characters (terminator included).
   * Leading whitespace is skipped and trailing spaces are cut; a word that
   * is too long for the buffer is consumed entirely but stored truncated.
   * Return -1 if there isn't another word to read (word is then the empty
   * string), otherwise the last character read, i.e. the one that ended
   * the word (EOF if the input ended right after the word). */
  char *start = word;                        /* First character of the buffer. */
  char *limit = word + WORD_MAXLENGTH - 1;   /* Keeps room for the terminator. */
  int c;

  /* Skip initial spaces. */
  while( isspace((c=fgetc(f))) )
    ;

  /* Read word characters; spaces are accepted inside a word.
   * EOF is tested before the conversion to char, so that it can never be
   * mistaken for a character listed in wordchar[]. */
  while( c!=EOF && (iswordchar((char)c) || c==' ') ) {
    if( word<limit )
      *word++=(char)c;
    c=fgetc(f);
  }

  /* Cut trailing spaces, without ever stepping before the buffer. */
  while( word>start && word[-1]==' ' )
    --word;
  /* Terminate the word. */
  *word='\0';

  /* The first character stored is never a space (whitespace was skipped
   * above), so a non-empty buffer means that a word has been read. */
  return( word>start ? c : -1 );
}


int
iswordchar( char c )
{
  /* Return 1 if c is one of the characters listed in the wordchar[] string,
   * 0 otherwise. */

  /* The terminator of wordchar[] is not an accepted character: without this
   * test a NUL byte would match the end of the string itself. */
  if( c=='\0' )
    return( 0 );

  return( strchr(wordchar,c)!=NULL ? 1 : 0 );
}
