/**
 * @file   ngram_write_bin.c
 * 
 * <JA>
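 * @brief  N-gramをバイナリ形式でファイルに書き出す
 *
 * rev.3.5 より，読み込みの高速化を考慮して，書き出し時のバイトオーダーが
 * Big endian 固定からマシン依存に変更された．またインデックスの 24bit 化
 * および 2-gram のバックオフデータの圧縮を行うなど，ファイル形式の
 * 仕様が変更された．これにより，3.5 以降の mkbingram で生成した
 * バイナリN-gramは, 3.4.2 以前の Julius では使えない．
 * (ヘッダチェックでエラーとなる)
 *
 * なお 3.5 以降の Julius では従来のモデルも問題なく読み込める．この場合,
 * インデックスの 24bit 化とバックオフの圧縮はモデル読み込み時に
 * その都度行われる．
 *
 * バイトオーダーに関してはヘッダに記述することで，読み込み時に判定して
 * 読み込む．これにより，異なるバイトオーダーのマシンで生成した
 * バイナリN-gramでも問題なく読み込める．従来のモデルもそのまま
 * 読み込める．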
 * </JA>
 * 
 * <EN>
 * @brief  Write a whole N-gram data to a file in binary format
 *
 * From 3.5, internal format of binary N-gram has changed for using
 * machine-dependent natural byte order (previously fixed to big endian),
 * 24bit index and 2-gram backoff compression.  So, binary N-gram
 * generated by mkbingram of 3.5 and later will not work on 3.4.2 and
 * earlier versions.
 *
 * There is full upward- and cross-machine compatibility in 3.5.  Old
 * binary N-gram files can still be read directly, in which case the conversion
 * to 24bit index will be performed just after the model has been read.
 * Byte order is also determined from the header information, so
 * binary N-gram files can still be used across different machines.
 * </EN>
 * 
 * @author Akinobu LEE
 * @date   Wed Feb 16 17:23:16 2005
 *
 * $Revision: 1.4 $
 * 
 */
/*
 * Copyright (c) 1991-2007 Kawahara Lab., Kyoto University
 * Copyright (c) 2000-2005 Shikano Lab., Nara Institute of Science and Technology
 * Copyright (c) 2005-2007 Julius project team, Nagoya Institute of Technology
 * All rights reserved
 */

#include <sent/stddefs.h>
#include <sent/ngram2.h>

/** TRUE if multi-byte units must be byte-swapped by wrtfunc() while writing. */
static boolean need_swap;

/**
 * Write through wrtfunc() and make the ENCLOSING function return FALSE
 * when the write fails.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * exactly one statement and stays safe inside an unbraced if/else.
 */
#define wrt(A,B,C,D) do { if (wrtfunc(A,B,C,D) == FALSE) return FALSE; } while (0)

/** Running total of bytes written; wrtfunc() adds to it on each successful write. */
static unsigned int count;

/**
 * Reset the written-byte counter to zero.
 */
void
reset_wrt_counter(void)
{
  count = 0;
}

/**
 * Get the current value of the written-byte counter.
 *
 * @return number of bytes accumulated in the counter since it was last reset.
 */
static unsigned int
get_wrt_counter(void)
{
  return count;
}
     

/** 
 * Binary write function, with byte swapping if needed.
 * 
 * @param fp [in] file pointer
 * @param buf [in] data buffer to write
 * @param unitbyte [in] unit size in bytes
 * @param unitnum [in] number of unit to write
 */
static boolean
wrtfunc(FILE *fp, void *buf, size_t unitbyte, size_t unitnum)
{
  if (need_swap == TRUE && unitbyte != 1) {
    swap_bytes((char *)buf, unitbyte, unitnum);
  }
  if (myfwrite(buf, unitbyte, unitnum, fp) < unitnum) {
    jlog("Error: write_ngram_bin: failed to write %d bytes", unitbyte*unitnum);
    return FALSE;
  }
  if (need_swap == TRUE && unitbyte != 1) {
    swap_bytes((char *)buf, unitbyte, unitnum);
  }
  count += unitbyte * unitnum;
  return TRUE;
}

/** 
 * Write header information, with identifier string.
 * 
 * @param fp [in] file pointer
 * @param str [in] user header string (any string within BINGRAM_HDSIZE
 * bytes is allowed)
 * @param version [in] file format version id
 */
static boolean
write_header(FILE *fp, char *str)
{
  char buf[BINGRAM_HDSIZE];
  int i, totallen;

  for(i=0;i<BINGRAM_HDSIZE;i++) buf[i] = EOF;
  totallen = strlen(BINGRAM_IDSTR_V5) + 1 + strlen(BINGRAM_SIZESTR_HEAD) + strlen(BINGRAM_SIZESTR_BODY) + 1 + strlen(BINGRAM_BYTEORDER_HEAD) + strlen(BINGRAM_NATURAL_BYTEORDER) + 1 + strlen(str);
  if (totallen >= BINGRAM_HDSIZE) {
    jlog("Warning: write_bingram: header too long, last will be truncated\n");
    i = strlen(str) - (totallen - BINGRAM_HDSIZE);
    str[i] = '\0';
  }
  sprintf(buf, "%s\n%s%s %s%s\n%s", BINGRAM_IDSTR_V5, BINGRAM_SIZESTR_HEAD, BINGRAM_SIZESTR_BODY, BINGRAM_BYTEORDER_HEAD, BINGRAM_NATURAL_BYTEORDER, str);
  wrt(fp, buf, 1, BINGRAM_HDSIZE);

  return TRUE;
}

/** 
 * Write a whole N-gram data in binary format.
 *
 * Data is written in this order: the identifier header, N, direction and
 * the bigram_index_reversed flag, the total entry count of each order,
 * the word name table, the per-order tuple data, and finally the optional
 * additional arrays (bo_wt_1 and p_2).  Each optional array is preceded by
 * an int flag (1 = present, 0 = absent).  When finished, the total number
 * of bytes written is logged.
 * 
 * @param fp [in] file pointer
 * @param ndata [in] N-gram data to write
 * @param headerstr [in] user header string
 * 
 * @return TRUE on success, FALSE on failure
 */
boolean
ngram_write_bin(FILE *fp, NGRAM_INFO *ndata, char *headerstr)
{
  int i, n;
  int flag;
  int wlen;
  unsigned int len;
  NGRAM_TUPLE_INFO *t;

  reset_wrt_counter();

  /* write initial header */
  if (write_header(fp, headerstr) == FALSE) return FALSE;

  /* swap not needed any more */
  need_swap = FALSE;

  /* write some header info */
  wrt(fp, &(ndata->n), sizeof(int), 1);
  wrt(fp, &(ndata->dir), sizeof(int), 1);
  wrt(fp, &(ndata->bigram_index_reversed), sizeof(boolean), 1);

  /* write total info */
  for (n = 0; n < ndata->n; n++) {
    wrt(fp, &(ndata->d[n].totalnum), sizeof(NNID), 1);
  }

  /* unk_*, isopen, max_word_num are set after read, so need not save */

  /* write wname: total byte length first, then each name with its '\0' */
  wlen = 0;
  for (i = 0; i < ndata->max_word_num; i++) {
    wlen += strlen(ndata->wname[i]) + 1;
  }
  wrt(fp, &wlen, sizeof(int), 1);
  for (i = 0; i < ndata->max_word_num; i++) {
    wrt(fp, ndata->wname[i], 1, strlen(ndata->wname[i]) + 1); /* include \0 */
  }

  /* write N-gram */
  for (n = 0; n < ndata->n; n++) {
    t = &(ndata->d[n]);

    wrt(fp, &(t->is24bit), sizeof(boolean), 1);
    wrt(fp, &(t->ct_compaction), sizeof(boolean), 1);
    wrt(fp, &(t->bgnlistlen), sizeof(NNID), 1);
    wrt(fp, &(t->context_num), sizeof(NNID), 1);

    /* index tables are written only for 2-gram and above */
    if (n > 0) {
      if (t->is24bit) {
        wrt(fp, t->bgn_upper, sizeof(NNID_UPPER), t->bgnlistlen);
        wrt(fp, t->bgn_lower, sizeof(NNID_LOWER), t->bgnlistlen);
      } else {
        wrt(fp, t->bgn, sizeof(NNID), t->bgnlistlen);
      }
      wrt(fp, t->num, sizeof(WORD_ID), t->bgnlistlen);
      wrt(fp, t->nnid2wid, sizeof(WORD_ID), t->totalnum);
    }

    wrt(fp, t->prob, sizeof(LOGPROB), t->totalnum);

    /* back-off weights: presence flag, then the data if present */
    flag = (t->bo_wt) ? 1 : 0;
    wrt(fp, &flag, sizeof(int), 1);
    if (flag) {
      wrt(fp, t->bo_wt, sizeof(LOGPROB), t->context_num);
    }

    /* entry-to-context mapping: presence flag, then upper and lower parts */
    flag = (t->nnid2ctid_upper) ? 1 : 0;
    wrt(fp, &flag, sizeof(int), 1);
    if (flag) {
      wrt(fp, t->nnid2ctid_upper, sizeof(NNID_UPPER), t->totalnum);
      wrt(fp, t->nnid2ctid_lower, sizeof(NNID_LOWER), t->totalnum);
    }
  }

  /* write additional LR 2-gram */
  flag = (ndata->bo_wt_1) ? 1 : 0;
  wrt(fp, &flag, sizeof(int), 1);
  if (flag) {
    wrt(fp, ndata->bo_wt_1, sizeof(LOGPROB), ndata->d[0].context_num);
  }

  flag = (ndata->p_2) ? 1 : 0;
  wrt(fp, &flag, sizeof(int), 1);
  if (flag) {
    /* NOTE(review): indexes d[1]; presumably p_2 is only set when n >= 2 -- confirm */
    wrt(fp, ndata->p_2, sizeof(LOGPROB), ndata->d[1].totalnum);
  }

  len = get_wrt_counter();
  /* the counter is unsigned int: cast so the argument matches %lu */
  jlog("Stat: ngram_write_bin: wrote %lu bytes (%.1f MB)\n", (unsigned long)len, len / 1048576.0);
  return TRUE;
}
