// ---------------------------------------------------------------------------
// - Unicode.cpp                                                             -
// - standard object library - unicode functions class implementation        -
// ---------------------------------------------------------------------------
// - This program is free software;  you can redistribute it  and/or  modify -
// - it provided that this copyright notice is kept intact.                  -
// -                                                                         -
// - This program  is  distributed in  the hope  that it will be useful, but -
// - without  any  warranty;  without  even   the   implied    warranty   of -
// - merchantability or fitness for a particular purpose.  In no event shall -
// - the copyright holder be liable for any  direct, indirect, incidental or -
// - special damages arising in any way out of the use of this software.     -
// ---------------------------------------------------------------------------
// - copyright (c) 1999-2007 amaury darsch                                   -
// ---------------------------------------------------------------------------

#include "Ascii.hpp"
#include "Unicode.hpp"
#include "Utility.hpp"
#include "Exception.hpp"
#include "cucd.hpp"

namespace afnix {

  // -------------------------------------------------------------------------
  // - private section                                                       -
  // -------------------------------------------------------------------------

  // this procedure checks if a character belongs to a nil-terminated array
  // Tell whether c is one of the characters of the separator array.
  // The array is scanned up to (excluding) its nilc terminator.
  static inline bool is_csep (const char c, const char* sep) {
    for (const char* p = sep; *p != nilc; p++) {
      if (*p == c) return true;
    }
    // no separator matched
    return false;
  }

  // this procedure checks if a unicode character belongs to a nil-terminated array
  static inline bool is_qsep (const t_quad c, const t_quad* sep) {
    while (*sep != nilq) {
      if (*sep++ == c) return true;
    }
    return false;
  }

  // -------------------------------------------------------------------------
  // - public section                                                        -
  // -------------------------------------------------------------------------

  // convert a unicode character to a native character if possible

  // Narrow a unicode value to a native character.
  // Returns the low 8 bits of value; throws a "unicode-error" exception
  // when any bit above the low byte is set.
  char Unicode::tochar (const t_quad value) {
    // the value fits only if the upper 24 bits are clear
    const bool fits = ((value & 0xFFFFFF00) == 0x00000000);
    if (fits == false) {
      throw Exception ("unicode-error", "cannot convert unicode character");
    }
    // keep the low byte only
    return (char) (value & 0x000000FF);
  }

  // convert a hexadecimal character to a byte

  // Convert a hexadecimal unicode character to its byte value.
  // The value is first narrowed with tochar (which throws when it does not
  // fit in 8 bits) and the conversion itself is delegated to Ascii::htob.
  t_byte Unicode::htob (const t_quad value) {
    return Ascii::htob (Unicode::tochar (value));
  }

  // convert a native character to a unicode character

  // Widen a native character to a unicode value in the range 0x00-0xFF.
  t_quad Unicode::toquad (const char value) {
    const t_quad wide = value;
    // drop every bit above the low byte (e.g. sign extension of a signed char)
    return wide & 0x000000ff;
  }

  // convert a string representation to a character

  // Parse a string representation into a unicode value.
  // Accepted forms: a single character, a quoted form starting with a
  // single quote (parsed by Ascii::tochar), or the U+<hexa> notation.
  // Any other input throws a "format-error" exception.
  t_quad Unicode::toquad (const String& value) {
    const long slen = value.length ();
    // a one character string maps to that character
    if (slen == 1) return value[0];
    // both remaining notations need more than two characters
    if (slen > 2) {
      // quoted notation, delegated to the ascii parser
      if (value[0] == '\'') {
        return Unicode::toquad (Ascii::tochar (value));
      }
      // U+ notation: rewrite the prefix as 0x and parse as an integer
      if ((value[0] == 'U') && (value[1] == '+')) {
        String format = "0x";
        format += value.rsubstr (2);
        return (t_quad) Utility::tointeger (format);
      }
    }
    // nothing matched
    throw Exception ("format-error",
                     "illegal unicode string representation", value);
  }

  // convert a unicode character value to a string

  // Format a unicode value as a string.
  // Values that fit in the low byte are formatted by Ascii::tostring;
  // all others use the U+ notation followed by Utility::tohexa output.
  String Unicode::tostring (const t_quad value) {
    // outside the 8 bit range: use the unicode notation
    if ((value & 0xFFFFFF00) != 0x00000000) {
      String result = "U+";
      result += Utility::tohexa ((long) value);
      return result;
    }
    // 8 bit range: format as a native character
    return Ascii::tostring ((char) (value & 0x000000FF));
  }

  // convert a unicode character value to a literal string

  // Build the literal form of a unicode value.
  // Ascii values (as defined by isascii) are wrapped in single quotes;
  // any other value is the tostring form wrapped in double quotes.
  String Unicode::toliteral (const t_quad value) {
    String result;
    // non ascii: double quoted string form
    if (Unicode::isascii (value) == false) {
      result += '"';
      result += Unicode::tostring (value);
      result += '"';
      return result;
    }
    // ascii: single quoted character form
    result += '\'';
    result += (char) (value & 0x000000FF);
    result += '\'';
    return result;
  }

  // get the size of unicode array

  long Unicode::strlen (const t_quad* s) {
    // check for nil string
    if (s == nilp) return 0;
    // compute length
    long result = 0;
    while (*s++ != nilq) result++;
    return result;
  }

  // compare two strings and return true if they are equal.

  bool Unicode::strcmp (const t_quad* s1, const char* s2) {
    // normalize the string first
    t_quad* ns1 = c_ucdnrm (s1, Unicode::strlen (s1));
    t_quad* ns2 = c_ucdnrm (s2, Ascii::strlen   (s2));
    // compute string length
    long len1 = Unicode::strlen (ns1);
    long len2 = Unicode::strlen (ns2);
    // check length first
    if (len1 != len2) {
      delete [] ns1;
      delete [] ns2;
      return false;
    }
    // normal compare
    bool result = true;
    for (long i = 0; i < len1; i++) {
      if (ns1[i] != ns2[i]) {
	result = false;
	break;
      }
    }
    // clean temporaries and return
    delete [] ns1;
    delete [] ns2;
    return result;
  }

  // compare two unicode strings and return true if they are equal.

  bool Unicode::strcmp (const t_quad* s1, const t_quad* s2) {
    // normalize the string first
    t_quad* ns1 = c_ucdnrm (s1, Unicode::strlen (s1));
    t_quad* ns2 = c_ucdnrm (s2, Unicode::strlen (s2));
    // compute string length
    long len1 = Unicode::strlen (ns1);
    long len2 = Unicode::strlen (ns2);
    // check length first
    if (len1 != len2) {
      delete [] ns1;
      delete [] ns2;
      return false;
    }
    // normal compare
    bool result = true;
    for (long i = 0; i < len1; i++) {
      if (ns1[i] != ns2[i]) {
	result = false;
	break;
      }
    }
    // clean temporaries and return
    delete [] ns1;
    delete [] ns2;
    return result;
  }

  // compare two strings upto n characters

  bool Unicode::strncmp (const t_quad* s1, const char* s2, const long size) {
    // nil case compare
    if (size == 0) return true;
    // normalize the string first
    t_quad* ns1 = c_ucdnrm (s1, Unicode::strlen (s1));
    t_quad* ns2 = c_ucdnrm (s2, Ascii::strlen   (s2));
    // compute string length
    long len1 = Unicode::strlen (ns1);
    long len2 = Unicode::strlen (ns2);
    // check length first
    if ((len1 < size) || (len2 < size)) {
      delete [] ns1;
      delete [] ns2;
      return false;
    }
    // normal compare
    bool result = true;
    for (long i = 0; i < size; i++) {
      if (ns1[i] != ns2[i]) {
	result = false;
	break;
      }
    }
    // clean temporaries and return
    delete [] ns1;
    delete [] ns2;
    return result;
  }

  // compare two strings upto n characters

  bool Unicode::strncmp (const t_quad* s1, const t_quad* s2, const long size) {
    // nil case compare
    if (size == 0) return true;
    // normalize the string first
    t_quad* ns1 = c_ucdnrm (s1, Unicode::strlen (s1));
    t_quad* ns2 = c_ucdnrm (s2, Unicode::strlen (s2));
    // compute string length
    long len1 = Unicode::strlen (ns1);
    long len2 = Unicode::strlen (ns2);
    // check length first
    if ((len1 < size) || (len2 < size)) {
      delete [] ns1;
      delete [] ns2;
      return false;
    }
    // normal compare
    bool result = true;
    for (long i = 0; i < size; i++) {
      if (ns1[i] != ns2[i]) {
	result = false;
	break;
      }
    }
    // clean temporaries and return
    delete [] ns1;
    delete [] ns2;
    return result;
  }

  // compare two strings - less than operator

  bool Unicode::strlth (const t_quad* s1, const char* s2) {
    // normalize the string first
    t_quad* ns1 = c_ucdnrm (s1, Unicode::strlen (s1));
    t_quad* ns2 = c_ucdnrm (s2, Ascii::strlen   (s2));
    // save pointers for delete
    t_quad* np1 = ns1;
    t_quad* np2 = ns2;
    // compare without equal
    bool result = false;
    while (*ns1 != nilq) {
      if (*ns1 < *ns2) {
	result = true;
	break;
      }
      if (*ns1++ > *ns2++) break;
    }
    // clean temporaries and return
    delete [] np1;
    delete [] np2;
    return result;
  }

  // compare two strings - less than operator

  bool Unicode::strlth (const t_quad* s1, const t_quad* s2) {
    // normalize the string first
    t_quad* ns1 = c_ucdnrm (s1, Unicode::strlen (s1));
    t_quad* ns2 = c_ucdnrm (s2, Unicode::strlen (s2));
    // save pointers for delete
    t_quad* np1 = ns1;
    t_quad* np2 = ns2;
    // compare without equal
    bool result = false;
    while (*ns1 != nilq) {
      if (*ns1 < *ns2) {
	result = true;
	break;
      }
      if (*ns1++ > *ns2++) break;
    }
    // clean temporaries and return
    delete [] np1;
    delete [] np2;
    return result;
  }

  // compare two strings - less equal operator

  bool Unicode::strleq (const t_quad* s1, const char* s2) {
    // normalize the string first
    t_quad* ns1 = c_ucdnrm (s1, Unicode::strlen (s1));
    t_quad* ns2 = c_ucdnrm (s2, Ascii::strlen   (s2));
    // save pointers for delete
    t_quad* np1 = ns1;
    t_quad* np2 = ns2;
    // compare with equal
    bool result = true;
    while (*ns1 != nilq) {
      if (*ns1 < *ns2) break;
      if (*ns1++ > *ns2++) {
	result = false;
	break;
      }
    }
    // clean temporaries and return
    delete [] np1;
    delete [] np2;
    return result;
  }

  // compare two strings - less equal operator

  bool Unicode::strleq (const t_quad* s1, const t_quad* s2) {
    // normalize the string first
    t_quad* ns1 = c_ucdnrm (s1, Unicode::strlen (s1));
    t_quad* ns2 = c_ucdnrm (s2, Unicode::strlen (s2));
    // save pointers for delete
    t_quad* np1 = ns1;
    t_quad* np2 = ns2;
    // compare with equal
    bool result = true;
    while (*ns1 != nilq) {
      if (*ns1 < *ns2) break;
      if (*ns1++ > *ns2++) {
	result = false;
	break;
      }
    }
    // clean temporaries and return
    delete [] np1;
    delete [] np2;
    return result;
  }

  // convert an ascii character to an unicode array

  // Build a newly allocated one-character unicode string from a native
  // character. The caller owns the returned array.
  t_quad* Unicode::strmak (const char value) {
    t_quad buf[2];
    buf[0] = Unicode::toquad (value);
    buf[1] = nilq;
    // the size argument is the character count, terminator excluded, as in
    // the other strmak overloads; passing the array size (2) overran buf
    return strdup (buf, 1);
  }

  // convert a unicode character to an unicode array

  // Build a newly allocated one-character unicode string from a unicode
  // character. The caller owns the returned array.
  t_quad* Unicode::strmak (const t_quad value) {
    t_quad buf[2];
    buf[0] = value;
    buf[1] = nilq;
    // the size argument is the character count, terminator excluded, as in
    // the other strmak overloads; passing the array size (2) overran buf
    return strdup (buf, 1);
  }

  // create a unicode string from a string and a character

  // Append a native character to a unicode string.
  // The character is widened with toquad and the unicode overload does
  // the work; the caller owns the returned array.
  t_quad* Unicode::strmak (const t_quad* s, const char c) {
    return Unicode::strmak (s, Unicode::toquad (c));
  }

  // create a unicode string from a string one and a unicode character

  // Append a unicode character to a unicode string.
  // Returns a newly allocated, nilq-terminated array holding s followed
  // by c; the caller owns it.
  t_quad* Unicode::strmak (const t_quad* s, const t_quad c) {
    // room for the source, the extra character and the terminator
    const long slen = Unicode::strlen (s);
    t_quad*    tmp  = new t_quad[slen+2];
    // source first, then the character, then the terminator
    long pos = 0;
    while (pos < slen) {
      tmp[pos] = s[pos];
      pos++;
    }
    tmp[pos++] = c;
    tmp[pos]   = nilq;
    // duplicate the assembled buffer, releasing it on every path
    t_quad* result = nilp;
    try {
      result = strdup (tmp, slen+1);
    } catch (...) {
      delete [] tmp;
      throw;
    }
    delete [] tmp;
    return result;
  }

  // create a unicode string from a character and a string

  // Prepend a native character to a unicode string.
  // The character is widened with toquad and the unicode overload does
  // the work; the caller owns the returned array.
  t_quad* Unicode::strmak (const char c, const t_quad* s) {
    return Unicode::strmak (Unicode::toquad (c), s);
  }

  // create a unicode string from a unicode character and a string

  // Prepend a unicode character to a unicode string.
  // Returns a newly allocated, nilq-terminated array holding c followed
  // by s; the caller owns it.
  t_quad* Unicode::strmak (const t_quad c, const t_quad* s) {
    // room for the extra character, the source and the terminator
    const long slen = Unicode::strlen (s);
    t_quad*    tmp  = new t_quad[slen+2];
    // character first, then the source shifted by one, then the terminator
    tmp[0] = c;
    t_quad* dst = tmp + 1;
    for (long i = 0; i < slen; i++) *dst++ = s[i];
    *dst = nilq;
    // duplicate the assembled buffer, releasing it on every path
    t_quad* result = nilp;
    try {
      result = strdup (tmp, slen+1);
    } catch (...) {
      delete [] tmp;
      throw;
    }
    delete [] tmp;
    return result;
  }

  // concatenate two strings and normalize the result

  // Concatenate a unicode string and a character string.
  // Each character of s2 is widened with toquad. Returns a newly
  // allocated, nilq-terminated array owned by the caller.
  t_quad* Unicode::strmak (const t_quad* s1, const char* s2) {
    // operand lengths and total size
    const long llen = Unicode::strlen (s1);
    const long rlen = Ascii::strlen   (s2);
    const long tlen = llen + rlen;
    // assemble both parts in a temporary buffer
    t_quad* tmp = new t_quad[tlen+1];
    t_quad* dst = tmp;
    for (long i = 0; i < llen; i++) *dst++ = s1[i];
    for (long i = 0; i < rlen; i++) *dst++ = Unicode::toquad (s2[i]);
    *dst = nilq;
    // duplicate the assembled buffer, releasing it on every path
    t_quad* result = nilp;
    try {
      result = strdup (tmp, tlen);
    } catch (...) {
      delete [] tmp;
      throw;
    }
    delete [] tmp;
    return result;
  }

  // concatenate two strings and normalize the result

  // Concatenate two unicode strings.
  // Returns a newly allocated, nilq-terminated array owned by the caller.
  t_quad* Unicode::strmak (const t_quad* s1, const t_quad* s2) {
    // operand lengths and total size
    const long llen = Unicode::strlen (s1);
    const long rlen = Unicode::strlen (s2);
    const long tlen = llen + rlen;
    // assemble both parts in a temporary buffer
    t_quad* tmp = new t_quad[tlen+1];
    t_quad* dst = tmp;
    for (long i = 0; i < llen; i++) *dst++ = s1[i];
    for (long i = 0; i < rlen; i++) *dst++ = s2[i];
    *dst = nilq;
    // duplicate the assembled buffer, releasing it on every path
    t_quad* result = nilp;
    try {
      result = strdup (tmp, tlen);
    } catch (...) {
      delete [] tmp;
      throw;
    }
    delete [] tmp;
    return result;
  }

  // convert an ascii string to an unicode array

  // Duplicate a character string as a unicode array.
  // Each character is widened with toquad. The result is newly allocated,
  // nilq-terminated and owned by the caller; its length is Ascii::strlen (s).
  t_quad* Unicode::strdup (const char* s) {
    const long slen = Ascii::strlen (s);
    t_quad* result  = new t_quad[slen+1];
    // the conversion below cannot throw, so no cleanup handler is needed
    t_quad* dst = result;
    for (long i = 0; i < slen; i++) *dst++ = Unicode::toquad (s[i]);
    *dst = nilq;
    return result;
  }

  // convert a unicode string to an unicode array

  // Duplicate a unicode string.
  // The result is newly allocated, nilq-terminated and owned by the
  // caller; a nil source (length 0 per strlen) gives an empty string.
  t_quad* Unicode::strdup (const t_quad* s) {
    const long slen = Unicode::strlen (s);
    t_quad* result  = new t_quad[slen+1];
    // plain element copy, which cannot throw
    t_quad* dst = result;
    for (long i = 0; i < slen; i++) *dst++ = s[i];
    *dst = nilq;
    return result;
  }

  // convert a character buffer to an unicode array by size

  // Duplicate the first size characters of a character buffer as a
  // unicode array. Each character is widened with toquad and a nilq
  // terminator is appended; the caller owns the size+1 element result.
  // Only s[0] .. s[size-1] are read, so the source does not need to be
  // terminated (the previous loop also read s[size] before overwriting it).
  t_quad* Unicode::strdup (const char* s, const long size) {
    // create a new quad array
    t_quad* result = new t_quad[size+1];
    // convert the source buffer
    try {
      for (long i = 0; i < size; i++) result[i] = Unicode::toquad (s[i]);
      result[size] = nilq;
      return result;
    } catch (...) {
      delete [] result;
      throw;
    }
  }

  // convert a unicode string to an unicode array by size

  // Duplicate the first size characters of a unicode buffer.
  // A nilq terminator is appended; the caller owns the size+1 element
  // result. Only s[0] .. s[size-1] are read, so the source does not need
  // to be terminated (the previous loop also read s[size] before
  // overwriting it).
  t_quad* Unicode::strdup (const t_quad* s, const long size) {
    // create a new quad array
    t_quad* result = new t_quad[size+1];
    // copy the source buffer
    try {
      for (long i = 0; i < size; i++) result[i] = s[i];
      result[size] = nilq;
      return result;
    } catch (...) {
      delete [] result;
      throw;
    }
  }

  // normalize a string by performing a normal form decomposition

  // Normalize a unicode string by handing it, with its length, to c_ucdnrm.
  // A nil string is passed with length 0.
  t_quad* Unicode::strnrm (const t_quad* s) {
    return c_ucdnrm (s, Unicode::strlen (s));
  }

  // remove the leading blank and tab and return a new string

  // Return a new unicode string made from s without its leading blanks
  // and tabs. A nil source is passed unchanged to strdup.
  t_quad* Unicode::stripl (const char* s) {
    const char* start = s;
    if (start != nilp) {
      // advance to the first character that is neither blank nor tab
      for (; *start != nilc; start++) {
        if ((*start != blkc) && (*start != tabc)) break;
      }
    }
    return Unicode::strdup (start);
  }

  // remove the leading separators and return a new string

  // Return a new unicode string made from s without its leading
  // separator characters. A nil source is passed unchanged to strdup.
  t_quad* Unicode::stripl (const char* s, const char* sep) {
    const char* start = s;
    if (start != nilp) {
      // advance to the first character that is not a separator
      for (; *start != nilc; start++) {
        if (is_csep (*start, sep) == false) break;
      }
    }
    return Unicode::strdup (start);
  }
 
  // remove the leading blank and tab and return a new string

  // Return a copy of the unicode string s without its leading blanks
  // and tabs. A nil source is passed unchanged to strdup.
  t_quad* Unicode::stripl (const t_quad* s) {
    const t_quad* start = s;
    if (start != nilp) {
      // advance to the first character that is neither blank nor tab
      for (; *start != nilq; start++) {
        if ((*start != blkq) && (*start != tabq)) break;
      }
    }
    return Unicode::strdup (start);
  }

  // remove the leading separators and return a new string

  // Return a copy of the unicode string s without its leading separator
  // characters. A nil source is passed unchanged to strdup.
  t_quad* Unicode::stripl (const t_quad* s, const t_quad* sep) {
    const t_quad* start = s;
    if (start != nilp) {
      // advance to the first character that is not a separator
      for (; *start != nilq; start++) {
        if (is_qsep (*start, sep) == false) break;
      }
    }
    return Unicode::strdup (start);
  }

  // remove the trailing blank and return a new string

  // Return a new unicode string made from s without its trailing blanks
  // and tabs. An empty source gives c_ucdnil (); a source made only of
  // blanks and tabs gives an empty string.
  // The trimming is bounded by the working copy itself: the previous loop
  // compared a pointer into the copy with the source pointer, so it could
  // never stop on that test and underran the copy on all-blank input.
  t_quad* Unicode::stripr (const char* s) {
    // get the length and check
    long len = Ascii::strlen (s);
    if (len == 0) return c_ucdnil ();
    // trim a private copy from the end
    char* buf = Ascii::strdup (s);
    while ((len > 0) && ((buf[len-1] == blkc) || (buf[len-1] == tabc))) {
      buf[--len] = nilc;
    }
    // now copy and return
    t_quad* result = Unicode::strdup (buf);
    delete [] buf;
    return result;
  }

  // remove the trailing separators and return a new string

  // Return a new unicode string made from s without its trailing
  // separator characters. An empty source gives c_ucdnil (); a source
  // made only of separators gives an empty string.
  // The trimming is bounded by the working copy itself: the previous loop
  // compared a pointer into the copy with the source pointer, so it could
  // never stop on that test and underran the copy on all-separator input.
  t_quad* Unicode::stripr (const char* s, const char* sep) {
    // get the length and check
    long len = Ascii::strlen (s);
    if (len == 0) return c_ucdnil ();
    // trim a private copy from the end
    char* buf = Ascii::strdup (s);
    while ((len > 0) && (is_csep (buf[len-1], sep) == true)) {
      buf[--len] = nilc;
    }
    // now copy and return
    t_quad* result = Unicode::strdup (buf);
    delete [] buf;
    return result;
  }

  // remove the trailing blank and return a new string

  // Return a copy of the unicode string s without its trailing blanks
  // and tabs. An empty source gives c_ucdnil (); a source made only of
  // blanks and tabs gives an empty string.
  // The trimming is bounded by the working copy itself: the previous loop
  // compared a pointer into the copy with the source pointer, so it could
  // never stop on that test and underran the copy on all-blank input.
  t_quad* Unicode::stripr (const t_quad* s) {
    // get the length and check
    long len = Unicode::strlen (s);
    if (len == 0) return c_ucdnil ();
    // trim a private copy from the end
    t_quad* buf = Unicode::strdup (s);
    while ((len > 0) && ((buf[len-1] == blkq) || (buf[len-1] == tabq))) {
      buf[--len] = nilq;
    }
    // now copy and return
    t_quad* result = Unicode::strdup (buf);
    delete [] buf;
    return result;
  }

  // remove the trailing separators and return a new string

  // Return a copy of the unicode string s without its trailing separator
  // characters. An empty source gives c_ucdnil (); a source made only of
  // separators gives an empty string.
  // The trimming is bounded by the working copy itself: the previous loop
  // compared a pointer into the copy with the source pointer, so it could
  // never stop on that test and underran the copy on all-separator input.
  t_quad* Unicode::stripr (const t_quad* s, const t_quad* sep) {
    // get the length and check
    long len = Unicode::strlen (s);
    if (len == 0) return c_ucdnil ();
    // trim a private copy from the end
    t_quad* buf = Unicode::strdup (s);
    while ((len > 0) && (is_qsep (buf[len-1], sep) == true)) {
      buf[--len] = nilq;
    }
    // now copy and return
    t_quad* result = Unicode::strdup (buf);
    delete [] buf;
    return result;
  }

  // convert an ascii string to lower case

  // Convert a character string to a lower case unicode string.
  // Each character is widened with toquad and mapped by c_ucdtol, which
  // reports how many quads it produced. A nil source gives c_ucdnil ().
  // The caller owns the returned array.
  t_quad* Unicode::tolower (const char* s) {
    if (s == nilp) return c_ucdnil ();
    const long slen = Ascii::strlen (s);
    // working buffer sized for UCD_LCM_MAX quads per source character
    t_quad* work = new t_quad[slen * UCD_LCM_MAX + 1];
    t_quad* wptr = work;
    // per character mapping result
    t_quad  cmap[UCD_LCM_MAX];
    for (long i = 0; i < slen; i++) {
      const long cnt = c_ucdtol (cmap, Unicode::toquad (s[i]));
      for (long j = 0; j < cnt; j++) *wptr++ = cmap[j];
    }
    // terminate, then return an exact size copy
    *wptr = nilq;
    t_quad* result = Unicode::strdup (work);
    delete [] work;
    return result;
  }

  // convert a unicode string to lower case

  // Convert a unicode string to lower case.
  // Each character is mapped by c_ucdtol, which reports how many quads it
  // produced. A nil source gives c_ucdnil (). The caller owns the
  // returned array.
  t_quad* Unicode::tolower (const t_quad* s) {
    if (s == nilp) return c_ucdnil ();
    const long slen = Unicode::strlen (s);
    // working buffer sized for UCD_LCM_MAX quads per source character
    t_quad* work = new t_quad[slen * UCD_LCM_MAX + 1];
    t_quad* wptr = work;
    // per character mapping result
    t_quad  cmap[UCD_LCM_MAX];
    for (long i = 0; i < slen; i++) {
      const long cnt = c_ucdtol (cmap, s[i]);
      for (long j = 0; j < cnt; j++) *wptr++ = cmap[j];
    }
    // terminate, then return an exact size copy
    *wptr = nilq;
    t_quad* result = Unicode::strdup (work);
    delete [] work;
    return result;
  }

  // convert an ascii string to upper case

  // Convert a character string to an upper case unicode string.
  // Each character is widened with toquad and mapped by c_ucdtou, which
  // reports how many quads it produced. A nil source gives c_ucdnil ().
  // The caller owns the returned array.
  t_quad* Unicode::toupper (const char* s) {
    if (s == nilp) return c_ucdnil ();
    const long slen = Ascii::strlen (s);
    // working buffer sized for UCD_UCM_MAX quads per source character
    t_quad* work = new t_quad[slen * UCD_UCM_MAX + 1];
    t_quad* wptr = work;
    // per character mapping result
    t_quad  cmap[UCD_UCM_MAX];
    for (long i = 0; i < slen; i++) {
      const long cnt = c_ucdtou (cmap, Unicode::toquad (s[i]));
      for (long j = 0; j < cnt; j++) *wptr++ = cmap[j];
    }
    // terminate, then return an exact size copy
    *wptr = nilq;
    t_quad* result = Unicode::strdup (work);
    delete [] work;
    return result;
  }

  // convert an unicode string to upper case

  // Convert a unicode string to upper case.
  // Each character is mapped by c_ucdtou, which reports how many quads it
  // produced. A nil source gives c_ucdnil (). The caller owns the
  // returned array.
  t_quad* Unicode::toupper (const t_quad* s) {
    if (s == nilp) return c_ucdnil ();
    const long slen = Unicode::strlen (s);
    // working buffer sized for UCD_UCM_MAX quads per source character
    t_quad* work = new t_quad[slen * UCD_UCM_MAX + 1];
    t_quad* wptr = work;
    // per character mapping result
    t_quad  cmap[UCD_UCM_MAX];
    for (long i = 0; i < slen; i++) {
      const long cnt = c_ucdtou (cmap, s[i]);
      for (long j = 0; j < cnt; j++) *wptr++ = cmap[j];
    }
    // terminate, then return an exact size copy
    *wptr = nilq;
    t_quad* result = Unicode::strdup (work);
    delete [] work;
    return result;
  }

  // return true if the character is a lower character

  bool Unicode::islower (const t_quad code) {
    // get the ucd record and do nothing if it does not exist
    const ucd_s* ucd = c_getucd (code);
    if (ucd == nilp) return false;
    // check for lower case code
    return (ucd->d_pgcv == UCD_GCV_LL);
  }

  // return true if the character is an upper character

  bool Unicode::isupper (const t_quad code) {
    // get the ucd record and do nothing if it does not exist
    const ucd_s* ucd = c_getucd (code);
    if (ucd == nilp) return false;
    // check for lower case code
    return (ucd->d_pgcv == UCD_GCV_LU);
  }

  // return true if the unicode character is a letter

  bool Unicode::isletter (const t_quad code) {
    // get the ucd record and do nothing if it does not exist
    const ucd_s* ucd = c_getucd (code);
    if (ucd == nilp) return false;
    // get the gcv byte and check
    t_byte gcv = ucd->d_pgcv;
    if (gcv == UCD_GCV_LU) return true;
    if (gcv == UCD_GCV_LL) return true;
    if (gcv == UCD_GCV_LT) return true;
    if (gcv == UCD_GCV_LM) return true;
    if (gcv == UCD_GCV_LO) return true;
    return false;
  }

  // return true if the unicode character is a digit

  bool Unicode::isdigit (const t_quad code) {
    // get the ucd record and do nothing if it does not exist
    const ucd_s* ucd = c_getucd (code);
    if (ucd == nilp) return false;
    // get the gcv byte and check
    t_byte gcv = ucd->d_pgcv;
    if (gcv == UCD_GCV_ND) return true;    
    return false;
  }

  // return true if the unicode character is an alpha-numeric character

  // True when code is alpha-numeric, that is a digit (isdigit) or a
  // letter (isletter). The digit test is made first.
  bool Unicode::isalpha (const t_quad code) {
    return (Unicode::isdigit (code) == true) ||
           (Unicode::isletter (code) == true);
  }

  // return true if the unicode character is a blank or tab
  
  // True when code is the blank (blkq) or the tab (tabq) character.
  bool Unicode::isblank (const t_quad code) {
    return (code == blkq) || (code == tabq);
  }

  // return true if the unicode character is an ascii character
  
  // True when code fits in 7 bits (0x00 to 0x7F).
  bool Unicode::isascii (const t_quad code) {
    return ((code & 0xFFFFFF80) == 0x00000000);
  }

  // return true if the unicode character is a latin character
  
  // True when code fits in 8 bits (0x00 to 0xFF).
  bool Unicode::islatin (const t_quad code) {
    return ((code & 0xFFFFFF00) == 0x00000000);
  }

  // return true if the unicode character is an hexadecimal character
  
  // True when code is a hexadecimal digit: '0'-'9', 'a'-'f' or 'A'-'F'.
  bool Unicode::ishexa (const t_quad code) {
    const bool dec = (code >= (t_quad) '0') && (code <= (t_quad) '9');
    const bool low = (code >= (t_quad) 'a') && (code <= (t_quad) 'f');
    const bool upp = (code >= (t_quad) 'A') && (code <= (t_quad) 'F');
    return dec || low || upp;
  }

  // return true if the character is an afnix constituent

  // True when code is an afnix constituent: an alpha-numeric character
  // (isalpha) or one of the characters . + - * / ! = > < ?
  bool Unicode::isafnix (const t_quad code) {
    // alpha-numeric characters are always constituents
    if (isalpha (code) == true) return true;
    // the other accepted constituents
    static const char extra[] = ".+-*/!=><?";
    for (const char* p = extra; *p != '\0'; p++) {
      if (code == (t_quad) *p) return true;
    }
    return false;
  }

  // return true if the unicode character is a valid terminal character

  bool Unicode::isterm (const t_quad code) {
    // get the ucd record and do nothing if it does not exist
    const ucd_s* ucd = c_getucd (code);
    if (ucd == nilp) return false;
    // get the gcv byte
    t_byte gcv = ucd->d_pgcv;
    // check for letter
    if (gcv == UCD_GCV_LU) return true;
    if (gcv == UCD_GCV_LL) return true;
    if (gcv == UCD_GCV_LT) return true;
    if (gcv == UCD_GCV_LM) return true;
    if (gcv == UCD_GCV_LO) return true;
    // check for marking
    if (gcv == UCD_GCV_MN) return true;
    if (gcv == UCD_GCV_MC) return true;
    if (gcv == UCD_GCV_ME) return true;
    // check for number
    if (gcv == UCD_GCV_ND) return true;
    if (gcv == UCD_GCV_NL) return true;
    if (gcv == UCD_GCV_NO) return true;
    // check for punctuation
    if (gcv == UCD_GCV_PC) return true;
    if (gcv == UCD_GCV_PD) return true;
    if (gcv == UCD_GCV_PS) return true;
    if (gcv == UCD_GCV_PE) return true;
    if (gcv == UCD_GCV_PI) return true;
    if (gcv == UCD_GCV_PF) return true;
    if (gcv == UCD_GCV_PO) return true;
    // check for symbol
    if (gcv == UCD_GCV_SM) return true;
    if (gcv == UCD_GCV_SC) return true;
    if (gcv == UCD_GCV_SK) return true;
    if (gcv == UCD_GCV_SO) return true;
    // check for spacing
    if (gcv == UCD_GCV_ZS) return true;
    // not for a terminal
    return false;
  }


  // return true if the character is a word constituent

  bool Unicode::iswcc (const t_quad code) {
    // get the ucd record and do nothing if it does not exist
    const ucd_s* ucd = c_getucd (code);
    if (ucd == nilp) return false;
    // get the gcv byte
    t_byte gcv = ucd->d_pgcv;
    // check for letter
    if (gcv == UCD_GCV_LU) return true;
    if (gcv == UCD_GCV_LL) return true;
    if (gcv == UCD_GCV_LT) return true;
    if (gcv == UCD_GCV_LM) return true;
    if (gcv == UCD_GCV_LO) return true;
    // check for marking
    if (gcv == UCD_GCV_MN) return true;
    if (gcv == UCD_GCV_MC) return true;
    if (gcv == UCD_GCV_ME) return true;
    // check for number
    if (gcv == UCD_GCV_ND) return true;
    if (gcv == UCD_GCV_NL) return true;
    if (gcv == UCD_GCV_NO) return true;
    // not for a terminal
    return false;
  }

  // return true if the character is a non combining character

  // True when code is a non combining character, as reported by c_ucdncc.
  bool Unicode::isncc (const t_quad code) {
    const bool result = c_ucdncc (code);
    return result;
  }

  // encode a unicode character in UTF-8

  char* Unicode::encode (const t_quad c) {
    return Unicode::encode (&c, 1);
  }

  // encode a unicode string in UTF-8

  // Encode a nilq-terminated unicode string in UTF-8.
  // The length is taken with strlen and the sized encoder does the work.
  char* Unicode::encode (const t_quad* s) {
    return encode (s, Unicode::strlen (s));
  }

  // encode a unicode string in UTF-8
  
  // Encode the first size characters of a unicode array in UTF-8.
  // Returns a newly allocated, nilc-terminated byte string owned by the
  // caller, or nilp when size is not positive. Sequences of 1 to 6 bytes
  // are produced depending on the value; a value of 0x80000000 or above
  // throws an "encode-error" exception. The output buffer is released
  // before the exception is thrown (it used to leak).
  char* Unicode::encode (const t_quad* s, const long size) {
    // check the size
    if (size <= 0) return nilp;
    // allocate the character buffer: at most 6 bytes per character
    // plus the terminator
    char* buf = new char[size*6+1];
    long  idx = 0;
    // loop in the buffer and encode
    for (long i = 0; i < size; i++) {
      // get the character value
      const t_quad value = s[i];
      // encode the value
      if (value < 0x00000080) {
        // 1 byte: 7 bits
        buf[idx++] = (char) value;
      } else if (value < 0x00000800) {
        // 2 bytes: 11 bits
        buf[idx++] = (char) (0x000000C0 | ((value >> 6)  & 0x0000001F));
        buf[idx++] = (char) (0x00000080 |  (value        & 0x0000003F));
      } else if (value < 0x00010000) {
        // 3 bytes: 16 bits
        buf[idx++] = (char) (0x000000E0 | ((value >> 12) & 0x0000000F));
        buf[idx++] = (char) (0x00000080 | ((value >> 6)  & 0x0000003F));
        buf[idx++] = (char) (0x00000080 |  (value        & 0x0000003F));
      } else if (value < 0x00200000) {
        // 4 bytes: 21 bits
        buf[idx++] = (char) (0x000000F0 | ((value >> 18) & 0x00000007));
        buf[idx++] = (char) (0x00000080 | ((value >> 12) & 0x0000003F));
        buf[idx++] = (char) (0x00000080 | ((value >> 6)  & 0x0000003F));
        buf[idx++] = (char) (0x00000080 |  (value        & 0x0000003F));
      } else if (value < 0x04000000) {
        // 5 bytes: 26 bits
        buf[idx++] = (char) (0x000000F8 | ((value >> 24) & 0x00000003));
        buf[idx++] = (char) (0x00000080 | ((value >> 18) & 0x0000003F));
        buf[idx++] = (char) (0x00000080 | ((value >> 12) & 0x0000003F));
        buf[idx++] = (char) (0x00000080 | ((value >> 6)  & 0x0000003F));
        buf[idx++] = (char) (0x00000080 |  (value        & 0x0000003F));
      } else if (value < 0x80000000) {
        // 6 bytes: 31 bits
        buf[idx++] = (char) (0x000000FC | ((value >> 30) & 0x00000001));
        buf[idx++] = (char) (0x00000080 | ((value >> 24) & 0x0000003F));
        buf[idx++] = (char) (0x00000080 | ((value >> 18) & 0x0000003F));
        buf[idx++] = (char) (0x00000080 | ((value >> 12) & 0x0000003F));
        buf[idx++] = (char) (0x00000080 | ((value >> 6)  & 0x0000003F));
        buf[idx++] = (char) (0x00000080 |  (value        & 0x0000003F));
      } else {
        // release the partial output before reporting the error
        delete [] buf;
        throw Exception ("encode-error", 
                         "invalid character to encode in utf-8 mode");
      }
    }
    // add the nil character
    buf[idx] = nilc;
    // here we are
    return buf;
  }

  // decode a unicode buffer

  // Decode a nilc-terminated UTF-8 buffer into a unicode array.
  // The length is taken with Ascii::strlen and the sized decoder does
  // the work.
  t_quad* Unicode::decode (const char* s) {
    return Unicode::decode (s, Ascii::strlen (s));
  }

  // decode a unicode buffer by size

  t_quad* Unicode::decode (const char* s, const long size) {
    // check the size
    if (size <= 0) return nilp;
    // allocate the quad buffer
    t_quad* buf = new t_quad[size+1];
    long    idx = 0;
    for (long i = 0; i < size; i++) {
      // read first byte
      t_byte b1 = (t_byte) s[i];
      // 1 byte mode
      if (b1 < 0x80) {
	buf[idx++] = (t_quad) b1;
	continue;
      }
      // 2 bytes mode
      if (b1 < 0xE0) {
	buf[idx] = ((t_quad) (b1 & 0x3F)) << 6;
	if (++i >= size) {
	  throw Exception ("decode-error", 
			   "invalid eos while reading utf-8 sequence");
	}
	t_byte b2 = (t_byte) s[i];
	if ((b2 & 0x80) != 0x80) {
	  throw Exception ("decode-error", 
			   "invalid byte while reading utf-8 sequence");
	}
	buf[idx] |= (t_quad) (b2 & 0x3F);
	if (buf[idx++] < 0x00000080) {
	  throw Exception ("decode-error", "invalid long utf-8 sequence");
	}
	continue;
      }
      // 3 bytes mode
      if (b1 < 0xF0) {
	buf[idx] = ((t_quad) (b1 & 0x0F)) << 12;
	if (++i >= size) {
	  throw Exception ("decode-error", 
			   "invalid eos while reading utf-8 sequence");
	}
	t_byte b2 = (t_byte) s[i];
	if ((b2 & 0x80) != 0x80) {
	  throw Exception ("decode-error", 
			   "invalid byte while reading utf-8 sequence");
	}
	buf[idx] |= ((t_quad) (b2 & 0x3F)) << 6;
	if (++i >= size) {
	  throw Exception ("decode-error", 
			   "invalid eos while reading utf-8 sequence");
	}
	t_byte b3 = (t_byte) s[i];
	if ((b3 & 0x80) != 0x80) {
	  throw Exception ("decode-error", 
			   "invalid byte while reading utf-8 sequence");
	}
	buf[idx] |= (t_quad) (b3 & 0x3F);
	if (buf[idx++] < 0x00000800) {
	  throw Exception ("decode-error", "invalid long utf-8 sequence");
	}
	continue;
      }
      // 4 bytes mode
      if (b1 < 0xF8) {
	buf[idx] = ((t_quad) (b1 & 0x07)) << 18;
	if (++i >= size) {
	  throw Exception ("decode-error", 
			   "invalid eos while reading utf-8 sequence");
	}
	t_byte b2 = (t_byte) s[i];
	if ((b2 & 0x80) != 0x80) {
	  throw Exception ("decode-error", 
			   "invalid byte while reading utf-8 sequence");
	}
	buf[idx] |= ((t_quad) (b2 & 0x3F)) << 12;
	if (++i >= size) {
	  throw Exception ("decode-error", 
			   "invalid eos while reading utf-8 sequence");
	}
	t_byte b3 = (t_byte) s[i];
	if ((b3 & 0x80) != 0x80) {
	  throw Exception ("decode-error", 
			   "invalid byte while reading utf-8 sequence");
	}
	buf[idx] |= (t_quad) (b3 & 0x3F) << 6;
	if (++i >= size) {
	  throw Exception ("decode-error", 
			   "invalid eos while reading utf-8 sequence");
	}
	t_byte b4 = (t_byte) s[i];
	if ((b4 & 0x80) != 0x80) {
	  throw Exception ("decode-error", 
			   "invalid byte while reading utf-8 sequence");
	}
	buf[idx] |= (t_quad) (b4 & 0x3F);
	if (buf[idx++] < 0x00010000) {
	  throw Exception ("decode-error", "invalid long utf-8 sequence");
	}
	continue;
      }
      // 5 bytes mode
      if (b1 < 0xFC) {
	buf[idx] = ((t_quad) (b1 & 0x03)) << 24;
	if (++i >= size) {
	  throw Exception ("decode-error", 
			   "invalid eos while reading utf-8 sequence");
	}
	t_byte b2 = (t_byte) s[i];
	if ((b2 & 0x80) != 0x80) {
	  throw Exception ("decode-error", 
			   "invalid byte while reading utf-8 sequence");
	}
	buf[idx] |= ((t_quad) (b2 & 0x3F)) << 18;
	if (++i >= size) {
	  throw Exception ("decode-error", 
			   "invalid eos while reading utf-8 sequence");
	}
	t_byte b3 = (t_byte) s[i];
	if ((b3 & 0x80) != 0x80) {
	  throw Exception ("decode-error", 
			   "invalid byte while reading utf-8 sequence");
	}
	buf[idx] |= (t_quad) (b3 & 0x3F) << 12;
	if (++i >= size) {
	  throw Exception ("decode-error", 
			   "invalid eos while reading utf-8 sequence");
	}
	t_byte b4 = (t_byte) s[i];
	if ((b4 & 0x80) != 0x80) {
	  throw Exception ("decode-error", 
			   "invalid byte while reading utf-8 sequence");
	}
	buf[idx] |= (t_quad) (b4 & 0x3F) << 6;
	if (++i >= size) {
	  throw Exception ("decode-error", 
			   "invalid eos while reading utf-8 sequence");
	}
	t_byte b5 = (t_byte) s[i];
	if ((b5 & 0x80) != 0x80) {
	  throw Exception ("decode-error", 
			   "invalid byte while reading utf-8 sequence");
	}
	buf[idx] |= (t_quad) (b5 & 0x3F);
	if (buf[idx++] < 0x00200000) {
	  throw Exception ("decode-error", "invalid long utf-8 sequence");
	}
	continue;
      }
      // 6 bytes mode
      if (b1 < 0xFE) {
	buf[idx] = ((t_quad) (b1 & 0x01)) << 30;
	if (++i >= size) {
	  throw Exception ("decode-error", 
			   "invalid eos while reading utf-8 sequence");
	}
	t_byte b2 = (t_byte) s[i];
	if ((b2 & 0x80) != 0x80) {
	  throw Exception ("decode-error", 
			   "invalid byte while reading utf-8 sequence");
	}
	buf[idx] |= ((t_quad) (b2 & 0x3F)) << 24;
	if (++i >= size) {
	  throw Exception ("decode-error", 
			   "invalid eos while reading utf-8 sequence");
	}
	t_byte b3 = (t_byte) s[i];
	if ((b3 & 0x80) != 0x80) {
	  throw Exception ("decode-error", 
			   "invalid byte while reading utf-8 sequence");
	}
	buf[idx] |= (t_quad) (b3 & 0x3F) << 18;
	if (++i >= size) {
	  throw Exception ("decode-error", 
			   "invalid eos while reading utf-8 sequence");
	}
	t_byte b4 = (t_byte) s[i];
	if ((b4 & 0x80) != 0x80) {
	  throw Exception ("decode-error", 
			   "invalid byte while reading utf-8 sequence");
	}
	buf[idx] |= (t_quad) (b4 & 0x3F) << 12;
	if (++i >= size) {
	  throw Exception ("decode-error", 
			   "invalid eos while reading utf-8 sequence");
	}
	t_byte b5 = (t_byte) s[i];
	if ((b5 & 0x80) != 0x80) {
	  throw Exception ("decode-error", 
			   "invalid byte while reading utf-8 sequence");
	}
	buf[idx] |= (t_quad) (b5 & 0x3F) << 6;
	if (++i >= size) {
	  throw Exception ("decode-error", 
			   "invalid eos while reading utf-8 sequence");
	}
	t_byte b6 = (t_byte) s[i];
	if ((b6 & 0x80) != 0x80) {
	  throw Exception ("decode-error", 
			   "invalid byte while reading utf-8 sequence");
	}
	buf[idx] |= (t_quad) (b6 & 0x3F);
	if (buf[idx++] < 0x04000000) {
	  throw Exception ("decode-error", "invalid long utf-8 sequence");
	}
	continue;
      }
      throw Exception ("decode-error", "invalid utf-8 character sequence");
    }
    // add the nil quad
    buf[idx++] = nilq;
    // here we are
    return buf;
  }

  // get the non-combining length of a unicode string

  long Unicode::ncclen (const t_quad* s) {
    // check for nil string
    if (s == nilp) return 0;
    // compute length by counting only the grapheme
    long result = 0;
    while (*s != nilq) {
      if (c_ucdncc (*s++) == true) result++;
    }
    return result;
  }
}
