/*
 * Written by Bastien Chevreux (BaCh)
 *
 * Copyright (C) 2003 and later by Bastien Chevreux
 *
 * All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the 
 * Free Software Foundation, Inc., 
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
 * 
 */

// Version-control identification string ("$Id$" keyword placeholder).
// Left out when the translation unit is processed with 'lint' defined.
#ifndef lint
static char vcid[] = "$Id$";
#endif /* lint */

// for boost::trim, split
#include <boost/algorithm/string.hpp>

#include "mira/gff_parse.H"
#include "util/progressindic.H"

// Flat table of (GFF feature name, GAP4 tag identifier) pairs: even indices
//  hold the GFF name, the following odd index holds the GAP4 identifier.
// Do not delete the last (empty) entry in the field below: it is the
//  sentinel that terminates the loop in GFFParse::staticInitialiser()
const char * GFFParse::GFFP_featuretranslations[]= {
  "MFSM","MFSM",

  "binding_site","Fm-b",
  "CDS","FCDS",
  "contig","Fctg",                  // new for gap4!
  "exon","Fexn",
  "gap","Fgap",                     // new for gap4!
  "gene","Fgen",
  "long_terminal_repeat","FLTR",
  "mobile_element","Fmel",          // new for gap4!
  "mRNA","FmRN",
  "ncRNA","FncR",                   // new for gap4!
  "origin_of_replication","Frpo",
  "pseudogene","Fpgn",              // new for gap4!
  "pseudogenic_exon","Fpxn",        // new for gap4!
  "pseudogenic_region","Fprg",      // new for gap4!
  "pseudotRNA","FptR",              // new for gap4!
  "recombination_feature","Fm-r",
  "region","Freg",                  // new for gap4!
  "regulatory_region","Frrg",       // new for gap4!
  "repeat_region","Frpr",
  "rRNA","FrRN",
  "sequence_difference","Fm-d",
  "tRNA","FtRN",
  "tmRNA","FtmR",                   // new for gap4!
  "processed_transcript","Fptr",    // new for gap4
  ""
};

// Lookup maps filled once by staticInitialiser():
//  GFF feature name -> GAP4 identifier, and the reverse direction.
GFFParse::strstrmap GFFParse::GFFP_mapgff2gap4;
GFFParse::strstrmap GFFParse::GFFP_mapgap42gff;

// keep this last: its initialiser calls staticInitialiser(), which writes
//  into the two maps defined above, so they must be defined (and therefore
//  constructed) earlier in this translation unit
const bool GFFParse::GFFP_staticfeaturesinit=GFFParse::staticInitialiser();


// Builds the two static translation maps from GFFP_featuretranslations.
// The table is read pairwise (GFF name, GAP4 identifier) until the empty
//  string sentinel is reached.
// Always returns true; the value only exists so that the function can be
//  used as initialiser of the static GFFP_staticfeaturesinit.
bool GFFParse::staticInitialiser()
{
  FUNCSTART("bool GFFParse::staticInitialiser()");

  for(const char * const * pairptr=GFFP_featuretranslations;
      **pairptr != 0;
      pairptr+=2) {
    const char * const gffname=pairptr[0];
    const char * const gap4name=pairptr[1];

    GFFParse::GFFP_mapgff2gap4[gffname]=gap4name;
    GFFParse::GFFP_mapgap42gff[gap4name]=gffname;
  }

  FUNCEND();
  return true;
}


// The body of this function is pulled in textually from
//  stdinc/foolcompiler.C; what it contains is not visible in this file.
void GFFParse::foolCompiler()
{
#include "stdinc/foolcompiler.C"
}

// Plain vanilla constructor
//  Delegates all set-up to zeroVars() followed by init().
GFFParse::GFFParse()
{
  FUNCSTART("GFFParse::GFFParse()");

  zeroVars();
  init();

  FUNCEND();
}

// Hook called by the constructor and by discard().
//  Currently a no-op: no member is reset here.
void GFFParse::zeroVars()
{
  FUNCSTART("void GFFParse::zeroVars()");
  FUNCEND();
}

// Hook called by the constructor after zeroVars().
//  Currently a no-op.
void GFFParse::init()
{
  FUNCSTART("void GFFParse::init()");
  FUNCEND();
}



// Destructor
//  Only calls discard(); nothing else is released explicitly here.
GFFParse::~GFFParse()
{
  FUNCSTART("GFFParse::~GFFParse()");

  discard();

  FUNCEND();
}


// Called by the destructor. Only forwards to zeroVars(), which is
//  currently empty, so data loaded so far is not cleared by this call.
// NOTE(review): if discard() is meant to allow re-use of the object for a
//  second loadFile(), the containers would need clearing -- confirm intent.
void GFFParse::discard()
{
  FUNCSTART("GFFParse::discard()");

  zeroVars();

  FUNCEND();
}


//// Copy constructor
////  no discard needed as this object will be freshly created when
////  called through this constructor
//GFFParse::GFFParse(GFFParse const &other)
//{
//  FUNCSTART("GFFParse::GFFParse(GFFParse const &other)");
//
//  ??_valid=0;
//
//  *this=other;                               // call the copy operator
//
//  FUNCEND();
//}
//
//// Copy operator, needed by copy-constructor
//GFFParse const & GFFParse::operator=(GFFParse const & other)
//{
//  FUNCSTART("GFFParse const & GFFParse::operator=(GFFParse const & other)");
//  ERROR("Not implemented yet.");
//  FUNCEND();
//  return *this;
//}

//ostream & operator<<(ostream &ostr, GFFParse const &???)
//{
//  FUNCSTART("friend ostream & GFFParse::operator<<(ostream &ostr, const  &???)");
//  ERROR("Not implemented yet.");
//
//  FUNCEND();
//  return ostr;
//}



/*************************************************************************
 *
 * Returns the name of the i-th sequence registered while loading
 *  (0-based index into GFFP_seqnames).
 *
 * Throws a Notify of level WARNING if i is out of range.
 *
 *************************************************************************/

const string & GFFParse::getSequenceName(uint32 i) const
{
  FUNCSTART("const string & GFFParse::getSequenceName(uint32 i) const");

  if(i<GFFP_seqnames.size()){
    FUNCEND();
    return GFFP_seqnames[i];
  }

  throw Notify(Notify::WARNING, THISFUNC, ": Tried to get out of range sequence name.");
}

/*************************************************************************
 *
 * Returns the i-th sequence string (0-based index into GFFP_sequences).
 *  The string is empty if no FASTA data was stored for that sequence.
 *
 * Throws a Notify of level WARNING if i is out of range.
 *
 *************************************************************************/

const string & GFFParse::getSequence(uint32 i) const
{
  FUNCSTART("const string & GFFParse::getSequence(uint32 i) const");

  if(i<GFFP_sequences.size()){
    FUNCEND();
    return GFFP_sequences[i];
  }

  throw Notify(Notify::WARNING, THISFUNC, ": Tried to get out of range sequence.");
}

/*************************************************************************
 *
 * Returns the tags collected for the i-th sequence (0-based index into
 *  GFFP_sequencetags).
 *
 * Throws a Notify of level WARNING if i is out of range.
 *
 *************************************************************************/

const vector<multitag_t> & GFFParse::getTags(uint32 i) const
{
  // the string must match the real signature (vector<multitag_t>, not
  //  vector<tag_t>): it is presumably what THISFUNC reports in the
  //  Notify thrown below
  FUNCSTART("const vector<multitag_t> & GFFParse::getTags(uint32 i) const");

  if(i>=GFFP_sequencetags.size()){
    throw Notify(Notify::WARNING, THISFUNC, ": Tried to get out of range tags.");
  }

  FUNCEND();
  return GFFP_sequencetags[i];
}






// substr vector passed by reference: avoid constructing vector n times 
void GFFParse::parseNormalGFFLine(const string & line, const uint64 lineno, vector<string> & substrs)
{
  substrs.clear();
  boost::split(substrs, line, boost::is_any_of("\t"));

  if(substrs.size() != 9) {
    cout << "Line " << lineno << ": expected 9 elements, found " << substrs.size() << "\nBad line: " << line << "\n";
    return;
  }

  strintmap::iterator snI=GFFP_snmap.find(substrs[0]);
  size_t snmindex=0;
  if(snI != GFFP_snmap.end()){
    snmindex=snI->second;
  }else{
    if(substrs[2]=="contig"){
      GFFP_snmap[substrs[0]]=GFFP_seqnames.size();
      snmindex=GFFP_seqnames.size();
      GFFP_seqnames.push_back(substrs[0]);
      GFFP_sequences.resize(GFFP_sequences.size()+1);
      GFFP_sequencetags.resize(GFFP_sequences.size()+1);
    }else{
      cout << "Line " << lineno << ": new sequence name " << substrs[0] 
	   << ", expected 'contig', found '" << substrs[2] << "'\n";
      if(GFFP_errorstatus<2) GFFP_errorstatus=2;
      return;
    }
  }

  GFFP_sequencetags[snmindex].resize(GFFP_sequencetags[snmindex].size()+1);

  // add tag defined by this line to sequence just found
  multitag_t & newtag=GFFP_sequencetags[snmindex].back();

  newtag.source=multitag_t::MT_tagsrcentry_idGFF3;

  newtag.from=atoi(substrs[3].c_str());
  newtag.to=atoi(substrs[4].c_str());

  if(newtag.from==0){
    cout << "Line " << lineno 
	 << ": position 'from' (field 4," << newtag.from << ") is 0? Coordinates in GFF files should have 1 as lowest value.\n";
    if(GFFP_errorstatus<2) GFFP_errorstatus=2;
  }else{
    --newtag.from;
  }
  if(newtag.to==0){
    cout << "Line " << lineno 
	 << ": position 'to' (field 5," << newtag.to << ") is 0? Coordinates in GFF files should have 1 as lowest value.\n";
    if(GFFP_errorstatus<2) GFFP_errorstatus=2;
  }else{
    --newtag.to;
  }
    
  if(substrs[6].empty()){
    cout << "Line " << lineno 
	 << ": field 7 may only be '+', '-' or '=', but found empty string\n";
    newtag.strand='=';
    if(GFFP_errorstatus<2) GFFP_errorstatus=2;
  }else if(substrs[6]!="+" && substrs[6]!="-"){
    cout << "Line " << lineno 
	 << ": direction in field 7 may only be '+', '-' or '=', but found '" 
	 << substrs[6] << "'\n";
    newtag.strand='=';
    if(GFFP_errorstatus<2) GFFP_errorstatus=2;
  }else{
    newtag.strand=substrs[6][0];
  }

  if(!substrs[2].empty()){
    string gap4id=translateGFFfeat2GAP4feat(substrs[2]);
    if(gap4id.empty()){
      cout << "Line " << lineno 
	   << ": MIRA does not know type '" << substrs[2] << "'\n";
      if(GFFP_errorstatus<1) GFFP_errorstatus=1;
      newtag.identifier=multitag_t::newIdentifier(substrs[2]);
    }else{
      newtag.identifier=multitag_t::newIdentifier(gap4id);
    }
  }
  if(!substrs[8].empty()){
    newtag.comment=multitag_t::newComment(substrs[8]);
  }

  if(substrs[2]=="CDS"){
    if(substrs[7].empty()){
      cout << "Line " << lineno 
	   << ": for CDS, field 8 may only be '0', '1' or '2', but found empty string\n";
      if(GFFP_errorstatus<2) GFFP_errorstatus=2;
    }else if(substrs[7] != "0"
	     && substrs[7] != "1"
	     && substrs[7] != "2"){
      //cout << "Line " << lineno 
      //	   << ": for CDS, field 8 may only be '0', '1' or '2', but found string '" 
      //	   << substrs[7] << "'\n";
      //if(GFFP_errorstatus<2) GFFP_errorstatus=2;
    }
  }
}

void GFFParse::loadFile(const string & filename)
{
  FUNCSTART("void GFFParse::loadFile(const string & filename)");

  ifstream gffin;
  size_t numseqsloaded=0;
  uint64 lineno=0;

  GFFP_errorstatus=0;

  gffin.open(filename.c_str(), ios::in|ios::ate);
  if(!gffin) {
    MIRANOTIFY(Notify::FATAL, "GFF file not found for loading:" << filename); 
  }
  if(!gffin.tellg() ) {
    MIRANOTIFY(Notify::FATAL, "GFF file empty? " << filename);
  }

  ProgressIndicator<std::streamoff> P(0, gffin.tellg(),1000);

  gffin.seekg(0, ios::beg);

  string actline;
  actline.reserve(10000);
  vector<string> substrs;
  substrs.reserve(9);

  bool fastamode=false;
  while(!gffin.eof()){
    if(gffin.eof()) break;
    getline(gffin,actline);
    ++lineno;

    if(!actline.empty()){
      if(actline[0]=='#') {
	if(actline == "##FASTA") {
	  fastamode=true;
	  break;
	}
      }else{
	parseNormalGFFLine(actline,lineno,substrs);
      }
    }
    if(P.delaytrigger()) P.progress(gffin.tellg());
  }
  if(fastamode){
    string blanks=" \t\n";
    string tmpseq;
    string tmpsname;
    bool saveseq=false;
    while(!gffin.eof()){
      if(gffin.eof()) break;
      getline(gffin,actline);
      ++lineno;

      if(!actline.empty()){
	if(actline[0]=='>') {
	  // check name of sequence in sequence name map
	  // if present, save current sequence
	  // if not, do not save as this is either a protein or a bogus file
	  if(saveseq){
	    strintmap::iterator snI=GFFP_snmap.find(tmpsname);
	    //cout << "Must save " << tmpsname << "\t" << snI->second << endl;
	    GFFP_sequences[snI->second].swap(tmpseq);
	  }
	  tmpseq.clear();
	  string::size_type tokenend=string::npos;
	  tokenend=actline.find_first_of(blanks,0);
	  if(tokenend==string::npos) tokenend=actline.size();
	  tmpsname=actline.substr(1, tokenend-1);

	  saveseq=false;
	  // see whether we will need to save this sequence
	  strintmap::iterator snI=GFFP_snmap.find(tmpsname);
	  if(snI != GFFP_snmap.end()){
	    saveseq=true;
	  }
	  //cout << "Seeing new seqname ###"<<tmpsname<<"###\n";
	}else{
	  // only spend time if it's a sequence we will keep
	  // (and not a protein or something)
	  if(saveseq){
	    // append trimmed actline to tmpseq
	    boost::trim(actline);
	    tmpseq+=actline;
	  }
	}
      }
      if(P.delaytrigger()) P.progress(gffin.tellg());
    }

    // there might be some unsaved sequences still
    if(saveseq){
      strintmap::iterator snI=GFFP_snmap.find(tmpsname);
      cout << "Must save " << tmpsname << "\t" << snI->second << endl;
      GFFP_sequences[snI->second].swap(tmpseq);
    }
  }

  P.finishAtOnce();
  cout << '\n';

  gffin.close();

  if(GFFP_errorstatus==0) checkTagsOnceLoaded();

  if(GFFP_errorstatus > 0){
    if(GFFP_errorstatus == 1){
      cout << "GFF file '" << filename << "' had errors (see output above), but they seem minor.\n";
    }else{
      MIRANOTIFY(Notify::FATAL,"GFF file '" << filename << "' had unrecoverable errors (see output above). Fix your file!\n");
    }
  }

  FUNCEND();
}

void GFFParse::checkTagsOnceLoaded()
{
  FUNCSTART("void GFFParse::checkTagsOnceLoaded()");

  bool errorsfound=false;
  for(size_t snmindex=0; snmindex<GFFP_seqnames.size(); ++snmindex){
    if(GFFP_sequences[snmindex].empty() && !GFFP_sequencetags.empty()){
      cout << "Sequence " << GFFP_seqnames[snmindex] << " has elements defined on sequence, but no sequence?\n";
      errorsfound=true;
      if(GFFP_errorstatus<2) GFFP_errorstatus=2;
    }
  }

  if(errorsfound) return;

  for(size_t snmindex=0; snmindex<GFFP_seqnames.size(); ++snmindex){
    vector<multitag_t>::iterator mtI=GFFP_sequencetags[snmindex].begin();
    //cout << "checking " << GFFP_seqnames[snmindex] <<endl;
    for(; mtI != GFFP_sequencetags[snmindex].end(); ++mtI){
      if(mtI->from>=GFFP_sequences[snmindex].size()){
	errorsfound=true;
	cout << "Sequence " << GFFP_seqnames[snmindex]
	     << ": position 'from' (field 4," << mtI->from << ") is larger than the sequence size (" << GFFP_sequences[snmindex].size() << "," << GFFP_seqnames[snmindex] << "): " << mtI->getCommentStr() << '\n';
	if(GFFP_errorstatus<2) GFFP_errorstatus=2;
      }
      
      if(mtI->to>=GFFP_sequences[snmindex].size()){
	errorsfound=true;
	cout << "Sequence " << GFFP_seqnames[snmindex]
	     << ": position 'to' (field 5," << mtI->to << ") is larger than the sequence size (" << GFFP_sequences[snmindex].size() << "," << GFFP_seqnames[snmindex] << "): " << mtI->getCommentStr() << '\n';
	if(GFFP_errorstatus<2) GFFP_errorstatus=2;
      }
    }
  }

  FUNCEND();
}



const char * GFFParse::translateGFFfeat2GAP4feat(const string & feature)
{
  strstrmap::iterator transI=GFFP_mapgff2gap4.find(feature);
  if(transI != GFFP_mapgff2gap4.end()) return transI->second.c_str();

  return "";
}

const char * GFFParse::translateGAP4feat2GFFfeat(const string & feature)
{
  strstrmap::iterator transI=GFFP_mapgap42gff.find(feature);
  if(transI != GFFP_mapgap42gff.end()) return transI->second.c_str();

  return "";
}
