/*
 * Written by Bastien Chevreux (BaCh)
 * Copyright (C) 2007 and later by Bastien Chevreux
 * All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
 * 
 * 
 */

// 	$Id$	

#ifndef lint
static char vcid[] = "$Id$";
#endif /* lint */

#include <boost/thread/thread.hpp>
#include <boost/bind.hpp>

#include <iostream>
#include <math.h>


#include "skim.H"

#include "errorhandling/errorhandling.H"





//#define CEBUGFLAG

#ifdef CEBUGFLAG
#define CEBUG(bla)   {cout << bla; cout.flush();}
#define CEBUGF(bla)  {cout << bla; cout.flush();}
#else
#define CEBUG(bla)
#define CEBUGF(bla)
#endif


//#define CEBUG(bla)   {cout << bla; cout.flush();}


#define MAXVHASHMASK 0xFFFFFFLL


#define MAXREADSIZEALLOWED 29900

// 10G
#define SKIMMATCHFIRSTCHECK 10737418240LL
#define SKIMMATCHCHECKINCR 10737418240LL

//#define SKIMMATCHFIRSTCHECK 200000
//#define SKIMMATCHCHECKINCR 100000



/*************************************************************************
 *
 *
 *
 *
 *************************************************************************/

// The whole body of this method is textually included from
//  stdinc/foolcompiler.C, which is not visible from this file.
// NOTE(review): presumably it only references otherwise unused statics
//  (like vcid) to silence compiler warnings -- confirm in the include.
void Skim::foolCompiler()
{
#include "stdinc/foolcompiler.C"
}



/*************************************************************************
 *
 *
 *
 *************************************************************************/

// Plain vanilla constructor: switches logging of purged hits off and
//  sets all skim parameters to their defaults via init().
Skim::Skim()
{
  // function name for error traces; used to read "Skim::Skim(ReadPool & rp)",
  //  a signature this constructor does not have
  FUNCSTART("Skim::Skim()");

  SKIM3_logflag_purgeunnecessaryhits=false;
  init();

  FUNCEND();
}


// Resets hit counters and skim parameters to their default values.
// Called by the constructor and again at the start of skimGo(), so
//  everything set here must be safe to re-initialise.
void Skim::init()
{
  FUNCSTART("Skim::init()");

  SKIM3_possiblehits=0;
  SKIM3_acceptedhits=0;

  SKIM3_numthreads=2;
  SKIM3_basesperhash=16;
  SKIM3_hashsavestepping=4;

  // One default entry per sequencing type.
  // Both vectors must be emptied first: init() runs more than once on the
  //  same object and push_back() would otherwise let them grow each time.
  SKIM3_overlaplenrequired.clear();
  SKIM3_percentrequired.clear();
  for(uint32 i=0;i<Read::getNumSequencingTypes(); i++){
    SKIM3_overlaplenrequired.push_back(20);
    SKIM3_percentrequired.push_back(50);
  }

  SKIM3_totalpermbans=0;
  SKIM3_totalhitschosen=0;

  setHashFrequencyRatios(.4,1.6,2.0,8.0,20.0,static_cast<uint32>(100));

  FUNCEND();
}

/*************************************************************************
 *
 *
 *
 *
 *************************************************************************/

// Destructor: performs no explicit cleanup of its own, members are
//  released by their own destructors.
Skim::~Skim()
{
  FUNCSTART("Skim::~Skim()");
  //  ERROR("Not implemented yet.");

  FUNCEND();
}


//// Copy constructor
////  no discard needed as this object will be freshly created when
////  called through this constructor
//Skim::Skim(Skim const &other)
//{
//  FUNCSTART("Skim::Skim(Skim const &other)");
//
//  SKIM_valid=0;
//
//  *this=other;                               // call the copy operator
//
//  FUNCEND();
//}
//
//
//// Copy operator, needed by copy-constructor
//Skim const & Skim::operator=(Skim const & other)
//{
//  FUNCSTART("Skim const & Skim::operator=(Skim const & other)");
//  ERROR("Not implemented yet.");
//  FUNCEND();
//  return *this;
//}




///*************************************************************************
// *
// *
// *
// *
// *************************************************************************/
//
//ostream & operator<<(ostream &ostr, Skim const &theskim)
//{
//  FUNCSTART("friend ostream & Skim::operator<<(ostream &ostr, const  &theskim)");
//  //  ERROR("Not implemented yet.");
//
//  FUNCEND();
//  return ostr;
//}


/*************************************************************************
 *
 *
 *
 *
 *************************************************************************/

// Currently a no-op: only the function tracing macros are executed,
//  no member is released or reset here.
void Skim::discard()
{
  FUNCSTART("Skim::discard()");

  FUNCEND();
}



/*************************************************************************
 *
 *
 *
 *
 *************************************************************************/

// Stores the hash frequency ratio thresholds in the corresponding
//  SKIM3_freqest_* members and the nasty repeat ratio in
//  SKIM3_nastyrepeatratio. Values are taken over as given, no checks.
void Skim::setHashFrequencyRatios(double freqest_minnormal,
                                  double freqest_maxnormal,
                                  double freqest_repeat,
                                  double freqest_heavyrepeat,
                                  double freqest_crazyrepeat,
                                  uint32 nastyrepeatratio)
{
  // bounds of what counts as "normal"
  SKIM3_freqest_minnormal   = freqest_minnormal;
  SKIM3_freqest_maxnormal   = freqest_maxnormal;

  // the three repeat classes
  SKIM3_freqest_repeat      = freqest_repeat;
  SKIM3_freqest_heavyrepeat = freqest_heavyrepeat;
  SKIM3_freqest_crazyrepeat = freqest_crazyrepeat;

  SKIM3_nastyrepeatratio    = nastyrepeatratio;
}


/*************************************************************************
 *
 * returns number of megahubs
 *
 *
 *************************************************************************/

//#define CEBUG(bla)   {cout << bla; cout.flush();}

// Runs a complete partitioned, multi-threaded skim over the read pool.
//
//  rp                  read pool to skim; a pointer to it is kept in
//                       SKIM3_readpool
//  posfmatchname       file receiving the forward matches (truncated)
//  poscmatchname       file receiving the reverse matches (truncated)
//  megahublogname      file receiving the names of megahub reads
//  bannedoverlaps      banned overlap pairs (only a pointer is stored here)
//  overlapcounter      reset to one 0 entry per read
//  writtenhitsperid    cleared, then sized to one entry per read before
//                       the final purge of the match files
//  chuntleftcut,
//  chuntrightcut       chimera hunt cuts; the hunt is only performed when
//                       chuntleftcut is non-empty on entry and
//                       onlyagainstrails is false
//  overlapcritlevell,
//  overlapcritlevelr   reset to 255 for every read
//  numthreads          number of threads, forced into 1..256
//  maxmemusage         multiplied with the hash save stepping and handed
//                       to computePartition() as partition size limit
//  onlyagainstrails    stored in SKIM3_onlyagainstrails
//  alsocheckreverse    if true, a second threaded pass with direction -1
//                       is made per partition
//  bph                 bases per hash, limited by the size of vhash_t
//  hss                 hash save stepping (0 is treated as 1)
//  percentrequired,
//  overlaplenrequired,
//  maxhitsperread      stored in the members of the same name
//
// Returns the number of reads with a non-zero entry in SKIM3_megahubs.
uint32 Skim::skimGo(ReadPool & rp,
                    string               & posfmatchname,
                    string               & poscmatchname,
                    string               & megahublogname,
                    bannedoverlappairs_t & bannedoverlaps,
                    vector<uint32>       & overlapcounter,
                    vector<uint32>       & writtenhitsperid,
                    vector<int32>        & chuntleftcut,
                    vector<int32>        & chuntrightcut,
                    vector<uint8>        & overlapcritlevell,
                    vector<uint8>        & overlapcritlevelr,
                    uint32 numthreads,
                    uint32 maxmemusage,
                    bool onlyagainstrails,
                    bool alsocheckreverse,
                    uint8  bph,
                    uint8  hss,
                    const vector<int32> & percentrequired,
                    const vector<int32> & overlaplenrequired,
                    uint32 maxhitsperread)
{
  FUNCSTART("uint32 Skim::skimGo( ... )");

  dateStamp(cout);
  Read::setCoutType(Read::AS_CLIPPEDFASTA);

  // back to default parameters and zeroed hit counters
  init();

  SKIM3_readpool=&rp;

  SKIM3_numthreads=numthreads;
  if(SKIM3_numthreads<1) SKIM3_numthreads=1;
  if(SKIM3_numthreads>256) SKIM3_numthreads=256;

  SKIM3_onlyagainstrails=onlyagainstrails;

  // limit the bases per hash by the width of vhash_t:
  //  16 for a 4 byte type, 32 for an 8 byte type
  if(sizeof(vhash_t)==4){
    if(bph>16) bph=16;
  }
  if(sizeof(vhash_t)==8){
    if(bph>32) bph=32;
  }
  SKIM3_basesperhash=bph;

  // prepareSkim() divides by the stepping when sizing the hash array,
  //  a value of 0 would therefore be a division by zero
  if(hss==0) hss=1;
  SKIM3_hashsavestepping=hss;

  SKIM3_percentrequired=percentrequired;
  SKIM3_overlaplenrequired=overlaplenrequired;
  SKIM3_maxhitsperread=maxhitsperread;

  SKIM3_overlapcritlevell=&overlapcritlevell;
  SKIM3_overlapcritlevelr=&overlapcritlevelr;

  // TODO: check whether these can be re-used between assembly passes
  //  would reduce skim files even a bit further in later passes
  //  Question: can an overlap criterion level become worse through editing???
  SKIM3_overlapcritlevell->clear();
  SKIM3_overlapcritlevell->resize(SKIM3_readpool->size(),255);
  SKIM3_overlapcritlevelr->clear();
  SKIM3_overlapcritlevelr->resize(SKIM3_readpool->size(),255);
  SKIM3_largestencasementscoretodate.clear();
  SKIM3_largestencasementscoretodate.resize(SKIM3_readpool->size(),0);

  SKIM3_overlapcounter=&overlapcounter;
  SKIM3_overlapcounter->clear();
  SKIM3_overlapcounter->resize(SKIM3_readpool->size(),0);

  // left empty during the skim itself, sized just before the final purge
  SKIM3_writtenhitsperid=&writtenhitsperid;
  SKIM3_writtenhitsperid->clear();

  SKIM3_posfmatchnextchecksize=SKIMMATCHFIRSTCHECK;
  SKIM3_poscmatchnextchecksize=SKIM3_posfmatchnextchecksize;

  SKIM3_bannedoverlaps=&bannedoverlaps;


  // for chimerahunt (if wished), but not if only against rails
  // a non-empty chuntleftcut on entry is the signal that it is wished
  SKIM3_chimerahunt.clear();
  SKIM3_chuntleftcut=&chuntleftcut;
  SKIM3_chuntrightcut=&chuntrightcut;
  if(!onlyagainstrails && chuntleftcut.size()!=0){
    chuntleftcut.clear();
    chuntleftcut.resize(SKIM3_readpool->size(),0);
    chuntrightcut.clear();
    chuntrightcut.resize(SKIM3_readpool->size(),0);
    SKIM3_chimerahunt.resize(SKIM3_readpool->size());
    for(uint32 i=0; i<SKIM3_readpool->size(); i++){
      SKIM3_chimerahunt[i].resize(SKIM3_readpool->getRead(i).getLenClippedSeq(),0);
    }
  }


  SKIM_partfirstreadid=0;         // partition first read id
  SKIM_partlastreadid=0;         // partition last read id

  // Open both match files and fail right away if that is not possible.
  // Without the check the problem would only surface after the complete
  //  skim run, when the final purge cannot find the files.
  SKIM3_posfmatchfname=posfmatchname;
  SKIM3_posfmatchfout.open(posfmatchname.c_str(), ios::out| ios::trunc | ios::binary);
  if(!SKIM3_posfmatchfout){
    MIRANOTIFY(Notify::FATAL, "Could not open SKIM match file " << posfmatchname);
  }
  SKIM3_poscmatchfname=poscmatchname;
  SKIM3_poscmatchfout.open(poscmatchname.c_str(), ios::out| ios::trunc | ios::binary);
  if(!SKIM3_poscmatchfout){
    MIRANOTIFY(Notify::FATAL, "Could not open SKIM match file " << poscmatchname);
  }

  // the megahub log stays best effort: a failed open is not checked
  ofstream mout;
  mout.open(megahublogname.c_str(), ios::out| ios::trunc);

  // first call only counts the partitions (and sets SKIM_progressend)
  uint32 numpartitions=computePartition(maxmemusage*SKIM3_hashsavestepping,true);

  CEBUG("We will get " << numpartitions << " partitions.\n");

  CEBUG("Progressend: " << SKIM_progressend << endl);

  // per-run state: empty first so that no flags of an earlier run on this
  //  object survive (resize() alone keeps the existing elements)
  SKIM3_megahubs.clear();
  SKIM3_megahubs.resize(SKIM3_readpool->size(),0);
  SKIM3_fullencasedcounter.clear();
  SKIM3_fullencasedcounter.resize(SKIM3_readpool->size(),0);

  fillTagStatusInfoOfReads();

  cout << "Now running threaded and partitioned skimmer with " << numpartitions << " partitions in " << SKIM3_numthreads << " threads:" << endl;

  SKIM_progressindicator= new ProgressIndicator<int64>(0,SKIM_progressend);

  SKIM3_vhraparray.clear();
  for(uint32 actpartition=1; actpartition<=numpartitions; actpartition++){
    CEBUG("\nWorking on partition " << actpartition << "/" << numpartitions << endl);

    // sets SKIM_partlastreadid for the partition starting at
    //  SKIM_partfirstreadid
    computePartition(maxmemusage*SKIM3_hashsavestepping,false);

    CEBUG("Will contain read IDs " << SKIM_partfirstreadid << " to " << SKIM_partlastreadid-1 << endl);

    prepareSkim(SKIM_partfirstreadid, SKIM_partlastreadid, SKIM3_vhraparray,true);
    if(!SKIM3_vhraparray.empty()){
      CEBUG("Checking forward hashes" << endl);
      startMultiThreading(1,
                          SKIM3_numthreads,
                          5000,
                          SKIM_partfirstreadid,
                          SKIM3_readpool->size(),
                          boost::bind( &Skim::cfhThreadsDataInit, this, _1 ),
                          boost::bind( &Skim::cfhThreadLoop, this, _1 ));
      purgeMatchFileIfNeeded(1);
      if(alsocheckreverse){
        CEBUG("Checking reverse hashes" << endl);
        startMultiThreading(-1,
                            SKIM3_numthreads,
                            5000,
                            SKIM_partfirstreadid,
                            SKIM3_readpool->size(),
                            boost::bind( &Skim::cfhThreadsDataInit, this, _1 ),
                            boost::bind( &Skim::cfhThreadLoop, this, _1 ));
        purgeMatchFileIfNeeded(-1);
      }
      CEBUG("Done." << endl);
    }

    // next partition starts where this one ended
    SKIM_partfirstreadid=SKIM_partlastreadid;
  }

  SKIM_progressindicator->finishAtOnce();

  cout << " done.\n";

  SKIM3_posfmatchfout.close();
  SKIM3_poscmatchfout.close();

  // from now on the purge also counts the hits it keeps per read id
  SKIM3_writtenhitsperid->resize(SKIM3_readpool->size(),0);
  purgeUnnecessaryHitsFromSkimFile(SKIM3_posfmatchfname,1);
  purgeUnnecessaryHitsFromSkimFile(SKIM3_poscmatchfname,-1);

  // count the megahubs and log their names
  uint32 megahubs=0;

  for(uint32 i=0; i<SKIM3_megahubs.size(); i++){
    if(SKIM3_megahubs[i]>0) {
      megahubs++;
      mout << SKIM3_readpool->getRead(i).getName() << '\n';
    }
  }
  cout << "\nSkim summary:\n\taccepted: " << SKIM3_acceptedhits << "\n\tpossible: " << SKIM3_possiblehits  << "\n\tpermbans: " << SKIM3_totalpermbans;
  cout << "\n\nHits chosen: " << SKIM3_totalhitschosen << "\n\n";

  mout.close();
  dateStamp(cout);

  cout << endl;

  // do not leave a dangling pointer behind
  delete SKIM_progressindicator;
  SKIM_progressindicator=NULL;

  if(SKIM3_chimerahunt.size()){
    chimeraHuntLocateChimeras();
  }

  FUNCEND();
  return megahubs;
}
//#define CEBUG(bla)



/*************************************************************************
 *
 *
 *
 *
 *
 *
 *************************************************************************/

void Skim::fillTagStatusInfoOfReads()
{
  SKIM3_hasMNRr.clear();
  SKIM3_hasSRMr.clear();
  SKIM3_hasFpAS.clear();
  SKIM3_hasMNRr.resize(SKIM3_readpool->size(),0);
  SKIM3_hasSRMr.resize(SKIM3_readpool->size(),0);
  SKIM3_hasFpAS.resize(SKIM3_readpool->size(),0);

  for(uint32 actreadid=0; actreadid<SKIM3_readpool->size(); actreadid++){
    for(uint32 tn=0; tn<SKIM3_readpool->getRead(actreadid).getNumOfTags(); tn++){
      if(SKIM3_readpool->getRead(actreadid).getTag(tn).identifier==Read::REA_tagentry_idMNRr) SKIM3_hasMNRr[actreadid]=1;
      if(SKIM3_readpool->getRead(actreadid).getTag(tn).identifier==Read::REA_tagentry_idSRMr) SKIM3_hasSRMr[actreadid]=1;
      if(SKIM3_readpool->getRead(actreadid).getTag(tn).identifier==Read::REA_tagentry_idFpAS) SKIM3_hasFpAS[actreadid]=1;
    }
  }
}

/*************************************************************************
 *
 * computes either 
 *  - starting from the first read id, the last read id to use for the
 *    next partition (SKIM_partfirstreadid and SKIM_partlastreadid)
 * or
 *  - starting from the first read id, the total number of partitions
 * to use in the next skim run.(returned)
 *
 * The maxmemusage is the dominating factor: each partition may not use
 *  (much) more than that.
 *
 *************************************************************************/


//#define CEBUG(bla)   {cout << bla; cout.flush();}
//#define CEBUGF(bla)  {cout << bla; cout.flush();}

// Walks the read pool from SKIM_partfirstreadid on, summing up the clipped
//  lengths of all reads which have valid data and are used in the assembly.
//
// computenumpartitions == false:
//   stops right after the read which pushes the sum above maxmemusage;
//   SKIM_partlastreadid is then one past that read (or the pool size).
//   The return value is 0.
// computenumpartitions == true:
//   walks to the end of the pool, starting a new partition each time the
//   sum exceeds maxmemusage. Returns the number of partitions and sets
//   SKIM_progressend.
//
// A read longer than MAXREADSIZEALLOWED leads to a fatal notification.
uint32 Skim::computePartition(uint32 maxmemusage, bool computenumpartitions)
{
  FUNCSTART("uint32 Skim::computePartition(uint32 maxmemusage, bool computenumpartitions)");

  uint32 numpartitions=0;
  uint32 totalseqlen=0;
  uint32 maxseqlen=0;    // only needed by the disabled code further down

  SKIM_partlastreadid=SKIM_partfirstreadid;

  if(computenumpartitions) {
    SKIM_progressend=SKIM3_readpool->size()-SKIM_partlastreadid;
  }

  while(SKIM_partlastreadid<SKIM3_readpool->size()){
    Read & actread=SKIM3_readpool->getRead(SKIM_partlastreadid);

    // reads without data or not taking part in the assembly do not count
    if(actread.hasValidData() && actread.isUsedInAssembly()){
      const uint32 readlen=actread.getLenClippedSeq();

      if(readlen > MAXREADSIZEALLOWED) {
        MIRANOTIFY(Notify::FATAL,"Read " << actread.getName() << " is longer than MAXREADSIZEALLOWED (" << MAXREADSIZEALLOWED << ") bases. SKIM cannot handle this, aborting.\n");
      }

      maxseqlen=max(maxseqlen,readlen);
      totalseqlen+=readlen;

      if(totalseqlen>maxmemusage) {
        if(!computenumpartitions){
          // partition is full: it ends behind the current read
          SKIM_partlastreadid++;
          break;
        }
        // counting mode: close this partition, start the next one
        totalseqlen=0;
        numpartitions++;
        SKIM_progressend+=SKIM3_readpool->size()-SKIM_partlastreadid;
      }
    }

    SKIM_partlastreadid++;
  }

  if(computenumpartitions) {
    // account for a last, not completely filled partition
    if(totalseqlen>0) {
      numpartitions++;
      SKIM_progressend+=SKIM3_readpool->size()-SKIM_partlastreadid;
    }
    SKIM_progressend*=2;
  }

  //// Compute SKIM_maxoffsets
  //if(maxseqlen>0){
  //  // this beauty is slow, but a cute hack to find out the number of
  //  //  the highest bit which power of two fits the given value
  //  // e.g. 1024 -> 10 -> 2^10 = 1024 fits 1024
  //  //      1025 -> 11 -> 2^11 = 2048 fits 1025
  //  SKIM_mo_shiftmultiplier=0;
  //  while((maxseqlen-1) >> ++SKIM_mo_shiftmultiplier);
  //}else{
  //  SKIM_mo_shiftmultiplier=4;
  //}
  //
  //// restrict shift multiplier to 11 (and therefore maxoffsets to 2048)
  //if(SKIM_mo_shiftmultiplier>11) SKIM_mo_shiftmultiplier=11;
  //SKIM_maxoffsets=(1<<SKIM_mo_shiftmultiplier);

  FUNCEND();

  return numpartitions;
}

		       



/*************************************************************************
 *
 * sorter to sort from low to high, but lower 24bit grouped
 *
 *
 *************************************************************************/

inline bool Skim__sortVHRAPArray_(const vhrap_t & a, 
			    const vhrap_t & b);
inline bool Skim__sortVHRAPArray_(const vhrap_t & a, const vhrap_t & b)
{
  ////if(a.vhash == b.vhash) {
  ////  return a.readid < b.readid;
  ////}
  //return a.vhash < b.vhash;
  
  if((a.vhash & MAXVHASHMASK) != (b.vhash & MAXVHASHMASK)) {
    return (a.vhash & MAXVHASHMASK) < (b.vhash & MAXVHASHMASK);
  }
  return a.vhash < b.vhash;
}





/*************************************************************************
 *
 *
 *
 *
 *************************************************************************/

//#define CEBUG(bla)   {cout << bla; cout.flush();}

// Fills vhraparray with the hashes of the reads fromid .. toid-1, sorts it
//  with Skim__sortVHRAPArray_ and rebuilds the shortcut tables for it.
//
//  fromid, toid     range of read ids to hash, toid is exclusive
//  vhraparray       result array; cleared first, may be empty on return
//  assemblychecks   if true, reads not used in the assembly are skipped
//                    and, when skimming only against rails, also all
//                    reads which are not rails
//
// Reads without valid data are always skipped, reads with a clipped length
//  below 8 contribute no hashes.
void Skim::prepareSkim(uint32 fromid, uint32 toid, vector<vhrap_t> & vhraparray, bool assemblychecks)
{
  // name used in error traces; used to show a signature this function
  //  does not have ("prepareSkim(bool alsocheckreverse)")
  FUNCSTART("void Skim::prepareSkim(uint32 fromid, uint32 toid, vector<vhrap_t> & vhraparray, bool assemblychecks)");

  vhraparray.clear();

  // first pass: sum up the sequence lengths to know how much to allocate
  uint32 totalseqlen=0;
  uint32 totalseqs=0;

  for(uint32 seqnr=fromid; seqnr<toid; seqnr++) {
    Read & actread=SKIM3_readpool->getRead(seqnr);
    if(!actread.hasValidData()) continue;
    if(assemblychecks
       && (!actread.isUsedInAssembly()
           || (SKIM3_onlyagainstrails && !actread.isRail()))) continue;
    totalseqlen+=actread.getLenClippedSeq();
    totalseqs++;
  }

  CEBUG("\nPreparing skim data: "  << fromid << " to " << toid << endl);
  CEBUG(totalseqs << " sequences to skim, totalling " << totalseqlen << " bases." << endl);


  // next loop:
  //  transform each read into a series of forward
  //   hashes, store them into the array along with info whether
  //   each hash position is valid or not,
  //  also store the info how many hashes each sequence produced

  uint32 totalhashes=0;

  if(totalseqlen>0){

    // upper estimate, cut down to the real number of hashes afterwards
    vhraparray.resize(totalseqlen/SKIM3_hashsavestepping);
    vector<vhrap_t>::iterator vhraparrayI=vhraparray.begin();
    vector<uint8> tagmaskvector;

    for(uint32 seqnr=fromid; seqnr < toid; seqnr++){
      Read & actread= SKIM3_readpool->getRead(seqnr);

      // same filter as in the counting loop above
      if(!actread.hasValidData()) continue;
      if(assemblychecks
         && (!actread.isUsedInAssembly()
             || (SKIM3_onlyagainstrails && !actread.isRail()))) continue;

      uint32 slen=actread.getLenClippedSeq();

      const vector<Read::bposhashstat_t> & bposhashstats=actread.getBPosHashStats();
      int32 bfpos=actread.calcClippedPos2RawPos(0);
      int32 bfposinc=1;

      if(slen>=8) {
        fillTagMaskVector(seqnr, tagmaskvector);
        uint32 hashesmade= transformSeqToVariableHash(
          seqnr,
          actread,
          actread.getClippedSeqAsChar(),
          slen,
          SKIM3_basesperhash,
          vhraparrayI,
          false,
          SKIM3_hashsavestepping,
          tagmaskvector,
          bposhashstats,
          bfpos,
          bfposinc
          );

        totalhashes+=hashesmade;
      }
    }

    CEBUG("Totalseqlen " << totalseqlen << endl);
    CEBUG("Computed " << totalhashes << " linkpoints." << endl);

    if(totalhashes>0){
      CEBUG("Resizing array" << endl);
      vhraparray.resize(totalhashes);

      // debug dump, switched off
      if(0){
        CEBUG("Partition unsorted:\n");
        vector<vhrap_t>::const_iterator vaI=vhraparray.begin();
        for(; vaI!=vhraparray.end(); vaI++){
          cout << *vaI << '\n';
        }
        cout << "###########################" << endl;
      }

      CEBUG("Sorting array" << endl);
      sort(vhraparray.begin(), vhraparray.end(), Skim__sortVHRAPArray_);

      // debug dump, switched off
      if(0){
        CEBUG("Partition sorted:\n");
        vector<vhrap_t>::const_iterator vaI=vhraparray.begin();
        for(; vaI!=vhraparray.end(); vaI++){
          cout << *vaI << '\n';
        }
        cout << "###########################" << endl;
      }

    }
  }

  // also done for an empty array: the shortcut tables are emptied then
  CEBUG("Making shortcuts" << endl);
  makeVHRAPArrayShortcuts(vhraparray, SKIM3_basesperhash);

  FUNCEND();
}
//#define CEBUG(bla)



/*************************************************************************
 *
 *
 *
 *
 *************************************************************************/

// Checks whether the match file of the given direction (>0: forward
//  matches, else reverse matches) has reached its next check size. If so,
//  the file is closed, purged from unnecessary hits on disk and re-opened
//  for appending; the next check size is pushed further out.
// A match file which cannot be re-opened is a fatal error.
void Skim::purgeMatchFileIfNeeded(int8 direction)
{
  FUNCSTART("void Skim::purgeMatchFileIfNeeded(int8 direction)");

  // select stream, file name and check size belonging to the direction
  ofstream * posmatchfout=&SKIM3_poscmatchfout;
  string * fname=&SKIM3_poscmatchfname;
  uint64 * nextchecksize=&SKIM3_poscmatchnextchecksize;
  if(direction>0){
    posmatchfout=&SKIM3_posfmatchfout;
    fname=&SKIM3_posfmatchfname;
    nextchecksize=&SKIM3_posfmatchnextchecksize;
  }

  if(posmatchfout->tellp() >= *nextchecksize){
    (*nextchecksize)+=SKIMMATCHCHECKINCR;

    posmatchfout->close();
    purgeUnnecessaryHitsFromSkimFile(*fname,direction);

    // binary: the file holds raw records and was created in binary mode
    // ate:    position at the end of the file right away, so that tellp()
    //         below reports the size which is left after the purge
    posmatchfout->open(fname->c_str(), ios::out|ios::app|ios::ate|ios::binary);

    // test the stream itself; the pointer to it is never NULL, testing
    //  that would never detect a failed open
    if(!(*posmatchfout)){
      MIRANOTIFY(Notify::FATAL, "Could not reopen SKIM match file " << *fname);
    }

    // still too big after purging? Then check again only after another
    //  full increment has been written
    if(posmatchfout->tellp() >= *nextchecksize){
      (*nextchecksize)=SKIMMATCHCHECKINCR+posmatchfout->tellp();
    }
  }

  FUNCEND();
}


/*************************************************************************
 *
 * Go through written skim hits on disk. Compare saved hits to best level found
 * If both reads agree they have better partners at hand, throw out skim hit.
 * Do this only if both reads are not rails, else it might be that good, fully
 *  encased hits with 99% are thrown out because of seemingly better partially
 *  overlapping hits at 100% (and the part outside the overlap then wreaks
 *  havoc in reality with some non-identical repeat part)
 *
 * Careful: this rewrites and truncates the original file.
 * Length will always be <= initial length
 *
 *************************************************************************/
//#define CEBUG(bla)   {cout << bla; cout.flush();}
// Rewrites a skim match file in place, dropping hits which are not needed.
//
//  filename   match file of skimhitforsave_t records; it is overwritten
//              from the front and truncated to the new length at the end
//  rid2dir    direction of the second read, handed on to the overlap
//              estimator
//
// The file is worked on in chunks of 500000 records. For each hit where
//  neither read is a rail, the overlap criterion levels of the hit are
//  computed for both reads and compared to the best levels stored for the
//  reads. A hit is dropped when both reads have better levels on both
//  sides (with exceptions for Solexa reads, see below) or when a Solexa
//  "elitist" meets a Solexa "pariah". Hits between reads having the same
//  template are always kept.
// If SKIM3_writtenhitsperid is not empty, every kept hit adds 2 to the
//  entry of the smaller read id.
// With SKIM3_logflag_purgeunnecessaryhits set, every decision is logged to
//  a file "elog.skim.puh.<name>" in the directory of the match file.
//
// NOTE(review): file positions are kept in "long" (ftell/fseek); on
//  platforms with a 32 bit long this limits the file size -- confirm
//  whether such platforms are of concern.
void Skim::purgeUnnecessaryHitsFromSkimFile(string & filename, const int8 rid2dir)
{
  FUNCSTART("void Skim::purgeUnnecessaryHitsFromSkimFile(string & filename, const int8 rid2dir)");

  // number of records worked on per chunk
  const size_t chunksize=500000;

  // temporary skim container
  vector<skimhitforsave_t> tsc;

  // the records are raw structs, hence binary mode
  FILE * finfout = fopen(filename.c_str(),"r+b");
  if(finfout == NULL) {
    MIRANOTIFY(Notify::FATAL, "File not found: " << filename);
  }

  ofstream logfout;
  if(SKIM3_logflag_purgeunnecessaryhits){
    string path,justfilename;
    splitFullPathAndFileName(filename,path,justfilename);
    string logfilename=path+"/elog.skim.puh."+justfilename;
    cout << "\nSkim: elog " << logfilename << '\n';
    logfout.open(logfilename.c_str(), ios::out|ios::trunc);
  }

  // initial file size, only needed for the message at the end
  fseek(finfout, 0, SEEK_END);
  streamsize finsize=ftell(finfout);
  rewind(finfout);

  ADSEstimator adse;

  // reading and writing happen on the same file: the write position
  //  never overtakes the read position as hits are only ever removed
  long freadpos=0;
  long fwritepos=0;

  while(!feof(finfout)){
    tsc.resize(chunksize);
    fseek(finfout, freadpos, SEEK_SET);

    // read at most size() records, not capacity(): records landing behind
    //  size() would not be part of the vector and would get lost
    size_t numread=fread(&tsc[0],sizeof(skimhitforsave_t),tsc.size(),finfout);

    if(numread==0) break;
    CEBUG("rsh4_pUHFNSF: read " << numread << endl);

    freadpos=ftell(finfout);
    CEBUG("new freadpos: " << freadpos << endl);

    if(numread<tsc.size()) tsc.resize(numread);

    // kept hits are compacted to the front of the chunk
    vector<skimhitforsave_t>::const_iterator readI=tsc.begin();
    vector<skimhitforsave_t>::iterator writeI=tsc.begin();

    uint8 ocll=255;
    uint8 oclr=255;

    for(; readI != tsc.end(); ++readI){
      bool del1=false;   // read 1 agrees to drop the hit
      bool del2=false;   // read 2 agrees to drop the hit
      bool del3=false;   // Solexa elitist vs. pariah
      if(!(*SKIM3_readpool)[readI->rid1].isRail()
         && !(*SKIM3_readpool)[readI->rid2].isRail()){
        adse.calcNewEstimateFromSkim(
          readI->eoffset,
          (*SKIM3_readpool)[readI->rid1].getLenClippedSeq(),
          (*SKIM3_readpool)[readI->rid2].getLenClippedSeq(),
          readI->rid1,
          readI->rid2,
          1,
          rid2dir);

        Skim::getOverlapCriterionLevel(readI->rid1,
                                       (*SKIM3_readpool)[readI->rid1].getSequencingType(),
                                       adse,readI->percent_in_overlap,
                                       ocll,oclr);
        CEBUG("OCL: " << readI->rid1 << " " << static_cast<uint16>(ocll) << " " << static_cast<uint16>(oclr) << endl);

        // if it's a rail, then the read has no say in the decision whether this skim should be deleted
        // if not, look at overlap criterion level left and right
        if((*SKIM3_readpool)[readI->rid1].isRail()
           || (ocll>(*SKIM3_overlapcritlevell)[readI->rid1]
               && oclr>(*SKIM3_overlapcritlevelr)[readI->rid1])){
          del1=true;
        }
        // if it's a Solexa and the best overlapcritlevel is not 0, we are probably in a under-coverage
        //  situation ... try to account for that by being less harsh
        // Values: in getOverlapCriterionLevel(), current overlapcritlevel for Solexa goes from
        //  0-29 for 100% matches
        //  30-59 for 99%
        //  ... etc up to 95% (including, == 149 max)
        // Therefore, if best overlapcritlevel != 0 && <= 29, then the best overlap is 100% albeit
        //  not as long as it could be when in high coverage situations
        // Therefore: low coverage
        // Therefore: we'll take all 100% matches for that read
        if(del1 && SKIM3_readpool->getRead(readI->rid1).isSequencingType(Read::SEQTYPE_SOLEXA)){
          if(ocll <= 29
             && (*SKIM3_overlapcritlevell)[readI->rid1]!= 0
             && (*SKIM3_overlapcritlevell)[readI->rid1] <= 29){
            del1=false;
          }else if(oclr <= 29
                   && (*SKIM3_overlapcritlevelr)[readI->rid1]!= 0
                   && (*SKIM3_overlapcritlevelr)[readI->rid1] <= 29){
            del1=false;
          }
        }

        Skim::getOverlapCriterionLevel(readI->rid2,
                                       (*SKIM3_readpool)[readI->rid2].getSequencingType(),
                                       adse,readI->percent_in_overlap,
                                       ocll,oclr);
        CEBUG("OCL: " << readI->rid2 << " " << static_cast<uint16>(ocll) << " " << static_cast<uint16>(oclr) << endl);
        // if it's a rail, then the read has no say in the decision whether this skim should be deleted
        // if not, look at overlap criterion level left and right
        if((*SKIM3_readpool)[readI->rid2].isRail()
           || (ocll>(*SKIM3_overlapcritlevell)[readI->rid2]
               && oclr>(*SKIM3_overlapcritlevelr)[readI->rid2])){
          del2=true;
        }

        // if it's a Solexa and the best overlapcritlevel is not 0, we are probably in a under-coverage
        //  situation ... try to account for that by being less harsh
        if(del2 && SKIM3_readpool->getRead(readI->rid2).isSequencingType(Read::SEQTYPE_SOLEXA)){
          if(ocll <= 29
             && (*SKIM3_overlapcritlevell)[readI->rid2]!= 0
             && (*SKIM3_overlapcritlevell)[readI->rid2] <= 29){
            del2=false;
          }else if(oclr <= 29
                   && (*SKIM3_overlapcritlevelr)[readI->rid2]!= 0
                   && (*SKIM3_overlapcritlevelr)[readI->rid2] <= 29){
            del2=false;
          }
        }

        // test
        // but Solexa elitists (<=5 left/right) do not want to play with pariahs (both >+5 levels left/right)
        if(SKIM3_readpool->getRead(readI->rid1).isSequencingType(Read::SEQTYPE_SOLEXA)
           && SKIM3_readpool->getRead(readI->rid2).isSequencingType(Read::SEQTYPE_SOLEXA)){
          if((*SKIM3_overlapcritlevell)[readI->rid1] <= 5
             && (*SKIM3_overlapcritlevelr)[readI->rid1] <= 5){

            if((*SKIM3_overlapcritlevell)[readI->rid2] > (*SKIM3_overlapcritlevell)[readI->rid1]+5
               && (*SKIM3_overlapcritlevelr)[readI->rid2] > (*SKIM3_overlapcritlevelr)[readI->rid1]+5){
              del3=true;
            }
          }else if((*SKIM3_overlapcritlevell)[readI->rid2] <= 5
                   && (*SKIM3_overlapcritlevelr)[readI->rid2] <= 5){

            if((*SKIM3_overlapcritlevell)[readI->rid1] > (*SKIM3_overlapcritlevell)[readI->rid2]+5
               && (*SKIM3_overlapcritlevelr)[readI->rid1] > (*SKIM3_overlapcritlevelr)[readI->rid2]+5){
              del3=true;
            }
          }
        }
      }

      // is there a template set and is it the same for both reads? keep that overlap
      if(SKIM3_readpool->getRead(readI->rid1).getTemplateID() >= 0
         && SKIM3_readpool->getRead(readI->rid1).getTemplateID() == SKIM3_readpool->getRead(readI->rid2).getTemplateID()){
        del1=false;
        del2=false;
        del3=false;
      }

      // provisionally keep the hit; writeI is taken back below if the
      //  hit gets purged
      if(readI != writeI){
        *writeI=*readI;
      }
      ++writeI;
      CEBUG("DEL: " << del1 << " " << del2 << " " << del3 << endl);
      CEBUG(readI->rid1 << ": " << static_cast<uint16>((*SKIM3_overlapcritlevell)[readI->rid1]) << " " << static_cast<uint16>((*SKIM3_overlapcritlevelr)[readI->rid1])
            << "\t\t" << readI->rid2 << ": " << static_cast<uint16>((*SKIM3_overlapcritlevell)[readI->rid2]) << " " << static_cast<uint16>((*SKIM3_overlapcritlevelr)[readI->rid2]) << endl);

      if((del1 && del2) || del3){
        --writeI;
        CEBUG("Purged: " << *readI);
        if(SKIM3_logflag_purgeunnecessaryhits){
          logfout << "Purged:\t" << SKIM3_readpool->getRead(readI->rid1).getName()
                  << " (" << static_cast<uint16>((*SKIM3_overlapcritlevell)[readI->rid1])
                  << "," << static_cast<uint16>((*SKIM3_overlapcritlevelr)[readI->rid1]) << ")"
                  << '\t' << SKIM3_readpool->getRead(readI->rid2).getName()
                  << " (" << static_cast<uint16>((*SKIM3_overlapcritlevell)[readI->rid2])
                  << "," << static_cast<uint16>((*SKIM3_overlapcritlevelr)[readI->rid2]) << ")"
                  << "\t(" << static_cast<uint16>(ocll)
                  << "," << static_cast<uint16>(oclr) << ")"
                  << '\t' << *readI;
        }
      }else{
        if(min(readI->rid1,readI->rid2) == 0) CEBUG("DINGO! ");
        CEBUG("Kept: " << *readI);
        if(SKIM3_logflag_purgeunnecessaryhits){
          logfout << "Kept:\t" << SKIM3_readpool->getRead(readI->rid1).getName()
                  << " (" << static_cast<uint16>((*SKIM3_overlapcritlevell)[readI->rid1])
                  << "," << static_cast<uint16>((*SKIM3_overlapcritlevelr)[readI->rid1]) << ")"
                  << '\t' << SKIM3_readpool->getRead(readI->rid2).getName()
                  << " (" << static_cast<uint16>((*SKIM3_overlapcritlevell)[readI->rid2])
                  << "," << static_cast<uint16>((*SKIM3_overlapcritlevelr)[readI->rid2]) << ")"
                  << "\t(" << static_cast<uint16>(ocll)
                  << "," << static_cast<uint16>(oclr) << ")"
                  << '\t' << *readI;
        }
        if(!SKIM3_writtenhitsperid->empty()){
          (*SKIM3_writtenhitsperid)[min(readI->rid1,readI->rid2)]+=2;
        }
      }
    }

    // the resize thing is really not optimal ... one could write to file only a
    //  subset. However, at the moment just keep it for 100% safety
    CEBUG("Purge skim data. Old size: " << tsc.size() << endl);
    tsc.resize(tsc.size()-(readI-writeI));
    CEBUG("New size: " << tsc.size() << endl);

    // write the surviving hits of this chunk behind those written so far
    if(!tsc.empty()){
      fseek(finfout, fwritepos, SEEK_SET);
      if(fwrite(&tsc[0],
                sizeof(skimhitforsave_t),
                tsc.size(),
                finfout) != tsc.size()){
        MIRANOTIFY(Notify::FATAL, "Could not write anymore to normalised skim file. Disk full? Changed permissions?");
      }
      fwritepos=ftell(finfout);
      CEBUG("new fwritepos: " << fwritepos << endl);
    }
  }

  fclose(finfout);

  // cut off what is left of the old content behind the last kept hit
  cout << "truncating " << filename << " from " << finsize << " to " << fwritepos << endl;
  if(truncate(filename.c_str(),fwritepos)){
    MIRANOTIFY(Notify::FATAL, "Could not truncate normalised skim file? Strange ...");
  }

  FUNCEND();
  return;
}
//#define CEBUG(bla)






/*************************************************************************
 *
 * beware: SKIM3_vashortcuts_* arrays may be empty at the return of this
 *  function in case the vhraparray itself was empty! Account for that in
 *  the search functions!
 *
 *************************************************************************/
//#define CEBUG(bla)   {cout << bla; cout.flush();}
//#define CEBUGF(bla)  {cout << bla; cout.flush();}

// Builds the lookup tables SKIM3_vashortcuts_begin / SKIM3_vashortcuts_end
//  for 'vhraparray' and remembers vhraparray.end() in
//  SKIM3_completevhraparray_end.
//
// Each table is indexed by (vhash & MAXVHASHMASK) and gets
//  4^min(12,basesperhash) slots, all preset to vhraparray.end(). For every
//  run of consecutive elements sharing the same masked hash, the 'begin'
//  slot receives the first element of the run and the 'end' slot the
//  position one past its last element.
// Presumably the array is sorted by vhash before this is called (TODO
//  confirm at the call sites); if a masked hash occurred in two separate
//  runs, the later run would overwrite the earlier one.
//
// For an empty 'vhraparray' both tables are left empty.
void Skim::makeVHRAPArrayShortcuts(vector<vhrap_t> & vhraparray, const uint8 basesperhash)
{
  SKIM3_vashortcuts_begin.clear();
  SKIM3_vashortcuts_end.clear();
  SKIM3_completevhraparray_end=vhraparray.end();

  vector<vhrap_t>::const_iterator runI=vhraparray.begin();
  if(runI==vhraparray.end()) return;

  // number of slots: two bits per base, capped at 12 bases
  const int numslots=1<<(min(static_cast<uint8>(12),basesperhash)*2);

  // vhraparray.end() doubles as the "no entry" marker in both tables
  SKIM3_vashortcuts_begin.resize(numslots, vhraparray.end());
  SKIM3_vashortcuts_end.resize(numslots, vhraparray.end());

  while(runI != vhraparray.end()){
    const vhash_t slot=(runI->vhash & MAXVHASHMASK);

    SKIM3_vashortcuts_begin[slot]=runI;
    // walk to the end of the run with this masked hash
    do {
      ++runI;
    } while(runI != vhraparray.end() && (runI->vhash & MAXVHASHMASK) == slot);
    SKIM3_vashortcuts_end[slot]=runI;
  }
}

//#define CEBUG(bla)
//#define CEBUGF(bla)




/*************************************************************************
 *
 * TODO: this is a mess, rewrite
 *
 *
 *************************************************************************/

//#define CEBUG(bla)   {cout << bla; cout.flush();}
//#define CEBUGF(bla)  {cout << bla; cout.flush();}

// Walks over 'slen' characters of 'seq', maintains a rolling hash of the
//  last 'basesperhash' bases (2 bits per base: A=0, C=1, G=2, T=3) and,
//  unless 'countonly' is set, writes selected hashes as vhrap_t entries
//  through 'vhraparrayI'.
//
// Parameters:
//  readid           stored in the 'readid' member of every entry written
//  actread          only used for its name in messages / debug output
//  seq, slen        sequence to hash; slen must not exceed
//                    MAXREADSIZEALLOWED (fatal otherwise)
//  basesperhash     window size, must be <= 32
//  vhraparrayI      output position; advanced by one for each entry
//                    written. The caller must provide enough room, no
//                    bounds are checked here.
//  countonly        if true, nothing is written (and 0 is returned)
//  hashsavestepping after a regular save, every n-th good hash is saved;
//                    must be >= 1
//  tagmaskvector    read once per base, without bounds check: the caller
//                    must supply at least 'slen' entries. A value of 0
//                    marks the position as not masked.
//  bposhashstats    source of the 'bhashstats' member of saved entries
//  bfpos, bfposinc  start index into bposhashstats and the step applied
//                    after each base (callers visible in this file pass
//                    +1 or -1)
//
// A hash is only saved when the window holds 'basesperhash' consecutive
//  ACGT bases and at least one of the last 'basesperhash' positions is
//  not masked.
// Any other valid IUPAC base (or '*') restarts the window. An invalid
//  character prints a message to cout and terminates via exit(100).
//
// Returns the number of entries written through vhraparrayI.
uint32 Skim::transformSeqToVariableHash (const uint32 readid, const Read & actread, const char * seq, uint32 slen, const uint8 basesperhash, vector<vhrap_t>::iterator & vhraparrayI, const bool countonly, const uint8 hashsavestepping, vector<uint8> & tagmaskvector, const vector<Read::bposhashstat_t> & bposhashstats, int32 bfpos, const int32 bfposinc)
{
  FUNCSTART("void Skim::transformSeqToVariableHash (...)");

  BUGIFTHROW(basesperhash>32, "basesperhash > 32 ?");
  BUGIFTHROW(hashsavestepping<1, "hashsavestepping < 1 ?");
  if(slen>MAXREADSIZEALLOWED){
    MIRANOTIFY(Notify::FATAL,"Read " << actread.getName() << " is " << slen << " bp long and thus longer than MAXREADSIZEALLOWED (" << MAXREADSIZEALLOWED << ") bases. Skim cannot handle than, sorry.");
  }

//  Read::setCoutType(Read::AS_TEXT);
  CEBUG(actread);
  CEBUG("readid: " << readid << endl);
  CEBUG("seq: " << seq << endl);
  CEBUG("strlen(seq): " << strlen(seq) << endl);
  CEBUG("slen: " << slen << endl);
  CEBUG("bfpos: " << bfpos << endl);
  CEBUG("bfposinc: " << bfposinc << endl);

  vector<Read::bposhashstat_t>::const_iterator bhsI=bposhashstats.begin();
  advance(bhsI,bfpos);

  vhash_t lasthash=0;
  vhash_t acthash=0;
  // hashmask keeps the lowest basesperhash*2 bits of the rolling hash
  vhash_t hashmask=1;
  // *grml* undefined behaviour of left shift for 64 shifts in a 64 bit type makes this cludge necessary
  if(basesperhash==32){
    hashmask=0;
  }else{
    hashmask<<=(basesperhash*2);
  }
  --hashmask;

  // one bit per position of the current window, set if the position is
  //  not masked; nmpmask keeps the lowest 'basesperhash' bits
  uint32 nonmaskedposbitvector=0;
  uint32 nmpmask=1;
  // *grml* undefined behaviour of left shift for 32 shifts in a 32 bit type makes this cludge necessary
  if(basesperhash==32){
    nmpmask=0;
  }else{
    nmpmask<<=(basesperhash);
  }
  --nmpmask;

  CEBUG("sizeof vhash_t: " << sizeof(vhash_t) << '\n');
  CEBUG("bases per hash: " << static_cast<uint16>(basesperhash) << '\n');
  CEBUG("hashsavestepping: " << static_cast<uint16>(hashsavestepping) << '\n');
  CEBUG("hash mask: " << hex << hashmask << dec << '\n');
  CEBUG("nmpmask: " << hex << nmpmask << dec << '\n');

  // first hash made must also be saved
  uint8 hashsavecounter=1;

  // goods / bads are only reported in debug output
  uint32 goods=0;
  uint32 bads=0;
  // number of consecutive ACGT bases seen since the last restart
  uint32  baseok=0;
  vector<vhrap_t>::iterator initial_vaI=vhraparrayI;
  vector<uint8>::const_iterator tmvI=tagmaskvector.begin();

  CEBUG("Hashing " << actread.getName() << '\t' << slen << '\t' << strlen(seq) << '\t' << actread.getLenClippedSeq() << "\n");


  char actbase;
  bool mustsavelasthash=false;
  uint16 lastposhashsaved=0;

  bool notright=false;
  for(uint16 seqi=0; seqi<slen; seqi++, seq++, tmvI++){
    lasthash=acthash;
    acthash<<=2;
    acthash&=hashmask;
    baseok++;

    // toupper() has undefined behaviour for negative arguments other
    //  than EOF, so go through unsigned char first
    actbase=static_cast<char>(toupper(static_cast<unsigned char>(*seq)));

    CEBUG(seqi << '\t' << actbase << endl);

    switch (actbase) {
    case 'A' : break;
    case 'C' : {
      acthash+=1;
      break;
    }
    case 'G' : {
      acthash+=2;
      break;
    }
    case 'T' : {
      acthash+=3;
      break;
    }
    default : {
      if(dptools::isValidIUPACStarBase(actbase)) {
        // the IUPAC bases are treated like N and X

        // NOTE(review): baseok is reset to 0 just below, so for
        //  basesperhash>0 the "missed hash" branch further down clears
        //  this flag again within the same iteration. Intended? Confirm.
        mustsavelasthash=true;

        // break hash making (which is actually better than behaving
        //  like another character in case of multiple bases with
        //  IUPAC or '*')
        acthash=0;
        baseok=0;
      } else {
        cout << "Unknown base '" << *seq << "' (ASCII " << static_cast<uint16>(*seq) << ") at position " << seqi << " in _CLIPPED_ sequence " << actread.getName() << endl;
        exit(100);
      }
    }
    }

    // handling of masked positions
    bool lastposhadunmasked=false;
    if(nonmaskedposbitvector) lastposhadunmasked=true;
    nonmaskedposbitvector<<=1;
    nonmaskedposbitvector&=nmpmask;
    if(*tmvI==0){
      nonmaskedposbitvector|=1;
    }

    // the window just became completely masked: remember to save the
    //  hash of the previous position unless that was saved already
    if(lastposhadunmasked && nonmaskedposbitvector==0
       && lastposhashsaved != seqi-1) {
      mustsavelasthash=true;
    }

    CEBUG(seqi << ' ' << *seq << ' ' << static_cast<uint16>(*tmvI) << ' ' << hex << nonmaskedposbitvector << dec << ' ' << mustsavelasthash << ' ');
    if(baseok >= basesperhash) {
      goods++;
      if(!countonly && nonmaskedposbitvector) {
        if(mustsavelasthash){
          vhraparrayI->vhash=lasthash;
          vhraparrayI->readid=readid;
          vhraparrayI->hashpos=seqi-1;
          vhraparrayI->bhashstats=(bhsI-bfposinc)->getBHashStat(-bfposinc);
          // getBHashStat(-bfposinc) because while we're running "forward", we save
          //  hashes only with a delay of 'basesperhash' and need to know the status
          //  of the "past" bases ... and this info is readily available in
          //  the BHashStat of the other strand

          CEBUG("saved LG hash: " << *vhraparrayI << '\n');

          lastposhashsaved=seqi-1;
          vhraparrayI++;
          // set hashsavecounter to 1 so that the next good hash
          //  generated is saved!
          hashsavecounter=1;
          mustsavelasthash=false;
        } else if(--hashsavecounter == 0){
          vhraparrayI->vhash=acthash;
          vhraparrayI->readid=readid;
          vhraparrayI->hashpos=seqi;
          vhraparrayI->bhashstats=bhsI->getBHashStat(-bfposinc);
          // getBHashStat(-bfposinc) because while we're running "forward", we save
          //  hashes only with a delay of 'basesperhash' and need to know the status
          //  of the "past" bases ... and this info is readily available in
          //  the BHashStat of the other strand

          CEBUG("saved hash: " << *vhraparrayI << '\n');

          vhraparrayI++;
          lastposhashsaved=seqi;
          hashsavecounter=hashsavestepping;
        }
      }

    } else {
      //cout << "Missed hash" << endl;
      mustsavelasthash=false;
      if(seqi>=basesperhash) {
        bads++;
      }
    }
    CEBUG('\n');

    // this is a hack to make this routine work in -D_GLIBCXX_DEBUG mode
    // normally, this iterator should be handled by the for() statement,
    //  but in reverse cases where the read has no left clip, after the
    //  last loop the iterator would advance to the "-1" position where the
    //  STL debug containers are not happy with, even though the iterator
    //  wouldn't be used as the for() loops would stop right there
    // therefore, this cludge

    if(bfposinc<0){
      // 'notright' is set once bhsI could not be stepped back any more;
      //  reaching this point a second time means the loop ran further
      //  than bposhashstats allows
      if(notright){
        MIRANOTIFY(Notify::FATAL, "Something's not right here.");
      }
      if(bhsI!=bposhashstats.begin()) {
        bhsI+=bfposinc;
      }else{
        notright=true;
      }
    }else{
      bhsI+=bfposinc;
    }
  }

  //for(uint32 i=0; i<basesperhash; i++, hashp++, hashokp++) {
  //  *hashp=0;
  //  *hashokp=0;
  //}

#ifdef CEBUGFLAG
  CEBUG("goods: " << goods << endl);
  CEBUG("bads: " << bads << endl);
#endif

  return (vhraparrayI-initial_vaI);
}

//#define CEBUG(bla)
//#define CEBUGF(bla)






/*************************************************************************
 *
 * sorter to sort from low to high
 *
 *
 *************************************************************************/

// Ordering predicate for vhrap_t: ascending by the 'vhash' member only,
//  all other members are ignored.
inline bool Skim__compareVHRAPArrayElem_(const vhrap_t & one, const vhrap_t & other) 
{
  return other.vhash > one.vhash;
}


// Ordering predicate for readhashmatch_t, ascending by
//  rid2, then eoffset, then hashpos1.
bool Skim__sortreadhashmatch_t_(const readhashmatch_t & a,
                                const readhashmatch_t & b);
bool Skim__sortreadhashmatch_t_(const readhashmatch_t & a, const readhashmatch_t & b)
{
  if(a.rid2 != b.rid2) return a.rid2 < b.rid2;
  if(a.eoffset != b.eoffset) return a.eoffset < b.eoffset;
  return a.hashpos1 < b.hashpos1;
}

/*************************************************************************
 *
 * sorter to sort from high to low
 *
 *
 *************************************************************************/

// Ordering predicate for matchwithsorter_t, descending by
//  percent_in_overlap; ties are broken by descending numhashes.
bool Skim__sortMWByPercent_(const matchwithsorter_t & a,
                            const matchwithsorter_t & b);
bool Skim__sortMWByPercent_(const matchwithsorter_t & a, const matchwithsorter_t & b)
{
  if(a.percent_in_overlap != b.percent_in_overlap) {
    return a.percent_in_overlap > b.percent_in_overlap;
  }
  return a.numhashes > b.numhashes;
}

// Ordering predicate for matchwithsorter_t, descending by estimscore;
//  ties are broken by descending percent_in_overlap.
bool Skim__sortMWByEstimScore_(const matchwithsorter_t & a,
                               const matchwithsorter_t & b);
bool Skim__sortMWByEstimScore_(const matchwithsorter_t & a, const matchwithsorter_t & b)
{
  if(a.estimscore != b.estimscore) {
    return a.estimscore > b.estimscore;
  }
  return a.percent_in_overlap > b.percent_in_overlap;
}

// Ordering predicate for matchwithsorter_t, descending by numhashes;
//  ties are broken by descending percent_in_overlap.
bool Skim__sortMWByNumHashes_(const matchwithsorter_t & a,
                              const matchwithsorter_t & b);
bool Skim__sortMWByNumHashes_(const matchwithsorter_t & a, const matchwithsorter_t & b)
{
  if(a.numhashes != b.numhashes){
    return a.numhashes > b.numhashes;
  }
  return a.percent_in_overlap > b.percent_in_overlap;
}

//#define CEBUG(bla)   {cout << bla; cout.flush();}


//#define CEBUG(bla)   {boost::mutex::scoped_lock lock(SKIM3_coutmutex); cout << bla; cout.flush();}

// TODO: 'direction' should not be a parameter of this call; it rather belongs to the called function
void Skim::startMultiThreading(const int8 direction, const uint32 numthreads, const uint32 readsperthread, const uint32 firstid, const uint32 lastid, boost::function<void(uint32_t)> initfunc, boost::function<void(uint32_t)> callfunc)
{
  // initialise task specific data by task specific init routine
  initfunc(numthreads);

  // initialise the data structure with which the master
  //  process (well, this process) will communicate with the 
  //  worker threads
  // do this *before* creating the threads :-)
  {
    threadworkercontrol_t twc;
    twc.from=0;
    twc.to=0;
    twc.direction=direction;
    twc.flag_datavalid=false;
    twc.flag_endthread=false;
    
    SKIM3_threadcontrol.clear();
    SKIM3_threadcontrol.resize(numthreads,twc);
  }

  // create the number of worker threads we will use
  boost::thread_group workerthreads;
  for(uint32 i=0; i<numthreads;i++){
    workerthreads.create_thread(boost::bind(callfunc, i));
  }

  // main work distribution loop
  // gives each thread a part of the search space. If no 
  //  thread is free, waits for a slave2master signal
  //  (which currently can only mean a thread has finished
  //  going through it's search space)

  uint32 startid=firstid;
  while(startid < lastid) {
    boost::mutex::scoped_lock mylock(SKIM3_mutex);

    // search thread that is idle
    uint32 tnr=0;
    for(; tnr<numthreads; tnr++){
      if(SKIM3_threadcontrol[tnr].flag_datavalid==false) break;
    }
    if(tnr==numthreads) {
      // no idle thread?
      //  well, wait for a slave2master signal
      SKIM3_slave2mastersignal.wait(mylock);
    }else{
      uint32 endid=startid+readsperthread;
      if(endid>lastid) endid=lastid;
      
      CEBUG("Giving " << startid << " to " << endid << " to thread " << tnr << "\n");
      SKIM3_threadcontrol[tnr].from=startid;
      SKIM3_threadcontrol[tnr].to=endid;
      SKIM3_threadcontrol[tnr].flag_datavalid=true;
      
      SKIM3_master2slavesignal.notify_all();

      startid=endid;
    }
  }

  // no more work to distribute
  // tell workerthreads to end as soon as they finished their
  //  current task
  CEBUG("Last packet given, flagging all threads the stop signal.\n");

  {
    boost::mutex::scoped_lock mylock(SKIM3_mutex);
    for(uint32 tnr=0; tnr<numthreads; tnr++){
      SKIM3_threadcontrol[tnr].flag_endthread=true;
    }
  }
  SKIM3_master2slavesignal.notify_all();

  // and wait for all threads of the threadgroup
  //  to return
  workerthreads.join_all();

}

// Prepares SKIM3_cfhd_vector for 'numthreads' worker threads: the vector
//  is resized to one entry per thread, and in every entry each work
//  container is emptied and gets a minimum capacity reserved.
void Skim::cfhThreadsDataInit(const uint32 numthreads)
{
  FUNCSTART("void Skim::cfhThreadsDataInit(const uint32 numthreads)");

  SKIM3_cfhd_vector.resize(numthreads);

  uint32 ti=0;
  while(ti<numthreads){
    // empty all containers of this thread ...
    SKIM3_cfhd_vector[ti].readhashmatches.clear();
    SKIM3_cfhd_vector[ti].smallhist4repeats.clear();
    SKIM3_cfhd_vector[ti].singlereadvhraparray.clear();
    SKIM3_cfhd_vector[ti].tmpmatchwith.clear();
    SKIM3_cfhd_vector[ti].tagmaskvector.clear();
    SKIM3_cfhd_vector[ti].shfsv.clear();
    SKIM3_cfhd_vector[ti].ridswithmatches.clear();
    SKIM3_cfhd_vector[ti].uidswithnewcritlevell.clear();
    SKIM3_cfhd_vector[ti].uidswithnewcritlevelr.clear();
    SKIM3_cfhd_vector[ti].critlevellofnewuids.clear();
    SKIM3_cfhd_vector[ti].critlevelrofnewuids.clear();

    // ... and make sure they have some capacity to start with
    SKIM3_cfhd_vector[ti].readhashmatches.reserve(500000);
    SKIM3_cfhd_vector[ti].smallhist4repeats.reserve(100);
    SKIM3_cfhd_vector[ti].singlereadvhraparray.reserve(5000);
    SKIM3_cfhd_vector[ti].tmpmatchwith.reserve(2000);
    SKIM3_cfhd_vector[ti].tagmaskvector.reserve(2000);
    SKIM3_cfhd_vector[ti].shfsv.reserve(100000);
    SKIM3_cfhd_vector[ti].ridswithmatches.reserve(10000);
    SKIM3_cfhd_vector[ti].uidswithnewcritlevell.reserve(10000);
    SKIM3_cfhd_vector[ti].uidswithnewcritlevelr.reserve(10000);
    SKIM3_cfhd_vector[ti].critlevellofnewuids.reserve(10000);
    SKIM3_cfhd_vector[ti].critlevelrofnewuids.reserve(10000);

    ++ti;
  }
  FUNCEND();
}

// Body of one worker thread, counterpart of startMultiThreading().
//
// The thread repeatedly waits on SKIM3_master2slavesignal until its
//  control slot SKIM3_threadcontrol[threadnr] has either flag_datavalid
//  or flag_endthread set.
//  - flag_datavalid: the id range from/to of the slot is processed with
//    checkForHashes_fromto(), then the flag is cleared under SKIM3_mutex
//    and the master is notified via SKIM3_slave2mastersignal
//  - otherwise (flag_endthread): the loop is left
// Pending data is always processed before an end request is honoured.
//
// Before returning, skim hits still buffered in the thread's 'shfsv' are
//  written to the match file (guarded by SKIM3_resultfileoutmutex).
//
// threadnr must be a valid index into SKIM3_cfhd_vector.
void Skim::cfhThreadLoop(const uint32 threadnr)
{
  FUNCSTART("void Skim::cfhThreadLoop(const uint32 threadnr)");

  // threads need their own try() catch() block

  try {
    CEBUG("Thread: " << threadnr << " starting.\n");

    BUGIFTHROW(threadnr>=SKIM3_cfhd_vector.size(),"threadnr>=SKIM3_cfhd_vector.size()???");
    cfh_threaddata_t & cfhd=SKIM3_cfhd_vector[threadnr];

    // we'll jump out with a break;
    while(true){
      // take the decision "work or end" while still holding the mutex
      //  so that the control flags are never read unsynchronised
      bool havework=false;
      {
        boost::mutex::scoped_lock mylock(SKIM3_mutex);
        CEBUG("Thread " << threadnr << " waiting ...\n");
        while(!SKIM3_threadcontrol[threadnr].flag_datavalid
              && ! SKIM3_threadcontrol[threadnr].flag_endthread){
          SKIM3_master2slavesignal.wait(mylock);
        }
        havework=SKIM3_threadcontrol[threadnr].flag_datavalid;
      }

      if(!havework){
        // the wait loop above only ends without data if flag_endthread
        //  is set
        CEBUG("Thread " << threadnr << "  exiting.\n");
        break;
      }

      CEBUG("Thread " << threadnr << " working on " << SKIM3_threadcontrol[threadnr].from << " to " << SKIM3_threadcontrol[threadnr].to << "\n");

      // hits go to the forward or the complement match file, depending
      //  on the direction
      cfhd.posmatchfout=&SKIM3_posfmatchfout;
      if(SKIM3_threadcontrol[threadnr].direction<0) cfhd.posmatchfout=&SKIM3_poscmatchfout;
      checkForHashes_fromto(SKIM3_threadcontrol[threadnr].direction,
                            SKIM3_threadcontrol[threadnr].from,
                            SKIM3_threadcontrol[threadnr].to,
                            cfhd);

      // packet done: mark the slot as free and wake up the master
      boost::mutex::scoped_lock mylock(SKIM3_mutex);
      SKIM3_threadcontrol[threadnr].flag_datavalid=false;

      SKIM3_slave2mastersignal.notify_one();
    }

    // flush what is still sitting in the save buffer
    if(cfhd.shfsv.size()){
      boost::mutex::scoped_lock lock(SKIM3_resultfileoutmutex);
      cfhd.posmatchfout->write(reinterpret_cast<char*>(&cfhd.shfsv[0]),sizeof(skimhitforsave_t)*cfhd.shfsv.size());
      if(cfhd.posmatchfout->bad()){
        MIRANOTIFY(Notify::FATAL, "Could not write anymore to skimhit save6. Disk full? Changed permissions?");
      }
      cfhd.shfsv.clear();
    }

  }
  catch(Notify & n){
    // caught by reference: no copy of the exception object is made
    n.handleError(THISFUNC);
  }

  FUNCEND();
}

//#define CEBUG(bla)




/*************************************************************************
 *
 *
 * TODO: counting fullencased is not thread safe atm! is it needed? should
 *        be approximative anyway
 *
 *************************************************************************/

//#define CEBUG(bla)   {cout << bla; cout.flush();}

// Searches hash matches for the reads with ids in [fromid, toid) against
//  the hash array indexed by SKIM3_vashortcuts_begin / _end.
//
// direction > 0 hashes the clipped sequence of each read, any other value
//  hashes its clipped complement sequence. 'cfhd' holds the work buffers
//  of the calling thread; all of them except 'shfsv' are cleared on entry.
//
// For every read that is not skipped (known megahub, fully encased, no
//  valid data, not used in assembly, shorter than SKIM3_basesperhash):
//  - the read is hashed with transformSeqToVariableHash()
//  - every hash is looked up; hits in reads with a *lower* id than the
//    current read are collected in cfhd.readhashmatches
//  - with more than 150000 raw hits the read is a potential megahub: only
//    matches with hash frequency <= 4 are kept, and if more than half of
//    the matches survive this, the read is flagged in SKIM3_megahubs
//  - reads not flagged as megahub are passed on to checkForPotentialHits()
//    and selectPotentialHitsForSave2()
//
// Finally the progress indicator is advanced by (toid-fromid). If the
//  shortcut tables are empty, the function returns right away without
//  touching anything (including the progress indicator).
void Skim::checkForHashes_fromto(const int8 direction, const uint32 fromid, const uint32 toid, cfh_threaddata_t & cfhd)
{
  FUNCSTART("void Skim::checkForHashes_fromto(const int8 direction, const uint32 fromid, const uint32 toid, cfh_threaddata_t & cfhd)");

  // really?
  //BUGIFTHROW(Read::getNumSequencingTypes() >4, "Must be reworked for new sequencing types! (encasement shortcuts & others?");

  if(SKIM3_vashortcuts_begin.empty() || SKIM3_vashortcuts_end.empty()) return;

  cfhd.readhashmatches.clear();
  cfhd.singlereadvhraparray.clear();
  cfhd.tmpmatchwith.clear();
  cfhd.tagmaskvector.clear();
  // do NOT clear vector<skimhitforsave_t> shfsv !!!
  cfhd.ridswithmatches.clear();
  cfhd.uidswithnewcritlevell.clear();
  cfhd.uidswithnewcritlevelr.clear();
  cfhd.critlevellofnewuids.clear();
  cfhd.critlevelrofnewuids.clear();


  for(uint32 actreadid=fromid; actreadid<toid; actreadid++){
    //if(actreadid>100) return;

    // don't need to go through identified megahubs again
    if(SKIM3_megahubs[actreadid]>0) continue;

    // if this read has been fully encased by other reads, then also
    //  skip it
    if(SKIM3_fullencasedcounter[actreadid]) continue;

    Read & actread= SKIM3_readpool->getRead(actreadid);
    if(!actread.hasValidData()
      || !actread.isUsedInAssembly()) continue;

    uint32 slen=actread.getLenClippedSeq();

    if(slen<SKIM3_basesperhash) continue;

    // room for the maximum number of hashes; shrunk to the real number
    //  after hashing
    cfhd.singlereadvhraparray.resize(slen);

    vector<vhrap_t>::iterator srvaI=cfhd.singlereadvhraparray.begin();
    const vector<Read::bposhashstat_t> & bposhashstats=actread.getBPosHashStats();

    uint32 hashesmade;
    if(direction>0) {
      fillTagMaskVector(actreadid, cfhd.tagmaskvector);
      int32 bfpos=actread.calcClippedPos2RawPos(0);
      int32 bfposinc=1;
      hashesmade=transformSeqToVariableHash(
        actreadid,
        actread,
        actread.getClippedSeqAsChar(),
        slen,
        SKIM3_basesperhash,
        srvaI,
        false,
        1,
        cfhd.tagmaskvector,
        bposhashstats,
        bfpos,
        bfposinc
        );
    }else{
      // TODO: first fill, then reverse is ... stupid
      fillTagMaskVector(actreadid, cfhd.tagmaskvector);
      reverseTagMaskVector(cfhd.tagmaskvector);
      int32 bfpos=actread.calcClippedComplPos2RawPos(0);
      int32 bfposinc=-1;
      hashesmade=transformSeqToVariableHash(
        actreadid,
        actread,
        actread.getClippedComplementSeqAsChar(),
        slen,
        SKIM3_basesperhash,
        srvaI,
        false,
        1,
        cfhd.tagmaskvector,
        bposhashstats,
        bfpos,
        bfposinc
        );
    }

    cfhd.singlereadvhraparray.resize(hashesmade);

    srvaI=cfhd.singlereadvhraparray.begin();
    vector<vhrap_t>::const_iterator lowerbound;
    vector<vhrap_t>::const_iterator upperbound;
    // raw number of hash hits of this read, before the read id filter
    uint32 truetestsm2hits=0;
    for(; srvaI != cfhd.singlereadvhraparray.end(); srvaI++){
      lowerbound=SKIM3_vashortcuts_begin[srvaI->vhash & MAXVHASHMASK];
      upperbound=SKIM3_vashortcuts_end[srvaI->vhash & MAXVHASHMASK];

      // SKIM3_completevhraparray_end is the "NULL" replacement: shortcut
      //  slots without entries point there
      if(SKIM3_completevhraparray_end != lowerbound){
        if(SKIM3_basesperhash>12){
          // with more than 12 bases in a hash, the vhrap array is
          //  subdivided
          pair<vector<vhrap_t>::const_iterator, vector<vhrap_t>::const_iterator>
            p=equal_range(lowerbound,
                          upperbound,
                          *srvaI,
                          Skim__compareVHRAPArrayElem_);
          lowerbound=p.first;
          upperbound=p.second;
        }

        for(;lowerbound!=upperbound; lowerbound++){
          truetestsm2hits++;

          CEBUG("/// " << actreadid << '\t' << lowerbound->readid << '\n');

          // hmmmm .....
          // original: if(actreadid > lowerbound->readid){
          // this fails spectacularly for mapping now that rails shifted to end of pool
          // correct resolution would be adding
          //
          // but this might slow down the search quite a bit
          if(actreadid > lowerbound->readid){
            CEBUG("/// take!\n");
            cfhd.readhashmatches.resize(cfhd.readhashmatches.size()+1);
            cfhd.readhashmatches.back().rid2=lowerbound->readid;
            cfhd.readhashmatches.back().hashpos1=srvaI->hashpos;
            cfhd.readhashmatches.back().hashpos2=lowerbound->hashpos;
            cfhd.readhashmatches.back().eoffset=srvaI->hashpos - lowerbound->hashpos;
            cfhd.readhashmatches.back().bhashstats=srvaI->bhashstats;
          }
        }
      }
    }

    CEBUG("actreadid: " << actreadid << "\treadhashmatches.size(): " << cfhd.readhashmatches.size() << "\ttruetestsm2hits: " << truetestsm2hits << endl);

    // Hmmm ... this does not represent the full truth, but without "if" there
    // is a partition effect in the data. Bad, but cannot be helped (except
    // going through all reads in all partitions, which is unnecessary for the
    // search itself and effectively doubles the SKIM time)
    //if(SKIM_partfirstreadid==0)(*SKIM3_rawhashitcounter)[actreadid]+=truetestsm2hits;

    if(cfhd.readhashmatches.size()>0){
      if(truetestsm2hits>150000) {
        CEBUG("Potential megahub: " << actreadid << "\treadhashmatches.size(): " << cfhd.readhashmatches.size() << endl);

        // ok, potential megahub. To save the situation, throw out
        //  all hashes with a frequency > 4
        // if the size of readhashmatches can be reduced by at least 50%,
        //  then it's not treated as megahub

        size_t oldsize=cfhd.readhashmatches.size();
        // in-place compaction: dstI trails srcI and receives the
        //  matches that are kept
        vector<readhashmatch_t>::iterator dstI=cfhd.readhashmatches.begin();
        vector<readhashmatch_t>::iterator srcI=dstI;

        for(; srcI != cfhd.readhashmatches.end(); srcI++){
          if(srcI->bhashstats.getFrequency()<=4){
            *dstI=*srcI;
            dstI++;
          }
        }
        cfhd.readhashmatches.resize(dstI-cfhd.readhashmatches.begin());

        if(cfhd.readhashmatches.size() > oldsize/2) {
          CEBUG("Megahub confirmed: " << actreadid << "\treadhashmatches.size(): " << cfhd.readhashmatches.size() << endl);
          SKIM3_megahubs[actreadid]=1;
        }
      }
      if(!SKIM3_megahubs[actreadid]) {
        checkForPotentialHits(direction, actreadid, cfhd.tmpmatchwith, cfhd.readhashmatches, cfhd.smallhist4repeats);

        selectPotentialHitsForSave2(direction, actreadid,
                                    cfhd);

      }
      cfhd.readhashmatches.clear();
    }
  }

  {
    boost::mutex::scoped_lock lock(SKIM3_coutmutex);
    SKIM_progressindicator->increaseprogress(toid-fromid);
  }

}

//#define CEBUG(bla) 





/*************************************************************************
 *
 *
 * 
 *************************************************************************/

#define CEBUG2(bla)

//#define CEBUG(bla)   {cout << bla; cout.flush();}
//#define CEBUG2(bla)   {cout << bla; cout.flush();}

// Decides which of the potential hits in cfhd.tmpmatchwith (hits of read
//  'actreadid' in direction 'direction') are kept, and appends the kept
//  ones to the save buffer cfhd.shfsv.
//
// Steps:
//  1. updateCriterionLevels() is called, then tmpmatchwith is sorted
//     with Skim__sortMWByEstimScore_
//  2. a hit is taken if
//      - its overlap criterion level (left or right) is not 255 and not
//        worse than the level stored for actreadid, or
//      - actreadid is a Solexa read and the relaxed "<= 29" rule applies,
//        or
//      - it is a full-length overlap involving a rail, or
//      - it is a full-length overlap, actreadid is at least as long as
//        the other read and the estimated score is >= the largest
//        encasement score seen so far for the other read
//  3. unless a full-length rail overlap was taken, up to 3 more hits per
//     side are taken from the not yet taken hits that extend actreadid
//     to that side (limited further by what is left of
//     SKIM3_maxhitsperread/2 per side)
//  4. taken hits are appended to cfhd.shfsv with rid1 = other read,
//     rid2 = actreadid and negated eoffset. Whenever the buffer has
//     reached its capacity it is first written to cfhd.posmatchfout
//     (guarded by SKIM3_resultfileoutmutex) and emptied.
//     With SKIM3_logflag_save2 set, every decision is also appended to
//     a log file "elog.skim.save2.<name of match file>".
//  5. SKIM3_totalhitschosen is increased by the number of hits taken.
//
// cfhd.ridswithmatches is used as scratch space and is empty on return.
// With an empty tmpmatchwith the function returns after clearing it.
void Skim::selectPotentialHitsForSave2(const int8 direction, const uint32 actreadid, cfh_threaddata_t & cfhd)
{
  FUNCSTART("void Skim::selectPotentialHitsForSave2(const int8 direction, const uint32 actreadid, cfh_threaddata_t & cfhd)");

  cfhd.ridswithmatches.clear();
  if(cfhd.tmpmatchwith.empty()) return;
  CEBUG("start selectPotentialHitsForSave2()\n");

  updateCriterionLevels(direction,actreadid,
                        cfhd);

  sort(cfhd.tmpmatchwith.begin(), cfhd.tmpmatchwith.end(), Skim__sortMWByEstimScore_);

  ADSEstimator adse;

  // budget of extending hits per side
  uint32 numleftext=SKIM3_maxhitsperread/2;
  uint32 numrightext=numleftext;
  uint8 ocll=255;
  uint8 oclr=255;

  bool takenrailfulllength=false;

  CEBUG("actreadid critlevel: " << static_cast<uint16>((*SKIM3_overlapcritlevell)[actreadid]) << " " << static_cast<uint16>((*SKIM3_overlapcritlevelr)[actreadid]) << '\n');

  // take all which have the same or lower level (if it is not 255)

  vector<matchwithsorter_t>::iterator tmwI=cfhd.tmpmatchwith.begin();
  for(;tmwI!=cfhd.tmpmatchwith.end(); ++tmwI) {
    CEBUG("### " << tmwI-cfhd.tmpmatchwith.begin() << "\ntmwI: " << *tmwI << endl);

    adse.calcNewEstimateFromSkim(
      tmwI->eoffset,
      (*SKIM3_readpool)[actreadid].getLenClippedSeq(),
      (*SKIM3_readpool)[tmwI->otherid].getLenClippedSeq(),
      actreadid,
      tmwI->otherid,
      1,
      direction);
    CEBUG("ADSE: " << adse << endl);

    getOverlapCriterionLevel(actreadid,
                             SKIM3_readpool->getRead(actreadid).getSequencingType(),
                             adse,
                             static_cast<uint8>(tmwI->percent_in_overlap),
                             ocll,
                             oclr);
    CEBUG("ocll: " << static_cast<uint16>(ocll) << " " << static_cast<uint16>(oclr) << '\n');

    bool regulartake=false;

    if((ocll!=255 && ocll<=(*SKIM3_overlapcritlevell)[actreadid])
       || (oclr!=255 && oclr<=(*SKIM3_overlapcritlevelr)[actreadid])){
      regulartake=true;
    }else if(SKIM3_readpool->getRead(actreadid).isSequencingType(Read::SEQTYPE_SOLEXA)){
      // if it's a Solexa and the best overlapcritlevel is not 0, we are probably in a under-coverage
      //  situation ... try to account for that by being less harsh
      // see purgeUnnecessaryHitsFromSkimFile() for the value "29"
      if(ocll <= 29
         && (*SKIM3_overlapcritlevell)[actreadid]!= 0
         && (*SKIM3_overlapcritlevell)[actreadid] <= 29){
        regulartake=true;
      }else if(oclr <= 29
               && (*SKIM3_overlapcritlevelr)[actreadid]!= 0
               && (*SKIM3_overlapcritlevelr)[actreadid] <= 29){
        regulartake=true;
      }
    }

    if(regulartake){
      CEBUG("actreadid critlevel: " << static_cast<uint16>((*SKIM3_overlapcritlevell)[actreadid]) << " " << static_cast<uint16>((*SKIM3_overlapcritlevelr)[actreadid]) << '\n');
      tmwI->taken=true;
      if(numleftext>0 && adse.getEstimatedLeftExpand(actreadid)>0) --numleftext;
      if(numrightext>0 && adse.getEstimatedRightExpand(actreadid)>0) --numrightext;
      CEBUG("+++++++++++++ take critlevel. nle " << numleftext << "\tnre: " << numrightext << '\n');
    } else if(tmwI->ol_fulllength){
      if((*SKIM3_readpool)[actreadid].isRail() || (*SKIM3_readpool)[tmwI->otherid].isRail()){
        // if one of the reads is a rail and there is a full-length overlap, we have to
        //  take that hit no matter what.
        // Reason: either it's a 100% hit, then it's obvious, or its a <100% hit and then
        //  the Smith-Waterman *needs* to have a look at that bugger to get the very best
        //  placement
        tmwI->taken=true;
        takenrailfulllength=true;
        CEBUG("++++++++++++ take rail\n");
      }else if((*SKIM3_readpool)[actreadid].getLenClippedSeq()>=(*SKIM3_readpool)[tmwI->otherid].getLenClippedSeq()){
        // if first read is larger and encase completely second read, take overlap
        //  if score is >= largest encasement score of other read seen to date
        if(tmwI->estimscore>=SKIM3_largestencasementscoretodate[tmwI->otherid]){
          if(tmwI->estimscore>SKIM3_largestencasementscoretodate[tmwI->otherid]){
            // is it worthwhile to make this thread safe???
            SKIM3_largestencasementscoretodate[tmwI->otherid]=tmwI->estimscore;
          }
          tmwI->taken=true;
          CEBUG("++++++++++++ take encasement\n");
        }
      }
    }
  }

  // if there are free capacities to any side, fill them with the best 3 extends
  //  but only if no full-length overlap was taken with a rail

  if(!takenrailfulllength){
    CEBUG("check freecap\n");
    tmwI=cfhd.tmpmatchwith.begin();
    if(numleftext>3) numleftext=3;
    if(numrightext>3) numrightext=3;
    for(;tmwI!=cfhd.tmpmatchwith.end(); ++tmwI) {
      if(numleftext==0 && numrightext==0) break;
      if(tmwI->taken==false){
        CEBUG("tmwI: " << *tmwI << endl);
        adse.calcNewEstimateFromSkim(
          tmwI->eoffset,
          (*SKIM3_readpool)[actreadid].getLenClippedSeq(),
          (*SKIM3_readpool)[tmwI->otherid].getLenClippedSeq(),
          actreadid,
          tmwI->otherid,
          1,
          direction);
        if(numleftext>0 && adse.getEstimatedLeftExpand(actreadid)>0){
          tmwI->taken=true;
          --numleftext;
          CEBUG("************* take free capl " << numleftext << '\n');
        }
        // a hit taken as left extension does not use up right capacity
        if(!tmwI->taken && numrightext>0 && adse.getEstimatedRightExpand(actreadid)>0) {
          tmwI->taken=true;
          --numrightext;
          CEBUG("************* take free capr " << numrightext << '\n');
        }
      }
    }
  }

  ofstream logfout;
  if(SKIM3_logflag_save2){
    string path,justfilename;
    if(direction>0){
      splitFullPathAndFileName(SKIM3_posfmatchfname,path,justfilename);
    }else{
      splitFullPathAndFileName(SKIM3_poscmatchfname,path,justfilename);
    }
    string logfilename=path+"/elog.skim.save2."+justfilename;
    logfout.open(logfilename.c_str(), ios::out|ios::app|ios::ate);
  }

  // we made our choice, now save that
  tmwI=cfhd.tmpmatchwith.begin();
  for(;tmwI!=cfhd.tmpmatchwith.end(); ++tmwI) {
    if(tmwI->taken){
      if(SKIM3_logflag_save2){
        logfout << "taken:\t" << SKIM3_readpool->getRead(tmwI->otherid).getName()
                << '\t' << SKIM3_readpool->getRead(actreadid).getName()
                << '\t' << *tmwI;
      }
      // buffer full: write it out and start over. The empty() test keeps
      //  us from indexing an empty vector should the buffer ever have a
      //  capacity of 0.
      if(!cfhd.shfsv.empty()
         && cfhd.shfsv.size()==cfhd.shfsv.capacity()){
        boost::mutex::scoped_lock lock(SKIM3_resultfileoutmutex);
        cfhd.posmatchfout->write(reinterpret_cast<char*>(&cfhd.shfsv[0]),sizeof(skimhitforsave_t)*cfhd.shfsv.size());
        if(cfhd.posmatchfout->bad()){
          MIRANOTIFY(Notify::FATAL, "Could not write anymore to skimhit save5. Disk full? Changed permissions?");
        }
        cfhd.shfsv.clear();
      }
      cfhd.shfsv.resize(cfhd.shfsv.size()+1);
      skimhitforsave_t & shfs=cfhd.shfsv.back();
      // the hit is stored as seen from the other read: ids swapped,
      //  offset negated
      shfs.rid1=tmwI->otherid;
      shfs.rid2=actreadid;
      shfs.eoffset=-(tmwI->eoffset);
      shfs.percent_in_overlap=tmwI->percent_in_overlap;
      shfs.numhashes=tmwI->numhashes;
      shfs.ol_stronggood  =tmwI->ol_stronggood;
      shfs.ol_weakgood    =tmwI->ol_weakgood;
      shfs.ol_belowavgfreq=tmwI->ol_belowavgfreq;
      shfs.ol_norept      =tmwI->ol_norept;
      shfs.ol_rept        =tmwI->ol_rept;

      CEBUG2("save2: " << actreadid << "\n" << *tmwI);

      cfhd.ridswithmatches.push_back(min(tmwI->otherid,actreadid));
    }else{
      if(SKIM3_logflag_save2){
        logfout << "dropped:\t" << SKIM3_readpool->getRead(tmwI->otherid).getName()
                << '\t' << SKIM3_readpool->getRead(actreadid).getName()
                << '\t' << *tmwI;
      }
    }
  }

  CEBUG2("Savestat " << actreadid << ": " << cfhd.ridswithmatches.size() << endl);
  if(!cfhd.ridswithmatches.empty()){
    // TODO: should get rid of that counter ... meaningless now
    boost::mutex::scoped_lock lock(SKIM3_whpid_mutex);
    SKIM3_totalhitschosen+=cfhd.ridswithmatches.size();
  }
  cfhd.ridswithmatches.clear();

  CEBUG("end selectPotentialHitsForSave2()\n");

  FUNCEND();
}
//#define CEBUG2(bla)
//#define CEBUG(bla)


/*************************************************************************
 *
 * Updates SKIM3_overlapcritlevell/r
 *
 * also finishes calculation of elements not initialised yet in the
 *  matchwithsorter_t vector (not done previously in checkForPotentialHits()
 *  to get down the number of calls to ADSEstimator calculations)
 *
 *  - stores the estimated score in estimscore
 *  - calculates ol_fulllength and ol_fullencased 
 *
 *************************************************************************/

//#define CEBUG(bla)   {cout << bla; cout.flush();}
void Skim::updateCriterionLevels(const int8 direction, const uint32 actreadid, cfh_threaddata_t & cfhd)
{
  FUNCSTART("void Skim::updateCriterionLevels(const int8 direction, const uint32 actreadid, vector<matchwithsorter_t> & tmpmatchwith)");

  CEBUG("start updateCriterionLevels() " << actreadid << '\n');

  ADSEstimator adse;

  cfhd.uidswithnewcritlevell.clear();
  cfhd.uidswithnewcritlevelr.clear();
  cfhd.critlevellofnewuids.clear();
  cfhd.critlevelrofnewuids.clear();

  uint8 actreadcritlevell=255;
  uint8 actreadcritlevelr=255;
  uint8 newcll=255;
  uint8 newclr=255;

  vector<matchwithsorter_t>::iterator tmwI=cfhd.tmpmatchwith.begin();
  for(;tmwI!=cfhd.tmpmatchwith.end(); tmwI++) {
    adse.calcNewEstimateFromSkim(
      tmwI->eoffset,
      (*SKIM3_readpool)[actreadid].getLenClippedSeq(),
      (*SKIM3_readpool)[tmwI->otherid].getLenClippedSeq(),
      actreadid,
      tmwI->otherid,
      1,
      direction);

    CEBUG("ADSE: " << adse << endl);

    CEBUG("tmwI before: " << *tmwI << endl);

    //cout << "EstimO: " << adse.getEstimatedOverlap();
    tmwI->estimscore=adse.getEstimatedOverlap()*tmwI->percent_in_overlap*tmwI->percent_in_overlap;
    if(adse.getContainmentLevel()>0) {
      tmwI->ol_fulllength=true;
      if(tmwI->percent_in_overlap==100 && !tmwI->ol_rept
    	 && abs(static_cast<int32>(SKIM3_readpool->getRead(actreadid).getLenClippedSeq())
    		- static_cast<int32>(SKIM3_readpool->getRead(tmwI->otherid).getLenClippedSeq())) >= 8){
    	tmwI->ol_fullencased=true;
      }
    }

    CEBUG("tmwI after: " << *tmwI << endl);

    if(actreadcritlevell>0 || actreadcritlevelr>0){
      CEBUG("old: " << static_cast<uint16>(actreadcritlevell) << " " << static_cast<uint16>(actreadcritlevelr) << '\n');
      getOverlapCriterionLevel(actreadid,
			       SKIM3_readpool->getRead(actreadid).getSequencingType(),
			       adse,
			       static_cast<uint8>(tmwI->percent_in_overlap),
			       newcll,newclr);
      if(newcll<actreadcritlevell) actreadcritlevell=newcll;
      if(newclr<actreadcritlevelr) actreadcritlevelr=newclr;
      CEBUG("new: " << static_cast<uint16>(actreadcritlevell) << " " << static_cast<uint16>(actreadcritlevelr) << '\n');
    }

    // then for other id
    getOverlapCriterionLevel(tmwI->otherid,SKIM3_readpool->getRead(tmwI->otherid).getSequencingType(),
			     adse,
			     static_cast<uint8>(tmwI->percent_in_overlap),
			     newcll,newclr);
    if(newcll<(*SKIM3_overlapcritlevell)[tmwI->otherid]){
      cfhd.uidswithnewcritlevell.push_back(tmwI->otherid);
      cfhd.critlevellofnewuids.push_back(newcll);
      CEBUG("newcll: pushback " << cfhd.uidswithnewcritlevell.back() << "\t" << static_cast<uint16>(cfhd.critlevellofnewuids.back()) << '\n');
    }
    if(newclr<(*SKIM3_overlapcritlevelr)[tmwI->otherid]){
      cfhd.uidswithnewcritlevelr.push_back(tmwI->otherid);
      cfhd.critlevelrofnewuids.push_back(newclr);
      CEBUG("newclr: pushback " << cfhd.uidswithnewcritlevelr.back() << "\t" << static_cast<uint16>(cfhd.critlevelrofnewuids.back()) << '\n');
    }
  }

  {
    boost::mutex::scoped_lock lock(SKIM3_critlevelwrite_mutex);
    if(actreadcritlevell<(*SKIM3_overlapcritlevell)[actreadid]){
      CEBUG("ari: update critlevell " << actreadid << " " << static_cast<uint16>((*SKIM3_overlapcritlevell)[actreadid]));
      (*SKIM3_overlapcritlevell)[actreadid]=actreadcritlevell;
      CEBUG(" to " << static_cast<uint16>((*SKIM3_overlapcritlevell)[actreadid]) << '\n');
    }
    if(actreadcritlevelr<(*SKIM3_overlapcritlevelr)[actreadid]){
      CEBUG("ari: update critlevelr " << actreadid << " " << static_cast<uint16>((*SKIM3_overlapcritlevelr)[actreadid]));
      (*SKIM3_overlapcritlevelr)[actreadid]=actreadcritlevelr;
      CEBUG(" to " << static_cast<uint16>((*SKIM3_overlapcritlevelr)[actreadid]) << '\n');
    }

    vector<uint32>::const_iterator nI=cfhd.uidswithnewcritlevell.begin();
    vector<uint8>::const_iterator cI=cfhd.critlevellofnewuids.begin();
    for(; nI != cfhd.uidswithnewcritlevell.end(); ++nI, ++cI){
      // still check ... might have changed in the mean time by another thread
      if(*cI < (*SKIM3_overlapcritlevell)[*nI]){
	CEBUG("ori: update critlevell " << *nI <<  " " << static_cast<uint16>((*SKIM3_overlapcritlevell)[*nI]));
	(*SKIM3_overlapcritlevell)[*nI]=*cI;
	CEBUG(" to " << static_cast<uint16>((*SKIM3_overlapcritlevell)[*nI]) << '\n');
      }
    }
    nI=cfhd.uidswithnewcritlevelr.begin();
    cI=cfhd.critlevelrofnewuids.begin();
    for(; nI != cfhd.uidswithnewcritlevelr.end(); ++nI, ++cI){
      // still check ... might have changed in the mean time by another thread
      if(*cI < (*SKIM3_overlapcritlevelr)[*nI]){
	CEBUG("ori: update critlevelr " << *nI <<  " " << static_cast<uint16>((*SKIM3_overlapcritlevelr)[*nI]));
	(*SKIM3_overlapcritlevelr)[*nI]=*cI;
	CEBUG(" to " << static_cast<uint16>((*SKIM3_overlapcritlevelr)[*nI]) << '\n');
      }
    }

  }

  CEBUG("end updateCriterionLevels()\n");

  FUNCEND();
}
//#define CEBUG(bla)



/*************************************************************************
 *
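 * Returns criterion levels for overlaps extending left and right
 *  (255 = no level assigned; a larger overlap ratio gives a smaller level)
 *
 *   Sanger:
 *     crit1 (level 0): 80% overlap
 *     crit2 (level 1): 60% overlap
 *   454:
 *     crit1 (level 0): 80% overlap
 *     crit2 (level 1): 60% overlap
 *   IonTorrent, length > 120:
 *     crit1 (level 0): 80% overlap
 *     crit2 (level 1): 60% overlap
 *   IonTorrent, length <= 120:
 *     crit1 (level 0): 90% overlap
 *     crit2 (level 1): 80% overlap
 *     crit3 (level 2): 70% overlap
 *     crit4 (level 3): 60% overlap
 *   Solexa:
 *     crit-level from 0 to 179 (only for relscore >= 95), see code
 *   PacBio:
 *     crit1 (level 0): 80% overlap
 *     crit2 (level 1): 60% overlap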
 * Special: if none of above, level = 240 if fully encased
 *          made for hits against rails, so that partial matches
 *          are not preferred to fully encased matches (which
 *          hits against a backbone should normally be)
 *
 *************************************************************************/

//#define CEBUG(bla)   {cout << bla; cout.flush();}
//#define CEBUG(bla)   {if(adse.getID1()==2 && adse.getID2()==140820) {cout << bla; cout.flush();}}

#if CPP_READ_SEQTYPE_END != 6
#error "This code is made for 6 sequencing types, adapt!"
#endif
  
void Skim::getOverlapCriterionLevel(const uint32 actreadid, const uint8 seqtype, const ADSEstimator & adse, const uint8 relscore, uint8 & levell, uint8 & levelr)
{
  FUNCSTART("void Skim::getOverlapCriterionLevel(const uint32 actreadid, const uint8 seqtype, const ADSEstimator & adse, const uint8 relscore, uint8 & levell, uint8 & levelr)");

  uint32 overlapratiopc=100*adse.getEstimatedOverlap()/adse.getLen(actreadid);

  CEBUG(adse);
  CEBUG("gOCL: " << actreadid << " " << static_cast<uint16>(relscore) << " " << adse.getEstimatedOverlap() << " " << adse.getLen(actreadid) << " " << overlapratiopc);

  levell=255;
  levelr=255;
  switch(seqtype){
  case Read::SEQTYPE_SOLEXA : {
    if(relscore>=95){
      // level from 0 to 179
      uint8 startlevel=(100-relscore)*30;;
      startlevel+=29-(adse.getEstimatedOverlap()*29/adse.getLen(actreadid));
      BUGIFTHROW(startlevel>=200,"Startlevel>=200? " << static_cast<uint16>(startlevel) << "\tid1: " << adse.getID1() << " id2: " << adse.getID2() << "\n" << adse);
      if(adse.getEstimatedLeftExpand(actreadid)>0){
	levell=startlevel;
      }
      if(adse.getEstimatedRightExpand(actreadid)>0){
	levelr=startlevel;
      }
    }
    break;
  }
  case Read::SEQTYPE_SANGER :
  case Read::SEQTYPE_454GS20 :
  case Read::SEQTYPE_PACBIO :{
    if(overlapratiopc>=80){
      if(adse.getEstimatedLeftExpand(actreadid)>0){
	levell=0;
      }
      if(adse.getEstimatedRightExpand(actreadid)>0){
	levelr=0;
      }
    }else if(overlapratiopc>=60){
      if(adse.getEstimatedLeftExpand(actreadid)>0){
	levell=1;
      }
      if(adse.getEstimatedRightExpand(actreadid)>0){
	levelr=1;
      }
    }
    break;
  }
  case Read::SEQTYPE_IONTORRENT :{
    if(adse.getLen(actreadid)>120){
      if(overlapratiopc>=80){
	if(adse.getEstimatedLeftExpand(actreadid)>0){
	  levell=0;
	}
	if(adse.getEstimatedRightExpand(actreadid)>0){
	  levelr=0;
	}
      }else if(overlapratiopc>=60){
	if(adse.getEstimatedLeftExpand(actreadid)>0){
	  levell=1;
	}
	if(adse.getEstimatedRightExpand(actreadid)>0){
	  levelr=1;
	}
      }
    }else{
      if(overlapratiopc>=90){
	if(adse.getEstimatedLeftExpand(actreadid)>0){
	  levell=0;
	}
	if(adse.getEstimatedRightExpand(actreadid)>0){
	  levelr=0;
	}
      }else if(overlapratiopc>=80){
	if(adse.getEstimatedLeftExpand(actreadid)>0){
	  levell=1;
	}
	if(adse.getEstimatedRightExpand(actreadid)>0){
	  levelr=1;
	}
      }else if(overlapratiopc>=70){
	if(adse.getEstimatedLeftExpand(actreadid)>0){
	  levell=2;
	}
	if(adse.getEstimatedRightExpand(actreadid)>0){
	  levelr=2;
	}
      }else if(overlapratiopc>=60){
	if(adse.getEstimatedLeftExpand(actreadid)>0){
	  levell=3;
	}
	if(adse.getEstimatedRightExpand(actreadid)>0){
	  levelr=3;
	}
      }
    }
    break;
  }
  default : {
    BUGIFTHROW(true,"Unknown/unhandled seqtype " << static_cast<uint16>(seqtype));
  }
  }

  if(levell == 255 && levelr==255){
    uint32 clevel=adse.getContainmentLevel();
    if(clevel >1
       || (clevel==1 && adse.getIDOfContained()==actreadid)){
      levell=240;
      levelr=240;
    }
  }

  CEBUG("\tR: " << static_cast<uint16>(levell) << " " << static_cast<uint16>(levelr) << '\n');

  FUNCEND();
}
//#define CEBUG(bla)
//#define CEBUG2(bla)














/*************************************************************************
 *
 *
 *
 *
 *************************************************************************/


//#define CEBUG(bla)   {cout << bla; cout.flush();}
//#define CEBUG_extra_cFPH

// Turns the raw per-hash matches of read 'actreadid' into overlap candidates.
//
//  direction          only forwarded to chimeraHuntStoreOverlapCoverage()
//                     (and to debug output)
//  actreadid          read the hash matches were collected for
//  tmpmatchwith       out: cleared on entry, receives one matchwithsorter_t
//                     per accepted candidate
//  readhashmatches    in: raw matches, gets sorted in place with
//                     Skim__sortreadhashmatch_t_
//  smallhist4repeats  scratch buffer for a histogram of eoffsets
//
// The sorted matches are walked run by run. A run is a stretch of entries
//  with the same rid2 whose eoffset does not differ by more than 10 between
//  neighbouring entries; a larger jump ends the run and the remainder is
//  handled as a separate potential hit against the same read.
// For each run the covered range, the percentage of the maximal possible
//  overlap and a number of hash frequency counters are derived; runs passing
//  the length and percentage thresholds are pushed to tmpmatchwith.
//
// Shared counters (SKIM3_totalpermbans, SKIM3_overlapcounter,
//  SKIM3_possiblehits, SKIM3_acceptedhits) are updated while holding
//  SKIM3_globalclassdatamutex.
void Skim::checkForPotentialHits(const int8 direction, const uint32 actreadid, vector<matchwithsorter_t> & tmpmatchwith, vector<readhashmatch_t> & readhashmatches, vector<uint32> & smallhist4repeats)
{
  //bool dodebug=false;

  tmpmatchwith.clear();

  // readhashmatches should not be empty ... normally.
  // but new method to deal with megahubs reduces this vector, keeping only
  //  'approximately normal' frequencies. Which in turn means: some vectors
  //  might be completely emptied
  // so, if it is empty, return immediately
  if(readhashmatches.empty()) return;

  sort(readhashmatches.begin(), readhashmatches.end(), Skim__sortreadhashmatch_t_);
  
  bool actreadisrail=SKIM3_readpool->getRead(actreadid).isRail();

  vector<readhashmatch_t>::const_iterator sI=readhashmatches.begin();
  uint32 possiblehits=0;
  uint32 acceptedhits=0;

  // countid: rid2 of the run that is scanned next
  uint32 countid=sI->rid2;
  while(sI != readhashmatches.end()){
    uint32 rid2=sI->rid2;

    // disregard this potential match if
    //  1) both reads are rails
    if((actreadisrail && SKIM3_readpool->getRead(rid2).isRail())){
      for(;sI!=readhashmatches.end() && sI->rid2==rid2; ++sI);
      if(sI!=readhashmatches.end()) countid=sI->rid2;
      continue;
    }
    //  2) we scan only against rails and both reads are non-rails
    if((SKIM3_onlyagainstrails
	&& (!actreadisrail && !SKIM3_readpool->getRead(rid2).isRail()))){
      for(;sI!=readhashmatches.end() && sI->rid2==rid2; ++sI);
      if(sI!=readhashmatches.end()) countid=sI->rid2;
      continue;
    }
    //  3) this read pair has been banned previously
    if((*SKIM3_bannedoverlaps).checkIfBanned(actreadid,rid2)){
      for(;sI!=readhashmatches.end() && sI->rid2==rid2; ++sI);
      if(sI!=readhashmatches.end()) countid=sI->rid2;
      {
	boost::mutex::scoped_lock lock(SKIM3_globalclassdatamutex);
	SKIM3_totalpermbans++;
      }
      continue;
    }

    ++possiblehits;
    if(possiblehits==1){
      CEBUG("Potential hits of " << actreadid << " (" << static_cast<int16>(direction) << '/' << SKIM3_readpool->getRead(actreadid).getLenClippedSeq() << ")\n----------------\n");
      CEBUG(SKIM3_readpool->getRead(actreadid) << endl);
      CEBUG("----------------\n");
    }

    // per-run statistics, all (re)initialised for every run
    uint16 oldhashpos=sI->hashpos1;
    uint16 hp1min=0xffff;
    uint16 hp1max=0;

    uint16 hp2min=0xffff;
    uint16 hp2max=0;

    int32  eoffsetmin=0x7fffffff;
    int32  eoffsetmax=0x80000000;
    int32  oldeoffset=sI->eoffset;

    int32  maxeoffsetjump=0;
    int32  weighteoffsetjumps=0;

    uint32 numhashes=0;

    // counters on the frequency value of the matching hashes
    //  (bhashstats.getFrequency()): "freq3" counts hashes with value 3,
    //  "freq32" hashes with value 3 or 2, "freq5" hashes with value >= 5
    bool flag_norept=true;
    size_t totalfreq3counter=0;
    size_t totalfreq5counter=0;
    size_t contiguousfreq3counter=0;
    size_t maxcontiguousfreq3counter=0;
    size_t contiguousfreq32counter=0;
    size_t maxcontiguousfreq32counter=0;


//    if((actreadid==52053 && rid2==208673)
//       || (rid2==52053 && actreadid==208673)) dodebug=true;

//    if(actreadid==0 || rid2==0) dodebug=true;
//#define CEBUG(bla)   {if(dodebug) cout << bla; cout.flush();}
    

    // sIS remembers the start of the run, [sIS,sI) is the run after the loop
    vector<readhashmatch_t>::const_iterator sIS=sI;
    for(;sI != readhashmatches.end() && sI->rid2 == countid; sI++){
      CEBUG(*sI);

      // this ensures that the eoffset between two following
      //  entries may not differ by too much (10 bases here)
      // IF they do, then this is treated like a different hit
      //  by breaking the loop
      if(abs(sI->eoffset - oldeoffset) > 10){
	CEBUG("BREAKER!\n");
	break;
      }
      numhashes++;

      // a hash that does not follow its predecessor at exactly one
      //  save-step distance interrupts the contiguous stretches
      if(oldhashpos + SKIM3_hashsavestepping != sI->hashpos1){
	CEBUG("NOT CONTIGUOUS!\n");
	maxcontiguousfreq3counter=max(maxcontiguousfreq3counter,contiguousfreq3counter);
	maxcontiguousfreq32counter=max(maxcontiguousfreq32counter,contiguousfreq32counter);
	contiguousfreq3counter=0;
	contiguousfreq32counter=0;
	maxeoffsetjump=max(maxeoffsetjump,abs(abs(sI->eoffset)-abs(oldeoffset)));
	weighteoffsetjumps+=abs(abs(sI->eoffset)-abs(oldeoffset));
      }
      hp1min=min(hp1min,sI->hashpos1);
      hp1max=max(hp1max,sI->hashpos1);
      eoffsetmin=min(eoffsetmin,sI->eoffset);
      eoffsetmax=max(eoffsetmax,sI->eoffset);
      oldeoffset=sI->eoffset;

      hp2min=min(hp2min,sI->hashpos2);
      hp2max=max(hp2max,sI->hashpos2);

      if(sI->bhashstats.getFrequency() >= 5){
	totalfreq5counter++;
	flag_norept=false;
	contiguousfreq3counter=0;
	contiguousfreq32counter=0;
      }else if(sI->bhashstats.getFrequency() > 3){
	flag_norept=false;
	contiguousfreq3counter=0;
	contiguousfreq32counter=0;
      }else if(sI->bhashstats.getFrequency() == 3){
	contiguousfreq3counter++;
	maxcontiguousfreq3counter=max(maxcontiguousfreq3counter,contiguousfreq3counter);
	totalfreq3counter++;

	contiguousfreq32counter++;
	maxcontiguousfreq32counter=max(maxcontiguousfreq32counter,contiguousfreq32counter);
      }else if(sI->bhashstats.getFrequency() == 2){
	contiguousfreq32counter++;
	maxcontiguousfreq32counter=max(maxcontiguousfreq32counter,contiguousfreq32counter);
      }else{
	contiguousfreq3counter=0;
	contiguousfreq32counter=0;
      }


#ifdef CEBUG_extra_cFPH
      {
	boost::mutex::scoped_lock lock(SKIM3_coutmutex);
	CEBUG(sI->rid2
	      << "\t" << SKIM3_readpool->getRead(sI->rid2).getName()
	      << "\t" << SKIM3_readpool->getRead(sI->rid2).getLenClippedSeq()
	      << "\t" << sI->eoffset
	      << "\t" << sI->hashpos1
	      << "\t" << oldhashpos
	      << "\tfq: " << static_cast<uint16>(sI->bhashstats.getFrequency())
	      << "\t" << flag_norept
	      << ' ' << contiguousfreq3counter
	      << ' ' << maxcontiguousfreq3counter
	      << ' ' << contiguousfreq32counter
	      << ' ' << maxcontiguousfreq32counter
	      << ' ' << totalfreq3counter
	      << ' ' << totalfreq5counter
	      //<< "\t" << sI->hashpos2
	      << '\n');
      }
#endif

      oldhashpos=sI->hashpos1;
    }

    // the following three checks repeat the ones made before the scan loop
    //
    // disregard this potential match if
    //  1) both reads are rails
    if((actreadisrail && SKIM3_readpool->getRead(rid2).isRail())){
      if(sI!=readhashmatches.end()) countid=sI->rid2;
      continue;
    }
    //  2) we scan only against rails and both reads are non-rails
    if((SKIM3_onlyagainstrails
	&& (!actreadisrail && !SKIM3_readpool->getRead(rid2).isRail()))){
      if(sI!=readhashmatches.end()) countid=sI->rid2;
      continue;
    }
    //  3) this read pair has been banned previously
    if((*SKIM3_bannedoverlaps).checkIfBanned(actreadid,rid2)){
      if(sI!=readhashmatches.end()) countid=sI->rid2;
      {
	boost::mutex::scoped_lock lock(SKIM3_globalclassdatamutex);
	SKIM3_totalpermbans++;
      }
      continue;
    }

    int32 maxoverlap;

    // adjust min positions for the hash length
    hp1min-=(SKIM3_basesperhash-1);
    hp2min-=(SKIM3_basesperhash-1);

    int32 eoffsetmean=eoffsetmin+(eoffsetmax-eoffsetmin)/2;
    
    // calc max overlap
    // currently only for one offset
    if(eoffsetmean<0){
      maxoverlap=min(SKIM3_readpool->getRead(rid2).getLenClippedSeq()+eoffsetmean,SKIM3_readpool->getRead(actreadid).getLenClippedSeq());
    }else{
      maxoverlap=min(SKIM3_readpool->getRead(actreadid).getLenClippedSeq()-eoffsetmean,SKIM3_readpool->getRead(rid2).getLenClippedSeq());
    }

    // correct the maxoverlap by the modulo of the hash steps as the
    //  border hashes will be found only in 1/(hash stepping) cases
    maxoverlap=maxoverlap-(maxoverlap%SKIM3_hashsavestepping);

    // hashesoverlap is not the number of hashes in the overlap,
    // but the length of the overlap
    int32 hashesoverlap=hp1max-hp1min+1;

    int32 perc=100*hashesoverlap/maxoverlap;
    // disregardperc: accept the hit even though perc ended up below the
    //  required minimum (set by the various corrections below)
    bool disregardperc=false;

    int32 minpercentrequired=min(
      SKIM3_percentrequired[SKIM3_readpool->getRead(actreadid).getSequencingType()],
      SKIM3_percentrequired[SKIM3_readpool->getRead(rid2).getSequencingType()]);

    uint32 maxnumhashes=((maxoverlap-SKIM3_basesperhash)/SKIM3_hashsavestepping)+1;

    CEBUG(static_cast<int16>(direction) << "\tmo: " << maxoverlap << "\tperc: " << perc << "\tari: " << actreadid << "\trid2: " << rid2 << "\tnumh: " << numhashes << "\tmnh: " << maxnumhashes << "\teom: " << eoffsetmean << "\teomin: " << eoffsetmin << "\teomax: " << eoffsetmax << "\tmej: " << maxeoffsetjump << endl);

    bool majorrecalc=false;

    if(maxnumhashes>0){
      if(perc>=minpercentrequired
	 && numhashes>1){
	if((perc == 100 && maxeoffsetjump>=3)
	   || numhashes>maxnumhashes
	   // NO!!! this would be bad for microrepeats || weighteoffsetjumps>=3
	  ) {
	  CEBUG("part1\n");
	  majorrecalc=true;
	  // find eoffset with most hashes (this is our new eoffset mean)
	  //  and numhashes will now only concern that offset (disregarding frameshifts,
	  //  but that cannot be helped)
	  smallhist4repeats.clear();
	  smallhist4repeats.resize(eoffsetmax-eoffsetmin+1,0);
	  vector<readhashmatch_t>::const_iterator rI=sIS;
	  numhashes=0;
	  for(;rI != sI; ++rI){
	    //cout << *rI;
	    //if(abs(rI->eoffset-eoffsetmin)>=smallhist4repeats.size()){
	    //  cout << "Eh, what?\n";
	    //  exit(0);
	    //}
	    ++smallhist4repeats[rI->eoffset-eoffsetmin];
	    if(smallhist4repeats[rI->eoffset-eoffsetmin] > numhashes){
	      numhashes=smallhist4repeats[rI->eoffset-eoffsetmin];
	    }
	  }

//	cout << "hist\n";
//	for(uint32 ii=0; ii<smallhist4repeats.size(); ++ii){
//	  cout << ii << "\t" << smallhist4repeats[ii] << endl;
//	}
	
	  // [newmini,newmaxi]: first stretch of histogram bins holding the
	  //  maximum count
	  int32 newmini=0;
	  for(; newmini<smallhist4repeats.size(); ++newmini){
	    if(smallhist4repeats[newmini] == numhashes){
	      break;
	    }
	  }
	  int32 newmaxi=newmini;
	  for(; newmaxi<smallhist4repeats.size(); ++newmaxi){
	    if(smallhist4repeats[newmaxi] != numhashes){
	      break;
	    }
	  }
	  --newmaxi;
	  //cout << "mini: " << newmini << "\tmaxi: " << newmaxi << endl;
	  newmini+=eoffsetmin;
	  newmaxi+=eoffsetmin;
	  //cout << "mini: " << newmini << "\tmaxi: " << newmaxi << endl;
	  eoffsetmean=(newmaxi+newmini)/2;
	  
	  // recalc hp1min/max
	  hp1min=0xffff;
	  hp1max=0;
	  
	  for(rI=sIS;rI != sI; ++rI){
	    if(rI->eoffset == eoffsetmean){
	      hp1min=min(hp1min,rI->hashpos1);
	      hp1max=max(hp1max,rI->hashpos1);
	    }
	  }
	  // adjust min positions for the hash length
	  hp1min-=(SKIM3_basesperhash-1);
	  
	  hashesoverlap=hp1max-hp1min+1;
	  perc=100*hashesoverlap/maxoverlap;
	  
	  if(perc==100){
	    eoffsetmin=eoffsetmean;
	    eoffsetmax=eoffsetmean;
	  }

	  //cout << static_cast<int16>(direction) << "\tari: " << actreadid << "\trid2: " << rid2 << "\tnumh: " << numhashes << "\teom: " << eoffsetmean 
	  //     << "\tho: " << hashesoverlap << "\tper: " << perc
	  //     << endl;
	  
	  // saver:
	  // we might have been too harsh, very probably so in mapping assemblies for the following case:
	  //
	  //  C     ..............................
	  //  R          ........*.....
	  //
	  // where only ~half of the hashes in R are counted. In that case, set percent to the
	  //  minimum needed to not be thrown out.
	  if(perc<minpercentrequired){
	    disregardperc=true;
	  }

	  CEBUG("recalc\n" << static_cast<int16>(direction) << "\tperc: " << perc << "\tari: " << actreadid << "\trid2: " << rid2 << "\tnumh: " << numhashes << "\tmnh: " << maxnumhashes << "\teom: " << eoffsetmean << "\teomin: " << eoffsetmin << "\teomax: " << eoffsetmax << endl);
	  
	  
	}
      }

      // look a bit closer at potential perfect matches
      if(perc == 100){
	if(eoffsetmin != eoffsetmax){
	  // this could not be: have at least two different expected offsets
	  //  and a 100% coverage. Side effects from intra-read repeats
	  //  or indels
	  // therefore, make sure this does not get through as a 100% match
	  //  by using the number of hashes as percentage
	  //  but do not fall below minimum required
	  //perc=99;
	  //cout << "dida\n";
	  perc=100*numhashes/maxnumhashes;
	  if(perc<minpercentrequired) disregardperc=true;
	}else if((numhashes-1)*SKIM3_hashsavestepping+SKIM3_basesperhash < maxoverlap){
	  // maxoverlap covers the whole potential overlap, but
	  //  there are not enough hashes supporting for 100% match
	  //  (base mismatch somewhere)
	  // reduce the percentage to show it's not a perfect match
	  //  but do not fall below minimum required
	  //perc=99;
	  //cout << "dide\n";
	  perc=100*numhashes/maxnumhashes;
	  if(perc<minpercentrequired) disregardperc=true;
	}
      }else if(eoffsetmin == eoffsetmax){
	if(perc>100) {
	  perc=100;
	}else{
	  if(perc>=minpercentrequired && numhashes==maxnumhashes){
	    CEBUG("maxnumhashes 100% saver: "  << perc << '\n');
	    perc=100;
	  }
	}
      }

      CEBUG("after closer look\n" << static_cast<int16>(direction) << "\tperc: " << perc << "\tari: " << actreadid << "\trid2: " << rid2 << "\tnumh: " << numhashes << "\tmnh: " << maxnumhashes << "\teom: " << eoffsetmean << "\teomin: " << eoffsetmin << "\teomax: " << eoffsetmax << endl);

      // if rail and only partial match -> reduce percentage
      if(!majorrecalc
	 && SKIM3_readpool->getRead(rid2).isRail()
	 && maxnumhashes+SKIM3_basesperhash*SKIM3_hashsavestepping<SKIM3_readpool->getRead(actreadid).getLenClippedSeq()){
	  if(perc>=minpercentrequired) disregardperc=true;
	  perc=100*numhashes/maxnumhashes;

	  CEBUG("partial match recalc\n" << static_cast<int16>(direction) << "\tperc: " << perc << "\tari: " << actreadid << "\trid2: " << rid2 << "\tnumh: " << numhashes << "\tmnh: " << maxnumhashes << "\teom: " << eoffsetmean << "\teomin: " << eoffsetmin << "\teomax: " << eoffsetmax << endl);

      }

      // reduce percentage by 3*largest gap we have encountered
      // and by number of gaps encountered
      if(perc>=minpercentrequired){
	perc-=3*maxeoffsetjump;
	if(!majorrecalc) perc-=weighteoffsetjumps;
	if(perc<=0) perc=1;
	if(perc<minpercentrequired) disregardperc=true;

	CEBUG("gap recalc\n" << static_cast<int16>(direction) << "\tperc: " << perc << "\tari: " << actreadid << "\trid2: " << rid2 << "\tnumh: " << numhashes << "\tmnh: " << maxnumhashes << "\teom: " << eoffsetmean << "\teomin: " << eoffsetmin << "\teomax: " << eoffsetmax << endl);
      }

      int32 minoverlaprequired=min(
	SKIM3_overlaplenrequired[SKIM3_readpool->getRead(actreadid).getSequencingType()],
	SKIM3_overlaplenrequired[SKIM3_readpool->getRead(rid2).getSequencingType()]);

#ifdef CEBUG_extra_cFPH
      {
	boost::mutex::scoped_lock lock(SKIM3_coutmutex);
	CEBUG("eomin: " << eoffsetmin << "\teomax: " << eoffsetmax 
	      << "\tmor: " << minoverlaprequired
	      << "\tho: " << hashesoverlap
	      << "\t%: " << perc
	      << "\t%<: " << minpercentrequired << endl);
      }
#endif

      // we take the hit if the overlap percentage is above threshold
      // NEW: or if both reads have MNRr tags
      if(hashesoverlap >= minoverlaprequired
	 && (perc>=minpercentrequired
	     || disregardperc
	     || (SKIM3_hasMNRr[actreadid] && SKIM3_hasMNRr[rid2]))) {
	acceptedhits++;

	// increase overlapcounter only for "real" reads,
	//  not for rails
	if(!SKIM3_readpool->getRead(actreadid).isRail()
	   && !SKIM3_readpool->getRead(rid2).isRail()){
	  boost::mutex::scoped_lock lock(SKIM3_globalclassdatamutex);
	  (*SKIM3_overlapcounter)[actreadid]+=1;
	  (*SKIM3_overlapcounter)[rid2]+=1;
	}

//#define CEBUG(bla)   {if(actreadid==273252 && rid2==273250) cout << bla; cout.flush();}
//#define CEBUG(bla)   {cout << bla; cout.flush();}
	matchwithsorter_t tmp;
	tmp.otherid=rid2;
	tmp.eoffset=eoffsetmean;
      
	if(perc>100) perc=100;
	tmp.percent_in_overlap=perc;
	tmp.numhashes=numhashes;
	tmp.estimscore=0;
	tmp.taken=false;

	// this was the standard, works good but sometimes too harsh
	// tmp.ol_belowavgfreq=maxcontiguousfreq32counter > (SKIM3_basesperhash/SKIM3_hashsavestepping-1);
	// tmp.ol_weakgood=maxcontiguousfreq3counter > (SKIM3_basesperhash/SKIM3_hashsavestepping-1);
	// this is worse in performance on lpla synthetic data
	//  tmp.ol_weakgood=maxcontiguousfreq3counter > 1;

	// basesperhash+(n-1)*hashsavestepping is the number of bases covered
	//  by n contiguous hashes; the guards keep (n-1) from being
	//  evaluated for n==0
	tmp.ol_belowavgfreq=false;
	if(maxcontiguousfreq32counter){
	  tmp.ol_belowavgfreq=(SKIM3_basesperhash+(maxcontiguousfreq32counter-1)*SKIM3_hashsavestepping) >= 26;
	}
	tmp.ol_weakgood=false;
	if(maxcontiguousfreq3counter){
	  tmp.ol_weakgood=(SKIM3_basesperhash+(maxcontiguousfreq3counter-1)*SKIM3_hashsavestepping) >= 20;
	}


	tmp.ol_stronggood=tmp.ol_weakgood & (totalfreq3counter > (SKIM3_basesperhash*2/SKIM3_hashsavestepping-1));

	/* still too harsh sometimes

	// if no strong good and one of the reads is Solexa:
	//  extra rule for short (<64 bases) Solexa: strong good also for contiguous
	//  overlaps of >= 30
	if(!tmp.ol_stronggood){
	  bool acceptshort=false;
	  if(SKIM3_readpool->getRead(actreadid).getSequencingType() == Read::SEQTYPE_SOLEXA
	     && SKIM3_readpool->getRead(actreadid).getLenClippedSeq() < 64) {
	    acceptshort=true;
	  }else if(SKIM3_readpool->getRead(rid2).getSequencingType() == Read::SEQTYPE_SOLEXA
		   && SKIM3_readpool->getRead(rid2).getLenClippedSeq() < 64) {
	    acceptshort=true;
	  }
	  if(acceptshort){
	    tmp.ol_stronggood=(SKIM3_basesperhash+(maxcontiguousfreq3counter-1)*SKIM3_hashsavestepping) >= 30;
	    CEBUG("Should accept short: " << SKIM3_basesperhash+(maxcontiguousfreq3counter-1)*SKIM3_hashsavestepping << ' ' << tmp.ol_stronggood << '\n');
	  }
	}
	*/

	// all contiguous HAF33 overlaps >= 30 are stronggood
	// Needs the same zero guard as the two flags above: the counter is
	//  an unsigned size_t, so with a value of 0 the (counter-1) wraps
	//  around and the comparison degenerates to
	//  "basesperhash - hashsavestepping >= 30", which would flag hits
	//  without a single frequency-3 hash as stronggood.
	if(!tmp.ol_stronggood && maxcontiguousfreq3counter){
	  tmp.ol_stronggood=(SKIM3_basesperhash+(maxcontiguousfreq3counter-1)*SKIM3_hashsavestepping) >= 30;
	}



	tmp.ol_norept=flag_norept;
	tmp.ol_rept=(totalfreq5counter > 0);

	tmp.ol_fulllength=false;
	tmp.ol_fullencased=false;

	// not 100% accurate, moved to updateCriterionLevels()
	//
	//if(hashesoverlap>=SKIM3_readpool->getRead(actreadid).getLenClippedSeq()-SKIM3_hashsavestepping
	//	 || hashesoverlap>=SKIM3_readpool->getRead(rid2).getLenClippedSeq()-SKIM3_hashsavestepping){
	//	tmp.ol_fulllength=true;
	//
	//	if(perc == 100 
	//	   && tmp.ol_norept){
	//	  int32 lendiff=abs(static_cast<int32>(SKIM3_readpool->getRead(actreadid).getLenClippedSeq())
	//			    -static_cast<int32>(SKIM3_readpool->getRead(tmp.otherid).getLenClippedSeq()));
	//	  if(lendiff>=8){
	//	    tmp.ol_fullencased=true;
	//	  }
	//	}
	//}

	tmpmatchwith.push_back(tmp);

	if(tmp.ol_rept) {CEBUG("\nREPT!!!\n")};
	CEBUG("Pushing possible hit with offset: " << tmp.eoffset << endl
	      << rid2
	      << "\t" << actreadid
	      << "\t" << SKIM3_readpool->getRead(rid2).getLenClippedSeq()
	      << "\t" << hp1min
	      << "\t" << hp1max
	      << "\t" << eoffsetmin
	      << "\t" << eoffsetmax
	      << "\t" << maxoverlap
	      << "\t" << hashesoverlap
	      << "\t" << numhashes
	      << "\t" << minoverlaprequired
	      << "\t" << perc << '%'
	      << "\t" << maxcontiguousfreq3counter
	      << "\t" << maxcontiguousfreq32counter
	      << "\t" << totalfreq3counter
	      << "\t" << totalfreq5counter
	      << "\n" << tmp.ol_stronggood << ' ' << tmp.ol_weakgood << ' ' << tmp.ol_belowavgfreq << ' ' << tmp.ol_norept << ' ' << tmp.ol_rept
	      << '\n');
//#define CEBUG(bla)

	// chimera hunting is active only if the hunt vectors were set up
	if(SKIM3_chimerahunt.size()){
	  chimeraHuntStoreOverlapCoverage(direction, actreadid, rid2,
					  hp1min,hp1max,hp2min,hp2max);
	}
      }
    }

    if(sI!=readhashmatches.end()) countid=sI->rid2;
  }

  if(possiblehits!=0){
    CEBUG("Numhits " << actreadid << "\t" << possiblehits << "\t" << acceptedhits << "\n\n");
  }

  boost::mutex::scoped_lock lock(SKIM3_globalclassdatamutex);
  SKIM3_possiblehits+=possiblehits;
  SKIM3_acceptedhits+=acceptedhits;
}
#define CEBUG(bla)
#undef CEBUG_extra_cFPH


/*************************************************************************
 *
 *
 *
 *
 *************************************************************************/

//#define CEBUG(bla)   {cout << bla; cout.flush();}

void Skim::chimeraHuntStoreOverlapCoverage(const int8 direction, const uint32 actreadid, const uint32 rid2, uint16 hp1min, uint16 hp1max, uint16 hp2min, uint16 hp2max)
{
  bool cebug=false;
  //if(actreadid==0 || rid2==0) cebug=true;

//#define CEBUG(bla)   {if(cebug) cout << bla; cout.flush();}

  CEBUG("checkForChimeras: " << SKIM3_readpool->getRead(actreadid).getName()
	<< " (" << actreadid << ':' << static_cast<int16>(direction) << ") / " 
	<< SKIM3_readpool->getRead(rid2).getName()
	<< " (" << rid2 << ":1)\n");
  CEBUG("hp1min: " << hp1min << '\n');
  CEBUG("hp1max: " << hp1max << '\n');
  CEBUG("hp2min: " << hp2min << '\n');
  CEBUG("hp2max: " << hp2max << '\n');

  // *sigh* Must also handle this:
  // instead of re-searching, get them passed by caller
  //
  // E0K6C4E01E3QB2 (-1) / E0K6C4E01C6VEK (1)
  // rid2: 8 eoffset: -49    hp1: 91 hp2: 140
  // rid2: 8 eoffset: -49    hp1: 93 hp2: 142
  // rid2: 8 eoffset: -49    hp1: 95 hp2: 144
  // rid2: 8 eoffset: -49    hp1: 97 hp2: 146
  // ...
  // rid2: 8 eoffset: -48    hp1: 206        hp2: 254
  // rid2: 8 eoffset: -48    hp1: 208        hp2: 256
  // rid2: 8 eoffset: -48    hp1: 210        hp2: 258
  // rid2: 8 eoffset: -46    hp1: 20 hp2: 66
  // rid2: 8 eoffset: -46    hp1: 22 hp2: 68
  // rid2: 8 eoffset: -46    hp1: 24 hp2: 70

  // ok, cut of 2 from each side

  vector<uint8> & id1hunt=SKIM3_chimerahunt[actreadid];
  vector<uint8> & id2hunt=SKIM3_chimerahunt[rid2];

  //id1hunt.clear();
  //id1hunt.resize(SKIM3_readpool->getRead(actreadid).getLenClippedSeq(),0);
  //id2hunt.clear();
  //id2hunt.resize(SKIM3_readpool->getRead(rid2).getLenClippedSeq(),0);

  if(hp2max-hp2min>4+2*SKIM3_hashsavestepping
     && hp1max-hp1min>4+2*SKIM3_hashsavestepping){

    // the +2 is "magic" ... at least it mixes MIRA not recognising
    //  some chimeras
    hp1min+=SKIM3_hashsavestepping+2;
    hp1max-=SKIM3_hashsavestepping+2;
    hp2min+=SKIM3_hashsavestepping+2;
    hp2max-=SKIM3_hashsavestepping+2;

    uint8 * ptr = &(id2hunt[hp2min]);
    for(uint16 i=0; i<hp2max-hp2min; i++, ptr++){
      *ptr=1;
    }

    if(direction>0){
      ptr = &(id1hunt[hp1min]);
      for(uint16 i=0; i<hp1max-hp1min; i++, ptr++){
	*ptr=1;
      }
    }else{
      ptr = &(id1hunt[SKIM3_readpool->getRead(actreadid).getLenClippedSeq()-1-hp1min]);
      for(uint32 i=0; i<hp1max-hp1min; i++, --ptr){
	*ptr=1;
      }
    }
  }

  if(cebug){
    cout << "id1hunt: " << SKIM3_readpool->getRead(actreadid).getName() << endl;
    uint32 conscounter=0;
    uint32 longest=0;
    for(uint32 i=0; i<id1hunt.size(); i++){
      cout << i << '\t' << static_cast<uint16>(id1hunt[i]) << '\n';
      if(id1hunt[i]){
	conscounter++;
	longest=max(longest,conscounter);
      }else{
	conscounter=0;
      }
    }
    cout << "id1longest: " << longest << endl;
    cout << "id2hunt: " << SKIM3_readpool->getRead(rid2).getName() << endl;
    conscounter=0;
    longest=0;
    for(uint32 i=0; i<id2hunt.size(); i++){
      cout << i << '\t' << static_cast<uint16>(id2hunt[i]) << '\n';
      if(id2hunt[i]){
	conscounter++;
	longest=max(longest,conscounter);
      }else{
	conscounter=0;
      }
    }
    cout << "id2longest: " << longest << endl;
  }


}
//#define CEBUG(bla)



/*************************************************************************
 *
 *
 *
 *
 *************************************************************************/

//#define CEBUG(bla)   {if(actreadid==0) cout << bla; cout.flush();}

void Skim::chimeraHuntLocateChimeras()
{
  for(uint32 actreadid=0; actreadid<SKIM3_chimerahunt.size(); actreadid++){
    vector<uint8> & hunt = SKIM3_chimerahunt[actreadid];
    if(hunt.size()==0) continue;

    //if(actreadid==0){
    //  cout << "\nchimhunt\n";
    //  Read::setCoutType(Read::AS_TEXT);
    //  cout << SKIM3_readpool->getRead(actreadid);
    //  cout << "idhunt: " << SKIM3_readpool->getRead(actreadid).getName() << endl;
    //  for(uint32 i=0; i<hunt.size(); i++){
    //	cout << i << '\t' << static_cast<uint16>(hunt[i]) << '\n';
    //  }
    //}

    // we have this vector (with x=any of 0/1:
    // 000....xxx...000
    // now, to make thing easier in search, fill up leading
    // and traing 0s with 1
    // actually, if no fill up were used, this could also be used as
    //  automatic clipping ... but the analyseHashStats() in 
    //  conjunction with the dataprocessing.C routine does it already
    //  quite well.
    //
    // BaCh: 09.02.2010
    // Actually, there are instances were the above fail, so let's try a fallback
    // If no chimera found, give back proposed right and left cuts as negative
    // values if these are longer than 4 bases.

    int32 proposedleft=0;
    for(int32 i=0; i<hunt.size() && hunt[i]==0; i++){
      hunt[i]=1;
      proposedleft--;
    };
    int32 proposedright=0;
    for(int32 i=hunt.size()-1; i>=0 && hunt[i]==0; --i){
      hunt[i]=1;
      proposedright--;
    };

    // search for holes with 0s >= 2*SKIM3_hashsavestepping
    int32 consecutivezeroes=0;
    int32 leftcut=0;
    int32 longestleftcut=0;
    int32 longestrightcut=0;

    int32 acti=0;
    bool foundcuts=false;
    for(; acti<hunt.size(); acti++){
      if(consecutivezeroes){
	if(hunt[acti]==0){
	  consecutivezeroes++;
	}else{
	  if(consecutivezeroes>=2*SKIM3_hashsavestepping){
	    foundcuts=true;
	    CEBUG("Chimera candidate " 
		  << SKIM3_readpool->getRead(actreadid).getName()
		  << '\n');
	    CEBUG("consecutivezeroes: " << consecutivezeroes << '\n');
	    CEBUG("leftcut: " << leftcut << '\n');
	    CEBUG("acti: " << acti << '\n');
	    CEBUG("longestleftcut: " << longestleftcut << '\n');
	    CEBUG("longestrightcut: " << longestrightcut << '\n');
	    leftcut=acti;
	    consecutivezeroes=0;
	  }
	}
      }else{
	if(hunt[acti]==0){
	  consecutivezeroes++;
	}else{
	  if(acti-leftcut>longestrightcut-longestleftcut){
	    longestleftcut=leftcut;
	    longestrightcut=acti;
	  }
	}
      }
    }

    if(foundcuts){
      CEBUG("Chosen chimeric fragment " << SKIM3_readpool->getRead(actreadid).getName() << " : " << longestleftcut << '\t' << longestrightcut << '\n');
      (*SKIM3_chuntleftcut)[actreadid]=longestleftcut;
      (*SKIM3_chuntrightcut)[actreadid]=longestrightcut;
    }else{
      if(proposedright<-4) (*SKIM3_chuntrightcut)[actreadid]=proposedright;
      if(proposedleft<-4)(*SKIM3_chuntleftcut)[actreadid]=proposedleft;
    }
  }
}
//#define CEBUG(bla)


/*************************************************************************
 *
 *
 *
 *
 *************************************************************************/

//#define CEBUG(bla)   {cout << bla; cout.flush();}

void Skim::fillTagMaskVector(const uint32 readid, vector<uint8> & tagmaskvector)
{
  tagmaskvector.clear();
  tagmaskvector.resize(SKIM3_readpool->getRead(readid).getLenClippedSeq(),0);


  if(SKIM3_hasMNRr[readid]
     || SKIM3_hasFpAS[readid]){

    for(uint32 tn=0; tn<SKIM3_readpool->getRead(readid).getNumOfTags(); tn++){
      if(SKIM3_readpool->getRead(readid).getTag(tn).identifier==Read::REA_tagentry_idMNRr
	 || SKIM3_readpool->getRead(readid).getTag(tn).identifier==Read::REA_tagentry_idFpAS){
	CEBUG("MAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAASK!\n");
	
	int32 from=SKIM3_readpool->getRead(readid).getTag(tn).from;
	from-=SKIM3_readpool->getRead(readid).getLeftClipoff();
	int32 to=SKIM3_readpool->getRead(readid).getTag(tn).to;
	to-=SKIM3_readpool->getRead(readid).getLeftClipoff();
	
	/* the masking routines will mask only if *ALL* positions
	   in a hash are masked. This allows for cases like this
	   (shown here: dot == masked)
	   
	   ............A...............
	   
	   to still make hashes that have the "A" in them
	   
	   But for FpAS (poly A), we want stop dead at the beginning
	   of the poly A. Therefore, we need to expand the masked
	   area by (number of bases in a hash)-1
	*/
	if(SKIM3_readpool->getRead(readid).getTag(tn).identifier==Read::REA_tagentry_idFpAS){
	  from-=(SKIM3_basesperhash-1);
	  to+=(SKIM3_basesperhash-1);
	}
	
	
	CEBUG("ftmv: " << from << " " << to << '\n');
	for(int32 i=from; i<=to; i++){
	  if(i>=0 && i<static_cast<int32>(tagmaskvector.size())) {
	    tagmaskvector[i]=1;
	  }
	}
      }
    }
  }

  CEBUG("TMV " << SKIM3_readpool->getRead(readid).getName() << '\n');
  for(uint32 i=0; i<tagmaskvector.size(); i++){
    CEBUG(i << '\t' << static_cast<uint16>(tagmaskvector[i]) << '\n');
  }

}


// Reverses the order of the entries of tagmaskvector in place.
// An empty vector is left untouched.
void Skim::reverseTagMaskVector(vector<uint8> & tagmaskvector)
{
  // the standard algorithm replaces the hand-written swap loop
  std::reverse(tagmaskvector.begin(), tagmaskvector.end());

  // debug dump of the reversed mask (no-op unless CEBUG is enabled)
  for(uint32 i=0; i<tagmaskvector.size(); i++){
    CEBUG(i << '\t' << static_cast<uint16>(tagmaskvector[i]) << '\n');
  }
}

//#define CEBUG(bla)




#include "boost/format.hpp"
using boost::format;


/*************************************************************************
 *
 *
 *
 *
 *************************************************************************/

//#define CEBUG(bla)   {cout << bla; cout.flush();}
// Computes the hash statistics for the reads in 'rp' and keeps them in the
//  object: the statistics go to SKIM3_hs_hashstats, the shortcut iterators
//  to SKIM3_hs_hsshortcuts_begin / SKIM3_hs_hsshortcuts_end and the hash
//  size to SKIM3_hs_basesperhash.
// All parameters are handed through to prepareHashStatistics().
void Skim::provideHashStatistics(const string & directory, ReadPool & rp, bool checkusedinassembly, bool onlyagainstrails, bool alsosavesinglehashes, bool fwdandrev, uint32 fwdrevmin,uint8  basesperhash, uint8  hashsavestepping)
{
  // receives the name of the statistics file; not used any further here
  string hashstatfilename;

  SKIM3_hs_basesperhash=basesperhash;

  // the returned average hash coverage is not needed by this function
  (void) prepareHashStatistics(directory, rp, checkusedinassembly, onlyagainstrails, alsosavesinglehashes,
                               fwdandrev, fwdrevmin, basesperhash, hashsavestepping,
                               hashstatfilename,
                               SKIM3_hs_hashstats,
                               SKIM3_hs_hsshortcuts_begin,
                               SKIM3_hs_hsshortcuts_end);
}


/*************************************************************************
 *
 *
 *
 *
 *************************************************************************/

void Skim::analyseHashes(const string & directory, ReadPool & rp, bool checkusedinassembly, bool onlyagainstrails, bool alsosavesinglehashes, bool fwdandrev, uint32 fwdrevmin, uint8  basesperhash, uint8  hashsavestepping, bool masknastyrepeats)
{
  string hashstatfilename;
  vector<hashstat_t> hashstats;
  vector<vector<hashstat_t>::const_iterator > hsshortcuts_begin;
  vector<vector<hashstat_t>::const_iterator > hsshortcuts_end;

  size_t avghashcov=prepareHashStatistics(directory, rp, checkusedinassembly, onlyagainstrails, alsosavesinglehashes,
					  fwdandrev, fwdrevmin, basesperhash, hashsavestepping,
					  hashstatfilename,
					  hashstats,
					  hsshortcuts_begin,
					  hsshortcuts_end);

  CEBUG("Avg. " << avghashcov << endl);

  dumpHashStatisticsInfo(avghashcov, hashstats);

  cout << "Assigning statistics values:\n";
  dateStamp(cout);

  assignReadBaseStatistics(rp,
			   avghashcov,
  			   hashstats, 
  			   basesperhash,
  			   hsshortcuts_begin,
  			   hsshortcuts_end,
			   masknastyrepeats);

  //cout << "Correcting by RMB:\n";
  //correctReadBaseStatisticsByRMB(rp,basesperhash);

  dateStamp(cout);

  return;
}
//#define CEBUG(bla)



/*************************************************************************
 *
 * all steps until a usable hash statistics file is in memory
 * Note: does not delete the hash statistics file on disk (only the
 *  temporary files)
 *
 * Returns explicitly:
 *   average hash coverage
 *
 * Returns implicitly:
 *   hashstatfilename
 *   hashstats
 *   hsshortcuts_begin & hsshortcuts_end
 *
 *************************************************************************/

//#define CEBUG(bla)   {cout << bla; cout.flush();}
// Runs all steps needed to get usable hash statistics into memory:
//  1. hashes2disk() writes the hashes of the reads to temporary files
//  2. createHashStatisticsFile() condenses these into the statistics file
//  3. the temporary files are deleted (the statistics file is kept)
//  4. loadHashStatisticsFile() loads the statistics into 'hashstats'
//  5. makeHashStatArrayShortcuts() sorts 'hashstats' and fills the shortcuts
//
// Also sets SKIM3_readpool to point at 'rp'.
//
// Returns the average hash coverage; hashstatfilename, hashstats,
//  hsshortcuts_begin and hsshortcuts_end are output parameters.
size_t Skim::prepareHashStatistics(const string & directory, ReadPool & rp, bool checkusedinassembly, bool onlyagainstrails, bool alsosavesinglehashes, bool fwdandrev, uint32 fwdrevmin, uint8  basesperhash, uint8  hashsavestepping, string & hashstatfilename, vector<hashstat_t> & hashstats,  vector<vector<hashstat_t>::const_iterator> & hsshortcuts_begin,  vector<vector<hashstat_t>::const_iterator> & hsshortcuts_end)
{
  SKIM3_readpool=&rp;

  vector<string> hashfilenames;
  vector<size_t> elementsperfile;

  hashstats.clear();
  hsshortcuts_begin.clear();
  hsshortcuts_end.clear();


  dateStamp(cout);

  cout << "Writing temporary hstat files:\n";
  hashes2disk(hashfilenames,elementsperfile,
              rp,checkusedinassembly,onlyagainstrails,fwdandrev,
              basesperhash,hashsavestepping,directory);

  dateStamp(cout);

  cout << "\nAnalysing hstat files:\n";
  size_t numhashstats=
    createHashStatisticsFile(hashstatfilename,
                             hashfilenames,
                             elementsperfile,
                             basesperhash,
                             fwdrevmin,
                             rp,
                             onlyagainstrails,
                             alsosavesinglehashes,
                             directory);

  cout << "\n";

  dateStamp(cout);

  cout << "clean up temporary stat files..."; cout.flush();
  // Clean up temporary stat files. These are plain files created by
  //  hashes2disk(), so remove() is sufficient. Unlike a "rm -rf" command
  //  line handed to system(), it also copes with blanks and shell
  //  metacharacters in the directory name and needs no shell.
  for(uint32 hfni=0; hfni<hashfilenames.size();hfni++){
    // best effort, like before: a file that cannot be deleted is ignored
    int tmp=remove(hashfilenames[hfni].c_str());
    (void) tmp;
  }

  dateStamp(cout); cout.flush();

  // loadHashStatisticsFile() expects the vector to have its final size
  hashstats.resize(numhashstats);
  size_t avghashcov=loadHashStatisticsFile(hashstatfilename,
                                           hashstats);

  makeHashStatArrayShortcuts(hashstats,basesperhash,
                             hsshortcuts_begin, hsshortcuts_end);

  dateStamp(cout);

  return avghashcov;
}
//#define CEBUG(bla)



/*************************************************************************
 *
 *
 *
 *
 *************************************************************************/

// Loads the hash statistics file 'hashstatfilename' and prints the
//  statistics overview for it. The number of entries is derived from the
//  file size, which must be a multiple of sizeof(hashstat_t).
void Skim::showHashStatisticsInfo(string & hashstatfilename)
{
  FUNCSTART("void Skim::showHashStatisticsInfo(string & hashstatfilename)");

  // opened positioned at the end, so that tellg() yields the file size
  ifstream sizeprobe(hashstatfilename.c_str(), ios::in|ios::ate);
  if(!sizeprobe){
    MIRANOTIFY(Notify::FATAL, "File not found: " << hashstatfilename);
  }
  size_t bytesinfile=sizeprobe.tellg();
  sizeprobe.close();

  if(bytesinfile%sizeof(hashstat_t) != 0){
    MIRANOTIFY(Notify::FATAL, "File probably not a hash stat: " << hashstatfilename);
  }

  // the loader expects a vector which already has the final size
  vector<hashstat_t> loadedstats(bytesinfile/sizeof(hashstat_t));
  size_t avghashcov=loadHashStatisticsFile(hashstatfilename, loadedstats);

  CEBUG("Avg. " << avghashcov << endl);

  dumpHashStatisticsInfo(avghashcov, loadedstats);

  FUNCEND();
}


/*************************************************************************
 *
 *
 *
 *
 *************************************************************************/

//#define CEBUG(bla)   {cout << bla; cout.flush();}

// Computes the hashes of all eligible reads of 'rp' (forward strand, and
//  reverse complement too if 'fwdandrev' is set) and distributes them as
//  diskhash_t records over temporary files "<directory>/stattmp<x>.bin".
//  The target file of a hash is chosen by the topmost 'upperbases' bases
//  of the hash.
//
// Reads without valid data, reads shorter than 'basesperhash' and (if
//  'checkusedinassembly' is set) reads not used in the assembly are skipped.
//
// Output parameters:
//  hashfilenames    names of the files written
//  elementsperfile  number of records written to each of these files
//
// 'onlyagainstrails' and 'hashsavestepping' are not used by this function.
void Skim::hashes2disk(vector<string> & hashfilenames, vector<size_t> & elementsperfile, ReadPool & rp, bool checkusedinassembly, bool onlyagainstrails, bool fwdandrev, uint8  basesperhash, uint8  hashsavestepping, const string & directory)
{
  FUNCSTART("void Skim::hashes2disk(uint32 basesperhash)");

  const size_t upperbases=2;

  hashfilenames.clear();

  BUGIFTHROW(basesperhash==0,"basesperhash == 0 ???");
  BUGIFTHROW(upperbases>=basesperhash,"upperbases (" << upperbases << ") >=basesperhash " << static_cast<uint16>(basesperhash) << ") ???");

  // one file per possible value of the 'upperbases' topmost bases
  //  (2 bits per base)
  size_t numfiles=1<<(upperbases*2);
  size_t rightshift=(basesperhash-upperbases)*2;

  CEBUG("bph: " << static_cast<uint16>(basesperhash) << ".\n");
  CEBUG("Must create " << numfiles << " files.\n");
  CEBUG("Rightshift:" << rightshift << '\n');
  CEBUG("sizeof(vhash_t): " << sizeof(vhash_t) << '\n');

  vector<FILE *> hashfiles(numfiles);
  vector<vector<diskhash_t> > hashfilebuffer(numfiles);
  for(size_t i=0; i<numfiles; i++){
    string fname=directory+"/stattmp"+str(format("%x") % i )+".bin";
    hashfilenames.push_back(fname);
    // binary mode: raw structs get written
    hashfiles[i]=fopen(fname.c_str(), "wb");
    if(hashfiles[i]==NULL){
      // do not continue with a NULL FILE *, fwrite() on it would crash
      for(size_t openedi=0; openedi<i; openedi++){
        fclose(hashfiles[openedi]);
      }
      MIRANOTIFY(Notify::FATAL, "Could not open temporary hash file for writing: " << fname << " Disk full? Changed permissions?");
    }
    hashfilebuffer[i].reserve(131072);
  }

  elementsperfile.clear();
  elementsperfile.resize(numfiles,0);

  vector<vhrap_t> singlereadvhraparray;
  singlereadvhraparray.reserve(10000);

  // we will not use a mask, but
  //  we need to supply an empty one anyway
  vector<uint8> tagmaskvector;

  diskhash_t tmpdh;

  ProgressIndicator<int32> P(0, rp.size());

  for(uint32 actreadid=0; actreadid<rp.size(); actreadid++){
    P.progress(actreadid);

    //if(actreadid>100) return;

    Read & actread= rp.getRead(actreadid);

    // Has been taken out as hash statistics now also used for mirabait
    // TODO: check whether this has big influence on "normal" assembly jobs
    //  !!! it has ... for mapping assemblies !!!

    if(!actread.hasValidData()
       || (checkusedinassembly && !actread.isUsedInAssembly())) continue;

    uint32 slen=actread.getLenClippedSeq();

    if(slen<basesperhash) continue;

    singlereadvhraparray.resize(slen);
    tagmaskvector.resize(slen,0);

    vector<vhrap_t>::iterator srvaI=singlereadvhraparray.begin();

    uint32 hashesmade;

    const vector<Read::bposhashstat_t> & bposhashstats=actread.getBPosHashStats();
    int32 bfpos=actread.calcClippedPos2RawPos(0);
    int32 bfposinc=0;

    // forward strand
    hashesmade=transformSeqToVariableHash(
      actreadid,
      actread,
      actread.getClippedSeqAsChar(),
      slen,
      basesperhash,
      srvaI,
      false,
      1,
      tagmaskvector,
      bposhashstats,
      bfpos,
      bfposinc
      );
    singlereadvhraparray.resize(hashesmade);

    tmpdh.seqtype=actread.getSequencingType();

    tmpdh.dir=1;

    size_t hashfilesindex;
    srvaI=singlereadvhraparray.begin();
    for(uint32 i=0;i<hashesmade; i++, srvaI++){
      tmpdh.hashpos=srvaI->hashpos-(basesperhash-1);
      tmpdh.vhash=srvaI->vhash;
      hashfilesindex=tmpdh.vhash>>rightshift;
      CEBUG("Want to write fwd: " << tmpdh << " to " << hashfilesindex << endl);
      BUGIFTHROW(hashfilesindex>=hashfiles.size(),"hashfilesindex>=hashfiles.size() ???");

      // buffer full: write it out before adding the new element
      if(hashfilebuffer[hashfilesindex].size()==hashfilebuffer[hashfilesindex].capacity()){
        if(fwrite(&hashfilebuffer[hashfilesindex][0],sizeof(tmpdh),hashfilebuffer[hashfilesindex].size(),hashfiles[hashfilesindex]) != hashfilebuffer[hashfilesindex].size()){
          MIRANOTIFY(Notify::FATAL, "1 Could not write anymore to hash file. Disk full? Changed permissions?");
        }
        hashfilebuffer[hashfilesindex].clear();
      }
      hashfilebuffer[hashfilesindex].push_back(tmpdh);

      elementsperfile[hashfilesindex]++;
    }
    //abort();

    if(fwdandrev){
      // same again for the reverse complement
      srvaI=singlereadvhraparray.begin();
      hashesmade=transformSeqToVariableHash(
        actreadid,
        actread,
        actread.getClippedComplementSeqAsChar(),
        slen,
        basesperhash,
        srvaI,
        false,
        1,
        tagmaskvector,
        bposhashstats,
        bfpos,
        bfposinc
        );
      singlereadvhraparray.resize(hashesmade);

      tmpdh.dir=-1;
      srvaI=singlereadvhraparray.begin();
      for(uint32 i=0;i<hashesmade; i++, srvaI++){
        tmpdh.hashpos=srvaI->hashpos-(basesperhash-1);
        tmpdh.vhash=srvaI->vhash;
        hashfilesindex=tmpdh.vhash>>rightshift;
        CEBUG("Want to write rev: " << tmpdh << " to " << hashfilesindex << endl);
        BUGIFTHROW(hashfilesindex>=hashfiles.size(),"hashfilesindex>=hashfiles.size() ???");

        if(hashfilebuffer[hashfilesindex].size()==hashfilebuffer[hashfilesindex].capacity()){
          if(fwrite(&hashfilebuffer[hashfilesindex][0],sizeof(tmpdh),hashfilebuffer[hashfilesindex].size(),hashfiles[hashfilesindex]) != hashfilebuffer[hashfilesindex].size()){
            MIRANOTIFY(Notify::FATAL, "2 Could not write anymore to hash file. Disk full? Changed permissions?");
          }
          hashfilebuffer[hashfilesindex].clear();
        }
        hashfilebuffer[hashfilesindex].push_back(tmpdh);

        elementsperfile[hashfilesindex]++;
      }
    }
  }

  // write out whatever is still waiting in the buffers
  for(size_t i=0; i<numfiles; i++){
    if(!hashfilebuffer[i].empty()){
      if(fwrite(&(hashfilebuffer[i][0]),sizeof(tmpdh),hashfilebuffer[i].size(),hashfiles[i]) != hashfilebuffer[i].size()){
        MIRANOTIFY(Notify::FATAL, "3 Could not write anymore to hash file. Disk full? Changed permissions?");
      }
    }
  }

  P.finishAtOnce();
  cout << "done\n";

  for(size_t i=0; i<numfiles; i++){
    fclose(hashfiles[i]);
  }

  FUNCEND();
}
#define CEBUG(bla) 


/*************************************************************************
 *
 * sorter to sort from low to high in vhash (first) and direction
 *
 *************************************************************************/

inline bool Skim__sortDiskHashComparator_(const diskhash_t & a,
                                          const diskhash_t & b);
// "less than" for sorting: primary key is vhash, ties are broken by dir
inline bool Skim__sortDiskHashComparator_(const diskhash_t & a,
                                          const diskhash_t & b)
{
  if(a.vhash!=b.vhash){
    return a.vhash < b.vhash;
  }
  // same hash: order by direction
  return a.dir<b.dir;
}

/*************************************************************************
 *
 * sorter to sort from low to high in count
 *
 *************************************************************************/

inline bool Skim__sortHashStatComparatorByCount_(const hashstat_t & a,
                                                 const hashstat_t & b);
// "less than" for sorting: a goes before b if its count is the smaller one
inline bool Skim__sortHashStatComparatorByCount_(const hashstat_t & a,
                                                 const hashstat_t & b)
{
  return b.count > a.count;
}


/*************************************************************************
 *
 * sorts every hashfile and writes a hash statistics file
 *
 * TODO: delete single hash files and clear vectors of filenames and sizes
 * 
 * returns:
 *  - by value: elements in hash statistics file
 *  - name of file in the call by reference variable
 *
 *************************************************************************/

//#define CEBUG(bla)   {cout << bla; cout.flush();}

// Loads every temporary hash file, sorts its entries and condenses each run
//  of identical hashes into one hashstat_t record which is appended to the
//  hash statistics file "<directory>/hashstat.bin".
//
// A record is written for hashes seen more than once and, if
//  'alsosavesinglehashes' is set, also for hashes seen exactly once.
//  'hasfwdrev' of a record is set if the hash was seen at least 'fwdrevmin'
//  times in each direction.
//
// Returns the number of records written; the name of the statistics file
//  is returned in 'hashstatfilename'.
// 'basesperhash', 'rp' and 'onlyagainstrails' are not used by this function.
size_t Skim::createHashStatisticsFile(string & hashstatfilename, vector<string> & hashfilenames, vector<size_t> & elementsperfile, uint8 basesperhash, uint32 fwdrevmin, ReadPool & rp, bool onlyagainstrails, bool alsosavesinglehashes, const string & directory)
{
  FUNCSTART("size_t Skim::createHashStatisticsFile(string & hashstatfilename, vector<string> & hashfilenames, vector<size_t> & elementsperfile, ReadPool & rp, bool onlyagainstrails, const string & directory)");

  hashstatfilename=directory+"/hashstat.bin";
  // binary mode: raw structs get written
  FILE * fout=fopen(hashstatfilename.c_str(), "wb");
  if(fout==NULL){
    MIRANOTIFY(Notify::FATAL, "Could not open file " << hashstatfilename << " for writing. Disk full? Changed permissions?");
  }

  size_t maxelementsperfile=0;

  for(size_t fi=0; fi< elementsperfile.size(); fi++){
    maxelementsperfile=max(maxelementsperfile,elementsperfile[fi]);
  }

  CEBUG("Max elements per file: " << maxelementsperfile << '\n');

  // one pool, big enough for the largest file, reused for all files
  vector<diskhash_t> hashpool;
  hashpool.reserve(maxelementsperfile+10);

  size_t numhashstats=0;

  ProgressIndicator<int32> P(0, static_cast<int32>(elementsperfile.size()));

  for(size_t fi=0; fi< elementsperfile.size(); fi++){
    P.increaseprogress();

    CEBUG("Loading " << hashfilenames[fi] << endl);
    CEBUG("elements in file: " << elementsperfile[fi] << endl);

    if(elementsperfile[fi]==0) continue;
    hashpool.clear();
    hashpool.resize(elementsperfile[fi]);

    FILE * fin=fopen(hashfilenames[fi].c_str(), "rb");
    if(fin==NULL){
      fclose(fout);
      MIRANOTIFY(Notify::FATAL, "Could not open file " << hashfilenames[fi] << " for reading. Was the file deleted?");
    }
    const size_t numread=fread(&hashpool[0],sizeof(diskhash_t),elementsperfile[fi],fin);
    fclose(fin);
    if(numread != elementsperfile[fi]) {
      MIRANOTIFY(Notify::FATAL, "Expected to read " << elementsperfile[fi] << " elements in file " << hashfilenames[fi] << " but read less. Was the file deleted? Disk full?");
    }

    //for(size_t i=0; i<hashpool.size(); i++){
    //  CEBUG(hashpool[i] << '\n');
    //}

    CEBUG("Sorting ... "); cout.flush();
    sort(hashpool.begin(), hashpool.end(), Skim__sortDiskHashComparator_);
    CEBUG("done.\n");

    for(size_t i=0; i<hashpool.size(); i++){
      CEBUG(hashpool[i] << '\n');
    }

    vector<diskhash_t>::const_iterator dhI=hashpool.begin();
    uint32 hasforward=0;
    uint32 hasreverse=0;
    bool hasmultipleseqtype=false;
    uint8 thisseqtype=0;
    uint32 thishashcounter=0;
    uint16 thislowpos=0;
    hashstat_t tmphs;
    // setting this leads the very first iteration of the main loop
    //  to set correct values (hashpool is not empty at this point)
    vhash_t thishash=(dhI->vhash)-1;

    // The loop makes one extra round after the last element so that the
    //  final run of identical hashes goes through the very same write code
    //  as all others. The old separate end-of-loop code wrote the last
    //  record twice when its count was >1, while counting it only once.
    for(;; ++dhI){
      const bool atend=(dhI==hashpool.end());

      if(atend || dhI->vhash != thishash){
        // a run of identical hashes ended: write its record if wanted
        bool writeit=false;
        if(thishashcounter>1){
          writeit=true;
          tmphs.vhash=thishash;
          tmphs.count=thishashcounter;
          tmphs.lowpos=thislowpos;
          tmphs.hasfwdrev=(hasforward>=fwdrevmin) && (hasreverse>=fwdrevmin);
          tmphs.hasmultipleseqtype=hasmultipleseqtype;
          CEBUG("Write multiple: " << tmphs << '\n');
        }else if(thishashcounter==1 && alsosavesinglehashes){
          writeit=true;
          tmphs.vhash=thishash;
          tmphs.count=1;
          tmphs.lowpos=thislowpos;
          tmphs.hasfwdrev=false;
          tmphs.hasmultipleseqtype=false;
          CEBUG("Write single: " << tmphs << '\n');
        }
        if(writeit){
          if(fwrite(&tmphs,sizeof(hashstat_t),1,fout) != 1){
            MIRANOTIFY(Notify::FATAL, "Expected to write 1 element in file " << hashstatfilename << " but could not. Was the file deleted? Disk full?");
          }
          numhashstats++;
        }

        if(atend) break;

        // start collecting for the new hash
        hasforward=0;
        hasreverse=0;
        hasmultipleseqtype=false;
        thishash=dhI->vhash;
        thislowpos=dhI->hashpos;
        thishashcounter=0;
        thisseqtype=dhI->seqtype;
      }

      thishashcounter++;
      if(dhI->dir>0) {
        ++hasforward;
      }else{
        ++hasreverse;
      }
      if(dhI->hashpos < thislowpos) thislowpos=dhI->hashpos;
      if(dhI->seqtype != thisseqtype) hasmultipleseqtype=true;
    }
  }

  fclose(fout);

  P.finishAtOnce();

  FUNCEND();
  return numhashstats;
}
//#define CEBUG(bla)


/*************************************************************************
 *
 * Needs: the name of the hashstat file and a
 *  vector<hashstat_t> already resized to hold all elements of the file
 * returns:
 *  - average hash statistics
 *  - the hash statistics vector (sorted by count)
 *
 *************************************************************************/

//#define CEBUG(bla)   {cout << bla; cout.flush();}
// Loads hashstats.size() records from the file 'hashstatfilename' into
//  'hashstats' (the caller must have resized the vector accordingly) and
//  sorts them by count, low to high.
//
// Returns the count of the "middle" hash statistics entry as determined by
//  calcMidHashStatIndex(), or 0 if 'hashstats' is empty.
size_t Skim::loadHashStatisticsFile(string & hashstatfilename, vector<hashstat_t> & hashstats)
{
  FUNCSTART("size_t Skim::loadHashStatisticsFile(string & hashstatfilename, vector<hashstat_t> & hashstats)");

  CEBUG("Loading hash stats: " << hashstats.size() << endl);

  // Nothing to load. Must be checked first: taking &hashstats[0] of an
  //  empty vector is not allowed.
  if(hashstats.empty()){
    FUNCEND();
    return 0;
  }

  // binary mode: raw structs get read
  FILE * fin=fopen(hashstatfilename.c_str(), "rb");
  if(fin==NULL){
    MIRANOTIFY(Notify::FATAL, "Could not open hashfile " << hashstatfilename << " for reading. Was the file deleted?");
  }
  size_t numread=fread(&hashstats[0],sizeof(hashstat_t),hashstats.size(),fin);
  // close before a possible notification so that the handle does not leak
  fclose(fin);
  if(numread != hashstats.size()){
    MIRANOTIFY(Notify::FATAL, "Expected to read " << hashstats.size() << " elements in hashfile " << hashstatfilename << " but read less (" << numread << "). Was the file deleted? Disk full?");
  }

  sort(hashstats.begin(),hashstats.end(),Skim__sortHashStatComparatorByCount_);

  size_t mhi=calcMidHashStatIndex(hashstats,0);
  // if mh index is in last 10 % of the hashstats, we have a pretty skewed
  //  distribution. In that case, recalc without last 10%
  // TODO: check whether 40 or 50% wouldn't be better.
  if(mhi >= (hashstats.size()-hashstats.size()/10)){
    mhi=calcMidHashStatIndex(hashstats,10);
  }

  FUNCEND();
  return hashstats[mhi].count;
}
//#define CEBUG(bla)



/*************************************************************************
 *
 * 
 *
 *************************************************************************/


//#define CEBUG(bla)   {cout << bla; cout.flush();}
size_t Skim::calcMidHashStatIndex(const vector<hashstat_t> & hashstats, size_t dontcarepercent)
{
  FUNCSTART("size_t Skim::calcMidHashStatIndex(const vector<hashstat_t> & hashstats, size_t dontcarepercent)");

  if(hashstats.empty()) return 0;

  size_t firsti=0;
  size_t lasti=hashstats.size();
  if(dontcarepercent){
    firsti=hashstats.size()*dontcarepercent/100;
    lasti-=hashstats.size()*dontcarepercent/100;
  }else{
    // 5% default
    firsti=hashstats.size()/20;
    lasti-=hashstats.size()/20;
  }

  size_t sumhashcounts=0;
  uint32 oldhashcount=hashstats[0].count-1;
  size_t oldsumhashcounts=0;
  for(size_t i=firsti; i<lasti; i++){
    if(hashstats[i].count != oldhashcount){
      oldhashcount=hashstats[i].count;
      CEBUG("count: " << oldhashcount << "\tsumhash: " << sumhashcounts << "\tdiff: " << sumhashcounts-oldsumhashcounts << endl);      
      oldsumhashcounts=sumhashcounts;
    }
    if(hashstats[i].hasfwdrev) sumhashcounts+=hashstats[i].count;
  }
  CEBUG("count: " << oldhashcount << "\tsumhash: " << sumhashcounts << endl);

  // Hmmm, pathological case. Maybe all reads were in the same direction.
  //  simply recalc without the "has fwd/rev" clause
  bool dontusefwdrev=false;
  if(sumhashcounts==0){
    dontusefwdrev=true;
    for(size_t i=firsti; i<lasti; i++){
      sumhashcounts+=hashstats[i].count;
    }
    CEBUG("recalc sumhash: " << sumhashcounts << endl);
  }

  size_t midhashstats=sumhashcounts/2;

  CEBUG("midhashstats: " << midhashstats << endl);

  sumhashcounts=0;
  for(size_t i=firsti; i<lasti; i++){
    if(dontusefwdrev || hashstats[i].hasfwdrev) sumhashcounts+=hashstats[i].count;
    if(sumhashcounts>midhashstats) {
      return i;
    }
  }

  FUNCEND();

  return 0;
}
//#define CEBUG(bla)





/*************************************************************************
 *
 * Needs: 
 *  - the hash statistics vector (sorted by count)
 *
 *************************************************************************/

void Skim::dumpHashStatisticsInfo(size_t avgcov, vector<hashstat_t> & hashstats)
{
  FUNCSTART("void Skim::dumpHashStatisticsInfo(size_t avgcov, vector<hashstat_t> & hashstats)");

  cout << "Hash statistics:\n"
       << "=========================================================\n"
       << "Measured avg. frequency coverage: " << avgcov << endl

       << "\nDeduced thresholds:\n"
       << "-------------------"
       << "\nMin normal cov: " << SKIM3_freqest_minnormal*avgcov
       << "\nMax normal cov: " << SKIM3_freqest_maxnormal*avgcov
       << "\nRepeat cov: " << SKIM3_freqest_repeat*avgcov
       << "\nHeavy cov: " << SKIM3_freqest_heavyrepeat*avgcov
       << "\nCrazy cov: " << SKIM3_freqest_crazyrepeat*avgcov
       << "\nMask cov: " << SKIM3_nastyrepeatratio*avgcov
       << "\n\nRepeat ratio histogram:\n"
       << "-----------------------"
       << endl;

  vector<size_t> ratiocounts(1,0);
  for(size_t i=0; i<hashstats.size(); i++){
    uint32 rci=static_cast<uint32>((static_cast<double>(hashstats[i].count) / avgcov) + 0.5);
    if(rci>=ratiocounts.size()){
      ratiocounts.resize(rci+1,0);
    }
    ratiocounts[rci]++;
  }

  for(size_t i=0; i<ratiocounts.size(); i++){
    if(ratiocounts[i]) cout << i << '\t' << ratiocounts[i] << endl;
  }

  cout << "=========================================================\n\n";
  
  FUNCEND();

  return;
}



/*************************************************************************
 *
 * sorter to sort from low to high, but lower 24bit grouped
 *
 *
 *************************************************************************/

inline bool Skim__sortHashStatComparatorByLow24bit_(const hashstat_t & a, 
						    const hashstat_t & b);
inline bool Skim__sortHashStatComparatorByLow24bit_(const hashstat_t & a, 
						    const hashstat_t & b)
{
  if((a.vhash & MAXVHASHMASK) != (b.vhash & MAXVHASHMASK)) {
    return (a.vhash & MAXVHASHMASK) < (b.vhash & MAXVHASHMASK);
  }
  return a.vhash < b.vhash;
}


/*************************************************************************
 *
 * needs: 
 *  - hashstats filled with entries (can be unsorted, will be re-sorted
 *    anyway)
 *
 * returns:
 *  - hashstats array sorted by low 24 bit (low to high), then by vhash
 *  - hsshortcuts_begin and ..._end pointing to start and end of each
 *    low 24 bit group of same value
 *
 *************************************************************************/

//#define CEBUG(bla)   {cout << bla; cout.flush();}

// Sorts 'hashstats' by the low bits of vhash (those selected by
//  MAXVHASHMASK), then by the complete vhash, and records for every group
//  of entries sharing the same low bits where it begins and where it ends:
//  hsshortcuts_begin[lowbits] points to the first entry of the group,
//  hsshortcuts_end[lowbits] one past its last entry.
// Both shortcut vectors get 4^min(12,basesperhash) elements; elements of
//  groups which do not occur are set to hashstats.end().
void Skim::makeHashStatArrayShortcuts(vector<hashstat_t> & hashstats, const uint8 basesperhash, vector<vector<hashstat_t>::const_iterator > & hsshortcuts_begin, vector<vector<hashstat_t>::const_iterator > & hsshortcuts_end)
{
  FUNCSTART("void Skim::makeHashStatArrayShortcuts(vector<hashstat_t> & hashstats, const uint8 basesperhash, vector<vector<hashstat_t>::const_iterator > & hsshortcuts_begin, vector<vector<hashstat_t>::const_iterator > & hsshortcuts_end)");

  CEBUG("makeHashStatArrayShortcuts: basesperhash: " << static_cast<uint16>(basesperhash) << "\n");

  BUGIFTHROW(basesperhash==0, "basesperhash == 0 ???");

  for(size_t i=0; i<hashstats.size(); i++){
    CEBUG(hashstats[i] << '\n');
  }

  sort(hashstats.begin(), hashstats.end(), Skim__sortHashStatComparatorByLow24bit_);

  const vector<hashstat_t>::const_iterator hsend=hashstats.end();

  // 2 bits per base, at most 12 bases are used for the shortcuts
  const size_t numshortcuts=1<<(min(static_cast<uint8>(12),basesperhash)*2);

  // every shortcut initially marks "no such group"
  hsshortcuts_begin.clear();
  hsshortcuts_begin.resize(numshortcuts, hsend);
  hsshortcuts_end.clear();
  hsshortcuts_end.resize(numshortcuts, hsend);

  vector<hashstat_t>::const_iterator groupI=hashstats.begin();
  if(groupI==hsend) return;

  CEBUG("hsshortcuts_begin.size(): " << hsshortcuts_begin.size() << endl);
  CEBUG("hsshortcuts_end.size(): " << hsshortcuts_end.size() << endl);

  // walk through the sorted array, one group of equal low bits per round
  while(groupI != hsend){
    const vhash_t acthash=(groupI->vhash & MAXVHASHMASK);

    CEBUG("begin " << hex << acthash << dec << " is: " << *groupI << endl);
    hsshortcuts_begin[acthash]=groupI;

    // move on to the first entry not belonging to this group anymore
    while((groupI != hsend) && ((groupI->vhash & MAXVHASHMASK) == acthash)){
      CEBUG("INC\n");
      ++groupI;
    }

    CEBUG("end " << hex << acthash << dec << " is: " << *groupI << endl);
    hsshortcuts_end[acthash]=groupI;
    //cout << "vhash: " << hex << acthash << "\t" << dec << hsshortcuts_end[acthash]-hsshortcuts_begin[acthash] << '\n';
  }

  FUNCEND();
}
//#define CEBUG(bla)


/*************************************************************************
 *
 * comparator for lower_bound below
 *
 *************************************************************************/

inline bool Skim__compareHashStatHashElem_(const hashstat_t & a,
                                           const hashstat_t & b);
// "less than" on the vhash member only
inline bool Skim__compareHashStatHashElem_(const hashstat_t & a,
                                           const hashstat_t & b)
{
  return b.vhash > a.vhash;
}

//#define CEBUG(bla)   {cout << bla; cout.flush();}

// Annotates every usable read of 'rp' with per-base hash statistics.
//
// For each read that has valid data and is used in the assembly, the
// clipped sequence is transformed into hashes of 'basesperhash' bases.
// Each hash is looked up in 'hashstats' (via the shortcut tables); for
// every hash found, the base position of the first base of the hash
// (fwd) and of its last base (rev) get flags and a frequency class:
//   1: hash count == 1
//   2: count below avghashcov*SKIM3_freqest_minnormal
//   3: count within [min normal, max normal]
//   7: count > avghashcov*SKIM3_freqest_crazyrepeat
//   6: count > avghashcov*SKIM3_freqest_heavyrepeat
//   5: count >= avghashcov*SKIM3_freqest_repeat
//   4: everything else (above max normal, below repeat)
// (the classes are tested in the order listed above)
//
// Parameters:
//   rp                  read pool whose reads are annotated in place
//   avghashcov          average hash coverage; all thresholds are
//                       multiples of it
//   hashstats           hash statistics table; its end() iterator is
//                       the "no entry" marker of the shortcut tables
//   basesperhash        number of bases per hash
//   hsshortcuts_begin,
//   hsshortcuts_end     per (vhash & MAXVHASHMASK) bucket: begin/end
//                       iterators into 'hashstats'
//   masknastyrepeats    if true, stretches covered by hashes with
//                       count >= avghashcov*SKIM3_nastyrepeatratio get
//                       an MNRr tag
//
// Side effects: writes a progress indicator and a newline to cout.
void Skim::assignReadBaseStatistics(ReadPool & rp, size_t avghashcov, vector<hashstat_t> & hashstats, const uint8 basesperhash, vector<vector<hashstat_t>::const_iterator > & hsshortcuts_begin, vector<vector<hashstat_t>::const_iterator > & hsshortcuts_end, bool masknastyrepeats)
{
  FUNCSTART("void Skim::assignReadBaseStatistics(ReadPool & rp, size_t avghashcov, vector<hashstat_t> & hashstats, const uint8 basesperhash, vector<vector<hashstat_t>::const_iterator > & hsshortcuts_begin, vector<vector<hashstat_t>::const_iterator > & hsshortcuts_end)");

  // absolute count thresholds derived from the average hash coverage
  uint32 minnormalhashcov=static_cast<uint32>(static_cast<double>(avghashcov)*SKIM3_freqest_minnormal); 
  uint32 maxnormalhashcov=static_cast<uint32>(static_cast<double>(avghashcov)*SKIM3_freqest_maxnormal); 
  uint32 repeathashcov=static_cast<uint32>(static_cast<double>(avghashcov)*SKIM3_freqest_repeat);
  uint32 heavyrepthashcov=static_cast<uint32>(static_cast<double>(avghashcov)*SKIM3_freqest_heavyrepeat);
  uint32 crazyrepthashcov=static_cast<uint32>(static_cast<double>(avghashcov)*SKIM3_freqest_crazyrepeat);
  // a value of 0 disables the multicopy masking further below
  uint32 maskhashcov=static_cast<uint32>(static_cast<double>(avghashcov)*SKIM3_nastyrepeatratio); 

  CEBUG("minnormalhashcov: " << minnormalhashcov << endl);
  CEBUG("maxnormalhashcov: " << maxnormalhashcov << endl);
  CEBUG("repeathashcov: " << repeathashcov << endl);

  // hashes of the read currently worked on; re-used across reads
  vector<vhrap_t> singlereadvhraparray;
  singlereadvhraparray.reserve(10000);

  // we will not use a mask, but
  //  we need to supply an empty one anyway
  vector<uint8> tagmaskvector;

  // stores in each read whether the given hash frequency was seen
  //  (index == frequency class)
  vector<uint8> hasfrequency(8);

  // per raw base position: 1 if covered by a hash with
  //  count >= maskhashcov
  vector<uint8> mcmask;
  mcmask.reserve(10000);

  ProgressIndicator<int32> P(0, rp.size());

  for(uint32 actreadid=0; actreadid<rp.size(); actreadid++){
    P.progress(actreadid);

    //if(actreadid>100) return;

    Read & actread= rp.getRead(actreadid);
    if(!actread.hasValidData()
      || !actread.isUsedInAssembly()) continue; 

    // get rid of old values
    actread.clearAllBPosHashStats();

//#define CEBUG(bla)   {if(cebugok) cout << bla; cout.flush();}
//    bool cebugok=false;
//    if(actread.getName()=="E0K6C4E01CTNQI") cebugok=true;

    uint32 slen=actread.getLenClippedSeq();

    // clipped sequence too short for even a single hash
    if(slen<basesperhash) continue;

    // mcmask is indexed by raw (unclipped) positions, hence getLenSeq()
    mcmask.clear();
    mcmask.resize(actread.getLenSeq(),0);

    hasfrequency.clear();
    hasfrequency.resize(8,0);

    CEBUG("name: " << actread.getName() << '\n');

    //cout << "Before ...\n";
    //Read::setCoutType(Read::AS_TEXT);
    //cout << actread;

    // make room for the hashes of this read (at most slen of them)
    singlereadvhraparray.resize(slen);
    tagmaskvector.resize(slen,0);

    vector<vhrap_t>::iterator srvaI=singlereadvhraparray.begin();

    // constness is cast away so that the flags can be written in place
    vector<Read::bposhashstat_t> & bposhashstats=const_cast<vector<Read::bposhashstat_t> &>(actread.getBPosHashStats());
    uint32 hashesmade;

    {
      int32 bfpos=actread.calcClippedPos2RawPos(0);
      int32 bfposinc=1;

      hashesmade=transformSeqToVariableHash(
	actreadid,
	actread,
	actread.getClippedSeqAsChar(),
	slen,
	basesperhash,
	srvaI,
	false,
	1,
	tagmaskvector,
	bposhashstats,
	bfpos,
	bfposinc
	);
    }
    // keep only the entries really filled in
    singlereadvhraparray.resize(hashesmade);

    CEBUG("hashesmade: " << hashesmade << endl);

    vector<hashstat_t>::const_iterator lowerbound;
    vector<hashstat_t>::const_iterator upperbound;

    vector<hashstat_t>::const_iterator hssearchI;
    srvaI=singlereadvhraparray.begin();

    int32 bfpos1,bfpos2;
    hashstat_t hstmp;
    bool foundit;
    for(; srvaI != singlereadvhraparray.end(); srvaI++){
      CEBUG(*srvaI << '\n');

      foundit=false;
      // bucket of the hash statistics selected by the masked hash value
      lowerbound=hsshortcuts_begin[srvaI->vhash & MAXVHASHMASK];
      upperbound=hsshortcuts_end[srvaI->vhash & MAXVHASHMASK];
	
      // hashstats.end() is the "NULL" replacement, i.e. marks
      //  a bucket without entries
      if(hashstats.end() != lowerbound){
	if(basesperhash>12){
	  // with more than 12 bases in a hash, the array is subdivided 
	  //  and the exact hash must be searched for within the bucket
	  hstmp.vhash=srvaI->vhash;
	  hssearchI=lower_bound(lowerbound,
				upperbound,
				hstmp,
				Skim__compareHashStatHashElem_);
	  if(hssearchI != hashstats.end()
	     && hssearchI->vhash == srvaI->vhash) foundit=true;
	}else{
	  // the bucket start is taken as the entry for this hash
	  hssearchI=lowerbound;
	  foundit=true;
	}
      }else{
	CEBUG("---------- NO LB HIT??? -------\n");
      }

      if(foundit) {
	CEBUG("VHRAP: " << *srvaI << '\n');
	CEBUG("HashStat: " << *hssearchI << '\n');
	CEBUG("srvaI->hashpos: " << srvaI->hashpos << '\n');

	// bfpos1: raw position computed from hashpos-(basesperhash-1)
	//  (hashpos apparently denotes the last base of the hash)
	// bfpos2: bfpos1 plus basesperhash-1
	bfpos1=actread.calcClippedPos2RawPos(srvaI->hashpos-(basesperhash-1));
	bfpos2=bfpos1+basesperhash-1;

	CEBUG("b bfpos1: " << bfpos1 << '\t' << bposhashstats[bfpos1] << endl);
	CEBUG("b bfpos2: " << bfpos2 << '\t' << bposhashstats[bfpos2] << endl);

	// fwd info is stored at bfpos1, rev info at bfpos2
	bposhashstats[bfpos1].fwd.setValid();
	bposhashstats[bfpos2].rev.setValid();

	if(hssearchI->hasfwdrev) {
	  //bhs|=Read::BFLAGS_CONFIRMED_FWDREV;
	  CEBUG("Set ConfFWDREV\n");
	  bposhashstats[bfpos1].fwd.setConfirmedFwdRev();
	  bposhashstats[bfpos2].rev.setConfirmedFwdRev();
	}
	if(hssearchI->lowpos<=4){
	  //bhs|=Read::BFLAGS_SEENATLOWPOS;
	  CEBUG("Set SeenAtLowPos\n");
	  bposhashstats[bfpos1].fwd.setSeenAtLowPos();
	  bposhashstats[bfpos2].rev.setSeenAtLowPos();
	}
	if(hssearchI->hasmultipleseqtype){
	  //bhs|=Read::BFLAGS_CONFIRMED_MULTIPLESEQTYPE;
	  CEBUG("Set ConfMultSeqType\n");
	  bposhashstats[bfpos1].fwd.setConfirmedMultipleSeqType();
	  bposhashstats[bfpos2].rev.setConfirmedMultipleSeqType();
	}
	// translate the hash count into a frequency class (1..7)
	uint8 frequency=2;
	if(hssearchI->count == 1){
	  frequency=1;
	}else if(hssearchI->count<minnormalhashcov) {
	  frequency=2;
	}else if(hssearchI->count>=minnormalhashcov
	   && hssearchI->count<=maxnormalhashcov) {
	  frequency=3;
	  //}else if(hssearchI->count > minnormalhashcov*20){
	}else if(hssearchI->count > crazyrepthashcov){
	  frequency=7;
	}else if(hssearchI->count > heavyrepthashcov){
	  frequency=6;
	}else if(hssearchI->count>=repeathashcov){
	  frequency=5;
	}else{
	  frequency=4;
	}
	CEBUG("Set frequency: " << static_cast<uint16>(frequency) << endl);

	// remember all bases of hashes which occur often enough
	//  to count as "nasty repeat"
	if(maskhashcov>0 && hssearchI->count>=maskhashcov){
	  for(uint32 j=0; j<basesperhash; j++){
	    mcmask[bfpos1+j]=1;
	  }
	}

	CEBUG("a1 bfpos1: " << bfpos1 << '\t' << bposhashstats[bfpos1] << endl);
	CEBUG("a1 bfpos2: " << bfpos2 << '\t' << bposhashstats[bfpos2] << endl);

	bposhashstats[bfpos1].fwd.setFrequency(frequency);
	bposhashstats[bfpos2].rev.setFrequency(frequency);

	CEBUG("a2 bfpos1: " << bfpos1 << '\t' << bposhashstats[bfpos1] << endl);
	CEBUG("a2 bfpos2: " << bfpos2 << '\t' << bposhashstats[bfpos2] << endl);

	hasfrequency[frequency]=1;
	
	//cout.flush();
	//actread.setBaseFlagsInClippedSequence(bhs,
	//				      srvaI->hashpos-(basesperhash-1),
	//				      basesperhash);
	//actread.setHasBaseFlags(true);
      }
    }

    // read-level summary: was an "average" (class 3) or a
    //  "repeat" (class 5, 6 or 7) frequency seen in this read?
    actread.setHasFreqAvg(false);
    actread.setHasFreqRept(false);

    if(hasfrequency[3]){
      actread.setHasFreqAvg(true);
    }
    if(hasfrequency[5] || hasfrequency[6] || hasfrequency[7]){
      actread.setHasFreqRept(true);
    }

    actread.setHasBaseHashStats(true);

    //cout << "After ...\n";
    //Read::setCoutType(Read::AS_TEXT);
    //cout << actread;


    // BaCh 07.04.2009 Bad Idea!!!
    // BaCh 12.07.2009 Why? Forgot ... :-(
    ///* the fwd/rev of a read now looks like this (e.g.)
    //   (for better viewing dot == 0)
    //
    //   f   ..........2222222233333....355555....................
    //   r   ................2222222....33333355555...............
    //
    //   in dubio pro reo and to allow for potential matches,
    //   do this:
    //
    //   f   ..........2222222233333....355555->..................
    //   r   ..............<-2222222....33333355555...............
    //
    //   so that this 
    //
    //   f   ..........2222222233333....35555555555...............
    //   r   ..........2222222222222....33333355555...............
    //
    //   is generated
    //
    //*/
    //
    //{
    //  uint32 bfposi=0;
    //  for(; bfposi<bposhashstats.size() && bposhashstats[bfposi].fwd.getFrequency()==0; bfposi++) {};
    //  uint32 bfpose=bfposi;
    //  for(; bfpose<bposhashstats.size() && bposhashstats[bfpose].rev.getFrequency()==0; bfpose++) {};
    //  if(bfposi<bposhashstats.size() && bfpose<bposhashstats.size()){
    //	for(uint32 i=bfposi; i<bfpose; i++){
    //	  bposhashstats[i].fwd=bposhashstats[bfpose].rev;
    //	}
    //  }
    //
    //  bfposi=bposhashstats.size()-1;
    //  for(; bfposi>0 && bposhashstats[bfposi].rev.getFrequency()==0; bfposi--) {};
    //  bfpose=bfposi;
    //  for(; bfpose>0 && bposhashstats[bfpose].fwd.getFrequency()==0; bfpose--) {};
    //  if(bfposi>0){
    //	for(uint32 i=bfposi; i>bfpose; i--){
    //	  bposhashstats[i].fwd=bposhashstats[bfpose].rev;
    //	}
    //  }
    //}


    // go through multicopy array and set MNRr tags for
    //  consecutive positions in read tagged as multicopy
    //  (one tag per maximal run [runstart, pos-1])
    if(masknastyrepeats){
      bool inrun=false;
      uint32 runstart=0;
      uint32 pos=0;
      for(; pos<mcmask.size(); pos++){
	CEBUG("pos: " << pos << '\t' << static_cast<uint16>(mcmask[pos]) << '\t' << inrun << '\n');
	if(mcmask[pos]){
	  if(!inrun){
	    runstart=pos;
	    inrun=true;
	  }
	}else{
	  if(inrun){
	    CEBUG("reprun " << actread.getName() << '\t' << runstart << '\t' << pos-1 << endl);
	    actread.addTag(runstart, pos-1, Read::REA_tagentry_idMNRr, Read::REA_tagentry_coEmpty);
	    inrun=false;
	  }
	}
      }
      // a run reaching the very end of the read is closed here
      if(inrun){
	CEBUG("reprun " << actread.getName() << '\t' << runstart << '\t' << pos-1 << endl);
	actread.addTag(runstart, pos-1, Read::REA_tagentry_idMNRr, Read::REA_tagentry_coEmpty);
	inrun=false;
      }
    }



  }

  P.finishAtOnce();
  cout << '\n';

  //cout << "\nskim Needs redo!\n";
  //exit(0);

  FUNCEND();
}






/*************************************************************************
 *
 *
 *
 *************************************************************************/


// Counts how many hashes of the clipped sequence of 'actread' have an
// entry in the hash statistics table SKIM3_hs_hashstats.
//
// The clipped sequence is transformed into hashes of
// SKIM3_hs_basesperhash bases; each hash is then looked up via the
// shortcut tables SKIM3_hs_hsshortcuts_begin / _end.
//
// Returns the number of hashes found. Returns 0 if the read has no
// valid data, if its clipped sequence is shorter than one hash or if
// the shortcut tables have not been set up.
//
// Uses the member buffers SKIM3_baiting_tagmaskvector and
// SKIM3_baiting_singlereadvhraparray as scratch space.
uint32 Skim::checkBaitHit(Read & actread)
{
  //, const uint8 basesperhash, vector<hashstat_t> & hashstats, vector<vector<hashstat_t>::const_iterator > & hsshortcuts_begin, vector<vector<hashstat_t>::const_iterator > & hsshortcuts_end)
  FUNCSTART("checkBaitHit()");

  if(!actread.hasValidData()) return 0;
  uint32 slen=actread.getLenClippedSeq();
  if(slen<SKIM3_hs_basesperhash) return 0;

  // don't really need to clear out these re-used vectors
  //   - tagmask just needs to contain "0" at every element
  //   - singlereadvhraparray needs to be big enough to be
  //     written into by transformSeqToVariableHash()
  if(SKIM3_baiting_tagmaskvector.size() < slen){
    SKIM3_baiting_tagmaskvector.resize(slen,0);
  }
  if(SKIM3_baiting_singlereadvhraparray.size() < slen){
    SKIM3_baiting_singlereadvhraparray.resize(slen);
  }

  vector<vhrap_t>::iterator srvaI=SKIM3_baiting_singlereadvhraparray.begin();

  // transformSeqToVariableHash() wants a non-const reference
  vector<Read::bposhashstat_t> & bposhashstats=const_cast<vector<Read::bposhashstat_t> &>(actread.getBPosHashStats());
  uint32 hashesmade;

  {
    int32 bfpos=0;
    int32 bfposinc=1;

    // the read id is only a dummy value here
    uint32 actreadid=0;

    hashesmade=transformSeqToVariableHash(
      actreadid,
      actread,
      actread.getClippedSeqAsChar(),
      slen,
      SKIM3_hs_basesperhash,
      srvaI,
      false,
      1,
      SKIM3_baiting_tagmaskvector,
      bposhashstats,
      bfpos,
      bfposinc
      );
  }
  // keep only the entries really filled in
  SKIM3_baiting_singlereadvhraparray.resize(hashesmade);

  CEBUG("hashesmade: " << hashesmade << endl);

  // Without shortcut tables there is nothing to search in; indexing
  //  the empty vectors below would be undefined behaviour.
  //  (same guard as in findAdaptorRightClip_internal())
  if(SKIM3_hs_hsshortcuts_begin.empty() || SKIM3_hs_hsshortcuts_end.empty()){
    FUNCEND();
    return 0;
  }

  vector<hashstat_t>::const_iterator lowerbound;
  vector<hashstat_t>::const_iterator upperbound;
  vector<hashstat_t>::const_iterator hssearchI;

  hashstat_t hstmp;
  uint32 numhits=0;
  for(srvaI=SKIM3_baiting_singlereadvhraparray.begin();
      srvaI != SKIM3_baiting_singlereadvhraparray.end();
      ++srvaI){
    CEBUG(*srvaI << '\n');

    bool foundit=false;
    lowerbound=SKIM3_hs_hsshortcuts_begin[srvaI->vhash & MAXVHASHMASK];
    upperbound=SKIM3_hs_hsshortcuts_end[srvaI->vhash & MAXVHASHMASK];

    // SKIM3_hs_hashstats.end() is the "NULL" replacement, i.e. marks
    //  a bucket without entries
    if(SKIM3_hs_hashstats.end() != lowerbound){
      if(SKIM3_hs_basesperhash>12){
        // with more than 12 bases in a hash, the array is subdivided
        //  and the exact hash must be searched for within the bucket
        hstmp.vhash=srvaI->vhash;
        hssearchI=lower_bound(lowerbound,
                              upperbound,
                              hstmp,
                              Skim__compareHashStatHashElem_);
        // lower_bound() returns the end of the searched range if
        //  nothing fits, so test against that before dereferencing
        if(hssearchI != upperbound
           && hssearchI->vhash == srvaI->vhash) foundit=true;
      }else{
        // the bucket start is taken as the entry for this hash
        hssearchI=lowerbound;
        foundit=true;
      }
    }else{
      CEBUG("---------- NO LB HIT??? -------\n");
    }

    if(foundit) {
      ++numhits;
    }
  }

  //cout << "\nskim Needs redo!\n";
  //exit(0);

  FUNCEND();
  return numhits;
}




/*************************************************************************
 *
 * Around each RMB base, set the base statitsics to "repeat"
 * More a test.
 *
 *
 *************************************************************************/

//#define CEBUG(bla)   {cout << bla; cout.flush();}

// For every read which has valid data, is used in the assembly and
// carries base hash statistics: looks at each SRMr / CRMr tag and lifts
// the frequency of valid hash statistics around the tag to 5 wherever
// it is currently lower.
//   - from (tag start - (basesperhash-1)), floored at 0, up to the
//     tag end: fwd and rev
//   - from tag start up to (tag end + (basesperhash-1)), capped at the
//     last position: rev
// Writes a progress indicator and a newline to cout.
void Skim::correctReadBaseStatisticsByRMB(ReadPool & rp, const uint8 basesperhash)
{
  ProgressIndicator<int32> P(0, rp.size());

  for(uint32 actreadid=0; actreadid<rp.size(); actreadid++){
    P.progress(actreadid);

    Read & actread= rp.getRead(actreadid);
    if(!actread.hasValidData()
       || !actread.isUsedInAssembly()
       || !actread.hasBaseHashStats()) continue;

    // only reads with at least one of the two tag types are of interest
    const bool hasrmbtag=(actread.hasTag(Read::REA_tagentry_idSRMr)
                          || actread.hasTag(Read::REA_tagentry_idCRMr));
    if(!hasrmbtag) continue;

    CEBUG("Read " << actread.getName() << endl);
    for(uint32 tagnr=0; tagnr<actread.getNumOfTags(); tagnr++){
      const multitag_t & acttag=actread.getTag(tagnr);
      if(acttag.identifier!=Read::REA_tagentry_idSRMr
         && acttag.identifier!=Read::REA_tagentry_idCRMr) continue;

      CEBUG("Tag " << acttag << endl);
      vector<Read::bposhashstat_t> & bposhashstats=const_cast<vector<Read::bposhashstat_t> &>(actread.getBPosHashStats());
      if(bposhashstats.empty()) continue;

      Read::setCoutType(Read::AS_TEXT);
      CEBUG("Before\n" << actread);

      // tag boundaries, ordered so that tagfirst <= taglast
      uint32 tagfirst=acttag.from;
      uint32 taglast=acttag.to;
      if(taglast<tagfirst) swap(tagfirst,taglast);

      // first pass: extend to the left by one hash length (minus one)
      uint32 scanfrom=tagfirst;
      if(scanfrom>=basesperhash-1){
        scanfrom-=basesperhash-1;
      }else{
        scanfrom=0;
      }
      uint32 scanto=taglast;

      CEBUG("bposfrom: " << scanfrom << "\tbposto: " << scanto << endl);
      for(uint32 bpos=scanfrom; bpos<=scanto; ++bpos){
        Read::bposhashstat_t & actbhs=bposhashstats[bpos];
        if(actbhs.fwd.isValid()
           && actbhs.fwd.getFrequency()<5){
          actbhs.fwd.setFrequency(5);
        }
        if(actbhs.rev.isValid()
           && actbhs.rev.getFrequency()<5){
          actbhs.rev.setFrequency(5);
        }
      }

      // second pass: extend to the right, but stay within the read
      scanfrom=tagfirst;
      scanto=taglast;
      scanto+=basesperhash-1;
      if(scanto>=bposhashstats.size()) scanto=bposhashstats.size()-1;

      CEBUG("bposfrom: " << scanfrom << "\tbposto: " << scanto << endl);
      for(uint32 bpos=scanfrom; bpos<=scanto; ++bpos){
        Read::bposhashstat_t & actbhs=bposhashstats[bpos];
        if(actbhs.rev.isValid()
           && actbhs.rev.getFrequency()<5){
          actbhs.rev.setFrequency(5);
        }
      }

      CEBUG("After\n" << actread);
    }
  }
  P.finishAtOnce();
  cout << '\n';
}
//#define CEBUG(bla)








/*************************************************************************
 *
 *
 *
 *************************************************************************/

//#define CEBUG(bla)   {cout << bla; cout.flush();}

// Prepares this Skim object for streamed searches against the reads
// of 'rp'.
//
// Parameters:
//   rp                read pool; its address is kept in SKIM3_readpool
//   bph               bases per hash; limited to 14 if vhash_t has
//                     4 bytes and to 30 if it has 8 bytes
//   hss               hash save stepping
//   additionalregexp  NULL or a text with additional regular
//                     expressions; stored verbatim in
//                     SKIM3_farc_addregexes_file (the farc thread data
//                     initialisation reads it line by line)
//
// Side effects: writes a date stamp to cout and sets the Read output
// type to AS_CLIPPEDFASTA.
void Skim::skimStreamPrepare(ReadPool & rp,
			     uint8  bph,
			     uint8  hss,
			     const char * additionalregexp)
{
  // the function name given here used to be "skimGo", which made
  //  diagnostics using it point to the wrong function
  FUNCSTART("void Skim::skimStreamPrepare(ReadPool & rp, uint8 bph, uint8 hss, const char * additionalregexp)");

  dateStamp(cout);
  Read::setCoutType(Read::AS_CLIPPEDFASTA);

  init();

  SKIM3_readpool=&rp;

  // limit the hash length depending on the size of vhash_t
  if(sizeof(vhash_t)==4){
    if(bph>14) bph=14;
  }
  if(sizeof(vhash_t)==8){
    if(bph>30) bph=30;
  }
  SKIM3_basesperhash=bph;
  SKIM3_hashsavestepping=hss;

  fillTagStatusInfoOfReads();

  prepareSkim(0, rp.size(), SKIM3_vhraparray,false);

  SKIM3_farc_addregexes_file.clear();

  //// prepare regular expressions
  if(additionalregexp!=NULL){
    SKIM3_farc_addregexes_file=additionalregexp;
  }
  //if(additionalregexp!=NULL){
  //  istringstream tmpis(additionalregexp);
  //  string line;
  //  while(true){
  //    getline(tmpis,line);
  //    if(tmpis.eof()) break;
  //    boost::to_upper(line);
  //    SKIM3_farc_addregexes_templates.push_back(boost::regex(line));
  //  }
  //}

  FUNCEND();
}
//#define CEBUG(bla)






/*************************************************************************
 *
 * seqtype -1 == all sequencing types, else only the given
 *
 *************************************************************************/

//#define CEBUG(bla)   {cout << bla; cout.flush();}

// Searches all reads of 'searchpool' for adaptor hits, using
// 'numthreads' worker threads.
//
// Parameters:
//   searchpool  reads to be searched
//   results     output; resized to searchpool.size() and preset
//               with -1. For reads with a hit, the worker threads
//               store the value returned by
//               findAdaptorRightClip_internal() (>= 0)
//   seqtype     -1 (any negative value): all sequencing types, else
//               only reads of the given type are searched
//   minhashes   minimum number of hashes a hit must have
//   numthreads  number of threads handed to startMultiThreading()
void Skim::findAdaptorRightClip(ReadPool & searchpool, vector<int32> & results, int8 seqtype, uint32 minhashes, uint32 numthreads)
{
  // the function name given here used to be "checkForAdaptor", which
  //  made diagnostics using it point to the wrong function
  FUNCSTART("void Skim::findAdaptorRightClip(ReadPool & searchpool, vector<int32> & results, int8 seqtype, uint32 minhashes, uint32 numthreads)");

  results.clear();
  results.resize(searchpool.size(),-1);

  // the worker threads get their job description via these members
  SKIM3_farc_searchpool=&searchpool;
  SKIM3_farc_results=&results;
  SKIM3_farc_minhashes=minhashes;
  SKIM3_farc_seqtype=seqtype;

  startMultiThreading(1,numthreads,10000,0,searchpool.size(),
		      boost::bind( &Skim::farcThreadsDataInit, this, _1 ),
		      boost::bind( &Skim::farcThreadLoop, this, _1 ));

  FUNCEND();
}

// Single read, non-threaded variant: searches 'actread' for adaptor
// hits with at least 'minhashes' hashes, using the scratch data in
// SKIM3_farcdata_fornonmultithread.
// Returns the value of findAdaptorRightClip_internal() (-1: no hit).
int32 Skim::findAdaptorRightClip(Read & actread, uint32 minhashes)
{
  // the function name given here used to be "checkForAdaptor", which
  //  made diagnostics using it point to the wrong function
  FUNCSTART("int32 Skim::findAdaptorRightClip(Read & actread, uint32 minhashes)");

  const int32 rightclip=findAdaptorRightClip_internal(actread,minhashes,SKIM3_farcdata_fornonmultithread);

  // FUNCEND() used to be placed after the return and was never reached
  FUNCEND();
  return rightclip;
}



#define CEBUG2(bla)
//#define CEBUG(bla)   {cout << bla; cout.flush();}

// Searches the clipped sequence of 'actread' for hits against the
// sequences hashed into the vhrap array (reached via
// SKIM3_vashortcuts_begin / _end).
//
// Parameters:
//   actread    read to be searched
//   minhashes  minimum number of hashes a hit must have. For hits
//              with a negative expected offset, the (negative) offset
//              is added to the hash count before the check
//   farcd      scratch data of the calling thread
//
// Returns -1 if there is no acceptable hit, else the expected offset
// of the acceptable hit with the smallest offset (negative offsets
// are returned as 0).
int32 Skim::findAdaptorRightClip_internal(Read & actread, uint32 minhashes, farc_threaddata_t & farcd)
{
  // the function name given here used to be "checkForAdaptor", which
  //  made diagnostics using it point to the wrong function
  FUNCSTART("int32 Skim::findAdaptorRightClip_internal(Read & actread, uint32 minhashes, farc_threaddata_t & farcd)");

  CEBUG("farc_i: " << actread.getName() << endl);

  if(SKIM3_vashortcuts_begin.empty() || SKIM3_vashortcuts_end.empty()) return -1;
  if(!actread.hasValidData()) return -1;
  uint32 slen=actread.getLenClippedSeq();
  if(slen<SKIM3_basesperhash) return -1;

  // don't really need to clear out these re-used vectors
  //   - tagmask just needs to contain "0" at every element
  //   - singlereadvhraparray needs to be big enough to be
  //     written into by transformSeqToVariableHash()
  if(farcd.tagmaskvector.size() < slen){
    farcd.tagmaskvector.resize(slen,0);
  }
  if(farcd.singlereadvhraparray.size() < slen){
    farcd.singlereadvhraparray.resize(slen);
  }

  farcd.readhashmatches.clear();

  vector<vhrap_t>::iterator srvaI=farcd.singlereadvhraparray.begin();

  // transformSeqToVariableHash() wants a non-const reference
  vector<Read::bposhashstat_t> & bposhashstats=const_cast<vector<Read::bposhashstat_t> &>(actread.getBPosHashStats());
  uint32 hashesmade;

  // the read is not necessarily part of a pool: use a dummy id
  uint32 actreadid=0xffffffff;

  {
    int32 bfpos=0;
    int32 bfposinc=1;

    hashesmade=transformSeqToVariableHash(
      actreadid,
      actread,
      actread.getClippedSeqAsChar(),
      slen,
      SKIM3_basesperhash,
      srvaI,
      false,
      1,
      farcd.tagmaskvector,
      bposhashstats,
      bfpos,
      bfposinc
      );
  }
  // keep only the entries really filled in
  farcd.singlereadvhraparray.resize(hashesmade);

  CEBUG("hashesmade: " << hashesmade << endl);


  // collect all entries of the vhrap array having the same hash
  //  as one of the hashes of this read
  srvaI=farcd.singlereadvhraparray.begin();

  vector<vhrap_t>::const_iterator lowerbound;
  vector<vhrap_t>::const_iterator upperbound;
  for(; srvaI != farcd.singlereadvhraparray.end(); srvaI++){
    lowerbound=SKIM3_vashortcuts_begin[srvaI->vhash & MAXVHASHMASK];
    upperbound=SKIM3_vashortcuts_end[srvaI->vhash & MAXVHASHMASK];

    // SKIM3_completevhraparray_end is the "NULL" replacement, i.e.
    //  marks a bucket without entries
    if(SKIM3_completevhraparray_end != lowerbound){
      if(SKIM3_basesperhash>12){
        // with more than 12 bases in a hash, the vhrap array is
        //  subdivided
        pair<vector<vhrap_t>::const_iterator, vector<vhrap_t>::const_iterator>
          p=equal_range(lowerbound,
                        upperbound,
                        *srvaI,
                        Skim__compareVHRAPArrayElem_);
        lowerbound=p.first;
        upperbound=p.second;
      }

      for(;lowerbound!=upperbound; lowerbound++){
        //CEBUG("/// " << actreadid << '\t' << lowerbound->readid << '\n');
        //CEBUG("/// take!\n");
        farcd.readhashmatches.resize(farcd.readhashmatches.size()+1);
        farcd.readhashmatches.back().rid2=lowerbound->readid;
        farcd.readhashmatches.back().hashpos1=srvaI->hashpos;
        farcd.readhashmatches.back().hashpos2=lowerbound->hashpos;
        farcd.readhashmatches.back().eoffset=srvaI->hashpos - lowerbound->hashpos;
        farcd.readhashmatches.back().bhashstats=srvaI->bhashstats;

        CEBUG2("added: " << farcd.readhashmatches.back());
      }
    }
  }

  int32 retvalue=-1;

  if(!farcd.readhashmatches.empty()){
    checkForPotentialAdaptorHits(1, actreadid, actread, farcd.tmpmatchwith, farcd.readhashmatches);

    if(!farcd.tmpmatchwith.empty()){
      CEBUG2("Hits of: " << actread.getName() << endl);
      vector<matchwithsorter_t>::const_iterator ssmwsI=farcd.tmpmatchwith.begin();
      vector<matchwithsorter_t>::const_iterator leftmostokI=farcd.tmpmatchwith.end();
      vector<matchwithsorter_t>::const_iterator largestokI=farcd.tmpmatchwith.end();
      int32 leftmostpos=100000000;
      uint32 largesthash=0;
      for(; ssmwsI != farcd.tmpmatchwith.end(); ++ssmwsI){
        CEBUG2(actread.getName() << " to " << SKIM3_readpool->getRead(ssmwsI->otherid).getName() << " : " << *ssmwsI);
        if(ssmwsI->numhashes>=minhashes){
          // hits starting left of the read: reduce the hash count
          //  by the overhang
          int32 numhashes=ssmwsI->numhashes;
          if(ssmwsI->eoffset < 0) numhashes+=ssmwsI->eoffset;
          // numhashes may have become negative. Comparing it directly
          //  with the unsigned minhashes would convert it to a huge
          //  unsigned value and wrongly accept the hit, so test the
          //  sign first.
          if(numhashes>=0
             && static_cast<uint32>(numhashes)>=minhashes){
            if(ssmwsI->eoffset < leftmostpos){
              leftmostpos=ssmwsI->eoffset;
              leftmostokI=ssmwsI;
            }
            if(static_cast<uint32>(numhashes)>largesthash){
              largesthash=static_cast<uint32>(numhashes);
              largestokI=ssmwsI;
            }
          }
        }
      }
      if(leftmostokI!=farcd.tmpmatchwith.end()){
        //cout << "Chosen " << SKIM3_readpool->getRead(leftmostokI->otherid).getName() << ": " << *leftmostokI;
        retvalue=leftmostokI->eoffset;
        if(retvalue<0) retvalue=0;
        if(leftmostokI!=largestokI){
          //cout << "Chosen != Biggest " << SKIM3_readpool->getRead(largestokI->otherid).getName() << ": " << *largestokI;
        }
      }
    }
    //selectPotentialHitsForSave2(direction, actreadid,
    //				cfhd);

  }

  // not hit via SKIM? use plain old regex
  // (currently switched off via "if(0)")
  if(0){
  //if(retvalue<0){
    CEBUG("No hit, trying harder: " << actread.getName() << endl);

    string seq(actread.getSeqAsChar());
    boost::to_upper(seq);

    list<boost::regex>::const_iterator areI=farcd.addregexes.begin();
    boost::match_flag_type flags = boost::match_default;
    std::string::const_iterator start, end;
    for(; areI != farcd.addregexes.end(); ++areI){
      start = seq.begin();
      end = seq.end();
      if(regex_search(start, end, farcd.rematches, *areI, flags)) {
//	++numclipped;
//	actread.setRSClipoff(what.position());
//	logfout << logprefix << " Solexa partial end adaptor: " << actread.getName()
//		<< " changed right clip to " << farcd.rematches.position() << "\n";
        retvalue=farcd.rematches.position();
        CEBUG("Regex hit: " << retvalue << endl);
        break;
      }
    }
  }

  return retvalue;
}

#define CEBUG(bla) 





/*************************************************************************
 *
 * a simplified version of checkForPotentialHits()
 *
 *
 *************************************************************************/


//#define CEBUG(bla)   {cout << bla; cout.flush();}
//#define CEBUG_extra_cFPH

// Condenses the single hash matches of one read into potential hits.
//
// 'readhashmatches' is sorted (with Skim__sortreadhashmatch_t_) and
// then walked through in groups: a group consists of consecutive
// entries with the same rid2 whose expected offsets differ by at most
// 2 from one entry to the next. For each group the covered range, the
// mean expected offset, the maximum possible overlap and the percentage
// of the overlap covered by hashes are computed; groups covering at
// least 13 bases are appended to 'tmpmatchwith'.
//
// Parameters:
//   direction        only used in debug output
//   actreadid        only used in debug output
//   actread          read the hash matches belong to
//   tmpmatchwith     output; cleared first, gets one entry per
//                    accepted group
//   readhashmatches  hash matches of the read; gets sorted in place
void Skim::checkForPotentialAdaptorHits(const int8 direction, const uint32 actreadid, Read & actread, vector<matchwithsorter_t> & tmpmatchwith, vector<readhashmatch_t> & readhashmatches)
{
  //bool dodebug=false;

  CEBUG("Potential hits of " << actread.getName() << " (" << actreadid << ") (" << static_cast<int16>(direction) << '/' << actread.getLenClippedSeq() << ")\n----------------\n");
  CEBUG(actread << endl);
  CEBUG("----------------\n");

  tmpmatchwith.clear();

  // readhashmatches should not be empty ... normally.
  // but new method to deal with megahubs reduces this vector, keeping only
  //  'approximately normal' frequencies. Which in turn means: some vectors
  //  might be completely emptied
  // so, if it is empty, return immediately
  if(readhashmatches.empty()) return;

  // NOTE(review): the grouping below relies on this sort bringing
  //  entries with equal rid2 (and similar eoffset) next to each
  //  other; the comparator is not visible here -- confirm
  sort(readhashmatches.begin(), readhashmatches.end(), Skim__sortreadhashmatch_t_);
  
  vector<readhashmatch_t>::const_iterator sI=readhashmatches.begin();

  // id of the other sequence of the group currently worked on
  uint32 countid=sI->rid2;
  while(sI != readhashmatches.end()){
    
    uint32 rid2=sI->rid2;
    uint16 oldhashpos=sI->hashpos1;
    // min / max of the hash positions in this read (hp1*) and in
    //  the other sequence (hp2*)
    uint16 hp1min=0xffff;
    uint16 hp1max=0;

    uint16 hp2min=0xffff;
    uint16 hp2max=0;

    // min / max of the expected offsets seen in this group
    int32  eoffsetmin=0x7fffffff;
    int32  eoffsetmax=0x80000000;
    int32 oldeoffset=sI->eoffset;

    uint32 numhashes=0;

    vector<readhashmatch_t>::const_iterator sIS=sI;
    for(;sI != readhashmatches.end() && sI->rid2 == countid; sI++){
      CEBUG(*sI);

      // this ensures that the eoffset between two following
      //  entries may not differ by too much (2 bases here for adaptor search)
      // IF they do, then this is treated like a different hit
      //  by breaking the loop
      if(abs(sI->eoffset - oldeoffset) > 2){
	CEBUG("BREAKER!\n");
	break;
      }
      numhashes++;

      hp1min=min(hp1min,sI->hashpos1);
      hp1max=max(hp1max,sI->hashpos1);
      eoffsetmin=min(eoffsetmin,sI->eoffset);
      eoffsetmax=max(eoffsetmax,sI->eoffset);
      oldeoffset=sI->eoffset;

      hp2min=min(hp2min,sI->hashpos2);
      hp2max=max(hp2max,sI->hashpos2);

#ifdef CEBUG_extra_cFPH
      {
	boost::mutex::scoped_lock lock(SKIM3_coutmutex);
	CEBUG(sI->rid2
	      << "\t" << SKIM3_readpool->getRead(sI->rid2).getName()
	      << "\t" << SKIM3_readpool->getRead(sI->rid2).getLenClippedSeq()
	      << "\t" << sI->eoffset
	      << "\t" << sI->hashpos1
	      << "\t" << oldhashpos
	      << '\n');
      }
#endif

      oldhashpos=sI->hashpos1;
    }

    int32 maxoverlap;

    // adjust min positions for the hash length
    hp1min-=(SKIM3_basesperhash-1);
    hp2min-=(SKIM3_basesperhash-1);

    // middle between smallest and largest expected offset
    int32 eoffsetmean=eoffsetmin+(eoffsetmax-eoffsetmin)/2;
    
    // calc max overlap
    // currently only for one offset
    if(eoffsetmean<0){
      maxoverlap=min(SKIM3_readpool->getRead(rid2).getLenClippedSeq()+eoffsetmean,actread.getLenClippedSeq());
    }else{
      maxoverlap=min(actread.getLenClippedSeq()-eoffsetmean,SKIM3_readpool->getRead(rid2).getLenClippedSeq());
    }

    // correct the maxoverlap by the modulo of the hash steps as the
    //  border hashes will be found only in 1/(hash stepping) cases
    // NOTE(review): divides by SKIM3_hashsavestepping; presumably
    //  never 0 -- confirm
    maxoverlap=maxoverlap-(maxoverlap%SKIM3_hashsavestepping);

    // hashesoverlap is not the number of hashes in the overlap,
    // but the length of the overlap
    int32 hashesoverlap=hp1max-hp1min+1;

    // NOTE(review): divides by maxoverlap; a value of 0 here would
    //  be a division by zero -- confirm this cannot happen
    int32 perc=100*hashesoverlap/maxoverlap;


    // with 0, the percentage test below always passes for perc >= 0
    int32 minpercentrequired=0;

    // look a bit closer at potential perfect matches
    if(perc == 100){
      if(eoffsetmin != eoffsetmax){
	// this could not be: have at least two different expected offsets
	//  and a 100% coverage. Side effects from intra-read repeats
	// therefore, make sure this does not get through as a 100% match
	perc=99;
      }else if((numhashes-1)*SKIM3_hashsavestepping+SKIM3_basesperhash < maxoverlap){
	// maxoverlap covers the whole potential overlap, but
	//  there are not enough hashes supporting for 100% match
	//  (base mismatch somewhere)
	// reduce the percentage to show it's not a perfect match
	perc=99;
      }
    }else if(eoffsetmin == eoffsetmax){
      if(perc>100) {
	perc=100;
      }else{
	// if the number of hashes equals maxnumhashes as computed
	//  here, the hit is upgraded to 100%
	uint32 maxnumhashes=((maxoverlap-1-SKIM3_basesperhash)/SKIM3_hashsavestepping)+1;
	if(perc>=minpercentrequired && numhashes==maxnumhashes){
	  CEBUG("maxnumhashes 100% saver: "  << perc << '\n');
	  perc=100;
	}
      }
    }

    // minimum length (in bases) covered by hashes for a hit to be taken
    int32 minoverlaprequired=13;

#ifdef CEBUG_extra_cFPH
    {
      boost::mutex::scoped_lock lock(SKIM3_coutmutex);
      CEBUG("eomin: " << eoffsetmin << "\teomax: " << eoffsetmax 
	    << "\tmor: " << minoverlaprequired
	    << "\tho: " << hashesoverlap
	    << "\t%: " << perc
	    << "\t%<: " << minpercentrequired << endl);
    }
#endif

    // we take the hit if the overlap percentage is above threshold
    if(hashesoverlap >= minoverlaprequired
       && perc>=minpercentrequired){

//#define CEBUG(bla)   {if(actreadid==273252 && rid2==273250) cout << bla; cout.flush();}
//#define CEBUG(bla)   {cout << bla; cout.flush();}
      matchwithsorter_t tmp;
      tmp.otherid=rid2;
      tmp.eoffset=eoffsetmean;
      
      if(perc>100) perc=100;
      tmp.percent_in_overlap=perc;
      tmp.numhashes=numhashes;
      tmp.estimscore=0;
      tmp.taken=false;

      tmpmatchwith.push_back(tmp);

      CEBUG("Pushing possible hit with offset: " << tmp.eoffset << endl
	    << rid2
	    << "\t" << actreadid
	    << "\t" << SKIM3_readpool->getRead(rid2).getLenClippedSeq()
	    << "\t" << hp1min
	    << "\t" << hp1max
	    << "\t" << eoffsetmin
	    << "\t" << eoffsetmax
	    << "\t" << maxoverlap
	    << "\t" << hashesoverlap
	    << "\t" << numhashes
	    << "\t" << minoverlaprequired
	    << "\t" << perc << '%'
	    << '\n');
//#define CEBUG(bla)

    }
    // next group: either a new rid2 or, after a "break" above,
    //  the same rid2 with a different expected offset
    if(sI!=readhashmatches.end()) countid=sI->rid2;
  }

}
//#define CEBUG(bla)



void Skim::farcThreadsDataInit(const uint32 numthreads)
{
  FUNCSTART("void Skim::cfhThreadsDataInit(const uint32 numthreads)");

  SKIM3_farcd_vector.resize(numthreads);
  for(uint32 ti=0; ti<numthreads;++ti){
    
    SKIM3_farcd_vector[ti].readhashmatches.clear();
    SKIM3_farcd_vector[ti].readhashmatches.reserve(2000);
    SKIM3_farcd_vector[ti].singlereadvhraparray.clear();
    SKIM3_farcd_vector[ti].singlereadvhraparray.reserve(2000);
    SKIM3_farcd_vector[ti].tagmaskvector.clear();
    SKIM3_farcd_vector[ti].tagmaskvector.reserve(2000);
    SKIM3_farcd_vector[ti].tmpmatchwith.clear();
    SKIM3_farcd_vector[ti].tmpmatchwith.reserve(2000);

    if(!SKIM3_farc_addregexes_file.empty()){
      istringstream tmpis(SKIM3_farc_addregexes_file);
      string line;
      while(true){
        getline(tmpis,line);
        if(tmpis.eof()) break;
        boost::to_upper(line);
        SKIM3_farcd_vector[ti].addregexes.push_back(boost::regex(line));
      }
    }
  }
  FUNCEND();
}

// Body of a worker thread of the adaptor search.
//
// The thread waits on SKIM3_master2slavesignal until its entry in
// SKIM3_threadcontrol has either flag_datavalid or flag_endthread set.
//   - flag_datavalid: the reads [from, to) of SKIM3_farc_searchpool are
//     searched (restricted to SKIM3_farc_seqtype if that is >= 0);
//     results >= 0 are stored in SKIM3_farc_results. Then the flag is
//     reset and the master is notified via SKIM3_slave2mastersignal
//   - flag_endthread: the thread leaves its loop
//
// Parameter threadnr: index of this thread in SKIM3_threadcontrol and
// SKIM3_farcd_vector.
void Skim::farcThreadLoop(const uint32 threadnr)
{
  // the function name given here used to be "threadloop", which
  //  made the error handling below point to the wrong function
  FUNCSTART("void Skim::farcThreadLoop(const uint32 threadnr)");

  // threads need their own try() catch() block

  try {
    CEBUG("Thread: " << threadnr << " starting.\n");

//    farc_threaddata_t farcd;
//    farcd.readhashmatches.reserve(2000);
//    farcd.singlereadvhraparray.reserve(2000);
//    farcd.tagmaskvector.reserve(2000);
//    farcd.tmpmatchwith.reserve(2000);
//
//    if(!SKIM3_farc_addregexes_file.empty()){
//      istringstream tmpis(SKIM3_farc_addregexes_file);
//      string line;
//      while(true){
//        getline(tmpis,line);
//        if(tmpis.eof()) break;
//        boost::to_upper(line);
//        farcd.addregexes.push_back(boost::regex(line));
//      }
//    }
//    //farcd.addregexes=SKIM3_farc_addregexes_templates;


    BUGIFTHROW(threadnr>=SKIM3_farcd_vector.size(),"threadnr>=SKIM3_farcd_vector.size()???");
    farc_threaddata_t & farcd=SKIM3_farcd_vector[threadnr];

    farcd.readhashmatches.clear();
    farcd.singlereadvhraparray.clear();
    farcd.tagmaskvector.clear();
    farcd.tmpmatchwith.clear();


    // we'll jump out with a break;
    while(true){
      {
        // sleep until the master has work for us or wants us to stop
        boost::mutex::scoped_lock mylock(SKIM3_mutex);
        CEBUG("Thread " << threadnr << " waiting ...\n");
        while(!SKIM3_threadcontrol[threadnr].flag_datavalid
              && ! SKIM3_threadcontrol[threadnr].flag_endthread){
          SKIM3_master2slavesignal.wait(mylock);
        }
      }
      if(SKIM3_threadcontrol[threadnr].flag_datavalid){
        CEBUG("Thread " << threadnr << " working on " << SKIM3_threadcontrol[threadnr].from << " to " << SKIM3_threadcontrol[threadnr].to << "\n");

        for(uint32 readi=SKIM3_threadcontrol[threadnr].from; readi<SKIM3_threadcontrol[threadnr].to; ++readi){
          if(SKIM3_farc_seqtype < 0
             || SKIM3_farc_searchpool->getRead(readi).getSequencingType() == SKIM3_farc_seqtype){
            int32 clip=findAdaptorRightClip_internal(SKIM3_farc_searchpool->getRead(readi),SKIM3_farc_minhashes,farcd);
            if(clip>=0){
              boost::mutex::scoped_lock lock(SKIM3_resultfileoutmutex);
              (*SKIM3_farc_results)[readi]=clip;
            }
          }
        }

        // tell the master that this chunk is done
        boost::mutex::scoped_lock mylock(SKIM3_mutex);
        SKIM3_threadcontrol[threadnr].flag_datavalid=false;

        SKIM3_slave2mastersignal.notify_one();
      }else if(SKIM3_threadcontrol[threadnr].flag_endthread){
        CEBUG("Thread " << threadnr << "  exiting.\n");
        break;
      }
    }

  }
  // caught by reference: no copy of the exception object is made
  catch(Notify & n){
    n.handleError(THISFUNC);
  }

  FUNCEND();
}

//#define CEBUG(bla)

