/*****************************************************************
* Unipro UGENE - Integrated Bioinformatics Suite
* Copyright (C) 2008,2009 Unipro, Russia (http://ugene.unipro.ru)
* All Rights Reserved
* 
*     This source code is distributed under the terms of the
*     GNU General Public License. See the files COPYING and LICENSE
*     for details.
*****************************************************************/

#include "SAMFormat.h"
#include "DocumentFormatUtils.h"

#include <core_api/Task.h>
#include <core_api/IOAdapter.h>
#include <core_api/DNAAlphabet.h>
#include <core_api/L10n.h>
#include <datatype/DNAQuality.h>

#include <gobjects/DNASequenceObject.h>
#include <gobjects/AnnotationTableObject.h>
#include <gobjects/GObjectTypes.h>
#include <gobjects/MAlignmentObject.h>
#include <util_text/TextUtils.h>

#include <QtCore/QRegExp>

namespace GB2 {

// Format version written to the @HD line by storeDocument().
const QByteArray SAMFormat::VERSION = "1.0";
// Every header (section) line of a SAM file starts with this prefix.
const QByteArray SAMFormat::SAM_SECTION_START = "@";
// Header section names. Only @HD and @SQ are interpreted by the code in this
// file; lines of the other sections are skipped when loading.
const QByteArray SAMFormat::SECTION_HEADER = "@HD"; 
const QByteArray SAMFormat::SECTION_SEQUENCE = "@SQ"; 
const QByteArray SAMFormat::SECTION_READ_GROUP = "@RG";
const QByteArray SAMFormat::SECTION_PROGRAM = "@PG";
const QByteArray SAMFormat::SECTION_COMMENT = "@CO";

// Tags of the @HD line (VN is the format version checked on load).
const QByteArray SAMFormat::TAG_VERSION = "VN";
const QByteArray SAMFormat::TAG_SORT_ORDER = "SO";
const QByteArray SAMFormat::TAG_GROUP_ORDER = "GO";
 
// Tags of the @SQ line. SN names the alignment on load; SN and LN are written
// on store. The remaining tags are not referenced by the code shown here.
const QByteArray SAMFormat::TAG_SEQUENCE_NAME = "SN";
const QByteArray SAMFormat::TAG_SEQUENCE_LENGTH = "LN";
const QByteArray SAMFormat::TAG_GENOME_ASSEMBLY_ID = "AS";
const QByteArray SAMFormat::TAG_SEQUENCE_MD5_SUM = "M5";
const QByteArray SAMFormat::TAG_SEQUENCE_URI = "UR";
const QByteArray SAMFormat::TAG_SEQUENCE_SPECIES = "SP";

/**
 * Checks that the value of an alignment field matches the field's pattern
 * over its whole length.
 *
 * @param field field whose val is tested against its pattern
 * @param ti    receives an error naming the field, its value and the
 *              expected pattern when the value does not match
 * @return true if the value matches; false otherwise (error is set on ti)
 */
static bool validateField(SAMFormat::Field &field, TaskStateInfo &ti) {
	if(field.pattern.exactMatch(field.val)) {
		return true;
	}
	// The multi-argument arg() replaces all place markers in a single pass.
	// Chained arg() calls would re-scan text inserted by earlier calls, so a
	// field value that happens to contain "%3" would corrupt the message.
	ti.setError(SAMFormat::tr("Field \"%1\" not matched pattern \"%2\", expected pattern \"%3\"")
		.arg(QString(field.name), QString(field.val), field.pattern.pattern()));
	return false;
}

/**
 * Constructs the SAM format handler: registers the "sam" file extension,
 * declares multiple alignments as the supported object type and sets the
 * translated format name.
 */
SAMFormat::SAMFormat( QObject* p )
	: DocumentFormat(p, DocumentFormatFlags_SW, QStringList() << "sam")
{
	supportedObjectTypes += GObjectTypes::MULTIPLE_ALIGNMENT;
	formatName = tr("SAM");
}

bool SAMFormat::checkRawData( const QByteArray& rawData ) const
{
	QRegExp rx("@HD\\s+(VN:\\d\\.\\d|SO:(unsorted|queryname|coordinate)|GO:(group|query|reference))");
	return rx.indexIn(rawData)==0;
}

Document* SAMFormat::loadDocument( IOAdapter* io, TaskStateInfo& ti, const QVariantMap& _fs, DocumentLoadMode mode /*= DocumentLoadMode_Whole*/ )
{
	Q_UNUSED(mode);
    if( NULL == io || !io->isOpen() ) {
		ti.setError(L10N::badArgument("IO adapter"));
		return NULL;
	}
	QList<GObject*> objects;
	QVariantMap fs = _fs;

	QString lockReason;

	static const int READ_BUFF_SIZE = 1024;
	static const char SPACE = ' ';

	QMap<QString, MAlignment> maMap; //file may contain multiple MA objects
	MAlignment defaultMA("Alignment " + io->getURL().baseFileName());

	QByteArray readBuffer(READ_BUFF_SIZE, '\0');
	char* buff  = readBuffer.data();
	bool lineOk = false;

	Field fields[] = { //alignment section fields excluding optional tags
		Field("QNAME", "", QRegExp("[^ \\t\\n\\r]+")),
		Field("FLAG", "", QRegExp("[0-9]+")),
		Field("RNAME", "", QRegExp("[^ \\t\\n\\r@=]+")),
		Field("POS", "", QRegExp("[0-9]+")),
		Field("MAPQ", "", QRegExp("[0-9]+")),
		Field("CIGAR", "", QRegExp("([0-9]+[MIDNSHP])+|\\*")),
		Field("MRNM", "", QRegExp("[^ \\t\\n\\r@]+")),
		Field("MPOS", "", QRegExp("[0-9]+")),
		Field("ISIZE", "", QRegExp("-?[0-9]+")),
		Field("SEQ", "", QRegExp("[acgtnACGTN.=]+|\\*")),
		Field("QUAL", "", QRegExp("[!-~]+|\\*"))
	};

	int len = 0;
	while(!ti.cancelFlag && (len = io->readLine(buff, READ_BUFF_SIZE, &lineOk)) > 0)
	{
		QByteArray line = QByteArray::fromRawData( buff, len ).simplified();

		if(line.startsWith(SAM_SECTION_START)) { //Parse sections

			QList<QByteArray> tags;
			
			if(getSectionTags(line, SECTION_SEQUENCE, tags)) { //Parse sequence section
				foreach(QByteArray tag, tags) {
					if(tag.startsWith(TAG_SEQUENCE_NAME)) { // Set alignment name
						QString maName = QByteArray::fromRawData(tag.constData() + 3, tag.length() - 3);
						MAlignment ma;
						ma.setName(maName);
						maMap[maName] = ma;
					} 
				}
			} else if(getSectionTags(line, SECTION_HEADER, tags)) { //Parse header section
				foreach(QByteArray tag, tags) {
					if(tag.startsWith(TAG_VERSION)) { //Check file format version
						QString version = QByteArray::fromRawData(tag.constData() + 3, tag.length() - 3);	
						if(version != "1.0") {
							ti.setError(SAMFormat::tr("Unsupported file version \"%1\"").arg(version));
							return NULL;
						}
					}
				}
			} 
			// Skip other sections

			continue;
		}

		QList<QByteArray> fieldValues = line.split(SPACE);

		int readFieldsCount = fieldValues.count();

		//if(readFieldsCount < 11) readFieldsCount--;
		QBitArray terminators = TextUtils::WHITES | TextUtils::LINE_BREAKS;
		char lastTerminator = lineOk ? '\n' : 0;

		while(readFieldsCount < 11 && (len = io->readUntil(buff, READ_BUFF_SIZE, terminators, IOAdapter::Term_Include, &lineOk)) > 0) {
			QByteArray addline = QByteArray::fromRawData( buff, len - 1 ).simplified();
			fieldValues[readFieldsCount - 1].append(addline);
			lastTerminator = buff[len-1];
			if(lineOk) 
				break;
		}
		{
			bool merge = false;
			while(!TextUtils::LINE_BREAKS.at(lastTerminator) && readFieldsCount < 11 && (len = io->readUntil(buff, READ_BUFF_SIZE, terminators, IOAdapter::Term_Include, &lineOk)) > 0) {
				QByteArray addline = QByteArray::fromRawData( buff, len - 1 ).simplified();
				if(merge) {
					fieldValues[readFieldsCount - 1].append(addline);	
				} else {
					fieldValues.append(addline);
					readFieldsCount++;
				}
				lastTerminator = buff[len-1];
				merge = !lineOk;
			}

			// skiping optional tags
			if(!TextUtils::LINE_BREAKS.at(lastTerminator))
				while((len = io->readLine(buff, READ_BUFF_SIZE, &lineOk)) > 0 && !lineOk);
		}

		if(readFieldsCount < 11) {
			ti.setError(SAMFormat::tr("Unexpected end of file"));
			return NULL;
		} 

		for(int i=0; i < qMin(11, readFieldsCount); i++) {
			fields[i].val = fieldValues[i];
 			if(!validateField(fields[i], ti)) {
 				return NULL;
			}
		}

		QString rname = fields[2].val;

		if(rname != "*" && !maMap.contains(rname)) {
			ti.setError(SAMFormat::tr("Unexpected reference sequence name value \"%1\"").arg(rname));
			return NULL;
		}

		MAlignmentRow row;

		short flag = fields[1].val.toShort();
		bool isReversed = flag & 0x0010;

		row.setName(fields[0].val);
		if(fields[9].val == "*") {
			row.setSequence("", 0);
		} else {
			if(isReversed) {
				QByteArray &seq1 = fields[9].val;
				int len = seq1.length();
				QByteArray seq2(len, '\0');
				for(int i=0;i<len;i++) seq2[i] = seq1[len-i-1];
				row.setSequence(seq2, fields[3].val.toInt()-1);
			} else 
				row.setSequence(fields[9].val, fields[3].val.toInt()-1);
		}

		if(fields[10].val != "*") {
			if(isReversed) {
				QByteArray &seq1 = fields[10].val;
				int len = seq1.length();
				QByteArray seq2(len, '\0');
				for(int i=0;i<len;i++) seq2[i] = seq1[len-i-1];
				row.setQuality(DNAQuality(seq2));
			}
			else 
				row.setQuality(DNAQuality(fields[10].val));
		}

		if(rname == "*") {
			defaultMA.addRow(row);
		} else {
			maMap[rname].addRow(row);
		}

		ti.progress = io->getProgress();
	}

	foreach(MAlignment ma, maMap.values()) {
		DocumentFormatUtils::assignAlphabet(ma);
		if (ma.getAlphabet() == NULL) {
			ti.setError( SAMFormat::tr("alphabet_unknown"));
			return NULL;
		}

		objects.append(new MAlignmentObject(ma));
	}

	if(defaultMA.getRows().count() != 0) {
		DocumentFormatUtils::assignAlphabet(defaultMA);
		if (defaultMA.getAlphabet() == NULL) {
			ti.setError( SAMFormat::tr("alphabet_unknown"));
			return NULL;
		}

		objects.append(new MAlignmentObject(defaultMA));
	}

	if (ti.hasErrors() || ti.cancelFlag) {
		qDeleteAll(objects);
		return NULL;
	}

	DocumentFormatUtils::updateFormatSettings(objects, fs);
	Document* doc = new Document(this, io->getFactory(), io->getURL(), objects, fs, lockReason);
	return doc;
}


void SAMFormat::storeDocument( Document* d, TaskStateInfo& ts, IOAdapter* io )
{
	//TODO: sorting options?
	if( NULL == d ) {
		ts.setError(L10N::badArgument("doc"));
		return;
	}
	if( NULL == io || !io->isOpen() ) {
		ts.setError(L10N::badArgument("IO adapter"));
		return;
	}

	QList<const MAlignmentObject*> maList;
	foreach(GObject *obj, d->findGObjectByType(GObjectTypes::MULTIPLE_ALIGNMENT)) {
		const MAlignmentObject* maObj = qobject_cast<const MAlignmentObject*>(obj);
		assert(maObj != NULL);
		maList.append(maObj);
	}

	QByteArray tab = "\t";
	QByteArray block;

	//Writing header
	block.append(SECTION_HEADER).append("\t").append("VN:").append(VERSION).append("\n");
	if (io->writeBlock( block ) != block.length()) {
		throw 0;
	}

	//Writing sequence section
	foreach(const MAlignmentObject* maObj, maList) {
		const MAlignment &ma = maObj->getMAlignment();
		block.clear();
		block.append(SECTION_SEQUENCE).append(tab).append(TAG_SEQUENCE_NAME).append(":").append(ma.getName().replace(QRegExp("\\s|\\t"), "_"))
			.append(tab).append(TAG_SEQUENCE_LENGTH).append(":").append(QByteArray::number(ma.getLength())).append("\n");
		if (io->writeBlock( block ) != block.length()) {
			throw 0;
		}
	}

	//Writing alignment section
	foreach(const MAlignmentObject* maObj, maList) {
		const MAlignment &ma = maObj->getMAlignment();
		QByteArray rname(ma.getName().replace(QRegExp("\\s|\\t"), "_").toAscii());
		foreach(MAlignmentRow row, ma.getRows()) {
			block.clear();
			QByteArray qname = QString(row.getName()).replace(QRegExp("\\s|\\t"), "_").toAscii();
			QByteArray flag("0"); // can contains strand, mapped/unmapped, etc.
			QByteArray pos = QByteArray::number(row.getCoreStart()+1);
			QByteArray mapq("255"); //255 indicating the mapping quality is not available
			QByteArray cigar("*");
			QByteArray mrnm("*");
			QByteArray mpos("0");
			QByteArray isize("0");
			QByteArray seq(row.getCore());
			QByteArray qual(row.getCoreQuality().qualCodes);
			if(qual.isEmpty()) qual.fill('I', row.getCoreLength()); //I - 50 Phred quality score (99.999%)
			
			block = qname + tab + flag + tab+ rname + tab + pos + tab + mapq + tab + cigar + tab + mrnm
				+ tab + mpos + tab + isize + tab + seq + tab + qual + "\n";
			if (io->writeBlock( block ) != block.length()) {
				throw 0;
			}
		}
	}
}

/**
 * Splits a header line of the given section into its tags.
 *
 * @param line        header line; tags are expected to be separated by
 *                    single spaces
 * @param sectionName section prefix the line must start with (e.g. "@SQ")
 * @param tags        receives the space-separated parts that follow the
 *                    section name; left untouched when the line belongs to
 *                    another section. Empty parts are kept, so the first
 *                    element is empty when a space follows the section name.
 * @return true if the line starts with sectionName, false otherwise
 */
bool SAMFormat::getSectionTags( QByteArray &line, const QByteArray &sectionName, QList<QByteArray> &tags )
{
	if(!line.startsWith(sectionName)) {
		return false;
	}
	// Skip exactly the matched prefix rather than a fixed 3 bytes, so section
	// names of any length are handled and the offset never passes the end.
	tags = line.mid(sectionName.length()).split(' ');
	return true;
}
}// namespace