#include<iostream>
#include "HomData.h"
#include "ClusterAssignment.h"
#include "DEBUG.h"
#include "SimValue.h"

using namespace std;

/* Per-gene accumulator: cluster ID -> statistics gathered for that cluster. */
typedef map< int, ClustAssignData > ClustAssignMap_t;
/* Iterator into the accumulator; vectors of these are sorted instead of the
   map entries themselves. */
typedef ClustAssignMap_t::iterator ClustAssignMapIter_t;
/* Common base for the comparison functors below.
   NOTE(review): std::binary_function is deprecated since C++11 and removed
   in C++17 -- confirm the language standard this file is built with. */
typedef binary_function<ClustAssignMapIter_t, ClustAssignMapIter_t, bool> CompFunc;
struct GreaterScore : public CompFunc {
	bool operator() (const ClustAssignMapIter_t& a,
			const ClustAssignMapIter_t& b) const {
		ClustAssignData d1 = (*a).second;
		ClustAssignData d2 = (*b).second;
		return ( d1.mean_score > d2.mean_score );
	}
};
struct LessPAM : public CompFunc {
	bool operator() (const ClustAssignMapIter_t& a,
			const ClustAssignMapIter_t& b) {
		ClustAssignData& d1 = (*a).second;
		ClustAssignData& d2 = (*b).second;
		return ( d1.mean_pam < d2.mean_pam );
	}
};

/*
 * Orders assignments by the start position (getFrom()) of their
 * best-scoring domain, ascending.
 *
 * operator() is const: the functor is stateless, so it must also be
 * callable through a const object.
 */
struct LessPos {
	bool operator() (const ClustAssignData& a,
			const ClustAssignData& b) const {
		return ( a.maxscore_dom.getFrom() < b.maxscore_dom.getFrom() );
	}
};
/*
 * Orders assignments by cluster ID, ascending.
 *
 * operator() is const: the functor is stateless, so it must also be
 * callable through a const object.
 */
struct LessClustID {
	bool operator() (const ClustAssignData& a,
			const ClustAssignData& b) const {
		return ( a.clustid < b.clustid );
	}
};

struct HomData_GreaterScore : public CompFunc {
	bool operator() (HomData*& a,
			HomData*& b) const {
		return ( a->score > b->score );
	}
};
struct HomData_LessPAM : public CompFunc {
	bool operator() (HomData*& a,
			HomData*& b) const {
		return ( a->pam < b->pam );
	}
};

//class ClustAssignList
/*
 * Offer a candidate assignment to the list.
 *
 * The candidate is rejected when
 *   - its best-scoring domain strictly overlaps a domain already stored, or
 *   - at least one stored entry is better (higher mean score, or lower mean
 *     PAM when the representation type is not 's') and the candidate fails
 *     ClusterAssignment::check_cutoff().
 * Otherwise a copy of *cl is appended.
 *
 * cl must not be null; it is dereferenced unconditionally.
 */
void ClustAssignList::addDomain(ClustAssignData *cl) {
	int better_stored = 0;

	vector<ClustAssignData>::iterator it = clustAssignList.begin();
	while (it != clustAssignList.end()) {
		/* an overlap with any stored domain discards the candidate at once */
		if ( cl->maxscore_dom.overlapCheckStrict( &(it->maxscore_dom) ) ) {
			if (DEBUG::debug_flag) {
				cout << "discard:" << *cl << endl;
			}
			return;
		}
		/* count stored entries that beat the candidate */
		const bool by_score = (SimValue::getReprType() == 's');
		const bool stored_is_better = by_score
			? (cl->mean_score < it->mean_score)
			: (cl->mean_pam > it->mean_pam);
		if (stored_is_better) {
			++better_stored;
		}
		++it;
	}

	/* the cutoff is only applied when a better entry already exists */
	if ( better_stored > 0 && ! ClusterAssignment::check_cutoff(cl) ) {
		return;
	}

	if (DEBUG::debug_flag) {
		cout << "OK:" << *cl << endl;
	}
	clustAssignList.push_back(*cl);
}
void ClustAssignList::assignDomNum() {
	int domn = 1;
	if (clustAssignList.size() == 1) {
		// no domain -- do nothing
		return;
	}
	sort(clustAssignList.begin(), clustAssignList.end(), LessPos());
	for (vector<ClustAssignData>::iterator p = clustAssignList.begin();
			p != clustAssignList.end(); p++) {
		(*p).maxscore_dom.setDomNum(domn++);
		if (DEBUG::debug_flag) {
			cout << "DomAssign: " << *p << endl;
		}
	}
}

// class ClustAssignData
bool ClustAssignData::check_subdata(int subtreeid) {
// cout << subtreeid << " " << subdata[subtreeid].sum_score << " " << sub_best_score << endl;
	if (subdata.size() <= subtreeid) {
		return false;
	}
	return (subdata[subtreeid].selected);
}

// class ClusterAssignment
/* Threshold compared against mean_score / mean_pam in check_cutoff().
   A negative value disables the PAM check; setParam() ignores negative
   arguments, so -1 stays in effect until a cutoff is supplied. */
double ClusterAssignment::cutoff=-1;
/* Score substituted for every cluster member without a hit when
   entireGroupAverage is on. */
double ClusterAssignment::missScore=55;
/* PAM distance substituted for every cluster member without a hit when
   entireGroupAverage is on. */
double ClusterAssignment::missDist=260;
/* Hits scoring below best*bestRatio (or with PAM above best/bestRatio) are
   skipped when averaging a cluster; 0 turns this filter off. */
double ClusterAssignment::bestRatio=0.8;
/* Upper bound on the number of hits accumulated per cluster; values <= 0
   remove the bound. */
int ClusterAssignment::checkHit=20;
/* Non-zero: average over the whole cluster size rather than over the
   counted hits (set by entireGroupAverageMode()). */
int ClusterAssignment::entireGroupAverage=0;

/* Ratio used in the same way as bestRatio, but for choosing sub-trees
   within a cluster. */
double ClusterAssignment::bestRatio_forSub=0.9;

/* Default constructor: no explicit initialisation; members are
   default-constructed. */
ClusterAssignment::ClusterAssignment() {
}
/*
 * Switch on whole-cluster averaging.
 * A non-zero _missScore / _missDist replaces the corresponding default
 * used for cluster members without a hit; zero keeps the current value.
 */
void ClusterAssignment::entireGroupAverageMode(double _missScore, double _missDist) {
	if (_missDist != 0) {
		missDist = _missDist;
	}
	if (_missScore != 0) {
		missScore = _missScore;
	}
	entireGroupAverage = 1;
}
/*
 * Override the tunable parameters.
 * Each argument is applied only when it is non-negative; a negative value
 * leaves the corresponding setting as it is.
 */
void ClusterAssignment::setParam(double _bestRatio, int _checkHit, double _cutoff) {
	if (_cutoff >= 0) {
		cutoff = _cutoff;
	}
	if (_checkHit >= 0) {
		checkHit = _checkHit;
	}
	if (_bestRatio >= 0) {
		bestRatio = _bestRatio;
	}
}
/* assign a clusterID to each newly added gene */
/*
 * Assign a cluster ID to each newly added gene.
 *
 * Walks the genes from homData->getNewSpDataIdx() up to
 * homData->getGeneDataSize(), collects the homologs of each one, runs
 * assignGeneToCluster() on them and merges the resulting assignments into
 * one ClusterAssignment, which is indexed before being returned.
 * Genes without any homolog are skipped.
 *
 * Returns a heap-allocated ClusterAssignment owned by the caller.
 */
ClusterAssignment *ClusterAssignment::assignGeneToClusterAll(
	    HomDataSet *homData, AllClustersInfo *clInfo)
{
	ClusterAssignment *clAssignAll = new ClusterAssignment();
	Domain tmpdom;
	HomDataList homList;

	for (int i = homData->getNewSpDataIdx(); i < homData->getGeneDataSize(); i++) {
		GeneData gdata = homData->getGeneData(i);
		string gname = gdata.getName();
		/* query domain spanning the whole gene */
		tmpdom.setData(gname, 1, gdata.getLength());

		/* get all homologs of the current genedata (stored into homList) */
		/* NOTE(review): homList is reused across genes -- presumably
		   getGeneHomologs() resets it; confirm against HomDataSet. */
		homData->getGeneHomologs(&tmpdom, &homList, 1);
		if (homList.size() == 0) {
			continue;
		}

		ClustAssignList *clAssignList =
			assignGeneToCluster(homData, &gdata, &homList, clInfo);
		clAssignAll->addHitList(clAssignList);
		/* addHitList() copies the entries, so the per-gene list must be
		   released here; it used to leak once per gene */
		delete clAssignList;
	}
	clAssignAll->createIndex();
	return clAssignAll;
}

/*
 * Decide which cluster(s) one gene belongs to.
 *
 * Steps:
 *  1. Sort the homology hits best-first (by score, or by PAM distance when
 *     the representation type is not 's').  homList is sorted in place;
 *     when it is NULL, homData->getAllHomologData() is used instead.
 *  2. Accumulate per-cluster sums of score/PAM, the hit count, per-sub-tree
 *     sums, and the domain (dom2) of the highest-scoring hit.  Hits whose
 *     dom1 has cluster ID 0 are ignored.  Unless entireGroupAverage is on,
 *     hits worse than the bestRatio window around the cluster's best value
 *     and hits beyond the first checkHit ones are skipped.
 *  3. Turn the sums into means per cluster and per sub-tree, and mark the
 *     sub-trees lying within bestRatio_forSub of the best sub-tree.
 *  4. Sort the clusters best-first and feed them to
 *     ClustAssignList::addDomain(), which keeps a non-overlapping set;
 *     finally number the kept domains.
 *
 * Returns a heap-allocated ClustAssignList owned by the caller.
 */
ClustAssignList *ClusterAssignment::assignGeneToCluster(
	HomDataSet *homData, GeneData *gdata, HomDataList *homList, AllClustersInfo *clInfo)
{
	const string gname = gdata->getName();
	const bool byScore = (SimValue::getReprType() == 's');
	ClustAssignMap_t clustAssignMap;

	if (homList == NULL) {
		homList = homData->getAllHomologData();
	}

	/* best hits first, so the first hit seen for a cluster is its best one */
	if (byScore) {
		homList->sort(HomData_GreaterScore());
	} else {
		homList->sort(HomData_LessPAM());
	}

	for (HomDataList::iterator hp = homList->begin(); hp != homList->end(); ++hp) {
		const int clustid = (*hp)->dom1->getClustID();
		if (clustid == 0) {
			continue;
		}

		/* creates the accumulator on the first hit of this cluster */
		ClustAssignData& clstData = clustAssignMap[ clustid ];

		const int treeNum = clInfo->getClusterInfoByID(clustid)->getTreeNum();

		if (! entireGroupAverage) {
			// use only the highest scores to calculate average group score
			if (bestRatio) {
				if (clstData.best_val < 0) {
					if (homData->checkAddedSpec((*hp)->dom1->getSpec())  &&
						homData->checkAddedSpec((*hp)->dom2->getSpec()) ) {
						/* homology between added species:
							should be ignored for orthology assignment */
						/* NOTE(review): such a hit is not used as the
						   reference value, but it is still added to the
						   sums below -- confirm this is intended. */
					} else {
						/* homList is sorted by score or pam -- the first hit is the best */
						clstData.best_val = byScore ? (*hp)->score : (*hp)->pam;
					}
				} else {
					/* ignore scores (pams) worse than the given ratio */
					if (byScore) {
						if ((*hp)->score < clstData.best_val * bestRatio) continue;
					} else {
						if ((*hp)->pam > clstData.best_val / bestRatio) continue;
					}
				}
			}
			if (checkHit > 0 && clstData.count >= checkHit) {
				continue;
			}
		}

		clstData.sum_score += (*hp)->score;
		clstData.sum_pam += (*hp)->pam;
		clstData.count ++;
		clstData.clustid = clustid;
		clstData.name = gname;
		Domain *dom1 = (*hp)->dom1;

		if (dom1->subids_size()) {
			clstData.subdata.resize(treeNum);
			for ( list<int>::iterator ii = dom1->beginSubClustID();  ii != dom1->endSubClustID(); ++ii ) {
				const int subid = *ii;
				/* never write outside subdata: an ID that is negative or
				   not below the cluster's tree count is skipped */
				if (subid < 0 ||
				    static_cast<vector<ClustAssignData_sub>::size_type>(subid) >= clstData.subdata.size()) {
					continue;
				}
				ClustAssignData_sub& sub = clstData.subdata[ subid ];
				sub.sum_score += (*hp)->score;
				sub.sum_pam += (*hp)->pam;
				sub.count ++;
			}
		}

		/* remember the domain of the highest-scoring hit (score is used
		   here even in PAM mode) */
		if (clstData.max_score < (*hp)->score) {
			clstData.max_score = (*hp)->score;
			clstData.maxscore_dom.copyData((*hp)->dom2);
		}
	}

	/* Vector of iterators for ClustAssignMap for sort */
	vector<ClustAssignMapIter_t> clustAssignList;

	for (ClustAssignMapIter_t p = clustAssignMap.begin(); p != clustAssignMap.end(); ++p) {
		const int clustid = p->first;
		const int clsize = clInfo->getClusterInfoByID(clustid)->getSize();
		ClustAssignData& cl = p->second;

		if (entireGroupAverage) {
			/* members without a hit contribute missScore / missDist */
			cl.mean_score = (cl.sum_score + missScore * (clsize-cl.count))
						/ clsize;
			cl.mean_pam = (cl.sum_pam + missDist * (clsize-cl.count))
					/ clsize;
		} else {
			cl.mean_score = cl.sum_score / cl.count;
			cl.mean_pam = cl.sum_pam / cl.count;
		}

		/* per-sub-tree means (stored back into sum_score / sum_pam) and
		   the best value over all sub-trees */
		cl.sub_best_score = -99999.0;
		cl.sub_best_pam = 99999.0;
		for ( vector<ClustAssignData_sub>::iterator dd = cl.subdata.begin();  dd != cl.subdata.end(); ++dd ) {
			if (dd->count == 0) {
				/* sub-tree without hits: sentinel values */
				dd->sum_score = -99999.0;
				dd->sum_pam = 99999.0;
			} else {
				dd->sum_score /= dd->count;
				dd->sum_pam /= dd->count;
			}
			if ( dd->sum_score > cl.sub_best_score )  cl.sub_best_score = dd->sum_score;
			if ( dd->sum_pam < cl.sub_best_pam )  cl.sub_best_pam = dd->sum_pam;
		}

		/* select every sub-tree within bestRatio_forSub of the best one */
		for ( vector<ClustAssignData_sub>::iterator dd = cl.subdata.begin();  dd != cl.subdata.end(); ++dd ) {
			if (byScore) {
				dd->selected = ( dd->sum_score >= cl.sub_best_score * bestRatio_forSub );
			} else {
				dd->selected = ( dd->sum_pam <= cl.sub_best_pam / bestRatio_forSub );
			}
		}

		cl.maxscore_dom.setClustID(clustid);
		clustAssignList.push_back(p);
	}

	/* sort by score */
	if (byScore) {
		sort(clustAssignList.begin(), clustAssignList.end(), GreaterScore());
	} else {
		sort(clustAssignList.begin(), clustAssignList.end(), LessPAM());
	}

	/* select a non-overlapping set, best clusters first; addDomain()
	   stores copies, so the local map may go out of scope afterwards */
	ClustAssignList *clAssignList = new ClustAssignList();
	for (vector<ClustAssignMapIter_t>::iterator
			p = clustAssignList.begin();
			p != clustAssignList.end(); ++p) {
		clAssignList->addDomain( &((*p)->second) );
	}
	clAssignList->assignDomNum();
	return clAssignList;
}

/*
 * Debug dump of one assignment on a single line, space separated:
 * best-scoring domain, best_val, mean_score, mean_pam, count, clustid.
 */
ostream& operator<<(ostream& ost, const ClustAssignData& cd) {
	ost << cd.maxscore_dom << " " << cd.best_val;
	ost << " " << cd.mean_score << " " << cd.mean_pam;
	ost << " " << cd.count << " " << cd.clustid;
	return ost;
}


/*
 * Append copies of all entries of *cl to this object's assignment vector.
 * cl must not be null.
 */
void ClusterAssignment::addHitList(ClustAssignList *cl) {
	vector<ClustAssignData>::iterator first = cl->begin();
	vector<ClustAssignData>::iterator last = cl->end();
	clustAssign.insert(clustAssign.end(), first, last);
}
void ClusterAssignment::printAssignment() {
	for (vector<ClustAssignData>::iterator p = clustAssign.begin(); p != clustAssign.end(); p++) {
		Domain& dom = (*p).maxscore_dom;
		cout << dom.getName() << " "  << dom.getFrom() << " " << dom.getTo()
			<< " " << dom.getDomNum() << " "<< (*p).clustid << " ";
		int i = 0, flag  = 0;
		for (vector<ClustAssignData_sub>::iterator subp = (*p).subdata.begin(); subp != (*p).subdata.end(); subp++) {
			if ((*subp).selected) {
				if (flag > 0) cout << ",";
				cout << i;
				flag++;
			}
			i++;
		}
		cout << endl;
	}
}
/*
 * Build a DomInfo from the best-scoring domain of every assignment and
 * index it.  The DomInfo receives pointers to domains stored inside this
 * object.  Returns a heap-allocated DomInfo owned by the caller.
 */
DomInfo *ClusterAssignment::createDomInfo() {
	DomInfo *result = new DomInfo();

	vector<ClustAssignData>::iterator cur = clustAssign.begin();
	while (cur != clustAssign.end()) {
		result->addDomain( &(cur->maxscore_dom) );
		++cur;
	}

	/* re-index domains appearing in both the original clusters and the
	   newly added genomes */
	result->makeIndex();
	return result;
}
void ClusterAssignment::createIndex() {
	int currClustID = -1;
	int idx = 0;
	sort(clustAssign.begin(), clustAssign.end(), LessClustID());
	for (vector<ClustAssignData>::iterator p = clustAssign.begin(); p != clustAssign.end(); p++) {
		if ((*p).clustid != currClustID) {
			currClustID = (*p).clustid;
			clustidIdx.insert(map<int,int>::value_type(currClustID, idx));
		}
		idx++;
	}
}
/*
 * Append one assignment read from an assignment file.
 *
 * dom      - domain to copy into the new entry (must not be null)
 * clustid  - cluster the domain is assigned to
 * treenums - comma-separated list of selected sub-tree indices (0-based),
 *            e.g. "0,2"; may be empty
 *
 * subdata is grown so that it reaches the largest listed index; listed
 * positions are marked selected and all others unselected.  The indices
 * may come in any order and may repeat.  Empty tokens and negative numbers
 * are ignored.
 */
void ClusterAssignment::addClustAssign(Domain *dom, int clustid, string treenums="") {
	ClustAssignData cl;
	cl.maxscore_dom = *dom;
	cl.clustid = clustid;

	/* template for the slots that are not listed */
	ClustAssignData_sub unselected;
	unselected.selected = false;

	istringstream its(treenums);
	string token;
	while (getline(its, token, ',')) {
		if (token.empty()) {
			continue;
		}
		const int tnum = atoi(token.c_str());
		if (tnum < 0) {
			continue;
		}
		const vector<ClustAssignData_sub>::size_type pos =
			static_cast<vector<ClustAssignData_sub>::size_type>(tnum);
		if (cl.subdata.size() <= pos) {
			cl.subdata.resize(pos + 1, unselected);
		}
		cl.subdata[pos].selected = true;
	}
	clustAssign.push_back(cl);
}
/*
list<ClustAssignData*> *ClusterAssignment::findDomAssign(Domain *dom) {
	
}
*/
list<ClustAssignData*> *ClusterAssignment::findAssignment(int clustid) {
	list<ClustAssignData*> *retList = new list<ClustAssignData*>;
	ClustAssignData *cldata;
	int idx = clustidIdx[ clustid ];
	cldata = &clustAssign[ idx ];
//cout << "CCC " << clustid<< " " << idx << " " << cldata << endl;
	if (cldata == NULL) {
		cerr << "Clust data not found: id=" << clustid << endl;
		return NULL;
	}
	while (cldata->clustid == clustid) {
		retList->push_back(cldata);
		cldata = &clustAssign[ ++idx ];
	}
	return retList;
}

/*
 * Read an assignment file written in the format of
 * ClusterAssignment::printAssignment():
 *   name from to domnum clustid [treenums]
 * where treenums is an optional comma-separated list of sub-tree indices.
 *
 * Returns a heap-allocated, indexed ClusterAssignment owned by the caller,
 * or NULL when the file cannot be opened.
 *
 * Blank lines are skipped.  A line whose five mandatory fields cannot be
 * parsed is reported on stderr and skipped.  All fields are parsed into
 * per-line variables, so a line without treenums never inherits the list
 * of the line before it.
 */
ClusterAssignment *ReadAssignment::readAssign(const char *filename)
{
	ifstream ifs(filename);
	if (ifs.fail()) {
		cerr << "assignfile open failed\n";
		return NULL;
	}

	/* allocated only after the file is known to be readable, so nothing
	   is leaked on the failure path */
	ClusterAssignment *clstAssign = new ClusterAssignment();
	Domain dom;
	string line;

	while (getline(ifs, line)) {
		if (line.find_first_not_of(" \t\r\n") == string::npos) {
			continue;
		}
		istringstream is(line);
		string name;
		string treenums;
		int from, to, domnum, clustid;
		if (! (is >> name >> from >> to >> domnum >> clustid)) {
			cerr << "assignfile: skipping malformed line: " << line << endl;
			continue;
		}
		/* optional field: stays empty when the line ends here */
		is >> treenums;

		dom.setData(name, from, to, domnum);
		clstAssign->addClustAssign(&dom, clustid, treenums);
	}
	clstAssign->createIndex();

	return clstAssign;
}
/*
 * Test an assignment against the configured cutoff.
 *
 * Score representation ('s'): passes unless mean_score is below cutoff.
 * Otherwise (distance):       passes unless cutoff is non-negative and
 *                             mean_pam exceeds it.
 *
 * Returns true when the assignment passes.
 */
bool ClusterAssignment::check_cutoff(ClustAssignData *cldata)
{
	const bool byScore = (SimValue::getReprType() == 's');
	if (byScore) {
		return ! (cldata->mean_score < cutoff);
	}
	return ! (cutoff >= 0 && cldata->mean_pam > cutoff);
}
