#include<stdio.h>
#include<string.h>
#include<stdlib.h>
#include<math.h>

#define MAXLINE 5000

/*
 * A table of presence/absence patterns: one named row per entry,
 * `colnum` integer values per row.
 */
typedef struct {
	char **name;	/* name[i]: label of row i (first token of its input line) */
	int **pat;	/* pat[i][k]: value of column k in row i; nonzero counts as "present" */
	int rownum;	/* number of rows */
	int colnum;	/* number of columns per row */
} PatternSet;

/* Input file path given on the command line; NULL means read stdin. */
char *filename;

/*
 * Prototypes for every function defined in this file, so that calls are
 * type-checked instead of relying on implicit declarations.
 */
int getargs(int argc, char **argv);
PatternSet *readpat(char *filename);
int print_pat_all(PatternSet *patset);
int print_pat(int *pat, int colnum);
int print_names(PatternSet *patset);
int cmpr_phylopat_all(PatternSet *patset);
int count_match(PatternSet *patset, int i, int j,
			int *cnt1, int *cnt2, int *match);
int calc_hamming(int totnum, int cnt1, int cnt2, int match);
int calc_lnFact(int size);
double calc_mutinfo(int totnum, int cnt1, int cnt2, int match);
double sub_mutinfo(double p12, double p1, double p2);
double calc_prob_hypgeom(int N, int n, int m, int k);
double calc_corr(int totnum, int cnt1, int cnt2, int match);
double hypergeometric(int N, int n, int m, int k);
double comb(int n, int m);
/* Declared here but not defined in this file; signature left unspecified. */
double hypergeometric_cum();

/*
 * Filtering thresholds (0 means "not set"):
 *   cutoff       pairs whose distance exceeds it are skipped
 *   prob_cutoff  pairs whose hypergeometric probability exceeds it are skipped
 *   distpass     when set, the probability test is applied only to pairs
 *                whose Hamming distance is greater than this value
 *   cutoff_r, distpass_r
 *                the same two thresholds given as a fraction of the column
 *                count; main() converts them to absolute values
 */
double cutoff, prob_cutoff, distpass, cutoff_r, distpass_r;
/* Set by the -I and -r options; not read anywhere else in this file. */
double mutinfo_cutoff, corr_cutoff;
/* Which measures are computed for each pair of rows. */
int flag_dist, flag_mutinfo, flag_corr, flag_prob;
/* lnFact[i] = ln(i!), filled by calc_lnFact(). */
double *lnFact;
/* Multiplier applied to the reported distance (-s). */
double scale = 1;
/* Output/distance type selected by -d: 'D', 'N', 'I', 'R', 'P' or 'A'. */
char disttype = 'D';
/* Nonzero: print the list of row names before the pair list. */
int nameout = 1;
/* -a: use |r| for the correlation distance and never complement the
 * hypergeometric tail probability. */
int abs_flag;
/* -P: echo both patterns of every pair before its result line. */
int print_pat_flag;
/* Progress log stream (-v sets it to stderr); NULL disables logging. */
FILE *fh_log = NULL;

/*
 * Program entry point: parse the options, load the pattern table, then
 * print the name list (unless disabled) followed by all pairwise results.
 */
int main(int argc, char **argv)
{
	PatternSet *patset;

	getargs(argc, argv);
	patset = readpat(filename);

	/* The log-factorial table is only needed for the hypergeometric test. */
	if (prob_cutoff || flag_prob) {
		calc_lnFact(patset->colnum);
	}
	/* Thresholds given as ratios are relative to the number of columns. */
	if (cutoff_r) {
		cutoff = patset->colnum * cutoff_r;
	}
	if (distpass_r) {
		distpass = patset->colnum * distpass_r;
	}
	if (nameout) {
		print_names(patset);
	}
	cmpr_phylopat_all(patset);
	return 0;
}
/*
 * Read a pattern table from `filename`, or from stdin when filename is NULL.
 *
 * Input format:
 *   - lines starting with '#' are headers/comments; "#colnum=N" and
 *     "#rownum=N" give the table dimensions and must appear before the
 *     first data line, other '#' lines are ignored;
 *   - every other non-blank line is a row: a name followed by up to
 *     colnum integers separated by white space (missing values stay 0).
 *
 * Returns a newly allocated PatternSet whose rownum is the number of rows
 * actually read (never more than the declared rownum).  Exits with status 1
 * on open/allocation failure, on missing dimensions, or when an input line
 * does not fit into the MAXLINE-byte line buffer.
 */
PatternSet *readpat(char *filename)
{
	static char buf[MAXLINE];
	PatternSet *patset;
	FILE *fp = stdin;
	int rownum = 0, colnum = 0;	/* header values; 0 = not specified */
	int allocated = 0;		/* table storage has been set up */
	int nread = 0;			/* rows stored so far */
	int i, j;

	if (filename) {
		if ((fp = fopen(filename, "r")) == NULL) {
			fprintf(stderr, "Can't open file\n");
			exit(1);
		}
	}
	if ((patset = malloc(sizeof *patset)) == NULL) {
		fprintf(stderr, "Can't alloc patset\n");
		exit(1);
	}
	/* Start from a valid empty table so input without data rows is safe. */
	patset->name = NULL;
	patset->pat = NULL;
	patset->rownum = 0;
	patset->colnum = 0;

	while (fgets(buf, MAXLINE, fp)) {
		char *name, *cur, *endp;
		size_t namelen;

		/*
		 * No newline in the buffer means either the last line of the
		 * input or a line longer than the buffer.  The latter would be
		 * split into bogus rows, so refuse it.
		 */
		if (strchr(buf, '\n') == NULL) {
			int c = getc(fp);
			if (c != EOF) {
				fprintf(stderr,
					"input line too long (limit %d bytes)\n",
					MAXLINE - 1);
				exit(1);
			}
		}
		if (buf[0] == '#') {
			if (strncmp(&buf[1],"colnum=",7)==0) {
				sscanf(&buf[8], "%d", &colnum);
			} else if (strncmp(&buf[1],"rownum=",7)==0) {
				sscanf(&buf[8], "%d", &rownum);
			}
			continue;
		}

		/* The row name is the first blank-delimited token. */
		name = buf + strspn(buf, " \t");
		namelen = strcspn(name, " \t\r\n");
		if (namelen == 0) {
			continue;	/* blank line */
		}

		if (! allocated) {
			if (colnum <= 0 || rownum <= 0) {
				fprintf(stderr, "rownum and colnum should be specified\n");
				exit(1);
			}
			/* calloc: unread rows keep NULL names and all-zero patterns */
			patset->pat = calloc(rownum, sizeof *patset->pat);
			patset->name = calloc(rownum, sizeof *patset->name);
			if (patset->pat == NULL || patset->name == NULL) {
				fprintf(stderr, "Can't alloc pattern\n");
				exit(1);
			}
			for (i = 0; i < rownum; i++) {
				patset->pat[i] = calloc(colnum, sizeof **patset->pat);
				if (patset->pat[i] == NULL) {
					fprintf(stderr, "Can't alloc pattern\n");
					exit(1);
				}
			}
			patset->rownum = rownum;
			patset->colnum = colnum;
			allocated = 1;
		}
		if (nread >= patset->rownum) {
			fprintf(stderr, "excess number of data are truncated\n");
			break;
		}

		if ((patset->name[nread] = malloc(namelen + 1)) == NULL) {
			fprintf(stderr, "Can't alloc pattern\n");
			exit(1);
		}
		memcpy(patset->name[nread], name, namelen);
		patset->name[nread][namelen] = '\0';

		/*
		 * Parse the values.  The bound is the allocated width, not the
		 * header variable, which a later "#colnum=" line could change.
		 */
		cur = name + namelen;
		for (j = 0; j < patset->colnum; j++) {
			long val = strtol(cur, &endp, 10);
			if (endp == cur) {
				break;	/* no more numbers on this line */
			}
			patset->pat[nread][j] = (int) val;
			cur = endp;
		}
		nread++;
	}
	if (fp != stdin) {
		fclose(fp);
	}

	/* Expose only the rows that really exist to the rest of the program. */
	if (nread < patset->rownum) {
		fprintf(stderr, "fewer rows than declared: %d of %d\n",
			nread, patset->rownum);
		patset->rownum = nread;
	}
	return patset;
}
/*
 * Print every row of the table to stdout, one pattern per line, followed
 * by an empty line.  Always returns 0.
 */
int print_pat_all(PatternSet *patset)
{
	int row;

	for (row = 0; row < patset->rownum; row++) {
		print_pat(patset->pat[row], patset->colnum);
		putchar('\n');
	}
	putchar('\n');
	return 0;
}
/*
 * Print the first `colnum` values of `pat` to stdout, each preceded by a
 * single space and without a trailing newline.  Always returns 0.
 */
int print_pat(int *pat, int colnum)
{
	int col;

	for (col = 0; col < colnum; col++) {
		printf(" %d", pat[col]);
	}
	return 0;
}
/*
 * Print the name list that precedes the pair list: one
 * "dmy <name> 100 <1-based index> 1" line per row, terminated by a "//"
 * line.  Always returns 0.
 */
int print_names(PatternSet *patset)
{
	int row;

	for (row = 0; row < patset->rownum; row++) {
		printf("dmy %s 100 %d 1\n", patset->name[row], row + 1);
	}
	printf("//\n");
	return 0;
}
/*
 * Compare every unordered pair of rows and print one result line per pair
 * that passes the filters.
 *
 * For each pair the presence counts (cnt1, cnt2) and the co-occurrence
 * count (match) are taken, then the measures enabled by the flag_* globals
 * are computed.  When several are enabled the reported distance comes from
 * the last one applied, in the order Hamming, correlation, mutual
 * information.
 *
 * Filters: a pair is skipped when `cutoff` is set and the distance exceeds
 * it, or when `prob_cutoff` is set and the hypergeometric probability
 * exceeds it.
 *
 * Output: with disttype 'A' a table row
 *   name1 name2 cnt1 cnt2 match d n r p s
 * otherwise
 *   dmy:name1 dmy:name2 1 100 1 100 <dist> <score>
 * where <score> is s*100 when mutual information is enabled and 10
 * otherwise.  Progress is written to fh_log every 100000 printed pairs.
 * Always returns 0.
 */
int cmpr_phylopat_all(PatternSet *patset)
{
	int i, j;
	int n_out = 0;

	for (i = 0; i < patset->rownum; i++) {
		for (j = i+1; j < patset->rownum; j++) {
			int cnt1, cnt2, match;
			int d;
			double n;
			/*
			 * Measures that are not enabled are reported as 0 rather
			 * than being left uninitialized.
			 */
			double dist = 0.0, r = 0.0, p = 0.0, s = 0.0;

			count_match(patset, i, j, &cnt1, &cnt2, &match);

			if (print_pat_flag) {
				print_pat(patset->pat[i],patset->colnum);
				putchar(' ');
				print_pat(patset->pat[j],patset->colnum);
				putchar('\n');
			}

			/*
			 * The Hamming distance is computed unconditionally: it is
			 * needed by the distpass test and by the 'A' table even
			 * when it is not the reported distance.
			 */
			d = calc_hamming(patset->colnum, cnt1, cnt2, match);
			n = (double) d / patset->colnum;
			if (flag_dist) {
				if (disttype == 'D') {
					dist = (double) d;
				} else {
					dist = n * scale;
				}
			}
			if (flag_corr) {
				r = calc_corr(patset->colnum,cnt1,cnt2,match);
				if (abs_flag) {
					dist = (1 - fabs(r)) * scale;
				} else {
					dist = (1 - r) * scale / 2;
				}
			}
			if (flag_mutinfo) {
				s = calc_mutinfo(patset->colnum,cnt1,cnt2,match);
				dist = (1-s) * scale;
			}
			/* NOTE(review): with -dP alone no distance is assigned and
			 * 0 is reported; confirm whether p*scale was intended. */
			if (cutoff && dist > cutoff) {
				continue;
			}
			if ((prob_cutoff && (! distpass || d > distpass))
					||flag_prob) {
				p=calc_prob_hypgeom(patset->colnum, cnt1, cnt2, match);
				/* Filter only when a cutoff was given; flag_prob
				 * alone just requests the value for output. */
				if (prob_cutoff && p > prob_cutoff) continue;
			}
			if (disttype == 'A') {
				printf("%s %s %d %d %d %d %lf %lf %lf %lf\n",
					patset->name[i],patset->name[j],
					cnt1, cnt2, match,
					d, n, r, p, s);
			} else {
				printf("dmy:%s dmy:%s 1 100 1 100 %lf",
					patset->name[i], patset->name[j], dist);
				if (flag_mutinfo) {
					printf(" %lf\n",s*100);
				} else {
					printf(" 10\n");
				}
			}
			n_out++;
			if (fh_log != NULL) {
				if (n_out % 100000 == 0) {
					fprintf(fh_log, "phylopat %d\n", n_out);
				}
			}
		}
	}
	if (fh_log != NULL) {
		fprintf(fh_log, "Done (%d)\n", n_out);
	}
	return 0;
}
/*
calc_hamming(PatternSet *patset, int i, int j)
{
	int k;
	int dist = 0;
	for (k = 0; k < patset->colnum; k++) {
		dist += abs(patset->pat[i][k] - patset->pat[j][k]);
	}

	return dist;
}
*/
/*
 * Hamming distance between two binary patterns, derived from their counts:
 * positions set in exactly one of the two patterns.
 *
 *   totnum  number of columns (unused; kept for a signature parallel to
 *           the other measures)
 *   cnt1    number of set positions in the first pattern
 *   cnt2    number of set positions in the second pattern
 *   match   number of positions set in both
 */
int calc_hamming(int totnum, int cnt1, int cnt2, int match)
{
	(void) totnum;
	return cnt1 + cnt2 - 2 * match;
}
double calc_corr(int totnum, int cnt1, int cnt2, int match)
{
	double corr;
	corr = (totnum * match - cnt1 * cnt2) /
	    sqrt((totnum * cnt1 - cnt1 * cnt1)*(totnum * cnt2 - cnt2 * cnt2));
	return corr;
}
/***
double calc_prob_hgd(PatternSet *patset, int i, int j)
{
	int cnt1 = 0, cnt2 = 0, match = 0;
	int k;
	count_match(patset, i, j, &cnt1, &cnt2, &match);
	return hypergeometric_cum(patset->colnum, cnt1, cnt2, match);
}
***/
/*
 * Mutual information (in bits) between two binary patterns, from counts.
 *
 *   totnum  number of columns
 *   cnt1    number of set positions in the first pattern
 *   cnt2    number of set positions in the second pattern
 *   match   number of positions set in both
 *
 * The four joint frequencies (neither / first only / second only / both)
 * are each paired with the matching marginals and summed through
 * sub_mutinfo().
 */
double calc_mutinfo(int totnum, int cnt1, int cnt2, int match)
{
	double sub_mutinfo();
	double joint_none, joint_first, joint_second, joint_both;
	double marg1, marg2;
	double total_info;

	/* joint frequencies */
	joint_both = (double)match / totnum;
	joint_first = (double)(cnt1 - match) / totnum;
	joint_second = (double)(cnt2 - match) / totnum;
	joint_none = (double)(totnum + match - cnt1 - cnt2) / totnum;

	/* marginal frequencies of "set" in each pattern */
	marg1 = (double)cnt1 / totnum;
	marg2 = (double)cnt2 / totnum;

	total_info = sub_mutinfo(joint_none, 1-marg1, 1-marg2)
		+ sub_mutinfo(joint_first, marg1, 1-marg2)
		+ sub_mutinfo(joint_second, marg2, 1-marg1)
		+ sub_mutinfo(joint_both, marg1, marg2);

	return total_info;
}
double sub_mutinfo(double p12, double p1, double p2)
{
	if (p12 == 0) {
		return 0;
	}
	return p12 * log( p12 / p1 / p2 ) / log(2);
}
/*
 * Count the set (nonzero) positions of rows i and j of `patset`.
 *
 *   *cnt1   receives the number of columns set in row i
 *   *cnt2   receives the number of columns set in row j
 *   *match  receives the number of columns set in both rows
 *
 * Always returns 0.
 */
int count_match(PatternSet *patset, int i, int j,
			int *cnt1, int *cnt2, int *match)
{
	const int *row_i = patset->pat[i];
	const int *row_j = patset->pat[j];
	int in_first = 0, in_second = 0, in_both = 0;
	int k;

	for (k = 0; k < patset->colnum; k++) {
		const int a = (row_i[k] != 0);
		const int b = (row_j[k] != 0);

		in_first += a;
		in_second += b;
		in_both += (a && b);
	}
	*cnt1 = in_first;
	*cnt2 = in_second;
	*match = in_both;
	return 0;
}
/*
 * Tail probability of observing k co-occurrences under the hypergeometric
 * distribution with parameters N (columns), n and m (set counts of the two
 * patterns).
 *
 * When k is at least half of min(n, m) the terms from k up to min(n, m)
 * are summed.  Otherwise the terms from max(n + m - N, 0) up to k are
 * summed, and that sum is replaced by its complement unless abs_flag is
 * set.
 */
double calc_prob_hypgeom(int N, int n, int m, int k)
{
	int upper = (n <= m) ? n : m;
	int lower;
	int lower_tail = 0;
	int idx;
	double total = 0.0;

	if (k < (double)upper/2) {
		/* few co-occurrences: accumulate the lower tail */
		lower_tail = 1;
		lower = (n + m - N > 0) ? n + m - N : 0;
		upper = k;
	} else {
		lower = k;
	}

	idx = lower;
	while (idx <= upper) {
		total += hypergeometric(N, n, m, idx);
		idx++;
	}

	if (lower_tail && ! abs_flag) {
		total = 1.0-total;
	}
	return total;
}
/*
 * Hypergeometric probability mass: C(n,k) * C(N-n,m-k) / C(N,m),
 * with the binomial coefficients supplied by comb().
 */
double hypergeometric(int N, int n, int m, int k)
{
	double comb();
	double hits, misses, draws;

	hits = comb(n, k);
	misses = comb(N-n, m-k);
	draws = comb(N, m);
	return hits * misses / draws;
}
/*
 * Binomial coefficient C(n, m), evaluated from the lnFact table as
 * exp(ln n! - ln (n-m)! - ln m!).  The table must have been filled by
 * calc_lnFact() for indices up to n.
 *
 * When n < m an error message is written to stderr and 1 is returned.
 */
double comb(int n, int m)
{
	double ln_c;

	if (m > n) {
		/** error **/
		fprintf(stderr, "comb: invalid argument: %d,%d\n",n,m);
		return 1;
	}
	ln_c = lnFact[n] - lnFact[n-m] - lnFact[m];
	return exp(ln_c);
}

/*
 * Create the table of the logarithms of the factorials:
 * lnFact[i] = ln(i!) for i = 0 .. size.
 *
 * A negative size is treated as 0 so that lnFact[0] always exists.
 * Exits with status 1 when the table cannot be allocated.
 * Always returns 0.
 */
int calc_lnFact(int size)
{
	int i;

	if (size < 0) {
		size = 0;
	}
	if ( (lnFact = malloc(sizeof(double) * ((size_t) size + 1))) == NULL ) {
		fprintf(stderr, "Can't allocate memory\n");
		exit(1);
	}
	lnFact[0] = 0;
	for (i = 1; i <= size; i++) {
		lnFact[i] = lnFact[i-1] + log((double) i);
	}
	return 0;
}
/*
 * Parse the command line into the global settings.
 *
 * Option values are attached directly to the option letter (e.g. "-c5",
 * "-cr0.2", "-dN").  An argument that does not start with '-' is taken as
 * the input file name.  Unknown options are ignored.
 *
 *   -v        log progress to stderr
 *   -c<x>     distance cutoff;             -cr<x>: as a ratio of the columns
 *   -C<x>     distance threshold for the probability test (enables the
 *             Hamming distance);           -Cr<x>: as a ratio of the columns
 *   -p<x>     probability cutoff
 *   -I<x>     mutual-information cutoff (enables mutual information)
 *   -r<x>     correlation cutoff
 *   -a        absolute-value mode
 *   -s<x>     scale factor
 *   -P        print the patterns of each pair
 *   -d<T>     distance type: D, N, I, R, P or A
 *
 * Always returns 0.
 */
int getargs(int argc, char **argv)
{
	int i;

	for (i = 1; i < argc; i++) {
		char *opt = argv[i];

		if (*opt != '-') {
			filename = opt;
			continue;
		}
		opt++;	/* now at the option letter (or the terminator for "-") */

		switch (*opt) {
		case 'v':
			fh_log = stderr;
			break;
		case 'c':
			opt++;
			if (*opt == 'r') {
				cutoff_r = atof(opt + 1);
			} else {
				cutoff = atof(opt);
			}
			break;
		case 'C':
			opt++;
			if (*opt == 'r') {
				distpass_r = atof(opt + 1);
			} else {
				distpass = atof(opt);
			}
			flag_dist = 1;
			/* must not fall through: -p parsing would overwrite
			 * prob_cutoff from the tail of this argument */
			break;
		case 'p':
			prob_cutoff = atof(opt + 1);
			break;
		case 'I':
			mutinfo_cutoff = atof(opt + 1);
			flag_mutinfo = 1;
			break;
		case 'r':
			corr_cutoff = atof(opt + 1);
			break;
		case 'a':
			abs_flag = 1;
			break;
		case 's':
			scale = atof(opt + 1);
			break;
		case 'P':
			print_pat_flag = 1;
			break;
		case 'd':
			switch (opt[1]) {
			case 'D':
				/* hamming distance */
				flag_dist = 1;
				disttype = 'D';
				break;
			case 'N':
				/* normalized hamming distance */
				flag_dist = 1;
				disttype = 'N';
				break;
			case 'I':
				/* mutual information */
				disttype = 'I';
				flag_mutinfo = 1;
				break;
			case 'R':
				/* correlation coefficient */
				disttype = 'R';
				flag_corr = 1;
				break;
			case 'P':
				/* prob (hypergeometric dist.) */
				disttype = 'P';
				flag_prob = 1;
				break;
			case 'A':
				/* table of all measures; no name list */
				disttype = 'A';
				nameout = 0;
				flag_corr = flag_mutinfo = flag_prob =1;
				break;
			default:
				break;
			}
			break;
		default:
			break;
		}
	}
	return 0;
}
