# DomRefine::Motif
#
# Routines for reading motif tables (lists, lengths, cutoffs, COG hit
# files) and for measuring how motif hits overlap with domain clusters on
# the same gene.  The subs named in @EXPORT below are exported by default
# through Exporter; read_motif_len and get_motif_len_mysql are defined in
# this file but are not in the export list.
package DomRefine::Motif;
use Exporter;
@ISA = qw(Exporter);
@EXPORT = qw(read_motif_cutoff sort_hit_motifs get_cog_hits_from_file
	     get_hit_positions get_hit_pos read_motif_list
	     read_cog_hits get_overlaps reverse_overlaps print_cluster_overlaps print_cluster_overlap_max count_overlap count_overlap_fast get_overlaps_check print_cluster_overlaps_1to1
             calc_overlap_aa
	     );

use strict;
use DomRefine::General;
use DomRefine::Read;

# Scratch-file handle/name registered under the label
# "print_cluster_overlap_max".  define_tmp_file() and remove_tmp_file() are
# not defined in this file (presumably imported from DomRefine::General --
# confirm); define_tmp_file() presumably returns a file path.
my $TMP_PRINT_CLUSTER_OVERLAP_MAX = define_tmp_file("print_cluster_overlap_max");
# Remove the scratch file when the interpreter shuts down.
END {
    remove_tmp_file($TMP_PRINT_CLUSTER_OVERLAP_MAX);
}

# Read a motif list file into a lookup hash.
#
#   $motif_list_file - path of a text file; the first whitespace-separated
#                      token of each line is taken as a motif name
#   $r_motif_list    - hash ref (output); every motif name read becomes a
#                      key with the value 1
#
# Lines without any token (empty or whitespace-only) are skipped instead of
# being stored under the empty-string key.
# Dies with the file name and the OS error if the file cannot be opened.
# Returns the result of close() on the input handle.
#
# Note: the file name is always treated literally (three-argument open), so
# names such as "-" or "cmd |" are no longer given special meaning.
sub read_motif_list {
    my ($motif_list_file, $r_motif_list) = @_;

    # Lexical handle + explicit read mode: no clash with other bareword
    # handles in the package and no mode/pipe injection through the name.
    open(my $fh, '<', $motif_list_file)
        or die "read_motif_list: cannot open '$motif_list_file': $!";

    # A named line variable keeps the caller's $_ untouched.
    while (my $line = <$fh>) {
        my ($motif) = split(' ', $line);
        next unless defined $motif;
        ${$r_motif_list}{$motif} = 1;
    }

    return close($fh);
}

# Read motif lengths from a tab-separated file.
#
#   $motif_len_file - path of a tab-separated text file; column index 1
#                     (0-based) holds the motif name and column index 3 its
#                     length
#   $r_motif_len    - hash ref (output): motif name => length
#
# The line terminator is removed before splitting, so a length stored in
# the last column no longer carries a trailing newline.  Lines that have no
# motif column (e.g. empty lines) are skipped.
# Dies with the file name and the OS error if the file cannot be opened.
# Returns the result of close() on the input handle.
#
# Note: the file name is always treated literally (three-argument open).
sub read_motif_len {
    my ($motif_len_file, $r_motif_len) = @_;

    open(my $fh, '<', $motif_len_file)
        or die "read_motif_len: cannot open '$motif_len_file': $!";

    while (my $line = <$fh>) {
        chomp $line;
        my @f = split("\t", $line);
        my ($motif, $len) = @f[1, 3];
        next unless defined $motif;
        ${$r_motif_len}{$motif} = $len;
    }

    return close($fh);
}

# Read per-motif score cutoffs.
#
#   $motif_cutoff_file - path of a text file whose lines start with two
#                        whitespace-separated tokens: motif name and cutoff
#   $r_motif_cutoff    - hash ref (output): motif name => cutoff
#
# Lines without any token are skipped.  A line with a motif name but no
# second token stores an undefined cutoff for that motif.
# Dies with the file name and the OS error if the file cannot be opened.
# Returns the result of close() on the input handle.
#
# Note: the file name is always treated literally (three-argument open).
sub read_motif_cutoff {
    my ($motif_cutoff_file, $r_motif_cutoff) = @_;

    open(my $fh, '<', $motif_cutoff_file)
        or die "read_motif_cutoff: cannot open '$motif_cutoff_file': $!";

    while (my $line = <$fh>) {
        my ($motif, $cutoff) = split(' ', $line);
        next unless defined $motif;
        ${$r_motif_cutoff}{$motif} = $cutoff;
    }

    return close($fh);
}

# Look up the length of one motif in the database.
#
#   $dbh   - database handle offering selectall_arrayref() (DBI style)
#   $motif - motif identifier matched against motif.motid
#
# The identifier is passed as a bind value rather than interpolated into
# the statement, so quotes in $motif cannot alter the query.
# Dies unless the query yields exactly one row with exactly one column.
# Returns that single value.
sub get_motif_len_mysql {
    my ($dbh, $motif) = @_;

    my $r_rows = $dbh->selectall_arrayref(
        "select length from motif where motid=?", undef, $motif);

    if (! $r_rows or @{$r_rows} != 1 or @{ $r_rows->[0] } != 1) {
        die "get_motif_len_mysql: expected exactly one length for motif '$motif'";
    }

    return $r_rows->[0][0];
}

# Collect the hits of selected motifs from a tab-separated hit file.
#
#   $r_cogs         - array ref passed (flattened) to match() together with
#                     the motif name of each line
#   $r_motif_cutoff - hash ref: motif name => score cutoff
#   $r_hit_gene     - array ref (output): "<field 2>:<field 3>" per hit
#   $r_hit_start    - array ref (output): field 4 per hit
#   $r_hit_end      - array ref (output): field 5 per hit
#   $r_hit_score    - array ref (output): field 12 per hit
#   $file           - path of the tab-separated input file
#
# Field numbers are 0-based column indexes.  Field 7 is the motif name; a
# line is kept when match() accepts that name and field 12 is strictly
# greater than the motif's cutoff.  match() is not defined in this file
# (presumably it tests membership of the name in the list -- confirm).
#
# Dies with the file name and the OS error if the file cannot be opened.
# Returns the result of close() on the input handle.
#
# Note: the file name is always treated literally (three-argument open).
sub get_cog_hits_from_file {
    my ($r_cogs, $r_motif_cutoff, $r_hit_gene, $r_hit_start, $r_hit_end, $r_hit_score, $file) = @_;

    open(my $fh, '<', $file)
        or die "get_cog_hits_from_file: cannot open '$file': $!";

    # A named line variable keeps the caller's $_ untouched.
    while (my $line = <$fh>) {
        chomp $line;
        my @f = split("\t", $line);
        next unless match($f[7], @{$r_cogs});
        next unless $f[12] > ${$r_motif_cutoff}{$f[7]};

        push @{$r_hit_gene}, "$f[2]:$f[3]";
        push @{$r_hit_start}, $f[4];
        push @{$r_hit_end}, $f[5];
        push @{$r_hit_score}, $f[12];
    }

    return close($fh);
}

# Gather the hit positions of every gene in a fused gene.
#
#   $fused_gene         - gene names joined with "|"
#   $r_hit_start_j, $r_hit_end_j, $r_hit_motif, $i, $r_reference_domain
#                       - handed through to get_hit_pos()
#   $r_hit_gene_i       - handed to get_hit_pos() as option r_hit_gene_i
#   $r_hit_evalue       - accepted but not used here
#   $r_get_j_row        - handed to get_hit_pos() as option r_get_j_row
#   %opt                - further options; r_seq is given to
#                         get_gene_length(), everything is forwarded to
#                         get_hit_pos()
#
# Genes are processed left to right.  Whenever the gene name changes from
# one element to the next, the length of the previous gene (as reported by
# get_gene_length(), which is not defined in this file) is added to the
# running offset that is forwarded as option "offset".
sub get_hit_positions {
    my ($fused_gene, $r_hit_start_j, $r_hit_end_j, $r_hit_motif, $r_hit_gene_i, $r_hit_evalue, $r_get_j_row, $i, $r_reference_domain, %opt) = @_;

    my $shift = 0;
    my $last_gene = "";
    foreach my $gene (split(/\|/, $fused_gene)) {
        unless ($last_gene eq "" or $last_gene eq $gene) {
            $shift += get_gene_length($opt{r_seq}, $last_gene);
        }

        # Explicit entries come last so they override same-named keys of %opt.
        my %pos_opt = (
            %opt,
            offset       => $shift,
            r_get_j_row  => $r_get_j_row,
            r_hit_gene_i => $r_hit_gene_i,
        );
        get_hit_pos($gene, $r_hit_start_j, $r_hit_end_j, $r_hit_motif, $i, $r_reference_domain, %pos_opt);

        $last_gene = $gene;
    }
}

# Index a tab-separated COG hit file by line number and by gene.
#
#   $file       - path of the tab-separated input file
#   $r_cog_hits - hash ref (output):
#                   {line}{<line number>} = the line without its terminator
#                   {gene}{"<field 2>:<field 3>"} = [line numbers, in file
#                                                    order]
#                 (field numbers are 0-based column indexes; line numbers
#                 start at 1)
#
# Prints a progress message to STDERR before opening the file.
# Dies with the file name and the OS error if the file cannot be opened.
# Returns the result of close() on the input handle.
#
# Note: the file name is always treated literally (three-argument open).
sub read_cog_hits {
    my ($file, $r_cog_hits) = @_;

    print STDERR "Reading COG hits..\n";
    open(my $fh, '<', $file)
        or die "read_cog_hits: cannot open '$file': $!";

    # A named line variable keeps the caller's $_ untouched.
    while (my $line = <$fh>) {
        chomp $line;
        my $line_no = $.;
        my @f = split("\t", $line);
        my ($sp, $name) = @f[2, 3];
        my $gene = "$sp:$name";

        ${$r_cog_hits}{line}{$line_no} = $line;
        # push autovivifies the per-gene list on first use.
        push @{ ${$r_cog_hits}{gene}{$gene} }, $line_no;
    }

    return close($fh);
}

# Append the domains of one gene to the caller's hit arrays.
#
#   $gene          - key into %$r_cog_domain
#   $r_hit_start_j - array ref (output): start of every domain
#   $r_hit_end_j   - array ref (output): end of every domain
#   $r_hit_motif   - array ref (output): {cluster} of every domain
#   $i             - value pushed to @{$opt{r_hit_gene_i}} per domain
#   $r_cog_domain  - hash ref: gene => domain key => { begin, end, cluster }
#   %opt:
#     offset       - if true, added to begin and end
#     r_get_j_row  - if given, hash ref used to translate the (shifted)
#                    begin/end values before they are stored
#     r_hit_gene_i - if given, array ref that receives $i per domain
#     r_hit_gene   - if given, array ref that receives $gene per domain
#
# Domains are visited in hash order.
sub get_hit_pos {
    my ($gene, $r_hit_start_j, $r_hit_end_j, $r_hit_motif, $i, $r_cog_domain, %opt) = @_;
    my $shift = $opt{offset};

    foreach my $domain_no (keys %{ $r_cog_domain->{$gene} }) {
        my $from = $r_cog_domain->{$gene}{$domain_no}{begin};
        my $to = $r_cog_domain->{$gene}{$domain_no}{end};
        my $motif = $r_cog_domain->{$gene}{$domain_no}{cluster};

        if ($shift) {
            $_ += $shift for ($from, $to);
        }

        push @{$r_hit_motif}, $motif;

        my $r_row_of = $opt{r_get_j_row};
        push @{$r_hit_start_j}, ($r_row_of ? $r_row_of->{$from} : $from);
        push @{$r_hit_end_j}, ($r_row_of ? $r_row_of->{$to} : $to);

        push @{ $opt{r_hit_gene_i} }, $i if $opt{r_hit_gene_i};
        push @{ $opt{r_hit_gene} }, $gene if $opt{r_hit_gene};
    }
}

# Return the distinct motifs of a list, most frequent first.
#
#   @_ - motif names, possibly repeated
#
# Motifs that occur equally often are ordered by name (string comparison)
# so the result is reproducible from run to run instead of depending on
# hash order.
# Returns the ordered list of distinct names (empty list for no input).
sub sort_hit_motifs {
    my @motif = @_;

    my %count = ();
    for my $motif (@motif) {
        $count{$motif}++;
    }

    my @sorted = sort { $count{$b} <=> $count{$a} or $a cmp $b } keys %count;
    return @sorted;
}

# Accumulate, per (motif, cluster) pair, how often and by how many
# positions motif hits and domains overlap on the same gene.
#
#   $r_domain  - hash ref: gene => domain key => { cluster, begin, end }
#   $r_motif   - hash ref: gene => motif key  => { cluster, begin, end }
#   $r_overlap - hash ref (output): $r_overlap->{motif}{cluster} gets
#                  {common} - number of accepted overlapping pairs
#                  {aa}     - summed overlap length of accepted pairs
#   %opt:
#     r_over  - if defined, a pair is accepted only when
#               overlap / max(motif length, domain length) exceeds it
#     r_over2 - consulted only when r_over is not defined; a pair is
#               accepted only when overlap / motif length exceeds it
#
# Every pair with a positive overlap gets a {common} entry (0 if none was
# there), even when a threshold then rejects the pair.
# overlap_len() and max() are not defined in this file.
sub get_overlaps {
    my ($r_domain, $r_motif, $r_overlap, %opt) = @_;

    foreach my $gene (keys %{$r_motif}) {
        foreach my $motif_no (keys %{ $r_motif->{$gene} }) {
            my $motif = $r_motif->{$gene}{$motif_no}{cluster};
            my $m_from = $r_motif->{$gene}{$motif_no}{begin};
            my $m_to = $r_motif->{$gene}{$motif_no}{end};
            my $m_len = $m_to - $m_from + 1;

          DOMAIN:
            foreach my $domain_no (keys %{ $r_domain->{$gene} }) {
                my $cluster = $r_domain->{$gene}{$domain_no}{cluster};
                my $d_from = $r_domain->{$gene}{$domain_no}{begin};
                my $d_to = $r_domain->{$gene}{$domain_no}{end};
                my $d_len = $d_to - $d_from + 1;

                my $shared = overlap_len($m_from, $m_to, $d_from, $d_to);
                next DOMAIN unless $shared > 0;

                # r_over1 (relative to the domain) is computed alongside the
                # others but no threshold consults it.
                my %ratio = (
                    r_over  => $shared / max($m_len, $d_len),
                    r_over1 => $shared / $d_len,
                    r_over2 => $shared / $m_len,
                );

                $r_overlap->{$motif}{$cluster}{common} ||= 0;

                if (defined $opt{r_over}) {
                    next DOMAIN unless $ratio{r_over} > $opt{r_over};
                }
                elsif (defined $opt{r_over2}) {
                    next DOMAIN unless $ratio{r_over2} > $opt{r_over2};
                }

                $r_overlap->{$motif}{$cluster}{common}++;
                $r_overlap->{$motif}{$cluster}{aa} += $shared;
            }
        }
    }
}

# Count, for every motif hit, how many domains of the selected clusters
# overlap it on the same gene.
#
#   $r_motif   - hash ref: gene => motif key  => { cluster, begin, end }
#   $r_domain  - hash ref: gene => domain key => { cluster, begin, end }
#   $r_cluster - hash ref; domains whose cluster has a true value here are
#                the only ones counted
#   $r_count   - hash ref (output): $r_count->{motif}{gene}{motif key} is
#                incremented once per overlapping selected domain
#
# Domains of unselected clusters are skipped before any overlap is
# computed.  No overlap ratio is calculated: the former unused ratio could
# abort with a division by zero on degenerate intervals.
# overlap_len() is not defined in this file (presumably it returns the
# number of positions two intervals share -- confirm).
sub count_overlap {
    my ($r_motif, $r_domain, $r_cluster, $r_count) = @_;

    for my $gene (keys %{$r_motif}) {
        for my $motif_no (keys %{ ${$r_motif}{$gene} }) {
            my $motif = ${$r_motif}{$gene}{$motif_no}{cluster};
            my $motif_begin = ${$r_motif}{$gene}{$motif_no}{begin};
            my $motif_end = ${$r_motif}{$gene}{$motif_no}{end};

            for my $domain_no (keys %{ ${$r_domain}{$gene} }) {
                my $cluster = ${$r_domain}{$gene}{$domain_no}{cluster};
                next unless ${$r_cluster}{$cluster};

                my $begin = ${$r_domain}{$gene}{$domain_no}{begin};
                my $end = ${$r_domain}{$gene}{$domain_no}{end};
                if (overlap_len($motif_begin, $motif_end, $begin, $end) > 0) {
                    ${$r_count}{$motif}{$gene}{$motif_no}++;
                }
            }
        }
    }
    return;
}

# Count, for every motif hit, how many cluster domains overlap it on the
# same gene, iterating over the cluster domains first.
#
#   $r_motif_domain - hash ref: gene => motif key  => { cluster, begin, end }
#   $r_dclst_domain - hash ref: gene => domain key => { cluster, begin, end }
#   $r_cluster      - hash ref; unless option "skip" is set, only domains
#                     whose cluster has a true value here are considered
#   $r_count        - hash ref (output): $r_count->{motif}{gene}{motif key}
#                     is incremented once per overlapping domain
#   %opt:
#     skip - if true, the cluster filter is not applied
#
# Only genes present in %$r_dclst_domain are visited.  No overlap ratio is
# calculated: the former unused ratio could abort with a division by zero
# on degenerate intervals.
# overlap_len() is not defined in this file (presumably it returns the
# number of positions two intervals share -- confirm).
sub count_overlap_fast {
    my ($r_motif_domain, $r_dclst_domain, $r_cluster, $r_count, %opt) = @_;

    for my $gene (keys %{$r_dclst_domain}) {
        for my $dclst_no (keys %{ ${$r_dclst_domain}{$gene} }) {
            my $dclst = ${$r_dclst_domain}{$gene}{$dclst_no}{cluster};
            next unless $opt{skip} or ${$r_cluster}{$dclst};

            my $dclst_begin = ${$r_dclst_domain}{$gene}{$dclst_no}{begin};
            my $dclst_end = ${$r_dclst_domain}{$gene}{$dclst_no}{end};

            for my $motif_no (keys %{ ${$r_motif_domain}{$gene} }) {
                my $motif = ${$r_motif_domain}{$gene}{$motif_no}{cluster};
                my $motif_begin = ${$r_motif_domain}{$gene}{$motif_no}{begin};
                my $motif_end = ${$r_motif_domain}{$gene}{$motif_no}{end};
                if (overlap_len($motif_begin, $motif_end, $dclst_begin, $dclst_end) > 0) {
                    ${$r_count}{$motif}{$gene}{$motif_no}++;
                }
            }
        }
    }
    return;
}

# Report how many motif hits are covered by two or more domains of the
# given clusters.
#
#   $r_dclst_domain - hash ref: gene => domain key => { cluster, begin, end }
#   $r_motif_domain - hash ref: gene => motif key  => { cluster, begin, end }
#   $cluster_pair   - cluster names separated by "-", "," or whitespace
#
# Overlaps are counted by count_overlap_fast().
# Returns (number of motif hits with at least one overlap,
#          number of those hits with two or more overlaps,
#          their ratio, or the string "NA" when the first number is 0).
sub get_overlaps_check {
    my ($r_dclst_domain, $r_motif_domain, $cluster_pair) = @_;

    my %cluster = map { $_ => 1 } split(/[-,\s]/, $cluster_pair);

    my %count = ();
    count_overlap_fast($r_motif_domain, $r_dclst_domain, \%cluster, \%count);

    my ($n_motif, $n_splitted_motif) = (0, 0);
    foreach my $r_by_gene (values %count) {
        foreach my $r_by_no (values %{$r_by_gene}) {
            foreach my $n_hit (values %{$r_by_no}) {
                $n_splitted_motif++ if $n_hit >= 2;
                $n_motif++;
            }
        }
    }

    my $ratio = $n_motif ? $n_splitted_motif / $n_motif : "NA";
    return ($n_motif, $n_splitted_motif, $ratio);
}

# Copy a two-level overlap table with its two key levels swapped.
#
#   $r_hash  - hash ref (input):  first key => second key => { aa, common }
#   $r_hash2 - hash ref (output): second key => first key => { aa, common }
#
# Only the {aa} and {common} fields are copied; a field missing in the
# input is stored as an undefined value in the output.
sub reverse_overlaps {
    my ($r_hash, $r_hash2) = @_;

    foreach my $outer (keys %{$r_hash}) {
        foreach my $inner (keys %{ $r_hash->{$outer} }) {
            foreach my $field (qw(aa common)) {
                $r_hash2->{$inner}{$outer}{$field} = $r_hash->{$outer}{$inner}{$field};
            }
        }
    }
}

# Print one tab-separated line per (reference, cluster) pair.
#
#   $r_ref_clus        - hash ref: reference => cluster => { common }
#   $r_input_count     - hash ref: cluster => number of members
#   $r_reference_count - hash ref: reference => number of members
#   %opt:
#     one2one - if true, only pairs are printed whose reference has exactly
#               one cluster and whose cluster has exactly one reference
#
# Columns: cluster, n=<cluster members>, pair=<references of the cluster>,
# reference, n=<reference members>, pair=<clusters of the reference>,
# n_com=<common members>, r_com=<common / larger member count>,
# r_com1=<common / cluster members>, r_com2=<common / reference members>.
# References are printed in string order; clusters of one reference in
# hash order.  max() is not defined in this file.
sub print_cluster_overlaps_1to1 {
    my ($r_ref_clus, $r_input_count, $r_reference_count, %opt) = @_;

    # Same pairs seen from the cluster side: cluster => reference => {...}
    my %clus_ref = ();
    reverse_overlaps($r_ref_clus, \%clus_ref);

    foreach my $reference (sort keys %{$r_ref_clus}) {
        my @cluster = keys %{ $r_ref_clus->{$reference} };
        my $n_corresp_clusters = scalar @cluster;

        foreach my $cluster (@cluster) {
            my $n_corresp_references = scalar keys %{ $clus_ref{$cluster} };
            next if $opt{one2one}
                && !($n_corresp_references == 1 && $n_corresp_clusters == 1);

            my $n_common_member = $r_ref_clus->{$reference}{$cluster}{common};
            my $n_member = $r_input_count->{$cluster};
            my $n_reference_member = $r_reference_count->{$reference};

            my $r_com = $n_common_member / max($n_member, $n_reference_member);
            my $r_com1 = $n_common_member / $n_member;
            my $r_com2 = $n_common_member / $n_reference_member;

            my @column = (
                $cluster, "n=$n_member", "pair=${n_corresp_references}",
                $reference, "n=$n_reference_member", "pair=${n_corresp_clusters}",
                "n_com=$n_common_member", "r_com=$r_com", "r_com1=$r_com1", "r_com2=$r_com2",
            );
            print join("\t", @column), "\n";
        }
    }
}

# Filter (reference, cluster) pairs by overlap ratio and print the
# survivors with print_cluster_overlaps_1to1().
#
#   $r_ref_clus        - hash ref: reference => cluster => { common }
#   $r_input_count     - hash ref: cluster => number of members
#   $r_reference_count - hash ref: reference => number of members
#   %opt:
#     r_com  - if defined, keep a pair only when
#              common / max(cluster members, reference members) exceeds it
#     r_com2 - if defined, keep a pair only when
#              common / reference members exceeds it
#     (all options are also forwarded to print_cluster_overlaps_1to1)
#
# Only the {common} value of a surviving pair is handed on.
# max() is not defined in this file.
sub print_cluster_overlaps {
    my ($r_ref_clus, $r_input_count, $r_reference_count, %opt) = @_;

    my %print_ref_clus = ();
    foreach my $reference (sort keys %{$r_ref_clus}) {
      CLUSTER:
        foreach my $cluster (keys %{ $r_ref_clus->{$reference} }) {
            my $n_common_member = $r_ref_clus->{$reference}{$cluster}{common};
            my $n_member = $r_input_count->{$cluster};
            my $n_reference_member = $r_reference_count->{$reference};

            # r_com1 has no threshold; it is evaluated with the others so a
            # zero member count fails here exactly as it would when printing.
            my %ratio = (
                r_com  => $n_common_member / max($n_member, $n_reference_member),
                r_com1 => $n_common_member / $n_member,
                r_com2 => $n_common_member / $n_reference_member,
            );

            foreach my $name (qw(r_com r_com2)) {
                next unless defined($opt{$name});
                next CLUSTER unless $ratio{$name} > $opt{$name};
            }

            $print_ref_clus{$reference}{$cluster}{common} = $n_common_member;
        }
    }

    print_cluster_overlaps_1to1(\%print_ref_clus, $r_input_count, $r_reference_count, %opt);
}

# Sum up the overlap between references and clusters.
#
#   $r_ref_clus        - hash ref: reference => cluster => { common, aa }
#   $r_input_count     - hash ref: cluster => number of members
#   $r_reference_count - hash ref: reference => number of members
#   %opt:
#     r_com - if defined, a pair is counted only when
#             common / max(cluster members, reference members) exceeds it
#
# The member counts are consulted only when r_com is given, so a call
# without a threshold no longer aborts with a division by zero when a count
# is missing or zero.  Pairs are visited in hash order; the totals do not
# depend on it.
# max() is not defined in this file.
#
# Returns (number of counted pairs, sum of their {common}, sum of their {aa}).
sub calc_overlap_aa {
    my ($r_ref_clus, $r_input_count, $r_reference_count, %opt) = @_;

    my $overlap_cluster = 0;
    my $overlap_member = 0;
    my $overlap_aa = 0;
    for my $reference (keys %{$r_ref_clus}) {
        for my $cluster (keys %{ ${$r_ref_clus}{$reference} }) {
            my $n_common_member = ${$r_ref_clus}{$reference}{$cluster}{common};

            if (defined $opt{r_com}) {
                my $n_larger = max(${$r_input_count}{$cluster},
                                   ${$r_reference_count}{$reference});
                next unless $n_common_member / $n_larger > $opt{r_com};
            }

            $overlap_cluster++;
            $overlap_member += $n_common_member;
            $overlap_aa += ${$r_ref_clus}{$reference}{$cluster}{aa};
        }
    }
    return ($overlap_cluster, $overlap_member, $overlap_aa);
}

# Greedily pair motifs with clusters by decreasing overlap and print the
# chosen pairs.
#
#   $r_overlap       - hash ref: motif => cluster => { aa, common }
#   $r_cluster_count - hash ref: cluster => number of members
#   $r_motif_count   - hash ref: motif => number of members
#
# All (motif, cluster) pairs are ranked by {aa}, largest first; pairs
# without a defined {aa} come last, and equal values are ordered by motif
# and then cluster name so the output is reproducible.  Going down the
# ranking, a pair is printed when neither its motif nor its cluster has
# been printed before.
#
# The ranking is done in memory; no external sort command or scratch file
# is involved, so a failing shell command can no longer silently produce
# empty output.
#
# Output columns (tab-separated): motif, cluster, aa,
# common / max(cluster members, motif members), common / motif members.
# max() is not defined in this file.
sub print_cluster_overlap_max {
    my ($r_overlap, $r_cluster_count, $r_motif_count) = @_;

    # Collect every (motif, cluster, aa) triple.
    my @pair = ();
    for my $motif (keys %{$r_overlap}) {
        for my $cluster (keys %{ ${$r_overlap}{$motif} }) {
            push @pair, [$motif, $cluster, ${$r_overlap}{$motif}{$cluster}{aa}];
        }
    }

    # Rank: defined aa before undefined, then aa descending, then by name.
    my @ranked = sort {
        (defined $b->[2] ? 1 : 0) <=> (defined $a->[2] ? 1 : 0)
            or ($b->[2] || 0) <=> ($a->[2] || 0)
            or $a->[0] cmp $b->[0]
            or $a->[1] cmp $b->[1]
    } @pair;

    # Select greedily: each motif and each cluster is used at most once.
    my %selected_motif = ();
    my %selected_cluster = ();
    for my $r_pair (@ranked) {
        my ($motif, $cluster, $aa) = @{$r_pair};
        next if $selected_motif{$motif} or $selected_cluster{$cluster};

        my $n_common = ${$r_overlap}{$motif}{$cluster}{common};
        print join("\t",
                   $motif, $cluster, (defined $aa ? $aa : ""),
                   $n_common / max(${$r_cluster_count}{$cluster}, ${$r_motif_count}{$motif}),
                   $n_common / ${$r_motif_count}{$motif}
            ), "\n";
        $selected_motif{$motif} = 1;
        $selected_cluster{$cluster} = 1;
    }
    return;
}

1;
