package DomRefine::Tree;
use Exporter;
@ISA = qw(Exporter);
@EXPORT = qw(create_tree phylogenetic_tree find_min_tree divide_tree tree_children
	     save_support_values
	     check_species_overlap get_sub_tree_leaves sum_of_duplication get_sub_tree_branch_length
	     put_support_values put_node_labels get_species_from_leaves
	     print_tree
	     );

use strict;
use Bio::TreeIO;
use DomRefine::General;
use DomRefine::Read;

sub phylogenetic_tree {
    my ($tmp_alignment, $tmp_tree, $tmp_tree_log) = @_;
    
    escape_colon($tmp_alignment);

    my $program = "FastTree";
    print STDERR "${program} .. ";
    system "cat $tmp_alignment | $program > $tmp_tree 2> $tmp_tree_log";
    print STDERR "done\n";
}

sub find_min_tree {
    my ($tree_file, $r_nodes) = @_;

    my @height = ();
    for (my $i=0; $i<@{$r_nodes}; $i++) {
	if (${$r_nodes}[$i]->branch_length > 0) {
	    my $tree = create_tree($tree_file);
	    my @nodes = $tree->get_root_node->get_all_Descendents;
	    $tree->reroot_at_midpoint($nodes[$i]);
	    my $root_node = $tree->get_root_node;
	    my $height = sprintf("%.5f", $root_node->height);
	    $height[$i] = $height;
	} else {
	    # print STDERR "i=$i\tnode=", ${$r_nodes}[$i]->to_string, "\n"; # print for debug
	}
    }
    my $i_best = min_i(@height);

    return $i_best;
}

sub divide_tree {
    my ($tmp_input, $tmp_tree, $idx) = @_;

    # members of a sub-cluster
    my ($sp_overlap, $r_leaves1, $r_leaves2, $detail) = check_species_overlap($tmp_tree, $idx);
    my %sub_cluster_domain;
    my %sub_cluster_gene;
    for my $leaf_name (@{$r_leaves1}) {
	if ($leaf_name =~ /^(\d+)_([A-Za-z0-9]+)_(\S+)$/) {
	    my ($domain, $sp, $gene) = ($1, $2, $3);
	    $sub_cluster_domain{"$sp:$gene"}{$domain} = 1;
	} elsif ($leaf_name =~ /^([A-Za-z0-9]+)_(\S+)$/) {
	    my ($sp, $gene) = ($1, $2, $3);
	    $sub_cluster_gene{"$sp:$gene"} = 1;
	} else {
	    die $leaf_name;
	}
    }
    # output sub-clusters
    open(TMP_INPUT, $tmp_input) || die;
    if (%sub_cluster_domain and %sub_cluster_gene) {
	die;
    }
    my $out = "";
    while (<TMP_INPUT>) {
	my ($cluster, $gene, $domain) = split;
	if (%sub_cluster_gene) {
	    if ($sub_cluster_gene{$gene}) {
		s/^(\S+)/${1}d/;
	    } else {
		s/^(\S+)/${1}D/;
	    }
	} elsif (%sub_cluster_domain) {
	    if ($sub_cluster_domain{$gene}{$domain}) {
		s/^(\S+)/${1}d/;
	    } else {
		s/^(\S+)/${1}D/;
	    }
	}
	$out .= $_;
    }
    close(TMP_INPUT);
    return $out;
}

sub tree_children {
    my ($node, $offset, @bar) = @_;

    my @child = $node->each_Descendent;
    if (@child == 2) {
	my @bar1 = reset_bar(-$offset, @bar, -($offset+2));
	my @bar2 = reset_bar($offset, @bar, $offset+2);
	return
	    tree_children($child[0], $offset+2, @bar1) .
	    tree_padding($offset, @bar) . "+-|\n" .
	    tree_children($child[1], $offset+2, @bar2);
    } elsif (@child == 0) {
	return
	    tree_padding($offset, @bar) . "+- " . tree_node($node) . "\n";
    } else {
	die;
    }
}

sub tree_node {
    my ($node) = @_;
    
    my $string = $node->to_string;
    my ($gene) = split(":",  $string);
    if ($gene =~ /^(\S+?)_(\S+)$/) {
	my ($organism, $name) = ($1, $2);
	return "$organism:$name";
    } else {
	die;
    }
}

sub reset_bar {
    my ($bar_reset, @bar) = @_;
    
    my @new_bar = ();
    for my $bar (@bar) {
	if ($bar != $bar_reset) {
	    push @new_bar, $bar;
	}
    }

    return @new_bar;
}

sub tree_padding {
    my ($offset, @bar) = @_;

    my @padding = ();
    for (my $i=0; $i<$offset; $i++) {
	$padding[$i] = " ";
    }

    for my $bar (@bar) {
	my $pos = abs($bar);
	if ($pos < $offset) {
	    $padding[$pos] = "|";
	}
    }

    return join("", @padding);
}

sub check_duplication {
    my ($node) = @_;

    my ($sub_tree1_node, $sub_tree2_node) = $node->each_Descendent;
    
    my @leaves1 = get_sub_tree_leaves($sub_tree1_node);
    my @leaves2 = get_sub_tree_leaves($sub_tree2_node);
    my @species1 = get_species_from_leaves(@leaves1);
    my @species2 = get_species_from_leaves(@leaves2);
    my @all_species = uniq(@species1, @species2);
    my @common_species = check_redundancy(@species1, @species2);

    my $sp_disappeared = @all_species - @common_species;

    if (@common_species) {
	return (1, $sp_disappeared);
    } else {
	return 0;
    }
}

sub sum_of_duplication {
    my ($r_tree) = @_;

    my @node = grep {! $_->is_Leaf} ${$r_tree}->get_nodes();

    my $sum_of_duplication = 0;
    my $sum_of_sp_disappeared = 0;
    for my $node (@node) {
	my ($is_diuplication, $sp_disappeared) = check_duplication($node);
	if ($is_diuplication) {
	    $sum_of_duplication += $is_diuplication;
	    $sum_of_sp_disappeared += $sp_disappeared;
	}
    }

    return ($sum_of_duplication, $sum_of_sp_disappeared);
}

sub save_support_values {
    my ($r_tree, $r_support_value) = @_;
    
    my @nodes = grep {! $_->is_Leaf} ${$r_tree}->get_root_node->get_all_Descendents;
    
    for (my $i=0; $i<@nodes; $i++) {
	my $id1 = $nodes[$i]->internal_id;
	my $id2 = $nodes[$i]->ancestor->internal_id;
	my $support_value = $nodes[$i]->id;
	
	${$r_support_value}[$id1][$id2] = $support_value;
	${$r_support_value}[$id2][$id1] = $support_value;
    }
    
}

sub put_support_values {
    my ($r_tree, $r_support_value) = @_;

    my $root_node = ${$r_tree}->get_root_node;
    my @nodes = grep {! $_->is_Leaf} $root_node->get_all_Descendents;
    for (my $i=0; $i<@nodes; $i++) {
	my $id1 = $nodes[$i]->internal_id;
	my $id2 = $nodes[$i]->ancestor->internal_id;
	$nodes[$i]->id("");
# 	if (${$r_support_value}[$id1][$id2]) {
# 	    $nodes[$i]->id(${$r_support_value}[$id1][$id2]);
# 	}
    }
    my ($node1, $node2) = $root_node->each_Descendent;
    my $id1 = $node1->internal_id;
    my $id2 = $node2->internal_id;
    $node1->id(${$r_support_value}[$id1][$id2]);
    $node2->id(${$r_support_value}[$id1][$id2]);
}

sub put_node_labels {
    my ($r_tree) = @_;

    my @node = grep {! $_->is_Leaf} ${$r_tree}->get_root_node->get_all_Descendents;
    for my $node (@node) {
	my $id = $node->internal_id;
	$node->id($id);
    }
}

sub check_species_overlap {
    my ($tree_file, $i, $node1, $node2) = @_;

    my $tree = create_tree($tree_file);
    my @nodes = $tree->get_root_node->get_all_Descendents;
    my $n_seq = grep {$_->is_Leaf} @nodes;
    my $branch_length = $nodes[$i]->branch_length;
    $tree->move_id_to_bootstrap;
    my $boot = $nodes[$i]->bootstrap || "";

    # calculation
    $tree->reroot_at_midpoint($nodes[$i]);
    my $root_node = $tree->get_root_node;
    my ($sub_tree1_node, $sub_tree2_node) = $root_node->each_Descendent;
    my @leaves1 = get_sub_tree_leaves($sub_tree1_node);
    my @leaves2 = get_sub_tree_leaves($sub_tree2_node);
#     print STDERR "leaves1 @leaves1\n";
#     print STDERR "leaves2 @leaves2\n";

    my @species1 = get_species_from_leaves(@leaves1);
    my @species2 = get_species_from_leaves(@leaves2);
#     print STDERR "species1 @species1\n";
#     print STDERR "species2 @species2\n";
    my @all_species = uniq(@species1, @species2);
    my @common_species = check_redundancy(@species1, @species2);

    my $sum_of_duplication = 0;
    my $sum_of_sp_disappeared = 0;
    ($sum_of_duplication, $sum_of_sp_disappeared) = sum_of_duplication(\$tree);

    # print
    $node1 ||= "";
    $node2 ||= "";
    my $height = sprintf("%.5f", $root_node->height);
    my @len1 = get_sub_tree_branch_length($sub_tree1_node);
    my @len2 = get_sub_tree_branch_length($sub_tree2_node);
    my $len1 = mean(@len1) || 0;
    my $len2 = mean(@len2) || 0;
    $len1 = sprintf("%.5f", $len1);
    $len2 = sprintf("%.5f", $len2);
    my $len12 = mean(@len1, @len2);
    my $len_relative = 0;
    if ($len12) {
	$len_relative = $branch_length / $len12;
    }
    $len_relative = sprintf("%.5f", $len_relative);
    my $log_ratio_len = "";
    if ($len2 and $len1/$len2) {
	$log_ratio_len = log($len1/$len2)/log(2);
	$log_ratio_len = abs($log_ratio_len);
	$log_ratio_len = sprintf("%.5f", $log_ratio_len);
    }
    my $height1 = sprintf("%.5f", $sub_tree1_node->height);
    my $height2 = sprintf("%.5f", $sub_tree2_node->height);
    my $n1 = scalar(@leaves1);
    my $n2 = scalar(@leaves2);
    my $n_sp = scalar(@all_species);
    my $n_sp_common = scalar(@common_species);
    my $sp_overlap = @common_species/@all_species;
    $sp_overlap = sprintf("%.5f", $sp_overlap);
    my $sp_overlap1 = 0;
    if (@species1) {
	$sp_overlap1 =  @common_species/@species1;
    }
    my $sp_overlap2 = 0;
    if (@species2) {
	$sp_overlap2 =  @common_species/@species2;
    }
    my $max_sp_overlap_part = max($sp_overlap1, $sp_overlap2);

    my $min_height = ($height1 + $height2 + $branch_length) / 2;
    if ($min_height < $height1 and $min_height < $height2) {
	die;
    } elsif ($min_height < $height1) {
	$min_height = $height1;
    } elsif ($min_height < $height2) {
	$min_height = $height2;
    }

    my $detail = "i=$i($node1,$node2)\tb=$boot,\tl= $branch_length"
	. ", l_rel= $len_relative"
	. ", l1= $len1, l2= $len2"
	. ", |log2(l1/l2)|= $log_ratio_len,"
	. "\th= $min_height"
	# . "\th= $height"
	# . ", h1= $height1, h2= $height2"
	. ", n= $n_seq = $n1 + $n2, "
	. "o_sp=$n_sp_common/$n_sp=\t$sp_overlap\t, o_sp_part=\t$max_sp_overlap_part, n_dup=$sum_of_duplication, n_dis=$sum_of_sp_disappeared\n";

    return $sp_overlap, \@leaves1, \@leaves2, $detail
	# , $height
	, $min_height
	;
}

sub get_species_from_leaves {
    my @leaves = @_;
    
    my %species;
    for my $leaf (@leaves) {
	if ($leaf =~ /^\d+_([A-Za-z0-9]+)_/) {
	    my $sp = $1;
	    $species{$sp} = 1;
	} elsif ($leaf =~ /^([A-Za-z0-9]+)_/) {
	    my $sp = $1;
	    $species{$sp} = 1;
	} else {
	    die;
	}
    }

    return keys %species;
}

sub get_sub_tree_leaves {
    my ($sub_tree_node) = @_;

    my @leaves = grep {$_->is_Leaf} ($sub_tree_node, $sub_tree_node->get_Descendents);
    my @leaves_id = ();
    for my $leaf (@leaves) {
	push @leaves_id, $leaf->id;
    }

    return @leaves_id;
}

sub get_sub_tree_branch_length {
    my ($sub_tree_node) = @_;

    my @node = $sub_tree_node->get_Descendents;

    my @len = ();
    for my $node (@node) {
    	push @len, $node->branch_length;
    }
    return @len;

    # my $sum = 0;
    # for my $node (@node) {
    # 	$sum += $node->branch_length;
    # }

    # my $mean = 0;
    # if (@node != 0) {
    # 	$mean = $sum/@node;
    # }

    # return sprintf("%.5f", $mean);
}

sub print_tree {
    my ($r_tree) = @_;

    my $tree_io = Bio::TreeIO->new(-format=>'newick');
    $tree_io->write_tree(${$r_tree});
}

sub create_tree {
    my ($tree_file) = @_;

    my $tree_io = Bio::TreeIO->new(-format => "newick", -file => "$tree_file");
    my $tree = $tree_io->next_tree;
    
    return $tree;
}

sub my_reroot_at_midpoint {
    my ($r_tree, $node) = @_;

    my $midpoint = $node->create_node_on_branch(-FRACTION=>0.5);
    ${$r_tree}->reroot($midpoint);
}

sub escape_colon {
    my ($tmp_alignment) = @_;

    my @alignment = `cat $tmp_alignment`;
    chomp(@alignment);
    for (my $i=0; $i<@alignment; $i++) {
	if ($alignment[$i] =~ /^>\S+/) {
	    $alignment[$i] =~ s/:/_/g;
	    $alignment[$i] =~ s/\|/_/g;
	}
    }

    open(TMP_ALIGNMENT, ">$tmp_alignment") || die;
    print TMP_ALIGNMENT join("\n", @alignment), "\n";
    close(TMP_ALIGNMENT);
}

1;
