package DomRefine::Read;
use Exporter;
@ISA = qw(Exporter);
@EXPORT = qw(extract_dclst extract_dclst_by_genes extract_dclst_with_check extract_dclst_compl
	     get_annotation get_clusters get_alignment_structure get_alignment_matrices get_alignment_matrices_from_file
	     get_dclst_structure cache_file_name define_tmp_file remove_tmp_file
	     parse_dclst_to_structure decompose_gene_id get_rep_cluster map_domains decompose_dclst_line
	     save_stdin save_contents
	     create_get_j create_get_pos create_get_pos_from_a
	     aligner create_alignment
	     extract_genes get_gene_idx read_dclst_seq read_dclst_geneset read_dclst_to_seq_file
	     read_dclst_to_seq_file_domain get_seq_fast read_seq
	     get_dbh
	     mysql_gene_descr get_geneset_descr get_genes_fused get_geneset_seq
	     read_cluster_members get_cluster_count
	     output_clusters output_domains
	     get_gene_length calc_total_aa
	     md5_of_fasta
             get_dclst_of_domain read_alignment check_alignment_length summarize_amino_acid_frequency
	     );

use strict;
use Digest::MD5 qw(md5_hex);
use DomRefine::General;

$ENV{DOMREFINE_CACHE} || die "set environmental DOMREFINE_CACHE\n";
$ENV{DOMREFINE_TMP} || die "set environmental DOMREFINE_TMP\n";
$ENV{DOMREFINE_ALIGNER} || die "set environmental DOMREFINE_ALIGNER\n";

my $TMP_GET_A_CLUSTER = define_tmp_file("get_a_cluster");
my $TMP_DCLST_TO_GET_STRUCTURE = define_tmp_file("dclst_to_get_structure");
my $TMP_SEQ_TO_CREATE_ALIGNMENT = define_tmp_file("seq_to_create_alignment");
my $TMP_ALIGNMENT_TO_GET_ALIGNMENT_STRUCTURE = define_tmp_file("alignment_to_get_alignment_structure");
my $TMP_SEQ_FOR_CLUSTALW = define_tmp_file("seq_for_clustalw");
my $TMP_ALIGNMENT_BY_ALIGNER = define_tmp_file("output_for_clustalw");
END {
    remove_tmp_file($TMP_GET_A_CLUSTER);
    remove_tmp_file($TMP_SEQ_TO_CREATE_ALIGNMENT);
    remove_tmp_file($TMP_ALIGNMENT_TO_GET_ALIGNMENT_STRUCTURE);
    remove_tmp_file("$TMP_ALIGNMENT_TO_GET_ALIGNMENT_STRUCTURE.err");
    remove_tmp_file($TMP_SEQ_FOR_CLUSTALW);
    remove_tmp_file($TMP_ALIGNMENT_BY_ALIGNER);
}

################################################################################
### Function ###################################################################
################################################################################

sub aligner {
    my ($seq_file, $alignment_file) = @_;

    my $aligner = $ENV{DOMREFINE_ALIGNER};

    my $cache_file = cache_file_name(md5_of_fasta($seq_file), ".$aligner");

    if ($ENV{DOMREFINE_READ_ALIGNMENT_CACHE} and find_alignment_cache($seq_file, $cache_file)) {
	print STDERR " found alignment chache $cache_file\n";
	system "cp $cache_file $alignment_file";
	return;
    }

    my ($n_seq, $n_aa, $mean_len) = seq_file_stat($seq_file);
    printf STDERR "$aligner: n_seq=$n_seq n_aa=$n_aa mean_len=%.1f ", $mean_len;
    my $start_time = get_time;

    if ($aligner eq "mafft") {
	system "mafft --auto --anysymbol $seq_file > $alignment_file 2> $alignment_file.err";
	# system "mafft --localpair $seq_file > $alignment_file 2> $alignment_file.err";
    } elsif ($aligner eq "mafft7") {
	system "/bio/bin/mafft --auto --anysymbol $seq_file > $alignment_file 2> $alignment_file.err";
    } elsif ($aligner eq "muscle") {
	system "muscle -in $seq_file -out $alignment_file 2> $alignment_file.err";
	# system "muscle -stable -in $seq_file -out $alignment_file 2> $alignment_file.err";
    } elsif ($aligner eq "clustalo") {
	system "clustalo --auto -i $seq_file > $alignment_file 2> $alignment_file.err";
    } elsif ($aligner eq "clustalo_order") {
	system "/bio/bin/clustalo --output-order=tree-order --auto -i $seq_file > $alignment_file 2> $alignment_file.err";
    } elsif ($aligner eq "clustalw") {
	convert_seq_file($seq_file, $TMP_SEQ_FOR_CLUSTALW);
	system "clustalw2 -endgaps -infile=$TMP_SEQ_FOR_CLUSTALW -outfile=$TMP_ALIGNMENT_BY_ALIGNER -output=gde -outorder=input 2>&1 > $alignment_file.err";
    } elsif ($aligner eq "t_coffee") {
	system "t_coffee $seq_file -mode regular -outfile=$TMP_ALIGNMENT_BY_ALIGNER -output=fasta_aln > /dev/null 2> $alignment_file.err";
    } elsif ($aligner eq "t_coffee_quick") {
	system "t_coffee $seq_file -mode quickaln -outfile=$alignment_file -output=fasta_aln > /dev/null 2> $alignment_file.err";
    }

    if ($aligner =~ /^(clustalw|t_coffee|t_coffee_quick)$/) {
	convert_alignment_file($TMP_ALIGNMENT_BY_ALIGNER, $alignment_file);
    }

    my $end_time = get_time;
    printf STDERR "%d sec\n", diff_time($start_time, $end_time);

    if ($ENV{DOMREFINE_WRITE_ALIGNMENT_CACHE}) {
	print STDERR " saving alignment chache $cache_file\n";
	system "cp -a $alignment_file $cache_file";
	system "cp -a $alignment_file.err $cache_file.err";
    }
}

sub map_domains {
    my ($r_gene, $r_h_domain, $r_b, $r_d) = @_;

    my $n = @{$r_b};
    my $m = @{${$r_b}[0]};

    initialize_matrix($r_d, $n, $m, 0);
    for (my $i=0; $i<$n; $i++) {
	my $gene = ${$r_gene}[$i];
	my @domains = keys %{${$r_h_domain}{$gene}};
	for my $domain (@domains) {
	    my $begin_pos = ${$r_h_domain}{$gene}{$domain}{begin};
	    my $end_pos = ${$r_h_domain}{$gene}{$domain}{end};
	    my $cluster = ${$r_h_domain}{$gene}{$domain}{cluster};
	    my ($begin_j, $end_j) = get_j_of_domain(${$r_b}[$i], $begin_pos, $end_pos);
	    for (my $j=$begin_j; $j<=$end_j; $j++) {
		${$r_d}[$i][$j] = $cluster;
	    }
	}
    }
}

sub map_domains_one_by_one {
    my ($r_gene, $r_h_domain, $r_b, $r_d, $dclst_file) = @_;

    my $n = @{$r_b};
    my $m = @{${$r_b}[0]};

    my @domain = `cat $dclst_file | cut.sh 3`;
    chomp(@domain);

    initialize_matrix($r_d, $n, $m, 0);
    for (my $i=0; $i<$n; $i++) {
	my $gene = ${$r_gene}[$i];
	my $begin_pos = ${$r_h_domain}{$gene}{$domain[$i]}{begin};
	my $end_pos = ${$r_h_domain}{$gene}{$domain[$i]}{end};
	my $cluster = ${$r_h_domain}{$gene}{$domain[$i]}{cluster};
	my ($begin_j, $end_j) = get_j_of_domain(${$r_b}[$i], $begin_pos, $end_pos);
	for (my $j=$begin_j; $j<=$end_j; $j++) {
	    ${$r_d}[$i][$j] = $cluster;
	}
    }
}

sub get_j_of_domain {
    my ($r_b, $begin_pos, $end_pos) = @_;
    
    my $m = @{$r_b};

    my ($begin_j, $end_j);
    my ($seq_begin_j, $seq_end_j);
    
    my $count = 0;
    for (my $j=0; $j<$m; $j++) {
	if (${$r_b}[$j] == 1) {
	    $count ++;

	    if ($count == 1) {
		$seq_begin_j = $j;
	    }
	    $seq_end_j = $j;
	    
	    if ($begin_pos and $begin_pos == $count) {
		$begin_j = $j;
	    }
	    if ($end_pos and $end_pos == $count) {
		$end_j = $j;
	    }
	}
    }

    # BUG?: if (begin_pos, end_pos) is outside of b, idx is undefined
    if (! defined($begin_j) or ! defined($end_j)) {
	return ($seq_begin_j, $seq_end_j);
    }

    return ($begin_j, $end_j);
}

sub summarize_amino_acid_frequency {
    my ($r_a, $r_b, $r_p) = @_;

    my $n = @{$r_a};
    my $m = @{${$r_a}[0]};

    my %f = ();
    for (my $j=0; $j<$m; $j++) {
# 	my $count_b = 0;
	for (my $i=0; $i<$n; $i++) {
	    if (${$r_a}[$i][$j] ne "-") {
		${$r_b}[$i][$j] = 1;
# 		$count_b ++;
	    } else {
		${$r_b}[$i][$j] = 0;
	    }
	    $f{$j}{${$r_a}[$i][$j]}++;
	}
	if (! $f{$j}{"-"}) {
	    $f{$j}{"-"} = 0;
	}
	for my $a (keys %{$f{$j}}) {
	    ${$r_p}{$j}{$a} = $f{$j}{$a}/$n;
# 	    ${$r_p}{$j}{$a} = $f{$j}{$a}/$count_b;
	}
    }

    # check terminal gaps
    for (my $i=0; $i<$n; $i++) {
	my $j = 0;
	while (${$r_a}[$i][$j] eq "-" and $j<$m) {
	    ${$r_b}[$i][$j] = -1;
	    $j ++;
	}
	$j = $m - 1;
	while (${$r_a}[$i][$j] eq "-" and $j>=0) {
	    ${$r_b}[$i][$j] = -1;
	    $j --;
	}
    }

}

sub create_get_j {
    my ($r_b, $r_get_j) = @_;

    my $n = @{$r_b};
    my $m = @{${$r_b}[0]};

    for (my $i=0; $i<$n; $i++) {
	my $count = 0;
# 	print STDERR "$i\n";
	for (my $j=0; $j<$m; $j++) {
	    if (${$r_b}[$i][$j] == 1) {
		$count ++;
		${${$r_get_j}[$i]}{$count} = $j;
# 		print STDERR "$count,$j ";
	    }
	}
# 	print STDERR "\n";
    }
}

sub create_get_pos {
    my ($r_b, $r_get_pos, $r_get_len) = @_;

    my $n = @{$r_b};
    my $m = @{${$r_b}[0]};

    initialize_matrix($r_get_pos, $n, $m, ""); # Is this necessary?
    for (my $i=0; $i<$n; $i++) {
	my $count = 0;
	for (my $j=0; $j<$m; $j++) {
	    if (${$r_b}[$i][$j] == 1) {
		$count ++;
		${$r_get_pos}[$i][$j] = $count;
	    } else {
		${$r_get_pos}[$i][$j] = 0; # if it is gap
	    }
	}
	${$r_get_len}[$i] = $count;
    }
}

sub create_get_pos_from_a {
    my ($r_a, $r_get_pos, $r_get_j) = @_;

    my $n = @{$r_a};
    my $m = @{${$r_a}[0]};

    for (my $i=0; $i<$n; $i++) {
	my $count = 0;
	for (my $j=0; $j<$m; $j++) {
	    if (${$r_a}[$i][$j] ne '-') {
		$count ++;
		${$r_get_pos}[$i][$j] = $count;
		${${$r_get_j}[$i]}{$count} = $j;
	    }
	}
    }
}

sub read_dclst_geneset {
    my ($dclst_file) = @_;

    my @geneset = `cat $dclst_file | cut.sh 2`;
    chomp(@geneset);
    @geneset = sort {$a cmp $b} uniq(@geneset);

    return @geneset;
}

sub read_dclst_seq {
    my ($dclst_file, $r_seq) = @_;

    my @geneset = read_dclst_geneset($dclst_file);
    my @gene = get_genes_fused(\@geneset);
    read_seq($r_seq, \@gene);
}

sub get_genes_fused {
    my ($r_geneset) = @_;

    my @gene = ();
    for my $geneset (@{$r_geneset}) {
	for my $gene (split(/\|/, $geneset)) {
	    push @gene, $gene;
	}
    }

    return @gene;
}

sub extract_genes { # cannot tread fused genes
    my ($dclst) = @_;

    my @line = split("\n", $dclst);
    chomp(@line);

    my %gene = ();
    for my $line (@line) {
	if ($line =~ /^\S+\s([A-Za-z0-9]+[:\.]\S+)\s/) {
	    $gene{$1} = 1;
	} else {
	    die;
	}
    }

    return keys %gene;
}

sub get_clusters {
    my ($dclst_file, $sort_option) = @_;
    $sort_option ||= "";
    
    my @cluster = `cat $dclst_file | cut.sh 1 | sort -n $sort_option | uniq`; # inappropreate for cluster_id of string
    chomp(@cluster);

    return @cluster;
}

sub get_a_cluster_with_check {
    my ($dclst) = @_;

    save_contents($dclst, $TMP_GET_A_CLUSTER);

    my @cluster = get_clusters($TMP_GET_A_CLUSTER);
    if (@cluster != 1) {
	die;
    }

    return $cluster[0];
}

sub extract_dclst {
    my ($dclst_table_file, @cluster) = @_;
    
    my %hash = ();
    for my $cluster (@cluster) {
	$hash{$cluster} = 1;
    }

    my $extracted_dclst = "";
    open(DCLST_TABLE_FILE, $dclst_table_file) || die;
    while (<DCLST_TABLE_FILE>) {
	unless (/./) {
	    next;
	}
	my ($cluster_id) = split;
	if (! defined $cluster_id) {
	    die;
	}
	if ($hash{$cluster_id}) {
	    $extracted_dclst .= $_;
	}
    }
    close(DCLST_TABLE_FILE);

    return $extracted_dclst;
}

sub extract_dclst_with_check {
    my ($dclst_table_file, @cluster) = @_;

    my $dclst = "";
    for my $cluster (@cluster) {
	my $dclst_part = extract_dclst($dclst_table_file, $cluster);
	if ($dclst_part eq "") {
	    print STDERR "[$cluster] is not containded.\n";
	}
	$dclst .= $dclst_part;
    }
    
    return $dclst;
}

sub extract_dclst_compl {
    my ($dclst_file, @cluster) = @_;
    
    my %hash = ();
    for my $cluster (@cluster) {
	$hash{$cluster} = 1;
    }

    my $dclst = "";
    open(EXTRACT_DCLST_COMPL, $dclst_file) || die;
    while (<EXTRACT_DCLST_COMPL>) {
	my ($cluster_id) = split;
	if ($hash{$cluster_id}) {
	} else {
	    $dclst .= $_;
	}
    }
    close(EXTRACT_DCLST_COMPL);

    # my $pattern =  join("|", @cluster);
    # my $dclst = `cat $dclst_file | grep -P -v '^($pattern) '`;

    return $dclst;
}


sub extract_dclst_by_genes {
    my ($dclst_table_file, $r_select) = @_;

    my $extracted_dclst = "";

    my @line = `cat $dclst_table_file`;
    for my $line (@line) {
	my ($cluster, $gene) = split(/\s+/, $line);
	if (${$r_select}{$gene}) {
	    $extracted_dclst .= $line;
	}
    }

    return $extracted_dclst;
}

sub wrap_lines_for_same_cluster_and_gene {
    my ($input, $output) = @_;
    
    open(INPUT, $input) || die;
    my %hash = ();
    while (<INPUT>) {
	chomp;
	my ($cluster_id, $gene, @domain_info) = split;
	unless (@domain_info and @domain_info % 3 == 0) {
	    die;
	}
	for (my $i=0; $i<@domain_info; $i+=3) {
	    my ($domain, $begin_pos, $end_pos) = ($domain_info[$i], $domain_info[$i+1], $domain_info[$i+2]);
	    $hash{$cluster_id}{$gene}{$domain}{begin} = $begin_pos;
	    $hash{$cluster_id}{$gene}{$domain}{end} = $end_pos;
	}
    }
    close(INPUT);

    open(OUTPUT, ">$output") || die;;
    for my $cluster_id (sort {$a cmp $b} keys %hash) {
	for my $gene (sort {$a cmp $b} keys %{$hash{$cluster_id}}) {
	    my @domain_info = ();
	    for my $domain (sort {$a<=>$b} keys %{$hash{$cluster_id}{$gene}}) {
		my $begin = $hash{$cluster_id}{$gene}{$domain}{begin};
		my $end = $hash{$cluster_id}{$gene}{$domain}{end};
		push @domain_info, $domain, $begin, $end;
	    }
	    print OUTPUT "$cluster_id $gene @domain_info\n";
	}
    }
    close(OUTPUT);
}

sub get_rep_cluster {
    my ($dclst) = @_;

    my @cluster = ();
    my $flg_string = 0;
    while ($dclst =~ /^(\S+) /gm) {
	my $cluster = $1;
	push @cluster, $cluster;
	if ($cluster !~ /^\d+$/) {
	    $flg_string = 1;
	}
    }

    my @cluster_sorted = ();
    if ($flg_string) {
	@cluster_sorted = sort {$a cmp $b} @cluster;
    } else {
	@cluster_sorted = sort {$a <=> $b} @cluster;
    }

    return $cluster_sorted[0];
}

sub read_cluster_members {
    my ($dclst, $r_member) = @_;

    for my $dclst (split("\n", $dclst)) {
	my ($cluster, $gene, $domain, $start, $end) = split(/\s+/, $dclst);
	${$r_member}{$cluster}{$gene}{$domain}{start} = $start;
	${$r_member}{$cluster}{$gene}{$domain}{end} = $end;
    }
}

sub get_dclst_structure {
    my ($dclst_table_file, $h_cluster, $h_domain) = @_;

    open(DCLST, $dclst_table_file) || die "$dclst_table_file: $!";
    while (my $line = <DCLST>) {
	chomp($line);
	if ($line eq "") {
	    print STDERR "read blank line $.\n";
	    next;
	}
	# my ($cluster, $gene, @domain_info) = split /\s+/, $line;
	my ($cluster, $gene, @domain_info) = decompose_dclst_line($line);
	unless (@domain_info and @domain_info % 3 == 0) {
	    die $line;
	}
	for (my $i=0; $i<@domain_info; $i+=3) {
	    my ($domain, $begin_pos, $end_pos) = ($domain_info[$i], $domain_info[$i+1], $domain_info[$i+2]);
	    ${$h_domain}{$gene}{$domain}{cluster} = $cluster;
	    ${$h_domain}{$gene}{$domain}{begin} = $begin_pos;
	    ${$h_domain}{$gene}{$domain}{end} = $end_pos;
	    ${$h_cluster}{$cluster}{$gene} ++;
	}
    }
    close(DCLST);
}

sub parse_dclst_to_structure { # obsolete
    my ($dclst_table, $h_cluster, $h_domain) = @_;

    save_contents($dclst_table, $TMP_DCLST_TO_GET_STRUCTURE);
    get_dclst_structure($TMP_DCLST_TO_GET_STRUCTURE, $h_cluster, $h_domain);
}

sub get_cluster_count {
    my ($r_cluster_count, $r_cluster) = @_;

    for my $cluster (keys %{$r_cluster}) {
	for my $gene (keys %{${$r_cluster}{$cluster}}) {
	    ${$r_cluster_count}{$cluster} += ${$r_cluster}{$cluster}{$gene};
	}
    }
}

sub get_dclst_of_domain {
    my ($r_domain) = @_;

    my $dclst = "";
    for my $gene (sort {$a cmp $b} keys %{$r_domain}) {
	for my $domain (sort {$a <=> $b} keys %{${$r_domain}{$gene}}) {
	    my $cluster = ${$r_domain}{$gene}{$domain}{cluster};
	    my $begin = ${$r_domain}{$gene}{$domain}{begin};
	    my $end = ${$r_domain}{$gene}{$domain}{end};
	    $dclst .= "$cluster $gene $domain $begin $end\n";
	}
    }
    return $dclst;
}

sub output_clusters {
    my ($r_cluster, $r_domain) = @_;

    for my $cluster (sort {$a cmp $b} keys %{$r_cluster}) {
	for my $gene (sort {$a cmp $b} keys %{${$r_cluster}{$cluster}}) {
	    for my $domain (sort {$a <=> $b} keys %{${$r_domain}{$gene}}) {
		my $cluster_for_this_domain = ${$r_domain}{$gene}{$domain}{cluster};
		my $begin = ${$r_domain}{$gene}{$domain}{begin};
		my $end = ${$r_domain}{$gene}{$domain}{end};
		if ($cluster_for_this_domain) {
		    if ($cluster_for_this_domain == $cluster) {
			print "$cluster $gene $domain $begin $end\n";
		    }
		} else {
		    print STDERR "error: [$cluster] $gene ($domain)\n";
		}
	    }
	}
    }
}

sub output_domains {
    my ($r_domain) = @_;

    for my $gene (sort {$a cmp $b} keys %{$r_domain}) {
	for my $domain (sort {$a <=> $b} keys %{${$r_domain}{$gene}}) {
	    my $cluster = ${$r_domain}{$gene}{$domain}{cluster};
	    my $begin = ${$r_domain}{$gene}{$domain}{begin};
	    my $end = ${$r_domain}{$gene}{$domain}{end};
	    print "$cluster $gene $domain $begin $end\n";
	}
    }
}

sub calc_total_aa {
    my ($r_domain) = @_;

    my $total_aa = 0;
    for my $gene (%{$r_domain}) {
	for my $domain (keys %{${$r_domain}{$gene}}) {
	    my $begin = ${$r_domain}{$gene}{$domain}{begin};
	    my $end = ${$r_domain}{$gene}{$domain}{end};
	    my $len = ($end - $begin + 1);
	    $total_aa += $len;
	}
    }

    return $total_aa;
}

sub read_alignment_list {
    my ($file, $r_a) = @_;

    my $alignment = "";
    open(TMP_ALIGNMENT, $file) || die;
    while (<TMP_ALIGNMENT>) {
	chomp;
	if (/^>(.*)/) {
	    if ($alignment) {
		$alignment =~ s/\s//g;
		push @{$r_a}, $alignment;
		$alignment = "";
	    }
	} else {
	    $alignment .= $_;
	}
    }
    $alignment =~ s/\s//g;
    push @{$r_a}, $alignment;
    close(TMP_ALIGNMENT);
    return @{$r_a};
}

sub read_alignment {
    my ($file, $r_a) = @_;

    my $alignment = "";
    open(F, $file) || die;
    while (<F>) {
	chomp;
	if (/^>(.*)/) {
	    if ($alignment) {
		$alignment =~ s/\s//g;
		my @a = split //, $alignment;
		push @{$r_a}, \@a;
		$alignment = "";
	    }
	} else {
	    $alignment .= $_;
	}
    }
    $alignment =~ s/\s//g;
    my @a = split //, $alignment;
    push @{$r_a}, \@a;
    return @{$r_a};
}

sub read_alignment_for_gene {
    my ($file, $r_a, $r_gene, $r_gene_list) = @_;

    @{$r_gene} = ();
    my $alignment = "";
    my $entry_id = "";
    open(F, $file) || die;
    while (<F>) {
	chomp;
	if (/^>(\S+)/) {
	    $entry_id = $1;
	    if ($alignment) {
		$alignment =~ s/\s//g;
		my @a = split //, $alignment;
		push @{$r_a}, \@a;
		$alignment = "";
	    }
	    if (${$r_gene_list}{$entry_id}) {
		push @{$r_gene}, $entry_id;
	    }
	} elsif (/^>/) {
	    die;
	} else {
	    if ($entry_id ne "") {
		if (${$r_gene_list}{$entry_id}) {
		    $alignment .= $_;
		}
	    } else {
		die;
	    }
	}
    }
    if ($alignment) {
	$alignment =~ s/\s//g;
	my @a = split //, $alignment;
	push @{$r_a}, \@a;
    }
    return @{$r_a};
}

sub read_seq {
    my ($r_seq, $r_gene) = @_;

    my %seq_to_read = ();
    for my $gene (@{$r_gene}) {
	my ($sp, $name) = decompose_gene_id($gene);
	$seq_to_read{$sp}{$gene} = 1;
    }

    if (-f $ENV{DOMREFINE_SEQ_DB}) {
	print STDERR " read $ENV{DOMREFINE_SEQ_DB} ..\n";
	open(BLDB, "$ENV{DOMREFINE_SEQ_DB}") || die;
	my $read_flg = 0;
	my $gene;
	my $organism_to_read;
	while (<BLDB>) {
	    # get gene and set read_flg
	    if (/^>(\S+)/) {
		$gene = $1;
		$read_flg = 0;
		for my $organism (keys %seq_to_read) {
		    if ($seq_to_read{$organism}{$gene}) {
			# duplicated sequence will be error
			if (${$r_seq}{$organism}{$gene}) {
			    die;
			}
			$read_flg ++;
			$organism_to_read = $organism;
			${$r_seq}{$organism_to_read}{$gene} = "";
		    }
		}
		# error handling
		if ($read_flg >= 2) {
		    die;
		}
	    }
	    if ($read_flg) {
		${$r_seq}{$organism_to_read}{$gene} .= $_;
	    }
	}
	close(BLDB);
    } elsif (-d $ENV{DOMREFINE_SEQ_DB}) {
	print STDERR " read directory $ENV{DOMREFINE_SEQ_DB} ..\n";
	for my $organism (keys %seq_to_read) {
	    my $seq_file = "";
	    if (-f "$ENV{DOMREFINE_SEQ_DB}/$organism") {
		$seq_file = "$ENV{DOMREFINE_SEQ_DB}/$organism";
	    } elsif (-f "$ENV{DOMREFINE_SEQ_DB}/$organism.fa") {
		$seq_file = "$ENV{DOMREFINE_SEQ_DB}/$organism.fa";
	    } else {
		die "can't open $organism";
	    }
	    open(BLDB, "$seq_file") || die "can't open $seq_file";
	    my $read_flg = 0;
	    my $gene_to_read;
	    while (<BLDB>) {
		if (/^>(\S+)/) {
		    my $gene = $1;
		    if ($gene =~ /^(\S+):\S+:(\S+)$/) {
			$gene = "$1:$2";
		    }
		    $read_flg = 0;
		    if ($seq_to_read{$organism}{$gene}) {
			# duplicated sequence will be error
			if (${$r_seq}{$organism}{$gene}) {
			    die;
			}
			$read_flg = 1;
			$gene_to_read = $gene;
			${$r_seq}{$organism}{$gene_to_read} = "";
		    }
		}
		if ($read_flg) {
		    ${$r_seq}{$organism}{$gene_to_read} .= $_;
		}
	    }
	    close(BLDB);
	}
    } else {
	die "\nInvalid value for DOMREFINE_SEQ_DB";
    }
}

sub get_gene_length {
    my ($h_seq, $gene) = @_;

    my ($sp, $name) = decompose_gene_id($gene);
    my $seq = ${$h_seq}{$sp}{$gene};
    $seq =~ s/^>.*//;
    $seq =~ s/\s+//g;

    return length($seq);
}

sub get_seq_fast {
    my ($r_gene, $tmp_seq, $dbh) = @_;

    open(TMP_SEQ, ">$tmp_seq") || die;
    for (my $i=0; $i<@{$r_gene}; $i++) {
	my $seq = get_seq_mysql($dbh, ${$r_gene}[$i]);
	if ($seq) {
	    print TMP_SEQ $seq;
	}
    }
    close(TMP_SEQ);
}

sub read_dclst_to_seq_file {
    my ($dclst_file, $tmp_seq) = @_;

    my %seq = ();
    my @geneset = read_dclst_geneset($dclst_file);
    my @gene = get_genes_fused(\@geneset);
    read_seq(\%seq, \@gene);

    open(TMP_SEQ, ">$tmp_seq") || die;
    for my $geneset (@geneset) {
	my $seq = get_geneset_seq($geneset, \%seq);
	if ($seq) {
	    print TMP_SEQ $seq;
	}
    }
    close(TMP_SEQ);
}

sub read_dclst_to_seq_file_region {
    my ($dclst_file, $tmp_seq, $r_before_seq, $r_after_seq) = @_;

    my %seq = ();
    my @geneset = read_dclst_geneset($dclst_file);
    my @gene = get_genes_fused(\@geneset);
    read_seq(\%seq, \@gene);

    my @dclst_line = `cat $dclst_file`;
    open(TMP_SEQ, ">$tmp_seq") || die "cannot create $tmp_seq";
    for (my $i=0; $i<@geneset; $i++) {
	my $seq = get_geneset_seq($geneset[$i], \%seq);
	if ($seq) {
	    my ($start, $end) = extract_region($dclst_file, $geneset[$i]);
	    my ($header, $region_seq, $before_seq, $after_seq) = trim_to_region($seq, $start, $end);
	    ${$r_before_seq}{$geneset[$i]} = $before_seq;
	    ${$r_after_seq}{$geneset[$i]} = $after_seq;
	    print TMP_SEQ "$header\n$region_seq\n";
	} else {
	    print STDERR "WARNING: cant get seq for $geneset[$i]\n";
	}
    }
    close(TMP_SEQ);
}

sub read_dclst_to_seq_file_domain {
    my ($dclst_file, $tmp_seq, $r_before_seq, $r_after_seq) = @_;
    
    my %seq = ();
    my @geneset = `cat $dclst_file | cut.sh 2`;
    chomp(@geneset);
    my @gene = get_genes_fused(\@geneset);
    read_seq(\%seq, \@gene);

    my @dclst_line = `cat $dclst_file`;
    open(TMP_SEQ, ">$tmp_seq") || die;
    for (my $i=0; $i<@geneset; $i++) {
	my $seq = get_geneset_seq($geneset[$i], \%seq);
	if ($seq) {
	    my ($cluster, $gene, $domain, $start, $end) = decompose_dclst_line($dclst_line[$i]);
	    my ($header, $region_seq, $before_seq, $after_seq) = trim_to_region($seq, $start, $end);
	    ${$r_before_seq}[$i] = $before_seq;
	    ${$r_after_seq}[$i] = $after_seq;
	    print TMP_SEQ "$header $domain\n$region_seq\n";
	} else {
	    print STDERR "WARNING: cant get seq for $geneset[$i]\n";
	}
    }
    close(TMP_SEQ);
}

sub decompose_dclst_line {
    my ($dclst_line) = @_;

    chomp($dclst_line);
    if ($dclst_line =~ /^(\S+)\s(\S+)\s(\S+)\s(\d+)\s(\d+)/) {
	my @f = split(/\s+/, $dclst_line);
	return @f;
    } elsif ($dclst_line =~ /^(\S+)\s(\S+)\s(\d+)\s(\d+)$/) { # eggNOG member format
	my ($cluster, $gene, $start, $end) = ($1, $2, $3, $4);
	my $domain = 0;
	return ($cluster, $gene, $domain, $start, $end);
    }
}

sub get_geneset_seq {
    my ($geneset, $r_seq) = @_;

    my $seq_fused = "";
    my @genes_fused = ();
    for my $gene (split(/\|/, $geneset)) {
	my $seq = get_gene_seq($gene, $r_seq);
	if ($seq) {
	    push @genes_fused, $gene;
	    $seq_fused .= $seq;
	}
    }
    $seq_fused =~ s/^>.*//gm;
    $seq_fused =~ s/^\s*\n//gm;

    if ($seq_fused) {
	my $genes_fused = join("|", @genes_fused);
	return ">$genes_fused\n$seq_fused\n";
    } else {
	return "";
    }
}

sub get_gene_seq {
    my ($gene, $r_seq) = @_;

    my ($sp, $name) = decompose_gene_id($gene);

    my $seq = ${$r_seq}{$sp}{$gene};
    if ($seq) {
	return $seq;
    } else {
	print STDERR "Warning: no seq for $gene, thus drop the seq.\n";
	return "";
    }
}

sub get_annotation {
    my ($annotation_file, $r_annotation) = @_;

    if (-f $annotation_file) {
	open(ANNOT, $annotation_file) || die;
	while (<ANNOT>) {
	    chomp;
	    my ($key, $annotation) = split("\t", $_);
	    if ($key and $annotation) {
		if (${$r_annotation}{$key}) {
		    print STDERR "Warning: duplicated annotation for $key\n";
		    ${$r_annotation}{$key} .= "; $annotation";
		}
		${$r_annotation}{$key} = $annotation;
	    }
	}
	close(ANNOT);
    }
}

sub check_alignment_length {
    my ($r_a) = @_;

    my $length = @{${$r_a}[0]};
    for (my $i=0; $i<@{$r_a}; $i++) {
	my $length2 = @{${$r_a}[$i]};
	if ($length2 != $length) {
	    die "$length != $length2";
	}
    }

    return $length;
}

sub md5_of_fasta {
    my ($fasta) = @_;

    my $sequences = `cat $fasta`;
    # $sequences =~ s/^>.*/>/gm;
    # $sequences =~ s/[-\s]//g;
    my $md5_value = md5_hex($sequences);
    
    return $md5_value;
}

sub convert_seq_file {
    my ($seq_file1, $seq_file2) = @_;

    my $seq = `cat $seq_file1`;
    $seq =~ s/^>.*\s(\S+:\S+)\s.*/>$1/mg;

    open(SEQ_FILE2, "> $seq_file2") || die;
    print SEQ_FILE2 $seq;
    close(SEQ_FILE2);
}

sub convert_alignment_file {
    my ($file1, $file2) = @_;

    my $alignment = `cat $file1`;
    $alignment =~ s/^>(\w+?)_(\S+)/>$1:$2/mg;
    $alignment =~ s/^%(\w+?)_(\S+)/>$1:$2/mg;

    open(ALIGNMENT_FILE2, "> $file2") || die;
    print ALIGNMENT_FILE2 $alignment;
    close(ALIGNMENT_FILE2);
}

sub find_alignment_cache {
    my ($fasta, $cache_file) = @_;

    if (! -f $cache_file) {
	return 0;
    }

    my $sequences = fasta_to_tsv($fasta);
    my $alignment = fasta_to_tsv($cache_file);

    if ($sequences ne $alignment) {
	print STDERR "WARNING: inconsistent cache_file $cache_file\n";
	return 0;
    }
    
    return 1;
}

sub fasta_to_tsv {
    my ($fasta) = @_;

    my @tsv = ();
    my $tmp = "";
    open(FASTA_TO_TSV, $fasta) || die;
    while (<FASTA_TO_TSV>) {
	chomp;
	s/[-\s]//g;
	if (/^>/) {
	    if ($tmp ne "") {
		push @tsv, $tmp;
	    }
	    $tmp = "$_\t";
	    # $tmp = "";
	} else {
	    $tmp .= $_;
	}
    }
    push @tsv, $tmp;
    close(FASTA_TO_TSV);

    @tsv = sort { $a cmp $b } @tsv;
    return join("\n", @tsv);
}

sub get_gene_idx {
    my ($r_gene, $r_gene_idx) = @_;
    
    for (my $i=0; $i<@{$r_gene}; $i++) {
	my $gene = ${$r_gene}[$i];
	${$r_gene_idx}{$gene} = $i;
    }
}

sub get_alignment_matrices {
    my ($dclst_file, $r_gene, $r_a, $r_b, $r_d, $r_p, $h_cluster, $h_domain, $r_get_j, %opt) = @_;

    print STDERR "\nAlignment..\n";
    my ($n, $m) = get_alignment_structure($dclst_file, $r_gene, $r_a, $r_b, $r_p, %opt);
    print STDERR " n_seq=$n n_pos=$m\n";

    get_dclst_structure($dclst_file, $h_cluster, $h_domain);
    if ($opt{domain}) {
	map_domains_one_by_one($r_gene, $h_domain, $r_b, $r_d, $dclst_file);
    } else {
	map_domains($r_gene, $h_domain, $r_b, $r_d);
    }
    create_get_j($r_b, $r_get_j);

    return ($n, $m);
}

sub get_alignment_matrices_from_file {
    my ($dclst_file, $tmp_alignment_file, $r_gene, $r_a, $r_b, $r_d, $r_p, $h_cluster, $h_domain, $r_get_j, %opt) = @_;

    my ($n, $m) = get_alignment_structure_from_file($tmp_alignment_file, $r_gene, $r_a, $r_b, $r_p, %opt);
    print STDERR " n_seq=$n n_pos=$m\n";

    get_dclst_structure($dclst_file, $h_cluster, $h_domain);
    if ($opt{domain}) {
	map_domains_one_by_one($r_gene, $h_domain, $r_b, $r_d, $dclst_file);
    } else {
	map_domains($r_gene, $h_domain, $r_b, $r_d);
    }
    create_get_j($r_b, $r_get_j);

    return ($n, $m);
}

sub get_alignment_structure {
    my ($dclst_file, $r_gene, $r_a, $r_b, $r_p, %opt) = @_;

    create_alignment($dclst_file, $r_gene, $TMP_ALIGNMENT_TO_GET_ALIGNMENT_STRUCTURE, %opt);

    my $n_seq = read_alignment($TMP_ALIGNMENT_TO_GET_ALIGNMENT_STRUCTURE, $r_a);
    my $n_pos = check_alignment_length(\@{$r_a});
    if (@{$r_gene} != @{$r_a}) {
	die scalar(@{$r_gene}) . " != " . scalar(@{$r_a});
    }
    print STDERR " n_seq=$n_seq, n_pos=$n_pos\n";

    summarize_amino_acid_frequency($r_a, $r_b, $r_p);

    return (scalar(@{$r_a}), scalar(@{${$r_a}[0]}));
}

sub get_alignment_structure_from_file {
    my ($tmp_alignment_file, $r_gene, $r_a, $r_b, $r_p, %opt) = @_;

    my $r_gene_list = $opt{gene_list};
    my $n_seq;
    if (%{$r_gene_list}) {
	$n_seq = read_alignment_for_gene($tmp_alignment_file, $r_a, $r_gene, $r_gene_list);
    } else {
	$n_seq = read_alignment($tmp_alignment_file, $r_a);
    }
    my $n_pos = check_alignment_length(\@{$r_a});
    if (@{$r_gene} != @{$r_a}) {
	die scalar(@{$r_gene}) . " != " . scalar(@{$r_a});
    }
    print STDERR " n_seq=$n_seq, n_pos=$n_pos\n";

    summarize_amino_acid_frequency($r_a, $r_b, $r_p);

    return (scalar(@{$r_a}), scalar(@{${$r_a}[0]}));
}

sub create_alignment {
    my ($dclst_file, $r_gene, $alignment_file, %opt) = @_;

    # prepare seq file
    my %before_seq = ();
    my %after_seq = ();
    my @before_seq = ();
    my @after_seq = ();
    if ($opt{region} or $opt{REGION}) {
	print STDERR "read region seq\n";
	read_dclst_to_seq_file_region($dclst_file, $TMP_SEQ_TO_CREATE_ALIGNMENT, \%before_seq, \%after_seq);
    } elsif ($opt{domain} or $opt{DOMAIN}) {
	print STDERR "read domain seq\n";
	read_dclst_to_seq_file_domain($dclst_file, $TMP_SEQ_TO_CREATE_ALIGNMENT, \@before_seq, \@after_seq);
    } else {
	print STDERR "read gene seq\n";
	read_dclst_to_seq_file($dclst_file, $TMP_SEQ_TO_CREATE_ALIGNMENT);
    }

    # create alignment file
    my @line = `cat $TMP_SEQ_TO_CREATE_ALIGNMENT | grep '^>'`;
    if (@line == 1) {
	system "cp $TMP_SEQ_TO_CREATE_ALIGNMENT $alignment_file";
    } elsif (@line >= 2) {
	aligner($TMP_SEQ_TO_CREATE_ALIGNMENT, $alignment_file);
    } else {
	die "no seq to align";
    }

    # post process
    if ($opt{region}) {
	append_seq_to_alignments($alignment_file, \%before_seq, \%after_seq);
    } elsif ($opt{domain}) {
	append_seq_to_alignments_array($alignment_file, \@before_seq, \@after_seq);
    }
    read_fasta_entries($alignment_file, $r_gene);


    if ($opt{DOMAIN}) {
	add_domain_number($alignment_file);
    }
}

sub add_domain_number {
    my ($tmp_alignment) = @_;

    my @alignment = `cat $tmp_alignment`;
    chomp(@alignment);

    for (my $i=0; $i<@alignment; $i++) {
	if ($alignment[$i] =~ /^>\S+/) {
	    if ($alignment[$i] =~ /^>\S+ \S+/) {
		$alignment[$i] =~ s/^>(\S+) (\S+)/>$2_$1/;
	    } else {
		die $alignment[$i];
	    }
	}
    }
    
    # overwite tmp_alignment
    open(TMP_ALIGNMENT, ">$tmp_alignment") || die;
    print TMP_ALIGNMENT join("\n", @alignment), "\n";
    close(TMP_ALIGNMENT);
}

sub read_fasta_entries {
    my ($fasta_file, $r_gene) = @_;
    
    my @line = `cat $fasta_file | grep '^>'`;
    chomp(@line);
    for my $line (@line) {
	if ($line =~ /^>(\S+)/) {
	    my $gene = $1;
	    push @{$r_gene}, $gene;
	} else {
	    die;
	}
    }
}

sub append_seq_to_alignments {
    my ($tmp_alignment, $r_before_seq, $r_after_seq) = @_;

    my @alignment = ();
    read_alignment_list($tmp_alignment, \@alignment);

    my @gene = ();
    read_fasta_entries($tmp_alignment, \@gene);

    # additional lengths needed (before and after)
    my $max_before_len = 0;
    for (my $i=0; $i<@alignment; $i++) {
	if ($alignment[$i] =~ /^(-*)/) {
	    my $start_gap_len = length($1);
	    my $before_seq_len = length(${$r_before_seq}{$gene[$i]});
	    if ($before_seq_len > $start_gap_len) {
		my $required_len = $before_seq_len - $start_gap_len;
		if ($required_len > $max_before_len) {
		    $max_before_len = $required_len;
		}
	    }
	}
    }
    my $max_after_len = 0;
    for (my $i=0; $i<@alignment; $i++) {
	if ($alignment[$i] =~ /[^-](-*)$/) {
	    my $end_gap_len = length($1);
	    my $after_seq_len = length(${$r_after_seq}{$gene[$i]});
	    if ($after_seq_len > $end_gap_len) {
		my $required_len = $after_seq_len - $end_gap_len;
		if ($required_len > $max_after_len) {
		    $max_after_len = $required_len;
		}
	    }
	}
    }

    # append before_seq and after_seq
    open(TMP_ALIGNMENT, ">$tmp_alignment") || die;
    for (my $i=0; $i<@alignment; $i++) {
	my $alignment = "-" x $max_before_len . $alignment[$i] . "-" x $max_after_len;
	if ($alignment =~ /^(-*)/) {
	    my $start_gap_len = length($1);
	    $alignment =~ s/^-*//;
	    my $start_gap_len_required = $start_gap_len - length(${$r_before_seq}{$gene[$i]});
	    $alignment = "-" x $start_gap_len_required . ${$r_before_seq}{$gene[$i]} . $alignment;
	}
	if ($alignment =~ /[^-](-*)$/) {
	    my $end_gap_len = length($1);
	    $alignment =~ s/([^-])-*$/$1/;
	    my $end_gap_len_required = $end_gap_len - length(${$r_after_seq}{$gene[$i]});
	    $alignment = $alignment . ${$r_after_seq}{$gene[$i]} . "-" x $end_gap_len_required;
	}
	print TMP_ALIGNMENT ">", $gene[$i], "\n";
	print TMP_ALIGNMENT $alignment, "\n";
    }
    close(TMP_ALIGNMENT);
}

sub append_seq_to_alignments_array {
    my ($tmp_alignment, $r_before_seq, $r_after_seq) = @_;

    my @alignment = ();
    read_alignment_list($tmp_alignment, \@alignment);

    my @gene = ();
    read_fasta_entries($tmp_alignment, \@gene);

    # additional lengths needed (before and after)
    my $max_before_len = 0;
    for (my $i=0; $i<@alignment; $i++) {
	if ($alignment[$i] =~ /^(-*)/) {
	    my $start_gap_len = length($1);
	    my $before_seq_len = length(${$r_before_seq}[$i]);
	    if ($before_seq_len > $start_gap_len) {
		my $required_len = $before_seq_len - $start_gap_len;
		if ($required_len > $max_before_len) {
		    $max_before_len = $required_len;
		}
	    }
	}
    }
    my $max_after_len = 0;
    for (my $i=0; $i<@alignment; $i++) {
	if ($alignment[$i] =~ /[^-](-*)$/) {
	    my $end_gap_len = length($1);
	    my $after_seq_len = length(${$r_after_seq}[$i]);
	    if ($after_seq_len > $end_gap_len) {
		my $required_len = $after_seq_len - $end_gap_len;
		if ($required_len > $max_after_len) {
		    $max_after_len = $required_len;
		}
	    }
	}
    }

    # append before_seq and after_seq
    open(TMP_ALIGNMENT, ">$tmp_alignment") || die;
    for (my $i=0; $i<@alignment; $i++) {
	my $alignment = "-" x $max_before_len . $alignment[$i] . "-" x $max_after_len;
	if ($alignment =~ /^(-*)/) {
	    my $start_gap_len = length($1);
	    $alignment =~ s/^-*//;
	    my $start_gap_len_required = $start_gap_len - length(${$r_before_seq}[$i]);
	    $alignment = "-" x $start_gap_len_required . ${$r_before_seq}[$i] . $alignment;
	}
	if ($alignment =~ /[^-](-*)$/) {
	    my $end_gap_len = length($1);
	    $alignment =~ s/([^-])-*$/$1/;
	    my $end_gap_len_required = $end_gap_len - length(${$r_after_seq}[$i]);
	    $alignment = $alignment . ${$r_after_seq}[$i] . "-" x $end_gap_len_required;
	}
	print TMP_ALIGNMENT ">", $gene[$i], "\n";
	print TMP_ALIGNMENT $alignment, "\n";
    }
    close(TMP_ALIGNMENT);
}

sub decompose_gene_id {
    my ($gene_id) = @_;

    if (! defined $gene_id) {
	die;
    }
    
    my ($sp, $name);
    if ($gene_id =~ /^[A-Za-z0-9]+:/) {
	my @x = split(":", $gene_id);
	if (@x == 2) {
	    ($sp, $name) = @x;
	} elsif (@x == 3) {
	    ($sp, $name) = @x[0,2]; # bug ?
	} else {
	    die;
	}
    } elsif ($gene_id =~ /^(\d+)\.(\S+)/) { # eggNOG
	($sp, $name) = ($1, $2);	
    } elsif ($gene_id =~ /^(\w+?)_(\S+)/) {
	($sp, $name) = ($1, $2);	
    } else {
	die $gene_id;
    }

    return ($sp, $name);
}

sub extract_region {
    my ($tmp_dclst_file, $geneset) = @_;

    my @start = ();
    my @end = ();
    open(EXTRACT_REGION_DCLST, "$tmp_dclst_file") || die;
    while (<EXTRACT_REGION_DCLST>) {
	chomp;
	my ($cluster, $gene, @domain_info) = split;
	unless (@domain_info and @domain_info % 3 == 0) {
	    die $_;
	}
	if ($gene eq $geneset) {
	    for (my $i=0; $i<@domain_info; $i+=3) {
		my ($domain, $begin_pos, $end_pos) = ($domain_info[$i], $domain_info[$i+1], $domain_info[$i+2]);
		push @start, $begin_pos;
		push @end, $end_pos;
	    }
	}
    }
    close(EXTRACT_REGION_DCLST);
    my $start = min(@start);
    my $end = max(@end);
    if ($start > $end) {
	die;
    }

    return ($start, $end);
}

sub trim_to_region {
    my ($seq, $start, $end) = @_;

    if ($seq =~ /^(>.*?)\n/) {
	my $header = $1;
	$seq =~ s/^.*?\n//;
	$seq =~ s/\s//g;
	my $seq_len = length($seq);
	if ($seq_len < $start || $seq_len < $end) {
	    print STDERR "$start-$end\n$header\n$seq";
	}
	my $before_seq = substr($seq, 0, $start-1);
	my $region_seq = substr($seq, $start-1, $end-$start+1);
	my $after_seq = substr($seq, $end);
	return ($header, $region_seq, $before_seq, $after_seq);
    } else {
	die;
    }
}

sub get_dbh {
    use DBI;
    my $dbh = DBI->connect("DBI:mysql:mbgd:localhost;mysql_socket=/tmp/mysql.sock", "chiba", "chiba"
			    , {'RaiseError' => 1}
			    );
    return $dbh;
}

sub define_tmp_file {
    my ($prefix) = @_;
    
    my $tmp_file = "$ENV{DOMREFINE_TMP}/$ENV{HOSTNAME}.$$.$prefix";
    
    return $tmp_file;
}

sub remove_tmp_file {
    my ($file) = @_;
    
    if ($file and -f $file) {
	unlink $file;
    }
}

sub cache_file_name {
    my ($md5_string, $suffix) = @_;

    my $dir = substr($md5_string, 0, 2);

    my $cache_file = "$ENV{DOMREFINE_CACHE}/$dir/${md5_string}${suffix}";

    return $cache_file;
}

sub save_stdin {
    my ($file) = @_;

    my $contents;
    if (defined $file) {
	system "cat > $file";
	$contents = `cat $file`;
    } else {
	$contents = `cat`;
    }

    return $contents;
}

sub save_contents {
    my ($contents, $file) = @_;
    
    open(FILE, ">$file") || die;
    print FILE $contents;
    close(FILE);
}

sub get_geneset_descr {
    my ($geneset, %opt) = @_;
    my $r_gene_descr = $opt{r_gene_descr};

    my @gene = split(/\|/, $geneset);

    my @descr = ();
    for (my $i=0; $i<@gene; $i++) {
	if ($opt{mysql}) {
	    $descr[$i] = mysql_gene_descr($gene[$i]);
	} elsif (%{$r_gene_descr}) {
	    my $r_gene_descr = $opt{r_gene_descr};
	    if (${$r_gene_descr}{$gene[$i]}) {
		$descr[$i] = ${$r_gene_descr}{$gene[$i]};
	    } elsif ($gene[$i] =~ /^([A-Za-z0-9]+):(\S+)$/) {
		my ($sp_lc, $name_uc) = (lc($1), uc($2));
		if (${$r_gene_descr}{"$sp_lc:$name_uc"}) {
		    $descr[$i] = ${$r_gene_descr}{"$sp_lc:$name_uc"};
		} else {
		    $descr[$i] = "NOT_FOUND";
		}
	    } else {
		die;
	    }
	}
    }

    return join(" | ", @descr);
}

sub mysql_gene_descr {
    my ($gene) = @_;

    my ($sp, $name) = decompose_gene_id($gene);

    my $dbh = get_dbh();

    my $r_r_descr = $dbh->selectall_arrayref("select descr from gene where sp='$sp' and name='$name'");
    if (@{$r_r_descr} != 1 or @{${$r_r_descr}[0]} != 1) {
	print STDERR "Cannot find annotation for $gene\n";
# 	die;
    }

    my $descr = ${$r_r_descr}[0][0];
    if (! $descr) {
	$descr = "NOT_FOUND";
    }

    return $descr;
}

sub get_seq_mysql {
    my ($dbh, $gene) = @_;

    my ($sp, $name) = decompose_gene_id($gene);

    my $r_r_id = $dbh->selectall_arrayref("select aaseq from gene where sp='$sp' and name='$name'");
    if (@{$r_r_id} != 1 or @{${$r_r_id}[0]} != 1) {
	die;
    }
    my $id = ${$r_r_id}[0][0];

    my $r_r_seq = $dbh->selectall_arrayref("select seq from proteinseq where id=$id");
    if (@{$r_r_seq} != 1 or @{${$r_r_seq}[0]} != 1) {
	die;
    }
    my $seq = ${$r_r_seq}[0][0];

    $seq =~ s/(.{1,60})/$1\n/g;
    return ">$gene\n$seq\n";
}

1;
