#!/usr/bin/perl -s
use strict;
use MBGD;
use KeggOrthology;
use MBGD::FunctionCategory;
require "MBGD_commonPath.pl";

###############################################################################
# Look up a gene by synonym in the parsed GenBank entries of one species.
#
# Arguments:
#   $refGbk  - hash ref: species code => array ref of parsed GenBank objects;
#              each object is read as $obj->{'data'}->{'features'}, a list of
#              features with 'keyname' and 'attr' entries
#   $sp      - species code used as key into $refGbk
#   $orf     - gene name to look for (compared case-insensitively)
#   $orfOrig - original ORF name; not used for matching (kept for callers
#              and for debugging)
#
# Returns the 'locus_tag' attribute of the first "gene" feature whose note
# starts with "synonym:"/"synonyms:" and lists $orf, or '' if there is none.
sub searchGbkSyn {
    my($refGbk) = shift;
    my($sp) = shift;
    my($orf) = shift;
    my($orfOrig) = shift;

    # Synonym tokens are never empty, so an empty name cannot match.
    return '' if (!defined($orf) || $orf eq '');

    # The caller in this file may hand over a name whose apostrophe was
    # backslash-escaped for SQL; undo that so the literal comparison below
    # sees the plain name.
    (my $wanted = $orf) =~ s#\\'#'#g;

    # Match the name literally: \Q...\E keeps regex metacharacters in gene
    # names (".", "(", "+", ...) from being interpreted as pattern syntax.
    my($reWanted) = qr/^\Q$wanted\E$/i;

    foreach my $refSp (@{$refGbk->{$sp}}) {
        foreach my $feat (@{$refSp->{'data'}->{'features'}}) {
            next if ($feat->{'keyname'} !~ /gene/);

            # Only the 'note' attribute can carry the synonym list.
            next if (!exists($feat->{'attr'}->{'note'}));
            my($note) = $feat->{'attr'}->{'note'};
            next if ($note !~ /^synonyms?\:/);

            (my $syn = $note) =~ s#^synonyms?\:\s*##;
            foreach my $s (split(/[\s\,\;]+/, $syn)) {
                next if ($s eq '');
                if ($s =~ $reWanted) {
                    return $feat->{'attr'}->{'locus_tag'};
                }
            }
        }
    }

    return '';
}

###############################################################################
# Look up a gene by protein sequence in the parsed GenBank entries of one
# species.
#
# Arguments:
#   $refGbk  - hash ref: species code => array ref of parsed GenBank objects
#              (read as $obj->{'data'}->{'features'})
#   $sp      - species code used as key into $refGbk
#   $seq     - sequence to compare against each CDS 'translation' attribute
#              (whole-string, case-insensitive)
#   $orfOrig - original ORF name, used only in the log messages
#
# Returns the 'locus_tag' of the matching CDS feature, or '' if none matches.
# When several CDS features match, a warning is written to STDERR and the
# locus_tag of the last match is returned.
sub searchGbkSeq {
    my($refGbk) = shift;
    my($sp) = shift;
    my($seq) = shift;
    my($orfOrig) = shift;

    # Compare the sequence literally: \Q...\E prevents characters such as
    # "*", "." or "(" in $seq from being treated as regex syntax.  The
    # pattern is compiled once here instead of once per feature.
    my($reSeq) = qr/^\Q$seq\E$/i;

    my($locus_tag) = '';
    my($n) = 0;
    foreach my $refSp (@{$refGbk->{$sp}}) {
        foreach my $feat (@{$refSp->{'data'}->{'features'}}) {
            next if ($feat->{'keyname'} !~ /CDS/);
            next if (!exists($feat->{'attr'}->{'translation'}));

            if ($feat->{'attr'}->{'translation'} =~ $reSeq) {
                print STDERR "MATCH :: $sp:SEQ($orfOrig)\n";
                $locus_tag = $feat->{'attr'}->{'locus_tag'};
                $n++;
            }
        }
    }

    if (1 < $n) {
        print STDERR "WARNING :: matched $n times\n";
    }

    return $locus_tag;
}


###############################################################################
# Script entry point (runs only when this file is executed directly).
#
# For every target species (all species known to the KeggOrthology object,
# or the codes given on the command line) this writes ./geneFunction.<sp>.kegg
# with one tab-separated line "<sp> <gene name> kegg <function level>" per
# KEGG class assigned to a gene.  Each KEGG ORF is resolved to a gene name
# by trying, in order:
#   1. the 'gene' table by name (as is, without the first '_', without a
#      trailing ".<digits>"),
#   2. the 'gene' table by the name stored in NAME_P2S,
#   3. the synonym notes of the species' GenBank files (searchGbkSyn),
#   4. the protein sequence fetched from the KEGG web site, compared with the
#      GenBank CDS translations (searchGbkSeq).
# Totals (TOTAL/OK/NG/SKIP) are printed to STDOUT at the end.
if ($0 eq __FILE__) {
    my($ok) = 0;
    my($ng) = 0;
    my($skip) = 0;
    my($n) = 0;
    my($refGbk) = {};

    my($dbType) = 'kegg';
    my($funcCat) = MBGD::FunctionCategory->new($dbType);
    my($kegg) = KeggOrthology->new();

    my($dbname) = $main::DBNAME_ACCUM;
    my($db) = MBGD::DB->new($dbname);

    my(@spList) = keys(%{$kegg->{'SPNAME2ENTRY'}});
    if (scalar(@ARGV) != 0) {
        @spList = @ARGV;
    }

    # Escape a value for use inside a single-quoted SQL string literal.
    # Backslash escaping is what the original code used for apostrophes
    # (presumably a MySQL-style server -- confirm); every apostrophe and
    # backslash is escaped, not only the first apostrophe.
    my($sqlEscape) = sub {
        my($val) = @_;
        $val = '' if (!defined($val));
        $val =~ s#(['\\])#\\$1#g;
        return $val;
    };

    # Quote a value as one shell word so that '?', '&', ';', spaces etc.
    # reach the command unchanged.
    my($shellQuote) = sub {
        my($val) = @_;
        $val =~ s#'#'\\''#g;
        return "'$val'";
    };

    # Write one output line per (class1, class2) pair of every KEGG entry;
    # classes without a known level are filed under '98.1' ('Others').
    my($writeLevels) = sub {
        my($fhw, $sp, $name, $refEntries) = @_;
        foreach my $refEnt (@{$refEntries}) {
            foreach my $cname1 (keys(%{$refEnt->{'CLASS'}})) {
                foreach my $cname2 (keys(%{$refEnt->{'CLASS'}->{$cname1}})) {
                    my($refFunc) = $funcCat->getFunctionByName($cname1, $cname2);
                    my($lev) = $refFunc->{'LEVEL'};
                    $lev = '98.1' if (! $lev); # set 'Others'
                    $fhw->print(join("\t", $sp, $name, 'kegg', $lev), "\n");
                }
            }
        }
    };

    print STDERR "TARGET :: @spList\n";
    foreach my $spArg (@spList) {
        my($sp) = lc($spArg);

        my($f) = "./geneFunction.$sp.kegg";

        # Explicit mode argument: the file name is never parsed for a mode.
        my $fhw = FileHandle->new($f, 'w');
        die("Can not open $f($!)") if (! $fhw);

      ORF:
        foreach my $orig_orf (keys(%{$kegg->{'SPNAME2ENTRY'}->{$sp}})) {
            $orig_orf =~ s#\'##g;

            $n++;

            my($refEntries) = $kegg->getEntriesBySpname($sp, $orig_orf);

            #
            # 1. gene table, by name (several spellings of the ORF name)
            #
            my($staFound) = 0;
            my(%tried);
            foreach my $regpat ('', '_', '\.\d+$') {
                my($cand) = $orig_orf;
                if ($regpat ne '') {
                    $cand =~ s#$regpat##;
                }
                # A pattern that did not change the name would only repeat
                # the previous query (and its warning).
                next if ($tried{$cand}++);

                my($opt) = { 'where' => sprintf("sp='%s' and name='%s'",
                                                $sqlEscape->($sp),
                                                $sqlEscape->($cand)) };
                my($res) = $db->select_fetch('gene', $opt);
                if ($res->{'ROWS'} == 1) {
                    $ok++;
                    $writeLevels->($fhw, $sp, $res->{'INFO'}->[0]->{'name'}, $refEntries);
                    $staFound = 1;
                    last;
                }
                elsif (1 < $res->{'ROWS'}) {
                    print STDERR "WARNING :: FOUND $cand :: $res->{'ROWS'} genes.\n";
                }
            }
            next ORF if ($staFound);

            #
            # 2. gene table, by the name registered in NAME_P2S
            #
            my($gene) = '';
            my($orf) = $kegg->{'NAME_P2S'}->{$sp}->{lc($orig_orf)};
            if ($orf) {
                my($opt) = { 'where' => sprintf("sp='%s' and gene='%s'",
                                                $sqlEscape->($sp),
                                                $sqlEscape->($orf)) };
                my($res) = $db->select_fetch('gene', $opt);
                if ($res->{'ROWS'} == 1) {
                    $ok++;
                    $writeLevels->($fhw, $sp, $res->{'INFO'}->[0]->{'name'}, $refEntries);
                    next ORF;
                }
                elsif (1 < $res->{'ROWS'}) {
                    print STDERR "WARNING :: FOUND $orf :: $res->{'ROWS'} genes.\n";
                }

                if ($orf =~ /^rrn.+\d+s$/) {
                    $skip++;
                    next ORF;
                }
                $gene = $orf;
            }

            #
            # Load the GenBank files of this species once, on first use.
            #
            if (! exists($refGbk->{$sp})) {
                $refGbk->{$sp} = [];

#                my($d) = "$ENV{'MBGD_HOME'}/species/$sp/GB";
                my($d) = "/db5/project/MBGD/species/$sp/GB";
                my $dh = DirHandle->new($d);
                if ($dh) {
                    foreach my $gbkFile ($dh->read()) {
                        next if ($gbkFile !~ /\.gbk$/);
                        my($refSp) = GenBank->new("$d/$gbkFile");
                        $refSp->read_entry();

                        push(@{$refGbk->{$sp}}, $refSp);
                    }
                    $dh->close();
                }
                else {
                    # Carry on with an empty list so that this ORF is
                    # handled (and counted) like every later ORF of the
                    # species instead of being dropped silently.
                    print STDERR "WARNING :: Can not open $d($!)\n";
                }
            }

            #
            # 3. GenBank synonym notes
            #
            my($isMatch) = searchGbkSyn($refGbk, $sp, $orf, $orig_orf);
            if ($isMatch) {
                $ok++;
                $writeLevels->($fhw, $sp, $isMatch, $refEntries);
                next ORF;
            }

            #
            # 4. Fetch the protein sequence from the KEGG web site
            #
            my($url) = sprintf("http://www.genome.ad.jp/dbget-bin/www_bget?-f+-n+a+%s+%s+-s", $sp, $orig_orf);
            my($cmd) = "$main::CMD_wget -q -O - " . $shellQuote->($url);
            my $fhSeq = FileHandle->new("$cmd |");
            my($seq) = '';
            if ($fhSeq) {
                # The sequence is taken from the lines between the line
                # containing <pre> and the line containing </pre>.
                my($inSeq) = 0;
                while (defined(my $line = $fhSeq->getline())) {
                    if ($line =~ /\<pre\>/) {
                        $inSeq = 1;
                        next;
                    }
                    elsif ($line =~ /\<\/pre\>/) {
                        last;
                    }
                    next if (! $inSeq);

                    $seq .= $line;
                }
                $fhSeq->close();
                $seq =~ s#[\r\n]+##g;
            }
            if ($seq) {
                # Look for a match at the (protein) sequence level (*.gbk)
                $isMatch = searchGbkSeq($refGbk, $sp, $seq, $orig_orf);
                if ($isMatch) {
                    $ok++;
                    $writeLevels->($fhw, $sp, $isMatch, $refEntries);
                    next ORF;
                }
            }

            ##########
            print STDERR "NOT FOUND :: $sp:$orig_orf ($gene)\n";
            $ng++;
        }

        # Buffered write errors only surface when the handle is closed.
        if (! $fhw->close()) {
            print STDERR "WARNING :: Can not close $f($!)\n";
        }
    }
    print "TOTAL :: $n\n";
    print "OK    :: $ok\n";
    print "NG    :: $ng\n";
    print "SKIP  :: $skip\n";
}

1;#
