#!/usr/bin/perl -s
package Cog;
use strict;
use DirHandle;
use FileHandle;
use MBGD;
use GenBank;
require "MBGD_commonPath.pl";

###############################################################################
# Conversion table from COG species codes to MBGD species IDs (spid).
#
# Keys are lower-case COG species codes; values are the MBGD spid strings.
# The quoted three-letter code in each trailing comment is presumably the
# MBGD species abbreviation (TODO confirm); entries annotated "differs" are
# ones where that code is not the same as the COG key.
$Cog::SPEC_COG2SPID = {  # COG      MBGD
# Euryarchaeota
                         'mth' => 'gm00009', # 'mth',
                         'mja' => 'gm00004', # 'mja',
                         'hbs' => 'gm00037', # 'hal',   (differs from the COG code)
                         'tac' => 'gm00034', # 'tac',
                         'tvo' => 'gm00039', # 'tvo',
                         'pho' => 'gm00014', # 'pho',
                         'pab' => 'gm00048', # 'pab',
                         'afu' => 'gm00011', # 'afu',
                         'mka' => 'gm00075', # 'mka',
                         'mac' => 'gm00078', # 'mac',
# Crenarchaeota
                         'pya' => 'gm00071', # 'pai',   (differs from the COG code)
                         'sso' => 'gm00052', # 'sso',
                         'ape' => 'gm00021', # 'ape',
# Ascomycota
                         'sce' => 'gm00006', # 'sce',
                         'spo' => 'gm00074', # 'spo',
# Microsporidia
                         'ecu' => 'gm00066', # 'ecu',
# Aquificae
                         'aae' => 'gm00013', # 'aae',
# Thermotogae
                         'tma' => 'gm00022', # 'tma',
# Cyanobacteria
                         'nos' => 'gm00065', # 'ana',   (differs from the COG code)
                         'syn' => 'gm00003', # 'syn',
# Deinococcus-Thermus
                         'dra' => 'gm00023', # 'dra',
# Fusobacteria
                         'fnu' => 'gm00077', # 'fnu',
# Spirochaetes
                         'tpa' => 'gm00016', # 'tpa',
                         'bbu' => 'gm00012', # 'bbu',
# Chlamydiae
                         'ctr' => 'gm00017', # 'ctr',
                         'cpn' => 'gm00020', # 'cpn',
# Actinobacteria
                         'cgl' => 'gm00043', # 'cgl',
                         'mtu' => 'gm00015', # 'mtu',
                         'mtc' => 'gm00058', # 'mtc',
                         'mle' => 'gm00041', # 'mle',
# Firmicutes
                         'cac' => 'gm00055', # 'cac',
                         'sau' => 'gm00047', # 'sau',
                         'lin' => 'gm00063', # 'lin',
                         'bsu' => 'gm00010', # 'bsu',
                         'bha' => 'gm00029', # 'bha',
                         'lla' => 'gm00049', # 'lla',
                         'spy' => 'gm00045', # 'spy',
                         'spn' => 'gm00053', # 'spn',
                         'uur' => 'gm00036', # 'uur',
                         'mpu' => 'gm00050', # 'mpu',
                         'mpn' => 'gm00005', # 'mpn',
                         'mge' => 'gm00002', # 'mge',
# Proteobacteria
                         'pae' => 'gm00035', # 'pae',
                         'eco' => 'gm00008', # 'eco',
                         'ecz' => 'gm00040', # 'ece',   (differs from the COG code)
                         'ecs' => 'gm00042', # 'ecs',
                         'ype' => 'gm00059', # 'ype',
                         'sty' => 'gm00062', # 'stm',   (differs from the COG code)
                         'buc' => 'gm00033', # 'buc',
                         'xfa' => 'gm00031', # 'xfa',
                         'vch' => 'gm00032', # 'vch',
                         'hin' => 'gm00001', # 'hin',
                         'pmu' => 'gm00044', # 'pmu',
                         'rso' => 'gm00072', # 'rso',
                         'nme' => 'gm00025', # 'nme',
                         'nma' => 'gm00028', # 'nma',
                         'hpy' => 'gm00007', # 'hpy',
                         'jhp' => 'gm00019', # 'hpj',   (differs from the COG code)
                         'cje' => 'gm00024', # 'cje',
                         'ccr' => 'gm00051', # 'ccr',
                         'atu' => 'gm00068', # 'atu',
                         'sme' => 'gm00054', # 'sme',
                         'bme' => 'gm00069', # 'bme',
                         'mlo' => 'gm00038', # 'mlo',
                         'rpr' => 'gm00018', # 'rpr',
                         'rco' => 'gm00057', # 'rco',
    };

###############################################################################
# Name
#     new()
# Summary
#     Constructor.
# Arguments
#     $dir - value handed on unchanged to _init()
# Returns
#     The newly created object.
# Description
#     Blesses an empty hash into the invoking class, runs _init() on it and
#     returns it.
# Remarks
#
sub new {
    my($class, $dir) = @_;

    # Build the (still empty) object first so _init() can be called on it.
    my $self = bless({}, $class);
    $self->_init($dir);

    return $self;
}

###############################################################################
# Name
#     _init()
# Summary
#     Initialisation performed by the constructor.
# Arguments
#     $dir - value passed to setDir()
# Returns
#     Nothing.
# Description
#     Stores a reference to the package-level COG -> MBGD species code table
#     in the object, records $dir through setDir() and then calls readTable()
#     without arguments.
# Remarks
#     The table is stored by reference, not copied.
#
sub _init {
    my($self, $dir) = @_;

    # Species code correspondence between COG and MBGD.
    $self->{'SPEC_COG2SPID'} = $Cog::SPEC_COG2SPID;

    # The directory has to be recorded before readTable() is called.
    $self->setDir($dir);
    $self->readTable();

    return;
}

###############################################################################
# Name
#     setDir()
# Summary
#     Stores a directory in the object.
# Arguments
#     $dir - value to store under the 'DIR' key
# Returns
#     Nothing.
#
sub setDir {
    my($self, $dir) = @_;

    $self->{'DIR'} = $dir;

    return;
}

###############################################################################
# Name
#     getDir()
# Summary
#     Returns the directory stored in the object.
# Arguments
#     (none)
# Returns
#     The value held under the 'DIR' key (undef when nothing was stored).
#
sub getDir {
    my $self = $_[0];

    return $self->{'DIR'};
}

###############################################################################
# Name
#     readTable()
# Summary
#     Reads a "whog" table and builds the category/species/ORF indexes.
# Arguments
#     $fileWhog - file to read (optional; defaults to "<getDir()>/whog")
# Returns
#     Nothing.  Dies when the file can not be opened.
# Description
#     Fills two structures, both reset on every call:
#         $self->{'WHOG'}->{category}->{spid}    = [ orf, ... ]
#         $self->{'WHOG_REV'}->{spid}->{orf}     = category
#     A line starting with "[LETTERS]" opens a new category.  An indented
#     "Code: orf orf ..." line selects a species, translated to an MBGD spid
#     through $self->{'SPEC_COG2SPID'}; any other line is treated as a
#     continuation carrying more ORFs for the current species.  Blank lines
#     and lines made only of underscores are skipped.  ORF names are
#     lower-cased.
# Remarks
#     ORFs of a species code missing from SPEC_COG2SPID are reported on
#     STDERR and skipped; they are never filed under the preceding species.
#
sub readTable {
    my($self) = shift;
    my($fileWhog) = shift;

    #
    if (! $fileWhog) {
        my($dir) = $self->getDir();
        $fileWhog = "$dir/whog";
    }
    my($ref) = $self->{'WHOG'} = {};
    my($refRev) = $self->{'WHOG_REV'} = {};
    my($category);
    my($spid);

    #
    my($total) = 0;
    my($fh) = FileHandle->new("$fileWhog") or die("Can not open $fileWhog($!)");
    # getline() is a method call, so the loop needs an explicit defined():
    # a final unterminated line "0" is false but is still data.
    while (defined(my $line = $fh->getline())) {
        next if ($line =~ /^\s*$/);
        next if ($line =~ /^\_*$/);

        $line =~ s#[\r\n]+$##;

        if ($line =~ /^\[([A-Z]+)\]/) {
            $category = $1;
            if (! exists($ref->{"$category"})) {
                $ref->{"$category"} = {};
            }
            next;
        }

        my($sporfSet);
        if ($line =~ /^\s+([A-Z]+):\s+(.+)$/i) {
            my($spec) = lc($1);
            $sporfSet = lc($2);

            #
            if (exists($self->{'SPEC_COG2SPID'}->{"$spec"})) {
                $spid = $self->{'SPEC_COG2SPID'}->{"$spec"};
                if (! exists($ref->{"$category"}->{"$spid"})) {
                    $ref->{"$category"}->{"$spid"} = [];
                }
            }
            else {
                print STDERR "WARNING :: unknown COG-spec :: $spec\n";
                # Forget the previous species so these ORFs (and their
                # continuation lines) are not attributed to it.
                $spid = undef;
            }
        }
        else {
            # Continuation line: lower-case it like the species line above.
            $sporfSet = lc($line);
        }

        # No usable species for this line: nothing can be recorded.
        next if (! defined($spid));

        my($n) = 0;
        foreach my $orf (split(/\s+/, $sporfSet)) {
            # Leading whitespace makes split() return an empty first field.
            next if ($orf =~ /^\s*$/);

            push(@{$ref->{"$category"}->{"$spid"}}, $orf);

            $refRev->{"$spid"}->{"$orf"} = $category;

            $n++;
        }

        $total += $n;
    }
    $fh->close();
    print STDERR "TOTAL :: $total\n" if ($main::v);

    return;
}

###############################################################################
# Name
#     readName2GiTable()
# Summary
#     Reads a two-column "name gi" table into $self->{'MYVAGB'}.
# Arguments
#     $filename - file to read (optional; defaults to "<getDir()>/myva=gb")
# Returns
#     Nothing.  Dies when the file can not be opened.
# Description
#     Every non-blank line is split on whitespace; the first field is the
#     name (stored lower-cased) and the second the value kept for it.  Lines
#     made only of underscores are skipped.  The table is reset on every
#     call.
# Remarks
#     When a name occurs more than once a warning is printed on STDERR and
#     the last occurrence wins.
#
sub readName2GiTable {
    my($self) = shift;
    my($filename) = shift;

    #
    if (! $filename) {
        my($dir) = $self->getDir();
        $filename = "$dir/myva=gb";
    }
    my($ref) = $self->{'MYVAGB'} = {};

    #
    print STDERR "DBG :: reading...($filename)\n";
    my($fh) = FileHandle->new("$filename") or die("Can not open $filename($!)");
    # getline() is a method call, so the loop needs an explicit defined():
    # a final unterminated line "0" is false but is still data.
    while (defined(my $line = $fh->getline())) {
        next if ($line =~ /^\s*$/);
        next if ($line =~ /^\_*$/);

        $line =~ s#[\r\n]*$##;

        # split(' ') ignores leading whitespace; split(/\s+/) would return
        # an empty first field for an indented line and shift the columns.
        my($name, $gi) = split(' ', $line);

        $name = lc($name);
        if (exists($ref->{"$name"})) {
            print STDERR "WARNING :: Found same name :: $name\n";
        }
        $ref->{"$name"} = $gi;
    }
    $fh->close();

    return;
}

###############################################################################
# Name
#     getCategories()
# Summary
#     Lists the categories held in $self->{'WHOG'}.
# Arguments
#     (none)
# Returns
#     The first-level keys of $self->{'WHOG'}, sorted as strings.
#
sub getCategories {
    my($self) = @_;

    my @categories = keys(%{$self->{'WHOG'}});

    return sort(@categories);
}

###############################################################################
# Name
#     getCategorySpec()
# Summary
#     Lists the species keys recorded for one category.
# Arguments
#     $cat - category key as stored in $self->{'WHOG'}
# Returns
#     The sorted keys of $self->{'WHOG'}->{$cat}; nothing when the category
#     is unknown.
# Remarks
#     An unknown category is reported on STDERR only when $main::v is set.
#
sub getCategorySpec {
    my($self, $cat) = @_;

    if (exists($self->{'WHOG'}->{"$cat"})) {
        my @species = keys(%{$self->{'WHOG'}->{"$cat"}});
        return sort(@species);
    }

    print STDERR "WARNING :: Unknown category :: $cat\n" if ($main::v);
    return;
}

###############################################################################
# Name
#     getCategorySpecOrfList()
# Summary
#     Returns the ORFs recorded for one category and one species.
# Arguments
#     $cat - category key as stored in $self->{'WHOG'}
#     $sp  - species key below that category
# Returns
#     The elements of $self->{'WHOG'}->{$cat}->{$sp}, in stored order;
#     nothing when the category or the species is unknown.
# Remarks
#     Unknown keys are reported on STDERR only when $main::v is set.
#
sub getCategorySpecOrfList {
    my($self, $cat, $sp) = @_;

    #
    unless (exists($self->{'WHOG'}->{"$cat"})) {
        print STDERR "WARNING :: Unknown category :: $cat\n" if ($main::v);
        return;
    }
    unless (exists($self->{'WHOG'}->{"$cat"}->{"$sp"})) {
        print STDERR "WARNING :: Unknown species :: $sp\n" if ($main::v);
        return;
    }

    my $orfs = $self->{'WHOG'}->{"$cat"}->{"$sp"};

    return @{$orfs};
}

###############################################################################
# Name
#     getCategoryBySpname()
# Summary
#     Looks up the category recorded for a "species:name" pair.
# Arguments
#     $spname - string of the form "sp:name"; it is split on ':' and the
#               first two fields are used as the keys into
#               $self->{'WHOG_REV'}
# Returns
#     $self->{'WHOG_REV'}->{sp}->{name}, or undef when there is no such entry.
# Remarks
#     The lookup is read-only: an unknown species does not create an empty
#     entry in $self->{'WHOG_REV'}.
#
sub getCategoryBySpname {
    my($self) = shift;
    my($spname) = shift;

    my($sp, $name) = split(/:/, $spname);

    my($category);
    my($rev) = $self->{'WHOG_REV'};
    # Test the first level explicitly; dereferencing straight through to the
    # second level would autovivify $rev->{$sp} for an unknown species.
    if ($rev && exists($rev->{"$sp"})) {
        $category = $rev->{"$sp"}->{"$name"};
    }

    return $category;
}

###############################################################################
# Name
#     name2Gi()
# Summary
#     Translates a name through the $self->{'MYVAGB'} table.
# Arguments
#     $name - name to look up
# Returns
#     The value stored for the name, or '' when nothing matches.
# Description
#     Three keys are tried in order and the first one present wins:
#     the name as given, the lower-cased name, and the lower-cased name with
#     a trailing "_<digits>" removed.
#
sub name2Gi {
    my($self, $name) = @_;

    my $lower = lc($name);
    (my $stripped = $lower) =~ s#_\d+$##;

    foreach my $key ("$name", $lower, $stripped) {
        if (exists($self->{'MYVAGB'}->{$key})) {
            return $self->{'MYVAGB'}->{$key};
        }
    }

    return '';
}

###############################################################################
# Name
#     readSequence()
# Summary
#     Reads the COG sequence data into $self->{'MYVA'}.
# Arguments
#     $fileMyva - file to read (optional; defaults to "<getDir()>/myva")
# Returns
#     Nothing.  Dies when the file can not be opened.
# Description
#     A line starting with '>' opens a new record; its first
#     whitespace-delimited word, lower-cased, is the record name.  When that
#     word contains ':' it is split on ':' and the second field is the name.
#     All following lines up to the next '>' line are concatenated, without
#     line ends, as the sequence of that record.  The table is reset on
#     every call.
# Remarks
#     A repeated name is reported on STDERR and the later record replaces
#     the earlier one.  Lines before the first '>' line are ignored.
#
sub readSequence {
    my($self) = shift;
    my($fileMyva) = shift;

    #
    if (! $fileMyva) {
        my($dir) = $self->getDir();
        $fileMyva = "$dir/myva";
    }
    my($ref) = $self->{'MYVA'} = {};

    #
    print STDERR "DBG :: reading...($fileMyva)\n";
    my($spname, $sp);
    my($name);                          # undef until the first header line
    my($fh) = FileHandle->new("$fileMyva") or die("Can not open $fileMyva($!)");
    # getline() is a method call, so the loop needs an explicit defined():
    # a final unterminated line "0" is false but is still data.
    while (defined(my $line = $fh->getline())) {
        $line =~ s#[\r\n]*$##;

        if ($line =~ /^>(\S+)/) {
            $spname = $name = lc($1);
            if ($spname =~ /:/) {
                ($sp, $name) = split(/:/, $spname);
            }
            if (exists($ref->{"$name"})) {
                print STDERR "WARNING :: FOUND same name :: $name\n";
            }
            $ref->{"$name"} = '';
        }
        elsif (defined($name)) {
            $ref->{"$name"} .= $line;
        }
        # else: data before any header belongs to no record; storing it
        # under the empty name would create a bogus '' entry.
    }
    $fh->close();

    return;
}

###############################################################################
# Name
#     getSequence()
# Summary
#     Returns the sequence stored in $self->{'MYVA'} for a COG name.
# Arguments
#     $name - name to look up (matched case-insensitively by lower-casing)
# Returns
#     The stored sequence; nothing when no entry matches.
# Description
#     The lower-cased name is tried first.  If it is absent and ends in
#     "_<digits>", the part in front of that suffix is tried as well.
# Remarks
#     A miss is reported on STDERR only when $main::v is set.
#
sub getSequence {
    my($self) = shift;
    my($name) = shift;

    my($name1) = lc($name);
    if (exists($self->{'MYVA'}->{"$name1"})) {
        return $self->{'MYVA'}->{"$name1"};
    }

    # Only fall back when there really is a "_<digits>" suffix; otherwise
    # the capture is undef and the lookup would hit the empty-string key.
    if ($name1 =~ /(\S+)\_\d+$/) {
        my($name2) = $1;
        if (exists($self->{'MYVA'}->{"$name2"})) {
            return $self->{'MYVA'}->{"$name2"};
        }
    }

    print STDERR "WARNING :: No sequence :: $name\n" if ($main::v);

    return;
}

###############################################################################
# Name
#     writeSequence()
# Summary
#     Writes the COG sequences of the selected category/species to a file.
# Arguments
#     $category - category to export, or "all" (any case) for every category
#     $spec     - species key to export
#     $fileOut  - output file; it is opened for writing (truncated)
# Returns
#     Nothing.
# Description
#     For every matching category and species, each ORF is written as a
#     ">orf" header line followed by its sequence (from getSequence())
#     wrapped at 60 characters per line.
# Remarks
#     $category and $spec are interpolated into anchored, case-insensitive
#     patterns, so regex metacharacters in them are interpreted.
#     When the output file can not be opened a warning is printed on STDERR
#     and nothing is written.  An ORF without a sequence is written with an
#     empty sequence line.
#
sub writeSequence {
    my($self) = shift;
    my($category) = shift;
    my($spec) = shift;
    my($fileOut) = shift;

    #
    my($fh) = FileHandle->new(">$fileOut");
    if (! $fh) {
        print STDERR "WARNING :: Can not open $fileOut($!)\n";
        return;
    }

    #
    foreach my $cat ($self->getCategories()) {
        if (($category !~ /^all$/i) && ($cat !~ /^$category$/i)) {
            next;
        }

        foreach my $sp ($self->getCategorySpec($cat)) {
            next if ($sp !~ /^$spec$/i);

            foreach my $orf ($self->getCategorySpecOrfList($cat, $sp)) {
                my($seq) = $self->getSequence($orf);
                $seq = '' if (! defined($seq));
                # Break after every 60 characters, but only when more
                # sequence follows: otherwise a length that is a multiple of
                # 60 would end in "\n" and the print below would add a
                # blank line to the output.
                $seq =~ s#(.{60})(?=.)#$1\n#g;

                $fh->print(">$orf", "\n");
                $fh->print($seq, "\n");
            }
        }
    }
    $fh->close();

    return;
}

###############################################################################
1;#
###############################################################################
