#!/usr/bin/perl -s
use strict;
use FileHandle;
require "MBGD_common.pl";
require "InfoSpec.pl";

###############################################################################
#
# Registry of gene IDs already seen by checkUserGenomeData(): an ID is marked
# with 1 once it has been checked, and meeting a marked ID again is reported
# as a duplicate.  Nothing in this file resets it, so it spans all gene files
# examined during one run.
$main::HASH_id = {};

###############################################################################
#
# Load the taxonomy table from $MBGD_HOME/etc/taxon.tab.
#
# Every non-blank line is split on tabs; the first field is taken as the
# taxonomy ID and the second as the name (further fields are ignored).
# Line terminators (LF or CRLF) are removed before splitting so that a name
# in the last column does not carry a trailing newline into the tables,
# matching what the other readers in this file do.
#
# Returns a hash reference holding two lookup tables:
#   ID2NAME : taxonomy ID     -> name
#   NAME2ID : upper-case name -> taxonomy ID
#
# Dies when the file cannot be opened.
sub readTaxonTab {
    my $refTaxonTab = {
        'ID2NAME' => {},
        'NAME2ID' => {},
    };

    #
    my $filename = "$ENV{'MBGD_HOME'}/etc/taxon.tab";
    # An explicit read mode keeps characters in the path from being
    # interpreted as an open mode.
    my $fh = FileHandle->new($filename, "r")
        || die("ERROR :: Can not open $filename($!)");
    while (my $line = $fh->getline()) {
        $line =~ s#[\r\n]*$##;
        next if ($line =~ /^\s*$/);

        my($taxid, $name) = split(/\t/, $line);
        $refTaxonTab->{'ID2NAME'}->{"$taxid"} = $name;
        # A line without a name column has nothing to index by name.
        if (defined($name)) {
            my $ucName = uc($name);
            $refTaxonTab->{'NAME2ID'}->{"$ucName"} = $taxid;
        }
    }
    $fh->close();

    return $refTaxonTab;
}

###############################################################################
#
# Parse a "key<TAB>value" attribute file into a hash reference.
#
# Lines that are blank or whose first non-blank character is '#' are skipped.
# Keys are folded to lower case; only the first two tab-separated fields of a
# line are used.  Dies when the file cannot be opened.
sub readGenome {
    my ($filename) = @_;
    my %attr;

    my $in = FileHandle->new("$filename");
    die("Can not open $filename($!)") unless ($in);

    while (my $text = $in->getline()) {
        # comments and empty lines carry no data
        if (($text =~ /^\s*#/) || ($text =~ /^\s*$/)) {
            next;
        }

        # drop LF / CRLF terminators
        $text =~ s#[\r\n]*$##;

        my ($key, $value) = split(/\t/, $text);
        $attr{lc($key)} = $value;
    }
    $in->close();

    return \%attr;
}

###############################################################################
#
# Read a chromosome attribute file.  The work is delegated to readGenome(),
# so the result is whatever readGenome() returns for that file.
sub readChromosome {
    my ($filename) = @_;

    my $refChromosome = readGenome($filename);
    return $refChromosome;
}

###############################################################################
#
# Default column layout of a gene table; replaced by readGene() whenever a
# comment line carrying "cols<TAB>..." is met, and reused by saveGene().
$main::USER_DATA_COLS = "id gene from to dir type descr";

# Read a tab-separated gene table.
#
# Returns a reference to a list of hash references, one per data line, keyed
# by the current column names.  Columns missing from a line are present with
# an undefined value.  Blank lines are skipped; comment lines ('#') are
# skipped too, except that one containing "cols<TAB>names" redefines the
# column list (and $main::USER_DATA_COLS) for the lines that follow.
# An unreadable file yields an empty list rather than an error.
sub readGene {
    my ($filename) = @_;
    my @genes;

    my $in = FileHandle->new("$filename");
    return \@genes unless ($in);

    my @columns = split(/\s/, $main::USER_DATA_COLS);
    while (my $text = $in->getline()) {
        next if ($text =~ /^\s*$/);

        if ($text =~ /^\s*#/) {
            # only a "cols" comment has any effect
            next unless ($text =~ /cols\t(.+)/);
            $main::USER_DATA_COLS = $1;
            @columns = split(/\s/, $main::USER_DATA_COLS);
            next;
        }

        $text =~ s#[\r\n]*$##;

        # pair each column name with the field at the same position
        my @fields = split(/\t/, $text);
        my %gene;
        @gene{@columns} = @fields[0 .. $#columns];

        push(@genes, \%gene);
    }
    $in->close();

    return \@genes;
}

###############################################################################
#
# Read a FASTA-style sequence file.
#
# A header line has the form ">ID description".  Anything in the ID up to and
# including the first ':' is removed (so "sp:GENE1" becomes "GENE1").  The
# following lines, up to the next header, are concatenated as the sequence.
# Blank lines and lines starting with '#' are ignored.
#
# Returns a hash reference keyed by ID; each value is a hash reference with
#   id     : the ID after prefix removal
#   id_uc  : upper-cased ID
#   desc   : text after the ID on the header line
#   seq    : concatenated sequence
#   length : length of seq
# Sequence lines that appear before any header are collected into a record
# without an id, stored under the empty key.
# An unreadable file yields an empty hash rather than an error.
sub readProteinSeq {
    my($filename) = shift;
    my($refHash) = {};
    my($ref);

    my($fh) = FileHandle->new("$filename") || return $refHash;
    while(my$line = $fh->getline()) {
        next if ($line =~ /^\s*#/);
        next if ($line =~ /^\s*$/);

        $line =~ s#[\r\n]*$##;

        if ($line =~ /^>\s*(\S+)\s*(.*)$/) {
            # Copy both captures right away: the substitution below is itself
            # a successful match when the ID has a prefix, and it would reset
            # $2, silently dropping the description.
            my($newId, $newDesc) = ($1, $2);
            $newId =~ s#^[^\:]+\:##;

            # A new header closes the record that was being collected.
            if ($ref) {
                $ref->{'length'} = length($ref->{'seq'});
                my($id) = $ref->{'id'};
                $refHash->{"$id"} = $ref;
            }
            $ref = {};
            $ref->{'id'}    = $newId;
            $ref->{'id_uc'} = uc($newId);
            $ref->{'desc'}  = $newDesc;
            $ref->{'seq'}   = '';
        }
        else {
            $ref->{'seq'} .= $line;
        }

    }
    # Store the last record, which no following header has closed.
    if ($ref) {
        $ref->{'length'} = length($ref->{'seq'});
        my($id) = $ref->{'id'};
        $refHash->{"$id"} = $ref;
    }
    $fh->close();

    return $refHash;
}

###############################################################################
#
# Read a chromosome sequence file with readProteinSeq() and return a single
# record from it: the first value the result hash yields, or undef when the
# hash is empty.
sub readDnaSeq {
    my ($filename) = @_;

    my @records = values(%{ readProteinSeq($filename) });

    return $records[0];
}

###############################################################################
#
# Write a hash of attributes as "key<TAB>value" lines, sorted by key.
#
# Before writing, the current file is moved to "<filename>.bak", replacing
# any earlier backup.  A log line is printed to STDERR.  Dies when the file
# cannot be opened for writing.
sub saveGenome {
    my ($filename, $ref) = @_;
    my $backup = "$filename.bak";

    # keep one generation of backup
    if (-e $backup) {
        unlink($backup);
    }
    rename("$filename", $backup);

    print STDERR "LOG :: Save $filename\n";
    my $out = FileHandle->new(">$filename");
    die("Can not open $filename($!)") unless ($out);

    for my $key (sort(keys(%{$ref}))) {
        $out->print("$key\t$ref->{$key}", "\n");
    }
    $out->close();

    return;
}

###############################################################################
#
# Write a chromosome attribute file by handing both arguments straight to
# saveGenome().
sub saveChromosome {
    my ($filename, $ref) = @_;

    saveGenome($filename, $ref);
    return;
}

###############################################################################
#
# Write a gene table.
#
# The output starts with a "#format" and a "#cols" comment line, followed by
# one tab-separated line per gene with the fields ordered as listed in
# $main::USER_DATA_COLS.  The current file is first moved to
# "<filename>.bak", replacing any earlier backup.  Dies when the file cannot
# be opened for writing.
sub saveGene {
    my ($filename, $refList) = @_;
    my @columns = split(/\s/, $main::USER_DATA_COLS);
    my $backup  = "$filename.bak";

    # keep one generation of backup
    if (-e $backup) {
        unlink($backup);
    }
    rename("$filename", $backup);

    print STDERR "LOG :: Save $filename\n";
    my $out = FileHandle->new(">$filename");
    die("Can not open $filename($!)") unless ($out);

    $out->print("#format\tuser_data", "\n");
    $out->print("#cols\t", $main::USER_DATA_COLS, "\n");
    for my $gene (@{$refList}) {
        my @fields = map { $gene->{"$_"} } @columns;
        $out->print(join("\t", @fields), "\n");
    }
    $out->close();

    return;
}

###############################################################################
#
# Write sequence records in FASTA style.
#
# For every key of the hash a header ">key desc" is written, followed by the
# record's sequence broken into lines of at most 60 characters.  Records are
# written in the order keys() yields them.  The current file is first moved
# to "<filename>.bak", replacing any earlier backup.  Dies when the file
# cannot be opened for writing.
sub saveProteinSeq {
    my ($filename, $refHash) = @_;
    my $backup = "$filename.bak";

    # keep one generation of backup
    if (-e $backup) {
        unlink($backup);
    }
    rename("$filename", $backup);

    print STDERR "LOG :: Save $filename\n";
    my $out = FileHandle->new(">$filename");
    die("Can not open $filename($!)") unless ($out);

    for my $id (keys(%{$refHash})) {
        my $record = $refHash->{"$id"};
        my $header = ">$id $record->{'desc'}\n";

        # fold the sequence: a newline after every run of up to 60 characters
        (my $folded = $record->{'seq'}) =~ s#(.{1,60})#$1\n#g;

        $out->print($header);
        $out->print($folded);
    }
    $out->close();

    return;
}

###############################################################################
#
# Write one DNA sequence record through saveProteinSeq().  The record is
# wrapped in a one-entry hash under the key 'dnaseq', which therefore becomes
# the ID on the header line.
sub saveDnaSeq {
    my ($filename, $refHash) = @_;

    my %single = ('dnaseq' => $refHash);
    saveProteinSeq($filename, \%single);

    return;
}

###############################################################################
#
# Validate, and optionally repair, the data files of one user genome.
#
# Arguments:
#   $spid        : species ID; selects $MBGD_HOME/species/$spid/gm/
#   $refTaxonTab : table as returned by readTaxonTab() (only ID2NAME is used)
#
# Checks performed:
#   * genome.txt       : required keys, abbrev, taxid, date_release, specweight
#   * N.chromosome.txt : seqno, accession, shape, type, length
#   * N.gene           : id, dir, from/to, type, duplicate IDs
#   * N.protseq/.chrseq: existence and illegal characters
# for N = 1, 2, ... up to the first N for which N.gene does not exist.
#
# Problems are reported on STDERR as ERROR / WARNING / LOG lines.  Values that
# can be defaulted are fixed in memory and the file is flagged for saving.
# If any ERROR was counted the process exits with status -1 before anything
# is written.  Otherwise, when $main::UPDATE is true, the flagged files are
# rewritten (each save keeps a ".bak" copy), every N.geneseq is truncated,
# and the species is registered in $main::FILE_spidtab.
#
# Globals read: $main::UPDATE, $main::force, $main::FILE_spidtab (presumably
# set by -s command-line switches or the required library files - confirm),
# and $main::HASH_id (gene IDs seen so far).
sub checkUserGenomeData {
    my($spid) = shift;
    my($refTaxonTab) = shift;
    my($k);
    my($nError) = 0;
    # "needs saving" flags; the per-chromosome ones are keyed by sequence number
    my($saveGenome) = 0;
    my($saveChromosome) = {};
    my($saveGene) = {};
    my($saveProteinSeq) = {};
    my($saveDnaSeq) = {};

    my($fileGenome) = "$ENV{'MBGD_HOME'}/species/$spid/gm/genome.txt";
    my($refGenome) = readGenome($fileGenome);

    # mandatory genome attributes
    foreach $k ('sp', 'orgname', 'taxid', 'date_release') {
        if ($refGenome->{"$k"} =~ /^\s*$/) {
            print STDERR "ERROR :: genome.txt('$k') is blank.\n";
            $nError++;
        }
    }

    # species code already registered?  (the warning is currently disabled)
    $k = 'sp';
    my($sp) = $refGenome->{"$k"};
    my($tmpSpid) = sp2spid($sp, $main::FILE_spidtab);
    if ((! $main::force) && ($tmpSpid)) {
#        print STDERR "WARNING :: genome.txt('$k') is already used. ['$sp' is contained $main::FILE_spidtab]\n";
    }

    # abbrev: derived from orgname as "<first letter of word 1>.<word 2>"
    $k = 'abbrev';
    if ($refGenome->{"$k"} =~ /^\s*$/) {
        my($abb1, $abb2) = ($refGenome->{'orgname'} =~ /(\S)\S*\s+(\S+)/);
        $refGenome->{"$k"} = sprintf("%s.%s", uc($abb1), lc($abb2));
        print STDERR "WARNING :: genome.txt('$k') is blank. Then set ", $refGenome->{"$k"}, "\n";
        $saveGenome = 1;
    }

    # taxid: must be known to the taxonomy table, else fall back to 32644
    $k = 'taxid';
    my($taxid) = $refGenome->{"$k"};
    if ($taxid && !exists($refTaxonTab->{'ID2NAME'}->{"$taxid"})) {
        print STDERR "WARNING :: genome.txt('$k') is illegal. Then set 32644(unknown)\n";
        $refGenome->{"$k"} = 32644; # unknown
        $saveGenome = 1;
    }

    # date_release: yyyy-mm-dd; $valRelease becomes the number yyyymmdd
    $k = 'date_release';
    my($dateRelease) = $refGenome->{"$k"};
    my($valRelease) = 0;
    my($y, $m, $d) = ($dateRelease =~ /(\d+)\-(\d+)\-(\d+)/);
    if ($dateRelease =~ /yyyy\-mm\-dd/i) {
        # the literal placeholder text is replaced by today's date
        print STDERR "WARNING :: genome.txt('$k') is '$dateRelease'. Then set today.\n";
        my($sec, $min, $hour, $mday, $mon, $year) = localtime(time());
        $year += 1900;
        $mon++;
        $refGenome->{"$k"} = sprintf("%04d-%02d-%02d", $year, $mon, $mday); # today
        $saveGenome = 1;
        $valRelease = ($year * 100 + $mon) * 100 + $mday;
    }
    elsif ($dateRelease && (!$y || !$m || !$d)) {
        print STDERR "ERROR :: genome.txt('$k') is illegal.\n";
        $nError++;
    }
    else {
        $valRelease = ($y * 100 + $m) * 100 + $d;
    }

    # specweight: when blank, computed from the release date so that a later
    # date gives a smaller weight (0 when no usable date is available)
    $k = 'specweight';
    my($specweight) = $refGenome->{"$k"};
    if (! $specweight) {
        print STDERR "WARNING :: genome.txt('$k') is blank. ";
        $refGenome->{"$k"} = 0;
        if ($valRelease) {
            print STDERR "Then set default-weight.";
            $refGenome->{"$k"} = (20300000 - $valRelease) * 100;
        }
        print STDERR "\n";
        $saveGenome = 1;
    }
    elsif ($specweight !~ /^\d+$/) {
        print STDERR "ERROR :: genome.txt('$k') is illegal.\n";
        $nError++;
    }

    # chromosome
    my($refChromosome) = {};
    my($refGene)       = {};
    my($refProteinSeq) = {};
    my($refDnaSeq)     = {};
    for(my$i = 1; $i <= 99999; $i++) {
        my($fileChromosome) = "$ENV{'MBGD_HOME'}/species/$spid/gm/data/$i.chromosome.txt";
        my($fileGene) = "$ENV{'MBGD_HOME'}/species/$spid/gm/data/$i.gene";
        my($fileProteinSeq) = "$ENV{'MBGD_HOME'}/species/$spid/gm/data/$i.protseq";
        my($fileDnaSeq) = "$ENV{'MBGD_HOME'}/species/$spid/gm/data/$i.chrseq";
        # the gene file decides whether sequence number $i exists at all
        if (! -e "$fileGene") {
            last;
        }
        print STDERR "LOG :: Found $fileGene\n";

        # only a missing protein sequence file counts as an error
        if (! -e "$fileChromosome") {
            print STDERR "WARNING :: Not exists :: $fileChromosome\n";
        }
        if (! -e "$fileProteinSeq") {
            print STDERR "ERROR :: Not exists :: $fileProteinSeq\n";
            $nError++;
        }
        if (! -e "$fileDnaSeq") {
            print STDERR "WARNING :: Not exists :: $fileDnaSeq\n";
        }

        # load everything that is available
        if (-e "$fileChromosome") {
            $refChromosome->{"$i"} = readChromosome($fileChromosome);
        }
        $refGene->{"$i"}       = readGene($fileGene);
        $refProteinSeq->{"$i"} = readProteinSeq($fileProteinSeq);
        # strip everything but letters from the protein sequences
        foreach $k (keys(%{$refProteinSeq->{"$i"}})) {
            my($ref) = $refProteinSeq->{"$i"}->{"$k"};
            if ($ref->{'seq'} =~ /[^A-Z]/i) {
                print STDERR "WARNING :: Found illegal character :: $fileProteinSeq\n";
                $ref->{'seq'} =~ s#[^A-Z]##ig;
                $ref->{'length'} = length($ref->{'seq'});
                $saveProteinSeq->{"$i"} = 1;
            }
        }
        # same clean-up for the chromosome sequence
        if (-e "$fileDnaSeq") {
            $refDnaSeq->{"$i"}     = readDnaSeq($fileDnaSeq);
            if ($refDnaSeq->{"$i"}->{'seq'} =~ /[^A-Z]/i) {
                print STDERR "WARNING :: Found illegal character :: $fileDnaSeq\n";
                $refDnaSeq->{"$i"}->{'seq'} =~ s#[^A-Z]##ig;
                $refDnaSeq->{"$i"}->{'length'} = length($refDnaSeq->{"$i"}->{'seq'});
                $saveDnaSeq->{"$i"} = 1;
            }
        }

        if (exists($refChromosome->{"$i"})) {
            # seqno is forced to the number in the file name
            $k = 'seqno';
            if ($refChromosome->{"$i"}->{"$k"} != $i) {
                print STDERR "LOG :: chromosome.txt('$k') is updated.\n";
                $refChromosome->{"$i"}->{"$k"} = $i;
                $saveChromosome->{"$i"} = 1;
            }

            # For user genomes, accession is set to the same value as seqno,
            # because accession is part of the PRIMARY_KEY of the chromosome
            # table.  (Reconstructed from a mis-encoded comment - confirm.)
            $k = 'accession';
            if ($refChromosome->{"$i"}->{"$k"} != $i) {
                print STDERR "LOG :: chromosome.txt('$k') is updated.\n";
                $refChromosome->{"$i"}->{"$k"} = $i;
                $saveChromosome->{"$i"} = 1;
            }

            #
            $k = 'shape';
            if ($refChromosome->{"$i"}->{"$k"} !~ /^\s*(circular|linear)\s*$/i) {
                print STDERR "ERROR :: chromosome.txt('$k') is illegal.\n";
                $nError++;
            }

            #
            $k = 'type';
            if ($refChromosome->{"$i"}->{"$k"} !~ /^\s*(chromosome|plasmid|contig)\s*$/i) {
                print STDERR "ERROR :: chromosome.txt('$k') is illegal.\n";
                $nError++;
            }

            # length: chromosome.txt and the sequence record fill each other
            # in; it is an error only when neither side has a positive length
            $k = 'length';
            if (($refChromosome->{"$i"}->{"$k"} <= 0) && ($refDnaSeq->{"$i"}->{"$k"} <= 0)) {
                print STDERR "ERROR :: Can not get chromosome length.\n";
                $nError++;
            }
            elsif (($refChromosome->{"$i"}->{"$k"} <= 0) && (0 < $refDnaSeq->{"$i"}->{"$k"})) {
                $refChromosome->{"$i"}->{"$k"} = $refDnaSeq->{"$i"}->{"$k"};
                $saveChromosome->{"$i"} = 1;
            }
            elsif ((0 < $refChromosome->{"$i"}->{"$k"}) && ($refDnaSeq->{"$i"}->{"$k"} <= 0)) {
                # no sequence available: substitute a run of 'N' of that length
                $refDnaSeq->{"$i"}->{"$k"} = $refChromosome->{"$i"}->{"$k"};
                $refDnaSeq->{"$i"}->{'seq'} = 'N' x $refChromosome->{"$i"}->{"$k"};
                $saveDnaSeq->{"$i"} = 1;
            }
        }

        # genes
        if (scalar(@{$refGene->{"$i"}}) == 0) {
            print STDERR "ERROR :: Can not found gene(s).\n";
            $nError++;
        }
        foreach my$ref (@{$refGene->{"$i"}}) {
            foreach $k ('id', 'dir') {
                if ($ref->{"$k"} =~ /^\s*$/) {
                    print STDERR "ERROR :: gene('$k') is blank.\n";
                    $nError++;
                }
            }
            # from/to must be given together; name the one that is missing
            if (($ref->{'from'} ne '') && ($ref->{'to'} eq '')) {
                print STDERR "ERROR :: gene('to') is blank.\n";
                $nError++;
            }
            elsif (($ref->{'from'} eq '') && ($ref->{'to'} ne '')) {
                print STDERR "ERROR :: gene('from') is blank.\n";
                $nError++;
            }

            # gene IDs must be unique (tracked in $main::HASH_id)
            $k = 'id';
            my($id) = $ref->{"$k"};
            if ($main::HASH_id->{"$id"} != 0) {
                print STDERR "ERROR :: Found same Gene_ID.($id)\n";
                $nError++;
            }
            $main::HASH_id->{"$id"} = 1;

            # from > to: swapped, unless the span exceeds half the chromosome
            # length (presumably a gene crossing position 0 - confirm)
            if ($ref->{'to'} < $ref->{'from'}) {
                if (0 < $refChromosome->{"$i"}->{'length'}) {
                    if ($refChromosome->{"$i"}->{'length'} / 2 < $ref->{'from'} - $ref->{'to'}) {
                        # pos0
                    }
                    else {
                        print STDERR "WARNING :: Change position. 'from' position is bigger than 'to' position. ($id)\n";
                        my($wk) = $ref->{'from'};
                        $ref->{'from'} = $ref->{'to'};
                        $ref->{'to'} = $wk;
                        $saveGene->{"$i"} = 1;
                    }
                }
            }

            $k = 'dir';
            if (($ref->{"$k"} != -1) && ($ref->{"$k"} != 1)) {
                print STDERR "ERROR :: gene('$id : $k') is illegal.\n";
                $nError++;
            }

            # type: defaults to CDS; otherwise must be CDS or end in "rna"
            $k = 'type';
            if ($ref->{"$k"} =~ /^\s*$/) {
                print STDERR "WARNING :: gene('$k') is blank. Use 'CDS' as default.($id)\n";
                $ref->{"$k"} = 'CDS';
                $saveGene->{"$i"} = 1;
            }
            if ($ref->{"$k"} !~ /^(cds|.*rna)$/i) {
                print STDERR "ERROR :: Found illegal gene-type. ($id)\n";
                $nError++;
            }

            # attach the protein record with the same ID to the gene
            $k = 'seq';
            $ref->{"$k"} = $refProteinSeq->{"$i"}->{"$id"};
            if (($ref->{'type'} =~ /^cds$/i) && ($ref->{"$k"}->{'seq'} =~ /^\s*$/)) {
                print STDERR "ERROR :: No protein sequence found. ($id)\n";
                $nError++;
            }
            if ($ref->{"$k"}->{'seq'} =~ /[^A-Z]/i) {
                print STDERR "ERROR :: Found illegal character in protein seq. ($id)\n";
                $nError++;
            }
        }

    }

    # nothing is written when any error was found
    if ($nError != 0) {
        print STDERR "STOP :: Please repair error(s)\n";
        exit(-1);
    }
    if ($main::UPDATE && $saveGenome) {
        my($fileGenome) = "$ENV{'MBGD_HOME'}/species/$spid/gm/genome.txt";
        saveGenome($fileGenome, $refGenome);
    }
    # second pass over the same sequence numbers: write back flagged files
    for(my$i = 1; $i <= 99999; $i++) {
        my($fileChromosome) = "$ENV{'MBGD_HOME'}/species/$spid/gm/data/$i.chromosome.txt";
        my($fileGene) = "$ENV{'MBGD_HOME'}/species/$spid/gm/data/$i.gene";
        my($fileGeneSeq) = "$ENV{'MBGD_HOME'}/species/$spid/gm/data/$i.geneseq";
        my($fileProteinSeq) = "$ENV{'MBGD_HOME'}/species/$spid/gm/data/$i.protseq";
        my($fileDnaSeq) = "$ENV{'MBGD_HOME'}/species/$spid/gm/data/$i.chrseq";
        if (! -e "$fileGene") {
            last;
        }

        if ($main::UPDATE && $saveChromosome->{"$i"}) {
            saveChromosome($fileChromosome, $refChromosome->{"$i"});
        }
        if ($main::UPDATE && $saveGene->{"$i"}) {
            saveGene($fileGene, $refGene->{"$i"});
        }
        if ($main::UPDATE) {
            # opening for write creates/truncates the file; nothing is written
            new FileHandle(">$fileGeneSeq");
        }
        if ($main::UPDATE && $saveProteinSeq->{"$i"}) {
            saveProteinSeq($fileProteinSeq, $refProteinSeq->{"$i"});
        }
        if ($main::UPDATE && $saveDnaSeq->{"$i"}) {
            saveDnaSeq($fileDnaSeq, $refDnaSeq->{"$i"});
        }
    }

    # register the species in the spid table as "orgname(strain)"
    if ($main::UPDATE) {
        my($name) = $refGenome->{'sp'};
        my($file) = sprintf("%s(%s)", $refGenome->{'orgname'}, $refGenome->{'strain'});
        $file =~ s#\s+#_#g;
        my($fileSpidTab) = "$main::FILE_spidtab";
        my($refSpidTab) = getInfoSpecTab($fileSpidTab);
        addInfoSpecTabEntry($refSpidTab, $spid, $name, $file);
        setInfoSpecTab($fileSpidTab, $refSpidTab);
    }

    return;
}

###############################################################################
# Run the check only when this file is executed as a script, not when it is
# loaded from another program.  The species ID comes from the -spid=... switch
# (enabled by "perl -s") and must look like "gu" followed by five digits.
if (__FILE__ eq $0) {
    unless ($main::spid =~ /^gu\d{5}$/) {
        print STDERR "Usage :: $0 -spid=gu99999\n";
        exit(0);
    }

    checkUserGenomeData($main::spid, readTaxonTab());
}

###############################################################################
1;#
###############################################################################
