#!/usr/bin/perl -s

###############################################################################
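# Expand homology data keyed by MD5 into per-species SP:ORF records
# (optionally restricted by spid), then sort, pack and index them.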
#
# Usage :: $0 [-NO_TMP_TAB] [-dbname=DBNAME]
#
###############################################################################
use FileHandle;
use File::Basename;
use File::Path;
require "MBGD_Conf.pl";
require "MBGD_commonUpdate.pl";
require "libMBGDaxes.pl";
require "libLoadInfo.pl";

#$INS_RECS = 5000;
$INS_RECS = 50000;
$PIDFILE = "$DIR_work/.1120CopyHomologyInfo.pid";

$REPORT = 20000000;
##$BACKUP = 50000000;
#$BACKUP = 20000000;
#$SaveDir = "$UPD_dirMysql/SAVE";

$main::Prefix = "blastdpres";
$main::Suffix = "pack";
$main::FileIndex = "spindex";
$main::Delim = ":";
$main::Uniqcheck='0';

###############################################################################
#
sub get_filename_homology_sp {
    my($file_out) = shift;
    my($sp1) = shift;

    my($filename) = sprintf("%s.%s", $file_out, $sp1);

    return $filename;
}

###############################################################################
#
sub create_dirinfo {
    my($filename) = shift;

    my($fho) = FileHandle->new(">$filename") || die("Can not open $filename($!)");
    $fho->print("uniqcheck=" . $main::Uniqcheck . "\n");
    $fho->print("idxfile="   . $main::FileIndex . "\n");
    $fho->print("prefix="    . $main::Prefix    . "\n");
    $fho->print("suffix="    . $main::Suffix    . "\n");
    $fho->print("delim="     . $main::Delim     . "\n");
    $fho->close();

    return;
}

###############################################################################
sub write_homology {
    my($file_out) = shift;
    my($fh_sp_ref) = shift;
    my($sp1) = shift;
    my($orf1) = shift;
    my($sp2) = shift;
    my($orf2) = shift;
    my(@data) = @_;     # $from1, $to1, $from2, $to2, $ident, $neval, $score, $pam

    # եʾ open ʤ褦ˡsp1 κǽΣʸǽϥեʬ
    my($len_ext) = 2;
    my($ext_sp) = substr($sp1, 0, $len_ext);

    #
    if (!exists($fh_sp_ref->{"$ext_sp"})) {
        my($filename) = get_filename_homology_sp($file_out, $ext_sp);
        $fh_sp_ref->{"$ext_sp"} = FileHandle->new(">$filename") || die("Can not open $filename($!)");
    }
    my($fh) = $fh_sp_ref->{"$ext_sp"};

    # "$sp1:$orf1", "$sp2:$orf2" ȽϤʤ
    #  sort ϡ$sp1, $sp2, $orf1, $orf2 򥭡ȤƼ¹
    $fh->print(join(" ", $sp1, $orf1, $sp2, $orf2, @data) . "\n");

    return;
}

###############################################################################
#
sub pack_bldp {
    my($file_sorted) = shift;
    my($file_pack) = shift;

    my(%hompair_hash);
    my($fhr) = IO::File->new("$file_sorted") || die("Can not open $file_sorted($!)");
    my($fhw) = IO::File->new(">$file_pack")  || die("Can not open $file_pack($!)");
    while (my$line=$fhr->getline()) {
        $line =~ s#[\r\n]*$##;
        my($sp1, $orf1, $sp2, $orf2,
           $from1, $to1, $from2, $to2,
           $ident, $eval, $score, $pam) = split(/\s+/, $line);
        my($spname1) = "$sp1:$orf1";
        my($spname2) = "$sp2:$orf2";
        my($key1) = join(":", $spname1, $from1, $to1);
        my($key2) = join(":", $spname2, $from2, $to2);
        if (exists($hompair_hash{"$key1:$key2"})
         || exists($hompair_hash{"$key2:$key1"})) {
#print STDERR "Found :: duplicated pair :: $key1 - $key2\n";
            next;
        }
        $hompair_hash{"$key1:$key2"} = 1;

        #
        my($pd) = pack($main::PACK_TEMPL38, $spname1,      $spname2,
                                            $from1, $to1,  $from2, $to2,
                                            $ident, $eval, $score, $pam);
        $fhw->print($pd);
    }
    $fhw->close();
    $fhr->close();

    return;
}

###############################################################################
sub CopyHomologyInfo {
    my($dbname, $dbnameAccum, @spid_list) = @_;
    my($db, $sql, $sth);
    my($info);
    my($filenameMd5);
    my($filenameSp);
    my($hom);
    my($filename);
    my($count);
    my($fh);
    my($cmd);
    my($skip_count);
    $PID = $$ if (! $PID);

    #
    my($whereSpid) = '';
    if (scalar(@spid_list) != 0) {
        $whereSpid = ' and p.spid in (' . "'" . join("','", @spid_list) . "'" . ')';
    }

    #
    my($opt_temporary) = 'temporary';
    if ($main::NO_TMP_TAB) {
        print STDERR "##### Use 'NOT temporary table' #####\n";
        $opt_temporary = '';
    }

    # Get entire data from the homology table of $DBNAME_ACCUM
    my($filenameMd5) = "$DIR_work/homologyMd5.$PID";
    if (! -e $filenameMd5 || -z $filenameMd5) {
        print STDERR "get homology data(MD5)\n";

        my($sql) = "select * from homology";
        my($cmd) = "$main::CMD_mysql -q -D $main::DBNAME_ACCUM -e '$sql'";
        system("$cmd > $filenameMd5");
    }

    # Expand sequence names from the MD5 values in the homologyMd5 table
    print STDERR "Convert MD5 ---> SP:ORF\n";
    my($dir_bldp) = sprintf("%s/database.work/bldp", $ENV{'MBGD_HOME'});
    mkpath($dir_bldp, 0, 0750);

    $filenameSp = "$dir_bldp/homologySp";
    my(@ext_sp_list) = ExpandHomology($dbnameAccum, $whereSpid, $filenameSp, $filenameMd5);
    print STDERR "Expand OK :: @ext_sp_list\n";

    #
    my($cmd_spindex) = "$ENV{'MBGD_HOME'}/binaries/spindex";
    my($dir_work) = "$ENV{'MBGD_HOME'}/work";

    #
    print STDERR "DBG :: [START] Sorting bldp. :: " . scalar(localtime()) . "\n";
    my($dir_out) = sprintf("%s/database.work/bldp", $ENV{'MBGD_HOME'});
    my($file_base) = sprintf("%s/%s", $dir_out, $main::Prefix);
    foreach my$ext_sp (@ext_sp_list) {
        my($file_in)  = get_filename_homology_sp($filenameSp,  $ext_sp);
        my($filename) = get_filename_homology_sp($file_base, $ext_sp);

        my($cmd) = "$main::CMD_sort --field-separator=' ' -k 1,1 -k 3,3 -k 2,2 -k 4,4 -T '$dir_work' $file_in | $main::CMD_uniq";
        print STDERR "DBG :: CMD :: $cmd\n" if ($main::DEBUG);
        my($ret) = system("$cmd > $file_in.uniq");
        if ($ret != 0) {
            print STDERR "ERROR :: CMD :: $cmd\n";
            die "Update process was terminated abnormally.($ret)";
        }
        unlink("$file_in");

        # packed
        print STDERR "DBG :: pack_bldp $ext_sp. :: " . scalar(localtime()) . "\n";
        pack_bldp("$file_in.uniq", "$filename");
        unlink("$file_in.uniq");

        #
        print STDERR "DBG :: spindex $ext_sp. :: " . scalar(localtime()) . "\n";
        my($cmd) = "$cmd_spindex $ext_sp";
        system("$cmd");

        print STDERR "DBG :: Done $ext_sp. :: " . scalar(localtime()) . "\n";
    }

    #
    my($cmd) = "$cmd_spindex -m @ext_sp_list";
    system("$cmd");

    #
    my($file_dirinfo) = "$dir_out/dirinfo";
    create_dirinfo($file_dirinfo);

    print STDERR "DBG :: Done. :: " . scalar(localtime()) . "\n";

    return;
}

###############################################################################
#
sub ExpandHomology {
    my($dbnameAccum) = shift;
    my($whereSpid) = shift;
    my($outfile) = shift;
    my(@fileList) = @_;
    my(%MD5_name, %MD5_length);
    my($filename);
    my(@data);
    my($md51, $md52);
    my($sporf1, $sporf2);
    my($sp1, $orf1);
    my($sp2, $orf2);
    my($from1, $to1);
    my($from2, $to2);
    my($idx);
    my($wk);
    my($fh);
    my($fo);
    my($n_total) = 0;

    #
    my $db = MBGD::DB->new($main::DBNAME_WORK);

    #
    my(%MD5_name);
    my(%MD5_length);

    #
    my($col) = "p.sp          as sp,"
             . "p.spid        as spid,"
             . "g.locus_tag   as name,"
             . "ps.md5sum     as md5sum,"
             . "ps.seq_length as length";
    my($tab) = "$dbnameAccum.project p,"
             . "$dbnameAccum.geneset gs,"
             . "$dbnameAccum.geneset_gene gsg,"
             . "$dbnameAccum.gene g,"
             . "$dbnameAccum.transcript t,"
             . "$dbnameAccum.proteinseq ps";
    my($where) = "p.id=gs.project_id"
               . "$whereSpid"
               . " and "
               . "gs.selected>0"
               . " and "
               . "gs.id=gsg.geneset_id"
               . " and "
               . "gsg.gene_id=g.id"
               . " and "
               . "g.id=t.gene_id"
               . " and "
               . " t.type='CDS' "
               . " and "
               . "t.proteinseq_id=ps.id"
               . "";
    my($sql) = "select $col from $tab where $where";
    print STDERR "SQL :: $sql\n";
    my($n_gene) = 0;
    my $sth = $db->execute("$sql");
    while (my $h = $sth->fetchrow_hashref) {
        my($spid) = $h->{'spid'};
        my($sp)   = $h->{'sp'};
        push(@{ $MD5_name{ $h->{md5sum} } }, "$sp:$h->{name}");
        $MD5_length{ $h->{md5sum} } = $h->{length};

        $n_gene++;
    }
    print STDERR "Done.(N=$n_gene)\n";

    # homology (MD5)եɤ߹
    my($len_ext) = 2;
    my($fh_sp_ref) = {};
    foreach $filename (@fileList) {
        $fh = new FileHandle("$filename") || die("Can not open $filename($!)");
        my $head = <$fh>;
        my $coln = 0;
        my %ColNum;
        chomp $head;
        ## header line
        foreach $colname (split(/\t/,$head)) {
            $ColNum{$colname} = $coln++;
        }
        while(<$fh>) {
            chomp();

            @data0 = split(/\t/);

            $md51  = $data0[$ColNum{name1}];
            $from1 = $data0[$ColNum{from1}];
            $to1   = $data0[$ColNum{to1}];
            $md52  = $data0[$ColNum{name2}];
            $from2 = $data0[$ColNum{from2}];
            $to2   = $data0[$ColNum{to2}];

            $ident   = $data0[$ColNum{ident}];
            ## oeval: original E-value
            $oeval   = $data0[$ColNum{eval}];
            $pam   = $data0[$ColNum{pam}];
            $score   = $data0[$ColNum{score}];

            $len1 = $MD5_length{"$md51"};
            $len2 = $MD5_length{"$md52"};
            ## neval: normalized E-value
            $neval = $oeval * $len1 * $len2 * 1e-5;

            foreach $sporf1 (@{$MD5_name{"$md51"}}) {
                ($sp1, $orf1) = split(/:/, $sporf1);
                foreach $sporf2 (@{$MD5_name{"$md52"}}) {
                    ($sp2, $orf2) = split(/:/, $sporf2);
#                    @data = ($sp1, $orf1, $from1, $to1,
#                             $sp2, $orf2, $from2, $to2,
#                             $ident, $neval, $score, $pam);
#                    write_homology($outfile, $fh_sp_ref, @data);

                    if ($sp1 gt $sp2) {
                        ($out_sp1,    $out_sp2)    = ($sp2,    $sp1);
                        ($out_orf1,   $out_orf2)   = ($orf2,   $orf1);
                        ($out_sporf1, $out_sporf2) = ($sporf2, $sporf1);
                        ($out_from1,  $out_from2)  = ($from2,  $from1);
                        ($out_to1,    $out_to2)    = ($to2,    $to1);
                    }
                    else {
                        ($out_sp1,    $out_sp2)    = ($sp1,    $sp2);
                        ($out_orf1,   $out_orf2)   = ($orf1,   $orf2);
                        ($out_sporf1, $out_sporf2) = ($sporf1, $sporf2);
                        ($out_from1,  $out_from2)  = ($from1,  $from2);
                        ($out_to1,    $out_to2)    = ($to1,    $to2);
                    }
                    my($ext_sp) = substr($out_sp1, 0, $len_ext);
                    if (!exists($fh_sp_ref->{"$ext_sp"})) {
                        my($filename) = get_filename_homology_sp($outfile, $ext_sp);
                        $fh_sp_ref->{"$ext_sp"} = FileHandle->new(">$filename") || die("Can not open $filename($!)");
                    }
                    my($fh) = $fh_sp_ref->{"$ext_sp"};

                    # "$sp1:$orf1", "$sp2:$orf2" ȽϤʤ
                    #  sort ϡ$sp1, $sp2, $orf1, $orf2 򥭡ȤƼ¹
                    $fh->print(join(" ", $out_sp1, $out_orf1,
                                         $out_sp2, $out_orf2,
                                         $out_from1, $out_to1,
                                         $out_from2, $out_to2,
                                         $ident, $neval, $score, $pam) . "\n");

                    $n_total++;
                }
            }
        }
        $fh->close();
    }
    print STDERR "LOG :: Total :: $n_total\n";

    #
    my(@sp_list) = sort(keys(%{$fh_sp_ref}));
    foreach my$sp1 (@sp_list) {
        $fh_sp_ref->{"$sp1"}->close();
    }

    return @sp_list;
}

###############################################################################
if ($0 eq __FILE__) {

    # STDERR ˽ϤƤե˽
    &openLogfile($main::log);

    if (! $main::dbname) {
        $main::dbname = $main::UPD_dbiWork;
    }
    if (! $main::dbname_accum) {
        $main::dbname_accum = $main::DBNAME_ACCUM;
    }
    if (-f $PIDFILE) {
        print STDERR "PID file $PIDFILE is found. ";
        if (&cmprFileDate1($PIDFILE, "5d") > 0) {
            open(PID, $PIDFILE);
            $PID = <PID>; chomp $PID;
            $continue = 1;
            print STDERR "-- PID is set to $PID\n";
            $now = time;
            utime $now, $now, $PIDFILE; # touch $PIDFILE
        } else {
            print STDERR "-- ignored\n";
        }
    }
    if (! $PID) {
        open(PID, ">$PIDFILE");
        print PID "$$\n";
    }

    #
    CopyHomologyInfo($main::dbname, $main::dbname_accum);

    unlink($PIDFILE) if (-f $PIDFILE);

    exit;
}

###############################################################################
1;#
###############################################################################
