#!/usr/bin/perl -s
use IO::File;
package ParseKEGGcomplete;
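###############################################################################
# Name
#     ParseKEGGcomplete.pm
#
# Description
#     Parses a saved copy of the KEGG Organisms page (complete genomes) and
#     keeps the parsed data in memory.
#     http://www.genome.jp/kegg/catalog/org_list.html
#     - Records can be looked up by the first two words of the organism name.
#     - Records can be looked up by the KEGG organism symbol.
#
# Data
#
#    Each record holds the following items.
#	key			value
#       sp			symbol of the organism
#       orgname			organism name
#       wwwlink			link attached to the symbol
#       medid			literature number (MEDID)
#       source			data source
#       sourcewww		link to the data source
#
#    Data structure
#      Two hash structures are kept.
#      - Structure searchable by the first two words of the species name
#         KEY		   VALUE
#      Organism Name	\@line (list of per-row record hashes)
#      @hash_ref_list = @{$self->{'name'}->{$orgamism_name}};
#      Each per-row record hash uses the item name as KEY and the
#      corresponding value as VALUE.
#
#      - Structure searchable by the species symbol
#      $ref = $self->{'sym'};
#
# Methods
#     new()
#     init()
#     read()
#     parseTR()
#     getLine()
#     getLineSym()
#     getLineName()
#     getAll()
#
# Class variables
#
# Notes
#
###############################################################################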
use strict;
use Net::FTP;
require "MBGD_Conf.pl";

###############################################################################
# Name
#     new()
# Description
#     Constructor. Creates the instance with empty 'name' and 'sym' lookup
#     tables and then hands the arguments to init().
# Arguments
#     $that     : class name, or an existing instance whose class is reused
#     $file     : file to read
#     $division : division selector, passed through to init()
# Returns
#     The blessed instance.
#
sub new {
    my ($that, $file, $division) = @_;

    # When $that is an object, use its package name; otherwise it already is one.
    my $class = ref($that) || $that;

    my $self = bless {
        'name' => {},
        'sym'  => {},
    }, $class;

    $self->init($file, $division);

    return $self;
}

##############################################################################
# Name
#     init()
# Description
#     Initialises the instance by loading the given file through read().
# Arguments
#     $file : file to read
#     $div  : division selector, passed through to read()
# Returns
#     Whatever read() returns.
#
sub init {
    my ($self, $file, $div) = @_;

    return $self->read($file, $div);
}

##############################################################################
# Name
#     read()
# Description
#     Reads the file line by line, collects the text of each table row
#     (from a line containing "<tr" up to a line containing "</tr>") and
#     hands the collected text to parseTR().
#
#     Lines containing an <h4 class=...> heading followed by "Eukaryotes" or
#     "Prokaryotes" switch the current section. When $division matches
#     /Eukaryotes/i or /Prokaryotes/i only rows of that section are parsed;
#     for any other value every row is parsed.
# Arguments
#     $file     : path of the file to read
#     $division : section selector (see above)
# Returns
#     Nothing useful. If the file cannot be opened a warning is written to
#     STDERR and the method returns without parsing anything.
# Notes
#     - The file is opened explicitly for reading, so the file name is never
#       interpreted as an open() mode or pipe specification.
#     - Lines are read into a lexical variable; the caller's $_ is untouched.
#
sub read {
    my ($self, $file, $division) = @_;

    # An undefined name must fail like any other unopenable file (a 3-arg
    # open of undef would silently yield an empty temporary file instead).
    my $fh = defined($file) ? IO::File->new($file, 'r') : undef;
    if (!$fh) {
        print STDERR "WARNING :: Can not open $file($!)\n";
        return;
    }

    # Requested section: 1 = Eukaryotes, 2 = Prokaryotes, 0 = no filtering.
    my $wanted = 0;
    if ($division =~ /Eukaryotes/i) {
        $wanted = 1;
    }
    elsif ($division =~ /Prokaryotes/i) {
        $wanted = 2;
    }

    my $section = 0;     # section currently being read (same coding as $wanted)
    my $row     = "";    # text collected for the current table row

    while (defined(my $text = $fh->getline())) {
        $row .= $text;

        if ($text =~ /\<h4 class=.*\>Eukaryotes/) {
            $section = 1;
            next;
        }
        if ($text =~ /\<h4 class=.*\>Prokaryotes/) {
            $section = 2;
            next;
        }

        if ($text =~ /\<tr/) {
            # A new row starts: discard whatever was collected before it.
            $row = $text;
        }
        elsif ($text =~ /\<\/tr\>/) {
            # Row complete: parse it unless it belongs to an unwanted section.
            if (!$wanted || $wanted == $section) {
                $self->parseTR($row);
            }
        }
    }
    $fh->close();
}

##############################################################################
# Name
#     parseTR()
# Description
#     Parses the text of one table row and registers the organism it
#     describes. The text is examined line by line; every line of the form
#         <td align...><a href='LINK'>NAME</a>
#     supplies, in order of appearance:
#         1st : organism name            (NAME)
#         2nd : organism symbol and link (NAME, LINK)
#         3rd : literature id, taken from "list_uids=<digits>" in LINK
#         4th : data source and link     (NAME, LINK)
#     A line "<td align=center><digits></td>" marks the literature column as
#     seen and sets the literature id to the empty string. A line
#     "<td align=left>&nbsp;</td>" ends the scan.
#
#     The row is stored only when the data source column was read. The record
#     is stored in $self->{'sym'} under the symbol and appended to the list in
#     $self->{'name'} under the first two whitespace-separated words of the
#     organism name.
# Arguments
#     $html : text of one table row (lines separated by "\n")
# Returns
#     Nothing useful.
# Raises
#     Dies with "invalid line ..." when a further link column follows the
#     data source column.
#
sub parseTR {
    my ($self, $html) = @_;

    # Initialise every field explicitly: assigning a single "" to the whole
    # list would leave all variables but the first undefined.
    my ($spname, $symname, $symlink, $medid, $source, $soulink) = ('') x 6;
    my $mflag = 0;    # set once the literature column has been handled
    my $flag  = 0;    # set once the data source column has been read

    foreach my $cell (split /\n/, $html) {
        if ($cell =~ /\<td align.*\>\<a href='(.*)'\>(.*)\<\/a\>/) {
            my ($link, $name) = ($1, $2);
            if (length($spname) < 1) {
                $spname = $name;
            }
            elsif (length($symname) < 1) {
                $symname = $name;
                $symlink = $link;
            }
            elsif ($mflag < 1) {
                # Only a link carrying list_uids=... counts as the literature id.
                if ($link =~ /list_uids=(\d+)/) {
                    $medid = $1;
                    $mflag = 1;
                }
            }
            elsif (length($source) < 1) {
                $source  = $name;
                $soulink = $link;
                $flag    = 1;
            }
            else {
                die "invalid line $cell";
            }
        }
        if ($cell =~ /\<td align=center\>\d+\<\/td\>/) {
            $mflag = 1;
            $medid = "";
        }
        if ($cell =~ /\<td align=left\>&nbsp;\<\/td\>/) {
            last;
        }
    }

    return unless $flag;

    # Update the record in place so that references already held in the
    # 'name' lists keep pointing at the current data for this symbol.
    my $record = $self->{'sym'}->{$symname} ||= {};
    $record->{'orgname'}   = $spname;
    $record->{'sp'}        = $symname;
    $record->{'wwwlink'}   = $symlink;
    $record->{'medid'}     = $medid;
    $record->{'source'}    = $source;
    $record->{'sourcewww'} = $soulink;

    # Index by the first two words of the organism name.
    my @words   = split /\s/, $spname;
    my $keyname = $words[0] . " " . $words[1];
    push @{ $self->{'name'}->{$keyname} }, $record;

    return;
}

##############################################################################
# Name
#     getLineName()
# Description
#     Returns the formatted lines of every record registered under the given
#     organism name key (the first two words of the organism name, as stored
#     in $self->{'name'} by parseTR()).
# Arguments
#     $name : organism name key, e.g. "Genus species"
# Returns
#     The getLine() output of all matching records concatenated in stored
#     order, or undef when no record is registered under $name.
#
sub getLineName {
    my ($self, $name) = @_;

    # Records are indexed under 'name'; an unknown key simply has no records.
    my $records = $self->{'name'}->{$name} || [];

    my $hitline;
    foreach my $record (@{$records}) {
        $hitline .= $self->getLine($record);
    }
    return $hitline;
}

##############################################################################
# Name
#     getLineSym()
# Description
#     Returns the formatted line of the record stored in $self->{'sym'} under
#     the given organism symbol.
# Arguments
#     $name : organism symbol
# Returns
#     The getLine() output for that record.
#
sub getLineSym {
    my ($self, $name) = @_;

    my $record = $self->{'sym'}->{$name};

    return scalar $self->getLine($record);
}

##############################################################################
# Name
#     getLine()
# Description
#     Formats one record as a single tab-separated, newline-terminated line.
#     Column order: orgname, sp, wwwlink, medid, source, sourcewww.
# Arguments
#     $ref : reference to a record hash
# Returns
#     The formatted line.
#
sub getLine {
    my ($self, $ref) = @_;

    my @columns = ('orgname', 'sp', 'wwwlink', 'medid', 'source', 'sourcewww');

    return join("\t", map { $ref->{$_} } @columns) . "\n";
}

##############################################################################
# Name
#     getAll()
# Description
#     Returns every record stored in $self->{'sym'}.
# Arguments
#     (none)
# Returns
#     A list of record hash references, in the hash's own key order.
#
sub getAll {
    my ($self) = @_;

    my @records = map { $self->{'sym'}->{$_} } keys %{ $self->{'sym'} };

    return @records;
}

##############################################################################
# Name
#     updateAccessions()
# Description
#     For each organism symbol, takes the record's 'sourcewww' URL and, when
#     it is an ftp:// URL, stores the result of getRefseqAccession() for the
#     URL's path in the record's 'accession' item. One FTP connection is kept
#     open and reused while consecutive URLs point at the same host.
#     Records whose 'sourcewww' is not an ftp:// URL are skipped.
# Arguments
#     @spList : organism symbols to process; when empty, all symbols in
#               $self->{'sym'} are processed in sorted order
# Returns
#     Nothing useful.
# Raises
#     Dies when the FTP connection or the login fails.
#
sub updateAccessions {
    my ($self, @spList) = @_;

    my $ftp;
    my $prevHost = undef;

    if (!@spList) {
        @spList = sort(keys(%{$self->{'sym'}}));
    }

    foreach my $symname (@spList) {
        my $url = $self->{'sym'}->{$symname}->{'sourcewww'};
        my ($host, $path) = ($url =~ m#ftp://([^\/]+)(.+)$#);
        next if (! $host);

        # Reconnect only when the host differs from the previous one.
        if ($host ne $prevHost) {
            $ftp->quit() if ($ftp);

            print STDERR "FTP connect($host)\n" if ($main::v);
            $ftp = Net::FTP->new($host);
            die("ERROR :: FTP connect($host)\n") if (! $ftp);

            my $logged_in = $ftp->login();
            die("ERROR :: FTP login()\n") if (! $logged_in);

            $prevHost = $host;
        }

        $self->{'sym'}->{$symname}->{'accession'} = $self->getRefseqAccession($ftp, $path);
    }

    if ($ftp) {
        $ftp->quit();
    }
}

##############################################################################
# Name
#     getRefseqAccession()
# Description
#     Changes to $path on the given FTP connection, lists the directory and
#     collects the accession part of every file name of the form
#     "AC_<digits>.gbk" or "NC_<digits>.gbk" (matched case-insensitively).
#     The connection is neither opened nor closed here.
# Arguments
#     $ftp  : connected FTP object providing cwd() and ls()
#     $path : directory to list
# Returns
#     A reference to the list of accessions found (possibly empty), or the
#     empty string when cwd() or ls() fails.
#
sub getRefseqAccession {
    my ($self, $ftp, $path) = @_;

    print STDERR "DBG :: Update accession list. :: $path\n";

    print STDERR "FTP cwd($path)\n" if ($main::v);
    my $changed = $ftp->cwd($path);
    return '' if (! $changed);

    print STDERR "FTP ls()\n" if ($main::v);
    my $listing = $ftp->ls();
    return '' if (! $listing);

    # Keep only the accession part of the GenBank flat file names.
    my @accessions = map { /^([AN]C_\d+)\.gbk$/i ? $1 : () } @{$listing};

    return \@accessions;
}

##############################################################################
# Self-test / command line use: runs only when this file is executed directly.
# Downloads the KEGG organism list, parses it and prints, for every organism
# symbol given on the command line, the items of its record as
# "key<TAB>value" lines (keys sorted).
#   $main::CMD_wget : download command (not defined in this file)
#   $main::DIV      : division selector; defaults to "Bacteria"
if ($0 eq __FILE__) {
    require File::Temp;

    my $url = "http://www.genome.jp/kegg/catalog/org_list.html";

    # Use a uniquely named temporary file instead of a fixed, predictable
    # path in /tmp.
    my ($tmp_fh, $fileKegg) = File::Temp::tempfile('KeggOrgXXXXXX',
                                                   SUFFIX => '.html',
                                                   TMPDIR => 1);
    close($tmp_fh);

    my $cmd = "$main::CMD_wget -q -O - $url > '$fileKegg'";
    my $status = system($cmd);
    if ($status != 0) {
        # Keep going: read() copes with an empty or missing file.
        print STDERR "WARNING :: Can not download $url (status=$status)\n";
    }

    my $div = $main::DIV;
    $div = "Bacteria" if (! $div);
    my $obj = ParseKEGGcomplete->new($fileKegg, $div);

    foreach my $k (@ARGV) {
        my $ref = $obj->{'sym'}->{"$k"};
        foreach my $key (sort(keys(%{$ref}))) {
            print "$key", "\t", $ref->{"$key"}, "\n";
        }
    }
    unlink($fileKegg);
}

##############################################################################
1; #
##############################################################################
