#!/usr/bin/perl -s
use IO::File;
package ParseNCBIcomplete;
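###############################################################################
# Name
#     ParseNCBIcomplete.pm
# Summary
#     Parses the NCBI Entrez Genome Project file and holds its values.
#     http://www.ncbi.nlm.nih.gov/genomes/lproks.cgi?dump=selected
#     Records can be looked up using the first two words of the
#     Organism Name as the key.
# Arguments
#
#
#    Data structure
#      The parsed records are held in the following hash structures:
#         KEY                              VALUE
#      Organism Name (first two words)     reference to a list of hashes (one hash per data line)
#      taxon (Taxonomy ID)                 reference to a list of hashes (one hash per data line)
#      accession (accession ID)            reference to a list of hashes (one hash per data line)
#
# Methods
#     new()
#     init()
#     read()
#     getLineKTaxon()
#     getLineKName()
#     getLineKAccession()
#
# Class variables
#
# Notes
#
###############################################################################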
use strict;
require "MBGD_Conf.pl";

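###############################################################################
# Name
#     new()
# Summary
#     Constructor.
# Arguments
#     $that  : class name (or a reference to an instance)
#     $file  : file to read
# Return value
#     Reference to the new instance
# Notes
#
#
#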
sub new {
    # Constructor.
    #   $that : class name, or an existing instance whose class is reused
    #   $file : path of the NCBI dump file to parse (handed to init())
    # Returns the new blessed instance.
    my ($that, $file) = @_;

    # When $that is an object reference, take its package name instead.
    my $class = ref($that) || $that;

    # Create all three lookup tables up front so that every index filled
    # by read() exists (as an empty hash) even when the file is unreadable.
    my $self = bless {
        'parse'     => {},    # first two words of organism name => [records]
        'taxon'     => {},    # taxonomy ID                       => [records]
        'accession' => {},    # accession ID                      => [records]
    }, $class;

    $self->init($file);

    return $self;
}

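##############################################################################
# Name
#     init()
# Summary
#     Initializes the instance by reading the given file.
# Arguments
#     $file  : file to read
# Return value
#
# Notes
#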
sub init {
  # Initialise the instance: all the work is delegated to read(), whose
  # result is passed back to the caller unchanged.
  #   $file : path of the file to parse
  my ($self, $file) = @_;

  return $self->read($file);
}

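##############################################################################
# Name
#     read()
# Summary
#     Reads the file, parses it and holds its contents.
# Arguments
#     $file  : file to read
# Notes
#    Fields are assumed to be tab-separated by default (the file format may change).
#
#
#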
sub read {
  # Read the tab-separated NCBI genome dump $file and index every record.
  #
  #   $file : path of the file to parse
  #
  # Each data line becomes one hash (record key => field value) that is
  # appended to three indexes held in $self:
  #   $self->{'parse'}{<first two words of organism name>} = [ records ]
  #   $self->{'taxon'}{<taxonomy ID>}                      = [ records ]
  #   $self->{'accession'}{<accession ID>}                 = [ records ]
  #
  # Returns nothing (after a warning on STDERR) when the file cannot be
  # opened, otherwise the result of closing the file handle.
  my ($self, $file) = @_;

  # Column titles of the "## Columns:" header line => key used in a record.
  my %key_for_label = (
    "Project ID"                                   => 'pid',
    "Taxonomy ID"                                  => 'taxid',
    "Organism Name"                                => 'orgname',
    "Super Kingdom"                                => 'super',
    "Group"                                        => 'group',
    "Genome Size"                                  => 'size',
    "GC Content"                                   => 'gc',
    "Number of Chromosomes"                        => 'n_chromosome',
    "Number of Plasmids"                           => 'n_plasmid',
    "Released date"                                => 'date_released',
    "Modified date"                                => 'date_modified',
    "List of accessions (comma separated)"         => 'accession_id',
    "List of publications (comma separated)"       => 'medid',                # actually separated by '&uid='
    "List of Center/Consortium (pipe separated)"   => 'institution',
    ""                                             => 'wwwlink',              # the wwwlink column has no title
  );

  # An explicit read mode makes IO::File use the three-argument open, so
  # characters such as '>' or '|' in the file name are never interpreted.
  my $path = defined $file ? $file : '';
  my $fh = IO::File->new($path, 'r');
  if (!$fh) {
    print STDERR "WARNING :: Can not open $file($!)\n";
    return;
  }

  my @title;
  # A lexical line variable keeps the caller's $_ intact, and the defined()
  # test keeps a final line such as "0" from ending the loop early.
  while (defined(my $line = $fh->getline())) {
    $line =~ s/\r?\n\z//;        # strip LF as well as CRLF line endings

    next if $line =~ /## Complete Microbial Genome Page/;

    if ($line =~ /## Columns:/) {
      # Header line: drop the quotes and the leading marker (plus the one
      # separator that follows it), then remember the column titles.
      (my $header = $line) =~ s/\"//g;
      $header =~ s/## Columns:[ \t]?//;
      @title = split /\t/, $header;
      next;
    }

    # Blank lines and anything seen before the column header cannot form
    # a record; indexing them would only create empty entries under ''.
    next if $line !~ /\S/;
    next if !@title;

    my @field = split /\t/, $line;
    my $rec   = {};
    my $name;
    my $taxid = "";

    for my $col (0 .. $#title) {
      my $key = $key_for_label{ $title[$col] };
      next if !defined $key;     # unknown column: ignore instead of storing under ''

      my $val = $field[$col];
      if ($key eq 'date_released' || $key eq 'date_modified') {
        # m/d/y => yyyy-mm-dd; a missing or unparsable date is kept as the
        # placeholder 0000-00-00, as before.
        my ($m, $d, $y) = defined $val ? ($val =~ m#(\d+)/(\d+)/(\d+)#) : ();
        $val = sprintf("%04d-%02d-%02d", $y || 0, $m || 0, $d || 0);
      }
      $rec->{$key} = $val;
      print STDERR "DBG :: $key :: $val\n" if ($main::DEBUG);

      if ($key eq 'orgname') {
        # The lookup name is always "<word1> <word2>"; a missing word is
        # left empty so the key format stays what callers already use.
        my @word = split /\s/, (defined $val ? $val : '');
        $name = join ' ', map { defined $_ ? $_ : '' } @word[0, 1];
        print STDERR "DBG :: save name :: $name\n" if ($main::DEBUG);
      }
      elsif ($key eq 'taxid') {
        $taxid = $val;
        print STDERR "DBG :: save taxid :: $taxid\n" if ($main::DEBUG);
      }
    }

    # Index the record by the first two words of the organism name,
    # by taxonomy ID and by every accession ID it lists.
    push @{ $self->{'parse'}->{ defined $name ? $name : '' } }, $rec;
    push @{ $self->{'taxon'}->{ defined $taxid ? $taxid : '' } }, $rec;
    my $accessions = defined $rec->{'accession_id'} ? $rec->{'accession_id'} : '';
    for my $acc (split /,/, $accessions) {
      push @{ $self->{'accession'}->{$acc} }, $rec;
    }
  }

  return $fh->close();
}

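##############################################################################
# Name
#     getLineKName()
# Summary
#     Returns the data hashes stored under a key made of the first two
#     words of the organism name.
# Arguments
#     $name  : first two words of the organism name
# Return value
#     Reference to the list of record hashes, or undef if the key is unknown
# Notes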
sub getLineKName {
  # Look up the records stored under $name (the first two words of an
  # organism name).  Returns the list reference held in the 'parse' index,
  # or undef when there is no entry for $name.
  my ($self, $name) = @_;

  return exists $self->{'parse'}->{$name}
       ? $self->{'parse'}->{$name}
       : undef;
}

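##############################################################################
# Name
#     getLineKTaxon()
# Summary
#     Returns the data hashes stored under a Taxonomy ID.
# Arguments
#     $key   : Taxonomy ID
# Return value
#     Reference to the list of record hashes, or undef if the key is unknown
# Notes
#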
sub getLineKTaxon {
  # Look up the records stored under the Taxonomy ID $key.  Returns the
  # list reference held in the 'taxon' index, or undef when there is no
  # entry for $key.
  my ($self, $key) = @_;

  return exists $self->{'taxon'}->{$key}
       ? $self->{'taxon'}->{$key}
       : undef;
}

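##############################################################################
# Name
#     getLineKAccession()
# Summary
#     Returns the data hashes stored under an Accession ID.
# Arguments
#     $key   : Accession ID
# Return value
#     Reference to the list of record hashes, or undef if the key is unknown
# Notes
#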
sub getLineKAccession {
  # Look up the records stored under the accession ID $key.  Returns the
  # list reference held in the 'accession' index, or undef when there is
  # no entry for $key.
  my ($self, $key) = @_;

  return exists $self->{'accession'}->{$key}
       ? $self->{'accession'}->{$key}
       : undef;
}

##############################################################################
# Self-test, executed only when this file is run as a script: download the
# NCBI dump, parse it, and print every record found for each Taxonomy ID
# given on the command line as "key<TAB>value" lines.
if ($0 eq __FILE__) {
    require File::Temp;

    my $url = "http://www.ncbi.nlm.nih.gov/genomes/lproks.cgi?dump=selected";

    # Download into a freshly created, unpredictably named temporary file
    # rather than a fixed path in /tmp that the shell redirect would follow.
    my ($tmp_fh, $fileNcbi) = File::Temp::tempfile('NcbiOrg_XXXXXX',
                                                   SUFFIX => '.txt',
                                                   TMPDIR => 1);
    close($tmp_fh);

    # $main::CMD_wget is not set in this file; presumably it comes from
    # MBGD_Conf.pl -- verify.
    my $cmd = "$main::CMD_wget -q -O - '$url' > '$fileNcbi'";
    if (system($cmd) != 0) {
        # Do not parse a missing or truncated download.
        print STDERR "WARNING :: Command failed (status $?) :: $cmd\n";
    }
    else {
        my $obj = ParseNCBIcomplete->new($fileNcbi);

        foreach my $k (@ARGV) {
            foreach my $ref (@{ $obj->{'taxon'}->{"$k"} }) {
                foreach my $key (sort(keys(%{$ref}))) {
                    print "$key", "\t", $ref->{"$key"}, "\n";
                }
            }
        }
    }
    unlink($fileNcbi);
}

##############################################################################
1; #
##############################################################################
