#!/usr/bin/perl -s
use strict;
use Carp;
use FileHandle;
package ConsenseTitle;
###############################################################################
#
# Constructor.
#
#   $class     - class name to bless the new object into
#   $file_conf - path of an optional configuration file; passed straight to
#                read_conf()
#
# Returns the new object after its configuration has been loaded and its
# title list has been emptied.
sub new {
    my($class, $file_conf) = @_;

    my $self = bless({}, $class);

    # Load settings first, then start with an empty title list.
    $self->read_conf($file_conf);
    $self->clear();

    return $self;
}

###############################################################################
#
# Load the settings used by this object, falling back to built-in defaults.
#
#   $filename - path of a configuration file (optional).  The file holds one
#               "key<TAB>value" pair per line; keys are case-insensitive and
#               a key that appears several times accumulates all its values.
#               Blank lines and lines starting with '#' are ignored.
#
# Recognised keys and the object fields they populate:
#   delim        -> DELIM         characters placed inside a regex character
#                                 class; stored as '[<chars>]+'
#   stop_word    -> STOP_WORDS    (default list below when the key is absent)
#   stop_word2   -> STOP_WORDS and STOP_WORDS2 (default list below)
#   freqword_cut -> FREQWORD_CUT  default 0.3
#   penalty      -> PENALTY       default 0.5
#   sp_weight    -> SP_WEIGHT     default 2
#
# A missing, empty or whitespace-only value selects the default.  When
# $main::DEBUG is true every effective setting is echoed to STDERR.
# Returns nothing.
sub read_conf {
    my($self) = shift;
    my($filename) = shift;
    my($conf_ref) = {};

    if (defined($filename) && -e $filename) {
        # Explicit read mode: the name is taken literally rather than being
        # interpreted by the one-argument open() magic.
        my($fh) = FileHandle->new($filename, 'r');
        if (!$fh) {
            print STDERR "WARNING :: Can not open $filename($!)\n";
        }
        else {
            # defined() so that a final line consisting of "0" is not
            # mistaken for end of file.
            while (defined(my $line = $fh->getline())) {
                $line =~ s#[\r\n]*$##;
                next if ($line =~ /^\s*$/);
                next if ($line =~ /^\s*#/);

                my($k, $v) = split(/\t/, $line);
                $k = lc($k);
                push(@{$conf_ref->{$k}}, $v);
            }
            $fh->close();
        }
    }

    # First value configured for a key, or the default when that value is
    # missing, empty or whitespace-only.
    my $first_value = sub {
        my($key, $default) = @_;
        my $val = exists($conf_ref->{$key}) ? $conf_ref->{$key}->[0] : undef;
        return $default if (!defined($val) || $val =~ /^\s*$/);
        return $val;
    };
    my $debug = sub {
        my($name, $value) = @_;
        print STDERR "CONF :: $name = $value\n" if ($main::DEBUG);
        return;
    };

    # Word delimiters.
    $self->{'DELIM'} = '[' . $first_value->('delim', ' \-\.,;:\/') . ']+';
    $debug->('DELIM', $self->{'DELIM'});

    # Stop words.
    if (!exists($conf_ref->{'stop_word'})) {
        $conf_ref->{'stop_word'} = [qw(
            putative probable unknown hypothetical uncharacterized
            predicted conserved protein precursor domain family subunit
            factor function similarity like
        )];
    }
    if (!exists($conf_ref->{'stop_word2'})) {
        $conf_ref->{'stop_word2'} = [qw(
            and or the of with in on at to for from as
        )];
    }

    # Both tables are rebuilt from scratch so that calling read_conf() again
    # never leaves stale entries behind.  Words are stored lower-cased
    # because every lookup is made with a lower-cased title word.
    $self->{'STOP_WORDS'}  = {};
    $self->{'STOP_WORDS2'} = {};
    foreach my $w (@{$conf_ref->{'stop_word'}}, @{$conf_ref->{'stop_word2'}}) {
        my $word = lc(defined($w) ? $w : '');
        $self->{'STOP_WORDS'}->{$word} = 1;
        $debug->('STOP_WORDS', $word);
    }
    foreach my $w (@{$conf_ref->{'stop_word2'}}) {
        my $word = lc(defined($w) ? $w : '');
        $self->{'STOP_WORDS2'}->{$word} = 1;
        $debug->('STOP_WORDS2', $word);
    }

    # Numeric settings: [config key, object field, default].
    foreach my $setting (['freqword_cut', 'FREQWORD_CUT', 0.3],
                         ['penalty',      'PENALTY',      0.5],
                         ['sp_weight',    'SP_WEIGHT',    2]) {
        my($key, $field, $default) = @{$setting};
        $self->{$field} = $first_value->($key, $default);
        $debug->($field, $self->{$field});
    }

    return;
}

###############################################################################
#
# Forget every title registered so far by replacing the TITLES field with a
# fresh empty array reference.  Returns nothing.
sub clear {
    my($self) = @_;

    $self->{'TITLES'} = [];
    return;
}

###############################################################################
#
# Register one title for the next consense_title() call.
#
#   $title - the description text
#
# Every parenthesised aside of the form " (...)" that is followed by a blank
# or a period is replaced by a single blank before the title is stored.  The
# title is appended to the TITLES list with a weight of 1.  Returns nothing.
sub set_title {
    my($self) = shift;
    my($title) = shift;

    # Substitute the pattern directly, one occurrence at a time.  Repeating
    # the substitution (instead of using /g) lets the blank left behind by
    # one removal serve as the leading blank of the next aside, e.g.
    # "x (a) (b) y" -> "x (b) y" -> "x y".
    1 while ($title =~ s/ \(.*?\)[ \.]/ /);

    my($ref) = {
        'tit'    => $title,
        'weight' => 1,
    };
    push(@{$self->{'TITLES'}}, $ref);

    return;
}

###############################################################################
#
# Pick the consensus title among the titles registered with set_title().
#
# Steps:
#   1. Count, per lower-cased word, the summed weight of the titles that
#      contain it (a word counts once per title).  Words are obtained by
#      splitting each title on the DELIM pattern.
#   2. Find the highest count among non-stop words; if that is zero, use the
#      highest count among stop words instead.
#   3. A word is "frequent" when its count reaches FREQWORD_CUT times that
#      maximum.  Frequent stop words have their count lowered to exactly
#      that threshold.
#   4. Score every title: each distinct frequent word adds count * weight,
#      every other word subtracts PENALTY.  A frequent STOP_WORDS2 word only
#      counts as frequent when the words on both sides of it are frequent
#      too.  The frequent words, joined by the delimiters that preceded them
#      in the title, form that title's consensus string.
#
# Returns the list (consensus string with its first letter upper-cased,
# original title, score) of the first title reaching the highest score.  The
# running best score starts undefined and a title must score strictly above
# it, so all three values are undef when no title scores above zero.
sub consense_title {
    my($self) = shift;
    my $entries   = $self->{'TITLES'};
    my $Delim     = $self->{'DELIM'};
    my $cut_ratio = $self->{'FREQWORD_CUT'};
    my(%Count, %FreqWord);

    # Step 1: per-word weighted title counts.
    foreach my $entry (@{$entries}) {
        my %seen;
        foreach my $lcw (map { lc($_) } split(/$Delim/, $entry->{tit})) {
            next if ($seen{$lcw}++);
            $Count{$lcw} += $entry->{weight};
        }
    }

    # Step 2: best count for ordinary words and for stop words.
    my($top, $top_stop);
    foreach my $word (keys %Count) {
        my $cnt = $Count{$word};
        if ($self->{'STOP_WORDS'}->{"$word"}) {
            $top_stop = $cnt if ($top_stop < $cnt);
        }
        else {
            $top = $cnt if ($top < $cnt);
        }
    }
    $top = $top_stop if ($top == 0);

    # Step 3: flag frequent words, capping frequent stop words.
    foreach my $word (keys %Count) {
        next unless ($Count{$word} >= $top * $cut_ratio);

        $FreqWord{$word} = 1;
        $Count{$word} = $top * $cut_ratio if ($self->{'STOP_WORDS'}->{"$word"});
        print STDERR "FREQ>>$word,$Count{$word}\n" if ($main::DEBUG);
    }

    # Step 4: score each title and remember the best one.
    my($best_tit, $best_orig, $best_score);
    foreach my $entry (@{$entries}) {
        my $tit    = $entry->{tit};
        my $weight = $entry->{weight};
        my @words  = split(/$Delim/, $tit);
        my @seps   = ($tit =~ /$Delim/g);
        my($score, $constit, $sep, %seen);

        foreach my $idx (0 .. $#words) {
            my $w   = $words[$idx];
            my $lcw = lc($w);

            my $keep;
            if ($self->{'STOP_WORDS2'}->{$lcw} && $FreqWord{$lcw}) {
                # Connective word: keep it only between two frequent words.
                $keep = ($idx > 0)
                     && $FreqWord{ lc($words[$idx - 1]) }
                     && ($idx < $#words)
                     && $FreqWord{ lc($words[$idx + 1]) };
            }
            else {
                $keep = $FreqWord{$lcw};
            }

            if ($keep) {
                # A repeated word is kept in the text but scored only once.
                $score += $Count{$lcw} * $weight unless ($seen{$lcw});
                $constit .= $sep if ($constit);
                $constit .= $w;
            }
            else {
                $score -= $self->{'PENALTY'};
            }

            # Delimiter that follows this word, i.e. precedes the next one.
            $sep = $seps[$idx];
            $seen{$lcw} = 1;
        }

        print STDERR "$weight>$tit>$constit>$score\n" if ($main::DEBUG);

        if ($score > $best_score) {
            ($best_tit, $best_orig, $best_score) = (ucfirst($constit), $tit, $score);
        }
    }

    print STDERR "return ($best_tit, $best_orig, $best_score)\n" if ($main::DEBUG);

    return ($best_tit, $best_orig, $best_score);
}

###############################################################################
package main;
# Command-line driver, run only when this file is executed directly.
#
# Usage: ConsenseTitle.pl [-CONF=file] [-DEBUG] input.tab
#   (the -CONF / -DEBUG switches are set by perl's -s option on the #! line)
#
# The input is tab-separated with the cluster id in the first column and the
# description in the ninth; rows of one cluster must be adjacent.  Blank
# lines and lines starting with '#' are skipped.  For every cluster one line
# "cluster-id<TAB>consensus title" is printed to STDOUT; with -DEBUG the
# line goes to STDERR instead, prefixed by ">>" and followed by the score.
if ($0 eq __FILE__) {
    my($filename) = $ARGV[0];

    my($file_conf) = "$ENV{'MBGD_HOME'}/etc/ConsenseTitle.conf";
    if (defined($main::CONF) && -e "$main::CONF") {
        $file_conf = $main::CONF;
    }
    my($cons_tit_ref) = ConsenseTitle->new($file_conf);

    # Emit the consensus of the titles collected for one cluster.
    my $report = sub {
        my($cid) = @_;
        my($maxtit, $maxtit_orig, $maxscore) = $cons_tit_ref->consense_title();

        if ($main::DEBUG) {
            print STDERR ">>", join("\t", $cid, $maxtit, $maxscore), "\n";
        }
        else {
            print join("\t", $cid, $maxtit), "\n";
        }
        return;
    };

    my($prevcid);

    my($fh) = FileHandle->new("$filename") || croak("Can not open $filename($!)");
    while (defined(my $line = $fh->getline())) {
        next if ($line =~ /^\s*$/);
        next if ($line =~ /^\s*#/);

        $line =~ s#[\r\n]*$##;

        # Columns (0-based): 0 cid, 1 name, 2 dom, 3 from, 4 to, 5 spnum,
        # 6 geneid, 7 mbgd_gene, 8 mbgd_descr.  Only 0 and 8 are used.
        my($cid, $mbgd_descr) = (split(/\t/, $line))[0, 8];

        # Cluster ids are compared as strings so that non-numeric ids are
        # told apart, and "defined" lets an id of 0 count as a real cluster.
        if (defined($prevcid) && $cid ne $prevcid) {
            $report->($prevcid);
            $cons_tit_ref->clear();
        }
        $prevcid = $cid;

        $cons_tit_ref->set_title($mbgd_descr);
    }
    $fh->close();

    # Flush the last cluster; nothing to report when the input had no rows.
    $report->($prevcid) if (defined($prevcid));
}

###############################################################################
1;#
###############################################################################
