#!/usr/local/bin/perl

use POSIX;
use MBGD;
use MBGD::Taxonomy;
use Property::Base;
use MbgdUserGenomeCommon;

package Property::HomolParam;
#
# most functions are inherited from the super class: Property::Base
#
@ISA = qw{ Property::Base };

$BLASTCUT = '1e-2';
$CLUSTER_MAXSP = '100';
$CLUSTER_MAXSP_MyMBGD = '60';

sub initialize {
	my($this, %option) = @_;
	$this->SUPER::initialize(%option);

	if (! $this->{Param}->{species}->{value}) {
		my @allspec = MBGD::Taxonomy->new->get_default_spec;
		$this->{Param}->{species}->{value}
			= $this->{Param}->{species}->{default}
			= join("|", @allspec);
	}
}
sub title {
	"Clustering Parameters";
}
sub printHelpSummary {
$DOMCLUST_URL = "/domclust/";
$DOMCLUST_PAPER_URL = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=PubMed&cmd=Retrieve&list_uids=16436801&dopt=Abstract";

<<EOF;
In MBGD, similarity relationships identified by all-against-all BLAST searches
with BLAST E-value <= $BLASTCUT are stored. For each gene pair,
an optimal local alignment score calculated by the dynamic
programming (DP) algorithm was also stored.
After filtering the similarity data with some criteria [selection step],
a hierarchical clustering algorithm, <A HREF=$DOMCLUST_URL>DomClust</A>
(<A HREF=$DOMCLUST_PAPER_URL>Uchiyama 2006</A>), is applied to
the similarity data for grouping genes [clustering step].
By default, <u>MBGD minimally filters similarity data in the selection step</u>
so that the clustering program can use as much information as possible.
You can control the similarity cutoff by the following parameters,
although generally we do not recommend you to modify them extremely.
Note that all pairs that are removed in the selection step
are treated as "missing relationships" in the clustering step and
are assigned the same value that is specified in
the "Score for missing relationships" below.
Note also that you can choose only at most $CLUSTER_MAXSP organisms
when you want to make your own clusters using the new parameter set.
EOF

}

sub getClassParam {
	my($this, $name) = @_;
	return $$name;
}

sub getValue {
	my($this, $name) = @_;

	if ($name eq '*missdist*') {
		return $this->calc_missdist;
	} else {
		return $this->SUPER::getValue($name);
	}
}

sub ChangeHomolParamButton {
	print "<FORM METHOD=\"POST\" ACTION=\"/htbin/SetParamScreen.pl\">\n";
###	print "<INPUT TYPE=hidden NAME=\"printSpec\" VALUE=1>\n" if($specFlag);
	print "<INPUT TYPE=submit VALUE=\"Change Homology Paramters\">\n";
	print "</FORM>\n";
}

sub calc_missdist {
	my($this) = @_;
	my(%par) = $this->asHash;
	my($missdist);
	if ($par{missdist} =~ /^\s*$/) {
		if ($par{sim_measure} eq 'score') {
			$missdist = $par{score} - 10;
		} elsif ($par{sim_measure} eq 'pam') {
			$missdist = $par{pam} + 10;
		}
	} else {
		$missdist = $par{missdist};
	}
	return $missdist;
}
sub getOption_select {
	my($this) = @_;
	my(%par) = $this->asHash;
	my($opt) = "";
	my($species) = join(",", split(/\|/, $par{species}));
	$opt .= " -SPEC=$species";
	$opt .= " -EVAL=$par{eval}" if ($par{eval});
	$opt .= " -SCORE=$par{score}" if ($par{score});
	$opt .= " -IDENT=$par{ident}" if ($par{ident});
	$opt .= " -PAM=$par{pam}" if ($par{pam});
	$opt .= " -COVER=$par{coverage}" if ($par{coverage});
	if ($par{besthit} && $par{besthit} ne 'none' && $par{ratiocut}>0) {
		$opt .= " -BESTHIT=$par{besthit}";
		$opt .= " -RATIOCUT=$par{ratiocut}" if ($par{ratiocut});
	}
	$opt =~ s/  */ /g; 
	return $opt;
}
sub getOption_domclust {
	my($this, $uInfo) = @_;
	my(%par) = $this->asHash;
	my($opt) = "";
	my($sim_measure) = $par{sim_measure};

	$opt = " -o1";
	if ($sim_measure eq 'score') {
		$opt .= " -S";
		$opt .= " -c$par{score}" ;
	} elsif ($sim_measure eq 'pam') {
		$opt .= " -c$par{pam}";
	}
##	my($missdist) = $this->calc_missdist;
##	$opt .= " -m$missdist";
	if ($par{clustmode} eq 'homology') {
		$opt .= " -p2";
	} else {
		$opt .= " -p$par{phylocut}";
	}
	if ($par{taxonlevel} && $par{taxonlevel} ne 'none' && $uInfo) {
		my $tax = MBGD::Taxonomy->new;
		my @taxlist = $tax->get_species(
				{list_related => $par{taxonlevel}});
		my $taxinfo;
		foreach $t (@taxlist) {
			if (@{$t} >= 2) {
				$taxinfo .= ('{'. join(',', @{$t}). '}');
			}
			
		}
		$uInfo->saveData($par{taxonlevel}, $taxinfo);
		$opt .= " -t$par{taxonlevel}";
	}
	if ($par{coverage2}) {
		$opt .= " -V$par{coverage2}"; 
	}
	if ($par{cutoff2}) {
		$opt .= " -C$par{cutoff2}"; 
	}
#	if ($par{sumcut}) {
#		$opt .= " -C$par{sumcut}"; 
#	}
	if ($par{sumcut}) {
		$opt .= " -s$par{sumcut}"; 
	}
	if ($par{domcut}) {
		$opt .= " -Odomcut"; 
	}
	if ($par{nbrratio}) {
		$opt .= " -OchkOvlpClst=$par{nbrratio}";
	}
	if ($par{adjovlp} && $par{adjovlp} ne '-') {
		$opt .= " -ao$par{adjovlp}";
	}
	if ($par{adjincl} && $par{adjincl} ne '-') {
		$opt .= " -ai$par{adjincl}";
	}
	$opt .= " -ne1";
	$opt =~ s/  */ /g; 
	return $opt;
}
sub setOptions {
	my($this, $optString) = @_;
	my(@commands) = split(/\|/, $optString);
	foreach my $cm (@commands) {
		if ($cm =~ /select\.pl /) {
			$this->setOption_select($cm);
		} elsif ($cm =~ /domclust /) {
			$this->setOption_domclust($cm);
		}
	}
}
sub setOption_select {
	my($this, $opt) = @_;
	my($par) = $this->asHashRef;
	if ($opt =~ /\-SPEC=(\S+)/) {
		my $species = $1;
		$par->{species} = join("|", split(/,/, $species));
	}
	$par->{'eval'} = $1 if ($opt =~ /\-EVAL=(\S+)/);
	$par->{'score'} = $1 if ($opt =~ /\-SCORE=(\S+)/);
	$par->{'ident'} = $1 if ($opt =~ /\-IDENT=(\S+)/);
	$par->{'pam'} = $1 if ($opt =~ /\-PAM=(\S+)/);
	$par->{'coverage'} = $1 if ($opt =~ /\-COVER=(\S+)/);
	$par->{'besthit'} = $1 if ($opt =~ /\-BESTHIT=(\S+)/);
	$par->{'ratiocut'} = $1 if ($opt =~ /\-RATIOCUT=(\S+)/);
}
sub setOption_domclust {
	my($this, $opt) = @_;
	my($par) = $this->asHashRef;
	if ($opt =~ /\-S/) {
		$par->{'sim_measure'} = 'score';
	}
	if ($opt =~ /\-c(\d+)/) {
		if ($par->{'sim_measure'} eq 'score') {
			$par->{'score'} = $1;
		} else {
			$par->{'pam'} = $1;
		}
	}
	if ($opt =~ /\-p([\d\.]+)/) {
		$par->{phylocut} = $1;
	}
	if ($opt =~ /\-V([\d\.]+)/) {
		$par->{coverage2} = $1;
	}
	if ($opt =~ /\-C([\d\.]+)/) {
		$par->{cutoff2} = $1;
	}
	if ($opt =~ /\-s([\d\.]+)/) {
		$par->{sumcut} = $1;
	}
	if ($opt =~ /\-ao([\d\.]+)/) {
		$par->{adjovlp} = $1;
	}
	if ($opt =~ /\-ai([\d\.]+)/) {
		$par->{adjincl} = $1;
	}
}

sub getMaxSpec {
	my($this) = shift;
	my($mod) = shift;
	my($uid) = shift;

    return $CLUSTER_MAXSP if ($mod !~ /mymbgd/i);

    if (! MbgdUserGenomeCommon::isReadyUserGenome($uid)) {
        return $CLUSTER_MAXSP;
    }
    my($dbname) = MbgdUserGenomeCommon::getDbname($uid);
    my($db) = new MBGD::DB($dbname);
    if (! $db->{'conn'}) {
        # Can not connect MySQL
        return $CLUSTER_MAXSP_MyMBGD;
    }

	my($tab) = 'proteinseq_user';
	my($res) = $db->select_fetch($tab, { 'columns' => 'count(*) as n'});
	my($ref) = $res->{'INFO'}->[0];

    my($maxSp) = $CLUSTER_MAXSP_MyMBGD;
    if ($ref->{'n'} != 0) {
        print STDERR "DBG :: # of $dbname.proteinseq_user :: $ref->{'n'}\n";
        $maxSp = POSIX::ceil(20000 / $ref->{'n'}) * 10;
        if ($CLUSTER_MAXSP < $maxSp) {
			$maxSp = $CLUSTER_MAXSP;
		}
    }
    print STDERR "DBG :: MAX species(MyMBGD) for $uid :: $maxSp\n";

    return $maxSp;
}

#######################################
1;
######################################
##
## homology parameters
##
######################################
package Property::HomolParam;
__DATA__
name: eval
fullname: Cutoff BLAST E-value
description: This value specifies a cutoff E-value of the BLAST results.
	The maximum possible value is $Property::HomolParam::BLASTCUT.
	Note that in MBGD, E-value is adjusted so that the size
	of the search space (the database size times the query length)
	is 1e9.
step: selection
default: 0.001
//
name: score
fullname: Cutoff DP score
description: Cutoff score of the optimal local alignment with
	JTT-PAM250 scoring matrix (Jones et al. 1992).
	The same cutoff is used for both the selection and the
	clustering steps when you use score as a similarity measure.
step: selection,clustering
#default: 70
default: 60
//
name: ident
fullname: Cutoff percent identity
description: Percent identity is defined as {the number of identical
	residue pairs} / {alignment length} * 100. Alignment
	length includes internal gaps.
step: selection
default:
//
name: pam
fullname: Cutoff PAM distance
description: PAM is a unit of evolutionary distance defined as
	the number of accepted point mutations per 100 residues
	(Dayhoff et al. 1978). Actually, PAM distance is
	estimated by which PAM substitution matrix gives
	the best alignment score.
	The same cutoff is used for both the selection and the
	clustering steps when you use PAM as a dissimilarity measure.
step: selection,clustering
default:
//
name: coverage
fullname: Alignment coverage
description: Alignment coverage is defined as {alignment length}
	/ {length of the shorter sequence} * 100.
	Raising this parameter removes matches in only short regions
	*before* the clustering procedure.
	MBGD does not make this check by default. 
step: selection
//
name: coverage2
fullname: Alignment coverage for domain splitting
description: In MBGD, a domain-splitting procedure is incorporated in
	the hierarchical clustering algorithm. When merging two most similar
	sequences (or clusters), the algorithm searches for
	another sequence (S3) that matches one of the merged
	sequences (S1) in the region outside the alignment between the
	merged pair. The algorithm splits the sequence S1 if such a sequence
	S3 is found and the alignment between S1 and S3 satisfies the
	coverage condition specified by this parameter and score condition
	specified by the next parameter.  Raise this parameter to avoid
	too short domains generated due to partial matches.
step: clustering
#default: 0.4
default: 0.6
//
name: cutoff2
fullname: Score cutoff for domain splitting
description: Cutoff score for the match between S1 and S3 described above
	to split the sequence. This parameter has similar but possibly
	complementary effect with the previous parameter.
step: clustering
#default: 100
default: 80
//
#name: sumcut
#fullname: Cutoff value of the sum of scores between groups
#step: clustering
#//
name: sim_measure
fullname: Similarity measure for orthology
type: radio
description: This option specifies which similarity or dissimilarity
	measure (score or PAM) to use for the orthology identification
	or the clustering process.
	Note that scores depend on the alignment lengths while PAMs do not.
options: score,pam
default: score
step: selection,clustering
//
name: besthit
fullname: Best hit criterion
description: The bi-directional best hit criterion
	(i.e. gene pairs (a,b) of genomes A and B s.t. a is the most
	similar gene of b in A and vice versa)
	is a conventional approach for ortholog identification
	between two genomes. The uni-directional version is also
	routinely used for predicting gene functions.
	MBGD does not use such a criterion in the selection step by default
	since the UPGMA algorithm itself must involve it,
	but in some situation it might be useful for the purpose of
	filtering out some apparent paralogs before clustering.
	See the next section for details.
type: radio
options: none,bidirec,unidirec
default: none
step: selection
//
name: ratiocut
fullname: Cutoff ratio of the score against the best
description: This parameter is not effective when you do not use
	the best-hit criterion above.<br>
	Orthology need not be one-to-one relationship.
	For bidirectional best-hit criterion, a gene pair
	(a,b) is considered as orthologs when score(a,b) satisfies
	<ul>
	score(a,b) / max( max_y( score(a,y) ), max_x( score(x,b) ) )
		* 100 >= <i>cutoff_ratio</i></ul>
	where x and y are any genes of genomes A and B, respectively.
	Using <i>cutoff_ratio</i> =100 corresponds to the exact bidirectional
	best-hit criterion.<br>
	Similarly, for unidirectional best-hit criterion, a gene pair
	(a,b) is considered as orthologs when
	<ul>
	score(a,b) / min( max_y( score(a,y) ), max_x( score(x,b) ) )
		* 100 >= <i>cutoff_ratio</i> </ul>
step: selection
//
name: missdist
fullname: Score for missing relationships
description: Although the usual hierarchical clustering algorithm requires
	a complete similarity/dissimilarity matrix,
	here we use only significant similarities found by the search.
	This option specifies a value
	to be assigned for the relationships missed by the search.
	The value must be smaller (larger) than the
	similarity (dissimilarity) cutoff.
	Specifying an extremely small (large) value will result in 
	classification similar to that by complete linkage clustering,
	whereas specifying a value close to the cutoff gives similar
	results to that by single linkage clustering.
	The default value (=blank) is {score_cutoff * 0.95} or
	{pam_cutoff / 0.95}.
step: clustering
//
name: clustmode
fullname: Clustering Mode
description: This option specifies whether orthologous or homologous
	groups shall be made. Actually, this is just equivalent to
	omitting the tree-splitting procedure described below
	by specifying <i>phylocut</i> &gt 1.
step: clustering
type: radio
options: orthology,homology
default: orthology
//
name: phylocut
fullname: Cutoff ratio of paralogs for tree splitting
description: In MBGD, orthologous groups are made by splitting
	trees of homologous clusters created by the hierarchical
	clustering algorithm.
	The node with two children A and B is split when
	<ul>
	| Intersect(Ph(A),Ph(B)) | / min( |Ph(A)|, |Ph(B)| ) > <i>phylocut</i>,
	</ul>
	where Ph(A) denotes a set of species contained
	in the node A (phylogenetic pattern), |Ph| denotes the cardinality
	of Ph, and Intersect(A,B) is an intersection
	of sets A and B. This parameter is not effective when you
	specify ClusteringMode = 'homology'.
default: 0.5
step: clustering
//
name: domcut
fullname: Use domcut criteria
type: check
//
name: taxonlevel
fullname: Phylogenetically related organisms
type: select
options: none, species, genus, family, order
description: When counting the number of species in the above calculation,
	one can incorporate taxonomic information by counting related
	species only once. You can specify a taxonomic rank to determine
	which set of organimsms you consider to be related.
default: none
step: clustering
//
name: adjovlp
fullname: Overlap ratio (<i>r<sub>adj1</sub></i>) for merging adjacent clusters
description: After the tree splitting procedure described above,
	two clusters of domains are joined when they are almost always
	adjacent to each other. More precisely, two clusters A and B
	are joined when
	<ul>
	|adjacent(A,B)| / max(|A|,|B|) &ge; <i>r<sub>adj1</sub></i>
	</ul>
	<i>or</i>
	<ul>
	|adjacent(A,B)| / min(|A|,|B|) &ge; <i>r<sub>adj2</sub></i> ,
	</ul>
	where adjacent(A,B) is a set of domains belonging to
	A and B that are adjacent to each other, and
	<i>r<sub>adj1</sub></i> and <i>r<sub>adj2</sub></i> are parameters
	satisfying
	0 &le; <i>r<sub>adj1</sub></i> &le; <i>r<sub>adj2</sub></i> &le; 1.
step: clustering
#no_set: true
type: select
options: -, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0
default: 0.8
//
name: adjincl
fullname: Coverage ratio (<i>r<sub>adj2</sub></i>) for absorbing adjacent small clusters
description: See above.  Note that this parameter is not effective
	if <i>r<sub>adj2</sub></i> &le; <i>r<sub>adj1</sub></i> .
step: clustering
#no_set: true
type: select
options: -,0.75, 0.8, 0.85, 0.9, 0.95, 1
default: 0.95
//
name: species
no_set:	true
//
name: motif_eval
no_set:	true
default: 0.001
//
