#!/usr/local/bin/perl


use MBGD::Taxonomy;
package Property::HomolParam;
$BLASTCUT = '1e-2';
$MAXORGS = '60';

sub new {
	my($class, %option) = @_;
	return ${__Param} if (! $option{reset} && ${__Param});
	my($this) = ${__Param} = {};
	my($tmpData) = {};
	my($Param) = {};
	while(<DATA>){
		if (/^([a-z0-9_]+):\s*(\S.*)$/) {
			## field name
			$name = $1;
			$value = $2;
			while ($value =~ /(\$\w+)/g) {
				$var = "\\" . $1; $val = eval $1;
				$value =~ s/$var/$val/;
			}
			$tmpData->{$name} = $value;
		} elsif (/^\/\//) {
			## record delimiter
			if (! $tmpData->{varname}) {
				$tmpData->{varname} = $tmpData->{name};
			}
			$tmpData->{value} = $tmpData->{default};
			$Param->{$tmpData->{name}} = $tmpData;
			push(@{$List}, $tmpData->{name});
			$tmpData = {};
		} elsif (/^\s+\S/) {
			## continuing line
			s/^\s+/ /;
			$value = $_;
			while ($value =~ /(\$\w+)/g) {
				$var = "\\" . $1; $val = eval $1;
				$value =~ s/$var/$val/;
			}
			$tmpData->{$name} .= $value;
		}
	}
	my @allspec = MBGD::Taxonomy->new->get_default_spec;

	$Param->{species}->{value} = $Param->{species}->{default}
		= join("|", @allspec);
	$this->{Param} = $Param;
	$this->{List} = $List;

	bless $this,$class;

	if ($option{datafile}) {
		$this->readdata($option{datafile});
	}
	return $this;
}

# Create and return a reference of a tie-hash object HomolParamHash
sub asHashRef {
	my($this) = @_;
	my(%homHash);
	tie %homHash, "Property::HomolParamHash", $this;
	return \%homHash;
}
# a copied hash table is returned (unoverwritable)
sub asHash {
	my($this) = @_;
	return %{ $this->asHashRef };
}

sub printHelpParam {
	my($this) = @_;
	print <<EOF;
In MBGD, similarity relationships identified by all-against-all BLAST searches
with BLAST E-value <= $BLASTCUT are stored. For each gene pair,
an optimal local alignment score calculated by the dynamic
programming (DP) algorithm was also stored.
After filtering the similarity data with some criteria [selection step],
a hierarchical clustering algorithm known as UPGMA is applied to
the similarity data for grouping genes [clustering step].
By default, <u>MBGD minimally filters similarity data in the selection step</u>
so that the clustering program can use as much information as possible.
You can control the similarity cutoff by the following parameters,
although generally we do not recommend you to modify them extremely.
Note that all pairs that are removed in the selection step
are treated as "missing relationships" in the clustering step and
are assigned the same value that is specified in
the "Score for missing relationships" below.
Note also that you can choose only at most $MAXORGS organisms when you want
to make your own clusters using the new parameter set.
EOF

	foreach $n (@{$this->{List}}) {
		__printParamHelp($this->{Param}->{$n});
	}
}
sub __printParamHelp {
	my($data) = @_;
	if ($data->{fullname} && $data->{description}) {
		print "<H4>$data->{fullname}  [$data->{step}]</H4>\n";
		print "$data->{description}\n";
	}
}
sub printParamSetTab {
	my($this, $mode) = @_;
	print "<table BORDER>\n";
	print qq{<tr><th> Parameters
	<A href="/htbin/SetParamScreen.pl?mode=help">[Help]</A><th>Value};
	foreach $n (@{$this->{List}}) {
		next if (defined $this->{Param}->{$n}->{no_set});
		if ($mode eq 'edit') {
			__printParamSetTabEdit($this->{Param}->{$n});
		} else {
			__printParamSetTab($this->{Param}->{$n});
		}
	}
	print "</table>\n";
}
sub __printParamSetTab {
	my($data) = @_;
	print "<tr><td>$data->{fullname}";
	print "<td>$data->{value}\n";

}
sub __printParamSetTabEdit {
	my($data) = @_;
	print "<tr><td>$data->{fullname}";

	my($value) = $data->{value};
	$value = $data->{default} if (!defined $value);

	if ($data->{type} eq 'radio') {
		my @opt = split(/,/, $data->{options});
		print "<td>";
		foreach $o (@opt) {
			$checked = ($o eq $value ? "CHECKED" : "");
		 	print qq{<input TYPE="radio" } .
			  qq{NAME="$data->{varname}" VALUE="$o" $checked>$o\n};
		}
	} elsif ($data->{type} eq 'select') {
		my @opt = split(/,/, $data->{options});
		print qq{<td><select NAME="$data->{varname}">\n};
		foreach $o (@opt) {
			$selected = ($o eq $value ? "SELECTED" : "");
		 	print qq{<option VALUE="$o" $selected>$o\n};
		}
	} else {
		print qq{<td><input NAME="$data->{varname}" };
		print qq{VALUE="$value">\n};
	}
}
sub getValue {
	my($this, $name) = @_;

	if ($name eq '*missdist*') {
		return $this->calc_missdist;
	}
if (! defined $this->{Param}->{$name}) {
	warn "accessing undefined parameter [$name]\n";
	return '';
}

	return $this->{Param}->{$name}->{value};
}
sub setValue {
	my($this, $name, $val) = @_;
	if (defined $this->{Param}->{$name}) {
		$this->{Param}->{$name}->{value} = $val;
	} else {
		warn "accessing undefined parameter [$name]\n";
	}
}
sub getParams {
	my($this) = @_;
	my($data) = {};
	foreach $key (keys %{$this->{Param}}) {
		$data->{$key} = $this->getValue($key);
	}
	return $data;
}
sub setParams {
	my($this, $data) = @_;
	foreach $key (keys %{$data}) {
		$this->setValue($key, $data->{$key});
	}
}
sub readdata {
	my($this, $file) = @_;
	open(F, $file) || die "Can't open $file\n";
	while(<F>) {
		($name, $value) = split;
		$this->setValue($name, $value);
	}
	close(F);
}


sub ChangeHomolParamButton {
	print "<FORM METHOD=\"POST\" ACTION=\"/htbin/SetParamScreen.pl\">\n";
###	print "<INPUT TYPE=hidden NAME=\"printSpec\" VALUE=1>\n" if($specFlag);
	print "<INPUT TYPE=submit VALUE=\"Change Homology Paramters\">\n";
	print "</FORM>\n";
}

sub calc_missdist {
	my($this) = @_;
	my(%par) = $this->asHash;
	my($missdist);
	if ($par{missdist} =~ /^\s*$/) {
		if ($par{sim_measure} eq 'score') {
			$missdist = $par{score} - 10;
		} elsif ($par{sim_measure} eq 'pam') {
			$missdist = $par{pam} + 10;
		}
	} else {
		$missdist = $par{missdist};
	}
	return $missdist;
}
sub getOption_select {
	my($this) = @_;
	my(%par) = $this->asHash;
	my($opt) = "";
	my($species) = join(",", split(/\|/, $par{species}));
	$opt .= " -SPEC=$species";
	$opt .= " -EVAL=$par{eval}" if ($par{eval});
	$opt .= " -SCORE=$par{score}" if ($par{score});
	$opt .= " -IDENT=$par{ident}" if ($par{ident});
	$opt .= " -PAM=$par{pam}" if ($par{pam});
	$opt .= " -COVER=$par{coverage}" if ($par{coverage});
	if ($par{besthit} && $par{besthit} ne 'none' && $par{ratiocut}>0) {
		$opt .= " -BESTHIT=$par{besthit}";
		$opt .= " -RATIOCUT=$par{ratiocut}" if ($par{ratiocut});
	}
	$opt =~ s/  */ /g; 
	return $opt;
}
sub getOption_domclust {
	my($this) = @_;
	my(%par) = $this->asHash;
	my($opt) = "";
	my($sim_measure) = $par{sim_measure};

	$opt = " -o1";
	if ($sim_measure eq 'score') {
		$opt .= " -S";
		$opt .= " -c$par{score}" ;
	} elsif ($sim_measure eq 'pam') {
		$opt .= " -c$par{pam}";
	}
	my($missdist) = $this->calc_missdist;
#	my($missdist) = $par{missdist};
#	if ($missdist eq '') {
#		if ($sim_measure eq 'score') {
#			$missdist = $par{score} - 10;
#		} elsif ($sim_measure eq 'pam') {
#			$missdist = $par{pam} + 10;
#		}
#	}
	$opt .= " -m$missdist";
	if ($par{clustmode} eq 'homology') {
		$opt .= " -p2";
	} else {
		$opt .= " -p$par{phylocut}";
	}
#	if ($par{coverage}) {
#		$opt .= " -Ocoverage=$par{coverage}"; 
#	}
	$opt .= " -n1";
	$opt =~ s/  */ /g; 
	return $opt;
}
######################################
# Tie-hash interface for accessing the values directly
#	%h = $homolParam->asHash;
#	print $h{'eval'};
#	$h{'score'} = 100;
######################################
package Property::HomolParamHash;
use Tie::Hash;
@ISA = qw(Tie::Hash);
sub TIEHASH {
	my($class, $hom) = @_;
	die if (ref($hom) ne 'Property::HomolParam');
	bless {Hom => $hom}, $class;
}
sub FETCH {
	my($this, $key) = @_;
#print STDERR "fetch: $key\n";
	return $this->{Hom}->getValue($key);
}
sub STORE {
	my($this, $key, $value) = @_;
	$this->{Hom}->setValue($key, $value);
}
sub FIRSTKEY {
	my($this) = @_;
	scalar keys %{$this->{Hom}->{Param}};
	return scalar each %{$this->{Hom}->{Param}};
}
sub NEXTKEY {
	my($this) = @_;
	return scalar each %{$this->{Hom}->{Param}};
}
sub EXISTS {
	my($this) = @_;
	return exists $this->{Hom}->{Param};
}
######################################
1;
######################################
##
## homology parameters
##
######################################
package Property::HomolParam;
__DATA__
name: eval
fullname: Cutoff BLAST E-value
description: This value specifies a cutoff E-value of the BLAST results.
	The maximum possible value is $BLASTCUT.
	Note that in MBGD, E-value is adjusted so that the size
	of the search space (the database size times the query length)
	is 1e9.
step: selection
default: 0.001
//
name: score
fullname: Cutoff DP score
description: Cutoff score of the optimal local alignment with
	JTT-PAM250 scoring matrix (Jones et al. 1992).
	The same cutoff is used for both the selection and the
	clustering steps when you use score as a similarity measure.
step: selection,clustering
default: 80
//
name: ident
fullname: Cutoff percent identity
description: Percent identity is defined as {the number of identical
	residue pairs} / {alignment length} * 100. Alignment
	length includes internal gaps.
step: selection
default:
//
name: pam
fullname: Cutoff PAM distance
description: PAM is a unit of evolutionary distance defined as
	the number of accepted point mutations per 100 residues
	(Dayhoff et al. 1978). Actually, PAM distance is
	estimated by which PAM substitution matrix gives
	the best alignment score.
	The same cutoff is used for both the selection and the
	clustering steps when you use PAM as a dissimilarity measure.
step: selection,clustering
default:
//
name: coverage
fullname: Alignment coverage
description: Alignment coverage is defined as {alignment length}
	/ {length of the shorter sequence} * 100.
	Raising this parameter removes matches in only short regions.
	MBGD does not make this check by default. 
step: selection
//
name: sim_measure
fullname: Similarity measure for orthology
type: radio
description: This option specifies which similarity or dissimilarity
	measure (score or PAM) to use for the orthology identification
	or the clustering process.
	Note that scores depend on the alignment lengths while PAMs do not.
options: score,pam
default: score
step: selection,clustering
//
name: besthit
fullname: Best hit criterion
description: The bi-directional best hit criterion
	(i.e. gene pairs (a,b) of genomes A and B s.t. a is the most
	similar gene of b in A and vice versa)
	is a conventional approach for ortholog identification
	between two genomes. The uni-directional version is also
	routinely used for predicting gene functions.
	MBGD does not use such criterion in the selection step by default,
	but it might be useful for the purpose of filtering out
	some apparent paralogs in some situation.
	See the next section for details.
type: radio
options: none,bidirec,unidirec
default: none
step: selection
//
name: ratiocut
fullname: Cutoff ratio of the score against the best
description: This parameter is not effective when you do not use
	the best-hit criterion above.<br>
	Orthology need not be one-to-one relationship.
	For bidirectional best-hit criterion, a gene pair
	(a,b) is considered as orthologs when score(a,b) satisfies
	<ul>
	score(a,b) / max( max_y( score(a,y) ), max_x( score(x,b) ) )
		* 100 >= <i>cutoff_ratio</i></ul>
	where x and y are any genes of genomes A and B, respectively.
	Using <i>cutoff_ratio</i> =100 corresponds to the exact bidirectional
	best-hit criterion.<br>
	Similarly, for unidirectional best-hit criterion, a gene pair
	(a,b) is considered as orthologs when
	<ul>
	score(a,b) / min( max_y( score(a,y) ), max_x( score(x,b) ) )
		* 100 >= <i>cutoff_ratio</i> </ul>
step: selection
//
name: missdist
fullname: Score for missing relationships
description: Although the usual hierarchical clustering algorithm requires
	a complete similarity/dissimilarity matrix,
	here we use only significant similarities found by the search.
	This option specifies a value
	to be assigned for the relationships missed by the search.
	The value must be smaller (larger) than the
	similarity (dissimilarity) cutoff.
	Specifying an extremely small (large) value will result in 
	classification similar to that by complete linkage clustering,
	whereas specifying a value close to the cutoff gives similar
	results to that by single linkage clsutering.
	The default value (=blank) is {score_cutoff - 10} or
	{pam_cutoff + 10}.
step: clustering
//
name: clustmode
fullname: Clustering Mode
description: This option specifies whether orthologous or homologous
	groups shall be made. Actually, this is just equivalent to
	omitting the tree-splitting procedure described below
	by specifying <i>phylocut</i> &gt 1.
step: clustering
type: radio
options: orthology,homology
default: orthology
//
name: phylocut
fullname: Cutoff ratio of paralogs for tree splitting
description: In MBGD, orthologous groups are made by splitting
	trees of homologous clusters created by the hierarchical
	clustering algorithm.
	The node with two children A and B is split when
	<ul>
	| Intersect(S(A),S(B)) | / min( |S(A)|, |S(B)| ) > <i>phylocut</i>,
	</ul>
	where S(A) denotes a set of species contained in the node A,
	|S| denotes the cardinality of S, and Intersect(A,B) is an intersection
	of sets A and B. This parameter is not effective when you
	specify ClusteringMode = 'homology'.
default: 0.4
step: clustering
//
name: species
no_set:	true
//
name: motif_eval
no_set:	true
//
