#!/usr/bin/perl -w
use strict;
use File::Basename;
use Getopt::Std;
# Driver script: repeatedly calls merge() (defined below in this file) on a
# cluster file until an iteration produces an empty (or missing) clusterset
# file, then publishes the final results next to the initial cluster file.
my $PROGRAM = basename $0;
my $USAGE=
"Usage: $PROGRAM [OPTION] CLUSTER_FILE [CLUSTERSET_FILE]
-t THRESHOLD
-e GAP_EXT
-o GAP_OPEN
-q QUEUE: passed to sge_script.pl as -q QUEUE
-1: only once (not recursive)

Oversplit domains are merged iteratively using the DSP score change.
Exceptionally, the largest cluster is not merged.
* resume the calculation by skipping the existing merge_test files
-R: forced resume (overwrite output files of size 0)

Intermediate output files:
CLUSTER_FILE*.merge_test(|.jobs|.jobs.watch|.summary)
CLUSTER_FILE*.merge(|.log|.renumber)
CLUSTER_FILE*.link.(to_check|merged)
CLUSTER_FILE*.clusterset.merged
cluster.to_ignore (in the current directory)

Final output files:
CLUSTER_FILE.merged
CLUSTER_FILE.clusterset.merged.total
";

my %OPT;
# getopts returns false on an unknown option or a missing option argument;
# stop with the usage text instead of running the pipeline half-configured.
if (! getopts('t:e:o:1q:R', \%OPT)) {
    print STDERR $USAGE;
    exit 1;
}

### Settings ###
# Score threshold used by merge() when selecting links to merge.
my $THRESHOLD = -0.05;
if (defined $OPT{t}) {
    # merge() splices this value into a generated perl one-liner, so only a
    # plain decimal number is accepted.
    $OPT{t} =~ /\A[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?\z/
        or die "$PROGRAM: -t THRESHOLD must be a number, got '$OPT{t}'\n";
    $THRESHOLD = $OPT{t};
}

# Command line of the merge test program handed to sge_script.pl by merge().
my $MERGE_TEST = "dom_merge_test_c";
# $MERGE_TEST .= " -r"; # align by region
if (defined $OPT{e}) {
    $MERGE_TEST .= " -e $OPT{e}";
}
if (defined $OPT{o}) {
    $MERGE_TEST .= " -o $OPT{o}";
}

# Optional "-q QUEUE" argument forwarded to sge_script.pl.
my $QUEUE = "";
if (defined $OPT{q}) {
    $QUEUE = "-q $OPT{q}";
}

# Resume flag forwarded to sge_script.pl: -r by default, -R when forced.
my $RESUME_OPT = "-r";
if ($OPT{R}) {
    $RESUME_OPT = "-R";
}

# How many of the most frequent cluster ids merge() appends to
# cluster.to_ignore on each call.
my $N_IGNORE = 1;

### Main ###
if (@ARGV == 0) {
    print STDERR $USAGE;
    exit 1;
}
my ($CLUSTER_FILE, $CLUSTERSET_FILE) = @ARGV;
# Name of the cluster file without its trailing iteration number.
my $INITIAL_CLUSTER_FILE = $CLUSTER_FILE;
$INITIAL_CLUSTER_FILE =~ s/\d*$//;

if ($OPT{1}) {
    merge($CLUSTER_FILE, $CLUSTERSET_FILE);
} else {
    # Iterate until nothing is left to merge.  "! -s" is true both for an
    # empty file and for a missing one, so the loop cannot spin forever when
    # the clusterset file could not be created (-z is false for missing files).
    do {
        ($CLUSTER_FILE, $CLUSTERSET_FILE) = merge($CLUSTER_FILE, $CLUSTERSET_FILE);
    } until (! -s $CLUSTERSET_FILE);

    # The link lives in the same directory as the last cluster file, and a
    # relative symlink target is resolved against the link's directory, so the
    # target must be the bare file name.  List-form system avoids the shell.
    system('ln', '-fs', basename($CLUSTER_FILE), "${INITIAL_CLUSTER_FILE}.merged") == 0
        or warn "$PROGRAM: cannot link ${INITIAL_CLUSTER_FILE}.merged to $CLUSTER_FILE\n";

    # Collect the merged links of every iteration of THIS cluster file
    # (CLUSTER_FILE*.link.merged) rather than every *.link.merged in the
    # current directory.  The prefix is single-quoted for the shell; the glob
    # star stays outside the quotes.
    my $quoted_prefix = $INITIAL_CLUSTER_FILE;
    $quoted_prefix =~ s/'/'\\''/g;
    $quoted_prefix = "'$quoted_prefix'";
    system("cat ${quoted_prefix}*.link.merged | links_to_clustersets.pl -n | sort -k1,1nr > ${quoted_prefix}.clusterset.merged.total") == 0
        or warn "$PROGRAM: failed to write ${INITIAL_CLUSTER_FILE}.clusterset.merged.total\n";
}
    
################################################################################
### Function ###################################################################
################################################################################
# merge(CLUSTER, CLUSTERSET)
#
# Runs one merge iteration on the cluster file CLUSTER:
#   1. appends the $N_IGNORE most frequent ids of CLUSTER to cluster.to_ignore
#      and writes the links still to be tested to CLUSTER.link.to_check
#      (optionally restricted by the CLUSTERSET file);
#   2. submits the merge tests through sge_script.pl, which is given
#      CLUSTER.merge_test as its -o argument;
#   3. gathers the non-empty CLUSTER.merge_test/LINK.out results into
#      CLUSTER.merge_test.summary;
#   4. keeps links whose second summary column is >= $THRESHOLD and whose
#      eighth column is > 0, merges them and renumbers the result;
#   5. symlinks the renumbered file to the next numbered cluster file name.
#
# CLUSTERSET may be undef or empty, in which case restrict_links.pl is called
# without a file argument.
#
# Returns (NEXT_CLUSTER_FILE, CLUSTER.clusterset.merged).
# Dies when the link list or the summary file cannot be opened or written.
# A failing external command only produces a warning, so the pipeline keeps
# its best-effort behaviour while no longer hiding failures.
#
# NOTE(review): cluster.to_ignore has a fixed name in the current directory
# and is appended to on every call, so ids recorded by earlier iterations stay
# in effect - confirm this is intended.
sub merge {
    my ($cluster, $clusterset) = @_;

    # Single-quote a string for interpolation into a shell command line.
    my $q = sub {
        my ($text) = @_;
        $text =~ s/'/'\\''/g;
        return "'$text'";
    };
    # Run a shell command line and report a non-zero status.
    my $run = sub {
        my ($command) = @_;
        my $status = system $command;
        if ($status != 0) {
            warn "merge: command failed (status $status): $command\n";
        }
        return $status;
    };

    my $outdir        = $cluster . ".merge_test";
    my $link_file     = "$cluster.link";
    my $to_check_file = "$cluster.link.to_check";
    my $summary_file  = "$outdir.summary";
    my $merged_links  = "$cluster.link.merged";
    my $merged_sets   = "$cluster.clusterset.merged";
    my $merge_file    = "$cluster.merge";
    my $merge_log     = "$cluster.merge.log";
    my $renumber_file = "$cluster.merge.renumber";

    # restrict links
    # An absent clusterset must stay an absent argument, not become ''.
    my $clusterset_arg = "";
    if (defined $clusterset && $clusterset ne "") {
        $clusterset_arg = $q->($clusterset);
    }
    $run->(sprintf "cat %s | cut.sh 1 | sort | uniq -c | sort -nr | head -n %s | cut.sh 2 >> cluster.to_ignore",
           $q->($cluster), $N_IGNORE);
    $run->(sprintf "cat %s | dom_network -l > %s",
           $q->($cluster), $q->($link_file));
    $run->(sprintf "cat %s | ignore_clusters.pl cluster.to_ignore | restrict_links.pl %s > %s",
           $q->($link_file), $clusterset_arg, $q->($to_check_file));

    # dom_merge_test
    # $QUEUE and $RESUME_OPT are complete option strings and stay unquoted.
    $run->(sprintf "cat %s | shuffle.pl | sge_script.pl %s %s -i %s -o %s %s",
           $q->($to_check_file), $QUEUE, $RESUME_OPT,
           $q->($cluster), $q->($outdir), $q->($MERGE_TEST));

    # summarize
    open(my $check_fh, '<', $to_check_file)
        or die "merge: cannot read $to_check_file: $!\n";
    chomp(my @links = <$check_fh>);
    close($check_fh);

    open(my $summary_fh, '>', $summary_file)
        or die "merge: cannot write $summary_file: $!\n";
    for my $link (@links) {
        my $out_file = "$outdir/$link.out";
        # Only finished tests (regular, non-empty output files) are reported.
        next unless (-f $out_file && -s _);
        my $out_fh;
        if (! open($out_fh, '<', $out_file)) {
            warn "merge: cannot read $out_file: $!\n";
            next;
        }
        my $out = do { local $/; <$out_fh> };
        close($out_fh);
        print {$summary_fh} "$link\t$out";
    }
    # Buffered write errors only surface when the handle is closed.
    close($summary_fh)
        or die "merge: cannot write $summary_file: $!\n";

    # execute
    $run->(sprintf "cat %s | perl -lane '\$F[1]>=%s and \$F[7]>0 and print \$F[0]' > %s",
           $q->($summary_file), $THRESHOLD, $q->($merged_links));
    $run->(sprintf "cat %s | links_to_clustersets.pl -n | sort -k1,1nr > %s",
           $q->($merged_links), $q->($merged_sets));
    $run->(sprintf "cat %s | merge_stepwise -f %s > %s 2> %s",
           $q->($cluster), $q->($merged_sets), $q->($merge_file), $q->($merge_log));
    $run->(sprintf "cat %s | dom_renumber > %s",
           $q->($merge_file), $q->($renumber_file));

    # Next file name: bump a trailing number, or start at 2 when there is none.
    my $prefix = $cluster;
    my $n = 2;
    if ($cluster =~ /\A(.*\D)(\d+)\z/s) {
        ($prefix, $n) = ($1, $2);
        $n ++ ; # string increment: keeps any leading zeros of the number
    }
    my $next_cluster = "$prefix$n";

    # -f: overwrite when resuming.  The link is created beside the cluster
    # file and a relative target is resolved against the link's directory, so
    # the target is the bare file name.
    system('ln', '-fs', basename($renumber_file), $next_cluster) == 0
        or warn "merge: cannot link $next_cluster to $renumber_file\n";

    return ($next_cluster, $merged_sets);
}
