#!/usr/bin/perl -s

##############################################################################
#use strict;
use lib "$ENV{'CGAT_HOME'}/perllib";
use File::Basename;
use File::Path;
use CmdProc;
use GenomeHomSearch::Blast;
use CGAT_build;
require "CGAT_Conf.pl";
use GenomeFeature;
use Sequence;

###############################################################################
package build_RepBase;
use base CGAT_buildAnnot;
##############################################################################

##############################################################################
# Options
##############################################################################
## E-value cutoff; interpolated into the blastn option string below.
$EVALUE = 1e-15;
## Value passed as the QueryLen option (presumably the length of each
## query chunk -- confirm against GenomeHomSearch::Blast).
$QueryLen = 1000000;
## Disabled alternative, kept for reference:
#$NOPT = "-N " . $QueryLen * 2;

## Options handed to GenomeHomSearch::Blast->new in execute_main.
## Disabled alternative for progopt, kept for reference:
#	progopt => "-m 10 -H -E $EVALUE",
%options = (
	'program'      => 'blastn',
	'progopt'      => '-e ' . $EVALUE,
	'QueryLen'     => $QueryLen,
	'QueryOverlap' => 1000,
	'QueryNum'     => 200,
	'SkipPostProc' => 1,
);

##
## RepBase database file
##
## The following is a human repetitive sequence file (humrep.ref) from Repbase (http://www.girinst.org/repbase/).
## First, create the 'database/seqdb' directory and put the sequence file into it.
## ($main::DIR_database is presumably set by CGAT_Conf.pl -- confirm.)
##
$REPDB = "$main::DIR_database/seqdb/humrep.ref";

##############################################################################
##
## checkREPDB: verify that the repeat database ($REPDB) is readable and
## normalize its header lines.
##
## If any line containing '>' also contains a TAB, the file is rewritten with
## the TABs on those lines replaced by SPACEs (to avoid a format problem).
## The untouched file is preserved as "$REPDB.ORIG" unless such a backup
## already exists.
##
## Returns -1 when $REPDB cannot be opened, and a true value (1) otherwise.
## Dies with a message when the rewritten copy cannot be written or the files
## cannot be renamed, so that a truncated copy never replaces the database.
##
sub checkREPDB {
	my $tmpfile  = "$REPDB.TMP";
	my $origfile = "$REPDB.ORIG";

	open(my $db, '<', $REPDB) or return -1;

	## first pass: is there any header line that contains a TAB?
	my $found_tab = 0;
	while (my $line = <$db>) {
		if ($line =~ />.*\t/) {
			$found_tab = 1;
			last;
		}
	}
	if (! $found_tab) {
		close($db);
		return 1;
	}

	## second pass: convert TAB to SPACE to avoid a format problem
	seek($db, 0, 0) or die "Can't rewind $REPDB: $!\n";
	open(my $out, '>', $tmpfile) or die "Can't create $tmpfile: $!\n";
	while (my $line = <$db>) {
		if ($line =~ />.*\t/) {
			$line =~ s/\t/ /g;
		}
		print {$out} $line or die "Can't write to $tmpfile: $!\n";
	}
	## buffered write errors surface at close; never install a short file
	close($out) or die "Can't write to $tmpfile: $!\n";
	## release the input before renaming it (required on some platforms)
	close($db);

	if (! -f $origfile) {
		rename($REPDB, $origfile)
			or die "Can't rename $REPDB to $origfile: $!\n";
	}
	rename($tmpfile, $REPDB)
		or die "Can't rename $tmpfile to $REPDB: $!\n";
	return 1;
}
##
## readREPDB: fill the package hash %Fam (repeat name => family) from the
## header lines of the repeat database ($REPDB).
##
## A header is expected to look like ">name family species ...", with the
## fields separated by whitespace.  Headers that carry fewer than two fields
## are skipped.  The separator must be at least one whitespace character;
## allowing an empty separator would let the regex engine split a single
## token into a bogus name/family pair.
##
## Dies when $REPDB cannot be opened.
##
sub readREPDB {
	open(my $db, '<', $REPDB)
		or die "Can't open repeat database $REPDB: $!\n";
	while (my $line = <$db>) {
		if ($line =~ /^>(\S+)\s+(\S+)/) {
			my ($name, $fam) = ($1, $2);
			$Fam{$name} = $fam;
		}
	}
	close($db);
}
##
## execute_main: search the unmasked genome sequence of $this->{sp} against
## the repeat database with GenomeHomSearch::Blast, then hand the hits to
## outputRepBase.  Returns nothing.
##
sub execute_main {
	my $this = shift;
	my @args = @_;	## accepted but not used here

	## list context: keep only the first value returned
	my ($query_path) =
		&CGAT_Data::getGenomeSeqPath($this->{sp}, 'work', 'no_mask');

	$this->{cmdproc}->{filebase} =
		$main::DIR_work . '/fasta.' . $this->{sp} . '-repdb';

	## complete the shared option table before building the search object
	@options{ 'database', 'query' } = ($REPDB, $query_path);

	my $searcher = GenomeHomSearch::Blast->new(%options);
	$this->{homsrch} = $searcher;
	$searcher->execute;

	## the result is kept in the package variable, as before
	$gfeat = $this->outputRepBase;

	return;
}
##
## outputRepBase: turn the alignments held by $this->{homsrch} into a
## GenomeFeature table for $this->{sp}, one segment per alignment, and
## write it out.
##
## Each segment is added with the alignment's from1, to1 and dir, the
## repeat family looked up in %Fam, and the repeat name in the extra
## "name" field.  The family is undef when the repeat name has no entry
## in %Fam.
##
## Returns the GenomeFeature object.
##
sub outputRepBase {
	my($this) = @_;
	my $gfeat = GenomeFeature->new($this->{sp});
	my $aliList = $this->{homsrch}->get_alignments;
	$gfeat->add_fields("name");

	## Load the name => family table on first use.  The test used to be
	## inverted (it loaded the table only when it was already filled), so
	## the table was never read and every family came out undefined.
	if (! %Fam) {
		readREPDB();
	}

	foreach my $ali ($aliList->list) {
		my $repname = $ali->name2;
		my $fam = $Fam{$repname};
		$gfeat->addSegment($ali->from1, $ali->to1, $ali->dir, $fam,
			name => $repname);
	}
	$gfeat->write_table;
	return $gfeat;
}

##############################################################################
## Script entry point: runs only when this file is the program being
## executed.  Needs at least one argument and a readable repeat database.
if (__FILE__ eq $0) {
	die "usage :: $0 species\n"
		if scalar(@ARGV) < 1;
	die "Repeat database is not found: $REPDB\n"
		if &checkREPDB() < 0;

	$cgat_build = build_RepBase->new(\@ARGV, { no_mask => 1 });
	$cgat_build->execute;
	exit;
}

##############################################################################
1;#
##############################################################################    
