#!/usr/local/bin/perl
#
# Author: David Kulp (dk/ucsc)
#
# usage: split.pl <data_set_name> <set #>
#
# this simple perl script splits up a GENBANK data set in train/test sets.
# $ARGV[0] contains data set WITHOUT extension, e.g. "combined_GB"
# the corresponding ".dat" (or ".dat.gz") and ".sets" files must exist.
#
# $ARGV[1] contains the number of the set to be used as the test set.
#
# writes data out to "gene.test" and "gene.train"
#

use strict;
use warnings;

# Exactly two arguments are required: the data set base name (no extension)
# and the number of the partition to use as the test set.
if (@ARGV != 2) {
    die "Usage: $0 <data_set_name> <set #>";
}

my ($dataset, $setnum) = @ARGV;

# Names listed under "PART: <set #>" in "<data_set_name>.sets".
my %is_test = read_test_set("$dataset.sets", $setnum);

# Open the input before the outputs so that a missing data file does not
# truncate already existing gene.test / gene.train files.
my $data = open_data($dataset);

open(my $test,  '>', 'gene.test')  or die "Can't write gene.test: $!";
open(my $train, '>', 'gene.train') or die "Can't write gene.train: $!";

# Every line goes to exactly one output.  The destination is re-decided at
# each LOCUS line and kept for all following lines up to the next LOCUS
# line.  Lines before the first LOCUS line go to gene.train.
my $intest = 0;
while (my $line = <$data>) {
    if ($line =~ /^LOCUS/) {
        # \s+ / \S+ keep tabs and the line terminator out of the captured
        # name; an entry whose name cannot be parsed is treated as training
        # data.
        $intest = ($line =~ /^LOCUS\s+(\S+)/ && $is_test{$1}) ? 1 : 0;
    }
    my $out = $intest ? $test : $train;
    print {$out} $line or die "Write failed: $!";
}

# Closing a pipe handle reports a non-zero exit status of the child, so a
# failing gunzip (e.g. corrupt archive) is detected here instead of
# silently producing truncated output.
unless (close($data)) {
    die $!
        ? "Error closing data file, $dataset.dat[.gz]: $!"
        : "gunzip failed on $dataset.dat.gz (wait status $?)";
}

# Buffered write errors (disk full, ...) only surface at close time.
close($test)  or die "Error closing gene.test: $!";
close($train) or die "Error closing gene.train: $!";

# read_test_set($sets_file, $set_number)
#
# Reads the partition list and returns a hash (name => 1) of the entries
# belonging to partition $set_number.  A line containing
# "PART: <set_number>" starts the wanted partition, an empty line ends it;
# every line in between is taken as a name after removing all spaces.
# Dies if the file cannot be opened.
sub read_test_set {
    my ($sets_file, $wanted) = @_;

    open(my $list, '<', $sets_file)
        or die "Can't open list of data sets, $sets_file: $!";

    my %names;
    my $inset = 0;
    while (my $entry = <$list>) {
        # chomp (not chop): a final line without a trailing newline must
        # not lose its last character.
        chomp $entry;

        $inset = 0 if $entry eq '';
        if ($inset) {
            $entry =~ s/ //g;
            $names{$entry} = 1;
        }

        # \Q..\E matches the set number literally; (?!\d) stops set "1"
        # from also selecting "PART: 10", "PART: 11", ...
        $inset = 1 if $entry =~ /PART: \Q$wanted\E(?!\d)/;
    }
    close($list);

    return %names;
}

# open_data($base)
#
# Returns a read handle for "$base.dat" or, if that cannot be opened, for
# the decompressed contents of "$base.dat.gz".  Dies if neither is usable.
sub open_data {
    my ($base) = @_;
    my $plain   = "$base.dat";
    my $gzipped = "$base.dat.gz";

    my $fh;
    return $fh if open($fh, '<', $plain);

    # A shell pipe open succeeds as soon as the fork does, even when the
    # file is missing, so check for the file explicitly.  The list form
    # bypasses the shell, so odd characters in the name are harmless.
    if (-e $gzipped) {
        open($fh, '-|', 'gunzip', '-c', $gzipped)
            or die "Can't run gunzip on $gzipped: $!";
        return $fh;
    }

    die "Can't open data file, $base.dat[.gz]";
}
	

