#!/usr/local/bin/perl
#
# Author: David Kulp (dk/ucsc)
#
# usage: split.pl <dataset> <setnum>
#
# This simple perl script splits up a GENBANK data set into train/test sets.
#
#   <dataset>  ($ARGV[0]) name of the data set WITHOUT extension, e.g.
#              "combined_GB".  The corresponding ".dat" (or ".dat.gz") and
#              ".sets" files must exist.
#
#   <setnum>   ($ARGV[1]) number of the set to be used as the test set.
#
# Writes data out to "gene.test" and "gene.train" in the current directory.
#

use strict;
use warnings;

if (@ARGV != 2) {
    die "Usage: $0 <dataset> <setnum>\n";
}

my ($dataset, $setnum) = @ARGV;

my $sets_path = "$dataset.sets";
my $dat_path  = "$dataset.dat";
my $gz_path   = "$dataset.dat.gz";

# ---------------------------------------------------------------------------
# Pass 1: collect the entry names listed under "PART: <setnum>" in the .sets
# file.  A section starts on the line after its PART header and ends at the
# next blank line.
# ---------------------------------------------------------------------------
open(my $list_fh, '<', $sets_path)
    or die "Can't open list of data sets, $sets_path: $!\n";

my %in_test_set;
my $inset = 0;
while (my $line = <$list_fh>) {
    # chomp (not chop): a final line without a trailing newline must not
    # lose its last character.  Trailing whitespace (including a CR from
    # DOS-style files) is dropped so blank-line detection stays reliable.
    chomp $line;
    $line =~ s/\s+\z//;

    if ($line eq '') {
        $inset = 0;
        next;
    }

    if ($inset) {
        # Entry names are stored with all embedded whitespace removed.
        (my $name = $line) =~ s/\s+//g;
        $in_test_set{$name} = 1;
    }

    # \Q...\E keeps the command-line argument from being read as a regex,
    # and (?!\d) stops set "1" from also matching "PART: 10", "PART: 11", ...
    if ($line =~ /PART:\s*\Q$setnum\E(?!\d)/) {
        $inset = 1;
    }
}
close($list_fh);

# ---------------------------------------------------------------------------
# Open the data file before creating any output, so that a missing input
# does not leave freshly truncated gene.test / gene.train files behind.
# The plain ".dat" file is preferred; ".dat.gz" is read through gunzip.
# ---------------------------------------------------------------------------
my $data_fh;
my $from_gzip = 0;
if (!open($data_fh, '<', $dat_path)) {
    # A piped open succeeds even if the file is absent (the failure would
    # only show up inside gunzip), so check for the file explicitly.  The
    # list form of open avoids passing the file name through a shell.
    if (!(-e $gz_path && open($data_fh, '-|', 'gunzip', '-c', $gz_path))) {
        die "Can't open data file, ${dataset}.dat[.gz]: $!\n";
    }
    $from_gzip = 1;
}

open(my $test_fh, '>', 'gene.test')
    or die "Can't write gene.test: $!\n";
open(my $train_fh, '>', 'gene.train')
    or die "Can't write gene.train: $!\n";

# ---------------------------------------------------------------------------
# Pass 2: route every record to the test or the training file.  A record
# starts at a LOCUS line and runs until the next one; anything before the
# first LOCUS line goes to the training file.
# ---------------------------------------------------------------------------
my $intest = 0;
while (my $line = <$data_fh>) {
    if ($line =~ /^LOCUS/) {
        # GenBank pads the LOCUS keyword with several spaces, so accept any
        # run of whitespace before the entry name.
        my ($locus) = $line =~ /^LOCUS\s+(\S+)/;
        $intest = (defined $locus && $in_test_set{$locus}) ? 1 : 0;
    }

    if ($intest) {
        print {$test_fh} $line or die "Error writing gene.test: $!\n";
    } else {
        print {$train_fh} $line or die "Error writing gene.train: $!\n";
    }
}

# Closing the pipe is where a failing gunzip reports its exit status.
if (!close($data_fh) && $from_gzip) {
    die "gunzip failed on $gz_path (exit status " . ($? >> 8) . ")\n";
}

# Buffered write errors (e.g. a full disk) only surface at close time.
close($test_fh)  or die "Error closing gene.test: $!\n";
close($train_fh) or die "Error closing gene.train: $!\n";