The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/env perl

use strict;
use warnings;
use 5.010;

use Pod::Usage;
use Getopt::Long qw(:config auto_help);
use File::Spec::Functions;
use File::Slurp;

# Command-line handling for the line sampler.
#
# Options (collected into %opt by Getopt::Long):
#   --header, -h    read and discard the first line of STDIN before sampling
#   --frac=<float>  threshold compared against rand() for every input line
#
# Exactly one positional argument is consumed; it must name an existing
# plain file, otherwise the usage message is shown and the script exits.
my %opt = ();
GetOptions( \%opt, 'header|h', 'frac=f' ) or pod2usage(2);

my $f = shift;

# Test definedness first: both -f on undef and interpolating undef into the
# message would emit "uninitialized value" warnings instead of a clean error.
pod2usage("missing file argument") unless ( defined $f );
pod2usage("$f is no file") unless ( -f $f );

# Without --frac the comparison rand() <= undef warns on every input line and
# selects (almost) nothing, so require the option up front.
pod2usage("--frac=<float> is required") unless ( defined $opt{frac} );

# Consume one line from STDIN so a header row never takes part in sampling.
<STDIN> if ( $opt{header} );

# Sampling threshold used by the read loop below.
my $frac = $opt{frac};


# Main loop: reads STDIN one line at a time into $_ and runs the whole body
# once per input line.
#
# NOTE(review): apart from the sampling statement at the top, the body uses
# @go, %train_set, %test_set, %pass, $base_dir and $i, none of which are
# declared in the visible source. Under "use strict" that is a compile error
# unless they are declared elsewhere - this looks like a fragment of a
# separate cross-validation script; confirm whether it belongs in this loop.
while(<STDIN>) {

  # Echo the current line to STDOUT when a fresh rand() draw is <= $frac,
  # i.e. each line is kept independently of the others.
      print
    if ( rand() <= $frac );

  # Collect, as hash keys, the tab-joined fields of every record in @go whose
  # first field is a key of %train_set; the hash also removes duplicates.
  my %result;
  for my $g (@go) {
    $result{ join( "\t", @$g ) } = 1 if ( exists( $train_set{ $g->[0] } ) );
  }

  # Abort when no record matched the training set.
  die "not enough data to sample training data set" unless ( keys %result > 0 );

  # Training output path: <$base_dir>/go.cv_train.<$i>.in
  my $train_file = catfile( $base_dir, "go.cv_train.$i.in" );

  open my $go_train_fh, '>', $train_file or die "Can't open filehandle: $!";
  # no chomp done, so print
  # (presumably the keys still end in a newline - TODO confirm against the
  # code that fills @go and %pass)
  for ( keys %result ) { print $go_train_fh $_; }
  # Also write every key of %pass that was not already written from %result.
  for ( keys %pass ) { print $go_train_fh $_ if ( !exists( $result{$_} ) ); }
  close $go_train_fh;

  # write test and train sets to file, too
  # Split listing path: <$base_dir>/go.cv_split.<$i>.lst
  my $cv_split_file = catfile( $base_dir, "go.cv_split.$i.lst" );

  # use say to print also newline
  # One line per key: "<key>\ttest" for %test_set, "<key>\ttrain" for %train_set.
  open my $cv_split_fh, '>', $cv_split_file or die "Can't open filehandle: $!";
  for ( keys %test_set ) { say $cv_split_fh join("\t", $_, 'test'); }
  for ( keys %train_set ) { say $cv_split_fh join("\t", $_, 'train'); }
  close $cv_split_fh;

  # Compress both outputs with "gzip -f". The list form of system() bypasses
  # the shell; the exit status of gzip is not checked.
  system( "gzip", "-f", $cv_split_file );

  system( "gzip", "-f", $train_file );
}