#!/usr/bin/env perl
# One can run this program directly from the directory above the
# test_data directory within the distribution directory with this command:
# perl examples/simrank_nuc.pl --query test_data/query.fasta --data test_data/db.fasta
# System admins may want to modify and copy this file to /usr/local/bin so users can run it directly.
use strict;
use warnings FATAL => qw ( all );
use Getopt::Long;
use lib 'lib';
use String::Simrank;
# Program name shown in the help text: the last path component of $0.
my $prog_name = ( split m{/}, $0 )[-1];

# Author credit interpolated at the bottom of the help text.
my $signature = 'Niels Larsen';

# Hash reference that receives the parsed command line options further down.
my $cl_args;

# Help text; $prog_name and $signature are interpolated into it.
my $usage = qq (
Program $prog_name, May 2004, April 2002
This program quickly estimates the overall similarity between
a given set of DNA or RNA sequence(s) and a background set of
of homologues. It returns a sorted list of similarities as a
table. The similarity between sequences A and B are the number
of unique k-words (short subsequence) that they share, divided
by the smallest total k-word count in either A or B. The result
are scores that do not depend on sequence lengths. The program,
when run for the first time, builds a binary file for efficieny.
Command line arguments are (brackets mean optional and D means
default value),
--query path ( Query sequence(s), fasta format )
--data path ( Database sequence(s), fasta format )
[ --wordlen int ] ( D = 7; word length used )
[ --minlen int ] ( D = 50; minimum sequence length )
[ --minpct flt ] ( D = 50; minimum match percentage )
[ --outlen int ] ( D = 100; output length cutoff )
[ --outfile path ] ( D = false; output file )
[ --rebuild ] ( D = off; force making new binary )
[ --silent ] ( D = off; progress screen messages )
[ --reverse ] ( D = off; complements input sequence(s) )
[ --noids ] ( D = off; print numbers instead of ids )
Author: $signature
);

# Called without any arguments: show the help text on STDERR and stop.
if ( !@ARGV ) {
    print STDERR $usage and exit;
}
# >>>>>>>>>>>>>>>>>>>>> GET ARGUMENTS <<<<<<<<<<<<<<<<<<<<<<<<<
# Parse the command line into the $cl_args hash reference; each option is
# stored under the key of the same name. The options documented as "int" in
# the usage text are parsed with "=i" so that non-numeric values are rejected
# here, with a Getopt::Long diagnostic, instead of being passed on as strings.
# GetOptions returns false when it could not parse the command line; it has
# then already reported the problem, so just stop with a non-zero status to
# let calling shells and scripts see the failure.
if ( not GetOptions(
         "data=s"    => \$cl_args->{"data"},
         "query=s"   => \$cl_args->{"query"},
         "wordlen=i" => \$cl_args->{"wordlen"},
         "minlen=i"  => \$cl_args->{"minlen"},
         "minpct=f"  => \$cl_args->{"minpct"},
         "outlen=i"  => \$cl_args->{"outlen"},
         "outfile=s" => \$cl_args->{"outfile"},
         "rebuild!"  => \$cl_args->{"rebuild"},
         "silent!"   => \$cl_args->{"silent"},
         "reverse!"  => \$cl_args->{"reverse"},
         "noids"     => \$cl_args->{"noids"},
     ) )
{
    exit 1;
}
# >>>>>>>>>>>>>>>>>>>>>>>>>> MATCH <<<<<<<<<<<<<<<<<<<<<<<<<<<<
# Create the Simrank object for the database file given with --data.
my $sr = String::Simrank->new( { data => $cl_args->{data} } );

# Build the binary when --rebuild was given, or when the object does not
# flag one as ready.
if ( $cl_args->{"rebuild"} || !$sr->{binary_ready} ) {
    $sr->formatdb( {
        wordlen => $cl_args->{wordlen},
        # Must read the "minlen" key that GetOptions fills in. The previous
        # "minlength" key was never set, so --minlen was silently ignored.
        minlen  => $cl_args->{minlen},
        silent  => $cl_args->{silent},
    } );
}

# Match the query sequence(s) against the database, forwarding the output
# and filtering options from the command line unchanged.
$sr->match_oligos( {
    query   => $cl_args->{query},
    outlen  => $cl_args->{outlen},
    minpct  => $cl_args->{minpct},
    reverse => $cl_args->{reverse},
    outfile => $cl_args->{outfile},
    noids   => $cl_args->{noids},
    silent  => $cl_args->{silent},
} );

# Completion notice on STDERR, suppressed by --silent.
print STDERR "$0 Done\n" if not $cl_args->{"silent"};
# >>>>>>>>>>>>>>>>>>>> END OF MAIN PROGRAM <<<<<<<<<<<<<<<<<<<<
__END__