#!/usr/bin/perl =pod =head1 NAME umls-senserelate-evaluation.pl - This program evaluates the umls-senserelate algorithm using the semeval scorer program =head1 SYNOPSIS This program evaluates the senses assigned by the UMLS-SenseRelate algorithm using the semeval scorer program =head1 USAGE Usage: umls-senserelate-evaluation.pl LOG_DIRECTORY =head2 OUTPUT =head2 Required Options =head3 LOG_DIRECTORY This is the directory outputted by the umls-targetword-senserelate.pl program which contains the key and answer files for each of the target words. =head2 General Options: =head3 --senses DIR|File Sometimes of the senses in the key file do not map directly to the senses in the answer file. This will happen when using the NLM-WSD dataset which uses tags to indicate the sense of the ambiguous word and then those tags are mapped to concepts in the UMLS. The key file would contain the CUIs while the answer file contains the tags. Therefore, we need the sense (sometimes called choice) files to evaluate the mappings. So this options takes the directory that contains the the sense file for each target word you are going to disambiguate or just the file itself. The files for the target word contains the possible senses of the target word. This may be temporary but right now this is who I have it because often times the possible senses change depending on the version of the UMLS that you are using. I felt this allowed the most flexibility with it. The naming convention for this is a file called: .choices The format for this file is: ||semantic type|CUI This format is based on the choice files in the NLM-WSD dataset which we use for our experiments. If you are using the NLM-WSD dataset you can download these choice files from NLM's site. There are the 1999 tagset and the 2007 tagset available. You can find them here: http://wsd.nlm.nih.gov/collaboration.shtml =head3 --verbose Prints out accuracy information to STDOUT. =head3 --version Displays the version information. =head3 --help Displays the help information =head1 SYSTEM REQUIREMENTS =over =item * Perl (version 5.8.5 or better) - http://www.perl.org =back =head1 CONTACT US If you have any trouble installing and using UMLS-Similarity, please contact us via the users mailing list : umls-similarity@yahoogroups.com You can join this group by going to: http://tech.groups.yahoo.com/group/umls-similarity/ You may also contact us directly if you prefer : Bridget T. McInnes: bthomson at umn.edu Ted Pedersen : tpederse at d.umn.edu =head1 AUTHOR Bridget T. McInnes, University of Minnesota =head1 COPYRIGHT Copyright (c) 2010-2012, Bridget T. McInnes, University of Minnesota Twin Cities bthomson at umn.edu Ted Pedersen, University of Minnesota Duluth tpederse at d.umn.edu Serguei Pakhomov, University of Minnesota Twin Cities pakh0002 at umn.edu Ying Liu, University of Minnesota Twin Cities liux0395 at umn.edu This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to: The Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. =cut ############################################################################### # THE CODE STARTS HERE ############################################################################### # ================================ # COMMAND LINE OPTIONS AND USAGE # ================================ use UMLS::Interface; use UMLS::SenseRelate::TargetWord; use Getopt::Long; eval(GetOptions( "version", "verbose", "help", "senses=s", "debug")) or die ("Please check the above mentioned option(s).\n"); my $debug = 0; # if help is defined, print out help if( defined $opt_help ) { $opt_help = 1; &showHelp(); exit; } # if version is requested, show version if( defined $opt_version ) { $opt_version = 1; &showVersion(); exit; } # turn on debug if defined if( defined $opt_debug ) { $debug = 1; } # At least 1 terms should be given on the command line. if(scalar(@ARGV) < 1) { print STDERR "The umls-targetword-senserelate log directory must be given on the command line.\n"; &minimalUsageNotes(); exit; } my $log = shift; my %answerhash = (); my %keyhash = (); &getInputFiles($log); &setSenses(); &evaluateUsingScorer(); # evaluates the files in the log directory using the scorer program sub evaluateUsingScorer { if($debug) { print STDERR "In evaluateUsingScorer\n"; } my $total_precision = 0; my $total_tw = 0; foreach my $tw (sort keys %answerhash) { if($tw eq "sd") { next; } if($tw eq "or") { next; } my $answerfile = $answerhash{$tw}; my $keyfile = $keyhash{$tw}; # check that the key file exists if(! (defined $keyfile)) { print STDERR "ERROR: The key does not exist for $tw.\n"; &minimalUsageNotes(); exit; } my $evaluationfile = "$log/$tw.evaluation"; system "score.pl $log/$answerfile $log/$keyfile > $evaluationfile"; open(EVAL, $evaluationfile) || die "Could not open $evaluationfile\n"; my $precision = ""; while() { if($_=~/precision\:\s+([0-9\.]+) \(/) { $precision = $1; } } close EVAL; if(defined $opt_verbose) { print STDERR "$tw\t$precision\n"; } $total_precision += $precision; $total_tw++; } my $overall = $total_precision / $total_tw; print STDERR "Evaluation files are located in the input ($log) directory\n"; print STDERR "The Overall Precision for this dataset is: $overall\n"; } # gets the answer and key files from the log directory sub getInputFiles { if($debug) { print STDERR "In getInputFiles\n"; } my $dir = shift; # open the log directory opendir(DIR, $dir) || die "Could not open the log ($dir) directory\n"; my @files= grep { $_ ne '.' and $_ ne '..' } readdir DIR; # get each of the answer and key files in the directory foreach my $file (@files) { if($file=~/(.*?).answers/) { $answerhash{$1} = $file; } if($file=~/(.*?).key/) { $keyhash{$1} = $file; } } } # get the sense information from the choice files if the --sense option is defined sub setSenses { if(! (defined $opt_senses)) { return; } if($debug) { print STDERR "In setSenses\n"; } my %files = (); if(-d $opt_senses) { opendir(DIR, $opt_senses) || die "Could not open $opt_senses directory\n"; my @dirs = grep { $_ ne '.' and $_ ne '..' and $_ ne "CVS" and $_ ne "raw_summary" and $_ ne "index.shtml"} readdir DIR; foreach my $file (@dirs) { $file=~/(.*?)\.choices/; my $tw = $1; $files{$tw} = "$opt_senses/$file"; } } else { my @array = split/\//, $opt_senses; my $file = $array[$#array]; $array[$#array]=~/(.*?)\.choices/; my $tw = $1; $files{$tw} = $opt_senses; } foreach my $tw (sort keys %files) { open(FILE, $files{$tw}) || die "Could not open --sense $file\n"; my %sensehsah = (); while() { chomp; my($tag, $concept, $semantics, $cui) = split/\|/; $sensehash{$tag} = $cui; }close FILE; my $keyfile = $keyhash{$tw}; my $nkeyfile = $keyfile . ".sense"; open(KEY, "$log/$keyfile") || die "Could not open key $log/$keyfile\n"; open(NKEY, ">$log/$nkeyfile") || die "Could not open key $log/$nkeyfile\n"; while() { chomp; $_=~/(M[0-9]+)$/; my $tag = $1; my $cui = $sensehash{$tag}; $_=~s/\%$tag/\%$cui/g; print NKEY "$_\n"; } close KEY; close NKEY; $keyhash{$tw} = $nkeyfile; } } ############################################################################## # function to output minimal usage notes ############################################################################## sub minimalUsageNotes { print "Usage: umls-senserelate-evaluation.pl [OPTIONS] LOG_DIRECTORY\n"; &askHelp(); exit; } ############################################################################## # function to output help messages for this program ############################################################################## sub showHelp() { print "This is a utility \n"; print "Usage: umls-senserelate-evaluation.pl [OPTIONS] LOG_DIRECTORY\n\n"; print "\n\nGeneral Options:\n\n"; print "--senses FILE|DIR File or directory containing the sense files\n\n"; print "--verbose Prints accuracy information to STDOUT\n\n"; print "--version Prints the version number\n\n"; print "--help Prints this help message.\n\n"; } ############################################################################## # function to output the version number ############################################################################## sub showVersion { print '$Id: umls-senserelate-evaluation.pl,v 1.5 2012/04/13 22:09:37 btmcinnes Exp $'; print "\nCopyright (c) 2010-2012, Ted Pedersen & Bridget McInnes\n"; } ############################################################################## # function to output "ask for help" message when user's goofed ############################################################################## sub askHelp { print STDERR "Type umls-senserelate-evaluation.pl --help for help.\n"; }