#! /usr/bin/perl -w # # wnDepths.pl version 2.05 # (Last updated $Id: wnDepths.pl,v 1.36 2008/05/30 23:12:44 sidz1979 Exp $) # # --------------------------------------------------------------------- # Use external packages use strict; use warnings; use Getopt::Long; use WordNet::QueryData; use WordNet::Tools; use File::Spec; # Get the command-line arguments our ($opt_wnpath, $opt_outfile, $opt_help, $opt_version); our ($opt_depthfile, $opt_wps, $opt_verbose); my $result = GetOptions("wnpath=s","outfile=s","depthfile=s","help","version","wps","verbose"); # Nothing on command-line unless($result) { showUsage(); exit(1); } # Help requested if($opt_help) { showHelp(); exit(0); } # Version information requested if($opt_version) { showVersion(); exit(0); } # Output to STDOUT undef($opt_outfile) if $opt_outfile and ($opt_outfile eq "-"); # Check output file if($opt_outfile) { open OUTFH, ">$opt_outfile" or die "Cannot open $opt_outfile: $!"; } else { *OUTFH = *STDOUT; } # Check if path to WordNet Data files has been provided ... If so ... save it. my $wnPCPath; my $wnUnixPath; if(defined $opt_wnpath) { $wnPCPath = $opt_wnpath; $wnUnixPath = $opt_wnpath; } elsif (defined $ENV{WNSEARCHDIR}) { $wnPCPath = $ENV{WNSEARCHDIR}; $wnUnixPath = $ENV{WNSEARCHDIR}; } elsif (defined $ENV{WNHOME}) { $wnPCPath = $ENV{WNHOME} . "\\dict"; $wnUnixPath = $ENV{WNHOME} . "/dict"; } else { $wnPCPath = "C:\\Program Files\\WordNet\\3.0\\dict"; $wnUnixPath = "/usr/local/WordNet-3.0/dict"; } # I think the actual OS name for most versions of Windows is 'MSWin32', # even for 64-bit Windows. See here for why: # http://www.perlmonks.org/index.pl?node_id=315372 my $wnpath = ($^O =~ /^MSWin/i) ? $wnPCPath : $wnUnixPath; print STDERR "Loading WordNet::QueryData... "; my $wn = WordNet::QueryData->new($wnpath); unless ($wn) { print STDERR ("failed.\n"); exit(1); } my $wntools = WordNet::Tools->new($wn); unless ($wntools) { print STDERR ("failed.\n"); exit(1); } print STDERR "done\n"; # Find top-level nodes for nouns my %top_level; my $datafile = File::Spec->catfile($wnpath, "data.noun"); $datafile = File::Spec->catfile($wnpath, "noun.dat") if ($^O =~ m/^MSWin/i); open FH, "$datafile" or die "Cannot open $datafile: $!"; my $line; while ($line = ) { next if substr($line, 0, 2) eq " "; next if $line =~ m/\@/; my ($offset) = split /\s+/, $line; $top_level{n}->{$offset} = -1; } close FH; if ($opt_verbose) { print "Offsets of top-level nouns\n",join(", ", keys(%{$top_level{n}})),"\n"; print "There are ", scalar(keys %{$top_level{n}}), " nouns\n"; } # Find top-level nodes for verbs $datafile = File::Spec->catfile($wnpath, "data.verb"); $datafile = File::Spec->catfile($wnpath, "verb.dat") if ($^O =~ /MSWin/i); open FH, "$datafile" or die "Cannot open $datafile: $!"; while($line = ) { next if substr($line, 0, 2) eq " "; next if $line =~ m/\@/; my ($offset) = split /\s+/, $line; $top_level{v}->{$offset} = -1; } if ($opt_verbose) { print "Offsets of top-level verbs\n", join(", ", keys(%{$top_level{v}})),"\n"; print "There are ", scalar(keys(%{$top_level{v}})), " top-level verbs.\n"; } # Determine WordNet version my $wnver = $wntools->hashCode(); print OUTFH "wnver::$wnver\n"; # Find the leaf nodes my $noun_leafs_ref = findLeafs('n'); if ($opt_verbose) { print "There are ", scalar(@{$noun_leafs_ref}), " noun leafs.\n"; } my $verb_leafs_ref = findLeafs('v'); if ($opt_verbose) { print "There are ", scalar(@{$verb_leafs_ref}), " verb leafs.\n"; } # Find the depth of every taxonomy my %wpsDepths; print STDERR "Finding depths of noun taxonomies... "; foreach my $offset (@{$noun_leafs_ref}) { my ($depth, $root_offset) = findDepth($offset, 'n'); $root_offset = sprintf("%08d", $root_offset); if (!defined($top_level{n}->{$root_offset}) || $top_level{n}->{$root_offset} < $depth) { $top_level{n}->{$root_offset} = $depth; } } print STDERR "done.\n"; # Generate depth-file my %depth; if ($opt_depthfile) { print STDERR "Writing depths to $opt_depthfile... "; open DFH, '>', $opt_depthfile or die "Cannot open $opt_depthfile: $!"; print DFH "wnver::$wnver\n"; my $noundepth = 0; my @keys = sort keys %wpsDepths; foreach my $wps (@keys) { my $depth = 100; for (@{$wpsDepths{$wps}}) { $depth = $_->[0] if $depth > $_->[0]; } $noundepth = $depth if $depth > $noundepth; } $noundepth = 2 * ($noundepth + 1) - 1; foreach my $key (@keys) { my %tmp; foreach (@{$wpsDepths{$key}}) { if ($opt_wps) { $tmp{"$_->[0]:$_->[1]"} = 1; } else { my $offset = sprintf("%08d", $wn->offset($_->[1])); $tmp{"$_->[0]:$offset"} = 1; } } my @depths = sort keys %tmp; my $offset = sprintf("%08d", $wn->offset($key)); my $str = $opt_wps ? "n $key " : "n $offset "; $str .= join(" ", @depths) . "\n"; $depth{n}->{$offset} = $str; } print STDERR "done.\n"; } print STDERR "Cleaning junk from memory... "; undef %wpsDepths; print STDERR "done.\n"; # Find the depth of every taxonomy print STDERR "Finding depths of verb taxonomies... "; foreach my $offset (@{$verb_leafs_ref}) { my ($depth, $root_offset) = findDepth($offset, 'v'); $root_offset = sprintf("%08d", $root_offset); if (!defined($top_level{v}->{$root_offset}) || $top_level{v}->{$root_offset} < $depth) { $top_level{v}->{$root_offset} = $depth; } } print STDERR "done.\n"; # Generate depth-file if ($opt_depthfile) { print STDERR "Writing depths to $opt_depthfile... "; my $verbdepth = 0; my @keys = sort keys %wpsDepths; foreach my $wps (@keys) { my $depth = 100; for (@{$wpsDepths{$wps}}) { $depth = $_->[0] if $depth > $_->[0]; } $verbdepth = $depth if $depth > $verbdepth; } $verbdepth = 2 * ($verbdepth + 1) - 1; foreach my $key (@keys) { my %tmp; foreach (@{$wpsDepths{$key}}) { if ($opt_wps) { $tmp{"$_->[0]:$_->[1]"} = 1; } else { my $offset = sprintf("%08d", $wn->offset($_->[1])); $tmp{"$_->[0]:$offset"} = 1; } } my @depths = sort keys %tmp; my $offset = sprintf("%08d", $wn->offset($key)); my $str = $opt_wps ? "v $key " : "v $offset "; $str .= join(" ", @depths) . "\n"; $depth{v}->{$offset} = $str; } for my $pos (qw/n v/) { foreach my $key (sort keys %{$depth{$pos}}) { print DFH $depth{$pos}->{$key}; } } print STDERR "done.\n"; close DFH; } print STDERR "Cleaning junk from memory... "; undef %wpsDepths; print STDERR "done.\n"; # Print output my $deepest_n = 0; while (my ($off, $depth) = each %{$top_level{n}}) { $deepest_n = $depth if ($depth > $deepest_n); if ($opt_wps) { my $wps = $wn->getSense($off, 'n'); print OUTFH "n $wps $depth\n"; } else { print OUTFH "n $off $depth\n"; } } $deepest_n++; print OUTFH "n ", ($opt_wps ? '*Root*#n#1' : '00000000'), " $deepest_n\n"; my $deepest_v = 0; while (my ($off, $depth) = each %{$top_level{v}}) { $deepest_v = $depth if ($depth > $deepest_v); if ($opt_wps) { my $wps = $wn->getSense($off, 'v'); print OUTFH "v $wps $depth\n"; } else { print OUTFH "v $off $depth\n"; } } $deepest_v++; print OUTFH "v ", ($opt_wps ? '*Root*#v#1' : '00000000'), " $deepest_v\n"; exit; ########## subroutines follow ########## sub findLeafs { my $pos = shift || die "No pos specifed"; my $file; if ($pos eq "n") { $file = File::Spec->catfile($wnpath, "data.noun"); $file = File::Spec->catfile($wnpath, "noun.dat") if $^O =~ /MSWin/i; } elsif ($pos eq "v") { $file = File::Spec->catfile($wnpath, "data.verb"); $file = File::Spec->catfile($wnpath, "verb.dat") if $^O =~ /MSWin/i; } else { die "Invalid pos: $pos"; } open WN, $file or die "Cannot open $file"; my @rtr = (); while (my $line = ) { next if index($line, " ") == 0; # Was: next if $line =~ m/~/; # Failed on tilde#n#1 because the gloss contains ~. # Fix provided by Ben Haskell (03/04/08). next if $line =~ m/~.*\|/; my ($offset) = split /\s+/, $line; push @rtr, $offset; } close WN or warn "Cannot close $file"; return \@rtr; } sub findWPSDepths { my $wps = shift; my $curPath = shift; defined $wpsDepths{$wps} and return @{$wpsDepths{$wps}}; my @hypernyms = $wn->querySense($wps, "hypes"); $curPath->{$wn->offset($wps)} = 1; unless (scalar @hypernyms > 0) { $wpsDepths{$wps} = [[1, $wps]]; } else { my @all_paths = (); foreach my $hype (@hypernyms) { unless(defined($curPath->{$wn->offset($hype)})) { my %pathCopy = %{$curPath}; push @all_paths, findWPSDepths($hype, \%pathCopy); } } @all_paths = map {[$_->[0] + 1, $_->[1]]} @all_paths; push(@all_paths, [1, $wps]) if(scalar(@all_paths) <= 0); $wpsDepths{$wps} = \@all_paths; } return @{$wpsDepths{$wps}}; } sub findDepth { my ($offset, $pos) = @_; ($offset and $pos) or die "Internal error: bad input to findDepth($offset, $pos)"; my ($wps) = $wn->getSense($offset, $pos); $wps or die "Internal error: bad offset $offset"; #my ($depth, $root) = findWPSDepths ($wps); my @paths = findWPSDepths($wps, {}); my $mindepth = 1_000; my $root; foreach my $path (@paths) { if ($path->[0] < $mindepth) { $mindepth = $path->[0]; $root = $path->[1]; } } my $root_offset; eval {$root_offset = $wn->offset($root)}; if ($@) { die "$@ \t $root (depth $mindepth) has no offset, baseoffset is $offset"; } return ($mindepth, $wn->offset($root)); } sub showUsage { print "Usage: wnDepths.pl [[--wnpath=PATH] [--outfile=FILE] [--depthfile=FILE] [--wps] [--verbose]]\n"; print " | --help | --version]\n"; } sub showHelp { showUsage(); print "Options:\n"; print "\t--wnpath=PATH PATH is the path to WordNet. The default is\n"; print "\t /usr/local/WordNet-3.0/dict on Unix and\n"; print "\t C:\\WordNet\\3.0\\dict on Windows\n"; print "\t--outfile=FILE File to which the maximum depths of the taxon-\n"; print "\t omies should be output.\n"; print "\t--depthfile=FILE File to which the depth of every synset should\n"; print "\t be output\n"; print "\t--wps output is in 'word#part_of_speech#sense format\n"; print "\t instead of offset format\n"; print "\t--verbose be verbose\n"; print "\t--help show this help message\n"; print "\t--version show version information\n"; } sub showVersion { print "wnDepths.pl version 2.05\n"; print "Copyright (c) 2005-2008, Ted Pedersen, Jason Michelizzi and Siddharth Patwardhan\n\n"; print "This program comes with ABSOLUTELY NO WARRANTY. This program\n"; print "is free software, and you are welcome to redistribute it under\n"; print "certain conditions. See the file GPL.txt for warranty and\n"; print "copyright information.\n"; } 1; __END__ =head1 NAME wnDepths.pl - Find the depths of WordNet taxonomies =head1 SYNOPSIS wnDepths.pl [[--wnpath=PATH] [--outfile=FILE|-] [--depthfile=FILE] [--wps] [--verbose]] | --help | --version] =head1 DESCRIPTION B finds the depths of WordNet's noun and verb taxonomies; it can also find the depth of each synset in WordNet. This program was originally written for use with the WordNet::Similarity::lch and WordNet::Similarity::wup measures of semantic similarity, but it likely has other uses as well. =head1 OPTIONS B<--wnpath>=I The path to WordNet data files. The default is /usr/local/WordNet-3.0/dict on Unix and C:\WordNet\3.0\dict on Windows. B<--outfile>=I The file to which the maximum depths of the taxomomies should be output. If this option is not given, or if the file name is I<->, then output will be sent to the standard output. B<--depthfile>=I The file to which the depth of every noun and verb synset should be sent. B<--wps> The names of synsets in the output as "word#part_of_speech#sense" strings rather than as offsets. B<--verbose> Be verbose. B<--help> Show detailed help message. B<--version> Show version information. =head1 BUGS Report to WordNet::Similarity mailing list : L =head1 SEE ALSO L WordNet home page : L WordNet::Similarity home page : L =head1 AUTHORS Ted Pedersen, University of Minnesota Duluth tpederse at d.umn.edu Jason Michelizzi Siddharth Patwardhan, University of Utah, Salt Lake City sidd at cs.utah.edu =head1 COPYRIGHT Copyright (c) 2005-2008, Ted Pedersen, Jason Michelizzi and Siddharth Patwardhan This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to The Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. Note: a copy of the GNU General Public License is available on the web at L and is included in this distribution as GPL.txt. =cut