#!/usr/nikola/bin/perl -w

# Given a Penn treebank file (named on the command line or supplied on
# STDIN), one tree per line, print only the words of each utterance,
# one utterance per output line.  See the POD after __END__ for usage.

use warnings;
use strict;

use Getopt::Long;
use Pod::Usage;

use Lingua::Treebank::Const;

my $man    = 0;
my $help   = 0;
my $sgml   = 1;    # wrap each utterance in <s> ... </s> (on by default)
my $parens = 0;    # wrap each utterance in ( ... )      (off by default)

## Parse options and print usage if there is a syntax error,
## or if usage was explicitly requested.
GetOptions(
    'help|?'  => \$help,
    man       => \$man,
    'sgml!'   => \$sgml,
    'parens!' => \$parens,
) or pod2usage(2);

pod2usage(1) if $help;
pod2usage(-verbose => 2) if $man;

## If no arguments were given, then allow STDIN to be used only
## if it's not connected to a terminal (otherwise print usage).
pod2usage("$0: No files given.") if ((@ARGV == 0) && (-t STDIN));

LINE:
while (my $line = <>) {

    # Leave blank-ish lines alone: echo them unchanged so the output
    # stays aligned with the input.
    if ($line =~ /^\s*$/) {
        print $line;
        next LINE;
    }

    # Parse the tree.  The trailing "1" lets us detect failure from the
    # return value of eval rather than relying on $@ alone.
    my $utt;
    my $parsed_ok = eval {
        $utt = Lingua::Treebank::Const->new()->from_penn_string($line);
        1;
    };
    if (not $parsed_ok) {
        die "line $.: $@\n";
    }

    # A parse that yields no tree is reported and skipped; calling
    # methods on the undefined value would abort the whole run.
    if (not defined $utt) {
        warn "utterance doesn't parse at line $.\n";
        next LINE;
    }

    print "("   if $parens;
    print "<s>" if $sgml;

    # Every word is preceded by a single space, including the first one.
    foreach my $terminal ($utt->get_all_terminals()) {
        print " ", $terminal->word();
    }

    print " </s>" if $sgml;
    print " )"    if $parens;
    print "\n";
}

__END__

=head1 NAME

get_words - given collapsed treebank, print words only

=head1 SYNOPSIS

get_words [options] [file[s] or STDIN]

 Options:
   -help            brief help message
   -man             full documentation
   -sgml            put <s> and </s> tokens around words
   -nosgml
   -parens          put ( and ) tokens around words
   -noparens

=head1 OPTIONS

=over 8

=item B<-help>

Prints a brief help message and exits.

=item B<-man>

Prints the manual page and exits.

=item B<-sgml>

=item B<-nosgml>

Writes E<lt>sE<gt> at the beginning of each line and E<lt>/sE<gt> at
the end of each line, or (in the case of C<-nosgml>) doesn't. Default
is C<-sgml>.

=item B<-parens>

=item B<-noparens>

Writes C<(> at the beginning of each line and C<)> at the end of each
line, or (in the case of C<-noparens>) doesn't. Default is
C<-noparens>.

=back

=head1 DESCRIPTION

Reads input files (or STDIN) for Penn-style trees, one per line, and
prints out only the words, one tree per line. Blank lines are passed
through unchanged. A line that yields no tree is reported on STDERR
and skipped.

Providing the C<-sgml> tag makes the output pseudo-SGML by including
angle-bracketed C<E<lt>sE<gt>> and C<E<lt>/sE<gt>> tokens at the
beginning and end of each line.

=cut