#!/usr/nikola/bin/perl -w
# Given Penn-style trees, one tree per line, in the named files or on
# STDIN, print only the words of each utterance.
use warnings;
use strict;
use Getopt::Long;
use Pod::Usage;
# Command-line switches and their defaults: help and man page are off,
# SGML-style tokens are on, parenthesis tokens are off.
my ($man, $help)    = (0, 0);
my ($sgml, $parens) = (1, 0);

# Getopt::Long specification, mapping each flag to the variable that
# receives its value.  A trailing "!" makes the flag negatable
# (-nosgml, -noparens).
my @option_spec = (
    'help|?'  => \$help,
    'man'     => \$man,
    'sgml!'   => \$sgml,
    'parens!' => \$parens,
);

# A command line that fails to parse gets the usage message.
if (not GetOptions(@option_spec)) {
    pod2usage(2);
}

# Explicit requests for documentation.
if ($help) {
    pod2usage(1);
}
if ($man) {
    pod2usage(-verbose => 2);
}

# With no file arguments, input has to arrive on STDIN through a pipe or
# redirect; if STDIN is a terminal, show the usage message instead.
if (@ARGV == 0 and -t STDIN) {
    pod2usage("$0: No files given.");
}
use Lingua::Treebank::Const;
# Convert each Penn-style tree (one per input line) into one output line
# holding only the tree's terminal words.  Depending on the options, the
# words are wrapped in <s> ... </s> tokens (-sgml) and/or ( ... ) tokens
# (-parens).
while (my $line = <>) {
    # Echo blank-ish lines untouched so output stays line-aligned with input.
    if ($line =~ /^\s*$/) {
        print $line;
        next;
    }

    # A parser exception is fatal; re-throw it tagged with the input line
    # number so the offending tree can be located.
    my $utt = eval {
        Lingua::Treebank::Const->new()->from_penn_string($line);
    };
    if ($@) {
        die "line $.: $@\n";
    }

    # A tree that yields no object cannot be walked.  Warn, emit an empty
    # line so later output lines still correspond to their input lines,
    # and move on instead of calling a method on undef.
    if (not defined $utt) {
        warn "utterance doesn't parse at line $.\n";
        print "\n";
        next;
    }

    # Opening tokens: "(" for -parens, then "<s>" for -sgml.
    print "(" if $parens;
    print "<s>" if $sgml;

    # Every word is preceded by a single space.
    foreach my $terminal ( $utt->get_all_terminals() ) {
        print " ", $terminal->word();
    }

    # Closing tokens mirror the opening ones, in reverse order.
    print " </s>" if $sgml;
    print " )" if $parens;
    print "\n";
}
__END__
=head1 NAME
get_words - given collapsed treebank, print words only
=head1 SYNOPSIS
get_words [options] [file[s] or STDIN]
Options:
-help brief help message
-man full documentation
-sgml put <s> and </s> tokens around words
-nosgml
-parens put ( and ) tokens around words
-noparens
=head1 OPTIONS
=over 8
=item B<-help>
Prints a brief help message and exits.
=item B<-man>
Prints the manual page and exits.
=item B<-sgml>
=item B<-nosgml>
Writes C<E<lt>sE<gt>> at the beginning of each line and C<E<lt>/sE<gt>> at
the end of each line, or (in the case of C<-nosgml>) doesn't.
Default is C<-sgml>.
=item B<-parens>
=item B<-noparens>
Writes C<(> at the beginning of each line and C<)> at
the end of each line, or (in the case of C<-noparens>) doesn't.
Default is C<-noparens>.
=back
=head1 DESCRIPTION
Reads input files (or STDIN) for Penn-style trees, one per line, and
prints out only the words, one tree per line.
Providing the C<-sgml> option makes the output pseudo-SGML by including
angle-bracketed C<E<lt>sE<gt>> and C<E<lt>/sE<gt>> tokens at the beginning and end of each
line.
=cut