The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/env perl
#-*-perl-*-

=encoding utf-8

=head1 NAME

pdf2xml - extract text from PDF files and wraps it in XML

=head1 SYNOPSIS

 pdf2xml [OPTIONS] pdf-file > output.xml

=head1 OPTIONS

 -c ............. split strings into character sequences before finding words
 -h ............. skip de-hypenation (keep hyphenated words)
 -H ............. max heap size for Java VM
 -J path ........ path to Java
 -l lexicon ..... provide a list of words or a text in the target language
 -L ............. skip lowercasing (which is switched on by default)
 -m ............. skip merging character sequences (not recommended)
 -M ............. skip paragraph mergong heuristics
 -r ............. skip 'pdftotext -raw'
 -x ............. skip standard 'pdftotext'
 -X ............. use pdfXtk to convert to XHTML (default)
 -T ............. use Apache Tika for the basic conversion
 -v ............. verbose output

=head1 DESCRIPTION

pdf2xml tries to combine the output of several conversion tools in order to improve the extraction of text from PDF documents. Currently, it uses pdftotext, Apache Tika and pdfxtk. In the default mode, it calls all tools to extract text and pdfxtk is used to create the basic XML file that will be used to produce the final output. Several post-processing heuristics are implemented to split and merge character sequences in order to cleanup the text. Consider the example given below:

  raw:    <p>PRESENTATION ET R A P P E L DES PRINCIPAUX RESULTATS 9</p>
  clean:  <p>PRESENTATION ET RAPPEL DES PRINCIPAUX RESULTATS 9</p>

  raw:    <p>2. Les c r i t è r e s de choix : la c o n s o m m a t i o n 
             de c o m b u s - t ib les et l e u r moda l i t é 
             d ' u t i l i s a t i on d 'une p a r t , 
             la concen t r a t ion d ' a u t r e p a r t 16</p>

  clean:  <p>2. Les critères de choix : la consommation 
             de combustibles et leur modalité 
             d'utilisation d'une part, 
             la concentration d'autre part 16</p>

=head1 TODO

This is quite slow and loading Apache Tika for each conversion is not very efficient. Using the server mode of Apache Tika would be a solution.

Character merging heuristics are very simple. Using the longest string forming a valid word from the vocabulary may lead to many incorrect words in context for some languages. Also, the implementation of the merging procedure is probably not the most efficient one.

De-hyphenation heuristics could also be improved. The problem is to keep it as language-independent as possible.

=head1 SEE ALSO

Apache Tika: L<http://tika.apache.org>

The Poppler Developers - L<http://poppler.freedesktop.org>

pdfXtk L<http://sourceforge.net/projects/pdfxtk/>


=head1 COPYRIGHT AND LICENSE

Copyright (C) 2013 by Joerg Tiedemann

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.8 or,
at your option, any later version of Perl 5 you may have available.

=cut

use strict;

use FindBin qw/$Bin/;
use IPC::Open2;
use XML::Parser;
use XML::Writer;
use File::Temp qw /tempfile/;


use vars qw($opt_c $opt_h $opt_H $opt_J $opt_L $opt_l $opt_m $opt_r 
            $opt_T $opt_x $opt_v $opt_X $opt_M);
use Getopt::Std;
getopts('chH:J:Ll:mrTxXv');

# home of shared data (where Apache Tika should be)

my $SHARED_HOME;
eval{ 
    require File::ShareDir; 
    $SHARED_HOME = File::ShareDir::dist_dir('Text-PDF2XML'); 
};
unless (-d $SHARED_HOME){
    $SHARED_HOME = $Bin.'/share';
}


my $JAVA           = $opt_J || 'java';
my $JAVA_HEAP_SIZE = $opt_H || '1g';
my $TIKA           = $SHARED_HOME.'/lib/tika-app-1.3.jar';
my $PDF2TEXT       = `which pdftotext`;chomp($PDF2TEXT);


# some global variables used for finding words in strings
# LONGEST_WORD = length of the longest word in the vocabulary
# SPLIT_CHAR_IF_NECESSARY = split strings into character sequences
#                           (if they do not contain any single whitespace)
#                           (this is only used with pdfxtk output)
# SPLIT_CHAR = always split strings into character sequence before finding words

my $LONGEST_WORD = undef;
my $SPLIT_CHAR_IF_NECESSARY = 0;
my $SPLIT_CHAR              = $opt_c;

# we require recent versions of pdftotext developed by 
# The Poppler Developers - http://poppler.freedesktop.org
if (-e $PDF2TEXT){
    my $developer = `$PDF2TEXT --help 2>&1 | grep -i 'poppler'`;
    $PDF2TEXT    = undef unless ($developer=~/poppler/i);
}


my %LIGATURES = (
    "\x{0132}" => 'IJ',
    "\x{0133}" => 'ij',
    "\x{FB00}" => 'ff',
    "\x{FB01}" => 'fi',
    "\x{FB02}" => 'fl',
    "\x{FB03}" => 'ffi',
    "\x{FB04}" => 'ffl',
    "\x{FB06}" => 'st');

my $LIGATURES_MATCH = join('|',sort {length($b) <=> length($a)} 
			   keys %LIGATURES);


sub normalize_string{
    chomp($_[0]);
    $_[0]=~s/($LIGATURES_MATCH)/$LIGATURES{$1}/ge;
}


my $pdf_file = shift(@ARGV);

binmode(STDOUT,":encoding(UTF-8)");
binmode(STDERR,":encoding(UTF-8)");

#-------------------------------------------------------
# use pdftotext or Apache Tika to fill the vocabulary 
# and to find possibly hyphenated words
#-------------------------------------------------------

# the vocabulary hash
my %voc=();
my %lm=();

if ($opt_l){
    &read_vocabulary(\%voc,$opt_l);
}

unless ($opt_r || ( ! -e $PDF2TEXT ) ){
    &run_pdftotext_raw($pdf_file,\%voc);
    &make_lm(\%voc,\%lm);
}
$LONGEST_WORD = longest_word(\%voc);

unless ($opt_x){
    &run_pdftotext($pdf_file,\%voc);
    &make_lm(\%voc,\%lm);
}
# run even Apache Tika (unless run_pdftotext does it already)
if ( -e $PDF2TEXT || $opt_x ){
    &run_tika($pdf_file,\%voc);
    &make_lm(\%voc,\%lm);
}


$LONGEST_WORD = longest_word(\%voc);

#-------------------------------------------------------
# use Apache Tika or pdfxtk to produce XHTML output
# and find character sequences that need to be merged
# to form known words (split character sequences, hyphenated words)
#-------------------------------------------------------

binmode(STDOUT,":encoding(UTF-8)");
my $writer = XML::Writer->new( OUTPUT => \*STDOUT, 
			       DATA_MODE => 1,
			       DATA_INDENT => 1 );


my $parser = new XML::Parser( Handlers => { 
    Default => sub{ print $_[1] },
    Char    => sub{ $_[0]->{STRING} .= $_[1] },
    Start   => \&xml_start,
    End     => \&xml_end } );



# use pdfxtk or Apache Tika (default)

if ($opt_T){
    my $pid = open2(\*OUT, undef, $JAVA,'-Xmx'.$JAVA_HEAP_SIZE,
		    '-jar',$TIKA,'-x',$pdf_file);
    $parser->parse(*OUT);
    # close(OUT);
    # waitpid( $pid, 0 );
}
else{
# if ($opt_X){
    my $out_file = &run_pdfxtk($pdf_file);
    open OUT,"<$out_file" || die "cannot read from pdfxtkoutput ($out_file)\n";
    binmode(OUT,":encoding(UTF-8)");
    $SPLIT_CHAR_IF_NECESSARY = 1;
    my $handler = $parser->parse_start;
    while (<OUT>){
#	normalize_string($_);
	$handler->parse_more($_);
    }
    close OUT;
}


# Done!
##########################



sub xml_start{ 
    my $p = shift;
    ## delay printing paragraph boundaries
    ## in order to merge if necessary
    if ($opt_M || $_[0] ne 'p'){
	if ($p->{OPEN_PARA}){
	    $writer->endTag('p');
	    $p->{OPEN_PARA} = 0;
	}
	$writer->startTag(shift, @_);
    }
}

sub xml_end{
    if ($_[0]->{STRING}){

	my @words=();
	normalize_string($_[0]->{STRING});
	my @lines = split(/\n+/,$_[0]->{STRING});

	while (@lines){
	    my $OriginalStr     = shift(@lines);
	    my $DehyphenatedStr = undef;

	    unless ($opt_h){
		while ($OriginalStr=~/\-\s*$/ && @lines){
		    $DehyphenatedStr = $OriginalStr unless ($DehyphenatedStr);
		    $DehyphenatedStr=~s/\-\s*$//;
		    my $nextLine = shift(@lines);
		    $OriginalStr     .= "\n".$nextLine;
		    $DehyphenatedStr .= "\n".$nextLine;
		}
	    }

	    my @tok = find_words( $OriginalStr, 
				  $SPLIT_CHAR_IF_NECESSARY, 
				  $SPLIT_CHAR );
	    if ($DehyphenatedStr){
		my @tok2 = find_words( $DehyphenatedStr, 
				       $SPLIT_CHAR_IF_NECESSARY, 
				       $SPLIT_CHAR );
		@tok = @tok2 if ($#tok2 < $#tok);
	    }
	    push(@words,@tok);
	}

	if (@words){
	    ## check if there is an open paragraph
	    ## merge heuristics: if the first word starts
	    ##  with a lower-cased letter --> merge!
	    ## otherwise: close previous paragraph and start a new one
	    if ($_[0]->{OPEN_PARA}){
		unless ($words[0]=~/^\p{Ll}/){
		    $writer->endTag('p');
		    $writer->startTag('p');
		}
		else{
		    $writer->characters(' ');
		}
	    }
	    else{
		$writer->startTag('p');
	    }
	    $writer->characters( join(' ',@words) );
	    $_[0]->{OPEN_PARA} = 1;
	    if ($words[-1]=~/[.?!]$/){
		$_[0]->{OPEN_PARA} = 0;
	    }
	    unless ($_[0]->{OPEN_PARA} || $opt_M){
		$writer->endTag('p');
	    }
	    $_[0]->{STRING} = '';
	}
    }
    ## delay closing paragraphs
    ## (in case we want to merge with previous one)
    if ($opt_M || $_[1] ne 'p'){
	if ($_[0]->{OPEN_PARA}){
	    $writer->endTag('p');
	    $_[0]->{OPEN_PARA} = 0;
	}
	$writer->endTag($_[1]);
    }
}

sub xml_end_simple{
    if ($_[0]->{STRING}){
	my @words = find_words( $_[0]->{STRING} );
	if (@words){
	    $writer->characters( join(' ',@words) );
	    $_[0]->{STRING} = '';
	}
    }
    $writer->endTag($_[1]);
}



# convert pdf's using pdfxtk

sub run_pdfxtk{
    my $pdf_file = shift;
    my $out_file = shift;

    unless ($out_file){
	(my $fh, $out_file) = tempfile();
	close $fh;
	
    }
    opendir(my $dh, $SHARED_HOME.'/lib/pdfxtk') 
	|| die "can't opendir $SHARED_HOME/lib/pdfxtk: $!";
    my @jars = grep { /\.jar/ } readdir($dh);
    closedir $dh;
    my $CLASSPATH = join( ':', map { $_=$SHARED_HOME.'/lib/pdfxtk/'.$_ } @jars );
    my $pid = open2(undef, undef, 
		    $JAVA,
		    '-Xmx'.$JAVA_HEAP_SIZE,
		    '-cp',$CLASSPATH,
		    'at.ac.tuwien.dbai.pdfwrap.ProcessFile',
		    $pdf_file,$out_file);
    waitpid( $pid, 0 );
    return $out_file;
}


# read output of 'pdftotext -raw'

sub run_pdftotext_raw{
    my $pdf_file = shift;
    my $voc = shift;

    my $pid = open2(\*OUT, undef, $PDF2TEXT,'-raw','-enc','UTF-8',$pdf_file,'-');
    binmode(OUT,":encoding(UTF-8)");

    my $hyphenated=undef;
    while(<OUT>){
	normalize_string($_);
#	chomp;
	my @tok=split(/\s+/);
	if ($hyphenated){
	    my $str = $opt_L ? lc($hyphenated.$tok[0]) : $hyphenated.$tok[0];
	    $$voc{$str}++;
	    print STDERR "possibly hyphenated: $hyphenated -- $tok[0]\n" if ($opt_v);
	    $hyphenated=undef;
	}
	if (@tok){
	    if ($tok[-1]=~/^(.*)-/){
		$hyphenated=$1;
	    }
	}
	foreach (@tok){
	    $_ = lc($_) unless ($opt_L);
	    $$voc{$_}++;
	}
    }
    close(OUT);
    waitpid( $pid, 0 );
}


# read output of standard 'pdftotext' (or Tika if no pdftotext is available)

sub run_pdftotext{
    my $pdf_file = shift;
    my $voc = shift;

    my $pid = ( -e $PDF2TEXT ) ? 
	open2(\*OUT, undef, 'pdftotext','-enc','UTF-8',$pdf_file,'-') :
	open2(\*OUT, undef, $JAVA,'-Xmx'.$JAVA_HEAP_SIZE,
	      '-jar',$TIKA,'-t',$pdf_file);

    binmode(OUT,":encoding(UTF-8)");

    my $hyphenated=undef;
    while(<OUT>){
#	chomp;
	normalize_string($_);
	my @words = find_words($_);
	if ($hyphenated){
	    my $str = $opt_L ? lc($hyphenated.$words[0]) : $hyphenated.$words[0];
	    $$voc{$str}++;
	    print STDERR "possibly hyphenated: $hyphenated -- $words[0]\n" if ($opt_v);
	    $hyphenated=undef;
	}
	if (@words){
	    if ($words[-1]=~/^(.*)-/){
		$hyphenated=$1;
	    }
	}
	foreach (@words){
	    $_ = lc($_) unless ($opt_L);
	    $$voc{$_}++;
	}
    }
    close(OUT);
    waitpid( $pid, 0 );
}


sub run_tika{
    my $pdf_file = shift;
    my $voc = shift;

    my $pid = open2(\*OUT, undef, $JAVA,'-Xmx'.$JAVA_HEAP_SIZE,
		    '-jar',$TIKA,'-t',$pdf_file);

    binmode(OUT,":encoding(UTF-8)");

    my $hyphenated=undef;
    while(<OUT>){
	normalize_string($_);
	chomp;
	my @words = find_words($_);
	if ($hyphenated){
	    my $str = $opt_L ? lc($hyphenated.$words[0]) : $hyphenated.$words[0];
	    $$voc{$str}++;
	    print STDERR "possibly hyphenated: $hyphenated -- $words[0]\n" if ($opt_v);
	    $hyphenated=undef;
	}
	if (@words){
	    if ($words[-1]=~/^(.*)-/){
		$hyphenated=$1;
	    }
	}
	foreach (@words){
	    $_ = lc($_) unless ($opt_L);
	    $$voc{$_}++;
	}
    }
    close(OUT);
    waitpid( $pid, 0 );
}




##########################################################################
#### this is a greedy left-to-right search for the longest known words
#### ---> this easily leads to many mistakes
#### ---> better use the find_segment LM-based method 
####      and its dynamic programming procedure
##########################################################################


# find the longest known words in a string
#
#  $split_char_when_necessary = 1 ---> split into character sequences if string has no whitespaces
#  $split_char = 1 ---> always split into character sequences

sub find_longest_words{
    my @tokens1 = @_;

    return @tokens1 if ($opt_m);          # skip merging ...
    my @words = ();

    my @tokens2   = ();
    my $remaining = \@tokens1;
    my $current   = \@tokens2;

    # max number of tokens to be considered
    my $LENGTH_THR = $LONGEST_WORD || @tokens1;

    while (@{$remaining}){
	($current,$remaining) = ($remaining,$current);
	@{$remaining} = ();

	# pessimistic selection of tokens: 
	# not more than the length of the longest known word
	# (assuming that each token is at least 1 character long)
	my @more = splice(@{$current},$LENGTH_THR);

	# join all current tokens and see if they form a known word
	my $str = join('',@{$current});
	$str = lc($str) unless ($opt_L);

	# remove the final token until we have a known word
	until (exists $voc{$str}){
	    last unless (@{$current});
	    unshift( @{$remaining}, pop(@{$current}) );
	    $str = join('',@{$current});
	    $str = lc($str) unless ($opt_L);
	}

	# more than one token? 
	# --> successfully (?) found a token sequence that should be merged
	if ($#{$current}>0){
	    $voc{$str}++;
	    print STDERR join(' ',@{$current})," --> $str\n" if ($opt_v);
	}

	# need to restore non-lowercased version if necessary
	$str = join('',@{$current}) unless ($opt_L);

	# add the detected word to the list (or the next one)
	if ($str){ push(@words,$str); }
	else{      push(@words,shift @{$remaining}); }

	# add additional tokens from the sentence
	push(@{$remaining},@more);
    }
    return @words;
}


#
# find segments that best match our simple unigram language model
#


sub find_segments{
    my @tokens = @_;
    return @tokens if ($opt_m);          # skip ....

    # max number of tokens to be considered
    my $LENGTH_THR = $LONGEST_WORD || length(join('',@tokens));

    unshift(@tokens,'START');

    my @scores = ();
    my @trace = ();
    for my $i (0..$#tokens){
	for my $j ($i+1..$i+$LENGTH_THR){
	    last if ( $j > $#tokens );
	    my @current = @tokens[$i+1..$j];
	    my $str = join('',@current);
	    $str = lc($str) unless ($opt_L);
	    $str = &try_dehyphenation($str);

	    # stop if the length is longer than the longest known word
	    last if ( length($str) > $LENGTH_THR );

	    # skip if str is not known (and not a single character)
	    next unless (exists($lm{$str}) || $#current == 0);

	    # unigram probability (or unknown word prob)
	    my $prob = exists($lm{$str}) ? $lm{$str} : $lm{__unknown__};
	    my $start_score = $i ? $scores[$i] : 0;
	    if (exists $scores[$j]){
		if ( $start_score + $prob > $scores[$j] ){
		    $scores[$j] = $start_score + $prob;
		    $trace[$j] = $i;
		}
	    }
	    else{
		$scores[$j] = $start_score + $prob;
		$trace[$j] = $i;
	    }
	}
    }

    my @words;
    my $i=$#tokens;
    # print STDERR "best LM score = $scores[$i]\n" if ($scores[$i] && $opt_v);

    while ($i > 0){
	my @current = @tokens[$trace[$i]+1..$i];
	my $str = join('',@current);
	if ($opt_v){
	    if ($i > $trace[$i]+1){
		print STDERR join(' ',@current)," --> $str\n";
	    }
	}
	$str = &try_dehyphenation($str);
	unshift(@words,$str);
	$i = $trace[$i];
    }
    return @words;
}



sub find_words{
    my ($string,$pdfxtk,$charsplit) = @_;
    if ($charsplit){
	return find_words_charlevel($string);
    }
    if ($pdfxtk){
	return find_words_pdfxtk($string);
    }
    return find_words_standard($string);
}


sub find_words_standard{
    $_[0]=~s/^\s*//;
    return find_segments( split(/\s+/,$_[0]) );
    # return find_longest_words( split(/\s+/,$_[0]) );
}

sub find_words_charlevel{
    $_[0]=~s/^\s*//;
    return find_segments( split(//,$_[0]) );
    # return find_longest_words( split(//,$_[0]) );
}


# post-process conversion by pdfxtk

sub find_words_pdfxtk{
    my $string = shift;
    $string=~s/^\s*//;
    
    my %ligatures = ();
    foreach (values %LIGATURES){
	$ligatures{$_} = $_;
    }
    # sometimes only the second letter remains after conversion
    # (using pdftotext for example)
    # TODO: 'ffi' can also become 'i' (example: Effizienz --> Eiizienz)
    $ligatures{'l'} = 'fl';
    $ligatures{'i'} = 'fi';
    $ligatures{'f'} = 'ff';

    my @words = ();
    my @tokens = ();
    if ($string=~/\s/){
	@tokens = split(/\s+/,$string);
    }
    else{
	# return find_words_charlevel($string);
	@tokens = find_words_charlevel($string);
    }

    foreach (@tokens){

	# suspiciously long words ....
	if ( length($_) > $LONGEST_WORD ){
	    push(@words, find_words_charlevel($_) );
	}

	# upper-case letters following a lower-cased one ...
	elsif ( $_ =~/\p{Ll}\p{Lu}/ ){
	    push(@words, find_words_charlevel($_) );
	}
	else{
	    push(@words, $_);
	}
    }

    foreach (0..$#words){
	$words[$_] = &try_dehyphenation($words[$_]);
    }

    # more post-processing: merge words if necessary
    # TODO: check if this does more harm than good for some languages
    #       the heuristics are quite effective for German at least ....
    # TODO: add other ligature-strings that need to be checked for

    my @clean=();
    my $i=0;
WORD:    while ($i<$#words){
	my $this = $words[$i];
	my $next = $words[$i+1];
	$this = lc($this) unless ($opt_L);
	$next = lc($next) unless ($opt_L);

	# # dehyphenate if necessary
	# if ($this=~/^(.+)-/){
	#     if (exists $voc{$1.$next}){
	# 	$words[$i]=~s/\-$//;
	# 	push(@clean,$words[$i].$words[$i+1]);
	# 	print STDERR "merge $words[$i]+$words[$i+1]\n" if ($opt_v);
	# 	$i+=2;
	# 	next;
	#     }
	# }

	# if either this or the next word does not exist in the vocabulary:
	if (! exists $voc{$this} || ! exists $voc{$next} ){

	    # check if a concatenated version exists
	    if (exists $voc{$this.$next}){
		push(@clean,$words[$i].$words[$i+1]);
		print STDERR "merge $words[$i]+$words[$i+1]\n" if ($opt_v);
		$i+=2;
		next;
	    }
	    # check if pdfxtk swallowed ligatures such as 'ff' and 'fi'
	    else{
		foreach my $l (sort {length($b) <=> length($a)} 
			       keys %ligatures){
		    if (exists $voc{$this.$l.$next}){
			push(@clean,$words[$i].$ligatures{$l}.$words[$i+1]);
			print STDERR "add '$ligatures{$l}' and merge $words[$i] + $words[$i+1]\n" if ($opt_v);
			$i+=2;
			next WORD;
		    }
		}
	    }
	}

	# nothing special? --> just add the current word
	push(@clean,$words[$i]);
	$i++;
    }
    if (@words){
	push(@clean,$words[-1]);
    }

    foreach my $i (0..$#clean){

	# don't do it with single letters!
	next if (length($clean[$i]) < 2);

	my $this = $clean[$i];
	$this = lc($this) unless ($opt_L);

	# if the current word does not exist in the vocabulary
	# check if adding ligature strings helps
	if (! exists $voc{$this}){
	    foreach my $l (sort {length($b) <=> length($a)} values %ligatures){
		if (exists $voc{$l.$this}){
		    print STDERR "add '$ligatures{$l}' to $clean[$i]\n" if ($opt_v);
		    $clean[$i]=$ligatures{$l}.$clean[$i];
		    last;
		}
		elsif (exists $voc{$this.$l}){
		    print STDERR "add '$ligatures{$l}' after $clean[$i]\n" if ($opt_v);
		    $clean[$i]=$clean[$i].$ligatures{$l};
		    last;
		}
	    }
	}
    }

    return @clean;
}





sub read_vocabulary{
    my ($voc,$file) = @_;
    if ($file=~/\.gz$/){
	open F,"gzip -cd < $file |" || die "cannot read from $file";
	binmode(F,":encoding(UTF-8)");
    }
    else{
	open F,"<:encoding(UTF-8)",$file || die "cannot read from $file";
    }
    while (<F>){
	chomp;
	my @words = split(/\s+/);
	foreach (@words){
	    $_ = lc($_) unless ($opt_L);
	    $$voc{$_}++;
	}
    }
}

sub longest_word{
    my ($voc) = @_;
    my $len=0;
    foreach (keys %{$voc}){
	my $l = length($_);
	$len = $l if ($l > $len);
    }
    return $len;
}

# make a simple unigram LM

sub make_lm{
    my $voc=shift;
    my $lm=shift;

    %{$lm} = %{$voc};
    my $total=0.1;
    map ($total+=$$lm{$_},keys %{$lm});
    map ($$lm{$_} = log($$lm{$_}) - log($total), keys %{$lm});
    $$lm{__unknown__} = log(0.1) - log($total);
}

sub dehyphenate{
    my ($part1,$part2)=@_;
    $part1=~s/\-$//;
    return $part1.$part2;
}

sub try_dehyphenation{
    my $word=shift;
    if ($word=~/.\-./){
	my $str = $word;
	$str=~s/\-//g;
	my $lc_str = $opt_L ? $str : lcfirst($str);
	if (exists $voc{$lc_str}){
	    $word=$str;
	}
    }
    return $word;
}


__END__