pdf2xml - metacpan.org

#!/usr/bin/env perl
#-*-perl-*-

=encoding utf-8

=head1 NAME

pdf2xml - extract text from PDF files and wraps it in XML

=head1 SYNOPSIS

 pdf2xml [OPTIONS] pdf-file > output.xml

=head1 OPTIONS

 -h ............. skip de-hypenation (keep hyphenated words)
 -l lexicon ..... provide a list of words or a text in the target language
 -L ............. skip lowercasing (which is switched in by default)
 -m ............. skip merging character sequences (not recommended)
 -r ............. skip 'pdftotext -raw' (not recommended)
 -x ............. skip standard 'pdftotext'
 -X ............. use pdfXtk to convert to XHTML (and nothing else)
 -v ............. verbose output

=head1 DESCRIPTION

pdf2xml calls pdftotext and Apache Tika to extract text from PDf files and to convert them to XML (actually XHTML). It also uses some heuristics to find words that should not be split into character sequences (which often happens with pdf-text extraction tools) and it also tries to put hyphenated words together.

Example: raw is without cleanup heuristics

  raw:    <p>PRESENTATION ET R A P P E L DES PRINCIPAUX RESULTATS 9</p>
  clean:  <p>PRESENTATION ET RAPPEL DES PRINCIPAUX RESULTATS 9</p>

  raw:    <p>2. Les c r i t è r e s de choix : la c o n s o m m a t i o n 
             de c o m b u s - t ib les et l e u r moda l i t é 
             d ' u t i l i s a t i on d 'une p a r t , 
             la concen t r a t ion d ' a u t r e p a r t 16</p>

  clean:  <p>2. Les critères de choix : la consommation 
             de combustibles et leur modalité 
             d'utilisation d'une part, 
             la concentration d'autre part 16</p>

=head1 TODO

This is quite slow and loading Apache Tika for each conversion is not very efficient. Using the server mode of Apache Tika would be a solution.

Character merging heuristics are very simple. Using the longest string forming a valid word from the vocabulary may lead to many incorrect words in context for some languages. Also, the implementation of the merging procedure is probably not the most efficient one.

De-hyphenation heuristics could also be improved. The problem is to keep it as language-independent as possible.

=head1 SEE ALSO

Apache Tika: L<http://tika.apache.org>

The Poppler Developers - L<http://poppler.freedesktop.org>

pdfXtk L<http://sourceforge.net/projects/pdfxtk/>


=head1 COPYRIGHT AND LICENSE

Copyright (C) 2013 by Joerg Tiedemann

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.8 or,
at your option, any later version of Perl 5 you may have available.

=cut

use strict;

use FindBin qw/$Bin/;
use IPC::Open2;
use XML::Parser;
use XML::Writer;
use File::Temp qw /tempfile/;


use vars qw($opt_h $opt_L $opt_l $opt_m $opt_r $opt_x $opt_v $opt_X);
use Getopt::Std;
getopts('hLl:mrxXv');

# home of shared data (where Apache Tika should be)

my $SHARED_HOME;
eval{ 
    require File::ShareDir; 
    $SHARED_HOME = File::ShareDir::dist_dir('Text-PDF2XML'); 
};
unless (-d $SHARED_HOME){
    $SHARED_HOME = $Bin.'/share';
}

my $JAVA     = 'java';
my $TIKA     = $SHARED_HOME.'/lib/tika-app-1.3.jar';
my $PDF2TEXT = `which pdftotext`;chomp($PDF2TEXT);

# we require recent versions of pdftotext developed by 
# The Poppler Developers - http://poppler.freedesktop.org
if (-e $PDF2TEXT){
    my $developer = `$PDF2TEXT --help 2>&1 | grep -i 'poppler'`;
    $PDF2TEXT    = undef unless ($developer=~/poppler/i);
}

my $pdf_file = shift(@ARGV);


##----------------------------------------------
## -X ---> use pdfXtk to convert to HTML
## (and do nothing special about post-processing, at least in this version ....)


if ($opt_X){
    my ($fh, $filename);
    $filename = $ARGV[0];
    unless ($filename){
	($fh, $filename) = tempfile();
	close $fh;
	
    }
    opendir(my $dh, $SHARED_HOME.'/lib/pdfxtk') 
	|| die "can't opendir $SHARED_HOME/lib/pdfxtk: $!";
    my @jars = grep { /\.jar/ } readdir($dh);
    closedir $dh;
    my $CLASSPATH = join( ':', map { $_=$SHARED_HOME.'/lib/pdfxtk/'.$_ } @jars );
    my $pid = open2(undef, undef, 
		    $JAVA,'-cp',$CLASSPATH,'at.ac.tuwien.dbai.pdfwrap.ProcessFile',
		    $pdf_file,$filename);
    waitpid( $pid, 0 );
    unless ($ARGV[0]){
	system('cat',$filename);
    }
    exit 1;
}

##----------------------------------------------


# the vocabulary
my %voc=();

if ($opt_l){
    &read_vocabulary(\%voc,$opt_l);
}


# read output of 'pdftotext -raw'

unless ($opt_r || ( ! -e $PDF2TEXT ) ){
    my $pid = open2(\*OUT, undef, $PDF2TEXT,'-raw','-enc','UTF-8',$pdf_file,'-');
    binmode(OUT,":encoding(UTF-8)");

    my $hyphenated=undef;
    while(<OUT>){
	chomp;
	my @tok=split(/\s+/);
	if ($hyphenated){
	    $voc{$hyphenated.$tok[0]}++;
	    print STDERR "possibly hyphenated: $hyphenated -- $tok[0]\n" if ($opt_v);
	    $hyphenated=undef;
	}
	if (@tok){
	    if ($tok[-1]=~/^(.*)-/){
		$hyphenated=$1;
	    }
	}
	foreach (@tok){
	    $voc{$_}++;
	}
    }
    close(OUT);
    waitpid( $pid, 0 );
}

my $LONGEST_WORD = longest_word(\%voc);

# read output of standard 'pdftotext' (or Tika if no pdftotext is available)

unless ($opt_x){

    my $pid = ( -e $PDF2TEXT ) ? 
	open2(\*OUT, undef, 'pdftotext','-enc','UTF-8',$pdf_file,'-') :
	open2(\*OUT, undef, 'java','-jar',$TIKA,'-x',$pdf_file);

    binmode(OUT,":encoding(UTF-8)");

    while(<OUT>){
	chomp;
	my @words = find_words($_);
	foreach (@words){
	    $voc{$_}++;
	}
    }
    close(OUT);
    waitpid( $pid, 0 );
}

$LONGEST_WORD = longest_word(\%voc);


binmode(STDOUT,":encoding(UTF-8)");
my $writer = XML::Writer->new( OUTPUT => \*STDOUT, 
			       DATA_MODE => 1,
			       DATA_INDENT => 1 );


my $parser = new XML::Parser( Handlers => { 
    Default => sub{ print $_[1] },
    Char    => sub{ $_[0]->{STRING} .= $_[1] },
    Start   => \&xml_start,
    End     => \&xml_end } );


my $pid = open2(\*OUT, undef, 'java','-jar',$TIKA,'-x',$pdf_file);
$parser->parse(*OUT);

# close(OUT);
# waitpid( $pid, 0 );


sub xml_start{ 
    shift;
    $writer->startTag(shift, @_);
}

sub xml_end{
    if ($_[0]->{STRING}){

	my @words=();
	my @lines = split(/\n+/,$_[0]->{STRING});

	while (@lines){
	    my $OriginalStr     = shift(@lines);
	    my $DehyphenatedStr = undef;

	    unless ($opt_h){
		while ($OriginalStr=~/\-\s*$/ && @lines){
		    $DehyphenatedStr = $OriginalStr unless ($DehyphenatedStr);
		    $DehyphenatedStr=~s/\-\s*$//;
		    my $nextLine = shift(@lines);
		    $OriginalStr     .= "\n".$nextLine;
		    $DehyphenatedStr .= "\n".$nextLine;
		}
	    }

	    my @tok = find_words( $OriginalStr );
	    if ($DehyphenatedStr){
		my @tok2 = find_words( $DehyphenatedStr );
		@tok = @tok2 if ($#tok2 < $#tok);
	    }
	    push(@words,@tok);
	}

	if (@words){
	    $writer->characters( join(' ',@words) );
	    $_[0]->{STRING} = '';
	}
    }
    $writer->endTag($_[1]);
}

sub xml_end_simple{
    if ($_[0]->{STRING}){
	my @words = find_words( $_[0]->{STRING} );
	if (@words){
	    $writer->characters( join(' ',@words) );
	    $_[0]->{STRING} = '';
	}
    }
    $writer->endTag($_[1]);
}



sub find_words{
    my @words = ();

    my @tokens1 = split(/\s+/,$_[0]);
    return @tokens1 if ($opt_m);          # skip merging ...

    my @tokens2   = ();
    my $remaining = \@tokens1;
    my $current   = \@tokens2;

    # max number of tokens to be considered
    my $LENGTH_THR = $LONGEST_WORD || @tokens1;

    while (@{$remaining}){
	($current,$remaining) = ($remaining,$current);
	@{$remaining} = ();

	# pessimistic selection of tokens: 
	# not more than the length of the longest known word
	# (assuming that each token is at least 1 character long)
	my @more = splice(@{$current},$LENGTH_THR);

	# join all current tokens and see if they form a known word
	my $str = join('',@{$current});
	$str = lc($str) unless ($opt_L);

	# remove the final token until we have a known word
	until (exists $voc{$str}){
	    last unless (@{$current});
	    unshift( @{$remaining}, pop(@{$current}) );
	    $str = join('',@{$current});
	    $str = lc($str) unless ($opt_L);
	}

	# more than one token? 
	# --> successfully (?) found a token sequence that should be merged
	if ($#{$current}>0){
	    $voc{$str}++;
	    print STDERR join(' ',@{$current})," --> $str\n" if ($opt_v);
	}

	# need to restore non-lowercased version if necessary
	$str = join('',@{$current}) unless ($opt_L);

	# add the detected word to the list (or the next one)
	if ($str){ push(@words,$str); }
	else{      push(@words,shift @{$remaining}); }

	# add additional tokens from the sentence
	push(@{$remaining},@more);
    }
    return @words;
}



sub read_vocabulary{
    my ($voc,$file) = @_;
    if ($file=~/\.gz$/){
	open F,"gzip -cd < $file |" || die "cannot read from $file";
	binmode(F,":encoding(UTF-8)");
    }
    else{
	open F,"<:encoding(UTF-8)",$file || die "cannot read from $file";
    }
    while (<F>){
	chomp;
	my @words = split(/\s+/);
	foreach (@words){
	    $_ = lc($_) unless ($opt_L);
	    $$voc{$_}++;
	}
    }
}

sub longest_word{
    my ($voc) = @_;
    my $len=0;
    foreach (keys %{$voc}){
	my $l = length($_);
	$len = $l if ($l > $len);
    }
    return $len;
}

__END__
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)