package Biblio::Document::Parser::Utils;

######################################################################
#
# Biblio::Document::Parser::Utils
#
######################################################################
#
#  This file is part of ParaCite Tools (http://paracite.eprints.org/developers/)
#
#  Copyright (c) 2002 University of Southampton, UK. SO17 1BJ.
#
#  ParaTools is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  ParaTools is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with ParaTools; if not, write to the Free Software
#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
######################################################################

use utf8;
use strict;
use warnings;

require Exporter;
use LWP::UserAgent;
use File::Temp qw/ tempfile tempdir /;
use URI;

# Package globals.  The Exporter tables are read by Exporter itself; the
# remaining variables are the customisation points of this module and are
# filled in further down the file.
our(
    @ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS,
    $CHAR_MATCHES, %CHAR_TRANSFORMS, %CONVERTERS, $DEBUG,
);

@ISA       = qw( Exporter );
@EXPORT_OK = qw( &normalise_multichars );    # imported on request only
@EXPORT    = qw( &get_content );             # imported by default

# Set to a true value to dump the character transformation table to
# STDOUT when the module is loaded.
$DEBUG = 0;

=pod

=encoding utf8

=head1 NAME

Biblio::Document::Parser::Utils - utility module for handling
international characters and document conversion

=head1 DESCRIPTION

Biblio::Document::Parser::Utils provides some utility functions for
handling international characters and for conversion of documents to
plaintext.

=cut
=head1 SYNOPSIS

  use Biblio::Document::Parser::Utils qw( normalise_multichars );

  print normalise_multichars( $str );

=head1 METHODS

=over 4

=item $str = normalise_multichars( $str )

Convert multi-char international characters into single UTF-8 chars, e.g.:

  ¨o => ö

These appear in pdftotext output from PDFs generated by pdflatex.

Only the accent/letter pairs listed in C<%CHAR_TRANSFORMS> are rewritten.
Everything else is returned untouched, including pairs that
C<$CHAR_MATCHES> matches but the table does not cover (C<^n>, say).
An undefined argument is returned as undef.

=cut

# Candidate pairs: one accent character (^ ` diaeresis, acute accent, ~)
# followed by one of the base letters used in %CHAR_TRANSFORMS.  The pattern
# is deliberately wider than the table; normalise_multichars() only rewrites
# pairs that really have a table entry.
$CHAR_MATCHES = '[\x{5e}\x{60}\x{a8}\x{b4}\x{7e}][aeounz]';

# Accent + letter => precomposed character.
%CHAR_TRANSFORMS = (
    # circumflex
    "\x{5e}a" => "\x{e2}",
    "\x{5e}e" => "\x{ea}",
    "\x{5e}o" => "\x{f4}",
    "\x{5e}u" => "\x{fb}",
    # grave
    "\x{60}a" => "\x{e0}",
    "\x{60}e" => "\x{e8}",
    "\x{60}o" => "\x{f2}",
    "\x{60}u" => "\x{f9}",
    # diaeresis
    "\x{a8}a" => "\x{e4}",
    "\x{a8}e" => "\x{eb}",
    "\x{a8}o" => "\x{f6}",
    "\x{a8}u" => "\x{fc}",
    # acute
    "\x{b4}a" => "\x{e1}",
    "\x{b4}e" => "\x{e9}",
    "\x{b4}o" => "\x{f3}",
    "\x{b4}u" => "\x{fa}",
    "\x{b4}n" => "\x{144}",
    "\x{b4}z" => "\x{17a}",
    # tilde
    "\x{7e}n" => "\x{f1}",
);

# Shell command templates, keyed by (lower-case) file extension.  _IN_ and
# _OUT_ are replaced by the source and destination file names.  Customise
# this table for the converters installed on your system.
%CONVERTERS = (
    doc  => "wvText _IN_ _OUT_",
    pdf  => "pdftotext -raw _IN_ _OUT_",
    ps   => "pstotext -output _OUT_ _IN_",
    htm  => "links --dump _IN_ > _OUT_",
    html => "links --dump _IN_ > _OUT_",
);

if( $DEBUG )
{
    binmode( STDOUT, ":utf8" );
    for( sort { $a cmp $b } keys %CHAR_TRANSFORMS )
    {
        print "$_ => $CHAR_TRANSFORMS{$_}\n";
    }
}

sub normalise_multichars
{
    my( $str ) = @_;

    return $str unless defined $str;

    # Fall back to the matched text when the table has no entry for it;
    # substituting the (undefined) lookup result would silently delete
    # ordinary text such as "x^n".
    $str =~ s/($CHAR_MATCHES)/exists $CHAR_TRANSFORMS{$1} ? $CHAR_TRANSFORMS{$1} : $1/ge;

    return $str;
}

=pod

=item $content = Biblio::Document::Parser::Utils::get_content($location)

This function takes either a filename or a URL (http or https) as a
parameter, and aims to return a string containing the lines in the file.
A hash of converters, C<%CONVERTERS>, is provided in this module and
should be customised for your system. The converter is chosen from the
file extension (case-insensitively); a URL without an extension is treated
as HTML and a local file without one as plain text.

For URLs, the file is first downloaded to a temporary directory, then
converted, whereas local files are copied straight into the temporary
directory. For this reason, some care should be taken when handling very
large files.

Nothing (undef in scalar context) is returned when the document cannot be
downloaded, when no converter is known for its type, or when the converted
text cannot be opened. The function dies if no location is given or if a
local file cannot be read or copied.

=cut

sub get_content
{
    my( $location ) = @_;

    die "get_content: no location given\n"
        unless defined $location && length $location;

    my $is_remote = ( $location =~ m{^https?://}i ) ? 1 : 0;
    my $type = _guess_type( $location, $is_remote );

    # Get some temporary files ready.  The source copy keeps the original
    # extension in its name.
    my $dir = tempdir( CLEANUP => 1 );
    my( undef, $tofile ) = tempfile( UNLINK => 1, DIR => $dir, SUFFIX => ".txt" );
    my( undef, $fromfile ) = tempfile( UNLINK => 1, DIR => $dir, SUFFIX => ".$type" );

    # Now we know the type, grab the file.
    if( $is_remote )
    {
        _fetch_remote( $location, $fromfile ) or return;
    }
    else
    {
        _copy_local( $location, $fromfile );
    }

    if( $type ne "txt" )
    {
        # Convert from the $fromfile to the $tofile.
        _convert( $type, $fromfile, $tofile ) or return;
    }
    else
    {
        # If we have text, just use the fromfile.
        $tofile = $fromfile;
    }

    return _slurp( $tofile );
}

# Work out the document type (lower-cased extension) of $location, falling
# back to "html" for URLs and "txt" for local files.
sub _guess_type
{
    my( $location, $is_remote ) = @_;

    # Only the path part of a URL can carry an extension; looking at the
    # whole URL would be thrown off by query strings and would mistake the
    # top-level domain of a bare host name for a file type.
    my $name = $is_remote ? URI->new( $location )->path : $location;

    if( defined $name && $name =~ /\.(\w+)\z/ )
    {
        return lc $1;
    }

    if( $is_remote )
    {
        print STDERR "Unknown type - assuming HTML\n";
        return "html";
    }

    print STDERR "Unknown type - assuming plaintext\n";
    return "txt";
}

# Download $location into $fromfile.  Returns true on success; on failure
# reports the HTTP status on STDERR and returns false.
sub _fetch_remote
{
    my( $location, $fromfile ) = @_;

    # Not mirror(): it sends If-Modified-Since based on the target file's
    # timestamp, and our target is a temp file created a moment ago, so a
    # server may answer "304 Not Modified" and leave the file empty.
    my $ua = LWP::UserAgent->new;
    my $response = $ua->get( $location, ':content_file' => $fromfile );

    return 1 if $response->is_success;

    print STDERR "Unable to retrieve $location: " . $response->status_line . "\n";
    return 0;
}

# Copy the local file $from to $to byte for byte.  Dies on any I/O error.
sub _copy_local
{
    my( $from, $to ) = @_;

    open( my $in, '<', $from ) or die "Cannot read $from: $!";
    open( my $out, '>', $to ) or die "Cannot write $to: $!";

    # Sources may be binary (PDF, Word, ...), so copy raw bytes.
    binmode( $in );
    binmode( $out );

    my $buffer;
    while( 1 )
    {
        my $got = read( $in, $buffer, 65536 );
        die "Error reading $from: $!" unless defined $got;
        last if $got == 0;
        print {$out} $buffer or die "Error writing $to: $!";
    }

    close( $out ) or die "Error closing $to: $!";
    close( $in ) or die "Error closing $from: $!";

    return;
}

# Run the converter registered for $type on $fromfile, producing $tofile.
# Returns false (after a message on STDERR) when no converter is known.
sub _convert
{
    my( $type, $fromfile, $tofile ) = @_;

    my $template = $CONVERTERS{$type};
    if( !$template )
    {
        print STDERR "Sorry, no converters available for type $type\n";
        return 0;
    }

    # Fill in both placeholders in one pass so that a placeholder-like
    # sequence inside a generated file name is never substituted again.
    my %path_for = ( IN => $fromfile, OUT => $tofile );
    ( my $command = $template ) =~ s/_(IN|OUT)_/$path_for{$1}/g;

    # The templates rely on shell features (output redirection), hence the
    # single-string form of system().  The file names are generated by
    # File::Temp and the extension is restricted to \w characters.
    if( system( $command ) != 0 )
    {
        # Best effort: report the problem but still hand back whatever
        # output the converter managed to write.
        print STDERR "Converter for type $type exited abnormally (status $?): $command\n";
    }

    return 1;
}

# Return the whole content of $file, or nothing if it cannot be opened.
sub _slurp
{
    my( $file ) = @_;

    open( my $in, '<', $file ) or return;

    my $content = do { local $/; <$in> };
    $content = "" unless defined $content;

    close( $in ) or die $!;

    return $content;
}

=pod

=item $escaped_url = Biblio::Document::Parser::Utils::url_escape($string)

Simple function to convert a string into an encoded URL (i.e. spaces to
%20, etc). Takes the unencoded URL as a parameter, and returns the
encoded version.

=cut

sub url_escape
{
    my( $url ) = @_;

    # Escape these characters up front: '#', ';' and '&' are legal URI
    # characters that would otherwise be kept as delimiters, and angle
    # brackets are encoded before the string reaches URI->new.
    $url =~ s/</%3C/g;
    $url =~ s/>/%3E/g;
    $url =~ s/#/%23/g;
    $url =~ s/;/%3B/g;
    $url =~ s/&/%26/g;

    # URI takes care of the remaining unsafe characters (spaces etc.).
    my $uri = URI->new( $url );

    return $uri->as_string;
}

=pod

=back

=head1 AUTHOR

Tim Brody

Mike Jewell (packaging)

=cut

1;