package Biblio::Citation::Parser::Jiao::Utility; ###################################################################### # # ParaTools::Citation::Parser::Jiao::Utility; # ###################################################################### # # This file is part of ParaCite Tools # Based on Zhuoan Jiao's (zj@ecs.soton.ac.uk) citation parser (available # at http://arabica.ecs.soton.ac.uk/code/doc/ReadMe.html) # # The code is relatively unchanged, except to bring into compliance # with the ParaCite metadata style, and to allow interoperability with # the other parsers. # # Copyright (c) 2002 University of Southampton, UK. SO17 1BJ. # # ParaTools is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # ParaTools is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with ParaTools; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # ###################################################################### use strict; use vars qw($VERSION @ISA @EXPORT @EXPORT_OK); require Exporter; @ISA = qw(Exporter); @EXPORT = qw(&normalisation &normalise_html &normalise_name &normalise_date &normalise_journal &num_of_figures); $VERSION = '0.01'; # # Normalisation utilities # sub normalisation { my($Text) = @_; # replace embedded '\n' with ' ' $Text =~ s/^\s+//s; $Text =~ s/\s+$//s; $Text =~ s/\s+/ /g; # Use single space # while ($Text =~ /.+?\n.+?/g) { # $Text =~ s/(.+?)\n(.+?)/$1 $2/ # }; $Text =~ s/``(.*?)''/"$1"/sg; # Replace ``A Paper Title'' with "A Paper Title" $Text =~ s/\s*-\s*/-/g; # remove space around '-' $Text =~ s/\s*'\s*/'/g; # remove space around ' $Text =~ s/\s*:\s*/:/g; # remove space around : $Text =~ s/\(\s+/\(/g; # ( 1998) ==> (1998) $Text =~ s/\s+\)/\)/g; # (1998 ) ==> (1998) # while ($Text =~/--/g) {$Text =~ s/--/-/}; # use single '-'. $Text =~ s/--+/-/g; $Text =~ s/~//g; # remove '~' (e.g. C.~B.~Hanna) $Text =~ s/[,;\s]+$//; # remove last ',;\s' on a line # 'Nr.' caused error in 'find_jnl_name', i.e. it became as # 'journal name' if not removed. (see: arXiv:quant-ph/9905016) $Text =~ s/([,;])\s*Nr\.\s*(\d+[,;])/$1$2/i; # "[12] For example, R. Machleidt, ...Phys. Rep. 149, 1 (1987) $Text =~ s/^([^a-z]+)for example\W+/$1/i; # '[18] *** G. Do Dang, ... (arXiv:nucl-th/9911081) $Text =~ s/\*+//g; # Phys. Rev. D56 => Phys. Rev. D. 56 $Text =~ s/phys.{1,6}rev.{1,6}([a-z])(\d+)/PHYS. REV. $1 $2/ig; # Physica 34 D => Physic D 34 $Text =~ s/physica\s+(\d+)\s+([a-z])/PHYSICA $2 $1/ig; # Nucl. Phys. B567 => Nucl. Phys. B $Text =~ s/nuc.{1,6}phys.{1,6}\s+([a-z])(\d+)/NUCL. PHYS. $1 $2/ig; return $Text; }; sub chr_valid { my $c = shift; if( $c < 128 || $c > 255 ) { return chr($c); } else { return ' '; } } sub normalise_html { my($Text) = @_; use utf8; # remove
tag $Text =~ s/
//ig; # Convert HTML entities to Unicode $Text =~ s/\&#x(\w+);/chr_valid(hex($1))/eg; $Text =~ s/\&#(\d+);/chr_valid($1)/eg; $Text =~ s/&(\w)acute[;,]/$1/g; # a ' on top of (\w) $Text =~ s/&(\w)cedil[;,]/$1/g; # a 'tail' under (\w), e.g Francios $Text =~ s/&(\w)grave[;,]/$1/g; # a ` on top of (\w) $Text =~ s/&(\w)tilde[;,]/$1/g; $Text =~ s/&(\w)uml[;,]/$1e/g; # a '..' on top of (\w) $Text =~ s/&(\w)slash[;,]/$1/g; #$Text =~ s/&#-88;(\w)/$1/g; # as &(\w)uml (see astro-ph/9811179) $Text =~ s/&#-\d+;\s*(\w)/$1/g; $Text =~ s/ß[;,]/ss/g; $Text =~ s/&[;,]/ and /g; $Text =~ s/\s*(\w)/$1/g; # a ~ on top of (\w) $Text =~ s/\/?i>//g; # cogprints $Text =~ s/&[a-z]+;//g; # otherwise ';' cause ref line break. # $Text =~ s/&#\d+;//g; # e.g. '' in 'hep-th/0001001 [99]'. $Text =~ s/\\"(\w)/$1e/g; # G\"unter => Gueter $Text =~ s/\^//g; # remove ^ $Text =~ s/([A-Z])\s*&\s*([A-Z])/$1 and $2/g; # replace '&' with 'and' $Text =~ s/[, ]+& / and /; # remove HTML markups ( etc.) $Text =~ s/<[a-z\/]{1,3}>//ig; return $Text } sub normalise_name { my($Text) = @_; my $Suffix = ''; $Text =~ s/~//; # remove typo # Jr. if ($Text =~ s/[, \.]+(Jr|Sr|Snr)\.?\s*$//i){ $Suffix = $1 } elsif ($Text =~ s/([, \.]+)(Jr|Sr|Snr)[. ]/$1/i){ $Suffix = $2 }; # van der Buren D => D van der Buren" if ($Text =~ /^\s*(((van|von|de|den|der)\s+)+)(\S\S+)\s+(.+)/i) { $Text = "$5 $1 $4" }; $Text =~ s/\s+/ /g; # single space $Text =~ s/^\W+//; $Text =~ s/\s+$//; # "A. Smith" => "A.Smith" $Text =~ s/([a-z])s+\./$1\./ig; # Ghisellini G. A. ==> G.A. Ghisellini # Konenkov D. Yu. => D.Yu. Konenkov if ($Text =~ /^([^\s.]{2,})\s+(([A-Z][a-zA-Z]?\W+)*)([A-Z][a-zA-Z]?)\W*$/) { $Text = "$2$4 $1" }; $Text = "$Text $Suffix" if ($Suffix); $Text = tdb_normalise_name($Text); return $Text; }; # Based on Tim's Authors::splitauthors and Authors::_cmonauthor(); # This subroutine is called simply because we want the author names # to be transformed to a same style used by Tim's programs. Otherwise # a join on author names in Publication and Reference tables will # miss a lot of targets. sub tdb_normalise_name{ my $author = shift; # Strip any brackets $author =~ s/\s*\([^\)]*\)\s*//g; # Get rid of any dashes (except for dashes like Hu-Su) $author =~ s/(\W)-/$1/g; # Remove any "the"s, e.g. The OPAL Collaboration $author =~ s/\bthe\s+//ig; # Sort out Jr/Jr. $author =~ s/,?\sJr\.?/_Jr/ig; $author =~ s/[\{\}]//g; # Remove any specialisations $author =~ s/\\.//g; # Remove any escapes # Convert Convert Hawking S. to S.Hawking (already done - zj) # ($author =~ /(\w\w+)\s+([\.\w\s]+\.)$/) && ($author = $2.$1); $author =~ s/\.\s/\./g; # Convert S.W. Hawking to S.W.Hawking $author =~ s/([A-Z])\s/$1\./g; # Convert S W Hawking to S.W.Hawking ($author = lc($author)) && ($author =~ s/\b(\w)/\U$1/g); # Convert STEPHEN_HAWKING to Stephen_Hawking $author =~ s/\s/_/g; # Convert Stephen W.Hawking to Stephen_W.Hawking $author =~ s/\.\.+/\./g; # Remove double dots return $author; } sub normalise_date { my($Text) = @_; # 12-14 Dec. $Text =~ s/[^\w\/][0-3][0-9]?[a-z]*?(\s*\-\s*[0-3][0-9]?[a-z]*?)?\s+ (Jan[\.\s]|January\b|Feb[\.\s]|February\b| Mar[\.\s]|March\b|Apr[\.\s]|April\b|May| Jun[\.\s]|June\b|Jul[\.\s]|July\b|Aug[\.\s]|August\b| Sep[\.\s]|September\b|Oct[\.\s]|October\b| Nov[\.\s]|November\b|Dec[\.\s]|December\b)//xig; # Dec 12-14 $Text =~ s/(Jan[\.\s]|January\b|Feb[\.\s]|February\b| Mar[\.\s]|March\b|Apr[\.\s]|April\b|May| Jun[\.\s]|June\b|Jul[\.\s]|July\b|Aug[\.\s]|August\b| Sep[\.\s]|September\b|Oct[\.\s]|October\b| Nov[\.\s]|November\b|Dec[\.\s]|December\b) [^\w\/]*[0-3][0-9]?[a-z]*?(\s*\-\s*[0-3][0-9]?[a-z]*?)?\b//xig; return $Text; }; sub normalise_journal { my($Text) = @_; $Text =~ s/^in\s+//i; # "in ..." $Text =~ s/^(see )?also\s+//i; # "also ..."; $Text =~ s/\s*:\s*/:/g; $Text =~ s/\s\s/ /g; # single space; $Text =~ s/\.\s/\./g; # "J. Physics" => "J.Physics" $Text =~ s/\.\(/\. \(/g; $Text =~ s/^\W+//; $Text =~ s/[^\w.]+$//; # remove anything in brackets at the end. $Text =~ s/\s*\([^)]+$//; # e.g. R. Ram, J. Phys. (Paris $Text =~ s/^[^(]+\)\s*//; # e.g. a) R. Ram, J. Phys. 10, 120, 1998 # unify cases #($Text = lc($Text)) && ($Text =~ s/\b(\w)/\U$1/g); $Text = uc($Text); return $Text; }; sub num_of_figures { my($Text) = @_; my($N, @Nlist); $N = 0; @Nlist =(); $Text = normalisation($Text); # e.g. "p. 24-26" regarded as one number. # ignore 'N = 2' kind of equations, and # ignore '25th' kinds (e.g. Proc. 25th ICRC). # ignore 'protein Aquaporin-1 in ...' # ignore 'hep-th/9901001' # ignore ' ... 1.55, ...' while ($Text =~ /(?:^|\b)[a-z]*(\d+)([a-z]*) (?:-[a-z]*\d+[a-z]*)* (?:\b|$)/gix) { next if ($2 =~ /^th$/i); next if ($' =~ /^\.\d+/); next if ($` =~ /\d+\.$/); next if ($` =~ /[=<>\/-]\s*$/); push(@Nlist, $1); }; $N = scalar(@Nlist); return $N } sub remove_extra_spc { my($Text) = @_; $Text =~ s/^\s+//; $Text =~ s/\s+$//; $Text =~ s/\s\s+/ /g; return $Text }; 1; __END__ =head1 NAME ParaTools::Citation::Parser::Jiao::Utility - Perl module containing text processing subroutines. =head1 SYNOPSIS use ParaTools::Citation::Parser::Jiao::Utility; =head1 DESCRIPTION This module is called by ParaTools::Citation::Parser::Jiao.pm. =head1 AUTHOR Original Author: Zhuoan Jiao =cut