package Biblio::Document::Parser::Brody; ###################################################################### # # Biblio::Document::Parser::Brody; # ###################################################################### # # Reference Parser by Tim Brody # # This file is part of ParaCite Tools (http://paracite.eprints.org/developers/) # # Copyright (c) 2002 University of Southampton, UK. SO17 1BJ. # # ParaTools is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # ParaTools is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with ParaTools; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # ###################################################################### =pod =head1 NAME Biblio::Document::Parser::Brody =head1 DESCRIPTION Module that parses reference strings from a document. Relies on a reference section starting with a title "References", "Bibliography", or "Cited". Seperates references by prefixed number (e.g. "[1]" or "1.") or by year (e.g. "Smith, J (1992)"). =head1 SYNOPSIS use Biblio::Document::Parser::Brody; my $parser = new Biblio::Document::Parser::Brody(); my @refs = $parser->parse(\*FILE_IO); my @refs = $parser->parse($str); =head1 METHODS =cut use strict; use Carp; use vars qw($DEBUG $RE_BOR $RE_EOR $RE_NAME_CHARS $RE_NAME $RE_NAME_LIST_CHARS $MAX_SIZE); # Set up the input/output appropriately #use open IN => ':encoding(latin1)', OUT => ':utf8'; $MAX_SIZE = 1024*2000; # 2MB $RE_BOR = qr/^[^a-z]*(?:references(?:\s+cited)?)|(?:bibliography)[^a-z]*$/i; $RE_EOR = qr/^\s*(?:\d+\.?\s*)*(?:acknowledge?ment)|(?:footnote)|(?:appendix)|(?:abbreviation)|(?:glossary)|(?:figure)[^\n]{0,10}\s*$/i; $RE_NAME_CHARS = qr/[a-zA-Z`'\-]/; $RE_NAME_LIST_CHARS = qr/[a-zA-Z,\.;\(\)\-\s\&'`]/; $RE_NAME = qr/(?:[a-zA-Z`'\-]{4,7}, *(?:[a-zA-Z]\. *)+)/; =pod =over 4 =item $p = Biblio::Document::Parser::Brody->new([-debug=>1]) Constructor method for class. =cut sub new { my ($class,%args) = @_; $DEBUG = $args{-debug}; return bless {}, $class; } =pod =item @refs = $p->parse($str) Parses a string $str and returns a list of unstructured reference strings. =cut sub parse { my $self = shift @_; my $arg = shift @_; my $BIBL = ''; # UNIVERSAL::isa($arg,"IO::Handle") doesn't work? if( ref($arg) ) { read($arg,$BIBL,$MAX_SIZE) or croak "Error reading from file handle: $!\n"; } else { $BIBL = join('',$arg,@_); } croak "No data to parse\n" unless length($BIBL); $BIBL =~ s/\f/\n\n/sg; my %HEADERS; while( $BIBL =~ /(?:\n[\r[:blank:]]*){2}([^\n]{0,40}\w+[^\n]{0,40})(?:\n[\r[:blank:]]*){3}/osg ) { $HEADERS{header_to_regexp($1)}++; } if( %HEADERS ) { my @regexps = sort { $HEADERS{$b} <=> $HEADERS{$a} } keys %HEADERS; my $regexp = $regexps[0]; if( $HEADERS{$regexp} > 3 ) { my $c = $BIBL =~ s/(?:\n[\r[:blank:]]*){2}(?:$regexp)(?:\n[\r[:blank:]]*){3}/\n\n/sg; warn "Applying regexp: $regexp ($HEADERS{$regexp} original matches) Removed $c header/footers using ($HEADERS{$regexp} original matches): $regexp\n" if $DEBUG; } else { warn "Not enough matching header/footers were found\n" if $DEBUG; } } else { warn "No header/footers were found\n" if $DEBUG; } # Kill any bad chars # local *lat2uni = convertor( 'latin1', 'utf8' ); # lat2uni(\$BIBL); # if( $BIBL =~ /$RE_BOR/mi ) { # $BIBL = $'; # } else { # croak "FATAL: Unable to find reference section\n"; # } my @REFS; # Attempt to find the reference section while( !@REFS && ($BIBL =~ /$RE_BOR/mi) && ($BIBL = $') ) { my $c = 0; # Count the number of occurences of [\d] over the next 2k of data or so my $buffer = substr($BIBL, 0, 2048); $c = 0; while($buffer =~ m/^\s*\[\d+\]/mog) { last if ++$c == 5 } if( $c >= 5 ) { warn "Style = numbered square ([1])\n" if $DEBUG; last if (@REFS = &style_numbered_square($BIBL)); } # How about 1. notation # $buffer = substr($BIBL, 0, 2046); $c = 0; while($buffer =~ m/^\s*(\d+)\./mog) { last if ++$c == 5 } if( $c >= 5 ) { warn "Style = numbered (1.)\n" if $DEBUG; # $BIBL =~ s/^\s*(\d+)\./\[$1\]/mg; last if (@REFS = &style_numbered($BIBL)); } # Now we're getting desperate - hopefully its a name list followed by year # $buffer = substr($BIBL, 0, 2048); $c = 0; while($buffer =~ m/^$RE_NAME_LIST_CHARS{10,40}[^\d\-]19|20\d{2}[^\d\-]/mog) { last if ++$c == 5 } if( $c >= 5 ) { warn "Style = years\n" if $DEBUG; last if (@REFS = &style_years($BIBL)); } # if( @REFS ) { # last; # } elsif( $BIBL =~ /$RE_BOR/mi ) { # warn "Skipping section ...\n" if $DEBUG; # $BIBL = $'; # } else { # last; # } } for( my $i = 0; $i < @REFS; $i++ ) { my $ref = $REFS[$i] or next; # $REFS[$i] = "[" . ($i+1) . "] " . unicode_string($ref); $REFS[$i] = "[" . ($i+1) . "] " . $ref; } return grep { defined($_) && length($_) } @REFS; } #my ($BIBL, $buffer); #$BIBL = ''; #my $lc = 0; #die "FATAL: Input has gone beyond $MAX_SIZE byte limit" if read(STDIN,$BIBL,$MAX_SIZE) == $MAX_SIZE; #die "Empty input" unless length($BIBL); #while( read(STDIN,$buffer,4096) ) { # $BIBL .= $buffer; # die "FATAL: Input has gone beyond $MAX_SIZE bytes limit" if length($BIBL) > $MAX_SIZE; #} #while( <> ) { # s/\f/\n\n/sg; # $BIBL = $_ . $BIBL; # die "FATAL: Input has gone beyond $MAX_SIZE bytes limit" if length($BIBL) > $MAX_SIZE; # if( $_ =~ /^(?:\n\s*){3}/ ) { # # Regexp matches for the end of the string are *really* bad performance # # Lines are in reverse order! # if( $BIBL =~ /^(?:\n\s*){3}([^\n]{0,40}\w+[^\n]{0,40})(?:\n\s*){2}/os ) { # $HEADERS{header_to_regexp($1)}++; # } # } #} # Put the lines back in-order #my @lines = split(/\n/,$BIBL); #$BIBL = ''; #for(@lines) { # $BIBL = $_ . "\n" . $BIBL; #} # Read in the document #while( read(STDIN,$buffer,4096) ) { # if( length($BIBL) > $MAX_SIZE ) { # die "FATAL: Input has gone beyond $MAX_SIZE Bytes limit\n"; # } # $BIBL .= $buffer; #} #print "Ref section:\n", $BIBL; # Change to utf8 #use utf8; #### REMAINING FUNCTIONS ARE INTERNAL OR DEPRECATED #### sub end_of_references { my $ref = shift; if( $$ref =~ /${RE_EOR}/im || $$ref =~ /^\s*acknowledgements:/im ) { $$ref = $`; return 1; } if( $$ref =~ /(?:\s*\n){3,}/s ) { $$ref = $`; return 1; } if( length($$ref) > 1024 ) { return 1; } return 0; } sub style_numbered { my @REFS = split(/^\s*(\d+\.)/m, shift); shift @REFS while (@REFS && ($REFS[0] !~ /^\d+\./ || substr($REFS[0],0,-1) != 1)); my $i = 2; while( $i < @REFS ) { if( $REFS[$i] =~ /^\d+\./ ) { my $val = substr($REFS[$i],0,-1); if( $val != ($i/2)+1 ) { $REFS[$i-1] .= splice(@REFS,$i,1); } else { $i+=2; } } else { $REFS[$i-1] .= splice(@REFS,$i,1); } if( end_of_references(\$REFS[$i-1]) ) { splice(@REFS,$i); } } for( my $i = 0; $i < @REFS; $i++ ) { $REFS[$i] .= splice(@REFS,$i+1,1); $REFS[$i] =~ s/\s+/ /sg; $REFS[$i] =~ s/^\s+//; $REFS[$i] =~ s/\s+$//; } @REFS; } sub style_numbered_square { my $BIBL = shift; # Split the bibliography $BIBL =~ /(?=\[\d+\])/; my @REFS = split(/^\s*\[(\d+)\]/m, $') or return (); shift @REFS unless $REFS[0]; # Make sure there is a "value" to go with a reference number # for( my $i = 0; $i < @REFS; $i+=2 ) { # if( $REFS[$i+1] =~ /\[\d+\]/ ) { # splice(@REFS,$i+1,0,''); # } # } # If there is a large reference its probably the end of the bibliography for( my $i = 10; $i < @REFS; $i++ ) { if( length($REFS[$i]) > 1024 ) { splice(@REFS, $i+1); $REFS[$i] = substr($REFS[$i],0,1024) . " RUNAWAY_REFERENCE_DETECTED "; } } # Add any out-of-order chunks to the previous reference value my $last = 0; my $max = 0; for( my $i = 0; $i < @REFS; $i+=2 ) { my $n = $REFS[$i]; # $n =~ s/\D//g; $max = $n if $n > $max; if( $n == $last+1 ) { $last++; next; } else { # Join this out-of-order chunk onto the previous ref. $REFS[$i-1] .= splice(@REFS,$i,2); } } # Remove any trailing garbage splice(@REFS, $last*2, -1); # Presumably there is a gap between the last reference and any trailing junk $REFS[$#REFS] =~ s/(\r?\n){2}.*//s; # Prettify the references for( my $i = 1; $i < @REFS; $i+=2 ) { $REFS[$i] =~ s/[\r\n]+/ /sg; $REFS[$i] =~ s/^\s+//sg; $REFS[$i] =~ s/\s+$//sg; } # Get rid of the numbering for( my $i = 0; $i < @REFS; $i++ ) { # $REFS[$i] = $REFS[$i+1]; splice(@REFS,$i,2,$REFS[$i+1]); } return @REFS; } sub style_years { my $BIBL = shift; $BIBL =~ s/^\s+//sg; # Convert very long lines of spaces into a return $BIBL =~ s/ {70} */\n/sg; my @REFS; # Lets try splitting on a blank line @REFS = split(/((?:\s*\n){2})/, $BIBL); shift @REFS while (@REFS && $REFS[0] !~ /^$RE_NAME_LIST_CHARS+\d{4}\D/); # That didn't work, lets split on left-aligned things (where the next line(s) are blank or indented) if( !@REFS || length($REFS[0]) > 300 ) { @REFS = split(/\n[ ]{0,2}((?:(?:\S$RE_NAME_LIST_CHARS{10,})|$RE_NAME[^\d\-])\d{4}[^\d\-][^\n]+)/, $BIBL); shift @REFS while (@REFS && $REFS[0] !~ /^$RE_NAME_LIST_CHARS{10,}\d{4}\D/s); #return @REFS; for( my $i = 1; $i < @REFS; $i++ ) { if( end_of_references(\$REFS[$i]) ) { splice(@REFS,$i+1); # Indented } elsif( $REFS[$i] =~ /^\s* {5}|\t/m ) { $REFS[$i-1] .= splice(@REFS,$i,1); } } } else { for( my $i = 1; $i < @REFS; $i++ ) { if( end_of_references(\$REFS[$i]) ) { splice(@REFS,$i+1); } } } # If we find what looks like the end of the reference section, discard the trailing rubbish # for( my $i = 0; $i < @REFS; $i++ ) { # if( end_of_references(\$REFS[$i]) ) { # splice(@REFS,$i+1); # } elsif( $BIBL =~ /(\r?\n){3}/s ) { # $REFS[$i] = $`; # splice(@REFS,$i+1); # } # } unless( @REFS ) { warn "Unable to split year-based references\n"; return (); } # Remove heavily indented lines following a blank line for( my $i = 1; $i < @REFS; $i++ ) { if( $REFS[$i-1] !~ /\S/ && $REFS[$i] =~ /^\s{40}/ ) { splice(@REFS,$i,1); $i--; } } # Join refs with the previous reference if they are very short or are quite short and don't start with ...(year) for( my $i = 1; $i < @REFS; $i++ ) { my $l = $REFS[$i]; $l =~ s/\s+//sg; if( (length($l) < 30) || (length($l) < 50 && $REFS[$i] !~ /^$RE_NAME_LIST_CHARS{10,40}[^\d\-](\d{4})[^\d\-]/s) ) { $REFS[$i-1] .= $REFS[$i]; splice(@REFS,$i,1); $i--; } } # If we find 3 sequential references without years near the beginning we probably have trailing garbage my $lc = 0; for( my $i = 10; $i < @REFS; $i++ ) { if( $REFS[$i] =~ /^\D{10,50}19|20\d{2}/s ) { $lc = 0; } else { $lc++; } if( $lc == 3 ) { splice(@REFS,$i-2); } } # Remove lines without any numbers that are quite long (excluding spaces) for( my $i = 0; $i < @REFS; $i++ ) { my $l = $REFS[$i]; $l =~ s/\s+//sg; if( length($l) > 100 && $REFS[$i] !~ /\d/ ) { splice(@REFS,$i,1); } } # Prettify map { $_ =~ s/\s+/ /sg; $_ =~ s/^\s+//; $_ =~ s/\s+$//s; } @REFS; # This doesn't work - names are too icky # Now go back in and split anything that looks like name, x (year) # for( my $i = 0; $i < @REFS; $i++ ) { # my @srefs = grep { $_ =~ /\S/ } split(/((?:[a-zA-Z\-\'\.]+\s*,\s*[a-zA-Z\.]+.{0,7})+\d{4}\b)/, $REFS[$i]); # next unless @srefs > 2; #print "Split reference:\n", # (map { "PART: \"$_\"\n" } @srefs), "\n"; # } #die; return @REFS; } sub header_to_regexp { my $header = shift; $header =~ s/([\\\|\(\)\[\]\.\*\+\?\{\}])/\\$1/g; $header =~ s/\s+/\\s+/g; $header =~ s/\d+/\\d+/g; return $header; return q/(?:\n\s*){3}(/.$header.q/)(?:\n\s*){2}/; } #sub unicode_string { # $_ = shift(); # s/[\x00-\x08\x0b-\x0c\x0e-\x1f]//sg; # s/([\x80-\xff])/sprintf("&#x%04x;",ord($1))/seg; # return $_; #} 1; __END__ =back =head1 AUTHOR Written by Tim Brody