package MARC::Detrans::Rules; use strict; use warnings; =head1 NAME MARC::Detrans::Rules - A set of detransliteration rules =head1 SYNOPSIS use MARC::Detrans::Rules; my $rules = MARC::Detrans::Rules->new(); $rules->addRule( MARC::Detrans::Rule->new( from=>'a', to='b' ) ); =head1 DESCRIPTION MARC::Detrans::Rules provides the core functionality for managing detransliteration rules and for converting transliterated text to MARC-8. A MARC::Detrans::Rules object is essentially a collection of MARC::Detrans::Rule objects which are consulted during a call to convert(). =head1 METHODS =cut =head2 new() Create an empty rules object to add individual rules to. =cut sub new { my $class = shift; my $self = { rules => {}, error => undef }; return bless $self, ref( $class ) || $class; } =head2 addRule() Add a MARC::Detrans::Rule to the rules object. =cut sub addRule { my ( $self, $rule ) = @_; ## get first character off the source for lookup ## since we'll be processing a character at a time my $key = substr( $rule->from(), 0, 1 ); ## look for existing rules with this key my $rules = exists($self->{rules}{$key}) ? $self->{rules}{$key} : []; ## and the new rule and sort the rules so that the longest come first. ## this will mean that when we go to use the rules in convert() ## that the longest match will occur first. push( @$rules, $rule ); @$rules = sort byRule @$rules; ## stash away the new rules $self->{rules}{$key} = $rules; } sub byRule { return length( $b->from() . $b->position() ) <=> length( $a->from() . $a->position() ) } =head2 convert() convert() applies the rules contained in the MARC::Detrans::Rules object to convert a string that is passed in. =cut sub convert { my ( $self, $in ) = @_; ## ok, this is probably the most complicated bit of the distro ## and it's not really that bad. my $inLength = length( $in ); my $out = ''; my $pos = 0; my $currentEscape = ''; ## we're going to step through the source string and build up $out ## to contain the de-transliterated text while ( $pos < $inLength ) { ## extract the character at the current position ## and look to see if we have a rule for it my $key = substr( $in, $pos, 1 ); my $rules = exists $self->{rules}{$key} ? $self->{rules}{$key} : []; pos($in) = $pos; my $foundRule; ## go through each of the rules and see if we've got a match foreach my $rule ( @$rules ) { my $from = $rule->from(); ## if the rule matches remember it for later and jump out of ## the loop since we've got what we needed ## \G anchors the match at our current position ## \Q...\E makes sure that metacharacters in our pattern are escaped if ( $in =~ m/\G\Q$from\E/ ) { my $position = $rule->position() || ''; if ( $position eq 'initial' ) { next unless isInitial( $in, $pos ); } elsif ( $position eq 'medial' ) { next if isInitial( $in, $pos ) or isFinal( $in, $pos ); } elsif ( $position eq 'final' ) { next unless isFinal( $in, $pos ); } $foundRule = $rule; last; } } ## no matched rule, then we've got a character in the source ## data which doesn't map. Store the error and return asap. if ( ! defined($foundRule) ) { $self->{error} = sprintf( qq(no matching rule found for "%s" [0x%x] at position %i), $key, ord($key), $pos+1 ); return; } ## advance the position the amount of characters that we matched $pos += length( $foundRule->from() ); ## if the rule has an associated MARC-8 escape character tag it ## onto the output text if ($foundRule->escape() and $foundRule->escape() ne $currentEscape) { $out .= chr(0x1B).$foundRule->escape(); $currentEscape = $foundRule->escape(); } ## append the new text $out .= $foundRule->to(); } ## escape back to ASCII if approriate if ( $currentEscape ) { $out .= chr(0x1B).chr(0x28).chr(0x42); } ## make sure error flag is undef since we're ok now $self->{error} = undef; ## return the new text! return( $out ); } =head2 error() Will return the latest error encountered during a call to convert(). Can be useful for determining why a call to convert() failed. A side effect of calling error() is that the error slot is reset. =cut sub error { my $self = shift; my $error = $self->{error}; $self->{error} = undef; return( $error ); } =head1 AUTHORS =over 4 =item * Ed Summers =cut ## helper functions to determine whether a specific positon in a string ## is at the start or at the end of a word. sub isInitial { my ($string,$position) = @_; return 1 if $position == 0; return 1 if substr($string,$position-1,1) =~ /\W/; return 0; } sub isFinal { my ($string,$position) = @_; return 1 if $position == length($string)-1; return 1 if substr($string,$position+1,1) =~ /\W/; } 1;