package MARC::Detrans; use strict; use warnings; use Carp qw( croak ); use MARC::Detrans::Config; our $VERSION = '1.41'; =head1 NAME MARC::Detrans - De-transliterate text and MARC records =head1 SYNOPSIS use MARC::Batch; use MARC::Detrans; my $batch = MARC::Batch->new( 'marc.dat' ); my $detrans = MARC::Detrans->new( 'config.xml' ); while ( my $record = $batch->next() ) { my $newRecord = $detrans->convert( $record ); } =head1 DESCRIPTION MARC::Detrans is an eclectic addition to the already eclectic MARC::Record distribution for de-transliterating MARC::Records. What is detransliteration you ask? Well it's the opposite of transliteration, which according to the Merriam-Webster: to represent or spell in the characters of another alphabet Traditionally when librarians catalog an item that has a title in a non-Roman script they will follow transliteration rules for converting the title into the Roman alphabet, so that the bibliographic record could be filed into the card catalog or database index appropriately. These Romanization Rules are published by the Library of Congress http://www.loc.gov/catdir/cpso/roman.html. Now that computer screens can display Unicode fairly well it is now desirable to display the original script for library users who are more familiar with the original script. MARC::Detrans provides a framework for detransliterating MARC records so that the orginal script is available MARC-8 encoded in 880 fields. Very esoteric right? =head1 CONFIGURATION MARC::Detrans behavior is controlled by an XML configuration file. An example of this configuration file can be found in the examples directory of the MARC::Detrans distribution. The configuration determines the detransliteration rules that will be used to add 880 fields to existing records. It is hoped that people will contribute their configurations for various languages to the MARC::Detrans project so that they can be distributed with this package. For more information about the configuration file see L. In addition a sample driver program which uses MARC::Detrans has also been included in the examples directory. This script is meant as a jumping off point showing how to use the MARC::Detrans framework. =head1 METHODS =head2 new() The constructor which you should pass the path to your configuration file. my $detrans = MARC::Detrans->new( config => 'config.xml' ); =cut sub new { my ($class,%args) = @_; croak( "must supply config parameter" ) if ! exists $args{config}; croak( "config file doesn't exist" ) if ! -f $args{config}; my $config = MARC::Detrans::Config->new( $args{config} ); ## verify a few things croak( $args{config} . ": missing code attribute in language element" ) if ! $config->languageCode(); return _init( $class, $config ); } =head2 newFromConfig() If you want to supply your own MARC::Detran::Config object instead of an XML file configuration as in new() you can use newFromConfig(). It's unlikely that you'll ever need to use this method. =cut sub newFromConfig { my ($class,$config) = @_; croak( "must supply MARC::Detrans::Config object" ) if ! ref($config) or ! $config->isa( 'MARC::Detrans::Config' ); return _init( $class, $config ); } ## helper to initialize an object sub _init { my ($class,$config) = @_; return bless { config => $config, errors => [], tallyAdd880 => 0, tallyDetrans => {}, tallyCopy => {}, }, ref($class) || $class; } =head2 convert() Pass a MARC::Record into convert() and you will be returned a the same object with portions of it modified according to your configuration file. IMPORTANT: if the record was not modified or an error was encountered you will be returned undef instead of the MARC::Record object. You will want to use the errors() method for diagnosing what happened. =cut sub convert { my ($self,$record) = @_; croak( "must pass in MARC::Record object" ) if ! ref($record) or ! $record->isa( 'MARC::Record' ); my $config = $self->{config}; ## make sure the script isn't already present if ( $self->scriptAlreadyPresent($record) ) { $self->addError( "target script already present" ); return; } ## check the language of the record my $f008 = $record->field( '008' ); if ( ! $f008 ) { $self->addError( "can't determine language in record: missing 008" ); return; } my $lang = substr( $f008->data(), 35, 3 ); if ( $lang ne $config->languageCode() ) { $self->addError( "record is not correct language: $lang instead of ". $config->languageCode() ); return; } ## add 880 fields and return if the record was edited return $record if $self->add880s( $record ); ## otherwise return undef since the record was not modified return; } ## internal helper for adding 880 fields to a record ## will return 1 if the record is modified and 0 if it isn't sub add880s { my ($self,$r) = @_; my $config = $self->{config}; my $rules = $config->rules(); my $names = $config->names(); my $scriptCode = $config->scriptCode(); my $scriptOrientation = $config->scriptOrientation(); my $count = 0; my $edited = 0; ## see if the record is for a translation ## since we'll need to skip some fields below if it is my $isTranslation = isTranslation( $r ); foreach my $tag ( $config->detransFields() ) { FIELD: foreach my $field ( $r->field($tag) ) { my @newSubfields = (); ## we don't process parallel titles if ( isParallelTitle($field) ) { $self->addError( "field=$tag: skipped parallel title" ); next FIELD; } ## we don't process 1XX and 7XX fields the record ### is for a translation if ( $isTranslation and $tag =~ /(1|7)\d\d/ ) { $self->addError( "field=$tag: skipped because of translation" ); next FIELD; } ## if it's a field that might contain a name look it up ## to see if it has a non-standard detransliteration if ( isNameField($tag) ) { my $nameData = $names->convert( $field ); if ( $nameData ) { $self->{tallyAdd880}++; $count++; add880( $r, $count, $field, $nameData, $scriptCode, $scriptOrientation ); $edited = 1; next FIELD; } } SUBFIELD: foreach my $subfield ( $field->subfields() ) { my ($code,$data) = @$subfield; if ($config->needsDetrans(field=>$tag,subfield=>$code)) { my $new = $rules->convert( $data ); if ( ! defined $new ) { $self->addError( "field=$tag subfield=$code: " . $rules->error() ); next FIELD; } $self->{tallyDetrans}{"$tag-$code"}++; push( @newSubfields, $code, $rules->convert($data) ); } elsif ($config->needsCopy(field=>$tag,subfield=>$code)) { $self->{tallyCopy}{"$tag-$code"}++; push( @newSubfields, $code, $data); } } if ( @newSubfields ) { $self->{tallyAdd880}++; $count++; add880($r, $count, $field, \@newSubfields, $scriptCode, $scriptOrientation ); $edited = 1; } } } if ( $edited ) { $self->add066($r); } return $edited; } sub scriptAlreadyPresent { my ($self,$record ) = @_; my $config = $self->{config}; my $f066 = $record->field( '066' ); return 0 if ! $f066; foreach my $subfield( $f066->subfields() ) { return 1 if grep { $_ eq $subfield->[1] } $config->allEscapeCodes(); } return 0; } sub isNameField { my $tag = shift; return grep /^$tag$/, qw( 100 110 600 700 810 800 ); } sub isParallelTitle { my $field = shift; return if $field->tag() ne 246; return 1 if $field->indicator(2) =~ /1|5/; return 1 if ( $field->subfields() )[0]->[0] eq 'i'; return; } sub isTranslation { my $r = shift; my $f041 = $r->field( '041' ); return if ! $f041; return if ! $f041->subfield( 'h' ); return 1; } ## private helper function to add a single 880 based on the ## tag and indicators of another field sub add880 { my ( $record, $count, $field, $subfields, $scriptCode, $orientation ) = @_; my $tag = $field->tag(); my $occurrence = sprintf( '%02d', $count ); my $sub6 = "$tag-$occurrence"; $sub6 .= "/$scriptCode" if defined $scriptCode; $sub6 .= "/$orientation" if defined $orientation; my $f880 = MARC::Field->new( '880', $field->indicator(1), $field->indicator(2), 6 => $sub6, ## subfield 6 @$subfields ## the reset of the subfields ); $record->insert_grouped_field( $f880 ); ## now add to the original field ## by creating a new field with the subfield 6 ## and replacing the old field with it my @subfields = map { $_->[0], $_->[1] } $field->subfields(); unshift( @subfields, '6' => "880-$occurrence" ); $field->replace_with( MARC::Field->new( $tag, $field->indicator(1), $field->indicator(2), @subfields ) ); } ## private helper function for adding a 066 indicating which ## additional character sets were used in this record sub add066 { my ($self,$record) = @_; my $config = $self->{config}; ## get a list of all the 066 fields used in this mapping ## techically we should probably only list here the ones ## that are *actually* used in this record...but there's ## probably no harm in listing all of the ones used in this ## configuration. my @subfields; foreach ( $config->allEscapeCodes() ) { ## ignore (B next if $_ eq '(B'; push( @subfields, 'c', $_ ); } return if @subfields == 0; ## don't obliterate an 066 that's already present my $f066 = $record->field( '066' ); if ( $f066 ) { unshift( @subfields, map { $_->[0], $_->[1] } $f066->subfields() ); my $new066 = MARC::Field->new( '066', '', '', @subfields ); $f066->replace_with( $new066 ); } else { $f066 = MARC::Field->new( '066', '', '', @subfields ); $record->insert_grouped_field( $f066 ); } } =head2 errors() Will return the latest errors encountered during a call to convert(). Can be useful for determining why a call to convert() returned undef. A side effect of calling errors() is that the errors storage is reset. =cut sub errors { my $self = shift; my @errors = @{ $self->{errors} }; $self->{errors} = []; return @errors; } ## this really should just be used internally...hence no POD sub addError { my ($self,$msg) = @_; push( @{ $self->{errors} }, $msg ); } =head2 stats880sAdded() Returns the total amount of 880 fields added to records so far by this MARC::Detrans object. =cut sub stats880sAdded { my $self = shift; return $self->{tallyAdd880}; } =head2 statsDetransliterated() Returns a hash of stats on the field_subfield combinations that have been detransliterated by a MARC::Detrans object. =cut sub statsDetransliterated { return %{ shift->{tallyDetrans} }; } =head2 statsCopied() Returns a hash of stats on the field_subfield combinations that have been copied by a MARC::Detrans object. =cut sub statsCopied { return %{ shift->{tallyCopy} }; } =head1 AUTHORS MARC::Detrans was developed as part of a project funded by the Queens Borough Public Library in New York City under the direction of Jane Jacobs. It is their generosity that allowed this package to be released on CPAN. =over 4 =item * Ed Summers =back =cut 1;