# # $Id: Tape.pm,v 1.5 2005/08/04 15:00:04 patrick Exp $ # =head1 NAME XML::Tape - module for the manipulation of XMLtape archives =head1 SYNOPSIS use XML::Tape qw(:all); $tape = tapeopen('tape.xml','w'); $tape->add_record("info:archive_id/1", "2005-05-31", $xml_record); $tape->tapeclose(); $tape = tapeopen('tape.xml','r'); while ($record = $tape->get_record()) { printf "id: %s\n" , $record->getIdentifier; printf "date: %s\n" , $record->getDate; printf "xml: %s\n", $record->getRecord; } $tape->tapeclose(); =head1 DESCRIPTION The XMLtape provides a write-once/read-many XML wrapper for a collection of XML documents. The wrapper provides an easy storage format for big collections of XML files which can be processed with off the shelf tools and validated against a schema. The XMLtape is typically used in digital preservation projects. =cut package XML::Tape; use strict; require Exporter; use vars qw($VERSION); ( $VERSION ) = '$Revision: 1.5 $ ' =~ /\$Revision:\s+([^\s]+)/;; @XML::Tape::ISA = qw(Exporter); @XML::Tape::EXPORT_OK = qw(tapeopen); %XML::Tape::EXPORT_TAGS = (all => [qw(tapeopen)]); $XML::Tape::SCHEMA_LOCATION = 'http://purl.lanl.gov/STB-RL/schemas/2005-01/tape.xsd'; =head1 FUNCTIONS =over 4 =item tapeopen($filename, $mode, [, @admin]) Filename is the location of an XMLtape file or an opened IO::Handle. When mode is 'r' this function opens a XMLtape for reading. When mode is 'w' this function creates a new XMLtape on disk. Optionally an array of strings can be provided which contain in XML format metadata about the XMLtape. E.g. tapeopen( "tape.xml", "w" "2005-05-31" ); Returns a XMLtape instance on success or undef on error. =cut sub tapeopen { my ($filename, $mode, @admin) = @_; die "usage: tapeopen(\$filename, \$mode, [\@admin])" unless ($filename && $mode =~ /^r|w$/); if ($mode eq 'w') { return new XML::Tape::Writer($filename,@admin); } else { return new XML::Tape::Reader($filename); } return undef; } package XML::Tape::Writer; use IO::File; sub new { my ($pkg, $filename,@admin) = @_; my $fh; if (ref $filename && $filename->isa('Tie::Handle')) { $fh = $filename; } else { $fh = new IO::File; $fh->open("> $filename") || return undef; } my $obj = bless { fh => $fh , init => 0, recnum => 0 , } , $pkg; $obj->add_admin(@admin) if (@admin > 0); return $obj; } sub init { my ($this) = shift; my $fh = $this->{fh}; die "init: not allowed at this stage" unless $this->{init} == 0; print $fh "\n"; print $fh ""; $this->{init}++; } sub add_admin { my ($this,@admin) = @_; my $fh = $this->{fh}; $this->init() unless ($this->{init}); die "add_admin: not allowed at this stage" unless $this->{recnum} == 0; foreach (@admin) { printf $fh "%s", $_; } } =item $tape->add_record($identifier, $date, $record [, @admin]) Add a XML document to the XMLtape with identifier $identifier, date stamp $date and XML string representation $record. Optionally an array of strings can be provided which contain in XML format metadata about the record. Returns true on success undef on error. =cut sub add_record { my ($this, $identifier, $date, $record, @admin) = @_; my $fh = $this->{fh}; $this->init() unless ($this->{init}); print $fh ""; print $fh ""; print $fh "" , &escape($identifier) , ""; print $fh "" , &escape($date) , ""; foreach my $admin (@admin) { print $fh "" , $admin , ""; } print $fh ""; print $fh "" , $record , ""; print $fh ""; $this->{recnum}++; return 1; } =item $tape->tapeclose Closes the XMLtape. Returns true on success undef on error. =cut sub tapeclose { my ($this) = shift; my $fh = $this->{fh}; $this->init() unless ($this->{init}); print $fh ""; $fh->close; } sub escape { my $str = shift; $str =~ s/&/&/g; $str =~ s//>/g; $str =~ s/'/'/g; $str =~ s/"/"/g; return $str; } package XML::Tape::Reader; use XML::Parser; use IO::File; $XML::Tape::Reader::BUFF_SIZE = 1024; sub new { my ($pkg, $filename,%options) = @_; my $obj = bless {} , $pkg; my $fh; if (ref $filename && $filename->isa('Tie::Handle')) { $fh = $filename; } else { $fh = new IO::File; $fh->open("< $filename") || return undef; } $obj->{fh} = $fh; # XML file handle $obj->{records} = []; # Temporary storage for XML::Tape::Record $obj->{admins} = []; # Temporary storage for XML::Tape::Admin $obj->{curr} = undef; # Current record to be read $obj->{parse_init} = 0; # Flag to indicate if we started reading XML $obj->{parse_done} = 0; # Flag to indicate if we still reading XML $obj->{parser} = undef; # XML::Parser $obj->{parsernb} = undef; # XML::Parser::ExpatNB $obj->{nav} = {}; # Hash to navigate in the XML record return $obj; } =item $tape->get_admin() Reads one XMLtape admin section. Returns an instance of XML::Tape::Admin on success or undef when no more XMLtape admin sections are available. =cut sub get_admin { my ($this) = shift; $this->parse() until ( ( scalar @{$this->{records}} ) || ( $this->{parse_done} ) ); return shift( @{$this->{admins}} ); } =item $tape->get_record() Reads one XMLtape record section. Returns an instance of XML::Tape::Record on success or undef when no more records are available. =cut sub get_record { my ($this) = shift; # Parse the XML until we read a new record or the parse is done... $this->parse() until ( ( scalar @{$this->{records}} ) || ( $this->{parse_done} ) ); return shift( @{$this->{records}} ); } sub tapeclose { my ($this) = shift; $this->{fh}->close; } sub parse_init { my ($this) = shift; $this->{parser} = new XML::Parser( Handlers => { Start => sub { $this->handle_start(@_); }, Char => sub { $this->handle_char(@_); }, Comment => sub { $this->handle_comment(@_); }, Proc => sub { $this->handle_proc(@_); }, CdataStart => sub { $this->handle_cdata_start(@_); }, CdataEnd => sub { $this->handle_cdata_end(@_); }, End => sub { $this->handle_end(@_); }, Final => sub { $this->handle_final(@_); }, }); $this->{parsernb} = $this->{parser}->parse_start(); return undef unless $this->{parsernb}; $this->{parse_init} = 1; return 1; } sub parse { my ($this) = shift; unless ($this->{parse_init}) { $this->parse_init() || return undef; } if (defined $this->{fh}) { my $buffer; # Read a chunk of XML... read($this->{fh}, $buffer, $XML::Tape::Reader::BUFF_SIZE); # If the buffer isn't empty then, parse it # otherwise we reached the end of the file... if (length $buffer) { $this->{parsernb}->parse_more($buffer); } else { $this->{parsernb}->parse_done(); $this->{parse_done} = 1; } } } sub handle_start { my ($this, $xp, $elem, %attr) = @_; if (0) {} elsif ($this->{nav}->{in_record}) { $this->{curr}->addRecordXML($xp->original_string); } elsif ($this->{nav}->{in_record_admin}) { $this->{curr}->addAdminXML($xp->original_string); } elsif ($this->{nav}->{in_tape_admin}) { $this->{curr}->addAdminXML($xp->original_string); } if (0) {} elsif ($xp->depth == 1 && $elem =~ /^(\w+:)?tape-admin$/) { $this->{nav}->{in_tape_admin} = 1; $this->{curr} = XML::Tape::Admin->new(); } elsif ($xp->depth == 1 && $elem =~ /^(\w+:)?tape-record$/) { $this->{curr} = XML::Tape::Record->new(); } elsif ($xp->depth == 2 && $elem =~ /^(\w+:)?tape-record-admin$/) { $this->{nav}->{in_tape_record_admin} = 1; } elsif ($this->{nav}->{in_tape_record_admin} == 1 && $elem =~ /^(\w+:)?identifier$/) { $this->{nav}->{in_record_identifier} = 1; } elsif ($this->{nav}->{in_tape_record_admin} == 1 && $elem =~ /^(\w+:)?date$/) { $this->{nav}->{in_record_date} = 1; } elsif ($this->{nav}->{in_tape_record_admin} == 1 && $elem =~ /^(\w+:)?record-admin$/) { $this->{nav}->{in_record_admin} = 1; $this->{curr}->pushAdmin(); } elsif ($xp->depth == 2 && $elem =~ /^(\w+:)?record$/) { $this->{nav}->{in_record} = 1; $this->{curr}->setStartByte($xp->current_byte + length $xp->original_string); } } sub handle_end { my ($this, $xp, $elem, %attr) = @_; if (0) {} elsif ($xp->depth == 1 && $elem =~ /^(\w+:)?tape-admin$/) { $this->{nav}->{in_tape_admin} = 0; push(@{$this->{admins}}, $this->{curr}); } elsif ($xp->depth == 1 && $elem =~ /^(\w+:)?tape-record$/) { push(@{$this->{records}}, $this->{curr}); } elsif ($xp->depth == 2 && $elem =~ /^(\w+:)?tape-record-admin$/) { $this->{nav}->{in_tape_record_admin} = 0; } elsif ($this->{nav}->{in_tape_record_admin} == 1 && $elem =~ /^(\w+:)?identifier$/) { $this->{nav}->{in_record_identifier} = 0; } elsif ($this->{nav}->{in_tape_record_admin} == 1 && $elem =~ /^(\w+:)?date$/) { $this->{nav}->{in_record_date} = 0; } elsif ($this->{nav}->{in_tape_record_admin} == 1 && $elem =~ /^(\w+:)?record-admin$/) { $this->{nav}->{in_record_admin} = 0; } elsif ($xp->depth == 2 && $elem =~ /^(\w+:)?record$/) { $this->{nav}->{in_record} = 0; $this->{curr}->setEndByte($xp->current_byte); } if (0) {} elsif ($this->{nav}->{in_record}) { $this->{curr}->addRecordXML($xp->original_string); } elsif ($this->{nav}->{in_record_admin}) { $this->{curr}->addAdminXML($xp->original_string); } elsif ($this->{nav}->{in_tape_admin}) { $this->{curr}->addAdminXML($xp->original_string); } } sub handle_char { my ($this, $xp, $data) = @_; if (0) {} elsif ($this->{nav}->{in_tape_admin}) { $this->{curr}->addAdminXML($xp->original_string); } elsif ($this->{nav}->{in_record}) { $this->{curr}->addRecordXML($xp->original_string); } elsif ($this->{nav}->{in_record_identifier}) { $this->{curr}->addIdentifier($data); } elsif ($this->{nav}->{in_record_date}) { $this->{curr}->addDate($data); } } sub handle_comment { my ($this, $xp, $data) = @_; if (0) {} elsif ($this->{nav}->{in_tape_admin}) { $this->{curr}->addAdminXML($xp->original_string); } elsif ($this->{nav}->{in_record}) { $this->{curr}->addRecordXML($xp->original_string); } } sub handle_proc { my ($this, $xp) = @_; if (0) {} elsif ($this->{nav}->{in_tape_admin}) { $this->{curr}->addAdminXML($xp->original_string); } elsif ($this->{nav}->{in_record}) { $this->{curr}->addRecordXML($xp->original_string); } } sub handle_cdata_start { my ($this, $xp) = @_; if (0) {} elsif ($this->{nav}->{in_tape_admin}) { $this->{curr}->addAdminXML($xp->original_string); } elsif ($this->{nav}->{in_record}) { $this->{curr}->addRecordXML($xp->original_string); } } sub handle_cdata_end { my ($this, $xp) = @_; if (0) {} elsif ($this->{nav}->{in_tape_admin}) { $this->{curr}->addAdminXML($xp->original_string); } elsif ($this->{nav}->{in_record}) { $this->{curr}->addRecordXML($xp->original_string); } } sub handle_final { return 1; } =back =head1 XML::Tape::Admin METHODS =over 4 =item $admin->getRecord() Returns a XML string representation of a XMLtape administrative record. =back =cut package XML::Tape::Admin; sub new { my ($pkg) = shift; return bless { adminXML => undef , } , $pkg; } sub addAdminXML { my ($this, $str) = @_; $this->{adminXML} .= $str; } sub getRecord { my ($this) = @_; return $this->{adminXML}; } =head1 XML::Tape::Record METHODS =over 4 =item $record->getIdentifier() Returns the record identifier as string. =item $record->getDate() Returns the record datestamp as string. =item $record->getAdmin() Returns an ARRAY of administrative records =item $record->getRecord() Returns a XML string representation of a XMLtape record. =item $record->getStartByte() Returns the start byte position of the record in the XMLtape =item $record->getEndByte() Returns the end byte positorion of the record in the XMLtape =back =cut package XML::Tape::Record; sub new { my ($pkg) = shift; return bless { startByte => 0 , endByte => 0 , recordXML => undef, identifier => undef, date => undef, admin => [], } , $pkg; } sub setStartByte { my ($this,$num) = @_; $this->{startByte} = $num; } sub getStartByte { my ($this) = @_; return $this->{startByte}; } sub setEndByte { my ($this,$num) = @_; $this->{endByte} = $num; } sub getEndByte { my ($this) = @_; return $this->{endByte}; } sub addRecordXML { my ($this, $str) = @_; $this->{recordXML} .= $str; } sub getRecord { my ($this) = @_; return $this->{recordXML}; } sub addIdentifier { my ($this, $str) = @_; $this->{identifier} .= $str; } sub getIdentifier { my ($this) = @_; return $this->{identifier}; } sub addDate { my ($this, $str) = @_; $this->{date} .= $str; } sub getDate { my ($this) = @_; return $this->{date}; } sub pushAdmin { my ($this) = @_; push(@{$this->{admin}},''); } sub addAdminXML { my ($this, $xml) = @_; my $num = @{$this->{admin}}; $this->{admin}->[$num-1] .= $xml; } sub getAdmin { my ($this) = @_; return $this->{admin}; } 1; =head1 FURTHER INFORMATION 'File-based storage of Digital Objects and constituent datastreams: XMLtapes and Internet Archive ARC files' http://arxiv.org/abs/cs.DL/0503016 'The multi-faceted use of the OAI-PMH in the LANL Repository' http://yar.sourceforge.net/jcdl2004-submitted-draft.pdf =head1 BUGS UTF-8 encoding is mandatory. Doesn't check for UTF-8 encoding. =head1 CREDITS XMLtape archives were developed by the Digital Library Research & Prototyping team at Los Alamos National Laboratory. XML parsing in the module was inspired by Robert Hanson's XML::RAX module. =head1 SEE ALSO L In bin/oaitape you'll find an example of a OAI-PMH interface on XML::Tape =head1 AUTHOR Patrick Hochstenbach =cut 1;