# # $Id: Index.pm,v 1.5 2005/09/01 08:19:27 patrick Exp $ # =head1 NAME XML::Tape::Index - a XMLtape indexer =head1 SYNOPSIS use XML::Tape::Index qw(:all); unless (indexexists('ex/tape.xml')) { $x = indexopen('ex/tape.xml', 'w'); $x->reindex; $x->indexclose(); } $x = indexopen('ex/tape.xml', 'r'); for (my $rec = $x->list_identifiers(); defined($rec); $rec = $x->list_identifiers($rec->{token})) { print "id : %s\n" , $rec->{identifier}; print "date : %s\n" , $rec->{date}; print "start : %s\n" , $rec->{start}; print "length : %s\n" , $rec->{len}; } my $rec = $x->get_identifier('oai:arXiv.org:hep-th:0208183'); my $xml = $x->get_record('oai:arXiv.org:hep-th:0208183'); =head1 DESCRIPTION This modules creates an index on XMLtapes to enable fast retrieval of XML documents from the archive. The index files are stored next to the XMLtape. =cut package XML::Tape::Index; use strict; use DB_File; use XML::Tape; use Digest::MD5 qw(md5); require Exporter; use vars qw($VERSION); ( $VERSION ) = '$Revision: 1.5 $ ' =~ /\$Revision:\s+([^\s]+)/;; @XML::Tape::Index::ISA = qw(Exporter); @XML::Tape::Index::EXPORT_OK = qw(indexopen indexexists indexdrop); %XML::Tape::Index::EXPORT_TAGS = (all => [qw(indexopen indexexists indexdrop)]); $XML::Tape::Index::VERBOSE = 0; $XML::Tape::Index::CACHE_SIZE = 4 * 1024 * 1024; sub _get_index { my ($filename) = @_; return { adm_index_file => "$filename.adm" , rec_index_file => "$filename.rec" , dat_index_file => "$filename.dat" , } } =head1 METHODS =over 4 =item $x = indexopen($tape_file, $flag) This function opens an index for reading or writing. The parameter tape_file is the location of a XMLtape archive. The flag is "w" when creating a new index or "r" when reading an index. An XML::Tape::Index instance will be returned on success or undef on failure. =cut sub indexopen { my ($tape_file, $flag, $mode) = @_; my (%admh,$admh); my (%rech,$rech); my (%idsh,$idsh); my (%dath,$dath); $mode = 0644 unless $mode; my $files = &_get_index($tape_file); my $this = bless {} , 'XML::Tape::Index'; $this->{mode} = $flag; if ($flag eq 'w') { $flag = O_CREAT | O_RDWR; } elsif ($flag eq 'r') { $flag = O_RDONLY; } else { die "usage: indexopen(\$tape_file, 'r' | 'w')"; } my $f_hash = new DB_File::HASHINFO; $f_hash->{cachesize} = $XML::Tape::Index::CACHE_SIZE; my $f_btree = new DB_File::BTREEINFO; $f_btree->{cachesize} = $XML::Tape::Index::CACHE_SIZE; $f_btree->{flags} = R_DUP; $admh = tie %admh, 'DB_File' , $files->{adm_index_file} , $flag, $mode, $f_hash || die "can't tie " . $files->{adm_index_file} . ": $!"; $rech = tie %rech, 'DB_File' , $files->{rec_index_file} , $flag, $mode, $f_hash || die "can't tie " . $files->{rec_index_file} . ": $!"; $dath = tie %dath, 'DB_File' , $files->{dat_index_file} , $flag, $mode, $f_btree || die "can't tie " . $files->{dat_index_file} . ": $!"; $this->{tape_file} = $tape_file; $this->{admh} = $admh; $this->{rech} = $rech; $this->{dath} = $dath; $this->{t_admh} = \%admh; $this->{t_rech} = \%rech; $this->{t_dath} = \%dath; return $this; } =item $x->reindex() This method reads the XMLtape extracts all identifier and datestamps from it and stores the byte positions of all records in the index. =cut sub reindex { my ($this) = @_; die "reindex: only allowed in 'w' mode" unless ($this->{mode} eq 'w'); my $num_of_rec = 0; my $tape = XML::Tape::tapeopen($this->{tape_file}, 'r') || return undef; my $_start = time(); my $earliest_datestamp = undef; while (my $record = $tape->get_record()) { $num_of_rec++; my $id = $record->getIdentifier(); my $date = $record->getDate(); my $start = $record->getStartByte(); my $length = $record->getEndByte() - $start; my $value = join("\t", $id, $date, $start, $length); my $key = md5($id); $this->{rech}->put($key,$value); $this->{dath}->put($date,$key); if ($XML::Tape::Index::VERBOSE && $num_of_rec % 10000 == 0) { my $speed = int($num_of_rec/(time - $_start + 1)); print "record: $num_of_rec ($speed r/s) read: " . $record->getEndByte() . " bytes\n"; } my $comp_date = $date; $comp_date =~ s/\D+//g; if ( ! defined $earliest_datestamp || $earliest_datestamp->{val} > $comp_date ) { $earliest_datestamp->{val} = $comp_date; $earliest_datestamp->{str} = $date; } } $tape->tapeclose(); $this->{admh}->put('tapefile', $this->{tape_file}); $this->{admh}->put('recnum', $num_of_rec); $this->{admh}->put('earliest', $earliest_datestamp->{str}); return $num_of_rec; } =item $x->list_identifiers([$token]) =item $x->list_identifiers($from,$until) Use this method to iterate through the index to return all records. This method returns an index record on success or undef when no more records are available. Each index record is a HASH reference containing the fields 'identifier', 'date', 'start' (the starting byte of the XML document in the XMLtape), 'len' (the length of the XML document in the XMLtape) and 'token'. The 'token' field should be used to return the next index record. One can filter the returned indexed records by using two arguments at the first list_identifiers method invocation. Only index records with dates greater or equal than 'from' and less than 'until' will be returned by subsequent list_identifier requests. E.g. # Return all index records... for (my $r = $x->list_identifiers(); defined($r); $r = $x->list_identifiers($r->{token}) { } # Return all index records with dates between 2000-01-01 and 2005-12-31... for (my $r = $x->list_identifiers( '2001-01-01T00:00:00Z', '2005-12-31T23:59:59Z' ); defined($r); $r = $x->list_identifiers($r->{token}) { } =cut sub list_identifiers { my ($this) = shift; my ($from,$until,$md5); die "list_identifiers: only allowed in 'r' mode" unless ($this->{mode} eq 'r'); # If we have two arguments we need to filter on 'from' and 'until' date... if (@_ == 2) { ($from,$until) = @_; $this->{'from'} = $from; $this->{'until'} = $until; } # If we have one argument than it is a resumption token... elsif (@_ == 1) { ($from,$md5) = split(/,/,shift,2); $md5 = pack("H*",$md5); $until = $this->{'until'}; } # Else, we need to return all entries.. else { $from = $until = undef; $this->{'from'} = $from; $this->{'until'} = $until; } my $status; if ($md5) { $status = $this->{dath}->find_dup($from, $md5); $status = $this->{dath}->seq($from, $md5, R_NEXT) if ($status == 0); } elsif ($from) { $status = $this->{dath}->seq($from, $md5, R_CURSOR); } else { $status = $this->{dath}->seq($from, $md5, R_FIRST); } return undef unless ($status == 0); return undef if (defined $until && ($from cmp $until) >= 0); my $values; $status = $this->{rech}->get($md5,$values); return undef unless ($status == 0); my (@field) = split(/\t/,$values); return { 'identifier' => $field[0] , 'date' => $field[1] , 'start' => $field[2] , 'length' => $field[3] , 'token' => $field[1] . "," . unpack("H*",$md5) }; } =item $x->get_earlist_date() This methods returns earliest date in the index file =cut sub get_earliest_date { my ($this, $id) = @_; my $values; $this->{admh}->get('earliest',$values); return $values; } =item $x->get_tape_file() This methods returns name of the tape file associated with this index. =cut sub get_tape_file { my ($this, $id) = @_; my $values; $this->{admh}->get('tapefile',$values); return $values; } =item $x->get_num_of_records() This methods returns the number of record in an index. =cut sub get_num_of_records { my ($this, $id) = @_; my $values; $this->{admh}->get('recnum',$values); return $values; } =item $x->get_identifier($identifier) This method returns an index record given an identifier as argument. When no matching index record can be found undef will be returned. The index record is a HASH reference containing the fields 'identifier', 'date', 'start' and 'len' (see above). =cut sub get_identifier { my ($this, $id) = @_; my $md5 = md5($id); my $values; die "get_identifier: only allowed in 'r' mode" unless ($this->{mode} eq 'r'); $this->{rech}->get($md5,$values); return undef unless $values; my (@field) = split(/\t/,$values); return { 'identifier' => $field[0] , 'date' => $field[1] , 'start' => $field[2] , 'len' => $field[3] , }; } =item $x->get_record($identifier) This method returns an XML document from the XMLtape given an identifier as argument. When no matching record can be found undef will be returned. =cut sub get_record { my ($this, $id) = @_; die "get_record: only allowed in 'r' mode" unless ($this->{mode} eq 'r'); local(*F); my $rec = $this->get_identifier($id); return undef unless $rec; my $xml; if ($rec->{start} && $rec->{len}) { open(F, $this->{tape_file}) || return undef; seek(F, $rec->{start}, 0); read(F, $xml, $rec->{len}); close(F); } return $xml; } =item $x->indexclose(); Closes the XMLtape index. =cut sub indexclose { my ($this) = @_; $this->{admh} = undef; $this->{rech} = undef; $this->{dath} = undef; untie %{$this->{t_admh}}; untie %{$this->{t_rech}}; untie %{$this->{t_dath}}; } =item indexexists($tape_file) This class method returns true when an index on the XMLtape with location $tape_file exists, returns false otherwise. =cut sub indexexists { my ($filename) = @_; my $files = &_get_index($filename); return (-e $files->{adm_index_file} && -e $files->{rec_index_file} && -e $files->{dat_index_file}); } =item indexdrop($tape_file) This class method deletes the index associated with the XMLtape with location $tape_file. =cut sub indexdrop { my ($filename) = @_; my $files = &_get_index($filename); unlink $files->{adm_index_file}; unlink $files->{rec_index_file}; unlink $files->{dat_index_file}; } =head1 BUGS The XML::Tape::Index doesn't lock XMLtape before writing. It is possible to overwrite and index while another process is reading it. =head1 CREDITS XMLtape archives were developed by the Digital Library Research & Prototyping team at Los Alamos National Laboratory. =head1 SEE ALSO L =head1 AUTHOR Patrick Hochstenbach =cut 1;