#!/usr/bin/perl -COE
# Exports records from AlvisCombine SQL database in XML-format
# Takes switches
## $Id: combineExport 301 2008-12-08 13:56:45Z it-aar $

# Copyright (c) 2004-2005 Anders Ardö
#
# See the file LICENCE included in the distribution.
#
# Overview of the script:
#   1. parse the command-line switches and load the job configuration,
#   2. derive the export settings (profile, charset, row limit),
#   3. optionally open an ALVIS pipeline and/or set up incremental export,
#   4. select the records to export from the SQL database,
#   5. emit every record as XML to STDOUT or to the pipeline,
#   6. for incremental exports, remember when this export started.

use strict;
use warnings;

use Combine::MySQLhdb;
use Combine::Config;
use Combine::XWI2XML;
use DBI;
use HTTP::Date;
use Encode;
use Getopt::Long;

# Command-line switches (filled in by GetOptions below).
my $jobname;
my $configfile;
my $baseConfig;
my $dbase;
my $help;
my $profile;
my $charsetopt;
my $include;
my $exclude;
my $nrToExport;
my $recidExp;
my $md5Exp;
my $pipehost;
my $pipeport;
my $incremental;
my $xsltScript;
my $verbose;
my $collapseinlinks;
my $nooutlinks;
my $ZebraIndex;
my $SolrIndex;

# XSLT engine and compiled stylesheet, created on first use by toXML() and
# then reused so that the stylesheet is parsed once, not once per record.
my $xsltEngine;
my $xsltStylesheet;

GetOptions(
    'configfile:s'    => \$configfile,
    'database|sql:s'  => \$dbase,
    'jobname:s'       => \$jobname,
    'baseconfigdir:s' => \$baseConfig,
    'help|?'          => \$help,
    'profile:s'       => \$profile,
    'charset:s'       => \$charsetopt,
    'include:s'       => \$include,
    'exclude:s'       => \$exclude,
    'number:i'        => \$nrToExport,
    'recordid:i'      => \$recidExp,
    'md5:s'           => \$md5Exp,
    'pipehost:s'      => \$pipehost,
    'pipeport:i'      => \$pipeport,
    'incremental'     => \$incremental,
    'xsltscript:s'    => \$xsltScript,
    'verbose'         => \$verbose,
    'collapseinlinks' => \$collapseinlinks,
    'nooutlinks'      => \$nooutlinks,
    'ZebraIndex'      => \$ZebraIndex,
    'SolrIndex'       => \$SolrIndex,
);

if ($help) {
    Getopt::Long::HelpMessage('See man page combineExport');
}

# The job name is mandatory: it selects the configuration to load.
if (defined($jobname)) {
    Combine::Config::Init($jobname, $baseConfig);
}
else {
    Getopt::Long::HelpMessage('No jobname suplied');
}

if (defined($configfile)) {
    warn "Switch 'configfile' not implemented";
    #Config::Init('',$configfile);
}
if (defined($dbase)) {
    warn "Switch 'database' not implemented";
    #ConfigSQL::Init('',$dbase);
}

# From here on $baseConfig is the configured base directory (with a trailing
# slash), not the command-line value.  The variable is reused rather than
# declared a second time.
$baseConfig = Combine::Config::Get('baseConfigDir') . '/';    #'/etc/combine/';

# --ZebraIndex / --SolrIndex force a fixed export configuration.
if (defined($ZebraIndex)) {
    warn "Using default Zebra configuration: profile=combine, nooutlinks, collapseinlinks";
    $profile         = 'combine';
    $collapseinlinks = 1;
    $nooutlinks      = 1;
}
if (defined($SolrIndex)) {
    warn "Using default Solr configuration: profile=combine, nooutlinks, collapseinlinks converted by /etc/combine/solr.xsl";
    $profile         = 'combine';
    $collapseinlinks = 1;
    $nooutlinks      = 1;
}

#configuration processing
my $charset      = 'UTF-8';
my $includeHTML  = 0;
my $canonicalDoc = 0;
my $xmlNS        = '';

#profile - default alvis
if (!defined($profile)) {
    $profile = 'alvis';
}
else {
    $profile = lc($profile);
}

# Note that the 'alvis' and 'dc' profiles replace any --xsltscript value with
# their own stylesheet; only the 'combine' profile keeps a user-supplied one.
if ($profile eq 'alvis') {
    $charset      = 'UTF-8';
    $includeHTML  = 1;
    $canonicalDoc = 1;
    $xmlNS        = 'xmlns="http://alvis.info/enriched/"';
    $xsltScript   = $baseConfig . 'combine2alvis.xsl';
}
elsif ($profile eq 'dc') {
    $charset      = 'UTF-8';
    $includeHTML  = 0;
    $canonicalDoc = 0;
    $xmlNS        = 'xmlns:dc="http://purl.org/dc/elements/1.1/"';
    $xsltScript   = $baseConfig . 'combine2dc.xsl';
}
elsif ($profile eq 'combine') {
    $charset      = 'UTF-8';
    $includeHTML  = 0;
    $canonicalDoc = 0;
    $xmlNS        = '';
}
else {
    Getopt::Long::HelpMessage("Undefined profile: $profile");
}

# --charset overrides the charset chosen by the profile.
if (defined($charsetopt)) {
    if ($charsetopt =~ /utf-8|utf8/i) {
        $charset = 'UTF-8';
    }
    elsif ($charsetopt =~ /iso-latin|isolatin|latin/i) {
        $charset = 'LATIN1';
    }
    else {
        print STDERR "Unkown charset: $charsetopt; using $charset\n";
    }
}

if (defined($include)) {
    #tobedone
}
if (defined($exclude)) {
    #tobedone
}

# $nrToExport was validated as an integer by Getopt::Long (':i'), so it is
# safe to interpolate into the SQL text.
my $limit = '';
if (defined($nrToExport) && $nrToExport > 0) {
    $limit = "LIMIT $nrToExport";
}

#use ALVIS Pipeline?
my $pipe;
my $pipeUpdateLast;

# A restricted export (--number, --recordid, --md5) must not move the
# "last export" marker, otherwise later incremental runs would skip records.
my $restricted = defined($nrToExport) || defined($recidExp) || defined($md5Exp);

if (defined($pipehost) || defined($pipeport)) {
    if (defined($pipehost) && defined($pipeport)) {
        require Alvis::Pipeline;
        $pipe = Alvis::Pipeline::Write->new(
            host     => $pipehost,
            port     => $pipeport,
            loglevel => 10,
        ) or die "can't create ALVIS write-pipe for host '$pipehost', port '$pipeport': $!";
        if ($restricted) {
            warn("NOT updating exports table with last export");
        }
        else {
            $pipeUpdateLast = 1;
        }
        warn("Using ALVIS Pipeline for host '$pipehost', port '$pipeport'");
    }
    else {
        die("You must define both pipehost and pipeport to use ALVIS Pipeline");
    }
}
elsif (defined($incremental)) {
    if ($restricted) {
        warn("NOT updating exports table with last export");
    }
    else {
        # Plain --incremental is tracked in the exports table under the
        # pseudo destination localhost:0.
        $pipeUpdateLast = 1;
        $pipehost       = 'localhost';
        $pipeport       = 0;
    }
}

my $sv = Combine::Config::Get('MySQLhandle');

my $n     = 0;    #counter no of records exported
my $level = 0;    #Used to calculate indentation for pretty printing

# Output layer for STDOUT.  The -CO switch on the #! line already marks
# STDOUT as UTF-8.
if ($charset eq 'UTF-8') {
    # ':UTF-8' (as previously written) is not a PerlIO layer name; ':utf8'
    # is the layer that -CO itself applies.
    binmode STDOUT, ':utf8';
}
elsif ($charset eq 'LATIN1') {
    # Records are encoded to Latin-1 bytes before printing (see the export
    # loop).  They must pass through untouched; the UTF-8 layer would encode
    # every byte above 0x7F a second time.
    binmode STDOUT, ':raw';
}

# Record selection shared by the non-incremental queries.  Values are bound
# through placeholders, and both switches may be combined (joined with AND).
my @conditions;
my @selectArgs;
if (defined($recidExp)) {
    push @conditions, 'recordid=?';
    push @selectArgs, $recidExp;
}
if (defined($md5Exp)) {
    push @conditions, 'md5=?';
    push @selectArgs, $md5Exp;
}
my $where = @conditions ? 'WHERE ' . join(' AND ', @conditions) : '';

# XML wrapper around the exported records.
# NOTE(review): the markup inside these literals (and in the 'deleted'
# record in the export loop) was missing from the text under review and has
# been reconstructed; element and attribute names must be verified against
# the upstream Combine distribution.
my $xmlDeclaration  = "<?xml version=\"1.0\" encoding=\"$charset\"?>\n";
my $collectionOpen  = '<documentCollection version="1.1" ' . $xmlNS . ">\n";
my $collectionClose = "</documentCollection>\n";

my $xmlhead = '';
my $xmlfoot = '';
if (!defined($pipe)) {
    # STDOUT export: one collection wrapping all records.
    print $xmlDeclaration;
    print $collectionOpen;
}
else {
    # Pipeline export: every record is sent as its own complete document.
    $xmlhead = $xmlDeclaration . $collectionOpen;
    $xmlfoot = $collectionClose;
}
#FIX the namespace for non ALVIS formats

my $sth;
my $last;
my $now;
my @queryArgs;
if (defined($pipe) || defined($incremental)) {
    if (defined($pipeUpdateLast)) {
        # Incremental export: everything changed since the previous export
        # to this destination.
        my $lastExport = 'SELECT last,NOW() FROM exports WHERE host=? AND port=?';
        $sth = $sv->prepare($lastExport);
        $sth->execute($pipehost, $pipeport);
        ($last, $now) = $sth->fetchrow_array;
        my $firstTime = '';
        if (!defined($last)) {
            # First export to this destination: register it, and skip
            # records that are already deleted.
            $sv->prepare('INSERT INTO exports SET host=?,port=?')
                ->execute($pipehost, $pipeport);
            $sth = $sv->prepare($lastExport);
            $sth->execute($pipehost, $pipeport);
            ($last, $now) = $sth->fetchrow_array;
            $firstTime = " AND status!='deleted'";
        }
        # print "T: got $last, $now\n";
        $sth = $sv->prepare(
            "SELECT recordid,md5,status FROM oai WHERE date>? AND date<?$firstTime");
        # An undefined $last is bound as '' (not NULL) so the comparison
        # behaves like the former interpolated date>'' form.
        @queryArgs = (defined($last) ? $last : '', $now);
    }
    else {
        $sth       = $sv->prepare("SELECT recordid,md5,status FROM oai $where $limit");
        @queryArgs = @selectArgs;
    }
}
else {
    $sth = $sv->prepare(
        "SELECT distinct(recordid),md5,'created' FROM recordurl $where $limit");
    @queryArgs = @selectArgs;
}
$sth->execute(@queryArgs);

# Export loop: one iteration per selected record.
while (my ($recordid, $recordmd5, $status) = $sth->fetchrow_array) {
    $n++;
    $level = 1;
    my $rec = '';
    if (defined($verbose)) {
        print STDERR "RID=$recordid; Status=$status; No=$n\n";
    }
    if ($status eq 'deleted') {
        # NOTE(review): reconstructed literal, see the note on the XML
        # wrapper above.
        $rec = "<documentRecord id=\"$recordmd5\" action=\"delete\"/>\n";
    }
    elsif (($ZebraIndex) && (my $zh = Combine::Config::Get('ZebraHost'))) {
        # Send the record straight to the Zebra server; $rec stays empty.
        require Combine::Zebra;
        my $xwi = Combine::MySQLhdb::Get($recordid);
        Combine::Zebra::update($zh, $xwi);
    }
    elsif (($SolrIndex) && (my $sh = Combine::Config::Get('SolrHost'))) {
        # Send the record straight to the Solr server; $rec stays empty.
        require Combine::Solr;
        my $xwi = Combine::MySQLhdb::Get($recordid);
        Combine::Solr::update($sh, $xwi);
    }
    elsif ($charset eq 'UTF-8') {
        $rec = toXML($recordid, $includeHTML, $canonicalDoc);
    }
    elsif ($charset eq 'LATIN1') {
        $rec = Encode::encode('latin1', toXML($recordid, $includeHTML, $canonicalDoc));
    }

    if (defined($pipe)) {
        $pipe->write($xmlhead . $rec . $xmlfoot) || warn('Alvis pipeline write failed');
    }
    else {
        print "$rec\n";
    }
}

if (!defined($pipe)) {
    print $collectionClose;
}

if (defined($pipeUpdateLast)) {
    # $now holds time when incremental export started
    $sv->prepare('UPDATE exports SET last=? WHERE host=? AND port=?')
        ->execute($now, $pipehost, $pipeport);
}

##################SUBS#####################

# toXML($recordid, $iHTML, $cDoc)
#
# Fetches the record $recordid from the database and returns it serialised
# as XML.  $iHTML and $cDoc are passed through to
# Combine::XWI2XML::XWI2XML together with the file-level $collapseinlinks
# and $nooutlinks switches.  When $xsltScript is defined the XML is
# additionally transformed with that stylesheet.  The XML and XSLT
# libraries are loaded, and the stylesheet compiled, on the first call that
# needs them and reused afterwards.
sub toXML {
    my ($recordid, $iHTML, $cDoc) = @_;

    my $xwi = Combine::MySQLhdb::Get($recordid);
    my $xml = Combine::XWI2XML::XWI2XML($xwi, $iHTML, $cDoc, $collapseinlinks, $nooutlinks);
    return $xml unless defined($xsltScript);

    #apply XSLT transformation
    require XML::LibXSLT;
    require XML::LibXML;
    my $parser = XML::LibXML->new();
    if (!defined($xsltStylesheet)) {
        $xsltEngine = XML::LibXSLT->new();
        my $style_doc = $parser->parse_file($xsltScript);
        $xsltStylesheet = $xsltEngine->parse_stylesheet($style_doc);
    }
    my $source  = $parser->parse_string($xml);
    my $results = $xsltStylesheet->transform($source);
    return $xsltStylesheet->output_string($results);
}

__END__

=head1 NAME

combineExport - export records in XML from Combine database

=head1 SYNOPSIS

combineExport --jobname NAME [--profile alvis|dc|combine --charset utf8|isolatin --number N --recordid ID --md5 MD5 --incremental --xsltscript FILE ...]

=head1 OPTIONS AND ARGUMENTS

jobname is used to find the appropriate configuration (mandatory)

=over 9

=item --profile

Three profiles: alvis, dc, and combine. alvis and combine are similar
XML formats.

'alvis' profile format is defined by the Alvis enriched document format
DTD. It uses charset UTF-8 per default.

'combine' is more compact with less redundancy.

'dc' is XML encoded Dublin Core data.

=item --charset

Selects a specific character set from UTF-8, iso-latin-1.
Overrides --profile settings.

=item --collapseinlinks

Skip inlinks with duplicate anchor-texts (i.e. just one inlink per unique
anchor-text).

=item --nooutlinks

Do not include any outlinks in the exported records.

=item --ZebraIndex

ZebraIndex sends XML records directly to the Zebra server defined in
Combine configuration variable 'ZebraHost'. It uses the default Zebra
configuration: profile=combine, nooutlinks, collapseinlinks and is
compatible with the direct Zebra indexing done during harvesting when
'ZebraHost' is defined in the Combine configuration. Requires that the
Zebra server is running.

=item --SolrIndex

SolrIndex sends XML records directly to the Solr server defined in
Combine configuration variable 'SolrHost'. It uses the default Solr
configuration: profile=combine, nooutlinks, collapseinlinks and is
compatible with the direct Solr indexing done during harvesting when
'SolrHost' is defined in the Combine configuration. Requires that the
Solr server is running.

=item --xsltscript

Generates records in Combine native format and converts them using this
XSLT script before output. See example scripts in /etc/combine/*.xsl

=item --number

The maximum number of records to be exported.

=item --recordid

Export just the one record with this recordid.

=item --md5

Export just the one record with this MD5 checksum.

=item --pipehost, --pipeport

Specifies the server-name and port to connect to and export data using
the Alvis Pipeline. Exports incrementally, i.e. all changes since the last
call to combineExport with the same pipehost and pipeport.

=item --incremental

Exports incrementally, i.e. all changes since the last call to
combineExport using --incremental.

=back

=head1 DESCRIPTION

=head1 EXAMPLES

Export all records in Alvis XML-format to the file recs.xml

   combineExport --jobname atest > recs.xml

Export 10 records to STDOUT

   combineExport --jobname atest --number 10

Export all records in UTF-8 using Combine native format

   combineExport --jobname atest --profile combine --charset utf8 > Zebrarecs.xml

Incremental export of all changes from last call using localhost at port
6234 using the default profile (Alvis)

   combineExport --jobname atest --pipehost localhost --pipeport 6234

=head1 SEE ALSO

Combine configuration documentation.

Alvis XML schema (--profile alvis).

=head1 AUTHOR

Anders Ardö, E<lt>anders.ardo@it.lth.seE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2005 - 2006 Anders Ardö

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.4 or,
at your option, any later version of Perl 5 you may have available.

See the file LICENCE included in the distribution.

=cut