# $Id: ApplyXSLT.pm,v 1.39 2009/03/05 04:25:24 jmates Exp $ # # The author disclaims all copyrights and releases this module into the # public domain. # # Convert XML data with XSLT stylesheet files. # # For more documentation, run perldoc(1) on this module. package XML::ApplyXSLT; use 5.005; use strict; use warnings; use base qw(Exporter); use File::Basename qw(fileparse); use File::Spec (); use XML::LibXML (); use XML::LibXSLT (); our $VERSION = '0.51'; my $suffix_char = '.'; my $suffix_re = qr/(?{default} = $params{default} || {}; # TODO better time to init these? $self->{xmlp} = XML::LibXML->new; $self->{xslp} = XML::LibXSLT->new; $self->rules( $params{rules} ) if exists $params{rules}; return $self; } # alter or return prefs sub config { my $self = shift; my $default = shift || return $self->{default}; $self->{default} = { %{ $self->{default} }, %$default }; return 1; } # interface to configure XML::LibXML sub config_libxml { my $self = shift; my $prefs = shift || return; my %allowed; @allowed{ qw( validation recover expand_entities keep_blanks pedantic_parser line_numbers load_ext_dtd complete_attributes expand_xinclude ) } = (); for my $method ( grep exists $allowed{$_}, keys %$prefs ) { $self->{xmlp}->$method( $prefs->{$method} ); } return 1; } sub config_libxslt { my $self = shift; my $prefs = shift || return; my %allowed; @allowed{ qw( max_depth debug_callback register_function ) } = (); for my $method ( grep exists $allowed{$_}, keys %$prefs ) { $self->{xslp}->$method( $prefs->{$method} ); } return 1; } # TODO fix/improve logging e.g. so can have warnings? # diagnostic message accessors sub errorstring { my $self = shift; return $self->{errorstring}; } sub debugstring { my $self = shift; return $self->{debugstring}; } # parses XML file by filehandle, scalar string, or filename. returns # reference to rendered XML document. sub parse { my $self = shift; $self->{errorstring} = ''; $self->{debugstring} = ''; # Could be a file, filehandle, or string. Figure out what to do. # Using IO::Wrap might be a portability win, though the XML::LibXML # docs say the C library layer can suck in files by name much faster # than Perl. my $what = shift; my %refmap = ( GLOB => 'parse_fh', SCALAR => 'parse_string', '' => 'parse_file' ); my $method = $refmap{ ref $what }; unless ( defined $method ) { $self->{errorstring} = 'no parse method found'; return; } my $doc; eval { $doc = $self->{xmlp}->$method( $method eq 'parse_string' ? $$what : $what ); }; if ($@) { chomp $@; $self->{debugstring} = $@; $self->{errorstring} = 'could not parse XML'; return; } return $doc; } # load or return rules used to determine how to handle a particular XML # document via file data or parsed document data sub rules { my $self = shift; my $fh = shift || return $self->{rules}; # TODO more sanity checking? my ( $line, @rules ); RULE: while ( $line = <$fh> ) { next if $line =~ /^\s*$/; $line =~ s/^\s+//; next if $line =~ /^#/; chomp $line; $line =~ s/\s+$//; # extend backslashed lines to include subseqent lines if ( $line =~ s/ \\ $ //x ) { $line .= <$fh>; redo RULE unless eof; } my @tokens; UBLE: { # non-quoted strings, backslashed quotes and whitespace allowed push( @tokens, $1 ), redo UBLE if $line =~ m/ \G ( [^"'\s]+ ) \s* /cgx; # single or double-quoted strings, backslashed quotes allowed push( @tokens, $2 ), redo UBLE if $line =~ m/ \G (['"]) ((?: \\.|[^\\\1] )+) \1 \s* /cgx; last UBLE if $line =~ / \G $ /gcx; # get here on bogus lines that above miss # # TODO need better error system so can flag warnings and attempt # to move on? $self->{errorstring} = "invalid rule at line $."; return; } next RULE unless @tokens >= 2; # unescape things like "\ " or "\n" @tokens = map { s/(\\.)/qq!"$1"!/eeg; $_ } @tokens; my %rule; $rule{action} = 'continue'; # test subjects may not end in :, while defaults: or params: do if ( $tokens[0] !~ m/:$/ ) { $rule{subject} = shift @tokens; if ( $tokens[0] eq 'not' ) { $rule{negate} = 1; shift @tokens; } next RULE unless @tokens >= 2; $rule{operator} = shift @tokens; $rule{value} = shift @tokens; if ( @tokens and ($tokens[0] eq 'stop' or $tokens[0] eq 'ignore' or $tokens[0] eq 'continue' ) ) { $rule{action} = shift @tokens; } } # deal with defaults: or params: that get set by rules, either # by setting where key=value pairs go, or parsing said pairs if (@tokens) { my $target = 'default'; for my $token (@tokens) { if ( $token =~ m/^(default|param)s?:$/ ) { $target = $1; } else { my ( $k, $v ) = $token =~ m/^ ([\w.-]+) = (.*) $/x; $rule{$target}->{$k} = $v if defined $k; } } } push @rules, \%rule; } $self->{rules} = \@rules; return 1; } sub apply_rules { my $self = shift; my $subject = shift; my %default; my %param; RULE: for my $rule ( @{ $self->{rules} } ) { my $topic = $rule->{subject}; # test free rules can set defaults unless ( defined $topic and exists $subject->{$topic} and defined $subject->{$topic} ) { %default = ( %default, %{ $rule->{default} } ) if exists $rule->{default}; %param = ( %param, %{ $rule->{param} } ) if exists $rule->{param}; next RULE; } my $match = 0; my $consider = ref $subject->{$topic} eq 'ARRAY' ? $subject->{$topic} : [ $subject->{$topic} ]; if ( $rule->{operator} eq 'eq' ) { for my $thingy (@$consider) { if ( $thingy eq $rule->{value} ) { $match = 1; last; } } } elsif ( $rule->{operator} eq 'sub' ) { for my $thingy (@$consider) { if ( -1 < index $thingy, $rule->{value} ) { $match = 1; last; } } } else { warn "error: unknown operator for rule number ...\n"; next RULE; } $match = $match ? 0 : 1 if exists $rule->{negate}; next RULE unless $match; return if $rule->{action} eq 'ignore'; # also set these on rule hits %default = ( %default, %{ $rule->{default} } ) if exists $rule->{default}; %param = ( %param, %{ $rule->{param} } ) if exists $rule->{param}; return \%default, \%param if $rule->{action} eq 'stop'; } # oops, dropped off end of ruleset without being handled # default to "do not handle" in such case return; } # parses out various file information such as the dirname, filename, # file name without suffix, suffix. Used by rules to figure out what to # do with a particular file. sub filedata { my $self = shift; my $filename = shift; my $parent = shift; my %filedata; ( $filedata{filename}, $filedata{dirname}, undef ) = fileparse $filename; $filedata{dirname} = File::Spec->rel2abs( $filedata{dirname} ); # try to determine "subdir" and "parentdir" for possible chroot or URI # based work if ( defined $parent ) { $parent =~ s,/+$,,; my $offset = index $filedata{dirname}, $parent; if ( $offset > -1 ) { $filedata{subdir} = substr $filedata{dirname}, $offset + length $parent; $filedata{parentdir} = $parent; } } my @portions = split /$suffix_re/, $filedata{filename}; $filedata{file} = $portions[0]; if ( @portions > 1 ) { local $" = $suffix_char; $filedata{suffix} = "@portions[1..$#portions]"; } return \%filedata; } # builds up a hash of document data such as DOCTYPE info and the root # element name for determination of what class and style the document # should be classified as. Returns hash references. sub docdata { my $self = shift; my $doc = shift; my %docdata; my $root = $doc->documentElement; if ($root) { $docdata{rootname} = $root->nodeName; } # TODO difference between internal and external here relevant? #my $doctype = $doc->externalSubset; my $doctype = $doc->internalSubset; if ($doctype) { # grr, XML::LibXML has incomplete Dtd handling at present, so have # to parse it manually my ( $ExternalID, $literal, $optional ) = $doctype->toString =~ m/^ \s* query_xpath( $doc, 'processing-instruction()' ); if ($pi_nodes) { my @pi_names; for my $node ( $pi_nodes->get_nodelist() ) { push @pi_names, $node->nodeName(); } $docdata{pi} = \@pi_names if @pi_names; } return \%docdata; } # execute arbitrary XPath against document via findnodes() sub query_xpath { my $self = shift; my $doc = shift; my $query = shift; $self->{errorstring} = ''; $self->{debugstring} = ''; my $results; eval { $results = $doc->findnodes($query); }; if ($@) { chomp $@; $self->{debugstring} = $@; $self->{errorstring} = 'xpath query error'; return; } return $results; } sub study { my $self = shift; my $doc = shift; # TODO replace these with param-from-hash to be more like other methods? my $filename = shift; my $parent = shift; # merge file and XML document metadata for rule tests my $filedata = defined $filename ? $self->filedata( $filename, $parent ) : {}; %$filedata = ( %$filedata, %{ $self->docdata($doc) } ); return $filedata, $self->apply_rules($filedata); } # needs to return style "id" for caching, and then something suitable to # be fed to the parse routine (filename, handle, etc.) sub get_style { my $self = shift; my $default = shift || {}; %$default = ( %{ $self->{default} }, %$default ); unless ( exists $default->{path} and defined $default->{path} ) { $self->{errorstring} = 'no style path set'; return; } my $style_doc = $self->expand( $default->{path}, $default ); return $style_doc, $style_doc; } sub expand { my $self = shift; my $something = shift; my $lookup = shift; my $default = shift; $default = '' unless defined $default; # bleh... my $what = ref $something; if ( $what eq 'HASH' ) { for my $value ( values %$something ) { $value =~ s/ %{ (\w+) } / $lookup->{$1} || $default /egx; } } elsif ( $what eq 'ARRAY' ) { for my $value (@$something) { $value =~ s/ %{ (\w+) } / $lookup->{$1} || $default /egx; } } else { $something =~ s/ %{ (\w+) } / $lookup->{$1} || $default /egx; } return $something; } # translate previously parsed XML document with stylesheet looked up via # get_style method sub transform { my $self = shift; my $doc = shift; my %params = @_; $self->{errorstring} = ''; $self->{debugstring} = ''; my ( $id, $style_doc ) = $self->get_style( $params{default} ); return unless $id and $style_doc; my $stylesheet = $self->{style_cache}->{$id}; # TODO support for refresh when ondisk more recent, and support to # remove from cache if have too many stylesheets in memory? unless ( defined $stylesheet and ref($stylesheet) =~ m/^XML::LibXSLT::Stylesheet/ ) { # geh, what if not a file in the future? TODO move elsewhere unless ( -f $style_doc ) { $self->{errorstring} = "stylesheet not found: file=$style_doc"; return; } my $docref = $self->parse($style_doc); return unless $docref; eval { $stylesheet = $self->{xslp}->parse_stylesheet($docref); }; if ($@) { chomp $@; $self->{debugstring} = $@; $self->{errorstring} = 'could not parse XSLT stylesheet'; return; } unless ( defined $stylesheet and ref($stylesheet) =~ m/^XML::LibXSLT::Stylesheet/ ) { $self->{errorstring} = 'stylesheet not a XML::LibXSLT::Stylesheet'; return; } } my $results; eval { $results = $stylesheet->transform( $doc, keys %{ $params{param} } ? XML::LibXSLT::xpath_to_string( %{ $params{param} } ) : () ); }; if ($@) { chomp $@; $self->{debugstring} = $@; $self->{errorstring} = 'could not transform XML document with stylesheet'; return; } my $rendered = \$stylesheet->output_string($results); my %details; $details{encoding} = $stylesheet->output_encoding; $details{media_type} = $stylesheet->media_type; return wantarray ? ( $rendered, \%details ) : $rendered; } 1; __END__ =head1 NAME XML::ApplyXSLT - convert XML data with XSLT stylesheet files =head1 SYNOPSIS use XML::ApplyXSLT; $xapply = XML::ApplyXSLT->new; # parse an XML document by various means $doc = $xapply->parse( $xml_filename ) || die $xapply->errorstring; $doc = $xapply->parse( \$xml_string ); $doc = $xapply->parse( \*FILEHANDLE ); # set global defaults, such as path, style, and class to lookup # stylesheets from the filesystem $xapply->config({ class => 'test', style => 'default' }); # load rules $xapply->rules( $rules_filehandle ); # determine information about a given document (via rules) ( $filedata, $defaults, $params ) = $xapply->study( $doc, $xml_filename ); # extra code here to mess with defaults, check parameters, etc. # transform the previously parsed XML document via stylesheet found # via path, class, and style lookups ( $docref, $details ) = $xapply->transform( $doc, default => $defaults, param => $params); print $$docref; =head1 DESCRIPTION This module converts XML documents with XSLT files. As different stylesheets could be applied to a particular XML format depending on the context, methods are provided to determine what C and C