## -*- cperl -*- package XML::DT; BEGIN { #XML::LIBXML# use XML::LibXML; #XML::PARSER# use XML::Parser; use Data::Dumper; use LWP::Simple; use XML::DTDParser "ParseDTDFile"; use Exporter (); use vars qw($c %v $q @dtcontext %dtcontextcount @dtatributes @dtattributes ); # Para tirar, parece-me #eval "use bytes"; #if (my $m = $INC{"bytes.pm"}) {require bytes; import bytes;} @ISA=qw(Exporter); @EXPORT=qw(&dt &dtstring &dturl &inctxt &ctxt &mkdtskel &mkdtskel_fromDTD &mkdtdskel &toxml &MMAPON $c %v $q &xmltree &pathdturl @dtcontext %dtcontextcount @dtatributes @dtattributes &pathdt &pathdtstring ); $VERSION = '0.37'; #XML::LIBXML# $PARSER = 'XML::LibXML'; #XML::PARSER# $PARSER = 'XML::Parser'; } =head1 NAME XML::DT - a package for down translation of XML files =head1 SYNOPSIS use XML::DT; %xml=( 'music' => sub{"Music from: $c\n"}, 'lyrics' => sub{"Lyrics from: $v{name}\n"}, 'title' => sub{ uc($c) }, '-default' => sub{"$q:$c"} ); print dt($filename,%xml); =head1 ABSTRACT This module is a XML down processor. It maps tag (element) names to functions to process that element and respective contents. =head1 DESCRIPTION This module processes XML files with an approach similar to OMNIMARK. As XML parser it uses XML::Parser or XML::LibXML module in an independent way. At configure stage, you should choose one of the back-ends. If you use XML::LibXML module as backend, you can parse HTML files as if they were XML files. For this, you must supply an extra option to the hash: %hander = ( -html => 1, ... ); =head1 Functions =head2 dt Down translation function C
receives a filename and a set of expressions (functions) defining the processing and associated values for each element. =head2 dtstring C works in a similar way with C
but takes input from a string instead of a file. =head2 dturl C works in a similar way with C
but takes input from an Internet url instead of a file. =head2 pathdt The C function is a C
function which can handle a subset of XPath on handler keys. Example: %handler = ( "article/title" => sub{ toxml("h1",{},$c) }, "section/title" => sub{ toxml("h2",{},$c) }, "title" => sub{ $c }, "//image[@type='jpg']" => sub{ "JPEG: " }, "//image[@type='bmp']" => sub{ "BMP: sorry, no bitmaps on the web" }, ) pathdt($filename,%handler); Here are some examples of valid XPath expressions under XML::DT: /aaa /aaa/bbb //ccc - ccc somewhere (same as "ccc") /*/aaa/* //* - same as "-default" /aaa[@id] - aaa with an attribute id /*[@*] - root with an attribute /aaa[not(@name)] - aaa with no attribute "name" //bbb[@name='foo'] - ... attribute "name" = "foo" /ccc[normalize-space(@name)='bbb'] //*[name()='bbb'] - complex way of saying "//bbb" //*[starts-with(name(),'aa')] - an element named "aa.*" //*[contains(name(),'c')] - an element ".*c.*" //aaa[string-length(name())=4] "...." //aaa[string-length(name())<4] ".{1,4}" //aaa[string-length(name())>5] ".{5,}" Note that not all XPath is currently handled by XML::DT. A lot of XPath will never be added to XML::DT because is not in accordance with the down translation model. For more documentation about XPath check the specification at http://www.w3c.org or some tutorials under http://www.zvon.org =head2 pathdtstring Like the C function but supporting XPath. =head2 pathdturl Like the C function but supporting XPath. =head2 inctxt C is true if the actual element path matches the provided pattern. This function is meant to be used in the element functions in order to achieve context dependent processing. =head2 ctxt Returns the context element of the currently being processed element. So, if you call C you will get your father element, and so on. =head2 toxml This is the default "-default" function. It can be used to generate XML based on C<$c> C<$q> and C<%v> variables. Example: add a new attribute to element C without changing it: %handler=( ... ele1 => sub { $v{at1} = "v1"; toxml(); }, ) C can also be used with 3 arguments: tag, attributes and contents toxml("a",{href=> "http://local/f.html"}, "example") returns: example =head2 xmltree This simple function just makes a HASH reference: { -c => $c, -q => $q, all_the_other_attributes } The function C understands this structure and makes XML with it. =head2 mkdtskel Used by the mkdtskel script to generate automatically a XML::DT perl script file based on an XML file. Check C manpage for details. =head2 mkdtskel_fromDTD Used by the mkdtskel script to generate automatically a XML::DT perl script file based on an DTD file. Check C manpage for details. =head2 mkdtdskel Used by the mkdtskel script to generate automatically a XML::DT perl script file based on a DTD file. Check C manpage for details. =head1 Accessing parents With XML::DT you can access an element parent (or grand-parent) attributes, till the root of the XML document. If you use c<$dtattributes[1]{foo} = 'bar'> on a processing function, you are defining the attribute C for that element parent. In the same way, you can use C<$dtattributes[2]> to access the grand-parent. C<$dtattributes[-1]> is, as expected, the XML document root element. =head1 User provided element processing functions The user must provide an HASH with a function for each element, that computes element output. Functions can use the element name C<$q>, the element content C<$c> and the attribute values hash C<%v>. All those global variables are defined in C<$CALLER::>. Each time an element is find the associated function is called. Content is calculated by concatenation of element contents strings and interior elements return values. =head2 C<-default> function When a element has no associated function, the function associated with C<-default> called. If no C<-default> function is defined the default function returns a XML like string for the element. When you use C definitions, you often need do set C<-default> function to return just the contents: C. =head2 C<-outputenc> option C<-outputenc> defines the output encoding (default is Unicode UTF8). =head2 C<-inputenc> option C<-inputenc> forces a input encoding type. Whenever that is possible, define the input encoding in the XML file: =head2 C<-pcdata> function C<-pcdata> function is used to define transformation over the contents. Typically this function should look at context (see C function) The default C<-pcdata> function is the identity =head2 C<-begin> function Function to be executed before processing XML file. Example of use: initialization of side-effect variables =head2 C<-end> function Function to be executed after processing XML file. I can use C<$c> content value. The value returned by C<-end> will be the C
return value. Example of use: post-processing of returned contents =head1 Elements with values other than strings (C<-type>) By default all elements return strings, and contents (C<$c>) is the concatenation of the strings returned by the sub-elements. In some situations the XML text contains values that are better processed as a structured type. The following types (functors) are available: =over 4 =item THE_CHILD Return the result of processing the only child of the element. =item LAST_CHILD Returns the result of processing the last child of the element. =item STR concatenates all the sub-elements returned values (DEFAULT) all the sub-element should return strings to be concatenated; =item SEQ makes an ARRAY with all the sub elements contents; attributes are ignored (they should be processed in the sub-element). (returns a ref) If you have different types of sub-elements, you should use SEQH =item SEQH makes an ARRAY of HASH with all the sub elements (returns a ref); for each sub-element: -q => element name -c => contents at1 => at value1 for each attribute =item MAP makes an HASH with the sub elements; keys are the sub-element names, values are their contents. Attributes are ignored. (they should be processed in the sub-element) (returns a ref) =item MULTIMAP makes an HASH of ARRAY; keys are the sub-element names; values are lists of contents; attributes are ignored (they should be processed in the sub-element); (returns a ref) =item MMAPON(element-list) makes an HASH with the sub-elements; keys are the sub-element names, values are their contents; attributes are ignored (they should be processed in the sub-element); for all the elements contained in the element-list, it is created an ARRAY with their contents. (returns a ref) =item XML return a reference to an HASH with: -q => element name -c => contents at1 => at value1 for each attribute =item ZERO don't process the sub-elements; return "" =back When you use C definitions, you often need do set C<-default> function returning just the contents C. =head2 An example: use XML::DT; %handler = ( contacts => sub{ [ split(";",$c)] }, -default => sub{$c}, -type => { institution => 'MAP', degrees => MMAPON('name') tels => 'SEQ' } ); $a = dt ("f.xml", %handler); with the following f.xml U.M. University of Minho 1111 1112 1113 Portugal J.Joao; J.Rocha; J.Ramalho Computer science Informatica history would make $a { 'name' => [ 'Computer science', 'Informatica ', ' history ' ], 'institution' => { 'tels' => [ 1111, 1112, 1113 ], 'name' => 'University of Minho', 'where' => 'Portugal', 'id' => 'U.M.', 'contacts' => [ 'J.Joao', ' J.Rocha', ' J.Ramalho' ] } }; =head1 DT Skeleton generation It is possible to build an initial processor program based on an example To do this use the function C. Example: perl -MXML::DT -e 'mkdtskel "f.xml"' > f.pl =head1 DTD skeleton generation It makes a naive DTD based on an example(s). To do this use the function C. Example: perl -MXML::DT -e 'mkdtdskel "f.xml"' > f.dtd =head1 SEE ALSO mkdtskel(1) and mkdtdskel(1) =head1 AUTHORS Home for XML::DT; http://natura.di.uminho.pt/~jj/perl/XML/ Jose Joao Almeida, Alberto Manuel Simões, thanks to Michel Rodriguez José Carlos Ramalho Mark A. Hillebrand =head1 COPYRIGHT AND LICENSE Copyright 1999-2004 by Projecto Natura This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut %ty = (); sub dt { my ($file,%xml)=@_; # Treat -decl option my $declr = ""; if ($xml{-declr}) { if ($xml{-outputenc}) { $declr = "\n"; } else { $declr = "\n"; } } %ty = (); %ty = (%{$xml{'-type'}}) if defined($xml{'-type'}); $ty{-ROOT} = "NONE"; &{$xml{-begin}} if $xml{-begin}; #XML::LIBXML## TODO --- how to force encoding with XML::LibXML? #XML::LIBXML## $xml{-inputenc} #XML::LIBXML## create a new LibXML parser #XML::LIBXML# my $parser = XML::LibXML->new(); #XML::LIBXML# $parser->validation(0); #XML::LIBXML# $parser->load_ext_dtd(0); #XML::LIBXML## Check if we should expand entities #XML::LIBXML# $parser->expand_entities(1) if defined $xml{'-noexpand'} && $xml{'-noexpand'}; #XML::LIBXML## parse the file #XML::LIBXML# my $doc; #XML::LIBXML# if ( $xml{'-html'}) { #XML::LIBXML# $parser->recover(1); #XML::LIBXML# eval{$doc = $parser->parse_html_file($file);}; #XML::LIBXML## if ($@) {warn("Erro: $@\n"); } #{return undef; } #XML::LIBXML# return undef if !$doc; #XML::LIBXML# } #XML::LIBXML# else{ $doc = $parser->parse_file($file) } #XML::LIBXML## get the document root element #XML::LIBXML# my $tree = $doc->getDocumentElement(); #XML::PARSER## create a new XML::Parser instance using Tree Style #XML::PARSER# if (defined($xml{-inputenc}) && ($xml{-inputenc} eq 'ISO-8859-1')){ #XML::PARSER# $parser = new XML::Parser(Style => 'Tree', #XML::PARSER# ErrorContext => 2 , #XML::PARSER# ProtocolEncoding => 'ISO-8859-1'); #XML::PARSER# } #XML::PARSER# else { $parser = new XML::Parser(Style => 'Tree', #XML::PARSER# ErrorContext => 2 , #XML::PARSER# ); #XML::PARSER# } #XML::PARSER## Convert XML to Perl code #XML::PARSER# $tree = $parser->parsefile($file); my $return = ""; # execute End action if it exists if($xml{-end}) { #XML::LIBXML# $c = _omni("-ROOT", \%xml, $tree); #XML::PARSER# $c = _omni("-ROOT", \%xml, @$tree); $return = &{$xml{-end}} } else { #XML::LIBXML# $return = _omni("-ROOT",\%xml, $tree) #XML::PARSER# $return = _omni("-ROOT",\%xml, @$tree) } if ($declr) { return $declr.$return; } else { return $return; } } sub ctxt { my $level = $_[0]; $dtcontext[-$level-1]; } sub inctxt { my $pattern = shift ; # see if is in root context... return 1 if (($pattern eq "^" && @dtcontext==1) || $pattern eq ".*"); join("/",@dtcontext) =~ m!$pattern/[^/]*$! ; } sub pathdtstring{ my $string = shift; my %h = _pathtodt(@_); return dtstring($string,%h); } sub pathdturl{ my $url = shift; my %h = _pathtodt(@_); return dturl($url,%h); } sub dturl{ my $url = shift; my $contents = get($url); if ($contents) { return dtstring($contents, @_); } else { return undef; } } sub dtstring { my ($string,%xml)=@_; my $declr = ""; if ($xml{-declr}) { if ($xml{-outputenc}) { $declr = "\n"; } else { $declr = "\n"; } } $xml{'-type'} = {} unless defined $xml{'-type'}; %ty = (%{$xml{'-type'}}, -ROOT => "NONE"); # execute Begin action if it exists if ($xml{-begin}){ &{$xml{-begin}} } #XML::LIBXML# $string = XML::LibXML::encodeToUTF8($xml{-inputenc},$string) if ($xml{-inputenc}); #XML::LIBXML## create a new LibXML parser #XML::LIBXML# my $parser = XML::LibXML->new(); #XML::LIBXML## Check if we should expand entities #XML::LIBXML# $parser->expand_entities(0) if defined $xml{'-noexpand'} && $xml{'-noexpand'}; #XML::LIBXML## parse the string #XML::LIBXML# my $doc; #XML::LIBXML# $doc = $parser->parse_string($string) unless( $xml{'-html'}); #XML::LIBXML# if ( $xml{'-html'}) { #XML::LIBXML# $parser->recover(1); #XML::LIBXML# eval{$doc = $parser->parse_html_string($string);}; #XML::LIBXML## if ($@) { return undef; } #XML::LIBXML# return undef unless defined $doc; #XML::LIBXML# } #XML::LIBXML## get the document root element #XML::LIBXML# my $tree = $doc->getDocumentElement(); #XML::PARSER## create a new XML::Parser instance using Tree Style #XML::PARSER# if (defined($xml{-inputenc}) && ($xml{-inputenc} eq 'ISO-8859-1')){ #XML::PARSER# $parser = new XML::Parser(Style => 'Tree', #XML::PARSER# ErrorContext => 2 , #XML::PARSER# ProtocolEncoding => 'ISO-8859-1'); #XML::PARSER# } #XML::PARSER# else { $parser = new XML::Parser(Style => 'Tree', #XML::PARSER# ErrorContext => 2 , #XML::PARSER# ); #XML::PARSER# } #XML::PARSER## Convert XML to Perl code (Tree) #XML::PARSER# $tree = $parser->parse($string); my $return; # Check if we have an end function if ($xml{-end}) { #XML::LIBXML# $c = _omni("-ROOT", \%xml, $tree); #XML::PARSER# $c = _omni("-ROOT", \%xml, @$tree); $return = &{$xml{-end}} } else { #XML::LIBXML# $return = _omni("-ROOT", \%xml, $tree) #XML::PARSER# $return = _omni("-ROOT", \%xml, @$tree) } if ($declr) { return $declr.$return; } else { return $return; } } sub pathdt{ my $file = shift; my %h = _pathtodt(@_); return dt($file,%h); } # Parsing dos predicados do XPath sub _testAttr { my $atr = shift; for ($atr) { s/name\(\)/'$q'/g; # s/\@([A-Za-z_]+)/'$v{$1}'/g; s/\@([A-Za-z_]+)/defined $v{$1}?"'$v{$1}'":"''"/ge; s/\@\*/keys %v?"'1'":"''"/ge; if (/^not\((.*)\)$/) { return ! _testAttr($1); } elsif (/^('|")([^\1]*)(\1)\s*=\s*('|")([^\4]*)\4$/) { return ($2 eq $5); } elsif (/normalize-space\((['"])([^\1)]*)\1\)/) { my ($back,$forward)=($`,$'); my $x = _normalize_space($2); return _testAttr("$back'$x'$forward"); } elsif (/starts-with\((['"])([^\1))]*)\1,(['"])([^\3))]*)\3\)/) { my $x = _starts_with($2,$4); return $x; } elsif (/contains\((['"])([^\1))]*)\1,(['"])([^\3))]*)\3\)/) { my $x = _contains($2,$4); return $x; } elsif (/string-length\((['"])([^\1]*)\1\)/) { my ($back,$forward) = ($`,$'); my $x = length($2); return _testAttr("$back$x$forward"); } elsif (/^(\d+)\s*=(\d+)$/) { return ($1 == $2); } elsif (/^(\d+)\s*<(\d+)$/) { return ($1 < $2); } elsif (/^(\d+)\s*>(\d+)$/) { return ($1 > $2); } elsif (/^(['"])([^\1]*)\1$/) { return $2; } } return 0; #$atr; } # Funcao auxiliar de teste de predicados do XPath sub _starts_with { my ($string,$preffix) = @_; return 0 unless ($string && $preffix); return 1 if ($string =~ m!^$preffix!); return 0; } # Funcao auxiliar de teste de predicados do XPath sub _contains { my ($string,$s) = @_; return 0 unless ($string && $s); return 1 if ($string =~ m!$s!); return 0; } # Funcao auxiliar de teste de predicados do XPath sub _normalize_space { my $z = shift; $z =~ /^\s*(.*?)\s*$/; $z = $1; $z =~ s!\s+! !g; return $z; } sub _pathtodt { my %h = @_; my %aux=(); my %aux2=(); my %n = (); my $z; for $z (keys %h) { # TODO - Make it more generic if ( $z=~m{\w+(\|\w+)+}) { my @tags = split /\|/, $z; for(@tags) { $aux2{$_}=$h{$z} } } elsif ( $z=~m{(//|/|)(.*)/([^\[]*)(?:\[(.*)\])?} ) { my ($first,$second,$third,$fourth) = ($1,$2,$3,$4); if (($first eq "/") && (!$second)) { $first = ""; $second = '.*'; $third =~ s!\*!-default!; } else { $second =~ s!\*!\[^/\]\+!g; $second =~ s!/$!\(/\.\*\)\?!g; $second =~ s!//!\(/\.\*\)\?/!g; $third =~ s!\*!-default!g; } push( @{$aux{$third}} , [$first,$second,$h{$z},$fourth]); } else { $aux2{$z}=$h{$z};} } for $z (keys %aux){ my $code = sub { my $l; for $l (@{$aux{$z}}) { my $prefix = ""; $prefix = "^" unless (($l->[0]) or ($l->[1])); $prefix = "^" if (($l->[0] eq "/") && ($l->[1])); if ($l->[3]) { if(inctxt("$prefix$l->[1]") && _testAttr($l->[3])) {return &{$l->[2]}; } } else { if(inctxt("$prefix$l->[1]")) {return &{$l->[2]};} } } return &{ $aux2{$z}} if $aux2{$z} ; return &{ $h{-default}} if $h{-default}; &toxml(); }; $n{$z} = $code; } for $z (keys %aux2){ $n{$z} ||= $aux2{$z} ; } return %n; } sub _omni{ my ($par, $xml, @l) = @_; my $type = $ty{$par} || "STR"; my %typeargs = (); if (ref($type) eq "mmapon") { for (@$type) { $typeargs{$_} = 1 } $type = "MMAPON"; }; my $r ; if( $type eq 'STR') { $r = "" } elsif( $type eq 'THE_CHILD' or $type eq 'LAST_CHILD') { $r = 0 } elsif( $type eq 'SEQ' or $type eq "ARRAY") { $r = [] } elsif( $type eq 'SEQH' or $type eq "ARRAYOFHASH") { $r = [] } elsif( $type eq 'MAP' or $type eq "HASH") { $r = {} } elsif( $type eq 'MULTIMAP') { $r = {} } elsif( $type eq 'MMAPON' or $type eq "HASHOFARRAY") { $r = {} } elsif( $type eq 'NONE') { $r = "" } elsif( $type eq 'ZERO') { return "" } my ($name, $val, @val, $atr, $aux); while(@l) { #XML::LIBXML# my $tree = shift @l; #XML::LIBXML# if (ref($tree) eq "XML::LibXML::CDATASection") { #XML::LIBXML# $name = "-pcdata"; #XML::LIBXML## $val = $tree->getData(); #XML::LIBXML## print STDERR Dumper($val); #XML::LIBXML## $aux= (defined($xml->{-outputenc}))?_fromUTF8($val,$xml->{-outputenc}):$val; #XML::LIBXML## if (defined($xml->{-pcdata})) { #XML::LIBXML## push(@dtcontext,"-pcdata"); #XML::LIBXML### $c = $aux; #XML::LIBXML#### $aux = &{$xml->{-pcdata}}; #XML::LIBXML## pop(@dtcontext); #XML::LIBXML## } #XML::LIBXML# } else { #XML::LIBXML# $name = $tree->getName(); #XML::LIBXML# } #XML::PARSER# ($name,$val,@l) = @l; #XML::LIBXML# if (ref($tree) eq "XML::LibXML::Comment") { #XML::LIBXML# ### At the moment, treat as Text #XML::LIBXML# ### We will need to change this, I hope! #XML::LIBXML# $val = ""; #XML::LIBXML# $name = "-pcdata"; #XML::LIBXML# $aux= (defined($xml->{-outputenc}))?_fromUTF8($val, $xml->{-outputenc}):$val; #XML::LIBXML# if (defined($xml->{-pcdata})) { #XML::LIBXML# push(@dtcontext,"-pcdata"); #XML::LIBXML# $c = $aux; #XML::LIBXML# $aux = &{$xml->{-pcdata}}; #XML::LIBXML# pop(@dtcontext); #XML::LIBXML# } #XML::LIBXML# } elsif (ref($tree) eq "XML::LibXML::Text" || ref($tree) eq "XML::LibXML::CDATASection") #XML::PARSER# if ($name eq "0") { #XML::LIBXML# $val = $tree->getData(); $name = "-pcdata"; $aux = (defined($xml->{-outputenc}))?_fromUTF8($val,$xml->{-outputenc}):$val; if (defined($xml->{-pcdata})) { push(@dtcontext,"-pcdata"); $c = $aux; $aux = &{$xml->{-pcdata}}; pop(@dtcontext); } } else { #XML::LIBXML# my %atr = _nodeAttributes($tree); #XML::LIBXML# $atr = \%atr; #XML::PARSER# ($atr,@val) = @$val; push(@dtcontext,$name); $dtcontextcount{$name}++; unshift(@dtatributes, $atr); unshift(@dtattributes, $atr); #XML::LIBXML# $aux = _omniele($xml, $name, _omni($name, $xml, ($tree->getChildnodes())), $atr); #XML::PARSER# $aux = _omniele($xml, $name, _omni($name, $xml, @val), $atr); shift(@dtatributes); shift(@dtattributes); pop(@dtcontext); $dtcontextcount{$name}--; } if ($type eq "STR"){ if (defined($aux)) {$r .= $aux} ;} elsif ($type eq "THE_CHILD" or $type eq "LAST_CHILD"){ $r = $aux unless _whitepc($aux, $name); } elsif ($type eq "SEQ" or $type eq "ARRAY"){ push(@$r, $aux) unless _whitepc($aux, $name);} elsif ($type eq "SEQH" or $type eq "ARRAYHASH"){ push(@$r,{"-c" => $aux, "-q" => $name, #XML::LIBXML# _nodeAttributes($tree) #XML::PARSER# %$atr }) unless _whitepc($aux,$name); } elsif($type eq "MMAPON"){ if(not _whitepc($aux,$name)){ if(! $typeargs{$name}) { warn "duplicated tag ´$name´\n" if(defined($r->{$name})); $r->{$name} = $aux } else { push(@{$r->{$name}},$aux) unless _whitepc($aux,$name)}} } elsif($type eq "MAP" or $type eq "HASH"){ if(not _whitepc($aux,$name)){ warn "duplicated tag ´$name´\n" if(defined($r->{$name})); $r->{$name} = $aux }} elsif($type eq "MULTIMAP"){ push(@{$r->{$name}},$aux) unless _whitepc($aux,$name)} elsif($type eq "NONE"){ $r = $aux;} else { $r="undefined type !!!"} } $r; } sub _omniele { my $xml = shift; my $aux; ($q, $c, $aux) = @_; %v = %$aux; if (defined($xml->{-outputenc})) { for (keys %v){ $v{$_} = _fromUTF8($v{$_}, $xml->{-outputenc}) } } if (defined $xml->{$q}) { &{$xml->{$q}} } elsif (defined $xml->{'-default'}) { &{$xml->{'-default'}} } else { toxml() } } sub xmltree { +{'-c' => $c, '-q' => $q, %v} } # está a ser usada pela toxml1, que não é usada :-) # sub toxmlp { # my ($q,$v,$c ) = @_; # if ($q eq "-pcdata") { $c } # else {"<$q" . join("",map {" $_=\"$v->{$_}\""} keys %$v ) . ">$c" } # } sub toxml { my ($q,$v,$c); if (not @_) { ($q,$v,$c) = ($XML::DT::q, \%XML::DT::v, $XML::DT::c); } elsif (ref($_[0])) { $c = shift; } else { ($q,$v,$c) = @_; } if (not ref($c)) { if ($q eq "-pcdata") { return $c } return _openTag($q,$v) . "$c" } elsif (ref($c) eq "HASH" && $c->{'-q'} && $c->{'-c'}) { my %a = %$c; my ($q,$c) = delete @a{"-q","-c"}; toxml($q,\%a,$c); } elsif (ref($c) eq "HASH") { _openTag($q,$v). join("",map {($_ ne "-pcdata") ? ( (ref($c->{$_}) eq "ARRAY") ? "<$_>". join("\n<$_>", @{$c->{$_}}). "\n" : toxml($_,{},$c->{$_})."\n" ) : () } keys %{$c} ) . "$c->{-pcdata}" } ######## "NOTYetREady" elsif (ref($c) eq "ARRAY") { if($ty{$q} eq "SEQH"){toxml($q,$v,join("\n",map {toxml($_)} @$c))} else {toxml($q,\%v, join("",@{$c}))} } } sub _openTag{ "<$_[0]". join("",map {" $_=\"$_[1]{$_}\""} keys %{$_[1]} ).">" } # toxml1 is not being used. # sub toxml1 { # if(@_ == 3){return toxmlp(@_)} # return "" if (defined $ty{$q} && $ty{$q} eq "ZERO"); # if(not ref($c)){ toxmlp($q,\%v,$c)} # elsif (ref($c) eq "ARRAY") { # if($ty{$q} eq "SEQH") { # toxmlp($q,{}, # join("",map {my %a=%$_; # delete @a{"-q","-c"}; # toxmlp($_->{'-q'},\%a,$_->{-c}) } @{$c} )) # } # else { toxmlp($q,\%v, join("",@{$c}))} # } # elsif (ref($c) eq "HASH") { # "<$q". # join("",map {" $_=\"$v{$_}\""} keys %v ) . ">" . # join("",map {($_ ne "-pcdata") # ? ( (ref($c->{$_}) eq "ARRAY") # ? "<$_>". # join("\n<$_>", @{$c->{$_}}). # "\n" # : "<$_>$c->{$_}\n" ) # : () } # keys %{$c} ) . # "$c->{-pcdata}" } # } sub mkdtskel_fromDTD { my $filename = shift; my $file = ParseDTDFile($filename); print <<'PERL'; #!/usr/bin/perl use XML::DT ; my $filename = shift; # Variable Reference # # $c - contents after child processing # $q - element name (tag) # %v - hash of attributes %handler=( # '-outputenc' => 'ISO-8859-1', # '-default' => sub{"<$q>$c"}, PERL for (sort keys %{$file}) { print " '$_' => sub { },"; print " # attributes: ", join(", ", keys %{$file->{$_}{attributes}}) if exists($file->{$_}{attributes}); print "\n"; } print <<'PERL'; ); print dt($filename,%handler); PERL } sub mkdtskel{ my @files = @_; my %mkdtskel = ('-default' => sub{ $element{$q}++; for (keys %v) { $att{$q}{$_} = 1 }; ""}, '-end' => sub{ print <<'END'; #!/usr/bin/perl use XML::DT ; my $filename = shift; # Variable Reference # # $c - contents after child processing # $q - element name (tag) # %v - hash of attributes %handler=( # '-outputenc' => 'ISO-8859-1', # '-default' => sub{"<$q>$c"}, END for $name (sort keys %element) { print " '$name' => sub{ }, #"; print " $element{$name} occurrences;"; print ' attributes: ', join(', ', keys %{$att{$name}}) if $att{$name}; # print " \"\$q:\$c\"\n"; print "\n"; } print <<'END'; ); print dt($filename,%handler); END } ); $file = shift(@files); while($file =~ /^-/){ if ($file eq "-html") { $mkdtskel{'-html'} = 1;} elsif($file eq "-latin1") { $mkdtskel{'-inputenc'}='ISO-8859-1';} else { die("usage mktskel [-html] [-latin1] file \n")} $file=shift(@files)} dt($file,%mkdtskel) } sub _nodeAttributes { my $node = shift; my %answer = (); my @attrs = $node->getAttributes(); for (@attrs) { if (ref($_) eq "XML::LibXML::Namespace") { # TODO: This should not be ignored, I think. # This sould be converted on a standard attribute with # key 'namespace' and respective contents } else { $answer{$_->getName()} = $_->getValue(); } } return %answer; } sub mkdtdskel { my @files = @_; my %handler=( '-outputenc' => 'ISO-8859-1', '-default' => sub{ $elel{$q}++; $root = $q unless ctxt(1); $ele{ctxt(1)}{$q} ++; for(keys(%v)){$att{$q}{$_} ++ } ; }, '-pcdata' => sub{ if ($c =~ /[^ \t\n]/){ $ele{ctxt(1)}{"#PCDATA"}=1 }}, ); while($files[0] =~ /^-/){ if ($files[0] eq "-html") { $handler{'-html'} = 1;} elsif($files[0] eq "-latin1") { $handler{'-inputenc'}='ISO-8859-1';} else { die("usage mkdtdskel [-html] [-latin1] file* \n")} shift(@files)} for $filename (@files){ dt($filename,%handler); } print "\n\n"; delete $elel{$root}; for ($root, keys %elel){ _putele($_); for $name (keys(%{$att{$_}})) { print( "\t\n"); print( "\t\n"); } } } # This is NOT being used. # # sub _process_libxml { # my $node = shift; # my $ref = ref($node); # if ($ref eq "XML::LibXML::Text") { # return [0, $node->toString() ]; # } else { # my $name = $node->getName(); # my %attr = ( map { ($_->getName(), $_->getValue()) } $node->getAttributes() ); # my $x = [ $name, [ { %attr }, map {my $w = _process_libxml($_); @$w } $node->getChildnodes() ]]; # return $x; # } # } sub _putele { my $e = shift; my @f ; if ($ele{$e}) { @f = keys %{$ele{$e}}; print "= 1 && $f[0] eq "#PCDATA" ? "" : "*"), " >\n"; print "\n"; } else { print "\n"; } } sub _whitepc { $_[1] eq '-pcdata' and $_[0] =~ /^[ \t\n]*$/ } sub MMAPON { bless([@_],"mmapon") } # not being used... # # sub SEQOF { # bless([@_],"seqof") # } ####################################### # # # Auxiliary functions for encodings # # # ### STARTLATIN1 ####################### sub _fromUTF8 { my $string = shift; my $encode = shift; my $ans = eval { XML::LibXML::decodeFromUTF8($encode, $string) }; if ($@) { return $string } else { return $ans } } ### ENDLATIN1 1;