package XML::DOM::Lite::Parser; use XML::DOM::Lite::Document; use XML::DOM::Lite::Node; use XML::DOM::Lite::Constants qw(:all); #======================================================================== # These regular expressions have been gratefully borrowed from: # # REX/Perl 1.0 # Robert D. Cameron "REX: XML Shallow Parsing with Regular Expressions", # Technical Report TR 1998-17, School of Computing Science, Simon Fraser # University, November, 1998. # Copyright (c) 1998, Robert D. Cameron. # The following code may be freely used and distributed provided that # this copyright and citation notice remains intact and that modifications # or additions are clearly identified. our $TextSE = "[^<]+"; our $UntilHyphen = "[^-]*-"; our $Until2Hyphens = "$UntilHyphen(?:[^-]$UntilHyphen)*-"; our $CommentCE = "$Until2Hyphens>?"; our $UntilRSBs = "[^\\]]*](?:[^\\]]+])*]+"; our $CDATA_CE = "$UntilRSBs(?:[^\\]>]$UntilRSBs)*>"; our $S = "[ \\n\\t\\r]+"; our $NameStrt = "[A-Za-z_:]|[^\\x00-\\x7F]"; our $NameChar = "[A-Za-z0-9_:.-]|[^\\x00-\\x7F]"; our $Name = "(?:$NameStrt)(?:$NameChar)*"; our $QuoteSE = "\"[^\"]*\"|'[^']*'"; our $DT_IdentSE = "$S$Name(?:$S(?:$Name|$QuoteSE))*"; our $MarkupDeclCE = "(?:[^\\]\"'><]+|$QuoteSE)*>"; our $S1 = "[\\n\\r\\t ]"; our $UntilQMs = "[^?]*\\?+"; our $PI_Tail = "\\?>|$S1$UntilQMs(?:[^>?]$UntilQMs)*>"; our $DT_ItemSE = "<(?:!(?:--$Until2Hyphens>|[^-]$MarkupDeclCE)|\\?$Name(?:$PI_Tail))|%$Name;|$S"; our $DocTypeCE = "$DT_IdentSE(?:$S)?(?:\\[(?:$DT_ItemSE)*](?:$S)?)?>?"; our $DeclCE = "--(?:$CommentCE)?|\\[CDATA\\[(?:$CDATA_CE)?|DOCTYPE(?:$DocTypeCE)?"; our $PI_CE = "$Name(?:$PI_Tail)?"; our $EndTagCE = "$Name(?:$S)?>?"; our $AttValSE = "\"[^<\"]*\"|'[^<']*'"; our $ElemTagCE = "$Name(?:$S$Name(?:$S)?=(?:$S)?(?:$AttValSE))*(?:$S)?/?>?"; our $ElementCE = "/(?:$EndTagCE)?|(?:$ElemTagCE)?"; our $MarkupSPE = "<(?:!(?:$DeclCE)?|\\?(?:$PI_CE)?|(?:$ElementCE)?)"; our $XML_SPE = "$TextSE|$MarkupSPE"; #======================================================================== # these have captures for parsing the attributes our $AttValSE2 = "\"([^<\"]*)\"|'([^<']*)'"; our $ElemTagCE2 = "(?:($Name)(?:$S)?=(?:$S)?($AttValSE2))+(?:$S)?/?>?"; sub new { my ($class, %options) = @_; my $self = bless { stack => [ ], options => \%options, }, $class; return $self; } sub parse { my ($self, $XML) = (shift, shift); unless (ref($self)) { $self = __PACKAGE__->new(@_); } my @nodes = $self->_shallow_parse($XML); $self->{document} = XML::DOM::Lite::Document->new(); push @{$self->{stack}}, $self->{document}; STEP : foreach my $n ( @nodes ) { substr($n, 0, 1) eq '<' && do { substr($n, 1, 1) eq '!' && do { $self->_handle_decl_node($n); next STEP; }; substr($n, 1, 1) eq '?' && do { $self->_handle_pi_node($n); next STEP; }; $self->_handle_element_node($n); next STEP; }; $self->_handle_text_node($n); } return $self->{document}; } sub parseFile { my ($self, $filename) = @_; unless (ref $self) { $self = __PACKAGE__->new; } my $stream; { open FH, $filename or die "can't open file $filename for reading ".$!; local $/ = undef; $stream = ; close FH; } return $self->parse($stream); } sub _shallow_parse { my ($self, $XML) = @_; # Check the options. my %options = %{$self->{options}}; if (defined($options{'whitespace'})) { my $mode = $options{'whitespace'}; if (index($mode, 'strip') >= 0) { $XML =~ s/>$S/>/sg; $XML =~ s/$S= 0) { $XML =~ s/$S/ /sg } } return $XML =~ /$XML_SPE/go; } sub _handle_decl_node { my ($self, $decl) = @_; my $kind; my $length = length($decl); my $start = 1; $parent = $self->{stack}->[$#{$self->{stack}}]; substr($decl, 0, 4) eq '