#line 1 package HTML::TokeParser; # $Id: TokeParser.pm,v 2.35 2005/12/02 16:08:17 gisle Exp $ require HTML::PullParser; @ISA=qw(HTML::PullParser); $VERSION = sprintf("%d.%02d", q$Revision: 2.35 $ =~ /(\d+)\.(\d+)/); use strict; use Carp (); use HTML::Entities qw(decode_entities); use HTML::Tagset (); my %ARGS = ( start => "'S',tagname,attr,attrseq,text", end => "'E',tagname,text", text => "'T',text,is_cdata", process => "'PI',token0,text", comment => "'C',text", declaration => "'D',text", # options that default on unbroken_text => 1, ); sub new { my $class = shift; my %cnf; if (@_ == 1) { my $type = (ref($_[0]) eq "SCALAR") ? "doc" : "file"; %cnf = ($type => $_[0]); } else { %cnf = @_; } my $textify = delete $cnf{textify} || {img => "alt", applet => "alt"}; my $self = $class->SUPER::new(%cnf, %ARGS) || return undef; $self->{textify} = $textify; $self; } sub get_tag { my $self = shift; my $token; while (1) { $token = $self->get_token || return undef; my $type = shift @$token; next unless $type eq "S" || $type eq "E"; substr($token->[0], 0, 0) = "/" if $type eq "E"; return $token unless @_; for (@_) { return $token if $token->[0] eq $_; } } } sub _textify { my($self, $token) = @_; my $tag = $token->[1]; return undef unless exists $self->{textify}{$tag}; my $alt = $self->{textify}{$tag}; my $text; if (ref($alt)) { $text = &$alt(@$token); } else { $text = $token->[2]{$alt || "alt"}; $text = "[\U$tag]" unless defined $text; } return $text; } sub get_text { my $self = shift; my @text; while (my $token = $self->get_token) { my $type = $token->[0]; if ($type eq "T") { my $text = $token->[1]; decode_entities($text) unless $token->[2]; push(@text, $text); } elsif ($type =~ /^[SE]$/) { my $tag = $token->[1]; if ($type eq "S") { if (defined(my $text = _textify($self, $token))) { push(@text, $text); next; } } else { $tag = "/$tag"; } if (!@_ || grep $_ eq $tag, @_) { $self->unget_token($token); last; } push(@text, " ") if $tag eq "br" || !$HTML::Tagset::isPhraseMarkup{$token->[1]}; } } join("", @text); } sub get_trimmed_text { my $self = shift; my $text = $self->get_text(@_); $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g; $text; } sub get_phrase { my $self = shift; my @text; while (my $token = $self->get_token) { my $type = $token->[0]; if ($type eq "T") { my $text = $token->[1]; decode_entities($text) unless $token->[2]; push(@text, $text); } elsif ($type =~ /^[SE]$/) { my $tag = $token->[1]; if ($type eq "S") { if (defined(my $text = _textify($self, $token))) { push(@text, $text); next; } } if (!$HTML::Tagset::isPhraseMarkup{$tag}) { $self->unget_token($token); last; } push(@text, " ") if $tag eq "br"; } } my $text = join("", @text); $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g; $text; } 1; __END__ #line 367