package HTML::LinkFilter; use strict; use warnings; use HTML::Parser; our $VERSION = "0.03"; ## The html tags which might have URLs # the master list of tagolas and required attributes (to constitute a link) our %TAGS = ( # Copied from HTML::LinkExtractor 0.13 a => [qw( href )], applet => [qw( archive code codebase src )], area => [qw( href )], base => [qw( href )], bgsound => [qw( src )], blockquote => [qw( cite )], body => [qw( background )], del => [qw( cite )], div => [qw( src )], # IE likes it, but don't know where it's documented embed => [qw( pluginspage pluginurl src )], form => [qw( action )], frame => [qw( src longdesc )], iframe => [qw( src )], ilayer => [qw( background src )], img => [qw( dynsrc longdesc lowsrc src usemap )], input => [qw( dynsrc lowsrc src )], ins => [qw( cite )], isindex => [qw( action )], # real oddball layer => [qw( src )], link => [qw( src href )], object => [qw( archive classid code codebase data usemap )], q => [qw( cite )], script => [qw( src )], # HTML::Tagset has 'for' ~ it's WRONG! sound => [qw( src )], table => [qw( background )], td => [qw( background )], th => [qw( background )], tr => [qw( background )], ## the exotic cases meta => undef, '!doctype' => [qw( url )], # is really a process instruction ); ### HTML::Parser method, not for __PACKAGE__. my $default_h_sub = sub { my( $self, $tagname, $original ) = @_; push @{ $self->{link_filter}{tags} }, $original; return; }; ### HTML::Parser method, not for __PACKAGE__. my $start_h_sub = sub { my( $self, $tagname, $attr_ref, $original ) = @_; unless ( exists $TAGS{ $tagname } ) { push @{ $self->{link_filter}{tags} }, $original and return; } unless ( grep { my $name = $_; grep { $_ eq $name } @{ $TAGS{ $tagname } } } keys %{ $attr_ref } ) { push @{ $self->{link_filter}{tags} }, $original and return; } unless ( $self->{link_filter}{cb} ) { push @{ $self->{link_filter}{tags} }, $original and return; } foreach my $attr ( keys %{ $attr_ref } ) { next unless grep { $_ eq $attr } @{ $TAGS{ $tagname } }; my $new = $self->{link_filter}{cb}->( $tagname, $attr, $attr_ref->{ $attr }, $attr_ref, ); $attr_ref->{ $attr } = $new if defined $new; } my $tag = do { my $build = q{}; my $is_xhtml = grep { $_ eq q{/} } keys %{ $attr_ref }; my $attr = join q{ }, map { join q{=}, $_, join q{}, q{"}, $attr_ref->{ $_ }, q{"}, } grep { $_ ne q{/} } sort keys %{ $attr_ref }; if ( $attr && $is_xhtml ) { $build = "<$tagname $attr />"; } elsif ( $attr && ! $is_xhtml ) { $build = "<$tagname $attr>"; } elsif ( ! $attr && $is_xhtml ) { $build = "<$tagname />"; } else { $build = "<$tagname>"; } if ( chomp $original ) { $build .= "\n"; } $build; }; push @{ $self->{link_filter}{tags} }, $tag; return $self; }; sub new { my $class = shift; my %param = @_; my $self = bless \%param, $class; my $p = HTML::Parser->new( api_version => 3, start_h => [ $start_h_sub, "self, tagname, attr, text", ], default_h => [ $default_h_sub, "self, tagname, text", ], ); $p->{link_filter} = $self; $self->{p} = $p; $self->_init_tags; return $self; } sub change { my $self = shift; my( $html, $callback_sub ) = @_; $self->_init_tags; $self->{cb} = $callback_sub; $self->{p}->parse( $html ); $self->{p}->eof; return $self; } sub _init_tags { my $self = shift; $self->{tags} = [ ]; return $self; } sub tags { return shift->{tags}; } sub html { my $self = shift; return join q{}, @{ $self->tags }; } 1; __END__ =head1 NAME HTML::LinkFilter - Changes all links in HTML =head1 SYNOPSIS use HTML::LinkFilter; use Data::Dumper; my $html = do { local $/; }; my $filter = HTML::LinkFilter->new; $filter->change( \$html, \&callback ); print Dumper $filter->tags; sub callback { my( $tagname, $attr, $value ) = @_; return; # Uses original. } __DATA__

example.com

=head1 DESCRIPTION HTML::LinkFilter can change all links in passed HTML. This requires callback sub. The sub takes tagname, attr, value, and returns new value, then it will be replaced. Or uses original when returns undef. *Note* this breaks attributes order in tag. =head1 METHODS =over =item new Returns instance. =item change Changes html to tags by using callback filter. Callback filter is an argument which changes link. Callback filter will take args those are tagname, attr, value, and return value is pushed to $self->tags as a new value. Callback filter can tell 'use original' to parser by returns undef. =item tags Returns some changed HTML tags. =item html Returns HTML code which is parsed. =back =head1 AUTHOR kuniyoshi kouji Ekuniyoshi@cpan.orgE =head1 SEE ALSO =head1 LICENSE This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut