package WWW::Webrobot::HtmlAnalyzer;
use strict;
# Author: Stefan Trcek
# Copyright(c) 2004 ABAS Software AG
use HTML::TokeParser;
=head1 NAME
WWW::Webrobot::HtmlAnalyzer - analyze HTML files for links/images/frames
=head1 SYNOPSIS
WWW::Webrobot::HtmlAnalyzer -> get_links($scheme, $input)
=head1 DESCRIPTION
Analyze an HTML file.
Returns a list of images, a list of frames and a list of links.
=head1 METHODS
=over
=item WWW::Webrobot::HtmlAnalyzer -> get_links($scheme, $input)
Extract all links found in an HTML page
Parameters:
$scheme uri of the content
$in content, same form as in HTML::TokeParser->new($in)
return (\@img, \@frame, \@a);
\@img list of images
\@frame list of frames
\@a list of plain links
=back
=cut
sub get_links { # static method
my ($self, $scheme, $in) = @_;
#print $scheme, " ", $$in;
my $p = HTML::TokeParser -> new($in);
my @img = ();
my @frame = ();
my @a = ();
my $refresh = undef;
while (my $token = $p -> get_tag(qw(img frame a meta))) {
my ($tag, $attr, $attrseq, $text) = @$token;
SWITCH: {
($tag eq "img") && do {
my $href = $attr -> {'src'};
my $link = URI -> new($href) -> abs($scheme);
push(@img, $link->as_string()) if $href;
last SWITCH;
};
($tag eq "frame") && do {
my $href = $attr -> {'src'};
my $link = URI -> new($href) -> abs($scheme);
push(@frame, $link->as_string()) if $href;
last SWITCH;
};
($tag eq "a") && do {
my $href = $attr -> {'href'};
my $link = URI -> new($href) -> abs($scheme);
push(@a, $link->as_string()) if $href;
last SWITCH;
};
($tag eq "meta" && ($attr -> {"http-equiv"} || "") eq "refresh") && do {
my $refresh = $attr -> {'content'} || "-";
my ($time, $href) = ($refresh =~ /^\s*(\d+);\s+URL\s*=\s*(.*)$/);
my $link = URI -> new($href) -> abs($scheme);
$refresh = $link->as_string() if $href;
last SWITCH;
};
}
}
return (\@img, \@frame, \@a, $refresh);
}
1;