package HTML::ContentExtractor;
=head1 NAME
HTML::ContentExtractor - extract the main content from a web page by analysising the DOM tree!
=head1 VERSION
Version 0.02
=cut
our $VERSION = '0.02';
=head1 SYNOPSIS
use HTML::ContentExtractor;
my $extractor = HTML::ContentExtractor->new();
my $agent=LWP::UserAgent->new;
my $url='http://sports.sina.com.cn/g/2007-03-23/16572821174.shtml';
my $res=$agent->get($url);
my $HTML = $res->decoded_content();
$extractor->extract($url,$HTML);
print $extractor->as_html();
print $extractor->as_text();
=head1 DESCRIPTION
Web pages often contain clutter (such as ads, unnecessary images and
extraneous links) around the body of an article that distracts a user
from actual content. This module is used to reduce the noise content
in web pages and thus identify the content rich regions.
A web page is first parsed by an HTML parser, which corrects the
markup and creates a DOM (Document Object Model) tree. By using a
depth-first traversal to navigate the DOM tree, noise nodes are
identified and removed, thus the main content is extracted. Some
useless nodes (script, style, etc.) are removed; the container nodes
(table, div, etc.) which have high link/text ratio (higher than
threshold) are removed; (link/text ratio is the ratio of the number of
links and non-linked words.) The nodes contain any string in the
predefined spam string list are removed.
Please notice the input HTML should be encoded in utf-8 format( so do
the spam words), thus the module can handle web pages in any language
(I've used it to process English, Chinese, and Japanese web pages).
=over 4
=item $e = HTML::ContentExtractor->new(%options);
Constructs a new C object. The optional
%options hash can be used to set the options list below.
=item $e->table_tags();
=item $e->table_tags(@tags);
=item $e->table_tags(\@tags);
This is used to get/set the table tags array. The tags are used as the
container tags.
=item $e->ignore_tags();
=item $e->ignore_tags(@tags);
=item $e->ignore_tags(\@tags);
This is used to get/set the ignore tags array. The elements of such
tags will be removed.
=item $e->spam_words();
=item $e->spam_words(@strings);
=item $e->spam_words(\@strings);
This is used to get/set the spam words list. The elements have such
string will be removed.
=item $e->link_text_ratio();
=item $e->link_text_ratio($ratio);
This is used to get/set the link/text ratio, default is 0.05.
=item $e->min_text_len();
=item $e->min_text_len($len);
This is used to get/set the min text length, default is 20. If length
of the text of an elment is less than this value, this element will be
removed.
=item $e->extract($url,$HTML);
This is used to perform the extraction process. Please notice the
input $HTML must be encoded in UTF-8.
=item $e->as_html();
Return the extraction result in HTML format.
=item $e->as_text();
Return the extraction result in text format.
=back
=head1 AUTHOR
Zhang Jun, C<< >>
=head1 COPYRIGHT & LICENSE
Copyright 2007 Zhang Jun, all rights reserved.
This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.
=cut
use strict;
use warnings;
use HTML::TreeBuilder;
sub new {
my $proto = shift;
my $class = ref($proto) || $proto;
my $self = {};
bless($self, $class);
return $self->_init(@_);
}
sub _init{
my $self = shift;
$self->{table_tags} = [qw(table form div td tr tbody thead tfoot th col colgroup span iframe center ul h1 h2 h3 p)];
$self->{ignore_tags} = [qw(script noscript style form button meta input select iframe embed hr img)];
$self->{spam_words} = ['All rights reserved'];
$self->{link_text_ratio} = 0.05;
$self->{min_text_len} = 20;
if (@_ != 0) {
if (ref $_[0] eq 'HASH') {
my $hash=$_[0];
foreach my $key (keys %$hash) {
$self->{lc($key)}=$hash->{$key};
}
}else{
my %args = @_;
foreach my $key (keys %args) {
$self->{lc($key)}=$args{$key};
}
}
}
$self->table_tags($self->{table_tags});
$self->ignore_tags($self->{ignore_tags});
return $self;
}
sub min_text_len{
my $self=shift;
return $self->{min_text_len} if (@_ == 0);
$self->{min_text_len}=shift;
}
sub link_text_ratio{
my $self=shift;
return $self->{link_text_ratio} if (@_ == 0);
$self->{link_text_ratio}=shift;
}
sub spam_words{
my $self = shift;
if(@_ == 0){
return @{$self->{spam_words}};
}
if(ref $_[0] eq 'ARRAY'){
$self->{spam_words} = $_[0];
}else{
my @array = @_;
$self->{spam_words} = \@array;
}
}
sub ignore_tags{
my $self = shift;
if(@_ == 0){
return keys %{$self->{ignore_tags}};
}
my $array;
if(ref $_[0] eq 'ARRAY'){
$array = $_[0];
}else{
$array = \@_;
}
my $h={};
grep {$h->{$_}=1;} @$array;
$self->{ignore_tags} = $h;
}
sub table_tags{
my $self = shift;
if(@_ == 0){
return keys %{$self->{table_tags}};
}
my $array;
if(ref $_[0] eq 'ARRAY'){
$array = $_[0];
}else{
$array = \@_;
}
my $h={};
grep {$h->{$_}=1;} @$array;
$self->{table_tags} = $h;
}
#the input should be utf8 encoded html content
sub extract{
my $self=shift;
my $url=shift;
my $HTML=shift;
$self->{tree}->delete if($self->{tree});
$HTML=_PreprocessForFragmentIdentifiedPage($url,$HTML);
_remove_crap($HTML);
$self->{url}=$url;
$self->{tree} = HTML::TreeBuilder->new();
$self->{tree} ->parse($HTML);
$self->{link_count} = _how_many_links($self->{tree});
$self->{is_index}= _check_if_index($self->{tree});
$self->_Heuristic_Remove($self->{tree});
$self->_Table_Remove($self->{tree});
}
sub _is_index{
return $_[0]->{is_index};
}
sub DESTROY{
my $self = shift;
$self->{tree}->delete if($self->{tree});
}
#also the output are in utf8 format
sub as_html{
my $self=shift;
my $HTML = $self->{tree}->as_HTML('<>&',"\t");
return $HTML;
}
sub as_text{
my $self=shift;
my $output = _to_text($self->{tree});
$output =~ s/[\n\r] +/\n/sg;
$output =~ s/[\n\r]+/\n/sg;
$output =~ s/ +/ /sg;
$output =~ s/\n /\n/sg;
$output =~ s/^\s+//;
return $output;
}
sub _link_count{
return $_[0]->{link_count};
}
sub _check_if_index{
my $node=shift;
my $num_links=_how_many_links($node);
my $txt=_nonlink_words($node);
my $num_words = _count_words_num($txt);
my $ratio=1;
$ratio = $num_links/$num_words unless $num_words==0;
if($ratio>0.3 || $num_links>400){
return 1;
}else{
return 0;
}
}
sub _remove_crap{
$_[0] =~ s/ / /isg;
}
sub _Table_Remove{
my $self=shift;
my $node=shift;
return if not ref $node; # not an element node
my $tag=$node->tag;
my @nodes = $node->content_list(); # depth first recursive travesel
foreach my $child (@nodes){
$self->_Table_Remove( $child );
}
if($self->{table_tags}->{$tag}){
my $num_links=_how_many_links($node);
my $txt=_nonlink_words($node);
my $num_words = _count_words_num($txt);
my $ratio=1;
$ratio = $num_links/$num_words unless $num_words==0;
if ($num_words < $self->{min_text_len} and
$node->tag ne 'h1' and
$node->tag ne 'h2' and
$node->tag ne 'h3' and
$node->tag ne 'p'){
$node->delete; return;
}
if ($ratio > $self->{link_text_ratio}){
$node->delete; return;
}
$txt = lc $txt;
foreach(@{ $self->{spam_words} }){
if(index($txt,$_) != -1){
$node->delete;
return;
}
}
}
}
sub _how_many_links{
my $node=shift;
my $links_r = $node->extract_links();
my $num_links = scalar(@$links_r);
return $num_links;
}
sub _nonlink_words{
my $node=shift;
if(not ref $node){
my $text = $node;
return $text;
}
return '' if($node->tag eq 'a'
or $node->tag eq 'style'
or $node->tag eq 'script'
or $node->tag eq 'option'
or $node->tag eq 'noscript'
or $node->tag eq 'hr'
or $node->tag eq 'input'
);
my @nodes = $node->content_list(); # breadth first travesel
my $sum_text="";
foreach $node (@nodes){
$sum_text .= _nonlink_words( $node );
}
return $sum_text;
}
sub _Heuristic_Remove{
my $self=shift;
my $node=shift;
return if not ref $node; # not an element node
my @nodes = $node->content_list(); # depth first recursive travesel
foreach my $child (@nodes){
$self->_Heuristic_Remove( $child );
}
if($self->{ignore_tags}->{$node->tag} ){ # ignore the tags defined in ignore_tags
$node->delete;
return;
}
if($node->tag eq 'a' and $node->parent->tag eq 'body'){
$node->delete;
}
}
sub _to_text{
my $node = shift;
if(not ref $node){
return $node;
}
return '' if($node->tag eq 'head');
my @nodes = $node->content_list(); #breadth firth travesel
my $text="";
foreach my $child (@nodes){
$text .= _to_text($child)."\n";
}
return $text;
}
sub _count_words_num{
my $text = shift;
$text =~ s/([\x21-\x7e]+)/ $1 /g;
$text =~ s/([^\x20-\x7e])/ $1 /g;
$text =~ s/^ +//;
my @tokens=split(/\s+/,$text);
return scalar(@tokens);
}
# input is the url and HTML
# output is the processed HTML
sub _PreprocessForFragmentIdentifiedPage{
my $url=shift;
my $HTML=shift;
if($url!~/\#/){
return $HTML;
}
my ($fragment_id)= $url=~/\#(.+)$/;
$fragment_id=~s/\///;
if($HTML=~/(