package HTML::ExtractContent::Util;
use strict;
use warnings;
use Exporter::Lite;
use utf8;
use HTML::Entities;

# Every helper below is exported by default.
our @EXPORT = qw/strip strip_tags eliminate_tags eliminate_links
    eliminate_forms eliminate_br extract_alt unescape reduce_ws decode
    to_text match_count/;

# strip($str)
# Returns a copy of $str with leading and trailing whitespace removed.
sub strip {
    my $str = shift;
    $str =~ s/\A\s+//;
    $str =~ s/\s+\z//;
    return $str;
}

# strip_tags($page)
# Returns a copy of $page with markup tags removed. A tag is '<', a name,
# optional whitespace-separated attributes (each optionally followed by a
# double-quoted, single-quoted or bare value) and a closing '>'.
# Text between tags is kept.
sub strip_tags {
    my $page = shift;
    $page =~ s/<[^>\s]+(?:\s+[^>"]+(?:=(?:"[^"]*"|'[^']*'|\S+))?)*>//gs;
    return $page;
}

# eliminate_tags($page, $tag)
# Returns a copy of $page with every <$tag ...>...</$tag> element removed,
# including its content. Matching is case-insensitive and non-greedy, so
# each opening tag is paired with the nearest following closing tag.
# NOTE: $tag is interpolated into the pattern unescaped; the callers in
# this file pass plain tag names ('a', 'form').
sub eliminate_tags {
    my ($page, $tag) = @_;
    $page =~ s/<$tag[\s>].*?<\/$tag\s*>//igs;
    return $page;
}

# eliminate_links($page)
# Removes all <a> elements (tags and link text) from $page.
sub eliminate_links {
    my $page = shift;
    return eliminate_tags($page, 'a');
}

# eliminate_forms($page)
# Removes all <form> elements (tags and content) from $page.
sub eliminate_forms {
    my $page = shift;
    return eliminate_tags($page, 'form');
}

# eliminate_br($page)
# Returns a copy of $page with every <br> tag (any case, with or without
# attributes or a trailing slash) replaced by a single space.
sub eliminate_br {
    my $page = shift;
    # Anchor on '<br' so that only line-break tags are touched; other
    # tags and literal '>' characters are left alone.
    $page =~ s/<br\b[^>]*>/ /igs;
    return $page;
}

# extract_alt($page)
# Returns a copy of $page in which every <img> tag carrying an alt
# attribute is replaced by that attribute's value. <img> tags without
# an alt attribute are left untouched.
sub extract_alt {
    my $page = shift;
    # The value may be double-quoted, single-quoted or bare; exactly one
    # of the three capture groups is defined for each match.
    $page =~ s{
        <img\b [^>]*? \s alt \s* = \s*
        (?: "([^"]*)" | '([^']*)' | ([^\s>]*) )
        [^>]* >
    }{
        defined $1 ? $1
      : defined $2 ? $2
      :              $3
    }gisex;
    return $page;
}

# unescape($page)
# Returns $page with HTML entities decoded by
# HTML::Entities::decode_entities.
sub unescape {
    my $page = shift;
    return decode_entities($page);
}

# reduce_ws($page)
# Collapses each run of spaces/tabs into one space, then collapses each
# newline together with any whitespace following it into one newline.
sub reduce_ws {
    my $page = shift;
    $page =~ s/[ \t]+/ /g;
    $page =~ s/\n\s*/\n/gs;
    return $page;
}

# decode($page)
# Converts an HTML fragment to plain text: <br> tags become spaces, the
# remaining tags are stripped, entities are decoded, whitespace is
# reduced and the result is trimmed.
sub decode {
    my $page = shift;
    return strip(reduce_ws(unescape(strip_tags(eliminate_br($page)))));
}

# to_text($page)
# Like decode(), but first substitutes <img> tags with their alt text so
# that the text survives tag stripping.
sub to_text {
    my $page = shift;
    return decode(extract_alt($page));
}

# match_count($str, $exp)
# Returns the number of elements produced by matching $str against $exp
# once in list context: 0 when there is no match, 1 for a successful
# match of a pattern without capture groups, otherwise the number of
# capture groups.
# NOTE(review): despite the name this does not count every occurrence of
# $exp in $str; behaviour is kept as-is because callers may rely on it.
sub match_count {
    my ($str, $exp) = @_;
    my @captures = ($str =~ $exp);
    return scalar @captures;
}

1;