# $Id: HTML.pm,v 1.96 2003/09/12 00:04:27 quinlan Exp $
package Mail::SpamAssassin::HTML;
1;
package Mail::SpamAssassin::PerMsgStatus;
use HTML::Parser 3.24 ();
use strict;
use bytes;
use vars qw{
$re_loose $re_strict $events
};
# HTML decoding TODOs
# - add URIs to list for faster URI testing
# elements defined by the HTML 4.01 and XHTML 1.0 DTDs (do not change them!)
$re_loose = 'applet|basefont|center|dir|font|frame|frameset|iframe|isindex|menu|noframes|s|strike|u';
$re_strict = 'a|abbr|acronym|address|area|b|base|bdo|big|blockquote|body|br|button|caption|cite|code|col|colgroup|dd|del|dfn|div|dl|dt|em|fieldset|form|h1|h2|h3|h4|h5|h6|head|hr|html|i|img|input|ins|kbd|label|legend|li|link|map|meta|noscript|object|ol|optgroup|option|p|param|pre|q|samp|script|select|small|span|strong|style|sub|sup|table|tbody|td|textarea|tfoot|th|thead|title|tr|tt|ul|var';
# loose list of HTML events
$events = 'on(?:activate|afterupdate|beforeactivate|beforecopy|beforecut|beforedeactivate|beforeeditfocus|beforepaste|beforeupdate|blur|change|click|contextmenu|controlselect|copy|cut|dblclick|deactivate|errorupdate|focus|focusin|focusout|help|keydown|keypress|keyup|load|losecapture|mousedown|mouseenter|mouseleave|mousemove|mouseout|mouseover|mouseup|mousewheel|move|moveend|movestart|paste|propertychange|readystatechange|reset|resize|resizeend|resizestart|select|submit|timeerror|unload)';
my %tested_colors;
sub html_init {
my ($self) = @_;
push @{ $self->{bgcolor_color} }, "#ffffff";
push @{ $self->{bgcolor_tag} }, "default";
push @{ $self->{fgcolor_color} }, "#000000";
push @{ $self->{fgcolor_tag} }, "default";
undef %tested_colors;
}
sub html_tag {
my ($self, $tag, $attr, $num) = @_;
$self->{html_inside}{$tag} += $num;
$self->{html}{elements}++ if $tag =~ /^(?:$re_strict|$re_loose)$/io;
$self->{html}{tags}++;
if ($tag =~ /^(?:body|table|tr|th|td)$/) {
$self->html_bgcolor($tag, $attr, $num);
}
if ($tag =~ /^(?:body|font)$/) {
$self->html_fgcolor($tag, $attr, $num);
}
if ($num == 1) {
$self->html_format($tag, $attr, $num);
$self->html_uri($tag, $attr, $num);
$self->html_tests($tag, $attr, $num);
$self->{html_last_tag} = $tag;
}
if ($tag =~ /^(?:b|i|u|strong|em|big|center|h\d)$/) {
$self->{html}{shouting} += $num;
if ($self->{html}{shouting} > $self->{html}{max_shouting}) {
$self->{html}{max_shouting} = $self->{html}{shouting};
}
}
}
sub html_format {
my ($self, $tag, $attr, $num) = @_;
# ordered by frequency of tag groups
if ($tag eq "br") {
push @{$self->{html_text}}, "\n";
}
elsif ($tag eq "li" || $tag eq "td") {
push @{$self->{html_text}}, " ";
}
elsif ($tag eq "p" || $tag eq "hr") {
push @{$self->{html_text}}, "\n\n";
}
elsif ($tag eq "img" && exists $attr->{alt} && $attr->{alt} ne "") {
push @{$self->{html_text}}, " $attr->{alt} ";
}
}
sub html_uri {
my ($self, $tag, $attr, $num) = @_;
my $uri;
# ordered by frequency of tag groups
if ($tag =~ /^(?:body|table|tr|td)$/) {
push @{$self->{html_text}}, "URI:$uri " if $uri = $attr->{background};
}
elsif ($tag =~ /^(?:a|area|link)$/) {
push @{$self->{html_text}}, "URI:$uri " if $uri = $attr->{href};
}
elsif ($tag =~ /^(?:img|frame|iframe|embed|script)$/) {
push @{$self->{html_text}}, "URI:$uri " if $uri = $attr->{src};
}
elsif ($tag eq "form") {
push @{$self->{html_text}}, "URI:$uri " if $uri = $attr->{action};
}
elsif ($tag eq "base") {
if ($uri = $attr->{href}) {
# use to turn relative links into absolute links
# even if it is a base URI, handle like a normal URI as well
push @{$self->{html_text}}, "URI:$uri ";
# a base URI will be ignored by browsers unless it is an absolute
# URI of a standard protocol
if ($uri =~ m@^(?:ftp|https?)://@i) {
# remove trailing filename, if any; base URIs can have the
# form of "http://foo.com/index.html"
$uri =~ s@^([a-z]+://[^/]+/.*?)[^/\.]+\.[^/\.]{2,4}$@$1@i;
# Make sure it ends in a slash
$uri .= "/" unless $uri =~ m@/$@;
$self->{html}{base_href} = $uri;
}
}
}
}
# input values from 0 to 255
sub rgb_to_hsv {
my ($r, $g, $b) = @_;
my ($h, $s, $v, $max, $min);
if ($r > $g) {
$max = $r; $min = $g;
}
else {
$min = $r; $max = $g;
}
$max = $b if $b > $max;
$min = $b if $b < $min;
$v = $max;
$s = $max ? ($max - $min) / $max : 0;
if ($s == 0) {
$h = undef;
}
else {
my $cr = ($max - $r) / ($max - $min);
my $cg = ($max - $g) / ($max - $min);
my $cb = ($max - $b) / ($max - $min);
if ($r == $max) {
$h = $cb - $cg;
}
elsif ($g == $max) {
$h = 2 + $cr - $cb;
}
elsif ($b == $max) {
$h = 4 + $cg - $cr;
}
$h *= 60;
$h += 360 if $h < 0;
}
return ($h, $s, $v);
}
# HTML 4 defined 16 colors
my %html_color = (
aqua => '#00ffff',
black => '#000000',
blue => '#0000ff',
fuchsia => '#ff00ff',
gray => '#808080',
green => '#008000',
lime => '#00ff00',
maroon => '#800000',
navy => '#000080',
olive => '#808000',
purple => '#800080',
red => '#ff0000',
silver => '#c0c0c0',
teal => '#008080',
white => '#ffffff',
yellow => '#ffff00',
);
# popular X11 colors specified in CSS3 color module
my %name_color = (
aliceblue => '#f0f8ff',
cyan => '#00ffff',
darkblue => '#00008b',
darkcyan => '#008b8b',
darkgray => '#a9a9a9',
darkgreen => '#006400',
darkred => '#8b0000',
firebrick => '#b22222',
gold => '#ffd700',
lightslategray=> '#778899',
magenta => '#ff00ff',
orange => '#ffa500',
pink => '#ffc0cb',
whitesmoke => '#f5f5f5',
);
sub name_to_rgb {
return $html_color{$_[0]} || $name_color{$_[0]} || $_[0];
}
sub pop_bgcolor {
my ($self) = @_;
pop @{ $self->{bgcolor_color} };
pop @{ $self->{bgcolor_tag} };
}
sub html_bgcolor {
my ($self, $tag, $attr, $num) = @_;
if ($num == 1) {
# close elements with optional end tags
if ($tag eq "body") {
# compromise between HTML browsers generally only using first
# body and some messages including multiple HTML attachments:
# pop everything except first body color
while ($self->{bgcolor_tag}[-1] !~ /^(?:default|body)$/) {
$self->pop_bgcolor();
}
}
if ($tag eq "tr") {
while ($self->{bgcolor_tag}[-1] =~ /^t[hd]$/) {
$self->pop_bgcolor();
}
$self->pop_bgcolor() if $self->{bgcolor_tag}[-1] eq "tr";
}
elsif ($tag =~ /^t[hd]$/) {
$self->pop_bgcolor() if $self->{bgcolor_tag}[-1] =~ /^t[hd]$/;
}
# figure out new bgcolor
my $bgcolor;
if (exists $attr->{bgcolor}) {
$bgcolor = name_to_rgb(lc($attr->{bgcolor}));
}
else {
$bgcolor = $self->{bgcolor_color}[-1];
}
# tests
if ($tag eq "body" && $bgcolor !~ /^\#?ffffff$/) {
$self->{html}{bgcolor_nonwhite} = 1;
}
# push new bgcolor
push @{ $self->{bgcolor_color} }, $bgcolor;
push @{ $self->{bgcolor_tag} }, $tag;
}
else {
# close elements
if ($tag eq "body") {
$self->pop_bgcolor() if $self->{bgcolor_tag}[-1] eq "body";
}
elsif ($tag eq "table") {
while ($self->{bgcolor_tag}[-1] =~ /^t[rhd]$/) {
$self->pop_bgcolor();
}
$self->pop_bgcolor() if $self->{bgcolor_tag}[-1] eq "table";
}
elsif ($tag eq "tr") {
while ($self->{bgcolor_tag}[-1] =~ /^t[hd]$/) {
$self->pop_bgcolor();
}
$self->pop_bgcolor() if $self->{bgcolor_tag}[-1] eq "tr";
}
elsif ($tag =~ /^t[hd]$/) {
$self->pop_bgcolor() if $self->{bgcolor_tag}[-1] =~ /^t[hd]$/;
}
}
}
sub pop_fgcolor {
my ($self) = @_;
pop @{ $self->{fgcolor_color} };
pop @{ $self->{fgcolor_tag} };
}
sub html_fgcolor {
my ($self, $tag, $attr, $num) = @_;
if ($num == 1) {
if ($tag eq "body") {
# compromise between HTML browsers generally only using first
# body and some messages including multiple HTML attachments:
# pop everything except first body color
while ($self->{fgcolor_tag}[-1] !~ /^(?:default|body)$/) {
$self->pop_fgcolor();
}
}
# figure out new fgcolor
my $fgcolor;
if ($tag eq "font" && exists $attr->{color}) {
$fgcolor = name_to_rgb(lc($attr->{color}));
}
elsif ($tag eq "body" && exists $attr->{text}) {
$fgcolor = name_to_rgb(lc($attr->{text}));
}
else {
$fgcolor = $self->{fgcolor_color}[-1];
}
# push new fgcolor
push @{ $self->{fgcolor_color} }, $fgcolor;
push @{ $self->{fgcolor_tag} }, $tag;
}
else {
# close elements
if ($tag eq "body") {
$self->pop_fgcolor() if $self->{fgcolor_tag}[-1] eq "body";
}
if ($tag eq "font") {
$self->pop_fgcolor() if $self->{fgcolor_tag}[-1] eq "font";
}
}
}
sub html_font_invisible {
my ($self, $text) = @_;
my $fg = $self->{fgcolor_color}[-1];
my $bg = $self->{bgcolor_color}[-1];
return if exists $tested_colors{"$fg\000$bg"};
$tested_colors{"$fg\000$bg"}++;
# invisibility
if (substr($fg,-6) eq substr($bg,-6)) {
$self->{html}{font_invisible} = 1;
}
# near-invisibility
elsif ($fg =~ /^\#?([0-9a-f]{2})([0-9a-f]{2})([0-9a-f]{2})$/) {
my ($r1, $g1, $b1) = (hex($1), hex($2), hex($3));
if ($bg =~ /^\#?([0-9a-f]{2})([0-9a-f]{2})([0-9a-f]{2})$/) {
my ($r2, $g2, $b2) = (hex($1), hex($2), hex($3));
my $r = ($r1 - $r2);
my $g = ($g1 - $g2);
my $b = ($b1 - $b2);
# geometric distance weighted by brightness
# maximum distance is 191.151823601032
my $distance = ((0.2126*$r)**2 + (0.7152*$g)**2 + (0.0722*$b)**2)**0.5;
# the text is very difficult to read if the distance is under 12,
# a limit of 14 to 16 might be okay if the usage significantly
# increases (near-invisible text is at about 0.95% of spam and
# 1.25% of HTML spam right now), but please test any changes first
if ($distance < 12) {
$self->{html}{"font_near_invisible"} = 1;
}
}
}
}
sub html_tests {
my ($self, $tag, $attr, $num) = @_;
if ($tag eq "table" && exists $attr->{border} && $attr->{border} =~ /(\d+)/)
{
$self->{html}{thick_border} = 1 if $1 > 1;
}
if ($tag eq "script") {
$self->{html}{javascript} = 1;
}
if ($tag =~ /^(?:a|body|div|input|form|td|layer|area|img)$/i) {
for (keys %$attr) {
if (/\b(?:$events)\b/io)
{
$self->{html}{html_event} = 1;
}
if (/\bon(?:blur|contextmenu|focus|load|resize|submit|unload)\b/i &&
$attr->{$_})
{
$self->{html}{html_event_unsafe} = 1;
if ($attr->{$_} =~ /\.open\s*\(/) { $self->{html}{window_open} = 1; }
if ($attr->{$_} =~ /\.blur\s*\(/) { $self->{html}{window_blur} = 1; }
if ($attr->{$_} =~ /\.focus\s*\(/) { $self->{html}{window_focus} = 1; }
}
}
}
if ($tag eq "font" && exists $attr->{size}) {
$self->{html}{big_font} = 1 if (($attr->{size} =~ /^\s*(\d+)/ && $1 > 3) ||
($attr->{size} =~ /\+(\d+)/ && $1 >= 1));
}
if ($tag eq "font" && exists $attr->{color}) {
my $bg = $self->{bgcolor_color}[-1];
my $fg = lc($attr->{color});
if ($fg =~ /^\#?[0-9a-f]{6}$/ && $fg !~ /^\#?(?:00|33|66|80|99|cc|ff){3}$/)
{
$self->{html}{font_color_unsafe} = 1;
}
if ($fg !~ /^\#?[0-9a-f]{6}$/ && !exists $html_color{$fg})
{
$self->{html}{font_color_name} = 1;
}
if ($fg =~ /^\#?([0-9a-f]{2})([0-9a-f]{2})([0-9a-f]{2})$/) {
my ($h, $s, $v) = rgb_to_hsv(hex($1), hex($2), hex($3));
if (!defined($h)) {
$self->{html}{font_gray} = 1 unless ($v == 0 || $v == 255);
}
elsif ($h < 30 || $h >= 330) {
$self->{html}{font_red} = 1;
}
elsif ($h < 90) {
$self->{html}{font_yellow} = 1;
}
elsif ($h < 150) {
$self->{html}{font_green} = 1;
}
elsif ($h < 210) {
$self->{html}{font_cyan} = 1;
}
elsif ($h < 270) {
$self->{html}{font_blue} = 1;
}
elsif ($h < 330) {
$self->{html}{font_magenta} = 1;
}
}
else {
$self->{html}{font_color_unknown} = 1;
}
}
if ($tag eq "font" && exists $attr->{face}) {
#print STDERR "FONT " . $attr->{face} . "\n";
if ($attr->{face} =~ /[A-Z]{3}/ && $attr->{face} !~ /M[ST][A-Z]|ITC/) {
$self->{html}{font_face_caps} = 1;
}
if ($attr->{face} !~ /^[a-z][a-z -]*[a-z](?:,\s*[a-z][a-z -]*[a-z])*$/i) {
$self->{html}{font_face_bad} = 1;
}
for (split(/,/, lc($attr->{face}))) {
$self->{html}{font_face_odd} = 1 if ! /^\s*(?:arial|arial black|courier new|geneva|helvetica|ms sans serif|sans serif|sans-serif|sans-serif;|serif|sunsans-regular|swiss|tahoma|times|times new roman|trebuchet|trebuchet ms|verdana)\s*$/i;
}
}
if (exists($attr->{style})) {
if ($attr->{style} =~ /font(?:-size)?:\s*(\d+(?:\.\d*)?|\.\d+)(p[tx])/i) {
my $size = $1;
my $type = $2;
$self->{html}{big_font} = 1 if (lc($type) eq "pt" && $size > 12);
}
}
if ($tag eq "img" && exists $attr->{width} && exists $attr->{height}) {
my $width = 0;
my $height = 0;
my $area = 0;
# assume 800x600 screen for percentage values
if ($attr->{width} =~ /^(\d+)(\%)?$/) {
$width = $1;
$width *= 8 if (defined $2 && $2 eq "%");
}
if ($attr->{height} =~ /^(\d+)(\%)?$/) {
$height = $1;
$height *= 6 if (defined $2 && $2 eq "%");
}
if ($width > 0 && $height > 0) {
$area = $width * $height;
$self->{html}{image_area} += $area;
}
# this is intended to match any width and height if they're specified
if (exists $attr->{src} &&
$attr->{src} =~ /\.(?:pl|cgi|php|asp|jsp|cfm)\b/i)
{
$self->{html}{web_bugs} = 1;
}
}
if ($tag eq "form" && exists $attr->{action}) {
$self->{html}{form_action_mailto} = 1 if $attr->{action} =~ /mailto:/i
}
if ($tag =~ /^i?frame$/) {
$self->{html}{relaying_frame} = 1;
}
if ($tag =~ /^(?:object|embed)$/) {
$self->{html}{embeds} = 1;
}
if ($tag eq "title" &&
!(exists $self->{html_inside}{body} && $self->{html_inside}{body} > 0))
{
$self->{html}{title_text} = "";
}
if ($tag eq "meta" &&
exists $attr->{'http-equiv'} &&
exists $attr->{content} &&
$attr->{'http-equiv'} =~ /Content-Type/i &&
$attr->{content} =~ /\bcharset\s*=\s*["']?([^"']+)/i)
{
$self->{html}{charsets} .= exists $self->{html}{charsets} ? " $1" : $1;
}
$self->{html}{anchor_text} ||= "" if ($tag eq "a");
}
sub html_text {
my ($self, $text) = @_;
if (exists $self->{html_inside}{a} && $self->{html_inside}{a} > 0) {
$self->{html}{anchor_text} .= " $text";
}
if (exists $self->{html_inside}{script} && $self->{html_inside}{script} > 0)
{
if ($text =~ /\b(?:$events)\b/io)
{
$self->{html}{html_event} = 1;
}
if ($text =~ /\bon(?:blur|contextmenu|focus|load|resize|submit|unload)\b/i)
{
$self->{html}{html_event_unsafe} = 1;
}
if ($text =~ /\.open\s*\(/) { $self->{html}{window_open} = 1; }
if ($text =~ /\.blur\s*\(/) { $self->{html}{window_blur} = 1; }
if ($text =~ /\.focus\s*\(/) { $self->{html}{window_focus} = 1; }
return;
}
if (exists $self->{html_inside}{style} && $self->{html_inside}{style} > 0) {
if ($text =~ /font(?:-size)?:\s*(\d+(?:\.\d*)?|\.\d+)(p[tx])/i) {
my $size = $1;
my $type = $2;
$self->{html}{big_font} = 1 if (lc($type) eq "pt" && $size > 12);
}
return;
}
if (!(exists $self->{html_inside}{body} && $self->{html_inside}{body} > 0) &&
exists $self->{html_inside}{title} && $self->{html_inside}{title} > 0)
{
$self->{html}{title_text} .= $text;
}
$self->html_font_invisible($text) if $text =~ /[^ \t\n\r\f\x0b\xa0]/;
$text =~ s/^\n//s if $self->{html_last_tag} eq "br";
push @{$self->{html_text}}, $text;
}
sub html_comment {
my ($self, $text) = @_;
$self->{html}{comment_8bit} = 1 if $text =~ /[\x80-\xff]{3,}/;
$self->{html}{comment_email} = 1 if $text =~ /\S+\@\S+/;
$self->{html}{comment_egp} = 1 if $text =~ /\S+begin egp html banner\S+/;
$self->{html}{comment_saved_url} = 1 if $text =~ /"
if (exists $self->{html_inside}{script} && $self->{html_inside}{script} > 0)
{
if ($text =~ /\b(?:$events)\b/io)
{
$self->{html}{html_event} = 1;
}
if ($text =~ /\bon(?:blur|contextmenu|focus|load|resize|submit|unload)\b/i)
{
$self->{html}{html_event_unsafe} = 1;
}
if ($text =~ /\.open\s*\(/) { $self->{html}{window_open} = 1; }
if ($text =~ /\.blur\s*\(/) { $self->{html}{window_blur} = 1; }
if ($text =~ /\.focus\s*\(/) { $self->{html}{window_focus} = 1; }
return;
}
if (exists $self->{html_inside}{style} && $self->{html_inside}{style} > 0) {
if ($text =~ /font(?:-size)?:\s*(\d+(?:\.\d*)?|\.\d+)(p[tx])/i) {
my $size = $1;
my $type = $2;
$self->{html}{big_font} = 1 if (lc($type) eq "pt" && $size > 12);
}
}
if (exists $self->{html}{shouting} && $self->{html}{shouting} > 1) {
$self->{html}{comment_shouting} = 1;
}
}
sub html_declaration {
my ($self, $text) = @_;
if ($text =~ /^{html}{elements}++;
$self->{html}{tags}++;
$self->{html_inside}{$tag} = 0;
}
}
###########################################################################
# HTML parser tests
###########################################################################
sub html_tag_balance {
my ($self, undef, $rawtag, $rawexpr) = @_;
$rawtag =~ /^([a-zA-Z0-9]+)$/; my $tag = $1;
$rawexpr =~ /^([\<\>\=\!\-\+ 0-9]+)$/; my $expr = $1;
return 0 unless exists $self->{html_inside}{$tag};
$self->{html_inside}{$tag} =~ /^([\<\>\=\!\-\+ 0-9]+)$/;
my $val = $1;
return eval "$val $expr";
}
sub html_image_only {
my ($self, undef, $min, $max) = @_;
return (exists $self->{html_inside}{'img'} &&
exists $self->{html}{non_space_len} &&
$self->{html}{non_space_len} > $min &&
$self->{html}{non_space_len} <= $max &&
$self->get('X-eGroups-Return') !~ /^sentto-.*\@returns\.groups\.yahoo\.com$/);
}
sub html_image_ratio {
my ($self, undef, $min, $max) = @_;
return 0 unless (exists $self->{html}{non_space_len} &&
exists $self->{html}{image_area} &&
$self->{html}{image_area} > 0);
my $ratio = $self->{html}{non_space_len} / $self->{html}{image_area};
return ($ratio > $min && $ratio <= $max);
}
sub html_charset_faraway {
my ($self) = @_;
return 0 unless exists $self->{html}{charsets};
my @locales = $self->get_my_locales();
return 0 if grep { $_ eq "all" } @locales;
my $okay = 0;
my $bad = 0;
for my $c (split(' ', $self->{html}{charsets})) {
if (Mail::SpamAssassin::Locales::is_charset_ok_for_locales($c, @locales)) {
$okay++;
}
else {
$bad++;
}
}
return ($bad && ($bad >= $okay));
}
sub html_tag_exists {
my ($self, undef, $tag) = @_;
return exists $self->{html_inside}{$tag};
}
sub html_test {
my ($self, undef, $test) = @_;
return $self->{html}{$test};
}
sub html_eval {
my ($self, undef, $test, $expr) = @_;
return exists $self->{html}{$test} && eval "qq{\Q$self->{html}{$test}\E} $expr";
}
sub html_message {
my ($self) = @_;
return (exists $self->{html}{elements} &&
($self->{html}{elements} >= 8 ||
$self->{html}{elements} >= $self->{html}{tags} / 2));
}
sub html_range {
my ($self, undef, $test, $min, $max) = @_;
return 0 unless exists $self->{html}{$test};
$test = $self->{html}{$test};
# not all perls understand what "inf" means, so we need to do
# non-numeric tests! urg!
if ( !defined $max || $max eq "inf" ) {
return ( $test eq "inf" ) ? 1 : ($test > $min);
}
elsif ( $test eq "inf" ) {
# $max < inf, so $test == inf means $test > $max
return 0;
}
else {
# if we get here everything should be a number
return ($test > $min && $test <= $max);
}
}
1;
__END__