package HTML::Declaw; =head1 NAME HTML::Declaw - Cleans HTML as well as CSS of scripting and other executable contents, and neutralises XSS attacks. Derived from HTML::Defang version 1.01. =head1 SYNOPSIS my $InputHtml = ""; my $Defang = HTML::Declaw->new( context => $Self, fix_mismatched_tags => 1, tags_to_callback => [ qw(br embed img) ], tags_callback => \&DefangTagsCallback, url_callback => \&DefangUrlCallback, css_callback => \&DefangCssCallback, attribs_to_callback => [ qw(border src) ], attribs_callback => \&DefangAttribsCallback ); my $SanitizedHtml = $Defang->defang($InputHtml); # Callback for custom handling of specific HTML tags sub DefangTagsCallback { my ($Self, $Defang, $OpenAngle, $lcTag, $IsEndTag, $AttributeHash, $CloseAngle, $HtmlR, $OutR) = @_; return 1 if $lcTag eq 'br'; # Explicitly defang this tag, even though safe return 0 if $lcTag eq 'embed'; # Explicitly whitelist this tag, even though unsafe return 2 if $lcTag eq 'img'; # I am not sure what to do with this tag, so process as HTML::Defang normally would } # Callback for custom handling of URLs in HTML attributes as well as style tag/attribute declarations sub DefangUrlCallback { my ($Self, $Defang, $lcTag, $lcAttrKey, $AttrValR, $AttributeHash, $HtmlR) = @_; return 0 if $$AttrValR =~ /safesite.com/i; # Explicitly allow this URL in tag attributes or stylesheets return 1 if $$AttrValR =~ /evilsite.com/i; # Explicitly defang this URL in tag attributes or stylesheets } # Callback for custom handling of style tags/attributes sub DefangCssCallback { my ($Self, $Defang, $Selectors, $SelectorRules, $Tag, $IsAttr) = @_; my $i = 0; foreach (@$Selectors) { my $SelectorRule = $$SelectorRules[$i]; foreach my $KeyValueRules (@$SelectorRule) { foreach my $KeyValueRule (@$KeyValueRules) { my ($Key, $Value) = @$KeyValueRule; $$KeyValueRule[2] = 1 if $Value =~ '!important'; # Comment out any '!important' directive $$KeyValueRule[2] = 1 if $Key =~ 'position' && $Value =~ 'fixed'; # Comment out any 'position: fixed;' 
declaration } } $i++; } } # Callback for custom handling of HTML tag attributes sub DefangAttribsCallback { my ($Self, $Defang, $lcTag, $lcAttrKey, $AttrValR, $HtmlR) = @_; $$AttrValR = '0' if $lcAttrKey eq 'border'; # Change all 'border' attribute values to zero. return 1 if $lcAttrKey eq 'src'; # Defang all 'src' attributes return 0; } =head1 DESCRIPTION This module accepts an input HTML and/or CSS string and removes any executable code including scripting, embedded objects, applets, etc., and neutralises any XSS attacks. A whitelist-based approach is used, which means only HTML known to be safe is allowed through. HTML::Defang uses a custom HTML tag parser. The parser has been designed and tested to work with nasty real-world HTML and to emulate as closely as possible what browsers actually do with strange-looking constructs. The test suite has been built based on examples from a range of sources such as http://ha.ckers.org/xss.html and http://imfo.ru/csstest/css_hacks/import.php to ensure that as many XSS attack scenarios as possible have been dealt with. HTML::Defang can make callbacks to client code when it encounters the following: =over 4 =item * When a specified tag is parsed =item * When a specified attribute is parsed =item * When a URL is parsed as part of an HTML attribute, or CSS property value. =item * When style data is parsed, as part of an HTML style attribute, or as part of an HTML " if !$ClosingStyleTagPresent; } # We don't want