package Text::Hatena::HTMLFilter; use strict; use HTML::Parser; sub new { my $class = shift; my %args = @_; my $self = { context => $args{context}, html => '', }; bless $self,$class; $self->init; return $self; } sub init { my $self = shift; $self->{parser} = HTML::Parser->new( api_version => 3, handlers => { start => [$self->starthandler, 'tagname, attr, text'], end => [$self->endhandler, 'tagname, text'], text => [$self->texthandler, 'text'], comment => [$self->commenthandler, 'text'], }, ); $self->{allowtag} = qr/^(a|abbr|acronym|address|b|base|basefont|big|blockquote|br|col|em|caption|center|cite|code|div|dd|del|dfn|dl|dt|fieldset|font|form|hatena|h\d|hr|i|img|input|ins|kbd|label|legend|li|meta|ol|optgroup|option|p|pre|q|rb|rp|rt|ruby|s|samp|select|small|span|strike|strong|sub|sup|table|tbody|td|textarea|tfoot|th|thead|tr|tt|u|ul|var)$/; $self->{allallowattr} = qr/^(accesskey|align|alt|background|bgcolor|border|cite|class|color|datetime|height|id|size|title|type|valign|width)$/; $self->{allowattr} = { a => 'href|name|target', base => 'href|target', basefont => 'face', blockquote => 'cite', br => 'clear', col => 'span', font => 'face', form => 'action|method|target|enctype', hatena => '.+', img => 'src', input => 'type|name|value|tabindex|checked|src', label => 'for', li => 'value', meta => 'name|content', ol => 'start', optgroup => 'label', option => 'value', select => 'name|accesskey|tabindex', table => 'cellpadding|cellspacing', td => 'rowspan|colspan|nowrap', th => 'rowspan|colspan|nowrap', textarea => 'name|cols|rows', }; } sub parse { my $self = shift; my $html = shift or return; $self->{parser}->parse($html); } sub texthandler { my $self = shift; return sub { my $text = shift; $text = &{$self->{context}->texthandler}($text, $self->{context}); $self->{html} .= $text; } } sub starthandler { my $self = shift; return sub { my ($tagname, $attr, $attrseq, $text) = @_; if ($tagname =~ /$self->{allowtag}/) { my $ret = "<$tagname"; for my $p (keys %{$attr}) { my $v = $attr->{$p}; if ($p =~ /$self->{allallowattr}/) { } elsif ($self->{allowattr}->{$tagname}) { $p =~ /^($self->{allowattr}->{$tagname})$/i or next; } else { next; } if ($p =~ /^(src|href|cite)$/i) { $v = $self->sanitize_url($v); } else { $v = $self->sanitize($v); } $ret .= qq| $p="$v"|; } $ret .= ">"; $self->{html} .= $ret; } else { $self->{html} .= $self->sanitize($text); } } } sub endhandler { my $self = shift; return sub { my ($tagname, $text) = @_; if ($tagname =~ /$self->{allowtag}/) { $self->{html} .= ""; } else { $self->{html} .= $self->sanitize($text); } } } sub commenthandler { my $self = shift; return sub { $self->{html} .= ""; } } sub sanitize { my $self = shift; my $str = shift; length $str or return; $str =~ s/&(?![\#a-zA-Z0-9_]{2,6};)/&/g; $str =~ s/\/\>\;/g; $str =~ s/\"/"/g; $str =~ s/\'/'/g; $str =~ s/\\/\\\;/g; return $str; } sub sanitize_url { my $self = shift; my $url = shift or return; $url =~ s/^\s+//; $url =~ /^(\&|about|\:)/ and return ''; if ($url =~ /^([A-Za-z]+:)/) { my $scheme = $1; $scheme =~ /^(http|ftp|https|mailto|rtsp|mms):/i or return ''; } elsif ($url =~ /^(\.|\/|#)/) { } else { $url = "./$url"; } $url =~ s/["'\(\)<>]//g; return $url; } sub html { $_[0]->{html}; } 1;