package HTML::GMUCK; # $Id: GMUCK.pm,v 1.24 2007/04/01 20:26:55 scop Exp $ use strict; require 5.006; use vars qw($VERSION $Tag_End $Tag_Start $Non_Tag_End $URI_Attrs $End_Omit $All_Elems $Min_Elems $Compat_Elems $Min_Attrs $MIME_Type @MIME_Attrs %Req_Attrs $All_Attrs $Depr_Elems @Depr_Attrs @Int_Attrs @Length_Attrs @Fixed_Attrs); use Carp qw(carp); no warnings 'utf8'; BEGIN { $VERSION = sprintf("%d.%02d", q$Revision: 1.24 $ =~ /(\d+)\.(\d+)/); # --- Preload regexps. my $tmp = ''; my %tmp = (); if (! do 'HTML/GMUCK/regexps.pl') { my $err = $! || $@; die "Error reading HTML/GMUCK/regexps.pl: $err"; } } # ----- Constructors -------------------------------------------------------- # sub new { my ($class, %attr) = @_; my $this = bless({ _mode => undef, _xml => undef, _xhtml => undef, _html => undef, _tab_width => undef, _num_errors => undef, _num_warnings => undef, _quote => undef, _min_attrs => undef, }, (ref($class) || $class)); my $tab_width = delete($attr{tab_width}); $tab_width = 4 unless defined($tab_width); $this->tab_width($tab_width) or $this->tab_width(4); my $mode = delete($attr{mode}); $mode = 'XHTML' unless defined($mode); $this->mode($mode) or $this->mode('XHTML'); my $quote = delete($attr{quote}); $this->quote(defined($quote) ? $quote : '"'); $this->min_attributes(delete($attr{min_attributes})); $this->reset(); if (my @unknown = keys(%attr)) { carp("** Unrecognized attributes: " . join(',', sort(@unknown))); } return $this; } # ---------- Check: deprecated ---------------------------------------------- # sub deprecated { return shift->_wrap('_deprecated', @_);} sub _deprecated { my ($this, $line) = @_; my @errors = (); while ($line =~ /\b(document\.location)\b/go) { push(@errors, { col => $this->_pos($line, pos($line) - length($1)), type => 'W', mesg => 'document.location is deprecated, use window.location ' . 'instead', }, ); } # --- return @errors unless $this->{_html}; # Optimization. return @errors unless $line =~ $Tag_Start; # --- while ($line =~ / < (\/?) ( ($Depr_Elems) (?:$|$Tag_End|\s) ) /giox) { push(@errors, { col => $this->_pos($line, pos($line) - length($2)), elem => $3, mesg => 'deprecated element' . ($1 ? ' end' : ''), type => 'W', }, ); } # --- foreach my $re (@Depr_Attrs) { while ($line =~ /$re/g) { my ($m, $elem, $attr) = ($1, $2, $3); if ($attr) { push(@errors, { col => $this->_pos($line, pos($line) - length($m)), elem => $elem, attr => $attr, type => 'W', mesg => 'deprecated attribute for this element', }, ); } } } return @errors; } # ----- Check: attributes --------------------------------------------------- # sub attributes { return shift->_wrap('_attributes', @_); } sub _attributes { my ($this, $line) = @_; return () unless $this->{_html}; my @errors = (); # --- my $type = $this->{_xhtml} ? 'E' : 'W'; # BUG: Does not catch non-lowercase minimized attributes, like CHECKED. while ($line =~ / (?:^\s*|(?<=[\w\"\'])\s+) ( ($All_Attrs) = (.\S?) # Would like to see ['"], possibly backslashed. ) /giox) { my ($pos, $att, $q) = (pos($line) - length($1), $2, $3); if ($att ne lc($att)) { push(@errors, { col => $this->_pos($line, $pos), attr => $att, type => $type, mesg => 'non-lowercase attribute', }, ); } if (my $tq = $this->{_quote}) { my $pos = $this->_pos($line, $pos + length($att) + 1); if ($q =~ /\\?([\"\'])/o) { if ($1 ne $tq) { push(@errors, { col => $pos, type => 'W', attr => $att, mesg => "quote attribute values with $tq", }, ); } } else { push(@errors, { col => $pos, attr => $att, type => 'W', mesg => 'unquoted value', }, ); } } } # --- # Optimization. return @errors unless $line =~ /$Tag_Start\w../o; # --- foreach my $re (@Int_Attrs) { my $msg = 'value should be an integer: "%s"'; while ($line =~ /$re/g) { my ($m, $el, $att, $q, $val) = ($1, $2, $3, $4, $5); my $lel = lc($el); my $latt = lc($att); if ($val !~ /^\d+$/o && $val !~ /[\\\$\(\[]/o # bogus protection ) { # Special case: img->border only in HTML 4 next if ($this->{_xhtml} && $lel eq 'img' && $latt eq 'border'); push(@errors, { col => $this->_pos($line, pos($line) - length($m)), type => 'E', mesg => sprintf($msg, $val), elem => $el, attr => $att, }, ); } } } # --- foreach my $re (@Length_Attrs) { my $msg = 'value should be an integer or a percentage: "%s"'; while ($line =~ /$re/g) { my ($m, $el, $att, $q, $val) = ($1, $2, $3, $4, $5); if ($val !~ /^\d+%?$/o && $val !~ /[\\\$\(\[]/o # bogus protection ) { push(@errors, { col => $this->_pos($line, pos($line) - length($m)), type => 'E', mesg => sprintf($msg, $val), elem => $el, attr => $att, }, ); } } } # --- foreach (@Fixed_Attrs) { my ($re, $vre, $vals) = @$_; $vre = $this->{_xml} ? qr/$vre/ : qr/$vre/i; my $msg = 'invalid value: "%s", should be %s"%s"'; while ($line =~ /$re/g) { my ($m, $el, $att, $q, $val) = ($1, $2, $3, $4, $5); if ($val !~ $vre && $val !~ /[\\\$\(\[]/o # bogus protection ) { my $latt = lc($att); my $lel = lc($el); # Special case: html->xmlns and pre,script,style->xml:space XHTML-only next if (! $this->{_xhtml} && (($lel eq 'html' && $latt eq 'xmlns') || ($latt eq 'xml:space' && $lel =~ /^(pre|s(cript|tyle))$/o))); push(@errors, { col => $this->_pos($line, pos($line) - length($m)), type => 'E', mesg => sprintf($msg, $val, ($vals =~ /\|/o) ? 'one of ' : '', $vals), elem => $el, attr => $att, }, ); } } } # --- # # Note that minimized attributes are forbidden only in XHTML, but it # is legal to have them in HTML too. # # Not doing this check inside <>'s would result in too much bogus. # if ($this->{_min_attrs}) { while ($line =~ / < $Non_Tag_End+? \s ( ($Min_Attrs) ([=\s]|$Tag_End) ) /giox) { my ($m, $attr, $eq) = ($1, $2, $3); if ($eq ne '=') { push(@errors, { col => $this->_pos($line, pos($line) - length($m)), attr => $attr, type => $type, mesg => 'minimized attribute', }, ); } } } # --- while (my ($attr, $re) = each(%Req_Attrs)) { my $msg = 'missing required attribute: "%s"'; # Parens: 1: for pos(), 2:element, 3: attribute (or undef if not found) while ($line =~ /$re/g) { my ($m, $el, $att) = ($1, $2, $3); if (! $att) { my $lel = lc($el); # Special case: @name not required for input/@type's submit and reset next if ($lel eq 'input' && $attr eq 'name' && # TODO: this is crap $line =~ /\stype=(\\?[\"\'])?(submi|rese)t\b/io); # Special case: map/@id required only in XHTML 1.0+ next if ($lel eq 'map' && $attr eq 'id' && ! $this->{_xhtml}); push(@errors, { col => $this->_pos($line, pos($line) - length($m)), type => 'E', mesg => sprintf($msg, $attr), elem => $el, }, ); } } } return @errors; } # ----- Check: MIME types --------------------------------------------------- # sub mime_types { return shift->_wrap('_mime_types', @_); } sub _mime_types { my ($this, $line) = @_; return () unless $this->{_html}; # Optimization. "_pos($line, pos($line) - length($m)); if ($mtype !~ $MIME_Type) { push(@errors, { col => $pos, type => 'E', elem => $elem, attr => $attr, mesg => sprintf($msg, $mtype), }, ); } elsif (lc($elem) eq 'script' && $mtype =~ /(ecm|jav)ascript/io && lc($mtype) !~ '^application/(ecm|jav)ascript$') { push(@errors, { col => $pos, type => 'W', elem => $elem, attr => $attr, mesg => sprintf($jsmsg, $mtype), }, ); } } } return @errors; } # ----- Check: elements ----------------------------------------------------- # sub elements { return shift->_wrap('_elements', @_); } sub _elements { my ($this, $line) = @_; return () unless $this->{_html}; my @errors = (); # --- my $type = $this->{_xhtml} ? 'E' : 'W'; my $msg = 'non-lowercase element%s'; while ($line =~ / < (\/?) ( ($All_Elems) (\s|$Tag_End|\Z) # \Z) because $) would screw my indentation :) ) /giox) { my ($slash, $pos, $elem) = ($1, pos($line) - length($2), $3); if ($elem ne lc($elem)) { push(@errors, { col => $this->_pos($line, $pos), type => $type, elem => $elem, mesg => sprintf($msg, ($slash ? ' end' : '')), }, ); } } # --- $msg = 'missing end tag'; while ($line =~ / < ( ($End_Omit) .*? $Tag_End [^<]* < (.?) ($End_Omit) ) /giox) { my ($m, $start, $slash, $end) = ($1, $2, $3, $4); if ((lc($start) eq lc($end) && $slash ne '/') || # TODO: this needs tuning. See t/002endtag.t, line 6. (lc($start) ne lc($end))) { push(@errors, { col => $this->_pos($line, pos($line) - length($m)), mesg => $msg, elem => $start, type => 'W', }, ); } } # --- # We also allow a backslashed "/", they're common in eg. Perl regexps. # Consider # $foo =~ s/bar/baz
/; while ($line =~ / < # TODO: Do we really need to see a known ($All_Elems) # element here? .*? (\s?\\?\/?($Tag_End)) /giox) { my ($el, $end, $m) = ($1, $2); my $pos = $this->_pos($line, pos($line) - length($3)); if ($end =~ m|/>$|o) { if ($this->{_xhtml} && $el !~ /^$Compat_Elems$/io && # These don't apply here, see later. $end !~ m|\s\\?/|o) { push(@errors, { col => $pos, type => 'W', mesg => 'use space before "/>" for compatibility', elem => $el, }, ); } elsif (! $this->{_xml} && $end =~ m|/>$|o) { push(@errors, { col => $pos, type => 'E', mesg => 'element end "/>" is allowed in X(HT)ML only', elem => $el, }, ); } } } # --- # Check for missing " />". if ($this->{_xhtml}) { while ($line =~ / < ($Min_Elems) .*? (\/?$Tag_End) /giox) { my ($el, $end) = ($1, $2); if ($end ne '/>') { push(@errors, { col => $this->_pos($line, pos($line) - length($end)), elem => $el, mesg => 'missing " />"', type => 'E', }, ); } } while ($line =~ / < ($Compat_Elems) .*? (\s?.?$Tag_End) /giox) { my ($el, $end) = ($1, $2); $msg = 'use "<%s>" instead of <%s for compatibility'; if ($end =~ m|(\s?/>)$|o) { my $e = lc($el); push(@errors, { col => $this->_pos($line, pos($line) - length($end)), elem => $el, mesg => sprintf($msg, $e, $e, $e . $1), type => 'W', }, ); } } } return @errors; } # ----- Check: entities ----------------------------------------------------- # # Check for unterminated entities in URIs (usually & instead of &). sub entities { return shift->_wrap('_entities', @_);} sub _entities { my ($this, $line) = @_; return () unless $this->{_html}; # Optimization. "src=&" is the shortest we know of. return () unless $line =~ /\w{3}=./; my @errors = (); my $msg = 'unterminated entity: %s'; while ($line =~ / (?:^|\s) ($URI_Attrs) = ( (.+?) (?: (? $this->_pos($line, $pos + pos($val) - length($2) - 1), type => 'E', mesg => sprintf($msg, $1), attr => $attr, }, ); } } return @errors; } # ----- Check: DOCTYPE ------------------------------------------------------ # # Check for doctype declaration errors. sub doctype { return shift->_wrap('_doctype', @_); } sub _doctype { my ($this, $line) = @_; my @errors = (); while ($line =~ /)/gio) { my ($pos, $dt, $rest) = (pos($line) - length($1), $2, $3); if ($dt ne "DOCTYPE") { push(@errors, { col => $this->_pos($line, $pos), type => 'E', mesg => "DOCTYPE must be uppercase: $dt", }, ); $pos = pos($line) - length($rest) - 1; if ($this->{_html} && (my ($p1, $html, $t) = ($rest =~ /^((html)\s+)(\w+)?/io))) { # TODO: better message, maybe this should not be XHTML-only. if ($this->{_xhtml} && $html ne 'html') { my $msg = "\"html\" in DOCTYPE should be lowercase in XHTML: $html"; push(@errors, { col => $this->_pos($line, $pos), type => 'W', mesg => $msg, }, ); } $pos += length($p1); if ($t =~ /^(PUBLIC|SYSTEM)$/io) { if ($t ne uc($t)) { my $msg = uc($t) . " must be uppercase: \"$t\""; push(@errors, { col => $this->_pos($line, $pos), type => 'E', mesg => $msg, }, ); if ($this->{_xml} && uc($t) eq 'PUBLIC') { # TODO: In XML, you can't declare public ID without # system ID. Check this. } } } else { my $msg = "PUBLIC or SYSTEM should follow root element name: \"$t\""; push(@errors, { col => $this->_pos($line, $pos), type => 'W', mesg => $msg, }, ); } } } } return @errors; } # ---------- Accessors and mutators ----------------------------------------- # sub mode { my ($this, $mode) = @_; if ($mode) { my $was_xml = $this->{_xml}; if ($mode eq 'HTML') { $this->{_xhtml} = 0; $this->{_xml} = 0; $this->{_html} = 1; $this->{_mode} = $mode; } elsif ($mode eq 'XML') { $this->{_xhtml} = 0; $this->{_xml} = 1; $this->{_html} = 0; $this->{_mode} = $mode; $this->quote('"') unless $was_xml; } elsif ($mode eq 'XHTML') { $this->{_xhtml} = 1; $this->{_xml} = 1; $this->{_html} = 1; $this->{_mode} = $mode; $this->quote('"') unless $was_xml; } else { carp("** Mode must be one of XHTML, HTML, XML (resetting to XHTML)"); $this->mode('XHTML'); } } return $this->{_mode}; } sub tab_width { my ($this, $tw) = @_; if (defined($tw)) { if ($tw > 0) { $this->{_tab_width} = sprintf("%.0f", $tw); # Uh. Integers please. } else { carp("** TAB width must be > 0"); } } return $this->{_tab_width}; } sub min_attributes { my ($this, $minattr) = @_; if (defined($minattr)) { if (! $minattr && $this->{_xml}) { carp("** Will not disable minimized attribute checks in " . $this->mode() . " mode"); } else { $this->{_min_attrs} = $minattr; } } return $this->{_min_attrs}; } sub stats { my $this = shift; return ($this->{_num_errors}, $this->{_num_warnings}); } sub reset { my $this = shift; my ($e, $w) = $this->stats(); $this->{_num_errors} = 0; $this->{_num_warnings} = 0; return ($e, $w); } sub quote { my ($this, $q) = @_; if (defined($q)) { # We always allow " and ', and empty when non-xml, refuse others. my $is_ok = ($q eq '"' || $q eq "'" ); $is_ok ||= (! $this->{_xml} && ! length($q)); if ($is_ok) { $this->{_quote} = $q; } else { carp("** Refusing to set quote to ", ($q || '[none]'), " when in " . $this->mode() . " mode"); } } return $this->{_quote}; } sub full_version { return "HTML::GMUCK $VERSION"; } # ---------- Utility methods ------------------------------------------------ # sub _pos { my ($this, $line, $pos) = @_; $pos = 0 unless (defined($pos) && $pos > 0); if ($this->{_tab_width} > 1 && $pos > 0) { my $pre = substr($line, 0, $pos); while ($pre =~ /\t/g) { $pos += $this->{_tab_width} - 1; } } return $pos; } sub _wrap { my ($this, $method, @lines) = @_; my @errors = (); my $ln = 0; for (my $ln = 0; $ln < scalar(@lines); $ln++) { foreach my $err ($this->$method($lines[$ln])) { $err->{line} = $ln; if (! $err->{mesg}) { $err->{mesg} = "no error message, looks like you found a bug"; carp("** " . ucfirst($err->{mesg})); } $err->{col} ||= 0; if (! $err->{type}) { carp("** No error type, looks like you found a bug"); $err->{type} = '?'; } push(@errors, $err); if ($err->{type} eq 'W') { $this->{_num_warnings}++; } else { $this->{_num_errors}++; } } } return @errors; } 1;