# TableContentParser
# A package to parse the contents of HTML tables.
# (C) 2002 Simon Drabble
#
# $Id: TableContentParser.pm,v 1.7 2002/07/14 15:00:43 simon Exp $

package HTML::TableContentParser;

use strict;
use warnings;

use HTML::Parser;

our @ISA     = qw(HTML::Parser);
our $VERSION = 0.13;
our $DEBUG   = 0;


# start($self, $tag, $attr, $attrseq, $origtext)
#
# Start-tag callback. For the table-related tags (table, th, tr, td,
# caption) the attribute hash supplied by the parser becomes the stored
# record for that element and is linked into the structure being built.
# Any other start tag is appended verbatim to the 'data' of the element
# currently being collected, if there is one.
sub start {
    my ($self, $tag, $attr, $attrseq, $origtext) = @_;

    my $store = $self->{STORE} ||= {};
    $tag = lc $tag;

    if ($tag eq 'table') {
        push @{ $store->{tables} }, $attr;
        $store->{current_table} = $attr;
    }
    elsif ($tag eq 'th') {
        # Header cells are collected per table, not per row.
        push @{ $store->{current_table}->{headers} }, $attr;
        $store->{current_header}  = $attr;
        $store->{current_element} = $attr;
    }
    elsif ($tag eq 'tr') {
        push @{ $store->{current_table}->{rows} }, $attr;
        $store->{current_row}     = $attr;
        $store->{current_element} = $attr;
    }
    elsif ($tag eq 'td') {
        push @{ $store->{current_row}->{cells} }, $attr;
        $store->{current_data_cell} = $attr;
        $store->{current_element}   = $attr;
    }
    elsif ($tag eq 'caption') {
        $store->{current_table}->{caption} = $attr;
        $store->{current_element}          = $attr;
    }
    else {
        # Not a table-related tag: keep its markup as part of the content
        # of whichever element is currently open.
        my $elem = $store->{current_element};
        if ($elem) {
            $self->debug('TEXT(tag) = ', $origtext) if $DEBUG;
            $elem->{data} .= $origtext;
        }
    }

    $self->debug($origtext) if $DEBUG;
}


# text($self, $text)
#
# Text callback. Appends $text to the 'data' of the element currently
# being collected; text seen while no element is open is discarded.
sub text {
    my ($self, $text) = @_;

    my $elem = $self->{STORE}->{current_element};
    return if !$elem;

    $self->debug('TEXT = ', $text) if $DEBUG;
    $elem->{data} .= $text;
}


# end($self, $tag, $origtext)
#
# End-tag callback. Closing a table-related tag clears the "current"
# slots that belong to it; any other end tag is appended verbatim to the
# 'data' of the element currently being collected, if there is one.
sub end {
    my ($self, $tag, $origtext) = @_;

    my $store = $self->{STORE} ||= {};
    $tag = lc $tag;

    if ($tag eq 'table') {
        @{$store}{qw(current_table current_row current_data_cell
                     current_header current_element)} = ();
    }
    elsif ($tag eq 'tr') {
        @{$store}{qw(current_row current_data_cell
                     current_header current_element)} = ();
    }
    elsif ($tag eq 'th' or $tag eq 'td') {
        # Closing a cell must leave current_row alone: clearing it on
        # </th> made every later <td> in the same row attach to a row
        # record that is not part of the table, losing those cells.
        @{$store}{qw(current_data_cell current_header current_element)} = ();
    }
    elsif ($tag eq 'caption') {
        $store->{current_element} = undef;
    }
    else {
        # Not a table-related tag: keep its markup as part of the content
        # of whichever element is currently open.
        my $elem = $store->{current_element};
        if ($elem) {
            $self->debug('TEXT(tag) = ', $origtext) if $DEBUG;
            $elem->{data} .= $origtext;
        }
    }

    $self->debug($origtext) if $DEBUG;
}


# parse($self, $data)
#
# Parses the HTML in $data and returns an array reference holding one
# hash per table encountered (undef if no table was seen). Results from
# any earlier call on the same object are discarded first.
sub parse {
    my ($self, $data) = @_;

    # Start from a clean store, making sure these keys exist.
    $self->{STORE} = {
        current_data_cell => undef,
        current_row       => undef,
        current_table     => undef,
    };

    $self->SUPER::parse($data);

    # Signal end of document so that any text the parser is still holding
    # back is delivered to text(), and the parser is ready for reuse.
    $self->eof;

    return $self->{STORE}->{tables};
}


# debug($self, @message)
#
# Emits the concatenated @message via warn(), prefixed with the class
# of the invoking object.
sub debug {
    my ($self, @message) = @_;
    my $class = ref $self;
    warn "$class: ", join('', @message), "\n";
}


1;


__END__

=head1 NAME

HTML::TableContentParser - Do interesting things with the contents of tables.

=head1 SYNOPSIS

  use HTML::TableContentParser;
  $p = HTML::TableContentParser->new();
  $tables = $p->parse($html);

=head1 DESCRIPTION

This package pulls out the contents of a table from a string containing HTML.
Each time a table is encountered, data will be stored in an array consisting
of a hash of whatever was discovered about the table -- id, name, border,
cellspacing etc, and of course data contained within the table.

The format of each hash will look something like

  attributes        keys from the attributes of the <table> tag
  headers           array of table headers (<th>), in order found
  rows              array of rows (<tr>) discovered, in order

If the table has a caption, this will be provided as

  caption           keys from the <caption> tag's attributes
    data            the text of the <caption> element

then for each table row,

  cells             array of <td>'s found, in order
  other attributes  the attributes of the <tr> tag

then for each data cell,

  data              what comes between <td> and </td>
  other attributes  the attributes of the <td> tag

Each header has the same layout as a data cell, built from the <th> tag.

=head2 EXAMPLE

  use HTML::TableContentParser;
  $p = HTML::TableContentParser->new();
  $html = read_html_from_somewhere();
  $tables = $p->parse($html);
  for $t (@$tables) {
    for $r (@{$t->{rows}}) {
      print "Row: ";
      for $c (@{$r->{cells}}) {
        print "[$c->{data}] ";
      }
      print "\n";
    }
  }

=head1 METHODS

=over 4

=item start($parser, $tag, $attr, $attrseq, $origtext);

Called whenever a particular start tag has been recognised. This is called
automatically by the parser and should not be called from the application.

=item text($parser, $content);

Called whenever a piece of content is encountered. This is called
automatically by the parser and should not be called from the application.

=item end($parser, $tag, $origtext);

Called whenever a particular end tag is encountered. This is called
automatically by the parser and should not be called from the application.

=item $tables_ref = $p->parse($html);

Called with the HTML to parse. This is all the application needs to do.
The return value will be an arrayref containing each table encountered, in
the format detailed above.

=item DEBUG

Not a method, but a class variable. Set to 1 to cause debugging output
(basically the structure and content of the table) to be sent to STDERR
via warn().

=back

=head2 EXPORTS

Nothing.

=head2 CAVEATS, BUGS, and TODO

=over 4

=item *

Nested tables are not tracked. A table found inside another table is
returned as a separate entry, and closing it ends collection for the
enclosing table as well.

=item *

Header cells (<th>) are stored in the table's C<headers> list only; they
do not appear in the C<cells> of the row that contains them.

=item *

Each call to parse() discards the results of the previous call, so the
complete document must be passed in a single call.

=back

=head1 AUTHOR

Simon Drabble  E<lt>sdrabble@cpan.orgE<gt>

(C) 2002 Simon Drabble

This software is released under the same terms as perl.

=cut