package WWW::Mechanize::Plugin::DOM; # DOM is in a separate module from JavaScript because other scripting # languages may use DOM as well. Anyone have time to implement Acme::Chef # bindings for Mech? :-) $VERSION = '0.014'; use 5.006; use strict; use warnings; no warnings qw 'utf8 parenthesis bareword'; use Encode qw'encode decode'; use Hash::Util::FieldHash::Compat 'fieldhash'; use HTML::DOM 0.021; use HTTP::Headers::Util 'split_header_words'; use Scalar::Util 'weaken'; no URI(); no WWW::Mechanize (); no WWW::Mechanize::Plugin::DOM::Window (); fieldhash my %parathia; # keyed by mech fieldhash my %mech_per_frame; # keyed by (i)frame element sub init { # expected to return a plugin object that the mech object will # use to communicate with the plugin. my ($package, $mech) = @_; my $self = bless { script_handlers => {}, event_attr_handlers => {}, s => 1, # scriptable mech => $mech, }, $package; weaken $self->{mech}; $mech->set_my_handler( parse_html => \&_parse_html ); $mech->set_my_handler( get_content => sub { shift; my $mech = shift; $mech->is_html or return; my $stuff = (my $self = $mech->plugin('DOM')) ->tree->innerHTML; defined $$self{charset} ? encode $$self{charset}, $stuff : $stuff; } ); $mech->set_my_handler( get_text_content => sub { shift; my $mech = shift; $mech->is_html or return; my $stuff = (my $self = $mech->plugin('DOM')) ->tree->documentElement->as_text; defined $$self{charset} ? encode $$self{charset}, $stuff : $stuff; } ); $mech->set_my_handler( extract_forms => sub { shift; shift->plugin('DOM')->tree->forms } ); $mech->set_my_handler( extract_links => sub { shift; tie my @links, WWW'Mechanize'Plugin'DOM'Links:: => scalar shift->plugin('DOM')->tree->links ;\@links; }); $mech->set_my_handler( extract_images => sub { shift; my $doc = shift->plugin('DOM')->tree; my $list = HTML::DOM::NodeList::Magic->new( sub { grep tag $_ =~ /^i(?:mg|nput)\z/, $doc->descendants }, $doc ); tie my @images, WWW'Mechanize'Plugin'DOM'Images:: => $list; ;\@images; }); $self; } sub _parse_html { my (undef,$mech,undef,$src) = @_; weaken $mech; my $self = $mech->plugin('DOM'); weaken $self; $$self{tree} = my $tree = new HTML::DOM response => $mech->response, cookie_jar => $mech->cookie_jar; $tree->error_handler(sub{$mech->warn($@)}); $tree->default_event_handler_for( link => sub { $mech->get(shift->target->href) }); $tree->default_event_handler_for( submit => sub { $mech->request(shift->target->make_request); }); if(%{$$self{script_handlers}} || %{$$self{event_attr_handlers}}) { my $script_type = $mech->response->header( 'Content-Script-Type'); defined $script_type or $tree->elem_handler(meta => sub { my($tree, $elem) = @_; return unless lc $elem->attr('http-equiv') eq 'content-script-type'; $script_type = $elem->attr('content'); }); if(%{$$self{script_handlers}}) { $tree->elem_handler(script => sub { return unless $self->{s}; my($tree, $elem) = @_; my $lang = $elem->attr('type'); defined $lang or $lang = $elem->attr('language'); defined $lang or $lang = $script_type; my $uri; my($inline, $code, $line) = 0; if($uri = $elem->attr('src')) { my $clone = $mech->clone->clear_history(1); my $base = $mech->base; $uri = URI->new_abs( $uri, $base ) if $base; my $res = $clone->get($uri); $res->is_success or $mech->warn("couldn't get script $uri: " . $res->status_line ); # Find out the encoding: my $cs = { map @$_, split_header_words $res->header( 'Content-Type' ) }->{charset}; $code = decode $cs||$elem->charset ||$tree->charset||'latin1', $res->decoded_content(charset=>'none'); $line = 1; } else { $code = $elem->firstChild->data; ++$inline; $uri = $mech->uri; $line = _line_no( $src,$elem->content_offset ); }; SCRIPT_HANDLER: { if(defined $lang) { while(my($lang_re,$handler) = each %{$$self{script_handlers}}) { next if $lang_re eq 'default'; $lang =~ $lang_re and &$handler($mech, $tree, $code, $uri, $line, $inline), # reset iterator: keys %{$$self{script_handlers}}, last SCRIPT_HANDLER; }} # end of if-while &{ $$self{script_handlers}{default} || return }($mech,$tree, $code, $uri, $line, $inline); } # end of S_H }); $tree->elem_handler(noscript => sub { return unless $self->{s}; $_[1]->detach#->delete; # ~~~ delete currently stops it from work- # ing; I need to looook into this. }); } if(%{$$self{event_attr_handlers}}) { $tree->event_attr_handler(sub { return unless $self->{s}; my($elem, $event, $code, $offset) = @_; my $lang = $elem->attr('language'); defined $lang or $lang = $script_type; my $uri = $mech->uri; my $line = defined $offset ? _line_no( $src, $offset ) : undef; HANDLER: { if(defined $lang) { while(my($lang_re,$handler) = each %{$$self{event_attr_handlers}}) { next if $lang_re eq 'default'; $lang =~ $lang_re and &$handler($mech, $elem, $event,$code,$uri,$line), # reset the hash iterator: keys %{$$self{event_attr_handlers}}, last HANDLER; }} # end of if-while &{ $$self{event_attr_handlers}{default} || return }( $mech,$elem,$event,$code,$uri,$line ); } # end of HANDLER }); } } # ~~~ Should we use the content of