# $Id: /mirror/perl/Web-Scraper-Config/trunk/lib/Web/Scraper/Config.pm 7145 2007-05-09T16:36:57.901467Z daisuke $ # # Copyright (c) 2007 Daisuke Maki package Web::Scraper::Config; use strict; use warnings; use Config::Any; use Data::Visitor::Callback; use Web::Scraper; use URI; our $VERSION = '0.01'; sub new { my $class = shift; my $self = bless {}, $class; my $config = shift; my $opts = shift; $config = $self->_load_config($config); { # BROKEN YAML::Syck my $v = Data::Visitor::Callback->new( plain_value => sub { $_[1] =~ s/(\w:) /$1/g; $_[1] } ); $config = $v->visit($config); } $self->{config} = $config; $self->{callbacks} = $opts->{callbacks} if $opts && $opts->{callbacks}; return $self; } sub _load_config { my $self = shift; my $file = shift; if (ref $file eq 'HASH') { return $file; } else { # This is a bit hackish, but we're only loading one file, so # we should be okay my $list = Config::Any->load_files({ files => [ $file ]}); if (! @$list ) { require Carp; Carp::croak("Could not load config file $file: $@"); } return (values %{$list->[0]})[0]; } } sub scrape { my $self = shift; my $config = $self->{config}; my $scraper = $self->_recurse($config)->(); return $scraper->scrape(@_); } sub _recurse { my ($self, $rules) = @_; my $ref = ref($rules); my $ret; if (! $ref) { if ($rules =~ /^__callback\(([^\)]+)\)__$/) { $rules = $self->{callbacks}{$1}; } $ret = $rules; } elsif ($ref eq 'ARRAY') { my @elements; foreach my $rule (@$rules) { if ($rule =~ /^__callback\(([^\)]+)\)__$/) { $rule = $self->{callbacks}{$1}; push @elements, sub { sub { $rule->(@_) } }; } else { push @elements, ref $rule ? $self->_recurse($rule) : $rule; } } if (! grep { my $ref = ref($_); $ref ? $ref ne 'CODE' : 1 } @elements) { $ret = sub { foreach my $code (@elements) { $code->() } }; } else { $ret = \@elements; } } elsif ($ref eq 'HASH'){ my($op) = keys %$rules; my $h = $self->_recurse($rules->{$op}); my $is_func = ($op =~ /^(?:scraper|process(?:_first)?|result)$/); if ($is_func) { my @args = (ref $h eq 'ARRAY') ? @$h : ($h); if ($op eq 'scraper') { $ret = sub { scraper(sub { for (@args) { $_->() } }) }; } else { $ret = sub { my $code = sub { @_ = map { (ref $_ eq 'CODE') ? $_->() : $_ }@args; goto &$op; }; $code->() }; } } else { $ret = { $op => $h }; } } else { require Data::Dumper; die "Web::Scraper::Config does not know how to parse: " . Data::Dumper::Dumper($rules); } return $ret; } 1; __END__ =head1 NAME Web::Scraper::Config - Run Web::Scraper From Config Files =head1 SYNOPSIS --- scraper: - process: - td>ul>li - trailers[] - scraper: - process_first: - li>b - title - TEXT - process_first: - ul>li>a[href] - url - @href - process: - ul>li>ul>li>a - movies[] - __callback(process_movie)__ my $scraper = Web::Scraper::Config->new( $config, { callbacks => { process_movie => sub { my $elem = shift; return { text => $elem->as_text, href => $elem->attr('href') } } } } ); $scraper->scrape($uri); =head1 DESCRIPTION Web::Scraper::Config allows you to harness the power of Web::Scraper from a config file. The config files can be written in any format that Config::Any understands, as long as it conforms to this module's rules. =head1 METHODS =head2 new Creates a new Web::Scraper::Config instance. The first arguments is either a hashref that represents a config, or a filename to the config. The config file can be in any format that Config::Any understands as long as it returns a hash that's conformant to the Web::Scraper::Config rules. The second argument (options) is optional, and is currently only used to provider callbacks to be called from the scraper. When Web::Scraper::Config encounters an element in the form of: __callback(function_name)__ then that is replaced by the corresponding callback specified in the options hash. =head2 scrape Starts scraping. The semantics are exactly the same as Web::Scraper::scrape =head1 AUTHOR Daisuke Maki Edaisuke@endeworks.jpE =head1 LICENSE This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. See http://www.perl.com/perl/misc/Artistic.html =cut