# $Id: /mirror/gungho/lib/Gungho/Component/RobotRules.pm 31300 2007-11-29T11:51:03.339151Z lestrrat $
#
# Copyright (c) 2007 Daisuke Maki

package Gungho::Component::RobotRules;
use strict;
use warnings;
use base qw(Gungho::Component);
use Gungho::Component::RobotRules::Rule;
use WWW::RobotRules::Parser;

# Class-level state:
#   pending_robots_txt  - initialised to an empty hash in setup()
#   pending_count       - number of requests currently parked while waiting
#                         for a robots.txt response (used for logging)
#   robot_rules_parser  - object used to parse robots.txt bodies
#   robot_rules_storage - object used to store rules and parked requests
__PACKAGE__->mk_classdata($_)
    for qw(pending_robots_txt robot_rules_parser robot_rules_storage pending_count);

# Initialise the component state, build the storage and parser objects,
# then hand control to the next setup() in the method chain.
sub setup
{
    my $c = shift;

    $c->pending_robots_txt({});
    $c->pending_count(0);
    $c->setup_robot_rules_storage();
    $c->setup_robot_rules_parser();

    $c->next::method(@_);
}

# Gatekeeper in front of the real send_request().
#
# Requests for /robots.txt itself are always let through.  Any other
# request is checked with allowed(); it is forwarded to the next
# send_request() only when the rules permit it.
#
# Returns whatever the next send_request() returns when the request is
# forwarded, and 0 when it was held back (disallowed, or parked until the
# robots.txt for its host has been fetched).
sub send_request
{
    my ($c, $request) = @_;

    my $allowed =
        $request->uri->path eq '/robots.txt' || $c->allowed($request);

    # A rule may report "not allowed" as '' or undef.  Normalise every false
    # value to a numeric 0 so that the numeric comparisons below do not emit
    # "isn't numeric" / "uninitialized" warnings.
    $allowed ||= 0;

    if ($allowed == -2) {
        $c->log->debug(
            "[ROBOT RULES] Fetch for /robots.txt already scheduled for "
            . $request->original_uri->host_port
        );
    }
    elsif ($allowed == -1) {
        $c->log->debug(
            "[ROBOT RULES] No robot rules found for "
            . $request->original_uri->host_port
            . ", going to fetch one"
        );
    }
    elsif ($allowed) {
        return $c->next::method($request);
    }
    else {
        $c->log->debug(
            "[ROBOT RULES] " . $request->uri . " is disallowed by robot rules"
        );
    }

    return 0;
}

# Decide whether $request may be fetched.
#
# Returns:
#   the result of the stored rule's allowed() when a rule is available,
#   -1 when no rule is available and a /robots.txt fetch was just queued,
#   -2 when no rule is available and the storage reported 0 from
#      push_pending_robots_txt() (treated as "fetch already scheduled").
#
# In both negative cases the request has been handed to the storage as a
# pending request.
sub allowed
{
    my ($c, $request) = @_;

    my $rule = $c->robot_rules_storage->get_rule($c, $request);
    if ($rule) {
        return $rule->allowed($c, $request->uri);
    }

    if ($c->push_pending_robots_txt($request) == 0) {
        return -2;
    }

    # Work on a copy: the URI is rewritten in place below, and the pending
    # request must keep pointing at its own location.
    # NOTE(review): original_uri may already return a fresh object; the
    # extra clone is harmless in that case.
    my $uri = $request->original_uri->clone;
    $uri->path('/robots.txt');
    $uri->query(undef);
    $uri->fragment(undef);

    my $req = Gungho::Request->new(GET => $uri);
    # Mark the request so handle_response() can recognise it as ours.
    $req->notes('auto_robot_rules' => 1);
    $c->pushback_request($req);

    return -1;
}

# Intercept responses to the /robots.txt requests generated by allowed():
# parse and store the rules, then release the requests that were waiting
# for them.  Every other response is passed down the method chain.
sub handle_response
{
    my $c = shift;
    my ($request, $response) = @_;

    if (   $request->uri->path eq '/robots.txt'
        && $request->notes('auto_robot_rules'))
    {
        $c->log->debug("Handling robots.txt response for " . $request->uri);
        $c->parse_robot_rules($request, $response);
        $c->dispatch_pending_robots_txt($request);
        return;
    }

    $c->next::method(@_);
}

# Park $request until the robots.txt for its host is available.
# Bumps the pending counter and returns whatever the storage's
# push_pending_robots_txt() returns (allowed() treats 0 as "a fetch is
# already scheduled").
sub push_pending_robots_txt
{
    my ($c, $request) = @_;

    $c->pending_count( $c->pending_count + 1 );
    $c->log->debug(
        "[ROBOT RULES]: Requests still pending: " . $c->pending_count
    );

    return $c->robot_rules_storage->push_pending_robots_txt($c, $request);
}

# Re-queue every request that the storage reports as pending for the
# robots.txt $request, decrementing the pending counter for each one.
sub dispatch_pending_robots_txt
{
    my ($c, $request) = @_;

    my $pending =
        $c->robot_rules_storage->get_pending_robots_txt($c, $request);

    if ($pending && ref $pending eq 'HASH') {
        foreach my $pending_request (values %$pending) {
            $c->pending_count( $c->pending_count - 1 );
            $c->pushback_request($pending_request);
        }
    }

    $c->log->debug(
        "[ROBOT RULES]: Requests still pending: " . $c->pending_count
    );
}

# Instantiate the rule storage from config->{robotrules}{storage}.
# The module defaults to 'DB_File' and is resolved under the
# 'Component::RobotRules::Storage' namespace.  The whole storage section
# is passed to the constructor as a flattened hash.
sub setup_robot_rules_storage
{
    my $c = shift;

    my $config = $c->config->{robotrules}{storage} || {};
    my $pkg    = $config->{module} || 'DB_File';

    $pkg = $c->load_gungho_module($pkg, 'Component::RobotRules::Storage');

    my $storage = $pkg->new(%$config);
    $storage->setup($c);
    $c->robot_rules_storage($storage);
}

# Instantiate the robots.txt parser from config->{robotrules}{parser}.
# The module defaults to '+WWW::RobotRules::Parser'; the parser section is
# passed to the constructor as a single hash reference.
sub setup_robot_rules_parser
{
    my $c = shift;

    my $config = $c->config->{robotrules}{parser} || {};
    my $pkg    = $config->{module} || '+WWW::RobotRules::Parser';

    $pkg = $c->load_gungho_module($pkg, 'Component');

    my $parser = $pkg->new($config);
    $c->robot_rules_parser($parser);
}

# Parse the body of a robots.txt response and store the resulting rule.
# An unsuccessful or empty response produces a rule built from an empty
# hash, so something is stored for the request either way.
sub parse_robot_rules
{
    my ($c, $request, $response) = @_;

    my $h =
        ($request && $response && $response->is_success && $response->content)
        ? $c->robot_rules_parser->parse(
              $request->original_uri, $response->content
          )
        : {};

    $c->log->debug(
        "Parse robot rules " . $request->uri . ": "
        . scalar(keys %$h) . " rules"
    );

    my $rule = Gungho::Component::RobotRules::Rule->new($h);
    $c->robot_rules_storage->put_rule($c, $request, $rule);
}

1;

=head1 NAME

Gungho::Component::RobotRules - Respect robots.txt

=head1 SYNOPSIS

  ---
  components:
    - RobotRules

=head1 METHODS

=head2 setup

=head2 setup_robot_rules_parser

=head2 setup_robot_rules_storage

=head2 handle_response

=head2 send_request

=head2 allowed($request)

Returns 1 if request is allowed to be fetched, 0 if not.
-1 and -2 are returned when there is a pending request to fetch /robots.txt:
-1 means a fetch for /robots.txt has just been scheduled, -2 means such a
fetch was already scheduled. In both cases the request is kept pending.

=head2 dispatch_pending_robots_txt

Dispatches requests that were pending because of a missing robots.txt entry

=head2 push_pending_robots_txt

Pushes a request in the wait queue for a robots.txt

=head2 parse_robot_rules

Parses the robot rule and stores it

=cut