The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
#!/usr/bin/perl

package CPAN::CachingProxy;

use strict;
use Carp;
use URI;
use Cache::File;
use Data::Dumper;
use LWP::UserAgent;
use Fcntl qw(:flock);
use Digest::SHA1 qw(sha1_hex);

our $VERSION = "1.6500";

# wget -O MIRRORED.BY http://www.cpan.org/MIRRORED.BY

# new {{{
sub new {
    my $class = shift;
    my $this  = bless {@_}, $class;

    unless( $this->{cgi} ) {
        require CGI or die $@;
        $this->{cgi} = new CGI;
    }

    unless( $this->{cache_object} ) {
        $this->{cache_root}       = "/tmp/ccp/"  unless exists $this->{cache_root};
        $this->{cache_root}       = "/tmp/ccp/"  unless exists $this->{cache_root};
        $this->{default_expire}   = "2 day"      unless exists $this->{default_expire};
        $this->{index_expire}     = "3 hour"     unless exists $this->{index_expire};
        $this->{error_expire}     = "15 minute"  unless exists $this->{error_expire};
        $this->{url_lockfile_dir} = "/tmp"       unless exists $this->{url_lockfile_dir};

        $this->{index_regexp}   = qr/(?:03modlist\.data|02packages\.details\.txt|01mailrc\.txt)/ unless exists $this->{index_regexp};
        $this->{cache_object}   = Cache::File->new(cache_root=>$this->{cache_root}, default_expires => $this->{default_expire} );
    }

    $this->{key_space} = "CK" unless $this->{key_space};

    unless( $this->{ua} ) {
        my $ua = $this->{ua} = new LWP::UserAgent;
           $ua->agent($this->{agent} ? $this->{agent} : "CCP/$VERSION (Paul's CPAN caching proxy / perlmonks-id=16186)");
           if( exists $this->{activity_timeout} ) {
               if( defined (my $at = $this->{activity_timeout}) ) {
                   $ua->timeout($at);
               }

           } else {
               $ua->timeout(12);
           }
    }

    $this->{ua}->timeout( $this->{activity_timeout} ) if defined $this->{activity_timeout};

    croak "there are no default mirrors, they must be set" unless $this->{mirrors};

    return $this;
}
# }}}
# run {{{
sub run {
    my $this   = shift;
    my $cgi    = $this->{cgi};
    my $mirror = $this->{mirrors}[ rand @{$this->{mirrors}} ];
    my $pinfo  = $cgi->path_info || return print $cgi->redirect( $cgi->url . "/" );

       $pinfo =~ s/^\///;
       $mirror=~ s/\/$//;

    my $CK = "$this->{key_space}:$pinfo";
    my $URL = "$mirror/$pinfo";
     # $URL =~ s/\/{2,}/\//g;

    if( $pinfo =~ s{^___/}{} ) {
        # NOTE: undocumented special case.  If the path begins with ___, it
        # probably came from a 404 handler.  in which case, the real pinfo was
        # probably an absolute url.  replace the entire path portion of our
        # mirror url with the non ___'d part of the pinfo.

        my $nurl = URI->new($mirror);
        $nurl->path($pinfo);
        # arguably we should use URI for all our path manips, but this section is new and the old stuff works fine
        $URL = "$nurl";
    }

    my $lockfile_fh;
    my $uld = $this->{url_lockfile_dir};
    if( $uld and -d $uld ) {
        for(glob("$uld/.CP_FILE*")) {
            open my $tlf, "<", $_ or next;
            next unless flock $tlf, (LOCK_NB|LOCK_EX);
            warn "[DEBUG] unlinking old URL-lockfile $_\n" if $this->{debug};
            unlink $_;
        }

        # NOTE: sha1 is not for security, as I think timing attacks on this
        # have little value really, aside from DoS, and then the local
        # attackers probably have better things to do.  These are simply here
        # because my /tmp is tmpfs, which has surprisingly low file name length
        # restrictions.
        my $converted = join("/", $uld, ".CP_FILE_" . sha1_hex($URL));

        warn "[DEBUG] locking $URL using $converted lockfile\n" if $this->{debug};
        open $lockfile_fh, ">", $converted or die "error opening lockfile for $URL: $!";
        flock $lockfile_fh, LOCK_EX or die "failed to lock lockfile for $URL: $!";
    }

    else {
        die "as of version 1.6, url_locking_dir is a required option.";
    }

    my $cache = $this->{cache_object};
    if( $cache->exists($CK) and $cache->exists("$CK.hdr") ) { our $VAR1;
        my $res = eval $cache->get( "$CK.hdr" ); die "problem finding cache entry\n" if $@;

        unless( $this->{ignore_last_modified} ) {
            if( my $lm = $res->header('last_modified') ) {
                my $_lm = eval { $this->{ua}->head($URL)->header('last_modified') };

                # $lm = "hehe, random failure time" if (int rand(7)) == 0;

                if( $_lm and $lm ne $_lm ) {
                    warn "[DEBUG] last_modified differs ($lm vs $_lm), forcing cache miss\n" if $this->{debug};
                    goto FORCE_CACHE_MISS;
                }
            }
        }

        my $start = $this->my_copy_hdr($res, "cache hit");

        # XXX: is it the right thing to do to close the lockfile here?
        # Probably.  At this point, we should have the whole file, and we sure
        # don't mind serving similtaneous requests, right?

        close $lockfile_fh;

        ###

        my $fh = $cache->handle( $CK, "<" ) or die "problem finding cache entry\n";
        my $buf;
        BUF: while( read $fh, $buf, 4096 ) {
            if( $start > 0 ) {
                if( $start > length $buf ) {
                    $start -= length $buf;
                    next BUF;

                } else {
                    substr $buf, 0, $start, "";
                    $start = 0;
                }
            }
            print $buf;
        }
        close $fh;

    } else {
        FORCE_CACHE_MISS:
        my $expire = $this->{default_expire};
           $expire = $this->{index_expire} if $pinfo =~ $this->{index_regexp};

        $cache->set($CK, 1, $expire ); # doesn't seem like we should have to do this, but apparently we do

        warn "[DEBUG] getting $URL\n" if $this->{debug};

        my $fh       = $cache->handle( $CK, ">", $expire );
        my $request  = HTTP::Request->new(GET => $URL);

        my $announced_header;
        my $response = $this->{ua}->request($request, sub {
            my $chunk = shift;

            unless( $announced_header ) {
                my $res = shift;
                $announced_header = 1;
                $this->my_copy_hdr($res, "cache miss");
            }

            print $fh $chunk;
            print     $chunk;
        });
        close $fh;

        unless( $response->is_success ) {
            my $my_fail = "FAIL: " . $response->status_line . "\n";
            $cache->set($CK => $my_fail, $expire);
            $response->header(content_length=>length $my_fail); # fix content length so we don't lie to clients

            $this->my_copy_hdr($response, "cache miss [fail]");
            print $my_fail;
        }

        warn "[DEBUG] setting $CK\n" if $this->{debug};
        $cache->set("$CK.hdr", Dumper($response), $expire);

        # if there was an error (which we don't know until ex post facto), go back and fix the expiry
        if( defined $this->{error_expire} and not $response->is_success ) {
            $cache->set_expiry( $CK       => $this->{error_expire} );
            $cache->set_expiry( "$CK.hdr" => $this->{error_expire} );
        }
    }
}
# }}}

# {{{ sub my_copy_hdr
sub my_copy_hdr {
    my ($this, $res, $hit) = @_;
    my $cgi = $this->{cgi};

    my $status = $res->status_line;
    warn "[DEBUG] cache status: $hit; status: $status\n" if $this->{debug};

    my %more_headers = (qw(accept_ranges bytes));

    for(qw(content_length), $this->{ignore_last_modified} ? ():(qw(last_modified))) {
        my $v = $res->header($_);

        if( $v ) {
            my $k = lc $_;
               $k =~ s/-/_/g;

            $more_headers{$k} = $v;
        }
    }

    my $start = 0;

    if( my $r = $cgi->http("Range") ) {
        if( ($start) = $r =~ m/^bytes=(\d+)-$/ ) {
            my $len = $more_headers{content_length};
            my $new = $len - $start;
            my $end = $len - 1; # this is the byte number, not a number of bytes or something

            $more_headers{content_range}  = "bytes $start-$end/$len";
            $more_headers{content_length} = $new;
        }
    }

    print $cgi->header(-status=>$status, -charset=>"", -type=>$res->header( 'content-type' ), %more_headers);

    return $start;
}

# }}}