=head1 NAME

Data::Downloader -- Download and organize files using RSS feeds and templates

=head1 SYNOPSIS

    #!/usr/bin/env perl

    use Data::Downloader -init_logging => "DEBUG";
    use strict;

    # Get pictures of apples. Store them using md5's, make symlinks using date and tags
    my $images = Data::Downloader::Repository->new(
        name => "images",
        storage_root => "/usr/local/datastore/data",
        cache_strategy => "Keep",
        feeds => [
            {
                name => "flickr",
                feed_template => 'http://api.flickr.com/services/feeds/photos_public.gne?tags=&lang=en-us&format=rss_200',
                file_source => {
                    url_xpath => 'media:content/@url',
                    filename_xpath => 'media:content/@url',
                    filename_regex => '/([^/]*)$',
                },
                metadata_sources => [
                    { name => 'date_taken', xpath => 'dc:date.Taken' },
                    { name => 'tags', xpath => 'media:category' },
                ],
            },
        ],
        metadata_transformations => [
            {
                input => "tags",
                output => "tag",
                function_name => "split",
            },
        ],
        linktrees => [
            {
                root => "/usr/local/datastore/by_tag",
                condition => undef,
                path_template => ""
            },
            {
                root => "/usr/local/datastore/by_date",
                condition => undef,
                path_template => ""
            },
        ],
    );

    $images->load(speculative => 1) or $images->save;

    for my $feed ($images->feeds) {
        $feed->refresh(tags => "apples");
    }

    $images->download_all;

=head1 DESCRIPTION

Data::Downloader allows one to maintain local repositories of files
and file metadata based on RSS feeds which describe the files. It stores
files uniquely using MD5 sums, but allows for the creation and maintenance
of trees of symbolic links to the downloaded files.

An SQLite database is used to store both the configuration and
information about the files that have been downloaded. This file
is automatically created in $HOME/.data_downloader.db (by default).

This distribution also contains a command-line tool, L, for
managing the repository.

=head2 CONFIGURATION

See L.

=head2 UPDATING RSS FEEDS

Read about this in L.

=head2 DOWNLOADING FILES

Take a look at L.

=head2 MAINTAINING SYMLINKS

Check out L.
=head1 SEE ALSO

L L L L L L L L

=cut

package Data::Downloader;

use Rose::DB::Object::Loader;
use Lingua::EN::Inflect qw/def_noun/;
use Log::Log4perl qw(:easy);
use Data::Downloader::DB;
use Data::Downloader::DB::Object;
use Data::Downloader::DB::Object::Cached;
use Data::Downloader::Config;
use Data::Downloader::MetadataPivot;
use Data::Downloader::FileMetadata;
use strict;

# Teach the inflector the irregular plural used for the "metadatum" table.
def_noun "metadatum" => "metadata";

our $VERSION = '0.9902';
our $db;
our $useProgressBars; # set during import to turn on Smart::Comments
our $setupDone;

# import(@params) -- runs when the module is loaded with "use".
#
# Performs the one-time setup: opens the "main" database, optionally
# initializes logging, creates the schema when the database file is missing
# or empty, and builds the table classes.  Later calls return immediately
# because $setupDone is already set.
#
# Recognized parameters (each is looked for anywhere in @params):
#   -init_logging       initialize Log4perl with a screen appender; a level
#                       name (TRACE, DEBUG, INFO, WARN, ERROR or FATAL) found
#                       in @params selects the root level, default INFO
#   -use_progress_bars  sets $useProgressBars
sub import {
    my ($class, @params) = @_;
    return if $setupDone;
    $setupDone = 1;

    $db = Data::Downloader::DB->new("main");

    if (grep { /-init_logging/ } @params) {
        # TRACE is accepted as well, since this module itself logs at TRACE.
        my ($level) = grep { /^(?:TRACE|DEBUG|INFO|WARN|ERROR|FATAL)$/ } @params;
        $level ||= "INFO";
        # Under a test harness the pattern is prefixed with '#'.
        my $init = <<"EOT";
log4perl.rootLogger = $level, Screen
log4perl.appender.Screen = Log::Log4perl::Appender::ScreenColoredLevels
log4perl.appender.Screen.layout = Log::Log4perl::Layout::PatternLayout
log4perl.appender.Screen.layout.ConversionPattern = @{[ $ENV{HARNESS_ACTIVE} ? '#' : '' ]} [%-5p] %d %F{1} (%L) %m %n
EOT
        Log::Log4perl::init(\$init);
    }

    if (grep { /-use_progress_bars/ } @params) {
        $useProgressBars = 1;
    }

    # A missing or zero-length database file means the schema must be created.
    if (! -s $db->database) {
        INFO "initializing database ".$db->database."\n";
        _initialize_db();
    }

    _setup_classes();
}

# _initialize_db() -- create the schema by executing the SQL stored after
# __DATA__, one statement at a time.
#
# Statements are read with ';' as the record separator, so no comment in the
# schema may contain a semicolon.  Dies with the failing statement and the
# driver's error string if a statement cannot be executed.
sub _initialize_db {
    local $/ = ';';
    while (defined(my $stmt = <DATA>)) {
        if ($stmt =~ /CREATE TRIGGER/) {
            # A trigger body contains semicolons of its own: keep appending
            # chunks until the one that holds END, or until end of input.
            while (defined(my $more = <DATA>)) {
                $stmt .= $more;
                last if $more =~ /END/;
            }
        }
        TRACE "SQL:\n$stmt\n";
        next unless $stmt =~ /\w/;
        $db->dbh->do($stmt)
            or die "failed to execute '$stmt'\n\n".$db->dbh->errstr;
    }
}

# _setup_classes() -- generate one class per database table.
#
# The configuration tables get classes based on
# Data::Downloader::DB::Object::Cached; all other tables get classes based on
# Data::Downloader::DB::Object.  Each generated class is then given an
# init_db method, and a same-named module is loaded when one can be found.
sub _setup_classes {
    our $cm = Rose::DB::Object::ConventionManager->new();
    $cm->tables_are_singular(1);
    $cm->singular_to_plural_function(\&Lingua::EN::Inflect::PL);

    my $loader_dynamic = Rose::DB::Object::Loader->new(
        base_class         => "Data::Downloader::DB::Object",
        db                 => $db,
        db_class           => "Data::Downloader::DB",
        class_prefix       => "Data::Downloader",
        convention_manager => $cm,
        post_init_hook     => sub { shift->error_mode('return'); }, # TODO use log4perl for error mode somehow
    );

    my $loader_cached = Rose::DB::Object::Loader->new(
        base_class         => "Data::Downloader::DB::Object::Cached",
        db                 => $db,
        db_class           => "Data::Downloader::DB",
        class_prefix       => "Data::Downloader",
        convention_manager => $cm,
        post_init_hook     => sub { shift->error_mode('return'); },
    );

    my @config_tables = qw/repository feed disk feed_parameter file_source
                           metadata_source metadata_transformation/;

    # Report the database through $db itself: a DBI handle has no
    # "database" method, so asking $db->dbh would die inside the error path.
    my @classes = $loader_dynamic->make_classes(exclude_tables => \@config_tables)
        or LOGDIE "Error: unable to load classes from ".$db->database;
    push @classes, $loader_cached->make_classes(include_tables => \@config_tables);

    for my $class (@classes) {
        eval "use $class";
        my $load_error = $@;    # capture before anything else can reset $@
        {
            no strict 'refs';
            *{"${class}::init_db"} = sub { Data::Downloader::DB->new_or_cached("main") };
        }
        # "Can't locate" is tolerated -- presumably a table for which no
        # module file exists; any other load error is fatal.
        die "Errors using $class : $load_error"
            if $load_error && $load_error !~ /Can't locate/;
    }

    Data::Downloader::MetadataPivot->do_setup;
    Data::Downloader::FileMetadata->do_setup;
}

1;

# The here-doc below ends at the "=head1 SCHEMA" line, which POD tools also
# read as a heading, so the SQL after __DATA__ doubles as documentation.
# Its body is the single blank line, which keeps the last statement of the
# file a true value for require.
<<"=head1 SCHEMA"; # my trick to put __DATA__ into pod

=head1 SCHEMA

__DATA__

-- See also L

/*
* Every table below corresponds to a class named Data::Downloader::.
* Every class has accessors and mutators whose names are the same as the column names.
* Foreign keys also beget methods in both the parent and child classes,
* e.g. $repository->files and $file->repository_obj.
*/

/*** static tables : populated once during configuration ***/

--- A L has a L strategy.
create table repository (
    id integer primary key,
    name varchar(255) not null unique,
    storage_root text not null,
    file_url_template text,
    cache_strategy text not null, -- e.g. "LRU" corresponds to Data::Downloader::Cache::LRU
    cache_max_size integer -- in bytes
);

--- A L has many Ls.
create table feed (
    id integer primary key,
    repository integer not null references repository(id),
    name varchar(255) not null unique,
    feed_template text
);

--- A L has many Ls.
create table disk (
    id integer primary key,
    repository integer not null references repository(id),
    root varchar(255) not null unique
);

--- A L has many default parameters
create table feed_parameter (
    id integer primary key,
    feed integer not null references feed(id),
    name text not null,
    default_value text
);

--- A L has a file source.
create table file_source (
    feed integer primary key not null references feed(id),
    url_xpath text,
    urn_xpath text, -- unique identifier for files
    md5_xpath text,
    filename_xpath text,
    filename_regex text -- apply to whatever is extracted from the xpath
);

--- A L has many metadata sources.
create table metadata_source (
    id integer primary key,
    feed integer references feed(id),
    name text not null unique,
    xpath text
);

--- A L has many Ls.
create table linktree (
    id integer primary key,
    repository integer references repository(id),
    root text,
    condition text, -- SQL::Abstract clause
    path_template text, -- String::Template string
    unique(root)
);

--- A L has many Ls.
create table metadata_transformation (
    id integer primary key,
    input text, -- references metadata_source.name or this table
    output text not null,
    repository integer references repository(id),
    function_name text, -- function to apply
    function_params text,
    order_key integer not null default 1,
    unique(order_key)
);

/*** dynamic tables : populated when files are downloaded, rss feeds are updated, etc. ***/

create table stat_info (
    id integer primary key,
    repository integer not null references repository(id),
    last_stat_update datetime,
    last_fsck datetime,
    unique(repository)
);

--- metadata
create table metadatum (
    id integer primary key,
    file integer references file(id),
    name varchar(255) references metadata_source(name),
    value text,
    unique (file,name)
);

--- A L has many Ls.
create table file (
    id integer primary key,
    repository integer not null references repository(id),
    filename text not null,
    md5 char(32),
    url text,
    urn text,
    size integer,
    on_disk integer, -- boolean
    disk integer references disk(id),
    atime datetime, -- stat(file)->atime
    unique(md5),
    unique(filename),
    unique(urn)
);

--- A L has many log entries
--- (if $ENV{DATA_DOWNLOADER_GATHER_STATS} is set).
create table log_entry (
    id integer primary key,
    file integer not null references file(id),
    requested_at datetime not null,
    cache_hit integer, -- boolean
    completed_at datetime,
    prog text, -- $0
    pid integer, -- $$
    uid text, -- $<
    note text -- $ENV{DATA_DOWNLOADER_LOG_NOTE}, e.g. app info
);

--- A L has many expirations
create table file_removal (
    id integer primary key,
    file integer not null references file(id),
    expired_on datetime,
    algorithm, -- the expiration algorithm
    prog text, -- $0
    pid integer, -- $$
    uid text, -- $<
    note text -- $ENV{DATA_DOWNLOADER_FILE_REMOVAL_NOTE}
);

--- A L has many symlinks.
create table symlink (
    id integer primary key,
    linkname text not null, -- linktree.path_template + file's metadata
    file integer not null references file(id),
    linktree integer not null references linktree(id),
    unique(linkname)
);

/***\

=cut

*/

create index symlink_file on symlink(file);
create index symlink_file_linktree on symlink(file,linktree);
create index symlink_linktree on symlink(linktree);

/***\

=head1 AUTHOR

 Brian Duggan

 Phillip Durbin

 Stuart Pineo

 Arnold Martin

 Graham Ollis

 Michael Walters

=cut