package Daizu::File; use warnings; use strict; use Carp qw( croak ); use Carp::Assert qw( assert DEBUG ); use XML::LibXML; use Encode qw( encode decode ); use URI; use Daizu::Wc; use Daizu::Util qw( trim trim_with_empty_null pgregex_escape url_encode url_decode parse_db_datetime db_select db_select_col db_insert db_update transactionally wc_file_data instantiate_generator expand_xinclude ); use Daizu::HTML qw( dom_body_to_html4 absolutify_links ); =head1 NAME Daizu::File - class representing files in working copies =head1 DESCRIPTION Each object of this class represents a particular file in a Daizu CMS working copy (a record in the C table). =head1 METHODS Note that all the functions which return the value of a Subversion property will strip leading and trailing whitespace, and treat a value which is empty or entirely whitespace as if it wasn't set at all. =over =item Daizu::File-Enew($cms, $file_id) Return a new Daizu::File object for the file with the specified ID number. =cut sub new { my ($class, $cms, $file_id) = @_; croak 'usage: Daizu::File->new($cms, $file_id)' unless defined $cms && ref $cms && defined $file_id; my $db = $cms->{db}; my $record = $db->selectrow_hashref(q{ select wc_id, guid_id, parent_id, is_dir, name, path, cur_revnum, modified, deleted, generator, root_file_id, custom_url, article, retired, title, short_title, description, issued_at, modified_at, image_width, image_height, data_len, content_type, article_pages_url from wc_file where id = ? }, undef, $file_id); croak "no file found with ID $file_id" unless defined $record; for (qw( title short_title description )) { $record->{$_} = decode('UTF-8', $record->{$_}, Encode::FB_CROAK) if defined $record->{$_}; } return bless { cms => $cms, db => $db, id => $file_id, %$record, }, $class; } =item $file-Edata Return a reference to a string containing the file data (content). =cut sub data { my ($self) = @_; return wc_file_data($self->{db}, $self->{id}); } =item $file-Ewc Return a L object representing the working copy in which this file lives. =cut sub wc { my ($self) = @_; return Daizu::Wc->new($self->{cms}, $self->{wc_id}); } =item $file-Eguid_uri Return the GUID URI for this file. =cut sub guid_uri { my ($self) = @_; return db_select($self->{db}, file_guid => $self->{guid_id}, 'uri'); } =item $file-Edirectory_path Returns the path of a directory, either the same as the file if it's a directory itself, or the path of its parent directory, or '' if it's at the top level. =cut sub directory_path { my ($self) = @_; my $path = $self->{path}; return $path if $self->{is_dir}; return $path =~ m!^(.*)/[^/]+\z! ? $1 : ''; } =item $file-Epermalink Returns the first URL generated by the file, which will be the URL you want to link to most of the time. For articles this will always be the normal HTML version of the article, even if there are also other URLs available for it, and it will always be the first page of multipage articles. For non-article files there is no guarantee about what this will return, but most will only generate a single URL anyway, and for those that don't generators are likely to return the most 'linkable' URL first. The URL returned is an absolute URL provided as a L object. Returns nothing if the file doesn't generate any URLs. There are some cases where this might not be what you want. For example, the root directory of a website using L will either not generate a URL at all, or will generate one for a Google sitemap XML file, neither of which is likely to be useful for linking. To get the URL of the website you would probably need to find a file called something like '_index.html'. On the other hand, the L generator will give you a sensible URL for the blog homepage if you call this on its root directory. =cut sub permalink { my ($self) = @_; return URI->new($self->{article_pages_url}) if $self->{article}; my ($permalink) = $self->generator->urls_info($self); return unless defined $permalink; return $permalink->{url}; } =item $file-Eurls_in_db($method, $argument) Return a list of the URLs (plain strings, each an absolute URI) of the file which have the specified method and argument values, drawing from the C table in the database. =cut sub urls_in_db { my ($self, $method, $argument) = @_; my %criteria = ( wc_id => $self->{wc_id}, guid_id => $self->{guid_id}, status => 'A', ); $criteria{method} = $method if defined $method; $criteria{argument} = $argument if defined $argument; return db_select_col($self->{db}, url => \%criteria, 'url'); } =item $file-Earticle_urls Return information about the URLs which the file should have, if it is an article. Fails if it isn't. The URLs are returned in the same format as the Lcustom_urls_info($file)>, and should be used within implementations of that method to ensure that articles get the proper URLs even if an article loader plugin or DOM filtering plugin has changed the usual forms. There are two sets of URLs returned, as a single list: =over =item * URLs for the pages of the actual article, as normal web pages published through the templating system. There will always be at least one of these, which will be the first URL returned. It will have a method of 'article', an empty argument string, and a content type of 'text/html'. The generator class is likely to be L, although it doesn't have to be. The URL for this first article page will be the one supplied by the article loader plugin as C, and stored in the database in the C column. If none is supplied it defaults to the empty string. If the article has multiple pages, this URL info will be followed by others, one for each subsequent page, which will be identical except for the actual URL and the argument, which will contain the page number (starting from '2' for the second page). The first 'article' page URL is the one which should be used when linking to an article, unless you have some special reason to link to a particular page or an alternative URL for the same file. For example, this is the URL which will be included in blog feeds and navigation menus. To get at it conveniently, see the Lpermalink> method. =item * There may be additional URLs for supplementary resources generated by plugins, although by default a simple article written in XHTML won't have any 'extra' URLs. These URLs are the ones supplied by the article loader plugin as C, and stored in the database in the C table. One example of an 'extra' URL is a POD file (Perl documentation, like this document itself) published with the L plugin. If the filename of the POD file ends in '.pm', then this plugin will add an extra URL for the original source code, since that might be of interest to programmers reading API documentation. =back =cut # Used by article_urls() below. Uses some crufty heuristics to decide # how pages (other than the first page) of articles should be referenced. # $url should be the URL of the article's first page, which is likely to # be the empty string or 'filename.html', although it could be an absolute # URL. It's up to the generator class. sub _pagify_url { my ($url, $page) = @_; assert($page > 0) if DEBUG; assert($url ne '') if DEBUG; return $url if $page == 1; return "$url/page$page.html" if $url =~ m!/$!; $url =~ s!\.([^/.]+)$!-page$page.$1! or $url .= "-page$page.html"; return $url; } sub article_urls { my ($self) = @_; croak "file is not an article" unless $self->{article}; $self->_find_content_marks; # need to know how many pages my @page_urls = map { { url => _pagify_url($self->{article_pages_url}, $_), generator => (ref $self->generator), method => 'article', argument => ($_ == 1 ? '' : $_), type => 'text/html', } } (1 .. scalar @{$self->{page_start}}); return @page_urls, $self->article_extra_urls; } =item $file-Eparent Return a new Daizu::File object representing this file's parent directory. Returns nothing if this file is at the 'top level' of its branch and so has no parent. =cut sub parent { my ($self) = @_; return unless defined $self->{parent_id}; return Daizu::File->new($self->{cms}, $self->{parent_id}); } =item $file-Efile_at_path($path) Returns a L object for the file found at C<$path> in the same working copy as C<$file>. C<$path> can be an absolute path (if it starts S), or it can be relative to the path of C<$file>. Dies if there is no such file. =cut sub file_at_path { my ($self, $path) = @_; my $wc_id = $self->{wc_id}; # The 'daizu' scheme is used for XInclude expansion, so might as well # use it here too. It doesn't really matter, we just want to get the # URI module to resolve the relative path for us. my $base = 'daizu:///' . url_encode($self->{path}); $base .= '/' if $self->{is_dir}; $path =~ s!/\./!/!g; my $abs_path = URI->new($path)->abs($base)->path; $abs_path =~ s!^/!!; $abs_path = url_decode($abs_path); my $wc = Daizu::Wc->new($self->{cms}, $self->{wc_id}); return $wc->file_at_path($abs_path); } =item $file-Eissued_at Return a L object for the publication date and time of the file. All files have an 'issued' date, either specified explicitly in a C property, or determined from the time at which the file was first committed into the Subversion repository (which is assumed to be about the time it was first published). =cut sub issued_at { my ($self) = @_; return parse_db_datetime($self->{issued_at}); } =item $file-Emodified_at Return a L object for the last-updated date and time of the file. This is always defined. The value is either specified explicitly in a C property, or determined from the time of the last commit which modified or renamed the file. =cut sub modified_at { my ($self) = @_; return parse_db_datetime($self->{modified_at}); } =item $file-Eproperty($name) Return the value of the Subversion property C<$name> on this file, or undef if there is no such property. The value is assumed to be text, so leading and trailing whitespace is trimmed off, and the value is decoded as UTF-8. If the value exists but contains only whitespace then undef is returned. =cut sub property { my ($self, $name) = @_; my $value = db_select($self->{db}, 'wc_property', { file_id => $self->{id}, name => $name }, 'value', ); $value = trim_with_empty_null($value); return decode('UTF-8', $value, Encode::FB_CROAK); } =item $file-Emost_specific_property($name) Return the value of the Subversion property C<$name> on this file, or on its closest ancestor if it has no such property. Therefore properties on subdirectories will override those of their parent directories. Returns undef if there is no property of this name on the file or any of its ancestors. Properties whose values are empty or contain only whitespace are ignored. The value is assumed to be text, so leading and trailing whitespace is trimmed off, and the value is decoded as UTF-8. =cut sub most_specific_property { my ($file, $name) = @_; while (defined $file) { my $value = $file->property($name); if (defined $value && $value =~ /\S/) { $value = trim($value); return decode('UTF-8', $value, Encode::FB_CROAK); } $file = $file->parent; } return undef; } =item $file-Eleast_specific_property($name) Return the value of the Subversion property C<$name> on this file, or on its most distant ancestor if it has no such property. Therefore the return value is the 'top level' value for this property. For example, if you ask for the C property then you might get the title of the website of which C<$file> is a part. Returns undef if there is no property of this name on the file or any of its ancestors. Properties whose values are empty or contain only whitespace are ignored. The value is assumed to be text, so leading and trailing whitespace is trimmed off, and the value is decoded as UTF-8. =cut sub least_specific_property { my ($file, $name) = @_; my $best; while (defined $file) { my $value = $file->property($name); $best = trim($value) if defined $value && $value =~ /\S/; $file = $file->parent; } return decode('UTF-8', $best, Encode::FB_CROAK); } =item $file-Ehomepage_file Return the file which most probably represents the 'homepage' of the website on which C<$file> will be published. This will be the file closest to the top level of the filesystem hierarchy which has a C property set. It is possible for this to return C<$file> itself if there is nothing above it with a URL. Returns undef if not even C<$file> has a URL set, in which case it can't have a homepage because it won't be published itself. =cut sub homepage_file { my ($file) = @_; my $best; while (defined $file) { $best = $file if defined $file->{custom_url}; $file = $file->parent; } return $best; } =item $file-Etitle Return the title of C<$file>, as a decoded Perl text string, or undef if the file doesn't have a title. The title is taken from the file's C property. =cut sub title { shift->{title} } =item $file-Eshort_title Return the 'short-title' of C<$file>, as a decoded Perl text string, or undef if the file doesn't have a title. The title is taken from the file's C property. =cut sub short_title { shift->{short_title} } =item $file-Edescription Return the description/summary of C<$file>, as a decoded Perl text string, or undef if the file doesn't have a description. The value is taken from the file's C property. =cut sub description { shift->{description} } =item $file-Egenerator Create and return a generator object for the file C<$file>. Figures out which generator class to use, by looking at the C property for the file, and if necessary its ancestors. The class is loaded automatically. It also knows to use L if no generator specification is found. Returns the new object, which should support the API of class L. =cut sub generator { my ($self) = @_; return $self->{generator_obj} if exists $self->{generator_obj}; my $cms = $self->{cms}; my $root_file = $self; $root_file = Daizu::File->new($cms, $self->{root_file_id}) if defined $self->{root_file_id}; my $generator = instantiate_generator($cms, $self->{generator}, $root_file); $self->{generator_obj} = $generator; return $generator; } =item $file-Eupdate_loaded_article_in_db Updates the cached article content for C<$file> in the database. This includes the finished XML version of the content, the article pages URL, the extra URLs, and the extra templates. Does nothing if the file isn't an article. Fails if it is but there are no plugins able to load it. This normally happens automatically when a file's content is updated, and can also be triggered manually from the C program with the This is where article loader plugins set with the L method Ladd_article_loader($mime_type, $path, $object, $method)> are invoked. Doesn't return anything. =cut sub update_loaded_article_in_db { my ($self) = @_; return unless $self->{article}; return transactionally($self->{db}, \&_update_loaded_article_in_db_txn, $self); } sub _update_loaded_article_in_db_txn { my ($self) = @_; my $cms = $self->{cms}; my $mime_type = $self->{content_type}; if (!defined $mime_type) { # Articles must have a mime type, but allow a default based on file # extension for the built-in XHTML format. croak "article in file '$self->{path}' has no mime type specified" unless $self->{name} =~ /\.html?$/i; $mime_type = 'text/html'; } $mime_type =~ m!^(.+?)/! or croak "bad article mime type '$mime_type' in file '$self->{path}'"; my $mime_type_family = "$1/*"; # Search through applicable MIME type patterns. my $file_path = $self->{path}; for my $match ($mime_type, $mime_type_family, '*') { next unless exists $cms->{article_loaders}{$match}; my $plugins = $cms->{article_loaders}{$match}; # Search through applicable paths, sorting in reverse order of length # so that the most specific configuration gets tested first. for my $match_path (sort { length $b <=> length $a } keys %$plugins) { next unless $match_path eq '' || $match_path eq $file_path || substr($file_path, 0, length $match_path + 1) eq "$match_path/"; # Search through the plugins we've found to find one which # accepts the file. for my $handler (@{$plugins->{$match_path}}) { my ($object, $method) = @$handler; my $article = $object->$method($cms, $self); next unless $article; croak "bad return value '$article' from article loader" . " '$object->$method" unless ref($article) eq 'HASH'; $self->_expand_article_xinclude($article); $self->_filter_loaded_article($article); $self->_save_article_content($article); return; } } } die "can't load article $self->{id}," . " don't know how to handle content type '$mime_type'"; } sub _expand_article_xinclude { my ($self, $article) = @_; my @included_files = expand_xinclude($self->{db}, $article->{content}, $self->{wc_id}, $self->{path}); $article->{included_files} = \@included_files if @included_files; } sub _filter_loaded_article { my ($self, $article) = @_; my $cms = $self->{cms}; # Filter through plugins. my $doc = $article->{content}; my $file_path = $self->{path}; # Go through the known filters in an arbitrary order. FILTER: for my $plugins (values %{$cms->{html_dom_filters}}) { # Search through applicable paths, sorting in reverse order of length # so that the most specific configuration gets tested first. for my $match_path (sort { length $b <=> length $a } keys %$plugins) { next unless $match_path eq '' || $match_path eq $file_path || substr($file_path, 0, length $match_path + 1) eq "$match_path/"; my ($object, $method) = @{$plugins->{$match_path}}; my $result = $object->$method($cms, $self, $doc); die "filter plugin $object->$method didn't return any content" unless defined $result && defined $result->{content}; $doc = $result->{content}; push @{$article->{extra_urls}}, @{$result->{extra_urls}} if defined $result->{extra_urls}; push @{$article->{extra_templates}}, @{$result->{extra_templates}} if defined $result->{extra_templates}; # Only execute the best match for each filter. next FILTER; } } $article->{content} = $doc; } # Save the new article content and associated metadata from the loader plugin # in the database. Also update this Daizu::File object to contain the new # information, and to invalidate caches of some stuff. sub _save_article_content { my ($self, $art) = @_; my $db = $self->{db}; my %meta; while (my ($property, $column) = each %Daizu::OVERRIDABLE_PROPERTY) { my $value = db_select($db, 'wc_property', { file_id => $self->{id}, name => $property }, 'value', ); $value = $art->{$column} unless defined $value; $self->{$column} = $meta{$column} = trim_with_empty_null($value); } my $pages_url = $art->{pages_url}; $pages_url = '' unless defined $pages_url; $self->{article_pages_url} = $pages_url, my $xml = $art->{content}->documentElement->toString; croak "article can't be loaded because the result contains a nul byte" if $xml =~ /\0/; delete $self->{article_doc}; delete $self->{snippet_doc}; delete $self->{fold}; delete $self->{page_start}; db_update($db, wc_file => $self->{id}, %meta, article_content => encode('UTF-8', $xml, Encode::FB_CROAK), ); $db->do(q{ delete from wc_article_extra_url where file_id = ? }, undef, $self->{id}); delete $self->{article_extra_urls}; if ($art->{extra_urls}) { for (@{$art->{extra_urls}}) { my $arg = $_->{argument}; $arg = '' if !defined $arg; $db->do(q{ insert into wc_article_extra_url (file_id, url, content_type, generator, method, argument) values (?, ?, ?, ?, ?, ?) }, undef, $self->{id}, $_->{url}, $_->{type}, $_->{generator}, $_->{method}, $arg); } } $db->do(q{ delete from wc_article_extra_template where file_id = ? }, undef, $self->{id}); delete $self->{article_extra_templates}; if ($art->{extra_templates}) { for (@{$art->{extra_templates}}) { $db->do(q{ insert into wc_article_extra_template (file_id, filename) values (?, ?) }, undef, $self->{id}, $_); } } $db->do(q{ delete from wc_article_included_files where file_id = ? }, undef, $self->{id}); if ($art->{included_files}) { for (@{$art->{included_files}}) { $db->do(q{ insert into wc_article_included_files (file_id, included_file_id) values (?, ?) }, undef, $self->{id}, $_); } } my $base_url = $self->generator->base_url($self); $pages_url = URI->new_abs($pages_url, $base_url); db_update($db, wc_file => $self->{id}, article_pages_url => $pages_url, ); } sub _find_content_marks { my ($self) = @_; # Find out where fold and page breaks are, and remove the markers. my $node = $self->article_doc->documentElement->firstChild; my $fold; my @page_start; push @page_start, $node; while (defined $node) { my $next = $node->nextSibling; last unless defined $next; if ($node->nodeType == XML_ELEMENT_NODE) { my $ns = $node->namespaceURI; push @page_start, $next if defined $ns && $ns eq $Daizu::HTML_EXTENSION_NS && $node->localname eq 'page'; if (defined $ns && $ns eq $Daizu::HTML_EXTENSION_NS && $node->localname eq 'fold') { croak "only one is allowed in an article" if defined $fold; $fold = $node; } } $node = $next; } $self->{fold} = defined $fold ? $fold : @page_start > 1 ? $page_start[1]->previousSibling : undef; $self->{page_start} = \@page_start; } =item $file-Earticle_doc Returns an L object representing the content of the article, as stored in the C column of the C table. The first time this is used on C<$file> it will parse the content from the database, doing XInclude processing. Dies if the file is not an article. =cut sub article_doc { my ($self) = @_; croak "can't load article content for '$self->{path}', it's not an article" unless $self->{article}; return $self->{article_doc} if exists $self->{article_doc}; my ($xml) = db_select($self->{db}, wc_file => $self->{id}, 'article_content'); assert(defined $xml) if DEBUG; my $parser = XML::LibXML->new; eval { $self->{article_doc} = $parser->parse_string($xml) }; croak "error parsing stored article_content of '$self->{path}': $@" if $@; return $self->{article_doc}; } =item $file-Earticle_body Returns the root element of the article content. Equivalent to this: =for syntax-highlight perl $file->article_doc->documentElement =cut sub article_body { my ($self) = @_; return $self->article_doc->documentElement; } =item $file-Earticle_content_html4([$page_num]) Returns the content of an article file as S. If C<$page_num> is provided, only returns the content for that page, otherwise for the whole article. Fails if the file is not an article, or if C<$page_num> is greater than the number of pages (C<$page_num> would be 0 for the first page, not zero). C<$page_num> can be undef to select the whole article, and making it the empty string has the same effect (to make this easier to use from within templates). =cut sub article_content_html4 { my ($self, $page_num) = @_; $self->_find_content_marks; my ($start_node, $end_node); if (defined $page_num && $page_num ne '') { croak "page $page_num out of range for this article" if $page_num < 1 || $page_num > @{$self->{page_start}}; $start_node = $self->{page_start}[$page_num - 1]; $end_node = $self->{page_start}[$page_num]; } return dom_body_to_html4($self->article_doc, $start_node, $end_node); } =item $file-Earticle_extract Returns a short extract (up to a certain number of words) from the beginning of the article's content, with all markup removed. What's left is plain text, except that the text from different top-level elements in the document is separated by two newlines. The text returned is not XML escaped. =cut sub article_extract { my ($self) = @_; my $block_elem = $self->article_body->firstChild; my $max_words = 50; # TODO - make configurable. my @words; while (@words <= $max_words && defined $block_elem) { $block_elem = $block_elem->nextSibling, next unless $block_elem->nodeType == XML_ELEMENT_NODE; $words[-1] .= "\n\n" if @words && $words[-1] !~ /\n\z/; my @new_words = split ' ', trim($block_elem->textContent); while (@words <= $max_words && @new_words) { push @words, shift @new_words; } $block_elem = $block_elem->nextSibling; } if (@words > $max_words) { pop @words; push @words, "\x{2026}"; } my $text = join ' ', @words; $text =~ s/\n /\n/g; return $text; } =item $file-Earticle_extra_urls Return a list of URL info hashes describing additional URLs which should be generated by this file, pulled from the C table. Returns nothing for files which aren't articles. =cut sub article_extra_urls { my ($self) = @_; return unless $self->{article}; if (!exists $self->{article_extra_urls}) { my $sth = $self->{db}->prepare(q{ select * from wc_article_extra_url where file_id = ? }); $sth->execute($self->{id}); my @extra; while (my $r = $sth->fetchrow_hashref) { push @extra, { url => $r->{url}, type => $r->{content_type}, generator => $r->{generator}, method => $r->{method}, argument => $r->{argument}, }; } $self->{article_extra_urls} = \@extra; } return @{$self->{article_extra_urls}}; } =item $file-Earticle_extra_templates Returns a list of the extra templates which should be included in the article's 'extras' column. Returns nothing for files which aren't articles. =cut sub article_extra_templates { my ($self) = @_; return unless $self->{article}; if (!exists $self->{article_extra_templates}) { my $sth = $self->{db}->prepare(q{ select filename from wc_article_extra_template where file_id = ? }); $sth->execute($self->{id}); my @extra; while (my ($filename) = $sth->fetchrow_array) { push @extra, $filename } $self->{article_extra_templates} = \@extra; } return @{$self->{article_extra_templates}}; } =item $file-Etags Return a reference to an array of tags which have been applied to this article. These come ultimately from the C property, although it is loaded into the database tables C and C when the working copy is updated. The tags are returned sorted by canonical tag name. Each item of the returned array is a hashref containing the following values: =over =item tag The canonical tag name, as used as the primary key in the C table. =item original_spelling The spelling used for to name the tag in the C property of this file. =back Both of these values are provided as text strings, decoded from UTF-8. =cut sub tags { my ($self) = @_; my $sth = $self->{db}->prepare(q{ select t.tag, ft.original_spelling from tag t inner join wc_file_tag ft on ft.tag = t.tag where ft.file_id = ? order by t.tag }); $sth->execute($self->{id}); my @tags; while (my $row = $sth->fetchrow_hashref) { $row->{$_} = decode('UTF-8', $row->{$_}, Encode::FB_CROAK) for qw( title short_title description ); push @tags, { %$row }; } return \@tags; } =item $file-Earticle_snippet Return an L object representing the part of an article which comes before the fold, or before the first page break (whichever comes first). If there are no fold markers or page breaks in the article, returns the complete article content. =cut sub article_snippet { my ($self) = @_; return $self->{snippet_doc} if exists $self->{snippet_doc}; my $whole_doc = $self->article_doc; $self->_find_content_marks; my $fold = $self->{fold}; return $whole_doc unless defined $fold; my $snippet_doc = XML::LibXML::Document->new('1.0', 'UTF-8'); my $body = $snippet_doc->createElementNS('http://www.w3.org/1999/xhtml', 'body'); $snippet_doc->setDocumentElement($body); my $elem = $whole_doc->documentElement->firstChild; while (defined $elem && !$elem->isSameNode($fold)) { $body->appendChild($elem->cloneNode(1)); $elem = $elem->nextSibling; } return $self->{snippet_doc} = $snippet_doc; } =item $file-Earticle_snippet_html4 Returns a chunk of S markup for the article's content, just as the Larticle_content_html4([$page_num])> does, except that this only returns the content up to the fold or first page break, if the article has any of those. This also sets an internal flag called C to true if the content returned represents a truncated version of the article's content (that is, there was a fold mark or page break found). =cut sub article_snippet_html4 { my ($self) = @_; my $snippet_doc = $self->article_snippet; $self->{snippet_is_not_whole_article} = 1 unless $snippet_doc->isSameNode($self->article_doc); # This is going to be shown on the homepage or something, so links won't # be relative to the output page's URL. absolutify_links($snippet_doc, $self->permalink); # TODO - this could be more efficient if we passed in the fold position. return dom_body_to_html4($snippet_doc); } =item $file-Eauthors Returns information about the author or authors credited with creating the file. The return value is a reference to an array of zero or more references to hashes. Each one contains the following keys: =over =item id The ID number of the entry in the database's C table. =item username The username, as specified in the C property, decoded into a Perl text string. Always defined. =item name Full name of the author, as a Perl text string. Always defined. =item email Email address as a binary string, or undef. =item uri A URL associated with the author, probably their own website, or undef. =back The authors are returned in the same order that they are specified in the C property. Note that because of the way the standard property loader works, directories are not considered to have authors. If a directory has a C property, that will just affect all the files within it. =cut sub authors { my ($self) = @_; my $db = $self->{db}; # Build a PostgreSQL regular expression which will be used to select # all the 'person_info' records with a path which applies to the file, # in order to select the most specific one (with the longest path). my @path = map { pgregex_escape($_) } split '/', $self->{path}; my $path_regex = '^(' . join('(/', @path) . '$' . ('|$)' x @path); my $sth = $db->prepare(q{ select person_id from file_author where file_id = ? order by pos }); $sth->execute($self->{id}); my @author; while (my ($id) = $sth->fetchrow_array) { my $info = $db->selectrow_hashref(q{ select p.id, p.username, i.name, i.email, i.uri from person p inner join person_info i on i.person_id = p.id where p.id = ? and i.path ~ ? order by length(i.path) desc }, undef, $id, $path_regex); croak "no 'person_info' record for user $id at path '$self->{path}'" unless defined $info; for (qw( username name )) { $info->{$_} = decode('UTF-8', $info->{$_}, Encode::FB_CROAK); } push @author, { %$info }; } return \@author; } =item $file-Eupdate_urls_in_db([$dup_urls]) Updates the C table to match the current URLs generated by C<$file>, as returned by the generator method Lurls_info($file)>. This includes changing active URLs to redirects or marking them 'gone' if they are no longer generated by the file. If this update isn't being done in isolation, but for example is being run by the L function, then C<$dup_urls> should be a reference to a hash which can be used to keep track of new URLs which cannot be added to the database yet because they would conflict with an existing active URL. This method will remove them from the hash later if the original active URL becomes inactive, and will replace it in the database with the new one. Some cleanup code is needed in the caller though to finish processing any URLs left in the hash, or to signal errors if they are a genuine conflict which cannot be resolved. An error should cause the transaction to update, because items in the hash represent unfinished updates to the database. Returns a list of two values, which can each be either true or false. They indicate whether the set of URLs which are redirects or marked as 'gone' have changed. The first indicates that at least one redirect has been added, removed, or had its destination changed. The second value indicates that a previously active or redirected URL is now marked 'gone', or that a previously dead URL has been reactivated or turned into a redirect. These two values can be used to determine whether redirect maps need to be regenerated by the caller. The work is done in a transaction, so that if it fails there will be no changes to the database. TODO - update docs about return value =cut sub update_urls_in_db { my ($self, $dup_urls) = @_; return transactionally($self->{db}, \&_update_urls_in_db_txn, $self, $dup_urls); } sub _update_urls_in_db_txn { my ($self, $dup_urls) = @_; my $db = $self->{db}; # Get information about the URLs that we currently have for this file. my $sth = $db->prepare(q{ select * from url where wc_id = ? and guid_id = ? }); $sth->execute($self->{wc_id}, $self->{guid_id}); my (%old_active, %old_redirect, %old_gone); while (my $r = $sth->fetchrow_hashref) { my $hash = $r->{status} eq 'A' ? \%old_active : $r->{status} eq 'R' ? \%old_redirect : \%old_gone; $hash->{$r->{url}} = { %$r }; } # Keep track of whether the set of redirects or gone files have changed, # which might mean that the caller will need to regenerate some redirect # files. The keys are the filenames which need updating, and the values # are the output configuration hashes where they were found. my (%redirects_changed, %gone_changed); # Track changes to URLs for the publishing process. Each one of these # has the URL as the key and the URL info hash as the value. For changed # URLs the new URL and URL info is used in the key and value, but the value # also contains a key called 'old_url_info'. # Note that changed URLs are ones where a redirect has been created, and # changes of URL ownership (one file deactivates it and another file # reactivates the same URL) are not recorded. my (%url_activated, %url_deactivated, %url_changed); # Put the new URLs in the database. Add the 'id' of each one to the # information in @new_url. my @new_url = $self->generator->urls_info($self); for (@new_url) { my $url = $_->{url}; if (exists $old_active{$url}) { # Was active, and still is. If a duplicate URL has been created # by another file then at this point we know there's no chance # of it being resolved, so signal an error. croak "new URL '$url' would conflict with existing URL" if defined $dup_urls && exists $dup_urls->{$url}; $_->{id} = $old_active{$url}{id}; db_update($db, url => $_->{id}, method => $_->{method}, argument => $_->{argument}, content_type => $_->{type}, ); delete $old_active{$url}; } elsif (exists $old_redirect{$url}) { # Was a redirect, but now active again. assert(!defined $dup_urls || !exists $dup_urls->{$url}) if DEBUG; $_->{id} = $old_redirect{$url}{id}; db_update($db, url => $_->{id}, method => $_->{method}, argument => $_->{argument}, content_type => $_->{type}, status => 'A', redirect_to_id => undef, ); $url_activated{$url} = $_; delete $old_redirect{$url}; $self->_update_rewrite(\%redirects_changed, 'redirect', $url); } elsif (exists $old_gone{$url}) { # Was gone, but has come back. assert(!defined $dup_urls || !exists $dup_urls->{$url}) if DEBUG; $_->{id} = $old_gone{$url}{id}; db_update($db, url => $_->{id}, method => $_->{method}, argument => $_->{argument}, content_type => $_->{type}, status => 'A', ); $url_activated{$url} = $_; delete $old_gone{$url}; $self->_update_rewrite(\%gone_changed, 'gone', $url); } else { # New URL. It might replace a non-active one belonging to a # different file. my ($id, $status) = db_select($db, 'url', { wc_id => $self->{wc_id}, url => $url }, qw( id status ), ); if (defined $id && !(defined $dup_urls && $status eq 'A')) { if ($status eq 'A') { croak "new URL '$url' would conflict with existing URL"; } elsif ($status eq 'R') { $self->_update_rewrite(\%redirects_changed, 'redirect', $url); } elsif ($status eq 'G') { $self->_update_rewrite(\%gone_changed, 'gone', $url); } # %$dup_urls should never contain a non-active URL. assert(!defined $dup_urls || !exists $dup_urls->{$url}) if DEBUG; # Write the new active URL over the top of an old inactive one. $_->{id} = $id; db_update($db, url => $id, guid_id => $self->{guid_id}, generator => $_->{generator}, method => $_->{method}, argument => $_->{argument}, content_type => $_->{type}, status => 'A', redirect_to_id => undef, ); $url_activated{$url} = $_; } else { if (defined $id) { assert(defined $dup_urls) if DEBUG; # We already have one duplicate for this, so it can't # possibly be resolved by deactivating the existing one. croak "new URL '$url' would conflict with existing URL" if exists $dup_urls->{$url}; # The new URL is a duplicate of an existing one which is # still active, but our caller has let it be known that # there are multiple files being updated at once, so keep # information about the new URL in the hopes that the old # one will later be deactivated, allowing it to be replaced # with this one. $_->{id} = $id; $dup_urls->{$url} = { id => $id, guid_id => $self->{guid_id}, generator => $_->{generator}, method => $_->{method}, argument => $_->{argument}, type => $_->{type}, }; } else { # This is the only place where new 'url' records are # inserted. $_->{id} = db_insert($db, 'url', url => $url, wc_id => $self->{wc_id}, guid_id => $self->{guid_id}, generator => $_->{generator}, method => $_->{method}, argument => $_->{argument}, content_type => $_->{type}, status => 'A', ); $url_activated{$url} = $_; } } } } # Adjust any previously-active URLs which are no longer active. for (values %old_active) { if (defined $dup_urls && exists $dup_urls->{$_->{url}}) { # A new URL which is a duplicate of this one has been created # by a different file, and is waiting in the wings in the hopes # that this file will ditch it. We can now replace the old one # with the new one from the other file, and thereby resolve # the duplication. my $dup = $dup_urls->{$_->{url}}; assert($dup->{id} == $_->{id}) if DEBUG; db_update($db, url => $_->{id}, guid_id => $dup->{guid_id}, generator => $dup->{generator}, method => $dup->{method}, argument => $dup->{argument}, content_type => $dup->{type}, ); delete $dup_urls->{$_->{url}}; next; } if (!@new_url) { # Nothing to redirect to, so mark the old one as gone. db_update($db, url => $_->{id}, status => 'G', ); $self->_update_rewrite(\%gone_changed, 'gone', $_->{url}); $url_deactivated{$_->{url}} = $_; } else { # Change it to a redirect, if there are any active URLs which # are suitable (same generator, method, and argument). If there # are multiple choices, choose the one with the same content type, # or the first of any ties. my $best_match; for my $new (@new_url) { next unless $new->{generator} eq $_->{generator} && $new->{method} eq $_->{method} && $new->{argument} eq $_->{argument}; $best_match = $new unless defined $best_match; next unless $new->{type} eq $_->{content_type}; $best_match = $new; last; } if (defined $best_match) { # Set old URL to redirect. # TODO - I don't think I'm being consistent about what the # content type of a redirect URL means. Is it just a left-over # from the old inactive URL, or as here is it the type of # the target URL? assert(defined $best_match->{id}) if DEBUG; db_update($db, url => $_->{id}, content_type => $best_match->{type}, status => 'R', redirect_to_id => $best_match->{id}, ); $self->_update_rewrite(\%redirects_changed, 'redirect', $_->{url}); $url_changed{$best_match->{url}} = { %$best_match, old_url_info => $_, }; delete $url_activated{$best_match->{url}}; # Adjust any which previously redirected to the old URL # so that they point directly to the new one. db_update($db, url => { redirect_to_id => $_->{id} }, content_type => $best_match->{type}, redirect_to_id => $best_match->{id}, ); } else { # Kill the old URL. db_update($db, url => $_->{id}, status => 'G', ); $self->_update_rewrite(\%gone_changed, 'gone', $_->{url}); $url_deactivated{$_->{url}} = $_; } } } return { update_redirect_maps => \%redirects_changed, update_gone_maps => \%gone_changed, url_activated => \%url_activated, url_deactivated => \%url_deactivated, url_changed => \%url_changed, }; } sub _update_rewrite { my ($self, $which_changed, $map_name, $url) = @_; my ($config) = $self->{cms}->output_config($url); return unless defined $config; my $map_file = $config->{"${map_name}_map"}; $which_changed->{$map_file} = $config if defined $map_file && !exists $which_changed->{$map_file}; } =back =head1 COPYRIGHT This software is copyright 2006 Geoff Richards Egeoff@laxan.comE. For licensing information see this page: L =cut 1; # vi:ts=4 sw=4 expandtab