package Plucene::SearchEngine::Index::PDF; use base 'Plucene::SearchEngine::Index::Base'; __PACKAGE__->register_handler("application/pdf", ".pdf"); use File::Temp qw/tmpnam/; =head1 NAME Plucene::SearchEngine::Index::PDF - Backend for parsing PDF =head1 DESCRIPTION This backend analyzes a PDF file for its textual content (using C) and turns any metadata found in the PDF into Plucene fields. =cut sub gather_data_from_file { my ($self, $filename) = @_; my $html = tmpnam(); system("pdftotext", "-htmlmeta", $filename, $html); return unless -e $html; $self->Plucene::SearchEngine::Index::HTML::gather_data_from_file($html); unlink $html; return $self; } 1;