# -*- perl -*-
#
# w3pdfuri.pm -- extract URIs from an Adobe Acrobat PDF file.
# version 0.1
#
# Assumptions:
# - File starts with %PDF
# - All URLs match this RE: ^\/URI\s+\(([^)]*)\)\s*$ (found by inspection of
#   one (1) pdf file, the reader.pdf in the acrobat distribution).
# - pdf files are potentially very large, so we read them from disk
#   record by record.
# - Record separator is ^M
#
# History:
# - 19/02/97 janl: Version 0.1, seems to work with PDF-1.2
#

package w3pdfuri;

use strict;
use warnings;

# Declared inside the package so that it names $w3pdfuri::VERSION.
our $VERSION = 0.1;

# list($file)
#
# Scan the PDF file named by $file and collect the contents of every
# record of the form "/URI (...)".
#
# Parameters:
#   $file - path of the file to read.
#
# Returns:
#   The list of captured URI strings, in file order (possibly empty).
#   If the file cannot be opened, or its first record does not start
#   with "%PDF-", a warning is emitted and a bare return is done (empty
#   list in list context, undef in scalar context).
#
# Notes:
#   - The ($) prototype is kept so existing call sites parse as before.
#   - The pattern stops at the first ")", so a URI containing a ")" is
#     not captured in full.
sub list ($) {
    my ($file) = @_;
    my @urls = ();

    # Records are separated by a bare carriage return (^M); localised so
    # the caller's input record separator is restored on return.
    local $/ = "\r";

    # Three-argument open on a lexical handle: the file name is taken
    # literally (no mode/pipe characters, no whitespace stripping).
    my $pdf;
    unless (open($pdf, '<', $file)) {
        warn "Could not open $file for input: $!\n";
        return;
    }

    # PDF is binary data; keep any platform text-mode translation away
    # from the bytes we read.
    binmode($pdf);

    # The first record must carry the PDF signature.  An empty file
    # yields undef here, which is treated as "not a PDF" as well.
    my $header = <$pdf>;
    unless (defined($header) && $header =~ /^%PDF-/) {
        warn "$file is not a PDF file.\n";
        close($pdf);
        return;
    }

    # Read one record at a time into a lexical so that the caller's $_
    # is left untouched and the whole file is never held in memory.
    while (defined(my $record = <$pdf>)) {
        chomp($record);    # strips the trailing "\r" record separator
        push(@urls, $1) if ($record =~ m~^/URI\s+\(([^)]*)\)\s*$~);
    }

    close($pdf);
    return @urls;
}

1;

# Test code (manual, kept for reference; it is not executed):
#
# print "reader.pdf: ",
#     join(",",list("/local/lib/acrobat/Reader/help/reader.pdf")),"\n";
# print "Acrobat.pdf: ",
#     join(",",list("/local/lib/acrobat/Reader/Acrobat.pdf")),"\n";
# print "License.pdf: ",
#     join(",",list("/local/lib/acrobat/Reader/License.pdf")),"\n";
# print "MapTypes.pdf: ",
#     join(",",list("/local/lib/acrobat/Reader/MapTypes.pdf")),"\n";
# list("/etc/services");