package File::Mork; use strict; use vars qw($VERSION $ERROR); use POSIX qw(strftime); $VERSION = "0.2"; =head1 NAME File::Mork - a module to read Mozilla URL history files =head1 SYNOPSIS my $mork = File::Mork->new($filename, verbose => 1) || die $File::Mork::ERROR."\n"; foreach my $entry ($mork->entries) { while (my($key,$val) = each %$entry) { printf ("%14s = %s\n", $key, $val); } } =head1 DESCRIPTION This is a module that can read the Mozilla URL history file -- normally $HOME/.mozilla/default/*.slt/history.dat -- and extract the id, url, name, hostname, first visted dat, last visited date and visit count. To find your history file it might be worth using B which has some platform-independent code for finding the profiles of various Mozilla-isms (including Firefox, Camino, K-Meleon, etc.). =cut =head1 METHODS =head2 new [opts] Takes a filename and parses that file. Returns C on error, setting C<$File::Mork::Error>. Takes an optional hash of options =over 4 =item verbose A value up to 3 - defines the level of verbosity =item age A ctime which forces C to only parse entries later than this. =back =cut sub new { my ($class, $file, %opts) = @_; my $self = bless \%opts, $class; $self->{verbose} ||= 0; unless ($self->parse($file)) { $ERROR = $self->{error}; return; } return $self; } ########################################################################## # Define the messy regexen up here ########################################################################## my $top_level_comment = qr@//.*\n@; my $key_table_re = qr/ < \s* < # "< <" \( a=c \) > # "(a=c)>" (?> ([^>]*) ) > \s* # Grab anything that's not ">" /sx; my $value_table_re = qr/ < ( .*?\) )> \s* /sx; my $table_re = qr/ \{ -? # "{" or "{-" [\da-f]+ : # hex, ":" (?> .*?\{ ) # Eat up to a {... ((?> .*?\} ) # and then the closing }... (?> .*?\} )) # Finally, grab the table section \s* /six; my $row_re = qr/ ( (?> \[ [^]]* \] # "["..."]" \s*)+ ) # Perhaps repeated many times /sx; my $section_begin_re = qr/ \@\$\$\{ # "@$${" ([\dA-F]+) # hex \{\@ \s* # "{@" /six; my $crlf = qr/\x0a\x0d|\x0d\x0a|\x0a|\x0d/; # We are liberal in what we accept. # But then, so is a six dollar whore. =head2 parse Internal method to parse the file. Obviously. =cut sub parse { my ($self, $file) = @_; $self->{since} = ($self->{age} ? time() - $self->{age} : 0); $self->{section} = "top level"; $self->{section_end_re} = undef; ########################################################################## # Read in the file. ########################################################################## local $/ = undef; local *IN; $self->{file} = $file; $self->{total} = 0; $self->{skipped} = 0; unless (open (IN, $file)) { $self->{error} = "Couldn't open $file : $!"; return; } $self->debug("reading ...",1); my $body = ; close IN; $body =~ s/($crlf)/\n/gs; # Windows Mozilla uses \r\n # Presumably Mac Mozilla is similarly dumb $body =~ s/\\\\/\$5C/gs; # Sometimes backslash is quoted with a # backslash; convert to hex. $body =~ s/\\\)/\$29/gs; # close-paren is quoted with a backslash; # convert to hex. $body =~ s/\\\n//gs; # backslash at end of line is continuation. ########################################################################## # Figure out what we're looking at, and parse it. ########################################################################## $self->debug("parsing ...",1); pos($body) = 0; my $length = length($body); while( pos($body) < $length ) { my $section_end_re = $self->{section_end_re}; # Key table if ( $body =~ m/\G$key_table_re/gc ) { return unless $self->parse_key_table($1); # Values } elsif ( $body =~ m/\G$value_table_re/gco ) { return unless $self->parse_value_table($1); # Table } elsif ( $body =~ m/\G$table_re/gco ) { return unless $self->parse_table($1); # Rows (-> table) } elsif ( $body =~ m/\G$row_re/gco ) { return unless $self->parse_table($1); # Section begin } elsif ( $body =~ m/\G$section_begin_re/gco ) { my $section = $1; $self->{section_end_re} = qr/\@\$\$\}$section\}\@\s*/s; $self->{section} = $section; # Section end } elsif ( $section_end_re && $body =~ m/\G$section_end_re/gc ) { $self->{section_end_re} = undef; $self->{section} = "top level"; # Comment } elsif ( $body =~ m/\G$top_level_comment/gco ) { #no-op } else { # $body =~ m/\G (.{0,300}) /gcsx; print "<$1>\n"; return $self->error($self->{section}.": Cannot parse"); } } if($self->{section_end_re}) { return $self->error("Unterminated section ".$self->{section}); } $self->debug("sorting...",1); my @entries = map { File::Mork::Entry->new(%$_) } sort { $b->{LastVisitDate} <=> $a->{LastVisitDate} } values(%{$self->{row_hash}}); $self->debug("done! (".$self->{total}." total, ".$self->{skipped}." skipped)",1); for (qw(key_table val_table row_hash total skipped)) { $self->{$_} = undef; } $self->{entries} = \@entries; return 1; } =head2 entries Return a list of C objects sorted by B. =cut sub entries { return @{$_[0]->{entries}}; } ########################################################################## # parse a row and column table ########################################################################## sub parse_table { my($self, $table_part) = (@_); $self->debug("",3); # Assumption: no relevant spaces in values in this section $table_part =~ s/\s+//g; # print $table_part; #exit(0); # Grab each complete [...] block while( $table_part =~ m/\G [^[]* \[ # find a "[" ( [^]]+ ) \] # capture up to "]" /gcx ) { $_ = $1; my ($id, @cells) = split (m/[()]+/s); next unless scalar(@cells); # Trim junk $id =~ s/^-//; $id =~ s/:.*//; my %hash = ($self->{row_hash}->{$id}) ? %{$self->{row_hash}->{$id}} : ( 'ID' => $id, 'LastVisitDate' => 0 ); foreach (@cells) { next unless $_; my ($keyi, $which, $vali) = m/^\^ ([-\dA-F]+) ([\^=]) (.*) $/xi; return $self->error("unparsable cell: $_\n") unless defined ($vali); # If the key isn't in the key table, ignore it # my $key = $self->{key_table}->{$keyi}; next unless defined($key); my $val = ($which eq '=' ? $vali : $self->{val_table}->{$vali}); if ($key eq 'LastVisitDate' || $key eq 'FirstVisitDate') { $val = int ($val / 1000000); # we don't need milliseconds, dude. } $hash{$key} = $val; #print "$id: $key -> $val\n"; } if ($self->{age} && ($hash{LastVisitDate} || $self->{since}) < $self->{since}) { $self->debug("skipping old: $hash{LastVisitDate} $hash{URL}",3); $self->{skipped}++; next; } $self->{total}++; $self->{row_hash}->{$id} = \%hash; } return 1; } ########################################################################## # parse a values table ########################################################################## sub parse_value_table { my($self, $val_part) = (@_); return 1 unless $val_part; my @pairs = split (m/\(([^\)]+)\)/, $val_part); $val_part = undef; $self->debug("",3); foreach (@pairs) { next unless (m/[^\s]/s); my ($key, $val) = m/([\dA-F]*)[\t\n ]*=[\t\n ]*(.*)/i; if (! defined ($val)) { $self->debug($self->{section}.": unparsable val: $_"); next; } # Assume that URLs and LastVisited are never hexilated; so # don't bother unhexilating if we won't be using Name, etc. if($val =~ m/\$/) { # Approximate wchar_t -> ASCII and remove NULs $val =~ s/\$00//g; # faster if we remove these first $val =~ s/\$([\dA-F]{2})/chr(hex($1))/ge; } $self->{val_table}->{$key} = $val; $self->debug($self->{section}.": val $key = \"$val\"", 3); } return 1; } ########################################################################## # parse a key table ########################################################################## sub parse_key_table { my ($self, $key_table) = (@_); $self->debug("",3); $key_table =~ s@\s+//.*$@@gm; my @pairs = split (m/\(([^\)]+)\)/s, $key_table); $key_table = undef; foreach (@pairs) { next unless (m/[^\s]/s); my ($key, $val) = m/([\dA-F]+)\s*=\s*(.*)/i; return $self->error ("unparsable key: $_") unless defined ($val); # savie the other fields that we aren't interested in. $self->{key_table}->{$key} = $val; $self->debug($self->{section}.": key $key = \"$val\"",3); } return 1; } =head2 error Internal method to set the internal error message =cut sub error { my ($self, $message) = @_; $self->{error} = $self->{file}.": $message"; return undef; } =head2 debug Internal method to print out a debug message if it's a higher priority than the the current verbosity level. =cut sub debug { my ($self, $message, $level) = @_; $level ||= 0; return if $self->{verbose} < $level; print STDERR "".(($message eq "")? "\n" : $self->{file}.": $message\n" ); } =head1 THE UGLY TRUTH LAID BARE I In Netscape Navigator 1.0 through 4.0, the history.db file was just a Berkeley DBM file. You could trivially bind to it from Perl, and pull out the URLs and last-access time. In Mozilla, this has been replaced with a "Mork" database for which no tools exist. Let me make it clear that McCusker is a complete barking lunatic. This is just about the stupidest file format I've ever seen. http://www.mozilla.org/mailnews/arch/mork/primer.txt http://jwz.livejournal.com/312657.html http://www.jwz.org/doc/mailsum.html http://bugzilla.mozilla.org/show_bug.cgi?id=241438 In brief, let's count its sins: =over 4 =item Two different numerical namespaces that overlap. =item It can't decide what kind of character-quoting syntax to use: Backslash? Hex encoding with dollar-sign? =item C++ line comments are allowed sometimes, but sometimes // is just a pair of characters in a URL. =item It goes to all this serious compression effort (two different string-interning hash tables) and then writes out Unicode strings without using UTF-8: writes out the unpacked wchar_t characters! =item Worse, it hex-encodes each wchar_t with a 3-byte encoding, meaning the file size will be 3x or 6x (depending on whether whchar_t is 2 bytes or 4 bytes.) =item It masquerades as a "textual" file format when in fact it's just another binary-blob file, except that it represents all its magic numbers in ASCII. It's not human-readable, it's not hand-editable, so the only benefit there is to the fact that it uses short lines and doesn't use binary characters is that it makes the file bigger. Oh wait, my mistake, that isn't actually a benefit at all. =back Pure comedy. =head1 AUTHOR Module-ised by Simon Wistow based on http://www.jwz.org/hacks/mork.pl Created: 3-Mar-2004 by Jamie Zawinski, Anonymous, and Jacob Post. =head1 COPYRIGHT Copyright © 2004 Jamie Zawinski =head1 LICENSE Permission to use, copy, modify, distribute, and sell this software and its documentation for any purpose is hereby granted without fee, provided that the above copyright notice appear in all copies and that both that copyright notice and this permission notice appear in supporting documentation. No representations are made about the suitability of this software for any purpose. It is provided "as is" without express or implied warranty. =head1 BUGS Might be a bit memory heavy? Could do with an iterator interface. Can't write Mork dbs. =head1 SEE ALSO http://www.livejournal.com/users/jwz/312657.html http://www.erys.org/resume/netscape/mork/jwz.html =cut package File::Mork::Entry; use strict; use vars qw($AUTOLOAD); =head1 NAME File::Mork::Entry - an single entry in a mork DB =head1 METHODS All methods except C take an optional argument to set them. =head2 new <%opts> blesses C<%opts> into the class File::Mork::Entry =cut sub new { my ($class, %self) = @_; return bless \%self, $class; } =head2 ID The internal id of the entry =head2 URL The url visited =head2 NAME The name of the url visited =head2 Hostname The hostname of the url visited =head2 FirstVisitDate The first time this url was visited as a C =head2 LastVisitDate The last time this url was visited as a C =head2 Hidden Whether this URL is hidden from the history list or not =head2 VisitCount The number of times this url has been visited =head2 ByteOrder The byte order - this is associated with ID number 1. =cut sub DESTROY { } sub AUTOLOAD { my $self = shift; my $attr = $AUTOLOAD; $attr =~ s/.*:://; $self->{$attr} = $_[0] if @_; return $self->{$attr}; } 1;