#!/usr/bin/perl
use strict;
use warnings;

use LWP::Simple;

# For each prefix page listed in @Files: use the local copy if one exists,
# otherwise download it from $Base and save it locally; then pass the HTML
# through munge() and write the result to "m-<file>".

my $Base  = 'http://www.isbn.spk-berlin.de/html/prefix/';
my @Files = map "pref$_.htm", qw(
    a b c d_f g_h i_j k_l m n_o p q_r s t u v_z
);

foreach my $file ( @Files ) {
    my $html;

    if ( -f $file ) {
        # A local copy already exists: slurp it instead of downloading.
        my $in;
        unless ( open $in, '<', $file ) {
            warn "Could not open local file [$file]\n$!";
            next;
        }
        local $/;    # slurp mode, limited to this block
        $html = <$in>;
        close $in;
    }
    else {
        $html = get( "$Base$file" );
        unless ( defined $html ) {
            # get() returns undef on failure; there is nothing to cache or munge.
            warn "Could not fetch [$Base$file]\n";
            next;
        }

        # A failure to write the local copy is reported but is not fatal:
        # the downloaded text is still munged below.
        if ( open my $cache, '>', $file ) {
            print {$cache} $html;
            close $cache or warn "Could not close [$file]\n$!";
        }
        else {
            warn "Could not open [$file]\n$!";
        }
    }

    my $munged = munge( $html );

    my $out;
    unless ( open $out, '>', "m-$file" ) {
        warn "Could not open [m-$file]\n$!";
        next;
    }
    print {$out} $munged;
    close $out or warn "Could not close [m-$file]\n$!";
}

# munge( $html )
#
# Applies a fixed sequence of substitutions to the page text passed as the
# first argument and returns the resulting string.  The argument itself is
# not modified; the work is done on a localized copy in $_.
#
# NOTE(review): several patterns below, and the two empty qw() lists, look
# as if literal HTML tag text has been stripped out of them (for example the
# "chop off the tail" pattern is a bare ".*", and some patterns are only
# "\s+" or "\s*").  They are reproduced here exactly as found.  As written
# they match far more than their comments describe -- TODO restore the
# missing tag literals from the original script before trusting the output.
sub munge {
    local $_ = shift;
    $_ = '' unless defined $_;

    s|.*\s+\s+.*?\s+||s;    # chop off the head
    s|.*||s;                # chop off the tail
    s| +| |sg;              # collapse spaces

    # Strip attribute fragments; each list entry is used as a regex.
    foreach my $pattern ( qw(
        \s+width="\\d+"
        \s*
        \s+colspan="\d+"
        \s+height="\d+"
        \s+v?align=".*?"
    ) ) {
        s|$pattern||isg;
    }

    s|(\d+)\s+-\s+(\d+)|$1 - $2|g;    # normalise whitespace around range dashes
    s|\s+||ig;
    s|\s+||ig;
    s|\t| |g;

    {
        # The optional captures, and the capture numbers in the replacement
        # that have no matching group in the pattern, interpolate as empty
        # strings; silence the resulting "uninitialized" warnings here only.
        no warnings 'uninitialized';

        s|
            \s+
            \s*
            ([\w\s,]+)      # capture 1
            \s*
            \s*
            ( .*? )?        # capture 2, possible extra remark
            \s+
            (?:\s+)?        # possible blank cell
            \s*
            (\d+)           # capture 3, first country code
            \s*
            (?:)?           # some of these are missing
            (?:             # there might be more country codes
                \s*
                (?:\s*)?    # some of these are missing too
                \+
                (?:\s*)?    # some of these are missing
                \s*
                (?:)?
                \s*
                (\d+)       # capture 4 onwards, other country codes
                \s*
                (?:)?
            )*              # these might be here or not
            \s+
        |\f$1 $2*$3*$4*$5*$6|ixg;
    }

    # Empty list as found: this loop currently does nothing (see NOTE above).
    foreach my $pattern ( qw( ) ) {
        s|$pattern||isg;
    }

    s|\s+[\r\n]|\n|g;     # drop trailing whitespace before a line break
    s|[\r\n]+|\n|g;       # collapse runs of line breaks
    s|(\f.*?)([A-Z]+)\s+|$1*$2\n|sg;

    # Empty list as found: this loop currently does nothing (see NOTE above).
    foreach my $pattern ( qw( ) ) {
        s|$pattern||sg;
    }

    return $_;
}