package WebService::CIA::Parser;
require 5.005_62;
use strict;
use warnings;
use WebService::CIA;
our $VERSION = '1.4';
sub new {
my $proto = shift;
my $source = shift;
my $class = ref($proto) || $proto;
my $self = {};
bless ($self, $class);
return $self;
}
sub parse {
my ($self, $cc, $html) = @_;
my $data = {
'URL - Flag' => $WebService::CIA::base_url . 'flags/' . $cc . '-flag.gif',
'URL - Map' => $WebService::CIA::base_url . 'maps/' . $cc . '-map.gif',
'URL' => $WebService::CIA::base_url . 'geos/' . $cc . '.html',
'URL - Print' => $WebService::CIA::base_url . 'print/' . $cc . '.html'
};
while ($html =~ m#
]+ class="FieldLabel">.*?
(.+?):
.*?.*?
| .*?
]+>
(.*?)
( | |)
#xsg) {
my $field = $1;
my $value = $2;
$field =~ s/\s+/ /sg;
$field =~ s/^\s*(.*?)\s*$/$1/;
$value =~ s/\s+/ /sg;
$value =~ s/^\s*(.*?)\s*$/$1/;
$value =~ s/\s*
\s*/\n/g;
$value =~ s/<\/?[^>+]>//g;
$data->{$field} = $value;
}
return $data;
}
1;
__END__
=head1 NAME
WebService::CIA::Parser - Parse pages from the CIA World Factbook
=head1 SYNOPSIS
use WebService::CIA::Parser;
my $parser = WebService::CIA::Parser->new;
my $data = $parser->parse($string);
=head1 DESCRIPTION
WebService::CIA::Parser takes a string of HTML and parses it. It will only give
sensible output if the string is the HTML for a page whose URL matches
C
This parsing is somewhat fragile, since it assumes a certain page structure.
It'll work just as long as the CIA don't choose to alter their pages.
=head1 METHODS
=over 4
=item C
Creates a new WebService::CIA::Parser object. It takes no arguments.
=item C
Parses a string of HTML take from the CIA World Factbook. It takes a single
string as its argument and returns a hashref of fields and values.
The values are stripped of all HTML. CbrE> tags are replaced by
newlines.
It also creates four extra fields: "URL", "URL - Print", "URL - Flag", and
"URL - Map" which are the URLs of the country's Factbook page, the
printable version of that page, a GIF map of the country, and a GIF flag
of the country respectively.
=back
=head1 EXAMPLE
use WebService::CIA::Parser;
use LWP::Simple qw(get);
$html = get(
"https://www.cia.gov/library/publications/the-world-factbook/print/uk.html"
);
$parser = WebService::CIA::Parser->new;
$data = $parser->parse($html);
print $data->{"Population"};
=head1 AUTHOR
Ian Malpass (ian-cpan@indecorous.com)
=head1 COPYRIGHT
Copyright 2003-2007, Ian Malpass
This library is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.
The CIA World Factbook's copyright information page
(L)
states:
The Factbook is in the public domain. Accordingly, it may be copied
freely without permission of the Central Intelligence Agency (CIA).
=head1 SEE ALSO
WebService::CIA
=cut