package WWW::Patent::Page::JPO_IPDI; use Carp qw(cluck confess carp croak); use strict; use warnings; use diagnostics; use subs qw( methods JPO_IPDI_country_known), qw( JPO_IPDI_terms JPO_IPDI_translation JPO_IPDI_request _output_next_request_form _output_next_request_re _save_image ) ; ## JPO_IPDI_xml_tree use LWP::UserAgent 2.003; require HTTP::Request; use HTTP::Request::Common; use URI; use HTML::Form; use PDF::API2 2.000; use WWW::Patent::Page::Response; use Archive::Zip qw( :ERROR_CODES :CONSTANTS ); use HTML::TokeParser; use HTTP::Headers; use IO::Scalar; use Data::Dumper ; # use HTML::Display; ## comment out after completion; used for testing. $Carp::Verbose = 1; my $POST; $POST = 'GET'; $POST = 'POST'; # print "$] $^O $^V $^X @INC \n" ; exit; # my $browser = HTML::Display->new(class => 'HTML::Display::Win32::IE',); our ($VERSION, %_country_known); $VERSION = '0.02'; #$| = 1; sub JPO_IPDI_translation { my $self = shift; $self->agent('Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030516 Mozilla Firebird/0.6'); # $self->proxy(['http', 'ftp'], 'http://localhost:5364/'); ## Howard P. Katseff, "Web Scraping Proxy" wsp http://www.research.att.com/~hpk/ my $re; ## regular expression my (@english_response, @japanese_page, @image, %image_hash); ## @english_response 0=>summary 1=>claims 2=>detailed description my (@response, $response, @request, $request); my (@next_request, $next_request); my $html; my %parameter; # print "'$self->{'patent'}->{'doc_type'}' doc_type\n"; if (defined($self->{'patent'}->{'doc_type'})) { %parameter = ( N2001 => $self->{'patent'}->{'doc_type'} . $self->{'patent'}->{'number'}, N1001 => $self->{'patent'}->{'kind'} ); } else { %parameter = ( N2001 => $self->{'patent'}->{'number'}, N1001 => $self->{'patent'}->{'kind'} ); } ## print "Number '$parameter{N2001}' type '$parameter{N1001}'\n"; my $name = $parameter{N2001} . '_' . $parameter{N1001}; $request = HTTP::Request->new(GET => "http://www4.ipdl.inpit.go.jp/Tokujitu/tjsogodben.ipdl?" . "N0000=115"); my $output_next_request = sub { ## take the raw html, find the right form, fill in my $response = shift; my $html = $response->content; if ($html =~ m/The whole service will not be provided in the following time for regular maintenance/mxi) { $self->{success} = 0; $self->{message} = 'The whole service will not be provided in the following time for regular maintenance.'; return ($response, $request); } my @forms = HTML::Form->parse($response); if (!scalar(@forms)) { carp 'returned html has no forms; possibly maintenance time at JPO? '; $request = HTTP::Request->new(GET => 'http://www.ipdl.inpit.go.jp/homepg_e.ipdl'); $response = $self->request($request); confess $response->content; $self->{is_success} = q{}; $self->{message} = 'returned html has no forms; possibly maintenance time? '; return $self; ## this kicks out error at $response[1] = $self->request( $next_request[0] ); due to WWW::Patent::Page object not request object } my @inputs = $forms[0]->inputs; foreach my $name (keys %parameter) { my $input = $forms[0]->find_input($name); $input->readonly(0); $forms[0]->value($name, $parameter{$name}); # print "\$name = '$name' , \$parameter{$name} = '$parameter{$name}'\n"; } # exit; $forms[0]->method($POST); my $next_request = $forms[0]->click; # request for form submission return $next_request; }; ($response[0], $next_request[0]) # the html with forms, and the next url = (JPO_IPDI_request($self, $request, $output_next_request)); $response[1] = $self->request($next_request[0]); #obtain frames and URLs $re = qr/NAME="tjlistdben" \s+ SRC="([^"]+)"/imx; # find the url for our number ($next_request[2]) = (_output_next_request_re($response[1], $re)); my $referer = $next_request[2]->uri; # use later $response[2] = $self->request($next_request[2]); $html = $response[2]->content; ## to click on the number, fill out the form if ( $html # ( =~ m/ \) [\n\s]* \{ \s* document\.form1\.N0000\.value \s* = \s* (\d+) \s* ;/isxm ) # } { $parameter{'N0000'} = $1; } else {carp "No N0000 in \$html:\n$html\n"} if ($html =~ m/"N0703" \s+ VALUE=" ([^"]+) "/isxm) { $parameter{'N0703'} = $1; } else {carp 'No N0703 in \$html'} if ($html =~ m/"N0700" \s+ VALUE=" ([^"]+) "/isxm) { $parameter{'N0700'} = $1; } else {carp 'No N0700 in \$html'} if ($html =~ m/"N0701" \s+ VALUE=" ([^"]+) "/isxm) { $parameter{'N0701'} = $1; } else {carp 'No N0701 in \$html'} if ($html =~ m/"N0702" \s+ VALUE=" ([^"]+) "/isxm) { $parameter{'N0702'} = $1; } else {carp 'No N0702 in \$html'} if ($html =~ m/DL\[0\]=" ([^"]+) "/isxm) { $parameter{'N0500'} = $1; } else {carp 'No N0500 in \$html'} if ($html =~ m/PI\[0\]=" ([^"]+) "/isxm) { # PI[0]="000100110011" $parameter{'N0501'} = $1; $parameter{'N0501'} =~ s/\n//imxg; } else {carp 'No N0501 in $html'} $parameter{'N0502'} = substr($parameter{'N0501'}, 0, 4); $parameter{'N0503'} = '0'; $parameter{'N0504'} = 2; if ($html =~ m/MID=" ([^"]+) "/isxm) { $parameter{'N0001'} = $1; $parameter{'N0001'} =~ s/\n//mxig; } else {carp 'No N0001 in \$html'} if ($html =~ m/DK= ([^;]+) ;/isxm) { $parameter{'N0002'} = $1; $parameter{'N0002'} =~ s/\n//mxig; } else {carp 'No N0002 in \$html'} ($next_request[5]) = (_output_next_request_form($response[2], 0, %parameter)); $next_request[5]->header(referer => $referer); $response[5] = $self->request($next_request[5]); # click on the patent number # now you should have the CALLPAJ routine- click the form 0 $next_request[13] = (_output_next_request_form($response[5], 0,)); $response[13] = $self->request($next_request[13]); # print "\$next_request[13] < \$english[0] = ".Dumper($next_request[13])."\n"; #if ($debug) { } # $browser->display(html => $http_response->content); #first branch, now a passing move my $html13 = $response[13]->content(); # print "\$html13 = \n$html13\n"; if ($html13 =~ m/SRC="([^"]+)" \s+ NAME="FTMMAIN"/imx ) { # print "2!\n"; $request = _output_next_request_re($response[13], qr/SRC="([^"]+)" \s+ NAME="FTMMAIN"/imx); $english_response[0] = $self->request($request); #->content; # summary, coversheet, first page $next_request[13] = _output_next_request_re($response[13], qr/SRC="([^"]+)" \s+ NAME="FTMNAVI"/imx); $response[14] = $self->request($next_request[13]); # Detail / Japanese ($next_request[6]) = (_output_next_request_form($response[14], 1)); # 0 or 1 ? # print "\$next_request[6] > \$english[0] = ".Dumper($next_request[6])."\n"; $response[6] = $self->request($next_request[6]); } else { # print "1!\n"; $self->{is_success} = 0; $self->{message} = 'no PAJ translation available'; # carp('no PAJ translation available- possibly reference is too old or too new?'); $english_response[0] = HTTP::Response->new(200, 'sham' , HTTP::Headers->new , 'JP ' . $name . '

JP ' . $name . '

No translated patent abstract is available.

'); $response[6] = $response[13]; } # end of branch 1, now returning to linear flow . $html = $response[6]->content; if ($html =~ m/"N0000" \s+ VALUE=" ([^"]+) "/isxm) { $parameter{'N0000'} = $1; } else {carp "No N0000 in \$html:\n$html\n".Dumper($response[6]) } ($next_request[6]) = (_output_next_request_form($response[6], 0, %parameter)); # 0 or 1 ? $response[6] = $self->request($next_request[6]); $html = $response[6]->content; my $K_flg; if ($html =~ m/K_flg \s* = \s* " ([^"]+) "/isxm) { $K_flg = $1; } else {carp "No \$K_flg $K_flg in \$html:\n$html\n"} my $mid; if ($html =~ m/MID \s* = \s* (\d+) /isxm) { $mid = $1; } else {carp "No \$mid $mid in \$html:\n$html\n"} $re = qr/FRAME \s+ NAME="tjitemidx" \s+ SRC="([^"]+)"/imsx; ($next_request[7]) = (_output_next_request_re($response[6], $re)); $re = qr/FRAME \s+ NAME="tjitemcnt" \s+ SRC="([^"]+)"/imsx; ($next_request[8]) = (_output_next_request_re($response[6], $re)); $re = qr/FRAME \s+ NAME="tjitemdrw" \s+ SRC="([^"]+)"/imsx; ($next_request[9]) = (_output_next_request_re($response[6], $re)); $response[6] = $self->request($next_request[6]); #obtain frames and URLs $response[7] = $self->request($next_request[7]); #navigation $response[8] = $self->request($next_request[8]); #claims $english_response[1] = $response[8]; #claims $response[9] = $self->request($next_request[9]); #drawings navigator # now let's get the English $html = $response[7]->content; if ($html =~ m/document\.form3\.N0000\.value \s* = \s* (\d+) \s* ;/isxm) { $parameter{'N0000'} = $1; } else {carp "No N0000 in \$html:\n$html\n"} $parameter{'N0550'} = $K_flg; $parameter{'N0551'} = '00000000000010000000'; # drawings $parameter{'N0580'} = '0'; $parameter{'N0001'} = $mid; ($next_request[20]) = (_output_next_request_form($response[7], 3, %parameter)); # 0 or 1 ? $response[20] = $self->request($next_request[20]); # &japanese(); if you want the images of the untranslated patent $html = $response[6]->content; my $japanese_url; if ($html =~ m/location\.href="([^;]+)/ismx) { $japanese_url = $1; $japanese_url =~ s/"$//i; if (! $japanese_url =~ m/"\+ MID \+"/i) { warn "\$japanese_url '$japanese_url'\n"} # variable # of spaces around + sign $japanese_url =~ s/"\+ MID \+"/$parameter{'N0001'}/i; $japanese_url =~ s/"\+ DK \+ *"/$parameter{'N0002'}/i; $japanese_url =~ s/" \+ DL \+ "/$parameter{'N0500'}/i; $japanese_url =~ s/" \+ PI \+ "/$parameter{'N0501'}/i; $japanese_url =~ s/" \+ PI.substring\( 0, 4 \) \+ "/$parameter{'N0502'}/i; $japanese_url =~ s/" \+ count \+ "/0/i; # print "\$japanese_url = '$japanese_url'\n"; } else { carp "No Japanese location.href in function SubmitDataLayout: \n$html\n"; } my $japanese_request = HTTP::Request->new('GET', URI->new($japanese_url)); $response[30] = $self->request($japanese_request); # next is where it goes wrong $next_request[31] = _output_next_request_re($response[30], qr/NAME="tjcontentdben" \s* SRC="([^"]+)"/imx); $response[31] = $self->request($next_request[31]); $next_request[32] = _output_next_request_re($response[30], qr/NAME="tjdispdben" \s* SRC="([^"]+)"/imx); $response[32] = $self->request($next_request[32]); $html = $response[32]->content; my $time = int(1000 * time - int(rand(1000))); my %page_parameter; $page_parameter{ImgKind} = 1; $page_parameter{RolKind} = 0; $page_parameter{'N0001'} = $parameter{'N0001'}; $page_parameter{'N0002'} = $parameter{'N0002'}; $page_parameter{'N0500'} = $parameter{'N0500'}; $page_parameter{'N0501'} = $parameter{'N0501'}; $page_parameter{'N0502'} = $parameter{'N0502'}; $page_parameter{'N0503'} = $parameter{'N0503'}; $page_parameter{'N0700'} = $page_parameter{ImgKind}; # 1 $page_parameter{'N0701'} = 0; # 0 = no rotation $page_parameter{'N0702'} = 0; # 0 = reversal->no $page_parameter{'N0703'} = 0; # opposite (1's complement) of 700 my $last_page = substr($parameter{'N0501'}, 4, 4); $last_page =~ s/^0+//mxi; # print "\$last_page = '$last_page'\n"; # eliminate this image fetching. # my $JPO_page = '000'; # foreach my $image_count (1 .. $last_page) { # $JPO_page++; # $page_parameter{'N0502'} = $image_count; # ($next_request[33]) = (&_output_next_request_form($response[32], 0, %page_parameter)); # 0 or 1 ? # $response[33] = $self->request($next_request[33]); # $next_request[34] = &_output_next_request_re($response[33], qr/NAME="tjcontentdben" \s* SRC="([^"]+)"/imx); # $response[37] = $self->request($next_request[34]); # $next_request[38] = &_output_next_request_re($response[37], qr/IMG \s* SRC="([^"]+)"/imx); # $response[38] = $self->request($next_request[38]); # print 'uri = ', $next_request[38]->uri, "\n"; # &_save_image($response[38], 'image' . $JPO_page . '.gif'); # } #japanese images undef $parameter{'N0000'}; $html = $response[20]->content; if ($html =~ m/"N0000" \s+ VALUE=" ([^"]+) "/isxm) { $parameter{'N0000'} = $1; } else {carp "No N0000 in $html"} #DETAILED DESCRIPTION javascript:ShowFrames('00010000000000000000') #TECHNICAL FIELD javascript:ShowFrames('00001000000000000000') #PRIOR ART javascript:ShowFrames('00000100000000000000') #EFFECT OF THE INVENTION javascript:ShowFrames('00000010000000000000') #EXAMPLE javascript:ShowFrames('00000000100000000000') #TECHNICAL PROBLEM javascript:ShowFrames('00000000010000000000') #MEANS javascript:ShowFrames('00000000001000000000') #DESCRIPTION OF DRAWINGS javascript:ShowFrames('00000000000100000000') #DRAWINGS javascript:ShowFrames('00000000000010000000') my $count = 1; foreach my $N0551 ( '00010000000000000000', # Detailed Description '00000000000100000000', # Description of Drawings '00000000000010000000' # Drawings ) { $parameter{'N0551'} = $N0551; $count++; ($next_request[21]) = (_output_next_request_form($response[20], 1, %parameter)); # 0 or 1 ? $english_response[$count] = $self->request($next_request[21]); } # loop to get translated pages # now we get the drawings (this works as to web) $html = $response[9]->content; #drawings navigator $referer = $next_request[9]->uri; my %drawing2image; # print "drawings navigator referer [9] = $referer\n"; if ($html =~ m/IMG \s+ SRC=" ([^"]+) "/isxm) { $re = qr/IMG \s+ SRC=" ([^"]+) "/isxm; ($next_request[0]) = (_output_next_request_re($response[9], $re)); $response[0] = $self->request($next_request[0]); $image[0] = $response[0]->content; } else {carp "NO expected representative drawing in \n" . $html . "\n";} my ($images); if ($html =~ m/drawing \s+ (\d+) \s* <\/ SELECT /isxm) { ($images) = ($1); #how many drawings / images in patent # print "$images drawings found.\n"; } else {carp 'no last drawing number found.'} #my $drawing; foreach my $drawing (1 .. $images) { # $images from drawings navigator # print "processing drawing $drawing\n"; if ($html =~ m/VALUE=" ([^"]+) "> \n* drawing \s+ $drawing \n* /isxm) { my $number = $1; my %image = ( select1 => "$number", N0552 => substr($number, 0, 1), N0553 => substr($number, 2, 6), N0500 => $parameter{N0500}, ); $drawing2image{$image{N0553}} = $drawing; ($next_request[1]) = (_output_next_request_form($response[9], 0, %image)); $next_request[1]->header(referer => $referer); # print "url = ", $next_request[1]->uri , "\n"; $response[1] = $self->request($next_request[1]); #html that holds link to drawing my $html2 = $response[1]->content; if ($html2 =~ m/IMG \s+ SRC=" ([^"]+) "/isxm) { $re = qr/IMG \s+ SRC=" ([^"]+) "/isxm; # drawing link ($next_request[0]) = (_output_next_request_re($response[1], $re)); $response[0] = $self->request($next_request[0]); $image[$drawing] = $response[0]->content; # _save_image($response[0],"Image__$drawing"); } } else { confess "single drawing number $drawing of $images not found.\nhtml = $html\n"; } } $html = $english_response[2]->content; #detailed description if ($html =~ m/IMG \s+ SRC=" ([^"]+)".?ALT="ID=\s*([^"]+)"/isxm) { my $link = $1; my $number = $2; $re = qr/IMG \s+ SRC=" ([^"]+) "/isxm; # table like-drawing ($next_request[0]) = (_output_next_request_re($english_response[2], $re)); $response[0] = $self->request($next_request[0]); # print "saving 'table' as image[$images]; link = '$link' number = '$number'\n"; $image[++$images] = $response[0]->content; $image_hash{$number} = $response[0]->content; # print "got representative image\n"; } # else {carp "NO expected table drawing in here: \n" . "\$html :\n$html\n" . "\n";} #=cover.html'); 1=> claims.html');2=>detailed_description.html'); 3=>drawing_description.html');4=>\drawings.html'); $html = q{}; ## here is the start of the html manipulation and zip # my (@english_response, $tokenizer, $re, @image, %alt2image_file_name, $id, %alt2image); my ($id, %alt2image, %alt2image_file_name); my $other_image = 'other000'; for my $page (0 .. $#english_response) { # print "processing english page $page\n"; my $p = HTML::TokeParser->new(\$english_response[$page]->content) or cluck "page $page problem!"; my $base = $english_response[$page]->base; # print "\$base = '$base'\n" ; my $url; while (my $token = $p->get_token) { if (!$token) { # print "end of page $page"; } if (${$token}[0] eq 'S' and ${$token}[1] eq 'a') { ## print "found an anchor.\n ${$token}[0] ${$token}[1] " . join ' ', %{ ${$token}[2] }, "\n", ${$token}[4], "\n"; if (${$token}[2]{'href'} =~ m/N0553%3D(\d+)/mxi) { $id = $1; } else {carp "token ${$token}[4] not parsed"} if ($page == 4) { $html .= "\n\n"; } else {$html .= "\n\n"} } elsif (${$token}[0] eq 'S' and ${$token}[1] eq 'base') { } # no base markup elsif (${$token}[0] eq 'S' and ${$token}[1] eq 'emi') { } # no emi markup elsif (${$token}[0] eq 'S' and ${$token}[1] eq 'txf') { } # no txf markup elsif (${$token}[0] eq 'S' and ${$token}[1] eq 'dp') { } # no dp markup elsif (${$token}[0] eq 'E' and ${$token}[1] eq 'base') { } # no base markup elsif (${$token}[0] eq 'E' and ${$token}[1] eq 'emi') { } # no emi markup elsif (${$token}[0] eq 'E' and ${$token}[1] eq 'txf') { } # no txf markup elsif (${$token}[0] eq 'E' and ${$token}[1] eq 'dp') { } # no dp markup # elsif (${$token}[0] eq 'S' and ${$token}[1] eq 'base') { } # no base markup elsif (${$token}[0] eq 'S' and ${$token}[1] eq 'base') { } # no base markup elsif (${$token}[0] eq 'S' and ${$token}[1] eq 'img') { my ($img, $type, $count); if (exists ${$token}[2]{'src'} and ${$token}[2]{'src'} =~ m{.*/([^\.]+)\.(.+)$}mxi) { $img = $1; $type = $2; } else { carp "'${$token}[2]{'src'}' (src) does not pass m|/([^\\.]+)\\.(.+)$|"; } if (exists ${$token}[2]{'alt'} and ${$token}[2]{'alt'} =~ m{^id=(\d+)$|}mxi) { $id = $1; } elsif (${$token}[2]{'border'} == 0) { $id = 'representative000'; } else {$id = $other_image++;} # not drawing or representative image; likely a table or chemical structure $alt2image_file_name{$id} = $img . q{.} . $type; # print "\$alt2image_file_name{$id} = $alt2image_file_name{$id}\n" ; # my $ua = LWP::UserAgent->new; # $ua->agent('Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030516 Mozilla Firebird/0.6'); # $ua->proxy(['http', 'ftp'], 'http://localhost:5364/') ; #Howard P. Katseff, "Web Scraping Proxy" wsp http://www.research.att.com/~hpk/ # my $req = URI->new_abs(${$token}[2]{'src'}, $base); my $req = HTTP::Request->new(GET => URI->new_abs(${$token}[2]{'src'}, $base)); my $res = $self->request($req); $alt2image{$id} = $res->content; # open my $IMG, ">P:\\workspace\\WWW-Patent-Page\\" . "$img" . ".$type"; # binmode $IMG; # print $IMG $res->content; # close $IMG; my $link = "$type id = $id
"; $html .= $link; } elsif ( $page == 0 and ${$token}[0] eq 'S' and ${$token}[1] eq 'title') { $token = $p->get_token; # text of title $html .= '' . $name . '' . "\n"; } elsif (${$token}[0] eq 'S' and ${$token}[0] ne 'a' and ${$token}[0] ne 'img') { if ( $page > 0 and ( ${$token}[1] eq 'body' or ${$token}[1] eq 'html' or ${$token}[1] eq 'head' or ${$token}[1] eq 'meta' or ${$token}[1] eq 'title') ) { } else { $html .= ${$token}[4]; } } elsif ( $page < 4 and ${$token}[0] eq 'E' and ( ${$token}[1] eq 'body' or ${$token}[1] eq 'html' or ${$token}[1] eq 'head') ) { # skip this end link } elsif ($page == 4 and ${$token}[1] eq 'head' and ${$token}[0] eq 'E') { } elsif ( $page > 0 and ${$token}[0] eq 'S' and ( ${$token}[1] eq 'body' or ${$token}[1] eq 'html' or ${$token}[1] eq 'head' or ${$token}[1] eq 'meta' or ${$token}[1] eq 'title') ) { } elsif (${$token}[1] eq 'title') { } elsif (${$token}[0] eq 'E') {$html .= ${$token}[2]} elsif (${$token}[0] eq 'T') {$html .= ${$token}[1]} elsif (${$token}[0] eq 'C') {$html .= ${$token}[1]} elsif (${$token}[0] eq 'D') {$html .= ${$token}[1]} elsif (${$token}[0] eq 'PI') {$html .= ${$token}[2]} } # print " page done : $page\n"; } $html =~ s/\[([^\]]+)\]/[$1] /mxig; # for readibility # open my $HTML, '>P:\workspace\WWW-Patent-Page\html3.html'; # print $HTML $html; # close $HTML; my $zip = Archive::Zip->new(); # my $directory = q{}; # my $dir_member = $zip->addDirectory("JP_$name/"); # my $string_member = $zip->addString($html, "JP_$name/index.html"); my $string_member = $zip->addString($html, 'index.html'); $string_member->desiredCompressionMethod(COMPRESSION_DEFLATED); my (@image_member, $i); # $alt2image{$id} $i = 0; foreach my $key (keys %alt2image) { # $image_member[$i] = $zip->addString($alt2image{$key}, "JP_$name/$alt2image_file_name{$key}"); $image_member[$i] = $zip->addString($alt2image{$key}, "$alt2image_file_name{$key}"); $image_member[$i++]->desiredCompressionMethod(COMPRESSION_STORED); } # unless ($zip->writeToFileNamed("P:\\workspace\\WWW-Patent-Page\\JP_$name" . '.zip') == AZ_OK) { # carp 'zip write error'; # } # print "past zip write\n"; my $zipContents = ''; my $SH = IO::Scalar->new(\$zipContents); # if ($zip->writeToFileHandle( $SH , 0 ) ) { $zip->writeToFileHandle( $SH , 0 ); $self->{is_success} = 1; $self->{'patent'}{'content'} = $zipContents; return ($zip); # } # else { # cluck 'no write to file handle $SH' # } } # JPO_IPDI_translation sub _output_next_request_re { my $response = shift; my $regular_expression = shift; my $base = $response->base; my $html = $response->content; my $url; if ($html =~ m/$regular_expression/ixm) { $url = $1; } else {carp "no $regular_expression ... regular expression\n$html";} my $next_request = URI->new_abs($url, $base); $next_request = HTTP::Request->new(GET => $next_request); if ($url) {return $next_request;} else {return} } sub _save_image { my ($response, $name) = @_; my $image = $response->content; open my $IMAGE, '>', "$name"; binmode $IMAGE; print {$IMAGE} $image; close $IMAGE; return; } sub _output_next_request_form { # make request given; get response. # look at response and figure out next request to make # as an absolute URI object my ($http_response, $whichform, %replacement) = @_; # my $whichform = shift @_; my $next_request; # my %replacement = @_; my @forms = HTML::Form->parse($http_response); # my @f = grep $_->attr("id") eq "form1", @forms; # print scalar(@forms), " forms\n ",; # print scalar(@f) , " matching\n" ; my @inputs = $forms[$whichform]->inputs; foreach my $name (keys %replacement) { my $input = $forms[$whichform]->find_input($name); if ($input) { $input->readonly(0); $forms[$whichform]->value($name, $replacement{$name}); } } $forms[$whichform]->method($POST); $next_request = $forms[$whichform]->click; # print $next_request->as_string("\n"); return $next_request; } sub JPO_IPDI_request { my ($self, $request, $output_next_request, %tests) = @_; # my $request = shift @_; # my $output_next_request = shift @_; my $next_request; # my %tests = @_; my $http_response = $self->request($request); $next_request = &{$output_next_request}($http_response); # $browser->display(html => $http_response->content); return ($http_response, $next_request); } # perl -MLWP::Simple -MHTML::Display -e "my \$browser = HTML::Display->new( class => 'HTML::Display::Win32::IE', ); \$browser->display( html => get( 'http://www.google.com'), location => 'http://www.google.com');" sub methods { return ( ## no method() 'JPO_IPDI_translation' => \&JPO_IPDI_translation, 'JPO_IPDI_country_known' => \&JPO_IPDI_country_known, ## 'JPO_IPDI_parse_doc_id' => \&JPO_IPDI_parse_doc_id, 'JPO_IPDI_terms' => \&JPO_IPDI_terms, ); } sub JPO_IPDI_country_known { my $self = shift; my ($country_in_question) = shift; if (uc($country_in_question) == 'JP') { return ($_country_known{$country_in_question}); } else { return (undef); } } sub JPO_IPDI_terms { my ($self) = @_; return ( 'The Industrial Property Digital Library (IPDL) offers the public access to IP Gazettes of the JPO free of charge through the Internet. Certain conditions may apply. As of February 2007, consult http://www.ipdl.ncipi.go.jp/notice_e.htm .' ); } %_country_known = ( # 20060922 'JP' => 'from 19800109 to 20060906', # Japan--B 'JP' => 'from 19830527 to 20060824', # Japan--A 'JP' => 'from 1980 (partial coverage)', # Japan--other # http://www4.ipdl.inpit.go.jp/Tokujitu/tjsogodben.ipdl?N0000=115 ... Stored Data # Stored Data Information(Patent & Utility Model Gazette DB) # 13/12/2007 # The coverage of available documents is following. (not necessarily translated) # # Document Description Coverage of Documents # A: Published patent application 1971-000001 - 2007-325500 # B: Examined patent application publication 1922-000001 - 1996-034772 # B: Patent 2500001 - 4022300 # C: Patent specification 1 - 216017 # A: Japanese translation of PCT international application 1979-500001 - 2007-536890 # U: Registered utility model 3000001 - 3138009 # U: Published utility model application 1971-000001 - 2006-000001 # U1: Unexamined utility model specification 1971-000001 - 1992-138600 # Y: Examined utility model application publication 1922-000001 - 1996-011090 # Y: Examined utility model registration 2500001 - 2607898 # Z: Examined utility model specification 1 - 406203 # U: Japanese translation of PCT international application(utility model) 1979-500001 - 1998-500001 # H: Corrected patent specification 72 - 814 # I: Corrected utility model specification 32 - 330 # A1: Domestic re-publication of PCT international application 79/000329 - 2006/004050 # N1: Journal of technical disclosure 87/003986 - 07/502725 ); 1; __END__ =head1 WWW::Patent::Page::JPO_IPDI Get (download and assemble into HTML, then zip with images) machine translated patents from the Japanese Patent Office, specifically the Industrial Property Digital Library at the National Center For Industrial Property Information and Training. =cut =head2 methods set up the methods available for each document type =cut =head2 JPO_IPDI_request makes a HTTP::Request =cut =head2 JPO_IPDI_translation Translation download and assembly. Many HTTP requests are made; failure is not uncommon and eval() trapping is recommended. =cut =head2 _output_next_request_form internal helper method, processes a common HTML CGI form =cut =head2 _output_next_request_re internal helper method, passes a regular expression to hunt a web page for the next link of interest. could upgrade to TokeParser ... =cut =head2 _save_image internal helper method, saves an image object obtained from JPO =cut =head2 JPO_IPDI_terms You get what you pay for. =cut =head2 JPO_IPDI_country_known hash with keys of two letter acronyms, values of the dates covered =cut =head1 LICENSE AND COPYRIGHT Copyright (c) 2008, Wanda B. Anon wanda_b_anon@yahoo.com . All rights reserved. This program is free software; you can redistribute it and/or modify it under the Artistic License version 2.0 or above ( http://www.perlfoundation.org/artistic_license_2_0 ) . =cut