#!/usr/bin/perl
#
# Harvests English Wikipedia articles into CouchDB.
#
# For every topic given on the command line (default: "Space") the script
#   1. skips the topic if a document with that id is already in CouchDB,
#   2. fetches the article page and collects its links (pdf / audio / image /
#      wiki links),
#   3. asks WWW::Wikipedia for the article text, related titles, categories
#      and headings,
#   4. stores the assembled document under the topic name, and
#   5. repeats steps 1-4 once for each related title of that topic.
#
use strict;
use warnings;
use utf8;

use File::Spec;
use Data::Dumper;
use Digest::MD5 qw(md5_hex);
use URI::Escape qw(uri_escape uri_unescape);

use HTML::Strip;
use JSON::XS;
use WWW::Wikipedia;
use Storable::CouchDB;
use AI::MicroStructure::WordBlacklist;
require LWP::UserAgent;

binmode STDOUT, ':utf8';
binmode STDERR, ':utf8';

$ARGV[0] = "Space" unless defined $ARGV[0] && length $ARGV[0];

# ---------------------------------------------------------------------------
# Configuration and shared state
# ---------------------------------------------------------------------------

# CouchDB endpoint used for the "already stored?" lookup.
my $COUCH_SERVER = sprintf("http://%s:5984", "localhost");
my $COUCH_DB     = "table";

# Matches the article URL prefix.  Both schemes are accepted because the
# article is requested over http but links are resolved against the final
# (possibly redirected) response base.
my $WIKI_URL_RE = qr{https?://en\.wikipedia\.org/wiki};

# Links matching this pattern are discarded (kept exactly as originally
# written).
# NOTE(review): "All_*_*" is a regex, not a glob, so it matches any link that
# merely contains "All" - confirm whether that is intended.
my $SKIP_LINK_RE =
    qr/Disambig|Help:|Wikipedia:|Special:|:Contents|:Featured_content|Main_Pag|_talk:|Talk:|#|[Aa]rticle[s|_]|All_*_*/;

# Optional extra link filter; empty means "keep every link".
my $TOP = "";

# Package globals retained for backward compatibility with code that may
# inspect them after the script has been loaded.
our $doc      = {};
our @links;             # wiki links of the most recently processed article
our $linkdata = {};
our $result;            # topic => stored document (or undef when skipped)
our $done;
our $doneit   = {};

my @test;               # lower-cased member names seen so far

my $s = AI::MicroStructure::WordBlacklist::getStopWords('en');
our @s = keys %$s;

our $x = Storable::CouchDB->new(
    uri => "http://user:password\@localhost:5984/",
    db  => $COUCH_DB,
);

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

# checkIsThere($key)
#
# Returns 1 when CouchDB's _all_docs view reports at least one row for the
# document id $key, 0 otherwise.  A failed request or an unparsable reply is
# reported with a warning and treated as "not there" instead of killing the
# script.
sub checkIsThere {
    my ($key) = @_;
    $key = '' unless defined $key;

    # The "key" query parameter must be a JSON string: escape the JSON
    # metacharacters first, then percent-encode the whole value so that
    # characters such as & # + % cannot corrupt the query string.
    (my $json_key = $key) =~ s/(["\\])/\\$1/g;
    $json_key = qq{"$json_key"};
    utf8::encode($json_key) if utf8::is_utf8($json_key);

    my $ua  = LWP::UserAgent->new;
    my $res = $ua->get(
        sprintf('%s/%s/_all_docs?key=%s',
            $COUCH_SERVER, $COUCH_DB, uri_escape($json_key))
    );

    unless ($res->is_success) {
        warn "CouchDB lookup for '$key' failed: " . $res->status_line . "\n";
        return 0;
    }

    my $reply = eval { JSON::XS::decode_json($res->content) };
    unless (ref $reply eq 'HASH') {
        warn "CouchDB lookup for '$key' returned an unexpected body\n";
        return 0;
    }

    my $rows = $reply->{rows};
    return (ref $rows eq 'ARRAY' && @$rows) ? 1 : 0;
}

# URLDecode($string)
#
# Decodes a form-encoded string: "+" becomes a space and %XX becomes the
# corresponding character.  Returns the decoded string.
sub URLDecode {
    my ($theURL) = @_;
    $theURL = '' unless defined $theURL;

    $theURL =~ tr/+/ /;
    $theURL =~ s/%([a-fA-F0-9]{2})/chr(hex($1))/eg;

    # The original followed this with an empty-pattern s///g.  An empty
    # pattern re-uses the last successful regex, so it could delete text that
    # had just been decoded; it has been removed.
    return $theURL;
}

# URLEncode($string)
#
# Percent-encodes every character outside [A-Za-z0-9_].  Character strings
# are encoded to UTF-8 first so that each escape is exactly two hex digits.
sub URLEncode {
    my ($theURL) = @_;
    $theURL = '' unless defined $theURL;

    utf8::encode($theURL) if utf8::is_utf8($theURL);
    $theURL =~ s/([^A-Za-z0-9_])/sprintf("%%%02X", ord($1))/eg;
    return $theURL;
}

# smartdecode($value)
#
# Percent-decodes $value and, when the result is valid UTF-8, returns it as a
# character string; otherwise returns the percent-decoded bytes unchanged.
# An array reference is decoded element by element and a new array reference
# is returned.
sub smartdecode {
    my ($value) = @_;

    return [ map { smartdecode($_) } @$value ] if ref $value eq 'ARRAY';

    my $unescaped = uri_unescape($value);
    return $unescaped unless defined $unescaped;

    my $decoded = $unescaped;
    return utf8::decode($decoded) ? $decoded : $unescaped;
}

# imgTranslate($url)
#
# Sends $url to the local concept2.php service and returns the array
# reference found under "result" in its JSON reply.  Returns nothing when
# $url is empty, the request fails, or the reply has no "result" array.
sub imgTranslate {
    my ($url) = @_;
    return unless $url;

    require HTTP::Request::Common;

    # NOTE(review): $url is appended without percent-encoding, exactly as
    # before; whether the service expects an encoded value is not visible
    # from this file.
    my $request = HTTP::Request::Common::GET(
        "http://localhost/tiny/concept2.php?img=" . $url);
    $request->content_type('application/x-www-form-urlencoded');

    my $response = LWP::UserAgent->new->request($request);
    return unless $response->is_success;

    # decode_json expects UTF-8 bytes, so undo transfer encodings only and
    # leave charset decoding to the JSON parser.
    my $payload =
        eval { JSON::XS::decode_json($response->decoded_content(charset => 'none')) };
    return unless ref $payload eq 'HASH' && ref $payload->{result} eq 'ARRAY';

    return $payload->{result};
}

# First entry of the imgTranslate() result for $link, or nothing.
sub _first_translation {
    my ($link) = @_;
    my $translations = imgTranslate($link);
    return unless ref $translations eq 'ARRAY' && @$translations;
    return $translations->[0];
}

# Turns a title such as "Mercury (planet)" into [ "mercury", "planet" ].
sub _parse_tag {
    my ($title) = @_;
    my $tag = lc $title;
    $tag =~ s/\)//g;
    $tag =~ s/ /_/g;
    return [ split /_\(/, $tag ];
}

# Collects the de-duplicated links of a fetched page, grouped by kind.
# Returns a hash reference with the keys pdf, audio, image and wiki.
sub _extract_links {
    my ($response) = @_;
    no warnings 'utf8';

    my %group = (pdf => [], audio => [], image => [], wiki => []);

    unless ($response->is_success) {
        warn "page fetch failed: " . $response->status_line . "\n";
        return \%group;
    }

    require HTML::SimpleLinkExtor;
    my $extractor = HTML::SimpleLinkExtor->new($response->base);
    $extractor->parse($response->decoded_content);

    # With $TOP empty the first alternative of the second filter is empty and
    # therefore lets every link through.
    my %seen;
    my @unique =
        grep { !$seen{$_}++ }
        grep { /$TOP|.pdf|.ogg|.mp3|.mpg|.avi/ }
        grep { !/$SKIP_LINK_RE/ }
        map  { "$_" }
        grep { defined } $extractor->links;

    $group{pdf}   = [ grep { /^http.*.[\.](pdf|PDF)$/ } @unique ];
    $group{audio} = [ grep { /^http.*.[\.](mp3|wave|ogg|OGG|WAVE|MP3)$/ } @unique ];
    $group{wiki}  = [ grep { /^$WIKI_URL_RE/ } @unique ];

    # Images are picked from the wiki links only, as before.
    $group{image} =
        [ grep { /^http.*.[\.](JPG|GIF|PNG|svg|jpg|png|gif)$/ } @{ $group{wiki} } ];

    return \%group;
}

# Output of "wn TERM -over", or '' when wn cannot be started.  The command is
# run without a shell so that TERM cannot be interpreted as shell syntax.
sub _wordnet_overview {
    my ($term) = @_;

    my $pid = open(my $wn, '-|', 'wn', $term, '-over');
    unless ($pid) {
        warn "cannot run wn: $!\n";
        return '';
    }

    my $output = do { local $/; <$wn> };
    close $wn;    # exit status ignored, as the backtick form did
    return defined $output ? $output : '';
}

# call($topic)
#
# Builds and stores the document for one article.
#
# Returns the stored document (hash reference), or nothing when the topic is
# already in CouchDB or WWW::Wikipedia has no text for it.
#
# Side effects: updates the package global @links, appends to @test, runs the
# external "wn" program and writes the document to CouchDB via $x->store.
sub call {
    my ($url) = @_;
    no warnings 'utf8';

    return if checkIsThere($url);
    warn "doing $url";

    my $ua       = LWP::UserAgent->new;
    my $response = $ua->get(
        sprintf("http://en.wikipedia.org/wiki/%s", ucfirst($url)));

    my $page       = _extract_links($response);
    my @wiki_links = @{ $page->{wiki} };
    @links = @wiki_links;    # keep the package global in sync

    my $wiki  = WWW::Wikipedia->new();
    my $entry = $wiki->search(ucfirst $url);
    return unless defined($entry) && $entry->text();

    my $hs         = HTML::Strip->new();
    my $clean_text = $hs->parse($entry->text());
    $hs->eof;

    # Tags come from "Name (qualifier)" titles: first the related titles of
    # the entry, then the page's wiki links with the URL prefix removed.
    my @link_titles = map {
        my $title = $_;
        $title =~ s{$WIKI_URL_RE/}{}g;
        $title;
    } @wiki_links;

    my @tags =
        map  { _parse_tag($_) }
        grep { defined($_) && /[(].+?[)]/ } ($entry->related(), @link_titles);

    my %qualifier_of;
    for my $tag (@tags) {
        my ($member, $qualifier) = @$tag;
        next unless $member && $qualifier;
        $qualifier_of{ lc $member } = lc $qualifier;
        push @test, lc $member;
    }

    my @article_links = grep { !/:Wikipedia/ } @wiki_links;
    my $links_under   = sub {
        my ($prefix) = @_;
        return [ grep { m{^$WIKI_URL_RE/\Q$prefix\E} } @article_links ];
    };

    my $doc = {
        linknr    => $#wiki_links,    # last index (count - 1), as before
        url       => $url,
        tags      => \@tags,
        members   => [ grep { !/\W/ } keys %qualifier_of ],
        instances => [ grep { !/\W/ } values %qualifier_of ],
        article   => $clean_text,
        links     => \@article_links,
        image     => [ grep { defined } map { _first_translation($_) } @{ $page->{image} } ],
        audio     => $page->{audio},
        book      => $links_under->('Book:'),
        portal    => $links_under->('Portal:'),
        cat       => $links_under->('Category:'),
        list      => $links_under->('List_of'),
        tmpl      => $links_under->('Template:'),
        pdf       => $page->{pdf},
        recorded  => time,
    };

    # Copy selected fields straight from the entry object.  "src" and
    # "fulltext" are stripped of markup first; array-valued fields are decoded
    # element by element by smartdecode().
    for my $field (qw(src fulltext cursor related categories headings currentlang)) {
        my $value = $entry->{$field};
        if ($field =~ /src|full/) {
            $value = $hs->parse(defined $value ? $value : '');
            $hs->eof;    # reset parser state between documents
        }
        $doc->{$field} = smartdecode($value);
    }

    # Normalise related titles to article names: underscores for spaces,
    # leading capital, no fragment links.
    my @related;
    for my $title (@{ ref $doc->{related} eq 'ARRAY' ? $doc->{related} : [] }) {
        next unless defined $title;
        (my $name = $title) =~ s/ /_/g;
        $name = ucfirst $name;
        push @related, $name unless $name =~ /#/;
    }
    $doc->{related} = \@related;

    # Size of the document as serialised JSON (0 if it cannot be serialised).
    my $serialized = eval { JSON::XS->new->allow_blessed->encode($doc) };
    $doc->{size} = defined $serialized ? length $serialized : 0;

    $doc->{wn} = _wordnet_overview($url);

    $x->store("$url", $doc);
    return $doc;
}

# ---------------------------------------------------------------------------
# Main: process every topic, then each of its related titles once.
# ---------------------------------------------------------------------------

for my $topic (@ARGV) {
    next unless $topic;

    my $stored = call($topic);
    $result->{$topic} = $stored;

    next unless ref $stored eq 'HASH' && ref $stored->{related} eq 'ARRAY';

    for my $related_title (@{ $stored->{related} }) {
        $result->{$related_title} = call($related_title);
    }
}

1;