#!/usr/local/bin/perl -w

=head1 WebService::Google-Hack Web Interface

=head1 SYNOPSIS

The WebService::Google-Hack web interface provides an easy-to-use
interface for some of the features of WebService::Google-Hack.

=head1 DESCRIPTION

To install the interface, please follow these steps:

1) Create a directory named ghack in your cgi-bin directory (where all
your cgi files reside). So it should be something like:

    /webspace/cgi-bin/ghack

2) Next, copy the file named google_hack.cgi, which is given with the
distribution of the google-hack package, into your cgi-bin/ghack/
directory.

3) Open the index.cgi file.

*Note: The index.cgi file is in the WebInterface directory of
GoogleHack. For example: WebService/GoogleHack/WebInterface/.

4) Now, in the index.cgi file (which is also given in the WebInterface
directory of GoogleHack), set the remote_host and remote_port
variables to the correct values.

    $remote_host = '';
    $remote_port = '';

The remote host will be the IP address of the machine where the
google_hack server will be running. The remote port needs to be the
same as the $LOCALPORT variable in ghack_server.pl.

5) Set the defaultKey variable to your default Google-API key.

    $defaultKey="XXXXXXXXX";

You should now be able to use the web interface.

=head1 AUTHOR

Ted Pedersen, E<lt>tpederse@d.umn.eduE<gt>

Pratheepan Raveendranathan, E<lt>rave0029@d.umn.eduE<gt>

Jason Michelizzi, E<lt>mich0212@d.umn.eduE<gt>

Date 11/08/2004

=head1 COPYRIGHT AND LICENSE

Copyright (c) 2003 by Pratheepan Raveendranathan, Ted Pedersen, Jason Michelizzi

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.

=pod

You should have received a copy of the GNU General Public License
along with this program; if not, write to

    The Free Software Foundation, Inc.,
    59 Temple Place - Suite 330,
    Boston, MA 02111-1307, USA.

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=cut

use strict;
use warnings;

##########################################################
# Change to host ip address and port                     #
##########################################################

my $remote_host = '111.111.11.111';
my $remote_port = '32983';

##########################################################
# Change to default API key                              #
##########################################################

# NOTE(review): this credential is committed in plain text; consider loading
# it from a file outside the web root instead.
my $defaultKey="W3EDt6dQFHIBN/qfbniXjwvaf7SFXh0U";

use CGI;
use Socket;

BEGIN {
    # Our University's webserver uses an ancient version of CGI::Carp
    # so we can't do fatalsToBrowser.
    # carpout() redirects STDERR to the handle it is given (STDOUT here), so
    # timestamped warnings and die messages end up in the generated page.
    # Consequence: every avoidable warning is visible to the user.
    use CGI::Carp 'carpout';
    carpout(*STDOUT);
}

my $cgi = CGI->new;

# These are the colors of the text when we alternate text colors (when
# showing errors, for example).
my $text_color1 = 'black';
my $text_color2 = '#d03000';

print $cgi->header;

# Request routing inputs.  Both default to a defined value so the string
# comparisons below never operate on undef.
my $action = $cgi->param('action');
$action = 'first' unless defined $action;

my $type = $cgi->param('opt');
$type = '' unless defined $type;

# The user's API key wins; an absent or empty one falls back to the default.
my $key = _field('apikey');
$key = $defaultKey if $key eq '';

# Request state shared with the generate*/predict* subs below.
my $words;
my $frequency;
my $numPages;
my $numIterations;
my $scoreType;
my $scoreCutOff;
my $wordS1;
my $wordS2;
my $review;
my $text;

# Maps the "opt" value chosen on the start page to the sub that prints the
# matching input page.
my %form_for = (
    wordcluster  => \&WordClusters,
    wordcluster2 => \&WordClusters2,
    pmi          => \&PMI,
    review       => \&Review,
    words        => \&SemanticWords,
    phrases      => \&SemanticPhrases,
);

if ($action eq 'first') {
    showPageStart();
}
elsif ($action eq 'Submit') {
    my $show_form = $form_for{$type};
    $show_form->() if $show_form;
}
elsif ($action eq 'Generate') {
    # Whitespace-separated here; generateWordCluster() turns every
    # whitespace run into ':' before sending.
    $words         = join ' ', _field('searchString1'), _field('searchString2');
    $frequency     = _field('cutoff');
    $numPages      = _field('numres');
    $numIterations = _field('numiters');
    generateWordCluster();
}
elsif ($action eq 'Generate2') {
    # Algorithm 2 keeps whitespace inside each search string, so the two
    # strings are joined with ':' directly.
    $words         = join ':', _field('searchString1'), _field('searchString2');
    $frequency     = _field('cutoff');
    $numPages      = _field('numres');
    $numIterations = _field('numiters');
    $scoreType     = _field('scoretype');
    $scoreCutOff   = _field('scorecutoff');
    generateWordCluster2();
}
elsif ($action eq 'PMIMeasure') {
    $wordS1 = _field('searchString1');
    $wordS2 = _field('searchString2');
    generatePMI();
}
elsif ($action eq 'Predict') {
    $wordS1 = _field('searchString1');
    $wordS2 = _field('searchString2');
    $review = _field('review');
    predictReview();
}
elsif ($action eq 'Semantic') {
    $wordS1 = _field('searchString1');
    $wordS2 = _field('searchString2');
    $text   = _field('text');
    predictSemanticWords();
}
elsif ($action eq 'SemanticPhrases') {
    $wordS1 = _field('searchString1');
    $wordS2 = _field('searchString2');
    $text   = _field('text');
    predictSemanticPhrases();
}

showPageEnd();

exit;

# ========= subroutines =========

# Formats a number with at most four decimals and strips trailing zeros
# (and a then-dangling decimal point).  Returns the formatted string.
sub round {
    my ($num) = @_;
    my $str = sprintf("%.4f", $num);
    $str =~ s/\.?0+$//;
    return $str;
}

# Returns the named CGI parameter as a defined string ('' when absent).
# Tabs, CRs and LFs become spaces: the server request is tab-delimited and
# CRLF-terminated, so raw control characters would corrupt it.
sub _field {
    my ($name) = @_;
    my $value = $cgi->param($name);
    return '' unless defined $value;
    $value =~ tr/\t\r\n/ /;
    return $value;
}

# HTML-escapes a user-supplied value before it is echoed into the page.
sub _html {
    my ($value) = @_;
    return $cgi->escapeHTML(defined $value ? $value : '');
}

# Prints the "Cannot connect" notice; returns nothing so callers can
# propagate the failure with "return _report_unreachable()".
sub _report_unreachable {
    print "\n\nCannot connect to server $remote_host:$remote_port\n\n\n";
    return;
}

# Opens a TCP connection to $remote_host:$remote_port.
# Returns the unbuffered socket handle, or nothing (after printing the
# notice) when the socket cannot be created or connected.
# Dies when $remote_host cannot be converted to an address.
sub _open_server {
    my $internet_addr = inet_aton($remote_host)
        or die "Could not convert $remote_host to an Internet addr: $!\n";
    my $paddr = sockaddr_in($remote_port, $internet_addr);

    socket(my $server, PF_INET, SOCK_STREAM, scalar getprotobyname('tcp'))
        or return _report_unreachable();

    unless (connect($server, $paddr)) {
        close $server;
        return _report_unreachable();
    }

    # Turn on autoflush for the socket, then restore the previously
    # selected output handle.
    select((select($server), $| = 1)[0]);
    return $server;
}

# Connects and sends one request line (terminated by a blank CRLF line).
# Returns the socket handle for reading the reply, or nothing on failure.
sub _send_request {
    my ($request) = @_;
    my $server = _open_server();
    return unless $server;
    print {$server} $request, "\015\012\015\012";
    return $server;
}

# Copies the server's reply into the page, one line at a time, until a bare
# CRLF line or end of stream, then closes the socket.  With no handle
# (connection failed) only the closing newline is printed, so the page frame
# stays intact without touching a closed socket.
sub _relay_results {
    my ($server) = @_;
    if ($server) {
        while (my $line = <$server>) {
            last if $line eq "\015\012";
            print "\n$line";
        }
        close $server;
    }
    print "\n";
    return;
}

# Prints the banner shared by the input and result pages.  Written as quoted
# strings so the leading/trailing blanks survive editors that trim whitespace.
sub _print_banner {
    print "\n",
          "p r o j e c t  \n",
          "\n",
          "            g o o g l e    - h a c k  \n",
          "\n",
          "\n";
    return;
}

# Echoes the non-empty entries of a ':'-separated term list, one per line.
sub _print_terms {
    my ($terms) = @_;
    for my $term (split /:/, $terms) {
        next if $term eq '';
        print "\n", _html($term);
    }
    return;
}

# Echoes the clustering parameters of the current request.
sub _print_cluster_settings {
    print "\n\nFrequency Cutoff: ", _html($frequency),
          "\n# of Web Pages: ",     _html($numPages),
          "\n# of Iterations: ",    _html($numIterations),
          "\n";
    return;
}

# Prints the start page.
sub showPageStart {
    print "Google-Hack\n",
          "G O O G L E   -  H A C K  \n";
    print <<"EOINTRO";


  Learn more about each option


EOINTRO
    # Whitespace-only line, kept verbatim.
    print " \n";
    print <<"EOINTRO";

(Please enter your Google API license key here, if you dont have one you can get it @ http://www.google.com/apis.
Or to proceed with default google-hack developer\'s key, select the feature that you would like to use and click on submit.)


Project Information

Project Information

Developers

Ted Pedersen ,    Pratheepan Raveendranathan
EOINTRO
    return;
}

# Prints the input page for word clusters, algorithm 1.
sub WordClusters {
    print "G O O G L E - H A C K  \n";
    print <<"Word_Clusters";

Word Clusters --- Algorithm 1 - Baseline Approach

(Baseline algorithm)

Set Parameters

Word_Clusters
    print " (This will be the number of web pages to parse, Defaults to 10, Maximum 50 )\n\n";
    print " (Words with frequency less than given would not be considered, Max 20)\n\n";
    print " (This will be the number of iterations)\n\n";
    print <<"Word_Clusters3";


(Accepts ONLY single word as input)

(Enter a word like "toyota")

(Enter a word like "ford")

Word_Clusters3
    return;
}

# Sends a "c" (word cluster, algorithm 1) request built from the shared
# request state and prints the results page.
sub generateWordCluster {
    $words =~ s/\s+/:/g;
    my $server = _send_request(
        "c\t$key\t$words\t$numPages\t$frequency\t$numIterations\t");

    _print_banner();
    print "\nGoogle Hack Word Cluster Results for ";
    _print_terms($words);
    _print_cluster_settings();
    _relay_results($server);
    return;
}

# Prints the input page for word clusters, algorithm 2.
sub WordClusters2 {
    print "G O O G L E - H A C K  \n";
    print <<"Word_Clusters";

Word Clusters --- Algorithm 2 - Beta Version

Set Parameters

Word_Clusters
    print " (This will be the number of web pages to parse, Defaults to 10, Maximum 50 )\n\n";
    print " (Words with frequency less than given would not be considered, Max 20)\n\n";
    print <<"Word_Clusters1";


    Measure 1 : log(hits(w1)) + log(hits(w2)) - log(hits(w1w2))
    Measure 2 : log( hits(w1w2) / (hits(w1) + hits(w2)))
    Measure 3 : log( hits(w1w2) / (hits(w1) * hits(w2)))


Word_Clusters1
    print " (Words with relatedness score greater than given would not be considered, Max 60)\n\n";
    print " (This will be the number of iterations)\n\n";
    print <<"Word_Clusters3";


(Accepts Uni-Grams or Bi-Grams as input)

(Enter a word like "toyota")

(Enter a word like "ford")

Word_Clusters3
    return;
}

# Sends a "g" (word cluster, algorithm 2) request built from the shared
# request state and prints the results page.
sub generateWordCluster2 {
    my $server = _send_request(
        "g\t$key\t$words\t$numPages\t$frequency\t$numIterations\t$scoreType\t$scoreCutOff\t");

    _print_banner();
    print "\nGoogle Hack Word Cluster Algorithm 2 Results for ";
    _print_terms($words);
    _print_cluster_settings();
    _relay_results($server);
    return;
}

# Prints the PMI input page.
sub PMI {
    _print_banner();
    print <<"PMI";
PMI Measure

(This feature allows you to find the Pointwise Mutual Information measure between two terms)

(Enter a term like dog)

(Enter a term like cat)

PMI
    return;
}

# Sends a "p" (PMI) request for the two terms and prints the results page.
sub generatePMI {
    # All whitespace is removed from both terms before sending.
    $wordS1 =~ s/\s+//g;
    $wordS2 =~ s/\s+//g;
    my $server = _send_request("p\t$key\t$wordS1\t$wordS2");

    _print_banner();
    print "\nGoogle Hack PMI Measure for ";
    print "\n", _html($wordS1), " AND ", _html($wordS2);
    print "\nPMI Measure: ";
    _relay_results($server);
    return;
}

# Sends an "r" (review orientation) request and prints the results page.
sub predictReview {
    $wordS1 =~ s/\s+//g;
    $wordS2 =~ s/\s+//g;
    # Whitespace runs travel as '#' on the wire ...
    $review =~ s/\s+/\#/g;
    my $server = _send_request("r\t$key\t$review\t$wordS1\t$wordS2");

    _print_banner();
    # ... and are turned back into spaces for display.
    $review =~ s/\#+/ /g;
    print "\nReview\n\n";
    print "\n", _html($review);
    _relay_results($server);
    return;
}

# Prints the banner plus the shared body of the three semantic-orientation
# input pages; only the title differs between them.
sub _print_orientation_form {
    my ($title) = @_;
    _print_banner();
    print <<"Review";
$title

(Positive inference such as "excellent")

(Negative inference such as "bad")

Review
    return;
}

# Prints the input page for the semantic orientation of a review.
sub Review {
    _print_orientation_form('Semantic Orientation of Review');
    return;
}

# Prints the input page for the semantic orientation of words.
sub SemanticWords {
    _print_orientation_form('Semantic Orientation of Words');
    return;
}

# Prints the input page for the semantic orientation of phrases.
sub SemanticPhrases {
    _print_orientation_form('Semantic Orientation of Phrases');
    return;
}

# Shared implementation of the "s" (words) and "h" (phrases) requests: sends
# $text with the two inference words and prints the results page.
sub _predict_orientation {
    my ($command) = @_;
    $wordS1 =~ s/\s+//g;
    $wordS2 =~ s/\s+//g;
    # Same '#'-for-whitespace wire encoding as predictReview().
    $text =~ s/\s+/\#/g;
    my $server = _send_request("$command\t$key\t$text\t$wordS1\t$wordS2");

    _print_banner();
    $text =~ s/\#+/ /g;
    print "\nText\n\n";
    print "\n", _html($text);
    _relay_results($server);
    return;
}

# Sends an "s" request (semantic orientation of words) and prints the page.
sub predictSemanticWords {
    _predict_orientation('s');
    return;
}

# Sends an "h" request (semantic orientation of phrases) and prints the page.
sub predictSemanticPhrases {
    _predict_orientation('h');
    return;
}

# Prints the page footer (currently empty).
sub showPageEnd {
    print <<'ENDOFPAGE';
ENDOFPAGE
    return;
}

__END__