package Text::Categorize::Textrank::En;

use strict;
use warnings;
use Log::Log4perl;
use Text::StemTagPOS;
use Text::Categorize::Textrank;
use Data::Dump qw(dump);

# TODO: need parameter for maximum phrase length.

# package globals read by Exporter and by the build tools; declared with
# 'our' rather than the obsolete 'use vars' pragma.
our ($VERSION, @ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS);

# populate the export lists at compile time.
BEGIN {
    use Exporter ();
    $VERSION     = '0.51';
    @ISA         = qw(Exporter);
    @EXPORT      = qw(getTextrankInfoOfText);
    @EXPORT_OK   = qw(getTextrankInfoOfText);
    %EXPORT_TAGS = ();
}

#12345678901234567890123456789012345678901234
#Find potential keywords in English text.

# NOTE(review): the targets of several L<...> and C<...> formatting codes in
# the POD below are missing from this copy of the file (they appear as a bare
# "L" or "C"). Codes whose target is evident from the adjacent example or from
# the package name have been restored; the rest are left as found rather than
# guessed and should be restored from the upstream distribution.

=head1 NAME

C<Text::Categorize::Textrank::En> - Find potential keywords in English text.

=head1 SYNOPSIS

  use strict;
  use warnings;
  use Text::Categorize::Textrank::En;
  use Data::Dump qw(dump);
  my $textrankerEn = Text::Categorize::Textrank::En->new();
  my $text = 'This is the first sentence. Here is the second sentence.';
  my $results = $textrankerEn->getTextrankInfoOfText(listOfText => [$text]);
  dump $results->{hashOfTextrankValues};

=head1 DESCRIPTION

C<Text::Categorize::Textrank::En> provides methods for ranking the words in
English text as potential keywords. It implements a version of the textrank
algorithm from the report I<TextRank: Bringing Order into Texts> by
R. Mihalcea and P. Tarau.

Encoding of all text should be in Perl's internal format; see L or L for
converting text from various encodings.

=head1 CONSTRUCTOR

=head2 C<new>

The method C<new> creates an instance of the
C<Text::Categorize::Textrank::En> class with the following parameters:

=over

=item C<endingSentenceTag>

  endingSentenceTag => 'PP'

C<endingSentenceTag> is the part-of-speech tag that should be used to
indicate the end of a sentence. The default is 'PP'. The value of this tag
must be a tag generated by the module L.

=item C<listOfPOSTypesToKeep>

  listOfPOSTypesToKeep => [qw(TEXTRANK_WORDS)]

The textrank algorithm preprocesses the text so that only certain
parts-of-speech (POS) are retained and used to build the graph representing
the text. The module L is used to tag the parts-of-speech of the text.
=pod

=for comment NOTE(review): the targets of some L<...> and C<...> formatting
codes are missing from this copy of the file (they appear as a bare "L" or
"C"). Codes whose target is evident from the adjacent example or from the
code have been restored; the rest are left as found rather than guessed.

The parts-of-speech retained can be specified by word types, where the type
is a combination of 'ALL', 'ADJECTIVES', 'ADVERBS', 'CONTENT_WORDS', 'NOUNS',
'PUNCTUATION', 'TEXTRANK_WORDS', or 'VERBS'. The default is
C<[qw(TEXTRANK_WORDS)]>, which equates to C<[qw(ADJECTIVES NOUNS)]>.

=item C<listOfPOSTagsToKeep>

  listOfPOSTagsToKeep => [...]

C<listOfPOSTagsToKeep> provides finer control over the parts-of-speech to be
retained when filtering the tagged text. For a list of all the possible tags
call C.

=back

=cut

# Constructor. Every named parameter is handed unchanged to
# Text::StemTagPOS->new, and the engine it returns is kept in the new object
# under the key 'posTaggerStemmerEngine'.
sub new {
    my $Class      = shift;
    my %Parameters = @_;

    # build the POS/stemmer engine that does the tagging and filtering.
    my $posTaggerStemmerEngine = Text::StemTagPOS->new(%Parameters);

    # the invocant may be a class name or an existing object; in the latter
    # case the new instance is blessed into that object's class.
    my $instanceClass = ref($Class) || $Class;

    my $Self = { posTaggerStemmerEngine => $posTaggerStemmerEngine };
    return bless $Self, $instanceClass;
}

=head1 METHODS

=head2 C<getTextrankInfoOfText>

  getTextrankInfoOfText (...)

The method C<getTextrankInfoOfText> returns a data structure (hash-reference)
containing all the stemmed words partitioned into their sentences
(C<listOfStemmedTaggedSentences>), the subset of words used to compute the
textranks (C<listOfFilteredSentences>), and the textrank of the tokens
(C<hashOfTextrankValues>) that occur in C. The sum of all the textrank values
is one.

More precisely, if C<$results> is the returned hash, then
C<< $results->{listOfStemmedTaggedSentences} >> contains the array reference
generated by the C<getStemmedAndTaggedText> method of L<Text::StemTagPOS>,
C<< $results->{listOfFilteredSentences} >> contains the array reference
generated by C<getTaggedTextToKeep> of L<Text::StemTagPOS>, and
C<< $results->{hashOfTextrankValues} >> holds the hash of the textrank values
computed by C<getTextrankOfListOfTokens>.
C<< $results->{useStemmedWords} >> is also set to the value of
C<useStemmedWords>.

=over

=item C<listOfStemmedTaggedSentences>

  listOfStemmedTaggedSentences => [...]

C<listOfStemmedTaggedSentences> is the array reference containing the list of
stemmed and part-of-speech tagged sentences from L<Text::StemTagPOS>. If
C<listOfStemmedTaggedSentences> is not defined, then the text to be processed
should be provided via C<listOfText>.

=item C<listOfText>

  listOfText => [...]

C<listOfText> is an array reference containing the strings of text to be
categorized. C<listOfText> is only used if C<listOfStemmedTaggedSentences> is
undefined.

=item C<edgeCreationSpan>

  edgeCreationSpan => 1

For each word in the text, C<edgeCreationSpan> is the number of successive
words used to make an edge in the textrank token graph.
For example, if C<edgeCreationSpan> is two, then given the word sequence
C<"apple orange pear"> the edges C<[apple, orange]> and C<[apple, pear]> will
be added to the text graph for the word C<apple>. The default is one. Note
that loop edges are ignored. For example, if C<edgeCreationSpan> is two, then
given the word sequence C<"daba daba doo"> the edge C<[daba, daba]> is
discarded but the edge C<[daba, doo]> is added to the token graph.

=item C<directedGraph>

  directedGraph => 0

If C<directedGraph> is true, the textranks are computed from the directed
token graph; if false, they are computed from the undirected version of the
graph. The default is false.

=item C<pageRankDampeningFactor>

  pageRankDampeningFactor => 0.85

When computing the textranks of the token graph, the dampening factor
specified by C<pageRankDampeningFactor> will be used; it should range from
zero to one. The default is 0.85.

=begin html

The Wikipedia article on pagerank has a good explanation of the dampening factor.
=end html

=item C<addEdgesSpanningLists>

  addEdgesSpanningLists => 1

If C<addEdgesSpanningLists> is true, then when building the token graph,
links between the tokens at the end of a list and the beginning of the next
list will be made. For example, for the lists
C<[[qw(This is the first list)], [qw(Here is the second list)]]> the edge
C<[list, Here]> will be added to the token graph. The default is true. The
flag may also be given as C<addEdgesSpanningSentences>, which takes
precedence when both are present.

=item C<useStemmedWords>

  useStemmedWords => 1

If C<useStemmedWords> is true, then when building the token graph, the
stemmed words are used as the id of each node, otherwise the original words
are used; in both cases the stemmed or original words are converted to
lowercase. The default is true.

=back

=cut

# Tags (if needed), filters and ranks the text.
#
# Parameters (named): listOfStemmedTaggedSentences or listOfText select the
# input; useStemmedWords selects which word form becomes the token;
# addEdgesSpanningLists / addEdgesSpanningSentences and every other parameter
# are passed on to getTextrankOfListOfTokens.
#
# Returns a hash reference with the keys listOfStemmedTaggedSentences,
# listOfFilteredSentences, hashOfTextrankValues and useStemmedWords.
#
# Dies (via Log::Log4perl's logdie) when neither input parameter is defined.
sub getTextrankInfoOfText {
    my ($Self, %Parameters) = @_;

    # get the text to process. a parameter that is present but undefined is
    # treated like one that was not supplied, so that
    # listOfStemmedTaggedSentences => undef falls back to listOfText.
    my $listOfStemmedTaggedSentences;
    if (defined $Parameters{listOfStemmedTaggedSentences}) {
        $listOfStemmedTaggedSentences = $Parameters{listOfStemmedTaggedSentences};
    }
    elsif (defined $Parameters{listOfText}) {
        $listOfStemmedTaggedSentences =
            $Self->{posTaggerStemmerEngine}->getStemmedAndTaggedText($Parameters{listOfText});
    }
    else {
        my $logger = Log::Log4perl->get_logger();
        $logger->logdie("error: one of the parameters 'listOfStemmedTaggedSentences' or 'listOfText' must be defined.");
    }

    # set the parameter to use the original or stemmed word.
    my $useStemmedWords = 1;
    $useStemmedWords = $Parameters{useStemmedWords} if exists $Parameters{useStemmedWords};

    # the explicit call parentheses keep the constants from being parsed as
    # barewords.
    my $tokenIndex = $useStemmedWords
        ? Text::StemTagPOS::WORD_STEMMED()
        : Text::StemTagPOS::WORD_ORIGINAL();

    # set the addEdgesSpanningLists option. addEdgesSpanningSentences wins
    # when given; otherwise a caller-supplied addEdgesSpanningLists is kept
    # and only a missing one is defaulted to true.
    if (exists $Parameters{addEdgesSpanningSentences}) {
        $Parameters{addEdgesSpanningLists} = $Parameters{addEdgesSpanningSentences};
    }
    elsif (!exists $Parameters{addEdgesSpanningLists}) {
        $Parameters{addEdgesSpanningLists} = 1;
    }

    # filter the tagged text down to only the parts-of-speech that are to be kept.
    my $listOfFilteredSentences = $Self->{posTaggerStemmerEngine}->getTaggedTextToKeep(
        listOfStemmedTaggedSentences => $listOfStemmedTaggedSentences);

    # build the list of sentences containing only the words kept.
    my @listOfTokens;
    foreach my $sentence (@$listOfFilteredSentences) {
        # skip empty sentences.
        next unless @$sentence;

        # use only the selected (stemmed or original) word, lowercased, as the token.
        push @listOfTokens, [map { lc $_->[$tokenIndex] } @$sentence];
    }

    # get the textrank of the tokens.
    my $hashOfTextrankValues = getTextrankOfListOfTokens(%Parameters, listOfTokens => \@listOfTokens);

    # return the tagged text, filtered text, and textrank values in a hash;
    # all this info is needed to build the keywords and phrases.
    return {
        listOfStemmedTaggedSentences => $listOfStemmedTaggedSentences,
        listOfFilteredSentences      => $listOfFilteredSentences,
        hashOfTextrankValues         => $hashOfTextrankValues,
        useStemmedWords              => $useStemmedWords,
    };
}

=for comment NOTE(review): the targets of some L<...> and C<...> formatting
codes in the sections below are missing from this copy of the file (they
appear as a bare "L" or "C"); they are left as found rather than guessed.

=head1 INSTALLATION

To install the module run the following commands:

  perl Makefile.PL
  make
  make test
  make install

If you are on a windows box you should use 'nmake' rather than 'make'.

=head1 BUGS

Please email bug reports or feature requests to C, or through the web
interface at L. The author will be notified and you can be automatically
notified of progress on the bug fix or feature request.

=head1 AUTHOR

 Jeff Kubina

=head1 COPYRIGHT

Copyright (c) 2009 Jeff Kubina. All rights reserved.
This program is free software; you can redistribute
it and/or modify it under the same terms as Perl itself.

The full text of the license can be found in the
LICENSE file included with this module.

=head1 KEYWORDS

categorize, english, keywords, keyphrases, nlp, pagerank, textrank

=head1 SEE ALSO

=begin html

This package implements the Textrank algorithm from the report TextRank: Bringing Order into Texts by Rada Mihalcea and Paul Tarau; which is related to pagerank.
See the Lingua::EN::Tagger README file for a list of the part-of-speech tags.

=end html

L, L, L, L, L

=cut

1;
# The preceding line will help the module return a true value