package Kasago; use strict; use Carp qw(croak); use DBI; use File::Find::Rule; use File::stat; use File::Slurp; use Kasago::Hit; use Kasago::Token; use Path::Class; use PPI; use Search::QueryParser; use base qw( Class::Accessor::Chained::Fast ); __PACKAGE__->mk_accessors(qw( dbh )); our $VERSION = '0.29'; sub new { my $class = shift; my $self = $class->SUPER::new(@_); croak "No dbh passed to Kasago" unless $self->dbh; $self->dbh->{RaiseError} = 1; $self->dbh->{AutoCommit} = 0; return $self; } sub DESTROY { my $self = shift; $self->dbh->disconnect; } sub init { my $self = shift; my $dbh = $self->dbh; eval { eval { $dbh->do("select 1 from tokens"); }; if ($dbh->errstr) { $dbh->rollback; } else { $dbh->do(" DROP TABLE tokens; DROP TABLE lines; DROP TABLE words; DROP TABLE files; DROP TABLE sources; "); } $dbh->do(" CREATE TABLE sources ( source_id SERIAL PRIMARY KEY, source TEXT UNIQUE ) WITHOUT OIDS; "); $dbh->do(" CREATE TABLE files ( file_id SERIAL PRIMARY KEY, source_id INTEGER REFERENCES sources ON DELETE CASCADE, file TEXT, UNIQUE (source_id, file) ) WITHOUT OIDS; CREATE INDEX source_id_index ON files(source_id); "); $dbh->do(" CREATE TABLE words ( word_id SERIAL PRIMARY KEY, word TEXT UNIQUE ) WITHOUT OIDS; "); $dbh->do(" CREATE TABLE lines ( line_id SERIAL PRIMARY KEY, file_id INTEGER REFERENCES files ON DELETE CASCADE, row INTEGER, line TEXT, UNIQUE (file_id, row) ) WITHOUT OIDS; CREATE INDEX file_id_index ON lines(file_id); CREATE INDEX row_index ON lines(row); "); $dbh->do(" CREATE TABLE tokens ( token_id SERIAL PRIMARY KEY, line_id INTEGER REFERENCES lines ON DELETE CASCADE, word_id INTEGER REFERENCES words ON DELETE CASCADE, col INTEGER ) WITHOUT OIDS; CREATE INDEX line_id_index ON tokens(line_id); CREATE INDEX word_id_index ON tokens(word_id); "); $dbh->commit; }; die $@ if $@ && $@ !~ /already exists/; } my %word_cache; sub import { my ($self, $source, $dir) = @_; return unless ref $self; # This isn't Exporter, you know my $dbh = $self->dbh; $self->_delete($source); my $source_id = $dbh->selectcol_arrayref("SELECT source_id FROM sources WHERE source = ?", {}, $source)->[0]; unless ($source_id) { $dbh->do("INSERT INTO sources (source) VALUES (?)", {}, $source); $source_id = $dbh->last_insert_id(undef, undef, "sources", undef); } foreach my $file (File::Find::Rule->new->file->in($dir)) { my $rel = file($file)->relative($dir); my $file_id = $dbh->selectcol_arrayref( "SELECT file_id FROM files WHERE source_id = ? AND file = ?", {}, $source_id, $rel)->[0]; unless ($file_id) { $dbh->do("INSERT INTO files (source_id, file) VALUES (?, ?)", {}, $source_id, $rel); $file_id = $dbh->last_insert_id(undef, undef, "files", undef); } my @lines = read_file($file); my $row = 1; foreach my $line (@lines) { chomp $line; $dbh->do("INSERT INTO lines (file_id, row, line) VALUES (?, ?, ?)", {}, $file_id, $row++, $line); } my @line_ids = @{ $dbh->selectcol_arrayref( "SELECT line_id FROM lines WHERE file_id = ? ORDER by row", {}, $file_id) }; my @tokens = $self->_tokenise_perl($file); foreach my $token (@tokens) { my $word_id = $word_cache{ $token->value }; unless ($word_id) { $word_id = $dbh->selectcol_arrayref("SELECT word_id FROM words WHERE word = ?", {}, $token->value)->[0]; unless ($word_id) { $dbh->do("INSERT INTO words (word) VALUES (?)", {}, $token->value); $word_id = $dbh->last_insert_id(undef, undef, "words", undef); } $word_cache{ $token->value } = $word_id; } my $line_id = $line_ids[ $token->row - 1 ]; $dbh->do("INSERT INTO tokens (line_id, word_id, col) VALUES (?, ?, ?)", {}, $line_id, $word_id, $token->col); } } $dbh->commit; $dbh->do(" ANALYZE tokens; ANALYZE lines; ANALYZE words; ANALYZE files; ANALYZE sources; "); } sub _tokenise_perl { my ($self, $file) = @_; my @tokens; my $document = PPI::Document->new($file); return unless $document; $document->index_locations; foreach my $node (@{ $document->find('PPI::Statement::Package') || [] }) { push @tokens, Kasago::Token->_new_from_node($node, $node->namespace); } foreach my $node (@{ $document->find('PPI::Token::Symbol') || [] }) { push @tokens, Kasago::Token->_new_from_node($node, $node->canonical); } foreach my $node (@{ $document->find('PPI::Token::Number') || [] }) { push @tokens, Kasago::Token->_new_from_node($node, $node->content); } foreach my $node (@{ $document->find('PPI::Token::Word') || [] }) { push @tokens, Kasago::Token->_new_from_node($node, $node->content); } foreach my $node (@{ $document->find('PPI::Token::Quote') || [] }) { my ($line, $col) = @{ $node->location }; my $left = ""; my $content = $node->content; my $split = qr/(\s+|\.|'|")/; foreach my $word (split /$split/, $content) { if ($word !~ /^$split$/) { push @tokens, Kasago::Token->_new_from_node($node, $word, [ $line, $col + length($left) ]); } $left .= $word; } } foreach my $node (@{ $document->find('PPI::Token::Comment') || [] }) { my ($line, $col) = @{ $node->location }; my $left = ""; my $split = qr/(\s+|\.)/; foreach my $word (split /$split/, $node->content) { if ($word !~ /^$split$/) { push @tokens, Kasago::Token->_new_from_node($node, $word, [ $line, $col + length($left) ]); } $left .= $word; } } foreach my $node (@{ $document->find('PPI::Token::Pod') || [] }) { my ($line, $col) = @{ $node->location }; foreach my $content (split "\n", $node->content) { my $left = ""; my $split = qr/(\s+|\.)/; foreach my $word (split /$split/, $content) { if ($word !~ /^$split$/) { push @tokens, Kasago::Token->_new_from_node($node, $word, [ $line, $col + length($left) ]); } $left .= $word; } $line++; } } return @tokens; } sub delete { my ($self, $source) = @_; $self->_delete($source); $self->dbh->commit; } sub _delete { my ($self, $source) = @_; $self->dbh->do("DELETE FROM sources WHERE source = ?", undef, $source); } sub sources { my $self = shift; return @{ $self->dbh->selectcol_arrayref("SELECT source FROM sources") }; } sub files { my ($self, $source) = @_; return @{ $self->dbh->selectcol_arrayref(" SELECT file FROM sources NATURAL INNER JOIN files WHERE source=? ORDER BY file; ", {}, $source) }; } sub tokens { my ($self, $source, $file) = @_; return @{ $self->dbh->selectcol_arrayref(" SELECT word FROM files NATURAL INNER JOIN words NATURAL INNER JOIN tokens NATURAL INNER JOIN lines WHERE source_id=(SELECT source_id from sources WHERE source=?) AND file=? ORDER BY word; ", {}, $source, $file) }; } sub search { my ($self, $word) = @_; my $sth = $self->dbh->prepare(" SELECT source, file, row, col, line FROM words NATURAL INNER JOIN files NATURAL INNER JOIN tokens NATURAL INNER JOIN lines NATURAL INNER JOIN sources WHERE word = ? ORDER by source, file, row, col; "); $sth->execute($word); my @tokens; while (my ($source, $file, $row, $col, $line) = $sth->fetchrow_array) { push @tokens, Kasago::Token->new( { source => $source, file => $file, row => $row, col => $col, value => $word, line => $line, } ); } return @tokens; } sub search_merged { my ($self, $word) = @_; return $self->_merge($self->search($word)); } sub _merge { my ($self, @all_tokens) = @_; my @hits; my $prev; my @tokens; foreach my $token (@all_tokens) { my $now = $token->source . ':' . $token->file . ':' . $token->row; if (defined $prev && $prev ne $now) { push @hits, Kasago::Hit->new( { source => $tokens[0]->source, file => $tokens[0]->file, row => $tokens[0]->row, line => $tokens[0]->line, tokens => [@tokens], } ); @tokens = (); } push @tokens, $token; $prev = $now; } push @hits, Kasago::Hit->new( { source => $tokens[0]->source, file => $tokens[0]->file, row => $tokens[0]->row, line => $tokens[0]->line, tokens => [@tokens], } ) if @tokens; return @hits; } sub _search_more_file { my ($self, $term) = @_; my $word = $term->{value}; $word = $self->dbh->quote($word); return qq{ SELECT DISTINCT(file_id) FROM words NATURAL INNER JOIN tokens NATURAL INNER JOIN lines WHERE word = $word}; } sub search_more { my ($self, $words) = @_; my $dbh = $self->dbh; my $qp = Search::QueryParser->new; my $query = $qp->parse($words); return unless $query; #use YAML; warn Dump $query; my (@union, @plus, @minus, @words); foreach my $term (@{ $query->{""} }) { push @union, $self->_search_more_file($term); push @words, $term->{value}; } foreach my $term (@{ $query->{"+"} }) { push @plus, $self->_search_more_file($term); push @words, $term->{value}; } foreach my $term (@{ $query->{"-"} }) { push @minus, $self->_search_more_file($term); } my $subsql = "SELECT DISTINCT(file_id) FROM files WHERE file_id IN ("; if (@union) { $subsql .= '(' . join(' UNION ', map { $_ = "($_)" } @union) . ')'; } if (@plus) { $subsql .= ' INTERSECT (' . join(' INTERSECT ', map { $_ = "($_)" } @plus) . ')'; } if (@minus) { $subsql .= ' EXCEPT (' . join(' UNION ', map { $_ = "($_)" } @minus) . ')'; } $subsql .= ')'; $subsql =~ s/WHERE AND/WHERE /; $subsql =~ s/IN \( INTERSECT/IN ( /; # die "$subsql;\n"; # warn "$subsql;\n"; # my @file_ids = @{$self->dbh->selectcol_arrayref($sql)}; # warn "@file_ids"; # my $file_ids = join(',', @file_ids); $words = join(',', map { $_ = $dbh->quote($_) } @words); my $sql = qq{ SELECT source, file, row, col, word, line FROM tokens NATURAL INNER JOIN files NATURAL INNER JOIN words NATURAL INNER JOIN lines NATURAL INNER JOIN sources WHERE file_id IN ($subsql) AND word_id IN (SELECT word_id FROM words WHERE word IN ($words)) ORDER by source, file, row, col; }; # warn $sql; my $sth = $dbh->prepare($sql); $sth->execute(); my @tokens; while (my ($source, $file, $row, $col, $word, $line) = $sth->fetchrow_array) { push @tokens, Kasago::Token->new( { source => $source, file => $file, row => $row, col => $col, value => $word, line => $line, } ); } return @tokens; } sub search_more_merged { my ($self, $search) = @_; return $self->_merge($self->search_more($search)); } 1; __END__ =head1 NAME Kasago - A Perl source code indexer =head1 SYNOPSIS my $kasago = Kasago->new({ dbh => $dbh }); $kasago->init; # this creates the tables for you # import/update a directory $kasago->import($source, $dir); # delete a directory $kasago->delete($source); my @sources = $kasago->sources; my @files = $kasago->files($source); my @tokens = $kasago->tokens($source, $file); # search for a token foreach my $token ($kasago->search('orange')){ print $token->source . "/" . $token->file . "@" . $token->col . "," . $token->row . ": " . $token->line . "\n"; } # search for a token, merging lines foreach my $hit ($kasago->search_merged($search)) { print $hit->source . "/" . $hit->file . "@" . $hit->row . ": " . $hit->line . "\n"; foreach my $token (@{ $hit->tokens }) { print " @" . $token->col . ": " . $token->value . "\n"; } } # search for tokens foreach my $token ($kasago->search_more($search)) { print $token->source . "/" . $token->file . "@" . $token->col . "," . $token->row . ": " . $token->line . "\n"; } # searh for tokens, merging lines foreach my $hit ($kasago->search_more_merged($search)) { print $hit->source . "/" . $hit->file . "@" . $hit->row . ": " . $hit->line . "\n"; foreach my $token (@{ $hit->tokens }) { print " @" . $token->col . ": " . $token->value . "\n"; } } =head1 DESCRIPTION L is a module for indexing Perl source code. You can index source trees, and then query the index for symbols, strings, and documentation. L uses the L module to parse Perl and stores the index in a PostegreSQL database. Thus you need to have L installed and a database available for L. Why is this called Kasago? Because that's the Japanese name for a beautiful fish. =head1 METHODS =head2 new This is the constructor. It takes a L database handle as a parameter. This must be a valid dababase handle for a PostgreSQL database, constructed along the lines of 'my $dbh = DBI->connect("DBI:Pg:dbname=kasago", "", "")': my $kasago = Kasago->new({ dbh => $dbh }); =head2 delete This deletes a source from the index: $kasago->delete($source); =head2 files Given a source, returns a list of the files indexed in that source: my @files = $kasago->files($source); =head2 import This recursively imports a directory into Kasago. If the source is already indexed, the index is updated. You pass a source name and the directory path: $kasago->import($source, $dir); =head2 init This created the tables needed by Kasago in the database. You only need run this once. If you run this after initialisation, it will delete the index. $kasago->init; =head2 search This searches the index for an individual token: foreach my $token ($kasago->search('orange')){ print $token->source . "/" . $token->file . "@" . $token->col . "," . $token->row . ": " . $token->line . "\n"; } =head2 search_merged This searches the index for an individual token, but merges multiple tokens on the same line together: foreach my $hit ($kasago->search_merged($search)) { print $hit->source . "/" . $hit->file . "@" . $hit->row . ": " . $hit->line . "\n"; foreach my $token (@{ $hit->tokens }) { print " @" . $token->col . ": " . $token->value . "\n"; } } =head2 search_more This searches the index for tokens. "orange" would return all hits for orange, "orange leon" would return all hits for both "orange" and "leon". "orange -leon" shows all the hits for "orange" but without files that contain "leon", "+orange +leon" returns hits in files that contain both "orange" and "leon": foreach my $token ($kasago->search_more($search)) { print $token->source . "/" . $token->file . "@" . $token->col . "," . $token->row . ": " . $token->line . "\n"; } =head2 search_more_merged This searches the index for tokens as search_more, but merges multiple tokens on the same line together: foreach my $hit ($kasago->search_more_merged($search)) { print $hit->source . "/" . $hit->file . "@" . $hit->row . ": " . $hit->line . "\n"; foreach my $token (@{ $hit->tokens }) { print " @" . $token->col . ": " . $token->value . "\n"; } } =head2 sources This returns a list of the sources currently indexed: my @sources = $kasago->sources; =head2 tokens Given a source and a file, returns a list of the tokens indexed: my @tokens = $kasago->tokens($source, $file); =head1 AUTHOR Leon Brocard . =head1 COPYRIGHT Copyright (C) 2005, Leon Brocard This module is free software; you can redistribute it or modify it under the same terms as Perl itself.