package Kasago;
use strict;
use Carp qw(croak);
use DBI;
use File::Find::Rule;
use File::stat;
use File::Slurp;
use Kasago::Hit;
use Kasago::Token;
use Path::Class;
use PPI;
use Search::QueryParser;
use base qw( Class::Accessor::Chained::Fast );
__PACKAGE__->mk_accessors(qw( dbh ));
our $VERSION = '0.29';

sub new {
  my $class = shift;
  my $self  = $class->SUPER::new(@_);

  croak "No dbh passed to Kasago" unless $self->dbh;
  $self->dbh->{RaiseError} = 1;
  $self->dbh->{AutoCommit} = 0;

  return $self;
}

sub DESTROY {
  my $self = shift;
  $self->dbh->disconnect if $self->dbh;
}

sub init {
  my $self = shift;
  my $dbh  = $self->dbh;
  eval {

    # Probe for an existing index. If the SELECT fails, the tables do
    # not exist yet, so roll back the aborted transaction; otherwise
    # drop the old tables before recreating them.
    eval { $dbh->do("select 1 from tokens"); };
    if ($dbh->errstr) {
      $dbh->rollback;
    } else {
      $dbh->do("
DROP TABLE tokens;
DROP TABLE lines;
DROP TABLE words;
DROP TABLE files;
DROP TABLE sources;
");
    }

    $dbh->do("
CREATE TABLE sources (
  source_id SERIAL PRIMARY KEY,
  source TEXT UNIQUE
) WITHOUT OIDS;
");

    $dbh->do("
CREATE TABLE files (
  file_id SERIAL PRIMARY KEY,
  source_id INTEGER REFERENCES sources ON DELETE CASCADE,
  file TEXT,
  UNIQUE (source_id, file)
) WITHOUT OIDS;
CREATE INDEX source_id_index ON files(source_id);
");

    $dbh->do("
CREATE TABLE words (
  word_id SERIAL PRIMARY KEY,
  word TEXT UNIQUE
) WITHOUT OIDS;
");

    $dbh->do("
CREATE TABLE lines (
  line_id SERIAL PRIMARY KEY,
  file_id INTEGER REFERENCES files ON DELETE CASCADE,
  row INTEGER,
  line TEXT,
  UNIQUE (file_id, row)
) WITHOUT OIDS;
CREATE INDEX file_id_index ON lines(file_id);
CREATE INDEX row_index ON lines(row);
");

    $dbh->do("
CREATE TABLE tokens (
  token_id SERIAL PRIMARY KEY,
  line_id INTEGER REFERENCES lines ON DELETE CASCADE,
  word_id INTEGER REFERENCES words ON DELETE CASCADE,
  col INTEGER
) WITHOUT OIDS;
CREATE INDEX line_id_index ON tokens(line_id);
CREATE INDEX word_id_index ON tokens(word_id);
");

    $dbh->commit;
  };
  die $@ if $@ && $@ !~ /already exists/;
}

# Cache word => word_id lookups across imports to avoid repeated
# SELECTs against the words table.
my %word_cache;

sub import {
  my ($self, $source, $dir) = @_;
  return unless ref $self;    # This isn't Exporter, you know
  my $dbh = $self->dbh;

  # A re-import replaces any existing index for this source.
  $self->_delete($source);

  my $source_id =
    $dbh->selectcol_arrayref("SELECT source_id FROM sources WHERE source = ?",
    {}, $source)->[0];
  unless ($source_id) {
    $dbh->do("INSERT INTO sources (source) VALUES (?)", {}, $source);
    $source_id = $dbh->last_insert_id(undef, undef, "sources", undef);
  }

  foreach my $file (File::Find::Rule->new->file->in($dir)) {
    my $rel     = file($file)->relative($dir);
    my $file_id =
      $dbh->selectcol_arrayref(
      "SELECT file_id FROM files WHERE source_id = ? AND file = ?",
      {}, $source_id, $rel)->[0];
    unless ($file_id) {
      $dbh->do("INSERT INTO files (source_id, file) VALUES (?, ?)",
        {}, $source_id, $rel);
      $file_id = $dbh->last_insert_id(undef, undef, "files", undef);
    }

    my @lines = read_file($file);
    my $row   = 1;
    foreach my $line (@lines) {
      chomp $line;
      $dbh->do("INSERT INTO lines (file_id, row, line) VALUES (?, ?, ?)",
        {}, $file_id, $row++, $line);
    }

    my @line_ids = @{
      $dbh->selectcol_arrayref(
        "SELECT line_id FROM lines WHERE file_id = ? ORDER by row",
        {}, $file_id)
      };

    my @tokens = $self->_tokenise_perl($file);
    foreach my $token (@tokens) {
      my $word_id = $word_cache{ $token->value };
      unless ($word_id) {
        $word_id =
          $dbh->selectcol_arrayref("SELECT word_id FROM words WHERE word = ?",
          {}, $token->value)->[0];
        unless ($word_id) {
          $dbh->do("INSERT INTO words (word) VALUES (?)", {}, $token->value);
          $word_id = $dbh->last_insert_id(undef, undef, "words", undef);
        }
        $word_cache{ $token->value } = $word_id;
      }
      my $line_id = $line_ids[ $token->row - 1 ];    # rows are 1-based
      $dbh->do("INSERT INTO tokens (line_id, word_id, col) VALUES (?, ?, ?)",
        {}, $line_id, $word_id, $token->col);
    }
  }

  $dbh->commit;

  # Refresh the planner statistics, then commit again so the ANALYZE is
  # not rolled back at disconnect.
  $dbh->do("
  ANALYZE tokens;
  ANALYZE lines;
  ANALYZE words;
  ANALYZE files;
  ANALYZE sources;
  ");
  $dbh->commit;
}

# Tokenise a Perl file with PPI: package names, symbols, numbers and
# barewords are indexed whole, while the contents of quotes, comments
# and POD are split into individual words.
sub _tokenise_perl {
  my ($self, $file) = @_;
  my @tokens;
  my $document = PPI::Document->new($file);
  return unless $document;
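  # Ask PPI to work out the (line, column) of every node up front; the
  # token positions recorded below rely on it.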
  $document->index_locations;
  foreach my $node (@{ $document->find('PPI::Statement::Package') || [] }) {
    push @tokens, Kasago::Token->_new_from_node($node, $node->namespace);
  }
  foreach my $node (@{ $document->find('PPI::Token::Symbol') || [] }) {
    push @tokens, Kasago::Token->_new_from_node($node, $node->canonical);
  }
  foreach my $node (@{ $document->find('PPI::Token::Number') || [] }) {
    push @tokens, Kasago::Token->_new_from_node($node, $node->content);
  }
  foreach my $node (@{ $document->find('PPI::Token::Word') || [] }) {
    push @tokens, Kasago::Token->_new_from_node($node, $node->content);
  }
  foreach my $node (@{ $document->find('PPI::Token::Quote') || [] }) {
    my ($line, $col) = @{ $node->location };
    my $left    = "";
    my $content = $node->content;
    my $split   = qr/(\s+|\.|'|")/;
    # split with a capturing group also returns the separators (and a
    # possible leading empty field); skip those so only real words are
    # indexed.
    foreach my $word (split /$split/, $content) {
      if (length $word && $word !~ /^$split$/) {
        push @tokens,
          Kasago::Token->_new_from_node($node, $word,
          [ $line, $col + length($left) ]);
      }
      $left .= $word;
    }
  }
  foreach my $node (@{ $document->find('PPI::Token::Comment') || [] }) {
    my ($line, $col) = @{ $node->location };
    my $left  = "";
    my $split = qr/(\s+|\.)/;
    foreach my $word (split /$split/, $node->content) {
      if (length $word && $word !~ /^$split$/) {
        push @tokens,
          Kasago::Token->_new_from_node($node, $word,
          [ $line, $col + length($left) ]);
      }
      $left .= $word;
    }
  }
  foreach my $node (@{ $document->find('PPI::Token::Pod') || [] }) {
    my ($line, $col) = @{ $node->location };
    foreach my $content (split "\n", $node->content) {
      my $left  = "";
      my $split = qr/(\s+|\.)/;
      foreach my $word (split /$split/, $content) {
        if (length $word && $word !~ /^$split$/) {
          push @tokens,
            Kasago::Token->_new_from_node($node, $word,
            [ $line, $col + length($left) ]);
        }
        $left .= $word;
      }
      $line++;
    }
  }
  return @tokens;
}

sub delete {
  my ($self, $source) = @_;
  $self->_delete($source);
  $self->dbh->commit;
}

sub _delete {
  my ($self, $source) = @_;
  $self->dbh->do("DELETE FROM sources WHERE source = ?", undef, $source);
}

sub sources {
  my $self = shift;
  return @{ $self->dbh->selectcol_arrayref("SELECT source FROM sources") };
}

sub files {
  my ($self, $source) = @_;
  return @{
    $self->dbh->selectcol_arrayref("
SELECT file FROM sources
NATURAL INNER JOIN files
WHERE source=?
ORDER BY file;
",
      {},
      $source)
    };
}

sub tokens {
  my ($self, $source, $file) = @_;
  return @{
    $self->dbh->selectcol_arrayref("
SELECT word FROM files 
NATURAL INNER JOIN words
NATURAL INNER JOIN tokens
NATURAL INNER JOIN lines
WHERE source_id=(SELECT source_id from sources WHERE source=?) 
AND file=? ORDER BY word;
",
      {}, $source, $file)
    };
}

sub search {
  my ($self, $word) = @_;
  my $sth = $self->dbh->prepare("
SELECT source, file, row, col, line FROM words
NATURAL INNER JOIN files
NATURAL INNER JOIN tokens
NATURAL INNER JOIN lines
NATURAL INNER JOIN sources
WHERE word = ?
ORDER by source, file, row, col;
");
  $sth->execute($word);
  my @tokens;
  while (my ($source, $file, $row, $col, $line) = $sth->fetchrow_array) {
    push @tokens,
      Kasago::Token->new(
      {
        source => $source,
        file   => $file,
        row    => $row,
        col    => $col,
        value  => $word,
        line   => $line,
      }
      );
  }
  return @tokens;
}

sub search_merged {
  my ($self, $word) = @_;
  return $self->_merge($self->search($word));
}

# Collapse a stream of tokens (sorted by source, file and row) into
# Kasago::Hit objects, one hit per line, each carrying its tokens.
sub _merge {
  my ($self, @all_tokens) = @_;
  my @hits;
  my $prev;
  my @tokens;

  foreach my $token (@all_tokens) {
    my $now = $token->source . ':' . $token->file . ':' . $token->row;
    if (defined $prev && $prev ne $now) {
      push @hits,
        Kasago::Hit->new(
        {
          source => $tokens[0]->source,
          file   => $tokens[0]->file,
          row    => $tokens[0]->row,
          line   => $tokens[0]->line,
          tokens => [@tokens],
        }
        );
      @tokens = ();
    }
    push @tokens, $token;
    $prev = $now;
  }
  push @hits,
    Kasago::Hit->new(
    {
      source => $tokens[0]->source,
      file   => $tokens[0]->file,
      row    => $tokens[0]->row,
      line   => $tokens[0]->line,
      tokens => [@tokens],
    }
    )
    if @tokens;
  return @hits;
}

# Build a subquery selecting the file_ids of all files that contain the
# given query term.
sub _search_more_file {
  my ($self, $term) = @_;
  my $word = $term->{value};
  $word = $self->dbh->quote($word);
  return qq{
SELECT DISTINCT(file_id) FROM words
NATURAL INNER JOIN tokens
NATURAL INNER JOIN lines
WHERE word = $word};
}

sub search_more {
  my ($self, $words) = @_;
  my $dbh = $self->dbh;

  my $qp    = Search::QueryParser->new;
  my $query = $qp->parse($words);
  return unless $query;

  #use YAML; warn Dump $query;

  my (@union, @plus, @minus, @words);
  foreach my $term (@{ $query->{""} }) {
    push @union, $self->_search_more_file($term);
    push @words, $term->{value};
  }

  foreach my $term (@{ $query->{"+"} }) {
    push @plus,  $self->_search_more_file($term);
    push @words, $term->{value};
  }

  foreach my $term (@{ $query->{"-"} }) {
    push @minus, $self->_search_more_file($term);
  }

  # Combine the per-term subqueries: plain terms are UNIONed together,
  # required (+) terms INTERSECTed, and negated (-) terms removed with
  # EXCEPT.
  my $subsql = "SELECT DISTINCT(file_id) FROM files WHERE file_id IN (";
  if (@union) {
    $subsql .= '(' . join(' UNION ', map { "($_)" } @union) . ')';
  }
  if (@plus) {
    $subsql .=
      ' INTERSECT (' . join(' INTERSECT ', map { "($_)" } @plus) . ')';
  }
  if (@minus) {
    $subsql .= ' EXCEPT (' . join(' UNION ', map { "($_)" } @minus) . ')';
  }
  $subsql .= ')';

  # Patch up the generated SQL for the edge case of no plain terms.
  $subsql =~ s/WHERE  AND/WHERE /;
  $subsql =~ s/IN \( INTERSECT/IN ( /;

  #  die "$subsql;\n";
  #  warn "$subsql;\n";

  #  my @file_ids = @{$self->dbh->selectcol_arrayref($sql)};
  #  warn "@file_ids";

  #  my $file_ids = join(',', @file_ids);
  $words = join(',', map { $dbh->quote($_) } @words);

  my $sql = qq{
SELECT source, file, row, col, word, line FROM tokens
NATURAL INNER JOIN files
NATURAL INNER JOIN words
NATURAL INNER JOIN lines
NATURAL INNER JOIN sources
WHERE
file_id IN ($subsql) AND
word_id IN (SELECT word_id FROM words WHERE word IN ($words))
ORDER by source, file, row, col;
};

  #  warn $sql;
  my $sth = $dbh->prepare($sql);
  $sth->execute();
  my @tokens;
  while (my ($source, $file, $row, $col, $word, $line) = $sth->fetchrow_array) {
    push @tokens,
      Kasago::Token->new(
      {
        source => $source,
        file   => $file,
        row    => $row,
        col    => $col,
        value  => $word,
        line   => $line,
      }
      );
  }
  return @tokens;
}

sub search_more_merged {
  my ($self, $search) = @_;
  return $self->_merge($self->search_more($search));
}

1;

__END__

=head1 NAME

Kasago - A Perl source code indexer

=head1 SYNOPSIS

  my $kasago = Kasago->new({ dbh => $dbh });
  $kasago->init; # this creates the tables for you
  
  # import/update a directory
  $kasago->import($source, $dir);
  # delete a directory
  $kasago->delete($source);

  my @sources = $kasago->sources;
  my @files   = $kasago->files($source);
  my @tokens  = $kasago->tokens($source, $file);

  # search for a token
  foreach my $token ($kasago->search('orange')){
    print $token->source . "/"
      . $token->file . "@"
      . $token->col . ","
      . $token->row . ": "
      . $token->line . "\n";
  }

  # search for a token, merging lines
  foreach my $hit ($kasago->search_merged($search)) {
    print $hit->source . "/"
      . $hit->file . "@"
      . $hit->row . ": "
      . $hit->line . "\n";
    foreach my $token (@{ $hit->tokens }) {
      print "  @" . $token->col . ": " . $token->value . "\n";
    }
  }  

  # search for tokens
  foreach my $token ($kasago->search_more($search)) {
    print $token->source . "/"
      . $token->file . "@"
      . $token->col . ","
      . $token->row . ": "
      . $token->line . "\n";
  }

  # search for tokens, merging lines
  foreach my $hit ($kasago->search_more_merged($search)) {
    print $hit->source . "/"
      . $hit->file . "@"
      . $hit->row . ": "
      . $hit->line . "\n";
    foreach my $token (@{ $hit->tokens }) {
      print "  @" . $token->col . ": " . $token->value . "\n";
    }
  }
  
=head1 DESCRIPTION

L<Kasago> is a module for indexing Perl source code. You can index source
trees and then query the index for symbols, strings, and documentation.

L<Kasago> uses the L<PPI> module to parse Perl and stores the index in a
PostgreSQL database, so you need L<DBD::Pg> installed and a database
available for L<Kasago> to use.
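
You will need to create the database yourself first, for example with
PostgreSQL's C<createdb> tool (C<createdb kasago>).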

Why is this called Kasago? Because that's the Japanese name for a beautiful fish.

=head1 METHODS

=head2 new

This is the constructor. It takes a L<DBI> database handle as a parameter.
This must be a valid database handle for a PostgreSQL database, constructed
along the lines of:

  my $dbh = DBI->connect("DBI:Pg:dbname=kasago", "", "");

  my $kasago = Kasago->new({ dbh => $dbh });
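
Note that the constructor turns on C<RaiseError> and turns off
C<AutoCommit> on the handle for you.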

=head2 delete

This deletes a source from the index:

  $kasago->delete($source);
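
The schema uses C<ON DELETE CASCADE>, so deleting a source also removes
its files, lines and tokens from the index.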

=head2 files

Given a source, returns a list of the files indexed in that source:

  my @files   = $kasago->files($source);

=head2 import

This recursively imports a directory into Kasago.
If the source is already indexed, the index is updated.
You pass a source name and the directory path:

  $kasago->import($source, $dir);
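
Note that C<import> here is an ordinary method, not an L<Exporter>-style
class import: when Perl calls C<< Kasago->import >> at C<use> time it is
a no-op.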

=head2 init

This creates the tables needed by Kasago in the database. You only need
to run this once. If you run it again after initialisation, it will
delete the existing index and recreate the tables.
  
  $kasago->init;

=head2 search

This searches the index for an individual token:

    foreach my $token ($kasago->search('orange')){
      print $token->source . "/"
        . $token->file . "@"
        . $token->col . ","
        . $token->row . ": "
        . $token->line . "\n";
    }

=head2 search_merged

This searches the index for an individual token, but merges multiple 
tokens on the same line together:

    foreach my $hit ($kasago->search_merged($search)) {
      print $hit->source . "/"
        . $hit->file . "@"
        . $hit->row . ": "
        . $hit->line . "\n";
      foreach my $token (@{ $hit->tokens }) {
        print "  @" . $token->col . ": " . $token->value . "\n";
      }
    }  
    
=head2 search_more

This searches the index for tokens using a simple query syntax: "orange"
returns all hits for "orange", and "orange leon" returns all hits for
"orange" and all hits for "leon". "orange -leon" returns the hits for
"orange", excluding files that contain "leon", while "+orange +leon"
returns hits only from files that contain both "orange" and "leon":

  foreach my $token ($kasago->search_more($search)) {
    print $token->source . "/"
      . $token->file . "@"
      . $token->col . ","
      . $token->row . ": "
      . $token->line . "\n";
  }
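
The query string is parsed with L<Search::QueryParser>, so these queries
can be written as:

  my @all     = $kasago->search_more('orange leon');
  my @both    = $kasago->search_more('+orange +leon');
  my @without = $kasago->search_more('orange -leon');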
  
=head2 search_more_merged

This searches the index for tokens like C<search_more>, but merges
multiple tokens on the same line together:

  foreach my $hit ($kasago->search_more_merged($search)) {
    print $hit->source . "/"
      . $hit->file . "@"
      . $hit->row . ": "
      . $hit->line . "\n";
    foreach my $token (@{ $hit->tokens }) {
      print "  @" . $token->col . ": " . $token->value . "\n";
    }
  }

=head2 sources

This returns a list of the sources currently indexed:

  my @sources = $kasago->sources;
  
=head2 tokens

Given a source and a file, returns a list of the tokens indexed:

  my @tokens  = $kasago->tokens($source, $file);
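
The tokens are returned sorted alphabetically.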

=head1 AUTHOR

Leon Brocard <acme@astray.com>.

=head1 COPYRIGHT

Copyright (C) 2005, Leon Brocard

This module is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.