package OnSearch::Search; =head1 NAME OnSearch::Search - Search library module. =head1 DESCRIPTION OnSearch::Search searches Web site indexes generated by L and sends the names of matching documents to L. The subroutine, search (), is typically called by the application with a L object that contains the search word or words, the type of search, whether to match the case of the search term, and whether to match complete or partial words. =cut #$Id: Search.pm,v 1.32 2005/08/22 13:45:51 kiesling Exp $ use strict; use warnings; use POSIX; use Socket; use OnSearch; use OnSearch::AppConfig; use OnSearch::Cache; use OnSearch::StringSearch; use OnSearch::WebLog; my ($VERSION)= ('$Revision: 1.32 $' =~ /:\s+(.*)\s+\$/); require Exporter; require DynaLoader; our (@ISA, @EXPORT); @ISA = qw(Exporter DynaLoader); @EXPORT = (qw/search collate text_string_search save_sid clean_sids clean_results s_write/); my $logfunc = \&OnSearch::WebLog::clf; my ($chldpid, $sid); my $OPENTAG = qr'^'o; my $CLOSETAG = qr'^'o; sub search { my $q = shift; my $tlds = $_[0]; # Don't re-spawn when parent exits. Then re-assign signal handler # in the child process. $SIG{CHLD} = \&ignore_signal; FORK: if ($chldpid = fork ()) { return $chldpid; } elsif (defined $chldpid) { setpgrp (0,0); FORK2: if ($sid = fork ()) { $q -> {sid} = $sid; } elsif (defined $sid) { ### ### See the comments in WebClient.pm. ### chdir '/' || die "OnSearch: Could not chdir /: $!\n"; close STDIN; close STDOUT; close STDERR; OnSearch::WebLog::clf ('notice', "Search started PID $$."); s_write ($q->{ppid}, ''); if (caching_enabled ()) { my @cachehits = cache_retrieve ($q->{ppid}, $q->{searchterm}, $q->{searchtermlist}, $q->{matchtype}, $q->{matchcase}); ### ### Collate cache hits. ### $q -> {cachehits} = _new_hash_ref (); if ($#cachehits >= 0) { foreach my $cacheref (@cachehits) { my ($cachefn) = ($cacheref =~ $OPENTAG); ${$q -> {cachehits}}{$cachefn} = $cachefn unless exists ${$q->{cachehits}}{$cachefn}; } } } foreach (@{$tlds}) { subdir ($q, $_); # tld); } s_write ($q->{ppid}, ''); ### ### TO DO Make this into an onsearch.cfg directive. ### OnSearch::WebLog::clf ('notice', "Search ended PID $$."); return 0; } elsif ($! =~ /No more processes|Resource temporarily unavailable/) { OnSearch::WebLog::clf ('error', "search () $! PID $$."); sleep 2; redo FORK2; } else { OnSearch::WebLog::clf ('error', "search () $! PID $$."); } } elsif ($! =~ /No more processes|Resource temporarily unavailable/) { OnSearch::WebLog::clf ('error', "search () $! PID $$."); sleep 2; redo FORK; } else { OnSearch::WebLog::clf ('error', "search () $! PID $$."); } if ($?) { OnSearch::WebLog::clf ('error', "search exited $?."); exit $?; } return $$; } my $idxexpr = '.onindex.idx'; my $searchstr = \&OnSearch::StringSearch::_strindex; sub subdir { my $q = shift; my $top_dir = shift; my (@direntries, $path, $ent, $r); opendir DIR, $top_dir || die "$top_dir: $!"; @direntries = readdir DIR; closedir DIR; foreach $ent (@direntries) { chomp $ent; next if ($ent eq '.') || ($ent eq '..'); $path = $top_dir . '/' . $ent; no warnings; if (-d $path) { use warnings; if (! $q -> {excludedirs} || scalar grep ("$path", @{$q->{excludedirs}})) { $r = subdir ($q, $path); } ### ### A string match here would also match ### backup indexes. ### } elsif ($ent eq $idxexpr) { $r = postings ($q, $path); } } # $#direntries = -1; return undef; } ### ### This is where the work of searching actually occurs. ### First we check that the result was not already sent ### after the search of cached results. Then each target ### section of the index gets parsed. We ### check each ... posting for a ### match against the search regex, compiled in ### Regex.pm. ### ### If the user selects the match type, "any," we simply send all ### of the postings. If the user's match type is, "all," or, ### "exact," the collate function determines that the target ### file contains all of the search terms. Target files of "exact" ### matches then get searched for the exact phrase. The index posting ### of an exact phrase match is rewritten so that we can cache ### the entire phrase. That way, we need to read the target file only ### once. ### ### As many of the search parameters as possible are initialized in ### search.cgi, and the code here is designed fast, without ### attempting optimizations that would add significantly to the ### application's complexity. ### sub postings { my $q = $_[0]; my $idxfile = $_[1]; my ($l, $is, @postlines, $postbuf, $targetfn, $targetfnexp, $r, $cacheref); my (%cachehits, $cachefn); my ($indexfh, @indexlines); my ($s, $str, $im); my $idxref = _new_array_ref (); my $cached_results = caching_enabled (); ### ### Suppress warnings about reopening standard I/O channels. ### no warnings; open $indexfh, "$idxfile" || do { warn ("$idxfile: $!"); return; }; use warnings; $is = $im = 0; while (defined ($str = <$indexfh>)) { chomp $str; if (defined (&$searchstr ('', $str))) { close ($indexfh); return undef; } ### ### If changing this code, remember that $targetfn needs ### to be saved somewhere. ### if (defined (&$searchstr ('', $str)) && $is) { $postbuf .= "$str\n"; if ($im) { if (&{$q->{sfptr}} ($q, \$postbuf)) { ### ### Posting should already have matched ### $q->{partword}. ### add_to_cache ($q->{ppid}, posting_to_cache ($postbuf, $q->{searchtermlist},$q->{matchtype},$q->{matchcase})) if caching_enabled (); } } $is = 0; $#postlines = -1; } } close ($indexfh); return undef; } sub _new_array_ref { my @a = (); return \@a; } sub _new_hash_ref { my $h = {}; return $h; } sub text_string_search { my $q = $_[0]; my $postbufref = $_[1]; ### ### Collate () has already filtered documents that don't ### contain all of the words in the search phrase. ### my ($vf, $offset_ref, $path, $buf, $bufnc, $content, $stnc, @l); my ($completewords, $partialwords); $completewords = 0; @l = split /\n/, $$postbufref; ($path) = ($l[0] =~ $OPENTAG); $vf = OnSearch::VFile -> new; return undef unless $vf -> vfopen ($path); $content = ''; while (1) { $buf = $vf -> vfread (1024); if ($q -> {nmatchcase}) { $bufnc = $buf; } else { $bufnc = lc $buf; } $content .= $bufnc; last if length ($buf) < 1024; } $vf -> vfclose; $content =~ s/\n/ /gs; if ($q -> {nmatchcase}) { $stnc = $q -> {searchterm}; } else { $stnc = lc $q -> {searchterm}; } $offset_ref = OnSearch::StringSearch::_search_string ($stnc, $content); if ($#{$offset_ref} < 0) { return undef; } ### ### Check for complete word matches. This workaround is necessary ### because _search_string can match partially; for example, ### "file dialog box," also matches "file dialog boxes." ### if ($q -> {partword} =~ /no/) { foreach my $offset (@{$offset_ref}) { my $beforechar = substr ($content, $offset-1, 1); my $afterchar = substr ($content, $offset + length($q->{searchterm}), 1); if ("$beforechar$afterchar" =~ /\W\W/) { $completewords = 1; last; } } } else { $partialwords = 1; } if (($completewords || $partialwords) && defined ($$postbufref = rewrite_posting ($stnc, $offset_ref, \@l))) { return $postbufref; } else { return undef; } } ### ### Rewrite posting for the complete phrase results of exact ### match searches. ### sub rewrite_posting { my $term = shift; my $offset_ref = shift; my $postrefs = shift; my ($offset_str, $lcterm); return undef if ($#{$offset_ref} < 0); $offset_str = join ',', @$offset_ref; $lcterm = lc ($term); $postrefs -> [1] = qq| $offset_str|; $postrefs -> [2] = qq||; $#{$postrefs} = 2; return join "\n", @$postrefs; } ### ### Collate () determines that a target file contains all of ### the required search terms. Each hash elements is a ### true/(implied) false vector for each search term, and if ### a vector for a search term is missing, the search fails. ### sub collate { my $q = shift; my $postbuf = shift; my ($p1, $m_str, $st, @l, %vec); @l = split /\n/, $postbuf; return 0 unless (($#l - 2) >= $#{$q->{searchtermlist}}); ### ### Loop through the {collateregex}; $m_str = $1; $m_str = lc $m_str unless $q->{nmatchcase}; $vec{$m_str} = 1 unless $vec{$m_str}; } foreach $st (@{$q -> {searchtermlist}}) { $st = lc $st unless $q -> {nmatchcase}; return 0 unless $vec{$st}; } return 1; } sub save_sid { my $sid = $_[0]; sysopen (S, OnSearch::AppConfig->str('DataDir') . '/' . $sid, O_CREAT | O_WRONLY); close S; } ### ### Retrieve session id before cleaning.... ### sub clean_sids { my (@dirents, $r); opendir (D, OnSearch::AppConfig->str('DataDir')) || browser_die ("clean_sids: " . OnSearch::AppConfig->str('DataDir') . ": $!\n"); @dirents = readdir (D); foreach my $ent (@dirents) { next if ($ent =~ /\.|\.\./); if ($ent =~ /^\d+$/) { if (($r = kill (0, $ent)) == 0) { if (($r = unlink (OnSearch::AppConfig->str ('DataDir') . '/' . $ent)) == 0) { warn "clean_sids unlink $ent: $!"; } } } } closedir D; } sub clean_results { my $exptime = OnSearch::AppConfig->str ('ResultsPersist'); my $rd = OnSearch::AppConfig->str ('DataDir'); opendir (D, $rd) || browser_die ("clean_results: $rd: $!\n"); my @dirents = readdir D; foreach my $ent (@dirents) { if ($ent =~ /session\.\d+/) { if (time - (stat("$rd/$ent"))[9] > $exptime) { OnSearch::WebLog::clf ('notice', "removing out of date $ent" ); unlink "$rd/$ent"; } } } closedir D; } sub s_write { my $session_id = shift; my $buf = shift; my ($name, $clientfh, $serverfh, $r, $buflength); $name = '/tmp/.onsearch.sock.' . $session_id; socket ($serverfh, PF_UNIX, SOCK_STREAM, 0) || die "OnSearch: s_write socket: $!"; if (-S $name && ! unlink ($name)) { &$logfunc ('error', "s_write unlink: $!\n"); } bind ($serverfh, sockaddr_un($name)) || &$logfunc ('notice', "s_write bind: $!."); listen ($serverfh, SOMAXCONN) || &$logfunc ('notice', "s_write listen: $!."); accept ($clientfh, $serverfh) || &$logfunc ('notice', "s_write accept: $!."); if (fileno ($clientfh)) { $buflength = length ($buf); if (($r = syswrite ($clientfh, $buf)) != $buflength) { &$logfunc ('error', "s_write error $r chars of $buflength written: $!."); } close $clientfh; close $serverfh; } return; } __END__ 1; =head1 SEE ALSO L, L =cut