#!/usr/bin/perl -T ###################################################################### # # search -- Freetext search # # Arne Georg Gleditsch # Per Kristian Gjermshus # # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # ###################################################################### use strict; use lib do { $0 =~ m{(.*)/} ? "$1/lib" : 'lib' }; # if LXR modules are in ./lib =head1 search script This script manages the HTTP requests for free-text search. =cut use LXR::Common; use LXR::Config; use LXR::Template; # # Global variables # my $maxhits = 1000; =head2 C Function C tells if search results for $file should be kept for display. =over =item 1 C<$filetext> a I containing the desired name fragment or pattern =item 2 C<$advanced> an I equal to 1 if C<$filetext> is a pattern, 0 otherwise =item 3 C<$casesensitive> an I equal to 1 if comparison is case-sensitive, 0 otherwise =item 4 C<$file> a I containing the filename to check =back Result is 1 if fragment C<$filetext> is present (either as a substring or a matching pattern) inside filename C<$file>. =cut sub filename_matches { my ($filetext, $advanced, $casesensitive, $file) = @_; if ($advanced) { if ($casesensitive) { if ($file =~ m/$filetext/) { return 1; } } elsif ($file =~ m/$filetext/i) { return 1; } } else { if ($casesensitive) { if (index($file, $filetext) != -1) { return 1; } } elsif (index(lc($file), lc($filetext)) != -1) { return 1; } } return 0; } =head2 C Function C searches the tree with I. =over =item 1 C<$searchtext> a I containing the text to look for =item 2 C<$filetext> a I containing the desired name fragment or pattern =item 3 C<$advanced> an I equal to 1 if C<$filetext> is a pattern, 0 otherwise =item 4 C<$casesensitive> an I equal to 1 if comparison is case-sensitive, 0 otherwise =back I is launched to search the source-tree for the string given. Results are then filtered on the optional filename. The remaining hits are returned in an array. =cut sub glimpsesearch { my ($searchtext, $filetext, $advanced, $casesensitive) = @_; my $sourceroot = $config->{$files->isa('LXR::Files::Plain') ? 'sourceroot' : 'sourcetext' } . '/' . $releaseid . '/'; $ENV{'PATH'} = '/bin:/usr/local/bin:/usr/bin:/usr/sbin'; $! = ''; open ( GLIMPSE , $config->{'glimpsebin'} # Don't scan files ending in ,v or ~ ." -F '-v (\\,v\|\\~)\$' " # Should we match casesensitive ? . ($casesensitive ? '' : '-i') # Location of index datadbase . ' -H ' . $config->{'glimpsedir'}.'/'.$releaseid # The pattern to search for ." -y -n '$searchtext' 2>&1 |" ) or die "Glimpse subprocess died unexpextedly: $!\n"; my $numlines = 0; my @glimpselines = (); my @glimpsemsgs = (); while () { if (s/^$sourceroot//) { my ($file) = $_ =~ m/(.*?):\s*/; # Keep only occurrences matching file name if given next if $filetext && !filename_matches($filetext, $advanced, $casesensitive, $file); $numlines++; push(@glimpselines, $_); if ($numlines >= $maxhits) { last; } } else { # Ignore information message next if m/^using working-directory/; # Strip off eventual command name s/^.*?glimpse:\s*//; push (@glimpsemsgs, $_); } } close(GLIMPSE); my $retval = $? >> 8; # The manpage for glimpse says that it returns 2 on syntax errors or # inaccessible files. It seems this is not the case. # We will have to work around it for the time being. if ($retval == 0) { my @ret; my $i = 0; foreach my $glimpseline (@glimpselines) { last if ($i > $maxhits); my ($file, $line, $text) = $glimpseline =~ m/(.*?):\s*(\d+)\s*:(.*)/; $text =~ s/&/&/g; $text =~ s//>/g; push @ret, [ $file, $line, $text ]; } continue { $i++; } # Sort the results ascending per file name return sort {$$a[0] cmp $$b[0]} @ret; } elsif ($retval == 1) { my $glimpsebin = $config->{'glimpsebin'}; my $glimpseresponse = join('
', @glimpselines); my $glimpseresponse =~ s/$glimpsebin/Reason/; my $glimpseresponse =~ s/glimpse: error in searching index//; print("

Search failed

\n

$glimpseresponse

\n"); foreach (@glimpsemsgs) { print("

$_

\n"); } return; } else { print("

Unexpected return value $retval from Glimpse (usually means syntax error)

\n"); foreach (@glimpsemsgs) { print("

$_

\n"); } return; } } =head2 C Function C searches the tree with I. =over =item 1 C<$searchtext> a I containing the text to look for =item 2 C<$filetext> a I containing the desired name fragment or pattern =item 3 C<$advanced> an I equal to 1 if C<$filetext> is a pattern, 0 otherwise =item 4 C<$casesensitive> an I equal to 1 if comparison is case-sensitive, 0 otherwise =back I is launched to search the source-tree for the string given. Results are then filtered on the optional filename. The remaining hits are returned in an array. =cut sub swishsearch { my ($searchtext, $filetext, $advanced, $casesensitive) = @_; my $swishIndex = $config->{'swishdir'} . '/' . $releaseid . '.index'; if (!-e $swishIndex) { print '

'; print "Version '$releaseid' has not been indexed and is unavailable for searching."; print '

'; return; } $ENV{'PATH'} = '/bin:/usr/local/bin:/usr/bin:/usr/sbin'; my $swishCommand = $config->{'swishbin'} . ' -f ' . $swishIndex . ' -s swishdocpath' . " -w '($searchtext)'" ; my $ret = `$swishCommand`; my @result = grep { not /^[\#\.]/ } split(/\n/, $ret); my $retval = $?; my @ret; if ($retval == 0) { my $numlines = 0; foreach my $hit (@result) { print $hit, "
\n" if $hit =~ /No such file or directory/; # feeble attempt to print possible errors (e.g. incomplete LD_LIBRARY_PATH causes linking errors) next if substr($hit, 0, 4) eq 'err:'; # skip; only 'no results' errors happen with return value 0 my ($score, $file) = $hit =~ m/^(\d+) \/(.+) "(.+)" \d+/; next if $filetext && !filename_matches($filetext, $advanced, $casesensitive, $file); push @ret, [ $file, $score ]; $numlines++; last if ($numlines >= $maxhits); } return @ret; } else { print( "

Search failed: internal error


\n

" . join('
', @result) . "<\p>\n"); return; } } =head2 C Function C is a "$variable" substitution function. It returns CSS class name C if the file timestamp is incorrect, meaning the file has been modified after I indexing. Otherwise, it returns an empty string. =over =item 1 C<$file> a I containing the filename to check =back Since the search engines return OS-relative path, the filename must be prefixed with a / to observe LXR file designation rule. =cut sub checkvalidref { my ($file) = @_; $file = '/' . $file; if ( !$index->filetimestamp ( $file , $files->filerev($file, $releaseid) ) && LXR::Lang::parseable($file, $releaseid) ) { 'searchinvalid' } else { '' } } =head2 C Function C is a "$function" substitution function. It returns an HTML string which is the concatenation of its expanded argument applied to every search result. =over =item 1 C<$templ> a I containing the template (i.e. argument) =item 2 C<$searchtext> a I containing the looked-for text =item 3 C<@results> an I containing the search results =back Because of the different nature of the results, processing is split depending on the search engines. With I, line for the hit was grabbed, while with I, only a relevance score is available. B =over =item I For these reasons, background is highlighted for modified files. =back =cut sub printresults { my $templ = shift; my $searchtext = shift; my @results = @_; my $ret; my ($file, $lastfile); foreach (@results) { # glimpse and swish-e searches provide different data for each result if ($config->{'glimpsebin'}) { my (@params) = @$_; $file = $params[0]; my $fileonce = $file if $lastfile ne $file; my $line = $params[1]; my $text = $params[2]; my $searchtextq = quotemeta ($searchtext); # May not work always because glimpse pattern are different from Perl's # but when it works highlights the occurrence of searchtext $text =~ s!($searchtextq)!$1!g; $ret .= expandtemplate ( $templ , ( 'text' => sub { return "

$text
" } , 'file' => sub { fileref($file, 'searchfile', "/$file") } , 'fileonce'=> sub { if ($fileonce) { return fileref($fileonce, 'searchfile', "/$file") } else { return ' ' } } , 'line' => sub { fileref($line, 'searchline', "/$file", $line) } , 'fileref' => sub { fileref("$file, line $line", 'searchfile', "/$file", $line) } , 'tdfile' => sub { if ($fileonce) { return 'searchfile' } else { return 'searchfilevoid' } } , 'searchinvalid' => sub { checkvalidref($file) } ) ); } else { my (@params) = @$_; $file = $params[0]; my $fileonce = $file if $lastfile ne $file; my $score = $params[1]; $ret .= expandtemplate ( $templ , ( 'text' => sub { return $score } , 'file' => sub { fileref($file, 'searchfile', "/$file") } , 'fileonce'=> sub { if ($fileonce) { return fileref($fileonce, 'searchfile', "/$file") } else { return ' ' } } , 'line' => sub { return '' } , 'fileref' => sub { fileref($file, 'searchfile', "/$file") } , 'tdfile' => sub { if ($fileonce) { return 'searchfile' } else { return 'searchfilevoid' } } , 'searchinvalid' => sub { checkvalidref($file) } ) ); } $lastfile = $file; } return $ret; } =head2 C Sub C is the main driver for free-text search. It dispatches search to the correct search engine, then calls the result editor. B =over =item I indexation because search is done against an internal list captured at indexing time.> =back =cut sub search { my $templ; my $errorsig = ""; $templ = gettemplate ( 'htmlsearch' , $errorsig ); if ($templ =~ m/^$errorsig/) { die "Free-text search not available without 'htmlsearch' template\n"; } my $searchtext = $HTTP->{'param'}{'_string'}; my $filetext = $HTTP->{'param'}{'_filestring'}; my $advanced = $HTTP->{'param'}{'_advanced'}; my $casesensitive = $HTTP->{'param'}{'_casesensitive'}; $searchtext =~ s/\+/ /g; # Reverse
space encoding my @results; if ($searchtext ne '') { my $escapedsearchtext = $searchtext; # Protect us against arbitrary command injection: # Since the pattern is enclosed within single quotes, remove any single # quote from search text so that malicious user can't close the parameter # and start a new command. # NOTE: quote in pattern, even if escaped with \ does not seem to work # well; therefore it should not be a big loss of functionality. # The GLIMPSE engine is word based and it is very hard to search # for isolated non-alphameric characters. Here again, not a big loss. $escapedsearchtext =~ s/'//g; if ($config->{'glimpsebin'}) { @results = glimpsesearch($escapedsearchtext, $filetext, $advanced, $casesensitive); } elsif ($config->{'swishbin'} && $config->{'swishdir'}) { @results = swishsearch($escapedsearchtext, $filetext, $advanced, $casesensitive); } else { warn "No freetext search engine configured.\n"; } } elsif ($filetext ne '') { my $FILELISTING; if ($config->{'swishdir'} && $config->{'swishbin'}) { unless ($FILELISTING = IO::File->new($config->{'swishdir'} . "/$releaseid.filenames")) { warn "Version '$releaseid' has not been indexed and is unavailable for searching
Could not open " . $config->{'swishdir'} . "/$releaseid.filenames\n"; return; } } elsif ($config->{'glimpsedir'} && $config->{'glimpsebin'}) { unless ($FILELISTING = IO::File->new($config->{'glimpsedir'} . '/' . $releaseid . "/.glimpse_filenames")) { warn "Version '$releaseid' has not been indexed and is unavailable for searching\n" . 'Could not open ' . $config->{'glimpsedir'} . "/$releaseid/.glimpse_filenames\n"; return; } } else { warn "Freetext search engine required for file search, and no freetext search engine is configured\n"; return; } my $sourceroot = $config->{$files->isa('LXR::Files::Plain') ? 'sourceroot' : 'sourcetext'} . '/' . $releaseid . '/'; while (<$FILELISTING>) { chomp; s/^$sourceroot//; push @results, [ $_ ] if filename_matches($filetext, $advanced, $casesensitive, $_); } close($FILELISTING); } print expandtemplate ( $templ , ( 'variables' => \&varexpand , 'searchtext' => sub { $_ = $searchtext; s/&/&/g; s//>/g; return $_; } , 'searchtext_escaped' => sub { $_ = $searchtext; s/&/&/g; s/\"/"/g; s//>/g; return $_; } , 'filetext_escaped' => sub { $_ = $filetext; s/\"/"/g; return $_; } , 'advancedchecked' => sub { $advanced ? 'checked' : '' } , 'casesensitivechecked'=> sub { $casesensitive ? 'checked' : '' } , 'varbtnaction' => sub { varbtnaction(@_, 'search') } , 'urlargs' => sub { urlexpand('-', 'search') } , 'noquery' => sub { return 'hidden' if $searchtext eq '' && $filetext eq '' } , 'maxhits_message' => sub { return @results == $maxhits ? "Too many hits, displaying first $maxhits
" : ''; } , 'results' => sub { printresults(@_, $searchtext, @results) } , 'resultcount' => sub { return scalar @results } , 'indexstate' => sub { displayindexstate('search') } ) ); } =head2 Script entry point Builds the header and footer and launches C for the real job. =cut httpinit(); std_http_headers('search'); makeheader('search'); if ( $files->isa('LXR::Files::Plain') || exists $config->{'sourcetext'} ) { if ( $config->{'glimpsebin'} && $config->{'glimpsebin'} =~ m!^(.*/)?true$! || $config->{'swishbin'} && $config->{'swishbin'} =~ m!^(.*/)?true$! ) { warn "Free-text search disabled by configuration file!\n"; } else { &search; } } else { warn "Free-text search not available with VCSs!\n"; } makefooter('search'); httpclean;