#!/usr/local/bin/perl
#########################################################
# part of the HTML Dictionary
# Distributed under the GNU copyleft (any version of your choice)
# No part of these documents may be printed in any for-profit publication
# copyleft sunil@magnetic.demon.co.uk
#########################################################
#########################################################
# this script makes use of Glimpse, developed by
#   Udi Manber, Burra Gopal: University of Arizona
#   Sun Wu : National Chung-Cheng University, Taiwan
# this version makes use of the glimpse server and strips
# out anything that looks like html.
#########################################################
#########################################################
# CONFIGURE THESE
#########################################################
#########################################################
# this little script assumes that the databases have a
# thin and fat sub directory.
#
#                  databases
#                      |
#        +------+-------+-------+
#        |      |       |       |
#       this   that   other    misc
#        |
#     +--+--------------+
#     |                 |
#    fat              thin
#     |                 |
#  .glimpse_index    .glimpse_index
#  .glimpse_stat..   .glimpse...
#  ....
#
# the fat index should have been indexed using
#     glimpseindex -o ....
#     glimpseindex -B -f -s ....
#
# two glimpseservers are needed for each database because
# there are two indexes (fat and thin).
#########################################################
#########################################################
# *** put sensible things here ***

#-------where does glimpse live?--------------------------
$glimpse_dir="/usr/local/lib/glimpse";
$glimpse_bin="$glimpse_dir";

#--------information about the database-------------------
$glimpse_dbs="/database/directory/";
$db_name="all";
$title="search my server";
$doc_root="/usr/share/htdocs/";
$doc_server="http://my_server:my_port/";

#--------if you are using the glimpseserver---------------
$thinport=3000;             #port for glimpseserver on thin database
$fatport=3001;              #port for glimpseserver on fat database
$glimpseserver="host";      #where are the glimpse servers?
$use_server=1;              #or 0

#-----------------------options---------------------------
# When left commented out, a search falls back to "fat" results and to
# case-insensitive matching, and the footer carries no mail link.
#$default_search_type = "fat";      #or thin
#$default_case_sensitive = 0;       #or 1
#$the_guru = 'username@address';

#########################################################
#########################################################
#
#       Nothing to configure below here
#
#########################################################
$default_max_hits = 20;
$glimpse_delim=": ";        # separates file name from the rest of a glimpse output line

require "www_lib.pl";

%FIELDS=&GET_FIELDS();
$my_url=&get_this_URL();    #this may fail on cern httpd

$fat_db_dir="$glimpse_dbs/$db_name/fat";
$thin_db_dir="$glimpse_dbs/$db_name/thin";

# names of the form fields
$input_field="input_field";
$result_field="result_field";
$match_field="match_field";
$word_field="word_field";
$error_field="error_field";
$case_field="case_field";

@result_set = ();           # glimpse output lines, filled in by process_query

# NOTE(review): the markup of this footer was lost from the copy this file
# was restored from; the paragraph breaks and the mail link are a
# reconstruction.
$guru_text = "glimpse guru";
if ($the_guru) {
    $guru_text = "<a href=\"mailto:$the_guru\">glimpse guru</a>";
}
$kudos="<p>
This searching capability was brought to you thanks to the Glimpse full text search engine.
<p>
Glimpse was developed by and is &copy; copyright of Udi Manber and Burra Gopal, Department of Computer Science, University of Arizona and Sun Wu, the National Chung-Cheng University, Taiwan.
<p>
Please send problem reports or enhancement requests to the $guru_text.
";

$this_is_a_subset = 0;      # set when the hit list was cut off at $max_hits

# label shown on the form => internal value
%word_options= (
    "Match whole words", "whole",
    "Allow fuzzy searching", "partial");

@case_options=( "Yes", "No" );

@match_options= ( "10", "20", "30", "40", "all matches");

%result_options = (
    "Just the number of matches", "thin",
    "with contextual text", "fat");

@error_options = (0,1,3,5,8);

$| = 1;                     #no buffering

#########################################################
# show_query_form
#   Prints the search form followed by the usage notes.
#   If any form field was submitted (but no expression,
#   otherwise main would not call us) the heading says so.
#########################################################
sub show_query_form
{
    local (@keys);

    &PRINT_HEADER ("Search the $title");

    @keys = keys %FIELDS;
    if (@keys) {
        &h2 ("Search Expression - You didnt enter an expression");
    }
    else {
        &h2 ("Search Expression");
    }

    #-------------------------------------------------------------
    &form ($my_url);
    &nobr();
    print "Enter the search expression ";
    # NOTE(review): the original tag was lost; only the field name is
    # dictated by the code (main reads $FIELDS{$input_field}).
    print "<input type=\"text\" name=\"$input_field\" size=\"40\">";
    &_nobr();

    &h3("options");
    &ul;
    &li("");
    &gen_labelled_select ( "Result type", $result_field, keys (%result_options));
    &li("");
    &gen_labelled_select ( "number of matches", $match_field, @match_options);
    &li("");
    &gen_labelled_select ( "Search Type", $word_field, keys (%word_options));
    &li("");
    &gen_labelled_select ( "match case?", $case_field, @case_options);
    # NOTE(review): the selector below appears to have been switched off by
    # the author ("too slow"); confirm against a pristine copy.
    #too slow!!!
    #&li("");
    #&gen_labelled_select ( "errors allowed", $error_field, @error_options);
    &_ul;

    # The buttons have to be printed before the form is closed.  The form
    # is closed with a literal tag rather than through a library helper.
    print "<p>
<input type=\"submit\" value=\"Search\">
<input type=\"reset\" value=\"Reset\">
</form>
";

    #-------------------------------------------------------------
    &h2 ("Notes");
    print "The search engine supports regular expressions";
    &dl;

    &dt; &bold; print "Special characters"; &_bold;
    &dd;
    # Non-interpolating here-documents are used for the help text so that
    # \ $ and # reach the browser literally.
    print <<'EOT';
 The following characters are reserved to the search engine. They should be escaped by preceeding with a back-slash if you wish to search for them.
<p>
<code>^ &nbsp; $ &nbsp; * &nbsp; [ &nbsp; ] &nbsp; | &nbsp; ( &nbsp; ) &nbsp; ! &nbsp; \ &nbsp; ; &nbsp; , &nbsp; # &nbsp; &lt; &nbsp; &gt; &nbsp; - &nbsp; .</code>
<p>
<code>^</code> &nbsp; matches the beginning of a line<br>
<code>$</code> &nbsp; matches the end of a line<br>
<code>.</code> &nbsp; matches any single character<br>
<code>#</code> &nbsp; matches any number of characters<br>
<code>*</code> &nbsp; matches any number of the previous character
EOT
    &p;

    &dt; &bold; print "Sets"; &_bold;
    &dd;
    print " a set of characters inside [] matches any of the characters in that set.\n";
    &p;

    &dt; &bold; print "Complex operations"; &_bold;
    &dd;
    print <<'EOT';
 You can contruct boolean expressions using "AND" and "OR". Complex expressions can be built by surrounding patterns with curly brackets {}.
<p>
<code>'{political OR computer} AND science'</code>
<p>
will match 'political science' or 'computer science'.
EOT
    &p;

    &dt; &bold; print "exact matches"; &_bold;
    &dd;
    print <<'EOT';
 the default behaviour is to allow mistakes in the words being searched for. Surrounding an expression in angle brackets &lt; &gt; forces an exact match on that part of the expression.
EOT
    &p;

    &dt; &bold; print "(ir)regular expressions"; &_bold;
    &dd;
    # NOTE(review): whatever this entry contained was pure markup and could
    # not be recovered.
    print " ";
    &_dl;

    print $kudos;
    &PRINT_FOOTER;
}

#########################################################
# process_query (expression)
#   Turns the user's expression into glimpse syntax, runs
#   glimpse (through the glimpse server when $use_server
#   is set, otherwise against the index directory), keeps
#   at most $max_hits output lines in @result_set and
#   prints the result page.
#
#   Form fields read from %FIELDS: number of matches,
#   result type (thin/fat), case sensitivity, whole-word
#   matching and number of errors allowed.
#########################################################
sub process_query
{
    local ($expression) = @_;
    local ($glimpse_expression, $item, @glimpse_cmd, $glimpse_pid);
    local ($max_hits, $search_type, $sensitive, $match_words, $allowed_errs);
    local (@tokens, @shown, $implicit_and, $after_term);

    #----------------preamble--------------------------------
    # "all matches" contains no digits, so it ends up empty: no limit.
    $max_hits = $FIELDS{$match_field};
    if ( $max_hits ) {
        $max_hits =~ s/\D//g;
    }
    else {
        $max_hits = $default_max_hits;
    }
    #- - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    $search_type = $FIELDS{$result_field};
    if ( $search_type ) {
        $search_type = $result_options{$search_type};
    }
    else {
        $search_type = $default_search_type;
    }
    #- - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    $sensitive = $FIELDS{$case_field};
    if ( $sensitive ) {
        $sensitive = ($sensitive eq "Yes") ? 1 : 0;
    }
    else {
        $sensitive = $default_case_sensitive;
    }
    #- - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    $match_words = $FIELDS{$word_field};
    $match_words = ( $word_options{$match_words} eq "whole" );

    # The value becomes a command line flag ("-3"), so accept nothing but
    # a single digit from 1 to 8.
    $allowed_errs = $FIELDS{$error_field};
    if ($allowed_errs !~ /^[1-8]$/) {
        $allowed_errs = 0;
    }

    #----------------convert into glimpse format--------------
    # Braces become tokens of their own; "and"/"or" in any letter case
    # become ";" and ",".  When the user gave no operator at all, an "and"
    # is assumed between neighbouring terms (but never right after "{" or
    # right before "}").
    $expression =~ s/\{/ { /g;
    $expression =~ s/\}/ } /g;
    @tokens = split (' ', $expression);     # ' ' also drops leading blanks

    if (! @tokens) {
        # nothing but white space was entered
        &show_query_form();
        return;
    }

    $implicit_and = 1;
    foreach $item (@tokens) {
        if ($item =~ /^(and|or)$/i) {
            $implicit_and = 0;
        }
    }

    $glimpse_expression = "";
    $after_term = 0;
    foreach $item (@tokens) {
        if ($item =~ /^and$/i) {
            $glimpse_expression .= ";";
            push (@shown, "and");
            $after_term = 0;
        }
        elsif ($item =~ /^or$/i) {
            $glimpse_expression .= ",";
            push (@shown, "or");
            $after_term = 0;
        }
        else {
            if ($implicit_and && $after_term && $item ne "}") {
                $glimpse_expression .= ";";
                push (@shown, "and");
            }
            $glimpse_expression .= $item;
            push (@shown, $item);
            $after_term = ($item ne "{");
        }
    }
    $expression = join (" ", @shown);       # what the result page displays

    #----------------build command ------------------------------
    push ( @glimpse_cmd, "$glimpse_bin/glimpse" );
    if ($match_words) {
        push ( @glimpse_cmd, "-w");
    }
    if ($allowed_errs > 0) {
        push ( @glimpse_cmd, "-$allowed_errs");
    }
    if ($use_server) {
        push ( @glimpse_cmd, "-C", "-J", "$glimpseserver", "-K" );
        push ( @glimpse_cmd, ($search_type eq "thin") ? "$thinport" : "$fatport" );
    }
    else {
        push ( @glimpse_cmd, "-H", ($search_type eq "thin") ? $thin_db_dir : $fat_db_dir );
    }
    if ($search_type eq "thin") {
        # show_thin_results parses "file: count" lines, so counts are
        # requested for every thin search, server or not.
        push ( @glimpse_cmd, "-c");
    }
    if (! $sensitive) {
        push ( @glimpse_cmd, "-i");
    }
    push ( @glimpse_cmd, "-y", $glimpse_expression);

    #----------------Execute------------------------------
    #print join (" ", @glimpse_cmd), "\n\n";
    # open "-|" forks: undef means the fork failed, 0 means we are the
    # child (whose STDOUT feeds GLIMPSE in the parent).
    $glimpse_pid = open (GLIMPSE, "-|");
    if (! defined ($glimpse_pid)) {
        &PRINT_HEADER ("Results of searching the $title");
        &h2 ("the search could not be started");
        &PRINT_FOOTER;
        return;
    }
    if ($glimpse_pid == 0) {
        # List form of exec: no shell sees the user's expression.
        exec (@glimpse_cmd) || print STDERR "cannot exec $glimpse_cmd[0]: $!\n";
        exit (1);       # never let a failed child carry on as the CGI
    }

    &unbuffer ( GLIMPSE);
    while (<GLIMPSE>) {
        if ( $max_hits && $. > $max_hits ) {
            $this_is_a_subset = 1;
            kill (9, $glimpse_pid);     # positive signal: the child only
            last;
        }
        $item = $_;
        $item =~ s#^\Q$doc_root\E##;    # make the file name relative to the server root
        push (@result_set, $item);
    }
    close (GLIMPSE);

    #--------------------heading on results --------------------------
    &PRINT_HEADER ("Results of searching the $title");
    if ( $this_is_a_subset ) {
        &h2("first $max_hits results");
        &hr;
        print "The results only show the first $max_hits hits.";
        &ul;
        &li(" This is not the same as the best $max_hits matches as it is dependant on the order the documents were initially indexed.");
        if ($search_type eq "fat" ) {
            &li(" This will not show the first ten documents in which matches were found.
For that you needed to select the show \"just the number of matches\" option on the search screen.");
        }
        &_ul;
        &hr;
    }
    else {
        &h2("results");
    }

    # The expression is user input: escape it before it is echoed.
    $expression =~ s/&/&amp;/g;
    $expression =~ s/</&lt;/g;
    $expression =~ s/>/&gt;/g;
    $glimpse_expression =~ s/&/&amp;/g;
    $glimpse_expression =~ s/</&lt;/g;
    $glimpse_expression =~ s/>/&gt;/g;

    &centre;&code;
    print "$expression";
    &_code;&_centre;

    #--------------------display --------------------------
    if ($search_type eq "thin") {
        &show_thin_results;
    }
    else {
        &show_fat_results;
    }

    #---------------------------------------------------------
    &PRINT_FOOTER;
}

#########################################################
# show_link (fname)
#   Prints a link to the document: $doc_server followed by
#   fname is the target, fname itself the link text.
#########################################################
sub show_link
{
    local ($fname) = @_;

    &href ( "${doc_server}$fname", "$fname");
    &newline;
}

#########################################################
# show_thin_results
#   show the documents in order: one table row per file,
#   highest number of matches first.  Expects @result_set
#   to hold "file: count" lines.
#########################################################
sub show_thin_results
{
    local ($item, %count_of, $key);

    # Keyed by file name, so files with the same number of matches do not
    # overwrite each other.  The greedy (.*) splits at the last delimiter;
    # lines that do not end in a count are ignored.
    foreach $item (@result_set) {
        if ($item =~ /^(.*)\Q$glimpse_delim\E(\d+)\s*$/) {
            $count_of{$1} = $2;
        }
    }

    # NOTE(review): the table markup was lost from the restored copy; only
    # the column headings and cell contents are original.
    &centre;
    print "<table border=\"1\">\n";
    print "<tr><th>No.<br>matches</th><th>name</th></tr>\n";
    foreach $key ( sort { $count_of{$b} <=> $count_of{$a} || $a cmp $b } (keys %count_of)) {
        print "<tr><td align=\"right\">$count_of{$key}</td><td>";
        &show_link ($key);
        print "</td></tr>\n";
    }
    print "</table>\n";
    &_centre;
}

#########################################################
# show_fat_results
#   Prints a definition list: each file name once (as a
#   link) followed by its matching lines, in the order
#   glimpse produced them.
#########################################################
sub show_fat_results
{
    local ($item, $last_fname, $colon, $fname, $value);

    $last_fname = "";
    &dl;
    foreach $item (@result_set) {
        $colon = index ( $item, $glimpse_delim);
        if ($colon < 0) {
            next;       # not a "file: text" line
        }
        $fname = substr ($item, 0, $colon);
        $value = substr ($item, $colon + length ($glimpse_delim));

        if ($fname ne $last_fname) {
            &p;
            &dt;
            &show_link ($fname);
            $last_fname = $fname;
        }

        $value = &unwebify ($value);
        # NOTE(review): the tag in front of the value was lost; <dd> is
        # assumed because the file name above is printed as a <dt>.
        print "<dd>\n$value\n";
    }
    &_dl;
    &newline;
}

#########################################################
# unbuffer (filehandle)
#   Turns on autoflush ($|) for the given file handle and
#   restores the previously selected handle.
#########################################################
sub unbuffer
{
    local ($new_fh) = @_;
    local ($old_fh);

    $old_fh = select ( $new_fh);
    $| = 1;
    select ($old_fh);
}

#########################################################
# main
#   Runs the search when an expression was submitted,
#   otherwise shows the form.
#########################################################
sub main
{
    local ($query);

    $query = $FIELDS{$input_field};
    if ($query) {
        &process_query ($query);
    }
    else {
        &show_query_form;
    }
}

&main;