#
# Isearch search algorithm implemented in Perl
#
# Brent Baccala <baccala@freesoft.org>
# June 2001
#
# Isearch (from www.etymon.com) is a full-text search engine.  This
# module implements the core features of searching an Isearch index.
#
# What's not supported:
#    - field searches don't work
#    - no date/numeric/bounding box searches
#    - TITLEs are not retrieved from HTML documents
#    - no multi-term or boolean searches
#
# $Log: Isearch.pm,v $
# Revision 1.16  2002/05/11 18:39:55  baccala
# Fixed handling of :3 type weights at end of search terms
#
# Revision 1.15  2002/05/11 18:29:52  baccala
# Fixed handling of asterisk in search terms
#
# Revision 1.14  2002/05/05 21:11:17  baccala
# Added ability to set debug level via an environment var
#
# Revision 1.13  2002/05/05 20:58:05  baccala
# Added support for big endian architectures (like the sparc I'm now running
# it on).  Checks DBI file for architecture type database was written on,
# and adjusts it's pack templates accordingly.
#
# Revision 1.12  2002/05/05 20:27:26  baccala
# Switched to using Freesoft's scoring mechanism
#
# Revision 1.11  2002/01/24 18:46:26  baccala
# added openMDT function
# fixed typo exists/defined
#
# Revision 1.10  2001/08/04 03:24:24  baccala
# Renamed package from Isearch2 to Isearch
#
# Revision 1.9  2001/07/04 19:43:45  baccala
# Added stop word list from Isearch
#
# Revision 1.8  2001/07/04 19:00:39  baccala
# Corrected slight problem with limit finding that could, in a rare case,
# drop a single valid entry at the end of the range
#
# Revision 1.7  2001/07/02 19:43:56  baccala
# Revised version works more like C Isearch - binary searches MDT on disk,
# rather than reading the whole thing into memory.
#
# Revision 1.6  2001/07/02 17:02:37  baccala
# &fetchBuffer now returns undef if it can't open a file instead of killing
# the whole script
#
# Revision 1.5  2001/06/21 06:36:32  baccala
# Dropped use of Tree::RedBlack; now read in raw MDT
# Fixed a slight bug in the binary search algorithm
#
# Revision 1.4  2001/06/21 05:11:20  baccala
# Added recursive decent parser (working, but not yet complete) to
# parse boolean search expressions
#
# Revision 1.3  2001/06/07 03:47:26  baccala
# Handles deleted documents, and that means it handles changed documents (yea!)
#

package Isearch;

use strict;
use Data::Dumper;

my $debug = 0;

if (exists $ENV{ISEARCH_DEBUG}) {
    $debug = $ENV{ISEARCH_DEBUG};
}

# Isearch indexes documents using a single global index (32 bits),
# which maps to character positions in the files.  To resolve a global
# index, you look it up in the Master Document Table (MDT) which lists
# initial and final global indices for each file in the collection,
# find the right file, then subtract the file's initial index to get a
# character offset in the file.  Ex: index.htm is 400 bytes long and
# maps to global indices 100001-100400.  So global index 100200
# corresponds to the 200th byte in index.htm.  The main index is just
# a sorted list of global indices for every word in the collection;
# you binary search it.

# &lookupGP - Find a global position (GP) in the MDT and returns its
# MDT entry.  Assumes MDT is already open on filehandle MDT, and
# number of records is in $mdtsize.  Assumes we're using an index
# built with Isearch v1.47, which writes 1380-byte MDT records using a
# C structure.

my $mdtsize;
my %DBI;

sub openMDT {
    my ($db) = @_;
    my $key;

    print "reading DBI for $db\n" if $debug>0;

    open(DBI,"<$db.dbi") or die "opening $db.dbi";

    while (<DBI>) {
	if (/^ \+(.*)/) {
	    $key = $1;
	}
	if (/^  \+(.*)/) {
	    $DBI{$key} = $1;
	    print "read DBI $key -> $1\n" if $debug>1;
	}
    }

    close(DBI);

    print "reading MDT for $db\n" if $debug>0;

    open(MDT, "<$db.mdt") or die "opening $db.mdt";
    binmode MDT;

    my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
	$atime,$mtime,$ctime,$blksize,$blocks) = stat(MDT)
	    or die "stat MDT";

    die "Invalid sized MDT for $db\n" if ($size%1380 != 0);

    $mdtsize = $size/1380;
}

sub lookupGP {

    my ($gp) = @_;

    my $pack_template;

    if ($DBI{BigEndian}) {
	$pack_template = 'Z16Z64Z255Z1024xNNNNCxxx';
    } else {
	$pack_template = 'Z16Z64Z255Z1024xVVVVCxxx';
    }

    my $left = 0;
    my $right = $mdtsize - 1;

    while ($left <= $right) {
	my $mid = int(($left + $right)/2);

	my $mdtentry;
	seek(MDT, 1380 * $mid, 0);
	read(MDT, $mdtentry, 1380);

	my ($dockey, $doctype, $pathname, $filename,
	    $gfstart, $gfend, $lrstart, $lrend,
	    $deleted) = unpack($pack_template, $mdtentry);

	if ($gp >= $gfstart and $gp <= $gfend) {
	    my $value = {
		DocKey => $dockey,
		DocType => $doctype,
		Pathname => $pathname,
		Filename => $filename,
		GFstart => $gfstart,
		GFend => $gfend,
		LRstart => $lrstart,
		LRend => $lrend,
		Deleted => $deleted
		};
	    return $value;
	} elsif ($gp < $gfstart) {
	    $right = $mid-1;
	} else {
	    $left = $mid+1;
	}
    }

    return undef;
}

my $readlen = 32;
my $exact = 0;

# &fetchBuffer($db, $inx) retrieves one word from the document
# collection, from the $inx'th position in the index file, and returns
# it.  Can return undef if the document was marked deleted. The index
# file is assumed to be opened on file handle INX.  Since the index
# file is just a sorted list of 32-bit global indices, we seek to
# 4*$inx in the index file, read the global index we find there, look
# it up in the MDT, open the file pointed to by the MDT entry, seek
# into that file, and read up to $readlen bytes from the indicated
# position.  Unless we're doing an exact match, we then throw away
# everything after the first non-word-constituent character and return
# a lowercased version of the word.

sub fetchBuffer {
    my ($db, $inx) = @_;
    my $buffer;
    my $gpos;

    seek(INX, 4*$inx, 0);
    read(INX, $buffer, 4);

    if ($DBI{BigEndian}) {
	$gpos = unpack('N', $buffer);
    } else {
	$gpos = unpack('V', $buffer);
    }

    my $mdtentry = &lookupGP($gpos);
    die "mdtentry undef" unless defined $mdtentry;

    return undef if $$mdtentry{Deleted};

    open(FILE, $$mdtentry{Pathname} . $$mdtentry{Filename})
	or return undef;
    binmode FILE;
    seek(FILE, $gpos - $$mdtentry{GFstart}, 0);
    read(FILE, $buffer, $readlen);
    close FILE;

    if ($exact) {
	$buffer =~ s/[\W_]$//;
	$buffer =~ s/[\W_]/ /g;
    } else {
	$buffer =~ s/[\W_].*//s;
    }

    #print "$inx($gpos) = $buffer\n";

    return lc $buffer;
}

# &fetchBufferNearestMid - retrieves one word from the document collection
# near the midpoint of index positions ($left, $right).  Uses &fetchBuffer,
# and takes into consideration that the actual midpoint might correspond
# to a deleted document.  Constructs a range around the midpoint containing
# deleted entries and bracketed on each side by either a legitimate hit
# or the limit of the original range.  Returns a four element list -
# left word, left index, right word, right index.  Each word will be
# undef if everything on that side up to the initial endpoint was deleted.

sub fetchBufferNearestMid {
    my ($db, $left, $right) = @_;

    my $mid = int(($right + $left)/2);
    my $buffer = &fetchBuffer($db, $mid);

    if (defined $buffer) {
	print "fetchBufferNearestMid returns ($buffer, $mid)\n" if $debug>1;
	return ($buffer, $mid, $buffer, $mid);
    }

    my ($bufferl, $midl, $bufferr, $midr);

    for ($midl = $mid-1; $midl >= $left; $midl --) {
	
	$bufferl = &fetchBuffer($db, $midl);
	last if defined $bufferl;
    }

    for ($midr = $mid+1; $midr <= $right; $midr ++) {
	
	$bufferr = &fetchBuffer($db, $midr);
	last if defined $bufferr;
    }

    print "fetchBufferNearestMid returns ($bufferl, $midl, $bufferr, $midr)\n"
	if $debug>1;

    return ($bufferl, $midl, $bufferr, $midr);
}

# &binarySearch - uses &fetchBufferNearestMid (so index file must be
# open on INX) to binary search an index file.  If a hit was found,
# immediately returns a reference to a three-item list: [$left, $hit,
# $right], where $hit is the index of the hit, while $left and $right
# bracket it, since there might be more than one matching index.
# $left is always less than the lowest matching index, while $right is
# always greater than the highest matching index.

sub binarySearch {
    my ($db, $searchWord, $left, $right) = @_;

    while ($left <= $right) {

	print "binarySearch($searchWord) [$left, $right]\n" if $debug>1;

	my ($bufferl, $midl, $bufferr, $midr)
	    = &fetchBufferNearestMid($db, $left, $right);

	if (defined $bufferl and $bufferl eq $searchWord) {
	    return [$left, $midl, $right];
	} elsif (defined $bufferr and $bufferr eq $searchWord) {
	    return [$left, $midr, $right];
	} elsif (defined $bufferr and $bufferr lt $searchWord) {
	    $left = $midr+1;
	} elsif (defined $bufferl and $bufferl gt $searchWord) {
	    $right = $midl-1;
	} else {
	    # by this point...
	    #    if defined $bufferl, $bufferl lt $searchWord
	    #    if defined $bufferr, $bufferr gt $searchWord
	    # ...so $searchWord must be in the range of deleted entries
	    return undef;
	}
    }

    return undef;
}

# &limitSearchLeft - called after &binarySearch to find the left
# boundary of the range of matching indices.  Returns the lowest index
# that matches $searchWord.  None of the indices in the passed-in
# range should be gt $searchWord, so we're just looking for the
# boundary between "lt" (on the left) and "eq" (on the right).  Uses
# &fetchBuffer, so index file must still be open on INX.

sub limitSearchLeft {
    my ($db, $searchWord, $left, $right) = @_;

    while ($left+1 < $right) {

	print "limitSearchLeft($searchWord) [$left, $right]\n" if $debug>1;

	my ($bufferl, $midl, $bufferr, $midr) =
	    &fetchBufferNearestMid($db, $left, $right);

	if (defined $bufferl and $bufferl eq $searchWord) {
	    $right = $midl;
	} elsif (defined $bufferr and $bufferr lt $searchWord) {
	    $left = $midr+1;
	} else {
	    # by this point...
	    #    if defined $bufferl, $bufferl lt $searchWord
	    #    if defined $bufferr, $bufferr eq $searchWord
	    # ...so the boundary must be in the range of deleted entries,
	    # and the returned right midpoint is the lowest matching index
	    return $midr if defined $bufferr;

	    # The passed-in right boundary should have been a legitimate
	    # hit returned by &binarySearch, so...
	    die "\$bufferr should never be undef in limitSearchLeft";
	}
    }

    my ($bufferl, $midl, $bufferr, $midr) =
	&fetchBufferNearestMid($db, $left, $left);

    return $left if (defined $bufferl and $bufferl eq $searchWord);
    return $right;
}

# &limitSearchRight - same deal for finding the right limit of
# the matching indices

sub limitSearchRight {
    my ($db, $searchWord, $left, $right) = @_;

    while ($left+1 < $right) {

	print "limitSearchRight($searchWord) [$left, $right]\n" if $debug>1;

	my ($bufferl, $midl, $bufferr, $midr) =
	    &fetchBufferNearestMid($db, $left, $right);

	if (defined $bufferl and $bufferl gt $searchWord) {
	    $right = $midl-1;
	} elsif (defined $bufferr and $bufferr eq $searchWord) {
	    $left = $midr;
	} else {
	    return $midl if defined $bufferl;
	    die "\$bufferl should never be undef in limitSearchRight";
	}
    }

    my ($bufferl, $midl, $bufferr, $midr) =
	&fetchBufferNearestMid($db, $right, $right);

    return $right if (defined $bufferr and $bufferr eq $searchWord);
    return $left;
}

# &searchTermINX - searches for a particular term in a particular index
# and returns a list of matching global positions (possibly the empty list).
#
# Opens the index (passed in), &binarySearch's for the term, then uses
# &limitSearch's to find the left and right boundaries.  Reads from the
# index file everything between the boundaries (these will be the
# global positions that matched), unpacks this into a list and returns it.

sub searchTermINX {
    my ($db, $index, $searchWord) = @_;

    open(INX, $index) or die "opening $index";
    binmode INX;

    my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
	$atime,$mtime,$ctime,$blksize,$blocks) = stat(INX)
	    or die "stat INX";

    my $result = &binarySearch($db, $searchWord, 0, int($size/4));

    if (not defined $result) {
	close INX;
	return ();
    }

    my $left = &limitSearchLeft($db, $searchWord, $$result[0], $$result[1]);
    my $right = &limitSearchRight($db, $searchWord, $$result[1], $$result[2]);
    print "$index: [$left, $right]\n" if $debug>0;

    my $buffer;
    seek(INX, 4*$left, 0);
    read(INX, $buffer, 4*($right-$left+1)) == 4*($right-$left+1)
	or die "short read on INX";

    close (INX);

    if ($DBI{BigEndian}) {
	return unpack('N*', $buffer);
    } else {
	return unpack('V*', $buffer);
    }
}

# &searchTermPrim - searches for a word in a database and returns a list
# of matching global positions.  If the database hasn't been optimized,
# then there might be multiple index files.  Calls &searchTermINX for
# every index file and concatenates all the results.  Note that the
# result set may contains global positions pointing to deleted files;
# the MDT still needs to be checked for each result to weed out deletions.

sub searchTermPrim {

    my ($db, $searchWord) = @_;

    if ($searchWord =~ s/\*$//) {
	# Wildcard - throw away * (the s did) and only match first n letters
	$readlen = length($searchWord);
    } else {
	# No wildcard - match n + 1 letters to make sure the word ends
	$readlen = length($searchWord) + 1;
    }

    $searchWord = lc $searchWord;
    $searchWord =~ s/[\W_]/ /g;
    $exact = 1;

    if (not -r "$db.inx.1") {
	return &searchTermINX($db, "$db.inx", $searchWord);
    } else {
	return map { &searchTermINX($db, $_, $searchWord) } (glob "$db.inx.*");
    }
}

# &searchTerm - searches for a word in a database and returns a hash
# reference.  The hash keys are filenames and the values are hit
# counts, normalized by dividing through by the square root of the sum
# of the squares.

sub searchTerm {

    my ($db, $searchWord) = @_;
    my @results;
    my %SCORES;
    my $mdtentry;
    my $weight;

    if ($searchWord =~ s/:([0-9]+)$//) {
	$weight = $1;
    } else {
	$weight = 1;
    }

    @results = &searchTermPrim($db, $searchWord);

    foreach my $gp (sort {$a <=> $b} @results) {

	if ((not defined $mdtentry) or $gp > $$mdtentry{GFend}) {
	    $mdtentry = &lookupGP($gp);
	}

	next if $$mdtentry{Deleted};

	# Standard Isearch scoring mechanism - scores all hits the same
	#$SCORES{$$mdtentry{Pathname}.$$mdtentry{Filename}} ++;

	# Freesoft's experimental scoring mechanism - score hits inversely
	# proportional to the size of the file
	$SCORES{$$mdtentry{Pathname}.$$mdtentry{Filename}} +=
	(1 / $$mdtentry{LRend});

    }

    my $sumsq = 0;

    foreach my $file (keys %SCORES) {
	$sumsq += $SCORES{$file} * $SCORES{$file};
    }

    my $sqrtsum = sqrt($sumsq);

    foreach my $file (keys %SCORES) {
	$SCORES{$file} /= $sqrtsum;
	$SCORES{$file} *= $weight;
    }

    return \%SCORES;
}

sub searchExpr {
    my ($db, @searchPhrase) = @_;
    my $result;

    ($result, @searchPhrase) = &searchExpr2($db, @searchPhrase);

    while (@searchPhrase > 0 and
	   ($searchPhrase[0] eq "or" or $searchPhrase[0] eq "||")) {

	my $result2;

	shift @searchPhrase;
	($result2, @searchPhrase) = &searchExpr2($db, @searchPhrase);

	foreach my $file (keys %$result2) {
	    $$result{$file} += $$result2{$file};
	}
    }

    return $result, @searchPhrase;
}

sub searchExpr2 {
    my ($db, @searchPhrase) = @_;
    my $result;

    ($result, @searchPhrase) = &searchExpr3($db, @searchPhrase);

    while (@searchPhrase > 0 and
	   ($searchPhrase[0] eq "and" or $searchPhrase[0] eq "&&")) {

	my $result2;
	my $newResult = {};

	shift @searchPhrase;
	($result2, @searchPhrase) = &searchExpr3($db, @searchPhrase);

	foreach my $file (keys %$result) {
	    if (exists $$result2{$file}) {
		$$newResult{$file} = $$result{$file} + $$result2{$file};
	    }
	}

	$result = $newResult;
    }

    return $result, @searchPhrase;
}

sub searchExpr3 { return &searchExpr4(@_); }
#sub searchExpr3 {
#    my ($db, @searchPhrase) = @_;
#
#    my ($result, @restPhrase) = &searchExpr4($db, @searchPhrase);
#
#    return $result if $#restPhrase == -1;
#
#    while ($restPhrase[0] eq "andnot" or $restPhrase[0] eq "&!") {
#	my $rightResult = &searchExpr4($db, shift @restPhrase);
#	# ANDNOT merge $rightResult into $result
#    }
#}

sub searchExpr4 { return &searchExpr5(@_); }
#sub searchExpr4 {
#    my ($db, @searchPhrase) = @_;
#
#    my ($result, @restPhrase) = &searchExpr5($db, @searchPhrase);
#
#    return $result if $#restPhrase == -1;
#
#    while ($restPhrase[0] eq "near")
#	my $rightResult = &searchExpr5($db, shift @restPhrase);
#	# NEAR merge $rightResult into $result
#    }
#}

sub searchExpr5 {
    my ($db, @searchPhrase) = @_;
    my $result;

    if ($searchPhrase[0] eq "(") {
	shift @searchPhrase;
	($result, @searchPhrase) = &searchExpr($db, @searchPhrase);
	if (shift @searchPhrase ne ")") {
	    die "Unterminated parenthesis in search request";
	}
    } else {
	$result = &searchTerm($db, shift @searchPhrase);

	while (defined $searchPhrase[0] and $searchPhrase[0] ne "("
	       and $searchPhrase[0] ne "and"
	       and $searchPhrase[0] ne "or"
	       and $searchPhrase[0] ne "andnot"
	       and $searchPhrase[0] ne "near") {

	    my $result2 = &searchTerm($db, shift @searchPhrase);
	    my $newResult = {};

	    foreach my $file (keys %$result) {
		if (exists $$result2{$file}) {
		    $$newResult{$file} = $$result{$file} + $$result2{$file};
		}
	    }

	    $result = $newResult;
	}
    }

    return $result, @searchPhrase;
}

# stop word list taken directly from Isearch (sw.hxx)

my @stopwords = (
  "0",
  "1",
  "2",
  "3",
  "4",
  "5",
  "6",
  "7",
  "8",
  "9",
  "a",
  "about",
  "above",
  "according",
  "across",
  "actually",
  "adj",
  "after",
  "afterwards",
  "again",
  "against",
  "all",
  "almost",
  "alone",
  "along",
  "already",
  "also",
  "although",
  "always",
  "among",
  "amongst",
  "an",
  "and",
  "another",
  "any",
  "anyhow",
  "anyone",
  "anything",
  "anywhere",
  "are",
  "aren't",
  "around",
  "as",
  "at",
  "b",
  "be",
  "became",
  "because",
  "become",
  "becomes",
  "becoming",
  "been",
  "before",
  "beforehand",
  "begin",
  "beginning",
  "behind",
  "being",
  "below",
  "beside",
  "besides",
  "between",
  "beyond",
  "billion",
  "both",
  "but",
  "by",
  "c",
  "can",
  "can't",
  "cannot",
  "caption",
  "co",
  "co.",
  "could",
  "couldn't",
  "d",
  "did",
  "didn't",
  "do",
  "does",
  "doesn't",
  "don't",
  "down",
  "during",
  "e",
  "each",
  "eg",
  "eight",
  "eighty",
  "either",
  "else",
  "elsewhere",
  "end",
  "ending",
  "enough",
  "etc",
  "even",
  "ever",
  "every",
  "everyone",
  "everything",
  "everywhere",
  "except",
  "f",
  "few",
  "fifty",
  "first",
  "five",
  "for",
  "former",
  "formerly",
  "forty",
  "found", "",
  "four",
  "from",
  "further",
  "g",
  "h",
  "had",
  "has",
  "hasn't",
  "have",
  "haven't",
  "he",
  "he'd",
  "he'll",
  "he's",
  "hence",
  "her",
  "here",
  "here's",
  "hereafter",
  "hereby",
  "herein",
  "hereupon",
  "hers",
  "herself",
  "him",
  "himself",
  "his",
  "how",
  "however",
  "hundred",
  "i",
  "i'd",
  "i'll",
  "i'm",
  "i've",
  "ie",
  "if",
  "in",
  "inc.",
  "indeed",
  "instead",
  "into",
  "is",
  "isn't",
  "it",
  "it's",
  "its",
  "itself",
  "j",
  "k",
  "l",
  "last",
  "later",
  "latter",
  "latterly",
  "least",
  "less",
  "let",
  "let's",
  "like",
  "likely",
  "ltd",
  "m",
  "made",
  "make",
  "makes",
  "many",
  "maybe",
  "me",
  "meantime",
  "meanwhile",
  "might",
  "million",
  "miss",
  "more",
  "moreover",
  "most",
  "mostly",
  "mr",
  "mrs",
  "much",
  "must",
  "my",
  "myself",
  "n",
  "namely",
  "neither",
  "never",
  "nevertheless",
  "next",
  "nine",
  "ninety",
  "no",
  "nobody",
  "none",
  "nonetheless",
  "noone",
  "nor",
  "not",
  "nothing",
  "now",
  "nowhere",
  "o",
  "of",
  "off",
  "often",
  "on",
  "once",
  "one",
  "one's",
  "only",
  "onto",
  "or",
  "other",
  "others",
  "otherwise",
  "our",
  "ours",
  "ourselves",
  "out",
  "over",
  "overall",
  "own",
  "p",
  "per",
  "perhaps",
  "q",
  "r",
  "rather",
  "recent",
  "recently",
  "s",
  "same",
  "seem",
  "seemed",
  "seeming",
  "seems",
  "seven",
  "seventy",
  "several",
  "she",
  "she'd",
  "she'll",
  "she's",
  "should",
  "shouldn't",
  "since",
  "six",
  "sixty",
  "so",
  "some",
  "somehow",
  "someone",
  "something",
  "sometime",
  "sometimes",
  "somewhere",
  "still",
  "stop",
  "such",
  "t",
  "taking",
  "ten",
  "than",
  "that",
  "that'll",
  "that's",
  "that've",
  "the",
  "their",
  "them",
  "themselves",
  "then",
  "thence",
  "there",
  "there'd",
  "there'll",
  "there're",
  "there's",
  "there've",
  "thereafter",
  "thereby",
  "therefore",
  "therein",
  "thereupon",
  "these",
  "they",
  "they'd",
  "they'll",
  "they're",
  "they've",
  "thirty",
  "this",
  "those",
  "though",
  "thousand",
  "three",
  "through",
  "throughout",
  "thru",
  "thus",
  "to",
  "together",
  "too",
  "toward",
  "towards",
  "trillion",
  "twenty",
  "two",
  "u",
  "under",
  "unless",
  "unlike",
  "unlikely",
  "until",
  "up",
  "upon",
  "us",
  "used",
  "using",
  "v",
  "very",
  "via",
  "w",
  "was",
  "wasn't",
  "we",
  "we'd",
  "we'll",
  "we're",
  "we've",
  "well",
  "were",
  "weren't",
  "what",
  "what'll",
  "what's",
  "what've",
  "whatever",
  "when",
  "whence",
  "whenever",
  "where",
  "where's",
  "whereafter",
  "whereas",
  "whereby",
  "wherein",
  "whereupon",
  "wherever",
  "whether",
  "which",
  "while",
  "whither",
  "who",
  "who'd",
  "who'll",
  "who's",
  "whoever",
  "whole",
  "whom",
  "whomever",
  "whose",
  "why",
  "will",
  "with",
  "within",
  "without",
  "won't",
  "would",
  "wouldn't",
  "x",
  "y",
  "yes",
  "yet",
  "you",
  "you'd",
  "you'll",
  "you're",
  "you've",
  "your",
  "yours",
  "yourself",
  "yourselves",
  "z"
);

my @specials = ("and", "or", "andnot", "near");

# (([^\s()&\|"{]|&(?![&!])|\|(?!\|)|"[^"]*"|\{[^}]*\})+|\(|\)|&&|&!|\|\|)

sub search {
    my ($db, $searchWord) = @_;

    my $searchPhrase = $searchWord;
    my @searchPhrase;
    while ($searchPhrase) {
	if ($searchPhrase =~ s!^\s+!!) {
	    # Throw away whitespace
	} elsif ($searchPhrase =~ s!^"([^"]*)"!!) {
	    my $word = lc $1;
	    push @searchPhrase, $word unless grep {$_ eq $word} @stopwords;
	} elsif ($searchPhrase =~ s!^([[:alnum:]]+\*?(:[0-9]+)?)!!) {
	    my $word = lc $1;
	    if (grep {$_ eq $word} @specials) {
		push @searchPhrase, $word;
	    } else {
		push @searchPhrase, $word unless grep {$_ eq $word} @stopwords;
	    }
	} else {
	    $searchPhrase =~ s!^(.)!!;
	    push @searchPhrase, $1;
	}
    }

    print Dumper(\@searchPhrase) if $debug>0;

    &openMDT($db);

    my $result;
    ($result, @searchPhrase) = &searchExpr($db, @searchPhrase);
    die "unterminated search expression" if (@searchPhrase > 0);

    close(MDT);

    print "search done\n" if $debug>0;

    return $result;
}

1;
