Re: [htdig] patch for htsearch excerpt highlighting


Gilles Detillieux (grdetil@scrc.umanitoba.ca)
Tue, 24 Aug 1999 12:51:21 -0500 (CDT)


According to Patrick:
> I've also noticed that if you searched for "finding my email", and
> "my" is in the "badwords" file, it will also be highlighted in the
> excerpt. May I ask if it is possible that you also append this
> into the patch?
>
> Currently, htdig/htsearch truly ignore words in the "badwords" file,
> but they still come up in the excerpt highlighted.

Thanks for the tip. Here's a revised patch that will make htsearch,
when highlighting excerpts, skip any words that are in the bad_word_list
file or that are shorter than minimum_word_length, in addition to
skipping over punctuation:

--- htdig-3.1.2.bak/htlib/StringMatch.h Wed Apr 21 21:47:58 1999
+++ htdig-3.1.2/htlib/StringMatch.h Mon Aug 23 15:38:31 1999
@@ -98,6 +98,12 @@ public:
     void IgnoreCase();
 
     //
+ // Build a local translation table which ignores all given punctuation
+ // characters
+ //
+ void IgnorePunct(char *punct = NULL);
+
+ //
     // Determine if there is a pattern associated with this Match object.
     //
     int hasPattern() {return table[0] != 0;}
--- htdig-3.1.2.bak/htlib/StringMatch.cc Wed Apr 21 21:47:58 1999
+++ htdig-3.1.2/htlib/StringMatch.cc Mon Aug 23 16:40:14 1999
@@ -90,6 +90,8 @@ StringMatch::Pattern(char *pattern, char
         table[i] = new int[n];
         memset((unsigned char *) table[i], 0, n * sizeof(int));
     }
+ for (i = 0; i < n; i++)
+ table[0][i] = i; // "no-op" states for null char, to be ignored
 
     //
     // Set up a standard case translation table if needed.
@@ -127,6 +129,11 @@ StringMatch::Pattern(char *pattern, char
 #endif
 
         chr = trans[(unsigned char)*pattern];
+ if (chr == 0)
+ {
+ pattern++;
+ continue;
+ }
         if (chr == sep)
         {
             //
@@ -504,12 +511,39 @@ void StringMatch::TranslationTable(char
 //
 void StringMatch::IgnoreCase()
 {
- if (local_alloc)
- delete [] trans;
- trans = new unsigned char[256];
+ if (!local_alloc || !trans)
+ {
+ trans = new unsigned char[256];
+ for (int i = 0; i < 256; i++)
+ trans[i] = (unsigned char)i;
+ local_alloc = 1;
+ }
     for (int i = 0; i < 256; i++)
- trans[i] = tolower((unsigned char)i);
- local_alloc = 1;
+ if (isupper((unsigned char)i))
+ trans[i] = tolower((unsigned char)i);
+}
+
+
+//*****************************************************************************
+// void StringMatch::IgnorePunct(char *punct)
+// Set up the character translation table to ignore punctuation
+//
+void StringMatch::IgnorePunct(char *punct)
+{
+ if (!local_alloc || !trans)
+ {
+ trans = new unsigned char[256];
+ for (int i = 0; i < 256; i++)
+ trans[i] = (unsigned char)i;
+ local_alloc = 1;
+ }
+ if (punct)
+ for (int i = 0; punct[i]; i++)
+ trans[(unsigned char)punct[i]] = 0;
+ else
+ for (int i = 0; i < 256; i++)
+ if (HtIsWordChar(i) && !HtIsStrictWordChar(i))
+ trans[i] = 0;
 }
 
 
--- htdig-3.1.2.bak/htsearch/htsearch.cc Wed Aug 18 16:40:30 1999
+++ htdig-3.1.2/htsearch/htsearch.cc Tue Aug 24 12:34:23 1999
@@ -222,9 +222,11 @@ main(int ac, char **av)
     //
     origPattern += logicalPattern;
     searchWordsPattern.IgnoreCase();
- searchWordsPattern.Pattern(origPattern);
- if (debug > 2)
- cout << "Excerpt pattern: " << origPattern << "\n";
+ searchWordsPattern.IgnorePunct();
+ searchWordsPattern.Pattern(logicalPattern); // this should now be enough
+ //searchWordsPattern.Pattern(origPattern);
+ //if (debug > 2)
+ // cout << "Excerpt pattern: " << origPattern << "\n";
 
     //
     // If required keywords were given in the search form, we will
@@ -314,7 +316,8 @@ createLogicalWords(List &searchWords, St
         }
         else
             wasHidden = 1;
- if (ww->weight > 0) // Ignore boolean syntax stuff
+ if (ww->weight > 0 // Ignore boolean syntax stuff
+ && !ww->isIgnore) // Ignore short or bad words
         {
             if (pattern.length())
                 pattern << '|';

-- 
Gilles R. Detillieux              E-mail: <grdetil@scrc.umanitoba.ca>
Spinal Cord Research Centre       WWW:    http://www.scrc.umanitoba.ca/~grdetil
Dept. Physiology, U. of Manitoba  Phone:  (204)789-3766
Winnipeg, MB  R3E 3J7  (Canada)   Fax:    (204)789-3930

------------------------------------
To unsubscribe from the htdig mailing list, send a message to htdig@htdig.org containing the single word "unsubscribe" in the SUBJECT of the message.



This archive was generated by hypermail 2.0b3 on Tue Aug 24 1999 - 10:53:15 PDT