Re: [htdig] patch for Accents fuzzy algorithm for 3.2.0b2


Subject: Re: [htdig] patch for Accents fuzzy algorithm for 3.2.0b2
From: Gilles Detillieux (grdetil@scrc.umanitoba.ca)
Date: Tue May 02 2000 - 13:33:49 PDT


This is an adaptation for 3.2.0b2 of Robert Marchand's latest fix to his
accents fuzzy match algorithm. You should be able to apply this patch
in the main source directory of the htdig-3.2.0b2 source tree with
"patch -p1 < this_file".

Robert's fix changes the algorithm so that the fuzzy key itself is no
longer stored as a word in the database, which saves even more database
space than his earlier writeDB() method (now obsolete and removed by this
patch). A new getWords() method adds the key to the list of words at
search time, so that htsearch will always search for the unaccented word,
even if the query was entered with accents.

*** htdig-3.2.0b2/htfuzzy/Accents.h.orig Tue Apr 11 17:53:20 2000
--- htdig-3.2.0b2/htfuzzy/Accents.h Tue May 2 12:46:42 2000
*************** public:
*** 28,38 ****
          Accents(const HtConfiguration& config_arg);
          virtual ~Accents();
  
- virtual int writeDB();
-
          virtual void generateKey(char *word, String &key);
  
          virtual void addWord(char *word);
  
  private:
  };
--- 28,38 ----
          Accents(const HtConfiguration& config_arg);
          virtual ~Accents();
  
          virtual void generateKey(char *word, String &key);
  
          virtual void addWord(char *word);
+
+ virtual void getWords(char *word, List &words);
  
  private:
  };
*** htdig-3.2.0b2/htfuzzy/Accents.cc.orig Tue Apr 11 17:53:20 2000
--- htdig-3.2.0b2/htfuzzy/Accents.cc Tue May 2 12:49:52 2000
*************** Accents::~Accents()
*** 85,140 ****
  }
  
  //*****************************************************************************
- // int Accents::writeDB()
- //
- int
- Accents::writeDB()
- {
- String var = name;
- var << "_db";
- String filename = config[var];
-
- index = Database::getDatabaseInstance(DB_HASH);
- if (index->OpenReadWrite(filename, 0664) == NOTOK)
- return NOTOK;
-
- String *s;
- char *fuzzyKey;
-
- int count = 0;
-
- dict->Start_Get();
- while ((fuzzyKey = dict->Get_Next()))
- {
- s = (String *) dict->Find(fuzzyKey);
-
- // Only add if meaningfull list
- if (mystrcasecmp(fuzzyKey, s->get()) != 0) {
-
- index->Put(fuzzyKey, *s);
-
- if (debug > 1)
- {
- cout << "htfuzzy: '" << fuzzyKey << "' ==> '" << s->get() << "'\n"
- ;
- }
- count++;
- if ((count % 100) == 0 && debug == 1)
- {
- cout << "htfuzzy: keys: " << count << '\n';
- cout.flush();
- }
- }
- }
- if (debug == 1)
- {
- cout << "htfuzzy:Total keys: " << count << "\n";
- }
- return OK;
- }
-
-
- //*****************************************************************************
  // void Accents::generateKey(char *word, String &key)
  //
  void
--- 85,90 ----
*************** Accents::addWord(char *word)
*** 170,175 ****
--- 120,129 ----
      String key;
      generateKey(word, key);
  
+ // Do not add fuzzy key as a word, will be added at search time.
+ if (mystrcasecmp(word, key.get()) == 0)
+ return;
+
      String *s = (String *) dict->Find(key);
      if (s)
      {
*************** Accents::addWord(char *word)
*** 182,184 ****
--- 136,157 ----
      }
  }
  
+
+ //*****************************************************************************
+ // void Accents::getWords(char *word, List &words)
+ //
+ void
+ Accents::getWords(char *word, List &words)
+ {
+
+ if (!word || !*word)
+ return;
+
+ Fuzzy::getWords(word, words);
+
+ // fuzzy key itself is always searched.
+ String fuzzyKey;
+ generateKey(word, fuzzyKey);
+ if (mystrcasecmp(fuzzyKey.get(), word) != 0)
+ words.Add(new String(fuzzyKey));
+ }

-- 
Gilles R. Detillieux              E-mail: <grdetil@scrc.umanitoba.ca>
Spinal Cord Research Centre       WWW:    http://www.scrc.umanitoba.ca/~grdetil
Dept. Physiology, U. of Manitoba  Phone:  (204)789-3766
Winnipeg, MB  R3E 3J7  (Canada)   Fax:    (204)789-3930

------------------------------------ To unsubscribe from the htdig mailing list, send a message to htdig-unsubscribe@htdig.org. You will receive a message to confirm this.



This archive was generated by hypermail 2b28 : Tue May 02 2000 - 11:20:33 PDT