Re: [htdig] problems with the "accent" patch


Subject: Re: [htdig] problems with the "accent" patch
From: Gilles Detillieux (grdetil@scrc.umanitoba.ca)
Date: Thu Mar 02 2000 - 09:48:55 PST


According to Eric.Doutreleau@int-evry.fr:
> I m quite interested in the accent patch By Robert Marchand.
>
> But i have problem to apply the patch to the 3.1.5 tree.
>
> I had to apply it by hand. I reindex my base and add the accents
> clause in the fuzzy line in the htdig.conf.

Robert didn't use diff -c or diff -u to make the patch, and didn't
include the paths to the files, so it doesn't readily apply on its
own, leaving you little choice but to apply it by hand.

> When i search my database i got far less answer than without the accent
> patch.

Strange. I could see how it would have no effect, if it wasn't working,
but I don't know why you'd get fewer results. Perhaps you broke something
in the code when applying the changes, or you accidentally disabled some
of the other fuzzy algorithms in your search_algorithms attribute.

> I guess that the patch should have worked at least for one person :)
>
> My question is this one
> how do you apply the patch?
>
> Thanks in advance for any help

I've just whipped together this patch for 3.1.5, which you should be able
to apply with "patch -p1 < this_file" while in the main source directory.
I should warn you that I haven't tested this under 3.1.5 yet. I did apply
it to 3.2, with a few necessary changes, and my preliminary tests there
showed that it worked. I made one change to Robert's code: when using the
characters as subscripts into the MinusculeISOLAT1 array, it's necessary
to cast them to unsigned char, or this will break on systems where chars
are signed by default.

        ----------------------------------
        diff -c3prN htdig-3.1.5{,.accents}
        ----------------------------------
diff -c3prN htdig-3.1.5/htcommon/defaults.cc htdig-3.1.5.accents/htcommon/defaults.cc
*** htdig-3.1.5/htcommon/defaults.cc Thu Feb 24 20:29:10 2000
--- htdig-3.1.5.accents/htcommon/defaults.cc Thu Mar 2 11:20:55 2000
*************** ConfigDefaults defaults[] =
*** 27,32 ****
--- 27,33 ----
      //
      // General defaults
      //
+ {"accents_db", "${database_base}.accents.db"},
      {"add_anchors_to_excerpt", "true"},
      {"allow_in_form", ""},
      {"allow_numbers", "false"},
diff -c3prN htdig-3.1.5/htfuzzy/Accents.cc htdig-3.1.5.accents/htfuzzy/Accents.cc
*** htdig-3.1.5/htfuzzy/Accents.cc Wed Dec 31 18:00:00 1969
--- htdig-3.1.5.accents/htfuzzy/Accents.cc Thu Mar 2 11:25:42 2000
***************
*** 0 ****
--- 1,173 ----
+ //
+ // Accents.cc
+ //
+ // Implementation of Accents
+ //
+ //
+ //
+ #if RELEASE
+ static char RCSid[] = "$Id: $";
+ #endif
+
+ #include "Configuration.h"
+ #include "htconfig.h"
+ #include "Accents.h"
+ #include "Dictionary.h"
+ #include <ctype.h>
+ #include <fstream.h>
+
+ extern int debug;
+
+ /*---------------------------------------------------------------.
+ | Added by Robert Marchand to allow proper handling of ISO-Latin |
+ | (derived from Pierre Rosa's code) |
+ `---------------------------------------------------------------*/
+
+ /*--------------------------------------------------.
+ | ISO-Latin-1 table, lower-cased and with accents stripped |
+ `--------------------------------------------------*/
+
+ static char MinusculeISOLAT1[256] = {
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
+ 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
+ 'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
+ 'x', 'y', 'z', 91, 92, 93, 94, 95,
+ 96, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
+ 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
+ 'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
+ 'x', 'y', 'z', 123, 124, 125, 126, 127,
+ 128, 129, 130, 131, 132, 133, 134, 135,
+ 136, 137, 138, 139, 140, 141, 142, 143,
+ 144, 145, 146, 147, 148, 149, 150, 151,
+ 152, 153, 154, 155, 156, 157, 158, 159,
+ 160, 161, 162, 163, 164, 165, 166, 167,
+ 168, 169, 170, 171, 172, 173, 174, 175,
+ 176, 177, 178, 179, 180, 181, 182, 183,
+ 184, 185, 186, 187, 188, 189, 190, 191,
+ 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c',
+ 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
+ 208, 'n', 'o', 'o', 'o', 'o', 'o', 'o',
+ 'o', 'u', 'u', 'u', 'u', 'y', 222, 223,
+ 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c',
+ 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
+ 240, 'n', 'o', 'o', 'o', 'o', 'o', 'o',
+ 'o', 'u', 'u', 'u', 'u', 'y', 254, 255};
+
+
+ //*****************************************************************************
+ // Accents::Accents()
+ //
+ Accents::Accents()
+ {
+ name = "accents";
+ }
+
+
+ //*****************************************************************************
+ // Accents::~Accents()
+ //
+ Accents::~Accents()
+ {
+ }
+
+ //*****************************************************************************
+ // int Accents::writeDB(Configuration &config)
+ //
+ int
+ Accents::writeDB(Configuration &config)
+ {
+ String var = name;
+ var << "_db";
+ String filename = config[var];
+
+ index = Database::getDatabaseInstance();
+ if (index->OpenReadWrite(filename, 0664) == NOTOK)
+ return NOTOK;
+
+ String *s;
+ char *fuzzyKey;
+
+ int count = 0;
+
+ dict->Start_Get();
+ while ((fuzzyKey = dict->Get_Next()))
+ {
+ s = (String *) dict->Find(fuzzyKey);
+
+ // Only add if the word list is meaningful (i.e. it differs from its key)
+ if (mystrcasecmp(fuzzyKey, s->get()) != 0) {
+
+ index->Put(fuzzyKey, *s);
+
+ if (debug > 1)
+ {
+ cout << "htfuzzy: '" << fuzzyKey << "' ==> '" << s->get() << "'\n"
+ ;
+ }
+ count++;
+ if ((count % 100) == 0 && debug == 1)
+ {
+ cout << "htfuzzy: keys: " << count << '\n';
+ cout.flush();
+ }
+ }
+ }
+ if (debug == 1)
+ {
+ cout << "htfuzzy:Total keys: " << count << "\n";
+ }
+ return OK;
+ }
+
+
+ //*****************************************************************************
+ // void Accents::generateKey(char *word, String &key)
+ //
+ void
+ Accents::generateKey(char *word, String &key)
+ {
+
+ if (!word || !*word)
+ return;
+
+ key = '0';
+ while (*word) {
+ key << MinusculeISOLAT1[ (unsigned char) *word++ ];
+ }
+ }
+
+
+ //*****************************************************************************
+ // void Accents::addWord(char *word)
+ //
+ void
+ Accents::addWord(char *word)
+ {
+ if (!dict)
+ {
+ dict = new Dictionary;
+ }
+
+ String key;
+ generateKey(word, key);
+
+ String *s = (String *) dict->Find(key);
+ if (s)
+ {
+ // if (mystrcasestr(s->get(), word) != 0)
+ (*s) << ' ' << word;
+ }
+ else
+ {
+ dict->Add(key, new String(word));
+ }
+ }
+
diff -c3prN htdig-3.1.5/htfuzzy/Accents.h htdig-3.1.5.accents/htfuzzy/Accents.h
*** htdig-3.1.5/htfuzzy/Accents.h Wed Dec 31 18:00:00 1969
--- htdig-3.1.5.accents/htfuzzy/Accents.h Thu Mar 2 11:24:56 2000
***************
*** 0 ****
--- 1,30 ----
+ //
+ // Accents.h
+ //
+ // $Id: $
+ //
+ //
+ #ifndef _Accents_h_
+ #define _Accents_h_
+
+ #include "Fuzzy.h"
+
+ class Accents : public Fuzzy
+ {
+ public:
+ //
+ // Construction/Destruction
+ //
+ Accents();
+ virtual ~Accents();
+
+ virtual int writeDB(Configuration &config);
+
+ virtual void generateKey(char *word, String &key);
+
+ virtual void addWord(char *word);
+
+ private:
+ };
+
+ #endif
diff -c3prN htdig-3.1.5/htfuzzy/Fuzzy.cc htdig-3.1.5.accents/htfuzzy/Fuzzy.cc
*** htdig-3.1.5/htfuzzy/Fuzzy.cc Thu Feb 24 20:29:10 2000
--- htdig-3.1.5.accents/htfuzzy/Fuzzy.cc Thu Mar 2 11:22:14 2000
*************** static char RCSid[] = "$Id: Fuzzy.cc,v 1
*** 13,18 ****
--- 13,19 ----
  #include "Configuration.h"
  #include "List.h"
  #include "StringList.h"
+ #include "Accents.h"
  #include "Endings.h"
  #include "Exact.h"
  #include "Metaphone.h"
*************** Fuzzy::getFuzzyByName(char *name)
*** 171,176 ****
--- 172,179 ----
          return new Soundex();
      else if (mystrcasecmp(name, "metaphone") == 0)
          return new Metaphone();
+ else if (mystrcasecmp(name, "accents") == 0)
+ return new Accents();
      else if (mystrcasecmp(name, "endings") == 0)
          return new Endings();
      else if (mystrcasecmp(name, "synonyms") == 0)
diff -c3prN htdig-3.1.5/htfuzzy/Makefile.in htdig-3.1.5.accents/htfuzzy/Makefile.in
*** htdig-3.1.5/htfuzzy/Makefile.in Thu Feb 24 20:29:10 2000
--- htdig-3.1.5.accents/htfuzzy/Makefile.in Thu Mar 2 11:23:48 2000
*************** include $(top_builddir)/Makefile.config
*** 10,20 ****
  OBJS= Endings.o EndingsDB.o Exact.o \
                  Fuzzy.o Metaphone.o Soundex.o \
                  SuffixEntry.o Synonym.o htfuzzy.o \
! Substring.o Prefix.o
  
  LIBOBJS= Endings.o Exact.o Fuzzy.o Metaphone.o \
                  Soundex.o Synonym.o EndingsDB.o SuffixEntry.o \
! Substring.o Prefix.o
  
  TARGET= htfuzzy
  LIBTARGET= libfuzzy.a
--- 10,20 ----
  OBJS= Endings.o EndingsDB.o Exact.o \
                  Fuzzy.o Metaphone.o Soundex.o \
                  SuffixEntry.o Synonym.o htfuzzy.o \
! Substring.o Prefix.o Accents.o
  
  LIBOBJS= Endings.o Exact.o Fuzzy.o Metaphone.o \
                  Soundex.o Synonym.o EndingsDB.o SuffixEntry.o \
! Substring.o Prefix.o Accents.o
  
  TARGET= htfuzzy
  LIBTARGET= libfuzzy.a
diff -c3prN htdig-3.1.5/htfuzzy/htfuzzy.cc htdig-3.1.5.accents/htfuzzy/htfuzzy.cc
*** htdig-3.1.5/htfuzzy/htfuzzy.cc Thu Feb 24 20:29:11 2000
--- htdig-3.1.5.accents/htfuzzy/htfuzzy.cc Thu Mar 2 11:23:12 2000
*************** static char RCSid[] = "$Id: htfuzzy.cc,v
*** 43,48 ****
--- 43,49 ----
  
  #include "htfuzzy.h"
  #include "Fuzzy.h"
+ #include "Accents.h"
  #include "Soundex.h"
  #include "Endings.h"
  #include "Metaphone.h"
*************** main(int ac, char **av)
*** 108,113 ****
--- 109,118 ----
          {
              wordAlgorithms.Add(new Metaphone);
          }
+ else if (mystrcasecmp(av[i], "accents") == 0)
+ {
+ wordAlgorithms.Add(new Accents);
+ }
          else if (mystrcasecmp(av[i], "endings") == 0)
          {
              noWordAlgorithms.Add(new Endings);
*************** usage()
*** 237,242 ****
--- 242,248 ----
      cout << "Supported algorithms:\n";
      cout << "\tsoundex\n";
      cout << "\tmetaphone\n";
+ cout << "\taccents\n";
      cout << "\tendings\n";
      cout << "\tsynonyms\n";
      cout << "\n";

-- 
Gilles R. Detillieux              E-mail: <grdetil@scrc.umanitoba.ca>
Spinal Cord Research Centre       WWW:    http://www.scrc.umanitoba.ca/~grdetil
Dept. Physiology, U. of Manitoba  Phone:  (204)789-3766
Winnipeg, MB  R3E 3J7  (Canada)   Fax:    (204)789-3930

------------------------------------ To unsubscribe from the htdig mailing list, send a message to htdig-unsubscribe@htdig.org You will receive a message to confirm this.



This archive was generated by hypermail 2b28 : Thu Mar 02 2000 - 09:53:32 PST