ht://Dig 3.1.0


Domotor Akos (dome@impulzus.sch.bme.hu)
Mon, 14 Sep 1998 18:07:07 +0200 (CEST)


Hi Andrew,

I've just installed the new ht://Dig for the prefix matching algorithm,
which was indeed the best change. Just some remarks:

- I had to change the references "db_init(NULL);" in "htlib/DB2_db.cc"
to "db_init((char *)NULL);" since it wouldn't compile because of the
declaration. Platform dependent?
[I'm using Linux 2.0.34 on AMD-K5, Debian2, libc6, gcc2.7.2.8 and 2.8.0]

- All the stuff in "htfuzzy/" needed libstdc++.so.2.8.0 since "__throw"
was not defined in 2.7.2.8

- I find it a bit lame that there is no dependency check in the Makefile.
If I modify something in "htfuzzy/", it modifies "htfuzzy/libfuzzy.a",
which is linked into "htsearch/htsearch", but a "make" in "htsearch/" says
"htsearch" is up-to-date, which it _is_not_.

- I've implemented a long-desired option: searching for accented
words. I put it into "htfuzzy/Prefix.cc". It currently handles the Hungarian
accents, but one can trivially extend it to other accented characters.

If the unaccented version of a character is given, it will try the
unaccented character (case insensitive) and both the accented small letter
and the accented capital. If the accented small letter is written, it will
try the accented small letter and the accented capital. However, if the
accented capital is given, it will try only that accented capital.

TODO:
- Move it to its proper place in order to work for ordinary (non-prefix)
searches as well.
- Implement a checkbox or option selector to decide whether this feature
is to be switched on/off.

                                        Greets,
                                                        Akos

--------------------------------------------------------------------
Akos Domotor: scient. fellow http://impulzus.sch.bme.hu/dome
Techn. Univ. of Budapest, Hungary, Faculty of Electrical Engineering
Department of Measurement and Information Systems
H-1521 Budapest, Muegyetem rkp. 9., Hungary, (+36-1) 463-2057

//
// Prefix.cc
//
// Implementation of Prefix
//
// $Log: Prefix.cc,v $
// Revision 1.2 1998/08/03 16:50:38 ghutchis
//
// Fixed compiler warnings under -Wall
//
// Revision 1.1 1998/06/21 23:20:04 turtle
// patches by Esa and Jesse to add BerkeleyDB and Prefix searching
//
// Revision 1.2 1997/03/24 04:33:18 turtle
// Renamed the String.h file to htString.h to help compiling under win32
//
// Revision 1.1.1.1 1997/02/03 17:11:12 turtle
// Initial CVS
//
//
#if RELEASE
static char RCSid[] = "$Id: Prefix.cc,v 1.2 1998/08/03 16:50:38 ghutchis Exp $";
#endif

#include <ctype.h>
#include "Prefix.h"
#include <htString.h>
#include <List.h>
#include <StringMatch.h>
#include <Configuration.h>

extern Configuration config;


//*****************************************************************************
// Prefix::Prefix()
//   Default constructor. The body is empty: this file gives the class no
//   state of its own to set up.
//
Prefix::Prefix()
{
}


//*****************************************************************************
// Prefix::~Prefix()
//   Destructor. The body is empty: getWords() opens, closes and deletes its
//   database handle itself, so nothing is left to release here.
//
Prefix::~Prefix()
{
}


//*****************************************************************************
// static int charcode(char c)
//   Returns the un-accented character code.
//
//   The accented vowels listed in the switch below (8-bit codes, named here
//   by their HTML entities) are mapped to the plain ASCII vowel of the same
//   case. Every other character is returned unchanged.
//
//   The result is always non-negative (it is in the range of unsigned char),
//   even on platforms where plain "char" is signed. This matters because the
//   callers hand the result to tolower(), whose behaviour is undefined for
//   negative arguments other than EOF.
//
static int charcode(char c)
{
  const unsigned char code = (unsigned char)c;

  switch (code) {
    case 225: return 'a'; // &aacute;
    case 193: return 'A'; // &Aacute;
    case 233: return 'e'; // &eacute;
    case 201: return 'E'; // &Eacute;
    case 237: return 'i'; // &iacute;
    case 205: return 'I'; // &Iacute;
    case 243: // &oacute;
    case 244: // &ocirc;
    case 245: // &otilde;
    case 246: return 'o'; // &ouml;
    case 211: // &Oacute;
    case 212: // &Ocirc;
    case 213: // &Otilde;
    case 214: return 'O'; // &Ouml;
    case 250: // &uacute;
    case 251: // &ucirc;
    case 252: return 'u'; // &uuml;
    case 218: // &Uacute;
    case 219: // &Ucirc;
    case 220: return 'U'; // &Uuml;
    // Not a known accented vowel: pass it through, but as an unsigned value
    // so that bytes >= 0x80 do not come back negative.
    default: return code;
  }
}

//*****************************************************************************
// static int strncasecmp_noaccent(char *s1, char *s2, int len)
// Compares two strings (as strncasecmp) with accent- and case insensitivity
//
static int strncasecmp_noaccent(const char *s1, const char *s2, int len)
{
  int f;
  
  for (f=0; f<len; f++) {
    if (tolower(charcode(s1[f]))<tolower(charcode(s2[f]))) return -1;
    if (tolower(charcode(s1[f]))>tolower(charcode(s2[f]))) return 1;
  }
  return 0;
}

//*****************************************************************************
// static void steptonext(char *c, int *flag)
//   Advances "*c" to the next accented variant of the same vowel.
//
//   The variants form a chain such as 'a' -> '&aacute;' -> '&Aacute;', so
//   that a plain letter is tried with every accent, while an accented small
//   letter still leads to its capital but not the other way round. The chains
//   for 'o' and 'u' are longer because they have more accented forms.
//
//   When "*c" is already the last variant of its chain, "*c" is left alone
//   and "*flag" is set to 1. In every other case "*flag" is not touched.
//   Characters that belong to no chain change nothing at all.
//
static void steptonext(char *c, int *flag)
{
  struct Transition {
    unsigned char from;
    unsigned char to;
  };

  // One entry per character that has a successor in its chain.
  static const Transition transitions[] = {
    // a / A
    {'a', 225}, {225, 193}, {'A', 193},
    // e / E
    {'e', 233}, {233, 201}, {'E', 201},
    // i / I
    {'i', 237}, {237, 205}, {'I', 205},
    // o / O
    {'o', 243}, {243, 246}, {246, 245}, {245, 211}, {244, 211},
    {'O', 211}, {211, 214}, {214, 213},
    // u / U
    {'u', 250}, {250, 252}, {252, 251}, {251, 218},
    {'U', 218}, {218, 220}, {220, 219}
  };

  // Characters that end a chain: there is nothing to step to.
  static const unsigned char chain_ends[] = { 193, 205, 201, 213, 212, 219 };

  const unsigned char current = (unsigned char)*c;
  unsigned int i;

  for (i = 0; i < sizeof(transitions) / sizeof(transitions[0]); i++) {
    if (transitions[i].from == current) {
      *c = (char)transitions[i].to;
      return;
    }
  }

  for (i = 0; i < sizeof(chain_ends) / sizeof(chain_ends[0]); i++) {
    if (chain_ends[i] == current) {
      *flag = 1;
      return;
    }
  }
}

//*****************************************************************************
// static int recurse_accents(char *w, const char *w2, int pos)
// Recursively presents all of the possible accented combinations for a given word
// Returns 0 if there is no more combinations, 1 if there is.
// The original search string should be in "w", the result is in "w2".
// "pos" is the recursive index of the first triable position.
//
static int recurse_accents(const char *w, char *w2, int pos)
{
  int f,flag;
  char c;
  
  for (f=pos; (f<strlen(w2)); f++) {
    c=tolower(charcode(w2[f]));
    if (c=='a' || c=='e' || c=='i' || c=='o' || c=='u') break;
  }
  if (f>=strlen(w2)) return 0;

  flag=recurse_accents(w,w2,f+1);
  if (!flag) {
    steptonext(w2+f,&flag);
    if (flag) {
      w2[f]=w[f];
      return 0;
    }
  }
  return 1;
}


//*****************************************************************************
//
// Prefix search
//
void
Prefix::getWords(char *w, List &words)
{

    if (w == NULL || w[0] == '\0')
        return;

    char *prefix_suffix = config["prefix_match_character"];
    int prefix_suffix_length = prefix_suffix == NULL
                                        ? 0 : strlen(prefix_suffix);
    int minimum_prefix_length = config.Value("minimum_prefix_length");

    if (debug)
         cout << " word=" << w << " prefix_suffix=" << prefix_suffix
                << " prefix_suffix_length=" << prefix_suffix_length
                << " minimum_prefix_length=" << minimum_prefix_length << "\n";

    if (strlen(w) < minimum_prefix_length + prefix_suffix_length)
        return;

    // A null prefix character means that prefix matching should be
    // applied to every search word; otherwise return if the word does
    // not end in the prefix character(s).
    //
    if (prefix_suffix_length > 0
            && strcmp(prefix_suffix, w+strlen(w)-prefix_suffix_length))
        return;

    Database *dbf = Database::getDatabaseInstance();
    dbf->OpenRead(config["word_db"]);

    int wordCount = 0;
    int maximumWords = config.Value("max_prefix_matches", 1000);
    char *s;
    int len = strlen(w) - prefix_suffix_length;
    
    // Strip the prefix character(s)
    char w2[8192];
    strncpy(w2, w, sizeof(w2) - 1);
    w2[sizeof(w2) - 1] = '\0';
    w2[strlen(w2) - prefix_suffix_length] = '\0';
    
    dbf->Start_Seq(w2);

    while ((wordCount < maximumWords && (s = dbf->Get_Next_Seq())))
    {
        if (!strncasecmp_noaccent(s, w, len) && !strncasecmp(s, w2, len)) {
                   // ^
                   // Maybe there is not un-accented version in the dictionary so
                   // "Start_Seq" finds the accented word at the first time but we
                   // do not want it to add for every combination.
          words.Add(new String(s));
          wordCount++;
          continue;
        }
        else {
                   // If the prefix matches we can continue with the next
          if (!strncasecmp(s, w, len)) continue;
                   // but if it is not we should generate the next accented combination
          if (!recurse_accents(w,w2,0)) break;
                   // and jump to it.
          dbf->Start_Seq(w2);
        }
    }
    dbf->Close();
    delete dbf;
}


//*****************************************************************************
// int Prefix::openIndex(Configuration &)
//   Does nothing and always returns 0; the configuration argument is unused.
//   getWords() opens the word database itself on every call.
//
int
Prefix::openIndex(Configuration &)
{
  return 0;
}


//*****************************************************************************
// void Prefix::generateKey(char *, String &)
//   Intentionally empty: both arguments are ignored and no key is produced.
//   NOTE(review): presumably present only to satisfy the base-class
//   interface -- confirm against Prefix.h.
//
void
Prefix::generateKey(char *, String &)
{
}


//*****************************************************************************
// void Prefix::addWord(char *)
//   Intentionally empty: the word is ignored and nothing is stored.
//   NOTE(review): presumably present only to satisfy the base-class
//   interface -- confirm against Prefix.h.
//
void
Prefix::addWord(char *)
{
}






This archive was generated by hypermail 2.0b3 on Sat Jan 02 1999 - 16:27:43 PST