[htdig] Latin1.patch


Subject: [htdig] Latin1.patch
From: Alexey Rodriguez (alexey@dicyt.umss.edu.bo)
Date: Fri Jun 30 2000 - 04:58:16 PDT


        Hello diggers! It took a while because of lack of time, but I
have finished the patch. It corrects the weird-endings problem that I
reported previously on this mailing list. It now works with ispell
files "out of the box", although the only one that had problems was the
Spanish one. It also corrects the Latin-1 character encoding, which
previously was hard-coded for German. It now supports every Latin-1 ispell
.aff file. I tested with Spanish, German and Portuguese, but it should work
for others.
        The mungeWord code is horribly slow but works for now; maybe I'll
recode it using regexes to make it faster. E-mail me if you have any trouble.
        Geoff, I had problems providing the dictionary to mungeWord, since
it is a static function, so I pass it in as an extra parameter. I hope that
is OK.

                                        Alexey Rodriguez

---8<---- cut here ------8<------

diff -rc htdig-3.1.5/htfuzzy/Endings.h mod/htdig-3.1.5/htfuzzy/Endings.h
*** htdig-3.1.5/htfuzzy/Endings.h Fri Feb 25 02:29:10 2000
--- mod/htdig-3.1.5/htfuzzy/Endings.h Thu Jun 29 14:37:09 2000
***************
*** 39,52 ****
      //
      int createDB(Configuration &config);
          
! static void mungeWord(char *, String &);
      
  private:
      Database *root2word;
      Database *word2root;
  
! int createRoot(Dictionary &, char *, char *, char *);
! int readRules(Dictionary &, char *);
      void expandWord(String &, List &, Dictionary &, char *, char *);
  };
  
--- 39,52 ----
      //
      int createDB(Configuration &config);
          
! static void mungeWord(char *, String &, Dictionary &);
      
  private:
      Database *root2word;
      Database *word2root;
  
! int createRoot(Dictionary &, char *, char *, char *, Dictionary &);
! int readRules(Dictionary &, char *, Dictionary &);
      void expandWord(String &, List &, Dictionary &, char *, char *);
  };
  
diff -rc htdig-3.1.5/htfuzzy/EndingsDB.cc mod/htdig-3.1.5/htfuzzy/EndingsDB.cc
*** htdig-3.1.5/htfuzzy/EndingsDB.cc Fri Feb 25 02:29:10 2000
--- mod/htdig-3.1.5/htfuzzy/EndingsDB.cc Fri Jun 30 10:01:28 2000
***************
*** 25,31 ****
  int
  Endings::createDB(Configuration &config)
  {
! Dictionary rules;
      String tmpdir = getenv("TMPDIR");
      String word2root, root2word;
      if (tmpdir.length())
--- 25,31 ----
  int
  Endings::createDB(Configuration &config)
  {
! Dictionary rules, lat_encoding;
      String tmpdir = getenv("TMPDIR");
      String word2root, root2word;
      if (tmpdir.length())
***************
*** 45,58 ****
      if (debug)
          cout << "htfuzzy/endings: Reading rules\n";
          
! if (readRules(rules, config["endings_affix_file"]) == NOTOK)
          return NOTOK;
  
      if (debug)
          cout << "htfuzzy/endings: Creating databases\n";
          
      if (createRoot(rules, word2root, root2word,
! config["endings_dictionary"]) == NOTOK)
          return NOTOK;
  
      //
--- 45,65 ----
      if (debug)
          cout << "htfuzzy/endings: Reading rules\n";
          
! if (readRules(rules, config["endings_affix_file"], lat_encoding) == NOTOK)
          return NOTOK;
  
+ lat_encoding.Start_Get();
+ char *s;
+ while(s=lat_encoding.Get_Next())
+ {
+ cout<<s<<" "<< * (String*) lat_encoding[s] <<endl;
+ }
+
      if (debug)
          cout << "htfuzzy/endings: Creating databases\n";
          
      if (createRoot(rules, word2root, root2word,
! config["endings_dictionary"], lat_encoding) == NOTOK)
          return NOTOK;
  
      //
***************
*** 69,75 ****
  
  //*****************************************************************************
  int
! Endings::readRules(Dictionary &rules, char *rulesFile)
  {
      FILE *fl = fopen(rulesFile, "r");
  
--- 76,82 ----
  
  //*****************************************************************************
  int
! Endings::readRules(Dictionary &rules, char *rulesFile, Dictionary &lat_encoding)
  {
      FILE *fl = fopen(rulesFile, "r");
  
***************
*** 77,106 ****
          return NOTOK;
  
      int inSuffixes = 0;
      char currentSuffix[2] = " ";
! char *p;
      char input[1024];
      String line;
          
      while (fgets(input, sizeof(input), fl))
      {
          if (input[0] == '\n' || input[0] == '#')
              continue;
  
          if (mystrncasecmp(input, "suffixes", 8) == 0)
          {
              inSuffixes = 1;
              continue;
          }
          else if (mystrncasecmp(input, "prefixes", 8) == 0)
          {
              inSuffixes = 0;
              continue;
          }
! if (!inSuffixes)
              continue;
  
! if (mystrncasecmp(input, "flag ", 5) == 0)
          {
              p = input + 5;
              while (*p == '*' || *p == ' ' || *p == '\t')
--- 84,179 ----
          return NOTOK;
  
      int inSuffixes = 0;
+ int inLatin1 = 0;
      char currentSuffix[2] = " ";
! char *p, *t;
      char input[1024];
      String line;
+ String Target;
+ String Origin;
          
      while (fgets(input, sizeof(input), fl))
      {
+ // As code gets more complex it should be considered
+ // to redesign the parser or to move to yacc as ispell
+ // does.
          if (input[0] == '\n' || input[0] == '#')
              continue;
  
          if (mystrncasecmp(input, "suffixes", 8) == 0)
          {
              inSuffixes = 1;
+ inLatin1=0;
              continue;
          }
          else if (mystrncasecmp(input, "prefixes", 8) == 0)
          {
              inSuffixes = 0;
+ inLatin1=0;
              continue;
          }
! else if (mystrncasecmp(input, "altstringtype", 13) == 0)
! {
! if(mystrcasestr(input,"latin1")) inLatin1=1;
! else inLatin1=0;
! inSuffixes=0;
              continue;
+ }
+ if (!inSuffixes)
+ {
+ if(!inLatin1 || (mystrncasecmp(input, "altstringchar", 13) != 0 ) )
+ {
+ continue;
+ }
+ else
+ {
+ p = input;
+ p += 13; // Skip "altstringchar" thingy
+ Target="";
+ Origin="";
+ while(*p == ' ' || *p == '\t')
+ p++;
+
+ // Parse the latin1 encoded character
+ while(*p != ' ' && *p != '\t')
+ {
+ if(*p=='\\') // I am considering only two posibilities hex char or octal char
+ {
+ *p='0';
+ Target << (char) strtol(p,&t,0);
+ p=t;
+ }
+ else // Read an ordinary character
+ {
+ Target << *p;
+ p++;
+ }
+ }
+
+ // Skip blank spaces
+ while(*p == ' ' || *p == '\t')
+ p++;
+
+ // Parse the character as it is encoded in standard ispell files
+ while(*p != ' ' && *p != '\t' && *p != '\n')
+ {
+ if(*p=='\\') { // backslash is quoting the next character
+ Origin << *(p+1);
+ p+=2;
+ }
+ else if ((*p=='\'') || (*p=='\"'))
+ p++; // only skip and forget about it
+ // NOTE: this could be erroneous if there's a space or \t in the quoting
+ else {
+ Origin << *p; // ordinary characters make into Origin
+ p++;
+ }
+ }
+ lat_encoding.Add(Origin,new String(Target));
+ }
+ } // End of latin1 encoding
  
! else if (mystrncasecmp(input, "flag ", 5) == 0)
          {
              p = input + 5;
              while (*p == '*' || *p == ' ' || *p == '\t')
***************
*** 114,120 ****
              if (line.indexOf('>') > 0)
              {
                  List *list;
! SuffixEntry *se = new SuffixEntry(line);
                          
                  if (rules.Exists(currentSuffix))
                  {
--- 187,193 ----
              if (line.indexOf('>') > 0)
              {
                  List *list;
! SuffixEntry *se = new SuffixEntry(line, lat_encoding);
                          
                  if (rules.Exists(currentSuffix))
                  {
***************
*** 138,144 ****
  
  //*****************************************************************************
  int
! Endings::createRoot(Dictionary &rules, char *word2root, char *root2word, char *dictFile)
  {
      FILE *fl = fopen(dictFile, "r");
      if (fl == NULL)
--- 211,217 ----
  
  //*****************************************************************************
  int
! Endings::createRoot(Dictionary &rules, char *word2root, char *root2word, char *dictFile, Dictionary & lat_encoding)
  {
      FILE *fl = fopen(dictFile, "r");
      if (fl == NULL)
***************
*** 173,179 ****
  
          *p++ = '\0';
  
! mungeWord(input, word);
          expandWord(words, wordList, rules, word, p);
  
          if (debug > 1)
--- 246,252 ----
  
          *p++ = '\0';
  
! mungeWord(input, word, lat_encoding);
          expandWord(words, wordList, rules, word, p);
  
          if (debug > 1)
***************
*** 212,281 ****
  // any accents will be combined into single characters.
  //
  void
! Endings::mungeWord(char *input, String &word)
  {
! char *p = input + 1;
      
      word = 0;
      while (*input)
      {
! p = input + 1;
! switch (*p)
          {
! case '"': // The previous character needs to get an umlaut
! switch (*input)
! {
! case 'a':
! case 'A':
! word << char(228);
! input += 2;
! continue;
! break;
! case 'e':
! case 'E':
! word << char(235);
! input += 2;
! continue;
! break;
! case 'i':
! case 'I':
! word << char(239);
! input += 2;
! continue;
! break;
! case 'o':
! case 'O':
! word << char(246);
! input += 2;
! continue;
! break;
! case 'u':
! case 'U':
! word << char(252);
! input += 2;
! continue;
! break;
! }
! break;
!
! case 'S': // See if the previous character needs to be an sz
! if (*input == 's')
! {
! word << char(223);
! input += 2;
! continue;
! }
! else
! {
! word << *input;
! }
! break;
!
! default:
! word << *input;
! break;
          }
- input++;
      }
      word.lowercase();
  }
--- 285,315 ----
  // any accents will be combined into single characters.
  //
  void
! Endings::mungeWord(char *input, String &word, Dictionary &lat_encoding)
  {
! char *p = input + 1 , *s;
! int len;
      
      word = 0;
      while (*input)
      {
! lat_encoding.Start_Get();
! // Replace ispell codification with latin1 codification
! // Slow, maybe in a next time this will be regexp'd
! while(s = lat_encoding.Get_Next())
          {
! if(mystrncasecmp(input ,s ,strlen(s) ) == 0)
! {
! word << (String*) lat_encoding[s];
! input += strlen( s );
! break;
! }
! }
! if(!s) // No matches
! {
! word << *input;
! input ++;
          }
      }
      word.lowercase();
  }
diff -rc htdig-3.1.5/htfuzzy/SuffixEntry.cc mod/htdig-3.1.5/htfuzzy/SuffixEntry.cc
*** htdig-3.1.5/htfuzzy/SuffixEntry.cc Fri Feb 25 02:29:10 2000
--- mod/htdig-3.1.5/htfuzzy/SuffixEntry.cc Thu Jun 29 14:44:43 2000
***************
*** 19,27 ****
  //*****************************************************************************
  // SuffixEntry::SuffixEntry()
  //
! SuffixEntry::SuffixEntry(char *str)
  {
! parse(str);
  }
  
  
--- 19,27 ----
  //*****************************************************************************
  // SuffixEntry::SuffixEntry()
  //
! SuffixEntry::SuffixEntry(char *str, Dictionary &lat_encoding)
  {
! parse(str, lat_encoding);
  }
  
  
***************
*** 38,44 ****
  // Parse a string in the format <expr> '>' <rule> into ourselves.
  //
  void
! SuffixEntry::parse(char *str)
  {
      String temp = 0;
      
--- 38,44 ----
  // Parse a string in the format <expr> '>' <rule> into ourselves.
  //
  void
! SuffixEntry::parse(char *str, Dictionary &lat_encoding)
  {
      String temp = 0;
      
***************
*** 56,70 ****
      while (*str == ' ' || *str == '\t' || *str == '>')
          str++;
  
! Endings::mungeWord(temp, expression);
      
      temp = 0;
! while (*str != ' ' && *str != '\t' && *str != '\n' && *str != '\r' && *str)
      {
! temp << *str;
          str++;
      }
! Endings::mungeWord(temp, rule);
  }
-
-
--- 56,70 ----
      while (*str == ' ' || *str == '\t' || *str == '>')
          str++;
  
! Endings::mungeWord(temp, expression, lat_encoding);
      
      temp = 0;
! while (*str != '#' && *str != '\n' && *str != '\r' && *str)
      {
! if(*str!= ' ' && *str!= '\t') {
! temp << *str;
! }
          str++;
      }
! Endings::mungeWord(temp, rule, lat_encoding);
  }
diff -rc htdig-3.1.5/htfuzzy/SuffixEntry.h mod/htdig-3.1.5/htfuzzy/SuffixEntry.h
*** htdig-3.1.5/htfuzzy/SuffixEntry.h Fri Feb 25 02:29:10 2000
--- mod/htdig-3.1.5/htfuzzy/SuffixEntry.h Thu Jun 29 14:43:08 2000
***************
*** 15,20 ****
--- 15,21 ----
  #define _SuffixEntry_h_
  
  #include "Object.h"
+ #include <Dictionary.h>
  #include <htString.h>
  
  
***************
*** 24,36 ****
          //
          // Construction/Destruction
          //
! SuffixEntry(char *);
                                          ~SuffixEntry();
  
          String expression;
          String rule;
  
! void parse(char *str);
          
  private:
  };
--- 25,37 ----
          //
          // Construction/Destruction
          //
! SuffixEntry(char *, Dictionary &lat_encoding);
                                          ~SuffixEntry();
  
          String expression;
          String rule;
  
! void parse(char *str, Dictionary &lat_encoding);
          
  private:
  };

------------------------------------
To unsubscribe from the htdig mailing list, send a message to
htdig-unsubscribe@htdig.org
You will receive a message to confirm this.



This archive was generated by hypermail 2b28 : Fri Jun 30 2000 - 06:01:15 PDT