[htdig3-dev] patches to speed up htdig


Gilles Detillieux (grdetil@scrc.umanitoba.ca)
Wed, 27 Jan 1999 15:32:09 -0600 (CST)


Hi again. I went through the code looking for places to take config
dictionary lookups out of loops, to speed up htdig. Here's the patch
I came up with.

It includes another little change I made to fix what looked like a bug
to me. The handling of the keywords_meta_tag_names attribute was not
consistent with the other string list handling. As it was, it seems it
needed to have the list elements separated by no more than one space,
as each space was replaced with a '|'. Maybe someone can correct me if
I'm wrong, but I think using the StringList class for this attribute,
and allowing any number of spaces or tabs as separators, is a better
way of dealing with it. I don't currently allow commas as separators,
to be consistent with other string list attributes, but that could easily
be added to the list of separators if someone feels it should be.

--- ./htcommon/WordList.cc.speed Wed Jan 20 19:59:12 1999
+++ ./htcommon/WordList.cc Tue Jan 26 16:26:56 1999
@@ -144,11 +144,12 @@
     int control = 0;
     int alpha = 0;
     static int allow_numbers = config.Boolean("allow_numbers", 0);
+ static int minimum_word_length = config.Value("minimum_word_length", 3);
 
     if (badwords.Exists(word))
         return 0;
 
- if (strlen(word) < config.Value("minimum_word_length"))
+ if (strlen(word) < minimum_word_length)
       return 0;
 
     while (word && *word)
@@ -266,6 +267,7 @@
     char *word;
     String new_word;
     char *valid_punctuation = config["valid_punctuation"];
+ int minimum_word_length = config.Value("minimum_word_length", 3);
 
     while (fl && fgets(buffer, sizeof(buffer), fl))
     {
@@ -277,7 +279,7 @@
             new_word = word; // We need to clean it up before we add it
             new_word.lowercase(); // Just in case someone enters an odd one
             new_word.remove(valid_punctuation);
- if (new_word.length() >= config.Value("minimum_word_length", 3))
+ if (new_word.length() >= minimum_word_length)
               badwords.Add(new_word, 0);
           }
     }
--- ./htcommon/DocumentRef.cc.speed Sat Jan 23 21:13:42 1999
+++ ./htcommon/DocumentRef.cc Tue Jan 26 17:22:46 1999
@@ -311,7 +311,7 @@
     addstring(DOC_NOTIFICATION, s, docNotification);
     addstring(DOC_SUBJECT, s, docSubject);
 #ifdef HAVE_LIBZ
- int cf=config.Value("compression_level",0);
+ static int cf=config.Value("compression_level",0);
     if (cf) {
       //
       // Now compress s into c_s
@@ -363,7 +363,8 @@
     char *s;
     char *end;
     String c_s;
- if (config.Value("compression_level",0)) {
+ static int cf=config.Value("compression_level",0);
+ if (cf) {
       // Decompress stream
       z_stream d_stream; /* decompression stream */
 
@@ -594,9 +595,11 @@
     words->DocumentID(docID);
     
     // Parse words, taking care of valid_punctuation.
- char *p = desc;
- char *valid_punctuation = config["valid_punctuation"];
- int minimum_word_length = config.Value("minimum_word_length", 3);
+ char *p = desc;
+ static char *valid_punctuation = config["valid_punctuation"];
+ static int minimum_word_length = config.Value("minimum_word_length", 3);
+ static double description_factor = config.Double("description_factor");
+ static int max_descriptions = config.Value("max_descriptions", 5);
 
     // Not restricted to this size, just used as a hint.
     String word(MAX_WORD_LENGTH);
@@ -616,7 +619,7 @@
 
       if (word.length() >= minimum_word_length)
         // The wordlist takes care of lowercasing; just add it.
- words->Word(word, 0, 0, config.Double("description_factor"));
+ words->Word(word, 0, 0, description_factor);
 
       // No need to count in valid_punctuation for the beginning-char.
       while (*p && !isalnum(*p))
@@ -627,7 +630,7 @@
     words->Flush();
     
     // Now are we at the max_description limit?
- if (descriptions.Count() >= config.Value("max_descriptions", 5))
+ if (descriptions.Count() >= max_descriptions)
           return;
           
     descriptions.Start_Get();
--- ./htdig/HTML.cc.speed Thu Jan 14 22:52:19 1999
+++ ./htdig/HTML.cc Tue Jan 26 17:23:37 1999
@@ -100,6 +100,7 @@
 #include <Configuration.h>
 #include <ctype.h>
 #include <StringMatch.h>
+#include <StringList.h>
 #include <URL.h>
 
 static StringMatch tags;
@@ -131,11 +132,15 @@
     hrefMatch.IgnoreCase();
     hrefMatch.Pattern("href");
 
- String keywordNames = config["keywords_meta_tag_names"];
- keywordNames.replace(' ', '|');
- keywordNames.remove(",\t\r\n");
+ //String keywordNames = config["keywords_meta_tag_names"];
+ //keywordNames.replace(' ', '|');
+ //keywordNames.remove(",\t\r\n");
+ //keywordsMatch.IgnoreCase();
+ //keywordsMatch.Pattern(keywordNames);
+ StringList keywordNames(config["keywords_meta_tag_names"], " \t");
     keywordsMatch.IgnoreCase();
- keywordsMatch.Pattern(keywordNames);
+ keywordsMatch.Pattern(keywordNames.Join('|'));
+ keywordNames.Release();
     
     word = 0;
     href = 0;
@@ -203,8 +208,8 @@
       // Filter out section marked to be ignored for indexing.
       // This can contain any HTML.
       //
- char *skip_start = config["noindex_start"];
- char *skip_end = config["noindex_end"];
+ static char *skip_start = config["noindex_start"];
+ static char *skip_end = config["noindex_end"];
       if (strncmp((char *)position, skip_start, strlen(skip_start)) == 0)
         {
           q = (unsigned char*)strstr((char *)position, skip_end);
--- ./htdig/ExternalParser.cc.speed Wed Jan 20 12:08:29 1999
+++ ./htdig/ExternalParser.cc Tue Jan 26 16:47:00 1999
@@ -239,13 +239,15 @@
                   // (or class). Which should not stop anybody from
                   // finding a better solution.
                   // For now, there is duplicated code.
- StringMatch keywordsMatch;
- String keywordNames = config["keywords_meta_tag_names"];
-
- keywordNames.replace(' ', '|');
- keywordNames.remove(",\t\r\n");
- keywordsMatch.IgnoreCase();
- keywordsMatch.Pattern(keywordNames);
+ static StringMatch *keywordsMatch = 0;
+ if (!keywordsMatch)
+ {
+ StringList kn(config["keywords_meta_tag_names"], " \t");
+ keywordsMatch = new StringMatch();
+ keywordsMatch->IgnoreCase();
+ keywordsMatch->Pattern(kn.Join('|'));
+ kn.Release();
+ }
     
                   // <URL:http://www.w3.org/MarkUp/html-spec/html-spec_5.html#SEC5.2.5>
                   // says that the "name" attribute defaults to
@@ -280,7 +282,7 @@
                   //
                   if (*name != '\0' && *content != '\0')
                   {
- if (keywordsMatch.CompareWord(name))
+ if (keywordsMatch->CompareWord(name))
                     {
                       char *w = strtok(content, " ,\t\r");
                       while (w)
--- ./htdig/SGMLEntities.cc.speed Tue Jan 19 23:41:20 1999
+++ ./htdig/SGMLEntities.cc Tue Jan 26 17:03:15 1999
@@ -215,6 +215,9 @@
 {
     String entity;
     unsigned char *orig = entityStart;
+ static int translate_quot = config.Boolean("translate_quot");
+ static int translate_amp = config.Boolean("translate_amp");
+ static int translate_lt_gt = config.Boolean("translate_lt_gt");
     
     if (*entityStart == '&')
         entityStart++; // Don't need the '&' that starts the entity
@@ -225,7 +228,7 @@
         entity << *entityStart++;
       }
 
- if ( !config.Boolean("translate_quot") )
+ if ( !translate_quot )
       {
         //
         // Do NOT translate entities for '"' (quote).
@@ -238,7 +241,7 @@
           }
       }
 
- if ( !config.Boolean("translate_amp") )
+ if ( !translate_amp )
       {
         //
         // Do NOT translate entities for '&' since they can
@@ -252,7 +255,7 @@
           }
       }
 
- if ( !config.Boolean("translate_lt_gt") )
+ if ( !translate_lt_gt )
       {
         //
         // Do NOT translate entities for '<' and '>' since they can

The results of these patches were surprising. I expected a speed-up,
but this about halved the user CPU time in htdig! It reduced the total
elapsed time by about 35%. The effect on other utilities (htmerge,
htfuzzy) was negligible.

This was the 011799 snapshot:
htdig: Run complete
htdig: 1 server seen:
htdig: www.scrc.umanitoba.ca:80 392 documents
49.00user 4.32system 1:08.25elapsed 78%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (20747major+18666minor)pagefaults 0swaps

This was the 012499 snapshot:
htdig: Run complete
htdig: 1 server seen:
htdig: www.scrc.umanitoba.ca:80 392 documents
50.89user 4.37system 1:06.18elapsed 83%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (19989major+18902minor)pagefaults 0swaps

This was the 012499 snapshot, with my speed-up patches:
htdig: Run complete
htdig: 1 server seen:
htdig: www.scrc.umanitoba.ca:80 392 documents
22.25user 4.64system 0:43.28elapsed 62%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (20668major+19009minor)pagefaults 0swaps

-- 
Gilles R. Detillieux              E-mail: <grdetil@scrc.umanitoba.ca>
Spinal Cord Research Centre       WWW:    http://www.scrc.umanitoba.ca/~grdetil
Dept. Physiology, U. of Manitoba  Phone:  (204)789-3766
Winnipeg, MB  R3E 3J7  (Canada)   Fax:    (204)789-3930
------------------------------------
To unsubscribe from the htdig3-dev mailing list, send a message to
htdig3-dev@htdig.org containing the single word "unsubscribe" in
the SUBJECT of the message.



This archive was generated by hypermail 2.0b3 on Thu Feb 04 1999 - 22:24:20 PST