ht://Dig parsing


Joel Peterson (jip@earthling.net)
Sun, 01 Feb 1998 13:08:00 -0500


First of all, great product.

I've attached two patches against htdig-3.0.8-b1. Please feel free to
incorporate these changes into the main distribution.

The patches are in unified diff format. If your version of patch chokes
on that, I can remake them in whatever format you'd like.

patch-htdig.parse fixes some parsing bugs in HTML.cc. In particular,
tag endings weren't being properly recognized (> is allowed inside
quotes in a tag without ending the tag) and punctuation was finding its
way into the word database. It adds a new configuration variable called
word_constituent_punctuation.

The new variable is used in all the parsing classes to determine the end
of a word. The old variable valid_punctuation is still used to strip
punctuation from words. This scheme allows more punctuation characters
to be recognized and discarded, since the word constituent punctuation
characters do not include any of the special characters used in HTML (or
PostScript), whereas valid_punctuation can include them. It also allows,
say, '+' to be a word break even if there is no space around it.

I changed the default value for valid_punctuation to
    .-_/!#$%^&*'?:;,<>{}[]()+=|"\
and the default for word_constituent_punctuation is
    .-_!#$%^&'?\
These values seem to work for me.

I didn't do this, but each parser should strip its special characters
from word_constituent_punctuation; e.g. if the user adds '<', HTML.cc
would strip it out, but Plaintext.cc would leave it in.

patch-htdig.string-warn changes a "String &" parameter to "const String &"
in String.cc and htString.h. This eliminates a bunch of warnings I was
getting from egcs.

-- 
Joel Peterson
President
Odin Consulting, Inc.

diff -urw --unidirectional-new htdig-3.0.8b1/htcommon/defaults.cc htdig-3.0.8b1-parse/htcommon/defaults.cc --- htdig-3.0.8b1/htcommon/defaults.cc Mon Mar 17 00:54:25 1997 +++ htdig-3.0.8b1-parse/htcommon/defaults.cc Sun Feb 1 12:33:57 1998 @@ -105,7 +105,8 @@ {"title_factor", "100"}, {"url_list", "${database_base}.urls"}, {"use_star_image", "true"}, - {"valid_punctuation", ".-_/!#$%^&*'"}, + {"valid_punctuation", ".-_/!#$%^&*'?:;,<>{}[]()+=|\\\""}, + {"word_constituent_punctuation", ".-_!#$%^&'?\\"}, {"version", HTDIG_VERSION}, {"word_db", "${database_base}.words.gdbm"}, {"word_list", "${database_base}.wordlist"}, diff -urw --unidirectional-new htdig-3.0.8b1/htdig/HTML.cc htdig-3.0.8b1-parse/htdig/HTML.cc --- htdig-3.0.8b1/htdig/HTML.cc Fri Feb 7 04:13:27 1997 +++ htdig-3.0.8b1-parse/htdig/HTML.cc Sun Feb 1 12:37:04 1998 @@ -77,6 +77,29 @@ { } +void +HTML::parse_word(unsigned char *&position) +{ + word = 0; + while (*position && + (isalnum(*position) || + strchr(word_constituent_punctuation, *position) || + *position >= 160 || + *position == '&')) + { + if (*position == '&') + { + unsigned char ch; + ch = SGMLEntities::translateAndUpdate(position); + word << (char) ch; + } + else + { + word << (char)*position; + position++; + } + } +} //***************************************************************************** // void HTML::parse(Retriever &retriever, URL &baseURL) @@ -130,8 +153,15 @@ // Start of a tag. Since tags cannot be nested, we can simply // search for the closing '>' // - q = (unsigned char*)strchr((char *)position, '>'); - if (!q) + int quote=0; + for (q=position;*q&&(*q!='>'||quote);++q) + { + if (*q=='"') + quote=!quote; + else if (*q=='\\'&&quote) + ++q; + } + if (!*q) return; // Syntax error in the doc. Tag never ends. tag = 0; tag.append((char*)position, q - position + 1); @@ -182,24 +212,7 @@ // Start of a word. 
Try to find the whole thing // in_space = 0; - while (*position && - (isalnum(*position) || - strchr(valid_punctuation, *position) || - *position >= 160 || - *position == '&')) - { - if (*position == '&') - { - unsigned char ch; - ch = SGMLEntities::translateAndUpdate(position); - word << (char) ch; - } - else - { - word << (char)*position; - position++; - } - } + parse_word(position); if (in_title && doindex) { @@ -544,12 +557,21 @@ char *keywords = conf["htdig-keywords"]; if (!keywords) keywords = conf["keywords"]; - char *w = strtok(keywords, " \t\r\n"); - while (w) + unsigned char *w = (unsigned char *)keywords; + while (*w) + { + if ((isalnum(*w) || *w >= 160 || *w == '&')) + { + parse_word(w); + word.lowercase(); + if (word.length() >= minimumWordLength) { - if (strlen(w) >= minimumWordLength) - retriever.got_word(w, 1, 10); - w = strtok(0, " \t\r\n"); + word.remove(valid_punctuation); + retriever.got_word(word, 1, 10); + } + } + else + ++w; } } @@ -564,12 +586,21 @@ which = -1; if (keywordsMatch.CompareWord(cache)) { - char *w = strtok(conf["content"], " \t\r\n"); - while (w) + unsigned char *w = (unsigned char *)conf["content"]; + while (*w) + { + if ((isalnum(*w) || *w >= 160 || *w == '&')) { - if (strlen(w) >= minimumWordLength) - retriever.got_word(w, 1, 10); - w = strtok(0, " \t\r\n"); + parse_word(w); + word.lowercase(); + if (word.length() >= minimumWordLength) + { + word.remove(valid_punctuation); + retriever.got_word(word, 1, 10); + } + } + else + ++w; } } else if (mystrcasecmp(cache, "htdig-email") == 0) diff -urw --unidirectional-new htdig-3.0.8b1/htdig/HTML.h htdig-3.0.8b1-parse/htdig/HTML.h --- htdig-3.0.8b1/htdig/HTML.h Fri Feb 7 04:13:28 1997 +++ htdig-3.0.8b1-parse/htdig/HTML.h Sun Feb 1 12:33:57 1998 @@ -52,6 +52,7 @@ // Helper functions // void do_tag(Retriever &, String &); + void parse_word(unsigned char *&); }; #endif diff -urw --unidirectional-new htdig-3.0.8b1/htdig/Parsable.cc htdig-3.0.8b1-parse/htdig/Parsable.cc --- 
htdig-3.0.8b1/htdig/Parsable.cc Fri Feb 7 04:13:35 1997 +++ htdig-3.0.8b1-parse/htdig/Parsable.cc Sun Feb 1 12:33:57 1998 @@ -25,6 +25,7 @@ max_head_length = config.Value("max_head_length", 0); max_description_length = config.Value("max_description_length", 50); valid_punctuation = config["valid_punctuation"]; + word_constituent_punctuation = config["word_constituent_punctuation"]; } diff -urw --unidirectional-new htdig-3.0.8b1/htdig/Parsable.h htdig-3.0.8b1-parse/htdig/Parsable.h --- htdig-3.0.8b1/htdig/Parsable.h Sun Mar 23 23:33:34 1997 +++ htdig-3.0.8b1-parse/htdig/Parsable.h Sun Feb 1 12:33:57 1998 @@ -43,6 +43,7 @@ protected: String *contents; char *valid_punctuation; + char *word_constituent_punctuation; int max_head_length; int max_description_length; }; diff -urw --unidirectional-new htdig-3.0.8b1/htdig/Plaintext.cc htdig-3.0.8b1-parse/htdig/Plaintext.cc --- htdig-3.0.8b1/htdig/Plaintext.cc Sun Apr 20 11:23:43 1997 +++ htdig-3.0.8b1-parse/htdig/Plaintext.cc Sun Feb 1 12:33:57 1998 @@ -71,7 +71,7 @@ // Start of a word. 
Try to find the whole thing // in_space = 0; - while (*position && (isalnum(*position) || strchr(valid_punctuation, *position))) + while (*position && (isalnum(*position) || strchr(word_constituent_punctuation, *position))) { word << *position; position++; diff -urw --unidirectional-new htdig-3.0.8b1/htdig/Postscript.cc htdig-3.0.8b1-parse/htdig/Postscript.cc --- htdig-3.0.8b1/htdig/Postscript.cc Sun Mar 23 23:33:45 1997 +++ htdig-3.0.8b1-parse/htdig/Postscript.cc Sun Feb 1 12:33:57 1998 @@ -31,6 +31,7 @@ generatorType = 0; in_space = 0; valid_punctuation = config["valid_punctuation"]; + word_constituent_punctuation = config["word_constituent_punctuation"]; last_t = ""; last_y = ""; } @@ -307,7 +308,7 @@ in_space = 0; while (*position && ( isalnum(*position) || - strchr(valid_punctuation, *position))) + strchr(word_constituent_punctuation, *position))) { word << *position; position++; diff -urw --unidirectional-new htdig-3.0.8b1/htdig/Postscript.h htdig-3.0.8b1-parse/htdig/Postscript.h --- htdig-3.0.8b1/htdig/Postscript.h Fri Feb 7 04:13:51 1997 +++ htdig-3.0.8b1-parse/htdig/Postscript.h Sun Feb 1 12:33:57 1998 @@ -44,6 +44,7 @@ String last_t; String last_y; char *valid_punctuation; + char *word_constituent_punctuation; }; #endif

diff -urw --unidirectional-new htdig-3.0.8b1/htlib/String.cc htdig-3.0.8b1-joel/htlib/String.cc --- htdig-3.0.8b1/htlib/String.cc Sun Mar 23 23:34:50 1997 +++ htdig-3.0.8b1-joel/htlib/String.cc Sat Jan 31 20:33:55 1998 @@ -76,7 +76,7 @@ // This can be used for performance reasons if it is known the // String will need to grow. // -String::String(String &s, int allocation_hint) +String::String(const String &s, int allocation_hint) { Data = 0; diff -urw --unidirectional-new htdig-3.0.8b1/htlib/htString.h htdig-3.0.8b1-joel/htlib/htString.h --- htdig-3.0.8b1/htlib/htString.h Sun Mar 23 23:35:13 1997 +++ htdig-3.0.8b1-joel/htlib/htString.h Sat Jan 31 20:33:55 1998 @@ -30,7 +30,7 @@ // This can be used for performance reasons if it is known the // String will need to grow. // - String(String &s, int allocation_hint = 0); + String(const String &s, int allocation_hint = 0); ~String();



This archive was generated by hypermail 2.0b3 on Sat Jan 02 1999 - 16:25:40 PST