Re: [htdig] Re: Description patch


Gilles Detillieux (grdetil@scrc.umanitoba.ca)
Fri, 19 Mar 1999 15:39:33 -0600 (CST)


According to Antti Rauramo:
> Okay, the latest patch with the correction seems to work fine, thanks, now still
> the space-and-punctuation problem remains. Anyone?

Funny you should ask. I've spent a fair bit of my spare time over the past two
days working on a big patch to HTML.cc, which fixes about three previously
reported bugs about inconsistent handling of space and punctuation in
the title, href descriptions and doc. head. It seems to have helped
a lot with some problems on my site, but I'm hoping others on the list
will test it as well, especially those who had reported problems with
inconsistent spacing or punctuation in titles or descriptions.

Thank you for volunteering, Antti! ;-)

This patch will apply to the 3.1.1 source, but not entirely successfully
to the 3.2.0 development source (one hunk will fail, and will need to
be applied by hand).

--- htdig/HTML.cc.spacebug Wed Mar 17 17:05:15 1999
+++ htdig/HTML.cc Fri Mar 19 08:15:53 1999
@@ -19,6 +19,9 @@ static char RCSid[] = "$Id: HTML.cc,v 1.
 #include "URL.h"
 
 static StringMatch tags;
+static StringMatch nobreaktags;
+static StringMatch spacebeforetags;
+static StringMatch spaceaftertags;
 static StringMatch attrs;
 static StringMatch srcMatch;
 static StringMatch hrefMatch;
@@ -26,6 +29,29 @@ static StringMatch keywordsMatch;
 
 
 //*****************************************************************************
+// ADDSPACE() macro, to insert space where needed in various strings
+// Reduces all multiple whitespace to a single space
+
+#define ADDSPACE(in_space) \
+ if (!in_space) \
+ { \
+ if (in_title && doindex) \
+ { \
+ title << ' '; \
+ } \
+ if (in_ref && description.length() < max_description_length) \
+ { \
+ description << ' '; \
+ } \
+ if (head.length() < max_head_length && doindex && !in_title) \
+ { \
+ head << ' '; \
+ } \
+ in_space = 1; \
+ }
+
+
+//*****************************************************************************
 // HTML::HTML()
 //
 HTML::HTML()
@@ -47,6 +73,19 @@ HTML::HTML()
     hrefMatch.IgnoreCase();
     hrefMatch.Pattern("href");
 
+ // These tags don't cause a word break. They may also be in "tags" above,
+ // except for the "a" tag, which must be handled as a special case.
+ // Note that <sup> & <sub> should cause a word break.
+ nobreaktags.IgnoreCase();
+ nobreaktags.Pattern("font|/font|em|/em|strong|/strong|i|/i|b|/b|u|/u|tt|/tt|abbr|/abbr|code|/code|q|/q|samp|/samp|kbd|/kbd|var|/var|dfn|/dfn|cite|/cite|blink|/blink|big|/big|small|/small|s|/s");
+
+ // These tags, which may also be in "tags" above, cause word breaks and
+ // therefore cause space to be inserted before (or after) do_tag() is done.
+ spacebeforetags.IgnoreCase();
+ spacebeforetags.Pattern("title|h1|h2|h3|h4|h5|h6|address|blockquote|noindex|img|li|th|td|dt|dd|p|br|hr|center|spacer");
+ spaceaftertags.IgnoreCase();
+ spaceaftertags.Pattern("/title|/h1|/h2|/h3|/h4|/h5|/h6|/address|/blockquote");
+
     //String keywordNames = config["keywords_meta_tag_names"];
     //keywordNames.replace(' ', '|');
     //keywordNames.remove(",\t\r\n");
@@ -100,7 +139,8 @@ HTML::parse(Retriever &retriever, URL &b
     // are looking for
     //
     int offset = 0;
- int in_space = 0;
+ int in_space;
+ int in_punct;
     unsigned char *q, *start;
     unsigned char *position = (unsigned char *) contents->get();
     unsigned char *text = (unsigned char *) new char[contents->length()+1];
@@ -117,6 +157,7 @@ HTML::parse(Retriever &retriever, URL &b
     in_title = 0;
     in_ref = 0;
     in_space = 0;
+ in_punct = 0;
         
     while (*position)
     {
@@ -246,7 +287,29 @@ HTML::parse(Retriever &retriever, URL &b
               break; // Syntax error in the doc. Tag never ends.
             tag = 0;
             tag.append((char*)position, q - position + 1);
+ position++;
+ while (isspace(*position))
+ position++;
+ if (!in_space && spacebeforetags.CompareWord((char *)position)
+ || !in_space && !in_punct && *position != '/')
+ {
+ // These opening tags cause a space to be inserted
+ // before anything they insert.
+ // Tags processed here (i.e. not in nobreaktags), like <a ...>
+ // tag, are a special case: they don't actually add space in
+ // formatted text, but because in our processing it causes
+ // a word break, we avoid word concatenation in "head" string.
+ ADDSPACE(in_space);
+ in_punct = 0;
+ }
             do_tag(retriever, tag);
+ if (!in_space && spaceaftertags.CompareWord((char *)position))
+ {
+ // These closing tags cause a space to be inserted
+ // after anything they insert.
+ ADDSPACE(in_space);
+ in_punct = 0;
+ }
             position = q+1;
           }
         else if (*position > 0 && (isalnum(*position)))
@@ -256,13 +319,34 @@ HTML::parse(Retriever &retriever, URL &b
             //
             word = 0;
             in_space = 0;
+ in_punct = 0;
             while (*position &&
                    (isalnum(*position) ||
                    strchr(valid_punctuation, *position)))
- {
- word << (char)*position;
- position++;
- }
+ {
+ word << (char)*position;
+ position++;
+ if (*position == '<')
+ {
+ q = position+1;
+ while (isspace(*q))
+ q++;
+ // Does this tag cause a word break?
+ if (nobreaktags.CompareWord((char *)q))
+ {
+ // These tags just change character formatting and
+ // don't break words.
+ q = (unsigned char*)strchr((char *)position, '>');
+ if (q)
+ {
+ tag = 0;
+ tag.append((char*)position, q - position + 1);
+ do_tag(retriever, tag);
+ position = q+1;
+ }
+ }
+ }
+ }
 
             if (in_title && doindex)
             {
@@ -271,8 +355,11 @@ HTML::parse(Retriever &retriever, URL &b
 
             if (in_ref)
             {
- description << word;
- if (description.length() > max_description_length)
+ if (description.length() < max_description_length)
+ {
+ description << word;
+ }
+ else
                 {
                     description << " ...";
                     if (dofollow)
@@ -311,51 +398,32 @@ HTML::parse(Retriever &retriever, URL &b
             //
             // Characters that are not part of a word
             //
- if (doindex)
+ if (isspace(*position))
             {
- if (isspace(*position))
+ ADDSPACE(in_space);
+ in_punct = 0;
+ }
+ else
+ {
+ //
+ // Not whitespace
+ //
+ if (head.length() < max_head_length && doindex && !in_title)
                 {
- //
- // Reduce all multiple whitespace to a single space
- //
- if (!in_space)
- {
- if (head.length() < max_head_length)
- {
- head << ' ';
- }
- if (in_ref)
- {
- description << ' ';
- }
- if (in_title)
- {
- title << ' ';
- }
- }
- in_space = 1;
+ // We don't want to add random chars to the
+ // excerpt if we're in the title.
+ head << *position;
                 }
- else
+ if (in_ref && description.length() < max_description_length)
                 {
- //
- // Not whitespace
- //
- if (head.length() < max_head_length && !in_title)
- {
- // We don't want to add random chars to the
- // excerpt if we're in the title.
- head << *position;
- }
- if (in_ref)
- {
- description << *position;
- }
- if (in_title)
- {
- title << *position;
- }
- in_space = 0;
+ description << *position;
                 }
+ if (in_title && doindex)
+ {
+ title << *position;
+ }
+ in_space = 0;
+ in_punct = 1;
             }
             position++;
         }
@@ -642,7 +710,7 @@ HTML::do_tag(Retriever &retriever, Strin
         }
 
         case 19: // "li"
- if (doindex && head.length() < max_head_length)
+ if (doindex && !in_title && head.length() < max_head_length)
                 head << "* ";
             break;
 
@@ -1022,6 +1090,7 @@ HTML::do_tag(Retriever &retriever, Strin
                     *base = tempBase;
                 }
             }
+ break;
         }
         
         default:

-- 
Gilles R. Detillieux              E-mail: <grdetil@scrc.umanitoba.ca>
Spinal Cord Research Centre       WWW:    http://www.scrc.umanitoba.ca/~grdetil
Dept. Physiology, U. of Manitoba  Phone:  (204)789-3766
Winnipeg, MB  R3E 3J7  (Canada)   Fax:    (204)789-3930
------------------------------------
To unsubscribe from the htdig mailing list, send a message to
htdig@htdig.org containing the single word "unsubscribe" in
the SUBJECT of the message.



This archive was generated by hypermail 2.0b3 on Fri Mar 19 1999 - 17:32:55 PST