Re: htdig: single quotes in URL


Geoff Hutchison (Geoffrey.R.Hutchison@williams.edu)
Mon, 05 Oct 1998 15:00:08 -0400


At 6:00 PM -0400 10/3/98, Jerry Preeper wrote:
>I was wondering if anyone has a patch working satisfactorily yet to allow
>htdig to find URLs that are referenced with single quotes, as well as
>double quotes.

The following patch brings HTML.cc up from the revision shipped in 3.1.0b1
(1.10) to the current revision (1.14), which includes several bug fixes in
addition to the fix for the single-quote problem. If you want *just* the
single-quote patch, use CVSWeb on <http://dev.htdig.org/> to grab the diff
between revisions 1.13 and 1.14 of HTML.cc.

The patch seems to work for me, though I haven't given it a complete shakedown.

-Geoff Hutchison
Williams Students Online
http://wso.williams.edu/

===================================================================
RCS file: /opt/htdig/cvs/htdig3/htdig/HTML.cc,v
retrieving revision 1.10
retrieving revision 1.14
diff -u -r1.10 -r1.14
--- htdig3/htdig/HTML.cc 1998/09/10 04:16:25 1.10
+++ htdig3/htdig/HTML.cc 1998/09/30 17:31:50 1.14
@@ -4,7 +4,23 @@
 // Implementation of HTML
 //
 // $Log: HTML.cc,v $
+// Revision 1.14 1998/09/30 17:31:50 ghutchis
+// Changes for 3.1.0b2
+//
+// Revision 1.13 1998/09/23 14:58:21 ghutchis
+//
+// Many, many bug fixes
+//
+// Revision 1.12 1998/09/18 18:45:55 ghutchis
+//
+// YABF (Yet another bug fix)
+//
+// Revision 1.11 1998/09/18 02:38:08 ghutchis
+//
+// Bug fixes for 3.1.0b2
+//
 // Revision 1.10 1998/09/10 04:16:25 ghutchis
+//
 // More bug fixes.
 //
 // Revision 1.9 1998/09/08 03:29:09 ghutchis
@@ -39,7 +55,7 @@
 //
 //
 #if RELEASE
-static char RCSid[] = "$Id: HTML.cc,v 1.10 1998/09/10 04:16:25 ghutchis Exp $";
+static char RCSid[] = "$Id: HTML.cc,v 1.14 1998/09/30 17:31:50 ghutchis Exp $";
 #endif

 #include "htdig.h"
@@ -360,7 +376,7 @@
 HTML::do_tag(Retriever &retriever, String &tag)
 {
     char *position = tag.get() + 1; // Skip the '<'
- char *q;
+ char *q, *t;
     int which, length;

     while (isspace(*position))
@@ -409,12 +425,34 @@
                         position++;
                         while (isspace(*position))
                             position++;
- if (*position == '"')
+ //
+ // Allow either single quotes or double quotes
+ // around the URL itself
+ //
+ if (*position == '"'||*position == '\'')
                         {
                             position++;
- q = strchr(position, '"');
+ q = strchr(position, position[-1]);
                             if (!q)
                                 break;
+ //
+ // We seem to have matched the opening quote char
+ // Mark the end of the quotes as our endpoint, so
+ // that we can continue parsing after the current
+ // text
+ //
+ *q = '\0';
+ //
+ // If a '?' or '#' is present in a quoted URL,
+ // treat that as the end of the URL, but we skip
+ // past the quote to parse the rest of the anchor.
+ //
+ // Is there a better way of looking for these?
+ //
+ if ((t = strchr(position, '#')) != NULL)
+ *t = '\0';
+ if ((t = strchr(position, '?')) != NULL)
+ *t = '\0';
                         }
                         else
                         {
@@ -425,8 +463,8 @@
                                    *q != '?' &&
                                    *q != '#')
                                 q++;
+ *q = '\0';
                         }
- *q = '\0';
                         delete href;
                         href = new URL(position, *base);
                         in_ref = 1;
@@ -447,20 +485,42 @@
                         position++;
                         while (isspace(*position))
                             position++;
- if (*position == '"')
+ //
+ // Allow either single quotes or double quotes
+ // around the URL itself
+ //
+ if (*position == '"'||*position == '\'')
                         {
                             position++;
- q = strchr(position, '"');
+ q = strchr(position, position[-1]);
                             if (!q)
                                 break;
+ //
+ // We seem to have matched the opening quote char
+ // Mark the end of the quotes as our endpoint, so
+ // that we can continue parsing after the current
+ // text
+ //
+ *q = '\0';
+ //
+ // If a '?' or '#' is present in a quoted URL,
+ // treat that as the end of the URL, but we skip
+ // past the quote to parse the rest of the anchor.
+ //
+ // Is there a better way of looking for these?
+ //
+ if ((t = strchr(position, '#')) != NULL)
+ *t = '\0';
+ if ((t = strchr(position, '?')) != NULL)
+ *t = '\0';
                         }
                         else
                         {
                             q = position;
                             while (*q && *q != '>' && !isspace(*q))
                                 q++;
- }
                         *q = '\0';
+ }
                         retriever.got_anchor(position);
                         position = q + 1;
                         break;
@@ -537,20 +597,42 @@
             position++;
             while (isspace(*position))
                 position++;
- if (*position == '"')
+ //
+ // Allow either single quotes or double quotes
+ // around the URL itself
+ //
+ if (*position == '"'||*position == '\'')
             {
                 position++;
- q = strchr(position, '"');
+ q = strchr(position, position[-1]);
                 if (!q)
                     break;
+ //
+ // We seem to have matched the opening quote char
+ // Mark the end of the quotes as our endpoint, so
+ // that we can continue parsing after the current
+ // text
+ //
+ *q = '\0';
+ //
+ // If a '?' or '#' is present in a quoted URL,
+ // treat that as the end of the URL, but we skip
+ // past the quote to parse the rest of the anchor.
+ //
+ // Is there a better way of looking for these?
+ //
+ if ((t = strchr(position, '#')) != NULL)
+ *t = '\0';
+ if ((t = strchr(position, '?')) != NULL)
+ *t = '\0';
             }
             else
             {
                 q = position;
                 while (*q && *q != '>' && !isspace(*q))
                     q++;
- }
             *q = '\0';
+ }
             retriever.got_image(position);
             break;
         }
@@ -609,8 +691,9 @@
                 {
                     if (strlen(w) >= minimumWordLength)
                         retriever.got_word(w, 1, 10);
- w = strtok(0, " \t\r\n");
+ w = strtok(0, " ,\t\r\n");
                 }
+ w = '\0';
             }

             //
@@ -629,8 +712,9 @@
                     {
                         if (strlen(w) >= minimumWordLength)
                             retriever.got_word(w, 1, 10);
- w = strtok(0, " \t\r\n");
+ w = strtok(0, " ,\t\r\n");
                     }
+ w = '\0';
                 }
                 else if (mystrcasecmp(cache, "htdig-email") == 0)
                 {
@@ -655,14 +739,14 @@
                   {
                     String content_cache = conf["content"];

- if (content_cache.indexOf("noindex") != 0)
+ if (content_cache.indexOf("noindex") != -1)
                       {
                         doindex = 0;
                         retriever.got_noindex();
                       }
- else if (content_cache.indexOf("nofollow") != 0)
+ else if (content_cache.indexOf("nofollow") != -1)
                       dofollow = 0;
- else if (content_cache.indexOf("none") != 0)
+ else if (content_cache.indexOf("none") != -1)
                       {
                         doindex = 0;
                         dofollow = 0;
@@ -677,7 +761,11 @@
                     //
                     meta_dsc = conf["content"];
                     if (meta_dsc.length() > max_meta_description_length)
- meta_dsc = meta_dsc.sub(0, max_meta_description_length);
+ {
+ String temp = meta_dsc.sub(0, max_meta_description_length);
+ meta_dsc = temp;
+ temp = 0;
+ }
                     if (debug > 1)
                       cout << "META Description: " << conf["content"] << endl;
                     retriever.got_meta_dsc(meta_dsc);
@@ -693,6 +781,7 @@
                           retriever.got_word(w, 1, 11);
                         w = strtok(0, " \t\r\n");
                       }
+ w = '\0';
                   }
             }
             else if (conf["name"] &&
@@ -724,12 +813,34 @@
                     position++;
                     while (isspace(*position))
                         position++;
- if (*position == '"')
+ //
+ // Allow either single quotes or double quotes
+ // around the URL itself
+ //
+ if (*position == '"'||*position == '\'')
                     {
                         position++;
- q = strchr(position, '"');
+ q = strchr(position, position[-1]);
                         if (!q)
                             break;
+ //
+ // We seem to have matched the opening quote char
+ // Mark the end of the quotes as our endpoint, so
+ // that we can continue parsing after the current
+ // text
+ //
+ *q = '\0';
+ //
+ // If a '?' or '#' is present in a quoted URL,
+ // treat that as the end of the URL, but we skip
+ // past the quote to parse the rest of the anchor.
+ //
+ // Is there a better way of looking for these?
+ //
+ if ((t = strchr(position, '#')) != NULL)
+ *t = '\0';
+ if ((t = strchr(position, '?')) != NULL)
+ *t = '\0';
                     }
                     else
                     {
@@ -740,8 +851,8 @@
                                *q != '?' &&
                                *q != '#')
                             q++;
+ *q = '\0';
                     }
- *q = '\0';
                     delete href;
                     href = new URL(position, *base);
                     if (dofollow)
@@ -776,12 +887,34 @@
                     position++;
                     while (isspace(*position))
                         position++;
- if (*position == '"')
+ //
+ // Allow either single quotes or double quotes
+ // around the URL itself
+ //
+ if (*position == '"'||*position == '\'')
                     {
                         position++;
- q = strchr(position, '"');
+ q = strchr(position, position[-1]);
                         if (!q)
                             break;
+ //
+ // We seem to have matched the opening quote char
+ // Mark the end of the quotes as our endpoint, so
+ // that we can continue parsing after the current
+ // text
+ //
+ *q = '\0';
+ //
+ // If a '?' or '#' is present in a quoted URL,
+ // treat that as the end of the URL, but we skip
+ // past the quote to parse the rest of the anchor.
+ //
+ // Is there a better way of looking for these?
+ //
+ if ((t = strchr(position, '#')) != NULL)
+ *t = '\0';
+ if ((t = strchr(position, '?')) != NULL)
+ *t = '\0';
                     }
                     else
                     {
@@ -792,8 +925,8 @@
                                *q != '?' &&
                                *q != '#')
                             q++;
+ *q = '\0';
                     }
- *q = '\0';
                     delete href;
                     href = new URL(position, *base);
                     if (dofollow)
@@ -827,12 +960,34 @@
                     position++;
                     while (isspace(*position))
                         position++;
- if (*position == '"')
+ //
+ // Allow either single quotes or double quotes
+ // around the URL itself
+ //
+ if (*position == '"'||*position == '\'')
                     {
                         position++;
- q = strchr(position, '"');
+ q = strchr(position, position[-1]);
                         if (!q)
                             break;
+ //
+ // We seem to have matched the opening quote char
+ // Mark the end of the quotes as our endpoint, so
+ // that we can continue parsing after the current
+ // text
+ //
+ *q = '\0';
+ //
+ // If a '?' or '#' is present in a quoted URL,
+ // treat that as the end of the URL, but we skip
+ // past the quote to parse the rest of the anchor.
+ //
+ // Is there a better way of looking for these?
+ //
+ if ((t = strchr(position, '#')) != NULL)
+ *t = '\0';
+ if ((t = strchr(position, '?')) != NULL)
+ *t = '\0';
                     }
                     else
                     {
@@ -843,8 +998,8 @@
                                *q != '?' &&
                                *q != '#')
                             q++;
- }
                     *q = '\0';
+ }
                     URL tempBase(position, *base);
                     *base = tempBase;
                 }

----------------------------------------------------------------------
To unsubscribe from the htdig mailing list, send a message to
htdig-request@sdsu.edu containing the single word "unsubscribe" in
the body of the message.



This archive was generated by hypermail 2.0b3 on Sat Jan 02 1999 - 16:28:29 PST