ht://dig - URL quoting patch - correction


Tim Frost (tim@nz.eds.com)
Thu, 9 Apr 1998 10:06:19 +1200 (NZST)


While checking the unofficial patch site for ht://dig, I found that it
still holds an early, broken version of my fix for handling URL
quoting.

The file HTML.cc-h.0, found under the 3.0.8b2 directory, should be
deleted, and the following patch should be made available in its place
(perhaps as HTML.cc.1?). This patch was created against V3.0.8b2 as a
unified diff; I can produce a context diff if desired. The original patch
tried to move the duplicated quote-handling code into a new function, but
that failed because the function did not return the (at least) two pointer
values that the calling code needed.

Tim

diff -u htdig-3.0.8b2/htdig/HTML.cc-orig htdig-3.0.8b2/htdig/HTML.cc
--- htdig-3.0.8b2/htdig/HTML.cc-orig Sun Dec 7 22:14:40 1997
+++ htdig-3.0.8b2/htdig/HTML.cc Fri Jan 9 21:24:03 1998
@@ -309,7 +309,7 @@
 HTML::do_tag(Retriever &retriever, String &tag)
 {
     char *position = tag.get() + 1; // Skip the '<'
- char *q;
+ char *q, *t;
     int which, length;
 
     while (isspace(*position))
@@ -358,12 +358,34 @@
                         position++;
                         while (isspace(*position))
                             position++;
- if (*position == '"')
+ //
+ // Allow either single quotes or double quotes
+ // around the URL itself
+ //
+ if (*position == '"'||*position == '\'')
                         {
                             position++;
- q = strchr(position, '"');
+ q = strchr(position, position[-1]);
                             if (!q)
                                 break;
+ //
+ // We seem to have matched the opening quote char
+ // Mark the end of the quotes as our endpoint, so
+ // that we can continue parsing after the current
+ // text
+ //
+ *q = '\0';
+ //
+ // If a '?' or '#' is present in a quoted URL,
+ // treat that as the end of the URL, but we skip
+ // past the quote to parse the rest of the anchor.
+ //
+ // Is there a better way of looking for these?
+ //
+ if ((t = strchr(position, '#')) != NULL)
+ *t = '\0';
+ if ((t = strchr(position, '?')) != NULL)
+ *t = '\0';
                         }
                         else
                         {
@@ -374,8 +396,8 @@
                                    *q != '?' &&
                                    *q != '#')
                                 q++;
+ *q = '\0';
                         }
- *q = '\0';
                         delete href;
                         href = new URL(position, *base);
                         in_ref = 1;
@@ -396,20 +418,42 @@
                         position++;
                         while (isspace(*position))
                             position++;
- if (*position == '"')
+ //
+ // Allow either single quotes or double quotes
+ // around the URL itself
+ //
+ if (*position == '"'||*position == '\'')
                         {
                             position++;
- q = strchr(position, '"');
+ q = strchr(position, position[-1]);
                             if (!q)
                                 break;
+ //
+ // We seem to have matched the opening quote char
+ // Mark the end of the quotes as our endpoint, so
+ // that we can continue parsing after the current
+ // text
+ //
+ *q = '\0';
+ //
+ // If a '?' or '#' is present in a quoted URL,
+ // treat that as the end of the URL, but we skip
+ // past the quote to parse the rest of the anchor.
+ //
+ // Is there a better way of looking for these?
+ //
+ if ((t = strchr(position, '#')) != NULL)
+ *t = '\0';
+ if ((t = strchr(position, '?')) != NULL)
+ *t = '\0';
                         }
                         else
                         {
                             q = position;
                             while (*q && *q != '>' && !isspace(*q))
                                 q++;
+ *q = '\0';
                         }
- *q = '\0';
                         retriever.got_anchor(position);
                         position = q + 1;
                         break;
@@ -484,20 +528,42 @@
             position++;
             while (isspace(*position))
                 position++;
- if (*position == '"')
+ //
+ // Allow either single quotes or double quotes
+ // around the URL itself
+ //
+ if (*position == '"'||*position == '\'')
             {
                 position++;
- q = strchr(position, '"');
+ q = strchr(position, position[-1]);
                 if (!q)
                     break;
+ //
+ // We seem to have matched the opening quote char
+ // Mark the end of the quotes as our endpoint, so
+ // that we can continue parsing after the current
+ // text
+ //
+ *q = '\0';
+ //
+ // If a '?' or '#' is present in a quoted URL,
+ // treat that as the end of the URL, but we skip
+ // past the quote to parse the rest of the anchor.
+ //
+ // Is there a better way of looking for these?
+ //
+ if ((t = strchr(position, '#')) != NULL)
+ *t = '\0';
+ if ((t = strchr(position, '?')) != NULL)
+ *t = '\0';
             }
             else
             {
                 q = position;
                 while (*q && *q != '>' && !isspace(*q))
                     q++;
+ *q = '\0';
             }
- *q = '\0';
             retriever.got_image(position);
             break;
         }
@@ -616,12 +682,34 @@
                     position++;
                     while (isspace(*position))
                         position++;
- if (*position == '"')
+ //
+ // Allow either single quotes or double quotes
+ // around the URL itself
+ //
+ if (*position == '"'||*position == '\'')
                     {
                         position++;
- q = strchr(position, '"');
+ q = strchr(position, position[-1]);
                         if (!q)
                             break;
+ //
+ // We seem to have matched the opening quote char
+ // Mark the end of the quotes as our endpoint, so
+ // that we can continue parsing after the current
+ // text
+ //
+ *q = '\0';
+ //
+ // If a '?' or '#' is present in a quoted URL,
+ // treat that as the end of the URL, but we skip
+ // past the quote to parse the rest of the anchor.
+ //
+ // Is there a better way of looking for these?
+ //
+ if ((t = strchr(position, '#')) != NULL)
+ *t = '\0';
+ if ((t = strchr(position, '?')) != NULL)
+ *t = '\0';
                     }
                     else
                     {
@@ -632,8 +720,8 @@
                                *q != '?' &&
                                *q != '#')
                             q++;
+ *q = '\0';
                     }
- *q = '\0';
                     delete href;
                     href = new URL(position, *base);
                     if (doindex)
@@ -668,12 +756,34 @@
                     position++;
                     while (isspace(*position))
                         position++;
- if (*position == '"')
+ //
+ // Allow either single quotes or double quotes
+ // around the URL itself
+ //
+ if (*position == '"'||*position == '\'')
                     {
                         position++;
- q = strchr(position, '"');
+ q = strchr(position, position[-1]);
                         if (!q)
                             break;
+ //
+ // We seem to have matched the opening quote char
+ // Mark the end of the quotes as our endpoint, so
+ // that we can continue parsing after the current
+ // text
+ //
+ *q = '\0';
+ //
+ // If a '?' or '#' is present in a quoted URL,
+ // treat that as the end of the URL, but we skip
+ // past the quote to parse the rest of the anchor.
+ //
+ // Is there a better way of looking for these?
+ //
+ if ((t = strchr(position, '#')) != NULL)
+ *t = '\0';
+ if ((t = strchr(position, '?')) != NULL)
+ *t = '\0';
                     }
                     else
                     {
@@ -684,8 +794,8 @@
                                *q != '?' &&
                                *q != '#')
                             q++;
+ *q = '\0';
                     }
- *q = '\0';
                     delete href;
                     href = new URL(position, *base);
                     if (doindex)
@@ -719,12 +829,34 @@
                     position++;
                     while (isspace(*position))
                         position++;
- if (*position == '"')
+ //
+ // Allow either single quotes or double quotes
+ // around the URL itself
+ //
+ if (*position == '"'||*position == '\'')
                     {
                         position++;
- q = strchr(position, '"');
+ q = strchr(position, position[-1]);
                         if (!q)
                             break;
+ //
+ // We seem to have matched the opening quote char
+ // Mark the end of the quotes as our endpoint, so
+ // that we can continue parsing after the current
+ // text
+ //
+ *q = '\0';
+ //
+ // If a '?' or '#' is present in a quoted URL,
+ // treat that as the end of the URL, but we skip
+ // past the quote to parse the rest of the anchor.
+ //
+ // Is there a better way of looking for these?
+ //
+ if ((t = strchr(position, '#')) != NULL)
+ *t = '\0';
+ if ((t = strchr(position, '?')) != NULL)
+ *t = '\0';
                     }
                     else
                     {
@@ -735,8 +867,8 @@
                                *q != '?' &&
                                *q != '#')
                             q++;
+ *q = '\0';
                     }
- *q = '\0';
                     URL tempBase(position, *base);
                     *base = tempBase;
                 }



This archive was generated by hypermail 2.0b3 on Sat Jan 02 1999 - 16:26:01 PST