Tim Frost (tim@nz.eds.com)
Thu, 9 Apr 1998 10:06:19 +1200 (NZST)
While checking the unofficial patch site for ht://dig, I found that an early,
broken version of my fix for handling URL quoting is held on the patch
site.
The file HTML.cc-h.0, found under the 3.0.8b2 directory, should be
deleted, and the following patch should be made available (as HTML.cc.1?).
This patch was created against V3.0.8b2 as a unified diff. I can produce
a context diff if desired. The original patch attempted to use a new
function to do the duplicated work, but this failed because that function
did not return the (at least) two pointer values that were needed.
Tim
diff -u htdig-3.0.8b2/htdig/HTML.cc-orig htdig-3.0.8b2/htdig/HTML.cc
--- htdig-3.0.8b2/htdig/HTML.cc-orig Sun Dec 7 22:14:40 1997
+++ htdig-3.0.8b2/htdig/HTML.cc Fri Jan 9 21:24:03 1998
@@ -309,7 +309,7 @@
HTML::do_tag(Retriever &retriever, String &tag)
{
char *position = tag.get() + 1; // Skip the '<'
- char *q;
+ char *q, *t;
int which, length;
while (isspace(*position))
@@ -358,12 +358,34 @@
position++;
while (isspace(*position))
position++;
- if (*position == '"')
+ //
+ // Allow either single quotes or double quotes
+ // around the URL itself
+ //
+ if (*position == '"'||*position == '\'')
{
position++;
- q = strchr(position, '"');
+ q = strchr(position, position[-1]);
if (!q)
break;
+ //
+ // We seem to have matched the opening quote char
+ // Mark the end of the quotes as our endpoint, so
+ // that we can continue parsing after the current
+ // text
+ //
+ *q = '\0';
+ //
+ // If a '?' or '#' is present in a quoted URL,
+ // treat that as the end of the URL, but we skip
+ // past the quote to parse the rest of the anchor.
+ //
+ // Is there a better way of looking for these?
+ //
+ if ((t = strchr(position, '#')) != NULL)
+ *t = '\0';
+ if ((t = strchr(position, '?')) != NULL)
+ *t = '\0';
}
else
{
@@ -374,8 +396,8 @@
*q != '?' &&
*q != '#')
q++;
+ *q = '\0';
}
- *q = '\0';
delete href;
href = new URL(position, *base);
in_ref = 1;
@@ -396,20 +418,42 @@
position++;
while (isspace(*position))
position++;
- if (*position == '"')
+ //
+ // Allow either single quotes or double quotes
+ // around the URL itself
+ //
+ if (*position == '"'||*position == '\'')
{
position++;
- q = strchr(position, '"');
+ q = strchr(position, position[-1]);
if (!q)
break;
+ //
+ // We seem to have matched the opening quote char
+ // Mark the end of the quotes as our endpoint, so
+ // that we can continue parsing after the current
+ // text
+ //
+ *q = '\0';
+ //
+ // If a '?' or '#' is present in a quoted URL,
+ // treat that as the end of the URL, but we skip
+ // past the quote to parse the rest of the anchor.
+ //
+ // Is there a better way of looking for these?
+ //
+ if ((t = strchr(position, '#')) != NULL)
+ *t = '\0';
+ if ((t = strchr(position, '?')) != NULL)
+ *t = '\0';
}
else
{
q = position;
while (*q && *q != '>' && !isspace(*q))
q++;
+ *q = '\0';
}
- *q = '\0';
retriever.got_anchor(position);
position = q + 1;
break;
@@ -484,20 +528,42 @@
position++;
while (isspace(*position))
position++;
- if (*position == '"')
+ //
+ // Allow either single quotes or double quotes
+ // around the URL itself
+ //
+ if (*position == '"'||*position == '\'')
{
position++;
- q = strchr(position, '"');
+ q = strchr(position, position[-1]);
if (!q)
break;
+ //
+ // We seem to have matched the opening quote char
+ // Mark the end of the quotes as our endpoint, so
+ // that we can continue parsing after the current
+ // text
+ //
+ *q = '\0';
+ //
+ // If a '?' or '#' is present in a quoted URL,
+ // treat that as the end of the URL, but we skip
+ // past the quote to parse the rest of the anchor.
+ //
+ // Is there a better way of looking for these?
+ //
+ if ((t = strchr(position, '#')) != NULL)
+ *t = '\0';
+ if ((t = strchr(position, '?')) != NULL)
+ *t = '\0';
}
else
{
q = position;
while (*q && *q != '>' && !isspace(*q))
q++;
+ *q = '\0';
}
- *q = '\0';
retriever.got_image(position);
break;
}
@@ -616,12 +682,34 @@
position++;
while (isspace(*position))
position++;
- if (*position == '"')
+ //
+ // Allow either single quotes or double quotes
+ // around the URL itself
+ //
+ if (*position == '"'||*position == '\'')
{
position++;
- q = strchr(position, '"');
+ q = strchr(position, position[-1]);
if (!q)
break;
+ //
+ // We seem to have matched the opening quote char
+ // Mark the end of the quotes as our endpoint, so
+ // that we can continue parsing after the current
+ // text
+ //
+ *q = '\0';
+ //
+ // If a '?' or '#' is present in a quoted URL,
+ // treat that as the end of the URL, but we skip
+ // past the quote to parse the rest of the anchor.
+ //
+ // Is there a better way of looking for these?
+ //
+ if ((t = strchr(position, '#')) != NULL)
+ *t = '\0';
+ if ((t = strchr(position, '?')) != NULL)
+ *t = '\0';
}
else
{
@@ -632,8 +720,8 @@
*q != '?' &&
*q != '#')
q++;
+ *q = '\0';
}
- *q = '\0';
delete href;
href = new URL(position, *base);
if (doindex)
@@ -668,12 +756,34 @@
position++;
while (isspace(*position))
position++;
- if (*position == '"')
+ //
+ // Allow either single quotes or double quotes
+ // around the URL itself
+ //
+ if (*position == '"'||*position == '\'')
{
position++;
- q = strchr(position, '"');
+ q = strchr(position, position[-1]);
if (!q)
break;
+ //
+ // We seem to have matched the opening quote char
+ // Mark the end of the quotes as our endpoint, so
+ // that we can continue parsing after the current
+ // text
+ //
+ *q = '\0';
+ //
+ // If a '?' or '#' is present in a quoted URL,
+ // treat that as the end of the URL, but we skip
+ // past the quote to parse the rest of the anchor.
+ //
+ // Is there a better way of looking for these?
+ //
+ if ((t = strchr(position, '#')) != NULL)
+ *t = '\0';
+ if ((t = strchr(position, '?')) != NULL)
+ *t = '\0';
}
else
{
@@ -684,8 +794,8 @@
*q != '?' &&
*q != '#')
q++;
+ *q = '\0';
}
- *q = '\0';
delete href;
href = new URL(position, *base);
if (doindex)
@@ -719,12 +829,34 @@
position++;
while (isspace(*position))
position++;
- if (*position == '"')
+ //
+ // Allow either single quotes or double quotes
+ // around the URL itself
+ //
+ if (*position == '"'||*position == '\'')
{
position++;
- q = strchr(position, '"');
+ q = strchr(position, position[-1]);
if (!q)
break;
+ //
+ // We seem to have matched the opening quote char
+ // Mark the end of the quotes as our endpoint, so
+ // that we can continue parsing after the current
+ // text
+ //
+ *q = '\0';
+ //
+ // If a '?' or '#' is present in a quoted URL,
+ // treat that as the end of the URL, but we skip
+ // past the quote to parse the rest of the anchor.
+ //
+ // Is there a better way of looking for these?
+ //
+ if ((t = strchr(position, '#')) != NULL)
+ *t = '\0';
+ if ((t = strchr(position, '?')) != NULL)
+ *t = '\0';
}
else
{
@@ -735,8 +867,8 @@
*q != '?' &&
*q != '#')
q++;
+ *q = '\0';
}
- *q = '\0';
URL tempBase(position, *base);
*base = tempBase;
}
This archive was generated by hypermail 2.0b3 on Sat Jan 02 1999 - 16:26:01 PST