Re: [htdig] External parsers: VRML added: Following <embed> tags?


Geoff Hutchison (ghutchis@wso.williams.edu)
Fri, 4 Jun 1999 14:40:39 -0400 (EDT)


On Fri, 4 Jun 1999, Rzepa, Henry wrote:

> If anyone can give us some hints as to how to modify htdig to follow
> <embed> as well as <a> tags, we would be most grateful!!

This patch is untested (I have not even checked that it compiles), but it
should do what you ask.

-Geoff Hutchison
Williams Students Online
http://wso.williams.edu/

Index: HTML.cc
===================================================================
RCS file: /opt/htdig/cvs/htdig3/htdig/HTML.cc,v
retrieving revision 1.41
diff -c -3 -p -r1.41 HTML.cc
*** HTML.cc 1999/05/16 21:20:57 1.41
--- HTML.cc 1999/06/04 17:50:35
*************** HTML::HTML()
*** 63,69 ****
      // the attrs Match object is used to match names of tag parameters.
      //
      tags.IgnoreCase();
! tags.Pattern("title|/title|a|/a|h1|h2|h3|h4|h5|h6|/h1|/h2|/h3|/h4|/h5|/h6|noindex|/noindex|img|li|meta|frame|area|base");
  
      attrs.IgnoreCase();
      attrs.Pattern("src|href|name");
--- 63,69 ----
      // the attrs Match object is used to match names of tag parameters.
      //
      tags.IgnoreCase();
! tags.Pattern("title|/title|a|/a|h1|h2|h3|h4|h5|h6|/h1|/h2|/h3|/h4|/h5|/h6|noindex|/noindex|img|li|meta|frame|area|base|embed|object");
  
      attrs.IgnoreCase();
      attrs.Pattern("src|href|name");
*************** HTML::do_tag(Retriever &retriever, Strin
*** 1097,1102 ****
--- 1097,1202 ----
              break;
          }
          
+ case 24: // embed
+ {
+ which = -1;
+ int pos = attrs.FindFirstWord(position, which, length);
+ if (pos < 0 || which != 0)
+ break;
+ position += pos + length;
+ while (*position && *position != '=')
+ position++;
+ if (!*position)
+ break;
+ position++;
+ while (isspace(*position))
+ position++;
+ //
+ // Allow either single quotes or double quotes
+ // around the URL itself
+ //
+ if (*position == '"'||*position == '\'')
+ {
+ position++;
+ q = strchr(position, position[-1]);
+ if (!q)
+ break;
+ //
+ // We seem to have matched the opening quote char
+ // Mark the end of the quotes as our endpoint, so
+ // that we can continue parsing after the current
+ // text
+ //
+ *q = '\0';
+ //
+ // If a '#' is present in a quoted URL,
+ // treat that as the end of the URL, but we skip
+ // past the quote to parse the rest of the anchor.
+ //
+ if ((t = strchr(position, '#')) != NULL)
+ *t = '\0';
+ }
+ else
+ {
+ q = position;
+ while (*q && *q != '>' && !isspace(*q))
+ q++;
+ *q = '\0';
+ }
+ retriever.got_href(position);
+ break;
+ }
+
+ case 25: // object
+ {
+ which = -1;
+ int pos = attrs.FindFirstWord(position, which, length);
+ if (pos < 0 || which != 0)
+ break;
+ position += pos + length;
+ while (*position && *position != '=')
+ position++;
+ if (!*position)
+ break;
+ position++;
+ while (isspace(*position))
+ position++;
+ //
+ // Allow either single quotes or double quotes
+ // around the URL itself
+ //
+ if (*position == '"'||*position == '\'')
+ {
+ position++;
+ q = strchr(position, position[-1]);
+ if (!q)
+ break;
+ //
+ // We seem to have matched the opening quote char
+ // Mark the end of the quotes as our endpoint, so
+ // that we can continue parsing after the current
+ // text
+ //
+ *q = '\0';
+ //
+ // If a '#' is present in a quoted URL,
+ // treat that as the end of the URL, but we skip
+ // past the quote to parse the rest of the anchor.
+ //
+ if ((t = strchr(position, '#')) != NULL)
+ *t = '\0';
+ }
+ else
+ {
+ q = position;
+ while (*q && *q != '>' && !isspace(*q))
+ q++;
+ *q = '\0';
+ }
+ retriever.got_href(position);
+ break;
+ }
+
          default:
              return; // Nothing...
      }

------------------------------------
To unsubscribe from the htdig mailing list, send a message to
htdig@htdig.org containing the single word "unsubscribe" in
the SUBJECT of the message.



This archive was generated by hypermail 2.0b3 on Fri Jun 04 1999 - 10:53:56 PDT