2005-01-19 20:16:09 +03:00
|
|
|
/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is SniffURI.
 *
 * The Initial Developer of the Original Code is
 * Erik van der Poel <erik@vanderpoel.org>.
 * Portions created by the Initial Developer are Copyright (C) 1998-2005
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 * Bruce Robson <bns_robson@hotmail.com>
 *
 * ***** END LICENSE BLOCK ***** */
|
2000-02-01 21:24:20 +03:00
|
|
|
|
2005-01-19 20:16:09 +03:00
|
|
|
#include "all.h"
|
2000-02-01 21:24:20 +03:00
|
|
|
|
|
|
|
/*
 * True when c is a space, horizontal tab, carriage return or line feed.
 * The argument is evaluated up to four times, so it must be free of side
 * effects.
 */
#define IS_WHITE_SPACE(c) \
    ((c) == ' ' || (c) == '\t' || (c) == '\r' || (c) == '\n')
|
|
|
|
|
|
|
|
typedef struct HTMLState
|
|
|
|
{
|
|
|
|
unsigned short mask;
|
|
|
|
HTML *html;
|
|
|
|
} HTMLState;
|
|
|
|
|
|
|
|
/*
 * Handler registry filled by htmlRegister(): maps a tag name to a nested
 * hash table that maps an attribute name to its HTMLHandler. Created on the
 * first registration; callHandler() does nothing while it is still NULL.
 */
static HashTable *tagTable = NULL;
|
|
|
|
|
|
|
|
/*
 * Optional callback installed by htmlRegisterTagHandler(); readTag() calls it
 * once the attributes of a tag have been read.
 */
static HTMLHandler tagHandler = NULL;
|
|
|
|
|
|
|
|
/*
 * Flat, NULL-terminated list of (tag, attribute) pairs: even indexes hold the
 * tag name, odd indexes the name of an attribute of that tag whose value is
 * treated as a URL. Walked two entries at a time.
 */
static char *urlAttributes[] =
{
    "a",          "href",
    "applet",     "codebase",
    "area",       "href",
    "base",       "href",
    "blockquote", "cite",
    "body",       "background",
    "del",        "cite",
    "form",       "action",
    "frame",      "longdesc",   "frame",  "src",
    "head",       "profile",
    "iframe",     "longdesc",   "iframe", "src",
    "img",        "longdesc",   "img",    "src",      "img",    "usemap",
    "input",      "src",        "input",  "usemap",
    "ins",        "cite",
    "link",       "href",
    "object",     "archive",    "object", "classid",  "object", "codebase",
    "object",     "data",       "object", "usemap",
    "q",          "cite",
    "script",     "for",        "script", "src",
    NULL
};
|
|
|
|
|
|
|
|
/* Set to 1 by htmlInit(); htmlRead() calls htmlInit() while this is still 0. */
static int htmlInitialized = 0;
|
|
|
|
|
|
|
|
/*
 * Set of recognised tag names, built by htmlInit() from knownTags[] and
 * consulted by readTag() to set the tagIsKnown flag.
 */
static HashTable *knownTagTable = NULL;
|
|
|
|
|
|
|
|
/*
 * NULL-terminated list of lower-case tag names that htmlInit() loads into
 * knownTagTable. Entries are grouped by initial letter.
 */
static char *knownTags[] =
{
    "!doctype",
    "a", "address", "applet", "area",
    "b", "base", "basefont", "big", "blink", "blockquote", "body", "br",
    "caption", "cell", "center", "certificate", "charles", "cite", "code",
    "colormap",
    "dd", "dir", "div", "dl", "dt",
    "em", "embed",
    "font", "form", "frame", "frameset",
    "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "hype",
    "i", "ilayer", "image", "img", "inlineinput", "input", "isindex",
    "jean",
    "kbd", "keygen",
    "layer", "li", "link", "listing",
    "map", "media", "menu", "meta", "mquote", "multicol",
    "nobr", "noembed", "noframes", "nolayer", "noscript", "nscp_close",
    "nscp_open", "nscp_reblock", "nsdt",
    "object", "ol", "option",
    "p", "param", "plaintext", "pre",
    "s", "samp", "script", "select", "server", "small", "spacer", "span",
    "spell", "strike", "strong", "style", "sub", "subdoc", "sup",
    "table", "td", "textarea", "th", "title", "tr", "tt",
    "u", "ul",
    "var",
    "wbr",
    "xmp",
    NULL
};
|
|
|
|
|
|
|
|
static void
|
|
|
|
diag(int line, HTMLState *state, unsigned short c)
|
|
|
|
{
|
|
|
|
fprintf(stderr, "%s(%d): 0x%02x(%c) tag %s attr %s\n", __FILE__, line,
|
|
|
|
c, c, state->html->tag ? (char *) state->html->tag : "NULL",
|
|
|
|
state->html->currentAttribute ?
|
|
|
|
(char *) state->html->currentAttribute->name : "NULL");
|
|
|
|
fprintf(stderr, "(%s)\n", state->html->url);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
htmlInit(void)
|
|
|
|
{
|
|
|
|
char **p;
|
|
|
|
|
|
|
|
knownTagTable = hashAlloc(NULL);
|
|
|
|
p = knownTags;
|
|
|
|
while (*p)
|
|
|
|
{
|
|
|
|
hashAdd(knownTagTable, copyString((unsigned char *) *p), NULL);
|
|
|
|
p++;
|
|
|
|
}
|
|
|
|
htmlInitialized = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
htmlCheckForBaseURL(HTML* html)
|
|
|
|
{
|
|
|
|
if
|
|
|
|
(
|
|
|
|
(!strcmp((char *) html->tag, "base")) &&
|
|
|
|
(!strcmp((char *) html->currentAttribute->name, "href"))
|
|
|
|
)
|
|
|
|
{
|
|
|
|
FREE(html->base);
|
|
|
|
html->base = copyString(html->currentAttribute->value);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
htmlCheckForURLAttribute(HTML *html)
|
|
|
|
{
|
|
|
|
char **p;
|
|
|
|
|
|
|
|
html->currentAttributeIsURL = 0;
|
|
|
|
p = urlAttributes;
|
|
|
|
while (*p)
|
|
|
|
{
|
|
|
|
if
|
|
|
|
(
|
|
|
|
(!strcmp((char *) html->tag, p[0])) &&
|
|
|
|
(!strcmp((char *) html->currentAttribute->name, p[1]))
|
|
|
|
)
|
|
|
|
{
|
|
|
|
html->currentAttributeIsURL = 1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
p += 2;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
htmlCheckAttribute(HTML *html)
|
|
|
|
{
|
|
|
|
htmlCheckForBaseURL(html);
|
|
|
|
htmlCheckForURLAttribute(html);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
htmlRegister(char *tag, char *attributeName, HTMLHandler handler)
|
|
|
|
{
|
|
|
|
HashEntry *attrEntry;
|
|
|
|
HashEntry *tagEntry;
|
|
|
|
|
|
|
|
if (!tagTable)
|
|
|
|
{
|
|
|
|
tagTable = hashAlloc(NULL);
|
|
|
|
}
|
|
|
|
|
|
|
|
tagEntry = hashLookup(tagTable, (unsigned char *) tag);
|
|
|
|
if (!tagEntry)
|
|
|
|
{
|
|
|
|
tagEntry = hashAdd(tagTable, (unsigned char *) tag,
|
|
|
|
hashAlloc(NULL));
|
|
|
|
}
|
|
|
|
attrEntry = hashLookup(tagEntry->value,
|
|
|
|
(unsigned char *) attributeName);
|
|
|
|
if (attrEntry)
|
|
|
|
{
|
|
|
|
attrEntry->value = (void *) handler;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
hashAdd(tagEntry->value, (unsigned char *) attributeName,
|
|
|
|
(void *) handler);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
htmlRegisterURLHandler(HTMLHandler handler)
|
|
|
|
{
|
|
|
|
char **p;
|
|
|
|
|
|
|
|
p = urlAttributes;
|
|
|
|
while (*p)
|
|
|
|
{
|
|
|
|
htmlRegister(p[0], p[1], handler);
|
|
|
|
p += 2;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2005-01-21 00:09:40 +03:00
|
|
|
callHandler(App *app, HTML *html)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
|
|
|
HashEntry *attrEntry;
|
|
|
|
HashEntry *tagEntry;
|
|
|
|
|
|
|
|
if (!tagTable)
|
|
|
|
{
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
tagEntry = hashLookup(tagTable, html->tag);
|
|
|
|
if (tagEntry)
|
|
|
|
{
|
|
|
|
attrEntry = hashLookup(tagEntry->value,
|
|
|
|
html->currentAttribute->name);
|
|
|
|
if (attrEntry)
|
|
|
|
{
|
2005-01-21 00:09:40 +03:00
|
|
|
(*((HTMLHandler) attrEntry->value))(app, html);
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
htmlRegisterTagHandler(HTMLHandler handler)
|
|
|
|
{
|
|
|
|
tagHandler = handler;
|
|
|
|
}
|
|
|
|
|
|
|
|
static unsigned short
|
2005-01-25 11:22:29 +03:00
|
|
|
htmlGetByte(Buf *buf, HTMLState *state)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
|
|
|
unsigned short c;
|
|
|
|
unsigned short ret;
|
|
|
|
|
2005-01-25 11:22:29 +03:00
|
|
|
c = bufGetByte(buf);
|
2000-02-01 21:24:20 +03:00
|
|
|
if (c == 256)
|
|
|
|
{
|
|
|
|
ret = c;
|
|
|
|
}
|
|
|
|
else if (c == 0x1b)
|
|
|
|
{
|
2005-01-25 11:22:29 +03:00
|
|
|
c = bufGetByte(buf);
|
2000-02-01 21:24:20 +03:00
|
|
|
if (c == 256)
|
|
|
|
{
|
|
|
|
ret = c;
|
|
|
|
}
|
|
|
|
else if (c == '$')
|
|
|
|
{
|
2005-01-25 11:22:29 +03:00
|
|
|
c = bufGetByte(buf);
|
2000-02-01 21:24:20 +03:00
|
|
|
if (c == 256)
|
|
|
|
{
|
|
|
|
ret = c;
|
|
|
|
}
|
|
|
|
else if (c == '(')
|
|
|
|
{
|
|
|
|
/* throw away 4th byte in ESC sequence */
|
2005-01-25 11:22:29 +03:00
|
|
|
bufGetByte(buf);
|
2000-02-01 21:24:20 +03:00
|
|
|
state->mask = 0x80;
|
2005-01-25 11:22:29 +03:00
|
|
|
c = bufGetByte(buf);
|
2000-02-01 21:24:20 +03:00
|
|
|
if (c == 256)
|
|
|
|
{
|
|
|
|
ret = c;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
ret = c | state->mask;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
state->mask = 0x80;
|
2005-01-25 11:22:29 +03:00
|
|
|
c = bufGetByte(buf);
|
2000-02-01 21:24:20 +03:00
|
|
|
if (c == 256)
|
|
|
|
{
|
|
|
|
ret = c;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
ret = c | state->mask;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else if (c == '(')
|
|
|
|
{
|
|
|
|
state->mask = 0;
|
|
|
|
/* throw away 3rd byte in ESC sequence */
|
2005-01-25 11:22:29 +03:00
|
|
|
bufGetByte(buf);
|
|
|
|
ret = bufGetByte(buf);
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2005-01-25 11:22:29 +03:00
|
|
|
bufUnGetByte(buf);
|
2000-02-01 21:24:20 +03:00
|
|
|
ret = 0x1b;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
ret = c | state->mask;
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static unsigned short
|
2005-01-25 11:22:29 +03:00
|
|
|
eatWhiteSpace(Buf *buf, HTMLState *state, unsigned short c)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
|
|
|
while
|
|
|
|
(
|
|
|
|
(c == ' ') ||
|
|
|
|
(c == '\t') ||
|
|
|
|
(c == '\r') ||
|
|
|
|
(c == '\n')
|
|
|
|
)
|
|
|
|
{
|
2005-01-25 11:22:29 +03:00
|
|
|
c = htmlGetByte(buf, state);
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
return c;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
htmlFreeAttributes(HTMLState *state)
|
|
|
|
{
|
|
|
|
HTMLAttribute *attr;
|
|
|
|
HTMLAttribute *tmp;
|
|
|
|
|
|
|
|
attr = state->html->attributes;
|
|
|
|
state->html->attributes = NULL;
|
|
|
|
while (attr)
|
|
|
|
{
|
|
|
|
free(attr->name);
|
|
|
|
free(attr->value);
|
|
|
|
tmp = attr;
|
|
|
|
attr = attr->next;
|
|
|
|
free(tmp);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static unsigned short
|
2005-01-25 11:22:29 +03:00
|
|
|
readAttribute(App *app, Buf *buf, HTMLState *state, unsigned short c)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
|
|
|
HTMLAttribute *attr;
|
|
|
|
unsigned short quote;
|
|
|
|
|
2005-01-25 11:22:29 +03:00
|
|
|
bufMark(buf, -1);
|
|
|
|
app->html(app, buf);
|
2000-02-01 21:24:20 +03:00
|
|
|
while
|
|
|
|
(
|
|
|
|
(c != 256) &&
|
|
|
|
(c != '>') &&
|
2005-02-03 04:06:24 +03:00
|
|
|
(c != '/') &&
|
2000-02-01 21:24:20 +03:00
|
|
|
(c != '=') &&
|
|
|
|
(c != ' ') &&
|
|
|
|
(c != '\t') &&
|
|
|
|
(c != '\r') &&
|
|
|
|
(c != '\n')
|
|
|
|
)
|
|
|
|
{
|
2005-01-25 11:22:29 +03:00
|
|
|
c = htmlGetByte(buf, state);
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
2005-01-25 11:22:29 +03:00
|
|
|
bufMark(buf, -1);
|
2000-02-01 21:24:20 +03:00
|
|
|
attr = calloc(sizeof(HTMLAttribute), 1);
|
|
|
|
if (!attr)
|
|
|
|
{
|
|
|
|
fprintf(stderr, "cannot calloc HTMLAttribute\n");
|
|
|
|
exit(0);
|
|
|
|
}
|
|
|
|
if (state->html->currentAttribute)
|
|
|
|
{
|
|
|
|
state->html->currentAttribute->next = attr;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
if (state->html->attributes)
|
|
|
|
{
|
|
|
|
htmlFreeAttributes(state);
|
|
|
|
}
|
|
|
|
state->html->attributes = attr;
|
|
|
|
}
|
|
|
|
state->html->currentAttribute = attr;
|
2005-01-25 11:22:29 +03:00
|
|
|
attr->name = bufCopyLower(buf);
|
|
|
|
app->htmlAttributeName(app, state->html, buf);
|
2005-02-03 04:06:24 +03:00
|
|
|
if (c == '/')
|
|
|
|
{
|
|
|
|
c = htmlGetByte(buf, state);
|
|
|
|
}
|
2000-02-01 21:24:20 +03:00
|
|
|
if ((c == 256) || (c == '>'))
|
|
|
|
{
|
|
|
|
return c;
|
|
|
|
}
|
|
|
|
if (c != '=')
|
|
|
|
{
|
2005-01-25 11:22:29 +03:00
|
|
|
c = eatWhiteSpace(buf, state, c);
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
2005-02-03 04:06:24 +03:00
|
|
|
if (c == '/')
|
|
|
|
{
|
|
|
|
c = htmlGetByte(buf, state);
|
|
|
|
}
|
2000-02-01 21:24:20 +03:00
|
|
|
if ((c == 256) || (c == '>'))
|
|
|
|
{
|
|
|
|
return c;
|
|
|
|
}
|
|
|
|
if (c == '=')
|
|
|
|
{
|
2005-01-25 11:22:29 +03:00
|
|
|
c = eatWhiteSpace(buf, state, htmlGetByte(buf, state));
|
2000-02-01 21:24:20 +03:00
|
|
|
if ((c == '"') || (c == '\''))
|
|
|
|
{
|
|
|
|
quote = c;
|
2005-01-25 11:22:29 +03:00
|
|
|
bufMark(buf, 0);
|
|
|
|
app->html(app, buf);
|
2000-02-01 21:24:20 +03:00
|
|
|
do
|
|
|
|
{
|
2005-01-25 11:22:29 +03:00
|
|
|
c = htmlGetByte(buf, state);
|
2000-02-01 21:24:20 +03:00
|
|
|
} while ((c != 256) && (c != quote));
|
|
|
|
if (c == 256)
|
|
|
|
{
|
|
|
|
diag(__LINE__, state, c);
|
|
|
|
}
|
2005-01-25 11:22:29 +03:00
|
|
|
bufMark(buf, -1);
|
|
|
|
attr->value = bufCopy(buf);
|
2000-02-01 21:24:20 +03:00
|
|
|
htmlCheckAttribute(state->html);
|
2005-01-25 11:22:29 +03:00
|
|
|
app->htmlAttributeValue(app, state->html, buf);
|
|
|
|
c = htmlGetByte(buf, state);
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2005-01-25 11:22:29 +03:00
|
|
|
bufMark(buf, -1);
|
|
|
|
app->html(app, buf);
|
2000-02-01 21:24:20 +03:00
|
|
|
while
|
|
|
|
(
|
|
|
|
(c != 256) &&
|
|
|
|
(c != '>') &&
|
|
|
|
(c != ' ') &&
|
|
|
|
(c != '\t') &&
|
|
|
|
(c != '\r') &&
|
|
|
|
(c != '\n')
|
|
|
|
)
|
|
|
|
{
|
|
|
|
if ((c == '"') || (c == '\''))
|
|
|
|
{
|
|
|
|
diag(__LINE__, state, c);
|
|
|
|
}
|
2005-01-25 11:22:29 +03:00
|
|
|
c = htmlGetByte(buf, state);
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
2005-01-25 11:22:29 +03:00
|
|
|
bufMark(buf, -1);
|
|
|
|
attr->value = bufCopy(buf);
|
2000-02-01 21:24:20 +03:00
|
|
|
htmlCheckAttribute(state->html);
|
2005-01-25 11:22:29 +03:00
|
|
|
app->htmlAttributeValue(app, state->html, buf);
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
2005-01-21 00:09:40 +03:00
|
|
|
callHandler(app, state->html);
|
2005-02-03 04:06:24 +03:00
|
|
|
if (c == '/')
|
|
|
|
{
|
|
|
|
c = htmlGetByte(buf, state);
|
|
|
|
}
|
|
|
|
if ((c == 256) || (c == '>'))
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
|
|
|
return c;
|
|
|
|
}
|
|
|
|
}
|
2005-02-03 04:06:24 +03:00
|
|
|
c = eatWhiteSpace(buf, state, c);
|
|
|
|
if (c == '/')
|
|
|
|
{
|
|
|
|
c = htmlGetByte(buf, state);
|
|
|
|
}
|
|
|
|
return c;
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
2005-01-25 11:22:29 +03:00
|
|
|
caseCompare(char *str, Buf *buf, HTMLState *state, unsigned short *ret)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
|
|
|
unsigned short c;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; str[i]; i++)
|
|
|
|
{
|
2005-01-25 11:22:29 +03:00
|
|
|
c = htmlGetByte(buf, state);
|
2000-02-01 21:24:20 +03:00
|
|
|
if (tolower(c) != tolower(str[i]))
|
|
|
|
{
|
|
|
|
*ret = c;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
2005-01-25 11:22:29 +03:00
|
|
|
c = htmlGetByte(buf, state);
|
2000-02-01 21:24:20 +03:00
|
|
|
*ret = c;
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2005-01-20 04:17:05 +03:00
|
|
|
static unsigned short
|
2005-01-25 11:22:29 +03:00
|
|
|
endTag(App *app, Buf *buf, HTMLState *state)
|
2005-01-20 04:17:05 +03:00
|
|
|
{
|
|
|
|
unsigned short c;
|
|
|
|
|
2005-01-25 11:22:29 +03:00
|
|
|
c = htmlGetByte(buf, state);
|
2005-01-20 04:17:05 +03:00
|
|
|
if (c == 256)
|
|
|
|
{
|
2005-01-25 11:22:29 +03:00
|
|
|
bufMark(buf, -1);
|
|
|
|
app->html(app, buf);
|
2005-01-20 04:17:05 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
return c;
|
|
|
|
}
|
|
|
|
|
2000-02-01 21:24:20 +03:00
|
|
|
static unsigned short
|
2005-02-03 04:06:24 +03:00
|
|
|
readTag(App *app, Buf *buf, HTMLState *state, unsigned short c)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
2005-02-03 04:06:24 +03:00
|
|
|
bufMark(buf, c == '/' ? 0 : -1);
|
2005-01-25 11:22:29 +03:00
|
|
|
app->html(app, buf);
|
2000-02-01 21:24:20 +03:00
|
|
|
|
|
|
|
do
|
|
|
|
{
|
2005-01-25 11:22:29 +03:00
|
|
|
c = htmlGetByte(buf, state);
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
|
|
|
while
|
|
|
|
(
|
|
|
|
(c != 256) &&
|
|
|
|
(c != '>') &&
|
2005-02-03 04:06:24 +03:00
|
|
|
(c != '/') &&
|
2000-02-01 21:24:20 +03:00
|
|
|
(c != ' ') &&
|
|
|
|
(c != '\t') &&
|
|
|
|
(c != '\r') &&
|
|
|
|
(c != '\n')
|
|
|
|
);
|
2005-01-25 11:22:29 +03:00
|
|
|
bufMark(buf, -1);
|
2000-02-01 21:24:20 +03:00
|
|
|
FREE(state->html->tag);
|
2005-01-25 11:22:29 +03:00
|
|
|
state->html->tag = bufCopyLower(buf);
|
2005-02-03 04:06:24 +03:00
|
|
|
if (hashLookup(knownTagTable, state->html->tag))
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
|
|
|
state->html->tagIsKnown = 1;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
state->html->tagIsKnown = 0;
|
|
|
|
}
|
2005-01-25 11:22:29 +03:00
|
|
|
app->htmlTag(app, state->html, buf);
|
2005-02-03 04:06:24 +03:00
|
|
|
if (c == '/')
|
|
|
|
{
|
|
|
|
c = htmlGetByte(buf, state);
|
|
|
|
}
|
2000-02-01 21:24:20 +03:00
|
|
|
if (c == 256)
|
|
|
|
{
|
|
|
|
return c;
|
|
|
|
}
|
2005-02-03 04:06:24 +03:00
|
|
|
if (c == '>')
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
2005-01-25 11:22:29 +03:00
|
|
|
return endTag(app, buf, state);
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
2005-01-25 11:22:29 +03:00
|
|
|
c = eatWhiteSpace(buf, state, c);
|
2005-02-03 04:06:24 +03:00
|
|
|
if (c == '/')
|
|
|
|
{
|
|
|
|
c = htmlGetByte(buf, state);
|
|
|
|
}
|
2000-02-01 21:24:20 +03:00
|
|
|
if (c == 256)
|
|
|
|
{
|
|
|
|
return c;
|
|
|
|
}
|
2005-02-03 04:06:24 +03:00
|
|
|
if (c == '>')
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
2005-01-25 11:22:29 +03:00
|
|
|
return endTag(app, buf, state);
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
|
|
|
do
|
|
|
|
{
|
2005-01-25 11:22:29 +03:00
|
|
|
c = readAttribute(app, buf, state, c);
|
2000-02-01 21:24:20 +03:00
|
|
|
} while ((c != 256) && (c != '>'));
|
|
|
|
state->html->currentAttribute = NULL;
|
|
|
|
if (tagHandler)
|
|
|
|
{
|
2005-01-21 00:09:40 +03:00
|
|
|
(*tagHandler)(app, state->html);
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
2005-02-03 04:06:24 +03:00
|
|
|
if (c == 256)
|
|
|
|
{
|
|
|
|
return c;
|
|
|
|
}
|
2000-02-01 21:24:20 +03:00
|
|
|
if (c == '>')
|
|
|
|
{
|
2005-01-25 11:22:29 +03:00
|
|
|
return endTag(app, buf, state);
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
return c;
|
|
|
|
}
|
|
|
|
|
2005-02-03 04:06:24 +03:00
|
|
|
static unsigned short
|
|
|
|
readComment(App *app, Buf *buf, HTMLState *state)
|
|
|
|
{
|
|
|
|
unsigned short c;
|
|
|
|
unsigned long begin;
|
|
|
|
|
|
|
|
c = htmlGetByte(buf, state);
|
|
|
|
if (c != '-')
|
|
|
|
{
|
|
|
|
do
|
|
|
|
{
|
|
|
|
c = htmlGetByte(buf, state);
|
|
|
|
} while ((c != 256) && (c != '>'));
|
|
|
|
return c;
|
|
|
|
}
|
|
|
|
begin = bufCurrent(buf);
|
|
|
|
while (1)
|
|
|
|
{
|
|
|
|
c = htmlGetByte(buf, state);
|
|
|
|
if (c == '-')
|
|
|
|
{
|
|
|
|
c = htmlGetByte(buf, state);
|
|
|
|
if (c == '-')
|
|
|
|
{
|
|
|
|
c = htmlGetByte(buf,
|
|
|
|
state);
|
|
|
|
if (c == '>')
|
|
|
|
{
|
|
|
|
return endTag(app, buf, state);
|
|
|
|
}
|
|
|
|
if (c == '-')
|
|
|
|
{
|
|
|
|
do
|
|
|
|
{
|
|
|
|
c = htmlGetByte(buf, state);
|
|
|
|
} while (c == '-');
|
|
|
|
if (c == '>')
|
|
|
|
{
|
|
|
|
return endTag(app, buf, state);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (c == 256)
|
|
|
|
{
|
|
|
|
bufSet(buf, begin);
|
|
|
|
while (1)
|
|
|
|
{
|
|
|
|
c = htmlGetByte(buf, state);
|
|
|
|
if (c == '>')
|
|
|
|
{
|
|
|
|
return endTag(app, buf, state);
|
|
|
|
}
|
|
|
|
if (c == 256)
|
|
|
|
{
|
|
|
|
fprintf(stderr, "bad comment\n");
|
|
|
|
bufMark(buf, -1);
|
|
|
|
FREE(state->html->tag);
|
|
|
|
state->html->tag = copyString(
|
|
|
|
(unsigned char *) "!--");
|
|
|
|
state->html->tagIsKnown = 1;
|
|
|
|
app->htmlTag(app, state->html, buf);
|
|
|
|
return c;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2000-02-01 21:24:20 +03:00
|
|
|
static unsigned short
|
2005-01-25 11:22:29 +03:00
|
|
|
readText(App *app, Buf *buf, HTMLState *state)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
|
|
|
unsigned short c;
|
|
|
|
|
2005-01-25 11:22:29 +03:00
|
|
|
bufMark(buf, -1);
|
|
|
|
app->html(app, buf);
|
2000-02-01 21:24:20 +03:00
|
|
|
do
|
|
|
|
{
|
2005-01-25 11:22:29 +03:00
|
|
|
c = htmlGetByte(buf, state);
|
2000-02-01 21:24:20 +03:00
|
|
|
} while ((c != 256) && (c != '<'));
|
2005-01-25 11:22:29 +03:00
|
|
|
bufMark(buf, -1);
|
|
|
|
app->htmlText(app, buf);
|
2000-02-01 21:24:20 +03:00
|
|
|
|
|
|
|
return c;
|
|
|
|
}
|
|
|
|
|
|
|
|
static unsigned short
|
2005-01-25 11:22:29 +03:00
|
|
|
dealWithScript(Buf *buf, HTMLState *state, unsigned short c)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
|
|
|
if (state->html->tag &&
|
|
|
|
(!strcasecmp((char *) state->html->tag, "script")))
|
|
|
|
{
|
|
|
|
while (1)
|
|
|
|
{
|
|
|
|
if (c == 256)
|
|
|
|
{
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (c == '<')
|
|
|
|
{
|
2005-01-25 11:22:29 +03:00
|
|
|
if (caseCompare("/script>", buf, state, &c))
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
|
|
|
FREE(state->html->tag);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2005-01-25 11:22:29 +03:00
|
|
|
c = htmlGetByte(buf, state);
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return c;
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
2005-01-25 11:22:29 +03:00
|
|
|
htmlRead(App *app, Buf *buf, unsigned char *base)
|
2000-02-01 21:24:20 +03:00
|
|
|
{
|
|
|
|
unsigned short c;
|
|
|
|
HTML html;
|
|
|
|
HTMLState state;
|
|
|
|
|
|
|
|
if (!htmlInitialized)
|
|
|
|
{
|
|
|
|
htmlInit();
|
|
|
|
}
|
|
|
|
|
|
|
|
html.base = copyString(base);
|
|
|
|
html.url = copyString(base);
|
|
|
|
html.tag = NULL;
|
|
|
|
html.attributes = NULL;
|
|
|
|
html.currentAttribute = NULL;
|
|
|
|
|
|
|
|
state.mask = 0;
|
|
|
|
state.html = &html;
|
|
|
|
|
2005-01-25 11:22:29 +03:00
|
|
|
c = htmlGetByte(buf, &state);
|
2000-02-01 21:24:20 +03:00
|
|
|
while (c != 256)
|
|
|
|
{
|
|
|
|
if (c == '<')
|
|
|
|
{
|
2005-01-25 11:22:29 +03:00
|
|
|
c = htmlGetByte(buf, &state);
|
2000-02-01 21:24:20 +03:00
|
|
|
if
|
|
|
|
(
|
|
|
|
(('a' <= c) && (c <= 'z')) ||
|
|
|
|
(('A' <= c) && (c <= 'Z')) ||
|
2005-02-03 04:06:24 +03:00
|
|
|
(c == '/')
|
2000-02-01 21:24:20 +03:00
|
|
|
)
|
|
|
|
{
|
2005-02-03 04:06:24 +03:00
|
|
|
c = readTag(app, buf, &state, c);
|
2005-01-25 11:22:29 +03:00
|
|
|
c = dealWithScript(buf, &state, c);
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
2005-02-03 04:06:24 +03:00
|
|
|
else if (c == '!')
|
|
|
|
{
|
|
|
|
c = htmlGetByte(buf, &state);
|
|
|
|
if (c == '-')
|
|
|
|
{
|
|
|
|
c = readComment(app, buf, &state);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
while ((c != 256) && (c != '>'))
|
|
|
|
{
|
|
|
|
c = htmlGetByte(buf, &state);
|
|
|
|
}
|
|
|
|
if (c == '>')
|
|
|
|
{
|
|
|
|
c = htmlGetByte(buf, &state);
|
|
|
|
}
|
|
|
|
bufMark(buf, -1);
|
|
|
|
app->htmlDeclaration(app, buf);
|
|
|
|
}
|
|
|
|
else if (c == '?')
|
|
|
|
{
|
|
|
|
do
|
|
|
|
{
|
|
|
|
c = htmlGetByte(buf, &state);
|
|
|
|
} while ((c != 256) && (c != '>'));
|
|
|
|
if (c == '>')
|
|
|
|
{
|
|
|
|
c = htmlGetByte(buf, &state);
|
|
|
|
}
|
|
|
|
bufMark(buf, -1);
|
|
|
|
app->htmlProcessingInstruction(app, buf);
|
|
|
|
}
|
2000-02-01 21:24:20 +03:00
|
|
|
else
|
|
|
|
{
|
|
|
|
diag(__LINE__, &state, c);
|
2005-01-25 11:22:29 +03:00
|
|
|
c = readText(app, buf, &state);
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2005-01-25 11:22:29 +03:00
|
|
|
c = readText(app, buf, &state);
|
2000-02-01 21:24:20 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
FREE(html.base);
|
2005-01-25 11:22:29 +03:00
|
|
|
FREE(html.url);
|
2000-02-01 21:24:20 +03:00
|
|
|
FREE(html.tag);
|
|
|
|
htmlFreeAttributes(&state);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Return a newly allocated string holding str HTML-escaped with
 * escapeHTML() and wrapped in double quotes. The caller owns the result
 * and must free() it.
 *
 * The intermediate escaped copy is released before returning. On
 * allocation failure a message is printed to stderr and the process exits.
 */
unsigned char *
toHTML(unsigned char *str)
{
    unsigned char *escaped_str;
    unsigned char *result;
    size_t len;

    escaped_str = escapeHTML(str);
    len = strlen((char *) escaped_str);

    /* room for the two quotes and the terminating NUL */
    result = calloc(len + 3, 1);
    if (!result)
    {
        fprintf(stderr, "cannot calloc toHTML string\n");
        exit(0);
    }

    result[0] = '"';
    memcpy(result + 1, escaped_str, len);
    result[len + 1] = '"';
    /* result[len + 2] is already NUL thanks to calloc */

    free(escaped_str);

    return result;
}
|
|
|
|
|
|
|
|
/*
 * Map a character that is special in HTML to its entity reference.
 * Returns NULL for characters that need no escaping.
 */
static const char *
htmlEntityFor(unsigned char ch)
{
    switch (ch)
    {
    case '<':
        return "&lt;";
    case '>':
        return "&gt;";
    case '&':
        return "&amp;";
    case '"':
        return "&quot;";
    default:
        return NULL;
    }
}

/*
 * Return a newly allocated copy of str in which '<', '>', '&' and '"' are
 * replaced by the entity references &lt; &gt; &amp; and &quot;. All other
 * bytes are copied unchanged. The caller owns the result and must free()
 * it.
 *
 * The string is walked twice: once to compute the escaped length and once
 * to fill the buffer. On allocation failure a message is printed to stderr
 * and the process exits.
 */
unsigned char *
escapeHTML(unsigned char *str)
{
    const unsigned char *in;
    const char *entity;
    unsigned char *result;
    unsigned char *out;
    size_t entityLen;
    size_t len;

    /* pass 1: size of the escaped string */
    len = 0;
    for (in = str; *in; in++)
    {
        entity = htmlEntityFor(*in);
        len += entity ? strlen(entity) : 1;
    }

    result = calloc(len + 1, 1);
    if (!result)
    {
        fprintf(stderr, "cannot calloc escapeHTML string\n");
        exit(0);
    }

    /* pass 2: copy, substituting entities; calloc supplied the final NUL */
    out = result;
    for (in = str; *in; in++)
    {
        entity = htmlEntityFor(*in);
        if (entity)
        {
            entityLen = strlen(entity);
            memcpy(out, entity, entityLen);
            out += entityLen;
        }
        else
        {
            *out++ = *in;
        }
    }

    return result;
}
|