gecko-dev/webtools/web-sniffer/html.c

/*
 * The contents of this file are subject to the Mozilla Public
 * License Version 1.1 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of
 * the License at http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS
 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * rights and limitations under the License.
 *
 * The Original Code is Web Sniffer.
 *
 * The Initial Developer of the Original Code is Erik van der Poel.
 * Portions created by Erik van der Poel are
 * Copyright (C) 1998,1999,2000 Erik van der Poel.
 * All Rights Reserved.
 *
 * Contributor(s): Bruce Robson
 */

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>

#include "hash.h"
#include "html.h"
#include "http.h"
#include "io.h"
#include "main.h"
#include "url.h"
#include "utils.h"

#define IS_WHITE_SPACE(c) \
	( \
		((c) == ' ' ) || \
		((c) == '\t') || \
		((c) == '\r') || \
		((c) == '\n') \
	)

typedef struct HTMLState
{
	unsigned short	mask;
	unsigned short	saved;
	unsigned short	unGotten;
	HTML		*html;
} HTMLState;

static HashTable *tagTable = NULL;

static HTMLHandler tagHandler = NULL;

static char *urlAttributes[] =
{
	"a", "href",
	"applet", "codebase",
	"area", "href",
	"base", "href",
	"blockquote", "cite",
	"body", "background",
	"del", "cite",
	"form", "action",
	"frame", "longdesc",
	"frame", "src",
	"head", "profile",
	"iframe", "longdesc",
	"iframe", "src",
	"img", "longdesc",
	"img", "src",
	"img", "usemap",
	"input", "src",
	"input", "usemap",
	"ins", "cite",
	"link", "href",
	"object", "archive",
	"object", "classid",
	"object", "codebase",
	"object", "data",
	"object", "usemap",
	"q", "cite",
	"script", "for",
	"script", "src",
	NULL
};

static int htmlInitialized = 0;

static HashTable *knownTagTable = NULL;

static char *knownTags[] =
{
	"!doctype",
	"a",
	"address",
	"applet",
	"area",
	"b",
	"base",
	"basefont",
	"big",
	"blink",
	"blockquote",
	"body",
	"br",
	"caption",
	"cell",
	"center",
	"certificate",
	"charles",
	"cite",
	"code",
	"colormap",
	"dd",
	"dir",
	"div",
	"dl",
	"dt",
	"em",
	"embed",
	"font",
	"form",
	"frame",
	"frameset",
	"h1",
	"h2",
	"h3",
	"h4",
	"h5",
	"h6",
	"head",
	"hr",
	"html",
	"hype",
	"i",
	"ilayer",
	"image",
	"img",
	"inlineinput",
	"input",
	"isindex",
	"jean",
	"kbd",
	"keygen",
	"layer",
	"li",
	"link",
	"listing",
	"map",
	"media",
	"menu",
	"meta",
	"mquote",
	"multicol",
	"nobr",
	"noembed",
	"noframes",
	"nolayer",
	"noscript",
	"nscp_close",
	"nscp_open",
	"nscp_reblock",
	"nsdt",
	"object",
	"ol",
	"option",
	"p",
	"param",
	"plaintext",
	"pre",
	"s",
	"samp",
	"script",
	"select",
	"server",
	"small",
	"spacer",
	"span",
	"spell",
	"strike",
	"strong",
	"style",
	"sub",
	"subdoc",
	"sup",
	"table",
	"td",
	"textarea",
	"th",
	"title",
	"tr",
	"tt",
	"u",
	"ul",
	"var",
	"wbr",
	"xmp",
	NULL
};

static void
diag(int line, HTMLState *state, unsigned short c)
{
	fprintf(stderr, "%s(%d): 0x%02x(%c) tag %s attr %s\n", __FILE__, line,
		c, c, state->html->tag ? (char *) state->html->tag : "NULL",
		state->html->currentAttribute ?
			(char *) state->html->currentAttribute->name : "NULL");
	fprintf(stderr, "(%s)\n", state->html->url);
}

static void
htmlInit(void)
{
	char	**p;

	knownTagTable = hashAlloc(NULL);
	p = knownTags;
	while (*p)
	{
		hashAdd(knownTagTable, copyString((unsigned char *) *p), NULL);
		p++;
	}
	htmlInitialized = 1;
}

static void
htmlCheckForBaseURL(HTML* html)
{
	if
	(
		(!strcmp((char *) html->tag, "base")) &&
		(!strcmp((char *) html->currentAttribute->name, "href"))
	)
	{
		FREE(html->base);
		html->base = copyString(html->currentAttribute->value);
	}
}

static void
htmlCheckForURLAttribute(HTML *html)
{
	char	**p;

	html->currentAttributeIsURL = 0;
	p = urlAttributes;
	while (*p)
	{
		if
		(
			(!strcmp((char *) html->tag, p[0])) &&
			(!strcmp((char *) html->currentAttribute->name, p[1]))
		)
		{
			html->currentAttributeIsURL = 1;
			break;
		}
		p += 2;
	}
}

static void
htmlCheckAttribute(HTML *html)
{
	htmlCheckForBaseURL(html);
	htmlCheckForURLAttribute(html);
}

void
htmlRegister(char *tag, char *attributeName, HTMLHandler handler)
{
	HashEntry	*attrEntry;
	HashEntry	*tagEntry;

	if (!tagTable)
	{
		tagTable = hashAlloc(NULL);
	}

	tagEntry = hashLookup(tagTable, (unsigned char *) tag);
	if (!tagEntry)
	{
		tagEntry = hashAdd(tagTable, (unsigned char *) tag,
			hashAlloc(NULL));
	}
	attrEntry = hashLookup(tagEntry->value,
		(unsigned char *) attributeName);
	if (attrEntry)
	{
		attrEntry->value = (void *) handler;
	}
	else
	{
		hashAdd(tagEntry->value, (unsigned char *) attributeName,
			(void *) handler);
	}
}

void
htmlRegisterURLHandler(HTMLHandler handler)
{
	char	**p;

	p = urlAttributes;
	while (*p)
	{
		htmlRegister(p[0], p[1], handler);
		p += 2;
	}
}

static void
callHandler(void *a, HTML *html)
{
	HashEntry	*attrEntry;
	HashEntry	*tagEntry;

	if (!tagTable)
	{
		return;
	}
	tagEntry = hashLookup(tagTable, html->tag);
	if (tagEntry)
	{
		attrEntry = hashLookup(tagEntry->value,
			html->currentAttribute->name);
		if (attrEntry)
		{
			(*((HTMLHandler) attrEntry->value))(a, html);
		}
	}
}

void
htmlRegisterTagHandler(HTMLHandler handler)
{
	tagHandler = handler;
}

static unsigned short
htmlGetByte(Input *input, HTMLState *state)
{
	unsigned short	c;
	unsigned short	ret;
	unsigned short	tmp;

	if (state->unGotten != 256)
	{
		tmp = state->unGotten;
		state->unGotten = 256;
		return tmp;
	}

	c = getByte(input);
	if (c == 256)
	{
		ret = c;
	}
	else if (c == 0x1b)
	{
		c = getByte(input);
		if (c == 256)
		{
			ret = c;
		}
		else if (c == '$')
		{
			c = getByte(input);
			if (c == 256)
			{
				ret = c;
			}
			else if (c == '(')
			{
				/* throw away 4th byte in ESC sequence */
				getByte(input);
				state->mask = 0x80;
				c = getByte(input);
				if (c == 256)
				{
					ret = c;
				}
				else
				{
					ret = c | state->mask;
				}
			}
			else
			{
				state->mask = 0x80;
				c = getByte(input);
				if (c == 256)
				{
					ret = c;
				}
				else
				{
					ret = c | state->mask;
				}
			}
		}
		else if (c == '(')
		{
			state->mask = 0;
			/* throw away 3rd byte in ESC sequence */
			getByte(input);
			ret = getByte(input);
		}
		else
		{
			unGetByte(input);
			ret = 0x1b;
		}
	}
	else
	{
		ret = c | state->mask;
	}

	state->saved = ret;

	return ret;
}

static void
htmlUnGetByte(HTMLState *state)
{
	state->unGotten = state->saved;
}

static unsigned short
eatWhiteSpace(Input *input, HTMLState *state, unsigned short c)
{
	while
	(
		(c == ' ') ||
		(c == '\t') ||
		(c == '\r') ||
		(c == '\n')
	)
	{
		c = htmlGetByte(input, state);
	}

	return c;
}

static void
htmlFreeAttributes(HTMLState *state)
{
	HTMLAttribute	*attr;
	HTMLAttribute	*tmp;

	attr = state->html->attributes;
	state->html->attributes = NULL;
	while (attr)
	{
		free(attr->name);
		free(attr->value);
		tmp = attr;
		attr = attr->next;
		free(tmp);
	}
}

static unsigned short
readAttribute(void *a, Input *input, HTMLState *state, unsigned short c)
{
	HTMLAttribute	*attr;
	unsigned short	quote;

	mark(input, -1);
	reportHTML(a, input);
	while
	(
		(c != 256) &&
		(c != '>') &&
		(c != '=') &&
		(c != ' ') &&
		(c != '\t') &&
		(c != '\r') &&
		(c != '\n')
	)
	{
		c = htmlGetByte(input, state);
	}
	mark(input, -1);
	attr = calloc(sizeof(HTMLAttribute), 1);
	if (!attr)
	{
		fprintf(stderr, "cannot calloc HTMLAttribute\n");
		exit(0);
	}
	if (state->html->currentAttribute)
	{
		state->html->currentAttribute->next = attr;
	}
	else
	{
		if (state->html->attributes)
		{
			htmlFreeAttributes(state);
		}
		state->html->attributes = attr;
	}
	state->html->currentAttribute = attr;
	attr->name = copyLower(input);
	reportHTMLAttributeName(a, state->html, input);
	if ((c == 256) || (c == '>'))
	{
		return c;
	}
	if (c != '=')
	{
		c = eatWhiteSpace(input, state, c);
	}
	if ((c == 256) || (c == '>'))
	{
		return c;
	}
	if (c == '=')
	{
		c = eatWhiteSpace(input, state, htmlGetByte(input, state));
		if ((c == '"') || (c == '\''))
		{
			quote = c;
			mark(input, 0);
			reportHTML(a, input);
			do
			{
				c = htmlGetByte(input, state);
			} while ((c != 256) && (c != quote));
			if (c == 256)
			{
				diag(__LINE__, state, c);
			}
			mark(input, -1);
			attr->value = copy(input);
			htmlCheckAttribute(state->html);
			reportHTMLAttributeValue(a, state->html, input);
			c = htmlGetByte(input, state);
		}
		else
		{
			mark(input, -1);
			reportHTML(a, input);
			while
			(
				(c != 256) &&
				(c != '>') &&
				(c != ' ') &&
				(c != '\t') &&
				(c != '\r') &&
				(c != '\n')
			)
			{
				if ((c == '"') || (c == '\''))
				{
					diag(__LINE__, state, c);
				}
				c = htmlGetByte(input, state);
			}
			mark(input, -1);
			attr->value = copy(input);
			htmlCheckAttribute(state->html);
			reportHTMLAttributeValue(a, state->html, input);
		}
		callHandler(a, state->html);
		if (c == '>')
		{
			return c;
		}
	}
	return eatWhiteSpace(input, state, c);
}

static int
caseCompare(char *str, Input *input, HTMLState *state, unsigned short *ret)
{
	unsigned short	c;
	int		i;

	for (i = 0; str[i]; i++)
	{
		c = htmlGetByte(input, state);
		if (tolower(c) != tolower(str[i]))
		{
			*ret = c;
			return 0;
		}
	}
	c = htmlGetByte(input, state);
	*ret = c;
	return 1;
}

static unsigned short
readTag(void *a, Input *input, HTMLState *state)
{
	unsigned short	c;

	mark(input, -1);
	reportHTML(a, input);

	c = htmlGetByte(input, state);
	if (c == '!')
	{
		c = htmlGetByte(input, state);
		if (c == '-')
		{
			c = htmlGetByte(input, state);
			if (c == '-')
			{
				const unsigned char *beginningOfComment =
					current(input);
				while (1)
				{
					c = htmlGetByte(input, state);
					if (c == '-')
					{
						c = htmlGetByte(input, state);
						if (c == '-')
						{
							c = htmlGetByte(input,
							state);
							if (c == '>')
							{
							    return htmlGetByte(input, state);
							}
							else if (c == '-')
							{
							  do
							  {
							    c = htmlGetByte(input, state);
							  } while (c == '-');
							  if (c == '>')
							  {
							    return htmlGetByte(input, state);
							  }
							}
						}
					}
					if (c == 256)
					{
					    set(input, beginningOfComment);
					    while (1)
					    {
						c = htmlGetByte(input, state);
						if (c == '>')
						{
						    return htmlGetByte(input, state);
						}
						else if (c == 256)
						{
						    fprintf(stderr,
							"bad comment\n");
						    mark(input, -1);
						    FREE(state->html->tag);
						    state->html->tag =
							copyString((unsigned
								char *) "!--");
						    state->html->tagIsKnown = 1;
						    reportHTMLTag(a,
							state->html, input);
						    return c;
						}
					    }
					}
				}
			}
			else
			{
				htmlUnGetByte(state);
			}
		}
		else
		{
			htmlUnGetByte(state);
		}
	}
	else
	{
		htmlUnGetByte(state);
	}

	do
	{
		c = htmlGetByte(input, state);
	}
	while
	(
		(c != 256) &&
		(c != '>') &&
		(c != ' ') &&
		(c != '\t') &&
		(c != '\r') &&
		(c != '\n')
	);
	mark(input, -1);
	FREE(state->html->tag);
	state->html->tag = copyLower(input);
	if (hashLookup(knownTagTable, (*state->html->tag == '/') ?
		state->html->tag + 1 : state->html->tag))
	{
		state->html->tagIsKnown = 1;
	}
	else
	{
		state->html->tagIsKnown = 0;
	}
	reportHTMLTag(a, state->html, input);
	if (c == 256)
	{
		return c;
	}
	else if (c == '>')
	{
		return htmlGetByte(input, state);
	}
	c = eatWhiteSpace(input, state, c);
	if (c == 256)
	{
		return c;
	}
	else if (c == '>')
	{
		return htmlGetByte(input, state);
	}
	do
	{
		c = readAttribute(a, input, state, c);
	} while ((c != 256) && (c != '>'));
	state->html->currentAttribute = NULL;
	if (tagHandler)
	{
		(*tagHandler)(a, state->html);
	}
	if (c == '>')
	{
		return htmlGetByte(input, state);
	}

	return c;
}

static unsigned short
readText(void *a, Input *input, HTMLState *state)
{
	unsigned short	c;

	mark(input, -1);
	reportHTML(a, input);
	do
	{
		c = htmlGetByte(input, state);
	} while ((c != 256) && (c != '<'));
	mark(input, -1);
	reportHTMLText(a, input);

	return c;
}

static unsigned short
dealWithScript(Input *input, HTMLState *state, unsigned short c)
{
	if (state->html->tag &&
		(!strcasecmp((char *) state->html->tag, "script")))
	{
		while (1)
		{
			if (c == 256)
			{
				break;
			}
			if (c == '<')
			{
				if (caseCompare("/script>", input, state, &c))
				{
					FREE(state->html->tag);
					break;
				}
			}
			c = htmlGetByte(input, state);
		}
	}

	return c;
}

void
htmlRead(void *a, Input *input, unsigned char *base)
{
	unsigned short	c;
	HTML		html;
	HTMLState	state;

	if (!htmlInitialized)
	{
		htmlInit();
	}

	html.base = copyString(base);
	html.url = copyString(base);
	html.tag = NULL;
	html.attributes = NULL;
	html.currentAttribute = NULL;

	state.mask = 0;
	state.saved = 0;
	state.unGotten = 256;
	state.html = &html;

	c = htmlGetByte(input, &state);
	while (c != 256)
	{
		if (c == '<')
		{
			c = htmlGetByte(input, &state);
			htmlUnGetByte(&state);
			if
			(
				(('a' <= c) && (c <= 'z')) ||
				(('A' <= c) && (c <= 'Z')) ||
				(c == '/') ||
				(c == '!')
			)
			{
				c = readTag(a, input, &state);
				c = dealWithScript(input, &state, c);
			}
			else
			{
				diag(__LINE__, &state, c);
				c = readText(a, input, &state);
			}
		}
		else
		{
			c = readText(a, input, &state);
		}
	}

	FREE(html.base);
	FREE(html.tag);
	htmlFreeAttributes(&state);
}

unsigned char *
toHTML(unsigned char *str)
{
	char		buf[2];
	int		i;
	int		j;
	int		len;
	char		*replacement;
	unsigned char	*result;

	buf[1] = 0;
	len = 0;
	result = NULL;

	for (i = 0; i < 2; i++)
	{
		for (j = 0; str[j]; j++)
		{
			switch (str[j])
			{
			case '<':
				replacement = "&lt;";
				break;
			case '>':
				replacement = "&gt;";
				break;
			case '&':
				replacement = "&amp;";
				break;
			default:
				replacement = buf;
				buf[0] = str[j];
				break;
			}
			if (result)
			{
				strcat((char *) result, replacement);
			}
			else
			{
				len += strlen(replacement);
			}
		}
		if (!result)
		{
			result = calloc(len + 3, 1);
			if (!result)
			{
				fprintf(stderr,
					"cannot calloc toHTML string\n");
				exit(0);
			}
			result[0] = '"';
		}
	}
	strcat((char *) result, "\"");

	return result;
}