pjs/network/main/htmparse.c

511 строки
16 KiB
C
Исходник Обычный вид История

/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*-
*
* The contents of this file are subject to the Netscape Public License
* Version 1.0 (the "NPL"); you may not use this file except in
* compliance with the NPL. You may obtain a copy of the NPL at
* http://www.mozilla.org/NPL/
*
* Software distributed under the NPL is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the NPL
* for the specific language governing rights and limitations under the
* NPL.
*
* The Initial Developer of this code under the NPL is Netscape
* Communications Corporation. Portions created by Netscape are
* Copyright (C) 1998 Netscape Communications Corporation. All Rights
* Reserved.
*/
/*** htmparse.c ***************************************************/
/* description: html parser */
/********************************************************************
$Revision: 1.2 $
$Date: 1998-05-19 00:53:23 $
*********************************************************************/
#include "xp.h"
#include "xp_str.h"
#include "htmparse.h"
#include "prtypes.h"
#include "pa_tags.h"
#include "pa_parse.h" /* for pa_tokenize_tag */
#include "prmem.h"
#if 0
#include "prio.h" /* for test only */
#include <stdio.h> /* for test only */
#endif
/* Holds the parser's current state; takes one of the PS_* values below. */
typedef char ParseState;
/* states of the parser */
#define PS_START 0 /* starting state: '<' begins a tag, anything else begins character data */
#define PS_BETWEEN_TAGS 1 /* characters not enclosed by < > */
#define PS_TAG_NAME 2 /* accumulating the tag name that follows '<' */
#define PS_EMPTY_TAG 3 /* not referenced by the code visible in this file */
#define PS_CLOSE_BRACKET 4 /* expecting '>'; any other character is a parse error */
#define PS_ATTRIBUTE 5 /* accumulating an attribute name, or seeing '/' (RDF only) or '>' */
#define PS_EQUALS 6 /* after an attribute name: '=' introduces a value, otherwise the value is empty */
#define PS_VALUE 7 /* accumulating an attribute value (quote characters are stripped) */
#define PS_START_COMMENT 8 /* not referenced by the code visible in this file */
#define PS_END_COMMENT 9 /* not referenced by the code visible in this file */
/*
 * One parsed tag: its name, token and attribute name/value lists.
 * The name and every attribute string are heap buffers that
 * crawl_recycleTag frees; the two arrays themselves are freed by
 * crawl_destroyTag.
 */
typedef struct _CRAWL_TagStruc {
char *name; /* tag name as read from the input; NULL until a name has been completed */
intn token; /* result of pa_tokenize_tag() on the name */
char **attributeNames; /* array of attribute name strings, grown by crawl_appendStringList */
char **attributeValues; /* array of attribute value strings; original note: "max length of html attribute is 1024 chars" -- no such limit is enforced by the code visible here */
uint16 sizeNames; /* allocated capacity of attributeNames, in entries */
uint16 numNames; /* number of entries of attributeNames in use */
uint16 sizeValues; /* allocated capacity of attributeValues, in entries */
uint16 numValues; /* number of entries of attributeValues in use */
PRBool emptyTagp; /* set by the parser's RDF empty-tag ("<name/>") branch */
PRBool endTagp; /* set when a '/' is seen while reading the tag name (end tag) */
} CRAWL_TagStruc;
/* maintains state of parser */
/*
 * The state persists between calls to CRAWL_ParserPut, so input can be fed
 * in arbitrary chunks. After each callback the tag contents and the data
 * buffer are recycled (see crawl_recycleParseObj).
 */
typedef struct _CRAWL_ParseObjStruc {
ParseState state; /* current PS_* state */
CRAWL_Tag tag; /* the single tag object, reused for every tag parsed */
char *data; /* character data collected between tags; NULL while none is held */
uint16 dataLen; /* bytes of data in use */
uint16 dataSize; /* allocated capacity of data */
char *str; /* scratch buffer for the token being accumulated (tag name, attribute name or value); ownership passes to the tag when the token is complete */
uint16 strLen; /* bytes of str in use */
uint16 strSize; /* allocated capacity of str */
char prev1; /* character seen two positions before the current one */
char prev2; /* character seen immediately before the current one; with prev1, used to detect "-->" */
char inQuote; /* current quote character. when not in quote, value is '\0' */
PRBool inComment; /* we don't support comment nesting anymore */
PRBool inScript; /* inside <SCRIPT> and </SCRIPT> */
PRBool skipWhitespace; /* when set, spaces, CR and LF are consumed before the state machine resumes */
PRBool isRDF; /* enables the '/' empty-tag handling; not assigned anywhere in the code visible here (zeroed at creation) */
} CRAWL_ParseObjStruc;
/* prototypes */
/* allocates a tag with empty attribute arrays; NULL on allocation failure */
static CRAWL_Tag crawl_makeTag();
/* frees the tag's strings and resets its counters, keeping the arrays for reuse */
static void crawl_recycleTag(CRAWL_Tag tag);
/* frees the tag, its strings and its arrays */
static void crawl_destroyTag(CRAWL_Tag tag);
/* recycles the parse object's tag and releases its character data buffer */
static void crawl_recycleParseObj(CRAWL_ParseObj obj);
/* appends one character to a growable buffer; returns 0 on success, -1 on failure */
int crawl_appendString(char **str, uint16 *len, uint16 *size, char c);
/* appends one string pointer to a growable array; returns 0 on success, -1 on failure */
int crawl_appendStringList(char ***list_p, uint16 *len, uint16 *size, char *str);
/* accessors */
/*
 * Returns the parser's tag object, or NULL when the parser is currently
 * holding character data (obj->data is non-NULL) rather than a tag.
 */
PR_IMPLEMENT(CRAWL_Tag) CRAWL_GetTagParsed(CRAWL_ParseObj obj) {
  return (obj->data == NULL) ? obj->tag : NULL;
}
/*
 * Returns the character data currently held by the parser, or NULL when
 * none is held. The buffer belongs to the parse object.
 */
PR_IMPLEMENT(char*) CRAWL_GetDataParsed(CRAWL_ParseObj obj) {
  /* data is already NULL when nothing is held, so it can be returned as is */
  return obj->data;
}
/* Returns the tag's name string (owned by the tag; may be NULL if no name was read). */
PR_IMPLEMENT(char*) CRAWL_GetTagName(CRAWL_Tag tag) {
  char *tagName = tag->name;
  return tagName;
}
/* Returns the token stored for the tag (the parser fills it from pa_tokenize_tag). */
PR_IMPLEMENT(intn) CRAWL_GetTagToken(CRAWL_Tag tag) {
  intn tagToken = tag->token;
  return tagToken;
}
/* Returns the tag's empty-tag flag. */
PR_IMPLEMENT(PRBool) CRAWL_IsEmptyTag(CRAWL_Tag tag) {
  PRBool isEmpty = tag->emptyTagp;
  return isEmpty;
}
/* Returns the tag's end-tag flag. */
PR_IMPLEMENT(PRBool) CRAWL_IsEndTag(CRAWL_Tag tag) {
  PRBool isEnd = tag->endTagp;
  return isEnd;
}
/* Returns how many attribute names the tag currently holds. */
PR_IMPLEMENT(uint16) CRAWL_GetNumberOfAttributes(CRAWL_Tag tag) {
  uint16 attributeCount = tag->numNames;
  return attributeCount;
}
/*
 * Returns the name of the n-th attribute (0-based), or NULL when n is not
 * below the number of names stored. The string is owned by the tag.
 */
PR_IMPLEMENT(char*) CRAWL_GetNthAttributeName(CRAWL_Tag tag, uint16 n) {
  /* entries at or beyond numNames are uninitialised or stale; never hand them out */
  if (n >= tag->numNames) return NULL;
  return tag->attributeNames[n];
}
/*
 * Returns the value of the n-th attribute (0-based), or NULL when n is not
 * below the number of values stored. The string is owned by the tag.
 */
PR_IMPLEMENT(char*) CRAWL_GetNthAttributeValue(CRAWL_Tag tag, uint16 n) {
  /* entries at or beyond numValues are uninitialised or stale; never hand them out */
  if (n >= tag->numValues) return NULL;
  return tag->attributeValues[n];
}
/*
 * Looks up an attribute by name, ignoring case, and returns the value stored
 * at the same index. Returns NULL when no attribute name matches.
 */
PR_IMPLEMENT(char*) CRAWL_GetAttributeValue(CRAWL_Tag tag, char *attributeName) {
  int idx;

  for (idx = 0; idx < tag->numNames; idx++) {
    if (PL_strcasecmp(attributeName, tag->attributeNames[idx]) == 0) {
      return tag->attributeValues[idx];
    }
  }
  return NULL;
}
/*
 * Allocates a zeroed tag together with attribute name and value arrays of
 * four entries each. Returns NULL if any allocation fails; in that case
 * everything allocated so far is released, so nothing is leaked.
 */
static CRAWL_Tag crawl_makeTag() {
  CRAWL_Tag tag = PR_NEWZAP(CRAWL_TagStruc);
  if (tag == NULL) return NULL;
  tag->sizeNames = tag->sizeValues = 4;
  tag->attributeNames = (char**)PR_MALLOC(sizeof(char*) * tag->sizeNames);
  tag->attributeValues = (char**)PR_MALLOC(sizeof(char*) * tag->sizeValues);
  if ((tag->attributeNames == NULL) || (tag->attributeValues == NULL)) {
    /* partial failure: free whichever array was obtained, then the tag itself */
    if (tag->attributeNames != NULL) PR_Free(tag->attributeNames);
    if (tag->attributeValues != NULL) PR_Free(tag->attributeValues);
    PR_Free(tag);
    return NULL;
  }
  return tag;
}
/*
 * Resets a tag for reuse: frees the name and every stored attribute name
 * and value string, zeroes the counts and clears the empty/end flags.
 * The attribute arrays themselves are kept.
 */
static void crawl_recycleTag(CRAWL_Tag tag) {
  int i;

  if (tag->name != NULL) {
    PR_Free(tag->name);
  }
  tag->name = NULL;

  for (i = 0; i < tag->numNames; i++) {
    PR_Free(tag->attributeNames[i]);
  }
  tag->numNames = 0;

  for (i = 0; i < tag->numValues; i++) {
    PR_Free(tag->attributeValues[i]);
  }
  tag->numValues = 0;

  tag->emptyTagp = PR_FALSE;
  tag->endTagp = PR_FALSE;
}
/*
 * Frees a tag completely: first its strings (via crawl_recycleTag), then
 * the two attribute arrays, then the tag structure.
 */
static void crawl_destroyTag(CRAWL_Tag tag) {
  crawl_recycleTag(tag);

  if (tag->attributeNames != NULL) {
    PR_Free(tag->attributeNames);
  }
  if (tag->attributeValues != NULL) {
    PR_Free(tag->attributeValues);
  }

  PR_Free(tag);
}
/*
 * Prepares the parse object for the next tag or data run: recycles the tag
 * and releases the character data buffer. The scratch buffer (obj->str) and
 * the parser state flags are left untouched.
 */
static void crawl_recycleParseObj(CRAWL_ParseObj obj) {
  crawl_recycleTag(obj->tag);

  if (obj->data != NULL) {
    PR_Free(obj->data);
  }
  obj->data = NULL;
  obj->dataLen = 0;
  obj->dataSize = 0;
}
/*
 * Creates a zero-initialised parse object with its own tag.
 * Returns NULL if either allocation fails.
 */
PR_IMPLEMENT(CRAWL_ParseObj) CRAWL_MakeParseObj() {
  CRAWL_ParseObj parser = PR_NEWZAP(CRAWL_ParseObjStruc);

  if (parser != NULL) {
    parser->tag = crawl_makeTag();
    if (parser->tag == NULL) {
      /* no tag: give the half-built object back */
      PR_Free(parser);
      return NULL;
    }
  }
  return parser;
}
/*
 * Frees a parse object together with its tag, its character data buffer
 * and its scratch buffer. Passing NULL is a no-op, so callers may destroy
 * unconditionally on their cleanup paths.
 */
PR_IMPLEMENT(void) CRAWL_DestroyParseObj(CRAWL_ParseObj obj) {
  if (obj == NULL) return;
  if (obj->tag != NULL) crawl_destroyTag(obj->tag);
  if (obj->data != NULL) PR_Free(obj->data);
  if (obj->str != NULL) PR_Free(obj->str);
  /* the fields are not reset: the object itself is released right here */
  PR_Free(obj);
}
#define STRING_EXPANSION_INCREMENT 16
/*
 * Appends one character to a growable heap buffer.
 *
 * str  - in/out: the buffer (may be NULL while *size is 0)
 * len  - in/out: number of bytes in use
 * size - in/out: allocated capacity
 * c    - the character to append
 *
 * When the buffer is full it is replaced by one that is
 * STRING_EXPANSION_INCREMENT bytes larger and the old one is freed.
 *
 * returns 0 if no error, -1 if no memory or if the buffer cannot grow any
 * further because the capacity would no longer fit in a uint16. On failure
 * the buffer and both counters are left unchanged.
 */
int crawl_appendString(char **str, uint16 *len, uint16 *size, char c) {
  if (*len == *size) {
    char *newName;
    char *old = *str;
    /* growing past 0xFFFF would wrap *size to a small value and silently lose data */
    if (*size > (0xFFFF - STRING_EXPANSION_INCREMENT)) return -1;
    newName = (char*)PR_MALLOC(*size + STRING_EXPANSION_INCREMENT);
    if (newName == NULL) return -1;
    if (old != NULL) {
      /* only copy when there is an old buffer: memcpy from NULL is undefined */
      memcpy(newName, old, *size);
      PR_Free(old);
    }
    *str = newName;
    *size += STRING_EXPANSION_INCREMENT;
  }
  (*str)[*len] = c;
  ++(*len);
  return 0;
}
#define STRINGLIST_EXPANSION_INCREMENT 8
/*
 * Appends one string pointer to a growable array of strings.
 *
 * list_p - in/out: the array (replaced when it has to grow)
 * len    - in/out: number of entries in use
 * size   - in/out: allocated capacity, in entries
 * str    - the pointer to store; it is stored as is, not copied
 *
 * When the array is full it is replaced by one that is
 * STRINGLIST_EXPANSION_INCREMENT entries larger and the old one is freed.
 *
 * returns 0 if no error, -1 if no memory or if the array cannot grow any
 * further because the capacity would no longer fit in a uint16. On failure
 * the array and both counters are left unchanged; str is not stored and
 * remains the caller's to free.
 */
int crawl_appendStringList(char ***list_p, uint16 *len, uint16 *size, char *str) {
  char **list = *list_p;
  if (*len == *size) {
    char **newList;
    char **old = list;
    /* growing past 0xFFFF entries would wrap *size to a small value */
    if (*size > (0xFFFF - STRINGLIST_EXPANSION_INCREMENT)) return -1;
    newList = (char**)PR_MALLOC(sizeof(char*) * (*size + STRINGLIST_EXPANSION_INCREMENT));
    if (newList == NULL) return -1;
    if (old != NULL) {
      /* only copy when there is an old array: memcpy from NULL is undefined */
      memcpy(newList, old, (sizeof(char*) * (*size)));
      PR_Free(old);
    }
    list = newList;
    *size += STRINGLIST_EXPANSION_INCREMENT;
  }
  list[*len] = str;
  ++(*len);
  *list_p = list;
  return 0;
}
/*
 * Feeds one buffer of input to the parser. All parser state lives in obj,
 * so a document may be supplied in several consecutive calls.
 *
 * obj  - the parse object carrying the state between calls
 * str  - input buffer (need not be NUL-terminated)
 * len  - number of bytes in str
 * func - callback invoked as func(obj, isTag, data): isTag is PR_TRUE when
 *        a complete tag is available (see CRAWL_GetTagParsed) and PR_FALSE
 *        when a run of character data is available (see
 *        CRAWL_GetDataParsed). The tag contents and the data buffer are
 *        recycled as soon as the callback returns, so the callback must
 *        copy anything it wants to keep.
 * data - opaque pointer handed through to func
 *
 * Returns a status code, not a buffer index:
 *   CRAWL_PARSE_NO_ERROR      - the whole buffer was consumed
 *   CRAWL_PARSE_ERROR         - malformed input was encountered
 *   CRAWL_PARSE_OUT_OF_MEMORY - a buffer could not be grown
 *   CRAWL_PARSE_TERMINATE     - func returned PARSE_STOP
 */
PR_IMPLEMENT(int) CRAWL_ParserPut(CRAWL_ParseObj obj, char *str, uint32 len, CRAWL_ParseFunc func, void *data) {
  uint32 n = 0; /* where we are in the buffer */
  uint32 lastn = 0; /* position the last time in the loop */
  char c = '\0'; /* current character; given a value so it is defined on every path */
  while (n < len) {
    if (lastn < n) { /* we advanced a character */
      obj->prev1 = obj->prev2;
      obj->prev2 = c;
    }
    lastn = n;
    c = *(str + n);
    if (obj->inComment) {
      /* if we're in a comment, ignore everything until we detect end of comment */
      if ((obj->prev1 == '-') && (obj->prev2 == '-') && (c == '>')) obj->inComment = PR_FALSE;
      n++;
    } else if (obj->skipWhitespace) {
      /* consume whitespace; the first other character is re-examined by the state machine */
      if ((c == ' ') || (c == '\n') || (c == '\r')) {
        n++;
      } else obj->skipWhitespace = PR_FALSE;
    } else {
      PRBool endOfString = PR_FALSE;
      switch (obj->state) {
      case PS_START:
        /* PS_START - expecting open bracket or character data */
        if (c == '<') {
          obj->state = PS_TAG_NAME;
          n++;
        } else {
          /* do not consume: PS_BETWEEN_TAGS handles this character */
          obj->state = PS_BETWEEN_TAGS;
        }
        break;
      case PS_BETWEEN_TAGS:
        /* PS_BETWEEN_TAGS - expecting open bracket (terminating character data) or more character data */
        if (obj->inQuote == c) {
          obj->inQuote = '\0'; /* close quote */
        } else if ((c == '"') || (obj->inScript && (c == '\''))) { /* start a quote, only double quotes significant in between tags */
          obj->inQuote = c;
        }
        /* open bracket not in quoted section indicates end of data */
        if ((obj->inQuote == '\0') && (c == '<')) {
          obj->state = PS_START;
          if (crawl_appendString(&obj->data, &obj->dataLen, &obj->dataSize, '\0') != 0) /* null terminate string */
            return CRAWL_PARSE_OUT_OF_MEMORY;
          if (func(obj, PR_FALSE, data) == PARSE_STOP) return CRAWL_PARSE_TERMINATE;
          crawl_recycleParseObj(obj);
          /* the '<' is not consumed here; PS_START sees it next */
        } else {
          if (crawl_appendString(&obj->data, &obj->dataLen, &obj->dataSize, c) != 0)
            return CRAWL_PARSE_OUT_OF_MEMORY;
          n++;
        }
        break;
      case PS_TAG_NAME:
        /* PS_TAG_NAME - terminated by space, \r, \n, >, / */
        if ((c == '"') || (c == '\'')) return CRAWL_PARSE_ERROR; /* error - these are not allowed in tagname */
        else if (c == ' ') {
          /* Note: Both mozilla and XML don't allow any spaces between < and tagname.
             Need to check for zero-length tagname.
          */
          if (obj->str == NULL) return CRAWL_PARSE_ERROR; /* obj->str is the buffer we're working on */
          endOfString = PR_TRUE;
          obj->state = PS_ATTRIBUTE;
          obj->skipWhitespace = PR_TRUE;
          n++;
        } else if (c == '/') {
          /* NOTE(review): tag->name appears to be assigned only when this state is
             left, so this test looks always true here and the RDF empty-tag and
             error branches below look unreachable -- confirm whether obj->str
             was the intended operand. Behaviour is left unchanged. */
          if (obj->tag->name == NULL) obj->tag->endTagp = PR_TRUE; /* indicates end tag if no tag name read yet */
          else if (obj->isRDF) { /* otherwise its an empty tag (RDF only) */
            endOfString = PR_TRUE;
            obj->tag->emptyTagp = PR_TRUE;
            obj->state = PS_CLOSE_BRACKET;
          } else return CRAWL_PARSE_ERROR;
          n++;
        } else if (c == '>') {
          /* '>' is left for PS_CLOSE_BRACKET to consume */
          endOfString = PR_TRUE;
          obj->state = PS_CLOSE_BRACKET;
        } else if ((c != '\r') && (c != '\n')) {
          if (crawl_appendString(&obj->str, &obj->strLen, &obj->strSize, c) != 0)
            return CRAWL_PARSE_OUT_OF_MEMORY;
          n++;
        } else {
          endOfString = PR_TRUE;
          obj->state = PS_ATTRIBUTE; /* note - mozilla allows newline after tag name */
          obj->skipWhitespace = PR_TRUE;
          n++;
        }
        if (endOfString) {
          if (crawl_appendString(&obj->str, &obj->strLen, &obj->strSize, '\0') != 0) /* null terminate string */
            return CRAWL_PARSE_OUT_OF_MEMORY;
          if (strcmp(obj->str, "!--") == 0) { /* html comment */
            obj->inComment = PR_TRUE;
            obj->state = PS_START;
          } else {
            /* the scratch buffer becomes the tag's name; the tag now owns it */
            obj->tag->name = obj->str;
            obj->tag->token = pa_tokenize_tag(obj->str);
          }
          obj->str = NULL;
          obj->strLen = obj->strSize = 0;
          endOfString = PR_FALSE;
        }
        break;
      case PS_CLOSE_BRACKET:
        /* PS_CLOSE_BRACKET - expecting a close bracket, anything else is an error */
        if (c == '>') {
          if (!obj->isRDF && (obj->tag->token == P_SCRIPT)) {
            /* we're inside a script tag (not RDF) */
            if (obj->tag->endTagp) obj->inScript = PR_FALSE;
            else obj->inScript = PR_TRUE;
          }
          if (func(obj, PR_TRUE, data) == PARSE_STOP) return CRAWL_PARSE_TERMINATE;
          crawl_recycleParseObj(obj);
          obj->state = PS_START;
          n++;
        } else return CRAWL_PARSE_ERROR; /* error */
        break;
      case PS_ATTRIBUTE:
        /* PS_ATTRIBUTE - expecting an attribute name, or / (RDF only) or > indicating no more attributes */
        /* accept attributes without values, such as <tag attr1 attr2=val2>
           or <tag attr2=val2 attr1>
        */
        if (obj->inQuote == c) {
          obj->inQuote = '\0'; /* close quote */
        } else if (((c == '"') || (c == '\'')) && (obj->inQuote == '\0')) {
          /* start a quote if none is already in effect */
          obj->inQuote = c;
        }
        if (obj->inQuote == '\0') {
          if ((((c == '/') && obj->isRDF) || (c == '>')) && (obj->str == NULL)) {
            /* no attribute name pending: the tag is ending */
            obj->state = PS_CLOSE_BRACKET;
          } else if ((c == ' ') || (c == '=') || (c == '\n') || (c == '\r') || ((c == '/') && obj->isRDF) || (c == '>')) {
            /* end of an attribute name; the terminator is left for PS_EQUALS to examine */
            if (crawl_appendString(&obj->str, &obj->strLen, &obj->strSize, '\0') != 0) /* null terminate string */
              return CRAWL_PARSE_OUT_OF_MEMORY;
            if (crawl_appendStringList(&obj->tag->attributeNames, &obj->tag->numNames, &obj->tag->sizeNames, obj->str) != 0)
              return CRAWL_PARSE_OUT_OF_MEMORY;
            obj->str = NULL;
            obj->strLen = obj->strSize = 0;
            obj->state = PS_EQUALS; /* if non-null attribute name */
          } else {
            if (crawl_appendString(&obj->str, &obj->strLen, &obj->strSize, c) != 0)
              return CRAWL_PARSE_OUT_OF_MEMORY;
            n++;
          }
        } else {
          if (crawl_appendString(&obj->str, &obj->strLen, &obj->strSize, c) != 0)
            return CRAWL_PARSE_OUT_OF_MEMORY;
          n++;
        }
        break;
      case PS_EQUALS:
        if ((c == ' ') || (c == '\n') || (c == '\r')) {
          obj->skipWhitespace = PR_TRUE;
          n++;
        } else if (c == '=') {
          obj->skipWhitespace = PR_TRUE;
          obj->state = PS_VALUE;
          n++;
        } else { /* no value for the attribute - error in RDF? */
          /* store an empty value so names and values stay paired by index */
          if (crawl_appendString(&obj->str, &obj->strLen, &obj->strSize, '\0') != 0) /* null terminate string */
            return CRAWL_PARSE_OUT_OF_MEMORY;
          if (crawl_appendStringList(&obj->tag->attributeValues, &obj->tag->numValues, &obj->tag->sizeValues, obj->str) != 0)
            return CRAWL_PARSE_OUT_OF_MEMORY;
          obj->str = NULL;
          obj->strLen = obj->strSize = 0;
          obj->state = PS_ATTRIBUTE;
        }
        break;
      case PS_VALUE:
        /* expecting a value, or space, / (RDF only), or > indicating end of value. */
        {
          PRBool include = PR_TRUE; /* whether the current character should be included in value */
          if (obj->inQuote == c) {
            obj->inQuote = '\0'; /* close quote */
            include = PR_FALSE;
          } else if (((c == '"') || (c == '\'')) && (obj->inQuote == '\0')) {
            /* start a quote if none is already in effect */
            obj->inQuote = c;
            include = PR_FALSE;
          }
          if (obj->inQuote == '\0') {
            if ((c == '/') && obj->isRDF) {
              endOfString = PR_TRUE;
              obj->state = PS_CLOSE_BRACKET;
              n++;
            } else if (c == '>') {
              /* '>' is left for PS_CLOSE_BRACKET to consume */
              endOfString = PR_TRUE;
              obj->state = PS_CLOSE_BRACKET;
            } else if ((c == ' ') || (c == '\r') || (c == '\n')) {
              endOfString = PR_TRUE;
              obj->skipWhitespace = PR_TRUE;
              obj->state = PS_ATTRIBUTE; /* if non-null value name */
              n++;
            } else if (include) {
              if (crawl_appendString(&obj->str, &obj->strLen, &obj->strSize, c) != 0)
                return CRAWL_PARSE_OUT_OF_MEMORY;
              n++;
            } else n++;
          } else if (include) {
            if (crawl_appendString(&obj->str, &obj->strLen, &obj->strSize, c) != 0)
              return CRAWL_PARSE_OUT_OF_MEMORY;
            n++;
          } else n++;
          if (endOfString) {
            if (crawl_appendString(&obj->str, &obj->strLen, &obj->strSize, '\0') != 0) /* null terminate string */
              return CRAWL_PARSE_OUT_OF_MEMORY;
            if (crawl_appendStringList(&obj->tag->attributeValues, &obj->tag->numValues, &obj->tag->sizeValues, obj->str) != 0)
              return CRAWL_PARSE_OUT_OF_MEMORY;
            obj->str = NULL;
            obj->strLen = obj->strSize = 0;
            endOfString = PR_FALSE;
          }
          break;
        }
      default:
        break;
      }
    }
  }
  /* A consumed character is normally shifted into prev1/prev2 at the top of
     the following iteration. The last character of the buffer has no
     following iteration, so shift it in here; otherwise a "-->" split
     across two calls would never be recognised as the end of a comment. */
  if (lastn < n) {
    obj->prev1 = obj->prev2;
    obj->prev2 = c;
  }
  return CRAWL_PARSE_NO_ERROR;
}
#if 0
/*
 * Test-only callback: prints the item the parser just produced.
 * Character data is printed between ">>>>>" and "<<<<<"; tags are printed
 * in markup form with their attributes. The data argument is unused.
 */
void printParseObj(CRAWL_ParseObj obj, PRBool isTag, void *data) {
  CRAWL_Tag tag;
  uint16 attr;

  if (!isTag) {
    printf(">>>>>%s<<<<<\n", CRAWL_GetDataParsed(obj));
    return;
  }

  tag = CRAWL_GetTagParsed(obj);
  if (CRAWL_IsEndTag(tag)) {
    printf("</%s>\n", CRAWL_GetTagName(tag));
    return;
  }

  printf("<%s", CRAWL_GetTagName(tag));
  for (attr = 0; attr < CRAWL_GetNumberOfAttributes(tag); attr++) {
    printf(" %s=\"%s\"", CRAWL_GetNthAttributeName(tag, attr), CRAWL_GetNthAttributeValue(tag, attr));
  }
  if (CRAWL_IsEmptyTag(tag)) {
    printf("/>\n");
  } else {
    printf(">\n");
  }
}
void parseLocalFile (char *url) {
PRFileDesc *fp;
int32 len;
char *path;
static char buf[512]; /* xxx alloc */
CRAWL_ParseObj parse;
/* XXX need to unescape URL */
path=&url[8];
fp = PR_Open(path, PR_RDONLY, 0644); /* WR_ONLY|PR_TRUNCATE */
if(fp == NULL)
{
/* abortRDFParse(file); */
return;
}
parse = CRAWL_MakeParseObj();
while((len=PR_Read(fp, buf, 512))>0) {
int result;
result = CRAWL_ParserPut(parse, buf, len, printParseObj, NULL);
if (result == len) printf("************NO ERRORS************\n");
else printf("************PARSING ERROR************\n");
}
PR_Close(fp);
CRAWL_DestroyParseObj(parse);
/* finishRDFParse(file); */
return;
}
#endif