/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 *
 * The contents of this file are subject to the Netscape Public License
 * Version 1.0 (the "NPL"); you may not use this file except in
 * compliance with the NPL. You may obtain a copy of the NPL at
 * http://www.mozilla.org/NPL/
 *
 * Software distributed under the NPL is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the NPL
 * for the specific language governing rights and limitations under the
 * NPL.
 *
 * The Initial Developer of this code under the NPL is Netscape
 * Communications Corporation. Portions created by Netscape are
 * Copyright (C) 1998 Netscape Communications Corporation. All Rights
 * Reserved.
 */
/*** htmparse.c ***************************************************/
/* description: html parser */

/********************************************************************

  $Revision: 1.2 $
  $Date: 1998-05-19 00:53:23 $

 *********************************************************************/

#include "xp.h"
#include "xp_str.h"
#include "htmparse.h"
#include "prtypes.h"
#include "pa_tags.h"
#include "pa_parse.h" /* for pa_tokenize_tag */
#include "prmem.h"

#if 0
#include "prio.h" /* for test only */
#include <stdio.h> /* for test only */
#endif

typedef char ParseState;

/* states of the parser */
#define PS_START          0 /* starting state */
#define PS_BETWEEN_TAGS   1 /* characters not enclosed by < > */
#define PS_TAG_NAME       2
#define PS_EMPTY_TAG      3 /* not handled by CRAWL_ParserPut */
#define PS_CLOSE_BRACKET  4
#define PS_ATTRIBUTE      5
#define PS_EQUALS         6
#define PS_VALUE          7
#define PS_START_COMMENT  8 /* not handled by CRAWL_ParserPut */
#define PS_END_COMMENT    9 /* not handled by CRAWL_ParserPut */

/*
 * A parsed tag: its name, its token, and two parallel growable arrays holding
 * attribute names and attribute values. size* is the allocated capacity of an
 * array (in entries), num* the number of entries in use.
 */
typedef struct _CRAWL_TagStruc {
  char *name;
  intn token;             /* result of pa_tokenize_tag(name) */
  char **attributeNames;
  char **attributeValues; /* max length of html attribute is 1024 chars */
  uint16 sizeNames;
  uint16 numNames;
  uint16 sizeValues;
  uint16 numValues;
  PRBool emptyTagp;
  PRBool endTagp;
} CRAWL_TagStruc;

/* maintains state of parser */
typedef struct _CRAWL_ParseObjStruc {
  ParseState state;
  CRAWL_Tag tag;         /* tag being assembled; reused between callbacks */
  char *data;            /* character data collected between tags */
  uint16 dataLen;
  uint16 dataSize;
  char *str;             /* scratch buffer for the name/value being read */
  uint16 strLen;
  uint16 strSize;
  char prev1;            /* character consumed two positions back */
  char prev2;            /* character consumed one position back */
  char inQuote;          /* current quote character. when not in quote, value is '\0' */
  PRBool inComment;      /* we don't support comment nesting anymore */
  PRBool inScript;       /* inside a script element (set on a non-RDF P_SCRIPT tag) */
  PRBool skipWhitespace;
  PRBool isRDF;          /* NOTE(review): never set to PR_TRUE anywhere in this file */
} CRAWL_ParseObjStruc;

/* prototypes */
static CRAWL_Tag crawl_makeTag(void);
static void crawl_recycleTag(CRAWL_Tag tag);
static void crawl_destroyTag(CRAWL_Tag tag);
static void crawl_recycleParseObj(CRAWL_ParseObj obj);
int crawl_appendString(char **str, uint16 *len, uint16 *size, char c);
int crawl_appendStringList(char ***list_p, uint16 *len, uint16 *size, char *str);

/* accessors */

/* Returns the current tag, or NULL when the parse object currently holds
   character data instead of a tag. */
PR_IMPLEMENT(CRAWL_Tag)
CRAWL_GetTagParsed(CRAWL_ParseObj obj)
{
  if (obj->data != NULL)
    return NULL;
  return obj->tag;
}

/* Returns the character data collected so far, or NULL if there is none. */
PR_IMPLEMENT(char*)
CRAWL_GetDataParsed(CRAWL_ParseObj obj)
{
  return obj->data;
}

PR_IMPLEMENT(char*)
CRAWL_GetTagName(CRAWL_Tag tag)
{
  return tag->name;
}

PR_IMPLEMENT(intn)
CRAWL_GetTagToken(CRAWL_Tag tag)
{
  return tag->token;
}

PR_IMPLEMENT(PRBool)
CRAWL_IsEmptyTag(CRAWL_Tag tag)
{
  return tag->emptyTagp;
}

PR_IMPLEMENT(PRBool)
CRAWL_IsEndTag(CRAWL_Tag tag)
{
  return tag->endTagp;
}

PR_IMPLEMENT(uint16)
CRAWL_GetNumberOfAttributes(CRAWL_Tag tag)
{
  return tag->numNames;
}

/* Returns the n-th (0-based) attribute name, or NULL if n is out of range. */
PR_IMPLEMENT(char*)
CRAWL_GetNthAttributeName(CRAWL_Tag tag, uint16 n)
{
  if (n >= tag->numNames)
    return NULL;
  return tag->attributeNames[n];
}

/* Returns the n-th (0-based) attribute value, or NULL if n is out of range. */
PR_IMPLEMENT(char*)
CRAWL_GetNthAttributeValue(CRAWL_Tag tag, uint16 n)
{
  if (n >= tag->numValues)
    return NULL;
  return tag->attributeValues[n];
}

/* Returns the value of the first attribute whose name matches attributeName
   (case-insensitively), or NULL if there is no such attribute. */
PR_IMPLEMENT(char*)
CRAWL_GetAttributeValue(CRAWL_Tag tag, char *attributeName)
{
  int count;

  /* names and values are parallel arrays; never index past either of them */
  for (count = 0; (count < tag->numNames) && (count < tag->numValues); count++) {
    if (PL_strcasecmp(attributeName, tag->attributeNames[count]) == 0)
      return tag->attributeValues[count];
  }
  return NULL;
}

/* Allocates a zeroed tag with room for 4 attribute names and 4 values.
   Returns NULL, with nothing left allocated, if any allocation fails. */
static CRAWL_Tag
crawl_makeTag(void)
{
  CRAWL_Tag tag = PR_NEWZAP(CRAWL_TagStruc);
  if (tag == NULL)
    return NULL;

  tag->sizeNames = tag->sizeValues = 4;
  tag->attributeNames = (char**)PR_MALLOC(sizeof(char*) * tag->sizeNames);
  tag->attributeValues = (char**)PR_MALLOC(sizeof(char*) * tag->sizeValues);
  if ((tag->attributeNames == NULL) || (tag->attributeValues == NULL)) {
    /* release whatever was obtained so a partial failure does not leak */
    if (tag->attributeNames != NULL)
      PR_Free(tag->attributeNames);
    if (tag->attributeValues != NULL)
      PR_Free(tag->attributeValues);
    PR_Free(tag);
    return NULL;
  }
  return tag;
}

/* Frees the tag's name and every attribute name/value string and resets the
   counts and flags, keeping the (empty) attribute arrays for reuse. */
static void
crawl_recycleTag(CRAWL_Tag tag)
{
  int count;

  if (tag->name != NULL)
    PR_Free(tag->name);
  tag->name = NULL;

  for (count = 0; count < tag->numNames; count++) {
    PR_Free(tag->attributeNames[count]);
  }
  tag->numNames = 0;

  for (count = 0; count < tag->numValues; count++) {
    PR_Free(tag->attributeValues[count]);
  }
  tag->numValues = 0;

  tag->emptyTagp = PR_FALSE;
  tag->endTagp = PR_FALSE;
}

/* Frees the tag, its attribute arrays and everything they reference. */
static void
crawl_destroyTag(CRAWL_Tag tag)
{
  crawl_recycleTag(tag);
  if (tag->attributeNames != NULL)
    PR_Free(tag->attributeNames);
  if (tag->attributeValues != NULL)
    PR_Free(tag->attributeValues);
  PR_Free(tag);
}

/* Clears the tag and the character-data buffer so the parse object can be
   reused for the next tag or data run. obj->str is left untouched. */
static void
crawl_recycleParseObj(CRAWL_ParseObj obj)
{
  crawl_recycleTag(obj->tag);
  if (obj->data != NULL)
    PR_Free(obj->data);
  obj->data = NULL;
  obj->dataLen = obj->dataSize = 0;
}

/* Creates a zeroed parse object (state PS_START) with an empty tag.
   Returns NULL if memory cannot be allocated. */
PR_IMPLEMENT(CRAWL_ParseObj)
CRAWL_MakeParseObj()
{
  CRAWL_ParseObj obj = PR_NEWZAP(CRAWL_ParseObjStruc);
  if (obj == NULL)
    return NULL;

  obj->tag = crawl_makeTag();
  if (obj->tag == NULL) {
    PR_Free(obj);
    return NULL;
  }
  return obj;
}

/* Frees a parse object and everything it owns. A NULL obj is ignored. */
PR_IMPLEMENT(void)
CRAWL_DestroyParseObj(CRAWL_ParseObj obj)
{
  if (obj == NULL)
    return;

  crawl_destroyTag(obj->tag);

  if (obj->data != NULL)
    PR_Free(obj->data);
  obj->data = NULL;
  obj->dataLen = obj->dataSize = 0;

  if (obj->str != NULL)
    PR_Free(obj->str);
  obj->str = NULL;
  obj->strLen = obj->strSize = 0;

  PR_Free(obj);
}

#define STRING_EXPANSION_INCREMENT 16

/*
 * Appends c to the growable buffer *str (*len bytes used, *size allocated),
 * growing it by STRING_EXPANSION_INCREMENT bytes when full. *str may be NULL
 * when *size is 0. The buffer is not NUL-terminated unless the caller appends
 * '\0' itself.
 *
 * returns 0 if no error, -1 if no memory
 *
 * NOTE(review): *len and *size are uint16, so the bookkeeping wraps around
 * once a buffer grows past 65535 bytes; confirm whether longer strings need
 * to be supported.
 */
int
crawl_appendString(char **str, uint16 *len, uint16 *size, char c)
{
  if (*len == *size) {
    char *grown = (char*)PR_MALLOC(*size + STRING_EXPANSION_INCREMENT);
    if (grown == NULL)
      return -1;
    if (*str != NULL) {
      /* only copy when there is an old buffer: memcpy from NULL is undefined */
      memcpy(grown, *str, *size);
      PR_Free(*str);
    }
    *str = grown;
    *size += STRING_EXPANSION_INCREMENT;
  }
  (*str)[*len] = c;
  ++(*len);
  return 0;
}

#define STRINGLIST_EXPANSION_INCREMENT 8

/*
 * Appends the pointer str (not a copy) to the growable array *list_p (*len
 * entries used, *size allocated), growing it by STRINGLIST_EXPANSION_INCREMENT
 * entries when full.
 *
 * returns 0 if no error, -1 if no memory
 */
int
crawl_appendStringList(char ***list_p, uint16 *len, uint16 *size, char *str)
{
  char **list = *list_p;

  if (*len == *size) {
    char **grown = (char**)PR_MALLOC(sizeof(char*) * (*size + STRINGLIST_EXPANSION_INCREMENT));
    if (grown == NULL)
      return -1;
    if (list != NULL) {
      /* only copy when there is an old array: memcpy from NULL is undefined */
      memcpy(grown, list, (sizeof(char*) * (*size)));
      PR_Free(list);
    }
    list = grown;
    *size += STRINGLIST_EXPANSION_INCREMENT;
  }
  list[*len] = str;
  ++(*len);
  *list_p = list;
  return 0;
}

/*
 * Feeds len bytes from str to the parser. Parser state is kept in obj, so a
 * document may be supplied in several consecutive calls.
 *
 * func is called as func(obj, PR_FALSE, data) when a run of character data is
 * complete and as func(obj, PR_TRUE, data) when a tag is complete; afterwards
 * the tag and data held by obj are recycled. If func returns PARSE_STOP,
 * parsing stops immediately.
 *
 * Returns CRAWL_PARSE_NO_ERROR when the whole buffer was consumed,
 * CRAWL_PARSE_TERMINATE when func returned PARSE_STOP,
 * CRAWL_PARSE_OUT_OF_MEMORY when a buffer could not be grown, and
 * CRAWL_PARSE_ERROR on malformed input.
 */
PR_IMPLEMENT(int)
CRAWL_ParserPut(CRAWL_ParseObj obj, char *str, uint32 len, CRAWL_ParseFunc func, void *data)
{
  uint32 n = 0;     /* where we are in the buffer */
  uint32 lastn = 0; /* position the last time in the loop */
  char c = '\0';

  while (n < len) {
    if (lastn < n) { /* we advanced a character */
      obj->prev1 = obj->prev2;
      obj->prev2 = c;
    }
    lastn = n;
    c = *(str + n);

    if (obj->inComment) {
      /* if we're in a comment, ignore everything until we detect end of comment */
      if ((obj->prev1 == '-') && (obj->prev2 == '-') && (c == '>'))
        obj->inComment = PR_FALSE;
      n++;
    } else if (obj->skipWhitespace) {
      if ((c == ' ') || (c == '\n') || (c == '\r')) {
        n++;
      } else {
        /* leave c unconsumed; it is re-examined by the state machine */
        obj->skipWhitespace = PR_FALSE;
      }
    } else {
      PRBool endOfString = PR_FALSE;

      switch (obj->state) {
      case PS_START:
        /* PS_START - expecting open bracket or character data */
        if (c == '<') {
          obj->state = PS_TAG_NAME;
          n++;
        } else {
          obj->state = PS_BETWEEN_TAGS;
        }
        break;

      case PS_BETWEEN_TAGS:
        /* PS_BETWEEN_TAGS - expecting open bracket (terminating character data)
           or more character data */
        if (obj->inQuote == c) {
          obj->inQuote = '\0'; /* close quote */
        } else if ((c == '"') || (obj->inScript && (c == '\''))) {
          /* start a quote, only double quotes significant in between tags */
          obj->inQuote = c;
        }
        /* open bracket not in quoted section indicates end of data */
        if ((obj->inQuote == '\0') && (c == '<')) {
          obj->state = PS_START;
          if (crawl_appendString(&obj->data, &obj->dataLen, &obj->dataSize, '\0') != 0) /* null terminate string */
            return CRAWL_PARSE_OUT_OF_MEMORY;
          if (func(obj, PR_FALSE, data) == PARSE_STOP)
            return CRAWL_PARSE_TERMINATE;
          crawl_recycleParseObj(obj);
        } else {
          if (crawl_appendString(&obj->data, &obj->dataLen, &obj->dataSize, c) != 0)
            return CRAWL_PARSE_OUT_OF_MEMORY;
          n++;
        }
        break;

      case PS_TAG_NAME:
        /* PS_TAG_NAME - terminated by space, \r, \n, >, / */
        if ((c == '"') || (c == '\'')) {
          return CRAWL_PARSE_ERROR; /* error - these are not allowed in tagname */
        } else if (c == ' ') {
          /* Note: Both mozilla and XML don't allow any spaces between < and tagname.
             Need to check for zero-length tagname. */
          if (obj->str == NULL)
            return CRAWL_PARSE_ERROR; /* obj->str is the buffer we're working on */
          endOfString = PR_TRUE;
          obj->state = PS_ATTRIBUTE;
          obj->skipWhitespace = PR_TRUE;
          n++;
        } else if (c == '/') {
          /* NOTE(review): tag->name is only assigned in the endOfString block
             below, which always leaves this state, so it appears to be NULL
             whenever we get here and the first branch is always taken (a '/'
             anywhere in the tag name marks an end tag). Confirm before relying
             on the other two branches. */
          if (obj->tag->name == NULL) {
            obj->tag->endTagp = PR_TRUE; /* indicates end tag if no tag name read yet */
          } else if (obj->isRDF) {
            /* otherwise its an empty tag (RDF only) */
            endOfString = PR_TRUE;
            obj->tag->emptyTagp = PR_TRUE;
            obj->state = PS_CLOSE_BRACKET;
          } else {
            return CRAWL_PARSE_ERROR;
          }
          n++;
        } else if (c == '>') {
          /* '>' is left unconsumed for PS_CLOSE_BRACKET */
          endOfString = PR_TRUE;
          obj->state = PS_CLOSE_BRACKET;
        } else if ((c != '\r') && (c != '\n')) {
          if (crawl_appendString(&obj->str, &obj->strLen, &obj->strSize, c) != 0)
            return CRAWL_PARSE_OUT_OF_MEMORY;
          n++;
        } else {
          endOfString = PR_TRUE;
          obj->state = PS_ATTRIBUTE; /* note - mozilla allows newline after tag name */
          obj->skipWhitespace = PR_TRUE;
          n++;
        }

        if (endOfString) {
          if (crawl_appendString(&obj->str, &obj->strLen, &obj->strSize, '\0') != 0) /* null terminate string */
            return CRAWL_PARSE_OUT_OF_MEMORY;
          if (strcmp(obj->str, "!--") == 0) { /* html comment */
            obj->inComment = PR_TRUE;
            obj->state = PS_START;
            /* the name buffer is not handed to the tag, so release it here
               instead of leaking it when obj->str is cleared below */
            PR_Free(obj->str);
          } else {
            /* ownership of the buffer passes to the tag */
            obj->tag->name = obj->str;
            obj->tag->token = pa_tokenize_tag(obj->str);
          }
          obj->str = NULL;
          obj->strLen = obj->strSize = 0;
          endOfString = PR_FALSE;
        }
        break;

      case PS_CLOSE_BRACKET:
        /* PS_CLOSE_BRACKET - expecting a close bracket, anything else is an error */
        if (c == '>') {
          if (!obj->isRDF && (obj->tag->token == P_SCRIPT)) {
            /* we're inside a script tag (not RDF) */
            if (obj->tag->endTagp)
              obj->inScript = PR_FALSE;
            else
              obj->inScript = PR_TRUE;
          }
          if (func(obj, PR_TRUE, data) == PARSE_STOP)
            return CRAWL_PARSE_TERMINATE;
          crawl_recycleParseObj(obj);
          obj->state = PS_START;
          n++;
        } else {
          return CRAWL_PARSE_ERROR; /* error */
        }
        break;

      case PS_ATTRIBUTE:
        /* PS_ATTRIBUTE - expecting an attribute name, or / (RDF only) or >
           indicating no more attributes */
        /* accept attributes without values (a bare name with no '=' after it) */
        if (obj->inQuote == c) {
          obj->inQuote = '\0'; /* close quote */
        } else if (((c == '"') || (c == '\'')) && (obj->inQuote == '\0')) {
          /* start a quote if none is already in effect */
          obj->inQuote = c;
        }
        if (obj->inQuote == '\0') {
          if ((((c == '/') && obj->isRDF) || (c == '>')) && (obj->str == NULL)) {
            /* no attribute name pending: the tag is ending */
            obj->state = PS_CLOSE_BRACKET;
          } else if ((c == ' ') || (c == '=') || (c == '\n') || (c == '\r') ||
                     ((c == '/') && obj->isRDF) || (c == '>')) {
            /* terminator: store the name; c is left for PS_EQUALS to examine */
            if (crawl_appendString(&obj->str, &obj->strLen, &obj->strSize, '\0') != 0) /* null terminate string */
              return CRAWL_PARSE_OUT_OF_MEMORY;
            if (crawl_appendStringList(&obj->tag->attributeNames, &obj->tag->numNames,
                                       &obj->tag->sizeNames, obj->str) != 0)
              return CRAWL_PARSE_OUT_OF_MEMORY;
            obj->str = NULL;
            obj->strLen = obj->strSize = 0;
            obj->state = PS_EQUALS; /* if non-null attribute name */
          } else {
            if (crawl_appendString(&obj->str, &obj->strLen, &obj->strSize, c) != 0)
              return CRAWL_PARSE_OUT_OF_MEMORY;
            n++;
          }
        } else {
          /* inside a quote every character, including the quote that opened
             it, becomes part of the name */
          if (crawl_appendString(&obj->str, &obj->strLen, &obj->strSize, c) != 0)
            return CRAWL_PARSE_OUT_OF_MEMORY;
          n++;
        }
        break;

      case PS_EQUALS:
        if ((c == ' ') || (c == '\n') || (c == '\r')) {
          obj->skipWhitespace = PR_TRUE;
          n++;
        } else if (c == '=') {
          obj->skipWhitespace = PR_TRUE;
          obj->state = PS_VALUE;
          n++;
        } else {
          /* no value for the attribute - error in RDF? */
          /* store an empty value so names and values stay paired up */
          if (crawl_appendString(&obj->str, &obj->strLen, &obj->strSize, '\0') != 0) /* null terminate string */
            return CRAWL_PARSE_OUT_OF_MEMORY;
          if (crawl_appendStringList(&obj->tag->attributeValues, &obj->tag->numValues,
                                     &obj->tag->sizeValues, obj->str) != 0)
            return CRAWL_PARSE_OUT_OF_MEMORY;
          obj->str = NULL;
          obj->strLen = obj->strSize = 0;
          obj->state = PS_ATTRIBUTE;
        }
        break;

      case PS_VALUE:
        /* expecting a value, or space, / (RDF only), or > indicating end of value. */
        {
          PRBool include = PR_TRUE; /* whether the current character should be included in value */

          if (obj->inQuote == c) {
            obj->inQuote = '\0'; /* close quote */
            include = PR_FALSE;
          } else if (((c == '"') || (c == '\'')) && (obj->inQuote == '\0')) {
            /* start a quote if none is already in effect */
            obj->inQuote = c;
            include = PR_FALSE;
          }

          if (obj->inQuote == '\0') {
            if ((c == '/') && obj->isRDF) {
              endOfString = PR_TRUE;
              obj->state = PS_CLOSE_BRACKET;
              n++;
            } else if (c == '>') {
              /* '>' is left unconsumed for PS_CLOSE_BRACKET */
              endOfString = PR_TRUE;
              obj->state = PS_CLOSE_BRACKET;
            } else if ((c == ' ') || (c == '\r') || (c == '\n')) {
              endOfString = PR_TRUE;
              obj->skipWhitespace = PR_TRUE;
              obj->state = PS_ATTRIBUTE; /* if non-null value name */
              n++;
            } else if (include) {
              if (crawl_appendString(&obj->str, &obj->strLen, &obj->strSize, c) != 0)
                return CRAWL_PARSE_OUT_OF_MEMORY;
              n++;
            } else {
              n++;
            }
          } else if (include) {
            if (crawl_appendString(&obj->str, &obj->strLen, &obj->strSize, c) != 0)
              return CRAWL_PARSE_OUT_OF_MEMORY;
            n++;
          } else {
            n++;
          }

          if (endOfString) {
            if (crawl_appendString(&obj->str, &obj->strLen, &obj->strSize, '\0') != 0) /* null terminate string */
              return CRAWL_PARSE_OUT_OF_MEMORY;
            if (crawl_appendStringList(&obj->tag->attributeValues, &obj->tag->numValues,
                                       &obj->tag->sizeValues, obj->str) != 0)
              return CRAWL_PARSE_OUT_OF_MEMORY;
            obj->str = NULL;
            obj->strLen = obj->strSize = 0;
            endOfString = PR_FALSE;
          }
          break;
        }

      default:
        /* A state with no handler would neither consume input nor change
           state, so the loop could never finish; report it instead. */
        return CRAWL_PARSE_ERROR;
      }
    }
  }

  /* Record the last character consumed from this buffer so that an end of
     comment ("-->") split across two calls is still recognised. */
  if (lastn < n) {
    obj->prev1 = obj->prev2;
    obj->prev2 = c;
  }

  return CRAWL_PARSE_NO_ERROR;
}

#if 0
/* Test-only code, compiled out. */
void
printParseObj(CRAWL_ParseObj obj, PRBool isTag, void *data)
{
  if (isTag) {
    CRAWL_Tag tag = CRAWL_GetTagParsed(obj);
    if (CRAWL_IsEndTag(tag)) {
      printf("</%s>\n", CRAWL_GetTagName(tag));
    } else {
      uint16 i;
      printf("<%s", CRAWL_GetTagName(tag));
      for (i = 0; i < CRAWL_GetNumberOfAttributes(tag); i++) {
        printf(" %s=\"%s\"", CRAWL_GetNthAttributeName(tag, i), CRAWL_GetNthAttributeValue(tag, i));
      }
      if (CRAWL_IsEmptyTag(tag))
        printf("/>\n");
      else
        printf(">\n");
    }
  } else
    printf(">>>>>%s<<<<<\n", CRAWL_GetDataParsed(obj));
}

void
parseLocalFile (char *url)
{
  PRFileDesc *fp;
  int32 len;
  char *path;
  static char buf[512]; /* xxx alloc */
  CRAWL_ParseObj parse;

  /* XXX need to unescape URL */
  path=&url[8];
  fp = PR_Open(path, PR_RDONLY, 0644); /* WR_ONLY|PR_TRUNCATE */
  if(fp == NULL) {
    /* abortRDFParse(file); */
    return;
  }
  parse = CRAWL_MakeParseObj();
  while((len=PR_Read(fp, buf, 512))>0) {
    int result;
    result = CRAWL_ParserPut(parse, buf, len, printParseObj, NULL);
    /* NOTE(review): CRAWL_ParserPut returns a CRAWL_PARSE_* status, not a
       byte count, so this comparison looks stale. */
    if (result == len)
      printf("************NO ERRORS************\n");
    else
      printf("************PARSING ERROR************\n");
  }
  PR_Close(fp);
  CRAWL_DestroyParseObj(parse);
  /* finishRDFParse(file); */
  return;
}
#endif