From c1ce4a8d423012ecc6701b4a2bab057adc0c86cd Mon Sep 17 00:00:00 2001 From: "jgellman%netscape.com" Date: Thu, 20 Aug 1998 21:20:50 +0000 Subject: [PATCH] 2nd try; new parser from james clark --- modules/xml/expat/xmlparse/hashtable.c | 19 +- modules/xml/expat/xmlparse/hashtable.h | 10 +- modules/xml/expat/xmlparse/xmlparse.c | 1578 +++++++++++++++++++----- modules/xml/expat/xmlparse/xmlparse.h | 295 ++++- modules/xml/expat/xmltok/xmlrole.c | 2 +- modules/xml/expat/xmltok/xmlrole.h | 1 + modules/xml/expat/xmltok/xmltok.c | 606 +++++++-- modules/xml/expat/xmltok/xmltok.h | 74 +- modules/xml/expat/xmltok/xmltok_impl.c | 188 ++- modules/xml/macbuild/XML.mcp.exp | 7 +- 10 files changed, 2269 insertions(+), 511 deletions(-) diff --git a/modules/xml/expat/xmlparse/hashtable.c b/modules/xml/expat/xmlparse/hashtable.c index e029f9f44227..2876975bd2bb 100644 --- a/modules/xml/expat/xmlparse/hashtable.c +++ b/modules/xml/expat/xmlparse/hashtable.c @@ -1,7 +1,7 @@ /* The contents of this file are subject to the Mozilla Public License Version 1.0 (the "License"); you may not use this file except in -compliance with the License. You may obtain a copy of the License at +compliance with the License. You may obtain a copy of the License at http://www.mozilla.org/MPL/ Software distributed under the License is distributed on an "AS IS" @@ -18,15 +18,22 @@ James Clark. All Rights Reserved. 
Contributor(s): */ -#include "xmldef.h" -#include "hashtable.h" #include #include +#include "xmldef.h" +#include "hashtable.h" + +#ifdef XML_UNICODE +#define keycmp wcscmp +#else +#define keycmp strcmp +#endif + #define INIT_SIZE 64 static -unsigned long hash(const char *s) +unsigned long hash(KEY s) { unsigned long h = 0; while (*s) @@ -34,7 +41,7 @@ unsigned long hash(const char *s) return h; } -NAMED *lookup(HASH_TABLE *table, const char *name, size_t createSize) +NAMED *lookup(HASH_TABLE *table, KEY name, size_t createSize) { size_t i; if (table->size == 0) { @@ -52,7 +59,7 @@ NAMED *lookup(HASH_TABLE *table, const char *name, size_t createSize) for (i = h & (table->size - 1); table->v[i]; i == 0 ? i = table->size - 1 : --i) { - if (strcmp(name, table->v[i]->name) == 0) + if (keycmp(name, table->v[i]->name) == 0) return table->v[i]; } if (!createSize) diff --git a/modules/xml/expat/xmlparse/hashtable.h b/modules/xml/expat/xmlparse/hashtable.h index 19ec9902bdfd..d10e591c7ff8 100644 --- a/modules/xml/expat/xmlparse/hashtable.h +++ b/modules/xml/expat/xmlparse/hashtable.h @@ -21,8 +21,14 @@ Contributor(s): #include +#ifdef XML_UNICODE +typedef const wchar_t *KEY; +#else +typedef const char *KEY; +#endif + typedef struct { - const char *name; + KEY name; } NAMED; typedef struct { @@ -32,7 +38,7 @@ typedef struct { size_t usedLim; } HASH_TABLE; -NAMED *lookup(HASH_TABLE *table, const char *name, size_t createSize); +NAMED *lookup(HASH_TABLE *table, KEY name, size_t createSize); void hashTableInit(HASH_TABLE *); void hashTableDestroy(HASH_TABLE *); diff --git a/modules/xml/expat/xmlparse/xmlparse.c b/modules/xml/expat/xmlparse/xmlparse.c index 3c733bc0eb0e..4709579a7f63 100644 --- a/modules/xml/expat/xmlparse/xmlparse.c +++ b/modules/xml/expat/xmlparse/xmlparse.c @@ -18,16 +18,43 @@ James Clark. All Rights Reserved. 
Contributor(s): */ +#include +#include +#include + #include "xmldef.h" + +#ifdef XML_UNICODE +#define XML_ENCODE_MAX XML_UTF16_ENCODE_MAX +#define XmlConvert XmlUtf16Convert +#define XmlGetInternalEncoding XmlGetUtf16InternalEncoding +#define XmlEncode XmlUtf16Encode +#define MUST_CONVERT(enc, s) (!(enc)->isUtf16 || (((unsigned long)s) & 1)) +typedef unsigned short ICHAR; +#else +#define XML_ENCODE_MAX XML_UTF8_ENCODE_MAX +#define XmlConvert XmlUtf8Convert +#define XmlGetInternalEncoding XmlGetUtf8InternalEncoding +#define XmlEncode XmlUtf8Encode +#define MUST_CONVERT(enc, s) (!(enc)->isUtf8) +typedef char ICHAR; +#endif + +#ifdef XML_UNICODE_WCHAR_T +#define XML_T(x) L ## x +#else +#define XML_T(x) x +#endif + +/* Round up n to be a multiple of sz, where sz is a power of 2. */ +#define ROUND_UP(n, sz) (((n) + ((sz) - 1)) & ~((sz) - 1)) + #include "xmlparse.h" #include "xmltok.h" #include "xmlrole.h" #include "hashtable.h" -#include -#include - -#define INIT_TAG_BUF_SIZE 32 +#define INIT_TAG_BUF_SIZE 32 /* must be a multiple of sizeof(XML_Char) */ #define INIT_DATA_BUF_SIZE 1024 #define INIT_ATTS_SIZE 16 #define INIT_BLOCK_SIZE 1024 @@ -37,51 +64,51 @@ typedef struct tag { struct tag *parent; const char *rawName; int rawNameLength; - const char *name; + const XML_Char *name; char *buf; char *bufEnd; } TAG; typedef struct { - const char *name; - const char *textPtr; + const XML_Char *name; + const XML_Char *textPtr; int textLen; - const char *systemId; - const char *publicId; - const char *notation; + const XML_Char *systemId; + const XML_Char *base; + const XML_Char *publicId; + const XML_Char *notation; char open; - char magic; } ENTITY; typedef struct block { struct block *next; int size; - char s[1]; + XML_Char s[1]; } BLOCK; typedef struct { BLOCK *blocks; BLOCK *freeBlocks; - const char *end; - char *ptr; - char *start; + const XML_Char *end; + XML_Char *ptr; + XML_Char *start; } STRING_POOL; -/* The byte before the name is a scratch byte used to determine 
whether +/* The XML_Char before the name is used to determine whether an attribute has been specified. */ typedef struct { - char *name; + XML_Char *name; char maybeTokenized; } ATTRIBUTE_ID; typedef struct { const ATTRIBUTE_ID *id; char isCdata; - const char *value; + const XML_Char *value; } DEFAULT_ATTRIBUTE; typedef struct { - const char *name; + const XML_Char *name; int nDefaultAtts; int allocDefaultAtts; DEFAULT_ATTRIBUTE *defaultAtts; @@ -94,6 +121,7 @@ typedef struct { STRING_POOL pool; int complete; int standalone; + const XML_Char *base; } DTD; typedef enum XML_Error Processor(XML_Parser parser, @@ -102,16 +130,30 @@ typedef enum XML_Error Processor(XML_Parser parser, const char **endPtr); static Processor prologProcessor; +static Processor prologInitProcessor; static Processor contentProcessor; +static Processor cdataSectionProcessor; static Processor epilogProcessor; static Processor errorProcessor; +static Processor externalEntityInitProcessor; +static Processor externalEntityInitProcessor2; +static Processor externalEntityInitProcessor3; +static Processor externalEntityContentProcessor; +static enum XML_Error +handleUnknownEncoding(XML_Parser parser, const XML_Char *encodingName); +static enum XML_Error +processXmlDecl(XML_Parser parser, int isGeneralTextEntity, const char *, const char *); +static enum XML_Error +initializeEncoding(XML_Parser parser); static enum XML_Error doContent(XML_Parser parser, int startTagLevel, const ENCODING *enc, const char *start, const char *end, const char **endPtr); -static enum XML_Error storeAtts(XML_Parser parser, const ENCODING *, const char *tagName, const char *s); +static enum XML_Error +doCdataSection(XML_Parser parser, const ENCODING *, const char **startPtr, const char *end, const char **nextPtr); +static enum XML_Error storeAtts(XML_Parser parser, const ENCODING *, const XML_Char *tagName, const char *s); static int -defineAttribute(ELEMENT_TYPE *type, ATTRIBUTE_ID *, int isCdata, const char *dfltValue); 
+defineAttribute(ELEMENT_TYPE *type, ATTRIBUTE_ID *, int isCdata, const XML_Char *dfltValue); static enum XML_Error storeAttributeValue(XML_Parser parser, const ENCODING *, int isCdata, const char *, const char *, STRING_POOL *); @@ -124,31 +166,42 @@ static enum XML_Error storeEntityValue(XML_Parser parser, const char *start, const char *end); static int reportProcessingInstruction(XML_Parser parser, const ENCODING *enc, const char *start, const char *end); +static void +reportDefault(XML_Parser parser, const ENCODING *enc, const char *start, const char *end); +static const XML_Char *getOpenEntityNames(XML_Parser parser); +static int setOpenEntityNames(XML_Parser parser, const XML_Char *openEntityNames); +static void normalizePublicId(XML_Char *s); static int dtdInit(DTD *); static void dtdDestroy(DTD *); +static int dtdCopy(DTD *newDtd, const DTD *oldDtd); static void poolInit(STRING_POOL *); static void poolClear(STRING_POOL *); static void poolDestroy(STRING_POOL *); -static char *poolAppend(STRING_POOL *pool, const ENCODING *enc, - const char *ptr, const char *end); -static char *poolStoreString(STRING_POOL *pool, const ENCODING *enc, - const char *ptr, const char *end); +static XML_Char *poolAppend(STRING_POOL *pool, const ENCODING *enc, + const char *ptr, const char *end); +static XML_Char *poolStoreString(STRING_POOL *pool, const ENCODING *enc, + const char *ptr, const char *end); static int poolGrow(STRING_POOL *pool); +static const XML_Char *poolCopyString(STRING_POOL *pool, const XML_Char *s); +static const XML_Char *poolCopyStringN(STRING_POOL *pool, const XML_Char *s, int n); #define poolStart(pool) ((pool)->start) #define poolEnd(pool) ((pool)->ptr) #define poolLength(pool) ((pool)->ptr - (pool)->start) #define poolChop(pool) ((void)--(pool->ptr)) -#define poolLastByte(pool) (((pool)->ptr)[-1]) +#define poolLastChar(pool) (((pool)->ptr)[-1]) #define poolDiscard(pool) ((pool)->ptr = (pool)->start) #define poolFinish(pool) ((pool)->start = (pool)->ptr) 
-#define poolAppendByte(pool, c) \ +#define poolAppendChar(pool, c) \ (((pool)->ptr == (pool)->end && !poolGrow(pool)) \ ? 0 \ : ((*((pool)->ptr)++ = c), 1)) typedef struct { + /* The first member must be userData so that the XML_GetUserData macro works. */ + void *userData; + void *handlerArg; char *buffer; /* first character to be parsed */ const char *bufferPtr; @@ -156,22 +209,36 @@ typedef struct { char *bufferEnd; /* allocated end of buffer */ const char *bufferLim; - long bufferEndByteIndex; - char *dataBuf; - char *dataBufEnd; - void *userData; + long parseEndByteIndex; + const char *parseEndPtr; + XML_Char *dataBuf; + XML_Char *dataBufEnd; XML_StartElementHandler startElementHandler; XML_EndElementHandler endElementHandler; XML_CharacterDataHandler characterDataHandler; XML_ProcessingInstructionHandler processingInstructionHandler; + XML_DefaultHandler defaultHandler; + XML_UnparsedEntityDeclHandler unparsedEntityDeclHandler; + XML_NotationDeclHandler notationDeclHandler; + XML_ExternalEntityRefHandler externalEntityRefHandler; + XML_UnknownEncodingHandler unknownEncodingHandler; const ENCODING *encoding; INIT_ENCODING initEncoding; + const XML_Char *protocolEncodingName; + void *unknownEncodingMem; + void *unknownEncodingData; + void *unknownEncodingHandlerData; + void (*unknownEncodingRelease)(void *); PROLOG_STATE prologState; Processor *processor; enum XML_Error errorCode; - const char *errorPtr; + const char *eventPtr; + const char *eventEndPtr; + const char *positionPtr; int tagLevel; ENTITY *declEntity; + const XML_Char *declNotationName; + const XML_Char *declNotationPublicId; ELEMENT_TYPE *declElementType; ATTRIBUTE_ID *declAttributeId; char declAttributeIsCdata; @@ -181,7 +248,6 @@ typedef struct { int attsSize; ATTRIBUTE *atts; POSITION position; - long errorByteIndex; STRING_POOL tempPool; STRING_POOL temp2Pool; char *groupConnector; @@ -190,28 +256,44 @@ typedef struct { } Parser; #define userData (((Parser *)parser)->userData) +#define 
handlerArg (((Parser *)parser)->handlerArg) #define startElementHandler (((Parser *)parser)->startElementHandler) #define endElementHandler (((Parser *)parser)->endElementHandler) #define characterDataHandler (((Parser *)parser)->characterDataHandler) #define processingInstructionHandler (((Parser *)parser)->processingInstructionHandler) +#define defaultHandler (((Parser *)parser)->defaultHandler) +#define unparsedEntityDeclHandler (((Parser *)parser)->unparsedEntityDeclHandler) +#define notationDeclHandler (((Parser *)parser)->notationDeclHandler) +#define externalEntityRefHandler (((Parser *)parser)->externalEntityRefHandler) +#define unknownEncodingHandler (((Parser *)parser)->unknownEncodingHandler) #define encoding (((Parser *)parser)->encoding) #define initEncoding (((Parser *)parser)->initEncoding) +#define unknownEncodingMem (((Parser *)parser)->unknownEncodingMem) +#define unknownEncodingData (((Parser *)parser)->unknownEncodingData) +#define unknownEncodingHandlerData \ + (((Parser *)parser)->unknownEncodingHandlerData) +#define unknownEncodingRelease (((Parser *)parser)->unknownEncodingRelease) +#define protocolEncodingName (((Parser *)parser)->protocolEncodingName) #define prologState (((Parser *)parser)->prologState) #define processor (((Parser *)parser)->processor) #define errorCode (((Parser *)parser)->errorCode) -#define errorPtr (((Parser *)parser)->errorPtr) -#define errorByteIndex (((Parser *)parser)->errorByteIndex) +#define eventPtr (((Parser *)parser)->eventPtr) +#define eventEndPtr (((Parser *)parser)->eventEndPtr) +#define positionPtr (((Parser *)parser)->positionPtr) #define position (((Parser *)parser)->position) #define tagLevel (((Parser *)parser)->tagLevel) #define buffer (((Parser *)parser)->buffer) #define bufferPtr (((Parser *)parser)->bufferPtr) #define bufferEnd (((Parser *)parser)->bufferEnd) -#define bufferEndByteIndex (((Parser *)parser)->bufferEndByteIndex) +#define parseEndByteIndex (((Parser *)parser)->parseEndByteIndex) 
+#define parseEndPtr (((Parser *)parser)->parseEndPtr) #define bufferLim (((Parser *)parser)->bufferLim) #define dataBuf (((Parser *)parser)->dataBuf) #define dataBufEnd (((Parser *)parser)->dataBufEnd) #define dtd (((Parser *)parser)->dtd) #define declEntity (((Parser *)parser)->declEntity) +#define declNotationName (((Parser *)parser)->declNotationName) +#define declNotationPublicId (((Parser *)parser)->declNotationPublicId) #define declElementType (((Parser *)parser)->declElementType) #define declAttributeId (((Parser *)parser)->declAttributeId) #define declAttributeIsCdata (((Parser *)parser)->declAttributeIsCdata) @@ -225,50 +307,102 @@ typedef struct { #define groupSize (((Parser *)parser)->groupSize) #define hadExternalDoctype (((Parser *)parser)->hadExternalDoctype) -XML_Parser XML_ParserCreate(const char *encodingName) +XML_Parser XML_ParserCreate(const XML_Char *encodingName) { XML_Parser parser = malloc(sizeof(Parser)); if (!parser) return parser; - processor = prologProcessor; + processor = prologInitProcessor; XmlPrologStateInit(&prologState); userData = 0; + handlerArg = 0; startElementHandler = 0; endElementHandler = 0; characterDataHandler = 0; processingInstructionHandler = 0; + defaultHandler = 0; + unparsedEntityDeclHandler = 0; + notationDeclHandler = 0; + externalEntityRefHandler = 0; + unknownEncodingHandler = 0; buffer = 0; bufferPtr = 0; bufferEnd = 0; - bufferEndByteIndex = 0; + parseEndByteIndex = 0; + parseEndPtr = 0; bufferLim = 0; declElementType = 0; declAttributeId = 0; declEntity = 0; + declNotationName = 0; + declNotationPublicId = 0; memset(&position, 0, sizeof(POSITION)); errorCode = XML_ERROR_NONE; - errorByteIndex = 0; - errorPtr = 0; + eventPtr = 0; + eventEndPtr = 0; + positionPtr = 0; tagLevel = 0; tagStack = 0; freeTagList = 0; attsSize = INIT_ATTS_SIZE; atts = malloc(attsSize * sizeof(ATTRIBUTE)); - dataBuf = malloc(INIT_DATA_BUF_SIZE); + dataBuf = malloc(INIT_DATA_BUF_SIZE * sizeof(XML_Char)); groupSize = 0; groupConnector 
= 0; hadExternalDoctype = 0; + unknownEncodingMem = 0; + unknownEncodingRelease = 0; + unknownEncodingData = 0; + unknownEncodingHandlerData = 0; poolInit(&tempPool); poolInit(&temp2Pool); - if (!dtdInit(&dtd) || !atts || !dataBuf) { + protocolEncodingName = encodingName ? poolCopyString(&tempPool, encodingName) : 0; + if (!dtdInit(&dtd) || !atts || !dataBuf + || (encodingName && !protocolEncodingName)) { XML_ParserFree(parser); return 0; } dataBufEnd = dataBuf + INIT_DATA_BUF_SIZE; - if (!XmlInitEncoding(&initEncoding, &encoding, encodingName)) { - errorCode = XML_ERROR_UNKNOWN_ENCODING; - processor = errorProcessor; + XmlInitEncoding(&initEncoding, &encoding, 0); + return parser; +} + +XML_Parser XML_ExternalEntityParserCreate(XML_Parser oldParser, + const XML_Char *openEntityNames, + const XML_Char *encodingName) +{ + XML_Parser parser = oldParser; + DTD *oldDtd = &dtd; + XML_StartElementHandler oldStartElementHandler = startElementHandler; + XML_EndElementHandler oldEndElementHandler = endElementHandler; + XML_CharacterDataHandler oldCharacterDataHandler = characterDataHandler; + XML_ProcessingInstructionHandler oldProcessingInstructionHandler = processingInstructionHandler; + XML_DefaultHandler oldDefaultHandler = defaultHandler; + XML_ExternalEntityRefHandler oldExternalEntityRefHandler = externalEntityRefHandler; + XML_UnknownEncodingHandler oldUnknownEncodingHandler = unknownEncodingHandler; + void *oldUserData = userData; + void *oldHandlerArg = handlerArg; + + parser = XML_ParserCreate(encodingName); + if (!parser) + return 0; + startElementHandler = oldStartElementHandler; + endElementHandler = oldEndElementHandler; + characterDataHandler = oldCharacterDataHandler; + processingInstructionHandler = oldProcessingInstructionHandler; + defaultHandler = oldDefaultHandler; + externalEntityRefHandler = oldExternalEntityRefHandler; + unknownEncodingHandler = oldUnknownEncodingHandler; + userData = oldUserData; + if (oldUserData == oldHandlerArg) + handlerArg = 
userData; + else + handlerArg = parser; + if (!dtdCopy(&dtd, oldDtd) || !setOpenEntityNames(parser, openEntityNames)) { + XML_ParserFree(parser); + return 0; } + processor = externalEntityInitProcessor; return parser; } @@ -294,12 +428,41 @@ void XML_ParserFree(XML_Parser parser) free(groupConnector); free(buffer); free(dataBuf); + free(unknownEncodingMem); + if (unknownEncodingRelease) + unknownEncodingRelease(unknownEncodingData); free(parser); } +void XML_UseParserAsHandlerArg(XML_Parser parser) +{ + handlerArg = parser; +} + void XML_SetUserData(XML_Parser parser, void *p) { - userData = p; + if (handlerArg == userData) + handlerArg = userData = p; + else + userData = p; +} + +int XML_SetBase(XML_Parser parser, const XML_Char *p) +{ + if (p) { + p = poolCopyString(&dtd.pool, p); + if (!p) + return 0; + dtd.base = p; + } + else + dtd.base = 0; + return 1; +} + +const XML_Char *XML_GetBase(XML_Parser parser) +{ + return dtd.base; } void XML_SetElementHandler(XML_Parser parser, @@ -322,44 +485,75 @@ void XML_SetProcessingInstructionHandler(XML_Parser parser, processingInstructionHandler = handler; } +void XML_SetDefaultHandler(XML_Parser parser, + XML_DefaultHandler handler) +{ + defaultHandler = handler; +} + +void XML_SetUnparsedEntityDeclHandler(XML_Parser parser, + XML_UnparsedEntityDeclHandler handler) +{ + unparsedEntityDeclHandler = handler; +} + +void XML_SetNotationDeclHandler(XML_Parser parser, + XML_NotationDeclHandler handler) +{ + notationDeclHandler = handler; +} + +void XML_SetExternalEntityRefHandler(XML_Parser parser, + XML_ExternalEntityRefHandler handler) +{ + externalEntityRefHandler = handler; +} + +void XML_SetUnknownEncodingHandler(XML_Parser parser, + XML_UnknownEncodingHandler handler, + void *data) +{ + unknownEncodingHandler = handler; + unknownEncodingHandlerData = data; +} + int XML_Parse(XML_Parser parser, const char *s, int len, int isFinal) { - bufferEndByteIndex += len; if (len == 0) { if (!isFinal) return 1; - errorCode = 
processor(parser, bufferPtr, bufferEnd, 0); - return errorCode == XML_ERROR_NONE; + errorCode = processor(parser, bufferPtr, parseEndPtr = bufferEnd, 0); + if (errorCode == XML_ERROR_NONE) + return 1; + eventEndPtr = eventPtr; + return 0; } else if (bufferPtr == bufferEnd) { const char *end; int nLeftOver; + parseEndByteIndex += len; + positionPtr = s; if (isFinal) { - errorCode = processor(parser, s, s + len, 0); + errorCode = processor(parser, s, parseEndPtr = s + len, 0); if (errorCode == XML_ERROR_NONE) return 1; - if (errorPtr) { - errorByteIndex = bufferEndByteIndex - (s + len - errorPtr); - XmlUpdatePosition(encoding, s, errorPtr, &position); - } + eventEndPtr = eventPtr; return 0; } - errorCode = processor(parser, s, s + len, &end); + errorCode = processor(parser, s, parseEndPtr = s + len, &end); if (errorCode != XML_ERROR_NONE) { - if (errorPtr) { - errorByteIndex = bufferEndByteIndex - (s + len - errorPtr); - XmlUpdatePosition(encoding, s, errorPtr, &position); - } + eventEndPtr = eventPtr; return 0; } - XmlUpdatePosition(encoding, s, end, &position); + XmlUpdatePosition(encoding, positionPtr, end, &position); nLeftOver = s + len - end; if (nLeftOver) { if (buffer == 0 || nLeftOver > bufferLim - buffer) { /* FIXME avoid integer overflow */ - buffer = realloc(buffer, len * 2); + buffer = buffer == 0 ? malloc(len * 2) : realloc(buffer, len * 2); if (!buffer) { errorCode = XML_ERROR_NO_MEMORY; + eventPtr = eventEndPtr = 0; return 0; } bufferLim = buffer + len * 2; @@ -379,19 +573,18 @@ int XML_Parse(XML_Parser parser, const char *s, int len, int isFinal) int XML_ParseBuffer(XML_Parser parser, int len, int isFinal) { const char *start = bufferPtr; + positionPtr = start; bufferEnd += len; - errorCode = processor(parser, bufferPtr, bufferEnd, + parseEndByteIndex += len; + errorCode = processor(parser, start, parseEndPtr = bufferEnd, isFinal ? 
(const char **)0 : &bufferPtr); if (errorCode == XML_ERROR_NONE) { if (!isFinal) - XmlUpdatePosition(encoding, start, bufferPtr, &position); + XmlUpdatePosition(encoding, positionPtr, bufferPtr, &position); return 1; } else { - if (errorPtr) { - errorByteIndex = bufferEndByteIndex - (bufferEnd - errorPtr); - XmlUpdatePosition(encoding, start, errorPtr, &position); - } + eventEndPtr = eventPtr; return 0; } } @@ -431,49 +624,67 @@ void *XML_GetBuffer(XML_Parser parser, int len) return bufferEnd; } -int XML_GetErrorCode(XML_Parser parser) +enum XML_Error XML_GetErrorCode(XML_Parser parser) { return errorCode; } -int XML_GetErrorLineNumber(XML_Parser parser) +long XML_GetCurrentByteIndex(XML_Parser parser) { + if (eventPtr) + return parseEndByteIndex - (parseEndPtr - eventPtr); + return -1; +} + +int XML_GetCurrentLineNumber(XML_Parser parser) +{ + if (eventPtr) { + XmlUpdatePosition(encoding, positionPtr, eventPtr, &position); + positionPtr = eventPtr; + } return position.lineNumber + 1; } -int XML_GetErrorColumnNumber(XML_Parser parser) +int XML_GetCurrentColumnNumber(XML_Parser parser) { + if (eventPtr) { + XmlUpdatePosition(encoding, positionPtr, eventPtr, &position); + positionPtr = eventPtr; + } return position.columnNumber; } -long XML_GetErrorByteIndex(XML_Parser parser) +void XML_DefaultCurrent(XML_Parser parser) { - return errorByteIndex; + if (defaultHandler) + reportDefault(parser, encoding, eventPtr, eventEndPtr); } -const char *XML_ErrorString(int code) +const XML_LChar *XML_ErrorString(int code) { - static const char *message[] = { + static const XML_LChar *message[] = { 0, - "out of memory", - "syntax error", - "no element found", - "not well-formed", - "unclosed token", - "unclosed token", - "mismatched tag", - "duplicate attribute", - "junk after document element", - "parameter entity reference not allowed within declaration in internal subset", - "undefined entity", - "recursive entity reference", - "asynchronous entity", - "reference to invalid 
character number", - "reference to binary entity", - "reference to external entity in attribute", - "xml processing instruction not at start of external entity", - "unknown encoding", - "encoding specified in XML declaration is incorrect" + XML_T("out of memory"), + XML_T("syntax error"), + XML_T("no element found"), + XML_T("not well-formed"), + XML_T("unclosed token"), + XML_T("unclosed token"), + XML_T("mismatched tag"), + XML_T("duplicate attribute"), + XML_T("junk after document element"), + XML_T("illegal parameter entity reference"), + XML_T("undefined entity"), + XML_T("recursive entity reference"), + XML_T("asynchronous entity"), + XML_T("reference to invalid character number"), + XML_T("reference to binary entity"), + XML_T("reference to external entity in attribute"), + XML_T("xml processing instruction not at start of external entity"), + XML_T("unknown encoding"), + XML_T("encoding specified in XML declaration is incorrect"), + XML_T("unclosed CDATA section"), + XML_T("error in processing external entity reference") }; if (code > 0 && code < sizeof(message)/sizeof(message[0])) return message[code]; @@ -489,6 +700,96 @@ enum XML_Error contentProcessor(XML_Parser parser, return doContent(parser, 0, encoding, start, end, endPtr); } +static +enum XML_Error externalEntityInitProcessor(XML_Parser parser, + const char *start, + const char *end, + const char **endPtr) +{ + enum XML_Error result = initializeEncoding(parser); + if (result != XML_ERROR_NONE) + return result; + processor = externalEntityInitProcessor2; + return externalEntityInitProcessor2(parser, start, end, endPtr); +} + +static +enum XML_Error externalEntityInitProcessor2(XML_Parser parser, + const char *start, + const char *end, + const char **endPtr) +{ + const char *next; + int tok = XmlContentTok(encoding, start, end, &next); + switch (tok) { + case XML_TOK_BOM: + start = next; + break; + case XML_TOK_PARTIAL: + if (endPtr) { + *endPtr = start; + return XML_ERROR_NONE; + } + eventPtr = 
start; + return XML_ERROR_UNCLOSED_TOKEN; + case XML_TOK_PARTIAL_CHAR: + if (endPtr) { + *endPtr = start; + return XML_ERROR_NONE; + } + eventPtr = start; + return XML_ERROR_PARTIAL_CHAR; + } + processor = externalEntityInitProcessor3; + return externalEntityInitProcessor3(parser, start, end, endPtr); +} + +static +enum XML_Error externalEntityInitProcessor3(XML_Parser parser, + const char *start, + const char *end, + const char **endPtr) +{ + const char *next; + int tok = XmlContentTok(encoding, start, end, &next); + switch (tok) { + case XML_TOK_XML_DECL: + { + enum XML_Error result = processXmlDecl(parser, 1, start, next); + if (result != XML_ERROR_NONE) + return result; + start = next; + } + break; + case XML_TOK_PARTIAL: + if (endPtr) { + *endPtr = start; + return XML_ERROR_NONE; + } + eventPtr = start; + return XML_ERROR_UNCLOSED_TOKEN; + case XML_TOK_PARTIAL_CHAR: + if (endPtr) { + *endPtr = start; + return XML_ERROR_NONE; + } + eventPtr = start; + return XML_ERROR_PARTIAL_CHAR; + } + processor = externalEntityContentProcessor; + tagLevel = 1; + return doContent(parser, 1, encoding, start, end, endPtr); +} + +static +enum XML_Error externalEntityContentProcessor(XML_Parser parser, + const char *start, + const char *end, + const char **endPtr) +{ + return doContent(parser, 1, encoding, start, end, endPtr); +} + static enum XML_Error doContent(XML_Parser parser, int startTagLevel, @@ -497,89 +798,130 @@ doContent(XML_Parser parser, const char *end, const char **nextPtr) { - const ENCODING *utf8 = XmlGetInternalEncoding(XML_UTF8_ENCODING); + const ENCODING *internalEnc = XmlGetInternalEncoding(); + const char *dummy; + const char **eventPP; + const char **eventEndPP; + if (enc == encoding) { + eventPP = &eventPtr; + *eventPP = s; + eventEndPP = &eventEndPtr; + } + else + eventPP = eventEndPP = &dummy; for (;;) { const char *next; int tok = XmlContentTok(enc, s, end, &next); + *eventEndPP = next; switch (tok) { case XML_TOK_TRAILING_CR: + if (nextPtr) { + 
*nextPtr = s; + return XML_ERROR_NONE; + } + *eventEndPP = end; + if (characterDataHandler) { + XML_Char c = XML_T('\n'); + characterDataHandler(handlerArg, &c, 1); + } + else if (defaultHandler) + reportDefault(parser, enc, s, end); + if (startTagLevel == 0) + return XML_ERROR_NO_ELEMENTS; + if (tagLevel != startTagLevel) + return XML_ERROR_ASYNC_ENTITY; + return XML_ERROR_NONE; case XML_TOK_NONE: if (nextPtr) { *nextPtr = s; return XML_ERROR_NONE; } if (startTagLevel > 0) { - if (tagLevel != startTagLevel) { - errorPtr = s; + if (tagLevel != startTagLevel) return XML_ERROR_ASYNC_ENTITY; - } return XML_ERROR_NONE; } - errorPtr = s; return XML_ERROR_NO_ELEMENTS; case XML_TOK_INVALID: - errorPtr = next; + *eventPP = next; return XML_ERROR_INVALID_TOKEN; case XML_TOK_PARTIAL: if (nextPtr) { *nextPtr = s; return XML_ERROR_NONE; } - errorPtr = s; return XML_ERROR_UNCLOSED_TOKEN; case XML_TOK_PARTIAL_CHAR: if (nextPtr) { *nextPtr = s; return XML_ERROR_NONE; } - errorPtr = s; return XML_ERROR_PARTIAL_CHAR; case XML_TOK_ENTITY_REF: { - const char *name = poolStoreString(&dtd.pool, enc, - s + enc->minBytesPerChar, - next - enc->minBytesPerChar); + const XML_Char *name; ENTITY *entity; + XML_Char ch = XmlPredefinedEntityName(enc, + s + enc->minBytesPerChar, + next - enc->minBytesPerChar); + if (ch) { + if (characterDataHandler) + characterDataHandler(handlerArg, &ch, 1); + else if (defaultHandler) + reportDefault(parser, enc, s, next); + break; + } + name = poolStoreString(&dtd.pool, enc, + s + enc->minBytesPerChar, + next - enc->minBytesPerChar); if (!name) return XML_ERROR_NO_MEMORY; entity = (ENTITY *)lookup(&dtd.generalEntities, name, 0); poolDiscard(&dtd.pool); if (!entity) { - if (dtd.complete || dtd.standalone) { - errorPtr = s; + if (dtd.complete || dtd.standalone) return XML_ERROR_UNDEFINED_ENTITY; - } + if (defaultHandler) + reportDefault(parser, enc, s, next); break; } - if (entity->magic) { - if (characterDataHandler) - characterDataHandler(userData, 
entity->textPtr, entity->textLen); - break; - } - if (entity->open) { - errorPtr = s; + if (entity->open) return XML_ERROR_RECURSIVE_ENTITY_REF; - } - if (entity->notation) { - errorPtr = s; + if (entity->notation) return XML_ERROR_BINARY_ENTITY_REF; - } if (entity) { if (entity->textPtr) { enum XML_Error result; + if (defaultHandler) { + reportDefault(parser, enc, s, next); + break; + } + /* Protect against the possibility that somebody sets + the defaultHandler from inside another handler. */ + *eventEndPP = *eventPP; entity->open = 1; result = doContent(parser, tagLevel, - utf8, - entity->textPtr, - entity->textPtr + entity->textLen, + internalEnc, + (char *)entity->textPtr, + (char *)(entity->textPtr + entity->textLen), 0); entity->open = 0; - if (result) { - errorPtr = s; + if (result) return result; - } } + else if (externalEntityRefHandler) { + const XML_Char *openEntityNames; + entity->open = 1; + openEntityNames = getOpenEntityNames(parser); + entity->open = 0; + if (!openEntityNames) + return XML_ERROR_NO_MEMORY; + if (!externalEntityRefHandler(parser, openEntityNames, dtd.base, entity->systemId, entity->publicId)) + return XML_ERROR_EXTERNAL_ENTITY_HANDLING; + } + else if (defaultHandler) + reportDefault(parser, enc, s, next); } break; } @@ -613,6 +955,7 @@ doContent(XML_Parser parser, if (nextPtr) { if (tag->rawNameLength > tag->bufEnd - tag->buf) { int bufSize = tag->rawNameLength * 4; + bufSize = ROUND_UP(bufSize, sizeof(XML_Char)); tag->buf = realloc(tag->buf, bufSize); if (!tag->buf) return XML_ERROR_NO_MEMORY; @@ -624,18 +967,19 @@ doContent(XML_Parser parser, ++tagLevel; if (startElementHandler) { enum XML_Error result; - char *toPtr; - const char *rawNameEnd = tag->rawName + tag->rawNameLength; + XML_Char *toPtr; for (;;) { + const char *rawNameEnd = tag->rawName + tag->rawNameLength; const char *fromPtr = tag->rawName; int bufSize; - toPtr = tag->buf; if (nextPtr) - toPtr += tag->rawNameLength; + toPtr = (XML_Char *)(tag->buf + 
ROUND_UP(tag->rawNameLength, sizeof(XML_Char))); + else + toPtr = (XML_Char *)tag->buf; tag->name = toPtr; - XmlConvert(enc, XML_UTF8_ENCODING, + XmlConvert(enc, &fromPtr, rawNameEnd, - &toPtr, tag->bufEnd - 1); + (ICHAR **)&toPtr, (ICHAR *)tag->bufEnd - 1); if (fromPtr == rawNameEnd) break; bufSize = (tag->bufEnd - tag->buf) << 1; @@ -643,16 +987,21 @@ doContent(XML_Parser parser, if (!tag->buf) return XML_ERROR_NO_MEMORY; tag->bufEnd = tag->buf + bufSize; + if (nextPtr) + tag->rawName = tag->buf; } - *toPtr = 0; + *toPtr = XML_T('\0'); result = storeAtts(parser, enc, tag->name, s); if (result) return result; - startElementHandler(userData, tag->name, (const char **)atts); + startElementHandler(handlerArg, tag->name, (const XML_Char **)atts); poolClear(&tempPool); } - else + else { tag->name = 0; + if (defaultHandler) + reportDefault(parser, enc, s, next); + } break; } case XML_TOK_EMPTY_ELEMENT_WITH_ATTS: @@ -665,9 +1014,9 @@ doContent(XML_Parser parser, case XML_TOK_EMPTY_ELEMENT_NO_ATTS: if (startElementHandler || endElementHandler) { const char *rawName = s + enc->minBytesPerChar; - const char *name = poolStoreString(&tempPool, enc, rawName, - rawName - + XmlNameLength(enc, rawName)); + const XML_Char *name = poolStoreString(&tempPool, enc, rawName, + rawName + + XmlNameLength(enc, rawName)); if (!name) return XML_ERROR_NO_MEMORY; poolFinish(&tempPool); @@ -675,20 +1024,23 @@ doContent(XML_Parser parser, enum XML_Error result = storeAtts(parser, enc, name, s); if (result) return result; - startElementHandler(userData, name, (const char **)atts); + startElementHandler(handlerArg, name, (const XML_Char **)atts); + } + if (endElementHandler) { + if (startElementHandler) + *eventEndPP = *eventPP; + endElementHandler(handlerArg, name); } - if (endElementHandler) - endElementHandler(userData, name); poolClear(&tempPool); } + else if (defaultHandler) + reportDefault(parser, enc, s, next); if (tagLevel == 0) return epilogProcessor(parser, next, end, nextPtr); break; 
case XML_TOK_END_TAG: - if (tagLevel == startTagLevel) { - errorPtr = s; + if (tagLevel == startTagLevel) return XML_ERROR_ASYNC_ENTITY; - } else { int len; const char *rawName; @@ -700,22 +1052,24 @@ doContent(XML_Parser parser, len = XmlNameLength(enc, rawName); if (len != tag->rawNameLength || memcmp(tag->rawName, rawName, len) != 0) { - errorPtr = rawName; + *eventPP = rawName; return XML_ERROR_TAG_MISMATCH; } --tagLevel; if (endElementHandler) { if (tag->name) - endElementHandler(userData, tag->name); + endElementHandler(handlerArg, tag->name); else { - const char *name = poolStoreString(&tempPool, enc, rawName, - rawName + len); + const XML_Char *name = poolStoreString(&tempPool, enc, rawName, + rawName + len); if (!name) - return XML_ERROR_NO_MEMORY; - endElementHandler(userData, name); + return XML_ERROR_NO_MEMORY; + endElementHandler(handlerArg, name); poolClear(&tempPool); } } + else if (defaultHandler) + reportDefault(parser, enc, s, next); if (tagLevel == 0) return epilogProcessor(parser, next, end, nextPtr); } @@ -723,51 +1077,98 @@ doContent(XML_Parser parser, case XML_TOK_CHAR_REF: { int n = XmlCharRefNumber(enc, s); - if (n < 0) { - errorPtr = s; + if (n < 0) return XML_ERROR_BAD_CHAR_REF; - } if (characterDataHandler) { - char buf[XML_MAX_BYTES_PER_CHAR]; - characterDataHandler(userData, buf, XmlEncode(utf8, n, buf)); + XML_Char buf[XML_ENCODE_MAX]; + characterDataHandler(handlerArg, buf, XmlEncode(n, (ICHAR *)buf)); } + else if (defaultHandler) + reportDefault(parser, enc, s, next); } break; case XML_TOK_XML_DECL: - errorPtr = s; return XML_ERROR_MISPLACED_XML_PI; case XML_TOK_DATA_NEWLINE: if (characterDataHandler) { - char c = '\n'; - characterDataHandler(userData, &c, 1); + XML_Char c = XML_T('\n'); + characterDataHandler(handlerArg, &c, 1); + } + else if (defaultHandler) + reportDefault(parser, enc, s, next); + break; + case XML_TOK_CDATA_SECT_OPEN: + { + enum XML_Error result; + if (characterDataHandler) + characterDataHandler(handlerArg, 
dataBuf, 0); + else if (defaultHandler) + reportDefault(parser, enc, s, next); + result = doCdataSection(parser, enc, &next, end, nextPtr); + if (!next) { + processor = cdataSectionProcessor; + return result; + } } break; - case XML_TOK_CDATA_SECTION: + case XML_TOK_TRAILING_RSQB: + if (nextPtr) { + *nextPtr = s; + return XML_ERROR_NONE; + } if (characterDataHandler) { - const char *lim = next - enc->minBytesPerChar * 3; - s += enc->minBytesPerChar * 9; - do { - char *dataPtr = dataBuf; - XmlConvert(enc, XML_UTF8_ENCODING, &s, lim, &dataPtr, dataBufEnd); - characterDataHandler(userData, dataBuf, dataPtr - dataBuf); - } while (s != lim); + if (MUST_CONVERT(enc, s)) { + ICHAR *dataPtr = (ICHAR *)dataBuf; + XmlConvert(enc, &s, end, &dataPtr, (ICHAR *)dataBufEnd); + characterDataHandler(handlerArg, dataBuf, dataPtr - (ICHAR *)dataBuf); + } + else + characterDataHandler(handlerArg, + (XML_Char *)s, + (XML_Char *)end - (XML_Char *)s); } - break; + else if (defaultHandler) + reportDefault(parser, enc, s, end); + if (startTagLevel == 0) { + *eventPP = end; + return XML_ERROR_NO_ELEMENTS; + } + if (tagLevel != startTagLevel) { + *eventPP = end; + return XML_ERROR_ASYNC_ENTITY; + } + return XML_ERROR_NONE; case XML_TOK_DATA_CHARS: if (characterDataHandler) { - do { - char *dataPtr = dataBuf; - XmlConvert(enc, XML_UTF8_ENCODING, &s, next, &dataPtr, dataBufEnd); - characterDataHandler(userData, dataBuf, dataPtr - dataBuf); - } while (s != next); + if (MUST_CONVERT(enc, s)) { + for (;;) { + ICHAR *dataPtr = (ICHAR *)dataBuf; + XmlConvert(enc, &s, next, &dataPtr, (ICHAR *)dataBufEnd); + *eventEndPP = s; + characterDataHandler(handlerArg, dataBuf, dataPtr - (ICHAR *)dataBuf); + if (s == next) + break; + *eventPP = s; + } + } + else + characterDataHandler(handlerArg, + (XML_Char *)s, + (XML_Char *)next - (XML_Char *)s); } + else if (defaultHandler) + reportDefault(parser, enc, s, next); break; case XML_TOK_PI: if (!reportProcessingInstruction(parser, enc, s, next)) return 
XML_ERROR_NO_MEMORY; break; + default: + if (defaultHandler) + reportDefault(parser, enc, s, next); + break; } - s = next; + *eventPP = s = next; } /* not reached */ } @@ -776,11 +1177,11 @@ doContent(XML_Parser parser, otherwise just check the attributes for well-formedness. */ static enum XML_Error storeAtts(XML_Parser parser, const ENCODING *enc, - const char *tagName, const char *s) + const XML_Char *tagName, const char *s) { ELEMENT_TYPE *elementType = 0; int nDefaultAtts = 0; - const char **appAtts = (const char **)atts; + const XML_Char **appAtts; int i; int n; @@ -792,13 +1193,15 @@ static enum XML_Error storeAtts(XML_Parser parser, const ENCODING *enc, n = XmlGetAttributes(enc, s, attsSize, atts); if (n + nDefaultAtts > attsSize) { - attsSize = 2*n; + int oldAttsSize = attsSize; + attsSize = n + nDefaultAtts + INIT_ATTS_SIZE; atts = realloc((void *)atts, attsSize * sizeof(ATTRIBUTE)); if (!atts) return XML_ERROR_NO_MEMORY; - if (n > attsSize) + if (n > oldAttsSize) XmlGetAttributes(enc, s, n, atts); } + appAtts = (const XML_Char **)atts; for (i = 0; i < n; i++) { ATTRIBUTE_ID *attId = getAttributeId(parser, enc, atts[i].name, atts[i].name @@ -806,7 +1209,8 @@ static enum XML_Error storeAtts(XML_Parser parser, const ENCODING *enc, if (!attId) return XML_ERROR_NO_MEMORY; if ((attId->name)[-1]) { - errorPtr = atts[i].name; + if (enc == encoding) + eventPtr = atts[i].name; return XML_ERROR_DUPLICATE_ATTRIBUTE; } (attId->name)[-1] = 1; @@ -858,10 +1262,242 @@ static enum XML_Error storeAtts(XML_Parser parser, const ENCODING *enc, appAtts[i << 1] = 0; } while (i-- > 0) - ((char *)appAtts[i << 1])[-1] = 0; + ((XML_Char *)appAtts[i << 1])[-1] = 0; return XML_ERROR_NONE; } +/* The idea here is to avoid using stack for each CDATA section when +the whole file is parsed with one call. 
*/ + +static +enum XML_Error cdataSectionProcessor(XML_Parser parser, + const char *start, + const char *end, + const char **endPtr) +{ + enum XML_Error result = doCdataSection(parser, encoding, &start, end, endPtr); + if (start) { + processor = contentProcessor; + return contentProcessor(parser, start, end, endPtr); + } + return result; +} + +/* startPtr gets set to non-null if the section is closed, and to null if +the section is not yet closed. */ + +static +enum XML_Error doCdataSection(XML_Parser parser, + const ENCODING *enc, + const char **startPtr, + const char *end, + const char **nextPtr) +{ + const char *s = *startPtr; + const char *dummy; + const char **eventPP; + const char **eventEndPP; + if (enc == encoding) { + eventPP = &eventPtr; + *eventPP = s; + eventEndPP = &eventEndPtr; + } + else + eventPP = eventEndPP = &dummy; + *startPtr = 0; + for (;;) { + const char *next; + int tok = XmlCdataSectionTok(enc, s, end, &next); + *eventEndPP = next; + switch (tok) { + case XML_TOK_CDATA_SECT_CLOSE: + if (characterDataHandler) + characterDataHandler(handlerArg, dataBuf, 0); + else if (defaultHandler) + reportDefault(parser, enc, s, next); + *startPtr = next; + return XML_ERROR_NONE; + case XML_TOK_DATA_NEWLINE: + if (characterDataHandler) { + XML_Char c = XML_T('\n'); + characterDataHandler(handlerArg, &c, 1); + } + else if (defaultHandler) + reportDefault(parser, enc, s, next); + break; + case XML_TOK_DATA_CHARS: + if (characterDataHandler) { + if (MUST_CONVERT(enc, s)) { + for (;;) { + ICHAR *dataPtr = (ICHAR *)dataBuf; + XmlConvert(enc, &s, next, &dataPtr, (ICHAR *)dataBufEnd); + *eventEndPP = next; + characterDataHandler(handlerArg, dataBuf, dataPtr - (ICHAR *)dataBuf); + if (s == next) + break; + *eventPP = s; + } + } + else + characterDataHandler(handlerArg, + (XML_Char *)s, + (XML_Char *)next - (XML_Char *)s); + } + else if (defaultHandler) + reportDefault(parser, enc, s, next); + break; + case XML_TOK_INVALID: + *eventPP = next; + return 
XML_ERROR_INVALID_TOKEN; + case XML_TOK_PARTIAL_CHAR: + if (nextPtr) { + *nextPtr = s; + return XML_ERROR_NONE; + } + return XML_ERROR_PARTIAL_CHAR; + case XML_TOK_PARTIAL: + case XML_TOK_NONE: + if (nextPtr) { + *nextPtr = s; + return XML_ERROR_NONE; + } + return XML_ERROR_UNCLOSED_CDATA_SECTION; + default: + abort(); + } + *eventPP = s = next; + } + /* not reached */ +} + +static enum XML_Error +initializeEncoding(XML_Parser parser) +{ + const char *s; +#ifdef XML_UNICODE + char encodingBuf[128]; + if (!protocolEncodingName) + s = 0; + else { + int i; + for (i = 0; protocolEncodingName[i]; i++) { + if (i == sizeof(encodingBuf) - 1 + || protocolEncodingName[i] >= 0x80 + || protocolEncodingName[i] < 0) { + encodingBuf[0] = '\0'; + break; + } + encodingBuf[i] = (char)protocolEncodingName[i]; + } + encodingBuf[i] = '\0'; + s = encodingBuf; + } +#else + s = protocolEncodingName; +#endif + if (XmlInitEncoding(&initEncoding, &encoding, s)) + return XML_ERROR_NONE; + return handleUnknownEncoding(parser, protocolEncodingName); +} + +static enum XML_Error +processXmlDecl(XML_Parser parser, int isGeneralTextEntity, + const char *s, const char *next) +{ + const char *encodingName = 0; + const ENCODING *newEncoding = 0; + const char *version; + int standalone = -1; + if (!XmlParseXmlDecl(isGeneralTextEntity, + encoding, + s, + next, + &eventPtr, + &version, + &encodingName, + &newEncoding, + &standalone)) + return XML_ERROR_SYNTAX; + if (defaultHandler) + reportDefault(parser, encoding, s, next); + if (!protocolEncodingName) { + if (newEncoding) { + if (newEncoding->minBytesPerChar != encoding->minBytesPerChar) { + eventPtr = encodingName; + return XML_ERROR_INCORRECT_ENCODING; + } + encoding = newEncoding; + } + else if (encodingName) { + enum XML_Error result; + const XML_Char *s = poolStoreString(&tempPool, + encoding, + encodingName, + encodingName + + XmlNameLength(encoding, encodingName)); + if (!s) + return XML_ERROR_NO_MEMORY; + result = handleUnknownEncoding(parser, 
s); + poolDiscard(&tempPool); + if (result == XML_ERROR_UNKNOWN_ENCODING) + eventPtr = encodingName; + return result; + } + } + if (!isGeneralTextEntity && standalone == 1) + dtd.standalone = 1; + return XML_ERROR_NONE; +} + +static enum XML_Error +handleUnknownEncoding(XML_Parser parser, const XML_Char *encodingName) +{ + if (unknownEncodingHandler) { + XML_Encoding info; + int i; + for (i = 0; i < 256; i++) + info.map[i] = -1; + info.convert = 0; + info.data = 0; + info.release = 0; + if (unknownEncodingHandler(unknownEncodingHandlerData, encodingName, &info)) { + ENCODING *enc; + unknownEncodingMem = malloc(XmlSizeOfUnknownEncoding()); + if (!unknownEncodingMem) { + if (info.release) + info.release(info.data); + return XML_ERROR_NO_MEMORY; + } + enc = XmlInitUnknownEncoding(unknownEncodingMem, + info.map, + info.convert, + info.data); + if (enc) { + unknownEncodingData = info.data; + unknownEncodingRelease = info.release; + encoding = enc; + return XML_ERROR_NONE; + } + } + if (info.release) + info.release(info.data); + } + return XML_ERROR_UNKNOWN_ENCODING; +} + +static enum XML_Error +prologInitProcessor(XML_Parser parser, + const char *s, + const char *end, + const char **nextPtr) +{ + enum XML_Error result = initializeEncoding(parser); + if (result != XML_ERROR_NONE) + return result; + processor = prologProcessor; + return prologProcessor(parser, s, end, nextPtr); +} + static enum XML_Error prologProcessor(XML_Parser parser, const char *s, @@ -878,7 +1514,7 @@ prologProcessor(XML_Parser parser, } switch (tok) { case XML_TOK_INVALID: - errorPtr = next; + eventPtr = next; return XML_ERROR_INVALID_TOKEN; case XML_TOK_NONE: return XML_ERROR_NO_ELEMENTS; @@ -887,7 +1523,7 @@ prologProcessor(XML_Parser parser, case XML_TOK_PARTIAL_CHAR: return XML_ERROR_PARTIAL_CHAR; case XML_TOK_TRAILING_CR: - errorPtr = s + encoding->minBytesPerChar; + eventPtr = s + encoding->minBytesPerChar; return XML_ERROR_NO_ELEMENTS; default: abort(); @@ -896,43 +1532,29 @@ 
prologProcessor(XML_Parser parser, switch (XmlTokenRole(&prologState, tok, s, next, encoding)) { case XML_ROLE_XML_DECL: { - const char *encodingName = 0; - const ENCODING *newEncoding = 0; - const char *version; - int standalone = -1; - if (!XmlParseXmlDecl(0, - encoding, - s, - next, - &errorPtr, - &version, - &encodingName, - &newEncoding, - &standalone)) - return XML_ERROR_SYNTAX; - if (newEncoding) { - if (newEncoding->minBytesPerChar != encoding->minBytesPerChar) { - errorPtr = encodingName; - return XML_ERROR_INCORRECT_ENCODING; - } - encoding = newEncoding; - } - else if (encodingName) { - errorPtr = encodingName; - return XML_ERROR_UNKNOWN_ENCODING; - } - if (standalone == 1) - dtd.standalone = 1; - break; + enum XML_Error result = processXmlDecl(parser, 0, s, next); + if (result != XML_ERROR_NONE) + return result; } + break; case XML_ROLE_DOCTYPE_SYSTEM_ID: hadExternalDoctype = 1; break; case XML_ROLE_DOCTYPE_PUBLIC_ID: case XML_ROLE_ENTITY_PUBLIC_ID: - case XML_ROLE_NOTATION_PUBLIC_ID: - if (!XmlIsPublicId(encoding, s, next, &errorPtr)) + if (!XmlIsPublicId(encoding, s, next, &eventPtr)) return XML_ERROR_SYNTAX; + if (declEntity) { + XML_Char *tem = poolStoreString(&dtd.pool, + encoding, + s + encoding->minBytesPerChar, + next - encoding->minBytesPerChar); + if (!tem) + return XML_ERROR_NO_MEMORY; + normalizePublicId(tem); + declEntity->publicId = tem; + poolFinish(&dtd.pool); + } break; case XML_ROLE_INSTANCE_START: processor = contentProcessor; @@ -941,7 +1563,7 @@ prologProcessor(XML_Parser parser, return contentProcessor(parser, s, end, nextPtr); case XML_ROLE_ATTLIST_ELEMENT_NAME: { - const char *name = poolStoreString(&dtd.pool, encoding, s, next); + const XML_Char *name = poolStoreString(&dtd.pool, encoding, s, next); if (!name) return XML_ERROR_NO_MEMORY; declElementType = (ELEMENT_TYPE *)lookup(&dtd.elementTypes, name, sizeof(ELEMENT_TYPE)); @@ -971,7 +1593,7 @@ prologProcessor(XML_Parser parser, case XML_ROLE_DEFAULT_ATTRIBUTE_VALUE: case 
XML_ROLE_FIXED_ATTRIBUTE_VALUE: { - const char *attVal; + const XML_Char *attVal; enum XML_Error result = storeAttributeValue(parser, encoding, declAttributeIsCdata, s + encoding->minBytesPerChar, @@ -1000,6 +1622,7 @@ prologProcessor(XML_Parser parser, next - encoding->minBytesPerChar); if (!declEntity->systemId) return XML_ERROR_NO_MEMORY; + declEntity->base = dtd.base; poolFinish(&dtd.pool); } break; @@ -1009,11 +1632,26 @@ prologProcessor(XML_Parser parser, if (!declEntity->notation) return XML_ERROR_NO_MEMORY; poolFinish(&dtd.pool); + if (unparsedEntityDeclHandler) { + eventPtr = eventEndPtr = s; + unparsedEntityDeclHandler(handlerArg, + declEntity->name, + declEntity->base, + declEntity->systemId, + declEntity->publicId, + declEntity->notation); + } + } break; case XML_ROLE_GENERAL_ENTITY_NAME: { - const char *name = poolStoreString(&dtd.pool, encoding, s, next); + const XML_Char *name; + if (XmlPredefinedEntityName(encoding, s, next)) { + declEntity = 0; + break; + } + name = poolStoreString(&dtd.pool, encoding, s, next); if (!name) return XML_ERROR_NO_MEMORY; if (dtd.complete) { @@ -1036,8 +1674,61 @@ prologProcessor(XML_Parser parser, case XML_ROLE_PARAM_ENTITY_NAME: declEntity = 0; break; + case XML_ROLE_NOTATION_NAME: + declNotationPublicId = 0; + declNotationName = 0; + if (notationDeclHandler) { + declNotationName = poolStoreString(&tempPool, encoding, s, next); + if (!declNotationName) + return XML_ERROR_NO_MEMORY; + poolFinish(&tempPool); + } + break; + case XML_ROLE_NOTATION_PUBLIC_ID: + if (!XmlIsPublicId(encoding, s, next, &eventPtr)) + return XML_ERROR_SYNTAX; + if (declNotationName) { + XML_Char *tem = poolStoreString(&tempPool, + encoding, + s + encoding->minBytesPerChar, + next - encoding->minBytesPerChar); + if (!tem) + return XML_ERROR_NO_MEMORY; + normalizePublicId(tem); + declNotationPublicId = tem; + poolFinish(&tempPool); + } + break; + case XML_ROLE_NOTATION_SYSTEM_ID: + if (declNotationName && notationDeclHandler) { + const XML_Char 
*systemId + = poolStoreString(&tempPool, encoding, + s + encoding->minBytesPerChar, + next - encoding->minBytesPerChar); + if (!systemId) + return XML_ERROR_NO_MEMORY; + eventPtr = eventEndPtr = s; + notationDeclHandler(handlerArg, + declNotationName, + dtd.base, + systemId, + declNotationPublicId); + } + poolClear(&tempPool); + break; + case XML_ROLE_NOTATION_NO_SYSTEM_ID: + if (declNotationPublicId && notationDeclHandler) { + eventPtr = eventEndPtr = s; + notationDeclHandler(handlerArg, + declNotationName, + dtd.base, + 0, + declNotationPublicId); + } + poolClear(&tempPool); + break; case XML_ROLE_ERROR: - errorPtr = s; + eventPtr = s; switch (tok) { case XML_TOK_PARAM_ENTITY_REF: return XML_ERROR_PARAM_ENTITY_REF; @@ -1059,14 +1750,14 @@ prologProcessor(XML_Parser parser, break; case XML_ROLE_GROUP_SEQUENCE: if (groupConnector[prologState.level] == '|') { - errorPtr = s; + eventPtr = s; return XML_ERROR_SYNTAX; } groupConnector[prologState.level] = ','; break; case XML_ROLE_GROUP_CHOICE: if (groupConnector[prologState.level] == ',') { - errorPtr = s; + eventPtr = s; return XML_ERROR_SYNTAX; } groupConnector[prologState.level] = '|'; @@ -1077,12 +1768,26 @@ prologProcessor(XML_Parser parser, case XML_ROLE_NONE: switch (tok) { case XML_TOK_PI: + eventPtr = s; + eventEndPtr = next; if (!reportProcessingInstruction(parser, encoding, s, next)) return XML_ERROR_NO_MEMORY; break; } break; } + if (defaultHandler) { + switch (tok) { + case XML_TOK_PI: + case XML_TOK_BOM: + case XML_TOK_XML_DECL: + break; + default: + eventPtr = s; + eventEndPtr = next; + reportDefault(parser, encoding, s, next); + } + } s = next; } /* not reached */ @@ -1095,44 +1800,50 @@ enum XML_Error epilogProcessor(XML_Parser parser, const char **nextPtr) { processor = epilogProcessor; + eventPtr = s; for (;;) { const char *next; int tok = XmlPrologTok(encoding, s, end, &next); + eventEndPtr = next; switch (tok) { case XML_TOK_TRAILING_CR: + if (defaultHandler) { + eventEndPtr = end; + 
reportDefault(parser, encoding, s, end); + } + /* fall through */ case XML_TOK_NONE: if (nextPtr) *nextPtr = end; return XML_ERROR_NONE; case XML_TOK_PROLOG_S: case XML_TOK_COMMENT: + if (defaultHandler) + reportDefault(parser, encoding, s, next); break; case XML_TOK_PI: if (!reportProcessingInstruction(parser, encoding, s, next)) return XML_ERROR_NO_MEMORY; break; case XML_TOK_INVALID: - errorPtr = next; + eventPtr = next; return XML_ERROR_INVALID_TOKEN; case XML_TOK_PARTIAL: if (nextPtr) { *nextPtr = s; return XML_ERROR_NONE; } - errorPtr = s; return XML_ERROR_UNCLOSED_TOKEN; case XML_TOK_PARTIAL_CHAR: if (nextPtr) { *nextPtr = s; return XML_ERROR_NONE; } - errorPtr = s; return XML_ERROR_PARTIAL_CHAR; default: - errorPtr = s; return XML_ERROR_JUNK_AFTER_DOC_ELEMENT; } - s = next; + eventPtr = s = next; } } @@ -1153,9 +1864,9 @@ storeAttributeValue(XML_Parser parser, const ENCODING *enc, int isCdata, enum XML_Error result = appendAttributeValue(parser, enc, isCdata, ptr, end, pool); if (result) return result; - if (!isCdata && poolLength(pool) && poolLastByte(pool) == ' ') + if (!isCdata && poolLength(pool) && poolLastChar(pool) == XML_T(' ')) poolChop(pool); - if (!poolAppendByte(pool, 0)) + if (!poolAppendChar(pool, XML_T('\0'))) return XML_ERROR_NO_MEMORY; return XML_ERROR_NONE; } @@ -1165,7 +1876,7 @@ appendAttributeValue(XML_Parser parser, const ENCODING *enc, int isCdata, const char *ptr, const char *end, STRING_POOL *pool) { - const ENCODING *utf8 = XmlGetInternalEncoding(XML_UTF8_ENCODING); + const ENCODING *internalEnc = XmlGetInternalEncoding(); for (;;) { const char *next; int tok = XmlAttributeValueTok(enc, ptr, end, &next); @@ -1173,31 +1884,35 @@ appendAttributeValue(XML_Parser parser, const ENCODING *enc, int isCdata, case XML_TOK_NONE: return XML_ERROR_NONE; case XML_TOK_INVALID: - errorPtr = next; + if (enc == encoding) + eventPtr = next; return XML_ERROR_INVALID_TOKEN; case XML_TOK_PARTIAL: - errorPtr = ptr; + if (enc == encoding) + eventPtr = 
ptr; return XML_ERROR_INVALID_TOKEN; case XML_TOK_CHAR_REF: { - char buf[XML_MAX_BYTES_PER_CHAR]; + XML_Char buf[XML_ENCODE_MAX]; int i; int n = XmlCharRefNumber(enc, ptr); if (n < 0) { - errorPtr = ptr; + if (enc == encoding) + eventPtr = ptr; return XML_ERROR_BAD_CHAR_REF; } if (!isCdata - && n == ' ' - && (poolLength(pool) == 0 || poolLastByte(pool) == ' ')) + && n == 0x20 /* space */ + && (poolLength(pool) == 0 || poolLastChar(pool) == XML_T(' '))) break; - n = XmlEncode(utf8, n, buf); + n = XmlEncode(n, (ICHAR *)buf); if (!n) { - errorPtr = ptr; + if (enc == encoding) + eventPtr = ptr; return XML_ERROR_BAD_CHAR_REF; } for (i = 0; i < n; i++) { - if (!poolAppendByte(pool, buf[i])) + if (!poolAppendChar(pool, buf[i])) return XML_ERROR_NO_MEMORY; } } @@ -1212,55 +1927,60 @@ appendAttributeValue(XML_Parser parser, const ENCODING *enc, int isCdata, /* fall through */ case XML_TOK_ATTRIBUTE_VALUE_S: case XML_TOK_DATA_NEWLINE: - if (!isCdata && (poolLength(pool) == 0 || poolLastByte(pool) == ' ')) + if (!isCdata && (poolLength(pool) == 0 || poolLastChar(pool) == XML_T(' '))) break; - if (!poolAppendByte(pool, ' ')) + if (!poolAppendChar(pool, XML_T(' '))) return XML_ERROR_NO_MEMORY; break; case XML_TOK_ENTITY_REF: { - const char *name = poolStoreString(&temp2Pool, enc, - ptr + enc->minBytesPerChar, - next - enc->minBytesPerChar); + const XML_Char *name; ENTITY *entity; + XML_Char ch = XmlPredefinedEntityName(enc, + ptr + enc->minBytesPerChar, + next - enc->minBytesPerChar); + if (ch) { + if (!poolAppendChar(pool, ch)) + return XML_ERROR_NO_MEMORY; + break; + } + name = poolStoreString(&temp2Pool, enc, + ptr + enc->minBytesPerChar, + next - enc->minBytesPerChar); if (!name) return XML_ERROR_NO_MEMORY; entity = (ENTITY *)lookup(&dtd.generalEntities, name, 0); poolDiscard(&temp2Pool); if (!entity) { if (dtd.complete) { - errorPtr = ptr; + if (enc == encoding) + eventPtr = ptr; return XML_ERROR_UNDEFINED_ENTITY; } } else if (entity->open) { - errorPtr = ptr; + if (enc == 
encoding) + eventPtr = ptr; return XML_ERROR_RECURSIVE_ENTITY_REF; } else if (entity->notation) { - errorPtr = ptr; + if (enc == encoding) + eventPtr = ptr; return XML_ERROR_BINARY_ENTITY_REF; } - else if (entity->magic) { - int i; - for (i = 0; i < entity->textLen; i++) - if (!poolAppendByte(pool, entity->textPtr[i])) - return XML_ERROR_NO_MEMORY; - } else if (!entity->textPtr) { - errorPtr = ptr; + if (enc == encoding) + eventPtr = ptr; return XML_ERROR_ATTRIBUTE_EXTERNAL_ENTITY_REF; } else { enum XML_Error result; - const char *textEnd = entity->textPtr + entity->textLen; + const XML_Char *textEnd = entity->textPtr + entity->textLen; entity->open = 1; - result = appendAttributeValue(parser, utf8, isCdata, entity->textPtr, textEnd, pool); + result = appendAttributeValue(parser, internalEnc, isCdata, (char *)entity->textPtr, (char *)textEnd, pool); entity->open = 0; - if (result) { - errorPtr = ptr; + if (result) return result; - } } } break; @@ -1277,7 +1997,7 @@ enum XML_Error storeEntityValue(XML_Parser parser, const char *entityTextPtr, const char *entityTextEnd) { - const ENCODING *utf8 = XmlGetInternalEncoding(XML_UTF8_ENCODING); + const ENCODING *internalEnc = XmlGetInternalEncoding(); STRING_POOL *pool = &(dtd.pool); entityTextPtr += encoding->minBytesPerChar; entityTextEnd -= encoding->minBytesPerChar; @@ -1286,7 +2006,7 @@ enum XML_Error storeEntityValue(XML_Parser parser, int tok = XmlEntityValueTok(encoding, entityTextPtr, entityTextEnd, &next); switch (tok) { case XML_TOK_PARAM_ENTITY_REF: - errorPtr = entityTextPtr; + eventPtr = entityTextPtr; return XML_ERROR_SYNTAX; case XML_TOK_NONE: if (declEntity) { @@ -1308,20 +2028,20 @@ enum XML_Error storeEntityValue(XML_Parser parser, case XML_TOK_DATA_NEWLINE: if (pool->end == pool->ptr && !poolGrow(pool)) return XML_ERROR_NO_MEMORY; - *(pool->ptr)++ = '\n'; + *(pool->ptr)++ = XML_T('\n'); break; case XML_TOK_CHAR_REF: { - char buf[XML_MAX_BYTES_PER_CHAR]; + XML_Char buf[XML_ENCODE_MAX]; int i; int n = 
XmlCharRefNumber(encoding, entityTextPtr); if (n < 0) { - errorPtr = entityTextPtr; + eventPtr = entityTextPtr; return XML_ERROR_BAD_CHAR_REF; } - n = XmlEncode(utf8, n, buf); + n = XmlEncode(n, (ICHAR *)buf); if (!n) { - errorPtr = entityTextPtr; + eventPtr = entityTextPtr; return XML_ERROR_BAD_CHAR_REF; } for (i = 0; i < n; i++) { @@ -1332,10 +2052,10 @@ enum XML_Error storeEntityValue(XML_Parser parser, } break; case XML_TOK_PARTIAL: - errorPtr = entityTextPtr; + eventPtr = entityTextPtr; return XML_ERROR_INVALID_TOKEN; case XML_TOK_INVALID: - errorPtr = next; + eventPtr = next; return XML_ERROR_INVALID_TOKEN; default: abort(); @@ -1346,36 +2066,42 @@ enum XML_Error storeEntityValue(XML_Parser parser, } static void -normalizeLines(char *s) +normalizeLines(XML_Char *s) { - char *p; - s = strchr(s, '\r'); - if (!s) - return; + XML_Char *p; + for (;; s++) { + if (*s == XML_T('\0')) + return; + if (*s == XML_T('\r')) + break; + } p = s; - while (*s) { - if (*s == '\r') { - *p++ = '\n'; - if (*++s == '\n') + do { + if (*s == XML_T('\r')) { + *p++ = XML_T('\n'); + if (*++s == XML_T('\n')) s++; } else *p++ = *s++; - } - *p = '\0'; + } while (*s); + *p = XML_T('\0'); } static int reportProcessingInstruction(XML_Parser parser, const ENCODING *enc, const char *start, const char *end) { - const char *target; - char *data; + const XML_Char *target; + XML_Char *data; const char *tem; - if (!processingInstructionHandler) + if (!processingInstructionHandler) { + if (defaultHandler) + reportDefault(parser, enc, start, end); return 1; - target = start + enc->minBytesPerChar * 2; - tem = target + XmlNameLength(enc, target); - target = poolStoreString(&tempPool, enc, target, tem); + } + start += enc->minBytesPerChar * 2; + tem = start + XmlNameLength(enc, start); + target = poolStoreString(&tempPool, enc, start, tem); if (!target) return 0; poolFinish(&tempPool); @@ -1385,22 +2111,50 @@ reportProcessingInstruction(XML_Parser parser, const ENCODING *enc, const char * if (!data) 
return 0; normalizeLines(data); - processingInstructionHandler(userData, target, data); + processingInstructionHandler(handlerArg, target, data); poolClear(&tempPool); return 1; } +static void +reportDefault(XML_Parser parser, const ENCODING *enc, const char *s, const char *end) +{ + if (MUST_CONVERT(enc, s)) { + for (;;) { + ICHAR *dataPtr = (ICHAR *)dataBuf; + XmlConvert(enc, &s, end, &dataPtr, (ICHAR *)dataBufEnd); + if (s == end) { + defaultHandler(handlerArg, dataBuf, dataPtr - (ICHAR *)dataBuf); + break; + } + if (enc == encoding) { + eventEndPtr = s; + defaultHandler(handlerArg, dataBuf, dataPtr - (ICHAR *)dataBuf); + eventPtr = s; + } + else + defaultHandler(handlerArg, dataBuf, dataPtr - (ICHAR *)dataBuf); + } + } + else + defaultHandler(handlerArg, (XML_Char *)s, (XML_Char *)end - (XML_Char *)s); +} + + static int -defineAttribute(ELEMENT_TYPE *type, ATTRIBUTE_ID *attId, int isCdata, const char *value) +defineAttribute(ELEMENT_TYPE *type, ATTRIBUTE_ID *attId, int isCdata, const XML_Char *value) { DEFAULT_ATTRIBUTE *att; if (type->nDefaultAtts == type->allocDefaultAtts) { - if (type->allocDefaultAtts == 0) + if (type->allocDefaultAtts == 0) { type->allocDefaultAtts = 8; - else + type->defaultAtts = malloc(type->allocDefaultAtts*sizeof(DEFAULT_ATTRIBUTE)); + } + else { type->allocDefaultAtts *= 2; - type->defaultAtts = realloc(type->defaultAtts, - type->allocDefaultAtts*sizeof(DEFAULT_ATTRIBUTE)); + type->defaultAtts = realloc(type->defaultAtts, + type->allocDefaultAtts*sizeof(DEFAULT_ATTRIBUTE)); + } if (!type->defaultAtts) return 0; } @@ -1418,8 +2172,8 @@ static ATTRIBUTE_ID * getAttributeId(XML_Parser parser, const ENCODING *enc, const char *start, const char *end) { ATTRIBUTE_ID *id; - const char *name; - if (!poolAppendByte(&dtd.pool, 0)) + const XML_Char *name; + if (!poolAppendChar(&dtd.pool, XML_T('\0'))) return 0; name = poolStoreString(&dtd.pool, enc, start, end); if (!name) @@ -1435,25 +2189,88 @@ getAttributeId(XML_Parser parser, const ENCODING 
*enc, const char *start, const return id; } +static +const XML_Char *getOpenEntityNames(XML_Parser parser) +{ + HASH_TABLE_ITER iter; + + hashTableIterInit(&iter, &(dtd.generalEntities)); + for (;;) { + const XML_Char *s; + ENTITY *e = (ENTITY *)hashTableIterNext(&iter); + if (!e) + break; + if (!e->open) + continue; + if (poolLength(&tempPool) > 0 && !poolAppendChar(&tempPool, XML_T(' '))) + return 0; + for (s = e->name; *s; s++) + if (!poolAppendChar(&tempPool, *s)) + return 0; + } + + if (!poolAppendChar(&tempPool, XML_T('\0'))) + return 0; + return tempPool.start; +} + +static +int setOpenEntityNames(XML_Parser parser, const XML_Char *openEntityNames) +{ + const XML_Char *s = openEntityNames; + while (*openEntityNames != XML_T('\0')) { + if (*s == XML_T(' ') || *s == XML_T('\0')) { + ENTITY *e; + if (!poolAppendChar(&tempPool, XML_T('\0'))) + return 0; + e = (ENTITY *)lookup(&dtd.generalEntities, poolStart(&tempPool), 0); + if (e) + e->open = 1; + if (*s == XML_T(' ')) + s++; + openEntityNames = s; + poolDiscard(&tempPool); + } + else { + if (!poolAppendChar(&tempPool, *s)) + return 0; + s++; + } + } + return 1; +} + + +static +void normalizePublicId(XML_Char *publicId) +{ + XML_Char *p = publicId; + XML_Char *s; + for (s = publicId; *s; s++) { + switch (*s) { + case XML_T(' '): + case XML_T('\r'): + case XML_T('\n'): + if (p != publicId && p[-1] != XML_T(' ')) + *p++ = XML_T(' '); + break; + default: + *p++ = *s; + } + } + if (p != publicId && p[-1] == XML_T(' ')) + --p; + *p = XML_T('\0'); +} + static int dtdInit(DTD *p) { - static const char *names[] = { "lt", "amp", "gt", "quot", "apos" }; - static const char chars[] = { '<', '&', '>', '"', '\'' }; - int i; - poolInit(&(p->pool)); hashTableInit(&(p->generalEntities)); - for (i = 0; i < 5; i++) { - ENTITY *entity = (ENTITY *)lookup(&(p->generalEntities), names[i], sizeof(ENTITY)); - if (!entity) - return 0; - entity->textPtr = chars + i; - entity->textLen = 1; - entity->magic = 1; - } 
hashTableInit(&(p->elementTypes)); hashTableInit(&(p->attributeIds)); p->complete = 1; + p->base = 0; return 1; } @@ -1465,7 +2282,8 @@ static void dtdDestroy(DTD *p) ELEMENT_TYPE *e = (ELEMENT_TYPE *)hashTableIterNext(&iter); if (!e) break; - free(e->defaultAtts); + if (e->allocDefaultAtts != 0) + free(e->defaultAtts); } hashTableDestroy(&(p->generalEntities)); hashTableDestroy(&(p->elementTypes)); @@ -1473,6 +2291,124 @@ static void dtdDestroy(DTD *p) poolDestroy(&(p->pool)); } +/* Do a deep copy of the DTD. Return 0 for out of memory; non-zero otherwise. +The new DTD has already been initialized. */ + +static int dtdCopy(DTD *newDtd, const DTD *oldDtd) +{ + HASH_TABLE_ITER iter; + + if (oldDtd->base) { + const XML_Char *tem = poolCopyString(&(newDtd->pool), oldDtd->base); + if (!tem) + return 0; + newDtd->base = tem; + } + + hashTableIterInit(&iter, &(oldDtd->attributeIds)); + + /* Copy the attribute id table. */ + + for (;;) { + ATTRIBUTE_ID *newA; + const XML_Char *name; + const ATTRIBUTE_ID *oldA = (ATTRIBUTE_ID *)hashTableIterNext(&iter); + + if (!oldA) + break; + /* Remember to allocate the scratch byte before the name. */ + if (!poolAppendChar(&(newDtd->pool), XML_T('\0'))) + return 0; + name = poolCopyString(&(newDtd->pool), oldA->name); + if (!name) + return 0; + ++name; + newA = (ATTRIBUTE_ID *)lookup(&(newDtd->attributeIds), name, sizeof(ATTRIBUTE_ID)); + if (!newA) + return 0; + newA->maybeTokenized = oldA->maybeTokenized; + } + + /* Copy the element type table. 
*/ + + hashTableIterInit(&iter, &(oldDtd->elementTypes)); + + for (;;) { + int i; + ELEMENT_TYPE *newE; + const XML_Char *name; + const ELEMENT_TYPE *oldE = (ELEMENT_TYPE *)hashTableIterNext(&iter); + if (!oldE) + break; + name = poolCopyString(&(newDtd->pool), oldE->name); + if (!name) + return 0; + newE = (ELEMENT_TYPE *)lookup(&(newDtd->elementTypes), name, sizeof(ELEMENT_TYPE)); + if (!newE) + return 0; + newE->defaultAtts = (DEFAULT_ATTRIBUTE *)malloc(oldE->nDefaultAtts * sizeof(DEFAULT_ATTRIBUTE)); + if (!newE->defaultAtts) + return 0; + newE->allocDefaultAtts = newE->nDefaultAtts = oldE->nDefaultAtts; + for (i = 0; i < newE->nDefaultAtts; i++) { + newE->defaultAtts[i].id = (ATTRIBUTE_ID *)lookup(&(newDtd->attributeIds), oldE->defaultAtts[i].id->name, 0); + newE->defaultAtts[i].isCdata = oldE->defaultAtts[i].isCdata; + newE->defaultAtts[i].value = poolCopyString(&(newDtd->pool), oldE->defaultAtts[i].value); + if (!newE->defaultAtts[i].value) + return 0; + } + } + + /* Copy the entity table. 
*/ + + hashTableIterInit(&iter, &(oldDtd->generalEntities)); + + for (;;) { + ENTITY *newE; + const XML_Char *name; + const ENTITY *oldE = (ENTITY *)hashTableIterNext(&iter); + if (!oldE) + break; + name = poolCopyString(&(newDtd->pool), oldE->name); + if (!name) + return 0; + newE = (ENTITY *)lookup(&(newDtd->generalEntities), name, sizeof(ENTITY)); + if (!newE) + return 0; + if (oldE->systemId) { + const XML_Char *tem = poolCopyString(&(newDtd->pool), oldE->systemId); + if (!tem) + return 0; + newE->systemId = tem; + if (oldE->base) { + if (oldE->base == oldDtd->base) + newE->base = newDtd->base; + tem = poolCopyString(&(newDtd->pool), oldE->base); + if (!tem) + return 0; + newE->base = tem; + } + } + else { + const XML_Char *tem = poolCopyStringN(&(newDtd->pool), oldE->textPtr, oldE->textLen); + if (!tem) + return 0; + newE->textPtr = tem; + newE->textLen = oldE->textLen; + } + if (oldE->notation) { + const XML_Char *tem = poolCopyString(&(newDtd->pool), oldE->notation); + if (!tem) + return 0; + newE->notation = tem; + } + } + + newDtd->complete = oldDtd->complete; + newDtd->standalone = oldDtd->standalone; + return 1; +} + static void poolInit(STRING_POOL *pool) { @@ -1526,13 +2462,13 @@ void poolDestroy(STRING_POOL *pool) } static -char *poolAppend(STRING_POOL *pool, const ENCODING *enc, - const char *ptr, const char *end) +XML_Char *poolAppend(STRING_POOL *pool, const ENCODING *enc, + const char *ptr, const char *end) { if (!pool->ptr && !poolGrow(pool)) return 0; for (;;) { - XmlConvert(enc, XML_UTF8_ENCODING, &ptr, end, &(pool->ptr), pool->end); + XmlConvert(enc, &ptr, end, (ICHAR **)&(pool->ptr), (ICHAR *)pool->end); if (ptr == end) break; if (!poolGrow(pool)) @@ -1541,10 +2477,34 @@ char *poolAppend(STRING_POOL *pool, const ENCODING *enc, return pool->start; } +static const XML_Char *poolCopyString(STRING_POOL *pool, const XML_Char *s) +{ + do { + if (!poolAppendChar(pool, *s)) + return 0; + } while (*s++); + s = pool->start; + poolFinish(pool); + return 
s; +} + +static const XML_Char *poolCopyStringN(STRING_POOL *pool, const XML_Char *s, int n) +{ + if (!pool->ptr && !poolGrow(pool)) + return 0; + for (; n > 0; --n, s++) { + if (!poolAppendChar(pool, *s)) + return 0; + + } + s = pool->start; + poolFinish(pool); + return s; +} static -char *poolStoreString(STRING_POOL *pool, const ENCODING *enc, - const char *ptr, const char *end) +XML_Char *poolStoreString(STRING_POOL *pool, const ENCODING *enc, + const char *ptr, const char *end) { if (!poolAppend(pool, enc, ptr, end)) return 0; @@ -1572,7 +2532,7 @@ int poolGrow(STRING_POOL *pool) pool->freeBlocks->next = pool->blocks; pool->blocks = pool->freeBlocks; pool->freeBlocks = tem; - memcpy(pool->blocks->s, pool->start, pool->end - pool->start); + memcpy(pool->blocks->s, pool->start, (pool->end - pool->start) * sizeof(XML_Char)); pool->ptr = pool->blocks->s + (pool->ptr - pool->start); pool->start = pool->blocks->s; pool->end = pool->start + pool->blocks->size; @@ -1581,7 +2541,7 @@ int poolGrow(STRING_POOL *pool) } if (pool->blocks && pool->start == pool->blocks->s) { int blockSize = (pool->end - pool->start)*2; - pool->blocks = realloc(pool->blocks, offsetof(BLOCK, s) + blockSize); + pool->blocks = realloc(pool->blocks, offsetof(BLOCK, s) + blockSize * sizeof(XML_Char)); if (!pool->blocks) return 0; pool->blocks->size = blockSize; @@ -1596,13 +2556,13 @@ int poolGrow(STRING_POOL *pool) blockSize = INIT_BLOCK_SIZE; else blockSize *= 2; - tem = malloc(offsetof(BLOCK, s) + blockSize); + tem = malloc(offsetof(BLOCK, s) + blockSize * sizeof(XML_Char)); if (!tem) return 0; tem->size = blockSize; tem->next = pool->blocks; pool->blocks = tem; - memcpy(tem->s, pool->start, pool->ptr - pool->start); + memcpy(tem->s, pool->start, (pool->ptr - pool->start) * sizeof(XML_Char)); pool->ptr = tem->s + (pool->ptr - pool->start); pool->start = tem->s; pool->end = tem->s + blockSize; diff --git a/modules/xml/expat/xmlparse/xmlparse.h b/modules/xml/expat/xmlparse/xmlparse.h index 
216ec6d07861..13d5885ca2ab 100644 --- a/modules/xml/expat/xmlparse/xmlparse.h +++ b/modules/xml/expat/xmlparse/xmlparse.h @@ -31,32 +31,199 @@ extern "C" { typedef void *XML_Parser; -/* Constructs a new parser; encoding should be the name of the charset from -the Content-Type header if the Content-Type is text/xml, or null otherwise. */ +#ifdef XML_UNICODE_WCHAR_T -XML_Parser XMLPARSEAPI -XML_ParserCreate(const char *encoding); +/* XML_UNICODE_WCHAR_T will work only if sizeof(wchar_t) == 2 and wchar_t +uses Unicode. */ +/* Information is UTF-16 encoded as wchar_ts */ + +#ifndef XML_UNICODE +#define XML_UNICODE +#endif + +#include +typedef wchar_t XML_Char; +typedef wchar_t XML_LChar; + +#else /* not XML_UNICODE_WCHAR_T */ + +#ifdef XML_UNICODE + +/* Information is UTF-16 encoded as unsigned shorts */ +typedef unsigned short XML_Char; +typedef char XML_LChar; + +#else /* not XML_UNICODE */ /* Information is UTF-8 encoded. */ +typedef char XML_Char; +typedef char XML_LChar; -/* atts is array of name/value pairs, terminated by NULL; - names and values are '\0' terminated. */ +#endif /* not XML_UNICODE */ + +#endif /* not XML_UNICODE_WCHAR_T */ + + +/* Constructs a new parser; encoding is the encoding specified by the external +protocol or null if there is none specified. */ + +XML_Parser XMLPARSEAPI +XML_ParserCreate(const XML_Char *encoding); + + +/* atts is array of name/value pairs, terminated by 0; + names and values are 0 terminated. */ typedef void (*XML_StartElementHandler)(void *userData, - const char *name, - const char **atts); + const XML_Char *name, + const XML_Char **atts); typedef void (*XML_EndElementHandler)(void *userData, - const char *name); + const XML_Char *name); +/* s is not 0 terminated. 
*/ typedef void (*XML_CharacterDataHandler)(void *userData, - const char *s, + const XML_Char *s, int len); -/* target and data are '\0' terminated */ +/* target and data are 0 terminated */ typedef void (*XML_ProcessingInstructionHandler)(void *userData, - const char *target, - const char *data); + const XML_Char *target, + const XML_Char *data); + +/* This is called for any characters in the XML document for +which there is no applicable handler. This includes both +characters that are part of markup which is of a kind that is +not reported (comments, markup declarations), or characters +that are part of a construct which could be reported but +for which no handler has been supplied. The characters are passed +exactly as they were in the XML document except that +they will be encoded in UTF-8. Line boundaries are not normalized. +Note that a byte order mark character is not passed to the default handler. +If a default handler is set, internal entity references +are not expanded. There are no guarantees about +how characters are divided between calls to the default handler: +for example, a comment might be split between multiple calls. */ + +typedef void (*XML_DefaultHandler)(void *userData, + const XML_Char *s, + int len); + +/* This is called for a declaration of an unparsed (NDATA) +entity. The base argument is whatever was set by XML_SetBase. +The entityName, systemId and notationName arguments will never be null. +The other arguments may be. */ + +typedef void (*XML_UnparsedEntityDeclHandler)(void *userData, + const XML_Char *entityName, + const XML_Char *base, + const XML_Char *systemId, + const XML_Char *publicId, + const XML_Char *notationName); + +/* This is called for a declaration of notation. +The base argument is whatever was set by XML_SetBase. +The notationName will never be null. The other arguments can be. 
*/ + +typedef void (*XML_NotationDeclHandler)(void *userData, + const XML_Char *notationName, + const XML_Char *base, + const XML_Char *systemId, + const XML_Char *publicId); + +/* This is called for a reference to an external parsed general entity. +The referenced entity is not automatically parsed. +The application can parse it immediately or later using +XML_ExternalEntityParserCreate. +The parser argument is the parser parsing the entity containing the reference; +it can be passed as the parser argument to XML_ExternalEntityParserCreate. +The systemId argument is the system identifier as specified in the entity declaration; +it will not be null. +The base argument is the system identifier that should be used as the base for +resolving systemId if systemId was relative; this is set by XML_SetBase; +it may be null. +The publicId argument is the public identifier as specified in the entity declaration, +or null if none was specified; the whitespace in the public identifier +will have been normalized as required by the XML spec. +The openEntityNames argument is a space-separated list of the names of the entities +that are open for the parse of this entity (including the name of the referenced +entity); this can be passed as the openEntityNames argument to +XML_ExternalEntityParserCreate; openEntityNames is valid only until the handler +returns, so if the referenced entity is to be parsed later, it must be copied. +The handler should return 0 if processing should not continue because of +a fatal error in the handling of the external entity. +In this case the calling parser will return an XML_ERROR_EXTERNAL_ENTITY_HANDLING +error. +Note that unlike other handlers the first argument is the parser, not userData. 
*/ + +typedef int (*XML_ExternalEntityRefHandler)(XML_Parser parser, + const XML_Char *openEntityNames, + const XML_Char *base, + const XML_Char *systemId, + const XML_Char *publicId); + +/* This structure is filled in by the XML_UnknownEncodingHandler +to provide information to the parser about encodings that are unknown +to the parser. +The map[b] member gives information about byte sequences +whose first byte is b. +If map[b] is c where c is >= 0, then b by itself encodes the Unicode scalar value c. +If map[b] is -1, then the byte sequence is malformed. +If map[b] is -n, where n >= 2, then b is the first byte of an n-byte +sequence that encodes a single Unicode scalar value. +The data member will be passed as the first argument to the convert function. +The convert function is used to convert multibyte sequences; +s will point to a n-byte sequence where map[(unsigned char)*s] == -n. +The convert function must return the Unicode scalar value +represented by this byte sequence or -1 if the byte sequence is malformed. +The convert function may be null if the encoding is a single-byte encoding, +that is if map[b] >= -1 for all bytes b. +When the parser is finished with the encoding, then if release is not null, +it will call release passing it the data member; +once release has been called, the convert function will not be called again. + +Expat places certain restrictions on the encodings that are supported +using this mechanism. + +1. Every ASCII character that can appear in a well-formed XML document, +other than the characters + + $@\^`{}~ + +must be represented by a single byte, and that byte must be the +same byte that represents that character in ASCII. + +2. No character may require more than 4 bytes to encode. + +3. All characters encoded must have Unicode scalar values <= 0xFFFF, +(ie characters that would be encoded by surrogates in UTF-16 +are not allowed). Note that this restriction doesn't apply to +the built-in support for UTF-8 and UTF-16. + +4. 
No Unicode character may be encoded by more than one distinct sequence +of bytes. */ + +typedef struct { + int map[256]; + void *data; + int (*convert)(void *data, const char *s); + void (*release)(void *data); +} XML_Encoding; + +/* This is called for an encoding that is unknown to the parser. +The encodingHandlerData argument is that which was passed as the +second argument to XML_SetUnknownEncodingHandler. +The name argument gives the name of the encoding as specified in +the encoding declaration. +If the callback can provide information about the encoding, +it must fill in the XML_Encoding structure, and return 1. +Otherwise it must return 0. +If info does not describe a suitable encoding, +then the parser will return an XML_UNKNOWN_ENCODING error. */ + +typedef int (*XML_UnknownEncodingHandler)(void *encodingHandlerData, + const XML_Char *name, + XML_Encoding *info); void XMLPARSEAPI XML_SetElementHandler(XML_Parser parser, @@ -71,10 +238,62 @@ void XMLPARSEAPI XML_SetProcessingInstructionHandler(XML_Parser parser, XML_ProcessingInstructionHandler handler); +void XMLPARSEAPI +XML_SetDefaultHandler(XML_Parser parser, + XML_DefaultHandler handler); + +void XMLPARSEAPI +XML_SetUnparsedEntityDeclHandler(XML_Parser parser, + XML_UnparsedEntityDeclHandler handler); + +void XMLPARSEAPI +XML_SetNotationDeclHandler(XML_Parser parser, + XML_NotationDeclHandler handler); + +void XMLPARSEAPI +XML_SetExternalEntityRefHandler(XML_Parser parser, + XML_ExternalEntityRefHandler handler); + +void XMLPARSEAPI +XML_SetUnknownEncodingHandler(XML_Parser parser, + XML_UnknownEncodingHandler handler, + void *encodingHandlerData); + +/* This can be called within a handler for a start element, end element, +processing instruction or character data. It causes the corresponding +markup to be passed to the default handler. 
+Within the expansion of an internal entity, nothing will be passed +to the default handler, although this usually will not happen since +setting a default handler inhibits expansion of internal entities. */ +void XMLPARSEAPI XML_DefaultCurrent(XML_Parser parser); + /* This value is passed as the userData argument to callbacks. */ void XMLPARSEAPI XML_SetUserData(XML_Parser parser, void *userData); +/* Returns the last value set by XML_SetUserData or null. */ +#define XML_GetUserData(parser) (*(void **)(parser)) + +/* If this function is called, then the parser will be passed +as the first argument to callbacks instead of userData. +The userData will still be accessible using XML_GetUserData. */ + +void XMLPARSEAPI +XML_UseParserAsHandlerArg(XML_Parser parser); + +/* Sets the base to be used for resolving relative URIs in system identifiers in +declarations. Resolving relative identifiers is left to the application: +this value will be passed through as the base argument to the +XML_ExternalEntityRefHandler, XML_NotationDeclHandler +and XML_UnparsedEntityDeclHandler. The base argument will be copied. +Returns zero if out of memory, non-zero otherwise. */ + +int XMLPARSEAPI +XML_SetBase(XML_Parser parser, const XML_Char *base); + +const XML_Char XMLPARSEAPI * +XML_GetBase(XML_Parser parser); + /* Parses some input. Returns 0 if a fatal error is detected. The last call to XML_Parse must have isFinal true; len may be zero for this call (or any other). */ @@ -87,8 +306,20 @@ XML_GetBuffer(XML_Parser parser, int len); int XMLPARSEAPI XML_ParseBuffer(XML_Parser parser, int len, int isFinal); -/* If XML_Parser or XML_ParseEnd have returned 0, then XML_GetError* -returns information about the error. 
*/ +/* Creates an XML_Parser object that can parse an external general entity; +openEntityNames is a space-separated list of the names of the entities that are open +for the parse of this entity (including the name of this one); +encoding is the externally specified encoding, +or null if there is no externally specified encoding. +This can be called at any point after the first call to an ExternalEntityRefHandler +so long as the parser has not yet been freed. +The new parser is completely independent and may safely be used in a separate thread. +The handlers and userData are initialized from the parser argument. +Returns 0 if out of memory. Otherwise returns a new XML_Parser object. */ +XML_Parser XMLPARSEAPI +XML_ExternalEntityParserCreate(XML_Parser parser, + const XML_Char *openEntityNames, + const XML_Char *encoding); enum XML_Error { XML_ERROR_NONE, @@ -110,19 +341,39 @@ enum XML_Error { XML_ERROR_ATTRIBUTE_EXTERNAL_ENTITY_REF, XML_ERROR_MISPLACED_XML_PI, XML_ERROR_UNKNOWN_ENCODING, - XML_ERROR_INCORRECT_ENCODING + XML_ERROR_INCORRECT_ENCODING, + XML_ERROR_UNCLOSED_CDATA_SECTION, + XML_ERROR_EXTERNAL_ENTITY_HANDLING }; -int XMLPARSEAPI XML_GetErrorCode(XML_Parser parser); -int XMLPARSEAPI XML_GetErrorLineNumber(XML_Parser parser); -int XMLPARSEAPI XML_GetErrorColumnNumber(XML_Parser parser); -long XMLPARSEAPI XML_GetErrorByteIndex(XML_Parser parser); +/* If XML_Parse or XML_ParseBuffer have returned 0, then XML_GetErrorCode +returns information about the error. */ +enum XML_Error XMLPARSEAPI XML_GetErrorCode(XML_Parser parser); + +/* These functions return information about the current parse location. +They may be called when XML_Parse or XML_ParseBuffer return 0; +in this case the location is the location of the character at which +the error was detected. +They may also be called from any other callback called to report +some parse event; in this case the location is the location of the first +of the sequence of characters that generated the event.
*/ + +int XMLPARSEAPI XML_GetCurrentLineNumber(XML_Parser parser); +int XMLPARSEAPI XML_GetCurrentColumnNumber(XML_Parser parser); +long XMLPARSEAPI XML_GetCurrentByteIndex(XML_Parser parser); + +/* For backwards compatibility with previous versions. */ +#define XML_GetErrorLineNumber XML_GetCurrentLineNumber +#define XML_GetErrorColumnNumber XML_GetCurrentColumnNumber +#define XML_GetErrorByteIndex XML_GetCurrentByteIndex + +/* Frees memory used by the parser. */ void XMLPARSEAPI XML_ParserFree(XML_Parser parser); -const char XMLPARSEAPI * -XML_ErrorString(int code); +/* Returns a string describing the error. */ +const XML_LChar XMLPARSEAPI *XML_ErrorString(int code); #ifdef __cplusplus } diff --git a/modules/xml/expat/xmltok/xmlrole.c b/modules/xml/expat/xmltok/xmlrole.c index 340147ee6926..72be89bff8fa 100644 --- a/modules/xml/expat/xmltok/xmlrole.c +++ b/modules/xml/expat/xmltok/xmlrole.c @@ -594,7 +594,7 @@ int notation4(PROLOG_STATE *state, return XML_ROLE_NOTATION_SYSTEM_ID; case XML_TOK_DECL_CLOSE: state->handler = internalSubset; - return XML_ROLE_NONE; + return XML_ROLE_NOTATION_NO_SYSTEM_ID; } return syntaxError(state); } diff --git a/modules/xml/expat/xmltok/xmlrole.h b/modules/xml/expat/xmltok/xmlrole.h index 4f4655f821c8..ecbcc26dff05 100644 --- a/modules/xml/expat/xmltok/xmlrole.h +++ b/modules/xml/expat/xmltok/xmlrole.h @@ -44,6 +44,7 @@ enum { XML_ROLE_ENTITY_NOTATION_NAME, XML_ROLE_NOTATION_NAME, XML_ROLE_NOTATION_SYSTEM_ID, + XML_ROLE_NOTATION_NO_SYSTEM_ID, XML_ROLE_NOTATION_PUBLIC_ID, XML_ROLE_ATTRIBUTE_NAME, XML_ROLE_ATTRIBUTE_TYPE_CDATA, diff --git a/modules/xml/expat/xmltok/xmltok.c b/modules/xml/expat/xmltok/xmltok.c index aba5e55c78ee..bcd06eb9f99f 100644 --- a/modules/xml/expat/xmltok/xmltok.c +++ b/modules/xml/expat/xmltok/xmltok.c @@ -23,7 +23,7 @@ Contributor(s): #include "nametab.h" #define VTABLE1 \ - { PREFIX(prologTok), PREFIX(contentTok) }, \ + { PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok) }, \ { 
PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \ PREFIX(sameName), \ PREFIX(nameMatchesAscii), \ @@ -31,14 +31,11 @@ Contributor(s): PREFIX(skipS), \ PREFIX(getAtts), \ PREFIX(charRefNumber), \ + PREFIX(predefinedEntityName), \ PREFIX(updatePosition), \ PREFIX(isPublicId) -#define VTABLE2 \ - PREFIX(encode), \ - { PREFIX(toUtf8) } - -#define VTABLE VTABLE1, VTABLE2 +#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16) #define UCS2_GET_NAMING(pages, hi, lo) \ (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F))) @@ -81,11 +78,79 @@ We need 8 bits to index into pages, 3 bits to add to that index and #define UTF8_INVALID4(p) ((*p) == 0xF4 && ((p)[1] & 0x30) != 0) +static +int isNever(const ENCODING *enc, const char *p) +{ + return 0; +} + +static +int utf8_isName2(const ENCODING *enc, const char *p) +{ + return UTF8_GET_NAMING2(namePages, (const unsigned char *)p); +} + +static +int utf8_isName3(const ENCODING *enc, const char *p) +{ + return UTF8_GET_NAMING3(namePages, (const unsigned char *)p); +} + +#define utf8_isName4 isNever + +static +int utf8_isNmstrt2(const ENCODING *enc, const char *p) +{ + return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p); +} + +static +int utf8_isNmstrt3(const ENCODING *enc, const char *p) +{ + return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p); +} + +#define utf8_isNmstrt4 isNever + +#define utf8_isInvalid2 isNever + +static +int utf8_isInvalid3(const ENCODING *enc, const char *p) +{ + return UTF8_INVALID3((const unsigned char *)p); +} + +static +int utf8_isInvalid4(const ENCODING *enc, const char *p) +{ + return UTF8_INVALID4((const unsigned char *)p); +} + struct normal_encoding { ENCODING enc; unsigned char type[256]; + int (*isName2)(const ENCODING *, const char *); + int (*isName3)(const ENCODING *, const char *); + int (*isName4)(const ENCODING *, const char *); + int (*isNmstrt2)(const ENCODING *, const char *); + int (*isNmstrt3)(const ENCODING *, const char *); + int 
(*isNmstrt4)(const ENCODING *, const char *); + int (*isInvalid2)(const ENCODING *, const char *); + int (*isInvalid3)(const ENCODING *, const char *); + int (*isInvalid4)(const ENCODING *, const char *); }; +#define NORMAL_VTABLE(E) \ + E ## isName2, \ + E ## isName3, \ + E ## isName4, \ + E ## isNmstrt2, \ + E ## isNmstrt3, \ + E ## isNmstrt4, \ + E ## isInvalid2, \ + E ## isInvalid3, \ + E ## isInvalid4 + static int checkCharRefNumber(int); #include "xmltok_impl.h" @@ -95,12 +160,16 @@ static int checkCharRefNumber(int); #define BYTE_TYPE(enc, p) \ (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)]) #define BYTE_TO_ASCII(enc, p) (*p) -#define IS_NAME_CHAR(enc, p, n) UTF8_GET_NAMING(namePages, p, n) -#define IS_NMSTRT_CHAR(enc, p, n) UTF8_GET_NAMING(nmstrtPages, p, n) + +#define IS_NAME_CHAR(enc, p, n) \ + (((const struct normal_encoding *)(enc))->isName ## n(enc, p)) +#define IS_NMSTRT_CHAR(enc, p, n) \ + (((const struct normal_encoding *)(enc))->isNmstrt ## n(enc, p)) #define IS_INVALID_CHAR(enc, p, n) \ -((n) == 3 \ - ? UTF8_INVALID3((const unsigned char *)(p)) \ - : ((n) == 4 ? 
UTF8_INVALID4((const unsigned char *)(p)) : 0)) + (((const struct normal_encoding *)(enc))->isInvalid ## n(enc, p)) + +#define IS_NAME_CHAR_MINBPC(enc, p) (0) +#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0) /* c is an ASCII character */ #define CHAR_MATCHES(enc, p, c) (*(p) == c) @@ -113,51 +182,18 @@ static int checkCharRefNumber(int); #undef BYTE_TO_ASCII #undef CHAR_MATCHES #undef IS_NAME_CHAR +#undef IS_NAME_CHAR_MINBPC #undef IS_NMSTRT_CHAR +#undef IS_NMSTRT_CHAR_MINBPC #undef IS_INVALID_CHAR -enum { - /* cvalN is value of masked first byte of N byte sequence */ - cval1 = 0x00, - cval2 = 0xc0, - cval3 = 0xe0, - cval4 = 0xf0, - /* minN is minimum legal resulting value for N byte sequence */ - min2 = 0x80, - min3 = 0x800, - min4 = 0x10000 +enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */ + UTF8_cval1 = 0x00, + UTF8_cval2 = 0xc0, + UTF8_cval3 = 0xe0, + UTF8_cval4 = 0xf0 }; -static -int utf8_encode(const ENCODING *enc, int c, char *buf) -{ - if (c < 0) - return 0; - if (c < min2) { - buf[0] = (c | cval1); - return 1; - } - if (c < min3) { - buf[0] = ((c >> 6) | cval2); - buf[1] = ((c & 0x3f) | 0x80); - return 2; - } - if (c < min4) { - buf[0] = ((c >> 12) | cval3); - buf[1] = (((c >> 6) & 0x3f) | 0x80); - buf[2] = ((c & 0x3f) | 0x80); - return 3; - } - if (c < 0x110000) { - buf[0] = ((c >> 18) | cval4); - buf[1] = (((c >> 12) & 0x3f) | 0x80); - buf[2] = (((c >> 6) & 0x3f) | 0x80); - buf[3] = ((c & 0x3f) | 0x80); - return 4; - } - return 0; -} - static void utf8_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim, @@ -177,34 +213,63 @@ void utf8_toUtf8(const ENCODING *enc, *toP = to; } +static +void utf8_toUtf16(const ENCODING *enc, + const char **fromP, const char *fromLim, + unsigned short **toP, const unsigned short *toLim) +{ + unsigned short *to = *toP; + const char *from = *fromP; + while (from != fromLim && to != toLim) { + switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) { + case BT_LEAD2: + *to++ = 
((from[0] & 0x1f) << 6) | (from[1] & 0x3f); + from += 2; + break; + case BT_LEAD3: + *to++ = ((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f); + from += 3; + break; + case BT_LEAD4: + { + unsigned long n; + if (to + 1 == toLim) + break; + n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f); + n -= 0x10000; + to[0] = (unsigned short)((n >> 10) | 0xD800); + to[1] = (unsigned short)((n & 0x3FF) | 0xDC00); + to += 2; + from += 4; + } + break; + default: + *to++ = *from++; + break; + } + } + *fromP = from; + *toP = to; +} + static const struct normal_encoding utf8_encoding = { - { VTABLE1, utf8_encode, { utf8_toUtf8 }, 1 }, + { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, { #include "asciitab.h" #include "utf8tab.h" - } + }, + NORMAL_VTABLE(utf8_) }; static const struct normal_encoding internal_utf8_encoding = { - { VTABLE1, utf8_encode, { utf8_toUtf8 }, 1 }, + { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 }, { #include "iasciitab.h" #include "utf8tab.h" - } + }, + NORMAL_VTABLE(utf8_) }; -static -int latin1_encode(const ENCODING *enc, int c, char *buf) -{ - if (c < 0) - return 0; - if (c <= 0xFF) { - buf[0] = (char)c; - return 1; - } - return 0; -} - static void latin1_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim, @@ -218,7 +283,7 @@ void latin1_toUtf8(const ENCODING *enc, if (c & 0x80) { if (toLim - *toP < 2) break; - *(*toP)++ = ((c >> 6) | cval2); + *(*toP)++ = ((c >> 6) | UTF8_cval2); *(*toP)++ = ((c & 0x3f) | 0x80); (*fromP)++; } @@ -230,15 +295,39 @@ void latin1_toUtf8(const ENCODING *enc, } } +static +void latin1_toUtf16(const ENCODING *enc, + const char **fromP, const char *fromLim, + unsigned short **toP, const unsigned short *toLim) +{ + while (*fromP != fromLim && *toP != toLim) + *(*toP)++ = (unsigned char)*(*fromP)++; +} + static const struct normal_encoding latin1_encoding = { - { VTABLE1, latin1_encode, { latin1_toUtf8 }, 1 }, + { VTABLE1, latin1_toUtf8, 
latin1_toUtf16, 1, 0, 0 }, { #include "asciitab.h" #include "latin1tab.h" } }; -#define latin1tab (latin1_encoding.type) +static +void ascii_toUtf8(const ENCODING *enc, + const char **fromP, const char *fromLim, + char **toP, const char *toLim) +{ + while (*fromP != fromLim && *toP != toLim) + *(*toP)++ = *(*fromP)++; +} + +static const struct normal_encoding ascii_encoding = { + { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 }, + { +#include "asciitab.h" +/* BT_NONXML == 0 */ + } +}; #undef PREFIX @@ -260,25 +349,6 @@ static int unicode_byte_type(char hi, char lo) return BT_NONASCII; } -#define DEFINE_UTF16_ENCODE \ -static \ -int PREFIX(encode)(const ENCODING *enc, int charNum, char *buf) \ -{ \ - if (charNum < 0) \ - return 0; \ - if (charNum < 0x10000) { \ - SET2(buf, charNum); \ - return 2; \ - } \ - if (charNum < 0x110000) { \ - charNum -= 0x10000; \ - SET2(buf, (charNum >> 10) + 0xD800); \ - SET2(buf + 2, (charNum & 0x3FF) + 0xDC00); \ - return 4; \ - } \ - return 0; \ -} - #define DEFINE_UTF16_TO_UTF8 \ static \ void PREFIX(toUtf8)(const ENCODING *enc, \ @@ -308,7 +378,7 @@ void PREFIX(toUtf8)(const ENCODING *enc, \ *fromP = from; \ return; \ } \ - *(*toP)++ = ((lo >> 6) | (hi << 2) | cval2); \ + *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \ *(*toP)++ = ((lo & 0x3f) | 0x80); \ break; \ default: \ @@ -317,7 +387,7 @@ void PREFIX(toUtf8)(const ENCODING *enc, \ return; \ } \ /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \ - *(*toP)++ = ((hi >> 4) | cval3); \ + *(*toP)++ = ((hi >> 4) | UTF8_cval3); \ *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \ *(*toP)++ = ((lo & 0x3f) | 0x80); \ break; \ @@ -327,7 +397,7 @@ void PREFIX(toUtf8)(const ENCODING *enc, \ return; \ } \ plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \ - *(*toP)++ = ((plane >> 2) | cval4); \ + *(*toP)++ = ((plane >> 2) | UTF8_cval4); \ *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \ from += 2; \ lo2 = GET_LO(from); \ @@ -342,15 +412,33 @@ void 
PREFIX(toUtf8)(const ENCODING *enc, \ *fromP = from; \ } +#define DEFINE_UTF16_TO_UTF16 \ +static \ +void PREFIX(toUtf16)(const ENCODING *enc, \ + const char **fromP, const char *fromLim, \ + unsigned short **toP, const unsigned short *toLim) \ +{ \ + /* Avoid copying first half only of surrogate */ \ + if (fromLim - *fromP > ((toLim - *toP) << 1) \ + && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \ + fromLim -= 2; \ + for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \ + *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \ +} + #define PREFIX(ident) little2_ ## ident #define MINBPC 2 #define BYTE_TYPE(enc, p) \ - ((p)[1] == 0 ? latin1tab[(unsigned char)*(p)] : unicode_byte_type((p)[1], (p)[0])) + ((p)[1] == 0 \ + ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \ + : unicode_byte_type((p)[1], (p)[0])) #define BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1) #define CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c) -#define IS_NAME_CHAR(enc, p, n) \ +#define IS_NAME_CHAR(enc, p, n) (0) +#define IS_NAME_CHAR_MINBPC(enc, p) \ UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0]) -#define IS_NMSTRT_CHAR(enc, p, n) \ +#define IS_NMSTRT_CHAR(enc, p, n) (0) +#define IS_NMSTRT_CHAR_MINBPC(enc, p) \ UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0]) #include "xmltok_impl.c" @@ -360,8 +448,8 @@ void PREFIX(toUtf8)(const ENCODING *enc, \ #define GET_LO(ptr) ((unsigned char)(ptr)[0]) #define GET_HI(ptr) ((unsigned char)(ptr)[1]) -DEFINE_UTF16_ENCODE DEFINE_UTF16_TO_UTF8 +DEFINE_UTF16_TO_UTF16 #undef SET2 #undef GET_LO @@ -371,10 +459,32 @@ DEFINE_UTF16_TO_UTF8 #undef BYTE_TO_ASCII #undef CHAR_MATCHES #undef IS_NAME_CHAR +#undef IS_NAME_CHAR_MINBPC #undef IS_NMSTRT_CHAR +#undef IS_NMSTRT_CHAR_MINBPC #undef IS_INVALID_CHAR -static const struct encoding little2_encoding = { VTABLE, 2 }; +static const struct normal_encoding little2_encoding = { + { VTABLE, 2, 0, +#if BYTE_ORDER == 12 + 1 +#else + 0 +#endif + }, +#include 
"asciitab.h" +#include "latin1tab.h" +}; + +#if BYTE_ORDER != 21 + +static const struct normal_encoding internal_little2_encoding = { + { VTABLE, 2, 0, 1 }, +#include "iasciitab.h" +#include "latin1tab.h" +}; + +#endif #undef PREFIX @@ -382,12 +492,16 @@ static const struct encoding little2_encoding = { VTABLE, 2 }; #define MINBPC 2 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */ #define BYTE_TYPE(enc, p) \ - ((p)[0] == 0 ? latin1tab[(unsigned char)(p)[1]] : unicode_byte_type((p)[0], (p)[1])) + ((p)[0] == 0 \ + ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \ + : unicode_byte_type((p)[0], (p)[1])) #define BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1) #define CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c) -#define IS_NAME_CHAR(enc, p, n) \ +#define IS_NAME_CHAR(enc, p, n) 0 +#define IS_NAME_CHAR_MINBPC(enc, p) \ UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1]) -#define IS_NMSTRT_CHAR(enc, p, n) \ +#define IS_NMSTRT_CHAR(enc, p, n) (0) +#define IS_NMSTRT_CHAR_MINBPC(enc, p) \ UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1]) #include "xmltok_impl.c" @@ -397,8 +511,8 @@ static const struct encoding little2_encoding = { VTABLE, 2 }; #define GET_LO(ptr) ((unsigned char)(ptr)[1]) #define GET_HI(ptr) ((unsigned char)(ptr)[0]) -DEFINE_UTF16_ENCODE DEFINE_UTF16_TO_UTF8 +DEFINE_UTF16_TO_UTF16 #undef SET2 #undef GET_LO @@ -408,10 +522,32 @@ DEFINE_UTF16_TO_UTF8 #undef BYTE_TO_ASCII #undef CHAR_MATCHES #undef IS_NAME_CHAR +#undef IS_NAME_CHAR_MINBPC #undef IS_NMSTRT_CHAR +#undef IS_NMSTRT_CHAR_MINBPC #undef IS_INVALID_CHAR -static const struct encoding big2_encoding = { VTABLE, 2 }; +static const struct normal_encoding big2_encoding = { + { VTABLE, 2, 0, +#if BYTE_ORDER == 21 + 1 +#else + 0 +#endif + }, +#include "asciitab.h" +#include "latin1tab.h" +}; + +#if BYTE_ORDER != 12 + +static const struct normal_encoding internal_big2_encoding = { + { VTABLE, 2, 0, 1 }, +#include "iasciitab.h" 
+#include "latin1tab.h" +}; + +#endif #undef PREFIX @@ -454,18 +590,18 @@ int initScan(const ENCODING *enc, int state, const char *ptr, const char *end, else { switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) { case 0x003C: - *encPtr = &big2_encoding; + *encPtr = &big2_encoding.enc; return XmlTok(*encPtr, state, ptr, end, nextTokPtr); case 0xFEFF: *nextTokPtr = ptr + 2; - *encPtr = &big2_encoding; + *encPtr = &big2_encoding.enc; return XML_TOK_BOM; case 0x3C00: - *encPtr = &little2_encoding; + *encPtr = &little2_encoding.enc; return XmlTok(*encPtr, state, ptr, end, nextTokPtr); case 0xFFFE: *nextTokPtr = ptr + 2; - *encPtr = &little2_encoding; + *encPtr = &little2_encoding.enc; return XML_TOK_BOM; } } @@ -494,13 +630,21 @@ void initUpdatePosition(const ENCODING *enc, const char *ptr, normal_updatePosition(&utf8_encoding.enc, ptr, end, pos); } -const ENCODING *XmlGetInternalEncoding(int e) +const ENCODING *XmlGetUtf8InternalEncoding() { - switch (e) { - case XML_UTF8_ENCODING: - return &internal_utf8_encoding.enc; - } - return 0; + return &internal_utf8_encoding.enc; +} + +const ENCODING *XmlGetUtf16InternalEncoding() +{ +#if BYTE_ORDER == 12 + return &internal_little2_encoding.enc; +#elif BYTE_ORDER == 21 + return &internal_big2_encoding.enc; +#else + const short n = 1; + return *(const char *)&n ? 
&internal_little2_encoding.enc : &internal_big2_encoding.enc; +#endif } int XmlInitEncoding(INIT_ENCODING *p, const ENCODING **encPtr, const char *name) @@ -514,6 +658,10 @@ int XmlInitEncoding(INIT_ENCODING *p, const ENCODING **encPtr, const char *name) *encPtr = &utf8_encoding.enc; return 1; } + if (streqci(name, "US-ASCII")) { + *encPtr = &ascii_encoding.enc; + return 1; + } if (!streqci(name, "UTF-16")) return 0; } @@ -531,7 +679,7 @@ int toAscii(const ENCODING *enc, const char *ptr, const char *end) { char buf[1]; char *p = buf; - XmlConvert(enc, XML_UTF8_ENCODING, &ptr, end, &p, p + 1); + XmlUtf8Convert(enc, &ptr, end, &p, p + 1); if (p == buf) return -1; else @@ -641,7 +789,7 @@ const ENCODING *findEncoding(const ENCODING *enc, const char *ptr, const char *e char buf[ENCODING_MAX]; char *p = buf; int i; - XmlConvert(enc, XML_UTF8_ENCODING, &ptr, end, &p, p + ENCODING_MAX - 1); + XmlUtf8Convert(enc, &ptr, end, &p, p + ENCODING_MAX - 1); if (ptr != end) return 0; *p = 0; @@ -653,11 +801,13 @@ const ENCODING *findEncoding(const ENCODING *enc, const char *ptr, const char *e return &utf8_encoding.enc; if (streqci(buf, "ISO-8859-1")) return &latin1_encoding.enc; + if (streqci(buf, "US-ASCII")) + return &ascii_encoding.enc; if (streqci(buf, "UTF-16")) { static const unsigned short n = 1; if (enc->minBytesPerChar == 2) return enc; - return &big2_encoding; + return &big2_encoding.enc; } return 0; } @@ -757,3 +907,229 @@ int checkCharRefNumber(int result) return result; } +int XmlUtf8Encode(int c, char *buf) +{ + enum { + /* minN is minimum legal resulting value for N byte sequence */ + min2 = 0x80, + min3 = 0x800, + min4 = 0x10000 + }; + + if (c < 0) + return 0; + if (c < min2) { + buf[0] = (c | UTF8_cval1); + return 1; + } + if (c < min3) { + buf[0] = ((c >> 6) | UTF8_cval2); + buf[1] = ((c & 0x3f) | 0x80); + return 2; + } + if (c < min4) { + buf[0] = ((c >> 12) | UTF8_cval3); + buf[1] = (((c >> 6) & 0x3f) | 0x80); + buf[2] = ((c & 0x3f) | 0x80); + return 3; + } + 
if (c < 0x110000) { + buf[0] = ((c >> 18) | UTF8_cval4); + buf[1] = (((c >> 12) & 0x3f) | 0x80); + buf[2] = (((c >> 6) & 0x3f) | 0x80); + buf[3] = ((c & 0x3f) | 0x80); + return 4; + } + return 0; +} + +int XmlUtf16Encode(int charNum, unsigned short *buf) +{ + if (charNum < 0) + return 0; + if (charNum < 0x10000) { + buf[0] = charNum; + return 1; + } + if (charNum < 0x110000) { + charNum -= 0x10000; + buf[0] = (charNum >> 10) + 0xD800; + buf[1] = (charNum & 0x3FF) + 0xDC00; + return 2; + } + return 0; +} + +struct unknown_encoding { + struct normal_encoding normal; + int (*convert)(void *userData, const char *p); + void *userData; + unsigned short utf16[256]; + char utf8[256][4]; +}; + +int XmlSizeOfUnknownEncoding() +{ + return sizeof(struct unknown_encoding); +} + +static +int unknown_isName(const ENCODING *enc, const char *p) +{ + int c = ((const struct unknown_encoding *)enc) + ->convert(((const struct unknown_encoding *)enc)->userData, p); + if (c & ~0xFFFF) + return 0; + return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF); +} + +static +int unknown_isNmstrt(const ENCODING *enc, const char *p) +{ + int c = ((const struct unknown_encoding *)enc) + ->convert(((const struct unknown_encoding *)enc)->userData, p); + if (c & ~0xFFFF) + return 0; + return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF); +} + +static +int unknown_isInvalid(const ENCODING *enc, const char *p) +{ + int c = ((const struct unknown_encoding *)enc) + ->convert(((const struct unknown_encoding *)enc)->userData, p); + return (c & ~0xFFFF) || checkCharRefNumber(c) < 0; +} + +static +void unknown_toUtf8(const ENCODING *enc, + const char **fromP, const char *fromLim, + char **toP, const char *toLim) +{ + char buf[XML_UTF8_ENCODE_MAX]; + for (;;) { + const char *utf8; + int n; + if (*fromP == fromLim) + break; + utf8 = ((const struct unknown_encoding *)enc)->utf8[(unsigned char)**fromP]; + n = *utf8++; + if (n == 0) { + int c = ((const struct unknown_encoding *)enc) + ->convert(((const struct 
unknown_encoding *)enc)->userData, *fromP); + n = XmlUtf8Encode(c, buf); + if (n > toLim - *toP) + break; + utf8 = buf; + *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP] + - (BT_LEAD2 - 2); + } + else { + if (n > toLim - *toP) + break; + (*fromP)++; + } + do { + *(*toP)++ = *utf8++; + } while (--n != 0); + } +} + +static +void unknown_toUtf16(const ENCODING *enc, + const char **fromP, const char *fromLim, + unsigned short **toP, const unsigned short *toLim) +{ + while (*fromP != fromLim && *toP != toLim) { + unsigned short c + = ((const struct unknown_encoding *)enc)->utf16[(unsigned char)**fromP]; + if (c == 0) { + c = (unsigned short)((const struct unknown_encoding *)enc) + ->convert(((const struct unknown_encoding *)enc)->userData, *fromP); + *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP] + - (BT_LEAD2 - 2); + } + else + (*fromP)++; + *(*toP)++ = c; + } +} + +ENCODING * +XmlInitUnknownEncoding(void *mem, + int *table, + int (*convert)(void *userData, const char *p), + void *userData) +{ + int i; + struct unknown_encoding *e = mem; + for (i = 0; i < sizeof(struct normal_encoding); i++) + ((char *)mem)[i] = ((char *)&latin1_encoding)[i]; + for (i = 0; i < 128; i++) + if (latin1_encoding.type[i] != BT_OTHER + && latin1_encoding.type[i] != BT_NONXML + && table[i] != i) + return 0; + for (i = 0; i < 256; i++) { + int c = table[i]; + if (c == -1) { + e->normal.type[i] = BT_MALFORM; + /* This shouldn't really get used. */ + e->utf16[i] = 0xFFFF; + e->utf8[i][0] = 1; + e->utf8[i][1] = 0; + } + else if (c < 0) { + if (c < -4) + return 0; + e->normal.type[i] = BT_LEAD2 - (c + 2); + e->utf8[i][0] = 0; + e->utf16[i] = 0; + } + else if (c < 0x80) { + if (latin1_encoding.type[c] != BT_OTHER + && latin1_encoding.type[c] != BT_NONXML + && c != i) + return 0; + e->normal.type[i] = latin1_encoding.type[c]; + e->utf8[i][0] = 1; + e->utf8[i][1] = (char)c; + e->utf16[i] = c == 0 ? 
0xFFFF : c; + } + else if (checkCharRefNumber(c) < 0) { + e->normal.type[i] = BT_NONXML; + /* This shouldn't really get used. */ + e->utf16[i] = 0xFFFF; + e->utf8[i][0] = 1; + e->utf8[i][1] = 0; + } + else { + if (c > 0xFFFF) + return 0; + if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff)) + e->normal.type[i] = BT_NMSTRT; + else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff)) + e->normal.type[i] = BT_NAME; + else + e->normal.type[i] = BT_OTHER; + e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1); + e->utf16[i] = c; + } + } + e->userData = userData; + e->convert = convert; + if (convert) { + e->normal.isName2 = unknown_isName; + e->normal.isName3 = unknown_isName; + e->normal.isName4 = unknown_isName; + e->normal.isNmstrt2 = unknown_isNmstrt; + e->normal.isNmstrt3 = unknown_isNmstrt; + e->normal.isNmstrt4 = unknown_isNmstrt; + e->normal.isInvalid2 = unknown_isInvalid; + e->normal.isInvalid3 = unknown_isInvalid; + e->normal.isInvalid4 = unknown_isInvalid; + } + e->normal.enc.utf8Convert = unknown_toUtf8; + e->normal.enc.utf16Convert = unknown_toUtf16; + return &(e->normal.enc); +} diff --git a/modules/xml/expat/xmltok/xmltok.h b/modules/xml/expat/xmltok/xmltok.h index d1fa5af5c3e2..6d0b91dff2ce 100644 --- a/modules/xml/expat/xmltok/xmltok.h +++ b/modules/xml/expat/xmltok/xmltok.h @@ -29,6 +29,9 @@ extern "C" { #define XMLTOKAPI /* as nothing */ #endif +/* The following token may be returned by XmlContentTok */ +#define XML_TOK_TRAILING_RSQB -5 /* ] or ]] at the end of the scan; might be start of + illegal ]]> sequence */ /* The following tokens may be returned by both XmlPrologTok and XmlContentTok */ #define XML_TOK_NONE -4 /* The string to be scanned is empty */ #define XML_TOK_TRAILING_CR -3 /* A CR at the end of the scan; @@ -38,7 +41,7 @@ extern "C" { #define XML_TOK_INVALID 0 /* The following tokens are returned by XmlContentTok; some are also - returned by XmlAttributeValueTok and XmlEntityTok */ + returned by XmlAttributeValueTok, XmlEntityTok, 
XmlCdataSectionTok */ #define XML_TOK_START_TAG_WITH_ATTS 1 #define XML_TOK_START_TAG_NO_ATTS 2 @@ -47,7 +50,7 @@ extern "C" { #define XML_TOK_END_TAG 5 #define XML_TOK_DATA_CHARS 6 #define XML_TOK_DATA_NEWLINE 7 -#define XML_TOK_CDATA_SECTION 8 +#define XML_TOK_CDATA_SECT_OPEN 8 #define XML_TOK_ENTITY_REF 9 #define XML_TOK_CHAR_REF 10 /* numeric character reference */ @@ -85,25 +88,25 @@ extern "C" { #define XML_TOK_CLOSE_PAREN_PLUS 37 /* )+ */ #define XML_TOK_COMMA 38 - /* The following tokens is returned only by XmlAttributeValueTok */ +/* The following token is returned only by XmlAttributeValueTok */ #define XML_TOK_ATTRIBUTE_VALUE_S 39 -#define XML_N_STATES 2 +/* The following token is returned only by XmlCdataSectionTok */ +#define XML_TOK_CDATA_SECT_CLOSE 40 + +#define XML_N_STATES 3 #define XML_PROLOG_STATE 0 #define XML_CONTENT_STATE 1 +#define XML_CDATA_SECTION_STATE 2 #define XML_N_LITERAL_TYPES 2 #define XML_ATTRIBUTE_VALUE_LITERAL 0 #define XML_ENTITY_VALUE_LITERAL 1 -#define XML_N_INTERNAL_ENCODINGS 1 -#define XML_UTF8_ENCODING 0 -#if 0 -#define XML_UTF16_ENCODING 1 -#define XML_UCS4_ENCODING 2 -#endif - -#define XML_MAX_BYTES_PER_CHAR 4 +/* The size of the buffer passed to XmlUtf8Encode must be at least this. */ +#define XML_UTF8_ENCODE_MAX 4 +/* The size of the buffer passed to XmlUtf16Encode must be at least this. 
*/ +#define XML_UTF16_ENCODE_MAX 2 typedef struct position { /* first line and first column are 0 not 1 */ @@ -139,21 +142,26 @@ struct encoding { int (*getAtts)(const ENCODING *enc, const char *ptr, int attsMax, ATTRIBUTE *atts); int (*charRefNumber)(const ENCODING *enc, const char *ptr); + int (*predefinedEntityName)(const ENCODING *, const char *, const char *); void (*updatePosition)(const ENCODING *, const char *ptr, const char *end, POSITION *); int (*isPublicId)(const ENCODING *enc, const char *ptr, const char *end, const char **badPtr); - int (*encode)(const ENCODING *enc, - int charNum, - char *buf); - void (*convert[XML_N_INTERNAL_ENCODINGS])(const ENCODING *enc, - const char **fromP, - const char *fromLim, - char **toP, - const char *toLim); + void (*utf8Convert)(const ENCODING *enc, + const char **fromP, + const char *fromLim, + char **toP, + const char *toLim); + void (*utf16Convert)(const ENCODING *enc, + const char **fromP, + const char *fromLim, + unsigned short **toP, + const unsigned short *toLim); int minBytesPerChar; + char isUtf8; + char isUtf16; }; /* @@ -186,6 +194,9 @@ literals, comments and processing instructions. #define XmlContentTok(enc, ptr, end, nextTokPtr) \ XmlTok(enc, XML_CONTENT_STATE, ptr, end, nextTokPtr) +#define XmlCdataSectionTok(enc, ptr, end, nextTokPtr) \ + XmlTok(enc, XML_CDATA_SECTION_STATE, ptr, end, nextTokPtr) + /* This is used for performing a 2nd-level tokenization on the content of a literal that has already been returned by XmlTok. */ @@ -215,17 +226,20 @@ the content of a literal that has already been returned by XmlTok. 
*/ #define XmlCharRefNumber(enc, ptr) \ (((enc)->charRefNumber)(enc, ptr)) +#define XmlPredefinedEntityName(enc, ptr, end) \ + (((enc)->predefinedEntityName)(enc, ptr, end)) + #define XmlUpdatePosition(enc, ptr, end, pos) \ (((enc)->updatePosition)(enc, ptr, end, pos)) #define XmlIsPublicId(enc, ptr, end, badPtr) \ (((enc)->isPublicId)(enc, ptr, end, badPtr)) -#define XmlEncode(enc, ch, buf) \ - (((enc)->encode)(enc, ch, buf)) +#define XmlUtf8Convert(enc, fromP, fromLim, toP, toLim) \ + (((enc)->utf8Convert)(enc, fromP, fromLim, toP, toLim)) -#define XmlConvert(enc, targetEnc, fromP, fromLim, toP, toLim) \ - (((enc)->convert[targetEnc])(enc, fromP, fromLim, toP, toLim)) +#define XmlUtf16Convert(enc, fromP, fromLim, toP, toLim) \ + (((enc)->utf16Convert)(enc, fromP, fromLim, toP, toLim)) typedef struct { ENCODING initEnc; @@ -243,7 +257,17 @@ int XMLTOKAPI XmlParseXmlDecl(int isGeneralTextEntity, int *standalonePtr); int XMLTOKAPI XmlInitEncoding(INIT_ENCODING *, const ENCODING **, const char *name); -const ENCODING XMLTOKAPI *XmlGetInternalEncoding(int); +const ENCODING XMLTOKAPI *XmlGetUtf8InternalEncoding(); +const ENCODING XMLTOKAPI *XmlGetUtf16InternalEncoding(); +int XMLTOKAPI XmlUtf8Encode(int charNumber, char *buf); +int XMLTOKAPI XmlUtf16Encode(int charNumber, unsigned short *buf); + +int XMLTOKAPI XmlSizeOfUnknownEncoding(); +ENCODING XMLTOKAPI * +XmlInitUnknownEncoding(void *mem, + int *table, + int (*convert)(void *userData, const char *p), + void *userData); #ifdef __cplusplus } diff --git a/modules/xml/expat/xmltok/xmltok_impl.c b/modules/xml/expat/xmltok/xmltok_impl.c index 00475542ea04..513935ae9e02 100644 --- a/modules/xml/expat/xmltok/xmltok_impl.c +++ b/modules/xml/expat/xmltok/xmltok_impl.c @@ -56,7 +56,7 @@ Contributor(s): #define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \ case BT_NONASCII: \ - if (!IS_NAME_CHAR(enc, ptr, MINBPC)) { \ + if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \ *nextTokPtr = ptr; \ return XML_TOK_INVALID; \ } \ @@ -84,7 +84,7 
@@ Contributor(s): #define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \ case BT_NONASCII: \ - if (!IS_NMSTRT_CHAR(enc, ptr, MINBPC)) { \ + if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \ *nextTokPtr = ptr; \ return XML_TOK_INVALID; \ } \ @@ -293,15 +293,14 @@ int PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end, return XML_TOK_PARTIAL; } -/* ptr points to character following " */ - if (end - ptr < 9 * MINBPC) + /* CDATA[ */ + if (end - ptr < 6 * MINBPC) return XML_TOK_PARTIAL; for (i = 0; i < 6; i++, ptr += MINBPC) { if (!CHAR_MATCHES(enc, ptr, "CDATA["[i])) { @@ -309,22 +308,86 @@ int PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *e return XML_TOK_INVALID; } } - end -= 2 * MINBPC; - while (ptr != end) { - switch (BYTE_TYPE(enc, ptr)) { - INVALID_CASES(ptr, nextTokPtr) - case BT_RSQB: - if (CHAR_MATCHES(enc, ptr + MINBPC, ']') - && CHAR_MATCHES(enc, ptr + 2 * MINBPC, '>')) { - *nextTokPtr = ptr + 3 * MINBPC; - return XML_TOK_CDATA_SECTION; - } - /* fall through */ - default: - ptr += MINBPC; + *nextTokPtr = ptr; + return XML_TOK_CDATA_SECT_OPEN; +} + +static +int PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, const char *end, + const char **nextTokPtr) +{ + if (ptr == end) + return XML_TOK_NONE; +#if MINBPC > 1 + { + size_t n = end - ptr; + if (n & (MINBPC - 1)) { + n &= ~(MINBPC - 1); + if (n == 0) + return XML_TOK_PARTIAL; + end = ptr + n; } } - return XML_TOK_PARTIAL; +#endif + switch (BYTE_TYPE(enc, ptr)) { + case BT_RSQB: + ptr += MINBPC; + if (ptr == end) + return XML_TOK_PARTIAL; + if (!CHAR_MATCHES(enc, ptr, ']')) + break; + ptr += MINBPC; + if (ptr == end) + return XML_TOK_PARTIAL; + if (!CHAR_MATCHES(enc, ptr, '>')) { + ptr -= MINBPC; + break; + } + *nextTokPtr = ptr + MINBPC; + return XML_TOK_CDATA_SECT_CLOSE; + case BT_CR: + ptr += MINBPC; + if (ptr == end) + return XML_TOK_PARTIAL; + if (BYTE_TYPE(enc, ptr) == BT_LF) + ptr += MINBPC; + *nextTokPtr = ptr; + return XML_TOK_DATA_NEWLINE; + case 
BT_LF: + *nextTokPtr = ptr + MINBPC; + return XML_TOK_DATA_NEWLINE; + INVALID_CASES(ptr, nextTokPtr) + default: + ptr += MINBPC; + break; + } + while (ptr != end) { + switch (BYTE_TYPE(enc, ptr)) { +#define LEAD_CASE(n) \ + case BT_LEAD ## n: \ + if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \ + *nextTokPtr = ptr; \ + return XML_TOK_DATA_CHARS; \ + } \ + ptr += n; \ + break; + LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4) +#undef LEAD_CASE + case BT_NONXML: + case BT_MALFORM: + case BT_TRAIL: + case BT_CR: + case BT_LF: + case BT_RSQB: + *nextTokPtr = ptr; + return XML_TOK_DATA_CHARS; + default: + ptr += MINBPC; + break; + } + } + *nextTokPtr = ptr; + return XML_TOK_DATA_CHARS; } /* ptr points to character following "')) { ptr -= MINBPC; break; @@ -766,7 +847,7 @@ int PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end, if (ptr == end) return XML_TOK_PARTIAL; switch (BYTE_TYPE(enc, ptr)) { - CHECK_NMSTRT_CASES(end, ptr, end, nextTokPtr) + CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) case BT_S: case BT_LF: case BT_CR: case BT_PERCNT: *nextTokPtr = ptr; return XML_TOK_PERCENT; @@ -795,7 +876,7 @@ int PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end, if (ptr == end) return XML_TOK_PARTIAL; switch (BYTE_TYPE(enc, ptr)) { - CHECK_NMSTRT_CASES(end, ptr, end, nextTokPtr) + CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) default: *nextTokPtr = ptr; return XML_TOK_INVALID; @@ -944,7 +1025,7 @@ int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end, case BT_RPAR: ptr += MINBPC; if (ptr == end) - return XML_TOK_INVALID; + return XML_TOK_PARTIAL; switch (BYTE_TYPE(enc, ptr)) { case BT_AST: *nextTokPtr = ptr + MINBPC; @@ -1001,12 +1082,12 @@ int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end, ptr += MINBPC; break; case BT_NONASCII: - if (IS_NMSTRT_CHAR(enc, ptr, MINBPC)) { + if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { ptr += MINBPC; tok = XML_TOK_NAME; break; } - if (IS_NAME_CHAR(enc, ptr, 
MINBPC)) { + if (IS_NAME_CHAR_MINBPC(enc, ptr)) { ptr += MINBPC; tok = XML_TOK_NMTOKEN; break; @@ -1343,6 +1424,59 @@ int PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr) return checkCharRefNumber(result); } +static +int PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr, const char *end) +{ + switch (end - ptr) { + case 2 * MINBPC: + if (CHAR_MATCHES(enc, ptr + MINBPC, 't')) { + switch (BYTE_TO_ASCII(enc, ptr)) { + case 'l': + return '<'; + case 'g': + return '>'; + } + } + break; + case 3 * MINBPC: + if (CHAR_MATCHES(enc, ptr, 'a')) { + ptr += MINBPC; + if (CHAR_MATCHES(enc, ptr, 'm')) { + ptr += MINBPC; + if (CHAR_MATCHES(enc, ptr, 'p')) + return '&'; + } + } + break; + case 4 * MINBPC: + switch (BYTE_TO_ASCII(enc, ptr)) { + case 'q': + ptr += MINBPC; + if (CHAR_MATCHES(enc, ptr, 'u')) { + ptr += MINBPC; + if (CHAR_MATCHES(enc, ptr, 'o')) { + ptr += MINBPC; + if (CHAR_MATCHES(enc, ptr, 't')) + return '"'; + } + } + break; + case 'a': + ptr += MINBPC; + if (CHAR_MATCHES(enc, ptr, 'p')) { + ptr += MINBPC; + if (CHAR_MATCHES(enc, ptr, 'o')) { + ptr += MINBPC; + if (CHAR_MATCHES(enc, ptr, 's')) + return '\''; + } + } + break; + } + } + return 0; +} + static int PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2) { diff --git a/modules/xml/macbuild/XML.mcp.exp b/modules/xml/macbuild/XML.mcp.exp index ce85346a71b1..e29d4a2c5120 100644 --- a/modules/xml/macbuild/XML.mcp.exp +++ b/modules/xml/macbuild/XML.mcp.exp @@ -1,7 +1,7 @@ XML_ErrorString -XML_GetErrorByteIndex -XML_GetErrorColumnNumber -XML_GetErrorLineNumber +XML_GetCurrentLineNumber +XML_GetCurrentColumnNumber +XML_GetCurrentByteIndex XML_GetErrorCode XML_GetBuffer XML_ParseBuffer @@ -19,7 +19,6 @@ hashTableDestroy lookup XmlParseXmlDecl XmlInitEncoding -XmlGetInternalEncoding XmlPrologStateInit tokenizeXMLElement XMLDOM_PIHandler