зеркало из https://github.com/mozilla/gecko-dev.git
2nd try; new parser from james clark
This commit is contained in:
Родитель
105745e312
Коммит
c1ce4a8d42
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
The contents of this file are subject to the Mozilla Public License
|
||||
Version 1.0 (the "License"); you may not use this file except in
|
||||
compliance with the License. You may obtain a copy of the License at
|
||||
compliance with the License. You may obtain a copy of the License at
|
||||
http://www.mozilla.org/MPL/
|
||||
|
||||
Software distributed under the License is distributed on an "AS IS"
|
||||
|
@ -18,15 +18,22 @@ James Clark. All Rights Reserved.
|
|||
Contributor(s):
|
||||
*/
|
||||
|
||||
#include "xmldef.h"
|
||||
#include "hashtable.h"
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "xmldef.h"
|
||||
#include "hashtable.h"
|
||||
|
||||
#ifdef XML_UNICODE
|
||||
#define keycmp wcscmp
|
||||
#else
|
||||
#define keycmp strcmp
|
||||
#endif
|
||||
|
||||
#define INIT_SIZE 64
|
||||
|
||||
static
|
||||
unsigned long hash(const char *s)
|
||||
unsigned long hash(KEY s)
|
||||
{
|
||||
unsigned long h = 0;
|
||||
while (*s)
|
||||
|
@ -34,7 +41,7 @@ unsigned long hash(const char *s)
|
|||
return h;
|
||||
}
|
||||
|
||||
NAMED *lookup(HASH_TABLE *table, const char *name, size_t createSize)
|
||||
NAMED *lookup(HASH_TABLE *table, KEY name, size_t createSize)
|
||||
{
|
||||
size_t i;
|
||||
if (table->size == 0) {
|
||||
|
@ -52,7 +59,7 @@ NAMED *lookup(HASH_TABLE *table, const char *name, size_t createSize)
|
|||
for (i = h & (table->size - 1);
|
||||
table->v[i];
|
||||
i == 0 ? i = table->size - 1 : --i) {
|
||||
if (strcmp(name, table->v[i]->name) == 0)
|
||||
if (keycmp(name, table->v[i]->name) == 0)
|
||||
return table->v[i];
|
||||
}
|
||||
if (!createSize)
|
||||
|
|
|
@ -21,8 +21,14 @@ Contributor(s):
|
|||
|
||||
#include <stddef.h>
|
||||
|
||||
#ifdef XML_UNICODE
|
||||
typedef const wchar_t *KEY;
|
||||
#else
|
||||
typedef const char *KEY;
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
const char *name;
|
||||
KEY name;
|
||||
} NAMED;
|
||||
|
||||
typedef struct {
|
||||
|
@ -32,7 +38,7 @@ typedef struct {
|
|||
size_t usedLim;
|
||||
} HASH_TABLE;
|
||||
|
||||
NAMED *lookup(HASH_TABLE *table, const char *name, size_t createSize);
|
||||
NAMED *lookup(HASH_TABLE *table, KEY name, size_t createSize);
|
||||
void hashTableInit(HASH_TABLE *);
|
||||
void hashTableDestroy(HASH_TABLE *);
|
||||
|
||||
|
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -31,32 +31,199 @@ extern "C" {
|
|||
|
||||
typedef void *XML_Parser;
|
||||
|
||||
/* Constructs a new parser; encoding should be the name of the charset from
|
||||
the Content-Type header if the Content-Type is text/xml, or null otherwise. */
|
||||
#ifdef XML_UNICODE_WCHAR_T
|
||||
|
||||
XML_Parser XMLPARSEAPI
|
||||
XML_ParserCreate(const char *encoding);
|
||||
/* XML_UNICODE_WCHAR_T will work only if sizeof(wchar_t) == 2 and wchar_t
|
||||
uses Unicode. */
|
||||
/* Information is UTF-16 encoded as wchar_ts */
|
||||
|
||||
#ifndef XML_UNICODE
|
||||
#define XML_UNICODE
|
||||
#endif
|
||||
|
||||
#include <stddef.h>
|
||||
typedef wchar_t XML_Char;
|
||||
typedef wchar_t XML_LChar;
|
||||
|
||||
#else /* not XML_UNICODE_WCHAR_T */
|
||||
|
||||
#ifdef XML_UNICODE
|
||||
|
||||
/* Information is UTF-16 encoded as unsigned shorts */
|
||||
typedef unsigned short XML_Char;
|
||||
typedef char XML_LChar;
|
||||
|
||||
#else /* not XML_UNICODE */
|
||||
|
||||
/* Information is UTF-8 encoded. */
|
||||
typedef char XML_Char;
|
||||
typedef char XML_LChar;
|
||||
|
||||
/* atts is array of name/value pairs, terminated by NULL;
|
||||
names and values are '\0' terminated. */
|
||||
#endif /* not XML_UNICODE */
|
||||
|
||||
#endif /* not XML_UNICODE_WCHAR_T */
|
||||
|
||||
|
||||
/* Constructs a new parser; encoding is the encoding specified by the external
|
||||
protocol or null if there is none specified. */
|
||||
|
||||
XML_Parser XMLPARSEAPI
|
||||
XML_ParserCreate(const XML_Char *encoding);
|
||||
|
||||
|
||||
/* atts is array of name/value pairs, terminated by 0;
|
||||
names and values are 0 terminated. */
|
||||
|
||||
typedef void (*XML_StartElementHandler)(void *userData,
|
||||
const char *name,
|
||||
const char **atts);
|
||||
const XML_Char *name,
|
||||
const XML_Char **atts);
|
||||
|
||||
typedef void (*XML_EndElementHandler)(void *userData,
|
||||
const char *name);
|
||||
const XML_Char *name);
|
||||
|
||||
/* s is not 0 terminated. */
|
||||
typedef void (*XML_CharacterDataHandler)(void *userData,
|
||||
const char *s,
|
||||
const XML_Char *s,
|
||||
int len);
|
||||
|
||||
/* target and data are '\0' terminated */
|
||||
/* target and data are 0 terminated */
|
||||
typedef void (*XML_ProcessingInstructionHandler)(void *userData,
|
||||
const char *target,
|
||||
const char *data);
|
||||
const XML_Char *target,
|
||||
const XML_Char *data);
|
||||
|
||||
/* This is called for any characters in the XML document for
|
||||
which there is no applicable handler. This includes both
|
||||
characters that are part of markup which is of a kind that is
|
||||
not reported (comments, markup declarations), or characters
|
||||
that are part of a construct which could be reported but
|
||||
for which no handler has been supplied. The characters are passed
|
||||
exactly as they were in the XML document except that
|
||||
they will be encoded in UTF-8. Line boundaries are not normalized.
|
||||
Note that a byte order mark character is not passed to the default handler.
|
||||
If a default handler is set, internal entity references
|
||||
are not expanded. There are no guarantees about
|
||||
how characters are divided between calls to the default handler:
|
||||
for example, a comment might be split between multiple calls. */
|
||||
|
||||
typedef void (*XML_DefaultHandler)(void *userData,
|
||||
const XML_Char *s,
|
||||
int len);
|
||||
|
||||
/* This is called for a declaration of an unparsed (NDATA)
|
||||
entity. The base argument is whatever was set by XML_SetBase.
|
||||
The entityName, systemId and notationName arguments will never be null.
|
||||
The other arguments may be. */
|
||||
|
||||
typedef void (*XML_UnparsedEntityDeclHandler)(void *userData,
|
||||
const XML_Char *entityName,
|
||||
const XML_Char *base,
|
||||
const XML_Char *systemId,
|
||||
const XML_Char *publicId,
|
||||
const XML_Char *notationName);
|
||||
|
||||
/* This is called for a declaration of notation.
|
||||
The base argument is whatever was set by XML_SetBase.
|
||||
The notationName will never be null. The other arguments can be. */
|
||||
|
||||
typedef void (*XML_NotationDeclHandler)(void *userData,
|
||||
const XML_Char *notationName,
|
||||
const XML_Char *base,
|
||||
const XML_Char *systemId,
|
||||
const XML_Char *publicId);
|
||||
|
||||
/* This is called for a reference to an external parsed general entity.
|
||||
The referenced entity is not automatically parsed.
|
||||
The application can parse it immediately or later using
|
||||
XML_ExternalEntityParserCreate.
|
||||
The parser argument is the parser parsing the entity containing the reference;
|
||||
it can be passed as the parser argument to XML_ExternalEntityParserCreate.
|
||||
The systemId argument is the system identifier as specified in the entity declaration;
|
||||
it will not be null.
|
||||
The base argument is the system identifier that should be used as the base for
|
||||
resolving systemId if systemId was relative; this is set by XML_SetBase;
|
||||
it may be null.
|
||||
The publicId argument is the public identifier as specified in the entity declaration,
|
||||
or null if none was specified; the whitespace in the public identifier
|
||||
will have been normalized as required by the XML spec.
|
||||
The openEntityNames argument is a space-separated list of the names of the entities
|
||||
that are open for the parse of this entity (including the name of the referenced
|
||||
entity); this can be passed as the openEntityNames argument to
|
||||
XML_ExternalEntityParserCreate; openEntityNames is valid only until the handler
|
||||
returns, so if the referenced entity is to be parsed later, it must be copied.
|
||||
The handler should return 0 if processing should not continue because of
|
||||
a fatal error in the handling of the external entity.
|
||||
In this case the calling parser will return an XML_ERROR_EXTERNAL_ENTITY_HANDLING
|
||||
error.
|
||||
Note that unlike other handlers the first argument is the parser, not userData. */
|
||||
|
||||
typedef int (*XML_ExternalEntityRefHandler)(XML_Parser parser,
|
||||
const XML_Char *openEntityNames,
|
||||
const XML_Char *base,
|
||||
const XML_Char *systemId,
|
||||
const XML_Char *publicId);
|
||||
|
||||
/* This structure is filled in by the XML_UnknownEncodingHandler
|
||||
to provide information to the parser about encodings that are unknown
|
||||
to the parser.
|
||||
The map[b] member gives information about byte sequences
|
||||
whose first byte is b.
|
||||
If map[b] is c where c is >= 0, then b by itself encodes the Unicode scalar value c.
|
||||
If map[b] is -1, then the byte sequence is malformed.
|
||||
If map[b] is -n, where n >= 2, then b is the first byte of an n-byte
|
||||
sequence that encodes a single Unicode scalar value.
|
||||
The data member will be passed as the first argument to the convert function.
|
||||
The convert function is used to convert multibyte sequences;
|
||||
s will point to an n-byte sequence where map[(unsigned char)*s] == -n.
|
||||
The convert function must return the Unicode scalar value
|
||||
represented by this byte sequence or -1 if the byte sequence is malformed.
|
||||
The convert function may be null if the encoding is a single-byte encoding,
|
||||
that is if map[b] >= -1 for all bytes b.
|
||||
When the parser is finished with the encoding, then if release is not null,
|
||||
it will call release passing it the data member;
|
||||
once release has been called, the convert function will not be called again.
|
||||
|
||||
Expat places certain restrictions on the encodings that are supported
|
||||
using this mechanism.
|
||||
|
||||
1. Every ASCII character that can appear in a well-formed XML document,
|
||||
other than the characters
|
||||
|
||||
$@\^`{}~
|
||||
|
||||
must be represented by a single byte, and that byte must be the
|
||||
same byte that represents that character in ASCII.
|
||||
|
||||
2. No character may require more than 4 bytes to encode.
|
||||
|
||||
3. All characters encoded must have Unicode scalar values <= 0xFFFF,
|
||||
(ie characters that would be encoded by surrogates in UTF-16
|
||||
are not allowed). Note that this restriction doesn't apply to
|
||||
the built-in support for UTF-8 and UTF-16.
|
||||
|
||||
4. No Unicode character may be encoded by more than one distinct sequence
|
||||
of bytes. */
|
||||
|
||||
typedef struct {
|
||||
int map[256];
|
||||
void *data;
|
||||
int (*convert)(void *data, const char *s);
|
||||
void (*release)(void *data);
|
||||
} XML_Encoding;
|
||||
|
||||
/* This is called for an encoding that is unknown to the parser.
|
||||
The encodingHandlerData argument is that which was passed as the
|
||||
second argument to XML_SetUnknownEncodingHandler.
|
||||
The name argument gives the name of the encoding as specified in
|
||||
the encoding declaration.
|
||||
If the callback can provide information about the encoding,
|
||||
it must fill in the XML_Encoding structure, and return 1.
|
||||
Otherwise it must return 0.
|
||||
If info does not describe a suitable encoding,
|
||||
then the parser will return an XML_UNKNOWN_ENCODING error. */
|
||||
|
||||
typedef int (*XML_UnknownEncodingHandler)(void *encodingHandlerData,
|
||||
const XML_Char *name,
|
||||
XML_Encoding *info);
|
||||
|
||||
void XMLPARSEAPI
|
||||
XML_SetElementHandler(XML_Parser parser,
|
||||
|
@ -71,10 +238,62 @@ void XMLPARSEAPI
|
|||
XML_SetProcessingInstructionHandler(XML_Parser parser,
|
||||
XML_ProcessingInstructionHandler handler);
|
||||
|
||||
void XMLPARSEAPI
|
||||
XML_SetDefaultHandler(XML_Parser parser,
|
||||
XML_DefaultHandler handler);
|
||||
|
||||
void XMLPARSEAPI
|
||||
XML_SetUnparsedEntityDeclHandler(XML_Parser parser,
|
||||
XML_UnparsedEntityDeclHandler handler);
|
||||
|
||||
void XMLPARSEAPI
|
||||
XML_SetNotationDeclHandler(XML_Parser parser,
|
||||
XML_NotationDeclHandler handler);
|
||||
|
||||
void XMLPARSEAPI
|
||||
XML_SetExternalEntityRefHandler(XML_Parser parser,
|
||||
XML_ExternalEntityRefHandler handler);
|
||||
|
||||
void XMLPARSEAPI
|
||||
XML_SetUnknownEncodingHandler(XML_Parser parser,
|
||||
XML_UnknownEncodingHandler handler,
|
||||
void *encodingHandlerData);
|
||||
|
||||
/* This can be called within a handler for a start element, end element,
|
||||
processing instruction or character data. It causes the corresponding
|
||||
markup to be passed to the default handler.
|
||||
Within the expansion of an internal entity, nothing will be passed
|
||||
to the default handler, although this usually will not happen since
|
||||
setting a default handler inhibits expansion of internal entities. */
|
||||
void XMLPARSEAPI XML_DefaultCurrent(XML_Parser parser);
|
||||
|
||||
/* This value is passed as the userData argument to callbacks. */
|
||||
void XMLPARSEAPI
|
||||
XML_SetUserData(XML_Parser parser, void *userData);
|
||||
|
||||
/* Returns the last value set by XML_SetUserData or null. */
|
||||
#define XML_GetUserData(parser) (*(void **)(parser))
|
||||
|
||||
/* If this function is called, then the parser will be passed
|
||||
as the first argument to callbacks instead of userData.
|
||||
The userData will still be accessible using XML_GetUserData. */
|
||||
|
||||
void XMLPARSEAPI
|
||||
XML_UseParserAsHandlerArg(XML_Parser parser);
|
||||
|
||||
/* Sets the base to be used for resolving relative URIs in system identifiers in
|
||||
declarations. Resolving relative identifiers is left to the application:
|
||||
this value will be passed through as the base argument to the
|
||||
XML_ExternalEntityRefHandler, XML_NotationDeclHandler
|
||||
and XML_UnparsedEntityDeclHandler. The base argument will be copied.
|
||||
Returns zero if out of memory, non-zero otherwise. */
|
||||
|
||||
int XMLPARSEAPI
|
||||
XML_SetBase(XML_Parser parser, const XML_Char *base);
|
||||
|
||||
const XML_Char XMLPARSEAPI *
|
||||
XML_GetBase(XML_Parser parser);
|
||||
|
||||
/* Parses some input. Returns 0 if a fatal error is detected.
|
||||
The last call to XML_Parse must have isFinal true;
|
||||
len may be zero for this call (or any other). */
|
||||
|
@ -87,8 +306,20 @@ XML_GetBuffer(XML_Parser parser, int len);
|
|||
int XMLPARSEAPI
|
||||
XML_ParseBuffer(XML_Parser parser, int len, int isFinal);
|
||||
|
||||
/* If XML_Parser or XML_ParseEnd have returned 0, then XML_GetError*
|
||||
returns information about the error. */
|
||||
/* Creates an XML_Parser object that can parse an external general entity;
|
||||
openEntityNames is a space-separated list of the names of the entities that are open
|
||||
for the parse of this entity (including the name of this one);
|
||||
encoding is the externally specified encoding,
|
||||
or null if there is no externally specified encoding.
|
||||
This can be called at any point after the first call to an ExternalEntityRefHandler
|
||||
so long as the parser has not yet been freed.
|
||||
The new parser is completely independent and may safely be used in a separate thread.
|
||||
The handlers and userData are initialized from the parser argument.
|
||||
Returns 0 if out of memory. Otherwise returns a new XML_Parser object. */
|
||||
XML_Parser XMLPARSEAPI
|
||||
XML_ExternalEntityParserCreate(XML_Parser parser,
|
||||
const XML_Char *openEntityNames,
|
||||
const XML_Char *encoding);
|
||||
|
||||
enum XML_Error {
|
||||
XML_ERROR_NONE,
|
||||
|
@ -110,19 +341,39 @@ enum XML_Error {
|
|||
XML_ERROR_ATTRIBUTE_EXTERNAL_ENTITY_REF,
|
||||
XML_ERROR_MISPLACED_XML_PI,
|
||||
XML_ERROR_UNKNOWN_ENCODING,
|
||||
XML_ERROR_INCORRECT_ENCODING
|
||||
XML_ERROR_INCORRECT_ENCODING,
|
||||
XML_ERROR_UNCLOSED_CDATA_SECTION,
|
||||
XML_ERROR_EXTERNAL_ENTITY_HANDLING
|
||||
};
|
||||
|
||||
int XMLPARSEAPI XML_GetErrorCode(XML_Parser parser);
|
||||
int XMLPARSEAPI XML_GetErrorLineNumber(XML_Parser parser);
|
||||
int XMLPARSEAPI XML_GetErrorColumnNumber(XML_Parser parser);
|
||||
long XMLPARSEAPI XML_GetErrorByteIndex(XML_Parser parser);
|
||||
/* If XML_Parse or XML_ParseBuffer have returned 0, then XML_GetErrorCode
|
||||
returns information about the error. */
|
||||
|
||||
enum XML_Error XMLPARSEAPI XML_GetErrorCode(XML_Parser parser);
|
||||
|
||||
/* These functions return information about the current parse location.
|
||||
They may be called when XML_Parse or XML_ParseBuffer return 0;
|
||||
in this case the location is the location of the character at which
|
||||
the error was detected.
|
||||
They may also be called from any other callback called to report
|
||||
some parse event; in this case the location is the location of the first
|
||||
of the sequence of characters that generated the event. */
|
||||
|
||||
int XMLPARSEAPI XML_GetCurrentLineNumber(XML_Parser parser);
|
||||
int XMLPARSEAPI XML_GetCurrentColumnNumber(XML_Parser parser);
|
||||
long XMLPARSEAPI XML_GetCurrentByteIndex(XML_Parser parser);
|
||||
|
||||
/* For backwards compatibility with previous versions. */
|
||||
#define XML_GetErrorLineNumber XML_GetCurrentLineNumber
|
||||
#define XML_GetErrorColumnNumber XML_GetCurrentColumnNumber
|
||||
#define XML_GetErrorByteIndex XML_GetCurrentByteIndex
|
||||
|
||||
/* Frees memory used by the parser. */
|
||||
void XMLPARSEAPI
|
||||
XML_ParserFree(XML_Parser parser);
|
||||
|
||||
const char XMLPARSEAPI *
|
||||
XML_ErrorString(int code);
|
||||
/* Returns a string describing the error. */
|
||||
const XML_LChar XMLPARSEAPI *XML_ErrorString(int code);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
@ -594,7 +594,7 @@ int notation4(PROLOG_STATE *state,
|
|||
return XML_ROLE_NOTATION_SYSTEM_ID;
|
||||
case XML_TOK_DECL_CLOSE:
|
||||
state->handler = internalSubset;
|
||||
return XML_ROLE_NONE;
|
||||
return XML_ROLE_NOTATION_NO_SYSTEM_ID;
|
||||
}
|
||||
return syntaxError(state);
|
||||
}
|
||||
|
|
|
@ -44,6 +44,7 @@ enum {
|
|||
XML_ROLE_ENTITY_NOTATION_NAME,
|
||||
XML_ROLE_NOTATION_NAME,
|
||||
XML_ROLE_NOTATION_SYSTEM_ID,
|
||||
XML_ROLE_NOTATION_NO_SYSTEM_ID,
|
||||
XML_ROLE_NOTATION_PUBLIC_ID,
|
||||
XML_ROLE_ATTRIBUTE_NAME,
|
||||
XML_ROLE_ATTRIBUTE_TYPE_CDATA,
|
||||
|
|
|
@ -23,7 +23,7 @@ Contributor(s):
|
|||
#include "nametab.h"
|
||||
|
||||
#define VTABLE1 \
|
||||
{ PREFIX(prologTok), PREFIX(contentTok) }, \
|
||||
{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok) }, \
|
||||
{ PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
|
||||
PREFIX(sameName), \
|
||||
PREFIX(nameMatchesAscii), \
|
||||
|
@ -31,14 +31,11 @@ Contributor(s):
|
|||
PREFIX(skipS), \
|
||||
PREFIX(getAtts), \
|
||||
PREFIX(charRefNumber), \
|
||||
PREFIX(predefinedEntityName), \
|
||||
PREFIX(updatePosition), \
|
||||
PREFIX(isPublicId)
|
||||
|
||||
#define VTABLE2 \
|
||||
PREFIX(encode), \
|
||||
{ PREFIX(toUtf8) }
|
||||
|
||||
#define VTABLE VTABLE1, VTABLE2
|
||||
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
|
||||
|
||||
#define UCS2_GET_NAMING(pages, hi, lo) \
|
||||
(namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F)))
|
||||
|
@ -81,11 +78,79 @@ We need 8 bits to index into pages, 3 bits to add to that index and
|
|||
|
||||
#define UTF8_INVALID4(p) ((*p) == 0xF4 && ((p)[1] & 0x30) != 0)
|
||||
|
||||
static
|
||||
int isNever(const ENCODING *enc, const char *p)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static
|
||||
int utf8_isName2(const ENCODING *enc, const char *p)
|
||||
{
|
||||
return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
|
||||
}
|
||||
|
||||
static
|
||||
int utf8_isName3(const ENCODING *enc, const char *p)
|
||||
{
|
||||
return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
|
||||
}
|
||||
|
||||
#define utf8_isName4 isNever
|
||||
|
||||
static
|
||||
int utf8_isNmstrt2(const ENCODING *enc, const char *p)
|
||||
{
|
||||
return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
|
||||
}
|
||||
|
||||
static
|
||||
int utf8_isNmstrt3(const ENCODING *enc, const char *p)
|
||||
{
|
||||
return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
|
||||
}
|
||||
|
||||
#define utf8_isNmstrt4 isNever
|
||||
|
||||
#define utf8_isInvalid2 isNever
|
||||
|
||||
static
|
||||
int utf8_isInvalid3(const ENCODING *enc, const char *p)
|
||||
{
|
||||
return UTF8_INVALID3((const unsigned char *)p);
|
||||
}
|
||||
|
||||
static
|
||||
int utf8_isInvalid4(const ENCODING *enc, const char *p)
|
||||
{
|
||||
return UTF8_INVALID4((const unsigned char *)p);
|
||||
}
|
||||
|
||||
struct normal_encoding {
|
||||
ENCODING enc;
|
||||
unsigned char type[256];
|
||||
int (*isName2)(const ENCODING *, const char *);
|
||||
int (*isName3)(const ENCODING *, const char *);
|
||||
int (*isName4)(const ENCODING *, const char *);
|
||||
int (*isNmstrt2)(const ENCODING *, const char *);
|
||||
int (*isNmstrt3)(const ENCODING *, const char *);
|
||||
int (*isNmstrt4)(const ENCODING *, const char *);
|
||||
int (*isInvalid2)(const ENCODING *, const char *);
|
||||
int (*isInvalid3)(const ENCODING *, const char *);
|
||||
int (*isInvalid4)(const ENCODING *, const char *);
|
||||
};
|
||||
|
||||
/* Expands to the nine predicate initializers of struct normal_encoding, in
   declaration order, by pasting the prefix E onto each member name
   (e.g. NORMAL_VTABLE(utf8_) yields utf8_isName2, ..., utf8_isInvalid4). */
#define NORMAL_VTABLE(E) \
  E ## isName2, \
  E ## isName3, \
  E ## isName4, \
  E ## isNmstrt2, \
  E ## isNmstrt3, \
  E ## isNmstrt4, \
  E ## isInvalid2, \
  E ## isInvalid3, \
  E ## isInvalid4
|
||||
|
||||
static int checkCharRefNumber(int);
|
||||
|
||||
#include "xmltok_impl.h"
|
||||
|
@ -95,12 +160,16 @@ static int checkCharRefNumber(int);
|
|||
#define BYTE_TYPE(enc, p) \
|
||||
(((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
|
||||
#define BYTE_TO_ASCII(enc, p) (*p)
|
||||
#define IS_NAME_CHAR(enc, p, n) UTF8_GET_NAMING(namePages, p, n)
|
||||
#define IS_NMSTRT_CHAR(enc, p, n) UTF8_GET_NAMING(nmstrtPages, p, n)
|
||||
|
||||
#define IS_NAME_CHAR(enc, p, n) \
|
||||
(((const struct normal_encoding *)(enc))->isName ## n(enc, p))
|
||||
#define IS_NMSTRT_CHAR(enc, p, n) \
|
||||
(((const struct normal_encoding *)(enc))->isNmstrt ## n(enc, p))
|
||||
#define IS_INVALID_CHAR(enc, p, n) \
|
||||
((n) == 3 \
|
||||
? UTF8_INVALID3((const unsigned char *)(p)) \
|
||||
: ((n) == 4 ? UTF8_INVALID4((const unsigned char *)(p)) : 0))
|
||||
(((const struct normal_encoding *)(enc))->isInvalid ## n(enc, p))
|
||||
|
||||
#define IS_NAME_CHAR_MINBPC(enc, p) (0)
|
||||
#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
|
||||
|
||||
/* c is an ASCII character */
|
||||
#define CHAR_MATCHES(enc, p, c) (*(p) == c)
|
||||
|
@ -113,51 +182,18 @@ static int checkCharRefNumber(int);
|
|||
#undef BYTE_TO_ASCII
|
||||
#undef CHAR_MATCHES
|
||||
#undef IS_NAME_CHAR
|
||||
#undef IS_NAME_CHAR_MINBPC
|
||||
#undef IS_NMSTRT_CHAR
|
||||
#undef IS_NMSTRT_CHAR_MINBPC
|
||||
#undef IS_INVALID_CHAR
|
||||
|
||||
enum {
|
||||
/* cvalN is value of masked first byte of N byte sequence */
|
||||
cval1 = 0x00,
|
||||
cval2 = 0xc0,
|
||||
cval3 = 0xe0,
|
||||
cval4 = 0xf0,
|
||||
/* minN is minimum legal resulting value for N byte sequence */
|
||||
min2 = 0x80,
|
||||
min3 = 0x800,
|
||||
min4 = 0x10000
|
||||
enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
|
||||
UTF8_cval1 = 0x00,
|
||||
UTF8_cval2 = 0xc0,
|
||||
UTF8_cval3 = 0xe0,
|
||||
UTF8_cval4 = 0xf0
|
||||
};
|
||||
|
||||
static
|
||||
int utf8_encode(const ENCODING *enc, int c, char *buf)
|
||||
{
|
||||
if (c < 0)
|
||||
return 0;
|
||||
if (c < min2) {
|
||||
buf[0] = (c | cval1);
|
||||
return 1;
|
||||
}
|
||||
if (c < min3) {
|
||||
buf[0] = ((c >> 6) | cval2);
|
||||
buf[1] = ((c & 0x3f) | 0x80);
|
||||
return 2;
|
||||
}
|
||||
if (c < min4) {
|
||||
buf[0] = ((c >> 12) | cval3);
|
||||
buf[1] = (((c >> 6) & 0x3f) | 0x80);
|
||||
buf[2] = ((c & 0x3f) | 0x80);
|
||||
return 3;
|
||||
}
|
||||
if (c < 0x110000) {
|
||||
buf[0] = ((c >> 18) | cval4);
|
||||
buf[1] = (((c >> 12) & 0x3f) | 0x80);
|
||||
buf[2] = (((c >> 6) & 0x3f) | 0x80);
|
||||
buf[3] = ((c & 0x3f) | 0x80);
|
||||
return 4;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static
|
||||
void utf8_toUtf8(const ENCODING *enc,
|
||||
const char **fromP, const char *fromLim,
|
||||
|
@ -177,34 +213,63 @@ void utf8_toUtf8(const ENCODING *enc,
|
|||
*toP = to;
|
||||
}
|
||||
|
||||
static
|
||||
void utf8_toUtf16(const ENCODING *enc,
|
||||
const char **fromP, const char *fromLim,
|
||||
unsigned short **toP, const unsigned short *toLim)
|
||||
{
|
||||
unsigned short *to = *toP;
|
||||
const char *from = *fromP;
|
||||
while (from != fromLim && to != toLim) {
|
||||
switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
|
||||
case BT_LEAD2:
|
||||
*to++ = ((from[0] & 0x1f) << 6) | (from[1] & 0x3f);
|
||||
from += 2;
|
||||
break;
|
||||
case BT_LEAD3:
|
||||
*to++ = ((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f);
|
||||
from += 3;
|
||||
break;
|
||||
case BT_LEAD4:
|
||||
{
|
||||
unsigned long n;
|
||||
if (to + 1 == toLim)
|
||||
break;
|
||||
n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
|
||||
n -= 0x10000;
|
||||
to[0] = (unsigned short)((n >> 10) | 0xD800);
|
||||
to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
|
||||
to += 2;
|
||||
from += 4;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
*to++ = *from++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
*fromP = from;
|
||||
*toP = to;
|
||||
}
|
||||
|
||||
static const struct normal_encoding utf8_encoding = {
|
||||
{ VTABLE1, utf8_encode, { utf8_toUtf8 }, 1 },
|
||||
{ VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
|
||||
{
|
||||
#include "asciitab.h"
|
||||
#include "utf8tab.h"
|
||||
}
|
||||
},
|
||||
NORMAL_VTABLE(utf8_)
|
||||
};
|
||||
|
||||
static const struct normal_encoding internal_utf8_encoding = {
|
||||
{ VTABLE1, utf8_encode, { utf8_toUtf8 }, 1 },
|
||||
{ VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
|
||||
{
|
||||
#include "iasciitab.h"
|
||||
#include "utf8tab.h"
|
||||
}
|
||||
},
|
||||
NORMAL_VTABLE(utf8_)
|
||||
};
|
||||
|
||||
static
|
||||
int latin1_encode(const ENCODING *enc, int c, char *buf)
|
||||
{
|
||||
if (c < 0)
|
||||
return 0;
|
||||
if (c <= 0xFF) {
|
||||
buf[0] = (char)c;
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static
|
||||
void latin1_toUtf8(const ENCODING *enc,
|
||||
const char **fromP, const char *fromLim,
|
||||
|
@ -218,7 +283,7 @@ void latin1_toUtf8(const ENCODING *enc,
|
|||
if (c & 0x80) {
|
||||
if (toLim - *toP < 2)
|
||||
break;
|
||||
*(*toP)++ = ((c >> 6) | cval2);
|
||||
*(*toP)++ = ((c >> 6) | UTF8_cval2);
|
||||
*(*toP)++ = ((c & 0x3f) | 0x80);
|
||||
(*fromP)++;
|
||||
}
|
||||
|
@ -230,15 +295,39 @@ void latin1_toUtf8(const ENCODING *enc,
|
|||
}
|
||||
}
|
||||
|
||||
static
|
||||
void latin1_toUtf16(const ENCODING *enc,
|
||||
const char **fromP, const char *fromLim,
|
||||
unsigned short **toP, const unsigned short *toLim)
|
||||
{
|
||||
while (*fromP != fromLim && *toP != toLim)
|
||||
*(*toP)++ = (unsigned char)*(*fromP)++;
|
||||
}
|
||||
|
||||
static const struct normal_encoding latin1_encoding = {
|
||||
{ VTABLE1, latin1_encode, { latin1_toUtf8 }, 1 },
|
||||
{ VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
|
||||
{
|
||||
#include "asciitab.h"
|
||||
#include "latin1tab.h"
|
||||
}
|
||||
};
|
||||
|
||||
#define latin1tab (latin1_encoding.type)
|
||||
static
|
||||
void ascii_toUtf8(const ENCODING *enc,
|
||||
const char **fromP, const char *fromLim,
|
||||
char **toP, const char *toLim)
|
||||
{
|
||||
while (*fromP != fromLim && *toP != toLim)
|
||||
*(*toP)++ = *(*fromP)++;
|
||||
}
|
||||
|
||||
static const struct normal_encoding ascii_encoding = {
|
||||
{ VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
|
||||
{
|
||||
#include "asciitab.h"
|
||||
/* BT_NONXML == 0 */
|
||||
}
|
||||
};
|
||||
|
||||
#undef PREFIX
|
||||
|
||||
|
@ -260,25 +349,6 @@ static int unicode_byte_type(char hi, char lo)
|
|||
return BT_NONASCII;
|
||||
}
|
||||
|
||||
#define DEFINE_UTF16_ENCODE \
|
||||
static \
|
||||
int PREFIX(encode)(const ENCODING *enc, int charNum, char *buf) \
|
||||
{ \
|
||||
if (charNum < 0) \
|
||||
return 0; \
|
||||
if (charNum < 0x10000) { \
|
||||
SET2(buf, charNum); \
|
||||
return 2; \
|
||||
} \
|
||||
if (charNum < 0x110000) { \
|
||||
charNum -= 0x10000; \
|
||||
SET2(buf, (charNum >> 10) + 0xD800); \
|
||||
SET2(buf + 2, (charNum & 0x3FF) + 0xDC00); \
|
||||
return 4; \
|
||||
} \
|
||||
return 0; \
|
||||
}
|
||||
|
||||
#define DEFINE_UTF16_TO_UTF8 \
|
||||
static \
|
||||
void PREFIX(toUtf8)(const ENCODING *enc, \
|
||||
|
@ -308,7 +378,7 @@ void PREFIX(toUtf8)(const ENCODING *enc, \
|
|||
*fromP = from; \
|
||||
return; \
|
||||
} \
|
||||
*(*toP)++ = ((lo >> 6) | (hi << 2) | cval2); \
|
||||
*(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \
|
||||
*(*toP)++ = ((lo & 0x3f) | 0x80); \
|
||||
break; \
|
||||
default: \
|
||||
|
@ -317,7 +387,7 @@ void PREFIX(toUtf8)(const ENCODING *enc, \
|
|||
return; \
|
||||
} \
|
||||
/* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
|
||||
*(*toP)++ = ((hi >> 4) | cval3); \
|
||||
*(*toP)++ = ((hi >> 4) | UTF8_cval3); \
|
||||
*(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
|
||||
*(*toP)++ = ((lo & 0x3f) | 0x80); \
|
||||
break; \
|
||||
|
@ -327,7 +397,7 @@ void PREFIX(toUtf8)(const ENCODING *enc, \
|
|||
return; \
|
||||
} \
|
||||
plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
|
||||
*(*toP)++ = ((plane >> 2) | cval4); \
|
||||
*(*toP)++ = ((plane >> 2) | UTF8_cval4); \
|
||||
*(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
|
||||
from += 2; \
|
||||
lo2 = GET_LO(from); \
|
||||
|
@ -342,15 +412,33 @@ void PREFIX(toUtf8)(const ENCODING *enc, \
|
|||
*fromP = from; \
|
||||
}
|
||||
|
||||
/* Defines PREFIX(toUtf16): copies 2-byte units from the input to unsigned
   short output, assembling each unit with the GET_HI / GET_LO accessors in
   effect at the point of expansion.  When the input is longer than the
   output can hold and its final unit has a high byte in the 0xD8..0xDF
   range, that unit is dropped from this pass so that only the first half
   of a surrogate pair is not copied. */
#define DEFINE_UTF16_TO_UTF16 \
static \
void PREFIX(toUtf16)(const ENCODING *enc, \
                     const char **fromP, const char *fromLim, \
                     unsigned short **toP, const unsigned short *toLim) \
{ \
  /* Avoid copying first half only of surrogate */ \
  if (fromLim - *fromP > ((toLim - *toP) << 1) \
      && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
    fromLim -= 2; \
  while (*fromP != fromLim && *toP != toLim) { \
    *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
    *fromP += 2; \
  } \
}
|
||||
|
||||
#define PREFIX(ident) little2_ ## ident
|
||||
#define MINBPC 2
|
||||
#define BYTE_TYPE(enc, p) \
|
||||
((p)[1] == 0 ? latin1tab[(unsigned char)*(p)] : unicode_byte_type((p)[1], (p)[0]))
|
||||
((p)[1] == 0 \
|
||||
? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
|
||||
: unicode_byte_type((p)[1], (p)[0]))
|
||||
#define BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
|
||||
#define CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
|
||||
#define IS_NAME_CHAR(enc, p, n) \
|
||||
#define IS_NAME_CHAR(enc, p, n) (0)
|
||||
#define IS_NAME_CHAR_MINBPC(enc, p) \
|
||||
UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
|
||||
#define IS_NMSTRT_CHAR(enc, p, n) \
|
||||
#define IS_NMSTRT_CHAR(enc, p, n) (0)
|
||||
#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
|
||||
UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
|
||||
|
||||
#include "xmltok_impl.c"
|
||||
|
@ -360,8 +448,8 @@ void PREFIX(toUtf8)(const ENCODING *enc, \
|
|||
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
|
||||
#define GET_HI(ptr) ((unsigned char)(ptr)[1])
|
||||
|
||||
DEFINE_UTF16_ENCODE
|
||||
DEFINE_UTF16_TO_UTF8
|
||||
DEFINE_UTF16_TO_UTF16
|
||||
|
||||
#undef SET2
|
||||
#undef GET_LO
|
||||
|
@ -371,10 +459,32 @@ DEFINE_UTF16_TO_UTF8
|
|||
#undef BYTE_TO_ASCII
|
||||
#undef CHAR_MATCHES
|
||||
#undef IS_NAME_CHAR
|
||||
#undef IS_NAME_CHAR_MINBPC
|
||||
#undef IS_NMSTRT_CHAR
|
||||
#undef IS_NMSTRT_CHAR_MINBPC
|
||||
#undef IS_INVALID_CHAR
|
||||
|
||||
static const struct encoding little2_encoding = { VTABLE, 2 };
|
||||
static const struct normal_encoding little2_encoding = {
|
||||
{ VTABLE, 2, 0,
|
||||
#if BYTE_ORDER == 12
|
||||
1
|
||||
#else
|
||||
0
|
||||
#endif
|
||||
},
|
||||
#include "asciitab.h"
|
||||
#include "latin1tab.h"
|
||||
};
|
||||
|
||||
#if BYTE_ORDER != 21
|
||||
|
||||
static const struct normal_encoding internal_little2_encoding = {
|
||||
{ VTABLE, 2, 0, 1 },
|
||||
#include "iasciitab.h"
|
||||
#include "latin1tab.h"
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
#undef PREFIX
|
||||
|
||||
|
@ -382,12 +492,16 @@ static const struct encoding little2_encoding = { VTABLE, 2 };
|
|||
#define MINBPC 2
|
||||
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
|
||||
#define BYTE_TYPE(enc, p) \
|
||||
((p)[0] == 0 ? latin1tab[(unsigned char)(p)[1]] : unicode_byte_type((p)[0], (p)[1]))
|
||||
((p)[0] == 0 \
|
||||
? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
|
||||
: unicode_byte_type((p)[0], (p)[1]))
|
||||
#define BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
|
||||
#define CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
|
||||
#define IS_NAME_CHAR(enc, p, n) \
|
||||
#define IS_NAME_CHAR(enc, p, n) 0
|
||||
#define IS_NAME_CHAR_MINBPC(enc, p) \
|
||||
UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
|
||||
#define IS_NMSTRT_CHAR(enc, p, n) \
|
||||
#define IS_NMSTRT_CHAR(enc, p, n) (0)
|
||||
#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
|
||||
UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
|
||||
|
||||
#include "xmltok_impl.c"
|
||||
|
@ -397,8 +511,8 @@ static const struct encoding little2_encoding = { VTABLE, 2 };
|
|||
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
|
||||
#define GET_HI(ptr) ((unsigned char)(ptr)[0])
|
||||
|
||||
DEFINE_UTF16_ENCODE
|
||||
DEFINE_UTF16_TO_UTF8
|
||||
DEFINE_UTF16_TO_UTF16
|
||||
|
||||
#undef SET2
|
||||
#undef GET_LO
|
||||
|
@ -408,10 +522,32 @@ DEFINE_UTF16_TO_UTF8
|
|||
#undef BYTE_TO_ASCII
|
||||
#undef CHAR_MATCHES
|
||||
#undef IS_NAME_CHAR
|
||||
#undef IS_NAME_CHAR_MINBPC
|
||||
#undef IS_NMSTRT_CHAR
|
||||
#undef IS_NMSTRT_CHAR_MINBPC
|
||||
#undef IS_INVALID_CHAR
|
||||
|
||||
static const struct encoding big2_encoding = { VTABLE, 2 };
|
||||
static const struct normal_encoding big2_encoding = {
|
||||
{ VTABLE, 2, 0,
|
||||
#if BYTE_ORDER == 21
|
||||
1
|
||||
#else
|
||||
0
|
||||
#endif
|
||||
},
|
||||
#include "asciitab.h"
|
||||
#include "latin1tab.h"
|
||||
};
|
||||
|
||||
#if BYTE_ORDER != 12
|
||||
|
||||
static const struct normal_encoding internal_big2_encoding = {
|
||||
{ VTABLE, 2, 0, 1 },
|
||||
#include "iasciitab.h"
|
||||
#include "latin1tab.h"
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
#undef PREFIX
|
||||
|
||||
|
@ -454,18 +590,18 @@ int initScan(const ENCODING *enc, int state, const char *ptr, const char *end,
|
|||
else {
|
||||
switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
|
||||
case 0x003C:
|
||||
*encPtr = &big2_encoding;
|
||||
*encPtr = &big2_encoding.enc;
|
||||
return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
|
||||
case 0xFEFF:
|
||||
*nextTokPtr = ptr + 2;
|
||||
*encPtr = &big2_encoding;
|
||||
*encPtr = &big2_encoding.enc;
|
||||
return XML_TOK_BOM;
|
||||
case 0x3C00:
|
||||
*encPtr = &little2_encoding;
|
||||
*encPtr = &little2_encoding.enc;
|
||||
return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
|
||||
case 0xFFFE:
|
||||
*nextTokPtr = ptr + 2;
|
||||
*encPtr = &little2_encoding;
|
||||
*encPtr = &little2_encoding.enc;
|
||||
return XML_TOK_BOM;
|
||||
}
|
||||
}
|
||||
|
@ -494,13 +630,21 @@ void initUpdatePosition(const ENCODING *enc, const char *ptr,
|
|||
normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
|
||||
}
|
||||
|
||||
const ENCODING *XmlGetInternalEncoding(int e)
|
||||
const ENCODING *XmlGetUtf8InternalEncoding()
|
||||
{
|
||||
switch (e) {
|
||||
case XML_UTF8_ENCODING:
|
||||
return &internal_utf8_encoding.enc;
|
||||
}
|
||||
return 0;
|
||||
return &internal_utf8_encoding.enc;
|
||||
}
|
||||
|
||||
const ENCODING *XmlGetUtf16InternalEncoding()
|
||||
{
|
||||
#if BYTE_ORDER == 12
|
||||
return &internal_little2_encoding.enc;
|
||||
#elif BYTE_ORDER == 21
|
||||
return &internal_big2_encoding.enc;
|
||||
#else
|
||||
const short n = 1;
|
||||
return *(const char *)&n ? &internal_little2_encoding.enc : &internal_big2_encoding.enc;
|
||||
#endif
|
||||
}
|
||||
|
||||
int XmlInitEncoding(INIT_ENCODING *p, const ENCODING **encPtr, const char *name)
|
||||
|
@ -514,6 +658,10 @@ int XmlInitEncoding(INIT_ENCODING *p, const ENCODING **encPtr, const char *name)
|
|||
*encPtr = &utf8_encoding.enc;
|
||||
return 1;
|
||||
}
|
||||
if (streqci(name, "US-ASCII")) {
|
||||
*encPtr = &ascii_encoding.enc;
|
||||
return 1;
|
||||
}
|
||||
if (!streqci(name, "UTF-16"))
|
||||
return 0;
|
||||
}
|
||||
|
@ -531,7 +679,7 @@ int toAscii(const ENCODING *enc, const char *ptr, const char *end)
|
|||
{
|
||||
char buf[1];
|
||||
char *p = buf;
|
||||
XmlConvert(enc, XML_UTF8_ENCODING, &ptr, end, &p, p + 1);
|
||||
XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
|
||||
if (p == buf)
|
||||
return -1;
|
||||
else
|
||||
|
@ -641,7 +789,7 @@ const ENCODING *findEncoding(const ENCODING *enc, const char *ptr, const char *e
|
|||
char buf[ENCODING_MAX];
|
||||
char *p = buf;
|
||||
int i;
|
||||
XmlConvert(enc, XML_UTF8_ENCODING, &ptr, end, &p, p + ENCODING_MAX - 1);
|
||||
XmlUtf8Convert(enc, &ptr, end, &p, p + ENCODING_MAX - 1);
|
||||
if (ptr != end)
|
||||
return 0;
|
||||
*p = 0;
|
||||
|
@ -653,11 +801,13 @@ const ENCODING *findEncoding(const ENCODING *enc, const char *ptr, const char *e
|
|||
return &utf8_encoding.enc;
|
||||
if (streqci(buf, "ISO-8859-1"))
|
||||
return &latin1_encoding.enc;
|
||||
if (streqci(buf, "US-ASCII"))
|
||||
return &ascii_encoding.enc;
|
||||
if (streqci(buf, "UTF-16")) {
|
||||
static const unsigned short n = 1;
|
||||
if (enc->minBytesPerChar == 2)
|
||||
return enc;
|
||||
return &big2_encoding;
|
||||
return &big2_encoding.enc;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
@ -757,3 +907,229 @@ int checkCharRefNumber(int result)
|
|||
return result;
|
||||
}
|
||||
|
||||
int XmlUtf8Encode(int c, char *buf)
|
||||
{
|
||||
enum {
|
||||
/* minN is minimum legal resulting value for N byte sequence */
|
||||
min2 = 0x80,
|
||||
min3 = 0x800,
|
||||
min4 = 0x10000
|
||||
};
|
||||
|
||||
if (c < 0)
|
||||
return 0;
|
||||
if (c < min2) {
|
||||
buf[0] = (c | UTF8_cval1);
|
||||
return 1;
|
||||
}
|
||||
if (c < min3) {
|
||||
buf[0] = ((c >> 6) | UTF8_cval2);
|
||||
buf[1] = ((c & 0x3f) | 0x80);
|
||||
return 2;
|
||||
}
|
||||
if (c < min4) {
|
||||
buf[0] = ((c >> 12) | UTF8_cval3);
|
||||
buf[1] = (((c >> 6) & 0x3f) | 0x80);
|
||||
buf[2] = ((c & 0x3f) | 0x80);
|
||||
return 3;
|
||||
}
|
||||
if (c < 0x110000) {
|
||||
buf[0] = ((c >> 18) | UTF8_cval4);
|
||||
buf[1] = (((c >> 12) & 0x3f) | 0x80);
|
||||
buf[2] = (((c >> 6) & 0x3f) | 0x80);
|
||||
buf[3] = ((c & 0x3f) | 0x80);
|
||||
return 4;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Writes the UTF-16 encoding of character number charNum into buf and
   returns the number of code units written: 1 for values below 0x10000,
   2 (a surrogate pair) for values below 0x110000.  Returns 0, writing
   nothing, for negative or larger values. */
int XmlUtf16Encode(int charNum, unsigned short *buf)
{
  int offset;

  if (charNum < 0 || charNum >= 0x110000)
    return 0;
  if (charNum < 0x10000) {
    buf[0] = charNum;
    return 1;
  }
  /* Split the 20-bit offset above 0x10000 into two 10-bit halves. */
  offset = charNum - 0x10000;
  buf[0] = (offset >> 10) + 0xD800;
  buf[1] = (offset & 0x3FF) + 0xDC00;
  return 2;
}
|
||||
|
||||
struct unknown_encoding {
|
||||
struct normal_encoding normal;
|
||||
int (*convert)(void *userData, const char *p);
|
||||
void *userData;
|
||||
unsigned short utf16[256];
|
||||
char utf8[256][4];
|
||||
};
|
||||
|
||||
int XmlSizeOfUnknownEncoding()
|
||||
{
|
||||
return sizeof(struct unknown_encoding);
|
||||
}
|
||||
|
||||
static
|
||||
int unknown_isName(const ENCODING *enc, const char *p)
|
||||
{
|
||||
int c = ((const struct unknown_encoding *)enc)
|
||||
->convert(((const struct unknown_encoding *)enc)->userData, p);
|
||||
if (c & ~0xFFFF)
|
||||
return 0;
|
||||
return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
|
||||
}
|
||||
|
||||
static
|
||||
int unknown_isNmstrt(const ENCODING *enc, const char *p)
|
||||
{
|
||||
int c = ((const struct unknown_encoding *)enc)
|
||||
->convert(((const struct unknown_encoding *)enc)->userData, p);
|
||||
if (c & ~0xFFFF)
|
||||
return 0;
|
||||
return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
|
||||
}
|
||||
|
||||
static
|
||||
int unknown_isInvalid(const ENCODING *enc, const char *p)
|
||||
{
|
||||
int c = ((const struct unknown_encoding *)enc)
|
||||
->convert(((const struct unknown_encoding *)enc)->userData, p);
|
||||
return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
|
||||
}
|
||||
|
||||
static
|
||||
void unknown_toUtf8(const ENCODING *enc,
|
||||
const char **fromP, const char *fromLim,
|
||||
char **toP, const char *toLim)
|
||||
{
|
||||
char buf[XML_UTF8_ENCODE_MAX];
|
||||
for (;;) {
|
||||
const char *utf8;
|
||||
int n;
|
||||
if (*fromP == fromLim)
|
||||
break;
|
||||
utf8 = ((const struct unknown_encoding *)enc)->utf8[(unsigned char)**fromP];
|
||||
n = *utf8++;
|
||||
if (n == 0) {
|
||||
int c = ((const struct unknown_encoding *)enc)
|
||||
->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
|
||||
n = XmlUtf8Encode(c, buf);
|
||||
if (n > toLim - *toP)
|
||||
break;
|
||||
utf8 = buf;
|
||||
*fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
|
||||
- (BT_LEAD2 - 2);
|
||||
}
|
||||
else {
|
||||
if (n > toLim - *toP)
|
||||
break;
|
||||
(*fromP)++;
|
||||
}
|
||||
do {
|
||||
*(*toP)++ = *utf8++;
|
||||
} while (--n != 0);
|
||||
}
|
||||
}
|
||||
|
||||
static
|
||||
void unknown_toUtf16(const ENCODING *enc,
|
||||
const char **fromP, const char *fromLim,
|
||||
unsigned short **toP, const unsigned short *toLim)
|
||||
{
|
||||
while (*fromP != fromLim && *toP != toLim) {
|
||||
unsigned short c
|
||||
= ((const struct unknown_encoding *)enc)->utf16[(unsigned char)**fromP];
|
||||
if (c == 0) {
|
||||
c = (unsigned short)((const struct unknown_encoding *)enc)
|
||||
->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
|
||||
*fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
|
||||
- (BT_LEAD2 - 2);
|
||||
}
|
||||
else
|
||||
(*fromP)++;
|
||||
*(*toP)++ = c;
|
||||
}
|
||||
}
|
||||
|
||||
ENCODING *
|
||||
XmlInitUnknownEncoding(void *mem,
|
||||
int *table,
|
||||
int (*convert)(void *userData, const char *p),
|
||||
void *userData)
|
||||
{
|
||||
int i;
|
||||
struct unknown_encoding *e = mem;
|
||||
for (i = 0; i < sizeof(struct normal_encoding); i++)
|
||||
((char *)mem)[i] = ((char *)&latin1_encoding)[i];
|
||||
for (i = 0; i < 128; i++)
|
||||
if (latin1_encoding.type[i] != BT_OTHER
|
||||
&& latin1_encoding.type[i] != BT_NONXML
|
||||
&& table[i] != i)
|
||||
return 0;
|
||||
for (i = 0; i < 256; i++) {
|
||||
int c = table[i];
|
||||
if (c == -1) {
|
||||
e->normal.type[i] = BT_MALFORM;
|
||||
/* This shouldn't really get used. */
|
||||
e->utf16[i] = 0xFFFF;
|
||||
e->utf8[i][0] = 1;
|
||||
e->utf8[i][1] = 0;
|
||||
}
|
||||
else if (c < 0) {
|
||||
if (c < -4)
|
||||
return 0;
|
||||
e->normal.type[i] = BT_LEAD2 - (c + 2);
|
||||
e->utf8[i][0] = 0;
|
||||
e->utf16[i] = 0;
|
||||
}
|
||||
else if (c < 0x80) {
|
||||
if (latin1_encoding.type[c] != BT_OTHER
|
||||
&& latin1_encoding.type[c] != BT_NONXML
|
||||
&& c != i)
|
||||
return 0;
|
||||
e->normal.type[i] = latin1_encoding.type[c];
|
||||
e->utf8[i][0] = 1;
|
||||
e->utf8[i][1] = (char)c;
|
||||
e->utf16[i] = c == 0 ? 0xFFFF : c;
|
||||
}
|
||||
else if (checkCharRefNumber(c) < 0) {
|
||||
e->normal.type[i] = BT_NONXML;
|
||||
/* This shouldn't really get used. */
|
||||
e->utf16[i] = 0xFFFF;
|
||||
e->utf8[i][0] = 1;
|
||||
e->utf8[i][1] = 0;
|
||||
}
|
||||
else {
|
||||
if (c > 0xFFFF)
|
||||
return 0;
|
||||
if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
|
||||
e->normal.type[i] = BT_NMSTRT;
|
||||
else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
|
||||
e->normal.type[i] = BT_NAME;
|
||||
else
|
||||
e->normal.type[i] = BT_OTHER;
|
||||
e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
|
||||
e->utf16[i] = c;
|
||||
}
|
||||
}
|
||||
e->userData = userData;
|
||||
e->convert = convert;
|
||||
if (convert) {
|
||||
e->normal.isName2 = unknown_isName;
|
||||
e->normal.isName3 = unknown_isName;
|
||||
e->normal.isName4 = unknown_isName;
|
||||
e->normal.isNmstrt2 = unknown_isNmstrt;
|
||||
e->normal.isNmstrt3 = unknown_isNmstrt;
|
||||
e->normal.isNmstrt4 = unknown_isNmstrt;
|
||||
e->normal.isInvalid2 = unknown_isInvalid;
|
||||
e->normal.isInvalid3 = unknown_isInvalid;
|
||||
e->normal.isInvalid4 = unknown_isInvalid;
|
||||
}
|
||||
e->normal.enc.utf8Convert = unknown_toUtf8;
|
||||
e->normal.enc.utf16Convert = unknown_toUtf16;
|
||||
return &(e->normal.enc);
|
||||
}
|
||||
|
|
|
@ -29,6 +29,9 @@ extern "C" {
|
|||
#define XMLTOKAPI /* as nothing */
|
||||
#endif
|
||||
|
||||
/* The following token may be returned by XmlContentTok */
|
||||
#define XML_TOK_TRAILING_RSQB -5 /* ] or ]] at the end of the scan; might be start of
|
||||
illegal ]]> sequence */
|
||||
/* The following tokens may be returned by both XmlPrologTok and XmlContentTok */
|
||||
#define XML_TOK_NONE -4 /* The string to be scanned is empty */
|
||||
#define XML_TOK_TRAILING_CR -3 /* A CR at the end of the scan;
|
||||
|
@ -38,7 +41,7 @@ extern "C" {
|
|||
#define XML_TOK_INVALID 0
|
||||
|
||||
/* The following tokens are returned by XmlContentTok; some are also
|
||||
returned by XmlAttributeValueTok and XmlEntityTok */
|
||||
returned by XmlAttributeValueTok, XmlEntityTok, XmlCdataSectionTok */
|
||||
|
||||
#define XML_TOK_START_TAG_WITH_ATTS 1
|
||||
#define XML_TOK_START_TAG_NO_ATTS 2
|
||||
|
@ -47,7 +50,7 @@ extern "C" {
|
|||
#define XML_TOK_END_TAG 5
|
||||
#define XML_TOK_DATA_CHARS 6
|
||||
#define XML_TOK_DATA_NEWLINE 7
|
||||
#define XML_TOK_CDATA_SECTION 8
|
||||
#define XML_TOK_CDATA_SECT_OPEN 8
|
||||
#define XML_TOK_ENTITY_REF 9
|
||||
#define XML_TOK_CHAR_REF 10 /* numeric character reference */
|
||||
|
||||
|
@ -85,25 +88,25 @@ extern "C" {
|
|||
#define XML_TOK_CLOSE_PAREN_PLUS 37 /* )+ */
|
||||
#define XML_TOK_COMMA 38
|
||||
|
||||
/* The following tokens is returned only by XmlAttributeValueTok */
|
||||
/* The following token is returned only by XmlAttributeValueTok */
|
||||
#define XML_TOK_ATTRIBUTE_VALUE_S 39
|
||||
|
||||
#define XML_N_STATES 2
|
||||
/* The following token is returned only by XmlCdataSectionTok */
|
||||
#define XML_TOK_CDATA_SECT_CLOSE 40
|
||||
|
||||
#define XML_N_STATES 3
|
||||
#define XML_PROLOG_STATE 0
|
||||
#define XML_CONTENT_STATE 1
|
||||
#define XML_CDATA_SECTION_STATE 2
|
||||
|
||||
#define XML_N_LITERAL_TYPES 2
|
||||
#define XML_ATTRIBUTE_VALUE_LITERAL 0
|
||||
#define XML_ENTITY_VALUE_LITERAL 1
|
||||
|
||||
#define XML_N_INTERNAL_ENCODINGS 1
|
||||
#define XML_UTF8_ENCODING 0
|
||||
#if 0
|
||||
#define XML_UTF16_ENCODING 1
|
||||
#define XML_UCS4_ENCODING 2
|
||||
#endif
|
||||
|
||||
#define XML_MAX_BYTES_PER_CHAR 4
|
||||
/* The size of the buffer passed to XmlUtf8Encode must be at least this. */
|
||||
#define XML_UTF8_ENCODE_MAX 4
|
||||
/* The size of the buffer passed to XmlUtf16Encode must be at least this. */
|
||||
#define XML_UTF16_ENCODE_MAX 2
|
||||
|
||||
typedef struct position {
|
||||
/* first line and first column are 0 not 1 */
|
||||
|
@ -139,21 +142,26 @@ struct encoding {
|
|||
int (*getAtts)(const ENCODING *enc, const char *ptr,
|
||||
int attsMax, ATTRIBUTE *atts);
|
||||
int (*charRefNumber)(const ENCODING *enc, const char *ptr);
|
||||
int (*predefinedEntityName)(const ENCODING *, const char *, const char *);
|
||||
void (*updatePosition)(const ENCODING *,
|
||||
const char *ptr,
|
||||
const char *end,
|
||||
POSITION *);
|
||||
int (*isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
|
||||
const char **badPtr);
|
||||
int (*encode)(const ENCODING *enc,
|
||||
int charNum,
|
||||
char *buf);
|
||||
void (*convert[XML_N_INTERNAL_ENCODINGS])(const ENCODING *enc,
|
||||
const char **fromP,
|
||||
const char *fromLim,
|
||||
char **toP,
|
||||
const char *toLim);
|
||||
void (*utf8Convert)(const ENCODING *enc,
|
||||
const char **fromP,
|
||||
const char *fromLim,
|
||||
char **toP,
|
||||
const char *toLim);
|
||||
void (*utf16Convert)(const ENCODING *enc,
|
||||
const char **fromP,
|
||||
const char *fromLim,
|
||||
unsigned short **toP,
|
||||
const unsigned short *toLim);
|
||||
int minBytesPerChar;
|
||||
char isUtf8;
|
||||
char isUtf16;
|
||||
};
|
||||
|
||||
/*
|
||||
|
@ -186,6 +194,9 @@ literals, comments and processing instructions.
|
|||
#define XmlContentTok(enc, ptr, end, nextTokPtr) \
|
||||
XmlTok(enc, XML_CONTENT_STATE, ptr, end, nextTokPtr)
|
||||
|
||||
#define XmlCdataSectionTok(enc, ptr, end, nextTokPtr) \
|
||||
XmlTok(enc, XML_CDATA_SECTION_STATE, ptr, end, nextTokPtr)
|
||||
|
||||
/* This is used for performing a 2nd-level tokenization on
|
||||
the content of a literal that has already been returned by XmlTok. */
|
||||
|
||||
|
@ -215,17 +226,20 @@ the content of a literal that has already been returned by XmlTok. */
|
|||
#define XmlCharRefNumber(enc, ptr) \
|
||||
(((enc)->charRefNumber)(enc, ptr))
|
||||
|
||||
#define XmlPredefinedEntityName(enc, ptr, end) \
|
||||
(((enc)->predefinedEntityName)(enc, ptr, end))
|
||||
|
||||
#define XmlUpdatePosition(enc, ptr, end, pos) \
|
||||
(((enc)->updatePosition)(enc, ptr, end, pos))
|
||||
|
||||
#define XmlIsPublicId(enc, ptr, end, badPtr) \
|
||||
(((enc)->isPublicId)(enc, ptr, end, badPtr))
|
||||
|
||||
#define XmlEncode(enc, ch, buf) \
|
||||
(((enc)->encode)(enc, ch, buf))
|
||||
#define XmlUtf8Convert(enc, fromP, fromLim, toP, toLim) \
|
||||
(((enc)->utf8Convert)(enc, fromP, fromLim, toP, toLim))
|
||||
|
||||
#define XmlConvert(enc, targetEnc, fromP, fromLim, toP, toLim) \
|
||||
(((enc)->convert[targetEnc])(enc, fromP, fromLim, toP, toLim))
|
||||
#define XmlUtf16Convert(enc, fromP, fromLim, toP, toLim) \
|
||||
(((enc)->utf16Convert)(enc, fromP, fromLim, toP, toLim))
|
||||
|
||||
typedef struct {
|
||||
ENCODING initEnc;
|
||||
|
@ -243,7 +257,17 @@ int XMLTOKAPI XmlParseXmlDecl(int isGeneralTextEntity,
|
|||
int *standalonePtr);
|
||||
|
||||
int XMLTOKAPI XmlInitEncoding(INIT_ENCODING *, const ENCODING **, const char *name);
|
||||
const ENCODING XMLTOKAPI *XmlGetInternalEncoding(int);
|
||||
const ENCODING XMLTOKAPI *XmlGetUtf8InternalEncoding();
|
||||
const ENCODING XMLTOKAPI *XmlGetUtf16InternalEncoding();
|
||||
int XMLTOKAPI XmlUtf8Encode(int charNumber, char *buf);
|
||||
int XMLTOKAPI XmlUtf16Encode(int charNumber, unsigned short *buf);
|
||||
|
||||
int XMLTOKAPI XmlSizeOfUnknownEncoding();
|
||||
ENCODING XMLTOKAPI *
|
||||
XmlInitUnknownEncoding(void *mem,
|
||||
int *table,
|
||||
int (*convert)(void *userData, const char *p),
|
||||
void *userData);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
@ -56,7 +56,7 @@ Contributor(s):
|
|||
|
||||
#define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
|
||||
case BT_NONASCII: \
|
||||
if (!IS_NAME_CHAR(enc, ptr, MINBPC)) { \
|
||||
if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
|
||||
*nextTokPtr = ptr; \
|
||||
return XML_TOK_INVALID; \
|
||||
} \
|
||||
|
@ -84,7 +84,7 @@ Contributor(s):
|
|||
|
||||
#define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
|
||||
case BT_NONASCII: \
|
||||
if (!IS_NMSTRT_CHAR(enc, ptr, MINBPC)) { \
|
||||
if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
|
||||
*nextTokPtr = ptr; \
|
||||
return XML_TOK_INVALID; \
|
||||
} \
|
||||
|
@ -293,15 +293,14 @@ int PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end,
|
|||
return XML_TOK_PARTIAL;
|
||||
}
|
||||
|
||||
/* ptr points to character following "<![" */
|
||||
|
||||
static
|
||||
int PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *end,
|
||||
const char **nextTokPtr)
|
||||
{
|
||||
int i;
|
||||
/* CDATA[]]> */
|
||||
if (end - ptr < 9 * MINBPC)
|
||||
/* CDATA[ */
|
||||
if (end - ptr < 6 * MINBPC)
|
||||
return XML_TOK_PARTIAL;
|
||||
for (i = 0; i < 6; i++, ptr += MINBPC) {
|
||||
if (!CHAR_MATCHES(enc, ptr, "CDATA["[i])) {
|
||||
|
@ -309,22 +308,86 @@ int PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *e
|
|||
return XML_TOK_INVALID;
|
||||
}
|
||||
}
|
||||
end -= 2 * MINBPC;
|
||||
while (ptr != end) {
|
||||
switch (BYTE_TYPE(enc, ptr)) {
|
||||
INVALID_CASES(ptr, nextTokPtr)
|
||||
case BT_RSQB:
|
||||
if (CHAR_MATCHES(enc, ptr + MINBPC, ']')
|
||||
&& CHAR_MATCHES(enc, ptr + 2 * MINBPC, '>')) {
|
||||
*nextTokPtr = ptr + 3 * MINBPC;
|
||||
return XML_TOK_CDATA_SECTION;
|
||||
}
|
||||
/* fall through */
|
||||
default:
|
||||
ptr += MINBPC;
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_CDATA_SECT_OPEN;
|
||||
}
|
||||
|
||||
static
|
||||
int PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
|
||||
const char **nextTokPtr)
|
||||
{
|
||||
if (ptr == end)
|
||||
return XML_TOK_NONE;
|
||||
#if MINBPC > 1
|
||||
{
|
||||
size_t n = end - ptr;
|
||||
if (n & (MINBPC - 1)) {
|
||||
n &= ~(MINBPC - 1);
|
||||
if (n == 0)
|
||||
return XML_TOK_PARTIAL;
|
||||
end = ptr + n;
|
||||
}
|
||||
}
|
||||
return XML_TOK_PARTIAL;
|
||||
#endif
|
||||
switch (BYTE_TYPE(enc, ptr)) {
|
||||
case BT_RSQB:
|
||||
ptr += MINBPC;
|
||||
if (ptr == end)
|
||||
return XML_TOK_PARTIAL;
|
||||
if (!CHAR_MATCHES(enc, ptr, ']'))
|
||||
break;
|
||||
ptr += MINBPC;
|
||||
if (ptr == end)
|
||||
return XML_TOK_PARTIAL;
|
||||
if (!CHAR_MATCHES(enc, ptr, '>')) {
|
||||
ptr -= MINBPC;
|
||||
break;
|
||||
}
|
||||
*nextTokPtr = ptr + MINBPC;
|
||||
return XML_TOK_CDATA_SECT_CLOSE;
|
||||
case BT_CR:
|
||||
ptr += MINBPC;
|
||||
if (ptr == end)
|
||||
return XML_TOK_PARTIAL;
|
||||
if (BYTE_TYPE(enc, ptr) == BT_LF)
|
||||
ptr += MINBPC;
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_DATA_NEWLINE;
|
||||
case BT_LF:
|
||||
*nextTokPtr = ptr + MINBPC;
|
||||
return XML_TOK_DATA_NEWLINE;
|
||||
INVALID_CASES(ptr, nextTokPtr)
|
||||
default:
|
||||
ptr += MINBPC;
|
||||
break;
|
||||
}
|
||||
while (ptr != end) {
|
||||
switch (BYTE_TYPE(enc, ptr)) {
|
||||
#define LEAD_CASE(n) \
|
||||
case BT_LEAD ## n: \
|
||||
if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
|
||||
*nextTokPtr = ptr; \
|
||||
return XML_TOK_DATA_CHARS; \
|
||||
} \
|
||||
ptr += n; \
|
||||
break;
|
||||
LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
|
||||
#undef LEAD_CASE
|
||||
case BT_NONXML:
|
||||
case BT_MALFORM:
|
||||
case BT_TRAIL:
|
||||
case BT_CR:
|
||||
case BT_LF:
|
||||
case BT_RSQB:
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_DATA_CHARS;
|
||||
default:
|
||||
ptr += MINBPC;
|
||||
break;
|
||||
}
|
||||
}
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_DATA_CHARS;
|
||||
}
|
||||
|
||||
/* ptr points to character following "</" */
|
||||
|
@ -442,7 +505,7 @@ int PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
|
|||
if (ptr == end)
|
||||
return XML_TOK_PARTIAL;
|
||||
switch (BYTE_TYPE(enc, ptr)) {
|
||||
CHECK_NMSTRT_CASES(end, ptr, end, nextTokPtr)
|
||||
CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
|
||||
case BT_NUM:
|
||||
return PREFIX(scanCharRef)(enc, ptr + MINBPC, end, nextTokPtr);
|
||||
default:
|
||||
|
@ -543,6 +606,22 @@ int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
|
|||
break;
|
||||
}
|
||||
}
|
||||
ptr += MINBPC;
|
||||
if (ptr == end)
|
||||
return XML_TOK_PARTIAL;
|
||||
switch (BYTE_TYPE(enc, ptr)) {
|
||||
case BT_S:
|
||||
case BT_CR:
|
||||
case BT_LF:
|
||||
break;
|
||||
case BT_SOL:
|
||||
goto sol;
|
||||
case BT_GT:
|
||||
goto gt;
|
||||
default:
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_INVALID;
|
||||
}
|
||||
/* ptr points to closing quote */
|
||||
for (;;) {
|
||||
ptr += MINBPC;
|
||||
|
@ -553,9 +632,11 @@ int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
|
|||
case BT_S: case BT_CR: case BT_LF:
|
||||
continue;
|
||||
case BT_GT:
|
||||
gt:
|
||||
*nextTokPtr = ptr + MINBPC;
|
||||
return XML_TOK_START_TAG_WITH_ATTS;
|
||||
case BT_SOL:
|
||||
sol:
|
||||
ptr += MINBPC;
|
||||
if (ptr == end)
|
||||
return XML_TOK_PARTIAL;
|
||||
|
@ -694,12 +775,12 @@ int PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
|
|||
case BT_RSQB:
|
||||
ptr += MINBPC;
|
||||
if (ptr == end)
|
||||
return XML_TOK_PARTIAL;
|
||||
return XML_TOK_TRAILING_RSQB;
|
||||
if (!CHAR_MATCHES(enc, ptr, ']'))
|
||||
break;
|
||||
ptr += MINBPC;
|
||||
if (ptr == end)
|
||||
return XML_TOK_PARTIAL;
|
||||
return XML_TOK_TRAILING_RSQB;
|
||||
if (!CHAR_MATCHES(enc, ptr, '>')) {
|
||||
ptr -= MINBPC;
|
||||
break;
|
||||
|
@ -766,7 +847,7 @@ int PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
|
|||
if (ptr == end)
|
||||
return XML_TOK_PARTIAL;
|
||||
switch (BYTE_TYPE(enc, ptr)) {
|
||||
CHECK_NMSTRT_CASES(end, ptr, end, nextTokPtr)
|
||||
CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
|
||||
case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_PERCENT;
|
||||
|
@ -795,7 +876,7 @@ int PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
|
|||
if (ptr == end)
|
||||
return XML_TOK_PARTIAL;
|
||||
switch (BYTE_TYPE(enc, ptr)) {
|
||||
CHECK_NMSTRT_CASES(end, ptr, end, nextTokPtr)
|
||||
CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
|
||||
default:
|
||||
*nextTokPtr = ptr;
|
||||
return XML_TOK_INVALID;
|
||||
|
@ -944,7 +1025,7 @@ int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
|
|||
case BT_RPAR:
|
||||
ptr += MINBPC;
|
||||
if (ptr == end)
|
||||
return XML_TOK_INVALID;
|
||||
return XML_TOK_PARTIAL;
|
||||
switch (BYTE_TYPE(enc, ptr)) {
|
||||
case BT_AST:
|
||||
*nextTokPtr = ptr + MINBPC;
|
||||
|
@ -1001,12 +1082,12 @@ int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
|
|||
ptr += MINBPC;
|
||||
break;
|
||||
case BT_NONASCII:
|
||||
if (IS_NMSTRT_CHAR(enc, ptr, MINBPC)) {
|
||||
if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
|
||||
ptr += MINBPC;
|
||||
tok = XML_TOK_NAME;
|
||||
break;
|
||||
}
|
||||
if (IS_NAME_CHAR(enc, ptr, MINBPC)) {
|
||||
if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
|
||||
ptr += MINBPC;
|
||||
tok = XML_TOK_NMTOKEN;
|
||||
break;
|
||||
|
@ -1343,6 +1424,59 @@ int PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr)
|
|||
return checkCharRefNumber(result);
|
||||
}
|
||||
|
||||
static
|
||||
int PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr, const char *end)
|
||||
{
|
||||
switch (end - ptr) {
|
||||
case 2 * MINBPC:
|
||||
if (CHAR_MATCHES(enc, ptr + MINBPC, 't')) {
|
||||
switch (BYTE_TO_ASCII(enc, ptr)) {
|
||||
case 'l':
|
||||
return '<';
|
||||
case 'g':
|
||||
return '>';
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 3 * MINBPC:
|
||||
if (CHAR_MATCHES(enc, ptr, 'a')) {
|
||||
ptr += MINBPC;
|
||||
if (CHAR_MATCHES(enc, ptr, 'm')) {
|
||||
ptr += MINBPC;
|
||||
if (CHAR_MATCHES(enc, ptr, 'p'))
|
||||
return '&';
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 4 * MINBPC:
|
||||
switch (BYTE_TO_ASCII(enc, ptr)) {
|
||||
case 'q':
|
||||
ptr += MINBPC;
|
||||
if (CHAR_MATCHES(enc, ptr, 'u')) {
|
||||
ptr += MINBPC;
|
||||
if (CHAR_MATCHES(enc, ptr, 'o')) {
|
||||
ptr += MINBPC;
|
||||
if (CHAR_MATCHES(enc, ptr, 't'))
|
||||
return '"';
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 'a':
|
||||
ptr += MINBPC;
|
||||
if (CHAR_MATCHES(enc, ptr, 'p')) {
|
||||
ptr += MINBPC;
|
||||
if (CHAR_MATCHES(enc, ptr, 'o')) {
|
||||
ptr += MINBPC;
|
||||
if (CHAR_MATCHES(enc, ptr, 's'))
|
||||
return '\'';
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static
|
||||
int PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
|
||||
{
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
XML_ErrorString
|
||||
XML_GetErrorByteIndex
|
||||
XML_GetErrorColumnNumber
|
||||
XML_GetErrorLineNumber
|
||||
XML_GetCurrentLineNumber
|
||||
XML_GetCurrentColumnNumber
|
||||
XML_GetCurrentByteIndex
|
||||
XML_GetErrorCode
|
||||
XML_GetBuffer
|
||||
XML_ParseBuffer
|
||||
|
@ -19,7 +19,6 @@ hashTableDestroy
|
|||
lookup
|
||||
XmlParseXmlDecl
|
||||
XmlInitEncoding
|
||||
XmlGetInternalEncoding
|
||||
XmlPrologStateInit
|
||||
tokenizeXMLElement
|
||||
XMLDOM_PIHandler
|
||||
|
|
Загрузка…
Ссылка в новой задаче