2nd try; new parser from james clark

This commit is contained in:
jgellman%netscape.com 1998-08-20 21:20:50 +00:00
Родитель 67de8bdacd
Коммит 0fd8190c02
10 изменённых файлов: 2269 добавлений и 511 удалений

Просмотреть файл

@ -1,7 +1,7 @@
/*
The contents of this file are subject to the Mozilla Public License
Version 1.0 (the "License"); you may not use this file except in
compliance with the License. You may obtain a copy of the License at
compliance with the License. You may obtain a copy of the License at
http://www.mozilla.org/MPL/
Software distributed under the License is distributed on an "AS IS"
@ -18,15 +18,22 @@ James Clark. All Rights Reserved.
Contributor(s):
*/
#include "xmldef.h"
#include "hashtable.h"
#include <stdlib.h>
#include <string.h>
#include "xmldef.h"
#include "hashtable.h"
#ifdef XML_UNICODE
#define keycmp wcscmp
#else
#define keycmp strcmp
#endif
#define INIT_SIZE 64
static
unsigned long hash(const char *s)
unsigned long hash(KEY s)
{
unsigned long h = 0;
while (*s)
@ -34,7 +41,7 @@ unsigned long hash(const char *s)
return h;
}
NAMED *lookup(HASH_TABLE *table, const char *name, size_t createSize)
NAMED *lookup(HASH_TABLE *table, KEY name, size_t createSize)
{
size_t i;
if (table->size == 0) {
@ -52,7 +59,7 @@ NAMED *lookup(HASH_TABLE *table, const char *name, size_t createSize)
for (i = h & (table->size - 1);
table->v[i];
i == 0 ? i = table->size - 1 : --i) {
if (strcmp(name, table->v[i]->name) == 0)
if (keycmp(name, table->v[i]->name) == 0)
return table->v[i];
}
if (!createSize)

Просмотреть файл

@ -21,8 +21,14 @@ Contributor(s):
#include <stddef.h>
#ifdef XML_UNICODE
typedef const wchar_t *KEY;
#else
typedef const char *KEY;
#endif
typedef struct {
const char *name;
KEY name;
} NAMED;
typedef struct {
@ -32,7 +38,7 @@ typedef struct {
size_t usedLim;
} HASH_TABLE;
NAMED *lookup(HASH_TABLE *table, const char *name, size_t createSize);
NAMED *lookup(HASH_TABLE *table, KEY name, size_t createSize);
void hashTableInit(HASH_TABLE *);
void hashTableDestroy(HASH_TABLE *);

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -31,32 +31,199 @@ extern "C" {
typedef void *XML_Parser;
/* Constructs a new parser; encoding should be the name of the charset from
the Content-Type header if the Content-Type is text/xml, or null otherwise. */
#ifdef XML_UNICODE_WCHAR_T
XML_Parser XMLPARSEAPI
XML_ParserCreate(const char *encoding);
/* XML_UNICODE_WCHAR_T will work only if sizeof(wchar_t) == 2 and wchar_t
uses Unicode. */
/* Information is UTF-16 encoded as wchar_ts */
#ifndef XML_UNICODE
#define XML_UNICODE
#endif
#include <stddef.h>
typedef wchar_t XML_Char;
typedef wchar_t XML_LChar;
#else /* not XML_UNICODE_WCHAR_T */
#ifdef XML_UNICODE
/* Information is UTF-16 encoded as unsigned shorts */
typedef unsigned short XML_Char;
typedef char XML_LChar;
#else /* not XML_UNICODE */
/* Information is UTF-8 encoded. */
typedef char XML_Char;
typedef char XML_LChar;
/* atts is array of name/value pairs, terminated by NULL;
names and values are '\0' terminated. */
#endif /* not XML_UNICODE */
#endif /* not XML_UNICODE_WCHAR_T */
/* Constructs a new parser; encoding is the encoding specified by the external
protocol or null if there is none specified. */
XML_Parser XMLPARSEAPI
XML_ParserCreate(const XML_Char *encoding);
/* atts is array of name/value pairs, terminated by 0;
names and values are 0 terminated. */
typedef void (*XML_StartElementHandler)(void *userData,
const char *name,
const char **atts);
const XML_Char *name,
const XML_Char **atts);
typedef void (*XML_EndElementHandler)(void *userData,
const char *name);
const XML_Char *name);
/* s is not 0 terminated. */
typedef void (*XML_CharacterDataHandler)(void *userData,
const char *s,
const XML_Char *s,
int len);
/* target and data are '\0' terminated */
/* target and data are 0 terminated */
typedef void (*XML_ProcessingInstructionHandler)(void *userData,
const char *target,
const char *data);
const XML_Char *target,
const XML_Char *data);
/* This is called for any characters in the XML document for
which there is no applicable handler. This includes both
characters that are part of markup which is of a kind that is
not reported (comments, markup declarations), or characters
that are part of a construct which could be reported but
for which no handler has been supplied. The characters are passed
exactly as they were in the XML document except that
they will be encoded in UTF-8. Line boundaries are not normalized.
Note that a byte order mark character is not passed to the default handler.
If a default handler is set, internal entity references
are not expanded. There are no guarantees about
how characters are divided between calls to the default handler:
for example, a comment might be split between multiple calls. */
typedef void (*XML_DefaultHandler)(void *userData,
const XML_Char *s,
int len);
/* This is called for a declaration of an unparsed (NDATA)
entity. The base argument is whatever was set by XML_SetBase.
The entityName, systemId and notationName arguments will never be null.
The other arguments may be. */
typedef void (*XML_UnparsedEntityDeclHandler)(void *userData,
const XML_Char *entityName,
const XML_Char *base,
const XML_Char *systemId,
const XML_Char *publicId,
const XML_Char *notationName);
/* This is called for a declaration of notation.
The base argument is whatever was set by XML_SetBase.
The notationName will never be null. The other arguments can be. */
typedef void (*XML_NotationDeclHandler)(void *userData,
const XML_Char *notationName,
const XML_Char *base,
const XML_Char *systemId,
const XML_Char *publicId);
/* This is called for a reference to an external parsed general entity.
The referenced entity is not automatically parsed.
The application can parse it immediately or later using
XML_ExternalEntityParserCreate.
The parser argument is the parser parsing the entity containing the reference;
it can be passed as the parser argument to XML_ExternalEntityParserCreate.
The systemId argument is the system identifier as specified in the entity declaration;
it will not be null.
The base argument is the system identifier that should be used as the base for
resolving systemId if systemId was relative; this is set by XML_SetBase;
it may be null.
The publicId argument is the public identifier as specified in the entity declaration,
or null if none was specified; the whitespace in the public identifier
will have been normalized as required by the XML spec.
The openEntityNames argument is a space-separated list of the names of the entities
that are open for the parse of this entity (including the name of the referenced
entity); this can be passed as the openEntityNames argument to
XML_ExternalEntityParserCreate; openEntityNames is valid only until the handler
returns, so if the referenced entity is to be parsed later, it must be copied.
The handler should return 0 if processing should not continue because of
a fatal error in the handling of the external entity.
In this case the calling parser will return an XML_ERROR_EXTERNAL_ENTITY_HANDLING
error.
Note that unlike other handlers the first argument is the parser, not userData. */
typedef int (*XML_ExternalEntityRefHandler)(XML_Parser parser,
const XML_Char *openEntityNames,
const XML_Char *base,
const XML_Char *systemId,
const XML_Char *publicId);
/* This structure is filled in by the XML_UnknownEncodingHandler
to provide information to the parser about encodings that are unknown
to the parser.
The map[b] member gives information about byte sequences
whose first byte is b.
If map[b] is c where c is >= 0, then b by itself encodes the Unicode scalar value c.
If map[b] is -1, then the byte sequence is malformed.
If map[b] is -n, where n >= 2, then b is the first byte of an n-byte
sequence that encodes a single Unicode scalar value.
The data member will be passed as the first argument to the convert function.
The convert function is used to convert multibyte sequences;
s will point to a n-byte sequence where map[(unsigned char)*s] == -n.
The convert function must return the Unicode scalar value
represented by this byte sequence or -1 if the byte sequence is malformed.
The convert function may be null if the encoding is a single-byte encoding,
that is if map[b] >= -1 for all bytes b.
When the parser is finished with the encoding, then if release is not null,
it will call release passing it the data member;
once release has been called, the convert function will not be called again.
Expat places certain restrictions on the encodings that are supported
using this mechanism.
1. Every ASCII character that can appear in a well-formed XML document,
other than the characters
$@\^`{}~
must be represented by a single byte, and that byte must be the
same byte that represents that character in ASCII.
2. No character may require more than 4 bytes to encode.
3. All characters encoded must have Unicode scalar values <= 0xFFFF,
(ie characters that would be encoded by surrogates in UTF-16
are not allowed). Note that this restriction doesn't apply to
the built-in support for UTF-8 and UTF-16.
4. No Unicode character may be encoded by more than one distinct sequence
of bytes. */
typedef struct {
int map[256];
void *data;
int (*convert)(void *data, const char *s);
void (*release)(void *data);
} XML_Encoding;
/* This is called for an encoding that is unknown to the parser.
The encodingHandlerData argument is that which was passed as the
second argument to XML_SetUnknownEncodingHandler.
The name argument gives the name of the encoding as specified in
the encoding declaration.
If the callback can provide information about the encoding,
it must fill in the XML_Encoding structure, and return 1.
Otherwise it must return 0.
If info does not describe a suitable encoding,
then the parser will return an XML_UNKNOWN_ENCODING error. */
typedef int (*XML_UnknownEncodingHandler)(void *encodingHandlerData,
const XML_Char *name,
XML_Encoding *info);
void XMLPARSEAPI
XML_SetElementHandler(XML_Parser parser,
@ -71,10 +238,62 @@ void XMLPARSEAPI
XML_SetProcessingInstructionHandler(XML_Parser parser,
XML_ProcessingInstructionHandler handler);
void XMLPARSEAPI
XML_SetDefaultHandler(XML_Parser parser,
XML_DefaultHandler handler);
void XMLPARSEAPI
XML_SetUnparsedEntityDeclHandler(XML_Parser parser,
XML_UnparsedEntityDeclHandler handler);
void XMLPARSEAPI
XML_SetNotationDeclHandler(XML_Parser parser,
XML_NotationDeclHandler handler);
void XMLPARSEAPI
XML_SetExternalEntityRefHandler(XML_Parser parser,
XML_ExternalEntityRefHandler handler);
void XMLPARSEAPI
XML_SetUnknownEncodingHandler(XML_Parser parser,
XML_UnknownEncodingHandler handler,
void *encodingHandlerData);
/* This can be called within a handler for a start element, end element,
processing instruction or character data. It causes the corresponding
markup to be passed to the default handler.
Within the expansion of an internal entity, nothing will be passed
to the default handler, although this usually will not happen since
setting a default handler inhibits expansion of internal entities. */
void XMLPARSEAPI XML_DefaultCurrent(XML_Parser parser);
/* This value is passed as the userData argument to callbacks. */
void XMLPARSEAPI
XML_SetUserData(XML_Parser parser, void *userData);
/* Returns the last value set by XML_SetUserData or null. */
#define XML_GetUserData(parser) (*(void **)(parser))
/* If this function is called, then the parser will be passed
as the first argument to callbacks instead of userData.
The userData will still be accessible using XML_GetUserData. */
void XMLPARSEAPI
XML_UseParserAsHandlerArg(XML_Parser parser);
/* Sets the base to be used for resolving relative URIs in system identifiers in
declarations. Resolving relative identifiers is left to the application:
this value will be passed through as the base argument to the
XML_ExternalEntityRefHandler, XML_NotationDeclHandler
and XML_UnparsedEntityDeclHandler. The base argument will be copied.
Returns zero if out of memory, non-zero otherwise. */
int XMLPARSEAPI
XML_SetBase(XML_Parser parser, const XML_Char *base);
const XML_Char XMLPARSEAPI *
XML_GetBase(XML_Parser parser);
/* Parses some input. Returns 0 if a fatal error is detected.
The last call to XML_Parse must have isFinal true;
len may be zero for this call (or any other). */
@ -87,8 +306,20 @@ XML_GetBuffer(XML_Parser parser, int len);
int XMLPARSEAPI
XML_ParseBuffer(XML_Parser parser, int len, int isFinal);
/* If XML_Parser or XML_ParseEnd have returned 0, then XML_GetError*
returns information about the error. */
/* Creates an XML_Parser object that can parse an external general entity;
openEntityNames is a space-separated list of the names of the entities that are open
for the parse of this entity (including the name of this one);
encoding is the externally specified encoding,
or null if there is no externally specified encoding.
This can be called at any point after the first call to an ExternalEntityRefHandler
so long as the parser has not yet been freed.
The new parser is completely independent and may safely be used in a separate thread.
The handlers and userData are initialized from the parser argument.
Returns 0 if out of memory. Otherwise returns a new XML_Parser object. */
XML_Parser XMLPARSEAPI
XML_ExternalEntityParserCreate(XML_Parser parser,
const XML_Char *openEntityNames,
const XML_Char *encoding);
enum XML_Error {
XML_ERROR_NONE,
@ -110,19 +341,39 @@ enum XML_Error {
XML_ERROR_ATTRIBUTE_EXTERNAL_ENTITY_REF,
XML_ERROR_MISPLACED_XML_PI,
XML_ERROR_UNKNOWN_ENCODING,
XML_ERROR_INCORRECT_ENCODING
XML_ERROR_INCORRECT_ENCODING,
XML_ERROR_UNCLOSED_CDATA_SECTION,
XML_ERROR_EXTERNAL_ENTITY_HANDLING
};
int XMLPARSEAPI XML_GetErrorCode(XML_Parser parser);
int XMLPARSEAPI XML_GetErrorLineNumber(XML_Parser parser);
int XMLPARSEAPI XML_GetErrorColumnNumber(XML_Parser parser);
long XMLPARSEAPI XML_GetErrorByteIndex(XML_Parser parser);
/* If XML_Parse or XML_ParseBuffer have returned 0, then XML_GetErrorCode
returns information about the error. */
enum XML_Error XMLPARSEAPI XML_GetErrorCode(XML_Parser parser);
/* These functions return information about the current parse location.
They may be called when XML_Parse or XML_ParseBuffer return 0;
in this case the location is the location of the character at which
the error was detected.
They may also be called from any other callback called to report
some parse event; in this case the location is the location of the first
of the sequence of characters that generated the event. */
int XMLPARSEAPI XML_GetCurrentLineNumber(XML_Parser parser);
int XMLPARSEAPI XML_GetCurrentColumnNumber(XML_Parser parser);
long XMLPARSEAPI XML_GetCurrentByteIndex(XML_Parser parser);
/* For backwards compatibility with previous versions. */
#define XML_GetErrorLineNumber XML_GetCurrentLineNumber
#define XML_GetErrorColumnNumber XML_GetCurrentColumnNumber
#define XML_GetErrorByteIndex XML_GetCurrentByteIndex
/* Frees memory used by the parser. */
void XMLPARSEAPI
XML_ParserFree(XML_Parser parser);
const char XMLPARSEAPI *
XML_ErrorString(int code);
/* Returns a string describing the error. */
const XML_LChar XMLPARSEAPI *XML_ErrorString(int code);
#ifdef __cplusplus
}

Просмотреть файл

@ -594,7 +594,7 @@ int notation4(PROLOG_STATE *state,
return XML_ROLE_NOTATION_SYSTEM_ID;
case XML_TOK_DECL_CLOSE:
state->handler = internalSubset;
return XML_ROLE_NONE;
return XML_ROLE_NOTATION_NO_SYSTEM_ID;
}
return syntaxError(state);
}

Просмотреть файл

@ -44,6 +44,7 @@ enum {
XML_ROLE_ENTITY_NOTATION_NAME,
XML_ROLE_NOTATION_NAME,
XML_ROLE_NOTATION_SYSTEM_ID,
XML_ROLE_NOTATION_NO_SYSTEM_ID,
XML_ROLE_NOTATION_PUBLIC_ID,
XML_ROLE_ATTRIBUTE_NAME,
XML_ROLE_ATTRIBUTE_TYPE_CDATA,

Просмотреть файл

@ -23,7 +23,7 @@ Contributor(s):
#include "nametab.h"
#define VTABLE1 \
{ PREFIX(prologTok), PREFIX(contentTok) }, \
{ PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok) }, \
{ PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
PREFIX(sameName), \
PREFIX(nameMatchesAscii), \
@ -31,14 +31,11 @@ Contributor(s):
PREFIX(skipS), \
PREFIX(getAtts), \
PREFIX(charRefNumber), \
PREFIX(predefinedEntityName), \
PREFIX(updatePosition), \
PREFIX(isPublicId)
#define VTABLE2 \
PREFIX(encode), \
{ PREFIX(toUtf8) }
#define VTABLE VTABLE1, VTABLE2
#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
#define UCS2_GET_NAMING(pages, hi, lo) \
(namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F)))
@ -81,11 +78,79 @@ We need 8 bits to index into pages, 3 bits to add to that index and
#define UTF8_INVALID4(p) ((*p) == 0xF4 && ((p)[1] & 0x30) != 0)
/* Predicate that always answers "no": returns 0 and ignores both enc and p.
   It is aliased by #define below as utf8_isName4, utf8_isNmstrt4 and
   utf8_isInvalid2. */
static
int isNever(const ENCODING *enc, const char *p)
{
return 0;
}
/* Name-character test for the 2-byte UTF-8 sequence starting at p.
   Evaluates UTF8_GET_NAMING2 against the namePages table, viewing the
   bytes as unsigned char. enc is not used. */
static
int utf8_isName2(const ENCODING *enc, const char *p)
{
return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
}
/* Name-character test for the 3-byte UTF-8 sequence starting at p.
   Evaluates UTF8_GET_NAMING3 against the namePages table, viewing the
   bytes as unsigned char. enc is not used. */
static
int utf8_isName3(const ENCODING *enc, const char *p)
{
return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
}
#define utf8_isName4 isNever
/* Name-start-character test for the 2-byte UTF-8 sequence starting at p.
   Evaluates UTF8_GET_NAMING2 against the nmstrtPages table, viewing the
   bytes as unsigned char. enc is not used. */
static
int utf8_isNmstrt2(const ENCODING *enc, const char *p)
{
return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
}
/* Name-start-character test for the 3-byte UTF-8 sequence starting at p.
   Evaluates UTF8_GET_NAMING3 against the nmstrtPages table, viewing the
   bytes as unsigned char. enc is not used. */
static
int utf8_isNmstrt3(const ENCODING *enc, const char *p)
{
return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
}
#define utf8_isNmstrt4 isNever
#define utf8_isInvalid2 isNever
/* Validity test for the 3-byte UTF-8 sequence starting at p: returns the
   result of UTF8_INVALID3 applied to the bytes viewed as unsigned char.
   enc is not used. */
static
int utf8_isInvalid3(const ENCODING *enc, const char *p)
{
return UTF8_INVALID3((const unsigned char *)p);
}
/* Validity test for the 4-byte UTF-8 sequence starting at p: returns the
   result of UTF8_INVALID4, which is non-zero when the lead byte is 0xF4
   and the second byte has any of the 0x30 bits set. enc is not used. */
static
int utf8_isInvalid4(const ENCODING *enc, const char *p)
{
return UTF8_INVALID4((const unsigned char *)p);
}
/* An ENCODING extended with a per-byte classification table and
   per-length character predicates. Code in this file casts an
   ENCODING pointer to this struct, so enc must stay the first member.
   The predicate members are normally filled in with NORMAL_VTABLE(prefix). */
struct normal_encoding {
ENCODING enc;
/* Byte type for each possible byte value; indexed with (unsigned char)*p. */
unsigned char type[256];
/* Name-character tests for 2-, 3- and 4-byte sequences
   (reached through IS_NAME_CHAR(enc, p, n)). */
int (*isName2)(const ENCODING *, const char *);
int (*isName3)(const ENCODING *, const char *);
int (*isName4)(const ENCODING *, const char *);
/* Name-start-character tests for 2-, 3- and 4-byte sequences
   (reached through IS_NMSTRT_CHAR(enc, p, n)). */
int (*isNmstrt2)(const ENCODING *, const char *);
int (*isNmstrt3)(const ENCODING *, const char *);
int (*isNmstrt4)(const ENCODING *, const char *);
/* Malformed-sequence tests for 2-, 3- and 4-byte sequences
   (reached through IS_INVALID_CHAR(enc, p, n)). */
int (*isInvalid2)(const ENCODING *, const char *);
int (*isInvalid3)(const ENCODING *, const char *);
int (*isInvalid4)(const ENCODING *, const char *);
};
#define NORMAL_VTABLE(E) \
E ## isName2, \
E ## isName3, \
E ## isName4, \
E ## isNmstrt2, \
E ## isNmstrt3, \
E ## isNmstrt4, \
E ## isInvalid2, \
E ## isInvalid3, \
E ## isInvalid4
static int checkCharRefNumber(int);
#include "xmltok_impl.h"
@ -95,12 +160,16 @@ static int checkCharRefNumber(int);
#define BYTE_TYPE(enc, p) \
(((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
#define BYTE_TO_ASCII(enc, p) (*p)
#define IS_NAME_CHAR(enc, p, n) UTF8_GET_NAMING(namePages, p, n)
#define IS_NMSTRT_CHAR(enc, p, n) UTF8_GET_NAMING(nmstrtPages, p, n)
#define IS_NAME_CHAR(enc, p, n) \
(((const struct normal_encoding *)(enc))->isName ## n(enc, p))
#define IS_NMSTRT_CHAR(enc, p, n) \
(((const struct normal_encoding *)(enc))->isNmstrt ## n(enc, p))
#define IS_INVALID_CHAR(enc, p, n) \
((n) == 3 \
? UTF8_INVALID3((const unsigned char *)(p)) \
: ((n) == 4 ? UTF8_INVALID4((const unsigned char *)(p)) : 0))
(((const struct normal_encoding *)(enc))->isInvalid ## n(enc, p))
#define IS_NAME_CHAR_MINBPC(enc, p) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
/* c is an ASCII character */
#define CHAR_MATCHES(enc, p, c) (*(p) == c)
@ -113,51 +182,18 @@ static int checkCharRefNumber(int);
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR
enum {
/* cvalN is value of masked first byte of N byte sequence */
cval1 = 0x00,
cval2 = 0xc0,
cval3 = 0xe0,
cval4 = 0xf0,
/* minN is minimum legal resulting value for N byte sequence */
min2 = 0x80,
min3 = 0x800,
min4 = 0x10000
enum { /* UTF8_cvalN is value of masked first byte of N byte sequence */
UTF8_cval1 = 0x00,
UTF8_cval2 = 0xc0,
UTF8_cval3 = 0xe0,
UTF8_cval4 = 0xf0
};
static
int utf8_encode(const ENCODING *enc, int c, char *buf)
{
if (c < 0)
return 0;
if (c < min2) {
buf[0] = (c | cval1);
return 1;
}
if (c < min3) {
buf[0] = ((c >> 6) | cval2);
buf[1] = ((c & 0x3f) | 0x80);
return 2;
}
if (c < min4) {
buf[0] = ((c >> 12) | cval3);
buf[1] = (((c >> 6) & 0x3f) | 0x80);
buf[2] = ((c & 0x3f) | 0x80);
return 3;
}
if (c < 0x110000) {
buf[0] = ((c >> 18) | cval4);
buf[1] = (((c >> 12) & 0x3f) | 0x80);
buf[2] = (((c >> 6) & 0x3f) | 0x80);
buf[3] = ((c & 0x3f) | 0x80);
return 4;
}
return 0;
}
static
void utf8_toUtf8(const ENCODING *enc,
const char **fromP, const char *fromLim,
@ -177,34 +213,63 @@ void utf8_toUtf8(const ENCODING *enc,
*toP = to;
}
/* Converts UTF-8 bytes in [*fromP, fromLim) into UTF-16 code units written
   to [*toP, toLim).

   enc must point at a struct normal_encoding; its type[] table is used to
   classify each lead byte as BT_LEAD2, BT_LEAD3, BT_LEAD4 or "anything
   else" (copied through as a single unit).

   Conversion stops when the input is exhausted, when the output is full,
   or when the next character needs a surrogate pair and only one output
   slot remains.  On return *fromP and *toP are advanced past what was
   consumed and produced, so the caller can resume with a fresh buffer.

   The function does not itself check that a multi-byte sequence lies
   entirely before fromLim. */
static
void utf8_toUtf16(const ENCODING *enc,
                  const char **fromP, const char *fromLim,
                  unsigned short **toP, const unsigned short *toLim)
{
  unsigned short *to = *toP;
  const char *from = *fromP;
  while (from != fromLim && to != toLim) {
    switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
    case BT_LEAD2:
      /* 5 payload bits from the lead byte, 6 from the trail byte. */
      *to++ = ((from[0] & 0x1f) << 6) | (from[1] & 0x3f);
      from += 2;
      break;
    case BT_LEAD3:
      /* 4 + 6 + 6 payload bits. */
      *to++ = ((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f);
      from += 3;
      break;
    case BT_LEAD4:
      {
        unsigned long n;
        /* A 4-byte sequence becomes a surrogate pair and needs two output
           slots.  If only one is left we must leave the loop entirely: a
           plain `break` would only leave the switch, and since neither
           `from` nor `to` has moved the while loop would spin forever. */
        if (to + 1 == toLim)
          goto after;
        n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
        n -= 0x10000;
        to[0] = (unsigned short)((n >> 10) | 0xD800);
        to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
        to += 2;
        from += 4;
      }
      break;
    default:
      *to++ = *from++;
      break;
    }
  }
after:
  *fromP = from;
  *toP = to;
}
static const struct normal_encoding utf8_encoding = {
{ VTABLE1, utf8_encode, { utf8_toUtf8 }, 1 },
{ VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
{
#include "asciitab.h"
#include "utf8tab.h"
}
},
NORMAL_VTABLE(utf8_)
};
static const struct normal_encoding internal_utf8_encoding = {
{ VTABLE1, utf8_encode, { utf8_toUtf8 }, 1 },
{ VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
{
#include "iasciitab.h"
#include "utf8tab.h"
}
},
NORMAL_VTABLE(utf8_)
};
static
int latin1_encode(const ENCODING *enc, int c, char *buf)
{
if (c < 0)
return 0;
if (c <= 0xFF) {
buf[0] = (char)c;
return 1;
}
return 0;
}
static
void latin1_toUtf8(const ENCODING *enc,
const char **fromP, const char *fromLim,
@ -218,7 +283,7 @@ void latin1_toUtf8(const ENCODING *enc,
if (c & 0x80) {
if (toLim - *toP < 2)
break;
*(*toP)++ = ((c >> 6) | cval2);
*(*toP)++ = ((c >> 6) | UTF8_cval2);
*(*toP)++ = ((c & 0x3f) | 0x80);
(*fromP)++;
}
@ -230,15 +295,39 @@ void latin1_toUtf8(const ENCODING *enc,
}
}
/* Widens Latin-1 bytes from [*fromP, fromLim) into 16-bit units written to
   [*toP, toLim), one unit per input byte.  Each byte is read as unsigned
   char so values >= 0x80 are not sign-extended.  Stops when either range
   is exhausted and stores the advanced positions back through fromP and
   toP.  enc is not used. */
static
void latin1_toUtf16(const ENCODING *enc,
                    const char **fromP, const char *fromLim,
                    unsigned short **toP, const unsigned short *toLim)
{
  const char *src = *fromP;
  unsigned short *dst = *toP;

  for (; src != fromLim && dst != toLim; src++, dst++)
    *dst = (unsigned char)*src;

  *fromP = src;
  *toP = dst;
}
static const struct normal_encoding latin1_encoding = {
{ VTABLE1, latin1_encode, { latin1_toUtf8 }, 1 },
{ VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
{
#include "asciitab.h"
#include "latin1tab.h"
}
};
#define latin1tab (latin1_encoding.type)
/* Copies bytes unchanged from [*fromP, fromLim) to [*toP, toLim).
   Stops when either range is exhausted and stores the advanced positions
   back through fromP and toP.  enc is not used. */
static
void ascii_toUtf8(const ENCODING *enc,
                  const char **fromP, const char *fromLim,
                  char **toP, const char *toLim)
{
  const char *src = *fromP;
  char *dst = *toP;

  for (; src != fromLim && dst != toLim; src++, dst++)
    *dst = *src;

  *fromP = src;
  *toP = dst;
}
static const struct normal_encoding ascii_encoding = {
{ VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
{
#include "asciitab.h"
/* BT_NONXML == 0 */
}
};
#undef PREFIX
@ -260,25 +349,6 @@ static int unicode_byte_type(char hi, char lo)
return BT_NONASCII;
}
#define DEFINE_UTF16_ENCODE \
static \
int PREFIX(encode)(const ENCODING *enc, int charNum, char *buf) \
{ \
if (charNum < 0) \
return 0; \
if (charNum < 0x10000) { \
SET2(buf, charNum); \
return 2; \
} \
if (charNum < 0x110000) { \
charNum -= 0x10000; \
SET2(buf, (charNum >> 10) + 0xD800); \
SET2(buf + 2, (charNum & 0x3FF) + 0xDC00); \
return 4; \
} \
return 0; \
}
#define DEFINE_UTF16_TO_UTF8 \
static \
void PREFIX(toUtf8)(const ENCODING *enc, \
@ -308,7 +378,7 @@ void PREFIX(toUtf8)(const ENCODING *enc, \
*fromP = from; \
return; \
} \
*(*toP)++ = ((lo >> 6) | (hi << 2) | cval2); \
*(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \
*(*toP)++ = ((lo & 0x3f) | 0x80); \
break; \
default: \
@ -317,7 +387,7 @@ void PREFIX(toUtf8)(const ENCODING *enc, \
return; \
} \
/* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
*(*toP)++ = ((hi >> 4) | cval3); \
*(*toP)++ = ((hi >> 4) | UTF8_cval3); \
*(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
*(*toP)++ = ((lo & 0x3f) | 0x80); \
break; \
@ -327,7 +397,7 @@ void PREFIX(toUtf8)(const ENCODING *enc, \
return; \
} \
plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
*(*toP)++ = ((plane >> 2) | cval4); \
*(*toP)++ = ((plane >> 2) | UTF8_cval4); \
*(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
from += 2; \
lo2 = GET_LO(from); \
@ -342,15 +412,33 @@ void PREFIX(toUtf8)(const ENCODING *enc, \
*fromP = from; \
}
/* Expands to the definition of PREFIX(toUtf16), which copies 2-byte units
   from [*fromP, fromLim) into 16-bit units at [*toP, toLim).  Each output
   unit is assembled as (GET_HI << 8) | GET_LO, so the includer's GET_HI /
   GET_LO definitions decide the input byte order.  *fromP advances by two
   bytes and *toP by one unit per copied unit; both are updated in place.

   Before copying, if there are more input bytes than twice the remaining
   output units and the high byte of the last input unit is in 0xD8..0xDF,
   fromLim is pulled back by one unit.
   NOTE(review): the test inspects the final input unit, not the unit at
   the point where output space would run out, and the 0xF8 mask matches
   both high and low surrogates -- confirm this is the intended guard. */
#define DEFINE_UTF16_TO_UTF16 \
static \
void PREFIX(toUtf16)(const ENCODING *enc, \
const char **fromP, const char *fromLim, \
unsigned short **toP, const unsigned short *toLim) \
{ \
/* Avoid copying first half only of surrogate */ \
if (fromLim - *fromP > ((toLim - *toP) << 1) \
&& (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
fromLim -= 2; \
for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
*(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
}
#define PREFIX(ident) little2_ ## ident
#define MINBPC 2
#define BYTE_TYPE(enc, p) \
((p)[1] == 0 ? latin1tab[(unsigned char)*(p)] : unicode_byte_type((p)[1], (p)[0]))
((p)[1] == 0 \
? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
: unicode_byte_type((p)[1], (p)[0]))
#define BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
#define CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
#define IS_NAME_CHAR(enc, p, n) \
#define IS_NAME_CHAR(enc, p, n) (0)
#define IS_NAME_CHAR_MINBPC(enc, p) \
UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
#define IS_NMSTRT_CHAR(enc, p, n) \
#define IS_NMSTRT_CHAR(enc, p, n) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
#include "xmltok_impl.c"
@ -360,8 +448,8 @@ void PREFIX(toUtf8)(const ENCODING *enc, \
#define GET_LO(ptr) ((unsigned char)(ptr)[0])
#define GET_HI(ptr) ((unsigned char)(ptr)[1])
DEFINE_UTF16_ENCODE
DEFINE_UTF16_TO_UTF8
DEFINE_UTF16_TO_UTF16
#undef SET2
#undef GET_LO
@ -371,10 +459,32 @@ DEFINE_UTF16_TO_UTF8
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR
static const struct encoding little2_encoding = { VTABLE, 2 };
static const struct normal_encoding little2_encoding = {
{ VTABLE, 2, 0,
#if BYTE_ORDER == 12
1
#else
0
#endif
},
#include "asciitab.h"
#include "latin1tab.h"
};
#if BYTE_ORDER != 21
static const struct normal_encoding internal_little2_encoding = {
{ VTABLE, 2, 0, 1 },
#include "iasciitab.h"
#include "latin1tab.h"
};
#endif
#undef PREFIX
@ -382,12 +492,16 @@ static const struct encoding little2_encoding = { VTABLE, 2 };
#define MINBPC 2
/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
#define BYTE_TYPE(enc, p) \
((p)[0] == 0 ? latin1tab[(unsigned char)(p)[1]] : unicode_byte_type((p)[0], (p)[1]))
((p)[0] == 0 \
? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
: unicode_byte_type((p)[0], (p)[1]))
#define BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
#define CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
#define IS_NAME_CHAR(enc, p, n) \
#define IS_NAME_CHAR(enc, p, n) 0
#define IS_NAME_CHAR_MINBPC(enc, p) \
UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
#define IS_NMSTRT_CHAR(enc, p, n) \
#define IS_NMSTRT_CHAR(enc, p, n) (0)
#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
#include "xmltok_impl.c"
@ -397,8 +511,8 @@ static const struct encoding little2_encoding = { VTABLE, 2 };
#define GET_LO(ptr) ((unsigned char)(ptr)[1])
#define GET_HI(ptr) ((unsigned char)(ptr)[0])
DEFINE_UTF16_ENCODE
DEFINE_UTF16_TO_UTF8
DEFINE_UTF16_TO_UTF16
#undef SET2
#undef GET_LO
@ -408,10 +522,32 @@ DEFINE_UTF16_TO_UTF8
#undef BYTE_TO_ASCII
#undef CHAR_MATCHES
#undef IS_NAME_CHAR
#undef IS_NAME_CHAR_MINBPC
#undef IS_NMSTRT_CHAR
#undef IS_NMSTRT_CHAR_MINBPC
#undef IS_INVALID_CHAR
static const struct encoding big2_encoding = { VTABLE, 2 };
static const struct normal_encoding big2_encoding = {
{ VTABLE, 2, 0,
#if BYTE_ORDER == 21
1
#else
0
#endif
},
#include "asciitab.h"
#include "latin1tab.h"
};
#if BYTE_ORDER != 12
static const struct normal_encoding internal_big2_encoding = {
{ VTABLE, 2, 0, 1 },
#include "iasciitab.h"
#include "latin1tab.h"
};
#endif
#undef PREFIX
@ -454,18 +590,18 @@ int initScan(const ENCODING *enc, int state, const char *ptr, const char *end,
else {
switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
case 0x003C:
*encPtr = &big2_encoding;
*encPtr = &big2_encoding.enc;
return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
case 0xFEFF:
*nextTokPtr = ptr + 2;
*encPtr = &big2_encoding;
*encPtr = &big2_encoding.enc;
return XML_TOK_BOM;
case 0x3C00:
*encPtr = &little2_encoding;
*encPtr = &little2_encoding.enc;
return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
case 0xFFFE:
*nextTokPtr = ptr + 2;
*encPtr = &little2_encoding;
*encPtr = &little2_encoding.enc;
return XML_TOK_BOM;
}
}
@ -494,13 +630,21 @@ void initUpdatePosition(const ENCODING *enc, const char *ptr,
normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
}
const ENCODING *XmlGetInternalEncoding(int e)
const ENCODING *XmlGetUtf8InternalEncoding()
{
switch (e) {
case XML_UTF8_ENCODING:
return &internal_utf8_encoding.enc;
}
return 0;
return &internal_utf8_encoding.enc;
}
/* Returns the "internal" UTF-16 encoding object matching the host byte
   order.  When BYTE_ORDER is 12 the little-endian object is returned and
   when it is 21 the big-endian one; these match the #if guards under which
   internal_big2_encoding / internal_little2_encoding are compiled out.
   For any other BYTE_ORDER value the choice is made at run time by looking
   at the first byte of a short holding 1: non-zero selects little-endian. */
const ENCODING *XmlGetUtf16InternalEncoding()
{
#if BYTE_ORDER == 12
return &internal_little2_encoding.enc;
#elif BYTE_ORDER == 21
return &internal_big2_encoding.enc;
#else
const short n = 1;
return *(const char *)&n ? &internal_little2_encoding.enc : &internal_big2_encoding.enc;
#endif
}
int XmlInitEncoding(INIT_ENCODING *p, const ENCODING **encPtr, const char *name)
@ -514,6 +658,10 @@ int XmlInitEncoding(INIT_ENCODING *p, const ENCODING **encPtr, const char *name)
*encPtr = &utf8_encoding.enc;
return 1;
}
if (streqci(name, "US-ASCII")) {
*encPtr = &ascii_encoding.enc;
return 1;
}
if (!streqci(name, "UTF-16"))
return 0;
}
@ -531,7 +679,7 @@ int toAscii(const ENCODING *enc, const char *ptr, const char *end)
{
char buf[1];
char *p = buf;
XmlConvert(enc, XML_UTF8_ENCODING, &ptr, end, &p, p + 1);
XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
if (p == buf)
return -1;
else
@ -641,7 +789,7 @@ const ENCODING *findEncoding(const ENCODING *enc, const char *ptr, const char *e
char buf[ENCODING_MAX];
char *p = buf;
int i;
XmlConvert(enc, XML_UTF8_ENCODING, &ptr, end, &p, p + ENCODING_MAX - 1);
XmlUtf8Convert(enc, &ptr, end, &p, p + ENCODING_MAX - 1);
if (ptr != end)
return 0;
*p = 0;
@ -653,11 +801,13 @@ const ENCODING *findEncoding(const ENCODING *enc, const char *ptr, const char *e
return &utf8_encoding.enc;
if (streqci(buf, "ISO-8859-1"))
return &latin1_encoding.enc;
if (streqci(buf, "US-ASCII"))
return &ascii_encoding.enc;
if (streqci(buf, "UTF-16")) {
static const unsigned short n = 1;
if (enc->minBytesPerChar == 2)
return enc;
return &big2_encoding;
return &big2_encoding.enc;
}
return 0;
}
@ -757,3 +907,229 @@ int checkCharRefNumber(int result)
return result;
}
/* Encodes the scalar value c as UTF-8 into buf.
   Returns the number of bytes stored (1 to 4), or 0 without touching buf
   when c is negative or not below 0x110000.  buf is not 0 terminated and
   must have room for the bytes written. */
int XmlUtf8Encode(int c, char *buf)
{
  enum {
    /* minN is minimum legal resulting value for N byte sequence */
    min2 = 0x80,
    min3 = 0x800,
    min4 = 0x10000
  };
  int len;

  if (c < 0 || c >= 0x110000)
    return 0;

  /* Pick the shortest form able to hold c. */
  if (c < min2)
    len = 1;
  else if (c < min3)
    len = 2;
  else if (c < min4)
    len = 3;
  else
    len = 4;

  switch (len) {
  case 1:
    buf[0] = (c | UTF8_cval1);
    break;
  case 2:
    buf[0] = ((c >> 6) | UTF8_cval2);
    buf[1] = ((c & 0x3f) | 0x80);
    break;
  case 3:
    buf[0] = ((c >> 12) | UTF8_cval3);
    buf[1] = (((c >> 6) & 0x3f) | 0x80);
    buf[2] = ((c & 0x3f) | 0x80);
    break;
  default:
    buf[0] = ((c >> 18) | UTF8_cval4);
    buf[1] = (((c >> 12) & 0x3f) | 0x80);
    buf[2] = (((c >> 6) & 0x3f) | 0x80);
    buf[3] = ((c & 0x3f) | 0x80);
    break;
  }
  return len;
}
/* Encodes the scalar value charNum as UTF-16 into buf.
   Returns the number of 16-bit units stored: 1 for values below 0x10000,
   2 (a surrogate pair) for values below 0x110000.  Returns 0 without
   touching buf when charNum is negative or not below 0x110000. */
int XmlUtf16Encode(int charNum, unsigned short *buf)
{
  if (charNum < 0 || charNum >= 0x110000)
    return 0;

  if (charNum >= 0x10000) {
    /* Split the 20-bit offset above the BMP into two 10-bit halves. */
    int offset = charNum - 0x10000;
    buf[0] = (offset >> 10) + 0xD800;
    buf[1] = (offset & 0x3FF) + 0xDC00;
    return 2;
  }

  buf[0] = charNum;
  return 1;
}
/* State for an encoding described at run time by a 256-entry byte table
   (filled in by XmlInitUnknownEncoding). */
struct unknown_encoding {
/* Must stay the first member: the functions in this file cast an
   ENCODING pointer to both struct normal_encoding and this struct. */
struct normal_encoding normal;
/* Callback used for bytes that start a multi-byte sequence; it is passed
   userData and a pointer to the first byte and returns the character
   number. */
int (*convert)(void *userData, const char *p);
/* Opaque pointer handed back to convert. */
void *userData;
/* Per-byte UTF-16 code unit; 0 means "call convert for this byte". */
unsigned short utf16[256];
/* Per-byte UTF-8 form: element [0] is the byte count, followed by the
   bytes themselves; a count of 0 means "call convert for this byte". */
char utf8[256][4];
};
/* Returns the number of bytes a caller must provide as the mem argument
   of XmlInitUnknownEncoding. */
int XmlSizeOfUnknownEncoding()
{
  const int nbytes = sizeof(struct unknown_encoding);
  return nbytes;
}
/* Returns non-zero if the multi-byte character starting at p, as decoded
   by the user-supplied convert callback, is listed in namePages.
   Values outside 0..0xFFFF are never name characters. */
static
int unknown_isName(const ENCODING *enc, const char *p)
{
  const struct unknown_encoding *ue = (const struct unknown_encoding *)enc;
  int c = ue->convert(ue->userData, p);
  return (c & ~0xFFFF) ? 0 : UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
}
/* Returns non-zero if the multi-byte character starting at p, as decoded
   by the user-supplied convert callback, is listed in nmstrtPages.
   Values outside 0..0xFFFF are never name-start characters. */
static
int unknown_isNmstrt(const ENCODING *enc, const char *p)
{
  const struct unknown_encoding *ue = (const struct unknown_encoding *)enc;
  int c = ue->convert(ue->userData, p);
  return (c & ~0xFFFF) ? 0 : UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
}
/* Returns 1 if the multi-byte character starting at p is unusable:
   either the convert callback yields a value outside 0..0xFFFF, or
   checkCharRefNumber rejects the value.  Returns 0 otherwise. */
static
int unknown_isInvalid(const ENCODING *enc, const char *p)
{
  const struct unknown_encoding *ue = (const struct unknown_encoding *)enc;
  int c = ue->convert(ue->userData, p);
  if ((c & ~0xFFFF) != 0)
    return 1;
  return checkCharRefNumber(c) < 0;
}
/* Convert bytes in [*fromP, fromLim) from a table-driven encoding to UTF-8,
   storing the result at *toP without writing at or beyond toLim.

   Each input byte is looked up in the per-byte utf8 table; a stored length
   of 0 marks the start of a multi-byte sequence, which is decoded through
   the convert callback and encoded with XmlUtf8Encode.

   On return *fromP and *toP point just past the input consumed and the
   output produced.  Conversion stops early when the next character would
   not fit in the remaining output space; that character is not consumed. */
static
void unknown_toUtf8(const ENCODING *enc,
                    const char **fromP, const char *fromLim,
                    char **toP, const char *toLim)
{
  const struct unknown_encoding *ue = (const struct unknown_encoding *)enc;
  char buf[XML_UTF8_ENCODE_MAX];

  while (*fromP != fromLim) {
    const char *utf8 = ue->utf8[(unsigned char)**fromP];
    int n = *utf8++;
    if (n == 0) {
      /* Multi-byte sequence: ask the callback for the character number. */
      int c = ue->convert(ue->userData, *fromP);
      n = XmlUtf8Encode(c, buf);
      if (n > toLim - *toP)
        break;
      utf8 = buf;
      /* The byte type of a lead byte encodes the sequence length. */
      *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
                - (BT_LEAD2 - 2);
    }
    else {
      if (n > toLim - *toP)
        break;
      (*fromP)++;
    }
    /* XmlUtf8Encode returns 0 for a value it cannot encode, so n may be 0
       here; copy exactly n bytes (possibly none) rather than assuming at
       least one, which would run past both buffers. */
    while (n-- > 0)
      *(*toP)++ = *utf8++;
  }
}
/* Convert bytes in [*fromP, fromLim) from a table-driven encoding to UTF-16
   code units stored at *toP, stopping when either the input is exhausted or
   *toP reaches toLim.  A table entry of 0 marks the start of a multi-byte
   sequence, which is decoded through the convert callback and truncated to
   one code unit.  *fromP and *toP are advanced past what was processed. */
static
void unknown_toUtf16(const ENCODING *enc,
                     const char **fromP, const char *fromLim,
                     unsigned short **toP, const unsigned short *toLim)
{
  const struct unknown_encoding *ue = (const struct unknown_encoding *)enc;

  for (;;) {
    unsigned short unit;
    if (*fromP == fromLim || *toP == toLim)
      break;
    unit = ue->utf16[(unsigned char)**fromP];
    if (unit != 0) {
      /* Single-byte character: value comes straight from the table. */
      (*fromP)++;
    }
    else {
      unit = (unsigned short)ue->convert(ue->userData, *fromP);
      /* The byte type of a lead byte encodes the sequence length. */
      *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
                - (BT_LEAD2 - 2);
    }
    *(*toP)++ = unit;
  }
}
/* Build a table-driven encoding in the caller-supplied memory block mem,
   which must be at least XmlSizeOfUnknownEncoding() bytes.

   table has 256 entries, one per byte value:
     >= 0       the character number the single byte stands for
     -1         the byte is malformed in this encoding
     -2 .. -4   the byte starts a sequence of 2 .. 4 bytes that is decoded
                by calling convert(userData, pointer-to-first-byte)

   Returns a pointer to the initialised encoding, or 0 if the table is
   unusable: a byte below 128 whose Latin-1 byte type is neither BT_OTHER
   nor BT_NONXML does not map to itself, some byte maps onto such a
   character number other than its own value, an entry is below -4 or above
   0xFFFF, or a multi-byte lead byte is declared while convert is null.
   mem may have been partly written when 0 is returned. */
ENCODING *
XmlInitUnknownEncoding(void *mem,
                       int *table,
                       int (*convert)(void *userData, const char *p),
                       void *userData)
{
  int i;
  struct unknown_encoding *e = mem;
  /* Start from a byte-for-byte copy of the Latin-1 encoding and then
     override the per-byte tables and the conversion functions. */
  for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
    ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
  for (i = 0; i < 128; i++)
    if (latin1_encoding.type[i] != BT_OTHER
        && latin1_encoding.type[i] != BT_NONXML
        && table[i] != i)
      return 0;
  for (i = 0; i < 256; i++) {
    int c = table[i];
    if (c == -1) {
      e->normal.type[i] = BT_MALFORM;
      /* This shouldn't really get used. */
      e->utf16[i] = 0xFFFF;
      e->utf8[i][0] = 1;
      e->utf8[i][1] = 0;
    }
    else if (c < 0) {
      if (c < -4)
        return 0;
      /* The converters call convert unconditionally for lead bytes, so a
         multi-byte table without a callback can never work. */
      if (!convert)
        return 0;
      e->normal.type[i] = BT_LEAD2 - (c + 2);
      /* Zero entries tell the converters to call convert. */
      e->utf8[i][0] = 0;
      e->utf16[i] = 0;
    }
    else if (c < 0x80) {
      if (latin1_encoding.type[c] != BT_OTHER
          && latin1_encoding.type[c] != BT_NONXML
          && c != i)
        return 0;
      e->normal.type[i] = latin1_encoding.type[c];
      e->utf8[i][0] = 1;
      e->utf8[i][1] = (char)c;
      /* 0 is reserved as the "call convert" marker in utf16[]. */
      e->utf16[i] = c == 0 ? 0xFFFF : c;
    }
    else if (checkCharRefNumber(c) < 0) {
      e->normal.type[i] = BT_NONXML;
      /* This shouldn't really get used. */
      e->utf16[i] = 0xFFFF;
      e->utf8[i][0] = 1;
      e->utf8[i][1] = 0;
    }
    else {
      if (c > 0xFFFF)
        return 0;
      if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
        e->normal.type[i] = BT_NMSTRT;
      else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
        e->normal.type[i] = BT_NAME;
      else
        e->normal.type[i] = BT_OTHER;
      /* c <= 0xFFFF needs at most 3 bytes, which fits after the count. */
      e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
      e->utf16[i] = c;
    }
  }
  e->userData = userData;
  e->convert = convert;
  if (convert) {
    e->normal.isName2 = unknown_isName;
    e->normal.isName3 = unknown_isName;
    e->normal.isName4 = unknown_isName;
    e->normal.isNmstrt2 = unknown_isNmstrt;
    e->normal.isNmstrt3 = unknown_isNmstrt;
    e->normal.isNmstrt4 = unknown_isNmstrt;
    e->normal.isInvalid2 = unknown_isInvalid;
    e->normal.isInvalid3 = unknown_isInvalid;
    e->normal.isInvalid4 = unknown_isInvalid;
  }
  e->normal.enc.utf8Convert = unknown_toUtf8;
  e->normal.enc.utf16Convert = unknown_toUtf16;
  return &(e->normal.enc);
}

Просмотреть файл

@ -29,6 +29,9 @@ extern "C" {
#define XMLTOKAPI /* as nothing */
#endif
/* The following token may be returned by XmlContentTok */
#define XML_TOK_TRAILING_RSQB -5 /* ] or ]] at the end of the scan; might be start of
illegal ]]> sequence */
/* The following tokens may be returned by both XmlPrologTok and XmlContentTok */
#define XML_TOK_NONE -4 /* The string to be scanned is empty */
#define XML_TOK_TRAILING_CR -3 /* A CR at the end of the scan;
@ -38,7 +41,7 @@ extern "C" {
#define XML_TOK_INVALID 0
/* The following tokens are returned by XmlContentTok; some are also
returned by XmlAttributeValueTok and XmlEntityTok */
returned by XmlAttributeValueTok, XmlEntityTok, XmlCdataSectionTok */
#define XML_TOK_START_TAG_WITH_ATTS 1
#define XML_TOK_START_TAG_NO_ATTS 2
@ -47,7 +50,7 @@ extern "C" {
#define XML_TOK_END_TAG 5
#define XML_TOK_DATA_CHARS 6
#define XML_TOK_DATA_NEWLINE 7
#define XML_TOK_CDATA_SECTION 8
#define XML_TOK_CDATA_SECT_OPEN 8
#define XML_TOK_ENTITY_REF 9
#define XML_TOK_CHAR_REF 10 /* numeric character reference */
@ -85,25 +88,25 @@ extern "C" {
#define XML_TOK_CLOSE_PAREN_PLUS 37 /* )+ */
#define XML_TOK_COMMA 38
/* The following tokens is returned only by XmlAttributeValueTok */
/* The following token is returned only by XmlAttributeValueTok */
#define XML_TOK_ATTRIBUTE_VALUE_S 39
#define XML_N_STATES 2
/* The following token is returned only by XmlCdataSectionTok */
#define XML_TOK_CDATA_SECT_CLOSE 40
#define XML_N_STATES 3
#define XML_PROLOG_STATE 0
#define XML_CONTENT_STATE 1
#define XML_CDATA_SECTION_STATE 2
#define XML_N_LITERAL_TYPES 2
#define XML_ATTRIBUTE_VALUE_LITERAL 0
#define XML_ENTITY_VALUE_LITERAL 1
#define XML_N_INTERNAL_ENCODINGS 1
#define XML_UTF8_ENCODING 0
#if 0
#define XML_UTF16_ENCODING 1
#define XML_UCS4_ENCODING 2
#endif
#define XML_MAX_BYTES_PER_CHAR 4
/* The size of the buffer passed to XmlUtf8Encode must be at least this. */
#define XML_UTF8_ENCODE_MAX 4
/* The size of the buffer passed to XmlUtf16Encode must be at least this. */
#define XML_UTF16_ENCODE_MAX 2
typedef struct position {
/* first line and first column are 0 not 1 */
@ -139,21 +142,26 @@ struct encoding {
int (*getAtts)(const ENCODING *enc, const char *ptr,
int attsMax, ATTRIBUTE *atts);
int (*charRefNumber)(const ENCODING *enc, const char *ptr);
int (*predefinedEntityName)(const ENCODING *, const char *, const char *);
void (*updatePosition)(const ENCODING *,
const char *ptr,
const char *end,
POSITION *);
int (*isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
const char **badPtr);
int (*encode)(const ENCODING *enc,
int charNum,
char *buf);
void (*convert[XML_N_INTERNAL_ENCODINGS])(const ENCODING *enc,
const char **fromP,
const char *fromLim,
char **toP,
const char *toLim);
void (*utf8Convert)(const ENCODING *enc,
const char **fromP,
const char *fromLim,
char **toP,
const char *toLim);
void (*utf16Convert)(const ENCODING *enc,
const char **fromP,
const char *fromLim,
unsigned short **toP,
const unsigned short *toLim);
int minBytesPerChar;
char isUtf8;
char isUtf16;
};
/*
@ -186,6 +194,9 @@ literals, comments and processing instructions.
#define XmlContentTok(enc, ptr, end, nextTokPtr) \
XmlTok(enc, XML_CONTENT_STATE, ptr, end, nextTokPtr)
#define XmlCdataSectionTok(enc, ptr, end, nextTokPtr) \
XmlTok(enc, XML_CDATA_SECTION_STATE, ptr, end, nextTokPtr)
/* This is used for performing a 2nd-level tokenization on
the content of a literal that has already been returned by XmlTok. */
@ -215,17 +226,20 @@ the content of a literal that has already been returned by XmlTok. */
#define XmlCharRefNumber(enc, ptr) \
(((enc)->charRefNumber)(enc, ptr))
#define XmlPredefinedEntityName(enc, ptr, end) \
(((enc)->predefinedEntityName)(enc, ptr, end))
#define XmlUpdatePosition(enc, ptr, end, pos) \
(((enc)->updatePosition)(enc, ptr, end, pos))
#define XmlIsPublicId(enc, ptr, end, badPtr) \
(((enc)->isPublicId)(enc, ptr, end, badPtr))
#define XmlEncode(enc, ch, buf) \
(((enc)->encode)(enc, ch, buf))
#define XmlUtf8Convert(enc, fromP, fromLim, toP, toLim) \
(((enc)->utf8Convert)(enc, fromP, fromLim, toP, toLim))
#define XmlConvert(enc, targetEnc, fromP, fromLim, toP, toLim) \
(((enc)->convert[targetEnc])(enc, fromP, fromLim, toP, toLim))
#define XmlUtf16Convert(enc, fromP, fromLim, toP, toLim) \
(((enc)->utf16Convert)(enc, fromP, fromLim, toP, toLim))
typedef struct {
ENCODING initEnc;
@ -243,7 +257,17 @@ int XMLTOKAPI XmlParseXmlDecl(int isGeneralTextEntity,
int *standalonePtr);
int XMLTOKAPI XmlInitEncoding(INIT_ENCODING *, const ENCODING **, const char *name);
const ENCODING XMLTOKAPI *XmlGetInternalEncoding(int);
const ENCODING XMLTOKAPI *XmlGetUtf8InternalEncoding();
const ENCODING XMLTOKAPI *XmlGetUtf16InternalEncoding();
int XMLTOKAPI XmlUtf8Encode(int charNumber, char *buf);
int XMLTOKAPI XmlUtf16Encode(int charNumber, unsigned short *buf);
int XMLTOKAPI XmlSizeOfUnknownEncoding();
ENCODING XMLTOKAPI *
XmlInitUnknownEncoding(void *mem,
int *table,
int (*convert)(void *userData, const char *p),
void *userData);
#ifdef __cplusplus
}

Просмотреть файл

@ -56,7 +56,7 @@ Contributor(s):
#define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
case BT_NONASCII: \
if (!IS_NAME_CHAR(enc, ptr, MINBPC)) { \
if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
*nextTokPtr = ptr; \
return XML_TOK_INVALID; \
} \
@ -84,7 +84,7 @@ Contributor(s):
#define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
case BT_NONASCII: \
if (!IS_NMSTRT_CHAR(enc, ptr, MINBPC)) { \
if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
*nextTokPtr = ptr; \
return XML_TOK_INVALID; \
} \
@ -293,15 +293,14 @@ int PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end,
return XML_TOK_PARTIAL;
}
/* ptr points to character following "<![" */
static
int PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *end,
const char **nextTokPtr)
{
int i;
/* CDATA[]]> */
if (end - ptr < 9 * MINBPC)
/* CDATA[ */
if (end - ptr < 6 * MINBPC)
return XML_TOK_PARTIAL;
for (i = 0; i < 6; i++, ptr += MINBPC) {
if (!CHAR_MATCHES(enc, ptr, "CDATA["[i])) {
@ -309,22 +308,86 @@ int PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *e
return XML_TOK_INVALID;
}
}
end -= 2 * MINBPC;
while (ptr != end) {
switch (BYTE_TYPE(enc, ptr)) {
INVALID_CASES(ptr, nextTokPtr)
case BT_RSQB:
if (CHAR_MATCHES(enc, ptr + MINBPC, ']')
&& CHAR_MATCHES(enc, ptr + 2 * MINBPC, '>')) {
*nextTokPtr = ptr + 3 * MINBPC;
return XML_TOK_CDATA_SECTION;
}
/* fall through */
default:
ptr += MINBPC;
*nextTokPtr = ptr;
return XML_TOK_CDATA_SECT_OPEN;
}
/* Tokenize the inside of a CDATA section, starting at ptr and not reading
   at or beyond end.

   Returns:
     XML_TOK_NONE              ptr == end
     XML_TOK_PARTIAL           more input is needed to decide (a trailing
                               "]", "]]" or CR, or less than one whole
                               character when MINBPC > 1)
     XML_TOK_CDATA_SECT_CLOSE  "]]>" was scanned
     XML_TOK_DATA_NEWLINE      CR, LF or CR LF was scanned
     XML_TOK_DATA_CHARS        a run of ordinary data characters was scanned
   plus whatever INVALID_CASES returns for a bad first character.
   *nextTokPtr is set to the end of the token for the CLOSE, NEWLINE and
   DATA_CHARS results. */
static
int PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
                            const char **nextTokPtr)
{
  if (ptr == end)
    return XML_TOK_NONE;
#if MINBPC > 1
  {
    /* Ignore a trailing partial character by rounding the input length
       down to a multiple of MINBPC.  Scanning then continues below; an
       unconditional return here would make the rest of the function
       unreachable for multi-byte encodings. */
    size_t n = end - ptr;
    if (n & (MINBPC - 1)) {
      n &= ~(MINBPC - 1);
      if (n == 0)
        return XML_TOK_PARTIAL;
      end = ptr + n;
    }
  }
#endif
  /* The first character decides whether this is a close, a newline or the
     start of a run of data characters. */
  switch (BYTE_TYPE(enc, ptr)) {
  case BT_RSQB:
    ptr += MINBPC;
    if (ptr == end)
      return XML_TOK_PARTIAL;
    if (!CHAR_MATCHES(enc, ptr, ']'))
      break;
    ptr += MINBPC;
    if (ptr == end)
      return XML_TOK_PARTIAL;
    if (!CHAR_MATCHES(enc, ptr, '>')) {
      /* "]]" not followed by ">": only the first "]" is data for now. */
      ptr -= MINBPC;
      break;
    }
    *nextTokPtr = ptr + MINBPC;
    return XML_TOK_CDATA_SECT_CLOSE;
  case BT_CR:
    ptr += MINBPC;
    if (ptr == end)
      return XML_TOK_PARTIAL;
    if (BYTE_TYPE(enc, ptr) == BT_LF)
      ptr += MINBPC;
    *nextTokPtr = ptr;
    return XML_TOK_DATA_NEWLINE;
  case BT_LF:
    *nextTokPtr = ptr + MINBPC;
    return XML_TOK_DATA_NEWLINE;
  INVALID_CASES(ptr, nextTokPtr)
  default:
    ptr += MINBPC;
    break;
  }
  /* Extend the run of data characters up to the next character that needs
     its own token or is not acceptable. */
  while (ptr != end) {
    switch (BYTE_TYPE(enc, ptr)) {
#define LEAD_CASE(n) \
    case BT_LEAD ## n: \
      if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
        *nextTokPtr = ptr; \
        return XML_TOK_DATA_CHARS; \
      } \
      ptr += n; \
      break;
    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
#undef LEAD_CASE
    case BT_NONXML:
    case BT_MALFORM:
    case BT_TRAIL:
    case BT_CR:
    case BT_LF:
    case BT_RSQB:
      *nextTokPtr = ptr;
      return XML_TOK_DATA_CHARS;
    default:
      ptr += MINBPC;
      break;
    }
  }
  *nextTokPtr = ptr;
  return XML_TOK_DATA_CHARS;
}
/* ptr points to character following "</" */
@ -442,7 +505,7 @@ int PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
if (ptr == end)
return XML_TOK_PARTIAL;
switch (BYTE_TYPE(enc, ptr)) {
CHECK_NMSTRT_CASES(end, ptr, end, nextTokPtr)
CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
case BT_NUM:
return PREFIX(scanCharRef)(enc, ptr + MINBPC, end, nextTokPtr);
default:
@ -543,6 +606,22 @@ int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
break;
}
}
ptr += MINBPC;
if (ptr == end)
return XML_TOK_PARTIAL;
switch (BYTE_TYPE(enc, ptr)) {
case BT_S:
case BT_CR:
case BT_LF:
break;
case BT_SOL:
goto sol;
case BT_GT:
goto gt;
default:
*nextTokPtr = ptr;
return XML_TOK_INVALID;
}
/* ptr points to closing quote */
for (;;) {
ptr += MINBPC;
@ -553,9 +632,11 @@ int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
case BT_S: case BT_CR: case BT_LF:
continue;
case BT_GT:
gt:
*nextTokPtr = ptr + MINBPC;
return XML_TOK_START_TAG_WITH_ATTS;
case BT_SOL:
sol:
ptr += MINBPC;
if (ptr == end)
return XML_TOK_PARTIAL;
@ -694,12 +775,12 @@ int PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
case BT_RSQB:
ptr += MINBPC;
if (ptr == end)
return XML_TOK_PARTIAL;
return XML_TOK_TRAILING_RSQB;
if (!CHAR_MATCHES(enc, ptr, ']'))
break;
ptr += MINBPC;
if (ptr == end)
return XML_TOK_PARTIAL;
return XML_TOK_TRAILING_RSQB;
if (!CHAR_MATCHES(enc, ptr, '>')) {
ptr -= MINBPC;
break;
@ -766,7 +847,7 @@ int PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
if (ptr == end)
return XML_TOK_PARTIAL;
switch (BYTE_TYPE(enc, ptr)) {
CHECK_NMSTRT_CASES(end, ptr, end, nextTokPtr)
CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
*nextTokPtr = ptr;
return XML_TOK_PERCENT;
@ -795,7 +876,7 @@ int PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
if (ptr == end)
return XML_TOK_PARTIAL;
switch (BYTE_TYPE(enc, ptr)) {
CHECK_NMSTRT_CASES(end, ptr, end, nextTokPtr)
CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
default:
*nextTokPtr = ptr;
return XML_TOK_INVALID;
@ -944,7 +1025,7 @@ int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
case BT_RPAR:
ptr += MINBPC;
if (ptr == end)
return XML_TOK_INVALID;
return XML_TOK_PARTIAL;
switch (BYTE_TYPE(enc, ptr)) {
case BT_AST:
*nextTokPtr = ptr + MINBPC;
@ -1001,12 +1082,12 @@ int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
ptr += MINBPC;
break;
case BT_NONASCII:
if (IS_NMSTRT_CHAR(enc, ptr, MINBPC)) {
if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
ptr += MINBPC;
tok = XML_TOK_NAME;
break;
}
if (IS_NAME_CHAR(enc, ptr, MINBPC)) {
if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
ptr += MINBPC;
tok = XML_TOK_NMTOKEN;
break;
@ -1343,6 +1424,59 @@ int PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr)
return checkCharRefNumber(result);
}
/* Recognise the five predefined entity names in the range [ptr, end).
   Returns the character the entity stands for ('<' for lt, '>' for gt,
   '&' for amp, '"' for quot, '\'' for apos), or 0 if the name is none
   of these. */
static
int PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr, const char *end)
{
  /* Two characters: "lt" or "gt". */
  if (end - ptr == 2 * MINBPC) {
    if (!CHAR_MATCHES(enc, ptr + MINBPC, 't'))
      return 0;
    switch (BYTE_TO_ASCII(enc, ptr)) {
    case 'l':
      return '<';
    case 'g':
      return '>';
    }
    return 0;
  }

  /* Three characters: "amp". */
  if (end - ptr == 3 * MINBPC) {
    if (CHAR_MATCHES(enc, ptr, 'a')
        && CHAR_MATCHES(enc, ptr + MINBPC, 'm')
        && CHAR_MATCHES(enc, ptr + 2 * MINBPC, 'p'))
      return '&';
    return 0;
  }

  /* Four characters: "quot" or "apos", told apart by the first one. */
  if (end - ptr == 4 * MINBPC) {
    switch (BYTE_TO_ASCII(enc, ptr)) {
    case 'q':
      if (CHAR_MATCHES(enc, ptr + MINBPC, 'u')
          && CHAR_MATCHES(enc, ptr + 2 * MINBPC, 'o')
          && CHAR_MATCHES(enc, ptr + 3 * MINBPC, 't'))
        return '"';
      break;
    case 'a':
      if (CHAR_MATCHES(enc, ptr + MINBPC, 'p')
          && CHAR_MATCHES(enc, ptr + 2 * MINBPC, 'o')
          && CHAR_MATCHES(enc, ptr + 3 * MINBPC, 's'))
        return '\'';
      break;
    }
  }

  return 0;
}
static
int PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
{

Просмотреть файл

@ -1,7 +1,7 @@
XML_ErrorString
XML_GetErrorByteIndex
XML_GetErrorColumnNumber
XML_GetErrorLineNumber
XML_GetCurrentLineNumber
XML_GetCurrentColumnNumber
XML_GetCurrentByteIndex
XML_GetErrorCode
XML_GetBuffer
XML_ParseBuffer
@ -19,7 +19,6 @@ hashTableDestroy
lookup
XmlParseXmlDecl
XmlInitEncoding
XmlGetInternalEncoding
XmlPrologStateInit
tokenizeXMLElement
XMLDOM_PIHandler