2nd try; new parser from james clark

1998-08-20 21:20:50 +00:00 · 1998-08-20 21:20:50 +00:00 · c1ce4a8d42
--- a/modules/xml/expat/xmlparse/hashtable.c
+++ b/modules/xml/expat/xmlparse/hashtable.c
@ -1,7 +1,7 @@
 /*
 The contents of this file are subject to the Mozilla Public License
 Version 1.0 (the "License"); you may not use this file except in
-compliance with the License. You may obtain a copy of the License at
+compliance with the License. You may obtain a copy of the License at
 http://www.mozilla.org/MPL/

 Software distributed under the License is distributed on an "AS IS"
@ -18,15 +18,22 @@ James Clark. All Rights Reserved.
 Contributor(s):
 */

-#include "xmldef.h"
-#include "hashtable.h"
 #include <stdlib.h>
 #include <string.h>

+#include "xmldef.h"
+#include "hashtable.h"
+
+#ifdef XML_UNICODE
+#define keycmp wcscmp
+#else
+#define keycmp strcmp
+#endif
+
 #define INIT_SIZE 64

 static
-unsigned long hash(const char *s)
+unsigned long hash(KEY s)
 {
  unsigned long h = 0;
  while (*s)
@ -34,7 +41,7 @@ unsigned long hash(const char *s)
  return h;
 }

-NAMED *lookup(HASH_TABLE *table, const char *name, size_t createSize)
+NAMED *lookup(HASH_TABLE *table, KEY name, size_t createSize)
 {
  size_t i;
  if (table->size == 0) {
@ -52,7 +59,7 @@ NAMED *lookup(HASH_TABLE *table, const char *name, size_t createSize)
    for (i = h & (table->size - 1);
         table->v[i];
         i == 0 ? i = table->size - 1 : --i) {
-      if (strcmp(name, table->v[i]->name) == 0)
+      if (keycmp(name, table->v[i]->name) == 0)
 	return table->v[i];
    }
    if (!createSize)
--- a/modules/xml/expat/xmlparse/hashtable.h
+++ b/modules/xml/expat/xmlparse/hashtable.h
@ -21,8 +21,14 @@ Contributor(s):

 #include <stddef.h>

+#ifdef XML_UNICODE
+typedef const wchar_t *KEY;
+#else
+typedef const char *KEY;
+#endif
+
 typedef struct {
-  const char *name;
+  KEY name;
 } NAMED;

 typedef struct {
@ -32,7 +38,7 @@ typedef struct {
  size_t usedLim;
 } HASH_TABLE;

-NAMED *lookup(HASH_TABLE *table, const char *name, size_t createSize);
+NAMED *lookup(HASH_TABLE *table, KEY name, size_t createSize);
 void hashTableInit(HASH_TABLE *);
 void hashTableDestroy(HASH_TABLE *);

--- a/modules/xml/expat/xmlparse/xmlparse.c
+++ b/modules/xml/expat/xmlparse/xmlparse.c
--- a/modules/xml/expat/xmlparse/xmlparse.h
+++ b/modules/xml/expat/xmlparse/xmlparse.h
@ -31,32 +31,199 @@ extern "C" {

 typedef void *XML_Parser;

-/* Constructs a new parser; encoding should be the name of the charset from
-the Content-Type header if the Content-Type is text/xml, or null otherwise. */
+#ifdef XML_UNICODE_WCHAR_T

-XML_Parser XMLPARSEAPI
-XML_ParserCreate(const char *encoding);
+/* XML_UNICODE_WCHAR_T will work only if sizeof(wchar_t) == 2 and wchar_t
+uses Unicode. */
+/* Information is UTF-16 encoded as wchar_ts */
+
+#ifndef XML_UNICODE
+#define XML_UNICODE
+#endif
+
+#include <stddef.h>
+typedef wchar_t XML_Char;
+typedef wchar_t XML_LChar;
+
+#else /* not XML_UNICODE_WCHAR_T */
+
+#ifdef XML_UNICODE
+
+/* Information is UTF-16 encoded as unsigned shorts */
+typedef unsigned short XML_Char;
+typedef char XML_LChar;
+
+#else /* not XML_UNICODE */

 /* Information is UTF-8 encoded. */
+typedef char XML_Char;
+typedef char XML_LChar;

-/* atts is array of name/value pairs, terminated by NULL;
-   names and values are '\0' terminated. */
+#endif /* not XML_UNICODE */
+
+#endif /* not XML_UNICODE_WCHAR_T */
+
+
+/* Constructs a new parser; encoding is the encoding specified by the external
+protocol or null if there is none specified. */
+
+XML_Parser XMLPARSEAPI
+XML_ParserCreate(const XML_Char *encoding);
+
+
+/* atts is array of name/value pairs, terminated by 0;
+   names and values are 0 terminated. */

 typedef void (*XML_StartElementHandler)(void *userData,
-					const char *name,
-					const char **atts);
+					const XML_Char *name,
+					const XML_Char **atts);

 typedef void (*XML_EndElementHandler)(void *userData,
-				      const char *name);
+				      const XML_Char *name);

+/* s is not 0 terminated. */
 typedef void (*XML_CharacterDataHandler)(void *userData,
-					 const char *s,
+					 const XML_Char *s,
 					 int len);

-/* target and data are '\0' terminated */
+/* target and data are 0 terminated */
 typedef void (*XML_ProcessingInstructionHandler)(void *userData,
-						 const char *target,
-						 const char *data);
+						 const XML_Char *target,
+						 const XML_Char *data);
+
+/* This is called for any characters in the XML document for
+which there is no applicable handler.  This includes both
+characters that are part of markup which is of a kind that is
+not reported (comments, markup declarations), or characters
+that are part of a construct which could be reported but
+for which no handler has been supplied. The characters are passed
+exactly as they were in the XML document except that
+they will be encoded in UTF-8.  Line boundaries are not normalized.
+Note that a byte order mark character is not passed to the default handler.
+If a default handler is set, internal entity references
+are not expanded. There are no guarantees about
+how characters are divided between calls to the default handler:
+for example, a comment might be split between multiple calls. */
+
+typedef void (*XML_DefaultHandler)(void *userData,
+				   const XML_Char *s,
+				   int len);
+
+/* This is called for a declaration of an unparsed (NDATA)
+entity.  The base argument is whatever was set by XML_SetBase.
+The entityName, systemId and notationName arguments will never be null.
+The other arguments may be. */
+
+typedef void (*XML_UnparsedEntityDeclHandler)(void *userData,
+					      const XML_Char *entityName,
+					      const XML_Char *base,
+					      const XML_Char *systemId,
+					      const XML_Char *publicId,
+					      const XML_Char *notationName);
+
+/* This is called for a declaration of notation.
+The base argument is whatever was set by XML_SetBase.
+The notationName will never be null.  The other arguments can be. */
+
+typedef void (*XML_NotationDeclHandler)(void *userData,
+					const XML_Char *notationName,
+					const XML_Char *base,
+					const XML_Char *systemId,
+					const XML_Char *publicId);
+
+/* This is called for a reference to an external parsed general entity.
+The referenced entity is not automatically parsed.
+The application can parse it immediately or later using
+XML_ExternalEntityParserCreate.
+The parser argument is the parser parsing the entity containing the reference;
+it can be passed as the parser argument to XML_ExternalEntityParserCreate.
+The systemId argument is the system identifier as specified in the entity declaration;
+it will not be null.
+The base argument is the system identifier that should be used as the base for
+resolving systemId if systemId was relative; this is set by XML_SetBase;
+it may be null.
+The publicId argument is the public identifier as specified in the entity declaration,
+or null if none was specified; the whitespace in the public identifier
+will have been normalized as required by the XML spec.
+The openEntityNames argument is a space-separated list of the names of the entities
+that are open for the parse of this entity (including the name of the referenced
+entity); this can be passed as the openEntityNames argument to
+XML_ExternalEntityParserCreate; openEntityNames is valid only until the handler
+returns, so if the referenced entity is to be parsed later, it must be copied.
+The handler should return 0 if processing should not continue because of
+a fatal error in the handling of the external entity.
+In this case the calling parser will return an XML_ERROR_EXTERNAL_ENTITY_HANDLING
+error.
+Note that unlike other handlers the first argument is the parser, not userData. */
+
+typedef int (*XML_ExternalEntityRefHandler)(XML_Parser parser,
+					    const XML_Char *openEntityNames,
+					    const XML_Char *base,
+					    const XML_Char *systemId,
+					    const XML_Char *publicId);
+
+/* This structure is filled in by the XML_UnknownEncodingHandler
+to provide information to the parser about encodings that are unknown
+to the parser.
+The map[b] member gives information about byte sequences
+whose first byte is b.
+If map[b] is c where c is >= 0, then b by itself encodes the Unicode scalar value c.
+If map[b] is -1, then the byte sequence is malformed.
+If map[b] is -n, where n >= 2, then b is the first byte of an n-byte
+sequence that encodes a single Unicode scalar value.
+The data member will be passed as the first argument to the convert function.
+The convert function is used to convert multibyte sequences;
+s will point to a n-byte sequence where map[(unsigned char)*s] == -n.
+The convert function must return the Unicode scalar value
+represented by this byte sequence or -1 if the byte sequence is malformed.
+The convert function may be null if the encoding is a single-byte encoding,
+that is if map[b] >= -1 for all bytes b.
+When the parser is finished with the encoding, then if release is not null,
+it will call release passing it the data member;
+once release has been called, the convert function will not be called again.
+
+Expat places certain restrictions on the encodings that are supported
+using this mechanism.
+
+1. Every ASCII character that can appear in a well-formed XML document,
+other than the characters
+
+  $@\^`{}~
+
+must be represented by a single byte, and that byte must be the
+same byte that represents that character in ASCII.
+
+2. No character may require more than 4 bytes to encode.
+
+3. All characters encoded must have Unicode scalar values <= 0xFFFF,
+(ie characters that would be encoded by surrogates in UTF-16
+are not allowed).  Note that this restriction doesn't apply to
+the built-in support for UTF-8 and UTF-16.
+
+4. No Unicode character may be encoded by more than one distinct sequence
+of bytes. */
+
+typedef struct {
+  /* Indexed by first byte b: a value >= 0 is the Unicode scalar value that
+     b encodes by itself, -1 marks a malformed byte, and -n (n >= 2) marks
+     the first byte of an n-byte sequence. */
+  int map[256];
+  /* Passed as the first argument to convert and to release. */
+  void *data;
+  /* Decodes the multibyte sequence starting at s; returns its Unicode scalar
+     value, or -1 if the sequence is malformed.  May be null when the
+     encoding is single-byte. */
+  int (*convert)(void *data, const char *s);
+  /* If not null, called with data once the parser is finished with the
+     encoding. */
+  void (*release)(void *data);
+} XML_Encoding;
+
+/* This is called for an encoding that is unknown to the parser.
+The encodingHandlerData argument is that which was passed as the
+second argument to XML_SetUnknownEncodingHandler.
+The name argument gives the name of the encoding as specified in
+the encoding declaration.
+If the callback can provide information about the encoding,
+it must fill in the XML_Encoding structure, and return 1.
+Otherwise it must return 0.
+If info does not describe a suitable encoding,
+then the parser will return an XML_ERROR_UNKNOWN_ENCODING error. */
+
+typedef int (*XML_UnknownEncodingHandler)(void *encodingHandlerData,
+					  const XML_Char *name,
+					  XML_Encoding *info);

 void XMLPARSEAPI
 XML_SetElementHandler(XML_Parser parser,
@ -71,10 +238,62 @@ void XMLPARSEAPI
 XML_SetProcessingInstructionHandler(XML_Parser parser,
 				    XML_ProcessingInstructionHandler handler);

+void XMLPARSEAPI
+XML_SetDefaultHandler(XML_Parser parser,
+		      XML_DefaultHandler handler);
+
+void XMLPARSEAPI
+XML_SetUnparsedEntityDeclHandler(XML_Parser parser,
+				 XML_UnparsedEntityDeclHandler handler);
+
+void XMLPARSEAPI
+XML_SetNotationDeclHandler(XML_Parser parser,
+			   XML_NotationDeclHandler handler);
+
+void XMLPARSEAPI
+XML_SetExternalEntityRefHandler(XML_Parser parser,
+				XML_ExternalEntityRefHandler handler);
+
+void XMLPARSEAPI
+XML_SetUnknownEncodingHandler(XML_Parser parser,
+			      XML_UnknownEncodingHandler handler,
+			      void *encodingHandlerData);
+
+/* This can be called within a handler for a start element, end element,
+processing instruction or character data.  It causes the corresponding
+markup to be passed to the default handler.
+Within the expansion of an internal entity, nothing will be passed
+to the default handler, although this usually will not happen since
+setting a default handler inhibits expansion of internal entities. */
+void XMLPARSEAPI XML_DefaultCurrent(XML_Parser parser);
+
 /* This value is passed as the userData argument to callbacks. */
 void XMLPARSEAPI
 XML_SetUserData(XML_Parser parser, void *userData);

+/* Returns the last value set by XML_SetUserData or null. */
+#define XML_GetUserData(parser) (*(void **)(parser))
+
+/* If this function is called, then the parser will be passed
+as the first argument to callbacks instead of userData.
+The userData will still be accessible using XML_GetUserData. */
+
+void XMLPARSEAPI
+XML_UseParserAsHandlerArg(XML_Parser parser);
+
+/* Sets the base to be used for resolving relative URIs in system identifiers in
+declarations.  Resolving relative identifiers is left to the application:
+this value will be passed through as the base argument to the
+XML_ExternalEntityRefHandler, XML_NotationDeclHandler
+and XML_UnparsedEntityDeclHandler. The base argument will be copied.
+Returns zero if out of memory, non-zero otherwise. */
+
+int XMLPARSEAPI
+XML_SetBase(XML_Parser parser, const XML_Char *base);
+
+const XML_Char XMLPARSEAPI *
+XML_GetBase(XML_Parser parser);
+
 /* Parses some input. Returns 0 if a fatal error is detected.
 The last call to XML_Parse must have isFinal true;
 len may be zero for this call (or any other). */
@ -87,8 +306,20 @@ XML_GetBuffer(XML_Parser parser, int len);
 int XMLPARSEAPI
 XML_ParseBuffer(XML_Parser parser, int len, int isFinal);

-/* If XML_Parser or XML_ParseEnd have returned 0, then XML_GetError*
-returns information about the error. */
+/* Creates an XML_Parser object that can parse an external general entity;
+openEntityNames is a space-separated list of the names of the entities that are open
+for the parse of this entity (including the name of this one);
+encoding is the externally specified encoding,
+or null if there is no externally specified encoding.
+This can be called at any point after the first call to an ExternalEntityRefHandler
+so long as the parser has not yet been freed.
+The new parser is completely independent and may safely be used in a separate thread.
+The handlers and userData are initialized from the parser argument.
+Returns 0 if out of memory.  Otherwise returns a new XML_Parser object. */
+XML_Parser XMLPARSEAPI
+XML_ExternalEntityParserCreate(XML_Parser parser,
+			       const XML_Char *openEntityNames,
+			       const XML_Char *encoding);

 enum XML_Error {
  XML_ERROR_NONE,
@ -110,19 +341,39 @@ enum XML_Error {
  XML_ERROR_ATTRIBUTE_EXTERNAL_ENTITY_REF,
  XML_ERROR_MISPLACED_XML_PI,
  XML_ERROR_UNKNOWN_ENCODING,
-  XML_ERROR_INCORRECT_ENCODING
+  XML_ERROR_INCORRECT_ENCODING,
+  XML_ERROR_UNCLOSED_CDATA_SECTION,
+  XML_ERROR_EXTERNAL_ENTITY_HANDLING
 };

-int XMLPARSEAPI XML_GetErrorCode(XML_Parser parser);
-int XMLPARSEAPI XML_GetErrorLineNumber(XML_Parser parser);
-int XMLPARSEAPI XML_GetErrorColumnNumber(XML_Parser parser);
-long XMLPARSEAPI XML_GetErrorByteIndex(XML_Parser parser);
+/* If XML_Parse or XML_ParseBuffer have returned 0, then XML_GetErrorCode
+returns information about the error. */

+enum XML_Error XMLPARSEAPI XML_GetErrorCode(XML_Parser parser);
+
+/* These functions return information about the current parse location.
+They may be called when XML_Parse or XML_ParseBuffer return 0;
+in this case the location is the location of the character at which
+the error was detected.
+They may also be called from any other callback called to report
+some parse event; in this case the location is the location of the first
+of the sequence of characters that generated the event. */
+
+int XMLPARSEAPI XML_GetCurrentLineNumber(XML_Parser parser);
+int XMLPARSEAPI XML_GetCurrentColumnNumber(XML_Parser parser);
+long XMLPARSEAPI XML_GetCurrentByteIndex(XML_Parser parser);
+
+/* For backwards compatibility with previous versions. */
+#define XML_GetErrorLineNumber XML_GetCurrentLineNumber
+#define XML_GetErrorColumnNumber XML_GetCurrentColumnNumber
+#define XML_GetErrorByteIndex XML_GetCurrentByteIndex
+
+/* Frees memory used by the parser. */
 void XMLPARSEAPI
 XML_ParserFree(XML_Parser parser);

-const char XMLPARSEAPI *
-XML_ErrorString(int code);
+/* Returns a string describing the error. */
+const XML_LChar XMLPARSEAPI *XML_ErrorString(int code);

 #ifdef __cplusplus
 }
--- a/modules/xml/expat/xmltok/xmlrole.c
+++ b/modules/xml/expat/xmltok/xmlrole.c
@ -594,7 +594,7 @@ int notation4(PROLOG_STATE *state,
    return XML_ROLE_NOTATION_SYSTEM_ID;
  case XML_TOK_DECL_CLOSE:
    state->handler = internalSubset;
-    return XML_ROLE_NONE;
+    return XML_ROLE_NOTATION_NO_SYSTEM_ID;
  }
  return syntaxError(state);
 }
--- a/modules/xml/expat/xmltok/xmlrole.h
+++ b/modules/xml/expat/xmltok/xmlrole.h
@ -44,6 +44,7 @@ enum {
  XML_ROLE_ENTITY_NOTATION_NAME,
  XML_ROLE_NOTATION_NAME,
  XML_ROLE_NOTATION_SYSTEM_ID,
+  XML_ROLE_NOTATION_NO_SYSTEM_ID,
  XML_ROLE_NOTATION_PUBLIC_ID,
  XML_ROLE_ATTRIBUTE_NAME,
  XML_ROLE_ATTRIBUTE_TYPE_CDATA,
--- a/modules/xml/expat/xmltok/xmltok.c
+++ b/modules/xml/expat/xmltok/xmltok.c
@ -23,7 +23,7 @@ Contributor(s):
 #include "nametab.h"

 #define VTABLE1 \
-  { PREFIX(prologTok), PREFIX(contentTok) }, \
+  { PREFIX(prologTok), PREFIX(contentTok), PREFIX(cdataSectionTok) }, \
  { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
  PREFIX(sameName), \
  PREFIX(nameMatchesAscii), \
@ -31,14 +31,11 @@ Contributor(s):
  PREFIX(skipS), \
  PREFIX(getAtts), \
  PREFIX(charRefNumber), \
+  PREFIX(predefinedEntityName), \
  PREFIX(updatePosition), \
  PREFIX(isPublicId)

-#define VTABLE2 \
-  PREFIX(encode), \
-  { PREFIX(toUtf8) }
-
-#define VTABLE VTABLE1, VTABLE2
+#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)

 #define UCS2_GET_NAMING(pages, hi, lo) \
   (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F)))
@ -81,11 +78,79 @@ We need 8 bits to index into pages, 3 bits to add to that index and

 #define UTF8_INVALID4(p) ((*p) == 0xF4 && ((p)[1] & 0x30) != 0)

+/* Predicate that ignores both arguments and always returns 0.  It is
+   aliased (via #define) to the utf8_ slots that never apply:
+   utf8_isName4, utf8_isNmstrt4 and utf8_isInvalid2. */
+static
+int isNever(const ENCODING *enc, const char *p)
+{
+  return 0;
+}
+
+/* isName2 slot for UTF-8: returns the result of UTF8_GET_NAMING2 applied
+   to the namePages table and the bytes at p (viewed as unsigned char).
+   enc is unused.  Presumably answers "is this 2-byte sequence a name
+   character?" -- the macro itself is defined elsewhere. */
+static
+int utf8_isName2(const ENCODING *enc, const char *p)
+{
+  return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
+}
+
+/* isName3 slot for UTF-8: returns the result of UTF8_GET_NAMING3 applied
+   to the namePages table and the bytes at p (viewed as unsigned char).
+   enc is unused.  Presumably the 3-byte counterpart of utf8_isName2. */
+static
+int utf8_isName3(const ENCODING *enc, const char *p)
+{
+  return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
+}
+
+#define utf8_isName4 isNever
+
+/* isNmstrt2 slot for UTF-8: returns the result of UTF8_GET_NAMING2 applied
+   to the nmstrtPages table and the bytes at p (viewed as unsigned char).
+   enc is unused.  Presumably answers "may this 2-byte sequence start a
+   name?" -- the macro itself is defined elsewhere. */
+static
+int utf8_isNmstrt2(const ENCODING *enc, const char *p)
+{
+  return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
+}
+
+/* isNmstrt3 slot for UTF-8: returns the result of UTF8_GET_NAMING3 applied
+   to the nmstrtPages table and the bytes at p (viewed as unsigned char).
+   enc is unused.  Presumably the 3-byte counterpart of utf8_isNmstrt2. */
+static
+int utf8_isNmstrt3(const ENCODING *enc, const char *p)
+{
+  return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
+}
+
+#define utf8_isNmstrt4 isNever
+
+#define utf8_isInvalid2 isNever
+
+/* isInvalid3 slot for UTF-8: returns the result of UTF8_INVALID3 on the
+   bytes at p (viewed as unsigned char).  enc is unused.  Presumably
+   non-zero for a 3-byte sequence that must be rejected -- the macro is
+   defined elsewhere. */
+static
+int utf8_isInvalid3(const ENCODING *enc, const char *p)
+{
+  return UTF8_INVALID3((const unsigned char *)p);
+}
+
+/* isInvalid4 slot for UTF-8: returns the result of UTF8_INVALID4 on the
+   bytes at p (viewed as unsigned char), i.e. non-zero when the lead byte
+   is 0xF4 and the second byte has either of the bits 0x30 set.  enc is
+   unused. */
+static
+int utf8_isInvalid4(const ENCODING *enc, const char *p)
+{
+  return UTF8_INVALID4((const unsigned char *)p);
+}
+
+/* An ENCODING extended with a per-byte classification table and
+   per-encoding predicates for multibyte characters.  Code receives a
+   const ENCODING * and casts it back to this struct, so enc must remain
+   the first member. */
 struct normal_encoding {
   ENCODING enc;
+  /* Byte type, indexed by the byte value as unsigned char (see BYTE_TYPE). */
   unsigned char type[256];
+  /* Called through IS_NAME_CHAR, IS_NMSTRT_CHAR and IS_INVALID_CHAR, which
+     paste the sequence length n (2, 3 or 4) onto the member name. */
+  int (*isName2)(const ENCODING *, const char *);
+  int (*isName3)(const ENCODING *, const char *);
+  int (*isName4)(const ENCODING *, const char *);
+  int (*isNmstrt2)(const ENCODING *, const char *);
+  int (*isNmstrt3)(const ENCODING *, const char *);
+  int (*isNmstrt4)(const ENCODING *, const char *);
+  int (*isInvalid2)(const ENCODING *, const char *);
+  int (*isInvalid3)(const ENCODING *, const char *);
+  int (*isInvalid4)(const ENCODING *, const char *);
 };

+/* Expands to the nine predicate names formed by prefixing E, listed in the
+   same order as the function-pointer members of struct normal_encoding.
+   Intended as the trailing initializers of such a struct, e.g.
+   NORMAL_VTABLE(utf8_). */
+#define NORMAL_VTABLE(E) \
+ E ## isName2, \
+ E ## isName3, \
+ E ## isName4, \
+ E ## isNmstrt2, \
+ E ## isNmstrt3, \
+ E ## isNmstrt4, \
+ E ## isInvalid2, \
+ E ## isInvalid3, \
+ E ## isInvalid4
+ 
 static int checkCharRefNumber(int);

 #include "xmltok_impl.h"
@ -95,12 +160,16 @@ static int checkCharRefNumber(int);
 #define BYTE_TYPE(enc, p) \
  (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
 #define BYTE_TO_ASCII(enc, p) (*p)
-#define IS_NAME_CHAR(enc, p, n) UTF8_GET_NAMING(namePages, p, n)
-#define IS_NMSTRT_CHAR(enc, p, n) UTF8_GET_NAMING(nmstrtPages, p, n)
+
+#define IS_NAME_CHAR(enc, p, n) \
+ (((const struct normal_encoding *)(enc))->isName ## n(enc, p))
+#define IS_NMSTRT_CHAR(enc, p, n) \
+ (((const struct normal_encoding *)(enc))->isNmstrt ## n(enc, p))
 #define IS_INVALID_CHAR(enc, p, n) \
-((n) == 3 \
-  ? UTF8_INVALID3((const unsigned char *)(p)) \
-  : ((n) == 4 ? UTF8_INVALID4((const unsigned char *)(p)) : 0))
+ (((const struct normal_encoding *)(enc))->isInvalid ## n(enc, p))
+
+#define IS_NAME_CHAR_MINBPC(enc, p) (0)
+#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)

 /* c is an ASCII character */
 #define CHAR_MATCHES(enc, p, c) (*(p) == c)
@ -113,51 +182,18 @@ static int checkCharRefNumber(int);
 #undef BYTE_TO_ASCII
 #undef CHAR_MATCHES
 #undef IS_NAME_CHAR
+#undef IS_NAME_CHAR_MINBPC
 #undef IS_NMSTRT_CHAR
+#undef IS_NMSTRT_CHAR_MINBPC
 #undef IS_INVALID_CHAR

-enum {
-  /* cvalN is value of masked first byte of N byte sequence */
-  cval1 = 0x00,
-  cval2 = 0xc0,
-  cval3 = 0xe0,
-  cval4 = 0xf0,
-  /* minN is minimum legal resulting value for N byte sequence */
-  min2 = 0x80,
-  min3 = 0x800,
-  min4 = 0x10000
+enum {  /* UTF8_cvalN is value of masked first byte of N byte sequence */
+  UTF8_cval1 = 0x00,
+  UTF8_cval2 = 0xc0,
+  UTF8_cval3 = 0xe0,
+  UTF8_cval4 = 0xf0
 };

-static
-int utf8_encode(const ENCODING *enc, int c, char *buf)
-{
-  if (c < 0)
-    return 0;
-  if (c < min2) {
-    buf[0] = (c | cval1);
-    return 1;
-  }
-  if (c < min3) {
-    buf[0] = ((c >> 6) | cval2);
-    buf[1] = ((c & 0x3f) | 0x80);
-    return 2;
-  }
-  if (c < min4) {
-    buf[0] = ((c >> 12) | cval3);
-    buf[1] = (((c >> 6) & 0x3f) | 0x80);
-    buf[2] = ((c & 0x3f) | 0x80);
-    return 3;
-  }
-  if (c < 0x110000) {
-    buf[0] = ((c >> 18) | cval4);
-    buf[1] = (((c >> 12) & 0x3f) | 0x80);
-    buf[2] = (((c >> 6) & 0x3f) | 0x80);
-    buf[3] = ((c & 0x3f) | 0x80);
-    return 4;
-  }
-  return 0;
-}
-
 static
 void utf8_toUtf8(const ENCODING *enc,
 		 const char **fromP, const char *fromLim,
@ -177,34 +213,63 @@ void utf8_toUtf8(const ENCODING *enc,
  *toP = to;
 }

+/* Converts UTF-8 input to UTF-16 code units.
+   Reads bytes starting at *fromP (up to fromLim) and stores 16-bit units
+   starting at *toP (up to toLim), stopping when either the input or the
+   output is exhausted.  On return *fromP and *toP point just past the last
+   byte consumed and the last unit written.
+   A 4-byte sequence produces two units (a surrogate pair); when only one
+   unit of room is left the sequence is not consumed, so a pair is never
+   split across calls.
+   NOTE(review): nothing here checks that the trail bytes of a multibyte
+   sequence lie before fromLim; presumably the caller only passes complete,
+   already-validated sequences -- confirm. */
+static
+void utf8_toUtf16(const ENCODING *enc,
+		  const char **fromP, const char *fromLim,
+		  unsigned short **toP, const unsigned short *toLim)
+{
+  unsigned short *to = *toP;
+  const char *from = *fromP;
+  while (from != fromLim && to != toLim) {
+    switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
+    case BT_LEAD2:
+      *to++ = ((from[0] & 0x1f) << 6) | (from[1] & 0x3f);
+      from += 2;
+      break;
+    case BT_LEAD3:
+      *to++ = ((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f);
+      from += 3;
+      break;
+    case BT_LEAD4:
+      {
+	unsigned long n;
+	/* No room for both halves of the surrogate pair.  A plain "break"
+	   would only leave the switch: the loop condition would still hold
+	   with from/to unchanged and the function would never return, so
+	   leave the loop itself. */
+	if (to + 1 == toLim)
+	  goto after;
+	n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
+	n -= 0x10000;
+	to[0] = (unsigned short)((n >> 10) | 0xD800);
+	to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
+	to += 2;
+	from += 4;
+      }
+      break;
+    default:
+      *to++ = *from++;
+      break;
+    }
+  }
+after:
+  *fromP = from;
+  *toP = to;
+}
+
 static const struct normal_encoding utf8_encoding = {
-  { VTABLE1, utf8_encode, { utf8_toUtf8 }, 1 },
+  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
 #include "asciitab.h"
 #include "utf8tab.h"
-  }
+  },
+  NORMAL_VTABLE(utf8_)
 };

 static const struct normal_encoding internal_utf8_encoding = {
-  { VTABLE1, utf8_encode, { utf8_toUtf8 }, 1 },
+  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
  {
 #include "iasciitab.h"
 #include "utf8tab.h"
-  }
+  },
+  NORMAL_VTABLE(utf8_)
 };

-static
-int latin1_encode(const ENCODING *enc, int c, char *buf)
-{
-  if (c < 0)
-    return 0;
-  if (c <= 0xFF) {
-    buf[0] = (char)c;
-    return 1;
-  }
-  return 0;
-}
-
 static
 void latin1_toUtf8(const ENCODING *enc,
 		   const char **fromP, const char *fromLim,
@ -218,7 +283,7 @@ void latin1_toUtf8(const ENCODING *enc,
    if (c & 0x80) {
      if (toLim - *toP < 2)
 	break;
-      *(*toP)++ = ((c >> 6) | cval2);
+      *(*toP)++ = ((c >> 6) | UTF8_cval2);
      *(*toP)++ = ((c & 0x3f) | 0x80);
      (*fromP)++;
    }
@ -230,15 +295,39 @@ void latin1_toUtf8(const ENCODING *enc,
  }
 }

+/* Copies single-byte characters into 16-bit units, one unit per byte.
+   Each byte is read through unsigned char so values 0x80-0xFF are not
+   sign-extended.  *fromP and *toP are advanced in place; copying stops
+   when either the input (fromLim) or the output (toLim) is exhausted.
+   enc is unused.  Also installed as the toUtf16 converter of
+   ascii_encoding. */
+static
+void latin1_toUtf16(const ENCODING *enc,
+		    const char **fromP, const char *fromLim,
+		    unsigned short **toP, const unsigned short *toLim)
+{
+  while (*fromP != fromLim && *toP != toLim)
+    *(*toP)++ = (unsigned char)*(*fromP)++;
+}
+
 static const struct normal_encoding latin1_encoding = {
-  { VTABLE1, latin1_encode, { latin1_toUtf8 }, 1 },
+  { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
  {
 #include "asciitab.h"
 #include "latin1tab.h"
  }
 };

-#define latin1tab (latin1_encoding.type)
+/* Copies bytes unchanged from the input to the output.  *fromP and *toP
+   are advanced in place; copying stops when either the input (fromLim) or
+   the output (toLim) is exhausted.  enc is unused. */
+static
+void ascii_toUtf8(const ENCODING *enc,
+		  const char **fromP, const char *fromLim,
+		  char **toP, const char *toLim)
+{
+  while (*fromP != fromLim && *toP != toLim)
+    *(*toP)++ = *(*fromP)++;
+}
+
+/* Encoding selected for the name "US-ASCII".  Bytes are copied as-is to
+   UTF-8 (ascii_toUtf8) and widened to UTF-16 with latin1_toUtf16.  Only
+   asciitab.h contributes to the type table; the entries it does not
+   supply are zero-initialized, which the comment below notes is
+   BT_NONXML.  No NORMAL_VTABLE is given, so the multibyte predicate
+   pointers are null. */
+static const struct normal_encoding ascii_encoding = {
+  { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
+  {
+#include "asciitab.h"
+/* BT_NONXML == 0 */
+  }
+};

 #undef PREFIX

@ -260,25 +349,6 @@ static int unicode_byte_type(char hi, char lo)
  return BT_NONASCII;
 }

-#define DEFINE_UTF16_ENCODE \
-static \
-int PREFIX(encode)(const ENCODING *enc, int charNum, char *buf) \
-{ \
-  if (charNum < 0) \
-    return 0; \
-  if (charNum < 0x10000) { \
-    SET2(buf, charNum); \
-    return 2; \
-  } \
-  if (charNum < 0x110000) { \
-    charNum -= 0x10000; \
-    SET2(buf, (charNum >> 10) + 0xD800); \
-    SET2(buf + 2, (charNum & 0x3FF) + 0xDC00); \
-    return 4; \
-  } \
-  return 0; \
-}
-
 #define DEFINE_UTF16_TO_UTF8 \
 static \
 void PREFIX(toUtf8)(const ENCODING *enc, \
@ -308,7 +378,7 @@ void PREFIX(toUtf8)(const ENCODING *enc, \
        *fromP = from; \
 	return; \
      } \
-      *(*toP)++ = ((lo >> 6) | (hi << 2) |  cval2); \
+      *(*toP)++ = ((lo >> 6) | (hi << 2) |  UTF8_cval2); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    default: \
@ -317,7 +387,7 @@ void PREFIX(toUtf8)(const ENCODING *enc, \
 	return; \
      } \
      /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
-      *(*toP)++ = ((hi >> 4) | cval3); \
+      *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
      *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
@ -327,7 +397,7 @@ void PREFIX(toUtf8)(const ENCODING *enc, \
 	return; \
      } \
      plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
-      *(*toP)++ = ((plane >> 2) | cval4); \
+      *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
      *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
      from += 2; \
      lo2 = GET_LO(from); \
@ -342,15 +412,33 @@ void PREFIX(toUtf8)(const ENCODING *enc, \
  *fromP = from; \
 }

+/* Defines PREFIX(toUtf16), which copies 2-byte characters from the input
+   into 16-bit output units.  Each unit is assembled from GET_HI and GET_LO,
+   so the expansion depends on how those macros (and PREFIX) are defined at
+   the point of use.  *fromP and *toP are advanced in place; copying stops
+   when either the input or the output is exhausted.
+   If the input is longer than the output has room for and the high byte of
+   the last input character is in 0xD8-0xDF (the surrogate range), fromLim
+   is first pulled back by one character. */
+#define DEFINE_UTF16_TO_UTF16 \
+static \
+void PREFIX(toUtf16)(const ENCODING *enc, \
+		     const char **fromP, const char *fromLim, \
+		     unsigned short **toP, const unsigned short *toLim) \
+{ \
+  /* Avoid copying first half only of surrogate */ \
+  if (fromLim - *fromP > ((toLim - *toP) << 1) \
+      && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
+    fromLim -= 2; \
+  for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
+    *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
+}
+
 #define PREFIX(ident) little2_ ## ident
 #define MINBPC 2
 #define BYTE_TYPE(enc, p) \
- ((p)[1] == 0 ? latin1tab[(unsigned char)*(p)] : unicode_byte_type((p)[1], (p)[0]))
+ ((p)[1] == 0 \
+  ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
+  : unicode_byte_type((p)[1], (p)[0]))
 #define BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
 #define CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
-#define IS_NAME_CHAR(enc, p, n) \
+#define IS_NAME_CHAR(enc, p, n) (0)
+#define IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
-#define IS_NMSTRT_CHAR(enc, p, n) \
+#define IS_NMSTRT_CHAR(enc, p, n) (0)
+#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])

 #include "xmltok_impl.c"
@ -360,8 +448,8 @@ void PREFIX(toUtf8)(const ENCODING *enc, \
 #define GET_LO(ptr) ((unsigned char)(ptr)[0])
 #define GET_HI(ptr) ((unsigned char)(ptr)[1])

-DEFINE_UTF16_ENCODE
 DEFINE_UTF16_TO_UTF8
+DEFINE_UTF16_TO_UTF16

 #undef SET2
 #undef GET_LO
@ -371,10 +459,32 @@ DEFINE_UTF16_TO_UTF8
 #undef BYTE_TO_ASCII
 #undef CHAR_MATCHES
 #undef IS_NAME_CHAR
+#undef IS_NAME_CHAR_MINBPC
 #undef IS_NMSTRT_CHAR
+#undef IS_NMSTRT_CHAR_MINBPC
 #undef IS_INVALID_CHAR

-static const struct encoding little2_encoding = { VTABLE, 2 };
+static const struct normal_encoding little2_encoding = { 
+  { VTABLE, 2, 0,
+#if BYTE_ORDER == 12
+    1
+#else
+    0
+#endif
+  },
+#include "asciitab.h"
+#include "latin1tab.h"
+};
+
+#if BYTE_ORDER != 21
+
+static const struct normal_encoding internal_little2_encoding = { 
+  { VTABLE, 2, 0, 1 },
+#include "iasciitab.h"
+#include "latin1tab.h"
+};
+
+#endif

 #undef PREFIX

@ -382,12 +492,16 @@ static const struct encoding little2_encoding = { VTABLE, 2 };
 #define MINBPC 2
 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
 #define BYTE_TYPE(enc, p) \
- ((p)[0] == 0 ? latin1tab[(unsigned char)(p)[1]] : unicode_byte_type((p)[0], (p)[1]))
+ ((p)[0] == 0 \
+  ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
+  : unicode_byte_type((p)[0], (p)[1]))
 #define BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
 #define CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
-#define IS_NAME_CHAR(enc, p, n) \
+#define IS_NAME_CHAR(enc, p, n) 0
+#define IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
-#define IS_NMSTRT_CHAR(enc, p, n) \
+#define IS_NMSTRT_CHAR(enc, p, n) (0)
+#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])

 #include "xmltok_impl.c"
@ -397,8 +511,8 @@ static const struct encoding little2_encoding = { VTABLE, 2 };
 #define GET_LO(ptr) ((unsigned char)(ptr)[1])
 #define GET_HI(ptr) ((unsigned char)(ptr)[0])

-DEFINE_UTF16_ENCODE
 DEFINE_UTF16_TO_UTF8
+DEFINE_UTF16_TO_UTF16

 #undef SET2
 #undef GET_LO
@ -408,10 +522,32 @@ DEFINE_UTF16_TO_UTF8
 #undef BYTE_TO_ASCII
 #undef CHAR_MATCHES
 #undef IS_NAME_CHAR
+#undef IS_NAME_CHAR_MINBPC
 #undef IS_NMSTRT_CHAR
+#undef IS_NMSTRT_CHAR_MINBPC
 #undef IS_INVALID_CHAR

-static const struct encoding big2_encoding = { VTABLE, 2 };
+static const struct normal_encoding big2_encoding = {
+  { VTABLE, 2, 0,
+#if BYTE_ORDER == 21
+  1
+#else
+  0
+#endif
+  },
+#include "asciitab.h"
+#include "latin1tab.h"
+};
+
+#if BYTE_ORDER != 12
+
+static const struct normal_encoding internal_big2_encoding = {
+  { VTABLE, 2, 0, 1 },
+#include "iasciitab.h"
+#include "latin1tab.h"
+};
+
+#endif

 #undef PREFIX

@ -454,18 +590,18 @@ int initScan(const ENCODING *enc, int state, const char *ptr, const char *end,
  else {
    switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
    case 0x003C:
-      *encPtr = &big2_encoding;
+      *encPtr = &big2_encoding.enc;
      return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
    case 0xFEFF:
      *nextTokPtr = ptr + 2;
-      *encPtr = &big2_encoding;
+      *encPtr = &big2_encoding.enc;
      return XML_TOK_BOM;
    case 0x3C00:
-      *encPtr = &little2_encoding;
+      *encPtr = &little2_encoding.enc;
      return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
    case 0xFFFE:
      *nextTokPtr = ptr + 2;
-      *encPtr = &little2_encoding;
+      *encPtr = &little2_encoding.enc;
      return XML_TOK_BOM;
    }
  }
@ -494,13 +630,21 @@ void initUpdatePosition(const ENCODING *enc, const char *ptr,
  normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
 }

-const ENCODING *XmlGetInternalEncoding(int e)
+const ENCODING *XmlGetUtf8InternalEncoding()
 {
-  switch (e) {
-  case XML_UTF8_ENCODING:
-    return &internal_utf8_encoding.enc;
-  }
-  return 0;
+  return &internal_utf8_encoding.enc;
+}
+
+/* Returns the internal UTF-16 encoding for the host byte order.
+   When BYTE_ORDER is 12 the little-endian table is chosen at compile time,
+   when it is 21 the big-endian one; otherwise the choice is made at run
+   time by inspecting the first byte of a short holding 1. */
+const ENCODING *XmlGetUtf16InternalEncoding()
+{
+#if BYTE_ORDER == 12
+  return &internal_little2_encoding.enc;
+#elif BYTE_ORDER == 21
+  return &internal_big2_encoding.enc;
+#else
+  const short n = 1;
+  return *(const char *)&n ? &internal_little2_encoding.enc : &internal_big2_encoding.enc;
+#endif
 }

 int XmlInitEncoding(INIT_ENCODING *p, const ENCODING **encPtr, const char *name)
@ -514,6 +658,10 @@ int XmlInitEncoding(INIT_ENCODING *p, const ENCODING **encPtr, const char *name)
      *encPtr = &utf8_encoding.enc;
      return 1;
    }
+    if (streqci(name, "US-ASCII")) {
+      *encPtr = &ascii_encoding.enc;
+      return 1;
+    }
    if (!streqci(name, "UTF-16"))
      return 0;
  }
@ -531,7 +679,7 @@ int toAscii(const ENCODING *enc, const char *ptr, const char *end)
 {
  char buf[1];
  char *p = buf;
-  XmlConvert(enc, XML_UTF8_ENCODING, &ptr, end, &p, p + 1);
+  XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
  if (p == buf)
    return -1;
  else
@ -641,7 +789,7 @@ const ENCODING *findEncoding(const ENCODING *enc, const char *ptr, const char *e
  char buf[ENCODING_MAX];
  char *p = buf;
  int i;
-  XmlConvert(enc, XML_UTF8_ENCODING, &ptr, end, &p, p + ENCODING_MAX - 1);
+  XmlUtf8Convert(enc, &ptr, end, &p, p + ENCODING_MAX - 1);
  if (ptr != end)
    return 0;
  *p = 0;
@ -653,11 +801,13 @@ const ENCODING *findEncoding(const ENCODING *enc, const char *ptr, const char *e
    return &utf8_encoding.enc;
  if (streqci(buf, "ISO-8859-1"))
    return &latin1_encoding.enc;
+  if (streqci(buf, "US-ASCII"))
+    return &ascii_encoding.enc;
  if (streqci(buf, "UTF-16")) {
    static const unsigned short n = 1;
    if (enc->minBytesPerChar == 2)
      return enc;
-    return &big2_encoding;
+    return &big2_encoding.enc;
  }
  return 0;  
 }
@ -757,3 +907,229 @@ int checkCharRefNumber(int result)
  return result;
 }

+int XmlUtf8Encode(int c, char *buf)
+{
+  enum {
+    /* minN is minimum legal resulting value for N byte sequence */
+    min2 = 0x80,
+    min3 = 0x800,
+    min4 = 0x10000
+  };
+
+  if (c < 0)
+    return 0;
+  if (c < min2) {
+    buf[0] = (c | UTF8_cval1);
+    return 1;
+  }
+  if (c < min3) {
+    buf[0] = ((c >> 6) | UTF8_cval2);
+    buf[1] = ((c & 0x3f) | 0x80);
+    return 2;
+  }
+  if (c < min4) {
+    buf[0] = ((c >> 12) | UTF8_cval3);
+    buf[1] = (((c >> 6) & 0x3f) | 0x80);
+    buf[2] = ((c & 0x3f) | 0x80);
+    return 3;
+  }
+  if (c < 0x110000) {
+    buf[0] = ((c >> 18) | UTF8_cval4);
+    buf[1] = (((c >> 12) & 0x3f) | 0x80);
+    buf[2] = (((c >> 6) & 0x3f) | 0x80);
+    buf[3] = ((c & 0x3f) | 0x80);
+    return 4;
+  }
+  return 0;
+}
+
+int XmlUtf16Encode(int charNum, unsigned short *buf)
+{
+  if (charNum < 0)
+    return 0;
+  if (charNum < 0x10000) {
+    buf[0] = charNum;
+    return 1;
+  }
+  if (charNum < 0x110000) {
+    charNum -= 0x10000;
+    buf[0] = (charNum >> 10) + 0xD800;
+    buf[1] = (charNum & 0x3FF) + 0xDC00;
+    return 2;
+  }
+  return 0;
+}
+
+struct unknown_encoding {
+  struct normal_encoding normal;
+  int (*convert)(void *userData, const char *p);
+  void *userData;
+  unsigned short utf16[256];
+  char utf8[256][4];
+};
+
+int XmlSizeOfUnknownEncoding()
+{
+  return sizeof(struct unknown_encoding);
+}
+
+static
+int unknown_isName(const ENCODING *enc, const char *p)
+{
+  int c = ((const struct unknown_encoding *)enc)
+	  ->convert(((const struct unknown_encoding *)enc)->userData, p);
+  if (c & ~0xFFFF)
+    return 0;
+  return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
+}
+
+static
+int unknown_isNmstrt(const ENCODING *enc, const char *p)
+{
+  int c = ((const struct unknown_encoding *)enc)
+	  ->convert(((const struct unknown_encoding *)enc)->userData, p);
+  if (c & ~0xFFFF)
+    return 0;
+  return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
+}
+
+static
+int unknown_isInvalid(const ENCODING *enc, const char *p)
+{
+  int c = ((const struct unknown_encoding *)enc)
+	   ->convert(((const struct unknown_encoding *)enc)->userData, p);
+  return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
+}
+
+static
+void unknown_toUtf8(const ENCODING *enc,
+		    const char **fromP, const char *fromLim,
+		    char **toP, const char *toLim)
+{
+  char buf[XML_UTF8_ENCODE_MAX];
+  for (;;) {
+    const char *utf8;
+    int n;
+    if (*fromP == fromLim)
+      break;
+    utf8 = ((const struct unknown_encoding *)enc)->utf8[(unsigned char)**fromP];
+    n = *utf8++;
+    if (n == 0) {
+      int c = ((const struct unknown_encoding *)enc)
+	      ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
+      n = XmlUtf8Encode(c, buf);
+      if (n > toLim - *toP)
+	break;
+      utf8 = buf;
+      *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
+	         - (BT_LEAD2 - 2);
+    }
+    else {
+      if (n > toLim - *toP)
+	break;
+      (*fromP)++;
+    }
+    do {
+      *(*toP)++ = *utf8++;
+    } while (--n != 0);
+  }
+}
+
+static
+void unknown_toUtf16(const ENCODING *enc,
+		     const char **fromP, const char *fromLim,
+		     unsigned short **toP, const unsigned short *toLim)
+{
+  while (*fromP != fromLim && *toP != toLim) {
+    unsigned short c
+      = ((const struct unknown_encoding *)enc)->utf16[(unsigned char)**fromP];
+    if (c == 0) {
+      c = (unsigned short)((const struct unknown_encoding *)enc)
+	   ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
+      *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
+	         - (BT_LEAD2 - 2);
+    }
+    else
+      (*fromP)++;
+    *(*toP)++ = c;
+  }
+}
+
+ENCODING *
+XmlInitUnknownEncoding(void *mem,
+		       int *table,
+		       int (*convert)(void *userData, const char *p),
+		       void *userData)
+{
+  int i;
+  struct unknown_encoding *e = mem;
+  for (i = 0; i < sizeof(struct normal_encoding); i++)
+    ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
+  for (i = 0; i < 128; i++)
+    if (latin1_encoding.type[i] != BT_OTHER
+        && latin1_encoding.type[i] != BT_NONXML
+	&& table[i] != i)
+      return 0;
+  for (i = 0; i < 256; i++) {
+    int c = table[i];
+    if (c == -1) {
+      e->normal.type[i] = BT_MALFORM;
+      /* This shouldn't really get used. */
+      e->utf16[i] = 0xFFFF;
+      e->utf8[i][0] = 1;
+      e->utf8[i][1] = 0;
+    }
+    else if (c < 0) {
+      if (c < -4)
+	return 0;
+      e->normal.type[i] = BT_LEAD2 - (c + 2);
+      e->utf8[i][0] = 0;
+      e->utf16[i] = 0;
+    }
+    else if (c < 0x80) {
+      if (latin1_encoding.type[c] != BT_OTHER
+	  && latin1_encoding.type[c] != BT_NONXML
+	  && c != i)
+	return 0;
+      e->normal.type[i] = latin1_encoding.type[c];
+      e->utf8[i][0] = 1;
+      e->utf8[i][1] = (char)c;
+      e->utf16[i] = c == 0 ? 0xFFFF : c;
+    }
+    else if (checkCharRefNumber(c) < 0) {
+      e->normal.type[i] = BT_NONXML;
+      /* This shouldn't really get used. */
+      e->utf16[i] = 0xFFFF;
+      e->utf8[i][0] = 1;
+      e->utf8[i][1] = 0;
+    }
+    else {
+      if (c > 0xFFFF)
+	return 0;
+      if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
+	e->normal.type[i] = BT_NMSTRT;
+      else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
+	e->normal.type[i] = BT_NAME;
+      else
+	e->normal.type[i] = BT_OTHER;
+      e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
+      e->utf16[i] = c;
+    }
+  }
+  e->userData = userData;
+  e->convert = convert;
+  if (convert) {
+    e->normal.isName2 = unknown_isName;
+    e->normal.isName3 = unknown_isName;
+    e->normal.isName4 = unknown_isName;
+    e->normal.isNmstrt2 = unknown_isNmstrt;
+    e->normal.isNmstrt3 = unknown_isNmstrt;
+    e->normal.isNmstrt4 = unknown_isNmstrt;
+    e->normal.isInvalid2 = unknown_isInvalid;
+    e->normal.isInvalid3 = unknown_isInvalid;
+    e->normal.isInvalid4 = unknown_isInvalid;
+  }
+  e->normal.enc.utf8Convert = unknown_toUtf8;
+  e->normal.enc.utf16Convert = unknown_toUtf16;
+  return &(e->normal.enc);
+}
--- a/modules/xml/expat/xmltok/xmltok.h
+++ b/modules/xml/expat/xmltok/xmltok.h
@ -29,6 +29,9 @@ extern "C" {
 #define XMLTOKAPI /* as nothing */
 #endif

+/* The following token may be returned by XmlContentTok */
+#define XML_TOK_TRAILING_RSQB -5 /* ] or ]] at the end of the scan; might be start of
+                                    illegal ]]> sequence */
 /* The following tokens may be returned by both XmlPrologTok and XmlContentTok */
 #define XML_TOK_NONE -4    /* The string to be scanned is empty */
 #define XML_TOK_TRAILING_CR -3 /* A CR at the end of the scan;
@ -38,7 +41,7 @@ extern "C" {
 #define XML_TOK_INVALID 0

 /* The following tokens are returned by XmlContentTok; some are also
-  returned by XmlAttributeValueTok and XmlEntityTok */
+  returned by XmlAttributeValueTok, XmlEntityTok and XmlCdataSectionTok */

 #define XML_TOK_START_TAG_WITH_ATTS 1
 #define XML_TOK_START_TAG_NO_ATTS 2
@ -47,7 +50,7 @@ extern "C" {
 #define XML_TOK_END_TAG 5
 #define XML_TOK_DATA_CHARS 6
 #define XML_TOK_DATA_NEWLINE 7
-#define XML_TOK_CDATA_SECTION 8
+#define XML_TOK_CDATA_SECT_OPEN 8
 #define XML_TOK_ENTITY_REF 9
 #define XML_TOK_CHAR_REF 10     /* numeric character reference */

@ -85,25 +88,25 @@ extern "C" {
 #define XML_TOK_CLOSE_PAREN_PLUS 37 /* )+ */
 #define XML_TOK_COMMA 38

-  /* The following tokens is returned only by XmlAttributeValueTok */
+/* The following token is returned only by XmlAttributeValueTok */
 #define XML_TOK_ATTRIBUTE_VALUE_S 39

-#define XML_N_STATES 2
+/* The following token is returned only by XmlCdataSectionTok */
+#define XML_TOK_CDATA_SECT_CLOSE 40
+
+#define XML_N_STATES 3
 #define XML_PROLOG_STATE 0
 #define XML_CONTENT_STATE 1
+#define XML_CDATA_SECTION_STATE 2

 #define XML_N_LITERAL_TYPES 2
 #define XML_ATTRIBUTE_VALUE_LITERAL 0
 #define XML_ENTITY_VALUE_LITERAL 1

-#define XML_N_INTERNAL_ENCODINGS 1
-#define XML_UTF8_ENCODING 0
-#if 0
-#define XML_UTF16_ENCODING 1
-#define XML_UCS4_ENCODING 2
-#endif
-
-#define XML_MAX_BYTES_PER_CHAR 4
+/* The size of the buffer passed to XmlUtf8Encode must be at least this. */
+#define XML_UTF8_ENCODE_MAX 4
+/* The size of the buffer passed to XmlUtf16Encode must be at least this. */
+#define XML_UTF16_ENCODE_MAX 2

 typedef struct position {
  /* first line and first column are 0 not 1 */
@ -139,21 +142,26 @@ struct encoding {
  int (*getAtts)(const ENCODING *enc, const char *ptr,
 	         int attsMax, ATTRIBUTE *atts);
  int (*charRefNumber)(const ENCODING *enc, const char *ptr);
+  int (*predefinedEntityName)(const ENCODING *, const char *, const char *);
  void (*updatePosition)(const ENCODING *,
 			 const char *ptr,
 			 const char *end,
 			 POSITION *);
  int (*isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
 		    const char **badPtr);
-  int (*encode)(const ENCODING *enc,
-		int charNum,
-		char *buf);
-  void (*convert[XML_N_INTERNAL_ENCODINGS])(const ENCODING *enc,
-					    const char **fromP,
-					    const char *fromLim,
-					    char **toP,
-					    const char *toLim);
+  void (*utf8Convert)(const ENCODING *enc,
+		      const char **fromP,
+		      const char *fromLim,
+		      char **toP,
+		      const char *toLim);
+  void (*utf16Convert)(const ENCODING *enc,
+		       const char **fromP,
+		       const char *fromLim,
+		       unsigned short **toP,
+		       const unsigned short *toLim);
  int minBytesPerChar;
+  char isUtf8;
+  char isUtf16;
 };

 /*
@ -186,6 +194,9 @@ literals, comments and processing instructions.
 #define XmlContentTok(enc, ptr, end, nextTokPtr) \
   XmlTok(enc, XML_CONTENT_STATE, ptr, end, nextTokPtr)

+#define XmlCdataSectionTok(enc, ptr, end, nextTokPtr) \
+   XmlTok(enc, XML_CDATA_SECTION_STATE, ptr, end, nextTokPtr)
+
 /* This is used for performing a 2nd-level tokenization on
 the content of a literal that has already been returned by XmlTok. */ 

@ -215,17 +226,20 @@ the content of a literal that has already been returned by XmlTok. */
 #define XmlCharRefNumber(enc, ptr) \
  (((enc)->charRefNumber)(enc, ptr))

+#define XmlPredefinedEntityName(enc, ptr, end) \
+  (((enc)->predefinedEntityName)(enc, ptr, end))
+
 #define XmlUpdatePosition(enc, ptr, end, pos) \
  (((enc)->updatePosition)(enc, ptr, end, pos))

 #define XmlIsPublicId(enc, ptr, end, badPtr) \
  (((enc)->isPublicId)(enc, ptr, end, badPtr))

-#define XmlEncode(enc, ch, buf) \
-  (((enc)->encode)(enc, ch, buf))
+#define XmlUtf8Convert(enc, fromP, fromLim, toP, toLim) \
+  (((enc)->utf8Convert)(enc, fromP, fromLim, toP, toLim))

-#define XmlConvert(enc, targetEnc, fromP, fromLim, toP, toLim) \
-  (((enc)->convert[targetEnc])(enc, fromP, fromLim, toP, toLim))
+#define XmlUtf16Convert(enc, fromP, fromLim, toP, toLim) \
+  (((enc)->utf16Convert)(enc, fromP, fromLim, toP, toLim))

 typedef struct {
  ENCODING initEnc;
@ -243,7 +257,17 @@ int XMLTOKAPI XmlParseXmlDecl(int isGeneralTextEntity,
 			      int *standalonePtr);

 int XMLTOKAPI XmlInitEncoding(INIT_ENCODING *, const ENCODING **, const char *name);
-const ENCODING XMLTOKAPI *XmlGetInternalEncoding(int);
+const ENCODING XMLTOKAPI *XmlGetUtf8InternalEncoding();
+const ENCODING XMLTOKAPI *XmlGetUtf16InternalEncoding();
+int XMLTOKAPI XmlUtf8Encode(int charNumber, char *buf);
+int XMLTOKAPI XmlUtf16Encode(int charNumber, unsigned short *buf);
+
+int XMLTOKAPI XmlSizeOfUnknownEncoding();
+ENCODING XMLTOKAPI *
+XmlInitUnknownEncoding(void *mem,
+		       int *table,
+		       int (*convert)(void *userData, const char *p),
+		       void *userData);

 #ifdef __cplusplus
 }
--- a/modules/xml/expat/xmltok/xmltok_impl.c
+++ b/modules/xml/expat/xmltok/xmltok_impl.c
@ -56,7 +56,7 @@ Contributor(s):

 #define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
-    if (!IS_NAME_CHAR(enc, ptr, MINBPC)) { \
+    if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID; \
    } \
@ -84,7 +84,7 @@ Contributor(s):

 #define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
-    if (!IS_NMSTRT_CHAR(enc, ptr, MINBPC)) { \
+    if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID; \
    } \
@ -293,15 +293,14 @@ int PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end,
  return XML_TOK_PARTIAL;
 }

-/* ptr points to character following "<![" */

 static
 int PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *end,
 			     const char **nextTokPtr)
 {
  int i;
-  /* CDATA[]]> */
-  if (end - ptr < 9 * MINBPC)
+  /* CDATA[ */
+  if (end - ptr < 6 * MINBPC)
    return XML_TOK_PARTIAL;
  for (i = 0; i < 6; i++, ptr += MINBPC) {
    if (!CHAR_MATCHES(enc, ptr, "CDATA["[i])) {
@ -309,22 +308,86 @@ int PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *e
      return XML_TOK_INVALID;
    }
  }
-  end -= 2 * MINBPC;
-  while (ptr != end) {
-    switch (BYTE_TYPE(enc, ptr)) {
-    INVALID_CASES(ptr, nextTokPtr)
-    case BT_RSQB:
-      if (CHAR_MATCHES(enc, ptr + MINBPC, ']')
-	  && CHAR_MATCHES(enc, ptr + 2 * MINBPC, '>')) {
-	*nextTokPtr = ptr + 3 * MINBPC;
-	return XML_TOK_CDATA_SECTION;
-      }
-    /* fall through */
-    default:
-      ptr += MINBPC;
+  *nextTokPtr = ptr;
+  return XML_TOK_CDATA_SECT_OPEN;
+}
+
+static
+int PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
+			    const char **nextTokPtr)
+{
+  if (ptr == end)
+    return XML_TOK_NONE;
+#if MINBPC > 1
+  {
+    size_t n = end - ptr;
+    if (n & (MINBPC - 1)) {
+      n &= ~(MINBPC - 1);
+      if (n == 0)
+	return XML_TOK_PARTIAL;
+      end = ptr + n;
    }
  }
-  return XML_TOK_PARTIAL;
+#endif
+  switch (BYTE_TYPE(enc, ptr)) {
+  case BT_RSQB:
+    ptr += MINBPC;
+    if (ptr == end)
+      return XML_TOK_PARTIAL;
+    if (!CHAR_MATCHES(enc, ptr, ']'))
+      break;
+    ptr += MINBPC;
+    if (ptr == end)
+      return XML_TOK_PARTIAL;
+    if (!CHAR_MATCHES(enc, ptr, '>')) {
+      ptr -= MINBPC;
+      break;
+    }
+    *nextTokPtr = ptr + MINBPC;
+    return XML_TOK_CDATA_SECT_CLOSE;
+  case BT_CR:
+    ptr += MINBPC;
+    if (ptr == end)
+      return XML_TOK_PARTIAL;
+    if (BYTE_TYPE(enc, ptr) == BT_LF)
+      ptr += MINBPC;
+    *nextTokPtr = ptr;
+    return XML_TOK_DATA_NEWLINE;
+  case BT_LF:
+    *nextTokPtr = ptr + MINBPC;
+    return XML_TOK_DATA_NEWLINE;
+  INVALID_CASES(ptr, nextTokPtr)
+  default:
+    ptr += MINBPC;
+    break;
+  }
+  while (ptr != end) {
+    switch (BYTE_TYPE(enc, ptr)) {
+#define LEAD_CASE(n) \
+    case BT_LEAD ## n: \
+      if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
+	*nextTokPtr = ptr; \
+	return XML_TOK_DATA_CHARS; \
+      } \
+      ptr += n; \
+      break;
+    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
+#undef LEAD_CASE
+    case BT_NONXML:
+    case BT_MALFORM:
+    case BT_TRAIL:
+    case BT_CR:
+    case BT_LF:
+    case BT_RSQB:
+      *nextTokPtr = ptr;
+      return XML_TOK_DATA_CHARS;
+    default:
+      ptr += MINBPC;
+      break;
+    }
+  }
+  *nextTokPtr = ptr;
+  return XML_TOK_DATA_CHARS;
 }

 /* ptr points to character following "</" */
@ -442,7 +505,7 @@ int PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
  if (ptr == end)
    return XML_TOK_PARTIAL;
  switch (BYTE_TYPE(enc, ptr)) {
-  CHECK_NMSTRT_CASES(end, ptr, end, nextTokPtr)
+  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  case BT_NUM:
    return PREFIX(scanCharRef)(enc, ptr + MINBPC, end, nextTokPtr);
  default:
@ -543,6 +606,22 @@ int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
 	    break;
 	  }
 	}
+	ptr += MINBPC;
+	if (ptr == end)
+	  return XML_TOK_PARTIAL;
+	switch (BYTE_TYPE(enc, ptr)) {
+	case BT_S:
+	case BT_CR:
+	case BT_LF:
+	  break;
+	case BT_SOL:
+	  goto sol;
+	case BT_GT:
+	  goto gt;
+	default:
+	  *nextTokPtr = ptr;
+	  return XML_TOK_INVALID;
+	}
 	/* ptr points to closing quote */
 	for (;;) {
 	  ptr += MINBPC;
@ -553,9 +632,11 @@ int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
 	  case BT_S: case BT_CR: case BT_LF:
 	    continue;
 	  case BT_GT:
+          gt:
 	    *nextTokPtr = ptr + MINBPC;
 	    return XML_TOK_START_TAG_WITH_ATTS;
 	  case BT_SOL:
+          sol:
 	    ptr += MINBPC;
 	    if (ptr == end)
 	      return XML_TOK_PARTIAL;
@ -694,12 +775,12 @@ int PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
  case BT_RSQB:
    ptr += MINBPC;
    if (ptr == end)
-      return XML_TOK_PARTIAL;
+      return XML_TOK_TRAILING_RSQB;
    if (!CHAR_MATCHES(enc, ptr, ']'))
      break;
    ptr += MINBPC;
    if (ptr == end)
-      return XML_TOK_PARTIAL;
+      return XML_TOK_TRAILING_RSQB;
    if (!CHAR_MATCHES(enc, ptr, '>')) {
      ptr -= MINBPC;
      break;
@ -766,7 +847,7 @@ int PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
  if (ptr == end)
    return XML_TOK_PARTIAL;
  switch (BYTE_TYPE(enc, ptr)) {
-  CHECK_NMSTRT_CASES(end, ptr, end, nextTokPtr)
+  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
    *nextTokPtr = ptr;
    return XML_TOK_PERCENT;
@ -795,7 +876,7 @@ int PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
  if (ptr == end)
    return XML_TOK_PARTIAL;
  switch (BYTE_TYPE(enc, ptr)) {
-  CHECK_NMSTRT_CASES(end, ptr, end, nextTokPtr)
+  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  default:
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
@ -944,7 +1025,7 @@ int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
  case BT_RPAR:
    ptr += MINBPC;
    if (ptr == end)
-      return XML_TOK_INVALID;
+      return XML_TOK_PARTIAL;
    switch (BYTE_TYPE(enc, ptr)) {
    case BT_AST:
      *nextTokPtr = ptr + MINBPC;
@ -1001,12 +1082,12 @@ int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
    ptr += MINBPC;
    break;
  case BT_NONASCII:
-    if (IS_NMSTRT_CHAR(enc, ptr, MINBPC)) {
+    if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
      ptr += MINBPC;
      tok = XML_TOK_NAME;
      break;
    }
-    if (IS_NAME_CHAR(enc, ptr, MINBPC)) {
+    if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
      ptr += MINBPC;
      tok = XML_TOK_NMTOKEN;
      break;
@ -1343,6 +1424,59 @@ int PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr)
  return checkCharRefNumber(result);
 }

+static
+int PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr, const char *end)
+{
+  switch (end - ptr) {
+  case 2 * MINBPC:
+    if (CHAR_MATCHES(enc, ptr + MINBPC, 't')) {
+      switch (BYTE_TO_ASCII(enc, ptr)) {
+      case 'l':
+	return '<';
+      case 'g':
+	return '>';
+      }
+    }
+    break;
+  case 3 * MINBPC:
+    if (CHAR_MATCHES(enc, ptr, 'a')) {
+      ptr += MINBPC;
+      if (CHAR_MATCHES(enc, ptr, 'm')) {
+	ptr += MINBPC;
+	if (CHAR_MATCHES(enc, ptr, 'p'))
+	  return '&';
+      }
+    }
+    break;
+  case 4 * MINBPC:
+    switch (BYTE_TO_ASCII(enc, ptr)) {
+    case 'q':
+      ptr += MINBPC;
+      if (CHAR_MATCHES(enc, ptr, 'u')) {
+	ptr += MINBPC;
+	if (CHAR_MATCHES(enc, ptr, 'o')) {
+	  ptr += MINBPC;
+  	  if (CHAR_MATCHES(enc, ptr, 't'))
+	    return '"';
+	}
+      }
+      break;
+    case 'a':
+      ptr += MINBPC;
+      if (CHAR_MATCHES(enc, ptr, 'p')) {
+	ptr += MINBPC;
+	if (CHAR_MATCHES(enc, ptr, 'o')) {
+	  ptr += MINBPC;
+  	  if (CHAR_MATCHES(enc, ptr, 's'))
+	    return '\'';
+	}
+      }
+      break;
+    }
+  }
+  return 0;
+}
+
 static
 int PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
 {
--- a/modules/xml/macbuild/XML.mcp.exp
+++ b/modules/xml/macbuild/XML.mcp.exp
@ -1,7 +1,7 @@
 XML_ErrorString
-XML_GetErrorByteIndex
-XML_GetErrorColumnNumber
-XML_GetErrorLineNumber
+XML_GetCurrentLineNumber
+XML_GetCurrentColumnNumber
+XML_GetCurrentByteIndex
 XML_GetErrorCode
 XML_GetBuffer
 XML_ParseBuffer
@ -19,7 +19,6 @@ hashTableDestroy
 lookup
 XmlParseXmlDecl
 XmlInitEncoding
-XmlGetInternalEncoding
 XmlPrologStateInit
 tokenizeXMLElement
 XMLDOM_PIHandler