bug 206811 : xp_iconv should use UTF-16, if available, instead of UCS-2 (r=drepper, sr=darin)

2003-07-30 05:58:28 +00:00 · 2003-07-30 05:58:28 +00:00 · d6b83a6362
--- a/xpcom/io/nsNativeCharsetUtils.cpp
+++ b/xpcom/io/nsNativeCharsetUtils.cpp
@@ -106,8 +106,11 @@ utf16_to_isolatin1(const PRUnichar **input, PRUint32 *inputLeft, char **output,
 #endif

 // solaris definitely needs this, but we'll enable it by default
-// just in case...
+// just in case... but we know for sure that iconv(3) in glibc
+// doesn't need this.
+#if !defined(__GLIBC__)
 #define ENABLE_UTF8_FALLBACK_SUPPORT
+#endif

 #define INVALID_ICONV_T ((iconv_t) -1)

@@ -177,12 +180,34 @@ xp_iconv_open(const char **to_list, const char **from_list)
    return INVALID_ICONV_T;
 }

-// PRUnichar[] is NOT a UCS-2 array BUT for UTF-16 string. Therefore, we
-// have to use UTF-16 with iconv(3) on platforms where it's supported.
-// We could list 'UTF-16' name variants, but all platforms known (to me) to 
-// support UTF-16 in iconv(3) uses 'UTF-16'. Let me know (jshin) if there's an
-// exception. (bug 206811)
+/* 
+ * PRUnichar[] is NOT a UCS-2 array BUT a UTF-16 string. Therefore, we
+ * have to use UTF-16 with iconv(3) on platforms where it's supported.
+ * However, the way UTF-16 and UCS-2 are interpreted varies across platforms 
+ * and implementations of iconv(3). On Tru64, it also depends on an environment
+ * variable (ICONV_BYTEORDER). To avoid the trouble arising from byte-swapping 
+ * (bug 208809), we have to try UTF-16LE/BE and UCS-2LE/BE before falling 
+ * back to UTF-16 and UCS-2 and variants. We assume that UTF-16 and UCS-2 
+ * on systems without UTF-16LE/BE and UCS-2LE/BE have the native endianness,
+ * which isn't the case for glibc 2.1.x, for which we use 'UNICODELITTLE'
+ * and 'UNICODEBIG'. It's also not true of Tru64 V4 when the environment
+ * variable ICONV_BYTEORDER is set to 'big-endian', about which not much 
+ * can be done other than adding a note in the release notes. (bug 206811)
+ */
 static const char *UTF_16_NAMES[] = {
+#if defined(IS_LITTLE_ENDIAN)
+    "UTF-16LE",
+#if defined(__GLIBC__)
+    "UNICODELITTLE",
+#endif
+    "UCS-2LE",
+#else
+    "UTF-16BE",
+#if defined(__GLIBC__)
+    "UNICODEBIG",
+#endif
+    "UCS-2BE",
+#endif
    "UTF-16",
    "UCS-2",
    "UCS2",
@@ -193,6 +218,7 @@ static const char *UTF_16_NAMES[] = {
    NULL
 };

+#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
 static const char *UTF_8_NAMES[] = {
    "UTF-8",
    "UTF8",
@@ -202,9 +228,11 @@ static const char *UTF_8_NAMES[] = {
    "utf_8",
    NULL
 };
+#endif

 static const char *ISO_8859_1_NAMES[] = {
    "ISO-8859-1",
+#if !defined(__GLIBC__)
    "ISO8859-1",
    "ISO88591",
    "ISO_8859_1",
@@ -214,6 +242,7 @@ static const char *ISO_8859_1_NAMES[] = {
    "iso88591",
    "iso_8859_1",
    "iso8859_1",
+#endif
    NULL
 };

@@ -282,25 +311,28 @@ nsNativeCharsetConverter::LazyInit()
        gNativeToUTF8 = xp_iconv_open(UTF_8_NAMES, native_charset_list);
        gUTF8ToUnicode = xp_iconv_open(UTF_16_NAMES, UTF_8_NAMES);
        NS_ASSERTION(gNativeToUTF8 != INVALID_ICONV_T, "no native to utf-8 converter");
-        NS_ASSERTION(gUTF8ToUnicode != INVALID_ICONV_T, "no utf-8 to ucs-2 converter");
+        NS_ASSERTION(gUTF8ToUnicode != INVALID_ICONV_T, "no utf-8 to utf-16 converter");
    }
    if (gUnicodeToNative == INVALID_ICONV_T) {
        gUnicodeToUTF8 = xp_iconv_open(UTF_8_NAMES, UTF_16_NAMES);
        gUTF8ToNative = xp_iconv_open(native_charset_list, UTF_8_NAMES);
-        NS_ASSERTION(gUnicodeToUTF8 != INVALID_ICONV_T, "no unicode to utf-8 converter");
+        NS_ASSERTION(gUnicodeToUTF8 != INVALID_ICONV_T, "no utf-16 to utf-8 converter");
        NS_ASSERTION(gUTF8ToNative != INVALID_ICONV_T, "no utf-8 to native converter");
    }
 #else
-    NS_ASSERTION(gNativeToUnicode != INVALID_ICONV_T, "no native to ucs-2 converter");
-    NS_ASSERTION(gUnicodeToNative != INVALID_ICONV_T, "no ucs-2 to native converter");
+    NS_ASSERTION(gNativeToUnicode != INVALID_ICONV_T, "no native to utf-16 converter");
+    NS_ASSERTION(gUnicodeToNative != INVALID_ICONV_T, "no utf-16 to native converter");
 #endif

    /*
     * On Solaris 8 (and newer?), the iconv modules converting to UCS-2
     * prepend a byte order mark unicode character (BOM, u+FEFF) during
-     * the first use of the iconv converter.
+     * the first use of the iconv converter. The same is true of
+     * glibc 2.2.9x and Tru64 V5 (see bug 208809) when 'UTF-16' is used.
+     * However, we use 'UTF-16LE/BE' in both cases instead, so we
+     * should be safe. But just in case...
     *
-     * This dummy conversion gets rid of the BOMs and fixes bugid 153562.
+     * This dummy conversion gets rid of the BOMs and fixes bug 153562.
     */
    char dummy_input[1] = { ' ' };
    char dummy_output[4];
@@ -421,7 +453,7 @@ nsNativeCharsetConverter::NativeToUnicode(const char **input,
            return NS_OK;
        }

-        NS_WARNING("conversion from native to ucs-2 failed");
+        NS_WARNING("conversion from native to utf-16 failed");

        // reset converter
        xp_iconv_reset(gNativeToUnicode);
@@ -449,7 +481,7 @@ nsNativeCharsetConverter::NativeToUnicode(const char **input,
            n = sizeof(ubuf) - n;
            res = xp_iconv(gUTF8ToUnicode, (const char **) &p, &n, (char **) output, &outLeft);
            if (res == (size_t) -1) {
-                NS_ERROR("conversion from utf-8 to ucs-2 failed");
+                NS_ERROR("conversion from utf-8 to utf-16 failed");
                break;
            }
        }
@@ -510,7 +542,7 @@ nsNativeCharsetConverter::UnicodeToNative(const PRUnichar **input,
            size_t n = sizeof(ubuf), one_uchar = sizeof(PRUnichar);
            res = xp_iconv(gUnicodeToUTF8, &in, &one_uchar, &p, &n);
            if (res == (size_t) -1) {
-                NS_ERROR("conversion from ucs-2 to utf-8 failed");
+                NS_ERROR("conversion from utf-16 to utf-8 failed");
                break;
            }
            p = ubuf;