diff --git a/xpcom/io/nsNativeCharsetUtils.cpp b/xpcom/io/nsNativeCharsetUtils.cpp index e70deb91782..2586988611b 100644 --- a/xpcom/io/nsNativeCharsetUtils.cpp +++ b/xpcom/io/nsNativeCharsetUtils.cpp @@ -106,8 +106,11 @@ utf16_to_isolatin1(const PRUnichar **input, PRUint32 *inputLeft, char **output, #endif // solaris definitely needs this, but we'll enable it by default -// just in case... +// just in case... but we know for sure that iconv(3) in glibc +// doesn't need this. +#if !defined(__GLIBC__) #define ENABLE_UTF8_FALLBACK_SUPPORT +#endif #define INVALID_ICONV_T ((iconv_t) -1) @@ -177,12 +180,34 @@ xp_iconv_open(const char **to_list, const char **from_list) return INVALID_ICONV_T; } -// PRUnichar[] is NOT a UCS-2 array BUT for UTF-16 string. Therefore, we -// have to use UTF-16 with iconv(3) on platforms where it's supported. -// We could list 'UTF-16' name variants, but all platforms known (to me) to -// support UTF-16 in iconv(3) uses 'UTF-16'. Let me know (jshin) if there's an -// exception. (bug 206811) +/* + * PRUnichar[] is NOT a UCS-2 array BUT a UTF-16 string. Therefore, we + * have to use UTF-16 with iconv(3) on platforms where it's supported. + * However, the way UTF-16 and UCS-2 are interpreted varies across platforms + * and implementations of iconv(3). On Tru64, it also depends on an environment + * variable. To avoid the trouble arising from byte-swapping + * (bug 208809), we have to try UTF-16LE/BE and UCS-2LE/BE before falling + * back to UTF-16 and UCS-2 and variants. We assume that UTF-16 and UCS-2 + * on systems without UTF-16LE/BE and UCS-2LE/BE have the native endianness, + * which isn't the case for glibc 2.1.x, for which we use 'UNICODELITTLE' + * and 'UNICODEBIG'. It's also not true of Tru64 V4 when the environment + * variable ICONV_BYTEORDER is set to 'big-endian', about which not much + * can be done other than adding a note in the release notes. 
(bug 206811) + */ static const char *UTF_16_NAMES[] = { +#if defined(IS_LITTLE_ENDIAN) + "UTF-16LE", +#if defined(__GLIBC__) + "UNICODELITTLE", +#endif + "UCS-2LE", +#else + "UTF-16BE", +#if defined(__GLIBC__) + "UNICODEBIG", +#endif + "UCS-2BE", +#endif "UTF-16", "UCS-2", "UCS2", @@ -193,6 +218,7 @@ static const char *UTF_16_NAMES[] = { NULL }; +#if defined(ENABLE_UTF8_FALLBACK_SUPPORT) static const char *UTF_8_NAMES[] = { "UTF-8", "UTF8", @@ -202,9 +228,11 @@ static const char *UTF_8_NAMES[] = { "utf_8", NULL }; +#endif static const char *ISO_8859_1_NAMES[] = { "ISO-8859-1", +#if !defined(__GLIBC__) "ISO8859-1", "ISO88591", "ISO_8859_1", @@ -214,6 +242,7 @@ static const char *ISO_8859_1_NAMES[] = { "iso88591", "iso_8859_1", "iso8859_1", +#endif NULL }; @@ -282,25 +311,28 @@ nsNativeCharsetConverter::LazyInit() gNativeToUTF8 = xp_iconv_open(UTF_8_NAMES, native_charset_list); gUTF8ToUnicode = xp_iconv_open(UTF_16_NAMES, UTF_8_NAMES); NS_ASSERTION(gNativeToUTF8 != INVALID_ICONV_T, "no native to utf-8 converter"); - NS_ASSERTION(gUTF8ToUnicode != INVALID_ICONV_T, "no utf-8 to ucs-2 converter"); + NS_ASSERTION(gUTF8ToUnicode != INVALID_ICONV_T, "no utf-8 to utf-16 converter"); } if (gUnicodeToNative == INVALID_ICONV_T) { gUnicodeToUTF8 = xp_iconv_open(UTF_8_NAMES, UTF_16_NAMES); gUTF8ToNative = xp_iconv_open(native_charset_list, UTF_8_NAMES); - NS_ASSERTION(gUnicodeToUTF8 != INVALID_ICONV_T, "no unicode to utf-8 converter"); + NS_ASSERTION(gUnicodeToUTF8 != INVALID_ICONV_T, "no utf-16 to utf-8 converter"); NS_ASSERTION(gUTF8ToNative != INVALID_ICONV_T, "no utf-8 to native converter"); } #else - NS_ASSERTION(gNativeToUnicode != INVALID_ICONV_T, "no native to ucs-2 converter"); - NS_ASSERTION(gUnicodeToNative != INVALID_ICONV_T, "no ucs-2 to native converter"); + NS_ASSERTION(gNativeToUnicode != INVALID_ICONV_T, "no native to utf-16 converter"); + NS_ASSERTION(gUnicodeToNative != INVALID_ICONV_T, "no utf-16 to native converter"); #endif /* * On Solaris 8 (and newer?), 
the iconv modules converting to UCS-2 * prepend a byte order mark unicode character (BOM, u+FEFF) during - * the first use of the iconv converter. + * the first use of the iconv converter.  The same is true of + * glibc 2.2.9x and Tru64 V5 (see bug 208809) when 'UTF-16' is used. + * However, we use 'UTF-16LE/BE' in both cases instead, so we + * should be safe. But just in case... * - * This dummy conversion gets rid of the BOMs and fixes bugid 153562. + * This dummy conversion gets rid of the BOMs and fixes bug 153562. */ char dummy_input[1] = { ' ' }; char dummy_output[4]; @@ -421,7 +453,7 @@ nsNativeCharsetConverter::NativeToUnicode(const char **input, return NS_OK; } - NS_WARNING("conversion from native to ucs-2 failed"); + NS_WARNING("conversion from native to utf-16 failed"); // reset converter xp_iconv_reset(gNativeToUnicode); @@ -449,7 +481,7 @@ nsNativeCharsetConverter::NativeToUnicode(const char **input, n = sizeof(ubuf) - n; res = xp_iconv(gUTF8ToUnicode, (const char **) &p, &n, (char **) output, &outLeft); if (res == (size_t) -1) { - NS_ERROR("conversion from utf-8 to ucs-2 failed"); + NS_ERROR("conversion from utf-8 to utf-16 failed"); break; } } @@ -510,7 +542,7 @@ nsNativeCharsetConverter::UnicodeToNative(const PRUnichar **input, size_t n = sizeof(ubuf), one_uchar = sizeof(PRUnichar); res = xp_iconv(gUnicodeToUTF8, &in, &one_uchar, &p, &n); if (res == (size_t) -1) { - NS_ERROR("conversion from ucs-2 to utf-8 failed"); + NS_ERROR("conversion from utf-16 to utf-8 failed"); break; } p = ubuf;