bug 206811 : xp_iconv should use UTF-16, if available, instead of UCS-2 (r=drepper, sr=darin)

This commit is contained in:
jshin%mailaps.org 2003-07-30 05:58:28 +00:00
Родитель b21443a8b2
Коммит d6b83a6362
1 изменённых файлов: 47 добавлений и 15 удалений

Просмотреть файл

@ -106,8 +106,11 @@ utf16_to_isolatin1(const PRUnichar **input, PRUint32 *inputLeft, char **output,
#endif
// solaris definitely needs this, but we'll enable it by default
// just in case...
// just in case... but we know for sure that iconv(3) in glibc
// doesn't need this.
#if !defined(__GLIBC__)
#define ENABLE_UTF8_FALLBACK_SUPPORT
#endif
#define INVALID_ICONV_T ((iconv_t) -1)
@ -177,12 +180,34 @@ xp_iconv_open(const char **to_list, const char **from_list)
return INVALID_ICONV_T;
}
// PRUnichar[] is NOT a UCS-2 array BUT for UTF-16 string. Therefore, we
// have to use UTF-16 with iconv(3) on platforms where it's supported.
// We could list 'UTF-16' name variants, but all platforms known (to me) to
// support UTF-16 in iconv(3) uses 'UTF-16'. Let me know (jshin) if there's an
// exception. (bug 206811)
/*
* PRUnichar[] is NOT a UCS-2 array BUT a UTF-16 string. Therefore, we
* have to use UTF-16 with iconv(3) on platforms where it's supported.
* However, the way UTF-16 and UCS-2 are interpreted varies across platforms
* and implementations of iconv(3). On Tru64, it also depends on an
* environment variable (ICONV_BYTEORDER). To avoid the trouble arising from
* byte-swapping (bug 208809), we have to try UTF-16LE/BE and UCS-2LE/BE
* before falling back to UTF-16, UCS-2 and their variants. We assume that
* UTF-16 and UCS-2 on systems without UTF-16LE/BE and UCS-2LE/BE have the
* native endianness, which isn't the case for glibc 2.1.x, for which we use
* 'UNICODELITTLE' and 'UNICODEBIG'. It's also not true of Tru64 V4 when the
* environment variable ICONV_BYTEORDER is set to 'big-endian', about which
* not much can be done other than adding a note in the release notes.
*/
static const char *UTF_16_NAMES[] = {
#if defined(IS_LITTLE_ENDIAN)
"UTF-16LE",
#if defined(__GLIBC__)
"UNICODELITTLE",
#endif
"UCS-2LE",
#else
"UTF-16BE",
#if defined(__GLIBC__)
"UNICODEBIG",
#endif
"UCS-2BE",
#endif
"UTF-16",
"UCS-2",
"UCS2",
@ -193,6 +218,7 @@ static const char *UTF_16_NAMES[] = {
NULL
};
#if defined(ENABLE_UTF8_FALLBACK_SUPPORT)
static const char *UTF_8_NAMES[] = {
"UTF-8",
"UTF8",
@ -202,9 +228,11 @@ static const char *UTF_8_NAMES[] = {
"utf_8",
NULL
};
#endif
static const char *ISO_8859_1_NAMES[] = {
"ISO-8859-1",
#if !defined(__GLIBC__)
"ISO8859-1",
"ISO88591",
"ISO_8859_1",
@ -214,6 +242,7 @@ static const char *ISO_8859_1_NAMES[] = {
"iso88591",
"iso_8859_1",
"iso8859_1",
#endif
NULL
};
@ -282,25 +311,28 @@ nsNativeCharsetConverter::LazyInit()
gNativeToUTF8 = xp_iconv_open(UTF_8_NAMES, native_charset_list);
gUTF8ToUnicode = xp_iconv_open(UTF_16_NAMES, UTF_8_NAMES);
NS_ASSERTION(gNativeToUTF8 != INVALID_ICONV_T, "no native to utf-8 converter");
NS_ASSERTION(gUTF8ToUnicode != INVALID_ICONV_T, "no utf-8 to ucs-2 converter");
NS_ASSERTION(gUTF8ToUnicode != INVALID_ICONV_T, "no utf-8 to utf-16 converter");
}
if (gUnicodeToNative == INVALID_ICONV_T) {
gUnicodeToUTF8 = xp_iconv_open(UTF_8_NAMES, UTF_16_NAMES);
gUTF8ToNative = xp_iconv_open(native_charset_list, UTF_8_NAMES);
NS_ASSERTION(gUnicodeToUTF8 != INVALID_ICONV_T, "no unicode to utf-8 converter");
NS_ASSERTION(gUnicodeToUTF8 != INVALID_ICONV_T, "no utf-16 to utf-8 converter");
NS_ASSERTION(gUTF8ToNative != INVALID_ICONV_T, "no utf-8 to native converter");
}
#else
NS_ASSERTION(gNativeToUnicode != INVALID_ICONV_T, "no native to ucs-2 converter");
NS_ASSERTION(gUnicodeToNative != INVALID_ICONV_T, "no ucs-2 to native converter");
NS_ASSERTION(gNativeToUnicode != INVALID_ICONV_T, "no native to utf-16 converter");
NS_ASSERTION(gUnicodeToNative != INVALID_ICONV_T, "no utf-16 to native converter");
#endif
/*
* On Solaris 8 (and newer?), the iconv modules converting to UCS-2
* prepend a byte order mark unicode character (BOM, u+FEFF) during
* the first use of the iconv converter.
* the first use of the iconv converter. The same is true of
* glibc 2.2.9x and Tru64 V5 (see bug 208809) when 'UTF-16' is used.
* However, we use 'UTF-16LE/BE' in both cases instead, so we
* should be safe. But just in case...
*
* This dummy conversion gets rid of the BOMs and fixes bugid 153562.
* This dummy conversion gets rid of the BOMs and fixes bug 153562.
*/
char dummy_input[1] = { ' ' };
char dummy_output[4];
@ -421,7 +453,7 @@ nsNativeCharsetConverter::NativeToUnicode(const char **input,
return NS_OK;
}
NS_WARNING("conversion from native to ucs-2 failed");
NS_WARNING("conversion from native to utf-16 failed");
// reset converter
xp_iconv_reset(gNativeToUnicode);
@ -449,7 +481,7 @@ nsNativeCharsetConverter::NativeToUnicode(const char **input,
n = sizeof(ubuf) - n;
res = xp_iconv(gUTF8ToUnicode, (const char **) &p, &n, (char **) output, &outLeft);
if (res == (size_t) -1) {
NS_ERROR("conversion from utf-8 to ucs-2 failed");
NS_ERROR("conversion from utf-8 to utf-16 failed");
break;
}
}
@ -510,7 +542,7 @@ nsNativeCharsetConverter::UnicodeToNative(const PRUnichar **input,
size_t n = sizeof(ubuf), one_uchar = sizeof(PRUnichar);
res = xp_iconv(gUnicodeToUTF8, &in, &one_uchar, &p, &n);
if (res == (size_t) -1) {
NS_ERROR("conversion from ucs-2 to utf-8 failed");
NS_ERROR("conversion from utf-16 to utf-8 failed");
break;
}
p = ubuf;