зеркало из https://github.com/mozilla/gecko-dev.git
#102595 nsUnicodeToUTF8 does not handle surrogate pair correctly
r=bstell, sr=brendan
This commit is contained in:
Родитель
3fde7db591
Коммит
1517ddd0b2
|
@ -66,6 +66,9 @@ static NS_DEFINE_IID(kIUnicharEncoderIID, NS_IUNICHARENCODER_IID);
|
|||
#define NS_ERROR_UENC_NOMAPPING \
|
||||
NS_ERROR_GENERATE_SUCCESS(NS_ERROR_MODULE_UCONV, 0x23)
|
||||
|
||||
#define NS_OK_UENC_MOREINPUT \
|
||||
NS_ERROR_GENERATE_SUCCESS(NS_ERROR_MODULE_UCONV, 0x24)
|
||||
|
||||
|
||||
#define NS_UNICODEENCODER_CONTRACTID_BASE "@mozilla.org/intl/unicode/encoder;1?charset="
|
||||
|
||||
|
|
|
@ -91,14 +91,14 @@ NS_GENERIC_FACTORY_CONSTRUCTOR(nsCharsetAlias2)
|
|||
NS_GENERIC_FACTORY_CONSTRUCTOR(nsTextToSubURI)
|
||||
NS_GENERIC_FACTORY_CONSTRUCTOR(nsScriptableUnicodeConverter)
|
||||
NS_GENERIC_FACTORY_CONSTRUCTOR(nsConverterInputStream)
|
||||
NS_GENERIC_FACTORY_CONSTRUCTOR(nsISO88591ToUnicode);
|
||||
NS_GENERIC_FACTORY_CONSTRUCTOR(nsCP1252ToUnicode);
|
||||
NS_GENERIC_FACTORY_CONSTRUCTOR(nsMacRomanToUnicode);
|
||||
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUTF8ToUnicode);
|
||||
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUnicodeToISO88591);
|
||||
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUnicodeToCP1252);
|
||||
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUnicodeToMacRoman);
|
||||
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUnicodeToUTF8);
|
||||
NS_GENERIC_FACTORY_CONSTRUCTOR(nsISO88591ToUnicode)
|
||||
NS_GENERIC_FACTORY_CONSTRUCTOR(nsCP1252ToUnicode)
|
||||
NS_GENERIC_FACTORY_CONSTRUCTOR(nsMacRomanToUnicode)
|
||||
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUTF8ToUnicode)
|
||||
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUnicodeToISO88591)
|
||||
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUnicodeToCP1252)
|
||||
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUnicodeToMacRoman)
|
||||
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUnicodeToUTF8)
|
||||
|
||||
NS_IMETHODIMP
|
||||
nsConverterManagerDataRegister(nsIComponentManager* aCompMgr,
|
||||
|
|
|
@ -42,41 +42,138 @@
|
|||
#include "nsUnicodeToUTF8.h"
|
||||
#include <string.h>
|
||||
|
||||
static const PRUint16 g_UTF8MappingTable[] = {
|
||||
0x0001, 0x0004, 0x0005, 0x0008, 0x0000, 0x0000, 0xFFFF, 0x0000
|
||||
};
|
||||
|
||||
static const PRInt16 g_UTF8ShiftTable[] = {
|
||||
3, uMultibytesCharset,
|
||||
ShiftCell(u1ByteChar, 1, 0x00, 0x7F, 0x00, 0x00, 0x00, 0x7F),
|
||||
ShiftCell(u2BytesUTF8, 2, 0xC0, 0xDF, 0x00, 0x00, 0x07, 0xFF),
|
||||
ShiftCell(u3BytesUTF8, 3, 0xE0, 0xEF, 0x08, 0x00, 0xFF, 0xFF)
|
||||
};
|
||||
NS_IMPL_ISUPPORTS1(nsUnicodeToUTF8, nsIUnicodeEncoder)
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
// Class nsUnicodeToUTF8 [implementation]
|
||||
|
||||
nsUnicodeToUTF8::nsUnicodeToUTF8()
|
||||
: nsTableEncoderSupport((uShiftTable*) &g_UTF8ShiftTable,
|
||||
(uMappingTable*) &g_UTF8MappingTable)
|
||||
{
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
// Subclassing of nsTableEncoderSupport class [implementation]
|
||||
// nsUnicodeToUTF8 class [implementation]
|
||||
|
||||
/**
 * Reports an upper bound on the number of bytes Convert() may write for
 * aSrcLength UTF-16 code units.
 *
 * @param aSrc        source buffer (not inspected here)
 * @param aSrcLength  number of PRUnichar units in the source buffer
 * @param aDestLength [out] receives 3*aSrcLength + 3
 * @return NS_OK always
 */
NS_IMETHODIMP nsUnicodeToUTF8::GetMaxLength(const PRUnichar * aSrc,
                                            PRInt32 aSrcLength,
                                            PRInt32 * aDestLength)
{
  // aSrc is interpreted as UTF-16, so 3 bytes per code unit is normally
  // enough (a surrogate pair is 2 units and encodes to 4 bytes).
  // But when the previous buffer ended with only the first half of a
  // surrogate pair, that half is completed here. If the first unit of this
  // buffer is not in the valid low-surrogate range, the leftover half has to
  // be written out on its own as 3 extra bytes.
  *aDestLength = 3*aSrcLength + 3;
  return NS_OK;
}
|
||||
|
||||
/**
 * Sets every bit in the first 0x10000 bits of aInfo.
 *
 * @param aInfo buffer to fill; must be at least 0x10000/8 bytes long
 * @return NS_OK always
 */
NS_IMETHODIMP nsUnicodeToUTF8::FillInfo(PRUint32 *aInfo)
{
  // 0x10000 bits expressed in bytes. Presumably one bit per 16-bit code
  // unit value (i.e. "every value is representable") -- confirm against
  // the nsIUnicodeEncoder contract.
  const size_t infoBytes = (0x10000L >> 3);

  memset(aInfo, 0xFF, infoBytes);
  return NS_OK;
}
|
||||
|
||||
NS_IMETHODIMP nsUnicodeToUTF8::Convert(const PRUnichar * aSrc,
|
||||
PRInt32 * aSrcLength,
|
||||
char * aDest,
|
||||
PRInt32 * aDestLength)
|
||||
{
|
||||
const PRUnichar * src = aSrc;
|
||||
const PRUnichar * srcEnd = aSrc + *aSrcLength;
|
||||
char * dest = aDest;
|
||||
PRInt32 destLen = *aDestLength;
|
||||
PRUint32 n;
|
||||
|
||||
//complete remaining of last conversion
|
||||
if (mHighSurrogate) {
|
||||
if (src < srcEnd) {
|
||||
*aDestLength = 0;
|
||||
return NS_OK_UENC_MOREINPUT;
|
||||
}
|
||||
if (*aDestLength < 4) {
|
||||
*aSrcLength = 0;
|
||||
*aDestLength = 0;
|
||||
return NS_OK_UENC_MOREOUTPUT;
|
||||
}
|
||||
if (*src < (PRUnichar)0xdc00 || *src > (PRUnichar)0xdfff) { //not a pair
|
||||
*dest++ = (char)0xe0 | (mHighSurrogate >> 12);
|
||||
*dest++ = (char)0x80 | ((mHighSurrogate >> 6) & 0x003f);
|
||||
*dest++ = (char)0x80 | (mHighSurrogate & 0x003f);
|
||||
} else {
|
||||
n = ((mHighSurrogate - (PRUnichar)0xd800) << 10) +
|
||||
(*src - (PRUnichar)0xdc00) + 0x10000;
|
||||
*dest++ = (char)0xf0 | (n >> 18);
|
||||
*dest++ = (char)0x80 | ((n >> 12) & 0x3f);
|
||||
*dest++ = (char)0x80 | ((n >> 6) & 0x3f);
|
||||
*dest++ = (char)0x80 | (n & 0x3f);
|
||||
++src;
|
||||
}
|
||||
mHighSurrogate = 0;
|
||||
}
|
||||
|
||||
while (src < srcEnd) {
|
||||
if ( *src < 0x007f) {
|
||||
if (destLen < 1)
|
||||
goto error_more_output;
|
||||
*dest++ = (char)*src;
|
||||
--destLen;
|
||||
} else if (*src < 0x07ff) {
|
||||
if (destLen < 2)
|
||||
goto error_more_output;
|
||||
*dest++ = (char)0xc0 | (*src >> 6);
|
||||
*dest++ = (char)0x80 | (*src & 0x003f);
|
||||
} else if (*src >= (PRUnichar)0xD800 && *src < (PRUnichar)0xDA00) {
|
||||
if ((src+1) >= srcEnd) {
|
||||
//we need another surrogate to complete this unicode char
|
||||
mHighSurrogate = *src;
|
||||
*aDestLength = dest - aDest;
|
||||
return NS_OK_UENC_MOREINPUT;
|
||||
}
|
||||
//handle surrogate
|
||||
if (destLen < 4)
|
||||
goto error_more_output;
|
||||
if (*(src+1) < (PRUnichar)0xdc00 || *(src+1) > 0xdfff) { //not a pair
|
||||
*dest++ = (char)0xe0 | (*src >> 12);
|
||||
*dest++ = (char)0x80 | ((*src >> 6) & 0x003f);
|
||||
*dest++ = (char)0x80 | (*src & 0x003f);
|
||||
} else {
|
||||
n = ((*src - (PRUnichar)0xd800) << 10) + (*(src+1) - (PRUnichar)0xdc00) + (PRUnichar)0x10000;
|
||||
*dest++ = (char)0xf0 | (n >> 18);
|
||||
*dest++ = (char)0x80 | ((n >> 12) & 0x3f);
|
||||
*dest++ = (char)0x80 | ((n >> 6) & 0x3f);
|
||||
*dest++ = (char)0x80 | (n & 0x3f);
|
||||
++src;
|
||||
}
|
||||
} else {
|
||||
if (destLen < 3)
|
||||
goto error_more_output;
|
||||
//treat rest of the character as BMP
|
||||
*dest++ = (char)0xe0 | (*src >> 12);
|
||||
*dest++ = (char)0x80 | ((*src >> 6) & 0x003f);
|
||||
*dest++ = (char)0x80 | (*src & 0x003f);
|
||||
}
|
||||
++src;
|
||||
}
|
||||
|
||||
*aDestLength = dest - aDest;
|
||||
return NS_OK;
|
||||
|
||||
error_more_output:
|
||||
*aSrcLength = src - aSrc;
|
||||
*aDestLength = dest - aDest;
|
||||
return NS_OK_UENC_MOREOUTPUT;
|
||||
}
|
||||
|
||||
/**
 * Flushes a high surrogate left pending by Convert(), if there is one.
 *
 * @param aDest       destination buffer
 * @param aDestLength [in] capacity of aDest in bytes;
 *                    [out] number of bytes written (0 or 3)
 * @return NS_OK_UENC_MOREOUTPUT when a surrogate is pending and fewer than
 *         3 bytes are available (the surrogate stays pending),
 *         NS_OK otherwise.
 */
NS_IMETHODIMP nsUnicodeToUTF8::Finish(char * aDest, PRInt32 * aDestLength)
{
  // Nothing buffered: nothing to write.
  if (!mHighSurrogate) {
    *aDestLength = 0;
    return NS_OK;
  }

  // The pending unit is written as a 3-byte sequence.
  if (*aDestLength < 3) {
    *aDestLength = 0;
    return NS_OK_UENC_MOREOUTPUT;
  }

  const PRUnichar pending = mHighSurrogate;
  mHighSurrogate = 0;

  aDest[0] = (char)0xe0 | (pending >> 12);
  aDest[1] = (char)0x80 | ((pending >> 6) & 0x003f);
  aDest[2] = (char)0x80 | (pending & 0x003f);

  *aDestLength = 3;
  return NS_OK;
}
|
||||
|
|
|
@ -58,24 +58,37 @@
|
|||
* @created 05/Apr/1999
|
||||
* @author Catalin Rotaru [CATA]
|
||||
*/
|
||||
class nsUnicodeToUTF8 : public nsTableEncoderSupport
|
||||
class nsUnicodeToUTF8 : public nsIUnicodeEncoder
|
||||
{
|
||||
NS_DECL_ISUPPORTS
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
* Class constructor.
|
||||
*/
|
||||
nsUnicodeToUTF8();
|
||||
nsUnicodeToUTF8() {NS_INIT_REFCNT(); mHighSurrogate = 0;};
|
||||
|
||||
NS_IMETHOD FillInfo(PRUint32* aInfo);
|
||||
|
||||
protected:
|
||||
NS_IMETHOD Convert(const PRUnichar * aSrc,
|
||||
PRInt32 * aSrcLength,
|
||||
char * aDest,
|
||||
PRInt32 * aDestLength);
|
||||
|
||||
//--------------------------------------------------------------------
|
||||
// Subclassing of nsEncoderSupport class [declaration]
|
||||
NS_IMETHOD Finish(char * aDest, PRInt32 * aDestLength);
|
||||
|
||||
NS_IMETHOD GetMaxLength(const PRUnichar * aSrc, PRInt32 aSrcLength,
|
||||
PRInt32 * aDestLength);
|
||||
|
||||
NS_IMETHOD Reset() {mHighSurrogate = 0; return NS_OK;}
|
||||
|
||||
NS_IMETHOD SetOutputErrorBehavior(PRInt32 aBehavior,
|
||||
nsIUnicharEncoder * aEncoder, PRUnichar aChar) {return NS_OK;};
|
||||
|
||||
protected:
|
||||
PRUnichar mHighSurrogate;
|
||||
|
||||
};
|
||||
|
||||
#endif /* nsUnicodeToUTF8_h___ */
|
||||
|
|
Загрузка…
Ссылка в новой задаче