Bug 335531 - Correct misuse of UTF-16BE, UTF-16LE, UTF-32BE, and UTF-32LE charset labels; r=smontagu sr=dbaron

This commit is contained in:
Masatoshi Kimura 2008-12-06 11:08:26 -08:00
Родитель f77a3e21e2
Коммит 7d5d10f0d6
19 изменённых файлов: 491 добавлений и 87 удалений

Просмотреть файл

@ -523,7 +523,7 @@ public:
* @return boolean indicating whether a BOM was detected.
*/
static PRBool CheckForBOM(const unsigned char* aBuffer, PRUint32 aLength,
nsACString& aCharset);
nsACString& aCharset, PRBool *bigEndian = nsnull);
/**

Просмотреть файл

@ -3171,7 +3171,7 @@ nsContentUtils::ConvertStringFromCharset(const nsACString& aCharset,
/* static */
PRBool
nsContentUtils::CheckForBOM(const unsigned char* aBuffer, PRUint32 aLength,
nsACString& aCharset)
nsACString& aCharset, PRBool *bigEndian)
{
PRBool found = PR_TRUE;
aCharset.Truncate();
@ -3186,22 +3186,30 @@ nsContentUtils::CheckForBOM(const unsigned char* aBuffer, PRUint32 aLength,
aBuffer[1] == 0x00 &&
aBuffer[2] == 0xFE &&
aBuffer[3] == 0xFF) {
aCharset = "UTF-32BE";
aCharset = "UTF-32";
if (bigEndian)
*bigEndian = PR_TRUE;
}
else if (aLength >= 4 &&
aBuffer[0] == 0xFF &&
aBuffer[1] == 0xFE &&
aBuffer[2] == 0x00 &&
aBuffer[3] == 0x00) {
aCharset = "UTF-32LE";
aCharset = "UTF-32";
if (bigEndian)
*bigEndian = PR_FALSE;
}
else if (aLength >= 2 &&
aBuffer[0] == 0xFE && aBuffer[1] == 0xFF) {
aCharset = "UTF-16BE";
aCharset = "UTF-16";
if (bigEndian)
*bigEndian = PR_TRUE;
}
else if (aLength >= 2 &&
aBuffer[0] == 0xFF && aBuffer[1] == 0xFE) {
aCharset = "UTF-16LE";
aCharset = "UTF-16";
if (bigEndian)
*bigEndian = PR_FALSE;
} else {
found = PR_FALSE;
}

Просмотреть файл

@ -766,14 +766,14 @@ DetectByteOrderMark(const unsigned char* aBytes, PRInt32 aLen, nsCString& oChars
if (0xFF == aBytes[1]) {
// FE FF
// UTF-16, big-endian
oCharset.Assign("UTF-16BE");
oCharset.Assign("UTF-16");
}
break;
case 0xFF:
if (0xFE == aBytes[1]) {
// FF FE
// UTF-16, little-endian
oCharset.Assign("UTF-16LE");
oCharset.Assign("UTF-16");
}
break;
}

Просмотреть файл

@ -80,10 +80,11 @@ iso-2022-kr.title = Korean (ISO-2022-KR)
utf-7.title = Unicode (UTF-7)
utf-8.title = Unicode (UTF-8)
utf-16.title = Unicode (UTF-16)
utf-16le.title = Unicode (UTF-16 Little Endian)
utf-16be.title = Unicode (UTF-16 Big Endian)
utf-32le.title = Unicode (UTF-32 Little Endian)
utf-32be.title = Unicode (UTF-32 Big Endian)
utf-16le.title = Unicode (UTF-16LE)
utf-16be.title = Unicode (UTF-16BE)
utf-32.title = Unicode (UTF-32)
utf-32le.title = Unicode (UTF-32LE)
utf-32be.title = Unicode (UTF-32BE)
iso-8859-5.title = Cyrillic (ISO-8859-5)
iso-ir-111.title = Cyrillic (ISO-IR-111)
windows-1251.title = Cyrillic (Windows-1251)

Просмотреть файл

@ -125,12 +125,12 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
mDetectedCharset = "X-ISO-10646-UCS-4-3412";
else if ('\xFF' == aBuf[1])
// FE FF UTF-16, big endian BOM
mDetectedCharset = "UTF-16BE";
mDetectedCharset = "UTF-16";
break;
case '\x00':
if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3]))
// 00 00 FE FF UTF-32, big-endian BOM
mDetectedCharset = "UTF-32BE";
mDetectedCharset = "UTF-32";
else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3]))
// 00 00 FF FE UCS-4, unusual octet order BOM (2143)
mDetectedCharset = "X-ISO-10646-UCS-4-2143";
@ -138,10 +138,10 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
case '\xFF':
if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
// FF FE 00 00 UTF-32, little-endian BOM
mDetectedCharset = "UTF-32LE";
mDetectedCharset = "UTF-32";
else if ('\xFE' == aBuf[1])
// FF FE UTF-16, little endian BOM
mDetectedCharset = "UTF-16LE";
mDetectedCharset = "UTF-16";
break;
} // switch

Просмотреть файл

@ -296,6 +296,7 @@ NS_IMETHODIMP nsMetaCharsetObserver::Notify(
if (!preferred.EqualsLiteral("UTF-16") &&
!preferred.EqualsLiteral("UTF-16BE") &&
!preferred.EqualsLiteral("UTF-16LE") &&
!preferred.EqualsLiteral("UTF-32") &&
!preferred.EqualsLiteral("UTF-32BE") &&
!preferred.EqualsLiteral("UTF-32LE")) {
// Propagate the error message so that the parser can
@ -375,6 +376,7 @@ NS_IMETHODIMP nsMetaCharsetObserver::GetCharsetFromCompatibilityTag(
!preferred.EqualsLiteral("UTF-16") &&
!preferred.EqualsLiteral("UTF-16BE") &&
!preferred.EqualsLiteral("UTF-16LE") &&
!preferred.EqualsLiteral("UTF-32") &&
!preferred.EqualsLiteral("UTF-32BE") &&
!preferred.EqualsLiteral("UTF-32LE"))
AppendASCIItoUTF16(preferred, aCharset);

Просмотреть файл

@ -85,6 +85,7 @@ iso-2022-kr=ISO-2022-KR
iso-2022-jp=ISO-2022-JP
utf-32be=UTF-32BE
utf-32le=UTF-32LE
utf-32=UTF-32
utf-16be=UTF-16BE
utf-16le=UTF-16LE
utf-16=UTF-16

Просмотреть файл

@ -344,6 +344,7 @@ NS_UCONV_REG_UNREG("x-imap4-modified-utf7", NS_MUTF7TOUNICODE_CID, NS_UNICODETOM
NS_UCONV_REG_UNREG("UTF-16", NS_UTF16TOUNICODE_CID, NS_UNICODETOUTF16_CID)
NS_UCONV_REG_UNREG("UTF-16BE", NS_UTF16BETOUNICODE_CID, NS_UNICODETOUTF16BE_CID)
NS_UCONV_REG_UNREG("UTF-16LE", NS_UTF16LETOUNICODE_CID, NS_UNICODETOUTF16LE_CID)
NS_UCONV_REG_UNREG("UTF-32", NS_UTF32TOUNICODE_CID, NS_UNICODETOUTF32_CID)
NS_UCONV_REG_UNREG("UTF-32BE", NS_UTF32BETOUNICODE_CID, NS_UNICODETOUTF32BE_CID)
NS_UCONV_REG_UNREG("UTF-32LE", NS_UTF32LETOUNICODE_CID, NS_UNICODETOUTF32LE_CID)
NS_UCONV_REG_UNREG("T.61-8bit", NS_T61TOUNICODE_CID, NS_UNICODETOT61_CID)
@ -418,6 +419,7 @@ NS_GENERIC_FACTORY_CONSTRUCTOR(nsMUTF7ToUnicode)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUTF16ToUnicode)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUTF16BEToUnicode)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUTF16LEToUnicode)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUTF32ToUnicode)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUTF32BEToUnicode)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUTF32LEToUnicode)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUnicodeToUTF7)
@ -427,6 +429,7 @@ NS_GENERIC_FACTORY_CONSTRUCTOR(nsUnicodeToUTF16LE)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUnicodeToUTF16)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUnicodeToUTF32BE)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUnicodeToUTF32LE)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUnicodeToUTF32)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUnicodeToTSCII)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUnicodeToTamilTTF)
@ -967,6 +970,11 @@ static const nsModuleComponentInfo components[] =
NS_UNICODEDECODER_CONTRACTID_BASE "UTF-16LE",
nsUTF16LEToUnicodeConstructor ,
},
{
DECODER_NAME_BASE "UTF-32" , NS_UTF32TOUNICODE_CID,
NS_UNICODEDECODER_CONTRACTID_BASE "UTF-32",
nsUTF32ToUnicodeConstructor ,
},
{
DECODER_NAME_BASE "UTF-32BE" , NS_UTF32BETOUNICODE_CID,
NS_UNICODEDECODER_CONTRACTID_BASE "UTF-32BE",
@ -1282,7 +1290,12 @@ static const nsModuleComponentInfo components[] =
NS_UNICODEENCODER_CONTRACTID_BASE "UTF-32LE",
nsUnicodeToUTF32LEConstructor,
},
{
{
ENCODER_NAME_BASE "UTF-32" , NS_UNICODETOUTF32_CID,
NS_UNICODEENCODER_CONTRACTID_BASE "UTF-32",
nsUnicodeToUTF32Constructor,
},
{
ENCODER_NAME_BASE "T.61-8bit" , NS_UNICODETOT61_CID,
NS_UNICODEENCODER_CONTRACTID_BASE "T.61-8bit",
nsUnicodeToT61Constructor,

Просмотреть файл

@ -0,0 +1,228 @@
/* Test case for bug 335531
*
* Uses nsIConverterInputStream to decode UTF-32 text with all combinations
* of UTF-32BE and UTF-32LE with and without BOM.
*
* Sample text is: "Все счастливые семьи похожи друг на друга, каждая несчастливая семья несчастлива по-своему."
*
* The enclosing quotation marks are included in the sample text to test that
* UTF-32LE is recognized even when there is no BOM and the UTF-32LE decoder is
* not explicitly called. The UTF-32 decoder's heuristic only works when the
* first character of the text is in the range U+0001..U+FFFF.
*/
const beBOM="%00%00%FE%FF";
const leBOM="%FF%FE%00%00";
const outBOM="\uFEFF";
const sampleUTF32BE="%00%00%00%22%00%00%04%12%00%00%04%41%00%00%04%35%00%00%00%20%00%00%04%41%00%00%04%47%00%00%04%30%00%00%04%41%00%00%04%42%00%00%04%3B%00%00%04%38%00%00%04%32%00%00%04%4B%00%00%04%35%00%00%00%20%00%00%04%41%00%00%04%35%00%00%04%3C%00%00%04%4C%00%00%04%38%00%00%00%20%00%00%04%3F%00%00%04%3E%00%00%04%45%00%00%04%3E%00%00%04%36%00%00%04%38%00%00%00%20%00%00%04%34%00%00%04%40%00%00%04%43%00%00%04%33%00%00%00%20%00%00%04%3D%00%00%04%30%00%00%00%20%00%00%04%34%00%00%04%40%00%00%04%43%00%00%04%33%00%00%04%30%00%00%00%2C%00%00%00%20%00%00%04%3A%00%00%04%30%00%00%04%36%00%00%04%34%00%00%04%30%00%00%04%4F%00%00%00%20%00%00%04%3D%00%00%04%35%00%00%04%41%00%00%04%47%00%00%04%30%00%00%04%41%00%00%04%42%00%00%04%3B%00%00%04%38%00%00%04%32%00%00%04%30%00%00%04%4F%00%00%00%20%00%00%04%41%00%00%04%35%00%00%04%3C%00%00%04%4C%00%00%04%4F%00%00%00%20%00%00%04%3D%00%00%04%35%00%00%04%41%00%00%04%47%00%00%04%30%00%00%04%41%00%00%04%42%00%00%04%3B%00%00%04%38%00%00%04%32%00%00%04%30%00%00%00%20%00%00%04%3F%00%00%04%3E%00%00%00%2D%00%00%04%41%00%00%04%32%00%00%04%3E%00%00%04%35%00%00%04%3C%00%00%04%43%00%00%00%2E%00%00%00%22";
const sampleUTF32LE="%22%00%00%00%12%04%00%00%41%04%00%00%35%04%00%00%20%00%00%00%41%04%00%00%47%04%00%00%30%04%00%00%41%04%00%00%42%04%00%00%3B%04%00%00%38%04%00%00%32%04%00%00%4B%04%00%00%35%04%00%00%20%00%00%00%41%04%00%00%35%04%00%00%3C%04%00%00%4C%04%00%00%38%04%00%00%20%00%00%00%3F%04%00%00%3E%04%00%00%45%04%00%00%3E%04%00%00%36%04%00%00%38%04%00%00%20%00%00%00%34%04%00%00%40%04%00%00%43%04%00%00%33%04%00%00%20%00%00%00%3D%04%00%00%30%04%00%00%20%00%00%00%34%04%00%00%40%04%00%00%43%04%00%00%33%04%00%00%30%04%00%00%2C%00%00%00%20%00%00%00%3A%04%00%00%30%04%00%00%36%04%00%00%34%04%00%00%30%04%00%00%4F%04%00%00%20%00%00%00%3D%04%00%00%35%04%00%00%41%04%00%00%47%04%00%00%30%04%00%00%41%04%00%00%42%04%00%00%3B%04%00%00%38%04%00%00%32%04%00%00%30%04%00%00%4F%04%00%00%20%00%00%00%41%04%00%00%35%04%00%00%3C%04%00%00%4C%04%00%00%4F%04%00%00%20%00%00%00%3D%04%00%00%35%04%00%00%41%04%00%00%47%04%00%00%30%04%00%00%41%04%00%00%42%04%00%00%3B%04%00%00%38%04%00%00%32%04%00%00%30%04%00%00%20%00%00%00%3F%04%00%00%3E%04%00%00%2D%00%00%00%41%04%00%00%32%04%00%00%3E%04%00%00%35%04%00%00%3C%04%00%00%43%04%00%00%2E%00%00%00%22%00%00%00";
const expectedNoBOM = "\"\u0412\u0441\u0435 \u0441\u0447\u0430\u0441\u0442\u043B\u0438\u0432\u044B\u0435 \u0441\u0435\u043C\u044C\u0438 \u043F\u043E\u0445\u043E\u0436\u0438 \u0434\u0440\u0443\u0433 \u043D\u0430 \u0434\u0440\u0443\u0433\u0430, \u043A\u0430\u0436\u0434\u0430\u044F \u043D\u0435\u0441\u0447\u0430\u0441\u0442\u043B\u0438\u0432\u0430\u044F \u0441\u0435\u043C\u044C\u044F \u043D\u0435\u0441\u0447\u0430\u0441\u0442\u043B\u0438\u0432\u0430 \u043F\u043E-\u0441\u0432\u043E\u0435\u043C\u0443.\"";
/**
 * Returns the URL-escaped sample text for the requested encoding, optionally
 * prefixed with the matching URL-escaped UTF-32 byte order mark.
 *
 * @param withBOM  true to prepend a BOM (beBOM or leBOM) to the sample bytes
 * @param charset  "UTF32BE" or "UTF32LE"; selects sampleUTF32BE or
 *                 sampleUTF32LE respectively
 * @return the escaped byte string, ready to be appended to a data: URI
 * @throws a string if charset is not one of the two supported names
 */
function makeText(withBOM, charset)
{
  // Explicit lookup instead of eval("sample" + charset): no code is built
  // from strings, and an unsupported name fails with a clear message.
  var samples = {
    "UTF32BE": sampleUTF32BE,
    "UTF32LE": sampleUTF32LE
  };
  if (!samples.hasOwnProperty(charset)) {
    throw "unknown sample charset: " + charset;
  }

  var theText = samples[charset];
  if (withBOM) {
    // The BOM must use the same byte order as the sample bytes.
    theText = (charset == "UTF32BE" ? beBOM : leBOM) + theText;
  }
  return theText;
}
/**
 * Feeds one sample through a data: URI into nsIConverterInputStream and
 * checks the decoded text.
 *
 * @param withBOM       whether the sample bytes start with a BOM
 * @param charset       sample selector passed to makeText ("UTF32BE"/"UTF32LE")
 * @param charsetDec    charset name written into the data: URI
 * @param decoder       charset name handed to the converter input stream
 * @param bufferLength  buffer size handed to the converter input stream
 */
function testCase(withBOM, charset, charsetDec, decoder, bufferLength)
{
  var uriSpec = "data:text/plain;charset=" + charsetDec + "," +
                makeText(withBOM, charset);

  var IOService = Components.Constructor("@mozilla.org/network/io-service;1",
                                         "nsIIOService");
  var ConverterInputStream =
    Components.Constructor("@mozilla.org/intl/converter-input-stream;1",
                           "nsIConverterInputStream",
                           "init");

  var ioService = new IOService();
  var rawStream = ioService.newChannel(uriSpec, "", null).open();
  var converter = new ConverterInputStream(rawStream,
                                           decoder,
                                           bufferLength,
                                           0xFFFD);

  if (!(converter instanceof Components.interfaces.nsIUnicharLineInputStream)) {
    throw "not line input stream";
  }

  // Concatenate every line; readLine() returns false on the last one.
  var decoded = "";
  for (;;) {
    var lineHolder = {};
    var hasMore = converter.readLine(lineHolder);
    decoded += lineHolder.value;
    if (!hasMore) {
      break;
    }
  }

  // The explicit BE / LE decoders do not strip the BOM, so it shows up in
  // the output as U+FEFF.
  var keepsBOM = withBOM && (decoder == "UTF-32BE" || decoder == "UTF-32LE");
  var expected = keepsBOM ? outBOM + expectedNoBOM : expectedNoBOM;

  do_check_eq(decoded, expected);
}
// Tests conversion of one to three byte(s) from UTF-32 to Unicode
const expectedString = "\ufffd";
const charset = "UTF-32";
/**
 * Converts inString with the scriptable Unicode converter set to the
 * file-level |charset| and checks that the result equals |expectedString|.
 * A conversion that throws is treated as having produced U+FFFD.
 *
 * @param inString  the raw input bytes, as a string
 */
function testCase2(inString) {
  var ScriptableUnicodeConverter =
    Components.Constructor("@mozilla.org/intl/scriptableunicodeconverter",
                           "nsIScriptableUnicodeConverter");

  var unicodeConverter = new ScriptableUnicodeConverter();
  unicodeConverter.charset = charset;

  var decoded;
  try {
    var converted = unicodeConverter.ConvertToUnicode(inString);
    decoded = converted + unicodeConverter.Finish();
  } catch (e) {
    decoded = "\ufffd";
  }

  do_check_eq(escape(decoded), escape(expectedString));
}
/*
* Uses nsIConverterInputStream to decode UTF-32 text with surrogate characters
*
* Sample text is: "g" in Mathematical Bold Symbols (U+1D420)
*
* The test uses buffers of 4 different lengths to test end of buffer in mid-
* UTF32 character
*/
// Single supplementary character
// expected: surrogate pair
const test0="%00%00%00%2D%00%00%00%2D%00%01%D4%20%00%00%00%2D%00%00%00%2D";
const expected0 = "--\uD835\uDC20--";
// High surrogate followed by low surrogate (invalid in UTF-32)
// expected: two replacement chars
const test1="%00%00%00%2D%00%00%00%2D%00%00%D8%35%00%00%DC%20%00%00%00%2D%00%00%00%2D";
const expected1 = "--\uFFFD\uFFFD--";
// Lone high surrogate
// expected: one replacement char
const test2="%00%00%00%2D%00%00%00%2D%00%00%D8%35%00%00%00%2D%00%00%00%2D";
const expected2 = "--\uFFFD--";
// Lone low surrogate
// expected: one replacement char
const test3="%00%00%00%2D%00%00%00%2D%00%00%DC%20%00%00%00%2D%00%00%00%2D";
const expected3 = "--\uFFFD--";
// Two high surrogates
// expected: two replacement chars
const test4="%00%00%00%2D%00%00%00%2D%00%00%D8%35%00%00%D8%35%00%00%00%2D%00%00%00%2D";
const expected4 = "--\uFFFD\uFFFD--";
// Two low surrogates
// expected: two replacement chars
const test5="%00%00%00%2D%00%00%00%2D%00%00%DC%20%00%00%DC%20%00%00%00%2D%00%00%00%2D";
const expected5 = "--\uFFFD\uFFFD--";
// Low surrogate followed by high surrogate
// expected: two replacement chars
const test6="%00%00%00%2D%00%00%00%2D%00%00%DC%20%00%00%D8%35%00%00%00%2D%00%00%00%2D";
const expected6 = "--\uFFFD\uFFFD--";
// Lone high surrogate followed by supplementary character
// expected: replacement char followed by surrogate pair
const test7="%00%00%00%2D%00%00%00%2D%00%00%D8%35%00%01%D4%20%00%00%00%2D%00%00%00%2D";
const expected7 = "--\uFFFD\uD835\uDC20--";
// Lone low surrogate followed by supplementary character
// expected: replacement char followed by surrogate pair
const test8="%00%00%00%2D%00%00%00%2D%00%00%DC%20%00%01%D4%20%00%00%00%2D%00%00%00%2D";
const expected8 = "--\uFFFD\uD835\uDC20--";
// Supplementary character followed by lone high surrogate
// expected: surrogate pair followed by replacement char
const test9="%00%00%00%2D%00%00%00%2D%00%01%D4%20%00%00%D8%35%00%00%00%2D%00%00%00%2D";
const expected9 = "--\uD835\uDC20\uFFFD--";
// Supplementary character followed by lone low surrogate
// expected: surrogate pair followed by replacement char
const test10="%00%00%00%2D%00%00%00%2D%00%01%D4%20%00%00%DC%20%00%00%00%2D%00%00%00%2D";
const expected10 = "--\uD835\uDC20\uFFFD--";
// Lone high surrogate at the end of the input
// expected: one replacement char (invalid in UTF-32)
const test11="%00%00%00%2D%00%00%00%2D%00%00%00%2D%00%00%00%2D%00%00%D8%35";
const expected11 = "----\uFFFD";
// Half code unit at the end of the input
// expected: nothing
const test12="%00%00%00%2D%00%00%00%2D%00%00%00%2D%00%00%00%2D%D8";
const expected12 = "----";
/**
 * Decodes one of the test0..test12 byte sequences as UTF-32BE through
 * nsIConverterInputStream and compares the result with the matching
 * expected0..expected12 string.
 *
 * @param testNumber    index (0..12) of the test/expected pair to use
 * @param bufferLength  buffer size handed to the converter input stream
 * @throws a string if testNumber does not name an existing test
 */
function testCase3(testNumber, bufferLength)
{
  // Explicit tables instead of eval("test" + n) / eval("expected" + n):
  // no code is built from strings and a bad index is reported clearly.
  var inputs = [test0, test1, test2, test3, test4, test5, test6,
                test7, test8, test9, test10, test11, test12];
  var expectations = [expected0, expected1, expected2, expected3, expected4,
                      expected5, expected6, expected7, expected8, expected9,
                      expected10, expected11, expected12];
  if (typeof inputs[testNumber] != "string" ||
      typeof expectations[testNumber] != "string") {
    throw "unknown test number: " + testNumber;
  }

  var dataURI = "data:text/plain;charset=UTF32BE," + inputs[testNumber];

  var IOService = Components.Constructor("@mozilla.org/network/io-service;1",
                                         "nsIIOService");
  var ConverterInputStream =
    Components.Constructor("@mozilla.org/intl/converter-input-stream;1",
                           "nsIConverterInputStream",
                           "init");

  var ios = new IOService();
  var channel = ios.newChannel(dataURI, "", null);
  var testInputStream = channel.open();
  var testConverter = new ConverterInputStream(testInputStream,
                                               "UTF-32BE",
                                               bufferLength,
                                               0xFFFD);

  if (!(testConverter instanceof
        Components.interfaces.nsIUnicharLineInputStream))
    throw "not line input stream";

  var outStr = "";
  var more;
  do {
    // read the line and check for eof
    var line = {};
    more = testConverter.readLine(line);
    outStr += line.value;
  } while (more);

  // escape the strings before comparing for better readability
  do_check_eq(escape(outStr), escape(expectations[testNumber]));
}
/**
 * Test entry point. Runs, in order:
 *  1. testCase for buffer lengths 64 and 65, each with and without a BOM,
 *     over the four sample/decoder pairings, always declaring "UTF-32" in
 *     the data: URI;
 *  2. testCase2 on inputs of one, two and three bytes;
 *  3. testCase3 for tests 0..12 with buffer lengths 4..7.
 */
function run_test()
{
  // [sample selector, decoder name], in the order they are exercised.
  var pairings = [
    ["UTF32LE", "UTF-32"],
    ["UTF32BE", "UTF-32"],
    ["UTF32LE", "UTF-32LE"],
    ["UTF32BE", "UTF-32BE"]
  ];
  var bomFlags = [true, false];
  var streamBufferLengths = [64, 65];

  for (var b = 0; b < streamBufferLengths.length; ++b) {
    for (var f = 0; f < bomFlags.length; ++f) {
      for (var p = 0; p < pairings.length; ++p) {
        testCase(bomFlags[f], pairings[p][0], "UTF-32", pairings[p][1],
                 streamBufferLengths[b]);
      }
    }
  }

  var shortInputs = ["A", "AB", "ABC"];
  for (var s = 0; s < shortInputs.length; ++s) {
    testCase2(shortInputs[s]);
  }

  for (var testNumber = 0; testNumber <= 12; ++testNumber) {
    for (var len = 4; len < 8; ++len) {
      testCase3(testNumber, len);
    }
  }
}

Просмотреть файл

@ -567,6 +567,11 @@
#define NS_UTF16TOUNICODE_CID \
{ 0xd673255d, 0x1184, 0x400a, {0xb0, 0xb5, 0xee,0x9d, 0x12, 0x95, 0xbd, 0x85}}
// Class ID for our UTF32ToUnicode charset converter
// {30DCD313-73E1-447d-8339-37744952154E}
#define NS_UTF32TOUNICODE_CID \
{ 0x30dcd313, 0x73e1, 0x447d, {0x83, 0x39, 0x37, 0x74, 0x49, 0x52, 0x15, 0x4e}}
// Class ID for our UTF16LEToUnicode charset converter
// {BA6151B7-1DFA-11d3-B3BF-00805F8A6670}
#define NS_UTF16LETOUNICODE_CID \

Просмотреть файл

@ -167,7 +167,7 @@ static nsresult ConvertCommon(const char * aSrc,
//----------------------------------------------------------------------
// Class nsUTF32ToUnicode [implementation]
nsUTF32ToUnicode::nsUTF32ToUnicode() : nsBasicDecoderSupport()
nsUTF32ToUnicodeBase::nsUTF32ToUnicodeBase() : nsBasicDecoderSupport()
{
Reset();
}
@ -175,9 +175,9 @@ nsUTF32ToUnicode::nsUTF32ToUnicode() : nsBasicDecoderSupport()
//----------------------------------------------------------------------
// Subclassing of nsDecoderSupport class [implementation]
NS_IMETHODIMP nsUTF32ToUnicode::GetMaxLength(const char * aSrc,
PRInt32 aSrcLength,
PRInt32 * aDestLength)
NS_IMETHODIMP nsUTF32ToUnicodeBase::GetMaxLength(const char * aSrc,
PRInt32 aSrcLength,
PRInt32 * aDestLength)
{
// Non-BMP characters take two PRUnichars(a pair of surrogate codepoints)
// so that we have to divide by 2 instead of 4 for the worst case.
@ -189,7 +189,7 @@ NS_IMETHODIMP nsUTF32ToUnicode::GetMaxLength(const char * aSrc,
//----------------------------------------------------------------------
// Subclassing of nsBasicDecoderSupport class [implementation]
NS_IMETHODIMP nsUTF32ToUnicode::Reset()
NS_IMETHODIMP nsUTF32ToUnicodeBase::Reset()
{
// the number of additional bytes to read to complete UTF-32 4byte seq.
mState = 0;
@ -203,7 +203,7 @@ NS_IMETHODIMP nsUTF32ToUnicode::Reset()
// Class nsUTF32BEToUnicode [implementation]
//----------------------------------------------------------------------
// Subclassing of nsUTF32ToUnicode class [implementation]
// Subclassing of nsUTF32ToUnicodeBase class [implementation]
NS_IMETHODIMP nsUTF32BEToUnicode::Convert(const char * aSrc,
PRInt32 * aSrcLength,
@ -218,7 +218,7 @@ NS_IMETHODIMP nsUTF32BEToUnicode::Convert(const char * aSrc,
// Class nsUTF32LEToUnicode [implementation]
//----------------------------------------------------------------------
// Subclassing of nsUTF32ToUnicode class [implementation]
// Subclassing of nsUTF32ToUnicodeBase class [implementation]
NS_IMETHODIMP nsUTF32LEToUnicode::Convert(const char * aSrc,
PRInt32 * aSrcLength,
@ -229,5 +229,81 @@ NS_IMETHODIMP nsUTF32LEToUnicode::Convert(const char * aSrc,
mBufferInc, PR_TRUE);
}
//----------------------------------------------------------------------
// Class nsUTF32ToUnicode [implementation]
//----------------------------------------------------------------------
// Subclassing of nsUTF32ToUnicodeBase class [implementation]
/**
 * Returns the decoder to its initial state.
 *
 * The base class state is reset first; afterwards mState is set to 4, the
 * value Convert() tests for to recognise its first call, and the byte order
 * and BOM flag are cleared.
 *
 * @return the result of nsUTF32ToUnicodeBase::Reset()
 */
NS_IMETHODIMP nsUTF32ToUnicode::Reset()
{
  const nsresult baseResult = nsUTF32ToUnicodeBase::Reset();

  // Must follow the base Reset(), which assigns mState itself.
  mState = 4;
  mFoundBOM = PR_FALSE;
  mEndian = kUnknown;

  return baseResult;
}
/**
 * Decodes UTF-32 input whose byte order is not fixed by the charset label.
 *
 * On the first call after Reset() (mState == 4) the first four bytes are
 * examined to choose the byte order:
 *   - FF FE 00 00 -> little-endian BOM, skipped;
 *   - 00 00 FE FF -> big-endian BOM, skipped;
 *   - two leading bytes not both zero, two trailing bytes both zero ->
 *     little-endian (heuristic, no BOM);
 *   - anything else -> big-endian.
 * The remaining input is then handed to ConvertCommon().
 *
 * @param aSrc        source bytes
 * @param aSrcLength  in: number of source bytes; out: value left by
 *                    ConvertCommon(), plus 4 if a BOM was skipped in this call
 * @param aDest       destination buffer
 * @param aDestLength passed through to ConvertCommon()
 * @return NS_ERROR_ILLEGAL_INPUT if the first call supplies fewer than 4
 *         bytes; NS_OK_UDEC_NOBOMFOUND if ConvertCommon() returned NS_OK and
 *         no BOM has been seen since the last Reset(); otherwise the result
 *         of ConvertCommon().
 */
NS_IMETHODIMP nsUTF32ToUnicode::Convert(const char * aSrc,
PRInt32 * aSrcLength,
PRUnichar * aDest,
PRInt32 * aDestLength)
{
// Set only when a BOM is skipped during this particular call.
PRBool foundBOM = PR_FALSE;
if (4 == mState) // Called for the first time.
{
// Byte-order detection needs a full 4-byte unit.
if (*aSrcLength < 4)
return NS_ERROR_ILLEGAL_INPUT;
// check if BOM (0xFEFF) is at the beginning, remove it if found, and
// set mEndian accordingly.
if (0xFF == PRUint8(aSrc[0]) && 0xFE == PRUint8(aSrc[1]) &&
0 == PRUint8(aSrc[2]) && 0 == PRUint8(aSrc[3])) {
aSrc += 4;
*aSrcLength -= 4;
mState = 0;
mEndian = kLittleEndian;
mFoundBOM = foundBOM = PR_TRUE;
}
else if (0 == PRUint8(aSrc[0]) && 0 == PRUint8(aSrc[1]) &&
0xFE == PRUint8(aSrc[2]) && 0xFF == PRUint8(aSrc[3])) {
aSrc += 4;
*aSrcLength -= 4;
mState = 0;
mEndian = kBigEndian;
mFoundBOM = foundBOM = PR_TRUE;
}
// BOM is not found, but we can use a simple heuristic to determine
// the endianness. Assume the first character is [U+0001, U+FFFF].
// Not always valid, but it's very likely to hold for html/xml/css.
#if 0 // BE case will be handled below
else if (!aSrc[0] && !aSrc[1] && (aSrc[2] || aSrc[3])) { // 0x00 0x00 0xhh 0xhh (hh != 00)
mState = 0;
mEndian = kBigEndian;
}
#endif
else if ((aSrc[0] || aSrc[1]) && !aSrc[2] && !aSrc[3]) { // 0xhh 0xhh 0x00 0x00 (hh != 00)
mState = 0;
mEndian = kLittleEndian;
}
else { // Neither BOM nor 'plausible' byte patterns at the beginning.
// Just assume it's BE (following Unicode standard)
// and let the garbage show up in the browser. (security concern?)
// (bug 246194)
mState = 0;
mEndian = kBigEndian;
}
}
// The last argument tells ConvertCommon whether the input is little-endian.
nsresult rv = ConvertCommon(aSrc, aSrcLength, aDest, aDestLength, &mState,
mBufferInc, mEndian == kLittleEndian);
if (foundBOM)
*aSrcLength += 4; // need to consume BOM
// If BOM is not found and we're to return NS_OK, signal that BOM
// is not found. Otherwise, return |rv| from |ConvertCommon|
return (rv == NS_OK && !mFoundBOM) ? NS_OK_UDEC_NOBOMFOUND : rv;
}
// XXX : What to do with 'unflushed' mBufferInc?? : Finish()

Просмотреть файл

@ -42,29 +42,27 @@
#define nsUTF32ToUnicode_h___
//----------------------------------------------------------------------
// Class nsUTF32ToUnicode [declaration]
// Class nsUTF32ToUnicodeBase [declaration]
/**
* A character set converter from UTF32 to Unicode.
* The base class for UTF32BE/UTF32LE to Unicode converters.
* A character set converter from UTF-32 family to Unicode.
* The base class for UTF-32BE/UTF-32LE/UTF-32 to Unicode converters.
* @created 08/Dec/2002
* @author Jungshik Shin
*/
class nsUTF32ToUnicode : public nsBasicDecoderSupport
class nsUTF32ToUnicodeBase : public nsBasicDecoderSupport
{
public:
/**
* Class constructor.
*/
nsUTF32ToUnicode();
protected:
/**
* Class constructor. accessible only by child classes
*/
nsUTF32ToUnicodeBase();
// the number of additional bytes to read to complete an incomplete UTF-32 4byte seq.
PRUint16 mState;
PRUint16 mState;
// buffer for an incomplete UTF-32 sequence.
PRUint8 mBufferInc[4];
@ -82,13 +80,13 @@ protected:
// Class nsUTF32BEToUnicode [declaration]
/**
* A character set converter from UTF32BE to Unicode.
* A subclass of UTF32ToUnicode.
* A character set converter from UTF-32BE to Unicode.
* A subclass of UTF32ToUnicodeBase.
* @created 08/Dec/2002
* @author Jungshik Shin
*/
class nsUTF32BEToUnicode : public nsUTF32ToUnicode
class nsUTF32BEToUnicode : public nsUTF32ToUnicodeBase
{
public:
@ -106,13 +104,13 @@ public:
// Class nsUTF32LEToUnicode [declaration]
/**
* A character set converter from UTF32LE to Unicode.
* A subclass of UTF32ToUnicode.
* A character set converter from UTF-32LE to Unicode.
* A subclass of UTF32ToUnicodeBase.
* @created 08/Dec/2002
* @author Jungshik Shin
*/
class nsUTF32LEToUnicode : public nsUTF32ToUnicode
class nsUTF32LEToUnicode : public nsUTF32ToUnicodeBase
{
public:
@ -125,5 +123,42 @@ public:
};
//----------------------------------------------------------------------
// Class nsUTF32ToUnicode [declaration]
/**
* A character set converter from UTF-32 to Unicode.
* A subclass of UTF32ToUnicodeBase.
* @created 08/Dec/2002
* @author Jungshik Shin
*/
// Decoder for the plain "UTF-32" label: unlike the BE/LE variants, it picks
// the byte order from the input on the first Convert() call.
class nsUTF32ToUnicode : public nsUTF32ToUnicodeBase
{
public:
/**
* Class constructor. Calls this class's Reset() so that the first
* Convert() call performs byte-order detection.
*/
nsUTF32ToUnicode() { Reset(); }
//--------------------------------------------------------------------
// Subclassing of nsBasicDecoderSupport class [declaration]
// Detects the byte order on the first call, then decodes the input.
NS_IMETHOD Convert(const char * aSrc, PRInt32 * aSrcLength,
PRUnichar * aDest, PRInt32 * aDestLength);
//--------------------------------------------------------------------
// Subclassing of nsUTF32ToUnicodeBase class [declaration]
// Resets the base state and clears the detected byte order and BOM flag.
NS_IMETHOD Reset();
private:
// Byte order of the input; kUnknown until the first Convert() call.
enum Endian {kUnknown, kBigEndian, kLittleEndian};
// Byte order chosen by Convert() (from a BOM or the fallback rules).
Endian mEndian;
// PR_TRUE once Convert() has seen a BOM at the start of the input.
PRBool mFoundBOM;
};
#endif /* nsUTF32ToUnicode_h___ */

Просмотреть файл

@ -80,6 +80,7 @@ static nsresult ConvertCommon(const PRUnichar * aSrc,
char * aDest,
PRInt32 * aDestLength,
PRUnichar * aHighSurrogate,
PRUnichar * aBOM,
PRBool aIsLE)
{
const PRUnichar * src = aSrc;
@ -88,6 +89,18 @@ static nsresult ConvertCommon(const PRUnichar * aSrc,
const char * destEnd = aDest + *aDestLength;
PRUint32 ucs4;
// Handle BOM if necessary
if (0 != *aBOM)
{
if (*aDestLength < 4) {
*aSrcLength = *aDestLength = 0;
return NS_OK_UENC_MOREOUTPUT;
}
*(PRUint32*)dest = *aBOM;
*aBOM = 0;
dest += 4;
}
// left-over high surrogate code point from the previous run.
if (*aHighSurrogate)
@ -194,21 +207,21 @@ static nsresult FinishCommon(char * aDest,
//----------------------------------------------------------------------
// Class nsUnicodeToUTF32 [implementation]
NS_IMPL_ISUPPORTS1(nsUnicodeToUTF32, nsIUnicodeEncoder)
NS_IMPL_ISUPPORTS1(nsUnicodeToUTF32Base, nsIUnicodeEncoder)
//----------------------------------------------------------------------
// Subclassing of nsIUnicodeEncoder class [implementation]
NS_IMETHODIMP nsUnicodeToUTF32::GetMaxLength(const PRUnichar * aSrc,
PRInt32 aSrcLength,
PRInt32 * aDestLength)
NS_IMETHODIMP nsUnicodeToUTF32Base::GetMaxLength(const PRUnichar * aSrc,
PRInt32 aSrcLength,
PRInt32 * aDestLength)
{
*aDestLength = aSrcLength * 4;
return NS_OK;
}
NS_IMETHODIMP nsUnicodeToUTF32::FillInfo(PRUint32 *aInfo)
NS_IMETHODIMP nsUnicodeToUTF32Base::FillInfo(PRUint32 *aInfo)
{
memset(aInfo, 0xFF, (0x10000L >> 3));
return NS_OK;
@ -228,7 +241,7 @@ NS_IMETHODIMP nsUnicodeToUTF32BE::Convert(const PRUnichar * aSrc,
PRInt32 * aDestLength)
{
return ConvertCommon(aSrc, aSrcLength, aDest, aDestLength,
&mHighSurrogate, PR_FALSE);
&mHighSurrogate, &mBOM, PR_FALSE);
}
NS_IMETHODIMP nsUnicodeToUTF32BE::Finish(char * aDest,
@ -251,7 +264,7 @@ NS_IMETHODIMP nsUnicodeToUTF32LE::Convert(const PRUnichar * aSrc,
PRInt32 * aDestLength)
{
return ConvertCommon(aSrc, aSrcLength, aDest, aDestLength,
&mHighSurrogate, PR_TRUE);
&mHighSurrogate, &mBOM, PR_TRUE);
}
NS_IMETHODIMP nsUnicodeToUTF32LE::Finish(char * aDest,

Просмотреть файл

@ -45,25 +45,24 @@
// Class nsUnicodeToUTF32 [declaration]
/**
* A character set converter from UTF32 to Unicode.
* The base class for UTF32BE/UTF32LE to Unicode converters.
* A character set converter from Unicode to the UTF-32 family.
* The base class for Unicode to UTF-32/UTF-32BE/UTF-32LE converters.
* @created 08/Dec/2002
* @author Jungshik Shin
*/
class nsUnicodeToUTF32 : public nsIUnicodeEncoder
class nsUnicodeToUTF32Base : public nsIUnicodeEncoder
{
NS_DECL_ISUPPORTS
public:
protected:
/**
* Class constructor.
* Class constructor. accessible only by child classes
*/
nsUnicodeToUTF32() {mHighSurrogate = 0;}
virtual ~nsUnicodeToUTF32() {}
nsUnicodeToUTF32Base() {mBOM = 0; mHighSurrogate = 0;}
virtual ~nsUnicodeToUTF32Base() {}
protected:
PRUnichar mHighSurrogate;
NS_IMETHOD GetMaxLength(const PRUnichar * aSrc, PRInt32 aSrcLength,
@ -72,26 +71,28 @@ protected:
//--------------------------------------------------------------------
// Subclassing of nsIUnicodeEncoder class [declaration]
NS_IMETHOD Reset() {mHighSurrogate = 0; return NS_OK;}
NS_IMETHOD Reset() {mBOM = 0; mHighSurrogate = 0; return NS_OK;}
NS_IMETHOD FillInfo(PRUint32* aInfo);
NS_IMETHOD SetOutputErrorBehavior(PRInt32 aBehavior,
nsIUnicharEncoder * aEncoder,
PRUnichar aChar)
{return NS_OK;}
protected:
PRUnichar mBOM;
};
//----------------------------------------------------------------------
// Class nsUnicodeToUTF32BE [declaration]
/**
* A character set converter from Unicode to UTF32BE.
* A subclass of UnicodeToUTF32.
* A character set converter from Unicode to UTF-32BE.
* A subclass of UnicodeToUTF32Base.
* @created 08/Dec/2002
* @author Jungshik Shin
*/
class nsUnicodeToUTF32BE : public nsUnicodeToUTF32
class nsUnicodeToUTF32BE : public nsUnicodeToUTF32Base
{
public:
@ -109,13 +110,13 @@ public:
// Class nsUnicodeToUTF32LE [declaration]
/**
* A character set converter from Unicode to UTF32LE.
* A subclass of UnicodeToUTF32.
* A character set converter from Unicode to UTF-32LE.
* A subclass of UnicodeToUTF32Base.
* @created 08/Dec/2002
* @author Jungshik Shin
*/
class nsUnicodeToUTF32LE : public nsUnicodeToUTF32
class nsUnicodeToUTF32LE : public nsUnicodeToUTF32Base
{
public:
@ -127,5 +128,31 @@ public:
};
//----------------------------------------------------------------------
// Class nsUnicodeToUTF32 [declaration]
/**
* A character set converter from Unicode to UTF-32.
* A subclass of UnicodeToUTF32Base.
* @created 08/Dec/2002
* @author Jungshik Shin
*/
// Encoder for the plain "UTF-32" label. It derives from the encoder that
// matches the host byte order and differs from it only in starting with
// mBOM set to 0xFEFF instead of 0.
#ifdef IS_LITTLE_ENDIAN
class nsUnicodeToUTF32 : public nsUnicodeToUTF32LE
#elif defined(IS_BIG_ENDIAN)
class nsUnicodeToUTF32 : public nsUnicodeToUTF32BE
#else
#error "Unknown endianness"
#endif
{
public:
  /**
   * Class constructor. Arms the BOM and clears any pending high surrogate.
   */
  nsUnicodeToUTF32()
  {
    mBOM = 0xFEFF;
    mHighSurrogate = 0;
  }

  //--------------------------------------------------------------------
  // Subclassing of nsUnicodeToUTF32Base class [declaration]

  /**
   * Restores the initial state: BOM armed again, no pending high surrogate.
   * @return NS_OK
   */
  NS_IMETHOD Reset()
  {
    mBOM = 0xFEFF;
    mHighSurrogate = 0;
    return NS_OK;
  }
};
#endif /* nsUnicodeToUTF32_h___ */

Просмотреть файл

@ -34,7 +34,7 @@ while (decoderList.hasMore()) {
data = encodeUTF16BE(testContent);
else if (decoder == "UTF-16LE")
data = encodeUTF16LE(testContent);
else if (decoder == "UTF-32BE")
else if (decoder == "UTF-32" || decoder == "UTF-32BE")
data = encodeUTF32BE(testContent);
else if (decoder == "UTF-32LE")
data = encodeUTF32LE(testContent);

Просмотреть файл

@ -456,6 +456,7 @@ static nsresult GetCharsetFromData(const unsigned char* aStyleSheetData,
return NS_ERROR_NOT_AVAILABLE;
PRUint32 step = 1;
PRUint32 pos = 0;
PRBool bigEndian = PR_FALSE;
// Determine the encoding type. If we have a BOM, set aCharset to the
// charset listed for that BOM in http://www.w3.org/TR/REC-xml#sec-guessing;
// that way even if we don't have a valid @charset rule we can use the BOM to
@ -489,26 +490,18 @@ static nsresult GetCharsetFromData(const unsigned char* aStyleSheetData,
aCharset = "UTF-32";
}
else if (nsContentUtils::CheckForBOM(aStyleSheetData,
aDataLength, aCharset)) {
aDataLength, aCharset, &bigEndian)) {
if (aCharset.Equals("UTF-8")) {
step = 1;
pos = 3;
}
else if (aCharset.Equals("UTF-32BE")) {
else if (aCharset.Equals("UTF-32")) {
step = 4;
pos = 7;
pos = bigEndian ? 7 : 4;
}
else if (aCharset.Equals("UTF-32LE")) {
step = 4;
pos = 4;
}
else if (aCharset.Equals("UTF-16BE")) {
else if (aCharset.Equals("UTF-16")) {
step = 2;
pos = 3;
}
else if (aCharset.Equals("UTF-16LE")) {
step = 2;
pos = 2;
pos = bigEndian ? 3 : 2;
}
}
else if (aStyleSheetData[0] == 0x00 &&

Просмотреть файл

@ -568,11 +568,10 @@ PRBool nsUnknownDecoder::LastDitchSniff(nsIRequest* aRequest)
// are for 2-byte encodings and the UTF-8 BOM is 3 bytes).
if (mBufferLen >= 4) {
const unsigned char* buf = (const unsigned char*)mBuffer;
if ((buf[0] == 0xFE && buf[1] == 0xFF) || // UTF-16BE
(buf[0] == 0xFF && buf[1] == 0xFE) || // UTF-16LE
if ((buf[0] == 0xFE && buf[1] == 0xFF) || // UTF-16, Big Endian
(buf[0] == 0xFF && buf[1] == 0xFE) || // UTF-16 or UCS-4, Little Endian
(buf[0] == 0xEF && buf[1] == 0xBB && buf[2] == 0xBF) || // UTF-8
(buf[0] == 0 && buf[1] == 0 && buf[2] == 0xFE && buf[3] == 0xFF) || // UCS-4BE
(buf[0] == 0 && buf[1] == 0 && buf[2] == 0xFF && buf[3] == 0xFE)) { // UCS-4
(buf[0] == 0 && buf[1] == 0 && buf[2] == 0xFE && buf[3] == 0xFF)) { // UCS-4, Big Endian
mContentType = TEXT_PLAIN;
return PR_TRUE;

Просмотреть файл

@ -2402,8 +2402,10 @@ nsParser::OnStartRequest(nsIRequest *request, nsISupports* aContext)
}
#define UTF16_BOM "UTF-16"
#define UTF16_BE "UTF-16BE"
#define UTF16_LE "UTF-16LE"
#define UCS4_BOM "UTF-32"
#define UCS4_BE "UTF-32BE"
#define UCS4_LE "UTF-32LE"
#define UCS4_2143 "X-ISO-10646-UCS-4-2143"
@ -2441,7 +2443,7 @@ DetectByteOrderMark(const unsigned char* aBytes, PRInt32 aLen,
// 00 00
if((0xFE==aBytes[2]) && (0xFF==aBytes[3])) {
// 00 00 FE FF UCS-4, big-endian machine (1234 order)
oCharset.Assign(UCS4_BE);
oCharset.Assign(UCS4_BOM);
} else if((0x00==aBytes[2]) && (0x3C==aBytes[3])) {
// 00 00 00 3C UCS-4, big-endian machine (1234 order)
oCharset.Assign(UCS4_BE);
@ -2572,7 +2574,7 @@ DetectByteOrderMark(const unsigned char* aBytes, PRInt32 aLen,
oCharset.Assign(UCS4_3412);
} else {
// FE FF UTF-16, big-endian
oCharset.Assign(UTF16_BE);
oCharset.Assign(UTF16_BOM);
}
oCharsetSource= kCharsetFromByteOrderMark;
}
@ -2581,11 +2583,11 @@ DetectByteOrderMark(const unsigned char* aBytes, PRInt32 aLen,
if(0xFE==aBytes[1]) {
if(0x00==aBytes[2] && 0x00==aBytes[3])
// FF FE 00 00 UTF-32, little-endian
oCharset.Assign(UCS4_LE);
oCharset.Assign(UCS4_BOM);
else
// FF FE
// UTF-16, little-endian
oCharset.Assign(UTF16_LE);
oCharset.Assign(UTF16_BOM);
oCharsetSource= kCharsetFromByteOrderMark;
}
break;
@ -2780,6 +2782,7 @@ ParserWriteFunc(nsIInputStream* in,
(!preferred.EqualsLiteral("UTF-16") &&
!preferred.EqualsLiteral("UTF-16BE") &&
!preferred.EqualsLiteral("UTF-16LE") &&
!preferred.EqualsLiteral("UTF-32") &&
!preferred.EqualsLiteral("UTF-32BE") &&
!preferred.EqualsLiteral("UTF-32LE")))) {
guess = preferred;

Просмотреть файл

@ -29,7 +29,7 @@ intl.charsetmenu.browser.more3=GB2312, x-gbk, gb18030, HZ-GB-2312, ISO-2022-CN,
intl.charsetmenu.browser.more4=armscii-8, GEOSTD8, TIS-620, ISO-8859-11, windows-874, IBM857, ISO-8859-9, x-mac-turkish, windows-1254, x-viet-tcvn5712, VISCII, x-viet-vps, windows-1258, x-mac-devanagari, x-mac-gujarati, x-mac-gurmukhi
intl.charsetmenu.browser.more5=ISO-8859-6, windows-1256, IBM864, x-mac-arabic, x-mac-farsi, ISO-8859-8-I, windows-1255, ISO-8859-8, IBM862, x-mac-hebrew
# Localization Note: Never change the following entry.
intl.charsetmenu.browser.unicode=UTF-8, UTF-16LE, UTF-16BE, UTF-32LE, UTF-32BE
intl.charsetmenu.browser.unicode=UTF-8, UTF-16LE, UTF-16BE, UTF-32, UTF-32LE, UTF-32BE
intl.charset.default=ISO-8859-1
intl.charset.detector=
intl.charsetmenu.mailedit=ISO-8859-1, ISO-8859-15, ISO-8859-6, armscii-8, geostd8, ISO-8859-13, ISO-8859-14, ISO-8859-2, GB2312, GB18030, Big5, KOI8-R, windows-1251, KOI8-U, ISO-8859-7, ISO-8859-8-I, windows-1255, ISO-2022-JP, EUC-KR, ISO-8859-10, ISO-8859-3, TIS-620, ISO-8859-9, UTF-8, VISCII