зеркало из https://github.com/mozilla/gecko-dev.git
Bug 494099, HZ-GB-2312 converter reads beyond input buffer and omits characters at block boundaries. r=VYV03354@nifty.ne.jp
This commit is contained in:
Родитель
451e5accfb
Коммит
306b8cb3e6
|
@ -0,0 +1,67 @@
|
|||
/* Test case for bug 494099
|
||||
*
|
||||
* Uses nsIConverterInputStream to decode HZ-GB-2312 text.
|
||||
*
|
||||
* Sample text is:
|
||||
* 问他谁是傻瓜了5分钟。但是,他谁不要求仍然是一个傻瓜永远
|
||||
* 我听见 我忘记; 我看见 我记住; 我做 我了解。
|
||||
*/
|
||||
|
||||
const sample = "~{NJK{K-JGI59OAK~}5~{7VVS!#5+JG#,K{K-2;R*GsHTH;JGR;8vI59OS@T6!#~} ~{NRL}<{~} ~{NRM|<G~}; ~{NR?4<{~} ~{NR<GW!~}; ~{NRWv~} ~{NRAK=b!#~}";
|
||||
|
||||
const expected = "\u95EE\u4ED6\u8C01\u662F\u50BB\u74DC\u4E865\u5206\u949F\u3002\u4F46\u662F\uFF0C\u4ED6\u8C01\u4E0D\u8981\u6C42\u4ECD\u7136\u662F\u4E00\u4E2A\u50BB\u74DC\u6C38\u8FDC\u3002 \u6211\u542C\u89C1 \u6211\u5FD8\u8BB0; \u6211\u770B\u89C1 \u6211\u8BB0\u4F4F; \u6211\u505A \u6211\u4E86\u89E3\u3002";
|
||||
|
||||
const charset="HZ-GB-2312";
|
||||
|
||||
/**
 * Decodes `sample` (HZ-GB-2312 text served from a data: URI) through an
 * nsIConverterInputStream created with the given buffer length, then checks
 * that the decoded text equals `expected`.
 *
 * On a mismatch with equal lengths, every differing code unit is dumped as
 * "index: actual != expected" (hex) before the final equality check fails.
 *
 * @param bufferLength  third argument passed to the converter input stream's
 *                      init(); presumably the converter's internal buffer
 *                      size -- confirm against nsIConverterInputStream.idl.
 * @throws "not line input stream" if the converter stream does not implement
 *         nsIUnicharLineInputStream.
 */
function testCase(bufferLength)
{
  var dataURI = "data:text/plain;charset=" + charset + "," + sample;

  var IOService = Components.Constructor("@mozilla.org/network/io-service;1",
                                         "nsIIOService");
  var ConverterInputStream =
      Components.Constructor("@mozilla.org/intl/converter-input-stream;1",
                             "nsIConverterInputStream",
                             "init");

  var ios = new IOService();
  var channel = ios.newChannel(dataURI, "", null);
  var testInputStream = channel.open();
  var testConverter = new ConverterInputStream(testInputStream,
                                               charset,
                                               bufferLength,
                                               0xFFFD);

  if (!(testConverter instanceof
        Components.interfaces.nsIUnicharLineInputStream))
    throw "not line input stream";

  var outStr = "";
  var more;
  do {
    // read the line and check for eof
    var line = {};
    more = testConverter.readLine(line);
    outStr += line.value;
  } while (more);

  if (outStr != expected) {
    dump("Failed with bufferLength = " + bufferLength + "\n");
    if (outStr.length == expected.length) {
      // Declare the index locally; the undeclared `i` used previously leaked
      // an implicit global into the test scope.
      for (var i = 0; i < outStr.length; ++i) {
        if (outStr.charCodeAt(i) != expected.charCodeAt(i)) {
          dump(i + ": " + outStr.charCodeAt(i).toString(16) + " != " + expected.charCodeAt(i).toString(16) + "\n");
        }
      }
    }
  }

  // escape the strings before comparing for better readability
  do_check_eq(escape(outStr), escape(expected));
}
|
||||
|
||||
/**
 * Test entry point: runs testCase() once for each converter buffer length,
 * in order (32, then 33).
 */
function run_test()
{
  var bufferLengths = [32, 33];
  for (var idx = 0; idx < bufferLengths.length; ++idx) {
    testCase(bufferLengths[idx]);
  }
}
|
|
@ -68,19 +68,23 @@
|
|||
//----------------------------------------------------------------------
|
||||
// Subclassing of nsTablesDecoderSupport class [implementation]
|
||||
|
||||
#define HZ_STATE_GB 1
|
||||
#define HZ_STATE_ASCII 2
|
||||
#define HZ_STATE_TILD 3
|
||||
#define HZ_STATE_GB 1
|
||||
#define HZ_STATE_ASCII 2
|
||||
#define HZ_STATE_ODD_BYTE_FLAG 0x80
|
||||
#define HZLEAD1 '~'
|
||||
#define HZLEAD2 '{'
|
||||
#define HZLEAD3 '}'
|
||||
#define HZLEAD4 '\n'
|
||||
#define HZ_ODD_BYTE_STATE (mHZState & (HZ_STATE_ODD_BYTE_FLAG))
|
||||
#define HZ_ENCODING_STATE (mHZState & ~(HZ_STATE_ODD_BYTE_FLAG))
|
||||
|
||||
// Constructs the decoder in its initial state: ASCII mode, an empty GB run,
// and no pending first byte of a two-byte sequence.
nsHZToUnicode::nsHZToUnicode()
  : nsBufferDecoderSupport(1),
    mHZState(HZ_STATE_ASCII),  // per HZ spec, default to ASCII state
    mRunLength(0),
    mOddByte(0)
{
}
|
||||
|
||||
//Overwriting the ConvertNoBuff() in nsUCvCnSupport.cpp.
|
||||
NS_IMETHODIMP nsHZToUnicode::ConvertNoBuff(
|
||||
const char* aSrc,
|
||||
|
@ -91,114 +95,98 @@ NS_IMETHODIMP nsHZToUnicode::ConvertNoBuff(
|
|||
PRInt32 i=0;
|
||||
PRInt32 iSrcLength = *aSrcLength;
|
||||
PRInt32 iDestlen = 0;
|
||||
PRUint8 ch1, ch2;
|
||||
nsresult res = NS_OK;
|
||||
*aSrcLength=0;
|
||||
for (i=0;i<iSrcLength;i++)
|
||||
{
|
||||
if ( iDestlen >= (*aDestLength) )
|
||||
{
|
||||
nsresult res = NS_OK;
|
||||
char oddByte = mOddByte;
|
||||
|
||||
for (i=0; i<iSrcLength; i++) {
|
||||
if (iDestlen >= (*aDestLength)) {
|
||||
res = NS_OK_UDEC_MOREOUTPUT;
|
||||
break;
|
||||
}
|
||||
if ( *aSrc & 0x80 ) // if it is a 8-bit byte
|
||||
{
|
||||
if (UINT8_IN_RANGE(0x81, aSrc[0], 0xFE) &&
|
||||
UINT8_IN_RANGE(0x40, aSrc[1], 0xFE)) {
|
||||
// The source is a 8-bit GBCode
|
||||
*aDest = mUtil.GBKCharToUnicode(aSrc[0], aSrc[1]);
|
||||
|
||||
char srcByte = *aSrc++;
|
||||
(*aSrcLength)++;
|
||||
if (!HZ_ODD_BYTE_STATE) {
|
||||
if (srcByte & 0x80 || srcByte == HZLEAD1 || HZ_ENCODING_STATE == HZ_STATE_GB) {
|
||||
oddByte = srcByte;
|
||||
mHZState |= HZ_STATE_ODD_BYTE_FLAG;
|
||||
} else {
|
||||
*aDest = UCS2_NO_MAPPING;
|
||||
*aDest++ = CAST_CHAR_TO_UNICHAR(srcByte);
|
||||
iDestlen++;
|
||||
}
|
||||
aSrc += 2;
|
||||
i++;
|
||||
iDestlen++;
|
||||
aDest++;
|
||||
*aSrcLength = i+1;
|
||||
continue;
|
||||
}
|
||||
// otherwise, it is a 7-bit byte
|
||||
// The source will be an ASCII or a 7-bit HZ code depending on ch1
|
||||
ch1 = *aSrc;
|
||||
ch2 = *(aSrc+1);
|
||||
if (ch1 == HZLEAD1 ) // if it is lead by '~'
|
||||
{
|
||||
switch (ch2)
|
||||
{
|
||||
case HZLEAD2:
|
||||
// we got a '~{'
|
||||
// we are switching to HZ state
|
||||
mHZState = HZ_STATE_GB;
|
||||
mRunLength = 0;
|
||||
aSrc += 2;
|
||||
i++;
|
||||
break;
|
||||
case HZLEAD3:
|
||||
// we got a '~}'
|
||||
// we are switching to ASCII state
|
||||
mHZState = HZ_STATE_ASCII;
|
||||
aSrc += 2;
|
||||
i++;
|
||||
if (mRunLength == 0) {
|
||||
*aDest = UCS2_NO_MAPPING;
|
||||
} else {
|
||||
if (oddByte & 0x80) { // if it is a 8-bit byte
|
||||
if (UINT8_IN_RANGE(0x81, oddByte, 0xFE) &&
|
||||
UINT8_IN_RANGE(0x40, srcByte, 0xFE)) {
|
||||
// The source is a 8-bit GBCode
|
||||
*aDest++ = mUtil.GBKCharToUnicode(oddByte, srcByte);
|
||||
} else {
|
||||
*aDest++ = UCS2_NO_MAPPING;
|
||||
}
|
||||
iDestlen++;
|
||||
// otherwise, it is a 7-bit byte
|
||||
// The source will be an ASCII or a 7-bit HZ code depending on oddByte
|
||||
} else if (oddByte == HZLEAD1) { // if it is lead by '~'
|
||||
switch (srcByte) {
|
||||
case HZLEAD2:
|
||||
// we got a '~{'
|
||||
// we are switching to HZ state
|
||||
mHZState = HZ_STATE_GB | HZ_ODD_BYTE_STATE;
|
||||
mRunLength = 0;
|
||||
break;
|
||||
|
||||
case HZLEAD3:
|
||||
// we got a '~}'
|
||||
// we are switching to ASCII state
|
||||
mHZState = HZ_STATE_ASCII | HZ_ODD_BYTE_STATE;
|
||||
if (mRunLength == 0) {
|
||||
*aDest++ = UCS2_NO_MAPPING;
|
||||
iDestlen++;
|
||||
}
|
||||
mRunLength = 0;
|
||||
break;
|
||||
|
||||
case HZLEAD1:
|
||||
// we got a '~~', process like an ASCII, but no state change
|
||||
*aDest++ = CAST_CHAR_TO_UNICHAR(srcByte);
|
||||
iDestlen++;
|
||||
aDest++;
|
||||
}
|
||||
mRunLength = 0;
|
||||
break;
|
||||
case HZLEAD1:
|
||||
// we got a '~~', process like an ASCII, but no state change
|
||||
aSrc++;
|
||||
*aDest = CAST_CHAR_TO_UNICHAR(*aSrc);
|
||||
aSrc++;
|
||||
i++;
|
||||
iDestlen++;
|
||||
aDest++;
|
||||
mRunLength++;
|
||||
break;
|
||||
case HZLEAD4:
|
||||
// we got a "~\n", it means maintain double byte mode cross lines, ignore the '~' itself
|
||||
// mHZState = HZ_STATE_GB;
|
||||
// I find that "~\n" should interpreted as line continuation without mode change
|
||||
// It should not be interpreted as line continuation with double byte mode on
|
||||
aSrc++;
|
||||
break;
|
||||
default:
|
||||
// undefined ESC sequence '~X' are ignored since this is a illegal combination
|
||||
aSrc += 2;
|
||||
*aDest = UCS2_NO_MAPPING;
|
||||
iDestlen++;
|
||||
aDest++;
|
||||
break;
|
||||
};
|
||||
continue;// go for next loop
|
||||
}
|
||||
// ch1 != '~'
|
||||
switch (mHZState)
|
||||
{
|
||||
case HZ_STATE_GB:
|
||||
// the following chars are HZ
|
||||
*aDest = mUtil.GBKCharToUnicode(aSrc[0]|0x80, aSrc[1]|0x80);
|
||||
aSrc += 2;
|
||||
i++;
|
||||
iDestlen++;
|
||||
aDest++;
|
||||
mRunLength++;
|
||||
break;
|
||||
|
||||
case HZLEAD4:
|
||||
// we got a "~\n", it means maintain double byte mode cross lines,
|
||||
// ignore the '~' itself
|
||||
// mHZState = HZ_STATE_GB;
|
||||
// I find that "~\n" should interpreted as line continuation
|
||||
// without mode change
|
||||
// It should not be interpreted as line continuation with double
|
||||
// byte mode on
|
||||
break;
|
||||
|
||||
default:
|
||||
// undefined ESC sequence '~X' are ignored since this is an
|
||||
// illegal combination
|
||||
*aDest++ = UCS2_NO_MAPPING;
|
||||
iDestlen++;
|
||||
break;
|
||||
}
|
||||
} else if (HZ_ENCODING_STATE == HZ_STATE_GB) {
|
||||
*aDest++ = mUtil.GBKCharToUnicode(oddByte|0x80, srcByte|0x80);
|
||||
mRunLength++;
|
||||
break;
|
||||
case HZ_STATE_ASCII:
|
||||
default:
|
||||
// default behavior also like an ASCII
|
||||
// when the source is an ASCII
|
||||
*aDest = CAST_CHAR_TO_UNICHAR(*aSrc);
|
||||
aSrc++;
|
||||
iDestlen++;
|
||||
aDest++;
|
||||
break;
|
||||
} else {
|
||||
NS_NOTREACHED("2-byte sequence that we don't know how to handle");
|
||||
*aDest++ = UCS2_NO_MAPPING;
|
||||
iDestlen++;
|
||||
}
|
||||
oddByte = 0;
|
||||
mHZState &= ~HZ_STATE_ODD_BYTE_FLAG;
|
||||
}
|
||||
*aSrcLength = i+1;
|
||||
}// for loop
|
||||
} // for loop
|
||||
mOddByte = HZ_ODD_BYTE_STATE ? oddByte : 0;
|
||||
*aDestLength = iDestlen;
|
||||
return NS_OK;
|
||||
return res;
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -71,6 +71,7 @@ protected:
|
|||
private:
|
||||
PRInt16 mHZState;
|
||||
PRUint32 mRunLength; // length of a run of 8-bit GB-encoded characters
|
||||
char mOddByte; // first byte of a multi-byte sequence from a previous buffer
|
||||
|
||||
};
|
||||
|
||||
|
|
Загрузка…
Ссылка в новой задаче