Mishandling of a 'stand-alone' octet with the MSB set in CJK (multibyte) encodings

r = ftang sr=erik,
p =  Jungshik Shin
This commit is contained in:
shanjian%netscape.com 2001-03-21 22:50:05 +00:00
Родитель bffde03348
Коммит 1c3ce9176c
3 изменённых файлов: 75 добавлений и 20 удалений

Просмотреть файл

@ -83,7 +83,8 @@ NS_IMETHODIMP nsGB2312ToUnicodeV2::ConvertNoBuff(const char* aSrc,
break;
}
if ( *aSrc & 0x80 )
// we need to handle 0xa0 specially even though it is not a legal GB2312 code point
if ( (PRUint8)0xA0 < (PRUint8)*aSrc && (PRUint8)*aSrc < (PRUint8)0xFF)
{
if(i+1 >= iSrcLength)
{
@ -91,22 +92,39 @@ NS_IMETHODIMP nsGB2312ToUnicodeV2::ConvertNoBuff(const char* aSrc,
break;
}
// The source is a GBCode
left = pSrcDBCode->leftbyte;
right = pSrcDBCode->rightbyte;
iGBKToUnicodeIndex = (left - 0x0081)*0x00BF + (right - 0x0040);
*pDestDBCode = GBKToUnicodeTable[iGBKToUnicodeIndex];
// To make sure, the second byte has to be checked as well
// The valid 2nd byte range: [0xA1,0xFE]
if ( (PRUint8)0xA0 < right && right < (PRUint8)0xFF )
{
// Valid GB 2312 code point
iGBKToUnicodeIndex = (left - 0x0081)*0x00BF + (right - 0x0040);
*pDestDBCode = GBKToUnicodeTable[iGBKToUnicodeIndex];
aSrc += 2;
i++;
}
else
{
// Invalid GB 2312 code point
aSrc += 2;
i++;
*pDestDBCode = (PRUnichar)0xfffd;
aSrc++;
}
}
else
{
// The source is an ASCII
*pDestDBCode = (PRUnichar) ( ((char )(*aSrc)) & 0x00ff);
aSrc++;
if ((PRUint8)*aSrc <= (PRUint8) 0x9f && (PRUint8)*aSrc >= (PRUint8) 0x80)
*pDestDBCode = (PRUnichar)0xfffd;
else
// The source is an ASCII
*pDestDBCode = (PRUnichar) ( ((char )(*aSrc)) & 0x00ff);
aSrc++;
}
iDestlen++;

Просмотреть файл

@ -70,23 +70,43 @@ NS_IMETHODIMP nsGBKToUnicode::ConvertNoBuff(const char* aSrc,
break;
}
if ( *aSrc & 0x80 )
// The valid range for the 1st byte is [0x81,0xFE]
if ( (PRUint8) 0x80 < (PRUint8)*aSrc && (PRUint8)*aSrc < (PRUint8)0xff )
{
if(i+1 >= iSrcLength)
{
rv = NS_OK_UDEC_MOREINPUT;
break;
}
// The source is a GBCode
left = pSrcDBCode->leftbyte;
right = pSrcDBCode->rightbyte;
// To make sure, the second byte has to be checked as well.
// In GBK, the valid second byte ranges are [0x40,0x7E] and [0x80,0xFE]
if ( right >= (PRUint8)0x40 && (right & 0x7f) != (PRUint8)0x7F)
{
// Valid GBK code
iGBKToUnicodeIndex = (left - 0x0081)*0x00BF + (right - 0x0040);
*pDestDBCode = GBKToUnicodeTable[iGBKToUnicodeIndex];
aSrc += 2;
i++;
}
else if ( left == (PRUint8)0xA0 )
{
// stand-alone (not followed by a valid second byte) 0xA0 !
// treat it as valid a la Netscape 4.x
*pDestDBCode = (PRUnichar) ( ((char )(*aSrc)) & 0x00ff);
aSrc++;
}
iGBKToUnicodeIndex = (left - 0x0081)*0x00BF + (right - 0x0040);
*pDestDBCode = GBKToUnicodeTable[iGBKToUnicodeIndex];
else
{
// Invalid GBK code point (second byte must be in [0x40,0x7E] or [0x80,0xFE])
*pDestDBCode = (PRUnichar)0xfffd;
aSrc++;
}
aSrc += 2;
i++;
}
else
{

Просмотреть файл

@ -100,7 +100,7 @@ NS_IMETHODIMP nsShiftJISToUnicode::Convert(
switch(mState)
{
case 0:
if(*src & 0x80)
if(*src & 0x80 && *src != (unsigned char)0xa0)
{
mData = fbIdx[*src & 0x7F];
if(mData < 0xE000 )
@ -135,6 +135,11 @@ NS_IMETHODIMP nsShiftJISToUnicode::Convert(
PRUint8 off = sbIdx[*src];
if(0xFF == off) {
*dest++ = 0xFFFD;
// if the first byte is valid for SJIS but the second is not,
// and that second byte is a US-ASCII character below 0x40,
// emit it as well instead of swallowing it
if ( ! (*src & 0xc0) )
*dest++ = (PRUnichar) *src;
} else {
*dest++ = gJis0208map[mData+off];
}
@ -149,6 +154,9 @@ NS_IMETHODIMP nsShiftJISToUnicode::Convert(
PRUint8 off = sbIdx[*src];
if(0xFF == off) {
*dest++ = 0xFFFD;
// see the comment above for mState=1
if ( ! (*src & 0xc0) )
*dest++ = (PRUnichar) *src;
} else {
*dest++ = mData + off;
}
@ -272,7 +280,7 @@ NS_IMETHODIMP nsEUCJPToUnicodeV2::Convert(
switch(mState)
{
case 0:
if(*src & 0x80)
if(*src & 0x80 && *src != (unsigned char)0xa0)
{
mData = fbIdx[*src & 0x7F];
if(mData != 0xFFFD )
@ -304,7 +312,12 @@ NS_IMETHODIMP nsEUCJPToUnicodeV2::Convert(
{
PRUint8 off = sbIdx[*src];
if(0xFF == off) {
*dest++ = 0xFFFD;
*dest++ = 0xFFFD;
// if the first byte is valid for EUC-JP but the second is not,
// and that second byte is a US-ASCII character below 0x40,
// emit it as well instead of swallowing it
if ( ! (*src & 0xc0) )
*dest++ = (PRUnichar) *src;;
} else {
*dest++ = gJis0208map[mData+off];
}
@ -319,7 +332,11 @@ NS_IMETHODIMP nsEUCJPToUnicodeV2::Convert(
if((0xA1 <= *src) && (*src <= 0xDF)) {
*dest++ = (0xFF61-0x00A1) + *src;
} else {
*dest++ = 0xFFFD;
*dest++ = 0xFFFD;
// if 0x8e is not followed by a valid JIS X 0201 byte
// but by a valid US-ASCII, save it instead of eating it up.
if ( (PRUint8)*src < (PRUint8)0x7f )
*dest++ = (PRUnichar) *src;;
}
if(dest >= destEnd)
goto error1;