#64235, 25037

Fix mishandling of a 'stand-alone' octet with MSB set in CJK (multibyte) encodings. r=ftang, sr=erik, p=Jungshik Shin
2001-03-21 22:50:05 +00:00 · 2001-03-21 22:50:05 +00:00 · 1c3ce9176c
--- a/intl/uconv/ucvcn/nsGB2312ToUnicodeV2.cpp
+++ b/intl/uconv/ucvcn/nsGB2312ToUnicodeV2.cpp
@ -83,7 +83,8 @@ NS_IMETHODIMP nsGB2312ToUnicodeV2::ConvertNoBuff(const char* aSrc,
         break;
      }
 		
-      if ( *aSrc & 0x80 )
+      // we need to handle 0xa0 specially even though it is not a legal GB2312 code point
+      if ( (PRUint8)0xA0 < (PRUint8)*aSrc && (PRUint8)*aSrc < (PRUint8)0xFF)
        {
          if(i+1 >= iSrcLength)
          {
@ -91,22 +92,39 @@ NS_IMETHODIMP nsGB2312ToUnicodeV2::ConvertNoBuff(const char* aSrc,
            break;
          }
          
-		  // The source is a GBCode
+
     
          left = pSrcDBCode->leftbyte; 
          right = pSrcDBCode->rightbyte;
-
-          iGBKToUnicodeIndex = (left - 0x0081)*0x00BF + (right - 0x0040);  
-          *pDestDBCode = GBKToUnicodeTable[iGBKToUnicodeIndex];
+          // To make sure, the second byte has to be checked as well
+          // The valid 2nd byte range: [0xA1,0xFE]
+          if ( (PRUint8)0xA0 < right && right < (PRUint8)0xFF ) 
+            {
+              // Valid GB 2312 code point
+              iGBKToUnicodeIndex = (left - 0x0081)*0x00BF + (right - 0x0040);  
+              *pDestDBCode = GBKToUnicodeTable[iGBKToUnicodeIndex];
+              aSrc += 2;
+              i++;
+            }
+          else 
+            {
+              // Invalid GB 2312 code point 
          
-          aSrc += 2;
-          i++;
+              *pDestDBCode = (PRUnichar)0xfffd;
+              aSrc++;
+              
+            }
 		}
      else
 		{
-          // The source is an ASCII
-          *pDestDBCode = (PRUnichar) ( ((char )(*aSrc)) & 0x00ff);
-          aSrc++;
+           if ((PRUint8)*aSrc <= (PRUint8) 0x9f && (PRUint8)*aSrc >= (PRUint8) 0x80)
+             *pDestDBCode = (PRUnichar)0xfffd;
+           else
+             // The source is an ASCII
+             *pDestDBCode = (PRUnichar) ( ((char )(*aSrc)) & 0x00ff);
+
+           aSrc++;
+          
 		}

      iDestlen++;
--- a/intl/uconv/ucvcn/nsGBKToUnicode.cpp
+++ b/intl/uconv/ucvcn/nsGBKToUnicode.cpp
@ -70,23 +70,43 @@ NS_IMETHODIMP nsGBKToUnicode::ConvertNoBuff(const char* aSrc,
         break;
 		}
      
-      if ( *aSrc & 0x80 )
+      // The valid range for the 1st byte is [0x81,0xFE] 
+      if ( (PRUint8) 0x80 < (PRUint8)*aSrc && (PRUint8)*aSrc < (PRUint8)0xff )
 		{
          if(i+1 >= iSrcLength) 
          {
            rv = NS_OK_UDEC_MOREINPUT;
            break;
          }
-		  // The source is a GBCode
+

          left = pSrcDBCode->leftbyte;  
          right = pSrcDBCode->rightbyte;
+          // To make sure, the second byte has to be checked as well.
+          // In GBK, the second byte range is [0x40,0x7E] and [0x80,0XFE]
+          if ( right >= (PRUint8)0x40 && (right & 0x7f) != (PRUint8)0x7F) 
+            {
+              // Valid GBK code
+              iGBKToUnicodeIndex = (left - 0x0081)*0x00BF + (right - 0x0040);  
+              *pDestDBCode = GBKToUnicodeTable[iGBKToUnicodeIndex];
+              aSrc += 2;
+              i++;
+            }
+          else if ( left == (PRUint8)0xA0 )
+            {
+              // stand-alone (not followed by a valid second byte) 0xA0 !
+              // treat it as valid a la Netscape 4.x
+              *pDestDBCode = (PRUnichar) ( ((char )(*aSrc)) & 0x00ff);
+              aSrc++;
+            }
          
-          iGBKToUnicodeIndex = (left - 0x0081)*0x00BF + (right - 0x0040);  
-          *pDestDBCode = GBKToUnicodeTable[iGBKToUnicodeIndex];
+          else 
+            {
+              // Invalid GBK code point (second byte should be 0x40 or higher)
+              *pDestDBCode = (PRUnichar)0xfffd;
+              aSrc++;
+            }
          
-          aSrc += 2;
-          i++;
 		}
      else
 		{
--- a/intl/uconv/ucvja/nsJapaneseToUnicode.cpp
+++ b/intl/uconv/ucvja/nsJapaneseToUnicode.cpp
@ -100,7 +100,7 @@ NS_IMETHODIMP nsShiftJISToUnicode::Convert(
       switch(mState)
       {
          case 0:
-          if(*src & 0x80)
+          if(*src & 0x80 && *src != (unsigned char)0xa0)
          {
            mData = fbIdx[*src & 0x7F];
            if(mData < 0xE000 )
@ -135,6 +135,11 @@ NS_IMETHODIMP nsShiftJISToUnicode::Convert(
            PRUint8 off = sbIdx[*src];
            if(0xFF == off) {
               *dest++ = 0xFFFD;
+               // if the first byte is valid for SJIS but the second 
+               // is not while being a valid US-ASCII(i.e. < 0x40), save it
+               // instead of eating it up !
+               if ( ! (*src & 0xc0)  )
+                 *dest++ = (PRUnichar) *src;
            } else {
               *dest++ = gJis0208map[mData+off];
            }
@ -149,6 +154,9 @@ NS_IMETHODIMP nsShiftJISToUnicode::Convert(
            PRUint8 off = sbIdx[*src];
            if(0xFF == off) {
               *dest++ = 0xFFFD;
+               // see the comment above for mstate=1
+               if ( ! (*src & 0xc0)  )
+                 *dest++ = (PRUnichar) *src;
            } else {
               *dest++ = mData + off;
            }
@ -272,7 +280,7 @@ NS_IMETHODIMP nsEUCJPToUnicodeV2::Convert(
       switch(mState)
       {
          case 0:
-          if(*src & 0x80)
+          if(*src & 0x80  && *src != (unsigned char)0xa0)
          {
            mData = fbIdx[*src & 0x7F];
            if(mData != 0xFFFD )
@ -304,7 +312,12 @@ NS_IMETHODIMP nsEUCJPToUnicodeV2::Convert(
          {
            PRUint8 off = sbIdx[*src];
            if(0xFF == off) {
-               *dest++ = 0xFFFD;
+              *dest++ = 0xFFFD;
+               // if the first byte is valid for EUC-JP but the second 
+               // is not while being a valid US-ASCII(i.e. < 0x40), save it
+               // instead of eating it up !
+               if ( ! (*src & 0xc0)  )
+                 *dest++ = (PRUnichar) *src;;
            } else {
               *dest++ = gJis0208map[mData+off];
            }
@ -319,7 +332,11 @@ NS_IMETHODIMP nsEUCJPToUnicodeV2::Convert(
            if((0xA1 <= *src) && (*src <= 0xDF)) {
              *dest++ = (0xFF61-0x00A1) + *src;
            } else {
-              *dest++ = 0xFFFD;
+              *dest++ = 0xFFFD;             
+              // if 0x8e is not followed by a valid JIS X 0201 byte
+              // but by a valid US-ASCII, save it instead of eating it up.
+              if ( (PRUint8)*src < (PRUint8)0x7f )
+                 *dest++ = (PRUnichar) *src;;
            }
            if(dest >= destEnd)
              goto error1;