Bug 97224

Adding a Unicode byte check in the UTF8 converter. Checking in for ftang. /r=yokoyama, /sr=waterson
2001-09-22 00:18:39 +00:00 · 2001-09-22 00:18:39 +00:00 · 1c8b45f751
--- a/intl/uconv/src/nsUTF8ToUnicode.cpp
+++ b/intl/uconv/src/nsUTF8ToUnicode.cpp
@ -77,6 +77,7 @@ NS_IMETHODIMP nsUTF8ToUnicode::GetMaxLength(const char * aSrc,

 	mState = 0;			// cached expected number of bytes per UTF8 character sequence
 	mUcs4  = 0;			// cached Unicode character
+  mBytes = 1;
 	return NS_OK;

 }
@ -108,52 +109,51 @@ NS_IMETHODIMP nsUTF8ToUnicode::GetMaxLength(const char * aSrc,
         if( 0 == (0x80 & (*in))) {
             // ASCII
             *out++ = (PRUnichar)*in;
+        mBytes =1;
         } else if( 0xC0 == (0xE0 & (*in))) {
             // 2 bytes UTF8
             mUcs4 = (PRUint32)(*in);
             mUcs4 = (mUcs4 << 6) & 0x000007C0L;
             mState=1;
+        mBytes =2;
         } else if( 0xE0 == (0xF0 & (*in))) {
 			 // 3 bytes UTF8
             mUcs4 = (PRUint32)(*in);
             mUcs4 = (mUcs4 << 12) & 0x0000F000L;
             mState=2;
+        mBytes =3;
         } else if( 0xF0 == (0xF8 & (*in))) {
 			 // 4 bytes UTF8
             mUcs4 = (PRUint32)(*in);
             mUcs4 = (mUcs4 << 18) & 0x001F0000L;
             mState=3;
+        mBytes =4;
         } else if( 0xF8 == (0xFC & (*in))) {
 			 // 5 bytes UTF8
             mUcs4 = (PRUint32)(*in);
             mUcs4 = (mUcs4 << 24) & 0x03000000L;
             mState=4;
+        mBytes =5;
         } else if( 0xFC == (0xFE & (*in))) {
 			 // 6 bytes UTF8
             mUcs4 = (PRUint32)(*in);
             mUcs4 = (mUcs4 << 30) & 0x40000000L;
             mState=5;
+        mBytes =6;
         } else {
-			 
 			 //NS_ASSERTION(0, "The input string is not in utf8");
-
 	  		 //unexpected octet, put in a replacement char, 
 			 //flush and refill the buffer, reset state
 			 res = NS_ERROR_UNEXPECTED;
 			 break;
-
         }
-
 	 } else {
-
-		 if(0x80 == (0xC0 & (*in)))
-         {
+      if(0x80 == (0xC0 & (*in))) {
             PRUint32 tmp = (*in);
-             int shift = (mState-1) * 6;
+        PRUint32 shift = (mState-1) * 6;
             tmp = (tmp << shift ) & ( 0x0000003FL << shift);
             mUcs4 |= tmp;
-			 if(0 == --mState)
-             {
+        if(0 == --mState) {
                 if(mUcs4 >= 0x00010000) {
                    if(mUcs4 >= 0x00110000) {
                      *out++ = 0xFFFD;
@ -163,24 +163,32 @@ NS_IMETHODIMP nsUTF8ToUnicode::GetMaxLength(const char * aSrc,
                      *out++ = 0xDC00 | (0x000003FF & mUcs4);
                    }
                 } else {
+            // from Unicode 3.1, non-shortest form is illegal 
+            if(((2==mBytes) && (mUcs4 < 0x0080)) ||
+               ((3==mBytes) && (mUcs4 < 0x0800)) ||
+               ((4==mBytes) && (mUcs4 < 0x1000)) ||
+                (5==mBytes) ||
+                (6==mBytes)) 
+            {
+              res = NS_ERROR_UNEXPECTED;
+              break;
+            } 
+
                    if( 0xfeff != mUcs4 ) // ignore BOM
+            {  
                      *out++ = mUcs4;
                 }
-                 
+          }
 				 //initialize UTF8 cache
 				 Reset();
             }
-
         } else {
-
 			 //NS_ASSERTION(0, "The input string is not in utf8");
-	
 	  		 //unexpected octet, put in a replacement char, 
 			 //flush and refill the buffer, reset state
                         in--;
 			 res = NS_ERROR_UNEXPECTED;
 			 break;
-
         }
     }
   }
--- a/intl/uconv/src/nsUTF8ToUnicode.h
+++ b/intl/uconv/src/nsUTF8ToUnicode.h
@ -58,6 +58,7 @@ protected:

   PRUint32 mState;	// cached expected number of bytes per UTF8 character sequence
   PRUint32 mUcs4;	// cached Unicode character
+   PRUint32 mBytes;

  //--------------------------------------------------------------------
  // Subclassing of nsDecoderSupport class [declaration]