From 5db97f9537cc90e17a13ce54dd46fea5017595a3 Mon Sep 17 00:00:00 2001
From: Jonas Sicking <jonas@sicking.cc>
Date: Tue, 23 Feb 2010 09:38:10 -0800
Subject: [PATCH] Bug 422868 part 1: Fix UTF8 <-> UTF16 conversion code to deal
 with all encoding errors consistently. r=smontagu

---
 xpcom/string/public/nsUTF8Utils.h    | 43 +++++++++++++--
 xpcom/string/src/nsReadableUtils.cpp | 82 +++++++---------------------
 2 files changed, 57 insertions(+), 68 deletions(-)

diff --git a/xpcom/string/public/nsUTF8Utils.h b/xpcom/string/public/nsUTF8Utils.h
index be73a9ed0678..1b3d9a9cbcde 100644
--- a/xpcom/string/public/nsUTF8Utils.h
+++ b/xpcom/string/public/nsUTF8Utils.h
@@ -387,6 +387,8 @@ class ConvertUTF8toUTF16
 
     size_t Length() const { return mBuffer - mStart; }
 
+    PRBool ErrorEncountered() const { return mErrorEncountered; }
+
     void NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
       {
         if ( mErrorEncountered )
@@ -489,18 +491,47 @@ class CalculateUTF8Length
             else if ( UTF8traits::is3byte(*p) )
                 p += 3;
             else if ( UTF8traits::is4byte(*p) ) {
-                p += 4;
                 // Because a UTF-8 sequence of 4 bytes represents a codepoint
                 // greater than 0xFFFF, it will become a surrogate pair in the
                 // UTF-16 string, so add 1 more to mLength.
                 // This doesn't happen with is5byte and is6byte because they
                 // are illegal UTF-8 sequences (greater than 0x10FFFF) so get
                 // converted to a single replacement character.
-                //
-                // XXX: if the 4-byte sequence is an illegal non-shortest form,
-                //      it also gets converted to a replacement character, so
-                //      mLength will be off by one in this case.
-                ++mLength;
+
+                // However, there is one case when a 4 byte UTF-8 sequence will
+                // only generate 2 UTF-16 bytes. If we have a properly encoded
+                // sequence, but with an invalid value (too small or too big),
+                // that will result in a replacement character being written
+                // This replacement character is encoded as just 1 single
+                // UTF-16 character, which is 2 bytes.
+
+                // The below code therefore only adds 1 to mLength if the UTF8
+                // data will produce a decoded character which is greater than
+                // or equal to 0x010000 and less than 0x0110000.
+
+                // A 4byte UTF8 character is encoded as
+                // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+                // Bit 1-3 on the first byte, and bit 5-6 on the second byte,
+                // map to bit 17-21 in the final result. If these bits are
+                // between 0x01 and 0x11, that means that the final result is
+                // between 0x010000 and 0x110000. The below code reads these
+                // bits out and assigns them to c, but shifted up 4 bits to
+                // avoid having to shift twice.
+
+                // It doesn't matter what to do in the case where p + 4 > end
+                // since no UTF16 characters will be written in that case by
+                // ConvertUTF8toUTF16. Likewise it doesn't matter what we do if
+                // any of the surrogate bits are wrong since no UTF16
+                // characters will be written in that case either.
+
+                if (p + 4 <= end) {
+                  PRUint32 c = ((PRUint32)(p[0] & 0x07)) << 6 |
+                               ((PRUint32)(p[1] & 0x30));
+                  if (c >= 0x010 && c < 0x110)
+                    ++mLength;
+                }
+
+                p += 4;
             }
             else if ( UTF8traits::is5byte(*p) )
                 p += 5;
diff --git a/xpcom/string/src/nsReadableUtils.cpp b/xpcom/string/src/nsReadableUtils.cpp
index cb0c89fa3fd1..02f20bfd92c2 100644
--- a/xpcom/string/src/nsReadableUtils.cpp
+++ b/xpcom/string/src/nsReadableUtils.cpp
@@ -209,40 +209,15 @@ AppendUTF16toUTF8( const nsAString& aSource, nsACString& aDest )
         if(!SetLengthForWritingC(aDest, old_dest_length + count))
             return;
 
-        nsACString::iterator dest;
-        aDest.BeginWriting(dest);
+        // All ready? Time to convert
 
-        dest.advance(old_dest_length);
+        ConvertUTF16toUTF8 converter(aDest.BeginWriting() + old_dest_length);
+        copy_string(aSource.BeginReading(source_start),
+                    aSource.EndReading(source_end), converter);
 
-        if (count <= (PRUint32)dest.size_forward())
-          {
-            // aDest has enough room in the fragment just past the end
-            // of its old data that it can hold what we're about to
-            // append. Append using copy_string().
-
-            // All ready? Time to convert
-
-            ConvertUTF16toUTF8 converter(dest.get());
-            copy_string(aSource.BeginReading(source_start),
-                        aSource.EndReading(source_end), converter);
-
-            if (converter.Size() != count)
-              {
-                NS_ERROR("Input invalid or incorrect length was calculated");
-
-                aDest.SetLength(old_dest_length);
-              }
-          }
-        else
-          {
-            // This isn't the fastest way to do this, but it gets
-            // complicated to convert UTF16 into a fragmented UTF8
-            // string, so we'll take the easy way out here in this
-            // rare situation.
-
-            aDest.Replace(old_dest_length, count,
-                          NS_ConvertUTF16toUTF8(aSource));
-          }
+        NS_ASSERTION(converter.Size() == count,
+                     "Unexpected disparity between CalculateUTF8Size and "
+                     "ConvertUTF16toUTF8");
       }
   }
 
@@ -257,46 +232,29 @@ AppendUTF8toUTF16( const nsACString& aSource, nsAString& aDest )
 
     PRUint32 count = calculator.Length();
 
+    // Avoid making the string mutable if we're appending an empty string
     if (count)
       {
         PRUint32 old_dest_length = aDest.Length();
 
         // Grow the buffer if we need to.
         if(!SetLengthForWriting(aDest, old_dest_length + count))
-            return;
+          return;
 
-        nsAString::iterator dest;
-        aDest.BeginWriting(dest);
+        // All ready? Time to convert
 
-        dest.advance(old_dest_length);
+        ConvertUTF8toUTF16 converter(aDest.BeginWriting() + old_dest_length);
+        copy_string(aSource.BeginReading(source_start),
+                    aSource.EndReading(source_end), converter);
 
-        if (count <= (PRUint32)dest.size_forward())
+        NS_ASSERTION(converter.ErrorEncountered() ||
+                     converter.Length() == count,
+                     "CalculateUTF8Length produced the wrong length");
+
+        if (converter.ErrorEncountered())
           {
-            // aDest has enough room in the fragment just past the end
-            // of its old data that it can hold what we're about to
-            // append. Append using copy_string().
-
-            // All ready? Time to convert
-
-            ConvertUTF8toUTF16 converter(dest.get());
-            copy_string(aSource.BeginReading(source_start),
-                        aSource.EndReading(source_end), converter);
-
-            if (converter.Length() != count)
-              {
-                NS_ERROR("Input wasn't UTF8 or incorrect length was calculated");
-                aDest.SetLength(old_dest_length);
-              }
-          }
-        else
-          {
-            // This isn't the fastest way to do this, but it gets
-            // complicated to convert parts of a UTF8 string into a
-            // UTF16 string, so we'll take the easy way out here in
-            // this rare situation.
-
-            aDest.Replace(old_dest_length, count,
-                          NS_ConvertUTF8toUTF16(aSource));
+            NS_ERROR("Input wasn't UTF8 or incorrect length was calculated");
+            aDest.SetLength(old_dest_length);
           }
       }
   }