Bug 422868 part 1: Fix UTF8 <-> UTF16 conversion code to deal with all encoding errors consistently. r=smontagu

2010-02-23 09:38:10 -08:00 · 2010-02-23 09:38:10 -08:00 · 5db97f9537
--- a/xpcom/string/public/nsUTF8Utils.h
+++ b/xpcom/string/public/nsUTF8Utils.h
@ -387,6 +387,8 @@ class ConvertUTF8toUTF16

    size_t Length() const { return mBuffer - mStart; }

+    PRBool ErrorEncountered() const { return mErrorEncountered; }
+
    void NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
      {
        if ( mErrorEncountered )
@ -489,18 +491,47 @@ class CalculateUTF8Length
            else if ( UTF8traits::is3byte(*p) )
                p += 3;
            else if ( UTF8traits::is4byte(*p) ) {
-                p += 4;
                // Because a UTF-8 sequence of 4 bytes represents a codepoint
                // greater than 0xFFFF, it will become a surrogate pair in the
                // UTF-16 string, so add 1 more to mLength.
                // This doesn't happen with is5byte and is6byte because they
                // are illegal UTF-8 sequences (greater than 0x10FFFF) so get
                // converted to a single replacement character.
-                //
-                // XXX: if the 4-byte sequence is an illegal non-shortest form,
-                //      it also gets converted to a replacement character, so
-                //      mLength will be off by one in this case.
-                ++mLength;
+
+                // However, there is one case when a 4 byte UTF-8 sequence will
+                // only generate 2 UTF-16 bytes. If we have a properly encoded
+                // sequence, but with an invalid value (too small or too big),
+                // that will result in a replacement character being written
+                // This replacement character is encoded as just 1 single
+                // UTF-16 character, which is 2 bytes.
+
+                // The below code therefore only adds 1 to mLength if the UTF8
+                // data will produce a decoded character which is greater than
+                // or equal to 0x010000 and less than 0x0110000.
+
+                // A 4byte UTF8 character is encoded as
+                // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+                // Bit 1-3 on the first byte, and bit 5-6 on the second byte,
+                // map to bit 17-21 in the final result. If these bits are
+                // between 0x01 and 0x11, that means that the final result is
+                // between 0x010000 and 0x110000. The below code reads these
+                // bits out and assigns them to c, but shifted up 4 bits to
+                // avoid having to shift twice.
+
+                // It doesn't matter what to do in the case where p + 4 > end
+                // since no UTF16 characters will be written in that case by
+                // ConvertUTF8toUTF16. Likewise it doesn't matter what we do if
+                // any of the surrogate bits are wrong since no UTF16
+                // characters will be written in that case either.
+
+                if (p + 4 <= end) {
+                  PRUint32 c = ((PRUint32)(p[0] & 0x07)) << 6 |
+                               ((PRUint32)(p[1] & 0x30));
+                  if (c >= 0x010 && c < 0x110)
+                    ++mLength;
+                }
+
+                p += 4;
            }
            else if ( UTF8traits::is5byte(*p) )
                p += 5;
--- a/xpcom/string/src/nsReadableUtils.cpp
+++ b/xpcom/string/src/nsReadableUtils.cpp
@ -209,40 +209,15 @@ AppendUTF16toUTF8( const nsAString& aSource, nsACString& aDest )
        if(!SetLengthForWritingC(aDest, old_dest_length + count))
            return;

-        nsACString::iterator dest;
-        aDest.BeginWriting(dest);
+        // All ready? Time to convert

-        dest.advance(old_dest_length);
+        ConvertUTF16toUTF8 converter(aDest.BeginWriting() + old_dest_length);
+        copy_string(aSource.BeginReading(source_start),
+                    aSource.EndReading(source_end), converter);

-        if (count <= (PRUint32)dest.size_forward())
-          {
-            // aDest has enough room in the fragment just past the end
-            // of its old data that it can hold what we're about to
-            // append. Append using copy_string().
-
-            // All ready? Time to convert
-
-            ConvertUTF16toUTF8 converter(dest.get());
-            copy_string(aSource.BeginReading(source_start),
-                        aSource.EndReading(source_end), converter);
-
-            if (converter.Size() != count)
-              {
-                NS_ERROR("Input invalid or incorrect length was calculated");
-
-                aDest.SetLength(old_dest_length);
-              }
-          }
-        else
-          {
-            // This isn't the fastest way to do this, but it gets
-            // complicated to convert UTF16 into a fragmented UTF8
-            // string, so we'll take the easy way out here in this
-            // rare situation.
-
-            aDest.Replace(old_dest_length, count,
-                          NS_ConvertUTF16toUTF8(aSource));
-          }
+        NS_ASSERTION(converter.Size() == count,
+                     "Unexpected disparity between CalculateUTF8Size and "
+                     "ConvertUTF16toUTF8");
      }
  }

@ -257,46 +232,29 @@ AppendUTF8toUTF16( const nsACString& aSource, nsAString& aDest )

    PRUint32 count = calculator.Length();

+    // Avoid making the string mutable if we're appending an empty string
    if (count)
      {
        PRUint32 old_dest_length = aDest.Length();

        // Grow the buffer if we need to.
        if(!SetLengthForWriting(aDest, old_dest_length + count))
-            return;
+          return;

-        nsAString::iterator dest;
-        aDest.BeginWriting(dest);
+        // All ready? Time to convert

-        dest.advance(old_dest_length);
+        ConvertUTF8toUTF16 converter(aDest.BeginWriting() + old_dest_length);
+        copy_string(aSource.BeginReading(source_start),
+                    aSource.EndReading(source_end), converter);

-        if (count <= (PRUint32)dest.size_forward())
+        NS_ASSERTION(converter.ErrorEncountered() ||
+                     converter.Length() == count,
+                     "CalculateUTF8Length produced the wrong length");
+
+        if (converter.ErrorEncountered())
          {
-            // aDest has enough room in the fragment just past the end
-            // of its old data that it can hold what we're about to
-            // append. Append using copy_string().
-
-            // All ready? Time to convert
-
-            ConvertUTF8toUTF16 converter(dest.get());
-            copy_string(aSource.BeginReading(source_start),
-                        aSource.EndReading(source_end), converter);
-
-            if (converter.Length() != count)
-              {
-                NS_ERROR("Input wasn't UTF8 or incorrect length was calculated");
-                aDest.SetLength(old_dest_length);
-              }
-          }
-        else
-          {
-            // This isn't the fastest way to do this, but it gets
-            // complicated to convert parts of a UTF8 string into a
-            // UTF16 string, so we'll take the easy way out here in
-            // this rare situation.
-
-            aDest.Replace(old_dest_length, count,
-                          NS_ConvertUTF8toUTF16(aSource));
+            NS_ERROR("Input wasn't UTF8 or incorrect length was calculated");
+            aDest.SetLength(old_dest_length);
          }
      }
  }