From 5db97f9537cc90e17a13ce54dd46fea5017595a3 Mon Sep 17 00:00:00 2001 From: Jonas Sicking Date: Tue, 23 Feb 2010 09:38:10 -0800 Subject: [PATCH] Bug 422868 part 1: Fix UTF8 <-> UTF16 conversion code to deal with all encoding errors consistently. r=smontagu --- xpcom/string/public/nsUTF8Utils.h | 43 +++++++++++++-- xpcom/string/src/nsReadableUtils.cpp | 82 +++++++--------------------- 2 files changed, 57 insertions(+), 68 deletions(-) diff --git a/xpcom/string/public/nsUTF8Utils.h b/xpcom/string/public/nsUTF8Utils.h index be73a9ed0678..1b3d9a9cbcde 100644 --- a/xpcom/string/public/nsUTF8Utils.h +++ b/xpcom/string/public/nsUTF8Utils.h @@ -387,6 +387,8 @@ class ConvertUTF8toUTF16 size_t Length() const { return mBuffer - mStart; } + PRBool ErrorEncountered() const { return mErrorEncountered; } + void NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N ) { if ( mErrorEncountered ) @@ -489,18 +491,47 @@ class CalculateUTF8Length else if ( UTF8traits::is3byte(*p) ) p += 3; else if ( UTF8traits::is4byte(*p) ) { - p += 4; // Because a UTF-8 sequence of 4 bytes represents a codepoint // greater than 0xFFFF, it will become a surrogate pair in the // UTF-16 string, so add 1 more to mLength. // This doesn't happen with is5byte and is6byte because they // are illegal UTF-8 sequences (greater than 0x10FFFF) so get // converted to a single replacement character. - // - // XXX: if the 4-byte sequence is an illegal non-shortest form, - // it also gets converted to a replacement character, so - // mLength will be off by one in this case. - ++mLength; + + // However, there is one case when a 4 byte UTF-8 sequence will + // only generate 2 UTF-16 bytes. If we have a properly encoded + // sequence, but with an invalid value (too small or too big), + // that will result in a replacement character being written + // This replacement character is encoded as just 1 single + // UTF-16 character, which is 2 bytes. + + // The below code therefore only adds 1 to mLength if the UTF8 + // data will produce a decoded character which is greater than + // or equal to 0x010000 and less than 0x0110000. + + // A 4byte UTF8 character is encoded as + // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + // Bit 1-3 on the first byte, and bit 5-6 on the second byte, + // map to bit 17-21 in the final result. If these bits are + // between 0x01 and 0x11, that means that the final result is + // between 0x010000 and 0x110000. The below code reads these + // bits out and assigns them to c, but shifted up 4 bits to + // avoid having to shift twice. + + // It doesn't matter what to do in the case where p + 4 > end + // since no UTF16 characters will be written in that case by + // ConvertUTF8toUTF16. Likewise it doesn't matter what we do if + // any of the surrogate bits are wrong since no UTF16 + // characters will be written in that case either. + + if (p + 4 <= end) { + PRUint32 c = ((PRUint32)(p[0] & 0x07)) << 6 | + ((PRUint32)(p[1] & 0x30)); + if (c >= 0x010 && c < 0x110) + ++mLength; + } + + p += 4; } else if ( UTF8traits::is5byte(*p) ) p += 5; diff --git a/xpcom/string/src/nsReadableUtils.cpp b/xpcom/string/src/nsReadableUtils.cpp index cb0c89fa3fd1..02f20bfd92c2 100644 --- a/xpcom/string/src/nsReadableUtils.cpp +++ b/xpcom/string/src/nsReadableUtils.cpp @@ -209,40 +209,15 @@ AppendUTF16toUTF8( const nsAString& aSource, nsACString& aDest ) if(!SetLengthForWritingC(aDest, old_dest_length + count)) return; - nsACString::iterator dest; - aDest.BeginWriting(dest); + // All ready? Time to convert - dest.advance(old_dest_length); + ConvertUTF16toUTF8 converter(aDest.BeginWriting() + old_dest_length); + copy_string(aSource.BeginReading(source_start), + aSource.EndReading(source_end), converter); - if (count <= (PRUint32)dest.size_forward()) - { - // aDest has enough room in the fragment just past the end - // of its old data that it can hold what we're about to - // append. Append using copy_string(). - - // All ready? Time to convert - - ConvertUTF16toUTF8 converter(dest.get()); - copy_string(aSource.BeginReading(source_start), - aSource.EndReading(source_end), converter); - - if (converter.Size() != count) - { - NS_ERROR("Input invalid or incorrect length was calculated"); - - aDest.SetLength(old_dest_length); - } - } - else - { - // This isn't the fastest way to do this, but it gets - // complicated to convert UTF16 into a fragmented UTF8 - // string, so we'll take the easy way out here in this - // rare situation. - - aDest.Replace(old_dest_length, count, - NS_ConvertUTF16toUTF8(aSource)); - } + NS_ASSERTION(converter.Size() == count, + "Unexpected disparity between CalculateUTF8Size and " + "ConvertUTF16toUTF8"); } } @@ -257,46 +232,29 @@ AppendUTF8toUTF16( const nsACString& aSource, nsAString& aDest ) PRUint32 count = calculator.Length(); + // Avoid making the string mutable if we're appending an empty string if (count) { PRUint32 old_dest_length = aDest.Length(); // Grow the buffer if we need to. if(!SetLengthForWriting(aDest, old_dest_length + count)) - return; + return; - nsAString::iterator dest; - aDest.BeginWriting(dest); + // All ready? Time to convert - dest.advance(old_dest_length); + ConvertUTF8toUTF16 converter(aDest.BeginWriting() + old_dest_length); + copy_string(aSource.BeginReading(source_start), + aSource.EndReading(source_end), converter); - if (count <= (PRUint32)dest.size_forward()) + NS_ASSERTION(converter.ErrorEncountered() || + converter.Length() == count, + "CalculateUTF8Length produced the wrong length"); + + if (converter.ErrorEncountered()) { - // aDest has enough room in the fragment just past the end - // of its old data that it can hold what we're about to - // append. Append using copy_string(). - - // All ready? Time to convert - - ConvertUTF8toUTF16 converter(dest.get()); - copy_string(aSource.BeginReading(source_start), - aSource.EndReading(source_end), converter); - - if (converter.Length() != count) - { - NS_ERROR("Input wasn't UTF8 or incorrect length was calculated"); - aDest.SetLength(old_dest_length); - } - } - else - { - // This isn't the fastest way to do this, but it gets - // complicated to convert parts of a UTF8 string into a - // UTF16 string, so we'll take the easy way out here in - // this rare situation. - - aDest.Replace(old_dest_length, count, - NS_ConvertUTF8toUTF16(aSource)); + NS_ERROR("Input wasn't UTF8 or incorrect length was calculated"); + aDest.SetLength(old_dest_length); } } }