Bug 422868 part 1: Fix UTF8 <-> UTF16 conversion code to deal with all encoding errors consistently. r=smontagu

This commit is contained in:
Jonas Sicking 2010-02-23 09:38:10 -08:00
Родитель 2e11cfa6dc
Коммит 5db97f9537
2 изменённых файлов: 57 добавлений и 68 удалений

Просмотреть файл

@ -387,6 +387,8 @@ class ConvertUTF8toUTF16
size_t Length() const { return mBuffer - mStart; }
PRBool ErrorEncountered() const { return mErrorEncountered; }
void NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
{
if ( mErrorEncountered )
@ -489,18 +491,47 @@ class CalculateUTF8Length
else if ( UTF8traits::is3byte(*p) )
p += 3;
else if ( UTF8traits::is4byte(*p) ) {
p += 4;
// Because a UTF-8 sequence of 4 bytes represents a codepoint
// greater than 0xFFFF, it will become a surrogate pair in the
// UTF-16 string, so add 1 more to mLength.
// This doesn't happen with is5byte and is6byte because they
// are illegal UTF-8 sequences (greater than 0x10FFFF) so get
// converted to a single replacement character.
//
// XXX: if the 4-byte sequence is an illegal non-shortest form,
// it also gets converted to a replacement character, so
// mLength will be off by one in this case.
++mLength;
// However, there is one case when a 4 byte UTF-8 sequence will
// only generate 2 UTF-16 bytes. If we have a properly encoded
// sequence, but with an invalid value (too small or too big),
// that will result in a replacement character being written.
// This replacement character is encoded as just 1 single
// UTF-16 character, which is 2 bytes.
// The below code therefore only adds 1 to mLength if the UTF8
// data will produce a decoded character which is greater than
// or equal to 0x010000 and less than 0x110000.
// A 4byte UTF8 character is encoded as
// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
// Bit 1-3 on the first byte, and bit 5-6 on the second byte,
// map to bit 17-21 in the final result. If these bits are
// between 0x01 and 0x11, that means that the final result is
// between 0x010000 and 0x110000. The below code reads these
// bits out and assigns them to c, but shifted up 4 bits to
// avoid having to shift twice.
// It doesn't matter what we do in the case where p + 4 > end
// since no UTF16 characters will be written in that case by
// ConvertUTF8toUTF16. Likewise it doesn't matter what we do if
// any of the surrogate bits are wrong since no UTF16
// characters will be written in that case either.
if (p + 4 <= end) {
PRUint32 c = ((PRUint32)(p[0] & 0x07)) << 6 |
((PRUint32)(p[1] & 0x30));
if (c >= 0x010 && c < 0x110)
++mLength;
}
p += 4;
}
else if ( UTF8traits::is5byte(*p) )
p += 5;

Просмотреть файл

@ -209,40 +209,15 @@ AppendUTF16toUTF8( const nsAString& aSource, nsACString& aDest )
if(!SetLengthForWritingC(aDest, old_dest_length + count))
return;
nsACString::iterator dest;
aDest.BeginWriting(dest);
// All ready? Time to convert
dest.advance(old_dest_length);
ConvertUTF16toUTF8 converter(aDest.BeginWriting() + old_dest_length);
copy_string(aSource.BeginReading(source_start),
aSource.EndReading(source_end), converter);
if (count <= (PRUint32)dest.size_forward())
{
// aDest has enough room in the fragment just past the end
// of its old data that it can hold what we're about to
// append. Append using copy_string().
// All ready? Time to convert
ConvertUTF16toUTF8 converter(dest.get());
copy_string(aSource.BeginReading(source_start),
aSource.EndReading(source_end), converter);
if (converter.Size() != count)
{
NS_ERROR("Input invalid or incorrect length was calculated");
aDest.SetLength(old_dest_length);
}
}
else
{
// This isn't the fastest way to do this, but it gets
// complicated to convert UTF16 into a fragmented UTF8
// string, so we'll take the easy way out here in this
// rare situation.
aDest.Replace(old_dest_length, count,
NS_ConvertUTF16toUTF8(aSource));
}
NS_ASSERTION(converter.Size() == count,
"Unexpected disparity between CalculateUTF8Size and "
"ConvertUTF16toUTF8");
}
}
@ -257,46 +232,29 @@ AppendUTF8toUTF16( const nsACString& aSource, nsAString& aDest )
PRUint32 count = calculator.Length();
// Avoid making the string mutable if we're appending an empty string
if (count)
{
PRUint32 old_dest_length = aDest.Length();
// Grow the buffer if we need to.
if(!SetLengthForWriting(aDest, old_dest_length + count))
return;
return;
nsAString::iterator dest;
aDest.BeginWriting(dest);
// All ready? Time to convert
dest.advance(old_dest_length);
ConvertUTF8toUTF16 converter(aDest.BeginWriting() + old_dest_length);
copy_string(aSource.BeginReading(source_start),
aSource.EndReading(source_end), converter);
if (count <= (PRUint32)dest.size_forward())
NS_ASSERTION(converter.ErrorEncountered() ||
converter.Length() == count,
"CalculateUTF8Length produced the wrong length");
if (converter.ErrorEncountered())
{
// aDest has enough room in the fragment just past the end
// of its old data that it can hold what we're about to
// append. Append using copy_string().
// All ready? Time to convert
ConvertUTF8toUTF16 converter(dest.get());
copy_string(aSource.BeginReading(source_start),
aSource.EndReading(source_end), converter);
if (converter.Length() != count)
{
NS_ERROR("Input wasn't UTF8 or incorrect length was calculated");
aDest.SetLength(old_dest_length);
}
}
else
{
// This isn't the fastest way to do this, but it gets
// complicated to convert parts of a UTF8 string into a
// UTF16 string, so we'll take the easy way out here in
// this rare situation.
aDest.Replace(old_dest_length, count,
NS_ConvertUTF8toUTF16(aSource));
NS_ERROR("Input wasn't UTF8 or incorrect length was calculated");
aDest.SetLength(old_dest_length);
}
}
}