зеркало из https://github.com/mozilla/gecko-dev.git
Bug 422868 part 1: Fix UTF8 <-> UTF16 conversion code to deal with all encoding errors consistently. r=smontagu
This commit is contained in:
Родитель
2e11cfa6dc
Коммит
5db97f9537
|
@ -387,6 +387,8 @@ class ConvertUTF8toUTF16
|
|||
|
||||
size_t Length() const { return mBuffer - mStart; }
|
||||
|
||||
PRBool ErrorEncountered() const { return mErrorEncountered; }
|
||||
|
||||
void NS_ALWAYS_INLINE write( const value_type* start, PRUint32 N )
|
||||
{
|
||||
if ( mErrorEncountered )
|
||||
|
@ -489,18 +491,47 @@ class CalculateUTF8Length
|
|||
else if ( UTF8traits::is3byte(*p) )
|
||||
p += 3;
|
||||
else if ( UTF8traits::is4byte(*p) ) {
|
||||
p += 4;
|
||||
// Because a UTF-8 sequence of 4 bytes represents a codepoint
|
||||
// greater than 0xFFFF, it will become a surrogate pair in the
|
||||
// UTF-16 string, so add 1 more to mLength.
|
||||
// This doesn't happen with is5byte and is6byte because they
|
||||
// are illegal UTF-8 sequences (greater than 0x10FFFF) so get
|
||||
// converted to a single replacement character.
|
||||
//
|
||||
// XXX: if the 4-byte sequence is an illegal non-shortest form,
|
||||
// it also gets converted to a replacement character, so
|
||||
// mLength will be off by one in this case.
|
||||
++mLength;
|
||||
|
||||
// However, there is one case when a 4 byte UTF-8 sequence will
|
||||
// only generate 2 UTF-16 bytes. If we have a properly encoded
|
||||
// sequence, but with an invalid value (too small or too big),
|
||||
// that will result in a replacement character being written
|
||||
// This replacement character is encoded as just 1 single
|
||||
// UTF-16 character, which is 2 bytes.
|
||||
|
||||
// The below code therefore only adds 1 to mLength if the UTF8
|
||||
// data will produce a decoded character which is greater than
|
||||
// or equal to 0x010000 and less than 0x0110000.
|
||||
|
||||
// A 4byte UTF8 character is encoded as
|
||||
// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
// Bit 1-3 on the first byte, and bit 5-6 on the second byte,
|
||||
// map to bit 17-21 in the final result. If these bits are
|
||||
// between 0x01 and 0x11, that means that the final result is
|
||||
// between 0x010000 and 0x110000. The below code reads these
|
||||
// bits out and assigns them to c, but shifted up 4 bits to
|
||||
// avoid having to shift twice.
|
||||
|
||||
// It doesn't matter what to do in the case where p + 4 > end
|
||||
// since no UTF16 characters will be written in that case by
|
||||
// ConvertUTF8toUTF16. Likewise it doesn't matter what we do if
|
||||
// any of the surrogate bits are wrong since no UTF16
|
||||
// characters will be written in that case either.
|
||||
|
||||
if (p + 4 <= end) {
|
||||
PRUint32 c = ((PRUint32)(p[0] & 0x07)) << 6 |
|
||||
((PRUint32)(p[1] & 0x30));
|
||||
if (c >= 0x010 && c < 0x110)
|
||||
++mLength;
|
||||
}
|
||||
|
||||
p += 4;
|
||||
}
|
||||
else if ( UTF8traits::is5byte(*p) )
|
||||
p += 5;
|
||||
|
|
|
@ -209,40 +209,15 @@ AppendUTF16toUTF8( const nsAString& aSource, nsACString& aDest )
|
|||
if(!SetLengthForWritingC(aDest, old_dest_length + count))
|
||||
return;
|
||||
|
||||
nsACString::iterator dest;
|
||||
aDest.BeginWriting(dest);
|
||||
// All ready? Time to convert
|
||||
|
||||
dest.advance(old_dest_length);
|
||||
ConvertUTF16toUTF8 converter(aDest.BeginWriting() + old_dest_length);
|
||||
copy_string(aSource.BeginReading(source_start),
|
||||
aSource.EndReading(source_end), converter);
|
||||
|
||||
if (count <= (PRUint32)dest.size_forward())
|
||||
{
|
||||
// aDest has enough room in the fragment just past the end
|
||||
// of its old data that it can hold what we're about to
|
||||
// append. Append using copy_string().
|
||||
|
||||
// All ready? Time to convert
|
||||
|
||||
ConvertUTF16toUTF8 converter(dest.get());
|
||||
copy_string(aSource.BeginReading(source_start),
|
||||
aSource.EndReading(source_end), converter);
|
||||
|
||||
if (converter.Size() != count)
|
||||
{
|
||||
NS_ERROR("Input invalid or incorrect length was calculated");
|
||||
|
||||
aDest.SetLength(old_dest_length);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// This isn't the fastest way to do this, but it gets
|
||||
// complicated to convert UTF16 into a fragmented UTF8
|
||||
// string, so we'll take the easy way out here in this
|
||||
// rare situation.
|
||||
|
||||
aDest.Replace(old_dest_length, count,
|
||||
NS_ConvertUTF16toUTF8(aSource));
|
||||
}
|
||||
NS_ASSERTION(converter.Size() == count,
|
||||
"Unexpected disparity between CalculateUTF8Size and "
|
||||
"ConvertUTF16toUTF8");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -257,46 +232,29 @@ AppendUTF8toUTF16( const nsACString& aSource, nsAString& aDest )
|
|||
|
||||
PRUint32 count = calculator.Length();
|
||||
|
||||
// Avoid making the string mutable if we're appending an empty string
|
||||
if (count)
|
||||
{
|
||||
PRUint32 old_dest_length = aDest.Length();
|
||||
|
||||
// Grow the buffer if we need to.
|
||||
if(!SetLengthForWriting(aDest, old_dest_length + count))
|
||||
return;
|
||||
return;
|
||||
|
||||
nsAString::iterator dest;
|
||||
aDest.BeginWriting(dest);
|
||||
// All ready? Time to convert
|
||||
|
||||
dest.advance(old_dest_length);
|
||||
ConvertUTF8toUTF16 converter(aDest.BeginWriting() + old_dest_length);
|
||||
copy_string(aSource.BeginReading(source_start),
|
||||
aSource.EndReading(source_end), converter);
|
||||
|
||||
if (count <= (PRUint32)dest.size_forward())
|
||||
NS_ASSERTION(converter.ErrorEncountered() ||
|
||||
converter.Length() == count,
|
||||
"CalculateUTF8Length produced the wrong length");
|
||||
|
||||
if (converter.ErrorEncountered())
|
||||
{
|
||||
// aDest has enough room in the fragment just past the end
|
||||
// of its old data that it can hold what we're about to
|
||||
// append. Append using copy_string().
|
||||
|
||||
// All ready? Time to convert
|
||||
|
||||
ConvertUTF8toUTF16 converter(dest.get());
|
||||
copy_string(aSource.BeginReading(source_start),
|
||||
aSource.EndReading(source_end), converter);
|
||||
|
||||
if (converter.Length() != count)
|
||||
{
|
||||
NS_ERROR("Input wasn't UTF8 or incorrect length was calculated");
|
||||
aDest.SetLength(old_dest_length);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// This isn't the fastest way to do this, but it gets
|
||||
// complicated to convert parts of a UTF8 string into a
|
||||
// UTF16 string, so we'll take the easy way out here in
|
||||
// this rare situation.
|
||||
|
||||
aDest.Replace(old_dest_length, count,
|
||||
NS_ConvertUTF8toUTF16(aSource));
|
||||
NS_ERROR("Input wasn't UTF8 or incorrect length was calculated");
|
||||
aDest.SetLength(old_dest_length);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Загрузка…
Ссылка в новой задаче