зеркало из https://github.com/mozilla/gecko-dev.git
Bug 1395527 part 2 - Delegate IsASCII and IsUTF8 to encoding_rs. r=froydnj
MozReview-Commit-ID: 49AGBPjW4Ca --HG-- extra : rebase_source : fb20025e98f97210c58c3cf9767eae06ad92349b
This commit is contained in:
Родитель
521edd3c6a
Коммит
3c7fa0536e
|
@ -1745,7 +1745,7 @@ WebSocketChannel::ProcessInput(uint8_t *buffer, uint32_t count)
|
|||
}
|
||||
|
||||
// Section 8.1 says to fail connection if invalid utf-8 in text message
|
||||
if (!IsUTF8(utf8Data, false)) {
|
||||
if (!IsUTF8(utf8Data)) {
|
||||
LOG(("WebSocketChannel:: text frame invalid utf-8\n"));
|
||||
return NS_ERROR_CANNOT_CONVERT_DATA;
|
||||
}
|
||||
|
@ -1796,7 +1796,7 @@ WebSocketChannel::ProcessInput(uint8_t *buffer, uint32_t count)
|
|||
// (which are non-conformant to send) with u+fffd,
|
||||
// but secteam feels that silently rewriting messages is
|
||||
// inappropriate - so we will fail the connection instead.
|
||||
if (!IsUTF8(mServerCloseReason, false)) {
|
||||
if (!IsUTF8(mServerCloseReason)) {
|
||||
LOG(("WebSocketChannel:: close frame invalid utf-8\n"));
|
||||
return NS_ERROR_CANNOT_CONVERT_DATA;
|
||||
}
|
||||
|
|
|
@ -617,116 +617,6 @@ IsASCII(const nsAString& aString)
|
|||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
IsASCII(const nsACString& aString)
|
||||
{
|
||||
static const char NOT_ASCII = char(~0x7F);
|
||||
|
||||
|
||||
// Don't want to use |copy_string| for this task, since we can stop at the first non-ASCII character
|
||||
|
||||
nsACString::const_iterator iter, done_reading;
|
||||
aString.BeginReading(iter);
|
||||
aString.EndReading(done_reading);
|
||||
|
||||
const char* c = iter.get();
|
||||
const char* end = done_reading.get();
|
||||
|
||||
while (c < end) {
|
||||
if (*c++ & NOT_ASCII) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
IsUTF8(const nsACString& aString, bool aRejectNonChar)
|
||||
{
|
||||
nsReadingIterator<char> done_reading;
|
||||
aString.EndReading(done_reading);
|
||||
|
||||
int32_t state = 0;
|
||||
bool overlong = false;
|
||||
bool surrogate = false;
|
||||
bool nonchar = false;
|
||||
uint16_t olupper = 0; // overlong byte upper bound.
|
||||
uint16_t slower = 0; // surrogate byte lower bound.
|
||||
|
||||
nsReadingIterator<char> iter;
|
||||
aString.BeginReading(iter);
|
||||
|
||||
const char* ptr = iter.get();
|
||||
const char* end = done_reading.get();
|
||||
while (ptr < end) {
|
||||
uint8_t c;
|
||||
|
||||
if (0 == state) {
|
||||
c = *ptr++;
|
||||
|
||||
if (UTF8traits::isASCII(c)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (c <= 0xC1) { // [80-BF] where not expected, [C0-C1] for overlong.
|
||||
return false;
|
||||
} else if (UTF8traits::is2byte(c)) {
|
||||
state = 1;
|
||||
} else if (UTF8traits::is3byte(c)) {
|
||||
state = 2;
|
||||
if (c == 0xE0) { // to exclude E0[80-9F][80-BF]
|
||||
overlong = true;
|
||||
olupper = 0x9F;
|
||||
} else if (c == 0xED) { // ED[A0-BF][80-BF] : surrogate codepoint
|
||||
surrogate = true;
|
||||
slower = 0xA0;
|
||||
} else if (c == 0xEF) { // EF BF [BE-BF] : non-character
|
||||
nonchar = true;
|
||||
}
|
||||
} else if (c <= 0xF4) { // XXX replace /w UTF8traits::is4byte when it's updated to exclude [F5-F7].(bug 199090)
|
||||
state = 3;
|
||||
nonchar = true;
|
||||
if (c == 0xF0) { // to exclude F0[80-8F][80-BF]{2}
|
||||
overlong = true;
|
||||
olupper = 0x8F;
|
||||
} else if (c == 0xF4) { // to exclude F4[90-BF][80-BF]
|
||||
// actually not surrogates but codepoints beyond 0x10FFFF
|
||||
surrogate = true;
|
||||
slower = 0x90;
|
||||
}
|
||||
} else {
|
||||
return false; // Not UTF-8 string
|
||||
}
|
||||
}
|
||||
|
||||
if (nonchar && !aRejectNonChar) {
|
||||
nonchar = false;
|
||||
}
|
||||
|
||||
while (ptr < end && state) {
|
||||
c = *ptr++;
|
||||
--state;
|
||||
|
||||
// non-character : EF BF [BE-BF] or F[0-7] [89AB]F BF [BE-BF]
|
||||
if (nonchar &&
|
||||
((!state && c < 0xBE) ||
|
||||
(state == 1 && c != 0xBF) ||
|
||||
(state == 2 && 0x0F != (0x0F & c)))) {
|
||||
nonchar = false;
|
||||
}
|
||||
|
||||
if (!UTF8traits::isInSeq(c) || (overlong && c <= olupper) ||
|
||||
(surrogate && slower <= c) || (nonchar && !state)) {
|
||||
return false; // Not UTF-8 string
|
||||
}
|
||||
|
||||
overlong = surrogate = false;
|
||||
}
|
||||
}
|
||||
return !state; // state != 0 at the end indicates an invalid UTF-8 seq.
|
||||
}
|
||||
|
||||
/**
|
||||
* A character sink for in-place case conversion.
|
||||
*/
|
||||
|
|
|
@ -18,6 +18,12 @@
|
|||
|
||||
#include "nsTArrayForwardDeclare.h"
|
||||
|
||||
// Can't include mozilla/Encoding.h here, so the two entry points used by the
// inline IsASCII() and IsUTF8() helpers in this header are declared by hand
// with C linkage.
// NOTE(review): presumably these are the encoding_rs C bindings and return
// the length of the longest valid prefix of |buffer| -- confirm against
// mozilla/Encoding.h. The callers in this file only compare the result with
// the full input length.
|
||||
extern "C" {
|
||||
size_t encoding_utf8_valid_up_to(uint8_t const* buffer, size_t buffer_len);
|
||||
size_t encoding_ascii_valid_up_to(uint8_t const* buffer, size_t buffer_len);
|
||||
}
|
||||
|
||||
inline size_t
|
||||
Distance(const nsReadingIterator<char16_t>& aStart,
|
||||
const nsReadingIterator<char16_t>& aEnd)
|
||||
|
@ -253,40 +259,57 @@ bool IsASCII(const nsAString& aString);
|
|||
*
|
||||
* @param aString a 8-bit wide string to scan
|
||||
*/
|
||||
bool IsASCII(const nsACString& aString);
|
||||
inline bool IsASCII(const nsACString& aString)
|
||||
{
|
||||
size_t length = aString.Length();
|
||||
const uint8_t* ptr = reinterpret_cast<const uint8_t*>(aString.BeginReading());
|
||||
// For short strings, calling into Rust is a pessimization, and the SIMD
|
||||
// code won't have a chance to kick in anyway. Additionally, handling the
|
||||
// case of the empty string here makes null-checking ptr unnecessary.
|
||||
// (Passing nullptr to Rust would technically be UB.)
|
||||
if (length < 16) {
|
||||
size_t accu = 0;
|
||||
for (size_t i = 0; i < length; i++) {
|
||||
accu |= ptr[i];
|
||||
}
|
||||
return accu < 0x80;
|
||||
}
|
||||
// This is not quite optimal, because it's not fail-fast when the by-register
|
||||
// check already finds non-ASCII. Also, input to this function is almost
|
||||
// always ASCII, so even the by-register check wouldn't need to be fail-fast
|
||||
// and could be more like the loop above.
|
||||
return length == encoding_ascii_valid_up_to(ptr, length);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns |true| if |aString| is a valid UTF-8 string.
|
||||
* XXX This is not bullet-proof, nor is it an all-purpose UTF-8 validator.
|
||||
* It is mainly written to replace, and is roughly equivalent to,
|
||||
*
|
||||
* str.Equals(NS_ConvertUTF16toUTF8(NS_ConvertUTF8toUTF16(str)))
|
||||
*
|
||||
* (see bug 191541)
|
||||
* As such, it does not check for non-UTF-8 7bit encodings such as
|
||||
* ISO-2022-JP and HZ.
|
||||
*
|
||||
* It rejects sequences with the following errors:
|
||||
*
|
||||
* byte sequences that cannot be decoded into characters according to
|
||||
* UTF-8's rules (including cases where the input is part of a valid
|
||||
* UTF-8 sequence but starts or ends mid-character)
|
||||
* overlong sequences (i.e., cases where a character was encoded
|
||||
* non-canonically by using more bytes than necessary)
|
||||
* surrogate codepoints (i.e., the codepoints reserved for
|
||||
* representing astral characters in UTF-16)
|
||||
* codepoints above the unicode range (i.e., outside the first 17
|
||||
* planes; higher than U+10FFFF), in accordance with
|
||||
* http://tools.ietf.org/html/rfc3629
|
||||
* when aRejectNonChar is true (the default), any codepoint whose low
|
||||
* 16 bits are 0xFFFE or 0xFFFF
|
||||
|
||||
* Note that this doesn't check whether the string might look like a valid
|
||||
* string in another encoding, too, e.g. ISO-2022-JP.
|
||||
*
|
||||
* @param aString an 8-bit wide string to scan
|
||||
* @param aRejectNonChar a boolean to control the rejection of utf-8
|
||||
* non characters
|
||||
*/
|
||||
bool IsUTF8(const nsACString& aString, bool aRejectNonChar = true);
|
||||
inline bool IsUTF8(const nsACString& aString)
|
||||
{
|
||||
size_t length = aString.Length();
|
||||
const uint8_t* ptr = reinterpret_cast<const uint8_t*>(aString.BeginReading());
|
||||
// For short strings, calling into Rust is a pessimization, and the SIMD
|
||||
// code won't have a chance to kick in anyway. Additionally, handling the
|
||||
// case of the empty string here makes null-checking ptr unnecessary.
|
||||
// (Passing nullptr to Rust would technically be UB.)
|
||||
if (length < 16) {
|
||||
for (size_t i = 0; i < length; i++) {
|
||||
if (ptr[i] >= 0x80) {
|
||||
ptr += i;
|
||||
length -= i;
|
||||
goto end;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
end:
|
||||
return length == encoding_utf8_valid_up_to(ptr, length);
|
||||
}
|
||||
|
||||
bool ParseString(const nsACString& aAstring, char aDelimiter,
|
||||
nsTArray<nsCString>& aArray);
|
||||
|
|
Загрузка…
Ссылка в новой задаче