Bug 1395527 part 2 - Delegate IsASCII and IsUTF8 to encoding_rs. r=froydnj

MozReview-Commit-ID: 49AGBPjW4Ca

--HG--
extra : rebase_source : fb20025e98f97210c58c3cf9767eae06ad92349b
This commit is contained in:
Henri Sivonen 2017-09-01 11:53:31 +03:00
Родитель 521edd3c6a
Коммит 3c7fa0536e
3 изменённых файлов: 52 добавлений и 139 удалений

Просмотреть файл

@ -1745,7 +1745,7 @@ WebSocketChannel::ProcessInput(uint8_t *buffer, uint32_t count)
}
// Section 8.1 says to fail connection if invalid utf-8 in text message
if (!IsUTF8(utf8Data, false)) {
if (!IsUTF8(utf8Data)) {
LOG(("WebSocketChannel:: text frame invalid utf-8\n"));
return NS_ERROR_CANNOT_CONVERT_DATA;
}
@ -1796,7 +1796,7 @@ WebSocketChannel::ProcessInput(uint8_t *buffer, uint32_t count)
// (which are non-conformant to send) with u+fffd,
// but secteam feels that silently rewriting messages is
// inappropriate - so we will fail the connection instead.
if (!IsUTF8(mServerCloseReason, false)) {
if (!IsUTF8(mServerCloseReason)) {
LOG(("WebSocketChannel:: close frame invalid utf-8\n"));
return NS_ERROR_CANNOT_CONVERT_DATA;
}

Просмотреть файл

@ -617,116 +617,6 @@ IsASCII(const nsAString& aString)
return true;
}
bool
IsASCII(const nsACString& aString)
{
static const char NOT_ASCII = char(~0x7F);
// Don't want to use |copy_string| for this task, since we can stop at the first non-ASCII character
nsACString::const_iterator iter, done_reading;
aString.BeginReading(iter);
aString.EndReading(done_reading);
const char* c = iter.get();
const char* end = done_reading.get();
while (c < end) {
if (*c++ & NOT_ASCII) {
return false;
}
}
return true;
}
// Hand-written UTF-8 validator for an 8-bit string.
//
// Walks the bytes of |aString| once, tracking in |state| how many
// continuation bytes the current multi-byte sequence still needs.
// Returns false as soon as a byte is found that cannot appear at its
// position, and also when the string ends in the middle of a sequence.
//
// @param aString         the 8-bit string to validate
// @param aRejectNonChar  when true, sequences matching the non-character
//                        patterns noted in the loop below are rejected;
//                        when false, that check is disabled
// @return true if no invalid sequence was found and the last sequence is
//         complete
//
// NOTE(review): the UTF8traits helpers (isASCII, is2byte, is3byte, isInSeq)
// are defined outside this view; the comments below assume they classify
// lead and continuation bytes as their names suggest -- confirm.
bool
IsUTF8(const nsACString& aString, bool aRejectNonChar)
{
nsReadingIterator<char> done_reading;
aString.EndReading(done_reading);
// Number of continuation bytes still expected for the current sequence.
int32_t state = 0;
// The three flags below are set from the lead byte and constrain the
// bytes that follow it.
bool overlong = false;
bool surrogate = false;
bool nonchar = false;
uint16_t olupper = 0; // overlong byte upper bound.
uint16_t slower = 0; // surrogate byte lower bound.
nsReadingIterator<char> iter;
aString.BeginReading(iter);
const char* ptr = iter.get();
const char* end = done_reading.get();
while (ptr < end) {
uint8_t c;
// state == 0: we are at the start of a character, so read a lead byte.
if (0 == state) {
c = *ptr++;
if (UTF8traits::isASCII(c)) {
continue;
}
if (c <= 0xC1) { // [80-BF] where not expected, [C0-C1] for overlong.
return false;
} else if (UTF8traits::is2byte(c)) {
state = 1;
} else if (UTF8traits::is3byte(c)) {
state = 2;
if (c == 0xE0) { // to exclude E0[80-9F][80-BF]
overlong = true;
olupper = 0x9F;
} else if (c == 0xED) { // ED[A0-BF][80-BF] : surrogate codepoint
surrogate = true;
slower = 0xA0;
} else if (c == 0xEF) { // EF BF [BE-BF] : non-character
nonchar = true;
}
} else if (c <= 0xF4) { // XXX replace /w UTF8traits::is4byte when it's updated to exclude [F5-F7].(bug 199090)
state = 3;
// Every 4-byte sequence starts out as a non-character candidate; the
// continuation-byte loop clears the flag once a byte fails the pattern.
nonchar = true;
if (c == 0xF0) { // to exclude F0[80-8F][80-BF]{2}
overlong = true;
olupper = 0x8F;
} else if (c == 0xF4) { // to exclude F4[90-BF][80-BF]
// actually not surrogates but codepoints beyond 0x10FFFF
surrogate = true;
slower = 0x90;
}
} else {
return false; // Not UTF-8 string
}
}
// The caller did not ask for non-character rejection: drop the candidate
// flag so the check in the loop below can never fire.
if (nonchar && !aRejectNonChar) {
nonchar = false;
}
// Consume the continuation bytes of the current sequence.
while (ptr < end && state) {
c = *ptr++;
--state;
// non-character : EF BF [BE-BF] or F[0-7] [89AB]F BF [BE-BF]
// |state| here is the number of bytes remaining after this one, so it
// identifies which position of the pattern |c| has to match.
if (nonchar &&
((!state && c < 0xBE) ||
(state == 1 && c != 0xBF) ||
(state == 2 && 0x0F != (0x0F & c)))) {
nonchar = false;
}
// Reject a byte that is not a continuation byte, a first continuation
// byte inside the overlong or surrogate/out-of-range window, or a final
// byte that completes a non-character pattern.
if (!UTF8traits::isInSeq(c) || (overlong && c <= olupper) ||
(surrogate && slower <= c) || (nonchar && !state)) {
return false; // Not UTF-8 string
}
// The overlong and surrogate bounds apply only to the first continuation
// byte, so clear them once it has been checked.
overlong = surrogate = false;
}
}
return !state; // state != 0 at the end indicates an invalid UTF-8 seq.
}
/**
* A character sink for in-place case conversion.
*/

Просмотреть файл

@ -18,6 +18,12 @@
#include "nsTArrayForwardDeclare.h"
// Can't include mozilla/Encoding.h here
// C-linkage declarations for the two validation entry points used by the
// inline IsASCII() / IsUTF8() helpers in this header.
// NOTE(review): the implementations live outside this file; the callers here
// treat a return value equal to |buffer_len| as "the whole buffer is valid",
// so presumably the result is the length of the valid prefix -- confirm
// against the encoding_rs / encoding_c documentation.
extern "C" {
// UTF-8 variant; used by IsUTF8().
size_t encoding_utf8_valid_up_to(uint8_t const* buffer, size_t buffer_len);
// ASCII variant; used by IsASCII().
size_t encoding_ascii_valid_up_to(uint8_t const* buffer, size_t buffer_len);
}
inline size_t
Distance(const nsReadingIterator<char16_t>& aStart,
const nsReadingIterator<char16_t>& aEnd)
@ -253,40 +259,57 @@ bool IsASCII(const nsAString& aString);
*
* @param aString a 8-bit wide string to scan
*/
bool IsASCII(const nsACString& aString);
inline bool IsASCII(const nsACString& aString)
{
  const size_t byteCount = aString.Length();
  const uint8_t* bytes =
    reinterpret_cast<const uint8_t*>(aString.BeginReading());

  // Long inputs go to Rust. This is not quite optimal: it is not fail-fast
  // when the by-register check already finds non-ASCII. Then again, input to
  // this function is almost always ASCII, so even the by-register check would
  // not need to be fail-fast and could look more like the loop below.
  if (byteCount >= 16) {
    return encoding_ascii_valid_up_to(bytes, byteCount) == byteCount;
  }

  // Short inputs are handled right here: crossing into Rust would be a
  // pessimization and the SIMD code would not get a chance to kick in anyway.
  // This path also covers the empty string, so the pointer never has to be
  // null-checked before being handed to Rust (passing nullptr would
  // technically be UB).
  size_t merged = 0;
  size_t index = 0;
  while (index < byteCount) {
    merged |= bytes[index];
    ++index;
  }
  // The OR of all bytes has its high bit set iff at least one byte did.
  return merged < 0x80;
}
/**
* Returns |true| if |aString| is a valid UTF-8 string.
* XXX This is neither bullet-proof nor an all-purpose UTF-8 validator.
* It is mainly written to replace and roughly equivalent to
*
* str.Equals(NS_ConvertUTF16toUTF8(NS_ConvertUTF8toUTF16(str)))
*
* (see bug 191541)
* As such, it does not check for non-UTF-8 7bit encodings such as
* ISO-2022-JP and HZ.
*
* It rejects sequences with the following errors:
*
* byte sequences that cannot be decoded into characters according to
* UTF-8's rules (including cases where the input is part of a valid
* UTF-8 sequence but starts or ends mid-character)
* overlong sequences (i.e., cases where a character was encoded
* non-canonically by using more bytes than necessary)
* surrogate codepoints (i.e., the codepoints reserved for
* representing astral characters in UTF-16)
* codepoints above the unicode range (i.e., outside the first 17
* planes; higher than U+10FFFF), in accordance with
* http://tools.ietf.org/html/rfc3629
* when aRejectNonChar is true (the default), any codepoint whose low
* 16 bits are 0xFFFE or 0xFFFF
* Note that this doesn't check whether the string might look like a valid
* string in another encoding, too, e.g. ISO-2022-JP.
*
* @param aString an 8-bit wide string to scan
* @param aRejectNonChar a boolean to control the rejection of utf-8
* non characters
*/
bool IsUTF8(const nsACString& aString, bool aRejectNonChar = true);
inline bool IsUTF8(const nsACString& aString)
{
  size_t remaining = aString.Length();
  const uint8_t* cursor =
    reinterpret_cast<const uint8_t*>(aString.BeginReading());

  // For short strings, calling into Rust is a pessimization, and the SIMD
  // code won't have a chance to kick in anyway. Dealing with the empty string
  // here also means the pointer never needs a null check before it reaches
  // Rust (passing nullptr would technically be UB).
  if (remaining < 16) {
    // Measure the leading run of ASCII bytes.
    size_t asciiPrefix = 0;
    while (asciiPrefix < remaining && cursor[asciiPrefix] < 0x80) {
      ++asciiPrefix;
    }
    if (asciiPrefix == remaining) {
      // Pure ASCII (or empty): trivially valid UTF-8.
      return true;
    }
    // Hand only the tail that starts at the first non-ASCII byte to Rust.
    cursor += asciiPrefix;
    remaining -= asciiPrefix;
  }

  return remaining == encoding_utf8_valid_up_to(cursor, remaining);
}
bool ParseString(const nsACString& aAstring, char aDelimiter,
nsTArray<nsCString>& aArray);