Bug 1395527 part 2 - Delegate IsASCII and IsUTF8 to encoding_rs. r=froydnj

MozReview-Commit-ID: 49AGBPjW4Ca --HG-- extra : rebase_source : fb20025e98f97210c58c3cf9767eae06ad92349b
2017-09-01 11:53:31 +03:00 · 2017-09-01 11:53:31 +03:00 · 3c7fa0536e
--- a/netwerk/protocol/websocket/WebSocketChannel.cpp
+++ b/netwerk/protocol/websocket/WebSocketChannel.cpp
@@ -1745,7 +1745,7 @@ WebSocketChannel::ProcessInput(uint8_t *buffer, uint32_t count)
        }

        // Section 8.1 says to fail connection if invalid utf-8 in text message
-        if (!IsUTF8(utf8Data, false)) {
+        if (!IsUTF8(utf8Data)) {
          LOG(("WebSocketChannel:: text frame invalid utf-8\n"));
          return NS_ERROR_CANNOT_CONVERT_DATA;
        }
@@ -1796,7 +1796,7 @@ WebSocketChannel::ProcessInput(uint8_t *buffer, uint32_t count)
            // (which are non-conformant to send) with u+fffd,
            // but secteam feels that silently rewriting messages is
            // inappropriate - so we will fail the connection instead.
-            if (!IsUTF8(mServerCloseReason, false)) {
+            if (!IsUTF8(mServerCloseReason)) {
              LOG(("WebSocketChannel:: close frame invalid utf-8\n"));
              return NS_ERROR_CANNOT_CONVERT_DATA;
            }
--- a/xpcom/string/nsReadableUtils.cpp
+++ b/xpcom/string/nsReadableUtils.cpp
@@ -617,116 +617,6 @@ IsASCII(const nsAString& aString)
  return true;
 }

-bool
-IsASCII(const nsACString& aString)
-{
-  static const char NOT_ASCII = char(~0x7F);
-
-
-  // Don't want to use |copy_string| for this task, since we can stop at the first non-ASCII character
-
-  nsACString::const_iterator iter, done_reading;
-  aString.BeginReading(iter);
-  aString.EndReading(done_reading);
-
-  const char* c = iter.get();
-  const char* end = done_reading.get();
-
-  while (c < end) {
-    if (*c++ & NOT_ASCII) {
-      return false;
-    }
-  }
-
-  return true;
-}
-
-bool
-IsUTF8(const nsACString& aString, bool aRejectNonChar)
-{
-  nsReadingIterator<char> done_reading;
-  aString.EndReading(done_reading);
-
-  int32_t state = 0;
-  bool overlong = false;
-  bool surrogate = false;
-  bool nonchar = false;
-  uint16_t olupper = 0; // overlong byte upper bound.
-  uint16_t slower = 0;  // surrogate byte lower bound.
-
-  nsReadingIterator<char> iter;
-  aString.BeginReading(iter);
-
-  const char* ptr = iter.get();
-  const char* end = done_reading.get();
-  while (ptr < end) {
-    uint8_t c;
-
-    if (0 == state) {
-      c = *ptr++;
-
-      if (UTF8traits::isASCII(c)) {
-        continue;
-      }
-
-      if (c <= 0xC1) { // [80-BF] where not expected, [C0-C1] for overlong.
-        return false;
-      } else if (UTF8traits::is2byte(c)) {
-        state = 1;
-      } else if (UTF8traits::is3byte(c)) {
-        state = 2;
-        if (c == 0xE0) { // to exclude E0[80-9F][80-BF]
-          overlong = true;
-          olupper = 0x9F;
-        } else if (c == 0xED) { // ED[A0-BF][80-BF] : surrogate codepoint
-          surrogate = true;
-          slower = 0xA0;
-        } else if (c == 0xEF) { // EF BF [BE-BF] : non-character
-          nonchar = true;
-        }
-      } else if (c <= 0xF4) { // XXX replace /w UTF8traits::is4byte when it's updated to exclude [F5-F7].(bug 199090)
-        state = 3;
-        nonchar = true;
-        if (c == 0xF0) { // to exclude F0[80-8F][80-BF]{2}
-          overlong = true;
-          olupper = 0x8F;
-        } else if (c == 0xF4) { // to exclude F4[90-BF][80-BF]
-          // actually not surrogates but codepoints beyond 0x10FFFF
-          surrogate = true;
-          slower = 0x90;
-        }
-      } else {
-        return false;  // Not UTF-8 string
-      }
-    }
-
-    if (nonchar && !aRejectNonChar) {
-      nonchar = false;
-    }
-
-    while (ptr < end && state) {
-      c = *ptr++;
-      --state;
-
-      // non-character : EF BF [BE-BF] or F[0-7] [89AB]F BF [BE-BF]
-      if (nonchar &&
-          ((!state && c < 0xBE) ||
-           (state == 1 && c != 0xBF)  ||
-           (state == 2 && 0x0F != (0x0F & c)))) {
-        nonchar = false;
-      }
-
-      if (!UTF8traits::isInSeq(c) || (overlong && c <= olupper) ||
-          (surrogate && slower <= c) || (nonchar && !state)) {
-        return false;  // Not UTF-8 string
-      }
-
-      overlong = surrogate = false;
-    }
-  }
-  return !state; // state != 0 at the end indicates an invalid UTF-8 seq.
-}
-
 /**
 * A character sink for in-place case conversion.
 */
--- a/xpcom/string/nsReadableUtils.h
+++ b/xpcom/string/nsReadableUtils.h
@@ -18,6 +18,12 @@

 #include "nsTArrayForwardDeclare.h"

+// Can't include mozilla/Encoding.h here
+extern "C" {
+  size_t encoding_utf8_valid_up_to(uint8_t const* buffer, size_t buffer_len);
+  size_t encoding_ascii_valid_up_to(uint8_t const* buffer, size_t buffer_len);
+}
+
 inline size_t
 Distance(const nsReadingIterator<char16_t>& aStart,
         const nsReadingIterator<char16_t>& aEnd)
@@ -253,40 +259,57 @@ bool IsASCII(const nsAString& aString);
 *
 * @param aString a 8-bit wide string to scan
 */
-bool IsASCII(const nsACString& aString);
+inline bool IsASCII(const nsACString& aString)
+{
+  const size_t byteCount = aString.Length();
+  const uint8_t* bytes =
+    reinterpret_cast<const uint8_t*>(aString.BeginReading());
+  if (byteCount >= 16) {
+    // Short strings are handled below; everything else is delegated to Rust.
+    // Known inefficiency: this isn't fail-fast when the by-register check
+    // already finds non-ASCII. Since input is almost always ASCII, even that
+    // check wouldn't need to be fail-fast and could mirror the loop below.
+    return encoding_ascii_valid_up_to(bytes, byteCount) == byteCount;
+  }
+  // Short strings stay in C++: calling into Rust would be a pessimization.
+  // Handling the empty string here also makes null-checking the pointer
+  // unnecessary (passing nullptr to Rust would technically be UB).
+  size_t seenBits = 0;
+  for (size_t pos = 0; pos != byteCount; ++pos) {
+    seenBits |= bytes[pos];
+  }
+  return seenBits < 0x80;
+}

 /**
 * Returns |true| if |aString| is a valid UTF-8 string.
- * XXX This is not bullet-proof and nor an all-purpose UTF-8 validator.
- * It is mainly written to replace and roughly equivalent to
 *
- *    str.Equals(NS_ConvertUTF16toUTF8(NS_ConvertUTF8toUTF16(str)))
- *
- * (see bug 191541)
- * As such,  it does not check for non-UTF-8 7bit encodings such as
- * ISO-2022-JP and HZ.
- *
- * It rejects sequences with the following errors:
- *
- * byte sequences that cannot be decoded into characters according to
- *   UTF-8's rules (including cases where the input is part of a valid
- *   UTF-8 sequence but starts or ends mid-character)
- * overlong sequences (i.e., cases where a character was encoded
- *   non-canonically by using more bytes than necessary)
- * surrogate codepoints (i.e., the codepoints reserved for
-     representing astral characters in UTF-16)
- * codepoints above the unicode range (i.e., outside the first 17
- *   planes; higher than U+10FFFF), in accordance with
- *   http://tools.ietf.org/html/rfc3629
- * when aRejectNonChar is true (the default), any codepoint whose low
- *   16 bits are 0xFFFE or 0xFFFF
-
+ * Note that this doesn't check whether the string might look like a valid
+ * string in another encoding, too, e.g. ISO-2022-JP.
 *
 * @param aString an 8-bit wide string to scan
- * @param aRejectNonChar a boolean to control the rejection of utf-8
- *        non characters
 */
-bool IsUTF8(const nsACString& aString, bool aRejectNonChar = true);
+inline bool IsUTF8(const nsACString& aString)
+{
+  size_t remaining = aString.Length();
+  const uint8_t* cursor =
+    reinterpret_cast<const uint8_t*>(aString.BeginReading());
+  // Short strings: calling into Rust is a pessimization and the SIMD code
+  // wouldn't kick in anyway. Handling the empty string here also makes
+  // null-checking the pointer unnecessary (nullptr into Rust is technically UB).
+  if (remaining < 16) {
+    size_t asciiRun = 0;
+    while (asciiRun < remaining && cursor[asciiRun] < 0x80) {
+      ++asciiRun;
+    }
+    if (asciiRun == remaining) {
+      return true;  // Nothing but ASCII (or nothing at all) was seen.
+    }
+    cursor += asciiRun;  // Hand only the part from the first non-ASCII byte on.
+    remaining -= asciiRun;
+  }
+  return encoding_utf8_valid_up_to(cursor, remaining) == remaining;
+}

 bool ParseString(const nsACString& aAstring, char aDelimiter,
                 nsTArray<nsCString>& aArray);