Bug 1495571 - Part 7: Make the output a lambda. r=efaust

Depends on D7375 Differential Revision: https://phabricator.services.mozilla.com/D7376 --HG-- extra : moz-landing-system : lando
2018-10-02 14:26:01 +00:00 · 2018-10-02 14:26:01 +00:00 · 4d45a905c2
--- a/js/src/vm/CharacterEncoding.cpp
+++ b/js/src/vm/CharacterEncoding.cpp
@ -280,20 +280,15 @@ static const char16_t REPLACEMENT_CHARACTER = 0xFFFD;
 // If making changes to this algorithm, make sure to also update
 // LossyConvertUTF8toUTF16() in dom/wifi/WifiUtils.cpp
 //
-// Scan UTF8 input and (internally, at least) convert it to a series of
+// Scan UTF8 input and (internally, at least) convert it to a series of UTF-16
-// UTF-16 code units. But you can also do odd things like pass
+// code units. But you can also do odd things like pass an empty lambda for
-// CharT=Latin1Char, in which case each output code unit is silently truncated
+// `dst`, in which case the output is discarded entirely--the only effect of
-// to 8 bits; or Action=Count, in which case the output is discarded entirely
+// calling the template that way is error-checking.
-// because we're just counting how many UTF-16 code units of output there are.
+template <InflateUTF8Action Action, OnUTF8Error ErrorAction, typename OutputFn>
 template <InflateUTF8Action Action, OnUTF8Error ErrorAction, typename CharT>
 static bool
-InflateUTF8ToUTF16(JSContext* cx, const UTF8Chars src, CharT* dst, size_t* dstlenp,
+InflateUTF8ToUTF16(JSContext* cx, const UTF8Chars src, OutputFn dst,
                   JS::SmallestEncoding *smallestEncoding)
 {
    static_assert(std::is_same<CharT, char16_t>::value ||
                  std::is_same<CharT, Latin1Char>::value,
                  "bad CharT");
    if (Action != Nop) {
        *smallestEncoding = JS::SmallestEncoding::ASCII;
    }
@ -304,17 +299,12 @@ InflateUTF8ToUTF16(JSContext* cx, const UTF8Chars src, CharT* dst, size_t* dstle
        *smallestEncoding = JS::SmallestEncoding::UTF16;
    };
    // Count how many code units need to be in the inflated string.
    // |i| is the index into |src|, and |j| is the the index into |dst|.
    size_t srclen = src.length();
-    uint32_t j = 0;
+    for (uint32_t i = 0; i < srclen; i++) {
    for (uint32_t i = 0; i < srclen; i++, j++) {
        uint32_t v = uint32_t(src[i]);
        if (!(v & 0x80)) {
            // ASCII code unit.  Simple copy.
-            if (Action == Copy) {
+            dst(uint16_t(v));
                dst[j] = CharT(v);
            }
        } else {
            // Non-ASCII code unit.  Determine its length in bytes (n).
@ -338,9 +328,7 @@ InflateUTF8ToUTF16(JSContext* cx, const UTF8Chars src, CharT* dst, size_t* dstle
                        MOZ_ASSERT(ErrorAction == OnUTF8Error::InsertQuestionMark); \
                        replacement = '?';                              \
                    }                                                   \
-                    if (Action == Copy) {                               \
+                    dst(replacement);                                   \
                        dst[j] = CharT(replacement);                    \
                    }                                                   \
                    n = n2;                                             \
                    goto invalidMultiByteCodeUnit;                      \
                }                                                       \
@ -379,7 +367,6 @@ InflateUTF8ToUTF16(JSContext* cx, const UTF8Chars src, CharT* dst, size_t* dstle
                if (v > 0xff) {
                    RequireUTF16();
                    if (Action == FindEncoding) {
                        MOZ_ASSERT(dst == nullptr);
                        return true;
                    }
                } else {
@ -388,21 +375,13 @@ InflateUTF8ToUTF16(JSContext* cx, const UTF8Chars src, CharT* dst, size_t* dstle
            }
            if (v < 0x10000) {
                // The n-byte UTF8 code unit will fit in a single CharT.
-                if (Action == Copy) {
+                dst(char16_t(v));
                    dst[j] = CharT(v);
                }
            } else {
                v -= 0x10000;
                if (v <= 0xFFFFF) {
                    // The n-byte UTF8 code unit will fit in two CharT units.
-                    if (Action == Copy) {
+                    dst(char16_t((v >> 10) + 0xD800));
-                        dst[j] = CharT((v >> 10) + 0xD800);
+                    dst(char16_t((v & 0x3FF) + 0xDC00));
                    }
                    j++;
                    if (Action == Copy) {
                        dst[j] = CharT((v & 0x3FF) + 0xDC00);
                    }
                } else {
                    // The n-byte UTF8 code unit won't fit in two CharT units.
                    INVALID(ReportTooBigCharacter, v, 1);
@ -420,10 +399,6 @@ InflateUTF8ToUTF16(JSContext* cx, const UTF8Chars src, CharT* dst, size_t* dstle
        }
    }
    if (Action != Nop && Action != FindEncoding) {
        *dstlenp = j;
    }
    return true;
 }
@ -432,12 +407,19 @@ static CharsT
 InflateUTF8StringHelper(JSContext* cx, const UTF8Chars src, size_t* outlen)
 {
    using CharT = typename CharsT::CharT;
    static_assert(std::is_same<CharT, char16_t>::value ||
                  std::is_same<CharT, Latin1Char>::value,
                  "bad CharT");
    *outlen = 0;
    JS::SmallestEncoding encoding;
-    if (!InflateUTF8ToUTF16<Count, ErrorAction, CharT>(cx, src, /* dst = */ nullptr, outlen, &encoding)) {
+    size_t len = 0;
    auto count = [&](char16_t) { len++; };
    if (!InflateUTF8ToUTF16<Count, ErrorAction>(cx, src, count, &encoding)) {
        return CharsT();
    }
    *outlen = len;
    CharT* dst = cx->template pod_malloc<CharT>(*outlen + 1);  // +1 for NUL
    if (!dst) {
@ -451,12 +433,15 @@ InflateUTF8StringHelper(JSContext* cx, const UTF8Chars src, size_t* outlen)
        for (uint32_t i = 0; i < srclen; i++) {
            dst[i] = CharT(src[i]);
        }
    } else if (std::is_same<decltype(dst[0]), Latin1Char>::value) {
        MOZ_ALWAYS_TRUE((InflateUTF8ToUTF16<Copy, OnUTF8Error::InsertQuestionMark, CharT>(cx, src, dst, outlen, &encoding)));
    } else {
-        MOZ_ALWAYS_TRUE((InflateUTF8ToUTF16<Copy, OnUTF8Error::InsertReplacementCharacter, CharT>(cx, src, dst, outlen, &encoding)));
+        constexpr OnUTF8Error errorMode = std::is_same<CharT, Latin1Char>::value
            ? OnUTF8Error::InsertQuestionMark
            : OnUTF8Error::InsertReplacementCharacter;
        size_t j = 0;
        auto push = [&](char16_t c) { dst[j++] = CharT(c); };
        MOZ_ALWAYS_TRUE((InflateUTF8ToUTF16<Copy, errorMode>(cx, src, push, &encoding)));
        MOZ_ASSERT(j == len);
    }
    dst[*outlen] = 0;    // NUL char
    return CharsT(dst, *outlen);
@ -492,11 +477,10 @@ JS::SmallestEncoding
 JS::FindSmallestEncoding(UTF8Chars utf8)
 {
    JS::SmallestEncoding encoding;
-    MOZ_ALWAYS_TRUE((InflateUTF8ToUTF16<FindEncoding, OnUTF8Error::InsertReplacementCharacter, char16_t>(
+    MOZ_ALWAYS_TRUE((InflateUTF8ToUTF16<FindEncoding, OnUTF8Error::InsertReplacementCharacter>(
                         /* cx = */ nullptr,
                         utf8,
-                         /* dst = */ nullptr,
+                         [](char16_t) {},
                         /* dstlen = */ nullptr,
                         &encoding)));
    return encoding;
 }
@ -519,11 +503,10 @@ JS::ConstUTF8CharsZ::validate(size_t aLength)
 {
    MOZ_ASSERT(data_);
    UTF8Chars chars(data_, aLength);
-    InflateUTF8ToUTF16<Nop, OnUTF8Error::Crash, char16_t>(
+    InflateUTF8ToUTF16<Nop, OnUTF8Error::Crash>(
        /* cx = */ nullptr,
        chars,
-        /* dst = */ nullptr,
+        [](char16_t) {},
        /* dstlen = */ nullptr,
        /* smallestEncoding = */ nullptr);
 }
 #endif