Improve Unicode perf. (#1318)

2024-01-23 16:42:13 -08:00 · 2024-01-23 16:42:13 -08:00 · 8d673fc81a
--- a/include/vcpkg/base/unicode.h
+++ b/include/vcpkg/base/unicode.h
@ -6,16 +6,6 @@

 namespace vcpkg::Unicode
 {
-    enum class Utf8CodeUnitKind
-    {
-        Invalid = -1,
-        Continue = 0,
-        StartOne = 1,
-        StartTwo = 2,
-        StartThree = 3,
-        StartFour = 4,
-    };
-
    constexpr static char32_t end_of_file = 0xFFFF'FFFF;

    enum class utf8_errc
@ -29,18 +19,12 @@ namespace vcpkg::Unicode
        UnexpectedEof = 6,
    };

-    Utf8CodeUnitKind utf8_code_unit_kind(unsigned char code_unit) noexcept;
-
-    int utf8_code_unit_count(char code_unit) noexcept;
-
    int utf8_encode_code_point(char (&array)[4], char32_t code_point) noexcept;

-    // returns {after-current-code-point, error},
-    // and if error = NoError, then out = parsed code point.
-    // else, out = end_of_file.
-    std::pair<const char*, utf8_errc> utf8_decode_code_point(const char* first,
-                                                             const char* last,
-                                                             char32_t& out) noexcept;
+    // Decodes one code point from the start of [first, last). On success, advances first past the decoded code units,
+    // stores the code point in out, and returns utf8_errc::NoError; an empty range stores end_of_file and also returns
+    // NoError. On failure, advances first to last, stores end_of_file in out, and returns the specific utf8_errc.
+    utf8_errc utf8_decode_code_point(const char*& first, const char* last, char32_t& out) noexcept;

    // uses the C++20 definition
    /*
@ -117,9 +101,13 @@ namespace vcpkg::Unicode
    */
    struct Utf8Decoder
    {
-        constexpr Utf8Decoder() noexcept : current_(end_of_file), next_(nullptr), last_(nullptr) { }
+        // Default-constructed decoder: current_ is end_of_file and every pointer member is null.
+        constexpr Utf8Decoder() noexcept
+            : current_(end_of_file), pointer_to_current_(nullptr), next_(nullptr), last_(nullptr)
+        {
+        }
        explicit constexpr Utf8Decoder(StringView sv) noexcept : Utf8Decoder(sv.begin(), sv.end()) { }
-        constexpr Utf8Decoder(const char* first, const char* last) noexcept : current_(0), next_(first), last_(last)
+        constexpr Utf8Decoder(const char* first, const char* last) noexcept
+            : current_(0), pointer_to_current_(first), next_(first), last_(last)
        {
            if (next_ != last_)
            {
@ -135,7 +123,7 @@ namespace vcpkg::Unicode
        {
        }
        constexpr Utf8Decoder(const char* first, const char* last, utf8_errc& first_decode_error) noexcept
-            : current_(0), next_(first), last_(last)
+            : current_(0), pointer_to_current_(first), next_(first), last_(last)
        {
            if (next_ != last_)
            {
@ -158,7 +146,7 @@ namespace vcpkg::Unicode

        Utf8Decoder& operator=(sentinel) noexcept;

-        char const* pointer_to_current() const noexcept;
+        // Returns the stored pointer_to_current_ member; no position is computed at call time.
+        char const* pointer_to_current() const noexcept { return pointer_to_current_; }

        char32_t operator*() const noexcept
        {
@ -186,7 +174,7 @@ namespace vcpkg::Unicode
                Checks::unreachable(VCPKG_LINE_INFO);
            }

-            return lhs.next_ == rhs.next_ && lhs.current_ == rhs.current_;
+            return lhs.pointer_to_current_ == rhs.pointer_to_current_;
        }
        friend constexpr bool operator!=(const Utf8Decoder& lhs, const Utf8Decoder& rhs) noexcept
        {
@ -205,6 +193,7 @@ namespace vcpkg::Unicode

    private:
        char32_t current_;
+        const char* pointer_to_current_;
        const char* next_;
        const char* last_;
    };
--- a/src/vcpkg-test/unicode.cpp
+++ b/src/vcpkg-test/unicode.cpp
@ -56,11 +56,12 @@ TEST_CASE ("Utf8Decoder valid", "[unicode]")
        REQUIRE(!decode.is_eof());
        char32_t decoded;
        auto pointer_to_current = decode.pointer_to_current();
-        REQUIRE(utf8_decode_code_point(pointer_to_current, input_end, decoded).second == utf8_errc::NoError);
+        const auto original_pointer_to_current = pointer_to_current;
+        REQUIRE(utf8_decode_code_point(pointer_to_current, input_end, decoded) == utf8_errc::NoError);
        REQUIRE(decoded == expected[idx]);
        char encoded[4];
        auto encoded_size = utf8_encode_code_point(encoded, decoded);
-        REQUIRE(std::equal(encoded, encoded + encoded_size, pointer_to_current));
+        REQUIRE(std::equal(encoded, encoded + encoded_size, original_pointer_to_current));
        ++decode;
    }

@ -104,3 +105,43 @@ TEST_CASE ("Utf8Decoder invalid", "[unicode]")

    REQUIRE(uut.is_eof());
 }
+
+// An empty buffer must decode to an immediate end-of-file whose pointer_to_current() is the start of the buffer.
+TEST_CASE ("Utf8Decoder empty current", "[unicode]")
+{
+    char storage[] = "";
+    Utf8Decoder uut(storage);
+    REQUIRE(uut.pointer_to_current() == storage);
+    REQUIRE(uut.is_eof());
+}
+
+// Malformed or truncated sequences that follow ordinary ASCII text must make utf8_is_valid_string return false.
+// NOTE(review): "\xE0\x28" is labelled "overlong", but 0x28 is not a continuation byte, so it looks like this case
+// is rejected for the bad trailing unit rather than for being overlong - confirm whether a true overlong is intended.
+TEST_CASE ("utf8_is_valid_string fails", "[unicode]")
+{
+    const char* test = GENERATE("hello \xFF too big",
+                                "hello \xC3\xBF\xBF\xBF also too big",
+                                "hello \x9C continuation",
+                                "hello \xE0\x28 overlong",
+                                "hello \xED\xA0\xBC\xED\xBF\x88 paired WTF-8",
+                                "missing two: \xC3",
+                                "missing three one: \xE6\x9C",
+                                "missing three two: \xE6",
+                                "missing four one: \xF0\x9F\x8F",
+                                "missing four two: \xF0\x9F",
+                                "missing four three: \xF0");
+    REQUIRE(!utf8_is_valid_string(test, test + strlen(test)));
+}
+
+// The same malformed sequences with no surrounding text, so the whole input is just the bad sequence and the
+// failure is the last thing the decoder reports before reaching end-of-file.
+TEST_CASE ("utf8_is_valid_string fails at end", "[unicode]")
+{
+    const char* test = GENERATE("\xFF",
+                                "\xC3\xBF\xBF\xBF",
+                                "\x9C",
+                                "\xE0\x28",
+                                "\xED\xA0\xBC\xED\xBF\x88",
+                                "\xC3",
+                                "\xE6\x9C",
+                                "\xE6",
+                                "\xF0\x9F\x8F",
+                                "\xF0\x9F",
+                                "\xF0");
+    REQUIRE(!utf8_is_valid_string(test, test + strlen(test)));
+}
--- a/src/vcpkg/base/unicode.cpp
+++ b/src/vcpkg/base/unicode.cpp
@ -3,176 +3,185 @@

 namespace vcpkg::Unicode
 {
-    Utf8CodeUnitKind utf8_code_unit_kind(unsigned char code_unit) noexcept
-    {
-        if (code_unit < 0b1000'0000)
-        {
-            return Utf8CodeUnitKind::StartOne;
-        }
-        else if (code_unit < 0b1100'0000)
-        {
-            return Utf8CodeUnitKind::Continue;
-        }
-        else if (code_unit < 0b1110'0000)
-        {
-            return Utf8CodeUnitKind::StartTwo;
-        }
-        else if (code_unit < 0b1111'0000)
-        {
-            return Utf8CodeUnitKind::StartThree;
-        }
-        else if (code_unit < 0b1111'1000)
-        {
-            return Utf8CodeUnitKind::StartFour;
-        }
-        else
-        {
-            return Utf8CodeUnitKind::Invalid;
-        }
-    }
-
-    static constexpr int utf8_code_unit_count(Utf8CodeUnitKind kind) noexcept { return static_cast<int>(kind); }
-
-    int utf8_code_unit_count(char code_unit) noexcept
-    {
-        return utf8_code_unit_count(utf8_code_unit_kind(static_cast<unsigned char>(code_unit)));
-    }
-
-    static constexpr int utf8_encode_code_unit_count(char32_t code_point) noexcept
-    {
-        if (code_point < 0x80)
-        {
-            return 1;
-        }
-        else if (code_point < 0x800)
-        {
-            return 2;
-        }
-        else if (code_point < 0x10000)
-        {
-            return 3;
-        }
-        else if (code_point < 0x110000)
-        {
-            return 4;
-        }
-        else
-        {
-            vcpkg::Checks::msg_exit_with_message(
-                VCPKG_LINE_INFO,
-                msg::format(msgInvalidCodePoint).append_raw(fmt::format("({:x})", static_cast<uint32_t>(code_point))));
-        }
-    }
-
    int utf8_encode_code_point(char (&array)[4], char32_t code_point) noexcept
    {
+        // Writes the UTF-8 encoding of code_point into array and returns how many code units (1-4) were written.
+        // Surrogate values are not special-cased; anything from 0x800 up to (but excluding) 0x10000 takes the
+        // three-unit path. Values of 0x110000 and above fall through to msg_exit_with_message at the bottom.
-        // count \in {2, 3, 4}
-        const auto start_code_point = [](char32_t code_point, int count) {
-            const unsigned char and_mask = 0xFF >> (count + 1);
-            const unsigned char or_mask = (0xFF << (8 - count)) & 0xFF;
-            const int shift = 6 * (count - 1);
-            return static_cast<char>(or_mask | ((code_point >> shift) & and_mask));
-        };
-        // count \in {2, 3, 4}, byte \in {1, 2, 3}
-        const auto continue_code_point = [](char32_t code_point, int count, int byte) {
-            constexpr unsigned char and_mask = 0xFF >> 2;
-            constexpr unsigned char or_mask = (0xFF << 7) & 0xFF;
-            const int shift = 6 * (count - byte - 1);
-            return static_cast<char>(or_mask | ((code_point >> shift) & and_mask));
-        };
-
-        int count = utf8_encode_code_unit_count(code_point);
-        if (count == 1)
+        if (code_point < 0x80)
        {
            array[0] = static_cast<char>(code_point);
            return 1;
        }

-        array[0] = start_code_point(code_point, count);
-        for (int i = 1; i < count; ++i)
+        // Two units: 110x'xxxx 10xx'xxxx (11 payload bits).
+        if (code_point < 0x800)
        {
-            array[i] = continue_code_point(code_point, count, i);
+            array[0] = static_cast<unsigned char>(0b1100'0000u | (code_point >> 6));
+            array[1] = static_cast<unsigned char>(0b1000'0000u | (code_point & 0b0011'1111u));
+            return 2;
        }

-        return count;
+        // Three units: 1110'xxxx 10xx'xxxx 10xx'xxxx (16 payload bits).
+        if (code_point < 0x10000)
+        {
+            // clang-format off
+            array[0] = static_cast<unsigned char>(0b1110'0000u | (code_point  >> 12));
+            array[1] = static_cast<unsigned char>(0b1000'0000u | ((code_point >> 6) & 0b0011'1111u));
+            array[2] = static_cast<unsigned char>(0b1000'0000u | (code_point        & 0b0011'1111u));
+            // clang-format on
+            return 3;
+        }
+
+        // Four units: 1111'0xxx followed by three 10xx'xxxx units; capped at the last code point below 0x110000.
+        if (code_point < 0x110000)
+        {
+            // clang-format off
+            array[0] = static_cast<unsigned char>(0b1111'0000u |  (code_point >> 18));
+            array[1] = static_cast<unsigned char>(0b1000'0000u | ((code_point >> 12) & 0b0011'1111u));
+            array[2] = static_cast<unsigned char>(0b1000'0000u | ((code_point >> 6)  & 0b0011'1111u));
+            array[3] = static_cast<unsigned char>(0b1000'0000u |  (code_point        & 0b0011'1111u));
+            // clang-format on
+            return 4;
+        }
+
+        // Not encodable. NOTE(review): no value is returned after this call, so msg_exit_with_message is presumably
+        // declared as non-returning - confirm against its declaration.
+        vcpkg::Checks::msg_exit_with_message(
+            VCPKG_LINE_INFO,
+            msg::format(msgInvalidCodePoint).append_raw(fmt::format("({:x})", static_cast<uint32_t>(code_point))));
    }

-    std::pair<const char*, utf8_errc> utf8_decode_code_point(const char* first,
-                                                             const char* last,
-                                                             char32_t& out) noexcept
+    // Classifies a byte that is expected to be a UTF-8 continuation unit (bit pattern 10xx'xxxx).
+    // Returns NoError for a continuation unit, InvalidCodeUnit for bytes at or above 0b1111'1000, and
+    // UnexpectedStart for every other non-continuation byte.
+    static utf8_errc check_trailing(unsigned char code_unit) noexcept
+    {
+        if ((code_unit & 0b1100'0000u) != 0b1000'0000u)
+        {
+            if (code_unit >= 0b1111'1000u)
+            {
+                return utf8_errc::InvalidCodeUnit;
+            }
+
+            return utf8_errc::UnexpectedStart;
+        }
+
+        return utf8_errc::NoError;
+    }
+
+    // Decodes a single code point from the front of [first, last).
+    //  * Empty range: out = end_of_file, first is left alone, NoError is returned.
+    //  * Success: out holds the code point and first is advanced past the 1-4 code units consumed.
+    //  * Failure: out = end_of_file, first = last, and the specific utf8_errc is returned.
+    // NOTE(review): nothing here rejects overlong forms (e.g. a 0xC0 or 0xC1 lead byte) or lone surrogates in the
+    // three-unit branch; the only range check is the > 0x10'FFFF test in the four-unit branch.
+    utf8_errc utf8_decode_code_point(const char*& first, const char* last, char32_t& out) noexcept
    {
-        out = end_of_file;
        if (first == last)
        {
-            return {last, utf8_errc::NoError};
+            out = end_of_file;
+            return utf8_errc::NoError;
        }

-        auto code_unit = *first;
-        auto kind = utf8_code_unit_kind(static_cast<unsigned char>(code_unit));
-        const int count = utf8_code_unit_count(kind);
-
-        const char* it = first + 1;
-
-        if (kind == Utf8CodeUnitKind::Invalid)
+        auto code_unit = static_cast<unsigned char>(*first);
+        // 0xxx'xxxx: the single code unit is the code point.
+        if (code_unit < 0b1000'0000u)
        {
-            return {it, utf8_errc::InvalidCodeUnit};
-        }
-        else if (kind == Utf8CodeUnitKind::Continue)
-        {
-            return {it, utf8_errc::UnexpectedContinue};
-        }
-        else if (count > last - first)
-        {
-            return {last, utf8_errc::UnexpectedEof};
+            out = code_unit;
+            ++first;
+            return utf8_errc::NoError;
        }

-        if (count == 1)
+        // 10xx'xxxx: a continuation unit where a leading unit was required.
+        if (code_unit < 0b1100'0000u)
        {
-            out = static_cast<char32_t>(code_unit);
-            return {it, utf8_errc::NoError};
+            out = end_of_file;
+            first = last;
+            return utf8_errc::UnexpectedContinue;
        }

-        // 2 -> 0b0001'1111, 6
-        // 3 -> 0b0000'1111, 12
-        // 4 -> 0b0000'0111, 18
-        const auto start_mask = static_cast<unsigned char>(0xFF >> (count + 1));
-        const int start_shift = 6 * (count - 1);
-        char32_t code_point = static_cast<char32_t>(code_unit & start_mask) << start_shift;
-
-        constexpr unsigned char continue_mask = 0b0011'1111;
-        for (int byte = 1; byte < count; ++byte)
+        // 110x'xxxx: two-unit sequence; length is checked before first[1] is read.
+        if (code_unit < 0b1110'0000u)
        {
-            code_unit = *it++;
-
-            kind = utf8_code_unit_kind(code_unit);
-            if (kind == Utf8CodeUnitKind::Invalid)
+            if (2 > last - first)
            {
-                return {it, utf8_errc::InvalidCodeUnit};
-            }
-            else if (kind != Utf8CodeUnitKind::Continue)
-            {
-                return {it, utf8_errc::UnexpectedStart};
+                out = end_of_file;
+                first = last;
+                return utf8_errc::UnexpectedEof;
            }

-            const int shift = 6 * (count - byte - 1);
-            code_point |= (code_unit & continue_mask) << shift;
+            utf8_errc out_error;
+            if ((out_error = check_trailing(static_cast<unsigned char>(first[1]))) != utf8_errc::NoError)
+            {
+                out = end_of_file;
+                first = last;
+                return out_error;
+            }
+
+            out = ((code_unit & 0b0001'1111) << 6) | (static_cast<unsigned char>(first[1]) & 0b0011'1111u);
+            first += 2;
+            return utf8_errc::NoError;
        }

-        if (code_point > 0x10'FFFF)
+        // 1110'xxxx: three-unit sequence; the || chain reports the error of the first bad trailing unit.
+        if (code_unit < 0b1111'0000u)
        {
-            return {it, utf8_errc::InvalidCodePoint};
+            if (3 > last - first)
+            {
+                out = end_of_file;
+                first = last;
+                return utf8_errc::UnexpectedEof;
+            }
+
+            utf8_errc out_error;
+            if ((out_error = check_trailing(static_cast<unsigned char>(first[1]))) != utf8_errc::NoError ||
+                (out_error = check_trailing(static_cast<unsigned char>(first[2]))) != utf8_errc::NoError)
+            {
+                out = end_of_file;
+                first = last;
+                return out_error;
+            }
+
+            // clang-format off
+            out = ((code_unit & 0b0000'1111) << 12)
+                | ((static_cast<unsigned char>(first[1]) & 0b0011'1111u) << 6)
+                |  (static_cast<unsigned char>(first[2]) & 0b0011'1111u);
+            // clang-format on
+            first += 3;
+            return utf8_errc::NoError;
        }

-        out = code_point;
-        return {it, utf8_errc::NoError};
+        // 1111'0xxx: four-unit sequence.
+        if (code_unit < 0b1111'1000u)
+        {
+            if (4 > last - first)
+            {
+                out = end_of_file;
+                first = last;
+                return utf8_errc::UnexpectedEof;
+            }
+
+            utf8_errc out_error;
+            if ((out_error = check_trailing(static_cast<unsigned char>(first[1]))) != utf8_errc::NoError ||
+                (out_error = check_trailing(static_cast<unsigned char>(first[2]))) != utf8_errc::NoError ||
+                (out_error = check_trailing(static_cast<unsigned char>(first[3]))) != utf8_errc::NoError)
+            {
+                out = end_of_file;
+                first = last;
+                return out_error;
+            }
+
+            // clang-format off
+            out = ((code_unit & 0b0000'0111) << 18)
+                | ((static_cast<unsigned char>(first[1]) & 0b0011'1111u) << 12)
+                | ((static_cast<unsigned char>(first[2]) & 0b0011'1111u) << 6)
+                |  (static_cast<unsigned char>(first[3]) & 0b0011'1111u);
+            // clang-format on
+
+            // The 21 payload bits of the four-unit form can exceed the largest code point, so check the result.
+            if (out > 0x10'FFFF)
+            {
+                out = end_of_file;
+                first = last;
+                return utf8_errc::InvalidCodePoint;
+            }
+
+            first += 4;
+            return utf8_errc::NoError;
+        }
+
+        // Leading bytes at or above 0b1111'1000 do not start any sequence handled above.
+        out = end_of_file;
+        first = last;
+        return utf8_errc::InvalidCodeUnit;
    }

    bool utf8_is_valid_string(const char* first, const char* last) noexcept
    {
-        utf8_errc err = utf8_errc::NoError;
-        for (auto dec = Utf8Decoder(first, last); dec != dec.end(); err = dec.next())
+        // Walks the whole range with Utf8Decoder and reports whether the last decode step finished without error.
+        // err is handed to the constructor (its first_decode_error parameter) so that a failure on the very first
+        // code point is not lost. NOTE(review): this relies on that constructor always assigning err - confirm.
+        utf8_errc err;
+        Utf8Decoder dec(first, last, err);
+        while (!dec.is_eof())
        {
+            err = dec.next();
        }
+
        return err == utf8_errc::NoError;
    }

@ -203,17 +212,6 @@ namespace vcpkg::Unicode
        }
    }

-    char const* Utf8Decoder::pointer_to_current() const noexcept
-    {
-        if (is_eof())
-        {
-            return last_;
-        }
-
-        auto count = utf8_encode_code_unit_count(current_);
-        return next_ - count;
-    }
-
    utf8_errc Utf8Decoder::next() noexcept
    {
        if (is_eof())
@ -222,28 +220,34 @@ namespace vcpkg::Unicode
            Checks::unreachable(VCPKG_LINE_INFO);
        }

-        if (next_ == last_)
+        const auto old_next = next_;
+        const auto last = last_;
+        if (old_next == last)
        {
            current_ = end_of_file;
+            pointer_to_current_ = last;
            return utf8_errc::NoError;
        }

        char32_t code_point;
-        auto new_next = utf8_decode_code_point(next_, last_, code_point);
-        if (new_next.second != utf8_errc::NoError)
+        auto err = utf8_decode_code_point(next_, last, code_point);
+        if (err != utf8_errc::NoError)
        {
-            *this = sentinel();
-            return new_next.second;
+            current_ = end_of_file;
+            pointer_to_current_ = last;
+            return err;
        }

        if (utf16_is_trailing_surrogate_code_point(code_point) && utf16_is_leading_surrogate_code_point(current_))
        {
-            *this = sentinel();
+            current_ = end_of_file;
+            pointer_to_current_ = last;
+            next_ = last;
            return utf8_errc::PairedSurrogates;
        }

-        next_ = new_next.first;
        current_ = code_point;
+        pointer_to_current_ = old_next;
        return utf8_errc::NoError;
    }

@ -261,8 +265,9 @@ namespace vcpkg::Unicode

    Utf8Decoder& Utf8Decoder::operator=(sentinel) noexcept
    {
-        next_ = last_;
+        // Puts the decoder into its end state: current_ is end_of_file and both position pointers are set to last_.
        current_ = end_of_file;
+        pointer_to_current_ = last_;
+        next_ = last_;
        return *this;
    }
 }