Fix Utf8Decoder operator== handling of the last code point in the input (#1316)

* Fix Utf8Decoder operator== handling of the last code point in the input While working on diagnostics for https://github.com/microsoft/vcpkg-tool/pull/1210 I observed that we were printing the caret ^ in the wrong place when it goes after the input. The way this works is we form the line of text to print, then decode the unicode encoding units, and when we hit the target, we stop and print ^: 5b8f9c40dd/src/vcpkg/base/parse.cpp (L51-L68) however, if the intended location for the ^ is the "end" of the line, we hit this bug: 5b8f9c40dd/src/vcpkg/base/unicode.cpp (L273) The iterator only compares the last_ pointers, but both the "points at the last code point in the input" and "points to the end of the input" state set `next_ == last_`. See: 5b8f9c40dd/src/vcpkg/base/unicode.cpp (L222-L226) This means that the points to the end and points one past the end iterator compare equal, so the loop in parse.cpp stops one position too early. Also adds a bunch of testing for this specific case, for other parts of Utf8Decoder, adds a way to parse the first code point without failing, makes all the operators 'hidden friends', and removes localized strings for bugs-in-vcpkg-itself. * Add noexcepts as requested by @Thomas1664
2024-01-04 17:36:54 -08:00 · 2024-01-04 17:36:54 -08:00 · 4e75e1c5a5
--- a/include/vcpkg/base/message-data.inc.h
+++ b/include/vcpkg/base/message-data.inc.h
@ -866,10 +866,6 @@ DECLARE_MESSAGE(CommandFailed,
                "{command_line}\n"
                "failed with the following results:")
 DECLARE_MESSAGE(CommunityTriplets, (), "", "Community Triplets:")
-DECLARE_MESSAGE(ComparingUtf8Decoders,
-                (),
-                "",
-                "Comparing Utf8Decoders with different provenance; this is always an error")
 DECLARE_MESSAGE(CompressFolderFailed, (msg::path), "", "Failed to compress folder \"{path}\":")
 DECLARE_MESSAGE(ComputingInstallPlan, (), "", "Computing installation plan...")
 DECLARE_MESSAGE(ConfigurationErrorRegistriesWithoutBaseline,
@ -1702,7 +1698,6 @@ DECLARE_MESSAGE(IllegalPlatformSpec, (), "", "Platform qualifier is not allowed
 DECLARE_MESSAGE(ImproperShaLength, (msg::value), "{value} is a sha.", "SHA512's must be 128 hex characters: {value}")
 DECLARE_MESSAGE(IncorrectArchiveFileSignature, (), "", "Incorrect archive file signature")
 DECLARE_MESSAGE(IncorrectPESignature, (), "", "Incorrect PE signature")
-DECLARE_MESSAGE(IncrementedUtf8Decoder, (), "", "Incremented Utf8Decoder at the end of the string")
 DECLARE_MESSAGE(InfoSetEnvVar,
                (msg::env_var),
                "In this context 'editor' means IDE",
--- a/include/vcpkg/base/unicode.h
+++ b/include/vcpkg/base/unicode.h
@ -31,7 +31,6 @@ namespace vcpkg::Unicode

    Utf8CodeUnitKind utf8_code_unit_kind(unsigned char code_unit) noexcept;

-    constexpr int utf8_code_unit_count(Utf8CodeUnitKind kind) noexcept { return static_cast<int>(kind); }
    int utf8_code_unit_count(char code_unit) noexcept;

    int utf8_encode_code_point(char (&array)[4], char32_t code_point) noexcept;
@ -87,20 +86,20 @@ namespace vcpkg::Unicode

    bool utf8_is_valid_string(const char* first, const char* last) noexcept;

-    constexpr bool utf16_is_leading_surrogate_code_point(char32_t code_point)
+    constexpr bool utf16_is_leading_surrogate_code_point(char32_t code_point) noexcept
    {
        return code_point >= 0xD800 && code_point < 0xDC00;
    }
-    constexpr bool utf16_is_trailing_surrogate_code_point(char32_t code_point)
+    constexpr bool utf16_is_trailing_surrogate_code_point(char32_t code_point) noexcept
    {
        return code_point >= 0xDC00 && code_point < 0xE000;
    }
-    constexpr bool utf16_is_surrogate_code_point(char32_t code_point)
+    constexpr bool utf16_is_surrogate_code_point(char32_t code_point) noexcept
    {
        return code_point >= 0xD800 && code_point < 0xE000;
    }

-    char32_t utf16_surrogates_to_code_point(char32_t leading, char32_t trailing);
+    char32_t utf16_surrogates_to_code_point(char32_t leading, char32_t trailing) noexcept;

    /*
        There are two ways to parse utf-8: we could allow unpaired surrogates (as in [wtf-8]) -- this is important
@ -119,7 +118,7 @@ namespace vcpkg::Unicode
    struct Utf8Decoder
    {
        constexpr Utf8Decoder() noexcept : current_(end_of_file), next_(nullptr), last_(nullptr) { }
-        explicit constexpr Utf8Decoder(StringView sv) : Utf8Decoder(sv.begin(), sv.end()) { }
+        explicit constexpr Utf8Decoder(StringView sv) noexcept : Utf8Decoder(sv.begin(), sv.end()) { }
        constexpr Utf8Decoder(const char* first, const char* last) noexcept : current_(0), next_(first), last_(last)
        {
            if (next_ != last_)
@ -131,13 +130,31 @@ namespace vcpkg::Unicode
                current_ = end_of_file;
            }
        }
+        constexpr Utf8Decoder(StringView sv, utf8_errc& first_decode_error) noexcept
+            : Utf8Decoder(sv.begin(), sv.end(), first_decode_error)
+        {
+        }
+        constexpr Utf8Decoder(const char* first, const char* last, utf8_errc& first_decode_error) noexcept
+            : current_(0), next_(first), last_(last)
+        {
+            if (next_ != last_)
+            {
+                first_decode_error = next();
+            }
+            else
+            {
+                current_ = end_of_file;
+                first_decode_error = utf8_errc::NoError;
+            }
+        }
+
        struct sentinel
        {
        };

        constexpr inline bool is_eof() const noexcept { return current_ == end_of_file; }

-        [[nodiscard]] utf8_errc next();
+        [[nodiscard]] utf8_errc next() noexcept;

        Utf8Decoder& operator=(sentinel) noexcept;

@ -161,7 +178,24 @@ namespace vcpkg::Unicode

        constexpr sentinel end() const { return sentinel(); }

-        friend bool operator==(const Utf8Decoder& lhs, const Utf8Decoder& rhs) noexcept;
+        friend constexpr bool operator==(const Utf8Decoder& lhs, const Utf8Decoder& rhs) noexcept
+        {
+            if (lhs.last_ != rhs.last_)
+            {
+                // comparing decoders of different provenance is always an error
+                Checks::unreachable(VCPKG_LINE_INFO);
+            }
+
+            return lhs.next_ == rhs.next_ && lhs.current_ == rhs.current_;
+        }
+        friend constexpr bool operator!=(const Utf8Decoder& lhs, const Utf8Decoder& rhs) noexcept
+        {
+            return !(lhs == rhs);
+        }
+        friend constexpr bool operator==(const Utf8Decoder& d, Utf8Decoder::sentinel) noexcept { return d.is_eof(); }
+        friend constexpr bool operator==(Utf8Decoder::sentinel s, const Utf8Decoder& d) noexcept { return d == s; }
+        friend constexpr bool operator!=(const Utf8Decoder& d, Utf8Decoder::sentinel) noexcept { return !d.is_eof(); }
+        friend constexpr bool operator!=(Utf8Decoder::sentinel s, const Utf8Decoder& d) noexcept { return d != s; }

        using difference_type = std::ptrdiff_t;
        using value_type = char32_t;
@ -174,11 +208,4 @@ namespace vcpkg::Unicode
        const char* next_;
        const char* last_;
    };
-
-    inline bool operator!=(const Utf8Decoder& lhs, const Utf8Decoder& rhs) noexcept { return !(lhs == rhs); }
-
-    constexpr bool operator==(const Utf8Decoder& d, Utf8Decoder::sentinel) { return d.is_eof(); }
-    constexpr bool operator==(Utf8Decoder::sentinel s, const Utf8Decoder& d) { return d == s; }
-    constexpr bool operator!=(const Utf8Decoder& d, Utf8Decoder::sentinel) { return !d.is_eof(); }
-    constexpr bool operator!=(Utf8Decoder::sentinel s, const Utf8Decoder& d) { return d != s; }
 }
--- a/locales/messages.json
+++ b/locales/messages.json
@ -511,7 +511,6 @@
  "CommandFailed": "command:\n{command_line}\nfailed with the following results:",
  "_CommandFailed.comment": "An example of {command_line} is vcpkg install zlib.",
  "CommunityTriplets": "Community Triplets:",
-  "ComparingUtf8Decoders": "Comparing Utf8Decoders with different provenance; this is always an error",
  "CompressFolderFailed": "Failed to compress folder \"{path}\":",
  "_CompressFolderFailed.comment": "An example of {path} is /foo/bar.",
  "ComputingInstallPlan": "Computing installation plan...",
@ -972,7 +971,6 @@
  "_ImproperShaLength.comment": "{value} is a sha.",
  "IncorrectArchiveFileSignature": "Incorrect archive file signature",
  "IncorrectPESignature": "Incorrect PE signature",
-  "IncrementedUtf8Decoder": "Incremented Utf8Decoder at the end of the string",
  "InfoSetEnvVar": "You can also set {env_var} to your editor of choice.",
  "_InfoSetEnvVar.comment": "In this context 'editor' means IDE An example of {env_var} is VCPKG_DEFAULT_TRIPLET.",
  "InitRegistryFailedNoRepo": "Could not create a registry at {path} because this is not a git repository root.\nUse `git init {command_line}` to create a git repository in this folder.",
--- a/src/vcpkg-test/ci-baseline.cpp
+++ b/src/vcpkg-test/ci-baseline.cpp
@ -258,7 +258,7 @@ TEST_CASE ("Parse Errors", "[ci-baseline]")
 {
    check_error("hello", R"(test:1:6: error: expected ':' here
  on expression: hello
-                     ^)");
+                      ^)");

    check_error("hello\n:", R"(test:1:6: error: expected ':' here
  on expression: hello
@ -271,7 +271,7 @@ TEST_CASE ("Parse Errors", "[ci-baseline]")

    check_error("x64-windows:", R"(test:1:13: error: expected a triplet name here (must be lowercase, digits, '-')
  on expression: x64-windows:
-                            ^)");
+                             ^)");

    check_error("x64-windows:\nport:x64-windows=skip",
                R"(test:1:13: error: expected a triplet name here (must be lowercase, digits, '-')
@ -285,7 +285,7 @@ TEST_CASE ("Parse Errors", "[ci-baseline]")
    // clang-format off
    check_error("   \tx64-windows:", R"(test:1:21: error: expected a triplet name here (must be lowercase, digits, '-')
  on expression:    )" "\t" R"(x64-windows:
-                    )" "\t" R"(           ^)");
+                    )" "\t" R"(            ^)");
    // clang-format on

    check_error("port:x64-windows\n=fail", R"(test:1:17: error: expected '=' here
--- a/src/vcpkg-test/manifests.cpp
+++ b/src/vcpkg-test/manifests.cpp
@ -1267,7 +1267,7 @@ TEST_CASE ("license error messages", "[manifests][license]")
    CHECK(messages.error->to_string() ==
          R"(<license string>:1:8: error: Expected a license name, found the end of the string.
  on expression: MIT AND
-                       ^)");
+                        ^)");

    parse_spdx_license_expression("MIT AND unknownlicense", messages);
    CHECK(!messages.error);
--- a/src/vcpkg-test/unicode.cpp
+++ b/src/vcpkg-test/unicode.cpp
@ -0,0 +1,106 @@
+#include <vcpkg-test/util.h>
+
+#include <vcpkg/base/unicode.h>
+
+#include <iterator>
+
+using namespace vcpkg::Unicode;
+
+TEST_CASE ("Utf8Decoder valid", "[unicode]")
+{
+    const char32_t* expected = U"";
+    const char* input = "";
+    SECTION ("hello")
+    {
+        expected = U"hello";
+        input = "hello";
+    }
+
+    SECTION ("all types of code points")
+    {
+        expected = U"one: a two: \u00E9 three: \u672C four: \U0001F3C8";
+        input = "one: a two: \xC3\xA9 three: \xE6\x9C\xAC four: \xF0\x9F\x8F\x88";
+    }
+
+    SECTION ("wtf-8 leading")
+    {
+        // U+1F3C8 as WTF-8
+        static constexpr char32_t storage[] = {0xD83C, 0};
+        expected = storage;
+        input = "\xED\xA0\xBC";
+    }
+
+    SECTION ("wtf-8 trailing")
+    {
+        // U+1F3C8 as WTF-8
+        static constexpr char32_t storage[] = {0xDFC8, 0};
+        expected = storage;
+        input = "\xED\xBF\x88";
+    }
+
+    auto input_end = input + strlen(input);
+    Utf8Decoder decode(input);
+    // strlen for char32_t:
+    size_t expected_size = 0;
+    for (auto* e = expected; *e; ++e)
+    {
+        ++expected_size;
+    }
+
+    auto decode_at_end = std::next(decode, expected_size);
+    for (size_t idx = 0; idx < expected_size; ++idx)
+    {
+        REQUIRE(decode != decode.end());  // compare sentinel
+        REQUIRE(decode != decode_at_end); // compare iterator
+        REQUIRE(*decode == expected[idx]);
+        REQUIRE(!decode.is_eof());
+        char32_t decoded;
+        auto pointer_to_current = decode.pointer_to_current();
+        REQUIRE(utf8_decode_code_point(pointer_to_current, input_end, decoded).second == utf8_errc::NoError);
+        REQUIRE(decoded == expected[idx]);
+        char encoded[4];
+        auto encoded_size = utf8_encode_code_point(encoded, decoded);
+        REQUIRE(std::equal(encoded, encoded + encoded_size, pointer_to_current));
+        ++decode;
+    }
+
+    REQUIRE(decode == decode.end());
+    REQUIRE(decode == decode_at_end);
+}
+
+TEST_CASE ("Utf8Decoder first decode empty", "[unicode]")
+{
+    utf8_errc err;
+    Utf8Decoder uut("", err);
+    REQUIRE(err == utf8_errc::NoError);
+    REQUIRE(uut.is_eof());
+    REQUIRE(uut == uut.end());
+    REQUIRE(uut == uut);
+}
+
+TEST_CASE ("Utf8Decoder invalid", "[unicode]")
+{
+    utf8_errc err;
+    // clang-format off
+    Utf8Decoder uut(GENERATE(
+        "hello \xFF too big",
+        "hello \xC3\xBF\xBF\xBF also too big",
+        "hello \x9C continuation",
+        "hello \xE0\x28 overlong",
+        "hello \xED\xA0\xBC\xED\xBF\x88 paired WTF-8",
+        "missing two: \xC3",
+        "missing three one: \xE6\x9C",
+        "missing three two: \xE6",
+        "missing four one: \xF0\x9F\x8F",
+        "missing four two: \xF0\x9F",
+        "missing four three: \xF0"
+    ), err);
+    // clang-format on
+    while (err == utf8_errc::NoError)
+    {
+        REQUIRE(!uut.is_eof());
+        err = uut.next();
+    }
+
+    REQUIRE(uut.is_eof());
+}
--- a/src/vcpkg/base/unicode.cpp
+++ b/src/vcpkg/base/unicode.cpp
@ -31,6 +31,8 @@ namespace vcpkg::Unicode
        }
    }

+    static constexpr int utf8_code_unit_count(Utf8CodeUnitKind kind) noexcept { return static_cast<int>(kind); }
+
    int utf8_code_unit_count(char code_unit) noexcept
    {
        return utf8_code_unit_count(utf8_code_unit_kind(static_cast<unsigned char>(code_unit)));
@ -174,7 +176,7 @@ namespace vcpkg::Unicode
        return err == utf8_errc::NoError;
    }

-    char32_t utf16_surrogates_to_code_point(char32_t leading, char32_t trailing)
+    char32_t utf16_surrogates_to_code_point(char32_t leading, char32_t trailing) noexcept
    {
        vcpkg::Checks::check_exit(VCPKG_LINE_INFO, utf16_is_leading_surrogate_code_point(leading));
        vcpkg::Checks::check_exit(VCPKG_LINE_INFO, utf16_is_trailing_surrogate_code_point(trailing));
@ -212,11 +214,12 @@ namespace vcpkg::Unicode
        return next_ - count;
    }

-    utf8_errc Utf8Decoder::next()
+    utf8_errc Utf8Decoder::next() noexcept
    {
        if (is_eof())
        {
-            vcpkg::Checks::msg_exit_with_message(VCPKG_LINE_INFO, msgIncrementedUtf8Decoder);
+            // incremented Utf8Decoder at the end of the string
+            Checks::unreachable(VCPKG_LINE_INFO);
        }

        if (next_ == last_)
@ -262,14 +265,4 @@ namespace vcpkg::Unicode
        current_ = end_of_file;
        return *this;
    }
-
-    bool operator==(const Utf8Decoder& lhs, const Utf8Decoder& rhs) noexcept
-    {
-        if (lhs.last_ != rhs.last_)
-        {
-            Checks::msg_exit_with_message(VCPKG_LINE_INFO, msgComparingUtf8Decoders);
-        }
-
-        return lhs.next_ == rhs.next_;
-    }
 }