Improve Unicode perf. (#1318)
This commit is contained in:
Родитель
78681de322
Коммит
8d673fc81a
|
@ -6,16 +6,6 @@
|
|||
|
||||
namespace vcpkg::Unicode
|
||||
{
|
||||
// Classification of a single UTF-8 code unit (one byte).
// For the Start* enumerators the numeric value is the number of code units in
// the sequence that the byte begins; Continue is 0 and Invalid is -1.
enum class Utf8CodeUnitKind
{
    Invalid = -1,   // neither a start byte nor a continuation byte
    Continue = 0,   // 10xxxxxx: trailing byte of a multi-unit sequence
    StartOne = 1,   // 0xxxxxxx: complete one-unit sequence
    StartTwo = 2,   // 110xxxxx: first of two units
    StartThree = 3, // 1110xxxx: first of three units
    StartFour = 4,  // 11110xxx: first of four units
};
|
||||
|
||||
// Sentinel "code point" stored in the decoder output when no code point was decoded
// (end of input or a decoding error).
constexpr static char32_t end_of_file = 0xFFFF'FFFF;
|
||||
|
||||
enum class utf8_errc
|
||||
|
@ -29,18 +19,12 @@ namespace vcpkg::Unicode
|
|||
UnexpectedEof = 6,
|
||||
};
|
||||
|
||||
Utf8CodeUnitKind utf8_code_unit_kind(unsigned char code_unit) noexcept;
|
||||
|
||||
int utf8_code_unit_count(char code_unit) noexcept;
|
||||
|
||||
int utf8_encode_code_point(char (&array)[4], char32_t code_point) noexcept;
|
||||
|
||||
// returns {after-current-code-point, error},
|
||||
// and if error = NoError, then out = parsed code point.
|
||||
// else, out = end_of_file.
|
||||
std::pair<const char*, utf8_errc> utf8_decode_code_point(const char* first,
|
||||
const char* last,
|
||||
char32_t& out) noexcept;
|
||||
// If possible, decodes one codepoint from the beginning of [first, last). If successful advances first after the
|
||||
// last decoded encoding unit, stores the codepoint in out, and returns utf8_errc::NoError.
|
||||
// Otherwise, advances first to last, stores end_of_file in out, and returns one of the utf8_errc values.
|
||||
utf8_errc utf8_decode_code_point(const char*& first, const char* last, char32_t& out) noexcept;
|
||||
|
||||
// uses the C++20 definition
|
||||
/*
|
||||
|
@ -117,9 +101,13 @@ namespace vcpkg::Unicode
|
|||
*/
|
||||
struct Utf8Decoder
|
||||
{
|
||||
constexpr Utf8Decoder() noexcept : current_(end_of_file), next_(nullptr), last_(nullptr) { }
|
||||
constexpr Utf8Decoder() noexcept
|
||||
: current_(end_of_file), pointer_to_current_(nullptr), next_(nullptr), last_(nullptr)
|
||||
{
|
||||
}
|
||||
explicit constexpr Utf8Decoder(StringView sv) noexcept : Utf8Decoder(sv.begin(), sv.end()) { }
|
||||
constexpr Utf8Decoder(const char* first, const char* last) noexcept : current_(0), next_(first), last_(last)
|
||||
constexpr Utf8Decoder(const char* first, const char* last) noexcept
|
||||
: current_(0), pointer_to_current_(first), next_(first), last_(last)
|
||||
{
|
||||
if (next_ != last_)
|
||||
{
|
||||
|
@ -135,7 +123,7 @@ namespace vcpkg::Unicode
|
|||
{
|
||||
}
|
||||
constexpr Utf8Decoder(const char* first, const char* last, utf8_errc& first_decode_error) noexcept
|
||||
: current_(0), next_(first), last_(last)
|
||||
: current_(0), pointer_to_current_(first), next_(first), last_(last)
|
||||
{
|
||||
if (next_ != last_)
|
||||
{
|
||||
|
@ -158,7 +146,7 @@ namespace vcpkg::Unicode
|
|||
|
||||
Utf8Decoder& operator=(sentinel) noexcept;
|
||||
|
||||
char const* pointer_to_current() const noexcept;
|
||||
char const* pointer_to_current() const noexcept { return pointer_to_current_; }
|
||||
|
||||
char32_t operator*() const noexcept
|
||||
{
|
||||
|
@ -186,7 +174,7 @@ namespace vcpkg::Unicode
|
|||
Checks::unreachable(VCPKG_LINE_INFO);
|
||||
}
|
||||
|
||||
return lhs.next_ == rhs.next_ && lhs.current_ == rhs.current_;
|
||||
return lhs.pointer_to_current_ == rhs.pointer_to_current_;
|
||||
}
|
||||
friend constexpr bool operator!=(const Utf8Decoder& lhs, const Utf8Decoder& rhs) noexcept
|
||||
{
|
||||
|
@ -205,6 +193,7 @@ namespace vcpkg::Unicode
|
|||
|
||||
private:
|
||||
char32_t current_;
|
||||
const char* pointer_to_current_;
|
||||
const char* next_;
|
||||
const char* last_;
|
||||
};
|
||||
|
|
|
@ -56,11 +56,12 @@ TEST_CASE ("Utf8Decoder valid", "[unicode]")
|
|||
REQUIRE(!decode.is_eof());
|
||||
char32_t decoded;
|
||||
auto pointer_to_current = decode.pointer_to_current();
|
||||
REQUIRE(utf8_decode_code_point(pointer_to_current, input_end, decoded).second == utf8_errc::NoError);
|
||||
const auto original_pointer_to_current = pointer_to_current;
|
||||
REQUIRE(utf8_decode_code_point(pointer_to_current, input_end, decoded) == utf8_errc::NoError);
|
||||
REQUIRE(decoded == expected[idx]);
|
||||
char encoded[4];
|
||||
auto encoded_size = utf8_encode_code_point(encoded, decoded);
|
||||
REQUIRE(std::equal(encoded, encoded + encoded_size, pointer_to_current));
|
||||
REQUIRE(std::equal(encoded, encoded + encoded_size, original_pointer_to_current));
|
||||
++decode;
|
||||
}
|
||||
|
||||
|
@ -104,3 +105,43 @@ TEST_CASE ("Utf8Decoder invalid", "[unicode]")
|
|||
|
||||
REQUIRE(uut.is_eof());
|
||||
}
|
||||
|
||||
// A decoder built over an empty string is at EOF immediately, and its
// pointer_to_current() refers to the start of the (empty) buffer.
TEST_CASE ("Utf8Decoder empty current", "[unicode]")
{
    char storage[] = "";
    Utf8Decoder uut(storage);
    REQUIRE(uut.pointer_to_current() == storage);
    REQUIRE(uut.is_eof());
}
|
||||
|
||||
// Each generated input has a malformed sequence after a valid ASCII prefix
// (some are followed by more text, the "missing ..." ones end in a truncated
// sequence); utf8_is_valid_string must reject every one of them.
TEST_CASE ("utf8_is_valid_string fails", "[unicode]")
{
    const char* test = GENERATE("hello \xFF too big",
                                "hello \xC3\xBF\xBF\xBF also too big",
                                "hello \x9C continuation",
                                "hello \xE0\x28 overlong",
                                "hello \xED\xA0\xBC\xED\xBF\x88 paired WTF-8",
                                "missing two: \xC3",
                                "missing three one: \xE6\x9C",
                                "missing three two: \xE6",
                                "missing four one: \xF0\x9F\x8F",
                                "missing four two: \xF0\x9F",
                                "missing four three: \xF0");
    REQUIRE(!utf8_is_valid_string(test, test + strlen(test)));
}
|
||||
|
||||
// Same malformed sequences as the "fails" case, but with no surrounding text,
// so the error is in (or reaches) the very last code units of the input.
TEST_CASE ("utf8_is_valid_string fails at end", "[unicode]")
{
    const char* test = GENERATE("\xFF",
                                "\xC3\xBF\xBF\xBF",
                                "\x9C",
                                "\xE0\x28",
                                "\xED\xA0\xBC\xED\xBF\x88",
                                "\xC3",
                                "\xE6\x9C",
                                "\xE6",
                                "\xF0\x9F\x8F",
                                "\xF0\x9F",
                                "\xF0");
    REQUIRE(!utf8_is_valid_string(test, test + strlen(test)));
}
|
|
@ -3,176 +3,185 @@
|
|||
|
||||
namespace vcpkg::Unicode
|
||||
{
|
||||
// Classifies one UTF-8 code unit by its leading bits:
//   0xxxxxxx -> StartOne
//   10xxxxxx -> Continue
//   110xxxxx -> StartTwo
//   1110xxxx -> StartThree
//   11110xxx -> StartFour
//   11111xxx -> Invalid
Utf8CodeUnitKind utf8_code_unit_kind(unsigned char code_unit) noexcept
{
    // The thresholds are checked in ascending order, so each test only needs an upper bound.
    if (code_unit < 0b1000'0000)
    {
        return Utf8CodeUnitKind::StartOne;
    }

    if (code_unit < 0b1100'0000)
    {
        return Utf8CodeUnitKind::Continue;
    }

    if (code_unit < 0b1110'0000)
    {
        return Utf8CodeUnitKind::StartTwo;
    }

    if (code_unit < 0b1111'0000)
    {
        return Utf8CodeUnitKind::StartThree;
    }

    if (code_unit < 0b1111'1000)
    {
        return Utf8CodeUnitKind::StartFour;
    }

    return Utf8CodeUnitKind::Invalid;
}
|
||||
|
||||
// Converts a code unit kind to its underlying integer value: 1-4 for the Start* kinds,
// 0 for Continue and -1 for Invalid.
static constexpr int utf8_code_unit_count(Utf8CodeUnitKind kind) noexcept { return static_cast<int>(kind); }
|
||||
|
||||
int utf8_code_unit_count(char code_unit) noexcept
|
||||
{
|
||||
return utf8_code_unit_count(utf8_code_unit_kind(static_cast<unsigned char>(code_unit)));
|
||||
}
|
||||
|
||||
static constexpr int utf8_encode_code_unit_count(char32_t code_point) noexcept
|
||||
{
|
||||
if (code_point < 0x80)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
else if (code_point < 0x800)
|
||||
{
|
||||
return 2;
|
||||
}
|
||||
else if (code_point < 0x10000)
|
||||
{
|
||||
return 3;
|
||||
}
|
||||
else if (code_point < 0x110000)
|
||||
{
|
||||
return 4;
|
||||
}
|
||||
else
|
||||
{
|
||||
vcpkg::Checks::msg_exit_with_message(
|
||||
VCPKG_LINE_INFO,
|
||||
msg::format(msgInvalidCodePoint).append_raw(fmt::format("({:x})", static_cast<uint32_t>(code_point))));
|
||||
}
|
||||
}
|
||||
|
||||
int utf8_encode_code_point(char (&array)[4], char32_t code_point) noexcept
|
||||
{
|
||||
// count \in {2, 3, 4}
|
||||
const auto start_code_point = [](char32_t code_point, int count) {
|
||||
const unsigned char and_mask = 0xFF >> (count + 1);
|
||||
const unsigned char or_mask = (0xFF << (8 - count)) & 0xFF;
|
||||
const int shift = 6 * (count - 1);
|
||||
return static_cast<char>(or_mask | ((code_point >> shift) & and_mask));
|
||||
};
|
||||
// count \in {2, 3, 4}, byte \in {1, 2, 3}
|
||||
const auto continue_code_point = [](char32_t code_point, int count, int byte) {
|
||||
constexpr unsigned char and_mask = 0xFF >> 2;
|
||||
constexpr unsigned char or_mask = (0xFF << 7) & 0xFF;
|
||||
const int shift = 6 * (count - byte - 1);
|
||||
return static_cast<char>(or_mask | ((code_point >> shift) & and_mask));
|
||||
};
|
||||
|
||||
int count = utf8_encode_code_unit_count(code_point);
|
||||
if (count == 1)
|
||||
if (code_point < 0x80)
|
||||
{
|
||||
array[0] = static_cast<char>(code_point);
|
||||
return 1;
|
||||
}
|
||||
|
||||
array[0] = start_code_point(code_point, count);
|
||||
for (int i = 1; i < count; ++i)
|
||||
if (code_point < 0x800)
|
||||
{
|
||||
array[i] = continue_code_point(code_point, count, i);
|
||||
array[0] = static_cast<unsigned char>(0b1100'0000u | (code_point >> 6));
|
||||
array[1] = static_cast<unsigned char>(0b1000'0000u | (code_point & 0b0011'1111u));
|
||||
return 2;
|
||||
}
|
||||
|
||||
return count;
|
||||
if (code_point < 0x10000)
|
||||
{
|
||||
// clang-format off
|
||||
array[0] = static_cast<unsigned char>(0b1110'0000u | (code_point >> 12));
|
||||
array[1] = static_cast<unsigned char>(0b1000'0000u | ((code_point >> 6) & 0b0011'1111u));
|
||||
array[2] = static_cast<unsigned char>(0b1000'0000u | (code_point & 0b0011'1111u));
|
||||
// clang-format on
|
||||
return 3;
|
||||
}
|
||||
|
||||
if (code_point < 0x110000)
|
||||
{
|
||||
// clang-format off
|
||||
array[0] = static_cast<unsigned char>(0b1111'0000u | (code_point >> 18));
|
||||
array[1] = static_cast<unsigned char>(0b1000'0000u | ((code_point >> 12) & 0b0011'1111u));
|
||||
array[2] = static_cast<unsigned char>(0b1000'0000u | ((code_point >> 6) & 0b0011'1111u));
|
||||
array[3] = static_cast<unsigned char>(0b1000'0000u | (code_point & 0b0011'1111u));
|
||||
// clang-format on
|
||||
return 4;
|
||||
}
|
||||
|
||||
vcpkg::Checks::msg_exit_with_message(
|
||||
VCPKG_LINE_INFO,
|
||||
msg::format(msgInvalidCodePoint).append_raw(fmt::format("({:x})", static_cast<uint32_t>(code_point))));
|
||||
}
|
||||
|
||||
std::pair<const char*, utf8_errc> utf8_decode_code_point(const char* first,
|
||||
const char* last,
|
||||
char32_t& out) noexcept
|
||||
// Validates a code unit found where a continuation byte (10xxxxxx) is required.
// Returns:
//   utf8_errc::NoError          - code_unit is a continuation byte
//   utf8_errc::InvalidCodeUnit  - code_unit is 0xF8 or above
//   utf8_errc::UnexpectedStart  - any other non-continuation byte
static utf8_errc check_trailing(unsigned char code_unit) noexcept
{
    if ((code_unit & 0b1100'0000u) == 0b1000'0000u)
    {
        return utf8_errc::NoError;
    }

    if (code_unit >= 0b1111'1000u)
    {
        return utf8_errc::InvalidCodeUnit;
    }

    return utf8_errc::UnexpectedStart;
}
|
||||
|
||||
// Decodes one code point from the front of [first, last).
//
// On success: stores the code point in out, advances first past the code units consumed, and
// returns utf8_errc::NoError.
// On an empty range: stores end_of_file in out, leaves first unchanged (== last), and returns
// utf8_errc::NoError.
// On failure: stores end_of_file in out, sets first to last, and returns the error
// (UnexpectedContinue, UnexpectedEof, UnexpectedStart, InvalidCodeUnit or InvalidCodePoint).
//
// The start byte selects one of four branches by sequence length. Each multi-unit branch first
// checks that enough input remains, then validates every trailing byte with check_trailing before
// assembling the value.
// NOTE(review): no branch here rejects overlong encodings or surrogate code points.
utf8_errc utf8_decode_code_point(const char*& first, const char* last, char32_t& out) noexcept
{
    if (first == last)
    {
        out = end_of_file;
        return utf8_errc::NoError;
    }

    auto code_unit = static_cast<unsigned char>(*first);
    if (code_unit < 0b1000'0000u)
    {
        // 0xxxxxxx: the byte is the code point.
        out = code_unit;
        ++first;
        return utf8_errc::NoError;
    }

    if (code_unit < 0b1100'0000u)
    {
        // 10xxxxxx: a continuation byte cannot start a sequence.
        out = end_of_file;
        first = last;
        return utf8_errc::UnexpectedContinue;
    }

    if (code_unit < 0b1110'0000u)
    {
        // 110xxxxx 10xxxxxx
        if (2 > last - first)
        {
            out = end_of_file;
            first = last;
            return utf8_errc::UnexpectedEof;
        }

        utf8_errc out_error;
        if ((out_error = check_trailing(static_cast<unsigned char>(first[1]))) != utf8_errc::NoError)
        {
            out = end_of_file;
            first = last;
            return out_error;
        }

        out = ((code_unit & 0b0001'1111) << 6) | (static_cast<unsigned char>(first[1]) & 0b0011'1111u);
        first += 2;
        return utf8_errc::NoError;
    }

    if (code_unit < 0b1111'0000u)
    {
        // 1110xxxx 10xxxxxx 10xxxxxx
        if (3 > last - first)
        {
            out = end_of_file;
            first = last;
            return utf8_errc::UnexpectedEof;
        }

        utf8_errc out_error;
        if ((out_error = check_trailing(static_cast<unsigned char>(first[1]))) != utf8_errc::NoError ||
            (out_error = check_trailing(static_cast<unsigned char>(first[2]))) != utf8_errc::NoError)
        {
            out = end_of_file;
            first = last;
            return out_error;
        }

        // clang-format off
        out = ((code_unit & 0b0000'1111) << 12)
            | ((static_cast<unsigned char>(first[1]) & 0b0011'1111u) << 6)
            | (static_cast<unsigned char>(first[2]) & 0b0011'1111u);
        // clang-format on
        first += 3;
        return utf8_errc::NoError;
    }

    if (code_unit < 0b1111'1000u)
    {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        if (4 > last - first)
        {
            out = end_of_file;
            first = last;
            return utf8_errc::UnexpectedEof;
        }

        utf8_errc out_error;
        if ((out_error = check_trailing(static_cast<unsigned char>(first[1]))) != utf8_errc::NoError ||
            (out_error = check_trailing(static_cast<unsigned char>(first[2]))) != utf8_errc::NoError ||
            (out_error = check_trailing(static_cast<unsigned char>(first[3]))) != utf8_errc::NoError)
        {
            out = end_of_file;
            first = last;
            return out_error;
        }

        // clang-format off
        out = ((code_unit & 0b0000'0111) << 18)
            | ((static_cast<unsigned char>(first[1]) & 0b0011'1111u) << 12)
            | ((static_cast<unsigned char>(first[2]) & 0b0011'1111u) << 6)
            | (static_cast<unsigned char>(first[3]) & 0b0011'1111u);
        // clang-format on

        // A four-unit sequence can express values beyond the last Unicode code point.
        if (out > 0x10'FFFF)
        {
            out = end_of_file;
            first = last;
            return utf8_errc::InvalidCodePoint;
        }

        first += 4;
        return utf8_errc::NoError;
    }

    // 11111xxx: not a valid start byte.
    out = end_of_file;
    first = last;
    return utf8_errc::InvalidCodeUnit;
}
|
||||
|
||||
bool utf8_is_valid_string(const char* first, const char* last) noexcept
|
||||
{
|
||||
utf8_errc err = utf8_errc::NoError;
|
||||
for (auto dec = Utf8Decoder(first, last); dec != dec.end(); err = dec.next())
|
||||
utf8_errc err;
|
||||
Utf8Decoder dec(first, last, err);
|
||||
while (!dec.is_eof())
|
||||
{
|
||||
err = dec.next();
|
||||
}
|
||||
|
||||
return err == utf8_errc::NoError;
|
||||
}
|
||||
|
||||
|
@ -203,17 +212,6 @@ namespace vcpkg::Unicode
|
|||
}
|
||||
}
|
||||
|
||||
char const* Utf8Decoder::pointer_to_current() const noexcept
|
||||
{
|
||||
if (is_eof())
|
||||
{
|
||||
return last_;
|
||||
}
|
||||
|
||||
auto count = utf8_encode_code_unit_count(current_);
|
||||
return next_ - count;
|
||||
}
|
||||
|
||||
utf8_errc Utf8Decoder::next() noexcept
|
||||
{
|
||||
if (is_eof())
|
||||
|
@ -222,28 +220,34 @@ namespace vcpkg::Unicode
|
|||
Checks::unreachable(VCPKG_LINE_INFO);
|
||||
}
|
||||
|
||||
if (next_ == last_)
|
||||
const auto old_next = next_;
|
||||
const auto last = last_;
|
||||
if (old_next == last)
|
||||
{
|
||||
current_ = end_of_file;
|
||||
pointer_to_current_ = last;
|
||||
return utf8_errc::NoError;
|
||||
}
|
||||
|
||||
char32_t code_point;
|
||||
auto new_next = utf8_decode_code_point(next_, last_, code_point);
|
||||
if (new_next.second != utf8_errc::NoError)
|
||||
auto err = utf8_decode_code_point(next_, last, code_point);
|
||||
if (err != utf8_errc::NoError)
|
||||
{
|
||||
*this = sentinel();
|
||||
return new_next.second;
|
||||
current_ = end_of_file;
|
||||
pointer_to_current_ = last;
|
||||
return err;
|
||||
}
|
||||
|
||||
if (utf16_is_trailing_surrogate_code_point(code_point) && utf16_is_leading_surrogate_code_point(current_))
|
||||
{
|
||||
*this = sentinel();
|
||||
current_ = end_of_file;
|
||||
pointer_to_current_ = last;
|
||||
next_ = last;
|
||||
return utf8_errc::PairedSurrogates;
|
||||
}
|
||||
|
||||
next_ = new_next.first;
|
||||
current_ = code_point;
|
||||
pointer_to_current_ = old_next;
|
||||
return utf8_errc::NoError;
|
||||
}
|
||||
|
||||
|
@ -261,8 +265,9 @@ namespace vcpkg::Unicode
|
|||
|
||||
// Assigning the sentinel puts the decoder in its end state: the current code point becomes
// end_of_file and both position pointers are set to last_.
Utf8Decoder& Utf8Decoder::operator=(sentinel) noexcept
{
    current_ = end_of_file;
    pointer_to_current_ = last_;
    next_ = last_;
    return *this;
}
|
||||
}
|
||||
|
|
Загрузка…
Ссылка в новой задаче