This commit is contained in:
Billy O'Neal 2024-01-23 16:42:13 -08:00 коммит произвёл GitHub
Родитель 78681de322
Коммит 8d673fc81a
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: B5690EEEBB952194
3 изменённых файлов: 213 добавлений и 178 удалений

Просмотреть файл

@ -6,16 +6,6 @@
namespace vcpkg::Unicode
{
// Classification of a single UTF-8 code unit (one byte), as produced by utf8_code_unit_kind.
// The numeric value of each Start* enumerator is the length, in code units, of the sequence that
// the unit introduces; utf8_code_unit_count depends on this by casting the kind straight to int.
enum class Utf8CodeUnitKind
{
// Code units of 0b1111'1000 and above.
Invalid = -1,
// Code units in [0b1000'0000, 0b1100'0000): a trailing unit of a multi-unit sequence.
Continue = 0,
// Code units below 0b1000'0000: a complete one-unit sequence.
StartOne = 1,
// Code units in [0b1100'0000, 0b1110'0000): first unit of a two-unit sequence.
StartTwo = 2,
// Code units in [0b1110'0000, 0b1111'0000): first unit of a three-unit sequence.
StartThree = 3,
// Code units in [0b1111'0000, 0b1111'1000): first unit of a four-unit sequence.
StartFour = 4,
};
constexpr static char32_t end_of_file = 0xFFFF'FFFF;
enum class utf8_errc
@ -29,18 +19,12 @@ namespace vcpkg::Unicode
UnexpectedEof = 6,
};
Utf8CodeUnitKind utf8_code_unit_kind(unsigned char code_unit) noexcept;
int utf8_code_unit_count(char code_unit) noexcept;
int utf8_encode_code_point(char (&array)[4], char32_t code_point) noexcept;
// returns {after-current-code-point, error},
// and if error = NoError, then out = parsed code point.
// else, out = end_of_file.
std::pair<const char*, utf8_errc> utf8_decode_code_point(const char* first,
const char* last,
char32_t& out) noexcept;
// Attempts to decode one code point from the beginning of [first, last). On success, advances first past the
// last code unit consumed, stores the code point in out, and returns utf8_errc::NoError. If the range is empty,
// leaves first unchanged, stores end_of_file in out, and returns utf8_errc::NoError. Otherwise, advances first
// to last, stores end_of_file in out, and returns the utf8_errc value describing the failure.
utf8_errc utf8_decode_code_point(const char*& first, const char* last, char32_t& out) noexcept;
// uses the C++20 definition
/*
@ -117,9 +101,13 @@ namespace vcpkg::Unicode
*/
struct Utf8Decoder
{
constexpr Utf8Decoder() noexcept : current_(end_of_file), next_(nullptr), last_(nullptr) { }
constexpr Utf8Decoder() noexcept
: current_(end_of_file), pointer_to_current_(nullptr), next_(nullptr), last_(nullptr)
{
}
explicit constexpr Utf8Decoder(StringView sv) noexcept : Utf8Decoder(sv.begin(), sv.end()) { }
constexpr Utf8Decoder(const char* first, const char* last) noexcept : current_(0), next_(first), last_(last)
constexpr Utf8Decoder(const char* first, const char* last) noexcept
: current_(0), pointer_to_current_(first), next_(first), last_(last)
{
if (next_ != last_)
{
@ -135,7 +123,7 @@ namespace vcpkg::Unicode
{
}
constexpr Utf8Decoder(const char* first, const char* last, utf8_errc& first_decode_error) noexcept
: current_(0), next_(first), last_(last)
: current_(0), pointer_to_current_(first), next_(first), last_(last)
{
if (next_ != last_)
{
@ -158,7 +146,7 @@ namespace vcpkg::Unicode
Utf8Decoder& operator=(sentinel) noexcept;
char const* pointer_to_current() const noexcept;
char const* pointer_to_current() const noexcept { return pointer_to_current_; }
char32_t operator*() const noexcept
{
@ -186,7 +174,7 @@ namespace vcpkg::Unicode
Checks::unreachable(VCPKG_LINE_INFO);
}
return lhs.next_ == rhs.next_ && lhs.current_ == rhs.current_;
return lhs.pointer_to_current_ == rhs.pointer_to_current_;
}
friend constexpr bool operator!=(const Utf8Decoder& lhs, const Utf8Decoder& rhs) noexcept
{
@ -205,6 +193,7 @@ namespace vcpkg::Unicode
private:
char32_t current_;
const char* pointer_to_current_;
const char* next_;
const char* last_;
};

Просмотреть файл

@ -56,11 +56,12 @@ TEST_CASE ("Utf8Decoder valid", "[unicode]")
REQUIRE(!decode.is_eof());
char32_t decoded;
auto pointer_to_current = decode.pointer_to_current();
REQUIRE(utf8_decode_code_point(pointer_to_current, input_end, decoded).second == utf8_errc::NoError);
const auto original_pointer_to_current = pointer_to_current;
REQUIRE(utf8_decode_code_point(pointer_to_current, input_end, decoded) == utf8_errc::NoError);
REQUIRE(decoded == expected[idx]);
char encoded[4];
auto encoded_size = utf8_encode_code_point(encoded, decoded);
REQUIRE(std::equal(encoded, encoded + encoded_size, pointer_to_current));
REQUIRE(std::equal(encoded, encoded + encoded_size, original_pointer_to_current));
++decode;
}
@ -104,3 +105,43 @@ TEST_CASE ("Utf8Decoder invalid", "[unicode]")
REQUIRE(uut.is_eof());
}
// A decoder constructed over an empty string must start out at EOF, and its
// pointer_to_current() must be the beginning (which is also the end) of the input.
TEST_CASE ("Utf8Decoder empty current", "[unicode]")
{
// Only the terminating NUL is stored.
char storage[] = "";
Utf8Decoder uut(storage);
REQUIRE(uut.pointer_to_current() == storage);
REQUIRE(uut.is_eof());
}
// utf8_is_valid_string must reject every generated input. Each input is ASCII text with one
// malformed sequence, placed either in the middle of the text or as its final bytes.
TEST_CASE ("utf8_is_valid_string fails", "[unicode]")
{
// GENERATE runs the remainder of the test case once per listed string.
const char* test = GENERATE("hello \xFF too big",
"hello \xC3\xBF\xBF\xBF also too big",
"hello \x9C continuation",
"hello \xE0\x28 overlong",
"hello \xED\xA0\xBC\xED\xBF\x88 paired WTF-8",
"missing two: \xC3",
"missing three one: \xE6\x9C",
"missing three two: \xE6",
"missing four one: \xF0\x9F\x8F",
"missing four two: \xF0\x9F",
"missing four three: \xF0");
// None of the inputs contains an embedded NUL, so strlen covers the whole literal.
REQUIRE(!utf8_is_valid_string(test, test + strlen(test)));
}
// utf8_is_valid_string must reject every generated input. Each input consists solely of a
// malformed sequence, so the failure is reported by the last decode before the end of the range.
TEST_CASE ("utf8_is_valid_string fails at end", "[unicode]")
{
// GENERATE runs the remainder of the test case once per listed string.
const char* test = GENERATE("\xFF",
"\xC3\xBF\xBF\xBF",
"\x9C",
"\xE0\x28",
"\xED\xA0\xBC\xED\xBF\x88",
"\xC3",
"\xE6\x9C",
"\xE6",
"\xF0\x9F\x8F",
"\xF0\x9F",
"\xF0");
// None of the inputs contains an embedded NUL, so strlen covers the whole literal.
REQUIRE(!utf8_is_valid_string(test, test + strlen(test)));
}

Просмотреть файл

@ -3,176 +3,185 @@
namespace vcpkg::Unicode
{
// Classifies one UTF-8 code unit by the pattern of its leading bits.
// The thresholds are checked in ascending order, so each test only needs an upper bound.
Utf8CodeUnitKind utf8_code_unit_kind(unsigned char code_unit) noexcept
{
    // 0xxx'xxxx
    if (code_unit < 0b1000'0000) return Utf8CodeUnitKind::StartOne;

    // 10xx'xxxx
    if (code_unit < 0b1100'0000) return Utf8CodeUnitKind::Continue;

    // 110x'xxxx
    if (code_unit < 0b1110'0000) return Utf8CodeUnitKind::StartTwo;

    // 1110'xxxx
    if (code_unit < 0b1111'0000) return Utf8CodeUnitKind::StartThree;

    // 1111'0xxx
    if (code_unit < 0b1111'1000) return Utf8CodeUnitKind::StartFour;

    // 1111'1xxx
    return Utf8CodeUnitKind::Invalid;
}
static constexpr int utf8_code_unit_count(Utf8CodeUnitKind kind) noexcept { return static_cast<int>(kind); }
int utf8_code_unit_count(char code_unit) noexcept
{
return utf8_code_unit_count(utf8_code_unit_kind(static_cast<unsigned char>(code_unit)));
}
// Number of UTF-8 code units needed to encode code_point: 1 through 4 for values below 0x110000.
// Larger values are reported through Checks::msg_exit_with_message together with the value in hex.
// NOTE(review): msg_exit_with_message is presumably [[noreturn]], since no return follows it - confirm.
static constexpr int utf8_encode_code_unit_count(char32_t code_point) noexcept
{
    if (code_point < 0x80) return 1;
    if (code_point < 0x800) return 2;
    if (code_point < 0x10000) return 3;
    if (code_point < 0x110000) return 4;

    vcpkg::Checks::msg_exit_with_message(
        VCPKG_LINE_INFO,
        msg::format(msgInvalidCodePoint).append_raw(fmt::format("({:x})", static_cast<uint32_t>(code_point))));
}
// Encodes code_point as UTF-8 into array and returns the number of code units written (1 through 4).
// Code points of 0x110000 and above cannot be encoded; they are reported through
// Checks::msg_exit_with_message with msgInvalidCodePoint and the value formatted in hex.
// NOTE(review): the body below shows lines from two revisions of this function interleaved (a
// lambda/loop based one and a per-length unrolled one); separate them before compiling.
int utf8_encode_code_point(char (&array)[4], char32_t code_point) noexcept
{
// count \in {2, 3, 4}
const auto start_code_point = [](char32_t code_point, int count) {
const unsigned char and_mask = 0xFF >> (count + 1);
const unsigned char or_mask = (0xFF << (8 - count)) & 0xFF;
const int shift = 6 * (count - 1);
return static_cast<char>(or_mask | ((code_point >> shift) & and_mask));
};
// count \in {2, 3, 4}, byte \in {1, 2, 3}
const auto continue_code_point = [](char32_t code_point, int count, int byte) {
constexpr unsigned char and_mask = 0xFF >> 2;
constexpr unsigned char or_mask = (0xFF << 7) & 0xFF;
const int shift = 6 * (count - byte - 1);
return static_cast<char>(or_mask | ((code_point >> shift) & and_mask));
};
int count = utf8_encode_code_unit_count(code_point);
if (count == 1)
if (code_point < 0x80)
{
array[0] = static_cast<char>(code_point);
return 1;
}
array[0] = start_code_point(code_point, count);
for (int i = 1; i < count; ++i)
if (code_point < 0x800)
{
array[i] = continue_code_point(code_point, count, i);
array[0] = static_cast<unsigned char>(0b1100'0000u | (code_point >> 6));
array[1] = static_cast<unsigned char>(0b1000'0000u | (code_point & 0b0011'1111u));
return 2;
}
return count;
// Three code units: 4 payload bits in the lead unit, then 6 + 6 in the trailing units.
if (code_point < 0x10000)
{
// clang-format off
array[0] = static_cast<unsigned char>(0b1110'0000u | (code_point >> 12));
array[1] = static_cast<unsigned char>(0b1000'0000u | ((code_point >> 6) & 0b0011'1111u));
array[2] = static_cast<unsigned char>(0b1000'0000u | (code_point & 0b0011'1111u));
// clang-format on
return 3;
}
// Four code units: 3 payload bits in the lead unit, then 6 + 6 + 6 in the trailing units.
if (code_point < 0x110000)
{
// clang-format off
array[0] = static_cast<unsigned char>(0b1111'0000u | (code_point >> 18));
array[1] = static_cast<unsigned char>(0b1000'0000u | ((code_point >> 12) & 0b0011'1111u));
array[2] = static_cast<unsigned char>(0b1000'0000u | ((code_point >> 6) & 0b0011'1111u));
array[3] = static_cast<unsigned char>(0b1000'0000u | (code_point & 0b0011'1111u));
// clang-format on
return 4;
}
// Reached only for code_point >= 0x110000.
vcpkg::Checks::msg_exit_with_message(
VCPKG_LINE_INFO,
msg::format(msgInvalidCodePoint).append_raw(fmt::format("({:x})", static_cast<uint32_t>(code_point))));
}
std::pair<const char*, utf8_errc> utf8_decode_code_point(const char* first,
const char* last,
char32_t& out) noexcept
// Validates a code unit that is expected to be a trailing (10xx'xxxx) unit of a multi-unit sequence.
// Returns NoError for a trailing unit, InvalidCodeUnit for units of 0b1111'1000 and above, and
// UnexpectedStart for every other unit.
static utf8_errc check_trailing(unsigned char code_unit) noexcept
{
    const bool is_trailing_unit = (code_unit & 0b1100'0000u) == 0b1000'0000u;
    if (is_trailing_unit)
    {
        return utf8_errc::NoError;
    }

    return code_unit >= 0b1111'1000u ? utf8_errc::InvalidCodeUnit : utf8_errc::UnexpectedStart;
}
// Decodes one code point from the front of [first, last).
// On success, out receives the code point, first is advanced past the code units consumed, and
// utf8_errc::NoError is returned. An empty range also returns utf8_errc::NoError, with out set to
// end_of_file and first left unchanged.
// On failure, out is set to end_of_file, first is moved to last, and the error is returned:
// UnexpectedContinue or InvalidCodeUnit for a bad leading unit, UnexpectedEof when fewer units remain
// than the leading unit calls for, the check_trailing result for a bad trailing unit, and
// InvalidCodePoint for a four-unit sequence that decodes above 0x10'FFFF.
// NOTE(review): no check for overlong encodings or surrogate code points is visible in this function.
// NOTE(review): the body below shows lines from two revisions interleaved (one returning
// {pointer, error} pairs and one advancing first by reference); the description above follows the
// signature declared here. Separate the revisions before compiling.
utf8_errc utf8_decode_code_point(const char*& first, const char* last, char32_t& out) noexcept
{
out = end_of_file;
if (first == last)
{
return {last, utf8_errc::NoError};
out = end_of_file;
return utf8_errc::NoError;
}
auto code_unit = *first;
auto kind = utf8_code_unit_kind(static_cast<unsigned char>(code_unit));
const int count = utf8_code_unit_count(kind);
const char* it = first + 1;
if (kind == Utf8CodeUnitKind::Invalid)
auto code_unit = static_cast<unsigned char>(*first);
if (code_unit < 0b1000'0000u)
{
return {it, utf8_errc::InvalidCodeUnit};
}
else if (kind == Utf8CodeUnitKind::Continue)
{
return {it, utf8_errc::UnexpectedContinue};
}
else if (count > last - first)
{
return {last, utf8_errc::UnexpectedEof};
out = code_unit;
++first;
return utf8_errc::NoError;
}
if (count == 1)
if (code_unit < 0b1100'0000u)
{
out = static_cast<char32_t>(code_unit);
return {it, utf8_errc::NoError};
out = end_of_file;
first = last;
return utf8_errc::UnexpectedContinue;
}
// 2 -> 0b0001'1111, 6
// 3 -> 0b0000'1111, 12
// 4 -> 0b0000'0111, 18
const auto start_mask = static_cast<unsigned char>(0xFF >> (count + 1));
const int start_shift = 6 * (count - 1);
char32_t code_point = static_cast<char32_t>(code_unit & start_mask) << start_shift;
constexpr unsigned char continue_mask = 0b0011'1111;
for (int byte = 1; byte < count; ++byte)
if (code_unit < 0b1110'0000u)
{
code_unit = *it++;
kind = utf8_code_unit_kind(code_unit);
if (kind == Utf8CodeUnitKind::Invalid)
if (2 > last - first)
{
return {it, utf8_errc::InvalidCodeUnit};
}
else if (kind != Utf8CodeUnitKind::Continue)
{
return {it, utf8_errc::UnexpectedStart};
out = end_of_file;
first = last;
return utf8_errc::UnexpectedEof;
}
const int shift = 6 * (count - byte - 1);
code_point |= (code_unit & continue_mask) << shift;
utf8_errc out_error;
if ((out_error = check_trailing(static_cast<unsigned char>(first[1]))) != utf8_errc::NoError)
{
out = end_of_file;
first = last;
return out_error;
}
out = ((code_unit & 0b0001'1111) << 6) | (static_cast<unsigned char>(first[1]) & 0b0011'1111u);
first += 2;
return utf8_errc::NoError;
}
if (code_point > 0x10'FFFF)
if (code_unit < 0b1111'0000u)
{
return {it, utf8_errc::InvalidCodePoint};
if (3 > last - first)
{
out = end_of_file;
first = last;
return utf8_errc::UnexpectedEof;
}
utf8_errc out_error;
if ((out_error = check_trailing(static_cast<unsigned char>(first[1]))) != utf8_errc::NoError ||
(out_error = check_trailing(static_cast<unsigned char>(first[2]))) != utf8_errc::NoError)
{
out = end_of_file;
first = last;
return out_error;
}
// clang-format off
out = ((code_unit & 0b0000'1111) << 12)
| ((static_cast<unsigned char>(first[1]) & 0b0011'1111u) << 6)
| (static_cast<unsigned char>(first[2]) & 0b0011'1111u);
// clang-format on
first += 3;
return utf8_errc::NoError;
}
out = code_point;
return {it, utf8_errc::NoError};
if (code_unit < 0b1111'1000u)
{
if (4 > last - first)
{
out = end_of_file;
first = last;
return utf8_errc::UnexpectedEof;
}
utf8_errc out_error;
if ((out_error = check_trailing(static_cast<unsigned char>(first[1]))) != utf8_errc::NoError ||
(out_error = check_trailing(static_cast<unsigned char>(first[2]))) != utf8_errc::NoError ||
(out_error = check_trailing(static_cast<unsigned char>(first[3]))) != utf8_errc::NoError)
{
out = end_of_file;
first = last;
return out_error;
}
// clang-format off
out = ((code_unit & 0b0000'0111) << 18)
| ((static_cast<unsigned char>(first[1]) & 0b0011'1111u) << 12)
| ((static_cast<unsigned char>(first[2]) & 0b0011'1111u) << 6)
| (static_cast<unsigned char>(first[3]) & 0b0011'1111u);
// clang-format on
if (out > 0x10'FFFF)
{
out = end_of_file;
first = last;
return utf8_errc::InvalidCodePoint;
}
first += 4;
return utf8_errc::NoError;
}
out = end_of_file;
first = last;
return utf8_errc::InvalidCodeUnit;
}
// Returns true when [first, last) can be walked to EOF by a Utf8Decoder with the most recently
// reported error still equal to utf8_errc::NoError.
// NOTE(review): the body below shows lines from two revisions interleaved (a for loop comparing
// against dec.end(), and a while loop seeded with the first-decode error); err is declared twice as
// shown. Separate the revisions before compiling.
bool utf8_is_valid_string(const char* first, const char* last) noexcept
{
utf8_errc err = utf8_errc::NoError;
for (auto dec = Utf8Decoder(first, last); dec != dec.end(); err = dec.next())
utf8_errc err;
// This constructor reports the result of decoding the first code point through err.
Utf8Decoder dec(first, last, err);
while (!dec.is_eof())
{
err = dec.next();
}
return err == utf8_errc::NoError;
}
@ -203,17 +212,6 @@ namespace vcpkg::Unicode
}
}
// Returns a pointer to the first code unit of the current code point, or last_ once the decoder is
// at EOF. The position is reconstructed by stepping back from next_ by the number of code units that
// re-encoding current_ would produce.
// NOTE(review): this assumes the input used exactly that many code units for current_ - confirm
// that inputs which decode to the same code point from a longer sequence cannot reach here.
char const* Utf8Decoder::pointer_to_current() const noexcept
{
    return is_eof() ? last_ : next_ - utf8_encode_code_unit_count(current_);
}
utf8_errc Utf8Decoder::next() noexcept
{
if (is_eof())
@ -222,28 +220,34 @@ namespace vcpkg::Unicode
Checks::unreachable(VCPKG_LINE_INFO);
}
if (next_ == last_)
const auto old_next = next_;
const auto last = last_;
if (old_next == last)
{
current_ = end_of_file;
pointer_to_current_ = last;
return utf8_errc::NoError;
}
char32_t code_point;
auto new_next = utf8_decode_code_point(next_, last_, code_point);
if (new_next.second != utf8_errc::NoError)
auto err = utf8_decode_code_point(next_, last, code_point);
if (err != utf8_errc::NoError)
{
*this = sentinel();
return new_next.second;
current_ = end_of_file;
pointer_to_current_ = last;
return err;
}
if (utf16_is_trailing_surrogate_code_point(code_point) && utf16_is_leading_surrogate_code_point(current_))
{
*this = sentinel();
current_ = end_of_file;
pointer_to_current_ = last;
next_ = last;
return utf8_errc::PairedSurrogates;
}
next_ = new_next.first;
current_ = code_point;
pointer_to_current_ = old_next;
return utf8_errc::NoError;
}
@ -261,8 +265,9 @@ namespace vcpkg::Unicode
// Assigning the sentinel moves the decoder to its end state: current_ becomes end_of_file, and both
// pointer_to_current_ and next_ are set to last_. Returns *this.
// NOTE(review): next_ is assigned twice below because lines from two revisions are shown
// interleaved; one of the two assignments is redundant.
Utf8Decoder& Utf8Decoder::operator=(sentinel) noexcept
{
next_ = last_;
current_ = end_of_file;
pointer_to_current_ = last_;
next_ = last_;
return *this;
}
}