Move `_Width_estimate_intervals_v2` to `__msvc_format_ucd_tables.hpp` (#4446)

Co-authored-by: Stephan T. Lavavej <stl@nuwen.net>
This commit is contained in:
S. B. Tam 2024-03-08 10:52:50 +08:00 коммит произвёл GitHub
Родитель ddc5a620c6
Коммит 4378648c98
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: B5690EEEBB952194
4 изменённых файлов: 67 добавлений и 231 удалений

Просмотреть файл

@ -521,6 +521,25 @@ inline constexpr _Unicode_property_data<_Grapheme_Extend_property_values, 363, t
0x1, 0x4, 0x1, 0x2, 0x2e, 0x17, 0x1, 0x3, 0x5, 0x8, 0x7, 0x4, 0x3, 0x37, 0x32, 0x1, 0x1, 0x5, 0xf, 0x7, 0x11,
0x7, 0x2, 0x5, 0x1, 0x7, 0x1, 0x4, 0x4, 0x7, 0x7, 0x60, 0xf0}};
// EastAsianWidth-15.0.0.txt
// Date: 2022-05-24, 17:40:20 GMT [KW, LI]
// Flattened list of code point intervals whose estimated width is 2.
// Entries come in pairs: the first code point of a width-2 run, followed by one past the last code point of that
// run. Code points that fall outside every pair have estimated width 1.
inline constexpr char32_t _Width_estimate_intervals_v2[] = {0x1100, 0x1160, 0x231a, 0x231c, 0x2329, 0x232b, 0x23e9,
0x23ed, 0x23f0, 0x23f1, 0x23f3, 0x23f4, 0x25fd, 0x25ff, 0x2614, 0x2616, 0x2648, 0x2654, 0x267f, 0x2680, 0x2693,
0x2694, 0x26a1, 0x26a2, 0x26aa, 0x26ac, 0x26bd, 0x26bf, 0x26c4, 0x26c6, 0x26ce, 0x26cf, 0x26d4, 0x26d5, 0x26ea,
0x26eb, 0x26f2, 0x26f4, 0x26f5, 0x26f6, 0x26fa, 0x26fb, 0x26fd, 0x26fe, 0x2705, 0x2706, 0x270a, 0x270c, 0x2728,
0x2729, 0x274c, 0x274d, 0x274e, 0x274f, 0x2753, 0x2756, 0x2757, 0x2758, 0x2795, 0x2798, 0x27b0, 0x27b1, 0x27bf,
0x27c0, 0x2b1b, 0x2b1d, 0x2b50, 0x2b51, 0x2b55, 0x2b56, 0x2e80, 0x2e9a, 0x2e9b, 0x2ef4, 0x2f00, 0x2fd6, 0x2ff0,
0x2ffc, 0x3000, 0x303f, 0x3041, 0x3097, 0x3099, 0x3100, 0x3105, 0x3130, 0x3131, 0x318f, 0x3190, 0x31e4, 0x31f0,
0x321f, 0x3220, 0x3248, 0x3250, 0xa48d, 0xa490, 0xa4c7, 0xa960, 0xa97d, 0xac00, 0xd7a4, 0xf900, 0xfb00, 0xfe10,
0xfe1a, 0xfe30, 0xfe53, 0xfe54, 0xfe67, 0xfe68, 0xfe6c, 0xff01, 0xff61, 0xffe0, 0xffe7, 0x16fe0, 0x16fe5, 0x16ff0,
0x16ff2, 0x17000, 0x187f8, 0x18800, 0x18cd6, 0x18d00, 0x18d09, 0x1aff0, 0x1aff4, 0x1aff5, 0x1affc, 0x1affd, 0x1afff,
0x1b000, 0x1b123, 0x1b132, 0x1b133, 0x1b150, 0x1b153, 0x1b155, 0x1b156, 0x1b164, 0x1b168, 0x1b170, 0x1b2fc, 0x1f004,
0x1f005, 0x1f0cf, 0x1f0d0, 0x1f18e, 0x1f18f, 0x1f191, 0x1f19b, 0x1f200, 0x1f203, 0x1f210, 0x1f23c, 0x1f240, 0x1f249,
0x1f250, 0x1f252, 0x1f260, 0x1f266, 0x1f300, 0x1f650, 0x1f680, 0x1f6c6, 0x1f6cc, 0x1f6cd, 0x1f6d0, 0x1f6d3, 0x1f6d5,
0x1f6d8, 0x1f6dc, 0x1f6e0, 0x1f6eb, 0x1f6ed, 0x1f6f4, 0x1f6fd, 0x1f7e0, 0x1f7ec, 0x1f7f0, 0x1f7f1, 0x1f900, 0x1fa00,
0x1fa70, 0x1fa7d, 0x1fa80, 0x1fa89, 0x1fa90, 0x1fabe, 0x1fabf, 0x1fac6, 0x1face, 0x1fadc, 0x1fae0, 0x1fae9, 0x1faf0,
0x1faf9, 0x20000, 0x2fffe, 0x30000, 0x3fffe};
_STD_END
#pragma pop_macro("new")

Просмотреть файл

@ -1018,30 +1018,6 @@ _NODISCARD constexpr bool _Is_execution_charset_self_synchronizing() {
#endif // ^^^ EDG workaround ^^^
}
// Generated per N4950 [format.string.std]/13, by tools/unicode_properties_parse/format_width_estimate_intervals.py
// in the https://github.com/microsoft/stl repository.
// EastAsianWidth-15.0.0.txt
// Date: 2022-05-24, 17:40:20 GMT [KW, LI]
// The list holds the code points at which the estimated width switches between 1 and 2.
// The width is 1 for every code point below the first entry.
inline constexpr char32_t _Width_estimate_intervals_v2[] = { //
0x1100u, 0x1160u, 0x231Au, 0x231Cu, 0x2329u, 0x232Bu, 0x23E9u, 0x23EDu, 0x23F0u, 0x23F1u, 0x23F3u, 0x23F4u, 0x25FDu,
0x25FFu, 0x2614u, 0x2616u, 0x2648u, 0x2654u, 0x267Fu, 0x2680u, 0x2693u, 0x2694u, 0x26A1u, 0x26A2u, 0x26AAu, 0x26ACu,
0x26BDu, 0x26BFu, 0x26C4u, 0x26C6u, 0x26CEu, 0x26CFu, 0x26D4u, 0x26D5u, 0x26EAu, 0x26EBu, 0x26F2u, 0x26F4u, 0x26F5u,
0x26F6u, 0x26FAu, 0x26FBu, 0x26FDu, 0x26FEu, 0x2705u, 0x2706u, 0x270Au, 0x270Cu, 0x2728u, 0x2729u, 0x274Cu, 0x274Du,
0x274Eu, 0x274Fu, 0x2753u, 0x2756u, 0x2757u, 0x2758u, 0x2795u, 0x2798u, 0x27B0u, 0x27B1u, 0x27BFu, 0x27C0u, 0x2B1Bu,
0x2B1Du, 0x2B50u, 0x2B51u, 0x2B55u, 0x2B56u, 0x2E80u, 0x2E9Au, 0x2E9Bu, 0x2EF4u, 0x2F00u, 0x2FD6u, 0x2FF0u, 0x2FFCu,
0x3000u, 0x303Fu, 0x3041u, 0x3097u, 0x3099u, 0x3100u, 0x3105u, 0x3130u, 0x3131u, 0x318Fu, 0x3190u, 0x31E4u, 0x31F0u,
0x321Fu, 0x3220u, 0x3248u, 0x3250u, 0xA48Du, 0xA490u, 0xA4C7u, 0xA960u, 0xA97Du, 0xAC00u, 0xD7A4u, 0xF900u, 0xFB00u,
0xFE10u, 0xFE1Au, 0xFE30u, 0xFE53u, 0xFE54u, 0xFE67u, 0xFE68u, 0xFE6Cu, 0xFF01u, 0xFF61u, 0xFFE0u, 0xFFE7u,
0x16FE0u, 0x16FE5u, 0x16FF0u, 0x16FF2u, 0x17000u, 0x187F8u, 0x18800u, 0x18CD6u, 0x18D00u, 0x18D09u, 0x1AFF0u,
0x1AFF4u, 0x1AFF5u, 0x1AFFCu, 0x1AFFDu, 0x1AFFFu, 0x1B000u, 0x1B123u, 0x1B132u, 0x1B133u, 0x1B150u, 0x1B153u,
0x1B155u, 0x1B156u, 0x1B164u, 0x1B168u, 0x1B170u, 0x1B2FCu, 0x1F004u, 0x1F005u, 0x1F0CFu, 0x1F0D0u, 0x1F18Eu,
0x1F18Fu, 0x1F191u, 0x1F19Bu, 0x1F200u, 0x1F203u, 0x1F210u, 0x1F23Cu, 0x1F240u, 0x1F249u, 0x1F250u, 0x1F252u,
0x1F260u, 0x1F266u, 0x1F300u, 0x1F650u, 0x1F680u, 0x1F6C6u, 0x1F6CCu, 0x1F6CDu, 0x1F6D0u, 0x1F6D3u, 0x1F6D5u,
0x1F6D8u, 0x1F6DCu, 0x1F6E0u, 0x1F6EBu, 0x1F6EDu, 0x1F6F4u, 0x1F6FDu, 0x1F7E0u, 0x1F7ECu, 0x1F7F0u, 0x1F7F1u,
0x1F900u, 0x1FA00u, 0x1FA70u, 0x1FA7Du, 0x1FA80u, 0x1FA89u, 0x1FA90u, 0x1FABEu, 0x1FABFu, 0x1FAC6u, 0x1FACEu,
0x1FADCu, 0x1FAE0u, 0x1FAE9u, 0x1FAF0u, 0x1FAF9u, 0x20000u, 0x2FFFEu, 0x30000u, 0x3FFFEu};
_NODISCARD constexpr int _Unicode_width_estimate(const char32_t _Ch) noexcept {
// Computes the width estimation for Unicode characters from N4950 [format.string.std]/13
// The two branches are functionally equivalent; `12` is chosen for performance here.

Просмотреть файл

@ -1,201 +0,0 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
# The following code generates data for _Width_estimate_intervals_v2 in <format>.
import re
from enum import Enum
from typing import TextIO
from pathlib import Path
# Width estimation.
class UnicodeWidth(Enum):
    """Estimated width (1 or 2) assigned to a Unicode code point."""

    # Members are intentionally left unannotated: an annotation on an enum member
    # is reported as an error by type checkers and adds nothing at runtime.
    IS_1 = 1
    IS_2 = 2
class UnicodeWidthTable:
    """
    Table of the estimated width of every Unicode code point.

    `self.table` is a list indexed by code point. Every entry starts out as
    `UnicodeWidth.IS_1` and is overwritten through `fill_range` (or directly).
    """

    # A valid Unicode code point won't exceed MAX_CODE_POINT.
    MAX_CODE_POINT: int = 0x10FFFF
    TABLE_SIZE: int = MAX_CODE_POINT + 1

    def __init__(self):
        self.table = [UnicodeWidth.IS_1] * (self.TABLE_SIZE)

    # "rng" denotes a right-closed range.
    def fill_range(self, rng: tuple, width: "UnicodeWidth"):
        """
        Set the width of every code point in the right-closed range `rng` to `width`.

        `rng` is a `(from_, to_)` pair with `0 <= from_ <= to_ <= MAX_CODE_POINT`;
        anything else trips an assertion.
        """
        from_, to_ = rng
        # A negative bound would be interpreted as an offset from the end of the list, and the
        # slice assignment below could then insert elements and silently change the table's length.
        assert 0 <= from_ <= to_, "invalid range"
        assert to_ <= self.MAX_CODE_POINT, "invalid range"
        self.table[from_ : to_ + 1] = [width] * (to_ - from_ + 1)

    def width_estimate_intervals(self):
        """
        Creates a string representation of the map (in `self.table`) from
        unicode code points to their width, using hexadecimal unsigned integer literals.
        Since there are long runs of code points of one width or the other,
        this representation is a list of code points where the width switches.
        Additionally, the width is assumed to start at `1` from the beginning of the list.
        For example, `[1, 1, 2, 2, 2, 1]` would be represented as `"0x2u, 0x5u"`.
        """
        values = []
        assert self.table[0] == UnicodeWidth.IS_1
        for u in range(1, self.TABLE_SIZE):
            assert self.table[u] in (UnicodeWidth.IS_1, UnicodeWidth.IS_2)
            # Record every code point whose width differs from its predecessor's.
            if self.table[u] != self.table[u - 1]:
                values.append(u)
        return ", ".join(f"0x{u:X}u" for u in values)

    # Print all ranges (right-closed), where self's width is 1 and other's width is 2.
    def print_ranges_1_vs_2(self, other):
        """Print each maximal run of code points that is width 1 in `self` but width 2 in `other`."""

        def _1_vs_2(u: int):
            return (
                self.table[u] == UnicodeWidth.IS_1
                and other.table[u] == UnicodeWidth.IS_2
            )

        u = 0
        while u < self.TABLE_SIZE:
            if _1_vs_2(u):
                from_ = u
                to_ = from_
                # Extend the run for as long as the next code point still qualifies.
                while to_ + 1 < self.TABLE_SIZE and _1_vs_2(to_ + 1):
                    to_ += 1
                if from_ == to_:
                    print(f"U+{from_:X}")
                else:
                    print(f"U+{from_:X}..U+{to_:X}")
                # Resume scanning right after the run that was just printed.
                u = to_
            u += 1
def get_table_cpp20() -> UnicodeWidthTable:
    """Build the "old" width table: width 2 for a fixed list of right-closed ranges, width 1 elsewhere."""
    wide_bounds = (
        (0x1100, 0x115F),
        (0x2329, 0x232A),
        (0x2E80, 0x303E),
        (0x3040, 0xA4CF),
        (0xAC00, 0xD7A3),
        (0xF900, 0xFAFF),
        (0xFE10, 0xFE19),
        (0xFE30, 0xFE6F),
        (0xFF00, 0xFF60),
        (0xFFE0, 0xFFE6),
        (0x1F300, 0x1F64F),
        (0x1F900, 0x1F9FF),
        (0x20000, 0x2FFFD),
        (0x30000, 0x3FFFD),
    )
    result = UnicodeWidthTable()
    for first, last in wide_bounds:
        result.fill_range((first, last), UnicodeWidth.IS_2)
    return result
def read_from(source: TextIO) -> UnicodeWidthTable:
    """
    Build a width table from the contents of "EastAsianWidth.txt".

    The latest version of that file can be found at:
    https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
    The current implementation works for:
    https://www.unicode.org/Public/15.1.0/ucd/EastAsianWidth.txt
    The file must not contain a BOM.

    Properties "F" and "W" map to width 2; "A", "H", "N" and "Na" map to width 1.
    """
    result = UnicodeWidthTable()

    # "The unassigned code points in the following blocks default to "W":"
    for block in (
        (0x4E00, 0x9FFF),
        (0x3400, 0x4DBF),
        (0xF900, 0xFAFF),
        (0x20000, 0x2FFFD),
        (0x30000, 0x3FFFD),
    ):
        result.fill_range(block, UnicodeWidth.IS_2)

    # Read explicitly assigned ranges.
    # Every line that is neither empty nor a pure comment has the shape
    # "HEX(..HEX)? ; (A|F|H|N|Na|W) #comment".
    entry_pattern = re.compile(r"([0-9A-Z]+)(\.\.[0-9A-Z]+)? *; *(A|F|H|N|Na|W) *#.*")

    def width_of(category: str):
        if category in ("F", "W"):
            return UnicodeWidth.IS_2
        assert category in ("A", "H", "N", "Na")
        return UnicodeWidth.IS_1

    for raw_line in source:
        entry = raw_line.strip()
        if not entry or entry.startswith("#"):
            continue
        parsed = entry_pattern.fullmatch(entry)
        assert parsed, entry  # invalid line
        first = int(parsed.group(1), base=16)
        width = width_of(parsed.group(3))
        upper_part = parsed.group(2)
        if upper_part:
            # range (HEX..HEX); drop the leading ".." before parsing the upper bound
            last = int(upper_part[2:], base=16)
            result.fill_range((first, last), width)
        else:
            # single character (HEX)
            result.table[first] = width
    return result
def get_table_cpp23(source: TextIO) -> UnicodeWidthTable:
    """Build the "new" width table: the data read from `source`, then three ranges forced to width 2."""
    result = read_from(source)
    # Override with ranges specified by N4958 [format.string.std]/13.
    for forced_wide in (
        (0x4DC0, 0x4DFF),
        (0x1F300, 0x1F5FF),
        (0x1F900, 0x1F9FF),
    ):
        result.fill_range(forced_wide, UnicodeWidth.IS_2)
    return result
# Text template for the emitted C++ definition. `main` strips the leading newline and fills in
# {filename}, {timestamp} and {values}; the doubled braces become literal braces after formatting.
WIDTH_ESTIMATE_INTERVALS_TEMPLATE = """
{filename}
{timestamp}
inline constexpr char32_t _Width_estimate_intervals_v2[] = {{ //
{values} }};
"""
def main():
    """Print the old table, the new table rendered as C++, and the code points whose width differs."""
    print("Old table:")
    cpp20_table = get_table_cpp20()
    print(cpp20_table.width_estimate_intervals())

    data_path = Path(__file__).absolute().with_name("EastAsianWidth.txt")
    with open(data_path, mode="rt", encoding="utf-8") as source:
        # The first two lines of the data file are turned into C++ comments
        # and used as the {filename} and {timestamp} fields of the template.
        header = [source.readline().replace("#", "//").rstrip() for _ in range(2)]
        cpp23_table = get_table_cpp23(source)
    filename, timestamp = header

    print("\nNew table:")
    rendered = WIDTH_ESTIMATE_INTERVALS_TEMPLATE.lstrip().format(
        filename=filename,
        timestamp=timestamp,
        values=cpp23_table.width_estimate_intervals(),
    )
    print(rendered)

    print("Was 1, now 2:")
    cpp20_table.print_ranges_1_vs_2(cpp23_table)
    print("\nWas 2, now 1:")
    cpp23_table.print_ranges_1_vs_2(cpp20_table)


if __name__ == "__main__":
    main()

Просмотреть файл

@ -77,10 +77,10 @@ inline constexpr _Unicode_property_data<_{prop_name}_property_values, {size}, {i
}};
"""
INTERVALS_TEMPLATE = """
WIDTH_ESTIMATE_INTERVALS_TEMPLATE = """
{filename}
{timestamp}
inline constexpr char32_t _{prop_name}_ranges[{size}] = {{
inline constexpr char32_t _Width_estimate_intervals_v2[] = {{
{data}
}};
"""
@ -148,7 +148,6 @@ MSVC_FORMAT_UCD_TABLES_HPP_TEMPLATE = """
#if _STL_COMPILER_PREPROCESSOR
#include <cstdint>
#include <limits>
#include <xutility>
#pragma pack(push, _CRT_PACKING)
@ -166,7 +165,7 @@ struct _Unicode_property_data {{
uint16_t _Props_and_size[_NumRanges];
_NODISCARD constexpr _ValueEnum _Get_property_for_codepoint(const uint32_t _Code_point) const noexcept {{
ptrdiff_t _Upper_idx = _STD upper_bound(_Lower_bounds, _STD end(_Lower_bounds), _Code_point) - _Lower_bounds;
constexpr auto _No_value_constant = static_cast<_ValueEnum>((numeric_limits<uint8_t>::max)());
constexpr auto _No_value_constant = static_cast<_ValueEnum>(UINT8_MAX);
if (_Upper_idx == 0) {{
return _No_value_constant;
}}
@ -274,10 +273,22 @@ def read_file(filename: str) -> list[PropertyRange]:
return filename, timestamp, ranges
def generate_width_estimate_intervals(filename: str, timestamp: str, width_2_ranges: list[PropertyRange]):
    """
    Render the `_Width_estimate_intervals_v2` definition from the width-2 ranges.

    Each range contributes two values, in order: its `lower` bound and `upper + 1`.
    The values are written as lowercase hexadecimal literals separated by commas.
    """
    boundaries = [
        bound
        for wide in width_2_ranges
        for bound in (wide.lower, wide.upper + 1)
    ]
    hex_literals = [f"0x{bound:x}" for bound in boundaries]
    return WIDTH_ESTIMATE_INTERVALS_TEMPLATE.lstrip().format(
        filename=filename, timestamp=timestamp, data=",".join(hex_literals))
def generate_data_tables() -> str:
"""
Generate Unicode data for inclusion into <format> from
GraphemeBreakProperty.txt, emoji-data.txt, DerivedGeneralCategory.txt, and DerivedCoreProperties.txt
GraphemeBreakProperty.txt, emoji-data.txt, DerivedGeneralCategory.txt, DerivedCoreProperties.txt,
and EastAsianWidth.txt.
GraphemeBreakProperty.txt can be found at
https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
@ -291,12 +302,16 @@ def generate_data_tables() -> str:
DerivedCoreProperties.txt can be found at
https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
EastAsianWidth.txt can be found at
https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
All files are expected to be in the same directory as this script.
"""
gbp_filename, gbp_timestamp, gbp_ranges = read_file("GraphemeBreakProperty.txt")
emoji_filename, emoji_timestamp, emoji_ranges = read_file("emoji-data.txt")
cat_filename, cat_timestamp, cat_ranges = read_file("DerivedGeneralCategory.txt")
derived_filename, derived_timestamp, derived_ranges = read_file("DerivedCoreProperties.txt")
eaw_filename, eaw_timestamp, eaw_ranges = read_file("EastAsianWidth.txt")
printable_ranges = compact_property_ranges(sorted([
PropertyRange(x.lower, x.upper, "Yes")
@ -304,6 +319,31 @@ def generate_data_tables() -> str:
if x.prop not in ('Cc', 'Cf', 'Cs', 'Co', 'Cn', 'Zl', 'Zp', 'Zs') or chr(x.lower) == ' '
], key=lambda x: x.lower))
# N4971 [format.string.std]/13
# `range` excludes its stop value, so each stop is written as "last code point + 1"
# to keep the final code point of every range (e.g. U+4DFF) inside it.
std_wide_ranges = [
    range(0x4DC0, 0x4DFF + 1),
    range(0x1F300, 0x1F5FF + 1),
    range(0x1F900, 0x1F9FF + 1),
]

def has_width_2(prop_range):
    """
    Return True if `prop_range` should get width 2.

    That is the case when its property is "F" or "W", or when it lies inside one of
    `std_wide_ranges`. A range that only partially overlaps one of `std_wide_ranges`
    cannot be classified as a whole and trips an assertion instead.
    """
    if prop_range.prop in ("F", "W"):
        return True

    for std_wide_range in std_wide_ranges:
        if prop_range.lower in std_wide_range:
            # Starts inside the forced range, so it must also end inside it.
            assert prop_range.upper < std_wide_range.stop
            return True
        else:
            # Starts outside the forced range, so it must not reach into it or span across it.
            assert prop_range.upper < std_wide_range.start or prop_range.lower >= std_wide_range.stop

    return False
width_2_ranges = compact_property_ranges(sorted([
PropertyRange(x.lower, x.upper, "Yes") for x in eaw_ranges if has_width_2(x)
], key=lambda x: x.lower))
gpb_cpp_data = generate_cpp_data(gbp_filename, gbp_timestamp, "Grapheme_Break", gbp_ranges)
emoji_cpp_data = generate_cpp_data(emoji_filename, emoji_timestamp, "Extended_Pictographic", [
x for x in emoji_ranges if x.prop == "Extended_Pictographic"])
@ -311,8 +351,10 @@ def generate_data_tables() -> str:
printable_cpp_data = generate_cpp_data(cat_filename, cat_timestamp, "_printable", printable_ranges)
grapheme_extend_cpp_data = generate_cpp_data(derived_filename, derived_timestamp, "Grapheme_Extend", [
x for x in derived_ranges if x.prop == "Grapheme_Extend"])
width_estimate_intervals = generate_width_estimate_intervals(eaw_filename, eaw_timestamp, width_2_ranges)
return "\n".join([gpb_cpp_data, emoji_cpp_data, printable_cpp_data, grapheme_extend_cpp_data])
return "\n".join(
[gpb_cpp_data, emoji_cpp_data, printable_cpp_data, grapheme_extend_cpp_data, width_estimate_intervals])
if __name__ == "__main__":