Move `_Width_estimate_intervals_v2` to `__msvc_format_ucd_tables.hpp` (#4446)

Co-authored-by: Stephan T. Lavavej <stl@nuwen.net>
2024-03-08 10:52:50 +08:00 · 2024-03-08 10:52:50 +08:00 · 4378648c98
--- a/stl/inc/__msvc_format_ucd_tables.hpp
+++ b/stl/inc/__msvc_format_ucd_tables.hpp
@ -521,6 +521,25 @@ inline constexpr _Unicode_property_data<_Grapheme_Extend_property_values, 363, t
        0x1, 0x4, 0x1, 0x2, 0x2e, 0x17, 0x1, 0x3, 0x5, 0x8, 0x7, 0x4, 0x3, 0x37, 0x32, 0x1, 0x1, 0x5, 0xf, 0x7, 0x11,
        0x7, 0x2, 0x5, 0x1, 0x7, 0x1, 0x4, 0x4, 0x7, 0x7, 0x60, 0xf0}};

+// EastAsianWidth-15.0.0.txt
+// Date: 2022-05-24, 17:40:20 GMT [KW, LI]
+inline constexpr char32_t _Width_estimate_intervals_v2[] = {0x1100, 0x1160, 0x231a, 0x231c, 0x2329, 0x232b, 0x23e9,
+    0x23ed, 0x23f0, 0x23f1, 0x23f3, 0x23f4, 0x25fd, 0x25ff, 0x2614, 0x2616, 0x2648, 0x2654, 0x267f, 0x2680, 0x2693,
+    0x2694, 0x26a1, 0x26a2, 0x26aa, 0x26ac, 0x26bd, 0x26bf, 0x26c4, 0x26c6, 0x26ce, 0x26cf, 0x26d4, 0x26d5, 0x26ea,
+    0x26eb, 0x26f2, 0x26f4, 0x26f5, 0x26f6, 0x26fa, 0x26fb, 0x26fd, 0x26fe, 0x2705, 0x2706, 0x270a, 0x270c, 0x2728,
+    0x2729, 0x274c, 0x274d, 0x274e, 0x274f, 0x2753, 0x2756, 0x2757, 0x2758, 0x2795, 0x2798, 0x27b0, 0x27b1, 0x27bf,
+    0x27c0, 0x2b1b, 0x2b1d, 0x2b50, 0x2b51, 0x2b55, 0x2b56, 0x2e80, 0x2e9a, 0x2e9b, 0x2ef4, 0x2f00, 0x2fd6, 0x2ff0,
+    0x2ffc, 0x3000, 0x303f, 0x3041, 0x3097, 0x3099, 0x3100, 0x3105, 0x3130, 0x3131, 0x318f, 0x3190, 0x31e4, 0x31f0,
+    0x321f, 0x3220, 0x3248, 0x3250, 0xa48d, 0xa490, 0xa4c7, 0xa960, 0xa97d, 0xac00, 0xd7a4, 0xf900, 0xfb00, 0xfe10,
+    0xfe1a, 0xfe30, 0xfe53, 0xfe54, 0xfe67, 0xfe68, 0xfe6c, 0xff01, 0xff61, 0xffe0, 0xffe7, 0x16fe0, 0x16fe5, 0x16ff0,
+    0x16ff2, 0x17000, 0x187f8, 0x18800, 0x18cd6, 0x18d00, 0x18d09, 0x1aff0, 0x1aff4, 0x1aff5, 0x1affc, 0x1affd, 0x1afff,
+    0x1b000, 0x1b123, 0x1b132, 0x1b133, 0x1b150, 0x1b153, 0x1b155, 0x1b156, 0x1b164, 0x1b168, 0x1b170, 0x1b2fc, 0x1f004,
+    0x1f005, 0x1f0cf, 0x1f0d0, 0x1f18e, 0x1f18f, 0x1f191, 0x1f19b, 0x1f200, 0x1f203, 0x1f210, 0x1f23c, 0x1f240, 0x1f249,
+    0x1f250, 0x1f252, 0x1f260, 0x1f266, 0x1f300, 0x1f650, 0x1f680, 0x1f6c6, 0x1f6cc, 0x1f6cd, 0x1f6d0, 0x1f6d3, 0x1f6d5,
+    0x1f6d8, 0x1f6dc, 0x1f6e0, 0x1f6eb, 0x1f6ed, 0x1f6f4, 0x1f6fd, 0x1f7e0, 0x1f7ec, 0x1f7f0, 0x1f7f1, 0x1f900, 0x1fa00,
+    0x1fa70, 0x1fa7d, 0x1fa80, 0x1fa89, 0x1fa90, 0x1fabe, 0x1fabf, 0x1fac6, 0x1face, 0x1fadc, 0x1fae0, 0x1fae9, 0x1faf0,
+    0x1faf9, 0x20000, 0x2fffe, 0x30000, 0x3fffe};
+
 _STD_END

 #pragma pop_macro("new")
--- a/stl/inc/format
+++ b/stl/inc/format
@ -1018,30 +1018,6 @@ _NODISCARD constexpr bool _Is_execution_charset_self_synchronizing() {
 #endif // ^^^ EDG workaround ^^^
 }

-// Generated per N4950 [format.string.std]/13, by tools/unicode_properties_parse/format_width_estimate_intervals.py
-// in the https://github.com/microsoft/stl repository.
-
-// EastAsianWidth-15.0.0.txt
-// Date: 2022-05-24, 17:40:20 GMT [KW, LI]
-inline constexpr char32_t _Width_estimate_intervals_v2[] = { //
-    0x1100u, 0x1160u, 0x231Au, 0x231Cu, 0x2329u, 0x232Bu, 0x23E9u, 0x23EDu, 0x23F0u, 0x23F1u, 0x23F3u, 0x23F4u, 0x25FDu,
-    0x25FFu, 0x2614u, 0x2616u, 0x2648u, 0x2654u, 0x267Fu, 0x2680u, 0x2693u, 0x2694u, 0x26A1u, 0x26A2u, 0x26AAu, 0x26ACu,
-    0x26BDu, 0x26BFu, 0x26C4u, 0x26C6u, 0x26CEu, 0x26CFu, 0x26D4u, 0x26D5u, 0x26EAu, 0x26EBu, 0x26F2u, 0x26F4u, 0x26F5u,
-    0x26F6u, 0x26FAu, 0x26FBu, 0x26FDu, 0x26FEu, 0x2705u, 0x2706u, 0x270Au, 0x270Cu, 0x2728u, 0x2729u, 0x274Cu, 0x274Du,
-    0x274Eu, 0x274Fu, 0x2753u, 0x2756u, 0x2757u, 0x2758u, 0x2795u, 0x2798u, 0x27B0u, 0x27B1u, 0x27BFu, 0x27C0u, 0x2B1Bu,
-    0x2B1Du, 0x2B50u, 0x2B51u, 0x2B55u, 0x2B56u, 0x2E80u, 0x2E9Au, 0x2E9Bu, 0x2EF4u, 0x2F00u, 0x2FD6u, 0x2FF0u, 0x2FFCu,
-    0x3000u, 0x303Fu, 0x3041u, 0x3097u, 0x3099u, 0x3100u, 0x3105u, 0x3130u, 0x3131u, 0x318Fu, 0x3190u, 0x31E4u, 0x31F0u,
-    0x321Fu, 0x3220u, 0x3248u, 0x3250u, 0xA48Du, 0xA490u, 0xA4C7u, 0xA960u, 0xA97Du, 0xAC00u, 0xD7A4u, 0xF900u, 0xFB00u,
-    0xFE10u, 0xFE1Au, 0xFE30u, 0xFE53u, 0xFE54u, 0xFE67u, 0xFE68u, 0xFE6Cu, 0xFF01u, 0xFF61u, 0xFFE0u, 0xFFE7u,
-    0x16FE0u, 0x16FE5u, 0x16FF0u, 0x16FF2u, 0x17000u, 0x187F8u, 0x18800u, 0x18CD6u, 0x18D00u, 0x18D09u, 0x1AFF0u,
-    0x1AFF4u, 0x1AFF5u, 0x1AFFCu, 0x1AFFDu, 0x1AFFFu, 0x1B000u, 0x1B123u, 0x1B132u, 0x1B133u, 0x1B150u, 0x1B153u,
-    0x1B155u, 0x1B156u, 0x1B164u, 0x1B168u, 0x1B170u, 0x1B2FCu, 0x1F004u, 0x1F005u, 0x1F0CFu, 0x1F0D0u, 0x1F18Eu,
-    0x1F18Fu, 0x1F191u, 0x1F19Bu, 0x1F200u, 0x1F203u, 0x1F210u, 0x1F23Cu, 0x1F240u, 0x1F249u, 0x1F250u, 0x1F252u,
-    0x1F260u, 0x1F266u, 0x1F300u, 0x1F650u, 0x1F680u, 0x1F6C6u, 0x1F6CCu, 0x1F6CDu, 0x1F6D0u, 0x1F6D3u, 0x1F6D5u,
-    0x1F6D8u, 0x1F6DCu, 0x1F6E0u, 0x1F6EBu, 0x1F6EDu, 0x1F6F4u, 0x1F6FDu, 0x1F7E0u, 0x1F7ECu, 0x1F7F0u, 0x1F7F1u,
-    0x1F900u, 0x1FA00u, 0x1FA70u, 0x1FA7Du, 0x1FA80u, 0x1FA89u, 0x1FA90u, 0x1FABEu, 0x1FABFu, 0x1FAC6u, 0x1FACEu,
-    0x1FADCu, 0x1FAE0u, 0x1FAE9u, 0x1FAF0u, 0x1FAF9u, 0x20000u, 0x2FFFEu, 0x30000u, 0x3FFFEu};
-
 _NODISCARD constexpr int _Unicode_width_estimate(const char32_t _Ch) noexcept {
    // Computes the width estimation for Unicode characters from N4950 [format.string.std]/13
    // The two branches are functionally equivalent; `12` is chosen for performance here.
--- a/tools/unicode_properties_parse/format_width_estimate_intervals.py
+++ b/tools/unicode_properties_parse/format_width_estimate_intervals.py
@ -1,201 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-# The following code generates data for _Width_estimate_intervals_v2 in <format>.
-
-import re
-from enum import Enum
-from typing import TextIO
-from pathlib import Path
-
-
-# Width estimation.
-class UnicodeWidth(Enum):
-    IS_1: int = 1
-    IS_2: int = 2
-
-
-class UnicodeWidthTable:
-    # A valid Unicode code point won't exceed MAX_CODE_POINT.
-    MAX_CODE_POINT: int = 0x10FFFF
-    TABLE_SIZE: int = MAX_CODE_POINT + 1
-
-    def __init__(self):
-        self.table = [UnicodeWidth.IS_1] * (self.TABLE_SIZE)
-
-    # "rng" denotes a right-closed range.
-    def fill_range(self, rng: tuple, width: int):
-        from_, to_ = rng
-        assert from_ <= to_, "invalid range"
-        assert to_ <= self.MAX_CODE_POINT, "invalid range"
-        self.table[from_ : to_ + 1] = [width] * (to_ - from_ + 1)
-
-    def width_estimate_intervals(self):
-        """
-        Creates a string representation of the map (in `self.table`) from
-        unicode code points to their width, using hexadecimal unsigned integer literals.
-        Since there are long runs of code points of one width or the other,
-        this representation is a list of code points where the width switches.
-        Additionally, the width is assumed to start at `1` from the beginning of the list.
-        For example, `[1, 1, 2, 2, 2, 1]` would be represented as `"0x2u, 0x5u"`.
-        """
-        values = []
-        assert self.table[0] == UnicodeWidth.IS_1
-        for u in range(1, self.TABLE_SIZE):
-            assert (
-                self.table[u] == UnicodeWidth.IS_1 or self.table[u] == UnicodeWidth.IS_2
-            )
-            if self.table[u] != self.table[u - 1]:
-                values.append(u)
-
-        return ", ".join([f"0x{u:X}u" for u in values])
-
-    # Print all ranges (right-closed), where self's width is 1 and other's width is 2.
-    def print_ranges_1_vs_2(self, other):
-        def _1_vs_2(u: int):
-            return (
-                self.table[u] == UnicodeWidth.IS_1
-                and other.table[u] == UnicodeWidth.IS_2
-            )
-
-        u = 0
-        while u < self.TABLE_SIZE:
-            if _1_vs_2(u):
-                from_ = u
-                to_ = from_
-                while to_ + 1 < self.TABLE_SIZE and _1_vs_2(to_ + 1):
-                    to_ += 1
-                if from_ == to_:
-                    print(f"U+{from_:X}")
-                else:
-                    print(f"U+{from_:X}..U+{to_:X}")
-                u = to_
-            u += 1
-
-
-def get_table_cpp20() -> UnicodeWidthTable:
-    std_wide_ranges_cpp20 = [
-        (0x1100, 0x115F),
-        (0x2329, 0x232A),
-        (0x2E80, 0x303E),
-        (0x3040, 0xA4CF),
-        (0xAC00, 0xD7A3),
-        (0xF900, 0xFAFF),
-        (0xFE10, 0xFE19),
-        (0xFE30, 0xFE6F),
-        (0xFF00, 0xFF60),
-        (0xFFE0, 0xFFE6),
-        (0x1F300, 0x1F64F),
-        (0x1F900, 0x1F9FF),
-        (0x20000, 0x2FFFD),
-        (0x30000, 0x3FFFD),
-    ]
-
-    table = UnicodeWidthTable()
-    for rng in std_wide_ranges_cpp20:
-        table.fill_range(rng, UnicodeWidth.IS_2)
-
-    return table
-
-
-def read_from(source: TextIO) -> UnicodeWidthTable:
-    """
-    Read data from "EastAsianWidth.txt".
-    The latest version can be found at:
-    https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
-    The current implementation works for:
-    https://www.unicode.org/Public/15.1.0/ucd/EastAsianWidth.txt
-    To make this function work, the file should not contain a BOM.
-    """
-    table = UnicodeWidthTable()
-
-    # "The unassigned code points in the following blocks default to "W":"
-    default_wide_ranges = [
-        (0x4E00, 0x9FFF),
-        (0x3400, 0x4DBF),
-        (0xF900, 0xFAFF),
-        (0x20000, 0x2FFFD),
-        (0x30000, 0x3FFFD),
-    ]
-    for rng in default_wide_ranges:
-        table.fill_range(rng, UnicodeWidth.IS_2)
-
-    # Read explicitly assigned ranges.
-    # The lines that are not empty or pure comment are uniformly of the format "HEX(..HEX)? ; (A|F|H|N|Na|W) #comment".
-    LINE_REGEX = re.compile(r"([0-9A-Z]+)(\.\.[0-9A-Z]+)? *; *(A|F|H|N|Na|W) *#.*")
-
-    def get_width(str: str):
-        if str == "F" or str == "W":
-            return UnicodeWidth.IS_2
-        else:
-            assert str == "A" or str == "H" or str == "N" or str == "Na"
-            return UnicodeWidth.IS_1
-
-    for line in source:
-        line = line.strip()
-        if line and not line.startswith("#"):
-            match = LINE_REGEX.fullmatch(line)
-            assert match, line  # invalid line
-            from_val = int(match.group(1), base=16)
-            width = get_width(match.group(3))
-            if match.group(2):
-                # range (HEX..HEX)
-                to_val = int(match.group(2)[2:], base=16)
-                table.fill_range((from_val, to_val), width)
-            else:
-                # single character (HEX)
-                table.table[from_val] = width
-
-    return table
-
-
-def get_table_cpp23(source: TextIO) -> UnicodeWidthTable:
-    table = read_from(source)
-
-    # Override with ranges specified by N4958 [format.string.std]/13.
-    std_wide_ranges_cpp23 = [
-        (0x4DC0, 0x4DFF),
-        (0x1F300, 0x1F5FF),
-        (0x1F900, 0x1F9FF),
-    ]
-
-    for rng in std_wide_ranges_cpp23:
-        table.fill_range(rng, UnicodeWidth.IS_2)
-
-    return table
-
-
-WIDTH_ESTIMATE_INTERVALS_TEMPLATE = """
-{filename}
-{timestamp}
-inline constexpr char32_t _Width_estimate_intervals_v2[] = {{ //
-{values} }};
-"""
-
-
-def main():
-    print("Old table:")
-    old_table = get_table_cpp20()
-    print(old_table.width_estimate_intervals())
-
-    path = Path(__file__).absolute().with_name("EastAsianWidth.txt")
-    with open(path, mode="rt", encoding="utf-8") as source:
-        filename = source.readline().replace("#", "//").rstrip()
-        timestamp = source.readline().replace("#", "//").rstrip()
-        new_table = get_table_cpp23(source)
-    print("\nNew table:")
-    print(
-        WIDTH_ESTIMATE_INTERVALS_TEMPLATE.lstrip().format(
-            filename=filename,
-            timestamp=timestamp,
-            values=new_table.width_estimate_intervals(),
-        )
-    )
-    print("Was 1, now 2:")
-    old_table.print_ranges_1_vs_2(new_table)
-    print("\nWas 2, now 1:")
-    new_table.print_ranges_1_vs_2(old_table)
-
-
-if __name__ == "__main__":
-    main()
--- a/tools/unicode_properties_parse/unicode_properties_data_gen.py
+++ b/tools/unicode_properties_parse/unicode_properties_data_gen.py
@ -77,10 +77,10 @@ inline constexpr _Unicode_property_data<_{prop_name}_property_values, {size}, {i
 }};
 """

-INTERVALS_TEMPLATE = """
+WIDTH_ESTIMATE_INTERVALS_TEMPLATE = """
 {filename}
 {timestamp}
-inline constexpr char32_t _{prop_name}_ranges[{size}] = {{
+inline constexpr char32_t _Width_estimate_intervals_v2[] = {{
    {data}
 }};
 """
@ -148,7 +148,6 @@ MSVC_FORMAT_UCD_TABLES_HPP_TEMPLATE = """
 #if _STL_COMPILER_PREPROCESSOR

 #include <cstdint>
-#include <limits>
 #include <xutility>

 #pragma pack(push, _CRT_PACKING)
@ -166,7 +165,7 @@ struct _Unicode_property_data {{
    uint16_t _Props_and_size[_NumRanges];
    _NODISCARD constexpr _ValueEnum _Get_property_for_codepoint(const uint32_t _Code_point) const noexcept {{
        ptrdiff_t _Upper_idx = _STD upper_bound(_Lower_bounds, _STD end(_Lower_bounds), _Code_point) - _Lower_bounds;
-        constexpr auto _No_value_constant = static_cast<_ValueEnum>((numeric_limits<uint8_t>::max)());
+        constexpr auto _No_value_constant = static_cast<_ValueEnum>(UINT8_MAX);
        if (_Upper_idx == 0) {{
            return _No_value_constant;
        }}
@ -274,10 +273,22 @@ def read_file(filename: str) -> list[PropertyRange]:
        return filename, timestamp, ranges


+def generate_width_estimate_intervals(filename: str, timestamp: str, width_2_ranges: list[PropertyRange]):
+    """Format the width-2 ranges into WIDTH_ESTIMATE_INTERVALS_TEMPLATE as a flat list of hex literals.
+
+    Each range contributes two consecutive entries: its `lower` bound followed by `upper + 1`.
+    """
+    boundaries = [bound for rng in width_2_ranges for bound in (rng.lower, rng.upper + 1)]
+    hex_literals = ",".join(f"0x{bound:x}" for bound in boundaries)
+
+    return WIDTH_ESTIMATE_INTERVALS_TEMPLATE.lstrip().format(filename=filename, timestamp=timestamp, data=hex_literals)
+
+
 def generate_data_tables() -> str:
    """
    Generate Unicode data for inclusion into <format> from
-    GraphemeBreakProperty.txt, emoji-data.txt, DerivedGeneralCategory.txt, and DerivedCoreProperties.txt
+    GraphemeBreakProperty.txt, emoji-data.txt, DerivedGeneralCategory.txt, DerivedCoreProperties.txt,
+    and EastAsianWidth.txt.

    GraphemeBreakProperty.txt can be found at
    https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
@ -291,12 +302,16 @@ def generate_data_tables() -> str:
    DerivedCoreProperties.txt can be found at
    https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt

+    EastAsianWidth.txt can be found at
+    https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
+
    All files are expected to be in the same directory as this script.
    """
    gbp_filename, gbp_timestamp, gbp_ranges = read_file("GraphemeBreakProperty.txt")
    emoji_filename, emoji_timestamp, emoji_ranges = read_file("emoji-data.txt")
    cat_filename, cat_timestamp, cat_ranges = read_file("DerivedGeneralCategory.txt")
    derived_filename, derived_timestamp, derived_ranges = read_file("DerivedCoreProperties.txt")
+    eaw_filename, eaw_timestamp, eaw_ranges = read_file("EastAsianWidth.txt")

    printable_ranges = compact_property_ranges(sorted([
        PropertyRange(x.lower, x.upper, "Yes")
@ -304,6 +319,31 @@ def generate_data_tables() -> str:
        if x.prop not in ('Cc', 'Cf', 'Cs', 'Co', 'Cn', 'Zl', 'Zp', 'Zs') or chr(x.lower) == ' '
    ], key=lambda x: x.lower))

+    # N4971 [format.string.std]/13 gives inclusive ranges; range() excludes its stop, hence the `+ 1`.
+    std_wide_ranges = [
+        range(0x4DC0, 0x4DFF + 1),
+        range(0x1F300, 0x1F5FF + 1),
+        range(0x1F900, 0x1F9FF + 1),
+    ]
+
+    def has_width_2(prop_range):
+        # Width 2 applies to East Asian Width "F"/"W" and to anything inside the standard-mandated ranges above.
+        if prop_range.prop in ("F", "W"):
+            return True
+
+        for std_wide_range in std_wide_ranges:
+            # A property range must lie entirely inside or entirely outside each standard-mandated range.
+            if prop_range.lower in std_wide_range:
+                assert prop_range.upper in std_wide_range
+                return True
+            else:
+                assert prop_range.upper not in std_wide_range
+
+        return False
+
+    width_2_ranges = compact_property_ranges(sorted([
+        PropertyRange(x.lower, x.upper, "Yes") for x in eaw_ranges if has_width_2(x)
+    ], key=lambda x: x.lower))
+
    gpb_cpp_data = generate_cpp_data(gbp_filename, gbp_timestamp, "Grapheme_Break", gbp_ranges)
    emoji_cpp_data = generate_cpp_data(emoji_filename, emoji_timestamp, "Extended_Pictographic", [
        x for x in emoji_ranges if x.prop == "Extended_Pictographic"])
@ -311,8 +351,10 @@ def generate_data_tables() -> str:
    printable_cpp_data = generate_cpp_data(cat_filename, cat_timestamp, "_printable", printable_ranges)
    grapheme_extend_cpp_data = generate_cpp_data(derived_filename, derived_timestamp, "Grapheme_Extend", [
        x for x in derived_ranges if x.prop == "Grapheme_Extend"])
+    width_estimate_intervals = generate_width_estimate_intervals(eaw_filename, eaw_timestamp, width_2_ranges)

-    return "\n".join([gpb_cpp_data, emoji_cpp_data, printable_cpp_data, grapheme_extend_cpp_data])
+    return "\n".join(
+        [gpb_cpp_data, emoji_cpp_data, printable_cpp_data, grapheme_extend_cpp_data, width_estimate_intervals])


 if __name__ == "__main__":