P2675R1 `<format>` generator converted from C++ to Python (#3994)

Co-authored-by: achabense <60953653+achabense@users.noreply.github.com> Co-authored-by: Stephan T. Lavavej <stl@nuwen.net> Co-authored-by: nicole mazzuca <83086508+strega-nil-ms@users.noreply.github.com>
2023-10-07 01:45:31 +07:00 · 2023-10-07 01:45:31 +07:00 · d61f7e037b
--- a/stl/inc/format
+++ b/stl/inc/format
@ -987,8 +987,11 @@ _NODISCARD constexpr bool _Is_execution_charset_self_synchronizing() {
 #endif // ^^^ EDG workaround ^^^
 }

-// Generated per N4950 [format.string.std]/13, by tools/unicode_properties_parse/format_width_estimate_intervals.cpp
+// Generated per N4950 [format.string.std]/13, by tools/unicode_properties_parse/format_width_estimate_intervals.py
 // in the https://github.com/microsoft/stl repository.
+
+// EastAsianWidth-15.0.0.txt
+// Date: 2022-05-24, 17:40:20 GMT [KW, LI]
 inline constexpr char32_t _Width_estimate_intervals_v2[] = { //
    0x1100u, 0x1160u, 0x231Au, 0x231Cu, 0x2329u, 0x232Bu, 0x23E9u, 0x23EDu, 0x23F0u, 0x23F1u, 0x23F3u, 0x23F4u, 0x25FDu,
    0x25FFu, 0x2614u, 0x2616u, 0x2648u, 0x2654u, 0x267Fu, 0x2680u, 0x2693u, 0x2694u, 0x26A1u, 0x26A2u, 0x26AAu, 0x26ACu,
--- a/tools/unicode_properties_parse/format_width_estimate_intervals.cpp
+++ b/tools/unicode_properties_parse/format_width_estimate_intervals.cpp
@ -1,209 +0,0 @@
-// Copyright (c) Microsoft Corporation.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-// The following code generates data for `_Width_estimate_intervals_v2` in <format>.
-
-#include <charconv>
-#include <cstdint>
-#include <cstdlib>
-#include <fstream>
-#include <iostream>
-#include <regex>
-#include <source_location>
-#include <string>
-#include <system_error>
-#include <vector>
-using namespace std;
-
-void verify(bool test, const char* msg, source_location loc = source_location::current()) {
-    if (!test) {
-        cerr << "Error at line " << loc.line() << ": " << msg << endl;
-        exit(EXIT_FAILURE);
-    }
-}
-constexpr const char* impl_assertion_failed = "impl assertion failed";
-
-struct range_u {
-    uint32_t from;
-    uint32_t to;
-    constexpr range_u(uint32_t f, uint32_t t) : from(f), to(t) {}
-    constexpr explicit range_u(uint32_t v) : from(v), to(v) {}
-};
-
-enum class width_u : bool { is_1 = false, is_2 = true };
-
-class table_u {
-private:
-    // A valid Unicode code point won't exceed `max_u`.
-    static constexpr uint32_t max_u = 0x10'ffff;
-    vector<width_u> table;
-
-public:
-    table_u() : table(max_u + 1, width_u::is_1) {}
-
-    void fill_range(const range_u rng, const width_u width) {
-        const auto [from, to] = rng;
-        verify(from <= to, impl_assertion_failed);
-        verify(to <= max_u, impl_assertion_failed);
-        for (uint32_t u = from; u <= to; ++u) {
-            table[u] = width;
-        }
-    }
-
-    void print_intervals() const {
-        // Print table for `_Width_estimate_intervals_v2`.
-        int c        = 0;
-        width_u last = table[0];
-        for (uint32_t u = 0; u <= max_u; ++u) {
-            if (table[u] != last) {
-                cout << "0x" << hex << uppercase << u << "u, ";
-                if (++c == 12) {
-                    c = 0;
-                    cout << endl;
-                }
-            }
-            last = table[u];
-        }
-        cout << endl;
-    }
-
-    void print_clusters_1_vs_2(const table_u& other) const {
-        vector<bool> cluster_table(max_u + 1, false);
-        for (uint32_t u = 0; u <= max_u; ++u) {
-            if (table[u] == width_u::is_1 && other.table[u] == width_u::is_2) {
-                cluster_table[u] = true;
-            }
-        }
-
-        for (uint32_t u = 0; u <= max_u; ++u) {
-            if (cluster_table[u]) {
-                const uint32_t from = u;
-                uint32_t to         = from;
-                while (to + 1 <= max_u && cluster_table[to + 1]) {
-                    ++to;
-                }
-                if (from == to) {
-                    cout << hex << uppercase << "U+" << from << endl;
-                } else {
-                    cout << hex << uppercase << "U+" << from << "..U+" << to << endl;
-                }
-                u = to;
-            }
-        }
-    }
-};
-
-table_u get_table_cpp20() {
-    static constexpr range_u std_wide_ranges_cpp20[]{
-        {0x1100, 0x115F},
-        {0x2329, 0x232A},
-        {0x2E80, 0x303E},
-        {0x3040, 0xA4CF},
-        {0xAC00, 0xD7A3},
-        {0xF900, 0xFAFF},
-        {0xFE10, 0xFE19},
-        {0xFE30, 0xFE6F},
-        {0xFF00, 0xFF60},
-        {0xFFE0, 0xFFE6},
-        {0x1F300, 0x1F64F},
-        {0x1F900, 0x1F9FF},
-        {0x20000, 0x2FFFD},
-        {0x30000, 0x3FFFD},
-    };
-
-    table_u table;
-    for (const range_u& rng : std_wide_ranges_cpp20) {
-        table.fill_range(rng, width_u::is_2);
-    }
-    return table;
-}
-
-// Read data from "EastAsianWidth.txt".
-// The latest version can be found at:
-// https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
-// The current implementation works for:
-// https://www.unicode.org/Public/15.0.0/ucd/EastAsianWidth.txt
-// To make this function work, the file should not contain a BOM.
-table_u read_from(ifstream& source) {
-    table_u table;
-
-    // "The unassigned code points in the following blocks default to "W":"
-    static constexpr range_u default_wide_ranges[]{
-        {0x4E00, 0x9FFF}, {0x3400, 0x4DBF}, {0xF900, 0xFAFF}, {0x20000, 0x2FFFD}, {0x30000, 0x3FFFD}};
-    for (const range_u& rng : default_wide_ranges) {
-        table.fill_range(rng, width_u::is_2);
-    }
-
-    // Read explicitly assigned ranges.
-    // The lines that are not empty or pure comment are uniformly of the format "HEX(..HEX)?;(A|F|H|N|Na|W) #comment".
-    auto get_width = [](const string& str) {
-        if (str == "F" || str == "W") {
-            return width_u::is_2;
-        } else {
-            verify(str == "A" || str == "H" || str == "N" || str == "Na", impl_assertion_failed);
-            return width_u::is_1;
-        }
-    };
-    auto get_value = [](const string& str) {
-        uint32_t value{};
-        const auto [end_ptr, ec] = from_chars(str.data(), str.data() + str.size(), value, 16);
-        verify(end_ptr == str.data() + str.size(), impl_assertion_failed);
-        verify(ec == errc{}, impl_assertion_failed);
-        return value;
-    };
-
-    verify(!!source, "invalid path");
-    string line;
-    const regex reg(R"(([0-9A-Z]+)(\.\.[0-9A-Z]+)?;(A|F|H|N|Na|W) *#.*)");
-    while (getline(source, line)) {
-        if (!line.empty() && !line.starts_with("#")) {
-            smatch match;
-            verify(regex_match(line, match, reg), "invalid line");
-            verify(match[1].matched, impl_assertion_failed);
-            verify(match[3].matched, impl_assertion_failed);
-            const width_u width = get_width(match[3].str());
-            const uint32_t from = get_value(match[1].str());
-            if (match[2].matched) {
-                // range (HEX..HEX)
-                const string match2 = match[2].str();
-                verify(match2.starts_with(".."), impl_assertion_failed);
-                table.fill_range({from, get_value(match2.substr(2))}, width);
-            } else {
-                // single character (HEX)
-                table.fill_range(range_u{from}, width);
-            }
-        }
-    }
-
-    return table;
-}
-
-table_u get_table_cpp23(ifstream& source) {
-    table_u table = read_from(source);
-
-    // Override with ranges specified by the C++ standard.
-    static constexpr range_u std_wide_ranges_cpp23[]{{0x4DC0, 0x4DFF}, {0x1F300, 0x1F5FF}, {0x1F900, 0x1F9FF}};
-    for (const range_u& rng : std_wide_ranges_cpp23) {
-        table.fill_range(rng, width_u::is_2);
-    }
-
-    return table;
-}
-
-int main() {
-    cout << "Old table:\n";
-    const table_u old_table = get_table_cpp20();
-    old_table.print_intervals();
-
-    cout << "\nNew table:\nInput path for EastAsianWidth.txt: ";
-    string path;
-    getline(cin, path);
-    ifstream source(path);
-    const table_u new_table = get_table_cpp23(source);
-    new_table.print_intervals();
-
-    cout << "\nWas 1, now 2:\n";
-    old_table.print_clusters_1_vs_2(new_table);
-    cout << "\nWas 2, now 1:\n";
-    new_table.print_clusters_1_vs_2(old_table);
-}
--- a/tools/unicode_properties_parse/format_width_estimate_intervals.py
+++ b/tools/unicode_properties_parse/format_width_estimate_intervals.py
@ -0,0 +1,201 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# The following code generates data for _Width_estimate_intervals_v2 in <format>.
+
+import re
+from enum import Enum
+from typing import TextIO
+from pathlib import Path
+
+
+# Width estimation: each code point is estimated to have a width of either 1 or 2.
+class UnicodeWidth(Enum):
+    IS_1 = 1
+    IS_2 = 2
+
+
+class UnicodeWidthTable:
+    """Width estimate for every code point; all entries start as UnicodeWidth.IS_1."""

+    # A valid Unicode code point won't exceed MAX_CODE_POINT.
+    MAX_CODE_POINT: int = 0x10FFFF
+    TABLE_SIZE: int = MAX_CODE_POINT + 1

+    def __init__(self):
+        self.table = [UnicodeWidth.IS_1] * self.TABLE_SIZE

+    def fill_range(self, rng: tuple, width: UnicodeWidth):
+        """Set width for each code point in the closed range rng = (first, last)."""
+        from_, to_ = rng
+        # A negative bound would index from the end of the list, so reject it as well.
+        assert 0 <= from_ <= to_ <= self.MAX_CODE_POINT, "invalid range"
+        self.table[from_ : to_ + 1] = [width] * (to_ - from_ + 1)

+    def width_estimate_intervals(self):
+        """
+        Return the table as a string of hexadecimal unsigned integer literals joined
+        by ", ": one literal for each code point whose width differs from that of the
+        preceding code point. The width at code point 0 is asserted to be 1, so the
+        listed code points alternately begin runs of width 2 and runs of width 1.
+        For example, `[1, 1, 2, 2, 2, 1]` would be represented as `"0x2u, 0x5u"`.
+        """
+        values = []
+        assert self.table[0] == UnicodeWidth.IS_1
+        for u in range(1, self.TABLE_SIZE):
+            assert self.table[u] in (UnicodeWidth.IS_1, UnicodeWidth.IS_2)
+            if self.table[u] != self.table[u - 1]:
+                values.append(u)

+        return ", ".join([f"0x{u:X}u" for u in values])

+    def print_ranges_1_vs_2(self, other):
+        """Print all closed ranges where self's width is 1 and other's width is 2."""

+        def _1_vs_2(u: int):
+            return (
+                self.table[u] == UnicodeWidth.IS_1
+                and other.table[u] == UnicodeWidth.IS_2
+            )

+        u = 0
+        while u < self.TABLE_SIZE:
+            if _1_vs_2(u):
+                from_ = u
+                to_ = from_
+                while to_ + 1 < self.TABLE_SIZE and _1_vs_2(to_ + 1):
+                    to_ += 1
+                if from_ == to_:
+                    print(f"U+{from_:X}")
+                else:
+                    print(f"U+{from_:X}..U+{to_:X}")
+                u = to_
+            u += 1
+
+
+def get_table_cpp20() -> UnicodeWidthTable:
+    """Return the cpp20 table: width 2 inside the ranges below, width 1 elsewhere."""
+    table = UnicodeWidthTable()
+    # Each pair is a closed range (first, last) of code points.
+    for first, last in (
+        (0x1100, 0x115F),
+        (0x2329, 0x232A),
+        (0x2E80, 0x303E),
+        (0x3040, 0xA4CF),
+        (0xAC00, 0xD7A3),
+        (0xF900, 0xFAFF),
+        (0xFE10, 0xFE19),
+        (0xFE30, 0xFE6F),
+        (0xFF00, 0xFF60),
+        (0xFFE0, 0xFFE6),
+        (0x1F300, 0x1F64F),
+        (0x1F900, 0x1F9FF),
+        (0x20000, 0x2FFFD),
+        (0x30000, 0x3FFFD),
+    ):
+        table.fill_range((first, last), UnicodeWidth.IS_2)

+    return table
+
+
+def read_from(source: TextIO) -> UnicodeWidthTable:
+    """
+    Read data from "EastAsianWidth.txt".
+    The latest version can be found at:
+    https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
+    The current implementation works for:
+    https://www.unicode.org/Public/15.1.0/ucd/EastAsianWidth.txt
+    To make this function work, the file should not contain a BOM.
+    """
+    table = UnicodeWidthTable()

+    # "The unassigned code points in the following blocks default to "W":"
+    default_wide_ranges = [
+        (0x4E00, 0x9FFF),
+        (0x3400, 0x4DBF),
+        (0xF900, 0xFAFF),
+        (0x20000, 0x2FFFD),
+        (0x30000, 0x3FFFD),
+    ]
+    for rng in default_wide_ranges:
+        table.fill_range(rng, UnicodeWidth.IS_2)

+    # Read explicitly assigned ranges.
+    # The lines that are not empty or pure comment are uniformly of the format "HEX(..HEX)? ; (A|F|H|N|Na|W) #comment".
+    LINE_REGEX = re.compile(r"([0-9A-F]+)(\.\.[0-9A-F]+)? *; *(A|F|H|N|Na|W) *#.*")

+    def get_width(category: str) -> UnicodeWidth:
+        # Only the categories F and W are estimated as width 2.
+        if category in ("F", "W"):
+            return UnicodeWidth.IS_2
+        assert category in ("A", "H", "N", "Na")
+        return UnicodeWidth.IS_1

+    for line in source:
+        line = line.strip()
+        if line and not line.startswith("#"):
+            match = LINE_REGEX.fullmatch(line)
+            assert match, line  # invalid line
+            from_val = int(match.group(1), base=16)
+            width = get_width(match.group(3))
+            if match.group(2):
+                # range (HEX..HEX); group 2 still carries the leading ".."
+                to_val = int(match.group(2)[2:], base=16)
+                table.fill_range((from_val, to_val), width)
+            else:
+                # single character (HEX)
+                table.table[from_val] = width

+    return table
+
+
+def get_table_cpp23(source: TextIO) -> UnicodeWidthTable:
+    """Build the table with read_from(source), then force the ranges below to width 2."""
+    # Override with ranges specified by N4958 [format.string.std]/13.
+    std_overrides = (
+        (0x4DC0, 0x4DFF),
+        (0x1F300, 0x1F5FF),
+        (0x1F900, 0x1F9FF),
+    )

+    result = read_from(source)
+    for lo, hi in std_overrides:
+        result.fill_range((lo, hi), UnicodeWidth.IS_2)

+    return result
+
+
+WIDTH_ESTIMATE_INTERVALS_TEMPLATE = """
+{filename}
+{timestamp}
+inline constexpr char32_t _Width_estimate_intervals_v2[] = {{ //
+{values} }};
+"""  # str.format() template filled in by main(); "{{" and "}}" yield literal braces
+
+
+def main():
+    """Print the old table, the new table, and the ranges whose width changed."""
+    print("Old table:")
+    cpp20_table = get_table_cpp20()
+    print(cpp20_table.width_estimate_intervals())

+    data_path = Path(__file__).absolute().with_name("EastAsianWidth.txt")
+    with open(data_path, mode="rt", encoding="utf-8") as source:
+        # The first two lines of the file are emitted with "#" replaced by "//".
+        header = [source.readline().replace("#", "//").rstrip() for _ in range(2)]
+        cpp23_table = get_table_cpp23(source)
+    print("\nNew table:")
+    template = WIDTH_ESTIMATE_INTERVALS_TEMPLATE.lstrip()
+    values = cpp23_table.width_estimate_intervals()
+    print(template.format(filename=header[0], timestamp=header[1], values=values))
+    # Each entry: heading, table where the width is 1, table where the width is 2.
+    for heading, narrow_in, wide_in in (
+        ("Was 1, now 2:", cpp20_table, cpp23_table),
+        ("\nWas 2, now 1:", cpp23_table, cpp20_table),
+    ):
+        print(heading)
+        narrow_in.print_ranges_1_vs_2(wide_in)
+
+
+if __name__ == "__main__":
+    main()  # run the generator only when executed as a script, not when imported