P2675R1 `<format>` generator converted from C++ to Python (#3994)

Co-authored-by: achabense <60953653+achabense@users.noreply.github.com>
Co-authored-by: Stephan T. Lavavej <stl@nuwen.net>
Co-authored-by: nicole mazzuca <83086508+strega-nil-ms@users.noreply.github.com>
This commit is contained in:
Igor Zhukov 2023-10-07 01:45:31 +07:00 коммит произвёл GitHub
Родитель 283cf32878
Коммит d61f7e037b
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
3 изменённых файлов: 205 добавлений и 210 удалений

Просмотреть файл

@ -987,8 +987,11 @@ _NODISCARD constexpr bool _Is_execution_charset_self_synchronizing() {
#endif // ^^^ EDG workaround ^^^
}
// Generated per N4950 [format.string.std]/13, by tools/unicode_properties_parse/format_width_estimate_intervals.cpp
// Generated per N4950 [format.string.std]/13, by tools/unicode_properties_parse/format_width_estimate_intervals.py
// in the https://github.com/microsoft/stl repository.
// EastAsianWidth-15.0.0.txt
// Date: 2022-05-24, 17:40:20 GMT [KW, LI]
inline constexpr char32_t _Width_estimate_intervals_v2[] = { //
0x1100u, 0x1160u, 0x231Au, 0x231Cu, 0x2329u, 0x232Bu, 0x23E9u, 0x23EDu, 0x23F0u, 0x23F1u, 0x23F3u, 0x23F4u, 0x25FDu,
0x25FFu, 0x2614u, 0x2616u, 0x2648u, 0x2654u, 0x267Fu, 0x2680u, 0x2693u, 0x2694u, 0x26A1u, 0x26A2u, 0x26AAu, 0x26ACu,

Просмотреть файл

@ -1,209 +0,0 @@
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// The following code generates data for `_Width_estimate_intervals_v2` in <format>.
#include <charconv>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <regex>
#include <source_location>
#include <string>
#include <system_error>
#include <vector>
using namespace std;
// Checks `test`; when it is false, reports `msg` on stderr together with the line number
// stored in `loc` and terminates the program with EXIT_FAILURE.
// `loc` defaults to `source_location::current()`, evaluated at the call site.
void verify(bool test, const char* msg, source_location loc = source_location::current()) {
    if (test) {
        return;
    }

    cerr << "Error at line " << loc.line() << ": " << msg << endl;
    exit(EXIT_FAILURE);
}
// Diagnostic text handed to `verify` by the checks in this file that have no more specific message.
constexpr const char* impl_assertion_failed = "impl assertion failed";
// A pair of code point values [from, to]; `table_u::fill_range` treats both ends as included.
struct range_u {
    uint32_t from;
    uint32_t to;

    // Range running from `first` through `last`.
    constexpr range_u(uint32_t first, uint32_t last) : from{first}, to{last} {}

    // Range holding the single value `v` (from == to == v).
    constexpr explicit range_u(uint32_t v) : range_u(v, v) {}
};
// Width estimate stored per code point: 1 or 2. Backed by `bool`.
enum class width_u : bool {
    is_1 = false,
    is_2 = true,
};
class table_u {
private:
// A valid Unicode code point won't exceed `max_u`.
static constexpr uint32_t max_u = 0x10'ffff;
vector<width_u> table;
public:
table_u() : table(max_u + 1, width_u::is_1) {}
void fill_range(const range_u rng, const width_u width) {
const auto [from, to] = rng;
verify(from <= to, impl_assertion_failed);
verify(to <= max_u, impl_assertion_failed);
for (uint32_t u = from; u <= to; ++u) {
table[u] = width;
}
}
void print_intervals() const {
// Print table for `_Width_estimate_intervals_v2`.
int c = 0;
width_u last = table[0];
for (uint32_t u = 0; u <= max_u; ++u) {
if (table[u] != last) {
cout << "0x" << hex << uppercase << u << "u, ";
if (++c == 12) {
c = 0;
cout << endl;
}
}
last = table[u];
}
cout << endl;
}
void print_clusters_1_vs_2(const table_u& other) const {
vector<bool> cluster_table(max_u + 1, false);
for (uint32_t u = 0; u <= max_u; ++u) {
if (table[u] == width_u::is_1 && other.table[u] == width_u::is_2) {
cluster_table[u] = true;
}
}
for (uint32_t u = 0; u <= max_u; ++u) {
if (cluster_table[u]) {
const uint32_t from = u;
uint32_t to = from;
while (to + 1 <= max_u && cluster_table[to + 1]) {
++to;
}
if (from == to) {
cout << hex << uppercase << "U+" << from << endl;
} else {
cout << hex << uppercase << "U+" << from << "..U+" << to << endl;
}
u = to;
}
}
}
};
// Builds the table that `main` prints as the "Old table": width 2 for the hard-coded ranges
// listed below, width 1 everywhere else.
table_u get_table_cpp20() {
    static constexpr range_u wide_ranges[]{
        {0x1100, 0x115F},
        {0x2329, 0x232A},
        {0x2E80, 0x303E},
        {0x3040, 0xA4CF},
        {0xAC00, 0xD7A3},
        {0xF900, 0xFAFF},
        {0xFE10, 0xFE19},
        {0xFE30, 0xFE6F},
        {0xFF00, 0xFF60},
        {0xFFE0, 0xFFE6},
        {0x1F300, 0x1F64F},
        {0x1F900, 0x1F9FF},
        {0x20000, 0x2FFFD},
        {0x30000, 0x3FFFD},
    };

    table_u result;
    for (const auto& wide : wide_ranges) {
        result.fill_range(wide, width_u::is_2);
    }
    return result;
}
// Read data from "EastAsianWidth.txt".
// The latest version can be found at:
// https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
// The current implementation works for:
// https://www.unicode.org/Public/15.0.0/ucd/EastAsianWidth.txt
// To make this function work, the file should not contain a BOM.
// Builds a width table from an already opened "EastAsianWidth.txt" stream.
// Terminates through `verify` when the stream is not usable or a data line does not parse.
table_u read_from(ifstream& source) {
    table_u result;

    // "The unassigned code points in the following blocks default to "W":"
    static constexpr range_u default_wide_ranges[]{
        {0x4E00, 0x9FFF},
        {0x3400, 0x4DBF},
        {0xF900, 0xFAFF},
        {0x20000, 0x2FFFD},
        {0x30000, 0x3FFFD},
    };
    for (const range_u& wide : default_wide_ranges) {
        result.fill_range(wide, width_u::is_2);
    }

    // Translates a property value into a width: "F" and "W" are width 2, the rest width 1.
    const auto to_width = [](const string& property) {
        if (property != "F" && property != "W") {
            verify(property == "A" || property == "H" || property == "N" || property == "Na", impl_assertion_failed);
            return width_u::is_1;
        }
        return width_u::is_2;
    };

    // Parses `digits` as a hexadecimal number; the whole string must be consumed.
    const auto parse_hex = [](const string& digits) {
        const char* const first = digits.data();
        const char* const last  = first + digits.size();
        uint32_t value{};
        const auto [end_ptr, ec] = from_chars(first, last, value, 16);
        verify(end_ptr == last, impl_assertion_failed);
        verify(ec == errc{}, impl_assertion_failed);
        return value;
    };

    verify(!!source, "invalid path");

    // Read explicitly assigned ranges.
    // The lines that are not empty or pure comment are uniformly of the format "HEX(..HEX)?;(A|F|H|N|Na|W) #comment".
    const regex line_pattern(R"(([0-9A-Z]+)(\.\.[0-9A-Z]+)?;(A|F|H|N|Na|W) *#.*)");
    string line;
    while (getline(source, line)) {
        if (line.empty() || line.starts_with("#")) {
            continue;
        }

        smatch match;
        verify(regex_match(line, match, line_pattern), "invalid line");
        verify(match[1].matched, impl_assertion_failed);
        verify(match[3].matched, impl_assertion_failed);

        const width_u width = to_width(match[3].str());
        const uint32_t from = parse_hex(match[1].str());

        // A single character (HEX) is the range [from, from]; "HEX..HEX" supplies its own end.
        uint32_t to = from;
        if (match[2].matched) {
            const string dotted_end = match[2].str();
            verify(dotted_end.starts_with(".."), impl_assertion_failed);
            to = parse_hex(dotted_end.substr(2));
        }
        result.fill_range({from, to}, width);
    }
    return result;
}
// Builds the table that `main` prints as the "New table": the data read from `source`,
// with the ranges below then forced to width 2.
table_u get_table_cpp23(ifstream& source) {
    // Override with ranges specified by the C++ standard.
    static constexpr range_u forced_wide_ranges[]{
        {0x4DC0, 0x4DFF},
        {0x1F300, 0x1F5FF},
        {0x1F900, 0x1F9FF},
    };

    table_u result = read_from(source);
    for (const auto& forced : forced_wide_ranges) {
        result.fill_range(forced, width_u::is_2);
    }
    return result;
}
int main() {
cout << "Old table:\n";
const table_u old_table = get_table_cpp20();
old_table.print_intervals();
cout << "\nNew table:\nInput path for EastAsianWidth.txt: ";
string path;
getline(cin, path);
ifstream source(path);
const table_u new_table = get_table_cpp23(source);
new_table.print_intervals();
cout << "\nWas 1, now 2:\n";
old_table.print_clusters_1_vs_2(new_table);
cout << "\nWas 2, now 1:\n";
new_table.print_clusters_1_vs_2(old_table);
}

Просмотреть файл

@ -0,0 +1,201 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
# The following code generates data for _Width_estimate_intervals_v2 in <format>.
import re
from enum import Enum
from typing import TextIO
from pathlib import Path
# Width estimation.
class UnicodeWidth(Enum):
    """Width estimate assigned to a code point: IS_1 (value 1) or IS_2 (value 2)."""

    # Members are left unannotated on purpose: an annotation such as `IS_1: int` would
    # describe the member as a plain int, whereas it is a UnicodeWidth instance.
    IS_1 = 1
    IS_2 = 2
class UnicodeWidthTable:
    """Holds one width estimate for every code point in [0, MAX_CODE_POINT]."""

    # A valid Unicode code point won't exceed MAX_CODE_POINT.
    MAX_CODE_POINT: int = 0x10FFFF
    TABLE_SIZE: int = MAX_CODE_POINT + 1

    def __init__(self):
        # Every code point starts out with width 1.
        self.table = [UnicodeWidth.IS_1] * (self.TABLE_SIZE)

    # "rng" denotes a right-closed range.
    def fill_range(self, rng: tuple, width: "UnicodeWidth"):
        """
        Assign `width` to every code point from rng[0] through rng[1] (both included).

        Raises AssertionError when the range is reversed, starts below 0,
        or ends past MAX_CODE_POINT.
        """
        from_, to_ = rng
        # A negative start would be read as an index counted from the end of the list;
        # the slice assignment below would then insert elements and change the table's length.
        assert 0 <= from_ <= to_, "invalid range"
        assert to_ <= self.MAX_CODE_POINT, "invalid range"
        self.table[from_ : to_ + 1] = [width] * (to_ - from_ + 1)

    def width_estimate_intervals(self):
        """
        Creates a string representation of the map (in `self.table`) from
        Unicode code points to their width, using hexadecimal unsigned integer literals.
        Since there are long runs of code points of one width or the other,
        this representation is a list of code points where the width switches.
        Additionally, the width is assumed to start at `1` from the beginning of the list.
        For example, `[1, 1, 2, 2, 2, 1]` would be represented as `"0x2u, 0x5u"`.
        """
        values = []
        assert self.table[0] == UnicodeWidth.IS_1
        for u in range(1, self.TABLE_SIZE):
            assert (
                self.table[u] == UnicodeWidth.IS_1 or self.table[u] == UnicodeWidth.IS_2
            )
            if self.table[u] != self.table[u - 1]:
                values.append(u)
        return ", ".join([f"0x{u:X}u" for u in values])

    # Print all ranges (right-closed), where self's width is 1 and other's width is 2.
    def print_ranges_1_vs_2(self, other):
        def _1_vs_2(u: int):
            return (
                self.table[u] == UnicodeWidth.IS_1
                and other.table[u] == UnicodeWidth.IS_2
            )

        u = 0
        while u < self.TABLE_SIZE:
            if _1_vs_2(u):
                from_ = u
                to_ = from_
                # Extend the run for as long as the condition keeps holding.
                while to_ + 1 < self.TABLE_SIZE and _1_vs_2(to_ + 1):
                    to_ += 1
                if from_ == to_:
                    print(f"U+{from_:X}")
                else:
                    print(f"U+{from_:X}..U+{to_:X}")
                u = to_
            u += 1
def get_table_cpp20() -> UnicodeWidthTable:
    """
    Build the table that main() prints as the "Old table": width 2 for the
    hard-coded ranges below, width 1 everywhere else.
    """
    table = UnicodeWidthTable()
    for first, last in (
        (0x1100, 0x115F),
        (0x2329, 0x232A),
        (0x2E80, 0x303E),
        (0x3040, 0xA4CF),
        (0xAC00, 0xD7A3),
        (0xF900, 0xFAFF),
        (0xFE10, 0xFE19),
        (0xFE30, 0xFE6F),
        (0xFF00, 0xFF60),
        (0xFFE0, 0xFFE6),
        (0x1F300, 0x1F64F),
        (0x1F900, 0x1F9FF),
        (0x20000, 0x2FFFD),
        (0x30000, 0x3FFFD),
    ):
        table.fill_range((first, last), UnicodeWidth.IS_2)
    return table
def read_from(source: TextIO) -> UnicodeWidthTable:
    """
    Build a width table from the lines of "EastAsianWidth.txt".
    The latest version can be found at:
    https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
    The current implementation works for:
    https://www.unicode.org/Public/15.1.0/ucd/EastAsianWidth.txt
    To make this function work, the file should not contain a BOM.
    """
    # The lines that are not empty or pure comment are uniformly of the format "HEX(..HEX)? ; (A|F|H|N|Na|W) #comment".
    LINE_REGEX = re.compile(r"([0-9A-Z]+)(\.\.[0-9A-Z]+)? *; *(A|F|H|N|Na|W) *#.*")

    def width_of(prop: str):
        # "F" and "W" map to width 2; the remaining property values map to width 1.
        if prop in ("F", "W"):
            return UnicodeWidth.IS_2
        assert prop in ("A", "H", "N", "Na")
        return UnicodeWidth.IS_1

    table = UnicodeWidthTable()

    # "The unassigned code points in the following blocks default to "W":"
    for default_wide in (
        (0x4E00, 0x9FFF),
        (0x3400, 0x4DBF),
        (0xF900, 0xFAFF),
        (0x20000, 0x2FFFD),
        (0x30000, 0x3FFFD),
    ):
        table.fill_range(default_wide, UnicodeWidth.IS_2)

    # Read explicitly assigned ranges.
    for raw_line in source:
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue

        match = LINE_REGEX.fullmatch(line)
        assert match, line  # invalid line

        first = int(match.group(1), base=16)
        width = width_of(match.group(3))
        dotted_end = match.group(2)
        if dotted_end:
            # range (HEX..HEX); drop the leading ".." before parsing the end
            last = int(dotted_end[2:], base=16)
            table.fill_range((first, last), width)
        else:
            # single character (HEX)
            table.table[first] = width
    return table
def get_table_cpp23(source: TextIO) -> UnicodeWidthTable:
    """
    Build the table that main() prints as the "New table": the data read from
    `source`, with the ranges below then forced to width 2.
    """
    # Override with ranges specified by N4958 [format.string.std]/13.
    forced_wide_ranges = (
        (0x4DC0, 0x4DFF),
        (0x1F300, 0x1F5FF),
        (0x1F900, 0x1F9FF),
    )
    table = read_from(source)
    for forced in forced_wide_ranges:
        table.fill_range(forced, UnicodeWidth.IS_2)
    return table
# Layout of the C++ snippet that main() prints for the new table.
# `{filename}`, `{timestamp}` and `{values}` are filled in with str.format;
# the doubled braces come out as single literal braces.
WIDTH_ESTIMATE_INTERVALS_TEMPLATE = """
{filename}
{timestamp}
inline constexpr char32_t _Width_estimate_intervals_v2[] = {{ //
{values} }};
"""
def main():
    """
    Print the old table, the generated C++ snippet for the new table, and the
    code points whose width differs between the two.

    Reads "EastAsianWidth.txt" from the directory that contains this script.
    """
    print("Old table:")
    old_table = get_table_cpp20()
    print(old_table.width_estimate_intervals())

    path = Path(__file__).absolute().with_name("EastAsianWidth.txt")
    # "utf-8-sig" decodes plain UTF-8 unchanged and drops a leading BOM if there is one,
    # so a BOM can never end up inside the `filename` comment emitted below.
    with open(path, mode="rt", encoding="utf-8-sig") as source:
        # The first two lines of the file are comment lines; they are re-emitted
        # as C++ comments above the generated array.
        filename = source.readline().replace("#", "//").rstrip()
        timestamp = source.readline().replace("#", "//").rstrip()
        new_table = get_table_cpp23(source)

    print("\nNew table:")
    print(
        WIDTH_ESTIMATE_INTERVALS_TEMPLATE.lstrip().format(
            filename=filename,
            timestamp=timestamp,
            values=new_table.width_estimate_intervals(),
        )
    )

    print("Was 1, now 2:")
    old_table.print_ranges_1_vs_2(new_table)
    print("\nWas 2, now 1:")
    new_table.print_ranges_1_vs_2(old_table)
# Run the generator only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()