зеркало из https://github.com/microsoft/STL.git
P2675R1 `<format>` generator converted from C++ to Python (#3994)
Co-authored-by: achabense <60953653+achabense@users.noreply.github.com> Co-authored-by: Stephan T. Lavavej <stl@nuwen.net> Co-authored-by: nicole mazzuca <83086508+strega-nil-ms@users.noreply.github.com>
This commit is contained in:
Родитель
283cf32878
Коммит
d61f7e037b
|
@ -987,8 +987,11 @@ _NODISCARD constexpr bool _Is_execution_charset_self_synchronizing() {
|
|||
#endif // ^^^ EDG workaround ^^^
|
||||
}
|
||||
|
||||
// Generated per N4950 [format.string.std]/13, by tools/unicode_properties_parse/format_width_estimate_intervals.cpp
|
||||
// Generated per N4950 [format.string.std]/13, by tools/unicode_properties_parse/format_width_estimate_intervals.py
|
||||
// in the https://github.com/microsoft/stl repository.
|
||||
|
||||
// EastAsianWidth-15.0.0.txt
|
||||
// Date: 2022-05-24, 17:40:20 GMT [KW, LI]
|
||||
inline constexpr char32_t _Width_estimate_intervals_v2[] = { //
|
||||
0x1100u, 0x1160u, 0x231Au, 0x231Cu, 0x2329u, 0x232Bu, 0x23E9u, 0x23EDu, 0x23F0u, 0x23F1u, 0x23F3u, 0x23F4u, 0x25FDu,
|
||||
0x25FFu, 0x2614u, 0x2616u, 0x2648u, 0x2654u, 0x267Fu, 0x2680u, 0x2693u, 0x2694u, 0x26A1u, 0x26A2u, 0x26AAu, 0x26ACu,
|
||||
|
|
|
@ -1,209 +0,0 @@
|
|||
// Copyright (c) Microsoft Corporation.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
|
||||
// The following code generates data for `_Width_estimate_intervals_v2` in <format>.
|
||||
|
||||
#include <charconv>
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <regex>
|
||||
#include <source_location>
|
||||
#include <string>
|
||||
#include <system_error>
|
||||
#include <vector>
|
||||
using namespace std;
|
||||
|
||||
void verify(bool test, const char* msg, source_location loc = source_location::current()) {
|
||||
if (!test) {
|
||||
cerr << "Error at line " << loc.line() << ": " << msg << endl;
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
// Message passed to `verify` by this tool's internal consistency checks.
constexpr const char* impl_assertion_failed = "impl assertion failed";
|
||||
|
||||
// A closed interval of code point values: both `from` and `to` belong to the range.
struct range_u {
    uint32_t from;
    uint32_t to;

    // Range holding the single value `v`.
    constexpr explicit range_u(uint32_t v) : range_u(v, v) {}

    // Range running from `f` through `t`.
    constexpr range_u(uint32_t f, uint32_t t) : from{f}, to{t} {}
};
|
||||
|
||||
// Estimated width of a code point: 1 (`is_1`) or 2 (`is_2`).
enum class width_u : bool { is_1 = false, is_2 = true };
|
||||
|
||||
class table_u {
|
||||
private:
|
||||
// A valid Unicode code point won't exceed `max_u`.
|
||||
static constexpr uint32_t max_u = 0x10'ffff;
|
||||
vector<width_u> table;
|
||||
|
||||
public:
|
||||
table_u() : table(max_u + 1, width_u::is_1) {}
|
||||
|
||||
void fill_range(const range_u rng, const width_u width) {
|
||||
const auto [from, to] = rng;
|
||||
verify(from <= to, impl_assertion_failed);
|
||||
verify(to <= max_u, impl_assertion_failed);
|
||||
for (uint32_t u = from; u <= to; ++u) {
|
||||
table[u] = width;
|
||||
}
|
||||
}
|
||||
|
||||
void print_intervals() const {
|
||||
// Print table for `_Width_estimate_intervals_v2`.
|
||||
int c = 0;
|
||||
width_u last = table[0];
|
||||
for (uint32_t u = 0; u <= max_u; ++u) {
|
||||
if (table[u] != last) {
|
||||
cout << "0x" << hex << uppercase << u << "u, ";
|
||||
if (++c == 12) {
|
||||
c = 0;
|
||||
cout << endl;
|
||||
}
|
||||
}
|
||||
last = table[u];
|
||||
}
|
||||
cout << endl;
|
||||
}
|
||||
|
||||
void print_clusters_1_vs_2(const table_u& other) const {
|
||||
vector<bool> cluster_table(max_u + 1, false);
|
||||
for (uint32_t u = 0; u <= max_u; ++u) {
|
||||
if (table[u] == width_u::is_1 && other.table[u] == width_u::is_2) {
|
||||
cluster_table[u] = true;
|
||||
}
|
||||
}
|
||||
|
||||
for (uint32_t u = 0; u <= max_u; ++u) {
|
||||
if (cluster_table[u]) {
|
||||
const uint32_t from = u;
|
||||
uint32_t to = from;
|
||||
while (to + 1 <= max_u && cluster_table[to + 1]) {
|
||||
++to;
|
||||
}
|
||||
if (from == to) {
|
||||
cout << hex << uppercase << "U+" << from << endl;
|
||||
} else {
|
||||
cout << hex << uppercase << "U+" << from << "..U+" << to << endl;
|
||||
}
|
||||
u = to;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Returns a table in which exactly the ranges listed below are width 2 and every other
// code point keeps the default width 1.
table_u get_table_cpp20() {
    static constexpr range_u wide_ranges[] = {
        {0x1100, 0x115F},
        {0x2329, 0x232A},
        {0x2E80, 0x303E},
        {0x3040, 0xA4CF},
        {0xAC00, 0xD7A3},
        {0xF900, 0xFAFF},
        {0xFE10, 0xFE19},
        {0xFE30, 0xFE6F},
        {0xFF00, 0xFF60},
        {0xFFE0, 0xFFE6},
        {0x1F300, 0x1F64F},
        {0x1F900, 0x1F9FF},
        {0x20000, 0x2FFFD},
        {0x30000, 0x3FFFD},
    };

    table_u result;
    for (const range_u& wide : wide_ranges) {
        result.fill_range(wide, width_u::is_2);
    }
    return result;
}
|
||||
|
||||
// Read data from "EastAsianWidth.txt".
|
||||
// The latest version can be found at:
|
||||
// https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
|
||||
// The current implementation works for:
|
||||
// https://www.unicode.org/Public/15.0.0/ucd/EastAsianWidth.txt
|
||||
// To make this function work, the file should not contain a BOM.
|
||||
table_u read_from(ifstream& source) {
|
||||
table_u table;
|
||||
|
||||
// "The unassigned code points in the following blocks default to "W":"
|
||||
static constexpr range_u default_wide_ranges[]{
|
||||
{0x4E00, 0x9FFF}, {0x3400, 0x4DBF}, {0xF900, 0xFAFF}, {0x20000, 0x2FFFD}, {0x30000, 0x3FFFD}};
|
||||
for (const range_u& rng : default_wide_ranges) {
|
||||
table.fill_range(rng, width_u::is_2);
|
||||
}
|
||||
|
||||
// Read explicitly assigned ranges.
|
||||
// The lines that are not empty or pure comment are uniformly of the format "HEX(..HEX)?;(A|F|H|N|Na|W) #comment".
|
||||
auto get_width = [](const string& str) {
|
||||
if (str == "F" || str == "W") {
|
||||
return width_u::is_2;
|
||||
} else {
|
||||
verify(str == "A" || str == "H" || str == "N" || str == "Na", impl_assertion_failed);
|
||||
return width_u::is_1;
|
||||
}
|
||||
};
|
||||
auto get_value = [](const string& str) {
|
||||
uint32_t value{};
|
||||
const auto [end_ptr, ec] = from_chars(str.data(), str.data() + str.size(), value, 16);
|
||||
verify(end_ptr == str.data() + str.size(), impl_assertion_failed);
|
||||
verify(ec == errc{}, impl_assertion_failed);
|
||||
return value;
|
||||
};
|
||||
|
||||
verify(!!source, "invalid path");
|
||||
string line;
|
||||
const regex reg(R"(([0-9A-Z]+)(\.\.[0-9A-Z]+)?;(A|F|H|N|Na|W) *#.*)");
|
||||
while (getline(source, line)) {
|
||||
if (!line.empty() && !line.starts_with("#")) {
|
||||
smatch match;
|
||||
verify(regex_match(line, match, reg), "invalid line");
|
||||
verify(match[1].matched, impl_assertion_failed);
|
||||
verify(match[3].matched, impl_assertion_failed);
|
||||
const width_u width = get_width(match[3].str());
|
||||
const uint32_t from = get_value(match[1].str());
|
||||
if (match[2].matched) {
|
||||
// range (HEX..HEX)
|
||||
const string match2 = match[2].str();
|
||||
verify(match2.starts_with(".."), impl_assertion_failed);
|
||||
table.fill_range({from, get_value(match2.substr(2))}, width);
|
||||
} else {
|
||||
// single character (HEX)
|
||||
table.fill_range(range_u{from}, width);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return table;
|
||||
}
|
||||
|
||||
// Builds the table from `source` with `read_from`, then forces the ranges below to width 2.
table_u get_table_cpp23(std::ifstream& source) {
    // Override with ranges specified by the C++ standard.
    static constexpr range_u forced_wide_ranges[]{
        {0x4DC0, 0x4DFF},
        {0x1F300, 0x1F5FF},
        {0x1F900, 0x1F9FF},
    };

    table_u result = read_from(source);
    for (const range_u& forced : forced_wide_ranges) {
        result.fill_range(forced, width_u::is_2);
    }
    return result;
}
|
||||
|
||||
int main() {
|
||||
cout << "Old table:\n";
|
||||
const table_u old_table = get_table_cpp20();
|
||||
old_table.print_intervals();
|
||||
|
||||
cout << "\nNew table:\nInput path for EastAsianWidth.txt: ";
|
||||
string path;
|
||||
getline(cin, path);
|
||||
ifstream source(path);
|
||||
const table_u new_table = get_table_cpp23(source);
|
||||
new_table.print_intervals();
|
||||
|
||||
cout << "\nWas 1, now 2:\n";
|
||||
old_table.print_clusters_1_vs_2(new_table);
|
||||
cout << "\nWas 2, now 1:\n";
|
||||
new_table.print_clusters_1_vs_2(old_table);
|
||||
}
|
|
@ -0,0 +1,201 @@
|
|||
# Copyright (c) Microsoft Corporation.
|
||||
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
|
||||
# The following code generates data for _Width_estimate_intervals_v2 in <format>.
|
||||
|
||||
import re
|
||||
from enum import Enum
|
||||
from typing import TextIO
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Width estimation.
|
||||
class UnicodeWidth(Enum):
    """Estimated width of a code point: 1 (``IS_1``) or 2 (``IS_2``)."""

    # Members are deliberately not annotated: the typing specification asks type checkers
    # to report an explicit annotation on an enum member as an error.
    IS_1 = 1
    IS_2 = 2
|
||||
|
||||
|
||||
class UnicodeWidthTable:
    """
    Maps every code point in [0, MAX_CODE_POINT] to a width estimate.

    `self.table` is a list indexed by code point; a new table holds
    `UnicodeWidth.IS_1` for every entry.
    """

    # A valid Unicode code point won't exceed MAX_CODE_POINT.
    MAX_CODE_POINT: int = 0x10FFFF
    TABLE_SIZE: int = MAX_CODE_POINT + 1

    def __init__(self):
        self.table = [UnicodeWidth.IS_1] * (self.TABLE_SIZE)

    # "rng" denotes a right-closed range.
    def fill_range(self, rng: tuple, width: "UnicodeWidth") -> None:
        """
        Assign `width` to every code point in the closed range `rng` = (from, to).

        Raises AssertionError when the range is reversed, starts below 0,
        or ends past MAX_CODE_POINT.
        """
        from_, to_ = rng
        # A negative start must be rejected explicitly: as a slice bound it counts from the
        # end of the list, and the assignment below would then insert elements and grow the table.
        assert 0 <= from_ <= to_, "invalid range"
        assert to_ <= self.MAX_CODE_POINT, "invalid range"
        self.table[from_ : to_ + 1] = [width] * (to_ - from_ + 1)

    def width_estimate_intervals(self) -> str:
        """
        Creates a string representation of the map (in `self.table`) from
        Unicode code points to their width, using hexadecimal unsigned integer literals.
        Since there are long runs of code points of one width or the other,
        this representation is a list of code points where the width switches.
        Additionally, the width is assumed to start at `1` from the beginning of the list.
        For example, `[1, 1, 2, 2, 2, 1]` would be represented as `"0x2u, 0x5u"`.
        """
        values = []
        assert self.table[0] == UnicodeWidth.IS_1
        for u in range(1, self.TABLE_SIZE):
            assert (
                self.table[u] == UnicodeWidth.IS_1 or self.table[u] == UnicodeWidth.IS_2
            )
            if self.table[u] != self.table[u - 1]:
                values.append(u)

        return ", ".join([f"0x{u:X}u" for u in values])

    # Print all ranges (right-closed), where self's width is 1 and other's width is 2.
    def print_ranges_1_vs_2(self, other: "UnicodeWidthTable") -> None:
        """
        Print each maximal run of such code points on its own line, as "U+X" for a
        single code point or "U+X..U+Y" for a longer run.
        """

        def _1_vs_2(u: int) -> bool:
            return (
                self.table[u] == UnicodeWidth.IS_1
                and other.table[u] == UnicodeWidth.IS_2
            )

        u = 0
        while u < self.TABLE_SIZE:
            if _1_vs_2(u):
                from_ = u
                to_ = from_
                # Extend the run for as long as the next code point still qualifies.
                while to_ + 1 < self.TABLE_SIZE and _1_vs_2(to_ + 1):
                    to_ += 1
                if from_ == to_:
                    print(f"U+{from_:X}")
                else:
                    print(f"U+{from_:X}..U+{to_:X}")
                u = to_
            u += 1
|
||||
|
||||
|
||||
def get_table_cpp20() -> UnicodeWidthTable:
    """
    Return a table in which exactly the ranges listed below are width 2
    and every other code point keeps the default width 1.
    """
    table = UnicodeWidthTable()
    for first, last in (
        (0x1100, 0x115F),
        (0x2329, 0x232A),
        (0x2E80, 0x303E),
        (0x3040, 0xA4CF),
        (0xAC00, 0xD7A3),
        (0xF900, 0xFAFF),
        (0xFE10, 0xFE19),
        (0xFE30, 0xFE6F),
        (0xFF00, 0xFF60),
        (0xFFE0, 0xFFE6),
        (0x1F300, 0x1F64F),
        (0x1F900, 0x1F9FF),
        (0x20000, 0x2FFFD),
        (0x30000, 0x3FFFD),
    ):
        table.fill_range((first, last), UnicodeWidth.IS_2)
    return table
|
||||
|
||||
|
||||
def read_from(source: TextIO) -> UnicodeWidthTable:
    """
    Read data from "EastAsianWidth.txt".
    The latest version can be found at:
    https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
    The current implementation works for:
    https://www.unicode.org/Public/15.1.0/ucd/EastAsianWidth.txt
    To make this function work, the file should not contain a BOM.

    Returns the table built from the remaining lines of `source`.
    Raises AssertionError for a data line that does not match the expected
    format or that names a range outside the table.
    """
    table = UnicodeWidthTable()

    # "The unassigned code points in the following blocks default to "W":"
    default_wide_ranges = [
        (0x4E00, 0x9FFF),
        (0x3400, 0x4DBF),
        (0xF900, 0xFAFF),
        (0x20000, 0x2FFFD),
        (0x30000, 0x3FFFD),
    ]
    for rng in default_wide_ranges:
        table.fill_range(rng, UnicodeWidth.IS_2)

    # Read explicitly assigned ranges.
    # The lines that are not empty or pure comment are uniformly of the format "HEX(..HEX)? ; (A|F|H|N|Na|W) #comment".
    LINE_REGEX = re.compile(r"([0-9A-Z]+)(\.\.[0-9A-Z]+)? *; *(A|F|H|N|Na|W) *#.*")

    def get_width(category: str) -> UnicodeWidth:
        # "F" and "W" count as width 2; the other categories count as width 1.
        if category == "F" or category == "W":
            return UnicodeWidth.IS_2
        assert category == "A" or category == "H" or category == "N" or category == "Na"
        return UnicodeWidth.IS_1

    for line in source:
        line = line.strip()
        if not line or line.startswith("#"):
            continue  # blank line or pure comment

        match = LINE_REGEX.fullmatch(line)
        assert match, line  # invalid line
        from_val = int(match.group(1), base=16)
        width = get_width(match.group(3))
        if match.group(2):
            # range (HEX..HEX): group 2 still carries the leading "..".
            to_val = int(match.group(2)[2:], base=16)
        else:
            # single character (HEX)
            to_val = from_val
        # Both cases go through fill_range so that a single code point gets the same
        # range checks as an explicit range instead of a direct list write.
        table.fill_range((from_val, to_val), width)

    return table
|
||||
|
||||
|
||||
def get_table_cpp23(source: TextIO) -> UnicodeWidthTable:
    """
    Build the table from `source` with `read_from`, then force the ranges
    listed below to width 2 regardless of what the file said.
    """
    # Override with ranges specified by N4958 [format.string.std]/13.
    forced_wide_ranges = (
        (0x4DC0, 0x4DFF),
        (0x1F300, 0x1F5FF),
        (0x1F900, 0x1F9FF),
    )

    table = read_from(source)
    for forced in forced_wide_ranges:
        table.fill_range(forced, UnicodeWidth.IS_2)
    return table
|
||||
|
||||
|
||||
# str.format template for the generated C++ array definition. It takes the fields
# `filename`, `timestamp` and `values`; "{{" and "}}" produce literal braces.
WIDTH_ESTIMATE_INTERVALS_TEMPLATE = """
|
||||
{filename}
|
||||
{timestamp}
|
||||
inline constexpr char32_t _Width_estimate_intervals_v2[] = {{ //
|
||||
{values} }};
|
||||
"""
|
||||
|
||||
|
||||
def main():
    """
    Print the interval list of the hard-coded table, then the C++ definition generated
    from the "EastAsianWidth.txt" file located next to this script, and finally the
    code point ranges whose width differs between the two tables, in each direction.
    """
    print("Old table:")
    old_table = get_table_cpp20()
    print(old_table.width_estimate_intervals())

    path = Path(__file__).absolute().with_name("EastAsianWidth.txt")
    # "utf-8-sig" decodes BOM-less UTF-8 exactly like "utf-8" and drops a leading BOM when
    # one is present, so a BOM cannot leak into the header line copied into the output.
    with open(path, mode="rt", encoding="utf-8-sig") as source:
        # The first two lines of the file are copied into the output as C++ comments.
        filename = source.readline().replace("#", "//").rstrip()
        timestamp = source.readline().replace("#", "//").rstrip()
        new_table = get_table_cpp23(source)
    print("\nNew table:")
    print(
        WIDTH_ESTIMATE_INTERVALS_TEMPLATE.lstrip().format(
            filename=filename,
            timestamp=timestamp,
            values=new_table.width_estimate_intervals(),
        )
    )
    print("Was 1, now 2:")
    old_table.print_ranges_1_vs_2(new_table)
    print("\nWas 2, now 1:")
    new_table.print_ranges_1_vs_2(old_table)
|
||||
|
||||
|
||||
# Run the generator only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
|
Загрузка…
Ссылка в новой задаче