`<format>` assumes strings are encoded in the active code page (#1834)

This commit is contained in:
Casey Carter 2021-04-20 00:21:40 -07:00 коммит произвёл GitHub
Родитель a10865713d
Коммит ccc5aaaadc
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
10 изменённых файлов: 442 добавлений и 283 удалений

Просмотреть файл

@ -249,6 +249,7 @@ endforeach()
# Objs that exist in both libcpmt[d][01].lib and msvcprt[d].lib.
set(IMPLIB_SOURCES
${CMAKE_CURRENT_LIST_DIR}/src/filesystem.cpp
${CMAKE_CURRENT_LIST_DIR}/src/format.cpp
${CMAKE_CURRENT_LIST_DIR}/src/locale0_implib.cpp
${CMAKE_CURRENT_LIST_DIR}/src/nothrow.cpp
${CMAKE_CURRENT_LIST_DIR}/src/sharedmutex.cpp

Просмотреть файл

@ -53,6 +53,7 @@
#include <stdexcept>
#include <string>
#include <string_view>
#include <xfilesystem_abi.h>
#include <xutility>
#pragma pack(push, _CRT_PACKING)
@ -62,6 +63,8 @@ _STL_DISABLE_CLANG_WARNINGS
#pragma push_macro("new")
#undef new
extern "C" _NODISCARD __std_win_error __stdcall __std_get_cvt(__std_code_page _Codepage, _Cvtvec* _Pcvt) noexcept;
_STD_BEGIN
class format_error : public runtime_error {
@ -444,95 +447,232 @@ _NODISCARD constexpr bool _Is_execution_charset_utf8() {
#pragma warning(pop)
}
inline constexpr bool _Is_execution_charset_utf8_v = _Is_execution_charset_utf8();
inline constexpr char16_t _Width_estimate_low_intervals[] = { // Per N4885 [format.string.std]/11
0x1100u, 0x1160u, 0x2329u, 0x232Bu, 0x2E80u, 0x303Fu, 0x3040u, 0xA4D0u, 0xAC00u, 0xD7A4u, 0xF900u, 0xFB00u, 0xFE10u,
0xFE1Au, 0xFE30u, 0xFE70u, 0xFF00u, 0xFF61u, 0xFFE0u, 0xFFE7u};
_NODISCARD constexpr int _Utf8_code_units_in_next_character(
const char* const _First, const char* const _Last) noexcept {
// Returns a count of the number of UTF-8 code units that compose the first encoded character in [_First, _Last),
// or -1 if [_First, _Last) doesn't contain an entire encoded character or *_First is not a valid lead byte.
const auto _Ch = static_cast<unsigned char>(*_First);
if (_Ch < 0b1000'0000u) {
return 1;
inline constexpr char32_t _Width_estimate_high_intervals[] = { // Per N4885 [format.string.std]/11
0x1F300u, 0x1F650u, 0x1F900u, 0x1FA00u, 0x20000u, 0x2FFFEu, 0x30000u, 0x3FFFEu};
template <auto& _Bounds>
_NODISCARD constexpr int _Unicode_width_estimate(const char32_t _Ch) noexcept {
// Computes the width estimation for Unicode characters from N4885 [format.string.std]/11
int _Result = 1;
for (const auto& _Bound : _Bounds) {
if (_Ch < _Bound) {
return _Result;
}
_Result ^= 0b11u; // Flip between 1 and 2 on each iteration
}
const auto _Len = static_cast<size_t>(_Last - _First);
if (_Ch < 0b1110'0000u) {
// check for non-lead byte or partial 2-byte encoded character
return (_Ch >= 0b1100'0000u && _Len >= 2) ? 2 : -1;
}
if (_Ch < 0b1111'0000u) {
// check for partial 3-byte encoded character
return (_Len >= 3) ? 3 : -1;
}
// check for partial 4-byte encoded character
return (_Len >= 4) ? 4 : -1;
return 1;
}
_NODISCARD inline int _Double_byte_encoding_code_units_in_next_character(
const char* const _First, const char* const _Last, const _Cvtvec& _Cvt) {
// Returns a count of the number of code units that compose the first encoded character in [_First, _Last),
// or -1 if [_First, _Last) doesn't contain an entire encoded character or *_First is not a valid lead byte.
wchar_t _Wide;
mbstate_t _St{};
const auto _Len = static_cast<size_t>(_Last - _First);
const int _Result = _Mbrtowc(&_Wide, _First, _Len, &_St, &_Cvt);
if (_Result > 0) {
return _Result;
} else if (_Result < 0) { // invalid or incomplete encoded character
return -1;
} else { // next code unit is '\0'
return 1;
}
}
template <class _CharT, bool _Statically_Utf8 = _Is_execution_charset_utf8()>
class _Fmt_codec;
_NODISCARD inline int _Code_units_in_next_character(const char* _First, const char* _Last, const _Cvtvec& _Cvt) {
// Returns a count of the number of code units that compose the first encoded character in
// [_First, _Last), or -1 if [_First, _Last) doesn't contain an entire encoded character or
// *_First is not a valid lead byte.
_STL_INTERNAL_CHECK(_First < _Last);
template <bool _Statically_Utf8>
class _Fmt_codec_base {};
if constexpr (_Is_execution_charset_utf8_v) {
return _Utf8_code_units_in_next_character(_First, _Last);
} else {
switch (_Cvt._Mbcurmax) {
default:
_STL_INTERNAL_CHECK(!"Bad number of encoding units for this code page");
[[fallthrough]];
case 1:
return 1; // all characters have only one code unit
template <>
class _Fmt_codec_base<false> {
protected:
_Cvtvec _Cvt;
case 2:
return _Double_byte_encoding_code_units_in_next_character(_First, _Last, _Cvt);
case 4: // Assume UTF-8 (as does _Mbrtowc)
return _Utf8_code_units_in_next_character(_First, _Last);
_NODISCARD int _Double_byte_encoding_code_units_in_next_character(
const char* const _First, const char* const _Last) const {
// Returns a count of the number of code units that compose the first encoded character in [_First, _Last),
// or -1 if [_First, _Last) doesn't contain an entire encoded character or *_First is not a valid lead byte.
wchar_t _Wide;
mbstate_t _St{};
const auto _Len = static_cast<size_t>(_Last - _First);
const int _Result = _Mbrtowc(&_Wide, _First, _Len, &_St, &_Cvt);
if (_Result > 0) {
return _Result;
} else if (_Result < 0) { // invalid or incomplete encoded character
return -1;
} else { // next code unit is '\0'
return 1;
}
}
}
_NODISCARD inline int _Code_units_in_next_character(const wchar_t* _First, const wchar_t* _Last, const _Cvtvec&) {
// Returns a count of the number of code units that compose the first encoded character in
// [_First, _Last), or -1 if [_First, _Last) doesn't contain an entire encoded character or
// *_First is an unpaired surrogate.
_STL_INTERNAL_CHECK(_First < _Last);
_Fmt_codec_base() {
#ifndef _FORMAT_CODEPAGE
#define _FORMAT_CODEPAGE __std_code_page::_Acp
#endif // _FORMAT_CODEPAGE
[[maybe_unused]] const __std_win_error _Result = __std_get_cvt(_FORMAT_CODEPAGE, &_Cvt);
_STL_INTERNAL_CHECK(_Result == __std_win_error::_Success);
#undef _FORMAT_CODEPAGE
}
};
if (*_First < 0xD800u || *_First >= 0xE000u) {
return 1;
template <bool _Statically_Utf8>
class _Fmt_codec<char, _Statically_Utf8> : private _Fmt_codec_base<_Statically_Utf8> {
private:
_NODISCARD static constexpr int _Utf8_code_units_in_next_character(
const char* const _First, const char* const _Last) noexcept {
// Returns a count of the number of UTF-8 code units that compose the first encoded character in [_First,
// _Last), or -1 if [_First, _Last) doesn't contain an entire encoded character or *_First is not a valid lead
// byte.
const auto _Ch = static_cast<unsigned char>(*_First);
if (_Ch < 0b1000'0000u) {
return 1;
}
const auto _Len = static_cast<size_t>(_Last - _First);
if (_Ch < 0b1110'0000u) {
// check for non-lead byte or partial 2-byte encoded character
return (_Ch >= 0b1100'0000u && _Len >= 2) ? 2 : -1;
}
if (_Ch < 0b1111'0000u) {
// check for partial 3-byte encoded character
return (_Len >= 3) ? 3 : -1;
}
// check for partial 4-byte encoded character
return (_Len >= 4) ? 4 : -1;
}
if (*_First >= 0xDC00u) { // unpaired low surrogate
return -1;
_NODISCARD static int _Estimate_utf8_character_width(const char* const _Ptr, const int _Units) noexcept {
// Return an estimate for the width of the character composed of _Units code units,
// whose first code unit is denoted by _Ptr.
auto _Ch = static_cast<char32_t>(*_Ptr);
switch (_Units) {
default:
case 1:
case 2:
return 1;
case 3:
_Ch &= 0b1111u;
break;
case 4:
_Ch &= 0b111u;
break;
}
for (int _Idx = 1; _Idx < _Units; ++_Idx) {
_Ch = _Ch << 6 | (_Ptr[_Idx] & 0b11'1111u);
}
if (_Units == 3) {
return _Unicode_width_estimate<_Width_estimate_low_intervals>(_Ch);
}
return _Unicode_width_estimate<_Width_estimate_high_intervals>(_Ch);
}
if (++_First == _Last || *_First < 0xDC00u || *_First >= 0xE000u) { // unpaired high surrogate
return -1;
public:
_NODISCARD int _Units_in_next_character(const char* const _First, const char* const _Last) const noexcept {
// Returns a count of the number of code units that compose the first encoded character in
// [_First, _Last), or -1 if [_First, _Last) doesn't contain an entire encoded character or
// *_First is not a valid lead byte.
_STL_INTERNAL_CHECK(_First < _Last);
if constexpr (_Statically_Utf8) {
return _Utf8_code_units_in_next_character(_First, _Last);
} else {
switch (this->_Cvt._Mbcurmax) {
default:
_STL_INTERNAL_CHECK(!"Bad number of encoding units for this code page");
[[fallthrough]];
case 1:
return 1; // all characters have only one code unit
case 2:
return this->_Double_byte_encoding_code_units_in_next_character(_First, _Last);
case 4: // Assume UTF-8 (as does _Mbrtowc)
return _Utf8_code_units_in_next_character(_First, _Last);
}
}
}
return 2; // surrogate pair
}
_NODISCARD const char* _Find_encoded(const char* _First, const char* const _Last, const char _Val) const {
// Returns the first occurrence of _Val as an encoded character (and not, for example, as a
// continuation byte) in [_First, _Last).
if constexpr (_Statically_Utf8) {
return _Find_unchecked(_First, _Last, _Val);
} else {
if (this->_Cvt._Mbcurmax == 1 || this->_Cvt._Mbcurmax == 4) {
// As above and in _Mbrtowc, assume 4-byte encodings are UTF-8
return _Find_unchecked(_First, _Last, _Val);
}
while (_First != _Last && *_First != _Val) {
const int _Units = _Units_in_next_character(_First, _Last);
if (_Units < 0) {
_THROW(format_error("Invalid encoded character in format string."));
}
_First += _Units;
}
return _First;
}
}
_NODISCARD int _Estimate_width(const char* const _Ptr, const int _Units) const {
// Return an estimate for the width of the character composed of _Units code units,
// whose first code unit is denoted by _Ptr.
if constexpr (_Statically_Utf8) {
return _Estimate_utf8_character_width(_Ptr, _Units);
} else {
if (this->_Cvt._Mbcurmax != 4) {
// not a Unicode encoding; estimate width == number of code units
return _Units;
}
// assume UTF-8
return _Estimate_utf8_character_width(_Ptr, _Units);
}
}
};
template <bool _Statically_Utf8>
class _Fmt_codec<wchar_t, _Statically_Utf8> {
public:
_NODISCARD int _Units_in_next_character(const wchar_t* _First, const wchar_t* const _Last) const noexcept {
// Returns a count of the number of code units that compose the first encoded character in
// [_First, _Last), or -1 if [_First, _Last) doesn't contain an entire encoded character or
// *_First is an unpaired surrogate.
_STL_INTERNAL_CHECK(_First < _Last);
if (*_First < 0xD800u || *_First >= 0xE000u) {
return 1;
}
if (*_First >= 0xDC00u) { // unpaired low surrogate
return -1;
}
if (++_First == _Last || *_First < 0xDC00u || *_First >= 0xE000u) { // unpaired high surrogate
return -1;
}
return 2; // surrogate pair
}
_NODISCARD const wchar_t* _Find_encoded(
const wchar_t* const _First, const wchar_t* const _Last, const wchar_t _Val) const {
return _Find_unchecked(_First, _Last, _Val);
}
_NODISCARD int _Estimate_width(const wchar_t* const _Ptr, const int _Units) const {
// Return an estimate for the width of the character composed of _Units code units,
// whose first code unit is denoted by _Ptr.
auto _Ch = static_cast<char32_t>(*_Ptr);
if (_Units == 1) {
return _Unicode_width_estimate<_Width_estimate_low_intervals>(_Ch);
}
// surrogate pair
_Ch = (_Ch - 0xD800u) << 10;
_Ch += static_cast<char32_t>(_Ptr[1]) - 0xDC00u;
_Ch += 0x10000u;
return _Unicode_width_estimate<_Width_estimate_high_intervals>(_Ch);
}
};
template <class _CharT, _Parse_align_callbacks<_CharT> _Callbacks_type>
_NODISCARD const _CharT* _Parse_align(const _CharT* _Begin, const _CharT* _End, _Callbacks_type&& _Callbacks) {
@ -540,7 +680,7 @@ _NODISCARD const _CharT* _Parse_align(const _CharT* _Begin, const _CharT* _End,
_STL_INTERNAL_CHECK(_Begin != _End && *_Begin != '}');
auto _Parsed_align = _Fmt_align::_None;
const int _Units = _Code_units_in_next_character(_Begin, _End, _Getcvt());
const int _Units = _Fmt_codec<_CharT>{}._Units_in_next_character(_Begin, _End);
if (_Units < 0) { // invalid fill character encoding
_THROW(format_error("Invalid format string."));
}
@ -800,44 +940,19 @@ _NODISCARD constexpr const _CharT* _Parse_replacement_field(
return _Begin + 1;
}
template <class _CharT>
_NODISCARD const _CharT* _Find_encoded(
const _CharT* _First, const _CharT* _Last, const _CharT _Val, const _Cvtvec& _Cvt) {
// Returns the first occurrence of _Val as an encoded character (and not, for example, as a
// continuation byte) in [_First, _Last).
if constexpr (_Is_execution_charset_utf8_v) {
return _Find_unchecked(_First, _Last, _Val);
} else {
if (_Cvt._Mbcurmax == 1 || _Cvt._Mbcurmax == 4) {
// As above and in _Mbrtowc, assume 4-byte encodings are UTF-8
return _Find_unchecked(_First, _Last, _Val);
}
while (_First != _Last && *_First != _Val) {
const int _Units = _Code_units_in_next_character(_First, _Last, _Cvt);
if (_Units < 0) {
_THROW(format_error("Invalid encoded character in format string."));
}
_First += _Units;
}
return _First;
}
}
template <class _CharT, _Parse_replacement_field_callbacks<_CharT> _HandlerT>
void _Parse_format_string(basic_string_view<_CharT> _Format_str, _HandlerT&& _Handler) {
auto _Begin = _Format_str.data();
auto _End = _Begin + _Format_str.size();
const _Cvtvec& _Cvt = _Getcvt();
auto _Begin = _Format_str.data();
auto _End = _Begin + _Format_str.size();
const _Fmt_codec<_CharT> _Codec;
while (_Begin != _End) {
const _CharT* _OpeningCurl = _Begin;
if (*_Begin != '{') {
_OpeningCurl = _Find_encoded(_Begin, _End, _CharT{'{'}, _Cvt);
_OpeningCurl = _Codec._Find_encoded(_Begin, _End, _CharT{'{'});
for (;;) {
const _CharT* _ClosingCurl = _Find_encoded(_Begin, _OpeningCurl, _CharT{'}'}, _Cvt);
const _CharT* _ClosingCurl = _Codec._Find_encoded(_Begin, _OpeningCurl, _CharT{'}'});
// In this case there are neither closing nor opening curls in [_Begin, _OpeningCurl)
// Write the whole thing out.
@ -2214,95 +2329,15 @@ _NODISCARD _OutputIt _Fmt_write(
return _Fmt_write(_STD move(_Out), basic_string_view<_CharT>{_Value}, _Specs, _Locale);
}
inline constexpr char16_t _Width_estimate_low_intervals[] = { // Per N4885 [format.string.std]/11
0x1100u, 0x1160u, 0x2329u, 0x232Bu, 0x2E80u, 0x303Fu, 0x3040u, 0xA4D0u, 0xAC00u, 0xD7A4u, 0xF900u, 0xFB00u, 0xFE10u,
0xFE1Au, 0xFE30u, 0xFE70u, 0xFF00u, 0xFF61u, 0xFFE0u, 0xFFE7u};
inline constexpr char32_t _Width_estimate_high_intervals[] = { // Per N4885 [format.string.std]/11
0x1F300u, 0x1F650u, 0x1F900u, 0x1FA00u, 0x20000u, 0x2FFFEu, 0x30000u, 0x3FFFEu};
template <auto& _Bounds>
_NODISCARD constexpr int _Unicode_width_estimate(const char32_t _Ch) noexcept {
// Computes the width estimation for Unicode characters from N4885 [format.string.std]/11
int _Result = 1;
for (const auto& _Bound : _Bounds) {
if (_Ch < _Bound) {
return _Result;
}
_Result ^= 1;
}
return 1;
}
_NODISCARD inline int _Estimate_utf8_character_width(const char* const _Ptr, const int _Units) noexcept {
// Return an estimate for the width of the character composed of _Units code units,
// whose first code unit is denoted by _Ptr.
auto _Ch = static_cast<char32_t>(*_Ptr);
switch (_Units) {
default:
case 1:
case 2:
return 1;
case 3:
_Ch &= 0b1111u;
break;
case 4:
_Ch &= 0b111u;
break;
}
for (int _Idx = 1; _Idx < _Units; ++_Idx) {
_Ch = _Ch << 6 | (_Ptr[_Idx] & 0b11'1111u);
}
if (_Units == 3) {
return _Unicode_width_estimate<_Width_estimate_low_intervals>(_Ch);
}
return _Unicode_width_estimate<_Width_estimate_high_intervals>(_Ch);
}
_NODISCARD inline int _Estimate_character_width(const char* _Ptr, const int _Units, const _Cvtvec& _Cvt) {
// Return an estimate for the width of the character composed of _Units code units,
// whose first code unit is denoted by _Ptr.
if constexpr (_Is_execution_charset_utf8_v) {
return _Estimate_utf8_character_width(_Ptr, _Units);
} else {
if (_Cvt._Mbcurmax != 4) {
// not a Unicode encoding; estimate width == number of code units
return _Units;
}
// assume UTF-8
return _Estimate_utf8_character_width(_Ptr, _Units);
}
}
_NODISCARD inline int _Estimate_character_width(const wchar_t* _Ptr, const int _Units, const _Cvtvec&) {
// Return an estimate for the width of the character composed of _Units code units,
// whose first code unit is denoted by _Ptr.
auto _Ch = static_cast<char32_t>(*_Ptr);
if (_Units == 1) {
return _Unicode_width_estimate<_Width_estimate_low_intervals>(_Ch);
}
// surrogate pair
_Ch = (_Ch - 0xD8000u) << 10;
_Ch += static_cast<char32_t>(_Ptr[1]) - 0xDC00u;
_Ch += 0x10000u;
return _Unicode_width_estimate<_Width_estimate_high_intervals>(_Ch);
}
template <class _CharT>
_NODISCARD const _CharT* _Measure_string_prefix(const basic_string_view<_CharT> _Value, int& _Width) {
// Returns a pointer past-the-end of the largest prefix of _Value that fits in _Width, or all
// of _Value if _Width is negative. Updates _Width to the estimated width of that prefix.
const int _Max_width = _Width;
auto _Pos = _Value.data();
const auto _Last = _Pos + _Value.size();
int _Estimated_width = 0; // the estimated width of [_Value.data(), _Pos)
const _Cvtvec& _Cvt = _Getcvt();
const int _Max_width = _Width;
auto _Pos = _Value.data();
const auto _Last = _Pos + _Value.size();
int _Estimated_width = 0; // the estimated width of [_Value.data(), _Pos)
const _Fmt_codec<_CharT> _Codec;
constexpr auto _Max_int = (numeric_limits<int>::max)();
while (_Pos != _Last) {
@ -2312,8 +2347,8 @@ _NODISCARD const _CharT* _Measure_string_prefix(const basic_string_view<_CharT>
}
// TRANSITION, extended grapheme clustering
const int _Units = _Code_units_in_next_character(_Pos, _Last, _Cvt);
const int _Character_width = _Estimate_character_width(_Pos, _Units, _Cvt);
const int _Units = _Codec._Units_in_next_character(_Pos, _Last);
const int _Character_width = _Codec._Estimate_width(_Pos, _Units);
if (_Max_int - _Character_width < _Estimated_width) { // avoid overflow
// Either _Max_width isn't set, or adding this character will exceed it.

Просмотреть файл

@ -205,7 +205,7 @@ _BITMASK_OPS(__std_fs_file_flags)
enum class __std_fs_file_handle : intptr_t { _Invalid = -1 };
enum class __std_code_page : unsigned int { _Utf8 = 65001 };
enum class __std_code_page : unsigned int { _Acp = 0, _Utf8 = 65001 };
struct __std_fs_convert_result {
int _Len;

Просмотреть файл

@ -161,6 +161,7 @@ SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
(controlled by IncludeInLink and IncludeInImportLib). -->
<BuildFiles Include="
$(CrtRoot)\github\stl\src\filesystem.cpp;
$(CrtRoot)\github\stl\src\format.cpp;
$(CrtRoot)\github\stl\src\locale0_implib.cpp;
$(CrtRoot)\github\stl\src\nothrow.cpp;
$(CrtRoot)\github\stl\src\sharedmutex.cpp;

45
stl/src/format.cpp Normal file
Просмотреть файл

@ -0,0 +1,45 @@
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// Implements a win32 API wrapper for <format>
// This must be as small as possible, because its contents are
// injected into the msvcprt.lib and msvcprtd.lib import libraries.
// Do not include or define anything else here.
// In particular, basic_string must not be included here.
#include <xfilesystem_abi.h>
#include <xlocinfo.h>
#include <Windows.h>
static_assert(__std_code_page::_Acp == __std_code_page{CP_ACP});
extern "C" [[nodiscard]] __std_win_error __stdcall __std_get_cvt(
const __std_code_page _Codepage, _Cvtvec* const _Pcvt) noexcept {
// get conversion info for an arbitrary codepage
*_Pcvt = {};
CPINFOEXW _Info{};
const DWORD _Flags = 0; // reserved, must be zero
if (GetCPInfoExW(static_cast<UINT>(_Codepage), _Flags, &_Info) == 0) {
// NB: the only documented failure mode for GetCPInfoExW is ERROR_INVALID_PARAMETER,
// so in practice it should never fail for CP_ACP.
return __std_win_error{GetLastError()};
}
_Pcvt->_Page = _Info.CodePage;
_Pcvt->_Mbcurmax = _Info.MaxCharSize;
for (int _Idx = 0; _Idx < MAX_LEADBYTES; _Idx += 2) {
if (_Info.LeadByte[_Idx] == 0 && _Info.LeadByte[_Idx + 1] == 0) {
break;
}
for (unsigned char _First = _Info.LeadByte[_Idx], _Last = _Info.LeadByte[_Idx + 1]; _First != _Last; ++_First) {
_Pcvt->_Isleadbyte[_First >> 3] |= 1u << (_First & 0b111u);
}
}
return __std_win_error::_Success;
}

Просмотреть файл

@ -117,9 +117,10 @@ void test_parse_helper(const CharT* (*func)(const CharT*, const CharT*, callback
callback_type&& callbacks = {}) {
try {
auto end = func(view.data(), view.data() + view.size(), std::move(callbacks));
if (expected_end_position != std::basic_string_view<CharT>::npos) {
assert(end == view.data() + expected_end_position);
if (expected_end_position == std::basic_string_view<CharT>::npos) {
expected_end_position = view.size();
}
assert(end == view.data() + expected_end_position);
assert(!err_expected);
} catch (const std::format_error&) {
assert(err_expected);

Просмотреть файл

@ -975,31 +975,6 @@ void test_size() {
test_size_helper<charT>(8, STR("{:8}"), STR("scully"));
}
void test_multibyte_format_strings() {
#ifndef MSVC_INTERNAL_TESTING // TRANSITION, the Windows version on Contest VMs doesn't always understand ".UTF-8"
{
assert(setlocale(LC_ALL, ".UTF-8") != nullptr);
// Filling with footballs ("\xf0\x9f\x8f\x88" is U+1F3C8 AMERICAN FOOTBALL)
assert(format("{:\xf0\x9f\x8f\x88>4}"sv, 42) == "\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\x34\x32");
assert(format("{:\xf0\x9f\x8f\x88<4.2}", "1") == "\x31\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88"sv);
assert(format("{:\xf0\x9f\x8f\x88^4.2}", "1") == "\xf0\x9f\x8f\x88\x31\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88"sv);
assert(format("{:\xf0\x9f\x8f\x88>4.2}", "1") == "\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\x31"sv);
}
{
assert(setlocale(LC_ALL, ".UTF-8") != nullptr);
try {
(void) format("{:\x9f\x8f\x88<10}"sv, 42); // Bad fill character encoding: missing lead byte before \x9f
assert(false);
} catch (const format_error&) {
}
}
#endif // MSVC_INTERNAL_TESTING
assert(setlocale(LC_ALL, "C") != nullptr);
}
// The libfmt_ tests are derived from tests in
// libfmt, Copyright (c) 2012 - present, Victor Zverovich
// See NOTICE.txt for more information.
@ -1318,8 +1293,6 @@ void test() {
test_size<char>();
test_size<wchar_t>();
test_multibyte_format_strings();
libfmt_formatter_test_escape<char>();
libfmt_formatter_test_escape<wchar_t>();

Просмотреть файл

@ -1,6 +1,8 @@
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#define _FORMAT_CODEPAGE (__std_code_page{932})
#include <cassert>
#include <clocale>
#include <format>
@ -11,55 +13,76 @@
using namespace std;
void test_multibyte_format_strings() {
{
assert(setlocale(LC_ALL, ".932") != nullptr);
const auto s =
"\x93\xfa\x96{\x92\x6e\x90}"sv; // Note the use of `{` and `}` as continuation bytes (from GH-1576)
assert(format(s) == s);
const auto s = "\x93\xfa\x96{\x92\x6e\x90}"sv; // Note the use of `{` and `}` as continuation bytes (from GH-1576)
assert(format(s) == s);
assert(format("{:.2}", s) == "\x93\xfa"sv);
assert(format("{:4.2}", s) == "\x93\xfa "sv);
assert(format("{:.2}", s) == "\x93\xfa"sv);
assert(format("{:4.2}", s) == "\x93\xfa "sv);
assert(format("{:<4.2}", s) == "\x93\xfa "sv);
assert(format("{:^4.2}", s) == " \x93\xfa "sv);
assert(format("{:>4.2}", s) == " \x93\xfa"sv);
assert(format("{:<4.2}", s) == "\x93\xfa "sv);
assert(format("{:^4.2}", s) == " \x93\xfa "sv);
assert(format("{:>4.2}", s) == " \x93\xfa"sv);
assert(format("{:\x90}<4.2}", s) == "\x93\xfa\x90}\x90}"sv);
assert(format("{:\x90}^4.2}", s) == "\x90}\x93\xfa\x90}"sv);
assert(format("{:\x90}>4.2}", s) == "\x90}\x90}\x93\xfa"sv);
assert(format("{:\x90}<4.2}", s) == "\x93\xfa\x90}\x90}"sv);
assert(format("{:\x90}^4.2}", s) == "\x90}\x93\xfa\x90}"sv);
assert(format("{:\x90}>4.2}", s) == "\x90}\x90}\x93\xfa"sv);
assert(format("{:.3}", s) == "\x93\xfa"sv);
assert(format("{:4.3}", s) == "\x93\xfa "sv);
assert(format("{:.3}", s) == "\x93\xfa"sv);
assert(format("{:4.3}", s) == "\x93\xfa "sv);
assert(format("{:<4.3}", s) == "\x93\xfa "sv);
assert(format("{:^4.3}", s) == " \x93\xfa "sv);
assert(format("{:>4.3}", s) == " \x93\xfa"sv);
assert(format("{:<4.3}", s) == "\x93\xfa "sv);
assert(format("{:^4.3}", s) == " \x93\xfa "sv);
assert(format("{:>4.3}", s) == " \x93\xfa"sv);
assert(format("{:\x90}<4.3}", s) == "\x93\xfa\x90}\x90}"sv);
assert(format("{:\x90}^4.3}", s) == "\x90}\x93\xfa\x90}"sv);
assert(format("{:\x90}>4.3}", s) == "\x90}\x90}\x93\xfa"sv);
}
assert(setlocale(LC_ALL, "C") != nullptr);
assert(format("{:\x90}<4.3}", s) == "\x93\xfa\x90}\x90}"sv);
assert(format("{:\x90}^4.3}", s) == "\x90}\x93\xfa\x90}"sv);
assert(format("{:\x90}>4.3}", s) == "\x90}\x90}\x93\xfa"sv);
}
void test_parse_align() {
auto parse_align_fn = _Parse_align<char, testing_callbacks<char>>;
const auto parse_align_fn = _Parse_align<char, testing_callbacks<char>>;
{
assert(setlocale(LC_ALL, ".932") != nullptr);
test_parse_helper(parse_align_fn, "\x93\xfa<X"sv, false, 3,
{.expected_alignment = _Fmt_align::_Left, .expected_fill = "\x93\xfa"sv});
test_parse_helper(parse_align_fn, "\x96\x7b>X"sv, false, 3,
{.expected_alignment = _Fmt_align::_Right, .expected_fill = "\x96\x7b"sv});
test_parse_helper(parse_align_fn, "\x92\x6e^X"sv, false, 3,
{.expected_alignment = _Fmt_align::_Center, .expected_fill = "\x92\x6e"sv});
test_parse_helper(parse_align_fn, "\x93\xfa<X"sv, false, 3, //
{.expected_alignment = _Fmt_align::_Left, .expected_fill = "\x93\xfa"sv});
test_parse_helper(parse_align_fn, "\x96\x7b>X"sv, false, 3,
{.expected_alignment = _Fmt_align::_Right, .expected_fill = "\x96\x7b"sv});
test_parse_helper(parse_align_fn, "\x92\x6e^X"sv, false, 3,
{.expected_alignment = _Fmt_align::_Center, .expected_fill = "\x92\x6e"sv});
}
void test_width_estimation() {
// Format strings of known width with a trailing delimiter using a precision large enough to
// include all but the delimiter to validate the width estimation code.
struct test_case {
const char* str;
int width;
};
constexpr test_case test_cases[] = {
{"\x58", 1},
{"x\x58", 2},
// Pick "short" and "long" codepoints (\x20 and \x96\x7b), then form all permutations of
// 3-codepoint prefixes with the same fixed delimiter as above. This gives us coverage of
// all adjacent pairings (short/short, short/long, long/short, long/long).
{"\x20\x20\x20\x58", 4},
{"\x20\x20\x96\x7b\x58", 5},
{"\x20\x96\x7b\x20\x58", 5},
{"\x96\x7b\x20\x20\x58", 5},
{"\x20\x96\x7b\x96\x7b\x58", 6},
{"\x96\x7b\x20\x96\x7b\x58", 6},
{"\x96\x7b\x96\x7b\x20\x58", 6},
{"\x96\x7b\x96\x7b\x96\x7b\x58", 7},
};
for (const auto& test : test_cases) {
string_view sv{test.str};
sv = sv.substr(0, sv.size() - 1);
assert(format("{:.{}}", test.str, test.width - 1) == sv);
}
assert(setlocale(LC_ALL, "C") != nullptr);
}
int main() {
test_multibyte_format_strings();
test_parse_align();
test_width_estimation();
}

Просмотреть файл

@ -36,7 +36,7 @@ bool test_parse_align() {
// \x343E (which is from CJK unified ideographs extension A) and similar characters to parse as
// an alignment specifier.
auto s4 = L"*\x343E"sv;
test_parse_helper(parse_align_fn, s4, false, view_typ::npos, {.expected_fill = L"*"sv});
test_parse_helper(parse_align_fn, s4, false, 0, {.expected_fill = L"*"sv});
// test multi-code-unit fill characters
{
@ -47,22 +47,6 @@ bool test_parse_align() {
test_parse_helper(parse_align_fn, L"\U0001F3C8^X"sv, false, 3,
{.expected_alignment = _Fmt_align::_Center, .expected_fill = L"\U0001F3C8"sv});
}
} else {
// test multibyte fill characters
#ifndef MSVC_INTERNAL_TESTING // TRANSITION, the Windows version on Contest VMs doesn't always understand ".UTF-8"
{
assert(setlocale(LC_ALL, ".UTF-8") != nullptr);
// "\xf0\x9f\x8f\x88" is U+1F3C8 AMERICAN FOOTBALL
test_parse_helper(parse_align_fn, "\xf0\x9f\x8f\x88<X"sv, false, 5,
{.expected_alignment = _Fmt_align::_Left, .expected_fill = "\xf0\x9f\x8f\x88"sv});
test_parse_helper(parse_align_fn, "\xf0\x9f\x8f\x88>X"sv, false, 5,
{.expected_alignment = _Fmt_align::_Right, .expected_fill = "\xf0\x9f\x8f\x88"sv});
test_parse_helper(parse_align_fn, "\xf0\x9f\x8f\x88^X"sv, false, 5,
{.expected_alignment = _Fmt_align::_Center, .expected_fill = "\xf0\x9f\x8f\x88"sv});
}
#endif // MSVC_INTERNAL_TESTING
assert(setlocale(LC_ALL, "C") != nullptr);
}
return true;

Просмотреть файл

@ -12,12 +12,12 @@ using namespace std;
void test_multibyte_format_strings() {
{
// Filling with footballs ("\xf0\x9f\x8f\x88" is U+1F3C8 AMERICAN FOOTBALL)
assert(format("{:\xf0\x9f\x8f\x88>4}"sv, 42) == "\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\x34\x32");
// Filling with footballs ("\U0001f3c8" is U+1F3C8 AMERICAN FOOTBALL)
assert(format("{:\U0001f3c8>4}"sv, 42) == "\U0001f3c8\U0001f3c8\x34\x32");
assert(format("{:\xf0\x9f\x8f\x88<4.2}", "1") == "\x31\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88"sv);
assert(format("{:\xf0\x9f\x8f\x88^4.2}", "1") == "\xf0\x9f\x8f\x88\x31\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88"sv);
assert(format("{:\xf0\x9f\x8f\x88>4.2}", "1") == "\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\x31"sv);
assert(format("{:\U0001f3c8<4.2}", "1") == "\x31\U0001f3c8\U0001f3c8\U0001f3c8"sv);
assert(format("{:\U0001f3c8^4.2}", "1") == "\U0001f3c8\x31\U0001f3c8\U0001f3c8"sv);
assert(format("{:\U0001f3c8>4.2}", "1") == "\U0001f3c8\U0001f3c8\U0001f3c8\x31"sv);
}
{
@ -33,19 +33,115 @@ void test_parse_align() {
auto parse_align_fn = _Parse_align<char, testing_callbacks<char>>;
{
// "\xf0\x9f\x8f\x88" is U+1F3C8 AMERICAN FOOTBALL
test_parse_helper(parse_align_fn, "\xf0\x9f\x8f\x88<X"sv, false, 5,
{.expected_alignment = _Fmt_align::_Left, .expected_fill = "\xf0\x9f\x8f\x88"sv});
test_parse_helper(parse_align_fn, "\xf0\x9f\x8f\x88>X"sv, false, 5,
{.expected_alignment = _Fmt_align::_Right, .expected_fill = "\xf0\x9f\x8f\x88"sv});
test_parse_helper(parse_align_fn, "\xf0\x9f\x8f\x88^X"sv, false, 5,
{.expected_alignment = _Fmt_align::_Center, .expected_fill = "\xf0\x9f\x8f\x88"sv});
test_parse_helper(parse_align_fn, "\U0001f3c8<X"sv, false, 5,
{.expected_alignment = _Fmt_align::_Left, .expected_fill = "\U0001f3c8"sv});
test_parse_helper(parse_align_fn, "\U0001f3c8>X"sv, false, 5,
{.expected_alignment = _Fmt_align::_Right, .expected_fill = "\U0001f3c8"sv});
test_parse_helper(parse_align_fn, "\U0001f3c8^X"sv, false, 5,
{.expected_alignment = _Fmt_align::_Center, .expected_fill = "\U0001f3c8"sv});
}
}
template <class CharT>
void test_width_estimation() {
// Format strings of known width with a trailing delimiter using a precision large enough to
// include all but the delimiter to validate the width estimation code.
struct test_case {
const CharT* str;
int width;
};
constexpr test_case test_cases[] = {
{TYPED_LITERAL(CharT, "\x58"), 1},
{TYPED_LITERAL(CharT, "x\x58"), 2},
// test the boundaries of the intervals defined in n4885 [format.string.std]/11
{TYPED_LITERAL(CharT, "\u10ff\x58"), 2},
{TYPED_LITERAL(CharT, "\u1100\x58"), 3},
{TYPED_LITERAL(CharT, "\u115f\x58"), 3},
{TYPED_LITERAL(CharT, "\u1160\x58"), 2},
{TYPED_LITERAL(CharT, "\u2328\x58"), 2},
{TYPED_LITERAL(CharT, "\u2329\x58"), 3},
{TYPED_LITERAL(CharT, "\u232a\x58"), 3},
{TYPED_LITERAL(CharT, "\u232b\x58"), 2},
{TYPED_LITERAL(CharT, "\u2e7f\x58"), 2},
{TYPED_LITERAL(CharT, "\u2e80\x58"), 3},
{TYPED_LITERAL(CharT, "\u303e\x58"), 3},
{TYPED_LITERAL(CharT, "\u303f\x58"), 2},
{TYPED_LITERAL(CharT, "\u3040\x58"), 3},
{TYPED_LITERAL(CharT, "\ua4cf\x58"), 3},
{TYPED_LITERAL(CharT, "\ua4d0\x58"), 2},
{TYPED_LITERAL(CharT, "\uabff\x58"), 2},
{TYPED_LITERAL(CharT, "\uac00\x58"), 3},
{TYPED_LITERAL(CharT, "\ud7a3\x58"), 3},
{TYPED_LITERAL(CharT, "\ud7a4\x58"), 2},
{TYPED_LITERAL(CharT, "\ud7ff\x58"), 2},
// skip over the surrogate pair range (\ud800-\udfff)
{TYPED_LITERAL(CharT, "\ue000\x58"), 2},
{TYPED_LITERAL(CharT, "\uf8ff\x58"), 2},
{TYPED_LITERAL(CharT, "\uf900\x58"), 3},
{TYPED_LITERAL(CharT, "\ufaff\x58"), 3},
{TYPED_LITERAL(CharT, "\ufb00\x58"), 2},
{TYPED_LITERAL(CharT, "\ufe0f\x58"), 2},
{TYPED_LITERAL(CharT, "\ufe10\x58"), 3},
{TYPED_LITERAL(CharT, "\ufe19\x58"), 3},
{TYPED_LITERAL(CharT, "\ufe1a\x58"), 2},
{TYPED_LITERAL(CharT, "\ufe2f\x58"), 2},
{TYPED_LITERAL(CharT, "\ufe30\x58"), 3},
{TYPED_LITERAL(CharT, "\ufe6f\x58"), 3},
{TYPED_LITERAL(CharT, "\ufe70\x58"), 2},
{TYPED_LITERAL(CharT, "\ufeff\x58"), 2},
{TYPED_LITERAL(CharT, "\uff00\x58"), 3},
{TYPED_LITERAL(CharT, "\uff60\x58"), 3},
{TYPED_LITERAL(CharT, "\uff61\x58"), 2},
{TYPED_LITERAL(CharT, "\uffdf\x58"), 2},
{TYPED_LITERAL(CharT, "\uffe0\x58"), 3},
{TYPED_LITERAL(CharT, "\uffe6\x58"), 3},
{TYPED_LITERAL(CharT, "\uffe7\x58"), 2},
{TYPED_LITERAL(CharT, "\U0001f2ff\x58"), 2},
{TYPED_LITERAL(CharT, "\U0001f300\x58"), 3},
{TYPED_LITERAL(CharT, "\U0001f64f\x58"), 3},
{TYPED_LITERAL(CharT, "\U0001f650\x58"), 2},
{TYPED_LITERAL(CharT, "\U0001f8ff\x58"), 2},
{TYPED_LITERAL(CharT, "\U0001f900\x58"), 3},
{TYPED_LITERAL(CharT, "\U0001f9ff\x58"), 3},
{TYPED_LITERAL(CharT, "\U0001fa00\x58"), 2},
{TYPED_LITERAL(CharT, "\U0001ffff\x58"), 2},
{TYPED_LITERAL(CharT, "\U00020000\x58"), 3},
{TYPED_LITERAL(CharT, "\U0002fffd\x58"), 3},
{TYPED_LITERAL(CharT, "\U0002fffe\x58"), 2},
{TYPED_LITERAL(CharT, "\U0002ffff\x58"), 2},
{TYPED_LITERAL(CharT, "\U00030000\x58"), 3},
{TYPED_LITERAL(CharT, "\U0003fffd\x58"), 3},
{TYPED_LITERAL(CharT, "\U0003fffe\x58"), 2},
{TYPED_LITERAL(CharT, "\U0010ffff\x58"), 2},
// Pick "short" and "long" codepoints (\u2000 and \ufe40), then form all permutations of
// 3-codepoint prefixes with the same fixed delimiter as above. This gives us coverage of
// all adjacent pairings (short/short, short/long, long/short, long/long).
{TYPED_LITERAL(CharT, "\u2000\u2000\u2000\x58"), 4},
{TYPED_LITERAL(CharT, "\u2000\u2000\ufe40\x58"), 5},
{TYPED_LITERAL(CharT, "\u2000\ufe40\u2000\x58"), 5},
{TYPED_LITERAL(CharT, "\ufe40\u2000\u2000\x58"), 5},
{TYPED_LITERAL(CharT, "\u2000\ufe40\ufe40\x58"), 6},
{TYPED_LITERAL(CharT, "\ufe40\u2000\ufe40\x58"), 6},
{TYPED_LITERAL(CharT, "\ufe40\ufe40\u2000\x58"), 6},
{TYPED_LITERAL(CharT, "\ufe40\ufe40\ufe40\x58"), 7},
};
for (const auto& test : test_cases) {
basic_string_view sv{test.str};
sv = sv.substr(0, sv.size() - 1);
assert(format(TYPED_LITERAL(CharT, "{:.{}}"), test.str, test.width - 1) == sv);
}
}
void run_tests() {
test_multibyte_format_strings();
test_parse_align();
test_width_estimation<char>();
test_width_estimation<wchar_t>();
}
int main() {