`<format>` assumes strings are encoded in the active code page (#1834)

2021-04-20 00:21:40 -07:00 · 2021-04-20 00:21:40 -07:00 · ccc5aaaadc
--- a/stl/CMakeLists.txt
+++ b/stl/CMakeLists.txt
@ -249,6 +249,7 @@ endforeach()
 # Objs that exist in both libcpmt[d][01].lib and msvcprt[d].lib.
 set(IMPLIB_SOURCES
    ${CMAKE_CURRENT_LIST_DIR}/src/filesystem.cpp
+    ${CMAKE_CURRENT_LIST_DIR}/src/format.cpp
    ${CMAKE_CURRENT_LIST_DIR}/src/locale0_implib.cpp
    ${CMAKE_CURRENT_LIST_DIR}/src/nothrow.cpp
    ${CMAKE_CURRENT_LIST_DIR}/src/sharedmutex.cpp
--- a/stl/inc/format
+++ b/stl/inc/format
@ -53,6 +53,7 @@
 #include <stdexcept>
 #include <string>
 #include <string_view>
+#include <xfilesystem_abi.h>
 #include <xutility>

 #pragma pack(push, _CRT_PACKING)
@ -62,6 +63,8 @@ _STL_DISABLE_CLANG_WARNINGS
 #pragma push_macro("new")
 #undef new

+extern "C" _NODISCARD __std_win_error __stdcall __std_get_cvt(__std_code_page _Codepage, _Cvtvec* _Pcvt) noexcept;
+
 _STD_BEGIN

 class format_error : public runtime_error {
@ -444,95 +447,232 @@ _NODISCARD constexpr bool _Is_execution_charset_utf8() {
 #pragma warning(pop)
 }

-inline constexpr bool _Is_execution_charset_utf8_v = _Is_execution_charset_utf8();
+inline constexpr char16_t _Width_estimate_low_intervals[] = { // Per N4885 [format.string.std]/11
+    0x1100u, 0x1160u, 0x2329u, 0x232Bu, 0x2E80u, 0x303Fu, 0x3040u, 0xA4D0u, 0xAC00u, 0xD7A4u, 0xF900u, 0xFB00u, 0xFE10u,
+    0xFE1Au, 0xFE30u, 0xFE70u, 0xFF00u, 0xFF61u, 0xFFE0u, 0xFFE7u};

-_NODISCARD constexpr int _Utf8_code_units_in_next_character(
-    const char* const _First, const char* const _Last) noexcept {
-    // Returns a count of the number of UTF-8 code units that compose the first encoded character in [_First, _Last),
-    // or -1 if [_First, _Last) doesn't contain an entire encoded character or *_First is not a valid lead byte.
-    const auto _Ch = static_cast<unsigned char>(*_First);
-    if (_Ch < 0b1000'0000u) {
-        return 1;
+inline constexpr char32_t _Width_estimate_high_intervals[] = { // Per N4885 [format.string.std]/11
+    0x1F300u, 0x1F650u, 0x1F900u, 0x1FA00u, 0x20000u, 0x2FFFEu, 0x30000u, 0x3FFFEu};
+
+template <auto& _Bounds>
+_NODISCARD constexpr int _Unicode_width_estimate(const char32_t _Ch) noexcept {
+    // Computes the width estimation for Unicode characters from N4885 [format.string.std]/11
+    int _Result = 1;
+    for (const auto& _Bound : _Bounds) {
+        if (_Ch < _Bound) {
+            return _Result;
+        }
+        _Result ^= 0b11u; // Flip between 1 and 2 on each iteration
    }

-    const auto _Len = static_cast<size_t>(_Last - _First);
-
-    if (_Ch < 0b1110'0000u) {
-        // check for non-lead byte or partial 2-byte encoded character
-        return (_Ch >= 0b1100'0000u && _Len >= 2) ? 2 : -1;
-    }
-
-    if (_Ch < 0b1111'0000u) {
-        // check for partial 3-byte encoded character
-        return (_Len >= 3) ? 3 : -1;
-    }
-
-    // check for partial 4-byte encoded character
-    return (_Len >= 4) ? 4 : -1;
+    return 1;
 }

-_NODISCARD inline int _Double_byte_encoding_code_units_in_next_character(
-    const char* const _First, const char* const _Last, const _Cvtvec& _Cvt) {
-    // Returns a count of the number of code units that compose the first encoded character in [_First, _Last),
-    // or -1 if [_First, _Last) doesn't contain an entire encoded character or *_First is not a valid lead byte.
-    wchar_t _Wide;
-    mbstate_t _St{};
-    const auto _Len   = static_cast<size_t>(_Last - _First);
-    const int _Result = _Mbrtowc(&_Wide, _First, _Len, &_St, &_Cvt);
-    if (_Result > 0) {
-        return _Result;
-    } else if (_Result < 0) { // invalid or incomplete encoded character
-        return -1;
-    } else { // next code unit is '\0'
-        return 1;
-    }
-}
+template <class _CharT, bool _Statically_Utf8 = _Is_execution_charset_utf8()>
+class _Fmt_codec;

-_NODISCARD inline int _Code_units_in_next_character(const char* _First, const char* _Last, const _Cvtvec& _Cvt) {
-    // Returns a count of the number of code units that compose the first encoded character in
-    // [_First, _Last), or -1 if [_First, _Last) doesn't contain an entire encoded character or
-    // *_First is not a valid lead byte.
-    _STL_INTERNAL_CHECK(_First < _Last);
+template <bool _Statically_Utf8>
+class _Fmt_codec_base {};

-    if constexpr (_Is_execution_charset_utf8_v) {
-        return _Utf8_code_units_in_next_character(_First, _Last);
-    } else {
-        switch (_Cvt._Mbcurmax) {
-        default:
-            _STL_INTERNAL_CHECK(!"Bad number of encoding units for this code page");
-            [[fallthrough]];
-        case 1:
-            return 1; // all characters have only one code unit
+template <>
+class _Fmt_codec_base<false> {
+protected:
+    _Cvtvec _Cvt;

-        case 2:
-            return _Double_byte_encoding_code_units_in_next_character(_First, _Last, _Cvt);
-
-        case 4: // Assume UTF-8 (as does _Mbrtowc)
-            return _Utf8_code_units_in_next_character(_First, _Last);
+    _NODISCARD int _Double_byte_encoding_code_units_in_next_character(
+        const char* const _First, const char* const _Last) const {
+        // Returns a count of the number of code units that compose the first encoded character in [_First, _Last),
+        // or -1 if [_First, _Last) doesn't contain an entire encoded character or *_First is not a valid lead byte.
+        wchar_t _Wide;
+        mbstate_t _St{};
+        const auto _Len   = static_cast<size_t>(_Last - _First);
+        const int _Result = _Mbrtowc(&_Wide, _First, _Len, &_St, &_Cvt);
+        if (_Result > 0) {
+            return _Result;
+        } else if (_Result < 0) { // invalid or incomplete encoded character
+            return -1;
+        } else { // next code unit is '\0'
+            return 1;
        }
    }
-}

-_NODISCARD inline int _Code_units_in_next_character(const wchar_t* _First, const wchar_t* _Last, const _Cvtvec&) {
-    // Returns a count of the number of code units that compose the first encoded character in
-    // [_First, _Last), or -1 if [_First, _Last) doesn't contain an entire encoded character or
-    // *_First is an unpaired surrogate.
-    _STL_INTERNAL_CHECK(_First < _Last);
+    _Fmt_codec_base() {
+#ifndef _FORMAT_CODEPAGE
+#define _FORMAT_CODEPAGE __std_code_page::_Acp
+#endif // _FORMAT_CODEPAGE
+        [[maybe_unused]] const __std_win_error _Result = __std_get_cvt(_FORMAT_CODEPAGE, &_Cvt);
+        _STL_INTERNAL_CHECK(_Result == __std_win_error::_Success);
+#undef _FORMAT_CODEPAGE
+    }
+};

-    if (*_First < 0xD800u || *_First >= 0xE000u) {
-        return 1;
+template <bool _Statically_Utf8>
+class _Fmt_codec<char, _Statically_Utf8> : private _Fmt_codec_base<_Statically_Utf8> {
+private:
+    _NODISCARD static constexpr int _Utf8_code_units_in_next_character(
+        const char* const _First, const char* const _Last) noexcept {
+        // Returns a count of the number of UTF-8 code units that compose the first encoded character in [_First,
+        // _Last), or -1 if [_First, _Last) doesn't contain an entire encoded character or *_First is not a valid lead
+        // byte.
+        const auto _Ch = static_cast<unsigned char>(*_First);
+        if (_Ch < 0b1000'0000u) {
+            return 1;
+        }
+
+        const auto _Len = static_cast<size_t>(_Last - _First);
+
+        if (_Ch < 0b1110'0000u) {
+            // check for non-lead byte or partial 2-byte encoded character
+            return (_Ch >= 0b1100'0000u && _Len >= 2) ? 2 : -1;
+        }
+
+        if (_Ch < 0b1111'0000u) {
+            // check for partial 3-byte encoded character
+            return (_Len >= 3) ? 3 : -1;
+        }
+
+        // check for partial 4-byte encoded character
+        return (_Len >= 4) ? 4 : -1;
    }

-    if (*_First >= 0xDC00u) { // unpaired low surrogate
-        return -1;
+    _NODISCARD static int _Estimate_utf8_character_width(const char* const _Ptr, const int _Units) noexcept {
+        // Return an estimate for the width of the character composed of _Units code units,
+        // whose first code unit is denoted by _Ptr.
+        auto _Ch = static_cast<char32_t>(*_Ptr);
+        switch (_Units) {
+        default:
+        case 1:
+        case 2:
+            return 1;
+        case 3:
+            _Ch &= 0b1111u;
+            break;
+        case 4:
+            _Ch &= 0b111u;
+            break;
+        }
+
+        for (int _Idx = 1; _Idx < _Units; ++_Idx) {
+            _Ch = _Ch << 6 | (_Ptr[_Idx] & 0b11'1111u);
+        }
+
+        if (_Units == 3) {
+            return _Unicode_width_estimate<_Width_estimate_low_intervals>(_Ch);
+        }
+
+        return _Unicode_width_estimate<_Width_estimate_high_intervals>(_Ch);
    }

-    if (++_First == _Last || *_First < 0xDC00u || *_First >= 0xE000u) { // unpaired high surrogate
-        return -1;
+public:
+    _NODISCARD int _Units_in_next_character(const char* const _First, const char* const _Last) const noexcept {
+        // Returns a count of the number of code units that compose the first encoded character in
+        // [_First, _Last), or -1 if [_First, _Last) doesn't contain an entire encoded character or
+        // *_First is not a valid lead byte.
+        _STL_INTERNAL_CHECK(_First < _Last);
+
+        if constexpr (_Statically_Utf8) {
+            return _Utf8_code_units_in_next_character(_First, _Last);
+        } else {
+            switch (this->_Cvt._Mbcurmax) {
+            default:
+                _STL_INTERNAL_CHECK(!"Bad number of encoding units for this code page");
+                [[fallthrough]];
+            case 1:
+                return 1; // all characters have only one code unit
+
+            case 2:
+                return this->_Double_byte_encoding_code_units_in_next_character(_First, _Last);
+
+            case 4: // Assume UTF-8 (as does _Mbrtowc)
+                return _Utf8_code_units_in_next_character(_First, _Last);
+            }
+        }
    }

-    return 2; // surrogate pair
-}
+    _NODISCARD const char* _Find_encoded(const char* _First, const char* const _Last, const char _Val) const {
+        // Finds the first position in [_First, _Last) at which _Val occurs as an encoded character of its
+        // own (and not, for example, as a continuation byte of a longer character).
+        if constexpr (!_Statically_Utf8) {
+            const auto _Max_units = this->_Cvt._Mbcurmax;
+            if (_Max_units != 1 && _Max_units != 4) {
+                // Walk character by character so that trailing code units are never compared against _Val.
+                while (_First != _Last && *_First != _Val) {
+                    const int _Step = _Units_in_next_character(_First, _Last);
+                    if (_Step < 0) {
+                        _THROW(format_error("Invalid encoded character in format string."));
+                    }
+                    _First += _Step;
+                }
+
+                return _First;
+            }
+        }
+
+        // Statically UTF-8, single-byte, or 4-byte (assumed UTF-8, as in _Mbrtowc): delegate to _Find_unchecked.
+        return _Find_unchecked(_First, _Last, _Val);
+    }
+
+    _NODISCARD int _Estimate_width(const char* const _Ptr, const int _Units) const {
+        // Estimates the display width of the character whose first code unit is *_Ptr and which
+        // occupies _Units code units.
+        if constexpr (!_Statically_Utf8) {
+            // The encoding is only known at run time; _Mbcurmax == 4 is taken to mean UTF-8.
+            const bool _Assume_utf8 = this->_Cvt._Mbcurmax == 4;
+            if (!_Assume_utf8) {
+                // not a Unicode encoding; estimate width == number of code units
+                return _Units;
+            }
+        }
+
+        // Statically UTF-8, or assumed to be UTF-8 from the conversion info.
+        return _Estimate_utf8_character_width(_Ptr, _Units);
+    }
+};
+
+// Codec for wchar_t strings. The members below interpret code units as UTF-16 (surrogate pairs), so no
+// code page information is consulted and the class carries no state.
+template <bool _Statically_Utf8>
+class _Fmt_codec<wchar_t, _Statically_Utf8> {
+public:
+    _NODISCARD int _Units_in_next_character(const wchar_t* _First, const wchar_t* const _Last) const noexcept {
+        // Counts the code units forming the first encoded character of [_First, _Last): 1 for a
+        // non-surrogate, 2 for a well-formed surrogate pair. Yields -1 when the range begins with an unpaired
+        // surrogate (a lone low surrogate, or a high surrogate not followed by a low surrogate).
+        _STL_INTERNAL_CHECK(_First < _Last);
+
+        const wchar_t _Lead = *_First;
+        if (_Lead < 0xD800u || _Lead >= 0xE000u) { // not a surrogate
+            return 1;
+        }
+
+        if (_Lead >= 0xDC00u || ++_First == _Last) { // low surrogate first, or high surrogate at end of range
+            return -1;
+        }
+
+        const wchar_t _Trail = *_First;
+        return (_Trail >= 0xDC00u && _Trail < 0xE000u) ? 2 : -1; // 2 only when a low surrogate follows
+    }
+
+    _NODISCARD const wchar_t* _Find_encoded(
+        const wchar_t* const _First, const wchar_t* const _Last, const wchar_t _Val) const {
+        // Forwards to _Find_unchecked over [_First, _Last); no per-character decoding is performed here.
+        return _Find_unchecked(_First, _Last, _Val);
+    }
+
+    _NODISCARD int _Estimate_width(const wchar_t* const _Ptr, const int _Units) const {
+        // Estimates the display width of the character whose first code unit is *_Ptr and which
+        // occupies _Units code units.
+        const auto _Lead = static_cast<char32_t>(*_Ptr);
+        if (_Units != 1) { // treated as a surrogate pair: rebuild the code point from both halves
+            const auto _Trail = static_cast<char32_t>(_Ptr[1]);
+            const char32_t _Code_point = ((_Lead - 0xD800u) << 10) + (_Trail - 0xDC00u) + 0x10000u;
+            return _Unicode_width_estimate<_Width_estimate_high_intervals>(_Code_point);
+        }
+
+        // a single code unit is looked up in the low (char16_t) interval table
+        return _Unicode_width_estimate<_Width_estimate_low_intervals>(_Lead);
+    }
+};

 template <class _CharT, _Parse_align_callbacks<_CharT> _Callbacks_type>
 _NODISCARD const _CharT* _Parse_align(const _CharT* _Begin, const _CharT* _End, _Callbacks_type&& _Callbacks) {
@ -540,7 +680,7 @@ _NODISCARD const _CharT* _Parse_align(const _CharT* _Begin, const _CharT* _End,
    _STL_INTERNAL_CHECK(_Begin != _End && *_Begin != '}');
    auto _Parsed_align = _Fmt_align::_None;

-    const int _Units = _Code_units_in_next_character(_Begin, _End, _Getcvt());
+    const int _Units = _Fmt_codec<_CharT>{}._Units_in_next_character(_Begin, _End);
    if (_Units < 0) { // invalid fill character encoding
        _THROW(format_error("Invalid format string."));
    }
@ -800,44 +940,19 @@ _NODISCARD constexpr const _CharT* _Parse_replacement_field(
    return _Begin + 1;
 }

-template <class _CharT>
-_NODISCARD const _CharT* _Find_encoded(
-    const _CharT* _First, const _CharT* _Last, const _CharT _Val, const _Cvtvec& _Cvt) {
-    // Returns the first occurrence of _Val as an encoded character (and not, for example, as a
-    // continuation byte) in [_First, _Last).
-    if constexpr (_Is_execution_charset_utf8_v) {
-        return _Find_unchecked(_First, _Last, _Val);
-    } else {
-        if (_Cvt._Mbcurmax == 1 || _Cvt._Mbcurmax == 4) {
-            // As above and in _Mbrtowc, assume 4-byte encodings are UTF-8
-            return _Find_unchecked(_First, _Last, _Val);
-        }
-
-        while (_First != _Last && *_First != _Val) {
-            const int _Units = _Code_units_in_next_character(_First, _Last, _Cvt);
-            if (_Units < 0) {
-                _THROW(format_error("Invalid encoded character in format string."));
-            }
-            _First += _Units;
-        }
-
-        return _First;
-    }
-}
-
 template <class _CharT, _Parse_replacement_field_callbacks<_CharT> _HandlerT>
 void _Parse_format_string(basic_string_view<_CharT> _Format_str, _HandlerT&& _Handler) {
-    auto _Begin         = _Format_str.data();
-    auto _End           = _Begin + _Format_str.size();
-    const _Cvtvec& _Cvt = _Getcvt();
+    auto _Begin = _Format_str.data();
+    auto _End   = _Begin + _Format_str.size();
+    const _Fmt_codec<_CharT> _Codec;

    while (_Begin != _End) {
        const _CharT* _OpeningCurl = _Begin;
        if (*_Begin != '{') {
-            _OpeningCurl = _Find_encoded(_Begin, _End, _CharT{'{'}, _Cvt);
+            _OpeningCurl = _Codec._Find_encoded(_Begin, _End, _CharT{'{'});

            for (;;) {
-                const _CharT* _ClosingCurl = _Find_encoded(_Begin, _OpeningCurl, _CharT{'}'}, _Cvt);
+                const _CharT* _ClosingCurl = _Codec._Find_encoded(_Begin, _OpeningCurl, _CharT{'}'});

                // In this case there are neither closing nor opening curls in [_Begin, _OpeningCurl)
                // Write the whole thing out.
@ -2214,95 +2329,15 @@ _NODISCARD _OutputIt _Fmt_write(
    return _Fmt_write(_STD move(_Out), basic_string_view<_CharT>{_Value}, _Specs, _Locale);
 }

-inline constexpr char16_t _Width_estimate_low_intervals[] = { // Per N4885 [format.string.std]/11
-    0x1100u, 0x1160u, 0x2329u, 0x232Bu, 0x2E80u, 0x303Fu, 0x3040u, 0xA4D0u, 0xAC00u, 0xD7A4u, 0xF900u, 0xFB00u, 0xFE10u,
-    0xFE1Au, 0xFE30u, 0xFE70u, 0xFF00u, 0xFF61u, 0xFFE0u, 0xFFE7u};
-
-inline constexpr char32_t _Width_estimate_high_intervals[] = { // Per N4885 [format.string.std]/11
-    0x1F300u, 0x1F650u, 0x1F900u, 0x1FA00u, 0x20000u, 0x2FFFEu, 0x30000u, 0x3FFFEu};
-
-template <auto& _Bounds>
-_NODISCARD constexpr int _Unicode_width_estimate(const char32_t _Ch) noexcept {
-    // Computes the width estimation for Unicode characters from N4885 [format.string.std]/11
-    int _Result = 1;
-    for (const auto& _Bound : _Bounds) {
-        if (_Ch < _Bound) {
-            return _Result;
-        }
-        _Result ^= 1;
-    }
-
-    return 1;
-}
-
-_NODISCARD inline int _Estimate_utf8_character_width(const char* const _Ptr, const int _Units) noexcept {
-    // Return an estimate for the width of the character composed of _Units code units,
-    // whose first code unit is denoted by _Ptr.
-    auto _Ch = static_cast<char32_t>(*_Ptr);
-    switch (_Units) {
-    default:
-    case 1:
-    case 2:
-        return 1;
-    case 3:
-        _Ch &= 0b1111u;
-        break;
-    case 4:
-        _Ch &= 0b111u;
-        break;
-    }
-
-    for (int _Idx = 1; _Idx < _Units; ++_Idx) {
-        _Ch = _Ch << 6 | (_Ptr[_Idx] & 0b11'1111u);
-    }
-
-    if (_Units == 3) {
-        return _Unicode_width_estimate<_Width_estimate_low_intervals>(_Ch);
-    }
-
-    return _Unicode_width_estimate<_Width_estimate_high_intervals>(_Ch);
-}
-
-_NODISCARD inline int _Estimate_character_width(const char* _Ptr, const int _Units, const _Cvtvec& _Cvt) {
-    // Return an estimate for the width of the character composed of _Units code units,
-    // whose first code unit is denoted by _Ptr.
-    if constexpr (_Is_execution_charset_utf8_v) {
-        return _Estimate_utf8_character_width(_Ptr, _Units);
-    } else {
-        if (_Cvt._Mbcurmax != 4) {
-            // not a Unicode encoding; estimate width == number of code units
-            return _Units;
-        }
-
-        // assume UTF-8
-        return _Estimate_utf8_character_width(_Ptr, _Units);
-    }
-}
-
-_NODISCARD inline int _Estimate_character_width(const wchar_t* _Ptr, const int _Units, const _Cvtvec&) {
-    // Return an estimate for the width of the character composed of _Units code units,
-    // whose first code unit is denoted by _Ptr.
-    auto _Ch = static_cast<char32_t>(*_Ptr);
-    if (_Units == 1) {
-        return _Unicode_width_estimate<_Width_estimate_low_intervals>(_Ch);
-    }
-
-    // surrogate pair
-    _Ch = (_Ch - 0xD8000u) << 10;
-    _Ch += static_cast<char32_t>(_Ptr[1]) - 0xDC00u;
-    _Ch += 0x10000u;
-    return _Unicode_width_estimate<_Width_estimate_high_intervals>(_Ch);
-}
-
 template <class _CharT>
 _NODISCARD const _CharT* _Measure_string_prefix(const basic_string_view<_CharT> _Value, int& _Width) {
    // Returns a pointer past-the-end of the largest prefix of _Value that fits in _Width, or all
    // of _Value if _Width is negative. Updates _Width to the estimated width of that prefix.
-    const int _Max_width    = _Width;
-    auto _Pos               = _Value.data();
-    const auto _Last        = _Pos + _Value.size();
-    int _Estimated_width    = 0; // the estimated width of [_Value.data(), _Pos)
-    const _Cvtvec& _Cvt     = _Getcvt();
+    const int _Max_width = _Width;
+    auto _Pos            = _Value.data();
+    const auto _Last     = _Pos + _Value.size();
+    int _Estimated_width = 0; // the estimated width of [_Value.data(), _Pos)
+    const _Fmt_codec<_CharT> _Codec;
    constexpr auto _Max_int = (numeric_limits<int>::max)();

    while (_Pos != _Last) {
@ -2312,8 +2347,8 @@ _NODISCARD const _CharT* _Measure_string_prefix(const basic_string_view<_CharT>
        }

        // TRANSITION, extended grapheme clustering
-        const int _Units           = _Code_units_in_next_character(_Pos, _Last, _Cvt);
-        const int _Character_width = _Estimate_character_width(_Pos, _Units, _Cvt);
+        const int _Units           = _Codec._Units_in_next_character(_Pos, _Last);
+        const int _Character_width = _Codec._Estimate_width(_Pos, _Units);

        if (_Max_int - _Character_width < _Estimated_width) { // avoid overflow
            // Either _Max_width isn't set, or adding this character will exceed it.
--- a/stl/inc/xfilesystem_abi.h
+++ b/stl/inc/xfilesystem_abi.h
@ -205,7 +205,7 @@ _BITMASK_OPS(__std_fs_file_flags)

 enum class __std_fs_file_handle : intptr_t { _Invalid = -1 };

-enum class __std_code_page : unsigned int { _Utf8 = 65001 };
+enum class __std_code_page : unsigned int { _Acp = 0, _Utf8 = 65001 };

 struct __std_fs_convert_result {
    int _Len;
--- a/stl/msbuild/stl_base/stl.files.settings.targets
+++ b/stl/msbuild/stl_base/stl.files.settings.targets
@ -161,6 +161,7 @@ SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
             (controlled by IncludeInLink and IncludeInImportLib). -->
        <BuildFiles Include="
            $(CrtRoot)\github\stl\src\filesystem.cpp;
+            $(CrtRoot)\github\stl\src\format.cpp;
            $(CrtRoot)\github\stl\src\locale0_implib.cpp;
            $(CrtRoot)\github\stl\src\nothrow.cpp;
            $(CrtRoot)\github\stl\src\sharedmutex.cpp;
--- a/stl/src/format.cpp
+++ b/stl/src/format.cpp
@ -0,0 +1,45 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Implements a win32 API wrapper for <format>
+
+// This must be as small as possible, because its contents are
+// injected into the msvcprt.lib and msvcprtd.lib import libraries.
+// Do not include or define anything else here.
+// In particular, basic_string must not be included here.
+
+#include <xfilesystem_abi.h>
+#include <xlocinfo.h>
+
+#include <Windows.h>
+
+static_assert(__std_code_page::_Acp == __std_code_page{CP_ACP});
+
+extern "C" [[nodiscard]] __std_win_error __stdcall __std_get_cvt(
+    const __std_code_page _Codepage, _Cvtvec* const _Pcvt) noexcept {
+    // Fills *_Pcvt with conversion info for _Codepage; on failure returns the Win32 error and leaves *_Pcvt zeroed.
+    *_Pcvt = {};
+
+    CPINFOEXW _Info{};
+    const DWORD _Flags = 0; // reserved, must be zero
+    if (GetCPInfoExW(static_cast<UINT>(_Codepage), _Flags, &_Info) == 0) {
+        // NB: the only documented failure mode for GetCPInfoExW is ERROR_INVALID_PARAMETER,
+        // so in practice it should never fail for CP_ACP.
+        return __std_win_error{GetLastError()};
+    }
+
+    _Pcvt->_Page     = _Info.CodePage;
+    _Pcvt->_Mbcurmax = _Info.MaxCharSize;
+
+    for (int _Idx = 0; _Idx < MAX_LEADBYTES; _Idx += 2) { // LeadByte holds (first, last) pairs, ended by (0, 0)
+        if (_Info.LeadByte[_Idx] == 0 && _Info.LeadByte[_Idx + 1] == 0) {
+            break;
+        }
+        // Ranges are inclusive of their last byte; an int counter keeps `<=` terminating when that byte is 0xFF.
+        for (int _Byte = _Info.LeadByte[_Idx]; _Byte <= _Info.LeadByte[_Idx + 1]; ++_Byte) {
+            _Pcvt->_Isleadbyte[_Byte >> 3] |= 1u << (_Byte & 0b111u);
+        }
+    }
+
+    return __std_win_error::_Success;
+}
--- a/tests/std/include/test_format_support.hpp
+++ b/tests/std/include/test_format_support.hpp
@ -117,9 +117,10 @@ void test_parse_helper(const CharT* (*func)(const CharT*, const CharT*, callback
    callback_type&& callbacks                                               = {}) {
    try {
        auto end = func(view.data(), view.data() + view.size(), std::move(callbacks));
-        if (expected_end_position != std::basic_string_view<CharT>::npos) {
-            assert(end == view.data() + expected_end_position);
+        if (expected_end_position == std::basic_string_view<CharT>::npos) {
+            expected_end_position = view.size();
        }
+        assert(end == view.data() + expected_end_position);
        assert(!err_expected);
    } catch (const std::format_error&) {
        assert(err_expected);
--- a/tests/std/tests/P0645R10_text_formatting_formatting/test.cpp
+++ b/tests/std/tests/P0645R10_text_formatting_formatting/test.cpp
@ -975,31 +975,6 @@ void test_size() {
    test_size_helper<charT>(8, STR("{:8}"), STR("scully"));
 }

-void test_multibyte_format_strings() {
-#ifndef MSVC_INTERNAL_TESTING // TRANSITION, the Windows version on Contest VMs doesn't always understand ".UTF-8"
-    {
-        assert(setlocale(LC_ALL, ".UTF-8") != nullptr);
-        // Filling with footballs ("\xf0\x9f\x8f\x88" is U+1F3C8 AMERICAN FOOTBALL)
-        assert(format("{:\xf0\x9f\x8f\x88>4}"sv, 42) == "\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\x34\x32");
-
-        assert(format("{:\xf0\x9f\x8f\x88<4.2}", "1") == "\x31\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88"sv);
-        assert(format("{:\xf0\x9f\x8f\x88^4.2}", "1") == "\xf0\x9f\x8f\x88\x31\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88"sv);
-        assert(format("{:\xf0\x9f\x8f\x88>4.2}", "1") == "\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\x31"sv);
-    }
-
-    {
-        assert(setlocale(LC_ALL, ".UTF-8") != nullptr);
-        try {
-            (void) format("{:\x9f\x8f\x88<10}"sv, 42); // Bad fill character encoding: missing lead byte before \x9f
-            assert(false);
-        } catch (const format_error&) {
-        }
-    }
-#endif // MSVC_INTERNAL_TESTING
-
-    assert(setlocale(LC_ALL, "C") != nullptr);
-}
-
 // The libfmt_ tests are derived from tests in
 // libfmt, Copyright (c) 2012 - present, Victor Zverovich
 // See NOTICE.txt for more information.
@ -1318,8 +1293,6 @@ void test() {
    test_size<char>();
    test_size<wchar_t>();

-    test_multibyte_format_strings();
-
    libfmt_formatter_test_escape<char>();
    libfmt_formatter_test_escape<wchar_t>();

--- a/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/test.cpp
+++ b/tests/std/tests/P0645R10_text_formatting_legacy_text_encoding/test.cpp
@ -1,6 +1,8 @@
 // Copyright (c) Microsoft Corporation.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+#define _FORMAT_CODEPAGE (__std_code_page{932})
+
 #include <cassert>
 #include <clocale>
 #include <format>
@ -11,55 +13,76 @@
 using namespace std;

 void test_multibyte_format_strings() {
-    {
-        assert(setlocale(LC_ALL, ".932") != nullptr);
-        const auto s =
-            "\x93\xfa\x96{\x92\x6e\x90}"sv; // Note the use of `{` and `}` as continuation bytes (from GH-1576)
-        assert(format(s) == s);
+    const auto s = "\x93\xfa\x96{\x92\x6e\x90}"sv; // Note the use of `{` and `}` as continuation bytes (from GH-1576)
+    assert(format(s) == s);

-        assert(format("{:.2}", s) == "\x93\xfa"sv);
-        assert(format("{:4.2}", s) == "\x93\xfa  "sv);
+    assert(format("{:.2}", s) == "\x93\xfa"sv);
+    assert(format("{:4.2}", s) == "\x93\xfa  "sv);

-        assert(format("{:<4.2}", s) == "\x93\xfa  "sv);
-        assert(format("{:^4.2}", s) == " \x93\xfa "sv);
-        assert(format("{:>4.2}", s) == "  \x93\xfa"sv);
+    assert(format("{:<4.2}", s) == "\x93\xfa  "sv);
+    assert(format("{:^4.2}", s) == " \x93\xfa "sv);
+    assert(format("{:>4.2}", s) == "  \x93\xfa"sv);

-        assert(format("{:\x90}<4.2}", s) == "\x93\xfa\x90}\x90}"sv);
-        assert(format("{:\x90}^4.2}", s) == "\x90}\x93\xfa\x90}"sv);
-        assert(format("{:\x90}>4.2}", s) == "\x90}\x90}\x93\xfa"sv);
+    assert(format("{:\x90}<4.2}", s) == "\x93\xfa\x90}\x90}"sv);
+    assert(format("{:\x90}^4.2}", s) == "\x90}\x93\xfa\x90}"sv);
+    assert(format("{:\x90}>4.2}", s) == "\x90}\x90}\x93\xfa"sv);

-        assert(format("{:.3}", s) == "\x93\xfa"sv);
-        assert(format("{:4.3}", s) == "\x93\xfa  "sv);
+    assert(format("{:.3}", s) == "\x93\xfa"sv);
+    assert(format("{:4.3}", s) == "\x93\xfa  "sv);

-        assert(format("{:<4.3}", s) == "\x93\xfa  "sv);
-        assert(format("{:^4.3}", s) == " \x93\xfa "sv);
-        assert(format("{:>4.3}", s) == "  \x93\xfa"sv);
+    assert(format("{:<4.3}", s) == "\x93\xfa  "sv);
+    assert(format("{:^4.3}", s) == " \x93\xfa "sv);
+    assert(format("{:>4.3}", s) == "  \x93\xfa"sv);

-        assert(format("{:\x90}<4.3}", s) == "\x93\xfa\x90}\x90}"sv);
-        assert(format("{:\x90}^4.3}", s) == "\x90}\x93\xfa\x90}"sv);
-        assert(format("{:\x90}>4.3}", s) == "\x90}\x90}\x93\xfa"sv);
-    }
-
-    assert(setlocale(LC_ALL, "C") != nullptr);
+    assert(format("{:\x90}<4.3}", s) == "\x93\xfa\x90}\x90}"sv);
+    assert(format("{:\x90}^4.3}", s) == "\x90}\x93\xfa\x90}"sv);
+    assert(format("{:\x90}>4.3}", s) == "\x90}\x90}\x93\xfa"sv);
 }

 void test_parse_align() {
-    auto parse_align_fn = _Parse_align<char, testing_callbacks<char>>;
+    const auto parse_align_fn = _Parse_align<char, testing_callbacks<char>>;

-    {
-        assert(setlocale(LC_ALL, ".932") != nullptr);
-        test_parse_helper(parse_align_fn, "\x93\xfa<X"sv, false, 3,
-            {.expected_alignment = _Fmt_align::_Left, .expected_fill = "\x93\xfa"sv});
-        test_parse_helper(parse_align_fn, "\x96\x7b>X"sv, false, 3,
-            {.expected_alignment = _Fmt_align::_Right, .expected_fill = "\x96\x7b"sv});
-        test_parse_helper(parse_align_fn, "\x92\x6e^X"sv, false, 3,
-            {.expected_alignment = _Fmt_align::_Center, .expected_fill = "\x92\x6e"sv});
+    test_parse_helper(parse_align_fn, "\x93\xfa<X"sv, false, 3, //
+        {.expected_alignment = _Fmt_align::_Left, .expected_fill = "\x93\xfa"sv});
+    test_parse_helper(parse_align_fn, "\x96\x7b>X"sv, false, 3,
+        {.expected_alignment = _Fmt_align::_Right, .expected_fill = "\x96\x7b"sv});
+    test_parse_helper(parse_align_fn, "\x92\x6e^X"sv, false, 3,
+        {.expected_alignment = _Fmt_align::_Center, .expected_fill = "\x92\x6e"sv});
+}
+
+void test_width_estimation() {
+    // Format strings of known width with a trailing delimiter using a precision large enough to
+    // include all but the delimiter to validate the width estimation code.
+    struct test_case {
+        const char* str;
+        int width;
+    };
+    constexpr test_case test_cases[] = {
+        {"\x58", 1},
+        {"x\x58", 2},
+
+        // Pick "short" and "long" codepoints (\x20 and \x96\x7b), then form all permutations of
+        // 3-codepoint prefixes with the same fixed delimiter as above. This gives us coverage of
+        // all adjacent pairings (short/short, short/long, long/short, long/long).
+        {"\x20\x20\x20\x58", 4},
+        {"\x20\x20\x96\x7b\x58", 5},
+        {"\x20\x96\x7b\x20\x58", 5},
+        {"\x96\x7b\x20\x20\x58", 5},
+        {"\x20\x96\x7b\x96\x7b\x58", 6},
+        {"\x96\x7b\x20\x96\x7b\x58", 6},
+        {"\x96\x7b\x96\x7b\x20\x58", 6},
+        {"\x96\x7b\x96\x7b\x96\x7b\x58", 7},
+    };
+
+    for (const auto& test : test_cases) {
+        string_view sv{test.str};
+        sv = sv.substr(0, sv.size() - 1);
+        assert(format("{:.{}}", test.str, test.width - 1) == sv);
    }
-
-    assert(setlocale(LC_ALL, "C") != nullptr);
 }

 int main() {
    test_multibyte_format_strings();
    test_parse_align();
+    test_width_estimation();
 }
--- a/tests/std/tests/P0645R10_text_formatting_parsing/test.cpp
+++ b/tests/std/tests/P0645R10_text_formatting_parsing/test.cpp
@ -36,7 +36,7 @@ bool test_parse_align() {
        // \x343E (which is from CJK unified ideographs extension A) and similar characters to parse as
        // an alignment specifier.
        auto s4 = L"*\x343E"sv;
-        test_parse_helper(parse_align_fn, s4, false, view_typ::npos, {.expected_fill = L"*"sv});
+        test_parse_helper(parse_align_fn, s4, false, 0, {.expected_fill = L"*"sv});

        // test multi-code-unit fill characters
        {
@ -47,22 +47,6 @@ bool test_parse_align() {
            test_parse_helper(parse_align_fn, L"\U0001F3C8^X"sv, false, 3,
                {.expected_alignment = _Fmt_align::_Center, .expected_fill = L"\U0001F3C8"sv});
        }
-    } else {
-        // test multibyte fill characters
-#ifndef MSVC_INTERNAL_TESTING // TRANSITION, the Windows version on Contest VMs doesn't always understand ".UTF-8"
-        {
-            assert(setlocale(LC_ALL, ".UTF-8") != nullptr);
-            // "\xf0\x9f\x8f\x88" is U+1F3C8 AMERICAN FOOTBALL
-            test_parse_helper(parse_align_fn, "\xf0\x9f\x8f\x88<X"sv, false, 5,
-                {.expected_alignment = _Fmt_align::_Left, .expected_fill = "\xf0\x9f\x8f\x88"sv});
-            test_parse_helper(parse_align_fn, "\xf0\x9f\x8f\x88>X"sv, false, 5,
-                {.expected_alignment = _Fmt_align::_Right, .expected_fill = "\xf0\x9f\x8f\x88"sv});
-            test_parse_helper(parse_align_fn, "\xf0\x9f\x8f\x88^X"sv, false, 5,
-                {.expected_alignment = _Fmt_align::_Center, .expected_fill = "\xf0\x9f\x8f\x88"sv});
-        }
-#endif // MSVC_INTERNAL_TESTING
-
-        assert(setlocale(LC_ALL, "C") != nullptr);
    }

    return true;
--- a/tests/std/tests/P0645R10_text_formatting_utf8/test.cpp
+++ b/tests/std/tests/P0645R10_text_formatting_utf8/test.cpp
@ -12,12 +12,12 @@ using namespace std;

 void test_multibyte_format_strings() {
    {
-        // Filling with footballs ("\xf0\x9f\x8f\x88" is U+1F3C8 AMERICAN FOOTBALL)
-        assert(format("{:\xf0\x9f\x8f\x88>4}"sv, 42) == "\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\x34\x32");
+        // Filling with footballs ("\U0001f3c8" is U+1F3C8 AMERICAN FOOTBALL)
+        assert(format("{:\U0001f3c8>4}"sv, 42) == "\U0001f3c8\U0001f3c8\x34\x32");

-        assert(format("{:\xf0\x9f\x8f\x88<4.2}", "1") == "\x31\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88"sv);
-        assert(format("{:\xf0\x9f\x8f\x88^4.2}", "1") == "\xf0\x9f\x8f\x88\x31\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88"sv);
-        assert(format("{:\xf0\x9f\x8f\x88>4.2}", "1") == "\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\xf0\x9f\x8f\x88\x31"sv);
+        assert(format("{:\U0001f3c8<4.2}", "1") == "\x31\U0001f3c8\U0001f3c8\U0001f3c8"sv);
+        assert(format("{:\U0001f3c8^4.2}", "1") == "\U0001f3c8\x31\U0001f3c8\U0001f3c8"sv);
+        assert(format("{:\U0001f3c8>4.2}", "1") == "\U0001f3c8\U0001f3c8\U0001f3c8\x31"sv);
    }

    {
@ -33,19 +33,115 @@ void test_parse_align() {
    auto parse_align_fn = _Parse_align<char, testing_callbacks<char>>;

    {
-        // "\xf0\x9f\x8f\x88" is U+1F3C8 AMERICAN FOOTBALL
-        test_parse_helper(parse_align_fn, "\xf0\x9f\x8f\x88<X"sv, false, 5,
-            {.expected_alignment = _Fmt_align::_Left, .expected_fill = "\xf0\x9f\x8f\x88"sv});
-        test_parse_helper(parse_align_fn, "\xf0\x9f\x8f\x88>X"sv, false, 5,
-            {.expected_alignment = _Fmt_align::_Right, .expected_fill = "\xf0\x9f\x8f\x88"sv});
-        test_parse_helper(parse_align_fn, "\xf0\x9f\x8f\x88^X"sv, false, 5,
-            {.expected_alignment = _Fmt_align::_Center, .expected_fill = "\xf0\x9f\x8f\x88"sv});
+        test_parse_helper(parse_align_fn, "\U0001f3c8<X"sv, false, 5,
+            {.expected_alignment = _Fmt_align::_Left, .expected_fill = "\U0001f3c8"sv});
+        test_parse_helper(parse_align_fn, "\U0001f3c8>X"sv, false, 5,
+            {.expected_alignment = _Fmt_align::_Right, .expected_fill = "\U0001f3c8"sv});
+        test_parse_helper(parse_align_fn, "\U0001f3c8^X"sv, false, 5,
+            {.expected_alignment = _Fmt_align::_Center, .expected_fill = "\U0001f3c8"sv});
+    }
+}
+
+template <class CharT>
+void test_width_estimation() {
+    // Format strings of known width with a trailing delimiter using a precision large enough to
+    // include all but the delimiter to validate the width estimation code.
+    struct test_case {
+        const CharT* str;
+        int width;
+    };
+    constexpr test_case test_cases[] = {
+        {TYPED_LITERAL(CharT, "\x58"), 1},
+        {TYPED_LITERAL(CharT, "x\x58"), 2},
+
+        // test the boundaries of the intervals defined in N4885 [format.string.std]/11
+        {TYPED_LITERAL(CharT, "\u10ff\x58"), 2},
+        {TYPED_LITERAL(CharT, "\u1100\x58"), 3},
+        {TYPED_LITERAL(CharT, "\u115f\x58"), 3},
+        {TYPED_LITERAL(CharT, "\u1160\x58"), 2},
+        {TYPED_LITERAL(CharT, "\u2328\x58"), 2},
+        {TYPED_LITERAL(CharT, "\u2329\x58"), 3},
+        {TYPED_LITERAL(CharT, "\u232a\x58"), 3},
+        {TYPED_LITERAL(CharT, "\u232b\x58"), 2},
+        {TYPED_LITERAL(CharT, "\u2e7f\x58"), 2},
+        {TYPED_LITERAL(CharT, "\u2e80\x58"), 3},
+        {TYPED_LITERAL(CharT, "\u303e\x58"), 3},
+        {TYPED_LITERAL(CharT, "\u303f\x58"), 2},
+        {TYPED_LITERAL(CharT, "\u3040\x58"), 3},
+        {TYPED_LITERAL(CharT, "\ua4cf\x58"), 3},
+        {TYPED_LITERAL(CharT, "\ua4d0\x58"), 2},
+        {TYPED_LITERAL(CharT, "\uabff\x58"), 2},
+        {TYPED_LITERAL(CharT, "\uac00\x58"), 3},
+        {TYPED_LITERAL(CharT, "\ud7a3\x58"), 3},
+        {TYPED_LITERAL(CharT, "\ud7a4\x58"), 2},
+        {TYPED_LITERAL(CharT, "\ud7ff\x58"), 2},
+
+        // skip over the surrogate range (\ud800-\udfff); surrogates cannot be written as universal-character-names
+
+        {TYPED_LITERAL(CharT, "\ue000\x58"), 2},
+        {TYPED_LITERAL(CharT, "\uf8ff\x58"), 2},
+        {TYPED_LITERAL(CharT, "\uf900\x58"), 3},
+        {TYPED_LITERAL(CharT, "\ufaff\x58"), 3},
+        {TYPED_LITERAL(CharT, "\ufb00\x58"), 2},
+        {TYPED_LITERAL(CharT, "\ufe0f\x58"), 2},
+        {TYPED_LITERAL(CharT, "\ufe10\x58"), 3},
+        {TYPED_LITERAL(CharT, "\ufe19\x58"), 3},
+        {TYPED_LITERAL(CharT, "\ufe1a\x58"), 2},
+        {TYPED_LITERAL(CharT, "\ufe2f\x58"), 2},
+        {TYPED_LITERAL(CharT, "\ufe30\x58"), 3},
+        {TYPED_LITERAL(CharT, "\ufe6f\x58"), 3},
+        {TYPED_LITERAL(CharT, "\ufe70\x58"), 2},
+        {TYPED_LITERAL(CharT, "\ufeff\x58"), 2},
+        {TYPED_LITERAL(CharT, "\uff00\x58"), 3},
+        {TYPED_LITERAL(CharT, "\uff60\x58"), 3},
+        {TYPED_LITERAL(CharT, "\uff61\x58"), 2},
+        {TYPED_LITERAL(CharT, "\uffdf\x58"), 2},
+        {TYPED_LITERAL(CharT, "\uffe0\x58"), 3},
+        {TYPED_LITERAL(CharT, "\uffe6\x58"), 3},
+        {TYPED_LITERAL(CharT, "\uffe7\x58"), 2},
+        {TYPED_LITERAL(CharT, "\U0001f2ff\x58"), 2},
+        {TYPED_LITERAL(CharT, "\U0001f300\x58"), 3},
+        {TYPED_LITERAL(CharT, "\U0001f64f\x58"), 3},
+        {TYPED_LITERAL(CharT, "\U0001f650\x58"), 2},
+        {TYPED_LITERAL(CharT, "\U0001f8ff\x58"), 2},
+        {TYPED_LITERAL(CharT, "\U0001f900\x58"), 3},
+        {TYPED_LITERAL(CharT, "\U0001f9ff\x58"), 3},
+        {TYPED_LITERAL(CharT, "\U0001fa00\x58"), 2},
+        {TYPED_LITERAL(CharT, "\U0001ffff\x58"), 2},
+        {TYPED_LITERAL(CharT, "\U00020000\x58"), 3},
+        {TYPED_LITERAL(CharT, "\U0002fffd\x58"), 3},
+        {TYPED_LITERAL(CharT, "\U0002fffe\x58"), 2},
+        {TYPED_LITERAL(CharT, "\U0002ffff\x58"), 2},
+        {TYPED_LITERAL(CharT, "\U00030000\x58"), 3},
+        {TYPED_LITERAL(CharT, "\U0003fffd\x58"), 3},
+        {TYPED_LITERAL(CharT, "\U0003fffe\x58"), 2},
+        {TYPED_LITERAL(CharT, "\U0010ffff\x58"), 2},
+
+        // Pick "short" and "long" codepoints (\u2000 and \ufe40), then form all possible
+        // 3-codepoint prefixes with the same fixed delimiter as above. This gives us coverage of
+        // all adjacent pairings (short/short, short/long, long/short, long/long).
+        {TYPED_LITERAL(CharT, "\u2000\u2000\u2000\x58"), 4},
+        {TYPED_LITERAL(CharT, "\u2000\u2000\ufe40\x58"), 5},
+        {TYPED_LITERAL(CharT, "\u2000\ufe40\u2000\x58"), 5},
+        {TYPED_LITERAL(CharT, "\ufe40\u2000\u2000\x58"), 5},
+        {TYPED_LITERAL(CharT, "\u2000\ufe40\ufe40\x58"), 6},
+        {TYPED_LITERAL(CharT, "\ufe40\u2000\ufe40\x58"), 6},
+        {TYPED_LITERAL(CharT, "\ufe40\ufe40\u2000\x58"), 6},
+        {TYPED_LITERAL(CharT, "\ufe40\ufe40\ufe40\x58"), 7},
+    };
+
+    for (const auto& test : test_cases) {
+        basic_string_view sv{test.str};
+        sv = sv.substr(0, sv.size() - 1);
+        assert(format(TYPED_LITERAL(CharT, "{:.{}}"), test.str, test.width - 1) == sv);
    }
 }

 void run_tests() {
    test_multibyte_format_strings();
    test_parse_align();
+    test_width_estimation<char>();
+    test_width_estimation<wchar_t>();
 }

 int main() {