Concentrated header for internal bit utilities (#3721)

Co-authored-by: Stephan T. Lavavej <stl@microsoft.com>
2023-06-15 16:21:12 +08:00 · 2023-06-15 16:21:12 +08:00 · 47679bbaa8
--- a/stl/CMakeLists.txt
+++ b/stl/CMakeLists.txt
@ -7,6 +7,7 @@

 set(HEADERS
    ${CMAKE_CURRENT_LIST_DIR}/inc/__msvc_all_public_headers.hpp
+    ${CMAKE_CURRENT_LIST_DIR}/inc/__msvc_bit_utils.hpp
    ${CMAKE_CURRENT_LIST_DIR}/inc/__msvc_chrono.hpp
    ${CMAKE_CURRENT_LIST_DIR}/inc/__msvc_cxx_stdatomic.hpp
    ${CMAKE_CURRENT_LIST_DIR}/inc/__msvc_filebuf.hpp
--- a/stl/inc/__msvc_bit_utils.hpp
+++ b/stl/inc/__msvc_bit_utils.hpp
@ -0,0 +1,448 @@
+// __msvc_bit_utils.hpp internal header (core)
+
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#pragma once
+#ifndef __MSVC_BIT_UTILS_HPP
+#define __MSVC_BIT_UTILS_HPP
+#include <yvals_core.h>
+#if _STL_COMPILER_PREPROCESSOR
+
+#include <climits>
+#include <xtr1common>
+
+#include _STL_INTRIN_HEADER
+
+// TRANSITION, GH-2129, move down to _Arm64_popcount
+#if (defined(_M_ARM64) || defined(_M_ARM64EC)) && !defined(_M_CEE_PURE) && !defined(__CUDACC__) \
+    && !defined(__INTEL_COMPILER) && !defined(__clang__) // TRANSITION, LLVM-51488
+#define _HAS_NEON_INTRINSICS 1
+#else // ^^^ intrinsics available / intrinsics unavailable vvv
+#define _HAS_NEON_INTRINSICS 0
+#endif // ^^^ intrinsics unavailable ^^^
+
+#if _HAS_NEON_INTRINSICS
+#include <arm64_neon.h> // TRANSITION, GH-2129
+#endif // _HAS_NEON_INTRINSICS
+
+#pragma pack(push, _CRT_PACKING)
+#pragma warning(push, _STL_WARNING_LEVEL)
+#pragma warning(disable : _STL_DISABLED_WARNINGS)
+_STL_DISABLE_CLANG_WARNINGS
+#pragma push_macro("new")
+#undef new
+
+_STD_BEGIN
+extern "C" {
+extern int __isa_available;
+}
+
+_INLINE_VAR constexpr int _Stl_isa_available_sse42 = 2; // equal to __ISA_AVAILABLE_SSE42; threshold for popcnt
+_INLINE_VAR constexpr int _Stl_isa_available_avx2  = 5; // equal to __ISA_AVAILABLE_AVX2; threshold for lzcnt/tzcnt
+
+template <class _UInt>
+_INLINE_VAR constexpr int _Unsigned_integer_digits = sizeof(_UInt) * CHAR_BIT; // size of _UInt in bits
+
+// Implementation of countl_zero without using specialized CPU instructions.
+// Used at compile time and when said instructions are not supported.
+// see "Hacker's Delight" section 5-3
+template <class _Ty>
+_NODISCARD constexpr int _Countl_zero_fallback(_Ty _Val) noexcept { // binary search for the highest set bit
+    _Ty _Yy = 0;
+
+    unsigned int _Nn = _Unsigned_integer_digits<_Ty>; // running answer; starts at the full bit width
+    unsigned int _Cc = _Unsigned_integer_digits<_Ty> / 2; // current shift distance; halved on every iteration
+    do {
+        _Yy = static_cast<_Ty>(_Val >> _Cc);
+        if (_Yy != 0) { // a set bit exists above position _Cc: drop the low _Cc bits and keep searching there
+            _Nn -= _Cc;
+            _Val = _Yy;
+        }
+        _Cc >>= 1;
+    } while (_Cc != 0);
+    return static_cast<int>(_Nn) - static_cast<int>(_Val); // _Val is 0 or 1 here (1 unless the input was zero)
+}
+
+#if !defined(_M_CEE_PURE) && !defined(__CUDACC__) && !defined(__INTEL_COMPILER)
+#define _HAS_COUNTL_ZERO_INTRINSICS 1
+#else // ^^^ intrinsics available / intrinsics unavailable vvv
+#define _HAS_COUNTL_ZERO_INTRINSICS 0
+#endif // ^^^ intrinsics unavailable ^^^
+
+#if _HAS_COUNTL_ZERO_INTRINSICS
+#if defined(_M_IX86) || (defined(_M_X64) && !defined(_M_ARM64EC))
+template <class _Ty>
+_NODISCARD int _Countl_zero_lzcnt(const _Ty _Val) noexcept { // countl_zero via the lzcnt intrinsics
+    constexpr int _Digits = _Unsigned_integer_digits<_Ty>;
+
+    if constexpr (_Digits <= 16) {
+        return static_cast<int>(__lzcnt16(_Val) - (16 - _Digits)); // drop zeros that come from widening to 16 bits
+    } else if constexpr (_Digits == 32) {
+        return static_cast<int>(__lzcnt(_Val));
+    } else {
+#ifdef _M_IX86
+        const unsigned int _High = _Val >> 32; // x86 path: handle the 64-bit value as two 32-bit halves
+        const auto _Low          = static_cast<unsigned int>(_Val);
+        if (_High == 0) { // all 32 upper bits are leading zeros; keep counting in the lower half
+            return 32 + _Countl_zero_lzcnt(_Low);
+        } else {
+            return _Countl_zero_lzcnt(_High);
+        }
+#else // ^^^ _M_IX86 / !_M_IX86 vvv
+        return static_cast<int>(__lzcnt64(_Val));
+#endif // _M_IX86
+    }
+}
+
+template <class _Ty>
+_NODISCARD int _Countl_zero_bsr(const _Ty _Val) noexcept { // countl_zero via the _BitScanReverse intrinsics
+    constexpr int _Digits = _Unsigned_integer_digits<_Ty>;
+
+    unsigned long _Result; // filled in by the intrinsic; only read after a scan reports success
+    if constexpr (_Digits <= 32) {
+        if (!_BitScanReverse(&_Result, _Val)) { // scan reported no set bit: every digit is a leading zero
+            return _Digits;
+        }
+    } else {
+#ifdef _M_IX86
+        const unsigned int _High = _Val >> 32; // x86 path: scan the upper 32-bit half first
+        if (_BitScanReverse(&_Result, _High)) {
+            return static_cast<int>(31 - _Result); // _Result is an index within the upper half
+        }
+
+        const auto _Low = static_cast<unsigned int>(_Val);
+        if (!_BitScanReverse(&_Result, _Low)) {
+            return _Digits;
+        }
+#else // ^^^ _M_IX86 / !_M_IX86 vvv
+        if (!_BitScanReverse64(&_Result, _Val)) {
+            return _Digits;
+        }
+#endif // _M_IX86
+    }
+    return static_cast<int>(_Digits - 1 - _Result); // highest set bit index -> number of zero bits above it
+}
+
+template <class _Ty>
+_NODISCARD int _Checked_x86_x64_countl_zero(const _Ty _Val) noexcept { // chooses between the lzcnt and bsr paths
+#ifdef __AVX2__
+    return _Countl_zero_lzcnt(_Val); // no runtime check in builds that define __AVX2__
+#else // __AVX2__
+    const bool _Definitely_have_lzcnt = __isa_available >= _Stl_isa_available_avx2; // runtime ISA level check
+    if (_Definitely_have_lzcnt) {
+        return _Countl_zero_lzcnt(_Val);
+    } else {
+        return _Countl_zero_bsr(_Val);
+    }
+#endif // __AVX2__
+}
+#endif // defined(_M_IX86) || (defined(_M_X64) && !defined(_M_ARM64EC))
+
+#if defined(_M_ARM) || defined(_M_ARM64)
+#ifdef __clang__ // TRANSITION, GH-1586
+_NODISCARD constexpr int _Clang_arm_arm64_countl_zero(const unsigned short _Val) { // overload set, one per width
+    return __builtin_clzs(_Val); // the caller in this header tests for zero before calling any overload
+}
+
+_NODISCARD constexpr int _Clang_arm_arm64_countl_zero(const unsigned int _Val) {
+    return __builtin_clz(_Val);
+}
+
+_NODISCARD constexpr int _Clang_arm_arm64_countl_zero(const unsigned long _Val) {
+    return __builtin_clzl(_Val);
+}
+
+_NODISCARD constexpr int _Clang_arm_arm64_countl_zero(const unsigned long long _Val) {
+    return __builtin_clzll(_Val);
+}
+#endif // TRANSITION, GH-1586
+
+template <class _Ty>
+_NODISCARD int _Checked_arm_arm64_countl_zero(const _Ty _Val) noexcept { // countl_zero for ARM/ARM64 targets
+    constexpr int _Digits = _Unsigned_integer_digits<_Ty>;
+    if (_Val == 0) { // zero is answered here, so the code below only ever sees nonzero values
+        return _Digits;
+    }
+
+#ifdef __clang__ // TRANSITION, GH-1586
+    if constexpr (is_same_v<remove_cv_t<_Ty>, unsigned char>) { // no unsigned char overload: widen, then compensate
+        return _Clang_arm_arm64_countl_zero(static_cast<unsigned short>(_Val))
+             - (_Unsigned_integer_digits<unsigned short> - _Digits);
+    } else {
+        return _Clang_arm_arm64_countl_zero(_Val);
+    }
+#else // ^^^ workaround / no workaround vvv
+    if constexpr (_Digits <= 32) { // subtract the extra zeros that come from widening to unsigned long
+        return static_cast<int>(_CountLeadingZeros(_Val)) - (_Unsigned_integer_digits<unsigned long> - _Digits);
+    } else {
+        return static_cast<int>(_CountLeadingZeros64(_Val));
+    }
+#endif // TRANSITION, GH-1586
+}
+#endif // defined(_M_ARM) || defined(_M_ARM64)
+#endif // _HAS_COUNTL_ZERO_INTRINSICS
+
+// Implementation of countr_zero without using specialized CPU instructions.
+// Used at compile time and when said instructions are not supported.
+// see "Hacker's Delight" section 5-4
+template <class _Ty>
+_NODISCARD constexpr int _Countr_zero_fallback(const _Ty _Val) noexcept { // derived from _Countl_zero_fallback
+    constexpr int _Digits = _Unsigned_integer_digits<_Ty>; // below: ~_Val & (_Val - 1) keeps only the trailing zeros
+    return _Digits - _Countl_zero_fallback(static_cast<_Ty>(static_cast<_Ty>(~_Val) & static_cast<_Ty>(_Val - 1)));
+}
+
+// Implementation of popcount without using specialized CPU instructions.
+// Used at compile time and when said instructions are not supported.
+template <class _Ty>
+_NODISCARD constexpr int _Popcount_fallback(_Ty _Val) noexcept { // counts set bits with shifts, masks and a multiply
+    constexpr int _Digits = _Unsigned_integer_digits<_Ty>;
+#if defined(_M_IX86) || defined(_M_ARM)
+    if constexpr (_Digits == 64) {
+        // 64-bit bit operations on architectures without 64-bit registers are less efficient,
+        // hence we split the value so that it fits in 32-bit registers
+        return _Popcount_fallback(static_cast<unsigned long>(_Val))
+             + _Popcount_fallback(static_cast<unsigned long>(_Val >> 32));
+    }
+#endif // defined(_M_IX86) || defined(_M_ARM)
+    // we static_cast these bit patterns in order to truncate them to the correct size
+    _Val = static_cast<_Ty>(_Val - ((_Val >> 1) & static_cast<_Ty>(0x5555'5555'5555'5555ull))); // 2-bit sums
+    _Val = static_cast<_Ty>((_Val & static_cast<_Ty>(0x3333'3333'3333'3333ull))
+                            + ((_Val >> 2) & static_cast<_Ty>(0x3333'3333'3333'3333ull))); // 4-bit sums
+    _Val = static_cast<_Ty>((_Val + (_Val >> 4)) & static_cast<_Ty>(0x0F0F'0F0F'0F0F'0F0Full)); // per-byte sums
+    // Multiply by one in each byte, so that it will have the sum of all source bytes in the highest byte
+    _Val = static_cast<_Ty>(_Val * static_cast<_Ty>(0x0101'0101'0101'0101ull));
+    // Extract highest byte
+    return static_cast<int>(_Val >> (_Digits - 8)); // for an 8-bit _Ty the shift distance is 0
+}
+
+#if (defined(_M_IX86) || (defined(_M_X64) && !defined(_M_ARM64EC))) && !defined(_M_CEE_PURE) && !defined(__CUDACC__) \
+    && !defined(__INTEL_COMPILER)
+#define _HAS_TZCNT_BSF_INTRINSICS 1
+#else // ^^^ intrinsics available / intrinsics unavailable vvv
+#define _HAS_TZCNT_BSF_INTRINSICS 0
+#endif // ^^^ intrinsics unavailable ^^^
+
+#if _HAS_TZCNT_BSF_INTRINSICS
+#ifdef __clang__
+#define _TZCNT_U32 __builtin_ia32_tzcnt_u32
+#define _TZCNT_U64 __builtin_ia32_tzcnt_u64
+#else // ^^^ __clang__ / !__clang__ vvv
+#define _TZCNT_U32 _tzcnt_u32
+#define _TZCNT_U64 _tzcnt_u64
+#endif // __clang__
+
+template <class _Ty>
+_NODISCARD int _Countr_zero_tzcnt(const _Ty _Val) noexcept { // countr_zero via the _TZCNT_U32/_TZCNT_U64 macros
+    constexpr int _Digits = _Unsigned_integer_digits<_Ty>;
+    constexpr _Ty _Max    = static_cast<_Ty>(-1); // equal to (numeric_limits<_Ty>::max)()
+
+    if constexpr (_Digits <= 32) {
+        // Intended widening to int. This operation means that a narrow 0 will widen
+        // to 0xFFFF....FFFF0... instead of 0. We need this to avoid counting all the zeros
+        // of the wider type.
+        return static_cast<int>(_TZCNT_U32(static_cast<unsigned int>(~_Max | _Val))); // 32-bit _Ty: ~_Max is 0
+    } else {
+#ifdef _M_IX86
+        const auto _Low = static_cast<unsigned int>(_Val); // x86 path: inspect the lower 32-bit half first
+        if (_Low == 0) { // lower half has no set bit: its 32 zeros count, continue in the upper half
+            const unsigned int _High = _Val >> 32;
+            return static_cast<int>(32 + _TZCNT_U32(_High));
+        } else {
+            return static_cast<int>(_TZCNT_U32(_Low));
+        }
+#else // ^^^ _M_IX86 / !_M_IX86 vvv
+        return static_cast<int>(_TZCNT_U64(_Val));
+#endif // _M_IX86
+    }
+}
+
+#undef _TZCNT_U32
+#undef _TZCNT_U64
+
+template <class _Ty>
+_NODISCARD int _Countr_zero_bsf(const _Ty _Val) noexcept { // countr_zero via the _BitScanForward intrinsics
+    constexpr int _Digits = _Unsigned_integer_digits<_Ty>;
+    constexpr _Ty _Max    = static_cast<_Ty>(-1); // equal to (numeric_limits<_Ty>::max)()
+
+    unsigned long _Result; // filled in by the intrinsic; only read after a scan reports success
+    if constexpr (_Digits <= 32) {
+        // Intended widening to int. This operation means that a narrow 0 will widen
+        // to 0xFFFF....FFFF0... instead of 0. We need this to avoid counting all the zeros
+        // of the wider type.
+        if (!_BitScanForward(&_Result, static_cast<unsigned int>(~_Max | _Val))) {
+            return _Digits; // scan reported no set bit: every digit is a trailing zero
+        }
+    } else {
+#ifdef _M_IX86
+        const auto _Low = static_cast<unsigned int>(_Val); // x86 path: scan the lower 32-bit half first
+        if (_BitScanForward(&_Result, _Low)) {
+            return static_cast<int>(_Result);
+        }
+
+        const unsigned int _High = _Val >> 32;
+        if (!_BitScanForward(&_Result, _High)) {
+            return _Digits;
+        } else {
+            return static_cast<int>(_Result + 32); // index within the upper half, offset by the 32 low zeros
+        }
+#else // ^^^ _M_IX86 / !_M_IX86 vvv
+        if (!_BitScanForward64(&_Result, _Val)) {
+            return _Digits;
+        }
+#endif // _M_IX86
+    }
+    return static_cast<int>(_Result); // the lowest set bit's index is the number of zeros below it
+}
+
+template <class _Ty>
+_NODISCARD int _Checked_x86_x64_countr_zero(const _Ty _Val) noexcept { // chooses between the tzcnt and bsf paths
+#ifdef __AVX2__
+    return _Countr_zero_tzcnt(_Val); // no runtime check in builds that define __AVX2__
+#else // __AVX2__
+    const bool _Definitely_have_tzcnt = __isa_available >= _Stl_isa_available_avx2; // runtime ISA level check
+    if (_Definitely_have_tzcnt) {
+        return _Countr_zero_tzcnt(_Val);
+    } else {
+        return _Countr_zero_bsf(_Val);
+    }
+#endif // __AVX2__
+}
+
+#endif // _HAS_TZCNT_BSF_INTRINSICS
+
+#if (defined(_M_IX86) || (defined(_M_X64) && !defined(_M_ARM64EC))) && !defined(_M_CEE_PURE) && !defined(__CUDACC__) \
+    && !defined(__INTEL_COMPILER)
+#define _HAS_POPCNT_INTRINSICS 1
+#else // ^^^ intrinsics available / intrinsics unavailable vvv
+#define _HAS_POPCNT_INTRINSICS 0
+#endif // ^^^ intrinsics unavailable ^^^
+
+#if _HAS_POPCNT_INTRINSICS
+template <class _Ty>
+_NODISCARD int _Unchecked_x86_x64_popcount(const _Ty _Val) noexcept { // popcnt intrinsics, no CPU check here
+    constexpr int _Digits = _Unsigned_integer_digits<_Ty>;
+    if constexpr (_Digits <= 16) {
+        return static_cast<int>(__popcnt16(_Val));
+    } else if constexpr (_Digits == 32) {
+        return static_cast<int>(__popcnt(_Val));
+    } else {
+#ifdef _M_IX86
+        return static_cast<int>(__popcnt(_Val >> 32) + __popcnt(static_cast<unsigned int>(_Val))); // two 32-bit halves
+#else // ^^^ _M_IX86 / !_M_IX86 vvv
+        return static_cast<int>(__popcnt64(_Val));
+#endif // _M_IX86
+    }
+}
+
+template <class _Ty>
+_NODISCARD int _Checked_x86_x64_popcount(const _Ty _Val) noexcept { // popcnt when available, else the fallback
+#ifndef __AVX__
+    const bool _Definitely_have_popcnt = __isa_available >= _Stl_isa_available_sse42; // runtime ISA level check
+    if (!_Definitely_have_popcnt) {
+        return _Popcount_fallback(_Val);
+    }
+#endif // !defined(__AVX__)
+    return _Unchecked_x86_x64_popcount(_Val); // reached directly, without the check, in builds that define __AVX__
+}
+#endif // _HAS_POPCNT_INTRINSICS
+
+#if _HAS_NEON_INTRINSICS
+_NODISCARD inline int _Arm64_popcount(const unsigned long long _Val) noexcept { // popcount via NEON intrinsics
+    const __n64 _Temp = neon_cnt(__uint64ToN64_v(_Val)); // NOTE(review): presumably per-byte bit counts - confirm
+    return neon_addv8(_Temp).n8_i8[0]; // NOTE(review): presumably the sum of those 8 byte lanes - confirm
+}
+#endif // _HAS_NEON_INTRINSICS
+
+template <class _Ty>
+_INLINE_VAR constexpr bool _Is_standard_unsigned_integer = // true for the five listed types, cv-qualifiers ignored
+    _Is_any_of_v<remove_cv_t<_Ty>, unsigned char, unsigned short, unsigned int, unsigned long, unsigned long long>;
+
+template <class _Ty, enable_if_t<_Is_standard_unsigned_integer<_Ty>, int> = 0>
+_NODISCARD _CONSTEXPR20 int _Countr_zero(const _Ty _Val) noexcept { // countr_zero entry point for this header
+#if _HAS_TZCNT_BSF_INTRINSICS
+#if _HAS_CXX20
+    if (!_STD is_constant_evaluated())
+#endif // _HAS_CXX20
+    { // when _HAS_CXX20 is 0 the 'if' above is compiled out, so this block always runs
+        return _Checked_x86_x64_countr_zero(_Val);
+    }
+#endif // _HAS_TZCNT_BSF_INTRINSICS
+    return _Countr_zero_fallback(_Val); // constant evaluation, or no tzcnt/bsf intrinsics on this configuration
+}
+
+template <class _Ty, class _Fn>
+constexpr decltype(auto) _Select_countr_zero_impl(_Fn _Callback) { // returns _Callback(lambda wrapping the chosen impl)
+    // TRANSITION, DevCom-1527995: Lambdas in this function ensure inlining
+#if _HAS_TZCNT_BSF_INTRINSICS && _HAS_CXX20
+    if (!_STD is_constant_evaluated()) { // the intrinsic-based choices are only offered for runtime evaluation
+#ifdef __AVX2__
+        return _Callback([](_Ty _Val) { return _Countr_zero_tzcnt(_Val); });
+#else // ^^^ AVX2 / not AVX2 vvv
+        const bool _Definitely_have_tzcnt = __isa_available >= _Stl_isa_available_avx2; // runtime ISA level check
+        if (_Definitely_have_tzcnt) {
+            return _Callback([](_Ty _Val) { return _Countr_zero_tzcnt(_Val); });
+        } else {
+            return _Callback([](_Ty _Val) { return _Countr_zero_bsf(_Val); });
+        }
+#endif // ^^^ not AVX2 ^^^
+    }
+#endif // ^^^ _HAS_TZCNT_BSF_INTRINSICS && _HAS_CXX20 ^^^
+    // C++17 constexpr gcd() calls this function, so it should be constexpr unless we detect runtime evaluation.
+    return _Callback([](_Ty _Val) { return _Countr_zero_fallback(_Val); });
+}
+
+template <class _Ty, enable_if_t<_Is_standard_unsigned_integer<_Ty>, int> = 0>
+_NODISCARD _CONSTEXPR20 int _Popcount(const _Ty _Val) noexcept { // popcount entry point for this header
+#if _HAS_POPCNT_INTRINSICS || _HAS_NEON_INTRINSICS
+#if _HAS_CXX20
+    if (!_STD is_constant_evaluated())
+#endif // _HAS_CXX20
+    { // when _HAS_CXX20 is 0 the 'if' above is compiled out, so this block always runs
+#if _HAS_POPCNT_INTRINSICS
+        return _Checked_x86_x64_popcount(_Val);
+#elif _HAS_NEON_INTRINSICS // ^^^ x86/x64 intrinsics available / ARM64 intrinsics available vvv
+        return _Arm64_popcount(_Val); // _Val is converted to the unsigned long long parameter
+#endif // ^^^ ARM64 intrinsics available ^^^
+    }
+#endif // ^^^ any intrinsics available ^^^
+    return _Popcount_fallback(_Val); // constant evaluation, or no popcount intrinsics on this configuration
+}
+
+template <class _Ty, class _Fn>
+_CONSTEXPR20 decltype(auto) _Select_popcount_impl(_Fn _Callback) { // returns _Callback(lambda wrapping the chosen impl)
+    // TRANSITION, DevCom-1527995: Lambdas in this function ensure inlining
+#if _HAS_POPCNT_INTRINSICS || _HAS_NEON_INTRINSICS
+#if _HAS_CXX20
+    if (!_STD is_constant_evaluated())
+#endif // _HAS_CXX20
+    { // when _HAS_CXX20 is 0 the 'if' above is compiled out, so this block always runs
+#if _HAS_POPCNT_INTRINSICS
+#ifndef __AVX__
+        const bool _Definitely_have_popcnt = __isa_available >= _Stl_isa_available_sse42; // runtime ISA level check
+        if (!_Definitely_have_popcnt) {
+            return _Callback([](_Ty _Val) { return _Popcount_fallback(_Val); });
+        }
+#endif // !defined(__AVX__)
+        return _Callback([](_Ty _Val) { return _Unchecked_x86_x64_popcount(_Val); });
+#elif _HAS_NEON_INTRINSICS // ^^^ x86/x64 intrinsics available / ARM64 intrinsics available vvv
+        return _Callback([](_Ty _Val) { return _Arm64_popcount(_Val); });
+#endif // ^^^ ARM64 intrinsics available ^^^
+    }
+#endif // ^^^ any intrinsics available ^^^
+    return _Callback([](_Ty _Val) { return _Popcount_fallback(_Val); });
+}
+
+#undef _HAS_POPCNT_INTRINSICS
+#undef _HAS_TZCNT_BSF_INTRINSICS
+
+_STD_END
+
+#undef _HAS_NEON_INTRINSICS
+
+#pragma pop_macro("new")
+_STL_RESTORE_CLANG_WARNINGS
+#pragma warning(pop)
+#pragma pack(pop)
+#endif // _STL_COMPILER_PREPROCESSOR
+#endif // __MSVC_BIT_UTILS_HPP
--- a/stl/inc/__msvc_format_ucd_tables.hpp
+++ b/stl/inc/__msvc_format_ucd_tables.hpp
@ -61,7 +61,6 @@
 #if _STL_COMPILER_PREPROCESSOR

 #include <cstdint>
-#include <limits>
 #include <xutility>

 #pragma pack(push, _CRT_PACKING)
@ -79,7 +78,7 @@ struct _Unicode_property_data {
    uint16_t _Props_and_size[_NumRanges];
    _NODISCARD constexpr _ValueEnum _Get_property_for_codepoint(const uint32_t _Code_point) const noexcept {
        ptrdiff_t _Upper_idx = _STD upper_bound(_Lower_bounds, _STD end(_Lower_bounds), _Code_point) - _Lower_bounds;
-        constexpr auto _No_value_constant = static_cast<_ValueEnum>((numeric_limits<uint8_t>::max)());
+        constexpr auto _No_value_constant = static_cast<_ValueEnum>(UINT8_MAX);
        if (_Upper_idx == 0) {
            return _No_value_constant;
        }
--- a/stl/inc/__msvc_int128.hpp
+++ b/stl/inc/__msvc_int128.hpp
@ -9,6 +9,7 @@

 #include <yvals_core.h>
 #if _STL_COMPILER_PREPROCESSOR
+#include <__msvc_bit_utils.hpp>
 #include <cstdint>
 #include <limits>
 #include <type_traits>
@ -16,7 +17,6 @@
 #include _STL_INTRIN_HEADER

 #if _HAS_CXX20
-#include <bit>
 #include <compare>
 #define _ZERO_OR_NO_INIT
 #else // ^^^ _HAS_CXX20 / !_HAS_CXX20 vvv
@ -56,6 +56,24 @@ _STD_BEGIN
 #define _STL_128_DIV_INTRINSICS     0
 #endif // ^^^ intrinsics unavailable ^^^

+template <class _Ty>
+_NODISCARD constexpr int _Countl_zero_internal(const _Ty _Val) noexcept { // used here in place of _STD countl_zero
+    _STL_INTERNAL_STATIC_ASSERT(_Is_standard_unsigned_integer<_Ty>);
+#if _HAS_COUNTL_ZERO_INTRINSICS
+#if defined(_M_IX86) || (defined(_M_X64) && !defined(_M_ARM64EC))
+    if (!_Is_constant_evaluated()) { // the intrinsic-based helpers are not constexpr, so only call them at runtime
+        return _Checked_x86_x64_countl_zero(_Val);
+    }
+#elif defined(_M_ARM) || defined(_M_ARM64)
+    if (!_Is_constant_evaluated()) {
+        return _Checked_arm_arm64_countl_zero(_Val);
+    }
+#endif // defined(_M_ARM) || defined(_M_ARM64)
+#endif // _HAS_COUNTL_ZERO_INTRINSICS
+
+    return _Countl_zero_fallback(_Val); // constant evaluation, or no countl_zero intrinsics on this configuration
+}
+
 struct
 #ifndef _M_ARM
    alignas(16)
@ -143,7 +161,7 @@ struct
    static constexpr void _Knuth_4_3_1_M(
        const uint32_t (&__u)[__m], const uint32_t (&__v)[__n], uint32_t (&__w)[__n + __m]) noexcept {
 #ifdef _ENABLE_STL_INTERNAL_CHECK
-        constexpr auto _Int_max = static_cast<size_t>((numeric_limits<int>::max)());
+        constexpr auto _Int_max = static_cast<size_t>(INT_MAX);
        _STL_INTERNAL_STATIC_ASSERT(__m <= _Int_max);
        _STL_INTERNAL_STATIC_ASSERT(__n <= _Int_max);
 #endif // _ENABLE_STL_INTERNAL_CHECK
@ -192,7 +210,7 @@ struct
    static constexpr void _Knuth_4_3_1_D(uint32_t* const __u, const size_t __u_size, const uint32_t* const __v,
        const size_t __v_size, uint32_t* const __q) noexcept {
        // Pre: __u + [0, __u_size), __v + [0, __v_size), and __q + [0, __u_size - __v_size) are all valid ranges
-        // constexpr auto _Int_max = static_cast<size_t>((numeric_limits<int>::max)());
+        // constexpr auto _Int_max = static_cast<size_t>(INT_MAX);
        // _STL_INTERNAL_CHECK(__v_size <= _Int_max);
        const int __n = static_cast<int>(__v_size);
        // _STL_INTERNAL_CHECK(__u_size > __v_size);
@ -252,11 +270,7 @@ struct
        }
 #endif // _STL_128_DIV_INTRINSICS

-#if _HAS_CXX20
-        const auto __d = _STD countl_zero(static_cast<uint32_t>(_Div >> 32));
-#else // ^^^ _HAS_CXX20 / !_HAS_CXX20 vvv
-        const auto __d = _Countl_zero_fallback(static_cast<uint32_t>(_Div >> 32));
-#endif // ^^^ !_HAS_CXX20 ^^^
+        const auto __d = _Countl_zero_internal(static_cast<uint32_t>(_Div >> 32));
        if (__d >= 32) { // _Div < 2^32
            auto _Rem    = (_High << 32) | (_Low >> 32);
            auto _Result = _Rem / static_cast<uint32_t>(_Div);
@ -464,11 +478,7 @@ struct
        // _STL_INTERNAL_CHECK(_Den._Word[1] != 0);
        // _STL_INTERNAL_CHECK(_Num._Word[1] > _Den._Word[1]);
        // Normalize by shifting both left until _Den's high bit is set (So _Den's high digit is >= b / 2)
-#if _HAS_CXX20
-        const auto __d = _STD countl_zero(_Den._Word[1]);
-#else // ^^^ _HAS_CXX20 / !_HAS_CXX20 vvv
-        const auto __d = _Countl_zero_fallback(_Den._Word[1]);
-#endif // ^^^ !_HAS_CXX20 ^^^
+        const auto __d = _Countl_zero_internal(_Den._Word[1]);
        _Den <<= __d;
        auto _High_digit = __d == 0 ? 0 : _Num._Word[1] >> (64 - __d); // This creates a third digit for _Num
        _Num <<= __d;
@ -513,11 +523,7 @@ struct
        }
        return __qhat;
 #else // ^^^ 128-bit intrinsics / no such intrinsics vvv
-#if _HAS_CXX20
-        auto __d                   = _STD countl_zero(_Den._Word[1]);
-#else // ^^^ _HAS_CXX20 / !_HAS_CXX20 vvv
-        auto __d = _Countl_zero_fallback(_Den._Word[1]);
-#endif // ^^^ !_HAS_CXX20 ^^^
+        auto __d                   = _Countl_zero_internal(_Den._Word[1]);
        const bool _Three_word_den = __d >= 32;
        __d &= 31;
        uint32_t __u[5]{
@ -597,11 +603,7 @@ struct
        // _STL_INTERNAL_CHECK(_Den._Word[1] != 0);
        // _STL_INTERNAL_CHECK(_Num._Word[1] > _Den._Word[1]);
        // Normalize by shifting both left until _Den's high bit is set (So _Den's high digit is >= b / 2)
-#if _HAS_CXX20
-        const auto __d = _STD countl_zero(_Den._Word[1]);
-#else // ^^^ _HAS_CXX20 / !_HAS_CXX20 vvv
-        const auto __d = _Countl_zero_fallback(_Den._Word[1]);
-#endif // ^^^ !_HAS_CXX20 ^^^
+        const auto __d = _Countl_zero_internal(_Den._Word[1]);
        _Den <<= __d;
        auto _High_digit = __d == 0 ? 0 : _Num._Word[1] >> (64 - __d); // This creates a third digit for _Num
        _Num <<= __d;
@ -648,11 +650,7 @@ struct
            (void) _AddCarry64(_Carry, _Num._Word[1], _Den._Word[1], _Num._Word[1]);
        }
 #else // ^^^ 128-bit intrinsics / no such intrinsics vvv
-#if _HAS_CXX20
-        auto __d                   = _STD countl_zero(_Den._Word[1]);
-#else // ^^^ _HAS_CXX20 / !_HAS_CXX20 vvv
-        auto __d = _Countl_zero_fallback(_Den._Word[1]);
-#endif // ^^^ !_HAS_CXX20 ^^^
+        auto __d                   = _Countl_zero_internal(_Den._Word[1]);
        const bool _Three_word_den = __d >= 32;
        __d &= 31;
        uint32_t __u[5]{
--- a/stl/inc/bit
+++ b/stl/inc/bit
@ -12,9 +12,8 @@
 _EMIT_STL_WARNING(STL4038, "The contents of <bit> are available only with C++20 or later.");
 #else // ^^^ !_HAS_CXX20 / _HAS_CXX20 vvv

+#include <__msvc_bit_utils.hpp>
 #include <cstdlib>
-#include <isa_availability.h>
-#include <limits>
 #include <type_traits>

 #include _STL_INTRIN_HEADER
@ -95,7 +94,7 @@ _NODISCARD constexpr _Ty bit_ceil(const _Ty _Val) noexcept /* strengthened */ {
        return _Ty{1};
    }

-    const int _Num = numeric_limits<_Ty>::digits - _STD countl_zero(static_cast<_Ty>(_Val - 1));
+    const int _Num = _Unsigned_integer_digits<_Ty> - _STD countl_zero(static_cast<_Ty>(_Val - 1));

    if constexpr (sizeof(_Ty) < sizeof(unsigned int)) { // for types subject to integral promotion
        if (_STD is_constant_evaluated()) {
@ -108,7 +107,7 @@ _NODISCARD constexpr _Ty bit_ceil(const _Ty _Val) noexcept /* strengthened */ {
            // "Preconditions: N is representable as a value of type T."
            // "Remarks: A function call expression that violates the precondition in the Preconditions: element
            // is not a core constant expression (7.7)."
-            if (_Num == numeric_limits<_Ty>::digits) {
+            if (_Num == _Unsigned_integer_digits<_Ty>) {
                _Precondition_violation_in_bit_ceil();
            }
        }
@ -123,12 +122,12 @@ _NODISCARD constexpr _Ty bit_floor(const _Ty _Val) noexcept {
        return 0;
    }

-    return static_cast<_Ty>(_Ty{1} << (numeric_limits<_Ty>::digits - 1 - _STD countl_zero(_Val)));
+    return static_cast<_Ty>(_Ty{1} << (_Unsigned_integer_digits<_Ty> - 1 - _STD countl_zero(_Val)));
 }

 _EXPORT_STD template <class _Ty, enable_if_t<_Is_standard_unsigned_integer<_Ty>, int> = 0>
 _NODISCARD constexpr int bit_width(const _Ty _Val) noexcept {
-    return numeric_limits<_Ty>::digits - _STD countl_zero(_Val);
+    return _Unsigned_integer_digits<_Ty> - _STD countl_zero(_Val);
 }

 _EXPORT_STD template <class _Ty, enable_if_t<_Is_standard_unsigned_integer<_Ty>, int> = 0>
@ -136,7 +135,7 @@ _NODISCARD constexpr _Ty rotr(_Ty _Val, int _Rotation) noexcept;

 _EXPORT_STD template <class _Ty, enable_if_t<_Is_standard_unsigned_integer<_Ty>, int> = 0>
 _NODISCARD constexpr _Ty rotl(const _Ty _Val, const int _Rotation) noexcept {
-    constexpr auto _Digits = numeric_limits<_Ty>::digits;
+    constexpr auto _Digits = _Unsigned_integer_digits<_Ty>;

    if (!_STD is_constant_evaluated()) {
        if constexpr (_Digits == 64) {
@ -164,7 +163,7 @@ _NODISCARD constexpr _Ty rotl(const _Ty _Val, const int _Rotation) noexcept {

 _EXPORT_STD template <class _Ty, enable_if_t<_Is_standard_unsigned_integer<_Ty>, int> /* = 0 */>
 _NODISCARD constexpr _Ty rotr(const _Ty _Val, const int _Rotation) noexcept {
-    constexpr auto _Digits = numeric_limits<_Ty>::digits;
+    constexpr auto _Digits = _Unsigned_integer_digits<_Ty>;

    if (!_STD is_constant_evaluated()) {
        if constexpr (_Digits == 64) {
@ -190,124 +189,9 @@ _NODISCARD constexpr _Ty rotr(const _Ty _Val, const int _Rotation) noexcept {
    }
 }

-#if defined(_M_IX86) || (defined(_M_X64) && !defined(_M_ARM64EC))
-
-extern "C" {
-extern int __isa_available;
-}
-
-template <class _Ty>
-_NODISCARD int _Countl_zero_lzcnt(const _Ty _Val) noexcept {
-    constexpr int _Digits = numeric_limits<_Ty>::digits;
-
-    if constexpr (_Digits <= 16) {
-        return static_cast<int>(__lzcnt16(_Val) - (16 - _Digits));
-    } else if constexpr (_Digits == 32) {
-        return static_cast<int>(__lzcnt(_Val));
-    } else {
-#ifdef _M_IX86
-        const unsigned int _High = _Val >> 32;
-        const auto _Low          = static_cast<unsigned int>(_Val);
-        if (_High == 0) {
-            return 32 + _Countl_zero_lzcnt(_Low);
-        } else {
-            return _Countl_zero_lzcnt(_High);
-        }
-#else // ^^^ _M_IX86 / !_M_IX86 vvv
-        return static_cast<int>(__lzcnt64(_Val));
-#endif // _M_IX86
-    }
-}
-
-template <class _Ty>
-_NODISCARD int _Countl_zero_bsr(const _Ty _Val) noexcept {
-    constexpr int _Digits = numeric_limits<_Ty>::digits;
-
-    unsigned long _Result;
-    if constexpr (_Digits <= 32) {
-        if (!_BitScanReverse(&_Result, _Val)) {
-            return _Digits;
-        }
-    } else {
-#ifdef _M_IX86
-        const unsigned int _High = _Val >> 32;
-        if (_BitScanReverse(&_Result, _High)) {
-            return static_cast<int>(31 - _Result);
-        }
-
-        const auto _Low = static_cast<unsigned int>(_Val);
-        if (!_BitScanReverse(&_Result, _Low)) {
-            return _Digits;
-        }
-#else // ^^^ _M_IX86 / !_M_IX86 vvv
-        if (!_BitScanReverse64(&_Result, _Val)) {
-            return _Digits;
-        }
-#endif // _M_IX86
-    }
-    return static_cast<int>(_Digits - 1 - _Result);
-}
-
-template <class _Ty>
-_NODISCARD int _Checked_x86_x64_countl_zero(const _Ty _Val) noexcept {
-#ifdef __AVX2__
-    return _Countl_zero_lzcnt(_Val);
-#else // __AVX2__
-    const bool _Definitely_have_lzcnt = __isa_available >= __ISA_AVAILABLE_AVX2;
-    if (_Definitely_have_lzcnt) {
-        return _Countl_zero_lzcnt(_Val);
-    } else {
-        return _Countl_zero_bsr(_Val);
-    }
-#endif // __AVX2__
-}
-#endif // defined(_M_IX86) || (defined(_M_X64) && !defined(_M_ARM64EC))
-
-#if defined(_M_ARM) || defined(_M_ARM64)
-#ifdef __clang__ // TRANSITION, GH-1586
-_NODISCARD constexpr int _Clang_arm_arm64_countl_zero(const unsigned short _Val) {
-    return __builtin_clzs(_Val);
-}
-
-_NODISCARD constexpr int _Clang_arm_arm64_countl_zero(const unsigned int _Val) {
-    return __builtin_clz(_Val);
-}
-
-_NODISCARD constexpr int _Clang_arm_arm64_countl_zero(const unsigned long _Val) {
-    return __builtin_clzl(_Val);
-}
-
-_NODISCARD constexpr int _Clang_arm_arm64_countl_zero(const unsigned long long _Val) {
-    return __builtin_clzll(_Val);
-}
-#endif // TRANSITION, GH-1586
-
-template <class _Ty>
-_NODISCARD int _Checked_arm_arm64_countl_zero(const _Ty _Val) noexcept {
-    constexpr int _Digits = numeric_limits<_Ty>::digits;
-    if (_Val == 0) {
-        return _Digits;
-    }
-
-#ifdef __clang__ // TRANSITION, GH-1586
-    if constexpr (is_same_v<remove_cv_t<_Ty>, unsigned char>) {
-        return _Clang_arm_arm64_countl_zero(static_cast<unsigned short>(_Val))
-             - (numeric_limits<unsigned short>::digits - _Digits);
-    } else {
-        return _Clang_arm_arm64_countl_zero(_Val);
-    }
-#else // ^^^ workaround / no workaround vvv
-    if constexpr (_Digits <= 32) {
-        return static_cast<int>(_CountLeadingZeros(_Val)) - (numeric_limits<unsigned long>::digits - _Digits);
-    } else {
-        return static_cast<int>(_CountLeadingZeros64(_Val));
-    }
-#endif // TRANSITION, GH-1586
-}
-#endif // defined(_M_ARM) || defined(_M_ARM64)
-
 _EXPORT_STD template <class _Ty, enable_if_t<_Is_standard_unsigned_integer<_Ty>, int> /* = 0 */>
 _NODISCARD constexpr int countl_zero(const _Ty _Val) noexcept {
+#if _HAS_COUNTL_ZERO_INTRINSICS
 #if defined(_M_IX86) || (defined(_M_X64) && !defined(_M_ARM64EC))
    if (!_STD is_constant_evaluated()) {
        return _Checked_x86_x64_countl_zero(_Val);
@ -317,6 +201,7 @@ _NODISCARD constexpr int countl_zero(const _Ty _Val) noexcept {
        return _Checked_arm_arm64_countl_zero(_Val);
    }
 #endif // defined(_M_ARM) || defined(_M_ARM64)
+#endif // _HAS_COUNTL_ZERO_INTRINSICS

    return _Countl_zero_fallback(_Val);
 }
--- a/stl/inc/bitset
+++ b/stl/inc/bitset
@ -8,8 +8,8 @@
 #define _BITSET_
 #include <yvals_core.h>
 #if _STL_COMPILER_PREPROCESSOR
+#include <__msvc_bit_utils.hpp>
 #include <iosfwd>
-#include <limits>
 #include <xstring>

 #pragma pack(push, _CRT_PACKING)
--- a/stl/inc/complex
+++ b/stl/inc/complex
@ -26,9 +26,8 @@
 // TRANSITION, not using x86/x64 FMA intrinsics for Clang yet
 #elif defined(_M_IX86) || defined(_M_X64)
 #define _FMP_USING_X86_X64_INTRINSICS
+#include <__msvc_bit_utils.hpp>
 #include <emmintrin.h>
-#include <isa_availability.h>
-extern "C" int __isa_available;
 extern "C" __m128d __cdecl _mm_fmsub_sd(__m128d, __m128d, __m128d);
 #endif // ^^^ defined(_M_IX86) || defined(_M_X64) ^^^

@ -177,7 +176,7 @@ namespace _Float_multi_prec {
 #ifdef __AVX2__
        return {_Prod0, _Sqr_error_x86_x64_fma(_Xval, _Prod0)};
 #else // ^^^ defined(__AVX2__) / !defined(__AVX2__) vvv
-        const bool _Definitely_have_fma = __isa_available >= __ISA_AVAILABLE_AVX2;
+        const bool _Definitely_have_fma = __isa_available >= _Stl_isa_available_avx2;
        if (_Definitely_have_fma) {
            return {_Prod0, _Sqr_error_x86_x64_fma(_Xval, _Prod0)};
        } else {
--- a/stl/inc/header-units.json
+++ b/stl/inc/header-units.json
@ -5,6 +5,7 @@
    "Version": "1.0",
    "BuildAsHeaderUnits": [
        // "__msvc_all_public_headers.hpp", // for testing, not production
+        "__msvc_bit_utils.hpp",
        "__msvc_chrono.hpp",
        "__msvc_cxx_stdatomic.hpp",
        "__msvc_filebuf.hpp",
--- a/stl/inc/limits
+++ b/stl/inc/limits
@ -11,23 +11,10 @@
 #include <cfloat>
 #include <climits>
 #include <cwchar>
-#include <isa_availability.h>
 #include <xtr1common>

 #include _STL_INTRIN_HEADER

-// TRANSITION, GH-2129, move down to _Arm64_popcount
-#if (defined(_M_ARM64) || defined(_M_ARM64EC)) && !defined(_M_CEE_PURE) && !defined(__CUDACC__) \
-    && !defined(__INTEL_COMPILER) && !defined(__clang__) // TRANSITION, LLVM-51488
-#define _HAS_NEON_INTRINSICS 1
-#else // ^^^ intrinsics available / intrinsics unavailable vvv
-#define _HAS_NEON_INTRINSICS 0
-#endif // ^^^ intrinsics unavailable ^^^
-
-#if _HAS_NEON_INTRINSICS
-#include <arm64_neon.h> // TRANSITION, GH-2129
-#endif
-
 #pragma pack(push, _CRT_PACKING)
 #pragma warning(push, _STL_WARNING_LEVEL)
 #pragma warning(disable : _STL_DISABLED_WARNINGS)
@ -1006,284 +993,6 @@ public:
    static constexpr int min_exponent10 = LDBL_MIN_10_EXP;
 };

-// Implementation of countl_zero without using specialized CPU instructions.
-// Used at compile time and when said instructions are not supported.
-// see "Hacker's Delight" section 5-3
-template <class _Ty>
-_NODISCARD constexpr int _Countl_zero_fallback(_Ty _Val) noexcept {
-    _Ty _Yy = 0;
-
-    unsigned int _Nn = numeric_limits<_Ty>::digits;
-    unsigned int _Cc = numeric_limits<_Ty>::digits / 2;
-    do {
-        _Yy = static_cast<_Ty>(_Val >> _Cc);
-        if (_Yy != 0) {
-            _Nn -= _Cc;
-            _Val = _Yy;
-        }
-        _Cc >>= 1;
-    } while (_Cc != 0);
-    return static_cast<int>(_Nn) - static_cast<int>(_Val);
-}
-
-// Implementation of countr_zero without using specialized CPU instructions.
-// Used at compile time and when said instructions are not supported.
-// see "Hacker's Delight" section 5-4
-template <class _Ty>
-_NODISCARD constexpr int _Countr_zero_fallback(const _Ty _Val) noexcept {
-    constexpr int _Digits = numeric_limits<_Ty>::digits;
-    return _Digits - _Countl_zero_fallback(static_cast<_Ty>(static_cast<_Ty>(~_Val) & static_cast<_Ty>(_Val - 1)));
-}
-
-// Implementation of popcount without using specialized CPU instructions.
-// Used at compile time and when said instructions are not supported.
-template <class _Ty>
-_NODISCARD constexpr int _Popcount_fallback(_Ty _Val) noexcept {
-    constexpr int _Digits = numeric_limits<_Ty>::digits;
-#if defined(_M_IX86) || defined(_M_ARM)
-    if constexpr (_Digits == 64) {
-        // 64-bit bit operations on architectures without 64-bit registers are less efficient,
-        // hence we split the value so that it fits in 32-bit registers
-        return _Popcount_fallback(static_cast<unsigned long>(_Val))
-             + _Popcount_fallback(static_cast<unsigned long>(_Val >> 32));
-    }
-#endif // defined(_M_IX86) || defined(_M_ARM)
-    // we static_cast these bit patterns in order to truncate them to the correct size
-    _Val = static_cast<_Ty>(_Val - ((_Val >> 1) & static_cast<_Ty>(0x5555'5555'5555'5555ull)));
-    _Val = static_cast<_Ty>((_Val & static_cast<_Ty>(0x3333'3333'3333'3333ull))
-                            + ((_Val >> 2) & static_cast<_Ty>(0x3333'3333'3333'3333ull)));
-    _Val = static_cast<_Ty>((_Val + (_Val >> 4)) & static_cast<_Ty>(0x0F0F'0F0F'0F0F'0F0Full));
-    // Multiply by one in each byte, so that it will have the sum of all source bytes in the highest byte
-    _Val = static_cast<_Ty>(_Val * static_cast<_Ty>(0x0101'0101'0101'0101ull));
-    // Extract highest byte
-    return static_cast<int>(_Val >> (_Digits - 8));
-}
-
-#if (defined(_M_IX86) || (defined(_M_X64) && !defined(_M_ARM64EC))) && !defined(_M_CEE_PURE) && !defined(__CUDACC__) \
-    && !defined(__INTEL_COMPILER)
-#define _HAS_TZCNT_BSF_INTRINSICS 1
-#else // ^^^ intrinsics available / intrinsics unavailable vvv
-#define _HAS_TZCNT_BSF_INTRINSICS 0
-#endif // ^^^ intrinsics unavailable ^^^
-
-#if _HAS_TZCNT_BSF_INTRINSICS
-extern "C" {
-extern int __isa_available;
-}
-
-#ifdef __clang__
-#define _TZCNT_U32 __builtin_ia32_tzcnt_u32
-#define _TZCNT_U64 __builtin_ia32_tzcnt_u64
-#else // ^^^ __clang__ / !__clang__ vvv
-#define _TZCNT_U32 _tzcnt_u32
-#define _TZCNT_U64 _tzcnt_u64
-#endif // __clang__
-
-template <class _Ty>
-_NODISCARD int _Countr_zero_tzcnt(const _Ty _Val) noexcept {
-    constexpr int _Digits = numeric_limits<_Ty>::digits;
-    constexpr _Ty _Max    = (numeric_limits<_Ty>::max)();
-
-    if constexpr (_Digits <= 32) {
-        // Intended widening to int. This operation means that a narrow 0 will widen
-        // to 0xFFFF....FFFF0... instead of 0. We need this to avoid counting all the zeros
-        // of the wider type.
-        return static_cast<int>(_TZCNT_U32(static_cast<unsigned int>(~_Max | _Val)));
-    } else {
-#ifdef _M_IX86
-        const auto _Low = static_cast<unsigned int>(_Val);
-        if (_Low == 0) {
-            const unsigned int _High = _Val >> 32;
-            return static_cast<int>(32 + _TZCNT_U32(_High));
-        } else {
-            return static_cast<int>(_TZCNT_U32(_Low));
-        }
-#else // ^^^ _M_IX86 / !_M_IX86 vvv
-        return static_cast<int>(_TZCNT_U64(_Val));
-#endif // _M_IX86
-    }
-}
-
-#undef _TZCNT_U32
-#undef _TZCNT_U64
-
-template <class _Ty>
-_NODISCARD int _Countr_zero_bsf(const _Ty _Val) noexcept {
-    constexpr int _Digits = numeric_limits<_Ty>::digits;
-    constexpr _Ty _Max    = (numeric_limits<_Ty>::max)();
-
-    unsigned long _Result;
-    if constexpr (_Digits <= 32) {
-        // Intended widening to int. This operation means that a narrow 0 will widen
-        // to 0xFFFF....FFFF0... instead of 0. We need this to avoid counting all the zeros
-        // of the wider type.
-        if (!_BitScanForward(&_Result, static_cast<unsigned int>(~_Max | _Val))) {
-            return _Digits;
-        }
-    } else {
-#ifdef _M_IX86
-        const auto _Low = static_cast<unsigned int>(_Val);
-        if (_BitScanForward(&_Result, _Low)) {
-            return static_cast<int>(_Result);
-        }
-
-        const unsigned int _High = _Val >> 32;
-        if (!_BitScanForward(&_Result, _High)) {
-            return _Digits;
-        } else {
-            return static_cast<int>(_Result + 32);
-        }
-#else // ^^^ _M_IX86 / !_M_IX86 vvv
-        if (!_BitScanForward64(&_Result, _Val)) {
-            return _Digits;
-        }
-#endif // _M_IX86
-    }
-    return static_cast<int>(_Result);
-}
-
-template <class _Ty>
-_NODISCARD int _Checked_x86_x64_countr_zero(const _Ty _Val) noexcept {
-#ifdef __AVX2__
-    return _Countr_zero_tzcnt(_Val);
-#else // __AVX2__
-    const bool _Definitely_have_tzcnt = __isa_available >= __ISA_AVAILABLE_AVX2;
-    if (_Definitely_have_tzcnt) {
-        return _Countr_zero_tzcnt(_Val);
-    } else {
-        return _Countr_zero_bsf(_Val);
-    }
-#endif // __AVX2__
-}
-
-#endif // _HAS_TZCNT_BSF_INTRINSICS
-
-#if (defined(_M_IX86) || (defined(_M_X64) && !defined(_M_ARM64EC))) && !defined(_M_CEE_PURE) && !defined(__CUDACC__) \
-    && !defined(__INTEL_COMPILER)
-#define _HAS_POPCNT_INTRINSICS 1
-#else // ^^^ intrinsics available / intrinsics unavailable vvv
-#define _HAS_POPCNT_INTRINSICS 0
-#endif // ^^^ intrinsics unavailable ^^^
-
-#if _HAS_POPCNT_INTRINSICS
-template <class _Ty>
-_NODISCARD int _Unchecked_x86_x64_popcount(const _Ty _Val) noexcept {
-    constexpr int _Digits = numeric_limits<_Ty>::digits;
-    if constexpr (_Digits <= 16) {
-        return static_cast<int>(__popcnt16(_Val));
-    } else if constexpr (_Digits == 32) {
-        return static_cast<int>(__popcnt(_Val));
-    } else {
-#ifdef _M_IX86
-        return static_cast<int>(__popcnt(_Val >> 32) + __popcnt(static_cast<unsigned int>(_Val)));
-#else // ^^^ _M_IX86 / !_M_IX86 vvv
-        return static_cast<int>(__popcnt64(_Val));
-#endif // _M_IX86
-    }
-}
-
-template <class _Ty>
-_NODISCARD int _Checked_x86_x64_popcount(const _Ty _Val) noexcept {
-#ifndef __AVX__
-    const bool _Definitely_have_popcnt = __isa_available >= __ISA_AVAILABLE_SSE42;
-    if (!_Definitely_have_popcnt) {
-        return _Popcount_fallback(_Val);
-    }
-#endif // !defined(__AVX__)
-    return _Unchecked_x86_x64_popcount(_Val);
-}
-#endif // _HAS_POPCNT_INTRINSICS
-
-#if _HAS_NEON_INTRINSICS
-_NODISCARD inline int _Arm64_popcount(const unsigned long long _Val) noexcept {
-    const __n64 _Temp = neon_cnt(__uint64ToN64_v(_Val));
-    return neon_addv8(_Temp).n8_i8[0];
-}
-#endif // _HAS_NEON_INTRINSICS
-
-template <class _Ty>
-constexpr bool _Is_standard_unsigned_integer =
-    _Is_any_of_v<remove_cv_t<_Ty>, unsigned char, unsigned short, unsigned int, unsigned long, unsigned long long>;
-
-template <class _Ty, enable_if_t<_Is_standard_unsigned_integer<_Ty>, int> = 0>
-_NODISCARD _CONSTEXPR20 int _Countr_zero(const _Ty _Val) noexcept {
-#if _HAS_TZCNT_BSF_INTRINSICS
-#if _HAS_CXX20
-    if (!_STD is_constant_evaluated())
-#endif // _HAS_CXX20
-    {
-        return _Checked_x86_x64_countr_zero(_Val);
-    }
-#endif // _HAS_TZCNT_BSF_INTRINSICS
-    return _Countr_zero_fallback(_Val);
-}
-
-template <class _Ty, class _Fn>
-constexpr decltype(auto) _Select_countr_zero_impl(_Fn _Callback) {
-    // TRANSITION, DevCom-1527995: Lambdas in this function ensure inlining
-#if _HAS_TZCNT_BSF_INTRINSICS && _HAS_CXX20
-    if (!_STD is_constant_evaluated()) {
-#ifdef __AVX2__
-        return _Callback([](_Ty _Val) { return _Countr_zero_tzcnt(_Val); });
-#else // ^^^ AVX2 / not AVX2 vvv
-        const bool _Definitely_have_tzcnt = __isa_available >= __ISA_AVAILABLE_AVX2;
-        if (_Definitely_have_tzcnt) {
-            return _Callback([](_Ty _Val) { return _Countr_zero_tzcnt(_Val); });
-        } else {
-            return _Callback([](_Ty _Val) { return _Countr_zero_bsf(_Val); });
-        }
-#endif // ^^^ not AVX2 ^^^
-    }
-#endif // ^^^ _HAS_TZCNT_BSF_INTRINSICS && _HAS_CXX20 ^^^
-    // C++17 constexpr gcd() calls this function, so it should be constexpr unless we detect runtime evaluation.
-    return _Callback([](_Ty _Val) { return _Countr_zero_fallback(_Val); });
-}
-
-template <class _Ty, enable_if_t<_Is_standard_unsigned_integer<_Ty>, int> = 0>
-_NODISCARD _CONSTEXPR20 int _Popcount(const _Ty _Val) noexcept {
-#if _HAS_POPCNT_INTRINSICS || _HAS_NEON_INTRINSICS
-#if _HAS_CXX20
-    if (!_STD is_constant_evaluated())
-#endif // _HAS_CXX20
-    {
-#if _HAS_POPCNT_INTRINSICS
-        return _Checked_x86_x64_popcount(_Val);
-#elif _HAS_NEON_INTRINSICS // ^^^ x86/x64 intrinsics available / ARM64 intrinsics available vvv
-        return _Arm64_popcount(_Val);
-#endif // ^^^ ARM64 intrinsics available ^^^
-    }
-#endif // ^^^ any intrinsics available ^^^
-    return _Popcount_fallback(_Val);
-}
-
-template <class _Ty, class _Fn>
-_CONSTEXPR20 decltype(auto) _Select_popcount_impl(_Fn _Callback) {
-    // TRANSITION, DevCom-1527995: Lambdas in this function ensure inlining
-#if _HAS_POPCNT_INTRINSICS || _HAS_NEON_INTRINSICS
-#if _HAS_CXX20
-    if (!_STD is_constant_evaluated())
-#endif // _HAS_CXX20
-    {
-#if _HAS_POPCNT_INTRINSICS
-#ifndef __AVX__
-        const bool _Definitely_have_popcnt = __isa_available >= __ISA_AVAILABLE_SSE42;
-        if (!_Definitely_have_popcnt) {
-            return _Callback([](_Ty _Val) { return _Popcount_fallback(_Val); });
-        }
-#endif // !defined(__AVX__)
-        return _Callback([](_Ty _Val) { return _Unchecked_x86_x64_popcount(_Val); });
-#elif _HAS_NEON_INTRINSICS // ^^^ x86/x64 intrinsics available / ARM64 intrinsics available vvv
-        return _Callback([](_Ty _Val) { return _Arm64_popcount(_Val); });
-#endif // ^^^ ARM64 intrinsics available ^^^
-    }
-#endif // ^^^ any intrinsics available ^^^
-    return _Callback([](_Ty _Val) { return _Popcount_fallback(_Val); });
-}
-
-#undef _HAS_POPCNT_INTRINSICS
-#undef _HAS_TZCNT_BSF_INTRINSICS
-#undef _HAS_NEON_INTRINSICS
-
 _STD_END
 #pragma pop_macro("new")
 _STL_RESTORE_CLANG_WARNINGS
--- a/stl/inc/numeric
+++ b/stl/inc/numeric
@ -11,9 +11,13 @@
 #include <xutility>

 #if _HAS_CXX17
-#include <limits>
+#include <__msvc_bit_utils.hpp>
 #endif // _HAS_CXX17

+#if _HAS_CXX20
+#include <cfloat>
+#endif // _HAS_CXX20
+
 #pragma pack(push, _CRT_PACKING)
 #pragma warning(push, _STL_WARNING_LEVEL)
 #pragma warning(disable : _STL_DISABLED_WARNINGS)
@ -648,6 +652,24 @@ _NODISCARD constexpr common_type_t<_Mt, _Nt> lcm(const _Mt _Mx, const _Nt _Nx) n
 #endif // _HAS_CXX17

 #if _HAS_CXX20
+template <class _Flt>
+inline constexpr _Flt _Floating_max{}; // replaces (numeric_limits<_Flt>::max)() in midpoint(); specialized below
+template <>
+inline constexpr float _Floating_max<float> = FLT_MAX;
+template <>
+inline constexpr double _Floating_max<double> = DBL_MAX;
+template <>
+inline constexpr long double _Floating_max<long double> = LDBL_MAX;
+
+template <class _Flt>
+inline constexpr _Flt _Floating_min{}; // replaces (numeric_limits<_Flt>::min)() in midpoint(); specialized below
+template <>
+inline constexpr float _Floating_min<float> = FLT_MIN;
+template <>
+inline constexpr double _Floating_min<double> = DBL_MIN;
+template <>
+inline constexpr long double _Floating_min<long double> = LDBL_MIN;
+
 _EXPORT_STD template <class _Ty, enable_if_t<is_arithmetic_v<_Ty> && !is_same_v<remove_cv_t<_Ty>, bool>, int> = 0>
 _NODISCARD constexpr _Ty midpoint(const _Ty _Val1, const _Ty _Val2) noexcept {
    if constexpr (is_floating_point_v<_Ty>) {
@ -666,7 +688,7 @@ _NODISCARD constexpr _Ty midpoint(const _Ty _Val1, const _Ty _Val2) noexcept {
            }
        }

-        constexpr _Ty _High_limit = (numeric_limits<_Ty>::max)() / 2;
+        constexpr _Ty _High_limit = _Floating_max<remove_cv_t<_Ty>> / 2;
        const auto _Val1_a        = _Float_abs(_Val1);
        const auto _Val2_a        = _Float_abs(_Val2);
        if (_Val1_a <= _High_limit && _Val2_a <= _High_limit) {
@ -690,7 +712,7 @@ _NODISCARD constexpr _Ty midpoint(const _Ty _Val1, const _Ty _Val2) noexcept {

        // In the default rounding mode this less than one ULP difference will always be rounded away, so under
        // /fp:fast we could avoid these tests if we had some means of detecting it in the caller.
-        constexpr _Ty _Low_limit = (numeric_limits<_Ty>::min)() * 2;
+        constexpr _Ty _Low_limit = _Floating_min<remove_cv_t<_Ty>> * 2;
        if (_Val1_a < _Low_limit) {
            return _Val1 + _Val2 / 2;
        }
--- a/stl/inc/random
+++ b/stl/inc/random
@ -2169,7 +2169,7 @@ _NODISCARD _Flt _Float_upper_bound(_Ty _Val) {
        constexpr auto _Mask = static_cast<_Ty>(-1) << (_Ty_digits - _Flt_digits);
 #ifdef _M_CEE_PURE
        constexpr auto _Ty_32or64_digits = numeric_limits<_Ty_32or64>::digits;
-        const auto _Log_plus1 = _Ty_32or64_digits - _Countl_zero_fallback(static_cast<_Ty_32or64>(_Val | _Ty{1}));
+        const auto _Log_plus1 = _Ty_32or64_digits - _Countl_zero_internal(static_cast<_Ty_32or64>(_Val | _Ty{1}));
 #else // _M_CEE_PURE
        const auto _Log_plus1 = _Bit_scan_reverse(static_cast<_Ty_32or64>(_Val | _Ty{1}));
 #endif // _M_CEE_PURE
--- a/stl/inc/ranges
+++ b/stl/inc/ranges
@ -21,6 +21,7 @@ _EMIT_STL_WARNING(STL4038, "The contents of <ranges> are available only with C++

 #if _HAS_CXX23
 #include <array>
+#include <bit>
 #endif // _HAS_CXX23

 #pragma pack(push, _CRT_PACKING)
--- a/stl/inc/vector
+++ b/stl/inc/vector
@ -8,6 +8,7 @@
 #define _VECTOR_
 #include <yvals_core.h>
 #if _STL_COMPILER_PREPROCESSOR
+#include <__msvc_bit_utils.hpp>
 #include <xmemory>

 #if _HAS_CXX17
--- a/tests/std/tests/GH_001411_core_headers/test.cpp
+++ b/tests/std/tests/GH_001411_core_headers/test.cpp
@ -19,11 +19,24 @@
 #include <xfilesystem_abi.h>
 #endif // _HAS_CXX17

+#if _HAS_CXX23
+#include <__msvc_print.hpp>
+#endif // _HAS_CXX23
+
+// <__msvc_bit_utils.hpp> is included by <bit>
 // <__msvc_iter_core.hpp> is included by <tuple>
 // <xkeycheck.h> should not be included outside of <yvals_core.h>
 // <xtr1common> is included by <cstddef>
 // <yvals_core.h> is included by every public core header

+// Also test GH-3692 "Including <isa_availability.h> emits a non-reserved name"
+#include <isa_availability.h>
+
+#define STATIC_ASSERT(...) static_assert(__VA_ARGS__, #__VA_ARGS__)
+
+STATIC_ASSERT(std::_Stl_isa_available_sse42 == __ISA_AVAILABLE_SSE42);
+STATIC_ASSERT(std::_Stl_isa_available_avx2 == __ISA_AVAILABLE_AVX2);
+
 #ifdef _YVALS
 #error Core headers should not include <yvals.h>.
 #endif
--- a/tests/std/tests/GH_002206_unreserved_names/test.compile.pass.cpp
+++ b/tests/std/tests/GH_002206_unreserved_names/test.compile.pass.cpp
@ -1,9 +1,10 @@
 // Copyright (c) Microsoft Corporation.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

-#define nsec      delete
-#define sec       delete
-#define xtime     delete
-#define xtime_get delete
+#define ISA_AVAILABILITY delete
+#define nsec             delete
+#define sec              delete
+#define xtime            delete
+#define xtime_get        delete

 #include <__msvc_all_public_headers.hpp>
--- a/tests/std/tests/GH_002431_byte_range_find_with_unreachable_sentinel/test.cpp
+++ b/tests/std/tests/GH_002431_byte_range_find_with_unreachable_sentinel/test.cpp
@ -4,6 +4,7 @@
 #include <algorithm>
 #include <cassert>
 #include <cstddef>
+#include <isa_availability.h>
 #include <ranges>

 #include <Windows.h>