Bug 1786069 - Update libjxl and highway r=tnikkel

Differential Revision: https://phabricator.services.mozilla.com/D155272
Kagami Sascha Rosylight 2022-08-23 12:31:34 +00:00
Parent ddc22eb2ef
Commit ca5565dea6
251 changed files with 23464 additions and 9872 deletions


@ -11,6 +11,7 @@ LOCAL_INCLUDES += [
SOURCES += [
"/third_party/highway/hwy/aligned_allocator.cc",
"/third_party/highway/hwy/contrib/image/image.cc",
"/third_party/highway/hwy/per_target.cc",
"/third_party/highway/hwy/targets.cc",
]
@ -29,6 +30,7 @@ EXPORTS.hwy += [
EXPORTS.hwy.ops += [
"/third_party/highway/hwy/ops/arm_neon-inl.h",
"/third_party/highway/hwy/ops/arm_sve-inl.h",
"/third_party/highway/hwy/ops/emu128-inl.h",
"/third_party/highway/hwy/ops/generic_ops-inl.h",
"/third_party/highway/hwy/ops/rvv-inl.h",
"/third_party/highway/hwy/ops/scalar-inl.h",


@ -20,11 +20,11 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: commit f13e3b956eb226561ac79427893ec0afd66f91a8 (2022-02-15T18:19:21Z).
release: 7f2e26854086fba4255220fd6c77e9141f1f87cc
# Revision to pull in
# Must be a long or short commit SHA (long preferred)
revision: f13e3b956eb226561ac79427893ec0afd66f91a8
revision: 7f2e26854086fba4255220fd6c77e9141f1f87cc
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/


@ -49,7 +49,6 @@ SOURCES += [
"/third_party/jpeg-xl/lib/jxl/enc_bit_writer.cc",
"/third_party/jpeg-xl/lib/jxl/entropy_coder.cc",
"/third_party/jpeg-xl/lib/jxl/epf.cc",
"/third_party/jpeg-xl/lib/jxl/exif.cc",
"/third_party/jpeg-xl/lib/jxl/fast_dct.cc",
"/third_party/jpeg-xl/lib/jxl/fields.cc",
"/third_party/jpeg-xl/lib/jxl/frame_header.cc",


@ -10,9 +10,9 @@ origin:
url: https://github.com/libjxl/libjxl
release: 59f7d19e454bc5e1edd692187e1178f5926fdfd9 (2022-07-28T11:13:01Z).
release: bb8eac5d6acec223e44cf8cc72ae02f0816de311
revision: 59f7d19e454bc5e1edd692187e1178f5926fdfd9
revision: bb8eac5d6acec223e44cf8cc72ae02f0816de311
license: Apache-2.0

third_party/highway/BUILD (vendored), 116 changed lines

@ -1,7 +1,9 @@
load("@bazel_skylib//lib:selects.bzl", "selects")
load("@rules_cc//cc:defs.bzl", "cc_test")
package(default_visibility = ["//visibility:public"])
package(
default_visibility = ["//visibility:public"],
)
licenses(["notice"])
@ -14,10 +16,33 @@ config_setting(
)
config_setting(
name = "compiler_msvc",
name = "compiler_clangcl",
flag_values = {"@bazel_tools//tools/cpp:compiler": "lexan"},
)
config_setting(
name = "compiler_msvc_actual",
flag_values = {"@bazel_tools//tools/cpp:compiler": "msvc"},
)
# The above is insufficient for Bazel on Windows, which does not seem to
# detect/set a compiler flag. This workaround prevents compile errors due to
# passing clang-only warning flags to MSVC.
config_setting(
name = "compiler_msvc_cpu",
values = {
"cpu": "x64_windows",
},
)
selects.config_setting_group(
name = "compiler_msvc",
match_any = [
":compiler_msvc_actual",
":compiler_msvc_cpu",
],
)
config_setting(
name = "compiler_emscripten",
values = {"cpu": "wasm32"},
@ -54,8 +79,8 @@ CLANG_GCC_COPTS = [
"-Wunreachable-code",
]
# Additional warnings only supported by Clang
CLANG_ONLY_COPTS = [
# Warnings supported by Clang and Clang-cl
CLANG_OR_CLANGCL_OPTS = CLANG_GCC_COPTS + [
"-Wfloat-overflow-conversion",
"-Wfloat-zero-conversion",
"-Wfor-loop-analysis",
@ -73,11 +98,19 @@ CLANG_ONLY_COPTS = [
"-Wunused-comparison",
]
# Warnings only supported by Clang, but not Clang-cl
CLANG_ONLY_COPTS = CLANG_OR_CLANGCL_OPTS + [
# Do not treat the third_party headers as system headers when building
# highway - the errors are pertinent.
"--no-system-header-prefix=third_party/highway",
]
COPTS = select({
":compiler_msvc": [],
":compiler_gcc": CLANG_GCC_COPTS,
":compiler_clangcl": CLANG_OR_CLANGCL_OPTS,
# Default to clang because compiler detection only works in Bazel
"//conditions:default": CLANG_GCC_COPTS + CLANG_ONLY_COPTS,
"//conditions:default": CLANG_ONLY_COPTS,
}) + select({
"@platforms//cpu:riscv64": [
"-march=rv64gcv1p0",
@ -87,6 +120,12 @@ COPTS = select({
],
})
DEFINES = select({
":compiler_msvc": ["HWY_SHARED_DEFINE"],
":compiler_clangcl": ["HWY_SHARED_DEFINE"],
"//conditions:default": [],
})
# Unused on Bazel builds, where this is not defined/known; Copybara replaces
# usages with an empty list.
COMPAT = [
@ -102,6 +141,8 @@ cc_library(
name = "hwy",
srcs = [
"hwy/aligned_allocator.cc",
"hwy/per_target.cc",
"hwy/print.cc",
"hwy/targets.cc",
],
# Normal headers with include guards
@ -110,10 +151,12 @@ cc_library(
"hwy/base.h",
"hwy/cache_control.h",
"hwy/detect_compiler_arch.h", # private
"hwy/highway_export.h",
"hwy/print.h",
],
compatible_with = [],
copts = COPTS,
defines = DEFINES,
local_defines = ["hwy_EXPORTS"],
textual_hdrs = [
# These are textual because config macros influence them:
"hwy/detect_targets.h", # private
@ -121,8 +164,12 @@ cc_library(
# End of list
"hwy/highway.h", # public
"hwy/foreach_target.h", # public
"hwy/per_target.h", # public
"hwy/print-inl.h", # public
"hwy/highway_export.h", # public
"hwy/ops/arm_neon-inl.h",
"hwy/ops/arm_sve-inl.h",
"hwy/ops/emu128-inl.h",
"hwy/ops/generic_ops-inl.h",
"hwy/ops/scalar-inl.h",
"hwy/ops/set_macros-inl.h",
@ -140,9 +187,24 @@ cc_library(
}),
)
cc_library(
name = "algo",
compatible_with = [],
copts = COPTS,
textual_hdrs = [
"hwy/contrib/algo/copy-inl.h",
"hwy/contrib/algo/find-inl.h",
"hwy/contrib/algo/transform-inl.h",
],
deps = [
":hwy",
],
)
cc_library(
name = "dot",
compatible_with = [],
copts = COPTS,
textual_hdrs = [
"hwy/contrib/dot/dot-inl.h",
],
@ -160,6 +222,8 @@ cc_library(
"hwy/contrib/image/image.h",
],
compatible_with = [],
copts = COPTS,
local_defines = ["hwy_contrib_EXPORTS"],
deps = [
":hwy",
],
@ -168,6 +232,7 @@ cc_library(
cc_library(
name = "math",
compatible_with = [],
copts = COPTS,
textual_hdrs = [
"hwy/contrib/math/math-inl.h",
],
@ -181,6 +246,9 @@ cc_library(
name = "hwy_test_util",
srcs = ["hwy/tests/test_util.cc"],
hdrs = ["hwy/tests/test_util.h"],
compatible_with = [],
copts = COPTS,
local_defines = ["hwy_test_EXPORTS"],
textual_hdrs = [
"hwy/tests/test_util-inl.h",
"hwy/tests/hwy_gtest.h",
@ -196,12 +264,16 @@ cc_library(
name = "nanobenchmark",
srcs = ["hwy/nanobenchmark.cc"],
hdrs = ["hwy/nanobenchmark.h"],
compatible_with = [],
copts = COPTS,
local_defines = ["hwy_EXPORTS"],
deps = [":hwy"],
)
cc_binary(
name = "benchmark",
srcs = ["hwy/examples/benchmark.cc"],
copts = COPTS,
deps = [
":hwy",
":nanobenchmark",
@ -212,6 +284,8 @@ cc_library(
name = "skeleton",
srcs = ["hwy/examples/skeleton.cc"],
hdrs = ["hwy/examples/skeleton.h"],
copts = COPTS,
local_defines = ["hwy_EXPORTS"],
textual_hdrs = ["hwy/examples/skeleton-inl.h"],
deps = [
":hwy",
@ -226,6 +300,9 @@ cc_binary(
# path, name
HWY_TESTS = [
("hwy/contrib/algo/", "copy_test"),
("hwy/contrib/algo/", "find_test"),
("hwy/contrib/algo/", "transform_test"),
("hwy/contrib/dot/", "dot_test"),
("hwy/contrib/image/", "image_test"),
("hwy/contrib/math/", "math_test"),
@ -238,20 +315,40 @@ HWY_TESTS = [
("hwy/", "targets_test"),
("hwy/tests/", "arithmetic_test"),
("hwy/tests/", "blockwise_test"),
("hwy/tests/", "blockwise_shift_test"),
("hwy/tests/", "combine_test"),
("hwy/tests/", "compare_test"),
("hwy/tests/", "compress_test"),
("hwy/tests/", "convert_test"),
("hwy/tests/", "crypto_test"),
("hwy/tests/", "demote_test"),
("hwy/tests/", "float_test"),
("hwy/tests/", "if_test"),
("hwy/tests/", "interleaved_test"),
("hwy/tests/", "logical_test"),
("hwy/tests/", "mask_test"),
("hwy/tests/", "mask_mem_test"),
("hwy/tests/", "memory_test"),
("hwy/tests/", "mul_test"),
("hwy/tests/", "reduction_test"),
("hwy/tests/", "reverse_test"),
("hwy/tests/", "shift_test"),
("hwy/tests/", "swizzle_test"),
("hwy/tests/", "test_util_test"),
]
HWY_TEST_COPTS = select({
":compiler_msvc": [],
"//conditions:default": [
# gTest triggers this warning (which is enabled by the
# extra-semi in COPTS), so we need to disable it here,
# but it's still enabled for :hwy.
"-Wno-c++98-compat-extra-semi",
],
})
HWY_TEST_DEPS = [
":algo",
":dot",
":hwy",
":hwy_test_util",
@ -272,12 +369,7 @@ HWY_TEST_DEPS = [
srcs = [
subdir + test + ".cc",
],
copts = COPTS + [
# gTest triggers this warning (which is enabled by the
# extra-semi in COPTS), so we need to disable it here,
# but it's still enabled for :hwy.
"-Wno-c++98-compat-extra-semi",
],
copts = COPTS + HWY_TEST_COPTS,
features = select({
"@platforms//cpu:riscv64": ["fully_static_link"],
"//conditions:default": [],

third_party/highway/CMakeLists.txt (vendored), 187 changed lines

@ -19,7 +19,7 @@ if(POLICY CMP0083)
cmake_policy(SET CMP0083 NEW)
endif()
project(hwy VERSION 0.16.0) # Keep in sync with highway.h version
project(hwy VERSION 1.0.0) # Keep in sync with highway.h version
# Directly define the ABI version from the cmake project() version values:
set(LIBRARY_VERSION "${hwy_VERSION}")
@ -48,6 +48,7 @@ set(HWY_CMAKE_ARM7 OFF CACHE BOOL "Set copts for ARMv7 with NEON (requires vfpv4
# arise due to compiler/platform changes. Enable this in CI/tests.
set(HWY_WARNINGS_ARE_ERRORS OFF CACHE BOOL "Add -Werror flag?")
set(HWY_ENABLE_CONTRIB ON CACHE BOOL "Include contrib/")
set(HWY_ENABLE_EXAMPLES ON CACHE BOOL "Build examples")
set(HWY_ENABLE_INSTALL ON CACHE BOOL "Install library")
@ -62,12 +63,25 @@ check_cxx_source_compiles(
HWY_EMSCRIPTEN
)
set(HWY_CONTRIB_SOURCES
check_cxx_source_compiles(
"int main() {
#if !defined(__riscv)
static_assert(false, \"__riscv is not defined\");
#endif
return 0;
}"
HWY_RISCV
)
if (HWY_ENABLE_CONTRIB)
# Glob all the traits so we don't need to modify this file when adding
# additional special cases.
file(GLOB HWY_CONTRIB_SOURCES "hwy/contrib/sort/vqsort_*.cc")
list(APPEND HWY_CONTRIB_SOURCES
hwy/contrib/dot/dot-inl.h
hwy/contrib/image/image.cc
hwy/contrib/image/image.h
hwy/contrib/math/math-inl.h
hwy/contrib/sort/disabled_targets.h
hwy/contrib/sort/shared-inl.h
hwy/contrib/sort/sorting_networks-inl.h
hwy/contrib/sort/traits-inl.h
@ -75,25 +89,8 @@ set(HWY_CONTRIB_SOURCES
hwy/contrib/sort/vqsort-inl.h
hwy/contrib/sort/vqsort.cc
hwy/contrib/sort/vqsort.h
hwy/contrib/sort/vqsort_128a.cc
hwy/contrib/sort/vqsort_128d.cc
hwy/contrib/sort/vqsort_f32a.cc
hwy/contrib/sort/vqsort_f32d.cc
hwy/contrib/sort/vqsort_f64a.cc
hwy/contrib/sort/vqsort_f64d.cc
hwy/contrib/sort/vqsort_i16a.cc
hwy/contrib/sort/vqsort_i16d.cc
hwy/contrib/sort/vqsort_i32a.cc
hwy/contrib/sort/vqsort_i32d.cc
hwy/contrib/sort/vqsort_i64a.cc
hwy/contrib/sort/vqsort_i64d.cc
hwy/contrib/sort/vqsort_u16a.cc
hwy/contrib/sort/vqsort_u16d.cc
hwy/contrib/sort/vqsort_u32a.cc
hwy/contrib/sort/vqsort_u32d.cc
hwy/contrib/sort/vqsort_u64a.cc
hwy/contrib/sort/vqsort_u64d.cc
)
endif() # HWY_ENABLE_CONTRIB
set(HWY_SOURCES
hwy/aligned_allocator.cc
@ -109,6 +106,7 @@ set(HWY_SOURCES
hwy/nanobenchmark.h
hwy/ops/arm_neon-inl.h
hwy/ops/arm_sve-inl.h
hwy/ops/emu128-inl.h
hwy/ops/generic_ops-inl.h
hwy/ops/scalar-inl.h
hwy/ops/set_macros-inl.h
@ -117,6 +115,11 @@ set(HWY_SOURCES
hwy/ops/x86_128-inl.h
hwy/ops/x86_256-inl.h
hwy/ops/x86_512-inl.h
hwy/per_target.cc
hwy/per_target.h
hwy/print-inl.h
hwy/print.cc
hwy/print.h
hwy/targets.cc
hwy/targets.h
)
@ -129,7 +132,10 @@ set(HWY_TEST_SOURCES
)
if (MSVC)
# TODO(janwas): add flags
set(HWY_FLAGS
# fix build error C1128 in blockwise*_test & arithmetic_test
/bigobj
)
else()
set(HWY_FLAGS
# Avoid changing binaries based on the current time and date.
@ -215,10 +221,23 @@ else()
)
endif() # HWY_CMAKE_ARM7
if(HWY_RISCV)
list(APPEND HWY_FLAGS -march=rv64gcv1p0)
if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
list(APPEND HWY_FLAGS -menable-experimental-extensions)
endif()
endif()
if (HWY_WARNINGS_ARE_ERRORS)
list(APPEND HWY_FLAGS -Werror)
endif()
# Prevent "wasm-ld: error: --shared-memory is disallowed by targets.cc.o
# because it was not compiled with 'atomics' or 'bulk-memory' features."
if (HWY_EMSCRIPTEN)
list(APPEND HWY_FLAGS -matomics)
endif()
endif() # !MSVC
# By default prefer STATIC build (legacy behavior)
@ -249,31 +268,57 @@ target_compile_definitions(hwy PUBLIC "${DLLEXPORT_TO_DEFINE}")
target_compile_options(hwy PRIVATE ${HWY_FLAGS})
set_property(TARGET hwy PROPERTY POSITION_INDEPENDENT_CODE ON)
set_target_properties(hwy PROPERTIES VERSION ${LIBRARY_VERSION} SOVERSION ${LIBRARY_SOVERSION})
target_include_directories(hwy PUBLIC ${CMAKE_CURRENT_LIST_DIR})
target_include_directories(hwy PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}>
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
target_compile_features(hwy PUBLIC cxx_std_11)
set_target_properties(hwy PROPERTIES
LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version)
# not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations)
if(UNIX AND NOT APPLE)
if(NOT HWY_EMSCRIPTEN)
# For GCC __atomic_store_8, see #887
target_link_libraries(hwy atomic)
endif()
# not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations)
set_property(TARGET hwy APPEND_STRING PROPERTY
LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version")
endif()
if (HWY_ENABLE_CONTRIB)
add_library(hwy_contrib ${HWY_LIBRARY_TYPE} ${HWY_CONTRIB_SOURCES})
target_link_libraries(hwy_contrib hwy)
target_compile_options(hwy_contrib PRIVATE ${HWY_FLAGS})
set_property(TARGET hwy_contrib PROPERTY POSITION_INDEPENDENT_CODE ON)
set_target_properties(hwy_contrib PROPERTIES VERSION ${LIBRARY_VERSION} SOVERSION ${LIBRARY_SOVERSION})
target_include_directories(hwy_contrib PUBLIC ${CMAKE_CURRENT_LIST_DIR})
target_include_directories(hwy_contrib PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}>
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
target_compile_features(hwy_contrib PUBLIC cxx_std_11)
set_target_properties(hwy_contrib PROPERTIES
LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version)
# not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations)
if(UNIX AND NOT APPLE)
set_property(TARGET hwy_contrib APPEND_STRING PROPERTY
LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version")
endif()
endif() # HWY_ENABLE_CONTRIB
add_library(hwy_test ${HWY_LIBRARY_TYPE} ${HWY_TEST_SOURCES})
target_link_libraries(hwy_test hwy)
target_compile_options(hwy_test PRIVATE ${HWY_FLAGS})
set_property(TARGET hwy_test PROPERTY POSITION_INDEPENDENT_CODE ON)
set_target_properties(hwy_test PROPERTIES VERSION ${LIBRARY_VERSION} SOVERSION ${LIBRARY_SOVERSION})
target_include_directories(hwy_test PUBLIC ${CMAKE_CURRENT_LIST_DIR})
target_include_directories(hwy_test PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}>
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
target_compile_features(hwy_test PUBLIC cxx_std_11)
set_target_properties(hwy_test PROPERTIES
LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version)
# not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations)
if(UNIX AND NOT APPLE)
set_property(TARGET hwy_test APPEND_STRING PROPERTY
LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version")
endif()
# -------------------------------------------------------- hwy_list_targets
# Generate a tool to print the compiled-in targets as defined by the current
@ -313,6 +358,7 @@ foreach (source ${HWY_SOURCES})
endif()
endforeach()
if (HWY_ENABLE_CONTRIB)
install(TARGETS hwy_contrib
LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
@ -326,6 +372,7 @@ foreach (source ${HWY_CONTRIB_SOURCES})
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${dirname}")
endif()
endforeach()
endif() # HWY_ENABLE_CONTRIB
install(TARGETS hwy_test
LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
@ -343,13 +390,17 @@ endforeach()
# Add a pkg-config file for libhwy and the contrib/test libraries.
set(HWY_LIBRARY_VERSION "${CMAKE_PROJECT_VERSION}")
foreach (pc libhwy.pc libhwy-contrib.pc libhwy-test.pc)
set(HWY_PC_FILES libhwy.pc libhwy-test.pc)
if (HWY_ENABLE_CONTRIB)
list(APPEND HWY_PC_FILES libhwy-contrib.pc)
endif() # HWY_ENABLE_CONTRIB
foreach (pc ${HWY_PC_FILES})
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/${pc}.in" "${pc}" @ONLY)
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${pc}"
DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
endforeach()
endif() # HWY_ENABLE_INSTALL
endif() # HWY_ENABLE_INSTALL
# -------------------------------------------------------- Examples
if (HWY_ENABLE_EXAMPLES)
@ -360,14 +411,14 @@ set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
add_executable(hwy_benchmark hwy/examples/benchmark.cc)
target_sources(hwy_benchmark PRIVATE
hwy/nanobenchmark.h)
# Try adding either -DHWY_COMPILE_ONLY_SCALAR or -DHWY_COMPILE_ONLY_STATIC to
# observe the difference in targets printed.
# Try adding one of -DHWY_COMPILE_ONLY_SCALAR, -DHWY_COMPILE_ONLY_EMU128 or
# -DHWY_COMPILE_ONLY_STATIC to observe the difference in targets printed.
target_compile_options(hwy_benchmark PRIVATE ${HWY_FLAGS})
target_link_libraries(hwy_benchmark hwy)
set_target_properties(hwy_benchmark
PROPERTIES RUNTIME_OUTPUT_DIRECTORY "examples/")
endif() # HWY_ENABLE_EXAMPLES
endif() # HWY_ENABLE_EXAMPLES
# -------------------------------------------------------- Tests
include(CTest)
@ -411,34 +462,62 @@ add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/googletest-src
if (CMAKE_VERSION VERSION_LESS 2.8.11)
include_directories("${gtest_SOURCE_DIR}/include")
endif()
endif() # HWY_SYSTEM_GTEST
endif() # HWY_SYSTEM_GTEST
set(HWY_TEST_FILES
hwy/contrib/algo/copy_test.cc
hwy/contrib/algo/find_test.cc
hwy/contrib/algo/transform_test.cc
hwy/aligned_allocator_test.cc
hwy/base_test.cc
hwy/highway_test.cc
hwy/nanobenchmark_test.cc
hwy/targets_test.cc
hwy/examples/skeleton_test.cc
hwy/tests/arithmetic_test.cc
hwy/tests/blockwise_test.cc
hwy/tests/blockwise_shift_test.cc
hwy/tests/combine_test.cc
hwy/tests/compare_test.cc
hwy/tests/compress_test.cc
hwy/tests/convert_test.cc
hwy/tests/crypto_test.cc
hwy/tests/demote_test.cc
hwy/tests/float_test.cc
hwy/tests/if_test.cc
hwy/tests/interleaved_test.cc
hwy/tests/logical_test.cc
hwy/tests/mask_test.cc
hwy/tests/mask_mem_test.cc
hwy/tests/memory_test.cc
hwy/tests/mul_test.cc
hwy/tests/reduction_test.cc
hwy/tests/reverse_test.cc
hwy/tests/shift_test.cc
hwy/tests/swizzle_test.cc
hwy/tests/test_util_test.cc
)
set(HWY_TEST_LIBS hwy hwy_test)
if (HWY_ENABLE_CONTRIB)
list(APPEND HWY_TEST_LIBS hwy_contrib)
list(APPEND HWY_TEST_FILES
hwy/contrib/dot/dot_test.cc
hwy/contrib/image/image_test.cc
# Disabled due to SIGILL in clang7 debug build during gtest discovery phase,
# not reproducible locally. Still tested via bazel build.
# hwy/contrib/math/math_test.cc
hwy/contrib/sort/sort_test.cc
hwy/aligned_allocator_test.cc
hwy/base_test.cc
hwy/highway_test.cc
hwy/targets_test.cc
hwy/examples/skeleton_test.cc
hwy/tests/arithmetic_test.cc
hwy/tests/blockwise_test.cc
hwy/tests/combine_test.cc
hwy/tests/compare_test.cc
hwy/tests/convert_test.cc
hwy/tests/crypto_test.cc
hwy/tests/demote_test.cc
hwy/tests/logical_test.cc
hwy/tests/mask_test.cc
hwy/tests/memory_test.cc
hwy/tests/shift_test.cc
hwy/tests/swizzle_test.cc
hwy/tests/test_util_test.cc
)
endif() # HWY_ENABLE_CONTRIB
if(HWY_SYSTEM_GTEST)
set(HWY_GTEST_LIBS GTest::GTest GTest::Main)
else()
set(HWY_GTEST_LIBS gtest gtest_main)
endif()
file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tests)
foreach (TESTFILE IN LISTS HWY_TEST_FILES)
@ -452,11 +531,7 @@ foreach (TESTFILE IN LISTS HWY_TEST_FILES)
# that include us may set them.
target_compile_options(${TESTNAME} PRIVATE -DHWY_IS_TEST=1)
if(HWY_SYSTEM_GTEST)
target_link_libraries(${TESTNAME} hwy hwy_contrib hwy_test GTest::GTest GTest::Main)
else()
target_link_libraries(${TESTNAME} hwy hwy_contrib hwy_test gtest gtest_main)
endif()
target_link_libraries(${TESTNAME} ${HWY_TEST_LIBS} ${HWY_GTEST_LIBS})
# Output test targets in the test directory.
set_target_properties(${TESTNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "tests")
@ -474,4 +549,4 @@ endforeach ()
# The skeleton test uses the skeleton library code.
target_sources(skeleton_test PRIVATE hwy/examples/skeleton.cc)
endif() # BUILD_TESTING
endif() # BUILD_TESTING

third_party/highway/README.md (vendored), 78 changed lines

@ -64,7 +64,7 @@ via the below email)
* [iresearch database index](https://github.com/iresearch-toolkit/iresearch/blob/e7638e7a4b99136ca41f82be6edccf01351a7223/core/utils/simd_utils.hpp)
* [JPEG XL image codec](https://github.com/libjxl/libjxl)
* [Grok JPEG 2000 image codec](https://github.com/GrokImageCompression/grok)
* [vectorized Quicksort](https://github.com/google/highway/tree/master/hwy/contrib/sort)
* [vectorized Quicksort](https://github.com/google/highway/tree/master/hwy/contrib/sort) ([paper](https://arxiv.org/abs/2205.05982))
## Current status
@ -72,11 +72,9 @@ via the below email)
Supported targets: scalar, S-SSE3, SSE4, AVX2, AVX-512, AVX3_DL (~Icelake,
requires opt-in by defining `HWY_WANT_AVX3_DL`), NEON (ARMv7 and v8), SVE, SVE2,
WASM SIMD.
WASM SIMD, RISC-V V.
SVE was initially tested using farm_sve (see acknowledgments). A subset of RVV
is implemented and tested with LLVM and QEMU. Work is underway to add RVV ops
which were not yet supported by GCC.
SVE was initially tested using farm_sve (see acknowledgments).
### Versioning
@ -85,18 +83,19 @@ incrementing MINOR after backward-compatible additions and PATCH after
backward-compatible fixes. We recommend using releases (rather than the Git tip)
because they are tested more extensively, see below.
Version 0.11 is considered stable enough to use in other projects.
Version 1.0 will signal an increased focus on backwards compatibility and will
be reached after the RVV target is finished (planned for 2022H1).
The current version 1.0 signals an increased focus on backwards compatibility.
Applications using documented functionality will remain compatible with future
updates that have the same major version number.
### Testing
Continuous integration tests build with a recent version of Clang (running on
x86 and QEMU for ARM) and MSVC from VS2015 (running on x86).
native x86, or QEMU for RVV and ARM) and MSVC 2019 (v19.28, running on native
x86).
Before releases, we also test on x86 with Clang and GCC, and ARMv7/8 via
GCC cross-compile and QEMU. See the
[testing process](g3doc/release_testing_process.md) for details.
Before releases, we also test on x86 with Clang and GCC, and ARMv7/8 via GCC
cross-compile. See the [testing process](g3doc/release_testing_process.md) for
details.
### Related modules
@ -148,21 +147,29 @@ portability. To obtain them, pass a `ScalableTag<float>` (or equivalently
`HWY_FULL(float)`) tag to functions such as `Zero/Set/Load`. There are two
alternatives for use-cases requiring an upper bound on the lanes:
- For up to a power of two `N`, specify `CappedTag<T, N>` (or
equivalently `HWY_CAPPED(T, N)`). This is useful for data structures such as
a narrow matrix. A loop is still required because vectors may actually have
fewer than `N` lanes.
- For up to `N` lanes, specify `CappedTag<T, N>` or the equivalent
`HWY_CAPPED(T, N)`. The actual number of lanes will be `N` rounded down to
the nearest power of two, such as 4 if `N` is 5, or 8 if `N` is 8. This is
useful for data structures such as a narrow matrix. A loop is still required
because vectors may actually have fewer than `N` lanes.
- For exactly a power of two `N` lanes, specify `FixedTag<T, N>`. The largest
supported `N` depends on the target, but is guaranteed to be at least
`16/sizeof(T)`.
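
For illustration, a minimal sketch (assuming static dispatch and that `hwy/highway.h` is on the include path; `PrintLaneCounts` is a name chosen only for this example) that queries the lane counts of the three tag types above:

```
#include <cstdio>
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

HWY_ATTR void PrintLaneCounts() {
  const hn::ScalableTag<float> dfull;   // full native vector, lanes chosen by the target
  const hn::CappedTag<float, 5> dcap;   // at most 5, rounded down to a power of two
  const hn::FixedTag<float, 4> dfix;    // exactly 4 lanes; 4 <= 16/sizeof(float) is always supported
  printf("full=%zu capped=%zu fixed=%zu\n",
         hn::Lanes(dfull), hn::Lanes(dcap), hn::Lanes(dfix));
}
```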
Functions using Highway must either be inside `namespace HWY_NAMESPACE {`
(possibly nested in one or more other namespaces defined by the project), OR
each op must be prefixed with `hn::`, e.g. `namespace hn = hwy::HWY_NAMESPACE;
hn::LoadDup128()`. Additionally, each function using Highway must either be
prefixed with `HWY_ATTR`, OR reside between `HWY_BEFORE_NAMESPACE()` and
`HWY_AFTER_NAMESPACE()`.
Due to ADL restrictions, user code calling Highway ops must either:
* Reside inside `namespace hwy { namespace HWY_NAMESPACE {`; or
* prefix each op with an alias such as `namespace hn = hwy::HWY_NAMESPACE;
hn::Add()`; or
* add using-declarations for each op used: `using hwy::HWY_NAMESPACE::Add;`.
Additionally, each function that calls Highway ops must either be prefixed with
`HWY_ATTR`, OR reside between `HWY_BEFORE_NAMESPACE()` and
`HWY_AFTER_NAMESPACE()`. Lambda functions currently require `HWY_ATTR` before
their opening brace.
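
As a concrete sketch of the second option (an `hn::` alias plus `HWY_ATTR`; the function name `AddArrays` is illustrative only):

```
#include <cstddef>
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// HWY_ATTR enables the target's instruction set for just this function.
HWY_ATTR void AddArrays(const float* HWY_RESTRICT a, const float* HWY_RESTRICT b,
                        float* HWY_RESTRICT out, size_t count) {
  const hn::ScalableTag<float> d;
  const size_t N = hn::Lanes(d);
  for (size_t i = 0; i + N <= count; i += N) {
    hn::StoreU(hn::Add(hn::LoadU(d, a + i), hn::LoadU(d, b + i)), d, out + i);
  }
  // Remainder handling is omitted here; see the loop strategies below.
}
```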
The entry points into code using Highway differ slightly depending on whether
they use static or dynamic dispatch.
* For static dispatch, `HWY_TARGET` will be the best available target among
`HWY_BASELINE_TARGETS`, i.e. those allowed for use by the compiler (see
@ -222,12 +229,26 @@ Highway offers several ways to express loops where `N` need not divide `count`:
this avoids the (potentially large) cost of predication or partial
loads/stores on older targets, and does not duplicate code.
* Use the `Transform*` functions in hwy/contrib/algo/transform-inl.h. This
takes care of the loop and remainder handling and you simply define a
generic lambda function (C++14) or functor which receives the current vector
from the input/output array, plus optionally vectors from up to two extra
input arrays, and returns the value to write to the input/output array.
Here is an example implementing the BLAS function SAXPY (`alpha * x + y`):
```
Transform1(d, x, n, y, [](auto d, const auto v, const auto v1) HWY_ATTR {
return MulAdd(Set(d, alpha), v, v1);
});
```
* Process whole vectors as above, followed by a scalar loop:
```
size_t i = 0;
for (; i + N <= count; i += N) LoopBody<false>(d, i, 0);
for (; i < count; ++i) LoopBody<false>(HWY_CAPPED(T, 1)(), i, 0);
for (; i < count; ++i) LoopBody<false>(CappedTag<T, 1>(), i, 0);
```
The template parameter and second function arguments are again not needed.
@ -247,10 +268,14 @@ Highway offers several ways to express loops where `N` need not divide `count`:
}
```
Now the template parameter and third function argument can be used inside
`LoopBody` to 'blend' the new partial vector with previous memory contents:
`Store(IfThenElse(FirstN(d, N), partial, prev_full), d, aligned_pointer);`.
`LoopBody` to non-atomically 'blend' the first `num_remaining` lanes of `v`
with the previous contents of memory at subsequent locations:
`BlendedStore(v, FirstN(d, num_remaining), d, pointer);`. Similarly,
`MaskedLoad(FirstN(d, num_remaining), d, pointer)` loads the first
`num_remaining` elements and returns zero in other lanes.
This is a good default when it is infeasible to ensure vectors are padded.
This is a good default when it is infeasible to ensure vectors are padded,
but is only safe `#if !HWY_MEM_OPS_MIGHT_FAULT`!
In contrast to the scalar loop, only a single final iteration is needed.
The increased code size from two loop bodies is expected to be worthwhile
because it avoids the cost of masking in all but the final iteration.
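
A minimal sketch of this strategy (assuming `!HWY_MEM_OPS_MIGHT_FAULT`, as cautioned above; `MulBy2` is an illustrative name):

```
#include <cstddef>
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

HWY_ATTR void MulBy2(float* HWY_RESTRICT p, size_t count) {
  const hn::ScalableTag<float> d;
  const size_t N = hn::Lanes(d);
  const auto two = hn::Set(d, 2.0f);
  size_t i = 0;
  for (; i + N <= count; i += N) {                  // whole vectors
    hn::StoreU(hn::Mul(hn::LoadU(d, p + i), two), d, p + i);
  }
#if !HWY_MEM_OPS_MIGHT_FAULT
  if (i != count) {                                 // single masked tail iteration
    const auto mask = hn::FirstN(d, count - i);
    const auto v = hn::MaskedLoad(mask, d, p + i);  // zero in inactive lanes
    hn::BlendedStore(hn::Mul(v, two), mask, d, p + i);
  }
#endif
}
```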
@ -260,6 +285,7 @@ Highway offers several ways to express loops where `N` need not divide `count`:
* [Highway introduction (slides)](g3doc/highway_intro.pdf)
* [Overview of instructions per operation on different architectures](g3doc/instruction_matrix.pdf)
* [Design philosophy and comparison](g3doc/design_philosophy.md)
* [Implementation details](g3doc/impl_details.md)
## Acknowledgments

third_party/highway/debian/changelog (vendored), 34 changed lines

@ -1,3 +1,37 @@
highway (1.0.0-1) UNRELEASED; urgency=medium
* ABI change: 64-bit target values, more room for expansion
* Add CompressBlocksNot, CompressNot, Lt128Upper, Min/Max128Upper, TruncateTo
* Add HWY_SVE2_128 target
* Sort speedups especially for 128-bit
* Documentation clarifications
* Faster NEON CountTrue/FindFirstTrue/AllFalse/AllTrue
* Improved SVE codegen
* Fix u16x8 ConcatEven/Odd, SSSE3 i64 Lt
* MSVC 2017 workarounds
* Support for runtime dispatch on Arm/GCC/Linux
-- Jan Wassenberg <janwas@google.com> Wed, 27 Jul 2022 10:00:00 +0200
highway (0.17.0-1) UNRELEASED; urgency=medium
* Add ExtractLane, InsertLane, IsInf, IsFinite, IsNaN
* Add StoreInterleaved2, LoadInterleaved2/3/4, BlendedStore, SafeFillN
* Add MulFixedPoint15, Or3
* Add Copy[If], Find[If], Generate, Replace[If] algos
* Add HWY_EMU128 target (replaces HWY_SCALAR)
* HWY_RVV is feature-complete
* Add HWY_ENABLE_CONTRIB build flag, HWY_NATIVE_FMA, HWY_WANT_SSSE3/SSE4 macros
* Extend ConcatOdd/Even and StoreInterleaved* to all types
* Allow CappedTag<T, nonPowerOfTwo>
* Sort speedups: 2x for AVX2, 1.09x for AVX3; avoid x86 malloc
* Expand documentation
* Fix RDTSCP crash in nanobenchmark
* Fix XCR0 check (was ignoring AVX3 on ICL)
* Support Arm/RISC-V timers
-- Jan Wassenberg <janwas@google.com> Fri, 20 May 2022 10:00:00 +0200
highway (0.16.0-1) UNRELEASED; urgency=medium
* Add contrib/sort (vectorized quicksort)

third_party/highway/hwy/aligned_allocator.cc (vendored), 16 changed lines

@ -1,4 +1,5 @@
// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -62,8 +63,8 @@ size_t NextAlignedOffset() {
} // namespace
void* AllocateAlignedBytes(const size_t payload_size, AllocPtr alloc_ptr,
void* opaque_ptr) {
HWY_DLLEXPORT void* AllocateAlignedBytes(const size_t payload_size,
AllocPtr alloc_ptr, void* opaque_ptr) {
HWY_ASSERT(payload_size != 0); // likely a bug in caller
if (payload_size >= std::numeric_limits<size_t>::max() / 2) {
HWY_DASSERT(false && "payload_size too large");
@ -109,8 +110,8 @@ void* AllocateAlignedBytes(const size_t payload_size, AllocPtr alloc_ptr,
return HWY_ASSUME_ALIGNED(reinterpret_cast<void*>(payload), kAlignment);
}
void FreeAlignedBytes(const void* aligned_pointer, FreePtr free_ptr,
void* opaque_ptr) {
HWY_DLLEXPORT void FreeAlignedBytes(const void* aligned_pointer,
FreePtr free_ptr, void* opaque_ptr) {
if (aligned_pointer == nullptr) return;
const uintptr_t payload = reinterpret_cast<uintptr_t>(aligned_pointer);
@ -126,9 +127,10 @@ void FreeAlignedBytes(const void* aligned_pointer, FreePtr free_ptr,
}
// static
void AlignedDeleter::DeleteAlignedArray(void* aligned_pointer, FreePtr free_ptr,
void* opaque_ptr,
ArrayDeleter deleter) {
HWY_DLLEXPORT void AlignedDeleter::DeleteAlignedArray(void* aligned_pointer,
FreePtr free_ptr,
void* opaque_ptr,
ArrayDeleter deleter) {
if (aligned_pointer == nullptr) return;
const uintptr_t payload = reinterpret_cast<uintptr_t>(aligned_pointer);

third_party/highway/hwy/aligned_allocator.h (vendored), 1 changed line

@ -1,4 +1,5 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.


@ -1,4 +1,5 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -22,7 +23,6 @@
#include <vector>
#include "gtest/gtest.h"
#include "hwy/base.h"
namespace {
@ -69,8 +69,8 @@ class FakeAllocator {
void Free(void* memory) {
if (!memory) return;
EXPECT_NE(allocs_.end(), allocs_.find(memory));
free(memory);
allocs_.erase(memory);
free(memory);
}
std::set<void*> allocs_;
@ -276,9 +276,3 @@ TEST(AlignedAllocatorTest, DefaultInit) {
}
} // namespace hwy
// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}

third_party/highway/hwy/base.h (vendored), 245 changed lines

@ -1,4 +1,5 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -20,12 +21,13 @@
#include <stddef.h>
#include <stdint.h>
#include <atomic>
#include <cfloat>
#include "hwy/detect_compiler_arch.h"
#include "hwy/highway_export.h"
#if HWY_ARCH_X86
#include <atomic>
#endif
//------------------------------------------------------------------------------
// Compiler-specific definitions
@ -57,7 +59,13 @@
#else
#define HWY_RESTRICT __restrict__
// force inlining without optimization enabled creates very inefficient code
// that can cause compiler timeout
#ifdef __OPTIMIZE__
#define HWY_INLINE inline __attribute__((always_inline))
#else
#define HWY_INLINE inline
#endif
#define HWY_NOINLINE __attribute__((noinline))
#define HWY_FLATTEN __attribute__((flatten))
#define HWY_NORETURN __attribute__((noreturn))
@ -165,6 +173,14 @@
#define HWY_IS_TSAN 0
#endif
// MSAN may cause lengthy build times or false positives e.g. in AVX3 DemoteTo.
// You can disable MSAN by adding this attribute to the function that fails.
#if HWY_IS_MSAN
#define HWY_ATTR_NO_MSAN __attribute__((no_sanitize_memory))
#else
#define HWY_ATTR_NO_MSAN
#endif
// For enabling HWY_DASSERT and shortening tests in slower debug builds
#if !defined(HWY_IS_DEBUG_BUILD)
// Clang does not define NDEBUG, but it and GCC define __OPTIMIZE__, and recent
@ -219,19 +235,17 @@ static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
// Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
// by concatenating base type and bits.
#if HWY_ARCH_ARM && (__ARM_FP & 2)
#define HWY_NATIVE_FLOAT16 1
#else
#define HWY_NATIVE_FLOAT16 0
#endif
#pragma pack(push, 1)
#if HWY_NATIVE_FLOAT16
// ACLE (https://gcc.gnu.org/onlinedocs/gcc/Half-Precision.html):
// always supported on aarch64, for v7 only if -mfp16-format is given.
#if ((HWY_ARCH_ARM_A64 || (__ARM_FP & 2)) && HWY_COMPILER_GCC)
using float16_t = __fp16;
// Clang does not allow __fp16 arguments, but scalar.h requires LaneType
// arguments, so use a wrapper.
// TODO(janwas): replace with _Float16 when that is supported?
// C11 extension ISO/IEC TS 18661-3:2015 but not supported on all targets.
// Required for Clang RVV if the float16 extension is used.
#elif HWY_ARCH_RVV && HWY_COMPILER_CLANG && defined(__riscv_zvfh)
using float16_t = _Float16;
// Otherwise emulate
#else
struct float16_t {
uint16_t bits;
@ -247,6 +261,44 @@ struct bfloat16_t {
using float32_t = float;
using float64_t = double;
#pragma pack(push, 1)
// Aligned 128-bit type. Cannot use __int128 because clang doesn't yet align it:
// https://reviews.llvm.org/D86310
struct alignas(16) uint128_t {
uint64_t lo; // little-endian layout
uint64_t hi;
};
// 64 bit key plus 64 bit value. Faster than using uint128_t when only the key
// field is to be compared (Lt128Upper instead of Lt128).
struct alignas(16) K64V64 {
uint64_t value; // little-endian layout
uint64_t key;
};
#pragma pack(pop)
static inline HWY_MAYBE_UNUSED bool operator<(const uint128_t& a,
const uint128_t& b) {
return (a.hi == b.hi) ? a.lo < b.lo : a.hi < b.hi;
}
// Required for std::greater.
static inline HWY_MAYBE_UNUSED bool operator>(const uint128_t& a,
const uint128_t& b) {
return b < a;
}
static inline HWY_MAYBE_UNUSED bool operator<(const K64V64& a,
const K64V64& b) {
return a.key < b.key;
}
// Required for std::greater.
static inline HWY_MAYBE_UNUSED bool operator>(const K64V64& a,
const K64V64& b) {
return b < a;
}
//------------------------------------------------------------------------------
// Controlling overload resolution (SFINAE)
@ -299,6 +351,11 @@ HWY_API constexpr bool IsSame() {
hwy::EnableIf<sizeof(T) == (bytes)>* = nullptr
#define HWY_IF_NOT_LANE_SIZE(T, bytes) \
hwy::EnableIf<sizeof(T) != (bytes)>* = nullptr
#define HWY_IF_LANE_SIZE_LT(T, bytes) \
hwy::EnableIf<sizeof(T) < (bytes)>* = nullptr
#define HWY_IF_LANES_PER_BLOCK(T, N, LANES) \
hwy::EnableIf<HWY_MIN(sizeof(T) * N, 16) / sizeof(T) == (LANES)>* = nullptr
// Empty struct used as a size tag type.
template <size_t N>
@ -328,12 +385,16 @@ struct Relations<uint8_t> {
using Unsigned = uint8_t;
using Signed = int8_t;
using Wide = uint16_t;
enum { is_signed = 0 };
enum { is_float = 0 };
};
template <>
struct Relations<int8_t> {
using Unsigned = uint8_t;
using Signed = int8_t;
using Wide = int16_t;
enum { is_signed = 1 };
enum { is_float = 0 };
};
template <>
struct Relations<uint16_t> {
@ -341,6 +402,8 @@ struct Relations<uint16_t> {
using Signed = int16_t;
using Wide = uint32_t;
using Narrow = uint8_t;
enum { is_signed = 0 };
enum { is_float = 0 };
};
template <>
struct Relations<int16_t> {
@ -348,6 +411,8 @@ struct Relations<int16_t> {
using Signed = int16_t;
using Wide = int32_t;
using Narrow = int8_t;
enum { is_signed = 1 };
enum { is_float = 0 };
};
template <>
struct Relations<uint32_t> {
@ -356,6 +421,8 @@ struct Relations<uint32_t> {
using Float = float;
using Wide = uint64_t;
using Narrow = uint16_t;
enum { is_signed = 0 };
enum { is_float = 0 };
};
template <>
struct Relations<int32_t> {
@ -364,13 +431,18 @@ struct Relations<int32_t> {
using Float = float;
using Wide = int64_t;
using Narrow = int16_t;
enum { is_signed = 1 };
enum { is_float = 0 };
};
template <>
struct Relations<uint64_t> {
using Unsigned = uint64_t;
using Signed = int64_t;
using Float = double;
using Wide = uint128_t;
using Narrow = uint32_t;
enum { is_signed = 0 };
enum { is_float = 0 };
};
template <>
struct Relations<int64_t> {
@ -378,6 +450,15 @@ struct Relations<int64_t> {
using Signed = int64_t;
using Float = double;
using Narrow = int32_t;
enum { is_signed = 1 };
enum { is_float = 0 };
};
template <>
struct Relations<uint128_t> {
using Unsigned = uint128_t;
using Narrow = uint64_t;
enum { is_signed = 0 };
enum { is_float = 0 };
};
template <>
struct Relations<float16_t> {
@ -385,12 +466,16 @@ struct Relations<float16_t> {
using Signed = int16_t;
using Float = float16_t;
using Wide = float;
enum { is_signed = 1 };
enum { is_float = 1 };
};
template <>
struct Relations<bfloat16_t> {
using Unsigned = uint16_t;
using Signed = int16_t;
using Wide = float;
enum { is_signed = 1 };
enum { is_float = 1 };
};
template <>
struct Relations<float> {
@ -399,6 +484,8 @@ struct Relations<float> {
using Float = float;
using Wide = double;
using Narrow = float16_t;
enum { is_signed = 1 };
enum { is_float = 1 };
};
template <>
struct Relations<double> {
@ -406,6 +493,8 @@ struct Relations<double> {
using Signed = int64_t;
using Float = double;
using Narrow = float;
enum { is_signed = 1 };
enum { is_float = 1 };
};
template <size_t N>
@ -432,6 +521,10 @@ struct TypeFromSize<8> {
using Signed = int64_t;
using Float = double;
};
template <>
struct TypeFromSize<16> {
using Unsigned = uint128_t;
};
} // namespace detail
@ -457,6 +550,24 @@ using SignedFromSize = typename detail::TypeFromSize<N>::Signed;
template <size_t N>
using FloatFromSize = typename detail::TypeFromSize<N>::Float;
// Avoid confusion with SizeTag where the parameter is a lane size.
using UnsignedTag = SizeTag<0>;
using SignedTag = SizeTag<0x100>; // integer
using FloatTag = SizeTag<0x200>;
template <typename T, class R = detail::Relations<T>>
constexpr auto TypeTag() -> hwy::SizeTag<((R::is_signed + R::is_float) << 8)> {
return hwy::SizeTag<((R::is_signed + R::is_float) << 8)>();
}
// For when we only want to distinguish FloatTag from everything else.
using NonFloatTag = SizeTag<0x400>;
template <typename T, class R = detail::Relations<T>>
constexpr auto IsFloatTag() -> hwy::SizeTag<(R::is_float ? 0x200 : 0x400)> {
return hwy::SizeTag<(R::is_float ? 0x200 : 0x400)>();
}
//------------------------------------------------------------------------------
// Type traits
@ -502,11 +613,11 @@ HWY_API constexpr T LowestValue() {
}
template <>
constexpr float LowestValue<float>() {
return -FLT_MAX;
return -3.402823466e+38F;
}
template <>
constexpr double LowestValue<double>() {
return -DBL_MAX;
return -1.7976931348623158e+308;
}
template <typename T>
@ -515,41 +626,51 @@ HWY_API constexpr T HighestValue() {
}
template <>
constexpr float HighestValue<float>() {
return FLT_MAX;
return 3.402823466e+38F;
}
template <>
constexpr double HighestValue<double>() {
return DBL_MAX;
return 1.7976931348623158e+308;
}
// Returns width in bits of the mantissa field in IEEE binary32/64.
template <typename T>
constexpr int MantissaBits() {
static_assert(sizeof(T) == 0, "Only instantiate the specializations");
return 0;
}
template <>
constexpr int MantissaBits<float>() {
return 23;
}
template <>
constexpr int MantissaBits<double>() {
return 52;
}
// Returns the (left-shifted by one bit) IEEE binary32/64 representation with
// the largest possible (biased) exponent field. Used by IsInf.
template <typename T>
constexpr MakeSigned<T> MaxExponentTimes2() {
return -(MakeSigned<T>{1} << (MantissaBits<T>() + 1));
}
// Returns bitmask of the sign bit in IEEE binary32/64.
template <typename T>
constexpr MakeUnsigned<T> SignMask() {
return MakeUnsigned<T>{1} << (sizeof(T) * 8 - 1);
}
// Returns bitmask of the exponent field in IEEE binary32/64.
template <typename T>
constexpr T ExponentMask() {
static_assert(sizeof(T) == 0, "Only instantiate the specializations");
return 0;
}
template <>
constexpr uint32_t ExponentMask<uint32_t>() {
return 0x7F800000;
}
template <>
constexpr uint64_t ExponentMask<uint64_t>() {
return 0x7FF0000000000000ULL;
constexpr MakeUnsigned<T> ExponentMask() {
return (~(MakeUnsigned<T>{1} << MantissaBits<T>()) + 1) & ~SignMask<T>();
}
// Returns bitmask of the mantissa field in IEEE binary32/64.
template <typename T>
constexpr T MantissaMask() {
static_assert(sizeof(T) == 0, "Only instantiate the specializations");
return 0;
}
template <>
constexpr uint32_t MantissaMask<uint32_t>() {
return 0x007FFFFF;
}
template <>
constexpr uint64_t MantissaMask<uint64_t>() {
return 0x000FFFFFFFFFFFFFULL;
constexpr MakeUnsigned<T> MantissaMask() {
return (MakeUnsigned<T>{1} << MantissaBits<T>()) - 1;
}
// Returns 1 << mantissa_bits as a floating-point number. All integers whose
@ -569,6 +690,21 @@ constexpr double MantissaEnd<double>() {
return 4503599627370496.0; // 1 << 52
}
// Returns width in bits of the exponent field in IEEE binary32/64.
template <typename T>
constexpr int ExponentBits() {
// Exponent := remaining bits after deducting sign and mantissa.
return 8 * sizeof(T) - 1 - MantissaBits<T>();
}
// Returns largest value of the biased exponent field in IEEE binary32/64,
// right-shifted so that the LSB is bit zero. Example: 0xFF for float.
// This is expressed as a signed integer for more efficient comparison.
template <typename T>
constexpr MakeSigned<T> MaxExponentField() {
return (MakeSigned<T>{1} << ExponentBits<T>()) - 1;
}
//------------------------------------------------------------------------------
// Helper functions
@ -602,7 +738,7 @@ HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x) {
#else // HWY_ARCH_X86_64
// _BitScanForward64 not available
uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
unsigned long index;
unsigned long index; // NOLINT
if (lsb == 0) {
uint32_t msb = static_cast<uint32_t>(x >> 32u);
_BitScanForward(&index, msb);
@ -637,7 +773,7 @@ HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) {
#else // HWY_ARCH_X86_64
// _BitScanReverse64 not available
const uint32_t msb = static_cast<uint32_t>(x >> 32u);
unsigned long index;
unsigned long index; // NOLINT
if (msb == 0) {
const uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
_BitScanReverse(&index, lsb);
@ -653,7 +789,7 @@ HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) {
}
HWY_API size_t PopCount(uint64_t x) {
#if HWY_COMPILER_CLANG || HWY_COMPILER_GCC
#if HWY_COMPILER_GCC // includes clang
return static_cast<size_t>(__builtin_popcountll(x));
// This instruction has a separate feature flag, but is often called from
// non-SIMD code, so we don't want to require dynamic dispatch. It was first
@ -662,7 +798,8 @@ HWY_API size_t PopCount(uint64_t x) {
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 && defined(__AVX__)
return _mm_popcnt_u64(x);
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
return _mm_popcnt_u32(uint32_t(x)) + _mm_popcnt_u32(uint32_t(x >> 32));
return _mm_popcnt_u32(static_cast<uint32_t>(x & 0xFFFFFFFFu)) +
_mm_popcnt_u32(static_cast<uint32_t>(x >> 32));
#else
x -= ((x >> 1) & 0x5555555555555555ULL);
x = (((x >> 2) & 0x3333333333333333ULL) + (x & 0x3333333333333333ULL));
@ -715,22 +852,30 @@ HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT upper) {
#endif
}
#if HWY_COMPILER_MSVC
#pragma intrinsic(memcpy)
#pragma intrinsic(memset)
#endif
// The source/destination must not overlap/alias.
template <size_t kBytes, typename From, typename To>
HWY_API void CopyBytes(const From* from, To* to) {
#if HWY_COMPILER_MSVC
const uint8_t* HWY_RESTRICT from_bytes =
reinterpret_cast<const uint8_t*>(from);
uint8_t* HWY_RESTRICT to_bytes = reinterpret_cast<uint8_t*>(to);
for (size_t i = 0; i < kBytes; ++i) {
to_bytes[i] = from_bytes[i];
}
memcpy(to, from, kBytes);
#else
// Avoids horrible codegen on Clang (series of PINSRB)
__builtin_memcpy(to, from, kBytes);
#endif
}
template <size_t kBytes, typename To>
HWY_API void ZeroBytes(To* to) {
#if HWY_COMPILER_MSVC
memset(to, 0, kBytes);
#else
__builtin_memset(to, 0, kBytes);
#endif
}
HWY_API float F32FromBF16(bfloat16_t bf) {
uint32_t bits = bf.bits;
bits <<= 16;

third_party/highway/hwy/base_test.cc (vendored), 116 changed lines

@ -1,4 +1,5 @@
// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -21,7 +22,7 @@
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "base_test.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
@ -30,25 +31,26 @@ namespace hwy {
namespace HWY_NAMESPACE {
HWY_NOINLINE void TestAllLimits() {
HWY_ASSERT_EQ(uint8_t(0), LimitsMin<uint8_t>());
HWY_ASSERT_EQ(uint16_t(0), LimitsMin<uint16_t>());
HWY_ASSERT_EQ(uint32_t(0), LimitsMin<uint32_t>());
HWY_ASSERT_EQ(uint64_t(0), LimitsMin<uint64_t>());
HWY_ASSERT_EQ(uint8_t{0}, LimitsMin<uint8_t>());
HWY_ASSERT_EQ(uint16_t{0}, LimitsMin<uint16_t>());
HWY_ASSERT_EQ(uint32_t{0}, LimitsMin<uint32_t>());
HWY_ASSERT_EQ(uint64_t{0}, LimitsMin<uint64_t>());
HWY_ASSERT_EQ(int8_t(-128), LimitsMin<int8_t>());
HWY_ASSERT_EQ(int16_t(-32768), LimitsMin<int16_t>());
HWY_ASSERT_EQ(int32_t(0x80000000u), LimitsMin<int32_t>());
HWY_ASSERT_EQ(int64_t(0x8000000000000000ull), LimitsMin<int64_t>());
HWY_ASSERT_EQ(int8_t{-128}, LimitsMin<int8_t>());
HWY_ASSERT_EQ(int16_t{-32768}, LimitsMin<int16_t>());
HWY_ASSERT_EQ(static_cast<int32_t>(0x80000000u), LimitsMin<int32_t>());
HWY_ASSERT_EQ(static_cast<int64_t>(0x8000000000000000ull),
LimitsMin<int64_t>());
HWY_ASSERT_EQ(uint8_t(0xFF), LimitsMax<uint8_t>());
HWY_ASSERT_EQ(uint16_t(0xFFFF), LimitsMax<uint16_t>());
HWY_ASSERT_EQ(uint32_t(0xFFFFFFFFu), LimitsMax<uint32_t>());
HWY_ASSERT_EQ(uint64_t(0xFFFFFFFFFFFFFFFFull), LimitsMax<uint64_t>());
HWY_ASSERT_EQ(uint8_t{0xFF}, LimitsMax<uint8_t>());
HWY_ASSERT_EQ(uint16_t{0xFFFF}, LimitsMax<uint16_t>());
HWY_ASSERT_EQ(uint32_t{0xFFFFFFFFu}, LimitsMax<uint32_t>());
HWY_ASSERT_EQ(uint64_t{0xFFFFFFFFFFFFFFFFull}, LimitsMax<uint64_t>());
HWY_ASSERT_EQ(int8_t(0x7F), LimitsMax<int8_t>());
HWY_ASSERT_EQ(int16_t(0x7FFF), LimitsMax<int16_t>());
HWY_ASSERT_EQ(int32_t(0x7FFFFFFFu), LimitsMax<int32_t>());
HWY_ASSERT_EQ(int64_t(0x7FFFFFFFFFFFFFFFull), LimitsMax<int64_t>());
HWY_ASSERT_EQ(int8_t{0x7F}, LimitsMax<int8_t>());
HWY_ASSERT_EQ(int16_t{0x7FFF}, LimitsMax<int16_t>());
HWY_ASSERT_EQ(int32_t{0x7FFFFFFFu}, LimitsMax<int32_t>());
HWY_ASSERT_EQ(int64_t{0x7FFFFFFFFFFFFFFFull}, LimitsMax<int64_t>());
}
struct TestLowestHighest {
@ -88,6 +90,10 @@ HWY_NOINLINE void TestAllType() {
ForUnsignedTypes(TestIsUnsigned());
ForSignedTypes(TestIsSigned());
ForFloatTypes(TestIsFloat());
static_assert(sizeof(MakeUnsigned<hwy::uint128_t>) == 16, "");
static_assert(sizeof(MakeWide<uint64_t>) == 16, "Expected uint128_t");
static_assert(sizeof(MakeNarrow<hwy::uint128_t>) == 8, "Expected uint64_t");
}
struct TestIsSame {
@ -102,54 +108,54 @@ struct TestIsSame {
HWY_NOINLINE void TestAllIsSame() { ForAllTypes(TestIsSame()); }
HWY_NOINLINE void TestAllBitScan() {
HWY_ASSERT_EQ(size_t(0), Num0BitsAboveMS1Bit_Nonzero32(0x80000000u));
HWY_ASSERT_EQ(size_t(0), Num0BitsAboveMS1Bit_Nonzero32(0xFFFFFFFFu));
HWY_ASSERT_EQ(size_t(1), Num0BitsAboveMS1Bit_Nonzero32(0x40000000u));
HWY_ASSERT_EQ(size_t(1), Num0BitsAboveMS1Bit_Nonzero32(0x40108210u));
HWY_ASSERT_EQ(size_t(30), Num0BitsAboveMS1Bit_Nonzero32(2u));
HWY_ASSERT_EQ(size_t(30), Num0BitsAboveMS1Bit_Nonzero32(3u));
HWY_ASSERT_EQ(size_t(31), Num0BitsAboveMS1Bit_Nonzero32(1u));
HWY_ASSERT_EQ(size_t{0}, Num0BitsAboveMS1Bit_Nonzero32(0x80000000u));
HWY_ASSERT_EQ(size_t{0}, Num0BitsAboveMS1Bit_Nonzero32(0xFFFFFFFFu));
HWY_ASSERT_EQ(size_t{1}, Num0BitsAboveMS1Bit_Nonzero32(0x40000000u));
HWY_ASSERT_EQ(size_t{1}, Num0BitsAboveMS1Bit_Nonzero32(0x40108210u));
HWY_ASSERT_EQ(size_t{30}, Num0BitsAboveMS1Bit_Nonzero32(2u));
HWY_ASSERT_EQ(size_t{30}, Num0BitsAboveMS1Bit_Nonzero32(3u));
HWY_ASSERT_EQ(size_t{31}, Num0BitsAboveMS1Bit_Nonzero32(1u));
HWY_ASSERT_EQ(size_t(0),
HWY_ASSERT_EQ(size_t{0},
Num0BitsAboveMS1Bit_Nonzero64(0x8000000000000000ull));
HWY_ASSERT_EQ(size_t(0),
HWY_ASSERT_EQ(size_t{0},
Num0BitsAboveMS1Bit_Nonzero64(0xFFFFFFFFFFFFFFFFull));
HWY_ASSERT_EQ(size_t(1),
HWY_ASSERT_EQ(size_t{1},
Num0BitsAboveMS1Bit_Nonzero64(0x4000000000000000ull));
HWY_ASSERT_EQ(size_t(1),
HWY_ASSERT_EQ(size_t{1},
Num0BitsAboveMS1Bit_Nonzero64(0x4010821004200011ull));
HWY_ASSERT_EQ(size_t(62), Num0BitsAboveMS1Bit_Nonzero64(2ull));
HWY_ASSERT_EQ(size_t(62), Num0BitsAboveMS1Bit_Nonzero64(3ull));
HWY_ASSERT_EQ(size_t(63), Num0BitsAboveMS1Bit_Nonzero64(1ull));
HWY_ASSERT_EQ(size_t{62}, Num0BitsAboveMS1Bit_Nonzero64(2ull));
HWY_ASSERT_EQ(size_t{62}, Num0BitsAboveMS1Bit_Nonzero64(3ull));
HWY_ASSERT_EQ(size_t{63}, Num0BitsAboveMS1Bit_Nonzero64(1ull));
HWY_ASSERT_EQ(size_t(0), Num0BitsBelowLS1Bit_Nonzero32(1u));
HWY_ASSERT_EQ(size_t(1), Num0BitsBelowLS1Bit_Nonzero32(2u));
HWY_ASSERT_EQ(size_t(30), Num0BitsBelowLS1Bit_Nonzero32(0xC0000000u));
HWY_ASSERT_EQ(size_t(31), Num0BitsBelowLS1Bit_Nonzero32(0x80000000u));
HWY_ASSERT_EQ(size_t{0}, Num0BitsBelowLS1Bit_Nonzero32(1u));
HWY_ASSERT_EQ(size_t{1}, Num0BitsBelowLS1Bit_Nonzero32(2u));
HWY_ASSERT_EQ(size_t{30}, Num0BitsBelowLS1Bit_Nonzero32(0xC0000000u));
HWY_ASSERT_EQ(size_t{31}, Num0BitsBelowLS1Bit_Nonzero32(0x80000000u));
HWY_ASSERT_EQ(size_t(0), Num0BitsBelowLS1Bit_Nonzero64(1ull));
HWY_ASSERT_EQ(size_t(1), Num0BitsBelowLS1Bit_Nonzero64(2ull));
HWY_ASSERT_EQ(size_t(62),
HWY_ASSERT_EQ(size_t{0}, Num0BitsBelowLS1Bit_Nonzero64(1ull));
HWY_ASSERT_EQ(size_t{1}, Num0BitsBelowLS1Bit_Nonzero64(2ull));
HWY_ASSERT_EQ(size_t{62},
Num0BitsBelowLS1Bit_Nonzero64(0xC000000000000000ull));
HWY_ASSERT_EQ(size_t(63),
HWY_ASSERT_EQ(size_t{63},
Num0BitsBelowLS1Bit_Nonzero64(0x8000000000000000ull));
}
HWY_NOINLINE void TestAllPopCount() {
HWY_ASSERT_EQ(size_t(0), PopCount(0u));
HWY_ASSERT_EQ(size_t(1), PopCount(1u));
HWY_ASSERT_EQ(size_t(1), PopCount(2u));
HWY_ASSERT_EQ(size_t(2), PopCount(3u));
HWY_ASSERT_EQ(size_t(1), PopCount(0x80000000u));
HWY_ASSERT_EQ(size_t(31), PopCount(0x7FFFFFFFu));
HWY_ASSERT_EQ(size_t(32), PopCount(0xFFFFFFFFu));
HWY_ASSERT_EQ(size_t{0}, PopCount(0u));
HWY_ASSERT_EQ(size_t{1}, PopCount(1u));
HWY_ASSERT_EQ(size_t{1}, PopCount(2u));
HWY_ASSERT_EQ(size_t{2}, PopCount(3u));
HWY_ASSERT_EQ(size_t{1}, PopCount(0x80000000u));
HWY_ASSERT_EQ(size_t{31}, PopCount(0x7FFFFFFFu));
HWY_ASSERT_EQ(size_t{32}, PopCount(0xFFFFFFFFu));
HWY_ASSERT_EQ(size_t(1), PopCount(0x80000000ull));
HWY_ASSERT_EQ(size_t(31), PopCount(0x7FFFFFFFull));
HWY_ASSERT_EQ(size_t(32), PopCount(0xFFFFFFFFull));
HWY_ASSERT_EQ(size_t(33), PopCount(0x10FFFFFFFFull));
HWY_ASSERT_EQ(size_t(63), PopCount(0xFFFEFFFFFFFFFFFFull));
HWY_ASSERT_EQ(size_t(64), PopCount(0xFFFFFFFFFFFFFFFFull));
HWY_ASSERT_EQ(size_t{1}, PopCount(0x80000000ull));
HWY_ASSERT_EQ(size_t{31}, PopCount(0x7FFFFFFFull));
HWY_ASSERT_EQ(size_t{32}, PopCount(0xFFFFFFFFull));
HWY_ASSERT_EQ(size_t{33}, PopCount(0x10FFFFFFFFull));
HWY_ASSERT_EQ(size_t{63}, PopCount(0xFFFEFFFFFFFFFFFFull));
HWY_ASSERT_EQ(size_t{64}, PopCount(0xFFFFFFFFFFFFFFFFull));
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
@ -169,10 +175,4 @@ HWY_EXPORT_AND_TEST_P(BaseTest, TestAllBitScan);
HWY_EXPORT_AND_TEST_P(BaseTest, TestAllPopCount);
} // namespace hwy
// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char **argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
#endif

third_party/highway/hwy/cache_control.h (vendored), 11 changed lines

@ -1,4 +1,5 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -50,10 +51,10 @@ namespace hwy {
#define HWY_ATTR_CACHE
#endif
// Delays subsequent loads until prior loads are visible. On Intel CPUs, also
// serves as a full fence (waits for all prior instructions to complete).
// No effect on non-x86.
// DEPRECATED due to differing behavior across architectures AND vendors.
// Delays subsequent loads until prior loads are visible. Beware of potentially
// differing behavior across architectures and vendors: on Intel but not
// AMD CPUs, also serves as a full fence (waits for all prior instructions to
// complete).
HWY_INLINE HWY_ATTR_CACHE void LoadFence() {
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
_mm_lfence();
@ -76,7 +77,7 @@ template <typename T>
HWY_INLINE HWY_ATTR_CACHE void Prefetch(const T* p) {
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
_mm_prefetch(reinterpret_cast<const char*>(p), _MM_HINT_T0);
#elif HWY_COMPILER_GCC || HWY_COMPILER_CLANG
#elif HWY_COMPILER_GCC // includes clang
// Hint=0 (NTA) behavior differs, but skipping outer caches is probably not
// desirable, so use the default 3 (keep in caches).
__builtin_prefetch(p, /*write=*/0, /*hint=*/3);

third_party/highway/hwy/contrib/algo/copy-inl.h (vendored, new file), 138 lines

@ -0,0 +1,138 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Per-target include guard
#if defined(HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_) == \
defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
#undef HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
#else
#define HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
#endif
#include <string.h> // memcpy
#include "hwy/highway.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// These functions avoid having to write a loop plus remainder handling in the
// (unfortunately still common) case where arrays are not aligned/padded. If the
// inputs are known to be aligned/padded, it is more efficient to write a single
// loop using Load(). We do not provide a CopyAlignedPadded because it
// would be more verbose than such a loop.
// Fills `to`[0, `count`) with `value`.
template <class D, typename T = TFromD<D>>
void Fill(D d, T value, size_t count, T* HWY_RESTRICT to) {
const size_t N = Lanes(d);
const Vec<D> v = Set(d, value);
size_t idx = 0;
for (; idx + N <= count; idx += N) {
StoreU(v, d, to + idx);
}
// `count` was a multiple of the vector length `N`: already done.
if (HWY_UNLIKELY(idx == count)) return;
const size_t remaining = count - idx;
HWY_DASSERT(0 != remaining && remaining < N);
SafeFillN(remaining, value, d, to + idx);
}
// Copies `from`[0, `count`) to `to`, which must not overlap `from`.
template <class D, typename T = TFromD<D>>
void Copy(D d, const T* HWY_RESTRICT from, size_t count, T* HWY_RESTRICT to) {
const size_t N = Lanes(d);
size_t idx = 0;
for (; idx + N <= count; idx += N) {
const Vec<D> v = LoadU(d, from + idx);
StoreU(v, d, to + idx);
}
// `count` was a multiple of the vector length `N`: already done.
if (HWY_UNLIKELY(idx == count)) return;
const size_t remaining = count - idx;
HWY_DASSERT(0 != remaining && remaining < N);
SafeCopyN(remaining, d, from + idx, to + idx);
}
// For idx in [0, count) in ascending order, appends `from[idx]` to `to` if the
// corresponding mask element of `func(d, v)` is true. Returns the STL-style end
// of the newly written elements in `to`.
//
// `func` is either a functor with a templated operator()(d, v) returning a
// mask, or a generic lambda if using C++14. Due to apparent limitations of
// Clang on Windows, it is currently necessary to add HWY_ATTR before the
// opening { of the lambda to avoid errors about "function .. requires target".
//
// NOTE: this is only supported for 16-, 32- or 64-bit types.
// NOTE: Func may be called a second time for elements it has already seen, but
// these elements will not be written to `to` again.
template <class D, class Func, typename T = TFromD<D>>
T* CopyIf(D d, const T* HWY_RESTRICT from, size_t count, T* HWY_RESTRICT to,
const Func& func) {
const size_t N = Lanes(d);
size_t idx = 0;
for (; idx + N <= count; idx += N) {
const Vec<D> v = LoadU(d, from + idx);
to += CompressBlendedStore(v, func(d, v), d, to);
}
// `count` was a multiple of the vector length `N`: already done.
if (HWY_UNLIKELY(idx == count)) return to;
#if HWY_MEM_OPS_MIGHT_FAULT
// Proceed one by one.
const CappedTag<T, 1> d1;
for (; idx < count; ++idx) {
using V1 = Vec<decltype(d1)>;
// Workaround for -Waggressive-loop-optimizations on GCC 8
// (iteration 2305843009213693951 invokes undefined behavior for T=i64)
const uintptr_t addr = reinterpret_cast<uintptr_t>(from);
const T* HWY_RESTRICT from_idx =
reinterpret_cast<const T * HWY_RESTRICT>(addr + (idx * sizeof(T)));
const V1 v = LoadU(d1, from_idx);
// Avoid storing to `to` unless we know it should be kept - otherwise, we
// might overrun the end if it was allocated for the exact count.
if (CountTrue(d1, func(d1, v)) == 0) continue;
StoreU(v, d1, to);
to += 1;
}
#else
// Start index of the last unaligned whole vector, ending at the array end.
const size_t last = count - N;
// Number of elements before `from` or already written.
const size_t invalid = idx - last;
HWY_DASSERT(0 != invalid && invalid < N);
const Mask<D> mask = Not(FirstN(d, invalid));
const Vec<D> v = MaskedLoad(mask, d, from + last);
to += CompressBlendedStore(v, And(mask, func(d, v)), d, to);
#endif
return to;
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#endif // HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
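For orientation, here is a minimal usage sketch (not part of the patch) of the Fill/Copy/CopyIf helpers above from a static-dispatch translation unit; the function name `KeepOdd` and the `in`/`out` buffers are illustrative assumptions, and the generic lambda requires C++14 as noted in copy_test.cc below.
```
#include "hwy/highway.h"
#include "hwy/contrib/algo/copy-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// Copies the odd values of in[0, count) to out; returns how many were kept.
// copy_test.cc pads the destination by one vector, a safe upper bound for the
// blended stores used internally.
size_t KeepOdd(const int32_t* HWY_RESTRICT in, size_t count,
               int32_t* HWY_RESTRICT out) {
  const ScalableTag<int32_t> d;
  Fill(d, int32_t{0}, count, out);  // zero-initialize; no alignment needed
  int32_t* end = CopyIf(d, in, count, out,
                        [](const auto d, const auto v) HWY_ATTR {
                          return TestBit(v, Set(d, int32_t{1}));
                        });
  return static_cast<size_t>(end - out);
}

}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();
```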

199
third_party/highway/hwy/contrib/algo/copy_test.cc vendored Normal file

@ -0,0 +1,199 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/aligned_allocator.h"
// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/algo/copy_test.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/contrib/algo/copy-inl.h"
#include "hwy/tests/test_util-inl.h"
// clang-format on
// If your project requires C++14 or later, you can ignore this and pass lambdas
// directly to Transform, without requiring an lvalue as we do here for C++11.
#if __cplusplus < 201402L
#define HWY_GENERIC_LAMBDA 0
#else
#define HWY_GENERIC_LAMBDA 1
#endif
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Returns random integer in [0, 128), which fits in any lane type.
template <typename T>
T Random7Bit(RandomState& rng) {
return static_cast<T>(Random32(&rng) & 127);
}
// In C++14, we can instead define these as generic lambdas next to where they
// are invoked.
#if !HWY_GENERIC_LAMBDA
struct IsOdd {
template <class D, class V>
Mask<D> operator()(D d, V v) const {
return TestBit(v, Set(d, TFromD<D>{1}));
}
};
#endif // !HWY_GENERIC_LAMBDA
// Invokes Test (e.g. TestCopyIf) with all arg combinations. T comes from
// ForFloatTypes.
template <class Test>
struct ForeachCountAndMisalign {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) const {
RandomState rng;
const size_t N = Lanes(d);
const size_t misalignments[3] = {0, N / 4, 3 * N / 5};
for (size_t count = 0; count < 2 * N; ++count) {
for (size_t ma : misalignments) {
for (size_t mb : misalignments) {
Test()(d, count, ma, mb, rng);
}
}
}
}
};
struct TestFill {
template <class D>
void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
RandomState& rng) {
using T = TFromD<D>;
// HWY_MAX prevents error when misalign == count == 0.
AlignedFreeUniquePtr<T[]> pa =
AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
T* expected = pa.get() + misalign_a;
const T value = Random7Bit<T>(rng);
for (size_t i = 0; i < count; ++i) {
expected[i] = value;
}
AlignedFreeUniquePtr<T[]> pb = AllocateAligned<T>(misalign_b + count + 1);
T* actual = pb.get() + misalign_b;
actual[count] = T{0}; // sentinel
Fill(d, value, count, actual);
HWY_ASSERT_EQ(T{0}, actual[count]); // did not write past end
const auto info = hwy::detail::MakeTypeInfo<T>();
const char* target_name = hwy::TargetName(HWY_TARGET);
hwy::detail::AssertArrayEqual(info, expected, actual, count, target_name,
__FILE__, __LINE__);
}
};
void TestAllFill() {
ForAllTypes(ForPartialVectors<ForeachCountAndMisalign<TestFill>>());
}
struct TestCopy {
template <class D>
void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
RandomState& rng) {
using T = TFromD<D>;
// Prevents error if size to allocate is zero.
AlignedFreeUniquePtr<T[]> pa =
AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
T* a = pa.get() + misalign_a;
for (size_t i = 0; i < count; ++i) {
a[i] = Random7Bit<T>(rng);
}
AlignedFreeUniquePtr<T[]> pb =
AllocateAligned<T>(HWY_MAX(1, misalign_b + count));
T* b = pb.get() + misalign_b;
Copy(d, a, count, b);
const auto info = hwy::detail::MakeTypeInfo<T>();
const char* target_name = hwy::TargetName(HWY_TARGET);
hwy::detail::AssertArrayEqual(info, a, b, count, target_name, __FILE__,
__LINE__);
}
};
void TestAllCopy() {
ForAllTypes(ForPartialVectors<ForeachCountAndMisalign<TestCopy>>());
}
struct TestCopyIf {
template <class D>
void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
RandomState& rng) {
using T = TFromD<D>;
// Prevents error if size to allocate is zero.
AlignedFreeUniquePtr<T[]> pa =
AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
T* a = pa.get() + misalign_a;
for (size_t i = 0; i < count; ++i) {
a[i] = Random7Bit<T>(rng);
}
const size_t padding = Lanes(ScalableTag<T>());
AlignedFreeUniquePtr<T[]> pb =
AllocateAligned<T>(HWY_MAX(1, misalign_b + count + padding));
T* b = pb.get() + misalign_b;
AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(HWY_MAX(1, count));
size_t num_odd = 0;
for (size_t i = 0; i < count; ++i) {
if (a[i] & 1) {
expected[num_odd++] = a[i];
}
}
#if HWY_GENERIC_LAMBDA
const auto is_odd = [](const auto d, const auto v) HWY_ATTR {
return TestBit(v, Set(d, TFromD<decltype(d)>{1}));
};
#else
const IsOdd is_odd;
#endif
T* end = CopyIf(d, a, count, b, is_odd);
const size_t num_written = static_cast<size_t>(end - b);
HWY_ASSERT_EQ(num_odd, num_written);
const auto info = hwy::detail::MakeTypeInfo<T>();
const char* target_name = hwy::TargetName(HWY_TARGET);
hwy::detail::AssertArrayEqual(info, expected.get(), b, num_odd, target_name,
__FILE__, __LINE__);
}
};
void TestAllCopyIf() {
ForUI163264(ForPartialVectors<ForeachCountAndMisalign<TestCopyIf>>());
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
HWY_BEFORE_TEST(CopyTest);
HWY_EXPORT_AND_TEST_P(CopyTest, TestAllFill);
HWY_EXPORT_AND_TEST_P(CopyTest, TestAllCopy);
HWY_EXPORT_AND_TEST_P(CopyTest, TestAllCopyIf);
} // namespace hwy
#endif

109
third_party/highway/hwy/contrib/algo/find-inl.h vendored Normal file

@ -0,0 +1,109 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Per-target include guard
#if defined(HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_) == \
defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_
#undef HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_
#else
#define HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_
#endif
#include "hwy/highway.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Returns index of the first element equal to `value` in `in[0, count)`, or
// `count` if not found.
template <class D, typename T = TFromD<D>>
size_t Find(D d, T value, const T* HWY_RESTRICT in, size_t count) {
const size_t N = Lanes(d);
const Vec<D> broadcasted = Set(d, value);
size_t i = 0;
for (; i + N <= count; i += N) {
const intptr_t pos = FindFirstTrue(d, Eq(broadcasted, LoadU(d, in + i)));
if (pos >= 0) return i + static_cast<size_t>(pos);
}
if (i != count) {
#if HWY_MEM_OPS_MIGHT_FAULT
// Scan single elements.
const CappedTag<T, 1> d1;
using V1 = Vec<decltype(d1)>;
const V1 broadcasted1 = Set(d1, GetLane(broadcasted));
for (; i < count; ++i) {
if (AllTrue(d1, Eq(broadcasted1, LoadU(d1, in + i)))) {
return i;
}
}
#else
const size_t remaining = count - i;
HWY_DASSERT(0 != remaining && remaining < N);
const Mask<D> mask = FirstN(d, remaining);
const Vec<D> v = MaskedLoad(mask, d, in + i);
// Apply mask so that we don't 'find' the zero-padding from MaskedLoad.
const intptr_t pos = FindFirstTrue(d, And(Eq(broadcasted, v), mask));
if (pos >= 0) return i + static_cast<size_t>(pos);
#endif // HWY_MEM_OPS_MIGHT_FAULT
}
return count; // not found
}
// Returns index of the first element in `in[0, count)` for which `func(d, vec)`
// returns true, otherwise `count`.
template <class D, class Func, typename T = TFromD<D>>
size_t FindIf(D d, const T* HWY_RESTRICT in, size_t count, const Func& func) {
const size_t N = Lanes(d);
size_t i = 0;
for (; i + N <= count; i += N) {
const intptr_t pos = FindFirstTrue(d, func(d, LoadU(d, in + i)));
if (pos >= 0) return i + static_cast<size_t>(pos);
}
if (i != count) {
#if HWY_MEM_OPS_MIGHT_FAULT
// Scan single elements.
const CappedTag<T, 1> d1;
for (; i < count; ++i) {
if (AllTrue(d1, func(d1, LoadU(d1, in + i)))) {
return i;
}
}
#else
const size_t remaining = count - i;
HWY_DASSERT(0 != remaining && remaining < N);
const Mask<D> mask = FirstN(d, remaining);
const Vec<D> v = MaskedLoad(mask, d, in + i);
// Apply mask so that we don't 'find' the zero-padding from MaskedLoad.
const intptr_t pos = FindFirstTrue(d, And(func(d, v), mask));
if (pos >= 0) return i + static_cast<size_t>(pos);
#endif // HWY_MEM_OPS_MIGHT_FAULT
}
return count; // not found
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#endif // HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_
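A minimal sketch (not part of the patch) of calling Find/FindIf; `IndexOf`, `FirstNegative`, and the input array are illustrative assumptions.
```
#include "hwy/highway.h"
#include "hwy/contrib/algo/find-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// Index of the first element equal to `value`, or `count` if absent.
size_t IndexOf(float value, const float* HWY_RESTRICT in, size_t count) {
  const ScalableTag<float> d;
  return Find(d, value, in, count);
}

// Index of the first negative element, or `count` if there is none.
size_t FirstNegative(const float* HWY_RESTRICT in, size_t count) {
  const ScalableTag<float> d;
  return FindIf(d, in, count, [](const auto d, const auto v) HWY_ATTR {
    return Lt(v, Zero(d));
  });
}

}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();
```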

219
third_party/highway/hwy/contrib/algo/find_test.cc vendored Normal file

@ -0,0 +1,219 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <vector>
#include "hwy/aligned_allocator.h"
#include "hwy/base.h"
#include "hwy/print.h"
// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/algo/find_test.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/contrib/algo/find-inl.h"
#include "hwy/tests/test_util-inl.h"
// clang-format on
// If your project requires C++14 or later, you can ignore this and pass lambdas
// directly to FindIf, without requiring an lvalue as we do here for C++11.
#if __cplusplus < 201402L
#define HWY_GENERIC_LAMBDA 0
#else
#define HWY_GENERIC_LAMBDA 1
#endif
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Returns random number in [-8, 8) - we use knowledge of the range to Find()
// values we know are not present.
template <typename T>
T Random(RandomState& rng) {
const int32_t bits = static_cast<int32_t>(Random32(&rng)) & 1023;
const double val = (bits - 512) / 64.0;
// Clamp negative to zero for unsigned types.
return static_cast<T>(HWY_MAX(hwy::LowestValue<T>(), val));
}
// In C++14, we can instead define these as generic lambdas next to where they
// are invoked.
#if !HWY_GENERIC_LAMBDA
class GreaterThan {
public:
GreaterThan(int val) : val_(val) {}
template <class D, class V>
Mask<D> operator()(D d, V v) const {
return Gt(v, Set(d, static_cast<TFromD<D>>(val_)));
}
private:
int val_;
};
#endif // !HWY_GENERIC_LAMBDA
// Invokes Test (e.g. TestFind) with all arg combinations.
template <class Test>
struct ForeachCountAndMisalign {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) const {
RandomState rng;
const size_t N = Lanes(d);
const size_t misalignments[3] = {0, N / 4, 3 * N / 5};
// Find() checks 8 vectors at a time, so we want to cover a fairly large
// range without oversampling (checking every possible count).
std::vector<size_t> counts(AdjustedReps(512));
for (size_t& count : counts) {
count = static_cast<size_t>(rng()) % (16 * N + 1);
}
counts[0] = 0; // ensure we test count=0.
for (size_t count : counts) {
for (size_t m : misalignments) {
Test()(d, count, m, rng);
}
}
}
};
struct TestFind {
template <class D>
void operator()(D d, size_t count, size_t misalign, RandomState& rng) {
using T = TFromD<D>;
// Must allocate at least one even if count is zero.
AlignedFreeUniquePtr<T[]> storage =
AllocateAligned<T>(HWY_MAX(1, misalign + count));
T* in = storage.get() + misalign;
for (size_t i = 0; i < count; ++i) {
in[i] = Random<T>(rng);
}
// For each position, search for that element (which we know is there)
for (size_t pos = 0; pos < count; ++pos) {
const size_t actual = Find(d, in[pos], in, count);
// We may have found an earlier occurrence of the same value; ensure the
// value is the same, and that it is the first.
if (!IsEqual(in[pos], in[actual])) {
fprintf(stderr, "%s count %d, found %.15f at %d but wanted %.15f\n",
hwy::TypeName(T(), Lanes(d)).c_str(), static_cast<int>(count),
static_cast<double>(in[actual]), static_cast<int>(actual),
static_cast<double>(in[pos]));
HWY_ASSERT(false);
}
for (size_t i = 0; i < actual; ++i) {
if (IsEqual(in[i], in[pos])) {
fprintf(stderr, "%s count %d, found %f at %d but Find returned %d\n",
hwy::TypeName(T(), Lanes(d)).c_str(), static_cast<int>(count),
static_cast<double>(in[i]), static_cast<int>(i),
static_cast<int>(actual));
HWY_ASSERT(false);
}
}
}
// Also search for values we know not to be present (out of range)
HWY_ASSERT_EQ(count, Find(d, T{9}, in, count));
HWY_ASSERT_EQ(count, Find(d, static_cast<T>(-9), in, count));
}
};
void TestAllFind() {
ForAllTypes(ForPartialVectors<ForeachCountAndMisalign<TestFind>>());
}
struct TestFindIf {
template <class D>
void operator()(D d, size_t count, size_t misalign, RandomState& rng) {
using T = TFromD<D>;
using TI = MakeSigned<T>;
// Must allocate at least one even if count is zero.
AlignedFreeUniquePtr<T[]> storage =
AllocateAligned<T>(HWY_MAX(1, misalign + count));
T* in = storage.get() + misalign;
for (size_t i = 0; i < count; ++i) {
in[i] = Random<T>(rng);
HWY_ASSERT(in[i] < 8);
HWY_ASSERT(!hwy::IsSigned<T>() || static_cast<TI>(in[i]) >= -8);
}
bool found_any = false;
bool not_found_any = false;
// unsigned T would be promoted to signed and compare greater than any
// negative val, whereas Set() would just cast to an unsigned value and the
// comparison remains unsigned, so avoid negative numbers there.
const int min_val = IsSigned<T>() ? -9 : 0;
// Includes out-of-range value 9 to test the not-found path.
for (int val = min_val; val <= 9; ++val) {
#if HWY_GENERIC_LAMBDA
const auto greater = [val](const auto d, const auto v) HWY_ATTR {
return Gt(v, Set(d, static_cast<T>(val)));
};
#else
const GreaterThan greater(val);
#endif
const size_t actual = FindIf(d, in, count, greater);
found_any |= actual < count;
not_found_any |= actual == count;
const auto pos = std::find_if(
in, in + count, [val](T x) { return x > static_cast<T>(val); });
// Convert returned iterator to index.
const size_t expected = static_cast<size_t>(pos - in);
if (expected != actual) {
fprintf(stderr, "%s count %d val %d, expected %d actual %d\n",
hwy::TypeName(T(), Lanes(d)).c_str(), static_cast<int>(count),
val, static_cast<int>(expected), static_cast<int>(actual));
hwy::detail::PrintArray(hwy::detail::MakeTypeInfo<T>(), "in", in, count,
0, count);
HWY_ASSERT(false);
}
}
// We will always not-find something due to val=9.
HWY_ASSERT(not_found_any);
// We'll find something unless the input is empty or {0} - because 0 > i
// is false for all i=[0,9].
if (count != 0 && in[0] != 0) {
HWY_ASSERT(found_any);
}
}
};
void TestAllFindIf() {
ForAllTypes(ForPartialVectors<ForeachCountAndMisalign<TestFindIf>>());
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
HWY_BEFORE_TEST(FindTest);
HWY_EXPORT_AND_TEST_P(FindTest, TestAllFind);
HWY_EXPORT_AND_TEST_P(FindTest, TestAllFindIf);
} // namespace hwy
#endif

262
third_party/highway/hwy/contrib/algo/transform-inl.h vendored Normal file

@ -0,0 +1,262 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Per-target include guard
#if defined(HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_) == \
defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_
#undef HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_
#else
#define HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_
#endif
#include "hwy/highway.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// These functions avoid having to write a loop plus remainder handling in the
// (unfortunately still common) case where arrays are not aligned/padded. If the
// inputs are known to be aligned/padded, it is more efficient to write a single
// loop using Load(). We do not provide a TransformAlignedPadded because it
// would be more verbose than such a loop.
//
// Func is either a functor with a templated operator()(d, v[, v1[, v2]]), or a
// generic lambda if using C++14. Due to apparent limitations of Clang on
// Windows, it is currently necessary to add HWY_ATTR before the opening { of
// the lambda to avoid errors about "always_inline function .. requires target".
//
// If HWY_MEM_OPS_MIGHT_FAULT, we use scalar code instead of masking. Otherwise,
// we use `MaskedLoad` and `BlendedStore` to read/write the final partial
// vector.
// Fills `out[0, count)` with the vectors returned by `func(d, index_vec)`,
// where `index_vec` is `Vec<RebindToUnsigned<D>>`. On the first call to `func`,
// the value of its lane i is i, and increases by `Lanes(d)` after every call.
// Note that some of these indices may be `>= count`, but the elements that
// `func` returns in those lanes will not be written to `out`.
template <class D, class Func, typename T = TFromD<D>>
void Generate(D d, T* HWY_RESTRICT out, size_t count, const Func& func) {
const RebindToUnsigned<D> du;
using TU = TFromD<decltype(du)>;
const size_t N = Lanes(d);
size_t idx = 0;
Vec<decltype(du)> vidx = Iota(du, 0);
for (; idx + N <= count; idx += N) {
StoreU(func(d, vidx), d, out + idx);
vidx = Add(vidx, Set(du, static_cast<TU>(N)));
}
// `count` was a multiple of the vector length `N`: already done.
if (HWY_UNLIKELY(idx == count)) return;
#if HWY_MEM_OPS_MIGHT_FAULT
// Proceed one by one.
const CappedTag<T, 1> d1;
const RebindToUnsigned<decltype(d1)> du1;
for (; idx < count; ++idx) {
StoreU(func(d1, Set(du1, static_cast<TU>(idx))), d1, out + idx);
}
#else
const size_t remaining = count - idx;
HWY_DASSERT(0 != remaining && remaining < N);
const Mask<D> mask = FirstN(d, remaining);
BlendedStore(func(d, vidx), mask, d, out + idx);
#endif
}
// Replaces `inout[idx]` with `func(d, inout[idx])`. Example usage: multiplying
// array elements by a constant.
template <class D, class Func, typename T = TFromD<D>>
void Transform(D d, T* HWY_RESTRICT inout, size_t count, const Func& func) {
const size_t N = Lanes(d);
size_t idx = 0;
for (; idx + N <= count; idx += N) {
const Vec<D> v = LoadU(d, inout + idx);
StoreU(func(d, v), d, inout + idx);
}
// `count` was a multiple of the vector length `N`: already done.
if (HWY_UNLIKELY(idx == count)) return;
#if HWY_MEM_OPS_MIGHT_FAULT
// Proceed one by one.
const CappedTag<T, 1> d1;
for (; idx < count; ++idx) {
using V1 = Vec<decltype(d1)>;
const V1 v = LoadU(d1, inout + idx);
StoreU(func(d1, v), d1, inout + idx);
}
#else
const size_t remaining = count - idx;
HWY_DASSERT(0 != remaining && remaining < N);
const Mask<D> mask = FirstN(d, remaining);
const Vec<D> v = MaskedLoad(mask, d, inout + idx);
BlendedStore(func(d, v), mask, d, inout + idx);
#endif
}
// Replaces `inout[idx]` with `func(d, inout[idx], in1[idx])`. Example usage:
// multiplying array elements by those of another array.
template <class D, class Func, typename T = TFromD<D>>
void Transform1(D d, T* HWY_RESTRICT inout, size_t count,
const T* HWY_RESTRICT in1, const Func& func) {
const size_t N = Lanes(d);
size_t idx = 0;
for (; idx + N <= count; idx += N) {
const Vec<D> v = LoadU(d, inout + idx);
const Vec<D> v1 = LoadU(d, in1 + idx);
StoreU(func(d, v, v1), d, inout + idx);
}
// `count` was a multiple of the vector length `N`: already done.
if (HWY_UNLIKELY(idx == count)) return;
#if HWY_MEM_OPS_MIGHT_FAULT
// Proceed one by one.
const CappedTag<T, 1> d1;
for (; idx < count; ++idx) {
using V1 = Vec<decltype(d1)>;
const V1 v = LoadU(d1, inout + idx);
const V1 v1 = LoadU(d1, in1 + idx);
StoreU(func(d1, v, v1), d1, inout + idx);
}
#else
const size_t remaining = count - idx;
HWY_DASSERT(0 != remaining && remaining < N);
const Mask<D> mask = FirstN(d, remaining);
const Vec<D> v = MaskedLoad(mask, d, inout + idx);
const Vec<D> v1 = MaskedLoad(mask, d, in1 + idx);
BlendedStore(func(d, v, v1), mask, d, inout + idx);
#endif
}
// Replaces `inout[idx]` with `func(d, inout[idx], in1[idx], in2[idx])`. Example
// usage: FMA of elements from three arrays, stored into the first array.
template <class D, class Func, typename T = TFromD<D>>
void Transform2(D d, T* HWY_RESTRICT inout, size_t count,
const T* HWY_RESTRICT in1, const T* HWY_RESTRICT in2,
const Func& func) {
const size_t N = Lanes(d);
size_t idx = 0;
for (; idx + N <= count; idx += N) {
const Vec<D> v = LoadU(d, inout + idx);
const Vec<D> v1 = LoadU(d, in1 + idx);
const Vec<D> v2 = LoadU(d, in2 + idx);
StoreU(func(d, v, v1, v2), d, inout + idx);
}
// `count` was a multiple of the vector length `N`: already done.
if (HWY_UNLIKELY(idx == count)) return;
#if HWY_MEM_OPS_MIGHT_FAULT
// Proceed one by one.
const CappedTag<T, 1> d1;
for (; idx < count; ++idx) {
using V1 = Vec<decltype(d1)>;
const V1 v = LoadU(d1, inout + idx);
const V1 v1 = LoadU(d1, in1 + idx);
const V1 v2 = LoadU(d1, in2 + idx);
StoreU(func(d1, v, v1, v2), d1, inout + idx);
}
#else
const size_t remaining = count - idx;
HWY_DASSERT(0 != remaining && remaining < N);
const Mask<D> mask = FirstN(d, remaining);
const Vec<D> v = MaskedLoad(mask, d, inout + idx);
const Vec<D> v1 = MaskedLoad(mask, d, in1 + idx);
const Vec<D> v2 = MaskedLoad(mask, d, in2 + idx);
BlendedStore(func(d, v, v1, v2), mask, d, inout + idx);
#endif
}
template <class D, typename T = TFromD<D>>
void Replace(D d, T* HWY_RESTRICT inout, size_t count, T new_t, T old_t) {
const size_t N = Lanes(d);
const Vec<D> old_v = Set(d, old_t);
const Vec<D> new_v = Set(d, new_t);
size_t idx = 0;
for (; idx + N <= count; idx += N) {
Vec<D> v = LoadU(d, inout + idx);
StoreU(IfThenElse(Eq(v, old_v), new_v, v), d, inout + idx);
}
// `count` was a multiple of the vector length `N`: already done.
if (HWY_UNLIKELY(idx == count)) return;
#if HWY_MEM_OPS_MIGHT_FAULT
// Proceed one by one.
const CappedTag<T, 1> d1;
const Vec<decltype(d1)> old_v1 = Set(d1, old_t);
const Vec<decltype(d1)> new_v1 = Set(d1, new_t);
for (; idx < count; ++idx) {
using V1 = Vec<decltype(d1)>;
const V1 v1 = LoadU(d1, inout + idx);
StoreU(IfThenElse(Eq(v1, old_v1), new_v1, v1), d1, inout + idx);
}
#else
const size_t remaining = count - idx;
HWY_DASSERT(0 != remaining && remaining < N);
const Mask<D> mask = FirstN(d, remaining);
const Vec<D> v = MaskedLoad(mask, d, inout + idx);
BlendedStore(IfThenElse(Eq(v, old_v), new_v, v), mask, d, inout + idx);
#endif
}
template <class D, class Func, typename T = TFromD<D>>
void ReplaceIf(D d, T* HWY_RESTRICT inout, size_t count, T new_t,
const Func& func) {
const size_t N = Lanes(d);
const Vec<D> new_v = Set(d, new_t);
size_t idx = 0;
for (; idx + N <= count; idx += N) {
Vec<D> v = LoadU(d, inout + idx);
StoreU(IfThenElse(func(d, v), new_v, v), d, inout + idx);
}
// `count` was a multiple of the vector length `N`: already done.
if (HWY_UNLIKELY(idx == count)) return;
#if HWY_MEM_OPS_MIGHT_FAULT
// Proceed one by one.
const CappedTag<T, 1> d1;
const Vec<decltype(d1)> new_v1 = Set(d1, new_t);
for (; idx < count; ++idx) {
using V1 = Vec<decltype(d1)>;
const V1 v = LoadU(d1, inout + idx);
StoreU(IfThenElse(func(d1, v), new_v1, v), d1, inout + idx);
}
#else
const size_t remaining = count - idx;
HWY_DASSERT(0 != remaining && remaining < N);
const Mask<D> mask = FirstN(d, remaining);
const Vec<D> v = MaskedLoad(mask, d, inout + idx);
BlendedStore(IfThenElse(func(d, v), new_v, v), mask, d, inout + idx);
#endif
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#endif // HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_
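A minimal sketch (not part of the patch) of the helpers above; `AxpyThenReplaceZeros` and its arguments are illustrative assumptions, mirroring the AXPY used by transform_test.cc below.
```
#include "hwy/highway.h"
#include "hwy/contrib/algo/transform-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// x[i] = alpha * x[i] + y[i] over unaligned, unpadded arrays, then replaces
// any exact zeros in x with 1.0f.
void AxpyThenReplaceZeros(float alpha, float* HWY_RESTRICT x,
                          const float* HWY_RESTRICT y, size_t count) {
  const ScalableTag<float> d;
  Transform1(d, x, count, y,
             [alpha](const auto d, const auto v, const auto v1) HWY_ATTR {
               return MulAdd(Set(d, alpha), v, v1);
             });
  Replace(d, x, count, /*new_t=*/1.0f, /*old_t=*/0.0f);
}

}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();
```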

372
third_party/highway/hwy/contrib/algo/transform_test.cc vendored Normal file

@ -0,0 +1,372 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string.h>
#include "hwy/aligned_allocator.h"
// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/algo/transform_test.cc" //NOLINT
#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/contrib/algo/transform-inl.h"
#include "hwy/tests/test_util-inl.h"
// clang-format on
// If your project requires C++14 or later, you can ignore this and pass lambdas
// directly to Transform, without requiring an lvalue as we do here for C++11.
#if __cplusplus < 201402L
#define HWY_GENERIC_LAMBDA 0
#else
#define HWY_GENERIC_LAMBDA 1
#endif
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
template <typename T>
T Alpha() {
return static_cast<T>(1.5); // arbitrary scalar
}
// Returns random floating-point number in [-8, 8) to ensure computations do
// not exceed float32 precision.
template <typename T>
T Random(RandomState& rng) {
const int32_t bits = static_cast<int32_t>(Random32(&rng)) & 1023;
const double val = (bits - 512) / 64.0;
// Clamp negative to zero for unsigned types.
return static_cast<T>(HWY_MAX(hwy::LowestValue<T>(), val));
}
// SCAL, AXPY names are from BLAS.
template <typename T>
HWY_NOINLINE void SimpleSCAL(const T* x, T* out, size_t count) {
for (size_t i = 0; i < count; ++i) {
out[i] = Alpha<T>() * x[i];
}
}
template <typename T>
HWY_NOINLINE void SimpleAXPY(const T* x, const T* y, T* out, size_t count) {
for (size_t i = 0; i < count; ++i) {
out[i] = Alpha<T>() * x[i] + y[i];
}
}
template <typename T>
HWY_NOINLINE void SimpleFMA4(const T* x, const T* y, const T* z, T* out,
size_t count) {
for (size_t i = 0; i < count; ++i) {
out[i] = x[i] * y[i] + z[i];
}
}
// In C++14, we can instead define these as generic lambdas next to where they
// are invoked.
#if !HWY_GENERIC_LAMBDA
// Generator that returns even numbers by doubling the output indices.
struct Gen2 {
template <class D, class VU>
Vec<D> operator()(D d, VU vidx) const {
return BitCast(d, Add(vidx, vidx));
}
};
struct SCAL {
template <class D, class V>
Vec<D> operator()(D d, V v) const {
using T = TFromD<D>;
return Mul(Set(d, Alpha<T>()), v);
}
};
struct AXPY {
template <class D, class V>
Vec<D> operator()(D d, V v, V v1) const {
using T = TFromD<D>;
return MulAdd(Set(d, Alpha<T>()), v, v1);
}
};
struct FMA4 {
template <class D, class V>
Vec<D> operator()(D /*d*/, V v, V v1, V v2) const {
return MulAdd(v, v1, v2);
}
};
#endif // !HWY_GENERIC_LAMBDA
// Invokes Test (e.g. TestTransform1) with all arg combinations. T comes from
// ForFloatTypes.
template <class Test>
struct ForeachCountAndMisalign {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) const {
RandomState rng;
const size_t N = Lanes(d);
const size_t misalignments[3] = {0, N / 4, 3 * N / 5};
for (size_t count = 0; count < 2 * N; ++count) {
for (size_t ma : misalignments) {
for (size_t mb : misalignments) {
Test()(d, count, ma, mb, rng);
}
}
}
}
};
// Output-only, no loads
struct TestGenerate {
template <class D>
void operator()(D d, size_t count, size_t misalign_a, size_t /*misalign_b*/,
RandomState& /*rng*/) {
using T = TFromD<D>;
AlignedFreeUniquePtr<T[]> pa = AllocateAligned<T>(misalign_a + count + 1);
T* actual = pa.get() + misalign_a;
AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(HWY_MAX(1, count));
for (size_t i = 0; i < count; ++i) {
expected[i] = static_cast<T>(2 * i);
}
// TODO(janwas): can we update the apply_to in HWY_PUSH_ATTRIBUTES so that
// the attribute also applies to lambdas? If so, remove HWY_ATTR.
#if HWY_GENERIC_LAMBDA
const auto gen2 = [](const auto d, const auto vidx)
HWY_ATTR { return BitCast(d, Add(vidx, vidx)); };
#else
const Gen2 gen2;
#endif
actual[count] = T{0}; // sentinel
Generate(d, actual, count, gen2);
HWY_ASSERT_EQ(T{0}, actual[count]); // did not write past end
const auto info = hwy::detail::MakeTypeInfo<T>();
const char* target_name = hwy::TargetName(HWY_TARGET);
hwy::detail::AssertArrayEqual(info, expected.get(), actual, count,
target_name, __FILE__, __LINE__);
}
};
// Zero extra input arrays
struct TestTransform {
template <class D>
void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
RandomState& rng) {
if (misalign_b != 0) return;
using T = TFromD<D>;
// Prevents error if size to allocate is zero.
AlignedFreeUniquePtr<T[]> pa =
AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
T* a = pa.get() + misalign_a;
for (size_t i = 0; i < count; ++i) {
a[i] = Random<T>(rng);
}
AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(HWY_MAX(1, count));
SimpleSCAL(a, expected.get(), count);
// TODO(janwas): can we update the apply_to in HWY_PUSH_ATTRIBUTES so that
// the attribute also applies to lambdas? If so, remove HWY_ATTR.
#if HWY_GENERIC_LAMBDA
const auto scal = [](const auto d, const auto v)
HWY_ATTR { return Mul(Set(d, Alpha<T>()), v); };
#else
const SCAL scal;
#endif
Transform(d, a, count, scal);
const auto info = hwy::detail::MakeTypeInfo<T>();
const char* target_name = hwy::TargetName(HWY_TARGET);
hwy::detail::AssertArrayEqual(info, expected.get(), a, count, target_name,
__FILE__, __LINE__);
}
};
// One extra input array
struct TestTransform1 {
template <class D>
void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
RandomState& rng) {
using T = TFromD<D>;
// Prevents error if size to allocate is zero.
AlignedFreeUniquePtr<T[]> pa =
AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
AlignedFreeUniquePtr<T[]> pb =
AllocateAligned<T>(HWY_MAX(1, misalign_b + count));
T* a = pa.get() + misalign_a;
T* b = pb.get() + misalign_b;
for (size_t i = 0; i < count; ++i) {
a[i] = Random<T>(rng);
b[i] = Random<T>(rng);
}
AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(HWY_MAX(1, count));
SimpleAXPY(a, b, expected.get(), count);
#if HWY_GENERIC_LAMBDA
const auto axpy = [](const auto d, const auto v, const auto v1) HWY_ATTR {
return MulAdd(Set(d, Alpha<T>()), v, v1);
};
#else
const AXPY axpy;
#endif
Transform1(d, a, count, b, axpy);
const auto info = hwy::detail::MakeTypeInfo<T>();
const char* target_name = hwy::TargetName(HWY_TARGET);
hwy::detail::AssertArrayEqual(info, expected.get(), a, count, target_name,
__FILE__, __LINE__);
}
};
// Two extra input arrays
struct TestTransform2 {
template <class D>
void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
RandomState& rng) {
using T = TFromD<D>;
// Prevents error if size to allocate is zero.
AlignedFreeUniquePtr<T[]> pa =
AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
AlignedFreeUniquePtr<T[]> pb =
AllocateAligned<T>(HWY_MAX(1, misalign_b + count));
AlignedFreeUniquePtr<T[]> pc =
AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
T* a = pa.get() + misalign_a;
T* b = pb.get() + misalign_b;
T* c = pc.get() + misalign_a;
for (size_t i = 0; i < count; ++i) {
a[i] = Random<T>(rng);
b[i] = Random<T>(rng);
c[i] = Random<T>(rng);
}
AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(HWY_MAX(1, count));
SimpleFMA4(a, b, c, expected.get(), count);
#if HWY_GENERIC_LAMBDA
const auto fma4 = [](auto /*d*/, auto v, auto v1, auto v2)
HWY_ATTR { return MulAdd(v, v1, v2); };
#else
const FMA4 fma4;
#endif
Transform2(d, a, count, b, c, fma4);
const auto info = hwy::detail::MakeTypeInfo<T>();
const char* target_name = hwy::TargetName(HWY_TARGET);
hwy::detail::AssertArrayEqual(info, expected.get(), a, count, target_name,
__FILE__, __LINE__);
}
};
template <typename T>
class IfEq {
public:
IfEq(T val) : val_(val) {}
template <class D, class V>
Mask<D> operator()(D d, V v) const {
return Eq(v, Set(d, val_));
}
private:
T val_;
};
struct TestReplace {
template <class D>
void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
RandomState& rng) {
if (misalign_b != 0) return;
if (count == 0) return;
using T = TFromD<D>;
AlignedFreeUniquePtr<T[]> pa = AllocateAligned<T>(misalign_a + count);
T* a = pa.get() + misalign_a;
for (size_t i = 0; i < count; ++i) {
a[i] = Random<T>(rng);
}
AlignedFreeUniquePtr<T[]> pb = AllocateAligned<T>(count);
AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(count);
std::vector<size_t> positions(AdjustedReps(count));
for (size_t& pos : positions) {
pos = static_cast<size_t>(rng()) % count;
}
for (size_t pos = 0; pos < count; ++pos) {
const T old_t = a[pos];
const T new_t = Random<T>(rng);
for (size_t i = 0; i < count; ++i) {
expected[i] = IsEqual(a[i], old_t) ? new_t : a[i];
}
// Copy so ReplaceIf gets the same input (and thus also outputs expected)
memcpy(pb.get(), a, count * sizeof(T));
Replace(d, a, count, new_t, old_t);
HWY_ASSERT_ARRAY_EQ(expected.get(), a, count);
ReplaceIf(d, pb.get(), count, new_t, IfEq<T>(old_t));
HWY_ASSERT_ARRAY_EQ(expected.get(), pb.get(), count);
}
}
};
void TestAllGenerate() {
// The test BitCast-s the indices, which does not work for floats.
ForIntegerTypes(ForPartialVectors<ForeachCountAndMisalign<TestGenerate>>());
}
void TestAllTransform() {
ForFloatTypes(ForPartialVectors<ForeachCountAndMisalign<TestTransform>>());
}
void TestAllTransform1() {
ForFloatTypes(ForPartialVectors<ForeachCountAndMisalign<TestTransform1>>());
}
void TestAllTransform2() {
ForFloatTypes(ForPartialVectors<ForeachCountAndMisalign<TestTransform2>>());
}
void TestAllReplace() {
ForFloatTypes(ForPartialVectors<ForeachCountAndMisalign<TestReplace>>());
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
HWY_BEFORE_TEST(TransformTest);
HWY_EXPORT_AND_TEST_P(TransformTest, TestAllGenerate);
HWY_EXPORT_AND_TEST_P(TransformTest, TestAllTransform);
HWY_EXPORT_AND_TEST_P(TransformTest, TestAllTransform1);
HWY_EXPORT_AND_TEST_P(TransformTest, TestAllTransform2);
HWY_EXPORT_AND_TEST_P(TransformTest, TestAllReplace);
} // namespace hwy
#endif

62
third_party/highway/hwy/contrib/dot/dot-inl.h vendored

@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -34,23 +35,19 @@ struct Dot {
// argument to Compute. Each one may improve performance or reduce code size,
// at the cost of additional requirements on the arguments.
enum Assumptions {
// num_elements is at least N, which may be up to HWY_MAX_LANES(T).
// num_elements is at least N, which may be up to HWY_MAX_BYTES / sizeof(T).
kAtLeastOneVector = 1,
// num_elements is divisible by N (a power of two, so this can be used if
// the problem size is known to be a power of two >= HWY_MAX_LANES(T)).
// the problem size is known to be a power of two >= HWY_MAX_BYTES /
// sizeof(T)).
kMultipleOfVector = 2,
// RoundUpTo(num_elements, N) elements are accessible; their value does not
// matter (will be treated as if they were zero).
kPaddedToVector = 4,
// Pointers pa and pb, respectively, are multiples of N * sizeof(T).
// For example, aligned_allocator.h ensures this. Note that it is still
// beneficial to ensure such alignment even if these flags are not set.
// If not set, the pointers need only be aligned to alignof(T).
kVectorAlignedA = 8,
kVectorAlignedB = 16,
};
// Returns sum{pa[i] * pb[i]} for float or double inputs.
// Returns sum{pa[i] * pb[i]} for float or double inputs. Aligning the
// pointers to a multiple of N elements is helpful but not required.
template <int kAssumptions, class D, typename T = TFromD<D>,
HWY_IF_NOT_LANE_SIZE_D(D, 2)>
static HWY_INLINE T Compute(const D d, const T* const HWY_RESTRICT pa,
@ -67,8 +64,6 @@ struct Dot {
constexpr bool kIsMultipleOfVector =
(kAssumptions & kMultipleOfVector) != 0;
constexpr bool kIsPaddedToVector = (kAssumptions & kPaddedToVector) != 0;
constexpr bool kIsAlignedA = (kAssumptions & kVectorAlignedA) != 0;
constexpr bool kIsAlignedB = (kAssumptions & kVectorAlignedB) != 0;
// Won't be able to do a full vector load without padding => scalar loop.
if (!kIsAtLeastOneVector && !kIsMultipleOfVector && !kIsPaddedToVector &&
@ -98,28 +93,28 @@ struct Dot {
// Main loop: unrolled
for (; i + 4 * N <= num_elements; /* i += 4 * N */) { // incr in loop
const auto a0 = kIsAlignedA ? Load(d, pa + i) : LoadU(d, pa + i);
const auto b0 = kIsAlignedB ? Load(d, pb + i) : LoadU(d, pb + i);
const auto a0 = LoadU(d, pa + i);
const auto b0 = LoadU(d, pb + i);
i += N;
sum0 = MulAdd(a0, b0, sum0);
const auto a1 = kIsAlignedA ? Load(d, pa + i) : LoadU(d, pa + i);
const auto b1 = kIsAlignedB ? Load(d, pb + i) : LoadU(d, pb + i);
const auto a1 = LoadU(d, pa + i);
const auto b1 = LoadU(d, pb + i);
i += N;
sum1 = MulAdd(a1, b1, sum1);
const auto a2 = kIsAlignedA ? Load(d, pa + i) : LoadU(d, pa + i);
const auto b2 = kIsAlignedB ? Load(d, pb + i) : LoadU(d, pb + i);
const auto a2 = LoadU(d, pa + i);
const auto b2 = LoadU(d, pb + i);
i += N;
sum2 = MulAdd(a2, b2, sum2);
const auto a3 = kIsAlignedA ? Load(d, pa + i) : LoadU(d, pa + i);
const auto b3 = kIsAlignedB ? Load(d, pb + i) : LoadU(d, pb + i);
const auto a3 = LoadU(d, pa + i);
const auto b3 = LoadU(d, pb + i);
i += N;
sum3 = MulAdd(a3, b3, sum3);
}
// Up to 3 iterations of whole vectors
for (; i + N <= num_elements; i += N) {
const auto a = kIsAlignedA ? Load(d, pa + i) : LoadU(d, pa + i);
const auto b = kIsAlignedB ? Load(d, pb + i) : LoadU(d, pb + i);
const auto a = LoadU(d, pa + i);
const auto b = LoadU(d, pb + i);
sum0 = MulAdd(a, b, sum0);
}
@ -128,8 +123,8 @@ struct Dot {
if (remaining != 0) {
if (kIsPaddedToVector) {
const auto mask = FirstN(d, remaining);
const auto a = kIsAlignedA ? Load(d, pa + i) : LoadU(d, pa + i);
const auto b = kIsAlignedB ? Load(d, pb + i) : LoadU(d, pb + i);
const auto a = LoadU(d, pa + i);
const auto b = LoadU(d, pb + i);
sum1 = MulAdd(IfThenElseZero(mask, a), IfThenElseZero(mask, b), sum1);
} else {
// Unaligned load such that the last element is in the highest lane -
@ -152,7 +147,8 @@ struct Dot {
return GetLane(SumOfLanes(d, sum0));
}
// Returns sum{pa[i] * pb[i]} for bfloat16 inputs.
// Returns sum{pa[i] * pb[i]} for bfloat16 inputs. Aligning the pointers to a
// multiple of N elements is helpful but not required.
template <int kAssumptions, class D>
static HWY_INLINE float Compute(const D d,
const bfloat16_t* const HWY_RESTRICT pa,
@ -170,8 +166,6 @@ struct Dot {
constexpr bool kIsMultipleOfVector =
(kAssumptions & kMultipleOfVector) != 0;
constexpr bool kIsPaddedToVector = (kAssumptions & kPaddedToVector) != 0;
constexpr bool kIsAlignedA = (kAssumptions & kVectorAlignedA) != 0;
constexpr bool kIsAlignedB = (kAssumptions & kVectorAlignedB) != 0;
// Won't be able to do a full vector load without padding => scalar loop.
if (!kIsAtLeastOneVector && !kIsMultipleOfVector && !kIsPaddedToVector &&
@ -197,20 +191,20 @@ struct Dot {
// Main loop: unrolled
for (; i + 2 * N <= num_elements; /* i += 2 * N */) { // incr in loop
const auto a0 = kIsAlignedA ? Load(d, pa + i) : LoadU(d, pa + i);
const auto b0 = kIsAlignedB ? Load(d, pb + i) : LoadU(d, pb + i);
const auto a0 = LoadU(d, pa + i);
const auto b0 = LoadU(d, pb + i);
i += N;
sum0 = ReorderWidenMulAccumulate(df32, a0, b0, sum0, sum1);
const auto a1 = kIsAlignedA ? Load(d, pa + i) : LoadU(d, pa + i);
const auto b1 = kIsAlignedB ? Load(d, pb + i) : LoadU(d, pb + i);
const auto a1 = LoadU(d, pa + i);
const auto b1 = LoadU(d, pb + i);
i += N;
sum2 = ReorderWidenMulAccumulate(df32, a1, b1, sum2, sum3);
}
// Possibly one more iteration of whole vectors
if (i + N <= num_elements) {
const auto a0 = kIsAlignedA ? Load(d, pa + i) : LoadU(d, pa + i);
const auto b0 = kIsAlignedB ? Load(d, pb + i) : LoadU(d, pb + i);
const auto a0 = LoadU(d, pa + i);
const auto b0 = LoadU(d, pb + i);
i += N;
sum0 = ReorderWidenMulAccumulate(df32, a0, b0, sum0, sum1);
}
@ -220,8 +214,8 @@ struct Dot {
if (remaining != 0) {
if (kIsPaddedToVector) {
const auto mask = FirstN(du16, remaining);
const auto va = kIsAlignedA ? Load(d, pa + i) : LoadU(d, pa + i);
const auto vb = kIsAlignedB ? Load(d, pb + i) : LoadU(d, pb + i);
const auto va = LoadU(d, pa + i);
const auto vb = LoadU(d, pb + i);
const auto a16 = BitCast(d, IfThenElseZero(mask, BitCast(du16, va)));
const auto b16 = BitCast(d, IfThenElseZero(mask, BitCast(du16, vb)));
sum2 = ReorderWidenMulAccumulate(df32, a16, b16, sum2, sum3);
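With the alignment flags gone, a call site only chooses among the remaining size/padding assumptions. A minimal sketch (not part of the patch); `DotF32` and its arguments are illustrative assumptions.
```
#include "hwy/highway.h"
#include "hwy/contrib/dot/dot-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// Dot product with no assumptions (scalar fallback for short arrays). Pass
// Dot::kPaddedToVector instead of 0 if RoundUpTo(num, Lanes(d)) elements of
// both arrays are readable.
float DotF32(const float* HWY_RESTRICT pa, const float* HWY_RESTRICT pb,
             size_t num) {
  const ScalableTag<float> d;
  return Dot::Compute<0>(d, pa, pb, num);
}

}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();
```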

third_party/highway/hwy/contrib/dot/dot_test.cc

@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -21,7 +22,7 @@
// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/dot/dot_test.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/contrib/dot/dot-inl.h"
#include "hwy/tests/test_util-inl.h"
@ -69,11 +70,6 @@ class TestDot {
return static_cast<float>(bits - 512) * (1.0f / 64);
};
const bool kIsAlignedA = (kAssumptions & Dot::kVectorAlignedA) != 0;
const bool kIsAlignedB = (kAssumptions & Dot::kVectorAlignedB) != 0;
HWY_ASSERT(!kIsAlignedA || misalign_a == 0);
HWY_ASSERT(!kIsAlignedB || misalign_b == 0);
const size_t padded =
(kAssumptions & Dot::kPaddedToVector) ? RoundUpTo(num, N) : num;
AlignedFreeUniquePtr<T[]> pa = AllocateAligned<T>(misalign_a + padded);
@ -99,27 +95,11 @@ class TestDot {
HWY_ASSERT(expected - 1E-4 <= actual && actual <= expected + 1E-4);
}
// Runs tests with various alignments compatible with the given assumptions.
// Runs tests with various alignments.
template <int kAssumptions, class D>
void ForeachMisalign(D d, size_t num, RandomState& rng) {
static_assert(
(kAssumptions & (Dot::kVectorAlignedA | Dot::kVectorAlignedB)) == 0,
"Alignment must not be specified by caller");
const size_t N = Lanes(d);
const size_t misalignments[3] = {0, N / 4, 3 * N / 5};
// Both flags, both aligned
Test<kAssumptions | Dot::kVectorAlignedA | Dot::kVectorAlignedB>(d, num, 0,
0, rng);
// One flag and aligned, other aligned/misaligned
for (size_t m : misalignments) {
Test<kAssumptions | Dot::kVectorAlignedA>(d, num, 0, m, rng);
Test<kAssumptions | Dot::kVectorAlignedB>(d, num, m, 0, rng);
}
// Neither flag, all combinations of aligned/misaligned
for (size_t ma : misalignments) {
for (size_t mb : misalignments) {
Test<kAssumptions>(d, num, ma, mb, rng);
@ -184,10 +164,4 @@ HWY_EXPORT_AND_TEST_P(DotTest, TestAllDot);
HWY_EXPORT_AND_TEST_P(DotTest, TestAllDotBF16);
} // namespace hwy
// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
#endif

third_party/highway/hwy/contrib/image/image.cc

@ -1,4 +1,5 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -19,7 +20,7 @@
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/image/image.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/highway.h"
HWY_BEFORE_NAMESPACE();
@ -104,7 +105,7 @@ ImageBase::ImageBase(const size_t xsize, const size_t ysize,
}
void ImageBase::InitializePadding(const size_t sizeof_t, Padding padding) {
#if defined(MEMORY_SANITIZER) || HWY_IDE
#if HWY_IS_MSAN || HWY_IDE
if (xsize_ == 0 || ysize_ == 0) return;
const size_t vec_size = VectorSize(); // Bytes, independent of sizeof_t!
@ -130,7 +131,7 @@ void ImageBase::InitializePadding(const size_t sizeof_t, Padding padding) {
#else
(void)sizeof_t;
(void)padding;
#endif // MEMORY_SANITIZER
#endif // HWY_IS_MSAN
}
void ImageBase::Swap(ImageBase& other) {

View file

@ -1,4 +1,5 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

third_party/highway/hwy/contrib/image/image_test.cc

@ -1,4 +1,5 @@
// Copyright (c) the JPEG XL Project
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -14,14 +15,7 @@
#include "hwy/contrib/image/image.h"
#include <cstddef>
#include "hwy/base.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/image/image_test.cc"
#include "hwy/foreach_target.h"
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
@ -29,6 +23,11 @@
#include <random>
#include <utility>
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/image/image_test.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target:
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
@ -82,7 +81,7 @@ struct TestUnalignedT {
// This test reads padding, which only works if it was initialized,
// which only happens in MSAN builds.
#if defined(MEMORY_SANITIZER) || HWY_IDE
#if HWY_IS_MSAN || HWY_IDE
// Initialize only the valid samples
for (size_t y = 0; y < ysize; ++y) {
T* HWY_RESTRICT row = img.MutableRow(y);
@ -150,10 +149,4 @@ HWY_EXPORT_AND_TEST_P(ImageTest, TestAligned);
HWY_EXPORT_AND_TEST_P(ImageTest, TestUnaligned);
} // namespace hwy
// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
#endif

View file

@ -1,4 +1,5 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

third_party/highway/hwy/contrib/math/math_test.cc

@ -1,4 +1,5 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -20,7 +21,7 @@
// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/math/math_test.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/contrib/math/math-inl.h"
#include "hwy/tests/test_util-inl.h"
@ -219,10 +220,4 @@ HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllSinh);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllTanh);
} // namespace hwy
// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char **argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
#endif

100
third_party/highway/hwy/contrib/sort/BUILD vendored

@ -8,35 +8,91 @@ COMPAT = [
"//buildenv/target:non_prod", # includes mobile/vendor.
]
# cc_library(
# name = "vxsort",
# srcs = [
# "vxsort/isa_detection.cpp",
# "vxsort/isa_detection_msvc.cpp",
# "vxsort/isa_detection_sane.cpp",
# "vxsort/machine_traits.avx2.cpp",
# "vxsort/smallsort/avx2_load_mask_tables.cpp",
# "vxsort/smallsort/bitonic_sort.AVX2.double.generated.cpp",
# "vxsort/smallsort/bitonic_sort.AVX2.float.generated.cpp",
# "vxsort/smallsort/bitonic_sort.AVX2.int32_t.generated.cpp",
# "vxsort/smallsort/bitonic_sort.AVX2.int64_t.generated.cpp",
# "vxsort/smallsort/bitonic_sort.AVX2.uint32_t.generated.cpp",
# "vxsort/smallsort/bitonic_sort.AVX2.uint64_t.generated.cpp",
# "vxsort/smallsort/bitonic_sort.AVX512.double.generated.cpp",
# "vxsort/smallsort/bitonic_sort.AVX512.float.generated.cpp",
# "vxsort/smallsort/bitonic_sort.AVX512.int32_t.generated.cpp",
# "vxsort/smallsort/bitonic_sort.AVX512.int64_t.generated.cpp",
# "vxsort/smallsort/bitonic_sort.AVX512.uint32_t.generated.cpp",
# "vxsort/smallsort/bitonic_sort.AVX512.uint64_t.generated.cpp",
# "vxsort/vxsort_stats.cpp",
# ],
# hdrs = [
# "vxsort/alignment.h",
# "vxsort/defs.h",
# "vxsort/isa_detection.h",
# "vxsort/machine_traits.avx2.h",
# "vxsort/machine_traits.avx512.h",
# "vxsort/machine_traits.h",
# "vxsort/packer.h",
# "vxsort/smallsort/bitonic_sort.AVX2.double.generated.h",
# "vxsort/smallsort/bitonic_sort.AVX2.float.generated.h",
# "vxsort/smallsort/bitonic_sort.AVX2.int32_t.generated.h",
# "vxsort/smallsort/bitonic_sort.AVX2.int64_t.generated.h",
# "vxsort/smallsort/bitonic_sort.AVX2.uint32_t.generated.h",
# "vxsort/smallsort/bitonic_sort.AVX2.uint64_t.generated.h",
# "vxsort/smallsort/bitonic_sort.AVX512.double.generated.h",
# "vxsort/smallsort/bitonic_sort.AVX512.float.generated.h",
# "vxsort/smallsort/bitonic_sort.AVX512.int32_t.generated.h",
# "vxsort/smallsort/bitonic_sort.AVX512.int64_t.generated.h",
# "vxsort/smallsort/bitonic_sort.AVX512.uint32_t.generated.h",
# "vxsort/smallsort/bitonic_sort.AVX512.uint64_t.generated.h",
# "vxsort/smallsort/bitonic_sort.h",
# "vxsort/vxsort.h",
# "vxsort/vxsort_stats.h",
# ],
# compatible_with = [],
# textual_hdrs = [
# "vxsort/vxsort_targets_disable.h",
# "vxsort/vxsort_targets_enable_avx2.h",
# "vxsort/vxsort_targets_enable_avx512.h",
# ],
# )
cc_library(
name = "vqsort",
srcs = [
# Split into separate files to reduce MSVC build time.
"vqsort.cc",
"vqsort_i16a.cc",
"vqsort_i16d.cc",
"vqsort_u16a.cc",
"vqsort_u16d.cc",
"vqsort_f32a.cc",
"vqsort_f32d.cc",
"vqsort_i32a.cc",
"vqsort_i32d.cc",
"vqsort_u32a.cc",
"vqsort_u32d.cc",
"vqsort_f64a.cc",
"vqsort_f64d.cc",
"vqsort_i64a.cc",
"vqsort_i64d.cc",
"vqsort_u64a.cc",
"vqsort_u64d.cc",
"vqsort_128a.cc",
"vqsort_128d.cc",
"vqsort_f32a.cc",
"vqsort_f32d.cc",
"vqsort_f64a.cc",
"vqsort_f64d.cc",
"vqsort_i16a.cc",
"vqsort_i16d.cc",
"vqsort_i32a.cc",
"vqsort_i32d.cc",
"vqsort_i64a.cc",
"vqsort_i64d.cc",
"vqsort_kv128a.cc",
"vqsort_kv128d.cc",
"vqsort_u16a.cc",
"vqsort_u16d.cc",
"vqsort_u32a.cc",
"vqsort_u32d.cc",
"vqsort_u64a.cc",
"vqsort_u64d.cc",
],
hdrs = [
"disabled_targets.h",
"vqsort.h", # public interface
],
compatible_with = [],
local_defines = ["hwy_contrib_EXPORTS"],
textual_hdrs = [
"shared-inl.h",
"sorting_networks-inl.h",
@ -48,6 +104,7 @@ cc_library(
# Only if VQSORT_SECURE_RNG is set.
# "//third_party/absl/random",
"//:hwy",
# ":vxsort", # required if HAVE_VXSORT
],
)
@ -86,8 +143,7 @@ cc_test(
name = "sort_test",
size = "medium",
srcs = ["sort_test.cc"],
features = ["fully_static_link"],
linkstatic = True,
# Do not enable fully_static_link (pthread crash on bazel)
local_defines = ["HWY_IS_TEST"],
# for test_suite.
tags = ["hwy_ops_test"],
@ -104,8 +160,7 @@ cc_binary(
name = "bench_sort",
testonly = 1,
srcs = ["bench_sort.cc"],
features = ["fully_static_link"],
linkstatic = True,
# Do not enable fully_static_link (pthread crash on bazel)
local_defines = ["HWY_IS_TEST"],
deps = [
":helpers",
@ -120,8 +175,7 @@ cc_binary(
name = "bench_parallel",
testonly = 1,
srcs = ["bench_parallel.cc"],
features = ["fully_static_link"],
linkstatic = True,
# Do not enable fully_static_link (pthread crash on bazel)
local_defines = ["HWY_IS_TEST"],
deps = [
":helpers",

81
third_party/highway/hwy/contrib/sort/README.md vendored Normal file

@ -0,0 +1,81 @@
# Vectorized and performance-portable Quicksort
## Introduction
As of 2022-06-07 this sorts large arrays of built-in types about ten times as
fast as `std::sort`. See also our
[blog post](https://opensource.googleblog.com/2022/06/Vectorized%20and%20performance%20portable%20Quicksort.html)
and [paper](https://arxiv.org/abs/2205.05982).
## Instructions
Here are instructions for reproducing our results on x86 Linux (AVX2, AVX-512)
and Arm V1 (NEON, SVE).
### x86 (Linux)
Please first ensure golang and Clang (tested with 13.0.1) are installed via
your system's package manager.
```
go install github.com/bazelbuild/bazelisk@latest
git clone https://github.com/google/highway
cd highway
CC=clang CXX=clang++ ~/go/bin/bazelisk build -c opt hwy/contrib/sort:all
bazel-bin/hwy/contrib/sort/sort_test
bazel-bin/hwy/contrib/sort/bench_sort
```
### AWS Graviton3
Instance config: amazon linux 5.10 arm64, c7g.8xlarge (largest allowed config is
32 vCPU). Initial launch will fail. Wait a few minutes for an email saying the
config is verified, then re-launch. See IPv4 hostname in list of instances.
`ssh -i /path/key.pem ec2-user@hostname`
Note that the AWS CMake package is too old for llvm, so we build it first:
```
wget https://cmake.org/files/v3.23/cmake-3.23.2.tar.gz
tar -xvzf cmake-3.23.2.tar.gz && cd cmake-3.23.2/
./bootstrap -- -DCMAKE_USE_OPENSSL=OFF
make -j8 && sudo make install
cd ..
```
AWS clang is at version 11.1, which generates unnecessary AND instructions that
slow down the sort by 1.15x. We tested with clang trunk as of June 13
(which reports Git hash 8f6512fea000c3a0d394864bb94e524bee375069). To build:
```
git clone --depth 1 https://github.com/llvm/llvm-project.git
cd llvm-project
mkdir -p build && cd build
/usr/local/bin/cmake ../llvm -DLLVM_ENABLE_PROJECTS="clang" -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi" -DCMAKE_BUILD_TYPE=Release
make -j32 && sudo make install
```
```
sudo yum install go
go install github.com/bazelbuild/bazelisk@latest
git clone https://github.com/google/highway
cd highway
CC=/usr/local/bin/clang CXX=/usr/local/bin/clang++ ~/go/bin/bazelisk build -c opt --copt=-march=armv8.2-a+sve hwy/contrib/sort:all
bazel-bin/hwy/contrib/sort/sort_test
bazel-bin/hwy/contrib/sort/bench_sort
```
## Results
`bench_sort` outputs the instruction set (AVX3 refers to AVX-512), the sort
algorithm (std for `std::sort`, vq for our vqsort), the type of keys being
sorted (f32 is float), the distribution of keys (uniform32 for uniform random
with range 0-2^32), the number of keys, then the throughput of sorted keys (i.e.
number of key bytes output per second).
Example excerpt from Xeon 6154 (Skylake-X) CPU clocked at 3 GHz:
```
[ RUN ] BenchSortGroup/BenchSort.BenchAllSort/AVX3
AVX3: std: f32: uniform32: 1.00E+06 54 MB/s ( 1 threads)
AVX3: vq: f32: uniform32: 1.00E+06 1143 MB/s ( 1 threads)
```
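The `vq` rows above measure the same entry point that library users call: the
`Sorter` functor declared in `hwy/contrib/sort/vqsort.h` (the benchmark harness
invokes it as `shared.tls[thread].sorter(...)`). A minimal usage sketch, assuming
only that header as vendored in this revision:

```
// Sketch: sort five int64 keys ascending with vqsort.
#include <stdint.h>

#include "hwy/contrib/sort/vqsort.h"

int main() {
  int64_t keys[5] = {5, 3, 9, 1, 7};
  hwy::Sorter sorter;  // allocates its reusable internal buffer once
  // Dispatches at runtime to the best enabled target (e.g. AVX2/AVX3 on x86).
  sorter(keys, 5, hwy::SortAscending());
  return keys[0] == 1 ? 0 : 1;  // keys are now {1, 3, 5, 7, 9}
}
```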

View file

@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -29,16 +30,18 @@
// Third-party algorithms
#define HAVE_AVX2SORT 0
#define HAVE_IPS4O 0
// When enabling, consider changing max_threads (required for Table 1a)
#define HAVE_PARALLEL_IPS4O (HAVE_IPS4O && 1)
#define HAVE_PDQSORT 0
#define HAVE_SORT512 0
#define HAVE_VXSORT 0
#if HAVE_AVX2SORT
HWY_PUSH_ATTRIBUTES("avx2,avx")
#include "avx2sort.h"
#include "avx2sort.h" //NOLINT
HWY_POP_ATTRIBUTES
#endif
#if HAVE_IPS4O
#if HAVE_IPS4O || HAVE_PARALLEL_IPS4O
#include "third_party/ips4o/include/ips4o.hpp"
#include "third_party/ips4o/include/ips4o/thread_pool.hpp"
#endif
@ -46,18 +49,59 @@ HWY_POP_ATTRIBUTES
#include "third_party/boost/allowed/sort/sort.hpp"
#endif
#if HAVE_SORT512
#include "sort512.h"
#include "sort512.h" //NOLINT
#endif
// vxsort is difficult to compile for multiple targets because it also uses
// .cpp files, and we'd also have to #undef its include guards. Instead, compile
// only for AVX2 or AVX3 depending on this macro.
#define VXSORT_AVX3 1
#if HAVE_VXSORT
// inlined from vxsort_targets_enable_avx512 (must close before end of header)
#ifdef __GNUC__
#ifdef __clang__
#if VXSORT_AVX3
#pragma clang attribute push(__attribute__((target("avx512f,avx512dq"))), \
apply_to = any(function))
#else
#pragma clang attribute push(__attribute__((target("avx2"))), \
apply_to = any(function))
#endif // VXSORT_AVX3
#else
#pragma GCC push_options
#if VXSORT_AVX3
#pragma GCC target("avx512f,avx512dq")
#else
#pragma GCC target("avx2")
#endif // VXSORT_AVX3
#endif
#endif
#if VXSORT_AVX3
#include "vxsort/machine_traits.avx512.h"
#else
#include "vxsort/machine_traits.avx2.h"
#endif // VXSORT_AVX3
#include "vxsort/vxsort.h"
#ifdef __GNUC__
#ifdef __clang__
#pragma clang attribute pop
#else
#pragma GCC pop_options
#endif
#endif
#endif // HAVE_VXSORT
namespace hwy {
enum class Dist { kUniform8, kUniform16, kUniform32 };
std::vector<Dist> AllDist() {
return {/*Dist::kUniform8,*/ Dist::kUniform16, Dist::kUniform32};
static inline std::vector<Dist> AllDist() {
return {/*Dist::kUniform8, Dist::kUniform16,*/ Dist::kUniform32};
}
const char* DistName(Dist dist) {
static inline const char* DistName(Dist dist) {
switch (dist) {
case Dist::kUniform8:
return "uniform8";
@ -75,7 +119,13 @@ class InputStats {
void Notify(T value) {
min_ = std::min(min_, value);
max_ = std::max(max_, value);
sumf_ += static_cast<double>(value);
// Converting to integer would truncate floats, and multiplying to save digits
// risks overflow (especially when casting), so instead take the sum of the
// bit representations as the checksum.
uint64_t bits = 0;
static_assert(sizeof(T) <= 8, "Expected a built-in type");
CopyBytes<sizeof(T)>(&value, &bits);
sum_ += bits;
count_ += 1;
}
@ -86,20 +136,16 @@ class InputStats {
}
if (min_ != other.min_ || max_ != other.max_) {
HWY_ABORT("minmax %f/%f vs %f/%f\n", double(min_), double(max_),
double(other.min_), double(other.max_));
HWY_ABORT("minmax %f/%f vs %f/%f\n", static_cast<double>(min_),
static_cast<double>(max_), static_cast<double>(other.min_),
static_cast<double>(other.max_));
}
// Sum helps detect duplicated/lost values
if (sumf_ != other.sumf_) {
// Allow some tolerance because kUniform32 * num can exceed double
// precision.
const double mul = 1E-9; // prevent destructive cancellation
const double err = std::abs(sumf_ * mul - other.sumf_ * mul);
if (err > 1E-3) {
HWY_ABORT("Sum mismatch %.15e %.15e (%f) min %g max %g\n", sumf_,
other.sumf_, err, double(min_), double(max_));
}
if (sum_ != other.sum_) {
HWY_ABORT("Sum mismatch %g %g; min %g max %g\n",
static_cast<double>(sum_), static_cast<double>(other.sum_),
static_cast<double>(min_), static_cast<double>(max_));
}
return true;
@ -108,7 +154,7 @@ class InputStats {
private:
T min_ = hwy::HighestValue<T>();
T max_ = hwy::LowestValue<T>();
double sumf_ = 0.0;
uint64_t sum_ = 0;
size_t count_ = 0;
};
@ -127,13 +173,16 @@ enum class Algo {
#endif
#if HAVE_SORT512
kSort512,
#endif
#if HAVE_VXSORT
kVXSort,
#endif
kStd,
kVQSort,
kHeap,
};
const char* AlgoName(Algo algo) {
static inline const char* AlgoName(Algo algo) {
switch (algo) {
#if HAVE_AVX2SORT
case Algo::kSEA:
@ -154,6 +203,10 @@ const char* AlgoName(Algo algo) {
#if HAVE_SORT512
case Algo::kSort512:
return "sort512";
#endif
#if HAVE_VXSORT
case Algo::kVXSort:
return "vxsort";
#endif
case Algo::kStd:
return "std";
@ -205,12 +258,11 @@ class Xorshift128Plus {
}
// Need to pass in the state because vector cannot be class members.
template <class DU64>
static Vec<DU64> RandomBits(DU64 /* tag */, Vec<DU64>& state0,
Vec<DU64>& state1) {
Vec<DU64> s1 = state0;
Vec<DU64> s0 = state1;
const Vec<DU64> bits = Add(s1, s0);
template <class VU64>
static VU64 RandomBits(VU64& state0, VU64& state1) {
VU64 s1 = state0;
VU64 s0 = state1;
const VU64 bits = Add(s1, s0);
state0 = s0;
s1 = Xor(s1, ShiftLeft<23>(s1));
state1 = Xor(s1, Xor(s0, Xor(ShiftRight<18>(s1), ShiftRight<5>(s0))));
@ -218,30 +270,34 @@ class Xorshift128Plus {
}
};
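The vector loop above applies the standard xorshift128+ update to every 64-bit
lane independently. A scalar sketch of the same recurrence, for illustration
(the helper name is illustrative):

// Scalar xorshift128+ step; RandomBits above performs this per 64-bit lane.
static inline uint64_t Xorshift128PlusScalar(uint64_t& state0, uint64_t& state1) {
  uint64_t s1 = state0;
  const uint64_t s0 = state1;
  const uint64_t bits = s1 + s0;  // the returned random bits
  state0 = s0;
  s1 ^= s1 << 23;
  state1 = s1 ^ s0 ^ (s1 >> 18) ^ (s0 >> 5);
  return bits;
}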
template <typename T, class DU64, HWY_IF_NOT_FLOAT(T)>
Vec<DU64> RandomValues(DU64 du64, Vec<DU64>& s0, Vec<DU64>& s1,
const Vec<DU64> mask) {
const Vec<DU64> bits = Xorshift128Plus::RandomBits(du64, s0, s1);
return And(bits, mask);
template <class D, class VU64, HWY_IF_NOT_FLOAT_D(D)>
Vec<D> RandomValues(D d, VU64& s0, VU64& s1, const VU64 mask) {
const VU64 bits = Xorshift128Plus::RandomBits(s0, s1);
return BitCast(d, And(bits, mask));
}
// Important to avoid denormals, which are flushed to zero by SIMD but not
// It is important to avoid denormals, which are flushed to zero by SIMD but not
// scalar sorts, and NaN, which may be ordered differently in scalar vs. SIMD.
template <typename T, class DU64, HWY_IF_FLOAT(T)>
Vec<DU64> RandomValues(DU64 du64, Vec<DU64>& s0, Vec<DU64>& s1,
const Vec<DU64> mask) {
const Vec<DU64> bits = Xorshift128Plus::RandomBits(du64, s0, s1);
const Vec<DU64> values = And(bits, mask);
#if HWY_TARGET == HWY_SCALAR // Cannot repartition u64 to i32
const RebindToSigned<DU64> di;
template <class DF, class VU64, HWY_IF_FLOAT_D(DF)>
Vec<DF> RandomValues(DF df, VU64& s0, VU64& s1, const VU64 mask) {
using TF = TFromD<DF>;
const RebindToUnsigned<decltype(df)> du;
using VU = Vec<decltype(du)>;
const VU64 bits64 = And(Xorshift128Plus::RandomBits(s0, s1), mask);
#if HWY_TARGET == HWY_SCALAR // Cannot repartition u64 to smaller types
using TU = MakeUnsigned<TF>;
const VU bits = Set(du, static_cast<TU>(GetLane(bits64) & LimitsMax<TU>()));
#else
const Repartition<MakeSigned<T>, DU64> di;
const VU bits = BitCast(du, bits64);
#endif
const RebindToFloat<decltype(di)> df;
// Avoid NaN/denormal by converting from (range-limited) integer.
const Vec<DU64> no_nan =
And(values, Set(du64, MantissaMask<MakeUnsigned<T>>()));
return BitCast(du64, ConvertTo(df, BitCast(di, no_nan)));
// Avoid NaN/denormal by only generating values in [1, 2), i.e. random
// mantissas with the exponent taken from the representation of 1.0.
const VU k1 = BitCast(du, Set(df, TF{1.0}));
const VU mantissa_mask = Set(du, MantissaMask<TF>());
const VU representation = OrAnd(k1, bits, mantissa_mask);
return BitCast(df, representation);
}
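In scalar terms, the OrAnd above keeps only mantissa bits from the random word
and ORs in the exponent of 1.0, yielding a value in [1, 2). A sketch for
illustration, assuming float32 (the helper name is illustrative):

// Build a float in [1, 2): random mantissa, exponent taken from 1.0f.
static inline float RandomFloat1To2(uint32_t random_bits) {
  const uint32_t kOneBits = 0x3F800000u;       // bit pattern of 1.0f
  const uint32_t kMantissaMask = 0x007FFFFFu;  // low 23 bits
  const uint32_t representation = kOneBits | (random_bits & kMantissaMask);
  float f;
  memcpy(&f, &representation, sizeof(f));  // bit-cast; requires <string.h>
  return f;                                // never NaN, never a denormal
}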
template <class DU64>
@ -269,29 +325,29 @@ InputStats<T> GenerateInput(const Dist dist, T* v, size_t num) {
SortTag<uint64_t> du64;
using VU64 = Vec<decltype(du64)>;
const size_t N64 = Lanes(du64);
auto buf = hwy::AllocateAligned<uint64_t>(2 * N64);
Xorshift128Plus::GenerateSeeds(du64, buf.get());
auto s0 = Load(du64, buf.get());
auto s1 = Load(du64, buf.get() + N64);
const VU64 mask = MaskForDist(du64, dist, sizeof(T));
auto seeds = hwy::AllocateAligned<uint64_t>(2 * N64);
Xorshift128Plus::GenerateSeeds(du64, seeds.get());
VU64 s0 = Load(du64, seeds.get());
VU64 s1 = Load(du64, seeds.get() + N64);
#if HWY_TARGET == HWY_SCALAR
const Sisd<T> d;
#else
const Repartition<T, decltype(du64)> d;
#endif
using V = Vec<decltype(d)>;
const size_t N = Lanes(d);
const VU64 mask = MaskForDist(du64, dist, sizeof(T));
auto buf = hwy::AllocateAligned<T>(N);
size_t i = 0;
for (; i + N <= num; i += N) {
const VU64 bits = RandomValues<T>(du64, s0, s1, mask);
#if HWY_ARCH_RVV
// v may not be 64-bit aligned
StoreU(bits, du64, buf.get());
memcpy(v + i, buf.get(), N64 * sizeof(uint64_t));
#else
StoreU(bits, du64, reinterpret_cast<uint64_t*>(v + i));
#endif
const V values = RandomValues(d, s0, s1, mask);
StoreU(values, d, v + i);
}
if (i < num) {
const VU64 bits = RandomValues<T>(du64, s0, s1, mask);
StoreU(bits, du64, buf.get());
const V values = RandomValues(d, s0, s1, mask);
StoreU(values, d, buf.get());
memcpy(v + i, buf.get(), (num - i) * sizeof(T));
}
@ -308,18 +364,65 @@ struct ThreadLocal {
struct SharedState {
#if HAVE_PARALLEL_IPS4O
ips4o::StdThreadPool pool{
HWY_MIN(16, static_cast<int>(std::thread::hardware_concurrency() / 2))};
const unsigned max_threads = hwy::LimitsMax<unsigned>(); // 16 for Table 1a
ips4o::StdThreadPool pool{static_cast<int>(
HWY_MIN(max_threads, std::thread::hardware_concurrency() / 2))};
#endif
std::vector<ThreadLocal> tls{1};
};
template <class Order, typename T>
void Run(Algo algo, T* HWY_RESTRICT inout, size_t num, SharedState& shared,
size_t thread) {
using detail::HeapSort;
using detail::LaneTraits;
// Bridge from keys (passed to Run) to lanes as expected by HeapSort. For
// non-128-bit keys they are the same:
template <class Order, typename KeyType, HWY_IF_NOT_LANE_SIZE(KeyType, 16)>
void CallHeapSort(KeyType* HWY_RESTRICT keys, const size_t num_keys) {
using detail::TraitsLane;
using detail::SharedTraits;
if (Order().IsAscending()) {
const SharedTraits<TraitsLane<detail::OrderAscending<KeyType>>> st;
return detail::HeapSort(st, keys, num_keys);
} else {
const SharedTraits<TraitsLane<detail::OrderDescending<KeyType>>> st;
return detail::HeapSort(st, keys, num_keys);
}
}
#if VQSORT_ENABLED
template <class Order>
void CallHeapSort(hwy::uint128_t* HWY_RESTRICT keys, const size_t num_keys) {
using detail::SharedTraits;
using detail::Traits128;
uint64_t* lanes = reinterpret_cast<uint64_t*>(keys);
const size_t num_lanes = num_keys * 2;
if (Order().IsAscending()) {
const SharedTraits<Traits128<detail::OrderAscending128>> st;
return detail::HeapSort(st, lanes, num_lanes);
} else {
const SharedTraits<Traits128<detail::OrderDescending128>> st;
return detail::HeapSort(st, lanes, num_lanes);
}
}
template <class Order>
void CallHeapSort(K64V64* HWY_RESTRICT keys, const size_t num_keys) {
using detail::SharedTraits;
using detail::Traits128;
uint64_t* lanes = reinterpret_cast<uint64_t*>(keys);
const size_t num_lanes = num_keys * 2;
if (Order().IsAscending()) {
const SharedTraits<Traits128<detail::OrderAscendingKV128>> st;
return detail::HeapSort(st, lanes, num_lanes);
} else {
const SharedTraits<Traits128<detail::OrderDescendingKV128>> st;
return detail::HeapSort(st, lanes, num_lanes);
}
}
#endif // VQSORT_ENABLED
template <class Order, typename KeyType>
void Run(Algo algo, KeyType* HWY_RESTRICT inout, size_t num,
SharedState& shared, size_t thread) {
const std::less<KeyType> less;
const std::greater<KeyType> greater;
switch (algo) {
#if HAVE_AVX2SORT
@ -330,18 +433,18 @@ void Run(Algo algo, T* HWY_RESTRICT inout, size_t num, SharedState& shared,
#if HAVE_IPS4O
case Algo::kIPS4O:
if (Order().IsAscending()) {
return ips4o::sort(inout, inout + num, std::less<T>());
return ips4o::sort(inout, inout + num, less);
} else {
return ips4o::sort(inout, inout + num, std::greater<T>());
return ips4o::sort(inout, inout + num, greater);
}
#endif
#if HAVE_PARALLEL_IPS4O
case Algo::kParallelIPS4O:
if (Order().IsAscending()) {
return ips4o::parallel::sort(inout, inout + num, std::less<T>());
return ips4o::parallel::sort(inout, inout + num, less, shared.pool);
} else {
return ips4o::parallel::sort(inout, inout + num, std::greater<T>());
return ips4o::parallel::sort(inout, inout + num, greater, shared.pool);
}
#endif
@ -354,33 +457,47 @@ void Run(Algo algo, T* HWY_RESTRICT inout, size_t num, SharedState& shared,
#if HAVE_PDQSORT
case Algo::kPDQ:
if (Order().IsAscending()) {
return boost::sort::pdqsort_branchless(inout, inout + num,
std::less<T>());
return boost::sort::pdqsort_branchless(inout, inout + num, less);
} else {
return boost::sort::pdqsort_branchless(inout, inout + num,
std::greater<T>());
return boost::sort::pdqsort_branchless(inout, inout + num, greater);
}
#endif
#if HAVE_VXSORT
case Algo::kVXSort: {
#if (VXSORT_AVX3 && HWY_TARGET != HWY_AVX3) || \
(!VXSORT_AVX3 && HWY_TARGET != HWY_AVX2)
fprintf(stderr, "Do not call for target %s\n",
hwy::TargetName(HWY_TARGET));
return;
#else
#if VXSORT_AVX3
vxsort::vxsort<KeyType, vxsort::AVX512> vx;
#else
vxsort::vxsort<KeyType, vxsort::AVX2> vx;
#endif
if (Order().IsAscending()) {
return vx.sort(inout, inout + num - 1);
} else {
fprintf(stderr, "Skipping VX - does not support descending order\n");
return;
}
#endif // enabled for this target
}
#endif // HAVE_VXSORT
case Algo::kStd:
if (Order().IsAscending()) {
return std::sort(inout, inout + num, std::less<T>());
return std::sort(inout, inout + num, less);
} else {
return std::sort(inout, inout + num, std::greater<T>());
return std::sort(inout, inout + num, greater);
}
case Algo::kVQSort:
return shared.tls[thread].sorter(inout, num, Order());
case Algo::kHeap:
HWY_ASSERT(sizeof(T) < 16);
if (Order().IsAscending()) {
const SharedTraits<LaneTraits<detail::OrderAscending>> st;
return HeapSort(st, inout, num);
} else {
const SharedTraits<LaneTraits<detail::OrderDescending>> st;
return HeapSort(st, inout, num);
}
return CallHeapSort<Order>(inout, num);
default:
HWY_ABORT("Not implemented");

View file

@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -15,20 +16,6 @@
// Concurrent, independent sorts for generating more memory traffic and testing
// scalability.
// clang-format off
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/bench_parallel.cc"
#include "hwy/foreach_target.h"
// After foreach_target
#include "hwy/contrib/sort/algo-inl.h"
#include "hwy/contrib/sort/result-inl.h"
#include "hwy/aligned_allocator.h"
// Last
#include "hwy/tests/test_util-inl.h"
// clang-format on
#include <stdint.h>
#include <stdio.h>
@ -40,18 +27,29 @@
#include <utility>
#include <vector>
// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/bench_parallel.cc" //NOLINT
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/algo-inl.h"
#include "hwy/contrib/sort/result-inl.h"
#include "hwy/aligned_allocator.h"
// Last
#include "hwy/tests/test_util-inl.h"
// clang-format on
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace {
#if HWY_TARGET != HWY_SCALAR
class ThreadPool {
public:
// Starts the given number of worker threads and blocks until they are ready.
explicit ThreadPool(
const size_t num_threads = std::thread::hardware_concurrency() / 2)
const size_t num_threads = std::thread::hardware_concurrency())
: num_threads_(num_threads) {
HWY_ASSERT(num_threads_ > 0);
threads_.reserve(num_threads_);
@ -168,36 +166,42 @@ class ThreadPool {
const void* data_; // points to caller's Func
};
template <class Order, typename T>
void RunWithoutVerify(const Dist dist, const size_t num, const Algo algo,
SharedState& shared, size_t thread) {
auto aligned = hwy::AllocateAligned<T>(num);
template <class Traits>
void RunWithoutVerify(Traits st, const Dist dist, const size_t num_keys,
const Algo algo, SharedState& shared, size_t thread) {
using LaneType = typename Traits::LaneType;
using KeyType = typename Traits::KeyType;
using Order = typename Traits::Order;
const size_t num_lanes = num_keys * st.LanesPerKey();
auto aligned = hwy::AllocateAligned<LaneType>(num_lanes);
(void)GenerateInput(dist, aligned.get(), num);
(void)GenerateInput(dist, aligned.get(), num_lanes);
const Timestamp t0;
Run<Order>(algo, aligned.get(), num, shared, thread);
HWY_ASSERT(aligned[0] < aligned[num - 1]);
Run<Order>(algo, reinterpret_cast<KeyType*>(aligned.get()), num_keys, shared,
thread);
HWY_ASSERT(aligned[0] < aligned[num_lanes - 1]);
}
void BenchParallel() {
// Not interested in benchmark results for other targets
if (HWY_TARGET != HWY_AVX3) return;
// Not interested in benchmark results for other targets on x86
if (HWY_ARCH_X86 && (HWY_TARGET != HWY_AVX2 && HWY_TARGET != HWY_AVX3)) {
return;
}
ThreadPool pool;
const size_t NT = pool.NumThreads();
using T = int64_t;
detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
size_t num = 100 * 1000 * 1000;
detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<int64_t>>> st;
using KeyType = typename decltype(st)::KeyType;
const size_t num_keys = size_t{100} * 1000 * 1000;
#if HAVE_IPS4O
const Algo algo = Algo::kIPS4O;
#else
const Algo algo = Algo::kVQSort;
#endif
const Dist dist = Dist::kUniform16;
const Dist dist = Dist::kUniform32;
SharedState shared;
shared.tls.resize(NT);
@ -207,18 +211,15 @@ void BenchParallel() {
Timestamp t0;
// Default capture because MSVC wants algo/dist but clang does not.
pool.RunOnThreads(nt, [=, &shared](size_t thread) {
RunWithoutVerify<SortAscending, T>(dist, num, algo, shared, thread);
RunWithoutVerify(st, dist, num_keys, algo, shared, thread);
});
const double sec = SecondsSince(t0);
results.push_back(MakeResult<T>(algo, dist, st, num, nt, sec));
results.emplace_back(algo, dist, num_keys, nt, sec, sizeof(KeyType),
st.KeyString());
results.back().Print();
}
}
#else
void BenchParallel() {}
#endif
} // namespace
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
@ -234,10 +235,4 @@ HWY_EXPORT_AND_TEST_P(BenchParallel, BenchParallel);
} // namespace
} // namespace hwy
// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
#endif // HWY_ONCE

View file

@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -12,71 +13,87 @@
// See the License for the specific language governing permissions and
// limitations under the License.
// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/bench_sort.cc"
#include "hwy/foreach_target.h"
// After foreach_target
#include "hwy/contrib/sort/algo-inl.h"
#include "hwy/contrib/sort/result-inl.h"
#include "hwy/contrib/sort/vqsort.h"
#include "hwy/contrib/sort/sorting_networks-inl.h" // SharedTraits
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/traits128-inl.h"
#include "hwy/tests/test_util-inl.h"
// clang-format on
#include <stdint.h>
#include <stdio.h>
#include <string.h> // memcpy
#include <vector>
// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/bench_sort.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/algo-inl.h"
#include "hwy/contrib/sort/result-inl.h"
#include "hwy/contrib/sort/sorting_networks-inl.h" // SharedTraits
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/traits128-inl.h"
#include "hwy/tests/test_util-inl.h"
// clang-format on
// Mode for larger sorts because M1 is able to access more than the per-core
// share of L2, so 1M elements might still be in cache.
#define SORT_100M 0
HWY_BEFORE_NAMESPACE();
namespace hwy {
// Defined within HWY_ONCE, used by BenchAllSort.
extern int64_t first_sort_target;
namespace HWY_NAMESPACE {
namespace {
using detail::LaneTraits;
using detail::TraitsLane;
using detail::OrderAscending;
using detail::OrderDescending;
using detail::SharedTraits;
#if HWY_TARGET != HWY_SCALAR
#if VQSORT_ENABLED || HWY_IDE
using detail::OrderAscending128;
using detail::OrderDescending128;
using detail::Traits128;
template <class Traits, typename T>
template <class Traits>
HWY_NOINLINE void BenchPartition() {
const SortTag<T> d;
using LaneType = typename Traits::LaneType;
using KeyType = typename Traits::KeyType;
const SortTag<LaneType> d;
detail::SharedTraits<Traits> st;
const Dist dist = Dist::kUniform8;
double sum = 0.0;
detail::Generator rng(&sum, 123); // for ChoosePivot
const size_t max_log2 = AdjustedLog2Reps(20);
for (size_t log2 = max_log2; log2 < max_log2 + 1; ++log2) {
const size_t num = 1ull << log2;
auto aligned = hwy::AllocateAligned<T>(num);
auto buf =
hwy::AllocateAligned<T>(hwy::SortConstants::PartitionBufNum(Lanes(d)));
const size_t num_lanes = 1ull << log2;
const size_t num_keys = num_lanes / st.LanesPerKey();
auto aligned = hwy::AllocateAligned<LaneType>(num_lanes);
auto buf = hwy::AllocateAligned<LaneType>(
HWY_MAX(hwy::SortConstants::PartitionBufNum(Lanes(d)),
hwy::SortConstants::PivotBufNum(sizeof(LaneType), Lanes(d))));
std::vector<double> seconds;
const size_t num_reps = (1ull << (14 - log2 / 2)) * kReps;
const size_t num_reps = (1ull << (14 - log2 / 2)) * 30;
for (size_t rep = 0; rep < num_reps; ++rep) {
(void)GenerateInput(dist, aligned.get(), num);
(void)GenerateInput(dist, aligned.get(), num_lanes);
// The pivot value can influence performance. Do exactly what vqsort will
// do so that the performance (influenced by prefetching and branch
// prediction) is likely to predict the actual performance inside vqsort.
const auto pivot = detail::ChoosePivot(d, st, aligned.get(), 0, num_lanes,
buf.get(), rng);
const Timestamp t0;
detail::Partition(d, st, aligned.get(), 0, num - 1, Set(d, T(128)),
detail::Partition(d, st, aligned.get(), 0, num_lanes - 1, pivot,
buf.get());
seconds.push_back(SecondsSince(t0));
// 'Use' the result to prevent optimizing out the partition.
sum += static_cast<double>(aligned.get()[num / 2]);
sum += static_cast<double>(aligned.get()[num_lanes / 2]);
}
MakeResult<T>(Algo::kVQSort, dist, st, num, 1,
SummarizeMeasurements(seconds))
Result(Algo::kVQSort, dist, num_keys, 1, SummarizeMeasurements(seconds),
sizeof(KeyType), st.KeyString())
.Print();
}
HWY_ASSERT(sum != 999999); // Prevent optimizing out
@ -84,52 +101,60 @@ HWY_NOINLINE void BenchPartition() {
HWY_NOINLINE void BenchAllPartition() {
// Not interested in benchmark results for these targets
if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4 ||
HWY_TARGET == HWY_AVX2) {
if (HWY_TARGET == HWY_SSSE3) {
return;
}
BenchPartition<LaneTraits<OrderDescending>, float>();
BenchPartition<LaneTraits<OrderAscending>, int64_t>();
BenchPartition<Traits128<OrderDescending128>, uint64_t>();
BenchPartition<TraitsLane<OrderDescending<float>>>();
BenchPartition<TraitsLane<OrderDescending<int32_t>>>();
BenchPartition<TraitsLane<OrderDescending<int64_t>>>();
BenchPartition<Traits128<OrderAscending128>>();
// BenchPartition<Traits128<OrderDescending128>>();
// BenchPartition<Traits128<OrderAscendingKV128>>();
}
template <class Traits, typename T>
template <class Traits>
HWY_NOINLINE void BenchBase(std::vector<Result>& results) {
// Not interested in benchmark results for these targets
if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4) {
return;
}
const SortTag<T> d;
using LaneType = typename Traits::LaneType;
using KeyType = typename Traits::KeyType;
const SortTag<LaneType> d;
detail::SharedTraits<Traits> st;
const Dist dist = Dist::kUniform32;
const size_t N = Lanes(d);
const size_t num = SortConstants::BaseCaseNum(N);
auto keys = hwy::AllocateAligned<T>(num);
auto buf = hwy::AllocateAligned<T>(num + N);
const size_t num_lanes = SortConstants::BaseCaseNum(N);
const size_t num_keys = num_lanes / st.LanesPerKey();
auto keys = hwy::AllocateAligned<LaneType>(num_lanes);
auto buf = hwy::AllocateAligned<LaneType>(num_lanes + N);
std::vector<double> seconds;
double sum = 0; // prevents elision
constexpr size_t kMul = AdjustedReps(600); // ensures long enough to measure
for (size_t rep = 0; rep < kReps; ++rep) {
InputStats<T> input_stats = GenerateInput(dist, keys.get(), num);
for (size_t rep = 0; rep < 30; ++rep) {
InputStats<LaneType> input_stats =
GenerateInput(dist, keys.get(), num_lanes);
const Timestamp t0;
for (size_t i = 0; i < kMul; ++i) {
detail::BaseCase(d, st, keys.get(), num, buf.get());
detail::BaseCase(d, st, keys.get(), keys.get() + num_lanes, num_lanes,
buf.get());
sum += static_cast<double>(keys[0]);
}
seconds.push_back(SecondsSince(t0));
// printf("%f\n", seconds.back());
HWY_ASSERT(VerifySort(st, input_stats, keys.get(), num, "BenchBase"));
HWY_ASSERT(VerifySort(st, input_stats, keys.get(), num_lanes, "BenchBase"));
}
HWY_ASSERT(sum < 1E99);
results.push_back(MakeResult<T>(Algo::kVQSort, dist, st, num * kMul, 1,
SummarizeMeasurements(seconds)));
results.emplace_back(Algo::kVQSort, dist, num_keys * kMul, 1,
SummarizeMeasurements(seconds), sizeof(KeyType),
st.KeyString());
}
HWY_NOINLINE void BenchAllBase() {
@ -139,14 +164,19 @@ HWY_NOINLINE void BenchAllBase() {
}
std::vector<Result> results;
BenchBase<LaneTraits<OrderAscending>, float>(results);
BenchBase<LaneTraits<OrderDescending>, int64_t>(results);
BenchBase<Traits128<OrderAscending128>, uint64_t>(results);
BenchBase<TraitsLane<OrderAscending<float>>>(results);
BenchBase<TraitsLane<OrderDescending<int64_t>>>(results);
BenchBase<Traits128<OrderAscending128>>(results);
for (const Result& r : results) {
r.Print();
}
}
#else
void BenchAllPartition() {}
void BenchAllBase() {}
#endif // VQSORT_ENABLED
std::vector<Algo> AlgoForBench() {
return {
#if HAVE_AVX2SORT
@ -154,8 +184,7 @@ std::vector<Algo> AlgoForBench() {
#endif
#if HAVE_PARALLEL_IPS4O
Algo::kParallelIPS4O,
#endif
#if HAVE_IPS4O
#elif HAVE_IPS4O
Algo::kIPS4O,
#endif
#if HAVE_PDQSORT
@ -164,33 +193,64 @@ std::vector<Algo> AlgoForBench() {
#if HAVE_SORT512
Algo::kSort512,
#endif
// Algo::kStd, // too slow to always benchmark
// Algo::kHeap, // too slow to always benchmark
Algo::kVQSort,
// Only include if we're compiling for the target it supports.
#if HAVE_VXSORT && ((VXSORT_AVX3 && HWY_TARGET == HWY_AVX3) || \
(!VXSORT_AVX3 && HWY_TARGET == HWY_AVX2))
Algo::kVXSort,
#endif
#if !HAVE_PARALLEL_IPS4O
#if !SORT_100M
// These are 10-20x slower, but that's OK for the default size when we
// are not testing the parallel nor 100M modes.
Algo::kStd, Algo::kHeap,
#endif
Algo::kVQSort, // only ~4x slower, but not required for Table 1a
#endif
};
}
template <class Traits, typename T>
HWY_NOINLINE void BenchSort(size_t num) {
template <class Traits>
HWY_NOINLINE void BenchSort(size_t num_keys) {
if (first_sort_target == 0) first_sort_target = HWY_TARGET;
SharedState shared;
detail::SharedTraits<Traits> st;
auto aligned = hwy::AllocateAligned<T>(num);
using Order = typename Traits::Order;
using LaneType = typename Traits::LaneType;
using KeyType = typename Traits::KeyType;
const size_t num_lanes = num_keys * st.LanesPerKey();
auto aligned = hwy::AllocateAligned<LaneType>(num_lanes);
const size_t reps = num_keys > 1000 * 1000 ? 10 : 30;
for (Algo algo : AlgoForBench()) {
// Other algorithms don't depend on the vector instructions, so only run
// them for the first target.
#if !HAVE_VXSORT
if (algo != Algo::kVQSort && HWY_TARGET != first_sort_target) {
continue;
}
#endif
for (Dist dist : AllDist()) {
std::vector<double> seconds;
for (size_t rep = 0; rep < kReps; ++rep) {
InputStats<T> input_stats = GenerateInput(dist, aligned.get(), num);
for (size_t rep = 0; rep < reps; ++rep) {
InputStats<LaneType> input_stats =
GenerateInput(dist, aligned.get(), num_lanes);
const Timestamp t0;
Run<typename Traits::Order>(algo, aligned.get(), num, shared,
/*thread=*/0);
Run<Order>(algo, reinterpret_cast<KeyType*>(aligned.get()), num_keys,
shared, /*thread=*/0);
seconds.push_back(SecondsSince(t0));
// printf("%f\n", seconds.back());
HWY_ASSERT(
VerifySort(st, input_stats, aligned.get(), num, "BenchSort"));
VerifySort(st, input_stats, aligned.get(), num_lanes, "BenchSort"));
}
MakeResult<T>(algo, dist, st, num, 1, SummarizeMeasurements(seconds))
Result(algo, dist, num_keys, 1, SummarizeMeasurements(seconds),
sizeof(KeyType), st.KeyString())
.Print();
} // dist
} // algo
@ -198,41 +258,40 @@ HWY_NOINLINE void BenchSort(size_t num) {
HWY_NOINLINE void BenchAllSort() {
// Not interested in benchmark results for these targets
if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4) {
if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4 ||
HWY_TARGET == HWY_EMU128) {
return;
}
// Only enable EMU128 on x86 - it's slow on emulators.
if (!HWY_ARCH_X86 && (HWY_TARGET == HWY_EMU128)) return;
constexpr size_t K = 1000;
constexpr size_t M = K * K;
(void)K;
(void)M;
for (size_t num : {
#if HAVE_PARALLEL_IPS4O
for (size_t num_keys : {
#if HAVE_PARALLEL_IPS4O || SORT_100M
100 * M,
#else
AdjustedReps(1 * M),
1 * M,
#endif
}) {
// BenchSort<LaneTraits<OrderAscending>, float>(num);
// BenchSort<LaneTraits<OrderDescending>, double>(num);
// BenchSort<LaneTraits<OrderAscending>, int16_t>(num);
BenchSort<LaneTraits<OrderDescending>, int32_t>(num);
BenchSort<LaneTraits<OrderAscending>, int64_t>(num);
// BenchSort<LaneTraits<OrderDescending>, uint16_t>(num);
// BenchSort<LaneTraits<OrderDescending>, uint32_t>(num);
// BenchSort<LaneTraits<OrderAscending>, uint64_t>(num);
BenchSort<TraitsLane<OrderAscending<float>>>(num_keys);
// BenchSort<TraitsLane<OrderDescending<double>>>(num_keys);
// BenchSort<TraitsLane<OrderAscending<int16_t>>>(num_keys);
BenchSort<TraitsLane<OrderDescending<int32_t>>>(num_keys);
BenchSort<TraitsLane<OrderAscending<int64_t>>>(num_keys);
// BenchSort<TraitsLane<OrderDescending<uint16_t>>>(num_keys);
// BenchSort<TraitsLane<OrderDescending<uint32_t>>>(num_keys);
// BenchSort<TraitsLane<OrderAscending<uint64_t>>>(num_keys);
BenchSort<Traits128<OrderAscending128>, uint64_t>(num);
// BenchSort<Traits128<OrderAscending128>, uint64_t>(num);
#if !HAVE_VXSORT && VQSORT_ENABLED
BenchSort<Traits128<OrderAscending128>>(num_keys);
// BenchSort<Traits128<OrderAscendingKV128>>(num_keys);
#endif
}
}
#else
void BenchAllPartition() {}
void BenchAllBase() {}
void BenchAllSort() {}
#endif
} // namespace
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
@ -242,6 +301,7 @@ HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
int64_t first_sort_target = 0; // none run yet
namespace {
HWY_BEFORE_TEST(BenchSort);
HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllPartition);
@ -250,10 +310,4 @@ HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllSort);
} // namespace
} // namespace hwy
// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
#endif // HWY_ONCE

View file

@ -1,30 +0,0 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Speed up MSVC builds by building fewer targets. This header must be included
// from all TUs that contain a HWY_DYNAMIC_DISPATCH to vqsort, i.e. vqsort_*.cc.
// However, users of vqsort.h are unaffected.
#ifndef HIGHWAY_HWY_CONTRIB_SORT_DISABLED_TARGETS_H_
#define HIGHWAY_HWY_CONTRIB_SORT_DISABLED_TARGETS_H_
#include "hwy/base.h"
#if HWY_COMPILER_MSVC
#undef HWY_DISABLED_TARGETS
// HWY_SCALAR remains, so there will still be a valid target to call.
#define HWY_DISABLED_TARGETS (HWY_SSSE3 | HWY_SSE4)
#endif // HWY_COMPILER_MSVC
#endif // HIGHWAY_HWY_CONTRIB_SORT_DISABLED_TARGETS_H_

View file

@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

View file

@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -33,20 +34,19 @@ struct Timestamp {
double t;
};
double SecondsSince(const Timestamp& t0) {
static inline double SecondsSince(const Timestamp& t0) {
const Timestamp t1;
return t1.t - t0.t;
}
constexpr size_t kReps = 30;
// Returns trimmed mean (we don't want to run an out-of-L3-cache sort often
// enough for the mode to be reliable).
double SummarizeMeasurements(std::vector<double>& seconds) {
static inline double SummarizeMeasurements(std::vector<double>& seconds) {
std::sort(seconds.begin(), seconds.end());
double sum = 0;
int count = 0;
for (size_t i = kReps / 4; i < seconds.size() - kReps / 2; ++i) {
const size_t num = seconds.size();
for (size_t i = num / 4; i < num / 2; ++i) {
sum += seconds[i];
count += 1;
}
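// Note: with the default of 30 measurements, this averages the sorted samples
// at indices [30 / 4, 30 / 2) = [7, 15), i.e. the second quartile, dropping
// the fastest quarter and the slowest half.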
@ -71,72 +71,62 @@ namespace HWY_NAMESPACE {
struct Result {
Result() {}
Result(const uint32_t target, const Algo algo, Dist dist, bool is128,
size_t num, size_t num_threads, double sec, size_t sizeof_t,
const char* type_name)
: target(target),
Result(const Algo algo, Dist dist, size_t num_keys, size_t num_threads,
double sec, size_t sizeof_key, const std::string& key_name)
: target(HWY_TARGET),
algo(algo),
dist(dist),
is128(is128),
num(num),
num_keys(num_keys),
num_threads(num_threads),
sec(sec),
sizeof_t(sizeof_t),
type_name(type_name) {}
sizeof_key(sizeof_key),
key_name(key_name) {}
void Print() const {
const double bytes = static_cast<double>(num) *
const double bytes = static_cast<double>(num_keys) *
static_cast<double>(num_threads) *
static_cast<double>(sizeof_t);
static_cast<double>(sizeof_key);
printf("%10s: %12s: %7s: %9s: %.2E %4.0f MB/s (%2zu threads)\n",
hwy::TargetName(target), AlgoName(algo),
is128 ? "u128" : type_name.c_str(), DistName(dist),
static_cast<double>(num), bytes * 1E-6 / sec, num_threads);
hwy::TargetName(target), AlgoName(algo), key_name.c_str(),
DistName(dist), static_cast<double>(num_keys), bytes * 1E-6 / sec,
num_threads);
}
uint32_t target;
int64_t target;
Algo algo;
Dist dist;
bool is128;
size_t num = 0;
size_t num_keys = 0;
size_t num_threads = 0;
double sec = 0.0;
size_t sizeof_t = 0;
std::string type_name;
size_t sizeof_key = 0;
std::string key_name;
};
template <typename T, class Traits>
Result MakeResult(const Algo algo, Dist dist, Traits st, size_t num,
size_t num_threads, double sec) {
char string100[100];
hwy::detail::TypeName(hwy::detail::MakeTypeInfo<T>(), 1, string100);
return Result(HWY_TARGET, algo, dist, st.Is128(), num, num_threads, sec,
sizeof(T), string100);
}
template <class Traits, typename LaneType>
bool VerifySort(Traits st, const InputStats<LaneType>& input_stats,
const LaneType* out, size_t num_lanes, const char* caller) {
constexpr size_t N1 = st.LanesPerKey();
HWY_ASSERT(num_lanes >= N1);
template <class Traits, typename T>
bool VerifySort(Traits st, const InputStats<T>& input_stats, const T* out,
size_t num, const char* caller) {
constexpr size_t N1 = st.Is128() ? 2 : 1;
HWY_ASSERT(num >= N1);
InputStats<T> output_stats;
InputStats<LaneType> output_stats;
// Ensure it matches the sort order
for (size_t i = 0; i < num - N1; i += N1) {
for (size_t i = 0; i < num_lanes - N1; i += N1) {
output_stats.Notify(out[i]);
if (N1 == 2) output_stats.Notify(out[i + 1]);
// Reverse order instead of checking !Compare1 so we accept equal keys.
if (st.Compare1(out + i + N1, out + i)) {
printf("%s: i=%d of %d: N1=%d %5.0f %5.0f vs. %5.0f %5.0f\n\n", caller,
static_cast<int>(i), static_cast<int>(num), static_cast<int>(N1),
double(out[i + 1]), double(out[i + 0]), double(out[i + N1 + 1]),
double(out[i + N1]));
printf("%s: i=%d of %d lanes: N1=%d %5.0f %5.0f vs. %5.0f %5.0f\n\n",
caller, static_cast<int>(i), static_cast<int>(num_lanes),
static_cast<int>(N1), static_cast<double>(out[i + 1]),
static_cast<double>(out[i + 0]),
static_cast<double>(out[i + N1 + 1]),
static_cast<double>(out[i + N1]));
HWY_ABORT("%d-bit sort is incorrect\n",
static_cast<int>(sizeof(T) * 8 * N1));
static_cast<int>(sizeof(LaneType) * 8 * N1));
}
}
output_stats.Notify(out[num - N1]);
if (N1 == 2) output_stats.Notify(out[num - N1 + 1]);
output_stats.Notify(out[num_lanes - N1]);
if (N1 == 2) output_stats.Notify(out[num_lanes - N1 + 1]);
return input_stats == output_stats;
}

View file

@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -27,8 +28,8 @@ namespace hwy {
struct SortConstants {
// SortingNetwork reshapes its input into a matrix. This is the maximum number
// of *keys* per vector.
#if HWY_COMPILER_MSVC
static constexpr size_t kMaxCols = 8; // avoids build timeout
#if HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
static constexpr size_t kMaxCols = 8; // avoid build timeout/stack overflow
#else
static constexpr size_t kMaxCols = 16; // enough for u32 in 512-bit vector
#endif
@ -41,7 +42,7 @@ struct SortConstants {
static constexpr size_t kMaxRowsLog2 = 4;
static constexpr size_t kMaxRows = size_t{1} << kMaxRowsLog2;
static HWY_INLINE size_t BaseCaseNum(size_t N) {
static constexpr HWY_INLINE size_t BaseCaseNum(size_t N) {
return kMaxRows * HWY_MIN(N, kMaxCols);
}
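// Worked example (default kMaxCols == 16 branch): 512-bit vectors of uint32
// have N = 16 lanes, so BaseCaseNum(16) = 16 * HWY_MIN(16, 16) = 256 keys;
// 128-bit vectors (N = 4) give 16 * 4 = 64 keys.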
@ -52,7 +53,7 @@ struct SortConstants {
// To change, must also update left + 3 * N etc. in the loop.
static constexpr size_t kPartitionUnroll = 4;
static HWY_INLINE size_t PartitionBufNum(size_t N) {
static constexpr HWY_INLINE size_t PartitionBufNum(size_t N) {
// The main loop reads kPartitionUnroll vectors, and first loads from
// both left and right beforehand, so it requires min = 2 *
// kPartitionUnroll vectors. To handle smaller amounts (only guaranteed
@ -64,9 +65,26 @@ struct SortConstants {
// Chunk := group of keys loaded for sampling a pivot. Matches the typical
// cache line size of 64 bytes to get maximum benefit per L2 miss. If vectors
// are larger, use entire vectors to ensure we do not overrun the array.
static HWY_INLINE size_t LanesPerChunk(size_t sizeof_t, size_t N) {
static constexpr HWY_INLINE size_t LanesPerChunk(size_t sizeof_t, size_t N) {
return HWY_MAX(64 / sizeof_t, N);
}
static constexpr HWY_INLINE size_t PivotBufNum(size_t sizeof_t, size_t N) {
// 3 chunks of medians, 1 chunk of median medians plus two padding vectors.
return (3 + 1) * LanesPerChunk(sizeof_t, N) + 2 * N;
}
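// Worked example: uint64 keys with 256-bit vectors => sizeof_t = 8, N = 4, so
// LanesPerChunk(8, 4) = HWY_MAX(64 / 8, 4) = 8 and
// PivotBufNum(8, 4) = (3 + 1) * 8 + 2 * 4 = 40 lanes.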
template <typename T>
static constexpr HWY_INLINE size_t BufNum(size_t N) {
// One extra for padding plus another for full-vector loads.
return HWY_MAX(BaseCaseNum(N) + 2 * N,
HWY_MAX(PartitionBufNum(N), PivotBufNum(sizeof(T), N)));
}
template <typename T>
static constexpr HWY_INLINE size_t BufBytes(size_t vector_size) {
return sizeof(T) * BufNum<T>(vector_size / sizeof(T));
}
};
} // namespace hwy
@ -84,12 +102,23 @@ struct SortConstants {
#include "hwy/highway.h"
// vqsort isn't available on HWY_SCALAR, and builds time out on MSVC opt and
// Arm v7 debug.
#undef VQSORT_ENABLED
#if (HWY_TARGET == HWY_SCALAR) || \
(HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD) || \
(HWY_ARCH_ARM_V7 && HWY_IS_DEBUG_BUILD)
#define VQSORT_ENABLED 0
#else
#define VQSORT_ENABLED 1
#endif
namespace hwy {
namespace HWY_NAMESPACE {
// Default tag / vector width selector.
// TODO(janwas): enable once LMUL < 1 is supported.
#if HWY_TARGET == HWY_RVV && 0
#if HWY_TARGET == HWY_RVV
// Use LMUL = 1/2; for SEW=64 this ends up emulated via vsetvl.
template <typename T>
using SortTag = ScalableTag<T, -1>;
#else

View file

@ -1,909 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Per-target include guard
#if defined(HIGHWAY_HWY_CONTRIB_SORT_SORT_INL_H_) == \
defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_SORT_SORT_INL_H_
#undef HIGHWAY_HWY_CONTRIB_SORT_SORT_INL_H_
#else
#define HIGHWAY_HWY_CONTRIB_SORT_SORT_INL_H_
#endif
#include <inttypes.h>
#include "hwy/aligned_allocator.h"
#include "hwy/highway.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
enum class SortOrder { kAscending, kDescending };
#if HWY_TARGET != HWY_SCALAR && HWY_ARCH_X86
#define HWY_SORT_VERIFY 1
constexpr inline SortOrder Reverse(SortOrder order) {
return (order == SortOrder::kAscending) ? SortOrder::kDescending
: SortOrder::kAscending;
}
namespace verify {
template <typename T>
bool Compare(T a, T b, SortOrder kOrder) {
if (kOrder == SortOrder::kAscending) return a <= b;
return a >= b;
}
#if HWY_SORT_VERIFY
template <class D>
class Runs {
using T = TFromD<D>;
public:
Runs(D d, size_t num_regs, size_t run_length = 0, bool alternating = false) {
const size_t N = Lanes(d);
buf_ = AllocateAligned<T>(N);
consecutive_ = AllocateAligned<T>(num_regs * N);
num_regs_ = num_regs;
if (run_length) {
run_length_ = run_length;
num_runs_ = num_regs * N / run_length;
is_vector_ = true;
alternating_ = alternating;
} else {
run_length_ = num_regs * 4;
num_runs_ = N / 4;
is_vector_ = false;
alternating_ = false;
}
}
void ScatterQuartets(D d, const size_t idx_reg, Vec<D> v) {
HWY_ASSERT(idx_reg < num_regs_);
const size_t N = Lanes(d);
for (size_t i = 0; i < N; i += 4) {
Store(v, d, buf_.get());
const size_t idx_q = (i / 4) * num_regs_ + idx_reg;
CopyBytes<16>(buf_.get() + i, consecutive_.get() + idx_q * 4);
}
}
void StoreVector(D d, const size_t idx_reg, Vec<D> v) {
HWY_ASSERT(idx_reg < num_regs_);
Store(v, d, &consecutive_[idx_reg * Lanes(d)]);
}
bool IsBitonic() const {
HWY_ASSERT(!alternating_);
for (size_t ir = 0; ir < num_runs_; ++ir) {
const T* p = &consecutive_[ir * run_length_];
bool is_asc = true;
bool is_desc = true;
bool is_zero = true;
for (size_t i = 0; i < run_length_ / 2 - 1; ++i) {
is_asc &= (p[i] <= p[i + 1]);
is_desc &= (p[i] >= p[i + 1]);
}
for (size_t i = 0; i < run_length_; ++i) {
is_zero &= (p[i] == 0);
}
bool is_asc2 = true;
bool is_desc2 = true;
for (size_t i = run_length_ / 2; i < run_length_ - 1; ++i) {
is_asc2 &= (p[i] <= p[i + 1]);
is_desc2 &= (p[i] >= p[i + 1]);
}
if (is_zero) continue;
if (is_asc && is_desc2) continue;
if (is_desc && is_asc2) continue;
return false;
}
return true;
}
void CheckBitonic(int line, int caller) const {
if (IsBitonic()) return;
for (size_t ir = 0; ir < num_runs_; ++ir) {
const T* p = &consecutive_[ir * run_length_];
printf("run %" PRIu64 " (len %" PRIu64 ")\n", static_cast<uint64_t>(ir),
static_cast<uint64_t>(run_length_));
for (size_t i = 0; i < run_length_; ++i) {
printf("%.0f\n", static_cast<float>(p[i]));
}
}
printf("caller %d\n", caller);
hwy::Abort("", line, "not bitonic");
}
void CheckSorted(SortOrder kOrder, int line, int caller) const {
for (size_t ir = 0; ir < num_runs_; ++ir) {
const SortOrder order =
(alternating_ && (ir & 1)) ? Reverse(kOrder) : kOrder;
const T* p = &consecutive_[ir * run_length_];
for (size_t i = 0; i < run_length_ - 1; ++i) {
if (!Compare(p[i], p[i + 1], order)) {
printf("ir%" PRIu64 " run_length=%" PRIu64
" alt=%d original order=%d this order=%d\n",
static_cast<uint64_t>(ir), static_cast<uint64_t>(run_length_),
alternating_, static_cast<int>(kOrder),
static_cast<int>(order));
for (size_t i = 0; i < run_length_; ++i) {
printf(" %.0f\n", static_cast<float>(p[i]));
}
printf("caller %d\n", caller);
hwy::Abort("", line, "not sorted");
}
}
}
}
private:
AlignedFreeUniquePtr<T[]> buf_;
AlignedFreeUniquePtr<T[]> consecutive_;
size_t num_regs_;
size_t run_length_;
size_t num_runs_;
bool is_vector_;
bool alternating_;
};
template <class D>
Runs<D> StoreDeinterleavedQuartets(D d, Vec<D> v0) {
Runs<D> runs(d, 1);
runs.ScatterQuartets(d, 0, v0);
return runs;
}
template <class D>
Runs<D> StoreDeinterleavedQuartets(D d, Vec<D> v0, Vec<D> v1) {
Runs<D> runs(d, 2);
runs.ScatterQuartets(d, 0, v0);
runs.ScatterQuartets(d, 1, v1);
return runs;
}
template <class D>
Runs<D> StoreDeinterleavedQuartets(D d, Vec<D> v0, Vec<D> v1, Vec<D> v2,
Vec<D> v3) {
Runs<D> runs(d, 4);
runs.ScatterQuartets(d, 0, v0);
runs.ScatterQuartets(d, 1, v1);
runs.ScatterQuartets(d, 2, v2);
runs.ScatterQuartets(d, 3, v3);
return runs;
}
template <class D>
Runs<D> StoreDeinterleavedQuartets(D d, Vec<D> v0, Vec<D> v1, Vec<D> v2,
Vec<D> v3, Vec<D> v4, Vec<D> v5, Vec<D> v6,
Vec<D> v7) {
Runs<D> runs(d, 8);
runs.ScatterQuartets(d, 0, v0);
runs.ScatterQuartets(d, 1, v1);
runs.ScatterQuartets(d, 2, v2);
runs.ScatterQuartets(d, 3, v3);
runs.ScatterQuartets(d, 4, v4);
runs.ScatterQuartets(d, 5, v5);
runs.ScatterQuartets(d, 6, v6);
runs.ScatterQuartets(d, 7, v7);
return runs;
}
template <class D>
Runs<D> StoreDeinterleavedQuartets(D d, Vec<D> v0, Vec<D> v1, Vec<D> v2,
Vec<D> v3, Vec<D> v4, Vec<D> v5, Vec<D> v6,
Vec<D> v7, Vec<D> v8, Vec<D> v9, Vec<D> vA,
Vec<D> vB, Vec<D> vC, Vec<D> vD, Vec<D> vE,
Vec<D> vF) {
Runs<D> runs(d, 16);
runs.ScatterQuartets(d, 0x0, v0);
runs.ScatterQuartets(d, 0x1, v1);
runs.ScatterQuartets(d, 0x2, v2);
runs.ScatterQuartets(d, 0x3, v3);
runs.ScatterQuartets(d, 0x4, v4);
runs.ScatterQuartets(d, 0x5, v5);
runs.ScatterQuartets(d, 0x6, v6);
runs.ScatterQuartets(d, 0x7, v7);
runs.ScatterQuartets(d, 0x8, v8);
runs.ScatterQuartets(d, 0x9, v9);
runs.ScatterQuartets(d, 0xA, vA);
runs.ScatterQuartets(d, 0xB, vB);
runs.ScatterQuartets(d, 0xC, vC);
runs.ScatterQuartets(d, 0xD, vD);
runs.ScatterQuartets(d, 0xE, vE);
runs.ScatterQuartets(d, 0xF, vF);
return runs;
}
template <class D>
Runs<D> StoreDeinterleavedQuartets(
D d, const Vec<D>& v00, const Vec<D>& v01, const Vec<D>& v02,
const Vec<D>& v03, const Vec<D>& v04, const Vec<D>& v05, const Vec<D>& v06,
const Vec<D>& v07, const Vec<D>& v08, const Vec<D>& v09, const Vec<D>& v0A,
const Vec<D>& v0B, const Vec<D>& v0C, const Vec<D>& v0D, const Vec<D>& v0E,
const Vec<D>& v0F, const Vec<D>& v10, const Vec<D>& v11, const Vec<D>& v12,
const Vec<D>& v13, const Vec<D>& v14, const Vec<D>& v15, const Vec<D>& v16,
const Vec<D>& v17, const Vec<D>& v18, const Vec<D>& v19, const Vec<D>& v1A,
const Vec<D>& v1B, const Vec<D>& v1C, const Vec<D>& v1D, const Vec<D>& v1E,
const Vec<D>& v1F) {
Runs<D> runs(d, 32);
runs.ScatterQuartets(d, 0x00, v00);
runs.ScatterQuartets(d, 0x01, v01);
runs.ScatterQuartets(d, 0x02, v02);
runs.ScatterQuartets(d, 0x03, v03);
runs.ScatterQuartets(d, 0x04, v04);
runs.ScatterQuartets(d, 0x05, v05);
runs.ScatterQuartets(d, 0x06, v06);
runs.ScatterQuartets(d, 0x07, v07);
runs.ScatterQuartets(d, 0x08, v08);
runs.ScatterQuartets(d, 0x09, v09);
runs.ScatterQuartets(d, 0x0A, v0A);
runs.ScatterQuartets(d, 0x0B, v0B);
runs.ScatterQuartets(d, 0x0C, v0C);
runs.ScatterQuartets(d, 0x0D, v0D);
runs.ScatterQuartets(d, 0x0E, v0E);
runs.ScatterQuartets(d, 0x0F, v0F);
runs.ScatterQuartets(d, 0x10, v10);
runs.ScatterQuartets(d, 0x11, v11);
runs.ScatterQuartets(d, 0x12, v12);
runs.ScatterQuartets(d, 0x13, v13);
runs.ScatterQuartets(d, 0x14, v14);
runs.ScatterQuartets(d, 0x15, v15);
runs.ScatterQuartets(d, 0x16, v16);
runs.ScatterQuartets(d, 0x17, v17);
runs.ScatterQuartets(d, 0x18, v18);
runs.ScatterQuartets(d, 0x19, v19);
runs.ScatterQuartets(d, 0x1A, v1A);
runs.ScatterQuartets(d, 0x1B, v1B);
runs.ScatterQuartets(d, 0x1C, v1C);
runs.ScatterQuartets(d, 0x1D, v1D);
runs.ScatterQuartets(d, 0x1E, v1E);
runs.ScatterQuartets(d, 0x1F, v1F);
return runs;
}
template <class D>
Runs<D> StoreVectors(D d, Vec<D> v0, size_t run_length, bool alternating) {
Runs<D> runs(d, 1, run_length, alternating);
runs.StoreVector(d, 0, v0);
return runs;
}
template <class D>
Runs<D> StoreVectors(D d, Vec<D> v0, Vec<D> v1) {
constexpr size_t kRegs = 2;
Runs<D> runs(d, kRegs, /*run_length=*/kRegs * Lanes(d), /*alternating=*/false);
runs.StoreVector(d, 0, v0);
runs.StoreVector(d, 1, v1);
return runs;
}
template <class D>
Runs<D> StoreVectors(D d, Vec<D> v0, Vec<D> v1, Vec<D> v2, Vec<D> v3) {
constexpr size_t kRegs = 4;
Runs<D> runs(d, kRegs, /*run_length=*/kRegs * Lanes(d), /*alternating=*/false);
runs.StoreVector(d, 0, v0);
runs.StoreVector(d, 1, v1);
runs.StoreVector(d, 2, v2);
runs.StoreVector(d, 3, v3);
return runs;
}
template <class D>
Runs<D> StoreVectors(D d, Vec<D> v0, Vec<D> v1, Vec<D> v2, Vec<D> v3, Vec<D> v4,
Vec<D> v5, Vec<D> v6, Vec<D> v7) {
constexpr size_t kRegs = 8;
Runs<D> runs(d, kRegs, /*run_length=*/kRegs * Lanes(d), /*alternating=*/false);
runs.StoreVector(d, 0, v0);
runs.StoreVector(d, 1, v1);
runs.StoreVector(d, 2, v2);
runs.StoreVector(d, 3, v3);
runs.StoreVector(d, 4, v4);
runs.StoreVector(d, 5, v5);
runs.StoreVector(d, 6, v6);
runs.StoreVector(d, 7, v7);
return runs;
}
#endif // HWY_SORT_VERIFY
} // namespace verify
namespace detail {
// ------------------------------ Vector-length agnostic (quartets)
// For each lane i: replaces a[i] with the first and b[i] with the second
// according to kOrder.
// Corresponds to a conditional swap, which is one "node" of a sorting network.
// Min/Max are cheaper than compare + blend at least for integers.
template <SortOrder kOrder, class V>
HWY_INLINE void SortLanesIn2Vectors(V& a, V& b) {
V temp = a;
a = (kOrder == SortOrder::kAscending) ? Min(a, b) : Max(a, b);
b = (kOrder == SortOrder::kAscending) ? Max(temp, b) : Min(temp, b);
}
// For each lane: sorts the four values in that lane of the four vectors.
template <SortOrder kOrder, class D, class V = Vec<D>>
HWY_INLINE void SortLanesIn4Vectors(D d, const TFromD<D>* in, V& v0, V& v1,
V& v2, V& v3) {
const size_t N = Lanes(d);
// Bitonic and odd-even sorters both have 5 nodes. This one is from
// http://users.telenet.be/bertdobbelaere/SorterHunter/sorting_networks.html
// layer 1
v0 = Load(d, in + 0 * N);
v2 = Load(d, in + 2 * N);
SortLanesIn2Vectors<kOrder>(v0, v2);
v1 = Load(d, in + 1 * N);
v3 = Load(d, in + 3 * N);
SortLanesIn2Vectors<kOrder>(v1, v3);
// layer 2
SortLanesIn2Vectors<kOrder>(v0, v1);
SortLanesIn2Vectors<kOrder>(v2, v3);
// layer 3
SortLanesIn2Vectors<kOrder>(v1, v2);
}
// Inputs are vectors with columns in sorted order (from SortLanesIn4Vectors).
// Transposes so that output vectors are sorted quartets (128-bit blocks),
// and a quartet in v0 comes before its counterpart in v1, etc.
template <class D, class V = Vec<D>>
HWY_INLINE void Transpose4x4(D d, V& v0, V& v1, V& v2, V& v3) {
const RepartitionToWide<decltype(d)> dw;
// Input: first number is reg, second is lane (0 is lowest)
// 03 02 01 00 |
// 13 12 11 10 | columns are sorted
// 23 22 21 20 | (in this order)
// 33 32 31 30 V
const V t0 = InterleaveLower(d, v0, v1); // 11 01 10 00
const V t1 = InterleaveLower(d, v2, v3); // 31 21 30 20
const V t2 = InterleaveUpper(d, v0, v1); // 13 03 12 02
const V t3 = InterleaveUpper(d, v2, v3); // 33 23 32 22
// 30 20 10 00
v0 = BitCast(d, InterleaveLower(BitCast(dw, t0), BitCast(dw, t1)));
// 31 21 11 01
v1 = BitCast(d, InterleaveUpper(BitCast(dw, t0), BitCast(dw, t1)));
// 32 22 12 02
v2 = BitCast(d, InterleaveLower(BitCast(dw, t2), BitCast(dw, t3)));
// 33 23 13 03 --> sorted in descending order (03=smallest in lane 0).
v3 = BitCast(d, InterleaveUpper(BitCast(dw, t2), BitCast(dw, t3)));
}
// 12 ops (including 4 swizzle)
// Precondition: v0 and v1 are already sorted according to kOrder.
// Postcondition: concatenate(v0, v1) is sorted and v0 is the lower half.
template <SortOrder kOrder, class D, class V = Vec<D>>
HWY_INLINE void Merge2SortedQuartets(D d, V& v0, V& v1, int caller) {
#if HWY_SORT_VERIFY
const verify::Runs<D> input0 = verify::StoreDeinterleavedQuartets(d, v0);
const verify::Runs<D> input1 = verify::StoreDeinterleavedQuartets(d, v1);
input0.CheckSorted(kOrder, __LINE__, caller);
input1.CheckSorted(kOrder, __LINE__, caller);
#endif
// See figure 5 from https://www.vldb.org/pvldb/vol8/p1274-inoue.pdf.
// This requires 8 min/max vs 6 for bitonic merge (see Figure 2 in
// http://www.vldb.org/pvldb/vol1/1454171.pdf), but is faster overall because
// it needs less shuffling, and does not need a bitonic input.
SortLanesIn2Vectors<kOrder>(v0, v1);
v0 = Shuffle0321(v0);
SortLanesIn2Vectors<kOrder>(v0, v1);
v0 = Shuffle0321(v0);
SortLanesIn2Vectors<kOrder>(v0, v1);
v0 = Shuffle0321(v0);
SortLanesIn2Vectors<kOrder>(v0, v1);
v0 = Shuffle0321(v0);
#if HWY_SORT_VERIFY
auto output = verify::StoreDeinterleavedQuartets(d, v0, v1);
output.CheckSorted(kOrder, __LINE__, caller);
#endif
}
// ------------------------------ Bitonic merge (quartets)
// For the last layer of bitonic merge. Conditionally swaps even-numbered lanes
// with their odd-numbered neighbor. Works for both quartets and vectors.
template <SortOrder kOrder, class D>
HWY_INLINE void SortAdjacentLanesQV(D d, Vec<D>& q_or_v) {
(void)d;
// Optimization for 32-bit integers: swap via Shuffle and 64-bit Min/Max.
// (not worthwhile on SSE4/AVX2 because they lack 64-bit Min/Max)
#if !HWY_ARCH_X86 || HWY_TARGET <= HWY_AVX3
if (sizeof(TFromD<D>) == 4 && !IsFloat<TFromD<D>>()) {
const RepartitionToWide<decltype(d)> dw;
const auto wide = BitCast(dw, q_or_v);
const auto swap = BitCast(dw, Shuffle2301(q_or_v));
if (kOrder == SortOrder::kAscending) {
q_or_v = BitCast(d, Max(wide, swap));
} else {
q_or_v = BitCast(d, Min(wide, swap));
}
} else
#endif
{
Vec<D> swapped = Shuffle2301(q_or_v);
SortLanesIn2Vectors<kOrder>(q_or_v, swapped);
q_or_v = OddEven(swapped, q_or_v);
}
}
// Lane 0 with 2, 1 with 3 etc. Works for both quartets and vectors.
template <SortOrder kOrder, class D>
HWY_INLINE void SortDistance2LanesQV(D d, Vec<D>& q_or_v) {
const RepartitionToWide<decltype(d)> dw;
Vec<D> swapped = Shuffle1032(q_or_v);
SortLanesIn2Vectors<kOrder>(q_or_v, swapped);
q_or_v = BitCast(d, OddEven(BitCast(dw, swapped), BitCast(dw, q_or_v)));
}
// For all BitonicMerge*, and each block, the concatenation of those blocks from
// the first half and second half of the input vectors must be sorted in
// opposite orders.
// 14 ops (including 4 swizzle)
template <SortOrder kOrder, class D, class V = Vec<D>>
HWY_INLINE void BitonicMerge2Quartets(D d, V& q0, V& q1, int caller) {
#if HWY_SORT_VERIFY
const verify::Runs<D> input = verify::StoreDeinterleavedQuartets(d, q0, q1);
if (caller == -1) input.CheckBitonic(__LINE__, __LINE__);
#endif
// Layer 1: lane stride 4 (2 ops)
SortLanesIn2Vectors<kOrder>(q0, q1);
// Layer 2: lane stride 2 (6 ops)
SortDistance2LanesQV<kOrder>(d, q0);
SortDistance2LanesQV<kOrder>(d, q1);
// Layer 3: lane stride 1 (4 ops)
SortAdjacentLanesQV<kOrder>(d, q0);
SortAdjacentLanesQV<kOrder>(d, q1);
#if HWY_SORT_VERIFY
const verify::Runs<D> output = verify::StoreDeinterleavedQuartets(d, q0, q1);
output.CheckSorted(kOrder, __LINE__, caller);
#endif
}
// 32 ops, more efficient than three 4+4 merges (36 ops).
template <SortOrder kOrder, class D, class V = Vec<D>>
HWY_INLINE void BitonicMerge4Quartets(D d, V& q0, V& q1, V& q2, V& q3,
int caller) {
#if HWY_SORT_VERIFY
const verify::Runs<D> input =
verify::StoreDeinterleavedQuartets(d, q0, q1, q2, q3);
if (caller == -1) input.CheckBitonic(__LINE__, __LINE__);
#endif
// Layer 1: lane stride 8
SortLanesIn2Vectors<kOrder>(q0, q2);
SortLanesIn2Vectors<kOrder>(q1, q3);
// Layers 2 to 4
// Inputs are not fully sorted, so cannot use Merge2SortedQuartets.
BitonicMerge2Quartets<kOrder>(d, q0, q1, __LINE__);
BitonicMerge2Quartets<kOrder>(d, q2, q3, __LINE__);
#if HWY_SORT_VERIFY
const verify::Runs<D> output =
verify::StoreDeinterleavedQuartets(d, q0, q1, q2, q3);
output.CheckSorted(kOrder, __LINE__, caller);
#endif
}
// 72 ops.
template <SortOrder kOrder, class D, class V = Vec<D>>
HWY_INLINE void BitonicMerge8Quartets(D d, V& q0, V& q1, V& q2, V& q3, V& q4,
V& q5, V& q6, V& q7, int caller) {
#if HWY_SORT_VERIFY
const verify::Runs<D> input =
verify::StoreDeinterleavedQuartets(d, q0, q1, q2, q3, q4, q5, q6, q7);
if (caller == -1) input.CheckBitonic(__LINE__, __LINE__);
#endif
// Layer 1: lane stride 16
SortLanesIn2Vectors<kOrder>(q0, q4);
SortLanesIn2Vectors<kOrder>(q1, q5);
SortLanesIn2Vectors<kOrder>(q2, q6);
SortLanesIn2Vectors<kOrder>(q3, q7);
// Layers 2 to 5
BitonicMerge4Quartets<kOrder>(d, q0, q1, q2, q3, __LINE__);
BitonicMerge4Quartets<kOrder>(d, q4, q5, q6, q7, __LINE__);
#if HWY_SORT_VERIFY
const verify::Runs<D> output =
verify::StoreDeinterleavedQuartets(d, q0, q1, q2, q3, q4, q5, q6, q7);
output.CheckSorted(kOrder, __LINE__, caller);
#endif
}
// ------------------------------ Bitonic merge (vectors)
// Lane 0 with 4, 1 with 5 etc. Only used for vectors with at least 8 lanes.
#if HWY_TARGET <= HWY_AVX3
// TODO(janwas): move to op
template <typename T>
Vec512<T> Shuffle128_2020(Vec512<T> a, Vec512<T> b) {
return Vec512<T>{_mm512_shuffle_i32x4(a.raw, b.raw, _MM_SHUFFLE(2, 0, 2, 0))};
}
template <typename T>
Vec512<T> Shuffle128_3131(Vec512<T> a, Vec512<T> b) {
return Vec512<T>{_mm512_shuffle_i32x4(a.raw, b.raw, _MM_SHUFFLE(3, 1, 3, 1))};
}
template <typename T>
Vec512<T> Shuffle128_2301(Vec512<T> a, Vec512<T> b) {
return Vec512<T>{_mm512_shuffle_i32x4(a.raw, b.raw, _MM_SHUFFLE(2, 3, 0, 1))};
}
template <typename T>
Vec512<T> OddEven128(Vec512<T> odd, Vec512<T> even) {
return Vec512<T>{_mm512_mask_blend_epi64(__mmask8{0x33u}, odd.raw, even.raw)};
}
template <SortOrder kOrder, class T>
HWY_INLINE void SortDistance4LanesV(Simd<T, 16> d, Vec<decltype(d)>& v) {
// In: FEDCBA98 76543210
// Swap 128-bit halves of each 256 bits => BA98FEDC 32107654
Vec512<T> swapped = Shuffle128_2301(v, v);
SortLanesIn2Vectors<kOrder>(v, swapped);
v = OddEven128(swapped, v);
}
#endif
template <SortOrder kOrder, typename T>
HWY_INLINE void SortDistance4LanesV(Simd<T, 8> d, Vec<decltype(d)>& v) {
Vec<decltype(d)> swapped = ConcatLowerUpper(d, v, v);
SortLanesIn2Vectors<kOrder>(v, swapped);
v = ConcatUpperLower(swapped, v);
}
template <SortOrder kOrder, typename T>
HWY_INLINE void SortDistance4LanesV(Simd<T, 4> /* tag */, ...) {}
// Only used for vectors with at least 16 lanes.
template <SortOrder kOrder, class D>
HWY_INLINE void SortDistance8LanesV(D d, Vec<D>& v) {
Vec<D> swapped = ConcatLowerUpper(d, v, v);
SortLanesIn2Vectors<kOrder>(v, swapped);
v = ConcatUpperLower(swapped, v);
}
// 120 ops. Only used if vectors are at least 8 lanes.
template <SortOrder kOrder, class D, class V = Vec<D>>
HWY_INLINE void BitonicMergeTo64(D d, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
V& v6, V& v7, int caller) {
#if HWY_SORT_VERIFY
const verify::Runs<D> input =
verify::StoreVectors(d, v0, v1, v2, v3, v4, v5, v6, v7);
if (caller == -1) input.CheckBitonic(__LINE__, __LINE__);
#endif
// Layer 1: lane stride 32
SortLanesIn2Vectors<kOrder>(v0, v4);
SortLanesIn2Vectors<kOrder>(v1, v5);
SortLanesIn2Vectors<kOrder>(v2, v6);
SortLanesIn2Vectors<kOrder>(v3, v7);
// Layer 2: lane stride 16
SortLanesIn2Vectors<kOrder>(v0, v2);
SortLanesIn2Vectors<kOrder>(v1, v3);
SortLanesIn2Vectors<kOrder>(v4, v6);
SortLanesIn2Vectors<kOrder>(v5, v7);
// Layer 3: lane stride 8
SortLanesIn2Vectors<kOrder>(v0, v1);
SortLanesIn2Vectors<kOrder>(v2, v3);
SortLanesIn2Vectors<kOrder>(v4, v5);
SortLanesIn2Vectors<kOrder>(v6, v7);
// Layer 4: lane stride 4
SortDistance4LanesV<kOrder>(d, v0);
SortDistance4LanesV<kOrder>(d, v1);
SortDistance4LanesV<kOrder>(d, v2);
SortDistance4LanesV<kOrder>(d, v3);
SortDistance4LanesV<kOrder>(d, v4);
SortDistance4LanesV<kOrder>(d, v5);
SortDistance4LanesV<kOrder>(d, v6);
SortDistance4LanesV<kOrder>(d, v7);
// Layer 5: lane stride 2
SortDistance2LanesQV<kOrder>(d, v0);
SortDistance2LanesQV<kOrder>(d, v1);
SortDistance2LanesQV<kOrder>(d, v2);
SortDistance2LanesQV<kOrder>(d, v3);
SortDistance2LanesQV<kOrder>(d, v4);
SortDistance2LanesQV<kOrder>(d, v5);
SortDistance2LanesQV<kOrder>(d, v6);
SortDistance2LanesQV<kOrder>(d, v7);
// Layer 6: lane stride 1
SortAdjacentLanesQV<kOrder>(d, v0);
SortAdjacentLanesQV<kOrder>(d, v1);
SortAdjacentLanesQV<kOrder>(d, v2);
SortAdjacentLanesQV<kOrder>(d, v3);
SortAdjacentLanesQV<kOrder>(d, v4);
SortAdjacentLanesQV<kOrder>(d, v5);
SortAdjacentLanesQV<kOrder>(d, v6);
SortAdjacentLanesQV<kOrder>(d, v7);
#if HWY_SORT_VERIFY
const verify::Runs<D> output =
verify::StoreVectors(d, v0, v1, v2, v3, v4, v5, v6, v7);
output.CheckSorted(kOrder, __LINE__, caller);
#endif
}
// 60 ops. Only used if vectors are at least 16 lanes.
template <SortOrder kOrder, class D, class V = Vec<D>>
HWY_INLINE void BitonicMergeTo64(D d, V& v0, V& v1, V& v2, V& v3, int caller) {
#if HWY_SORT_VERIFY
const verify::Runs<D> input = verify::StoreVectors(d, v0, v1, v2, v3);
if (caller == -1) input.CheckBitonic(__LINE__, __LINE__);
#endif
// Layer 1: lane stride 32
SortLanesIn2Vectors<kOrder>(v0, v2);
SortLanesIn2Vectors<kOrder>(v1, v3);
// Layer 2: lane stride 16
SortLanesIn2Vectors<kOrder>(v0, v1);
SortLanesIn2Vectors<kOrder>(v2, v3);
// Layer 3: lane stride 8
SortDistance8LanesV<kOrder>(d, v0);
SortDistance8LanesV<kOrder>(d, v1);
SortDistance8LanesV<kOrder>(d, v2);
SortDistance8LanesV<kOrder>(d, v3);
// Layer 4: lane stride 4
SortDistance4LanesV<kOrder>(d, v0);
SortDistance4LanesV<kOrder>(d, v1);
SortDistance4LanesV<kOrder>(d, v2);
SortDistance4LanesV<kOrder>(d, v3);
// Layer 5: lane stride 2
SortDistance2LanesQV<kOrder>(d, v0);
SortDistance2LanesQV<kOrder>(d, v1);
SortDistance2LanesQV<kOrder>(d, v2);
SortDistance2LanesQV<kOrder>(d, v3);
// Layer 6: lane stride 1
SortAdjacentLanesQV<kOrder>(d, v0);
SortAdjacentLanesQV<kOrder>(d, v1);
SortAdjacentLanesQV<kOrder>(d, v2);
SortAdjacentLanesQV<kOrder>(d, v3);
#if HWY_SORT_VERIFY
const verify::Runs<D> output = verify::StoreVectors(d, v0, v1, v2, v3);
output.CheckSorted(kOrder, __LINE__, caller);
#endif
}
// 128 ops. Only used if vectors are at least 16 lanes.
template <SortOrder kOrder, class D, class V = Vec<D>>
HWY_INLINE void BitonicMergeTo128(D d, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
V& v6, V& v7, int caller) {
#if HWY_SORT_VERIFY
const verify::Runs<D> input =
verify::StoreVectors(d, v0, v1, v2, v3, v4, v5, v6, v7);
if (caller == -1) input.CheckBitonic(__LINE__, __LINE__);
#endif
// Layer 1: lane stride 64
SortLanesIn2Vectors<kOrder>(v0, v4);
SortLanesIn2Vectors<kOrder>(v1, v5);
SortLanesIn2Vectors<kOrder>(v2, v6);
SortLanesIn2Vectors<kOrder>(v3, v7);
BitonicMergeTo64<kOrder>(d, v0, v1, v2, v3, __LINE__);
BitonicMergeTo64<kOrder>(d, v4, v5, v6, v7, __LINE__);
#if HWY_SORT_VERIFY
const verify::Runs<D> output =
verify::StoreVectors(d, v0, v1, v2, v3, v4, v5, v6, v7);
output.CheckSorted(kOrder, __LINE__, caller);
#endif
}
// ------------------------------ Vector-length dependent
// Only called when N=4 (single block, so quartets can just be stored).
template <SortOrder kOrder, class D, class V>
HWY_API size_t SingleQuartetPerVector(D d, V& q0, V& q1, V& q2, V& q3, V& q4,
V& q5, V& q6, V& q7, TFromD<D>* inout) {
Store(q0, d, inout + 0 * 4);
Store(q1, d, inout + 1 * 4);
Store(q2, d, inout + 2 * 4);
Store(q3, d, inout + 3 * 4);
Store(q4, d, inout + 4 * 4);
Store(q5, d, inout + 5 * 4);
Store(q6, d, inout + 6 * 4);
Store(q7, d, inout + 7 * 4);
return 8 * 4;
}
// Only called when N=8.
template <SortOrder kOrder, class D, class V>
HWY_API size_t TwoQuartetsPerVector(D d, V& q0, V& q1, V& q2, V& q3, V& q4,
V& q5, V& q6, V& q7, TFromD<D>* inout) {
V v0 = ConcatLowerLower(d, q1, q0);
V v1 = ConcatLowerLower(d, q3, q2);
V v2 = ConcatLowerLower(d, q5, q4);
V v3 = ConcatLowerLower(d, q7, q6);
// TODO(janwas): merge into single table
V v4 = Reverse(d, ConcatUpperUpper(d, q7, q6));
V v5 = Reverse(d, ConcatUpperUpper(d, q5, q4));
V v6 = Reverse(d, ConcatUpperUpper(d, q3, q2));
V v7 = Reverse(d, ConcatUpperUpper(d, q1, q0));
detail::BitonicMergeTo64<kOrder>(d, v0, v1, v2, v3, v4, v5, v6, v7, -1);
Store(v0, d, inout + 0 * 8);
Store(v1, d, inout + 1 * 8);
Store(v2, d, inout + 2 * 8);
Store(v3, d, inout + 3 * 8);
Store(v4, d, inout + 4 * 8);
Store(v5, d, inout + 5 * 8);
Store(v6, d, inout + 6 * 8);
Store(v7, d, inout + 7 * 8);
return 8 * 8;
}
// Only called when N=16.
template <SortOrder kOrder, typename T, class V>
HWY_API size_t FourQuartetsPerVector(Simd<T, 16> d, V& q0, V& q1, V& q2, V& q3,
V& q4, V& q5, V& q6, V& q7, T* inout) {
const V q11_01_10_00 = Shuffle128_2020(q0, q1);
const V q13_03_12_02 = Shuffle128_2020(q2, q3);
V v0 = Shuffle128_2020(q11_01_10_00, q13_03_12_02); // 3..0
const V q15_05_14_04 = Shuffle128_2020(q4, q5);
const V q17_07_16_06 = Shuffle128_2020(q6, q7);
V v1 = Shuffle128_2020(q15_05_14_04, q17_07_16_06); // 7..4
const V q19_09_18_08 = Shuffle128_3131(q0, q1);
const V q1b_0b_1a_0a = Shuffle128_3131(q2, q3);
V v3 = Reverse(d, Shuffle128_2020(q19_09_18_08, q1b_0b_1a_0a)); // b..8
const V q1d_0d_1c_0c = Shuffle128_3131(q4, q5);
const V q1f_0f_1e_0e = Shuffle128_3131(q6, q7);
V v2 = Reverse(d, Shuffle128_2020(q1d_0d_1c_0c, q1f_0f_1e_0e)); // f..c
detail::BitonicMergeTo64<kOrder>(d, v0, v1, v2, v3, -1);
// TODO(janwas): merge into single table
V v4 = Shuffle128_3131(q11_01_10_00, q13_03_12_02); // 13..10
V v5 = Shuffle128_3131(q15_05_14_04, q17_07_16_06); // 17..14
V v7 = Reverse(d, Shuffle128_3131(q19_09_18_08, q1b_0b_1a_0a)); // 1b..18
V v6 = Reverse(d, Shuffle128_3131(q1d_0d_1c_0c, q1f_0f_1e_0e)); // 1f..1c
detail::BitonicMergeTo64<Reverse(kOrder)>(d, v4, v5, v6, v7, -1);
detail::BitonicMergeTo128<kOrder>(d, v0, v1, v2, v3, v4, v5, v6, v7, -1);
Store(v0, d, inout + 0 * 16);
Store(v1, d, inout + 1 * 16);
Store(v2, d, inout + 2 * 16);
Store(v3, d, inout + 3 * 16);
Store(v4, d, inout + 4 * 16);
Store(v5, d, inout + 5 * 16);
Store(v6, d, inout + 6 * 16);
Store(v7, d, inout + 7 * 16);
return 8 * 16;
}
// Avoid needing #if at the call sites.
template <SortOrder kOrder, typename T>
HWY_API size_t TwoQuartetsPerVector(Simd<T, 4> /* tag */, ...) {
return 0;
}
template <SortOrder kOrder, typename T>
HWY_API size_t FourQuartetsPerVector(Simd<T, 4> /* tag */, ...) {
return 0;
}
template <SortOrder kOrder, typename T>
HWY_API size_t FourQuartetsPerVector(Simd<T, 8> /* tag */, ...) {
return 0;
}
} // namespace detail
template <class D>
HWY_API size_t SortBatchSize(D d) {
const size_t N = Lanes(d);
if (N == 4) return 32;
if (N == 8) return 64;
if (N == 16) return 128;
return 0;
}
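// Usage sketch for SortBatchSize/SortBatch (illustrative only; `keys` is a
// placeholder for a sufficiently large, aligned array):
//   const HWY_FULL(int32_t) d;
//   const size_t batch = SortBatchSize(d);  // 0 if this target is unsupported
//   if (batch != 0) {
//     SortBatch<SortOrder::kAscending>(d, keys);  // sorts keys[0, batch)
//   }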
template <SortOrder kOrder, class D>
HWY_API size_t SortBatch(D d, TFromD<D>* inout) {
const size_t N = Lanes(d);
Vec<D> q0, q1, q2, q3;
detail::SortLanesIn4Vectors<kOrder>(d, inout, q0, q1, q2, q3);
detail::Transpose4x4(d, q0, q1, q2, q3);
detail::Merge2SortedQuartets<kOrder>(d, q0, q1, -1);
detail::Merge2SortedQuartets<kOrder>(d, q2, q3, -1);
// Bitonic merges require one input to be in reverse order.
constexpr SortOrder kReverse = Reverse(kOrder);
Vec<D> q4, q5, q6, q7;
detail::SortLanesIn4Vectors<kReverse>(d, inout + 4 * N, q4, q5, q6, q7);
detail::Transpose4x4(d, q4, q5, q6, q7);
detail::Merge2SortedQuartets<kReverse>(d, q4, q5, -1);
detail::Merge2SortedQuartets<kReverse>(d, q6, q7, -1);
detail::BitonicMerge4Quartets<kOrder>(d, q0, q1, q4, q5, -1);
detail::BitonicMerge4Quartets<kReverse>(d, q2, q3, q6, q7, -1);
detail::BitonicMerge8Quartets<kOrder>(d, q0, q1, q4, q5, q2, q3, q6, q7,
__LINE__);
if (N == 4) {
return detail::SingleQuartetPerVector<kOrder>(d, q0, q1, q4, q5, q2, q3, q6,
q7, inout);
}
if (N == 8) {
return detail::TwoQuartetsPerVector<kOrder>(d, q0, q1, q4, q5, q2, q3, q6,
q7, inout);
}
return detail::FourQuartetsPerVector<kOrder>(d, q0, q1, q4, q5, q2, q3, q6,
q7, inout);
}
#else
// Avoids unused attribute warning
template <SortOrder kOrder, class D>
HWY_API size_t SortBatch(D /* tag */, TFromD<D>* /* inout */) {
return 0;
}
#endif // HWY_TARGET != HWY_SCALAR && HWY_ARCH_X86
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#endif // HIGHWAY_HWY_CONTRIB_SORT_SORT_INL_H_


@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -12,97 +13,76 @@
// See the License for the specific language governing permissions and
// limitations under the License.
// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/sort_test.cc"
#include "hwy/foreach_target.h"
#include "hwy/contrib/sort/vqsort.h"
// After foreach_target
#include "hwy/contrib/sort/algo-inl.h"
#include "hwy/contrib/sort/result-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h" // BaseCase
#include "hwy/tests/test_util-inl.h"
// clang-format on
#include <stdint.h>
#include <stdio.h>
#include <string.h> // memcpy
#include <algorithm> // std::max
#include <vector>
#undef VQSORT_TEST_IMPL
#if (HWY_TARGET == HWY_SCALAR) || (defined(_MSC_VER) && !HWY_IS_DEBUG_BUILD)
// Scalar does not implement these, and MSVC non-debug builds time out.
#define VQSORT_TEST_IMPL 0
#else
#define VQSORT_TEST_IMPL 1
#endif
// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/sort_test.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
#undef VQSORT_TEST_SORT
// MSVC non-debug builds time out.
#if defined(_MSC_VER) && !HWY_IS_DEBUG_BUILD
#define VQSORT_TEST_SORT 0
#else
#define VQSORT_TEST_SORT 1
#endif
#include "hwy/contrib/sort/vqsort.h"
// After foreach_target
#include "hwy/contrib/sort/algo-inl.h"
#include "hwy/contrib/sort/traits128-inl.h"
#include "hwy/contrib/sort/result-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h" // BaseCase
#include "hwy/tests/test_util-inl.h"
// clang-format on
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace {
#if VQSORT_TEST_IMPL || VQSORT_TEST_SORT
using detail::LaneTraits;
using detail::OrderAscending;
using detail::OrderAscending128;
using detail::OrderDescending;
using detail::OrderDescending128;
using detail::SharedTraits;
using detail::TraitsLane;
#if VQSORT_ENABLED || HWY_IDE
using detail::OrderAscending128;
using detail::OrderAscendingKV128;
using detail::OrderDescending128;
using detail::OrderDescendingKV128;
using detail::Traits128;
#endif
#if !VQSORT_TEST_IMPL
static void TestAllMedian() {}
static void TestAllBaseCase() {}
static void TestAllPartition() {}
static void TestAllGenerator() {}
#else
template <class Traits>
static HWY_NOINLINE void TestMedian3() {
using T = uint64_t;
using D = CappedTag<T, 1>;
using LaneType = typename Traits::LaneType;
using D = CappedTag<LaneType, 1>;
SharedTraits<Traits> st;
const D d;
using V = Vec<D>;
for (uint32_t bits = 0; bits < 8; ++bits) {
const V v0 = Set(d, T{(bits & (1u << 0)) ? 1u : 0u});
const V v1 = Set(d, T{(bits & (1u << 1)) ? 1u : 0u});
const V v2 = Set(d, T{(bits & (1u << 2)) ? 1u : 0u});
const T m = GetLane(detail::MedianOf3(st, v0, v1, v2));
const V v0 = Set(d, LaneType{(bits & (1u << 0)) ? 1u : 0u});
const V v1 = Set(d, LaneType{(bits & (1u << 1)) ? 1u : 0u});
const V v2 = Set(d, LaneType{(bits & (1u << 2)) ? 1u : 0u});
const LaneType m = GetLane(detail::MedianOf3(st, v0, v1, v2));
// If at least half (rounded up) of the bits are 1, so is the median.
const size_t count = PopCount(bits);
HWY_ASSERT_EQ((count >= 2) ? static_cast<T>(1) : 0, m);
HWY_ASSERT_EQ((count >= 2) ? static_cast<LaneType>(1) : 0, m);
}
}
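// Example for TestMedian3 above: bits = 0b101 yields inputs 1,0,1 (two ones),
// so the median is 1; bits = 0b100 yields 0,0,1 (one), so the median is 0.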
HWY_NOINLINE void TestAllMedian() {
TestMedian3<LaneTraits<OrderAscending> >();
TestMedian3<TraitsLane<OrderAscending<uint64_t> > >();
}
template <class Traits, typename T>
template <class Traits>
static HWY_NOINLINE void TestBaseCaseAscDesc() {
using LaneType = typename Traits::LaneType;
SharedTraits<Traits> st;
const SortTag<T> d;
const SortTag<LaneType> d;
const size_t N = Lanes(d);
const size_t base_case_num = SortConstants::BaseCaseNum(N);
const size_t N1 = st.LanesPerKey();
constexpr int kDebug = 0;
auto aligned_keys = hwy::AllocateAligned<T>(N + base_case_num + N);
auto buf = hwy::AllocateAligned<T>(base_case_num + 2 * N);
auto aligned_lanes = hwy::AllocateAligned<LaneType>(N + base_case_num + N);
auto buf = hwy::AllocateAligned<LaneType>(base_case_num + 2 * N);
std::vector<size_t> lengths;
lengths.push_back(HWY_MAX(1, N1));
@ -123,43 +103,45 @@ static HWY_NOINLINE void TestBaseCaseAscDesc() {
for (bool asc : {false, true}) {
for (size_t len : lengths) {
for (size_t misalign : misalignments) {
T* HWY_RESTRICT keys = aligned_keys.get() + misalign;
LaneType* HWY_RESTRICT lanes = aligned_lanes.get() + misalign;
if (kDebug) {
printf("============%s asc %d N1 %d len %d misalign %d\n",
hwy::TypeName(T(), 1).c_str(), asc, static_cast<int>(N1),
st.KeyString().c_str(), asc, static_cast<int>(N1),
static_cast<int>(len), static_cast<int>(misalign));
}
for (size_t i = 0; i < misalign; ++i) {
aligned_keys[i] = hwy::LowestValue<T>();
aligned_lanes[i] = hwy::LowestValue<LaneType>();
}
InputStats<T> input_stats;
InputStats<LaneType> input_stats;
for (size_t i = 0; i < len; ++i) {
keys[i] =
asc ? static_cast<T>(T(i) + 1) : static_cast<T>(T(len) - T(i));
input_stats.Notify(keys[i]);
if (kDebug >= 2) printf("%3zu: %f\n", i, double(keys[i]));
lanes[i] = asc ? static_cast<LaneType>(LaneType(i) + 1)
: static_cast<LaneType>(LaneType(len) - LaneType(i));
input_stats.Notify(lanes[i]);
if (kDebug >= 2) {
printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
}
}
for (size_t i = len; i < base_case_num + N; ++i) {
keys[i] = hwy::LowestValue<T>();
lanes[i] = hwy::LowestValue<LaneType>();
}
detail::BaseCase(d, st, keys, len, buf.get());
detail::BaseCase(d, st, lanes, lanes + len, len, buf.get());
if (kDebug >= 2) {
printf("out>>>>>>\n");
for (size_t i = 0; i < len; ++i) {
printf("%3zu: %f\n", i, double(keys[i]));
printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
}
}
HWY_ASSERT(VerifySort(st, input_stats, keys, len, "BaseAscDesc"));
HWY_ASSERT(VerifySort(st, input_stats, lanes, len, "BaseAscDesc"));
for (size_t i = 0; i < misalign; ++i) {
if (aligned_keys[i] != hwy::LowestValue<T>())
if (aligned_lanes[i] != hwy::LowestValue<LaneType>())
HWY_ABORT("Overrun misalign at %d\n", static_cast<int>(i));
}
for (size_t i = len; i < base_case_num + N; ++i) {
if (keys[i] != hwy::LowestValue<T>())
if (lanes[i] != hwy::LowestValue<LaneType>())
HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
}
} // misalign
@ -167,17 +149,18 @@ static HWY_NOINLINE void TestBaseCaseAscDesc() {
} // asc
}
template <class Traits, typename T>
template <class Traits>
static HWY_NOINLINE void TestBaseCase01() {
using LaneType = typename Traits::LaneType;
SharedTraits<Traits> st;
const SortTag<T> d;
const SortTag<LaneType> d;
const size_t N = Lanes(d);
const size_t base_case_num = SortConstants::BaseCaseNum(N);
const size_t N1 = st.LanesPerKey();
constexpr int kDebug = 0;
auto keys = hwy::AllocateAligned<T>(base_case_num + N);
auto buf = hwy::AllocateAligned<T>(base_case_num + 2 * N);
auto lanes = hwy::AllocateAligned<LaneType>(base_case_num + N);
auto buf = hwy::AllocateAligned<LaneType>(base_case_num + 2 * N);
std::vector<size_t> lengths;
lengths.push_back(HWY_MAX(1, N1));
@ -189,65 +172,69 @@ static HWY_NOINLINE void TestBaseCase01() {
for (size_t len : lengths) {
if (kDebug) {
printf("============%s 01 N1 %d len %d\n", hwy::TypeName(T(), 1).c_str(),
printf("============%s 01 N1 %d len %d\n", st.KeyString().c_str(),
static_cast<int>(N1), static_cast<int>(len));
}
const uint64_t kMaxBits = AdjustedLog2Reps(HWY_MIN(len, size_t{14}));
for (uint64_t bits = 0; bits < ((1ull << kMaxBits) - 1); ++bits) {
InputStats<T> input_stats;
InputStats<LaneType> input_stats;
for (size_t i = 0; i < len; ++i) {
keys[i] = (i < 64 && (bits & (1ull << i))) ? 1 : 0;
input_stats.Notify(keys[i]);
if (kDebug >= 2) printf("%3zu: %f\n", i, double(keys[i]));
lanes[i] = (i < 64 && (bits & (1ull << i))) ? 1 : 0;
input_stats.Notify(lanes[i]);
if (kDebug >= 2) {
printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
}
}
for (size_t i = len; i < base_case_num + N; ++i) {
keys[i] = hwy::LowestValue<T>();
lanes[i] = hwy::LowestValue<LaneType>();
}
detail::BaseCase(d, st, keys.get(), len, buf.get());
detail::BaseCase(d, st, lanes.get(), lanes.get() + len, len, buf.get());
if (kDebug >= 2) {
printf("out>>>>>>\n");
for (size_t i = 0; i < len; ++i) {
printf("%3zu: %f\n", i, double(keys[i]));
printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
}
}
HWY_ASSERT(VerifySort(st, input_stats, keys.get(), len, "Base01"));
HWY_ASSERT(VerifySort(st, input_stats, lanes.get(), len, "Base01"));
for (size_t i = len; i < base_case_num + N; ++i) {
if (keys[i] != hwy::LowestValue<T>())
if (lanes[i] != hwy::LowestValue<LaneType>())
HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
}
} // bits
} // len
}
template <class Traits, typename T>
template <class Traits>
static HWY_NOINLINE void TestBaseCase() {
TestBaseCaseAscDesc<Traits, T>();
TestBaseCase01<Traits, T>();
TestBaseCaseAscDesc<Traits>();
TestBaseCase01<Traits>();
}
HWY_NOINLINE void TestAllBaseCase() {
// Workaround for stack overflow on MSVC debug.
#if defined(_MSC_VER) && HWY_IS_DEBUG_BUILD && (HWY_TARGET == HWY_AVX3)
#if defined(_MSC_VER)
return;
#endif
// Only enable EMU128 on x86 - it's slow on emulators.
if (!HWY_ARCH_X86 && (HWY_TARGET == HWY_EMU128)) return;
TestBaseCase<LaneTraits<OrderAscending>, int32_t>();
TestBaseCase<LaneTraits<OrderDescending>, int64_t>();
TestBaseCase<Traits128<OrderAscending128>, uint64_t>();
TestBaseCase<Traits128<OrderDescending128>, uint64_t>();
TestBaseCase<TraitsLane<OrderAscending<int32_t> > >();
TestBaseCase<TraitsLane<OrderDescending<int64_t> > >();
TestBaseCase<Traits128<OrderAscending128> >();
TestBaseCase<Traits128<OrderDescending128> >();
}
template <class Traits, typename T>
static HWY_NOINLINE void VerifyPartition(Traits st, T* HWY_RESTRICT keys,
size_t left, size_t border,
size_t right, const size_t N1,
const T* pivot) {
template <class Traits>
static HWY_NOINLINE void VerifyPartition(
Traits st, typename Traits::LaneType* HWY_RESTRICT lanes, size_t left,
size_t border, size_t right, const size_t N1,
const typename Traits::LaneType* pivot) {
/* for (size_t i = left; i < right; ++i) {
if (i == border) printf("--\n");
printf("%4zu: %3d\n", i, keys[i]);
printf("%4zu: %3d\n", i, lanes[i]);
}*/
HWY_ASSERT(left % N1 == 0);
@ -255,30 +242,33 @@ static HWY_NOINLINE void VerifyPartition(Traits st, T* HWY_RESTRICT keys,
HWY_ASSERT(right % N1 == 0);
const bool asc = typename Traits::Order().IsAscending();
for (size_t i = left; i < border; i += N1) {
if (st.Compare1(pivot, keys + i)) {
if (st.Compare1(pivot, lanes + i)) {
HWY_ABORT(
"%s: asc %d left[%d] piv %.0f %.0f compares before %.0f %.0f "
"border %d",
hwy::TypeName(T(), 1).c_str(), asc, static_cast<int>(i),
double(pivot[1]), double(pivot[0]), double(keys[i + 1]),
double(keys[i + 0]), static_cast<int>(border));
st.KeyString().c_str(), asc, static_cast<int>(i),
static_cast<double>(pivot[1]), static_cast<double>(pivot[0]),
static_cast<double>(lanes[i + 1]), static_cast<double>(lanes[i + 0]),
static_cast<int>(border));
}
}
for (size_t i = border; i < right; i += N1) {
if (!st.Compare1(pivot, keys + i)) {
if (!st.Compare1(pivot, lanes + i)) {
HWY_ABORT(
"%s: asc %d right[%d] piv %.0f %.0f compares after %.0f %.0f "
"border %d",
hwy::TypeName(T(), 1).c_str(), asc, static_cast<int>(i),
double(pivot[1]), double(pivot[0]), double(keys[i + 1]),
double(keys[i]), static_cast<int>(border));
st.KeyString().c_str(), asc, static_cast<int>(i),
static_cast<double>(pivot[1]), static_cast<double>(pivot[0]),
static_cast<double>(lanes[i + 1]), static_cast<double>(lanes[i]),
static_cast<int>(border));
}
}
}
template <class Traits, typename T>
template <class Traits>
static HWY_NOINLINE void TestPartition() {
const SortTag<T> d;
using LaneType = typename Traits::LaneType;
const SortTag<LaneType> d;
SharedTraits<Traits> st;
const bool asc = typename Traits::Order().IsAscending();
const size_t N = Lanes(d);
@ -286,8 +276,8 @@ static HWY_NOINLINE void TestPartition() {
const size_t base_case_num = SortConstants::BaseCaseNum(N);
// left + len + align
const size_t total = 32 + (base_case_num + 4 * HWY_MAX(N, 4)) + 2 * N;
auto aligned_keys = hwy::AllocateAligned<T>(total);
auto buf = hwy::AllocateAligned<T>(SortConstants::PartitionBufNum(N));
auto aligned_lanes = hwy::AllocateAligned<LaneType>(total);
auto buf = hwy::AllocateAligned<LaneType>(SortConstants::PartitionBufNum(N));
const size_t N1 = st.LanesPerKey();
for (bool in_asc : {false, true}) {
@ -296,61 +286,66 @@ static HWY_NOINLINE void TestPartition() {
for (size_t ofs : {N, N + 1, N + 2, N + 3, 2 * N, 2 * N + 1, 2 * N + 2,
2 * N + 3, 3 * N - 1, 4 * N - 3, 4 * N - 2}) {
const size_t len = (base_case_num + ofs) & ~(N1 - 1);
for (T pivot1 :
{T(0), T(len / 3), T(len / 2), T(2 * len / 3), T(len)}) {
const T pivot2[2] = {pivot1, 0};
for (LaneType pivot1 :
{LaneType(0), LaneType(len / 3), LaneType(len / 2),
LaneType(2 * len / 3), LaneType(len)}) {
const LaneType pivot2[2] = {pivot1, 0};
const auto pivot = st.SetKey(d, pivot2);
for (size_t misalign = 0; misalign < N;
misalign += st.LanesPerKey()) {
T* HWY_RESTRICT keys = aligned_keys.get() + misalign;
LaneType* HWY_RESTRICT lanes = aligned_lanes.get() + misalign;
const size_t right = left + len;
if (kDebug) {
printf(
"=========%s asc %d left %d len %d right %d piv %.0f %.0f\n",
hwy::TypeName(T(), 1).c_str(), asc, static_cast<int>(left),
st.KeyString().c_str(), asc, static_cast<int>(left),
static_cast<int>(len), static_cast<int>(right),
double(pivot2[1]), double(pivot2[0]));
static_cast<double>(pivot2[1]),
static_cast<double>(pivot2[0]));
}
for (size_t i = 0; i < misalign; ++i) {
aligned_keys[i] = hwy::LowestValue<T>();
aligned_lanes[i] = hwy::LowestValue<LaneType>();
}
for (size_t i = 0; i < left; ++i) {
keys[i] = hwy::LowestValue<T>();
lanes[i] = hwy::LowestValue<LaneType>();
}
for (size_t i = left; i < right; ++i) {
keys[i] = static_cast<T>(in_asc ? T(i + 1) - static_cast<T>(left)
: static_cast<T>(right) - T(i));
if (kDebug >= 2) printf("%3zu: %f\n", i, double(keys[i]));
lanes[i] = static_cast<LaneType>(
in_asc ? LaneType(i + 1) - static_cast<LaneType>(left)
: static_cast<LaneType>(right) - LaneType(i));
if (kDebug >= 2) {
printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
}
}
for (size_t i = right; i < total - misalign; ++i) {
keys[i] = hwy::LowestValue<T>();
lanes[i] = hwy::LowestValue<LaneType>();
}
size_t border =
detail::Partition(d, st, keys, left, right, pivot, buf.get());
detail::Partition(d, st, lanes, left, right, pivot, buf.get());
if (kDebug >= 2) {
printf("out>>>>>>\n");
for (size_t i = left; i < right; ++i) {
printf("%3zu: %f\n", i, double(keys[i]));
printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
}
for (size_t i = right; i < total - misalign; ++i) {
printf("%3zu: sentinel %f\n", i, double(keys[i]));
printf("%3zu: sentinel %f\n", i, static_cast<double>(lanes[i]));
}
}
VerifyPartition(st, keys, left, border, right, N1, pivot2);
VerifyPartition(st, lanes, left, border, right, N1, pivot2);
for (size_t i = 0; i < misalign; ++i) {
if (aligned_keys[i] != hwy::LowestValue<T>())
if (aligned_lanes[i] != hwy::LowestValue<LaneType>())
HWY_ABORT("Overrun misalign at %d\n", static_cast<int>(i));
}
for (size_t i = 0; i < left; ++i) {
if (keys[i] != hwy::LowestValue<T>())
if (lanes[i] != hwy::LowestValue<LaneType>())
HWY_ABORT("Overrun left at %d\n", static_cast<int>(i));
}
for (size_t i = right; i < total - misalign; ++i) {
if (keys[i] != hwy::LowestValue<T>())
if (lanes[i] != hwy::LowestValue<LaneType>())
HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
}
} // misalign
@ -361,15 +356,18 @@ static HWY_NOINLINE void TestPartition() {
}
HWY_NOINLINE void TestAllPartition() {
TestPartition<LaneTraits<OrderAscending>, int16_t>();
TestPartition<LaneTraits<OrderDescending>, int32_t>();
TestPartition<LaneTraits<OrderAscending>, int64_t>();
TestPartition<LaneTraits<OrderDescending>, float>();
// Only enable EMU128 on x86 - it's slow on emulators.
if (!HWY_ARCH_X86 && (HWY_TARGET == HWY_EMU128)) return;
TestPartition<TraitsLane<OrderAscending<int16_t> > >();
TestPartition<TraitsLane<OrderDescending<int32_t> > >();
TestPartition<TraitsLane<OrderAscending<int64_t> > >();
TestPartition<TraitsLane<OrderDescending<float> > >();
#if HWY_HAVE_FLOAT64
TestPartition<LaneTraits<OrderDescending>, double>();
TestPartition<TraitsLane<OrderDescending<double> > >();
#endif
TestPartition<Traits128<OrderAscending128>, uint64_t>();
TestPartition<Traits128<OrderDescending128>, uint64_t>();
TestPartition<Traits128<OrderAscending128> >();
TestPartition<Traits128<OrderDescending128> >();
}
// (used for sample selection for choosing a pivot)
@ -399,7 +397,7 @@ static HWY_NOINLINE void TestRandomGenerator() {
// Also ensure the mean is near the middle of the range
const double expected = (num_blocks - 1) / 2.0;
const double actual = double(sum) / kReps;
const double actual = static_cast<double>(sum) / kReps;
HWY_ASSERT(0.9 * expected <= actual && actual <= 1.1 * expected);
}
}
@ -409,22 +407,26 @@ HWY_NOINLINE void TestAllGenerator() {
TestRandomGenerator<uint64_t>();
}
#endif // VQSORT_TEST_IMPL
#if !VQSORT_TEST_SORT
static void TestAllSort() {}
#else
static void TestAllMedian() {}
static void TestAllBaseCase() {}
static void TestAllPartition() {}
static void TestAllGenerator() {}
#endif // VQSORT_ENABLED
// Remembers input, and compares results to that of a reference algorithm.
template <class Traits, typename T>
template <class Traits>
class CompareResults {
using LaneType = typename Traits::LaneType;
using KeyType = typename Traits::KeyType;
public:
void SetInput(const T* in, size_t num) {
copy_.resize(num);
memcpy(copy_.data(), in, num * sizeof(T));
CompareResults(const LaneType* in, size_t num_lanes) {
copy_.resize(num_lanes);
memcpy(copy_.data(), in, num_lanes * sizeof(LaneType));
}
bool Verify(const T* output) {
bool Verify(const LaneType* output) {
#if HAVE_PDQSORT
const Algo reference = Algo::kPDQ;
#else
@ -432,13 +434,28 @@ class CompareResults {
#endif
SharedState shared;
using Order = typename Traits::Order;
Run<Order>(reference, copy_.data(), copy_.size(), shared,
/*thread=*/0);
const Traits st;
const size_t num_keys = copy_.size() / st.LanesPerKey();
Run<Order>(reference, reinterpret_cast<KeyType*>(copy_.data()), num_keys,
shared, /*thread=*/0);
for (size_t i = 0; i < copy_.size(); ++i) {
if (copy_[i] != output[i]) {
fprintf(stderr, "Asc %d mismatch at %d: %A %A\n", Order().IsAscending(),
static_cast<int>(i), double(copy_[i]), double(output[i]));
if (sizeof(KeyType) == 16) {
fprintf(stderr,
"%s Asc %d mismatch at %d of %d: %" PRIu64 " %" PRIu64 "\n",
st.KeyString().c_str(), Order().IsAscending(),
static_cast<int>(i), static_cast<int>(copy_.size()),
static_cast<uint64_t>(copy_[i]),
static_cast<uint64_t>(output[i]));
} else {
fprintf(stderr, "Type %s Asc %d mismatch at %d of %d: ",
st.KeyString().c_str(), Order().IsAscending(),
static_cast<int>(i), static_cast<int>(copy_.size()));
PrintValue(copy_[i]);
PrintValue(output[i]);
fprintf(stderr, "\n");
}
return false;
}
}
@ -446,7 +463,7 @@ class CompareResults {
}
private:
std::vector<T> copy_;
std::vector<LaneType> copy_;
};
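// Usage sketch for CompareResults (mirrors the TestSort loop below; names are
// illustrative, and KeyType == LaneType for single-lane traits):
//   CompareResults<TraitsLane<OrderAscending<int32_t> > > compare(lanes, num_lanes);
//   Run<SortAscending>(algo, lanes, num_lanes, shared, /*thread=*/0);
//   HWY_ASSERT(compare.Verify(lanes));  // re-sorts the saved copy with the reference algo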
std::vector<Algo> AlgoForTest() {
@ -467,62 +484,65 @@ std::vector<Algo> AlgoForTest() {
};
}
template <class Traits, typename T>
void TestSort(size_t num) {
// TODO(janwas): fix
if (HWY_TARGET == HWY_SSSE3) return;
template <class Traits>
void TestSort(size_t num_lanes) {
// Workaround for stack overflow on clang-cl (/F 8388608 does not help).
#if defined(_MSC_VER) && HWY_IS_DEBUG_BUILD && (HWY_TARGET == HWY_AVX3)
#if defined(_MSC_VER)
return;
#endif
// Only enable EMU128 on x86 - it's slow on emulators.
if (!HWY_ARCH_X86 && (HWY_TARGET == HWY_EMU128)) return;
using Order = typename Traits::Order;
using LaneType = typename Traits::LaneType;
using KeyType = typename Traits::KeyType;
SharedState shared;
SharedTraits<Traits> st;
// Round up to a whole number of keys.
num_lanes += (st.Is128() && (num_lanes & 1));
const size_t num_keys = num_lanes / st.LanesPerKey();
constexpr size_t kMaxMisalign = 16;
auto aligned = hwy::AllocateAligned<T>(kMaxMisalign + num + kMaxMisalign);
auto aligned =
hwy::AllocateAligned<LaneType>(kMaxMisalign + num_lanes + kMaxMisalign);
for (Algo algo : AlgoForTest()) {
#if HAVE_IPS4O
if (st.Is128() && (algo == Algo::kIPS4O || algo == Algo::kParallelIPS4O)) {
continue;
}
#endif
for (Dist dist : AllDist()) {
for (size_t misalign : {size_t{0}, size_t{st.LanesPerKey()},
size_t{3 * st.LanesPerKey()}, kMaxMisalign / 2}) {
T* keys = aligned.get() + misalign;
LaneType* lanes = aligned.get() + misalign;
// Set up red zones before/after the keys to sort
for (size_t i = 0; i < misalign; ++i) {
aligned[i] = hwy::LowestValue<T>();
aligned[i] = hwy::LowestValue<LaneType>();
}
for (size_t i = 0; i < kMaxMisalign; ++i) {
keys[num + i] = hwy::HighestValue<T>();
lanes[num_lanes + i] = hwy::HighestValue<LaneType>();
}
#if HWY_IS_MSAN
__msan_poison(aligned.get(), misalign * sizeof(T));
__msan_poison(keys + num, kMaxMisalign * sizeof(T));
__msan_poison(aligned.get(), misalign * sizeof(LaneType));
__msan_poison(lanes + num_lanes, kMaxMisalign * sizeof(LaneType));
#endif
InputStats<T> input_stats = GenerateInput(dist, keys, num);
InputStats<LaneType> input_stats =
GenerateInput(dist, lanes, num_lanes);
CompareResults<Traits, T> compare;
compare.SetInput(keys, num);
Run<typename Traits::Order>(algo, keys, num, shared, /*thread=*/0);
HWY_ASSERT(compare.Verify(keys));
HWY_ASSERT(VerifySort(st, input_stats, keys, num, "TestSort"));
CompareResults<Traits> compare(lanes, num_lanes);
Run<Order>(algo, reinterpret_cast<KeyType*>(lanes), num_keys, shared,
/*thread=*/0);
HWY_ASSERT(compare.Verify(lanes));
HWY_ASSERT(VerifySort(st, input_stats, lanes, num_lanes, "TestSort"));
// Check red zones
#if HWY_IS_MSAN
__msan_unpoison(aligned.get(), misalign * sizeof(T));
__msan_unpoison(keys + num, kMaxMisalign * sizeof(T));
__msan_unpoison(aligned.get(), misalign * sizeof(LaneType));
__msan_unpoison(lanes + num_lanes, kMaxMisalign * sizeof(LaneType));
#endif
for (size_t i = 0; i < misalign; ++i) {
if (aligned[i] != hwy::LowestValue<T>())
if (aligned[i] != hwy::LowestValue<LaneType>())
HWY_ABORT("Overrun left at %d\n", static_cast<int>(i));
}
for (size_t i = num; i < num + kMaxMisalign; ++i) {
if (keys[i] != hwy::HighestValue<T>())
for (size_t i = num_lanes; i < num_lanes + kMaxMisalign; ++i) {
if (lanes[i] != hwy::HighestValue<LaneType>())
HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
}
} // misalign
@ -531,32 +551,37 @@ void TestSort(size_t num) {
}
void TestAllSort() {
const size_t num = 15 * 1000;
for (int num : {129, 504, 20 * 1000, 34567}) {
const size_t num_lanes = AdjustedReps(static_cast<size_t>(num));
TestSort<TraitsLane<OrderAscending<int16_t> > >(num_lanes);
TestSort<TraitsLane<OrderDescending<uint16_t> > >(num_lanes);
TestSort<LaneTraits<OrderAscending>, int16_t>(num);
TestSort<LaneTraits<OrderDescending>, uint16_t>(num);
TestSort<TraitsLane<OrderDescending<int32_t> > >(num_lanes);
TestSort<TraitsLane<OrderDescending<uint32_t> > >(num_lanes);
TestSort<LaneTraits<OrderDescending>, int32_t>(num);
TestSort<LaneTraits<OrderDescending>, uint32_t>(num);
TestSort<TraitsLane<OrderAscending<int64_t> > >(num_lanes);
TestSort<TraitsLane<OrderAscending<uint64_t> > >(num_lanes);
TestSort<LaneTraits<OrderAscending>, int64_t>(num);
TestSort<LaneTraits<OrderAscending>, uint64_t>(num);
// WARNING: for float types, SIMD comparisons will flush denormals to zero,
// causing mismatches with scalar sorts. In this test, we avoid generating
// denormal inputs.
TestSort<LaneTraits<OrderAscending>, float>(num);
// WARNING: for float types, SIMD comparisons will flush denormals to
// zero, causing mismatches with scalar sorts. In this test, we avoid
// generating denormal inputs.
TestSort<TraitsLane<OrderAscending<float> > >(num_lanes);
#if HWY_HAVE_FLOAT64 // protects algo-inl's GenerateRandom
if (Sorter::HaveFloat64()) {
TestSort<LaneTraits<OrderDescending>, double>(num);
}
if (Sorter::HaveFloat64()) {
TestSort<TraitsLane<OrderDescending<double> > >(num_lanes);
}
#endif
TestSort<Traits128<OrderAscending128>, uint64_t>(num);
TestSort<Traits128<OrderAscending128>, uint64_t>(num);
}
// Our HeapSort does not support 128-bit keys.
#if VQSORT_ENABLED
TestSort<Traits128<OrderAscending128> >(num_lanes);
TestSort<Traits128<OrderDescending128> >(num_lanes);
#endif // VQSORT_TEST_SORT
TestSort<Traits128<OrderAscendingKV128> >(num_lanes);
TestSort<Traits128<OrderDescendingKV128> >(num_lanes);
#endif
}
}
} // namespace
// NOLINTNEXTLINE(google-readability-namespace-comments)
@ -577,10 +602,4 @@ HWY_EXPORT_AND_TEST_P(SortTest, TestAllSort);
} // namespace
} // namespace hwy
// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
#endif // HWY_ONCE


@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -21,7 +22,6 @@
#define HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
#endif
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/shared-inl.h" // SortConstants
#include "hwy/highway.h"
@ -30,6 +30,8 @@ namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {
#if VQSORT_ENABLED
using Constants = hwy::SortConstants;
// ------------------------------ SharedTraits
@ -589,17 +591,19 @@ HWY_INLINE void Merge16(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4,
// Reshapes `buf` into a matrix, sorts columns independently, and then merges
// into a sorted 1D array without transposing.
//
// `st` is SharedTraits<LaneTraits/Traits128<Order*>>. This abstraction layer
// bridges differences in sort order and single-lane vs 128-bit keys.
// `st` is SharedTraits<Traits*<Order*>>. This abstraction layer bridges
// differences in sort order and single-lane vs 128-bit keys.
// `buf` ensures full vectors are aligned, and enables loads/stores without
// bounds checks.
//
// NOINLINE because this is large and called twice from vqsort-inl.h.
//
// References:
// https://drops.dagstuhl.de/opus/volltexte/2021/13775/pdf/LIPIcs-SEA-2021-3.pdf
// https://github.com/simd-sorting/fast-and-robust/blob/master/avx2_sort_demo/avx2sort.h
// "Entwurf und Implementierung vektorisierter Sortieralgorithmen" (M. Blacher)
template <class Traits, typename T>
HWY_INLINE void SortingNetwork(Traits st, T* HWY_RESTRICT buf, size_t cols) {
HWY_NOINLINE void SortingNetwork(Traits st, T* HWY_RESTRICT buf, size_t cols) {
const CappedTag<T, Constants::kMaxCols> d;
using V = decltype(Zero(d));
@ -646,8 +650,8 @@ HWY_INLINE void SortingNetwork(Traits st, T* HWY_RESTRICT buf, size_t cols) {
Merge8(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd,
ve, vf);
// Avoids build timeout
#if !HWY_COMPILER_MSVC
// Avoids build timeout. Must match #if condition in kMaxCols.
#if !HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD
if (HWY_LIKELY(keys >= 16 && kMaxKeys >= 16)) {
Merge16(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd,
ve, vf);
@ -677,6 +681,11 @@ HWY_INLINE void SortingNetwork(Traits st, T* HWY_RESTRICT buf, size_t cols) {
StoreU(vf, d, buf + 0xf * cols);
}
#else
template <class Base>
struct SharedTraits : public Base {};
#endif // VQSORT_ENABLED
} // namespace detail
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE


@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -21,36 +22,64 @@
#define HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE
#endif
#include "hwy/contrib/sort/disabled_targets.h"
#include <string>
#include "hwy/contrib/sort/shared-inl.h" // SortConstants
#include "hwy/contrib/sort/vqsort.h" // SortDescending
#include "hwy/highway.h"
#include "hwy/print.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {
#if VQSORT_ENABLED || HWY_IDE
// Highway does not provide a lane type for 128-bit keys, so we use uint64_t
// along with an abstraction layer for single-lane vs. lane-pair, which is
// independent of the order.
template <typename T>
struct KeyLane {
constexpr bool Is128() const { return false; }
constexpr size_t LanesPerKey() const { return 1; }
// What type bench_sort should allocate for generating inputs.
using LaneType = T;
// What type to pass to Sorter::operator().
using KeyType = T;
std::string KeyString() const {
char string100[100];
hwy::detail::TypeName(hwy::detail::MakeTypeInfo<KeyType>(), 1, string100);
return string100;
}
// For HeapSort
template <typename T>
HWY_INLINE void Swap(T* a, T* b) const {
const T temp = *a;
*a = *b;
*b = temp;
}
template <class V, class M>
HWY_INLINE V CompressKeys(V keys, M mask) const {
return CompressNot(keys, mask);
}
// Broadcasts one key into a vector
template <class D>
HWY_INLINE Vec<D> SetKey(D d, const TFromD<D>* key) const {
HWY_INLINE Vec<D> SetKey(D d, const T* key) const {
return Set(d, *key);
}
template <class D>
HWY_INLINE Mask<D> EqualKeys(D /*tag*/, Vec<D> a, Vec<D> b) const {
return Eq(a, b);
}
HWY_INLINE bool Equal1(const T* a, const T* b) { return *a == *b; }
template <class D>
HWY_INLINE Vec<D> ReverseKeys(D d, Vec<D> v) const {
return Reverse(d, v);
@ -148,10 +177,10 @@ struct KeyLane {
// We avoid overloaded functions because we want all functions to be callable
// from a SortTraits without per-function wrappers. Specializing would work, but
// we are anyway going to specialize at a higher level.
struct OrderAscending : public KeyLane {
template <typename T>
struct OrderAscending : public KeyLane<T> {
using Order = SortAscending;
template <typename T>
HWY_INLINE bool Compare1(const T* a, const T* b) {
return *a < *b;
}
@ -174,31 +203,31 @@ struct OrderAscending : public KeyLane {
template <class D>
HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
TFromD<D>* HWY_RESTRICT /* buf */) const {
T* HWY_RESTRICT /* buf */) const {
return MinOfLanes(d, v);
}
template <class D>
HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
TFromD<D>* HWY_RESTRICT /* buf */) const {
T* HWY_RESTRICT /* buf */) const {
return MaxOfLanes(d, v);
}
template <class D>
HWY_INLINE Vec<D> FirstValue(D d) const {
return Set(d, hwy::LowestValue<TFromD<D>>());
return Set(d, hwy::LowestValue<T>());
}
template <class D>
HWY_INLINE Vec<D> LastValue(D d) const {
return Set(d, hwy::HighestValue<TFromD<D>>());
return Set(d, hwy::HighestValue<T>());
}
};
struct OrderDescending : public KeyLane {
template <typename T>
struct OrderDescending : public KeyLane<T> {
using Order = SortDescending;
template <typename T>
HWY_INLINE bool Compare1(const T* a, const T* b) {
return *b < *a;
}
@ -220,32 +249,30 @@ struct OrderDescending : public KeyLane {
template <class D>
HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
TFromD<D>* HWY_RESTRICT /* buf */) const {
T* HWY_RESTRICT /* buf */) const {
return MaxOfLanes(d, v);
}
template <class D>
HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
TFromD<D>* HWY_RESTRICT /* buf */) const {
T* HWY_RESTRICT /* buf */) const {
return MinOfLanes(d, v);
}
template <class D>
HWY_INLINE Vec<D> FirstValue(D d) const {
return Set(d, hwy::HighestValue<TFromD<D>>());
return Set(d, hwy::HighestValue<T>());
}
template <class D>
HWY_INLINE Vec<D> LastValue(D d) const {
return Set(d, hwy::LowestValue<TFromD<D>>());
return Set(d, hwy::LowestValue<T>());
}
};
// Shared code that depends on Order.
template <class Base>
struct LaneTraits : public Base {
constexpr bool Is128() const { return false; }
struct TraitsLane : public Base {
// For each lane i: replaces a[i] with the first and b[i] with the second
// according to Base.
// Corresponds to a conditional swap, which is one "node" of a sorting
@ -315,6 +342,66 @@ struct LaneTraits : public Base {
}
};
#else
// Base class shared between OrderAscending, OrderDescending.
template <typename T>
struct KeyLane {
constexpr bool Is128() const { return false; }
constexpr size_t LanesPerKey() const { return 1; }
using LaneType = T;
using KeyType = T;
std::string KeyString() const {
char string100[100];
hwy::detail::TypeName(hwy::detail::MakeTypeInfo<KeyType>(), 1, string100);
return string100;
}
};
template <typename T>
struct OrderAscending : public KeyLane<T> {
using Order = SortAscending;
HWY_INLINE bool Compare1(const T* a, const T* b) { return *a < *b; }
template <class D>
HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) {
return Lt(a, b);
}
};
template <typename T>
struct OrderDescending : public KeyLane<T> {
using Order = SortDescending;
HWY_INLINE bool Compare1(const T* a, const T* b) { return *b < *a; }
template <class D>
HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) {
return Lt(b, a);
}
};
template <class Order>
struct TraitsLane : public Order {
// For HeapSort
template <typename T> // MSVC doesn't find typename Order::LaneType.
HWY_INLINE void Swap(T* a, T* b) const {
const T temp = *a;
*a = *b;
*b = temp;
}
template <class D>
HWY_INLINE Vec<D> SetKey(D d, const TFromD<D>* key) const {
return Set(d, *key);
}
};
#endif // VQSORT_ENABLED
} // namespace detail
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE


@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -21,6 +22,9 @@
#define HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
#endif
#include <string>
#include "hwy/contrib/sort/shared-inl.h"
#include "hwy/contrib/sort/vqsort.h" // SortDescending
#include "hwy/highway.h"
@ -29,48 +33,31 @@ namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {
#if HWY_TARGET == HWY_SCALAR
struct OrderAscending128 {
using Order = SortAscending;
template <typename T>
HWY_INLINE bool Compare1(const T* a, const T* b) {
return (a[1] == b[1]) ? a[0] < b[0] : a[1] < b[1];
}
};
struct OrderDescending128 {
using Order = SortDescending;
template <typename T>
HWY_INLINE bool Compare1(const T* a, const T* b) {
return (a[1] == b[1]) ? b[0] < a[0] : b[1] < a[1];
}
};
template <class Order>
struct Traits128 : public Order {
constexpr bool Is128() const { return true; }
constexpr size_t LanesPerKey() const { return 2; }
};
#else
#if VQSORT_ENABLED || HWY_IDE
// Highway does not provide a lane type for 128-bit keys, so we use uint64_t
// along with an abstraction layer for single-lane vs. lane-pair, which is
// independent of the order.
struct Key128 {
struct KeyAny128 {
constexpr bool Is128() const { return true; }
constexpr size_t LanesPerKey() const { return 2; }
template <typename T>
HWY_INLINE void Swap(T* a, T* b) const {
const FixedTag<T, 2> d;
// What type bench_sort should allocate for generating inputs.
using LaneType = uint64_t;
// KeyType and KeyString are defined by derived classes.
HWY_INLINE void Swap(LaneType* a, LaneType* b) const {
const FixedTag<LaneType, 2> d;
const auto temp = LoadU(d, a);
StoreU(LoadU(d, b), d, a);
StoreU(temp, d, b);
}
template <class V, class M>
HWY_INLINE V CompressKeys(V keys, M mask) const {
return CompressBlocksNot(keys, mask);
}
template <class D>
HWY_INLINE Vec<D> SetKey(D d, const TFromD<D>* key) const {
return LoadDup128(d, key);
@ -135,6 +122,23 @@ struct Key128 {
}
};
// Base class shared between OrderAscending128, OrderDescending128.
struct Key128 : public KeyAny128 {
// What type to pass to Sorter::operator().
using KeyType = hwy::uint128_t;
std::string KeyString() const { return "U128"; }
template <class D>
HWY_INLINE Mask<D> EqualKeys(D /*tag*/, Vec<D> a, Vec<D> b) const {
return Eq128(a, b);
}
HWY_INLINE bool Equal1(const LaneType* a, const LaneType* b) {
return a[0] == b[0] && a[1] == b[1];
}
};
// Anything order-related depends on the key traits *and* the order (see
// FirstOfLanes). We cannot implement just one Compare function because Lt128
// only compiles if the lane type is u64. Thus we need either overloaded
@ -145,8 +149,7 @@ struct Key128 {
struct OrderAscending128 : public Key128 {
using Order = SortAscending;
template <typename T>
HWY_INLINE bool Compare1(const T* a, const T* b) {
HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) {
return (a[1] == b[1]) ? a[0] < b[0] : a[1] < b[1];
}
@ -171,30 +174,6 @@ struct OrderAscending128 : public Key128 {
return Max128(d, a, b);
}
template <class D>
HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
TFromD<D>* HWY_RESTRICT buf) const {
const size_t N = Lanes(d);
Store(v, d, buf);
v = SetKey(d, buf + 0); // result must be broadcasted
for (size_t i = LanesPerKey(); i < N; i += LanesPerKey()) {
v = First(d, v, SetKey(d, buf + i));
}
return v;
}
template <class D>
HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
TFromD<D>* HWY_RESTRICT buf) const {
const size_t N = Lanes(d);
Store(v, d, buf);
v = SetKey(d, buf + 0); // result must be broadcasted
for (size_t i = LanesPerKey(); i < N; i += LanesPerKey()) {
v = Last(d, v, SetKey(d, buf + i));
}
return v;
}
// Same as for regular lanes because 128-bit lanes are u64.
template <class D>
HWY_INLINE Vec<D> FirstValue(D d) const {
@ -210,8 +189,7 @@ struct OrderAscending128 : public Key128 {
struct OrderDescending128 : public Key128 {
using Order = SortDescending;
template <typename T>
HWY_INLINE bool Compare1(const T* a, const T* b) {
HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) {
return (a[1] == b[1]) ? b[0] < a[0] : b[1] < a[1];
}
@ -236,28 +214,101 @@ struct OrderDescending128 : public Key128 {
return Min128(d, a, b);
}
// Same as for regular lanes because 128-bit lanes are u64.
template <class D>
HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
TFromD<D>* HWY_RESTRICT buf) const {
const size_t N = Lanes(d);
Store(v, d, buf);
v = SetKey(d, buf + 0); // result must be broadcasted
for (size_t i = LanesPerKey(); i < N; i += LanesPerKey()) {
v = First(d, v, SetKey(d, buf + i));
}
return v;
HWY_INLINE Vec<D> FirstValue(D d) const {
return Set(d, hwy::HighestValue<TFromD<D> >());
}
template <class D>
HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
TFromD<D>* HWY_RESTRICT buf) const {
const size_t N = Lanes(d);
Store(v, d, buf);
v = SetKey(d, buf + 0); // result must be broadcasted
for (size_t i = LanesPerKey(); i < N; i += LanesPerKey()) {
v = Last(d, v, SetKey(d, buf + i));
}
return v;
HWY_INLINE Vec<D> LastValue(D d) const {
return Set(d, hwy::LowestValue<TFromD<D> >());
}
};
// Base class shared between OrderAscendingKV128, OrderDescendingKV128.
struct KeyValue128 : public KeyAny128 {
// What type to pass to Sorter::operator().
using KeyType = K64V64;
std::string KeyString() const { return "KV128"; }
template <class D>
HWY_INLINE Mask<D> EqualKeys(D /*tag*/, Vec<D> a, Vec<D> b) const {
return Eq128Upper(a, b);
}
HWY_INLINE bool Equal1(const LaneType* a, const LaneType* b) {
return a[1] == b[1];
}
};
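// Note: KV128 keys order by the upper u64 only (the key); the lower u64 (the
// value) travels with it but never affects comparisons, hence Eq128Upper here
// and Lt128Upper in the orders below.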
struct OrderAscendingKV128 : public KeyValue128 {
using Order = SortAscending;
HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) {
return a[1] < b[1];
}
template <class D>
HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const {
return Lt128Upper(d, a, b);
}
// Used by CompareTop
template <class V>
HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const {
return Lt(a, b);
}
template <class D>
HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const {
return Min128Upper(d, a, b);
}
template <class D>
HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const {
return Max128Upper(d, a, b);
}
// Same as for regular lanes because 128-bit lanes are u64.
template <class D>
HWY_INLINE Vec<D> FirstValue(D d) const {
return Set(d, hwy::LowestValue<TFromD<D> >());
}
template <class D>
HWY_INLINE Vec<D> LastValue(D d) const {
return Set(d, hwy::HighestValue<TFromD<D> >());
}
};
struct OrderDescendingKV128 : public KeyValue128 {
using Order = SortDescending;
HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) {
return b[1] < a[1];
}
template <class D>
HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const {
return Lt128Upper(d, b, a);
}
// Used by CompareTop
template <class V>
HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const {
return Lt(b, a);
}
template <class D>
HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const {
return Max128Upper(d, a, b);
}
template <class D>
HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const {
return Min128Upper(d, a, b);
}
// Same as for regular lanes because 128-bit lanes are u64.
@ -275,16 +326,21 @@ struct OrderDescending128 : public Key128 {
// Shared code that depends on Order.
template <class Base>
class Traits128 : public Base {
#if HWY_TARGET <= HWY_AVX2
// Special case for >= 256 bit vectors
#if HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SVE_256
// Returns vector with only the top u64 lane valid. Useful when the next step
// is to replicate the mask anyway.
template <class D>
HWY_INLINE HWY_MAYBE_UNUSED Vec<D> CompareTop(D d, Vec<D> a, Vec<D> b) const {
const Base* base = static_cast<const Base*>(this);
const Vec<D> eqHL = VecFromMask(d, Eq(a, b));
const Mask<D> eqHL = Eq(a, b);
const Vec<D> ltHL = VecFromMask(d, base->CompareLanes(a, b));
#if HWY_TARGET == HWY_SVE_256
return IfThenElse(eqHL, DupEven(ltHL), ltHL);
#else
const Vec<D> ltLX = ShiftLeftLanes<1>(ltHL);
return OrAnd(ltHL, eqHL, ltLX);
return OrAnd(ltHL, VecFromMask(d, eqHL), ltLX);
#endif
}
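// Sketch of the result: in each key's top u64 lane, CompareTop computes
// cmpH | (eqH & cmpL), i.e. keys are ordered by their high lanes, with the
// low lanes as tie-breaker (CompareLanes supplies < or > depending on Order).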
// We want to swap 2 u128, i.e. 4 u64 lanes, based on the 0 or FF..FF mask in
@ -292,16 +348,42 @@ class Traits128 : public Base {
// replicate it 4x. Only called for >= 256-bit vectors.
template <class V>
HWY_INLINE V ReplicateTop4x(V v) const {
#if HWY_TARGET <= HWY_AVX3
#if HWY_TARGET == HWY_SVE_256
return svdup_lane_u64(v, 3);
#elif HWY_TARGET <= HWY_AVX3
return V{_mm512_permutex_epi64(v.raw, _MM_SHUFFLE(3, 3, 3, 3))};
#else // AVX2
return V{_mm256_permute4x64_epi64(v.raw, _MM_SHUFFLE(3, 3, 3, 3))};
#endif
}
#endif
#endif // HWY_TARGET
public:
constexpr bool Is128() const { return true; }
template <class D>
HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
TFromD<D>* HWY_RESTRICT buf) const {
const Base* base = static_cast<const Base*>(this);
const size_t N = Lanes(d);
Store(v, d, buf);
v = base->SetKey(d, buf + 0); // result must be broadcasted
for (size_t i = base->LanesPerKey(); i < N; i += base->LanesPerKey()) {
v = base->First(d, v, base->SetKey(d, buf + i));
}
return v;
}
template <class D>
HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
TFromD<D>* HWY_RESTRICT buf) const {
const Base* base = static_cast<const Base*>(this);
const size_t N = Lanes(d);
Store(v, d, buf);
v = base->SetKey(d, buf + 0); // result must be broadcasted
for (size_t i = base->LanesPerKey(); i < N; i += base->LanesPerKey()) {
v = base->Last(d, v, base->SetKey(d, buf + i));
}
return v;
}
template <class D>
HWY_INLINE void Sort2(D d, Vec<D>& a, Vec<D>& b) const {
@ -319,7 +401,7 @@ class Traits128 : public Base {
const Base* base = static_cast<const Base*>(this);
Vec<D> swapped = base->ReverseKeys2(d, v);
#if HWY_TARGET <= HWY_AVX2
#if HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SVE_256
const Vec<D> select = ReplicateTop4x(CompareTop(d, v, swapped));
return IfVecThenElse(select, swapped, v);
#else
@ -357,7 +439,7 @@ class Traits128 : public Base {
}
};
#endif // HWY_TARGET != HWY_SCALAR
#endif // VQSORT_ENABLED
} // namespace detail
// NOLINTNEXTLINE(google-readability-namespace-comments)


@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -16,6 +17,10 @@
#ifndef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_INL_H_
#define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_INL_H_
#ifndef VQSORT_PRINT
#define VQSORT_PRINT 0
#endif
// Makes it harder for adversaries to predict our sampling locations, at the
// cost of 1-2% increased runtime.
#ifndef VQSORT_SECURE_RNG
@ -26,10 +31,13 @@
#include "third_party/absl/random/random.h"
#endif
#if VQSORT_PRINT
#include <stdio.h>
#endif
#include <string.h> // memcpy
#include "hwy/cache_control.h" // Prefetch
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/cache_control.h" // Prefetch
#include "hwy/contrib/sort/vqsort.h" // Fill24Bytes
#if HWY_IS_MSAN
@ -47,6 +55,10 @@
#define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE
#endif
#if VQSORT_PRINT
#include "hwy/print-inl.h"
#endif
#include "hwy/contrib/sort/shared-inl.h"
#include "hwy/contrib/sort/sorting_networks-inl.h"
#include "hwy/highway.h"
@ -56,117 +68,66 @@ namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {
#if HWY_TARGET == HWY_SCALAR
template <typename T>
void Swap(T* a, T* b) {
T t = *a;
*a = *b;
*b = t;
}
// Scalar version of HeapSort (see below)
template <class Traits, typename T>
void HeapSort(Traits st, T* HWY_RESTRICT keys, const size_t num) {
if (num < 2) return;
// Build heap.
for (size_t i = 1; i < num; i += 1) {
size_t j = i;
while (j != 0) {
const size_t idx_parent = ((j - 1) / 1 / 2);
if (!st.Compare1(keys + idx_parent, keys + j)) {
break;
}
Swap(keys + j, keys + idx_parent);
j = idx_parent;
}
}
for (size_t i = num - 1; i != 0; i -= 1) {
// Swap root with last
Swap(keys + 0, keys + i);
// Sift down the new root.
size_t j = 0;
while (j < i) {
const size_t left = 2 * j + 1;
const size_t right = 2 * j + 2;
if (left >= i) break;
size_t idx_larger = j;
if (st.Compare1(keys + j, keys + left)) {
idx_larger = left;
}
if (right < i && st.Compare1(keys + idx_larger, keys + right)) {
idx_larger = right;
}
if (idx_larger == j) break;
Swap(keys + j, keys + idx_larger);
j = idx_larger;
}
}
}
#else
using Constants = hwy::SortConstants;
// ------------------------------ HeapSort
// Heapsort: O(1) space, O(N*logN) worst-case comparisons.
// Based on LLVM sanitizer_common.h, licensed under Apache-2.0.
template <class Traits, typename T>
void HeapSort(Traits st, T* HWY_RESTRICT keys, const size_t num) {
void SiftDown(Traits st, T* HWY_RESTRICT lanes, const size_t num_lanes,
size_t start) {
constexpr size_t N1 = st.LanesPerKey();
const FixedTag<T, N1> d;
if (num < 2 * N1) return;
// Build heap.
for (size_t i = N1; i < num; i += N1) {
size_t j = i;
while (j != 0) {
const size_t idx_parent = ((j - N1) / N1 / 2) * N1;
if (AllFalse(d, st.Compare(d, st.SetKey(d, keys + idx_parent),
st.SetKey(d, keys + j)))) {
break;
}
st.Swap(keys + j, keys + idx_parent);
j = idx_parent;
while (start < num_lanes) {
const size_t left = 2 * start + N1;
const size_t right = 2 * start + 2 * N1;
if (left >= num_lanes) break;
size_t idx_larger = start;
const auto key_j = st.SetKey(d, lanes + start);
if (AllTrue(d, st.Compare(d, key_j, st.SetKey(d, lanes + left)))) {
idx_larger = left;
}
}
for (size_t i = num - N1; i != 0; i -= N1) {
// Swap root with last
st.Swap(keys + 0, keys + i);
// Sift down the new root.
size_t j = 0;
while (j < i) {
const size_t left = 2 * j + N1;
const size_t right = 2 * j + 2 * N1;
if (left >= i) break;
size_t idx_larger = j;
const auto key_j = st.SetKey(d, keys + j);
if (AllTrue(d, st.Compare(d, key_j, st.SetKey(d, keys + left)))) {
idx_larger = left;
}
if (right < i && AllTrue(d, st.Compare(d, st.SetKey(d, keys + idx_larger),
st.SetKey(d, keys + right)))) {
idx_larger = right;
}
if (idx_larger == j) break;
st.Swap(keys + j, keys + idx_larger);
j = idx_larger;
if (right < num_lanes &&
AllTrue(d, st.Compare(d, st.SetKey(d, lanes + idx_larger),
st.SetKey(d, lanes + right)))) {
idx_larger = right;
}
if (idx_larger == start) break;
st.Swap(lanes + start, lanes + idx_larger);
start = idx_larger;
}
}
// Heapsort: O(1) space, O(N*logN) worst-case comparisons.
// Based on LLVM sanitizer_common.h, licensed under Apache-2.0.
template <class Traits, typename T>
void HeapSort(Traits st, T* HWY_RESTRICT lanes, const size_t num_lanes) {
constexpr size_t N1 = st.LanesPerKey();
if (num_lanes < 2 * N1) return;
// Build heap.
for (size_t i = ((num_lanes - N1) / N1 / 2) * N1; i != (~N1 + 1); i -= N1) {
SiftDown(st, lanes, num_lanes, i);
}
for (size_t i = num_lanes - N1; i != 0; i -= N1) {
// Swap root with last
st.Swap(lanes + 0, lanes + i);
// Sift down the new root.
SiftDown(st, lanes, i, 0);
}
}
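
A minimal scalar sketch of the same two-phase structure as SiftDown/HeapSort above (build the heap by sifting down internal nodes, then repeatedly swap the root to the end), assuming one lane per key; illustrative only:

#include <cstddef>
#include <utility>

void SiftDownScalar(int* a, size_t num, size_t start) {
  while (true) {
    const size_t left = 2 * start + 1;
    const size_t right = 2 * start + 2;
    if (left >= num) break;
    size_t largest = start;
    if (a[largest] < a[left]) largest = left;
    if (right < num && a[largest] < a[right]) largest = right;
    if (largest == start) break;
    std::swap(a[start], a[largest]);
    start = largest;  // continue sifting the displaced key
  }
}

void HeapSortScalar(int* a, size_t num) {
  if (num < 2) return;
  for (size_t i = num / 2; i-- > 0;) SiftDownScalar(a, num, i);  // build heap
  for (size_t i = num - 1; i != 0; --i) {
    std::swap(a[0], a[i]);    // move current maximum to its final slot
    SiftDownScalar(a, i, 0);  // restore the heap on the shrinking prefix
  }
}
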
#if VQSORT_ENABLED || HWY_IDE
// ------------------------------ BaseCase
// Sorts `keys` within the range [0, num) via sorting network.
template <class D, class Traits, typename T>
HWY_NOINLINE void BaseCase(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
HWY_NOINLINE void BaseCase(D d, Traits st, T* HWY_RESTRICT keys,
T* HWY_RESTRICT keys_end, size_t num,
T* HWY_RESTRICT buf) {
const size_t N = Lanes(d);
using V = decltype(Zero(d));
@ -184,14 +145,25 @@ HWY_NOINLINE void BaseCase(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
HWY_MAX(st.LanesPerKey(), num_pow2 >> Constants::kMaxRowsLog2);
HWY_DASSERT(cols <= N);
// We can avoid padding and load/store directly to `keys` after checking the
// original input array has enough space. Except at the right border, it's OK
// to sort more than the current sub-array. Even if we sort across a previous
// partition point, we know that keys will not migrate across it. However, we
// must use the maximum size of the sorting network, because the StoreU of its
// last vector would otherwise write invalid data starting at kMaxRows * cols.
const size_t N_sn = Lanes(CappedTag<T, Constants::kMaxCols>());
if (HWY_LIKELY(keys + N_sn * Constants::kMaxRows <= keys_end)) {
SortingNetwork(st, keys, N_sn);
return;
}
// Copy `keys` to `buf`.
size_t i;
for (i = 0; i + N <= num; i += N) {
Store(LoadU(d, keys + i), d, buf + i);
}
for (; i < num; ++i) {
buf[i] = keys[i];
}
SafeCopyN(num - i, d, keys + i, buf + i);
i = num;
// Fill with padding - last in sort order, not copied to keys.
const V kPadding = st.LastValue(d);
@ -206,9 +178,7 @@ HWY_NOINLINE void BaseCase(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
for (i = 0; i + N <= num; i += N) {
StoreU(Load(d, buf + i), d, keys + i);
}
for (; i < num; ++i) {
keys[i] = buf[i];
}
SafeCopyN(num - i, d, buf + i, keys + i);
}
// ------------------------------ Partition
@ -268,15 +238,35 @@ HWY_NOINLINE void PartitionToMultipleOfUnroll(D d, Traits st,
template <class D, class Traits, typename T>
HWY_INLINE void StoreLeftRight(D d, Traits st, const Vec<D> v,
const Vec<D> pivot, T* HWY_RESTRICT keys,
size_t& writeL, size_t& writeR) {
size_t& writeL, size_t& remaining) {
const size_t N = Lanes(d);
const auto comp = st.Compare(d, pivot, v);
const size_t num_left = CompressBlendedStore(v, Not(comp), d, keys + writeL);
writeL += num_left;
writeR -= (N - num_left);
(void)CompressBlendedStore(v, comp, d, keys + writeR);
remaining -= N;
if (hwy::HWY_NAMESPACE::CompressIsPartition<T>::value ||
(HWY_MAX_BYTES == 16 && st.Is128())) {
// Non-native Compress (e.g. AVX2): we are able to partition a vector using
// a single Compress+two StoreU instead of two Compress[Blended]Store. The
// latter are more expensive. Because we store entire vectors, the contents
// between the updated writeL and writeR are ignored and will be overwritten
// by subsequent calls. This works because writeL and writeR are at least
// two vectors apart.
const auto lr = st.CompressKeys(v, comp);
const size_t num_left = N - CountTrue(d, comp);
StoreU(lr, d, keys + writeL);
// Now write the right-side elements (if any), such that the previous writeR
// is one past the end of the newly written right elements, then advance.
StoreU(lr, d, keys + remaining + writeL);
writeL += num_left;
} else {
// Native Compress[Store] (e.g. AVX3), which only keep the left or right
// side, not both, hence we require two calls.
const size_t num_left = CompressStore(v, Not(comp), d, keys + writeL);
writeL += num_left;
(void)CompressBlendedStore(v, comp, d, keys + remaining + writeL);
}
}
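
A small worked example of the non-native path above, with illustrative numbers and assuming CompressKeys places the "left" (not-comp) keys before the "right" keys: let N = 4, pivot = 3, v = {5, 1, 7, 2}, writeL = 8 and remaining = 12 on entry. Then comp selects {5, 7}, lr = {1, 2, 5, 7} and num_left = 2; remaining becomes 8, the whole vector is stored at keys[8..11] and again at keys[16..19], and writeL advances to 10. Only keys[8..9] (left keys) and keys[18..19] (right keys, ending at the old writeR = 20) are meaningful; the lanes in between are scratch that later stores overwrite.
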
template <class D, class Traits, typename T>
@ -284,11 +274,11 @@ HWY_INLINE void StoreLeftRight4(D d, Traits st, const Vec<D> v0,
const Vec<D> v1, const Vec<D> v2,
const Vec<D> v3, const Vec<D> pivot,
T* HWY_RESTRICT keys, size_t& writeL,
size_t& writeR) {
StoreLeftRight(d, st, v0, pivot, keys, writeL, writeR);
StoreLeftRight(d, st, v1, pivot, keys, writeL, writeR);
StoreLeftRight(d, st, v2, pivot, keys, writeL, writeR);
StoreLeftRight(d, st, v3, pivot, keys, writeL, writeR);
size_t& remaining) {
StoreLeftRight(d, st, v0, pivot, keys, writeL, remaining);
StoreLeftRight(d, st, v1, pivot, keys, writeL, remaining);
StoreLeftRight(d, st, v2, pivot, keys, writeL, remaining);
StoreLeftRight(d, st, v3, pivot, keys, writeL, remaining);
}
// Moves "<= pivot" keys to the front, and others to the back. pivot is
@ -313,9 +303,39 @@ HWY_NOINLINE size_t Partition(D d, Traits st, T* HWY_RESTRICT keys, size_t left,
PartitionToMultipleOfUnroll(d, st, keys, left, right, pivot, buf);
constexpr size_t kUnroll = Constants::kPartitionUnroll;
// Invariant: [left, writeL) and [writeR, right) are already partitioned.
// Partition splits the vector into 3 sections, left to right: Elements
// smaller or equal to the pivot, unpartitioned elements and elements larger
// than the pivot. To write elements unconditionally on the loop body without
// overwriting existing data, we maintain two regions of the loop where all
// elements have been copied elsewhere (e.g. vector registers). I call these
// bufferL and bufferR, for left and right respectively.
//
// These regions are tracked by the indices (writeL, writeR, left, right) as
// presented in the diagram below.
//
// writeL writeR
// \/ \/
// | <= pivot | bufferL | unpartitioned | bufferR | > pivot |
// \/ \/
// left right
//
// In the main loop body below we choose a side, load some elements out of the
// vector and move either `left` or `right`. Next we call into StoreLeftRight
// to partition the data, and the partitioned elements will be written either
// to writeR or writeL and the corresponding index will be moved accordingly.
//
// Note that writeR is not explicitly tracked as an optimization for platforms
// with conditional operations. Instead we track writeL and the number of
// elements left to process (`remaining`). From the diagram above we can see
// that:
// writeR - writeL = remaining => writeR = remaining + writeL
//
// Tracking `remaining` is advantageous because each iteration reduces the
// number of unpartitioned elements by a fixed amount, so we can compute
// `remaining` without data dependencies.
//
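// Illustrative numbers for the identity above: with left = 0 and right = 32
// we start at writeL = 0, remaining = 32 (implicit writeR = 32). After one
// 4-lane StoreLeftRight with 3 keys <= pivot, remaining = 28 and writeL = 3,
// so writeR = writeL + remaining = 31, and the single "> pivot" key sits at
// index 31, directly below the previous writeR.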
size_t writeL = left;
size_t writeR = right;
size_t remaining = right - left;
const size_t num = right - left;
// Cannot load if there were fewer than 2 * kUnroll * N.
@ -339,12 +359,33 @@ HWY_NOINLINE size_t Partition(D d, Traits st, T* HWY_RESTRICT keys, size_t left,
while (left != right) {
V v0, v1, v2, v3;
// Free up capacity for writing by loading from the side that has less.
// Data-dependent but branching is faster than forcing branch-free.
const size_t capacityL = left - writeL;
const size_t capacityR = writeR - right;
HWY_DASSERT(capacityL <= num && capacityR <= num); // >= 0
if (capacityR < capacityL) {
HWY_DASSERT(capacityL <= num); // >= 0
// Load data from the end of the vector with less data (front or back).
// The next paragraphs explain how this works.
//
// let block_size = (kUnroll * N)
// On the loop prelude we load block_size elements from the front of the
// vector and an additional block_size elements from the back. On each
// iteration k elements are written to the front of the vector and
// (block_size - k) to the back.
//
// This creates a loop invariant where the capacity on the front
// (capacityL) and on the back (capacityR) always add to 2 * block_size.
// In other words:
// capacityL + capacityR = 2 * block_size
// capacityR = 2 * block_size - capacityL
//
// This means that:
// capacityL < capacityR <=>
// capacityL < 2 * block_size - capacityL <=>
// 2 * capacityL < 2 * block_size <=>
// capacityL < block_size
//
// Thus the check on the next line is equivalent to capacityL > capacityR.
//
if (kUnroll * N < capacityL) {
right -= kUnroll * N;
v0 = LoadU(d, keys + right + 0 * N);
v1 = LoadU(d, keys + right + 1 * N);
@ -360,16 +401,16 @@ HWY_NOINLINE size_t Partition(D d, Traits st, T* HWY_RESTRICT keys, size_t left,
hwy::Prefetch(keys + left + 3 * kUnroll * N);
}
StoreLeftRight4(d, st, v0, v1, v2, v3, pivot, keys, writeL, writeR);
StoreLeftRight4(d, st, v0, v1, v2, v3, pivot, keys, writeL, remaining);
}
// Now finish writing the initial left/right to the middle.
StoreLeftRight4(d, st, vL0, vL1, vL2, vL3, pivot, keys, writeL, writeR);
StoreLeftRight4(d, st, vR0, vR1, vR2, vR3, pivot, keys, writeL, writeR);
StoreLeftRight4(d, st, vL0, vL1, vL2, vL3, pivot, keys, writeL, remaining);
StoreLeftRight4(d, st, vR0, vR1, vR2, vR3, pivot, keys, writeL, remaining);
}
// We have partitioned [left, right) such that writeL is the boundary.
HWY_DASSERT(writeL == writeR);
HWY_DASSERT(remaining == 0);
// Make space for inserting vlast: move up to N of the first right-side keys
// into the unused space starting at last. If we have fewer, ensure they are
// the last items in that vector by subtracting from the *load* address,
@ -406,41 +447,6 @@ HWY_INLINE V MedianOf3(Traits st, V v0, V v1, V v2) {
return v1;
}
// Replaces triplets with their median and recurses until less than 3 keys
// remain. Ignores leftover values (non-whole triplets)!
template <class D, class Traits, typename T>
Vec<D> RecursiveMedianOf3(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
T* HWY_RESTRICT buf) {
const size_t N = Lanes(d);
constexpr size_t N1 = st.LanesPerKey();
if (num < 3 * N1) return st.SetKey(d, keys);
size_t read = 0;
size_t written = 0;
// Triplets of vectors
for (; read + 3 * N <= num; read += 3 * N) {
const auto v0 = Load(d, keys + read + 0 * N);
const auto v1 = Load(d, keys + read + 1 * N);
const auto v2 = Load(d, keys + read + 2 * N);
Store(MedianOf3(st, v0, v1, v2), d, buf + written);
written += N;
}
// Triplets of keys
for (; read + 3 * N1 <= num; read += 3 * N1) {
const auto v0 = st.SetKey(d, keys + read + 0 * N1);
const auto v1 = st.SetKey(d, keys + read + 1 * N1);
const auto v2 = st.SetKey(d, keys + read + 2 * N1);
StoreU(MedianOf3(st, v0, v1, v2), d, buf + written);
written += N1;
}
// Tail recursion; swap buffers
return RecursiveMedianOf3(d, st, buf, written, keys);
}
#if VQSORT_SECURE_RNG
using Generator = absl::BitGen;
#else
@ -453,6 +459,11 @@ class Generator {
k_ = 1; // stream index: must be odd
}
explicit Generator(uint64_t seed) {
a_ = b_ = w_ = seed;
k_ = 1;
}
uint64_t operator()() {
const uint64_t b = b_;
w_ += k_;
@ -481,19 +492,190 @@ HWY_INLINE size_t RandomChunkIndex(const uint32_t num_chunks, uint32_t bits) {
return static_cast<size_t>(chunk_index);
}
template <class Traits, typename T>
HWY_INLINE void SortSamples(Traits st, T* HWY_RESTRICT buf) {
// buf contains 192 bytes, so 16 128-bit vectors are necessary and sufficient.
constexpr size_t kSampleLanes = 3 * 64 / sizeof(T);
const CappedTag<T, 16 / sizeof(T)> d128;
const size_t N128 = Lanes(d128);
constexpr size_t kCols = HWY_MIN(16 / sizeof(T), Constants::kMaxCols);
constexpr size_t kBytes = kCols * Constants::kMaxRows * sizeof(T);
static_assert(192 <= kBytes, "");
// Fill with padding - last in sort order.
const auto kPadding = st.LastValue(d128);
// Initialize an extra vector because SortingNetwork loads full vectors,
// which may exceed cols*kMaxRows.
for (size_t i = kSampleLanes; i <= kBytes / sizeof(T); i += N128) {
StoreU(kPadding, d128, buf + i);
}
SortingNetwork(st, buf, kCols);
}
template <class Traits, typename T>
HWY_INLINE size_t PivotRank(Traits st, T* HWY_RESTRICT buf) {
constexpr size_t kSampleLanes = 3 * 64 / sizeof(T);
constexpr size_t N1 = st.LanesPerKey();
constexpr size_t kRankMid = kSampleLanes / 2;
static_assert(kRankMid % N1 == 0, "Mid is not an aligned key");
// Find the previous value not equal to the median.
size_t rank_prev = kRankMid - N1;
for (; st.Equal1(buf + rank_prev, buf + kRankMid); rank_prev -= N1) {
// All previous samples are equal to the median.
if (rank_prev == 0) return 0;
}
size_t rank_next = rank_prev + N1;
for (; st.Equal1(buf + rank_next, buf + kRankMid); rank_next += N1) {
// The median is also the largest sample. If it is also the largest key,
// we'd end up with an empty right partition, so choose the previous key.
if (rank_next == kSampleLanes - N1) return rank_prev;
}
// If we choose the median as pivot, the ratio of keys ending in the left
// partition will likely be rank_next/kSampleLanes (if the sample is
// representative). This is because equal-to-pivot values also land in the
// left - it's infeasible to do an in-place vectorized 3-way partition.
// Check whether prev would lead to a more balanced partition.
const size_t excess_if_median = rank_next - kRankMid;
const size_t excess_if_prev = kRankMid - rank_prev;
return excess_if_median < excess_if_prev ? kRankMid : rank_prev;
}
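
A scalar sketch of the PivotRank logic above, assuming one lane per key, ascending order and a sorted sample of at least two keys; illustrative, not the library's implementation:

#include <cstddef>

size_t PivotRankScalar(const int* sorted, size_t n) {
  const size_t mid = n / 2;
  // Find the last sample strictly less than the median.
  size_t rank_prev = mid - 1;
  for (; sorted[rank_prev] == sorted[mid]; --rank_prev) {
    if (rank_prev == 0) return 0;  // all lower samples equal the median
  }
  // Find the first sample after the run of median-equal values.
  size_t rank_next = rank_prev + 1;
  for (; sorted[rank_next] == sorted[mid]; ++rank_next) {
    if (rank_next == n - 1) return rank_prev;  // median is also the maximum
  }
  // Pick whichever candidate promises the more balanced partition.
  const size_t excess_if_median = rank_next - mid;
  const size_t excess_if_prev = mid - rank_prev;
  return excess_if_median < excess_if_prev ? mid : rank_prev;
}
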
#if VQSORT_PRINT
// Compute exact min/max.
template <class D, class Traits, typename T>
HWY_NOINLINE Vec<D> ChoosePivot(D d, Traits st, T* HWY_RESTRICT keys,
const size_t begin, const size_t end,
T* HWY_RESTRICT buf, Generator& rng) {
HWY_NOINLINE void ScanMinMax(D d, Traits st, const T* HWY_RESTRICT keys,
size_t num, T* HWY_RESTRICT buf, Vec<D>& first,
Vec<D>& last) {
const size_t N = Lanes(d);
first = st.LastValue(d);
last = st.FirstValue(d);
size_t i = 0;
for (; i + N <= num; i += N) {
const Vec<D> v = LoadU(d, keys + i);
first = st.First(d, v, first);
last = st.Last(d, v, last);
}
if (HWY_LIKELY(i != num)) {
HWY_DASSERT(num >= N); // See HandleSpecialCases
const Vec<D> v = LoadU(d, keys + num - N);
first = st.First(d, v, first);
last = st.Last(d, v, last);
}
first = st.FirstOfLanes(d, first, buf);
last = st.LastOfLanes(d, last, buf);
}
#endif // VQSORT_PRINT
template <class D, class Traits, typename T>
HWY_INLINE bool ScanEqual(D d, Traits st, const T* HWY_RESTRICT keys,
size_t num) {
using V = Vec<decltype(d)>;
const size_t N = Lanes(d);
HWY_DASSERT(num >= N); // See HandleSpecialCases
const V reference = st.SetKey(d, keys);
const V zero = Zero(d);
// Sticky bits registering any difference between `keys` and the first key.
// We use vector XOR because it may be cheaper than comparisons, especially
// for 128-bit. 2x unrolled for more ILP.
V diff0 = zero;
V diff1 = zero;
// We want to stop once a difference has been found, but without slowing down
// the loop by comparing during each iteration. The compromise is to compare
// after a 'group', which consists of kLoops times two vectors.
constexpr size_t kLoops = 4;
const size_t lanes_per_group = kLoops * 2 * N;
size_t i = 0;
for (; i + lanes_per_group <= num; i += lanes_per_group) {
for (size_t loop = 0; loop < kLoops; ++loop) {
const V v0 = LoadU(d, keys + i + loop * 2 * N);
const V v1 = LoadU(d, keys + i + loop * 2 * N + N);
// TODO(janwas): ternlog
diff0 = Or(diff0, Xor(v0, reference));
diff1 = Or(diff1, Xor(v1, reference));
}
diff0 = Or(diff0, diff1);
if (!AllTrue(d, Eq(diff0, zero))) {
return false;
}
}
// Whole vectors, no unrolling
for (; i + N <= num; i += N) {
const V v0 = LoadU(d, keys + i);
// TODO(janwas): ternlog
diff0 = Or(diff0, Xor(v0, reference));
if (!AllTrue(d, Eq(diff0, zero))) {
return false;
}
}
// If there are remainders, re-check the last whole vector.
if (HWY_LIKELY(i != num)) {
const V v0 = LoadU(d, keys + num - N);
// TODO(janwas): ternlog
diff0 = Or(diff0, Xor(v0, reference));
if (!AllTrue(d, Eq(diff0, zero))) {
return false;
}
}
return true;
}
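
A scalar sketch of the grouped early-exit idea in ScanEqual above: accumulate XOR differences into "sticky bits" and only branch once per group, keeping the hot loop branch-light; illustrative only:

#include <cstddef>
#include <cstdint>

bool AllEqualScalar(const uint64_t* keys, size_t num) {
  if (num == 0) return true;
  const uint64_t reference = keys[0];
  uint64_t diff = 0;
  constexpr size_t kGroup = 8;
  size_t i = 0;
  for (; i + kGroup <= num; i += kGroup) {
    for (size_t j = 0; j < kGroup; ++j) diff |= keys[i + j] ^ reference;
    if (diff != 0) return false;  // one check per group, not per key
  }
  for (; i < num; ++i) diff |= keys[i] ^ reference;
  return diff == 0;
}
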
// Returns key prior to reference in sort order.
template <class D, class Traits, typename T>
HWY_INLINE Vec<D> ScanForPrev(D d, Traits st, const T* HWY_RESTRICT keys,
size_t num, Vec<D> reference,
T* HWY_RESTRICT buf) {
const size_t N = Lanes(d);
HWY_DASSERT(num >= N); // See HandleSpecialCases
Vec<D> prev = st.FirstValue(d);
Mask<D> any_found = st.Compare(d, prev, prev); // false
size_t i = 0;
// Whole vectors, no unrolling
for (; i + N <= num; i += N) {
const Vec<D> curr = LoadU(d, keys + i);
const auto is_before = st.Compare(d, curr, reference);
any_found = Or(any_found, is_before);
prev = IfThenElse(is_before, st.Last(d, prev, curr), prev);
}
// If there are remainders, re-check the last whole vector.
if (HWY_LIKELY(i != num)) {
const Vec<D> curr = LoadU(d, keys + num - N);
const auto is_before = st.Compare(d, curr, reference);
any_found = Or(any_found, is_before);
prev = IfThenElse(is_before, st.Last(d, prev, curr), prev);
}
const Vec<D> candidate = st.LastOfLanes(d, prev, buf);
// If we didn't find any key less than reference, we're still stuck with
// FirstValue; replace that with reference. (We cannot compare directly to
// FirstValue because that might be the desired value of prev.)
return IfThenElse(any_found, candidate, reference);
}
enum class PivotResult {
kNormal, // use partition
kAllEqual, // already done
};
template <class D, class Traits, typename T>
HWY_INLINE void DrawSamples(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
T* HWY_RESTRICT buf, Generator& rng) {
using V = decltype(Zero(d));
const size_t N = Lanes(d);
// Power of two
const size_t lanes_per_chunk = Constants::LanesPerChunk(sizeof(T), N);
keys += begin;
size_t num = end - begin;
// Align start of keys to chunks. We always have at least 2 chunks because the
// base case would have handled anything up to 16 vectors, i.e. >= 4 chunks.
HWY_DASSERT(num >= 2 * lanes_per_chunk);
@ -548,49 +730,92 @@ HWY_NOINLINE Vec<D> ChoosePivot(D d, Traits st, T* HWY_RESTRICT keys,
const V medians2 = MedianOf3(st, v6, v7, v8);
Store(medians2, d, buf + i + lanes_per_chunk * 2);
}
return RecursiveMedianOf3(d, st, buf, 3 * lanes_per_chunk,
buf + 3 * lanes_per_chunk);
}
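
The chunked sampling above reduces each random chunk's three keys to their median before choosing the pivot, which bounds the influence of any single extreme key; a scalar median-of-3 for intuition (illustrative only):

#include <algorithm>

int MedianOf3Scalar(int a, int b, int c) {
  // Median = larger of the two "smaller" candidates.
  return std::max(std::min(a, b), std::min(std::max(a, b), c));
}
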
// Compute exact min/max to detect all-equal partitions. Only called after a
// degenerate Partition (none in the right partition).
// Returns pivot, which is never the largest key (thus the right partition will
// never be empty).
template <class D, class Traits, typename T>
HWY_NOINLINE void ScanMinMax(D d, Traits st, const T* HWY_RESTRICT keys,
size_t num, T* HWY_RESTRICT buf, Vec<D>& first,
Vec<D>& last) {
HWY_NOINLINE Vec<D> ChoosePivot(D d, Traits st, T* HWY_RESTRICT keys,
const size_t begin, const size_t end,
T* HWY_RESTRICT buf, Generator& rng,
PivotResult& result) {
using V = decltype(Zero(d));
const size_t N = Lanes(d);
first = st.LastValue(d);
last = st.FirstValue(d);
constexpr size_t kSampleLanes = 3 * 64 / sizeof(T);
constexpr size_t N1 = st.LanesPerKey();
size_t i = 0;
for (; i + N <= num; i += N) {
const Vec<D> v = LoadU(d, keys + i);
first = st.First(d, v, first);
last = st.Last(d, v, last);
const size_t num = end - begin;
#if VQSORT_PRINT
fprintf(stderr, "\nChoosePivot num %zu:\n", num);
#endif
DrawSamples(d, st, keys + begin, num, buf, rng);
SortSamples(st, buf);
#if VQSORT_PRINT
for (size_t i = 0; i < kSampleLanes; i += N) {
Print(d, "", Load(d, buf + i), 0, N);
}
if (HWY_LIKELY(i != num)) {
HWY_DASSERT(num >= N); // See HandleSpecialCases
const Vec<D> v = LoadU(d, keys + num - N);
first = st.First(d, v, first);
last = st.Last(d, v, last);
#endif
// All samples are equal.
if (st.Equal1(buf, buf + kSampleLanes - N1)) {
const bool all_eq = ScanEqual(d, st, keys + begin, num);
#if VQSORT_PRINT
fprintf(stderr, "Pivot num=%zu all eq samples, keys also: %d\n", num,
all_eq);
#endif
if (all_eq) {
result = PivotResult::kAllEqual;
return Zero(d);
}
// If the sample is indeed the most common key and it is the largest, then
// the right partition will be empty. Prevent this by replacing the pivot
// with the previous key in sort order. By contrast, selecting the first key
// in sort order would guarantee (minimal) progress. We instead do a full
// scan to maximize load balance in case there are numerous keys that
// precede the most common key.
result = PivotResult::kNormal;
const V reference = st.SetKey(d, buf);
const V pivot = ScanForPrev(d, st, keys + begin, num, reference, buf);
#if VQSORT_PRINT
Print(d, "PREV pivot", pivot, 0, st.LanesPerKey());
#endif
return pivot;
}
first = st.FirstOfLanes(d, first, buf);
last = st.LastOfLanes(d, last, buf);
const size_t pivot_rank = PivotRank(st, buf);
const Vec<D> pivot = st.SetKey(d, buf + pivot_rank);
#if VQSORT_PRINT
fprintf(stderr, " Pivot rank %zu = %.0f\n", pivot_rank,
static_cast<double>(GetLane(pivot)));
#endif
result = PivotResult::kNormal;
return pivot;
}
template <class D, class Traits, typename T>
void Recurse(D d, Traits st, T* HWY_RESTRICT keys, const size_t begin,
const size_t end, const Vec<D> pivot, T* HWY_RESTRICT buf,
Generator& rng, size_t remaining_levels) {
void Recurse(D d, Traits st, T* HWY_RESTRICT keys, T* HWY_RESTRICT keys_end,
const size_t begin, const size_t end, const Vec<D> pivot,
T* HWY_RESTRICT buf, Generator& rng, size_t remaining_levels) {
HWY_DASSERT(begin + 1 < end);
const size_t num = end - begin; // >= 2
#if VQSORT_PRINT
fprintf(stderr, "- Recurse remaining %zu [%zu %zu) len %zu\n",
remaining_levels, begin, end, num);
Vec<D> first, last;
ScanMinMax(d, st, keys + begin, num, buf, first, last);
Print(d, "first", first, 0, st.LanesPerKey());
Print(d, "last", last, 0, st.LanesPerKey());
#endif
// Too many degenerate partitions. This is extremely unlikely to happen
// because we select pivots from large (though still O(1)) samples.
// Too many recursions. This is unlikely to happen because we select pivots
// from large (though still O(1)) samples.
if (HWY_UNLIKELY(remaining_levels == 0)) {
#if VQSORT_PRINT
fprintf(stderr, "HeapSort reached, size=%zu\n", num);
#endif
HeapSort(st, keys + begin, num); // Slow but N*logN.
return;
}
@ -604,35 +829,31 @@ void Recurse(D d, Traits st, T* HWY_RESTRICT keys, const size_t begin,
const ptrdiff_t num_right =
static_cast<ptrdiff_t>(end) - static_cast<ptrdiff_t>(bound);
// Check for degenerate partitions (i.e. Partition did not move any keys):
if (HWY_UNLIKELY(num_right == 0)) {
// Because the pivot is one of the keys, it must have been equal to the
// first or last key in sort order. Scan for the actual min/max:
// passing the current pivot as the new bound is insufficient because one of
// the partitions might not actually include that key.
Vec<D> first, last;
ScanMinMax(d, st, keys + begin, num, buf, first, last);
if (AllTrue(d, Eq(first, last))) return;
// Separate recursion to make sure that we don't pick `last` as the
// pivot - that would again lead to a degenerate partition.
Recurse(d, st, keys, begin, end, first, buf, rng, remaining_levels - 1);
return;
}
// ChoosePivot ensures pivot != largest key, so this should never happen.
HWY_ASSERT(num_right != 0);
if (HWY_UNLIKELY(num_left <= base_case_num)) {
BaseCase(d, st, keys + begin, static_cast<size_t>(num_left), buf);
BaseCase(d, st, keys + begin, keys_end, static_cast<size_t>(num_left), buf);
} else {
const Vec<D> next_pivot = ChoosePivot(d, st, keys, begin, bound, buf, rng);
Recurse(d, st, keys, begin, bound, next_pivot, buf, rng,
remaining_levels - 1);
PivotResult result;
const Vec<D> next_pivot =
ChoosePivot(d, st, keys, begin, bound, buf, rng, result);
if (result != PivotResult::kAllEqual) {
Recurse(d, st, keys, keys_end, begin, bound, next_pivot, buf, rng,
remaining_levels - 1);
}
}
if (HWY_UNLIKELY(num_right <= base_case_num)) {
BaseCase(d, st, keys + bound, static_cast<size_t>(num_right), buf);
BaseCase(d, st, keys + bound, keys_end, static_cast<size_t>(num_right),
buf);
} else {
const Vec<D> next_pivot = ChoosePivot(d, st, keys, bound, end, buf, rng);
Recurse(d, st, keys, bound, end, next_pivot, buf, rng,
remaining_levels - 1);
PivotResult result;
const Vec<D> next_pivot =
ChoosePivot(d, st, keys, bound, end, buf, rng, result);
if (result != PivotResult::kAllEqual) {
Recurse(d, st, keys, keys_end, bound, end, next_pivot, buf, rng,
remaining_levels - 1);
}
}
}
@ -646,10 +867,11 @@ bool HandleSpecialCases(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
// 128-bit keys require vectors with at least two u64 lanes, which is always
// the case unless `d` requests partial vectors (e.g. fraction = 1/2) AND the
// hardware vector width is less than 128bit / fraction.
const bool partial_128 = N < 2 && st.Is128();
const bool partial_128 = !IsFull(d) && N < 2 && st.Is128();
// Partition assumes its input is at least two vectors. If vectors are huge,
// base_case_num may actually be smaller. If so, which is only possible on
// RVV, pass a capped or partial d (LMUL < 1).
// RVV, pass a capped or partial d (LMUL < 1). Use HWY_MAX_BYTES instead of
// HWY_LANES to account for the largest possible LMUL.
constexpr bool kPotentiallyHuge =
HWY_MAX_BYTES / sizeof(T) > Constants::kMaxRows * Constants::kMaxCols;
const bool huge_vec = kPotentiallyHuge && (2 * N > base_case_num);
@ -661,7 +883,7 @@ bool HandleSpecialCases(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
// Small arrays: use sorting network, no need for other checks.
if (HWY_UNLIKELY(num <= base_case_num)) {
BaseCase(d, st, keys, num, buf);
BaseCase(d, st, keys, keys + num, num, buf);
return true;
}
@ -671,7 +893,7 @@ bool HandleSpecialCases(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
return false; // not finished sorting
}
#endif // HWY_TARGET != HWY_SCALAR
#endif // VQSORT_ENABLED
} // namespace detail
// Sorts `keys[0..num-1]` according to the order defined by `st.Compare`.
@ -683,17 +905,26 @@ bool HandleSpecialCases(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
// sampling only from the first 256 GiB.
//
// `d` is typically SortTag<T> (chooses between full and partial vectors).
// `st` is SharedTraits<{LaneTraits|Traits128}<Order*>>. This abstraction layer
// bridges differences in sort order and single-lane vs 128-bit keys.
// `st` is SharedTraits<Traits*<Order*>>. This abstraction layer bridges
// differences in sort order and single-lane vs 128-bit keys.
template <class D, class Traits, typename T>
void Sort(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
T* HWY_RESTRICT buf) {
#if HWY_TARGET == HWY_SCALAR
(void)d;
(void)buf;
// PERFORMANCE WARNING: vqsort is not enabled for the non-SIMD target
return detail::HeapSort(st, keys, num);
#else
#if VQSORT_PRINT
fprintf(stderr, "=============== Sort num %zu\n", num);
#endif
#if VQSORT_ENABLED || HWY_IDE
#if !HWY_HAVE_SCALABLE
// On targets with fixed-size vectors, avoid _using_ the allocated memory.
// We avoid (potentially expensive for small input sizes) allocations on
// platforms where no targets are scalable. For 512-bit vectors, this fits on
// the stack (several KiB).
HWY_ALIGN T storage[SortConstants::BufNum<T>(HWY_LANES(T))] = {};
static_assert(sizeof(storage) <= 8192, "Unexpectedly large, check size");
buf = storage;
#endif // !HWY_HAVE_SCALABLE
if (detail::HandleSpecialCases(d, st, keys, num, buf)) return;
#if HWY_MAX_BYTES > 64
@ -705,13 +936,22 @@ void Sort(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
// Pulled out of the recursion so we can special-case degenerate partitions.
detail::Generator rng(keys, num);
const Vec<D> pivot = detail::ChoosePivot(d, st, keys, 0, num, buf, rng);
detail::PivotResult result;
const Vec<D> pivot =
detail::ChoosePivot(d, st, keys, 0, num, buf, rng, result);
// Introspection: switch to worst-case N*logN heapsort after this many.
const size_t max_levels = 2 * hwy::CeilLog2(num) + 4;
detail::Recurse(d, st, keys, 0, num, pivot, buf, rng, max_levels);
#endif // HWY_TARGET == HWY_SCALAR
if (result != detail::PivotResult::kAllEqual) {
// Introspection: switch to worst-case N*logN heapsort after this many.
const size_t max_levels = 2 * hwy::CeilLog2(num) + 4;
detail::Recurse(d, st, keys, keys + num, 0, num, pivot, buf, rng,
max_levels);
}
#else
(void)d;
(void)buf;
// PERFORMANCE WARNING: vqsort is not enabled for the non-SIMD target
return detail::HeapSort(st, keys, num);
#endif // VQSORT_ENABLED
}
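
A scalar sketch of the introspection guard used in Sort above: recurse with a budget of roughly 2*log2(num) + 4 levels and fall back to a guaranteed N*logN heapsort once it is exhausted. The helpers here come from the standard library, not from the diff:

#include <algorithm>
#include <cstddef>

void IntroSortScalar(int* keys, size_t num, size_t remaining_levels) {
  if (num < 2) return;
  if (remaining_levels == 0) {
    std::make_heap(keys, keys + num);  // worst-case N*logN fallback
    std::sort_heap(keys, keys + num);
    return;
  }
  const int pivot = keys[num / 2];
  int* bound = std::partition(keys, keys + num,
                              [pivot](int k) { return k <= pivot; });
  const size_t num_left = static_cast<size_t>(bound - keys);
  IntroSortScalar(keys, num_left, remaining_levels - 1);
  IntroSortScalar(bound, num - num_left, remaining_levels - 1);
}
// A caller would pass e.g. remaining_levels = 2 * ceil(log2(num)) + 4.
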
// NOLINTNEXTLINE(google-readability-namespace-comments)


@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -16,21 +17,64 @@
#include <string.h> // memset
#include "hwy/aligned_allocator.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/shared-inl.h"
// Architectures for which we know HWY_HAVE_SCALABLE == 0. This opts into an
// optimization that replaces dynamic allocation with stack storage.
#ifndef VQSORT_STACK
#if HWY_ARCH_X86 || HWY_ARCH_WASM
#define VQSORT_STACK 1
#else
#define VQSORT_STACK 0
#endif
#endif // VQSORT_STACK
#if !VQSORT_STACK
#include "hwy/aligned_allocator.h"
#endif
// Check if we have sys/random.h. First skip some systems on which the check
// itself (features.h) might be problematic.
#if defined(ANDROID) || defined(__ANDROID__) || HWY_ARCH_RVV
#define VQSORT_GETRANDOM 0
#endif
#if !defined(VQSORT_GETRANDOM) && HWY_OS_LINUX
#include <features.h>
// ---- which libc
#if defined(__UCLIBC__)
#define VQSORT_GETRANDOM 1 // added Mar 2015, before uclibc-ng 1.0
#elif defined(__GLIBC__) && defined(__GLIBC_PREREQ)
#if __GLIBC_PREREQ(2, 25)
#define VQSORT_GETRANDOM 1
#else
#define VQSORT_GETRANDOM 0
#endif
#else
// Assume MUSL, which has getrandom since 2018. There is no macro to test, see
// https://www.openwall.com/lists/musl/2013/03/29/13.
#define VQSORT_GETRANDOM 1
#endif // ---- which libc
#endif // linux
#if !defined(VQSORT_GETRANDOM)
#define VQSORT_GETRANDOM 0
#endif
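
A hedged sketch of how such a feature macro is typically consumed when seeding (the function name and fallback policy are illustrative, not the diff's Fill24Bytes):

#include <cstddef>
#include <cstdint>
#if VQSORT_GETRANDOM
#include <sys/random.h>  // getrandom()
#endif

void FillSeedBytes(void* bytes, size_t size, uint64_t fallback) {
#if VQSORT_GETRANDOM
  if (getrandom(bytes, size, /*flags=*/0) == static_cast<ssize_t>(size)) {
    return;  // OS-provided entropy
  }
#endif
  // Weaker fallback: spread the caller-provided value across the buffer.
  unsigned char* out = static_cast<unsigned char*>(bytes);
  for (size_t i = 0; i < size; ++i) {
    out[i] = static_cast<unsigned char>(fallback >> ((i % 8) * 8));
  }
}
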
// Seed source for SFC generator: 1=getrandom, 2=CryptGenRandom
// (not all Android support the getrandom wrapper)
#ifndef VQSORT_SECURE_SEED
#if (defined(linux) || defined(__linux__)) && \
!(defined(ANDROID) || defined(__ANDROID__) || HWY_ARCH_RVV)
#if VQSORT_GETRANDOM
#define VQSORT_SECURE_SEED 1
#elif defined(_WIN32) || defined(_WIN64)
#define VQSORT_SECURE_SEED 2
@ -47,7 +91,7 @@
#include <sys/random.h>
#elif VQSORT_SECURE_SEED == 2
#include <windows.h>
#pragma comment(lib, "Advapi32.lib")
#pragma comment(lib, "advapi32.lib")
// Must come after windows.h.
#include <wincrypt.h>
#endif // VQSORT_SECURE_SEED
@ -72,40 +116,32 @@ namespace {
HWY_EXPORT(VectorSize);
HWY_EXPORT(HaveFloat64);
HWY_INLINE size_t PivotBufNum(size_t sizeof_t, size_t N) {
// 3 chunks of medians, 1 chunk of median medians plus two padding vectors.
const size_t lpc = SortConstants::LanesPerChunk(sizeof_t, N);
return (3 + 1) * lpc + 2 * N;
}
} // namespace
Sorter::Sorter() {
#if VQSORT_STACK
ptr_ = nullptr; // Sort will use stack storage instead
#else
// Determine the largest buffer size required for any type by trying them all.
// (The capping of N in BaseCaseNum means that smaller N but larger sizeof_t
// may require a larger buffer.)
const size_t vector_size = HWY_DYNAMIC_DISPATCH(VectorSize)();
size_t max_bytes = 0;
for (size_t sizeof_t :
{sizeof(uint16_t), sizeof(uint32_t), sizeof(uint64_t)}) {
const size_t N = vector_size / sizeof_t;
// One extra for padding plus another for full-vector loads.
const size_t base_case = SortConstants::BaseCaseNum(N) + 2 * N;
const size_t partition_num = SortConstants::PartitionBufNum(N);
const size_t buf_lanes =
HWY_MAX(base_case, HWY_MAX(partition_num, PivotBufNum(sizeof_t, N)));
max_bytes = HWY_MAX(max_bytes, buf_lanes * sizeof_t);
}
const size_t max_bytes =
HWY_MAX(HWY_MAX(SortConstants::BufBytes<uint16_t>(vector_size),
SortConstants::BufBytes<uint32_t>(vector_size)),
SortConstants::BufBytes<uint64_t>(vector_size));
ptr_ = hwy::AllocateAlignedBytes(max_bytes, nullptr, nullptr);
// Prevent msan errors by initializing.
memset(ptr_, 0, max_bytes);
#endif
}
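
A hedged usage sketch of the public interface declared in vqsort.h: one Sorter instance owns the scratch buffer (or none at all when VQSORT_STACK is in effect), so it can be constructed once and reused:

#include <cstdint>
#include <vector>
#include "hwy/contrib/sort/vqsort.h"

void SortAscendingU64(std::vector<uint64_t>& values) {
  hwy::Sorter sorter;  // allocates (or skips) the per-type scratch buffer
  sorter(values.data(), values.size(), hwy::SortAscending());
}
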
void Sorter::Delete() {
#if !VQSORT_STACK
FreeAlignedBytes(ptr_, nullptr, nullptr);
ptr_ = nullptr;
#endif
}
#if !VQSORT_SECURE_RNG

third_party/highway/hwy/contrib/sort/vqsort.h (vendored)

@ -1,4 +1,5 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -13,6 +14,12 @@
// limitations under the License.
// Interface to vectorized quicksort with dynamic dispatch.
// Blog post: https://tinyurl.com/vqsort-blog
// Paper with measurements: https://arxiv.org/abs/2205.05982
//
// To ensure the overhead of using wide vectors (e.g. AVX2 or AVX-512) is
// worthwhile, we recommend using this code for sorting arrays whose size is at
// least 512 KiB.
#ifndef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_
#define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_
@ -21,15 +28,6 @@
namespace hwy {
// Aligned 128-bit type. Cannot use __int128 because clang doesn't yet align it:
// https://reviews.llvm.org/D86310
#pragma pack(push, 1)
struct alignas(16) uint128_t {
uint64_t lo; // little-endian layout
uint64_t hi;
};
#pragma pack(pop)
// Tag arguments that determine the sort order.
struct SortAscending {
constexpr bool IsAscending() const { return true; }
@ -84,6 +82,9 @@ class HWY_CONTRIB_DLLEXPORT Sorter {
void operator()(uint128_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
void operator()(uint128_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
void operator()(K64V64* HWY_RESTRICT keys, size_t n, SortAscending) const;
void operator()(K64V64* HWY_RESTRICT keys, size_t n, SortDescending) const;
// For internal use only
static void Fill24Bytes(const void* seed_heap, size_t seed_num, void* bytes);
static bool HaveFloat64();


@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -12,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_128a.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits128-inl.h"
@ -29,9 +29,16 @@ namespace HWY_NAMESPACE {
void Sort128Asc(uint64_t* HWY_RESTRICT keys, size_t num,
uint64_t* HWY_RESTRICT buf) {
#if VQSORT_ENABLED
SortTag<uint64_t> d;
detail::SharedTraits<detail::Traits128<detail::OrderAscending128>> st;
Sort(d, st, keys, num, buf);
#else
(void) keys;
(void) num;
(void) buf;
HWY_ASSERT(0);
#endif
}
// NOLINTNEXTLINE(google-readability-namespace-comments)


@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -12,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_128d.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits128-inl.h"
@ -29,9 +29,16 @@ namespace HWY_NAMESPACE {
void Sort128Desc(uint64_t* HWY_RESTRICT keys, size_t num,
uint64_t* HWY_RESTRICT buf) {
#if VQSORT_ENABLED
SortTag<uint64_t> d;
detail::SharedTraits<detail::Traits128<detail::OrderDescending128>> st;
Sort(d, st, keys, num, buf);
#else
(void) keys;
(void) num;
(void) buf;
HWY_ASSERT(0);
#endif
}
// NOLINTNEXTLINE(google-readability-namespace-comments)


@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -12,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f32a.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
@ -29,7 +29,7 @@ namespace HWY_NAMESPACE {
void SortF32Asc(float* HWY_RESTRICT keys, size_t num, float* HWY_RESTRICT buf) {
SortTag<float> d;
detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<float>>> st;
Sort(d, st, keys, num, buf);
}


@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -12,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f32d.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
@ -30,7 +30,7 @@ namespace HWY_NAMESPACE {
void SortF32Desc(float* HWY_RESTRICT keys, size_t num,
float* HWY_RESTRICT buf) {
SortTag<float> d;
detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st;
detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<float>>> st;
Sort(d, st, keys, num, buf);
}


@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -12,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f64a.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
@ -31,7 +31,7 @@ void SortF64Asc(double* HWY_RESTRICT keys, size_t num,
double* HWY_RESTRICT buf) {
#if HWY_HAVE_FLOAT64
SortTag<double> d;
detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<double>>> st;
Sort(d, st, keys, num, buf);
#else
(void)keys;


@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -12,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f64d.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
@ -31,7 +31,7 @@ void SortF64Desc(double* HWY_RESTRICT keys, size_t num,
double* HWY_RESTRICT buf) {
#if HWY_HAVE_FLOAT64
SortTag<double> d;
detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st;
detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<double>>> st;
Sort(d, st, keys, num, buf);
#else
(void)keys;


@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -12,20 +13,16 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i16a.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
// Workaround for build timeout
#if !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
@ -33,7 +30,7 @@ namespace HWY_NAMESPACE {
void SortI16Asc(int16_t* HWY_RESTRICT keys, size_t num,
int16_t* HWY_RESTRICT buf) {
SortTag<int16_t> d;
detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<int16_t>>> st;
Sort(d, st, keys, num, buf);
}
@ -55,5 +52,3 @@ void Sorter::operator()(int16_t* HWY_RESTRICT keys, size_t n,
} // namespace hwy
#endif // HWY_ONCE
#endif // !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD


@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -12,20 +13,16 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i16d.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
// Workaround for build timeout
#if !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
@ -33,7 +30,7 @@ namespace HWY_NAMESPACE {
void SortI16Desc(int16_t* HWY_RESTRICT keys, size_t num,
int16_t* HWY_RESTRICT buf) {
SortTag<int16_t> d;
detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st;
detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<int16_t>>> st;
Sort(d, st, keys, num, buf);
}
@ -55,5 +52,3 @@ void Sorter::operator()(int16_t* HWY_RESTRICT keys, size_t n,
} // namespace hwy
#endif // HWY_ONCE
#endif // !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD


@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -12,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i32a.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
@ -30,7 +30,7 @@ namespace HWY_NAMESPACE {
void SortI32Asc(int32_t* HWY_RESTRICT keys, size_t num,
int32_t* HWY_RESTRICT buf) {
SortTag<int32_t> d;
detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<int32_t>>> st;
Sort(d, st, keys, num, buf);
}


@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -12,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i32d.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
@ -30,7 +30,7 @@ namespace HWY_NAMESPACE {
void SortI32Desc(int32_t* HWY_RESTRICT keys, size_t num,
int32_t* HWY_RESTRICT buf) {
SortTag<int32_t> d;
detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st;
detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<int32_t>>> st;
Sort(d, st, keys, num, buf);
}


@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -12,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i64a.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
@ -30,7 +30,7 @@ namespace HWY_NAMESPACE {
void SortI64Asc(int64_t* HWY_RESTRICT keys, size_t num,
int64_t* HWY_RESTRICT buf) {
SortTag<int64_t> d;
detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<int64_t>>> st;
Sort(d, st, keys, num, buf);
}


@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -12,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i64d.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
@ -30,7 +30,7 @@ namespace HWY_NAMESPACE {
void SortI64Desc(int64_t* HWY_RESTRICT keys, size_t num,
int64_t* HWY_RESTRICT buf) {
SortTag<int64_t> d;
detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st;
detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<int64_t>>> st;
Sort(d, st, keys, num, buf);
}

third_party/highway/hwy/contrib/sort/vqsort_kv128a.cc (vendored, new file)

@ -0,0 +1,65 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
// clang-format off
// (avoid line break, which would prevent Copybara rules from matching)
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv128a.cc" //NOLINT
// clang-format on
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits128-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
void SortKV128Asc(uint64_t* HWY_RESTRICT keys, size_t num,
uint64_t* HWY_RESTRICT buf) {
#if VQSORT_ENABLED
SortTag<uint64_t> d;
detail::SharedTraits<detail::Traits128<detail::OrderAscendingKV128>> st;
Sort(d, st, keys, num, buf);
#else
(void) keys;
(void) num;
(void) buf;
HWY_ASSERT(0);
#endif
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortKV128Asc);
} // namespace
void Sorter::operator()(K64V64* HWY_RESTRICT keys, size_t n,
SortAscending) const {
HWY_DYNAMIC_DISPATCH(SortKV128Asc)
(reinterpret_cast<uint64_t*>(keys), n * 2, Get<uint64_t>());
}
} // namespace hwy
#endif // HWY_ONCE

third_party/highway/hwy/contrib/sort/vqsort_kv128d.cc (vendored, new file)

@ -0,0 +1,65 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
// clang-format off
// (avoid line break, which would prevent Copybara rules from matching)
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv128d.cc" //NOLINT
// clang-format on
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits128-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
void SortKV128Desc(uint64_t* HWY_RESTRICT keys, size_t num,
uint64_t* HWY_RESTRICT buf) {
#if VQSORT_ENABLED
SortTag<uint64_t> d;
detail::SharedTraits<detail::Traits128<detail::OrderDescendingKV128>> st;
Sort(d, st, keys, num, buf);
#else
(void) keys;
(void) num;
(void) buf;
HWY_ASSERT(0);
#endif
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortKV128Desc);
} // namespace
void Sorter::operator()(K64V64* HWY_RESTRICT keys, size_t n,
SortDescending) const {
HWY_DYNAMIC_DISPATCH(SortKV128Desc)
(reinterpret_cast<uint64_t*>(keys), n * 2, Get<uint64_t>());
}
} // namespace hwy
#endif // HWY_ONCE


@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -12,20 +13,16 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u16a.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
// Workaround for build timeout
#if !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
@ -33,7 +30,7 @@ namespace HWY_NAMESPACE {
void SortU16Asc(uint16_t* HWY_RESTRICT keys, size_t num,
uint16_t* HWY_RESTRICT buf) {
SortTag<uint16_t> d;
detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<uint16_t>>> st;
Sort(d, st, keys, num, buf);
}
@ -55,5 +52,3 @@ void Sorter::operator()(uint16_t* HWY_RESTRICT keys, size_t n,
} // namespace hwy
#endif // HWY_ONCE
#endif // !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD


@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -12,20 +13,16 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u16d.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
// Workaround for build timeout
#if !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
@ -33,7 +30,8 @@ namespace HWY_NAMESPACE {
void SortU16Desc(uint16_t* HWY_RESTRICT keys, size_t num,
uint16_t* HWY_RESTRICT buf) {
SortTag<uint16_t> d;
detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st;
detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<uint16_t>>>
st;
Sort(d, st, keys, num, buf);
}
@ -55,5 +53,3 @@ void Sorter::operator()(uint16_t* HWY_RESTRICT keys, size_t n,
} // namespace hwy
#endif // HWY_ONCE
#endif // !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD


@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -12,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u32a.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
@ -30,7 +30,7 @@ namespace HWY_NAMESPACE {
void SortU32Asc(uint32_t* HWY_RESTRICT keys, size_t num,
uint32_t* HWY_RESTRICT buf) {
SortTag<uint32_t> d;
detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<uint32_t>>> st;
Sort(d, st, keys, num, buf);
}


@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -12,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u32d.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
@ -30,7 +30,8 @@ namespace HWY_NAMESPACE {
void SortU32Desc(uint32_t* HWY_RESTRICT keys, size_t num,
uint32_t* HWY_RESTRICT buf) {
SortTag<uint32_t> d;
detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st;
detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<uint32_t>>>
st;
Sort(d, st, keys, num, buf);
}


@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -12,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u64a.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
@ -30,7 +30,7 @@ namespace HWY_NAMESPACE {
void SortU64Asc(uint64_t* HWY_RESTRICT keys, size_t num,
uint64_t* HWY_RESTRICT buf) {
SortTag<uint64_t> d;
detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<uint64_t>>> st;
Sort(d, st, keys, num, buf);
}


@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -12,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u64d.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
@ -30,7 +30,8 @@ namespace HWY_NAMESPACE {
void SortU64Desc(uint64_t* HWY_RESTRICT keys, size_t num,
uint64_t* HWY_RESTRICT buf) {
SortTag<uint64_t> d;
detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st;
detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<uint64_t>>>
st;
Sort(d, st, keys, num, buf);
}


@ -1,4 +1,5 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -29,34 +30,45 @@
//------------------------------------------------------------------------------
// Compiler
// clang-cl defines _MSC_VER but doesn't behave like MSVC in other aspects like
// used in HWY_DIAGNOSTICS(). We include a check that we are not clang for that
// purpose.
// Actual MSVC, not clang-cl, which defines _MSC_VER but doesn't behave like
// MSVC in other aspects (e.g. HWY_DIAGNOSTICS).
#if defined(_MSC_VER) && !defined(__clang__)
#define HWY_COMPILER_MSVC _MSC_VER
#else
#define HWY_COMPILER_MSVC 0
#endif
#if defined(_MSC_VER) && defined(__clang__)
#define HWY_COMPILER_CLANGCL _MSC_VER
#else
#define HWY_COMPILER_CLANGCL 0
#endif
#ifdef __INTEL_COMPILER
#define HWY_COMPILER_ICC __INTEL_COMPILER
#else
#define HWY_COMPILER_ICC 0
#endif
// HWY_COMPILER_GCC is a generic macro for all compilers implementing the GNU
// compiler extensions (e.g. Clang, Intel...)
#ifdef __GNUC__
#define HWY_COMPILER_GCC (__GNUC__ * 100 + __GNUC_MINOR__)
#else
#define HWY_COMPILER_GCC 0
#endif
// Clang can masquerade as MSVC/GCC, in which case both are set.
// Clang or clang-cl, not GCC.
#ifdef __clang__
#ifdef __APPLE__
// Apple LLVM version is unrelated to the actual Clang version, which we need
// for enabling workarounds. Use the presence of warning flags to deduce it.
// In case of Apple LLVM (whose version number is unrelated to that of LLVM) or
// an invalid version number, deduce it from the presence of warnings.
// Adapted from https://github.com/simd-everywhere/simde/ simde-detect-clang.h.
#if __has_warning("-Wformat-insufficient-args")
#if defined(__APPLE__) || __clang_major__ >= 999
#if __has_warning("-Wbitwise-instead-of-logical")
#define HWY_COMPILER_CLANG 1400
#elif __has_warning("-Wreserved-identifier")
#define HWY_COMPILER_CLANG 1300
#elif __has_warning("-Wformat-insufficient-args")
#define HWY_COMPILER_CLANG 1200
#elif __has_warning("-Wimplicit-const-int-float-conversion")
#define HWY_COMPILER_CLANG 1100
@ -72,19 +84,32 @@
#else // Anything older than 7.0 is not recommended for Highway.
#define HWY_COMPILER_CLANG 600
#endif // __has_warning chain
#else // Non-Apple: normal version
#else // use normal version
#define HWY_COMPILER_CLANG (__clang_major__ * 100 + __clang_minor__)
#endif
#else // Not clang
#define HWY_COMPILER_CLANG 0
#endif
#if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG
#define HWY_COMPILER_GCC_ACTUAL HWY_COMPILER_GCC
#else
#define HWY_COMPILER_GCC_ACTUAL 0
#endif
// More than one may be nonzero, but we want at least one.
#if !HWY_COMPILER_MSVC && !HWY_COMPILER_ICC && !HWY_COMPILER_GCC && \
!HWY_COMPILER_CLANG
#if 0 == (HWY_COMPILER_MSVC + HWY_COMPILER_CLANGCL + HWY_COMPILER_ICC + \
HWY_COMPILER_GCC + HWY_COMPILER_CLANG)
#error "Unsupported compiler"
#endif
// We should only detect one of these (only clang/clangcl overlap)
#if 1 < \
(!!HWY_COMPILER_MSVC + !!HWY_COMPILER_ICC + !!HWY_COMPILER_GCC_ACTUAL + \
!!(HWY_COMPILER_CLANGCL | HWY_COMPILER_CLANG))
#error "Detected multiple compilers"
#endif
#ifdef __has_builtin
#define HWY_HAS_BUILTIN(name) __has_builtin(name)
#else
@ -140,7 +165,7 @@
#define HWY_ARCH_ARM_A64 0
#endif
#if defined(__arm__) || defined(_M_ARM)
#if (defined(__ARM_ARCH) && __ARM_ARCH == 7) || (defined(_M_ARM) && _M_ARM == 7)
#define HWY_ARCH_ARM_V7 1
#else
#define HWY_ARCH_ARM_V7 0
@ -150,12 +175,20 @@
#error "Cannot have both A64 and V7"
#endif
// Any *supported* version of Arm, i.e. 7 or later
#if HWY_ARCH_ARM_A64 || HWY_ARCH_ARM_V7
#define HWY_ARCH_ARM 1
#else
#define HWY_ARCH_ARM 0
#endif
// Older than v7 (e.g. armel aka Arm v5), in which case we do not support SIMD.
#if (defined(__arm__) || defined(_M_ARM)) && !HWY_ARCH_ARM
#define HWY_ARCH_ARM_OLD 1
#else
#define HWY_ARCH_ARM_OLD 0
#endif
#if defined(__EMSCRIPTEN__) || defined(__wasm__) || defined(__WASM__)
#define HWY_ARCH_WASM 1
#else
@ -170,9 +203,21 @@
// It is an error to detect multiple architectures at the same time, but OK to
// detect none of the above.
#if (HWY_ARCH_X86 + HWY_ARCH_PPC + HWY_ARCH_ARM + HWY_ARCH_WASM + \
HWY_ARCH_RVV) > 1
#if (HWY_ARCH_X86 + HWY_ARCH_PPC + HWY_ARCH_ARM + HWY_ARCH_ARM_OLD + \
HWY_ARCH_WASM + HWY_ARCH_RVV) > 1
#error "Must not detect more than one architecture"
#endif
#if defined(_WIN32) || defined(_WIN64)
#define HWY_OS_WIN 1
#else
#define HWY_OS_WIN 0
#endif
#if defined(linux) || defined(__linux__)
#define HWY_OS_LINUX 1
#else
#define HWY_OS_LINUX 0
#endif
#endif // HIGHWAY_HWY_DETECT_COMPILER_ARCH_H_

249
third_party/highway/hwy/detect_targets.h vendored

@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -37,6 +38,10 @@
// AVX2 target for VMs which support AVX2 but not the other instruction sets)
// #define HWY_DISABLE_BMI2_FMA
// Uncomment to enable SSSE3/SSE4 on MSVC even if AVX is not enabled
// #define HWY_WANT_SSSE3
// #define HWY_WANT_SSE4
//------------------------------------------------------------------------------
// Targets
@ -46,59 +51,71 @@
// All values are unconditionally defined so we can test HWY_TARGETS without
// first checking the HWY_ARCH_*.
//
// The C99 preprocessor evaluates #if expressions using intmax_t types, so we
// can use 32-bit literals.
// 1,2: reserved
// Currently satisfiable by Ice Lake (VNNI, VPCLMULQDQ, VBMI2, VAES). Later to
// be added: BF16 (Cooper Lake). VP2INTERSECT is only in Tiger Lake? We do not
// yet have uses for VBMI, VPOPCNTDQ, BITALG, GFNI.
#define HWY_AVX3_DL 4 // see HWY_WANT_AVX3_DL below
#define HWY_AVX3 8
#define HWY_AVX2 16
// 32: reserved for AVX
#define HWY_SSE4 64
#define HWY_SSSE3 128
// 0x100, 0x200: reserved for SSE3, SSE2
// The C99 preprocessor evaluates #if expressions using intmax_t types. This
// holds at least 64 bits in practice (verified 2022-07-18 via Godbolt on
// 32-bit clang/GCC/MSVC compilers for x86/Arm7/AArch32/RISC-V/WASM). We now
// avoid overflow when computing HWY_TARGETS (subtracting one instead of
// left-shifting 2^62), but still do not use bit 63 because it is the sign bit.
// --------------------------- x86: 15 targets (+ one fallback)
// Bits 0..6 reserved (7 targets)
// Currently satisfiable by Ice Lake (VNNI, VPCLMULQDQ, VPOPCNTDQ, VBMI, VBMI2,
// VAES, BITALG). Later to be added: BF16 (Cooper Lake). VP2INTERSECT is only in
// Tiger Lake? We do not yet have uses for GFNI.
#define HWY_AVX3_DL (1LL << 7) // see HWY_WANT_AVX3_DL below
#define HWY_AVX3 (1LL << 8)
#define HWY_AVX2 (1LL << 9)
// Bit 10: reserved for AVX
#define HWY_SSE4 (1LL << 11)
#define HWY_SSSE3 (1LL << 12)
// Bits 13..14 reserved for SSE3 or SSE2 (2 targets)
// The highest bit in the HWY_TARGETS mask that a x86 target can have. Used for
// dynamic dispatch. All x86 target bits must be lower or equal to
// (1 << HWY_HIGHEST_TARGET_BIT_X86) and they can only use
// HWY_MAX_DYNAMIC_TARGETS in total.
#define HWY_HIGHEST_TARGET_BIT_X86 9
#define HWY_HIGHEST_TARGET_BIT_X86 14
#define HWY_SVE2 0x400
#define HWY_SVE 0x800
// 0x1000 reserved for Helium
#define HWY_NEON 0x2000
// --------------------------- Arm: 15 targets (+ one fallback)
// Bits 15..23 reserved (9 targets)
#define HWY_SVE2_128 (1LL << 24) // specialized target (e.g. Arm N2)
#define HWY_SVE_256 (1LL << 25) // specialized target (e.g. Arm V1)
#define HWY_SVE2 (1LL << 26)
#define HWY_SVE (1LL << 27)
#define HWY_NEON (1LL << 28) // On A64, includes/requires AES
// Bit 29 reserved (Helium?)
#define HWY_HIGHEST_TARGET_BIT_ARM 29
#define HWY_HIGHEST_TARGET_BIT_ARM 13
// --------------------------- RISC-V: 9 targets (+ one fallback)
// Bits 30..36 reserved (7 targets)
#define HWY_RVV (1LL << 37)
// Bit 38 reserved
#define HWY_HIGHEST_TARGET_BIT_RVV 38
// 0x4000, 0x8000 reserved
#define HWY_PPC8 0x10000 // v2.07 or 3
// 0x20000, 0x40000 reserved for prior VSX/AltiVec
// --------------------------- Future expansion: 4 targets
// Bits 39..42 reserved
#define HWY_HIGHEST_TARGET_BIT_PPC 18
#define HWY_WASM2 0x80000 // Experimental
#define HWY_WASM 0x100000
// --------------------------- IBM Power: 9 targets (+ one fallback)
// Bits 43..48 reserved (6 targets)
#define HWY_PPC8 (1LL << 49) // v2.07 or 3
// Bits 50..51 reserved for prior VSX/AltiVec (2 targets)
#define HWY_HIGHEST_TARGET_BIT_PPC 51
#define HWY_HIGHEST_TARGET_BIT_WASM 20
// --------------------------- WebAssembly: 9 targets (+ one fallback)
// Bits 52..57 reserved (6 targets)
#define HWY_WASM_EMU256 (1LL << 58) // Experimental
#define HWY_WASM (1LL << 59)
// Bit 60 reserved
#define HWY_HIGHEST_TARGET_BIT_WASM 60
// 0x200000, 0x400000, 0x800000 reserved
// --------------------------- Emulation: 2 targets
#define HWY_RVV 0x1000000
#define HWY_EMU128 (1LL << 61)
// We do not add/left-shift, so this will not overflow to a negative number.
#define HWY_SCALAR (1LL << 62)
#define HWY_HIGHEST_TARGET_BIT_SCALAR 62
#define HWY_HIGHEST_TARGET_BIT_RVV 24
// 0x2000000, 0x4000000, 0x8000000, 0x10000000 reserved
#define HWY_SCALAR 0x20000000
#define HWY_HIGHEST_TARGET_BIT_SCALAR 29
// Cannot use higher values, otherwise HWY_TARGETS computation might overflow.
// Do not use bit 63 - would be confusing to have negative numbers.
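To make the new 64-bit layout concrete, an illustrative membership check (not from the patch); every target is now a distinct bit, so both preprocessor and runtime code can test the mask directly:
// HWY_AVX2 is (1LL << 9); the checks below are plain bitwise tests.
#if HWY_TARGETS & HWY_AVX2
// An AVX2 code path will be generated for dynamic dispatch.
#endif
#if HWY_STATIC_TARGET == HWY_AVX2
// The always-available baseline itself is AVX2.
#endif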
//------------------------------------------------------------------------------
// Set default blocklists
@ -138,9 +155,9 @@
#define HWY_BROKEN_TARGETS (HWY_NEON)
// SVE[2] require recent clang or gcc versions.
#elif (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1100) ||\
(!HWY_COMPILER_CLANG && HWY_COMPILER_GCC && HWY_COMPILER_GCC < 1000)
#define HWY_BROKEN_TARGETS (HWY_SVE | HWY_SVE2)
#elif (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1100) || \
(HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1000)
#define HWY_BROKEN_TARGETS (HWY_SVE | HWY_SVE2 | HWY_SVE_256 | HWY_SVE2_128)
#else
#define HWY_BROKEN_TARGETS 0
@ -152,24 +169,41 @@
#define HWY_ENABLED(targets) \
((targets) & ~((HWY_DISABLED_TARGETS) | (HWY_BROKEN_TARGETS)))
// Opt-out for EMU128 (affected by a GCC <12 bug on ARMv7: see
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106187). This is separate from
// HWY_BROKEN_TARGETS because it affects the fallback target, which must always
// be enabled. If 1, we instead choose HWY_SCALAR even without
// HWY_COMPILE_ONLY_SCALAR being set.
#if !defined(HWY_BROKEN_EMU128) // allow overriding
#if HWY_ARCH_ARM_V7 && HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1140
#define HWY_BROKEN_EMU128 1
#else
#define HWY_BROKEN_EMU128 0
#endif
#endif // HWY_BROKEN_EMU128
//------------------------------------------------------------------------------
// Detect baseline targets using predefined macros
// Baseline means the targets for which the compiler is allowed to generate
// instructions, implying the target CPU would have to support them. Do not use
// this directly because it does not take the blocklist into account. Allow the
// user to override this without any guarantee of success.
#ifndef HWY_BASELINE_TARGETS
// instructions, implying the target CPU would have to support them. This does
// not take the blocklist into account.
#if defined(HWY_COMPILE_ONLY_SCALAR) || HWY_BROKEN_EMU128
#define HWY_BASELINE_SCALAR HWY_SCALAR
#else
#define HWY_BASELINE_SCALAR HWY_EMU128
#endif
// Also check HWY_ARCH to ensure that simulating unknown platforms ends up with
// HWY_TARGET == HWY_SCALAR.
// HWY_TARGET == HWY_BASELINE_SCALAR.
#if HWY_ARCH_WASM && defined(__wasm_simd128__)
#if defined(HWY_WANT_WASM2)
#define HWY_BASELINE_WASM HWY_WASM2
#define HWY_BASELINE_WASM HWY_WASM_EMU256
#else
#define HWY_BASELINE_WASM HWY_WASM
#endif // HWY_WANT_WASM2
#endif // HWY_WANT_WASM2
#else
#define HWY_BASELINE_WASM 0
#endif
@ -181,7 +215,6 @@
#define HWY_BASELINE_PPC8 0
#endif
// SVE2 compiles, but is not yet tested.
#if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE2)
#define HWY_BASELINE_SVE2 HWY_SVE2
#else
@ -189,6 +222,11 @@
#endif
#if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE)
// Baseline targets can be used unconditionally, which does not apply to
// HWY_SVE_256 because it requires a vector size of 256 bits. Including SVE_256
// in the baseline would also disable all 'worse' targets (including SVE and
// SVE2) in non-test builds. Therefore we instead add HWY_SVE_256 to
// HWY_ATTAINABLE_TARGETS below.
#define HWY_BASELINE_SVE HWY_SVE
#else
#define HWY_BASELINE_SVE 0
@ -201,11 +239,11 @@
#define HWY_BASELINE_NEON 0
#endif
// Special handling for MSVC because it has fewer predefined macros
#if HWY_COMPILER_MSVC && !HWY_COMPILER_CLANG
// Special handling for MSVC because it has fewer predefined macros:
#if HWY_COMPILER_MSVC
// We can only be sure SSSE3/SSE4 are enabled if AVX is
// (https://stackoverflow.com/questions/18563978/)
// 1) We can only be sure SSSE3/SSE4 are enabled if AVX is:
// https://stackoverflow.com/questions/18563978/.
#if defined(__AVX__)
#define HWY_CHECK_SSSE3 1
#define HWY_CHECK_SSE4 1
@ -214,8 +252,8 @@
#define HWY_CHECK_SSE4 0
#endif
// Cannot check for PCLMUL/AES and BMI2/FMA/F16C individually; we assume
// PCLMUL/AES are available if SSE4 is, and BMI2/FMA/F16C if AVX2 is.
// 2) Cannot check for PCLMUL/AES and BMI2/FMA/F16C individually; we assume
// PCLMUL/AES are available if SSE4 is, and BMI2/FMA/F16C if AVX2 is.
#define HWY_CHECK_PCLMUL_AES 1
#define HWY_CHECK_BMI2_FMA 1
#define HWY_CHECK_F16C 1
@ -255,13 +293,13 @@
#endif // non-MSVC
#if HWY_ARCH_X86 && HWY_CHECK_SSSE3
#if HWY_ARCH_X86 && (HWY_WANT_SSSE3 || HWY_CHECK_SSSE3)
#define HWY_BASELINE_SSSE3 HWY_SSSE3
#else
#define HWY_BASELINE_SSSE3 0
#endif
#if HWY_ARCH_X86 && HWY_CHECK_SSE4 && HWY_CHECK_PCLMUL_AES
#if HWY_ARCH_X86 && (HWY_WANT_SSE4 || (HWY_CHECK_SSE4 && HWY_CHECK_PCLMUL_AES))
#define HWY_BASELINE_SSE4 HWY_SSE4
#else
#define HWY_BASELINE_SSE4 0
@ -284,7 +322,9 @@
// TODO(janwas): not yet known whether these will be set by MSVC
#if HWY_BASELINE_AVX3 != 0 && defined(__AVXVNNI__) && defined(__VAES__) && \
defined(__VPCLMULQDQ__)
defined(__VPCLMULQDQ__) && defined(__AVX512VBMI__) && \
defined(__AVX512VBMI2__) && defined(__AVX512VPOPCNTDQ__) && \
defined(__AVX512BITALG__)
#define HWY_BASELINE_AVX3_DL HWY_AVX3_DL
#else
#define HWY_BASELINE_AVX3_DL 0
@ -296,16 +336,13 @@
#define HWY_BASELINE_RVV 0
#endif
#define HWY_BASELINE_TARGETS \
(HWY_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 | HWY_BASELINE_SVE2 | \
HWY_BASELINE_SVE | HWY_BASELINE_NEON | HWY_BASELINE_SSSE3 | \
HWY_BASELINE_SSE4 | HWY_BASELINE_AVX2 | HWY_BASELINE_AVX3 | \
HWY_BASELINE_AVX3_DL | HWY_BASELINE_RVV)
#else
// User already defined HWY_BASELINE_TARGETS, but we still need to define
// HWY_BASELINE_AVX3 (matching user's definition) for HWY_CHECK_AVX3_DL.
#define HWY_BASELINE_AVX3_DL (HWY_BASELINE_TARGETS & HWY_AVX3_DL)
// Allow the user to override this without any guarantee of success.
#ifndef HWY_BASELINE_TARGETS
#define HWY_BASELINE_TARGETS \
(HWY_BASELINE_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 | \
HWY_BASELINE_SVE2 | HWY_BASELINE_SVE | HWY_BASELINE_NEON | \
HWY_BASELINE_SSSE3 | HWY_BASELINE_SSE4 | HWY_BASELINE_AVX2 | \
HWY_BASELINE_AVX3 | HWY_BASELINE_AVX3_DL | HWY_BASELINE_RVV)
#endif // HWY_BASELINE_TARGETS
//------------------------------------------------------------------------------
@ -329,33 +366,68 @@
//------------------------------------------------------------------------------
// Choose targets for dynamic dispatch according to one of four policies
#if (defined(HWY_COMPILE_ONLY_SCALAR) + defined(HWY_COMPILE_ONLY_STATIC) + \
defined(HWY_COMPILE_ALL_ATTAINABLE)) > 1
#error "Invalid config: can only define a single policy for targets"
#if 1 < (defined(HWY_COMPILE_ONLY_SCALAR) + defined(HWY_COMPILE_ONLY_EMU128) + \
defined(HWY_COMPILE_ONLY_STATIC))
#error "Can only define one of HWY_COMPILE_ONLY_{SCALAR|EMU128|STATIC} - bug?"
#endif
// Defining one of HWY_COMPILE_ONLY_* will trump HWY_COMPILE_ALL_ATTAINABLE.
// x86 compilers generally allow runtime dispatch. On Arm, currently only GCC
// does, and we require Linux to detect CPU capabilities.
#if HWY_ARCH_X86 || (HWY_ARCH_ARM && HWY_COMPILER_GCC_ACTUAL && HWY_OS_LINUX)
#define HWY_HAVE_RUNTIME_DISPATCH 1
#else
#define HWY_HAVE_RUNTIME_DISPATCH 0
#endif
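An application-side sketch (the helper name is illustrative) of how this macro is typically consumed, assuming the int64_t SupportedTargets() accessor from targets.h:
#include <stdint.h>
#include "hwy/targets.h"
static int64_t TargetsForThisCpu() {
#if HWY_HAVE_RUNTIME_DISPATCH
  return hwy::SupportedTargets();  // query the CPU at runtime
#else
  return HWY_STATIC_TARGET;        // only the baseline is known to be safe
#endif
}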
// Further to checking for disabled/broken targets, we only use AVX3_DL after
// explicit opt-in (via this macro OR baseline compiler flags) to avoid
// generating a codepath which is only helpful if the app uses AVX3_DL features.
#if defined(HWY_WANT_AVX3_DL)
#define HWY_CHECK_AVX3_DL HWY_AVX3_DL
// AVX3_DL is not widely available yet. To reduce code size and compile time,
// only include it in the set of attainable targets (for dynamic dispatch) if
// the user opts in, OR it is in the baseline (we check whether enabled below).
#if defined(HWY_WANT_AVX3_DL) || (HWY_BASELINE & HWY_AVX3_DL)
#define HWY_ATTAINABLE_AVX3_DL HWY_AVX3_DL
#else
#define HWY_CHECK_AVX3_DL HWY_BASELINE_AVX3_DL
#define HWY_ATTAINABLE_AVX3_DL 0
#endif
#if HWY_ARCH_ARM_A64 && \
((HWY_ENABLED_BASELINE & HWY_SVE) || HWY_HAVE_RUNTIME_DISPATCH)
#define HWY_ATTAINABLE_SVE HWY_ENABLED(HWY_SVE | HWY_SVE_256)
#else
#define HWY_ATTAINABLE_SVE 0
#endif
#if HWY_ARCH_ARM_A64 && \
((HWY_ENABLED_BASELINE & HWY_SVE2) || HWY_HAVE_RUNTIME_DISPATCH)
#define HWY_ATTAINABLE_SVE2 HWY_ENABLED(HWY_SVE2 | HWY_SVE2_128)
#else
#define HWY_ATTAINABLE_SVE2 0
#endif
// Attainable means enabled and the compiler allows intrinsics (even when not
// allowed to autovectorize). Used in 3 and 4.
#if HWY_ARCH_X86
#define HWY_ATTAINABLE_TARGETS \
HWY_ENABLED(HWY_SCALAR | HWY_SSSE3 | HWY_SSE4 | HWY_AVX2 | HWY_AVX3 | \
HWY_CHECK_AVX3_DL)
#define HWY_ATTAINABLE_TARGETS \
HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_SSSE3 | HWY_SSE4 | HWY_AVX2 | \
HWY_AVX3 | HWY_ATTAINABLE_AVX3_DL)
#elif HWY_ARCH_ARM && HWY_HAVE_RUNTIME_DISPATCH
#define HWY_ATTAINABLE_TARGETS \
HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_NEON | HWY_ATTAINABLE_SVE | \
HWY_ATTAINABLE_SVE2)
#else
#define HWY_ATTAINABLE_TARGETS HWY_ENABLED_BASELINE
#define HWY_ATTAINABLE_TARGETS \
(HWY_ENABLED_BASELINE | HWY_ATTAINABLE_SVE | HWY_ATTAINABLE_SVE2)
#endif
// 1) For older compilers: disable all SIMD (could also set HWY_DISABLED_TARGETS
// to ~HWY_SCALAR, but this is more explicit).
#if defined(HWY_COMPILE_ONLY_SCALAR)
// 1) For older compilers: avoid SIMD intrinsics, but still support all ops.
#if defined(HWY_COMPILE_ONLY_EMU128) && !HWY_BROKEN_EMU128
#undef HWY_STATIC_TARGET
#define HWY_STATIC_TARGET HWY_EMU128 // override baseline
#define HWY_TARGETS HWY_EMU128
// 1b) HWY_SCALAR is less capable than HWY_EMU128 (which supports all ops), but
// we currently still support it for backwards compatibility.
#elif defined(HWY_COMPILE_ONLY_SCALAR) || \
(defined(HWY_COMPILE_ONLY_EMU128) && HWY_BROKEN_EMU128)
#undef HWY_STATIC_TARGET
#define HWY_STATIC_TARGET HWY_SCALAR // override baseline
#define HWY_TARGETS HWY_SCALAR
@ -369,9 +441,12 @@
#define HWY_TARGETS HWY_ATTAINABLE_TARGETS
// 4) Default: attainable WITHOUT non-best baseline. This reduces code size by
// excluding superseded targets, in particular scalar.
// excluding superseded targets, in particular scalar. Note: HWY_STATIC_TARGET
// may be 2^62 (HWY_SCALAR), so we must not left-shift/add it. Subtracting one
// sets all lower bits (better targets), then we also include the static target.
#else
#define HWY_TARGETS (HWY_ATTAINABLE_TARGETS & (2 * HWY_STATIC_TARGET - 1))
#define HWY_TARGETS \
(HWY_ATTAINABLE_TARGETS & ((HWY_STATIC_TARGET - 1LL) | HWY_STATIC_TARGET))
#endif // target policy

16
third_party/highway/hwy/examples/benchmark.cc vendored

@ -1,4 +1,5 @@
// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -12,10 +13,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/examples/benchmark.cc"
#include "hwy/foreach_target.h"
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
@ -24,8 +21,12 @@
#include <memory>
#include <numeric> // iota
#include "hwy/aligned_allocator.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/examples/benchmark.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// Must come after foreach_target.h to avoid redefinition errors.
#include "hwy/aligned_allocator.h"
#include "hwy/highway.h"
#include "hwy/nanobenchmark.h"
@ -81,7 +82,8 @@ void RunBenchmark(const char* caption) {
benchmark.Verify(num_items);
for (size_t i = 0; i < num_results; ++i) {
const double cycles_per_item = results[i].ticks / double(results[i].input);
const double cycles_per_item =
results[i].ticks / static_cast<double>(results[i].input);
const double mad = results[i].variability * cycles_per_item;
printf("%6" PRIu64 ": %6.3f (+/- %5.3f)\n",
static_cast<uint64_t>(results[i].input), cycles_per_item, mad);
@ -233,7 +235,7 @@ namespace hwy {
HWY_EXPORT(RunBenchmarks);
void Run() {
for (uint32_t target : SupportedAndGeneratedTargets()) {
for (int64_t target : SupportedAndGeneratedTargets()) {
SetSupportedTargetsForTest(target);
HWY_DYNAMIC_DISPATCH(RunBenchmarks)();
}


@ -1,4 +1,5 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

41
third_party/highway/hwy/examples/skeleton.cc vendored

@ -1,4 +1,5 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -22,7 +23,7 @@
// __FILE__ is not reliable) so that foreach_target.h can re-include it.
#define HWY_TARGET_INCLUDE "hwy/examples/skeleton.cc"
// Generates code for each enabled target by re-including this source file.
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// Must come after foreach_target.h to avoid redefinition errors.
#include "hwy/highway.h"
@ -35,28 +36,21 @@ namespace skeleton {
namespace HWY_NAMESPACE {
// Highway ops reside here; ADL does not find templates nor builtins.
using namespace hwy::HWY_NAMESPACE;
// For reasons unknown, optimized msan builds encounter long build times here;
// work around it until a cause is found.
#if HWY_COMPILER_CLANG && defined(MEMORY_SANITIZER) && defined(__OPTIMIZE__)
#define ATTR_MSAN __attribute__((optnone))
#else
#define ATTR_MSAN
#endif
namespace hn = hwy::HWY_NAMESPACE;
// Computes log2 by converting to a vector of floats. Compiled once per target.
template <class DF>
ATTR_MSAN void OneFloorLog2(const DF df, const uint8_t* HWY_RESTRICT values,
uint8_t* HWY_RESTRICT log2) {
HWY_ATTR_NO_MSAN void OneFloorLog2(const DF df,
const uint8_t* HWY_RESTRICT values,
uint8_t* HWY_RESTRICT log2) {
// Type tags for converting to other element types (Rebind = same count).
const RebindToSigned<DF> d32;
const Rebind<uint8_t, DF> d8;
const hn::RebindToSigned<DF> d32;
const hn::Rebind<uint8_t, DF> d8;
const auto u8 = Load(d8, values);
const auto bits = BitCast(d32, ConvertTo(df, PromoteTo(d32, u8)));
const auto exponent = Sub(ShiftRight<23>(bits), Set(d32, 127));
Store(DemoteTo(d8, exponent), d8, log2);
const auto u8 = hn::Load(d8, values);
const auto bits = hn::BitCast(d32, hn::ConvertTo(df, hn::PromoteTo(d32, u8)));
const auto exponent = hn::Sub(hn::ShiftRight<23>(bits), hn::Set(d32, 127));
hn::Store(hn::DemoteTo(d8, exponent), d8, log2);
}
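Worked example of the exponent trick above (illustrative, not part of the change): for input 200, (float)200 = 1.5625 * 2^7, so the biased exponent field holds 7 + 127 = 134; ShiftRight<23> yields 134 per lane, and subtracting 127 recovers 7 = floor(log2(200)).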
void CodepathDemo() {
@ -74,14 +68,14 @@ void FloorLog2(const uint8_t* HWY_RESTRICT values, size_t count,
uint8_t* HWY_RESTRICT log2) {
CodepathDemo();
const ScalableTag<float> df;
const size_t N = Lanes(df);
const hn::ScalableTag<float> df;
const size_t N = hn::Lanes(df);
size_t i = 0;
for (; i + N <= count; i += N) {
OneFloorLog2(df, values + i, log2 + i);
}
for (; i < count; ++i) {
CappedTag<float, 1> d1;
hn::CappedTag<float, 1> d1;
OneFloorLog2(d1, values + i, log2 + i);
}
}
@ -105,8 +99,9 @@ HWY_EXPORT(FloorLog2);
// This function is optional and only needed in the case of exposing it in the
// header file. Otherwise using HWY_DYNAMIC_DISPATCH(FloorLog2) in this module
// is equivalent to inlining this function.
void CallFloorLog2(const uint8_t* HWY_RESTRICT in, const size_t count,
uint8_t* HWY_RESTRICT out) {
HWY_DLLEXPORT void CallFloorLog2(const uint8_t* HWY_RESTRICT in,
const size_t count,
uint8_t* HWY_RESTRICT out) {
// This must reside outside of HWY_NAMESPACE because it references (calls the
// appropriate one from) the per-target implementations there.
return HWY_DYNAMIC_DISPATCH(FloorLog2)(in, count, out);

5
third_party/highway/hwy/examples/skeleton.h vendored

@ -1,4 +1,5 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -27,8 +28,8 @@
namespace skeleton {
// Computes base-2 logarithm by converting to float. Supports dynamic dispatch.
void CallFloorLog2(const uint8_t* HWY_RESTRICT in, const size_t count,
uint8_t* HWY_RESTRICT out);
HWY_DLLEXPORT void CallFloorLog2(const uint8_t* HWY_RESTRICT in,
const size_t count, uint8_t* HWY_RESTRICT out);
} // namespace skeleton


@ -1,4 +1,5 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -20,7 +21,7 @@
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "examples/skeleton_test.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h" // IWYU pragma: keep
// Must come after foreach_target.h to avoid redefinition errors.
#include "hwy/highway.h"
@ -34,13 +35,13 @@ HWY_BEFORE_NAMESPACE();
namespace skeleton {
namespace HWY_NAMESPACE {
using namespace hwy::HWY_NAMESPACE;
namespace hn = hwy::HWY_NAMESPACE;
// Calls function defined in skeleton.cc.
struct TestFloorLog2 {
template <class T, class DF>
HWY_NOINLINE void operator()(T /*unused*/, DF df) {
const size_t count = 5 * Lanes(df);
const size_t count = 5 * hn::Lanes(df);
auto in = hwy::AllocateAligned<uint8_t>(count);
auto expected = hwy::AllocateAligned<uint8_t>(count);
@ -70,7 +71,7 @@ struct TestSumMulAdd {
HWY_NOINLINE void operator()(T /*unused*/, D d) {
hwy::RandomState rng;
const size_t count = 4096;
EXPECT_TRUE(count % Lanes(d) == 0);
EXPECT_EQ(0, count % hn::Lanes(d));
auto mul = hwy::AllocateAligned<T>(count);
auto x = hwy::AllocateAligned<T>(count);
auto add = hwy::AllocateAligned<T>(count);
@ -106,10 +107,4 @@ HWY_EXPORT_AND_TEST_P(SkeletonTest, TestAllFloorLog2);
HWY_EXPORT_AND_TEST_P(SkeletonTest, TestAllSumMulAdd);
} // namespace skeleton
// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char **argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
#endif

49
third_party/highway/hwy/foreach_target.h vendored

@ -1,4 +1,5 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -52,6 +53,17 @@
#error ">1 target enabled => define HWY_TARGET_INCLUDE before foreach_target.h"
#endif
#if (HWY_TARGETS & HWY_EMU128) && (HWY_STATIC_TARGET != HWY_EMU128)
#undef HWY_TARGET
#define HWY_TARGET HWY_EMU128
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
#if (HWY_TARGETS & HWY_SCALAR) && (HWY_STATIC_TARGET != HWY_SCALAR)
#undef HWY_TARGET
#define HWY_TARGET HWY_SCALAR
@ -74,6 +86,17 @@
#endif
#endif
#if (HWY_TARGETS & HWY_RVV) && (HWY_STATIC_TARGET != HWY_RVV)
#undef HWY_TARGET
#define HWY_TARGET HWY_RVV
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
#if (HWY_TARGETS & HWY_SVE) && (HWY_STATIC_TARGET != HWY_SVE)
#undef HWY_TARGET
#define HWY_TARGET HWY_SVE
@ -96,6 +119,28 @@
#endif
#endif
#if (HWY_TARGETS & HWY_SVE_256) && (HWY_STATIC_TARGET != HWY_SVE_256)
#undef HWY_TARGET
#define HWY_TARGET HWY_SVE_256
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
#if (HWY_TARGETS & HWY_SVE2_128) && (HWY_STATIC_TARGET != HWY_SVE2_128)
#undef HWY_TARGET
#define HWY_TARGET HWY_SVE2_128
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
#if (HWY_TARGETS & HWY_SSSE3) && (HWY_STATIC_TARGET != HWY_SSSE3)
#undef HWY_TARGET
#define HWY_TARGET HWY_SSSE3
@ -151,9 +196,9 @@
#endif
#endif
#if (HWY_TARGETS & HWY_WASM2) && (HWY_STATIC_TARGET != HWY_WASM2)
#if (HWY_TARGETS & HWY_WASM_EMU256) && (HWY_STATIC_TARGET != HWY_WASM_EMU256)
#undef HWY_TARGET
#define HWY_TARGET HWY_WASM2
#define HWY_TARGET HWY_WASM_EMU256
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE

185
third_party/highway/hwy/highway.h vendored

@ -1,4 +1,5 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -26,8 +27,8 @@
namespace hwy {
// API version (https://semver.org/); keep in sync with CMakeLists.txt.
#define HWY_MAJOR 0
#define HWY_MINOR 16
#define HWY_MAJOR 1
#define HWY_MINOR 0
#define HWY_PATCH 0
//------------------------------------------------------------------------------
@ -39,7 +40,7 @@ namespace hwy {
// registers in the group, and is ignored on targets that do not support groups.
#define HWY_FULL1(T) hwy::HWY_NAMESPACE::ScalableTag<T>
#define HWY_FULL2(T, LMUL) \
hwy::HWY_NAMESPACE::ScalableTag<T, CeilLog2(HWY_MAX(0, LMUL))>
hwy::HWY_NAMESPACE::ScalableTag<T, hwy::CeilLog2(HWY_MAX(0, LMUL))>
#define HWY_3TH_ARG(arg1, arg2, arg3, ...) arg3
// Workaround for MSVC grouping __VA_ARGS__ into a single argument
#define HWY_FULL_RECOMPOSER(args_with_paren) HWY_3TH_ARG args_with_paren
@ -49,8 +50,7 @@ namespace hwy {
#define HWY_FULL(...) HWY_CHOOSE_FULL(__VA_ARGS__())(__VA_ARGS__)
// Vector of up to MAX_N lanes. It's better to use full vectors where possible.
#define HWY_CAPPED(T, MAX_N) \
hwy::HWY_NAMESPACE::CappedTag<T, HWY_MIN(MAX_N, HWY_LANES(T))>
#define HWY_CAPPED(T, MAX_N) hwy::HWY_NAMESPACE::CappedTag<T, MAX_N>
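Illustrative usage inside per-target code (the function name is hypothetical); the simplification above relies on CappedTag itself clamping MAX_N to what the target supports:
size_t CappedLanesSketch() {
  const HWY_FULL(float) df;       // as many lanes as the target provides
  const HWY_CAPPED(float, 4) d4;  // at most 4 lanes; CappedTag clamps as needed
  return Lanes(df) + Lanes(d4);   // e.g. 8 + 4 on AVX2, 1 + 1 on HWY_SCALAR
}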
//------------------------------------------------------------------------------
// Export user functions for static/dynamic dispatch
@ -68,10 +68,12 @@ namespace hwy {
// defined), and can be used to deduce the return type of Choose*.
#if HWY_STATIC_TARGET == HWY_SCALAR
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SCALAR::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_EMU128
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_EMU128::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_RVV
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_RVV::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_WASM2
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM2::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_WASM_EMU256
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM_EMU256::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_WASM
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_NEON
@ -80,6 +82,10 @@ namespace hwy {
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SVE2
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SVE_256
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE_256::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SVE2_128
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2_128::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_PPC8
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC8::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SSSE3
@ -94,51 +100,22 @@ namespace hwy {
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3_DL::FUNC_NAME
#endif
// Dynamic dispatch declarations.
template <typename RetType, typename... Args>
struct FunctionCache {
public:
typedef RetType(FunctionType)(Args...);
// A template function that when instantiated has the same signature as the
// function being called. This function initializes the global cache of the
// current supported targets mask used for dynamic dispatch and calls the
// appropriate function. Since this mask used for dynamic dispatch is a
// global cache, all the highway exported functions, even those exposed by
// different modules, will be initialized after this function runs for any one
// of those exported functions.
template <FunctionType* const table[]>
static RetType ChooseAndCall(Args... args) {
// If we are running here it means we need to update the chosen target.
ChosenTarget& chosen_target = GetChosenTarget();
chosen_target.Update();
return (table[chosen_target.GetIndex()])(args...);
}
};
// Factory function only used to infer the template parameters RetType and Args
// from a function passed to the factory.
template <typename RetType, typename... Args>
FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
return FunctionCache<RetType, Args...>();
}
// HWY_CHOOSE_*(FUNC_NAME) expands to the function pointer for that target or
// nullptr is that target was not compiled.
#if HWY_TARGETS & HWY_SCALAR
#define HWY_CHOOSE_SCALAR(FUNC_NAME) &N_SCALAR::FUNC_NAME
#if HWY_TARGETS & HWY_EMU128
#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &N_EMU128::FUNC_NAME
#elif HWY_TARGETS & HWY_SCALAR
#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &N_SCALAR::FUNC_NAME
#else
// When scalar is not present and we try to use scalar because other targets
// were disabled at runtime we fall back to the baseline with
// HWY_STATIC_DISPATCH()
#define HWY_CHOOSE_SCALAR(FUNC_NAME) &HWY_STATIC_DISPATCH(FUNC_NAME)
// When HWY_SCALAR/HWY_EMU128 are not present and other targets were disabled at
// runtime, fall back to the baseline with HWY_STATIC_DISPATCH().
#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &HWY_STATIC_DISPATCH(FUNC_NAME)
#endif
#if HWY_TARGETS & HWY_WASM2
#define HWY_CHOOSE_WASM2(FUNC_NAME) &N_WASM2::FUNC_NAME
#if HWY_TARGETS & HWY_WASM_EMU256
#define HWY_CHOOSE_WASM_EMU256(FUNC_NAME) &N_WASM_EMU256::FUNC_NAME
#else
#define HWY_CHOOSE_WASM2(FUNC_NAME) nullptr
#define HWY_CHOOSE_WASM_EMU256(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_WASM
@ -171,6 +148,18 @@ FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
#define HWY_CHOOSE_SVE2(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_SVE_256
#define HWY_CHOOSE_SVE_256(FUNC_NAME) &N_SVE_256::FUNC_NAME
#else
#define HWY_CHOOSE_SVE_256(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_SVE2_128
#define HWY_CHOOSE_SVE2_128(FUNC_NAME) &N_SVE2_128::FUNC_NAME
#else
#define HWY_CHOOSE_SVE2_128(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_PPC8
#define HWY_CHOOSE_PCC8(FUNC_NAME) &N_PPC8::FUNC_NAME
#else
@ -207,6 +196,53 @@ FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
#define HWY_CHOOSE_AVX3_DL(FUNC_NAME) nullptr
#endif
// MSVC 2017 workaround: the non-type template parameter to ChooseAndCall
// apparently cannot be an array. Use a function pointer instead, which has the
// disadvantage that we call the static (not best) target on the first call to
// any HWY_DYNAMIC_DISPATCH.
#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1915
#define HWY_DISPATCH_WORKAROUND 1
#else
#define HWY_DISPATCH_WORKAROUND 0
#endif
// Provides a static member function which is what is called during the first
// HWY_DYNAMIC_DISPATCH, where GetIndex is still zero, and instantiations of
// this function are the first entry in the tables created by HWY_EXPORT.
template <typename RetType, typename... Args>
struct FunctionCache {
public:
typedef RetType(FunctionType)(Args...);
#if HWY_DISPATCH_WORKAROUND
template <FunctionType* const func>
static RetType ChooseAndCall(Args... args) {
ChosenTarget& chosen_target = GetChosenTarget();
chosen_target.Update(SupportedTargets());
return (*func)(args...);
}
#else
// A template function that when instantiated has the same signature as the
// function being called. This function initializes the bit array of targets
// supported by the current CPU and then calls the appropriate entry within
// the HWY_EXPORT table. Subsequent calls via HWY_DYNAMIC_DISPATCH to any
// exported functions, even those defined by different translation units,
// will dispatch directly to the best available target.
template <FunctionType* const table[]>
static RetType ChooseAndCall(Args... args) {
ChosenTarget& chosen_target = GetChosenTarget();
chosen_target.Update(SupportedTargets());
return (table[chosen_target.GetIndex()])(args...);
}
#endif // HWY_DISPATCH_WORKAROUND
};
// Used to deduce the template parameters RetType and Args from a function.
template <typename RetType, typename... Args>
FunctionCache<RetType, Args...> DeduceFunctionCache(RetType (*)(Args...)) {
return FunctionCache<RetType, Args...>();
}
#define HWY_DISPATCH_TABLE(FUNC_NAME) \
HWY_CONCAT(FUNC_NAME, HighwayDispatchTable)
@ -215,7 +251,7 @@ FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
// static array must be defined at the same namespace level as the function
// it is exporting.
// After being exported, it can be called from other parts of the same source
// file using HWY_DYNAMIC_DISTPATCH(), in particular from a function wrapper
// file using HWY_DYNAMIC_DISPATCH(), in particular from a function wrapper
// like in the following example:
//
// #include "hwy/highway.h"
@ -245,26 +281,44 @@ FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
// This case still uses a table, although of a single element, to provide the
// same compile error conditions as with the dynamic dispatch case when multiple
// targets are being compiled.
#define HWY_EXPORT(FUNC_NAME) \
HWY_MAYBE_UNUSED static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) \
const HWY_DISPATCH_TABLE(FUNC_NAME)[1] = { \
&HWY_STATIC_DISPATCH(FUNC_NAME)}
#define HWY_EXPORT(FUNC_NAME) \
HWY_MAYBE_UNUSED static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const \
HWY_DISPATCH_TABLE(FUNC_NAME)[1] = {&HWY_STATIC_DISPATCH(FUNC_NAME)}
#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) HWY_STATIC_DISPATCH(FUNC_NAME)
#else
// Dynamic dispatch case with one entry per dynamic target plus the scalar
// mode and the initialization wrapper.
#define HWY_EXPORT(FUNC_NAME) \
static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) \
const HWY_DISPATCH_TABLE(FUNC_NAME)[HWY_MAX_DYNAMIC_TARGETS + 2] = { \
/* The first entry in the table initializes the global cache and \
* calls the appropriate function. */ \
&decltype(hwy::FunctionCacheFactory(&HWY_STATIC_DISPATCH( \
FUNC_NAME)))::ChooseAndCall<HWY_DISPATCH_TABLE(FUNC_NAME)>, \
HWY_CHOOSE_TARGET_LIST(FUNC_NAME), \
HWY_CHOOSE_SCALAR(FUNC_NAME), \
// Simplified version for MSVC 2017: function pointer instead of table.
#if HWY_DISPATCH_WORKAROUND
#define HWY_EXPORT(FUNC_NAME) \
static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const HWY_DISPATCH_TABLE( \
FUNC_NAME)[HWY_MAX_DYNAMIC_TARGETS + 2] = { \
/* The first entry in the table initializes the global cache and \
* calls the function from HWY_STATIC_TARGET. */ \
&decltype(hwy::DeduceFunctionCache(&HWY_STATIC_DISPATCH( \
FUNC_NAME)))::ChooseAndCall<&HWY_STATIC_DISPATCH(FUNC_NAME)>, \
HWY_CHOOSE_TARGET_LIST(FUNC_NAME), \
HWY_CHOOSE_FALLBACK(FUNC_NAME), \
}
#else
// Dynamic dispatch case with one entry per dynamic target plus the fallback
// target and the initialization wrapper.
#define HWY_EXPORT(FUNC_NAME) \
static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const HWY_DISPATCH_TABLE( \
FUNC_NAME)[HWY_MAX_DYNAMIC_TARGETS + 2] = { \
/* The first entry in the table initializes the global cache and \
* calls the appropriate function. */ \
&decltype(hwy::DeduceFunctionCache(&HWY_STATIC_DISPATCH( \
FUNC_NAME)))::ChooseAndCall<HWY_DISPATCH_TABLE(FUNC_NAME)>, \
HWY_CHOOSE_TARGET_LIST(FUNC_NAME), \
HWY_CHOOSE_FALLBACK(FUNC_NAME), \
}
#endif // HWY_DISPATCH_WORKAROUND
#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) \
(*(HWY_DISPATCH_TABLE(FUNC_NAME)[hwy::GetChosenTarget().GetIndex()]))
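For orientation, the consuming side of this table; a condensed sketch of the same pattern skeleton.cc uses in this patch:
#if HWY_ONCE
namespace skeleton {
HWY_EXPORT(FloorLog2);  // one slot per dynamic target, plus fallback + wrapper
HWY_DLLEXPORT void CallFloorLog2(const uint8_t* HWY_RESTRICT in,
                                 const size_t count,
                                 uint8_t* HWY_RESTRICT out) {
  // The first call lands on ChooseAndCall, which queries SupportedTargets()
  // and updates the chosen index; later calls go straight to the best target.
  return HWY_DYNAMIC_DISPATCH(FloorLog2)(in, count, out);
}
}  // namespace skeleton
#endif  // HWY_ONCE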
@ -302,14 +356,17 @@ FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
#error "PPC is not yet supported"
#elif HWY_TARGET == HWY_NEON
#include "hwy/ops/arm_neon-inl.h"
#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2
#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2 || \
HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
#include "hwy/ops/arm_sve-inl.h"
#elif HWY_TARGET == HWY_WASM2
#elif HWY_TARGET == HWY_WASM_EMU256
#include "hwy/ops/wasm_256-inl.h"
#elif HWY_TARGET == HWY_WASM
#include "hwy/ops/wasm_128-inl.h"
#elif HWY_TARGET == HWY_RVV
#include "hwy/ops/rvv-inl.h"
#elif HWY_TARGET == HWY_EMU128
#include "hwy/ops/emu128-inl.h"
#elif HWY_TARGET == HWY_SCALAR
#include "hwy/ops/scalar-inl.h"
#else

56
third_party/highway/hwy/highway_export.h vendored

@ -9,19 +9,11 @@
#ifndef HWY_DLLEXPORT_H
#define HWY_DLLEXPORT_H
// Bazel build are always static:
#if !defined(HWY_SHARED_DEFINE) && !defined(HWY_STATIC_DEFINE)
#define HWY_STATIC_DEFINE
#endif
#ifdef HWY_STATIC_DEFINE
#if !defined(HWY_SHARED_DEFINE)
#define HWY_DLLEXPORT
#define HWY_NO_EXPORT
#define HWY_CONTRIB_DLLEXPORT
#define HWY_CONTRIB_NO_EXPORT
#define HWY_TEST_DLLEXPORT
#define HWY_TEST_NO_EXPORT
#else
#else // !HWY_SHARED_DEFINE
#ifndef HWY_DLLEXPORT
#if defined(hwy_EXPORTS)
@ -31,23 +23,15 @@
#else
#define HWY_DLLEXPORT __attribute__((visibility("default")))
#endif
#else
#else // defined(hwy_EXPORTS)
/* We are using this library */
#ifdef _WIN32
#define HWY_DLLEXPORT __declspec(dllimport)
#else
#define HWY_DLLEXPORT __attribute__((visibility("default")))
#endif
#endif
#endif
#ifndef HWY_NO_EXPORT
#ifdef _WIN32
#define HWY_NO_EXPORT
#else
#define HWY_NO_EXPORT __attribute__((visibility("hidden")))
#endif
#endif
#endif // defined(hwy_EXPORTS)
#endif // HWY_DLLEXPORT
#ifndef HWY_CONTRIB_DLLEXPORT
#if defined(hwy_contrib_EXPORTS)
@ -57,23 +41,15 @@
#else
#define HWY_CONTRIB_DLLEXPORT __attribute__((visibility("default")))
#endif
#else
#else // defined(hwy_contrib_EXPORTS)
/* We are using this library */
#ifdef _WIN32
#define HWY_CONTRIB_DLLEXPORT __declspec(dllimport)
#else
#define HWY_CONTRIB_DLLEXPORT __attribute__((visibility("default")))
#endif
#endif
#endif
#ifndef HWY_CONTRIB_NO_EXPORT
#ifdef _WIN32
#define HWY_CONTRIB_NO_EXPORT
#else
#define HWY_CONTRIB_NO_EXPORT __attribute__((visibility("hidden")))
#endif
#endif
#endif // defined(hwy_contrib_EXPORTS)
#endif // HWY_CONTRIB_DLLEXPORT
#ifndef HWY_TEST_DLLEXPORT
#if defined(hwy_test_EXPORTS)
@ -83,24 +59,16 @@
#else
#define HWY_TEST_DLLEXPORT __attribute__((visibility("default")))
#endif
#else
#else // defined(hwy_test_EXPORTS)
/* We are using this library */
#ifdef _WIN32
#define HWY_TEST_DLLEXPORT __declspec(dllimport)
#else
#define HWY_TEST_DLLEXPORT __attribute__((visibility("default")))
#endif
#endif
#endif
#endif // defined(hwy_test_EXPORTS)
#endif // HWY_TEST_DLLEXPORT
#ifndef HWY_TEST_NO_EXPORT
#ifdef _WIN32
#define HWY_TEST_NO_EXPORT
#else
#define HWY_TEST_NO_EXPORT __attribute__((visibility("hidden")))
#endif
#endif
#endif
#endif // !HWY_SHARED_DEFINE
#endif /* HWY_DLLEXPORT_H */

145
third_party/highway/hwy/highway_test.cc vendored

@ -1,4 +1,5 @@
// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -14,12 +15,15 @@
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <bitset>
#include "hwy/base.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "highway_test.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/highway.h"
#include "hwy/nanobenchmark.h" // Unpredictable1
#include "hwy/tests/test_util-inl.h"
@ -28,6 +32,38 @@ HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
template <size_t kLimit, typename T>
HWY_NOINLINE void TestCappedLimit(T /* tag */) {
CappedTag<T, kLimit> d;
// Ensure two ops compile
HWY_ASSERT_VEC_EQ(d, Zero(d), Set(d, T{0}));
// Ensure we do not write more than kLimit lanes
const size_t N = Lanes(d);
if (kLimit < N) {
auto lanes = AllocateAligned<T>(N);
std::fill(lanes.get(), lanes.get() + N, T{0});
Store(Set(d, T{1}), d, lanes.get());
for (size_t i = kLimit; i < N; ++i) {
HWY_ASSERT_EQ(lanes[i], T{0});
}
}
}
// Adapter for ForAllTypes - we are constructing our own Simd<> and thus do not
// use ForPartialVectors etc.
struct TestCapped {
template <typename T>
void operator()(T t) const {
TestCappedLimit<1>(t);
TestCappedLimit<3>(t);
TestCappedLimit<5>(t);
TestCappedLimit<1ull << 15>(t);
}
};
HWY_NOINLINE void TestAllCapped() { ForAllTypes(TestCapped()); }
// For testing that ForPartialVectors reaches every possible size:
using NumLanesSet = std::bitset<HWY_MAX_BYTES + 1>;
@ -48,7 +84,7 @@ struct TestMaxLanes {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const size_t N = Lanes(d);
const size_t kMax = MaxLanes(d);
const size_t kMax = MaxLanes(d); // for RVV, includes LMUL
HWY_ASSERT(N <= kMax);
HWY_ASSERT(kMax <= (HWY_MAX_BYTES / sizeof(T)));
@ -175,26 +211,19 @@ HWY_NOINLINE void TestAllSignBit() {
ForFloatTypes(ForPartialVectors<TestSignBitFloat>());
}
// std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY.
template <typename TF>
bool IsNaN(TF f) {
MakeUnsigned<TF> bits;
memcpy(&bits, &f, sizeof(TF));
bits += bits;
bits >>= 1; // clear sign bit
// NaN if all exponent bits are set and the mantissa is not zero.
return bits > ExponentMask<decltype(bits)>();
}
// inline to work around incorrect SVE codegen (only first 128 bits used).
template <class D, class V>
HWY_NOINLINE void AssertNaN(D d, VecArg<V> v, const char* file, int line) {
HWY_INLINE void AssertNaN(D d, VecArg<V> v, const char* file, int line) {
using T = TFromD<D>;
const T lane = GetLane(v);
if (!IsNaN(lane)) {
const std::string type_name = TypeName(T(), Lanes(d));
const size_t N = Lanes(d);
if (!AllTrue(d, IsNaN(v))) {
Print(d, "not all NaN", v, 0, N);
Print(d, "mask", VecFromMask(d, IsNaN(v)), 0, N);
const std::string type_name = TypeName(T(), N);
// RVV lacks PRIu64 and MSYS still has problems with %zu, so print bytes to
// avoid truncating doubles.
uint8_t bytes[HWY_MAX(sizeof(T), 8)] = {0};
const T lane = GetLane(v);
memcpy(bytes, &lane, sizeof(T));
Abort(file, line,
"Expected %s NaN, got %E (bytes %02x %02x %02x %02x %02x %02x %02x "
@ -264,14 +293,14 @@ struct TestNaN {
// Reduction
HWY_ASSERT_NAN(d, SumOfLanes(d, nan));
// TODO(janwas): re-enable after QEMU is fixed
// TODO(janwas): re-enable after QEMU/Spike are fixed
#if HWY_TARGET != HWY_RVV
HWY_ASSERT_NAN(d, MinOfLanes(d, nan));
HWY_ASSERT_NAN(d, MaxOfLanes(d, nan));
#endif
// Min
#if HWY_ARCH_X86 && HWY_TARGET != HWY_SCALAR
#if HWY_ARCH_X86 && (HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_EMU128)
// x86 SIMD returns the second operand if any input is NaN.
HWY_ASSERT_VEC_EQ(d, v1, Min(nan, v1));
HWY_ASSERT_VEC_EQ(d, v1, Max(nan, v1));
@ -317,6 +346,74 @@ HWY_NOINLINE void TestAllNaN() {
ForPartialVectors<TestF32NaN>()(float());
}
struct TestIsNaN {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const auto v1 = Set(d, T(Unpredictable1()));
const auto inf = IfThenElse(Eq(v1, Set(d, T(1))), Inf(d), v1);
const auto nan = IfThenElse(Eq(v1, Set(d, T(1))), NaN(d), v1);
const auto neg = Set(d, T{-1});
HWY_ASSERT_NAN(d, nan);
HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsNaN(inf));
HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsNaN(CopySign(inf, neg)));
HWY_ASSERT_MASK_EQ(d, MaskTrue(d), IsNaN(nan));
HWY_ASSERT_MASK_EQ(d, MaskTrue(d), IsNaN(CopySign(nan, neg)));
HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsNaN(v1));
HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsNaN(Zero(d)));
HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsNaN(Set(d, hwy::LowestValue<T>())));
HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsNaN(Set(d, hwy::HighestValue<T>())));
}
};
HWY_NOINLINE void TestAllIsNaN() {
ForFloatTypes(ForPartialVectors<TestIsNaN>());
}
struct TestIsInf {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const auto v1 = Set(d, T(Unpredictable1()));
const auto inf = IfThenElse(Eq(v1, Set(d, T(1))), Inf(d), v1);
const auto nan = IfThenElse(Eq(v1, Set(d, T(1))), NaN(d), v1);
const auto neg = Set(d, T{-1});
HWY_ASSERT_MASK_EQ(d, MaskTrue(d), IsInf(inf));
HWY_ASSERT_MASK_EQ(d, MaskTrue(d), IsInf(CopySign(inf, neg)));
HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsInf(nan));
HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsInf(CopySign(nan, neg)));
HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsInf(v1));
HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsInf(Zero(d)));
HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsInf(Set(d, hwy::LowestValue<T>())));
HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsInf(Set(d, hwy::HighestValue<T>())));
}
};
HWY_NOINLINE void TestAllIsInf() {
ForFloatTypes(ForPartialVectors<TestIsInf>());
}
struct TestIsFinite {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const auto v1 = Set(d, T(Unpredictable1()));
const auto inf = IfThenElse(Eq(v1, Set(d, T(1))), Inf(d), v1);
const auto nan = IfThenElse(Eq(v1, Set(d, T(1))), NaN(d), v1);
const auto neg = Set(d, T{-1});
HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsFinite(inf));
HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsFinite(CopySign(inf, neg)));
HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsFinite(nan));
HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsFinite(CopySign(nan, neg)));
HWY_ASSERT_MASK_EQ(d, MaskTrue(d), IsFinite(v1));
HWY_ASSERT_MASK_EQ(d, MaskTrue(d), IsFinite(Zero(d)));
HWY_ASSERT_MASK_EQ(d, MaskTrue(d), IsFinite(Set(d, hwy::LowestValue<T>())));
HWY_ASSERT_MASK_EQ(d, MaskTrue(d),
IsFinite(Set(d, hwy::HighestValue<T>())));
}
};
HWY_NOINLINE void TestAllIsFinite() {
ForFloatTypes(ForPartialVectors<TestIsFinite>());
}
struct TestCopyAndAssign {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
@ -371,21 +468,19 @@ HWY_AFTER_NAMESPACE();
namespace hwy {
HWY_BEFORE_TEST(HighwayTest);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllCapped);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllMaxLanes);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllSet);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllOverflow);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllClamp);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllSignBit);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllNaN);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllIsNaN);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllIsInf);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllIsFinite);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllCopyAndAssign);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllGetLane);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllDFromV);
} // namespace hwy
// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
#endif

107
third_party/highway/hwy/nanobenchmark.cc vendored

@ -1,4 +1,5 @@
// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -24,6 +25,7 @@
#include <algorithm> // sort
#include <array>
#include <atomic>
#include <chrono> //NOLINT
#include <limits>
#include <numeric> // iota
#include <random>
@ -122,6 +124,9 @@ inline Ticks Start() {
Ticks t;
#if HWY_ARCH_PPC && defined(__GLIBC__)
asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
#elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC
// pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU.
asm volatile("mrs %0, cntvct_el0" : "=r"(t));
#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
_ReadWriteBarrier();
_mm_lfence();
@ -160,10 +165,14 @@ inline Ticks Start() {
return t;
}
// WARNING: on x86, caller must check HasRDTSCP before using this!
inline Ticks Stop() {
uint64_t t;
#if HWY_ARCH_PPC && defined(__GLIBC__)
asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
#elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC
// pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU.
asm volatile("mrs %0, cntvct_el0" : "=r"(t));
#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
_ReadWriteBarrier();
unsigned aux;
@ -302,7 +311,8 @@ T MedianAbsoluteDeviation(const T* values, const size_t num_values,
std::vector<T> abs_deviations;
abs_deviations.reserve(num_values);
for (size_t i = 0; i < num_values; ++i) {
const int64_t abs = std::abs(int64_t(values[i]) - int64_t(median));
const int64_t abs = std::abs(static_cast<int64_t>(values[i]) -
static_cast<int64_t>(median));
abs_deviations.push_back(static_cast<T>(abs));
}
return Median(abs_deviations.data(), num_values);
@ -331,6 +341,38 @@ inline void PreventElision(T&& output) {
#endif
}
// Measures the actual current frequency of Ticks. We cannot rely on the nominal
// frequency encoded in x86 BrandString because it is misleading on M1 Rosetta,
// and not reported by AMD. CPUID 0x15 is also not yet widely supported. Also
// used on RISC-V and ARM64.
HWY_MAYBE_UNUSED double MeasureNominalClockRate() {
double max_ticks_per_sec = 0.0;
// Arbitrary, enough to ignore 2 outliers without excessive init time.
for (int rep = 0; rep < 3; ++rep) {
auto time0 = std::chrono::steady_clock::now();
using Time = decltype(time0);
const timer::Ticks ticks0 = timer::Start();
const Time time_min = time0 + std::chrono::milliseconds(10);
Time time1;
timer::Ticks ticks1;
for (;;) {
time1 = std::chrono::steady_clock::now();
// Ideally this would be Stop, but that requires RDTSCP on x86. To avoid
// another codepath, just use Start instead. now() presumably has its own
// fence-like behavior.
ticks1 = timer::Start(); // Do not use Stop, see comment above
if (time1 >= time_min) break;
}
const double dticks = static_cast<double>(ticks1 - ticks0);
std::chrono::duration<double, std::ratio<1>> dtime = time1 - time0;
const double ticks_per_sec = dticks / dtime.count();
max_ticks_per_sec = std::max(max_ticks_per_sec, ticks_per_sec);
}
return max_ticks_per_sec;
}
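A caller-side sketch (assumed usage, not from the patch) of how the measured rate is consumed through the public platform API:
// Now() converts the raw tick counter to seconds using the cached
// InvariantTicksPerSecond(), which on x86/RVV/AArch64 now comes from
// MeasureNominalClockRate() above.
const double t0 = hwy::platform::Now();
// ... code under measurement ...
const double elapsed_seconds = hwy::platform::Now() - t0;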
#if HWY_ARCH_X86
void Cpuid(const uint32_t level, const uint32_t count,
@ -378,50 +420,27 @@ std::string BrandString() {
return brand_string;
}
// Returns the frequency quoted inside the brand string. This does not
// account for throttling nor Turbo Boost.
double NominalClockRate() {
const std::string& brand_string = BrandString();
// Brand strings include the maximum configured frequency. These prefixes are
// defined by Intel CPUID documentation.
const char* prefixes[3] = {"MHz", "GHz", "THz"};
const double multipliers[3] = {1E6, 1E9, 1E12};
for (size_t i = 0; i < 3; ++i) {
const size_t pos_prefix = brand_string.find(prefixes[i]);
if (pos_prefix != std::string::npos) {
const size_t pos_space = brand_string.rfind(' ', pos_prefix - 1);
if (pos_space != std::string::npos) {
const std::string digits =
brand_string.substr(pos_space + 1, pos_prefix - pos_space - 1);
return std::stod(digits) * multipliers[i];
}
}
}
return 0.0;
}
#endif // HWY_ARCH_X86
} // namespace
HWY_DLLEXPORT double InvariantTicksPerSecond() {
#if HWY_ARCH_PPC && defined(__GLIBC__)
return double(__ppc_get_timebase_freq());
#elif HWY_ARCH_X86
// We assume the TSC is invariant; it is on all recent Intel/AMD CPUs.
return NominalClockRate();
return static_cast<double>(__ppc_get_timebase_freq());
#elif HWY_ARCH_X86 || HWY_ARCH_RVV || (HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC)
// We assume the x86 TSC is invariant; it is on all recent Intel/AMD CPUs.
static const double freq = MeasureNominalClockRate();
return freq;
#elif defined(_WIN32) || defined(_WIN64)
LARGE_INTEGER freq;
(void)QueryPerformanceFrequency(&freq);
return double(freq.QuadPart);
return static_cast<double>(freq.QuadPart);
#elif defined(__APPLE__)
// https://developer.apple.com/library/mac/qa/qa1398/_index.html
mach_timebase_info_data_t timebase;
(void)mach_timebase_info(&timebase);
return double(timebase.denom) / timebase.numer * 1E9;
return static_cast<double>(timebase.denom) / timebase.numer * 1E9;
#else
// TODO(janwas): ARM? Unclear how to reliably query cntvct_el0 frequency.
return 1E9; // Haiku and clock_gettime return nanoseconds.
#endif
}
@ -432,14 +451,28 @@ HWY_DLLEXPORT double Now() {
}
HWY_DLLEXPORT uint64_t TimerResolution() {
#if HWY_ARCH_X86
bool can_use_stop = platform::HasRDTSCP();
#else
constexpr bool can_use_stop = true;
#endif
// Nested loop avoids exceeding stack/L1 capacity.
timer::Ticks repetitions[Params::kTimerSamples];
for (size_t rep = 0; rep < Params::kTimerSamples; ++rep) {
timer::Ticks samples[Params::kTimerSamples];
for (size_t i = 0; i < Params::kTimerSamples; ++i) {
const timer::Ticks t0 = timer::Start();
const timer::Ticks t1 = timer::Stop();
samples[i] = t1 - t0;
if (can_use_stop) {
for (size_t i = 0; i < Params::kTimerSamples; ++i) {
const timer::Ticks t0 = timer::Start();
const timer::Ticks t1 = timer::Stop(); // we checked HasRDTSCP above
samples[i] = t1 - t0;
}
} else {
for (size_t i = 0; i < Params::kTimerSamples; ++i) {
const timer::Ticks t0 = timer::Start();
const timer::Ticks t1 = timer::Start(); // do not use Stop, see above
samples[i] = t1 - t0;
}
}
repetitions[rep] = robust_statistics::Mode(samples);
}
@ -459,7 +492,7 @@ timer::Ticks SampleUntilStable(const double max_rel_mad, double* rel_mad,
// Choose initial samples_per_eval based on a single estimated duration.
timer::Ticks t0 = timer::Start();
lambda();
timer::Ticks t1 = timer::Stop();
timer::Ticks t1 = timer::Stop(); // Caller checks HasRDTSCP
timer::Ticks est = t1 - t0;
static const double ticks_per_second = platform::InvariantTicksPerSecond();
const size_t ticks_per_eval =
@ -483,7 +516,7 @@ timer::Ticks SampleUntilStable(const double max_rel_mad, double* rel_mad,
for (size_t i = 0; i < samples_per_eval; ++i) {
t0 = timer::Start();
lambda();
t1 = timer::Stop();
t1 = timer::Stop(); // Caller checks HasRDTSCP
samples.push_back(t1 - t0);
}

1
third_party/highway/hwy/nanobenchmark.h (vendored)

@ -1,4 +1,5 @@
// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

third_party/highway/hwy/nanobenchmark_test.cc (vendored)

@ -1,4 +1,5 @@
// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -57,7 +58,7 @@ std::mt19937 rng;
// A function whose runtime depends on rng.
FuncOutput Random(const void* /*arg*/, FuncInput in) {
const size_t r = rng() & 0xF;
uint32_t ret = in;
FuncOutput ret = static_cast<FuncOutput>(in);
for (size_t i = 0; i < r; ++i) {
ret /= ((rng() & 1) + 2);
}
@ -88,9 +89,3 @@ TEST(NanobenchmarkTest, RunAll) {
} // namespace
} // namespace hwy
// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}

2268
third_party/highway/hwy/ops/arm_neon-inl.h (vendored)
(diff not shown because of its size)

1164
third_party/highway/hwy/ops/arm_sve-inl.h (vendored)
(diff not shown because of its size)

2508
third_party/highway/hwy/ops/emu128-inl.h (vendored, new file)
(diff not shown because of its size)

934
third_party/highway/hwy/ops/generic_ops-inl.h (vendored)

@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -58,9 +59,8 @@ HWY_API V CombineShiftRightLanes(D d, const V hi, const V lo) {
// Returns lanes with the most significant bit set and all other bits zero.
template <class D>
HWY_API Vec<D> SignBit(D d) {
using Unsigned = MakeUnsigned<TFromD<D>>;
const Unsigned bit = Unsigned(1) << (sizeof(Unsigned) * 8 - 1);
return BitCast(d, Set(Rebind<Unsigned, D>(), bit));
const RebindToUnsigned<decltype(d)> du;
return BitCast(d, Set(du, SignMask<TFromD<D>>()));
}
// Returns quiet NaN.
@ -72,6 +72,906 @@ HWY_API Vec<D> NaN(D d) {
return BitCast(d, Set(di, LimitsMax<TFromD<decltype(di)>>()));
}
// Returns positive infinity.
template <class D>
HWY_API Vec<D> Inf(D d) {
const RebindToUnsigned<D> du;
using T = TFromD<D>;
using TU = TFromD<decltype(du)>;
const TU max_x2 = static_cast<TU>(MaxExponentTimes2<T>());
return BitCast(d, Set(du, max_x2 >> 1));
}
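Right-shifting the doubled exponent mask by one yields an all-ones exponent with zero sign and mantissa, i.e. the IEEE-754 encoding of +infinity. A standalone scalar sketch for float, assuming the usual 0xFF000000 value for MaxExponentTimes2:
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <limits>

int main() {
  const uint32_t max_exp_x2 = 0xFFu << 24;    // assumed MaxExponentTimes2 for float
  const uint32_t inf_bits = max_exp_x2 >> 1;  // 0x7F800000
  float f;
  std::memcpy(&f, &inf_bits, sizeof(f));
  std::printf("%d\n", f == std::numeric_limits<float>::infinity());  // prints 1
  return 0;
}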
// ------------------------------ SafeFillN
template <class D, typename T = TFromD<D>>
HWY_API void SafeFillN(const size_t num, const T value, D d,
T* HWY_RESTRICT to) {
#if HWY_MEM_OPS_MIGHT_FAULT
(void)d;
for (size_t i = 0; i < num; ++i) {
to[i] = value;
}
#else
BlendedStore(Set(d, value), FirstN(d, num), d, to);
#endif
}
// ------------------------------ SafeCopyN
template <class D, typename T = TFromD<D>>
HWY_API void SafeCopyN(const size_t num, D d, const T* HWY_RESTRICT from,
T* HWY_RESTRICT to) {
#if HWY_MEM_OPS_MIGHT_FAULT
(void)d;
for (size_t i = 0; i < num; ++i) {
to[i] = from[i];
}
#else
const Mask<D> mask = FirstN(d, num);
BlendedStore(MaskedLoad(mask, d, from), mask, d, to);
#endif
}
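A hedged usage sketch of the two helpers above, assuming the single-target (static dispatch) build where including hwy/highway.h exposes them under hwy::HWY_NAMESPACE; the point is that neither call touches memory beyond the requested number of elements:
#include <cstddef>
#include <cstdio>

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

int main() {
  const hn::ScalableTag<float> d;
  float src[7] = {1, 2, 3, 4, 5, 6, 7};
  float dst[7] = {};
  hn::SafeCopyN(7, d, src, dst);    // copies exactly 7 floats, even if Lanes(d) > 7
  hn::SafeFillN(3, -1.0f, d, dst);  // overwrites only the first 3
  std::printf("%g %g %g\n", dst[0], dst[3], dst[6]);  // -1 4 7
  return 0;
}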
// "Include guard": skip if native instructions are available. The generic
// implementation is currently shared between x86_* and wasm_*, and is too large
// to duplicate.
#if (defined(HWY_NATIVE_LOAD_STORE_INTERLEAVED) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#else
#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
#endif
// ------------------------------ LoadInterleaved2
template <typename T, size_t N, class V>
HWY_API void LoadInterleaved2(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
V& v0, V& v1) {
const V A = LoadU(d, unaligned + 0 * N); // v1[1] v0[1] v1[0] v0[0]
const V B = LoadU(d, unaligned + 1 * N);
v0 = ConcatEven(d, B, A);
v1 = ConcatOdd(d, B, A);
}
template <typename T, class V>
HWY_API void LoadInterleaved2(Simd<T, 1, 0> d, const T* HWY_RESTRICT unaligned,
V& v0, V& v1) {
v0 = LoadU(d, unaligned + 0);
v1 = LoadU(d, unaligned + 1);
}
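A scalar reference (not from this patch) for what LoadInterleaved2 computes: lane i of v0 comes from unaligned[2*i] and lane i of v1 from unaligned[2*i + 1], e.g. splitting interleaved re/im pairs:
#include <cstddef>
#include <cstdio>

int main() {
  const int interleaved[8] = {10, 11, 20, 21, 30, 31, 40, 41};  // re,im pairs
  int v0[4], v1[4];
  for (size_t i = 0; i < 4; ++i) {
    v0[i] = interleaved[2 * i];      // 10 20 30 40
    v1[i] = interleaved[2 * i + 1];  // 11 21 31 41
  }
  std::printf("%d %d\n", v0[2], v1[2]);  // 30 31
  return 0;
}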
// ------------------------------ LoadInterleaved3 (CombineShiftRightBytes)
namespace detail {
// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
template <typename T, size_t N, class V, HWY_IF_LE128(T, N)>
HWY_API void LoadTransposedBlocks3(Simd<T, N, 0> d,
const T* HWY_RESTRICT unaligned, V& A, V& B,
V& C) {
A = LoadU(d, unaligned + 0 * N);
B = LoadU(d, unaligned + 1 * N);
C = LoadU(d, unaligned + 2 * N);
}
} // namespace detail
template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 16)>
HWY_API void LoadInterleaved3(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
V& v0, V& v1, V& v2) {
const RebindToUnsigned<decltype(d)> du;
// Compact notation so these fit on one line: 12 := v1[2].
V A; // 05 24 14 04 23 13 03 22 12 02 21 11 01 20 10 00
V B; // 1a 0a 29 19 09 28 18 08 27 17 07 26 16 06 25 15
V C; // 2f 1f 0f 2e 1e 0e 2d 1d 0d 2c 1c 0c 2b 1b 0b 2a
detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
// Compress all lanes belonging to v0 into consecutive lanes.
constexpr uint8_t Z = 0x80;
alignas(16) constexpr uint8_t kIdx_v0A[16] = {0, 3, 6, 9, 12, 15, Z, Z,
Z, Z, Z, Z, Z, Z, Z, Z};
alignas(16) constexpr uint8_t kIdx_v0B[16] = {Z, Z, Z, Z, Z, Z, 2, 5,
8, 11, 14, Z, Z, Z, Z, Z};
alignas(16) constexpr uint8_t kIdx_v0C[16] = {Z, Z, Z, Z, Z, Z, Z, Z,
Z, Z, Z, 1, 4, 7, 10, 13};
alignas(16) constexpr uint8_t kIdx_v1A[16] = {1, 4, 7, 10, 13, Z, Z, Z,
Z, Z, Z, Z, Z, Z, Z, Z};
alignas(16) constexpr uint8_t kIdx_v1B[16] = {Z, Z, Z, Z, Z, 0, 3, 6,
9, 12, 15, Z, Z, Z, Z, Z};
alignas(16) constexpr uint8_t kIdx_v1C[16] = {Z, Z, Z, Z, Z, Z, Z, Z,
Z, Z, Z, 2, 5, 8, 11, 14};
alignas(16) constexpr uint8_t kIdx_v2A[16] = {2, 5, 8, 11, 14, Z, Z, Z,
Z, Z, Z, Z, Z, Z, Z, Z};
alignas(16) constexpr uint8_t kIdx_v2B[16] = {Z, Z, Z, Z, Z, 1, 4, 7,
10, 13, Z, Z, Z, Z, Z, Z};
alignas(16) constexpr uint8_t kIdx_v2C[16] = {Z, Z, Z, Z, Z, Z, Z, Z,
Z, Z, 0, 3, 6, 9, 12, 15};
const V v0L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v0A)));
const V v0M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v0B)));
const V v0U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v0C)));
const V v1L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v1A)));
const V v1M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v1B)));
const V v1U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v1C)));
const V v2L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v2A)));
const V v2M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v2B)));
const V v2U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v2C)));
v0 = Or3(v0L, v0M, v0U);
v1 = Or3(v1L, v1M, v1U);
v2 = Or3(v2L, v2M, v2U);
}
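The pattern above hinges on TableLookupBytesOr0: indices with the high bit set (0x80) produce zero bytes, so the three per-source partial results can simply be OR-ed together. A scalar model of that semantic, with illustrative values only:
#include <cstdint>
#include <cstdio>

int main() {
  const uint8_t bytes[16] = {1, 2,  3,  4,  5,  6,  7,  8,
                             9, 10, 11, 12, 13, 14, 15, 16};
  const uint8_t idx[8] = {0, 3, 6, 0x80, 0x80, 9, 12, 15};
  uint8_t out[8];
  for (int i = 0; i < 8; ++i) {
    // High bit set => zero output; otherwise a normal byte lookup.
    out[i] = (idx[i] & 0x80) ? static_cast<uint8_t>(0) : bytes[idx[i] & 0x0F];
  }
  std::printf("%d %d %d\n", static_cast<int>(out[0]), static_cast<int>(out[3]),
              static_cast<int>(out[5]));  // 1 0 10
  return 0;
}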
// 8-bit lanes x8
template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 1),
HWY_IF_LANES_PER_BLOCK(T, N, 8)>
HWY_API void LoadInterleaved3(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
V& v0, V& v1, V& v2) {
const RebindToUnsigned<decltype(d)> du;
V A; // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0]
V B; // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2]
V C; // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5]
detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
// Compress all lanes belonging to v0 into consecutive lanes.
constexpr uint8_t Z = 0x80;
alignas(16) constexpr uint8_t kIdx_v0A[16] = {0, 3, 6, Z, Z, Z, Z, Z};
alignas(16) constexpr uint8_t kIdx_v0B[16] = {Z, Z, Z, 1, 4, 7, Z, Z};
alignas(16) constexpr uint8_t kIdx_v0C[16] = {Z, Z, Z, Z, Z, Z, 2, 5};
alignas(16) constexpr uint8_t kIdx_v1A[16] = {1, 4, 7, Z, Z, Z, Z, Z};
alignas(16) constexpr uint8_t kIdx_v1B[16] = {Z, Z, Z, 2, 5, Z, Z, Z};
alignas(16) constexpr uint8_t kIdx_v1C[16] = {Z, Z, Z, Z, Z, 0, 3, 6};
alignas(16) constexpr uint8_t kIdx_v2A[16] = {2, 5, Z, Z, Z, Z, Z, Z};
alignas(16) constexpr uint8_t kIdx_v2B[16] = {Z, Z, 0, 3, 6, Z, Z, Z};
alignas(16) constexpr uint8_t kIdx_v2C[16] = {Z, Z, Z, Z, Z, 1, 4, 7};
const V v0L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v0A)));
const V v0M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v0B)));
const V v0U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v0C)));
const V v1L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v1A)));
const V v1M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v1B)));
const V v1U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v1C)));
const V v2L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v2A)));
const V v2M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v2B)));
const V v2U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v2C)));
v0 = Or3(v0L, v0M, v0U);
v1 = Or3(v1L, v1M, v1U);
v2 = Or3(v2L, v2M, v2U);
}
// 16-bit lanes x8
template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 2),
HWY_IF_LANES_PER_BLOCK(T, N, 8)>
HWY_API void LoadInterleaved3(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
V& v0, V& v1, V& v2) {
const RebindToUnsigned<decltype(d)> du;
V A; // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0]
V B; // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2]
V C; // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5]
detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
// Compress all lanes belonging to v0 into consecutive lanes. Same as above,
// but each element of the array contains two byte indices for a lane.
constexpr uint16_t Z = 0x8080;
alignas(16) constexpr uint16_t kIdx_v0A[8] = {0x0100, 0x0706, 0x0D0C, Z,
Z, Z, Z, Z};
alignas(16) constexpr uint16_t kIdx_v0B[8] = {Z, Z, Z, 0x0302,
0x0908, 0x0F0E, Z, Z};
alignas(16) constexpr uint16_t kIdx_v0C[8] = {Z, Z, Z, Z,
Z, Z, 0x0504, 0x0B0A};
alignas(16) constexpr uint16_t kIdx_v1A[8] = {0x0302, 0x0908, 0x0F0E, Z,
Z, Z, Z, Z};
alignas(16) constexpr uint16_t kIdx_v1B[8] = {Z, Z, Z, 0x0504,
0x0B0A, Z, Z, Z};
alignas(16) constexpr uint16_t kIdx_v1C[8] = {Z, Z, Z, Z,
Z, 0x0100, 0x0706, 0x0D0C};
alignas(16) constexpr uint16_t kIdx_v2A[8] = {0x0504, 0x0B0A, Z, Z,
Z, Z, Z, Z};
alignas(16) constexpr uint16_t kIdx_v2B[8] = {Z, Z, 0x0100, 0x0706,
0x0D0C, Z, Z, Z};
alignas(16) constexpr uint16_t kIdx_v2C[8] = {Z, Z, Z, Z,
Z, 0x0302, 0x0908, 0x0F0E};
const V v0L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v0A)));
const V v0M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v0B)));
const V v0U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v0C)));
const V v1L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v1A)));
const V v1M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v1B)));
const V v1U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v1C)));
const V v2L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v2A)));
const V v2M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v2B)));
const V v2U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v2C)));
v0 = Or3(v0L, v0M, v0U);
v1 = Or3(v1L, v1M, v1U);
v2 = Or3(v2L, v2M, v2U);
}
template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 4)>
HWY_API void LoadInterleaved3(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
V& v0, V& v1, V& v2) {
V A; // v0[1] v2[0] v1[0] v0[0]
V B; // v1[2] v0[2] v2[1] v1[1]
V C; // v2[3] v1[3] v0[3] v2[2]
detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
const V vxx_02_03_xx = OddEven(C, B);
v0 = detail::Shuffle1230(A, vxx_02_03_xx);
// Shuffle2301 takes the upper/lower halves of the output from one input, so
// we cannot just combine 13 and 10 with 12 and 11 (similar to v0/v2). Use
// OddEven because it may have higher throughput than Shuffle.
const V vxx_xx_10_11 = OddEven(A, B);
const V v12_13_xx_xx = OddEven(B, C);
v1 = detail::Shuffle2301(vxx_xx_10_11, v12_13_xx_xx);
const V vxx_20_21_xx = OddEven(B, A);
v2 = detail::Shuffle3012(vxx_20_21_xx, C);
}
template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 2)>
HWY_API void LoadInterleaved3(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
V& v0, V& v1, V& v2) {
V A; // v1[0] v0[0]
V B; // v0[1] v2[0]
V C; // v2[1] v1[1]
detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
v0 = OddEven(B, A);
v1 = CombineShiftRightBytes<sizeof(T)>(d, C, A);
v2 = OddEven(C, B);
}
template <typename T, class V>
HWY_API void LoadInterleaved3(Simd<T, 1, 0> d, const T* HWY_RESTRICT unaligned,
V& v0, V& v1, V& v2) {
v0 = LoadU(d, unaligned + 0);
v1 = LoadU(d, unaligned + 1);
v2 = LoadU(d, unaligned + 2);
}
// ------------------------------ LoadInterleaved4
namespace detail {
// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
template <typename T, size_t N, class V, HWY_IF_LE128(T, N)>
HWY_API void LoadTransposedBlocks4(Simd<T, N, 0> d,
const T* HWY_RESTRICT unaligned, V& A, V& B,
V& C, V& D) {
A = LoadU(d, unaligned + 0 * N);
B = LoadU(d, unaligned + 1 * N);
C = LoadU(d, unaligned + 2 * N);
D = LoadU(d, unaligned + 3 * N);
}
} // namespace detail
template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 16)>
HWY_API void LoadInterleaved4(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
V& v0, V& v1, V& v2, V& v3) {
const Repartition<uint64_t, decltype(d)> d64;
using V64 = VFromD<decltype(d64)>;
// 16 lanes per block; the lowest four blocks are at the bottom of A,B,C,D.
// Here int[i] means the four interleaved values of the i-th 4-tuple and
// int[3..0] indicates four consecutive 4-tuples (0 = least-significant).
V A; // int[13..10] int[3..0]
V B; // int[17..14] int[7..4]
V C; // int[1b..18] int[b..8]
V D; // int[1f..1c] int[f..c]
detail::LoadTransposedBlocks4(d, unaligned, A, B, C, D);
// For brevity, the comments only list the lower block (upper = lower + 0x10)
const V v5140 = InterleaveLower(d, A, B); // int[5,1,4,0]
const V vd9c8 = InterleaveLower(d, C, D); // int[d,9,c,8]
const V v7362 = InterleaveUpper(d, A, B); // int[7,3,6,2]
const V vfbea = InterleaveUpper(d, C, D); // int[f,b,e,a]
const V v6420 = InterleaveLower(d, v5140, v7362); // int[6,4,2,0]
const V veca8 = InterleaveLower(d, vd9c8, vfbea); // int[e,c,a,8]
const V v7531 = InterleaveUpper(d, v5140, v7362); // int[7,5,3,1]
const V vfdb9 = InterleaveUpper(d, vd9c8, vfbea); // int[f,d,b,9]
const V64 v10L = BitCast(d64, InterleaveLower(d, v6420, v7531)); // v10[7..0]
const V64 v10U = BitCast(d64, InterleaveLower(d, veca8, vfdb9)); // v10[f..8]
const V64 v32L = BitCast(d64, InterleaveUpper(d, v6420, v7531)); // v32[7..0]
const V64 v32U = BitCast(d64, InterleaveUpper(d, veca8, vfdb9)); // v32[f..8]
v0 = BitCast(d, InterleaveLower(d64, v10L, v10U));
v1 = BitCast(d, InterleaveUpper(d64, v10L, v10U));
v2 = BitCast(d, InterleaveLower(d64, v32L, v32U));
v3 = BitCast(d, InterleaveUpper(d64, v32L, v32U));
}
template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 8)>
HWY_API void LoadInterleaved4(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
V& v0, V& v1, V& v2, V& v3) {
// In the last step, we interleave by half of the block size, which is usually
// 8 bytes but half that for 8-bit x8 vectors.
using TW = hwy::UnsignedFromSize<sizeof(T) * N == 8 ? 4 : 8>;
const Repartition<TW, decltype(d)> dw;
using VW = VFromD<decltype(dw)>;
// (Comments are for 256-bit vectors.)
// 8 lanes per block; the lowest four blocks are at the bottom of A,B,C,D.
V A; // v3210[9]v3210[8] v3210[1]v3210[0]
V B; // v3210[b]v3210[a] v3210[3]v3210[2]
V C; // v3210[d]v3210[c] v3210[5]v3210[4]
V D; // v3210[f]v3210[e] v3210[7]v3210[6]
detail::LoadTransposedBlocks4(d, unaligned, A, B, C, D);
const V va820 = InterleaveLower(d, A, B); // v3210[a,8] v3210[2,0]
const V vec64 = InterleaveLower(d, C, D); // v3210[e,c] v3210[6,4]
const V vb931 = InterleaveUpper(d, A, B); // v3210[b,9] v3210[3,1]
const V vfd75 = InterleaveUpper(d, C, D); // v3210[f,d] v3210[7,5]
const VW v10_b830 = // v10[b..8] v10[3..0]
BitCast(dw, InterleaveLower(d, va820, vb931));
const VW v10_fc74 = // v10[f..c] v10[7..4]
BitCast(dw, InterleaveLower(d, vec64, vfd75));
const VW v32_b830 = // v32[b..8] v32[3..0]
BitCast(dw, InterleaveUpper(d, va820, vb931));
const VW v32_fc74 = // v32[f..c] v32[7..4]
BitCast(dw, InterleaveUpper(d, vec64, vfd75));
v0 = BitCast(d, InterleaveLower(dw, v10_b830, v10_fc74));
v1 = BitCast(d, InterleaveUpper(dw, v10_b830, v10_fc74));
v2 = BitCast(d, InterleaveLower(dw, v32_b830, v32_fc74));
v3 = BitCast(d, InterleaveUpper(dw, v32_b830, v32_fc74));
}
template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 4)>
HWY_API void LoadInterleaved4(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
V& v0, V& v1, V& v2, V& v3) {
V A; // v3210[4] v3210[0]
V B; // v3210[5] v3210[1]
V C; // v3210[6] v3210[2]
V D; // v3210[7] v3210[3]
detail::LoadTransposedBlocks4(d, unaligned, A, B, C, D);
const V v10_ev = InterleaveLower(d, A, C); // v1[6,4] v0[6,4] v1[2,0] v0[2,0]
const V v10_od = InterleaveLower(d, B, D); // v1[7,5] v0[7,5] v1[3,1] v0[3,1]
const V v32_ev = InterleaveUpper(d, A, C); // v3[6,4] v2[6,4] v3[2,0] v2[2,0]
const V v32_od = InterleaveUpper(d, B, D); // v3[7,5] v2[7,5] v3[3,1] v2[3,1]
v0 = InterleaveLower(d, v10_ev, v10_od);
v1 = InterleaveUpper(d, v10_ev, v10_od);
v2 = InterleaveLower(d, v32_ev, v32_od);
v3 = InterleaveUpper(d, v32_ev, v32_od);
}
template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 2)>
HWY_API void LoadInterleaved4(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
V& v0, V& v1, V& v2, V& v3) {
V A, B, C, D;
detail::LoadTransposedBlocks4(d, unaligned, A, B, C, D);
v0 = InterleaveLower(d, A, C);
v1 = InterleaveUpper(d, A, C);
v2 = InterleaveLower(d, B, D);
v3 = InterleaveUpper(d, B, D);
}
// Any T x1
template <typename T, class V>
HWY_API void LoadInterleaved4(Simd<T, 1, 0> d, const T* HWY_RESTRICT unaligned,
V& v0, V& v1, V& v2, V& v3) {
v0 = LoadU(d, unaligned + 0);
v1 = LoadU(d, unaligned + 1);
v2 = LoadU(d, unaligned + 2);
v3 = LoadU(d, unaligned + 3);
}
// ------------------------------ StoreInterleaved2
namespace detail {
// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
template <typename T, size_t N, class V, HWY_IF_LE128(T, N)>
HWY_API void StoreTransposedBlocks2(const V A, const V B, Simd<T, N, 0> d,
T* HWY_RESTRICT unaligned) {
StoreU(A, d, unaligned + 0 * N);
StoreU(B, d, unaligned + 1 * N);
}
} // namespace detail
// >= 128 bit vector
template <typename T, size_t N, class V, HWY_IF_GE128(T, N)>
HWY_API void StoreInterleaved2(const V v0, const V v1, Simd<T, N, 0> d,
T* HWY_RESTRICT unaligned) {
const auto v10L = InterleaveLower(d, v0, v1); // .. v1[0] v0[0]
const auto v10U = InterleaveUpper(d, v0, v1); // .. v1[N/2] v0[N/2]
detail::StoreTransposedBlocks2(v10L, v10U, d, unaligned);
}
// 64 bits
template <typename T>
HWY_API void StoreInterleaved2(const Vec64<T> part0, const Vec64<T> part1,
Full64<T> /*tag*/, T* HWY_RESTRICT unaligned) {
// Use full vectors to reduce the number of stores.
const Full128<T> d_full;
const Vec128<T> v0{part0.raw};
const Vec128<T> v1{part1.raw};
const auto v10 = InterleaveLower(d_full, v0, v1);
StoreU(v10, d_full, unaligned);
}
// <= 32 bits
template <typename T, size_t N, HWY_IF_LE32(T, N)>
HWY_API void StoreInterleaved2(const Vec128<T, N> part0,
const Vec128<T, N> part1, Simd<T, N, 0> /*tag*/,
T* HWY_RESTRICT unaligned) {
// Use full vectors to reduce the number of stores.
const Full128<T> d_full;
const Vec128<T> v0{part0.raw};
const Vec128<T> v1{part1.raw};
const auto v10 = InterleaveLower(d_full, v0, v1);
alignas(16) T buf[16 / sizeof(T)];
StoreU(v10, d_full, buf);
CopyBytes<2 * N * sizeof(T)>(buf, unaligned);
}
// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
// TableLookupBytes)
namespace detail {
// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
template <typename T, size_t N, class V, HWY_IF_LE128(T, N)>
HWY_API void StoreTransposedBlocks3(const V A, const V B, const V C,
Simd<T, N, 0> d,
T* HWY_RESTRICT unaligned) {
StoreU(A, d, unaligned + 0 * N);
StoreU(B, d, unaligned + 1 * N);
StoreU(C, d, unaligned + 2 * N);
}
} // namespace detail
// >= 128-bit vector, 8-bit lanes
template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 1),
HWY_IF_GE128(T, N)>
HWY_API void StoreInterleaved3(const V v0, const V v1, const V v2,
Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
const RebindToUnsigned<decltype(d)> du;
const auto k5 = Set(du, 5);
const auto k6 = Set(du, 6);
// Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
// v0[5], v2[4],v1[4],v0[4] .. v2[0],v1[0],v0[0]. We're expanding v0 lanes
// to their place, with 0x80 so lanes to be filled from other vectors are 0
// to enable blending by ORing together.
alignas(16) static constexpr uint8_t tbl_v0[16] = {
0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
3, 0x80, 0x80, 4, 0x80, 0x80, 5};
alignas(16) static constexpr uint8_t tbl_v1[16] = {
0x80, 0, 0x80, 0x80, 1, 0x80, //
0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
// The interleaved vectors will be named A, B, C; temporaries with suffix
// 0..2 indicate which input vector's lanes they hold.
const auto shuf_A0 = LoadDup128(du, tbl_v0);
const auto shuf_A1 = LoadDup128(du, tbl_v1); // cannot reuse shuf_A0 (has 5)
const auto shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1);
const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // 5..4..3..2..1..0
const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // ..4..3..2..1..0.
const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // .4..3..2..1..0..
const V A = BitCast(d, A0 | A1 | A2);
// B: v1[10],v0[10], v2[9],v1[9],v0[9] .. , v2[6],v1[6],v0[6], v2[5],v1[5]
const auto shuf_B0 = shuf_A2 + k6; // .A..9..8..7..6..
const auto shuf_B1 = shuf_A0 + k5; // A..9..8..7..6..5
const auto shuf_B2 = shuf_A1 + k5; // ..9..8..7..6..5.
const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
const V B = BitCast(d, B0 | B1 | B2);
// C: v2[15],v1[15],v0[15], v2[11],v1[11],v0[11], v2[10]
const auto shuf_C0 = shuf_B2 + k6; // ..F..E..D..C..B.
const auto shuf_C1 = shuf_B0 + k5; // .F..E..D..C..B..
const auto shuf_C2 = shuf_B1 + k5; // F..E..D..C..B..A
const auto C0 = TableLookupBytesOr0(v0, shuf_C0);
const auto C1 = TableLookupBytesOr0(v1, shuf_C1);
const auto C2 = TableLookupBytesOr0(v2, shuf_C2);
const V C = BitCast(d, C0 | C1 | C2);
detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
}
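A scalar reference (not from this patch) for the result StoreInterleaved3 must produce: three planar arrays packed into r0 g0 b0 r1 g1 b1 ...:
#include <cstddef>
#include <cstdio>

int main() {
  const unsigned char r[4] = {1, 2, 3, 4};
  const unsigned char g[4] = {5, 6, 7, 8};
  const unsigned char b[4] = {9, 10, 11, 12};
  unsigned char packed[12];
  for (size_t i = 0; i < 4; ++i) {
    packed[3 * i + 0] = r[i];
    packed[3 * i + 1] = g[i];
    packed[3 * i + 2] = b[i];
  }
  std::printf("%d %d %d\n", packed[0], packed[1], packed[11]);  // 1 5 12
  return 0;
}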
// >= 128-bit vector, 16-bit lanes
template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 2),
HWY_IF_GE128(T, N)>
HWY_API void StoreInterleaved3(const V v0, const V v1, const V v2,
Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
const Repartition<uint8_t, decltype(d)> du8;
const auto k2 = Set(du8, 2 * sizeof(T));
const auto k3 = Set(du8, 3 * sizeof(T));
// Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
// v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be
// filled from other vectors are 0 for blending. Note that these are byte
// indices for 16-bit lanes.
alignas(16) static constexpr uint8_t tbl_v1[16] = {
0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80,
2, 3, 0x80, 0x80, 0x80, 0x80, 4, 5};
alignas(16) static constexpr uint8_t tbl_v2[16] = {
0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80,
0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80};
// The interleaved vectors will be named A, B, C; temporaries with suffix
// 0..2 indicate which input vector's lanes they hold.
const auto shuf_A1 = LoadDup128(du8, tbl_v1); // 2..1..0.
// .2..1..0
const auto shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);
const auto shuf_A2 = LoadDup128(du8, tbl_v2); // ..1..0..
const auto A0 = TableLookupBytesOr0(v0, shuf_A0);
const auto A1 = TableLookupBytesOr0(v1, shuf_A1);
const auto A2 = TableLookupBytesOr0(v2, shuf_A2);
const V A = BitCast(d, A0 | A1 | A2);
// B: v0[5] v2[4],v1[4],v0[4], v2[3],v1[3],v0[3], v2[2]
const auto shuf_B0 = shuf_A1 + k3; // 5..4..3.
const auto shuf_B1 = shuf_A2 + k3; // ..4..3..
const auto shuf_B2 = shuf_A0 + k2; // .4..3..2
const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
const V B = BitCast(d, B0 | B1 | B2);
// C: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5]
const auto shuf_C0 = shuf_B1 + k3; // ..7..6..
const auto shuf_C1 = shuf_B2 + k3; // .7..6..5
const auto shuf_C2 = shuf_B0 + k2; // 7..6..5.
const auto C0 = TableLookupBytesOr0(v0, shuf_C0);
const auto C1 = TableLookupBytesOr0(v1, shuf_C1);
const auto C2 = TableLookupBytesOr0(v2, shuf_C2);
const V C = BitCast(d, C0 | C1 | C2);
detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
}
// >= 128-bit vector, 32-bit lanes
template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 4),
HWY_IF_GE128(T, N)>
HWY_API void StoreInterleaved3(const V v0, const V v1, const V v2,
Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
const RepartitionToWide<decltype(d)> dw;
const V v10_v00 = InterleaveLower(d, v0, v1);
const V v01_v20 = OddEven(v0, v2);
// A: v0[1], v2[0],v1[0],v0[0] (<- lane 0)
const V A = BitCast(
d, InterleaveLower(dw, BitCast(dw, v10_v00), BitCast(dw, v01_v20)));
const V v1_321 = ShiftRightLanes<1>(d, v1);
const V v0_32 = ShiftRightLanes<2>(d, v0);
const V v21_v11 = OddEven(v2, v1_321);
const V v12_v02 = OddEven(v1_321, v0_32);
// B: v1[2],v0[2], v2[1],v1[1]
const V B = BitCast(
d, InterleaveLower(dw, BitCast(dw, v21_v11), BitCast(dw, v12_v02)));
// Notation refers to the upper 2 lanes of the vector for InterleaveUpper.
const V v23_v13 = OddEven(v2, v1_321);
const V v03_v22 = OddEven(v0, v2);
// C: v2[3],v1[3],v0[3], v2[2]
const V C = BitCast(
d, InterleaveUpper(dw, BitCast(dw, v03_v22), BitCast(dw, v23_v13)));
detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
}
// >= 128-bit vector, 64-bit lanes
template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 8),
HWY_IF_GE128(T, N)>
HWY_API void StoreInterleaved3(const V v0, const V v1, const V v2,
Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
const V A = InterleaveLower(d, v0, v1);
const V B = OddEven(v0, v2);
const V C = InterleaveUpper(d, v1, v2);
detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
}
// 64-bit vector, 8-bit lanes
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_API void StoreInterleaved3(const Vec64<T> part0, const Vec64<T> part1,
const Vec64<T> part2, Full64<T> d,
T* HWY_RESTRICT unaligned) {
constexpr size_t N = 16 / sizeof(T);
// Use full vectors for the shuffles and first result.
const Full128<uint8_t> du;
const Full128<T> d_full;
const auto k5 = Set(du, 5);
const auto k6 = Set(du, 6);
const Vec128<T> v0{part0.raw};
const Vec128<T> v1{part1.raw};
const Vec128<T> v2{part2.raw};
// Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
// v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be
// filled from other vectors are 0 for blending.
alignas(16) static constexpr uint8_t tbl_v0[16] = {
0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
3, 0x80, 0x80, 4, 0x80, 0x80, 5};
alignas(16) static constexpr uint8_t tbl_v1[16] = {
0x80, 0, 0x80, 0x80, 1, 0x80, //
0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
// The interleaved vectors will be named A, B, C; temporaries with suffix
// 0..2 indicate which input vector's lanes they hold.
const auto shuf_A0 = Load(du, tbl_v0);
const auto shuf_A1 = Load(du, tbl_v1); // cannot reuse shuf_A0 (5 in MSB)
const auto shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1);
const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // 5..4..3..2..1..0
const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // ..4..3..2..1..0.
const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // .4..3..2..1..0..
const auto A = BitCast(d_full, A0 | A1 | A2);
StoreU(A, d_full, unaligned + 0 * N);
// Second (HALF) vector: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5]
const auto shuf_B0 = shuf_A2 + k6; // ..7..6..
const auto shuf_B1 = shuf_A0 + k5; // .7..6..5
const auto shuf_B2 = shuf_A1 + k5; // 7..6..5.
const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
const Vec64<T> B{(B0 | B1 | B2).raw};
StoreU(B, d, unaligned + 1 * N);
}
// 64-bit vector, 16-bit lanes
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API void StoreInterleaved3(const Vec64<T> part0, const Vec64<T> part1,
const Vec64<T> part2, Full64<T> dh,
T* HWY_RESTRICT unaligned) {
const Full128<T> d;
const Full128<uint8_t> du8;
constexpr size_t N = 16 / sizeof(T);
const auto k2 = Set(du8, 2 * sizeof(T));
const auto k3 = Set(du8, 3 * sizeof(T));
const Vec128<T> v0{part0.raw};
const Vec128<T> v1{part1.raw};
const Vec128<T> v2{part2.raw};
// Interleave part (v0,v1,v2) to full (MSB on left, lane 0 on right):
// v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. We're expanding v0 lanes
// to their place, with 0x80 so lanes to be filled from other vectors are 0
// to enable blending by ORing together.
alignas(16) static constexpr uint8_t tbl_v1[16] = {
0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80,
2, 3, 0x80, 0x80, 0x80, 0x80, 4, 5};
alignas(16) static constexpr uint8_t tbl_v2[16] = {
0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80,
0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80};
// The interleaved vectors will be named A, B; temporaries with suffix
// 0..2 indicate which input vector's lanes they hold.
const auto shuf_A1 = Load(du8, tbl_v1); // 2..1..0.
// .2..1..0
const auto shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);
const auto shuf_A2 = Load(du8, tbl_v2); // ..1..0..
const auto A0 = TableLookupBytesOr0(v0, shuf_A0);
const auto A1 = TableLookupBytesOr0(v1, shuf_A1);
const auto A2 = TableLookupBytesOr0(v2, shuf_A2);
const Vec128<T> A = BitCast(d, A0 | A1 | A2);
StoreU(A, d, unaligned + 0 * N);
// Second (HALF) vector: v2[3],v1[3],v0[3], v2[2]
const auto shuf_B0 = shuf_A1 + k3; // ..3.
const auto shuf_B1 = shuf_A2 + k3; // .3..
const auto shuf_B2 = shuf_A0 + k2; // 3..2
const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
const Vec128<T> B = BitCast(d, B0 | B1 | B2);
StoreU(Vec64<T>{B.raw}, dh, unaligned + 1 * N);
}
// 64-bit vector, 32-bit lanes
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API void StoreInterleaved3(const Vec64<T> v0, const Vec64<T> v1,
const Vec64<T> v2, Full64<T> d,
T* HWY_RESTRICT unaligned) {
// (same code as 128-bit vector, 64-bit lanes)
constexpr size_t N = 2;
const Vec64<T> v10_v00 = InterleaveLower(d, v0, v1);
const Vec64<T> v01_v20 = OddEven(v0, v2);
const Vec64<T> v21_v11 = InterleaveUpper(d, v1, v2);
StoreU(v10_v00, d, unaligned + 0 * N);
StoreU(v01_v20, d, unaligned + 1 * N);
StoreU(v21_v11, d, unaligned + 2 * N);
}
// 64-bit lanes are handled by the N=1 case below.
// <= 32-bit vector, 8-bit lanes
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1), HWY_IF_LE32(T, N)>
HWY_API void StoreInterleaved3(const Vec128<T, N> part0,
const Vec128<T, N> part1,
const Vec128<T, N> part2, Simd<T, N, 0> /*tag*/,
T* HWY_RESTRICT unaligned) {
// Use full vectors for the shuffles and result.
const Full128<uint8_t> du;
const Full128<T> d_full;
const Vec128<T> v0{part0.raw};
const Vec128<T> v1{part1.raw};
const Vec128<T> v2{part2.raw};
// Interleave (v0,v1,v2). We're expanding v0 lanes to their place, with 0x80
// so lanes to be filled from other vectors are 0 to enable blending by ORing
// together.
alignas(16) static constexpr uint8_t tbl_v0[16] = {
0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80,
0x80, 3, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
// The interleaved vector will be named A; temporaries with suffix
// 0..2 indicate which input vector's lanes they hold.
const auto shuf_A0 = Load(du, tbl_v0);
const auto shuf_A1 = CombineShiftRightBytes<15>(du, shuf_A0, shuf_A0);
const auto shuf_A2 = CombineShiftRightBytes<14>(du, shuf_A0, shuf_A0);
const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // ......3..2..1..0
const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // .....3..2..1..0.
const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // ....3..2..1..0..
const Vec128<T> A = BitCast(d_full, A0 | A1 | A2);
alignas(16) T buf[16 / sizeof(T)];
StoreU(A, d_full, buf);
CopyBytes<N * 3 * sizeof(T)>(buf, unaligned);
}
// 32-bit vector, 16-bit lanes
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API void StoreInterleaved3(const Vec128<T, 2> part0,
const Vec128<T, 2> part1,
const Vec128<T, 2> part2, Simd<T, 2, 0> /*tag*/,
T* HWY_RESTRICT unaligned) {
constexpr size_t N = 4 / sizeof(T);
// Use full vectors for the shuffles and result.
const Full128<uint8_t> du8;
const Full128<T> d_full;
const Vec128<T> v0{part0.raw};
const Vec128<T> v1{part1.raw};
const Vec128<T> v2{part2.raw};
// Interleave (v0,v1,v2). We're expanding v0 lanes to their place, with 0x80
// so lanes to be filled from other vectors are 0 to enable blending by ORing
// together.
alignas(16) static constexpr uint8_t tbl_v2[16] = {
0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80,
0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80};
// The interleaved vector will be named A; temporaries with suffix
// 0..2 indicate which input vector's lanes they hold.
const auto shuf_A2 = // ..1..0..
Load(du8, tbl_v2);
const auto shuf_A1 = // ...1..0.
CombineShiftRightBytes<2>(du8, shuf_A2, shuf_A2);
const auto shuf_A0 = // ....1..0
CombineShiftRightBytes<4>(du8, shuf_A2, shuf_A2);
const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // ..1..0
const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // .1..0.
const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // 1..0..
const auto A = BitCast(d_full, A0 | A1 | A2);
alignas(16) T buf[16 / sizeof(T)];
StoreU(A, d_full, buf);
CopyBytes<N * 3 * sizeof(T)>(buf, unaligned);
}
// Single-element vector, any lane size: just store directly
template <typename T>
HWY_API void StoreInterleaved3(const Vec128<T, 1> v0, const Vec128<T, 1> v1,
const Vec128<T, 1> v2, Simd<T, 1, 0> d,
T* HWY_RESTRICT unaligned) {
StoreU(v0, d, unaligned + 0);
StoreU(v1, d, unaligned + 1);
StoreU(v2, d, unaligned + 2);
}
// ------------------------------ StoreInterleaved4
namespace detail {
// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
template <typename T, size_t N, class V, HWY_IF_LE128(T, N)>
HWY_API void StoreTransposedBlocks4(const V A, const V B, const V C, const V D,
Simd<T, N, 0> d,
T* HWY_RESTRICT unaligned) {
StoreU(A, d, unaligned + 0 * N);
StoreU(B, d, unaligned + 1 * N);
StoreU(C, d, unaligned + 2 * N);
StoreU(D, d, unaligned + 3 * N);
}
} // namespace detail
// >= 128-bit vector, 8..32-bit lanes
template <typename T, size_t N, class V, HWY_IF_NOT_LANE_SIZE(T, 8),
HWY_IF_GE128(T, N)>
HWY_API void StoreInterleaved4(const V v0, const V v1, const V v2, const V v3,
Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
const RepartitionToWide<decltype(d)> dw;
const auto v10L = ZipLower(dw, v0, v1); // .. v1[0] v0[0]
const auto v32L = ZipLower(dw, v2, v3);
const auto v10U = ZipUpper(dw, v0, v1);
const auto v32U = ZipUpper(dw, v2, v3);
// The interleaved vectors are A, B, C, D.
const auto A = BitCast(d, InterleaveLower(dw, v10L, v32L)); // 3210
const auto B = BitCast(d, InterleaveUpper(dw, v10L, v32L));
const auto C = BitCast(d, InterleaveLower(dw, v10U, v32U));
const auto D = BitCast(d, InterleaveUpper(dw, v10U, v32U));
detail::StoreTransposedBlocks4(A, B, C, D, d, unaligned);
}
// >= 128-bit vector, 64-bit lanes
template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 8),
HWY_IF_GE128(T, N)>
HWY_API void StoreInterleaved4(const V v0, const V v1, const V v2, const V v3,
Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
// The interleaved vectors are A, B, C, D.
const auto A = InterleaveLower(d, v0, v1); // v1[0] v0[0]
const auto B = InterleaveLower(d, v2, v3);
const auto C = InterleaveUpper(d, v0, v1);
const auto D = InterleaveUpper(d, v2, v3);
detail::StoreTransposedBlocks4(A, B, C, D, d, unaligned);
}
// 64-bit vector, 8..32-bit lanes
template <typename T, HWY_IF_NOT_LANE_SIZE(T, 8)>
HWY_API void StoreInterleaved4(const Vec64<T> part0, const Vec64<T> part1,
const Vec64<T> part2, const Vec64<T> part3,
Full64<T> /*tag*/, T* HWY_RESTRICT unaligned) {
constexpr size_t N = 16 / sizeof(T);
// Use full vectors to reduce the number of stores.
const Full128<T> d_full;
const RepartitionToWide<decltype(d_full)> dw;
const Vec128<T> v0{part0.raw};
const Vec128<T> v1{part1.raw};
const Vec128<T> v2{part2.raw};
const Vec128<T> v3{part3.raw};
const auto v10 = ZipLower(dw, v0, v1); // v1[0] v0[0]
const auto v32 = ZipLower(dw, v2, v3);
const auto A = BitCast(d_full, InterleaveLower(dw, v10, v32));
const auto B = BitCast(d_full, InterleaveUpper(dw, v10, v32));
StoreU(A, d_full, unaligned + 0 * N);
StoreU(B, d_full, unaligned + 1 * N);
}
// 64-bit vector, 64-bit lane
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API void StoreInterleaved4(const Vec64<T> part0, const Vec64<T> part1,
const Vec64<T> part2, const Vec64<T> part3,
Full64<T> /*tag*/, T* HWY_RESTRICT unaligned) {
constexpr size_t N = 16 / sizeof(T);
// Use full vectors to reduce the number of stores.
const Full128<T> d_full;
const Vec128<T> v0{part0.raw};
const Vec128<T> v1{part1.raw};
const Vec128<T> v2{part2.raw};
const Vec128<T> v3{part3.raw};
const auto A = InterleaveLower(d_full, v0, v1); // v1[0] v0[0]
const auto B = InterleaveLower(d_full, v2, v3);
StoreU(A, d_full, unaligned + 0 * N);
StoreU(B, d_full, unaligned + 1 * N);
}
// <= 32-bit vectors
template <typename T, size_t N, HWY_IF_LE32(T, N)>
HWY_API void StoreInterleaved4(const Vec128<T, N> part0,
const Vec128<T, N> part1,
const Vec128<T, N> part2,
const Vec128<T, N> part3, Simd<T, N, 0> /*tag*/,
T* HWY_RESTRICT unaligned) {
// Use full vectors to reduce the number of stores.
const Full128<T> d_full;
const RepartitionToWide<decltype(d_full)> dw;
const Vec128<T> v0{part0.raw};
const Vec128<T> v1{part1.raw};
const Vec128<T> v2{part2.raw};
const Vec128<T> v3{part3.raw};
const auto v10 = ZipLower(dw, v0, v1); // .. v1[0] v0[0]
const auto v32 = ZipLower(dw, v2, v3);
const auto v3210 = BitCast(d_full, InterleaveLower(dw, v10, v32));
alignas(16) T buf[16 / sizeof(T)];
StoreU(v3210, d_full, buf);
CopyBytes<4 * N * sizeof(T)>(buf, unaligned);
}
#endif // HWY_NATIVE_LOAD_STORE_INTERLEAVED
// ------------------------------ AESRound
// Cannot implement on scalar: need at least 16 bytes for TableLookupBytes.
@ -281,6 +1181,7 @@ HWY_API V CLMulUpper(V a, V b) {
#define HWY_NATIVE_POPCNT
#endif
#undef HWY_MIN_POW2_FOR_128
#if HWY_TARGET == HWY_RVV
#define HWY_MIN_POW2_FOR_128 1
#else
@ -291,10 +1192,11 @@ HWY_API V CLMulUpper(V a, V b) {
// This algorithm requires vectors to be at least 16 bytes, which is the case
// for LMUL >= 2. If not, use the fallback below.
template <typename V, HWY_IF_LANES_ARE(uint8_t, V), HWY_IF_GE128_D(DFromV<V>),
HWY_IF_POW2_GE(DFromV<V>, HWY_MIN_POW2_FOR_128)>
template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 1),
HWY_IF_GE128_D(D), HWY_IF_POW2_GE(D, HWY_MIN_POW2_FOR_128)>
HWY_API V PopulationCount(V v) {
const DFromV<V> d;
static_assert(IsSame<TFromD<D>, uint8_t>(), "V must be u8");
const D d;
HWY_ALIGN constexpr uint8_t kLookup[16] = {
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
};
@ -307,9 +1209,10 @@ HWY_API V PopulationCount(V v) {
// RVV has a specialization that avoids the Set().
#if HWY_TARGET != HWY_RVV
// Slower fallback for capped vectors.
template <typename V, HWY_IF_LANES_ARE(uint8_t, V), HWY_IF_LT128_D(DFromV<V>)>
template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 1), HWY_IF_LT128_D(D)>
HWY_API V PopulationCount(V v) {
const DFromV<V> d;
static_assert(IsSame<TFromD<D>, uint8_t>(), "V must be u8");
const D d;
// See https://arxiv.org/pdf/1611.07612.pdf, Figure 3
v = Sub(v, And(ShiftRight<1>(v), Set(d, 0x55)));
v = Add(And(ShiftRight<2>(v), Set(d, 0x33)), And(v, Set(d, 0x33)));
@ -317,26 +1220,29 @@ HWY_API V PopulationCount(V v) {
}
#endif // HWY_TARGET != HWY_RVV
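The capped-vector fallback above is the classic byte-wise bit-slicing popcount; a standalone scalar illustration of the same three steps on a single byte:
#include <cstdint>
#include <cstdio>

int main() {
  uint8_t v = 0xB7;                                          // 10110111 -> 6 set bits
  v = static_cast<uint8_t>(v - ((v >> 1) & 0x55));           // sums per 2-bit group
  v = static_cast<uint8_t>(((v >> 2) & 0x33) + (v & 0x33));  // sums per 4-bit group
  v = static_cast<uint8_t>((v + (v >> 4)) & 0x0F);           // sum for the whole byte
  std::printf("%d\n", v);  // 6
  return 0;
}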
template <typename V, HWY_IF_LANES_ARE(uint16_t, V)>
template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 2)>
HWY_API V PopulationCount(V v) {
const DFromV<V> d;
static_assert(IsSame<TFromD<D>, uint16_t>(), "V must be u16");
const D d;
const Repartition<uint8_t, decltype(d)> d8;
const auto vals = BitCast(d, PopulationCount(BitCast(d8, v)));
return Add(ShiftRight<8>(vals), And(vals, Set(d, 0xFF)));
}
template <typename V, HWY_IF_LANES_ARE(uint32_t, V)>
template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 4)>
HWY_API V PopulationCount(V v) {
const DFromV<V> d;
static_assert(IsSame<TFromD<D>, uint32_t>(), "V must be u32");
const D d;
Repartition<uint16_t, decltype(d)> d16;
auto vals = BitCast(d, PopulationCount(BitCast(d16, v)));
return Add(ShiftRight<16>(vals), And(vals, Set(d, 0xFF)));
}
#if HWY_HAVE_INTEGER64
template <typename V, HWY_IF_LANES_ARE(uint64_t, V)>
template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 8)>
HWY_API V PopulationCount(V v) {
const DFromV<V> d;
static_assert(IsSame<TFromD<D>, uint64_t>(), "V must be u64");
const D d;
Repartition<uint32_t, decltype(d)> d32;
auto vals = BitCast(d, PopulationCount(BitCast(d32, v)));
return Add(ShiftRight<32>(vals), And(vals, Set(d, 0xFF)));

1233
third_party/highway/hwy/ops/rvv-inl.h (vendored)
(diff not shown because of its size)

257
third_party/highway/hwy/ops/scalar-inl.h (vendored)

@ -1,4 +1,5 @@
// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -70,7 +71,7 @@ class Mask1 {
public:
static HWY_INLINE Mask1<T> FromBool(bool b) {
Mask1<T> mask;
mask.bits = b ? ~Raw(0) : 0;
mask.bits = b ? static_cast<Raw>(~Raw{0}) : 0;
return mask;
}
@ -127,6 +128,9 @@ HWY_API Vec1<T> Iota(const Sisd<T> /* tag */, const T2 first) {
return Vec1<T>(static_cast<T>(first));
}
template <class D>
using VFromD = decltype(Zero(D()));
// ================================================== LOGICAL
// ------------------------------ Not
@ -187,6 +191,13 @@ HWY_API Vec1<T> operator^(const Vec1<T> a, const Vec1<T> b) {
return Xor(a, b);
}
// ------------------------------ Or3
template <typename T>
HWY_API Vec1<T> Or3(Vec1<T> o1, Vec1<T> o2, Vec1<T> o3) {
return Or(o1, Or(o2, o3));
}
// ------------------------------ OrAnd
template <typename T>
@ -337,7 +348,8 @@ HWY_API Mask1<T> Xor(const Mask1<T> a, Mask1<T> b) {
template <int kBits, typename T>
HWY_API Vec1<T> ShiftLeft(const Vec1<T> v) {
static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
return Vec1<T>(static_cast<hwy::MakeUnsigned<T>>(v.raw) << kBits);
return Vec1<T>(
static_cast<T>(static_cast<hwy::MakeUnsigned<T>>(v.raw) << kBits));
}
template <int kBits, typename T>
@ -346,7 +358,7 @@ HWY_API Vec1<T> ShiftRight(const Vec1<T> v) {
#if __cplusplus >= 202002L
// Signed right shift is now guaranteed to be arithmetic (rounding toward
// negative infinity, i.e. shifting in the sign bit).
return Vec1<T>(v.raw >> kBits);
return Vec1<T>(static_cast<T>(v.raw >> kBits));
#else
if (IsSigned<T>()) {
// Emulate arithmetic shift using only logical (unsigned) shifts, because
@ -355,28 +367,51 @@ HWY_API Vec1<T> ShiftRight(const Vec1<T> v) {
const Sisd<TU> du;
const TU shifted = BitCast(du, v).raw >> kBits;
const TU sign = BitCast(du, BroadcastSignBit(v)).raw;
const TU upper = sign << (sizeof(TU) * 8 - 1 - kBits);
const size_t sign_shift =
static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - kBits);
const TU upper = static_cast<TU>(sign << sign_shift);
return BitCast(Sisd<T>(), Vec1<TU>(shifted | upper));
} else {
return Vec1<T>(v.raw >> kBits); // unsigned, logical shift
} else { // T is unsigned
return Vec1<T>(static_cast<T>(v.raw >> kBits));
}
#endif
}
// ------------------------------ RotateRight (ShiftRight)
namespace detail {
// For partial specialization: kBits == 0 results in an invalid shift count
template <int kBits>
struct RotateRight {
template <typename T>
HWY_INLINE Vec1<T> operator()(const Vec1<T> v) const {
return Or(ShiftRight<kBits>(v), ShiftLeft<sizeof(T) * 8 - kBits>(v));
}
};
template <>
struct RotateRight<0> {
template <typename T>
HWY_INLINE Vec1<T> operator()(const Vec1<T> v) const {
return v;
}
};
} // namespace detail
template <int kBits, typename T>
HWY_API Vec1<T> RotateRight(const Vec1<T> v) {
static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
if (kBits == 0) return v;
return Or(ShiftRight<kBits>(v), ShiftLeft<sizeof(T) * 8 - kBits>(v));
return detail::RotateRight<kBits>()(v);
}
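The kBits == 0 specialization exists because ShiftLeft<sizeof(T) * 8 - kBits> would otherwise instantiate an out-of-range shift count even though that branch is never taken at runtime. A scalar sketch of the rotate itself, with illustrative values:
#include <cstdint>
#include <cstdio>

int main() {
  const uint8_t v = 0x96;  // 1001 0110
  const int k = 3;
  // Shift right by k and OR back in the bits that fell off on the right.
  const uint8_t rotated = static_cast<uint8_t>((v >> k) | (v << (8 - k)));
  std::printf("0x%02X\n", static_cast<unsigned>(rotated));  // 0xD2 = 1101 0010
  return 0;
}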
// ------------------------------ ShiftLeftSame (BroadcastSignBit)
template <typename T>
HWY_API Vec1<T> ShiftLeftSame(const Vec1<T> v, int bits) {
return Vec1<T>(static_cast<hwy::MakeUnsigned<T>>(v.raw) << bits);
return Vec1<T>(
static_cast<T>(static_cast<hwy::MakeUnsigned<T>>(v.raw) << bits));
}
template <typename T>
@ -384,7 +419,7 @@ HWY_API Vec1<T> ShiftRightSame(const Vec1<T> v, int bits) {
#if __cplusplus >= 202002L
// Signed right shift is now guaranteed to be arithmetic (rounding toward
// negative infinity, i.e. shifting in the sign bit).
return Vec1<T>(v.raw >> bits);
return Vec1<T>(static_cast<T>(v.raw >> bits));
#else
if (IsSigned<T>()) {
// Emulate arithmetic shift using only logical (unsigned) shifts, because
@ -393,10 +428,12 @@ HWY_API Vec1<T> ShiftRightSame(const Vec1<T> v, int bits) {
const Sisd<TU> du;
const TU shifted = BitCast(du, v).raw >> bits;
const TU sign = BitCast(du, BroadcastSignBit(v)).raw;
const TU upper = sign << (sizeof(TU) * 8 - 1 - bits);
const size_t sign_shift =
static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - bits);
const TU upper = static_cast<TU>(sign << sign_shift);
return BitCast(Sisd<T>(), Vec1<TU>(shifted | upper));
} else {
return Vec1<T>(v.raw >> bits); // unsigned, logical shift
} else { // T is unsigned
return Vec1<T>(static_cast<T>(v.raw >> bits));
}
#endif
}
@ -601,6 +638,10 @@ HWY_API Vec1<uint16_t> MulHigh(const Vec1<uint16_t> a, const Vec1<uint16_t> b) {
(static_cast<uint32_t>(a.raw) * static_cast<uint32_t>(b.raw)) >> 16));
}
HWY_API Vec1<int16_t> MulFixedPoint15(Vec1<int16_t> a, Vec1<int16_t> b) {
return Vec1<int16_t>(static_cast<int16_t>((2 * a.raw * b.raw + 32768) >> 16));
}
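A worked example (not from this patch) of the Q15 product above: 0.5 * 0.5 with round-to-nearest supplied by the +32768 term:
#include <cstdint>
#include <cstdio>

int main() {
  const int16_t a = 16384;  // 0.5 in Q15
  const int16_t b = 16384;  // 0.5 in Q15
  const int16_t r = static_cast<int16_t>((2 * a * b + 32768) >> 16);
  std::printf("%d (%.4f)\n", r, r / 32768.0);  // 8192 (0.2500)
  return 0;
}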
// Multiplies even lanes (0, 2 ..) and returns the double-wide result.
HWY_API Vec1<int64_t> MulEven(const Vec1<int32_t> a, const Vec1<int32_t> b) {
const int64_t a64 = a.raw;
@ -684,7 +725,7 @@ HWY_API Vec1<T> Round(const Vec1<T> v) {
const TI rounded = static_cast<TI>(v.raw + bias);
if (rounded == 0) return CopySignToAbs(Vec1<T>(0), v);
// Round to even
if ((rounded & 1) && std::abs(rounded - v.raw) == T(0.5)) {
if ((rounded & 1) && std::abs(static_cast<T>(rounded) - v.raw) == T(0.5)) {
return Vec1<T>(static_cast<T>(rounded - (v.raw < T(0) ? -1 : 1)));
}
return Vec1<T>(static_cast<T>(rounded));
@ -842,6 +883,45 @@ HWY_API Mask1<T> operator>=(const Vec1<T> a, const Vec1<T> b) {
return Mask1<T>::FromBool(a.raw >= b.raw);
}
// ------------------------------ Floating-point classification (==)
template <typename T>
HWY_API Mask1<T> IsNaN(const Vec1<T> v) {
// std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY.
MakeUnsigned<T> bits;
memcpy(&bits, &v, sizeof(v));
bits += bits;
bits >>= 1; // clear sign bit
// NaN if all exponent bits are set and the mantissa is not zero.
return Mask1<T>::FromBool(bits > ExponentMask<T>());
}
HWY_API Mask1<float> IsInf(const Vec1<float> v) {
const Sisd<float> d;
const RebindToUnsigned<decltype(d)> du;
const Vec1<uint32_t> vu = BitCast(du, v);
// 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
return RebindMask(d, (vu + vu) == Set(du, 0xFF000000u));
}
HWY_API Mask1<double> IsInf(const Vec1<double> v) {
const Sisd<double> d;
const RebindToUnsigned<decltype(d)> du;
const Vec1<uint64_t> vu = BitCast(du, v);
// 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
return RebindMask(d, (vu + vu) == Set(du, 0xFFE0000000000000ull));
}
HWY_API Mask1<float> IsFinite(const Vec1<float> v) {
const Vec1<uint32_t> vu = BitCast(Sisd<uint32_t>(), v);
// Shift left to clear the sign bit, check whether exponent != max value.
return Mask1<float>::FromBool((vu.raw << 1) < 0xFF000000u);
}
HWY_API Mask1<double> IsFinite(const Vec1<double> v) {
const Vec1<uint64_t> vu = BitCast(Sisd<uint64_t>(), v);
// Shift left to clear the sign bit, check whether exponent != max value.
return Mask1<double>::FromBool((vu.raw << 1) < 0xFFE0000000000000ull);
}
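These classifications share one trick: adding the bit pattern to itself discards the sign, after which infinity compares equal to the shifted exponent mask and finite values compare below it. A scalar sketch for float, with illustrative inputs:
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <limits>

int main() {
  const float values[3] = {-std::numeric_limits<float>::infinity(), 1.5f,
                           std::numeric_limits<float>::quiet_NaN()};
  for (const float f : values) {
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    const uint32_t doubled = bits + bits;  // 'shift left' clears the sign bit
    std::printf("inf=%d finite=%d\n", doubled == 0xFF000000u,
                doubled < 0xFF000000u);
  }
  return 0;  // prints: inf=1 finite=0, inf=0 finite=1, inf=0 finite=0
}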
// ================================================== MEMORY
// ------------------------------ Load
@ -883,20 +963,69 @@ HWY_API void StoreU(const Vec1<T> v, Sisd<T> d, T* HWY_RESTRICT p) {
return Store(v, d, p);
}
// ------------------------------ StoreInterleaved3
template <typename T>
HWY_API void BlendedStore(const Vec1<T> v, Mask1<T> m, Sisd<T> d,
T* HWY_RESTRICT p) {
if (!m.bits) return;
StoreU(v, d, p);
}
HWY_API void StoreInterleaved3(const Vec1<uint8_t> v0, const Vec1<uint8_t> v1,
const Vec1<uint8_t> v2, Sisd<uint8_t> d,
uint8_t* HWY_RESTRICT unaligned) {
// ------------------------------ LoadInterleaved2/3/4
// Per-target flag to prevent generic_ops-inl.h from defining StoreInterleaved2.
#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#else
#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
#endif
template <typename T>
HWY_API void LoadInterleaved2(Sisd<T> d, const T* HWY_RESTRICT unaligned,
Vec1<T>& v0, Vec1<T>& v1) {
v0 = LoadU(d, unaligned + 0);
v1 = LoadU(d, unaligned + 1);
}
template <typename T>
HWY_API void LoadInterleaved3(Sisd<T> d, const T* HWY_RESTRICT unaligned,
Vec1<T>& v0, Vec1<T>& v1, Vec1<T>& v2) {
v0 = LoadU(d, unaligned + 0);
v1 = LoadU(d, unaligned + 1);
v2 = LoadU(d, unaligned + 2);
}
template <typename T>
HWY_API void LoadInterleaved4(Sisd<T> d, const T* HWY_RESTRICT unaligned,
Vec1<T>& v0, Vec1<T>& v1, Vec1<T>& v2,
Vec1<T>& v3) {
v0 = LoadU(d, unaligned + 0);
v1 = LoadU(d, unaligned + 1);
v2 = LoadU(d, unaligned + 2);
v3 = LoadU(d, unaligned + 3);
}
// ------------------------------ StoreInterleaved2/3/4
template <typename T>
HWY_API void StoreInterleaved2(const Vec1<T> v0, const Vec1<T> v1, Sisd<T> d,
T* HWY_RESTRICT unaligned) {
StoreU(v0, d, unaligned + 0);
StoreU(v1, d, unaligned + 1);
}
template <typename T>
HWY_API void StoreInterleaved3(const Vec1<T> v0, const Vec1<T> v1,
const Vec1<T> v2, Sisd<T> d,
T* HWY_RESTRICT unaligned) {
StoreU(v0, d, unaligned + 0);
StoreU(v1, d, unaligned + 1);
StoreU(v2, d, unaligned + 2);
}
HWY_API void StoreInterleaved4(const Vec1<uint8_t> v0, const Vec1<uint8_t> v1,
const Vec1<uint8_t> v2, const Vec1<uint8_t> v3,
Sisd<uint8_t> d,
uint8_t* HWY_RESTRICT unaligned) {
template <typename T>
HWY_API void StoreInterleaved4(const Vec1<T> v0, const Vec1<T> v1,
const Vec1<T> v2, const Vec1<T> v3, Sisd<T> d,
T* HWY_RESTRICT unaligned) {
StoreU(v0, d, unaligned + 0);
StoreU(v1, d, unaligned + 1);
StoreU(v2, d, unaligned + 2);
@ -933,7 +1062,8 @@ template <typename T, typename Offset>
HWY_API Vec1<T> GatherOffset(Sisd<T> d, const T* base,
const Vec1<Offset> offset) {
static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
const uintptr_t addr = reinterpret_cast<uintptr_t>(base) + offset.raw;
const intptr_t addr =
reinterpret_cast<intptr_t>(base) + static_cast<intptr_t>(offset.raw);
return Load(d, reinterpret_cast<const T*>(addr));
}
@ -988,12 +1118,8 @@ HWY_API Vec1<ToT> DemoteTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
}
HWY_API Vec1<float> PromoteTo(Sisd<float> /* tag */, const Vec1<float16_t> v) {
#if HWY_NATIVE_FLOAT16
uint16_t bits16;
CopyBytes<2>(&v.raw, &bits16);
#else
const uint16_t bits16 = v.raw.bits;
#endif
const uint32_t sign = static_cast<uint32_t>(bits16 >> 15);
const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
const uint32_t mantissa = bits16 & 0x3FF;
@ -1031,12 +1157,8 @@ HWY_API Vec1<float16_t> DemoteTo(Sisd<float16_t> /* tag */,
// Tiny or zero => zero.
Vec1<float16_t> out;
if (exp < -24) {
#if HWY_NATIVE_FLOAT16
const uint16_t zero = 0;
CopyBytes<2>(&zero, &out.raw);
#else
out.raw.bits = 0;
#endif
return out;
}
@ -1059,12 +1181,8 @@ HWY_API Vec1<float16_t> DemoteTo(Sisd<float16_t> /* tag */,
HWY_DASSERT(mantissa16 < 1024);
const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
HWY_DASSERT(bits16 < 0x10000);
#if HWY_NATIVE_FLOAT16
const uint16_t narrowed = static_cast<uint16_t>(bits16); // big-endian safe
CopyBytes<2>(&narrowed, &out.raw);
#else
out.raw.bits = static_cast<uint16_t>(bits16);
#endif
return out;
}
@ -1097,6 +1215,38 @@ HWY_API Vec1<uint8_t> U8FromU32(const Vec1<uint32_t> v) {
return DemoteTo(Sisd<uint8_t>(), v);
}
// ------------------------------ Truncations
HWY_API Vec1<uint8_t> TruncateTo(Sisd<uint8_t> /* tag */,
const Vec1<uint64_t> v) {
return Vec1<uint8_t>{static_cast<uint8_t>(v.raw & 0xFF)};
}
HWY_API Vec1<uint16_t> TruncateTo(Sisd<uint16_t> /* tag */,
const Vec1<uint64_t> v) {
return Vec1<uint16_t>{static_cast<uint16_t>(v.raw & 0xFFFF)};
}
HWY_API Vec1<uint32_t> TruncateTo(Sisd<uint32_t> /* tag */,
const Vec1<uint64_t> v) {
return Vec1<uint32_t>{static_cast<uint32_t>(v.raw & 0xFFFFFFFFu)};
}
HWY_API Vec1<uint8_t> TruncateTo(Sisd<uint8_t> /* tag */,
const Vec1<uint32_t> v) {
return Vec1<uint8_t>{static_cast<uint8_t>(v.raw & 0xFF)};
}
HWY_API Vec1<uint16_t> TruncateTo(Sisd<uint16_t> /* tag */,
const Vec1<uint32_t> v) {
return Vec1<uint16_t>{static_cast<uint16_t>(v.raw & 0xFFFF)};
}
HWY_API Vec1<uint8_t> TruncateTo(Sisd<uint8_t> /* tag */,
const Vec1<uint16_t> v) {
return Vec1<uint8_t>{static_cast<uint8_t>(v.raw & 0xFF)};
}
// ================================================== COMBINE
// UpperHalf, ZeroExtendVector, Combine, Concat* are unsupported.
@ -1117,6 +1267,21 @@ HWY_API T GetLane(const Vec1<T> v) {
return v.raw;
}
template <typename T>
HWY_API T ExtractLane(const Vec1<T> v, size_t i) {
HWY_DASSERT(i == 0);
(void)i;
return v.raw;
}
template <typename T>
HWY_API Vec1<T> InsertLane(Vec1<T> v, size_t i, T t) {
HWY_DASSERT(i == 0);
(void)i;
v.raw = t;
return v;
}
template <typename T>
HWY_API Vec1<T> DupEven(Vec1<T> v) {
return v;
@ -1157,7 +1322,7 @@ HWY_API Indices1<T> IndicesFromVec(Sisd<T>, Vec1<TI> vec) {
template <typename T, typename TI>
HWY_API Indices1<T> SetTableIndices(Sisd<T> d, const TI* idx) {
return IndicesFromVec(d, LoadU(idx));
return IndicesFromVec(d, LoadU(Sisd<TI>(), idx));
}
template <typename T>
@ -1180,6 +1345,7 @@ HWY_API Vec1<T> Reverse(Sisd<T> /* tag */, const Vec1<T> v) {
return v;
}
// Must not be called:
template <typename T>
HWY_API Vec1<T> Reverse2(Sisd<T> /* tag */, const Vec1<T> v) {
return v;
@ -1304,19 +1470,24 @@ HWY_API intptr_t FindFirstTrue(Sisd<T> /* tag */, const Mask1<T> mask) {
// ------------------------------ Compress, CompressBits
template <typename T>
struct CompressIsPartition {
enum { value = 1 };
};
template <typename T>
HWY_API Vec1<T> Compress(Vec1<T> v, const Mask1<T> /* mask */) {
// Upper lanes are undefined, so result is the same independent of mask.
// A single lane is already partitioned by definition.
return v;
}
template <typename T>
HWY_API Vec1<T> Compress(Vec1<T> v, const uint8_t* HWY_RESTRICT /* bits */) {
HWY_API Vec1<T> CompressNot(Vec1<T> v, const Mask1<T> /* mask */) {
// A single lane is already partitioned by definition.
return v;
}
// ------------------------------ CompressStore
template <typename T>
HWY_API size_t CompressStore(Vec1<T> v, const Mask1<T> mask, Sisd<T> d,
T* HWY_RESTRICT unaligned) {
@ -1325,7 +1496,6 @@ HWY_API size_t CompressStore(Vec1<T> v, const Mask1<T> mask, Sisd<T> d,
}
// ------------------------------ CompressBlendedStore
template <typename T>
HWY_API size_t CompressBlendedStore(Vec1<T> v, const Mask1<T> mask, Sisd<T> d,
T* HWY_RESTRICT unaligned) {
@ -1334,8 +1504,13 @@ HWY_API size_t CompressBlendedStore(Vec1<T> v, const Mask1<T> mask, Sisd<T> d,
return 1;
}
// ------------------------------ CompressBitsStore
// ------------------------------ CompressBits
template <typename T>
HWY_API Vec1<T> CompressBits(Vec1<T> v, const uint8_t* HWY_RESTRICT /*bits*/) {
return v;
}
// ------------------------------ CompressBitsStore
template <typename T>
HWY_API size_t CompressBitsStore(Vec1<T> v, const uint8_t* HWY_RESTRICT bits,
Sisd<T> d, T* HWY_RESTRICT unaligned) {

120
third_party/highway/hwy/ops/set_macros-inl.h (vendored)

@ -1,4 +1,5 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -36,6 +37,8 @@
#undef HWY_HAVE_INTEGER64
#undef HWY_HAVE_FLOAT16
#undef HWY_HAVE_FLOAT64
#undef HWY_MEM_OPS_MIGHT_FAULT
#undef HWY_NATIVE_FMA
#undef HWY_CAP_GE256
#undef HWY_CAP_GE512
@ -71,6 +74,7 @@
// Before include guard so we redefine HWY_TARGET_STR on each include,
// governed by the current HWY_TARGET.
//-----------------------------------------------------------------------------
// SSSE3
#if HWY_TARGET == HWY_SSSE3
@ -84,11 +88,13 @@
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 1
#define HWY_CAP_AES 0
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
#define HWY_TARGET_STR HWY_TARGET_STR_SSSE3
//-----------------------------------------------------------------------------
// SSE4
#elif HWY_TARGET == HWY_SSE4
@ -102,6 +108,8 @@
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
@ -120,6 +128,14 @@
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 1
#ifdef HWY_DISABLE_BMI2_FMA
#define HWY_NATIVE_FMA 0
#else
#define HWY_NATIVE_FMA 1
#endif
#define HWY_CAP_GE256 1
#define HWY_CAP_GE512 0
@ -137,6 +153,8 @@
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 0
#define HWY_NATIVE_FMA 1
#define HWY_CAP_GE256 1
#define HWY_CAP_GE512 1
@ -148,9 +166,10 @@
#elif HWY_TARGET == HWY_AVX3_DL
#define HWY_NAMESPACE N_AVX3_DL
#define HWY_TARGET_STR \
HWY_TARGET_STR_AVX3 \
",vpclmulqdq,avx512vbmi2,vaes,avxvnni,avx512bitalg,avx512vpopcntdq"
#define HWY_TARGET_STR \
HWY_TARGET_STR_AVX3 \
",vpclmulqdq,avx512vbmi,avx512vbmi2,vaes,avxvnni,avx512bitalg," \
"avx512vpopcntdq"
#else
#error "Logic error"
@ -168,6 +187,8 @@
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 0
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
@ -186,8 +207,6 @@
#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
#if HWY_ARCH_ARM_A64
#define HWY_HAVE_FLOAT64 1
@ -195,19 +214,38 @@
#define HWY_HAVE_FLOAT64 0
#endif
#define HWY_MEM_OPS_MIGHT_FAULT 1
#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
#define HWY_NATIVE_FMA 1
#else
#define HWY_NATIVE_FMA 0
#endif
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
#define HWY_NAMESPACE N_NEON
// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
// Can use pragmas instead of -march compiler flag
#if HWY_HAVE_RUNTIME_DISPATCH
#if HWY_ARCH_ARM_V7
#define HWY_TARGET_STR "+neon-vfpv4"
#else
#define HWY_TARGET_STR "+crypto"
#endif // HWY_ARCH_ARM_V7
#else
// HWY_TARGET_STR remains undefined
#endif
//-----------------------------------------------------------------------------
// SVE[2]
#elif HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE
#elif HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE || \
HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
// SVE only requires lane alignment, not natural alignment of the entire vector.
#define HWY_ALIGN alignas(8)
#define HWY_MAX_BYTES 256
// Value ensures MaxLanes() is the tightest possible upper bound to reduce
// overallocation.
#define HWY_LANES(T) ((HWY_MAX_BYTES) / sizeof(T))
@ -216,16 +254,35 @@
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 0
#define HWY_NATIVE_FMA 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
#if HWY_TARGET == HWY_SVE2
#define HWY_NAMESPACE N_SVE2
#define HWY_MAX_BYTES 256
#elif HWY_TARGET == HWY_SVE_256
#define HWY_NAMESPACE N_SVE_256
#define HWY_MAX_BYTES 32
#elif HWY_TARGET == HWY_SVE2_128
#define HWY_NAMESPACE N_SVE2_128
#define HWY_MAX_BYTES 16
#else
#define HWY_NAMESPACE N_SVE
#define HWY_MAX_BYTES 256
#endif
// Can use pragmas instead of -march compiler flag
#if HWY_HAVE_RUNTIME_DISPATCH
#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
#define HWY_TARGET_STR "+sve2-aes"
#else
#define HWY_TARGET_STR "+sve"
#endif
#else
// HWY_TARGET_STR remains undefined
#endif
//-----------------------------------------------------------------------------
// WASM
@ -239,6 +296,8 @@
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 0
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
@ -247,8 +306,8 @@
#define HWY_TARGET_STR "simd128"
//-----------------------------------------------------------------------------
// WASM2
#elif HWY_TARGET == HWY_WASM2
// WASM_EMU256
#elif HWY_TARGET == HWY_WASM_EMU256
#define HWY_ALIGN alignas(32)
#define HWY_MAX_BYTES 32
@ -258,10 +317,12 @@
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 0
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
#define HWY_NAMESPACE N_WASM2
#define HWY_NAMESPACE N_WASM_EMU256
#define HWY_TARGET_STR "simd128"
@ -283,10 +344,12 @@
#define HWY_HAVE_SCALABLE 1
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 0
#define HWY_NATIVE_FMA 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
#if defined(__riscv_zfh)
#if defined(__riscv_zvfh)
#define HWY_HAVE_FLOAT16 1
#else
#define HWY_HAVE_FLOAT16 0
@ -297,6 +360,27 @@
// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
// (rv64gcv is not a valid target)
//-----------------------------------------------------------------------------
// EMU128
#elif HWY_TARGET == HWY_EMU128
#define HWY_ALIGN alignas(16)
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))
#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
#define HWY_NAMESPACE N_EMU128
// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
//-----------------------------------------------------------------------------
// SCALAR
#elif HWY_TARGET == HWY_SCALAR
@ -309,6 +393,8 @@
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 0
#define HWY_NATIVE_FMA 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
@ -320,6 +406,12 @@
#pragma message("HWY_TARGET does not match any known target")
#endif // HWY_TARGET
// Override this to 1 in asan/msan builds, which will still fault.
#if HWY_IS_ASAN || HWY_IS_MSAN
#undef HWY_MEM_OPS_MIGHT_FAULT
#define HWY_MEM_OPS_MIGHT_FAULT 1
#endif
// Clang <9 requires this be invoked at file scope, before any namespace.
#undef HWY_BEFORE_NAMESPACE
#if defined(HWY_TARGET_STR)
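The hunks above add per-target capability macros, notably HWY_MEM_OPS_MIGHT_FAULT and HWY_NATIVE_FMA. A minimal caller-side sketch (not part of the patch) that branches on HWY_NATIVE_FMA; only the Highway ops and macros are real, while the function itself and the assumption that size is a multiple of Lanes(d) are illustrative:

#include <stddef.h>

#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace example {
namespace HWY_NAMESPACE {
namespace hn = hwy::HWY_NAMESPACE;

// out[i] += a[i] * b[i]; size is assumed to be a multiple of Lanes(d).
void MulAccumulate(const float* HWY_RESTRICT a, const float* HWY_RESTRICT b,
                   size_t size, float* HWY_RESTRICT out) {
  const hn::ScalableTag<float> d;
  for (size_t i = 0; i < size; i += hn::Lanes(d)) {
    auto acc = hn::Load(d, out + i);
#if HWY_NATIVE_FMA
    // Single fused multiply-add instruction on this target.
    acc = hn::MulAdd(hn::Load(d, a + i), hn::Load(d, b + i), acc);
#else
    // Separate multiply and add; avoids relying on an emulated FMA.
    acc = hn::Add(acc, hn::Mul(hn::Load(d, a + i), hn::Load(d, b + i)));
#endif
    hn::Store(acc, d, out + i);
  }
}

}  // namespace HWY_NAMESPACE
}  // namespace example
HWY_AFTER_NAMESPACE();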

43
third_party/highway/hwy/ops/shared-inl.h (vendored)

@ -1,4 +1,5 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -98,21 +99,21 @@ struct Simd {
namespace detail {
#if HWY_HAVE_SCALABLE
template <typename T, size_t N, int kPow2>
constexpr bool IsFull(Simd<T, N, kPow2> /* d */) {
return N == HWY_LANES(T) && kPow2 == 0;
}
#endif
// Returns the number of lanes (possibly zero) after applying a shift:
// - 0: no change;
// - [1,3]: a group of 2,4,8 [fractional] vectors;
// - [-3,-1]: a fraction of a vector from 1/8 to 1/2.
constexpr size_t ScaleByPower(size_t N, int pow2) {
#if HWY_TARGET == HWY_RVV
return pow2 >= 0 ? (N << pow2) : (N >> (-pow2));
#else
return pow2 >= 0 ? N : (N >> (-pow2));
#endif
}
// Struct wrappers enable validation of arguments via static_assert.
@ -136,17 +137,16 @@ struct ScalableTagChecker {
template <typename T, size_t kLimit>
struct CappedTagChecker {
static_assert(kLimit != 0, "Does not make sense to have zero lanes");
using type = Simd<T, HWY_MIN(kLimit, HWY_MAX_BYTES / sizeof(T)), 0>;
// Safely handle non-power-of-two inputs by rounding down, which is allowed by
// CappedTag. Otherwise, Simd<T, 3, 0> would static_assert.
static constexpr size_t kLimitPow2 = size_t{1} << hwy::FloorLog2(kLimit);
using type = Simd<T, HWY_MIN(kLimitPow2, HWY_LANES(T)), 0>;
};
template <typename T, size_t kNumLanes>
struct FixedTagChecker {
static_assert(kNumLanes != 0, "Does not make sense to have zero lanes");
static_assert(kNumLanes * sizeof(T) <= HWY_MAX_BYTES, "Too many lanes");
#if HWY_TARGET == HWY_SCALAR
// HWY_MAX_BYTES would still allow uint8x8, which is not supported.
static_assert(kNumLanes == 1, "Scalar only supports one lane");
#endif
static_assert(kNumLanes <= HWY_LANES(T), "Too many lanes");
using type = Simd<T, kNumLanes, 0>;
};
@ -172,8 +172,8 @@ template <typename T, size_t kLimit>
using CappedTag = typename detail::CappedTagChecker<T, kLimit>::type;
// Alias for a tag describing a vector with *exactly* kNumLanes active lanes,
// even on targets with scalable vectors. HWY_SCALAR only supports one lane.
// All other targets allow kNumLanes up to HWY_MAX_BYTES / sizeof(T).
// even on targets with scalable vectors. Requires `kNumLanes` to be a power of
// two not exceeding `HWY_LANES(T)`.
//
// NOTE: if the application does not need to support HWY_SCALAR (+), use this
// instead of CappedTag to emphasize that there will be exactly kNumLanes lanes.
@ -218,6 +218,15 @@ using Half = typename D::Half;
template <class D>
using Twice = typename D::Twice;
template <typename T>
using Full32 = Simd<T, 4 / sizeof(T), 0>;
template <typename T>
using Full64 = Simd<T, 8 / sizeof(T), 0>;
template <typename T>
using Full128 = Simd<T, 16 / sizeof(T), 0>;
// Same as base.h macros but with a Simd<T, N, kPow2> argument instead of T.
#define HWY_IF_UNSIGNED_D(D) HWY_IF_UNSIGNED(TFromD<D>)
#define HWY_IF_SIGNED_D(D) HWY_IF_SIGNED(TFromD<D>)
@ -232,15 +241,12 @@ using Twice = typename D::Twice;
#define HWY_IF_GE128_D(D) \
hwy::EnableIf<D::kPrivateN * sizeof(TFromD<D>) >= 16>* = nullptr
// Same, but with a vector argument.
// Same, but with a vector argument. ops/*-inl.h define their own TFromV.
#define HWY_IF_UNSIGNED_V(V) HWY_IF_UNSIGNED(TFromV<V>)
#define HWY_IF_SIGNED_V(V) HWY_IF_SIGNED(TFromV<V>)
#define HWY_IF_FLOAT_V(V) HWY_IF_FLOAT(TFromV<V>)
#define HWY_IF_LANE_SIZE_V(V, bytes) HWY_IF_LANE_SIZE(TFromV<V>, bytes)
// For implementing functions for a specific type.
// IsSame<...>() in template arguments is broken on MSVC2015.
#define HWY_IF_LANES_ARE(T, V) EnableIf<IsSameT<T, TFromV<V>>::value>* = nullptr
#define HWY_IF_NOT_LANE_SIZE_V(V, bytes) HWY_IF_NOT_LANE_SIZE(TFromV<V>, bytes)
template <class D>
HWY_INLINE HWY_MAYBE_UNUSED constexpr int Pow2(D /* d */) {
@ -291,8 +297,7 @@ HWY_INLINE HWY_MAYBE_UNUSED size_t Lanes(Simd<T, N, kPow2>) {
// We therefore pass by const& only on GCC and (Windows or ARM64). This alias
// must be used for all vector/mask parameters of functions marked HWY_NOINLINE,
// and possibly also other functions that are not inlined.
#if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG && \
((defined(_WIN32) || defined(_WIN64)) || HWY_ARCH_ARM_A64)
#if HWY_COMPILER_GCC_ACTUAL && (HWY_OS_WIN || HWY_ARCH_ARM_A64)
template <class V>
using VecArg = const V&;
#else

1546
third_party/highway/hwy/ops/wasm_128-inl.h (vendored)

Diff not shown because of its size.

210
third_party/highway/hwy/ops/wasm_256-inl.h (vendored)

@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -590,6 +591,10 @@ HWY_API Vec256<int16_t> MulHigh(const Vec256<int16_t> a,
return Vec256<int16_t>{wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
}
HWY_API Vec256<int16_t> MulFixedPoint15(Vec256<int16_t>, Vec256<int16_t>) {
HWY_ASSERT(0);
}
// Multiplies even lanes (0, 2 ..) and returns the double-width result.
HWY_API Vec256<int64_t> MulEven(const Vec256<int32_t> a,
const Vec256<int32_t> b) {
@ -717,6 +722,37 @@ HWY_API Vec256<float> Floor(const Vec256<float> v) {
return Vec256<float>{wasm_f32x4_floor(v.raw)};
}
// ------------------------------ Floating-point classification
template <typename T>
HWY_API Mask256<T> IsNaN(const Vec256<T> v) {
return v != v;
}
template <typename T, HWY_IF_FLOAT(T)>
HWY_API Mask256<T> IsInf(const Vec256<T> v) {
const Full256<T> d;
const RebindToSigned<decltype(d)> di;
const VFromD<decltype(di)> vi = BitCast(di, v);
// 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
}
// Returns whether normal/subnormal/zero.
template <typename T, HWY_IF_FLOAT(T)>
HWY_API Mask256<T> IsFinite(const Vec256<T> v) {
const Full256<T> d;
const RebindToUnsigned<decltype(d)> du;
const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison
const VFromD<decltype(du)> vu = BitCast(du, v);
// 'Shift left' to clear the sign bit, then right so we can compare with the
// max exponent (cannot compare with MaxExponentTimes2 directly because it is
// negative and non-negative floats would be greater).
const VFromD<decltype(di)> exp =
BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
}
// ================================================== COMPARE
// Comparisons fill a lane with 1-bits if the condition is true, else 0.
@ -910,6 +946,13 @@ HWY_API Vec256<T> Xor(Vec256<T> a, Vec256<T> b) {
return Vec256<T>{wasm_v128_xor(a.raw, b.raw)};
}
// ------------------------------ Or3
template <typename T>
HWY_API Vec256<T> Or3(Vec256<T> o1, Vec256<T> o2, Vec256<T> o3) {
return Or(o1, Or(o2, o3));
}
// ------------------------------ OrAnd
template <typename T>
@ -1201,6 +1244,12 @@ HWY_API void StoreU(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT p) {
Store(v, d, p);
}
template <typename T>
HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
T* HWY_RESTRICT p) {
StoreU(IfThenElse(m, v, LoadU(d, p)), d, p);
}
// ------------------------------ Non-temporal stores
// Same as aligned stores on non-x86.
@ -1281,8 +1330,19 @@ HWY_API Vec256<T> GatherIndex(const Full256<T> d, const T* HWY_RESTRICT base,
// ================================================== SWIZZLE
// ------------------------------ Extract lane
// ------------------------------ ExtractLane
template <typename T, size_t N>
HWY_API T ExtractLane(const Vec128<T, N> v, size_t i) {
HWY_ASSERT(0);
}
// ------------------------------ InsertLane
template <typename T, size_t N>
HWY_API Vec128<T, N> InsertLane(const Vec128<T, N> v, size_t i, T t) {
HWY_ASSERT(0);
}
// ------------------------------ GetLane
// Gets the single value stored in a vector/part.
HWY_API uint8_t GetLane(const Vec256<uint8_t> v) {
return wasm_i8x16_extract_lane(v.raw, 0);
@ -2244,6 +2304,50 @@ HWY_API Vec256<uint8_t> U8FromU32(const Vec256<uint32_t> v) {
return Vec256<uint8_t>{wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
}
// ------------------------------ Truncations
HWY_API Vec256<uint8_t, 4> TruncateTo(Simd<uint8_t, 4, 0> /* tag */,
const Vec256<uint64_t> v) {
return Vec256<uint8_t, 4>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 8, 16, 24,
0, 8, 16, 24, 0, 8, 16, 24, 0, 8,
16, 24)};
}
HWY_API Vec256<uint16_t, 4> TruncateTo(Simd<uint16_t, 4, 0> /* tag */,
const Vec256<uint64_t> v) {
return Vec256<uint16_t, 4>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 1, 8, 9,
16, 17, 24, 25, 0, 1, 8, 9, 16,
17, 24, 25)};
}
HWY_API Vec256<uint32_t, 4> TruncateTo(Simd<uint32_t, 4, 0> /* tag */,
const Vec256<uint64_t> v) {
return Vec256<uint32_t, 4>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 1, 2, 3,
8, 9, 10, 11, 16, 17, 18, 19,
24, 25, 26, 27)};
}
HWY_API Vec256<uint8_t, 8> TruncateTo(Simd<uint8_t, 8, 0> /* tag */,
const Vec256<uint32_t> v) {
return Vec256<uint8_t, 8>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 4, 8, 12,
16, 20, 24, 28, 0, 4, 8, 12, 16,
20, 24, 28)};
}
HWY_API Vec256<uint16_t, 8> TruncateTo(Simd<uint16_t, 8, 0> /* tag */,
const Vec256<uint32_t> v) {
return Vec256<uint16_t, 8>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 1, 4, 5,
8, 9, 12, 13, 16, 17, 20, 21,
24, 25, 28, 29)};
}
HWY_API Vec256<uint8_t, 16> TruncateTo(Simd<uint8_t, 16, 0> /* tag */,
const Vec256<uint16_t> v) {
return Vec256<uint8_t, 16>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 2, 4, 6,
8, 10, 12, 14, 16, 18, 20, 22,
24, 26, 28, 30)};
}
// ------------------------------ Convert i32 <=> f32 (Round)
HWY_API Vec256<float> ConvertTo(Full256<float> /* tag */,
@ -2687,12 +2791,29 @@ HWY_INLINE Vec256<uint64_t> Compress(hwy::SizeTag<8> /*tag*/,
} // namespace detail
template <typename T>
struct CompressIsPartition {
enum { value = 1 };
};
template <typename T>
HWY_API Vec256<T> Compress(Vec256<T> v, const Mask256<T> mask) {
const uint64_t mask_bits = detail::BitsFromMask(mask);
return detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
}
// ------------------------------ CompressNot
template <typename T>
HWY_API Vec256<T> CompressNot(Vec256<T> v, const Mask256<T> mask) {
return Compress(v, Not(mask));
}
// ------------------------------ CompressBlocksNot
HWY_API Vec256<uint64_t> CompressBlocksNot(Vec256<uint64_t> v,
Mask256<uint64_t> mask) {
HWY_ASSERT(0);
}
// ------------------------------ CompressBits
template <typename T>
@ -2750,76 +2871,10 @@ HWY_API size_t CompressBitsStore(Vec256<T> v, const uint8_t* HWY_RESTRICT bits,
return PopCount(mask_bits);
}
// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
// TableLookupBytes)
// ------------------------------ StoreInterleaved2/3/4
HWY_API void StoreInterleaved3(const Vec256<uint8_t> a, const Vec256<uint8_t> b,
const Vec256<uint8_t> c, Full256<uint8_t> d,
uint8_t* HWY_RESTRICT unaligned) {
const auto k5 = Set(d, 5);
const auto k6 = Set(d, 6);
// Shuffle (a,b,c) vector bytes to (MSB on left): r5, bgr[4:0].
// 0x80 so lanes to be filled from other vectors are 0 for blending.
alignas(32) static constexpr uint8_t tbl_r0[16] = {
0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, //
3, 0x80, 0x80, 4, 0x80, 0x80, 5};
alignas(32) static constexpr uint8_t tbl_g0[16] = {
0x80, 0, 0x80, 0x80, 1, 0x80, //
0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
const auto shuf_r0 = Load(d, tbl_r0);
const auto shuf_g0 = Load(d, tbl_g0); // cannot reuse r0 due to 5 in MSB
const auto shuf_b0 = CombineShiftRightBytes<15>(d, shuf_g0, shuf_g0);
const auto r0 = TableLookupBytes(a, shuf_r0); // 5..4..3..2..1..0
const auto g0 = TableLookupBytes(b, shuf_g0); // ..4..3..2..1..0.
const auto b0 = TableLookupBytes(c, shuf_b0); // .4..3..2..1..0..
const auto int0 = r0 | g0 | b0;
StoreU(int0, d, unaligned + 0 * 16);
// Second vector: g10,r10, bgr[9:6], b5,g5
const auto shuf_r1 = shuf_b0 + k6; // .A..9..8..7..6..
const auto shuf_g1 = shuf_r0 + k5; // A..9..8..7..6..5
const auto shuf_b1 = shuf_g0 + k5; // ..9..8..7..6..5.
const auto r1 = TableLookupBytes(a, shuf_r1);
const auto g1 = TableLookupBytes(b, shuf_g1);
const auto b1 = TableLookupBytes(c, shuf_b1);
const auto int1 = r1 | g1 | b1;
StoreU(int1, d, unaligned + 1 * 16);
// Third vector: bgr[15:11], b10
const auto shuf_r2 = shuf_b1 + k6; // ..F..E..D..C..B.
const auto shuf_g2 = shuf_r1 + k5; // .F..E..D..C..B..
const auto shuf_b2 = shuf_g1 + k5; // F..E..D..C..B..A
const auto r2 = TableLookupBytes(a, shuf_r2);
const auto g2 = TableLookupBytes(b, shuf_g2);
const auto b2 = TableLookupBytes(c, shuf_b2);
const auto int2 = r2 | g2 | b2;
StoreU(int2, d, unaligned + 2 * 16);
}
// ------------------------------ StoreInterleaved4
HWY_API void StoreInterleaved4(const Vec256<uint8_t> v0,
const Vec256<uint8_t> v1,
const Vec256<uint8_t> v2,
const Vec256<uint8_t> v3, Full256<uint8_t> d8,
uint8_t* HWY_RESTRICT unaligned) {
const RepartitionToWide<decltype(d8)> d16;
const RepartitionToWide<decltype(d16)> d32;
// let a,b,c,d denote v0..3.
const auto ba0 = ZipLower(d16, v0, v1); // b7 a7 .. b0 a0
const auto dc0 = ZipLower(d16, v2, v3); // d7 c7 .. d0 c0
const auto ba8 = ZipUpper(d16, v0, v1);
const auto dc8 = ZipUpper(d16, v2, v3);
const auto dcba_0 = ZipLower(d32, ba0, dc0); // d..a3 d..a0
const auto dcba_4 = ZipUpper(d32, ba0, dc0); // d..a7 d..a4
const auto dcba_8 = ZipLower(d32, ba8, dc8); // d..aB d..a8
const auto dcba_C = ZipUpper(d32, ba8, dc8); // d..aF d..aC
StoreU(BitCast(d8, dcba_0), d8, unaligned + 0 * 16);
StoreU(BitCast(d8, dcba_4), d8, unaligned + 1 * 16);
StoreU(BitCast(d8, dcba_8), d8, unaligned + 2 * 16);
StoreU(BitCast(d8, dcba_C), d8, unaligned + 3 * 16);
}
// HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in
// generic_ops-inl.h.
// ------------------------------ MulEven/Odd (Load)
@ -2952,12 +3007,27 @@ HWY_API Vec256<T> MaxOfLanes(Full256<T> /* tag */, const Vec256<T> v) {
template <typename T>
HWY_INLINE Mask256<T> Lt128(Full256<T> d, Vec256<T> a, Vec256<T> b) {}
template <typename T>
HWY_INLINE Mask256<T> Lt128Upper(Full256<T> d, Vec256<T> a, Vec256<T> b) {}
template <typename T>
HWY_INLINE Mask256<T> Eq128(Full256<T> d, Vec256<T> a, Vec256<T> b) {}
template <typename T>
HWY_INLINE Mask256<T> Eq128Upper(Full256<T> d, Vec256<T> a, Vec256<T> b) {}
template <typename T>
HWY_INLINE Vec256<T> Min128(Full256<T> d, Vec256<T> a, Vec256<T> b) {}
template <typename T>
HWY_INLINE Vec256<T> Max128(Full256<T> d, Vec256<T> a, Vec256<T> b) {}
template <typename T>
HWY_INLINE Vec256<T> Min128Upper(Full256<T> d, Vec256<T> a, Vec256<T> b) {}
template <typename T>
HWY_INLINE Vec256<T> Max128Upper(Full256<T> d, Vec256<T> a, Vec256<T> b) {}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy

2599
third_party/highway/hwy/ops/x86_128-inl.h (vendored)

Diff not shown because of its size.

1501
third_party/highway/hwy/ops/x86_256-inl.h (vendored)

Diff not shown because of its size.

894
third_party/highway/hwy/ops/x86_512-inl.h (vendored)

Diff not shown because of its size.

50
third_party/highway/hwy/per_target.cc (vendored, new file)

@ -0,0 +1,50 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/per_target.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/per_target.cc"
#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/highway.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// On SVE, Lanes rounds down to a power of two, but we want to know the actual
// size here. Otherwise, hypothetical SVE with 48 bytes would round down to 32
// and we'd enable HWY_SVE_256, and then fail reverse_test because Reverse on
// HWY_SVE_256 requires the actual vector to be a power of two.
#if HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE_256
size_t GetVectorBytes() { return detail::AllHardwareLanes(hwy::SizeTag<1>()); }
#else
size_t GetVectorBytes() { return Lanes(ScalableTag<uint8_t>()); }
#endif
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(GetVectorBytes); // Local function.
} // namespace
size_t VectorBytes() { return HWY_DYNAMIC_DISPATCH(GetVectorBytes)(); }
} // namespace hwy
#endif // HWY_ONCE

37
third_party/highway/hwy/per_target.h (vendored, new file)

@ -0,0 +1,37 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef HIGHWAY_HWY_PER_TARGET_H_
#define HIGHWAY_HWY_PER_TARGET_H_
#include <stddef.h>
// Per-target functions.
namespace hwy {
// Returns size in bytes of a vector, i.e. `Lanes(ScalableTag<uint8_t>())`.
//
// Do not cache the result, which may change after calling DisableTargets, or
// if software requests a different vector size (e.g. when entering/exiting SME
// streaming mode). Instead call this right before the code that depends on the
// result, without any DisableTargets or SME transition in-between. Note that
// this involves an indirect call, so prefer not to call this frequently nor
// unnecessarily.
size_t VectorBytes();
} // namespace hwy
#endif // HIGHWAY_HWY_PER_TARGET_H_
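A usage sketch for the new query (only hwy::VectorBytes comes from this header; the printing around it is illustrative):

#include <stdio.h>

#include "hwy/per_target.h"

int main() {
  // Query right before use; the result may change after hwy::DisableTargets.
  const size_t bytes = hwy::VectorBytes();
  printf("Vector size on this CPU: %zu bytes\n", bytes);
  return 0;
}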

56
third_party/highway/hwy/print-inl.h (vendored, new file)

@ -0,0 +1,56 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Print() function
#include <inttypes.h>
#include <stdint.h>
#include "hwy/aligned_allocator.h"
#include "hwy/highway.h"
#include "hwy/print.h"
// Per-target include guard
#if defined(HIGHWAY_HWY_PRINT_INL_H_) == \
defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_PRINT_INL_H_
#undef HIGHWAY_HWY_PRINT_INL_H_
#else
#define HIGHWAY_HWY_PRINT_INL_H_
#endif
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Prints lanes around `lane`, in memory order.
template <class D, class V = Vec<D>>
void Print(const D d, const char* caption, VecArg<V> v, size_t lane_u = 0,
size_t max_lanes = 7) {
const size_t N = Lanes(d);
using T = TFromD<D>;
auto lanes = AllocateAligned<T>(N);
Store(v, d, lanes.get());
const auto info = hwy::detail::MakeTypeInfo<T>();
hwy::detail::PrintArray(info, caption, lanes.get(), N, lane_u, max_lanes);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#endif // per-target include guard
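A sketch of calling the new Print helper from per-target code; it assumes static dispatch (hwy/highway.h included without foreach_target.h), and everything apart from the Highway ops is illustrative:

#include "hwy/highway.h"
#include "hwy/print-inl.h"

HWY_BEFORE_NAMESPACE();
namespace example {
namespace HWY_NAMESPACE {
namespace hn = hwy::HWY_NAMESPACE;

void DebugDump() {
  const hn::ScalableTag<float> d;
  const auto v = hn::Iota(d, 1.0f);  // 1, 2, 3, ...
  hn::Print(d, "iota", v);           // up to 7 lanes, starting at lane 0
  hn::Print(d, "iota", v, 2, 4);     // up to 4 lanes around lane 2
}

}  // namespace HWY_NAMESPACE
}  // namespace example
HWY_AFTER_NAMESPACE();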

107
third_party/highway/hwy/print.cc (vendored, new file)

@ -0,0 +1,107 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/print.h"
#include <inttypes.h>
#include <stddef.h>
#include <stdio.h>
#include "hwy/base.h"
namespace hwy {
namespace detail {
HWY_DLLEXPORT void TypeName(const TypeInfo& info, size_t N, char* string100) {
const char prefix = info.is_float ? 'f' : (info.is_signed ? 'i' : 'u');
// Omit the xN suffix for scalars.
if (N == 1) {
// NOLINTNEXTLINE
snprintf(string100, 64, "%c%d", prefix,
static_cast<int>(info.sizeof_t * 8));
} else {
// NOLINTNEXTLINE
snprintf(string100, 64, "%c%dx%d", prefix,
static_cast<int>(info.sizeof_t * 8), static_cast<int>(N));
}
}
HWY_DLLEXPORT void ToString(const TypeInfo& info, const void* ptr,
char* string100) {
if (info.sizeof_t == 1) {
uint8_t byte;
CopyBytes<1>(ptr, &byte); // endian-safe: we ensured sizeof(T)=1.
snprintf(string100, 100, "0x%02X", byte); // NOLINT
} else if (info.sizeof_t == 2) {
uint16_t bits;
CopyBytes<2>(ptr, &bits);
snprintf(string100, 100, "0x%04X", bits); // NOLINT
} else if (info.sizeof_t == 4) {
if (info.is_float) {
float value;
CopyBytes<4>(ptr, &value);
snprintf(string100, 100, "%g", static_cast<double>(value)); // NOLINT
} else if (info.is_signed) {
int32_t value;
CopyBytes<4>(ptr, &value);
snprintf(string100, 100, "%d", value); // NOLINT
} else {
uint32_t value;
CopyBytes<4>(ptr, &value);
snprintf(string100, 100, "%u", value); // NOLINT
}
} else {
HWY_ASSERT(info.sizeof_t == 8);
if (info.is_float) {
double value;
CopyBytes<8>(ptr, &value);
snprintf(string100, 100, "%g", value); // NOLINT
} else if (info.is_signed) {
int64_t value;
CopyBytes<8>(ptr, &value);
snprintf(string100, 100, "%" PRIi64 "", value); // NOLINT
} else {
uint64_t value;
CopyBytes<8>(ptr, &value);
snprintf(string100, 100, "%" PRIu64 "", value); // NOLINT
}
}
}
HWY_DLLEXPORT void PrintArray(const TypeInfo& info, const char* caption,
const void* array_void, size_t N, size_t lane_u,
size_t max_lanes) {
const uint8_t* array_bytes = reinterpret_cast<const uint8_t*>(array_void);
char type_name[100];
TypeName(info, N, type_name);
const intptr_t lane = intptr_t(lane_u);
const size_t begin = static_cast<size_t>(HWY_MAX(0, lane - 2));
const size_t end = HWY_MIN(begin + max_lanes, N);
fprintf(stderr, "%s %s [%" PRIu64 "+ ->]:\n ", type_name, caption,
static_cast<uint64_t>(begin));
for (size_t i = begin; i < end; ++i) {
const void* ptr = array_bytes + i * info.sizeof_t;
char str[100];
ToString(info, ptr, str);
fprintf(stderr, "%s,", str);
}
if (begin >= end) fprintf(stderr, "(out of bounds)");
fprintf(stderr, "\n");
}
} // namespace detail
} // namespace hwy

73
third_party/highway/hwy/print.h (vendored, new file)

@ -0,0 +1,73 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef HWY_PRINT_H_
#define HWY_PRINT_H_
// Helpers for printing vector lanes.
#include <stddef.h>
#include <stdio.h>
#include "hwy/base.h"
#include "hwy/highway_export.h"
namespace hwy {
namespace detail {
// For implementing value comparisons etc. as type-erased functions to reduce
// template bloat.
struct TypeInfo {
size_t sizeof_t;
bool is_float;
bool is_signed;
};
template <typename T>
HWY_INLINE TypeInfo MakeTypeInfo() {
TypeInfo info;
info.sizeof_t = sizeof(T);
info.is_float = IsFloat<T>();
info.is_signed = IsSigned<T>();
return info;
}
HWY_DLLEXPORT void TypeName(const TypeInfo& info, size_t N, char* string100);
HWY_DLLEXPORT void ToString(const TypeInfo& info, const void* ptr,
char* string100);
HWY_DLLEXPORT void PrintArray(const TypeInfo& info, const char* caption,
const void* array_void, size_t N,
size_t lane_u = 0, size_t max_lanes = 7);
} // namespace detail
template <typename T>
HWY_NOINLINE void PrintValue(T value) {
char str[100];
detail::ToString(hwy::detail::MakeTypeInfo<T>(), &value, str);
fprintf(stderr, "%s,", str);
}
template <typename T>
HWY_NOINLINE void PrintArray(const T* value, size_t count) {
detail::PrintArray(hwy::detail::MakeTypeInfo<T>(), "", value, count, 0,
count);
}
} // namespace hwy
#endif // HWY_PRINT_H_
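The PrintValue/PrintArray helpers above can also be called from plain scalar code; a small sketch (the sample values are arbitrary):

#include "hwy/print.h"

int main() {
  const float values[4] = {1.0f, 2.5f, -3.0f, 4.0f};
  hwy::PrintArray(values, 4);  // prints all four values via the type-erased path
  hwy::PrintValue(values[2]);  // prints a single value
  return 0;
}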

270
third_party/highway/hwy/targets.cc (vendored)

@ -1,4 +1,5 @@
// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -14,6 +15,7 @@
#include "hwy/targets.h"
#include <inttypes.h> // PRIx64
#include <stdarg.h>
#include <stddef.h>
#include <stdint.h>
@ -21,7 +23,7 @@
#include <atomic>
#include "hwy/base.h"
#include "hwy/per_target.h"
#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
#include "sanitizer/common_interface_defs.h" // __sanitizer_print_stack_trace
@ -36,7 +38,11 @@
#else // !HWY_COMPILER_MSVC
#include <cpuid.h>
#endif // HWY_COMPILER_MSVC
#endif // HWY_ARCH_X86
#elif HWY_ARCH_ARM && HWY_OS_LINUX
#include <asm/hwcap.h>
#include <sys/auxv.h>
#endif // HWY_ARCH_*
namespace hwy {
namespace {
@ -57,7 +63,7 @@ HWY_INLINE void Cpuid(const uint32_t level, const uint32_t count,
for (int i = 0; i < 4; ++i) {
abcd[i] = regs[i];
}
#else // HWY_COMPILER_MSVC
#else // HWY_COMPILER_MSVC
uint32_t a;
uint32_t b;
uint32_t c;
@ -75,7 +81,7 @@ HWY_INLINE void Cpuid(const uint32_t level, const uint32_t count,
uint32_t ReadXCR0() {
#if HWY_COMPILER_MSVC
return static_cast<uint32_t>(_xgetbv(0));
#else // HWY_COMPILER_MSVC
#else // HWY_COMPILER_MSVC
uint32_t xcr0, xcr0_high;
const uint32_t index = 0;
asm volatile(".byte 0x0F, 0x01, 0xD0"
@ -87,15 +93,12 @@ uint32_t ReadXCR0() {
#endif // HWY_ARCH_X86
// Not function-local => no compiler-generated locking.
std::atomic<uint32_t> supported_{0}; // Not yet initialized
// When running tests, this value can be set to the mocked supported targets
// mask. Only written to from a single thread before the test starts.
uint32_t supported_targets_for_test_ = 0;
int64_t supported_targets_for_test_ = 0;
// Mask of targets disabled at runtime with DisableTargets.
uint32_t supported_mask_{LimitsMax<uint32_t>()};
int64_t supported_mask_ = LimitsMax<int64_t>();
#if HWY_ARCH_X86
// Arbitrary bit indices indicating which instruction set extensions are
@ -126,6 +129,7 @@ enum class FeatureIndex : uint32_t {
kVNNI,
kVPCLMULQDQ,
kVBMI,
kVBMI2,
kVAES,
kPOPCNTDQ,
@ -176,77 +180,19 @@ constexpr uint64_t kGroupAVX3 =
constexpr uint64_t kGroupAVX3_DL =
Bit(FeatureIndex::kVNNI) | Bit(FeatureIndex::kVPCLMULQDQ) |
Bit(FeatureIndex::kVBMI2) | Bit(FeatureIndex::kVAES) |
Bit(FeatureIndex::kPOPCNTDQ) | Bit(FeatureIndex::kBITALG) | kGroupAVX3;
Bit(FeatureIndex::kVBMI) | Bit(FeatureIndex::kVBMI2) |
Bit(FeatureIndex::kVAES) | Bit(FeatureIndex::kPOPCNTDQ) |
Bit(FeatureIndex::kBITALG) | kGroupAVX3;
#endif // HWY_ARCH_X86
} // namespace
HWY_NORETURN void HWY_FORMAT(3, 4)
Abort(const char* file, int line, const char* format, ...) {
char buf[2000];
va_list args;
va_start(args, format);
vsnprintf(buf, sizeof(buf), format, args);
va_end(args);
fprintf(stderr, "Abort at %s:%d: %s\n", file, line, buf);
// If compiled with any sanitizer, they can also print a stack trace.
#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
__sanitizer_print_stack_trace();
#endif // HWY_IS_*
fflush(stderr);
// Now terminate the program:
#if HWY_ARCH_RVV
exit(1); // trap/abort just freeze Spike.
#elif HWY_IS_DEBUG_BUILD && !HWY_COMPILER_MSVC
// Facilitates breaking into a debugger, but don't use this in non-debug
// builds because it looks like "illegal instruction", which is misleading.
__builtin_trap();
#else
abort(); // Compile error without this due to HWY_NORETURN.
#endif
}
void DisableTargets(uint32_t disabled_targets) {
supported_mask_ = ~(disabled_targets & ~uint32_t(HWY_ENABLED_BASELINE));
// We can call Update() here to initialize the mask but that will trigger a
// call to SupportedTargets() which we use in tests to tell whether any of the
// highway dynamic dispatch functions were used.
GetChosenTarget().DeInit();
}
void SetSupportedTargetsForTest(uint32_t targets) {
// Reset the cached supported_ value to 0 to force a re-evaluation in the
// next call to SupportedTargets() which will use the mocked value set here
// if not zero.
supported_.store(0, std::memory_order_release);
supported_targets_for_test_ = targets;
GetChosenTarget().DeInit();
}
bool SupportedTargetsCalledForTest() {
return supported_.load(std::memory_order_acquire) != 0;
}
uint32_t SupportedTargets() {
uint32_t bits = supported_.load(std::memory_order_acquire);
// Already initialized?
if (HWY_LIKELY(bits != 0)) {
return bits & supported_mask_;
}
// When running tests, this allows to mock the current supported targets.
if (HWY_UNLIKELY(supported_targets_for_test_ != 0)) {
// Store the value to signal that this was used.
supported_.store(supported_targets_for_test_, std::memory_order_release);
return supported_targets_for_test_ & supported_mask_;
}
bits = HWY_SCALAR;
// Returns targets supported by the CPU, independently of DisableTargets.
// Factored out of SupportedTargets to make its structure more obvious. Note
// that x86 CPUID may take several hundred cycles.
int64_t DetectTargets() {
// Apps will use only one of these (the default is EMU128), but compile flags
// for this TU may differ from that of the app, so allow both.
int64_t bits = HWY_SCALAR | HWY_EMU128;
#if HWY_ARCH_X86
bool has_osxsave = false;
@ -288,6 +234,7 @@ uint32_t SupportedTargets() {
flags |= IsBitSet(abcd[1], 30) ? Bit(FeatureIndex::kAVX512BW) : 0;
flags |= IsBitSet(abcd[1], 31) ? Bit(FeatureIndex::kAVX512VL) : 0;
flags |= IsBitSet(abcd[2], 1) ? Bit(FeatureIndex::kVBMI) : 0;
flags |= IsBitSet(abcd[2], 6) ? Bit(FeatureIndex::kVBMI2) : 0;
flags |= IsBitSet(abcd[2], 9) ? Bit(FeatureIndex::kVAES) : 0;
flags |= IsBitSet(abcd[2], 10) ? Bit(FeatureIndex::kVPCLMULQDQ) : 0;
@ -318,33 +265,162 @@ uint32_t SupportedTargets() {
// are not preserved across context switches.
if (has_osxsave) {
const uint32_t xcr0 = ReadXCR0();
const int64_t min_avx3 = HWY_AVX3 | HWY_AVX3_DL;
const int64_t min_avx2 = HWY_AVX2 | min_avx3;
// XMM
if (!IsBitSet(xcr0, 1)) {
bits &=
~uint32_t(HWY_SSSE3 | HWY_SSE4 | HWY_AVX2 | HWY_AVX3 | HWY_AVX3_DL);
bits &= ~(HWY_SSSE3 | HWY_SSE4 | min_avx2);
}
// YMM
if (!IsBitSet(xcr0, 2)) {
bits &= ~uint32_t(HWY_AVX2 | HWY_AVX3 | HWY_AVX3_DL);
bits &= ~min_avx2;
}
// ZMM + opmask
if ((xcr0 & 0x70) != 0x70) {
bits &= ~uint32_t(HWY_AVX3 | HWY_AVX3_DL);
// opmask, ZMM lo/hi
if (!IsBitSet(xcr0, 5) || !IsBitSet(xcr0, 6) || !IsBitSet(xcr0, 7)) {
bits &= ~min_avx3;
}
}
#else
// TODO(janwas): detect for other platforms
bits = HWY_ENABLED_BASELINE;
#endif // HWY_ARCH_X86
if ((bits & HWY_ENABLED_BASELINE) != HWY_ENABLED_BASELINE) {
fprintf(stderr, "WARNING: CPU supports %zx but software requires %x\n",
size_t(bits), HWY_ENABLED_BASELINE);
fprintf(stderr,
"WARNING: CPU supports %" PRIx64 " but software requires %" PRIx64
"\n",
bits, static_cast<int64_t>(HWY_ENABLED_BASELINE));
}
supported_.store(bits, std::memory_order_release);
return bits & supported_mask_;
#elif HWY_ARCH_ARM && HWY_HAVE_RUNTIME_DISPATCH
using CapBits = unsigned long; // NOLINT
const CapBits hw = getauxval(AT_HWCAP);
(void)hw;
#if HWY_ARCH_ARM_A64
#if defined(HWCAP_AES)
// aarch64 always has NEON and VFPv4, but not necessarily AES, which we
// require and thus must still check for.
if (hw & HWCAP_AES) {
bits |= HWY_NEON;
}
#endif // HWCAP_AES
#if defined(HWCAP_SVE)
if (hw & HWCAP_SVE) {
bits |= HWY_SVE;
}
#endif
#if defined(HWCAP2_SVE2) && defined(HWCAP2_SVEAES)
const CapBits hw2 = getauxval(AT_HWCAP2);
if ((hw2 & HWCAP2_SVE2) && (hw2 & HWCAP2_SVEAES)) {
bits |= HWY_SVE2;
}
#endif
#else // HWY_ARCH_ARM_A64
// Some old auxv.h / hwcap.h do not define these. If not, treat as unsupported.
// Note that AES has a different HWCAP bit compared to aarch64.
#if defined(HWCAP_NEON) && defined(HWCAP_VFPv4)
if ((hw & HWCAP_NEON) && (hw & HWCAP_VFPv4)) {
bits |= HWY_NEON;
}
#endif
#endif // HWY_ARCH_ARM_A64
if ((bits & HWY_ENABLED_BASELINE) != HWY_ENABLED_BASELINE) {
fprintf(stderr,
"WARNING: CPU supports %" PRIx64 " but software requires %" PRIx64
"\n",
bits, static_cast<int64_t>(HWY_ENABLED_BASELINE));
}
#else // HWY_ARCH_ARM && HWY_HAVE_RUNTIME_DISPATCH
// TODO(janwas): detect for other platforms and check for baseline
// This file is typically compiled without HWY_IS_TEST, but targets_test has
// it set, and will expect all of its HWY_TARGETS (= all attainable) to be
// supported.
bits |= HWY_ENABLED_BASELINE;
#endif // HWY_ARCH_X86
return bits;
}
} // namespace
HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
Abort(const char* file, int line, const char* format, ...) {
char buf[2000];
va_list args;
va_start(args, format);
vsnprintf(buf, sizeof(buf), format, args);
va_end(args);
fprintf(stderr, "Abort at %s:%d: %s\n", file, line, buf);
// If compiled with any sanitizer, they can also print a stack trace.
#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
__sanitizer_print_stack_trace();
#endif // HWY_IS_*
fflush(stderr);
// Now terminate the program:
#if HWY_ARCH_RVV
exit(1); // trap/abort just freeze Spike.
#elif HWY_IS_DEBUG_BUILD && !HWY_COMPILER_MSVC
// Facilitates breaking into a debugger, but don't use this in non-debug
// builds because it looks like "illegal instruction", which is misleading.
__builtin_trap();
#else
abort(); // Compile error without this due to HWY_NORETURN.
#endif
}
HWY_DLLEXPORT void DisableTargets(int64_t disabled_targets) {
supported_mask_ = static_cast<int64_t>(~disabled_targets);
// This will take effect on the next call to SupportedTargets, which is
// called right before GetChosenTarget::Update. However, calling Update here
// would make it appear that HWY_DYNAMIC_DISPATCH was called, which we want
// to check in tests. We instead de-initialize such that the next
// HWY_DYNAMIC_DISPATCH calls GetChosenTarget::Update via FunctionCache.
GetChosenTarget().DeInit();
}
HWY_DLLEXPORT void SetSupportedTargetsForTest(int64_t targets) {
supported_targets_for_test_ = targets;
GetChosenTarget().DeInit(); // see comment above
}
HWY_DLLEXPORT int64_t SupportedTargets() {
int64_t targets = supported_targets_for_test_;
if (HWY_LIKELY(targets == 0)) {
// Mock not active. Re-detect instead of caching just in case we're on a
// heterogeneous ISA (also requires some app support to pin threads). This
// is only reached on the first HWY_DYNAMIC_DISPATCH or after each call to
// DisableTargets or SetSupportedTargetsForTest.
targets = DetectTargets();
// VectorBytes invokes HWY_DYNAMIC_DISPATCH. To prevent infinite recursion,
// first set up ChosenTarget. No need to Update() again afterwards with the
// final targets - that will be done by a caller of this function.
GetChosenTarget().Update(targets);
// Now that we can call VectorBytes, check for targets with specific sizes.
if (HWY_ARCH_ARM_A64) {
const size_t vec_bytes = VectorBytes(); // uncached, see declaration
if ((targets & HWY_SVE) && vec_bytes == 32) {
targets = static_cast<int64_t>(targets | HWY_SVE_256);
} else {
targets = static_cast<int64_t>(targets & ~HWY_SVE_256);
}
if ((targets & HWY_SVE2) && vec_bytes == 16) {
targets = static_cast<int64_t>(targets | HWY_SVE2_128);
} else {
targets = static_cast<int64_t>(targets & ~HWY_SVE2_128);
}
} // HWY_ARCH_ARM_A64
}
targets &= supported_mask_;
return targets == 0 ? HWY_STATIC_TARGET : targets;
}
HWY_DLLEXPORT ChosenTarget& GetChosenTarget() {
@ -352,14 +428,4 @@ HWY_DLLEXPORT ChosenTarget& GetChosenTarget() {
return chosen_target;
}
void ChosenTarget::Update() {
// The supported variable contains the current CPU supported targets shifted
// to the location expected by the ChosenTarget mask. We enabled SCALAR
// regardless of whether it was compiled since it is also used as the
// fallback mechanism to the baseline target.
uint32_t supported = HWY_CHOSEN_TARGET_SHIFT(hwy::SupportedTargets()) |
HWY_CHOSEN_TARGET_MASK_SCALAR;
mask_.store(supported);
}
} // namespace hwy
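A caller-side sketch of the reworked 64-bit target mask; the specific target passed to DisableTargets is only an example:

#include <stdint.h>
#include <stdio.h>

#include "hwy/targets.h"

int main() {
  // Lists the compiled-in targets that this CPU supports.
  for (int64_t target : hwy::SupportedAndGeneratedTargets()) {
    printf("supported: %s\n", hwy::TargetName(target));
  }
  // Exclude AVX3_DL from subsequent SupportedTargets()/dynamic dispatch.
  hwy::DisableTargets(HWY_AVX3_DL);
  return 0;
}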

248
third_party/highway/hwy/targets.h (vendored)

@ -1,4 +1,5 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -24,13 +25,18 @@
#include "hwy/detect_targets.h"
#include "hwy/highway_export.h"
#if !HWY_ARCH_RVV
#include <atomic>
#endif
namespace hwy {
// Returns (cached) bitfield of enabled targets that are supported on this CPU.
// Implemented in targets.cc; unconditionally compiled to support the use case
// of binary-only distributions. The HWY_SUPPORTED_TARGETS wrapper may allow
// eliding calls to this function.
HWY_DLLEXPORT uint32_t SupportedTargets();
// Returns bitfield of enabled targets that are supported on this CPU; there is
// always at least one such target, hence the return value is never 0. The
// targets returned may change after calling DisableTargets. This function is
// always defined, but the HWY_SUPPORTED_TARGETS wrapper may allow eliding
// calls to it if there is only a single target enabled.
HWY_DLLEXPORT int64_t SupportedTargets();
// Evaluates to a function call, or literal if there is a single target.
#if (HWY_TARGETS & (HWY_TARGETS - 1)) == 0
@ -39,40 +45,36 @@ HWY_DLLEXPORT uint32_t SupportedTargets();
#define HWY_SUPPORTED_TARGETS hwy::SupportedTargets()
#endif
// Disable from runtime dispatch the mask of compiled in targets. Targets that
// were not enabled at compile time are ignored. This function is useful to
// disable a target supported by the CPU that is known to have bugs or when a
// lower target is desired. For this reason, attempts to disable targets which
// are in HWY_ENABLED_BASELINE have no effect so SupportedTargets() always
// returns at least the baseline target.
HWY_DLLEXPORT void DisableTargets(uint32_t disabled_targets);
// Subsequent SupportedTargets will not return targets whose bit(s) are set in
// `disabled_targets`. Exception: if SupportedTargets would return 0, it will
// instead return HWY_STATIC_TARGET (there must always be one target to call).
//
// This function is useful for disabling targets known to be buggy, or if the
// best available target is undesirable (perhaps due to throttling or memory
// bandwidth limitations). Use SetSupportedTargetsForTest instead of this
// function for iteratively enabling specific targets for testing.
HWY_DLLEXPORT void DisableTargets(int64_t disabled_targets);
// Set the mock mask of CPU supported targets instead of the actual CPU
// supported targets computed in SupportedTargets(). The return value of
// SupportedTargets() will still be affected by the DisableTargets() mask
// regardless of this mock, to prevent accidentally adding targets that are
// known to be buggy in the current CPU. Call with a mask of 0 to disable the
// mock and use the actual CPU supported targets instead.
HWY_DLLEXPORT void SetSupportedTargetsForTest(uint32_t targets);
// Returns whether the SupportedTargets() function was called since the last
// SetSupportedTargetsForTest() call.
HWY_DLLEXPORT bool SupportedTargetsCalledForTest();
// Subsequent SupportedTargets will return the given set of targets, except
// those disabled via DisableTargets. Call with a mask of 0 to disable the mock
// and return to the normal SupportedTargets behavior. Used to run tests for
// all targets.
HWY_DLLEXPORT void SetSupportedTargetsForTest(int64_t targets);
// Return the list of targets in HWY_TARGETS supported by the CPU as a list of
// individual HWY_* target macros such as HWY_SCALAR or HWY_NEON. This list
// is affected by the current SetSupportedTargetsForTest() mock if any.
HWY_INLINE std::vector<uint32_t> SupportedAndGeneratedTargets() {
std::vector<uint32_t> ret;
for (uint32_t targets = SupportedTargets() & HWY_TARGETS; targets != 0;
HWY_INLINE std::vector<int64_t> SupportedAndGeneratedTargets() {
std::vector<int64_t> ret;
for (int64_t targets = SupportedTargets() & HWY_TARGETS; targets != 0;
targets = targets & (targets - 1)) {
uint32_t current_target = targets & ~(targets - 1);
int64_t current_target = targets & ~(targets - 1);
ret.push_back(current_target);
}
return ret;
}
static inline HWY_MAYBE_UNUSED const char* TargetName(uint32_t target) {
static inline HWY_MAYBE_UNUSED const char* TargetName(int64_t target) {
switch (target) {
#if HWY_ARCH_X86
case HWY_SSSE3:
@ -88,22 +90,28 @@ static inline HWY_MAYBE_UNUSED const char* TargetName(uint32_t target) {
#endif
#if HWY_ARCH_ARM
case HWY_SVE2_128:
return "SVE2_128";
case HWY_SVE_256:
return "SVE_256";
case HWY_SVE2:
return "SVE2";
case HWY_SVE:
return "SVE";
case HWY_NEON:
return "Neon";
return "NEON";
#endif
#if HWY_ARCH_PPC
case HWY_PPC8:
return "Power8";
return "PPC8";
#endif
#if HWY_ARCH_WASM
case HWY_WASM:
return "Wasm";
return "WASM";
case HWY_WASM_EMU256:
return "WASM_EMU256";
#endif
#if HWY_ARCH_RVV
@ -111,8 +119,10 @@ static inline HWY_MAYBE_UNUSED const char* TargetName(uint32_t target) {
return "RVV";
#endif
case HWY_EMU128:
return "EMU128";
case HWY_SCALAR:
return "Scalar";
return "SCALAR";
default:
return "Unknown"; // must satisfy gtest IsValidParamName()
@ -125,95 +135,125 @@ static inline HWY_MAYBE_UNUSED const char* TargetName(uint32_t target) {
// For the ChosenTarget mask and index we use a different bit arrangement than
// in the HWY_TARGETS mask. Only the targets involved in the current
// architecture are used in this mask, and therefore only the least significant
// (HWY_MAX_DYNAMIC_TARGETS + 2) bits of the uint32_t mask are used. The least
// (HWY_MAX_DYNAMIC_TARGETS + 2) bits of the int64_t mask are used. The least
// significant bit is set when the mask is not initialized, the next
// HWY_MAX_DYNAMIC_TARGETS more significant bits are a range of bits from the
// HWY_TARGETS or SupportedTargets() mask for the given architecture shifted to
// that position and the next more significant bit is used for the scalar
// target. Because of this we need to define equivalent values for HWY_TARGETS
// in this representation.
// that position and the next more significant bit is used for HWY_SCALAR (if
// HWY_COMPILE_ONLY_SCALAR is defined) or HWY_EMU128. Because of this we need to
// define equivalent values for HWY_TARGETS in this representation.
// This mask representation allows to use ctz() on this mask and obtain a small
// number that's used as an index of the table for dynamic dispatch. In this
// way the first entry is used when the mask is uninitialized, the following
// HWY_MAX_DYNAMIC_TARGETS are for dynamic dispatch and the last one is for
// scalar.
// The HWY_SCALAR bit in the ChosenTarget mask format.
#define HWY_CHOSEN_TARGET_MASK_SCALAR (1u << (HWY_MAX_DYNAMIC_TARGETS + 1))
// The HWY_SCALAR/HWY_EMU128 bit in the ChosenTarget mask format.
#define HWY_CHOSEN_TARGET_MASK_SCALAR (1LL << (HWY_MAX_DYNAMIC_TARGETS + 1))
// Converts from a HWY_TARGETS mask to a ChosenTarget mask format for the
// current architecture.
#define HWY_CHOSEN_TARGET_SHIFT(X) \
((((X) >> (HWY_HIGHEST_TARGET_BIT + 1 - HWY_MAX_DYNAMIC_TARGETS)) & \
((1u << HWY_MAX_DYNAMIC_TARGETS) - 1)) \
((1LL << HWY_MAX_DYNAMIC_TARGETS) - 1)) \
<< 1)
// The HWY_TARGETS mask in the ChosenTarget mask format.
#define HWY_CHOSEN_TARGET_MASK_TARGETS \
(HWY_CHOSEN_TARGET_SHIFT(HWY_TARGETS) | HWY_CHOSEN_TARGET_MASK_SCALAR | 1u)
(HWY_CHOSEN_TARGET_SHIFT(HWY_TARGETS) | HWY_CHOSEN_TARGET_MASK_SCALAR | 1LL)
#if HWY_ARCH_X86
// Maximum number of dynamic targets, changing this value is an ABI incompatible
// change
#define HWY_MAX_DYNAMIC_TARGETS 10
#define HWY_MAX_DYNAMIC_TARGETS 15
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_X86
// These must match the order in which the HWY_TARGETS are defined
// starting by the least significant (HWY_HIGHEST_TARGET_BIT + 1 -
// HWY_MAX_DYNAMIC_TARGETS) bit. This list must contain exactly
// HWY_MAX_DYNAMIC_TARGETS elements and does not include SCALAR. The first entry
// corresponds to the best target. Don't include a "," at the end of the list.
#define HWY_CHOOSE_TARGET_LIST(func_name) \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
HWY_CHOOSE_AVX3_DL(func_name), /* AVX3_DL */ \
HWY_CHOOSE_AVX3(func_name), /* AVX3 */ \
HWY_CHOOSE_AVX2(func_name), /* AVX2 */ \
nullptr, /* AVX */ \
HWY_CHOOSE_SSE4(func_name), /* SSE4 */ \
HWY_CHOOSE_SSSE3(func_name), /* SSSE3 */ \
nullptr, /* SSE3 */ \
nullptr /* SSE2 */
#define HWY_CHOOSE_TARGET_LIST(func_name) \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
HWY_CHOOSE_AVX3_DL(func_name), /* AVX3_DL */ \
HWY_CHOOSE_AVX3(func_name), /* AVX3 */ \
HWY_CHOOSE_AVX2(func_name), /* AVX2 */ \
nullptr, /* AVX */ \
HWY_CHOOSE_SSE4(func_name), /* SSE4 */ \
HWY_CHOOSE_SSSE3(func_name), /* SSSE3 */ \
nullptr , /* reserved - SSE3? */ \
nullptr /* reserved - SSE2? */
#elif HWY_ARCH_ARM
// See HWY_ARCH_X86 above for details.
#define HWY_MAX_DYNAMIC_TARGETS 4
#define HWY_MAX_DYNAMIC_TARGETS 15
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_ARM
#define HWY_CHOOSE_TARGET_LIST(func_name) \
HWY_CHOOSE_SVE2(func_name), /* SVE2 */ \
HWY_CHOOSE_SVE(func_name), /* SVE */ \
nullptr, /* reserved */ \
HWY_CHOOSE_NEON(func_name) /* NEON */
#elif HWY_ARCH_PPC
// See HWY_ARCH_X86 above for details.
#define HWY_MAX_DYNAMIC_TARGETS 5
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_PPC
#define HWY_CHOOSE_TARGET_LIST(func_name) \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
HWY_CHOOSE_PPC8(func_name), /* PPC8 */ \
nullptr, /* VSX */ \
nullptr /* AltiVec */
#elif HWY_ARCH_WASM
// See HWY_ARCH_X86 above for details.
#define HWY_MAX_DYNAMIC_TARGETS 4
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_WASM
#define HWY_CHOOSE_TARGET_LIST(func_name) \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
HWY_CHOOSE_WASM2(func_name), /* WASM2 */ \
HWY_CHOOSE_WASM(func_name) /* WASM */
#define HWY_CHOOSE_TARGET_LIST(func_name) \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
HWY_CHOOSE_SVE2_128(func_name), /* SVE2 128-bit */ \
HWY_CHOOSE_SVE_256(func_name), /* SVE 256-bit */ \
HWY_CHOOSE_SVE2(func_name), /* SVE2 */ \
HWY_CHOOSE_SVE(func_name), /* SVE */ \
HWY_CHOOSE_NEON(func_name), /* NEON */ \
nullptr /* reserved - Helium? */
#elif HWY_ARCH_RVV
// See HWY_ARCH_X86 above for details.
#define HWY_MAX_DYNAMIC_TARGETS 4
#define HWY_MAX_DYNAMIC_TARGETS 9
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_RVV
#define HWY_CHOOSE_TARGET_LIST(func_name) \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
HWY_CHOOSE_RVV(func_name) /* RVV */
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
HWY_CHOOSE_RVV(func_name), /* RVV */ \
nullptr /* reserved */
#elif HWY_ARCH_PPC
// See HWY_ARCH_X86 above for details.
#define HWY_MAX_DYNAMIC_TARGETS 9
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_PPC
#define HWY_CHOOSE_TARGET_LIST(func_name) \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
HWY_CHOOSE_PPC8(func_name), /* PPC8 */ \
nullptr, /* reserved (VSX or AltiVec) */ \
nullptr /* reserved (VSX or AltiVec) */
#elif HWY_ARCH_WASM
// See HWY_ARCH_X86 above for details.
#define HWY_MAX_DYNAMIC_TARGETS 9
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_WASM
#define HWY_CHOOSE_TARGET_LIST(func_name) \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
HWY_CHOOSE_WASM_EMU256(func_name), /* WASM_EMU256 */ \
HWY_CHOOSE_WASM(func_name), /* WASM */ \
nullptr /* reserved */
#else
// Unknown architecture, will use HWY_SCALAR without dynamic dispatch, though
@ -222,32 +262,52 @@ static inline HWY_MAYBE_UNUSED const char* TargetName(uint32_t target) {
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_SCALAR
#endif
// Bitfield of supported and enabled targets. The format differs from that of
// HWY_TARGETS; the lowest bit governs the first function pointer (which is
// special in that it calls FunctionCache, then Update, then dispatches to the
// actual implementation) in the tables created by HWY_EXPORT. Monostate (see
// GetChosenTarget), thread-safe except on RVV.
struct ChosenTarget {
public:
// Update the ChosenTarget mask based on the current CPU supported
// targets.
HWY_DLLEXPORT void Update();
// Reset bits according to `targets` (typically the return value of
// SupportedTargets()). Postcondition: IsInitialized() == true.
void Update(int64_t targets) {
// These are `targets` shifted downwards, see above. Also include SCALAR
// (corresponds to the last entry in the function table) as fallback.
StoreMask(HWY_CHOSEN_TARGET_SHIFT(targets) | HWY_CHOSEN_TARGET_MASK_SCALAR);
}
// Reset the ChosenTarget to the uninitialized state.
void DeInit() { mask_.store(1); }
// Reset to the uninitialized state, so that FunctionCache will call Update
// during the next HWY_DYNAMIC_DISPATCH, and IsInitialized returns false.
void DeInit() { StoreMask(1); }
// Whether the ChosenTarget was initialized. This is useful to know whether
// any HWY_DYNAMIC_DISPATCH function was called.
bool IsInitialized() const { return mask_.load() != 1; }
// Whether Update was called. This indicates whether any HWY_DYNAMIC_DISPATCH
// function was called, which we check in tests.
bool IsInitialized() const { return LoadMask() != 1; }
// Return the index in the dynamic dispatch table to be used by the current
// CPU. Note that this method must be in the header file so it uses the value
// of HWY_CHOSEN_TARGET_MASK_TARGETS defined in the translation unit that
// calls it, which may be different from others. This allows to only consider
// calls it, which may be different from others. This means we only enable
// those targets that were actually compiled in this module.
size_t HWY_INLINE GetIndex() const {
return hwy::Num0BitsBelowLS1Bit_Nonzero32(mask_.load() &
HWY_CHOSEN_TARGET_MASK_TARGETS);
return hwy::Num0BitsBelowLS1Bit_Nonzero64(
static_cast<uint64_t>(LoadMask() & HWY_CHOSEN_TARGET_MASK_TARGETS));
}
private:
// Initialized to 1 so GetIndex() returns 0.
std::atomic<uint32_t> mask_{1};
// TODO(janwas): remove #if once <atomic> is available
#if HWY_ARCH_RVV
int64_t LoadMask() const { return mask_; }
void StoreMask(int64_t mask) { mask_ = mask; }
int64_t mask_{1}; // Initialized to 1 so GetIndex() returns 0.
#else
int64_t LoadMask() const { return mask_.load(); }
void StoreMask(int64_t mask) { mask_.store(mask); }
std::atomic<int64_t> mask_{1}; // Initialized to 1 so GetIndex() returns 0.
#endif // HWY_ARCH_RVV
};
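To make the FunctionCache mechanism mentioned above concrete, here is a minimal self-contained sketch (all names below are hypothetical, not Highway's): table entry 0 is a trampoline that performs the one-time detection/update and then forwards to the implementation chosen for the current CPU, so even the very first dynamic-dispatch call returns the right result.

#include <atomic>
#include <cstdint>
#include <cstdio>

namespace sketch {

std::atomic<int64_t> mask{1};  // 1 == uninitialized, so the lookup below yields entry 0

size_t Index() {
  // GCC/Clang builtin; C++20 std::countr_zero would also work.
  return static_cast<size_t>(__builtin_ctzll(static_cast<uint64_t>(mask.load())));
}

int64_t Cache();                  // entry 0: detect, update the mask, re-dispatch
int64_t Best() { return 42; }     // entry 1: stand-in for the best SIMD target
int64_t Fallback() { return 0; }  // entry 2: stand-in for the scalar fallback

using Fn = int64_t (*)();
const Fn kTable[] = {&Cache, &Best, &Fallback};

int64_t Cache() {
  mask.store(int64_t{1} << 1);  // pretend detection found that entry 1 is supported
  return kTable[Index()]();     // forward so the first call already gets a result
}

}  // namespace sketch

int main() {
  std::printf("first call  -> %d\n", static_cast<int>(sketch::kTable[sketch::Index()]()));
  std::printf("second call -> %d\n", static_cast<int>(sketch::kTable[sketch::Index()]()));
  return 0;
}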
// For internal use (e.g. by FunctionCache and DisableTargets).

111
third_party/highway/hwy/targets_test.cc (vendored)

@ -1,4 +1,5 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -18,9 +19,10 @@
namespace fake {
#define DECLARE_FUNCTION(TGT) \
namespace N_##TGT { \
uint32_t FakeFunction(int) { return HWY_##TGT; } \
#define DECLARE_FUNCTION(TGT) \
namespace N_##TGT { \
/* Function argument is just to ensure/demonstrate they are possible. */ \
int64_t FakeFunction(int) { return HWY_##TGT; } \
}
DECLARE_FUNCTION(AVX3_DL)
@ -31,41 +33,62 @@ DECLARE_FUNCTION(SSSE3)
DECLARE_FUNCTION(NEON)
DECLARE_FUNCTION(SVE)
DECLARE_FUNCTION(SVE2)
DECLARE_FUNCTION(SVE_256)
DECLARE_FUNCTION(SVE2_128)
DECLARE_FUNCTION(PPC8)
DECLARE_FUNCTION(WASM)
DECLARE_FUNCTION(RVV)
DECLARE_FUNCTION(SCALAR)
DECLARE_FUNCTION(EMU128)
HWY_EXPORT(FakeFunction);
void CallFunctionForTarget(int64_t target, int line) {
if ((HWY_TARGETS & target) == 0) return;
hwy::SetSupportedTargetsForTest(target);
// Call Update() first to make &HWY_DYNAMIC_DISPATCH() return
// the pointer to the already cached function.
hwy::GetChosenTarget().Update(hwy::SupportedTargets());
EXPECT_EQ(target, HWY_DYNAMIC_DISPATCH(FakeFunction)(42)) << line;
// Calling DeInit() will test that the initializer function
// also calls the right function.
hwy::GetChosenTarget().DeInit();
#if HWY_DISPATCH_WORKAROUND
EXPECT_EQ(HWY_STATIC_TARGET, HWY_DYNAMIC_DISPATCH(FakeFunction)(42)) << line;
#else
EXPECT_EQ(target, HWY_DYNAMIC_DISPATCH(FakeFunction)(42)) << line;
#endif
// Second call uses the cached value from the previous call.
EXPECT_EQ(target, HWY_DYNAMIC_DISPATCH(FakeFunction)(42)) << line;
}
void CheckFakeFunction() {
#define CHECK_ARRAY_ENTRY(TGT) \
if ((HWY_TARGETS & HWY_##TGT) != 0) { \
hwy::SetSupportedTargetsForTest(HWY_##TGT); \
/* Calling Update() first to make &HWY_DYNAMIC_DISPATCH() return */ \
/* the pointer to the already cached function. */ \
hwy::GetChosenTarget().Update(); \
EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \
/* Calling DeInit() will test that the initializer function */ \
/* also calls the right function. */ \
hwy::GetChosenTarget().DeInit(); \
EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \
/* Second call uses the cached value from the previous call. */ \
EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \
}
CHECK_ARRAY_ENTRY(AVX3_DL)
CHECK_ARRAY_ENTRY(AVX3)
CHECK_ARRAY_ENTRY(AVX2)
CHECK_ARRAY_ENTRY(SSE4)
CHECK_ARRAY_ENTRY(SSSE3)
CHECK_ARRAY_ENTRY(NEON)
CHECK_ARRAY_ENTRY(SVE)
CHECK_ARRAY_ENTRY(SVE2)
CHECK_ARRAY_ENTRY(PPC8)
CHECK_ARRAY_ENTRY(WASM)
CHECK_ARRAY_ENTRY(RVV)
CHECK_ARRAY_ENTRY(SCALAR)
#undef CHECK_ARRAY_ENTRY
// When adding a target, also add to DECLARE_FUNCTION above.
CallFunctionForTarget(HWY_AVX3_DL, __LINE__);
CallFunctionForTarget(HWY_AVX3, __LINE__);
CallFunctionForTarget(HWY_AVX2, __LINE__);
CallFunctionForTarget(HWY_SSE4, __LINE__);
CallFunctionForTarget(HWY_SSSE3, __LINE__);
CallFunctionForTarget(HWY_NEON, __LINE__);
CallFunctionForTarget(HWY_SVE, __LINE__);
CallFunctionForTarget(HWY_SVE2, __LINE__);
CallFunctionForTarget(HWY_SVE_256, __LINE__);
CallFunctionForTarget(HWY_SVE2_128, __LINE__);
CallFunctionForTarget(HWY_PPC8, __LINE__);
CallFunctionForTarget(HWY_WASM, __LINE__);
CallFunctionForTarget(HWY_RVV, __LINE__);
// The tables only have space for either HWY_SCALAR or HWY_EMU128; the former
// is opt-in only.
#if defined(HWY_COMPILE_ONLY_SCALAR) || HWY_BROKEN_EMU128
CallFunctionForTarget(HWY_SCALAR, __LINE__);
#else
CallFunctionForTarget(HWY_EMU128, __LINE__);
#endif
}
} // namespace fake
@ -86,31 +109,27 @@ class HwyTargetsTest : public testing::Test {
TEST_F(HwyTargetsTest, ChosenTargetOrderTest) { fake::CheckFakeFunction(); }
TEST_F(HwyTargetsTest, DisabledTargetsTest) {
DisableTargets(~0u);
// Check that the baseline can't be disabled.
HWY_ASSERT(HWY_ENABLED_BASELINE == SupportedTargets());
DisableTargets(~0LL);
// Check that disabling everything at least leaves the static target.
HWY_ASSERT(HWY_STATIC_TARGET == SupportedTargets());
DisableTargets(0); // Reset the mask.
uint32_t current_targets = SupportedTargets();
if ((current_targets & ~uint32_t(HWY_ENABLED_BASELINE)) == 0) {
const int64_t current_targets = SupportedTargets();
const int64_t enabled_baseline = static_cast<int64_t>(HWY_ENABLED_BASELINE);
// Exclude these two because they are always returned by SupportedTargets.
const int64_t fallback = HWY_SCALAR | HWY_EMU128;
if ((current_targets & ~enabled_baseline & ~fallback) == 0) {
// We can't test anything else if the only compiled target is the baseline.
return;
}
// Get the lowest bit in the mask (the best target) and disable that one.
uint32_t lowest_target = current_targets & (~current_targets + 1);
// The lowest target shouldn't be one in the baseline.
HWY_ASSERT((lowest_target & ~uint32_t(HWY_ENABLED_BASELINE)) != 0);
DisableTargets(lowest_target);
const int64_t best_target = current_targets & (~current_targets + 1);
DisableTargets(best_target);
// Check that the other targets are still enabled.
HWY_ASSERT((lowest_target ^ current_targets) == SupportedTargets());
HWY_ASSERT((best_target ^ current_targets) == SupportedTargets());
DisableTargets(0); // Reset the mask.
}
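As an aside, the `current_targets & (~current_targets + 1)` expression used above isolates the lowest set bit, i.e. the best remaining target. A tiny standalone illustration with a made-up mask value:

#include <cstdint>
#include <cstdio>

int main() {
  const int64_t targets = 0x2C;                   // hypothetical SupportedTargets() bits
  const int64_t best = targets & (~targets + 1);  // keeps only the lowest set bit
  std::printf("targets=%#llx best=%#llx\n", static_cast<unsigned long long>(targets),
              static_cast<unsigned long long>(best));  // prints targets=0x2c best=0x4
  return 0;
}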
} // namespace hwy
// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char **argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}

third_party/highway/hwy/tests/arithmetic_test.cc (vendored)

@ -1,4 +1,5 @@
// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -16,12 +17,9 @@
#include <stddef.h>
#include <stdint.h>
#include <algorithm>
#include <limits>
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/arithmetic_test.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h" // IWYU pragma: keep
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
@ -177,6 +175,23 @@ HWY_NOINLINE void TestAllAbs() {
ForFloatTypes(ForPartialVectors<TestFloatAbs>());
}
struct TestNeg {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const auto v0 = Zero(d);
const auto vn = Set(d, T(-3));
const auto vp = Set(d, T(3));
HWY_ASSERT_VEC_EQ(d, v0, Neg(v0));
HWY_ASSERT_VEC_EQ(d, vp, Neg(vn));
HWY_ASSERT_VEC_EQ(d, vn, Neg(vp));
}
};
HWY_NOINLINE void TestAllNeg() {
ForSignedTypes(ForPartialVectors<TestNeg>());
ForFloatTypes(ForPartialVectors<TestNeg>());
}
struct TestUnsignedMinMax {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
@ -263,16 +278,15 @@ HWY_NOINLINE void TestAllMinMax() {
ForFloatTypes(ForPartialVectors<TestFloatMinMax>());
}
class TestMinMax128 {
template <class D>
static HWY_NOINLINE Vec<D> Make128(D d, uint64_t hi, uint64_t lo) {
alignas(16) uint64_t in[2];
in[0] = lo;
in[1] = hi;
return LoadDup128(d, in);
}
template <class D>
static HWY_NOINLINE Vec<D> Make128(D d, uint64_t hi, uint64_t lo) {
alignas(16) uint64_t in[2];
in[0] = lo;
in[1] = hi;
return LoadDup128(d, in);
}
public:
struct TestMinMax128 {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
using V = Vec<D>;
@ -341,779 +355,73 @@ HWY_NOINLINE void TestAllMinMax128() {
ForGEVectors<128, TestMinMax128>()(uint64_t());
}
struct TestUnsignedMul {
struct TestMinMax128Upper {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const auto v0 = Zero(d);
const auto v1 = Set(d, T(1));
const auto vi = Iota(d, 1);
const auto vj = Iota(d, 3);
using V = Vec<D>;
const size_t N = Lanes(d);
auto expected = AllocateAligned<T>(N);
HWY_ASSERT_VEC_EQ(d, v0, Mul(v0, v0));
HWY_ASSERT_VEC_EQ(d, v1, Mul(v1, v1));
HWY_ASSERT_VEC_EQ(d, vi, Mul(v1, vi));
HWY_ASSERT_VEC_EQ(d, vi, Mul(vi, v1));
for (size_t i = 0; i < N; ++i) {
expected[i] = static_cast<T>((1 + i) * (1 + i));
}
HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vi, vi));
for (size_t i = 0; i < N; ++i) {
expected[i] = static_cast<T>((1 + i) * (3 + i));
}
HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vi, vj));
const T max = LimitsMax<T>();
const auto vmax = Set(d, max);
HWY_ASSERT_VEC_EQ(d, vmax, Mul(vmax, v1));
HWY_ASSERT_VEC_EQ(d, vmax, Mul(v1, vmax));
const size_t bits = sizeof(T) * 8;
const uint64_t mask = (1ull << bits) - 1;
const T max2 = (uint64_t(max) * max) & mask;
HWY_ASSERT_VEC_EQ(d, Set(d, max2), Mul(vmax, vmax));
}
};
struct TestSignedMul {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const size_t N = Lanes(d);
auto expected = AllocateAligned<T>(N);
const auto v0 = Zero(d);
const auto v1 = Set(d, T(1));
const auto vi = Iota(d, 1);
const auto vn = Iota(d, -T(N)); // no i8 supported, so no wraparound
HWY_ASSERT_VEC_EQ(d, v0, Mul(v0, v0));
HWY_ASSERT_VEC_EQ(d, v1, Mul(v1, v1));
HWY_ASSERT_VEC_EQ(d, vi, Mul(v1, vi));
HWY_ASSERT_VEC_EQ(d, vi, Mul(vi, v1));
for (size_t i = 0; i < N; ++i) {
expected[i] = static_cast<T>((1 + i) * (1 + i));
}
HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vi, vi));
for (size_t i = 0; i < N; ++i) {
expected[i] = static_cast<T>((-T(N) + T(i)) * T(1u + i));
}
HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vn, vi));
HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vi, vn));
}
};
HWY_NOINLINE void TestAllMul() {
const ForPartialVectors<TestUnsignedMul> test_unsigned;
// No u8.
test_unsigned(uint16_t());
test_unsigned(uint32_t());
// No u64.
const ForPartialVectors<TestSignedMul> test_signed;
// No i8.
test_signed(int16_t());
test_signed(int32_t());
// No i64.
}
struct TestMulHigh {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
using Wide = MakeWide<T>;
const size_t N = Lanes(d);
auto in_lanes = AllocateAligned<T>(N);
auto expected_lanes = AllocateAligned<T>(N);
const auto vi = Iota(d, 1);
const auto vni = Iota(d, -T(N)); // no i8 supported, so no wraparound
const auto v0 = Zero(d);
HWY_ASSERT_VEC_EQ(d, v0, MulHigh(v0, v0));
HWY_ASSERT_VEC_EQ(d, v0, MulHigh(v0, vi));
HWY_ASSERT_VEC_EQ(d, v0, MulHigh(vi, v0));
// Large positive squared
for (size_t i = 0; i < N; ++i) {
in_lanes[i] = T(LimitsMax<T>() >> i);
expected_lanes[i] = T((Wide(in_lanes[i]) * in_lanes[i]) >> 16);
}
auto v = Load(d, in_lanes.get());
HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(v, v));
// Large positive * small positive
for (size_t i = 0; i < N; ++i) {
expected_lanes[i] = T((Wide(in_lanes[i]) * T(1u + i)) >> 16);
}
HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(v, vi));
HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(vi, v));
// Large positive * small negative
for (size_t i = 0; i < N; ++i) {
expected_lanes[i] = T((Wide(in_lanes[i]) * T(i - N)) >> 16);
}
HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(v, vni));
HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(vni, v));
}
};
HWY_NOINLINE void TestAllMulHigh() {
ForPartialVectors<TestMulHigh> test;
test(int16_t());
test(uint16_t());
}
struct TestMulEven {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
using Wide = MakeWide<T>;
const Repartition<Wide, D> d2;
const auto v0 = Zero(d);
HWY_ASSERT_VEC_EQ(d2, Zero(d2), MulEven(v0, v0));
const size_t N = Lanes(d);
auto in_lanes = AllocateAligned<T>(N);
auto expected = AllocateAligned<Wide>(Lanes(d2));
for (size_t i = 0; i < N; i += 2) {
in_lanes[i + 0] = LimitsMax<T>() >> i;
if (N != 1) {
in_lanes[i + 1] = 1; // unused
}
expected[i / 2] = Wide(in_lanes[i + 0]) * in_lanes[i + 0];
}
const auto v = Load(d, in_lanes.get());
HWY_ASSERT_VEC_EQ(d2, expected.get(), MulEven(v, v));
}
};
struct TestMulEvenOdd64 {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
#if HWY_TARGET != HWY_SCALAR
const auto v0 = Zero(d);
HWY_ASSERT_VEC_EQ(d, Zero(d), MulEven(v0, v0));
HWY_ASSERT_VEC_EQ(d, Zero(d), MulOdd(v0, v0));
const size_t N = Lanes(d);
if (N == 1) return;
auto in1 = AllocateAligned<T>(N);
auto in2 = AllocateAligned<T>(N);
auto expected_even = AllocateAligned<T>(N);
auto expected_odd = AllocateAligned<T>(N);
// Random inputs in each lane
auto a_lanes = AllocateAligned<T>(N);
auto b_lanes = AllocateAligned<T>(N);
auto min_lanes = AllocateAligned<T>(N);
auto max_lanes = AllocateAligned<T>(N);
RandomState rng;
const V v00 = Zero(d);
const V v01 = Make128(d, 0, 1);
const V v10 = Make128(d, 1, 0);
const V v11 = Add(v01, v10);
// Same arg
HWY_ASSERT_VEC_EQ(d, v00, Min128Upper(d, v00, v00));
HWY_ASSERT_VEC_EQ(d, v01, Min128Upper(d, v01, v01));
HWY_ASSERT_VEC_EQ(d, v10, Min128Upper(d, v10, v10));
HWY_ASSERT_VEC_EQ(d, v11, Min128Upper(d, v11, v11));
HWY_ASSERT_VEC_EQ(d, v00, Max128Upper(d, v00, v00));
HWY_ASSERT_VEC_EQ(d, v01, Max128Upper(d, v01, v01));
HWY_ASSERT_VEC_EQ(d, v10, Max128Upper(d, v10, v10));
HWY_ASSERT_VEC_EQ(d, v11, Max128Upper(d, v11, v11));
// Equivalent but not equal (chooses second arg)
HWY_ASSERT_VEC_EQ(d, v01, Min128Upper(d, v00, v01));
HWY_ASSERT_VEC_EQ(d, v11, Min128Upper(d, v10, v11));
HWY_ASSERT_VEC_EQ(d, v00, Min128Upper(d, v01, v00));
HWY_ASSERT_VEC_EQ(d, v10, Min128Upper(d, v11, v10));
HWY_ASSERT_VEC_EQ(d, v00, Max128Upper(d, v01, v00));
HWY_ASSERT_VEC_EQ(d, v10, Max128Upper(d, v11, v10));
HWY_ASSERT_VEC_EQ(d, v01, Max128Upper(d, v00, v01));
HWY_ASSERT_VEC_EQ(d, v11, Max128Upper(d, v10, v11));
// First arg less
HWY_ASSERT_VEC_EQ(d, v01, Min128Upper(d, v01, v10));
HWY_ASSERT_VEC_EQ(d, v10, Max128Upper(d, v01, v10));
// Second arg less
HWY_ASSERT_VEC_EQ(d, v01, Min128Upper(d, v10, v01));
HWY_ASSERT_VEC_EQ(d, v10, Max128Upper(d, v10, v01));
// Also check 128-bit blocks are independent
for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
for (size_t i = 0; i < N; ++i) {
in1[i] = Random64(&rng);
in2[i] = Random64(&rng);
a_lanes[i] = Random64(&rng);
b_lanes[i] = Random64(&rng);
}
const V a = Load(d, a_lanes.get());
const V b = Load(d, b_lanes.get());
for (size_t i = 0; i < N; i += 2) {
expected_even[i] = Mul128(in1[i], in2[i], &expected_even[i + 1]);
expected_odd[i] = Mul128(in1[i + 1], in2[i + 1], &expected_odd[i + 1]);
}
const auto a = Load(d, in1.get());
const auto b = Load(d, in2.get());
HWY_ASSERT_VEC_EQ(d, expected_even.get(), MulEven(a, b));
HWY_ASSERT_VEC_EQ(d, expected_odd.get(), MulOdd(a, b));
}
#else
(void)d;
#endif // HWY_TARGET != HWY_SCALAR
}
};
HWY_NOINLINE void TestAllMulEven() {
ForGEVectors<64, TestMulEven> test;
test(int32_t());
test(uint32_t());
ForGEVectors<128, TestMulEvenOdd64>()(uint64_t());
}
struct TestMulAdd {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const auto k0 = Zero(d);
const auto kNeg0 = Set(d, T(-0.0));
const auto v1 = Iota(d, 1);
const auto v2 = Iota(d, 2);
const size_t N = Lanes(d);
auto expected = AllocateAligned<T>(N);
HWY_ASSERT_VEC_EQ(d, k0, MulAdd(k0, k0, k0));
HWY_ASSERT_VEC_EQ(d, v2, MulAdd(k0, v1, v2));
HWY_ASSERT_VEC_EQ(d, v2, MulAdd(v1, k0, v2));
HWY_ASSERT_VEC_EQ(d, k0, NegMulAdd(k0, k0, k0));
HWY_ASSERT_VEC_EQ(d, v2, NegMulAdd(k0, v1, v2));
HWY_ASSERT_VEC_EQ(d, v2, NegMulAdd(v1, k0, v2));
for (size_t i = 0; i < N; ++i) {
expected[i] = static_cast<T>((i + 1) * (i + 2));
}
HWY_ASSERT_VEC_EQ(d, expected.get(), MulAdd(v2, v1, k0));
HWY_ASSERT_VEC_EQ(d, expected.get(), MulAdd(v1, v2, k0));
HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulAdd(Neg(v2), v1, k0));
HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulAdd(v1, Neg(v2), k0));
for (size_t i = 0; i < N; ++i) {
expected[i] = static_cast<T>((i + 2) * (i + 2) + (i + 1));
}
HWY_ASSERT_VEC_EQ(d, expected.get(), MulAdd(v2, v2, v1));
HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulAdd(Neg(v2), v2, v1));
for (size_t i = 0; i < N; ++i) {
expected[i] =
T(-T(i + 2u) * static_cast<T>(i + 2) + static_cast<T>(1 + i));
}
HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulAdd(v2, v2, v1));
HWY_ASSERT_VEC_EQ(d, k0, MulSub(k0, k0, k0));
HWY_ASSERT_VEC_EQ(d, kNeg0, NegMulSub(k0, k0, k0));
for (size_t i = 0; i < N; ++i) {
expected[i] = -T(i + 2);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), MulSub(k0, v1, v2));
HWY_ASSERT_VEC_EQ(d, expected.get(), MulSub(v1, k0, v2));
HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulSub(Neg(k0), v1, v2));
HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulSub(v1, Neg(k0), v2));
for (size_t i = 0; i < N; ++i) {
expected[i] = static_cast<T>((i + 1) * (i + 2));
}
HWY_ASSERT_VEC_EQ(d, expected.get(), MulSub(v1, v2, k0));
HWY_ASSERT_VEC_EQ(d, expected.get(), MulSub(v2, v1, k0));
HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulSub(Neg(v1), v2, k0));
HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulSub(v2, Neg(v1), k0));
for (size_t i = 0; i < N; ++i) {
expected[i] = static_cast<T>((i + 2) * (i + 2) - (1 + i));
}
HWY_ASSERT_VEC_EQ(d, expected.get(), MulSub(v2, v2, v1));
HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulSub(Neg(v2), v2, v1));
}
};
HWY_NOINLINE void TestAllMulAdd() {
ForFloatTypes(ForPartialVectors<TestMulAdd>());
}
struct TestReorderWidenMulAccumulate {
template <typename TN, class DN>
HWY_NOINLINE void operator()(TN /*unused*/, DN dn) {
using TW = MakeWide<TN>;
const RepartitionToWide<DN> dw;
const auto f0 = Zero(dw);
const auto f1 = Set(dw, 1.0f);
const auto fi = Iota(dw, 1);
const auto bf0 = ReorderDemote2To(dn, f0, f0);
const auto bf1 = ReorderDemote2To(dn, f1, f1);
const auto bfi = ReorderDemote2To(dn, fi, fi);
const size_t NW = Lanes(dw);
auto delta = AllocateAligned<TW>(2 * NW);
for (size_t i = 0; i < 2 * NW; ++i) {
delta[i] = 0.0f;
}
// Any input zero => both outputs zero
auto sum1 = f0;
HWY_ASSERT_VEC_EQ(dw, f0,
ReorderWidenMulAccumulate(dw, bf0, bf0, f0, sum1));
HWY_ASSERT_VEC_EQ(dw, f0, sum1);
HWY_ASSERT_VEC_EQ(dw, f0,
ReorderWidenMulAccumulate(dw, bf0, bfi, f0, sum1));
HWY_ASSERT_VEC_EQ(dw, f0, sum1);
HWY_ASSERT_VEC_EQ(dw, f0,
ReorderWidenMulAccumulate(dw, bfi, bf0, f0, sum1));
HWY_ASSERT_VEC_EQ(dw, f0, sum1);
// delta[p] := 1.0, all others zero. For each p: Dot(delta, all-ones) == 1.
for (size_t p = 0; p < 2 * NW; ++p) {
delta[p] = 1.0f;
const auto delta0 = Load(dw, delta.get() + 0);
const auto delta1 = Load(dw, delta.get() + NW);
delta[p] = 0.0f;
const auto bf_delta = ReorderDemote2To(dn, delta0, delta1);
{
sum1 = f0;
const auto sum0 =
ReorderWidenMulAccumulate(dw, bf_delta, bf1, f0, sum1);
HWY_ASSERT_EQ(1.0f, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
}
// Swapped arg order
{
sum1 = f0;
const auto sum0 =
ReorderWidenMulAccumulate(dw, bf1, bf_delta, f0, sum1);
HWY_ASSERT_EQ(1.0f, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
}
// Start with nonzero sum0 or sum1
{
sum1 = delta1;
const auto sum0 =
ReorderWidenMulAccumulate(dw, bf_delta, bf1, delta0, sum1);
HWY_ASSERT_EQ(2.0f, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
}
// Start with nonzero sum0 or sum1, and swap arg order
{
sum1 = delta1;
const auto sum0 =
ReorderWidenMulAccumulate(dw, bf1, bf_delta, delta0, sum1);
HWY_ASSERT_EQ(2.0f, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
const bool lt = a_lanes[i + 1] < b_lanes[i + 1];
min_lanes[i + 0] = lt ? a_lanes[i + 0] : b_lanes[i + 0];
min_lanes[i + 1] = lt ? a_lanes[i + 1] : b_lanes[i + 1];
max_lanes[i + 0] = lt ? b_lanes[i + 0] : a_lanes[i + 0];
max_lanes[i + 1] = lt ? b_lanes[i + 1] : a_lanes[i + 1];
}
HWY_ASSERT_VEC_EQ(d, min_lanes.get(), Min128Upper(d, a, b));
HWY_ASSERT_VEC_EQ(d, max_lanes.get(), Max128Upper(d, a, b));
}
}
};
HWY_NOINLINE void TestAllReorderWidenMulAccumulate() {
ForShrinkableVectors<TestReorderWidenMulAccumulate>()(bfloat16_t());
}
struct TestDiv {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const auto v = Iota(d, T(-2));
const auto v1 = Set(d, T(1));
// Unchanged after division by 1.
HWY_ASSERT_VEC_EQ(d, v, Div(v, v1));
const size_t N = Lanes(d);
auto expected = AllocateAligned<T>(N);
for (size_t i = 0; i < N; ++i) {
expected[i] = (T(i) - 2) / T(2);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), Div(v, Set(d, T(2))));
}
};
HWY_NOINLINE void TestAllDiv() { ForFloatTypes(ForPartialVectors<TestDiv>()); }
struct TestApproximateReciprocal {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const auto v = Iota(d, T(-2));
const auto nonzero = IfThenElse(Eq(v, Zero(d)), Set(d, T(1)), v);
const size_t N = Lanes(d);
auto input = AllocateAligned<T>(N);
Store(nonzero, d, input.get());
auto actual = AllocateAligned<T>(N);
Store(ApproximateReciprocal(nonzero), d, actual.get());
double max_l1 = 0.0;
double worst_expected = 0.0;
double worst_actual = 0.0;
for (size_t i = 0; i < N; ++i) {
const double expected = 1.0 / input[i];
const double l1 = std::abs(expected - actual[i]);
if (l1 > max_l1) {
max_l1 = l1;
worst_expected = expected;
worst_actual = actual[i];
}
}
const double abs_worst_expected = std::abs(worst_expected);
if (abs_worst_expected > 1E-5) {
const double max_rel = max_l1 / abs_worst_expected;
fprintf(stderr, "max l1 %f rel %f (%f vs %f)\n", max_l1, max_rel,
worst_expected, worst_actual);
HWY_ASSERT(max_rel < 0.004);
}
}
};
HWY_NOINLINE void TestAllApproximateReciprocal() {
ForPartialVectors<TestApproximateReciprocal>()(float());
}
struct TestSquareRoot {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const auto vi = Iota(d, 0);
HWY_ASSERT_VEC_EQ(d, vi, Sqrt(Mul(vi, vi)));
}
};
HWY_NOINLINE void TestAllSquareRoot() {
ForFloatTypes(ForPartialVectors<TestSquareRoot>());
}
struct TestReciprocalSquareRoot {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const auto v = Set(d, 123.0f);
const size_t N = Lanes(d);
auto lanes = AllocateAligned<T>(N);
Store(ApproximateReciprocalSqrt(v), d, lanes.get());
for (size_t i = 0; i < N; ++i) {
float err = lanes[i] - 0.090166f;
if (err < 0.0f) err = -err;
if (err >= 4E-4f) {
HWY_ABORT("Lane %" PRIu64 "(%" PRIu64 "): actual %f err %f\n",
static_cast<uint64_t>(i), static_cast<uint64_t>(N), lanes[i],
err);
}
}
}
};
HWY_NOINLINE void TestAllReciprocalSquareRoot() {
ForPartialVectors<TestReciprocalSquareRoot>()(float());
}
template <typename T, class D>
AlignedFreeUniquePtr<T[]> RoundTestCases(T /*unused*/, D d, size_t& padded) {
const T eps = std::numeric_limits<T>::epsilon();
const T test_cases[] = {
// +/- 1
T(1),
T(-1),
// +/- 0
T(0),
T(-0),
// near 0
T(0.4),
T(-0.4),
// +/- integer
T(4),
T(-32),
// positive near limit
MantissaEnd<T>() - T(1.5),
MantissaEnd<T>() + T(1.5),
// negative near limit
-MantissaEnd<T>() - T(1.5),
-MantissaEnd<T>() + T(1.5),
// positive tiebreak
T(1.5),
T(2.5),
// negative tiebreak
T(-1.5),
T(-2.5),
// positive +/- delta
T(2.0001),
T(3.9999),
// negative +/- delta
T(-999.9999),
T(-998.0001),
// positive +/- epsilon
T(1) + eps,
T(1) - eps,
// negative +/- epsilon
T(-1) + eps,
T(-1) - eps,
// +/- huge (but still fits in float)
T(1E34),
T(-1E35),
// +/- infinity
std::numeric_limits<T>::infinity(),
-std::numeric_limits<T>::infinity(),
// qNaN
GetLane(NaN(d))
};
const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
const size_t N = Lanes(d);
padded = RoundUpTo(kNumTestCases, N); // allow loading whole vectors
auto in = AllocateAligned<T>(padded);
auto expected = AllocateAligned<T>(padded);
std::copy(test_cases, test_cases + kNumTestCases, in.get());
std::fill(in.get() + kNumTestCases, in.get() + padded, T(0));
return in;
}
struct TestRound {
template <typename T, class D>
HWY_NOINLINE void operator()(T t, D d) {
size_t padded;
auto in = RoundTestCases(t, d, padded);
auto expected = AllocateAligned<T>(padded);
for (size_t i = 0; i < padded; ++i) {
// Avoid [std::]round, which does not round to nearest *even*.
// NOTE: std:: version from C++11 cmath is not defined in RVV GCC, see
// https://lists.freebsd.org/pipermail/freebsd-current/2014-January/048130.html
expected[i] = static_cast<T>(nearbyint(in[i]));
}
for (size_t i = 0; i < padded; i += Lanes(d)) {
HWY_ASSERT_VEC_EQ(d, &expected[i], Round(Load(d, &in[i])));
}
}
};
HWY_NOINLINE void TestAllRound() {
ForFloatTypes(ForPartialVectors<TestRound>());
}
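To make the tie-breaking distinction above concrete, a small standalone check (independent of the test): under the default FE_TONEAREST mode, nearbyint() rounds halfway cases to the nearest even value, whereas std::round() rounds them away from zero.

#include <cfenv>
#include <cmath>
#include <cstdio>

int main() {
  std::fesetround(FE_TONEAREST);  // the default rounding mode, set explicitly for clarity
  std::printf("nearbyint(2.5)=%g  round(2.5)=%g\n", std::nearbyint(2.5), std::round(2.5));
  std::printf("nearbyint(3.5)=%g  round(3.5)=%g\n", std::nearbyint(3.5), std::round(3.5));
  // nearbyint: 2 and 4 (ties to even); std::round: 3 and 4 (ties away from zero)
  return 0;
}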
struct TestNearestInt {
template <typename TF, class DF>
HWY_NOINLINE void operator()(TF tf, const DF df) {
using TI = MakeSigned<TF>;
const RebindToSigned<DF> di;
size_t padded;
auto in = RoundTestCases(tf, df, padded);
auto expected = AllocateAligned<TI>(padded);
constexpr double max = static_cast<double>(LimitsMax<TI>());
for (size_t i = 0; i < padded; ++i) {
if (std::isnan(in[i])) {
// We replace NaN with 0 below (no_nan)
expected[i] = 0;
} else if (std::isinf(in[i]) || double(std::abs(in[i])) >= max) {
// Avoid undefined result for lrintf
expected[i] = std::signbit(in[i]) ? LimitsMin<TI>() : LimitsMax<TI>();
} else {
expected[i] = static_cast<TI>(lrintf(in[i]));
}
}
for (size_t i = 0; i < padded; i += Lanes(df)) {
const auto v = Load(df, &in[i]);
const auto no_nan = IfThenElse(Eq(v, v), v, Zero(df));
HWY_ASSERT_VEC_EQ(di, &expected[i], NearestInt(no_nan));
}
}
};
HWY_NOINLINE void TestAllNearestInt() {
ForPartialVectors<TestNearestInt>()(float());
}
struct TestTrunc {
template <typename T, class D>
HWY_NOINLINE void operator()(T t, D d) {
size_t padded;
auto in = RoundTestCases(t, d, padded);
auto expected = AllocateAligned<T>(padded);
for (size_t i = 0; i < padded; ++i) {
// NOTE: std:: version from C++11 cmath is not defined in RVV GCC, see
// https://lists.freebsd.org/pipermail/freebsd-current/2014-January/048130.html
expected[i] = static_cast<T>(trunc(in[i]));
}
for (size_t i = 0; i < padded; i += Lanes(d)) {
HWY_ASSERT_VEC_EQ(d, &expected[i], Trunc(Load(d, &in[i])));
}
}
};
HWY_NOINLINE void TestAllTrunc() {
ForFloatTypes(ForPartialVectors<TestTrunc>());
}
struct TestCeil {
template <typename T, class D>
HWY_NOINLINE void operator()(T t, D d) {
size_t padded;
auto in = RoundTestCases(t, d, padded);
auto expected = AllocateAligned<T>(padded);
for (size_t i = 0; i < padded; ++i) {
expected[i] = std::ceil(in[i]);
}
for (size_t i = 0; i < padded; i += Lanes(d)) {
HWY_ASSERT_VEC_EQ(d, &expected[i], Ceil(Load(d, &in[i])));
}
}
};
HWY_NOINLINE void TestAllCeil() {
ForFloatTypes(ForPartialVectors<TestCeil>());
}
struct TestFloor {
template <typename T, class D>
HWY_NOINLINE void operator()(T t, D d) {
size_t padded;
auto in = RoundTestCases(t, d, padded);
auto expected = AllocateAligned<T>(padded);
for (size_t i = 0; i < padded; ++i) {
expected[i] = std::floor(in[i]);
}
for (size_t i = 0; i < padded; i += Lanes(d)) {
HWY_ASSERT_VEC_EQ(d, &expected[i], Floor(Load(d, &in[i])));
}
}
};
HWY_NOINLINE void TestAllFloor() {
ForFloatTypes(ForPartialVectors<TestFloor>());
}
struct TestSumOfLanes {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const size_t N = Lanes(d);
auto in_lanes = AllocateAligned<T>(N);
// Lane i = bit i, higher lanes 0
double sum = 0.0;
// Avoid setting sign bit and cap at double precision
constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51);
for (size_t i = 0; i < N; ++i) {
in_lanes[i] = i < kBits ? static_cast<T>(1ull << i) : 0;
sum += static_cast<double>(in_lanes[i]);
}
HWY_ASSERT_VEC_EQ(d, Set(d, T(sum)),
SumOfLanes(d, Load(d, in_lanes.get())));
// Lane i = i (iota) to include upper lanes
sum = 0.0;
for (size_t i = 0; i < N; ++i) {
sum += static_cast<double>(i);
}
HWY_ASSERT_VEC_EQ(d, Set(d, T(sum)), SumOfLanes(d, Iota(d, 0)));
}
};
HWY_NOINLINE void TestAllSumOfLanes() {
ForUIF3264(ForPartialVectors<TestSumOfLanes>());
}
struct TestMinOfLanes {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const size_t N = Lanes(d);
auto in_lanes = AllocateAligned<T>(N);
// Lane i = bit i, higher lanes = 2 (not the minimum)
T min = HighestValue<T>();
// Avoid setting sign bit and cap at double precision
constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51);
for (size_t i = 0; i < N; ++i) {
in_lanes[i] = i < kBits ? static_cast<T>(1ull << i) : 2;
min = HWY_MIN(min, in_lanes[i]);
}
HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get())));
// Lane i = N - i to include upper lanes
min = HighestValue<T>();
for (size_t i = 0; i < N; ++i) {
in_lanes[i] = static_cast<T>(N - i); // no 8-bit T so no wraparound
min = HWY_MIN(min, in_lanes[i]);
}
HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get())));
}
};
struct TestMaxOfLanes {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const size_t N = Lanes(d);
auto in_lanes = AllocateAligned<T>(N);
T max = LowestValue<T>();
// Avoid setting sign bit and cap at double precision
constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51);
for (size_t i = 0; i < N; ++i) {
in_lanes[i] = i < kBits ? static_cast<T>(1ull << i) : 0;
max = HWY_MAX(max, in_lanes[i]);
}
HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get())));
// Lane i = i to include upper lanes
max = LowestValue<T>();
for (size_t i = 0; i < N; ++i) {
in_lanes[i] = static_cast<T>(i); // no 8-bit T so no wraparound
max = HWY_MAX(max, in_lanes[i]);
}
HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get())));
}
};
HWY_NOINLINE void TestAllMinMaxOfLanes() {
const ForPartialVectors<TestMinOfLanes> test_min;
const ForPartialVectors<TestMaxOfLanes> test_max;
ForUIF3264(test_min);
ForUIF3264(test_max);
test_min(uint16_t());
test_max(uint16_t());
test_min(int16_t());
test_max(int16_t());
}
struct TestAbsDiff {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const size_t N = Lanes(d);
auto in_lanes_a = AllocateAligned<T>(N);
auto in_lanes_b = AllocateAligned<T>(N);
auto out_lanes = AllocateAligned<T>(N);
for (size_t i = 0; i < N; ++i) {
in_lanes_a[i] = static_cast<T>((i ^ 1u) << i);
in_lanes_b[i] = static_cast<T>(i << i);
out_lanes[i] = std::abs(in_lanes_a[i] - in_lanes_b[i]);
}
const auto a = Load(d, in_lanes_a.get());
const auto b = Load(d, in_lanes_b.get());
const auto expected = Load(d, out_lanes.get());
HWY_ASSERT_VEC_EQ(d, expected, AbsDiff(a, b));
HWY_ASSERT_VEC_EQ(d, expected, AbsDiff(b, a));
}
};
HWY_NOINLINE void TestAllAbsDiff() {
ForPartialVectors<TestAbsDiff>()(float());
}
struct TestSumsOf8 {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
RandomState rng;
const size_t N = Lanes(d);
if (N < 8) return;
const Repartition<uint64_t, D> du64;
auto in_lanes = AllocateAligned<T>(N);
auto sum_lanes = AllocateAligned<uint64_t>(N / 8);
for (size_t rep = 0; rep < 100; ++rep) {
for (size_t i = 0; i < N; ++i) {
in_lanes[i] = Random64(&rng) & 0xFF;
}
for (size_t idx_sum = 0; idx_sum < N / 8; ++idx_sum) {
uint64_t sum = 0;
for (size_t i = 0; i < 8; ++i) {
sum += in_lanes[idx_sum * 8 + i];
}
sum_lanes[idx_sum] = sum;
}
const Vec<D> in = Load(d, in_lanes.get());
HWY_ASSERT_VEC_EQ(du64, sum_lanes.get(), SumsOf8(in));
}
}
};
HWY_NOINLINE void TestAllSumsOf8() {
ForGEVectors<64, TestSumsOf8>()(uint8_t());
}
struct TestNeg {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const auto v0 = Zero(d);
const auto vn = Set(d, T(-3));
const auto vp = Set(d, T(3));
HWY_ASSERT_VEC_EQ(d, v0, Neg(v0));
HWY_ASSERT_VEC_EQ(d, vp, Neg(vn));
HWY_ASSERT_VEC_EQ(d, vn, Neg(vp));
}
};
HWY_NOINLINE void TestAllNeg() {
ForSignedTypes(ForPartialVectors<TestNeg>());
ForFloatTypes(ForPartialVectors<TestNeg>());
HWY_NOINLINE void TestAllMinMax128Upper() {
ForGEVectors<128, TestMinMax128Upper>()(uint64_t());
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
@ -1127,35 +435,12 @@ namespace hwy {
HWY_BEFORE_TEST(HwyArithmeticTest);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllPlusMinus);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSaturatingArithmetic);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax128);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAverage);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAbs);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMul);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMulHigh);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMulEven);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMulAdd);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllReorderWidenMulAccumulate);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllDiv);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllApproximateReciprocal);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSquareRoot);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllReciprocalSquareRoot);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSumOfLanes);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMaxOfLanes);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllRound);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllNearestInt);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllTrunc);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllCeil);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllFloor);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAbsDiff);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSumsOf8);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllNeg);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax128);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax128Upper);
} // namespace hwy
// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
#endif

Some files were not shown because too many files have changed in this diff.