Mirror of https://github.com/mozilla/gecko-dev.git

Backed out changeset f87312b156f6 (bug 1757483) for causing build bustages on highway_export.h CLOSED TREE

This commit is contained in:
Parent: 05225c58b9
Commit: b4d61b9a3f
@@ -20,11 +20,11 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: commit f13e3b956eb226561ac79427893ec0afd66f91a8 (2022-02-15T18:19:21Z).
release: commit e69083a12a05caf037cabecdf1b248b7579705a5 (2021-11-11T08:20:00Z).

# Revision to pull in
# Must be a long or short commit SHA (long preferred)
revision: f13e3b956eb226561ac79427893ec0afd66f91a8
revision: e69083a12a05caf037cabecdf1b248b7579705a5

# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/
@@ -10,9 +10,9 @@ origin:
url: https://github.com/libjxl/libjxl

release: commit 89875cba4d18485ec9692c80b747b59b73ce712e (2022-02-28T16:03:42Z).
release: commit 4322679b1c418addc2284c5ea84fc2c3935b4a75 (2022-02-07T20:56:39Z).

revision: 89875cba4d18485ec9692c80b747b59b73ce712e
revision: 4322679b1c418addc2284c5ea84fc2c3935b4a75

license: Apache-2.0
@@ -1,57 +0,0 @@
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: Build / test
on: [push, pull_request]
jobs:
  cmake:
    name: Build and test ${{ matrix.name }}
    runs-on: ubuntu-18.04
    strategy:
      matrix:
        include:
          - name: Clang-5.0
            extra_deps: clang-5.0
            c_compiler: clang-5.0
            cxx_compiler: clang++-5.0

          - name: Clang-6.0
            extra_deps: clang-6.0
            c_compiler: clang-6.0
            cxx_compiler: clang++-6.0

    steps:
      - uses: actions/checkout@v2

      - name: Install deps
        run: sudo apt-get install ${{ matrix.extra_deps }}

      - name: Build and test
        run: |
          export CMAKE_BUILD_PARALLEL_LEVEL=2
          export CTEST_PARALLEL_LEVEL=2
          CXXFLAGS=-Werror CC=${{ matrix.c_compiler }} CXX=${{ matrix.cxx_compiler }} cmake -B out .
          cmake --build out
          ctest --test-dir out

  bazel:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - uses: bazelbuild/setup-bazelisk@v1
      - uses: actions/cache@v2
        with:
          path: ~/.cache/bazel
          key: bazel-${{ runner.os }}
      - run: bazel build //...
@@ -1,6 +1,6 @@
load("@bazel_skylib//lib:selects.bzl", "selects")

load("@rules_cc//cc:defs.bzl", "cc_test")

package(default_visibility = ["//visibility:public"])

licenses(["notice"])

@@ -18,11 +18,6 @@ config_setting(
    flag_values = {"@bazel_tools//tools/cpp:compiler": "msvc"},
)

config_setting(
    name = "compiler_emscripten",
    values = {"cpu": "wasm32"},
)

# See https://github.com/bazelbuild/bazel/issues/12707
config_setting(
    name = "compiler_gcc_bug",

@@ -46,6 +41,13 @@ selects.config_setting_group(
    ],
)

config_setting(
    name = "emulate_sve",
    values = {
        "copt": "-DHWY_EMULATE_SVE",
    },
)

# Additional warnings for Clang OR GCC (skip for MSVC)
CLANG_GCC_COPTS = [
    "-Wunused-parameter",

@@ -80,7 +82,7 @@ COPTS = select({
    "//conditions:default": CLANG_GCC_COPTS + CLANG_ONLY_COPTS,
}) + select({
    "@platforms//cpu:riscv64": [
        "-march=rv64gcv1p0",
        "-march=rv64gcv0p10",
        "-menable-experimental-extensions",
    ],
    "//conditions:default": [

@@ -110,32 +112,28 @@ cc_library(
        "hwy/base.h",
        "hwy/cache_control.h",
        "hwy/detect_compiler_arch.h",  # private
        "hwy/highway_export.h",
        "hwy/detect_targets.h",  # private
        "hwy/targets.h",
    ],
    compatible_with = [],
    copts = COPTS,
    textual_hdrs = [
        # These are textual because config macros influence them:
        "hwy/detect_targets.h",  # private
        "hwy/targets.h",
        # End of list
        "hwy/highway.h",  # public
        "hwy/foreach_target.h",  # public
        "hwy/ops/arm_neon-inl.h",
        "hwy/ops/arm_sve-inl.h",
        "hwy/ops/generic_ops-inl.h",
        "hwy/ops/rvv-inl.h",
        "hwy/ops/scalar-inl.h",
        "hwy/ops/set_macros-inl.h",
        "hwy/ops/shared-inl.h",
        "hwy/ops/wasm_128-inl.h",
        "hwy/ops/x86_128-inl.h",
        "hwy/ops/x86_256-inl.h",
        "hwy/ops/x86_512-inl.h",
        # Select avoids recompiling native arch if only non-native changed
    ] + select({
        ":compiler_emscripten": ["hwy/ops/wasm_128-inl.h"],
        "//conditions:default": [],
    }) + select({
        "@platforms//cpu:riscv64": ["hwy/ops/rvv-inl.h"],
    ],
    deps = select({
        ":emulate_sve": ["//third_party/farm_sve"],
        "//conditions:default": [],
    }),
)

@@ -146,9 +144,7 @@ cc_library(
    textual_hdrs = [
        "hwy/contrib/dot/dot-inl.h",
    ],
    deps = [
        ":hwy",
    ],
    deps = [":hwy"],
)

cc_library(

@@ -160,9 +156,7 @@ cc_library(
        "hwy/contrib/image/image.h",
    ],
    compatible_with = [],
    deps = [
        ":hwy",
    ],
    deps = [":hwy"],
)

cc_library(

@@ -171,9 +165,16 @@ cc_library(
    textual_hdrs = [
        "hwy/contrib/math/math-inl.h",
    ],
    deps = [
        ":hwy",
    deps = [":hwy"],
)

cc_library(
    name = "sort",
    compatible_with = [],
    textual_hdrs = [
        "hwy/contrib/sort/sort-inl.h",
    ],
    deps = [":hwy"],
)

# Everything required for tests that use Highway.

@@ -187,9 +188,7 @@ cc_library(
    ],
    # Must not depend on a gtest variant, which can conflict with the
    # GUNIT_INTERNAL_BUILD_MODE defined by the test.
    deps = [
        ":hwy",
    ],
    deps = [":hwy"],
)

cc_library(

@@ -213,9 +212,7 @@ cc_library(
    srcs = ["hwy/examples/skeleton.cc"],
    hdrs = ["hwy/examples/skeleton.h"],
    textual_hdrs = ["hwy/examples/skeleton-inl.h"],
    deps = [
        ":hwy",
    ],
    deps = [":hwy"],
)

cc_binary(

@@ -229,7 +226,7 @@ HWY_TESTS = [
    ("hwy/contrib/dot/", "dot_test"),
    ("hwy/contrib/image/", "image_test"),
    ("hwy/contrib/math/", "math_test"),
    # contrib/sort has its own BUILD, we add it to GUITAR_TESTS.
    ("hwy/contrib/sort/", "sort_test"),
    ("hwy/examples/", "skeleton_test"),
    ("hwy/", "nanobenchmark_test"),
    ("hwy/", "aligned_allocator_test"),

@@ -242,27 +239,13 @@ HWY_TESTS = [
    ("hwy/tests/", "compare_test"),
    ("hwy/tests/", "convert_test"),
    ("hwy/tests/", "crypto_test"),
    ("hwy/tests/", "demote_test"),
    ("hwy/tests/", "logical_test"),
    ("hwy/tests/", "mask_test"),
    ("hwy/tests/", "memory_test"),
    ("hwy/tests/", "shift_test"),
    ("hwy/tests/", "swizzle_test"),
    ("hwy/tests/", "test_util_test"),
]

HWY_TEST_DEPS = [
    ":dot",
    ":hwy",
    ":hwy_test_util",
    ":image",
    ":math",
    ":nanobenchmark",
    ":skeleton",
    "//hwy/contrib/sort:vqsort",
    "@com_google_googletest//:gtest_main",
]

[
    [
        cc_test(

@@ -282,18 +265,6 @@ HWY_TEST_DEPS = [
            "@platforms//cpu:riscv64": ["fully_static_link"],
            "//conditions:default": [],
        }),
        linkopts = select({
            ":compiler_emscripten": [
                "-s ASSERTIONS=2",
                "-s ENVIRONMENT=node,shell,web",
                "-s ERROR_ON_UNDEFINED_SYMBOLS=1",
                "-s DEMANGLE_SUPPORT=1",
                "-s EXIT_RUNTIME=1",
                "-s ALLOW_MEMORY_GROWTH=1",
                "--pre-js $(location :preamble.js.lds)",
            ],
            "//conditions:default": [],
        }),
        linkstatic = select({
            "@platforms//cpu:riscv64": True,
            "//conditions:default": False,

@@ -301,10 +272,17 @@ HWY_TEST_DEPS = [
        local_defines = ["HWY_IS_TEST"],
        # for test_suite.
        tags = ["hwy_ops_test"],
        deps = HWY_TEST_DEPS + select({
            ":compiler_emscripten": [":preamble.js.lds"],
            "//conditions:default": [],
        }),
        deps = [
            ":dot",
            ":hwy",
            ":hwy_test_util",
            ":image",
            ":math",
            ":nanobenchmark",
            ":skeleton",
            ":sort",
            "@com_google_googletest//:gtest_main",
        ],
    ),
]
for subdir, test in HWY_TESTS

@@ -315,5 +293,3 @@ test_suite(
    name = "hwy_ops_tests",
    tags = ["hwy_ops_test"],
)

# Placeholder for integration test, do not remove
@@ -19,13 +19,11 @@ if(POLICY CMP0083)
  cmake_policy(SET CMP0083 NEW)
endif()

project(hwy VERSION 0.16.0) # Keep in sync with highway.h version

# Directly define the ABI version from the cmake project() version values:
set(LIBRARY_VERSION "${hwy_VERSION}")
set(LIBRARY_SOVERSION ${hwy_VERSION_MAJOR})
project(hwy VERSION 0.15.0) # Keep in sync with highway.h version

set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_CXX_STANDARD_REQUIRED YES)

# Enabled PIE binaries by default if supported.
include(CheckPIESupported OPTIONAL RESULT_VARIABLE CHECK_PIE_SUPPORTED)

@@ -42,14 +40,13 @@ if (NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE RelWithDebInfo)
endif()

set(HWY_CMAKE_ARM7 OFF CACHE BOOL "Set copts for ARMv7 with NEON (requires vfpv4)?")
set(HWY_CMAKE_ARM7 OFF CACHE BOOL "Set copts for ARMv7 with NEON?")

# Unconditionally adding -Werror risks breaking the build when new warnings
# arise due to compiler/platform changes. Enable this in CI/tests.
set(HWY_WARNINGS_ARE_ERRORS OFF CACHE BOOL "Add -Werror flag?")

set(HWY_ENABLE_EXAMPLES ON CACHE BOOL "Build examples")
set(HWY_ENABLE_INSTALL ON CACHE BOOL "Install library")
set(HWY_EXAMPLES_TESTS_INSTALL ON CACHE BOOL "Build examples, tests, install?")

include(CheckCXXSourceCompiles)
check_cxx_source_compiles(

@@ -67,32 +64,7 @@ set(HWY_CONTRIB_SOURCES
    hwy/contrib/image/image.cc
    hwy/contrib/image/image.h
    hwy/contrib/math/math-inl.h
    hwy/contrib/sort/disabled_targets.h
    hwy/contrib/sort/shared-inl.h
    hwy/contrib/sort/sorting_networks-inl.h
    hwy/contrib/sort/traits-inl.h
    hwy/contrib/sort/traits128-inl.h
    hwy/contrib/sort/vqsort-inl.h
    hwy/contrib/sort/vqsort.cc
    hwy/contrib/sort/vqsort.h
    hwy/contrib/sort/vqsort_128a.cc
    hwy/contrib/sort/vqsort_128d.cc
    hwy/contrib/sort/vqsort_f32a.cc
    hwy/contrib/sort/vqsort_f32d.cc
    hwy/contrib/sort/vqsort_f64a.cc
    hwy/contrib/sort/vqsort_f64d.cc
    hwy/contrib/sort/vqsort_i16a.cc
    hwy/contrib/sort/vqsort_i16d.cc
    hwy/contrib/sort/vqsort_i32a.cc
    hwy/contrib/sort/vqsort_i32d.cc
    hwy/contrib/sort/vqsort_i64a.cc
    hwy/contrib/sort/vqsort_i64d.cc
    hwy/contrib/sort/vqsort_u16a.cc
    hwy/contrib/sort/vqsort_u16d.cc
    hwy/contrib/sort/vqsort_u32a.cc
    hwy/contrib/sort/vqsort_u32d.cc
    hwy/contrib/sort/vqsort_u64a.cc
    hwy/contrib/sort/vqsort_u64d.cc
    hwy/contrib/sort/sort-inl.h
)

set(HWY_SOURCES

@@ -104,7 +76,6 @@ set(HWY_SOURCES
    hwy/detect_targets.h # private
    hwy/foreach_target.h
    hwy/highway.h
    hwy/highway_export.h
    hwy/nanobenchmark.cc
    hwy/nanobenchmark.h
    hwy/ops/arm_neon-inl.h

@@ -221,59 +192,20 @@ else()

endif() # !MSVC

# By default prefer STATIC build (legacy behavior)
option(BUILD_SHARED_LIBS "Build shared libraries" OFF)
option(HWY_FORCE_STATIC_LIBS "Ignore BUILD_SHARED_LIBS" OFF)
# only expose shared/static options to advanced users:
mark_as_advanced(BUILD_SHARED_LIBS)
mark_as_advanced(HWY_FORCE_STATIC_LIBS)
# Define visibility settings globally:
set(CMAKE_CXX_VISIBILITY_PRESET hidden)
set(CMAKE_VISIBILITY_INLINES_HIDDEN 1)

# Copy-cat "add_library" logic + add override.
set(HWY_LIBRARY_TYPE "SHARED")
if (NOT BUILD_SHARED_LIBS OR HWY_FORCE_STATIC_LIBS)
  set(HWY_LIBRARY_TYPE "STATIC")
endif()

# This preprocessor define will drive the build, also used in the *.pc files:
if("${HWY_LIBRARY_TYPE}" STREQUAL "SHARED")
  set(DLLEXPORT_TO_DEFINE "HWY_SHARED_DEFINE")
else()
  set(DLLEXPORT_TO_DEFINE "HWY_STATIC_DEFINE")
endif()

add_library(hwy ${HWY_LIBRARY_TYPE} ${HWY_SOURCES})
target_compile_definitions(hwy PUBLIC "${DLLEXPORT_TO_DEFINE}")
add_library(hwy STATIC ${HWY_SOURCES})
target_compile_options(hwy PRIVATE ${HWY_FLAGS})
set_property(TARGET hwy PROPERTY POSITION_INDEPENDENT_CODE ON)
set_target_properties(hwy PROPERTIES VERSION ${LIBRARY_VERSION} SOVERSION ${LIBRARY_SOVERSION})
target_include_directories(hwy PUBLIC ${CMAKE_CURRENT_LIST_DIR})
target_compile_features(hwy PUBLIC cxx_std_11)
set_target_properties(hwy PROPERTIES
  LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version)
# not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations)
if(UNIX AND NOT APPLE)
  set_property(TARGET hwy APPEND_STRING PROPERTY
    LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version")
endif()

add_library(hwy_contrib ${HWY_LIBRARY_TYPE} ${HWY_CONTRIB_SOURCES})
target_link_libraries(hwy_contrib hwy)
add_library(hwy_contrib STATIC ${HWY_CONTRIB_SOURCES})
target_compile_options(hwy_contrib PRIVATE ${HWY_FLAGS})
set_property(TARGET hwy_contrib PROPERTY POSITION_INDEPENDENT_CODE ON)
set_target_properties(hwy_contrib PROPERTIES VERSION ${LIBRARY_VERSION} SOVERSION ${LIBRARY_SOVERSION})
target_include_directories(hwy_contrib PUBLIC ${CMAKE_CURRENT_LIST_DIR})
target_compile_features(hwy_contrib PUBLIC cxx_std_11)

add_library(hwy_test ${HWY_LIBRARY_TYPE} ${HWY_TEST_SOURCES})
target_link_libraries(hwy_test hwy)
add_library(hwy_test STATIC ${HWY_TEST_SOURCES})
target_compile_options(hwy_test PRIVATE ${HWY_FLAGS})
set_property(TARGET hwy_test PROPERTY POSITION_INDEPENDENT_CODE ON)
set_target_properties(hwy_test PROPERTIES VERSION ${LIBRARY_VERSION} SOVERSION ${LIBRARY_SOVERSION})
target_include_directories(hwy_test PUBLIC ${CMAKE_CURRENT_LIST_DIR})
target_compile_features(hwy_test PUBLIC cxx_std_11)

# -------------------------------------------------------- hwy_list_targets
# Generate a tool to print the compiled-in targets as defined by the current

@@ -287,22 +219,17 @@ target_include_directories(hwy_list_targets PRIVATE
# Naked target also not always could be run (due to the lack of '.\' prefix)
# Thus effective command to run should contain the full path
# and emulator prefix (if any).
if (NOT CMAKE_CROSSCOMPILING OR CMAKE_CROSSCOMPILING_EMULATOR)
  add_custom_command(TARGET hwy_list_targets POST_BUILD
    COMMAND ${CMAKE_CROSSCOMPILING_EMULATOR} $<TARGET_FILE:hwy_list_targets> || (exit 0))
endif()

# --------------------------------------------------------
# Allow skipping the following sections for projects that do not need them:
# tests, examples, benchmarks and installation.
if (HWY_EXAMPLES_TESTS_INSTALL)

# -------------------------------------------------------- install library
if (HWY_ENABLE_INSTALL)

install(TARGETS hwy
  LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
  ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
  RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}")
  DESTINATION "${CMAKE_INSTALL_LIBDIR}")
# Install all the headers keeping the relative path to the current directory
# when installing them.
foreach (source ${HWY_SOURCES})

@@ -314,9 +241,7 @@ foreach (source ${HWY_SOURCES})
endforeach()

install(TARGETS hwy_contrib
  LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
  ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
  RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}")
  DESTINATION "${CMAKE_INSTALL_LIBDIR}")
# Install all the headers keeping the relative path to the current directory
# when installing them.
foreach (source ${HWY_CONTRIB_SOURCES})

@@ -328,9 +253,7 @@ foreach (source ${HWY_CONTRIB_SOURCES})
endforeach()

install(TARGETS hwy_test
  LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
  ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
  RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}")
  DESTINATION "${CMAKE_INSTALL_LIBDIR}")
# Install all the headers keeping the relative path to the current directory
# when installing them.
foreach (source ${HWY_TEST_SOURCES})

@@ -349,9 +272,7 @@ foreach (pc libhwy.pc libhwy-contrib.pc libhwy-test.pc)
    DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
endforeach()

endif() # HWY_ENABLE_INSTALL
# -------------------------------------------------------- Examples
if (HWY_ENABLE_EXAMPLES)

# Avoids mismatch between GTest's static CRT and our dynamic.
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)

@@ -359,6 +280,7 @@ set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
# Programming exercise with integrated benchmark
add_executable(hwy_benchmark hwy/examples/benchmark.cc)
target_sources(hwy_benchmark PRIVATE
  hwy/nanobenchmark.cc
  hwy/nanobenchmark.h)
# Try adding either -DHWY_COMPILE_ONLY_SCALAR or -DHWY_COMPILE_ONLY_STATIC to
# observe the difference in targets printed.

@@ -367,7 +289,6 @@ target_link_libraries(hwy_benchmark hwy)
set_target_properties(hwy_benchmark
  PROPERTIES RUNTIME_OUTPUT_DIRECTORY "examples/")

endif() # HWY_ENABLE_EXAMPLES
# -------------------------------------------------------- Tests

include(CTest)

@@ -431,11 +352,9 @@ set(HWY_TEST_FILES
  hwy/tests/compare_test.cc
  hwy/tests/convert_test.cc
  hwy/tests/crypto_test.cc
  hwy/tests/demote_test.cc
  hwy/tests/logical_test.cc
  hwy/tests/mask_test.cc
  hwy/tests/memory_test.cc
  hwy/tests/shift_test.cc
  hwy/tests/swizzle_test.cc
  hwy/tests/test_util_test.cc
)

@@ -458,7 +377,7 @@ foreach (TESTFILE IN LISTS HWY_TEST_FILES)
    target_link_libraries(${TESTNAME} hwy hwy_contrib hwy_test gtest gtest_main)
  endif()
  # Output test targets in the test directory.
  set_target_properties(${TESTNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "tests")
  set_target_properties(${TESTNAME} PROPERTIES PREFIX "tests/")

  if (HWY_EMSCRIPTEN)
    set_target_properties(${TESTNAME} PROPERTIES LINK_FLAGS "-s SINGLE_FILE=1")

@@ -475,3 +394,5 @@ endforeach ()
target_sources(skeleton_test PRIVATE hwy/examples/skeleton.cc)

endif() # BUILD_TESTING

endif() # HWY_EXAMPLES_TESTS_INSTALL
@@ -5,11 +5,11 @@ project(googletest-download NONE)
include(ExternalProject)
ExternalProject_Add(googletest
  GIT_REPOSITORY https://github.com/google/googletest.git
  GIT_TAG 43efa0a4efd40c78b9210d15373112081899a97c
  GIT_TAG master
  SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-src"
  BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-build"
  CONFIGURE_COMMAND ""
  BUILD_COMMAND ""
  INSTALL_COMMAND ""
  TEST_COMMAND ""
)
)
@@ -1,95 +1,30 @@
# Efficient and performance-portable vector software
# Efficient and performance-portable SIMD

[//]: # (placeholder, do not remove)
Highway is a C++ library for SIMD (Single Instruction, Multiple Data), i.e.
applying the same operation to multiple 'lanes' using a single CPU instruction.

Highway is a C++ library that provides portable SIMD/vector intrinsics.

## Why Highway?

## Why

We are passionate about high-performance software. We see major untapped
potential in CPUs (servers, mobile, desktops). Highway is for engineers who want
to reliably and economically push the boundaries of what is possible in
software.

## How

CPUs provide SIMD/vector instructions that apply the same operation to multiple
data items. This can reduce energy usage e.g. *fivefold* because fewer
instructions are executed. We also often see *5-10x* speedups.

Highway makes SIMD/vector programming practical and workable according to these
guiding principles:

**Does what you expect**: Highway is a C++ library with carefully-chosen
functions that map well to CPU instructions without extensive compiler
transformations. The resulting code is more predictable and robust to code
changes/compiler updates than autovectorization.

**Works on widely-used platforms**: Highway supports four architectures; the
same application code can target eight instruction sets, including those with
'scalable' vectors (size unknown at compile time). Highway only requires C++11
and supports four families of compilers. If you would like to use Highway on
other platforms, please raise an issue.

**Flexible to deploy**: Applications using Highway can run on heterogeneous
clouds or client devices, choosing the best available instruction set at
runtime. Alternatively, developers may choose to target a single instruction set
without any runtime overhead. In both cases, the application code is the same
except for swapping `HWY_STATIC_DISPATCH` with `HWY_DYNAMIC_DISPATCH` plus one
line of code.

**Suitable for a variety of domains**: Highway provides an extensive set of
operations, used for image processing (floating-point), compression, video
analysis, linear algebra, cryptography, sorting and random generation. We
recognise that new use-cases may require additional ops and are happy to add
them where it makes sense (e.g. no performance cliffs on some architectures). If
you would like to discuss, please file an issue.

**Rewards data-parallel design**: Highway provides tools such as Gather,
MaskedLoad, and FixedTag to enable speedups for legacy data structures. However,
the biggest gains are unlocked by designing algorithms and data structures for
scalable vectors. Helpful techniques include batching, structure-of-array
layouts, and aligned/padded allocations.

## Examples

Online demos using Compiler Explorer:

- [generating code for multiple targets](https://gcc.godbolt.org/z/n6rx6xK5h) (recommended)
- [single target using -m flags](https://gcc.godbolt.org/z/rGnjMevKG)

Projects using Highway: (to add yours, feel free to raise an issue or contact us
via the below email)

* [iresearch database index](https://github.com/iresearch-toolkit/iresearch/blob/e7638e7a4b99136ca41f82be6edccf01351a7223/core/utils/simd_utils.hpp)
* [JPEG XL image codec](https://github.com/libjxl/libjxl)
* [Grok JPEG 2000 image codec](https://github.com/GrokImageCompression/grok)
* [vectorized Quicksort](https://github.com/google/highway/tree/master/hwy/contrib/sort)
- more portable (same source code) than platform-specific intrinsics,
- works on a wider range of compilers than compiler-specific vector extensions,
- more dependable than autovectorization,
- easier to write/maintain than assembly language,
- supports **runtime dispatch**,
- supports **variable-length vector** architectures.

## Current status

### Targets

Supported targets: scalar, S-SSE3, SSE4, AVX2, AVX-512, AVX3_DL (~Icelake,
requires opt-in by defining `HWY_WANT_AVX3_DL`), NEON (ARMv7 and v8), SVE, SVE2,
requires opt-in by defining `HWY_WANT_AVX3_DL`), NEON (ARMv7 and v8), SVE,
WASM SIMD.

SVE was initially tested using farm_sve (see acknowledgments). A subset of RVV
is implemented and tested with LLVM and QEMU. Work is underway to add RVV ops
which were not yet supported by GCC.
SVE is tested using farm_sve (see acknowledgments). SVE2 is implemented but not
yet validated. A subset of RVV is implemented and tested with GCC and QEMU.
Work is underway to compile using LLVM, which has different intrinsics with AVL.

### Versioning

Highway releases aim to follow the semver.org system (MAJOR.MINOR.PATCH),
incrementing MINOR after backward-compatible additions and PATCH after
backward-compatible fixes. We recommend using releases (rather than the Git tip)
because they are tested more extensively, see below.

Version 0.11 is considered stable enough to use in other projects.
Version 1.0 will signal an increased focus on backwards compatibility and will
be reached after the RVV target is finished (planned for 2022H1).

### Testing
Version 0.11 is considered stable enough to use in other projects, and is
expected to remain backwards compatible unless serious issues are discovered
while finishing the RVV target. After that, Highway will reach version 1.0.

Continuous integration tests build with a recent version of Clang (running on
x86 and QEMU for ARM) and MSVC from VS2015 (running on x86).

@@ -98,15 +33,13 @@ Before releases, we also test on x86 with Clang and GCC, and ARMv7/8 via
GCC cross-compile and QEMU. See the
[testing process](g3doc/release_testing_process.md) for details.

### Related modules

The `contrib` directory contains SIMD-related utilities: an image class with
aligned rows, a math library (16 functions already implemented, mostly
trigonometry), and functions for computing dot products and sorting.
aligned rows, and a math library (16 functions already implemented, mostly
trigonometry).

## Installation

This project uses CMake to generate and build. In a Debian-based system you can
This project uses cmake to generate and build. In a Debian-based system you can
install it via:

```bash

@@ -122,8 +55,7 @@ installing gtest separately:
sudo apt install libgtest-dev
```

To build Highway as a shared or static library (depending on BUILD_SHARED_LIBS),
the standard CMake workflow can be used:
To build and test the library the standard cmake workflow can be used:

```bash
mkdir -p build && cd build

@@ -144,40 +76,31 @@ and their parameters, and the [instruction_matrix](g3doc/instruction_matrix.pdf)
indicates the number of instructions per operation.

We recommend using full SIMD vectors whenever possible for maximum performance
portability. To obtain them, pass a `ScalableTag<float>` (or equivalently
`HWY_FULL(float)`) tag to functions such as `Zero/Set/Load`. There are two
alternatives for use-cases requiring an upper bound on the lanes:
portability. To obtain them, pass a `HWY_FULL(float)` tag to functions such as
`Zero/Set/Load`. There is also the option of a vector of up to `N` (a power of
two <= 16/sizeof(T)) lanes of type `T`: `HWY_CAPPED(T, N)`. If `HWY_TARGET ==
HWY_SCALAR`, the vector always has one lane. For all other targets, up to
128-bit vectors are guaranteed to be available. A tag-usage sketch follows this
list.

*   For up to a power of two `N`, specify `CappedTag<T, N>` (or
    equivalently `HWY_CAPPED(T, N)`). This is useful for data structures such as
    a narrow matrix. A loop is still required because vectors may actually have
    fewer than `N` lanes.

*   For exactly a power of two `N` lanes, specify `FixedTag<T, N>`. The largest
    supported `N` depends on the target, but is guaranteed to be at least
    `16/sizeof(T)`.

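For context, a minimal sketch of how such a tag is used in application code. The
function, namespace and array names here are illustrative, not from the
repository; the ops (`Load`, `Store`, `MulAdd`, `Lanes`) are per the quick
reference, and `count` is assumed to be a multiple of the lane count (see the
padded-loop discussion further below):

```cpp
#include <cstddef>

#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace project {  // hypothetical application namespace
namespace HWY_NAMESPACE {

using namespace hwy::HWY_NAMESPACE;

// Computes x*x + x for each lane.
void ProcessArray(const float* HWY_RESTRICT in, float* HWY_RESTRICT out,
                  size_t count) {
  const HWY_FULL(float) d;  // or HWY_CAPPED(float, 4) for at most 4 lanes
  for (size_t i = 0; i < count; i += Lanes(d)) {
    const auto v = Load(d, in + i);
    Store(MulAdd(v, v, v), d, out + i);
  }
}

}  // namespace HWY_NAMESPACE
}  // namespace project
HWY_AFTER_NAMESPACE();
```

The sketch follows the namespace and attribute rules described next.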
Functions using Highway must either be inside `namespace HWY_NAMESPACE {`
(possibly nested in one or more other namespaces defined by the project), OR
each op must be prefixed with `hn::`, e.g. `namespace hn = hwy::HWY_NAMESPACE;
hn::LoadDup128()`. Additionally, each function using Highway must either be
prefixed with `HWY_ATTR`, OR reside between `HWY_BEFORE_NAMESPACE()` and
`HWY_AFTER_NAMESPACE()`.
Functions using Highway must be inside `namespace HWY_NAMESPACE {`
(possibly nested in one or more other namespaces defined by the project), and
additionally either prefixed with `HWY_ATTR`, or residing between
`HWY_BEFORE_NAMESPACE()` and `HWY_AFTER_NAMESPACE()`.

*   For static dispatch, `HWY_TARGET` will be the best available target among
    `HWY_BASELINE_TARGETS`, i.e. those allowed for use by the compiler (see
    [quick-reference](g3doc/quick_reference.md)). Functions inside
    `HWY_NAMESPACE` can be called using `HWY_STATIC_DISPATCH(func)(args)` within
    the same module they are defined in. You can call the function from other
    modules by wrapping it in a regular function and declaring the regular
    function in a header.
    [quick-reference](g3doc/quick_reference.md)). Functions inside `HWY_NAMESPACE`
    can be called using `HWY_STATIC_DISPATCH(func)(args)` within the same module
    they are defined in. You can call the function from other modules by
    wrapping it in a regular function and declaring the regular function in a
    header.

*   For dynamic dispatch, a table of function pointers is generated via the
    `HWY_EXPORT` macro that is used by `HWY_DYNAMIC_DISPATCH(func)(args)` to
    call the best function pointer for the current CPU's supported targets. A
    module is automatically compiled for each target in `HWY_TARGETS` (see
    [quick-reference](g3doc/quick_reference.md)) if `HWY_TARGET_INCLUDE` is
    defined and `foreach_target.h` is included. A condensed sketch of this
    pattern is shown below.
    defined and foreach_target.h is included.

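The following sketch of the dynamic-dispatch pattern is modeled on
`hwy/examples/skeleton.cc`; the file name, namespace and function names are
hypothetical placeholders, and only the macros named in the bullets above are
relied upon:

```cpp
#include <cstddef>

// foreach_target.h re-includes this translation unit once per target in
// HWY_TARGETS, so the file must name itself:
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "mul_add_loop.cc"  // hypothetical path
#include "hwy/foreach_target.h"
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace project {
namespace HWY_NAMESPACE {

using namespace hwy::HWY_NAMESPACE;

void MulAddLoop(const float* HWY_RESTRICT in, float* HWY_RESTRICT out,
                size_t count) {
  const HWY_FULL(float) d;
  for (size_t i = 0; i < count; i += Lanes(d)) {  // assumes padded arrays
    const auto v = Load(d, in + i);
    Store(MulAdd(v, v, v), d, out + i);
  }
}

}  // namespace HWY_NAMESPACE
}  // namespace project
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace project {

HWY_EXPORT(MulAddLoop);  // generates the table of per-target pointers

// Regular (non-SIMD) wrapper, declared in a normal header.
void CallMulAddLoop(const float* in, float* out, size_t count) {
  HWY_DYNAMIC_DISPATCH(MulAddLoop)(in, out, count);
}

}  // namespace project
#endif  // HWY_ONCE
```

`HWY_ONCE` guards the parts that must be compiled only once rather than once
per target.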

## Compiler flags

@@ -200,17 +123,17 @@ ensure proper VEX code generation for AVX2 targets.

To vectorize a loop, "strip-mining" transforms it into an outer loop and inner
loop with number of iterations matching the preferred vector width.

In this section, let `T` denote the element type, `d = ScalableTag<T>`, `count`
the number of elements to process, and `N = Lanes(d)` the number of lanes in a
full vector. Assume the loop body is given as a function `template<bool partial,
class D> void LoopBody(D d, size_t index, size_t max_n)`.
In this section, let `T` denote the element type, `d = HWY_FULL(T)`, `count` the
number of elements to process, and `N = Lanes(d)` the number of lanes in a full
vector. Assume the loop body is given as a function `template<bool partial,
class D> void LoopBody(D d, size_t max_n)`.

Highway offers several ways to express loops where `N` need not divide `count`:

*   Ensure all inputs/outputs are padded. Then the loop is simply

    ```
    for (size_t i = 0; i < count; i += N) LoopBody<false>(d, i, 0);
    for (size_t i = 0; i < count; i += N) LoopBody<false>(d, 0);
    ```
    Here, the template parameter and second function argument are not needed.

@@ -226,8 +149,8 @@ Highway offers several ways to express loops where `N` need not divide `count`:

    ```
    size_t i = 0;
    for (; i + N <= count; i += N) LoopBody<false>(d, i, 0);
    for (; i < count; ++i) LoopBody<false>(HWY_CAPPED(T, 1)(), i, 0);
    for (; i + N <= count; i += N) LoopBody<false>(d, 0);
    for (; i < count; ++i) LoopBody<false>(HWY_CAPPED(T, 1)(), 0);
    ```
    The template parameter and second function arguments are again not needed.

@@ -240,20 +163,18 @@ Highway offers several ways to express loops where `N` need not divide `count`:

    ```
    size_t i = 0;
    for (; i + N <= count; i += N) {
      LoopBody<false>(d, i, 0);
      LoopBody<false>(d, 0);
    }
    if (i < count) {
      LoopBody<true>(d, i, count - i);
      LoopBody<true>(d, count - i);
    }
    ```
    Now the template parameter and third function argument can be used inside
    Now the template parameter and second function argument can be used inside
    `LoopBody` to 'blend' the new partial vector with previous memory contents:
    `Store(IfThenElse(FirstN(d, N), partial, prev_full), d, aligned_pointer);`.

    This is a good default when it is infeasible to ensure vectors are padded.
    In contrast to the scalar loop, only a single final iteration is needed.
    The increased code size from two loop bodies is expected to be worthwhile
    because it avoids the cost of masking in all but the final iteration.
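To make the remainder-handling variant concrete, here is a self-contained
sketch. The explicit data pointer and index parameters are additions for
illustration (the README's `LoopBody` signature omits them), and the full-vector
load in the epilogue assumes the allocation is padded to a vector multiple,
e.g. via `aligned_allocator.h`:

```cpp
#include <cstddef>

#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace project {
namespace HWY_NAMESPACE {

using namespace hwy::HWY_NAMESPACE;

// kPartial selects the masked epilogue; max_n is the number of valid lanes.
template <bool kPartial, class D>
void LoopBody(D d, float* HWY_RESTRICT data, size_t i, size_t max_n) {
  const auto prev_full = Load(d, data + i);  // safe if allocation is padded
  const auto partial = MulAdd(prev_full, prev_full, prev_full);
  if (kPartial) {
    // Blend the new partial vector with previous memory contents:
    Store(IfThenElse(FirstN(d, max_n), partial, prev_full), d, data + i);
  } else {
    Store(partial, d, data + i);
  }
}

void Process(float* HWY_RESTRICT data, size_t count) {
  const HWY_FULL(float) d;
  const size_t N = Lanes(d);
  size_t i = 0;
  for (; i + N <= count; i += N) {
    LoopBody<false>(d, data, i, 0);
  }
  if (i < count) {
    LoopBody<true>(d, data, i, count - i);  // single remainder iteration
  }
}

}  // namespace HWY_NAMESPACE
}  // namespace project
HWY_AFTER_NAMESPACE();
```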

## Additional resources
@@ -1,15 +1,3 @@
highway (0.16.0-1) UNRELEASED; urgency=medium

  * Add contrib/sort (vectorized quicksort)
  * Add IfNegativeThenElse, IfVecThenElse
  * Add Reverse2,4,8, ReverseBlocks, DupEven/Odd, AESLastRound
  * Add OrAnd, Min128, Max128, Lt128, SumsOf8
  * Support capped/partial vectors on RVV/SVE, int64 in WASM
  * Support SVE2, shared library build
  * Remove deprecated overloads without the required d arg (UpperHalf etc.)

 -- Jan Wassenberg <janwas@google.com>  Thu, 03 Feb 2022 11:00:00 +0100

highway (0.15.0-1) UNRELEASED; urgency=medium

  * New ops: CompressBlendedStore, ConcatOdd/Even, IndicesFromVec
@@ -18,11 +18,8 @@
// Memory allocator with support for alignment and offsets.

#include <stddef.h>

#include <memory>

#include "hwy/highway_export.h"

namespace hwy {

// Minimum alignment of allocated memory for use in HWY_ASSUME_ALIGNED, which

@@ -39,15 +36,15 @@ using FreePtr = void (*)(void* opaque, void* memory);
// bytes of newly allocated memory, aligned to the larger of HWY_ALIGNMENT and
// the vector size. Calls `alloc` with the passed `opaque` pointer to obtain
// memory or malloc() if it is null.
HWY_DLLEXPORT void* AllocateAlignedBytes(size_t payload_size,
                                         AllocPtr alloc_ptr, void* opaque_ptr);
void* AllocateAlignedBytes(size_t payload_size, AllocPtr alloc_ptr,
                           void* opaque_ptr);

// Frees all memory. No effect if `aligned_pointer` == nullptr, otherwise it
// must have been returned from a previous call to `AllocateAlignedBytes`.
// Calls `free_ptr` with the passed `opaque_ptr` pointer to free the memory; if
// `free_ptr` function is null, uses the default free().
HWY_DLLEXPORT void FreeAlignedBytes(const void* aligned_pointer,
                                    FreePtr free_ptr, void* opaque_ptr);
void FreeAlignedBytes(const void* aligned_pointer, FreePtr free_ptr,
                      void* opaque_ptr);

// Class that deletes the aligned pointer passed to operator() calling the
// destructor before freeing the pointer. This is equivalent to the

@@ -79,10 +76,8 @@ class AlignedDeleter {
  // array. TypeArrayDeleter<T> would match this prototype.
  using ArrayDeleter = void (*)(void* t_ptr, size_t t_size);

  HWY_DLLEXPORT static void DeleteAlignedArray(void* aligned_pointer,
                                               FreePtr free_ptr,
                                               void* opaque_ptr,
                                               ArrayDeleter deleter);
  static void DeleteAlignedArray(void* aligned_pointer, FreePtr free_ptr,
                                 void* opaque_ptr, ArrayDeleter deleter);

  FreePtr free_;
  void* opaque_ptr_;

@@ -112,8 +107,8 @@ template <typename T, typename... Args>
AlignedUniquePtr<T> MakeUniqueAligned(Args&&... args) {
  T* ptr = static_cast<T*>(AllocateAlignedBytes(
      sizeof(T), /*alloc_ptr=*/nullptr, /*opaque_ptr=*/nullptr));
  return AlignedUniquePtr<T>(new (ptr) T(std::forward<Args>(args)...),
                             AlignedDeleter());
  return AlignedUniquePtr<T>(
      new (ptr) T(std::forward<Args>(args)...), AlignedDeleter());
}

// Helpers for array allocators (avoids overflow)
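For reference, a small usage sketch of this allocator's public helpers.
`AllocateAligned` and `AlignedFreeUniquePtr` are from the same header;
`MakeUniqueAligned` appears in the hunk above; the `Point` struct and `Demo`
function are illustrative:

```cpp
#include "hwy/aligned_allocator.h"

struct Point {  // illustrative payload type
  Point(int px, int py) : x(px), y(py) {}
  int x;
  int y;
};

void Demo() {
  // Array of 100 floats, aligned to the larger of HWY_ALIGNMENT and the
  // vector size; freed on scope exit by the deleter in the unique_ptr.
  hwy::AlignedFreeUniquePtr<float[]> arr = hwy::AllocateAligned<float>(100);
  arr[0] = 1.0f;

  // Single aligned object, constructed in place via MakeUniqueAligned;
  // AlignedDeleter runs ~Point() before freeing the memory.
  hwy::AlignedUniquePtr<Point> p = hwy::MakeUniqueAligned<Point>(2, 3);
}
```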
@@ -24,7 +24,6 @@
#include <cfloat>

#include "hwy/detect_compiler_arch.h"
#include "hwy/highway_export.h"

//------------------------------------------------------------------------------
// Compiler-specific definitions

@@ -185,6 +184,10 @@
  } while (0)
#endif

#if defined(HWY_EMULATE_SVE)
class FarmFloat16;
#endif

namespace hwy {

//------------------------------------------------------------------------------

@@ -202,9 +205,7 @@ static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
//------------------------------------------------------------------------------
// Alignment

// Potentially useful for LoadDup128 and capped vectors. In other cases, arrays
// should be allocated dynamically via aligned_allocator.h because Lanes() may
// exceed the stack size.
// For stack-allocated partial arrays or LoadDup128.
#if HWY_ARCH_X86
#define HWY_ALIGN_MAX alignas(64)
#elif HWY_ARCH_RVV && defined(__riscv_vector)

@@ -227,7 +228,9 @@ static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;

#pragma pack(push, 1)

#if HWY_NATIVE_FLOAT16
#if defined(HWY_EMULATE_SVE)
using float16_t = FarmFloat16;
#elif HWY_NATIVE_FLOAT16
using float16_t = __fp16;
// Clang does not allow __fp16 arguments, but scalar.h requires LaneType
// arguments, so use a wrapper.

@@ -250,15 +253,15 @@ using float64_t = double;
//------------------------------------------------------------------------------
// Controlling overload resolution (SFINAE)

template <bool Condition>
template <bool Condition, class T>
struct EnableIfT {};
template <>
struct EnableIfT<true> {
  using type = void;
template <class T>
struct EnableIfT<true, T> {
  using type = T;
};

template <bool Condition>
using EnableIf = typename EnableIfT<Condition>::type;
template <bool Condition, class T = void>
using EnableIf = typename EnableIfT<Condition, T>::type;

template <typename T, typename U>
struct IsSameT {

@@ -280,7 +283,7 @@ HWY_API constexpr bool IsSame() {
//
// Note that enabling for exactly 128 bits is unnecessary because a function can
// simply be overloaded with Vec128<T> and/or Full128<T> tag. Enabling for other
// sizes (e.g. 64 bit) can be achieved via Simd<T, 8 / sizeof(T), 0>.
// sizes (e.g. 64 bit) can be achieved via Simd<T, 8 / sizeof(T)>.
#define HWY_IF_LE128(T, N) hwy::EnableIf<N * sizeof(T) <= 16>* = nullptr
#define HWY_IF_LE64(T, N) hwy::EnableIf<N * sizeof(T) <= 8>* = nullptr
#define HWY_IF_LE32(T, N) hwy::EnableIf<N * sizeof(T) <= 4>* = nullptr
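A small self-contained illustration of how this `EnableIf` alias drives
overload selection, in the same style as the `HWY_IF_LE*` macros above (the
`FitsIn64Bits` function is made up for the demo):

```cpp
#include <cstddef>

#include "hwy/base.h"

// Exactly one overload survives template substitution for a given (T, N).
template <typename T, size_t N, hwy::EnableIf<(N * sizeof(T) <= 8)>* = nullptr>
constexpr bool FitsIn64Bits() { return true; }

template <typename T, size_t N, hwy::EnableIf<!(N * sizeof(T) <= 8)>* = nullptr>
constexpr bool FitsIn64Bits() { return false; }

static_assert(FitsIn64Bits<float, 2>(), "2 x 4 bytes fits in 64 bits");
static_assert(!FitsIn64Bits<float, 4>(), "4 x 4 bytes selects the other overload");
```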
@ -316,6 +319,102 @@ struct RemoveConstT<const T> {
|
|||
template <class T>
|
||||
using RemoveConst = typename RemoveConstT<T>::type;
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Type traits
|
||||
|
||||
template <typename T>
|
||||
HWY_API constexpr bool IsFloat() {
|
||||
// Cannot use T(1.25) != T(1) for float16_t, which can only be converted to or
|
||||
// from a float, not compared.
|
||||
return IsSame<T, float>() || IsSame<T, double>();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
HWY_API constexpr bool IsSigned() {
|
||||
return T(0) > T(-1);
|
||||
}
|
||||
template <>
|
||||
constexpr bool IsSigned<float16_t>() {
|
||||
return true;
|
||||
}
|
||||
template <>
|
||||
constexpr bool IsSigned<bfloat16_t>() {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Largest/smallest representable integer values.
|
||||
template <typename T>
|
||||
HWY_API constexpr T LimitsMax() {
|
||||
static_assert(!IsFloat<T>(), "Only for integer types");
|
||||
return IsSigned<T>() ? T((1ULL << (sizeof(T) * 8 - 1)) - 1)
|
||||
: static_cast<T>(~0ull);
|
||||
}
|
||||
template <typename T>
|
||||
HWY_API constexpr T LimitsMin() {
|
||||
static_assert(!IsFloat<T>(), "Only for integer types");
|
||||
return IsSigned<T>() ? T(-1) - LimitsMax<T>() : T(0);
|
||||
}
|
||||
|
||||
// Largest/smallest representable value (integer or float). This naming avoids
|
||||
// confusion with numeric_limits<float>::min() (the smallest positive value).
|
||||
template <typename T>
|
||||
HWY_API constexpr T LowestValue() {
|
||||
return LimitsMin<T>();
|
||||
}
|
||||
template <>
|
||||
constexpr float LowestValue<float>() {
|
||||
return -FLT_MAX;
|
||||
}
|
||||
template <>
|
||||
constexpr double LowestValue<double>() {
|
||||
return -DBL_MAX;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
HWY_API constexpr T HighestValue() {
|
||||
return LimitsMax<T>();
|
||||
}
|
||||
template <>
|
||||
constexpr float HighestValue<float>() {
|
||||
return FLT_MAX;
|
||||
}
|
||||
template <>
|
||||
constexpr double HighestValue<double>() {
|
||||
return DBL_MAX;
|
||||
}
|
||||
|
||||
// Returns bitmask of the exponent field in IEEE binary32/64.
|
||||
template <typename T>
|
||||
constexpr T ExponentMask() {
|
||||
static_assert(sizeof(T) == 0, "Only instantiate the specializations");
|
||||
return 0;
|
||||
}
|
||||
template <>
|
||||
constexpr uint32_t ExponentMask<uint32_t>() {
|
||||
return 0x7F800000;
|
||||
}
|
||||
template <>
|
||||
constexpr uint64_t ExponentMask<uint64_t>() {
|
||||
return 0x7FF0000000000000ULL;
|
||||
}
|
||||
|
||||
// Returns 1 << mantissa_bits as a floating-point number. All integers whose
|
||||
// absolute value are less than this can be represented exactly.
|
||||
template <typename T>
|
||||
constexpr T MantissaEnd() {
|
||||
static_assert(sizeof(T) == 0, "Only instantiate the specializations");
|
||||
return 0;
|
||||
}
|
||||
template <>
|
||||
constexpr float MantissaEnd<float>() {
|
||||
return 8388608.0f; // 1 << 23
|
||||
}
|
||||
template <>
|
||||
constexpr double MantissaEnd<double>() {
|
||||
// floating point literal with p52 requires C++17.
|
||||
return 4503599627370496.0; // 1 << 52
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Type relations
|
||||
|
||||
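A few spot-checks of these traits; all are compile-time constants, so plain
`static_assert`s suffice (the values follow from two's complement and IEEE-754
as described in the comments above):

```cpp
#include <cfloat>
#include <cstdint>

#include "hwy/base.h"

static_assert(hwy::LimitsMax<int8_t>() == 127, "2^7 - 1");
static_assert(hwy::LimitsMin<int8_t>() == -128, "-(2^7)");
static_assert(hwy::LimitsMax<uint16_t>() == 65535, "2^16 - 1");
static_assert(hwy::LowestValue<double>() == -DBL_MAX,
              "not numeric_limits<double>::min()");
static_assert(hwy::MantissaEnd<float>() == 8388608.0f, "1 << 23");
```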
@@ -457,118 +556,6 @@ using SignedFromSize = typename detail::TypeFromSize<N>::Signed;
template <size_t N>
using FloatFromSize = typename detail::TypeFromSize<N>::Float;

//------------------------------------------------------------------------------
// Type traits

template <typename T>
HWY_API constexpr bool IsFloat() {
  // Cannot use T(1.25) != T(1) for float16_t, which can only be converted to or
  // from a float, not compared.
  return IsSame<T, float>() || IsSame<T, double>();
}

template <typename T>
HWY_API constexpr bool IsSigned() {
  return T(0) > T(-1);
}
template <>
constexpr bool IsSigned<float16_t>() {
  return true;
}
template <>
constexpr bool IsSigned<bfloat16_t>() {
  return true;
}

// Largest/smallest representable integer values.
template <typename T>
HWY_API constexpr T LimitsMax() {
  static_assert(!IsFloat<T>(), "Only for integer types");
  using TU = MakeUnsigned<T>;
  return static_cast<T>(IsSigned<T>() ? (static_cast<TU>(~0ull) >> 1)
                                      : static_cast<TU>(~0ull));
}
template <typename T>
HWY_API constexpr T LimitsMin() {
  static_assert(!IsFloat<T>(), "Only for integer types");
  return IsSigned<T>() ? T(-1) - LimitsMax<T>() : T(0);
}

// Largest/smallest representable value (integer or float). This naming avoids
// confusion with numeric_limits<float>::min() (the smallest positive value).
template <typename T>
HWY_API constexpr T LowestValue() {
  return LimitsMin<T>();
}
template <>
constexpr float LowestValue<float>() {
  return -FLT_MAX;
}
template <>
constexpr double LowestValue<double>() {
  return -DBL_MAX;
}

template <typename T>
HWY_API constexpr T HighestValue() {
  return LimitsMax<T>();
}
template <>
constexpr float HighestValue<float>() {
  return FLT_MAX;
}
template <>
constexpr double HighestValue<double>() {
  return DBL_MAX;
}

// Returns bitmask of the exponent field in IEEE binary32/64.
template <typename T>
constexpr T ExponentMask() {
  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
  return 0;
}
template <>
constexpr uint32_t ExponentMask<uint32_t>() {
  return 0x7F800000;
}
template <>
constexpr uint64_t ExponentMask<uint64_t>() {
  return 0x7FF0000000000000ULL;
}

// Returns bitmask of the mantissa field in IEEE binary32/64.
template <typename T>
constexpr T MantissaMask() {
  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
  return 0;
}
template <>
constexpr uint32_t MantissaMask<uint32_t>() {
  return 0x007FFFFF;
}
template <>
constexpr uint64_t MantissaMask<uint64_t>() {
  return 0x000FFFFFFFFFFFFFULL;
}

// Returns 1 << mantissa_bits as a floating-point number. All integers whose
// absolute value are less than this can be represented exactly.
template <typename T>
constexpr T MantissaEnd() {
  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
  return 0;
}
template <>
constexpr float MantissaEnd<float>() {
  return 8388608.0f;  // 1 << 23
}
template <>
constexpr double MantissaEnd<double>() {
  // floating point literal with p52 requires C++17.
  return 4503599627370496.0;  // 1 << 52
}

//------------------------------------------------------------------------------
// Helper functions
@@ -674,21 +661,14 @@ HWY_API size_t PopCount(uint64_t x) {
#endif
}

// Skip HWY_API due to GCC "function not considered for inlining". Previously
// such errors were caused by underlying type mismatches, but it's not clear
// what is still mismatched despite all the casts.
template <typename TI>
/*HWY_API*/ constexpr size_t FloorLog2(TI x) {
  return x == TI{1}
             ? 0
             : static_cast<size_t>(FloorLog2(static_cast<TI>(x >> 1)) + 1);
HWY_API constexpr size_t FloorLog2(TI x) {
  return x == 1 ? 0 : FloorLog2(x >> 1) + 1;
}

template <typename TI>
/*HWY_API*/ constexpr size_t CeilLog2(TI x) {
  return x == TI{1}
             ? 0
             : static_cast<size_t>(FloorLog2(static_cast<TI>(x - 1)) + 1);
HWY_API constexpr size_t CeilLog2(TI x) {
  return x == 1 ? 0 : FloorLog2(x - 1) + 1;
}

#if HWY_COMPILER_MSVC && HWY_ARCH_X86_64
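Both recursions bottom out at x == 1; since the functions are `constexpr`, a
few example values can be checked at compile time:

```cpp
#include "hwy/base.h"

static_assert(hwy::FloorLog2(8u) == 3, "exact power of two");
static_assert(hwy::FloorLog2(9u) == 3, "rounds down");
static_assert(hwy::CeilLog2(8u) == 3, "exact power of two");
static_assert(hwy::CeilLog2(9u) == 4, "rounds up");
```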
@@ -747,7 +727,7 @@ HWY_API bfloat16_t BF16FromF32(float f) {
  return bf;
}

HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
HWY_NORETURN void HWY_FORMAT(3, 4)
    Abort(const char* file, int line, const char* format, ...);

}  // namespace hwy
@@ -36,7 +36,9 @@
// undefine them in this header; these functions are anyway deprecated.
// TODO(janwas): remove when these functions are removed.
#pragma push_macro("LoadFence")
#pragma push_macro("StoreFence")
#undef LoadFence
#undef StoreFence

namespace hwy {

@@ -70,6 +72,9 @@ HWY_INLINE HWY_ATTR_CACHE void FlushStream() {
#endif
}

// DEPRECATED, replace with `FlushStream`.
HWY_INLINE HWY_ATTR_CACHE void StoreFence() { FlushStream(); }

// Optionally begins loading the cache line containing "p" to reduce latency of
// subsequent actual loads.
template <typename T>

@@ -104,6 +109,7 @@ HWY_INLINE HWY_ATTR_CACHE void Pause() {
}  // namespace hwy

// TODO(janwas): remove when these functions are removed. (See above.)
#pragma pop_macro("StoreFence")
#pragma pop_macro("LoadFence")

#endif  // HIGHWAY_HWY_CACHE_CONTROL_H_
@@ -14,14 +14,15 @@

#include "hwy/contrib/image/image.h"

#include <algorithm>  // swap
#include <cstddef>

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/image/image.cc"

#include <algorithm>  // swap

#include "hwy/foreach_target.h"
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

@@ -27,13 +27,12 @@

#include "hwy/aligned_allocator.h"
#include "hwy/base.h"
#include "hwy/highway_export.h"

namespace hwy {

// Type-independent parts of Image<> - reduces code duplication and facilitates
// moving member function implementations to cc file.
struct HWY_CONTRIB_DLLEXPORT ImageBase {
struct ImageBase {
  // Returns required alignment in bytes for externally allocated memory.
  static size_t VectorSize();

@@ -101,7 +100,8 @@ struct HWY_CONTRIB_DLLEXPORT ImageBase {
 protected:
  // Returns pointer to the start of a row.
  HWY_INLINE void* VoidRow(const size_t y) const {
#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
    defined(THREAD_SANITIZER)
    if (y >= ysize_) {
      HWY_ABORT("Row(%" PRIu64 ") >= %u\n", static_cast<uint64_t>(y), ysize_);
    }

@@ -291,7 +291,8 @@ class Image3 {
 private:
  // Returns pointer to the start of a row.
  HWY_INLINE void* VoidPlaneRow(const size_t c, const size_t y) const {
#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
    defined(THREAD_SANITIZER)
    if (c >= kNumPlanes || y >= ysize()) {
      HWY_ABORT("PlaneRow(%" PRIu64 ", %" PRIu64 ") >= %" PRIu64 "\n",
                static_cast<uint64_t>(c), static_cast<uint64_t>(y),

@@ -51,7 +51,7 @@ struct TestAlignedT {
    for (size_t y = 0; y < ysize; ++y) {
      T* HWY_RESTRICT row = img.MutableRow(y);
      for (size_t x = 0; x < xsize; x += Lanes(d)) {
        const auto values = Iota(d, static_cast<T>(dist(rng)));
        const auto values = Iota(d, dist(rng));
        Store(values, d, row + x);
      }
    }

@@ -486,7 +486,7 @@ struct AsinImpl<float> {
  }
};

#if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
#if HWY_CAP_FLOAT64 && HWY_CAP_INTEGER64

template <>
struct AsinImpl<double> {

@@ -531,7 +531,7 @@ struct AtanImpl<float> {
  }
};

#if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
#if HWY_CAP_FLOAT64 && HWY_CAP_INTEGER64

template <>
struct AtanImpl<double> {

@@ -635,7 +635,7 @@ struct CosSinImpl<float> {
  }
};

#if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
#if HWY_CAP_FLOAT64 && HWY_CAP_INTEGER64

template <>
struct CosSinImpl<double> {

@@ -787,7 +787,7 @@ struct LogImpl<float> {
  }
};

#if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
#if HWY_CAP_FLOAT64 && HWY_CAP_INTEGER64
template <>
struct ExpImpl<double> {
  // Rounds double toward zero and returns as int32_t.

@@ -61,7 +61,7 @@ HWY_NOINLINE void TestMath(const std::string name, T (*fx1)(T),

  uint64_t max_ulp = 0;
  // Emulation is slower, so cannot afford as many.
  constexpr UintT kSamplesPerRange = static_cast<UintT>(AdjustedReps(4000));
  constexpr UintT kSamplesPerRange = static_cast<UintT>(AdjustedReps(10000));
  for (int range_index = 0; range_index < range_count; ++range_index) {
    const UintT start = ranges[range_index][0];
    const UintT stop = ranges[range_index][1];

@@ -96,11 +96,24 @@ HWY_NOINLINE void TestMath(const std::string name, T (*fx1)(T),
  HWY_ASSERT(max_ulp <= max_error_ulp);
}

// TODO(janwas): remove once RVV supports fractional LMUL
#undef DEFINE_MATH_TEST_FUNC
#if HWY_TARGET == HWY_RVV

#define DEFINE_MATH_TEST_FUNC(NAME)                    \
  HWY_NOINLINE void TestAll##NAME() {                  \
    ForFloatTypes(ForShrinkableVectors<Test##NAME>()); \
  }

#else

#define DEFINE_MATH_TEST_FUNC(NAME)                  \
  HWY_NOINLINE void TestAll##NAME() {                \
    ForFloatTypes(ForPartialVectors<Test##NAME>());  \
  }

#endif

#undef DEFINE_MATH_TEST
#define DEFINE_MATH_TEST(NAME, F32x1, F32xN, F32_MIN, F32_MAX, F32_ERROR, \
                         F64x1, F64xN, F64_MIN, F64_MAX, F64_ERROR)       \
@@ -1,133 +0,0 @@
package(default_visibility = ["//visibility:public"])

licenses(["notice"])

# Unused on Bazel builds, where this is not defined/known; Copybara replaces
# usages with an empty list.
COMPAT = [
    "//buildenv/target:non_prod",  # includes mobile/vendor.
]

cc_library(
    name = "vqsort",
    srcs = [
        # Split into separate files to reduce MSVC build time.
        "vqsort.cc",
        "vqsort_i16a.cc",
        "vqsort_i16d.cc",
        "vqsort_u16a.cc",
        "vqsort_u16d.cc",
        "vqsort_f32a.cc",
        "vqsort_f32d.cc",
        "vqsort_i32a.cc",
        "vqsort_i32d.cc",
        "vqsort_u32a.cc",
        "vqsort_u32d.cc",
        "vqsort_f64a.cc",
        "vqsort_f64d.cc",
        "vqsort_i64a.cc",
        "vqsort_i64d.cc",
        "vqsort_u64a.cc",
        "vqsort_u64d.cc",
        "vqsort_128a.cc",
        "vqsort_128d.cc",
    ],
    hdrs = [
        "disabled_targets.h",
        "vqsort.h",  # public interface
    ],
    compatible_with = [],
    textual_hdrs = [
        "shared-inl.h",
        "sorting_networks-inl.h",
        "traits-inl.h",
        "traits128-inl.h",
        "vqsort-inl.h",
    ],
    deps = [
        # Only if VQSORT_SECURE_RNG is set.
        # "//third_party/absl/random",
        "//:hwy",
    ],
)

# -----------------------------------------------------------------------------
# Internal-only targets

cc_library(
    name = "helpers",
    testonly = 1,
    textual_hdrs = [
        "algo-inl.h",
        "result-inl.h",
    ],
    deps = [
        ":vqsort",
        "//:nanobenchmark",
        # Required for HAVE_PDQSORT, but that is unused and this is
        # unavailable to Bazel builds, hence commented out.
        # "//third_party/boost/allowed",
        # Avoid ips4o and thus TBB to work around hwloc build failure.
    ],
)

cc_binary(
    name = "print_network",
    testonly = 1,
    srcs = ["print_network.cc"],
    deps = [
        ":helpers",
        ":vqsort",
        "//:hwy",
    ],
)

cc_test(
    name = "sort_test",
    size = "medium",
    srcs = ["sort_test.cc"],
    features = ["fully_static_link"],
    linkstatic = True,
    local_defines = ["HWY_IS_TEST"],
    # for test_suite.
    tags = ["hwy_ops_test"],
    deps = [
        ":helpers",
        ":vqsort",
        "@com_google_googletest//:gtest_main",
        "//:hwy",
        "//:hwy_test_util",
    ],
)

cc_binary(
    name = "bench_sort",
    testonly = 1,
    srcs = ["bench_sort.cc"],
    features = ["fully_static_link"],
    linkstatic = True,
    local_defines = ["HWY_IS_TEST"],
    deps = [
        ":helpers",
        ":vqsort",
        "@com_google_googletest//:gtest_main",
        "//:hwy",
        "//:hwy_test_util",
    ],
)

cc_binary(
    name = "bench_parallel",
    testonly = 1,
    srcs = ["bench_parallel.cc"],
    features = ["fully_static_link"],
    linkstatic = True,
    local_defines = ["HWY_IS_TEST"],
    deps = [
        ":helpers",
        ":vqsort",
        "@com_google_googletest//:gtest_main",
        "//:hwy",
        "//:hwy_test_util",
    ],
)
|
@@ -1,395 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Normal include guard for target-independent parts
#ifndef HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_
#define HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_

#include <stdint.h>
#include <string.h>  // memcpy

#include <algorithm>
#include <cmath>  // std::abs
#include <vector>

#include "hwy/base.h"
#include "hwy/contrib/sort/vqsort.h"

// Third-party algorithms
#define HAVE_AVX2SORT 0
#define HAVE_IPS4O 0
#define HAVE_PARALLEL_IPS4O (HAVE_IPS4O && 1)
#define HAVE_PDQSORT 0
#define HAVE_SORT512 0

#if HAVE_AVX2SORT
HWY_PUSH_ATTRIBUTES("avx2,avx")
#include "avx2sort.h"
HWY_POP_ATTRIBUTES
#endif
#if HAVE_IPS4O
#include "third_party/ips4o/include/ips4o.hpp"
#include "third_party/ips4o/include/ips4o/thread_pool.hpp"
#endif
#if HAVE_PDQSORT
#include "third_party/boost/allowed/sort/sort.hpp"
#endif
#if HAVE_SORT512
#include "sort512.h"
#endif

namespace hwy {

enum class Dist { kUniform8, kUniform16, kUniform32 };

std::vector<Dist> AllDist() {
  return {/*Dist::kUniform8,*/ Dist::kUniform16, Dist::kUniform32};
}

const char* DistName(Dist dist) {
  switch (dist) {
    case Dist::kUniform8:
      return "uniform8";
    case Dist::kUniform16:
      return "uniform16";
    case Dist::kUniform32:
      return "uniform32";
  }
  return "unreachable";
}

template <typename T>
class InputStats {
 public:
  void Notify(T value) {
    min_ = std::min(min_, value);
    max_ = std::max(max_, value);
    sumf_ += static_cast<double>(value);
    count_ += 1;
  }

  bool operator==(const InputStats& other) const {
    if (count_ != other.count_) {
      HWY_ABORT("count %d vs %d\n", static_cast<int>(count_),
                static_cast<int>(other.count_));
    }

    if (min_ != other.min_ || max_ != other.max_) {
      HWY_ABORT("minmax %f/%f vs %f/%f\n", double(min_), double(max_),
                double(other.min_), double(other.max_));
    }

    // Sum helps detect duplicated/lost values
    if (sumf_ != other.sumf_) {
      // Allow some tolerance because kUniform32 * num can exceed double
      // precision.
      const double mul = 1E-9;  // prevent destructive cancellation
      const double err = std::abs(sumf_ * mul - other.sumf_ * mul);
      if (err > 1E-3) {
        HWY_ABORT("Sum mismatch %.15e %.15e (%f) min %g max %g\n", sumf_,
                  other.sumf_, err, double(min_), double(max_));
      }
    }

    return true;
  }

 private:
  T min_ = hwy::HighestValue<T>();
  T max_ = hwy::LowestValue<T>();
  double sumf_ = 0.0;
  size_t count_ = 0;
};
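
A minimal sketch of the intended use, mirroring the GenerateInput/VerifySort pattern below: snapshot the stats before and after sorting, then compare to detect lost, duplicated, or corrupted keys. CheckSortPreservesKeys is a hypothetical helper, not part of this file:

// Hypothetical helper showing how InputStats validates a sort.
template <typename T>
void CheckSortPreservesKeys(T* keys, size_t num) {
  InputStats<T> before;
  for (size_t i = 0; i < num; ++i) before.Notify(keys[i]);
  std::sort(keys, keys + num);  // stand-in for the sort under test
  InputStats<T> after;
  for (size_t i = 0; i < num; ++i) after.Notify(keys[i]);
  // operator== aborts with a diagnostic on any count/min/max/sum mismatch.
  HWY_ASSERT(before == after);
}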

enum class Algo {
#if HAVE_AVX2SORT
  kSEA,
#endif
#if HAVE_IPS4O
  kIPS4O,
#endif
#if HAVE_PARALLEL_IPS4O
  kParallelIPS4O,
#endif
#if HAVE_PDQSORT
  kPDQ,
#endif
#if HAVE_SORT512
  kSort512,
#endif
  kStd,
  kVQSort,
  kHeap,
};

const char* AlgoName(Algo algo) {
  switch (algo) {
#if HAVE_AVX2SORT
    case Algo::kSEA:
      return "sea";
#endif
#if HAVE_IPS4O
    case Algo::kIPS4O:
      return "ips4o";
#endif
#if HAVE_PARALLEL_IPS4O
    case Algo::kParallelIPS4O:
      return "par_ips4o";
#endif
#if HAVE_PDQSORT
    case Algo::kPDQ:
      return "pdq";
#endif
#if HAVE_SORT512
    case Algo::kSort512:
      return "sort512";
#endif
    case Algo::kStd:
      return "std";
    case Algo::kVQSort:
      return "vq";
    case Algo::kHeap:
      return "heap";
  }
  return "unreachable";
}

}  // namespace hwy
#endif  // HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_

// Per-target
#if defined(HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE) == \
    defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
#undef HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
#else
#define HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
#endif

#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/traits128-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"  // HeapSort
#include "hwy/tests/test_util-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

class Xorshift128Plus {
  static HWY_INLINE uint64_t SplitMix64(uint64_t z) {
    z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull;
    z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
    return z ^ (z >> 31);
  }

 public:
  // Generates two vectors of 64-bit seeds via SplitMix64 and stores into
  // `seeds`. Generating these afresh in each ChoosePivot is too expensive.
  template <class DU64>
  static void GenerateSeeds(DU64 du64, TFromD<DU64>* HWY_RESTRICT seeds) {
    seeds[0] = SplitMix64(0x9E3779B97F4A7C15ull);
    for (size_t i = 1; i < 2 * Lanes(du64); ++i) {
      seeds[i] = SplitMix64(seeds[i - 1]);
    }
  }

  // Need to pass in the state because vectors cannot be class members.
  template <class DU64>
  static Vec<DU64> RandomBits(DU64 /* tag */, Vec<DU64>& state0,
                              Vec<DU64>& state1) {
    Vec<DU64> s1 = state0;
    Vec<DU64> s0 = state1;
    const Vec<DU64> bits = Add(s1, s0);
    state0 = s0;
    s1 = Xor(s1, ShiftLeft<23>(s1));
    state1 = Xor(s1, Xor(s0, Xor(ShiftRight<18>(s1), ShiftRight<5>(s0))));
    return bits;
  }
};
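
For comparison, a minimal scalar sketch of the same xorshift128+ recurrence (a standalone illustration, not part of the library; the vector version above simply advances every lane with this update in parallel):

// Scalar xorshift128+ step: returns the next 64 random bits, updates s0/s1.
static inline uint64_t Xorshift128PlusScalar(uint64_t& s0, uint64_t& s1) {
  uint64_t x = s0;
  const uint64_t y = s1;
  const uint64_t bits = x + y;  // output computed before the state update
  s0 = y;
  x ^= x << 23;
  s1 = x ^ y ^ (x >> 18) ^ (y >> 5);
  return bits;
}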

template <typename T, class DU64, HWY_IF_NOT_FLOAT(T)>
Vec<DU64> RandomValues(DU64 du64, Vec<DU64>& s0, Vec<DU64>& s1,
                       const Vec<DU64> mask) {
  const Vec<DU64> bits = Xorshift128Plus::RandomBits(du64, s0, s1);
  return And(bits, mask);
}

// Important to avoid denormals, which are flushed to zero by SIMD but not
// scalar sorts, and NaN, which may be ordered differently in scalar vs. SIMD.
template <typename T, class DU64, HWY_IF_FLOAT(T)>
Vec<DU64> RandomValues(DU64 du64, Vec<DU64>& s0, Vec<DU64>& s1,
                       const Vec<DU64> mask) {
  const Vec<DU64> bits = Xorshift128Plus::RandomBits(du64, s0, s1);
  const Vec<DU64> values = And(bits, mask);
#if HWY_TARGET == HWY_SCALAR  // Cannot repartition u64 to i32
  const RebindToSigned<DU64> di;
#else
  const Repartition<MakeSigned<T>, DU64> di;
#endif
  const RebindToFloat<decltype(di)> df;
  // Avoid NaN/denormal by converting from (range-limited) integer.
  const Vec<DU64> no_nan =
      And(values, Set(du64, MantissaMask<MakeUnsigned<T>>()));
  return BitCast(du64, ConvertTo(df, BitCast(di, no_nan)));
}

template <class DU64>
Vec<DU64> MaskForDist(DU64 du64, const Dist dist, size_t sizeof_t) {
  switch (sizeof_t) {
    case 2:
      return Set(du64, (dist == Dist::kUniform8) ? 0x00FF00FF00FF00FFull
                                                 : 0xFFFFFFFFFFFFFFFFull);
    case 4:
      return Set(du64, (dist == Dist::kUniform8)    ? 0x000000FF000000FFull
                       : (dist == Dist::kUniform16) ? 0x0000FFFF0000FFFFull
                                                    : 0xFFFFFFFFFFFFFFFFull);
    case 8:
      return Set(du64, (dist == Dist::kUniform8)    ? 0x00000000000000FFull
                       : (dist == Dist::kUniform16) ? 0x000000000000FFFFull
                                                    : 0x00000000FFFFFFFFull);
    default:
      HWY_ABORT("Logic error");
      return Zero(du64);
  }
}

template <typename T>
InputStats<T> GenerateInput(const Dist dist, T* v, size_t num) {
  SortTag<uint64_t> du64;
  using VU64 = Vec<decltype(du64)>;
  const size_t N64 = Lanes(du64);
  auto buf = hwy::AllocateAligned<uint64_t>(2 * N64);
  Xorshift128Plus::GenerateSeeds(du64, buf.get());
  auto s0 = Load(du64, buf.get());
  auto s1 = Load(du64, buf.get() + N64);

  const VU64 mask = MaskForDist(du64, dist, sizeof(T));

  const Repartition<T, decltype(du64)> d;
  const size_t N = Lanes(d);
  size_t i = 0;
  for (; i + N <= num; i += N) {
    const VU64 bits = RandomValues<T>(du64, s0, s1, mask);
#if HWY_ARCH_RVV
    // v may not be 64-bit aligned
    StoreU(bits, du64, buf.get());
    memcpy(v + i, buf.get(), N64 * sizeof(uint64_t));
#else
    StoreU(bits, du64, reinterpret_cast<uint64_t*>(v + i));
#endif
  }
  if (i < num) {
    const VU64 bits = RandomValues<T>(du64, s0, s1, mask);
    StoreU(bits, du64, buf.get());
    memcpy(v + i, buf.get(), (num - i) * sizeof(T));
  }

  InputStats<T> input_stats;
  for (size_t i = 0; i < num; ++i) {
    input_stats.Notify(v[i]);
  }
  return input_stats;
}

struct ThreadLocal {
  Sorter sorter;
};

struct SharedState {
#if HAVE_PARALLEL_IPS4O
  ips4o::StdThreadPool pool{
      HWY_MIN(16, static_cast<int>(std::thread::hardware_concurrency() / 2))};
#endif
  std::vector<ThreadLocal> tls{1};
};

template <class Order, typename T>
void Run(Algo algo, T* HWY_RESTRICT inout, size_t num, SharedState& shared,
         size_t thread) {
  using detail::HeapSort;
  using detail::LaneTraits;
  using detail::SharedTraits;

  switch (algo) {
#if HAVE_AVX2SORT
    case Algo::kSEA:
      return avx2::quicksort(inout, static_cast<int>(num));
#endif

#if HAVE_IPS4O
    case Algo::kIPS4O:
      if (Order().IsAscending()) {
        return ips4o::sort(inout, inout + num, std::less<T>());
      } else {
        return ips4o::sort(inout, inout + num, std::greater<T>());
      }
#endif

#if HAVE_PARALLEL_IPS4O
    case Algo::kParallelIPS4O:
      if (Order().IsAscending()) {
        return ips4o::parallel::sort(inout, inout + num, std::less<T>());
      } else {
        return ips4o::parallel::sort(inout, inout + num, std::greater<T>());
      }
#endif

#if HAVE_SORT512
    case Algo::kSort512:
      HWY_ABORT("not supported");
      // return Sort512::Sort(inout, num);
#endif

#if HAVE_PDQSORT
    case Algo::kPDQ:
      if (Order().IsAscending()) {
        return boost::sort::pdqsort_branchless(inout, inout + num,
                                               std::less<T>());
      } else {
        return boost::sort::pdqsort_branchless(inout, inout + num,
                                               std::greater<T>());
      }
#endif

    case Algo::kStd:
      if (Order().IsAscending()) {
        return std::sort(inout, inout + num, std::less<T>());
      } else {
        return std::sort(inout, inout + num, std::greater<T>());
      }

    case Algo::kVQSort:
      return shared.tls[thread].sorter(inout, num, Order());

    case Algo::kHeap:
      HWY_ASSERT(sizeof(T) < 16);
      if (Order().IsAscending()) {
        const SharedTraits<LaneTraits<detail::OrderAscending>> st;
        return HeapSort(st, inout, num);
      } else {
        const SharedTraits<LaneTraits<detail::OrderDescending>> st;
        return HeapSort(st, inout, num);
      }

    default:
      HWY_ABORT("Not implemented");
  }
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#endif  // HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
@@ -1,243 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Concurrent, independent sorts for generating more memory traffic and testing
// scalability.

// clang-format off
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/bench_parallel.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/algo-inl.h"
#include "hwy/contrib/sort/result-inl.h"
#include "hwy/aligned_allocator.h"
// Last
#include "hwy/tests/test_util-inl.h"
// clang-format on

#include <stdint.h>
#include <stdio.h>

#include <condition_variable>  //NOLINT
#include <functional>
#include <memory>
#include <mutex>   //NOLINT
#include <thread>  //NOLINT
#include <utility>
#include <vector>

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace {

#if HWY_TARGET != HWY_SCALAR

class ThreadPool {
 public:
  // Starts the given number of worker threads and blocks until they are ready.
  explicit ThreadPool(
      const size_t num_threads = std::thread::hardware_concurrency() / 2)
      : num_threads_(num_threads) {
    HWY_ASSERT(num_threads_ > 0);
    threads_.reserve(num_threads_);
    for (size_t i = 0; i < num_threads_; ++i) {
      threads_.emplace_back(ThreadFunc, this, i);
    }

    WorkersReadyBarrier();
  }

  ThreadPool(const ThreadPool&) = delete;
  ThreadPool& operator=(const ThreadPool&) = delete;

  // Waits for all threads to exit.
  ~ThreadPool() {
    StartWorkers(kWorkerExit);

    for (std::thread& thread : threads_) {
      thread.join();
    }
  }

  size_t NumThreads() const { return threads_.size(); }

  template <class Func>
  void RunOnThreads(size_t max_threads, const Func& func) {
    task_ = &CallClosure<Func>;
    data_ = &func;
    StartWorkers(max_threads);
    WorkersReadyBarrier();
  }

 private:
  // After construction and between calls to Run, workers are "ready", i.e.
  // waiting on worker_start_cv_. They are "started" by sending a "command"
  // and notifying all worker_start_cv_ waiters. (That is why all workers
  // must be ready/waiting - otherwise, the notification will not reach all of
  // them and the main thread waits in vain for them to report readiness.)
  using WorkerCommand = uint64_t;

  static constexpr WorkerCommand kWorkerWait = ~1ULL;
  static constexpr WorkerCommand kWorkerExit = ~2ULL;

  // Calls a closure (lambda with captures).
  template <class Closure>
  static void CallClosure(const void* f, size_t thread) {
    (*reinterpret_cast<const Closure*>(f))(thread);
  }

  void WorkersReadyBarrier() {
    std::unique_lock<std::mutex> lock(mutex_);
    // Typically only a single iteration.
    while (workers_ready_ != threads_.size()) {
      workers_ready_cv_.wait(lock);
    }
    workers_ready_ = 0;

    // Safely handle spurious worker wakeups.
    worker_start_command_ = kWorkerWait;
  }

  // Precondition: all workers are ready.
  void StartWorkers(const WorkerCommand worker_command) {
    std::unique_lock<std::mutex> lock(mutex_);
    worker_start_command_ = worker_command;
    // Workers will need this lock, so release it before they wake up.
    lock.unlock();
    worker_start_cv_.notify_all();
  }

  static void ThreadFunc(ThreadPool* self, size_t thread) {
    // Until kWorkerExit command received:
    for (;;) {
      std::unique_lock<std::mutex> lock(self->mutex_);
      // Notify main thread that this thread is ready.
      if (++self->workers_ready_ == self->num_threads_) {
        self->workers_ready_cv_.notify_one();
      }
    RESUME_WAIT:
      // Wait for a command.
      self->worker_start_cv_.wait(lock);
      const WorkerCommand command = self->worker_start_command_;
      switch (command) {
        case kWorkerWait:    // spurious wakeup:
          goto RESUME_WAIT;  // lock still held, avoid incrementing ready.
        case kWorkerExit:
          return;  // exits thread
        default:
          break;
      }

      lock.unlock();
      // Command is the maximum number of threads that should run the task.
      HWY_ASSERT(command < self->NumThreads());
      if (thread < command) {
        self->task_(self->data_, thread);
      }
    }
  }

  const size_t num_threads_;

  // Unmodified after ctor, but cannot be const because we call thread::join().
  std::vector<std::thread> threads_;

  std::mutex mutex_;  // guards both cv and their variables.
  std::condition_variable workers_ready_cv_;
  size_t workers_ready_ = 0;
  std::condition_variable worker_start_cv_;
  WorkerCommand worker_start_command_;

  // Written by main thread, read by workers (after mutex lock/unlock).
  std::function<void(const void*, size_t)> task_;  // points to CallClosure
  const void* data_;                               // points to caller's Func
};
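
A minimal usage sketch for this pool (a hypothetical example assuming <atomic> is also included; ExampleFanOut is not part of the benchmark):

// Hypothetical caller of the pool above.
void ExampleFanOut() {
  ThreadPool pool;  // defaults to half of hardware_concurrency
  std::atomic<size_t> calls{0};
  // ThreadFunc asserts command < NumThreads(), so request fewer threads
  // than exist; workers with thread >= max_threads skip the task.
  const size_t max_threads = pool.NumThreads() - 1;
  pool.RunOnThreads(max_threads, [&calls](size_t /*thread*/) {
    calls.fetch_add(1, std::memory_order_relaxed);
  });
  // RunOnThreads blocks until all workers are ready again, so this is safe.
  HWY_ASSERT(calls.load() == max_threads);
}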

template <class Order, typename T>
void RunWithoutVerify(const Dist dist, const size_t num, const Algo algo,
                      SharedState& shared, size_t thread) {
  auto aligned = hwy::AllocateAligned<T>(num);

  (void)GenerateInput(dist, aligned.get(), num);

  const Timestamp t0;
  Run<Order>(algo, aligned.get(), num, shared, thread);
  HWY_ASSERT(aligned[0] < aligned[num - 1]);
}

void BenchParallel() {
  // Not interested in benchmark results for other targets
  if (HWY_TARGET != HWY_AVX3) return;

  ThreadPool pool;
  const size_t NT = pool.NumThreads();

  using T = int64_t;
  detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;

  size_t num = 100 * 1000 * 1000;

#if HAVE_IPS4O
  const Algo algo = Algo::kIPS4O;
#else
  const Algo algo = Algo::kVQSort;
#endif
  const Dist dist = Dist::kUniform16;

  SharedState shared;
  shared.tls.resize(NT);

  std::vector<Result> results;
  for (size_t nt = 1; nt < NT; nt += HWY_MAX(1, NT / 16)) {
    Timestamp t0;
    // Default capture because MSVC wants algo/dist but clang does not.
    pool.RunOnThreads(nt, [=, &shared](size_t thread) {
      RunWithoutVerify<SortAscending, T>(dist, num, algo, shared, thread);
    });
    const double sec = SecondsSince(t0);
    results.push_back(MakeResult<T>(algo, dist, st, num, nt, sec));
    results.back().Print();
  }
}

#else
void BenchParallel() {}
#endif

}  // namespace
// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE

namespace hwy {
namespace {
HWY_BEFORE_TEST(BenchParallel);
HWY_EXPORT_AND_TEST_P(BenchParallel, BenchParallel);
}  // namespace
}  // namespace hwy

// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}

#endif  // HWY_ONCE
@@ -1,259 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/bench_sort.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/algo-inl.h"
#include "hwy/contrib/sort/result-inl.h"
#include "hwy/contrib/sort/vqsort.h"
#include "hwy/contrib/sort/sorting_networks-inl.h"  // SharedTraits
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/traits128-inl.h"
#include "hwy/tests/test_util-inl.h"
// clang-format on

#include <stdint.h>
#include <stdio.h>
#include <string.h>  // memcpy

#include <vector>

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace {
using detail::LaneTraits;
using detail::OrderAscending;
using detail::OrderDescending;
using detail::SharedTraits;

#if HWY_TARGET != HWY_SCALAR
using detail::OrderAscending128;
using detail::OrderDescending128;
using detail::Traits128;

template <class Traits, typename T>
HWY_NOINLINE void BenchPartition() {
  const SortTag<T> d;
  detail::SharedTraits<Traits> st;
  const Dist dist = Dist::kUniform8;
  double sum = 0.0;

  const size_t max_log2 = AdjustedLog2Reps(20);
  for (size_t log2 = max_log2; log2 < max_log2 + 1; ++log2) {
    const size_t num = 1ull << log2;
    auto aligned = hwy::AllocateAligned<T>(num);
    auto buf =
        hwy::AllocateAligned<T>(hwy::SortConstants::PartitionBufNum(Lanes(d)));

    std::vector<double> seconds;
    const size_t num_reps = (1ull << (14 - log2 / 2)) * kReps;
    for (size_t rep = 0; rep < num_reps; ++rep) {
      (void)GenerateInput(dist, aligned.get(), num);

      const Timestamp t0;

      detail::Partition(d, st, aligned.get(), 0, num - 1, Set(d, T(128)),
                        buf.get());
      seconds.push_back(SecondsSince(t0));
      // 'Use' the result to prevent optimizing out the partition.
      sum += static_cast<double>(aligned.get()[num / 2]);
    }

    MakeResult<T>(Algo::kVQSort, dist, st, num, 1,
                  SummarizeMeasurements(seconds))
        .Print();
  }
  HWY_ASSERT(sum != 999999);  // Prevent optimizing out
}

HWY_NOINLINE void BenchAllPartition() {
  // Not interested in benchmark results for these targets
  if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4 ||
      HWY_TARGET == HWY_AVX2) {
    return;
  }

  BenchPartition<LaneTraits<OrderDescending>, float>();
  BenchPartition<LaneTraits<OrderAscending>, int64_t>();
  BenchPartition<Traits128<OrderDescending128>, uint64_t>();
}

template <class Traits, typename T>
HWY_NOINLINE void BenchBase(std::vector<Result>& results) {
  // Not interested in benchmark results for these targets
  if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4) {
    return;
  }

  const SortTag<T> d;
  detail::SharedTraits<Traits> st;
  const Dist dist = Dist::kUniform32;

  const size_t N = Lanes(d);
  const size_t num = SortConstants::BaseCaseNum(N);
  auto keys = hwy::AllocateAligned<T>(num);
  auto buf = hwy::AllocateAligned<T>(num + N);

  std::vector<double> seconds;
  double sum = 0;                             // prevents elision
  constexpr size_t kMul = AdjustedReps(600);  // ensures long enough to measure

  for (size_t rep = 0; rep < kReps; ++rep) {
    InputStats<T> input_stats = GenerateInput(dist, keys.get(), num);

    const Timestamp t0;
    for (size_t i = 0; i < kMul; ++i) {
      detail::BaseCase(d, st, keys.get(), num, buf.get());
      sum += static_cast<double>(keys[0]);
    }
    seconds.push_back(SecondsSince(t0));
    // printf("%f\n", seconds.back());

    HWY_ASSERT(VerifySort(st, input_stats, keys.get(), num, "BenchBase"));
  }
  HWY_ASSERT(sum < 1E99);
  results.push_back(MakeResult<T>(Algo::kVQSort, dist, st, num * kMul, 1,
                                  SummarizeMeasurements(seconds)));
}

HWY_NOINLINE void BenchAllBase() {
  // Not interested in benchmark results for these targets
  if (HWY_TARGET == HWY_SSSE3) {
    return;
  }

  std::vector<Result> results;
  BenchBase<LaneTraits<OrderAscending>, float>(results);
  BenchBase<LaneTraits<OrderDescending>, int64_t>(results);
  BenchBase<Traits128<OrderAscending128>, uint64_t>(results);
  for (const Result& r : results) {
    r.Print();
  }
}

std::vector<Algo> AlgoForBench() {
  return {
#if HAVE_AVX2SORT
    Algo::kSEA,
#endif
#if HAVE_PARALLEL_IPS4O
    Algo::kParallelIPS4O,
#endif
#if HAVE_IPS4O
    Algo::kIPS4O,
#endif
#if HAVE_PDQSORT
    Algo::kPDQ,
#endif
#if HAVE_SORT512
    Algo::kSort512,
#endif
    // Algo::kStd,   // too slow to always benchmark
    // Algo::kHeap,  // too slow to always benchmark
    Algo::kVQSort,
  };
}

template <class Traits, typename T>
HWY_NOINLINE void BenchSort(size_t num) {
  SharedState shared;
  detail::SharedTraits<Traits> st;
  auto aligned = hwy::AllocateAligned<T>(num);
  for (Algo algo : AlgoForBench()) {
    for (Dist dist : AllDist()) {
      std::vector<double> seconds;
      for (size_t rep = 0; rep < kReps; ++rep) {
        InputStats<T> input_stats = GenerateInput(dist, aligned.get(), num);

        const Timestamp t0;
        Run<typename Traits::Order>(algo, aligned.get(), num, shared,
                                    /*thread=*/0);
        seconds.push_back(SecondsSince(t0));
        // printf("%f\n", seconds.back());

        HWY_ASSERT(
            VerifySort(st, input_stats, aligned.get(), num, "BenchSort"));
      }
      MakeResult<T>(algo, dist, st, num, 1, SummarizeMeasurements(seconds))
          .Print();
    }  // dist
  }    // algo
}

HWY_NOINLINE void BenchAllSort() {
  // Not interested in benchmark results for these targets
  if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4) {
    return;
  }

  constexpr size_t K = 1000;
  constexpr size_t M = K * K;
  (void)K;
  (void)M;
  for (size_t num : {
#if HAVE_PARALLEL_IPS4O
           100 * M,
#else
           AdjustedReps(1 * M),
#endif
       }) {
    // BenchSort<LaneTraits<OrderAscending>, float>(num);
    // BenchSort<LaneTraits<OrderDescending>, double>(num);
    // BenchSort<LaneTraits<OrderAscending>, int16_t>(num);
    BenchSort<LaneTraits<OrderDescending>, int32_t>(num);
    BenchSort<LaneTraits<OrderAscending>, int64_t>(num);
    // BenchSort<LaneTraits<OrderDescending>, uint16_t>(num);
    // BenchSort<LaneTraits<OrderDescending>, uint32_t>(num);
    // BenchSort<LaneTraits<OrderAscending>, uint64_t>(num);

    BenchSort<Traits128<OrderAscending128>, uint64_t>(num);
    // BenchSort<Traits128<OrderAscending128>, uint64_t>(num);
  }
}

#else
void BenchAllPartition() {}
void BenchAllBase() {}
void BenchAllSort() {}
#endif

}  // namespace
// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE

namespace hwy {
namespace {
HWY_BEFORE_TEST(BenchSort);
HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllPartition);
HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllBase);
HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllSort);
}  // namespace
}  // namespace hwy

// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}

#endif  // HWY_ONCE
@@ -1,30 +0,0 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Speed up MSVC builds by building fewer targets. This header must be included
// from all TUs that contain a HWY_DYNAMIC_DISPATCH to vqsort, i.e. vqsort_*.cc.
// However, users of vqsort.h are unaffected.

#ifndef HIGHWAY_HWY_CONTRIB_SORT_DISABLED_TARGETS_H_
#define HIGHWAY_HWY_CONTRIB_SORT_DISABLED_TARGETS_H_

#include "hwy/base.h"

#if HWY_COMPILER_MSVC
#undef HWY_DISABLED_TARGETS
// HWY_SCALAR remains, so there will still be a valid target to call.
#define HWY_DISABLED_TARGETS (HWY_SSSE3 | HWY_SSE4)
#endif  // HWY_COMPILER_MSVC

#endif  // HIGHWAY_HWY_CONTRIB_SORT_DISABLED_TARGETS_H_
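
A sketch of the intended include order in one of the vqsort_*.cc translation units named above. The exact file contents here are an assumption for illustration; the key point, implied by the comment above, is that this header must be seen before foreach_target.h so the HWY_DISABLED_TARGETS override takes effect:

// Hypothetical vqsort_f32a.cc-style TU (illustration only):
#include "hwy/contrib/sort/disabled_targets.h"  // first, to disable targets
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f32a.cc"
#include "hwy/foreach_target.h"
#include "hwy/contrib/sort/vqsort-inl.h"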

@@ -1,190 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <stdio.h>

#include <algorithm>

#include "hwy/base.h"

// Based on A.7 in "Entwurf und Implementierung vektorisierter
// Sortieralgorithmen" (Design and Implementation of Vectorized Sorting
// Algorithms) and code by Mark Blacher.
void PrintMergeNetwork16x2() {
  for (int i = 8; i < 16; ++i) {
    printf("v%x = st.SwapAdjacent(d, v%x);\n", i, i);
  }
  for (int i = 0; i < 8; ++i) {
    printf("st.Sort2(d, v%x, v%x);\n", i, 15 - i);
  }
  for (int i = 0; i < 4; ++i) {
    printf("v%x = st.SwapAdjacent(d, v%x);\n", i + 4, i + 4);
    printf("v%x = st.SwapAdjacent(d, v%x);\n", i + 12, i + 12);
  }
  for (int i = 0; i < 4; ++i) {
    printf("st.Sort2(d, v%x, v%x);\n", i, 7 - i);
    printf("st.Sort2(d, v%x, v%x);\n", i + 8, 15 - i);
  }
  for (int i = 0; i < 16; i += 4) {
    printf("v%x = st.SwapAdjacent(d, v%x);\n", i + 2, i + 2);
    printf("v%x = st.SwapAdjacent(d, v%x);\n", i + 3, i + 3);
  }
  for (int i = 0; i < 16; i += 4) {
    printf("st.Sort2(d, v%x, v%x);\n", i, i + 3);
    printf("st.Sort2(d, v%x, v%x);\n", i + 1, i + 2);
  }
  for (int i = 0; i < 16; i += 2) {
    printf("v%x = st.SwapAdjacent(d, v%x);\n", i + 1, i + 1);
  }
  for (int i = 0; i < 16; i += 2) {
    printf("st.Sort2(d, v%x, v%x);\n", i, i + 1);
  }
  for (int i = 0; i < 16; ++i) {
    printf("v%x = st.SortPairsDistance1<kOrder>(d, v%x);\n", i, i);
  }
  printf("\n");
}

void PrintMergeNetwork16x4() {
  printf("\n");

  for (int i = 8; i < 16; ++i) {
    printf("v%x = st.Reverse4(d, v%x);\n", i, i);
  }
  for (int i = 0; i < 8; ++i) {
    printf("st.Sort2(d, v%x, v%x);\n", i, 15 - i);
  }
  for (int i = 0; i < 4; ++i) {
    printf("v%x = st.Reverse4(d, v%x);\n", i + 4, i + 4);
    printf("v%x = st.Reverse4(d, v%x);\n", i + 12, i + 12);
  }
  for (int i = 0; i < 4; ++i) {
    printf("st.Sort2(d, v%x, v%x);\n", i, 7 - i);
    printf("st.Sort2(d, v%x, v%x);\n", i + 8, 15 - i);
  }
  for (int i = 0; i < 16; i += 4) {
    printf("v%x = st.Reverse4(d, v%x);\n", i + 2, i + 2);
    printf("v%x = st.Reverse4(d, v%x);\n", i + 3, i + 3);
  }
  for (int i = 0; i < 16; i += 4) {
    printf("st.Sort2(d, v%x, v%x);\n", i, i + 3);
    printf("st.Sort2(d, v%x, v%x);\n", i + 1, i + 2);
  }
  for (int i = 0; i < 16; i += 2) {
    printf("v%x = st.Reverse4(d, v%x);\n", i + 1, i + 1);
  }
  for (int i = 0; i < 16; i += 2) {
    printf("st.Sort2(d, v%x, v%x);\n", i, i + 1);
  }
  for (int i = 0; i < 16; ++i) {
    printf("v%x = st.SortPairsReverse4(d, v%x);\n", i, i);
  }
  for (int i = 0; i < 16; ++i) {
    printf("v%x = st.SortPairsDistance1<kOrder>(d, v%x);\n", i, i);
  }
}

void PrintMergeNetwork16x8() {
  printf("\n");

  for (int i = 8; i < 16; ++i) {
    printf("v%x = st.ReverseKeys8(d, v%x);\n", i, i);
  }
  for (int i = 0; i < 8; ++i) {
    printf("st.Sort2(d, v%x, v%x);\n", i, 15 - i);
  }
  for (int i = 0; i < 4; ++i) {
    printf("v%x = st.ReverseKeys8(d, v%x);\n", i + 4, i + 4);
    printf("v%x = st.ReverseKeys8(d, v%x);\n", i + 12, i + 12);
  }
  for (int i = 0; i < 4; ++i) {
    printf("st.Sort2(d, v%x, v%x);\n", i, 7 - i);
    printf("st.Sort2(d, v%x, v%x);\n", i + 8, 15 - i);
  }
  for (int i = 0; i < 16; i += 4) {
    printf("v%x = st.ReverseKeys8(d, v%x);\n", i + 2, i + 2);
    printf("v%x = st.ReverseKeys8(d, v%x);\n", i + 3, i + 3);
  }
  for (int i = 0; i < 16; i += 4) {
    printf("st.Sort2(d, v%x, v%x);\n", i, i + 3);
    printf("st.Sort2(d, v%x, v%x);\n", i + 1, i + 2);
  }
  for (int i = 0; i < 16; i += 2) {
    printf("v%x = st.ReverseKeys8(d, v%x);\n", i + 1, i + 1);
  }
  for (int i = 0; i < 16; i += 2) {
    printf("st.Sort2(d, v%x, v%x);\n", i, i + 1);
  }
  for (int i = 0; i < 16; ++i) {
    printf("v%x = st.SortPairsReverse8(d, v%x);\n", i, i);
  }
  for (int i = 0; i < 16; ++i) {
    printf("v%x = st.SortPairsDistance2<kOrder>(d, v%x);\n", i, i);
  }
  for (int i = 0; i < 16; ++i) {
    printf("v%x = st.SortPairsDistance1<kOrder>(d, v%x);\n", i, i);
  }
}

void PrintMergeNetwork16x16() {
  printf("\n");

  for (int i = 8; i < 16; ++i) {
    printf("v%x = st.ReverseKeys16(d, v%x);\n", i, i);
  }
  for (int i = 0; i < 8; ++i) {
    printf("st.Sort2(d, v%x, v%x);\n", i, 15 - i);
  }
  for (int i = 0; i < 4; ++i) {
    printf("v%x = st.ReverseKeys16(d, v%x);\n", i + 4, i + 4);
    printf("v%x = st.ReverseKeys16(d, v%x);\n", i + 12, i + 12);
  }
  for (int i = 0; i < 4; ++i) {
    printf("st.Sort2(d, v%x, v%x);\n", i, 7 - i);
    printf("st.Sort2(d, v%x, v%x);\n", i + 8, 15 - i);
  }
  for (int i = 0; i < 16; i += 4) {
    printf("v%x = st.ReverseKeys16(d, v%x);\n", i + 2, i + 2);
    printf("v%x = st.ReverseKeys16(d, v%x);\n", i + 3, i + 3);
  }
  for (int i = 0; i < 16; i += 4) {
    printf("st.Sort2(d, v%x, v%x);\n", i, i + 3);
    printf("st.Sort2(d, v%x, v%x);\n", i + 1, i + 2);
  }
  for (int i = 0; i < 16; i += 2) {
    printf("v%x = st.ReverseKeys16(d, v%x);\n", i + 1, i + 1);
  }
  for (int i = 0; i < 16; i += 2) {
    printf("st.Sort2(d, v%x, v%x);\n", i, i + 1);
  }
  for (int i = 0; i < 16; ++i) {
    printf("v%x = st.SortPairsReverse16<kOrder>(d, v%x);\n", i, i);
  }
  for (int i = 0; i < 16; ++i) {
    printf("v%x = st.SortPairsDistance4<kOrder>(d, v%x);\n", i, i);
  }
  for (int i = 0; i < 16; ++i) {
    printf("v%x = st.SortPairsDistance2<kOrder>(d, v%x);\n", i, i);
  }
  for (int i = 0; i < 16; ++i) {
    printf("v%x = st.SortPairsDistance1<kOrder>(d, v%x);\n", i, i);
  }
}

int main(int argc, char** argv) {
  PrintMergeNetwork16x2();
  PrintMergeNetwork16x4();
  PrintMergeNetwork16x8();
  PrintMergeNetwork16x16();
  return 0;
}
@@ -1,149 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/algo-inl.h"

// Normal include guard for non-SIMD parts
#ifndef HIGHWAY_HWY_CONTRIB_SORT_RESULT_INL_H_
#define HIGHWAY_HWY_CONTRIB_SORT_RESULT_INL_H_

#include <time.h>

#include <algorithm>  // std::sort
#include <string>
#include <vector>

#include "hwy/base.h"
#include "hwy/nanobenchmark.h"

namespace hwy {

struct Timestamp {
  Timestamp() { t = platform::Now(); }
  double t;
};

double SecondsSince(const Timestamp& t0) {
  const Timestamp t1;
  return t1.t - t0.t;
}

constexpr size_t kReps = 30;

// Returns trimmed mean (we don't want to run an out-of-L3-cache sort often
// enough for the mode to be reliable).
double SummarizeMeasurements(std::vector<double>& seconds) {
  std::sort(seconds.begin(), seconds.end());
  double sum = 0;
  int count = 0;
  for (size_t i = kReps / 4; i < seconds.size() - kReps / 2; ++i) {
    sum += seconds[i];
    count += 1;
  }
  return sum / count;
}
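
Taken together, a minimal sketch of how these helpers combine; this mirrors the measurement loops in the benchmarks, and MeasureTrimmedMeanSeconds is a hypothetical wrapper, not part of this file:

// Hypothetical wrapper: times `work` kReps times and summarizes.
template <class Func>
double MeasureTrimmedMeanSeconds(const Func& work) {
  std::vector<double> seconds;
  for (size_t rep = 0; rep < kReps; ++rep) {
    const Timestamp t0;
    work();
    seconds.push_back(SecondsSince(t0));
  }
  // Trimmed mean: after sorting, drops the fastest quarter and the
  // slowest half of the kReps measurements.
  return SummarizeMeasurements(seconds);
}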

}  // namespace hwy
#endif  // HIGHWAY_HWY_CONTRIB_SORT_RESULT_INL_H_

// Per-target
#if defined(HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE) == \
    defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE
#undef HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE
#else
#define HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE
#endif

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

struct Result {
  Result() {}
  Result(const uint32_t target, const Algo algo, Dist dist, bool is128,
         size_t num, size_t num_threads, double sec, size_t sizeof_t,
         const char* type_name)
      : target(target),
        algo(algo),
        dist(dist),
        is128(is128),
        num(num),
        num_threads(num_threads),
        sec(sec),
        sizeof_t(sizeof_t),
        type_name(type_name) {}

  void Print() const {
    const double bytes = static_cast<double>(num) *
                         static_cast<double>(num_threads) *
                         static_cast<double>(sizeof_t);
    printf("%10s: %12s: %7s: %9s: %.2E %4.0f MB/s (%2zu threads)\n",
           hwy::TargetName(target), AlgoName(algo),
           is128 ? "u128" : type_name.c_str(), DistName(dist),
           static_cast<double>(num), bytes * 1E-6 / sec, num_threads);
  }

  uint32_t target;
  Algo algo;
  Dist dist;
  bool is128;
  size_t num = 0;
  size_t num_threads = 0;
  double sec = 0.0;
  size_t sizeof_t = 0;
  std::string type_name;
};

template <typename T, class Traits>
Result MakeResult(const Algo algo, Dist dist, Traits st, size_t num,
                  size_t num_threads, double sec) {
  char string100[100];
  hwy::detail::TypeName(hwy::detail::MakeTypeInfo<T>(), 1, string100);
  return Result(HWY_TARGET, algo, dist, st.Is128(), num, num_threads, sec,
                sizeof(T), string100);
}

template <class Traits, typename T>
bool VerifySort(Traits st, const InputStats<T>& input_stats, const T* out,
                size_t num, const char* caller) {
  constexpr size_t N1 = st.Is128() ? 2 : 1;
  HWY_ASSERT(num >= N1);

  InputStats<T> output_stats;
  // Ensure it matches the sort order
  for (size_t i = 0; i < num - N1; i += N1) {
    output_stats.Notify(out[i]);
    if (N1 == 2) output_stats.Notify(out[i + 1]);
    // Reverse order instead of checking !Compare1 so we accept equal keys.
    if (st.Compare1(out + i + N1, out + i)) {
      printf("%s: i=%d of %d: N1=%d %5.0f %5.0f vs. %5.0f %5.0f\n\n", caller,
             static_cast<int>(i), static_cast<int>(num), static_cast<int>(N1),
             double(out[i + 1]), double(out[i + 0]), double(out[i + N1 + 1]),
             double(out[i + N1]));
      HWY_ABORT("%d-bit sort is incorrect\n",
                static_cast<int>(sizeof(T) * 8 * N1));
    }
  }
  output_stats.Notify(out[num - N1]);
  if (N1 == 2) output_stats.Notify(out[num - N1 + 1]);

  return input_stats == output_stats;
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#endif  // HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE
@@ -1,104 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Definitions shared between vqsort-inl and sorting_networks-inl.

// Normal include guard for target-independent parts
#ifndef HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_
#define HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_

#include "hwy/base.h"

namespace hwy {

// Internal constants - these are to avoid magic numbers/literals and cannot be
// changed without also changing the associated code.
struct SortConstants {
  // SortingNetwork reshapes its input into a matrix. This is the maximum
  // number of *keys* per vector.
#if HWY_COMPILER_MSVC
  static constexpr size_t kMaxCols = 8;  // avoids build timeout
#else
  static constexpr size_t kMaxCols = 16;  // enough for u32 in 512-bit vector
#endif

  // 16 rows is a compromise between using the 32 AVX-512/SVE/RVV registers,
  // fitting within 16 AVX2 registers with only a few spills, keeping BaseCase
  // code size reasonable (7 KiB for AVX-512 and 16 cols), and minimizing the
  // extra logN factor for larger networks (for which only loose upper bounds
  // on size are known).
  static constexpr size_t kMaxRowsLog2 = 4;
  static constexpr size_t kMaxRows = size_t{1} << kMaxRowsLog2;

  static HWY_INLINE size_t BaseCaseNum(size_t N) {
    return kMaxRows * HWY_MIN(N, kMaxCols);
  }

  // Unrolling is important (pipelining and amortizing branch mispredictions);
  // 2x is sufficient to reach full memory bandwidth on SKX in Partition, but
  // somewhat slower for sorting than 4x.
  //
  // To change, must also update left + 3 * N etc. in the loop.
  static constexpr size_t kPartitionUnroll = 4;

  static HWY_INLINE size_t PartitionBufNum(size_t N) {
    // The main loop reads kPartitionUnroll vectors, and first loads from
    // both left and right beforehand, so it requires min = 2 *
    // kPartitionUnroll vectors. To handle smaller amounts (only guaranteed
    // >= BaseCaseNum), we partition the right side into a buffer. We need
    // another vector at the end so CompressStore does not overwrite anything.
    return (2 * kPartitionUnroll + 1) * N;
  }

  // Chunk := group of keys loaded for sampling a pivot. Matches the typical
  // cache line size of 64 bytes to get maximum benefit per L2 miss. If vectors
  // are larger, use entire vectors to ensure we do not overrun the array.
  static HWY_INLINE size_t LanesPerChunk(size_t sizeof_t, size_t N) {
    return HWY_MAX(64 / sizeof_t, N);
  }
};
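
To make the constants concrete, a small worked example under an assumed vector width (N = 8 lanes, e.g. float on a 256-bit target; the numbers follow directly from the definitions above):

// Worked example for hypothetical N = 8 lanes (not part of the original file):
static_assert(SortConstants::kMaxRows == 16, "kMaxRows is 1 << 4");
// BaseCaseNum(8)     = 16 * HWY_MIN(8, kMaxCols) = 128 keys per base case.
// PartitionBufNum(8) = (2 * 4 + 1) * 8           = 72 lanes of scratch.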

}  // namespace hwy

#endif  // HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_

// Per-target
#if defined(HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE) == \
    defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
#undef HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
#else
#define HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
#endif

#include "hwy/highway.h"

namespace hwy {
namespace HWY_NAMESPACE {

// Default tag / vector width selector.
// TODO(janwas): enable once LMUL < 1 is supported.
#if HWY_TARGET == HWY_RVV && 0
template <typename T>
using SortTag = ScalableTag<T, -1>;
#else
template <typename T>
using SortTag = ScalableTag<T>;
#endif

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy

#endif  // HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
@ -12,553 +12,160 @@
|
|||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <inttypes.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// clang-format off
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/sort_test.cc"
|
||||
#include "hwy/foreach_target.h"
|
||||
|
||||
#include "hwy/contrib/sort/vqsort.h"
|
||||
// After foreach_target
|
||||
#include "hwy/contrib/sort/algo-inl.h"
|
||||
#include "hwy/contrib/sort/result-inl.h"
|
||||
#include "hwy/contrib/sort/vqsort-inl.h" // BaseCase
|
||||
#include "hwy/contrib/sort/sort-inl.h"
|
||||
#include "hwy/tests/test_util-inl.h"
|
||||
// clang-format on
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h> // memcpy
|
||||
|
||||
#include <algorithm> // std::max
|
||||
#include <vector>
|
||||
|
||||
#undef VQSORT_TEST_IMPL
|
||||
#if (HWY_TARGET == HWY_SCALAR) || (defined(_MSC_VER) && !HWY_IS_DEBUG_BUILD)
|
||||
// Scalar does not implement these, and MSVC non-debug builds time out.
|
||||
#define VQSORT_TEST_IMPL 0
|
||||
#else
|
||||
#define VQSORT_TEST_IMPL 1
|
||||
#endif
|
||||
|
||||
#undef VQSORT_TEST_SORT
|
||||
// MSVC non-debug builds time out.
|
||||
#if defined(_MSC_VER) && !HWY_IS_DEBUG_BUILD
|
||||
#define VQSORT_TEST_SORT 0
|
||||
#else
|
||||
#define VQSORT_TEST_SORT 1
|
||||
#endif
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
namespace {
|
||||
|
||||
#if VQSORT_TEST_IMPL || VQSORT_TEST_SORT
|
||||
using detail::LaneTraits;
|
||||
using detail::OrderAscending;
|
||||
using detail::OrderAscending128;
|
||||
using detail::OrderDescending;
|
||||
using detail::OrderDescending128;
|
||||
using detail::SharedTraits;
|
||||
using detail::Traits128;
|
||||
#endif
|
||||
#if HWY_TARGET != HWY_SCALAR && HWY_ARCH_X86
|
||||
|
||||
#if !VQSORT_TEST_IMPL
|
||||
static void TestAllMedian() {}
|
||||
static void TestAllBaseCase() {}
|
||||
static void TestAllPartition() {}
|
||||
static void TestAllGenerator() {}
|
||||
#else
|
||||
|
||||
template <class Traits>
|
||||
static HWY_NOINLINE void TestMedian3() {
|
||||
using T = uint64_t;
|
||||
using D = CappedTag<T, 1>;
|
||||
SharedTraits<Traits> st;
|
||||
const D d;
|
||||
using V = Vec<D>;
|
||||
for (uint32_t bits = 0; bits < 8; ++bits) {
|
||||
const V v0 = Set(d, T{(bits & (1u << 0)) ? 1u : 0u});
|
||||
const V v1 = Set(d, T{(bits & (1u << 1)) ? 1u : 0u});
|
||||
const V v2 = Set(d, T{(bits & (1u << 2)) ? 1u : 0u});
|
||||
const T m = GetLane(detail::MedianOf3(st, v0, v1, v2));
|
||||
// If at least half(rounded up) of bits are 1, so is the median.
|
||||
const size_t count = PopCount(bits);
|
||||
HWY_ASSERT_EQ((count >= 2) ? static_cast<T>(1) : 0, m);
|
||||
}
|
||||
template <class D>
|
||||
size_t K(D d) {
|
||||
return SortBatchSize(d);
|
||||
}
|
||||
|
||||
HWY_NOINLINE void TestAllMedian() {
|
||||
TestMedian3<LaneTraits<OrderAscending> >();
|
||||
}
|
||||
|
||||
template <class Traits, typename T>
|
||||
static HWY_NOINLINE void TestBaseCaseAscDesc() {
|
||||
SharedTraits<Traits> st;
|
||||
const SortTag<T> d;
|
||||
template <SortOrder kOrder, class D>
|
||||
void Validate(D d, const TFromD<D>* in, const TFromD<D>* out) {
|
||||
const size_t N = Lanes(d);
|
||||
const size_t base_case_num = SortConstants::BaseCaseNum(N);
|
||||
const size_t N1 = st.LanesPerKey();
|
||||
|
||||
constexpr int kDebug = 0;
|
||||
auto aligned_keys = hwy::AllocateAligned<T>(N + base_case_num + N);
|
||||
auto buf = hwy::AllocateAligned<T>(base_case_num + 2 * N);
|
||||
|
||||
std::vector<size_t> lengths;
|
||||
lengths.push_back(HWY_MAX(1, N1));
|
||||
lengths.push_back(3 * N1);
|
||||
lengths.push_back(base_case_num / 2);
|
||||
lengths.push_back(base_case_num / 2 + N1);
|
||||
lengths.push_back(base_case_num - N1);
|
||||
lengths.push_back(base_case_num);
|
||||
|
||||
std::vector<size_t> misalignments;
|
||||
misalignments.push_back(0);
|
||||
misalignments.push_back(1);
|
||||
if (N >= 6) misalignments.push_back(N / 2 - 1);
|
||||
misalignments.push_back(N / 2);
|
||||
misalignments.push_back(N / 2 + 1);
|
||||
misalignments.push_back(HWY_MIN(2 * N / 3 + 3, size_t{N - 1}));
|
||||
|
||||
for (bool asc : {false, true}) {
|
||||
for (size_t len : lengths) {
|
||||
for (size_t misalign : misalignments) {
|
||||
T* HWY_RESTRICT keys = aligned_keys.get() + misalign;
|
||||
if (kDebug) {
|
||||
printf("============%s asc %d N1 %d len %d misalign %d\n",
|
||||
hwy::TypeName(T(), 1).c_str(), asc, static_cast<int>(N1),
|
||||
static_cast<int>(len), static_cast<int>(misalign));
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < misalign; ++i) {
|
||||
aligned_keys[i] = hwy::LowestValue<T>();
|
||||
}
|
||||
InputStats<T> input_stats;
|
||||
for (size_t i = 0; i < len; ++i) {
|
||||
keys[i] =
|
||||
asc ? static_cast<T>(T(i) + 1) : static_cast<T>(T(len) - T(i));
|
||||
input_stats.Notify(keys[i]);
|
||||
if (kDebug >= 2) printf("%3zu: %f\n", i, double(keys[i]));
|
||||
}
|
||||
for (size_t i = len; i < base_case_num + N; ++i) {
|
||||
keys[i] = hwy::LowestValue<T>();
|
||||
}
|
||||
|
||||
detail::BaseCase(d, st, keys, len, buf.get());
|
||||
|
||||
if (kDebug >= 2) {
|
||||
printf("out>>>>>>\n");
|
||||
for (size_t i = 0; i < len; ++i) {
|
||||
printf("%3zu: %f\n", i, double(keys[i]));
|
||||
}
|
||||
}
|
||||
|
||||
HWY_ASSERT(VerifySort(st, input_stats, keys, len, "BaseAscDesc"));
|
||||
for (size_t i = 0; i < misalign; ++i) {
|
||||
if (aligned_keys[i] != hwy::LowestValue<T>())
|
||||
HWY_ABORT("Overrun misalign at %d\n", static_cast<int>(i));
|
||||
}
|
||||
for (size_t i = len; i < base_case_num + N; ++i) {
|
||||
if (keys[i] != hwy::LowestValue<T>())
|
||||
HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
|
||||
}
|
||||
} // misalign
|
||||
} // len
|
||||
} // asc
|
||||
}
|
||||
|
||||
template <class Traits, typename T>
static HWY_NOINLINE void TestBaseCase01() {
  SharedTraits<Traits> st;
  const SortTag<T> d;
  const size_t N = Lanes(d);
  const size_t base_case_num = SortConstants::BaseCaseNum(N);
  const size_t N1 = st.LanesPerKey();

  constexpr int kDebug = 0;
  auto keys = hwy::AllocateAligned<T>(base_case_num + N);
  auto buf = hwy::AllocateAligned<T>(base_case_num + 2 * N);

  std::vector<size_t> lengths;
  lengths.push_back(HWY_MAX(1, N1));
  lengths.push_back(3 * N1);
  lengths.push_back(base_case_num / 2);
  lengths.push_back(base_case_num / 2 + N1);
  lengths.push_back(base_case_num - N1);
  lengths.push_back(base_case_num);

  for (size_t len : lengths) {
    if (kDebug) {
      printf("============%s 01 N1 %d len %d\n", hwy::TypeName(T(), 1).c_str(),
             static_cast<int>(N1), static_cast<int>(len));
    }
    const uint64_t kMaxBits = AdjustedLog2Reps(HWY_MIN(len, size_t{14}));
    for (uint64_t bits = 0; bits < ((1ull << kMaxBits) - 1); ++bits) {
      InputStats<T> input_stats;
      for (size_t i = 0; i < len; ++i) {
        keys[i] = (i < 64 && (bits & (1ull << i))) ? 1 : 0;
        input_stats.Notify(keys[i]);
        if (kDebug >= 2) printf("%3zu: %f\n", i, double(keys[i]));
      }
      for (size_t i = len; i < base_case_num + N; ++i) {
        keys[i] = hwy::LowestValue<T>();
      }

      detail::BaseCase(d, st, keys.get(), len, buf.get());

      if (kDebug >= 2) {
        printf("out>>>>>>\n");
        for (size_t i = 0; i < len; ++i) {
          printf("%3zu: %f\n", i, double(keys[i]));
        }
      }

      HWY_ASSERT(VerifySort(st, input_stats, keys.get(), len, "Base01"));
      for (size_t i = len; i < base_case_num + N; ++i) {
        if (keys[i] != hwy::LowestValue<T>())
          HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
      }
    }  // bits
  }  // len
}

template <class Traits, typename T>
static HWY_NOINLINE void TestBaseCase() {
  TestBaseCaseAscDesc<Traits, T>();
  TestBaseCase01<Traits, T>();
}

HWY_NOINLINE void TestAllBaseCase() {
// Workaround for stack overflow on MSVC debug.
#if defined(_MSC_VER) && HWY_IS_DEBUG_BUILD && (HWY_TARGET == HWY_AVX3)
  return;
#endif

  TestBaseCase<LaneTraits<OrderAscending>, int32_t>();
  TestBaseCase<LaneTraits<OrderDescending>, int64_t>();
  TestBaseCase<Traits128<OrderAscending128>, uint64_t>();
  TestBaseCase<Traits128<OrderDescending128>, uint64_t>();
}

template <class Traits, typename T>
static HWY_NOINLINE void VerifyPartition(Traits st, T* HWY_RESTRICT keys,
                                         size_t left, size_t border,
                                         size_t right, const size_t N1,
                                         const T* pivot) {
  /* for (size_t i = left; i < right; ++i) {
    if (i == border) printf("--\n");
    printf("%4zu: %3d\n", i, keys[i]);
  }*/

  HWY_ASSERT(left % N1 == 0);
  HWY_ASSERT(border % N1 == 0);
  HWY_ASSERT(right % N1 == 0);
  const bool asc = typename Traits::Order().IsAscending();
  for (size_t i = left; i < border; i += N1) {
    if (st.Compare1(pivot, keys + i)) {
      printf(
          "%s: asc %d left[%d] piv %.0f %.0f compares before %.0f %.0f "
          "border %d",
          hwy::TypeName(T(), 1).c_str(), asc, static_cast<int>(i),
          double(pivot[1]), double(pivot[0]), double(keys[i + 1]),
          double(keys[i + 0]), static_cast<int>(border));
      fflush(stdout);
      HWY_ABORT("Sort is incorrect");
    }
  }
  for (size_t i = border; i < right; i += N1) {
    if (!st.Compare1(pivot, keys + i)) {
      HWY_ABORT(
          "%s: asc %d right[%d] piv %.0f %.0f compares after %.0f %.0f "
          "border %d",
          hwy::TypeName(T(), 1).c_str(), asc, static_cast<int>(i),
          double(pivot[1]), double(pivot[0]), double(keys[i + 1]),
          double(keys[i]), static_cast<int>(border));
    }
  }
}

template <class Traits, typename T>
static HWY_NOINLINE void TestPartition() {
  const SortTag<T> d;
  SharedTraits<Traits> st;
  const bool asc = typename Traits::Order().IsAscending();
  const size_t N = Lanes(d);
  constexpr int kDebug = 0;
  const size_t base_case_num = SortConstants::BaseCaseNum(N);
  // left + len + align
  const size_t total = 32 + (base_case_num + 4 * HWY_MAX(N, 4)) + 2 * N;
  auto aligned_keys = hwy::AllocateAligned<T>(total);
  auto buf = hwy::AllocateAligned<T>(SortConstants::PartitionBufNum(N));

  const size_t N1 = st.LanesPerKey();
  for (bool in_asc : {false, true}) {
    for (int left_i : {0, 1, 2, 3, 4, 5, 6, 7, 8, 12, 15, 22, 28, 29, 30, 31}) {
      const size_t left = static_cast<size_t>(left_i) & ~(N1 - 1);
      for (size_t ofs : {N, N + 1, N + 2, N + 3, 2 * N, 2 * N + 1, 2 * N + 2,
                         2 * N + 3, 3 * N - 1, 4 * N - 3, 4 * N - 2}) {
        const size_t len = (base_case_num + ofs) & ~(N1 - 1);
        for (T pivot1 :
             {T(0), T(len / 3), T(len / 2), T(2 * len / 3), T(len)}) {
          const T pivot2[2] = {pivot1, 0};
          const auto pivot = st.SetKey(d, pivot2);
          for (size_t misalign = 0; misalign < N;
               misalign += st.LanesPerKey()) {
            T* HWY_RESTRICT keys = aligned_keys.get() + misalign;
            const size_t right = left + len;
            if (kDebug) {
              printf(
                  "=========%s asc %d left %d len %d right %d piv %.0f %.0f\n",
                  hwy::TypeName(T(), 1).c_str(), asc, static_cast<int>(left),
                  static_cast<int>(len), static_cast<int>(right),
                  double(pivot2[1]), double(pivot2[0]));
            }

            for (size_t i = 0; i < misalign; ++i) {
              aligned_keys[i] = hwy::LowestValue<T>();
            }
            for (size_t i = 0; i < left; ++i) {
              keys[i] = hwy::LowestValue<T>();
            }
            for (size_t i = left; i < right; ++i) {
              keys[i] = static_cast<T>(in_asc ? T(i + 1) - static_cast<T>(left)
                                              : static_cast<T>(right) - T(i));
              if (kDebug >= 2) printf("%3zu: %f\n", i, double(keys[i]));
            }
            for (size_t i = right; i < total - misalign; ++i) {
              keys[i] = hwy::LowestValue<T>();
            }

            size_t border =
                detail::Partition(d, st, keys, left, right, pivot, buf.get());

            if (kDebug >= 2) {
              printf("out>>>>>>\n");
              for (size_t i = left; i < right; ++i) {
                printf("%3zu: %f\n", i, double(keys[i]));
              }
              for (size_t i = right; i < total - misalign; ++i) {
                printf("%3zu: sentinel %f\n", i, double(keys[i]));
              }
            }

            VerifyPartition(st, keys, left, border, right, N1, pivot2);
            for (size_t i = 0; i < misalign; ++i) {
              if (aligned_keys[i] != hwy::LowestValue<T>())
                HWY_ABORT("Overrun misalign at %d\n", static_cast<int>(i));
            }
            for (size_t i = 0; i < left; ++i) {
              if (keys[i] != hwy::LowestValue<T>())
                HWY_ABORT("Overrun left at %d\n", static_cast<int>(i));
            }
            for (size_t i = right; i < total - misalign; ++i) {
              if (keys[i] != hwy::LowestValue<T>())
                HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
            }
          }  // misalign
        }  // pivot
      }  // len
    }  // left
  }  // asc
}

HWY_NOINLINE void TestAllPartition() {
  TestPartition<LaneTraits<OrderAscending>, int16_t>();
  TestPartition<LaneTraits<OrderDescending>, int32_t>();
  TestPartition<LaneTraits<OrderAscending>, int64_t>();
  TestPartition<LaneTraits<OrderDescending>, float>();
#if HWY_HAVE_FLOAT64
  TestPartition<LaneTraits<OrderDescending>, double>();
#endif
  TestPartition<Traits128<OrderAscending128>, uint64_t>();
  TestPartition<Traits128<OrderDescending128>, uint64_t>();
}

// (used for sample selection for choosing a pivot)
template <typename TU>
static HWY_NOINLINE void TestRandomGenerator() {
  static_assert(!hwy::IsSigned<TU>(), "");
  SortTag<TU> du;
  const size_t N = Lanes(du);

  detail::Generator rng(&N, N);

  const size_t lanes_per_block = HWY_MAX(64 / sizeof(TU), N);  // power of two

  for (uint32_t num_blocks = 2; num_blocks < 100000;
       num_blocks = 3 * num_blocks / 2) {
    // Generate some numbers and ensure all are in range
    uint64_t sum = 0;
    constexpr size_t kReps = 10000;
    for (size_t rep = 0; rep < kReps; ++rep) {
      const uint32_t bits = rng() & 0xFFFFFFFF;
      const size_t index = detail::RandomChunkIndex(num_blocks, bits);
      HWY_ASSERT(((index + 1) * lanes_per_block) <=
                 num_blocks * lanes_per_block);
      sum += index;
    }

    // Also ensure the mean is near the middle of the range
    const double expected = (num_blocks - 1) / 2.0;
    const double actual = double(sum) / kReps;
    HWY_ASSERT(0.9 * expected <= actual && actual <= 1.1 * expected);
  }
}
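
// Sketch of the invariant above (hypothetical values, illustration only): with
// num_blocks = 10, RandomChunkIndex must return an index in [0, 10), so
// (index + 1) * lanes_per_block never exceeds the buffer; over kReps draws,
// the sample mean should approach (10 - 1) / 2 = 4.5 within +/-10%.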
HWY_NOINLINE void TestAllGenerator() {
  TestRandomGenerator<uint32_t>();
  TestRandomGenerator<uint64_t>();
}

#endif  // VQSORT_TEST_IMPL

#if !VQSORT_TEST_SORT
static void TestAllSort() {}
#else

// Remembers input, and compares results to that of a reference algorithm.
template <class Traits, typename T>
class CompareResults {
 public:
  void SetInput(const T* in, size_t num) {
    copy_.resize(num);
    memcpy(copy_.data(), in, num * sizeof(T));
  }

  bool Verify(const T* output) {
#if HAVE_PDQSORT
    const Algo reference = Algo::kPDQ;
#else
    const Algo reference = Algo::kStd;
#endif
    SharedState shared;
    using Order = typename Traits::Order;
    Run<Order>(reference, copy_.data(), copy_.size(), shared,
               /*thread=*/0);

    for (size_t i = 0; i < copy_.size(); ++i) {
      if (copy_[i] != output[i]) {
        fprintf(stderr, "Asc %d mismatch at %d: %A %A\n", Order().IsAscending(),
                static_cast<int>(i), double(copy_[i]), double(output[i]));
        return false;
      }
    }
    return true;
  }

 private:
  std::vector<T> copy_;
};
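
// Usage sketch (mirrors TestSort below; all names are from this file):
//   CompareResults<Traits, T> compare;
//   compare.SetInput(keys, num);  // snapshot before sorting
//   Run<typename Traits::Order>(algo, keys, num, shared, /*thread=*/0);
//   HWY_ASSERT(compare.Verify(keys));  // re-sorts the snapshot via the
//   reference algorithm and compares element-wise.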
std::vector<Algo> AlgoForTest() {
  return {
#if HAVE_AVX2SORT
    Algo::kSEA,
#endif
#if HAVE_IPS4O
    Algo::kIPS4O,
#endif
#if HAVE_PDQSORT
    Algo::kPDQ,
#endif
#if HAVE_SORT512
    Algo::kSort512,
#endif
    Algo::kHeap, Algo::kVQSort,
  };
}

template <class Traits, typename T>
void TestSort(size_t num) {
  // TODO(janwas): fix
  if (HWY_TARGET == HWY_SSSE3) return;
// Workaround for stack overflow on clang-cl (/F 8388608 does not help).
#if defined(_MSC_VER) && HWY_IS_DEBUG_BUILD && (HWY_TARGET == HWY_AVX3)
  return;
#endif

  SharedState shared;
  SharedTraits<Traits> st;

  constexpr size_t kMaxMisalign = 16;
  auto aligned = hwy::AllocateAligned<T>(kMaxMisalign + num + kMaxMisalign);
  for (Algo algo : AlgoForTest()) {
#if HAVE_IPS4O
    if (st.Is128() && (algo == Algo::kIPS4O || algo == Algo::kParallelIPS4O)) {
      continue;
    }
#endif
    for (Dist dist : AllDist()) {
      for (size_t misalign : {size_t{0}, size_t{st.LanesPerKey()},
                              size_t{3 * st.LanesPerKey()}, kMaxMisalign / 2}) {
        T* keys = aligned.get() + misalign;

        // Set up red zones before/after the keys to sort
        for (size_t i = 0; i < misalign; ++i) {
          aligned[i] = hwy::LowestValue<T>();
        }
        for (size_t i = 0; i < kMaxMisalign; ++i) {
          keys[num + i] = hwy::HighestValue<T>();
        }
#if HWY_IS_MSAN
        __msan_poison(aligned.get(), misalign * sizeof(T));
        __msan_poison(keys + num, kMaxMisalign * sizeof(T));
#endif
        InputStats<T> input_stats = GenerateInput(dist, keys, num);

        CompareResults<Traits, T> compare;
        compare.SetInput(keys, num);

        Run<typename Traits::Order>(algo, keys, num, shared, /*thread=*/0);
        HWY_ASSERT(compare.Verify(keys));
        HWY_ASSERT(VerifySort(st, input_stats, keys, num, "TestSort"));

        // Check red zones
#if HWY_IS_MSAN
        __msan_unpoison(aligned.get(), misalign * sizeof(T));
        __msan_unpoison(keys + num, kMaxMisalign * sizeof(T));
#endif
        for (size_t i = 0; i < misalign; ++i) {
          if (aligned[i] != hwy::LowestValue<T>())
            HWY_ABORT("Overrun left at %d\n", static_cast<int>(i));
        }
        for (size_t i = num; i < num + kMaxMisalign; ++i) {
          if (keys[i] != hwy::HighestValue<T>())
            HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
        }
      }  // misalign
    }  // dist
  }  // algo
}

void TestAllSort() {
  const size_t num = 15 * 1000;

  TestSort<LaneTraits<OrderAscending>, int16_t>(num);
  TestSort<LaneTraits<OrderDescending>, uint16_t>(num);

  TestSort<LaneTraits<OrderDescending>, int32_t>(num);
  TestSort<LaneTraits<OrderDescending>, uint32_t>(num);

  TestSort<LaneTraits<OrderAscending>, int64_t>(num);
  TestSort<LaneTraits<OrderAscending>, uint64_t>(num);

  // WARNING: for float types, SIMD comparisons will flush denormals to zero,
  // causing mismatches with scalar sorts. In this test, we avoid generating
  // denormal inputs.
  TestSort<LaneTraits<OrderAscending>, float>(num);
#if HWY_HAVE_FLOAT64  // protects algo-inl's GenerateRandom
  if (Sorter::HaveFloat64()) {
    TestSort<LaneTraits<OrderDescending>, double>(num);
  }
#endif

  TestSort<Traits128<OrderAscending128>, uint64_t>(num);
  TestSort<Traits128<OrderDescending128>, uint64_t>(num);
}

#endif  // VQSORT_TEST_SORT

}  // namespace
// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy

@@ -567,14 +174,9 @@ HWY_AFTER_NAMESPACE();
#if HWY_ONCE

namespace hwy {
namespace {
HWY_BEFORE_TEST(SortTest);
HWY_EXPORT_AND_TEST_P(SortTest, TestAllMedian);
HWY_EXPORT_AND_TEST_P(SortTest, TestAllBaseCase);
HWY_EXPORT_AND_TEST_P(SortTest, TestAllPartition);
HWY_EXPORT_AND_TEST_P(SortTest, TestAllGenerator);
HWY_EXPORT_AND_TEST_P(SortTest, TestAllSort);
}  // namespace
}  // namespace hwy

// Ought not to be necessary, but without this, no tests run on RVV.
@@ -583,4 +185,4 @@ int main(int argc, char** argv) {
  return RUN_ALL_TESTS();
}

#endif  // HWY_ONCE
#endif

@@ -1,686 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Per-target
#if defined(HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE) == \
    defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
#undef HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
#else
#define HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
#endif

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/shared-inl.h"  // SortConstants
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {

using Constants = hwy::SortConstants;

// ------------------------------ SharedTraits

// Code shared between all traits. It's unclear whether these can profitably be
// specialized for Lane vs Block, or optimized like SortPairsDistance1 using
// Compare/DupOdd.
template <class Base>
struct SharedTraits : public Base {
  // Conditionally swaps lane 0 with 2, 1 with 3 etc.
  template <class D>
  HWY_INLINE Vec<D> SortPairsDistance2(D d, Vec<D> v) const {
    const Base* base = static_cast<const Base*>(this);
    Vec<D> swapped = base->SwapAdjacentPairs(d, v);
    base->Sort2(d, v, swapped);
    return base->OddEvenPairs(d, swapped, v);
  }
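
  // Worked example (a sketch with hypothetical lane values, not part of the
  // source): for ascending order and v = [5, 9, 2, 7], SwapAdjacentPairs
  // yields [2, 7, 5, 9]; Sort2 then sets v = Min = [2, 7, 2, 7] and
  // swapped = Max = [5, 9, 5, 9]; OddEvenPairs keeps lanes {0,1} from v and
  // lanes {2,3} from swapped -> [2, 7, 5, 9], so lane i <= lane i+2 holds.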

  // Swaps with the vector formed by reversing contiguous groups of 8 keys.
  template <class D>
  HWY_INLINE Vec<D> SortPairsReverse8(D d, Vec<D> v) const {
    const Base* base = static_cast<const Base*>(this);
    Vec<D> swapped = base->ReverseKeys8(d, v);
    base->Sort2(d, v, swapped);
    return base->OddEvenQuads(d, swapped, v);
  }

  // Swaps with the vector formed by reversing contiguous groups of 16 keys.
  template <class D>
  HWY_INLINE Vec<D> SortPairsReverse16(D d, Vec<D> v) const {
    const Base* base = static_cast<const Base*>(this);
    static_assert(Constants::kMaxCols <= 16, "Need actual Reverse16");
    Vec<D> swapped = base->ReverseKeys(d, v);
    base->Sort2(d, v, swapped);
    return ConcatUpperLower(d, swapped, v);  // 8 = half of the vector
  }
};

// ------------------------------ Sorting network

// (Green's irregular) sorting network for independent columns in 16 vectors.
template <class D, class Traits, class V = Vec<D>>
HWY_INLINE void Sort16(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
                       V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd,
                       V& ve, V& vf) {
  st.Sort2(d, v0, v1);
  st.Sort2(d, v2, v3);
  st.Sort2(d, v4, v5);
  st.Sort2(d, v6, v7);
  st.Sort2(d, v8, v9);
  st.Sort2(d, va, vb);
  st.Sort2(d, vc, vd);
  st.Sort2(d, ve, vf);
  st.Sort2(d, v0, v2);
  st.Sort2(d, v1, v3);
  st.Sort2(d, v4, v6);
  st.Sort2(d, v5, v7);
  st.Sort2(d, v8, va);
  st.Sort2(d, v9, vb);
  st.Sort2(d, vc, ve);
  st.Sort2(d, vd, vf);
  st.Sort2(d, v0, v4);
  st.Sort2(d, v1, v5);
  st.Sort2(d, v2, v6);
  st.Sort2(d, v3, v7);
  st.Sort2(d, v8, vc);
  st.Sort2(d, v9, vd);
  st.Sort2(d, va, ve);
  st.Sort2(d, vb, vf);
  st.Sort2(d, v0, v8);
  st.Sort2(d, v1, v9);
  st.Sort2(d, v2, va);
  st.Sort2(d, v3, vb);
  st.Sort2(d, v4, vc);
  st.Sort2(d, v5, vd);
  st.Sort2(d, v6, ve);
  st.Sort2(d, v7, vf);
  st.Sort2(d, v5, va);
  st.Sort2(d, v6, v9);
  st.Sort2(d, v3, vc);
  st.Sort2(d, v7, vb);
  st.Sort2(d, vd, ve);
  st.Sort2(d, v4, v8);
  st.Sort2(d, v1, v2);
  st.Sort2(d, v1, v4);
  st.Sort2(d, v7, vd);
  st.Sort2(d, v2, v8);
  st.Sort2(d, vb, ve);
  st.Sort2(d, v2, v4);
  st.Sort2(d, v5, v6);
  st.Sort2(d, v9, va);
  st.Sort2(d, vb, vd);
  st.Sort2(d, v3, v8);
  st.Sort2(d, v7, vc);
  st.Sort2(d, v3, v5);
  st.Sort2(d, v6, v8);
  st.Sort2(d, v7, v9);
  st.Sort2(d, va, vc);
  st.Sort2(d, v3, v4);
  st.Sort2(d, v5, v6);
  st.Sort2(d, v7, v8);
  st.Sort2(d, v9, va);
  st.Sort2(d, vb, vc);
  st.Sort2(d, v6, v7);
  st.Sort2(d, v8, v9);
}

// ------------------------------ Merging networks

// Blacher's hybrid bitonic/odd-even networks, generated by print_network.cc.

template <class D, class Traits, class V = Vec<D>>
HWY_INLINE void Merge2(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
                       V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd,
                       V& ve, V& vf) {
  v8 = st.ReverseKeys2(d, v8);
  v9 = st.ReverseKeys2(d, v9);
  va = st.ReverseKeys2(d, va);
  vb = st.ReverseKeys2(d, vb);
  vc = st.ReverseKeys2(d, vc);
  vd = st.ReverseKeys2(d, vd);
  ve = st.ReverseKeys2(d, ve);
  vf = st.ReverseKeys2(d, vf);
  st.Sort2(d, v0, vf);
  st.Sort2(d, v1, ve);
  st.Sort2(d, v2, vd);
  st.Sort2(d, v3, vc);
  st.Sort2(d, v4, vb);
  st.Sort2(d, v5, va);
  st.Sort2(d, v6, v9);
  st.Sort2(d, v7, v8);
  v4 = st.ReverseKeys2(d, v4);
  vc = st.ReverseKeys2(d, vc);
  v5 = st.ReverseKeys2(d, v5);
  vd = st.ReverseKeys2(d, vd);
  v6 = st.ReverseKeys2(d, v6);
  ve = st.ReverseKeys2(d, ve);
  v7 = st.ReverseKeys2(d, v7);
  vf = st.ReverseKeys2(d, vf);
  st.Sort2(d, v0, v7);
  st.Sort2(d, v8, vf);
  st.Sort2(d, v1, v6);
  st.Sort2(d, v9, ve);
  st.Sort2(d, v2, v5);
  st.Sort2(d, va, vd);
  st.Sort2(d, v3, v4);
  st.Sort2(d, vb, vc);
  v2 = st.ReverseKeys2(d, v2);
  v3 = st.ReverseKeys2(d, v3);
  v6 = st.ReverseKeys2(d, v6);
  v7 = st.ReverseKeys2(d, v7);
  va = st.ReverseKeys2(d, va);
  vb = st.ReverseKeys2(d, vb);
  ve = st.ReverseKeys2(d, ve);
  vf = st.ReverseKeys2(d, vf);
  st.Sort2(d, v0, v3);
  st.Sort2(d, v1, v2);
  st.Sort2(d, v4, v7);
  st.Sort2(d, v5, v6);
  st.Sort2(d, v8, vb);
  st.Sort2(d, v9, va);
  st.Sort2(d, vc, vf);
  st.Sort2(d, vd, ve);
  v1 = st.ReverseKeys2(d, v1);
  v3 = st.ReverseKeys2(d, v3);
  v5 = st.ReverseKeys2(d, v5);
  v7 = st.ReverseKeys2(d, v7);
  v9 = st.ReverseKeys2(d, v9);
  vb = st.ReverseKeys2(d, vb);
  vd = st.ReverseKeys2(d, vd);
  vf = st.ReverseKeys2(d, vf);
  st.Sort2(d, v0, v1);
  st.Sort2(d, v2, v3);
  st.Sort2(d, v4, v5);
  st.Sort2(d, v6, v7);
  st.Sort2(d, v8, v9);
  st.Sort2(d, va, vb);
  st.Sort2(d, vc, vd);
  st.Sort2(d, ve, vf);
  v0 = st.SortPairsDistance1(d, v0);
  v1 = st.SortPairsDistance1(d, v1);
  v2 = st.SortPairsDistance1(d, v2);
  v3 = st.SortPairsDistance1(d, v3);
  v4 = st.SortPairsDistance1(d, v4);
  v5 = st.SortPairsDistance1(d, v5);
  v6 = st.SortPairsDistance1(d, v6);
  v7 = st.SortPairsDistance1(d, v7);
  v8 = st.SortPairsDistance1(d, v8);
  v9 = st.SortPairsDistance1(d, v9);
  va = st.SortPairsDistance1(d, va);
  vb = st.SortPairsDistance1(d, vb);
  vc = st.SortPairsDistance1(d, vc);
  vd = st.SortPairsDistance1(d, vd);
  ve = st.SortPairsDistance1(d, ve);
  vf = st.SortPairsDistance1(d, vf);
}

template <class D, class Traits, class V = Vec<D>>
HWY_INLINE void Merge4(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
                       V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd,
                       V& ve, V& vf) {
  v8 = st.ReverseKeys4(d, v8);
  v9 = st.ReverseKeys4(d, v9);
  va = st.ReverseKeys4(d, va);
  vb = st.ReverseKeys4(d, vb);
  vc = st.ReverseKeys4(d, vc);
  vd = st.ReverseKeys4(d, vd);
  ve = st.ReverseKeys4(d, ve);
  vf = st.ReverseKeys4(d, vf);
  st.Sort2(d, v0, vf);
  st.Sort2(d, v1, ve);
  st.Sort2(d, v2, vd);
  st.Sort2(d, v3, vc);
  st.Sort2(d, v4, vb);
  st.Sort2(d, v5, va);
  st.Sort2(d, v6, v9);
  st.Sort2(d, v7, v8);
  v4 = st.ReverseKeys4(d, v4);
  vc = st.ReverseKeys4(d, vc);
  v5 = st.ReverseKeys4(d, v5);
  vd = st.ReverseKeys4(d, vd);
  v6 = st.ReverseKeys4(d, v6);
  ve = st.ReverseKeys4(d, ve);
  v7 = st.ReverseKeys4(d, v7);
  vf = st.ReverseKeys4(d, vf);
  st.Sort2(d, v0, v7);
  st.Sort2(d, v8, vf);
  st.Sort2(d, v1, v6);
  st.Sort2(d, v9, ve);
  st.Sort2(d, v2, v5);
  st.Sort2(d, va, vd);
  st.Sort2(d, v3, v4);
  st.Sort2(d, vb, vc);
  v2 = st.ReverseKeys4(d, v2);
  v3 = st.ReverseKeys4(d, v3);
  v6 = st.ReverseKeys4(d, v6);
  v7 = st.ReverseKeys4(d, v7);
  va = st.ReverseKeys4(d, va);
  vb = st.ReverseKeys4(d, vb);
  ve = st.ReverseKeys4(d, ve);
  vf = st.ReverseKeys4(d, vf);
  st.Sort2(d, v0, v3);
  st.Sort2(d, v1, v2);
  st.Sort2(d, v4, v7);
  st.Sort2(d, v5, v6);
  st.Sort2(d, v8, vb);
  st.Sort2(d, v9, va);
  st.Sort2(d, vc, vf);
  st.Sort2(d, vd, ve);
  v1 = st.ReverseKeys4(d, v1);
  v3 = st.ReverseKeys4(d, v3);
  v5 = st.ReverseKeys4(d, v5);
  v7 = st.ReverseKeys4(d, v7);
  v9 = st.ReverseKeys4(d, v9);
  vb = st.ReverseKeys4(d, vb);
  vd = st.ReverseKeys4(d, vd);
  vf = st.ReverseKeys4(d, vf);
  st.Sort2(d, v0, v1);
  st.Sort2(d, v2, v3);
  st.Sort2(d, v4, v5);
  st.Sort2(d, v6, v7);
  st.Sort2(d, v8, v9);
  st.Sort2(d, va, vb);
  st.Sort2(d, vc, vd);
  st.Sort2(d, ve, vf);
  v0 = st.SortPairsReverse4(d, v0);
  v1 = st.SortPairsReverse4(d, v1);
  v2 = st.SortPairsReverse4(d, v2);
  v3 = st.SortPairsReverse4(d, v3);
  v4 = st.SortPairsReverse4(d, v4);
  v5 = st.SortPairsReverse4(d, v5);
  v6 = st.SortPairsReverse4(d, v6);
  v7 = st.SortPairsReverse4(d, v7);
  v8 = st.SortPairsReverse4(d, v8);
  v9 = st.SortPairsReverse4(d, v9);
  va = st.SortPairsReverse4(d, va);
  vb = st.SortPairsReverse4(d, vb);
  vc = st.SortPairsReverse4(d, vc);
  vd = st.SortPairsReverse4(d, vd);
  ve = st.SortPairsReverse4(d, ve);
  vf = st.SortPairsReverse4(d, vf);
  v0 = st.SortPairsDistance1(d, v0);
  v1 = st.SortPairsDistance1(d, v1);
  v2 = st.SortPairsDistance1(d, v2);
  v3 = st.SortPairsDistance1(d, v3);
  v4 = st.SortPairsDistance1(d, v4);
  v5 = st.SortPairsDistance1(d, v5);
  v6 = st.SortPairsDistance1(d, v6);
  v7 = st.SortPairsDistance1(d, v7);
  v8 = st.SortPairsDistance1(d, v8);
  v9 = st.SortPairsDistance1(d, v9);
  va = st.SortPairsDistance1(d, va);
  vb = st.SortPairsDistance1(d, vb);
  vc = st.SortPairsDistance1(d, vc);
  vd = st.SortPairsDistance1(d, vd);
  ve = st.SortPairsDistance1(d, ve);
  vf = st.SortPairsDistance1(d, vf);
}

template <class D, class Traits, class V = Vec<D>>
HWY_INLINE void Merge8(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
                       V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd,
                       V& ve, V& vf) {
  v8 = st.ReverseKeys8(d, v8);
  v9 = st.ReverseKeys8(d, v9);
  va = st.ReverseKeys8(d, va);
  vb = st.ReverseKeys8(d, vb);
  vc = st.ReverseKeys8(d, vc);
  vd = st.ReverseKeys8(d, vd);
  ve = st.ReverseKeys8(d, ve);
  vf = st.ReverseKeys8(d, vf);
  st.Sort2(d, v0, vf);
  st.Sort2(d, v1, ve);
  st.Sort2(d, v2, vd);
  st.Sort2(d, v3, vc);
  st.Sort2(d, v4, vb);
  st.Sort2(d, v5, va);
  st.Sort2(d, v6, v9);
  st.Sort2(d, v7, v8);
  v4 = st.ReverseKeys8(d, v4);
  vc = st.ReverseKeys8(d, vc);
  v5 = st.ReverseKeys8(d, v5);
  vd = st.ReverseKeys8(d, vd);
  v6 = st.ReverseKeys8(d, v6);
  ve = st.ReverseKeys8(d, ve);
  v7 = st.ReverseKeys8(d, v7);
  vf = st.ReverseKeys8(d, vf);
  st.Sort2(d, v0, v7);
  st.Sort2(d, v8, vf);
  st.Sort2(d, v1, v6);
  st.Sort2(d, v9, ve);
  st.Sort2(d, v2, v5);
  st.Sort2(d, va, vd);
  st.Sort2(d, v3, v4);
  st.Sort2(d, vb, vc);
  v2 = st.ReverseKeys8(d, v2);
  v3 = st.ReverseKeys8(d, v3);
  v6 = st.ReverseKeys8(d, v6);
  v7 = st.ReverseKeys8(d, v7);
  va = st.ReverseKeys8(d, va);
  vb = st.ReverseKeys8(d, vb);
  ve = st.ReverseKeys8(d, ve);
  vf = st.ReverseKeys8(d, vf);
  st.Sort2(d, v0, v3);
  st.Sort2(d, v1, v2);
  st.Sort2(d, v4, v7);
  st.Sort2(d, v5, v6);
  st.Sort2(d, v8, vb);
  st.Sort2(d, v9, va);
  st.Sort2(d, vc, vf);
  st.Sort2(d, vd, ve);
  v1 = st.ReverseKeys8(d, v1);
  v3 = st.ReverseKeys8(d, v3);
  v5 = st.ReverseKeys8(d, v5);
  v7 = st.ReverseKeys8(d, v7);
  v9 = st.ReverseKeys8(d, v9);
  vb = st.ReverseKeys8(d, vb);
  vd = st.ReverseKeys8(d, vd);
  vf = st.ReverseKeys8(d, vf);
  st.Sort2(d, v0, v1);
  st.Sort2(d, v2, v3);
  st.Sort2(d, v4, v5);
  st.Sort2(d, v6, v7);
  st.Sort2(d, v8, v9);
  st.Sort2(d, va, vb);
  st.Sort2(d, vc, vd);
  st.Sort2(d, ve, vf);
  v0 = st.SortPairsReverse8(d, v0);
  v1 = st.SortPairsReverse8(d, v1);
  v2 = st.SortPairsReverse8(d, v2);
  v3 = st.SortPairsReverse8(d, v3);
  v4 = st.SortPairsReverse8(d, v4);
  v5 = st.SortPairsReverse8(d, v5);
  v6 = st.SortPairsReverse8(d, v6);
  v7 = st.SortPairsReverse8(d, v7);
  v8 = st.SortPairsReverse8(d, v8);
  v9 = st.SortPairsReverse8(d, v9);
  va = st.SortPairsReverse8(d, va);
  vb = st.SortPairsReverse8(d, vb);
  vc = st.SortPairsReverse8(d, vc);
  vd = st.SortPairsReverse8(d, vd);
  ve = st.SortPairsReverse8(d, ve);
  vf = st.SortPairsReverse8(d, vf);
  v0 = st.SortPairsDistance2(d, v0);
  v1 = st.SortPairsDistance2(d, v1);
  v2 = st.SortPairsDistance2(d, v2);
  v3 = st.SortPairsDistance2(d, v3);
  v4 = st.SortPairsDistance2(d, v4);
  v5 = st.SortPairsDistance2(d, v5);
  v6 = st.SortPairsDistance2(d, v6);
  v7 = st.SortPairsDistance2(d, v7);
  v8 = st.SortPairsDistance2(d, v8);
  v9 = st.SortPairsDistance2(d, v9);
  va = st.SortPairsDistance2(d, va);
  vb = st.SortPairsDistance2(d, vb);
  vc = st.SortPairsDistance2(d, vc);
  vd = st.SortPairsDistance2(d, vd);
  ve = st.SortPairsDistance2(d, ve);
  vf = st.SortPairsDistance2(d, vf);
  v0 = st.SortPairsDistance1(d, v0);
  v1 = st.SortPairsDistance1(d, v1);
  v2 = st.SortPairsDistance1(d, v2);
  v3 = st.SortPairsDistance1(d, v3);
  v4 = st.SortPairsDistance1(d, v4);
  v5 = st.SortPairsDistance1(d, v5);
  v6 = st.SortPairsDistance1(d, v6);
  v7 = st.SortPairsDistance1(d, v7);
  v8 = st.SortPairsDistance1(d, v8);
  v9 = st.SortPairsDistance1(d, v9);
  va = st.SortPairsDistance1(d, va);
  vb = st.SortPairsDistance1(d, vb);
  vc = st.SortPairsDistance1(d, vc);
  vd = st.SortPairsDistance1(d, vd);
  ve = st.SortPairsDistance1(d, ve);
  vf = st.SortPairsDistance1(d, vf);
}

// Unused on MSVC, see below
#if !HWY_COMPILER_MSVC

template <class D, class Traits, class V = Vec<D>>
HWY_INLINE void Merge16(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4,
                        V& v5, V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc,
                        V& vd, V& ve, V& vf) {
  v8 = st.ReverseKeys16(d, v8);
  v9 = st.ReverseKeys16(d, v9);
  va = st.ReverseKeys16(d, va);
  vb = st.ReverseKeys16(d, vb);
  vc = st.ReverseKeys16(d, vc);
  vd = st.ReverseKeys16(d, vd);
  ve = st.ReverseKeys16(d, ve);
  vf = st.ReverseKeys16(d, vf);
  st.Sort2(d, v0, vf);
  st.Sort2(d, v1, ve);
  st.Sort2(d, v2, vd);
  st.Sort2(d, v3, vc);
  st.Sort2(d, v4, vb);
  st.Sort2(d, v5, va);
  st.Sort2(d, v6, v9);
  st.Sort2(d, v7, v8);
  v4 = st.ReverseKeys16(d, v4);
  vc = st.ReverseKeys16(d, vc);
  v5 = st.ReverseKeys16(d, v5);
  vd = st.ReverseKeys16(d, vd);
  v6 = st.ReverseKeys16(d, v6);
  ve = st.ReverseKeys16(d, ve);
  v7 = st.ReverseKeys16(d, v7);
  vf = st.ReverseKeys16(d, vf);
  st.Sort2(d, v0, v7);
  st.Sort2(d, v8, vf);
  st.Sort2(d, v1, v6);
  st.Sort2(d, v9, ve);
  st.Sort2(d, v2, v5);
  st.Sort2(d, va, vd);
  st.Sort2(d, v3, v4);
  st.Sort2(d, vb, vc);
  v2 = st.ReverseKeys16(d, v2);
  v3 = st.ReverseKeys16(d, v3);
  v6 = st.ReverseKeys16(d, v6);
  v7 = st.ReverseKeys16(d, v7);
  va = st.ReverseKeys16(d, va);
  vb = st.ReverseKeys16(d, vb);
  ve = st.ReverseKeys16(d, ve);
  vf = st.ReverseKeys16(d, vf);
  st.Sort2(d, v0, v3);
  st.Sort2(d, v1, v2);
  st.Sort2(d, v4, v7);
  st.Sort2(d, v5, v6);
  st.Sort2(d, v8, vb);
  st.Sort2(d, v9, va);
  st.Sort2(d, vc, vf);
  st.Sort2(d, vd, ve);
  v1 = st.ReverseKeys16(d, v1);
  v3 = st.ReverseKeys16(d, v3);
  v5 = st.ReverseKeys16(d, v5);
  v7 = st.ReverseKeys16(d, v7);
  v9 = st.ReverseKeys16(d, v9);
  vb = st.ReverseKeys16(d, vb);
  vd = st.ReverseKeys16(d, vd);
  vf = st.ReverseKeys16(d, vf);
  st.Sort2(d, v0, v1);
  st.Sort2(d, v2, v3);
  st.Sort2(d, v4, v5);
  st.Sort2(d, v6, v7);
  st.Sort2(d, v8, v9);
  st.Sort2(d, va, vb);
  st.Sort2(d, vc, vd);
  st.Sort2(d, ve, vf);
  v0 = st.SortPairsReverse16(d, v0);
  v1 = st.SortPairsReverse16(d, v1);
  v2 = st.SortPairsReverse16(d, v2);
  v3 = st.SortPairsReverse16(d, v3);
  v4 = st.SortPairsReverse16(d, v4);
  v5 = st.SortPairsReverse16(d, v5);
  v6 = st.SortPairsReverse16(d, v6);
  v7 = st.SortPairsReverse16(d, v7);
  v8 = st.SortPairsReverse16(d, v8);
  v9 = st.SortPairsReverse16(d, v9);
  va = st.SortPairsReverse16(d, va);
  vb = st.SortPairsReverse16(d, vb);
  vc = st.SortPairsReverse16(d, vc);
  vd = st.SortPairsReverse16(d, vd);
  ve = st.SortPairsReverse16(d, ve);
  vf = st.SortPairsReverse16(d, vf);
  v0 = st.SortPairsDistance4(d, v0);
  v1 = st.SortPairsDistance4(d, v1);
  v2 = st.SortPairsDistance4(d, v2);
  v3 = st.SortPairsDistance4(d, v3);
  v4 = st.SortPairsDistance4(d, v4);
  v5 = st.SortPairsDistance4(d, v5);
  v6 = st.SortPairsDistance4(d, v6);
  v7 = st.SortPairsDistance4(d, v7);
  v8 = st.SortPairsDistance4(d, v8);
  v9 = st.SortPairsDistance4(d, v9);
  va = st.SortPairsDistance4(d, va);
  vb = st.SortPairsDistance4(d, vb);
  vc = st.SortPairsDistance4(d, vc);
  vd = st.SortPairsDistance4(d, vd);
  ve = st.SortPairsDistance4(d, ve);
  vf = st.SortPairsDistance4(d, vf);
  v0 = st.SortPairsDistance2(d, v0);
  v1 = st.SortPairsDistance2(d, v1);
  v2 = st.SortPairsDistance2(d, v2);
  v3 = st.SortPairsDistance2(d, v3);
  v4 = st.SortPairsDistance2(d, v4);
  v5 = st.SortPairsDistance2(d, v5);
  v6 = st.SortPairsDistance2(d, v6);
  v7 = st.SortPairsDistance2(d, v7);
  v8 = st.SortPairsDistance2(d, v8);
  v9 = st.SortPairsDistance2(d, v9);
  va = st.SortPairsDistance2(d, va);
  vb = st.SortPairsDistance2(d, vb);
  vc = st.SortPairsDistance2(d, vc);
  vd = st.SortPairsDistance2(d, vd);
  ve = st.SortPairsDistance2(d, ve);
  vf = st.SortPairsDistance2(d, vf);
  v0 = st.SortPairsDistance1(d, v0);
  v1 = st.SortPairsDistance1(d, v1);
  v2 = st.SortPairsDistance1(d, v2);
  v3 = st.SortPairsDistance1(d, v3);
  v4 = st.SortPairsDistance1(d, v4);
  v5 = st.SortPairsDistance1(d, v5);
  v6 = st.SortPairsDistance1(d, v6);
  v7 = st.SortPairsDistance1(d, v7);
  v8 = st.SortPairsDistance1(d, v8);
  v9 = st.SortPairsDistance1(d, v9);
  va = st.SortPairsDistance1(d, va);
  vb = st.SortPairsDistance1(d, vb);
  vc = st.SortPairsDistance1(d, vc);
  vd = st.SortPairsDistance1(d, vd);
  ve = st.SortPairsDistance1(d, ve);
  vf = st.SortPairsDistance1(d, vf);
}

#endif  // !HWY_COMPILER_MSVC

// Reshapes `buf` into a matrix, sorts columns independently, and then merges
// into a sorted 1D array without transposing.
//
// `st` is SharedTraits<LaneTraits/Traits128<Order*>>. This abstraction layer
// bridges differences in sort order and single-lane vs 128-bit keys.
// `buf` ensures full vectors are aligned, and enables loads/stores without
// bounds checks.
//
// References:
// https://drops.dagstuhl.de/opus/volltexte/2021/13775/pdf/LIPIcs-SEA-2021-3.pdf
// https://github.com/simd-sorting/fast-and-robust/blob/master/avx2_sort_demo/avx2sort.h
// "Entwurf und Implementierung vektorisierter Sortieralgorithmen" (M. Blacher)
template <class Traits, typename T>
HWY_INLINE void SortingNetwork(Traits st, T* HWY_RESTRICT buf, size_t cols) {
  const CappedTag<T, Constants::kMaxCols> d;
  using V = decltype(Zero(d));

  HWY_DASSERT(cols <= Constants::kMaxCols);

  // The network width depends on the number of keys, not lanes.
  constexpr size_t kLanesPerKey = st.LanesPerKey();
  const size_t keys = cols / kLanesPerKey;
  constexpr size_t kMaxKeys = MaxLanes(d) / kLanesPerKey;

  // These are aligned iff cols == Lanes(d). We prefer unaligned/non-constexpr
  // offsets to duplicating this code for every value of cols.
  static_assert(Constants::kMaxRows == 16, "Update loads/stores/args");
  V v0 = LoadU(d, buf + 0x0 * cols);
  V v1 = LoadU(d, buf + 0x1 * cols);
  V v2 = LoadU(d, buf + 0x2 * cols);
  V v3 = LoadU(d, buf + 0x3 * cols);
  V v4 = LoadU(d, buf + 0x4 * cols);
  V v5 = LoadU(d, buf + 0x5 * cols);
  V v6 = LoadU(d, buf + 0x6 * cols);
  V v7 = LoadU(d, buf + 0x7 * cols);
  V v8 = LoadU(d, buf + 0x8 * cols);
  V v9 = LoadU(d, buf + 0x9 * cols);
  V va = LoadU(d, buf + 0xa * cols);
  V vb = LoadU(d, buf + 0xb * cols);
  V vc = LoadU(d, buf + 0xc * cols);
  V vd = LoadU(d, buf + 0xd * cols);
  V ve = LoadU(d, buf + 0xe * cols);
  V vf = LoadU(d, buf + 0xf * cols);

  Sort16(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, ve, vf);

  // Checking MaxLanes avoids generating HWY_ASSERT code for the unreachable
  // code paths: if MaxLanes < 2, then keys <= cols < 2.
  if (HWY_LIKELY(keys >= 2 && kMaxKeys >= 2)) {
    Merge2(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, ve,
           vf);

    if (HWY_LIKELY(keys >= 4 && kMaxKeys >= 4)) {
      Merge4(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, ve,
             vf);

      if (HWY_LIKELY(keys >= 8 && kMaxKeys >= 8)) {
        Merge8(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd,
               ve, vf);

// Avoids build timeout
#if !HWY_COMPILER_MSVC
        if (HWY_LIKELY(keys >= 16 && kMaxKeys >= 16)) {
          Merge16(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd,
                  ve, vf);

          static_assert(Constants::kMaxCols <= 16, "Add more branches");
        }
#endif
      }
    }
  }

  StoreU(v0, d, buf + 0x0 * cols);
  StoreU(v1, d, buf + 0x1 * cols);
  StoreU(v2, d, buf + 0x2 * cols);
  StoreU(v3, d, buf + 0x3 * cols);
  StoreU(v4, d, buf + 0x4 * cols);
  StoreU(v5, d, buf + 0x5 * cols);
  StoreU(v6, d, buf + 0x6 * cols);
  StoreU(v7, d, buf + 0x7 * cols);
  StoreU(v8, d, buf + 0x8 * cols);
  StoreU(v9, d, buf + 0x9 * cols);
  StoreU(va, d, buf + 0xa * cols);
  StoreU(vb, d, buf + 0xb * cols);
  StoreU(vc, d, buf + 0xc * cols);
  StoreU(vd, d, buf + 0xd * cols);
  StoreU(ve, d, buf + 0xe * cols);
  StoreU(vf, d, buf + 0xf * cols);
}

}  // namespace detail
// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#endif  // HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE

@@ -1,324 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Per-target
#if defined(HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE) == \
    defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE
#undef HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE
#else
#define HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE
#endif

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/shared-inl.h"  // SortConstants
#include "hwy/contrib/sort/vqsort.h"  // SortDescending
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {

// Highway does not provide a lane type for 128-bit keys, so we use uint64_t
// along with an abstraction layer for single-lane vs. lane-pair, which is
// independent of the order.
struct KeyLane {
  constexpr size_t LanesPerKey() const { return 1; }

  // For HeapSort
  template <typename T>
  HWY_INLINE void Swap(T* a, T* b) const {
    const T temp = *a;
    *a = *b;
    *b = temp;
  }

  // Broadcasts one key into a vector
  template <class D>
  HWY_INLINE Vec<D> SetKey(D d, const TFromD<D>* key) const {
    return Set(d, *key);
  }

  template <class D>
  HWY_INLINE Vec<D> ReverseKeys(D d, Vec<D> v) const {
    return Reverse(d, v);
  }

  template <class D>
  HWY_INLINE Vec<D> ReverseKeys2(D d, Vec<D> v) const {
    return Reverse2(d, v);
  }

  template <class D>
  HWY_INLINE Vec<D> ReverseKeys4(D d, Vec<D> v) const {
    return Reverse4(d, v);
  }

  template <class D>
  HWY_INLINE Vec<D> ReverseKeys8(D d, Vec<D> v) const {
    return Reverse8(d, v);
  }

  template <class D>
  HWY_INLINE Vec<D> ReverseKeys16(D d, Vec<D> v) const {
    static_assert(SortConstants::kMaxCols <= 16, "Assumes u32x16 = 512 bit");
    return ReverseKeys(d, v);
  }

  template <class V>
  HWY_INLINE V OddEvenKeys(const V odd, const V even) const {
    return OddEven(odd, even);
  }

  template <class D, HWY_IF_LANE_SIZE_D(D, 2)>
  HWY_INLINE Vec<D> SwapAdjacentPairs(D d, const Vec<D> v) const {
    const Repartition<uint32_t, D> du32;
    return BitCast(d, Shuffle2301(BitCast(du32, v)));
  }
  template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
  HWY_INLINE Vec<D> SwapAdjacentPairs(D /* tag */, const Vec<D> v) const {
    return Shuffle1032(v);
  }
  template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> SwapAdjacentPairs(D /* tag */, const Vec<D> v) const {
    return SwapAdjacentBlocks(v);
  }

  template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> SwapAdjacentQuads(D d, const Vec<D> v) const {
#if HWY_HAVE_FLOAT64  // in case D is float32
    const RepartitionToWide<D> dw;
#else
    const RepartitionToWide<RebindToUnsigned<D>> dw;
#endif
    return BitCast(d, SwapAdjacentPairs(dw, BitCast(dw, v)));
  }
  template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> SwapAdjacentQuads(D d, const Vec<D> v) const {
    // Assumes max vector size = 512
    return ConcatLowerUpper(d, v, v);
  }

  template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> OddEvenPairs(D d, const Vec<D> odd,
                                 const Vec<D> even) const {
#if HWY_HAVE_FLOAT64  // in case D is float32
    const RepartitionToWide<D> dw;
#else
    const RepartitionToWide<RebindToUnsigned<D>> dw;
#endif
    return BitCast(d, OddEven(BitCast(dw, odd), BitCast(dw, even)));
  }
  template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> OddEvenPairs(D /* tag */, Vec<D> odd, Vec<D> even) const {
    return OddEvenBlocks(odd, even);
  }

  template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> OddEvenQuads(D d, Vec<D> odd, Vec<D> even) const {
#if HWY_HAVE_FLOAT64  // in case D is float32
    const RepartitionToWide<D> dw;
#else
    const RepartitionToWide<RebindToUnsigned<D>> dw;
#endif
    return BitCast(d, OddEvenPairs(dw, BitCast(dw, odd), BitCast(dw, even)));
  }
  template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> OddEvenQuads(D d, Vec<D> odd, Vec<D> even) const {
    return ConcatUpperLower(d, odd, even);
  }
};

// Anything order-related depends on the key traits *and* the order (see
// FirstOfLanes). We cannot implement just one Compare function because Lt128
// only compiles if the lane type is u64. Thus we need either overloaded
// functions with a tag type, class specializations, or separate classes.
// We avoid overloaded functions because we want all functions to be callable
// from a SortTraits without per-function wrappers. Specializing would work, but
// we are anyway going to specialize at a higher level.
struct OrderAscending : public KeyLane {
  using Order = SortAscending;

  template <typename T>
  HWY_INLINE bool Compare1(const T* a, const T* b) {
    return *a < *b;
  }

  template <class D>
  HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
    return Lt(a, b);
  }

  // Two halves of Sort2, used in ScanMinMax.
  template <class D>
  HWY_INLINE Vec<D> First(D /* tag */, const Vec<D> a, const Vec<D> b) const {
    return Min(a, b);
  }

  template <class D>
  HWY_INLINE Vec<D> Last(D /* tag */, const Vec<D> a, const Vec<D> b) const {
    return Max(a, b);
  }

  template <class D>
  HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
                                 TFromD<D>* HWY_RESTRICT /* buf */) const {
    return MinOfLanes(d, v);
  }

  template <class D>
  HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
                                TFromD<D>* HWY_RESTRICT /* buf */) const {
    return MaxOfLanes(d, v);
  }

  template <class D>
  HWY_INLINE Vec<D> FirstValue(D d) const {
    return Set(d, hwy::LowestValue<TFromD<D>>());
  }

  template <class D>
  HWY_INLINE Vec<D> LastValue(D d) const {
    return Set(d, hwy::HighestValue<TFromD<D>>());
  }
};

struct OrderDescending : public KeyLane {
  using Order = SortDescending;

  template <typename T>
  HWY_INLINE bool Compare1(const T* a, const T* b) {
    return *b < *a;
  }

  template <class D>
  HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
    return Lt(b, a);
  }

  template <class D>
  HWY_INLINE Vec<D> First(D /* tag */, const Vec<D> a, const Vec<D> b) const {
    return Max(a, b);
  }

  template <class D>
  HWY_INLINE Vec<D> Last(D /* tag */, const Vec<D> a, const Vec<D> b) const {
    return Min(a, b);
  }

  template <class D>
  HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
                                 TFromD<D>* HWY_RESTRICT /* buf */) const {
    return MaxOfLanes(d, v);
  }

  template <class D>
  HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
                                TFromD<D>* HWY_RESTRICT /* buf */) const {
    return MinOfLanes(d, v);
  }

  template <class D>
  HWY_INLINE Vec<D> FirstValue(D d) const {
    return Set(d, hwy::HighestValue<TFromD<D>>());
  }

  template <class D>
  HWY_INLINE Vec<D> LastValue(D d) const {
    return Set(d, hwy::LowestValue<TFromD<D>>());
  }
};

// Shared code that depends on Order.
template <class Base>
struct LaneTraits : public Base {
  constexpr bool Is128() const { return false; }

  // For each lane i: replaces a[i] with the first and b[i] with the second
  // according to Base.
  // Corresponds to a conditional swap, which is one "node" of a sorting
  // network. Min/Max are cheaper than compare + blend at least for integers.
  template <class D>
  HWY_INLINE void Sort2(D d, Vec<D>& a, Vec<D>& b) const {
    const Base* base = static_cast<const Base*>(this);

    const Vec<D> a_copy = a;
    // Prior to AVX3, there is no native 64-bit Min/Max, so they compile to 4
    // instructions. We can reduce it to a compare + 2 IfThenElse.
#if HWY_AVX3 < HWY_TARGET && HWY_TARGET <= HWY_SSSE3
    if (sizeof(TFromD<D>) == 8) {
      const Mask<D> cmp = base->Compare(d, a, b);
      a = IfThenElse(cmp, a, b);
      b = IfThenElse(cmp, b, a_copy);
      return;
    }
#endif
    a = base->First(d, a, b);
    b = base->Last(d, a_copy, b);
  }
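
  // Per-lane semantics, as a scalar sketch (illustration only, not part of
  // the source): for ascending order, each lane i performs the classic
  // compare-exchange "node"
  //   T lo = HWY_MIN(a[i], b[i]), hi = HWY_MAX(a[i], b[i]);
  //   a[i] = lo; b[i] = hi;
  // Descending order swaps the roles of Min and Max via First/Last.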

  // Conditionally swaps even-numbered lanes with their odd-numbered neighbor.
  template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> SortPairsDistance1(D d, Vec<D> v) const {
    const Base* base = static_cast<const Base*>(this);
    Vec<D> swapped = base->ReverseKeys2(d, v);
    // Further to the above optimization, Sort2+OddEvenKeys compile to four
    // instructions; we can save one by combining two blends.
#if HWY_AVX3 < HWY_TARGET && HWY_TARGET <= HWY_SSSE3
    const Vec<D> cmp = VecFromMask(d, base->Compare(d, v, swapped));
    return IfVecThenElse(DupOdd(cmp), swapped, v);
#else
    Sort2(d, v, swapped);
    return base->OddEvenKeys(swapped, v);
#endif
  }

  // (See above - we use Sort2 for non-64-bit types.)
  template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> SortPairsDistance1(D d, Vec<D> v) const {
    const Base* base = static_cast<const Base*>(this);
    Vec<D> swapped = base->ReverseKeys2(d, v);
    Sort2(d, v, swapped);
    return base->OddEvenKeys(swapped, v);
  }

  // Swaps with the vector formed by reversing contiguous groups of 4 keys.
  template <class D>
  HWY_INLINE Vec<D> SortPairsReverse4(D d, Vec<D> v) const {
    const Base* base = static_cast<const Base*>(this);
    Vec<D> swapped = base->ReverseKeys4(d, v);
    Sort2(d, v, swapped);
    return base->OddEvenPairs(d, swapped, v);
  }

  // Conditionally swaps lane 0 with 4, 1 with 5 etc.
  template <class D>
  HWY_INLINE Vec<D> SortPairsDistance4(D d, Vec<D> v) const {
    const Base* base = static_cast<const Base*>(this);
    Vec<D> swapped = base->SwapAdjacentQuads(d, v);
    // Only used in Merge16, so this will not be used on AVX2 (which only has 4
    // u64 lanes), so skip the above optimization for 64-bit AVX2.
    Sort2(d, v, swapped);
    return base->OddEvenQuads(d, swapped, v);
  }
};

}  // namespace detail
// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#endif  // HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE

@ -1,368 +0,0 @@
|
|||
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Per-target
#if defined(HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE) == \
    defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
#undef HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
#else
#define HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
#endif

#include "hwy/contrib/sort/vqsort.h"  // SortDescending
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {

#if HWY_TARGET == HWY_SCALAR

struct OrderAscending128 {
  using Order = SortAscending;

  template <typename T>
  HWY_INLINE bool Compare1(const T* a, const T* b) {
    return (a[1] == b[1]) ? a[0] < b[0] : a[1] < b[1];
  }
};

struct OrderDescending128 {
  using Order = SortDescending;

  template <typename T>
  HWY_INLINE bool Compare1(const T* a, const T* b) {
    return (a[1] == b[1]) ? b[0] < a[0] : b[1] < a[1];
  }
};

template <class Order>
struct Traits128 : public Order {
  constexpr bool Is128() const { return true; }
  constexpr size_t LanesPerKey() const { return 2; }
};

#else

// Highway does not provide a lane type for 128-bit keys, so we use uint64_t
// along with an abstraction layer for single-lane vs. lane-pair, which is
// independent of the order.
struct Key128 {
  constexpr size_t LanesPerKey() const { return 2; }

  template <typename T>
  HWY_INLINE void Swap(T* a, T* b) const {
    const FixedTag<T, 2> d;
    const auto temp = LoadU(d, a);
    StoreU(LoadU(d, b), d, a);
    StoreU(temp, d, b);
  }

  template <class D>
  HWY_INLINE Vec<D> SetKey(D d, const TFromD<D>* key) const {
    return LoadDup128(d, key);
  }

  template <class D>
  HWY_INLINE Vec<D> ReverseKeys(D d, Vec<D> v) const {
    return ReverseBlocks(d, v);
  }

  template <class D>
  HWY_INLINE Vec<D> ReverseKeys2(D /* tag */, const Vec<D> v) const {
    return SwapAdjacentBlocks(v);
  }

  // Only called for 4 keys because we do not support >512-bit vectors.
  template <class D>
  HWY_INLINE Vec<D> ReverseKeys4(D d, const Vec<D> v) const {
    HWY_DASSERT(Lanes(d) <= 64 / sizeof(TFromD<D>));
    return ReverseKeys(d, v);
  }

  // Only called for 4 keys because we do not support >512-bit vectors.
  template <class D>
  HWY_INLINE Vec<D> OddEvenPairs(D d, const Vec<D> odd,
                                 const Vec<D> even) const {
    HWY_DASSERT(Lanes(d) <= 64 / sizeof(TFromD<D>));
    return ConcatUpperLower(d, odd, even);
  }

  template <class V>
  HWY_INLINE V OddEvenKeys(const V odd, const V even) const {
    return OddEvenBlocks(odd, even);
  }

  template <class D>
  HWY_INLINE Vec<D> ReverseKeys8(D, Vec<D>) const {
    HWY_ASSERT(0);  // not supported: would require 1024-bit vectors
  }

  template <class D>
  HWY_INLINE Vec<D> ReverseKeys16(D, Vec<D>) const {
    HWY_ASSERT(0);  // not supported: would require 2048-bit vectors
  }

  // This is only called for 8/16 col networks (not supported).
  template <class D>
  HWY_INLINE Vec<D> SwapAdjacentPairs(D, Vec<D>) const {
    HWY_ASSERT(0);
  }

  // This is only called for 16 col networks (not supported).
  template <class D>
  HWY_INLINE Vec<D> SwapAdjacentQuads(D, Vec<D>) const {
    HWY_ASSERT(0);
  }

  // This is only called for 8 col networks (not supported).
  template <class D>
  HWY_INLINE Vec<D> OddEvenQuads(D, Vec<D>, Vec<D>) const {
    HWY_ASSERT(0);
  }
};

// Anything order-related depends on the key traits *and* the order (see
// FirstOfLanes). We cannot implement just one Compare function because Lt128
// only compiles if the lane type is u64. Thus we need either overloaded
// functions with a tag type, class specializations, or separate classes.
// We avoid overloaded functions because we want all functions to be callable
// from a SortTraits without per-function wrappers. Specializing would work, but
// we are anyway going to specialize at a higher level.
struct OrderAscending128 : public Key128 {
  using Order = SortAscending;

  template <typename T>
  HWY_INLINE bool Compare1(const T* a, const T* b) {
    return (a[1] == b[1]) ? a[0] < b[0] : a[1] < b[1];
  }

  template <class D>
  HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const {
    return Lt128(d, a, b);
  }

  // Used by CompareTop
  template <class V>
  HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const {
    return Lt(a, b);
  }

  template <class D>
  HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const {
    return Min128(d, a, b);
  }

  template <class D>
  HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const {
    return Max128(d, a, b);
  }

  template <class D>
  HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
                                 TFromD<D>* HWY_RESTRICT buf) const {
    const size_t N = Lanes(d);
    Store(v, d, buf);
    v = SetKey(d, buf + 0);  // result must be broadcasted
    for (size_t i = LanesPerKey(); i < N; i += LanesPerKey()) {
      v = First(d, v, SetKey(d, buf + i));
    }
    return v;
  }

  template <class D>
  HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
                                TFromD<D>* HWY_RESTRICT buf) const {
    const size_t N = Lanes(d);
    Store(v, d, buf);
    v = SetKey(d, buf + 0);  // result must be broadcasted
    for (size_t i = LanesPerKey(); i < N; i += LanesPerKey()) {
      v = Last(d, v, SetKey(d, buf + i));
    }
    return v;
  }

  // Same as for regular lanes because 128-bit lanes are u64.
  template <class D>
  HWY_INLINE Vec<D> FirstValue(D d) const {
    return Set(d, hwy::LowestValue<TFromD<D> >());
  }

  template <class D>
  HWY_INLINE Vec<D> LastValue(D d) const {
    return Set(d, hwy::HighestValue<TFromD<D> >());
  }
};

struct OrderDescending128 : public Key128 {
  using Order = SortDescending;

  template <typename T>
  HWY_INLINE bool Compare1(const T* a, const T* b) {
    return (a[1] == b[1]) ? b[0] < a[0] : b[1] < a[1];
  }

  template <class D>
  HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const {
    return Lt128(d, b, a);
  }

  // Used by CompareTop
  template <class V>
  HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const {
    return Lt(b, a);
  }

  template <class D>
  HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const {
    return Max128(d, a, b);
  }

  template <class D>
  HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const {
    return Min128(d, a, b);
  }

  template <class D>
  HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
                                 TFromD<D>* HWY_RESTRICT buf) const {
    const size_t N = Lanes(d);
    Store(v, d, buf);
    v = SetKey(d, buf + 0);  // result must be broadcasted
    for (size_t i = LanesPerKey(); i < N; i += LanesPerKey()) {
      v = First(d, v, SetKey(d, buf + i));
    }
    return v;
  }

  template <class D>
  HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
                                TFromD<D>* HWY_RESTRICT buf) const {
    const size_t N = Lanes(d);
    Store(v, d, buf);
    v = SetKey(d, buf + 0);  // result must be broadcasted
    for (size_t i = LanesPerKey(); i < N; i += LanesPerKey()) {
      v = Last(d, v, SetKey(d, buf + i));
    }
    return v;
  }

  // Same as for regular lanes because 128-bit lanes are u64.
  template <class D>
  HWY_INLINE Vec<D> FirstValue(D d) const {
    return Set(d, hwy::HighestValue<TFromD<D> >());
  }

  template <class D>
  HWY_INLINE Vec<D> LastValue(D d) const {
    return Set(d, hwy::LowestValue<TFromD<D> >());
  }
};

// Shared code that depends on Order.
template <class Base>
class Traits128 : public Base {
#if HWY_TARGET <= HWY_AVX2
  // Returns vector with only the top u64 lane valid. Useful when the next step
  // is to replicate the mask anyway.
  template <class D>
  HWY_INLINE HWY_MAYBE_UNUSED Vec<D> CompareTop(D d, Vec<D> a, Vec<D> b) const {
    const Base* base = static_cast<const Base*>(this);
    const Vec<D> eqHL = VecFromMask(d, Eq(a, b));
    const Vec<D> ltHL = VecFromMask(d, base->CompareLanes(a, b));
    const Vec<D> ltLX = ShiftLeftLanes<1>(ltHL);
    return OrAnd(ltHL, eqHL, ltLX);
  }

  // We want to swap 2 u128, i.e. 4 u64 lanes, based on the 0 or FF..FF mask in
  // the most-significant of those lanes (the result of CompareTop), so
  // replicate it 4x. Only called for >= 256-bit vectors.
  template <class V>
  HWY_INLINE V ReplicateTop4x(V v) const {
#if HWY_TARGET <= HWY_AVX3
    return V{_mm512_permutex_epi64(v.raw, _MM_SHUFFLE(3, 3, 3, 3))};
#else  // AVX2
    return V{_mm256_permute4x64_epi64(v.raw, _MM_SHUFFLE(3, 3, 3, 3))};
#endif
  }
#endif

 public:
  constexpr bool Is128() const { return true; }

  template <class D>
  HWY_INLINE void Sort2(D d, Vec<D>& a, Vec<D>& b) const {
    const Base* base = static_cast<const Base*>(this);

    const Vec<D> a_copy = a;
    const auto lt = base->Compare(d, a, b);
    a = IfThenElse(lt, a, b);
    b = IfThenElse(lt, b, a_copy);
  }

  // Conditionally swaps even-numbered lanes with their odd-numbered neighbor.
  template <class D>
  HWY_INLINE Vec<D> SortPairsDistance1(D d, Vec<D> v) const {
    const Base* base = static_cast<const Base*>(this);
    Vec<D> swapped = base->ReverseKeys2(d, v);

#if HWY_TARGET <= HWY_AVX2
    const Vec<D> select = ReplicateTop4x(CompareTop(d, v, swapped));
    return IfVecThenElse(select, swapped, v);
#else
    Sort2(d, v, swapped);
    return base->OddEvenKeys(swapped, v);
#endif
  }

  // Swaps with the vector formed by reversing contiguous groups of 4 keys.
  template <class D>
  HWY_INLINE Vec<D> SortPairsReverse4(D d, Vec<D> v) const {
    const Base* base = static_cast<const Base*>(this);
    Vec<D> swapped = base->ReverseKeys4(d, v);

    // Only specialize for AVX3 because this requires 512-bit vectors.
#if HWY_TARGET <= HWY_AVX3
    const Vec512<uint64_t> outHx = CompareTop(d, v, swapped);
    // Similar to ReplicateTop4x, we want to gang together 2 comparison results
    // (4 lanes). They are not contiguous, so use permute to replicate 4x.
    alignas(64) uint64_t kIndices[8] = {7, 7, 5, 5, 5, 5, 7, 7};
    const Vec512<uint64_t> select =
        TableLookupLanes(outHx, SetTableIndices(d, kIndices));
    return IfVecThenElse(select, swapped, v);
#else
    Sort2(d, v, swapped);
    return base->OddEvenPairs(d, swapped, v);
#endif
  }

  // Conditionally swaps lane 0 with 4, 1 with 5 etc.
  template <class D>
  HWY_INLINE Vec<D> SortPairsDistance4(D, Vec<D>) const {
    // Only used by Merge16, which would require 2048-bit vectors (unsupported).
    HWY_ASSERT(0);
  }
};

#endif  // HWY_TARGET != HWY_SCALAR

}  // namespace detail
// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#endif  // HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
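
For reference, the `Compare1` members above order 128-bit keys stored as two u64 lanes (lo at index 0, hi at index 1) lexicographically by (hi, lo); `Lt128` is the vectorized counterpart. A standalone scalar sketch (hypothetical helper name, not part of the library):

    #include <cstdint>

    // a and b each point at one 128-bit key: a[0] = low 64 bits, a[1] = high.
    static bool Less128(const uint64_t* a, const uint64_t* b) {
      return (a[1] == b[1]) ? a[0] < b[0] : a[1] < b[1];
    }

    // Example: {lo=5, hi=1} < {lo=0, hi=2} because the high words decide.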

@ -1,722 +0,0 @@

// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Normal include guard for target-independent parts
#ifndef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_INL_H_
#define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_INL_H_

// Makes it harder for adversaries to predict our sampling locations, at the
// cost of 1-2% increased runtime.
#ifndef VQSORT_SECURE_RNG
#define VQSORT_SECURE_RNG 0
#endif

#if VQSORT_SECURE_RNG
#include "third_party/absl/random/random.h"
#endif

#include <string.h>  // memcpy

#include "hwy/cache_control.h"  // Prefetch
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"  // Fill24Bytes

#if HWY_IS_MSAN
#include <sanitizer/msan_interface.h>
#endif

#endif  // HIGHWAY_HWY_CONTRIB_SORT_VQSORT_INL_H_

// Per-target
#if defined(HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE) == \
    defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE
#undef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE
#else
#define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE
#endif

#include "hwy/contrib/sort/shared-inl.h"
#include "hwy/contrib/sort/sorting_networks-inl.h"
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {

#if HWY_TARGET == HWY_SCALAR

template <typename T>
void Swap(T* a, T* b) {
  T t = *a;
  *a = *b;
  *b = t;
}

// Scalar version of HeapSort (see below)
template <class Traits, typename T>
void HeapSort(Traits st, T* HWY_RESTRICT keys, const size_t num) {
  if (num < 2) return;

  // Build heap.
  for (size_t i = 1; i < num; i += 1) {
    size_t j = i;
    while (j != 0) {
      const size_t idx_parent = ((j - 1) / 1 / 2);
      if (!st.Compare1(keys + idx_parent, keys + j)) {
        break;
      }
      Swap(keys + j, keys + idx_parent);
      j = idx_parent;
    }
  }

  for (size_t i = num - 1; i != 0; i -= 1) {
    // Swap root with last
    Swap(keys + 0, keys + i);

    // Sift down the new root.
    size_t j = 0;
    while (j < i) {
      const size_t left = 2 * j + 1;
      const size_t right = 2 * j + 2;
      if (left >= i) break;
      size_t idx_larger = j;
      if (st.Compare1(keys + j, keys + left)) {
        idx_larger = left;
      }
      if (right < i && st.Compare1(keys + idx_larger, keys + right)) {
        idx_larger = right;
      }
      if (idx_larger == j) break;
      Swap(keys + j, keys + idx_larger);
      j = idx_larger;
    }
  }
}

#else

using Constants = hwy::SortConstants;

// ------------------------------ HeapSort

// Heapsort: O(1) space, O(N*logN) worst-case comparisons.
// Based on LLVM sanitizer_common.h, licensed under Apache-2.0.
template <class Traits, typename T>
void HeapSort(Traits st, T* HWY_RESTRICT keys, const size_t num) {
  constexpr size_t N1 = st.LanesPerKey();
  const FixedTag<T, N1> d;

  if (num < 2 * N1) return;

  // Build heap.
  for (size_t i = N1; i < num; i += N1) {
    size_t j = i;
    while (j != 0) {
      const size_t idx_parent = ((j - N1) / N1 / 2) * N1;
      if (AllFalse(d, st.Compare(d, st.SetKey(d, keys + idx_parent),
                                 st.SetKey(d, keys + j)))) {
        break;
      }
      st.Swap(keys + j, keys + idx_parent);
      j = idx_parent;
    }
  }

  for (size_t i = num - N1; i != 0; i -= N1) {
    // Swap root with last
    st.Swap(keys + 0, keys + i);

    // Sift down the new root.
    size_t j = 0;
    while (j < i) {
      const size_t left = 2 * j + N1;
      const size_t right = 2 * j + 2 * N1;
      if (left >= i) break;
      size_t idx_larger = j;
      const auto key_j = st.SetKey(d, keys + j);
      if (AllTrue(d, st.Compare(d, key_j, st.SetKey(d, keys + left)))) {
        idx_larger = left;
      }
      if (right < i && AllTrue(d, st.Compare(d, st.SetKey(d, keys + idx_larger),
                                             st.SetKey(d, keys + right)))) {
        idx_larger = right;
      }
      if (idx_larger == j) break;
      st.Swap(keys + j, keys + idx_larger);
      j = idx_larger;
    }
  }
}

// ------------------------------ BaseCase

// Sorts `keys` within the range [0, num) via sorting network.
template <class D, class Traits, typename T>
HWY_NOINLINE void BaseCase(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
                           T* HWY_RESTRICT buf) {
  const size_t N = Lanes(d);
  using V = decltype(Zero(d));

  // _Nonzero32 requires num - 1 != 0.
  if (HWY_UNLIKELY(num <= 1)) return;

  // Reshape into a matrix with kMaxRows rows, and columns limited by the
  // 1D `num`, which is upper-bounded by the vector width (see BaseCaseNum).
  const size_t num_pow2 = size_t{1}
                          << (32 - Num0BitsAboveMS1Bit_Nonzero32(
                                       static_cast<uint32_t>(num - 1)));
  HWY_DASSERT(num <= num_pow2 && num_pow2 <= Constants::BaseCaseNum(N));
  const size_t cols =
      HWY_MAX(st.LanesPerKey(), num_pow2 >> Constants::kMaxRowsLog2);
  HWY_DASSERT(cols <= N);

  // Copy `keys` to `buf`.
  size_t i;
  for (i = 0; i + N <= num; i += N) {
    Store(LoadU(d, keys + i), d, buf + i);
  }
  for (; i < num; ++i) {
    buf[i] = keys[i];
  }

  // Fill with padding - last in sort order, not copied to keys.
  const V kPadding = st.LastValue(d);
  // Initialize an extra vector because SortingNetwork loads full vectors,
  // which may exceed cols*kMaxRows.
  for (; i < (cols * Constants::kMaxRows + N); i += N) {
    StoreU(kPadding, d, buf + i);
  }

  SortingNetwork(st, buf, cols);

  for (i = 0; i + N <= num; i += N) {
    StoreU(Load(d, buf + i), d, keys + i);
  }
  for (; i < num; ++i) {
    keys[i] = buf[i];
  }
}

// ------------------------------ Partition

// Consumes from `left` until a multiple of kUnroll*N remains.
// Temporarily stores the right side into `buf`, then moves behind `right`.
template <class D, class Traits, class T>
HWY_NOINLINE void PartitionToMultipleOfUnroll(D d, Traits st,
                                              T* HWY_RESTRICT keys,
                                              size_t& left, size_t& right,
                                              const Vec<D> pivot,
                                              T* HWY_RESTRICT buf) {
  constexpr size_t kUnroll = Constants::kPartitionUnroll;
  const size_t N = Lanes(d);
  size_t readL = left;
  size_t bufR = 0;
  const size_t num = right - left;
  // Partition requires both a multiple of kUnroll*N and at least
  // 2*kUnroll*N for the initial loads. If less, consume all here.
  const size_t num_rem =
      (num < 2 * kUnroll * N) ? num : (num & (kUnroll * N - 1));
  size_t i = 0;
  for (; i + N <= num_rem; i += N) {
    const Vec<D> vL = LoadU(d, keys + readL);
    readL += N;

    const auto comp = st.Compare(d, pivot, vL);
    left += CompressBlendedStore(vL, Not(comp), d, keys + left);
    bufR += CompressStore(vL, comp, d, buf + bufR);
  }
  // Last iteration: only use valid lanes.
  if (HWY_LIKELY(i != num_rem)) {
    const auto mask = FirstN(d, num_rem - i);
    const Vec<D> vL = LoadU(d, keys + readL);

    const auto comp = st.Compare(d, pivot, vL);
    left += CompressBlendedStore(vL, AndNot(comp, mask), d, keys + left);
    bufR += CompressStore(vL, And(comp, mask), d, buf + bufR);
  }

  // MSAN seems not to understand CompressStore. buf[0, bufR) are valid.
#if HWY_IS_MSAN
  __msan_unpoison(buf, bufR * sizeof(T));
#endif

  // Everything we loaded was put into buf, or behind the new `left`, after
  // which there is space for bufR items. First move items from `right` to
  // `left` to free up space, then copy `buf` into the vacated `right`.
  // A loop with masked loads from `buf` is insufficient - we would also need
  // to mask from `right`. Combining a loop with memcpy for the remainders is
  // slower than just memcpy, so we use that for simplicity.
  right -= bufR;
  memcpy(keys + left, keys + right, bufR * sizeof(T));
  memcpy(keys + right, buf, bufR * sizeof(T));
}

template <class D, class Traits, typename T>
HWY_INLINE void StoreLeftRight(D d, Traits st, const Vec<D> v,
                               const Vec<D> pivot, T* HWY_RESTRICT keys,
                               size_t& writeL, size_t& writeR) {
  const size_t N = Lanes(d);

  const auto comp = st.Compare(d, pivot, v);
  const size_t num_left = CompressBlendedStore(v, Not(comp), d, keys + writeL);
  writeL += num_left;

  writeR -= (N - num_left);
  (void)CompressBlendedStore(v, comp, d, keys + writeR);
}
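
// Worked example (editor's note, not in the original source): with N = 4 and
// comp = {0,1,1,0} for ascending order (lanes 1 and 2 compare greater than
// the pivot), the first CompressBlendedStore packs lanes 0 and 3 to
// keys[writeL] and num_left = 2; writeR then decreases by N - num_left = 2
// and the second store packs lanes 1 and 2 to keys[writeR]. The two output
// regions grow toward each other, so every input vector is written exactly
// once with no gaps.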

template <class D, class Traits, typename T>
HWY_INLINE void StoreLeftRight4(D d, Traits st, const Vec<D> v0,
                                const Vec<D> v1, const Vec<D> v2,
                                const Vec<D> v3, const Vec<D> pivot,
                                T* HWY_RESTRICT keys, size_t& writeL,
                                size_t& writeR) {
  StoreLeftRight(d, st, v0, pivot, keys, writeL, writeR);
  StoreLeftRight(d, st, v1, pivot, keys, writeL, writeR);
  StoreLeftRight(d, st, v2, pivot, keys, writeL, writeR);
  StoreLeftRight(d, st, v3, pivot, keys, writeL, writeR);
}

// Moves "<= pivot" keys to the front, and others to the back. pivot is
// broadcasted. Time-critical!
//
// Aligned loads do not seem to be worthwhile (not bottlenecked by load ports).
template <class D, class Traits, typename T>
HWY_NOINLINE size_t Partition(D d, Traits st, T* HWY_RESTRICT keys, size_t left,
                              size_t right, const Vec<D> pivot,
                              T* HWY_RESTRICT buf) {
  using V = decltype(Zero(d));
  const size_t N = Lanes(d);

  // StoreLeftRight will CompressBlendedStore ending at `writeR`. Unless all
  // lanes happen to be in the right-side partition, this will overrun `keys`,
  // which triggers asan errors. Avoid by special-casing the last vector.
  HWY_DASSERT(right - left > 2 * N);  // ensured by HandleSpecialCases
  right -= N;
  const size_t last = right;
  const V vlast = LoadU(d, keys + last);

  PartitionToMultipleOfUnroll(d, st, keys, left, right, pivot, buf);
  constexpr size_t kUnroll = Constants::kPartitionUnroll;

  // Invariant: [left, writeL) and [writeR, right) are already partitioned.
  size_t writeL = left;
  size_t writeR = right;

  const size_t num = right - left;
  // Cannot load if there were fewer than 2 * kUnroll * N.
  if (HWY_LIKELY(num != 0)) {
    HWY_DASSERT(num >= 2 * kUnroll * N);
    HWY_DASSERT((num & (kUnroll * N - 1)) == 0);

    // Make space for writing in-place by reading from left and right.
    const V vL0 = LoadU(d, keys + left + 0 * N);
    const V vL1 = LoadU(d, keys + left + 1 * N);
    const V vL2 = LoadU(d, keys + left + 2 * N);
    const V vL3 = LoadU(d, keys + left + 3 * N);
    left += kUnroll * N;
    right -= kUnroll * N;
    const V vR0 = LoadU(d, keys + right + 0 * N);
    const V vR1 = LoadU(d, keys + right + 1 * N);
    const V vR2 = LoadU(d, keys + right + 2 * N);
    const V vR3 = LoadU(d, keys + right + 3 * N);

    // The left/right updates may consume all inputs, so check before the loop.
    while (left != right) {
      V v0, v1, v2, v3;

      // Free up capacity for writing by loading from the side that has less.
      // Data-dependent but branching is faster than forcing branch-free.
      const size_t capacityL = left - writeL;
      const size_t capacityR = writeR - right;
      HWY_DASSERT(capacityL <= num && capacityR <= num);  // >= 0
      if (capacityR < capacityL) {
        right -= kUnroll * N;
        v0 = LoadU(d, keys + right + 0 * N);
        v1 = LoadU(d, keys + right + 1 * N);
        v2 = LoadU(d, keys + right + 2 * N);
        v3 = LoadU(d, keys + right + 3 * N);
        hwy::Prefetch(keys + right - 3 * kUnroll * N);
      } else {
        v0 = LoadU(d, keys + left + 0 * N);
        v1 = LoadU(d, keys + left + 1 * N);
        v2 = LoadU(d, keys + left + 2 * N);
        v3 = LoadU(d, keys + left + 3 * N);
        left += kUnroll * N;
        hwy::Prefetch(keys + left + 3 * kUnroll * N);
      }

      StoreLeftRight4(d, st, v0, v1, v2, v3, pivot, keys, writeL, writeR);
    }

    // Now finish writing the initial left/right to the middle.
    StoreLeftRight4(d, st, vL0, vL1, vL2, vL3, pivot, keys, writeL, writeR);
    StoreLeftRight4(d, st, vR0, vR1, vR2, vR3, pivot, keys, writeL, writeR);
  }

  // We have partitioned [left, right) such that writeL is the boundary.
  HWY_DASSERT(writeL == writeR);
  // Make space for inserting vlast: move up to N of the first right-side keys
  // into the unused space starting at last. If we have fewer, ensure they are
  // the last items in that vector by subtracting from the *load* address,
  // which is safe because we have at least two vectors (checked above).
  const size_t totalR = last - writeL;
  const size_t startR = totalR < N ? writeL + totalR - N : writeL;
  StoreU(LoadU(d, keys + startR), d, keys + last);

  // Partition vlast: write L, then R, into the single-vector gap at writeL.
  const auto comp = st.Compare(d, pivot, vlast);
  writeL += CompressBlendedStore(vlast, Not(comp), d, keys + writeL);
  (void)CompressBlendedStore(vlast, comp, d, keys + writeL);

  return writeL;
}

// ------------------------------ Pivot

template <class Traits, class V>
HWY_INLINE V MedianOf3(Traits st, V v0, V v1, V v2) {
  const DFromV<V> d;
  // Slightly faster for 128-bit, apparently because not serially dependent.
  if (st.Is128()) {
    // Median = XOR-sum 'minus' the first and last. Calling First twice is
    // slightly faster than Compare + 2 IfThenElse or even IfThenElse + XOR.
    const auto sum = Xor(Xor(v0, v1), v2);
    const auto first = st.First(d, st.First(d, v0, v1), v2);
    const auto last = st.Last(d, st.Last(d, v0, v1), v2);
    return Xor(Xor(sum, first), last);
  }
  st.Sort2(d, v0, v2);
  v1 = st.Last(d, v0, v1);
  v1 = st.First(d, v1, v2);
  return v1;
}

// Replaces triplets with their median and recurses until less than 3 keys
// remain. Ignores leftover values (non-whole triplets)!
template <class D, class Traits, typename T>
Vec<D> RecursiveMedianOf3(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
                          T* HWY_RESTRICT buf) {
  const size_t N = Lanes(d);
  constexpr size_t N1 = st.LanesPerKey();

  if (num < 3 * N1) return st.SetKey(d, keys);

  size_t read = 0;
  size_t written = 0;

  // Triplets of vectors
  for (; read + 3 * N <= num; read += 3 * N) {
    const auto v0 = Load(d, keys + read + 0 * N);
    const auto v1 = Load(d, keys + read + 1 * N);
    const auto v2 = Load(d, keys + read + 2 * N);
    Store(MedianOf3(st, v0, v1, v2), d, buf + written);
    written += N;
  }

  // Triplets of keys
  for (; read + 3 * N1 <= num; read += 3 * N1) {
    const auto v0 = st.SetKey(d, keys + read + 0 * N1);
    const auto v1 = st.SetKey(d, keys + read + 1 * N1);
    const auto v2 = st.SetKey(d, keys + read + 2 * N1);
    StoreU(MedianOf3(st, v0, v1, v2), d, buf + written);
    written += N1;
  }

  // Tail recursion; swap buffers
  return RecursiveMedianOf3(d, st, buf, written, keys);
}

#if VQSORT_SECURE_RNG
using Generator = absl::BitGen;
#else
// Based on https://github.com/numpy/numpy/issues/16313#issuecomment-641897028
#pragma pack(push, 1)
class Generator {
 public:
  Generator(const void* heap, size_t num) {
    Sorter::Fill24Bytes(heap, num, &a_);
    k_ = 1;  // stream index: must be odd
  }

  uint64_t operator()() {
    const uint64_t b = b_;
    w_ += k_;
    const uint64_t next = a_ ^ w_;
    a_ = (b + (b << 3)) ^ (b >> 11);
    const uint64_t rot = (b << 24) | (b >> 40);
    b_ = rot + next;
    return next;
  }

 private:
  uint64_t a_;
  uint64_t b_;
  uint64_t w_;
  uint64_t k_;  // increment
};
#pragma pack(pop)

#endif  // !VQSORT_SECURE_RNG

// Returns slightly biased random index of a chunk in [0, num_chunks).
// See https://www.pcg-random.org/posts/bounded-rands.html.
HWY_INLINE size_t RandomChunkIndex(const uint32_t num_chunks, uint32_t bits) {
  const uint64_t chunk_index = (static_cast<uint64_t>(bits) * num_chunks) >> 32;
  HWY_DASSERT(chunk_index < num_chunks);
  return static_cast<size_t>(chunk_index);
}
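
// Editor's note (not in the original source): this is the multiply-shift
// reduction from the post linked above. Treating bits / 2^32 as a fixed-point
// fraction in [0, 1), multiplying by num_chunks and keeping the high 32 bits
// maps it into [0, num_chunks). Example: num_chunks = 10, bits = 0x80000000
// (one half) gives (0x80000000 * 10) >> 32 = 5. The slight bias exists
// because 2^32 is generally not a multiple of num_chunks.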

template <class D, class Traits, typename T>
HWY_NOINLINE Vec<D> ChoosePivot(D d, Traits st, T* HWY_RESTRICT keys,
                                const size_t begin, const size_t end,
                                T* HWY_RESTRICT buf, Generator& rng) {
  using V = decltype(Zero(d));
  const size_t N = Lanes(d);

  // Power of two
  const size_t lanes_per_chunk = Constants::LanesPerChunk(sizeof(T), N);

  keys += begin;
  size_t num = end - begin;

  // Align start of keys to chunks. We always have at least 2 chunks because
  // the base case would have handled anything up to 16 vectors, i.e. >= 4
  // chunks.
  HWY_DASSERT(num >= 2 * lanes_per_chunk);
  const size_t misalign =
      (reinterpret_cast<uintptr_t>(keys) / sizeof(T)) & (lanes_per_chunk - 1);
  if (misalign != 0) {
    const size_t consume = lanes_per_chunk - misalign;
    keys += consume;
    num -= consume;
  }

  // Generate enough random bits for 9 uint32
  uint64_t* bits64 = reinterpret_cast<uint64_t*>(buf);
  for (size_t i = 0; i < 5; ++i) {
    bits64[i] = rng();
  }
  const uint32_t* bits = reinterpret_cast<const uint32_t*>(buf);

  const uint32_t lpc32 = static_cast<uint32_t>(lanes_per_chunk);
  // Avoid division
  const size_t log2_lpc = Num0BitsBelowLS1Bit_Nonzero32(lpc32);
  const size_t num_chunks64 = num >> log2_lpc;
  // Clamp to uint32 for RandomChunkIndex
  const uint32_t num_chunks =
      static_cast<uint32_t>(HWY_MIN(num_chunks64, 0xFFFFFFFFull));

  const size_t offset0 = RandomChunkIndex(num_chunks, bits[0]) << log2_lpc;
  const size_t offset1 = RandomChunkIndex(num_chunks, bits[1]) << log2_lpc;
  const size_t offset2 = RandomChunkIndex(num_chunks, bits[2]) << log2_lpc;
  const size_t offset3 = RandomChunkIndex(num_chunks, bits[3]) << log2_lpc;
  const size_t offset4 = RandomChunkIndex(num_chunks, bits[4]) << log2_lpc;
  const size_t offset5 = RandomChunkIndex(num_chunks, bits[5]) << log2_lpc;
  const size_t offset6 = RandomChunkIndex(num_chunks, bits[6]) << log2_lpc;
  const size_t offset7 = RandomChunkIndex(num_chunks, bits[7]) << log2_lpc;
  const size_t offset8 = RandomChunkIndex(num_chunks, bits[8]) << log2_lpc;
  for (size_t i = 0; i < lanes_per_chunk; i += N) {
    const V v0 = Load(d, keys + offset0 + i);
    const V v1 = Load(d, keys + offset1 + i);
    const V v2 = Load(d, keys + offset2 + i);
    const V medians0 = MedianOf3(st, v0, v1, v2);
    Store(medians0, d, buf + i);

    const V v3 = Load(d, keys + offset3 + i);
    const V v4 = Load(d, keys + offset4 + i);
    const V v5 = Load(d, keys + offset5 + i);
    const V medians1 = MedianOf3(st, v3, v4, v5);
    Store(medians1, d, buf + i + lanes_per_chunk);

    const V v6 = Load(d, keys + offset6 + i);
    const V v7 = Load(d, keys + offset7 + i);
    const V v8 = Load(d, keys + offset8 + i);
    const V medians2 = MedianOf3(st, v6, v7, v8);
    Store(medians2, d, buf + i + lanes_per_chunk * 2);
  }

  return RecursiveMedianOf3(d, st, buf, 3 * lanes_per_chunk,
                            buf + 3 * lanes_per_chunk);
}

// Compute exact min/max to detect all-equal partitions. Only called after a
// degenerate Partition (none in the right partition).
template <class D, class Traits, typename T>
HWY_NOINLINE void ScanMinMax(D d, Traits st, const T* HWY_RESTRICT keys,
                             size_t num, T* HWY_RESTRICT buf, Vec<D>& first,
                             Vec<D>& last) {
  const size_t N = Lanes(d);

  first = st.LastValue(d);
  last = st.FirstValue(d);

  size_t i = 0;
  for (; i + N <= num; i += N) {
    const Vec<D> v = LoadU(d, keys + i);
    first = st.First(d, v, first);
    last = st.Last(d, v, last);
  }
  if (HWY_LIKELY(i != num)) {
    HWY_DASSERT(num >= N);  // See HandleSpecialCases
    const Vec<D> v = LoadU(d, keys + num - N);
    first = st.First(d, v, first);
    last = st.Last(d, v, last);
  }

  first = st.FirstOfLanes(d, first, buf);
  last = st.LastOfLanes(d, last, buf);
}

template <class D, class Traits, typename T>
void Recurse(D d, Traits st, T* HWY_RESTRICT keys, const size_t begin,
             const size_t end, const Vec<D> pivot, T* HWY_RESTRICT buf,
             Generator& rng, size_t remaining_levels) {
  HWY_DASSERT(begin + 1 < end);
  const size_t num = end - begin;  // >= 2

  // Too many degenerate partitions. This is extremely unlikely to happen
  // because we select pivots from large (though still O(1)) samples.
  if (HWY_UNLIKELY(remaining_levels == 0)) {
    HeapSort(st, keys + begin, num);  // Slow but N*logN.
    return;
  }

  const ptrdiff_t base_case_num =
      static_cast<ptrdiff_t>(Constants::BaseCaseNum(Lanes(d)));
  const size_t bound = Partition(d, st, keys, begin, end, pivot, buf);

  const ptrdiff_t num_left =
      static_cast<ptrdiff_t>(bound) - static_cast<ptrdiff_t>(begin);
  const ptrdiff_t num_right =
      static_cast<ptrdiff_t>(end) - static_cast<ptrdiff_t>(bound);

  // Check for degenerate partitions (i.e. Partition did not move any keys):
  if (HWY_UNLIKELY(num_right == 0)) {
    // Because the pivot is one of the keys, it must have been equal to the
    // first or last key in sort order. Scan for the actual min/max:
    // passing the current pivot as the new bound is insufficient because one
    // of the partitions might not actually include that key.
    Vec<D> first, last;
    ScanMinMax(d, st, keys + begin, num, buf, first, last);
    if (AllTrue(d, Eq(first, last))) return;

    // Separate recursion to make sure that we don't pick `last` as the
    // pivot - that would again lead to a degenerate partition.
    Recurse(d, st, keys, begin, end, first, buf, rng, remaining_levels - 1);
    return;
  }

  if (HWY_UNLIKELY(num_left <= base_case_num)) {
    BaseCase(d, st, keys + begin, static_cast<size_t>(num_left), buf);
  } else {
    const Vec<D> next_pivot = ChoosePivot(d, st, keys, begin, bound, buf, rng);
    Recurse(d, st, keys, begin, bound, next_pivot, buf, rng,
            remaining_levels - 1);
  }
  if (HWY_UNLIKELY(num_right <= base_case_num)) {
    BaseCase(d, st, keys + bound, static_cast<size_t>(num_right), buf);
  } else {
    const Vec<D> next_pivot = ChoosePivot(d, st, keys, bound, end, buf, rng);
    Recurse(d, st, keys, bound, end, next_pivot, buf, rng,
            remaining_levels - 1);
  }
}

// Returns true if sorting is finished.
template <class D, class Traits, typename T>
bool HandleSpecialCases(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
                        T* HWY_RESTRICT buf) {
  const size_t N = Lanes(d);
  const size_t base_case_num = Constants::BaseCaseNum(N);

  // 128-bit keys require vectors with at least two u64 lanes, which is always
  // the case unless `d` requests partial vectors (e.g. fraction = 1/2) AND
  // the hardware vector width is less than 128 bit / fraction.
  const bool partial_128 = N < 2 && st.Is128();
  // Partition assumes its input is at least two vectors. If vectors are huge,
  // base_case_num may actually be smaller; if so (only possible on RVV), pass
  // a capped or partial d (LMUL < 1).
  constexpr bool kPotentiallyHuge =
      HWY_MAX_BYTES / sizeof(T) > Constants::kMaxRows * Constants::kMaxCols;
  const bool huge_vec = kPotentiallyHuge && (2 * N > base_case_num);
  if (partial_128 || huge_vec) {
    // PERFORMANCE WARNING: falling back to HeapSort.
    HeapSort(st, keys, num);
    return true;
  }

  // Small arrays: use sorting network, no need for other checks.
  if (HWY_UNLIKELY(num <= base_case_num)) {
    BaseCase(d, st, keys, num, buf);
    return true;
  }

  // We could also check for already sorted/reverse/equal, but that's probably
  // counterproductive if vqsort is used as a base case.

  return false;  // not finished sorting
}

#endif  // HWY_TARGET != HWY_SCALAR
}  // namespace detail

// Sorts `keys[0..num-1]` according to the order defined by `st.Compare`.
// In-place i.e. O(1) additional storage. Worst-case N*logN comparisons.
// Non-stable (order of equal keys may change), except for the common case
// where the upper bits of T are the key, and the lower bits are a sequential
// or at least unique ID.
// There is no upper limit on `num`, but note that pivots may be chosen by
// sampling only from the first 256 GiB.
//
// `d` is typically SortTag<T> (chooses between full and partial vectors).
// `st` is SharedTraits<{LaneTraits|Traits128}<Order*>>. This abstraction layer
// bridges differences in sort order and single-lane vs 128-bit keys.
template <class D, class Traits, typename T>
void Sort(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
          T* HWY_RESTRICT buf) {
#if HWY_TARGET == HWY_SCALAR
  (void)d;
  (void)buf;
  // PERFORMANCE WARNING: vqsort is not enabled for the non-SIMD target
  return detail::HeapSort(st, keys, num);
#else
  if (detail::HandleSpecialCases(d, st, keys, num, buf)) return;

#if HWY_MAX_BYTES > 64
  // sorting_networks-inl and traits assume no more than 512 bit vectors.
  if (Lanes(d) > 64 / sizeof(T)) {
    return Sort(CappedTag<T, 64 / sizeof(T)>(), st, keys, num, buf);
  }
#endif  // HWY_MAX_BYTES > 64

  // Pulled out of the recursion so we can special-case degenerate partitions.
  detail::Generator rng(keys, num);
  const Vec<D> pivot = detail::ChoosePivot(d, st, keys, 0, num, buf, rng);

  // Introspection: switch to worst-case N*logN heapsort after this many.
  const size_t max_levels = 2 * hwy::CeilLog2(num) + 4;

  detail::Recurse(d, st, keys, 0, num, pivot, buf, rng, max_levels);
#endif  // HWY_TARGET == HWY_SCALAR
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#endif  // HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE
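
MedianOf3 above exploits an XOR identity for 128-bit (u64-lane) keys: since {v0, v1, v2} equals {min, median, max} as a multiset, XOR-ing all three and then XOR-ing out the min and the max leaves the median, with no branches or blends. A scalar demonstration (hypothetical helper, assuming integer keys):

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    static uint64_t Median3Xor(uint64_t v0, uint64_t v1, uint64_t v2) {
      const uint64_t sum = v0 ^ v1 ^ v2;
      const uint64_t first = std::min(std::min(v0, v1), v2);
      const uint64_t last = std::max(std::max(v0, v1), v2);
      return sum ^ first ^ last;  // XOR removes min and max, keeping median
    }

    int main() {
      assert(Median3Xor(3, 1, 2) == 2);
      assert(Median3Xor(5, 5, 9) == 5);  // holds with duplicates, too
      return 0;
    }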

@ -1,148 +0,0 @@

// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/vqsort.h"

#include <string.h>  // memset

#include "hwy/aligned_allocator.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/shared-inl.h"

// Seed source for SFC generator: 1=getrandom, 2=CryptGenRandom
// (not all Android builds support the getrandom wrapper)
#ifndef VQSORT_SECURE_SEED

#if (defined(linux) || defined(__linux__)) && \
    !(defined(ANDROID) || defined(__ANDROID__) || HWY_ARCH_RVV)
#define VQSORT_SECURE_SEED 1
#elif defined(_WIN32) || defined(_WIN64)
#define VQSORT_SECURE_SEED 2
#else
#define VQSORT_SECURE_SEED 0
#endif

#endif  // VQSORT_SECURE_SEED

#if !VQSORT_SECURE_RNG

#include <time.h>
#if VQSORT_SECURE_SEED == 1
#include <sys/random.h>
#elif VQSORT_SECURE_SEED == 2
#include <windows.h>
#pragma comment(lib, "Advapi32.lib")
// Must come after windows.h.
#include <wincrypt.h>
#endif  // VQSORT_SECURE_SEED

#endif  // !VQSORT_SECURE_RNG

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

size_t VectorSize() { return Lanes(ScalableTag<uint8_t, 3>()); }
bool HaveFloat64() { return HWY_HAVE_FLOAT64; }

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(VectorSize);
HWY_EXPORT(HaveFloat64);

HWY_INLINE size_t PivotBufNum(size_t sizeof_t, size_t N) {
  // 3 chunks of medians, 1 chunk of median medians plus two padding vectors.
  const size_t lpc = SortConstants::LanesPerChunk(sizeof_t, N);
  return (3 + 1) * lpc + 2 * N;
}

}  // namespace

Sorter::Sorter() {
  // Determine the largest buffer size required for any type by trying them
  // all. (The capping of N in BaseCaseNum means that smaller N but larger
  // sizeof_t may require a larger buffer.)
  const size_t vector_size = HWY_DYNAMIC_DISPATCH(VectorSize)();
  size_t max_bytes = 0;
  for (size_t sizeof_t :
       {sizeof(uint16_t), sizeof(uint32_t), sizeof(uint64_t)}) {
    const size_t N = vector_size / sizeof_t;
    // One extra for padding plus another for full-vector loads.
    const size_t base_case = SortConstants::BaseCaseNum(N) + 2 * N;
    const size_t partition_num = SortConstants::PartitionBufNum(N);
    const size_t buf_lanes =
        HWY_MAX(base_case, HWY_MAX(partition_num, PivotBufNum(sizeof_t, N)));
    max_bytes = HWY_MAX(max_bytes, buf_lanes * sizeof_t);
  }

  ptr_ = hwy::AllocateAlignedBytes(max_bytes, nullptr, nullptr);

  // Prevent msan errors by initializing.
  memset(ptr_, 0, max_bytes);
}

void Sorter::Delete() {
  FreeAlignedBytes(ptr_, nullptr, nullptr);
  ptr_ = nullptr;
}

#if !VQSORT_SECURE_RNG

void Sorter::Fill24Bytes(const void* seed_heap, size_t seed_num, void* bytes) {
#if VQSORT_SECURE_SEED == 1
  // May block if urandom is not yet initialized.
  const ssize_t ret = getrandom(bytes, 24, /*flags=*/0);
  if (ret == 24) return;
#elif VQSORT_SECURE_SEED == 2
  HCRYPTPROV hProvider{};
  if (CryptAcquireContextA(&hProvider, nullptr, nullptr, PROV_RSA_FULL,
                           CRYPT_VERIFYCONTEXT)) {
    const BOOL ok =
        CryptGenRandom(hProvider, 24, reinterpret_cast<BYTE*>(bytes));
    CryptReleaseContext(hProvider, 0);
    if (ok) return;
  }
#endif

  // VQSORT_SECURE_SEED == 0, or one of the above failed. Get some entropy
  // from stack/heap/code addresses and the clock() timer.
  uint64_t* words = reinterpret_cast<uint64_t*>(bytes);
  uint64_t** seed_stack = &words;
  void (*seed_code)(const void*, size_t, void*) = &Fill24Bytes;
  const uintptr_t bits_stack = reinterpret_cast<uintptr_t>(seed_stack);
  const uintptr_t bits_heap = reinterpret_cast<uintptr_t>(seed_heap);
  const uintptr_t bits_code = reinterpret_cast<uintptr_t>(seed_code);
  const uint64_t bits_time = static_cast<uint64_t>(clock());
  words[0] = bits_stack ^ bits_time ^ seed_num;
  words[1] = bits_heap ^ bits_time ^ seed_num;
  words[2] = bits_code ^ bits_time ^ seed_num;
}

#endif  // !VQSORT_SECURE_RNG

bool Sorter::HaveFloat64() { return HWY_DYNAMIC_DISPATCH(HaveFloat64)(); }

}  // namespace hwy
#endif  // HWY_ONCE
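
The non-secure `Generator` in vqsort-inl.h (seeded by `Fill24Bytes` above) is an SFC-style construction: two words of "chaotic" state plus a Weyl sequence advanced by an odd increment. A standalone sketch of one step, mirroring `Generator::operator()` (struct and function names are illustrative):

    #include <cstdint>

    struct SfcState {
      uint64_t a, b, w, k;  // k is the Weyl increment and must be odd
    };

    static uint64_t SfcNext(SfcState& s) {
      const uint64_t b = s.b;
      s.w += s.k;                                  // Weyl sequence: long period
      const uint64_t next = s.a ^ s.w;             // output word
      s.a = (b + (b << 3)) ^ (b >> 11);            // i.e. b * 9, mixed by a shift
      const uint64_t rot = (b << 24) | (b >> 40);  // rotate left by 24
      s.b = rot + next;
      return next;
    }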

@ -1,104 +0,0 @@

// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Interface to vectorized quicksort with dynamic dispatch.

#ifndef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_
#define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_

#include "hwy/base.h"

namespace hwy {

// Aligned 128-bit type. Cannot use __int128 because clang doesn't yet align
// it: https://reviews.llvm.org/D86310
#pragma pack(push, 1)
struct alignas(16) uint128_t {
  uint64_t lo;  // little-endian layout
  uint64_t hi;
};
#pragma pack(pop)

// Tag arguments that determine the sort order.
struct SortAscending {
  constexpr bool IsAscending() const { return true; }
};
struct SortDescending {
  constexpr bool IsAscending() const { return false; }
};

// Allocates O(1) space. Type-erased RAII wrapper over hwy/aligned_allocator.h.
// This allows amortizing the allocation over multiple sorts.
class HWY_CONTRIB_DLLEXPORT Sorter {
 public:
  Sorter();
  ~Sorter() { Delete(); }

  // Move-only
  Sorter(const Sorter&) = delete;
  Sorter& operator=(const Sorter&) = delete;
  Sorter(Sorter&& other) {
    Delete();
    ptr_ = other.ptr_;
    other.ptr_ = nullptr;
  }
  Sorter& operator=(Sorter&& other) {
    Delete();
    ptr_ = other.ptr_;
    other.ptr_ = nullptr;
    return *this;
  }

  // Sorts keys[0, n). Dispatches to the best available instruction set,
  // and does not allocate memory.
  void operator()(uint16_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
  void operator()(uint16_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
  void operator()(uint32_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
  void operator()(uint32_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
  void operator()(uint64_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
  void operator()(uint64_t* HWY_RESTRICT keys, size_t n, SortDescending) const;

  void operator()(int16_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
  void operator()(int16_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
  void operator()(int32_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
  void operator()(int32_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
  void operator()(int64_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
  void operator()(int64_t* HWY_RESTRICT keys, size_t n, SortDescending) const;

  void operator()(float* HWY_RESTRICT keys, size_t n, SortAscending) const;
  void operator()(float* HWY_RESTRICT keys, size_t n, SortDescending) const;
  void operator()(double* HWY_RESTRICT keys, size_t n, SortAscending) const;
  void operator()(double* HWY_RESTRICT keys, size_t n, SortDescending) const;

  void operator()(uint128_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
  void operator()(uint128_t* HWY_RESTRICT keys, size_t n, SortDescending) const;

  // For internal use only
  static void Fill24Bytes(const void* seed_heap, size_t seed_num, void* bytes);
  static bool HaveFloat64();

 private:
  void Delete();

  template <typename T>
  T* Get() const {
    return static_cast<T*>(ptr_);
  }

  void* ptr_ = nullptr;
};

}  // namespace hwy

#endif  // HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_
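
A minimal usage sketch of the interface declared above; constructing one `Sorter` and reusing it amortizes the buffer allocation over multiple sorts, per the class comment:

    #include <cstdint>
    #include <vector>
    #include "hwy/contrib/sort/vqsort.h"

    int main() {
      std::vector<uint64_t> keys = {3, 1, 4, 1, 5, 9, 2, 6};
      hwy::Sorter sorter;  // allocates the reusable O(1) buffer once
      sorter(keys.data(), keys.size(), hwy::SortAscending());
      sorter(keys.data(), keys.size(), hwy::SortDescending());
      return 0;
    }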

@ -1,55 +0,0 @@

// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_128a.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits128-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void Sort128Asc(uint64_t* HWY_RESTRICT keys, size_t num,
                uint64_t* HWY_RESTRICT buf) {
  SortTag<uint64_t> d;
  detail::SharedTraits<detail::Traits128<detail::OrderAscending128>> st;
  Sort(d, st, keys, num, buf);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(Sort128Asc);
}  // namespace

void Sorter::operator()(uint128_t* HWY_RESTRICT keys, size_t n,
                        SortAscending) const {
  HWY_DYNAMIC_DISPATCH(Sort128Asc)
  (reinterpret_cast<uint64_t*>(keys), n * 2, Get<uint64_t>());
}

}  // namespace hwy
#endif  // HWY_ONCE
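
Each vqsort_* translation unit here follows the same three-part pattern: `foreach_target.h` recompiles the file once per instruction-set target, `HWY_EXPORT` records one table entry per compiled target, and `HWY_DYNAMIC_DISPATCH` picks the best entry at runtime. A condensed sketch (`MySort` is a hypothetical name standing in for Sort128Asc and friends):

    #undef HWY_TARGET_INCLUDE
    #define HWY_TARGET_INCLUDE "my_sort.cc"   // this file's own path
    #include "hwy/foreach_target.h"           // re-includes us once per target
    #include "hwy/highway.h"

    HWY_BEFORE_NAMESPACE();
    namespace hwy {
    namespace HWY_NAMESPACE {  // expands differently for each target
    void MySort(uint64_t* HWY_RESTRICT keys, size_t num) { /* per-target body */ }
    }  // namespace HWY_NAMESPACE
    }  // namespace hwy
    HWY_AFTER_NAMESPACE();

    #if HWY_ONCE  // compiled once, after all target passes
    namespace hwy {
    HWY_EXPORT(MySort);  // one function-pointer table entry per target
    void CallMySort(uint64_t* keys, size_t num) {
      HWY_DYNAMIC_DISPATCH(MySort)(keys, num);  // best target at runtime
    }
    }  // namespace hwy
    #endif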

@ -1,55 +0,0 @@

// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_128d.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits128-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void Sort128Desc(uint64_t* HWY_RESTRICT keys, size_t num,
                 uint64_t* HWY_RESTRICT buf) {
  SortTag<uint64_t> d;
  detail::SharedTraits<detail::Traits128<detail::OrderDescending128>> st;
  Sort(d, st, keys, num, buf);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(Sort128Desc);
}  // namespace

void Sorter::operator()(uint128_t* HWY_RESTRICT keys, size_t n,
                        SortDescending) const {
  HWY_DYNAMIC_DISPATCH(Sort128Desc)
  (reinterpret_cast<uint64_t*>(keys), n * 2, Get<uint64_t>());
}

}  // namespace hwy
#endif  // HWY_ONCE

@ -1,53 +0,0 @@

// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f32a.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void SortF32Asc(float* HWY_RESTRICT keys, size_t num, float* HWY_RESTRICT buf) {
  SortTag<float> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
  Sort(d, st, keys, num, buf);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortF32Asc);
}  // namespace

void Sorter::operator()(float* HWY_RESTRICT keys, size_t n,
                        SortAscending) const {
  HWY_DYNAMIC_DISPATCH(SortF32Asc)(keys, n, Get<float>());
}

}  // namespace hwy
#endif  // HWY_ONCE

@ -1,54 +0,0 @@

// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f32d.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void SortF32Desc(float* HWY_RESTRICT keys, size_t num,
                 float* HWY_RESTRICT buf) {
  SortTag<float> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st;
  Sort(d, st, keys, num, buf);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortF32Desc);
}  // namespace

void Sorter::operator()(float* HWY_RESTRICT keys, size_t n,
                        SortDescending) const {
  HWY_DYNAMIC_DISPATCH(SortF32Desc)(keys, n, Get<float>());
}

}  // namespace hwy
#endif  // HWY_ONCE
|
|
@@ -1,61 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f64a.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void SortF64Asc(double* HWY_RESTRICT keys, size_t num,
                double* HWY_RESTRICT buf) {
#if HWY_HAVE_FLOAT64
  SortTag<double> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
  Sort(d, st, keys, num, buf);
#else
  (void)keys;
  (void)num;
  (void)buf;
  HWY_ASSERT(0);
#endif
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortF64Asc);
}  // namespace

void Sorter::operator()(double* HWY_RESTRICT keys, size_t n,
                        SortAscending) const {
  HWY_DYNAMIC_DISPATCH(SortF64Asc)(keys, n, Get<double>());
}

}  // namespace hwy
#endif  // HWY_ONCE
@@ -1,61 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f64d.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void SortF64Desc(double* HWY_RESTRICT keys, size_t num,
                 double* HWY_RESTRICT buf) {
#if HWY_HAVE_FLOAT64
  SortTag<double> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st;
  Sort(d, st, keys, num, buf);
#else
  (void)keys;
  (void)num;
  (void)buf;
  HWY_ASSERT(0);
#endif
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortF64Desc);
}  // namespace

void Sorter::operator()(double* HWY_RESTRICT keys, size_t n,
                        SortDescending) const {
  HWY_DYNAMIC_DISPATCH(SortF64Desc)(keys, n, Get<double>());
}

}  // namespace hwy
#endif  // HWY_ONCE
@@ -1,59 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i16a.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

// Workaround for build timeout
#if !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void SortI16Asc(int16_t* HWY_RESTRICT keys, size_t num,
                int16_t* HWY_RESTRICT buf) {
  SortTag<int16_t> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
  Sort(d, st, keys, num, buf);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortI16Asc);
}  // namespace

void Sorter::operator()(int16_t* HWY_RESTRICT keys, size_t n,
                        SortAscending) const {
  HWY_DYNAMIC_DISPATCH(SortI16Asc)(keys, n, Get<int16_t>());
}

}  // namespace hwy
#endif  // HWY_ONCE

#endif  // !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
@@ -1,59 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i16d.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

// Workaround for build timeout
#if !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void SortI16Desc(int16_t* HWY_RESTRICT keys, size_t num,
                 int16_t* HWY_RESTRICT buf) {
  SortTag<int16_t> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st;
  Sort(d, st, keys, num, buf);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortI16Desc);
}  // namespace

void Sorter::operator()(int16_t* HWY_RESTRICT keys, size_t n,
                        SortDescending) const {
  HWY_DYNAMIC_DISPATCH(SortI16Desc)(keys, n, Get<int16_t>());
}

}  // namespace hwy
#endif  // HWY_ONCE

#endif  // !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
@@ -1,54 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i32a.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void SortI32Asc(int32_t* HWY_RESTRICT keys, size_t num,
                int32_t* HWY_RESTRICT buf) {
  SortTag<int32_t> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
  Sort(d, st, keys, num, buf);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortI32Asc);
}  // namespace

void Sorter::operator()(int32_t* HWY_RESTRICT keys, size_t n,
                        SortAscending) const {
  HWY_DYNAMIC_DISPATCH(SortI32Asc)(keys, n, Get<int32_t>());
}

}  // namespace hwy
#endif  // HWY_ONCE
@@ -1,54 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i32d.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void SortI32Desc(int32_t* HWY_RESTRICT keys, size_t num,
                 int32_t* HWY_RESTRICT buf) {
  SortTag<int32_t> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st;
  Sort(d, st, keys, num, buf);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortI32Desc);
}  // namespace

void Sorter::operator()(int32_t* HWY_RESTRICT keys, size_t n,
                        SortDescending) const {
  HWY_DYNAMIC_DISPATCH(SortI32Desc)(keys, n, Get<int32_t>());
}

}  // namespace hwy
#endif  // HWY_ONCE
@@ -1,54 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i64a.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void SortI64Asc(int64_t* HWY_RESTRICT keys, size_t num,
                int64_t* HWY_RESTRICT buf) {
  SortTag<int64_t> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
  Sort(d, st, keys, num, buf);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortI64Asc);
}  // namespace

void Sorter::operator()(int64_t* HWY_RESTRICT keys, size_t n,
                        SortAscending) const {
  HWY_DYNAMIC_DISPATCH(SortI64Asc)(keys, n, Get<int64_t>());
}

}  // namespace hwy
#endif  // HWY_ONCE
@@ -1,54 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i64d.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void SortI64Desc(int64_t* HWY_RESTRICT keys, size_t num,
                 int64_t* HWY_RESTRICT buf) {
  SortTag<int64_t> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st;
  Sort(d, st, keys, num, buf);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortI64Desc);
}  // namespace

void Sorter::operator()(int64_t* HWY_RESTRICT keys, size_t n,
                        SortDescending) const {
  HWY_DYNAMIC_DISPATCH(SortI64Desc)(keys, n, Get<int64_t>());
}

}  // namespace hwy
#endif  // HWY_ONCE
@@ -1,59 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u16a.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

// Workaround for build timeout
#if !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void SortU16Asc(uint16_t* HWY_RESTRICT keys, size_t num,
                uint16_t* HWY_RESTRICT buf) {
  SortTag<uint16_t> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
  Sort(d, st, keys, num, buf);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortU16Asc);
}  // namespace

void Sorter::operator()(uint16_t* HWY_RESTRICT keys, size_t n,
                        SortAscending) const {
  HWY_DYNAMIC_DISPATCH(SortU16Asc)(keys, n, Get<uint16_t>());
}

}  // namespace hwy
#endif  // HWY_ONCE

#endif  // !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
@@ -1,59 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u16d.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

// Workaround for build timeout
#if !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void SortU16Desc(uint16_t* HWY_RESTRICT keys, size_t num,
                 uint16_t* HWY_RESTRICT buf) {
  SortTag<uint16_t> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st;
  Sort(d, st, keys, num, buf);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortU16Desc);
}  // namespace

void Sorter::operator()(uint16_t* HWY_RESTRICT keys, size_t n,
                        SortDescending) const {
  HWY_DYNAMIC_DISPATCH(SortU16Desc)(keys, n, Get<uint16_t>());
}

}  // namespace hwy
#endif  // HWY_ONCE

#endif  // !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
@@ -1,54 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u32a.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void SortU32Asc(uint32_t* HWY_RESTRICT keys, size_t num,
                uint32_t* HWY_RESTRICT buf) {
  SortTag<uint32_t> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
  Sort(d, st, keys, num, buf);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortU32Asc);
}  // namespace

void Sorter::operator()(uint32_t* HWY_RESTRICT keys, size_t n,
                        SortAscending) const {
  HWY_DYNAMIC_DISPATCH(SortU32Asc)(keys, n, Get<uint32_t>());
}

}  // namespace hwy
#endif  // HWY_ONCE
@@ -1,54 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u32d.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void SortU32Desc(uint32_t* HWY_RESTRICT keys, size_t num,
                 uint32_t* HWY_RESTRICT buf) {
  SortTag<uint32_t> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st;
  Sort(d, st, keys, num, buf);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortU32Desc);
}  // namespace

void Sorter::operator()(uint32_t* HWY_RESTRICT keys, size_t n,
                        SortDescending) const {
  HWY_DYNAMIC_DISPATCH(SortU32Desc)(keys, n, Get<uint32_t>());
}

}  // namespace hwy
#endif  // HWY_ONCE
@@ -1,54 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u64a.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void SortU64Asc(uint64_t* HWY_RESTRICT keys, size_t num,
                uint64_t* HWY_RESTRICT buf) {
  SortTag<uint64_t> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
  Sort(d, st, keys, num, buf);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortU64Asc);
}  // namespace

void Sorter::operator()(uint64_t* HWY_RESTRICT keys, size_t n,
                        SortAscending) const {
  HWY_DYNAMIC_DISPATCH(SortU64Asc)(keys, n, Get<uint64_t>());
}

}  // namespace hwy
#endif  // HWY_ONCE
@@ -1,54 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u64d.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void SortU64Desc(uint64_t* HWY_RESTRICT keys, size_t num,
                 uint64_t* HWY_RESTRICT buf) {
  SortTag<uint64_t> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st;
  Sort(d, st, keys, num, buf);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortU64Desc);
}  // namespace

void Sorter::operator()(uint64_t* HWY_RESTRICT keys, size_t n,
                        SortDescending) const {
  HWY_DYNAMIC_DISPATCH(SortU64Desc)(keys, n, Get<uint64_t>());
}

}  // namespace hwy
#endif  // HWY_ONCE
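Note: the sixteen deleted files above are stamped-out instances of one pattern: a translation unit per key type and sort order, each compiling the shared vqsort-inl.h implementation and exposing it behind a Sorter::operator() overload. A minimal usage sketch, assuming only the Sorter API visible in these hunks:

#include <cstdint>
#include <vector>

#include "hwy/contrib/sort/vqsort.h"

int main() {
  std::vector<int64_t> keys(1000);
  for (size_t i = 0; i < keys.size(); ++i) {
    keys[i] = static_cast<int64_t>(keys.size() - i);  // reverse order
  }
  // Each call dispatches to the best SIMD target via HWY_DYNAMIC_DISPATCH,
  // exactly as in the per-type translation units above.
  hwy::Sorter sorter;
  sorter(keys.data(), keys.size(), hwy::SortAscending());
  return keys.front() == 1 ? 0 : 1;
}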
@@ -106,6 +106,20 @@
//------------------------------------------------------------------------------
// Architecture

#if defined(HWY_EMULATE_SVE)

#define HWY_ARCH_X86_32 0
#define HWY_ARCH_X86_64 0
#define HWY_ARCH_X86 0
#define HWY_ARCH_PPC 0
#define HWY_ARCH_ARM_A64 1
#define HWY_ARCH_ARM_V7 0
#define HWY_ARCH_ARM 1
#define HWY_ARCH_WASM 0
#define HWY_ARCH_RVV 0

#else

#if defined(__i386__) || defined(_M_IX86)
#define HWY_ARCH_X86_32 1
#else

@@ -168,6 +182,8 @@
#define HWY_ARCH_RVV 0
#endif

#endif  // defined(HWY_EMULATE_SVE)

// It is an error to detect multiple architectures at the same time, but OK to
// detect none of the above.
#if (HWY_ARCH_X86 + HWY_ARCH_PPC + HWY_ARCH_ARM + HWY_ARCH_WASM + \
@@ -161,6 +161,11 @@
// user to override this without any guarantee of success.
#ifndef HWY_BASELINE_TARGETS

#if defined(HWY_EMULATE_SVE)
#define HWY_BASELINE_TARGETS HWY_SVE  // does not support SVE2
#define HWY_BASELINE_AVX3_DL 0
#else

// Also check HWY_ARCH to ensure that simulating unknown platforms ends up with
// HWY_TARGET == HWY_SCALAR.

@@ -181,7 +186,7 @@
#define HWY_BASELINE_PPC8 0
#endif

// SVE2 compiles, but is not yet tested.
// SVE compiles, but is not yet tested.
#if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE2)
#define HWY_BASELINE_SVE2 HWY_SVE2
#else

@@ -302,6 +307,8 @@
      HWY_BASELINE_SSE4 | HWY_BASELINE_AVX2 | HWY_BASELINE_AVX3 | \
      HWY_BASELINE_AVX3_DL | HWY_BASELINE_RVV)

#endif  // HWY_EMULATE_SVE

#else
// User already defined HWY_BASELINE_TARGETS, but we still need to define
// HWY_BASELINE_AVX3 (matching user's definition) for HWY_CHECK_AVX3_DL.
@@ -25,17 +25,15 @@
#include <numeric>  // iota

#include "hwy/aligned_allocator.h"
// Must come after foreach_target.h to avoid redefinition errors.
#include "hwy/highway.h"
#include "hwy/nanobenchmark.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// These templates are not found via ADL.
#if HWY_TARGET != HWY_SCALAR
using hwy::HWY_NAMESPACE::CombineShiftRightLanes;
using hwy::HWY_NAMESPACE::CombineShiftRightBytes;
#endif

class TwoArray {

@@ -89,14 +87,14 @@ void RunBenchmark(const char* caption) {
}

void Intro() {
  const float in[16] = {1, 2, 3, 4, 5, 6};
  float out[16];
  HWY_ALIGN const float in[16] = {1, 2, 3, 4, 5, 6};
  HWY_ALIGN float out[16];
  const ScalableTag<float> d;  // largest possible vector
  for (size_t i = 0; i < 16; i += Lanes(d)) {
    const auto vec = LoadU(d, in + i);  // no alignment requirement
    auto result = Mul(vec, vec);
    result = Add(result, result);  // can update if not const
    StoreU(result, d, out + i);
    const auto vec = Load(d, in + i);  // aligned!
    auto result = vec * vec;
    result += result;  // can update if not const
    Store(result, d, out + i);
  }
  printf("\nF(x)->2*x^2, F(%.0f) = %.1f\n", in[2], out[2]);
}

@@ -111,34 +109,32 @@ class BenchmarkDot : public TwoArray {
    const ScalableTag<float> d;
    const size_t N = Lanes(d);
    using V = decltype(Zero(d));
    constexpr size_t unroll = 8;
    // Compiler doesn't make independent sum* accumulators, so unroll manually.
    // We cannot use an array because V might be a sizeless type. For reasonable
    // code, we unroll 4x, but 8x might help (2 FMA ports * 4 cycle latency).
    V sum0 = Zero(d);
    V sum1 = Zero(d);
    V sum2 = Zero(d);
    V sum3 = Zero(d);
    // Some older compilers might not be able to fit the 8 arrays in registers,
    // so manual unrolling can be helpful if you run into this issue.
    // 2 FMA ports * 4 cycle latency = 8x unrolled.
    V sum[unroll];
    for (size_t i = 0; i < unroll; ++i) {
      sum[i] = Zero(d);
    }
    const float* const HWY_RESTRICT pa = &a_[0];
    const float* const HWY_RESTRICT pb = b_;
    for (size_t i = 0; i < num_items; i += 4 * N) {
      const auto a0 = Load(d, pa + i + 0 * N);
      const auto b0 = Load(d, pb + i + 0 * N);
      sum0 = MulAdd(a0, b0, sum0);
      const auto a1 = Load(d, pa + i + 1 * N);
      const auto b1 = Load(d, pb + i + 1 * N);
      sum1 = MulAdd(a1, b1, sum1);
      const auto a2 = Load(d, pa + i + 2 * N);
      const auto b2 = Load(d, pb + i + 2 * N);
      sum2 = MulAdd(a2, b2, sum2);
      const auto a3 = Load(d, pa + i + 3 * N);
      const auto b3 = Load(d, pb + i + 3 * N);
      sum3 = MulAdd(a3, b3, sum3);
    for (size_t i = 0; i < num_items; i += unroll * N) {
      for (size_t j = 0; j < unroll; ++j) {
        const auto a = Load(d, pa + i + j * N);
        const auto b = Load(d, pb + i + j * N);
        sum[j] = MulAdd(a, b, sum[j]);
      }
    }
    // Reduction tree: sum of all accumulators by pairs into sum0.
    sum0 = Add(sum0, sum1);
    sum2 = Add(sum2, sum3);
    sum0 = Add(sum0, sum2);
    dot_ = GetLane(SumOfLanes(d, sum0));
    // Reduction tree: sum of all accumulators by pairs into sum[0], then the
    // lanes.
    for (size_t power = 1; power < unroll; power *= 2) {
      for (size_t i = 0; i < unroll; i += 2 * power) {
        sum[i] += sum[i + power];
      }
    }
    dot_ = GetLane(SumOfLanes(d, sum[0]));
    return static_cast<FuncOutput>(dot_);
  }
  void Verify(size_t num_items) {

@@ -197,9 +193,9 @@ struct BenchmarkDelta : public TwoArray {
    auto prev = Load(df, &a_[0]);
    for (; i < num_items; i += Lanes(df)) {
      const auto a = Load(df, &a_[i]);
      const auto shifted = CombineShiftRightLanes<3>(df, a, prev);
      const auto shifted = CombineShiftRightLanes<3>(a, prev);
      prev = a;
      Store(Sub(a, shifted), df, &b_[i]);
      Store(a - shifted, df, &b_[i]);
    }
#endif
    return static_cast<FuncOutput>(b_[num_items - 1]);
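Note: the BenchmarkDot hunk above hinges on one comment: a single accumulator serializes every MulAdd on the previous one's latency, so several independent partial sums (2 FMA ports x 4-cycle latency suggests 8) keep the pipeline full, followed by a pairwise reduction. A scalar sketch of the same idea, independent of the Highway API (remainder handling omitted for brevity):

#include <cstddef>

// Dot product with kUnroll independent accumulators; the pairwise loop at the
// end mirrors the vector version's reduction tree into sum[0].
float DotUnrolled(const float* a, const float* b, size_t n) {
  constexpr size_t kUnroll = 8;  // 2 FMA ports * 4 cycle latency
  float sum[kUnroll] = {0};
  for (size_t i = 0; i + kUnroll <= n; i += kUnroll) {
    for (size_t j = 0; j < kUnroll; ++j) {
      sum[j] += a[i + j] * b[i + j];  // each j is its own dependency chain
    }
  }
  for (size_t power = 1; power < kUnroll; power *= 2) {
    for (size_t i = 0; i < kUnroll; i += 2 * power) {
      sum[i] += sum[i + power];
    }
  }
  return sum[0];
}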
@@ -24,14 +24,11 @@
// Generates code for each enabled target by re-including this source file.
#include "hwy/foreach_target.h"

// Must come after foreach_target.h to avoid redefinition errors.
#include "hwy/highway.h"

// Optional, can instead add HWY_ATTR to all functions.
HWY_BEFORE_NAMESPACE();
namespace skeleton {
// This namespace name is unique per target, which allows code for multiple
// targets to co-exist in the same translation unit.
namespace HWY_NAMESPACE {

// Highway ops reside here; ADL does not find templates nor builtins.

@@ -50,7 +47,7 @@ template <class DF>
ATTR_MSAN void OneFloorLog2(const DF df, const uint8_t* HWY_RESTRICT values,
                            uint8_t* HWY_RESTRICT log2) {
  // Type tags for converting to other element types (Rebind = same count).
  const RebindToSigned<DF> d32;
  const Rebind<int32_t, DF> d32;
  const Rebind<uint8_t, DF> d8;

  const auto u8 = Load(d8, values);

@@ -62,7 +59,7 @@ ATTR_MSAN void OneFloorLog2(const DF df, const uint8_t* HWY_RESTRICT values,
void CodepathDemo() {
  // Highway defaults to portability, but per-target codepaths may be selected
  // via #if HWY_TARGET == HWY_SSE4 or by testing capability macros:
#if HWY_HAVE_INTEGER64
#if HWY_CAP_INTEGER64
  const char* gather = "Has int64";
#else
  const char* gather = "No int64";

@@ -74,16 +71,20 @@ void FloorLog2(const uint8_t* HWY_RESTRICT values, size_t count,
               uint8_t* HWY_RESTRICT log2) {
  CodepathDemo();

  const ScalableTag<float> df;
  // Second argument is necessary on RVV until it supports fractional lengths.
  const ScalableTag<float, 2> df;

  const size_t N = Lanes(df);
  size_t i = 0;
  for (; i + N <= count; i += N) {
    OneFloorLog2(df, values + i, log2 + i);
  }
  // TODO(janwas): implement
#if HWY_TARGET != HWY_RVV
  for (; i < count; ++i) {
    CappedTag<float, 1> d1;
    OneFloorLog2(d1, values + i, log2 + i);
    OneFloorLog2(HWY_CAPPED(float, 1)(), values + i, log2 + i);
  }
#endif
}

// NOLINTNEXTLINE(google-readability-namespace-comments)

@@ -91,9 +92,6 @@ void FloorLog2(const uint8_t* HWY_RESTRICT values, size_t count,
}  // namespace skeleton
HWY_AFTER_NAMESPACE();

// The table of pointers to the various implementations in HWY_NAMESPACE must
// be compiled only once (foreach_target #includes this file multiple times).
// HWY_ONCE is true for only one of these 'compilation passes'.
#if HWY_ONCE

namespace skeleton {

@@ -107,8 +105,6 @@ HWY_EXPORT(FloorLog2);
// is equivalent to inlining this function.
void CallFloorLog2(const uint8_t* HWY_RESTRICT in, const size_t count,
                   uint8_t* HWY_RESTRICT out) {
  // This must reside outside of HWY_NAMESPACE because it references (calls the
  // appropriate one from) the per-target implementations there.
  return HWY_DYNAMIC_DISPATCH(FloorLog2)(in, count, out);
}
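Note: the export/dispatch pair in this hunk generalizes to any per-target function. A hedged sketch with a hypothetical MulBy2, assumed to be defined inside each HWY_NAMESPACE exactly like FloorLog2 above:

#if HWY_ONCE
namespace skeleton {

// Hypothetical function name; HWY_EXPORT builds its per-target pointer table
// on the single HWY_ONCE pass, and HWY_DYNAMIC_DISPATCH indexes that table by
// the detected CPU target.
HWY_EXPORT(MulBy2);

void CallMulBy2(const float* HWY_RESTRICT in, size_t count,
                float* HWY_RESTRICT out) {
  return HWY_DYNAMIC_DISPATCH(MulBy2)(in, count, out);
}

}  // namespace skeleton
#endif  // HWY_ONCE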
@@ -21,13 +21,10 @@
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "examples/skeleton_test.cc"
#include "hwy/foreach_target.h"

// Must come after foreach_target.h to avoid redefinition errors.
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"

// Optional: factor out parts of the implementation into *-inl.h
// (must also come after foreach_target.h to avoid redefinition errors)
#include "hwy/examples/skeleton-inl.h"

HWY_BEFORE_NAMESPACE();

@@ -53,7 +50,10 @@ struct TestFloorLog2 {
    CallFloorLog2(in.get(), count, out.get());
    int sum = 0;
    for (size_t i = 0; i < count; ++i) {
      // TODO(janwas): implement
#if HWY_TARGET != HWY_RVV
      HWY_ASSERT_EQ(expected[i], out[i]);
#endif
      sum += out[i];
    }
    hwy::PreventElision(sum);
@@ -74,28 +74,6 @@
#endif
#endif

#if (HWY_TARGETS & HWY_SVE) && (HWY_STATIC_TARGET != HWY_SVE)
#undef HWY_TARGET
#define HWY_TARGET HWY_SVE
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif

#if (HWY_TARGETS & HWY_SVE2) && (HWY_STATIC_TARGET != HWY_SVE2)
#undef HWY_TARGET
#define HWY_TARGET HWY_SVE2
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif

#if (HWY_TARGETS & HWY_SSSE3) && (HWY_STATIC_TARGET != HWY_SSSE3)
#undef HWY_TARGET
#define HWY_TARGET HWY_SSSE3
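Note: the deleted SVE/SVE2 blocks follow foreach_target.h's re-inclusion protocol: after each `#include HWY_TARGET_INCLUDE`, the toggle macro is flipped so that headers guarded by it are re-entered once per target pass. A sketch of the matching guard that a *-inl.h header uses (the guard name here is illustrative, not from this diff):

// Per-target include guard: re-include this header whenever the guard macro's
// definedness disagrees with HWY_TARGET_TOGGLE, i.e. once per target pass.
#if defined(MYLIB_OPS_INL_H_) == defined(HWY_TARGET_TOGGLE)
#ifdef MYLIB_OPS_INL_H_
#undef MYLIB_OPS_INL_H_
#else
#define MYLIB_OPS_INL_H_
#endif

// ... per-target ops, compiled once for every target in HWY_TARGETS ...

#endif  // per-target include guard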
@@ -27,7 +27,7 @@ namespace hwy {

// API version (https://semver.org/); keep in sync with CMakeLists.txt.
#define HWY_MAJOR 0
#define HWY_MINOR 16
#define HWY_MINOR 15
#define HWY_PATCH 0

//------------------------------------------------------------------------------

@@ -37,9 +37,7 @@ namespace hwy {

// HWY_FULL(T[,LMUL=1]) is a native vector/group. LMUL is the number of
// registers in the group, and is ignored on targets that do not support groups.
#define HWY_FULL1(T) hwy::HWY_NAMESPACE::ScalableTag<T>
#define HWY_FULL2(T, LMUL) \
  hwy::HWY_NAMESPACE::ScalableTag<T, CeilLog2(HWY_MAX(0, LMUL))>
#define HWY_FULL1(T) hwy::HWY_NAMESPACE::Simd<T, HWY_LANES(T)>
#define HWY_3TH_ARG(arg1, arg2, arg3, ...) arg3
// Workaround for MSVC grouping __VA_ARGS__ into a single argument
#define HWY_FULL_RECOMPOSER(args_with_paren) HWY_3TH_ARG args_with_paren

@@ -48,9 +46,9 @@ namespace hwy {
  HWY_FULL_RECOMPOSER((__VA_ARGS__, HWY_FULL2, HWY_FULL1, ))
#define HWY_FULL(...) HWY_CHOOSE_FULL(__VA_ARGS__())(__VA_ARGS__)

// Vector of up to MAX_N lanes. It's better to use full vectors where possible.
// Vector of up to MAX_N lanes. Discouraged, when possible, use Half<> instead.
#define HWY_CAPPED(T, MAX_N) \
  hwy::HWY_NAMESPACE::CappedTag<T, HWY_MIN(MAX_N, HWY_LANES(T))>
  hwy::HWY_NAMESPACE::Simd<T, HWY_MIN(MAX_N, HWY_LANES(T))>

//------------------------------------------------------------------------------
// Export user functions for static/dynamic dispatch

@@ -111,7 +109,6 @@ struct FunctionCache {
  template <FunctionType* const table[]>
  static RetType ChooseAndCall(Args... args) {
    // If we are running here it means we need to update the chosen target.
    ChosenTarget& chosen_target = GetChosenTarget();
    chosen_target.Update();
    return (table[chosen_target.GetIndex()])(args...);
  }

@@ -266,15 +263,10 @@ FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
      HWY_CHOOSE_SCALAR(FUNC_NAME), \
  }
#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) \
  (*(HWY_DISPATCH_TABLE(FUNC_NAME)[hwy::GetChosenTarget().GetIndex()]))
  (*(HWY_DISPATCH_TABLE(FUNC_NAME)[hwy::chosen_target.GetIndex()]))

#endif  // HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0)

// DEPRECATED names; please use HWY_HAVE_* instead.
#define HWY_CAP_INTEGER64 HWY_HAVE_INTEGER64
#define HWY_CAP_FLOAT16 HWY_HAVE_FLOAT16
#define HWY_CAP_FLOAT64 HWY_HAVE_FLOAT64

}  // namespace hwy

#endif  // HWY_HIGHWAY_INCLUDED

@@ -291,6 +283,13 @@ FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
#define HWY_HIGHWAY_PER_TARGET
#endif

#undef HWY_FULL2
#if HWY_TARGET == HWY_RVV
#define HWY_FULL2(T, LMUL) hwy::HWY_NAMESPACE::Simd<T, HWY_LANES(T) * (LMUL)>
#else
#define HWY_FULL2(T, LMUL) hwy::HWY_NAMESPACE::Simd<T, HWY_LANES(T)>
#endif

// These define ops inside namespace hwy::HWY_NAMESPACE.
#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
#include "hwy/ops/x86_128-inl.h"
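Note: the HWY_FULL hunk relies on a preprocessor arity-dispatch trick: appending candidate macro names to `__VA_ARGS__` and selecting the third argument picks HWY_FULL2 for two user arguments and HWY_FULL1 for one; the "recomposer" indirection works around MSVC treating `__VA_ARGS__` as a single token. A self-contained sketch of the same idiom (all names here are illustrative):

#include <cstdio>

#define PICK_3RD(arg1, arg2, arg3, ...) arg3
// MSVC workaround: force a re-scan so __VA_ARGS__ splits into arguments.
#define RECOMPOSE(args_with_paren) PICK_3RD args_with_paren
#define CHOOSE(...) RECOMPOSE((__VA_ARGS__, TWO_ARGS, ONE_ARG, ))
#define ONE_ARG(x) printf("one: %d\n", x)
#define TWO_ARGS(x, y) printf("two: %d %d\n", x, y)
#define DISPATCH(...) CHOOSE(__VA_ARGS__)(__VA_ARGS__)

int main() {
  DISPATCH(1);     // expands to ONE_ARG(1)
  DISPATCH(1, 2);  // expands to TWO_ARGS(1, 2)
}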
@@ -1,106 +0,0 @@
// Pseudo-generated file to handle both cmake & bazel build systems.

// Initial generation done using cmake code:
// include(GenerateExportHeader)
// generate_export_header(hwy EXPORT_MACRO_NAME HWY_DLLEXPORT EXPORT_FILE_NAME
// hwy/highway_export.h)
// code reformatted using clang-format --style=Google

#ifndef HWY_DLLEXPORT_H
#define HWY_DLLEXPORT_H

// Bazel builds are always static:
#if !defined(HWY_SHARED_DEFINE) && !defined(HWY_STATIC_DEFINE)
#define HWY_STATIC_DEFINE
#endif

#ifdef HWY_STATIC_DEFINE
#define HWY_DLLEXPORT
#define HWY_NO_EXPORT
#define HWY_CONTRIB_DLLEXPORT
#define HWY_CONTRIB_NO_EXPORT
#define HWY_TEST_DLLEXPORT
#define HWY_TEST_NO_EXPORT
#else

#ifndef HWY_DLLEXPORT
#if defined(hwy_EXPORTS)
/* We are building this library */
#ifdef _WIN32
#define HWY_DLLEXPORT __declspec(dllexport)
#else
#define HWY_DLLEXPORT __attribute__((visibility("default")))
#endif
#else
/* We are using this library */
#ifdef _WIN32
#define HWY_DLLEXPORT __declspec(dllimport)
#else
#define HWY_DLLEXPORT __attribute__((visibility("default")))
#endif
#endif
#endif

#ifndef HWY_NO_EXPORT
#ifdef _WIN32
#define HWY_NO_EXPORT
#else
#define HWY_NO_EXPORT __attribute__((visibility("hidden")))
#endif
#endif

#ifndef HWY_CONTRIB_DLLEXPORT
#if defined(hwy_contrib_EXPORTS)
/* We are building this library */
#ifdef _WIN32
#define HWY_CONTRIB_DLLEXPORT __declspec(dllexport)
#else
#define HWY_CONTRIB_DLLEXPORT __attribute__((visibility("default")))
#endif
#else
/* We are using this library */
#ifdef _WIN32
#define HWY_CONTRIB_DLLEXPORT __declspec(dllimport)
#else
#define HWY_CONTRIB_DLLEXPORT __attribute__((visibility("default")))
#endif
#endif
#endif

#ifndef HWY_CONTRIB_NO_EXPORT
#ifdef _WIN32
#define HWY_CONTRIB_NO_EXPORT
#else
#define HWY_CONTRIB_NO_EXPORT __attribute__((visibility("hidden")))
#endif
#endif

#ifndef HWY_TEST_DLLEXPORT
#if defined(hwy_test_EXPORTS)
/* We are building this library */
#ifdef _WIN32
#define HWY_TEST_DLLEXPORT __declspec(dllexport)
#else
#define HWY_TEST_DLLEXPORT __attribute__((visibility("default")))
#endif
#else
/* We are using this library */
#ifdef _WIN32
#define HWY_TEST_DLLEXPORT __declspec(dllimport)
#else
#define HWY_TEST_DLLEXPORT __attribute__((visibility("default")))
#endif
#endif
#endif

#ifndef HWY_TEST_NO_EXPORT
#ifdef _WIN32
#define HWY_TEST_NO_EXPORT
#else
#define HWY_TEST_NO_EXPORT __attribute__((visibility("hidden")))
#endif
#endif

#endif

#endif /* HWY_DLLEXPORT_H */
@@ -15,8 +15,6 @@
#include <stddef.h>
#include <stdint.h>

#include <bitset>

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "highway_test.cc"
#include "hwy/foreach_target.h"

@@ -28,53 +26,6 @@ HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// For testing that ForPartialVectors reaches every possible size:
using NumLanesSet = std::bitset<HWY_MAX_BYTES + 1>;

// Monostate pattern because ForPartialVectors takes a template argument, not a
// functor by reference.
static NumLanesSet* NumLanesForSize(size_t sizeof_t) {
  HWY_ASSERT(sizeof_t <= sizeof(uint64_t));
  static NumLanesSet num_lanes[sizeof(uint64_t) + 1];
  return num_lanes + sizeof_t;
}
static size_t* MaxLanesForSize(size_t sizeof_t) {
  HWY_ASSERT(sizeof_t <= sizeof(uint64_t));
  static size_t num_lanes[sizeof(uint64_t) + 1] = {0};
  return num_lanes + sizeof_t;
}

struct TestMaxLanes {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    const size_t kMax = MaxLanes(d);
    HWY_ASSERT(N <= kMax);
    HWY_ASSERT(kMax <= (HWY_MAX_BYTES / sizeof(T)));

    NumLanesForSize(sizeof(T))->set(N);
    *MaxLanesForSize(sizeof(T)) = HWY_MAX(*MaxLanesForSize(sizeof(T)), N);
  }
};

HWY_NOINLINE void TestAllMaxLanes() {
  ForAllTypes(ForPartialVectors<TestMaxLanes>());

  // Ensure ForPartialVectors visited all powers of two [1, N].
  for (size_t sizeof_t : {sizeof(uint8_t), sizeof(uint16_t), sizeof(uint32_t),
                          sizeof(uint64_t)}) {
    const size_t N = *MaxLanesForSize(sizeof_t);
    for (size_t i = 1; i <= N; i += i) {
      if (!NumLanesForSize(sizeof_t)->test(i)) {
        fprintf(stderr, "T=%d: did not visit for N=%d, max=%d\n",
                static_cast<int>(sizeof_t), static_cast<int>(i),
                static_cast<int>(N));
        HWY_ASSERT(false);
      }
    }
  }
}

struct TestSet {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {

@@ -371,7 +322,6 @@ HWY_AFTER_NAMESPACE();

namespace hwy {
HWY_BEFORE_TEST(HighwayTest);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllMaxLanes);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllSet);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllOverflow);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllClamp);
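Note: the deleted TestMaxLanes code demonstrates the monostate workaround it mentions: ForPartialVectors instantiates the functor from a template argument, so there is no object to hold state, and results are accumulated in function-local statics instead. A reduced sketch of that idiom:

#include <bitset>
#include <cstddef>

// All functor instances share this function-local static, so a framework that
// default-constructs the functor per call still sees one accumulated result.
static std::bitset<64>* VisitedLanes() {
  static std::bitset<64> visited;
  return &visited;
}

struct RecordLanes {
  void operator()(size_t num_lanes) const {
    if (num_lanes < 64) VisitedLanes()->set(num_lanes);
  }
};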
@@ -1,19 +0,0 @@
HWY_0 {
  global:
    extern "C++" {
      *hwy::*;
    };

  local:
    # Hide all the std namespace symbols. std namespace is explicitly marked
    # as visibility(default) and header-only functions or methods (such as those
    # from templates) should be exposed in shared libraries as weak symbols but
    # this is only needed when we expose those types in the shared library API
    # in any way. We don't use C++ std types in the API and we also don't
    # support exceptions in the library.
    # See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=36022 for a discussion
    # about this.
    extern "C++" {
      *std::*;
    };
};
@@ -37,7 +37,7 @@
#include <windows.h>
#endif

#if defined(__APPLE__)
#if defined(__MACH__)
#include <mach/mach.h>
#include <mach/mach_time.h>
#endif

@@ -148,7 +148,7 @@ inline Ticks Start() {
  LARGE_INTEGER counter;
  (void)QueryPerformanceCounter(&counter);
  t = counter.QuadPart;
#elif defined(__APPLE__)
#elif defined(__MACH__)
  t = mach_absolute_time();
#elif defined(__HAIKU__)
  t = system_time_nsecs();  // since boot

@@ -405,7 +405,7 @@ double NominalClockRate() {

}  // namespace

HWY_DLLEXPORT double InvariantTicksPerSecond() {
double InvariantTicksPerSecond() {
#if HWY_ARCH_PPC && defined(__GLIBC__)
  return double(__ppc_get_timebase_freq());
#elif HWY_ARCH_X86

@@ -415,7 +415,7 @@ HWY_DLLEXPORT double InvariantTicksPerSecond() {
  LARGE_INTEGER freq;
  (void)QueryPerformanceFrequency(&freq);
  return double(freq.QuadPart);
#elif defined(__APPLE__)
#elif defined(__MACH__)
  // https://developer.apple.com/library/mac/qa/qa1398/_index.html
  mach_timebase_info_data_t timebase;
  (void)mach_timebase_info(&timebase);

@@ -426,12 +426,12 @@ HWY_DLLEXPORT double InvariantTicksPerSecond() {
#endif
}

HWY_DLLEXPORT double Now() {
double Now() {
  static const double mul = 1.0 / InvariantTicksPerSecond();
  return static_cast<double>(timer::Start()) * mul;
}

HWY_DLLEXPORT uint64_t TimerResolution() {
uint64_t TimerResolution() {
  // Nested loop avoids exceeding stack/L1 capacity.
  timer::Ticks repetitions[Params::kTimerSamples];
  for (size_t rep = 0; rep < Params::kTimerSamples; ++rep) {

@@ -656,11 +656,10 @@ timer::Ticks Overhead(const uint8_t* arg, const InputVec* inputs,

}  // namespace

HWY_DLLEXPORT int Unpredictable1() { return timer::Start() != ~0ULL; }
int Unpredictable1() { return timer::Start() != ~0ULL; }

HWY_DLLEXPORT size_t Measure(const Func func, const uint8_t* arg,
                             const FuncInput* inputs, const size_t num_inputs,
                             Result* results, const Params& p) {
size_t Measure(const Func func, const uint8_t* arg, const FuncInput* inputs,
               const size_t num_inputs, Result* results, const Params& p) {
  NANOBENCHMARK_CHECK(num_inputs != 0);

#if HWY_ARCH_X86
@@ -47,8 +47,6 @@
#include <stddef.h>
#include <stdint.h>

#include "hwy/highway_export.h"

// Enables sanity checks that verify correct operation at the cost of
// longer benchmark runs.
#ifndef NANOBENCHMARK_ENABLE_CHECKS

@@ -74,23 +72,23 @@ namespace platform {
// Returns tick rate, useful for converting measurements to seconds. Invariant
// means the tick counter frequency is independent of CPU throttling or sleep.
// This call may be expensive, callers should cache the result.
HWY_DLLEXPORT double InvariantTicksPerSecond();
double InvariantTicksPerSecond();

// Returns current timestamp [in seconds] relative to an unspecified origin.
// Features: monotonic (no negative elapsed time), steady (unaffected by system
// time changes), high-resolution (on the order of microseconds).
HWY_DLLEXPORT double Now();
double Now();

// Returns ticks elapsed in back to back timer calls, i.e. a function of the
// timer resolution (minimum measurable difference) and overhead.
// This call is expensive, callers should cache the result.
HWY_DLLEXPORT uint64_t TimerResolution();
uint64_t TimerResolution();

}  // namespace platform

// Returns 1, but without the compiler knowing what the value is. This prevents
// optimizing out code.
HWY_DLLEXPORT int Unpredictable1();
int Unpredictable1();

// Input influencing the function being measured (e.g. number of bytes to copy).
using FuncInput = size_t;

@@ -166,9 +164,9 @@ struct Result {
// uniform distribution over [0, 4) could be represented as {3,0,2,1}.
// Returns how many Result were written to "results": one per unique input, or
// zero if the measurement failed (an error message goes to stderr).
HWY_DLLEXPORT size_t Measure(const Func func, const uint8_t* arg,
                             const FuncInput* inputs, const size_t num_inputs,
                             Result* results, const Params& p = Params());
size_t Measure(const Func func, const uint8_t* arg, const FuncInput* inputs,
               const size_t num_inputs, Result* results,
               const Params& p = Params());

// Calls operator() of the given closure (lambda function).
template <class Closure>
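Note: the header comments above give the caching advice directly ("This call is expensive, callers should cache the result"). A short usage sketch against the platform API declared in this hunk:

#include <cstdint>
#include <cstdio>

#include "hwy/nanobenchmark.h"

int main() {
  // Expensive calls: do them once and reuse.
  static const double ticks_per_sec = hwy::platform::InvariantTicksPerSecond();
  static const uint64_t resolution = hwy::platform::TimerResolution();

  const double t0 = hwy::platform::Now();
  double x = 0.0;
  for (int i = 0; i < 1000000; ++i) x += i * 1e-9;
  const double t1 = hwy::platform::Now();

  printf("%.6f s elapsed; %.0f ticks/s, resolution %llu ticks, x=%.3f\n",
         t1 - t0, ticks_per_sec, (unsigned long long)resolution, x);
  return 0;
}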
(File diff not shown because it is too large.)
(File diff not shown because it is too large.)
@ -19,14 +19,14 @@ HWY_BEFORE_NAMESPACE();
|
|||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
// The lane type of a vector type, e.g. float for Vec<ScalableTag<float>>.
|
||||
// The lane type of a vector type, e.g. float for Vec<Simd<float, 4>>.
|
||||
template <class V>
|
||||
using LaneType = decltype(GetLane(V()));
|
||||
|
||||
// Vector type, e.g. Vec128<float> for CappedTag<float, 4>. Useful as the return
|
||||
// type of functions that do not take a vector argument, or as an argument type
|
||||
// if the function only has a template argument for D, or for explicit type
|
||||
// names instead of auto. This may be a built-in type.
|
||||
// Vector type, e.g. Vec128<float> for Simd<float, 4>. Useful as the return type
|
||||
// of functions that do not take a vector argument, or as an argument type if
|
||||
// the function only has a template argument for D, or for explicit type names
|
||||
// instead of auto. This may be a built-in type.
|
||||
template <class D>
|
||||
using Vec = decltype(Zero(D()));
|
||||
|
||||
|
@ -53,6 +53,12 @@ HWY_API V CombineShiftRightLanes(D d, const V hi, const V lo) {
  return CombineShiftRightBytes<kBytes>(d, hi, lo);
}

// DEPRECATED
template <size_t kLanes, class V>
HWY_API V CombineShiftRightLanes(const V hi, const V lo) {
  return CombineShiftRightLanes<kLanes>(DFromV<V>(), hi, lo);
}

#endif

// Returns lanes with the most significant bit set and all other bits zero.

@ -202,15 +208,6 @@ HWY_API V AESRound(V state, const V round_key) {
  return state;
}

template <class V>  // u8
HWY_API V AESLastRound(V state, const V round_key) {
  // Like AESRound, but without MixColumns.
  state = detail::SubBytes(state);
  state = detail::ShiftRows(state);
  state = Xor(state, round_key);  // AddRoundKey
  return state;
}

// Constant-time implementation inspired by
// https://www.bearssl.org/constanttime.html, but about half the cost because we
// use 64x64 multiplies and 128-bit XORs.

@ -281,47 +278,23 @@ HWY_API V CLMulUpper(V a, V b) {
#define HWY_NATIVE_POPCNT
#endif

#if HWY_TARGET == HWY_RVV
#define HWY_MIN_POW2_FOR_128 1
#else
// All other targets except HWY_SCALAR (which is excluded by HWY_IF_GE128_D)
// guarantee 128 bits anyway.
#define HWY_MIN_POW2_FOR_128 0
#endif

// This algorithm requires vectors to be at least 16 bytes, which is the case
// for LMUL >= 2. If not, use the fallback below.
template <typename V, HWY_IF_LANES_ARE(uint8_t, V), HWY_IF_GE128_D(DFromV<V>),
          HWY_IF_POW2_GE(DFromV<V>, HWY_MIN_POW2_FOR_128)>
template <typename V, HWY_IF_LANES_ARE(uint8_t, V)>
HWY_API V PopulationCount(V v) {
  const DFromV<V> d;
  constexpr DFromV<V> d;
  HWY_ALIGN constexpr uint8_t kLookup[16] = {
      0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
  };
  const auto lo = And(v, Set(d, 0xF));
  const auto hi = ShiftRight<4>(v);
  const auto lookup = LoadDup128(d, kLookup);
  auto lo = And(v, Set(d, 0xF));
  auto hi = ShiftRight<4>(v);
  auto lookup = LoadDup128(Simd<uint8_t, HWY_MAX(16, MaxLanes(d))>(), kLookup);
  return Add(TableLookupBytes(lookup, hi), TableLookupBytes(lookup, lo));
}

// RVV has a specialization that avoids the Set().
#if HWY_TARGET != HWY_RVV
// Slower fallback for capped vectors.
template <typename V, HWY_IF_LANES_ARE(uint8_t, V), HWY_IF_LT128_D(DFromV<V>)>
HWY_API V PopulationCount(V v) {
  const DFromV<V> d;
  // See https://arxiv.org/pdf/1611.07612.pdf, Figure 3
  v = Sub(v, And(ShiftRight<1>(v), Set(d, 0x55)));
  v = Add(And(ShiftRight<2>(v), Set(d, 0x33)), And(v, Set(d, 0x33)));
  return And(Add(v, ShiftRight<4>(v)), Set(d, 0x0F));
}
#endif  // HWY_TARGET != HWY_RVV

template <typename V, HWY_IF_LANES_ARE(uint16_t, V)>
HWY_API V PopulationCount(V v) {
  const DFromV<V> d;
  const Repartition<uint8_t, decltype(d)> d8;
  const auto vals = BitCast(d, PopulationCount(BitCast(d8, v)));
  Repartition<uint8_t, decltype(d)> d8;
  auto vals = BitCast(d, PopulationCount(BitCast(d8, v)));
  return Add(ShiftRight<8>(vals), And(vals, Set(d, 0xFF)));
}

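Editor's aside, not part of the diff: the two strategies above in scalar form, so the arithmetic can be checked by hand.

#include <cstdint>

// 1) Nibble LUT, mirroring the 16-entry kLookup table:
inline uint8_t PopCountLut(uint8_t x) {
  static constexpr uint8_t kLookup[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                          1, 2, 2, 3, 2, 3, 3, 4};
  return kLookup[x & 0xF] + kLookup[x >> 4];
}

// 2) The bit trick from arxiv.org/pdf/1611.07612.pdf, Figure 3. For x = 0xFF:
// 0xAA (pairs of 2) -> 0x44 (nibbles of 4) -> (0x44 + 0x04) & 0x0F = 8.
inline uint8_t PopCountSwar(uint8_t x) {
  x = x - ((x >> 1) & 0x55);           // 2-bit partial counts
  x = ((x >> 2) & 0x33) + (x & 0x33);  // 4-bit partial counts
  return (x + (x >> 4)) & 0x0F;        // sum the two nibbles
}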
@ -333,7 +306,7 @@ HWY_API V PopulationCount(V v) {
  return Add(ShiftRight<16>(vals), And(vals, Set(d, 0xFF)));
}

#if HWY_HAVE_INTEGER64
#if HWY_CAP_INTEGER64
template <typename V, HWY_IF_LANES_ARE(uint64_t, V)>
HWY_API V PopulationCount(V v) {
  const DFromV<V> d;
Diff not shown because of its size.
@ -27,7 +27,7 @@ namespace HWY_NAMESPACE {

// Single instruction, single data.
template <typename T>
using Sisd = Simd<T, 1, 0>;
using Sisd = Simd<T, 1>;

// (Wrapper class required for overloading comparison operators.)
template <typename T>

@ -187,20 +187,6 @@ HWY_API Vec1<T> operator^(const Vec1<T> a, const Vec1<T> b) {
  return Xor(a, b);
}

// ------------------------------ OrAnd

template <typename T>
HWY_API Vec1<T> OrAnd(const Vec1<T> o, const Vec1<T> a1, const Vec1<T> a2) {
  return Or(o, And(a1, a2));
}

// ------------------------------ IfVecThenElse

template <typename T>
HWY_API Vec1<T> IfVecThenElse(Vec1<T> mask, Vec1<T> yes, Vec1<T> no) {
  return IfThenElse(MaskFromVec(mask), yes, no);
}

// ------------------------------ CopySign

template <typename T>

@ -289,11 +275,6 @@ HWY_API Vec1<T> IfThenZeroElse(const Mask1<T> mask, const Vec1<T> no) {
  return mask.bits ? Vec1<T>(0) : no;
}

template <typename T>
HWY_API Vec1<T> IfNegativeThenElse(Vec1<T> v, Vec1<T> yes, Vec1<T> no) {
  return v.raw < 0 ? yes : no;
}

template <typename T>
HWY_API Vec1<T> ZeroIfNegative(const Vec1<T> v) {
  return v.raw < 0 ? Vec1<T>(0) : v;

@ -442,13 +423,7 @@ HWY_API Vec1<double> operator-(const Vec1<double> a, const Vec1<double> b) {
  return Vec1<double>(a.raw - b.raw);
}

// ------------------------------ SumsOf8

HWY_API Vec1<uint64_t> SumsOf8(const Vec1<uint8_t> v) {
  return Vec1<uint64_t>(v.raw);
}

// ------------------------------ SaturatedAdd
// ------------------------------ Saturating addition

// Returns a + b clamped to the destination range.

@ -956,30 +931,21 @@ HWY_API Vec1<ToT> PromoteTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
  return Vec1<ToT>(static_cast<ToT>(from.raw));
}

// MSVC 19.10 cannot deduce the argument type if HWY_IF_FLOAT(FromT) is here,
// so we overload for FromT=double and ToT={float,int32_t}.
HWY_API Vec1<float> DemoteTo(Sisd<float> /* tag */, Vec1<double> from) {
template <typename FromT, typename ToT, HWY_IF_FLOAT(FromT)>
HWY_API Vec1<ToT> DemoteTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
  static_assert(sizeof(ToT) < sizeof(FromT), "Not demoting");

  // Prevent ubsan errors when converting float to narrower integer/float
  if (std::isinf(from.raw) ||
      std::fabs(from.raw) > static_cast<double>(HighestValue<float>())) {
    return Vec1<float>(std::signbit(from.raw) ? LowestValue<float>()
                                              : HighestValue<float>());
      std::fabs(from.raw) > static_cast<FromT>(HighestValue<ToT>())) {
    return Vec1<ToT>(std::signbit(from.raw) ? LowestValue<ToT>()
                                            : HighestValue<ToT>());
  }
  return Vec1<float>(static_cast<float>(from.raw));
}
HWY_API Vec1<int32_t> DemoteTo(Sisd<int32_t> /* tag */, Vec1<double> from) {
  // Prevent ubsan errors when converting double to narrower integer/int32_t
  if (std::isinf(from.raw) ||
      std::fabs(from.raw) > static_cast<double>(HighestValue<int32_t>())) {
    return Vec1<int32_t>(std::signbit(from.raw) ? LowestValue<int32_t>()
                                                : HighestValue<int32_t>());
  }
  return Vec1<int32_t>(static_cast<int32_t>(from.raw));
  return Vec1<ToT>(static_cast<ToT>(from.raw));
}

template <typename FromT, typename ToT>
template <typename FromT, typename ToT, HWY_IF_NOT_FLOAT(FromT)>
HWY_API Vec1<ToT> DemoteTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
  static_assert(!IsFloat<FromT>(), "FromT=double are handled above");
  static_assert(sizeof(ToT) < sizeof(FromT), "Not demoting");

  // Int to int: choose closest value in ToT to `from` (avoids UB)

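Editor's note, not part of the diff: both overloads saturate out-of-range inputs instead of invoking undefined behavior. For example, DemoteTo(Sisd<float>(), Vec1<double>(1e40)) yields HighestValue<float>(), and DemoteTo(Sisd<int32_t>(), Vec1<double>(-1e300)) yields LowestValue<int32_t>().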
@ -1117,12 +1083,6 @@ HWY_API T GetLane(const Vec1<T> v) {
  return v.raw;
}

template <typename T>
HWY_API Vec1<T> DupEven(Vec1<T> v) {
  return v;
}
// DupOdd is unsupported.

template <typename T>
HWY_API Vec1<T> OddEven(Vec1<T> /* odd */, Vec1<T> even) {
  return even;

@ -1165,14 +1125,6 @@ HWY_API Vec1<T> TableLookupLanes(const Vec1<T> v, const Indices1<T> /* idx */) {
  return v;
}

// ------------------------------ ReverseBlocks

// Single block: no change
template <typename T>
HWY_API Vec1<T> ReverseBlocks(Sisd<T> /* tag */, const Vec1<T> v) {
  return v;
}

// ------------------------------ Reverse

template <typename T>

@ -1180,21 +1132,6 @@ HWY_API Vec1<T> Reverse(Sisd<T> /* tag */, const Vec1<T> v) {
  return v;
}

template <typename T>
HWY_API Vec1<T> Reverse2(Sisd<T> /* tag */, const Vec1<T> v) {
  return v;
}

template <typename T>
HWY_API Vec1<T> Reverse4(Sisd<T> /* tag */, const Vec1<T> v) {
  return v;
}

template <typename T>
HWY_API Vec1<T> Reverse8(Sisd<T> /* tag */, const Vec1<T> v) {
  return v;
}

// ================================================== BLOCKWISE
// Shift*Bytes, CombineShiftRightBytes, Interleave*, Shuffle* are unsupported.

@ -1371,6 +1308,41 @@ HWY_API Vec1<T> MaxOfLanes(Sisd<T> /* tag */, const Vec1<T> v) {
  return v;
}

// ================================================== DEPRECATED

template <typename T>
HWY_API size_t StoreMaskBits(const Mask1<T> mask, uint8_t* bits) {
  return StoreMaskBits(Sisd<T>(), mask, bits);
}

template <typename T>
HWY_API bool AllTrue(const Mask1<T> mask) {
  return AllTrue(Sisd<T>(), mask);
}

template <typename T>
HWY_API bool AllFalse(const Mask1<T> mask) {
  return AllFalse(Sisd<T>(), mask);
}

template <typename T>
HWY_API size_t CountTrue(const Mask1<T> mask) {
  return CountTrue(Sisd<T>(), mask);
}

template <typename T>
HWY_API Vec1<T> SumOfLanes(const Vec1<T> v) {
  return SumOfLanes(Sisd<T>(), v);
}
template <typename T>
HWY_API Vec1<T> MinOfLanes(const Vec1<T> v) {
  return MinOfLanes(Sisd<T>(), v);
}
template <typename T>
HWY_API Vec1<T> MaxOfLanes(const Vec1<T> v) {
  return MaxOfLanes(Sisd<T>(), v);
}

// ================================================== Operator wrapper

template <class V>
@ -32,10 +32,9 @@
#undef HWY_MAX_BYTES
#undef HWY_LANES

#undef HWY_HAVE_SCALABLE
#undef HWY_HAVE_INTEGER64
#undef HWY_HAVE_FLOAT16
#undef HWY_HAVE_FLOAT64
#undef HWY_CAP_INTEGER64
#undef HWY_CAP_FLOAT16
#undef HWY_CAP_FLOAT64
#undef HWY_CAP_GE256
#undef HWY_CAP_GE512

@ -80,10 +79,9 @@
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 1
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT16 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_AES 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

@ -98,10 +96,9 @@
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 1
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT16 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

@ -116,10 +113,9 @@
#define HWY_MAX_BYTES 32
#define HWY_LANES(T) (32 / sizeof(T))

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 1
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT16 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 1
#define HWY_CAP_GE512 0

@ -133,10 +129,9 @@
#define HWY_MAX_BYTES 64
#define HWY_LANES(T) (64 / sizeof(T))

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 1
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT16 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 1
#define HWY_CAP_GE512 1

@ -164,10 +159,9 @@
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 0
#define HWY_HAVE_FLOAT64 1
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT16 0
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

@ -183,16 +177,15 @@
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT16 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

#if HWY_ARCH_ARM_A64
#define HWY_HAVE_FLOAT64 1
#define HWY_CAP_FLOAT64 1
#else
#define HWY_HAVE_FLOAT64 0
#define HWY_CAP_FLOAT64 0
#endif

#define HWY_NAMESPACE N_NEON

@ -203,19 +196,23 @@
// SVE[2]
#elif HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE

#if defined(HWY_EMULATE_SVE) && !defined(__F16C__)
#error "Disable HWY_CAP_FLOAT16 or ensure farm_sve actually converts to f16"
#endif

// SVE only requires lane alignment, not natural alignment of the entire vector.
#define HWY_ALIGN alignas(8)

#define HWY_MAX_BYTES 256

// Value ensures MaxLanes() is the tightest possible upper bound to reduce
// overallocation.
#define HWY_LANES(T) ((HWY_MAX_BYTES) / sizeof(T))
// <= HWY_MAX_BYTES / sizeof(T): exact size. Otherwise a fraction 1/div (div =
// 1,2,4,8) is encoded as HWY_LANES(T) / div. This value leaves enough room for
// div=8 and demoting to 1/8 the lane width while still exceeding HWY_MAX_BYTES.
#define HWY_LANES(T) (32768 / sizeof(T))

#define HWY_HAVE_SCALABLE 1
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 1
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT16 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
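Editor's gloss, not part of the diff: with the pre-backout HWY_LANES(T) = 32768 / sizeof(T), HWY_LANES(float) is 8192; a 1/2 fraction is encoded as 8192 / 2 = 4096 and a 1/8 fraction as 8192 / 8 = 1024. Exact sizes top out at HWY_MAX_BYTES / sizeof(float) = 64, so every encoded fraction stays above that range and the two encodings cannot collide.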
@ -235,10 +232,9 @@
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 0
#define HWY_CAP_INTEGER64 0
#define HWY_CAP_FLOAT16 1
#define HWY_CAP_FLOAT64 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

@ -254,10 +250,9 @@
#define HWY_MAX_BYTES 32
#define HWY_LANES(T) (32 / sizeof(T))

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 0
#define HWY_CAP_INTEGER64 0
#define HWY_CAP_FLOAT16 1
#define HWY_CAP_FLOAT64 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

@ -276,20 +271,20 @@
// The spec requires VLEN <= 2^16 bits, so the limit is 2^16 bytes (LMUL=8).
#define HWY_MAX_BYTES 65536

// = HWY_MAX_BYTES divided by max LMUL=8 because MaxLanes includes the actual
// LMUL. This is the tightest possible upper bound.
#define HWY_LANES(T) (8192 / sizeof(T))
// <= HWY_MAX_BYTES / sizeof(T): exact size. Otherwise a fraction 1/div (div =
// 1,2,4,8) is encoded as HWY_LANES(T) / div. This value leaves enough room for
// div=8 and demoting to 1/8 the lane width while still exceeding HWY_MAX_BYTES.
#define HWY_LANES(T) (8388608 / sizeof(T))

#define HWY_HAVE_SCALABLE 1
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT64 1
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

#if defined(__riscv_zfh)
#define HWY_HAVE_FLOAT16 1
#define HWY_CAP_FLOAT16 1
#else
#define HWY_HAVE_FLOAT16 0
#define HWY_CAP_FLOAT16 0
#endif

#define HWY_NAMESPACE N_RVV

@ -305,10 +300,9 @@
#define HWY_MAX_BYTES 8
#define HWY_LANES(T) 1

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 1
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT16 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

@ -350,3 +344,7 @@
#else
#define HWY_ATTR
#endif

// DEPRECATED
#undef HWY_GATHER_LANES
#define HWY_GATHER_LANES(T) HWY_LANES(T)
@ -26,117 +26,65 @@ HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// Highway operations are implemented as overloaded functions selected using an
// internal-only tag type D := Simd<T, N, kPow2>. T is the lane type. kPow2 is a
// shift count applied to scalable vectors. Instead of referring to Simd<>
// directly, users create D via aliases ScalableTag<T[, kPow2]>() (defaults to a
// full vector, or fractions/groups if the argument is negative/positive),
// CappedTag<T, kLimit> or FixedTag<T, kNumLanes>. The actual number of lanes is
// Lanes(D()), a power of two. For scalable vectors, N is either HWY_LANES or a
// cap. For constexpr-size vectors, N is the actual number of lanes. This
// ensures Half<Full512<T>> is the same type as Full256<T>, as required by x86.
template <typename Lane, size_t N, int kPow2>
// SIMD operations are implemented as overloaded functions selected using a tag
// type D := Simd<T, N>. T is the lane type, N an opaque integer for internal
// use only. Users create D via aliases ScalableTag<T>() (a full vector),
// CappedTag<T, kLimit> or FixedTag<T, kNumLanes>. The actual number of lanes
// (always a power of two) is Lanes(D()).
template <typename Lane, size_t N>
struct Simd {
  constexpr Simd() = default;
  using T = Lane;
  static_assert((N & (N - 1)) == 0 && N != 0, "N must be a power of two");

  // Only for use by MaxLanes, required by MSVC. Cannot be enum because GCC
  // warns when using enums and non-enums in the same expression. Cannot be
  // static constexpr function (another MSVC limitation).
  static constexpr size_t kPrivateN = N;
  static constexpr int kPrivatePow2 = kPow2;

  template <typename NewT>
  static constexpr size_t NewN() {
    // Round up to correctly handle scalars with N=1.
    return (N * sizeof(T) + sizeof(NewT) - 1) / sizeof(NewT);
  }

#if HWY_HAVE_SCALABLE
  template <typename NewT>
  static constexpr int Pow2Ratio() {
    return (sizeof(NewT) > sizeof(T))
               ? static_cast<int>(CeilLog2(sizeof(NewT) / sizeof(T)))
               : -static_cast<int>(CeilLog2(sizeof(T) / sizeof(NewT)));
  }
#endif

  // Widening/narrowing ops change the number of lanes and/or their type.
  // To initialize such vectors, we need the corresponding tag types:

  // PromoteTo/DemoteTo() with another lane type, but same number of lanes.
#if HWY_HAVE_SCALABLE
  template <typename NewT>
  using Rebind = Simd<NewT, N, kPow2 + Pow2Ratio<NewT>()>;
#else
  template <typename NewT>
  using Rebind = Simd<NewT, N, kPow2>;
#endif
  // PromoteTo/DemoteTo() with another lane type, but same number of lanes.
  template <typename NewLane>
  using Rebind = Simd<NewLane, N>;

  // Change lane type while keeping the same vector size, e.g. for MulEven.
  template <typename NewT>
  using Repartition = Simd<NewT, NewN<NewT>(), kPow2>;
  // MulEven() with another lane type, but same total size.
  // Round up to correctly handle scalars with N=1.
  template <typename NewLane>
  using Repartition =
      Simd<NewLane, (N * sizeof(Lane) + sizeof(NewLane) - 1) / sizeof(NewLane)>;

  // Half the lanes while keeping the same lane type, e.g. for LowerHalf.
  // Round up to correctly handle scalars with N=1.
#if HWY_HAVE_SCALABLE
  // Reducing the cap (N) is required for SVE - if N is the limiter for f32xN,
  // then we expect Half<Rebind<u16>> to have N/2 lanes (rounded up).
  using Half = Simd<T, (N + 1) / 2, kPow2 - 1>;
#else
  using Half = Simd<T, (N + 1) / 2, kPow2>;
#endif
  // LowerHalf() with the same lane type, but half the lanes.
  // Round up to correctly handle scalars with N=1.
  using Half = Simd<T, (N + 1) / 2>;

  // Twice the lanes while keeping the same lane type, e.g. for Combine.
#if HWY_HAVE_SCALABLE
  using Twice = Simd<T, 2 * N, kPow2 + 1>;
#else
  using Twice = Simd<T, 2 * N, kPow2>;
#endif
  // Combine() with the same lane type, but twice the lanes.
  using Twice = Simd<T, 2 * N>;
};
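Editor's sketch, not part of the diff: the tag aliases built on this struct, in the post-backout two-argument form; in, out and count are assumed to exist.

ScalableTag<float> d;    // full vector for the compiled target
CappedTag<float, 8> d8;  // at most 8 lanes
FixedTag<float, 4> d4;   // exactly 4 lanes (HWY_SCALAR supports only 1)
for (size_t i = 0; i + Lanes(d) <= count; i += Lanes(d)) {
  const auto v = LoadU(d, in + i);
  StoreU(Add(v, v), d, out + i);
}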

namespace detail {

#if HWY_HAVE_SCALABLE

template <typename T, size_t N, int kPow2>
constexpr bool IsFull(Simd<T, N, kPow2> /* d */) {
  return N == HWY_LANES(T) && kPow2 == 0;
}

#endif

// Returns the number of lanes (possibly zero) after applying a shift:
// - 0: no change;
// - [1,3]: a group of 2,4,8 [fractional] vectors;
// - [-3,-1]: a fraction of a vector from 1/8 to 1/2.
// Given N from HWY_LANES(T), returns N for use in Simd<T, N> to describe:
// - a full vector (pow2 = 0);
// - 2,4,8 regs on RVV, otherwise a full vector (pow2 [1,3]);
// - a fraction of a register from 1/8 to 1/2 (pow2 [-3,-1]).
constexpr size_t ScaleByPower(size_t N, int pow2) {
  return pow2 >= 0 ? (N << pow2) : (N >> (-pow2));
#if HWY_TARGET == HWY_RVV
  // For fractions, if N == 1 ensure we still return at least one lane.
  return pow2 >= 0 ? (N << pow2) : HWY_MAX(1, (N >> (-pow2)));
#else
  // If pow2 > 0, replace it with 0 (there is nothing wider than a full vector).
  return HWY_MAX(1, N >> HWY_MAX(-pow2, 0));
#endif
}
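Editor's gloss, not part of the diff: ScaleByPower in numbers, using the post-backout non-RVV branch:

ScaleByPower(16, 0)  == 16   (full vector)
ScaleByPower(16, 2)  == 16   (pow2 > 0 clamps to a full vector)
ScaleByPower(16, -3) == 2    (1/8 of a vector)
ScaleByPower(1, -1)  == 1    (HWY_MAX keeps at least one lane)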

// Struct wrappers enable validation of arguments via static_assert.
template <typename T, int kPow2>
struct ScalableTagChecker {
  static_assert(-3 <= kPow2 && kPow2 <= 3, "Fraction must be 1/8 to 8");
#if HWY_TARGET == HWY_RVV
  // Only RVV supports register groups.
  using type = Simd<T, HWY_LANES(T), kPow2>;
#elif HWY_HAVE_SCALABLE
  // For SVE[2], only allow full or fractions.
  using type = Simd<T, HWY_LANES(T), HWY_MIN(kPow2, 0)>;
#elif HWY_TARGET == HWY_SCALAR
  using type = Simd<T, /*N=*/1, 0>;
#else
  // Only allow full or fractions.
  using type = Simd<T, ScaleByPower(HWY_LANES(T), HWY_MIN(kPow2, 0)), 0>;
#endif
  using type = Simd<T, ScaleByPower(HWY_LANES(T), kPow2)>;
};

template <typename T, size_t kLimit>
struct CappedTagChecker {
  static_assert(kLimit != 0, "Does not make sense to have zero lanes");
  using type = Simd<T, HWY_MIN(kLimit, HWY_MAX_BYTES / sizeof(T)), 0>;
  using type = Simd<T, HWY_MIN(kLimit, HWY_LANES(T))>;
};

template <typename T, size_t kNumLanes>

@ -147,7 +95,7 @@ struct FixedTagChecker {
  // HWY_MAX_BYTES would still allow uint8x8, which is not supported.
  static_assert(kNumLanes == 1, "Scalar only supports one lane");
#endif
  using type = Simd<T, kNumLanes, 0>;
  using type = Simd<T, kNumLanes>;
};

}  // namespace detail
@ -166,14 +114,15 @@ using ScalableTag = typename detail::ScalableTagChecker<T, kPow2>::type;
// typically used for 1D loops with a relatively low application-defined upper
// bound, e.g. for 8x8 DCTs. However, it is better if data structures are
// designed to be vector-length-agnostic (e.g. a hybrid SoA where there are
// chunks of `M >= MaxLanes(d)` DC components followed by M AC1, .., and M AC63;
// chunks of say 256 DC components followed by 256 AC1 and finally 256 AC63;
// this would enable vector-length-agnostic loops using ScalableTag).
template <typename T, size_t kLimit>
using CappedTag = typename detail::CappedTagChecker<T, kLimit>::type;

// Alias for a tag describing a vector with *exactly* kNumLanes active lanes,
// even on targets with scalable vectors. HWY_SCALAR only supports one lane.
// All other targets allow kNumLanes up to HWY_MAX_BYTES / sizeof(T).
// even on targets with scalable vectors. All targets except HWY_SCALAR support
// up to 16 / sizeof(T). Other targets may allow larger kNumLanes, but relying
// on that is non-portable and discouraged.
//
// NOTE: if the application does not need to support HWY_SCALAR (+), use this
// instead of CappedTag to emphasize that there will be exactly kNumLanes lanes.

@ -214,11 +163,11 @@ using RepartitionToNarrow = Repartition<MakeNarrow<TFromD<D>>, D>;
template <class D>
using Half = typename D::Half;

// Tag for the same lane type as D, but twice the lanes.
// Descriptor for the same lane type as D, but twice the lanes.
template <class D>
using Twice = typename D::Twice;

// Same as base.h macros but with a Simd<T, N, kPow2> argument instead of T.
// Same as base.h macros but with a Simd<T, N> argument instead of T.
#define HWY_IF_UNSIGNED_D(D) HWY_IF_UNSIGNED(TFromD<D>)
#define HWY_IF_SIGNED_D(D) HWY_IF_SIGNED(TFromD<D>)
#define HWY_IF_FLOAT_D(D) HWY_IF_FLOAT(TFromD<D>)

@ -226,12 +175,6 @@ using Twice = typename D::Twice;
#define HWY_IF_LANE_SIZE_D(D, bytes) HWY_IF_LANE_SIZE(TFromD<D>, bytes)
#define HWY_IF_NOT_LANE_SIZE_D(D, bytes) HWY_IF_NOT_LANE_SIZE(TFromD<D>, bytes)

// MSVC workaround: use PrivateN directly instead of MaxLanes.
#define HWY_IF_LT128_D(D) \
  hwy::EnableIf<D::kPrivateN * sizeof(TFromD<D>) < 16>* = nullptr
#define HWY_IF_GE128_D(D) \
  hwy::EnableIf<D::kPrivateN * sizeof(TFromD<D>) >= 16>* = nullptr

// Same, but with a vector argument.
#define HWY_IF_UNSIGNED_V(V) HWY_IF_UNSIGNED(TFromV<V>)
#define HWY_IF_SIGNED_V(V) HWY_IF_SIGNED(TFromV<V>)

@ -240,59 +183,42 @@ using Twice = typename D::Twice;

// For implementing functions for a specific type.
// IsSame<...>() in template arguments is broken on MSVC2015.
#define HWY_IF_LANES_ARE(T, V) EnableIf<IsSameT<T, TFromV<V>>::value>* = nullptr
#define HWY_IF_LANES_ARE(T, V) \
  EnableIf<IsSameT<T, TFromD<DFromV<V>>>::value>* = nullptr

template <class D>
HWY_INLINE HWY_MAYBE_UNUSED constexpr int Pow2(D /* d */) {
  return D::kPrivatePow2;
}

// MSVC requires the explicit <D>.
#define HWY_IF_POW2_GE(D, MIN) hwy::EnableIf<Pow2<D>(D()) >= (MIN)>* = nullptr

#if HWY_HAVE_SCALABLE

// Upper bound on the number of lanes. Intended for template arguments and
// reducing code size (e.g. for SSE4, we know at compile-time that vectors will
// not exceed 16 bytes). WARNING: this may be a loose bound, use Lanes() as the
// actual size for allocating storage. WARNING: MSVC might not be able to deduce
// arguments if this is used in EnableIf. See HWY_IF_LT128_D above.
template <class D>
HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D) {
  return detail::ScaleByPower(HWY_MIN(D::kPrivateN, HWY_LANES(TFromD<D>)),
                              D::kPrivatePow2);
}

#else
// Workaround for MSVC 2017: T,N,kPow2 argument deduction fails, so returning N
// is not an option, nor does a member function work.
template <class D>
HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D) {
  return D::kPrivateN;
}

// (Potentially) non-constant actual size of the vector at runtime, subject to
// the limit imposed by the Simd. Useful for advancing loop counters.
// Targets with scalable vectors define this themselves.
template <typename T, size_t N, int kPow2>
HWY_INLINE HWY_MAYBE_UNUSED size_t Lanes(Simd<T, N, kPow2>) {
// Compile-time-constant, (typically but not guaranteed) an upper bound on the
// number of lanes.
// Prefer instead using Lanes() and dynamic allocation, or Rebind, or
// `#if HWY_CAP_GE*`.
template <typename T, size_t N>
HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(Simd<T, N>) {
  return N;
}

#endif  // !HWY_HAVE_SCALABLE
// Targets with non-constexpr Lanes define this themselves.
#if HWY_TARGET != HWY_RVV && HWY_TARGET != HWY_SVE2 && HWY_TARGET != HWY_SVE

// (Potentially) non-constant actual size of the vector at runtime, subject to
// the limit imposed by the Simd. Useful for advancing loop counters.
template <typename T, size_t N>
HWY_INLINE HWY_MAYBE_UNUSED size_t Lanes(Simd<T, N>) {
  return N;
}

#endif
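Editor's sketch, not part of the diff: the intended division of labor is that MaxLanes bounds compile-time sizes while Lanes advances runtime loops.

constexpr CappedTag<uint8_t, 16> d;
static_assert(MaxLanes(d) <= 16, "compile-time bound");
HWY_ALIGN uint8_t lanes[MaxLanes(d)];  // array extent from the constexpr bound
Store(Set(d, 1), d, lanes);            // writes Lanes(d) bytes at runtime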

// NOTE: GCC generates incorrect code for vector arguments to non-inlined
// functions in two situations:
// - on Windows and GCC 10.3, passing by value crashes due to unaligned loads:
//   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412.
// - on ARM64 and GCC 9.3.0 or 11.2.1, passing by value causes many (but not
// - on ARM64 and GCC 9.3.0 or 11.2.1, passing by const& causes many (but not
//   all) tests to fail.
//
// We therefore pass by const& only on GCC and (Windows or ARM64). This alias
// must be used for all vector/mask parameters of functions marked HWY_NOINLINE,
// and possibly also other functions that are not inlined.
#if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG && \
    ((defined(_WIN32) || defined(_WIN64)) || HWY_ARCH_ARM_A64)
    ((defined(_WIN32) || defined(_WIN64)) || HWY_ARCH_ARM64)
template <class V>
using VecArg = const V&;
#else
Diff not shown because of its size.
Diff not shown because of its size.
Diff not shown because of its size.
@ -44,6 +44,10 @@
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

template <typename T>
using Full256 = Simd<T, 32 / sizeof(T)>;

namespace detail {

template <typename T>

@ -322,38 +326,6 @@ HWY_API Vec256<T> Not(const Vec256<T> v) {
#endif
}

// ------------------------------ OrAnd

template <typename T>
HWY_API Vec256<T> OrAnd(Vec256<T> o, Vec256<T> a1, Vec256<T> a2) {
#if HWY_TARGET <= HWY_AVX3
  const Full256<T> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  const __m256i ret = _mm256_ternarylogic_epi64(
      BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8);
  return BitCast(d, VU{ret});
#else
  return Or(o, And(a1, a2));
#endif
}
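Editor's gloss, not part of the diff: the ternlog immediate is the truth table of the desired function, indexed by (o << 2) | (a1 << 1) | a2. For o | (a1 & a2), the output is 1 at indices 4 through 7 (o set) plus index 3 (a1 and a2 both set), i.e. bits 11111000 = 0xF8; the 0xCA used below likewise encodes mask ? yes : no.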

// ------------------------------ IfVecThenElse

template <typename T>
HWY_API Vec256<T> IfVecThenElse(Vec256<T> mask, Vec256<T> yes, Vec256<T> no) {
#if HWY_TARGET <= HWY_AVX3
  const Full256<T> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  return BitCast(d, VU{_mm256_ternarylogic_epi64(BitCast(du, mask).raw,
                                                 BitCast(du, yes).raw,
                                                 BitCast(du, no).raw, 0xCA)});
#else
  return IfThenElse(MaskFromVec(mask), yes, no);
#endif
}

// ------------------------------ Operator overloads (internal-only if float)

template <typename T>

@ -813,7 +785,6 @@ HWY_API Vec256<T> IfThenZeroElse(Mask256<T> mask, Vec256<T> no) {
template <typename T, HWY_IF_FLOAT(T)>
HWY_API Vec256<T> ZeroIfNegative(Vec256<T> v) {
  const auto zero = Zero(Full256<T>());
  // AVX2 IfThenElse only looks at the MSB for 32/64-bit lanes
  return IfThenElse(MaskFromVec(v), zero, v);
}

@ -1424,12 +1395,7 @@ HWY_API Vec256<double> operator-(const Vec256<double> a,
  return Vec256<double>{_mm256_sub_pd(a.raw, b.raw)};
}

// ------------------------------ SumsOf8
HWY_API Vec256<uint64_t> SumsOf8(const Vec256<uint8_t> v) {
  return Vec256<uint64_t>{_mm256_sad_epu8(v.raw, _mm256_setzero_si256())};
}
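Editor's gloss, not part of the diff: against a zero vector, _mm256_sad_epu8 computes the sum of absolute differences |x[i] - 0| over each group of eight bytes, leaving one u64 total per group; that is exactly the SumsOf8 contract.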

// ------------------------------ SaturatedAdd
// ------------------------------ Saturating addition

// Returns a + b clamped to the destination range.

@ -1453,7 +1419,7 @@ HWY_API Vec256<int16_t> SaturatedAdd(const Vec256<int16_t> a,
  return Vec256<int16_t>{_mm256_adds_epi16(a.raw, b.raw)};
}

// ------------------------------ SaturatedSub
// ------------------------------ Saturating subtraction

// Returns a - b clamped to the destination range.

@ -1719,35 +1685,6 @@ HWY_API Vec256<int64_t> Abs(const Vec256<int64_t> v) {
#endif
}

// ------------------------------ IfNegativeThenElse (BroadcastSignBit)
HWY_API Vec256<int8_t> IfNegativeThenElse(Vec256<int8_t> v, Vec256<int8_t> yes,
                                          Vec256<int8_t> no) {
  // int8: AVX2 IfThenElse only looks at the MSB.
  return IfThenElse(MaskFromVec(v), yes, no);
}

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec256<T> IfNegativeThenElse(Vec256<T> v, Vec256<T> yes, Vec256<T> no) {
  static_assert(IsSigned<T>(), "Only works for signed/float");
  const Full256<T> d;
  const RebindToSigned<decltype(d)> di;

  // 16-bit: no native blendv, so copy sign to lower byte's MSB.
  v = BitCast(d, BroadcastSignBit(BitCast(di, v)));
  return IfThenElse(MaskFromVec(v), yes, no);
}

template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
HWY_API Vec256<T> IfNegativeThenElse(Vec256<T> v, Vec256<T> yes, Vec256<T> no) {
  static_assert(IsSigned<T>(), "Only works for signed/float");
  const Full256<T> d;
  const RebindToFloat<decltype(d)> df;

  // 32/64-bit: use float IfThenElse, which only looks at the MSB.
  const MFromD<decltype(df)> msb = MaskFromVec(BitCast(df, v));
  return BitCast(d, IfThenElse(msb, BitCast(df, yes), BitCast(df, no)));
}

// ------------------------------ ShiftLeftSame

HWY_API Vec256<uint16_t> ShiftLeftSame(const Vec256<uint16_t> v,
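Editor's gloss, not part of the diff: vpblendvb selects per byte using each byte's MSB. For 16-bit lanes the sign lives only in the upper byte, so BroadcastSignBit (an arithmetic shift right by 15) first smears it across the whole lane, making both bytes of a lane agree before the blend.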
@ -2297,7 +2234,7 @@ HWY_API void ScatterOffset(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
  Store(v, d, lanes);

  alignas(32) Offset offset_lanes[N];
  Store(offset, Full256<Offset>(), offset_lanes);
  Store(offset, Simd<Offset, N>(), offset_lanes);

  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
  for (size_t i = 0; i < N; ++i) {

@ -2315,7 +2252,7 @@ HWY_API void ScatterIndex(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
  Store(v, d, lanes);

  alignas(32) Index index_lanes[N];
  Store(index, Full256<Index>(), index_lanes);
  Store(index, Simd<Index, N>(), index_lanes);

  for (size_t i = 0; i < N; ++i) {
    base[index_lanes[i]] = lanes[i];

@ -2536,7 +2473,7 @@ HWY_API Vec256<T> ShiftRightBytes(Full256<T> /* tag */, const Vec256<T> v) {
template <int kLanes, typename T>
HWY_API Vec256<T> ShiftRightLanes(Full256<T> d, const Vec256<T> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
  return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
}

// ------------------------------ CombineShiftRightBytes

@ -2796,81 +2733,6 @@ HWY_API Vec256<T> Reverse(Full256<T> d, const Vec256<T> v) {
#endif
}

// ------------------------------ Reverse2

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec256<T> Reverse2(Full256<T> d, const Vec256<T> v) {
  const Full256<uint32_t> du32;
  return BitCast(d, RotateRight<16>(BitCast(du32, v)));
}

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec256<T> Reverse2(Full256<T> /* tag */, const Vec256<T> v) {
  return Shuffle2301(v);
}

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec256<T> Reverse2(Full256<T> /* tag */, const Vec256<T> v) {
  return Shuffle01(v);
}

// ------------------------------ Reverse4

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec256<T> Reverse4(Full256<T> d, const Vec256<T> v) {
#if HWY_TARGET <= HWY_AVX3
  const RebindToSigned<decltype(d)> di;
  alignas(32) constexpr int16_t kReverse4[16] = {3,  2,  1, 0, 7,  6,  5,  4,
                                                 11, 10, 9, 8, 15, 14, 13, 12};
  const Vec256<int16_t> idx = Load(di, kReverse4);
  return BitCast(d, Vec256<int16_t>{
                        _mm256_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
#else
  const RepartitionToWide<decltype(d)> dw;
  return Reverse2(d, BitCast(d, Shuffle2301(BitCast(dw, v))));
#endif
}

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec256<T> Reverse4(Full256<T> /* tag */, const Vec256<T> v) {
  return Shuffle0123(v);
}

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec256<T> Reverse4(Full256<T> /* tag */, const Vec256<T> v) {
  return Vec256<T>{_mm256_permute4x64_epi64(v.raw, _MM_SHUFFLE(0, 1, 2, 3))};
}
HWY_API Vec256<double> Reverse4(Full256<double> /* tag */, Vec256<double> v) {
  return Vec256<double>{_mm256_permute4x64_pd(v.raw, _MM_SHUFFLE(0, 1, 2, 3))};
}

// ------------------------------ Reverse8

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec256<T> Reverse8(Full256<T> d, const Vec256<T> v) {
#if HWY_TARGET <= HWY_AVX3
  const RebindToSigned<decltype(d)> di;
  alignas(32) constexpr int16_t kReverse8[16] = {7,  6,  5,  4,  3,  2,  1, 0,
                                                 15, 14, 13, 12, 11, 10, 9, 8};
  const Vec256<int16_t> idx = Load(di, kReverse8);
  return BitCast(d, Vec256<int16_t>{
                        _mm256_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
#else
  const RepartitionToWide<decltype(d)> dw;
  return Reverse2(d, BitCast(d, Shuffle0123(BitCast(dw, v))));
#endif
}

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec256<T> Reverse8(Full256<T> d, const Vec256<T> v) {
  return Reverse(d, v);
}

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec256<T> Reverse8(Full256<T> /* tag */, const Vec256<T> /* v */) {
  HWY_ASSERT(0);  // AVX2 does not have 8 64-bit lanes
}

// ------------------------------ InterleaveLower

// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
@ -2920,6 +2782,12 @@ HWY_API Vec256<double> InterleaveLower(const Vec256<double> a,
  return Vec256<double>{_mm256_unpacklo_pd(a.raw, b.raw)};
}

// Additional overload for the optional Simd<> tag.
template <typename T, class V = Vec256<T>>
HWY_API V InterleaveLower(Full256<T> /* tag */, V a, V b) {
  return InterleaveLower(a, b);
}

// ------------------------------ InterleaveUpper

// All functions inside detail lack the required D parameter.

@ -2981,11 +2849,11 @@ HWY_API V InterleaveUpper(Full256<T> /* tag */, V a, V b) {
// this is necessary because the single-lane scalar cannot return two values.
template <typename T, typename TW = MakeWide<T>>
HWY_API Vec256<TW> ZipLower(Vec256<T> a, Vec256<T> b) {
  return BitCast(Full256<TW>(), InterleaveLower(a, b));
  return BitCast(Full256<TW>(), InterleaveLower(Full256<T>(), a, b));
}
template <typename T, typename TW = MakeWide<T>>
HWY_API Vec256<TW> ZipLower(Full256<TW> dw, Vec256<T> a, Vec256<T> b) {
  return BitCast(dw, InterleaveLower(a, b));
  return BitCast(dw, InterleaveLower(Full256<T>(), a, b));
}

template <typename T, typename TW = MakeWide<T>>

@ -3195,38 +3063,6 @@ HWY_API Vec256<double> ConcatEven(Full256<double> d, Vec256<double> hi,
#endif
}

// ------------------------------ DupEven (InterleaveLower)

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec256<T> DupEven(Vec256<T> v) {
  return Vec256<T>{_mm256_shuffle_epi32(v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
}
HWY_API Vec256<float> DupEven(Vec256<float> v) {
  return Vec256<float>{
      _mm256_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
}

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec256<T> DupEven(const Vec256<T> v) {
  return InterleaveLower(Full256<T>(), v, v);
}

// ------------------------------ DupOdd (InterleaveUpper)

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec256<T> DupOdd(Vec256<T> v) {
  return Vec256<T>{_mm256_shuffle_epi32(v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
}
HWY_API Vec256<float> DupOdd(Vec256<float> v) {
  return Vec256<float>{
      _mm256_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
}

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec256<T> DupOdd(const Vec256<T> v) {
  return InterleaveUpper(Full256<T>(), v, v);
}

// ------------------------------ OddEven

namespace detail {
@ -3304,13 +3140,6 @@ HWY_API Vec256<double> SwapAdjacentBlocks(Vec256<double> v) {
  return Vec256<double>{_mm256_permute4x64_pd(v.raw, _MM_SHUFFLE(1, 0, 3, 2))};
}

// ------------------------------ ReverseBlocks (ConcatLowerUpper)

template <typename T>
HWY_API Vec256<T> ReverseBlocks(Full256<T> d, Vec256<T> v) {
  return ConcatLowerUpper(d, v, v);
}

// ------------------------------ TableLookupBytes (ZeroExtendVector)

// Both full

@ -3607,7 +3436,7 @@ HWY_API Vec128<int16_t> DemoteTo(Full128<int16_t> /* tag */,
      _mm256_castsi256_si128(_mm256_permute4x64_epi64(i16, 0x88))};
}

HWY_API Vec128<uint8_t, 8> DemoteTo(Full64<uint8_t> /* tag */,
HWY_API Vec128<uint8_t, 8> DemoteTo(Simd<uint8_t, 8> /* tag */,
                                    const Vec256<int32_t> v) {
  const __m256i u16_blocks = _mm256_packus_epi32(v.raw, v.raw);
  // Concatenate lower 64 bits of each 128-bit block

@ -3626,7 +3455,7 @@ HWY_API Vec128<uint8_t> DemoteTo(Full128<uint8_t> /* tag */,
      _mm256_castsi256_si128(_mm256_permute4x64_epi64(u8, 0x88))};
}

HWY_API Vec128<int8_t, 8> DemoteTo(Full64<int8_t> /* tag */,
HWY_API Vec128<int8_t, 8> DemoteTo(Simd<int8_t, 8> /* tag */,
                                   const Vec256<int32_t> v) {
  const __m256i i16_blocks = _mm256_packs_epi32(v.raw, v.raw);
  // Concatenate lower 64 bits of each 128-bit block

@ -3724,7 +3553,7 @@ HWY_API Vec128<uint8_t, 8> U8FromU32(const Vec256<uint32_t> v) {
  const auto lo = LowerHalf(quad);
  const auto hi = UpperHalf(Full128<uint32_t>(), quad);
  const auto pair = LowerHalf(lo | hi);
  return BitCast(Full64<uint8_t>(), pair);
  return BitCast(Simd<uint8_t, 8>(), pair);
}

// ------------------------------ Integer <=> fp (ShiftRight, OddEven)

@ -3862,19 +3691,6 @@ HWY_API Vec256<uint8_t> AESRound(Vec256<uint8_t> state,
#endif
}

HWY_API Vec256<uint8_t> AESLastRound(Vec256<uint8_t> state,
                                     Vec256<uint8_t> round_key) {
#if HWY_TARGET == HWY_AVX3_DL
  return Vec256<uint8_t>{_mm256_aesenclast_epi128(state.raw, round_key.raw)};
#else
  const Full256<uint8_t> d;
  const Half<decltype(d)> d2;
  return Combine(d,
                 AESLastRound(UpperHalf(d2, state), UpperHalf(d2, round_key)),
                 AESLastRound(LowerHalf(state), LowerHalf(round_key)));
#endif
}

HWY_API Vec256<uint64_t> CLMulLower(Vec256<uint64_t> a, Vec256<uint64_t> b) {
#if HWY_TARGET == HWY_AVX3_DL
  return Vec256<uint64_t>{_mm256_clmulepi64_epi128(a.raw, b.raw, 0x00)};

@ -4203,7 +4019,7 @@ HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
#if HWY_TARGET <= HWY_AVX3_DL
  return CompressStore(v, m, d, unaligned);  // also native
#else
  const size_t count = CountTrue(d, m);
  const size_t count = CountTrue(m);
  const Vec256<T> compressed = Compress(v, m);
  // There is no 16-bit MaskedStore, so blend.
  const Vec256<T> prev = LoadU(d, unaligned);

@ -4428,7 +4244,7 @@ HWY_API intptr_t FindFirstTrue(const Full256<T> /* tag */,
namespace detail {

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
HWY_INLINE Indices256<uint32_t> IndicesFromBits(Simd<T, 8> d,
                                                uint64_t mask_bits) {
  const RebindToUnsigned<decltype(d)> d32;
  // We need a masked Iota(). With 8 lanes, there are 256 combinations and a LUT

@ -4491,7 +4307,7 @@ HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
}

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
HWY_INLINE Indices256<uint32_t> IndicesFromBits(Simd<T, 4> d,
                                                uint64_t mask_bits) {
  const Repartition<uint32_t, decltype(d)> d32;

@ -4537,8 +4353,8 @@ HWY_INLINE Vec256<T> Compress(Vec256<T> v, const uint64_t mask_bits) {
  const auto compressed1 = Compress(promoted1, mask_bits1);

  const Half<decltype(du)> dh;
  const auto demoted0 = ZeroExtendVector(du, DemoteTo(dh, compressed0));
  const auto demoted1 = ZeroExtendVector(du, DemoteTo(dh, compressed1));
  const auto demoted0 = ZeroExtendVector(DemoteTo(dh, compressed0));
  const auto demoted1 = ZeroExtendVector(DemoteTo(dh, compressed1));

  const size_t count0 = PopCount(mask_bits0);
  // Now combine by shifting demoted1 up. AVX2 lacks VPERMW, so start with

@ -4809,6 +4625,101 @@ HWY_API Vec256<T> MaxOfLanes(Full256<T> d, const Vec256<T> vHL) {
  return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), Max(vLH, vHL));
}

// ================================================== DEPRECATED

template <typename T>
HWY_API size_t StoreMaskBits(const Mask256<T> mask, uint8_t* bits) {
  return StoreMaskBits(Full256<T>(), mask, bits);
}

template <typename T>
HWY_API bool AllTrue(const Mask256<T> mask) {
  return AllTrue(Full256<T>(), mask);
}

template <typename T>
HWY_API bool AllFalse(const Mask256<T> mask) {
  return AllFalse(Full256<T>(), mask);
}

template <typename T>
HWY_API size_t CountTrue(const Mask256<T> mask) {
  return CountTrue(Full256<T>(), mask);
}

template <typename T>
HWY_API Vec256<T> SumOfLanes(const Vec256<T> vHL) {
  return SumOfLanes(Full256<T>(), vHL);
}
template <typename T>
HWY_API Vec256<T> MinOfLanes(const Vec256<T> vHL) {
  return MinOfLanes(Full256<T>(), vHL);
}
template <typename T>
HWY_API Vec256<T> MaxOfLanes(const Vec256<T> vHL) {
  return MaxOfLanes(Full256<T>(), vHL);
}

template <typename T>
HWY_API Vec128<T> UpperHalf(Vec256<T> v) {
  return UpperHalf(Full128<T>(), v);
}

template <int kBytes, typename T>
HWY_API Vec256<T> ShiftRightBytes(const Vec256<T> v) {
  return ShiftRightBytes<kBytes>(Full256<T>(), v);
}

template <int kLanes, typename T>
HWY_API Vec256<T> ShiftRightLanes(const Vec256<T> v) {
  return ShiftRightLanes<kLanes>(Full256<T>(), v);
}

template <size_t kBytes, typename T>
HWY_API Vec256<T> CombineShiftRightBytes(Vec256<T> hi, Vec256<T> lo) {
  return CombineShiftRightBytes<kBytes>(Full256<T>(), hi, lo);
}

template <typename T>
HWY_API Vec256<T> InterleaveUpper(Vec256<T> a, Vec256<T> b) {
  return InterleaveUpper(Full256<T>(), a, b);
}

template <typename T>
HWY_API Vec256<MakeWide<T>> ZipUpper(Vec256<T> a, Vec256<T> b) {
  return InterleaveUpper(Full256<MakeWide<T>>(), a, b);
}

template <typename T>
HWY_API Vec256<T> Combine(Vec128<T> hi, Vec128<T> lo) {
  return Combine(Full256<T>(), hi, lo);
}

template <typename T>
HWY_API Vec256<T> ZeroExtendVector(Vec128<T> lo) {
  return ZeroExtendVector(Full256<T>(), lo);
}

template <typename T>
HWY_API Vec256<T> ConcatLowerLower(Vec256<T> hi, Vec256<T> lo) {
  return ConcatLowerLower(Full256<T>(), hi, lo);
}

template <typename T>
HWY_API Vec256<T> ConcatLowerUpper(Vec256<T> hi, Vec256<T> lo) {
  return ConcatLowerUpper(Full256<T>(), hi, lo);
}

template <typename T>
HWY_API Vec256<T> ConcatUpperLower(Vec256<T> hi, Vec256<T> lo) {
  return ConcatUpperLower(Full256<T>(), hi, lo);
}

template <typename T>
HWY_API Vec256<T> ConcatUpperUpper(Vec256<T> hi, Vec256<T> lo) {
  return ConcatUpperUpper(Full256<T>(), hi, lo);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
@ -57,6 +57,9 @@ HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

template <typename T>
using Full512 = Simd<T, 64 / sizeof(T)>;

namespace detail {

template <typename T>

@ -310,30 +313,6 @@ HWY_API Vec512<double> Xor(const Vec512<double> a, const Vec512<double> b) {
  return Vec512<double>{_mm512_xor_pd(a.raw, b.raw)};
}

// ------------------------------ OrAnd

template <typename T>
HWY_API Vec512<T> OrAnd(Vec512<T> o, Vec512<T> a1, Vec512<T> a2) {
  const Full512<T> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  const __m512i ret = _mm512_ternarylogic_epi64(
      BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8);
  return BitCast(d, VU{ret});
}

// ------------------------------ IfVecThenElse

template <typename T>
HWY_API Vec512<T> IfVecThenElse(Vec512<T> mask, Vec512<T> yes, Vec512<T> no) {
  const Full512<T> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  return BitCast(d, VU{_mm512_ternarylogic_epi64(BitCast(du, mask).raw,
                                                 BitCast(du, yes).raw,
                                                 BitCast(du, no).raw, 0xCA)});
}

// ------------------------------ Operator overloads (internal-only if float)

template <typename T>

@ -600,13 +579,6 @@ HWY_API Vec512<double> IfThenZeroElse(const Mask512<double> mask,
  return Vec512<double>{_mm512_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)};
}

template <typename T>
HWY_API Vec512<T> IfNegativeThenElse(Vec512<T> v, Vec512<T> yes, Vec512<T> no) {
  static_assert(IsSigned<T>(), "Only works for signed/float");
  // AVX3 MaskFromVec only looks at the MSB
  return IfThenElse(MaskFromVec(v), yes, no);
}

template <typename T, HWY_IF_FLOAT(T)>
HWY_API Vec512<T> ZeroIfNegative(const Vec512<T> v) {
  // AVX3 MaskFromVec only looks at the MSB

@ -709,12 +681,7 @@ HWY_API Vec512<double> operator-(const Vec512<double> a,
  return Vec512<double>{_mm512_sub_pd(a.raw, b.raw)};
}

// ------------------------------ SumsOf8
HWY_API Vec512<uint64_t> SumsOf8(const Vec512<uint8_t> v) {
  return Vec512<uint64_t>{_mm512_sad_epu8(v.raw, _mm512_setzero_si512())};
}

// ------------------------------ SaturatedAdd
// ------------------------------ Saturating addition

// Returns a + b clamped to the destination range.

@ -738,7 +705,7 @@ HWY_API Vec512<int16_t> SaturatedAdd(const Vec512<int16_t> a,
  return Vec512<int16_t>{_mm512_adds_epi16(a.raw, b.raw)};
}

// ------------------------------ SaturatedSub
// ------------------------------ Saturating subtraction

// Returns a - b clamped to the destination range.

@ -1853,7 +1820,7 @@ HWY_API Vec512<T> LoadDup128(Full512<T> /* tag */,
// https://gcc.godbolt.org/z/-Jt_-F
#if HWY_LOADDUP_ASM
  __m512i out;
  asm("vbroadcasti128 %1, %[reg]" : [reg] "=x"(out) : "m"(p[0]));
  asm("vbroadcasti128 %1, %[reg]" : [ reg ] "=x"(out) : "m"(p[0]));
  return Vec512<T>{out};
#else
  const auto x4 = LoadU(Full128<T>(), p);

@ -1864,7 +1831,7 @@ HWY_API Vec512<float> LoadDup128(Full512<float> /* tag */,
                                 const float* const HWY_RESTRICT p) {
#if HWY_LOADDUP_ASM
  __m512 out;
  asm("vbroadcastf128 %1, %[reg]" : [reg] "=x"(out) : "m"(p[0]));
  asm("vbroadcastf128 %1, %[reg]" : [ reg ] "=x"(out) : "m"(p[0]));
  return Vec512<float>{out};
#else
  const __m128 x4 = _mm_loadu_ps(p);

@ -1876,7 +1843,7 @@ HWY_API Vec512<double> LoadDup128(Full512<double> /* tag */,
                                  const double* const HWY_RESTRICT p) {
#if HWY_LOADDUP_ASM
  __m512d out;
  asm("vbroadcastf128 %1, %[reg]" : [reg] "=x"(out) : "m"(p[0]));
  asm("vbroadcastf128 %1, %[reg]" : [ reg ] "=x"(out) : "m"(p[0]));
  return Vec512<double>{out};
#else
  const __m128d x2 = _mm_loadu_pd(p);
@ -2040,7 +2007,7 @@ HWY_INLINE Vec512<T> GatherIndex(hwy::SizeTag<8> /* tag */,
template <typename T, typename Offset>
HWY_API Vec512<T> GatherOffset(Full512<T> d, const T* HWY_RESTRICT base,
                               const Vec512<Offset> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
  return detail::GatherOffset(hwy::SizeTag<sizeof(T)>(), d, base, offset);
}
template <typename T, typename Index>

@ -2206,7 +2173,7 @@ HWY_API Vec512<T> ShiftRightBytes(Full512<T> /* tag */, const Vec512<T> v) {
template <int kLanes, typename T>
HWY_API Vec512<T> ShiftRightLanes(Full512<T> d, const Vec512<T> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
  return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
}

// ------------------------------ CombineShiftRightBytes

@ -2429,78 +2396,6 @@ HWY_API Vec512<T> Reverse(Full512<T> d, const Vec512<T> v) {
  return TableLookupLanes(v, SetTableIndices(d, kReverse));
}

// ------------------------------ Reverse2

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec512<T> Reverse2(Full512<T> d, const Vec512<T> v) {
  const Full512<uint32_t> du32;
  return BitCast(d, RotateRight<16>(BitCast(du32, v)));
}

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec512<T> Reverse2(Full512<T> /* tag */, const Vec512<T> v) {
  return Shuffle2301(v);
}

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec512<T> Reverse2(Full512<T> /* tag */, const Vec512<T> v) {
  return Shuffle01(v);
}

// ------------------------------ Reverse4

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec512<T> Reverse4(Full512<T> d, const Vec512<T> v) {
  const RebindToSigned<decltype(d)> di;
  alignas(64) constexpr int16_t kReverse4[32] = {
      3,  2,  1,  0,  7,  6,  5,  4,  11, 10, 9,  8,  15, 14, 13, 12,
      19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28};
  const Vec512<int16_t> idx = Load(di, kReverse4);
  return BitCast(d, Vec512<int16_t>{
                        _mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
}

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec512<T> Reverse4(Full512<T> /* tag */, const Vec512<T> v) {
  return Shuffle0123(v);
}

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec512<T> Reverse4(Full512<T> /* tag */, const Vec512<T> v) {
  return Vec512<T>{_mm512_permutex_epi64(v.raw, _MM_SHUFFLE(0, 1, 2, 3))};
}
HWY_API Vec512<double> Reverse4(Full512<double> /* tag */, Vec512<double> v) {
  return Vec512<double>{_mm512_permutex_pd(v.raw, _MM_SHUFFLE(0, 1, 2, 3))};
}

// ------------------------------ Reverse8

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec512<T> Reverse8(Full512<T> d, const Vec512<T> v) {
  const RebindToSigned<decltype(d)> di;
  alignas(64) constexpr int16_t kReverse8[32] = {
      7,  6,  5,  4,  3,  2,  1,  0,  15, 14, 13, 12, 11, 10, 9,  8,
      23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24};
  const Vec512<int16_t> idx = Load(di, kReverse8);
  return BitCast(d, Vec512<int16_t>{
                        _mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
}

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec512<T> Reverse8(Full512<T> d, const Vec512<T> v) {
  const RebindToSigned<decltype(d)> di;
  alignas(64) constexpr int32_t kReverse8[16] = {7,  6,  5,  4,  3,  2,  1, 0,
                                                 15, 14, 13, 12, 11, 10, 9, 8};
  const Vec512<int32_t> idx = Load(di, kReverse8);
  return BitCast(d, Vec512<int32_t>{
                        _mm512_permutexvar_epi32(idx.raw, BitCast(di, v).raw)});
}

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec512<T> Reverse8(Full512<T> d, const Vec512<T> v) {
  return Reverse(d, v);
}

// ------------------------------ InterleaveLower

// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides

@ -2550,6 +2445,12 @@ HWY_API Vec512<double> InterleaveLower(const Vec512<double> a,
  return Vec512<double>{_mm512_unpacklo_pd(a.raw, b.raw)};
}

// Additional overload for the optional Simd<> tag.
template <typename T, class V = Vec512<T>>
HWY_API V InterleaveLower(Full512<T> /* tag */, V a, V b) {
  return InterleaveLower(a, b);
}

// ------------------------------ InterleaveUpper

// All functions inside detail lack the required D parameter.

@ -2614,8 +2515,8 @@ HWY_API Vec512<TW> ZipLower(Vec512<T> a, Vec512<T> b) {
  return BitCast(Full512<TW>(), InterleaveLower(a, b));
}
template <typename T, typename TW = MakeWide<T>>
HWY_API Vec512<TW> ZipLower(Full512<TW> /* d */, Vec512<T> a, Vec512<T> b) {
  return BitCast(Full512<TW>(), InterleaveLower(a, b));
HWY_API Vec512<TW> ZipLower(Full512<TW> d, Vec512<T> a, Vec512<T> b) {
  return BitCast(Full512<TW>(), InterleaveLower(d, a, b));
|
||||
}
|
||||
|
||||
template <typename T, typename TW = MakeWide<T>>
|
||||
|
@ -2663,17 +2564,17 @@ HWY_API Vec512<double> ConcatUpperUpper(Full512<double> /* tag */,
|
|||
template <typename T>
|
||||
HWY_API Vec512<T> ConcatLowerUpper(Full512<T> /* tag */, const Vec512<T> hi,
|
||||
const Vec512<T> lo) {
|
||||
return Vec512<T>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_BADC)};
|
||||
return Vec512<T>{_mm512_shuffle_i32x4(lo.raw, hi.raw, 0x4E)};
|
||||
}
|
||||
HWY_API Vec512<float> ConcatLowerUpper(Full512<float> /* tag */,
|
||||
const Vec512<float> hi,
|
||||
const Vec512<float> lo) {
|
||||
return Vec512<float>{_mm512_shuffle_f32x4(lo.raw, hi.raw, _MM_PERM_BADC)};
|
||||
return Vec512<float>{_mm512_shuffle_f32x4(lo.raw, hi.raw, 0x4E)};
|
||||
}
|
||||
HWY_API Vec512<double> ConcatLowerUpper(Full512<double> /* tag */,
|
||||
const Vec512<double> hi,
|
||||
const Vec512<double> lo) {
|
||||
return Vec512<double>{_mm512_shuffle_f64x2(lo.raw, hi.raw, _MM_PERM_BADC)};
|
||||
return Vec512<double>{_mm512_shuffle_f64x2(lo.raw, hi.raw, 0x4E)};
|
||||
}
|
||||
|
||||
// hiH,hiL loH,loL |-> hiH,loL (= outer halves)
|
||||
|
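
Note that the immediates swapped in this hunk are numerically identical: _MM_PERM_BADC, _MM_SHUFFLE(1, 0, 3, 2) and 0x4E all encode the lane order B,A,D,C (indices 1,0,3,2), so the change is purely notational. A minimal standalone sketch of the encoding, assuming only <immintrin.h> (an illustration, not part of the patch):

#include <immintrin.h>

// _MM_SHUFFLE(z, y, x, w) packs four 2-bit selectors:
// (z << 6) | (y << 4) | (x << 2) | w.
// B,A,D,C == indices 1,0,3,2 == (1 << 6) | (0 << 4) | (3 << 2) | 2 == 0x4E.
static_assert(_MM_SHUFFLE(1, 0, 3, 2) == 0x4E, "BADC == 0x4E");
// Likewise C,D,A,B (used by SwapAdjacentBlocks below) is 0xB1.
static_assert(_MM_SHUFFLE(2, 3, 0, 1) == 0xB1, "CDAB == 0xB1");
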
@@ -2774,36 +2675,6 @@ HWY_API Vec512<double> ConcatEven(Full512<double> d, Vec512<double> hi,
                                                    __mmask8{0xFF}, hi.raw)};
}

// ------------------------------ DupEven (InterleaveLower)

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec512<T> DupEven(Vec512<T> v) {
  return Vec512<T>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CCAA)};
}
HWY_API Vec512<float> DupEven(Vec512<float> v) {
  return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_CCAA)};
}

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec512<T> DupEven(const Vec512<T> v) {
  return InterleaveLower(Full512<T>(), v, v);
}

// ------------------------------ DupOdd (InterleaveUpper)

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec512<T> DupOdd(Vec512<T> v) {
  return Vec512<T>{_mm512_shuffle_epi32(v.raw, _MM_PERM_DDBB)};
}
HWY_API Vec512<float> DupOdd(Vec512<float> v) {
  return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_DDBB)};
}

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec512<T> DupOdd(const Vec512<T> v) {
  return InterleaveUpper(Full512<T>(), v, v);
}

// ------------------------------ OddEven

template <typename T>

@@ -2834,29 +2705,17 @@ HWY_API Vec512<double> OddEvenBlocks(Vec512<double> odd, Vec512<double> even) {

template <typename T>
HWY_API Vec512<T> SwapAdjacentBlocks(Vec512<T> v) {
  return Vec512<T>{_mm512_shuffle_i32x4(v.raw, v.raw, _MM_PERM_CDAB)};
  return Vec512<T>{_mm512_shuffle_i32x4(v.raw, v.raw, _MM_SHUFFLE(2, 3, 0, 1))};
}

HWY_API Vec512<float> SwapAdjacentBlocks(Vec512<float> v) {
  return Vec512<float>{_mm512_shuffle_f32x4(v.raw, v.raw, _MM_PERM_CDAB)};
  return Vec512<float>{
      _mm512_shuffle_f32x4(v.raw, v.raw, _MM_SHUFFLE(2, 3, 0, 1))};
}

HWY_API Vec512<double> SwapAdjacentBlocks(Vec512<double> v) {
  return Vec512<double>{_mm512_shuffle_f64x2(v.raw, v.raw, _MM_PERM_CDAB)};
}

// ------------------------------ ReverseBlocks

template <typename T>
HWY_API Vec512<T> ReverseBlocks(Full512<T> /* tag */, Vec512<T> v) {
  return Vec512<T>{_mm512_shuffle_i32x4(v.raw, v.raw, _MM_PERM_ABCD)};
}
HWY_API Vec512<float> ReverseBlocks(Full512<float> /* tag */, Vec512<float> v) {
  return Vec512<float>{_mm512_shuffle_f32x4(v.raw, v.raw, _MM_PERM_ABCD)};
}
HWY_API Vec512<double> ReverseBlocks(Full512<double> /* tag */,
                                     Vec512<double> v) {
  return Vec512<double>{_mm512_shuffle_f64x2(v.raw, v.raw, _MM_PERM_ABCD)};
  return Vec512<double>{
      _mm512_shuffle_f64x2(v.raw, v.raw, _MM_SHUFFLE(2, 3, 0, 1))};
}

// ------------------------------ TableLookupBytes (ZeroExtendVector)

@@ -3153,23 +3012,17 @@ HWY_API Vec512<uint8_t> AESRound(Vec512<uint8_t> state,
#if HWY_TARGET == HWY_AVX3_DL
  return Vec512<uint8_t>{_mm512_aesenc_epi128(state.raw, round_key.raw)};
#else
  alignas(64) uint8_t a[64];
  alignas(64) uint8_t b[64];
  const Full512<uint8_t> d;
  const Half<decltype(d)> d2;
  return Combine(d, AESRound(UpperHalf(d2, state), UpperHalf(d2, round_key)),
                 AESRound(LowerHalf(state), LowerHalf(round_key)));
#endif
}

HWY_API Vec512<uint8_t> AESLastRound(Vec512<uint8_t> state,
                                     Vec512<uint8_t> round_key) {
#if HWY_TARGET == HWY_AVX3_DL
  return Vec512<uint8_t>{_mm512_aesenclast_epi128(state.raw, round_key.raw)};
#else
  const Full512<uint8_t> d;
  const Half<decltype(d)> d2;
  return Combine(d,
                 AESLastRound(UpperHalf(d2, state), UpperHalf(d2, round_key)),
                 AESLastRound(LowerHalf(state), LowerHalf(round_key)));
  const Full128<uint8_t> d128;
  Store(state, d, a);
  Store(round_key, d, b);
  for (size_t i = 0; i < 64; i += 16) {
    const auto enc = AESRound(Load(d128, a + i), Load(d128, b + i));
    Store(enc, d128, a + i);
  }
  return Load(d, a);
#endif
}

@@ -3411,8 +3264,8 @@ HWY_API Vec512<T> Compress(Vec512<T> v, const Mask512<T> mask) {
  const auto compressed0 = Compress(promoted0, mask0);
  const auto compressed1 = Compress(promoted1, mask1);

  const auto demoted0 = ZeroExtendVector(du, DemoteTo(duh, compressed0));
  const auto demoted1 = ZeroExtendVector(du, DemoteTo(duh, compressed1));
  const auto demoted0 = ZeroExtendVector(DemoteTo(duh, compressed0));
  const auto demoted1 = ZeroExtendVector(DemoteTo(duh, compressed1));

  // Concatenate into single vector by shifting upper with writemask.
  const size_t num0 = CountTrue(dw, mask0);

@@ -3510,7 +3363,7 @@ HWY_API size_t CompressBlendedStore(Vec512<T> v, Mask512<T> m, Full512<T> d,
  if (HWY_TARGET == HWY_AVX3_DL || sizeof(T) != 2) {
    return CompressStore(v, m, d, unaligned);
  } else {
    const size_t count = CountTrue(d, m);
    const size_t count = CountTrue(m);
    const Vec512<T> compressed = Compress(v, m);
    const Vec512<T> prev = LoadU(d, unaligned);
    StoreU(IfThenElse(FirstN(d, count), compressed, prev), d, unaligned);

@@ -3569,9 +3422,9 @@ HWY_API void StoreInterleaved3(const Vec512<uint8_t> a, const Vec512<uint8_t> b,
  const auto k = (r2 | g2 | b2).raw;  // low byte in each 128bit: 3A 2A 1A 0A

  // To obtain 10 0A 05 00 in one vector, transpose "rows" into "columns".
  const auto k3_k0_i3_i0 = _mm512_shuffle_i64x2(i, k, _MM_PERM_DADA);
  const auto i1_i2_j0_j1 = _mm512_shuffle_i64x2(j, i, _MM_PERM_BCAB);
  const auto j2_j3_k1_k2 = _mm512_shuffle_i64x2(k, j, _MM_PERM_CDBC);
  const auto k3_k0_i3_i0 = _mm512_shuffle_i64x2(i, k, _MM_SHUFFLE(3, 0, 3, 0));
  const auto i1_i2_j0_j1 = _mm512_shuffle_i64x2(j, i, _MM_SHUFFLE(1, 2, 0, 1));
  const auto j2_j3_k1_k2 = _mm512_shuffle_i64x2(k, j, _MM_SHUFFLE(2, 3, 1, 2));

  // Alternating order, most-significant 128 bits from the second arg.
  const __mmask8 m = 0xCC;

@@ -3603,12 +3456,12 @@ HWY_API void StoreInterleaved4(const Vec512<uint8_t> v0,
  const auto k = ZipLower(d32, ba8, dc8).raw;  // 4x128bit: d..aB d..a8
  const auto l = ZipUpper(d32, ba8, dc8).raw;  // 4x128bit: d..aF d..aC
  // 128-bit blocks were independent until now; transpose 4x4.
  const auto j1_j0_i1_i0 = _mm512_shuffle_i64x2(i, j, _MM_PERM_BABA);
  const auto l1_l0_k1_k0 = _mm512_shuffle_i64x2(k, l, _MM_PERM_BABA);
  const auto j3_j2_i3_i2 = _mm512_shuffle_i64x2(i, j, _MM_PERM_DCDC);
  const auto l3_l2_k3_k2 = _mm512_shuffle_i64x2(k, l, _MM_PERM_DCDC);
  constexpr _MM_PERM_ENUM k20 = _MM_PERM_CACA;
  constexpr _MM_PERM_ENUM k31 = _MM_PERM_DBDB;
  const auto j1_j0_i1_i0 = _mm512_shuffle_i64x2(i, j, _MM_SHUFFLE(1, 0, 1, 0));
  const auto l1_l0_k1_k0 = _mm512_shuffle_i64x2(k, l, _MM_SHUFFLE(1, 0, 1, 0));
  const auto j3_j2_i3_i2 = _mm512_shuffle_i64x2(i, j, _MM_SHUFFLE(3, 2, 3, 2));
  const auto l3_l2_k3_k2 = _mm512_shuffle_i64x2(k, l, _MM_SHUFFLE(3, 2, 3, 2));
  constexpr int k20 = _MM_SHUFFLE(2, 0, 2, 0);
  constexpr int k31 = _MM_SHUFFLE(3, 1, 3, 1);
  const auto l0_k0_j0_i0 = _mm512_shuffle_i64x2(j1_j0_i1_i0, l1_l0_k1_k0, k20);
  const auto l1_k1_j1_i1 = _mm512_shuffle_i64x2(j1_j0_i1_i0, l1_l0_k1_k0, k31);
  const auto l2_k2_j2_i2 = _mm512_shuffle_i64x2(j3_j2_i3_i2, l3_l2_k3_k2, k20);

@@ -3778,6 +3631,103 @@ HWY_API Vec512<T> MaxOfLanes(Full512<T> d, Vec512<T> v) {
  return BitCast(d, Or(min, ShiftLeft<16>(min)));
}

// ================================================== DEPRECATED

template <typename T>
HWY_API size_t StoreMaskBits(const Mask512<T> mask, uint8_t* bits) {
  return StoreMaskBits(Full512<T>(), mask, bits);
}

template <typename T>
HWY_API bool AllTrue(const Mask512<T> mask) {
  return AllTrue(Full512<T>(), mask);
}

template <typename T>
HWY_API bool AllFalse(const Mask512<T> mask) {
  return AllFalse(Full512<T>(), mask);
}

template <typename T>
HWY_API size_t CountTrue(const Mask512<T> mask) {
  return CountTrue(Full512<T>(), mask);
}

template <typename T>
HWY_API Vec512<T> SumOfLanes(Vec512<T> v) {
  return SumOfLanes(Full512<T>(), v);
}

template <typename T>
HWY_API Vec512<T> MinOfLanes(Vec512<T> v) {
  return MinOfLanes(Full512<T>(), v);
}

template <typename T>
HWY_API Vec512<T> MaxOfLanes(Vec512<T> v) {
  return MaxOfLanes(Full512<T>(), v);
}

template <typename T>
HWY_API Vec256<T> UpperHalf(Vec512<T> v) {
  return UpperHalf(Full256<T>(), v);
}

template <int kBytes, typename T>
HWY_API Vec512<T> ShiftRightBytes(const Vec512<T> v) {
  return ShiftRightBytes<kBytes>(Full512<T>(), v);
}

template <int kLanes, typename T>
HWY_API Vec512<T> ShiftRightLanes(const Vec512<T> v) {
  return ShiftRightBytes<kLanes>(Full512<T>(), v);
}

template <size_t kBytes, typename T>
HWY_API Vec512<T> CombineShiftRightBytes(Vec512<T> hi, Vec512<T> lo) {
  return CombineShiftRightBytes<kBytes>(Full512<T>(), hi, lo);
}

template <typename T>
HWY_API Vec512<T> InterleaveUpper(Vec512<T> a, Vec512<T> b) {
  return InterleaveUpper(Full512<T>(), a, b);
}

template <typename T>
HWY_API Vec512<MakeWide<T>> ZipUpper(Vec512<T> a, Vec512<T> b) {
  return InterleaveUpper(Full512<MakeWide<T>>(), a, b);
}

template <typename T>
HWY_API Vec512<T> Combine(Vec256<T> hi, Vec256<T> lo) {
  return Combine(Full512<T>(), hi, lo);
}

template <typename T>
HWY_API Vec512<T> ZeroExtendVector(Vec256<T> lo) {
  return ZeroExtendVector(Full512<T>(), lo);
}

template <typename T>
HWY_API Vec512<T> ConcatLowerLower(Vec512<T> hi, Vec512<T> lo) {
  return ConcatLowerLower(Full512<T>(), hi, lo);
}

template <typename T>
HWY_API Vec512<T> ConcatLowerUpper(Vec512<T> hi, Vec512<T> lo) {
  return ConcatLowerUpper(Full512<T>(), hi, lo);
}

template <typename T>
HWY_API Vec512<T> ConcatUpperLower(Vec512<T> hi, Vec512<T> lo) {
  return ConcatUpperLower(Full512<T>(), hi, lo);
}

template <typename T>
HWY_API Vec512<T> ConcatUpperUpper(Vec512<T> hi, Vec512<T> lo) {
  return ConcatUpperUpper(Full512<T>(), hi, lo);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy

@@ -15,25 +15,23 @@
#include "hwy/targets.h"

#include <stdarg.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#include <atomic>
#include <cstddef>
#include <limits>

#include "hwy/base.h"

#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
    defined(THREAD_SANITIZER)
#include "sanitizer/common_interface_defs.h"  // __sanitizer_print_stack_trace
#endif

#include <stdlib.h>  // abort / exit
#endif  // defined(*_SANITIZER)

#if HWY_ARCH_X86
#include <xmmintrin.h>
#if HWY_COMPILER_MSVC
#include <intrin.h>
#else   // !HWY_COMPILER_MSVC
#else   // HWY_COMPILER_MSVC
#include <cpuid.h>
#endif  // HWY_COMPILER_MSVC
#endif  // HWY_ARCH_X86

@@ -95,7 +93,7 @@ std::atomic<uint32_t> supported_{0};  // Not yet initialized
uint32_t supported_targets_for_test_ = 0;

// Mask of targets disabled at runtime with DisableTargets.
uint32_t supported_mask_{LimitsMax<uint32_t>()};
uint32_t supported_mask_{std::numeric_limits<uint32_t>::max()};

#if HWY_ARCH_X86
// Arbritrary bit indices indicating which instruction set extensions are

@@ -192,22 +190,21 @@ HWY_NORETURN void HWY_FORMAT(3, 4)
  va_end(args);

  fprintf(stderr, "Abort at %s:%d: %s\n", file, line, buf);

  // If compiled with any sanitizer, they can also print a stack trace.
#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
    defined(THREAD_SANITIZER)
  // If compiled with any sanitizer print a stack trace. This call doesn't
  // crash the program, instead the trap below will crash it also allowing
  // gdb to break there.
  __sanitizer_print_stack_trace();
#endif  // HWY_IS_*
#endif  // defined(*_SANITIZER)
  fflush(stderr);

  // Now terminate the program:
#if HWY_ARCH_RVV
  exit(1);  // trap/abort just freeze Spike.
#elif HWY_IS_DEBUG_BUILD && !HWY_COMPILER_MSVC
  // Facilitates breaking into a debugger, but don't use this in non-debug
  // builds because it looks like "illegal instruction", which is misleading.
  __builtin_trap();
#else
#if HWY_COMPILER_MSVC
  abort();  // Compile error without this due to HWY_NORETURN.
#elif HWY_ARCH_RVV
  exit(1);  // trap/abort just freeze Spike
#else
  __builtin_trap();
#endif
}

@@ -216,7 +213,7 @@ void DisableTargets(uint32_t disabled_targets) {
  // We can call Update() here to initialize the mask but that will trigger a
  // call to SupportedTargets() which we use in tests to tell whether any of
  // the highway dynamic dispatch functions were used.
  GetChosenTarget().DeInit();
  chosen_target.DeInit();
}

void SetSupportedTargetsForTest(uint32_t targets) {

@@ -225,7 +222,7 @@ void SetSupportedTargetsForTest(uint32_t targets) {
  // if not zero.
  supported_.store(0, std::memory_order_release);
  supported_targets_for_test_ = targets;
  GetChosenTarget().DeInit();
  chosen_target.DeInit();
}

bool SupportedTargetsCalledForTest() {

@@ -347,10 +344,8 @@ uint32_t SupportedTargets() {
  return bits & supported_mask_;
}

HWY_DLLEXPORT ChosenTarget& GetChosenTarget() {
  static ChosenTarget chosen_target;
  return chosen_target;
}
// Declared in targets.h
ChosenTarget chosen_target;

void ChosenTarget::Update() {
  // The supported variable contains the current CPU supported targets shifted
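
The hunk above also swaps a function-local static accessor for a plain global: GetChosenTarget() guarantees initialization on first use and gives shared libraries a single exported access point, whereas the older code restored here names the global chosen_target directly. A reduced sketch of the two idioms, with simplified stand-in names (illustration only, not library code):

struct ChosenTargetSketch {
  void DeInit() {}
};

// Newer style: Meyers singleton behind an accessor; initialization is lazy
// and thread-safe since C++11, and only the accessor needs exporting.
ChosenTargetSketch& GetChosenTargetSketch() {
  static ChosenTargetSketch t;
  return t;
}

// Older style restored by this backout: a global that callers reference.
ChosenTargetSketch chosen_target_sketch;
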
@@ -22,7 +22,6 @@

#include "hwy/base.h"
#include "hwy/detect_targets.h"
#include "hwy/highway_export.h"

namespace hwy {

@@ -30,7 +29,7 @@ namespace hwy {
// Implemented in targets.cc; unconditionally compiled to support the use case
// of binary-only distributions. The HWY_SUPPORTED_TARGETS wrapper may allow
// eliding calls to this function.
HWY_DLLEXPORT uint32_t SupportedTargets();
uint32_t SupportedTargets();

// Evaluates to a function call, or literal if there is a single target.
#if (HWY_TARGETS & (HWY_TARGETS - 1)) == 0

@@ -45,7 +44,7 @@ HWY_DLLEXPORT uint32_t SupportedTargets();
// lower target is desired. For this reason, attempts to disable targets which
// are in HWY_ENABLED_BASELINE have no effect so SupportedTargets() always
// returns at least the baseline target.
HWY_DLLEXPORT void DisableTargets(uint32_t disabled_targets);
void DisableTargets(uint32_t disabled_targets);

// Set the mock mask of CPU supported targets instead of the actual CPU
// supported targets computed in SupportedTargets(). The return value of

@@ -53,11 +52,11 @@ HWY_DLLEXPORT void DisableTargets(uint32_t disabled_targets);
// regardless of this mock, to prevent accidentally adding targets that are
// known to be buggy in the current CPU. Call with a mask of 0 to disable the
// mock and use the actual CPU supported targets instead.
HWY_DLLEXPORT void SetSupportedTargetsForTest(uint32_t targets);
void SetSupportedTargetsForTest(uint32_t targets);

// Returns whether the SupportedTargets() function was called since the last
// SetSupportedTargetsForTest() call.
HWY_DLLEXPORT bool SupportedTargetsCalledForTest();
bool SupportedTargetsCalledForTest();

// Return the list of targets in HWY_TARGETS supported by the CPU as a list of
// individual HWY_* target macros such as HWY_SCALAR or HWY_NEON. This list

@@ -226,7 +225,7 @@ struct ChosenTarget {
 public:
  // Update the ChosenTarget mask based on the current CPU supported
  // targets.
  HWY_DLLEXPORT void Update();
  void Update();

  // Reset the ChosenTarget to the uninitialized state.
  void DeInit() { mask_.store(1); }

@@ -246,12 +245,11 @@ struct ChosenTarget {
  }

 private:
  // Initialized to 1 so GetIndex() returns 0.
  // Initialized to 1 so GetChosenTargetIndex() returns 0.
  std::atomic<uint32_t> mask_{1};
};

// For internal use (e.g. by FunctionCache and DisableTargets).
HWY_DLLEXPORT ChosenTarget& GetChosenTarget();
extern ChosenTarget chosen_target;

}  // namespace hwy

@@ -44,11 +44,11 @@ void CheckFakeFunction() {
    hwy::SetSupportedTargetsForTest(HWY_##TGT);                              \
    /* Calling Update() first to make &HWY_DYNAMIC_DISPATCH() return */     \
    /* the pointer to the already cached function. */                       \
    hwy::GetChosenTarget().Update();                                        \
    hwy::chosen_target.Update();                                            \
    EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \
    /* Calling DeInit() will test that the initializer function */          \
    /* also calls the right function. */                                    \
    hwy::GetChosenTarget().DeInit();                                        \
    hwy::chosen_target.DeInit();                                            \
    EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \
    /* Second call uses the cached value from the previous call. */         \
    EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \

@@ -177,6 +177,387 @@ HWY_NOINLINE void TestAllAbs() {
  ForFloatTypes(ForPartialVectors<TestFloatAbs>());
}

template <bool kSigned>
struct TestLeftShifts {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T t, D d) {
    if (kSigned) {
      // Also test positive values
      TestLeftShifts</*kSigned=*/false>()(t, d);
    }

    using TI = MakeSigned<T>;
    using TU = MakeUnsigned<T>;
    const size_t N = Lanes(d);
    auto expected = AllocateAligned<T>(N);

    const auto values = Iota(d, kSigned ? -TI(N) : TI(0));  // value to shift
    constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;

    // 0
    HWY_ASSERT_VEC_EQ(d, values, ShiftLeft<0>(values));
    HWY_ASSERT_VEC_EQ(d, values, ShiftLeftSame(values, 0));

    // 1
    for (size_t i = 0; i < N; ++i) {
      const T value = kSigned ? T(T(i) - T(N)) : T(i);
      expected[i] = T(TU(value) << 1);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<1>(values));
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, 1));

    // max
    for (size_t i = 0; i < N; ++i) {
      const T value = kSigned ? T(T(i) - T(N)) : T(i);
      expected[i] = T(TU(value) << kMaxShift);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<kMaxShift>(values));
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, kMaxShift));
  }
};

template <bool kSigned>
struct TestVariableLeftShifts {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T t, D d) {
    if (kSigned) {
      // Also test positive values
      TestVariableLeftShifts</*kSigned=*/false>()(t, d);
    }

    using TI = MakeSigned<T>;
    using TU = MakeUnsigned<T>;
    const size_t N = Lanes(d);
    auto expected = AllocateAligned<T>(N);

    const auto v0 = Zero(d);
    const auto v1 = Set(d, 1);
    const auto values = Iota(d, kSigned ? -TI(N) : TI(0));  // value to shift

    constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
    const auto max_shift = Set(d, kMaxShift);
    const auto small_shifts = And(Iota(d, 0), max_shift);
    const auto large_shifts = max_shift - small_shifts;

    // Same: 0
    HWY_ASSERT_VEC_EQ(d, values, Shl(values, v0));

    // Same: 1
    for (size_t i = 0; i < N; ++i) {
      const T value = kSigned ? T(i) - T(N) : T(i);
      expected[i] = T(TU(value) << 1);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, v1));

    // Same: max
    for (size_t i = 0; i < N; ++i) {
      const T value = kSigned ? T(i) - T(N) : T(i);
      expected[i] = T(TU(value) << kMaxShift);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, max_shift));

    // Variable: small
    for (size_t i = 0; i < N; ++i) {
      const T value = kSigned ? T(i) - T(N) : T(i);
      expected[i] = T(TU(value) << (i & kMaxShift));
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, small_shifts));

    // Variable: large
    for (size_t i = 0; i < N; ++i) {
      expected[i] = T(TU(1) << (kMaxShift - (i & kMaxShift)));
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(v1, large_shifts));
  }
};

struct TestUnsignedRightShifts {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    auto expected = AllocateAligned<T>(N);

    const auto values = Iota(d, 0);

    const T kMax = LimitsMax<T>();
    constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;

    // Shift by 0
    HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values));
    HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0));

    // Shift by 1
    for (size_t i = 0; i < N; ++i) {
      expected[i] = T(T(i & kMax) >> 1);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values));
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1));

    // max
    for (size_t i = 0; i < N; ++i) {
      expected[i] = T(T(i & kMax) >> kMaxShift);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<kMaxShift>(values));
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, kMaxShift));
  }
};

struct TestRotateRight {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    auto expected = AllocateAligned<T>(N);

    constexpr size_t kBits = sizeof(T) * 8;
    const auto mask_shift = Set(d, T{kBits});
    // Cover as many bit positions as possible to test shifting out
    const auto values = Shl(Set(d, T{1}), And(Iota(d, 0), mask_shift));

    // Rotate by 0
    HWY_ASSERT_VEC_EQ(d, values, RotateRight<0>(values));

    // Rotate by 1
    Store(values, d, expected.get());
    for (size_t i = 0; i < N; ++i) {
      expected[i] = (expected[i] >> 1) | (expected[i] << (kBits - 1));
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<1>(values));

    // Rotate by half
    Store(values, d, expected.get());
    for (size_t i = 0; i < N; ++i) {
      expected[i] = (expected[i] >> (kBits / 2)) | (expected[i] << (kBits / 2));
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<kBits / 2>(values));

    // Rotate by max
    Store(values, d, expected.get());
    for (size_t i = 0; i < N; ++i) {
      expected[i] = (expected[i] >> (kBits - 1)) | (expected[i] << 1);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<kBits - 1>(values));
  }
};

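The expected[] updates above implement the standard rotate identity: rotating right by k ORs the low k bits back in at the top. A standalone scalar version for uint32_t (my sketch; valid for 0 < k < 32, since shifting by the full width is undefined):

#include <cstdint>

uint32_t RotateRight32(uint32_t x, unsigned k) {
  // The low k bits wrap around to become the high k bits.
  return (x >> k) | (x << (32 - k));
}
// Example: RotateRight32(1u, 1) == 0x80000000u.
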
struct TestVariableUnsignedRightShifts {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    auto expected = AllocateAligned<T>(N);

    const auto v0 = Zero(d);
    const auto v1 = Set(d, 1);
    const auto values = Iota(d, 0);

    const T kMax = LimitsMax<T>();
    const auto max = Set(d, kMax);

    constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
    const auto max_shift = Set(d, kMaxShift);
    const auto small_shifts = And(Iota(d, 0), max_shift);
    const auto large_shifts = max_shift - small_shifts;

    // Same: 0
    HWY_ASSERT_VEC_EQ(d, values, Shr(values, v0));

    // Same: 1
    for (size_t i = 0; i < N; ++i) {
      expected[i] = T(T(i & kMax) >> 1);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(values, v1));

    // Same: max
    HWY_ASSERT_VEC_EQ(d, v0, Shr(values, max_shift));

    // Variable: small
    for (size_t i = 0; i < N; ++i) {
      expected[i] = T(i) >> (i & kMaxShift);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(values, small_shifts));

    // Variable: Large
    for (size_t i = 0; i < N; ++i) {
      expected[i] = kMax >> (kMaxShift - (i & kMaxShift));
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(max, large_shifts));
  }
};

template <int kAmount, typename T>
T RightShiftNegative(T val) {
  // C++ shifts are implementation-defined for negative numbers, and we have
  // seen divisions replaced with shifts, so resort to bit operations.
  using TU = hwy::MakeUnsigned<T>;
  TU bits;
  CopyBytes<sizeof(T)>(&val, &bits);

  const TU shifted = TU(bits >> kAmount);

  const TU all = TU(~TU(0));
  const size_t num_zero = sizeof(TU) * 8 - 1 - kAmount;
  const TU sign_extended = static_cast<TU>((all << num_zero) & LimitsMax<TU>());

  bits = shifted | sign_extended;
  CopyBytes<sizeof(T)>(&bits, &val);
  return val;
}

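RightShiftNegative above avoids shifting a negative value by operating on the unsigned bit pattern and then setting the vacated sign bits. An equivalent fixed-width sketch for int32_t (mine, not from the patch; assumes two's complement and 0 <= k < 32):

#include <cstdint>
#include <cstring>

int32_t ArithmeticShiftRight32(int32_t val, unsigned k) {
  if (k == 0) return val;
  uint32_t bits;
  std::memcpy(&bits, &val, sizeof(bits));  // reinterpret without UB
  uint32_t shifted = bits >> k;
  if (bits & 0x80000000u) {
    shifted |= ~0u << (32 - k);  // sign-extend the vacated top bits
  }
  std::memcpy(&val, &shifted, sizeof(val));
  return val;
}
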
class TestSignedRightShifts {
 public:
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    auto expected = AllocateAligned<T>(N);
    constexpr T kMin = LimitsMin<T>();
    constexpr T kMax = LimitsMax<T>();
    constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;

    // First test positive values, negative are checked below.
    const auto v0 = Zero(d);
    const auto values = And(Iota(d, 0), Set(d, kMax));

    // Shift by 0
    HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values));
    HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0));

    // Shift by 1
    for (size_t i = 0; i < N; ++i) {
      expected[i] = T(T(i & kMax) >> 1);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values));
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1));

    // max
    HWY_ASSERT_VEC_EQ(d, v0, ShiftRight<kMaxShift>(values));
    HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(values, kMaxShift));

    // Even negative value
    Test<0>(kMin, d, __LINE__);
    Test<1>(kMin, d, __LINE__);
    Test<2>(kMin, d, __LINE__);
    Test<kMaxShift>(kMin, d, __LINE__);

    const T odd = static_cast<T>(kMin + 1);
    Test<0>(odd, d, __LINE__);
    Test<1>(odd, d, __LINE__);
    Test<2>(odd, d, __LINE__);
    Test<kMaxShift>(odd, d, __LINE__);
  }

 private:
  template <int kAmount, typename T, class D>
  void Test(T val, D d, int line) {
    const auto expected = Set(d, RightShiftNegative<kAmount>(val));
    const auto in = Set(d, val);
    const char* file = __FILE__;
    AssertVecEqual(d, expected, ShiftRight<kAmount>(in), file, line);
    AssertVecEqual(d, expected, ShiftRightSame(in, kAmount), file, line);
  }
};

struct TestVariableSignedRightShifts {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    using TU = MakeUnsigned<T>;
    const size_t N = Lanes(d);
    auto expected = AllocateAligned<T>(N);

    constexpr T kMin = LimitsMin<T>();
    constexpr T kMax = LimitsMax<T>();

    constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;

    // First test positive values, negative are checked below.
    const auto v0 = Zero(d);
    const auto positive = Iota(d, 0) & Set(d, kMax);

    // Shift by 0
    HWY_ASSERT_VEC_EQ(d, positive, ShiftRight<0>(positive));
    HWY_ASSERT_VEC_EQ(d, positive, ShiftRightSame(positive, 0));

    // Shift by 1
    for (size_t i = 0; i < N; ++i) {
      expected[i] = T(T(i & kMax) >> 1);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(positive));
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(positive, 1));

    // max
    HWY_ASSERT_VEC_EQ(d, v0, ShiftRight<kMaxShift>(positive));
    HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(positive, kMaxShift));

    const auto max_shift = Set(d, kMaxShift);
    const auto small_shifts = And(Iota(d, 0), max_shift);
    const auto large_shifts = max_shift - small_shifts;

    const auto negative = Iota(d, kMin);

    // Test varying negative to shift
    for (size_t i = 0; i < N; ++i) {
      expected[i] = RightShiftNegative<1>(static_cast<T>(kMin + i));
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(negative, Set(d, 1)));

    // Shift MSB right by small amounts
    for (size_t i = 0; i < N; ++i) {
      const size_t amount = i & kMaxShift;
      const TU shifted = ~((1ull << (kMaxShift - amount)) - 1);
      CopyBytes<sizeof(T)>(&shifted, &expected[i]);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(Set(d, kMin), small_shifts));

    // Shift MSB right by large amounts
    for (size_t i = 0; i < N; ++i) {
      const size_t amount = kMaxShift - (i & kMaxShift);
      const TU shifted = ~((1ull << (kMaxShift - amount)) - 1);
      CopyBytes<sizeof(T)>(&shifted, &expected[i]);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(Set(d, kMin), large_shifts));
  }
};

HWY_NOINLINE void TestAllShifts() {
  ForUnsignedTypes(ForPartialVectors<TestLeftShifts</*kSigned=*/false>>());
  ForSignedTypes(ForPartialVectors<TestLeftShifts</*kSigned=*/true>>());
  ForUnsignedTypes(ForPartialVectors<TestUnsignedRightShifts>());
  ForSignedTypes(ForPartialVectors<TestSignedRightShifts>());
}

HWY_NOINLINE void TestAllVariableShifts() {
  const ForPartialVectors<TestLeftShifts</*kSigned=*/false>> shl_u;
  const ForPartialVectors<TestLeftShifts</*kSigned=*/true>> shl_s;
  const ForPartialVectors<TestUnsignedRightShifts> shr_u;
  const ForPartialVectors<TestSignedRightShifts> shr_s;

  shl_u(uint16_t());
  shr_u(uint16_t());

  shl_u(uint32_t());
  shr_u(uint32_t());

  shl_s(int16_t());
  shr_s(int16_t());

  shl_s(int32_t());
  shr_s(int32_t());

#if HWY_CAP_INTEGER64
  shl_u(uint64_t());
  shr_u(uint64_t());

  shl_s(int64_t());
  shr_s(int64_t());
#endif
}

HWY_NOINLINE void TestAllRotateRight() {
  const ForPartialVectors<TestRotateRight> test;
  test(uint32_t());
#if HWY_CAP_INTEGER64
  test(uint64_t());
#endif
}

struct TestUnsignedMinMax {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {

@@ -263,84 +644,6 @@ HWY_NOINLINE void TestAllMinMax() {
  ForFloatTypes(ForPartialVectors<TestFloatMinMax>());
}

class TestMinMax128 {
  template <class D>
  static HWY_NOINLINE Vec<D> Make128(D d, uint64_t hi, uint64_t lo) {
    alignas(16) uint64_t in[2];
    in[0] = lo;
    in[1] = hi;
    return LoadDup128(d, in);
  }

 public:
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    using V = Vec<D>;
    const size_t N = Lanes(d);
    auto a_lanes = AllocateAligned<T>(N);
    auto b_lanes = AllocateAligned<T>(N);
    auto min_lanes = AllocateAligned<T>(N);
    auto max_lanes = AllocateAligned<T>(N);
    RandomState rng;

    const V v00 = Zero(d);
    const V v01 = Make128(d, 0, 1);
    const V v10 = Make128(d, 1, 0);
    const V v11 = Add(v01, v10);

    // Same arg
    HWY_ASSERT_VEC_EQ(d, v00, Min128(d, v00, v00));
    HWY_ASSERT_VEC_EQ(d, v01, Min128(d, v01, v01));
    HWY_ASSERT_VEC_EQ(d, v10, Min128(d, v10, v10));
    HWY_ASSERT_VEC_EQ(d, v11, Min128(d, v11, v11));
    HWY_ASSERT_VEC_EQ(d, v00, Max128(d, v00, v00));
    HWY_ASSERT_VEC_EQ(d, v01, Max128(d, v01, v01));
    HWY_ASSERT_VEC_EQ(d, v10, Max128(d, v10, v10));
    HWY_ASSERT_VEC_EQ(d, v11, Max128(d, v11, v11));

    // First arg less
    HWY_ASSERT_VEC_EQ(d, v00, Min128(d, v00, v01));
    HWY_ASSERT_VEC_EQ(d, v01, Min128(d, v01, v10));
    HWY_ASSERT_VEC_EQ(d, v10, Min128(d, v10, v11));
    HWY_ASSERT_VEC_EQ(d, v01, Max128(d, v00, v01));
    HWY_ASSERT_VEC_EQ(d, v10, Max128(d, v01, v10));
    HWY_ASSERT_VEC_EQ(d, v11, Max128(d, v10, v11));

    // Second arg less
    HWY_ASSERT_VEC_EQ(d, v00, Min128(d, v01, v00));
    HWY_ASSERT_VEC_EQ(d, v01, Min128(d, v10, v01));
    HWY_ASSERT_VEC_EQ(d, v10, Min128(d, v11, v10));
    HWY_ASSERT_VEC_EQ(d, v01, Max128(d, v01, v00));
    HWY_ASSERT_VEC_EQ(d, v10, Max128(d, v10, v01));
    HWY_ASSERT_VEC_EQ(d, v11, Max128(d, v11, v10));

    // Also check 128-bit blocks are independent
    for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
      for (size_t i = 0; i < N; ++i) {
        a_lanes[i] = Random64(&rng);
        b_lanes[i] = Random64(&rng);
      }
      const V a = Load(d, a_lanes.get());
      const V b = Load(d, b_lanes.get());
      for (size_t i = 0; i < N; i += 2) {
        const bool lt = a_lanes[i + 1] == b_lanes[i + 1]
                            ? (a_lanes[i] < b_lanes[i])
                            : (a_lanes[i + 1] < b_lanes[i + 1]);
        min_lanes[i + 0] = lt ? a_lanes[i + 0] : b_lanes[i + 0];
        min_lanes[i + 1] = lt ? a_lanes[i + 1] : b_lanes[i + 1];
        max_lanes[i + 0] = lt ? b_lanes[i + 0] : a_lanes[i + 0];
        max_lanes[i + 1] = lt ? b_lanes[i + 1] : a_lanes[i + 1];
      }
      HWY_ASSERT_VEC_EQ(d, min_lanes.get(), Min128(d, a, b));
      HWY_ASSERT_VEC_EQ(d, max_lanes.get(), Max128(d, a, b));
    }
  }
};

HWY_NOINLINE void TestAllMinMax128() {
  ForGEVectors<128, TestMinMax128>()(uint64_t());
}

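The reference loop in TestMinMax128 treats lanes (i, i+1) as the low and high u64 halves of one 128-bit value and selects the whole operand per block. The per-block rule, as a scalar sketch (an illustration, not library code):

#include <cstdint>
#include <utility>

using U128 = std::pair<uint64_t, uint64_t>;  // (lo, hi)

U128 Min128Scalar(const U128& a, const U128& b) {
  // Compare high halves first; ties are decided by the low halves.
  const bool lt = (a.second == b.second) ? (a.first < b.first)
                                         : (a.second < b.second);
  return lt ? a : b;  // the entire 128-bit operand is selected
}
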
struct TestUnsignedMul {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {

@@ -531,11 +834,11 @@ struct TestMulEvenOdd64 {
};

HWY_NOINLINE void TestAllMulEven() {
  ForGEVectors<64, TestMulEven> test;
  ForExtendableVectors<TestMulEven> test;
  test(int32_t());
  test(uint32_t());

  ForGEVectors<128, TestMulEvenOdd64>()(uint64_t());
  ForGE128Vectors<TestMulEvenOdd64>()(uint64_t());
}

struct TestMulAdd {

@@ -810,6 +1113,7 @@ AlignedFreeUniquePtr<T[]> RoundTestCases(T /*unused*/, D d, size_t& padded) {
      // negative +/- epsilon
      T(-1) + eps,
      T(-1) - eps,
#if !defined(HWY_EMULATE_SVE)  // these are not safe to just cast to int
      // +/- huge (but still fits in float)
      T(1E34),
      T(-1E35),

@@ -818,6 +1122,7 @@ AlignedFreeUniquePtr<T[]> RoundTestCases(T /*unused*/, D d, size_t& padded) {
      -std::numeric_limits<T>::infinity(),
      // qNaN
      GetLane(NaN(d))
#endif
  };
  const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
  const size_t N = Lanes(d);

@@ -1064,41 +1369,6 @@ HWY_NOINLINE void TestAllAbsDiff() {
  ForPartialVectors<TestAbsDiff>()(float());
}

struct TestSumsOf8 {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    RandomState rng;

    const size_t N = Lanes(d);
    if (N < 8) return;
    const Repartition<uint64_t, D> du64;

    auto in_lanes = AllocateAligned<T>(N);
    auto sum_lanes = AllocateAligned<uint64_t>(N / 8);

    for (size_t rep = 0; rep < 100; ++rep) {
      for (size_t i = 0; i < N; ++i) {
        in_lanes[i] = Random64(&rng) & 0xFF;
      }

      for (size_t idx_sum = 0; idx_sum < N / 8; ++idx_sum) {
        uint64_t sum = 0;
        for (size_t i = 0; i < 8; ++i) {
          sum += in_lanes[idx_sum * 8 + i];
        }
        sum_lanes[idx_sum] = sum;
      }

      const Vec<D> in = Load(d, in_lanes.get());
      HWY_ASSERT_VEC_EQ(du64, sum_lanes.get(), SumsOf8(in));
    }
  }
};

HWY_NOINLINE void TestAllSumsOf8() {
  ForGEVectors<64, TestSumsOf8>()(uint8_t());
}

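SumsOf8 reduces each aligned group of eight u8 lanes to one u64 lane, which is exactly the nested reference loop above. As a standalone scalar model (my sketch):

#include <cstddef>
#include <cstdint>

// out[j] = in[8*j] + ... + in[8*j + 7]; n must be a multiple of 8.
void SumsOf8Scalar(const uint8_t* in, size_t n, uint64_t* out) {
  for (size_t j = 0; j < n / 8; ++j) {
    uint64_t sum = 0;
    for (size_t i = 0; i < 8; ++i) sum += in[8 * j + i];
    out[j] = sum;
  }
}
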
struct TestNeg {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {

@@ -1127,8 +1397,10 @@ namespace hwy {
HWY_BEFORE_TEST(HwyArithmeticTest);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllPlusMinus);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSaturatingArithmetic);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllShifts);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllVariableShifts);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllRotateRight);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax128);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAverage);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAbs);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMul);

@@ -1148,7 +1420,6 @@ HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllTrunc);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllCeil);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllFloor);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAbsDiff);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSumsOf8);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllNeg);
}  // namespace hwy

@@ -393,7 +393,7 @@ HWY_NOINLINE void TestAllZip() {
  lower_unsigned(uint8_t());
#endif
  lower_unsigned(uint16_t());
#if HWY_HAVE_INTEGER64
#if HWY_CAP_INTEGER64
  lower_unsigned(uint32_t());  // generates u64
#endif

@@ -402,7 +402,7 @@ HWY_NOINLINE void TestAllZip() {
  lower_signed(int8_t());
#endif
  lower_signed(int16_t());
#if HWY_HAVE_INTEGER64
#if HWY_CAP_INTEGER64
  lower_signed(int32_t());  // generates i64
#endif

@@ -411,7 +411,7 @@ HWY_NOINLINE void TestAllZip() {
  upper_unsigned(uint8_t());
#endif
  upper_unsigned(uint16_t());
#if HWY_HAVE_INTEGER64
#if HWY_CAP_INTEGER64
  upper_unsigned(uint32_t());  // generates u64
#endif

@@ -420,20 +420,19 @@ HWY_NOINLINE void TestAllZip() {
  upper_signed(int8_t());
#endif
  upper_signed(int16_t());
#if HWY_HAVE_INTEGER64
#if HWY_CAP_INTEGER64
  upper_signed(int32_t());  // generates i64
#endif

  // No float - concatenating f32 does not result in a f64
}

template <int kBytes>
struct TestCombineShiftRightBytesR {
  template <class T, class D>
  HWY_NOINLINE void operator()(T t, D d) {
    // Scalar does not define CombineShiftRightBytes.
#if HWY_TARGET != HWY_SCALAR || HWY_IDE

template <int kBytes>
struct TestCombineShiftRightBytes {
  template <class T, class D>
  HWY_NOINLINE void operator()(T, D d) {
    const size_t kBlockSize = 16;
    static_assert(kBytes < kBlockSize, "Shift count is per block");
    const Repartition<uint8_t, D> d8;

@@ -462,13 +461,21 @@ struct TestCombineShiftRightBytes {
    const auto expected = BitCast(d, Load(d8, expected_bytes.get()));
    HWY_ASSERT_VEC_EQ(d, expected, CombineShiftRightBytes<kBytes>(d, hi, lo));
  }

    TestCombineShiftRightBytesR<kBytes - 1>()(t, d);
#else
    (void)t;
    (void)d;
#endif  // #if HWY_TARGET != HWY_SCALAR
  }
};

template <int kLanes>
struct TestCombineShiftRightLanes {
struct TestCombineShiftRightLanesR {
  template <class T, class D>
  HWY_NOINLINE void operator()(T, D d) {
  HWY_NOINLINE void operator()(T t, D d) {
    // Scalar does not define CombineShiftRightBytes (needed for *Lanes).
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
    const Repartition<uint8_t, D> d8;
    const size_t N8 = Lanes(d8);
    if (N8 < 16) return;

@@ -498,29 +505,33 @@ struct TestCombineShiftRightLanes {
    const auto expected = BitCast(d, Load(d8, expected_bytes.get()));
    HWY_ASSERT_VEC_EQ(d, expected, CombineShiftRightLanes<kLanes>(d, hi, lo));
  }

    TestCombineShiftRightLanesR<kLanes - 1>()(t, d);
#else
    (void)t;
    (void)d;
#endif  // #if HWY_TARGET != HWY_SCALAR
  }
};

#endif  // #if HWY_TARGET != HWY_SCALAR
template <>
struct TestCombineShiftRightBytesR<0> {
  template <class T, class D>
  void operator()(T /*unused*/, D /*unused*/) {}
};

template <>
struct TestCombineShiftRightLanesR<0> {
  template <class T, class D>
  void operator()(T /*unused*/, D /*unused*/) {}
};

struct TestCombineShiftRight {
  template <class T, class D>
  HWY_NOINLINE void operator()(T t, D d) {
    // Scalar does not define CombineShiftRightBytes.
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
    constexpr int kMaxBytes = HWY_MIN(16, int(MaxLanes(d) * sizeof(T)));
    constexpr int kMaxLanes = kMaxBytes / static_cast<int>(sizeof(T));
    TestCombineShiftRightBytes<kMaxBytes - 1>()(t, d);
    TestCombineShiftRightBytes<HWY_MAX(kMaxBytes / 2, 1)>()(t, d);
    TestCombineShiftRightBytes<1>()(t, d);

    TestCombineShiftRightLanes<kMaxLanes - 1>()(t, d);
    TestCombineShiftRightLanes<HWY_MAX(kMaxLanes / 2, -1)>()(t, d);
    TestCombineShiftRightLanes<1>()(t, d);
#else
    (void)t;
    (void)d;
#endif
    TestCombineShiftRightBytesR<kMaxBytes - 1>()(t, d);
    TestCombineShiftRightLanesR<kMaxBytes / int(sizeof(T)) - 1>()(t, d);
  }
};

@@ -542,13 +553,11 @@ class TestSpecialShuffle32 {
  }

 private:
  // HWY_INLINE works around a Clang SVE compiler bug where all but the first
  // 128 bits (the NEON register) of actual are zero.
  template <class D, class V>
  HWY_INLINE void VerifyLanes32(D d, VecArg<V> actual, const size_t i3,
                                const size_t i2, const size_t i1,
                                const size_t i0, const char* filename,
                                const int line) {
  HWY_NOINLINE void VerifyLanes32(D d, VecArg<V> actual, const size_t i3,
                                  const size_t i2, const size_t i1,
                                  const size_t i0, const char* filename,
                                  const int line) {
    using T = TFromD<D>;
    constexpr size_t kBlockN = 16 / sizeof(T);
    const size_t N = Lanes(d);

@@ -573,12 +582,10 @@ class TestSpecialShuffle64 {
  }

 private:
  // HWY_INLINE works around a Clang SVE compiler bug where all but the first
  // 128 bits (the NEON register) of actual are zero.
  template <class D, class V>
  HWY_INLINE void VerifyLanes64(D d, VecArg<V> actual, const size_t i1,
                                const size_t i0, const char* filename,
                                const int line) {
  HWY_NOINLINE void VerifyLanes64(D d, VecArg<V> actual, const size_t i1,
                                  const size_t i0, const char* filename,
                                  const int line) {
    using T = TFromD<D>;
    constexpr size_t kBlockN = 16 / sizeof(T);
    const size_t N = Lanes(d);

@@ -593,19 +600,19 @@ class TestSpecialShuffle64 {
};

HWY_NOINLINE void TestAllSpecialShuffles() {
  const ForGEVectors<128, TestSpecialShuffle32> test32;
  const ForGE128Vectors<TestSpecialShuffle32> test32;
  test32(uint32_t());
  test32(int32_t());
  test32(float());

#if HWY_HAVE_INTEGER64
  const ForGEVectors<128, TestSpecialShuffle64> test64;
#if HWY_CAP_INTEGER64
  const ForGE128Vectors<TestSpecialShuffle64> test64;
  test64(uint64_t());
  test64(int64_t());
#endif

#if HWY_HAVE_FLOAT64
  const ForGEVectors<128, TestSpecialShuffle64> test_d;
#if HWY_CAP_FLOAT64
  const ForGE128Vectors<TestSpecialShuffle64> test_d;
  test_d(double());
#endif
}

@@ -22,6 +22,9 @@
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"

// Not yet implemented
#if HWY_TARGET != HWY_RVV

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

@@ -82,8 +85,8 @@ struct TestLowerQuarter {
};

HWY_NOINLINE void TestAllLowerHalf() {
  ForAllTypes(ForHalfVectors<TestLowerHalf>());
  ForAllTypes(ForHalfVectors<TestLowerQuarter, 2>());
  ForAllTypes(ForDemoteVectors<TestLowerHalf>());
  ForAllTypes(ForDemoteVectors<TestLowerQuarter, 4>());
}

struct TestUpperHalf {

@@ -92,14 +95,21 @@ struct TestUpperHalf {
    // Scalar does not define UpperHalf.
#if HWY_TARGET != HWY_SCALAR
    const Half<D> d2;
    const size_t N2 = Lanes(d2);
    HWY_ASSERT(N2 * 2 == Lanes(d));
    auto expected = AllocateAligned<T>(N2);

    const auto v = Iota(d, 1);
    const size_t N = Lanes(d);
    auto lanes = AllocateAligned<T>(N);
    std::fill(lanes.get(), lanes.get() + N, T(0));

    Store(UpperHalf(d2, v), d2, lanes.get());
    size_t i = 0;
    for (; i < N2; ++i) {
      expected[i] = static_cast<T>(N2 + 1 + i);
    for (; i < Lanes(d2); ++i) {
      HWY_ASSERT_EQ(T(Lanes(d2) + 1 + i), lanes[i]);
    }
    // Other half remains unchanged
    for (; i < N; ++i) {
      HWY_ASSERT_EQ(T(0), lanes[i]);
    }
    HWY_ASSERT_VEC_EQ(d2, expected.get(), UpperHalf(d2, Iota(d, 1)));
#else
    (void)d;
#endif

@@ -107,7 +117,7 @@ struct TestUpperHalf {
};

HWY_NOINLINE void TestAllUpperHalf() {
  ForAllTypes(ForHalfVectors<TestUpperHalf>());
  ForAllTypes(ForShrinkableVectors<TestUpperHalf>());
}

struct TestZeroExtendVector {

@@ -116,23 +126,23 @@ struct TestZeroExtendVector {
    const Twice<D> d2;

    const auto v = Iota(d, 1);
    const size_t N = Lanes(d);
    const size_t N2 = Lanes(d2);
    // If equal, then N was already MaxLanes(d) and it's not clear what
    // Combine or ZeroExtendVector should return.
    if (N2 == N) return;
    HWY_ASSERT(N2 == 2 * N);
    auto lanes = AllocateAligned<T>(N2);
    Store(v, d, &lanes[0]);
    Store(v, d, &lanes[N]);
    Store(v, d, &lanes[N2 / 2]);

    const auto ext = ZeroExtendVector(d2, v);
    Store(ext, d2, lanes.get());

    size_t i = 0;
    // Lower half is unchanged
    HWY_ASSERT_VEC_EQ(d, v, Load(d, &lanes[0]));
    for (; i < N2 / 2; ++i) {
      HWY_ASSERT_EQ(T(1 + i), lanes[i]);
    }
    // Upper half is zero
    HWY_ASSERT_VEC_EQ(d, Zero(d), Load(d, &lanes[N]));
    for (; i < N2; ++i) {
      HWY_ASSERT_EQ(T(0), lanes[i]);
    }
  }
};

@@ -148,7 +158,7 @@ struct TestCombine {
    auto lanes = AllocateAligned<T>(N2);

    const auto lo = Iota(d, 1);
    const auto hi = Iota(d, static_cast<T>(N2 / 2 + 1));
    const auto hi = Iota(d, N2 / 2 + 1);
    const auto combined = Combine(d2, hi, lo);
    Store(combined, d2, lanes.get());

@@ -222,7 +232,7 @@ struct TestConcatOddEven {
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
#if HWY_TARGET != HWY_RVV && HWY_TARGET != HWY_SCALAR
    const size_t N = Lanes(d);
    const auto hi = Iota(d, static_cast<T>(N));
    const auto hi = Iota(d, N);
    const auto lo = Iota(d, 0);
    const auto even = Add(Iota(d, 0), Iota(d, 0));
    const auto odd = Add(even, Set(d, 1));

@@ -262,3 +272,7 @@ int main(int argc, char **argv) {
}

#endif  // HWY_ONCE

#else
int main(int, char**) { return 0; }
#endif  // HWY_TARGET != HWY_RVV

@@ -218,63 +218,6 @@ HWY_NOINLINE void TestAllWeakFloat() {
  ForFloatTypes(ForPartialVectors<TestWeakFloat>());
}

class TestLt128 {
  template <class D>
  static HWY_NOINLINE Vec<D> Make128(D d, uint64_t hi, uint64_t lo) {
    alignas(16) uint64_t in[2];
    in[0] = lo;
    in[1] = hi;
    return LoadDup128(d, in);
  }

 public:
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    using V = Vec<D>;
    const V v00 = Zero(d);
    const V v01 = Make128(d, 0, 1);
    const V v10 = Make128(d, 1, 0);
    const V v11 = Add(v01, v10);

    const auto mask_false = MaskFalse(d);
    const auto mask_true = MaskTrue(d);

    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v00, v00));
    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v01, v01));
    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v10, v10));

    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v00, v01));
    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v01, v10));
    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v01, v11));

    // Reversed order
    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v01, v00));
    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v10, v01));
    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v11, v01));

    // Also check 128-bit blocks are independent
    const V iota = Iota(d, 1);
    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, iota, Add(iota, v01)));
    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, iota, Add(iota, v10)));
    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, Add(iota, v01), iota));
    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, Add(iota, v10), iota));

    // Max value
    const V vm = Make128(d, LimitsMax<T>(), LimitsMax<T>());
    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, vm));
    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, v00));
    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, v01));
    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, v10));
    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, v11));
    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v00, vm));
    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v01, vm));
    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v10, vm));
    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v11, vm));
  }
};

HWY_NOINLINE void TestAllLt128() { ForGEVectors<128, TestLt128>()(uint64_t()); }

// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
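
For reference, Lt128 treats each adjacent pair of u64 lanes as one 128-bit unsigned value (lane 0 the low half, lane 1 the high half, matching how Make128 above stores them) and broadcasts the comparison result to both lanes of the block. A minimal scalar sketch of that ordering, under the assumption just stated; Lt128Scalar is a hypothetical helper, not part of this change:

#include <cstdint>

// Scalar model of the per-block comparison exercised by TestLt128:
// compare (hi, lo) pairs as single unsigned 128-bit integers.
inline bool Lt128Scalar(uint64_t a_hi, uint64_t a_lo,
                        uint64_t b_hi, uint64_t b_lo) {
  // Lexicographic over (hi, lo): high words decide unless they are equal.
  return (a_hi < b_hi) || (a_hi == b_hi && a_lo < b_lo);
}

For example, Lt128Scalar(0, 1, 1, 0) is true and the reversed order is false, matching the v01/v10 assertions above.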

@@ -289,7 +232,6 @@ HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictUnsigned);
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictInt);
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictFloat);
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllWeakFloat);
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllLt128);
}  // namespace hwy

// Ought not to be necessary, but without this, no tests run on RVV.

@@ -57,17 +57,17 @@ struct TestBitCastFrom {
    TestBitCast<uint8_t>()(t, d);
    TestBitCast<uint16_t>()(t, d);
    TestBitCast<uint32_t>()(t, d);
#if HWY_HAVE_INTEGER64
#if HWY_CAP_INTEGER64
    TestBitCast<uint64_t>()(t, d);
#endif
    TestBitCast<int8_t>()(t, d);
    TestBitCast<int16_t>()(t, d);
    TestBitCast<int32_t>()(t, d);
#if HWY_HAVE_INTEGER64
#if HWY_CAP_INTEGER64
    TestBitCast<int64_t>()(t, d);
#endif
    TestBitCast<float>()(t, d);
#if HWY_HAVE_FLOAT64
#if HWY_CAP_FLOAT64
    TestBitCast<double>()(t, d);
#endif
  }

@@ -103,39 +103,39 @@ HWY_NOINLINE void TestAllBitCast() {
  to_i32(int32_t());
  to_i32(float());

#if HWY_HAVE_INTEGER64
#if HWY_CAP_INTEGER64
  const ForPartialVectors<TestBitCast<uint64_t>> to_u64;
  to_u64(uint64_t());
  to_u64(int64_t());
#if HWY_HAVE_FLOAT64
#if HWY_CAP_FLOAT64
  to_u64(double());
#endif

  const ForPartialVectors<TestBitCast<int64_t>> to_i64;
  to_i64(uint64_t());
  to_i64(int64_t());
#if HWY_HAVE_FLOAT64
#if HWY_CAP_FLOAT64
  to_i64(double());
#endif
#endif  // HWY_HAVE_INTEGER64
#endif  // HWY_CAP_INTEGER64

  const ForPartialVectors<TestBitCast<float>> to_float;
  to_float(uint32_t());
  to_float(int32_t());
  to_float(float());

#if HWY_HAVE_FLOAT64
#if HWY_CAP_FLOAT64
  const ForPartialVectors<TestBitCast<double>> to_double;
  to_double(double());
#if HWY_HAVE_INTEGER64
#if HWY_CAP_INTEGER64
  to_double(uint64_t());
  to_double(int64_t());
#endif  // HWY_HAVE_INTEGER64
#endif  // HWY_HAVE_FLOAT64
#endif  // HWY_CAP_INTEGER64
#endif  // HWY_CAP_FLOAT64

#if HWY_TARGET != HWY_SCALAR
  // For non-scalar vectors, we can cast all types to all.
  ForAllTypes(ForGEVectors<64, TestBitCastFrom>());
  ForAllTypes(ForGE64Vectors<TestBitCastFrom>());
#endif
}

@@ -165,39 +165,39 @@ struct TestPromoteTo {
};

HWY_NOINLINE void TestAllPromoteTo() {
  const ForPromoteVectors<TestPromoteTo<uint16_t>, 1> to_u16div2;
  const ForPromoteVectors<TestPromoteTo<uint16_t>, 2> to_u16div2;
  to_u16div2(uint8_t());

  const ForPromoteVectors<TestPromoteTo<uint32_t>, 2> to_u32div4;
  const ForPromoteVectors<TestPromoteTo<uint32_t>, 4> to_u32div4;
  to_u32div4(uint8_t());

  const ForPromoteVectors<TestPromoteTo<uint32_t>, 1> to_u32div2;
  const ForPromoteVectors<TestPromoteTo<uint32_t>, 2> to_u32div2;
  to_u32div2(uint16_t());

  const ForPromoteVectors<TestPromoteTo<int16_t>, 1> to_i16div2;
  const ForPromoteVectors<TestPromoteTo<int16_t>, 2> to_i16div2;
  to_i16div2(uint8_t());
  to_i16div2(int8_t());

  const ForPromoteVectors<TestPromoteTo<int32_t>, 1> to_i32div2;
  const ForPromoteVectors<TestPromoteTo<int32_t>, 2> to_i32div2;
  to_i32div2(uint16_t());
  to_i32div2(int16_t());

  const ForPromoteVectors<TestPromoteTo<int32_t>, 2> to_i32div4;
  const ForPromoteVectors<TestPromoteTo<int32_t>, 4> to_i32div4;
  to_i32div4(uint8_t());
  to_i32div4(int8_t());

  // Must test f16/bf16 separately because we can only load/store/convert them.

#if HWY_HAVE_INTEGER64
  const ForPromoteVectors<TestPromoteTo<uint64_t>, 1> to_u64div2;
#if HWY_CAP_INTEGER64
  const ForPromoteVectors<TestPromoteTo<uint64_t>, 2> to_u64div2;
  to_u64div2(uint32_t());

  const ForPromoteVectors<TestPromoteTo<int64_t>, 1> to_i64div2;
  const ForPromoteVectors<TestPromoteTo<int64_t>, 2> to_i64div2;
  to_i64div2(int32_t());
#endif

#if HWY_HAVE_FLOAT64
  const ForPromoteVectors<TestPromoteTo<double>, 1> to_f64div2;
#if HWY_CAP_FLOAT64
  const ForPromoteVectors<TestPromoteTo<double>, 2> to_f64div2;
  to_f64div2(int32_t());
  to_f64div2(float());
#endif

@@ -213,6 +213,111 @@ bool IsFinite(T /*unused*/) {
  return true;
}

template <typename ToT>
struct TestDemoteTo {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D from_d) {
    static_assert(!IsFloat<ToT>(), "Use TestDemoteToFloat for float output");
    static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider");
    const Rebind<ToT, D> to_d;

    const size_t N = Lanes(from_d);
    auto from = AllocateAligned<T>(N);
    auto expected = AllocateAligned<ToT>(N);

    // Narrower range in the wider type, for clamping before we cast
    const T min = LimitsMin<ToT>();
    const T max = LimitsMax<ToT>();

    const auto value_ok = [&](T& value) {
      if (!IsFinite(value)) return false;
#if HWY_EMULATE_SVE
      // farm_sve just casts, which is undefined if the value is out of range.
      value = HWY_MIN(HWY_MAX(min, value), max);
#endif
      return true;
    };

    RandomState rng;
    for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
      for (size_t i = 0; i < N; ++i) {
        do {
          const uint64_t bits = rng();
          memcpy(&from[i], &bits, sizeof(T));
        } while (!value_ok(from[i]));
        expected[i] = static_cast<ToT>(HWY_MIN(HWY_MAX(min, from[i]), max));
      }

      HWY_ASSERT_VEC_EQ(to_d, expected.get(),
                        DemoteTo(to_d, Load(from_d, from.get())));
    }
  }
};
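
The expected value above is computed by clamping in the wider type before the cast. A standalone scalar sketch of that saturating narrowing, with a hypothetical name (not part of this change):

#include <algorithm>
#include <cstdint>
#include <limits>

// Saturating demotion from a wide integer type to a narrow one, mirroring
// expected[i] = static_cast<ToT>(HWY_MIN(HWY_MAX(min, from[i]), max)) above.
template <typename ToT, typename FromT>
ToT SaturatingDemote(FromT v) {
  const FromT lo = static_cast<FromT>(std::numeric_limits<ToT>::min());
  const FromT hi = static_cast<FromT>(std::numeric_limits<ToT>::max());
  return static_cast<ToT>(std::min(std::max(v, lo), hi));
}

For example, SaturatingDemote<int8_t>(int16_t{1000}) yields 127 rather than wrapping.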

HWY_NOINLINE void TestAllDemoteToInt() {
  ForDemoteVectors<TestDemoteTo<uint8_t>>()(int16_t());
  ForDemoteVectors<TestDemoteTo<uint8_t>, 4>()(int32_t());

  ForDemoteVectors<TestDemoteTo<int8_t>>()(int16_t());
  ForDemoteVectors<TestDemoteTo<int8_t>, 4>()(int32_t());

  const ForDemoteVectors<TestDemoteTo<uint16_t>> to_u16;
  to_u16(int32_t());

  const ForDemoteVectors<TestDemoteTo<int16_t>> to_i16;
  to_i16(int32_t());
}

HWY_NOINLINE void TestAllDemoteToMixed() {
#if HWY_CAP_FLOAT64
  const ForDemoteVectors<TestDemoteTo<int32_t>> to_i32;
  to_i32(double());
#endif
}

template <typename ToT>
struct TestDemoteToFloat {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D from_d) {
    // For floats, we clamp differently and cannot call LimitsMin.
    static_assert(IsFloat<ToT>(), "Use TestDemoteTo for integer output");
    static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider");
    const Rebind<ToT, D> to_d;

    const size_t N = Lanes(from_d);
    auto from = AllocateAligned<T>(N);
    auto expected = AllocateAligned<ToT>(N);

    RandomState rng;
    for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
      for (size_t i = 0; i < N; ++i) {
        do {
          const uint64_t bits = rng();
          memcpy(&from[i], &bits, sizeof(T));
        } while (!IsFinite(from[i]));
        const T magn = std::abs(from[i]);
        const T max_abs = HighestValue<ToT>();
        // NOTE: std:: version from C++11 cmath is not defined in RVV GCC, see
        // https://lists.freebsd.org/pipermail/freebsd-current/2014-January/048130.html
        const T clipped = copysign(HWY_MIN(magn, max_abs), from[i]);
        expected[i] = static_cast<ToT>(clipped);
      }

      HWY_ASSERT_VEC_EQ(to_d, expected.get(),
                        DemoteTo(to_d, Load(from_d, from.get())));
    }
  }
};

HWY_NOINLINE void TestAllDemoteToFloat() {
  // Must test f16 separately because we can only load/store/convert them.

#if HWY_CAP_FLOAT64
  const ForDemoteVectors<TestDemoteToFloat<float>, 2> to_float;
  to_float(double());
#endif
}

template <class D>
AlignedFreeUniquePtr<float[]> F16TestCases(D d, size_t& padded) {
  const float test_cases[] = {

@@ -247,7 +352,7 @@ AlignedFreeUniquePtr<float[]> F16TestCases(D d, size_t& padded) {
struct TestF16 {
  template <typename TF32, class DF32>
  HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) {
#if HWY_HAVE_FLOAT16
#if HWY_CAP_FLOAT16
    size_t padded;
    auto in = F16TestCases(d32, padded);
    using TF16 = float16_t;

@@ -301,7 +406,7 @@ AlignedFreeUniquePtr<float[]> BF16TestCases(D d, size_t& padded) {
struct TestBF16 {
  template <typename TF32, class DF32>
  HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) {
#if !defined(HWY_EMULATE_SVE)
#if HWY_TARGET != HWY_RVV
    size_t padded;
    auto in = BF16TestCases(d32, padded);
    using TBF16 = bfloat16_t;

@@ -312,7 +417,6 @@ struct TestBF16 {
#endif
    const Half<decltype(dbf16)> dbf16_half;
    const size_t N = Lanes(d32);
    HWY_ASSERT(Lanes(dbf16_half) <= N);
    auto temp16 = AllocateAligned<TBF16>(N);

    for (size_t i = 0; i < padded; i += N) {

@@ -330,6 +434,124 @@ struct TestBF16 {

HWY_NOINLINE void TestAllBF16() { ForShrinkableVectors<TestBF16>()(float()); }
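
The bf16 test cases here (and in the Reorder tests below) are chosen to be exactly representable in bfloat16, which is essentially the upper 16 bits of an IEEE binary32. A minimal scalar sketch of that correspondence; these hypothetical helpers truncate rather than round to nearest even, and are not Highway's implementation:

#include <cstdint>
#include <cstring>

// bfloat16 <-> float via bit truncation/extension of the binary32 encoding.
uint16_t F32ToBF16(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  return static_cast<uint16_t>(bits >> 16);  // keep sign, exponent, top 7 mantissa bits
}

float BF16ToF32(uint16_t h) {
  const uint32_t bits = static_cast<uint32_t>(h) << 16;  // zero the low mantissa bits
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

Values such as 2.015625f (2 + 2^-6) fit in the 7-bit bf16 mantissa, so their round trip is exact and the vector/scalar comparison below can demand equality.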

template <class D>
AlignedFreeUniquePtr<float[]> ReorderBF16TestCases(D d, size_t& padded) {
  const float test_cases[] = {
      // Same as BF16TestCases:
      // +/- 1
      1.0f,
      -1.0f,
      // +/- 0
      0.0f,
      -0.0f,
      // near 0
      0.25f,
      -0.25f,
      // +/- integer
      4.0f,
      -32.0f,
      // positive +/- delta
      2.015625f,
      3.984375f,
      // negative +/- delta
      -2.015625f,
      -3.984375f,

      // No huge values - would interfere with sum. But add more to fill 2 * N:
      -2.0f,
      -10.0f,
      0.03125f,
      1.03125f,
      1.5f,
      2.0f,
      4.0f,
      5.0f,
      6.0f,
      8.0f,
      10.0f,
      256.0f,
      448.0f,
      2080.0f,
  };
  const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
  const size_t N = Lanes(d);
  padded = RoundUpTo(kNumTestCases, 2 * N);  // allow loading pairs of vectors
  auto in = AllocateAligned<float>(padded);
  auto expected = AllocateAligned<float>(padded);
  std::copy(test_cases, test_cases + kNumTestCases, in.get());
  std::fill(in.get() + kNumTestCases, in.get() + padded, 0.0f);
  return in;
}

class TestReorderDemote2To {
  // In-place N^2 selection sort to avoid dependencies
  void Sort(float* p, size_t count) {
    for (size_t i = 0; i < count - 1; ++i) {
      // Find min_element
      size_t idx_min = i;
      for (size_t j = i + 1; j < count; j++) {
        if (p[j] < p[idx_min]) {
          idx_min = j;
        }
      }

      // Swap with current
      const float tmp = p[i];
      p[i] = p[idx_min];
      p[idx_min] = tmp;
    }
  }

 public:
  template <typename TF32, class DF32>
  HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) {
#if HWY_TARGET != HWY_SCALAR
    size_t padded;
    auto in = ReorderBF16TestCases(d32, padded);

    using TBF16 = bfloat16_t;
    const Repartition<TBF16, DF32> dbf16;
    const Half<decltype(dbf16)> dbf16_half;
    const size_t N = Lanes(d32);
    auto temp16 = AllocateAligned<TBF16>(2 * N);
    auto expected = AllocateAligned<float>(2 * N);
    auto actual = AllocateAligned<float>(2 * N);

    for (size_t i = 0; i < padded; i += 2 * N) {
      const auto f0 = Load(d32, &in[i + 0]);
      const auto f1 = Load(d32, &in[i + N]);
      const auto v16 = ReorderDemote2To(dbf16, f0, f1);
      Store(v16, dbf16, temp16.get());
      const auto promoted0 = PromoteTo(d32, Load(dbf16_half, temp16.get() + 0));
      const auto promoted1 = PromoteTo(d32, Load(dbf16_half, temp16.get() + N));

      // Smoke test: sum should be same (with tolerance for non-associativity)
      const auto sum_expected =
          GetLane(SumOfLanes(d32, Add(promoted0, promoted1)));
      const auto sum_actual = GetLane(SumOfLanes(d32, Add(f0, f1)));
      HWY_ASSERT(sum_expected - 1E-4 <= sum_actual &&
                 sum_expected <= sum_actual + 1E-4);

      // Ensure values are the same after sorting to undo the Reorder
      Store(f0, d32, expected.get() + 0);
      Store(f1, d32, expected.get() + N);
      Store(promoted0, d32, actual.get() + 0);
      Store(promoted1, d32, actual.get() + N);
      Sort(expected.get(), 2 * N);
      Sort(actual.get(), 2 * N);
      HWY_ASSERT_VEC_EQ(d32, expected.get() + 0, Load(d32, actual.get() + 0));
      HWY_ASSERT_VEC_EQ(d32, expected.get() + N, Load(d32, actual.get() + N));
    }
#else   // HWY_SCALAR
    (void)d32;
#endif
  }
};

HWY_NOINLINE void TestAllReorderDemote2To() {
  ForShrinkableVectors<TestReorderDemote2To>()(float());
}

struct TestConvertU8 {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, const D du32) {

@@ -342,7 +564,7 @@ struct TestConvertU8 {
};

HWY_NOINLINE void TestAllConvertU8() {
  ForDemoteVectors<TestConvertU8, 2>()(uint32_t());
  ForDemoteVectors<TestConvertU8, 4>()(uint32_t());
}

// Separate function to attempt to work around a compiler bug on ARM: when this

@@ -352,23 +574,19 @@ struct TestIntFromFloatHuge {
  HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
    // Still does not work, although ARMv7 manual says that float->int
    // saturates, i.e. chooses the nearest representable value. Also causes
    // out-of-memory for MSVC.
#if HWY_TARGET != HWY_NEON && !HWY_COMPILER_MSVC
    // out-of-memory for MSVC, and unsafe cast in farm_sve.
#if HWY_TARGET != HWY_NEON && !HWY_COMPILER_MSVC && !defined(HWY_EMULATE_SVE)
    using TI = MakeSigned<TF>;
    const Rebind<TI, DF> di;

    // Workaround for incorrect 32-bit GCC codegen for SSSE3 - Print-ing
    // the expected lvalue also seems to prevent the issue.
    const size_t N = Lanes(df);
    auto expected = AllocateAligned<TI>(N);
    // Huge positive (lvalue works around GCC bug, tested with 10.2.1, where
    // the expected i32 value is otherwise 0x80..00).
    const auto expected_max = Set(di, LimitsMax<TI>());
    HWY_ASSERT_VEC_EQ(di, expected_max, ConvertTo(di, Set(df, TF(1E20))));

    // Huge positive
    Store(Set(di, LimitsMax<TI>()), di, expected.get());
    HWY_ASSERT_VEC_EQ(di, expected.get(), ConvertTo(di, Set(df, TF(1E20))));

    // Huge negative
    Store(Set(di, LimitsMin<TI>()), di, expected.get());
    HWY_ASSERT_VEC_EQ(di, expected.get(), ConvertTo(di, Set(df, TF(-1E20))));
    // Huge negative (also lvalue for safety, but GCC bug was not triggered)
    const auto expected_min = Set(di, LimitsMin<TI>());
    HWY_ASSERT_VEC_EQ(di, expected_min, ConvertTo(di, Set(df, TF(-1E20))));
#else
    (void)df;
#endif

@@ -416,6 +634,10 @@ class TestIntFromFloat {
          const uint64_t bits = rng();
          memcpy(&from[i], &bits, sizeof(TF));
        } while (!std::isfinite(from[i]));
#if defined(HWY_EMULATE_SVE)
        // farm_sve just casts, which is undefined if the value is out of range.
        from[i] = HWY_MIN(HWY_MAX(min / 2, from[i]), max / 2);
#endif
        if (from[i] >= max) {
          expected[i] = LimitsMax<TI>();
        } else if (from[i] <= min) {

@@ -503,21 +725,30 @@ struct TestI32F64 {
    const size_t N = Lanes(df);

    // Integer positive
    HWY_ASSERT_VEC_EQ(di, Iota(di, TI(4)), DemoteTo(di, Iota(df, TF(4.0))));
    HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4.0)), PromoteTo(df, Iota(di, TI(4))));

    // Integer negative
    HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), DemoteTo(di, Iota(df, -TF(N))));
    HWY_ASSERT_VEC_EQ(df, Iota(df, -TF(N)), PromoteTo(df, Iota(di, -TI(N))));

    // Above positive
    HWY_ASSERT_VEC_EQ(di, Iota(di, TI(2)), DemoteTo(di, Iota(df, TF(2.001))));
    HWY_ASSERT_VEC_EQ(df, Iota(df, TF(2.0)), PromoteTo(df, Iota(di, TI(2))));

    // Below positive
    HWY_ASSERT_VEC_EQ(di, Iota(di, TI(3)), DemoteTo(di, Iota(df, TF(3.9999))));
    HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4.0)), PromoteTo(df, Iota(di, TI(4))));

    const TF eps = static_cast<TF>(0.0001);
    // Above negative
    HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)),
                      DemoteTo(di, Iota(df, -TF(N + 1) + eps)));
    HWY_ASSERT_VEC_EQ(df, Iota(df, TF(-4.0)), PromoteTo(df, Iota(di, TI(-4))));

    // Below negative
    HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N + 1)),
                      DemoteTo(di, Iota(df, -TF(N + 1) - eps)));
    HWY_ASSERT_VEC_EQ(df, Iota(df, TF(-2.0)), PromoteTo(df, Iota(di, TI(-2))));

    // Max positive int

@@ -527,11 +758,22 @@ struct TestI32F64 {
    // Min negative int
    HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMin<TI>())),
                      PromoteTo(df, Set(di, LimitsMin<TI>())));

    // farm_sve just casts, which is undefined if the value is out of range.
#if !defined(HWY_EMULATE_SVE)
    // Huge positive float
    HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMax<TI>()),
                      DemoteTo(di, Set(df, TF(1E12))));

    // Huge negative float
    HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMin<TI>()),
                      DemoteTo(di, Set(df, TF(-1E12))));
#endif
  }
};

HWY_NOINLINE void TestAllI32F64() {
#if HWY_HAVE_FLOAT64
#if HWY_CAP_FLOAT64
  ForDemoteVectors<TestI32F64>()(double());
#endif
}

@@ -548,8 +790,12 @@ namespace hwy {
HWY_BEFORE_TEST(HwyConvertTest);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllBitCast);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllPromoteTo);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllDemoteToInt);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllDemoteToMixed);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllDemoteToFloat);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllF16);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllBF16);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllReorderDemote2To);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllConvertU8);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllIntFromFloat);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllFloatFromInt);

@@ -74,7 +74,7 @@ class TestAES {
    }

    for (size_t i = 0; i < 256; i += N) {
      const auto in = Iota(d, static_cast<T>(i));
      const auto in = Iota(d, i);
      HWY_ASSERT_VEC_EQ(d, expected.get() + i, detail::SubBytes(in));
    }
  }

@@ -89,17 +89,11 @@ class TestAES {
        0x42, 0xCA, 0x6B, 0x99, 0x7A, 0x5C, 0x58, 0x16};
    const auto test = LoadDup128(d, test_lanes);

    // = ShiftRow result
    alignas(16) constexpr uint8_t expected_sr_lanes[16] = {
        0x09, 0x28, 0x7F, 0x47, 0x6F, 0x74, 0x6A, 0xBF,
        0x2C, 0x4A, 0x62, 0x04, 0xDA, 0x08, 0xE3, 0xEE};
    const auto expected_sr = LoadDup128(d, expected_sr_lanes);

    // = MixColumn result
    alignas(16) constexpr uint8_t expected_mc_lanes[16] = {
    alignas(16) constexpr uint8_t expected0_lanes[16] = {
        0x52, 0x9F, 0x16, 0xC2, 0x97, 0x86, 0x15, 0xCA,
        0xE0, 0x1A, 0xAE, 0x54, 0xBA, 0x1A, 0x26, 0x59};
    const auto expected_mc = LoadDup128(d, expected_mc_lanes);
    const auto expected0 = LoadDup128(d, expected0_lanes);

    // = KeyAddition result
    alignas(16) constexpr uint8_t expected_lanes[16] = {

@@ -109,20 +103,17 @@ class TestAES {

    alignas(16) uint8_t key_lanes[16];
    for (size_t i = 0; i < 16; ++i) {
      key_lanes[i] = expected_mc_lanes[i] ^ expected_lanes[i];
      key_lanes[i] = expected0_lanes[i] ^ expected_lanes[i];
    }
    const auto round_key = LoadDup128(d, key_lanes);

    HWY_ASSERT_VEC_EQ(d, expected_mc, AESRound(test, Zero(d)));
    HWY_ASSERT_VEC_EQ(d, expected0, AESRound(test, Zero(d)));
    HWY_ASSERT_VEC_EQ(d, expected, AESRound(test, round_key));
    HWY_ASSERT_VEC_EQ(d, expected_sr, AESLastRound(test, Zero(d)));
    HWY_ASSERT_VEC_EQ(d, Xor(expected_sr, round_key),
                      AESLastRound(test, round_key));

    TestSBox(t, d);
  }
};
HWY_NOINLINE void TestAllAES() { ForGEVectors<128, TestAES>()(uint8_t()); }
HWY_NOINLINE void TestAllAES() { ForGE128Vectors<TestAES>()(uint8_t()); }

#else
HWY_NOINLINE void TestAllAES() {}

@@ -132,7 +123,7 @@ struct TestCLMul {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    // needs 64 bit lanes and 128-bit result
#if HWY_TARGET != HWY_SCALAR && HWY_HAVE_INTEGER64
#if HWY_TARGET != HWY_SCALAR && HWY_CAP_INTEGER64
    const size_t N = Lanes(d);
    if (N == 1) return;

@@ -534,7 +525,7 @@ struct TestCLMul {
  }
};

HWY_NOINLINE void TestAllCLMul() { ForGEVectors<128, TestCLMul>()(uint64_t()); }
HWY_NOINLINE void TestAllCLMul() { ForGE128Vectors<TestCLMul>()(uint64_t()); }

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
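
For reference, CLMul is carry-less (polynomial) multiplication over GF(2): the product is the XOR of shifted copies of one operand, one per set bit of the other, with no carry propagation. A scalar sketch of the low 64 bits of the 128-bit product; CLMulLower64 is a hypothetical helper, not the test's own reference:

#include <cstdint>

// Carry-less multiply: XOR together a << i for every set bit i of b.
// Bits shifted past position 63 belong to the upper half of the product,
// so this returns only the low 64 bits.
uint64_t CLMulLower64(uint64_t a, uint64_t b) {
  uint64_t result = 0;
  for (int i = 0; i < 64; ++i) {
    if ((b >> i) & 1) result ^= a << i;
  }
  return result;
}
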
@@ -1,333 +0,0 @@
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/demote_test.cc"
#include "hwy/foreach_target.h"

#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"

// Causes build timeout.
#if !HWY_IS_MSAN

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

template <typename T, HWY_IF_FLOAT(T)>
bool IsFinite(T t) {
  return std::isfinite(t);
}
// Wrapper avoids calling std::isfinite for integer types (ambiguous).
template <typename T, HWY_IF_NOT_FLOAT(T)>
bool IsFinite(T /*unused*/) {
  return true;
}

template <typename ToT>
struct TestDemoteTo {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D from_d) {
    static_assert(!IsFloat<ToT>(), "Use TestDemoteToFloat for float output");
    static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider");
    const Rebind<ToT, D> to_d;

    const size_t N = Lanes(from_d);
    auto from = AllocateAligned<T>(N);
    auto expected = AllocateAligned<ToT>(N);

    // Narrower range in the wider type, for clamping before we cast
    const T min = LimitsMin<ToT>();
    const T max = LimitsMax<ToT>();

    const auto value_ok = [&](T& value) {
      if (!IsFinite(value)) return false;
      return true;
    };

    RandomState rng;
    for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
      for (size_t i = 0; i < N; ++i) {
        do {
          const uint64_t bits = rng();
          memcpy(&from[i], &bits, sizeof(T));
        } while (!value_ok(from[i]));
        expected[i] = static_cast<ToT>(HWY_MIN(HWY_MAX(min, from[i]), max));
      }

      const auto in = Load(from_d, from.get());
      HWY_ASSERT_VEC_EQ(to_d, expected.get(), DemoteTo(to_d, in));
    }
  }
};

HWY_NOINLINE void TestAllDemoteToInt() {
  ForDemoteVectors<TestDemoteTo<uint8_t>>()(int16_t());
  ForDemoteVectors<TestDemoteTo<uint8_t>, 2>()(int32_t());

  ForDemoteVectors<TestDemoteTo<int8_t>>()(int16_t());
  ForDemoteVectors<TestDemoteTo<int8_t>, 2>()(int32_t());

  const ForDemoteVectors<TestDemoteTo<uint16_t>> to_u16;
  to_u16(int32_t());

  const ForDemoteVectors<TestDemoteTo<int16_t>> to_i16;
  to_i16(int32_t());
}

HWY_NOINLINE void TestAllDemoteToMixed() {
#if HWY_HAVE_FLOAT64
  const ForDemoteVectors<TestDemoteTo<int32_t>> to_i32;
  to_i32(double());
#endif
}

template <typename ToT>
struct TestDemoteToFloat {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D from_d) {
    // For floats, we clamp differently and cannot call LimitsMin.
    static_assert(IsFloat<ToT>(), "Use TestDemoteTo for integer output");
    static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider");
    const Rebind<ToT, D> to_d;

    const size_t N = Lanes(from_d);
    auto from = AllocateAligned<T>(N);
    auto expected = AllocateAligned<ToT>(N);

    RandomState rng;
    for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
      for (size_t i = 0; i < N; ++i) {
        do {
          const uint64_t bits = rng();
          memcpy(&from[i], &bits, sizeof(T));
        } while (!IsFinite(from[i]));
        const T magn = std::abs(from[i]);
        const T max_abs = HighestValue<ToT>();
        // NOTE: std:: version from C++11 cmath is not defined in RVV GCC, see
        // https://lists.freebsd.org/pipermail/freebsd-current/2014-January/048130.html
        const T clipped = copysign(HWY_MIN(magn, max_abs), from[i]);
        expected[i] = static_cast<ToT>(clipped);
      }

      HWY_ASSERT_VEC_EQ(to_d, expected.get(),
                        DemoteTo(to_d, Load(from_d, from.get())));
    }
  }
};

HWY_NOINLINE void TestAllDemoteToFloat() {
  // Must test f16 separately because we can only load/store/convert them.

#if HWY_HAVE_FLOAT64
  const ForDemoteVectors<TestDemoteToFloat<float>, 1> to_float;
  to_float(double());
#endif
}

template <class D>
AlignedFreeUniquePtr<float[]> ReorderBF16TestCases(D d, size_t& padded) {
  const float test_cases[] = {
      // Same as BF16TestCases:
      // +/- 1
      1.0f,
      -1.0f,
      // +/- 0
      0.0f,
      -0.0f,
      // near 0
      0.25f,
      -0.25f,
      // +/- integer
      4.0f,
      -32.0f,
      // positive +/- delta
      2.015625f,
      3.984375f,
      // negative +/- delta
      -2.015625f,
      -3.984375f,

      // No huge values - would interfere with sum. But add more to fill 2 * N:
      -2.0f,
      -10.0f,
      0.03125f,
      1.03125f,
      1.5f,
      2.0f,
      4.0f,
      5.0f,
      6.0f,
      8.0f,
      10.0f,
      256.0f,
      448.0f,
      2080.0f,
  };
  const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
  const size_t N = Lanes(d);
  padded = RoundUpTo(kNumTestCases, 2 * N);  // allow loading pairs of vectors
  auto in = AllocateAligned<float>(padded);
  auto expected = AllocateAligned<float>(padded);
  std::copy(test_cases, test_cases + kNumTestCases, in.get());
  std::fill(in.get() + kNumTestCases, in.get() + padded, 0.0f);
  return in;
}

class TestReorderDemote2To {
  // In-place N^2 selection sort to avoid dependencies
  void Sort(float* p, size_t count) {
    for (size_t i = 0; i < count - 1; ++i) {
      // Find min_element
      size_t idx_min = i;
      for (size_t j = i + 1; j < count; j++) {
        if (p[j] < p[idx_min]) {
          idx_min = j;
        }
      }

      // Swap with current
      const float tmp = p[i];
      p[i] = p[idx_min];
      p[idx_min] = tmp;
    }
  }

 public:
  template <typename TF32, class DF32>
  HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) {
#if HWY_TARGET != HWY_SCALAR
    size_t padded;
    auto in = ReorderBF16TestCases(d32, padded);

    using TBF16 = bfloat16_t;
    const Repartition<TBF16, DF32> dbf16;
    const Half<decltype(dbf16)> dbf16_half;
    const size_t N = Lanes(d32);
    auto temp16 = AllocateAligned<TBF16>(2 * N);
    auto expected = AllocateAligned<float>(2 * N);
    auto actual = AllocateAligned<float>(2 * N);

    for (size_t i = 0; i < padded; i += 2 * N) {
      const auto f0 = Load(d32, &in[i + 0]);
      const auto f1 = Load(d32, &in[i + N]);
      const auto v16 = ReorderDemote2To(dbf16, f0, f1);
      Store(v16, dbf16, temp16.get());
      const auto promoted0 = PromoteTo(d32, Load(dbf16_half, temp16.get() + 0));
      const auto promoted1 = PromoteTo(d32, Load(dbf16_half, temp16.get() + N));

      // Smoke test: sum should be same (with tolerance for non-associativity)
      const auto sum_expected =
          GetLane(SumOfLanes(d32, Add(promoted0, promoted1)));
      const auto sum_actual = GetLane(SumOfLanes(d32, Add(f0, f1)));
      HWY_ASSERT(sum_expected - 1E-4 <= sum_actual &&
                 sum_expected <= sum_actual + 1E-4);

      // Ensure values are the same after sorting to undo the Reorder
      Store(f0, d32, expected.get() + 0);
      Store(f1, d32, expected.get() + N);
      Store(promoted0, d32, actual.get() + 0);
      Store(promoted1, d32, actual.get() + N);
      Sort(expected.get(), 2 * N);
      Sort(actual.get(), 2 * N);
      HWY_ASSERT_VEC_EQ(d32, expected.get() + 0, Load(d32, actual.get() + 0));
      HWY_ASSERT_VEC_EQ(d32, expected.get() + N, Load(d32, actual.get() + N));
    }
#else   // HWY_SCALAR
    (void)d32;
#endif
  }
};

HWY_NOINLINE void TestAllReorderDemote2To() {
  ForShrinkableVectors<TestReorderDemote2To>()(float());
}

struct TestI32F64 {
  template <typename TF, class DF>
  HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
    using TI = int32_t;
    const Rebind<TI, DF> di;
    const size_t N = Lanes(df);

    // Integer positive
    HWY_ASSERT_VEC_EQ(di, Iota(di, TI(4)), DemoteTo(di, Iota(df, TF(4.0))));

    // Integer negative
    HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), DemoteTo(di, Iota(df, -TF(N))));

    // Above positive
    HWY_ASSERT_VEC_EQ(di, Iota(di, TI(2)), DemoteTo(di, Iota(df, TF(2.001))));

    // Below positive
    HWY_ASSERT_VEC_EQ(di, Iota(di, TI(3)), DemoteTo(di, Iota(df, TF(3.9999))));

    const TF eps = static_cast<TF>(0.0001);
    // Above negative
    HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)),
                      DemoteTo(di, Iota(df, -TF(N + 1) + eps)));

    // Below negative
    HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N + 1)),
                      DemoteTo(di, Iota(df, -TF(N + 1) - eps)));

    // Huge positive float
    HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMax<TI>()),
                      DemoteTo(di, Set(df, TF(1E12))));

    // Huge negative float
    HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMin<TI>()),
                      DemoteTo(di, Set(df, TF(-1E12))));
  }
};

HWY_NOINLINE void TestAllI32F64() {
#if HWY_HAVE_FLOAT64
  ForDemoteVectors<TestI32F64>()(double());
#endif
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#endif  // !HWY_IS_MSAN

#if HWY_ONCE

namespace hwy {
#if !HWY_IS_MSAN
HWY_BEFORE_TEST(HwyDemoteTest);
HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToInt);
HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToMixed);
HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToFloat);
HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllReorderDemote2To);
HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllI32F64);
#endif  // !HWY_IS_MSAN
}  // namespace hwy

// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}

#endif

@@ -17,6 +17,7 @@
#include <string.h>  // memcmp

#include "hwy/aligned_allocator.h"
#include "hwy/base.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/logical_test.cc"

@@ -58,15 +59,6 @@ struct TestLogicalInteger {
    HWY_ASSERT_VEC_EQ(d, v0, AndNot(vi, v0));
    HWY_ASSERT_VEC_EQ(d, v0, AndNot(vi, vi));

    HWY_ASSERT_VEC_EQ(d, v0, OrAnd(v0, v0, v0));
    HWY_ASSERT_VEC_EQ(d, v0, OrAnd(v0, vi, v0));
    HWY_ASSERT_VEC_EQ(d, v0, OrAnd(v0, v0, vi));
    HWY_ASSERT_VEC_EQ(d, vi, OrAnd(v0, vi, vi));
    HWY_ASSERT_VEC_EQ(d, vi, OrAnd(vi, v0, v0));
    HWY_ASSERT_VEC_EQ(d, vi, OrAnd(vi, vi, v0));
    HWY_ASSERT_VEC_EQ(d, vi, OrAnd(vi, v0, vi));
    HWY_ASSERT_VEC_EQ(d, vi, OrAnd(vi, vi, vi));

    auto v = vi;
    v = And(v, vi);
    HWY_ASSERT_VEC_EQ(d, vi, v);

@@ -164,43 +156,6 @@ struct TestCopySign {
  }
};

struct TestIfVecThenElse {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    RandomState rng;

    using TU = MakeUnsigned<T>;  // For all-one mask
    const Rebind<TU, D> du;
    const size_t N = Lanes(d);
    auto in1 = AllocateAligned<T>(N);
    auto in2 = AllocateAligned<T>(N);
    auto vec_lanes = AllocateAligned<TU>(N);
    auto expected = AllocateAligned<T>(N);

    // Each lane should have a chance of having mask=true.
    for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
      for (size_t i = 0; i < N; ++i) {
        in1[i] = static_cast<T>(Random32(&rng));
        in2[i] = static_cast<T>(Random32(&rng));
        vec_lanes[i] = (Random32(&rng) & 16) ? static_cast<TU>(~TU(0)) : TU(0);
      }

      const auto v1 = Load(d, in1.get());
      const auto v2 = Load(d, in2.get());
      const auto vec = BitCast(d, Load(du, vec_lanes.get()));

      for (size_t i = 0; i < N; ++i) {
        expected[i] = vec_lanes[i] ? in1[i] : in2[i];
      }
      HWY_ASSERT_VEC_EQ(d, expected.get(), IfVecThenElse(vec, v1, v2));
    }
  }
};
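
IfVecThenElse is a bitwise select: each result bit comes from v1 where the first operand has a 1 and from v2 where it has a 0, which is why the test fills lanes with all-ones or all-zeros so that the bitwise select reduces to the lane-wise ?: used for `expected`. A scalar sketch of the identity; BitSelect is a hypothetical name, not part of this change:

#include <cstdint>

// Bitwise select: (mask & yes) | (~mask & no). With per-lane masks that are
// entirely ones or zeros, this picks whole lanes, matching the test's oracle.
uint32_t BitSelect(uint32_t mask, uint32_t yes, uint32_t no) {
  return (mask & yes) | (~mask & no);
}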

HWY_NOINLINE void TestAllIfVecThenElse() {
  ForAllTypes(ForPartialVectors<TestIfVecThenElse>());
}

HWY_NOINLINE void TestAllCopySign() {
  ForFloatTypes(ForPartialVectors<TestCopySign>());
}

@@ -225,31 +180,6 @@ HWY_NOINLINE void TestAllZeroIfNegative() {
  ForFloatTypes(ForPartialVectors<TestZeroIfNegative>());
}

struct TestIfNegative {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto v0 = Zero(d);
    const auto vp = Iota(d, 1);
    const auto vn = Or(vp, SignBit(d));

    // Zero and positive remain unchanged
    HWY_ASSERT_VEC_EQ(d, v0, IfNegativeThenElse(v0, vn, v0));
    HWY_ASSERT_VEC_EQ(d, vn, IfNegativeThenElse(v0, v0, vn));
    HWY_ASSERT_VEC_EQ(d, vp, IfNegativeThenElse(vp, vn, vp));
    HWY_ASSERT_VEC_EQ(d, vn, IfNegativeThenElse(vp, vp, vn));

    // Negative are replaced with 2nd arg
    HWY_ASSERT_VEC_EQ(d, v0, IfNegativeThenElse(vn, v0, vp));
    HWY_ASSERT_VEC_EQ(d, vn, IfNegativeThenElse(vn, vn, v0));
    HWY_ASSERT_VEC_EQ(d, vp, IfNegativeThenElse(vn, vp, vn));
  }
};

HWY_NOINLINE void TestAllIfNegative() {
  ForFloatTypes(ForPartialVectors<TestIfNegative>());
  ForSignedTypes(ForPartialVectors<TestIfNegative>());
}

struct TestBroadcastSignBit {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {

@@ -304,11 +234,16 @@ HWY_NOINLINE void TestAllTestBit() {
struct TestPopulationCount {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
#if HWY_TARGET == HWY_RVV || HWY_IS_DEBUG_BUILD
    constexpr size_t kNumTests = 1 << 14;
#else
    constexpr size_t kNumTests = 1 << 20;
#endif
    RandomState rng;
    size_t N = Lanes(d);
    auto data = AllocateAligned<T>(N);
    auto popcnt = AllocateAligned<T>(N);
    for (size_t i = 0; i < AdjustedReps(1 << 18) / N; i++) {
    for (size_t i = 0; i < kNumTests / N; i++) {
      for (size_t i = 0; i < N; i++) {
        data[i] = static_cast<T>(rng());
        popcnt[i] = static_cast<T>(PopCount(data[i]));

@@ -333,10 +268,8 @@ namespace hwy {
HWY_BEFORE_TEST(HwyLogicalTest);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalInteger);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalFloat);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllIfVecThenElse);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCopySign);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllZeroIfNegative);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllIfNegative);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllBroadcastSignBit);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllTestBit);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllPopulationCount);

@@ -17,6 +17,8 @@
#include <stdint.h>
#include <string.h>  // memcmp

#include "hwy/base.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/mask_test.cc"
#include "hwy/foreach_target.h"

@@ -53,18 +55,13 @@ struct TestFirstN {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);

    const RebindToSigned<D> di;
    using TI = TFromD<decltype(di)>;
    using TN = SignedFromSize<HWY_MIN(sizeof(size_t), sizeof(TI))>;
    const size_t max_len = static_cast<size_t>(LimitsMax<TN>());

    // TODO(janwas): 8-bit FirstN (using SlideUp) causes spike to freeze.
#if HWY_TARGET == HWY_RVV
    if (sizeof(T) == 1) return;
#endif

    const size_t max_lanes = AdjustedReps(HWY_MIN(2 * N, size_t(64)));
    for (size_t len = 0; len <= HWY_MIN(max_lanes, max_len); ++len) {
    for (size_t len = 0; len <= HWY_MIN(2 * N, max_len); ++len) {
      const auto expected =
          RebindMask(d, Lt(Iota(di, 0), Set(di, static_cast<TI>(len))));
      const auto actual = FirstN(d, len);

@@ -371,7 +368,7 @@ struct TestFindFirstTrue {
    memset(bool_lanes.get(), 0, N * sizeof(TI));

    // For all combinations of zero/nonzero state of subset of lanes:
    const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(9)));
    const size_t max_lanes = HWY_MIN(N, size_t(10));

    HWY_ASSERT_EQ(intptr_t(-1), FindFirstTrue(d, MaskFalse(d)));
    HWY_ASSERT_EQ(intptr_t(0), FindFirstTrue(d, MaskTrue(d)));

@@ -410,7 +407,7 @@ struct TestLogicalMask {
    HWY_ASSERT_MASK_EQ(d, m_all, Not(m0));

    // For all combinations of zero/nonzero state of subset of lanes:
    const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(6)));
    const size_t max_lanes = HWY_MIN(N, size_t(6));
    for (size_t code = 0; code < (1ull << max_lanes); ++code) {
      for (size_t i = 0; i < max_lanes; ++i) {
        bool_lanes[i] = (code & (1ull << i)) ? TI(1) : TI(0);

@@ -36,7 +36,7 @@ struct TestLoadStore {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    const auto hi = Iota(d, static_cast<T>(1 + N));
    const auto hi = Iota(d, 1 + N);
    const auto lo = Iota(d, 1);
    auto lanes = AllocateAligned<T>(2 * N);
    Store(hi, d, &lanes[N]);

@@ -135,7 +135,7 @@ struct TestStoreInterleaved3 {
HWY_NOINLINE void TestAllStoreInterleaved3() {
#if HWY_TARGET == HWY_RVV
  // Segments are limited to 8 registers, so we can only go up to LMUL=2.
  const ForExtendableVectors<TestStoreInterleaved3, 2> test;
  const ForExtendableVectors<TestStoreInterleaved3, 4> test;
#else
  const ForPartialVectors<TestStoreInterleaved3> test;
#endif

@@ -198,7 +198,7 @@ struct TestStoreInterleaved4 {
HWY_NOINLINE void TestAllStoreInterleaved4() {
#if HWY_TARGET == HWY_RVV
  // Segments are limited to 8 registers, so we can only go up to LMUL=2.
  const ForExtendableVectors<TestStoreInterleaved4, 2> test;
  const ForExtendableVectors<TestStoreInterleaved4, 4> test;
#else
  const ForPartialVectors<TestStoreInterleaved4> test;
#endif

@@ -230,7 +230,7 @@ struct TestLoadDup128 {
};

HWY_NOINLINE void TestAllLoadDup128() {
  ForAllTypes(ForGEVectors<128, TestLoadDup128>());
  ForAllTypes(ForGE128Vectors<TestLoadDup128>());
}

struct TestStream {

@@ -245,7 +245,7 @@ struct TestStream {
    std::fill(out.get(), out.get() + 2 * affected_lanes, T(0));

    Stream(v, d, out.get());
    FlushStream();
    StoreFence();
    const auto actual = Load(d, out.get());
    HWY_ASSERT_VEC_EQ(d, v, actual);
    // Ensure Stream didn't modify more memory than expected

@@ -386,7 +386,7 @@ HWY_NOINLINE void TestAllGather() {

HWY_NOINLINE void TestAllCache() {
  LoadFence();
  FlushStream();
  StoreFence();
  int test = 0;
  Prefetch(&test);
  FlushCacheline(&test);

@@ -1,433 +0,0 @@
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>

#include <algorithm>
#include <limits>

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/shift_test.cc"
#include "hwy/foreach_target.h"
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

template <bool kSigned>
struct TestLeftShifts {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T t, D d) {
    if (kSigned) {
      // Also test positive values
      TestLeftShifts</*kSigned=*/false>()(t, d);
    }

    using TI = MakeSigned<T>;
    using TU = MakeUnsigned<T>;
    const size_t N = Lanes(d);
    auto expected = AllocateAligned<T>(N);

    const auto values = Iota(d, kSigned ? -TI(N) : TI(0));  // value to shift
    constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;

    // 0
    HWY_ASSERT_VEC_EQ(d, values, ShiftLeft<0>(values));
    HWY_ASSERT_VEC_EQ(d, values, ShiftLeftSame(values, 0));

    // 1
    for (size_t i = 0; i < N; ++i) {
      const T value = kSigned ? T(T(i) - T(N)) : T(i);
      expected[i] = T(TU(value) << 1);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<1>(values));
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, 1));

    // max
    for (size_t i = 0; i < N; ++i) {
      const T value = kSigned ? T(T(i) - T(N)) : T(i);
      expected[i] = T(TU(value) << kMaxShift);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<kMaxShift>(values));
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, kMaxShift));
  }
};
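
Note how the expected values are computed as T(TU(value) << 1): shifting in the unsigned type avoids undefined behavior when a signed left shift would overflow, then converts back. A one-line scalar sketch of the pattern, with a hypothetical name:

#include <cstdint>

// Left shift without signed-overflow UB: widen to unsigned, shift, narrow.
// The conversion back to the signed type wraps modulo 2^32
// (implementation-defined pre-C++20, two's complement in practice).
int32_t ShiftLeftPortable(int32_t v, int amount) {
  return static_cast<int32_t>(static_cast<uint32_t>(v) << amount);
}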

template <bool kSigned>
struct TestVariableLeftShifts {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T t, D d) {
    if (kSigned) {
      // Also test positive values
      TestVariableLeftShifts</*kSigned=*/false>()(t, d);
    }

    using TI = MakeSigned<T>;
    using TU = MakeUnsigned<T>;
    const size_t N = Lanes(d);
    auto expected = AllocateAligned<T>(N);

    const auto v0 = Zero(d);
    const auto v1 = Set(d, 1);
    const auto values = Iota(d, kSigned ? -TI(N) : TI(0));  // value to shift

    constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
    const auto max_shift = Set(d, kMaxShift);
    const auto small_shifts = And(Iota(d, 0), max_shift);
    const auto large_shifts = max_shift - small_shifts;

    // Same: 0
    HWY_ASSERT_VEC_EQ(d, values, Shl(values, v0));

    // Same: 1
    for (size_t i = 0; i < N; ++i) {
      const T value = kSigned ? T(i) - T(N) : T(i);
      expected[i] = T(TU(value) << 1);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, v1));

    // Same: max
    for (size_t i = 0; i < N; ++i) {
      const T value = kSigned ? T(i) - T(N) : T(i);
      expected[i] = T(TU(value) << kMaxShift);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, max_shift));

    // Variable: small
    for (size_t i = 0; i < N; ++i) {
      const T value = kSigned ? T(i) - T(N) : T(i);
      expected[i] = T(TU(value) << (i & kMaxShift));
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, small_shifts));

    // Variable: large
    for (size_t i = 0; i < N; ++i) {
      expected[i] = T(TU(1) << (kMaxShift - (i & kMaxShift)));
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(v1, large_shifts));
  }
};

struct TestUnsignedRightShifts {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    auto expected = AllocateAligned<T>(N);

    const auto values = Iota(d, 0);

    const T kMax = LimitsMax<T>();
    constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;

    // Shift by 0
    HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values));
    HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0));

    // Shift by 1
    for (size_t i = 0; i < N; ++i) {
      expected[i] = T(T(i & kMax) >> 1);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values));
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1));

    // max
    for (size_t i = 0; i < N; ++i) {
      expected[i] = T(T(i & kMax) >> kMaxShift);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<kMaxShift>(values));
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, kMaxShift));
  }
};

struct TestRotateRight {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    auto expected = AllocateAligned<T>(N);

    constexpr size_t kBits = sizeof(T) * 8;
    const auto mask_shift = Set(d, T{kBits});
    // Cover as many bit positions as possible to test shifting out
    const auto values = Shl(Set(d, T{1}), And(Iota(d, 0), mask_shift));

    // Rotate by 0
    HWY_ASSERT_VEC_EQ(d, values, RotateRight<0>(values));

    // Rotate by 1
    Store(values, d, expected.get());
    for (size_t i = 0; i < N; ++i) {
      expected[i] = (expected[i] >> 1) | (expected[i] << (kBits - 1));
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<1>(values));

    // Rotate by half
    Store(values, d, expected.get());
    for (size_t i = 0; i < N; ++i) {
      expected[i] = (expected[i] >> (kBits / 2)) | (expected[i] << (kBits / 2));
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<kBits / 2>(values));

    // Rotate by max
    Store(values, d, expected.get());
    for (size_t i = 0; i < N; ++i) {
      expected[i] = (expected[i] >> (kBits - 1)) | (expected[i] << 1);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<kBits - 1>(values));
  }
};

struct TestVariableUnsignedRightShifts {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    auto expected = AllocateAligned<T>(N);

    const auto v0 = Zero(d);
    const auto v1 = Set(d, 1);
    const auto values = Iota(d, 0);

    const T kMax = LimitsMax<T>();
    const auto max = Set(d, kMax);

    constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
    const auto max_shift = Set(d, kMaxShift);
    const auto small_shifts = And(Iota(d, 0), max_shift);
    const auto large_shifts = max_shift - small_shifts;

    // Same: 0
    HWY_ASSERT_VEC_EQ(d, values, Shr(values, v0));

    // Same: 1
    for (size_t i = 0; i < N; ++i) {
      expected[i] = T(T(i & kMax) >> 1);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(values, v1));

    // Same: max
    HWY_ASSERT_VEC_EQ(d, v0, Shr(values, max_shift));

    // Variable: small
    for (size_t i = 0; i < N; ++i) {
      expected[i] = T(i) >> (i & kMaxShift);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(values, small_shifts));

    // Variable: Large
    for (size_t i = 0; i < N; ++i) {
      expected[i] = kMax >> (kMaxShift - (i & kMaxShift));
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(max, large_shifts));
  }
};

template <int kAmount, typename T>
T RightShiftNegative(T val) {
  // C++ shifts are implementation-defined for negative numbers, and we have
  // seen divisions replaced with shifts, so resort to bit operations.
  using TU = hwy::MakeUnsigned<T>;
  TU bits;
  CopyBytes<sizeof(T)>(&val, &bits);

  const TU shifted = TU(bits >> kAmount);

  const TU all = TU(~TU(0));
  const size_t num_zero = sizeof(TU) * 8 - 1 - kAmount;
  const TU sign_extended = static_cast<TU>((all << num_zero) & LimitsMax<TU>());

  bits = shifted | sign_extended;
  CopyBytes<sizeof(T)>(&bits, &val);
  return val;
}
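
Right shifts of negative signed values are implementation-defined before C++20 (only C++20 guarantees arithmetic shift), so the helper above rebuilds the arithmetic shift from unsigned operations plus manual sign extension. A quick sanity check, assuming the RightShiftNegative definition above is in scope; this hypothetical harness is not part of the diff:

#include <cassert>
#include <cstdint>

int main() {
  // Arithmetic shift of -8 by 1 is -4; the helper computes this portably.
  assert(RightShiftNegative<1>(int32_t{-8}) == -4);
  // Sign bits fill in from the left, so -1 stays -1 for any shift amount.
  assert(RightShiftNegative<2>(int32_t{-1}) == -1);
  return 0;
}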
|
||||
|
||||
class TestSignedRightShifts {
|
||||
public:
|
||||
template <typename T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
const size_t N = Lanes(d);
|
||||
auto expected = AllocateAligned<T>(N);
|
||||
constexpr T kMin = LimitsMin<T>();
|
||||
constexpr T kMax = LimitsMax<T>();
|
||||
constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
|
||||
|
||||
// First test positive values, negative are checked below.
|
||||
const auto v0 = Zero(d);
|
||||
const auto values = And(Iota(d, 0), Set(d, kMax));
|
||||
|
||||
// Shift by 0
|
||||
HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values));
|
||||
HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0));
|
||||
|
||||
// Shift by 1
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
expected[i] = T(T(i & kMax) >> 1);
|
||||
}
|
||||
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values));
|
||||
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1));
|
||||
|
||||
// max
|
||||
HWY_ASSERT_VEC_EQ(d, v0, ShiftRight<kMaxShift>(values));
|
||||
HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(values, kMaxShift));
|
||||
|
||||
// Even negative value
|
||||
Test<0>(kMin, d, __LINE__);
|
||||
Test<1>(kMin, d, __LINE__);
|
||||
Test<2>(kMin, d, __LINE__);
|
||||
Test<kMaxShift>(kMin, d, __LINE__);
|
||||
|
||||
const T odd = static_cast<T>(kMin + 1);
|
||||
Test<0>(odd, d, __LINE__);
|
||||
Test<1>(odd, d, __LINE__);
|
||||
Test<2>(odd, d, __LINE__);
|
||||
Test<kMaxShift>(odd, d, __LINE__);
|
||||
}
|
||||
 private:
  template <int kAmount, typename T, class D>
  void Test(T val, D d, int line) {
    const auto expected = Set(d, RightShiftNegative<kAmount>(val));
    const auto in = Set(d, val);
    const char* file = __FILE__;
    AssertVecEqual(d, expected, ShiftRight<kAmount>(in), file, line);
    AssertVecEqual(d, expected, ShiftRightSame(in, kAmount), file, line);
  }
};

struct TestVariableSignedRightShifts {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    using TU = MakeUnsigned<T>;
    const size_t N = Lanes(d);
    auto expected = AllocateAligned<T>(N);

    constexpr T kMin = LimitsMin<T>();
    constexpr T kMax = LimitsMax<T>();

    constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;

    // First test positive values, negative are checked below.
    const auto v0 = Zero(d);
    const auto positive = Iota(d, 0) & Set(d, kMax);

    // Shift by 0
    HWY_ASSERT_VEC_EQ(d, positive, ShiftRight<0>(positive));
    HWY_ASSERT_VEC_EQ(d, positive, ShiftRightSame(positive, 0));

    // Shift by 1
    for (size_t i = 0; i < N; ++i) {
      expected[i] = T(T(i & kMax) >> 1);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(positive));
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(positive, 1));

    // max
    HWY_ASSERT_VEC_EQ(d, v0, ShiftRight<kMaxShift>(positive));
    HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(positive, kMaxShift));

    const auto max_shift = Set(d, kMaxShift);
    const auto small_shifts = And(Iota(d, 0), max_shift);
    const auto large_shifts = max_shift - small_shifts;

    const auto negative = Iota(d, kMin);

    // Test varying negative to shift
    for (size_t i = 0; i < N; ++i) {
      expected[i] = RightShiftNegative<1>(static_cast<T>(kMin + i));
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(negative, Set(d, 1)));

    // Shift MSB right by small amounts
    for (size_t i = 0; i < N; ++i) {
      const size_t amount = i & kMaxShift;
      const TU shifted = ~((1ull << (kMaxShift - amount)) - 1);
      CopyBytes<sizeof(T)>(&shifted, &expected[i]);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(Set(d, kMin), small_shifts));

    // Shift MSB right by large amounts
    for (size_t i = 0; i < N; ++i) {
      const size_t amount = kMaxShift - (i & kMaxShift);
      const TU shifted = ~((1ull << (kMaxShift - amount)) - 1);
      CopyBytes<sizeof(T)>(&shifted, &expected[i]);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(Set(d, kMin), large_shifts));
  }
};

HWY_NOINLINE void TestAllShifts() {
  ForUnsignedTypes(ForPartialVectors<TestLeftShifts</*kSigned=*/false>>());
  ForSignedTypes(ForPartialVectors<TestLeftShifts</*kSigned=*/true>>());
  ForUnsignedTypes(ForPartialVectors<TestUnsignedRightShifts>());
  ForSignedTypes(ForPartialVectors<TestSignedRightShifts>());
}

HWY_NOINLINE void TestAllVariableShifts() {
  const ForPartialVectors<TestLeftShifts</*kSigned=*/false>> shl_u;
  const ForPartialVectors<TestLeftShifts</*kSigned=*/true>> shl_s;
  const ForPartialVectors<TestUnsignedRightShifts> shr_u;
  const ForPartialVectors<TestSignedRightShifts> shr_s;

  shl_u(uint16_t());
  shr_u(uint16_t());

  shl_u(uint32_t());
  shr_u(uint32_t());

  shl_s(int16_t());
  shr_s(int16_t());

  shl_s(int32_t());
  shr_s(int32_t());

#if HWY_HAVE_INTEGER64
  shl_u(uint64_t());
  shr_u(uint64_t());

  shl_s(int64_t());
  shr_s(int64_t());
#endif
}

HWY_NOINLINE void TestAllRotateRight() {
  const ForPartialVectors<TestRotateRight> test;
  test(uint32_t());
#if HWY_HAVE_INTEGER64
  test(uint64_t());
#endif
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE

namespace hwy {
HWY_BEFORE_TEST(HwyShiftTest);
HWY_EXPORT_AND_TEST_P(HwyShiftTest, TestAllShifts);
HWY_EXPORT_AND_TEST_P(HwyShiftTest, TestAllVariableShifts);
HWY_EXPORT_AND_TEST_P(HwyShiftTest, TestAllRotateRight);
}  // namespace hwy

// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}

#endif
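RightShiftNegative, the scalar reference these shift tests compare against, is defined elsewhere in the test file and is not part of this diff. A minimal sketch of such a reference, assuming only standard C++ (the name and signature here are illustrative, not Highway's actual helper): before C++20, >> on a negative signed value is implementation-defined, so the expected result is computed via unsigned (logical) shift plus explicit sign extension.

#include <cstring>
#include <type_traits>

// Sketch of a scalar reference for arithmetic right shift of a negative
// value (precondition: val < 0, as in the tests above).
template <int kAmount, typename T>
T RightShiftNegativeRef(T val) {  // illustrative name, not from this diff
  static_assert(std::is_signed<T>::value, "reference is for signed T");
  using TU = typename std::make_unsigned<T>::type;
  TU bits;
  std::memcpy(&bits, &val, sizeof(T));      // reinterpret without UB
  bits = static_cast<TU>(bits >> kAmount);  // logical (zero-fill) shift
  // Replicate the sign bit into each of the kAmount vacated high bits.
  const TU kSign = static_cast<TU>(TU{1} << (sizeof(T) * 8 - 1));
  for (int i = 0; i < kAmount; ++i) {
    bits |= static_cast<TU>(kSign >> i);
  }
  T out;
  std::memcpy(&out, &bits, sizeof(T));
  return out;
}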

@@ -19,8 +19,6 @@
#include <array>  // IWYU pragma: keep

#include "hwy/base.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/swizzle_test.cc"
#include "hwy/foreach_target.h"
@@ -46,48 +44,12 @@ HWY_NOINLINE void TestAllGetLane() {
  ForAllTypes(ForPartialVectors<TestGetLane>());
}

struct TestDupEven {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    auto expected = AllocateAligned<T>(N);
    for (size_t i = 0; i < N; ++i) {
      expected[i] = static_cast<T>((static_cast<int>(i) & ~1) + 1);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), DupEven(Iota(d, 1)));
  }
};

HWY_NOINLINE void TestAllDupEven() {
  ForUIF3264(ForShrinkableVectors<TestDupEven>());
}

struct TestDupOdd {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
#if HWY_TARGET != HWY_SCALAR
    const size_t N = Lanes(d);
    auto expected = AllocateAligned<T>(N);
    for (size_t i = 0; i < N; ++i) {
      expected[i] = static_cast<T>((static_cast<int>(i) & ~1) + 2);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), DupOdd(Iota(d, 1)));
#else
    (void)d;
#endif
  }
};

HWY_NOINLINE void TestAllDupOdd() {
  ForUIF3264(ForShrinkableVectors<TestDupOdd>());
}

struct TestOddEven {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    const auto even = Iota(d, 1);
    const auto odd = Iota(d, static_cast<T>(1 + N));
    const auto odd = Iota(d, 1 + N);
    auto expected = AllocateAligned<T>(N);
    for (size_t i = 0; i < N; ++i) {
      expected[i] = static_cast<T>(1 + i + ((i & 1) ? N : 0));
@@ -105,7 +67,7 @@ struct TestOddEvenBlocks {
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    const auto even = Iota(d, 1);
    const auto odd = Iota(d, static_cast<T>(1 + N));
    const auto odd = Iota(d, 1 + N);
    auto expected = AllocateAligned<T>(N);
    for (size_t i = 0; i < N; ++i) {
      const size_t idx_block = i / (16 / sizeof(T));
@@ -116,7 +78,7 @@ struct TestOddEvenBlocks {
};

HWY_NOINLINE void TestAllOddEvenBlocks() {
  ForAllTypes(ForGEVectors<128, TestOddEvenBlocks>());
  ForAllTypes(ForShrinkableVectors<TestOddEvenBlocks>());
}

struct TestSwapAdjacentBlocks {
@@ -138,7 +100,7 @@ struct TestSwapAdjacentBlocks {
};

HWY_NOINLINE void TestAllSwapAdjacentBlocks() {
  ForAllTypes(ForGEVectors<128, TestSwapAdjacentBlocks>());
  ForAllTypes(ForPartialVectors<TestSwapAdjacentBlocks>());
}

struct TestTableLookupLanes {
@@ -235,131 +197,23 @@ struct TestReverse {
  }
};

struct TestReverse2 {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    const RebindToUnsigned<D> du;  // Iota does not support float16_t.
    const auto v = BitCast(d, Iota(du, 1));
    auto expected = AllocateAligned<T>(N);

    // Can't set float16_t value directly, need to permute in memory.
    auto copy = AllocateAligned<T>(N);
    Store(v, d, copy.get());
    for (size_t i = 0; i < N; ++i) {
      expected[i] = copy[i ^ 1];
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Reverse2(d, v));
  }
};

struct TestReverse4 {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    const RebindToUnsigned<D> du;  // Iota does not support float16_t.
    const auto v = BitCast(d, Iota(du, 1));
    auto expected = AllocateAligned<T>(N);

    // Can't set float16_t value directly, need to permute in memory.
    auto copy = AllocateAligned<T>(N);
    Store(v, d, copy.get());
    for (size_t i = 0; i < N; ++i) {
      expected[i] = copy[i ^ 3];
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Reverse4(d, v));
  }
};

struct TestReverse8 {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    const RebindToUnsigned<D> du;  // Iota does not support float16_t.
    const auto v = BitCast(d, Iota(du, 1));
    auto expected = AllocateAligned<T>(N);

    // Can't set float16_t value directly, need to permute in memory.
    auto copy = AllocateAligned<T>(N);
    Store(v, d, copy.get());
    for (size_t i = 0; i < N; ++i) {
      expected[i] = copy[i ^ 7];
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Reverse8(d, v));
  }
};

HWY_NOINLINE void TestAllReverse() {
  // 8-bit is not supported because Risc-V uses rgather of Lanes - Iota,
  // which requires 16 bits.
  ForUIF163264(ForPartialVectors<TestReverse>());
}

HWY_NOINLINE void TestAllReverse2() {
  // 8-bit is not supported because Risc-V uses rgather of Lanes - Iota,
  // which requires 16 bits.
  ForUIF64(ForGEVectors<128, TestReverse2>());
  ForUIF32(ForGEVectors<64, TestReverse2>());
  ForUIF16(ForGEVectors<32, TestReverse2>());
}

HWY_NOINLINE void TestAllReverse4() {
  // 8-bit is not supported because Risc-V uses rgather of Lanes - Iota,
  // which requires 16 bits.
  ForUIF64(ForGEVectors<256, TestReverse4>());
  ForUIF32(ForGEVectors<128, TestReverse4>());
  ForUIF16(ForGEVectors<64, TestReverse4>());
}

HWY_NOINLINE void TestAllReverse8() {
  // 8-bit is not supported because Risc-V uses rgather of Lanes - Iota,
  // which requires 16 bits.
  ForUIF64(ForGEVectors<512, TestReverse8>());
  ForUIF32(ForGEVectors<256, TestReverse8>());
  ForUIF16(ForGEVectors<128, TestReverse8>());
}
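An aside on the indexing in TestReverse2/4/8 above: XOR-ing a lane index with k - 1 (for a power-of-two group size k) reverses lanes within each aligned group of k, which is exactly the permutation Reverse2/Reverse4/Reverse8 promise. A standalone sketch:

#include <cstdio>

int main() {
  const int k = 4;  // group size; must be a power of two
  // Lane i reads from lane i ^ (k - 1):
  // {0,1,2,3} -> {3,2,1,0} and {4,5,6,7} -> {7,6,5,4}.
  for (int i = 0; i < 8; ++i) {
    printf("lane %d reads from lane %d\n", i, i ^ (k - 1));
  }
  return 0;
}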

struct TestReverseBlocks {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    const RebindToUnsigned<D> du;  // Iota does not support float16_t.
    const auto v = BitCast(d, Iota(du, 1));
    auto expected = AllocateAligned<T>(N);

    constexpr size_t kLanesPerBlock = 16 / sizeof(T);
    const size_t num_blocks = N / kLanesPerBlock;
    HWY_ASSERT(num_blocks != 0);

    // Can't set float16_t value directly, need to permute in memory.
    auto copy = AllocateAligned<T>(N);
    Store(v, d, copy.get());
    for (size_t i = 0; i < N; ++i) {
      const size_t idx_block = i / kLanesPerBlock;
      const size_t base = (num_blocks - 1 - idx_block) * kLanesPerBlock;
      expected[i] = copy[base + (i % kLanesPerBlock)];
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), ReverseBlocks(d, v));
  }
};

HWY_NOINLINE void TestAllReverseBlocks() {
  ForAllTypes(ForGEVectors<128, TestReverseBlocks>());
}

class TestCompress {
  template <class D, class DI, typename T = TFromD<D>, typename TI = TFromD<DI>>
  void CheckStored(D d, DI di, size_t expected_pos, size_t actual_pos,
                   const AlignedFreeUniquePtr<T[]>& in,
  template <typename T, typename TI, size_t N>
  void CheckStored(Simd<T, N> d, Simd<TI, N> di, size_t expected_pos,
                   size_t actual_pos, const AlignedFreeUniquePtr<T[]>& in,
                   const AlignedFreeUniquePtr<TI[]>& mask_lanes,
                   const AlignedFreeUniquePtr<T[]>& expected, const T* actual_u,
                   int line) {
    if (expected_pos != actual_pos) {
      hwy::Abort(
          __FILE__, line,
          "Size mismatch for %s: expected %" PRIu64 ", actual %" PRIu64 "\n",
          TypeName(T(), Lanes(d)).c_str(), static_cast<uint64_t>(expected_pos),
          static_cast<uint64_t>(actual_pos));
      hwy::Abort(__FILE__, line,
                 "Size mismatch for %s: expected %" PRIu64 ", actual %" PRIu64 "\n",
                 TypeName(T(), N).c_str(), static_cast<uint64_t>(expected_pos),
                 static_cast<uint64_t>(actual_pos));
    }
    // Upper lanes are undefined. Modified from AssertVecEqual.
    for (size_t i = 0; i < expected_pos; ++i) {
@@ -368,7 +222,6 @@ class TestCompress {
                 "Mismatch at i=%" PRIu64 " of %" PRIu64 ", line %d:\n\n",
                 static_cast<uint64_t>(i), static_cast<uint64_t>(expected_pos),
                 line);
        const size_t N = Lanes(d);
        Print(di, "mask", Load(di, mask_lanes.get()), 0, N);
        Print(d, "in", Load(d, in.get()), 0, N);
        Print(d, "expect", Load(d, expected.get()), 0, N);
@@ -398,10 +251,7 @@ class TestCompress {
    auto expected = AllocateAligned<T>(N);
    auto actual_a = AllocateAligned<T>(misalign + N);
    T* actual_u = actual_a.get() + misalign;

    const size_t bits_size = RoundUpTo((N + 7) / 8, 8);
    auto bits = AllocateAligned<uint8_t>(bits_size);
    memset(bits.get(), 0, bits_size);  // for MSAN
    auto bits = AllocateAligned<uint8_t>(HWY_MAX(8, (N + 7) / 8));

    // Each lane should have a chance of having mask=true.
    for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
@@ -615,7 +465,7 @@ HWY_NOINLINE void TestAllCompress() {

  test(uint16_t());
  test(int16_t());
#if HWY_HAVE_FLOAT16
#if HWY_CAP_FLOAT16
  test(float16_t());
#endif

@@ -632,17 +482,11 @@ HWY_AFTER_NAMESPACE();
namespace hwy {
HWY_BEFORE_TEST(HwySwizzleTest);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllGetLane);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllDupEven);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllDupOdd);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllOddEven);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllOddEvenBlocks);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllSwapAdjacentBlocks);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllTableLookupLanes);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllReverse);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllReverse2);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllReverse4);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllReverse8);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllReverseBlocks);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllCompress);
}  // namespace hwy

@@ -41,7 +41,7 @@ HWY_NOINLINE void PrintValue(T value) {
  fprintf(stderr, "0x%02X,", byte);
}

#if HWY_HAVE_FLOAT16
#if HWY_CAP_FLOAT16
HWY_NOINLINE void PrintValue(float16_t value) {
  uint16_t bits;
  CopyBytes<2>(&value, &bits);
@@ -70,11 +70,9 @@ void Print(const D d, const char* caption, VecArg<V> v, size_t lane_u = 0,
}

// Compare expected vector to vector.
// HWY_INLINE works around a Clang SVE compiler bug where all but the first
// 128 bits (the NEON register) of actual are zero.
template <class D, typename T = TFromD<D>, class V = Vec<D>>
HWY_INLINE void AssertVecEqual(D d, const T* expected, VecArg<V> actual,
                               const char* filename, const int line) {
void AssertVecEqual(D d, const T* expected, VecArg<V> actual,
                    const char* filename, const int line) {
  const size_t N = Lanes(d);
  auto actual_lanes = AllocateAligned<T>(N);
  Store(actual, d, actual_lanes.get());
@@ -86,11 +84,9 @@ HWY_INLINE void AssertVecEqual(D d, const T* expected, VecArg<V> actual,
}

// Compare expected lanes to vector.
// HWY_INLINE works around a Clang SVE compiler bug where all but the first
// 128 bits (the NEON register) of actual are zero.
template <class D, typename T = TFromD<D>, class V = Vec<D>>
HWY_INLINE void AssertVecEqual(D d, VecArg<V> expected, VecArg<V> actual,
                               const char* filename, int line) {
HWY_NOINLINE void AssertVecEqual(D d, VecArg<V> expected, VecArg<V> actual,
                                 const char* filename, int line) {
  auto expected_lanes = AllocateAligned<T>(Lanes(d));
  Store(expected, d, expected_lanes.get());
  AssertVecEqual(d, expected_lanes.get(), actual, filename, line);
@@ -100,10 +96,7 @@ HWY_INLINE void AssertVecEqual(D d, VecArg<V> expected, VecArg<V> actual,
template <class D>
HWY_NOINLINE void AssertMaskEqual(D d, VecArg<Mask<D>> a, VecArg<Mask<D>> b,
                                  const char* filename, int line) {
  // lvalues prevented MSAN failure in farm_sve.
  const Vec<D> va = VecFromMask(d, a);
  const Vec<D> vb = VecFromMask(d, b);
  AssertVecEqual(d, va, vb, filename, line);
  AssertVecEqual(d, VecFromMask(d, a), VecFromMask(d, b), filename, line);

  const char* target_name = hwy::TargetName(HWY_TARGET);
  AssertEqual(CountTrue(d, a), CountTrue(d, b), target_name, filename, line);
@@ -185,269 +178,169 @@ HWY_INLINE Mask<D> MaskFalse(const D d) {

// Helpers for instantiating tests with combinations of lane types / counts.

// Calls Test for each CappedTag<T, N> where N is in [kMinLanes, kMul * kMinArg]
// and the resulting Lanes() is in [min_lanes, max_lanes]. The upper bound
// is required to ensure capped vectors remain extendable. Implemented by
// recursively halving kMul until it is zero.
template <typename T, size_t kMul, size_t kMinArg, class Test>
struct ForeachCappedR {
  static void Do(size_t min_lanes, size_t max_lanes) {
    const CappedTag<T, kMul * kMinArg> d;
// For ensuring we do not call tests with D such that widening D results in 0
// lanes. Example: assume T=u32, VLEN=256, and fraction=1/8: there is no 1/8th
// of a u64 vector in this case.
template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)>
HWY_INLINE size_t PromotedLanes(const D d) {
  return Lanes(RepartitionToWide<decltype(d)>());
}
// Already the widest possible T, cannot widen.
template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
HWY_INLINE size_t PromotedLanes(const D d) {
  return Lanes(d);
}

    // If we already don't have enough lanes, stop.
    const size_t lanes = Lanes(d);
    if (lanes < min_lanes) return;
// For all power of two N in [kMinLanes, kMul * kMinLanes] (so that recursion
// stops at kMul == 0). Note that N may be capped or a fraction.
template <typename T, size_t kMul, size_t kMinLanes, class Test,
          bool kPromote = false>
struct ForeachSizeR {
  static void Do() {
    const Simd<T, kMul * kMinLanes> d;

    if (lanes <= max_lanes) {
      Test()(T(), d);
    }
    ForeachCappedR<T, kMul / 2, kMinArg, Test>::Do(min_lanes, max_lanes);
    // Skip invalid fractions (e.g. 1/8th of u32x4).
    const size_t lanes = kPromote ? PromotedLanes(d) : Lanes(d);
    if (lanes < kMinLanes) return;

    Test()(T(), d);

    static_assert(kMul != 0, "Recursion should have ended already");
    ForeachSizeR<T, kMul / 2, kMinLanes, Test, kPromote>::Do();
  }
};

// Base case to stop the recursion.
template <typename T, size_t kMinArg, class Test>
struct ForeachCappedR<T, 0, kMinArg, Test> {
  static void Do(size_t, size_t) {}
template <typename T, size_t kMinLanes, class Test, bool kPromote>
struct ForeachSizeR<T, 0, kMinLanes, Test, kPromote> {
  static void Do() {}
};

#if HWY_HAVE_SCALABLE

constexpr int MinVectorSize() {
#if HWY_TARGET == HWY_RVV
  // Actually 16 for the application processor profile, but the intrinsics are
  // defined as if VLEN might be only 64: there is no vuint64mf2_t.
  return 8;
#else
  return 16;
#endif
}

template <typename T>
constexpr int MinPow2() {
  // Highway follows RVV LMUL in that the smallest fraction is 1/8th (encoded
  // as kPow2 == -3). The fraction also must not result in zero lanes for the
  // smallest possible vector size.
  return HWY_MAX(-3, -static_cast<int>(CeilLog2(MinVectorSize() / sizeof(T))));
}
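// Worked example, assuming the RVV case above (MinVectorSize() == 8): for
// T = uint32_t, CeilLog2(8 / 4) = 1, so MinPow2() returns HWY_MAX(-3, -1) =
// -1, i.e. half-vectors are the smallest valid fraction; for T = uint8_t,
// CeilLog2(8 / 1) = 3 and MinPow2() returns -3, the 1/8th LMUL fraction.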

// Iterates kPow2 upward through +3.
template <typename T, int kPow2, int kAddPow2, class Test>
struct ForeachShiftR {
  static void Do(size_t min_lanes) {
    const ScalableTag<T, kPow2 + kAddPow2> d;

    // Precondition: [kPow2, 3] + kAddPow2 is a valid fraction of the minimum
    // vector size, so we always have enough lanes, except ForGEVectors.
    if (Lanes(d) >= min_lanes) {
      Test()(T(), d);
    } else {
      fprintf(stderr, "%d lanes < %d: T=%d pow=%d\n",
              static_cast<int>(Lanes(d)), static_cast<int>(min_lanes),
              static_cast<int>(sizeof(T)), kPow2 + kAddPow2);
      HWY_ASSERT(min_lanes != 1);
    }

    ForeachShiftR<T, kPow2 + 1, kAddPow2, Test>::Do(min_lanes);
  }
};

// Base case to stop the recursion.
template <typename T, int kAddPow2, class Test>
struct ForeachShiftR<T, 4, kAddPow2, Test> {
  static void Do(size_t) {}
};
#else
// ForeachCappedR already handled all possible sizes.
#endif  // HWY_HAVE_SCALABLE

// These adapters may be called directly, or via For*Types:

// Calls Test for all power of two N in [1, Lanes(d) >> kPow2]. This is for
// Calls Test for all power of two N in [1, Lanes(d) / kFactor]. This is for
// ops that widen their input, e.g. Combine (not supported by HWY_SCALAR).
template <class Test, int kPow2 = 1>
template <class Test, size_t kFactor = 2>
struct ForExtendableVectors {
  template <typename T>
  void operator()(T /*unused*/) const {
    constexpr size_t kMaxCapped = HWY_LANES(T);
    // Skip CappedTag that are already full vectors.
    const size_t max_lanes = Lanes(ScalableTag<T>()) >> kPow2;
    (void)kMaxCapped;
    (void)max_lanes;
#if HWY_TARGET == HWY_SCALAR
    // not supported
#else
    ForeachCappedR<T, (kMaxCapped >> kPow2), 1, Test>::Do(1, max_lanes);
    constexpr bool kPromote = true;
#if HWY_TARGET == HWY_RVV
    // For each [MinPow2, 3 - kPow2]; counter is [MinPow2 + kPow2, 3].
    ForeachShiftR<T, MinPow2<T>() + kPow2, -kPow2, Test>::Do(1);
#elif HWY_HAVE_SCALABLE
    // For each [MinPow2, 0 - kPow2]; counter is [MinPow2 + kPow2 + 3, 3].
    ForeachShiftR<T, MinPow2<T>() + kPow2 + 3, -kPow2 - 3, Test>::Do(1);
    ForeachSizeR<T, 8 / kFactor, HWY_LANES(T), Test, kPromote>::Do();
    // TODO(janwas): also capped
    // ForeachSizeR<T, (16 / sizeof(T)) / kFactor, 1, Test, kPromote>::Do();
#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2
    // Capped
    ForeachSizeR<T, (16 / sizeof(T)) / kFactor, 1, Test, kPromote>::Do();
    // Fractions
    ForeachSizeR<T, 8 / kFactor, HWY_LANES(T) / 8, Test, kPromote>::Do();
#else
    ForeachSizeR<T, HWY_LANES(T) / kFactor, 1, Test, kPromote>::Do();
#endif
#endif  // HWY_SCALAR
  }
};

// Calls Test for all power of two N in [1 << kPow2, Lanes(d)]. This is for ops
// Calls Test for all power of two N in [kFactor, Lanes(d)]. This is for ops
// that narrow their input, e.g. UpperHalf.
template <class Test, int kPow2 = 1>
template <class Test, size_t kFactor = 2>
struct ForShrinkableVectors {
  template <typename T>
  void operator()(T /*unused*/) const {
    constexpr size_t kMinLanes = size_t{1} << kPow2;
    constexpr size_t kMaxCapped = HWY_LANES(T);
    // For shrinking, an upper limit is unnecessary.
    constexpr size_t max_lanes = kMaxCapped;

    (void)kMinLanes;
    (void)max_lanes;
    (void)max_lanes;
#if HWY_TARGET == HWY_SCALAR
    // not supported
#elif HWY_TARGET == HWY_RVV
    ForeachSizeR<T, 8 / kFactor, kFactor * HWY_LANES(T), Test>::Do();
    // TODO(janwas): also capped
#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2
    // Capped
    ForeachSizeR<T, (16 / sizeof(T)) / kFactor, kFactor, Test>::Do();
    // Fractions
    ForeachSizeR<T, 8 / kFactor, kFactor * HWY_LANES(T) / 8, Test>::Do();
#elif HWY_TARGET == HWY_SCALAR
    // not supported
#else
    ForeachCappedR<T, (kMaxCapped >> kPow2), kMinLanes, Test>::Do(kMinLanes,
                                                                  max_lanes);
#if HWY_TARGET == HWY_RVV
    // For each [MinPow2 + kPow2, 3]; counter is [MinPow2 + kPow2, 3].
    ForeachShiftR<T, MinPow2<T>() + kPow2, 0, Test>::Do(kMinLanes);
#elif HWY_HAVE_SCALABLE
    // For each [MinPow2 + kPow2, 0]; counter is [MinPow2 + kPow2 + 3, 3].
    ForeachShiftR<T, MinPow2<T>() + kPow2 + 3, -3, Test>::Do(kMinLanes);
    ForeachSizeR<T, HWY_LANES(T) / kFactor, kFactor, Test>::Do();
#endif
#endif  // HWY_TARGET == HWY_SCALAR
  }
};

// Calls Test for all supported power of two vectors of at least kMinBits.
// Examples: AES or 64x64 require 128 bits, casts may require 64 bits.
template <size_t kMinBits, class Test>
struct ForGEVectors {
// Calls Test for all power of two N in [16 / sizeof(T), Lanes(d)]. This is for
// ops that require at least 128 bits, e.g. AES or 64x64 = 128 mul.
template <class Test>
struct ForGE128Vectors {
  template <typename T>
  void operator()(T /*unused*/) const {
    constexpr size_t kMaxCapped = HWY_LANES(T);
    constexpr size_t kMinLanes = kMinBits / 8 / sizeof(T);
    // An upper limit is unnecessary.
    constexpr size_t max_lanes = kMaxCapped;
    (void)max_lanes;
#if HWY_TARGET == HWY_SCALAR
    (void)kMinLanes;  // not supported
    // not supported
#elif HWY_TARGET == HWY_RVV
    ForeachSizeR<T, 8, HWY_LANES(T), Test>::Do();
    // TODO(janwas): also capped
    // ForeachSizeR<T, 1, (16 / sizeof(T)), Test>::Do();
#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2
    // Capped
    ForeachSizeR<T, 1, 16 / sizeof(T), Test>::Do();
    // Fractions
    ForeachSizeR<T, 8, HWY_LANES(T) / 8, Test>::Do();
#else
    ForeachCappedR<T, HWY_LANES(T) / kMinLanes, kMinLanes, Test>::Do(kMinLanes,
                                                                     max_lanes);
#if HWY_TARGET == HWY_RVV
    // Can be 0 (handled below) if kMinBits > 64.
    constexpr size_t kRatio = MinVectorSize() * 8 / kMinBits;
    constexpr int kMinPow2 =
        kRatio == 0 ? 0 : -static_cast<int>(CeilLog2(kRatio));
    // For each [kMinPow2, 3]; counter is [kMinPow2, 3].
    ForeachShiftR<T, kMinPow2, 0, Test>::Do(kMinLanes);
#elif HWY_HAVE_SCALABLE
    // Can be 0 (handled below) if kMinBits > 128.
    constexpr size_t kRatio = MinVectorSize() * 8 / kMinBits;
    constexpr int kMinPow2 =
        kRatio == 0 ? 0 : -static_cast<int>(CeilLog2(kRatio));
    // For each [kMinPow2, 0]; counter is [kMinPow2 + 3, 3].
    ForeachShiftR<T, kMinPow2 + 3, -3, Test>::Do(kMinLanes);
    ForeachSizeR<T, HWY_LANES(T) / (16 / sizeof(T)), (16 / sizeof(T)),
                 Test>::Do();
#endif
#endif  // HWY_TARGET == HWY_SCALAR
  }
};

// Calls Test for all power of two N in [8 / sizeof(T), Lanes(d)]. This is for
// ops that require at least 64 bits, e.g. casts.
template <class Test>
using ForGE128Vectors = ForGEVectors<128, Test>;
struct ForGE64Vectors {
  template <typename T>
  void operator()(T /*unused*/) const {
#if HWY_TARGET == HWY_SCALAR
    // not supported
#elif HWY_TARGET == HWY_RVV
    ForeachSizeR<T, 8, HWY_LANES(T), Test>::Do();
    // TODO(janwas): also capped
    // ForeachSizeR<T, 1, (8 / sizeof(T)), Test>::Do();
#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2
    // Capped
    ForeachSizeR<T, 1, 8 / sizeof(T), Test>::Do();
    // Fractions
    ForeachSizeR<T, 8, HWY_LANES(T) / 8, Test>::Do();
#else
    ForeachSizeR<T, HWY_LANES(T) / (8 / sizeof(T)), (8 / sizeof(T)),
                 Test>::Do();
#endif
  }
};

// Calls Test for all N that can be promoted (not the same as Extendable because
// HWY_SCALAR has one lane). Also used for ZipLower, but not ZipUpper.
template <class Test, int kPow2 = 1>
template <class Test, size_t kFactor = 2>
struct ForPromoteVectors {
  template <typename T>
  void operator()(T /*unused*/) const {
    constexpr size_t kFactor = size_t{1} << kPow2;
    static_assert(kFactor >= 2 && kFactor * sizeof(T) <= sizeof(uint64_t), "");
    constexpr size_t kMaxCapped = HWY_LANES(T);
    constexpr size_t kMinLanes = kFactor;
    // Skip CappedTag that are already full vectors.
    const size_t max_lanes = Lanes(ScalableTag<T>()) >> kPow2;
    (void)kMaxCapped;
    (void)kMinLanes;
    (void)max_lanes;
#if HWY_TARGET == HWY_SCALAR
    ForeachCappedR<T, 1, 1, Test>::Do(1, 1);
    ForeachSizeR<T, 1, 1, Test, /*kPromote=*/true>::Do();
#else
    // TODO(janwas): call Extendable if kMinLanes check not required?
    ForeachCappedR<T, (kMaxCapped >> kPow2), 1, Test>::Do(kMinLanes, max_lanes);
#if HWY_TARGET == HWY_RVV
    // For each [MinPow2, 3 - kPow2]; counter is [MinPow2 + kPow2, 3].
    ForeachShiftR<T, MinPow2<T>() + kPow2, -kPow2, Test>::Do(kMinLanes);
#elif HWY_HAVE_SCALABLE
    // For each [MinPow2, 0 - kPow2]; counter is [MinPow2 + kPow2 + 3, 3].
    ForeachShiftR<T, MinPow2<T>() + kPow2 + 3, -kPow2 - 3, Test>::Do(kMinLanes);
    return ForExtendableVectors<Test, kFactor>()(T());
#endif
#endif  // HWY_SCALAR
  }
};

// Calls Test for all N than can be demoted (not the same as Shrinkable because
// HWY_SCALAR has one lane).
template <class Test, int kPow2 = 1>
// HWY_SCALAR has one lane). Also used for LowerHalf, but not UpperHalf.
template <class Test, size_t kFactor = 2>
struct ForDemoteVectors {
  template <typename T>
  void operator()(T /*unused*/) const {
    constexpr size_t kMinLanes = size_t{1} << kPow2;
    constexpr size_t kMaxCapped = HWY_LANES(T);
    // For shrinking, an upper limit is unnecessary.
    constexpr size_t max_lanes = kMaxCapped;

    (void)kMinLanes;
    (void)max_lanes;
    (void)max_lanes;
#if HWY_TARGET == HWY_SCALAR
    ForeachCappedR<T, 1, 1, Test>::Do(1, 1);
    ForeachSizeR<T, 1, 1, Test>::Do();
#else
    ForeachCappedR<T, (kMaxCapped >> kPow2), kMinLanes, Test>::Do(kMinLanes,
                                                                  max_lanes);

    // TODO(janwas): call Extendable if kMinLanes check not required?
#if HWY_TARGET == HWY_RVV
    // For each [MinPow2 + kPow2, 3]; counter is [MinPow2 + kPow2, 3].
    ForeachShiftR<T, MinPow2<T>() + kPow2, 0, Test>::Do(kMinLanes);
#elif HWY_HAVE_SCALABLE
    // For each [MinPow2 + kPow2, 0]; counter is [MinPow2 + kPow2 + 3, 3].
    ForeachShiftR<T, MinPow2<T>() + kPow2 + 3, -3, Test>::Do(kMinLanes);
    return ForShrinkableVectors<Test, kFactor>()(T());
#endif
#endif  // HWY_TARGET == HWY_SCALAR
  }
};

// For LowerHalf/Quarter.
template <class Test, int kPow2 = 1>
struct ForHalfVectors {
  template <typename T>
  void operator()(T /*unused*/) const {
    constexpr size_t kMinLanes = size_t{1} << kPow2;
    constexpr size_t kMaxCapped = HWY_LANES(T);
    // For shrinking, an upper limit is unnecessary.
    constexpr size_t max_lanes = kMaxCapped;

    (void)kMinLanes;
    (void)max_lanes;
    (void)max_lanes;
#if HWY_TARGET == HWY_SCALAR
    ForeachCappedR<T, 1, 1, Test>::Do(1, 1);
#else
    // ForeachCappedR<T, (kMaxCapped >> kPow2), kMinLanes, Test>::Do(kMinLanes,
    // max_lanes);

    // TODO(janwas): call Extendable if kMinLanes check not required?
#if HWY_TARGET == HWY_RVV
    // For each [MinPow2 + kPow2, 3]; counter is [MinPow2 + kPow2, 3].
    ForeachShiftR<T, MinPow2<T>() + kPow2, 0, Test>::Do(kMinLanes);
#elif HWY_HAVE_SCALABLE
    // For each [MinPow2 + kPow2, 0]; counter is [MinPow2 + kPow2 + 3, 3].
    ForeachShiftR<T, MinPow2<T>() + kPow2 + 3, -3, Test>::Do(kMinLanes);
#endif
#endif  // HWY_TARGET == HWY_SCALAR
  }
};

@@ -457,7 +350,7 @@ template <class Test>
struct ForPartialVectors {
  template <typename T>
  void operator()(T t) const {
    ForExtendableVectors<Test, 0>()(t);
    ForExtendableVectors<Test, 1>()(t);
  }
};

@@ -468,7 +361,7 @@ void ForSignedTypes(const Func& func) {
  func(int8_t());
  func(int16_t());
  func(int32_t());
#if HWY_HAVE_INTEGER64
#if HWY_CAP_INTEGER64
  func(int64_t());
#endif
}
@@ -478,7 +371,7 @@ void ForUnsignedTypes(const Func& func) {
  func(uint8_t());
  func(uint16_t());
  func(uint32_t());
#if HWY_HAVE_INTEGER64
#if HWY_CAP_INTEGER64
  func(uint64_t());
#endif
}
@@ -492,7 +385,7 @@ void ForIntegerTypes(const Func& func) {
template <class Func>
void ForFloatTypes(const Func& func) {
  func(float());
#if HWY_HAVE_FLOAT64
#if HWY_CAP_FLOAT64
  func(double());
#endif
}
@@ -504,49 +397,32 @@ void ForAllTypes(const Func& func) {
}

template <class Func>
void ForUIF16(const Func& func) {
  func(uint16_t());
  func(int16_t());
#if HWY_HAVE_FLOAT16
  func(float16_t());
#endif
}

template <class Func>
void ForUIF32(const Func& func) {
void ForUIF3264(const Func& func) {
  func(uint32_t());
  func(int32_t());
  func(float());
}

template <class Func>
void ForUIF64(const Func& func) {
#if HWY_HAVE_INTEGER64
#if HWY_CAP_INTEGER64
  func(uint64_t());
  func(int64_t());
#endif
#if HWY_HAVE_FLOAT64
  func(double());
#endif
}

template <class Func>
void ForUIF3264(const Func& func) {
  ForUIF32(func);
  ForUIF64(func);
  ForFloatTypes(func);
}

template <class Func>
void ForUIF163264(const Func& func) {
  ForUIF16(func);
  ForUIF3264(func);
  func(uint16_t());
  func(int16_t());
#if HWY_CAP_FLOAT16
  func(float16_t());
#endif
}

// For tests that involve loops, adjust the trip count so that emulated tests
// finish quickly (but always at least 2 iterations to ensure some diversity).
constexpr size_t AdjustedReps(size_t max_reps) {
#if HWY_ARCH_RVV
  return HWY_MAX(max_reps / 32, 2);
  return HWY_MAX(max_reps / 16, 2);
#elif HWY_ARCH_ARM
  return HWY_MAX(max_reps / 4, 2);
#elif HWY_IS_DEBUG_BUILD
@@ -556,20 +432,6 @@ constexpr size_t AdjustedReps(size_t max_reps) {
#endif
}

// Same as above, but the loop trip count will be 1 << max_pow2.
constexpr size_t AdjustedLog2Reps(size_t max_pow2) {
  // If "negative" (unsigned wraparound), use original.
#if HWY_ARCH_RVV
  return HWY_MIN(max_pow2 - 4, max_pow2);
#elif HWY_ARCH_ARM
  return HWY_MIN(max_pow2 - 1, max_pow2);
#elif HWY_IS_DEBUG_BUILD
  return HWY_MIN(max_pow2 - 1, max_pow2);
#else
  return max_pow2;
#endif
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
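Both the old ForeachSizeR and the new ForeachCappedR/ForeachShiftR helpers above rely on the same compile-time enumeration pattern: a template parameter is halved (or stepped) on each recursive instantiation, and an explicit specialization terminates the recursion, so every power-of-two lane count is visited. A minimal self-contained sketch of the pattern (the names here are illustrative, not Highway's):

#include <cstddef>
#include <cstdio>

// Visits kN, kN/2, ..., 1; the kN == 0 specialization ends the recursion,
// mirroring the ForeachSizeR<T, 0, ...> base case above.
template <size_t kN>
struct ForEachPow2 {
  static void Do() {
    printf("testing with %zu lanes\n", kN);
    ForEachPow2<kN / 2>::Do();
  }
};

template <>
struct ForEachPow2<0> {
  static void Do() {}  // base case: stop the recursion
};

int main() {
  ForEachPow2<8>::Do();  // prints 8, 4, 2, 1
  return 0;
}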

@@ -30,6 +30,9 @@ bool BytesEqual(const void* p1, const void* p2, const size_t size,
  const uint8_t* bytes2 = reinterpret_cast<const uint8_t*>(p2);
  for (size_t i = 0; i < size; ++i) {
    if (bytes1[i] != bytes2[i]) {
      fprintf(stderr, "Mismatch at byte %" PRIu64 " of %" PRIu64 ": %d != %d\n",
              static_cast<uint64_t>(i), static_cast<uint64_t>(size), bytes1[i],
              bytes2[i]);
      if (pos != nullptr) {
        *pos = i;
      }

@@ -26,7 +26,6 @@
#include "hwy/aligned_allocator.h"
#include "hwy/base.h"
#include "hwy/highway.h"
#include "hwy/highway_export.h"

namespace hwy {

@@ -68,7 +67,9 @@ static HWY_INLINE uint32_t Random32(RandomState* rng) {
  return static_cast<uint32_t>((*rng)());
}

static HWY_INLINE uint64_t Random64(RandomState* rng) { return (*rng)(); }
static HWY_INLINE uint64_t Random64(RandomState* rng) {
  return (*rng)();
}

// Prevents the compiler from eliding the computations that led to "output".
// Works by indicating to the compiler that "output" is being read and modified.
@@ -83,8 +84,8 @@ inline void PreventElision(T&& output) {
#endif  // HWY_COMPILER_MSVC
}

HWY_TEST_DLLEXPORT bool BytesEqual(const void* p1, const void* p2,
                                   const size_t size, size_t* pos = nullptr);
bool BytesEqual(const void* p1, const void* p2, const size_t size,
                size_t* pos = nullptr);

void AssertStringEqual(const char* expected, const char* actual,
                       const char* target_name, const char* filename, int line);
@@ -128,25 +129,25 @@ HWY_INLINE TypeInfo MakeTypeInfo() {
  return info;
}

HWY_TEST_DLLEXPORT bool IsEqual(const TypeInfo& info, const void* expected_ptr,
                                const void* actual_ptr);
bool IsEqual(const TypeInfo& info, const void* expected_ptr,
             const void* actual_ptr);

HWY_TEST_DLLEXPORT void TypeName(const TypeInfo& info, size_t N, char* string100);
void TypeName(const TypeInfo& info, size_t N, char* string100);

HWY_TEST_DLLEXPORT void PrintArray(const TypeInfo& info, const char* caption,
                                   const void* array_void, size_t N,
                                   size_t lane_u = 0, size_t max_lanes = 7);
void PrintArray(const TypeInfo& info, const char* caption,
                const void* array_void, size_t N, size_t lane_u = 0,
                size_t max_lanes = 7);

HWY_TEST_DLLEXPORT HWY_NORETURN void PrintMismatchAndAbort(
    const TypeInfo& info, const void* expected_ptr, const void* actual_ptr,
    const char* target_name, const char* filename, int line, size_t lane = 0,
    size_t num_lanes = 1);
HWY_NORETURN void PrintMismatchAndAbort(const TypeInfo& info,
                                        const void* expected_ptr,
                                        const void* actual_ptr,
                                        const char* target_name,
                                        const char* filename, int line,
                                        size_t lane = 0, size_t num_lanes = 1);

HWY_TEST_DLLEXPORT void AssertArrayEqual(const TypeInfo& info,
                                         const void* expected_void,
                                         const void* actual_void, size_t N,
                                         const char* target_name,
                                         const char* filename, int line);
void AssertArrayEqual(const TypeInfo& info, const void* expected_void,
                      const void* actual_void, size_t N,
                      const char* target_name, const char* filename, int line);

}  // namespace detail

@@ -52,10 +52,10 @@ HWY_NOINLINE void TestAllName() { ForAllTypes(ForPartialVectors<TestName>()); }
struct TestEqualInteger {
  template <class T>
  HWY_NOINLINE void operator()(T /*t*/) const {
    HWY_ASSERT_EQ(T(0), T(0));
    HWY_ASSERT_EQ(T(1), T(1));
    HWY_ASSERT_EQ(T(-1), T(-1));
    HWY_ASSERT_EQ(LimitsMin<T>(), LimitsMin<T>());
    HWY_ASSERT(IsEqual(T(0), T(0)));
    HWY_ASSERT(IsEqual(T(1), T(1)));
    HWY_ASSERT(IsEqual(T(-1), T(-1)));
    HWY_ASSERT(IsEqual(LimitsMin<T>(), LimitsMin<T>()));

    HWY_ASSERT(!IsEqual(T(0), T(1)));
    HWY_ASSERT(!IsEqual(T(1), T(0)));

@@ -4,7 +4,7 @@ libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@
includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@

Name: libhwy-contrib
Description: Additions to Highway: dot product, image, math, sort
Description: Additions to Highway: image and math library
Version: @HWY_LIBRARY_VERSION@
Libs: -L${libdir} -lhwy_contrib
Cflags: -I${includedir}

@@ -7,4 +7,4 @@ Name: libhwy
Description: Efficient and performance-portable SIMD wrapper
Version: @HWY_LIBRARY_VERSION@
Libs: -L${libdir} -lhwy
Cflags: -I${includedir} -D@DLLEXPORT_TO_DEFINE@
Cflags: -I${includedir}

@@ -1,9 +0,0 @@
/*
 * Copyright 2019 Google LLC
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

/* mock crypto module for benchmarks and unit tests or std::random_device fails at runtime */
var crypto = { getRandomValues: function(array) { for (var i = 0; i < array.length; i++) array[i] = (Math.random()*256)|0 } };

@@ -59,7 +59,7 @@ export QEMU_LD_PREFIX=/usr/arm-linux-gnueabihf
rm -rf build_arm7
mkdir build_arm7
cd build_arm7
CC=arm-linux-gnueabihf-gcc-11 CXX=arm-linux-gnueabihf-g++-11 cmake .. -DHWY_CMAKE_ARM7:BOOL=ON -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON
CC=arm-linux-gnueabihf-gcc CXX=arm-linux-gnueabihf-g++ cmake .. -DHWY_CMAKE_ARM7:BOOL=ON -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON
make -j8
ctest
cd ..
@@ -71,7 +71,7 @@ export QEMU_LD_PREFIX=/usr/aarch64-linux-gnu
rm -rf build_arm8
mkdir build_arm8
cd build_arm8
CC=aarch64-linux-gnu-gcc-11 CXX=aarch64-linux-gnu-g++-11 cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON
CC=aarch64-linux-gnu-gcc CXX=aarch64-linux-gnu-g++ cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON
make -j8
ctest
cd ..

@@ -34,7 +34,7 @@ jobs:
          env_stack_size: 1
          max_stack: 3000
          # Conformance tooling test requires numpy.
          apt_pkgs: graphviz python3-numpy
          apt_pkgs: python3-numpy
        - name: lowprecision
          mode: release
          test_in_pr: true
@@ -461,8 +461,8 @@ jobs:
    runs-on: ubuntu-latest
    env:
      CCACHE_DIR: ${{ github.workspace }}/.ccache
      EM_VERSION: 3.1.4
      V8_VERSION: 9.8.177
      EM_VERSION: 2.0.23
      V8_VERSION: 9.3.22
      V8: ${{ github.workspace }}/.jsvu/v8
      BUILD_TARGET: wasm32

@@ -506,7 +506,7 @@ jobs:
            ${{ runner.os }}-${{ steps.git-env.outputs.parent }}-${{ matrix.variant }}

      - name: Install emsdk
        uses: mymindstorm/setup-emsdk@v11
        uses: mymindstorm/setup-emsdk@v10
        # TODO(deymo): We could cache this action but it doesn't work when running
        # in a matrix.
        with:

@@ -14,7 +14,7 @@ MYDIR=$(dirname $(realpath "$0"))
# Git revisions we use for the given submodules. Update these whenever you
# update a git submodule.
THIRD_PARTY_GFLAGS="827c769e5fc98e0f2a34c47cef953cc6328abced"
THIRD_PARTY_HIGHWAY="f13e3b956eb226561ac79427893ec0afd66f91a8"
THIRD_PARTY_HIGHWAY="e69083a12a05caf037cabecdf1b248b7579705a5"
THIRD_PARTY_SKCMS="64374756e03700d649f897dbd98c95e78c30c7da"
THIRD_PARTY_SJPEG="868ab558fad70fcbe8863ba4e85179eeb81cc840"
THIRD_PARTY_ZLIB="cacf7f1d4e3d44d871b605da3b647f07d718623f"

@@ -5,12 +5,6 @@

#include "lib/extras/codec.h"

#include "jxl/decode.h"
#include "jxl/types.h"
#include "lib/extras/packed_image.h"
#include "lib/jxl/base/padded_bytes.h"
#include "lib/jxl/base/status.h"

#if JPEGXL_ENABLE_APNG
#include "lib/extras/enc/apng.h"
#endif
@@ -74,14 +68,6 @@ Status Encode(const CodecInOut& io, const extras::Codec codec,
    JXL_WARNING("Writing JPEG data as pixels");
  }

  extras::PackedPixelFile ppf;
  size_t num_channels = io.metadata.m.color_encoding.Channels();
  JxlPixelFormat format = {
      static_cast<uint32_t>(num_channels),
      bits_per_sample <= 8 ? JXL_TYPE_UINT8 : JXL_TYPE_UINT16,
      JXL_NATIVE_ENDIAN, 0};
  std::vector<uint8_t> bytes_vector;
  const bool floating_point = bits_per_sample > 16;
  switch (codec) {
    case extras::Codec::kPNG:
#if JPEGXL_ENABLE_APNG
@@ -101,24 +87,8 @@ Status Encode(const CodecInOut& io, const extras::Codec codec,
      return JXL_FAILURE("JPEG XL was built without JPEG support");
#endif
    case extras::Codec::kPNM:

      // Choose native for PFM; PGM/PPM require big-endian (N/A for PBM)
      format.endianness = floating_point ? JXL_NATIVE_ENDIAN : JXL_BIG_ENDIAN;
      if (floating_point) {
        format.data_type = JXL_TYPE_FLOAT;
      }
      if (!c_desired.IsSRGB()) {
        JXL_WARNING(
            "PNM encoder cannot store custom ICC profile; decoder\n"
            "will need hint key=color_space to get the same values");
      }
      JXL_RETURN_IF_ERROR(extras::ConvertCodecInOutToPackedPixelFile(
          io, format, c_desired, pool, &ppf));
      JXL_RETURN_IF_ERROR(
          extras::EncodeImagePNM(ppf, bits_per_sample, pool, &bytes_vector));
      bytes->assign(bytes_vector.data(),
                    bytes_vector.data() + bytes_vector.size());
      return true;
      return extras::EncodeImagePNM(&io, c_desired, bits_per_sample, pool,
                                    bytes);
    case extras::Codec::kPGX:
      return extras::EncodeImagePGX(&io, c_desired, bits_per_sample, pool,
                                    bytes);

@@ -61,6 +61,7 @@ const EnumName<JxlRenderingIntent> kJxlRenderingIntentNames[] = {
template <typename T>
Status ParseEnum(const std::string& token, const EnumName<T>* enum_values,
                 size_t enum_len, T* value) {
  std::string str;
  for (size_t i = 0; i < enum_len; i++) {
    if (enum_values[i].name == token) {
      *value = enum_values[i].value;