Bug 1743793 - Update libjxl and highway r=tnikkel

Differential Revision: https://phabricator.services.mozilla.com/D132952
Kagami Sascha Rosylight 2021-12-06 23:43:32 +00:00
Parent 8d68ea1d86
Commit 59265056dd
138 changed files: 35691 additions and 9603 deletions


@ -18,6 +18,8 @@ EXPORTS.hwy += [
"/third_party/highway/hwy/aligned_allocator.h",
"/third_party/highway/hwy/base.h",
"/third_party/highway/hwy/cache_control.h",
"/third_party/highway/hwy/detect_compiler_arch.h",
"/third_party/highway/hwy/detect_targets.h",
"/third_party/highway/hwy/foreach_target.h",
"/third_party/highway/hwy/highway.h",
"/third_party/highway/hwy/targets.h",
@ -25,6 +27,8 @@ EXPORTS.hwy += [
EXPORTS.hwy.ops += [
"/third_party/highway/hwy/ops/arm_neon-inl.h",
"/third_party/highway/hwy/ops/arm_sve-inl.h",
"/third_party/highway/hwy/ops/generic_ops-inl.h",
"/third_party/highway/hwy/ops/rvv-inl.h",
"/third_party/highway/hwy/ops/scalar-inl.h",
"/third_party/highway/hwy/ops/set_macros-inl.h",


@ -1,43 +1,44 @@
# Version of this schema
schema: 1
bugzilla:
# Bugzilla product and component for this directory and subdirectories
product: Core
component: "ImageLib"
# Document the source of externally hosted code
origin:
# Short name of the package/library
name: highway
description: Performance-portable, length-agnostic SIMD with runtime dispatch
# Full URL for the package's homepage/etc
# Usually different from repository url
url: https://github.com/google/highway
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: commit e2397743fe092df68b760d358253773699a16c93 (2021-06-09T08:56:32Z).
# Revision to pull in
# Must be a long or short commit SHA (long preferred)
revision: e2397743fe092df68b760d358253773699a16c93
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/
# Multiple licenses can be specified (as a YAML list)
# A "LICENSE" file must exist containing the full license text
license: Apache-2.0
license-file: LICENSE
vendoring:
url: https://github.com/google/highway
source-hosting: github
vendor-directory: third_party/highway
exclude:
- g3doc/
# Version of this schema
schema: 1
bugzilla:
# Bugzilla product and component for this directory and subdirectories
product: Core
component: "ImageLib"
# Document the source of externally hosted code
origin:
# Short name of the package/library
name: highway
description: Performance-portable, length-agnostic SIMD with runtime dispatch
# Full URL for the package's homepage/etc
# Usually different from repository url
url: https://github.com/google/highway
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: commit e69083a12a05caf037cabecdf1b248b7579705a5 (2021-11-11T08:20:00Z).
# Revision to pull in
# Must be a long or short commit SHA (long preferred)
revision: e69083a12a05caf037cabecdf1b248b7579705a5
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/
# Multiple licenses can be specified (as a YAML list)
# A "LICENSE" file must exist containing the full license text
license: Apache-2.0
license-file: LICENSE
vendoring:
url: https://github.com/google/highway
source-hosting: github
vendor-directory: third_party/highway
exclude:
- g3doc/


@ -20,12 +20,12 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: commit 9e8c5766ba32c008ddcaf11f0fac591aa7964f7b (2021-11-10T07:51:06Z).
release: commit a9100da7143e4f367d2e26f4d11e457e85343543 (2021-11-25T17:57:37Z).
# Revision to pull in
# Must be a long or short commit SHA (long preferred)
# NOTE(krosylight): Update highway together when updating this!
revision: 9e8c5766ba32c008ddcaf11f0fac591aa7964f7b
revision: a9100da7143e4f367d2e26f4d11e457e85343543
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/
@ -53,3 +53,4 @@ vendoring:
- third_party/testdata/
- tools/

third_party/highway/BUILD (vendored)

@ -41,9 +41,15 @@ selects.config_setting_group(
],
)
config_setting(
name = "emulate_sve",
values = {
"copt": "-DHWY_EMULATE_SVE",
},
)
# Additional warnings for Clang OR GCC (skip for MSVC)
CLANG_GCC_COPTS = [
"-Werror",
"-Wunused-parameter",
"-Wunused-variable",
"-Wextra-semi",
@ -74,13 +80,19 @@ COPTS = select({
":compiler_gcc": CLANG_GCC_COPTS,
# Default to clang because compiler detection only works in Bazel
"//conditions:default": CLANG_GCC_COPTS + CLANG_ONLY_COPTS,
}) + select({
"@platforms//cpu:riscv64": [
"-march=rv64gcv0p10",
"-menable-experimental-extensions",
],
"//conditions:default": [
],
})
# Unused on Bazel builds, where these are not defined/known; Copybara replaces
# usages of this with an empty list.
# Unused on Bazel builds, where this is not defined/known; Copybara replaces
# usages with an empty list.
COMPAT = [
"//buildenv/target:mobile",
"//buildenv/target:vendor",
"//buildenv/target:non_prod", # includes mobile/vendor.
]
# WARNING: changing flags such as HWY_DISABLED_TARGETS may break users without
@ -94,24 +106,23 @@ cc_library(
"hwy/aligned_allocator.cc",
"hwy/targets.cc",
],
# Normal headers with include guards
hdrs = [
"hwy/aligned_allocator.h",
"hwy/base.h",
"hwy/cache_control.h",
"hwy/highway.h",
"hwy/detect_compiler_arch.h", # private
"hwy/detect_targets.h", # private
"hwy/targets.h",
],
compatible_with = [],
copts = COPTS,
# TODO(janwas): remove once WASM toolchain supports *extend instead of *widen
defines = select({
"//tools/cc_target_os:wasm": ["HWY_WASM_OLD_NAMES"],
"//conditions:default": [],
}),
textual_hdrs = [
"hwy/highway.h", # public
"hwy/foreach_target.h", # public
"hwy/ops/arm_neon-inl.h",
"hwy/ops/arm_sve-inl.h",
"hwy/ops/generic_ops-inl.h",
"hwy/ops/rvv-inl.h",
"hwy/ops/scalar-inl.h",
"hwy/ops/set_macros-inl.h",
@ -121,6 +132,19 @@ cc_library(
"hwy/ops/x86_256-inl.h",
"hwy/ops/x86_512-inl.h",
],
deps = select({
":emulate_sve": ["//third_party/farm_sve"],
"//conditions:default": [],
}),
)
cc_library(
name = "dot",
compatible_with = [],
textual_hdrs = [
"hwy/contrib/dot/dot-inl.h",
],
deps = [":hwy"],
)
cc_library(
@ -144,9 +168,26 @@ cc_library(
deps = [":hwy"],
)
cc_library(
name = "sort",
compatible_with = [],
textual_hdrs = [
"hwy/contrib/sort/sort-inl.h",
],
deps = [":hwy"],
)
# Everything required for tests that use Highway.
cc_library(
name = "hwy_test_util",
textual_hdrs = ["hwy/tests/test_util-inl.h"],
srcs = ["hwy/tests/test_util.cc"],
hdrs = ["hwy/tests/test_util.h"],
textual_hdrs = [
"hwy/tests/test_util-inl.h",
"hwy/tests/hwy_gtest.h",
],
# Must not depend on a gtest variant, which can conflict with the
# GUNIT_INTERNAL_BUILD_MODE defined by the test.
deps = [":hwy"],
)
@ -182,8 +223,10 @@ cc_binary(
# path, name
HWY_TESTS = [
("hwy/contrib/dot/", "dot_test"),
("hwy/contrib/image/", "image_test"),
("hwy/contrib/math/", "math_test"),
("hwy/contrib/sort/", "sort_test"),
("hwy/examples/", "skeleton_test"),
("hwy/", "nanobenchmark_test"),
("hwy/", "aligned_allocator_test"),
@ -191,10 +234,13 @@ HWY_TESTS = [
("hwy/", "highway_test"),
("hwy/", "targets_test"),
("hwy/tests/", "arithmetic_test"),
("hwy/tests/", "blockwise_test"),
("hwy/tests/", "combine_test"),
("hwy/tests/", "compare_test"),
("hwy/tests/", "convert_test"),
("hwy/tests/", "crypto_test"),
("hwy/tests/", "logical_test"),
("hwy/tests/", "mask_test"),
("hwy/tests/", "memory_test"),
("hwy/tests/", "swizzle_test"),
("hwy/tests/", "test_util_test"),
@ -209,16 +255,32 @@ HWY_TESTS = [
srcs = [
subdir + test + ".cc",
],
copts = COPTS,
# for test_suite. math_test is not yet supported on RVV.
tags = ["hwy_ops_test"] if test != "math_test" else [],
copts = COPTS + [
# gTest triggers this warning (which is enabled by the
# extra-semi in COPTS), so we need to disable it here,
# but it's still enabled for :hwy.
"-Wno-c++98-compat-extra-semi",
],
features = select({
"@platforms//cpu:riscv64": ["fully_static_link"],
"//conditions:default": [],
}),
linkstatic = select({
"@platforms//cpu:riscv64": True,
"//conditions:default": False,
}),
local_defines = ["HWY_IS_TEST"],
# for test_suite.
tags = ["hwy_ops_test"],
deps = [
":dot",
":hwy",
":hwy_test_util",
":image",
":math",
":nanobenchmark",
":skeleton",
":sort",
"@com_google_googletest//:gtest_main",
],
),

third_party/highway/CMakeLists.txt (vendored)

@ -19,7 +19,7 @@ if(POLICY CMP0083)
cmake_policy(SET CMP0083 NEW)
endif()
project(hwy VERSION 0.12.2) # Keep in sync with highway.h version
project(hwy VERSION 0.15.0) # Keep in sync with highway.h version
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_EXTENSIONS OFF)
@ -42,6 +42,12 @@ endif()
set(HWY_CMAKE_ARM7 OFF CACHE BOOL "Set copts for ARMv7 with NEON?")
# Unconditionally adding -Werror risks breaking the build when new warnings
# arise due to compiler/platform changes. Enable this in CI/tests.
set(HWY_WARNINGS_ARE_ERRORS OFF CACHE BOOL "Add -Werror flag?")
set(HWY_EXAMPLES_TESTS_INSTALL ON CACHE BOOL "Build examples, tests, install?")
include(CheckCXXSourceCompiles)
check_cxx_source_compiles(
"int main() {
@ -54,9 +60,11 @@ check_cxx_source_compiles(
)
set(HWY_CONTRIB_SOURCES
hwy/contrib/dot/dot-inl.h
hwy/contrib/image/image.cc
hwy/contrib/image/image.h
hwy/contrib/math/math-inl.h
hwy/contrib/sort/sort-inl.h
)
set(HWY_SOURCES
@ -64,12 +72,15 @@ set(HWY_SOURCES
hwy/aligned_allocator.h
hwy/base.h
hwy/cache_control.h
hwy/detect_compiler_arch.h # private
hwy/detect_targets.h # private
hwy/foreach_target.h
hwy/highway.h
hwy/nanobenchmark.cc
hwy/nanobenchmark.h
hwy/ops/arm_neon-inl.h
hwy/ops/arm_sve-inl.h
hwy/ops/generic_ops-inl.h
hwy/ops/scalar-inl.h
hwy/ops/set_macros-inl.h
hwy/ops/shared-inl.h
@ -79,7 +90,13 @@ set(HWY_SOURCES
hwy/ops/x86_512-inl.h
hwy/targets.cc
hwy/targets.h
)
set(HWY_TEST_SOURCES
hwy/tests/hwy_gtest.h
hwy/tests/test_util-inl.h
hwy/tests/test_util.cc
hwy/tests/test_util.h
)
if (MSVC)
@ -98,16 +115,15 @@ else()
# Warnings
-Wall
-Wextra
-Wformat-security
-Wno-unused-function
-Wnon-virtual-dtor
-Woverloaded-virtual
# These are not included in Wall nor Wextra:
-Wconversion
-Wsign-conversion
-Wvla
-Wnon-virtual-dtor
)
if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
list(APPEND HWY_FLAGS
-Wc++2a-extensions
-Wfloat-overflow-conversion
-Wfloat-zero-conversion
-Wfor-loop-analysis
@ -126,32 +142,40 @@ else()
# Use color in messages
-fdiagnostics-show-option -fcolor-diagnostics
)
if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 6.0)
list(APPEND HWY_FLAGS -Wc++2a-extensions)
endif()
endif()
if (WIN32)
if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
list(APPEND HWY_FLAGS
-Wno-global-constructors
-Wno-language-extension-token
-Wno-used-but-marked-unused
-Wno-shadow-field-in-constructor
-Wno-unused-member-function
-Wno-unused-template
-Wno-c++98-compat-pedantic
-Wno-used-but-marked-unused
-Wno-zero-as-null-pointer-constant
)
endif()
list(APPEND HWY_FLAGS
-Wno-c++98-compat-pedantic
-Wno-cast-align
-Wno-double-promotion
-Wno-float-equal
-Wno-format-nonliteral
-Wno-global-constructors
-Wno-language-extension-token
-Wno-missing-prototypes
-Wno-shadow
-Wno-shadow-field-in-constructor
-Wno-sign-conversion
-Wno-unused-member-function
-Wno-unused-template
-Wno-used-but-marked-unused
-Wno-zero-as-null-pointer-constant
)
else()
list(APPEND HWY_FLAGS
-fmath-errno
-fno-exceptions
)
endif()
endif() # WIN32
if (HWY_CMAKE_ARM7)
list(APPEND HWY_FLAGS
@ -162,6 +186,10 @@ else()
)
endif() # HWY_CMAKE_ARM7
if (HWY_WARNINGS_ARE_ERRORS)
list(APPEND HWY_FLAGS -Werror)
endif()
endif() # !MSVC
add_library(hwy STATIC ${HWY_SOURCES})
@ -174,6 +202,31 @@ target_compile_options(hwy_contrib PRIVATE ${HWY_FLAGS})
set_property(TARGET hwy_contrib PROPERTY POSITION_INDEPENDENT_CODE ON)
target_include_directories(hwy_contrib PUBLIC ${CMAKE_CURRENT_LIST_DIR})
add_library(hwy_test STATIC ${HWY_TEST_SOURCES})
target_compile_options(hwy_test PRIVATE ${HWY_FLAGS})
set_property(TARGET hwy_test PROPERTY POSITION_INDEPENDENT_CODE ON)
target_include_directories(hwy_test PUBLIC ${CMAKE_CURRENT_LIST_DIR})
# -------------------------------------------------------- hwy_list_targets
# Generate a tool to print the compiled-in targets as defined by the current
# flags. This tool will print to stderr at build time, after building hwy.
add_executable(hwy_list_targets hwy/tests/list_targets.cc)
target_compile_options(hwy_list_targets PRIVATE ${HWY_FLAGS})
target_link_libraries(hwy_list_targets hwy)
target_include_directories(hwy_list_targets PRIVATE
$<TARGET_PROPERTY:hwy,INCLUDE_DIRECTORIES>)
# TARGET_FILE always returns the path to executable
# Naked target also not always could be run (due to the lack of '.\' prefix)
# Thus effective command to run should contain the full path
# and emulator prefix (if any).
add_custom_command(TARGET hwy_list_targets POST_BUILD
COMMAND ${CMAKE_CROSSCOMPILING_EMULATOR} $<TARGET_FILE:hwy_list_targets> || (exit 0))
# --------------------------------------------------------
# Allow skipping the following sections for projects that do not need them:
# tests, examples, benchmarks and installation.
if (HWY_EXAMPLES_TESTS_INSTALL)
# -------------------------------------------------------- install library
install(TARGETS hwy
DESTINATION "${CMAKE_INSTALL_LIBDIR}")
@ -199,6 +252,18 @@ foreach (source ${HWY_CONTRIB_SOURCES})
endif()
endforeach()
install(TARGETS hwy_test
DESTINATION "${CMAKE_INSTALL_LIBDIR}")
# Install all the headers keeping the relative path to the current directory
# when installing them.
foreach (source ${HWY_TEST_SOURCES})
if ("${source}" MATCHES "\.h$")
get_filename_component(dirname "${source}" DIRECTORY)
install(FILES "${source}"
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${dirname}")
endif()
endforeach()
# Add a pkg-config file for libhwy and the contrib/test libraries.
set(HWY_LIBRARY_VERSION "${CMAKE_PROJECT_VERSION}")
foreach (pc libhwy.pc libhwy-contrib.pc libhwy-test.pc)
@ -207,20 +272,6 @@ foreach (pc libhwy.pc libhwy-contrib.pc libhwy-test.pc)
DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
endforeach()
# -------------------------------------------------------- hwy_list_targets
# Generate a tool to print the compiled-in targets as defined by the current
# flags. This tool will print to stderr at build time, after building hwy.
add_executable(hwy_list_targets hwy/tests/list_targets.cc)
target_compile_options(hwy_list_targets PRIVATE ${HWY_FLAGS})
target_include_directories(hwy_list_targets PRIVATE
$<TARGET_PROPERTY:hwy,INCLUDE_DIRECTORIES>)
# TARGET_FILE always returns the path to executable
# Naked target also not always could be run (due to the lack of '.\' prefix)
# Thus effective command to run should contain the full path
# and emulator prefix (if any).
add_custom_command(TARGET hwy POST_BUILD
COMMAND ${CMAKE_CROSSCOMPILING_EMULATOR} $<TARGET_FILE:hwy_list_targets> || (exit 0))
# -------------------------------------------------------- Examples
# Avoids mismatch between GTest's static CRT and our dynamic.
@ -284,18 +335,25 @@ endif()
endif() # HWY_SYSTEM_GTEST
set(HWY_TEST_FILES
hwy/contrib/dot/dot_test.cc
hwy/contrib/image/image_test.cc
# Disabled due to SIGILL in clang7 debug build during gtest discovery phase,
# not reproducible locally. Still tested via bazel build.
# hwy/contrib/math/math_test.cc
hwy/contrib/sort/sort_test.cc
hwy/aligned_allocator_test.cc
hwy/base_test.cc
hwy/highway_test.cc
hwy/targets_test.cc
hwy/examples/skeleton_test.cc
hwy/tests/arithmetic_test.cc
hwy/tests/blockwise_test.cc
hwy/tests/combine_test.cc
hwy/tests/compare_test.cc
hwy/tests/convert_test.cc
hwy/tests/crypto_test.cc
hwy/tests/logical_test.cc
hwy/tests/mask_test.cc
hwy/tests/memory_test.cc
hwy/tests/swizzle_test.cc
hwy/tests/test_util_test.cc
@ -314,9 +372,9 @@ foreach (TESTFILE IN LISTS HWY_TEST_FILES)
target_compile_options(${TESTNAME} PRIVATE -DHWY_IS_TEST=1)
if(HWY_SYSTEM_GTEST)
target_link_libraries(${TESTNAME} hwy hwy_contrib GTest::GTest GTest::Main)
target_link_libraries(${TESTNAME} hwy hwy_contrib hwy_test GTest::GTest GTest::Main)
else()
target_link_libraries(${TESTNAME} hwy hwy_contrib gtest gtest_main)
target_link_libraries(${TESTNAME} hwy hwy_contrib hwy_test gtest gtest_main)
endif()
# Output test targets in the test directory.
set_target_properties(${TESTNAME} PROPERTIES PREFIX "tests/")
@ -336,3 +394,5 @@ endforeach ()
target_sources(skeleton_test PRIVATE hwy/examples/skeleton.cc)
endif() # BUILD_TESTING
endif() # HWY_EXAMPLES_TESTS_INSTALL

third_party/highway/CMakeLists.txt.in (vendored)

@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 2.8.2)
cmake_minimum_required(VERSION 2.8.12)
project(googletest-download NONE)

third_party/highway/README.md (vendored)

@ -1,7 +1,7 @@
# Efficient and performance-portable SIMD
Highway is a C++ library for SIMD (Single Instruction, Multiple Data), i.e.
applying the same operation to 'lanes'.
applying the same operation to multiple 'lanes' using a single CPU instruction.
## Why Highway?
@ -14,13 +14,17 @@ applying the same operation to 'lanes'.
## Current status
Supported targets: scalar, SSE4, AVX2, AVX-512, NEON (ARMv7 and v8), WASM SIMD.
Ports to RVV and SVE/SVE2 are in progress.
Supported targets: scalar, S-SSE3, SSE4, AVX2, AVX-512, AVX3_DL (~Icelake,
requires opt-in by defining `HWY_WANT_AVX3_DL`), NEON (ARMv7 and v8), SVE,
WASM SIMD.
SVE is tested using farm_sve (see acknowledgments). SVE2 is implemented but not
yet validated. A subset of RVV is implemented and tested with GCC and QEMU.
Work is underway to compile using LLVM, which has different intrinsics with AVL.
Version 0.11 is considered stable enough to use in other projects, and is
expected to remain backwards compatible unless serious issues are discovered
while implementing SVE/RVV targets. After these targets are added, Highway will
reach version 1.0.
while finishing the RVV target. After that, Highway will reach version 1.0.
Continuous integration tests build with a recent version of Clang (running on
x86 and QEMU for ARM) and MSVC from VS2015 (running on x86).
@ -61,11 +65,6 @@ make -j && make test
Or you can run `run_tests.sh` (`run_tests.bat` on Windows).
To test on all the attainable targets for your platform, use
`cmake .. -DCMAKE_CXX_FLAGS="-DHWY_COMPILE_ALL_ATTAINABLE"`. Otherwise, the
default configuration skips baseline targets (e.g. scalar) that are superseded
by another baseline target.
Bazel is also supported for building, but it is not as widely used/tested.
## Quick start
@ -73,16 +72,17 @@ Bazel is also supported for building, but it is not as widely used/tested.
You can use the `benchmark` inside examples/ as a starting point.
A [quick-reference page](g3doc/quick_reference.md) briefly lists all operations
and their parameters, and the [instruction_matrix][instmtx] indicates the
number of instructions per operation.
and their parameters, and the [instruction_matrix](g3doc/instruction_matrix.pdf)
indicates the number of instructions per operation.
We recommend using full SIMD vectors whenever possible for maximum performance
portability. To obtain them, pass a `HWY_FULL(float)` tag to functions such as
`Zero/Set/Load`. There is also the option of a vector of up to `N` (a power of
two) lanes: `HWY_CAPPED(T, N)`. 128-bit vectors are guaranteed to be available
for lanes of type `T` if `HWY_TARGET != HWY_SCALAR` and `N == 16 / sizeof(T)`.
two <= 16/sizeof(T)) lanes of type `T`: `HWY_CAPPED(T, N)`. If `HWY_TARGET ==
HWY_SCALAR`, the vector always has one lane. For all other targets, up to
128-bit vectors are guaranteed to be available.
Functions using Highway must be inside a namespace `namespace HWY_NAMESPACE {`
Functions using Highway must be inside `namespace HWY_NAMESPACE {`
(possibly nested in one or more other namespaces defined by the project), and
additionally either prefixed with `HWY_ATTR`, or residing between
`HWY_BEFORE_NAMESPACE()` and `HWY_AFTER_NAMESPACE()`.
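For illustration, a minimal sketch in that style, closely following the quick-start pattern; the `project` namespace and the `MulAddLoop` name are placeholders rather than part of Highway, and the pointers are assumed to be vector-aligned (otherwise use `LoadU`/`StoreU`):
```
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace project {  // placeholder application namespace
namespace HWY_NAMESPACE {

using namespace hwy::HWY_NAMESPACE;

// x[i] = mul[i] * x[i] + add[i]; assumes size is a multiple of Lanes(d) and
// that all pointers are aligned (e.g. from AllocateAligned).
void MulAddLoop(const float* HWY_RESTRICT mul, const float* HWY_RESTRICT add,
                size_t size, float* HWY_RESTRICT x) {
  const HWY_FULL(float) d;  // descriptor selecting full vectors of float
  for (size_t i = 0; i < size; i += Lanes(d)) {
    const auto v_mul = Load(d, mul + i);
    const auto v_add = Load(d, add + i);
    Store(MulAdd(v_mul, Load(d, x + i), v_add), d, x + i);
  }
}

}  // namespace HWY_NAMESPACE
}  // namespace project
HWY_AFTER_NAMESPACE();
```
Because the code resides between `HWY_BEFORE_NAMESPACE()` and `HWY_AFTER_NAMESPACE()`, no per-function `HWY_ATTR` annotation is needed here.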
@ -97,11 +97,27 @@ additionally either prefixed with `HWY_ATTR`, or residing between
* For dynamic dispatch, a table of function pointers is generated via the
`HWY_EXPORT` macro that is used by `HWY_DYNAMIC_DISPATCH(func)(args)` to
call the best function pointer for the current CPU supported targets. A
call the best function pointer for the current CPU's supported targets. A
module is automatically compiled for each target in `HWY_TARGETS` (see
[quick-reference](g3doc/quick_reference.md)) if `HWY_TARGET_INCLUDE` is
defined and foreach_target.h is included.
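As a hedged sketch of this mechanism (file, namespace and function names are placeholders; the vendored tree's hwy/examples/skeleton.cc shows the full pattern):
```
// simd_module.cc (placeholder; the string below must be this file's path)
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "simd_module.cc"
#include "hwy/foreach_target.h"  // re-includes this file once per target
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace project {
namespace HWY_NAMESPACE {
using namespace hwy::HWY_NAMESPACE;

// Per-target implementation: doubles each element in place.
void MulByTwo(float* HWY_RESTRICT p, size_t size) {
  const HWY_FULL(float) d;
  for (size_t i = 0; i < size; i += Lanes(d)) {
    Store(MulAdd(Load(d, p + i), Set(d, 2.0f), Zero(d)), d, p + i);
  }
}

}  // namespace HWY_NAMESPACE
}  // namespace project
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace project {
HWY_EXPORT(MulByTwo);  // generates the table of per-target function pointers

void CallMulByTwo(float* p, size_t size) {
  HWY_DYNAMIC_DISPATCH(MulByTwo)(p, size);  // best available target at runtime
}
}  // namespace project
#endif  // HWY_ONCE
```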
## Compiler flags
Applications should be compiled with optimizations enabled - without inlining,
SIMD code may slow down by factors of 10 to 100. For clang and GCC, `-O2` is
generally sufficient.
For MSVC, we recommend compiling with `/Gv` to allow non-inlined functions to
pass vector arguments in registers. If intending to use the AVX2 target together
with half-width vectors (e.g. for `PromoteTo`), it is also important to compile
with `/arch:AVX2`. This seems to be the only way to generate VEX-encoded SSE4
instructions on MSVC. Otherwise, mixing VEX-encoded AVX2 instructions and
non-VEX SSE4 may cause severe performance degradation. Unfortunately, the
resulting binary will then require AVX2. Note that no such flag is needed for
clang and GCC because they support target-specific attributes, which we use to
ensure proper VEX code generation for AVX2 targets.
## Strip-mining loops
To vectorize a loop, "strip-mining" transforms it into an outer loop and inner
@ -139,10 +155,7 @@ Highway offers several ways to express loops where `N` need not divide `count`:
The template parameter and second function arguments are again not needed.
This avoids duplicating code, and is reasonable if `count` is large.
Otherwise, multiple iterations may be slower than one `LoopBody` variant
with masking, especially because the `HWY_SCALAR` target selected by
`HWY_CAPPED(T, 1)` is slower for some operations due to workarounds for
undefined behavior in C++.
If `count` is small, the second loop may be slower than the next option.
* Process whole vectors as above, followed by a single call to a modified
`LoopBody` with masking:
@ -157,210 +170,23 @@ Highway offers several ways to express loops where `N` need not divide `count`:
}
```
Now the template parameter and second function argument can be used inside
`LoopBody` to replace `Load/Store` of full aligned vectors with
`LoadN/StoreN(n)` that affect no more than `1 <= n <= N` aligned elements
(pending implementation).
`LoopBody` to 'blend' the new partial vector with previous memory contents:
`Store(IfThenElse(FirstN(d, N), partial, prev_full), d, aligned_pointer);`.
This is a good default when it is infeasible to ensure vectors are padded.
In contrast to the scalar loop, only a single final iteration is needed.
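For illustration, a sketch of this blended final iteration; `project` and `ClampToLimit` are placeholder names, and the code assumes `p` points to storage padded to a whole number of vectors (e.g. an `AllocateAligned` buffer whose size was rounded up), so that loading and storing a full vector at the last position is safe:
```
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace project {
namespace HWY_NAMESPACE {
using namespace hwy::HWY_NAMESPACE;

// Clamps p[0..count) to at most `limit`.
void ClampToLimit(float* HWY_RESTRICT p, size_t count, float limit) {
  const HWY_FULL(float) d;
  const size_t N = Lanes(d);
  const auto v_limit = Set(d, limit);
  size_t i = 0;
  for (; i + N <= count; i += N) {  // whole vectors
    Store(Min(Load(d, p + i), v_limit), d, p + i);
  }
  if (i != count) {  // single masked final iteration
    const auto prev_full = Load(d, p + i);  // also reads padding lanes
    const auto partial = Min(prev_full, v_limit);
    // Blend so that only the first count-i lanes receive new values.
    Store(IfThenElse(FirstN(d, count - i), partial, prev_full), d, p + i);
  }
}

}  // namespace HWY_NAMESPACE
}  // namespace project
HWY_AFTER_NAMESPACE();
```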
## Design philosophy
* Performance is important but not the sole consideration. Anyone who goes to
the trouble of using SIMD clearly cares about speed. However, portability,
maintainability and readability also matter, otherwise we would write in
assembly. We aim for performance within 10-20% of a hand-written assembly
implementation on the development platform.
* The guiding principles of C++ are "pay only for what you use" and "leave no
room for a lower-level language below C++". We apply these by defining a
SIMD API that ensures operation costs are visible, predictable and minimal.
* Performance portability is important, i.e. the API should be efficient on
all target platforms. Unfortunately, common idioms for one platform can be
inefficient on others. For example: summing lanes horizontally versus
shuffling. Documenting which operations are expensive does not prevent their
use, as evidenced by widespread use of `HADDPS`. Performance acceptance
tests may detect large regressions, but do not help choose the approach
during initial development. Analysis tools can warn about some potential
inefficiencies, but likely not all. We instead provide [a carefully chosen
set of vector types and operations that are efficient on all target
platforms][instmtx] (PPC8, SSE4/AVX2+, ARMv8).
* Future SIMD hardware features are difficult to predict. For example, AVX2
came with surprising semantics (almost no interaction between 128-bit
blocks) and AVX-512 added two kinds of predicates (writemask and zeromask).
To ensure the API reflects hardware realities, we suggest a flexible
approach that adds new operations as they become commonly available, with
scalar fallbacks where not supported.
* Masking is not yet widely supported on current CPUs. It is difficult to
define an interface that provides access to all platform features while
retaining performance portability. The P0214R5 proposal lacks support for
AVX-512/ARM SVE zeromasks. We suggest limiting usage of masks to the
`IfThen[Zero]Else[Zero]` functions until the community has gained more
experience with them.
* "Width-agnostic" SIMD is more future-proof than user-specified fixed sizes.
For example, valarray-like code can iterate over a 1D array with a
library-specified vector width. This will result in better code when vector
sizes increase, and matches the direction taken by
[ARM SVE](https://alastairreid.github.io/papers/sve-ieee-micro-2017.pdf) and
RiscV V as well as Agner Fog's
[ForwardCom instruction set proposal](https://goo.gl/CFizWu). However, some
applications may require fixed sizes, so we also guarantee support for
128-bit vectors in each instruction set.
* The API and its implementation should be usable and efficient with commonly
used compilers, including MSVC. For example, we write `ShiftLeft<3>(v)`
instead of `v << 3` because MSVC 2017 (ARM64) does not propagate the literal
(https://godbolt.org/g/rKx5Ga). Highway requires function-specific
target attributes, supported by GCC 4.9 / Clang 3.9 / MSVC 2015.
* Efficient and safe runtime dispatch is important. Modules such as image or
video codecs are typically embedded into larger applications such as
browsers, so they cannot require separate binaries for each CPU. Libraries
also cannot predict whether the application already uses AVX2 (and pays the
frequency throttling cost), so this decision must be left to the
application. Using only the lowest-common denominator instructions
sacrifices too much performance.
Therefore, we provide code paths for multiple instruction sets and choose
the most suitable at runtime. To reduce overhead, dispatch should be hoisted
to higher layers instead of checking inside every low-level function.
Highway supports inlining functions in the same file or in *-inl.h headers.
We generate all code paths from the same source to reduce implementation-
and debugging cost.
* Not every CPU need be supported. For example, pre-SSE4.1 CPUs are
increasingly rare and the AVX instruction set is limited to floating-point
operations. To reduce code size and compile time, we provide specializations
for SSE4, AVX2 and AVX-512 instruction sets on x86, plus a scalar fallback.
* Access to platform-specific intrinsics is necessary for acceptance in
performance-critical projects. We provide conversions to and from intrinsics
to allow utilizing specialized platform-specific functionality, and simplify
incremental porting of existing code.
* The core API should be compact and easy to learn. We provide only the few
dozen operations which are necessary and sufficient for most of the 150+
SIMD applications we examined.
## Prior API designs
The author has been writing SIMD code since 2002: first via assembly language,
then intrinsics, later Intel's `F32vec4` wrapper, followed by three generations
of custom vector classes. The first used macros to generate the classes, which
reduces duplication but also readability. The second used templates instead.
The third (used in highwayhash and PIK) added support for AVX2 and runtime
dispatch. The current design (used in JPEG XL) enables code generation for
multiple platforms and/or instruction sets from the same source, and improves
runtime dispatch.
## Differences versus [P0214R5 proposal](https://goo.gl/zKW4SA)
1. Adding widely used and portable operations such as `AndNot`, `AverageRound`,
bit-shift by immediates and `IfThenElse`.
1. Designing the API to avoid or minimize overhead on AVX2/AVX-512 caused by
crossing 128-bit 'block' boundaries.
1. Avoiding the need for non-native vectors. By contrast, P0214R5's `simd_cast`
returns `fixed_size<>` vectors which are more expensive to access because
they reside on the stack. We can avoid this plus additional overhead on
ARM/AVX2 by defining width-expanding operations as functions of a vector
part, e.g. promoting half a vector of `uint8_t` lanes to one full vector of
`uint16_t`, or demoting full vectors to half vectors with half-width lanes.
1. Guaranteeing access to the underlying intrinsic vector type. This ensures
all platform-specific capabilities can be used. P0214R5 instead only
'encourages' implementations to provide access.
1. Enabling safe runtime dispatch and inlining in the same binary. P0214R5 is
based on the Vc library, which does not provide assistance for linking
multiple instruction sets into the same binary. The Vc documentation
suggests compiling separate executables for each instruction set or using
GCC's ifunc (indirect functions). The latter is compiler-specific and risks
crashes due to ODR violations when compiling the same function with
different compiler flags. We solve this problem via target-specific
namespaces and attributes (see HOWTO section below). We also permit a mix of
static target selection and runtime dispatch for hotspots that may benefit
from newer instruction sets if available.
1. Using built-in PPC vector types without a wrapper class. This leads to much
better code generation with GCC 6.3: https://godbolt.org/z/pd2PNP.
By contrast, P0214R5 requires a wrapper. We avoid this by using only the
member operators provided by the PPC vectors; all other functions and
typedefs are non-members. 2019-04 update: Clang power64le does not have
this issue, so we simplified get_part(d, v) to GetLane(v).
1. Omitting inefficient or non-performance-portable operations such as `hmax`,
`operator[]`, and unsupported integer comparisons. Applications can often
replace these operations at lower cost than emulating that exact behavior.
1. Omitting `long double` types: these are not commonly available in hardware.
1. Ensuring signed integer overflow has well-defined semantics (wraparound).
1. Simple header-only implementation and less than a tenth of the size of the
Vc library from which P0214 was derived (98,000 lines in
https://github.com/VcDevel/Vc according to the gloc Chrome extension).
1. Avoiding hidden performance costs. P0214R5 allows implicit conversions from
integer to float, which costs 3-4 cycles on x86. We make these conversions
explicit to ensure their cost is visible.
## Other related work
* [Neat SIMD](http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=7568423)
adopts a similar approach with interchangeable vector/scalar types and
a compact interface. It allows access to the underlying intrinsics, but
does not appear to be designed for other platforms than x86.
* UME::SIMD ([code](https://goo.gl/yPeVZx), [paper](https://goo.gl/2xpZrk))
also adopts an explicit vectorization model with vector classes.
However, it exposes the union of all platform capabilities, which makes the
API harder to learn (209-page spec) and implement (the estimated LOC count
is [500K](https://goo.gl/1THFRi)). The API is less performance-portable
because it allows applications to use operations that are inefficient on
other platforms.
* Inastemp ([code](https://goo.gl/hg3USM), [paper](https://goo.gl/YcTU7S))
is a vector library for scientific computing with some innovative features:
automatic FLOPS counting, and "if/else branches" using lambda functions.
It supports IBM Power8, but only provides float and double types.
## Overloaded function API
Most C++ vector APIs rely on class templates. However, the ARM SVE vector
type is sizeless and cannot be wrapped in a class. We instead rely on overloaded
functions. Overloading based on vector types is also undesirable because SVE
vectors cannot be default-constructed. We instead use a dedicated 'descriptor'
type `Simd` for overloading, abbreviated to `D` for template arguments and
`d` in lvalues.
Note that generic function templates are possible (see highway.h).
## Masks
AVX-512 introduced a major change to the SIMD interface: special mask registers
(one bit per lane) that serve as predicates. It would be expensive to force
AVX-512 implementations to conform to the prior model of full vectors with lanes
set to all one or all zero bits. We instead provide a Mask type that emulates
a subset of this functionality on other platforms at zero cost.
Masks are returned by comparisons and `TestBit`; they serve as the input to
`IfThen*`. We provide conversions between masks and vector lanes. For clarity
and safety, we use FF..FF as the definition of true. To also benefit from
x86 instructions that only require the sign bit of floating-point inputs to be
set, we provide a special `ZeroIfNegative` function.
## Additional resources
* [Highway introduction (slides)][intro]
* [Overview of instructions per operation on different architectures][instmtx]
* [Highway introduction (slides)](g3doc/highway_intro.pdf)
* [Overview of instructions per operation on different architectures](g3doc/instruction_matrix.pdf)
* [Design philosophy and comparison](g3doc/design_philosophy.md)
[intro]: g3doc/highway_intro.pdf
[instmtx]: g3doc/instruction_matrix.pdf
## Acknowledgments
We have used [farm-sve](https://gitlab.inria.fr/bramas/farm-sve) by Berenger
Bramas; it has proved useful for checking the SVE port on an x86 development
machine.
This is not an officially supported Google product.
Contact: janwas@google.com

third_party/highway/debian/changelog (vendored)

@ -1,3 +1,41 @@
highway (0.15.0-1) UNRELEASED; urgency=medium
* New ops: CompressBlendedStore, ConcatOdd/Even, IndicesFromVec
* New ops: OddEvenBlocks, SwapAdjacentBlocks, Reverse, RotateRight
* Add bf16, unsigned comparisons, more lane types for Reverse/TableLookupLanes
* Contrib: add sort(ing network) and dot(product)
* Targets: update RVV for LLVM, add experimental WASM2
* Separate library hwy_test for test utils
* Add non-macro Simd<> aliases
* Fixes: const V& for GCC, AVX3 BZHI, POPCNT with AVX on MSVC, avoid %zu
-- Jan Wassenberg <janwas@google.com> Wed, 10 Nov 2021 10:00:00 +0100
highway (0.14.2-1) UNRELEASED; urgency=medium
* Add MaskedLoad
* Fix non-glibc PPC, Windows GCC, MSVC 19.14
* Opt-in for -Werror; separate design_philosophy.md
-- Jan Wassenberg <janwas@google.com> Tue, 24 Aug 2021 15:00:00 +0200
highway (0.14.1-1) UNRELEASED; urgency=medium
* Add LoadMaskBits, CompressBits[Store]
* Fix CPU feature check (AES/F16C) and warnings
* Improved DASSERT - disabled in optimized builds
-- Jan Wassenberg <janwas@google.com> Tue, 17 Aug 2021 14:00:00 +0200
highway (0.14.0-1) UNRELEASED; urgency=medium
* Add SVE, S-SSE3, AVX3_DL targets
* Support partial vectors in all ops
* Add PopulationCount, FindFirstTrue, Ne, TableLookupBytesOr0
* Add AESRound, CLMul, MulOdd, HWY_CAP_FLOAT16
-- Jan Wassenberg <janwas@google.com> Thu, 29 Jul 2021 15:00:00 +0200
highway (0.12.2-1) UNRELEASED; urgency=medium
* fix scalar-only test and Windows macro conflict with Load/StoreFence

third_party/highway/hwy/aligned_allocator.cc (vendored)

@ -27,10 +27,21 @@
namespace hwy {
namespace {
constexpr size_t kAlignment = HWY_MAX(HWY_ALIGNMENT, kMaxVectorSize);
#if HWY_ARCH_RVV && defined(__riscv_vector)
// Not actually an upper bound on the size, but this value prevents crossing a
// 4K boundary (relevant on Andes).
constexpr size_t kAlignment = HWY_MAX(HWY_ALIGNMENT, 4096);
#else
constexpr size_t kAlignment = HWY_ALIGNMENT;
#endif
#if HWY_ARCH_X86
// On x86, aliasing can only occur at multiples of 2K, but that's too wasteful
// if this is used for single-vector allocations. 256 is more reasonable.
constexpr size_t kAlias = kAlignment * 4;
#else
constexpr size_t kAlias = kAlignment;
#endif
#pragma pack(push, 1)
struct AllocationHeader {
@ -53,6 +64,7 @@ size_t NextAlignedOffset() {
void* AllocateAlignedBytes(const size_t payload_size, AllocPtr alloc_ptr,
void* opaque_ptr) {
HWY_ASSERT(payload_size != 0); // likely a bug in caller
if (payload_size >= std::numeric_limits<size_t>::max() / 2) {
HWY_DASSERT(false && "payload_size too large");
return nullptr;
@ -94,7 +106,7 @@ void* AllocateAlignedBytes(const size_t payload_size, AllocPtr alloc_ptr,
header->allocated = allocated;
header->payload_size = payload_size;
return HWY_ASSUME_ALIGNED(reinterpret_cast<void*>(payload), kMaxVectorSize);
return HWY_ASSUME_ALIGNED(reinterpret_cast<void*>(payload), kAlignment);
}
void FreeAlignedBytes(const void* aligned_pointer, FreePtr free_ptr,


@ -120,13 +120,13 @@ TEST(AlignedAllocatorTest, AllocDefaultPointers) {
/*opaque_ptr=*/nullptr);
ASSERT_NE(nullptr, ptr);
// Make sure the pointer is actually aligned.
EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr) % kMaxVectorSize);
EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr) % HWY_ALIGNMENT);
char* p = static_cast<char*>(ptr);
size_t ret = 0;
for (size_t i = 0; i < kSize; i++) {
// Performs a computation using p[] to prevent it being optimized away.
p[i] = static_cast<char>(i & 0x7F);
if (i) ret += p[i] * p[i - 1];
if (i) ret += static_cast<size_t>(p[i] * p[i - 1]);
}
EXPECT_NE(0U, ret);
FreeAlignedBytes(ptr, /*free_ptr=*/nullptr, /*opaque_ptr=*/nullptr);
@ -152,7 +152,7 @@ TEST(AlignedAllocatorTest, CustomAlloc) {
// We should have only requested one alloc from the allocator.
EXPECT_EQ(1U, fake_alloc.PendingAllocs());
// Make sure the pointer is actually aligned.
EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr) % kMaxVectorSize);
EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr) % HWY_ALIGNMENT);
FreeAlignedBytes(ptr, &FakeAllocator::StaticFree, &fake_alloc);
EXPECT_EQ(0U, fake_alloc.PendingAllocs());
}
@ -197,7 +197,7 @@ TEST(AlignedAllocatorTest, MakeUniqueAlignedArray) {
TEST(AlignedAllocatorTest, AllocSingleInt) {
auto ptr = AllocateAligned<uint32_t>(1);
ASSERT_NE(nullptr, ptr.get());
EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr.get()) % kMaxVectorSize);
EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr.get()) % HWY_ALIGNMENT);
// Force delete of the unique_ptr now to check that it doesn't crash.
ptr.reset(nullptr);
EXPECT_EQ(nullptr, ptr.get());
@ -207,7 +207,7 @@ TEST(AlignedAllocatorTest, AllocMultipleInt) {
const size_t kSize = 7777;
auto ptr = AllocateAligned<uint32_t>(kSize);
ASSERT_NE(nullptr, ptr.get());
EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr.get()) % kMaxVectorSize);
EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr.get()) % HWY_ALIGNMENT);
// ptr[i] is actually (*ptr.get())[i] which will use the operator[] of the
// underlying type chosen by AllocateAligned() for the std::unique_ptr.
EXPECT_EQ(&(ptr[0]) + 1, &(ptr[1]));
@ -276,3 +276,9 @@ TEST(AlignedAllocatorTest, DefaultInit) {
}
} // namespace hwy
// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}

third_party/highway/hwy/base.h (vendored)

@ -23,72 +23,7 @@
#include <atomic>
#include <cfloat>
// Add to #if conditions to prevent IDE from graying out code.
#if (defined __CDT_PARSER__) || (defined __INTELLISENSE__) || \
(defined Q_CREATOR_RUN) || (defined(__CLANGD__))
#define HWY_IDE 1
#else
#define HWY_IDE 0
#endif
//------------------------------------------------------------------------------
// Detect compiler using predefined macros
// clang-cl defines _MSC_VER but doesn't behave like MSVC in other aspects like
// used in HWY_DIAGNOSTICS(). We include a check that we are not clang for that
// purpose.
#if defined(_MSC_VER) && !defined(__clang__)
#define HWY_COMPILER_MSVC _MSC_VER
#else
#define HWY_COMPILER_MSVC 0
#endif
#ifdef __INTEL_COMPILER
#define HWY_COMPILER_ICC __INTEL_COMPILER
#else
#define HWY_COMPILER_ICC 0
#endif
#ifdef __GNUC__
#define HWY_COMPILER_GCC (__GNUC__ * 100 + __GNUC_MINOR__)
#else
#define HWY_COMPILER_GCC 0
#endif
// Clang can masquerade as MSVC/GCC, in which case both are set.
#ifdef __clang__
#ifdef __APPLE__
// Apple LLVM version is unrelated to the actual Clang version, which we need
// for enabling workarounds. Use the presence of warning flags to deduce it.
// Adapted from https://github.com/simd-everywhere/simde/ simde-detect-clang.h.
#if __has_warning("-Wformat-insufficient-args")
#define HWY_COMPILER_CLANG 1200
#elif __has_warning("-Wimplicit-const-int-float-conversion")
#define HWY_COMPILER_CLANG 1100
#elif __has_warning("-Wmisleading-indentation")
#define HWY_COMPILER_CLANG 1000
#elif defined(__FILE_NAME__)
#define HWY_COMPILER_CLANG 900
#elif __has_warning("-Wextra-semi-stmt") || \
__has_builtin(__builtin_rotateleft32)
#define HWY_COMPILER_CLANG 800
#elif __has_warning("-Wc++98-compat-extra-semi")
#define HWY_COMPILER_CLANG 700
#else // Anything older than 7.0 is not recommended for Highway.
#define HWY_COMPILER_CLANG 600
#endif // __has_warning chain
#else // Non-Apple: normal version
#define HWY_COMPILER_CLANG (__clang_major__ * 100 + __clang_minor__)
#endif
#else // Not clang
#define HWY_COMPILER_CLANG 0
#endif
// More than one may be nonzero, but we want at least one.
#if !HWY_COMPILER_MSVC && !HWY_COMPILER_ICC && !HWY_COMPILER_GCC && \
!HWY_COMPILER_CLANG
#error "Unsupported compiler"
#endif
#include "hwy/detect_compiler_arch.h"
//------------------------------------------------------------------------------
// Compiler-specific definitions
@ -140,18 +75,6 @@
//------------------------------------------------------------------------------
// Builtin/attributes
#ifdef __has_builtin
#define HWY_HAS_BUILTIN(name) __has_builtin(name)
#else
#define HWY_HAS_BUILTIN(name) 0
#endif
#ifdef __has_attribute
#define HWY_HAS_ATTRIBUTE(name) __has_attribute(name)
#else
#define HWY_HAS_ATTRIBUTE(name) 0
#endif
// Enables error-checking of format strings.
#if HWY_HAS_ATTRIBUTE(__format__)
#define HWY_FORMAT(idx_fmt, idx_arg) \
@ -175,9 +98,9 @@
// are inlined. Support both per-function annotation (HWY_ATTR) for lambdas and
// automatic annotation via pragmas.
#if HWY_COMPILER_CLANG
#define HWY_PUSH_ATTRIBUTES(targets_str) \
#define HWY_PUSH_ATTRIBUTES(targets_str) \
HWY_PRAGMA(clang attribute push(__attribute__((target(targets_str))), \
apply_to = function))
apply_to = function))
#define HWY_POP_ATTRIBUTES HWY_PRAGMA(clang attribute pop)
#elif HWY_COMPILER_GCC
#define HWY_PUSH_ATTRIBUTES(targets_str) \
@ -188,78 +111,6 @@
#define HWY_POP_ATTRIBUTES
#endif
//------------------------------------------------------------------------------
// Detect architecture using predefined macros
#if defined(__i386__) || defined(_M_IX86)
#define HWY_ARCH_X86_32 1
#else
#define HWY_ARCH_X86_32 0
#endif
#if defined(__x86_64__) || defined(_M_X64)
#define HWY_ARCH_X86_64 1
#else
#define HWY_ARCH_X86_64 0
#endif
#if HWY_ARCH_X86_32 && HWY_ARCH_X86_64
#error "Cannot have both x86-32 and x86-64"
#endif
#if HWY_ARCH_X86_32 || HWY_ARCH_X86_64
#define HWY_ARCH_X86 1
#else
#define HWY_ARCH_X86 0
#endif
#if defined(__powerpc64__) || defined(_M_PPC)
#define HWY_ARCH_PPC 1
#else
#define HWY_ARCH_PPC 0
#endif
#if defined(__ARM_ARCH_ISA_A64) || defined(__aarch64__) || defined(_M_ARM64)
#define HWY_ARCH_ARM_A64 1
#else
#define HWY_ARCH_ARM_A64 0
#endif
#if defined(__arm__) || defined(_M_ARM)
#define HWY_ARCH_ARM_V7 1
#else
#define HWY_ARCH_ARM_V7 0
#endif
#if HWY_ARCH_ARM_A64 && HWY_ARCH_ARM_V7
#error "Cannot have both A64 and V7"
#endif
#if HWY_ARCH_ARM_A64 || HWY_ARCH_ARM_V7
#define HWY_ARCH_ARM 1
#else
#define HWY_ARCH_ARM 0
#endif
#if defined(__EMSCRIPTEN__) || defined(__wasm__) || defined(__WASM__)
#define HWY_ARCH_WASM 1
#else
#define HWY_ARCH_WASM 0
#endif
#ifdef __riscv
#define HWY_ARCH_RVV 1
#else
#define HWY_ARCH_RVV 0
#endif
// It is an error to detect multiple architectures at the same time, but OK to
// detect none of the above.
#if (HWY_ARCH_X86 + HWY_ARCH_PPC + HWY_ARCH_ARM + HWY_ARCH_WASM + \
HWY_ARCH_RVV) > 1
#error "Must not detect more than one architecture"
#endif
//------------------------------------------------------------------------------
// Macros
@ -295,9 +146,37 @@
} \
} while (0)
// Only for "debug" builds
#if !defined(NDEBUG) || defined(ADDRESS_SANITIZER) || \
defined(MEMORY_SANITIZER) || defined(THREAD_SANITIZER)
#if HWY_HAS_FEATURE(memory_sanitizer) || defined(MEMORY_SANITIZER)
#define HWY_IS_MSAN 1
#else
#define HWY_IS_MSAN 0
#endif
#if HWY_HAS_FEATURE(address_sanitizer) || defined(ADDRESS_SANITIZER)
#define HWY_IS_ASAN 1
#else
#define HWY_IS_ASAN 0
#endif
#if HWY_HAS_FEATURE(thread_sanitizer) || defined(THREAD_SANITIZER)
#define HWY_IS_TSAN 1
#else
#define HWY_IS_TSAN 0
#endif
// For enabling HWY_DASSERT and shortening tests in slower debug builds
#if !defined(HWY_IS_DEBUG_BUILD)
// Clang does not define NDEBUG, but it and GCC define __OPTIMIZE__, and recent
// MSVC defines NDEBUG (if not, could instead check _DEBUG).
#if (!defined(__OPTIMIZE__) && !defined(NDEBUG)) || HWY_IS_ASAN || \
HWY_IS_MSAN || HWY_IS_TSAN || defined(__clang_analyzer__)
#define HWY_IS_DEBUG_BUILD 1
#else
#define HWY_IS_DEBUG_BUILD 0
#endif
#endif // HWY_IS_DEBUG_BUILD
#if HWY_IS_DEBUG_BUILD
#define HWY_DASSERT(condition) HWY_ASSERT(condition)
#else
#define HWY_DASSERT(condition) \
@ -305,24 +184,33 @@
} while (0)
#endif
#if defined(HWY_EMULATE_SVE)
class FarmFloat16;
#endif
namespace hwy {
//------------------------------------------------------------------------------
// Alignment
// kMaxVectorSize (undocumented, pending removal)
// Not guaranteed to be an upper bound, but the alignment established by
// aligned_allocator is HWY_MAX(HWY_ALIGNMENT, kMaxVectorSize).
#if HWY_ARCH_X86
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 64; // AVX-512
#define HWY_ALIGN_MAX alignas(64)
#elif HWY_ARCH_RVV
// Not actually an upper bound on the size, but this value prevents crossing a
// 4K boundary (relevant on Andes).
#elif HWY_ARCH_RVV && defined(__riscv_vector)
// Not actually an upper bound on the size.
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 4096;
#define HWY_ALIGN_MAX alignas(8) // only elements need be aligned
#else
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
#endif
//------------------------------------------------------------------------------
// Alignment
// For stack-allocated partial arrays or LoadDup128.
#if HWY_ARCH_X86
#define HWY_ALIGN_MAX alignas(64)
#elif HWY_ARCH_RVV && defined(__riscv_vector)
#define HWY_ALIGN_MAX alignas(8) // only elements need be aligned
#else
#define HWY_ALIGN_MAX alignas(16)
#endif
@ -332,26 +220,33 @@ static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
// Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
// by concatenating base type and bits.
// RVV already has a builtin type and the GCC intrinsics require it.
#if HWY_ARCH_RVV && HWY_COMPILER_GCC
#if HWY_ARCH_ARM && (__ARM_FP & 2)
#define HWY_NATIVE_FLOAT16 1
#else
#define HWY_NATIVE_FLOAT16 0
#endif
#if HWY_NATIVE_FLOAT16
#pragma pack(push, 1)
#if defined(HWY_EMULATE_SVE)
using float16_t = FarmFloat16;
#elif HWY_NATIVE_FLOAT16
using float16_t = __fp16;
// Clang does not allow __fp16 arguments, but scalar.h requires LaneType
// arguments, so use a wrapper.
// TODO(janwas): replace with _Float16 when that is supported?
#else
#pragma pack(push, 1)
struct float16_t {
uint16_t bits;
};
#pragma pack(pop)
#endif
struct bfloat16_t {
uint16_t bits;
};
#pragma pack(pop)
using float32_t = float;
using float64_t = double;
@ -368,15 +263,34 @@ struct EnableIfT<true, T> {
template <bool Condition, class T = void>
using EnableIf = typename EnableIfT<Condition, T>::type;
template <typename T, typename U>
struct IsSameT {
enum { value = 0 };
};
template <typename T>
struct IsSameT<T, T> {
enum { value = 1 };
};
template <typename T, typename U>
HWY_API constexpr bool IsSame() {
return IsSameT<T, U>::value;
}
// Insert into template/function arguments to enable this overload only for
// vectors of AT MOST this many bits.
//
// Note that enabling for exactly 128 bits is unnecessary because a function can
// simply be overloaded with Vec128<T> and Full128<T> descriptor. Enabling for
// other sizes (e.g. 64 bit) can be achieved with Simd<T, 8 / sizeof(T)>.
// simply be overloaded with Vec128<T> and/or Full128<T> tag. Enabling for other
// sizes (e.g. 64 bit) can be achieved via Simd<T, 8 / sizeof(T)>.
#define HWY_IF_LE128(T, N) hwy::EnableIf<N * sizeof(T) <= 16>* = nullptr
#define HWY_IF_LE64(T, N) hwy::EnableIf<N * sizeof(T) <= 8>* = nullptr
#define HWY_IF_LE32(T, N) hwy::EnableIf<N * sizeof(T) <= 4>* = nullptr
#define HWY_IF_GE32(T, N) hwy::EnableIf<N * sizeof(T) >= 4>* = nullptr
#define HWY_IF_GE64(T, N) hwy::EnableIf<N * sizeof(T) >= 8>* = nullptr
#define HWY_IF_GE128(T, N) hwy::EnableIf<N * sizeof(T) >= 16>* = nullptr
#define HWY_IF_GT128(T, N) hwy::EnableIf<(N * sizeof(T) > 16)>* = nullptr
#define HWY_IF_UNSIGNED(T) hwy::EnableIf<!IsSigned<T>()>* = nullptr
#define HWY_IF_SIGNED(T) \
@ -393,28 +307,50 @@ using EnableIf = typename EnableIfT<Condition, T>::type;
template <size_t N>
struct SizeTag {};
template <class T>
struct RemoveConstT {
using type = T;
};
template <class T>
struct RemoveConstT<const T> {
using type = T;
};
template <class T>
using RemoveConst = typename RemoveConstT<T>::type;
//------------------------------------------------------------------------------
// Type traits
template <typename T>
constexpr bool IsFloat() {
return T(1.25) != T(1);
HWY_API constexpr bool IsFloat() {
// Cannot use T(1.25) != T(1) for float16_t, which can only be converted to or
// from a float, not compared.
return IsSame<T, float>() || IsSame<T, double>();
}
template <typename T>
constexpr bool IsSigned() {
HWY_API constexpr bool IsSigned() {
return T(0) > T(-1);
}
template <>
constexpr bool IsSigned<float16_t>() {
return true;
}
template <>
constexpr bool IsSigned<bfloat16_t>() {
return true;
}
// Largest/smallest representable integer values.
template <typename T>
constexpr T LimitsMax() {
HWY_API constexpr T LimitsMax() {
static_assert(!IsFloat<T>(), "Only for integer types");
return IsSigned<T>() ? T((1ULL << (sizeof(T) * 8 - 1)) - 1)
: static_cast<T>(~0ull);
}
template <typename T>
constexpr T LimitsMin() {
HWY_API constexpr T LimitsMin() {
static_assert(!IsFloat<T>(), "Only for integer types");
return IsSigned<T>() ? T(-1) - LimitsMax<T>() : T(0);
}
@ -422,7 +358,7 @@ constexpr T LimitsMin() {
// Largest/smallest representable value (integer or float). This naming avoids
// confusion with numeric_limits<float>::min() (the smallest positive value).
template <typename T>
constexpr T LowestValue() {
HWY_API constexpr T LowestValue() {
return LimitsMin<T>();
}
template <>
@ -435,7 +371,7 @@ constexpr double LowestValue<double>() {
}
template <typename T>
constexpr T HighestValue() {
HWY_API constexpr T HighestValue() {
return LimitsMax<T>();
}
template <>
@ -550,11 +486,18 @@ struct Relations<float16_t> {
using Wide = float;
};
template <>
struct Relations<bfloat16_t> {
using Unsigned = uint16_t;
using Signed = int16_t;
using Wide = float;
};
template <>
struct Relations<float> {
using Unsigned = uint32_t;
using Signed = int32_t;
using Float = float;
using Wide = double;
using Narrow = float16_t;
};
template <>
struct Relations<double> {
@ -564,6 +507,31 @@ struct Relations<double> {
using Narrow = float;
};
template <size_t N>
struct TypeFromSize;
template <>
struct TypeFromSize<1> {
using Unsigned = uint8_t;
using Signed = int8_t;
};
template <>
struct TypeFromSize<2> {
using Unsigned = uint16_t;
using Signed = int16_t;
};
template <>
struct TypeFromSize<4> {
using Unsigned = uint32_t;
using Signed = int32_t;
using Float = float;
};
template <>
struct TypeFromSize<8> {
using Unsigned = uint64_t;
using Signed = int64_t;
using Float = double;
};
} // namespace detail
// Aliases for types of a different category, but the same size.
@ -580,6 +548,14 @@ using MakeWide = typename detail::Relations<T>::Wide;
template <typename T>
using MakeNarrow = typename detail::Relations<T>::Narrow;
// Obtain type from its size [bytes].
template <size_t N>
using UnsignedFromSize = typename detail::TypeFromSize<N>::Unsigned;
template <size_t N>
using SignedFromSize = typename detail::TypeFromSize<N>::Signed;
template <size_t N>
using FloatFromSize = typename detail::TypeFromSize<N>::Float;
//------------------------------------------------------------------------------
// Helper functions
@ -599,27 +575,123 @@ HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
unsigned long index; // NOLINT
_BitScanForward(&index, x);
return index;
#else // HWY_COMPILER_MSVC
#else // HWY_COMPILER_MSVC
return static_cast<size_t>(__builtin_ctz(x));
#endif // HWY_COMPILER_MSVC
}
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x) {
#if HWY_COMPILER_MSVC
#if HWY_ARCH_X86_64
unsigned long index; // NOLINT
_BitScanForward64(&index, x);
return index;
#else // HWY_ARCH_X86_64
// _BitScanForward64 not available
uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
unsigned long index;
if (lsb == 0) {
uint32_t msb = static_cast<uint32_t>(x >> 32u);
_BitScanForward(&index, msb);
return 32 + index;
} else {
_BitScanForward(&index, lsb);
return index;
}
#endif // HWY_ARCH_X86_64
#else // HWY_COMPILER_MSVC
return static_cast<size_t>(__builtin_ctzll(x));
#endif // HWY_COMPILER_MSVC
}
// Undefined results for x == 0.
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x) {
#if HWY_COMPILER_MSVC
unsigned long index; // NOLINT
_BitScanReverse(&index, x);
return 31 - index;
#else // HWY_COMPILER_MSVC
return static_cast<size_t>(__builtin_clz(x));
#endif // HWY_COMPILER_MSVC
}
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) {
#if HWY_COMPILER_MSVC
#if HWY_ARCH_X86_64
unsigned long index; // NOLINT
_BitScanReverse64(&index, x);
return 63 - index;
#else // HWY_ARCH_X86_64
// _BitScanReverse64 not available
const uint32_t msb = static_cast<uint32_t>(x >> 32u);
unsigned long index;
if (msb == 0) {
const uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
_BitScanReverse(&index, lsb);
return 63 - index;
} else {
_BitScanReverse(&index, msb);
return 31 - index;
}
#endif // HWY_ARCH_X86_64
#else // HWY_COMPILER_MSVC
return static_cast<size_t>(__builtin_clzll(x));
#endif // HWY_COMPILER_MSVC
}
HWY_API size_t PopCount(uint64_t x) {
#if HWY_COMPILER_CLANG || HWY_COMPILER_GCC
return static_cast<size_t>(__builtin_popcountll(x));
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
// This instruction has a separate feature flag, but is often called from
// non-SIMD code, so we don't want to require dynamic dispatch. It was first
// supported by Intel in Nehalem (SSE4.2), but MSVC only predefines a macro
// for AVX, so check for that.
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 && defined(__AVX__)
return _mm_popcnt_u64(x);
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
return _mm_popcnt_u32(uint32_t(x)) + _mm_popcnt_u32(uint32_t(x >> 32));
#else
x -= ((x >> 1) & 0x55555555U);
x = (((x >> 2) & 0x33333333U) + (x & 0x33333333U));
x = (((x >> 4) + x) & 0x0F0F0F0FU);
x -= ((x >> 1) & 0x5555555555555555ULL);
x = (((x >> 2) & 0x3333333333333333ULL) + (x & 0x3333333333333333ULL));
x = (((x >> 4) + x) & 0x0F0F0F0F0F0F0F0FULL);
x += (x >> 8);
x += (x >> 16);
x += (x >> 32);
x = x & 0x0000007FU;
return (unsigned int)x;
return static_cast<size_t>(x & 0x7Fu);
#endif
}
template <typename TI>
HWY_API constexpr size_t FloorLog2(TI x) {
return x == 1 ? 0 : FloorLog2(x >> 1) + 1;
}
template <typename TI>
HWY_API constexpr size_t CeilLog2(TI x) {
return x == 1 ? 0 : FloorLog2(x - 1) + 1;
}
#if HWY_COMPILER_MSVC && HWY_ARCH_X86_64
#pragma intrinsic(_umul128)
#endif
// 64 x 64 = 128 bit multiplication
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT upper) {
#if defined(__SIZEOF_INT128__)
__uint128_t product = (__uint128_t)a * (__uint128_t)b;
*upper = (uint64_t)(product >> 64);
return (uint64_t)(product & 0xFFFFFFFFFFFFFFFFULL);
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
return _umul128(a, b, upper);
#else
constexpr uint64_t kLo32 = 0xFFFFFFFFU;
const uint64_t lo_lo = (a & kLo32) * (b & kLo32);
const uint64_t hi_lo = (a >> 32) * (b & kLo32);
const uint64_t lo_hi = (a & kLo32) * (b >> 32);
const uint64_t hi_hi = (a >> 32) * (b >> 32);
const uint64_t t = (lo_lo >> 32) + (hi_lo & kLo32) + lo_hi;
*upper = (hi_lo >> 32) + (t >> 32) + hi_hi;
return (t << 32) | (lo_lo & kLo32);
#endif
}
@ -639,6 +711,22 @@ HWY_API void CopyBytes(const From* from, To* to) {
#endif
}
HWY_API float F32FromBF16(bfloat16_t bf) {
uint32_t bits = bf.bits;
bits <<= 16;
float f;
CopyBytes<4>(&bits, &f);
return f;
}
HWY_API bfloat16_t BF16FromF32(float f) {
uint32_t bits;
CopyBytes<4>(&f, &bits);
bfloat16_t bf;
bf.bits = static_cast<uint16_t>(bits >> 16);
return bf;
}
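// Round-trip sketch (illustrative, not part of this patch): bfloat16 keeps only
// the upper 16 bits of the IEEE-754 binary32 representation, so converting back
// and forth truncates the mantissa to 7 bits.
static inline float RoundTripBF16_Example(float f) {
  return F32FromBF16(BF16FromF32(f));
}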
HWY_NORETURN void HWY_FORMAT(3, 4)
Abort(const char* file, int line, const char* format, ...);

55
third_party/highway/hwy/base_test.cc vendored
View file

@ -90,6 +90,51 @@ HWY_NOINLINE void TestAllType() {
ForFloatTypes(TestIsFloat());
}
struct TestIsSame {
template <class T>
HWY_NOINLINE void operator()(T /*unused*/) const {
static_assert(IsSame<T, T>(), "T == T");
static_assert(!IsSame<MakeSigned<T>, MakeUnsigned<T>>(), "S != U");
static_assert(!IsSame<MakeUnsigned<T>, MakeSigned<T>>(), "U != S");
}
};
HWY_NOINLINE void TestAllIsSame() { ForAllTypes(TestIsSame()); }
HWY_NOINLINE void TestAllBitScan() {
HWY_ASSERT_EQ(size_t(0), Num0BitsAboveMS1Bit_Nonzero32(0x80000000u));
HWY_ASSERT_EQ(size_t(0), Num0BitsAboveMS1Bit_Nonzero32(0xFFFFFFFFu));
HWY_ASSERT_EQ(size_t(1), Num0BitsAboveMS1Bit_Nonzero32(0x40000000u));
HWY_ASSERT_EQ(size_t(1), Num0BitsAboveMS1Bit_Nonzero32(0x40108210u));
HWY_ASSERT_EQ(size_t(30), Num0BitsAboveMS1Bit_Nonzero32(2u));
HWY_ASSERT_EQ(size_t(30), Num0BitsAboveMS1Bit_Nonzero32(3u));
HWY_ASSERT_EQ(size_t(31), Num0BitsAboveMS1Bit_Nonzero32(1u));
HWY_ASSERT_EQ(size_t(0),
Num0BitsAboveMS1Bit_Nonzero64(0x8000000000000000ull));
HWY_ASSERT_EQ(size_t(0),
Num0BitsAboveMS1Bit_Nonzero64(0xFFFFFFFFFFFFFFFFull));
HWY_ASSERT_EQ(size_t(1),
Num0BitsAboveMS1Bit_Nonzero64(0x4000000000000000ull));
HWY_ASSERT_EQ(size_t(1),
Num0BitsAboveMS1Bit_Nonzero64(0x4010821004200011ull));
HWY_ASSERT_EQ(size_t(62), Num0BitsAboveMS1Bit_Nonzero64(2ull));
HWY_ASSERT_EQ(size_t(62), Num0BitsAboveMS1Bit_Nonzero64(3ull));
HWY_ASSERT_EQ(size_t(63), Num0BitsAboveMS1Bit_Nonzero64(1ull));
HWY_ASSERT_EQ(size_t(0), Num0BitsBelowLS1Bit_Nonzero32(1u));
HWY_ASSERT_EQ(size_t(1), Num0BitsBelowLS1Bit_Nonzero32(2u));
HWY_ASSERT_EQ(size_t(30), Num0BitsBelowLS1Bit_Nonzero32(0xC0000000u));
HWY_ASSERT_EQ(size_t(31), Num0BitsBelowLS1Bit_Nonzero32(0x80000000u));
HWY_ASSERT_EQ(size_t(0), Num0BitsBelowLS1Bit_Nonzero64(1ull));
HWY_ASSERT_EQ(size_t(1), Num0BitsBelowLS1Bit_Nonzero64(2ull));
HWY_ASSERT_EQ(size_t(62),
Num0BitsBelowLS1Bit_Nonzero64(0xC000000000000000ull));
HWY_ASSERT_EQ(size_t(63),
Num0BitsBelowLS1Bit_Nonzero64(0x8000000000000000ull));
}
HWY_NOINLINE void TestAllPopCount() {
HWY_ASSERT_EQ(size_t(0), PopCount(0u));
HWY_ASSERT_EQ(size_t(1), PopCount(1u));
@ -113,11 +158,21 @@ HWY_NOINLINE void TestAllPopCount() {
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
HWY_BEFORE_TEST(BaseTest);
HWY_EXPORT_AND_TEST_P(BaseTest, TestAllLimits);
HWY_EXPORT_AND_TEST_P(BaseTest, TestAllLowestHighest);
HWY_EXPORT_AND_TEST_P(BaseTest, TestAllType);
HWY_EXPORT_AND_TEST_P(BaseTest, TestAllIsSame);
HWY_EXPORT_AND_TEST_P(BaseTest, TestAllBitScan);
HWY_EXPORT_AND_TEST_P(BaseTest, TestAllPopCount);
} // namespace hwy
// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char **argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
#endif

18
third_party/highway/hwy/cache_control.h vendored
View file

@ -55,20 +55,28 @@ namespace hwy {
// Delays subsequent loads until prior loads are visible. On Intel CPUs, also
// serves as a full fence (waits for all prior instructions to complete).
// No effect on non-x86.
// DEPRECATED due to differing behavior across architectures AND vendors.
HWY_INLINE HWY_ATTR_CACHE void LoadFence() {
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
_mm_lfence();
#endif
}
// Ensures previous weakly-ordered stores are visible. No effect on non-x86.
HWY_INLINE HWY_ATTR_CACHE void StoreFence() {
// Ensures values written by previous `Stream` calls are visible on the current
// core. This is NOT sufficient for synchronizing across cores; when `Stream`
// outputs are to be consumed by other core(s), the producer must publish
// availability (e.g. via mutex or atomic_flag) after `FlushStream`.
HWY_INLINE HWY_ATTR_CACHE void FlushStream() {
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
_mm_sfence();
#endif
}
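// Producer-side sketch (illustrative, not part of this patch): after weakly
// ordered Stream() stores, call FlushStream() before publishing availability to
// consumer cores. The names below (output, ready_flag) are assumptions, not
// library API:
//
//   Stream(v, d, output + i);                           // non-temporal store
//   FlushStream();                                      // make the data visible
//   ready_flag.store(true, std::memory_order_release);  // publish to consumers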
// Begins loading the cache line containing "p".
// DEPRECATED, replace with `FlushStream`.
HWY_INLINE HWY_ATTR_CACHE void StoreFence() { FlushStream(); }
// Optionally begins loading the cache line containing "p" to reduce latency of
// subsequent actual loads.
template <typename T>
HWY_INLINE HWY_ATTR_CACHE void Prefetch(const T* p) {
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
@ -82,7 +90,7 @@ HWY_INLINE HWY_ATTR_CACHE void Prefetch(const T* p) {
#endif
}
// Invalidates and flushes the cache line containing "p". No effect on non-x86.
// Invalidates and flushes the cache line containing "p", if possible.
HWY_INLINE HWY_ATTR_CACHE void FlushCacheline(const void* p) {
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
_mm_clflush(p);
@ -91,7 +99,7 @@ HWY_INLINE HWY_ATTR_CACHE void FlushCacheline(const void* p) {
#endif
}
// Reduces power consumption in spin-loops. No effect on non-x86.
// When called inside a spin-loop, may reduce power consumption.
HWY_INLINE HWY_ATTR_CACHE void Pause() {
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
_mm_pause();

258
third_party/highway/hwy/contrib/dot/dot-inl.h vendored Normal file
View file

@ -0,0 +1,258 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Include guard (still compiled once per target)
#include <cmath>
#if defined(HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_) == \
defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_
#undef HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_
#else
#define HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_
#endif
#include "hwy/highway.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
struct Dot {
// Specify zero or more of these, ORed together, as the kAssumptions template
// argument to Compute. Each one may improve performance or reduce code size,
// at the cost of additional requirements on the arguments.
enum Assumptions {
// num_elements is at least N, which may be up to HWY_MAX_LANES(T).
kAtLeastOneVector = 1,
// num_elements is divisible by N (a power of two, so this can be used if
// the problem size is known to be a power of two >= HWY_MAX_LANES(T)).
kMultipleOfVector = 2,
// RoundUpTo(num_elements, N) elements are accessible; their value does not
// matter (will be treated as if they were zero).
kPaddedToVector = 4,
// Pointers pa and pb, respectively, are multiples of N * sizeof(T).
// For example, aligned_allocator.h ensures this. Note that it is still
// beneficial to ensure such alignment even if these flags are not set.
// If not set, the pointers need only be aligned to alignof(T).
kVectorAlignedA = 8,
kVectorAlignedB = 16,
};
// Returns sum{pa[i] * pb[i]} for float or double inputs.
template <int kAssumptions, class D, typename T = TFromD<D>,
HWY_IF_NOT_LANE_SIZE_D(D, 2)>
static HWY_INLINE T Compute(const D d, const T* const HWY_RESTRICT pa,
const T* const HWY_RESTRICT pb,
const size_t num_elements) {
static_assert(IsFloat<T>(), "MulAdd requires float type");
using V = decltype(Zero(d));
const size_t N = Lanes(d);
size_t i = 0;
constexpr bool kIsAtLeastOneVector =
(kAssumptions & kAtLeastOneVector) != 0;
constexpr bool kIsMultipleOfVector =
(kAssumptions & kMultipleOfVector) != 0;
constexpr bool kIsPaddedToVector = (kAssumptions & kPaddedToVector) != 0;
constexpr bool kIsAlignedA = (kAssumptions & kVectorAlignedA) != 0;
constexpr bool kIsAlignedB = (kAssumptions & kVectorAlignedB) != 0;
// Won't be able to do a full vector load without padding => scalar loop.
if (!kIsAtLeastOneVector && !kIsMultipleOfVector && !kIsPaddedToVector &&
HWY_UNLIKELY(num_elements < N)) {
// Only 2x unroll to avoid excessive code size.
T sum0 = T(0);
T sum1 = T(0);
for (; i + 2 <= num_elements; i += 2) {
sum0 += pa[i + 0] * pb[i + 0];
sum1 += pa[i + 1] * pb[i + 1];
}
if (i < num_elements) {
sum1 += pa[i] * pb[i];
}
return sum0 + sum1;
}
// Compiler doesn't make independent sum* accumulators, so unroll manually.
// 2 FMA ports * 4 cycle latency = up to 8 in-flight, but that is excessive
// for unaligned inputs (each unaligned pointer halves the throughput
// because it occupies both L1 load ports for a cycle). We cannot have
// arrays of vectors on RVV/SVE, so always unroll 4x.
V sum0 = Zero(d);
V sum1 = Zero(d);
V sum2 = Zero(d);
V sum3 = Zero(d);
// Main loop: unrolled
for (; i + 4 * N <= num_elements; /* i += 4 * N */) { // incr in loop
const auto a0 = kIsAlignedA ? Load(d, pa + i) : LoadU(d, pa + i);
const auto b0 = kIsAlignedB ? Load(d, pb + i) : LoadU(d, pb + i);
i += N;
sum0 = MulAdd(a0, b0, sum0);
const auto a1 = kIsAlignedA ? Load(d, pa + i) : LoadU(d, pa + i);
const auto b1 = kIsAlignedB ? Load(d, pb + i) : LoadU(d, pb + i);
i += N;
sum1 = MulAdd(a1, b1, sum1);
const auto a2 = kIsAlignedA ? Load(d, pa + i) : LoadU(d, pa + i);
const auto b2 = kIsAlignedB ? Load(d, pb + i) : LoadU(d, pb + i);
i += N;
sum2 = MulAdd(a2, b2, sum2);
const auto a3 = kIsAlignedA ? Load(d, pa + i) : LoadU(d, pa + i);
const auto b3 = kIsAlignedB ? Load(d, pb + i) : LoadU(d, pb + i);
i += N;
sum3 = MulAdd(a3, b3, sum3);
}
// Up to 3 iterations of whole vectors
for (; i + N <= num_elements; i += N) {
const auto a = kIsAlignedA ? Load(d, pa + i) : LoadU(d, pa + i);
const auto b = kIsAlignedB ? Load(d, pb + i) : LoadU(d, pb + i);
sum0 = MulAdd(a, b, sum0);
}
if (!kIsMultipleOfVector) {
const size_t remaining = num_elements - i;
if (remaining != 0) {
if (kIsPaddedToVector) {
const auto mask = FirstN(d, remaining);
const auto a = kIsAlignedA ? Load(d, pa + i) : LoadU(d, pa + i);
const auto b = kIsAlignedB ? Load(d, pb + i) : LoadU(d, pb + i);
sum1 = MulAdd(IfThenElseZero(mask, a), IfThenElseZero(mask, b), sum1);
} else {
// Unaligned load such that the last element is in the highest lane -
// ensures we do not touch any elements outside the valid range.
// If we get here, then num_elements >= N.
HWY_DASSERT(i >= N);
i += remaining - N;
const auto skip = FirstN(d, N - remaining);
const auto a = LoadU(d, pa + i); // always unaligned
const auto b = LoadU(d, pb + i);
sum1 = MulAdd(IfThenZeroElse(skip, a), IfThenZeroElse(skip, b), sum1);
}
}
} // kMultipleOfVector
// Reduction tree: sum of all accumulators by pairs, then across lanes.
sum0 = Add(sum0, sum1);
sum2 = Add(sum2, sum3);
sum0 = Add(sum0, sum2);
return GetLane(SumOfLanes(d, sum0));
}
// Returns sum{pa[i] * pb[i]} for bfloat16 inputs.
template <int kAssumptions, class D>
static HWY_INLINE float Compute(const D d,
const bfloat16_t* const HWY_RESTRICT pa,
const bfloat16_t* const HWY_RESTRICT pb,
const size_t num_elements) {
const RebindToUnsigned<D> du16;
const Repartition<float, D> df32;
using V = decltype(Zero(df32));
const size_t N = Lanes(d);
size_t i = 0;
constexpr bool kIsAtLeastOneVector =
(kAssumptions & kAtLeastOneVector) != 0;
constexpr bool kIsMultipleOfVector =
(kAssumptions & kMultipleOfVector) != 0;
constexpr bool kIsPaddedToVector = (kAssumptions & kPaddedToVector) != 0;
constexpr bool kIsAlignedA = (kAssumptions & kVectorAlignedA) != 0;
constexpr bool kIsAlignedB = (kAssumptions & kVectorAlignedB) != 0;
// Won't be able to do a full vector load without padding => scalar loop.
if (!kIsAtLeastOneVector && !kIsMultipleOfVector && !kIsPaddedToVector &&
HWY_UNLIKELY(num_elements < N)) {
float sum0 = 0.0f; // Only 2x unroll to avoid excessive code size for..
float sum1 = 0.0f; // this unlikely(?) case.
for (; i + 2 <= num_elements; i += 2) {
sum0 += F32FromBF16(pa[i + 0]) * F32FromBF16(pb[i + 0]);
sum1 += F32FromBF16(pa[i + 1]) * F32FromBF16(pb[i + 1]);
}
if (i < num_elements) {
sum1 += F32FromBF16(pa[i]) * F32FromBF16(pb[i]);
}
return sum0 + sum1;
}
// See comment in the other Compute() overload. Unroll 2x, but we need
// twice as many sums for ReorderWidenMulAccumulate.
V sum0 = Zero(df32);
V sum1 = Zero(df32);
V sum2 = Zero(df32);
V sum3 = Zero(df32);
// Main loop: unrolled
for (; i + 2 * N <= num_elements; /* i += 2 * N */) { // incr in loop
const auto a0 = kIsAlignedA ? Load(d, pa + i) : LoadU(d, pa + i);
const auto b0 = kIsAlignedB ? Load(d, pb + i) : LoadU(d, pb + i);
i += N;
sum0 = ReorderWidenMulAccumulate(df32, a0, b0, sum0, sum1);
const auto a1 = kIsAlignedA ? Load(d, pa + i) : LoadU(d, pa + i);
const auto b1 = kIsAlignedB ? Load(d, pb + i) : LoadU(d, pb + i);
i += N;
sum2 = ReorderWidenMulAccumulate(df32, a1, b1, sum2, sum3);
}
// Possibly one more iteration of whole vectors
if (i + N <= num_elements) {
const auto a0 = kIsAlignedA ? Load(d, pa + i) : LoadU(d, pa + i);
const auto b0 = kIsAlignedB ? Load(d, pb + i) : LoadU(d, pb + i);
i += N;
sum0 = ReorderWidenMulAccumulate(df32, a0, b0, sum0, sum1);
}
if (!kIsMultipleOfVector) {
const size_t remaining = num_elements - i;
if (remaining != 0) {
if (kIsPaddedToVector) {
const auto mask = FirstN(du16, remaining);
const auto va = kIsAlignedA ? Load(d, pa + i) : LoadU(d, pa + i);
const auto vb = kIsAlignedB ? Load(d, pb + i) : LoadU(d, pb + i);
const auto a16 = BitCast(d, IfThenElseZero(mask, BitCast(du16, va)));
const auto b16 = BitCast(d, IfThenElseZero(mask, BitCast(du16, vb)));
sum2 = ReorderWidenMulAccumulate(df32, a16, b16, sum2, sum3);
} else {
// Unaligned load such that the last element is in the highest lane -
// ensures we do not touch any elements outside the valid range.
// If we get here, then num_elements >= N.
HWY_DASSERT(i >= N);
i += remaining - N;
const auto skip = FirstN(du16, N - remaining);
const auto va = LoadU(d, pa + i); // always unaligned
const auto vb = LoadU(d, pb + i);
const auto a16 = BitCast(d, IfThenZeroElse(skip, BitCast(du16, va)));
const auto b16 = BitCast(d, IfThenZeroElse(skip, BitCast(du16, vb)));
sum2 = ReorderWidenMulAccumulate(df32, a16, b16, sum2, sum3);
}
}
} // kMultipleOfVector
// Reduction tree: sum of all accumulators by pairs, then across lanes.
sum0 = Add(sum0, sum1);
sum2 = Add(sum2, sum3);
sum0 = Add(sum0, sum2);
return GetLane(SumOfLanes(df32, sum0));
}
};
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#endif // HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_
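// Usage sketch for Dot::Compute (illustrative, not part of this patch), assuming
// float buffers a/b padded to a whole vector as kPaddedToVector requires:
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::ScalableTag<float> d;
//   const float dot = hn::Dot::Compute<hn::Dot::kPaddedToVector>(d, a, b, num);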

193
third_party/highway/hwy/contrib/dot/dot_test.cc vendored Normal file
View file

@ -0,0 +1,193 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include "hwy/aligned_allocator.h"
// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/dot/dot_test.cc"
#include "hwy/foreach_target.h"
#include "hwy/contrib/dot/dot-inl.h"
#include "hwy/tests/test_util-inl.h"
// clang-format on
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
template <typename T>
HWY_NOINLINE T SimpleDot(const T* pa, const T* pb, size_t num) {
double sum = 0.0;
for (size_t i = 0; i < num; ++i) {
sum += pa[i] * pb[i];
}
return static_cast<T>(sum);
}
HWY_NOINLINE float SimpleDot(const bfloat16_t* pa, const bfloat16_t* pb,
size_t num) {
float sum = 0.0f;
for (size_t i = 0; i < num; ++i) {
sum += F32FromBF16(pa[i]) * F32FromBF16(pb[i]);
}
return sum;
}
template <typename T>
void SetValue(const float value, T* HWY_RESTRICT ptr) {
*ptr = static_cast<T>(value);
}
void SetValue(const float value, bfloat16_t* HWY_RESTRICT ptr) {
*ptr = BF16FromF32(value);
}
class TestDot {
// Computes/verifies one dot product.
template <int kAssumptions, class D>
void Test(D d, size_t num, size_t misalign_a, size_t misalign_b,
RandomState& rng) {
using T = TFromD<D>;
const size_t N = Lanes(d);
const auto random_t = [&rng]() {
const int32_t bits = static_cast<int32_t>(Random32(&rng)) & 1023;
return static_cast<float>(bits - 512) * (1.0f / 64);
};
const bool kIsAlignedA = (kAssumptions & Dot::kVectorAlignedA) != 0;
const bool kIsAlignedB = (kAssumptions & Dot::kVectorAlignedB) != 0;
HWY_ASSERT(!kIsAlignedA || misalign_a == 0);
HWY_ASSERT(!kIsAlignedB || misalign_b == 0);
const size_t padded =
(kAssumptions & Dot::kPaddedToVector) ? RoundUpTo(num, N) : num;
AlignedFreeUniquePtr<T[]> pa = AllocateAligned<T>(misalign_a + padded);
AlignedFreeUniquePtr<T[]> pb = AllocateAligned<T>(misalign_b + padded);
T* a = pa.get() + misalign_a;
T* b = pb.get() + misalign_b;
size_t i = 0;
for (; i < num; ++i) {
SetValue(random_t(), a + i);
SetValue(random_t(), b + i);
}
// Fill padding with NaN - the values are not used, but avoids MSAN errors.
for (; i < padded; ++i) {
ScalableTag<float> df1;
SetValue(GetLane(NaN(df1)), a + i);
SetValue(GetLane(NaN(df1)), b + i);
}
const auto expected = SimpleDot(a, b, num);
const auto actual = Dot::Compute<kAssumptions>(d, a, b, num);
const auto max = static_cast<decltype(actual)>(8 * 8 * num);
HWY_ASSERT(-max <= actual && actual <= max);
HWY_ASSERT(expected - 1E-4 <= actual && actual <= expected + 1E-4);
}
// Runs tests with various alignments compatible with the given assumptions.
template <int kAssumptions, class D>
void ForeachMisalign(D d, size_t num, RandomState& rng) {
static_assert(
(kAssumptions & (Dot::kVectorAlignedA | Dot::kVectorAlignedB)) == 0,
"Alignment must not be specified by caller");
const size_t N = Lanes(d);
const size_t misalignments[3] = {0, N / 4, 3 * N / 5};
// Both flags, both aligned
Test<kAssumptions | Dot::kVectorAlignedA | Dot::kVectorAlignedB>(d, num, 0,
0, rng);
// One flag and aligned, other aligned/misaligned
for (size_t m : misalignments) {
Test<kAssumptions | Dot::kVectorAlignedA>(d, num, 0, m, rng);
Test<kAssumptions | Dot::kVectorAlignedB>(d, num, m, 0, rng);
}
// Neither flag, all combinations of aligned/misaligned
for (size_t ma : misalignments) {
for (size_t mb : misalignments) {
Test<kAssumptions>(d, num, ma, mb, rng);
}
}
}
// Runs tests with various lengths compatible with the given assumptions.
template <int kAssumptions, class D>
void ForeachCount(D d, RandomState& rng) {
const size_t N = Lanes(d);
const size_t counts[] = {1,
3,
7,
16,
HWY_MAX(N / 2, 1),
HWY_MAX(2 * N / 3, 1),
N,
N + 1,
4 * N / 3,
3 * N,
8 * N,
8 * N + 2};
for (size_t num : counts) {
if ((kAssumptions & Dot::kAtLeastOneVector) && num < N) continue;
if ((kAssumptions & Dot::kMultipleOfVector) && (num % N) != 0) continue;
ForeachMisalign<kAssumptions>(d, num, rng);
}
}
public:
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
RandomState rng;
// All 8 combinations of the three length-related flags:
ForeachCount<0>(d, rng);
ForeachCount<Dot::kAtLeastOneVector>(d, rng);
ForeachCount<Dot::kMultipleOfVector>(d, rng);
ForeachCount<Dot::kMultipleOfVector | Dot::kAtLeastOneVector>(d, rng);
ForeachCount<Dot::kPaddedToVector>(d, rng);
ForeachCount<Dot::kPaddedToVector | Dot::kAtLeastOneVector>(d, rng);
ForeachCount<Dot::kPaddedToVector | Dot::kMultipleOfVector>(d, rng);
ForeachCount<Dot::kPaddedToVector | Dot::kMultipleOfVector |
Dot::kAtLeastOneVector>(d, rng);
}
};
void TestAllDot() { ForFloatTypes(ForPartialVectors<TestDot>()); }
void TestAllDotBF16() { ForShrinkableVectors<TestDot>()(bfloat16_t()); }
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
HWY_BEFORE_TEST(DotTest);
HWY_EXPORT_AND_TEST_P(DotTest, TestAllDot);
HWY_EXPORT_AND_TEST_P(DotTest, TestAllDotBF16);
} // namespace hwy
// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
#endif

View file

@ -26,7 +26,7 @@
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
size_t GetVectorSize() { return Lanes(HWY_FULL(uint8_t)()); }
size_t GetVectorSize() { return Lanes(ScalableTag<uint8_t>()); }
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
@ -58,7 +58,7 @@ size_t ImageBase::BytesPerRow(const size_t xsize, const size_t sizeof_t) {
}
// Round up to vector and cache line size.
const size_t align = std::max<size_t>(vec_size, HWY_ALIGNMENT);
const size_t align = HWY_MAX(vec_size, HWY_ALIGNMENT);
size_t bytes_per_row = RoundUpTo(valid_bytes, align);
// During the lengthy window before writes are committed to memory, CPUs

18
third_party/highway/hwy/contrib/image/image.h vendored
View file

@ -17,6 +17,7 @@
// SIMD/multicore-friendly planar image representation with row accessors.
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
@ -102,7 +103,7 @@ struct ImageBase {
#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
defined(THREAD_SANITIZER)
if (y >= ysize_) {
HWY_ABORT("Row(%zu) >= %u\n", y, ysize_);
HWY_ABORT("Row(%" PRIu64 ") >= %u\n", static_cast<uint64_t>(y), ysize_);
}
#endif
@ -221,9 +222,14 @@ class Image3 {
Image3(ImageT&& plane0, ImageT&& plane1, ImageT&& plane2) {
if (!SameSize(plane0, plane1) || !SameSize(plane0, plane2)) {
HWY_ABORT("Not same size: %zu x %zu, %zu x %zu, %zu x %zu\n",
plane0.xsize(), plane0.ysize(), plane1.xsize(), plane1.ysize(),
plane2.xsize(), plane2.ysize());
HWY_ABORT("Not same size: %" PRIu64 " x %" PRIu64 ", %" PRIu64
" x %" PRIu64 ", %" PRIu64 " x %" PRIu64 "\n",
static_cast<uint64_t>(plane0.xsize()),
static_cast<uint64_t>(plane0.ysize()),
static_cast<uint64_t>(plane1.xsize()),
static_cast<uint64_t>(plane1.ysize()),
static_cast<uint64_t>(plane2.xsize()),
static_cast<uint64_t>(plane2.ysize()));
}
planes_[0] = std::move(plane0);
planes_[1] = std::move(plane1);
@ -288,7 +294,9 @@ class Image3 {
#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
defined(THREAD_SANITIZER)
if (c >= kNumPlanes || y >= ysize()) {
HWY_ABORT("PlaneRow(%zu, %zu) >= %zu\n", c, y, ysize());
HWY_ABORT("PlaneRow(%" PRIu64 ", %" PRIu64 ") >= %" PRIu64 "\n",
static_cast<uint64_t>(c), static_cast<uint64_t>(y),
static_cast<uint64_t>(ysize()));
}
#endif
// Use the first plane's stride because the compiler might not realize they

View file

@ -42,7 +42,7 @@ struct TestAlignedT {
void operator()(T /*unused*/) const {
std::mt19937 rng(129);
std::uniform_int_distribution<int> dist(0, 16);
const HWY_FULL(T) d;
const ScalableTag<T> d;
for (size_t ysize = 1; ysize < 4; ++ysize) {
for (size_t xsize = 1; xsize < 64; ++xsize) {
@ -73,7 +73,7 @@ struct TestUnalignedT {
void operator()(T /*unused*/) const {
std::mt19937 rng(129);
std::uniform_int_distribution<int> dist(0, 3);
const HWY_FULL(T) d;
const ScalableTag<T> d;
for (size_t ysize = 1; ysize < 4; ++ysize) {
for (size_t xsize = 1; xsize < 128; ++xsize) {
@ -87,7 +87,7 @@ struct TestUnalignedT {
for (size_t y = 0; y < ysize; ++y) {
T* HWY_RESTRICT row = img.MutableRow(y);
for (size_t x = 0; x < xsize; ++x) {
row[x] = 1 << dist(rng);
row[x] = static_cast<T>(1u << dist(rng));
}
}
@ -96,7 +96,7 @@ struct TestUnalignedT {
for (size_t y = 0; y < ysize; ++y) {
T* HWY_RESTRICT row = img.MutableRow(y);
for (size_t x = 0; x < xsize; ++x) {
accum |= LoadU(d, row + x);
accum = Or(accum, LoadU(d, row + x));
}
}
@ -143,9 +143,17 @@ void TestUnaligned() { ForUnsignedTypes(TestUnalignedT()); }
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
HWY_BEFORE_TEST(ImageTest);
HWY_EXPORT_AND_TEST_P(ImageTest, TestAligned);
HWY_EXPORT_AND_TEST_P(ImageTest, TestUnaligned);
} // namespace hwy
// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
#endif

View file

@ -38,7 +38,7 @@ namespace HWY_NAMESPACE {
template <class D, class V>
HWY_INLINE V Acos(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallAcos(const D d, V x) {
HWY_NOINLINE V CallAcos(const D d, VecArg<V> x) {
return Acos(d, x);
}
@ -53,7 +53,7 @@ HWY_NOINLINE V CallAcos(const D d, V x) {
template <class D, class V>
HWY_INLINE V Acosh(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallAcosh(const D d, V x) {
HWY_NOINLINE V CallAcosh(const D d, VecArg<V> x) {
return Acosh(d, x);
}
@ -68,7 +68,7 @@ HWY_NOINLINE V CallAcosh(const D d, V x) {
template <class D, class V>
HWY_INLINE V Asin(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallAsin(const D d, V x) {
HWY_NOINLINE V CallAsin(const D d, VecArg<V> x) {
return Asin(d, x);
}
@ -83,7 +83,7 @@ HWY_NOINLINE V CallAsin(const D d, V x) {
template <class D, class V>
HWY_INLINE V Asinh(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallAsinh(const D d, V x) {
HWY_NOINLINE V CallAsinh(const D d, VecArg<V> x) {
return Asinh(d, x);
}
@ -98,7 +98,7 @@ HWY_NOINLINE V CallAsinh(const D d, V x) {
template <class D, class V>
HWY_INLINE V Atan(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallAtan(const D d, V x) {
HWY_NOINLINE V CallAtan(const D d, VecArg<V> x) {
return Atan(d, x);
}
@ -113,7 +113,7 @@ HWY_NOINLINE V CallAtan(const D d, V x) {
template <class D, class V>
HWY_INLINE V Atanh(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallAtanh(const D d, V x) {
HWY_NOINLINE V CallAtanh(const D d, VecArg<V> x) {
return Atanh(d, x);
}
@ -128,7 +128,7 @@ HWY_NOINLINE V CallAtanh(const D d, V x) {
template <class D, class V>
HWY_INLINE V Cos(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallCos(const D d, V x) {
HWY_NOINLINE V CallCos(const D d, VecArg<V> x) {
return Cos(d, x);
}
@ -143,7 +143,7 @@ HWY_NOINLINE V CallCos(const D d, V x) {
template <class D, class V>
HWY_INLINE V Exp(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallExp(const D d, V x) {
HWY_NOINLINE V CallExp(const D d, VecArg<V> x) {
return Exp(d, x);
}
@ -158,7 +158,7 @@ HWY_NOINLINE V CallExp(const D d, V x) {
template <class D, class V>
HWY_INLINE V Expm1(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallExpm1(const D d, V x) {
HWY_NOINLINE V CallExpm1(const D d, VecArg<V> x) {
return Expm1(d, x);
}
@ -173,7 +173,7 @@ HWY_NOINLINE V CallExpm1(const D d, V x) {
template <class D, class V>
HWY_INLINE V Log(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallLog(const D d, V x) {
HWY_NOINLINE V CallLog(const D d, VecArg<V> x) {
return Log(d, x);
}
@ -188,7 +188,7 @@ HWY_NOINLINE V CallLog(const D d, V x) {
template <class D, class V>
HWY_INLINE V Log10(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallLog10(const D d, V x) {
HWY_NOINLINE V CallLog10(const D d, VecArg<V> x) {
return Log10(d, x);
}
@ -203,7 +203,7 @@ HWY_NOINLINE V CallLog10(const D d, V x) {
template <class D, class V>
HWY_INLINE V Log1p(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallLog1p(const D d, V x) {
HWY_NOINLINE V CallLog1p(const D d, VecArg<V> x) {
return Log1p(d, x);
}
@ -218,7 +218,7 @@ HWY_NOINLINE V CallLog1p(const D d, V x) {
template <class D, class V>
HWY_INLINE V Log2(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallLog2(const D d, V x) {
HWY_NOINLINE V CallLog2(const D d, VecArg<V> x) {
return Log2(d, x);
}
@ -233,7 +233,7 @@ HWY_NOINLINE V CallLog2(const D d, V x) {
template <class D, class V>
HWY_INLINE V Sin(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallSin(const D d, V x) {
HWY_NOINLINE V CallSin(const D d, VecArg<V> x) {
return Sin(d, x);
}
@ -248,7 +248,7 @@ HWY_NOINLINE V CallSin(const D d, V x) {
template <class D, class V>
HWY_INLINE V Sinh(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallSinh(const D d, V x) {
HWY_NOINLINE V CallSinh(const D d, VecArg<V> x) {
return Sinh(d, x);
}
@ -263,7 +263,7 @@ HWY_NOINLINE V CallSinh(const D d, V x) {
template <class D, class V>
HWY_INLINE V Tanh(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallTanh(const D d, V x) {
HWY_NOINLINE V CallTanh(const D d, VecArg<V> x) {
return Tanh(d, x);
}
@ -282,43 +282,49 @@ HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1) {
}
template <class T>
HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2) {
T x2(x * x);
T x2 = Mul(x, x);
return MulAdd(x2, c2, MulAdd(c1, x, c0));
}
template <class T>
HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3) {
T x2(x * x);
T x2 = Mul(x, x);
return MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0));
}
template <class T>
HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4) {
T x2(x * x), x4(x2 * x2);
T x2 = Mul(x, x);
T x4 = Mul(x2, x2);
return MulAdd(x4, c4, MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)));
}
template <class T>
HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5) {
T x2(x * x), x4(x2 * x2);
T x2 = Mul(x, x);
T x4 = Mul(x2, x2);
return MulAdd(x4, MulAdd(c5, x, c4),
MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)));
}
template <class T>
HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
T c6) {
T x2(x * x), x4(x2 * x2);
T x2 = Mul(x, x);
T x4 = Mul(x2, x2);
return MulAdd(x4, MulAdd(x2, c6, MulAdd(c5, x, c4)),
MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)));
}
template <class T>
HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
T c6, T c7) {
T x2(x * x), x4(x2 * x2);
T x2 = Mul(x, x);
T x4 = Mul(x2, x2);
return MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)));
}
template <class T>
HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
T c6, T c7, T c8) {
T x2(x * x), x4(x2 * x2), x8(x4 * x4);
T x2 = Mul(x, x);
T x4 = Mul(x2, x2);
T x8 = Mul(x4, x4);
return MulAdd(x8, c8,
MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
@ -326,7 +332,9 @@ HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
template <class T>
HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
T c6, T c7, T c8, T c9) {
T x2(x * x), x4(x2 * x2), x8(x4 * x4);
T x2 = Mul(x, x);
T x4 = Mul(x2, x2);
T x8 = Mul(x4, x4);
return MulAdd(x8, MulAdd(c9, x, c8),
MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
@ -334,7 +342,9 @@ HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
template <class T>
HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
T c6, T c7, T c8, T c9, T c10) {
T x2(x * x), x4(x2 * x2), x8(x4 * x4);
T x2 = Mul(x, x);
T x4 = Mul(x2, x2);
T x8 = Mul(x4, x4);
return MulAdd(x8, MulAdd(x2, c10, MulAdd(c9, x, c8)),
MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
@ -342,7 +352,9 @@ HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
template <class T>
HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
T c6, T c7, T c8, T c9, T c10, T c11) {
T x2(x * x), x4(x2 * x2), x8(x4 * x4);
T x2 = Mul(x, x);
T x4 = Mul(x2, x2);
T x8 = Mul(x4, x4);
return MulAdd(x8, MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8)),
MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
@ -351,7 +363,9 @@ template <class T>
HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
T c6, T c7, T c8, T c9, T c10, T c11,
T c12) {
T x2(x * x), x4(x2 * x2), x8(x4 * x4);
T x2 = Mul(x, x);
T x4 = Mul(x2, x2);
T x8 = Mul(x4, x4);
return MulAdd(
x8, MulAdd(x4, c12, MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
@ -361,7 +375,9 @@ template <class T>
HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
T c6, T c7, T c8, T c9, T c10, T c11,
T c12, T c13) {
T x2(x * x), x4(x2 * x2), x8(x4 * x4);
T x2 = Mul(x, x);
T x4 = Mul(x2, x2);
T x8 = Mul(x4, x4);
return MulAdd(x8,
MulAdd(x4, MulAdd(c13, x, c12),
MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
@ -372,7 +388,9 @@ template <class T>
HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
T c6, T c7, T c8, T c9, T c10, T c11,
T c12, T c13, T c14) {
T x2(x * x), x4(x2 * x2), x8(x4 * x4);
T x2 = Mul(x, x);
T x4 = Mul(x2, x2);
T x8 = Mul(x4, x4);
return MulAdd(x8,
MulAdd(x4, MulAdd(x2, c14, MulAdd(c13, x, c12)),
MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
@ -383,7 +401,9 @@ template <class T>
HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
T c6, T c7, T c8, T c9, T c10, T c11,
T c12, T c13, T c14, T c15) {
T x2(x * x), x4(x2 * x2), x8(x4 * x4);
T x2 = Mul(x, x);
T x4 = Mul(x2, x2);
T x8 = Mul(x4, x4);
return MulAdd(x8,
MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)),
MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
@ -394,7 +414,10 @@ template <class T>
HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
T c6, T c7, T c8, T c9, T c10, T c11,
T c12, T c13, T c14, T c15, T c16) {
T x2(x * x), x4(x2 * x2), x8(x4 * x4), x16(x8 * x8);
T x2 = Mul(x, x);
T x4 = Mul(x2, x2);
T x8 = Mul(x4, x4);
T x16 = Mul(x8, x8);
return MulAdd(
x16, c16,
MulAdd(x8,
@ -407,7 +430,10 @@ template <class T>
HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
T c6, T c7, T c8, T c9, T c10, T c11,
T c12, T c13, T c14, T c15, T c16, T c17) {
T x2(x * x), x4(x2 * x2), x8(x4 * x4), x16(x8 * x8);
T x2 = Mul(x, x);
T x4 = Mul(x2, x2);
T x8 = Mul(x4, x4);
T x16 = Mul(x8, x8);
return MulAdd(
x16, MulAdd(c17, x, c16),
MulAdd(x8,
@ -421,7 +447,10 @@ HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
T c6, T c7, T c8, T c9, T c10, T c11,
T c12, T c13, T c14, T c15, T c16, T c17,
T c18) {
T x2(x * x), x4(x2 * x2), x8(x4 * x4), x16(x8 * x8);
T x2 = Mul(x, x);
T x4 = Mul(x2, x2);
T x8 = Mul(x4, x4);
T x16 = Mul(x8, x8);
return MulAdd(
x16, MulAdd(x2, c18, MulAdd(c17, x, c16)),
MulAdd(x8,
@ -497,8 +526,8 @@ struct AtanImpl<float> {
const auto k6 = Set(d, -0.0159569028764963150024414f);
const auto k7 = Set(d, +0.00282363896258175373077393f);
const auto y = (x * x);
return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7), (y * x), x);
const auto y = Mul(x, x);
return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7), Mul(y, x), x);
}
};
@ -529,10 +558,10 @@ struct AtanImpl<double> {
const auto k17 = Set(d, +0.000209850076645816976906797);
const auto k18 = Set(d, -1.88796008463073496563746e-5);
const auto y = (x * x);
const auto y = Mul(x, x);
return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11,
k12, k13, k14, k15, k16, k17, k18),
(y * x), x);
Mul(y, x), x);
}
};
@ -553,8 +582,8 @@ struct CosSinImpl<float> {
const auto k2 = Set(d, -1.981069071916863322258e-4f);
const auto k3 = Set(d, +2.6083159809786593541503e-6f);
const auto y(x * x);
return MulAdd(Estrin(y, k0, k1, k2, k3), (y * x), x);
const auto y = Mul(x, x);
return MulAdd(Estrin(y, k0, k1, k2, k3), Mul(y, x), x);
}
template <class D, class V, class VI32>
@ -628,8 +657,8 @@ struct CosSinImpl<double> {
const auto k7 = Set(d, +2.81009972710863200091251e-15);
const auto k8 = Set(d, -7.97255955009037868891952e-18);
const auto y(x * x);
return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7, k8), (y * x), x);
const auto y = Mul(x, x);
return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7, k8), Mul(y, x), x);
}
template <class D, class V, class VI32>
@ -702,7 +731,7 @@ struct ExpImpl<float> {
const auto k4 = Set(d, +0.00139304355252534151077271f);
const auto k5 = Set(d, +0.000198527617612853646278381f);
return MulAdd(Estrin(x, k0, k1, k2, k3, k4, k5), (x * x), x);
return MulAdd(Estrin(x, k0, k1, k2, k3, k4, k5), Mul(x, x), x);
}
// Computes 2^x, where x is an integer.
@ -710,14 +739,14 @@ struct ExpImpl<float> {
HWY_INLINE Vec<D> Pow2I(D d, VI32 x) {
const Rebind<int32_t, D> di32;
const VI32 kOffset = Set(di32, 0x7F);
return BitCast(d, ShiftLeft<23>(x + kOffset));
return BitCast(d, ShiftLeft<23>(Add(x, kOffset)));
}
// Sets the exponent of 'x' to 2^e.
template <class D, class V, class VI32>
HWY_INLINE V LoadExpShortRange(D d, V x, VI32 e) {
const VI32 y = ShiftRight<1>(e);
return x * Pow2I(d, y) * Pow2I(d, e - y);
return Mul(Mul(x, Pow2I(d, y)), Pow2I(d, Sub(e, y)));
}
template <class D, class V, class VI32>
@ -740,7 +769,8 @@ struct LogImpl<float> {
HWY_INLINE Vec<Rebind<int32_t, D>> Log2p1NoSubnormal(D /*d*/, V x) {
const Rebind<int32_t, D> di32;
const Rebind<uint32_t, D> du32;
return BitCast(di32, ShiftRight<23>(BitCast(du32, x))) - Set(di32, 0x7F);
const auto kBias = Set(di32, 0x7F);
return Sub(BitCast(di32, ShiftRight<23>(BitCast(du32, x))), kBias);
}
// Approximates Log(x) over the range [sqrt(2) / 2, sqrt(2)].
@ -751,9 +781,9 @@ struct LogImpl<float> {
const V k2 = Set(d, 0.28498786688f);
const V k3 = Set(d, 0.24279078841f);
const V x2 = (x * x);
const V x4 = (x2 * x2);
return MulAdd(MulAdd(k2, x4, k0), x2, (MulAdd(k3, x4, k1) * x4));
const V x2 = Mul(x, x);
const V x4 = Mul(x2, x2);
return MulAdd(MulAdd(k2, x4, k0), x2, Mul(MulAdd(k3, x4, k1), x4));
}
};
@ -781,7 +811,7 @@ struct ExpImpl<double> {
const auto k10 = Set(d, +2.08860621107283687536341e-9);
return MulAdd(Estrin(x, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10),
(x * x), x);
Mul(x, x), x);
}
// Computes 2^x, where x is an integer.
@ -790,14 +820,14 @@ struct ExpImpl<double> {
const Rebind<int32_t, D> di32;
const Rebind<int64_t, D> di64;
const VI32 kOffset = Set(di32, 0x3FF);
return BitCast(d, ShiftLeft<52>(PromoteTo(di64, x + kOffset)));
return BitCast(d, ShiftLeft<52>(PromoteTo(di64, Add(x, kOffset))));
}
// Sets the exponent of 'x' to 2^e.
template <class D, class V, class VI32>
HWY_INLINE V LoadExpShortRange(D d, V x, VI32 e) {
const VI32 y = ShiftRight<1>(e);
return (x * Pow2I(d, y) * Pow2I(d, e - y));
return Mul(Mul(x, Pow2I(d, y)), Pow2I(d, Sub(e, y)));
}
template <class D, class V, class VI32>
@ -820,7 +850,8 @@ struct LogImpl<double> {
HWY_INLINE Vec<Rebind<int64_t, D>> Log2p1NoSubnormal(D /*d*/, V x) {
const Rebind<int64_t, D> di64;
const Rebind<uint64_t, D> du64;
return BitCast(di64, ShiftRight<52>(BitCast(du64, x))) - Set(di64, 0x3FF);
return Sub(BitCast(di64, ShiftRight<52>(BitCast(du64, x))),
Set(di64, 0x3FF));
}
// Approximates Log(x) over the range [sqrt(2) / 2, sqrt(2)].
@ -834,10 +865,10 @@ struct LogImpl<double> {
const V k5 = Set(d, 0.1531383769920937332);
const V k6 = Set(d, 0.1479819860511658591);
const V x2 = (x * x);
const V x4 = (x2 * x2);
const V x2 = Mul(x, x);
const V x4 = Mul(x2, x2);
return MulAdd(MulAdd(MulAdd(MulAdd(k6, x4, k4), x4, k2), x4, k0), x2,
(MulAdd(MulAdd(k5, x4, k3), x4, k1) * x4));
(Mul(MulAdd(MulAdd(k5, x4, k3), x4, k1), x4)));
}
};
@ -846,200 +877,215 @@ struct LogImpl<double> {
template <class D, class V, bool kAllowSubnormals = true>
HWY_INLINE V Log(const D d, V x) {
// http://git.musl-libc.org/cgit/musl/tree/src/math/log.c for more info.
using LaneType = LaneType<V>;
impl::LogImpl<LaneType> impl;
using T = TFromD<D>;
impl::LogImpl<T> impl;
// clang-format off
constexpr bool kIsF32 = (sizeof(LaneType) == 4);
constexpr bool kIsF32 = (sizeof(T) == 4);
// Float Constants
const V kLn2Hi = Set(d, (kIsF32 ? 0.69313812256f :
0.693147180369123816490 ));
const V kLn2Lo = Set(d, (kIsF32 ? 9.0580006145e-6f :
1.90821492927058770002e-10));
const V kOne = Set(d, +1.0);
const V kMinNormal = Set(d, (kIsF32 ? 1.175494351e-38f :
2.2250738585072014e-308 ));
const V kScale = Set(d, (kIsF32 ? 3.355443200e+7f :
1.8014398509481984e+16 ));
const V kLn2Hi = Set(d, kIsF32 ? static_cast<T>(0.69313812256f)
: static_cast<T>(0.693147180369123816490));
const V kLn2Lo = Set(d, kIsF32 ? static_cast<T>(9.0580006145e-6f)
: static_cast<T>(1.90821492927058770002e-10));
const V kOne = Set(d, static_cast<T>(+1.0));
const V kMinNormal = Set(d, kIsF32 ? static_cast<T>(1.175494351e-38f)
: static_cast<T>(2.2250738585072014e-308));
const V kScale = Set(d, kIsF32 ? static_cast<T>(3.355443200e+7f)
: static_cast<T>(1.8014398509481984e+16));
// Integer Constants
const Rebind<MakeSigned<LaneType>, D> di;
using TI = MakeSigned<T>;
const Rebind<TI, D> di;
using VI = decltype(Zero(di));
const VI kLowerBits = Set(di, (kIsF32 ? 0x00000000L : 0xFFFFFFFFLL));
const VI kMagic = Set(di, (kIsF32 ? 0x3F3504F3L : 0x3FE6A09E00000000LL));
const VI kExpMask = Set(di, (kIsF32 ? 0x3F800000L : 0x3FF0000000000000LL));
const VI kExpScale = Set(di, (kIsF32 ? -25 : -54));
const VI kManMask = Set(di, (kIsF32 ? 0x7FFFFFL : 0xFFFFF00000000LL));
// clang-format on
const VI kLowerBits = Set(di, kIsF32 ? static_cast<TI>(0x00000000L)
: static_cast<TI>(0xFFFFFFFFLL));
const VI kMagic = Set(di, kIsF32 ? static_cast<TI>(0x3F3504F3L)
: static_cast<TI>(0x3FE6A09E00000000LL));
const VI kExpMask = Set(di, kIsF32 ? static_cast<TI>(0x3F800000L)
: static_cast<TI>(0x3FF0000000000000LL));
const VI kExpScale =
Set(di, kIsF32 ? static_cast<TI>(-25) : static_cast<TI>(-54));
const VI kManMask = Set(di, kIsF32 ? static_cast<TI>(0x7FFFFFL)
: static_cast<TI>(0xFFFFF00000000LL));
// Scale up 'x' so that it is no longer denormalized.
VI exp_bits;
V exp;
if (kAllowSubnormals == true) {
const auto is_denormal = (x < kMinNormal);
x = IfThenElse(is_denormal, (x * kScale), x);
const auto is_denormal = Lt(x, kMinNormal);
x = IfThenElse(is_denormal, Mul(x, kScale), x);
// Compute the new exponent.
exp_bits = (BitCast(di, x) + (kExpMask - kMagic));
exp_bits = Add(BitCast(di, x), Sub(kExpMask, kMagic));
const VI exp_scale =
BitCast(di, IfThenElseZero(is_denormal, BitCast(d, kExpScale)));
exp = ConvertTo(
d, exp_scale + impl.Log2p1NoSubnormal(d, BitCast(d, exp_bits)));
d, Add(exp_scale, impl.Log2p1NoSubnormal(d, BitCast(d, exp_bits))));
} else {
// Compute the new exponent.
exp_bits = (BitCast(di, x) + (kExpMask - kMagic));
exp_bits = Add(BitCast(di, x), Sub(kExpMask, kMagic));
exp = ConvertTo(d, impl.Log2p1NoSubnormal(d, BitCast(d, exp_bits)));
}
// Renormalize.
const V y = Or(And(x, BitCast(d, kLowerBits)),
BitCast(d, ((exp_bits & kManMask) + kMagic)));
BitCast(d, Add(And(exp_bits, kManMask), kMagic)));
// Approximate and reconstruct.
const V ym1 = (y - kOne);
const V z = (ym1 / (y + kOne));
const V ym1 = Sub(y, kOne);
const V z = Div(ym1, Add(y, kOne));
return MulSub(exp, kLn2Hi,
(MulSub(z, (ym1 - impl.LogPoly(d, z)), (exp * kLn2Lo)) - ym1));
return MulSub(
exp, kLn2Hi,
Sub(MulSub(z, Sub(ym1, impl.LogPoly(d, z)), Mul(exp, kLn2Lo)), ym1));
}
} // namespace impl
template <class D, class V>
HWY_INLINE V Acos(const D d, V x) {
using LaneType = LaneType<V>;
using T = TFromD<D>;
const V kZero = Zero(d);
const V kHalf = Set(d, +0.5);
const V kOne = Set(d, +1.0);
const V kTwo = Set(d, +2.0);
const V kPi = Set(d, +3.14159265358979323846264);
const V kPiOverTwo = Set(d, +1.57079632679489661923132169);
const V kHalf = Set(d, static_cast<T>(+0.5));
const V kPi = Set(d, static_cast<T>(+3.14159265358979323846264));
const V kPiOverTwo = Set(d, static_cast<T>(+1.57079632679489661923132169));
const V sign_x = And(SignBit(d), x);
const V abs_x = Xor(x, sign_x);
const auto mask = (abs_x < kHalf);
const V yy = IfThenElse(mask, (abs_x * abs_x), ((kOne - abs_x) * kHalf));
const auto mask = Lt(abs_x, kHalf);
const V yy =
IfThenElse(mask, Mul(abs_x, abs_x), NegMulAdd(abs_x, kHalf, kHalf));
const V y = IfThenElse(mask, abs_x, Sqrt(yy));
impl::AsinImpl<LaneType> impl;
const V t = (impl.AsinPoly(d, yy, y) * (y * yy));
const V z = IfThenElse(mask, (kPiOverTwo - (Xor(y, sign_x) + Xor(t, sign_x))),
((t + y) * kTwo));
return IfThenElse(Or(mask, (x >= kZero)), z, (kPi - z));
impl::AsinImpl<T> impl;
const V t = Mul(impl.AsinPoly(d, yy, y), Mul(y, yy));
const V t_plus_y = Add(t, y);
const V z =
IfThenElse(mask, Sub(kPiOverTwo, Add(Xor(y, sign_x), Xor(t, sign_x))),
Add(t_plus_y, t_plus_y));
return IfThenElse(Or(mask, Ge(x, kZero)), z, Sub(kPi, z));
}
template <class D, class V>
HWY_INLINE V Acosh(const D d, V x) {
const V kLarge = Set(d, 268435456.0);
const V kLog2 = Set(d, 0.693147180559945286227);
const V kOne = Set(d, +1.0);
const V kTwo = Set(d, +2.0);
using T = TFromD<D>;
const auto is_x_large = (x > kLarge);
const auto is_x_gt_2 = (x > kTwo);
const V kLarge = Set(d, static_cast<T>(268435456.0));
const V kLog2 = Set(d, static_cast<T>(0.693147180559945286227));
const V kOne = Set(d, static_cast<T>(+1.0));
const V kTwo = Set(d, static_cast<T>(+2.0));
const V x_minus_1 = (x - kOne);
const V y0 = MulSub(kTwo, x, (kOne / (Sqrt(MulSub(x, x, kOne)) + x)));
const auto is_x_large = Gt(x, kLarge);
const auto is_x_gt_2 = Gt(x, kTwo);
const V x_minus_1 = Sub(x, kOne);
const V y0 = MulSub(kTwo, x, Div(kOne, Add(Sqrt(MulSub(x, x, kOne)), x)));
const V y1 =
(Sqrt(MulAdd(x_minus_1, kTwo, (x_minus_1 * x_minus_1))) + x_minus_1);
Add(Sqrt(MulAdd(x_minus_1, kTwo, Mul(x_minus_1, x_minus_1))), x_minus_1);
const V y2 =
IfThenElse(is_x_gt_2, IfThenElse(is_x_large, x, y0), (y1 + kOne));
IfThenElse(is_x_gt_2, IfThenElse(is_x_large, x, y0), Add(y1, kOne));
const V z = impl::Log<D, V, /*kAllowSubnormals=*/false>(d, y2);
const auto is_pole = y2 == kOne;
const auto divisor = IfThenZeroElse(is_pole, y2) - kOne;
return IfThenElse(is_x_gt_2, z, IfThenElse(is_pole, y1, z * y1 / divisor)) +
IfThenElseZero(is_x_large, kLog2);
const auto is_pole = Eq(y2, kOne);
const auto divisor = Sub(IfThenZeroElse(is_pole, y2), kOne);
return Add(IfThenElse(is_x_gt_2, z,
IfThenElse(is_pole, y1, Div(Mul(z, y1), divisor))),
IfThenElseZero(is_x_large, kLog2));
}
template <class D, class V>
HWY_INLINE V Asin(const D d, V x) {
using LaneType = LaneType<V>;
using T = TFromD<D>;
const V kHalf = Set(d, +0.5);
const V kOne = Set(d, +1.0);
const V kTwo = Set(d, +2.0);
const V kPiOverTwo = Set(d, +1.57079632679489661923132169);
const V kHalf = Set(d, static_cast<T>(+0.5));
const V kTwo = Set(d, static_cast<T>(+2.0));
const V kPiOverTwo = Set(d, static_cast<T>(+1.57079632679489661923132169));
const V sign_x = And(SignBit(d), x);
const V abs_x = Xor(x, sign_x);
const auto mask = (abs_x < kHalf);
const V yy = IfThenElse(mask, (abs_x * abs_x), (kOne - abs_x) * kHalf);
const auto mask = Lt(abs_x, kHalf);
const V yy =
IfThenElse(mask, Mul(abs_x, abs_x), NegMulAdd(abs_x, kHalf, kHalf));
const V y = IfThenElse(mask, abs_x, Sqrt(yy));
impl::AsinImpl<LaneType> impl;
const V z0 = MulAdd(impl.AsinPoly(d, yy, y), (yy * y), y);
const V z1 = (kPiOverTwo - (z0 * kTwo));
impl::AsinImpl<T> impl;
const V z0 = MulAdd(impl.AsinPoly(d, yy, y), Mul(yy, y), y);
const V z1 = NegMulAdd(z0, kTwo, kPiOverTwo);
return Or(IfThenElse(mask, z0, z1), sign_x);
}
template <class D, class V>
HWY_INLINE V Asinh(const D d, V x) {
const V kSmall = Set(d, 1.0 / 268435456.0);
const V kLarge = Set(d, 268435456.0);
const V kLog2 = Set(d, 0.693147180559945286227);
const V kOne = Set(d, +1.0);
const V kTwo = Set(d, +2.0);
using T = TFromD<D>;
const V kSmall = Set(d, static_cast<T>(1.0 / 268435456.0));
const V kLarge = Set(d, static_cast<T>(268435456.0));
const V kLog2 = Set(d, static_cast<T>(0.693147180559945286227));
const V kOne = Set(d, static_cast<T>(+1.0));
const V kTwo = Set(d, static_cast<T>(+2.0));
const V sign_x = And(SignBit(d), x); // Extract the sign bit
const V abs_x = Xor(x, sign_x);
const auto is_x_large = (abs_x > kLarge);
const auto is_x_lt_2 = (abs_x < kTwo);
const auto is_x_large = Gt(abs_x, kLarge);
const auto is_x_lt_2 = Lt(abs_x, kTwo);
const V x2 = (x * x);
const V sqrt_x2_plus_1 = Sqrt(x2 + kOne);
const V x2 = Mul(x, x);
const V sqrt_x2_plus_1 = Sqrt(Add(x2, kOne));
const V y0 = MulAdd(abs_x, kTwo, (kOne / (sqrt_x2_plus_1 + abs_x)));
const V y1 = ((x2 / (sqrt_x2_plus_1 + kOne)) + abs_x);
const V y0 = MulAdd(abs_x, kTwo, Div(kOne, Add(sqrt_x2_plus_1, abs_x)));
const V y1 = Add(Div(x2, Add(sqrt_x2_plus_1, kOne)), abs_x);
const V y2 =
IfThenElse(is_x_lt_2, (y1 + kOne), IfThenElse(is_x_large, abs_x, y0));
IfThenElse(is_x_lt_2, Add(y1, kOne), IfThenElse(is_x_large, abs_x, y0));
const V z = impl::Log<D, V, /*kAllowSubnormals=*/false>(d, y2);
const auto is_pole = y2 == kOne;
const auto divisor = IfThenZeroElse(is_pole, y2) - kOne;
const auto large = IfThenElse(is_pole, y1, z * y1 / divisor);
const V y = IfThenElse(abs_x < kSmall, x, large);
return Or((IfThenElse(is_x_lt_2, y, z) + IfThenElseZero(is_x_large, kLog2)),
const auto is_pole = Eq(y2, kOne);
const auto divisor = Sub(IfThenZeroElse(is_pole, y2), kOne);
const auto large = IfThenElse(is_pole, y1, Div(Mul(z, y1), divisor));
const V y = IfThenElse(Lt(abs_x, kSmall), x, large);
return Or(Add(IfThenElse(is_x_lt_2, y, z), IfThenElseZero(is_x_large, kLog2)),
sign_x);
}
template <class D, class V>
HWY_INLINE V Atan(const D d, V x) {
using LaneType = LaneType<V>;
using T = TFromD<D>;
const V kOne = Set(d, +1.0);
const V kPiOverTwo = Set(d, +1.57079632679489661923132169);
const V kOne = Set(d, static_cast<T>(+1.0));
const V kPiOverTwo = Set(d, static_cast<T>(+1.57079632679489661923132169));
const V sign = And(SignBit(d), x);
const V abs_x = Xor(x, sign);
const auto mask = (abs_x > kOne);
const auto mask = Gt(abs_x, kOne);
impl::AtanImpl<LaneType> impl;
impl::AtanImpl<T> impl;
const auto divisor = IfThenElse(mask, abs_x, kOne);
const V y = impl.AtanPoly(d, IfThenElse(mask, kOne / divisor, abs_x));
return Or(IfThenElse(mask, (kPiOverTwo - y), y), sign);
const V y = impl.AtanPoly(d, IfThenElse(mask, Div(kOne, divisor), abs_x));
return Or(IfThenElse(mask, Sub(kPiOverTwo, y), y), sign);
}
template <class D, class V>
HWY_INLINE V Atanh(const D d, V x) {
const V kHalf = Set(d, +0.5);
const V kOne = Set(d, +1.0);
using T = TFromD<D>;
const V kHalf = Set(d, static_cast<T>(+0.5));
const V kOne = Set(d, static_cast<T>(+1.0));
const V sign = And(SignBit(d), x); // Extract the sign bit
const V abs_x = Xor(x, sign);
return Log1p(d, ((abs_x + abs_x) / (kOne - abs_x))) * Xor(kHalf, sign);
return Mul(Log1p(d, Div(Add(abs_x, abs_x), Sub(kOne, abs_x))),
Xor(kHalf, sign));
}
template <class D, class V>
HWY_INLINE V Cos(const D d, V x) {
using LaneType = LaneType<V>;
impl::CosSinImpl<LaneType> impl;
using T = TFromD<D>;
impl::CosSinImpl<T> impl;
// Float Constants
const V kOneOverPi = Set(d, 0.31830988618379067153);
const V kOneOverPi = Set(d, static_cast<T>(0.31830988618379067153));
// Integer Constants
const Rebind<int32_t, D> di32;
@ -1049,7 +1095,7 @@ HWY_INLINE V Cos(const D d, V x) {
const V y = Abs(x); // cos(x) == cos(|x|)
// Compute the quadrant, q = int(|x| / pi) * 2 + 1
const VI32 q = (ShiftLeft<1>(impl.ToInt32(d, y * kOneOverPi)) + kOne);
const VI32 q = Add(ShiftLeft<1>(impl.ToInt32(d, Mul(y, kOneOverPi))), kOne);
// Reduce range, apply sign, and approximate.
return impl.Poly(
@ -1058,17 +1104,16 @@ HWY_INLINE V Cos(const D d, V x) {
template <class D, class V>
HWY_INLINE V Exp(const D d, V x) {
using LaneType = LaneType<V>;
using T = TFromD<D>;
// clang-format off
const V kHalf = Set(d, +0.5);
const V kLowerBound = Set(d, (sizeof(LaneType) == 4 ? -104.0 : -1000.0));
const V kNegZero = Set(d, -0.0);
const V kOne = Set(d, +1.0);
const V kOneOverLog2 = Set(d, +1.442695040888963407359924681);
// clang-format on
const V kHalf = Set(d, static_cast<T>(+0.5));
const V kLowerBound =
Set(d, static_cast<T>((sizeof(T) == 4 ? -104.0 : -1000.0)));
const V kNegZero = Set(d, static_cast<T>(-0.0));
const V kOne = Set(d, static_cast<T>(+1.0));
const V kOneOverLog2 = Set(d, static_cast<T>(+1.442695040888963407359924681));
impl::ExpImpl<LaneType> impl;
impl::ExpImpl<T> impl;
// q = static_cast<int32>((x / log(2)) + ((x < 0) ? -0.5 : +0.5))
const auto q =
@ -1076,25 +1121,24 @@ HWY_INLINE V Exp(const D d, V x) {
// Reduce, approximate, and then reconstruct.
const V y = impl.LoadExpShortRange(
d, (impl.ExpPoly(d, impl.ExpReduce(d, x, q)) + kOne), q);
return IfThenElseZero(x >= kLowerBound, y);
d, Add(impl.ExpPoly(d, impl.ExpReduce(d, x, q)), kOne), q);
return IfThenElseZero(Ge(x, kLowerBound), y);
}
template <class D, class V>
HWY_INLINE V Expm1(const D d, V x) {
using LaneType = LaneType<V>;
using T = TFromD<D>;
// clang-format off
const V kHalf = Set(d, +0.5);
const V kLowerBound = Set(d, (sizeof(LaneType) == 4 ? -104.0 : -1000.0));
const V kLn2Over2 = Set(d, +0.346573590279972654708616);
const V kNegOne = Set(d, -1.0);
const V kNegZero = Set(d, -0.0);
const V kOne = Set(d, +1.0);
const V kOneOverLog2 = Set(d, +1.442695040888963407359924681);
// clang-format on
const V kHalf = Set(d, static_cast<T>(+0.5));
const V kLowerBound =
Set(d, static_cast<T>((sizeof(T) == 4 ? -104.0 : -1000.0)));
const V kLn2Over2 = Set(d, static_cast<T>(+0.346573590279972654708616));
const V kNegOne = Set(d, static_cast<T>(-1.0));
const V kNegZero = Set(d, static_cast<T>(-0.0));
const V kOne = Set(d, static_cast<T>(+1.0));
const V kOneOverLog2 = Set(d, static_cast<T>(+1.442695040888963407359924681));
impl::ExpImpl<LaneType> impl;
impl::ExpImpl<T> impl;
// q = static_cast<int32>((x / log(2)) + ((x < 0) ? -0.5 : +0.5))
const auto q =
@ -1102,9 +1146,9 @@ HWY_INLINE V Expm1(const D d, V x) {
// Reduce, approximate, and then reconstruct.
const V y = impl.ExpPoly(d, impl.ExpReduce(d, x, q));
const V z = IfThenElse(Abs(x) < kLn2Over2, y,
impl.LoadExpShortRange(d, (y + kOne), q) - kOne);
return IfThenElse(x < kLowerBound, kNegOne, z);
const V z = IfThenElse(Lt(Abs(x), kLn2Over2), y,
Sub(impl.LoadExpShortRange(d, Add(y, kOne), q), kOne));
return IfThenElse(Lt(x, kLowerBound), kNegOne, z);
}
template <class D, class V>
@ -1114,34 +1158,37 @@ HWY_INLINE V Log(const D d, V x) {
template <class D, class V>
HWY_INLINE V Log10(const D d, V x) {
return Log(d, x) * Set(d, 0.4342944819032518276511);
using T = TFromD<D>;
return Mul(Log(d, x), Set(d, static_cast<T>(0.4342944819032518276511)));
}
template <class D, class V>
HWY_INLINE V Log1p(const D d, V x) {
const V kOne = Set(d, +1.0);
using T = TFromD<D>;
const V kOne = Set(d, static_cast<T>(+1.0));
const V y = x + kOne;
const auto is_pole = y == kOne;
const auto divisor = IfThenZeroElse(is_pole, y) - kOne;
const V y = Add(x, kOne);
const auto is_pole = Eq(y, kOne);
const auto divisor = Sub(IfThenZeroElse(is_pole, y), kOne);
const auto non_pole =
impl::Log<D, V, /*kAllowSubnormals=*/false>(d, y) * (x / divisor);
Mul(impl::Log<D, V, /*kAllowSubnormals=*/false>(d, y), Div(x, divisor));
return IfThenElse(is_pole, x, non_pole);
}
template <class D, class V>
HWY_INLINE V Log2(const D d, V x) {
return Log(d, x) * Set(d, 1.44269504088896340735992);
using T = TFromD<D>;
return Mul(Log(d, x), Set(d, static_cast<T>(1.44269504088896340735992)));
}
template <class D, class V>
HWY_INLINE V Sin(const D d, V x) {
using LaneType = LaneType<V>;
impl::CosSinImpl<LaneType> impl;
using T = TFromD<D>;
impl::CosSinImpl<T> impl;
// Float Constants
const V kOneOverPi = Set(d, 0.31830988618379067153);
const V kHalf = Set(d, 0.5);
const V kOneOverPi = Set(d, static_cast<T>(0.31830988618379067153));
const V kHalf = Set(d, static_cast<T>(0.5));
// Integer Constants
const Rebind<int32_t, D> di32;
@ -1160,27 +1207,29 @@ HWY_INLINE V Sin(const D d, V x) {
template <class D, class V>
HWY_INLINE V Sinh(const D d, V x) {
const V kHalf = Set(d, +0.5);
const V kOne = Set(d, +1.0);
const V kTwo = Set(d, +2.0);
using T = TFromD<D>;
const V kHalf = Set(d, static_cast<T>(+0.5));
const V kOne = Set(d, static_cast<T>(+1.0));
const V kTwo = Set(d, static_cast<T>(+2.0));
const V sign = And(SignBit(d), x); // Extract the sign bit
const V abs_x = Xor(x, sign);
const V y = Expm1(d, abs_x);
const V z = ((y + kTwo) / (y + kOne) * (y * kHalf));
const V z = Mul(Div(Add(y, kTwo), Add(y, kOne)), Mul(y, kHalf));
return Xor(z, sign); // Reapply the sign bit
}
template <class D, class V>
HWY_INLINE V Tanh(const D d, V x) {
const V kLimit = Set(d, 18.714973875);
const V kOne = Set(d, +1.0);
const V kTwo = Set(d, +2.0);
using T = TFromD<D>;
const V kLimit = Set(d, static_cast<T>(18.714973875));
const V kOne = Set(d, static_cast<T>(+1.0));
const V kTwo = Set(d, static_cast<T>(+2.0));
const V sign = And(SignBit(d), x); // Extract the sign bit
const V abs_x = Xor(x, sign);
const V y = Expm1(d, abs_x * kTwo);
const V z = IfThenElse((abs_x > kLimit), kOne, (y / (y + kTwo)));
const V y = Expm1(d, Mul(abs_x, kTwo));
const V z = IfThenElse(Gt(abs_x, kLimit), kOne, Div(y, Add(y, kTwo)));
return Xor(z, sign); // Reapply the sign bit
}
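// Usage sketch for the vectorized math routines above (illustrative, not part of
// this patch): the Call* wrappers take VecArg<V>, so a hypothetical caller could
// evaluate exp over a whole vector:
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::ScalableTag<float> d;
//   const auto y = hn::CallExp(d, hn::Set(d, 1.0f));  // ~2.71828f in each lane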

View file

@ -12,8 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdio.h>
#include <cfloat> // FLT_MAX
#include <iostream>
#include <type_traits>
// clang-format off
@ -29,10 +30,18 @@ HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
template <class Out, class In>
inline Out BitCast(const In& in) {
static_assert(sizeof(Out) == sizeof(In), "");
Out out;
CopyBytes<sizeof(out)>(&in, &out);
return out;
}
template <class T, class D>
void TestMath(const std::string name, T (*fx1)(T), Vec<D> (*fxN)(D, Vec<D>),
D d, T min, T max, uint64_t max_error_ulp) {
constexpr bool kIsF32 = (sizeof(T) == 4);
HWY_NOINLINE void TestMath(const std::string name, T (*fx1)(T),
Vec<D> (*fxN)(D, VecArg<Vec<D>>), D d, T min, T max,
uint64_t max_error_ulp) {
using UintT = MakeUnsigned<T>;
const UintT min_bits = BitCast<UintT>(min);
@ -51,18 +60,16 @@ void TestMath(const std::string name, T (*fx1)(T), Vec<D> (*fxN)(D, Vec<D>),
}
uint64_t max_ulp = 0;
#if HWY_ARCH_ARM
// Emulation is slower, so cannot afford as many.
constexpr UintT kSamplesPerRange = 25000;
#else
constexpr UintT kSamplesPerRange = 100000;
#endif
constexpr UintT kSamplesPerRange = static_cast<UintT>(AdjustedReps(10000));
for (int range_index = 0; range_index < range_count; ++range_index) {
const UintT start = ranges[range_index][0];
const UintT stop = ranges[range_index][1];
const UintT step = std::max<UintT>(1, ((stop - start) / kSamplesPerRange));
const UintT step = HWY_MAX(1, ((stop - start) / kSamplesPerRange));
for (UintT value_bits = start; value_bits <= stop; value_bits += step) {
const T value = BitCast<T>(std::min(value_bits, stop));
// For reasons unknown, the HWY_MAX is necessary on RVV, otherwise
// value_bits can be less than start, and thus possibly NaN.
const T value = BitCast<T>(HWY_MIN(HWY_MAX(start, value_bits), stop));
const T actual = GetLane(fxN(d, Set(d, value)));
const T expected = fx1(value);
@ -73,21 +80,41 @@ void TestMath(const std::string name, T (*fx1)(T), Vec<D> (*fxN)(D, Vec<D>),
}
#endif
const auto ulp = ComputeUlpDelta(actual, expected);
max_ulp = std::max<uint64_t>(max_ulp, ulp);
const auto ulp = hwy::detail::ComputeUlpDelta(actual, expected);
max_ulp = HWY_MAX(max_ulp, ulp);
if (ulp > max_error_ulp) {
std::cout << name << "<" << (kIsF32 ? "F32x" : "F64x") << Lanes(d)
<< ">(" << value << ") expected: " << expected
<< " actual: " << actual << " ulp: " << ulp
<< " max: " << max_error_ulp << std::endl;
fprintf(stderr,
"%s: %s(%f) expected %f actual %f ulp %" PRIu64 " max ulp %u\n",
hwy::TypeName(T(), Lanes(d)).c_str(), name.c_str(), value,
expected, actual, static_cast<uint64_t>(ulp),
static_cast<uint32_t>(max_error_ulp));
}
HWY_ASSERT(ulp <= max_error_ulp);
}
}
std::cout << (kIsF32 ? "F32x" : "F64x") << Lanes(d)
<< ", Max ULP: " << max_ulp << std::endl;
fprintf(stderr, "%s: %s max_ulp %" PRIu64 "\n",
hwy::TypeName(T(), Lanes(d)).c_str(), name.c_str(), max_ulp);
HWY_ASSERT(max_ulp <= max_error_ulp);
}
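// For reference, a minimal sketch of a same-sign, finite-value ULP distance
// (not the hwy::detail implementation): reinterpret both values as unsigned
// integers of the same width and take the absolute difference, relying on the
// monotonic ordering of IEEE-754 bit patterns for same-signed values, e.g.:
//   uint32_t ua, ub;              // uint64_t for double
//   CopyBytes<4>(&actual, &ua);
//   CopyBytes<4>(&expected, &ub);
//   const uint32_t ulp = (ua > ub) ? (ua - ub) : (ub - ua);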
// TODO(janwas): remove once RVV supports fractional LMUL
#undef DEFINE_MATH_TEST_FUNC
#if HWY_TARGET == HWY_RVV
#define DEFINE_MATH_TEST_FUNC(NAME) \
HWY_NOINLINE void TestAll##NAME() { \
ForFloatTypes(ForShrinkableVectors<Test##NAME>()); \
}
#else
#define DEFINE_MATH_TEST_FUNC(NAME) \
HWY_NOINLINE void TestAll##NAME() { \
ForFloatTypes(ForPartialVectors<Test##NAME>()); \
}
#endif
#undef DEFINE_MATH_TEST
#define DEFINE_MATH_TEST(NAME, F32x1, F32xN, F32_MIN, F32_MAX, F32_ERROR, \
F64x1, F64xN, F64_MIN, F64_MAX, F64_ERROR) \
struct Test##NAME { \
@ -97,28 +124,45 @@ void TestMath(const std::string name, T (*fx1)(T), Vec<D> (*fxN)(D, Vec<D>),
TestMath<T, D>(HWY_STR(NAME), F32x1, F32xN, d, F32_MIN, F32_MAX, \
F32_ERROR); \
} else { \
TestMath<T, D>(HWY_STR(NAME), F64x1, F64xN, d, F64_MIN, F64_MAX, \
TestMath<T, D>(HWY_STR(NAME), F64x1, F64xN, d, \
static_cast<T>(F64_MIN), static_cast<T>(F64_MAX), \
F64_ERROR); \
} \
} \
}; \
HWY_NOINLINE void TestAll##NAME() { \
ForFloatTypes(ForPartialVectors<Test##NAME>()); \
}
DEFINE_MATH_TEST_FUNC(NAME)
// Floating point values closest to but less than 1.0
const float kNearOneF = BitCast<float>(0x3F7FFFFF);
const double kNearOneD = BitCast<double>(0x3FEFFFFFFFFFFFFFULL);
// The discrepancy is unacceptably large for MSYS2 (less accurate libm?), so
// only increase the error tolerance there.
constexpr uint64_t Cos64ULP() {
#if defined(__MINGW32__)
return 23;
#else
return 3;
#endif
}
constexpr uint64_t ACosh32ULP() {
#if defined(__MINGW32__)
return 8;
#else
return 3;
#endif
}
// clang-format off
DEFINE_MATH_TEST(Acos,
std::acos, CallAcos, -1.0, +1.0, 3, // NEON is 3 instead of 2
std::acos, CallAcos, -1.0f, +1.0f, 3, // NEON is 3 instead of 2
std::acos, CallAcos, -1.0, +1.0, 2)
DEFINE_MATH_TEST(Acosh,
std::acosh, CallAcosh, +1.0, +FLT_MAX, 3,
std::acosh, CallAcosh, +1.0f, +FLT_MAX, ACosh32ULP(),
std::acosh, CallAcosh, +1.0, +DBL_MAX, 3)
DEFINE_MATH_TEST(Asin,
std::asin, CallAsin, -1.0, +1.0, 4, // ARMv7 is 4 instead of 2
std::asin, CallAsin, -1.0f, +1.0f, 4, // ARMv7 is 4 instead of 2
std::asin, CallAsin, -1.0, +1.0, 2)
DEFINE_MATH_TEST(Asinh,
std::asinh, CallAsinh, -FLT_MAX, +FLT_MAX, 3,
@ -130,13 +174,13 @@ DEFINE_MATH_TEST(Atanh,
std::atanh, CallAtanh, -kNearOneF, +kNearOneF, 4, // NEON is 4 instead of 3
std::atanh, CallAtanh, -kNearOneD, +kNearOneD, 3)
DEFINE_MATH_TEST(Cos,
std::cos, CallCos, -39000.0, +39000.0, 3,
std::cos, CallCos, -39000.0, +39000.0, 3)
std::cos, CallCos, -39000.0f, +39000.0f, 3,
std::cos, CallCos, -39000.0, +39000.0, Cos64ULP())
DEFINE_MATH_TEST(Exp,
std::exp, CallExp, -FLT_MAX, +104.0, 1,
std::exp, CallExp, -FLT_MAX, +104.0f, 1,
std::exp, CallExp, -DBL_MAX, +104.0, 1)
DEFINE_MATH_TEST(Expm1,
std::expm1, CallExpm1, -FLT_MAX, +104.0, 4,
std::expm1, CallExpm1, -FLT_MAX, +104.0f, 4,
std::expm1, CallExpm1, -DBL_MAX, +104.0, 4)
DEFINE_MATH_TEST(Log,
std::log, CallLog, +FLT_MIN, +FLT_MAX, 1,
@ -145,14 +189,14 @@ DEFINE_MATH_TEST(Log10,
std::log10, CallLog10, +FLT_MIN, +FLT_MAX, 2,
std::log10, CallLog10, +DBL_MIN, +DBL_MAX, 2)
DEFINE_MATH_TEST(Log1p,
std::log1p, CallLog1p, +0.0f, +1e37, 3, // NEON is 3 instead of 2
std::log1p, CallLog1p, +0.0f, +1e37f, 3, // NEON is 3 instead of 2
std::log1p, CallLog1p, +0.0, +DBL_MAX, 2)
DEFINE_MATH_TEST(Log2,
std::log2, CallLog2, +FLT_MIN, +FLT_MAX, 2,
std::log2, CallLog2, +DBL_MIN, +DBL_MAX, 2)
DEFINE_MATH_TEST(Sin,
std::sin, CallSin, -39000.0, +39000.0, 3,
std::sin, CallSin, -39000.0, +39000.0, 3)
std::sin, CallSin, -39000.0f, +39000.0f, 3,
std::sin, CallSin, -39000.0, +39000.0, 4) // MSYS is 4 instead of 3
DEFINE_MATH_TEST(Sinh,
std::sinh, CallSinh, -80.0f, +80.0f, 4,
std::sinh, CallSinh, -709.0, +709.0, 4)
@ -167,6 +211,7 @@ DEFINE_MATH_TEST(Tanh,
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
HWY_BEFORE_TEST(HwyMathTest);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAcos);
@ -186,4 +231,11 @@ HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllSin);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllSinh);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllTanh);
} // namespace hwy
// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char **argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
#endif

909
third_party/highway/hwy/contrib/sort/sort-inl.h vendored Normal file

@ -0,0 +1,909 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Per-target include guard
#if defined(HIGHWAY_HWY_CONTRIB_SORT_SORT_INL_H_) == \
defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_SORT_SORT_INL_H_
#undef HIGHWAY_HWY_CONTRIB_SORT_SORT_INL_H_
#else
#define HIGHWAY_HWY_CONTRIB_SORT_SORT_INL_H_
#endif
#include <inttypes.h>
#include "hwy/aligned_allocator.h"
#include "hwy/highway.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
enum class SortOrder { kAscending, kDescending };
#if HWY_TARGET != HWY_SCALAR && HWY_ARCH_X86
#define HWY_SORT_VERIFY 1
constexpr inline SortOrder Reverse(SortOrder order) {
return (order == SortOrder::kAscending) ? SortOrder::kDescending
: SortOrder::kAscending;
}
namespace verify {
template <typename T>
bool Compare(T a, T b, SortOrder kOrder) {
if (kOrder == SortOrder::kAscending) return a <= b;
return a >= b;
}
#if HWY_SORT_VERIFY
template <class D>
class Runs {
using T = TFromD<D>;
public:
Runs(D d, size_t num_regs, size_t run_length = 0, bool alternating = false) {
const size_t N = Lanes(d);
buf_ = AllocateAligned<T>(N);
consecutive_ = AllocateAligned<T>(num_regs * N);
num_regs_ = num_regs;
if (run_length) {
run_length_ = run_length;
num_runs_ = num_regs * N / run_length;
is_vector_ = true;
alternating_ = alternating;
} else {
run_length_ = num_regs * 4;
num_runs_ = N / 4;
is_vector_ = false;
alternating_ = false;
}
}
void ScatterQuartets(D d, const size_t idx_reg, Vec<D> v) {
HWY_ASSERT(idx_reg < num_regs_);
const size_t N = Lanes(d);
for (size_t i = 0; i < N; i += 4) {
Store(v, d, buf_.get());
const size_t idx_q = (i / 4) * num_regs_ + idx_reg;
CopyBytes<16>(buf_.get() + i, consecutive_.get() + idx_q * 4);
}
}
void StoreVector(D d, const size_t idx_reg, Vec<D> v) {
HWY_ASSERT(idx_reg < num_regs_);
Store(v, d, &consecutive_[idx_reg * Lanes(d)]);
}
bool IsBitonic() const {
HWY_ASSERT(!alternating_);
for (size_t ir = 0; ir < num_runs_; ++ir) {
const T* p = &consecutive_[ir * run_length_];
bool is_asc = true;
bool is_desc = true;
bool is_zero = true;
for (size_t i = 0; i < run_length_ / 2 - 1; ++i) {
is_asc &= (p[i] <= p[i + 1]);
is_desc &= (p[i] >= p[i + 1]);
}
for (size_t i = 0; i < run_length_; ++i) {
is_zero &= (p[i] == 0);
}
bool is_asc2 = true;
bool is_desc2 = true;
for (size_t i = run_length_ / 2; i < run_length_ - 1; ++i) {
is_asc2 &= (p[i] <= p[i + 1]);
is_desc2 &= (p[i] >= p[i + 1]);
}
if (is_zero) continue;
if (is_asc && is_desc2) continue;
if (is_desc && is_asc2) continue;
return false;
}
return true;
}
void CheckBitonic(int line, int caller) const {
if (IsBitonic()) return;
for (size_t ir = 0; ir < num_runs_; ++ir) {
const T* p = &consecutive_[ir * run_length_];
printf("run %" PRIu64 " (len %" PRIu64 ")\n", static_cast<uint64_t>(ir),
static_cast<uint64_t>(run_length_));
for (size_t i = 0; i < run_length_; ++i) {
printf("%.0f\n", static_cast<float>(p[i]));
}
}
printf("caller %d\n", caller);
hwy::Abort("", line, "not bitonic");
}
void CheckSorted(SortOrder kOrder, int line, int caller) const {
for (size_t ir = 0; ir < num_runs_; ++ir) {
const SortOrder order =
(alternating_ && (ir & 1)) ? Reverse(kOrder) : kOrder;
const T* p = &consecutive_[ir * run_length_];
for (size_t i = 0; i < run_length_ - 1; ++i) {
if (!Compare(p[i], p[i + 1], order)) {
printf("ir%" PRIu64 " run_length=%" PRIu64
" alt=%d original order=%d this order=%d\n",
static_cast<uint64_t>(ir), static_cast<uint64_t>(run_length_),
alternating_, static_cast<int>(kOrder),
static_cast<int>(order));
for (size_t i = 0; i < run_length_; ++i) {
printf(" %.0f\n", static_cast<float>(p[i]));
}
printf("caller %d\n", caller);
hwy::Abort("", line, "not sorted");
}
}
}
}
private:
AlignedFreeUniquePtr<T[]> buf_;
AlignedFreeUniquePtr<T[]> consecutive_;
size_t num_regs_;
size_t run_length_;
size_t num_runs_;
bool is_vector_;
bool alternating_;
};
template <class D>
Runs<D> StoreDeinterleavedQuartets(D d, Vec<D> v0) {
Runs<D> runs(d, 1);
runs.ScatterQuartets(d, 0, v0);
return runs;
}
template <class D>
Runs<D> StoreDeinterleavedQuartets(D d, Vec<D> v0, Vec<D> v1) {
Runs<D> runs(d, 2);
runs.ScatterQuartets(d, 0, v0);
runs.ScatterQuartets(d, 1, v1);
return runs;
}
template <class D>
Runs<D> StoreDeinterleavedQuartets(D d, Vec<D> v0, Vec<D> v1, Vec<D> v2,
Vec<D> v3) {
Runs<D> runs(d, 4);
runs.ScatterQuartets(d, 0, v0);
runs.ScatterQuartets(d, 1, v1);
runs.ScatterQuartets(d, 2, v2);
runs.ScatterQuartets(d, 3, v3);
return runs;
}
template <class D>
Runs<D> StoreDeinterleavedQuartets(D d, Vec<D> v0, Vec<D> v1, Vec<D> v2,
Vec<D> v3, Vec<D> v4, Vec<D> v5, Vec<D> v6,
Vec<D> v7) {
Runs<D> runs(d, 8);
runs.ScatterQuartets(d, 0, v0);
runs.ScatterQuartets(d, 1, v1);
runs.ScatterQuartets(d, 2, v2);
runs.ScatterQuartets(d, 3, v3);
runs.ScatterQuartets(d, 4, v4);
runs.ScatterQuartets(d, 5, v5);
runs.ScatterQuartets(d, 6, v6);
runs.ScatterQuartets(d, 7, v7);
return runs;
}
template <class D>
Runs<D> StoreDeinterleavedQuartets(D d, Vec<D> v0, Vec<D> v1, Vec<D> v2,
Vec<D> v3, Vec<D> v4, Vec<D> v5, Vec<D> v6,
Vec<D> v7, Vec<D> v8, Vec<D> v9, Vec<D> vA,
Vec<D> vB, Vec<D> vC, Vec<D> vD, Vec<D> vE,
Vec<D> vF) {
Runs<D> runs(d, 16);
runs.ScatterQuartets(d, 0x0, v0);
runs.ScatterQuartets(d, 0x1, v1);
runs.ScatterQuartets(d, 0x2, v2);
runs.ScatterQuartets(d, 0x3, v3);
runs.ScatterQuartets(d, 0x4, v4);
runs.ScatterQuartets(d, 0x5, v5);
runs.ScatterQuartets(d, 0x6, v6);
runs.ScatterQuartets(d, 0x7, v7);
runs.ScatterQuartets(d, 0x8, v8);
runs.ScatterQuartets(d, 0x9, v9);
runs.ScatterQuartets(d, 0xA, vA);
runs.ScatterQuartets(d, 0xB, vB);
runs.ScatterQuartets(d, 0xC, vC);
runs.ScatterQuartets(d, 0xD, vD);
runs.ScatterQuartets(d, 0xE, vE);
runs.ScatterQuartets(d, 0xF, vF);
return runs;
}
template <class D>
Runs<D> StoreDeinterleavedQuartets(
D d, const Vec<D>& v00, const Vec<D>& v01, const Vec<D>& v02,
const Vec<D>& v03, const Vec<D>& v04, const Vec<D>& v05, const Vec<D>& v06,
const Vec<D>& v07, const Vec<D>& v08, const Vec<D>& v09, const Vec<D>& v0A,
const Vec<D>& v0B, const Vec<D>& v0C, const Vec<D>& v0D, const Vec<D>& v0E,
const Vec<D>& v0F, const Vec<D>& v10, const Vec<D>& v11, const Vec<D>& v12,
const Vec<D>& v13, const Vec<D>& v14, const Vec<D>& v15, const Vec<D>& v16,
const Vec<D>& v17, const Vec<D>& v18, const Vec<D>& v19, const Vec<D>& v1A,
const Vec<D>& v1B, const Vec<D>& v1C, const Vec<D>& v1D, const Vec<D>& v1E,
const Vec<D>& v1F) {
Runs<D> runs(d, 32);
runs.ScatterQuartets(d, 0x00, v00);
runs.ScatterQuartets(d, 0x01, v01);
runs.ScatterQuartets(d, 0x02, v02);
runs.ScatterQuartets(d, 0x03, v03);
runs.ScatterQuartets(d, 0x04, v04);
runs.ScatterQuartets(d, 0x05, v05);
runs.ScatterQuartets(d, 0x06, v06);
runs.ScatterQuartets(d, 0x07, v07);
runs.ScatterQuartets(d, 0x08, v08);
runs.ScatterQuartets(d, 0x09, v09);
runs.ScatterQuartets(d, 0x0A, v0A);
runs.ScatterQuartets(d, 0x0B, v0B);
runs.ScatterQuartets(d, 0x0C, v0C);
runs.ScatterQuartets(d, 0x0D, v0D);
runs.ScatterQuartets(d, 0x0E, v0E);
runs.ScatterQuartets(d, 0x0F, v0F);
runs.ScatterQuartets(d, 0x10, v10);
runs.ScatterQuartets(d, 0x11, v11);
runs.ScatterQuartets(d, 0x12, v12);
runs.ScatterQuartets(d, 0x13, v13);
runs.ScatterQuartets(d, 0x14, v14);
runs.ScatterQuartets(d, 0x15, v15);
runs.ScatterQuartets(d, 0x16, v16);
runs.ScatterQuartets(d, 0x17, v17);
runs.ScatterQuartets(d, 0x18, v18);
runs.ScatterQuartets(d, 0x19, v19);
runs.ScatterQuartets(d, 0x1A, v1A);
runs.ScatterQuartets(d, 0x1B, v1B);
runs.ScatterQuartets(d, 0x1C, v1C);
runs.ScatterQuartets(d, 0x1D, v1D);
runs.ScatterQuartets(d, 0x1E, v1E);
runs.ScatterQuartets(d, 0x1F, v1F);
return runs;
}
template <class D>
Runs<D> StoreVectors(D d, Vec<D> v0, size_t run_length, bool alternating) {
Runs<D> runs(d, 1, run_length, alternating);
runs.StoreVector(d, 0, v0);
return runs;
}
template <class D>
Runs<D> StoreVectors(D d, Vec<D> v0, Vec<D> v1) {
constexpr size_t kRegs = 2;
Runs<D> runs(d, kRegs, /*run_length=*/kRegs * Lanes(d), /*alternating=*/false);
runs.StoreVector(d, 0, v0);
runs.StoreVector(d, 1, v1);
return runs;
}
template <class D>
Runs<D> StoreVectors(D d, Vec<D> v0, Vec<D> v1, Vec<D> v2, Vec<D> v3) {
constexpr size_t kRegs = 4;
Runs<D> runs(d, kRegs, /*run_length=*/kRegs * Lanes(d), /*alternating=*/false);
runs.StoreVector(d, 0, v0);
runs.StoreVector(d, 1, v1);
runs.StoreVector(d, 2, v2);
runs.StoreVector(d, 3, v3);
return runs;
}
template <class D>
Runs<D> StoreVectors(D d, Vec<D> v0, Vec<D> v1, Vec<D> v2, Vec<D> v3, Vec<D> v4,
Vec<D> v5, Vec<D> v6, Vec<D> v7) {
constexpr size_t kRegs = 8;
Runs<D> runs(d, kRegs, /*run_length=*/kRegs * Lanes(d), /*alternating=*/false);
runs.StoreVector(d, 0, v0);
runs.StoreVector(d, 1, v1);
runs.StoreVector(d, 2, v2);
runs.StoreVector(d, 3, v3);
runs.StoreVector(d, 4, v4);
runs.StoreVector(d, 5, v5);
runs.StoreVector(d, 6, v6);
runs.StoreVector(d, 7, v7);
return runs;
}
#endif // HWY_SORT_VERIFY
} // namespace verify
namespace detail {
// ------------------------------ Vector-length agnostic (quartets)
// For each lane i: replaces a[i] with the smaller and b[i] with the larger of
// the pair (or vice versa, as selected by kOrder).
// Corresponds to a conditional swap, which is one "node" of a sorting network.
// Min/Max are cheaper than compare + blend at least for integers.
template <SortOrder kOrder, class V>
HWY_INLINE void SortLanesIn2Vectors(V& a, V& b) {
V temp = a;
a = (kOrder == SortOrder::kAscending) ? Min(a, b) : Max(a, b);
b = (kOrder == SortOrder::kAscending) ? Max(temp, b) : Min(temp, b);
}
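// Scalar analogue of one such node, for intuition (ascending order):
//   const T lo = std::min(a, b), hi = std::max(a, b); a = lo; b = hi;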
// For each lane: sorts the four values found in that lane of the four vectors.
template <SortOrder kOrder, class D, class V = Vec<D>>
HWY_INLINE void SortLanesIn4Vectors(D d, const TFromD<D>* in, V& v0, V& v1,
V& v2, V& v3) {
const size_t N = Lanes(d);
// Bitonic and odd-even sorters both have 5 nodes. This one is from
// http://users.telenet.be/bertdobbelaere/SorterHunter/sorting_networks.html
// layer 1
v0 = Load(d, in + 0 * N);
v2 = Load(d, in + 2 * N);
SortLanesIn2Vectors<kOrder>(v0, v2);
v1 = Load(d, in + 1 * N);
v3 = Load(d, in + 3 * N);
SortLanesIn2Vectors<kOrder>(v1, v3);
// layer 2
SortLanesIn2Vectors<kOrder>(v0, v1);
SortLanesIn2Vectors<kOrder>(v2, v3);
// layer 3
SortLanesIn2Vectors<kOrder>(v1, v2);
}
// Inputs are vectors with columns in sorted order (from SortLanesIn4Vectors).
// Transposes so that output vectors are sorted quartets (128-bit blocks),
// and a quartet in v0 comes before its counterpart in v1, etc.
template <class D, class V = Vec<D>>
HWY_INLINE void Transpose4x4(D d, V& v0, V& v1, V& v2, V& v3) {
const RepartitionToWide<decltype(d)> dw;
// Input: first number is reg, second is lane (0 is lowest)
// 03 02 01 00 |
// 13 12 11 10 | columns are sorted
// 23 22 21 20 | (in this order)
// 33 32 31 30 V
const V t0 = InterleaveLower(d, v0, v1); // 11 01 10 00
const V t1 = InterleaveLower(d, v2, v3); // 31 21 30 20
const V t2 = InterleaveUpper(d, v0, v1); // 13 03 12 02
const V t3 = InterleaveUpper(d, v2, v3); // 33 23 32 22
// 30 20 10 00
v0 = BitCast(d, InterleaveLower(BitCast(dw, t0), BitCast(dw, t1)));
// 31 21 11 01
v1 = BitCast(d, InterleaveUpper(BitCast(dw, t0), BitCast(dw, t1)));
// 32 22 12 02
v2 = BitCast(d, InterleaveLower(BitCast(dw, t2), BitCast(dw, t3)));
// 33 23 13 03 --> sorted in descending order (03=smallest in lane 0).
v3 = BitCast(d, InterleaveUpper(BitCast(dw, t2), BitCast(dw, t3)));
}
// 12 ops (including 4 swizzle)
// Precondition: v0 and v1 are already sorted according to kOrder.
// Postcondition: concatenate(v0, v1) is sorted and v0 is the lower half.
template <SortOrder kOrder, class D, class V = Vec<D>>
HWY_INLINE void Merge2SortedQuartets(D d, V& v0, V& v1, int caller) {
#if HWY_SORT_VERIFY
const verify::Runs<D> input0 = verify::StoreDeinterleavedQuartets(d, v0);
const verify::Runs<D> input1 = verify::StoreDeinterleavedQuartets(d, v1);
input0.CheckSorted(kOrder, __LINE__, caller);
input1.CheckSorted(kOrder, __LINE__, caller);
#endif
// See figure 5 from https://www.vldb.org/pvldb/vol8/p1274-inoue.pdf.
// This requires 8 min/max vs 6 for bitonic merge (see Figure 2 in
// http://www.vldb.org/pvldb/vol1/1454171.pdf), but is faster overall because
// it needs less shuffling, and does not need a bitonic input.
SortLanesIn2Vectors<kOrder>(v0, v1);
v0 = Shuffle0321(v0);
SortLanesIn2Vectors<kOrder>(v0, v1);
v0 = Shuffle0321(v0);
SortLanesIn2Vectors<kOrder>(v0, v1);
v0 = Shuffle0321(v0);
SortLanesIn2Vectors<kOrder>(v0, v1);
v0 = Shuffle0321(v0);
#if HWY_SORT_VERIFY
auto output = verify::StoreDeinterleavedQuartets(d, v0, v1);
output.CheckSorted(kOrder, __LINE__, caller);
#endif
}
// ------------------------------ Bitonic merge (quartets)
// For the last layer of bitonic merge. Conditionally swaps even-numbered lanes
// with their odd-numbered neighbor. Works for both quartets and vectors.
template <SortOrder kOrder, class D>
HWY_INLINE void SortAdjacentLanesQV(D d, Vec<D>& q_or_v) {
(void)d;
// Optimization for 32-bit integers: swap via Shuffle and 64-bit Min/Max.
// (not worthwhile on SSE4/AVX2 because they lack 64-bit Min/Max)
#if !HWY_ARCH_X86 || HWY_TARGET <= HWY_AVX3
if (sizeof(TFromD<D>) == 4 && !IsFloat<TFromD<D>>()) {
const RepartitionToWide<decltype(d)> dw;
const auto wide = BitCast(dw, q_or_v);
const auto swap = BitCast(dw, Shuffle2301(q_or_v));
if (kOrder == SortOrder::kAscending) {
q_or_v = BitCast(d, Max(wide, swap));
} else {
q_or_v = BitCast(d, Min(wide, swap));
}
} else
#endif
{
Vec<D> swapped = Shuffle2301(q_or_v);
SortLanesIn2Vectors<kOrder>(q_or_v, swapped);
q_or_v = OddEven(swapped, q_or_v);
}
}
// Lane 0 with 2, 1 with 3 etc. Works for both quartets and vectors.
template <SortOrder kOrder, class D>
HWY_INLINE void SortDistance2LanesQV(D d, Vec<D>& q_or_v) {
const RepartitionToWide<decltype(d)> dw;
Vec<D> swapped = Shuffle1032(q_or_v);
SortLanesIn2Vectors<kOrder>(q_or_v, swapped);
q_or_v = BitCast(d, OddEven(BitCast(dw, swapped), BitCast(dw, q_or_v)));
}
// For all BitonicMerge*: for each block, the concatenation of the
// corresponding blocks from the first and second halves of the input vectors
// must be sorted in opposite orders.
// 14 ops (including 4 swizzle)
template <SortOrder kOrder, class D, class V = Vec<D>>
HWY_INLINE void BitonicMerge2Quartets(D d, V& q0, V& q1, int caller) {
#if HWY_SORT_VERIFY
const verify::Runs<D> input = verify::StoreDeinterleavedQuartets(d, q0, q1);
if (caller == -1) input.CheckBitonic(__LINE__, __LINE__);
#endif
// Layer 1: lane stride 4 (2 ops)
SortLanesIn2Vectors<kOrder>(q0, q1);
// Layer 2: lane stride 2 (6 ops)
SortDistance2LanesQV<kOrder>(d, q0);
SortDistance2LanesQV<kOrder>(d, q1);
// Layer 3: lane stride 1 (4 ops)
SortAdjacentLanesQV<kOrder>(d, q0);
SortAdjacentLanesQV<kOrder>(d, q1);
#if HWY_SORT_VERIFY
const verify::Runs<D> output = verify::StoreDeinterleavedQuartets(d, q0, q1);
output.CheckSorted(kOrder, __LINE__, caller);
#endif
}
// 32 ops, more efficient than three 4+4 merges (36 ops).
template <SortOrder kOrder, class D, class V = Vec<D>>
HWY_INLINE void BitonicMerge4Quartets(D d, V& q0, V& q1, V& q2, V& q3,
int caller) {
#if HWY_SORT_VERIFY
const verify::Runs<D> input =
verify::StoreDeinterleavedQuartets(d, q0, q1, q2, q3);
if (caller == -1) input.CheckBitonic(__LINE__, __LINE__);
#endif
// Layer 1: lane stride 8
SortLanesIn2Vectors<kOrder>(q0, q2);
SortLanesIn2Vectors<kOrder>(q1, q3);
// Layers 2 to 4
// Inputs are not fully sorted, so cannot use Merge2SortedQuartets.
BitonicMerge2Quartets<kOrder>(d, q0, q1, __LINE__);
BitonicMerge2Quartets<kOrder>(d, q2, q3, __LINE__);
#if HWY_SORT_VERIFY
const verify::Runs<D> output =
verify::StoreDeinterleavedQuartets(d, q0, q1, q2, q3);
output.CheckSorted(kOrder, __LINE__, caller);
#endif
}
// 72 ops.
template <SortOrder kOrder, class D, class V = Vec<D>>
HWY_INLINE void BitonicMerge8Quartets(D d, V& q0, V& q1, V& q2, V& q3, V& q4,
V& q5, V& q6, V& q7, int caller) {
#if HWY_SORT_VERIFY
const verify::Runs<D> input =
verify::StoreDeinterleavedQuartets(d, q0, q1, q2, q3, q4, q5, q6, q7);
if (caller == -1) input.CheckBitonic(__LINE__, __LINE__);
#endif
// Layer 1: lane stride 16
SortLanesIn2Vectors<kOrder>(q0, q4);
SortLanesIn2Vectors<kOrder>(q1, q5);
SortLanesIn2Vectors<kOrder>(q2, q6);
SortLanesIn2Vectors<kOrder>(q3, q7);
// Layers 2 to 5
BitonicMerge4Quartets<kOrder>(d, q0, q1, q2, q3, __LINE__);
BitonicMerge4Quartets<kOrder>(d, q4, q5, q6, q7, __LINE__);
#if HWY_SORT_VERIFY
const verify::Runs<D> output =
verify::StoreDeinterleavedQuartets(d, q0, q1, q2, q3, q4, q5, q6, q7);
output.CheckSorted(kOrder, __LINE__, caller);
#endif
}
// ------------------------------ Bitonic merge (vectors)
// Lane 0 with 4, 1 with 5 etc. Only used for vectors with at least 8 lanes.
#if HWY_TARGET <= HWY_AVX3
// TODO(janwas): move to op
template <typename T>
Vec512<T> Shuffle128_2020(Vec512<T> a, Vec512<T> b) {
return Vec512<T>{_mm512_shuffle_i32x4(a.raw, b.raw, _MM_SHUFFLE(2, 0, 2, 0))};
}
template <typename T>
Vec512<T> Shuffle128_3131(Vec512<T> a, Vec512<T> b) {
return Vec512<T>{_mm512_shuffle_i32x4(a.raw, b.raw, _MM_SHUFFLE(3, 1, 3, 1))};
}
template <typename T>
Vec512<T> Shuffle128_2301(Vec512<T> a, Vec512<T> b) {
return Vec512<T>{_mm512_shuffle_i32x4(a.raw, b.raw, _MM_SHUFFLE(2, 3, 0, 1))};
}
template <typename T>
Vec512<T> OddEven128(Vec512<T> odd, Vec512<T> even) {
return Vec512<T>{_mm512_mask_blend_epi64(__mmask8{0x33u}, odd.raw, even.raw)};
}
template <SortOrder kOrder, class T>
HWY_INLINE void SortDistance4LanesV(Simd<T, 16> d, Vec<decltype(d)>& v) {
// In: FEDCBA98 76543210
// Swap 128-bit halves of each 256 bits => BA98FEDC 32107654
Vec512<T> swapped = Shuffle128_2301(v, v);
SortLanesIn2Vectors<kOrder>(v, swapped);
v = OddEven128(swapped, v);
}
#endif
template <SortOrder kOrder, typename T>
HWY_INLINE void SortDistance4LanesV(Simd<T, 8> d, Vec<decltype(d)>& v) {
Vec<decltype(d)> swapped = ConcatLowerUpper(d, v, v);
SortLanesIn2Vectors<kOrder>(v, swapped);
v = ConcatUpperLower(swapped, v);
}
template <SortOrder kOrder, typename T>
HWY_INLINE void SortDistance4LanesV(Simd<T, 4> /* tag */, ...) {}
// Only used for vectors with at least 16 lanes.
template <SortOrder kOrder, class D>
HWY_INLINE void SortDistance8LanesV(D d, Vec<D>& v) {
Vec<D> swapped = ConcatLowerUpper(d, v, v);
SortLanesIn2Vectors<kOrder>(v, swapped);
v = ConcatUpperLower(swapped, v);
}
// 120 ops. Only used if vectors are at least 8 lanes.
template <SortOrder kOrder, class D, class V = Vec<D>>
HWY_INLINE void BitonicMergeTo64(D d, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
V& v6, V& v7, int caller) {
#if HWY_SORT_VERIFY
const verify::Runs<D> input =
verify::StoreVectors(d, v0, v1, v2, v3, v4, v5, v6, v7);
if (caller == -1) input.CheckBitonic(__LINE__, __LINE__);
#endif
// Layer 1: lane stride 32
SortLanesIn2Vectors<kOrder>(v0, v4);
SortLanesIn2Vectors<kOrder>(v1, v5);
SortLanesIn2Vectors<kOrder>(v2, v6);
SortLanesIn2Vectors<kOrder>(v3, v7);
// Layer 2: lane stride 16
SortLanesIn2Vectors<kOrder>(v0, v2);
SortLanesIn2Vectors<kOrder>(v1, v3);
SortLanesIn2Vectors<kOrder>(v4, v6);
SortLanesIn2Vectors<kOrder>(v5, v7);
// Layer 3: lane stride 8
SortLanesIn2Vectors<kOrder>(v0, v1);
SortLanesIn2Vectors<kOrder>(v2, v3);
SortLanesIn2Vectors<kOrder>(v4, v5);
SortLanesIn2Vectors<kOrder>(v6, v7);
// Layer 4: lane stride 4
SortDistance4LanesV<kOrder>(d, v0);
SortDistance4LanesV<kOrder>(d, v1);
SortDistance4LanesV<kOrder>(d, v2);
SortDistance4LanesV<kOrder>(d, v3);
SortDistance4LanesV<kOrder>(d, v4);
SortDistance4LanesV<kOrder>(d, v5);
SortDistance4LanesV<kOrder>(d, v6);
SortDistance4LanesV<kOrder>(d, v7);
// Layer 5: lane stride 2
SortDistance2LanesQV<kOrder>(d, v0);
SortDistance2LanesQV<kOrder>(d, v1);
SortDistance2LanesQV<kOrder>(d, v2);
SortDistance2LanesQV<kOrder>(d, v3);
SortDistance2LanesQV<kOrder>(d, v4);
SortDistance2LanesQV<kOrder>(d, v5);
SortDistance2LanesQV<kOrder>(d, v6);
SortDistance2LanesQV<kOrder>(d, v7);
// Layer 6: lane stride 1
SortAdjacentLanesQV<kOrder>(d, v0);
SortAdjacentLanesQV<kOrder>(d, v1);
SortAdjacentLanesQV<kOrder>(d, v2);
SortAdjacentLanesQV<kOrder>(d, v3);
SortAdjacentLanesQV<kOrder>(d, v4);
SortAdjacentLanesQV<kOrder>(d, v5);
SortAdjacentLanesQV<kOrder>(d, v6);
SortAdjacentLanesQV<kOrder>(d, v7);
#if HWY_SORT_VERIFY
const verify::Runs<D> output =
verify::StoreVectors(d, v0, v1, v2, v3, v4, v5, v6, v7);
output.CheckSorted(kOrder, __LINE__, caller);
#endif
}
// 60 ops. Only used if vectors are at least 16 lanes.
template <SortOrder kOrder, class D, class V = Vec<D>>
HWY_INLINE void BitonicMergeTo64(D d, V& v0, V& v1, V& v2, V& v3, int caller) {
#if HWY_SORT_VERIFY
const verify::Runs<D> input = verify::StoreVectors(d, v0, v1, v2, v3);
if (caller == -1) input.CheckBitonic(__LINE__, __LINE__);
#endif
// Layer 1: lane stride 32
SortLanesIn2Vectors<kOrder>(v0, v2);
SortLanesIn2Vectors<kOrder>(v1, v3);
// Layer 2: lane stride 16
SortLanesIn2Vectors<kOrder>(v0, v1);
SortLanesIn2Vectors<kOrder>(v2, v3);
// Layer 3: lane stride 8
SortDistance8LanesV<kOrder>(d, v0);
SortDistance8LanesV<kOrder>(d, v1);
SortDistance8LanesV<kOrder>(d, v2);
SortDistance8LanesV<kOrder>(d, v3);
// Layer 4: lane stride 4
SortDistance4LanesV<kOrder>(d, v0);
SortDistance4LanesV<kOrder>(d, v1);
SortDistance4LanesV<kOrder>(d, v2);
SortDistance4LanesV<kOrder>(d, v3);
// Layer 5: lane stride 2
SortDistance2LanesQV<kOrder>(d, v0);
SortDistance2LanesQV<kOrder>(d, v1);
SortDistance2LanesQV<kOrder>(d, v2);
SortDistance2LanesQV<kOrder>(d, v3);
// Layer 6: lane stride 1
SortAdjacentLanesQV<kOrder>(d, v0);
SortAdjacentLanesQV<kOrder>(d, v1);
SortAdjacentLanesQV<kOrder>(d, v2);
SortAdjacentLanesQV<kOrder>(d, v3);
#if HWY_SORT_VERIFY
const verify::Runs<D> output = verify::StoreVectors(d, v0, v1, v2, v3);
output.CheckSorted(kOrder, __LINE__, caller);
#endif
}
// 128 ops. Only used if vectors are at least 16 lanes.
template <SortOrder kOrder, class D, class V = Vec<D>>
HWY_INLINE void BitonicMergeTo128(D d, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
V& v6, V& v7, int caller) {
#if HWY_SORT_VERIFY
const verify::Runs<D> input =
verify::StoreVectors(d, v0, v1, v2, v3, v4, v5, v6, v7);
if (caller == -1) input.CheckBitonic(__LINE__, __LINE__);
#endif
// Layer 1: lane stride 64
SortLanesIn2Vectors<kOrder>(v0, v4);
SortLanesIn2Vectors<kOrder>(v1, v5);
SortLanesIn2Vectors<kOrder>(v2, v6);
SortLanesIn2Vectors<kOrder>(v3, v7);
BitonicMergeTo64<kOrder>(d, v0, v1, v2, v3, __LINE__);
BitonicMergeTo64<kOrder>(d, v4, v5, v6, v7, __LINE__);
#if HWY_SORT_VERIFY
const verify::Runs<D> output =
verify::StoreVectors(d, v0, v1, v2, v3, v4, v5, v6, v7);
output.CheckSorted(kOrder, __LINE__, caller);
#endif
}
// ------------------------------ Vector-length dependent
// Only called when N=4 (single block, so quartets can just be stored).
template <SortOrder kOrder, class D, class V>
HWY_API size_t SingleQuartetPerVector(D d, V& q0, V& q1, V& q2, V& q3, V& q4,
V& q5, V& q6, V& q7, TFromD<D>* inout) {
Store(q0, d, inout + 0 * 4);
Store(q1, d, inout + 1 * 4);
Store(q2, d, inout + 2 * 4);
Store(q3, d, inout + 3 * 4);
Store(q4, d, inout + 4 * 4);
Store(q5, d, inout + 5 * 4);
Store(q6, d, inout + 6 * 4);
Store(q7, d, inout + 7 * 4);
return 8 * 4;
}
// Only called when N=8.
template <SortOrder kOrder, class D, class V>
HWY_API size_t TwoQuartetsPerVector(D d, V& q0, V& q1, V& q2, V& q3, V& q4,
V& q5, V& q6, V& q7, TFromD<D>* inout) {
V v0 = ConcatLowerLower(d, q1, q0);
V v1 = ConcatLowerLower(d, q3, q2);
V v2 = ConcatLowerLower(d, q5, q4);
V v3 = ConcatLowerLower(d, q7, q6);
// TODO(janwas): merge into single table
V v4 = Reverse(d, ConcatUpperUpper(d, q7, q6));
V v5 = Reverse(d, ConcatUpperUpper(d, q5, q4));
V v6 = Reverse(d, ConcatUpperUpper(d, q3, q2));
V v7 = Reverse(d, ConcatUpperUpper(d, q1, q0));
detail::BitonicMergeTo64<kOrder>(d, v0, v1, v2, v3, v4, v5, v6, v7, -1);
Store(v0, d, inout + 0 * 8);
Store(v1, d, inout + 1 * 8);
Store(v2, d, inout + 2 * 8);
Store(v3, d, inout + 3 * 8);
Store(v4, d, inout + 4 * 8);
Store(v5, d, inout + 5 * 8);
Store(v6, d, inout + 6 * 8);
Store(v7, d, inout + 7 * 8);
return 8 * 8;
}
// Only called when N=16.
template <SortOrder kOrder, typename T, class V>
HWY_API size_t FourQuartetsPerVector(Simd<T, 16> d, V& q0, V& q1, V& q2, V& q3,
V& q4, V& q5, V& q6, V& q7, T* inout) {
const V q11_01_10_00 = Shuffle128_2020(q0, q1);
const V q13_03_12_02 = Shuffle128_2020(q2, q3);
V v0 = Shuffle128_2020(q11_01_10_00, q13_03_12_02); // 3..0
const V q15_05_14_04 = Shuffle128_2020(q4, q5);
const V q17_07_16_06 = Shuffle128_2020(q6, q7);
V v1 = Shuffle128_2020(q15_05_14_04, q17_07_16_06); // 7..4
const V q19_09_18_08 = Shuffle128_3131(q0, q1);
const V q1b_0b_1a_0a = Shuffle128_3131(q2, q3);
V v3 = Reverse(d, Shuffle128_2020(q19_09_18_08, q1b_0b_1a_0a)); // b..8
const V q1d_0d_1c_0c = Shuffle128_3131(q4, q5);
const V q1f_0f_1e_0e = Shuffle128_3131(q6, q7);
V v2 = Reverse(d, Shuffle128_2020(q1d_0d_1c_0c, q1f_0f_1e_0e)); // f..c
detail::BitonicMergeTo64<kOrder>(d, v0, v1, v2, v3, -1);
// TODO(janwas): merge into single table
V v4 = Shuffle128_3131(q11_01_10_00, q13_03_12_02); // 13..10
V v5 = Shuffle128_3131(q15_05_14_04, q17_07_16_06); // 17..14
V v7 = Reverse(d, Shuffle128_3131(q19_09_18_08, q1b_0b_1a_0a)); // 1b..18
V v6 = Reverse(d, Shuffle128_3131(q1d_0d_1c_0c, q1f_0f_1e_0e)); // 1f..1c
detail::BitonicMergeTo64<Reverse(kOrder)>(d, v4, v5, v6, v7, -1);
detail::BitonicMergeTo128<kOrder>(d, v0, v1, v2, v3, v4, v5, v6, v7, -1);
Store(v0, d, inout + 0 * 16);
Store(v1, d, inout + 1 * 16);
Store(v2, d, inout + 2 * 16);
Store(v3, d, inout + 3 * 16);
Store(v4, d, inout + 4 * 16);
Store(v5, d, inout + 5 * 16);
Store(v6, d, inout + 6 * 16);
Store(v7, d, inout + 7 * 16);
return 8 * 16;
}
// Avoid needing #if at the call sites.
template <SortOrder kOrder, typename T>
HWY_API size_t TwoQuartetsPerVector(Simd<T, 4> /* tag */, ...) {
return 0;
}
template <SortOrder kOrder, typename T>
HWY_API size_t FourQuartetsPerVector(Simd<T, 4> /* tag */, ...) {
return 0;
}
template <SortOrder kOrder, typename T>
HWY_API size_t FourQuartetsPerVector(Simd<T, 8> /* tag */, ...) {
return 0;
}
} // namespace detail
template <class D>
HWY_API size_t SortBatchSize(D d) {
const size_t N = Lanes(d);
if (N == 4) return 32;
if (N == 8) return 64;
if (N == 16) return 128;
return 0;
}
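// In other words, a batch is always 8 vectors of N lanes (8 * N values);
// other vector lengths are not supported by this sorter and return 0.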
template <SortOrder kOrder, class D>
HWY_API size_t SortBatch(D d, TFromD<D>* inout) {
const size_t N = Lanes(d);
Vec<D> q0, q1, q2, q3;
detail::SortLanesIn4Vectors<kOrder>(d, inout, q0, q1, q2, q3);
detail::Transpose4x4(d, q0, q1, q2, q3);
detail::Merge2SortedQuartets<kOrder>(d, q0, q1, -1);
detail::Merge2SortedQuartets<kOrder>(d, q2, q3, -1);
// Bitonic merges require one input to be in reverse order.
constexpr SortOrder kReverse = Reverse(kOrder);
Vec<D> q4, q5, q6, q7;
detail::SortLanesIn4Vectors<kReverse>(d, inout + 4 * N, q4, q5, q6, q7);
detail::Transpose4x4(d, q4, q5, q6, q7);
detail::Merge2SortedQuartets<kReverse>(d, q4, q5, -1);
detail::Merge2SortedQuartets<kReverse>(d, q6, q7, -1);
detail::BitonicMerge4Quartets<kOrder>(d, q0, q1, q4, q5, -1);
detail::BitonicMerge4Quartets<kReverse>(d, q2, q3, q6, q7, -1);
detail::BitonicMerge8Quartets<kOrder>(d, q0, q1, q4, q5, q2, q3, q6, q7,
__LINE__);
if (N == 4) {
return detail::SingleQuartetPerVector<kOrder>(d, q0, q1, q4, q5, q2, q3, q6,
q7, inout);
}
if (N == 8) {
return detail::TwoQuartetsPerVector<kOrder>(d, q0, q1, q4, q5, q2, q3, q6,
q7, inout);
}
return detail::FourQuartetsPerVector<kOrder>(d, q0, q1, q4, q5, q2, q3, q6,
q7, inout);
}
#else
// Avoids unused attribute warning
template <SortOrder kOrder, class D>
HWY_API size_t SortBatch(D /* tag */, TFromD<D>* /* inout */) {
return 0;
}
#endif // HWY_TARGET != HWY_SCALAR && HWY_ARCH_X86
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#endif // HIGHWAY_HWY_CONTRIB_SORT_SORT_INL_H_

188
third_party/highway/hwy/contrib/sort/sort_test.cc vendored Normal file

@ -0,0 +1,188 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/sort_test.cc"
#include "hwy/foreach_target.h"
#include "hwy/contrib/sort/sort-inl.h"
#include "hwy/tests/test_util-inl.h"
// clang-format on
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
#if HWY_TARGET != HWY_SCALAR && HWY_ARCH_X86
template <class D>
size_t K(D d) {
return SortBatchSize(d);
}
template <SortOrder kOrder, class D>
void Validate(D d, const TFromD<D>* in, const TFromD<D>* out) {
const size_t N = Lanes(d);
// Ensure it matches the sort order
for (size_t i = 0; i < K(d) - 1; ++i) {
if (!verify::Compare(out[i], out[i + 1], kOrder)) {
printf("range=%" PRIu64 " lane=%" PRIu64 " N=%" PRIu64 " %.0f %.0f\n\n",
static_cast<uint64_t>(i), static_cast<uint64_t>(i),
static_cast<uint64_t>(N), static_cast<float>(out[i + 0]),
static_cast<float>(out[i + 1]));
for (size_t i = 0; i < K(d); ++i) {
printf("%.0f\n", static_cast<float>(out[i]));
}
printf("\n\nin was:\n");
for (size_t i = 0; i < K(d); ++i) {
printf("%.0f\n", static_cast<float>(in[i]));
}
fflush(stdout);
HWY_ABORT("Sort is incorrect");
}
}
// Also verify sums match (detects duplicated/lost values)
double expected_sum = 0.0;
double actual_sum = 0.0;
for (size_t i = 0; i < K(d); ++i) {
expected_sum += in[i];
actual_sum += out[i];
}
if (expected_sum != actual_sum) {
for (size_t i = 0; i < K(d); ++i) {
printf("%.0f %.0f\n", static_cast<float>(in[i]),
static_cast<float>(out[i]));
}
HWY_ABORT("Mismatch");
}
}
class TestReverse {
template <SortOrder kOrder, class D>
void TestOrder(D d, RandomState& /* rng */) {
using T = TFromD<D>;
const size_t N = Lanes(d);
HWY_ASSERT((N % 4) == 0);
auto in = AllocateAligned<T>(K(d));
auto inout = AllocateAligned<T>(K(d));
const size_t expected_size = SortBatchSize(d);
for (size_t i = 0; i < K(d); ++i) {
in[i] = static_cast<T>(K(d) - i);
inout[i] = in[i];
}
const size_t actual_size = SortBatch<kOrder>(d, inout.get());
HWY_ASSERT_EQ(expected_size, actual_size);
Validate<kOrder>(d, in.get(), inout.get());
}
public:
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
RandomState rng;
TestOrder<SortOrder::kAscending>(d, rng);
TestOrder<SortOrder::kDescending>(d, rng);
}
};
void TestAllReverse() {
TestReverse test;
test(int32_t(), CappedTag<int32_t, 16>());
test(uint32_t(), CappedTag<uint32_t, 16>());
}
class TestRanges {
template <SortOrder kOrder, class D>
void TestOrder(D d, RandomState& rng) {
using T = TFromD<D>;
const size_t N = Lanes(d);
HWY_ASSERT((N % 4) == 0);
auto in = AllocateAligned<T>(K(d));
auto inout = AllocateAligned<T>(K(d));
const size_t expected_size = SortBatchSize(d);
// For each range, try all 0/1 combinations and set any other lanes to
// random inputs.
constexpr size_t kRange = 8;
for (size_t range = 0; range < K(d); range += kRange) {
for (size_t bits = 0; bits < (1ull << kRange); ++bits) {
// First set all to random, will later overwrite those for `range`
for (size_t i = 0; i < K(d); ++i) {
in[i] = inout[i] = static_cast<T>(Random32(&rng) & 0xFF);
}
// Now set the current combination of {0,1} for elements in the range.
// This is sufficient to establish correctness (arbitrary inputs could
// be mapped to 0/1 with a comparison predicate).
for (size_t i = 0; i < kRange; ++i) {
in[range + i] = inout[range + i] = (bits >> i) & 1;
}
const size_t actual_size = SortBatch<kOrder>(d, inout.get());
HWY_ASSERT_EQ(expected_size, actual_size);
Validate<kOrder>(d, in.get(), inout.get());
}
}
}
public:
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
RandomState rng;
TestOrder<SortOrder::kAscending>(d, rng);
TestOrder<SortOrder::kDescending>(d, rng);
}
};
void TestAllRanges() {
TestRanges test;
test(int32_t(), CappedTag<int32_t, 16>());
test(uint32_t(), CappedTag<uint32_t, 16>());
}
#else
void TestAllReverse() {}
void TestAllRanges() {}
#endif // HWY_TARGET != HWY_SCALAR && HWY_ARCH_X86
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
HWY_BEFORE_TEST(SortTest);
HWY_EXPORT_AND_TEST_P(SortTest, TestAllReverse);
HWY_EXPORT_AND_TEST_P(SortTest, TestAllRanges);
} // namespace hwy
// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
#endif

194
third_party/highway/hwy/detect_compiler_arch.h vendored Normal file

@ -0,0 +1,194 @@
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef HIGHWAY_HWY_DETECT_COMPILER_ARCH_H_
#define HIGHWAY_HWY_DETECT_COMPILER_ARCH_H_
// Detects compiler and arch from predefined macros. Zero dependencies for
// inclusion by foreach_target.h.
// Add to #if conditions to prevent IDE from graying out code.
#if (defined __CDT_PARSER__) || (defined __INTELLISENSE__) || \
(defined Q_CREATOR_RUN) || (defined(__CLANGD__))
#define HWY_IDE 1
#else
#define HWY_IDE 0
#endif
//------------------------------------------------------------------------------
// Compiler
// clang-cl defines _MSC_VER but does not behave like MSVC in other respects,
// such as the behavior relied on in HWY_DIAGNOSTICS(). We therefore also check
// that we are not Clang.
#if defined(_MSC_VER) && !defined(__clang__)
#define HWY_COMPILER_MSVC _MSC_VER
#else
#define HWY_COMPILER_MSVC 0
#endif
#ifdef __INTEL_COMPILER
#define HWY_COMPILER_ICC __INTEL_COMPILER
#else
#define HWY_COMPILER_ICC 0
#endif
#ifdef __GNUC__
#define HWY_COMPILER_GCC (__GNUC__ * 100 + __GNUC_MINOR__)
#else
#define HWY_COMPILER_GCC 0
#endif
// Clang can masquerade as MSVC/GCC, in which case both are set.
#ifdef __clang__
#ifdef __APPLE__
// Apple LLVM version is unrelated to the actual Clang version, which we need
// for enabling workarounds. Use the presence of warning flags to deduce it.
// Adapted from https://github.com/simd-everywhere/simde/ simde-detect-clang.h.
#if __has_warning("-Wformat-insufficient-args")
#define HWY_COMPILER_CLANG 1200
#elif __has_warning("-Wimplicit-const-int-float-conversion")
#define HWY_COMPILER_CLANG 1100
#elif __has_warning("-Wmisleading-indentation")
#define HWY_COMPILER_CLANG 1000
#elif defined(__FILE_NAME__)
#define HWY_COMPILER_CLANG 900
#elif __has_warning("-Wextra-semi-stmt") || \
__has_builtin(__builtin_rotateleft32)
#define HWY_COMPILER_CLANG 800
#elif __has_warning("-Wc++98-compat-extra-semi")
#define HWY_COMPILER_CLANG 700
#else // Anything older than 7.0 is not recommended for Highway.
#define HWY_COMPILER_CLANG 600
#endif // __has_warning chain
#else // Non-Apple: normal version
#define HWY_COMPILER_CLANG (__clang_major__ * 100 + __clang_minor__)
#endif
#else // Not clang
#define HWY_COMPILER_CLANG 0
#endif
// More than one may be nonzero, but we want at least one.
#if !HWY_COMPILER_MSVC && !HWY_COMPILER_ICC && !HWY_COMPILER_GCC && \
!HWY_COMPILER_CLANG
#error "Unsupported compiler"
#endif
#ifdef __has_builtin
#define HWY_HAS_BUILTIN(name) __has_builtin(name)
#else
#define HWY_HAS_BUILTIN(name) 0
#endif
#ifdef __has_attribute
#define HWY_HAS_ATTRIBUTE(name) __has_attribute(name)
#else
#define HWY_HAS_ATTRIBUTE(name) 0
#endif
#ifdef __has_feature
#define HWY_HAS_FEATURE(name) __has_feature(name)
#else
#define HWY_HAS_FEATURE(name) 0
#endif
//------------------------------------------------------------------------------
// Architecture
#if defined(HWY_EMULATE_SVE)
#define HWY_ARCH_X86_32 0
#define HWY_ARCH_X86_64 0
#define HWY_ARCH_X86 0
#define HWY_ARCH_PPC 0
#define HWY_ARCH_ARM_A64 1
#define HWY_ARCH_ARM_V7 0
#define HWY_ARCH_ARM 1
#define HWY_ARCH_WASM 0
#define HWY_ARCH_RVV 0
#else
#if defined(__i386__) || defined(_M_IX86)
#define HWY_ARCH_X86_32 1
#else
#define HWY_ARCH_X86_32 0
#endif
#if defined(__x86_64__) || defined(_M_X64)
#define HWY_ARCH_X86_64 1
#else
#define HWY_ARCH_X86_64 0
#endif
#if HWY_ARCH_X86_32 && HWY_ARCH_X86_64
#error "Cannot have both x86-32 and x86-64"
#endif
#if HWY_ARCH_X86_32 || HWY_ARCH_X86_64
#define HWY_ARCH_X86 1
#else
#define HWY_ARCH_X86 0
#endif
#if defined(__powerpc64__) || defined(_M_PPC)
#define HWY_ARCH_PPC 1
#else
#define HWY_ARCH_PPC 0
#endif
#if defined(__ARM_ARCH_ISA_A64) || defined(__aarch64__) || defined(_M_ARM64)
#define HWY_ARCH_ARM_A64 1
#else
#define HWY_ARCH_ARM_A64 0
#endif
#if defined(__arm__) || defined(_M_ARM)
#define HWY_ARCH_ARM_V7 1
#else
#define HWY_ARCH_ARM_V7 0
#endif
#if HWY_ARCH_ARM_A64 && HWY_ARCH_ARM_V7
#error "Cannot have both A64 and V7"
#endif
#if HWY_ARCH_ARM_A64 || HWY_ARCH_ARM_V7
#define HWY_ARCH_ARM 1
#else
#define HWY_ARCH_ARM 0
#endif
#if defined(__EMSCRIPTEN__) || defined(__wasm__) || defined(__WASM__)
#define HWY_ARCH_WASM 1
#else
#define HWY_ARCH_WASM 0
#endif
#ifdef __riscv
#define HWY_ARCH_RVV 1
#else
#define HWY_ARCH_RVV 0
#endif
#endif // defined(HWY_EMULATE_SVE)
// It is an error to detect multiple architectures at the same time, but OK to
// detect none of the above.
#if (HWY_ARCH_X86 + HWY_ARCH_PPC + HWY_ARCH_ARM + HWY_ARCH_WASM + \
HWY_ARCH_RVV) > 1
#error "Must not detect more than one architecture"
#endif
#endif // HIGHWAY_HWY_DETECT_COMPILER_ARCH_H_

392
third_party/highway/hwy/detect_targets.h vendored Normal file

@ -0,0 +1,392 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef HIGHWAY_HWY_DETECT_TARGETS_H_
#define HIGHWAY_HWY_DETECT_TARGETS_H_
// Defines targets and chooses which to enable.
#include "hwy/detect_compiler_arch.h"
//------------------------------------------------------------------------------
// Optional configuration
// See ../quick_reference.md for documentation of these macros.
// Uncomment to override the default baseline determined from predefined macros:
// #define HWY_BASELINE_TARGETS (HWY_SSE4 | HWY_SCALAR)
// Uncomment to override the default blocklist:
// #define HWY_BROKEN_TARGETS HWY_AVX3
// Uncomment to definitely avoid generating those target(s):
// #define HWY_DISABLED_TARGETS HWY_SSE4
// Uncomment to avoid emitting BMI/BMI2/FMA instructions (allows generating
// AVX2 target for VMs which support AVX2 but not the other instruction sets)
// #define HWY_DISABLE_BMI2_FMA
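// Because the macros below are only defined if not already set, these
// overrides can equivalently be passed as compiler definitions, e.g.
// (hypothetical build flags) -DHWY_DISABLED_TARGETS=HWY_SSE4 or
// -DHWY_DISABLE_BMI2_FMA, instead of editing this header.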
//------------------------------------------------------------------------------
// Targets
// Unique bit value for each target. A lower value is "better" (e.g. more lanes)
// than a higher value within the same group/platform - see HWY_STATIC_TARGET.
//
// All values are unconditionally defined so we can test HWY_TARGETS without
// first checking the HWY_ARCH_*.
//
// The C99 preprocessor evaluates #if expressions using intmax_t types, so we
// can use 32-bit literals.
// 1,2: reserved
// Currently satisfiable by Ice Lake (VNNI, VPCLMULQDQ, VBMI2, VAES). Later to
// be added: BF16 (Cooper Lake). VP2INTERSECT is only in Tiger Lake? We do not
// yet have uses for VBMI, VPOPCNTDQ, BITALG, GFNI.
#define HWY_AVX3_DL 4 // see HWY_WANT_AVX3_DL below
#define HWY_AVX3 8
#define HWY_AVX2 16
// 32: reserved for AVX
#define HWY_SSE4 64
#define HWY_SSSE3 128
// 0x100, 0x200: reserved for SSE3, SSE2
// The highest bit in the HWY_TARGETS mask that a x86 target can have. Used for
// dynamic dispatch. All x86 target bits must be lower or equal to
// (1 << HWY_HIGHEST_TARGET_BIT_X86) and they can only use
// HWY_MAX_DYNAMIC_TARGETS in total.
#define HWY_HIGHEST_TARGET_BIT_X86 9
#define HWY_SVE2 0x400
#define HWY_SVE 0x800
// 0x1000 reserved for Helium
#define HWY_NEON 0x2000
#define HWY_HIGHEST_TARGET_BIT_ARM 13
// 0x4000, 0x8000 reserved
#define HWY_PPC8 0x10000 // v2.07 or 3
// 0x20000, 0x40000 reserved for prior VSX/AltiVec
#define HWY_HIGHEST_TARGET_BIT_PPC 18
#define HWY_WASM2 0x80000 // Experimental
#define HWY_WASM 0x100000
#define HWY_HIGHEST_TARGET_BIT_WASM 20
// 0x200000, 0x400000, 0x800000 reserved
#define HWY_RVV 0x1000000
#define HWY_HIGHEST_TARGET_BIT_RVV 24
// 0x2000000, 0x4000000, 0x8000000, 0x10000000 reserved
#define HWY_SCALAR 0x20000000
#define HWY_HIGHEST_TARGET_BIT_SCALAR 29
// Cannot use higher values, otherwise HWY_TARGETS computation might overflow.
//------------------------------------------------------------------------------
// Set default blocklists
// Disabled means excluded from the enabled targets at the user's request. A
// separate config macro allows disabling targets without deactivating the
// blocklist below.
#ifndef HWY_DISABLED_TARGETS
#define HWY_DISABLED_TARGETS 0
#endif
// Broken means excluded from the enabled targets due to known compiler issues.
// The user may override this blocklist, but without any guarantee of success.
#ifndef HWY_BROKEN_TARGETS
// x86 clang-6: we saw multiple AVX2/3 compile errors and in one case invalid
// SSE4 codegen (possibly only for msan), so disable all those targets.
#if HWY_ARCH_X86 && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
#define HWY_BROKEN_TARGETS (HWY_SSE4 | HWY_AVX2 | HWY_AVX3 | HWY_AVX3_DL)
// This entails a major speed reduction, so warn unless the user explicitly
// opts in to scalar-only.
#if !defined(HWY_COMPILE_ONLY_SCALAR)
#pragma message("x86 Clang <= 6: define HWY_COMPILE_ONLY_SCALAR or upgrade.")
#endif
// 32-bit may fail to compile AVX2/3.
#elif HWY_ARCH_X86_32
#define HWY_BROKEN_TARGETS (HWY_AVX2 | HWY_AVX3 | HWY_AVX3_DL)
// MSVC AVX3 support is buggy: https://github.com/Mysticial/Flops/issues/16
#elif HWY_COMPILER_MSVC != 0
#define HWY_BROKEN_TARGETS (HWY_AVX3 | HWY_AVX3_DL)
// armv7be has not been tested and is not yet supported.
#elif HWY_ARCH_ARM_V7 && \
(defined(__ARM_BIG_ENDIAN) || \
(defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN))
#define HWY_BROKEN_TARGETS (HWY_NEON)
// SVE[2] require recent clang or gcc versions.
#elif (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1100) ||\
(!HWY_COMPILER_CLANG && HWY_COMPILER_GCC && HWY_COMPILER_GCC < 1000)
#define HWY_BROKEN_TARGETS (HWY_SVE | HWY_SVE2)
#else
#define HWY_BROKEN_TARGETS 0
#endif
#endif // HWY_BROKEN_TARGETS
// Enabled means neither disabled nor blocklisted.
#define HWY_ENABLED(targets) \
((targets) & ~((HWY_DISABLED_TARGETS) | (HWY_BROKEN_TARGETS)))
//------------------------------------------------------------------------------
// Detect baseline targets using predefined macros
// Baseline means the targets for which the compiler is allowed to generate
// instructions, implying the target CPU would have to support them. Do not use
// this directly because it does not take the blocklist into account. Allow the
// user to override this without any guarantee of success.
#ifndef HWY_BASELINE_TARGETS
#if defined(HWY_EMULATE_SVE)
#define HWY_BASELINE_TARGETS HWY_SVE // does not support SVE2
#define HWY_BASELINE_AVX3_DL 0
#else
// Also check HWY_ARCH to ensure that simulating unknown platforms ends up with
// HWY_TARGET == HWY_SCALAR.
#if HWY_ARCH_WASM && defined(__wasm_simd128__)
#if defined(HWY_WANT_WASM2)
#define HWY_BASELINE_WASM HWY_WASM2
#else
#define HWY_BASELINE_WASM HWY_WASM
#endif // HWY_WANT_WASM2
#else
#define HWY_BASELINE_WASM 0
#endif
// Avoid choosing the PPC target until we have an implementation.
#if HWY_ARCH_PPC && defined(__VSX__) && 0
#define HWY_BASELINE_PPC8 HWY_PPC8
#else
#define HWY_BASELINE_PPC8 0
#endif
// SVE compiles, but is not yet tested.
#if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE2)
#define HWY_BASELINE_SVE2 HWY_SVE2
#else
#define HWY_BASELINE_SVE2 0
#endif
#if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE)
#define HWY_BASELINE_SVE HWY_SVE
#else
#define HWY_BASELINE_SVE 0
#endif
// GCC 4.5.4 only defines __ARM_NEON__; 5.4 defines both.
#if HWY_ARCH_ARM && (defined(__ARM_NEON__) || defined(__ARM_NEON))
#define HWY_BASELINE_NEON HWY_NEON
#else
#define HWY_BASELINE_NEON 0
#endif
// Special handling for MSVC because it has fewer predefined macros
#if HWY_COMPILER_MSVC && !HWY_COMPILER_CLANG
// We can only be sure SSSE3/SSE4 are enabled if AVX is
// (https://stackoverflow.com/questions/18563978/)
#if defined(__AVX__)
#define HWY_CHECK_SSSE3 1
#define HWY_CHECK_SSE4 1
#else
#define HWY_CHECK_SSSE3 0
#define HWY_CHECK_SSE4 0
#endif
// Cannot check for PCLMUL/AES and BMI2/FMA/F16C individually; we assume
// PCLMUL/AES are available if SSE4 is, and BMI2/FMA/F16C if AVX2 is.
#define HWY_CHECK_PCLMUL_AES 1
#define HWY_CHECK_BMI2_FMA 1
#define HWY_CHECK_F16C 1
#else // non-MSVC
#if defined(__SSSE3__)
#define HWY_CHECK_SSSE3 1
#else
#define HWY_CHECK_SSSE3 0
#endif
#if defined(__SSE4_1__) && defined(__SSE4_2__)
#define HWY_CHECK_SSE4 1
#else
#define HWY_CHECK_SSE4 0
#endif
// If these are disabled, they should not gate the availability of SSE4/AVX2.
#if defined(HWY_DISABLE_PCLMUL_AES) || (defined(__PCLMUL__) && defined(__AES__))
#define HWY_CHECK_PCLMUL_AES 1
#else
#define HWY_CHECK_PCLMUL_AES 0
#endif
#if defined(HWY_DISABLE_BMI2_FMA) || (defined(__BMI2__) && defined(__FMA__))
#define HWY_CHECK_BMI2_FMA 1
#else
#define HWY_CHECK_BMI2_FMA 0
#endif
#if defined(HWY_DISABLE_F16C) || defined(__F16C__)
#define HWY_CHECK_F16C 1
#else
#define HWY_CHECK_F16C 0
#endif
#endif // non-MSVC
#if HWY_ARCH_X86 && HWY_CHECK_SSSE3
#define HWY_BASELINE_SSSE3 HWY_SSSE3
#else
#define HWY_BASELINE_SSSE3 0
#endif
#if HWY_ARCH_X86 && HWY_CHECK_SSE4 && HWY_CHECK_PCLMUL_AES
#define HWY_BASELINE_SSE4 HWY_SSE4
#else
#define HWY_BASELINE_SSE4 0
#endif
#if HWY_BASELINE_SSE4 != 0 && HWY_CHECK_BMI2_FMA && HWY_CHECK_F16C && \
defined(__AVX2__)
#define HWY_BASELINE_AVX2 HWY_AVX2
#else
#define HWY_BASELINE_AVX2 0
#endif
// Require everything in AVX2 plus AVX-512 flags (also set by MSVC)
#if HWY_BASELINE_AVX2 != 0 && defined(__AVX512F__) && defined(__AVX512BW__) && \
defined(__AVX512DQ__) && defined(__AVX512VL__)
#define HWY_BASELINE_AVX3 HWY_AVX3
#else
#define HWY_BASELINE_AVX3 0
#endif
// TODO(janwas): not yet known whether these will be set by MSVC
#if HWY_BASELINE_AVX3 != 0 && defined(__AVXVNNI__) && defined(__VAES__) && \
defined(__VPCLMULQDQ__)
#define HWY_BASELINE_AVX3_DL HWY_AVX3_DL
#else
#define HWY_BASELINE_AVX3_DL 0
#endif
#if HWY_ARCH_RVV && defined(__riscv_vector)
#define HWY_BASELINE_RVV HWY_RVV
#else
#define HWY_BASELINE_RVV 0
#endif
#define HWY_BASELINE_TARGETS \
(HWY_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 | HWY_BASELINE_SVE2 | \
HWY_BASELINE_SVE | HWY_BASELINE_NEON | HWY_BASELINE_SSSE3 | \
HWY_BASELINE_SSE4 | HWY_BASELINE_AVX2 | HWY_BASELINE_AVX3 | \
HWY_BASELINE_AVX3_DL | HWY_BASELINE_RVV)
#endif // HWY_EMULATE_SVE
#else
// User already defined HWY_BASELINE_TARGETS, but we still need to define
// HWY_BASELINE_AVX3_DL (matching the user's definition) for HWY_CHECK_AVX3_DL.
#define HWY_BASELINE_AVX3_DL (HWY_BASELINE_TARGETS & HWY_AVX3_DL)
#endif // HWY_BASELINE_TARGETS
//------------------------------------------------------------------------------
// Choose target for static dispatch
#define HWY_ENABLED_BASELINE HWY_ENABLED(HWY_BASELINE_TARGETS)
#if HWY_ENABLED_BASELINE == 0
#error "At least one baseline target must be defined and enabled"
#endif
// Best baseline, used for static dispatch. This is the least-significant 1-bit
// within HWY_ENABLED_BASELINE and lower bit values imply "better".
#define HWY_STATIC_TARGET (HWY_ENABLED_BASELINE & -HWY_ENABLED_BASELINE)
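// Worked example: if HWY_ENABLED_BASELINE were (HWY_AVX2 | HWY_SCALAR), i.e.
// 16 | 0x20000000, then x & -x isolates the lowest set bit and yields 16, so
// HWY_STATIC_TARGET would be HWY_AVX2, the "best" enabled baseline target.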
// Start by assuming static dispatch. If we later use dynamic dispatch, this
// will be defined to other targets during the multiple-inclusion, and finally
// return to the initial value. Defining this outside begin/end_target ensures
// inl headers successfully compile by themselves (required by Bazel).
#define HWY_TARGET HWY_STATIC_TARGET
//------------------------------------------------------------------------------
// Choose targets for dynamic dispatch according to one of four policies
#if (defined(HWY_COMPILE_ONLY_SCALAR) + defined(HWY_COMPILE_ONLY_STATIC) + \
defined(HWY_COMPILE_ALL_ATTAINABLE)) > 1
#error "Invalid config: can only define a single policy for targets"
#endif
// Further to checking for disabled/broken targets, we only use AVX3_DL after
// explicit opt-in (via this macro OR baseline compiler flags) to avoid
// generating a codepath which is only helpful if the app uses AVX3_DL features.
#if defined(HWY_WANT_AVX3_DL)
#define HWY_CHECK_AVX3_DL HWY_AVX3_DL
#else
#define HWY_CHECK_AVX3_DL HWY_BASELINE_AVX3_DL
#endif
// Attainable means enabled and the compiler allows intrinsics (even when not
// allowed to autovectorize). Used in 3 and 4.
#if HWY_ARCH_X86
#define HWY_ATTAINABLE_TARGETS \
HWY_ENABLED(HWY_SCALAR | HWY_SSSE3 | HWY_SSE4 | HWY_AVX2 | HWY_AVX3 | \
HWY_CHECK_AVX3_DL)
#else
#define HWY_ATTAINABLE_TARGETS HWY_ENABLED_BASELINE
#endif
// 1) For older compilers: disable all SIMD (could also set HWY_DISABLED_TARGETS
// to ~HWY_SCALAR, but this is more explicit).
#if defined(HWY_COMPILE_ONLY_SCALAR)
#undef HWY_STATIC_TARGET
#define HWY_STATIC_TARGET HWY_SCALAR // override baseline
#define HWY_TARGETS HWY_SCALAR
// 2) For forcing static dispatch without code changes (removing HWY_EXPORT)
#elif defined(HWY_COMPILE_ONLY_STATIC)
#define HWY_TARGETS HWY_STATIC_TARGET
// 3) For tests: include all attainable targets (in particular: scalar)
#elif defined(HWY_COMPILE_ALL_ATTAINABLE) || defined(HWY_IS_TEST)
#define HWY_TARGETS HWY_ATTAINABLE_TARGETS
// 4) Default: attainable WITHOUT non-best baseline. This reduces code size by
// excluding superseded targets, in particular scalar.
#else
#define HWY_TARGETS (HWY_ATTAINABLE_TARGETS & (2 * HWY_STATIC_TARGET - 1))
#endif // target policy
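The default policy above relies on the same ordering: because better targets live in lower bits, (2 * HWY_STATIC_TARGET - 1) is a mask covering the static target and every lower (better) bit, so ANDing with it drops targets superseded by the baseline. A small worked example, again with hypothetical bit values:
#include <cassert>
#include <cstdint>

int main() {
  // Hypothetical bit assignments: a lower bit value means a better target.
  const uint32_t kStatic = 1u << 6;  // stand-in for HWY_STATIC_TARGET
  const uint32_t kAttainable = (1u << 2) | (1u << 6) | (1u << 12);
  // Mask of kStatic plus all better (lower) bits: bits 0..6.
  const uint32_t mask = 2 * kStatic - 1;
  const uint32_t dynamic_targets = kAttainable & mask;
  // The superseded target at bit 12 is excluded; bits 2 and 6 remain.
  assert(dynamic_targets == ((1u << 2) | (1u << 6)));
  return 0;
}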
// HWY_ONCE and the multiple-inclusion mechanism rely on HWY_STATIC_TARGET being
// one of the dynamic targets. This also implies HWY_TARGETS != 0 and
// (HWY_TARGETS & HWY_ENABLED_BASELINE) != 0.
#if (HWY_TARGETS & HWY_STATIC_TARGET) == 0
#error "Logic error: best baseline should be included in dynamic targets"
#endif
#endif // HIGHWAY_HWY_DETECT_TARGETS_H_

third_party/highway/hwy/examples/benchmark.cc (vendored, 27 changed lines)

@ -16,7 +16,9 @@
#define HWY_TARGET_INCLUDE "hwy/examples/benchmark.cc"
#include "hwy/foreach_target.h"
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <memory>
@ -57,7 +59,7 @@ template <class Benchmark>
void RunBenchmark(const char* caption) {
printf("%10s: ", caption);
const size_t kNumInputs = 1;
const size_t num_items = Benchmark::NumItems() * Unpredictable1();
const size_t num_items = Benchmark::NumItems() * size_t(Unpredictable1());
const FuncInput inputs[kNumInputs] = {num_items};
Result results[kNumInputs];
@ -77,16 +79,17 @@ void RunBenchmark(const char* caption) {
benchmark.Verify(num_items);
for (size_t i = 0; i < num_results; ++i) {
const double cycles_per_item = results[i].ticks / results[i].input;
const double cycles_per_item = results[i].ticks / double(results[i].input);
const double mad = results[i].variability * cycles_per_item;
printf("%6zu: %6.3f (+/- %5.3f)\n", results[i].input, cycles_per_item, mad);
printf("%6" PRIu64 ": %6.3f (+/- %5.3f)\n",
static_cast<uint64_t>(results[i].input), cycles_per_item, mad);
}
}
void Intro() {
HWY_ALIGN const float in[16] = {1, 2, 3, 4, 5, 6};
HWY_ALIGN float out[16];
HWY_FULL(float) d; // largest possible vector
const ScalableTag<float> d; // largest possible vector
for (size_t i = 0; i < 16; i += Lanes(d)) {
const auto vec = Load(d, in + i); // aligned!
auto result = vec * vec;
@ -103,22 +106,22 @@ class BenchmarkDot : public TwoArray {
BenchmarkDot() : dot_{-1.0f} {}
FuncOutput operator()(const size_t num_items) {
HWY_FULL(float) d;
const ScalableTag<float> d;
const size_t N = Lanes(d);
using V = decltype(Zero(d));
constexpr int unroll = 8;
constexpr size_t unroll = 8;
// Compiler doesn't make independent sum* accumulators, so unroll manually.
// Some older compilers might not be able to fit the 8 arrays in registers,
// so manual unrolling can be helpful if you run into this issue.
// 2 FMA ports * 4 cycle latency = 8x unrolled.
V sum[unroll];
for (int i = 0; i < unroll; ++i) {
for (size_t i = 0; i < unroll; ++i) {
sum[i] = Zero(d);
}
const float* const HWY_RESTRICT pa = &a_[0];
const float* const HWY_RESTRICT pb = b_;
for (size_t i = 0; i < num_items; i += unroll * N) {
for (int j = 0; j < unroll; ++j) {
for (size_t j = 0; j < unroll; ++j) {
const auto a = Load(d, pa + i + j * N);
const auto b = Load(d, pb + i + j * N);
sum[j] = MulAdd(a, b, sum[j]);
@ -126,12 +129,12 @@ class BenchmarkDot : public TwoArray {
}
// Reduction tree: sum of all accumulators by pairs into sum[0], then the
// lanes.
for (int power = 1; power < unroll; power *= 2) {
for (int i = 0; i < unroll; i += 2 * power) {
for (size_t power = 1; power < unroll; power *= 2) {
for (size_t i = 0; i < unroll; i += 2 * power) {
sum[i] += sum[i + power];
}
}
dot_ = GetLane(SumOfLanes(sum[0]));
dot_ = GetLane(SumOfLanes(d, sum[0]));
return static_cast<FuncOutput>(dot_);
}
void Verify(size_t num_items) {
@ -166,7 +169,7 @@ struct BenchmarkDelta : public TwoArray {
#elif HWY_CAP_GE256
// Larger vectors are split into 128-bit blocks, easiest to use the
// unaligned load support to shift between them.
const HWY_FULL(float) df;
const ScalableTag<float> df;
const size_t N = Lanes(df);
size_t i;
b_[0] = a_[0];
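The 8-way unrolling in BenchmarkDot earlier in this file follows the usual latency-throughput rule: with an FMA latency of roughly 4 cycles and 2 FMA ports, about 4 * 2 = 8 independent accumulators are needed to keep both ports busy. A plain scalar sketch of the same structure, including the pairwise reduction (illustrative only, no Highway types):
#include <cstddef>

// Dot product with 8 independent accumulators so consecutive fused
// multiply-adds do not wait on each other's results.
double Dot(const float* a, const float* b, size_t n) {
  constexpr size_t kUnroll = 8;  // ~ FMA latency (4) * FMA ports (2)
  double sum[kUnroll] = {0};
  size_t i = 0;
  for (; i + kUnroll <= n; i += kUnroll) {
    for (size_t j = 0; j < kUnroll; ++j) {
      sum[j] += static_cast<double>(a[i + j]) * b[i + j];
    }
  }
  for (; i < n; ++i) sum[0] += static_cast<double>(a[i]) * b[i];
  // Pairwise reduction tree, mirroring the vector version above.
  for (size_t power = 1; power < kUnroll; power *= 2) {
    for (size_t k = 0; k < kUnroll; k += 2 * power) sum[k] += sum[k + power];
  }
  return sum[0];
}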

third_party/highway/hwy/examples/skeleton.cc (vendored, 23 changed lines)

@ -14,7 +14,6 @@
#include "hwy/examples/skeleton.h"
#include <assert.h>
#include <stdio.h>
// First undef to prevent error when re-included.
@ -35,21 +34,29 @@ namespace HWY_NAMESPACE {
// Highway ops reside here; ADL does not find templates nor builtins.
using namespace hwy::HWY_NAMESPACE;
// For reasons unknown, optimized msan builds encounter long build times here;
// work around it until a cause is found.
#if HWY_COMPILER_CLANG && defined(MEMORY_SANITIZER) && defined(__OPTIMIZE__)
#define ATTR_MSAN __attribute__((optnone))
#else
#define ATTR_MSAN
#endif
// Computes log2 by converting to a vector of floats. Compiled once per target.
template <class DF>
HWY_NOINLINE void OneFloorLog2(const DF df, const uint8_t* HWY_RESTRICT values,
uint8_t* HWY_RESTRICT log2) {
ATTR_MSAN void OneFloorLog2(const DF df, const uint8_t* HWY_RESTRICT values,
uint8_t* HWY_RESTRICT log2) {
// Type tags for converting to other element types (Rebind = same count).
const Rebind<int32_t, DF> d32;
const Rebind<uint8_t, DF> d8;
const auto u8 = Load(d8, values);
const auto bits = BitCast(d32, ConvertTo(df, PromoteTo(d32, u8)));
const auto exponent = ShiftRight<23>(bits) - Set(d32, 127);
const auto exponent = Sub(ShiftRight<23>(bits), Set(d32, 127));
Store(DemoteTo(d8, exponent), d8, log2);
}
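OneFloorLog2 works because, for a normal positive float, the IEEE-754 biased exponent stored in bits 23..30 equals floor(log2(x)) + 127, so shifting right by 23 and subtracting the bias yields floor(log2(x)). A scalar sketch of the same extraction (assumes 32-bit IEEE-754 float; not part of the library):
#include <cstdint>
#include <cstdio>
#include <cstring>

int FloorLog2ViaFloat(uint8_t value) {  // value >= 1
  const float f = static_cast<float>(value);
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));        // bit-cast, like BitCast above
  return static_cast<int>(bits >> 23) - 127;   // remove the exponent bias
}

int main() {
  std::printf("%d %d %d\n", FloorLog2ViaFloat(1), FloorLog2ViaFloat(2),
              FloorLog2ViaFloat(200));  // prints: 0 1 7
  return 0;
}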
HWY_NOINLINE void CodepathDemo() {
void CodepathDemo() {
// Highway defaults to portability, but per-target codepaths may be selected
// via #if HWY_TARGET == HWY_SSE4 or by testing capability macros:
#if HWY_CAP_INTEGER64
@ -60,12 +67,12 @@ HWY_NOINLINE void CodepathDemo() {
printf("Target %s: %s\n", hwy::TargetName(HWY_TARGET), gather);
}
HWY_NOINLINE void FloorLog2(const uint8_t* HWY_RESTRICT values, size_t count,
uint8_t* HWY_RESTRICT log2) {
void FloorLog2(const uint8_t* HWY_RESTRICT values, size_t count,
uint8_t* HWY_RESTRICT log2) {
CodepathDemo();
// Second argument is necessary on RVV until it supports fractional lengths.
HWY_FULL(float, 4) df;
const ScalableTag<float, 2> df;
const size_t N = Lanes(df);
size_t i = 0;

@ -99,9 +99,17 @@ HWY_NOINLINE void TestAllSumMulAdd() {
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace skeleton {
HWY_BEFORE_TEST(SkeletonTest);
HWY_EXPORT_AND_TEST_P(SkeletonTest, TestAllFloorLog2);
HWY_EXPORT_AND_TEST_P(SkeletonTest, TestAllSumMulAdd);
} // namespace skeleton
// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char **argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
#endif

third_party/highway/hwy/foreach_target.h (vendored, 35 changed lines)

@ -19,7 +19,7 @@
// targets except HWY_STATIC_TARGET. Defines unique HWY_TARGET each time so that
// highway.h defines the corresponding macro/namespace.
#include "hwy/targets.h"
#include "hwy/detect_targets.h"
// *_inl.h may include other headers, which requires include guards to prevent
// repeated inclusion. The guards must be reset after compiling each target, so
@ -74,6 +74,17 @@
#endif
#endif
#if (HWY_TARGETS & HWY_SSSE3) && (HWY_STATIC_TARGET != HWY_SSSE3)
#undef HWY_TARGET
#define HWY_TARGET HWY_SSSE3
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
#if (HWY_TARGETS & HWY_SSE4) && (HWY_STATIC_TARGET != HWY_SSE4)
#undef HWY_TARGET
#define HWY_TARGET HWY_SSE4
@ -107,6 +118,28 @@
#endif
#endif
#if (HWY_TARGETS & HWY_AVX3_DL) && (HWY_STATIC_TARGET != HWY_AVX3_DL)
#undef HWY_TARGET
#define HWY_TARGET HWY_AVX3_DL
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
#if (HWY_TARGETS & HWY_WASM2) && (HWY_STATIC_TARGET != HWY_WASM2)
#undef HWY_TARGET
#define HWY_TARGET HWY_WASM2
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
#if (HWY_TARGETS & HWY_WASM) && (HWY_STATIC_TARGET != HWY_WASM)
#undef HWY_TARGET
#define HWY_TARGET HWY_WASM

third_party/highway/hwy/highway.h (vendored, 104 changed lines)

@ -27,16 +27,13 @@ namespace hwy {
// API version (https://semver.org/); keep in sync with CMakeLists.txt.
#define HWY_MAJOR 0
#define HWY_MINOR 12
#define HWY_PATCH 2
#define HWY_MINOR 15
#define HWY_PATCH 0
//------------------------------------------------------------------------------
// Shorthand for descriptors (defined in shared-inl.h) used to select overloads.
// Because Highway functions take descriptor and/or vector arguments, ADL finds
// these functions without requiring users in project::HWY_NAMESPACE to
// qualify Highway functions with hwy::HWY_NAMESPACE. However, ADL rules for
// templates require `using hwy::HWY_NAMESPACE::ShiftLeft;` etc. declarations.
// Shorthand for tags (defined in shared-inl.h) used to select overloads.
// Note that ScalableTag<T> is preferred over HWY_FULL, and CappedTag<T, N> over
// HWY_CAPPED(T, N).
// HWY_FULL(T[,LMUL=1]) is a native vector/group. LMUL is the number of
// registers in the group, and is ignored on targets that do not support groups.
@ -71,6 +68,8 @@ namespace hwy {
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SCALAR::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_RVV
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_RVV::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_WASM2
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM2::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_WASM
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_NEON
@ -81,12 +80,16 @@ namespace hwy {
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_PPC8
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC8::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SSSE3
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSSE3::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SSE4
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSE4::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_AVX2
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX2::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_AVX3
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_AVX3_DL
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3_DL::FUNC_NAME
#endif
// Dynamic dispatch declarations.
@ -129,6 +132,12 @@ FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
#define HWY_CHOOSE_SCALAR(FUNC_NAME) &HWY_STATIC_DISPATCH(FUNC_NAME)
#endif
#if HWY_TARGETS & HWY_WASM2
#define HWY_CHOOSE_WASM2(FUNC_NAME) &N_WASM2::FUNC_NAME
#else
#define HWY_CHOOSE_WASM2(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_WASM
#define HWY_CHOOSE_WASM(FUNC_NAME) &N_WASM::FUNC_NAME
#else
@ -165,6 +174,12 @@ FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
#define HWY_CHOOSE_PPC8(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_SSSE3
#define HWY_CHOOSE_SSSE3(FUNC_NAME) &N_SSSE3::FUNC_NAME
#else
#define HWY_CHOOSE_SSSE3(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_SSE4
#define HWY_CHOOSE_SSE4(FUNC_NAME) &N_SSE4::FUNC_NAME
#else
@ -183,6 +198,12 @@ FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
#define HWY_CHOOSE_AVX3(FUNC_NAME) nullptr
#endif
#if HWY_TARGETS & HWY_AVX3_DL
#define HWY_CHOOSE_AVX3_DL(FUNC_NAME) &N_AVX3_DL::FUNC_NAME
#else
#define HWY_CHOOSE_AVX3_DL(FUNC_NAME) nullptr
#endif
#define HWY_DISPATCH_TABLE(FUNC_NAME) \
HWY_CONCAT(FUNC_NAME, HighwayDispatchTable)
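The HWY_CHOOSE_* macros above fill one slot per target in a per-function table of pointers (nullptr for targets that were not compiled). In application code the table is typically used roughly as follows; MyFunc and its signature are only illustrative, and the sketch assumes the documented HWY_EXPORT / HWY_DYNAMIC_DISPATCH pattern:
// The per-target definitions of MyFunc are generated by including this
// translation unit once per target via foreach_target.h; afterwards:
#if HWY_ONCE
namespace project {
HWY_EXPORT(MyFunc);  // instantiates the dispatch table for MyFunc
void CallMyFunc(const float* in, size_t count) {
  // Looks up the pointer for the best supported target and calls it.
  HWY_DYNAMIC_DISPATCH(MyFunc)(in, count);
}
}  // namespace project
#endif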
@ -270,11 +291,11 @@ FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
#endif
// These define ops inside namespace hwy::HWY_NAMESPACE.
#if HWY_TARGET == HWY_SSE4
#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
#include "hwy/ops/x86_128-inl.h"
#elif HWY_TARGET == HWY_AVX2
#include "hwy/ops/x86_256-inl.h"
#elif HWY_TARGET == HWY_AVX3
#elif HWY_TARGET == HWY_AVX3 || HWY_TARGET == HWY_AVX3_DL
#include "hwy/ops/x86_512-inl.h"
#elif HWY_TARGET == HWY_PPC8
#error "PPC is not yet supported"
@ -282,6 +303,8 @@ FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
#include "hwy/ops/arm_neon-inl.h"
#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2
#include "hwy/ops/arm_sve-inl.h"
#elif HWY_TARGET == HWY_WASM2
#include "hwy/ops/wasm_256-inl.h"
#elif HWY_TARGET == HWY_WASM
#include "hwy/ops/wasm_128-inl.h"
#elif HWY_TARGET == HWY_RVV
@ -292,65 +315,6 @@ FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
#pragma message("HWY_TARGET does not match any known target")
#endif // HWY_TARGET
// Commonly used functions/types that must come after ops are defined.
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// The lane type of a vector type, e.g. float for Vec<Simd<float, 4>>.
template <class V>
using LaneType = decltype(GetLane(V()));
// Vector type, e.g. Vec128<float> for Simd<float, 4>. Useful as the return type
// of functions that do not take a vector argument, or as an argument type if
// the function only has a template argument for D, or for explicit type names
// instead of auto. This may be a built-in type.
template <class D>
using Vec = decltype(Zero(D()));
// Mask type. Useful as the return type of functions that do not take a mask
// argument, or as an argument type if the function only has a template argument
// for D, or for explicit type names instead of auto.
template <class D>
using Mask = decltype(MaskFromVec(Zero(D())));
// Returns the closest value to v within [lo, hi].
template <class V>
HWY_API V Clamp(const V v, const V lo, const V hi) {
return Min(Max(lo, v), hi);
}
// CombineShiftRightBytes (and ..Lanes) are not available for the scalar target.
// TODO(janwas): implement for RVV
#if HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_RVV
template <size_t kLanes, class V>
HWY_API V CombineShiftRightLanes(const V hi, const V lo) {
return CombineShiftRightBytes<kLanes * sizeof(LaneType<V>)>(hi, lo);
}
#endif
// Returns lanes with the most significant bit set and all other bits zero.
template <class D>
HWY_API Vec<D> SignBit(D d) {
using Unsigned = MakeUnsigned<TFromD<D>>;
const Unsigned bit = Unsigned(1) << (sizeof(Unsigned) * 8 - 1);
return BitCast(d, Set(Rebind<Unsigned, D>(), bit));
}
// Returns quiet NaN.
template <class D>
HWY_API Vec<D> NaN(D d) {
const RebindToSigned<D> di;
// LimitsMax sets all exponent and mantissa bits to 1. The exponent plus
// mantissa MSB (to indicate quiet) would be sufficient.
return BitCast(d, Set(di, LimitsMax<TFromD<decltype(di)>>()));
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#include "hwy/ops/generic_ops-inl.h"
#endif // HWY_HIGHWAY_PER_TARGET

third_party/highway/hwy/highway_test.cc (vendored, 82 changed lines)

@ -66,9 +66,9 @@ struct TestOverflow {
const auto vmax = Set(d, LimitsMax<T>());
const auto vmin = Set(d, LimitsMin<T>());
// Unsigned underflow / negative -> positive
HWY_ASSERT_VEC_EQ(d, vmax, vmin - v1);
HWY_ASSERT_VEC_EQ(d, vmax, Sub(vmin, v1));
// Unsigned overflow / positive -> negative
HWY_ASSERT_VEC_EQ(d, vmin, vmax + v1);
HWY_ASSERT_VEC_EQ(d, vmin, Add(vmax, v1));
}
};
@ -76,6 +76,22 @@ HWY_NOINLINE void TestAllOverflow() {
ForIntegerTypes(ForPartialVectors<TestOverflow>());
}
struct TestClamp {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const auto v0 = Zero(d);
const auto v1 = Set(d, 1);
const auto v2 = Set(d, 2);
HWY_ASSERT_VEC_EQ(d, v1, Clamp(v2, v0, v1));
HWY_ASSERT_VEC_EQ(d, v1, Clamp(v0, v1, v2));
}
};
HWY_NOINLINE void TestAllClamp() {
ForAllTypes(ForPartialVectors<TestClamp>());
}
struct TestSignBitInteger {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
@ -122,16 +138,20 @@ bool IsNaN(TF f) {
}
template <class D, class V>
HWY_NOINLINE void AssertNaN(const D d, const V v, const char* file, int line) {
HWY_NOINLINE void AssertNaN(D d, VecArg<V> v, const char* file, int line) {
using T = TFromD<D>;
const T lane = GetLane(v);
if (!IsNaN(lane)) {
const std::string type_name = TypeName(T(), Lanes(d));
MakeUnsigned<T> bits;
memcpy(&bits, &lane, sizeof(T));
// RVV lacks PRIu64, so use size_t; double will be truncated on 32-bit.
Abort(file, line, "Expected %s NaN, got %E (%zu)", type_name.c_str(), lane,
size_t(bits));
// RVV lacks PRIu64 and MSYS still has problems with %zu, so print bytes to
// avoid truncating doubles.
uint8_t bytes[HWY_MAX(sizeof(T), 8)] = {0};
memcpy(bytes, &lane, sizeof(T));
Abort(file, line,
"Expected %s NaN, got %E (bytes %02x %02x %02x %02x %02x %02x %02x "
"%02x)",
type_name.c_str(), lane, bytes[0], bytes[1], bytes[2], bytes[3],
bytes[4], bytes[5], bytes[6], bytes[7]);
}
}
@ -187,18 +207,18 @@ struct TestNaN {
HWY_ASSERT_NAN(d, Or(nan, v1));
// Comparison
HWY_ASSERT(AllFalse(Eq(nan, v1)));
HWY_ASSERT(AllFalse(Gt(nan, v1)));
HWY_ASSERT(AllFalse(Lt(nan, v1)));
HWY_ASSERT(AllFalse(Ge(nan, v1)));
HWY_ASSERT(AllFalse(Le(nan, v1)));
HWY_ASSERT(AllFalse(d, Eq(nan, v1)));
HWY_ASSERT(AllFalse(d, Gt(nan, v1)));
HWY_ASSERT(AllFalse(d, Lt(nan, v1)));
HWY_ASSERT(AllFalse(d, Ge(nan, v1)));
HWY_ASSERT(AllFalse(d, Le(nan, v1)));
// Reduction
HWY_ASSERT_NAN(d, SumOfLanes(nan));
HWY_ASSERT_NAN(d, SumOfLanes(d, nan));
// TODO(janwas): re-enable after QEMU is fixed
#if HWY_TARGET != HWY_RVV
HWY_ASSERT_NAN(d, MinOfLanes(nan));
HWY_ASSERT_NAN(d, MaxOfLanes(nan));
HWY_ASSERT_NAN(d, MinOfLanes(d, nan));
HWY_ASSERT_NAN(d, MaxOfLanes(d, nan));
#endif
// Min
@ -227,13 +247,6 @@ struct TestNaN {
#endif
HWY_ASSERT_NAN(d, Min(nan, nan));
HWY_ASSERT_NAN(d, Max(nan, nan));
// Comparison
HWY_ASSERT(AllFalse(Eq(nan, v1)));
HWY_ASSERT(AllFalse(Gt(nan, v1)));
HWY_ASSERT(AllFalse(Lt(nan, v1)));
HWY_ASSERT(AllFalse(Ge(nan, v1)));
HWY_ASSERT(AllFalse(Le(nan, v1)));
}
};
@ -286,6 +299,19 @@ HWY_NOINLINE void TestAllGetLane() {
ForAllTypes(ForPartialVectors<TestGetLane>());
}
struct TestDFromV {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const auto v0 = Zero(d);
using D0 = DFromV<decltype(v0)>; // not necessarily same as D
const auto v0b = And(v0, Set(D0(), 1)); // but vectors can interoperate
HWY_ASSERT_VEC_EQ(d, v0, v0b);
}
};
HWY_NOINLINE void TestAllDFromV() {
ForAllTypes(ForPartialVectors<TestDFromV>());
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
@ -293,13 +319,23 @@ HWY_NOINLINE void TestAllGetLane() {
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
HWY_BEFORE_TEST(HighwayTest);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllSet);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllOverflow);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllClamp);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllSignBit);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllNaN);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllCopyAndAssign);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllGetLane);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllDFromV);
} // namespace hwy
// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
#endif

third_party/highway/hwy/nanobenchmark.cc (vendored, 81 changed lines)

@ -14,6 +14,7 @@
#include "hwy/nanobenchmark.h"
#include <inttypes.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h> // abort
@ -46,7 +47,7 @@
#endif
#include "hwy/base.h"
#if HWY_ARCH_PPC
#if HWY_ARCH_PPC && defined(__GLIBC__)
#include <sys/platform/ppc.h> // NOLINT __ppc_get_timebase_freq
#elif HWY_ARCH_X86
@ -119,7 +120,7 @@ using Ticks = uint64_t;
// divide by InvariantTicksPerSecond.
inline Ticks Start() {
Ticks t;
#if HWY_ARCH_PPC
#if HWY_ARCH_PPC && defined(__GLIBC__)
asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
_ReadWriteBarrier();
@ -154,14 +155,14 @@ inline Ticks Start() {
#else // POSIX
timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
t = ts.tv_sec * 1000000000LL + ts.tv_nsec;
t = static_cast<Ticks>(ts.tv_sec * 1000000000LL + ts.tv_nsec);
#endif
return t;
}
inline Ticks Stop() {
uint64_t t;
#if HWY_ARCH_PPC
#if HWY_ARCH_PPC && defined(__GLIBC__)
asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
_ReadWriteBarrier();
@ -353,6 +354,12 @@ void Cpuid(const uint32_t level, const uint32_t count,
#endif
}
bool HasRDTSCP() {
uint32_t abcd[4];
Cpuid(0x80000001U, 0, abcd); // Extended feature flags
return (abcd[3] & (1u << 27)) != 0; // RDTSCP
}
std::string BrandString() {
char brand_string[49];
std::array<uint32_t, 4> abcd;
@ -399,8 +406,8 @@ double NominalClockRate() {
} // namespace
double InvariantTicksPerSecond() {
#if HWY_ARCH_PPC
return __ppc_get_timebase_freq();
#if HWY_ARCH_PPC && defined(__GLIBC__)
return double(__ppc_get_timebase_freq());
#elif HWY_ARCH_X86
// We assume the TSC is invariant; it is on all recent Intel/AMD CPUs.
return NominalClockRate();
@ -457,9 +464,10 @@ timer::Ticks SampleUntilStable(const double max_rel_mad, double* rel_mad,
static const double ticks_per_second = platform::InvariantTicksPerSecond();
const size_t ticks_per_eval =
static_cast<size_t>(ticks_per_second * p.seconds_per_eval);
size_t samples_per_eval =
est == 0 ? p.min_samples_per_eval : ticks_per_eval / est;
samples_per_eval = std::max(samples_per_eval, p.min_samples_per_eval);
size_t samples_per_eval = est == 0
? p.min_samples_per_eval
: static_cast<size_t>(ticks_per_eval / est);
samples_per_eval = HWY_MAX(samples_per_eval, p.min_samples_per_eval);
std::vector<timer::Ticks> samples;
samples.reserve(1 + samples_per_eval);
@ -494,17 +502,21 @@ timer::Ticks SampleUntilStable(const double max_rel_mad, double* rel_mad,
if (*rel_mad <= max_rel_mad || abs_mad <= max_abs_mad) {
if (p.verbose) {
printf("%6zu samples => %5zu (abs_mad=%4zu, rel_mad=%4.2f%%)\n",
samples.size(), size_t(est), size_t(abs_mad), *rel_mad * 100.0);
printf("%6" PRIu64 " samples => %5" PRIu64 " (abs_mad=%4" PRIu64
", rel_mad=%4.2f%%)\n",
static_cast<uint64_t>(samples.size()),
static_cast<uint64_t>(est), static_cast<uint64_t>(abs_mad),
*rel_mad * 100.0);
}
return est;
}
}
if (p.verbose) {
printf(
"WARNING: rel_mad=%4.2f%% still exceeds %4.2f%% after %6zu samples.\n",
*rel_mad * 100.0, max_rel_mad * 100.0, samples.size());
printf("WARNING: rel_mad=%4.2f%% still exceeds %4.2f%% after %6" PRIu64
" samples.\n",
*rel_mad * 100.0, max_rel_mad * 100.0,
static_cast<uint64_t>(samples.size()));
}
return est;
}
@ -530,17 +542,22 @@ size_t NumSkip(const Func func, const uint8_t* arg, const InputVec& unique,
const timer::Ticks total = SampleUntilStable(
p.target_rel_mad, &rel_mad, p,
[func, arg, input]() { platform::PreventElision(func(arg, input)); });
min_duration = std::min(min_duration, total - timer_resolution);
min_duration = HWY_MIN(min_duration, total - timer_resolution);
}
// Number of repetitions required to reach the target resolution.
const size_t max_skip = p.precision_divisor;
// Number of repetitions given the estimated duration.
const size_t num_skip =
min_duration == 0 ? 0 : (max_skip + min_duration - 1) / min_duration;
min_duration == 0
? 0
: static_cast<size_t>((max_skip + min_duration - 1) / min_duration);
if (p.verbose) {
printf("res=%zu max_skip=%zu min_dur=%zu num_skip=%zu\n",
size_t(timer_resolution), max_skip, size_t(min_duration), num_skip);
printf("res=%" PRIu64 " max_skip=%" PRIu64 " min_dur=%" PRIu64
" num_skip=%" PRIu64 "\n",
static_cast<uint64_t>(timer_resolution),
static_cast<uint64_t>(max_skip), static_cast<uint64_t>(min_duration),
static_cast<uint64_t>(num_skip));
}
return num_skip;
}
@ -615,7 +632,7 @@ timer::Ticks TotalDuration(const Func func, const uint8_t* arg,
platform::PreventElision(func(arg, input));
}
});
*max_rel_mad = std::max(*max_rel_mad, rel_mad);
*max_rel_mad = HWY_MAX(*max_rel_mad, rel_mad);
return duration;
}
@ -644,6 +661,15 @@ int Unpredictable1() { return timer::Start() != ~0ULL; }
size_t Measure(const Func func, const uint8_t* arg, const FuncInput* inputs,
const size_t num_inputs, Result* results, const Params& p) {
NANOBENCHMARK_CHECK(num_inputs != 0);
#if HWY_ARCH_X86
if (!platform::HasRDTSCP()) {
fprintf(stderr, "CPU '%s' does not support RDTSCP, skipping benchmark.\n",
platform::BrandString().c_str());
return 0;
}
#endif
const InputVec& unique = UniqueInputs(inputs, num_inputs);
const size_t num_skip = NumSkip(func, arg, unique, p); // never 0
@ -658,14 +684,19 @@ size_t Measure(const Func func, const uint8_t* arg, const FuncInput* inputs,
const timer::Ticks overhead = Overhead(arg, &full, p);
const timer::Ticks overhead_skip = Overhead(arg, &subset, p);
if (overhead < overhead_skip) {
fprintf(stderr, "Measurement failed: overhead %zu < %zu\n",
size_t(overhead), size_t(overhead_skip));
fprintf(stderr, "Measurement failed: overhead %" PRIu64 " < %" PRIu64 "\n",
static_cast<uint64_t>(overhead),
static_cast<uint64_t>(overhead_skip));
return 0;
}
if (p.verbose) {
printf("#inputs=%5zu,%5zu overhead=%5zu,%5zu\n", full.size(), subset.size(),
size_t(overhead), size_t(overhead_skip));
printf("#inputs=%5" PRIu64 ",%5" PRIu64 " overhead=%5" PRIu64 ",%5" PRIu64
"\n",
static_cast<uint64_t>(full.size()),
static_cast<uint64_t>(subset.size()),
static_cast<uint64_t>(overhead),
static_cast<uint64_t>(overhead_skip));
}
double max_rel_mad = 0.0;
@ -677,8 +708,8 @@ size_t Measure(const Func func, const uint8_t* arg, const FuncInput* inputs,
TotalDuration(func, arg, &subset, p, &max_rel_mad);
if (total < total_skip) {
fprintf(stderr, "Measurement failed: total %zu < %zu\n", size_t(total),
size_t(total_skip));
fprintf(stderr, "Measurement failed: total %" PRIu64 " < %" PRIu64 "\n",
static_cast<uint64_t>(total), static_cast<uint64_t>(total_skip));
return 0;
}

third_party/highway/hwy/nanobenchmark_test.cc (vendored, 24 changed lines)

@ -14,6 +14,8 @@
#include "hwy/nanobenchmark.h"
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <random>
@ -23,6 +25,13 @@
namespace hwy {
namespace {
// Governs duration of test; avoid timeout in debug builds.
#if HWY_IS_DEBUG_BUILD
constexpr size_t kMaxEvals = 3;
#else
constexpr size_t kMaxEvals = 4;
#endif
FuncOutput Div(const void*, FuncInput in) {
// Here we're measuring the throughput because benchmark invocations are
// independent. Any dividend will do; the divisor is nonzero.
@ -34,11 +43,12 @@ void MeasureDiv(const FuncInput (&inputs)[N]) {
printf("Measuring integer division (output on final two lines)\n");
Result results[N];
Params params;
params.max_evals = 4; // avoid test timeout
params.max_evals = kMaxEvals;
const size_t num_results = Measure(&Div, nullptr, inputs, N, results, params);
for (size_t i = 0; i < num_results; ++i) {
printf("%5zu: %6.2f ticks; MAD=%4.2f%%\n", results[i].input,
results[i].ticks, results[i].variability * 100.0);
printf("%5" PRIu64 ": %6.2f ticks; MAD=%4.2f%%\n",
static_cast<uint64_t>(results[i].input), results[i].ticks,
results[i].variability * 100.0);
}
}
@ -59,7 +69,7 @@ template <size_t N>
void MeasureRandom(const FuncInput (&inputs)[N]) {
Result results[N];
Params p;
p.max_evals = 4; // avoid test timeout
p.max_evals = kMaxEvals;
p.verbose = false;
const size_t num_results = Measure(&Random, nullptr, inputs, N, results, p);
for (size_t i = 0; i < num_results; ++i) {
@ -78,3 +88,9 @@ TEST(NanobenchmarkTest, RunAll) {
} // namespace
} // namespace hwy
// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}

third_party/highway/hwy/ops/arm_neon-inl.h (vendored, 3076 changed lines; diff not shown because of its size)

third_party/highway/hwy/ops/arm_sve-inl.h (vendored, 2762 changed lines; diff not shown because of its size)

third_party/highway/hwy/ops/generic_ops-inl.h (vendored, new file, 324 lines)

@ -0,0 +1,324 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Target-independent types/functions defined after target-specific ops.
// Relies on the external include guard in highway.h.
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// The lane type of a vector type, e.g. float for Vec<Simd<float, 4>>.
template <class V>
using LaneType = decltype(GetLane(V()));
// Vector type, e.g. Vec128<float> for Simd<float, 4>. Useful as the return type
// of functions that do not take a vector argument, or as an argument type if
// the function only has a template argument for D, or for explicit type names
// instead of auto. This may be a built-in type.
template <class D>
using Vec = decltype(Zero(D()));
// Mask type. Useful as the return type of functions that do not take a mask
// argument, or as an argument type if the function only has a template argument
// for D, or for explicit type names instead of auto.
template <class D>
using Mask = decltype(MaskFromVec(Zero(D())));
// Returns the closest value to v within [lo, hi].
template <class V>
HWY_API V Clamp(const V v, const V lo, const V hi) {
return Min(Max(lo, v), hi);
}
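A minimal usage sketch for Clamp, assuming a floating-point tag (the values are arbitrary):
template <class D>
void ClampExample(D d) {
  // With lo = 0 and hi = 1, a value of 3.5 in every lane clamps to 1.0.
  const auto clamped = Clamp(Set(d, 3.5f), Set(d, 0.0f), Set(d, 1.0f));
  (void)clamped;
}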
// CombineShiftRightBytes (and -Lanes) are not available for the scalar target,
// and RVV has its own implementation of -Lanes.
#if HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_RVV
template <size_t kLanes, class D, class V = VFromD<D>>
HWY_API V CombineShiftRightLanes(D d, const V hi, const V lo) {
constexpr size_t kBytes = kLanes * sizeof(LaneType<V>);
static_assert(kBytes < 16, "Shift count is per-block");
return CombineShiftRightBytes<kBytes>(d, hi, lo);
}
// DEPRECATED
template <size_t kLanes, class V>
HWY_API V CombineShiftRightLanes(const V hi, const V lo) {
return CombineShiftRightLanes<kLanes>(DFromV<V>(), hi, lo);
}
#endif
// Returns lanes with the most significant bit set and all other bits zero.
template <class D>
HWY_API Vec<D> SignBit(D d) {
using Unsigned = MakeUnsigned<TFromD<D>>;
const Unsigned bit = Unsigned(1) << (sizeof(Unsigned) * 8 - 1);
return BitCast(d, Set(Rebind<Unsigned, D>(), bit));
}
// Returns quiet NaN.
template <class D>
HWY_API Vec<D> NaN(D d) {
const RebindToSigned<D> di;
// LimitsMax sets all exponent and mantissa bits to 1. The exponent plus
// mantissa MSB (to indicate quiet) would be sufficient.
return BitCast(d, Set(di, LimitsMax<TFromD<decltype(di)>>()));
}
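To make the two bit patterns above concrete for float32: SignBit yields 0x80000000 (only the most significant bit), and NaN uses the bits of LimitsMax<int32_t> = 0x7FFFFFFF, whose exponent field is all ones and whose mantissa (including the quiet bit) is nonzero, hence a quiet NaN. A scalar check in standard C++, independent of Highway:
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  const uint32_t sign_bit = uint32_t{1} << 31;  // 0x80000000
  const uint32_t nan_bits = 0x7FFFFFFFu;        // LimitsMax<int32_t>
  float f;
  std::memcpy(&f, &nan_bits, sizeof(f));
  std::printf("sign mask %08x, is NaN: %d\n", static_cast<unsigned>(sign_bit),
              std::isnan(f) ? 1 : 0);  // prints: sign mask 80000000, is NaN: 1
  return 0;
}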
// ------------------------------ AESRound
// Cannot implement on scalar: need at least 16 bytes for TableLookupBytes.
#if HWY_TARGET != HWY_SCALAR
// Define for white-box testing, even if native instructions are available.
namespace detail {
// Constant-time: computes inverse in GF(2^4) based on "Accelerating AES with
// Vector Permute Instructions" and the accompanying assembly language
// implementation: https://crypto.stanford.edu/vpaes/vpaes.tgz. See also Botan:
// https://botan.randombit.net/doxygen/aes__vperm_8cpp_source.html .
//
// A brute-force 256 byte table lookup can also be made constant-time, and
// possibly competitive on NEON, but this is more performance-portable
// especially for x86 and large vectors.
template <class V> // u8
HWY_INLINE V SubBytes(V state) {
const DFromV<V> du;
const auto mask = Set(du, 0xF);
// Change polynomial basis to GF(2^4)
{
alignas(16) static constexpr uint8_t basisL[16] = {
0x00, 0x70, 0x2A, 0x5A, 0x98, 0xE8, 0xB2, 0xC2,
0x08, 0x78, 0x22, 0x52, 0x90, 0xE0, 0xBA, 0xCA};
alignas(16) static constexpr uint8_t basisU[16] = {
0x00, 0x4D, 0x7C, 0x31, 0x7D, 0x30, 0x01, 0x4C,
0x81, 0xCC, 0xFD, 0xB0, 0xFC, 0xB1, 0x80, 0xCD};
const auto sL = And(state, mask);
const auto sU = ShiftRight<4>(state); // byte shift => upper bits are zero
const auto gf4L = TableLookupBytes(LoadDup128(du, basisL), sL);
const auto gf4U = TableLookupBytes(LoadDup128(du, basisU), sU);
state = Xor(gf4L, gf4U);
}
// Inversion in GF(2^4). Elements 0 represent "infinity" (division by 0) and
// cause TableLookupBytesOr0 to return 0.
alignas(16) static constexpr uint8_t kZetaInv[16] = {
0x80, 7, 11, 15, 6, 10, 4, 1, 9, 8, 5, 2, 12, 14, 13, 3};
alignas(16) static constexpr uint8_t kInv[16] = {
0x80, 1, 8, 13, 15, 6, 5, 14, 2, 12, 11, 10, 9, 3, 7, 4};
const auto tbl = LoadDup128(du, kInv);
const auto sL = And(state, mask); // L=low nibble, U=upper
const auto sU = ShiftRight<4>(state); // byte shift => upper bits are zero
const auto sX = Xor(sU, sL);
const auto invL = TableLookupBytes(LoadDup128(du, kZetaInv), sL);
const auto invU = TableLookupBytes(tbl, sU);
const auto invX = TableLookupBytes(tbl, sX);
const auto outL = Xor(sX, TableLookupBytesOr0(tbl, Xor(invL, invU)));
const auto outU = Xor(sU, TableLookupBytesOr0(tbl, Xor(invL, invX)));
// Linear skew (cannot bake 0x63 bias into the table because out* indices
// may have the infinity flag set).
alignas(16) static constexpr uint8_t kAffineL[16] = {
0x00, 0xC7, 0xBD, 0x6F, 0x17, 0x6D, 0xD2, 0xD0,
0x78, 0xA8, 0x02, 0xC5, 0x7A, 0xBF, 0xAA, 0x15};
alignas(16) static constexpr uint8_t kAffineU[16] = {
0x00, 0x6A, 0xBB, 0x5F, 0xA5, 0x74, 0xE4, 0xCF,
0xFA, 0x35, 0x2B, 0x41, 0xD1, 0x90, 0x1E, 0x8E};
const auto affL = TableLookupBytesOr0(LoadDup128(du, kAffineL), outL);
const auto affU = TableLookupBytesOr0(LoadDup128(du, kAffineU), outU);
return Xor(Xor(affL, affU), Set(du, 0x63));
}
} // namespace detail
#endif // HWY_TARGET != HWY_SCALAR
// "Include guard": skip if native AES instructions are available.
#if (defined(HWY_NATIVE_AES) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_AES
#undef HWY_NATIVE_AES
#else
#define HWY_NATIVE_AES
#endif
// (Must come after HWY_TARGET_TOGGLE, else we don't reset it for scalar)
#if HWY_TARGET != HWY_SCALAR
namespace detail {
template <class V> // u8
HWY_API V ShiftRows(const V state) {
const DFromV<V> du;
alignas(16) static constexpr uint8_t kShiftRow[16] = {
0, 5, 10, 15, // transposed: state is column major
4, 9, 14, 3, //
8, 13, 2, 7, //
12, 1, 6, 11};
const auto shift_row = LoadDup128(du, kShiftRow);
return TableLookupBytes(state, shift_row);
}
template <class V> // u8
HWY_API V MixColumns(const V state) {
const DFromV<V> du;
// For each column, the rows are the sum of GF(2^8) matrix multiplication by:
// 2 3 1 1 // Let s := state*1, d := state*2, t := state*3.
// 1 2 3 1 // d are on diagonal, no permutation needed.
// 1 1 2 3 // t1230 indicates column indices of threes for the 4 rows.
// 3 1 1 2 // We also need to compute s2301 and s3012 (=1230 o 2301).
alignas(16) static constexpr uint8_t k2301[16] = {
2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13};
alignas(16) static constexpr uint8_t k1230[16] = {
1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12};
const RebindToSigned<decltype(du)> di; // can only do signed comparisons
const auto msb = Lt(BitCast(di, state), Zero(di));
const auto overflow = BitCast(du, IfThenElseZero(msb, Set(di, 0x1B)));
const auto d = Xor(Add(state, state), overflow); // = state*2 in GF(2^8).
const auto s2301 = TableLookupBytes(state, LoadDup128(du, k2301));
const auto d_s2301 = Xor(d, s2301);
const auto t_s2301 = Xor(state, d_s2301); // t(s*3) = XOR-sum {s, d(s*2)}
const auto t1230_s3012 = TableLookupBytes(t_s2301, LoadDup128(du, k1230));
return Xor(d_s2301, t1230_s3012); // XOR-sum of 4 terms
}
} // namespace detail
template <class V> // u8
HWY_API V AESRound(V state, const V round_key) {
// Intel docs swap the first two steps, but it does not matter because
// ShiftRows is a permutation and SubBytes is independent of lane index.
state = detail::SubBytes(state);
state = detail::ShiftRows(state);
state = detail::MixColumns(state);
state = Xor(state, round_key); // AddRoundKey
return state;
}
// Constant-time implementation inspired by
// https://www.bearssl.org/constanttime.html, but about half the cost because we
// use 64x64 multiplies and 128-bit XORs.
template <class V>
HWY_API V CLMulLower(V a, V b) {
const DFromV<V> d;
static_assert(IsSame<TFromD<decltype(d)>, uint64_t>(), "V must be u64");
const auto k1 = Set(d, 0x1111111111111111ULL);
const auto k2 = Set(d, 0x2222222222222222ULL);
const auto k4 = Set(d, 0x4444444444444444ULL);
const auto k8 = Set(d, 0x8888888888888888ULL);
const auto a0 = And(a, k1);
const auto a1 = And(a, k2);
const auto a2 = And(a, k4);
const auto a3 = And(a, k8);
const auto b0 = And(b, k1);
const auto b1 = And(b, k2);
const auto b2 = And(b, k4);
const auto b3 = And(b, k8);
auto m0 = Xor(MulEven(a0, b0), MulEven(a1, b3));
auto m1 = Xor(MulEven(a0, b1), MulEven(a1, b0));
auto m2 = Xor(MulEven(a0, b2), MulEven(a1, b1));
auto m3 = Xor(MulEven(a0, b3), MulEven(a1, b2));
m0 = Xor(m0, Xor(MulEven(a2, b2), MulEven(a3, b1)));
m1 = Xor(m1, Xor(MulEven(a2, b3), MulEven(a3, b2)));
m2 = Xor(m2, Xor(MulEven(a2, b0), MulEven(a3, b3)));
m3 = Xor(m3, Xor(MulEven(a2, b1), MulEven(a3, b0)));
return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8)));
}
template <class V>
HWY_API V CLMulUpper(V a, V b) {
const DFromV<V> d;
static_assert(IsSame<TFromD<decltype(d)>, uint64_t>(), "V must be u64");
const auto k1 = Set(d, 0x1111111111111111ULL);
const auto k2 = Set(d, 0x2222222222222222ULL);
const auto k4 = Set(d, 0x4444444444444444ULL);
const auto k8 = Set(d, 0x8888888888888888ULL);
const auto a0 = And(a, k1);
const auto a1 = And(a, k2);
const auto a2 = And(a, k4);
const auto a3 = And(a, k8);
const auto b0 = And(b, k1);
const auto b1 = And(b, k2);
const auto b2 = And(b, k4);
const auto b3 = And(b, k8);
auto m0 = Xor(MulOdd(a0, b0), MulOdd(a1, b3));
auto m1 = Xor(MulOdd(a0, b1), MulOdd(a1, b0));
auto m2 = Xor(MulOdd(a0, b2), MulOdd(a1, b1));
auto m3 = Xor(MulOdd(a0, b3), MulOdd(a1, b2));
m0 = Xor(m0, Xor(MulOdd(a2, b2), MulOdd(a3, b1)));
m1 = Xor(m1, Xor(MulOdd(a2, b3), MulOdd(a3, b2)));
m2 = Xor(m2, Xor(MulOdd(a2, b0), MulOdd(a3, b3)));
m3 = Xor(m3, Xor(MulOdd(a2, b1), MulOdd(a3, b0)));
return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8)));
}
#endif // HWY_NATIVE_AES
#endif // HWY_TARGET != HWY_SCALAR
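CLMulLower/CLMulUpper above compute a carry-less (GF(2) polynomial) product by splitting each operand into four bit-strided parts so that ordinary 64x64 multiplies cannot carry across strides. As a reference for what a carry-less multiply produces, here is the straightforward scalar shift-and-XOR definition of its low 64 bits (an illustration, not the library's implementation):
#include <cstdint>
#include <cstdio>

// Low 64 bits of the carry-less (XOR instead of add) product of a and b.
uint64_t ClMulLower64(uint64_t a, uint64_t b) {
  uint64_t result = 0;
  for (int i = 0; i < 64; ++i) {
    if ((b >> i) & 1) result ^= a << i;  // XOR-accumulate shifted copies of a
  }
  return result;
}

int main() {
  // 0b11 "times" 0b11 is 0b101 in GF(2)[x]: (x + 1) * (x + 1) = x^2 + 1.
  std::printf("%llx\n",
              static_cast<unsigned long long>(ClMulLower64(3, 3)));  // 5
  return 0;
}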
// "Include guard": skip if native POPCNT-related instructions are available.
#if (defined(HWY_NATIVE_POPCNT) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_POPCNT
#undef HWY_NATIVE_POPCNT
#else
#define HWY_NATIVE_POPCNT
#endif
template <typename V, HWY_IF_LANES_ARE(uint8_t, V)>
HWY_API V PopulationCount(V v) {
constexpr DFromV<V> d;
HWY_ALIGN constexpr uint8_t kLookup[16] = {
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
};
auto lo = And(v, Set(d, 0xF));
auto hi = ShiftRight<4>(v);
auto lookup = LoadDup128(Simd<uint8_t, HWY_MAX(16, MaxLanes(d))>(), kLookup);
return Add(TableLookupBytes(lookup, hi), TableLookupBytes(lookup, lo));
}
template <typename V, HWY_IF_LANES_ARE(uint16_t, V)>
HWY_API V PopulationCount(V v) {
const DFromV<V> d;
Repartition<uint8_t, decltype(d)> d8;
auto vals = BitCast(d, PopulationCount(BitCast(d8, v)));
return Add(ShiftRight<8>(vals), And(vals, Set(d, 0xFF)));
}
template <typename V, HWY_IF_LANES_ARE(uint32_t, V)>
HWY_API V PopulationCount(V v) {
const DFromV<V> d;
Repartition<uint16_t, decltype(d)> d16;
auto vals = BitCast(d, PopulationCount(BitCast(d16, v)));
return Add(ShiftRight<16>(vals), And(vals, Set(d, 0xFF)));
}
#if HWY_CAP_INTEGER64
template <typename V, HWY_IF_LANES_ARE(uint64_t, V)>
HWY_API V PopulationCount(V v) {
const DFromV<V> d;
Repartition<uint32_t, decltype(d)> d32;
auto vals = BitCast(d, PopulationCount(BitCast(d32, v)));
return Add(ShiftRight<32>(vals), And(vals, Set(d, 0xFF)));
}
#endif
#endif // HWY_NATIVE_POPCNT
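The vector PopulationCount above is the classic nibble-LUT approach: look up the popcount of the low and high 4 bits of every byte, add the two, then repeatedly sum adjacent counts to widen to 16, 32 and 64 bits. A scalar restatement of the per-byte step (standard C++ only):
#include <cstdint>
#include <cstdio>

// Popcount of one byte via a 16-entry table per nibble, mirroring the
// TableLookupBytes-based vector code above.
int PopCountByte(uint8_t v) {
  static const uint8_t kLookup[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                      1, 2, 2, 3, 2, 3, 3, 4};
  return kLookup[v & 0xF] + kLookup[v >> 4];
}

int main() {
  std::printf("%d %d\n", PopCountByte(0xFF), PopCountByte(0xA5));  // 8 4
  return 0;
}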
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();

third_party/highway/hwy/ops/rvv-inl.h (vendored, 1851 changed lines; diff not shown because of its size)

third_party/highway/hwy/ops/scalar-inl.h (vendored, 604 changed lines; diff not shown because of its size)

third_party/highway/hwy/ops/set_macros-inl.h (vendored, 149 changed lines)

@ -25,35 +25,84 @@
#endif // HWY_SET_MACROS_PER_TARGET
#include "hwy/targets.h"
#include "hwy/detect_targets.h"
#undef HWY_NAMESPACE
#undef HWY_ALIGN
#undef HWY_MAX_BYTES
#undef HWY_LANES
#undef HWY_CAP_INTEGER64
#undef HWY_CAP_FLOAT16
#undef HWY_CAP_FLOAT64
#undef HWY_CAP_GE256
#undef HWY_CAP_GE512
#undef HWY_TARGET_STR
#if defined(HWY_DISABLE_PCLMUL_AES)
#define HWY_TARGET_STR_PCLMUL_AES ""
#else
#define HWY_TARGET_STR_PCLMUL_AES ",pclmul,aes"
#endif
#if defined(HWY_DISABLE_BMI2_FMA)
#define HWY_TARGET_STR_BMI2_FMA ""
#else
#define HWY_TARGET_STR_BMI2_FMA ",bmi,bmi2,fma"
#endif
#if defined(HWY_DISABLE_F16C)
#define HWY_TARGET_STR_F16C ""
#else
#define HWY_TARGET_STR_F16C ",f16c"
#endif
#define HWY_TARGET_STR_SSSE3 "sse2,ssse3"
#define HWY_TARGET_STR_SSE4 \
HWY_TARGET_STR_SSSE3 ",sse4.1,sse4.2" HWY_TARGET_STR_PCLMUL_AES
// Include previous targets, which are the half-vectors of the next target.
#define HWY_TARGET_STR_AVX2 \
HWY_TARGET_STR_SSE4 ",avx,avx2" HWY_TARGET_STR_BMI2_FMA HWY_TARGET_STR_F16C
#define HWY_TARGET_STR_AVX3 \
HWY_TARGET_STR_AVX2 ",avx512f,avx512vl,avx512dq,avx512bw"
// Before include guard so we redefine HWY_TARGET_STR on each include,
// governed by the current HWY_TARGET.
//-----------------------------------------------------------------------------
// SSE4
#if HWY_TARGET == HWY_SSE4
// SSSE3
#if HWY_TARGET == HWY_SSSE3
#define HWY_NAMESPACE N_SSE4
#define HWY_NAMESPACE N_SSSE3
#define HWY_ALIGN alignas(16)
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT16 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_AES 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
#define HWY_TARGET_STR HWY_TARGET_STR_SSSE3
//-----------------------------------------------------------------------------
// SSE4
#elif HWY_TARGET == HWY_SSE4
#define HWY_NAMESPACE N_SSE4
#define HWY_ALIGN alignas(16)
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT16 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
#define HWY_TARGET_STR "sse2,ssse3,sse4.1"
#define HWY_TARGET_STR HWY_TARGET_STR_SSE4
//-----------------------------------------------------------------------------
// AVX2
@ -61,47 +110,57 @@
#define HWY_NAMESPACE N_AVX2
#define HWY_ALIGN alignas(32)
#define HWY_MAX_BYTES 32
#define HWY_LANES(T) (32 / sizeof(T))
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT16 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 1
#define HWY_CAP_GE512 0
#if defined(HWY_DISABLE_BMI2_FMA)
#define HWY_TARGET_STR "avx,avx2,f16c"
#else
#define HWY_TARGET_STR "avx,avx2,bmi,bmi2,fma,f16c"
#endif
#define HWY_TARGET_STR HWY_TARGET_STR_AVX2
//-----------------------------------------------------------------------------
// AVX3
#elif HWY_TARGET == HWY_AVX3
// AVX3[_DL]
#elif HWY_TARGET == HWY_AVX3 || HWY_TARGET == HWY_AVX3_DL
#define HWY_ALIGN alignas(64)
#define HWY_MAX_BYTES 64
#define HWY_LANES(T) (64 / sizeof(T))
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT16 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 1
#define HWY_CAP_GE512 1
#define HWY_NAMESPACE N_AVX3
#if HWY_TARGET == HWY_AVX3
// Must include AVX2 because an AVX3 test may call AVX2 functions (e.g. when
// converting to half-vectors). HWY_DISABLE_BMI2_FMA is not relevant because if
// we have AVX3, we should also have BMI2/FMA.
#define HWY_NAMESPACE N_AVX3
#define HWY_TARGET_STR HWY_TARGET_STR_AVX3
#elif HWY_TARGET == HWY_AVX3_DL
#define HWY_NAMESPACE N_AVX3_DL
#define HWY_TARGET_STR \
"avx,avx2,bmi,bmi2,fma,f16c,avx512f,avx512vl,avx512dq,avx512bw"
HWY_TARGET_STR_AVX3 \
",vpclmulqdq,avx512vbmi2,vaes,avxvnni,avx512bitalg,avx512vpopcntdq"
#else
#error "Logic error"
#endif // HWY_TARGET == HWY_AVX3_DL
//-----------------------------------------------------------------------------
// PPC8
#elif HWY_TARGET == HWY_PPC8
#define HWY_ALIGN alignas(16)
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT16 0
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
@ -115,9 +174,11 @@
#elif HWY_TARGET == HWY_NEON
#define HWY_ALIGN alignas(16)
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT16 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
@ -135,12 +196,22 @@
// SVE[2]
#elif HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE
#if defined(HWY_EMULATE_SVE) && !defined(__F16C__)
#error "Disable HWY_CAP_FLOAT16 or ensure farm_sve actually converts to f16"
#endif
// SVE only requires lane alignment, not natural alignment of the entire vector.
#define HWY_ALIGN alignas(8)
// Upper bound, not the actual lane count!
#define HWY_LANES(T) (256 / sizeof(T))
#define HWY_MAX_BYTES 256
// <= HWY_MAX_BYTES / sizeof(T): exact size. Otherwise a fraction 1/div (div =
// 1,2,4,8) is encoded as HWY_LANES(T) / div. This value leaves enough room for
// div=8 and demoting to 1/8 the lane width while still exceeding HWY_MAX_BYTES.
#define HWY_LANES(T) (32768 / sizeof(T))
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT16 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
@ -151,16 +222,18 @@
#define HWY_NAMESPACE N_SVE
#endif
// HWY_TARGET_STR remains undefined - TODO(janwas): attribute for SVE?
// HWY_TARGET_STR remains undefined
//-----------------------------------------------------------------------------
// WASM
#elif HWY_TARGET == HWY_WASM
#define HWY_ALIGN alignas(16)
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))
#define HWY_CAP_INTEGER64 0
#define HWY_CAP_FLOAT16 1
#define HWY_CAP_FLOAT64 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
@ -169,6 +242,24 @@
#define HWY_TARGET_STR "simd128"
//-----------------------------------------------------------------------------
// WASM2
#elif HWY_TARGET == HWY_WASM2
#define HWY_ALIGN alignas(32)
#define HWY_MAX_BYTES 32
#define HWY_LANES(T) (32 / sizeof(T))
#define HWY_CAP_INTEGER64 0
#define HWY_CAP_FLOAT16 1
#define HWY_CAP_FLOAT64 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
#define HWY_NAMESPACE N_WASM2
#define HWY_TARGET_STR "simd128"
//-----------------------------------------------------------------------------
// RVV
#elif HWY_TARGET == HWY_RVV
@ -177,16 +268,25 @@
// and the compiler already aligns builtin types, so nothing to do here.
#define HWY_ALIGN
// Arbitrary constant, not the actual lane count! Large enough that we can
// mul/div by 8 for LMUL. Value matches kMaxVectorSize, see base.h.
#define HWY_LANES(T) (4096 / sizeof(T))
// The spec requires VLEN <= 2^16 bits, so the limit is 2^16 bytes (LMUL=8).
#define HWY_MAX_BYTES 65536
// <= HWY_MAX_BYTES / sizeof(T): exact size. Otherwise a fraction 1/div (div =
// 1,2,4,8) is encoded as HWY_LANES(T) / div. This value leaves enough room for
// div=8 and demoting to 1/8 the lane width while still exceeding HWY_MAX_BYTES.
#define HWY_LANES(T) (8388608 / sizeof(T))
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
#if defined(__riscv_zfh)
#define HWY_CAP_FLOAT16 1
#else
#define HWY_CAP_FLOAT16 0
#endif
#define HWY_NAMESPACE N_RVV
// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
@ -197,10 +297,11 @@
#elif HWY_TARGET == HWY_SCALAR
#define HWY_ALIGN
// For internal use only; use Lanes(d) instead.
#define HWY_MAX_BYTES 8
#define HWY_LANES(T) 1
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT16 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

third_party/highway/hwy/ops/shared-inl.h (vendored, 131 changed lines)

@ -16,6 +16,8 @@
#include <cmath>
#include "hwy/base.h"
// Separate header because foreach_target.h re-enables its include guard.
#include "hwy/ops/set_macros-inl.h"
@ -24,14 +26,11 @@ HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// SIMD operations are implemented as overloaded functions selected using a
// "descriptor" D := Simd<T, N>. T is the lane type, N a number of lanes >= 1
// (always a power of two). Users generally do not choose N directly, but
// instead use HWY_FULL(T[, LMUL]) (the largest available size). N is not
// necessarily the actual number of lanes, which is returned by Lanes(D()).
//
// Only HWY_FULL(T) and N <= 16 / sizeof(T) are guaranteed to be available - the
// latter are useful if >128 bit vectors are unnecessary or undesirable.
// SIMD operations are implemented as overloaded functions selected using a tag
// type D := Simd<T, N>. T is the lane type, N an opaque integer for internal
// use only. Users create D via aliases ScalableTag<T>() (a full vector),
// CappedTag<T, kLimit> or FixedTag<T, kNumLanes>. The actual number of lanes
// (always a power of two) is Lanes(D()).
template <typename Lane, size_t N>
struct Simd {
constexpr Simd() = default;
@ -39,7 +38,7 @@ struct Simd {
static_assert((N & (N - 1)) == 0 && N != 0, "N must be a power of two");
// Widening/narrowing ops change the number of lanes and/or their type.
// To initialize such vectors, we need the corresponding descriptor types:
// To initialize such vectors, we need the corresponding tag types:
// PromoteTo/DemoteTo() with another lane type, but same number of lanes.
template <typename NewLane>
@ -59,10 +58,88 @@ struct Simd {
using Twice = Simd<T, 2 * N>;
};
namespace detail {
// Given N from HWY_LANES(T), returns N for use in Simd<T, N> to describe:
// - a full vector (pow2 = 0);
// - 2,4,8 regs on RVV, otherwise a full vector (pow2 [1,3]);
// - a fraction of a register from 1/8 to 1/2 (pow2 [-3,-1]).
constexpr size_t ScaleByPower(size_t N, int pow2) {
#if HWY_TARGET == HWY_RVV
// For fractions, if N == 1 ensure we still return at least one lane.
return pow2 >= 0 ? (N << pow2) : HWY_MAX(1, (N >> (-pow2)));
#else
// If pow2 > 0, replace it with 0 (there is nothing wider than a full vector).
return HWY_MAX(1, N >> HWY_MAX(-pow2, 0));
#endif
}
// Struct wrappers enable validation of arguments via static_assert.
template <typename T, int kPow2>
struct ScalableTagChecker {
static_assert(-3 <= kPow2 && kPow2 <= 3, "Fraction must be 1/8 to 8");
using type = Simd<T, ScaleByPower(HWY_LANES(T), kPow2)>;
};
template <typename T, size_t kLimit>
struct CappedTagChecker {
static_assert(kLimit != 0, "Does not make sense to have zero lanes");
using type = Simd<T, HWY_MIN(kLimit, HWY_LANES(T))>;
};
template <typename T, size_t kNumLanes>
struct FixedTagChecker {
static_assert(kNumLanes != 0, "Does not make sense to have zero lanes");
static_assert(kNumLanes * sizeof(T) <= HWY_MAX_BYTES, "Too many lanes");
#if HWY_TARGET == HWY_SCALAR
// HWY_MAX_BYTES would still allow uint8x8, which is not supported.
static_assert(kNumLanes == 1, "Scalar only supports one lane");
#endif
using type = Simd<T, kNumLanes>;
};
} // namespace detail
// Alias for a tag describing a full vector (kPow2 == 0: the most common usage,
// e.g. 1D loops where the application does not care about the vector size) or a
// fraction/multiple of one. Multiples are the same as full vectors for all
// targets except RVV. Fractions (kPow2 < 0) are useful as the argument/return
// value of type promotion and demotion.
template <typename T, int kPow2 = 0>
using ScalableTag = typename detail::ScalableTagChecker<T, kPow2>::type;
// Alias for a tag describing a vector with *up to* kLimit active lanes, even on
// targets with scalable vectors and HWY_SCALAR. The runtime lane count
// `Lanes(tag)` may be less than kLimit, and is 1 on HWY_SCALAR. This alias is
// typically used for 1D loops with a relatively low application-defined upper
// bound, e.g. for 8x8 DCTs. However, it is better if data structures are
// designed to be vector-length-agnostic (e.g. a hybrid SoA where there are
// chunks of say 256 DC components followed by 256 AC1 and finally 256 AC63;
// this would enable vector-length-agnostic loops using ScalableTag).
template <typename T, size_t kLimit>
using CappedTag = typename detail::CappedTagChecker<T, kLimit>::type;
// Alias for a tag describing a vector with *exactly* kNumLanes active lanes,
// even on targets with scalable vectors. All targets except HWY_SCALAR support
// up to 16 / sizeof(T). Other targets may allow larger kNumLanes, but relying
// on that is non-portable and discouraged.
//
// NOTE: if the application does not need to support HWY_SCALAR (+), use this
// instead of CappedTag to emphasize that there will be exactly kNumLanes lanes.
// This is useful for data structures that rely on exactly 128-bit SIMD, but
// these are discouraged because they cannot benefit from wider vectors.
// Instead, applications would ideally define a larger problem size and loop
// over it with the (unknown size) vectors from ScalableTag.
//
// + e.g. if the baseline is known to support SIMD, or the application requires
// ops such as TableLookupBytes not supported by HWY_SCALAR.
template <typename T, size_t kNumLanes>
using FixedTag = typename detail::FixedTagChecker<T, kNumLanes>::type;
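A brief sketch of how the three aliases are typically used in application code that includes hwy/highway.h (the kLimit and kNumLanes values here are arbitrary examples):
void TagExamples() {
  const ScalableTag<float> df;     // full native vector of float
  const CappedTag<int16_t, 8> d8;  // at most 8 lanes; Lanes(d8) may be fewer
  const FixedTag<uint32_t, 4> d4;  // exactly 4 lanes (not on HWY_SCALAR)
  (void)df;
  (void)d8;
  (void)d4;
}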
template <class D>
using TFromD = typename D::T;
// Descriptor for the same number of lanes as D, but with the LaneType T.
// Tag for the same number of lanes as D, but with the LaneType T.
template <class T, class D>
using Rebind = typename D::template Rebind<T>;
@ -73,7 +150,7 @@ using RebindToUnsigned = Rebind<MakeUnsigned<TFromD<D>>, D>;
template <class D>
using RebindToFloat = Rebind<MakeFloat<TFromD<D>>, D>;
// Descriptor for the same total size as D, but with the LaneType T.
// Tag for the same total size as D, but with the LaneType T.
template <class T, class D>
using Repartition = typename D::template Repartition<T>;
@ -82,7 +159,7 @@ using RepartitionToWide = Repartition<MakeWide<TFromD<D>>, D>;
template <class D>
using RepartitionToNarrow = Repartition<MakeNarrow<TFromD<D>>, D>;
// Descriptor for the same lane type as D, but half the lanes.
// Tag for the same lane type as D, but half the lanes.
template <class D>
using Half = typename D::Half;
@ -98,6 +175,17 @@ using Twice = typename D::Twice;
#define HWY_IF_LANE_SIZE_D(D, bytes) HWY_IF_LANE_SIZE(TFromD<D>, bytes)
#define HWY_IF_NOT_LANE_SIZE_D(D, bytes) HWY_IF_NOT_LANE_SIZE(TFromD<D>, bytes)
// Same, but with a vector argument.
#define HWY_IF_UNSIGNED_V(V) HWY_IF_UNSIGNED(TFromV<V>)
#define HWY_IF_SIGNED_V(V) HWY_IF_SIGNED(TFromV<V>)
#define HWY_IF_FLOAT_V(V) HWY_IF_FLOAT(TFromV<V>)
#define HWY_IF_LANE_SIZE_V(V, bytes) HWY_IF_LANE_SIZE(TFromV<V>, bytes)
// For implementing functions for a specific type.
// IsSame<...>() in template arguments is broken on MSVC2015.
#define HWY_IF_LANES_ARE(T, V) \
EnableIf<IsSameT<T, TFromD<DFromV<V>>>::value>* = nullptr
// Compile-time-constant, (typically but not guaranteed) an upper bound on the
// number of lanes.
// Prefer instead using Lanes() and dynamic allocation, or Rebind, or
@ -119,6 +207,25 @@ HWY_INLINE HWY_MAYBE_UNUSED size_t Lanes(Simd<T, N>) {
#endif
// NOTE: GCC generates incorrect code for vector arguments to non-inlined
// functions in two situations:
// - on Windows and GCC 10.3, passing by value crashes due to unaligned loads:
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412.
// - on ARM64 and GCC 9.3.0 or 11.2.1, passing by const& causes many (but not
// all) tests to fail.
//
// We therefore pass by const& only on GCC and (Windows or ARM64). This alias
// must be used for all vector/mask parameters of functions marked HWY_NOINLINE,
// and possibly also other functions that are not inlined.
#if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG && \
((defined(_WIN32) || defined(_WIN64)) || HWY_ARCH_ARM64)
template <class V>
using VecArg = const V&;
#else
template <class V>
using VecArg = V;
#endif
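A minimal sketch of how VecArg is intended to be used on non-inlined helpers (the function name is illustrative; Add is an existing Highway op):
template <class D, class V>
HWY_NOINLINE V DoubleIt(D /*d*/, VecArg<V> v) {
  return Add(v, v);  // pass-by-value or const& is chosen by the alias above
}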
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy

third_party/highway/hwy/ops/wasm_128-inl.h (vendored, 1687 changed lines; diff not shown because of its size)

third_party/highway/hwy/ops/wasm_256-inl.h (vendored, new file, 3513 lines; diff not shown because of its size)

third_party/highway/hwy/ops/x86_128-inl.h (vendored, 4106 changed lines; diff not shown because of its size)

2754
third_party/highway/hwy/ops/x86_256-inl.h (vendored)

Diff not shown because of its large size.

1883
third_party/highway/hwy/ops/x86_512-inl.h (vendored)

Diff not shown because of its large size.

208
third_party/highway/hwy/targets.cc (vendored)
View file

@ -19,6 +19,7 @@
#include <stdio.h>
#include <atomic>
#include <cstddef>
#include <limits>
#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
@ -40,14 +41,14 @@ namespace {
#if HWY_ARCH_X86
bool IsBitSet(const uint32_t reg, const int index) {
HWY_INLINE bool IsBitSet(const uint32_t reg, const int index) {
return (reg & (1U << index)) != 0;
}
// Calls CPUID instruction with eax=level and ecx=count and returns the result
// in abcd array where abcd = {eax, ebx, ecx, edx} (hence the name abcd).
void Cpuid(const uint32_t level, const uint32_t count,
uint32_t* HWY_RESTRICT abcd) {
HWY_INLINE void Cpuid(const uint32_t level, const uint32_t count,
uint32_t* HWY_RESTRICT abcd) {
#if HWY_COMPILER_MSVC
int regs[4];
__cpuidex(regs, level, count);
@ -95,37 +96,87 @@ uint32_t supported_targets_for_test_ = 0;
uint32_t supported_mask_{std::numeric_limits<uint32_t>::max()};
#if HWY_ARCH_X86
// Bits indicating which instruction set extensions are supported.
constexpr uint32_t kSSE = 1 << 0;
constexpr uint32_t kSSE2 = 1 << 1;
constexpr uint32_t kSSE3 = 1 << 2;
constexpr uint32_t kSSSE3 = 1 << 3;
constexpr uint32_t kSSE41 = 1 << 4;
constexpr uint32_t kSSE42 = 1 << 5;
constexpr uint32_t kGroupSSE4 = kSSE | kSSE2 | kSSE3 | kSSSE3 | kSSE41 | kSSE42;
// Arbitrary bit indices indicating which instruction set extensions are
// supported. Use enum to ensure values are distinct.
enum class FeatureIndex : uint32_t {
kSSE = 0,
kSSE2,
kSSE3,
kSSSE3,
constexpr uint32_t kAVX = 1u << 6;
constexpr uint32_t kAVX2 = 1u << 7;
constexpr uint32_t kFMA = 1u << 8;
constexpr uint32_t kLZCNT = 1u << 9;
constexpr uint32_t kBMI = 1u << 10;
constexpr uint32_t kBMI2 = 1u << 11;
kSSE41,
kSSE42,
kCLMUL,
kAES,
kAVX,
kAVX2,
kF16C,
kFMA,
kLZCNT,
kBMI,
kBMI2,
kAVX512F,
kAVX512VL,
kAVX512DQ,
kAVX512BW,
kVNNI,
kVPCLMULQDQ,
kVBMI2,
kVAES,
kPOPCNTDQ,
kBITALG,
kSentinel
};
static_assert(static_cast<size_t>(FeatureIndex::kSentinel) < 64,
"Too many bits for u64");
HWY_INLINE constexpr uint64_t Bit(FeatureIndex index) {
return 1ull << static_cast<size_t>(index);
}
constexpr uint64_t kGroupSSSE3 =
Bit(FeatureIndex::kSSE) | Bit(FeatureIndex::kSSE2) |
Bit(FeatureIndex::kSSE3) | Bit(FeatureIndex::kSSSE3);
constexpr uint64_t kGroupSSE4 =
Bit(FeatureIndex::kSSE41) | Bit(FeatureIndex::kSSE42) |
Bit(FeatureIndex::kCLMUL) | Bit(FeatureIndex::kAES) | kGroupSSSE3;
// We normally assume BMI/BMI2/FMA are available if AVX2 is. This allows us to
// use BZHI and (compiler-generated) MULX. However, VirtualBox lacks them
// [https://www.virtualbox.org/ticket/15471]. Thus we provide the option of
// neither using nor requiring them, so that AVX2 can still be used.
#ifdef HWY_DISABLE_BMI2_FMA
constexpr uint32_t kGroupAVX2 = kAVX | kAVX2 | kLZCNT;
constexpr uint64_t kGroupBMI2_FMA = 0;
#else
constexpr uint32_t kGroupAVX2 = kAVX | kAVX2 | kFMA | kLZCNT | kBMI | kBMI2;
constexpr uint64_t kGroupBMI2_FMA = Bit(FeatureIndex::kBMI) |
Bit(FeatureIndex::kBMI2) |
Bit(FeatureIndex::kFMA);
#endif
constexpr uint32_t kAVX512F = 1u << 12;
constexpr uint32_t kAVX512VL = 1u << 13;
constexpr uint32_t kAVX512DQ = 1u << 14;
constexpr uint32_t kAVX512BW = 1u << 15;
constexpr uint32_t kGroupAVX3 = kAVX512F | kAVX512VL | kAVX512DQ | kAVX512BW;
#ifdef HWY_DISABLE_F16C
constexpr uint64_t kGroupF16C = 0;
#else
constexpr uint64_t kGroupF16C = Bit(FeatureIndex::kF16C);
#endif
constexpr uint64_t kGroupAVX2 =
Bit(FeatureIndex::kAVX) | Bit(FeatureIndex::kAVX2) |
Bit(FeatureIndex::kLZCNT) | kGroupBMI2_FMA | kGroupF16C | kGroupSSE4;
constexpr uint64_t kGroupAVX3 =
Bit(FeatureIndex::kAVX512F) | Bit(FeatureIndex::kAVX512VL) |
Bit(FeatureIndex::kAVX512DQ) | Bit(FeatureIndex::kAVX512BW) | kGroupAVX2;
constexpr uint64_t kGroupAVX3_DL =
Bit(FeatureIndex::kVNNI) | Bit(FeatureIndex::kVPCLMULQDQ) |
Bit(FeatureIndex::kVBMI2) | Bit(FeatureIndex::kVAES) |
Bit(FeatureIndex::kPOPCNTDQ) | Bit(FeatureIndex::kBITALG) | kGroupAVX3;
#endif // HWY_ARCH_X86
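A hedged configuration sketch (not part of this file): a build that needs the AVX2 target inside a VM lacking BMI2/FMA, as noted above for VirtualBox, would define the opt-out consistently for targets.cc and all user code, typically via the compiler command line:
  // -DHWY_DISABLE_BMI2_FMA, or before every inclusion:
  // #define HWY_DISABLE_BMI2_FMA
  // #include "hwy/highway.h"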
} // namespace
@ -150,6 +201,8 @@ HWY_NORETURN void HWY_FORMAT(3, 4)
#if HWY_COMPILER_MSVC
abort(); // Compile error without this due to HWY_NORETURN.
#elif HWY_ARCH_RVV
exit(1); // trap/abort just freezes Spike
#else
__builtin_trap();
#endif
@ -193,69 +246,90 @@ uint32_t SupportedTargets() {
bits = HWY_SCALAR;
#if HWY_ARCH_X86
uint32_t flags = 0;
uint32_t abcd[4];
bool has_osxsave = false;
{ // ensures we do not accidentally use flags outside this block
uint64_t flags = 0;
uint32_t abcd[4];
Cpuid(0, 0, abcd);
const uint32_t max_level = abcd[0];
Cpuid(0, 0, abcd);
const uint32_t max_level = abcd[0];
// Standard feature flags
Cpuid(1, 0, abcd);
flags |= IsBitSet(abcd[3], 25) ? kSSE : 0;
flags |= IsBitSet(abcd[3], 26) ? kSSE2 : 0;
flags |= IsBitSet(abcd[2], 0) ? kSSE3 : 0;
flags |= IsBitSet(abcd[2], 9) ? kSSSE3 : 0;
flags |= IsBitSet(abcd[2], 19) ? kSSE41 : 0;
flags |= IsBitSet(abcd[2], 20) ? kSSE42 : 0;
flags |= IsBitSet(abcd[2], 12) ? kFMA : 0;
flags |= IsBitSet(abcd[2], 28) ? kAVX : 0;
const bool has_osxsave = IsBitSet(abcd[2], 27);
// Standard feature flags
Cpuid(1, 0, abcd);
flags |= IsBitSet(abcd[3], 25) ? Bit(FeatureIndex::kSSE) : 0;
flags |= IsBitSet(abcd[3], 26) ? Bit(FeatureIndex::kSSE2) : 0;
flags |= IsBitSet(abcd[2], 0) ? Bit(FeatureIndex::kSSE3) : 0;
flags |= IsBitSet(abcd[2], 1) ? Bit(FeatureIndex::kCLMUL) : 0;
flags |= IsBitSet(abcd[2], 9) ? Bit(FeatureIndex::kSSSE3) : 0;
flags |= IsBitSet(abcd[2], 12) ? Bit(FeatureIndex::kFMA) : 0;
flags |= IsBitSet(abcd[2], 19) ? Bit(FeatureIndex::kSSE41) : 0;
flags |= IsBitSet(abcd[2], 20) ? Bit(FeatureIndex::kSSE42) : 0;
flags |= IsBitSet(abcd[2], 25) ? Bit(FeatureIndex::kAES) : 0;
flags |= IsBitSet(abcd[2], 28) ? Bit(FeatureIndex::kAVX) : 0;
flags |= IsBitSet(abcd[2], 29) ? Bit(FeatureIndex::kF16C) : 0;
has_osxsave = IsBitSet(abcd[2], 27);
// Extended feature flags
Cpuid(0x80000001U, 0, abcd);
flags |= IsBitSet(abcd[2], 5) ? kLZCNT : 0;
// Extended feature flags
Cpuid(0x80000001U, 0, abcd);
flags |= IsBitSet(abcd[2], 5) ? Bit(FeatureIndex::kLZCNT) : 0;
// Extended features
if (max_level >= 7) {
Cpuid(7, 0, abcd);
flags |= IsBitSet(abcd[1], 3) ? kBMI : 0;
flags |= IsBitSet(abcd[1], 5) ? kAVX2 : 0;
flags |= IsBitSet(abcd[1], 8) ? kBMI2 : 0;
// Extended features
if (max_level >= 7) {
Cpuid(7, 0, abcd);
flags |= IsBitSet(abcd[1], 3) ? Bit(FeatureIndex::kBMI) : 0;
flags |= IsBitSet(abcd[1], 5) ? Bit(FeatureIndex::kAVX2) : 0;
flags |= IsBitSet(abcd[1], 8) ? Bit(FeatureIndex::kBMI2) : 0;
flags |= IsBitSet(abcd[1], 16) ? kAVX512F : 0;
flags |= IsBitSet(abcd[1], 17) ? kAVX512DQ : 0;
flags |= IsBitSet(abcd[1], 30) ? kAVX512BW : 0;
flags |= IsBitSet(abcd[1], 31) ? kAVX512VL : 0;
flags |= IsBitSet(abcd[1], 16) ? Bit(FeatureIndex::kAVX512F) : 0;
flags |= IsBitSet(abcd[1], 17) ? Bit(FeatureIndex::kAVX512DQ) : 0;
flags |= IsBitSet(abcd[1], 30) ? Bit(FeatureIndex::kAVX512BW) : 0;
flags |= IsBitSet(abcd[1], 31) ? Bit(FeatureIndex::kAVX512VL) : 0;
flags |= IsBitSet(abcd[2], 6) ? Bit(FeatureIndex::kVBMI2) : 0;
flags |= IsBitSet(abcd[2], 9) ? Bit(FeatureIndex::kVAES) : 0;
flags |= IsBitSet(abcd[2], 10) ? Bit(FeatureIndex::kVPCLMULQDQ) : 0;
flags |= IsBitSet(abcd[2], 11) ? Bit(FeatureIndex::kVNNI) : 0;
flags |= IsBitSet(abcd[2], 12) ? Bit(FeatureIndex::kBITALG) : 0;
flags |= IsBitSet(abcd[2], 14) ? Bit(FeatureIndex::kPOPCNTDQ) : 0;
}
// Set target bit(s) if all their group's flags are all set.
if ((flags & kGroupAVX3_DL) == kGroupAVX3_DL) {
bits |= HWY_AVX3_DL;
}
if ((flags & kGroupAVX3) == kGroupAVX3) {
bits |= HWY_AVX3;
}
if ((flags & kGroupAVX2) == kGroupAVX2) {
bits |= HWY_AVX2;
}
if ((flags & kGroupSSE4) == kGroupSSE4) {
bits |= HWY_SSE4;
}
if ((flags & kGroupSSSE3) == kGroupSSSE3) {
bits |= HWY_SSSE3;
}
}
// Verify OS support for XSAVE, without which XMM/YMM registers are not
// preserved across context switches and are not safe to use.
// Clear bits if the OS does not support XSAVE - otherwise, registers
// are not preserved across context switches.
if (has_osxsave) {
const uint32_t xcr0 = ReadXCR0();
// XMM
if (!IsBitSet(xcr0, 1)) {
flags = 0;
bits &=
~uint32_t(HWY_SSSE3 | HWY_SSE4 | HWY_AVX2 | HWY_AVX3 | HWY_AVX3_DL);
}
// YMM
if (!IsBitSet(xcr0, 2)) {
flags &= ~kGroupAVX2;
bits &= ~uint32_t(HWY_AVX2 | HWY_AVX3 | HWY_AVX3_DL);
}
// ZMM + opmask
if ((xcr0 & 0x70) != 0x70) {
flags &= ~kGroupAVX3;
bits &= ~uint32_t(HWY_AVX3 | HWY_AVX3_DL);
}
}
// Set target bit(s) if all their group's flags are all set.
if ((flags & kGroupAVX3) == kGroupAVX3) {
bits |= HWY_AVX3;
}
if ((flags & kGroupAVX2) == kGroupAVX2) {
bits |= HWY_AVX2;
}
if ((flags & kGroupSSE4) == kGroupSSE4) {
bits |= HWY_SSE4;
}
#else
// TODO(janwas): detect for other platforms
bits = HWY_ENABLED_BASELINE;

326
third_party/highway/hwy/targets.h (vendored)
View file

@ -21,281 +21,23 @@
// generate and call.
#include "hwy/base.h"
//------------------------------------------------------------------------------
// Optional configuration
// See ../quick_reference.md for documentation of these macros.
// Uncomment to override the default baseline determined from predefined macros:
// #define HWY_BASELINE_TARGETS (HWY_SSE4 | HWY_SCALAR)
// Uncomment to override the default blocklist:
// #define HWY_BROKEN_TARGETS HWY_AVX3
// Uncomment to definitely avoid generating those target(s):
// #define HWY_DISABLED_TARGETS HWY_SSE4
// Uncomment to avoid emitting BMI/BMI2/FMA instructions (allows generating
// AVX2 target for VMs which support AVX2 but not the other instruction sets)
// #define HWY_DISABLE_BMI2_FMA
//------------------------------------------------------------------------------
// Targets
// Unique bit value for each target. A lower value is "better" (e.g. more lanes)
// than a higher value within the same group/platform - see HWY_STATIC_TARGET.
//
// All values are unconditionally defined so we can test HWY_TARGETS without
// first checking the HWY_ARCH_*.
//
// The C99 preprocessor evaluates #if expressions using intmax_t types, so we
// can use 32-bit literals.
// 1,2,4: reserved
#define HWY_AVX3 8
#define HWY_AVX2 16
// 32: reserved for AVX
#define HWY_SSE4 64
// 0x80, 0x100, 0x200: reserved for SSSE3, SSE3, SSE2
// The highest bit in the HWY_TARGETS mask that an x86 target can have. Used
// for dynamic dispatch. All x86 target bits must be lower than or equal to
// (1 << HWY_HIGHEST_TARGET_BIT_X86) and they can only use
// HWY_MAX_DYNAMIC_TARGETS in total.
#define HWY_HIGHEST_TARGET_BIT_X86 9
#define HWY_SVE2 0x400
#define HWY_SVE 0x800
// 0x1000 reserved for Helium
#define HWY_NEON 0x2000
#define HWY_HIGHEST_TARGET_BIT_ARM 13
// 0x4000, 0x8000 reserved
#define HWY_PPC8 0x10000 // v2.07 or 3
// 0x20000, 0x40000 reserved for prior VSX/AltiVec
#define HWY_HIGHEST_TARGET_BIT_PPC 18
// 0x80000 reserved
#define HWY_WASM 0x100000
#define HWY_HIGHEST_TARGET_BIT_WASM 20
// 0x200000, 0x400000, 0x800000 reserved
#define HWY_RVV 0x1000000
#define HWY_HIGHEST_TARGET_BIT_RVV 24
// 0x2000000, 0x4000000, 0x8000000, 0x10000000 reserved
#define HWY_SCALAR 0x20000000
#define HWY_HIGHEST_TARGET_BIT_SCALAR 29
// Cannot use higher values, otherwise HWY_TARGETS computation might overflow.
//------------------------------------------------------------------------------
// Set default blocklists
// Disabled means excluded from the enabled targets at the user's request. A
// separate config macro allows disabling without deactivating the blocklist below.
#ifndef HWY_DISABLED_TARGETS
#define HWY_DISABLED_TARGETS 0
#endif
// Broken means excluded from enabled due to known compiler issues. Allow the
// user to override this blocklist without any guarantee of success.
#ifndef HWY_BROKEN_TARGETS
// x86 clang-6: we saw multiple AVX2/3 compile errors and in one case invalid
// SSE4 codegen (possibly only for msan), so disable all those targets.
#if HWY_ARCH_X86 && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
#define HWY_BROKEN_TARGETS (HWY_SSE4 | HWY_AVX2 | HWY_AVX3)
// This entails a major speed reduction, so warn unless the user explicitly
// opts in to scalar-only.
#if !defined(HWY_COMPILE_ONLY_SCALAR)
#pragma message("x86 Clang <= 6: define HWY_COMPILE_ONLY_SCALAR or upgrade.")
#endif
// 32-bit may fail to compile AVX2/3.
#elif HWY_ARCH_X86_32
#define HWY_BROKEN_TARGETS (HWY_AVX2 | HWY_AVX3)
// MSVC AVX3 support is buggy: https://github.com/Mysticial/Flops/issues/16
#elif HWY_COMPILER_MSVC != 0
#define HWY_BROKEN_TARGETS (HWY_AVX3)
// armv7be has not been tested and is not yet supported.
#elif HWY_ARCH_ARM_V7 && (defined(__ARM_BIG_ENDIAN) || defined(__BIG_ENDIAN))
#define HWY_BROKEN_TARGETS (HWY_NEON)
#else
#define HWY_BROKEN_TARGETS 0
#endif
#endif // HWY_BROKEN_TARGETS
// Enabled means neither disabled nor blocklisted.
#define HWY_ENABLED(targets) \
((targets) & ~((HWY_DISABLED_TARGETS) | (HWY_BROKEN_TARGETS)))
//------------------------------------------------------------------------------
// Detect baseline targets using predefined macros
// Baseline means the targets for which the compiler is allowed to generate
// instructions, implying the target CPU would have to support them. Do not use
// this directly because it does not take the blocklist into account. Allow the
// user to override this without any guarantee of success.
#ifndef HWY_BASELINE_TARGETS
// Also check HWY_ARCH to ensure that simulating unknown platforms ends up with
// HWY_TARGET == HWY_SCALAR.
#if HWY_ARCH_WASM && defined(__wasm_simd128__)
#define HWY_BASELINE_WASM HWY_WASM
#else
#define HWY_BASELINE_WASM 0
#endif
// Avoid choosing the PPC target until we have an implementation.
#if HWY_ARCH_PPC && defined(__VSX__) && 0
#define HWY_BASELINE_PPC8 HWY_PPC8
#else
#define HWY_BASELINE_PPC8 0
#endif
// Avoid choosing the SVE[2] targets until the implementation is ready.
#if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE2) && 0
#define HWY_BASELINE_SVE2 HWY_SVE2
#else
#define HWY_BASELINE_SVE2 0
#endif
#if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE) && 0
#define HWY_BASELINE_SVE HWY_SVE
#else
#define HWY_BASELINE_SVE 0
#endif
// GCC 4.5.4 only defines __ARM_NEON__; 5.4 defines both.
#if HWY_ARCH_ARM && (defined(__ARM_NEON__) || defined(__ARM_NEON))
#define HWY_BASELINE_NEON HWY_NEON
#else
#define HWY_BASELINE_NEON 0
#endif
// MSVC does not set SSE4_1, but it does set AVX; checking for the latter means
// we at least get SSE4 on machines supporting AVX but not AVX2.
// https://stackoverflow.com/questions/18563978/
#if HWY_ARCH_X86 && \
(defined(__SSE4_1__) || (HWY_COMPILER_MSVC != 0 && defined(__AVX__)))
#define HWY_BASELINE_SSE4 HWY_SSE4
#else
#define HWY_BASELINE_SSE4 0
#endif
#if HWY_ARCH_X86 && defined(__AVX2__)
#define HWY_BASELINE_AVX2 HWY_AVX2
#else
#define HWY_BASELINE_AVX2 0
#endif
#if HWY_ARCH_X86 && defined(__AVX512F__)
#define HWY_BASELINE_AVX3 HWY_AVX3
#else
#define HWY_BASELINE_AVX3 0
#endif
#if HWY_ARCH_RVV && defined(__riscv_vector)
#define HWY_BASELINE_RVV HWY_RVV
#else
#define HWY_BASELINE_RVV 0
#endif
#define HWY_BASELINE_TARGETS \
(HWY_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 | HWY_BASELINE_SVE2 | \
HWY_BASELINE_SVE | HWY_BASELINE_NEON | HWY_BASELINE_SSE4 | \
HWY_BASELINE_AVX2 | HWY_BASELINE_AVX3 | HWY_BASELINE_RVV)
#endif // HWY_BASELINE_TARGETS
//------------------------------------------------------------------------------
// Choose target for static dispatch
#define HWY_ENABLED_BASELINE HWY_ENABLED(HWY_BASELINE_TARGETS)
#if HWY_ENABLED_BASELINE == 0
#error "At least one baseline target must be defined and enabled"
#endif
// Best baseline, used for static dispatch. This is the least-significant 1-bit
// within HWY_ENABLED_BASELINE and lower bit values imply "better".
#define HWY_STATIC_TARGET (HWY_ENABLED_BASELINE & -HWY_ENABLED_BASELINE)
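A small worked example under the bit values defined earlier: if HWY_ENABLED_BASELINE is (HWY_SCALAR | HWY_SSE4) == 0x20000040, then x & -x keeps only the least-significant set bit, so HWY_STATIC_TARGET == 0x40 == HWY_SSE4, i.e. the best enabled baseline.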
// Start by assuming static dispatch. If we later use dynamic dispatch, this
// will be defined to other targets during the multiple-inclusion, and finally
// return to the initial value. Defining this outside begin/end_target ensures
// inl headers successfully compile by themselves (required by Bazel).
#define HWY_TARGET HWY_STATIC_TARGET
//------------------------------------------------------------------------------
// Choose targets for dynamic dispatch according to one of four policies
#if (defined(HWY_COMPILE_ONLY_SCALAR) + defined(HWY_COMPILE_ONLY_STATIC) + \
defined(HWY_COMPILE_ALL_ATTAINABLE)) > 1
#error "Invalid config: can only define a single policy for targets"
#endif
// Attainable means enabled and the compiler allows intrinsics (even when not
// allowed to autovectorize). Used in 3 and 4.
#if HWY_ARCH_X86
#define HWY_ATTAINABLE_TARGETS \
HWY_ENABLED(HWY_SCALAR | HWY_SSE4 | HWY_AVX2 | HWY_AVX3)
#else
#define HWY_ATTAINABLE_TARGETS HWY_ENABLED_BASELINE
#endif
// 1) For older compilers: disable all SIMD (could also set HWY_DISABLED_TARGETS
// to ~HWY_SCALAR, but this is more explicit).
#if defined(HWY_COMPILE_ONLY_SCALAR)
#undef HWY_STATIC_TARGET
#define HWY_STATIC_TARGET HWY_SCALAR // override baseline
#define HWY_TARGETS HWY_SCALAR
// 2) For forcing static dispatch without code changes (removing HWY_EXPORT)
#elif defined(HWY_COMPILE_ONLY_STATIC)
#define HWY_TARGETS HWY_STATIC_TARGET
// 3) For tests: include all attainable targets (in particular: scalar)
#elif defined(HWY_COMPILE_ALL_ATTAINABLE) || defined(HWY_IS_TEST)
#define HWY_TARGETS HWY_ATTAINABLE_TARGETS
// 4) Default: attainable WITHOUT non-best baseline. This reduces code size by
// excluding superseded targets, in particular scalar.
#else
#define HWY_TARGETS (HWY_ATTAINABLE_TARGETS & (2 * HWY_STATIC_TARGET - 1))
#endif // target policy
// HWY_ONCE and the multiple-inclusion mechanism rely on HWY_STATIC_TARGET being
// one of the dynamic targets. This also implies HWY_TARGETS != 0 and
// (HWY_TARGETS & HWY_ENABLED_BASELINE) != 0.
#if (HWY_TARGETS & HWY_STATIC_TARGET) == 0
#error "Logic error: best baseline should be included in dynamic targets"
#endif
//------------------------------------------------------------------------------
#include "hwy/detect_targets.h"
namespace hwy {
// Returns (cached) bitfield of enabled targets that are supported on this CPU.
// Implemented in supported_targets.cc; unconditionally compiled to support the
// use case of binary-only distributions. The HWY_SUPPORTED_TARGETS wrapper may
// allow eliding calls to this function.
// Implemented in targets.cc; unconditionally compiled to support the use case
// of binary-only distributions. The HWY_SUPPORTED_TARGETS wrapper may allow
// eliding calls to this function.
uint32_t SupportedTargets();
// Evaluates to a function call, or literal if there is a single target.
#if (HWY_TARGETS & (HWY_TARGETS - 1)) == 0
#define HWY_SUPPORTED_TARGETS HWY_TARGETS
#else
#define HWY_SUPPORTED_TARGETS hwy::SupportedTargets()
#endif
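The #if above uses the (x & (x - 1)) == 0 test: it is zero exactly when at most one bit is set, in which case the macro collapses to a compile-time literal. A hedged usage sketch (TargetName and DisableTargets are declared below; the printing and <cstdio> are illustrative):
  const uint32_t targets = HWY_SUPPORTED_TARGETS;  // cached CPU detection
  if (targets & HWY_AVX2) {
    fprintf(stderr, "%s is supported\n", hwy::TargetName(HWY_AVX2));
  }
  hwy::DisableTargets(HWY_AVX3);  // e.g. to sidestep a known issue on this CPU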
// Disable from runtime dispatch the mask of compiled in targets. Targets that
// were not enabled at compile time are ignored. This function is useful to
// disable a target supported by the CPU that is known to have bugs or when a
@ -304,17 +46,9 @@ uint32_t SupportedTargets();
// returns at least the baseline target.
void DisableTargets(uint32_t disabled_targets);
// Single target: reduce code size by eliding the call and conditional branches
// inside Choose*() functions.
#if (HWY_TARGETS & (HWY_TARGETS - 1)) == 0
#define HWY_SUPPORTED_TARGETS HWY_TARGETS
#else
#define HWY_SUPPORTED_TARGETS hwy::SupportedTargets()
#endif
// Set the mock mask of CPU supported targets instead of the actual CPU
// supported targets computed in SupportedTargets(). The return value of
// SupportedTargets() will still be affected by the DisabledTargets() mask
// SupportedTargets() will still be affected by the DisableTargets() mask
// regardless of this mock, to prevent accidentally adding targets that are
// known to be buggy in the current CPU. Call with a mask of 0 to disable the
// mock and use the actual CPU supported targets instead.
@ -340,12 +74,16 @@ HWY_INLINE std::vector<uint32_t> SupportedAndGeneratedTargets() {
static inline HWY_MAYBE_UNUSED const char* TargetName(uint32_t target) {
switch (target) {
#if HWY_ARCH_X86
case HWY_SSSE3:
return "SSSE3";
case HWY_SSE4:
return "SSE4";
case HWY_AVX2:
return "AVX2";
case HWY_AVX3:
return "AVX3";
case HWY_AVX3_DL:
return "AVX3_DL";
#endif
#if HWY_ARCH_ARM
@ -423,17 +161,17 @@ static inline HWY_MAYBE_UNUSED const char* TargetName(uint32_t target) {
// HWY_MAX_DYNAMIC_TARGETS) bit. This list must contain exactly
// HWY_MAX_DYNAMIC_TARGETS elements and does not include SCALAR. The first entry
// corresponds to the best target. Don't include a "," at the end of the list.
#define HWY_CHOOSE_TARGET_LIST(func_name) \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
HWY_CHOOSE_AVX3(func_name), /* AVX3 */ \
HWY_CHOOSE_AVX2(func_name), /* AVX2 */ \
nullptr, /* AVX */ \
HWY_CHOOSE_SSE4(func_name), /* SSE4 */ \
nullptr, /* SSSE3 */ \
nullptr, /* SSE3 */ \
nullptr /* SSE2 */
#define HWY_CHOOSE_TARGET_LIST(func_name) \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
HWY_CHOOSE_AVX3_DL(func_name), /* AVX3_DL */ \
HWY_CHOOSE_AVX3(func_name), /* AVX3 */ \
HWY_CHOOSE_AVX2(func_name), /* AVX2 */ \
nullptr, /* AVX */ \
HWY_CHOOSE_SSE4(func_name), /* SSE4 */ \
HWY_CHOOSE_SSSE3(func_name), /* SSSE3 */ \
nullptr, /* SSE3 */ \
nullptr /* SSE2 */
#elif HWY_ARCH_ARM
// See HWY_ARCH_X86 above for details.
@ -460,11 +198,11 @@ static inline HWY_MAYBE_UNUSED const char* TargetName(uint32_t target) {
// See HWY_ARCH_X86 above for details.
#define HWY_MAX_DYNAMIC_TARGETS 4
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_WASM
#define HWY_CHOOSE_TARGET_LIST(func_name) \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
HWY_CHOOSE_WASM(func_name) /* WASM */
#define HWY_CHOOSE_TARGET_LIST(func_name) \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
HWY_CHOOSE_WASM2(func_name), /* WASM2 */ \
HWY_CHOOSE_WASM(func_name) /* WASM */
#elif HWY_ARCH_RVV
// See HWY_ARCH_X86 above for details.

18
third_party/highway/hwy/targets_test.cc (vendored)
View file

@ -23,10 +23,14 @@ namespace fake {
uint32_t FakeFunction(int) { return HWY_##TGT; } \
}
DECLARE_FUNCTION(AVX3_DL)
DECLARE_FUNCTION(AVX3)
DECLARE_FUNCTION(AVX2)
DECLARE_FUNCTION(SSE4)
DECLARE_FUNCTION(SSSE3)
DECLARE_FUNCTION(NEON)
DECLARE_FUNCTION(SVE)
DECLARE_FUNCTION(SVE2)
DECLARE_FUNCTION(PPC8)
DECLARE_FUNCTION(WASM)
DECLARE_FUNCTION(RVV)
@ -49,10 +53,14 @@ void CheckFakeFunction() {
/* Second call uses the cached value from the previous call. */ \
EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \
}
CHECK_ARRAY_ENTRY(AVX3_DL)
CHECK_ARRAY_ENTRY(AVX3)
CHECK_ARRAY_ENTRY(AVX2)
CHECK_ARRAY_ENTRY(SSE4)
CHECK_ARRAY_ENTRY(SSSE3)
CHECK_ARRAY_ENTRY(NEON)
CHECK_ARRAY_ENTRY(SVE)
CHECK_ARRAY_ENTRY(SVE2)
CHECK_ARRAY_ENTRY(PPC8)
CHECK_ARRAY_ENTRY(WASM)
CHECK_ARRAY_ENTRY(RVV)
@ -84,14 +92,14 @@ TEST_F(HwyTargetsTest, DisabledTargetsTest) {
DisableTargets(0); // Reset the mask.
uint32_t current_targets = SupportedTargets();
if ((current_targets & ~HWY_ENABLED_BASELINE) == 0) {
if ((current_targets & ~uint32_t(HWY_ENABLED_BASELINE)) == 0) {
// We can't test anything else if the only compiled target is the baseline.
return;
}
// Get the lowest bit in the mask (the best target) and disable that one.
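// (~x + 1) == -x in two's complement, so x & (~x + 1) isolates the lowest set
// bit; per targets.h, lower bit values denote "better" targets.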
uint32_t lowest_target = current_targets & (~current_targets + 1);
// The lowest target shouldn't be one in the baseline.
HWY_ASSERT((lowest_target & ~HWY_ENABLED_BASELINE) != 0);
HWY_ASSERT((lowest_target & ~uint32_t(HWY_ENABLED_BASELINE)) != 0);
DisableTargets(lowest_target);
// Check that the other targets are still enabled.
@ -100,3 +108,9 @@ TEST_F(HwyTargetsTest, DisabledTargetsTest) {
}
} // namespace hwy
// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char **argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}

View file

@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
@ -40,7 +41,7 @@ struct TestPlusMinus {
for (size_t i = 0; i < N; ++i) {
lanes[i] = static_cast<T>((2 + i) + (3 + i));
}
HWY_ASSERT_VEC_EQ(d, lanes.get(), v2 + v3);
HWY_ASSERT_VEC_EQ(d, lanes.get(), Add(v2, v3));
HWY_ASSERT_VEC_EQ(d, Set(d, 2), Sub(v4, v2));
for (size_t i = 0; i < N; ++i) {
@ -199,7 +200,7 @@ struct TestLeftShifts {
// 1
for (size_t i = 0; i < N; ++i) {
const T value = kSigned ? T(i) - T(N) : T(i);
const T value = kSigned ? T(T(i) - T(N)) : T(i);
expected[i] = T(TU(value) << 1);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<1>(values));
@ -207,7 +208,7 @@ struct TestLeftShifts {
// max
for (size_t i = 0; i < N; ++i) {
const T value = kSigned ? T(i) - T(N) : T(i);
const T value = kSigned ? T(T(i) - T(N)) : T(i);
expected[i] = T(TU(value) << kMaxShift);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<kMaxShift>(values));
@ -301,6 +302,43 @@ struct TestUnsignedRightShifts {
}
};
struct TestRotateRight {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const size_t N = Lanes(d);
auto expected = AllocateAligned<T>(N);
constexpr size_t kBits = sizeof(T) * 8;
const auto mask_shift = Set(d, T{kBits});
// Cover as many bit positions as possible to test shifting out
const auto values = Shl(Set(d, T{1}), And(Iota(d, 0), mask_shift));
// Rotate by 0
HWY_ASSERT_VEC_EQ(d, values, RotateRight<0>(values));
// Rotate by 1
Store(values, d, expected.get());
for (size_t i = 0; i < N; ++i) {
expected[i] = (expected[i] >> 1) | (expected[i] << (kBits - 1));
}
HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<1>(values));
// Rotate by half
Store(values, d, expected.get());
for (size_t i = 0; i < N; ++i) {
expected[i] = (expected[i] >> (kBits / 2)) | (expected[i] << (kBits / 2));
}
HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<kBits / 2>(values));
// Rotate by max
Store(values, d, expected.get());
for (size_t i = 0; i < N; ++i) {
expected[i] = (expected[i] >> (kBits - 1)) | (expected[i] << 1);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<kBits - 1>(values));
}
};
struct TestVariableUnsignedRightShifts {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
@ -353,9 +391,9 @@ T RightShiftNegative(T val) {
TU bits;
CopyBytes<sizeof(T)>(&val, &bits);
const TU shifted = bits >> kAmount;
const TU shifted = TU(bits >> kAmount);
const TU all = ~TU(0);
const TU all = TU(~TU(0));
const size_t num_zero = sizeof(TU) * 8 - 1 - kAmount;
const TU sign_extended = static_cast<TU>((all << num_zero) & LimitsMax<TU>());
@ -376,7 +414,7 @@ class TestSignedRightShifts {
// First test positive values, negative are checked below.
const auto v0 = Zero(d);
const auto values = Iota(d, 0) & Set(d, kMax);
const auto values = And(Iota(d, 0), Set(d, kMax));
// Shift by 0
HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values));
@ -512,6 +550,14 @@ HWY_NOINLINE void TestAllVariableShifts() {
#endif
}
HWY_NOINLINE void TestAllRotateRight() {
const ForPartialVectors<TestRotateRight> test;
test(uint32_t());
#if HWY_CAP_INTEGER64
test(uint64_t());
#endif
}
struct TestUnsignedMinMax {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
@ -616,7 +662,7 @@ struct TestUnsignedMul {
for (size_t i = 0; i < N; ++i) {
expected[i] = static_cast<T>((1 + i) * (1 + i));
}
HWY_ASSERT_VEC_EQ(d, expected.get(), vi * vi);
HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vi, vi));
for (size_t i = 0; i < N; ++i) {
expected[i] = static_cast<T>((1 + i) * (3 + i));
@ -655,8 +701,8 @@ struct TestSignedMul {
}
HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vi, vi));
for (int i = 0; i < static_cast<int>(N); ++i) {
expected[i] = static_cast<T>((-T(N) + i) * (1 + i));
for (size_t i = 0; i < N; ++i) {
expected[i] = static_cast<T>((-T(N) + T(i)) * T(1u + i));
}
HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vn, vi));
HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vi, vn));
@ -695,22 +741,22 @@ struct TestMulHigh {
// Large positive squared
for (size_t i = 0; i < N; ++i) {
in_lanes[i] = LimitsMax<T>() >> i;
expected_lanes[i] = (Wide(in_lanes[i]) * in_lanes[i]) >> 16;
in_lanes[i] = T(LimitsMax<T>() >> i);
expected_lanes[i] = T((Wide(in_lanes[i]) * in_lanes[i]) >> 16);
}
auto v = Load(d, in_lanes.get());
HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(v, v));
// Large positive * small positive
for (int i = 0; i < static_cast<int>(N); ++i) {
expected_lanes[i] = static_cast<T>((Wide(in_lanes[i]) * T(1 + i)) >> 16);
for (size_t i = 0; i < N; ++i) {
expected_lanes[i] = T((Wide(in_lanes[i]) * T(1u + i)) >> 16);
}
HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(v, vi));
HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(vi, v));
// Large positive * small negative
for (size_t i = 0; i < N; ++i) {
expected_lanes[i] = (Wide(in_lanes[i]) * T(i - N)) >> 16;
expected_lanes[i] = T((Wide(in_lanes[i]) * T(i - N)) >> 16);
}
HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(v, vni));
HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(vni, v));
@ -747,10 +793,52 @@ struct TestMulEven {
}
};
struct TestMulEvenOdd64 {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
#if HWY_TARGET != HWY_SCALAR
const auto v0 = Zero(d);
HWY_ASSERT_VEC_EQ(d, Zero(d), MulEven(v0, v0));
HWY_ASSERT_VEC_EQ(d, Zero(d), MulOdd(v0, v0));
const size_t N = Lanes(d);
if (N == 1) return;
auto in1 = AllocateAligned<T>(N);
auto in2 = AllocateAligned<T>(N);
auto expected_even = AllocateAligned<T>(N);
auto expected_odd = AllocateAligned<T>(N);
// Random inputs in each lane
RandomState rng;
for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
for (size_t i = 0; i < N; ++i) {
in1[i] = Random64(&rng);
in2[i] = Random64(&rng);
}
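// Each even/odd lane pair holds one 128-bit product: Mul128 returns the low
// 64 bits and writes the high 64 bits through the pointer, mirroring how
// MulEven/MulOdd lay out their results.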
for (size_t i = 0; i < N; i += 2) {
expected_even[i] = Mul128(in1[i], in2[i], &expected_even[i + 1]);
expected_odd[i] = Mul128(in1[i + 1], in2[i + 1], &expected_odd[i + 1]);
}
const auto a = Load(d, in1.get());
const auto b = Load(d, in2.get());
HWY_ASSERT_VEC_EQ(d, expected_even.get(), MulEven(a, b));
HWY_ASSERT_VEC_EQ(d, expected_odd.get(), MulOdd(a, b));
}
#else
(void)d;
#endif // HWY_TARGET != HWY_SCALAR
}
};
HWY_NOINLINE void TestAllMulEven() {
ForPartialVectors<TestMulEven> test;
ForExtendableVectors<TestMulEven> test;
test(int32_t());
test(uint32_t());
ForGE128Vectors<TestMulEvenOdd64>()(uint64_t());
}
struct TestMulAdd {
@ -784,7 +872,8 @@ struct TestMulAdd {
HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulAdd(Neg(v2), v2, v1));
for (size_t i = 0; i < N; ++i) {
expected[i] = -T(i + 2) * (i + 2) + (1 + i);
expected[i] =
T(-T(i + 2u) * static_cast<T>(i + 2) + static_cast<T>(1 + i));
}
HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulAdd(v2, v2, v1));
@ -819,6 +908,78 @@ HWY_NOINLINE void TestAllMulAdd() {
ForFloatTypes(ForPartialVectors<TestMulAdd>());
}
struct TestReorderWidenMulAccumulate {
template <typename TN, class DN>
HWY_NOINLINE void operator()(TN /*unused*/, DN dn) {
using TW = MakeWide<TN>;
const RepartitionToWide<DN> dw;
const auto f0 = Zero(dw);
const auto f1 = Set(dw, 1.0f);
const auto fi = Iota(dw, 1);
const auto bf0 = ReorderDemote2To(dn, f0, f0);
const auto bf1 = ReorderDemote2To(dn, f1, f1);
const auto bfi = ReorderDemote2To(dn, fi, fi);
const size_t NW = Lanes(dw);
auto delta = AllocateAligned<TW>(2 * NW);
for (size_t i = 0; i < 2 * NW; ++i) {
delta[i] = 0.0f;
}
// Any input zero => both outputs zero
auto sum1 = f0;
HWY_ASSERT_VEC_EQ(dw, f0,
ReorderWidenMulAccumulate(dw, bf0, bf0, f0, sum1));
HWY_ASSERT_VEC_EQ(dw, f0, sum1);
HWY_ASSERT_VEC_EQ(dw, f0,
ReorderWidenMulAccumulate(dw, bf0, bfi, f0, sum1));
HWY_ASSERT_VEC_EQ(dw, f0, sum1);
HWY_ASSERT_VEC_EQ(dw, f0,
ReorderWidenMulAccumulate(dw, bfi, bf0, f0, sum1));
HWY_ASSERT_VEC_EQ(dw, f0, sum1);
// delta[p] := 1.0, all others zero. For each p: Dot(delta, all-ones) == 1.
for (size_t p = 0; p < 2 * NW; ++p) {
delta[p] = 1.0f;
const auto delta0 = Load(dw, delta.get() + 0);
const auto delta1 = Load(dw, delta.get() + NW);
delta[p] = 0.0f;
const auto bf_delta = ReorderDemote2To(dn, delta0, delta1);
{
sum1 = f0;
const auto sum0 =
ReorderWidenMulAccumulate(dw, bf_delta, bf1, f0, sum1);
HWY_ASSERT_EQ(1.0f, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
}
// Swapped arg order
{
sum1 = f0;
const auto sum0 =
ReorderWidenMulAccumulate(dw, bf1, bf_delta, f0, sum1);
HWY_ASSERT_EQ(1.0f, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
}
// Start with nonzero sum0 or sum1
{
sum1 = delta1;
const auto sum0 =
ReorderWidenMulAccumulate(dw, bf_delta, bf1, delta0, sum1);
HWY_ASSERT_EQ(2.0f, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
}
// Start with nonzero sum0 or sum1, and swap arg order
{
sum1 = delta1;
const auto sum0 =
ReorderWidenMulAccumulate(dw, bf1, bf_delta, delta0, sum1);
HWY_ASSERT_EQ(2.0f, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
}
}
}
};
HWY_NOINLINE void TestAllReorderWidenMulAccumulate() {
ForShrinkableVectors<TestReorderWidenMulAccumulate>()(bfloat16_t());
}
struct TestDiv {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
@ -852,13 +1013,24 @@ struct TestApproximateReciprocal {
Store(ApproximateReciprocal(nonzero), d, actual.get());
double max_l1 = 0.0;
double worst_expected = 0.0;
double worst_actual = 0.0;
for (size_t i = 0; i < N; ++i) {
max_l1 = std::max<double>(max_l1, std::abs((1.0 / input[i]) - actual[i]));
const double expected = 1.0 / input[i];
const double l1 = std::abs(expected - actual[i]);
if (l1 > max_l1) {
max_l1 = l1;
worst_expected = expected;
worst_actual = actual[i];
}
}
const double abs_worst_expected = std::abs(worst_expected);
if (abs_worst_expected > 1E-5) {
const double max_rel = max_l1 / abs_worst_expected;
fprintf(stderr, "max l1 %f rel %f (%f vs %f)\n", max_l1, max_rel,
worst_expected, worst_actual);
HWY_ASSERT(max_rel < 0.004);
}
const double max_rel = max_l1 / std::abs(1.0 / input[N - 1]);
printf("max err %f\n", max_rel);
HWY_ASSERT(max_rel < 0.002);
}
};
@ -870,7 +1042,7 @@ struct TestSquareRoot {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const auto vi = Iota(d, 0);
HWY_ASSERT_VEC_EQ(d, vi, Sqrt(vi * vi));
HWY_ASSERT_VEC_EQ(d, vi, Sqrt(Mul(vi, vi)));
}
};
@ -888,7 +1060,11 @@ struct TestReciprocalSquareRoot {
for (size_t i = 0; i < N; ++i) {
float err = lanes[i] - 0.090166f;
if (err < 0.0f) err = -err;
HWY_ASSERT(err < 1E-4f);
if (err >= 4E-4f) {
HWY_ABORT("Lane %" PRIu64 "(%" PRIu64 "): actual %f err %f\n",
static_cast<uint64_t>(i), static_cast<uint64_t>(N), lanes[i],
err);
}
}
}
};
@ -901,36 +1077,53 @@ template <typename T, class D>
AlignedFreeUniquePtr<T[]> RoundTestCases(T /*unused*/, D d, size_t& padded) {
const T eps = std::numeric_limits<T>::epsilon();
const T test_cases[] = {
// +/- 1
T(1), T(-1),
// +/- 0
T(0), T(-0),
// near 0
T(0.4), T(-0.4),
// +/- integer
T(4), T(-32),
// positive near limit
MantissaEnd<T>() - T(1.5), MantissaEnd<T>() + T(1.5),
// negative near limit
-MantissaEnd<T>() - T(1.5), -MantissaEnd<T>() + T(1.5),
// +/- huge (but still fits in float)
T(1E34), T(-1E35),
// positive tiebreak
T(1.5), T(2.5),
// negative tiebreak
T(-1.5), T(-2.5),
// positive +/- delta
T(2.0001), T(3.9999),
// negative +/- delta
T(-999.9999), T(-998.0001),
// positive +/- epsilon
T(1) + eps, T(1) - eps,
// negative +/- epsilon
T(-1) + eps, T(-1) - eps,
// +/- infinity
std::numeric_limits<T>::infinity(), -std::numeric_limits<T>::infinity(),
// qNaN
GetLane(NaN(d))};
// +/- 1
T(1),
T(-1),
// +/- 0
T(0),
T(-0),
// near 0
T(0.4),
T(-0.4),
// +/- integer
T(4),
T(-32),
// positive near limit
MantissaEnd<T>() - T(1.5),
MantissaEnd<T>() + T(1.5),
// negative near limit
-MantissaEnd<T>() - T(1.5),
-MantissaEnd<T>() + T(1.5),
// positive tiebreak
T(1.5),
T(2.5),
// negative tiebreak
T(-1.5),
T(-2.5),
// positive +/- delta
T(2.0001),
T(3.9999),
// negative +/- delta
T(-999.9999),
T(-998.0001),
// positive +/- epsilon
T(1) + eps,
T(1) - eps,
// negative +/- epsilon
T(-1) + eps,
T(-1) - eps,
#if !defined(HWY_EMULATE_SVE) // these are not safe to just cast to int
// +/- huge (but still fits in float)
T(1E34),
T(-1E35),
// +/- infinity
std::numeric_limits<T>::infinity(),
-std::numeric_limits<T>::infinity(),
// qNaN
GetLane(NaN(d))
#endif
};
const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
const size_t N = Lanes(d);
padded = RoundUpTo(kNumTestCases, N); // allow loading whole vectors
@ -952,7 +1145,7 @@ struct TestRound {
// Avoid [std::]round, which does not round to nearest *even*.
// NOTE: std:: version from C++11 cmath is not defined in RVV GCC, see
// https://lists.freebsd.org/pipermail/freebsd-current/2014-January/048130.html
expected[i] = nearbyint(in[i]);
expected[i] = static_cast<T>(nearbyint(in[i]));
}
for (size_t i = 0; i < padded; i += Lanes(d)) {
HWY_ASSERT_VEC_EQ(d, &expected[i], Round(Load(d, &in[i])));
@ -983,7 +1176,7 @@ struct TestNearestInt {
// Avoid undefined result for lrintf
expected[i] = std::signbit(in[i]) ? LimitsMin<TI>() : LimitsMax<TI>();
} else {
expected[i] = lrintf(in[i]);
expected[i] = static_cast<TI>(lrintf(in[i]));
}
}
for (size_t i = 0; i < padded; i += Lanes(df)) {
@ -1008,7 +1201,7 @@ struct TestTrunc {
for (size_t i = 0; i < padded; ++i) {
// NOTE: std:: version from C++11 cmath is not defined in RVV GCC, see
// https://lists.freebsd.org/pipermail/freebsd-current/2014-January/048130.html
expected[i] = trunc(in[i]);
expected[i] = static_cast<T>(trunc(in[i]));
}
for (size_t i = 0; i < padded; i += Lanes(d)) {
HWY_ASSERT_VEC_EQ(d, &expected[i], Trunc(Load(d, &in[i])));
@ -1074,30 +1267,20 @@ struct TestSumOfLanes {
in_lanes[i] = i < kBits ? static_cast<T>(1ull << i) : 0;
sum += static_cast<double>(in_lanes[i]);
}
HWY_ASSERT_VEC_EQ(d, Set(d, T(sum)), SumOfLanes(Load(d, in_lanes.get())));
HWY_ASSERT_VEC_EQ(d, Set(d, T(sum)),
SumOfLanes(d, Load(d, in_lanes.get())));
// Lane i = i (iota) to include upper lanes
sum = 0.0;
for (size_t i = 0; i < N; ++i) {
sum += static_cast<double>(i);
}
HWY_ASSERT_VEC_EQ(d, Set(d, T(sum)), SumOfLanes(Iota(d, 0)));
HWY_ASSERT_VEC_EQ(d, Set(d, T(sum)), SumOfLanes(d, Iota(d, 0)));
}
};
HWY_NOINLINE void TestAllSumOfLanes() {
const ForPartialVectors<TestSumOfLanes> sum;
// No u8/u16/i8/i16.
sum(uint32_t());
sum(int32_t());
#if HWY_CAP_INTEGER64
sum(uint64_t());
sum(int64_t());
#endif
ForFloatTypes(sum);
ForUIF3264(ForPartialVectors<TestSumOfLanes>());
}
struct TestMinOfLanes {
@ -1112,17 +1295,17 @@ struct TestMinOfLanes {
constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51);
for (size_t i = 0; i < N; ++i) {
in_lanes[i] = i < kBits ? static_cast<T>(1ull << i) : 2;
min = std::min(min, in_lanes[i]);
min = HWY_MIN(min, in_lanes[i]);
}
HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(Load(d, in_lanes.get())));
HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get())));
// Lane i = N - i to include upper lanes
min = HighestValue<T>();
for (size_t i = 0; i < N; ++i) {
in_lanes[i] = static_cast<T>(N - i); // no 8-bit T so no wraparound
min = std::min(min, in_lanes[i]);
min = HWY_MIN(min, in_lanes[i]);
}
HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(Load(d, in_lanes.get())));
HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get())));
}
};
@ -1137,39 +1320,29 @@ struct TestMaxOfLanes {
constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51);
for (size_t i = 0; i < N; ++i) {
in_lanes[i] = i < kBits ? static_cast<T>(1ull << i) : 0;
max = std::max(max, in_lanes[i]);
max = HWY_MAX(max, in_lanes[i]);
}
HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(Load(d, in_lanes.get())));
HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get())));
// Lane i = i to include upper lanes
max = LowestValue<T>();
for (size_t i = 0; i < N; ++i) {
in_lanes[i] = static_cast<T>(i); // no 8-bit T so no wraparound
max = std::max(max, in_lanes[i]);
max = HWY_MAX(max, in_lanes[i]);
}
HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(Load(d, in_lanes.get())));
HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get())));
}
};
HWY_NOINLINE void TestAllMinMaxOfLanes() {
const ForPartialVectors<TestMinOfLanes> min;
const ForPartialVectors<TestMaxOfLanes> max;
// No u8/u16/i8/i16.
min(uint32_t());
max(uint32_t());
min(int32_t());
max(int32_t());
#if HWY_CAP_INTEGER64
min(uint64_t());
max(uint64_t());
min(int64_t());
max(int64_t());
#endif
ForFloatTypes(min);
ForFloatTypes(max);
const ForPartialVectors<TestMinOfLanes> test_min;
const ForPartialVectors<TestMaxOfLanes> test_max;
ForUIF3264(test_min);
ForUIF3264(test_max);
test_min(uint16_t());
test_max(uint16_t());
test_min(int16_t());
test_max(int16_t());
}
struct TestAbsDiff {
@ -1219,12 +1392,14 @@ HWY_NOINLINE void TestAllNeg() {
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
HWY_BEFORE_TEST(HwyArithmeticTest);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllPlusMinus);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSaturatingArithmetic);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllShifts);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllVariableShifts);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllRotateRight);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAverage);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAbs);
@ -1232,6 +1407,7 @@ HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMul);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMulHigh);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMulEven);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMulAdd);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllReorderWidenMulAccumulate);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllDiv);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllApproximateReciprocal);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSquareRoot);
@ -1246,4 +1422,11 @@ HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllFloor);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAbsDiff);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllNeg);
} // namespace hwy
// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
#endif

645
third_party/highway/hwy/tests/blockwise_test.cc (vendored) Normal file
View file

@ -0,0 +1,645 @@
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/blockwise_test.cc"
#include "hwy/foreach_target.h"
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
struct TestShiftBytes {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
// Scalar does not define Shift*Bytes.
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
const Repartition<uint8_t, D> du8;
const size_t N8 = Lanes(du8);
// Zero remains zero
const auto v0 = Zero(d);
HWY_ASSERT_VEC_EQ(d, v0, ShiftLeftBytes<1>(v0));
HWY_ASSERT_VEC_EQ(d, v0, ShiftLeftBytes<1>(d, v0));
HWY_ASSERT_VEC_EQ(d, v0, ShiftRightBytes<1>(d, v0));
// Zero after shifting out the high/low byte
auto bytes = AllocateAligned<uint8_t>(N8);
std::fill(bytes.get(), bytes.get() + N8, 0);
bytes[N8 - 1] = 0x7F;
const auto vhi = BitCast(d, Load(du8, bytes.get()));
bytes[N8 - 1] = 0;
bytes[0] = 0x7F;
const auto vlo = BitCast(d, Load(du8, bytes.get()));
HWY_ASSERT_VEC_EQ(d, v0, ShiftLeftBytes<1>(vhi));
HWY_ASSERT_VEC_EQ(d, v0, ShiftLeftBytes<1>(d, vhi));
HWY_ASSERT_VEC_EQ(d, v0, ShiftRightBytes<1>(d, vlo));
// Check expected result with Iota
const size_t N = Lanes(d);
auto in = AllocateAligned<T>(N);
const uint8_t* in_bytes = reinterpret_cast<const uint8_t*>(in.get());
const auto v = BitCast(d, Iota(du8, 1));
Store(v, d, in.get());
auto expected = AllocateAligned<T>(N);
uint8_t* expected_bytes = reinterpret_cast<uint8_t*>(expected.get());
const size_t kBlockSize = HWY_MIN(N8, 16);
for (size_t block = 0; block < N8; block += kBlockSize) {
expected_bytes[block] = 0;
memcpy(expected_bytes + block + 1, in_bytes + block, kBlockSize - 1);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftBytes<1>(v));
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftBytes<1>(d, v));
for (size_t block = 0; block < N8; block += kBlockSize) {
memcpy(expected_bytes + block, in_bytes + block + 1, kBlockSize - 1);
expected_bytes[block + kBlockSize - 1] = 0;
}
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightBytes<1>(d, v));
#else
(void)d;
#endif // #if HWY_TARGET != HWY_SCALAR
}
};
HWY_NOINLINE void TestAllShiftBytes() {
ForIntegerTypes(ForPartialVectors<TestShiftBytes>());
}
struct TestShiftLanes {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
// Scalar does not define Shift*Lanes.
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
const auto v = Iota(d, T(1));
const size_t N = Lanes(d);
auto expected = AllocateAligned<T>(N);
HWY_ASSERT_VEC_EQ(d, v, ShiftLeftLanes<0>(v));
HWY_ASSERT_VEC_EQ(d, v, ShiftLeftLanes<0>(d, v));
HWY_ASSERT_VEC_EQ(d, v, ShiftRightLanes<0>(d, v));
constexpr size_t kLanesPerBlock = 16 / sizeof(T);
for (size_t i = 0; i < N; ++i) {
expected[i] = (i % kLanesPerBlock) == 0 ? T(0) : T(i);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftLanes<1>(v));
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftLanes<1>(d, v));
for (size_t i = 0; i < N; ++i) {
const size_t mod = i % kLanesPerBlock;
expected[i] = mod == (kLanesPerBlock - 1) || i >= N - 1 ? T(0) : T(2 + i);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightLanes<1>(d, v));
#else
(void)d;
#endif // #if HWY_TARGET != HWY_SCALAR
}
};
HWY_NOINLINE void TestAllShiftLanes() {
ForAllTypes(ForPartialVectors<TestShiftLanes>());
}
template <typename D, int kLane>
struct TestBroadcastR {
HWY_NOINLINE void operator()() const {
using T = typename D::T;
const D d;
const size_t N = Lanes(d);
if (kLane >= N) return;
auto in_lanes = AllocateAligned<T>(N);
std::fill(in_lanes.get(), in_lanes.get() + N, T(0));
const size_t blockN = HWY_MIN(N * sizeof(T), 16) / sizeof(T);
// Need to set within each 128-bit block
for (size_t block = 0; block < N; block += blockN) {
in_lanes[block + kLane] = static_cast<T>(block + 1);
}
const auto in = Load(d, in_lanes.get());
auto expected = AllocateAligned<T>(N);
for (size_t block = 0; block < N; block += blockN) {
for (size_t i = 0; i < blockN; ++i) {
expected[block + i] = T(block + 1);
}
}
HWY_ASSERT_VEC_EQ(d, expected.get(), Broadcast<kLane>(in));
TestBroadcastR<D, kLane - 1>()();
}
};
template <class D>
struct TestBroadcastR<D, -1> {
void operator()() const {}
};
struct TestBroadcast {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
TestBroadcastR<D, HWY_MIN(MaxLanes(d), 16 / sizeof(T)) - 1>()();
}
};
HWY_NOINLINE void TestAllBroadcast() {
const ForPartialVectors<TestBroadcast> test;
// No u/i8.
test(uint16_t());
test(int16_t());
ForUIF3264(test);
}
template <bool kFull>
struct ChooseTableSize {
template <typename T, typename DIdx>
using type = DIdx;
};
template <>
struct ChooseTableSize<true> {
template <typename T, typename DIdx>
using type = ScalableTag<T>;
};
template <bool kFull>
struct TestTableLookupBytes {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
#if HWY_TARGET != HWY_SCALAR
RandomState rng;
const typename ChooseTableSize<kFull>::template type<T, D> d_tbl;
const Repartition<uint8_t, decltype(d_tbl)> d_tbl8;
const size_t NT8 = Lanes(d_tbl8);
const Repartition<uint8_t, D> d8;
const size_t N = Lanes(d);
const size_t N8 = Lanes(d8);
// Random input bytes
auto in_bytes = AllocateAligned<uint8_t>(NT8);
for (size_t i = 0; i < NT8; ++i) {
in_bytes[i] = Random32(&rng) & 0xFF;
}
const auto in = BitCast(d_tbl, Load(d_tbl8, in_bytes.get()));
// Enough test data; for larger vectors, upper lanes will be zero.
const uint8_t index_bytes_source[64] = {
// Same index as source, multiple outputs from same input,
// unused input (9), ascending/descending and nonconsecutive neighbors.
0, 2, 1, 2, 15, 12, 13, 14, 6, 7, 8, 5, 4, 3, 10, 11,
11, 10, 3, 4, 5, 8, 7, 6, 14, 13, 12, 15, 2, 1, 2, 0,
4, 3, 2, 2, 5, 6, 7, 7, 15, 15, 15, 15, 15, 15, 0, 1};
auto index_bytes = AllocateAligned<uint8_t>(N8);
const size_t max_index = HWY_MIN(N8, 16) - 1;
for (size_t i = 0; i < N8; ++i) {
index_bytes[i] = (i < 64) ? index_bytes_source[i] : 0;
// Avoid asan error for partial vectors.
index_bytes[i] = static_cast<uint8_t>(HWY_MIN(index_bytes[i], max_index));
}
const auto indices = Load(d, reinterpret_cast<const T*>(index_bytes.get()));
auto expected = AllocateAligned<T>(N);
uint8_t* expected_bytes = reinterpret_cast<uint8_t*>(expected.get());
for (size_t block = 0; block < N8; block += 16) {
for (size_t i = 0; i < 16 && (block + i) < N8; ++i) {
const uint8_t index = index_bytes[block + i];
HWY_ASSERT(block + index < N8); // indices were already capped to N8.
// For large vectors, the lane index may wrap around due to block.
expected_bytes[block + i] = in_bytes[(block & 0xFF) + index];
}
}
HWY_ASSERT_VEC_EQ(d, expected.get(), TableLookupBytes(in, indices));
// Individually test zeroing each byte position.
for (size_t i = 0; i < N8; ++i) {
const uint8_t prev_expected = expected_bytes[i];
const uint8_t prev_index = index_bytes[i];
expected_bytes[i] = 0;
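// Index bytes with the most-significant bit set ask TableLookupBytesOr0 to
// produce a zero output byte; also vary the other high bits of the index.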
const int idx = 0x80 + (int(Random32(&rng) & 7) << 4);
HWY_ASSERT(0x80 <= idx && idx < 256);
index_bytes[i] = static_cast<uint8_t>(idx);
const auto indices =
Load(d, reinterpret_cast<const T*>(index_bytes.get()));
HWY_ASSERT_VEC_EQ(d, expected.get(), TableLookupBytesOr0(in, indices));
expected_bytes[i] = prev_expected;
index_bytes[i] = prev_index;
}
#else
(void)d;
#endif
}
};
HWY_NOINLINE void TestAllTableLookupBytes() {
// Partial index, same-sized table.
ForIntegerTypes(ForPartialVectors<TestTableLookupBytes<false>>());
// TODO(janwas): requires LMUL trunc/ext, which is not yet implemented.
#if HWY_TARGET != HWY_RVV
// Partial index, full-size table.
ForIntegerTypes(ForPartialVectors<TestTableLookupBytes<true>>());
#endif
}
struct TestInterleaveLower {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
using TU = MakeUnsigned<T>;
const size_t N = Lanes(d);
auto even_lanes = AllocateAligned<T>(N);
auto odd_lanes = AllocateAligned<T>(N);
auto expected = AllocateAligned<T>(N);
for (size_t i = 0; i < N; ++i) {
even_lanes[i] = static_cast<T>(2 * i + 0);
odd_lanes[i] = static_cast<T>(2 * i + 1);
}
const auto even = Load(d, even_lanes.get());
const auto odd = Load(d, odd_lanes.get());
const size_t blockN = HWY_MIN(16 / sizeof(T), N);
for (size_t i = 0; i < Lanes(d); ++i) {
const size_t block = i / blockN;
const size_t index = (i % blockN) + block * 2 * blockN;
expected[i] = static_cast<T>(index & LimitsMax<TU>());
}
HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveLower(even, odd));
HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveLower(d, even, odd));
}
};
struct TestInterleaveUpper {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const size_t N = Lanes(d);
if (N == 1) return;
auto even_lanes = AllocateAligned<T>(N);
auto odd_lanes = AllocateAligned<T>(N);
auto expected = AllocateAligned<T>(N);
for (size_t i = 0; i < N; ++i) {
even_lanes[i] = static_cast<T>(2 * i + 0);
odd_lanes[i] = static_cast<T>(2 * i + 1);
}
const auto even = Load(d, even_lanes.get());
const auto odd = Load(d, odd_lanes.get());
const size_t blockN = HWY_MIN(16 / sizeof(T), N);
for (size_t i = 0; i < Lanes(d); ++i) {
const size_t block = i / blockN;
expected[i] = T((i % blockN) + block * 2 * blockN + blockN);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveUpper(d, even, odd));
}
};
HWY_NOINLINE void TestAllInterleave() {
// Not DemoteVectors because this cannot be supported by HWY_SCALAR.
ForAllTypes(ForShrinkableVectors<TestInterleaveLower>());
ForAllTypes(ForShrinkableVectors<TestInterleaveUpper>());
}
struct TestZipLower {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
using WideT = MakeWide<T>;
static_assert(sizeof(T) * 2 == sizeof(WideT), "Must be double-width");
static_assert(IsSigned<T>() == IsSigned<WideT>(), "Must have same sign");
const size_t N = Lanes(d);
auto even_lanes = AllocateAligned<T>(N);
auto odd_lanes = AllocateAligned<T>(N);
for (size_t i = 0; i < N; ++i) {
even_lanes[i] = static_cast<T>(2 * i + 0);
odd_lanes[i] = static_cast<T>(2 * i + 1);
}
const auto even = Load(d, even_lanes.get());
const auto odd = Load(d, odd_lanes.get());
const Repartition<WideT, D> dw;
const size_t NW = Lanes(dw);
auto expected = AllocateAligned<WideT>(NW);
const size_t blockN = HWY_MIN(size_t(16) / sizeof(WideT), NW);
for (size_t i = 0; i < NW; ++i) {
const size_t block = i / blockN;
// Value of least-significant lane in lo-vector.
const size_t lo = 2u * (i % blockN) + 4u * block * blockN;
const size_t kBits = sizeof(T) * 8;
expected[i] = static_cast<WideT>((static_cast<WideT>(lo + 1) << kBits) +
static_cast<WideT>(lo));
}
HWY_ASSERT_VEC_EQ(dw, expected.get(), ZipLower(even, odd));
HWY_ASSERT_VEC_EQ(dw, expected.get(), ZipLower(dw, even, odd));
}
};
struct TestZipUpper {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
using WideT = MakeWide<T>;
static_assert(sizeof(T) * 2 == sizeof(WideT), "Must be double-width");
static_assert(IsSigned<T>() == IsSigned<WideT>(), "Must have same sign");
const size_t N = Lanes(d);
if (N < 16 / sizeof(T)) return;
auto even_lanes = AllocateAligned<T>(N);
auto odd_lanes = AllocateAligned<T>(N);
for (size_t i = 0; i < Lanes(d); ++i) {
even_lanes[i] = static_cast<T>(2 * i + 0);
odd_lanes[i] = static_cast<T>(2 * i + 1);
}
const auto even = Load(d, even_lanes.get());
const auto odd = Load(d, odd_lanes.get());
const Repartition<WideT, D> dw;
const size_t NW = Lanes(dw);
auto expected = AllocateAligned<WideT>(NW);
const size_t blockN = HWY_MIN(size_t(16) / sizeof(WideT), NW);
for (size_t i = 0; i < NW; ++i) {
const size_t block = i / blockN;
const size_t lo = 2u * (i % blockN) + 4u * block * blockN;
const size_t kBits = sizeof(T) * 8;
expected[i] = static_cast<WideT>(
(static_cast<WideT>(lo + 2 * blockN + 1) << kBits) +
static_cast<WideT>(lo + 2 * blockN));
}
HWY_ASSERT_VEC_EQ(dw, expected.get(), ZipUpper(dw, even, odd));
}
};
HWY_NOINLINE void TestAllZip() {
const ForDemoteVectors<TestZipLower> lower_unsigned;
// TODO(janwas): enable after LowerHalf available
#if HWY_TARGET != HWY_RVV
lower_unsigned(uint8_t());
#endif
lower_unsigned(uint16_t());
#if HWY_CAP_INTEGER64
lower_unsigned(uint32_t()); // generates u64
#endif
const ForDemoteVectors<TestZipLower> lower_signed;
#if HWY_TARGET != HWY_RVV
lower_signed(int8_t());
#endif
lower_signed(int16_t());
#if HWY_CAP_INTEGER64
lower_signed(int32_t()); // generates i64
#endif
const ForShrinkableVectors<TestZipUpper> upper_unsigned;
#if HWY_TARGET != HWY_RVV
upper_unsigned(uint8_t());
#endif
upper_unsigned(uint16_t());
#if HWY_CAP_INTEGER64
upper_unsigned(uint32_t()); // generates u64
#endif
const ForShrinkableVectors<TestZipUpper> upper_signed;
#if HWY_TARGET != HWY_RVV
upper_signed(int8_t());
#endif
upper_signed(int16_t());
#if HWY_CAP_INTEGER64
upper_signed(int32_t()); // generates i64
#endif
// No float - concatenating f32 does not result in a f64
}
template <int kBytes>
struct TestCombineShiftRightBytesR {
template <class T, class D>
HWY_NOINLINE void operator()(T t, D d) {
// Scalar does not define CombineShiftRightBytes.
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
const size_t kBlockSize = 16;
static_assert(kBytes < kBlockSize, "Shift count is per block");
const Repartition<uint8_t, D> d8;
const size_t N8 = Lanes(d8);
if (N8 < 16) return;
auto hi_bytes = AllocateAligned<uint8_t>(N8);
auto lo_bytes = AllocateAligned<uint8_t>(N8);
auto expected_bytes = AllocateAligned<uint8_t>(N8);
uint8_t combined[2 * kBlockSize];
// Random inputs in each lane
RandomState rng;
for (size_t rep = 0; rep < AdjustedReps(100); ++rep) {
for (size_t i = 0; i < N8; ++i) {
hi_bytes[i] = static_cast<uint8_t>(Random64(&rng) & 0xFF);
lo_bytes[i] = static_cast<uint8_t>(Random64(&rng) & 0xFF);
}
for (size_t i = 0; i < N8; i += kBlockSize) {
CopyBytes<kBlockSize>(&lo_bytes[i], combined);
CopyBytes<kBlockSize>(&hi_bytes[i], combined + kBlockSize);
CopyBytes<kBlockSize>(combined + kBytes, &expected_bytes[i]);
}
const auto hi = BitCast(d, Load(d8, hi_bytes.get()));
const auto lo = BitCast(d, Load(d8, lo_bytes.get()));
const auto expected = BitCast(d, Load(d8, expected_bytes.get()));
HWY_ASSERT_VEC_EQ(d, expected, CombineShiftRightBytes<kBytes>(d, hi, lo));
}
TestCombineShiftRightBytesR<kBytes - 1>()(t, d);
#else
(void)t;
(void)d;
#endif // #if HWY_TARGET != HWY_SCALAR
}
};
template <int kLanes>
struct TestCombineShiftRightLanesR {
template <class T, class D>
HWY_NOINLINE void operator()(T t, D d) {
// Scalar does not define CombineShiftRightBytes (needed for *Lanes).
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
const Repartition<uint8_t, D> d8;
const size_t N8 = Lanes(d8);
if (N8 < 16) return;
auto hi_bytes = AllocateAligned<uint8_t>(N8);
auto lo_bytes = AllocateAligned<uint8_t>(N8);
auto expected_bytes = AllocateAligned<uint8_t>(N8);
const size_t kBlockSize = 16;
uint8_t combined[2 * kBlockSize];
// Random inputs in each lane
RandomState rng;
for (size_t rep = 0; rep < AdjustedReps(100); ++rep) {
for (size_t i = 0; i < N8; ++i) {
hi_bytes[i] = static_cast<uint8_t>(Random64(&rng) & 0xFF);
lo_bytes[i] = static_cast<uint8_t>(Random64(&rng) & 0xFF);
}
for (size_t i = 0; i < N8; i += kBlockSize) {
CopyBytes<kBlockSize>(&lo_bytes[i], combined);
CopyBytes<kBlockSize>(&hi_bytes[i], combined + kBlockSize);
CopyBytes<kBlockSize>(combined + kLanes * sizeof(T),
&expected_bytes[i]);
}
const auto hi = BitCast(d, Load(d8, hi_bytes.get()));
const auto lo = BitCast(d, Load(d8, lo_bytes.get()));
const auto expected = BitCast(d, Load(d8, expected_bytes.get()));
HWY_ASSERT_VEC_EQ(d, expected, CombineShiftRightLanes<kLanes>(d, hi, lo));
}
TestCombineShiftRightLanesR<kLanes - 1>()(t, d);
#else
(void)t;
(void)d;
#endif // #if HWY_TARGET != HWY_SCALAR
}
};
template <>
struct TestCombineShiftRightBytesR<0> {
template <class T, class D>
void operator()(T /*unused*/, D /*unused*/) {}
};
template <>
struct TestCombineShiftRightLanesR<0> {
template <class T, class D>
void operator()(T /*unused*/, D /*unused*/) {}
};
struct TestCombineShiftRight {
template <class T, class D>
HWY_NOINLINE void operator()(T t, D d) {
constexpr int kMaxBytes = HWY_MIN(16, int(MaxLanes(d) * sizeof(T)));
TestCombineShiftRightBytesR<kMaxBytes - 1>()(t, d);
TestCombineShiftRightLanesR<kMaxBytes / int(sizeof(T)) - 1>()(t, d);
}
};
HWY_NOINLINE void TestAllCombineShiftRight() {
// Need at least 2 lanes.
ForAllTypes(ForShrinkableVectors<TestCombineShiftRight>());
}
class TestSpecialShuffle32 {
public:
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const auto v = Iota(d, 0);
VerifyLanes32(d, Shuffle2301(v), 2, 3, 0, 1, __FILE__, __LINE__);
VerifyLanes32(d, Shuffle1032(v), 1, 0, 3, 2, __FILE__, __LINE__);
VerifyLanes32(d, Shuffle0321(v), 0, 3, 2, 1, __FILE__, __LINE__);
VerifyLanes32(d, Shuffle2103(v), 2, 1, 0, 3, __FILE__, __LINE__);
VerifyLanes32(d, Shuffle0123(v), 0, 1, 2, 3, __FILE__, __LINE__);
}
private:
template <class D, class V>
HWY_NOINLINE void VerifyLanes32(D d, VecArg<V> actual, const size_t i3,
const size_t i2, const size_t i1,
const size_t i0, const char* filename,
const int line) {
using T = TFromD<D>;
constexpr size_t kBlockN = 16 / sizeof(T);
const size_t N = Lanes(d);
if (N < 4) return;
auto expected = AllocateAligned<T>(N);
for (size_t block = 0; block < N; block += kBlockN) {
expected[block + 3] = static_cast<T>(block + i3);
expected[block + 2] = static_cast<T>(block + i2);
expected[block + 1] = static_cast<T>(block + i1);
expected[block + 0] = static_cast<T>(block + i0);
}
AssertVecEqual(d, expected.get(), actual, filename, line);
}
};
class TestSpecialShuffle64 {
public:
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const auto v = Iota(d, 0);
VerifyLanes64(d, Shuffle01(v), 0, 1, __FILE__, __LINE__);
}
private:
template <class D, class V>
HWY_NOINLINE void VerifyLanes64(D d, VecArg<V> actual, const size_t i1,
const size_t i0, const char* filename,
const int line) {
using T = TFromD<D>;
constexpr size_t kBlockN = 16 / sizeof(T);
const size_t N = Lanes(d);
if (N < 2) return;
auto expected = AllocateAligned<T>(N);
for (size_t block = 0; block < N; block += kBlockN) {
expected[block + 1] = static_cast<T>(block + i1);
expected[block + 0] = static_cast<T>(block + i0);
}
AssertVecEqual(d, expected.get(), actual, filename, line);
}
};
HWY_NOINLINE void TestAllSpecialShuffles() {
const ForGE128Vectors<TestSpecialShuffle32> test32;
test32(uint32_t());
test32(int32_t());
test32(float());
#if HWY_CAP_INTEGER64
const ForGE128Vectors<TestSpecialShuffle64> test64;
test64(uint64_t());
test64(int64_t());
#endif
#if HWY_CAP_FLOAT64
const ForGE128Vectors<TestSpecialShuffle64> test_d;
test_d(double());
#endif
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
HWY_BEFORE_TEST(HwyBlockwiseTest);
HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllShiftBytes);
HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllShiftLanes);
HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllBroadcast);
HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllTableLookupBytes);
HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllInterleave);
HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllZip);
HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllCombineShiftRight);
HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllSpecialShuffles);
} // namespace hwy
// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
#endif
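As a scalar reference for the interleaving that the TestZip* cases above verify, the following standalone sketch (illustrative only, assuming the little-endian packing of two narrow lanes into one wide lane that the expected[] formula encodes) reproduces ZipLower's per-lane result:

// Scalar sketch of the lane packing the Zip tests expect: each wide (u16)
// lane holds an even (low byte) and the following odd (high byte) u8 lane.
#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
  constexpr std::size_t kN = 8;  // pretend 8 x uint8_t lanes (half a 128-bit block)
  std::uint8_t even[kN], odd[kN];
  for (std::size_t i = 0; i < kN; ++i) {
    even[i] = static_cast<std::uint8_t>(2 * i + 0);
    odd[i] = static_cast<std::uint8_t>(2 * i + 1);
  }
  // ZipLower(even, odd): lane i of the u16 result packs even[i] into the low
  // byte and odd[i] into the high byte (little-endian lane layout).
  for (std::size_t i = 0; i < kN / 2; ++i) {
    const std::uint16_t wide = static_cast<std::uint16_t>(
        (static_cast<std::uint16_t>(odd[i]) << 8) | even[i]);
    std::printf("lane %zu: 0x%04x\n", i, static_cast<unsigned>(wide));
  }
  return 0;
}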

203
third_party/highway/hwy/tests/combine_test.cc vendored

@ -36,16 +36,21 @@ struct TestLowerHalf {
const size_t N = Lanes(d);
auto lanes = AllocateAligned<T>(N);
auto lanes2 = AllocateAligned<T>(N);
std::fill(lanes.get(), lanes.get() + N, T(0));
std::fill(lanes2.get(), lanes2.get() + N, T(0));
const auto v = Iota(d, 1);
Store(LowerHalf(v), d2, lanes.get());
Store(LowerHalf(d2, v), d2, lanes.get());
Store(LowerHalf(v), d2, lanes2.get()); // optionally without D
size_t i = 0;
for (; i < Lanes(d2); ++i) {
HWY_ASSERT_EQ(T(1 + i), lanes[i]);
HWY_ASSERT_EQ(T(1 + i), lanes2[i]);
}
// Other half remains unchanged
for (; i < N; ++i) {
HWY_ASSERT_EQ(T(0), lanes[i]);
HWY_ASSERT_EQ(T(0), lanes2[i]);
}
}
};
@ -53,29 +58,35 @@ struct TestLowerHalf {
struct TestLowerQuarter {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const Half<Half<D>> d4;
const Half<D> d2;
const Half<decltype(d2)> d4;
const size_t N = Lanes(d);
auto lanes = AllocateAligned<T>(N);
auto lanes2 = AllocateAligned<T>(N);
std::fill(lanes.get(), lanes.get() + N, T(0));
std::fill(lanes2.get(), lanes2.get() + N, T(0));
const auto v = Iota(d, 1);
const auto lo = LowerHalf(LowerHalf(v));
const auto lo = LowerHalf(d4, LowerHalf(d2, v));
const auto lo2 = LowerHalf(LowerHalf(v)); // optionally without D
Store(lo, d4, lanes.get());
Store(lo2, d4, lanes2.get());
size_t i = 0;
for (; i < Lanes(d4); ++i) {
HWY_ASSERT_EQ(T(i + 1), lanes[i]);
HWY_ASSERT_EQ(T(i + 1), lanes2[i]);
}
// Upper 3/4 remain unchanged
for (; i < N; ++i) {
HWY_ASSERT_EQ(T(0), lanes[i]);
HWY_ASSERT_EQ(T(0), lanes2[i]);
}
}
};
HWY_NOINLINE void TestAllLowerHalf() {
constexpr size_t kDiv = 1;
ForAllTypes(ForPartialVectors<TestLowerHalf, kDiv, /*kMinLanes=*/2>());
ForAllTypes(ForPartialVectors<TestLowerQuarter, kDiv, /*kMinLanes=*/4>());
ForAllTypes(ForDemoteVectors<TestLowerHalf>());
ForAllTypes(ForDemoteVectors<TestLowerQuarter, 4>());
}
struct TestUpperHalf {
@ -90,7 +101,7 @@ struct TestUpperHalf {
auto lanes = AllocateAligned<T>(N);
std::fill(lanes.get(), lanes.get() + N, T(0));
Store(UpperHalf(v), d2, lanes.get());
Store(UpperHalf(d2, v), d2, lanes.get());
size_t i = 0;
for (; i < Lanes(d2); ++i) {
HWY_ASSERT_EQ(T(Lanes(d2) + 1 + i), lanes[i]);
@ -106,13 +117,12 @@ struct TestUpperHalf {
};
HWY_NOINLINE void TestAllUpperHalf() {
ForAllTypes(ForGE128Vectors<TestUpperHalf>());
ForAllTypes(ForShrinkableVectors<TestUpperHalf>());
}
struct TestZeroExtendVector {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
#if HWY_CAP_GE256
const Twice<D> d2;
const auto v = Iota(d, 1);
@ -121,7 +131,7 @@ struct TestZeroExtendVector {
Store(v, d, &lanes[0]);
Store(v, d, &lanes[N2 / 2]);
const auto ext = ZeroExtendVector(v);
const auto ext = ZeroExtendVector(d2, v);
Store(ext, d2, lanes.get());
size_t i = 0;
@ -133,9 +143,6 @@ struct TestZeroExtendVector {
for (; i < N2; ++i) {
HWY_ASSERT_EQ(T(0), lanes[i]);
}
#else
(void)d;
#endif
}
};
@ -146,21 +153,17 @@ HWY_NOINLINE void TestAllZeroExtendVector() {
struct TestCombine {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
#if HWY_CAP_GE256
const Twice<D> d2;
const size_t N2 = Lanes(d2);
auto lanes = AllocateAligned<T>(N2);
const auto lo = Iota(d, 1);
const auto hi = Iota(d, N2 / 2 + 1);
const auto combined = Combine(hi, lo);
const auto combined = Combine(d2, hi, lo);
Store(combined, d2, lanes.get());
const auto expected = Iota(d2, 1);
HWY_ASSERT_VEC_EQ(d2, expected, combined);
#else
(void)d;
#endif
}
};
@ -168,102 +171,81 @@ HWY_NOINLINE void TestAllCombine() {
ForAllTypes(ForExtendableVectors<TestCombine>());
}
template <int kBytes>
struct TestCombineShiftRightBytesR {
struct TestConcat {
template <class T, class D>
HWY_NOINLINE void operator()(T t, D d) {
// Scalar does not define CombineShiftRightBytes.
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
const Repartition<uint8_t, D> d8;
const size_t N8 = Lanes(d8);
const auto lo = BitCast(d, Iota(d8, 1));
const auto hi = BitCast(d, Iota(d8, 1 + N8));
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const size_t N = Lanes(d);
if (N == 1) return;
const size_t half_bytes = N * sizeof(T) / 2;
auto expected = AllocateAligned<T>(Lanes(d));
uint8_t* expected_bytes = reinterpret_cast<uint8_t*>(expected.get());
auto hi = AllocateAligned<T>(N);
auto lo = AllocateAligned<T>(N);
auto expected = AllocateAligned<T>(N);
RandomState rng;
for (size_t rep = 0; rep < 10; ++rep) {
for (size_t i = 0; i < N; ++i) {
hi[i] = static_cast<T>(Random64(&rng) & 0xFF);
lo[i] = static_cast<T>(Random64(&rng) & 0xFF);
}
const size_t kBlockSize = 16;
for (size_t i = 0; i < N8; ++i) {
const size_t block = i / kBlockSize;
const size_t lane = i % kBlockSize;
const size_t first_lo = block * kBlockSize;
const size_t idx = lane + kBytes;
const size_t offset = (idx < kBlockSize) ? 0 : N8 - kBlockSize;
const bool at_end = idx >= 2 * kBlockSize;
expected_bytes[i] =
at_end ? 0 : static_cast<uint8_t>(first_lo + idx + 1 + offset);
{
memcpy(&expected[N / 2], &hi[N / 2], half_bytes);
memcpy(&expected[0], &lo[0], half_bytes);
const auto vhi = Load(d, hi.get());
const auto vlo = Load(d, lo.get());
HWY_ASSERT_VEC_EQ(d, expected.get(), ConcatUpperLower(d, vhi, vlo));
}
{
memcpy(&expected[N / 2], &hi[N / 2], half_bytes);
memcpy(&expected[0], &lo[N / 2], half_bytes);
const auto vhi = Load(d, hi.get());
const auto vlo = Load(d, lo.get());
HWY_ASSERT_VEC_EQ(d, expected.get(), ConcatUpperUpper(d, vhi, vlo));
}
{
memcpy(&expected[N / 2], &hi[0], half_bytes);
memcpy(&expected[0], &lo[N / 2], half_bytes);
const auto vhi = Load(d, hi.get());
const auto vlo = Load(d, lo.get());
HWY_ASSERT_VEC_EQ(d, expected.get(), ConcatLowerUpper(d, vhi, vlo));
}
{
memcpy(&expected[N / 2], &hi[0], half_bytes);
memcpy(&expected[0], &lo[0], half_bytes);
const auto vhi = Load(d, hi.get());
const auto vlo = Load(d, lo.get());
HWY_ASSERT_VEC_EQ(d, expected.get(), ConcatLowerLower(d, vhi, vlo));
}
}
HWY_ASSERT_VEC_EQ(d, expected.get(),
CombineShiftRightBytes<kBytes>(hi, lo));
}
};
TestCombineShiftRightBytesR<kBytes - 1>()(t, d);
HWY_NOINLINE void TestAllConcat() {
ForAllTypes(ForShrinkableVectors<TestConcat>());
}
struct TestConcatOddEven {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
#if HWY_TARGET != HWY_RVV && HWY_TARGET != HWY_SCALAR
const size_t N = Lanes(d);
const auto hi = Iota(d, N);
const auto lo = Iota(d, 0);
const auto even = Add(Iota(d, 0), Iota(d, 0));
const auto odd = Add(even, Set(d, 1));
HWY_ASSERT_VEC_EQ(d, odd, ConcatOdd(d, hi, lo));
HWY_ASSERT_VEC_EQ(d, even, ConcatEven(d, hi, lo));
#else
(void)t;
(void)d;
#endif // #if HWY_TARGET != HWY_SCALAR
#endif
}
};
template <int kLanes>
struct TestCombineShiftRightLanesR {
template <class T, class D>
HWY_NOINLINE void operator()(T t, D d) {
// Scalar does not define CombineShiftRightBytes (needed for *Lanes).
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
const Repartition<uint8_t, D> d8;
const size_t N8 = Lanes(d8);
const auto lo = BitCast(d, Iota(d8, 1));
const auto hi = BitCast(d, Iota(d8, 1 + N8));
auto expected = AllocateAligned<T>(Lanes(d));
uint8_t* expected_bytes = reinterpret_cast<uint8_t*>(expected.get());
const size_t kBlockSize = 16;
for (size_t i = 0; i < N8; ++i) {
const size_t block = i / kBlockSize;
const size_t lane = i % kBlockSize;
const size_t first_lo = block * kBlockSize;
const size_t idx = lane + kLanes * sizeof(T);
const size_t offset = (idx < kBlockSize) ? 0 : N8 - kBlockSize;
const bool at_end = idx >= 2 * kBlockSize;
expected_bytes[i] =
at_end ? 0 : static_cast<uint8_t>(first_lo + idx + 1 + offset);
}
HWY_ASSERT_VEC_EQ(d, expected.get(),
CombineShiftRightLanes<kLanes>(hi, lo));
TestCombineShiftRightBytesR<kLanes - 1>()(t, d);
#else
(void)t;
(void)d;
#endif // #if HWY_TARGET != HWY_SCALAR
}
};
template <>
struct TestCombineShiftRightBytesR<0> {
template <class T, class D>
void operator()(T /*unused*/, D /*unused*/) {}
};
template <>
struct TestCombineShiftRightLanesR<0> {
template <class T, class D>
void operator()(T /*unused*/, D /*unused*/) {}
};
struct TestCombineShiftRight {
template <class T, class D>
HWY_NOINLINE void operator()(T t, D d) {
TestCombineShiftRightBytesR<15>()(t, d);
TestCombineShiftRightLanesR<16 / sizeof(T) - 1>()(t, d);
}
};
HWY_NOINLINE void TestAllCombineShiftRight() {
ForAllTypes(ForGE128Vectors<TestCombineShiftRight>());
HWY_NOINLINE void TestAllConcatOddEven() {
ForUIF3264(ForShrinkableVectors<TestConcatOddEven>());
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
@ -272,15 +254,24 @@ HWY_NOINLINE void TestAllCombineShiftRight() {
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
HWY_BEFORE_TEST(HwyCombineTest);
HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllLowerHalf);
HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllUpperHalf);
HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllZeroExtendVector);
HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllCombine);
HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllCombineShiftRight);
HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllConcat);
HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllConcatOddEven);
} // namespace hwy
#endif
// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char **argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
#endif // HWY_ONCE
#else
int main(int, char**) { return 0; }
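The TestConcat expected values above are built by copying half-vectors; the following standalone scalar sketch (illustrative only, not the vendored API) spells out which half of hi and lo each Concat variant selects:

// Scalar sketch of the half-selection performed by ConcatUpperLower,
// ConcatUpperUpper, ConcatLowerUpper and ConcatLowerLower (hi, lo -> out).
#include <cstddef>
#include <vector>

// out's upper half always comes from `hi`, its lower half from `lo`;
// upper_of_hi / upper_of_lo select which half of each input is taken.
std::vector<int> Concat(const std::vector<int>& hi, const std::vector<int>& lo,
                        bool upper_of_hi, bool upper_of_lo) {
  const std::size_t n = hi.size();
  std::vector<int> out(n);
  for (std::size_t i = 0; i < n / 2; ++i) {
    out[n / 2 + i] = hi[(upper_of_hi ? n / 2 : 0) + i];  // upper half of out
    out[i] = lo[(upper_of_lo ? n / 2 : 0) + i];          // lower half of out
  }
  return out;
}

// ConcatUpperLower == Concat(hi, lo, true, false); ConcatUpperUpper == (true, true);
// ConcatLowerUpper == (false, true); ConcatLowerLower == (false, false).
int main() {
  const std::vector<int> hi{4, 5, 6, 7}, lo{0, 1, 2, 3};
  // ConcatUpperLower: upper half of hi, lower half of lo -> {0, 1, 6, 7}.
  const auto ul = Concat(hi, lo, /*upper_of_hi=*/true, /*upper_of_lo=*/false);
  return (ul == std::vector<int>{0, 1, 6, 7}) ? 0 : 1;
}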

78
third_party/highway/hwy/tests/compare_test.cc vendored

@ -26,25 +26,6 @@ HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// All types.
struct TestMask {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const size_t N = Lanes(d);
auto lanes = AllocateAligned<T>(N);
std::fill(lanes.get(), lanes.get() + N, T(0));
const auto actual_false = MaskFromVec(Load(d, lanes.get()));
HWY_ASSERT_MASK_EQ(d, MaskFalse(d), actual_false);
memset(lanes.get(), 0xFF, N * sizeof(T));
const auto actual_true = MaskFromVec(Load(d, lanes.get()));
HWY_ASSERT_MASK_EQ(d, MaskTrue(d), actual_true);
}
};
HWY_NOINLINE void TestAllMask() { ForAllTypes(ForPartialVectors<TestMask>()); }
// All types.
struct TestEquality {
template <typename T, class D>
@ -57,8 +38,14 @@ struct TestEquality {
const auto mask_true = MaskTrue(d);
HWY_ASSERT_MASK_EQ(d, mask_false, Eq(v2, v3));
HWY_ASSERT_MASK_EQ(d, mask_false, Eq(v3, v2));
HWY_ASSERT_MASK_EQ(d, mask_true, Eq(v2, v2));
HWY_ASSERT_MASK_EQ(d, mask_true, Eq(v2, v2b));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne(v2, v3));
HWY_ASSERT_MASK_EQ(d, mask_true, Ne(v3, v2));
HWY_ASSERT_MASK_EQ(d, mask_false, Ne(v2, v2));
HWY_ASSERT_MASK_EQ(d, mask_false, Ne(v2, v2b));
}
};
@ -90,6 +77,37 @@ void EnsureGreater(D d, TFromD<D> a, TFromD<D> b, const char* file, int line) {
#define HWY_ENSURE_GREATER(d, a, b) EnsureGreater(d, a, b, __FILE__, __LINE__)
struct TestStrictUnsigned {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const T max = LimitsMax<T>();
const auto v0 = Zero(d);
const auto v2 = And(Iota(d, T(2)), Set(d, 255)); // 0..255
const auto mask_false = MaskFalse(d);
// Individual values of interest
HWY_ENSURE_GREATER(d, 2, 1);
HWY_ENSURE_GREATER(d, 1, 0);
HWY_ENSURE_GREATER(d, 128, 127);
HWY_ENSURE_GREATER(d, max, max / 2);
HWY_ENSURE_GREATER(d, max, 1);
HWY_ENSURE_GREATER(d, max, 0);
// Also use Iota to ensure lanes are independent
HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, v0));
HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v0, v2));
HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v0, v0));
HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v0, v0));
HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, v2));
HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v2, v2));
}
};
HWY_NOINLINE void TestAllStrictUnsigned() {
ForUnsignedTypes(ForPartialVectors<TestStrictUnsigned>());
}
struct TestStrictInt {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
@ -97,7 +115,7 @@ struct TestStrictInt {
const T max = LimitsMax<T>();
const auto v0 = Zero(d);
const auto v2 = And(Iota(d, T(2)), Set(d, 127)); // 0..127
const auto vn = Neg(v2) - Set(d, 1); // -1..-128
const auto vn = Sub(Neg(v2), Set(d, 1)); // -1..-128
const auto mask_false = MaskFalse(d);
const auto mask_true = MaskTrue(d);
@ -131,14 +149,14 @@ struct TestStrictInt {
};
HWY_NOINLINE void TestAllStrictInt() {
ForSignedTypes(ForExtendableVectors<TestStrictInt>());
ForSignedTypes(ForPartialVectors<TestStrictInt>());
}
struct TestStrictFloat {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const T huge_neg = -1E35;
const T huge_pos = 1E36;
const T huge_neg = T(-1E35);
const T huge_pos = T(1E36);
const auto v0 = Zero(d);
const auto v2 = Iota(d, T(2));
const auto vn = Neg(v2);
@ -173,13 +191,13 @@ struct TestStrictFloat {
};
HWY_NOINLINE void TestAllStrictFloat() {
ForFloatTypes(ForExtendableVectors<TestStrictFloat>());
ForFloatTypes(ForPartialVectors<TestStrictFloat>());
}
struct TestWeakFloat {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const auto v2 = Iota(d, 2);
const auto v2 = Iota(d, T(2));
const auto vn = Iota(d, -T(Lanes(d)));
const auto mask_false = MaskFalse(d);
@ -206,12 +224,20 @@ HWY_NOINLINE void TestAllWeakFloat() {
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
HWY_BEFORE_TEST(HwyCompareTest);
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllMask);
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllEquality);
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictUnsigned);
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictInt);
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictFloat);
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllWeakFloat);
} // namespace hwy
// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
#endif

385
third_party/highway/hwy/tests/convert_test.cc vendored

@ -34,7 +34,10 @@ struct TestBitCast {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const Repartition<ToT, D> dto;
HWY_ASSERT_EQ(Lanes(d) * sizeof(T), Lanes(dto) * sizeof(ToT));
const size_t N = Lanes(d);
const size_t Nto = Lanes(dto);
if (N == 0 || Nto == 0) return;
HWY_ASSERT_EQ(N * sizeof(T), Nto * sizeof(ToT));
const auto vf = Iota(d, 1);
const auto vt = BitCast(dto, vf);
// Must return the same bits
@ -130,8 +133,10 @@ HWY_NOINLINE void TestAllBitCast() {
#endif // HWY_CAP_INTEGER64
#endif // HWY_CAP_FLOAT64
#if HWY_TARGET != HWY_SCALAR
// For non-scalar vectors, we can cast all types to all.
ForAllTypes(ForGE128Vectors<TestBitCastFrom>());
ForAllTypes(ForGE64Vectors<TestBitCastFrom>());
#endif
}
template <typename ToT>
@ -146,7 +151,7 @@ struct TestPromoteTo {
auto expected = AllocateAligned<ToT>(N);
RandomState rng;
for (size_t rep = 0; rep < 200; ++rep) {
for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
for (size_t i = 0; i < N; ++i) {
const uint64_t bits = rng();
memcpy(&from[i], &bits, sizeof(T));
@ -160,39 +165,39 @@ struct TestPromoteTo {
};
HWY_NOINLINE void TestAllPromoteTo() {
const ForPartialVectors<TestPromoteTo<uint16_t>, 2> to_u16div2;
const ForPromoteVectors<TestPromoteTo<uint16_t>, 2> to_u16div2;
to_u16div2(uint8_t());
const ForPartialVectors<TestPromoteTo<uint32_t>, 4> to_u32div4;
const ForPromoteVectors<TestPromoteTo<uint32_t>, 4> to_u32div4;
to_u32div4(uint8_t());
const ForPartialVectors<TestPromoteTo<uint32_t>, 2> to_u32div2;
const ForPromoteVectors<TestPromoteTo<uint32_t>, 2> to_u32div2;
to_u32div2(uint16_t());
const ForPartialVectors<TestPromoteTo<int16_t>, 2> to_i16div2;
const ForPromoteVectors<TestPromoteTo<int16_t>, 2> to_i16div2;
to_i16div2(uint8_t());
to_i16div2(int8_t());
const ForPartialVectors<TestPromoteTo<int32_t>, 2> to_i32div2;
const ForPromoteVectors<TestPromoteTo<int32_t>, 2> to_i32div2;
to_i32div2(uint16_t());
to_i32div2(int16_t());
const ForPartialVectors<TestPromoteTo<int32_t>, 4> to_i32div4;
const ForPromoteVectors<TestPromoteTo<int32_t>, 4> to_i32div4;
to_i32div4(uint8_t());
to_i32div4(int8_t());
// Must test f16 separately because we can only load/store/convert them.
// Must test f16/bf16 separately because we can only load/store/convert them.
#if HWY_CAP_INTEGER64
const ForPartialVectors<TestPromoteTo<uint64_t>, 2> to_u64div2;
const ForPromoteVectors<TestPromoteTo<uint64_t>, 2> to_u64div2;
to_u64div2(uint32_t());
const ForPartialVectors<TestPromoteTo<int64_t>, 2> to_i64div2;
const ForPromoteVectors<TestPromoteTo<int64_t>, 2> to_i64div2;
to_i64div2(int32_t());
#endif
#if HWY_CAP_FLOAT64
const ForPartialVectors<TestPromoteTo<double>, 2> to_f64div2;
const ForPromoteVectors<TestPromoteTo<double>, 2> to_f64div2;
to_f64div2(int32_t());
to_f64div2(float());
#endif
@ -224,14 +229,23 @@ struct TestDemoteTo {
const T min = LimitsMin<ToT>();
const T max = LimitsMax<ToT>();
const auto value_ok = [&](T& value) {
if (!IsFinite(value)) return false;
#if HWY_EMULATE_SVE
// farm_sve just casts, which is undefined if the value is out of range.
value = HWY_MIN(HWY_MAX(min, value), max);
#endif
return true;
};
RandomState rng;
for (size_t rep = 0; rep < 1000; ++rep) {
for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
for (size_t i = 0; i < N; ++i) {
do {
const uint64_t bits = rng();
memcpy(&from[i], &bits, sizeof(T));
} while (!IsFinite(from[i]));
expected[i] = static_cast<ToT>(std::min(std::max(min, from[i]), max));
} while (!value_ok(from[i]));
expected[i] = static_cast<ToT>(HWY_MIN(HWY_MAX(min, from[i]), max));
}
HWY_ASSERT_VEC_EQ(to_d, expected.get(),
@ -241,22 +255,22 @@ struct TestDemoteTo {
};
HWY_NOINLINE void TestAllDemoteToInt() {
ForDemoteVectors<TestDemoteTo<uint8_t>, 2>()(int16_t());
ForDemoteVectors<TestDemoteTo<uint8_t>>()(int16_t());
ForDemoteVectors<TestDemoteTo<uint8_t>, 4>()(int32_t());
ForDemoteVectors<TestDemoteTo<int8_t>, 2>()(int16_t());
ForDemoteVectors<TestDemoteTo<int8_t>>()(int16_t());
ForDemoteVectors<TestDemoteTo<int8_t>, 4>()(int32_t());
const ForDemoteVectors<TestDemoteTo<uint16_t>, 2> to_u16;
const ForDemoteVectors<TestDemoteTo<uint16_t>> to_u16;
to_u16(int32_t());
const ForDemoteVectors<TestDemoteTo<int16_t>, 2> to_i16;
const ForDemoteVectors<TestDemoteTo<int16_t>> to_i16;
to_i16(int32_t());
}
HWY_NOINLINE void TestAllDemoteToMixed() {
#if HWY_CAP_FLOAT64
const ForDemoteVectors<TestDemoteTo<int32_t>, 2> to_i32;
const ForDemoteVectors<TestDemoteTo<int32_t>> to_i32;
to_i32(double());
#endif
}
@ -275,7 +289,7 @@ struct TestDemoteToFloat {
auto expected = AllocateAligned<ToT>(N);
RandomState rng;
for (size_t rep = 0; rep < 1000; ++rep) {
for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
for (size_t i = 0; i < N; ++i) {
do {
const uint64_t bits = rng();
@ -285,7 +299,7 @@ struct TestDemoteToFloat {
const T max_abs = HighestValue<ToT>();
// NOTE: std:: version from C++11 cmath is not defined in RVV GCC, see
// https://lists.freebsd.org/pipermail/freebsd-current/2014-January/048130.html
const T clipped = copysign(std::min(magn, max_abs), from[i]);
const T clipped = copysign(HWY_MIN(magn, max_abs), from[i]);
expected[i] = static_cast<ToT>(clipped);
}
@ -338,6 +352,7 @@ AlignedFreeUniquePtr<float[]> F16TestCases(D d, size_t& padded) {
struct TestF16 {
template <typename TF32, class DF32>
HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) {
#if HWY_CAP_FLOAT16
size_t padded;
auto in = F16TestCases(d32, padded);
using TF16 = float16_t;
@ -350,10 +365,192 @@ struct TestF16 {
Store(DemoteTo(d16, loaded), d16, temp16.get());
HWY_ASSERT_VEC_EQ(d32, loaded, PromoteTo(d32, Load(d16, temp16.get())));
}
#else
(void)d32;
#endif
}
};
HWY_NOINLINE void TestAllF16() { ForDemoteVectors<TestF16, 2>()(float()); }
HWY_NOINLINE void TestAllF16() { ForDemoteVectors<TestF16>()(float()); }
template <class D>
AlignedFreeUniquePtr<float[]> BF16TestCases(D d, size_t& padded) {
const float test_cases[] = {
// +/- 1
1.0f, -1.0f,
// +/- 0
0.0f, -0.0f,
// near 0
0.25f, -0.25f,
// +/- integer
4.0f, -32.0f,
// positive near limit
3.389531389251535E38f, 1.99384199368e+38f,
// negative near limit
-3.389531389251535E38f, -1.99384199368e+38f,
// positive +/- delta
2.015625f, 3.984375f,
// negative +/- delta
-2.015625f, -3.984375f,
};
const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
const size_t N = Lanes(d);
padded = RoundUpTo(kNumTestCases, N); // allow loading whole vectors
auto in = AllocateAligned<float>(padded);
auto expected = AllocateAligned<float>(padded);
std::copy(test_cases, test_cases + kNumTestCases, in.get());
std::fill(in.get() + kNumTestCases, in.get() + padded, 0.0f);
return in;
}
struct TestBF16 {
template <typename TF32, class DF32>
HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) {
#if HWY_TARGET != HWY_RVV
size_t padded;
auto in = BF16TestCases(d32, padded);
using TBF16 = bfloat16_t;
#if HWY_TARGET == HWY_SCALAR
const Rebind<TBF16, DF32> dbf16; // avoid 4/2 = 2 lanes
#else
const Repartition<TBF16, DF32> dbf16;
#endif
const Half<decltype(dbf16)> dbf16_half;
const size_t N = Lanes(d32);
auto temp16 = AllocateAligned<TBF16>(N);
for (size_t i = 0; i < padded; i += N) {
const auto loaded = Load(d32, &in[i]);
const auto v16 = DemoteTo(dbf16_half, loaded);
Store(v16, dbf16_half, temp16.get());
const auto v16_loaded = Load(dbf16_half, temp16.get());
HWY_ASSERT_VEC_EQ(d32, loaded, PromoteTo(d32, v16_loaded));
}
#else
(void)d32;
#endif
}
};
HWY_NOINLINE void TestAllBF16() { ForShrinkableVectors<TestBF16>()(float()); }
template <class D>
AlignedFreeUniquePtr<float[]> ReorderBF16TestCases(D d, size_t& padded) {
const float test_cases[] = {
// Same as BF16TestCases:
// +/- 1
1.0f,
-1.0f,
// +/- 0
0.0f,
-0.0f,
// near 0
0.25f,
-0.25f,
// +/- integer
4.0f,
-32.0f,
// positive +/- delta
2.015625f,
3.984375f,
// negative +/- delta
-2.015625f,
-3.984375f,
// No huge values - would interfere with sum. But add more to fill 2 * N:
-2.0f,
-10.0f,
0.03125f,
1.03125f,
1.5f,
2.0f,
4.0f,
5.0f,
6.0f,
8.0f,
10.0f,
256.0f,
448.0f,
2080.0f,
};
const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
const size_t N = Lanes(d);
padded = RoundUpTo(kNumTestCases, 2 * N); // allow loading pairs of vectors
auto in = AllocateAligned<float>(padded);
auto expected = AllocateAligned<float>(padded);
std::copy(test_cases, test_cases + kNumTestCases, in.get());
std::fill(in.get() + kNumTestCases, in.get() + padded, 0.0f);
return in;
}
class TestReorderDemote2To {
// In-place N^2 selection sort to avoid dependencies
void Sort(float* p, size_t count) {
for (size_t i = 0; i < count - 1; ++i) {
// Find min_element
size_t idx_min = i;
for (size_t j = i + 1; j < count; j++) {
if (p[j] < p[idx_min]) {
idx_min = j;
}
}
// Swap with current
const float tmp = p[i];
p[i] = p[idx_min];
p[idx_min] = tmp;
}
}
public:
template <typename TF32, class DF32>
HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) {
#if HWY_TARGET != HWY_SCALAR
size_t padded;
auto in = ReorderBF16TestCases(d32, padded);
using TBF16 = bfloat16_t;
const Repartition<TBF16, DF32> dbf16;
const Half<decltype(dbf16)> dbf16_half;
const size_t N = Lanes(d32);
auto temp16 = AllocateAligned<TBF16>(2 * N);
auto expected = AllocateAligned<float>(2 * N);
auto actual = AllocateAligned<float>(2 * N);
for (size_t i = 0; i < padded; i += 2 * N) {
const auto f0 = Load(d32, &in[i + 0]);
const auto f1 = Load(d32, &in[i + N]);
const auto v16 = ReorderDemote2To(dbf16, f0, f1);
Store(v16, dbf16, temp16.get());
const auto promoted0 = PromoteTo(d32, Load(dbf16_half, temp16.get() + 0));
const auto promoted1 = PromoteTo(d32, Load(dbf16_half, temp16.get() + N));
// Smoke test: sum should be same (with tolerance for non-associativity)
const auto sum_expected =
GetLane(SumOfLanes(d32, Add(promoted0, promoted1)));
const auto sum_actual = GetLane(SumOfLanes(d32, Add(f0, f1)));
HWY_ASSERT(sum_actual - 1E-4 <= sum_actual &&
sum_expected <= sum_actual + 1E-4);
// Ensure values are the same after sorting to undo the Reorder
Store(f0, d32, expected.get() + 0);
Store(f1, d32, expected.get() + N);
Store(promoted0, d32, actual.get() + 0);
Store(promoted1, d32, actual.get() + N);
Sort(expected.get(), 2 * N);
Sort(actual.get(), 2 * N);
HWY_ASSERT_VEC_EQ(d32, expected.get() + 0, Load(d32, actual.get() + 0));
HWY_ASSERT_VEC_EQ(d32, expected.get() + N, Load(d32, actual.get() + N));
}
#else // HWY_SCALAR
(void)d32;
#endif
}
};
HWY_NOINLINE void TestAllReorderDemote2To() {
ForShrinkableVectors<TestReorderDemote2To>()(float());
}
struct TestConvertU8 {
template <typename T, class D>
@ -376,8 +573,9 @@ struct TestIntFromFloatHuge {
template <typename TF, class DF>
HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
// Still does not work, although ARMv7 manual says that float->int
// saturates, i.e. chooses the nearest representable value.
#if HWY_TARGET != HWY_NEON
// saturates, i.e. chooses the nearest representable value. Also causes
// out-of-memory for MSVC, and unsafe cast in farm_sve.
#if HWY_TARGET != HWY_NEON && !HWY_COMPILER_MSVC && !defined(HWY_EMULATE_SVE)
using TI = MakeSigned<TF>;
const Rebind<TI, DF> di;
@ -395,9 +593,68 @@ struct TestIntFromFloatHuge {
}
};
struct TestIntFromFloat {
class TestIntFromFloat {
template <typename TF, class DF>
HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
static HWY_NOINLINE void TestPowers(TF /*unused*/, const DF df) {
using TI = MakeSigned<TF>;
const Rebind<TI, DF> di;
constexpr size_t kBits = sizeof(TF) * 8;
// Powers of two, plus offsets to set some mantissa bits.
const int64_t ofs_table[3] = {0LL, 3LL << (kBits / 2), 1LL << (kBits - 15)};
for (int sign = 0; sign < 2; ++sign) {
for (size_t shift = 0; shift < kBits - 1; ++shift) {
for (int64_t ofs : ofs_table) {
const int64_t mag = (int64_t(1) << shift) + ofs;
const int64_t val = sign ? mag : -mag;
HWY_ASSERT_VEC_EQ(di, Set(di, static_cast<TI>(val)),
ConvertTo(di, Set(df, static_cast<TF>(val))));
}
}
}
}
template <typename TF, class DF>
static HWY_NOINLINE void TestRandom(TF /*unused*/, const DF df) {
using TI = MakeSigned<TF>;
const Rebind<TI, DF> di;
const size_t N = Lanes(df);
// TF does not have enough precision to represent TI.
const double min = static_cast<double>(LimitsMin<TI>());
const double max = static_cast<double>(LimitsMax<TI>());
// Also check random values.
auto from = AllocateAligned<TF>(N);
auto expected = AllocateAligned<TI>(N);
RandomState rng;
for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
for (size_t i = 0; i < N; ++i) {
do {
const uint64_t bits = rng();
memcpy(&from[i], &bits, sizeof(TF));
} while (!std::isfinite(from[i]));
#if defined(HWY_EMULATE_SVE)
// farm_sve just casts, which is undefined if the value is out of range.
from[i] = HWY_MIN(HWY_MAX(min / 2, from[i]), max / 2);
#endif
if (from[i] >= max) {
expected[i] = LimitsMax<TI>();
} else if (from[i] <= min) {
expected[i] = LimitsMin<TI>();
} else {
expected[i] = static_cast<TI>(from[i]);
}
}
HWY_ASSERT_VEC_EQ(di, expected.get(),
ConvertTo(di, Load(df, from.get())));
}
}
public:
template <typename TF, class DF>
HWY_NOINLINE void operator()(TF tf, const DF df) {
using TI = MakeSigned<TF>;
const Rebind<TI, DF> di;
const size_t N = Lanes(df);
@ -423,32 +680,8 @@ struct TestIntFromFloat {
HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N + 1)),
ConvertTo(di, Iota(df, -TF(N + 1) - eps)));
// TF does not have enough precision to represent TI.
const double min = static_cast<double>(LimitsMin<TI>());
const double max = static_cast<double>(LimitsMax<TI>());
// Also check random values.
auto from = AllocateAligned<TF>(N);
auto expected = AllocateAligned<TI>(N);
RandomState rng;
for (size_t rep = 0; rep < 1000; ++rep) {
for (size_t i = 0; i < N; ++i) {
do {
const uint64_t bits = rng();
memcpy(&from[i], &bits, sizeof(TF));
} while (!std::isfinite(from[i]));
if (from[i] >= max) {
expected[i] = LimitsMax<TI>();
} else if (from[i] <= min) {
expected[i] = LimitsMin<TI>();
} else {
expected[i] = static_cast<TI>(from[i]);
}
}
HWY_ASSERT_VEC_EQ(di, expected.get(),
ConvertTo(di, Load(df, from.get())));
}
TestPowers(tf, df);
TestRandom(tf, df);
}
};
@ -458,10 +691,10 @@ HWY_NOINLINE void TestAllIntFromFloat() {
}
struct TestFloatFromInt {
template <typename TI, class DI>
HWY_NOINLINE void operator()(TI /*unused*/, const DI di) {
using TF = MakeFloat<TI>;
const Rebind<TF, DI> df;
template <typename TF, class DF>
HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
using TI = MakeSigned<TF>;
const RebindToSigned<DF> di;
const size_t N = Lanes(df);
// Integer positive
@ -481,10 +714,7 @@ struct TestFloatFromInt {
};
HWY_NOINLINE void TestAllFloatFromInt() {
ForPartialVectors<TestFloatFromInt>()(int32_t());
#if HWY_CAP_FLOAT64 && HWY_CAP_INTEGER64
ForPartialVectors<TestFloatFromInt>()(int64_t());
#endif
ForFloatTypes(ForPartialVectors<TestFloatFromInt>());
}
struct TestI32F64 {
@ -521,14 +751,6 @@ struct TestI32F64 {
DemoteTo(di, Iota(df, -TF(N + 1) - eps)));
HWY_ASSERT_VEC_EQ(df, Iota(df, TF(-2.0)), PromoteTo(df, Iota(di, TI(-2))));
// Huge positive float
HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMax<TI>()),
DemoteTo(di, Set(df, TF(1E12))));
// Huge negative float
HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMin<TI>()),
DemoteTo(di, Set(df, TF(-1E12))));
// Max positive int
HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMax<TI>())),
PromoteTo(df, Set(di, LimitsMax<TI>())));
@ -536,12 +758,23 @@ struct TestI32F64 {
// Min negative int
HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMin<TI>())),
PromoteTo(df, Set(di, LimitsMin<TI>())));
// farm_sve just casts, which is undefined if the value is out of range.
#if !defined(HWY_EMULATE_SVE)
// Huge positive float
HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMax<TI>()),
DemoteTo(di, Set(df, TF(1E12))));
// Huge negative float
HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMin<TI>()),
DemoteTo(di, Set(df, TF(-1E12))));
#endif
}
};
HWY_NOINLINE void TestAllI32F64() {
#if HWY_CAP_FLOAT64
ForDemoteVectors<TestI32F64, 2>()(double());
ForDemoteVectors<TestI32F64>()(double());
#endif
}
@ -552,6 +785,7 @@ HWY_NOINLINE void TestAllI32F64() {
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
HWY_BEFORE_TEST(HwyConvertTest);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllBitCast);
@ -560,9 +794,18 @@ HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllDemoteToInt);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllDemoteToMixed);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllDemoteToFloat);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllF16);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllBF16);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllReorderDemote2To);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllConvertU8);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllIntFromFloat);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllFloatFromInt);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllI32F64);
} // namespace hwy
// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
#endif
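TestIntFromFloat above clamps its random inputs to the integer limits before comparing against ConvertTo; a standalone scalar sketch of that expected saturating behaviour (illustrative only, assuming the float to int32_t path and finite inputs, since the test discards non-finite bit patterns) is:

// Scalar sketch of the saturation TestIntFromFloat expects from ConvertTo:
// values beyond the int32_t range clamp to the limits instead of wrapping.
#include <cstdint>
#include <limits>

std::int32_t SaturatingI32FromF32(float f) {
  const double lo = static_cast<double>(std::numeric_limits<std::int32_t>::min());
  const double hi = static_cast<double>(std::numeric_limits<std::int32_t>::max());
  const double d = static_cast<double>(f);
  if (d >= hi) return std::numeric_limits<std::int32_t>::max();
  if (d <= lo) return std::numeric_limits<std::int32_t>::min();
  return static_cast<std::int32_t>(f);
}

int main() { return SaturatingI32FromF32(1E12f) == 2147483647 ? 0 : 1; }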

549
third_party/highway/hwy/tests/crypto_test.cc vendored Normal file

@ -0,0 +1,549 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stddef.h>
#include <stdint.h>
#include <string.h> // memcpy
#include "hwy/aligned_allocator.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/crypto_test.cc"
#include "hwy/foreach_target.h"
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
#define HWY_PRINT_CLMUL_GOLDEN 0
#if HWY_TARGET != HWY_SCALAR
class TestAES {
template <typename T, class D>
HWY_NOINLINE void TestSBox(T /*unused*/, D d) {
// The generic implementation of the S-box is difficult to verify by
// inspection, so we add a white-box test that verifies it using enumeration
// (outputs for 0..255 vs. https://en.wikipedia.org/wiki/Rijndael_S-box).
const uint8_t sbox[256] = {
0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b,
0xfe, 0xd7, 0xab, 0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0,
0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, 0xb7, 0xfd, 0x93, 0x26,
0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2,
0xeb, 0x27, 0xb2, 0x75, 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0,
0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, 0x53, 0xd1, 0x00, 0xed,
0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f,
0x50, 0x3c, 0x9f, 0xa8, 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c, 0x13, 0xec,
0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14,
0xde, 0x5e, 0x0b, 0xdb, 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c,
0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 0xe7, 0xc8, 0x37, 0x6d,
0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f,
0x4b, 0xbd, 0x8b, 0x8a, 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e,
0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, 0xe1, 0xf8, 0x98, 0x11,
0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f,
0xb0, 0x54, 0xbb, 0x16};
// Ensure it's safe to load an entire vector by padding.
const size_t N = Lanes(d);
const size_t padded = RoundUpTo(256, N);
auto expected = AllocateAligned<T>(padded);
// Must wrap around to match the input (Iota).
for (size_t pos = 0; pos < padded;) {
const size_t remaining = HWY_MIN(padded - pos, size_t(256));
memcpy(expected.get() + pos, sbox, remaining);
pos += remaining;
}
for (size_t i = 0; i < 256; i += N) {
const auto in = Iota(d, i);
HWY_ASSERT_VEC_EQ(d, expected.get() + i, detail::SubBytes(in));
}
}
public:
template <typename T, class D>
HWY_NOINLINE void operator()(T t, D d) {
// Test vector (after first KeyAddition) from
// https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Standards-and-Guidelines/documents/examples/AES_Core128.pdf
alignas(16) constexpr uint8_t test_lanes[16] = {
0x40, 0xBF, 0xAB, 0xF4, 0x06, 0xEE, 0x4D, 0x30,
0x42, 0xCA, 0x6B, 0x99, 0x7A, 0x5C, 0x58, 0x16};
const auto test = LoadDup128(d, test_lanes);
// = MixColumn result
alignas(16) constexpr uint8_t expected0_lanes[16] = {
0x52, 0x9F, 0x16, 0xC2, 0x97, 0x86, 0x15, 0xCA,
0xE0, 0x1A, 0xAE, 0x54, 0xBA, 0x1A, 0x26, 0x59};
const auto expected0 = LoadDup128(d, expected0_lanes);
// = KeyAddition result
alignas(16) constexpr uint8_t expected_lanes[16] = {
0xF2, 0x65, 0xE8, 0xD5, 0x1F, 0xD2, 0x39, 0x7B,
0xC3, 0xB9, 0x97, 0x6D, 0x90, 0x76, 0x50, 0x5C};
const auto expected = LoadDup128(d, expected_lanes);
alignas(16) uint8_t key_lanes[16];
for (size_t i = 0; i < 16; ++i) {
key_lanes[i] = expected0_lanes[i] ^ expected_lanes[i];
}
const auto round_key = LoadDup128(d, key_lanes);
HWY_ASSERT_VEC_EQ(d, expected0, AESRound(test, Zero(d)));
HWY_ASSERT_VEC_EQ(d, expected, AESRound(test, round_key));
TestSBox(t, d);
}
};
HWY_NOINLINE void TestAllAES() { ForGE128Vectors<TestAES>()(uint8_t()); }
#else
HWY_NOINLINE void TestAllAES() {}
#endif // HWY_TARGET != HWY_SCALAR
struct TestCLMul {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
// needs 64 bit lanes and 128-bit result
#if HWY_TARGET != HWY_SCALAR && HWY_CAP_INTEGER64
const size_t N = Lanes(d);
if (N == 1) return;
auto in1 = AllocateAligned<T>(N);
auto in2 = AllocateAligned<T>(N);
constexpr size_t kCLMulNum = 512;
// Depends on rng!
static constexpr uint64_t kCLMulLower[kCLMulNum] = {
0x24511d4ce34d6350ULL, 0x4ca582edde1236bbULL, 0x537e58f72dac25a8ULL,
0x4e942d5e130b9225ULL, 0x75a906c519257a68ULL, 0x1df9f85126d96c5eULL,
0x464e7c13f4ad286aULL, 0x138535ee35dabc40ULL, 0xb2f7477b892664ecULL,
0x01557b077167c25dULL, 0xf32682490ee49624ULL, 0x0025bac603b9e140ULL,
0xcaa86aca3e3daf40ULL, 0x1fbcfe4af73eb6c4ULL, 0x8ee8064dd0aae5dcULL,
0x1248cb547858c213ULL, 0x37a55ee5b10fb34cULL, 0x6eb5c97b958f86e2ULL,
0x4b1ab3eb655ea7cdULL, 0x1d66645a85627520ULL, 0xf8728e96daa36748ULL,
0x38621043e6ff5e3bULL, 0xd1d28b5da5ffefb4ULL, 0x0a5cd65931546df7ULL,
0x2a0639be3d844150ULL, 0x0e2d0f18c8d6f045ULL, 0xfacc770b963326c1ULL,
0x19611b31ca2ef141ULL, 0xabea29510dd87518ULL, 0x18a7dc4b205f2768ULL,
0x9d3975ea5612dc86ULL, 0x06319c139e374773ULL, 0x6641710400b4c390ULL,
0x356c29b6001c3670ULL, 0xe9e04d851e040a00ULL, 0x21febe561222d79aULL,
0xc071eaae6e148090ULL, 0x0eed351a0af94f5bULL, 0x04324eedb3c03688ULL,
0x39e89b136e0d6ccdULL, 0x07d0fd2777a31600ULL, 0x44b8573827209822ULL,
0x6d690229ea177d78ULL, 0x1b9749d960ba9f18ULL, 0x190945271c0fbb94ULL,
0x189aea0e07d2c88eULL, 0xf18eab6b65a6beb2ULL, 0x57744b21c13d0d84ULL,
0xf63050a613e95c2eULL, 0x12cd20d25f97102fULL, 0x5a5df0678dbcba60ULL,
0x0b08fb80948bfafcULL, 0x44cf1cbe7c6fc3c8ULL, 0x166a470ef25da288ULL,
0x2c498a609204e48cULL, 0x261b0a22585697ecULL, 0x737750574af7dde4ULL,
0x4079959c60b01e0cULL, 0x06ed8aac13f782d6ULL, 0x019d454ba9b5ef20ULL,
0xea1edbf96d49e858ULL, 0x17c2f3ebde9ac469ULL, 0x5cf72706e3d6f5e4ULL,
0x16e856aa3c841516ULL, 0x256f7e3cef83368eULL, 0x47e17c8eb2774e77ULL,
0x9b48ac150a804821ULL, 0x584523f61ccfdf22ULL, 0xedcb6a2a75d9e7f2ULL,
0x1fe3d1838e537aa7ULL, 0x778872e9f64549caULL, 0x2f1cea6f0d3faf92ULL,
0x0e8c4b6a9343f326ULL, 0x01902d1ba3048954ULL, 0xc5c1fd5269e91dc0ULL,
0x0ef8a4707817eb9cULL, 0x1f696f09a5354ca4ULL, 0x369cd9de808b818cULL,
0xf6917d1dd43fd784ULL, 0x7f4b76bf40dc166fULL, 0x4ce67698724ace12ULL,
0x02c3bf60e6e9cd92ULL, 0xb8229e45b21458e8ULL, 0x415efd41e91adf49ULL,
0x5edfcd516bb921cdULL, 0x5ff2c29429fd187eULL, 0x0af666b17103b3e0ULL,
0x1f5e4ff8f54c9a5bULL, 0x429253d8a5544ba6ULL, 0x19de2fdf9f4d9dcaULL,
0x29bf3d37ddc19a40ULL, 0x04d4513a879552baULL, 0x5cc7476cf71ee155ULL,
0x40011f8c238784a5ULL, 0x1a3ae50b0fd2ee2bULL, 0x7db22f432ba462baULL,
0x417290b0bee2284aULL, 0x055a6bd5bb853db2ULL, 0xaa667daeed8c2a34ULL,
0x0d6b316bda7f3577ULL, 0x72d35598468e3d5dULL, 0x375b594804bfd33aULL,
0x16ed3a319b540ae8ULL, 0x093bace4b4695afdULL, 0xc7118754ec2737ceULL,
0x0fff361f0505c81aULL, 0x996e9e7291321af0ULL, 0x496b1d9b0b89ba8cULL,
0x65a98b2e9181da9cULL, 0x70759c8dd45575dfULL, 0x3446fe727f5e2cbbULL,
0x1121ae609d195e74ULL, 0x5ff5d68ce8a21018ULL, 0x0e27eca3825b60d6ULL,
0x82f628bceca3d1daULL, 0x2756a0914e344047ULL, 0xa460406c1c708d50ULL,
0x63ce32a0c083e491ULL, 0xc883e5a685c480e0ULL, 0x602c951891e600f9ULL,
0x02ecb2e3911ca5f8ULL, 0x0d8675f4bb70781aULL, 0x43545cc3c78ea496ULL,
0x04164b01d6b011c2ULL, 0x3acbb323dcab2c9bULL, 0x31c5ba4e22793082ULL,
0x5a6484af5f7c2d10ULL, 0x1a929b16194e8078ULL, 0x7a6a75d03b313924ULL,
0x0553c73a35b1d525ULL, 0xf18628c51142be34ULL, 0x1b51cf80d7efd8f5ULL,
0x52e0ca4df63ee258ULL, 0x0e977099160650c9ULL, 0x6be1524e92024f70ULL,
0x0ee2152625438b9dULL, 0xfa32af436f6d8eb4ULL, 0x5ecf49c2154287e5ULL,
0x6b72f4ae3590569dULL, 0x086c5ee6e87bfb68ULL, 0x737a4f0dc04b6187ULL,
0x08c3439280edea41ULL, 0x9547944f01636c5cULL, 0x6acfbfc2571cd71fULL,
0x85d7842972449637ULL, 0x252ea5e5a7fad86aULL, 0x4e41468f99ba1632ULL,
0x095e0c3ae63b25a2ULL, 0xb005ce88fd1c9425ULL, 0x748e668abbe09f03ULL,
0xb2cfdf466b187d18ULL, 0x60b11e633d8fe845ULL, 0x07144c4d246db604ULL,
0x139bcaac55e96125ULL, 0x118679b5a6176327ULL, 0x1cebe90fa4d9f83fULL,
0x22244f52f0d312acULL, 0x669d4e17c9bfb713ULL, 0x96390e0b834bb0d0ULL,
0x01f7f0e82ba08071ULL, 0x2dffeee31ca6d284ULL, 0x1f4738745ef039feULL,
0x4ce0dd2b603b6420ULL, 0x0035fc905910a4d5ULL, 0x07df2b533df6fb04ULL,
0x1cee2735c9b910ddULL, 0x2bc4af565f7809eaULL, 0x2f876c1f5cb1076cULL,
0x33e079524099d056ULL, 0x169e0405d2f9efbaULL, 0x018643ab548a358cULL,
0x1bb6fc4331cffe92ULL, 0x05111d3a04e92faaULL, 0x23c27ecf0d638b73ULL,
0x1b79071dc1685d68ULL, 0x0662d20aba8e1e0cULL, 0xe7f6440277144c6fULL,
0x4ca38b64c22196c0ULL, 0x43c05f6d1936fbeeULL, 0x0654199d4d1faf0fULL,
0xf2014054e71c2d04ULL, 0x0a103e47e96b4c84ULL, 0x7986e691dd35b040ULL,
0x4e1ebb53c306a341ULL, 0x2775bb3d75d65ba6ULL, 0x0562ab0adeff0f15ULL,
0x3c2746ad5eba3eacULL, 0x1facdb5765680c60ULL, 0xb802a60027d81d00ULL,
0x1191d0f6366ae3a9ULL, 0x81a97b5ae0ea5d14ULL, 0x06bee05b6178a770ULL,
0xc7baeb2fe1d6aeb3ULL, 0x594cb5b867d04fdfULL, 0xf515a80138a4e350ULL,
0x646417ad8073cf38ULL, 0x4a229a43373fb8d4ULL, 0x10fa6eafff1ca453ULL,
0x9f060700895cc731ULL, 0x00521133d11d11f4ULL, 0xb940a2bb912a7a5cULL,
0x3fab180670ad2a3cULL, 0x45a5f0e5b6fdb95dULL, 0x27c1baad6f946b15ULL,
0x336c6bdbe527cf58ULL, 0x3b83aa602a5baea3ULL, 0xdf749153f9bcc376ULL,
0x1a05513a6c0b4a90ULL, 0xb81e0b570a075c47ULL, 0x471fabb40bdc27ceULL,
0x9dec9472f6853f60ULL, 0x361f71b88114193bULL, 0x3b550a8c4feeff00ULL,
0x0f6cde5a68bc9bc0ULL, 0x3f50121a925703e0ULL, 0x6967ff66d6d343a9ULL,
0xff6b5bd2ce7bc3ccULL, 0x05474cea08bf6cd8ULL, 0xf76eabbfaf108eb0ULL,
0x067529be4fc6d981ULL, 0x4d766b137cf8a988ULL, 0x2f09c7395c5cfbbdULL,
0x388793712da06228ULL, 0x02c9ff342c8f339aULL, 0x152c734139a860a3ULL,
0x35776eb2b270c04dULL, 0x0f8d8b41f11c4608ULL, 0x0c2071665be6b288ULL,
0xc034e212b3f71d88ULL, 0x071d961ef3276f99ULL, 0xf98598ee75b60773ULL,
0x062062c58c6724e4ULL, 0xd156438e2125572cULL, 0x38552d59a7f0f7c8ULL,
0x1a402178206e413cULL, 0x1f1f996c68293b26ULL, 0x8bce3cafe1730f7eULL,
0x2d0480a0828f6bf5ULL, 0x6c99cffa171f92f6ULL, 0x0087f842bb0ac681ULL,
0x11d7ed06e1e7fd3eULL, 0x07cb1186f2385dc6ULL, 0x5d7763ebff1e170fULL,
0x2dacc870231ac292ULL, 0x8486317a9ffb390cULL, 0x1c3a6dd20c959ac6ULL,
0x90dc96e3992e06b8ULL, 0x70d60bfa33e72b67ULL, 0x70c9bddd0985ee63ULL,
0x012c9767b3673093ULL, 0xfcd3bc5580f6a88aULL, 0x0ac80017ef6308c3ULL,
0xdb67d709ef4bba09ULL, 0x4c63e324f0e247ccULL, 0xa15481d3fe219d60ULL,
0x094c4279cdccb501ULL, 0x965a28c72575cb82ULL, 0x022869db25e391ebULL,
0x37f528c146023910ULL, 0x0c1290636917deceULL, 0x9aee25e96251ca9cULL,
0x728ac5ba853b69c2ULL, 0x9f272c93c4be20c8ULL, 0x06c1aa6319d28124ULL,
0x4324496b1ca8a4f7ULL, 0x0096ecfe7dfc0189ULL, 0x9e06131b19ae0020ULL,
0x15278b15902f4597ULL, 0x2a9fece8c13842d8ULL, 0x1d4e6781f0e1355eULL,
0x6855b712d3dbf7c0ULL, 0x06a07fad99be6f46ULL, 0x3ed9d7957e4d1d7cULL,
0x0c326f7cbc248bb2ULL, 0xe6363ad2c537cf51ULL, 0x0e12eb1c40723f13ULL,
0xf5c6ac850afba803ULL, 0x0322a79d615fa9f0ULL, 0x6116696ed97bd5f8ULL,
0x0d438080fbbdc9f1ULL, 0x2e4dc42c38f1e243ULL, 0x64948e9104f3a5bfULL,
0x9fd622371bdb5f00ULL, 0x0f12bf082b2a1b6eULL, 0x4b1f8d867d78031cULL,
0x134392ea9f5ef832ULL, 0xf3d70472321bc23eULL, 0x05fcbe5e9eea268eULL,
0x136dede7175a22cfULL, 0x1308f8baac2cbcccULL, 0xd691026f0915eb64ULL,
0x0e49a668345c3a38ULL, 0x24ddbbe8bc96f331ULL, 0x4d2ec9479b640578ULL,
0x450f0697327b359cULL, 0x32b45360f4488ee0ULL, 0x4f6d9ecec46a105aULL,
0x5500c63401ae8e80ULL, 0x47dea495cf6f98baULL, 0x13dc9a2dfca80babULL,
0xe6f8a93f7b24ca92ULL, 0x073f57a6d900a87fULL, 0x9ddb935fd3aa695aULL,
0x101e98d24b39e8aaULL, 0x6b8d0eb95a507ddcULL, 0x45a908b3903d209bULL,
0x6c96a3e119e617d4ULL, 0x2442787543d3be48ULL, 0xd3bc055c7544b364ULL,
0x7693bb042ca8653eULL, 0xb95e3a4ea5d0101eULL, 0x116f0d459bb94a73ULL,
0x841244b72cdc5e90ULL, 0x1271acced6cb34d3ULL, 0x07d289106524d638ULL,
0x537c9cf49c01b5bbULL, 0x8a8e16706bb7a5daULL, 0x12e50a9c499dc3a9ULL,
0x1cade520db2ba830ULL, 0x1add52f000d7db70ULL, 0x12cf15db2ce78e30ULL,
0x0657eaf606bfc866ULL, 0x4026816d3b05b1d0ULL, 0x1ba0ebdf90128e4aULL,
0xdfd649375996dd6eULL, 0x0f416e906c23d9aeULL, 0x384273cad0582a24ULL,
0x2ff27b0378a46189ULL, 0xc4ecd18a2d7a7616ULL, 0x35cef0b5cd51d640ULL,
0x7d582363643f48b7ULL, 0x0984ad746ad0ab7cULL, 0x2990a999835f9688ULL,
0x2d4df66a97b19e05ULL, 0x592c79720af99aa2ULL, 0x052863c230602cd3ULL,
0x5f5e2b15edcf2840ULL, 0x01dff1b694b978b0ULL, 0x14345a48b622025eULL,
0x028fab3b6407f715ULL, 0x3455d188e6feca50ULL, 0x1d0d40288fb1b5fdULL,
0x4685c5c2b6a1e5aeULL, 0x3a2077b1e5fe5adeULL, 0x1bc55d611445a0d8ULL,
0x05480ae95f3f83feULL, 0xbbb59cfcf7e17fb6ULL, 0x13f7f10970bbb990ULL,
0x6d00ac169425a352ULL, 0x7da0db397ef2d5d3ULL, 0x5b512a247f8d2479ULL,
0x637eaa6a977c3c32ULL, 0x3720f0ae37cba89cULL, 0x443df6e6aa7f525bULL,
0x28664c287dcef321ULL, 0x03c267c00cf35e49ULL, 0x690185572d4021deULL,
0x2707ff2596e321c2ULL, 0xd865f5af7722c380ULL, 0x1ea285658e33aafbULL,
0xc257c5e88755bef4ULL, 0x066f67275cfcc31eULL, 0xb09931945cc0fed0ULL,
0x58c1dc38d6e3a03fULL, 0xf99489678fc94ee8ULL, 0x75045bb99be5758aULL,
0x6c163bc34b40feefULL, 0x0420063ce7bdd3b4ULL, 0xf86ef10582bf2e28ULL,
0x162c3449ca14858cULL, 0x94106aa61dfe3280ULL, 0x4073ae7a4e7e4941ULL,
0x32b13fd179c250b4ULL, 0x0178fbb216a7e744ULL, 0xf840ae2f1cf92669ULL,
0x18fc709acc80243dULL, 0x20ac2ebd69f4d558ULL, 0x6e580ad9c73ad46aULL,
0x76d2b535b541c19dULL, 0x6c7a3fb9dd0ce0afULL, 0xc3481689b9754f28ULL,
0x156e813b6557abdbULL, 0x6ee372e31276eb10ULL, 0x19cf37c038c8d381ULL,
0x00d4d906c9ae3072ULL, 0x09f03cbb6dfbfd40ULL, 0x461ba31c4125f3cfULL,
0x25b29fc63ad9f05bULL, 0x6808c95c2dddede9ULL, 0x0564224337066d9bULL,
0xc87eb5f4a4d966f2ULL, 0x66fc66e1701f5847ULL, 0xc553a3559f74da28ULL,
0x1dfd841be574df43ULL, 0x3ee2f100c3ebc082ULL, 0x1a2c4f9517b56e89ULL,
0x502f65c4b535c8ffULL, 0x1da5663ab6f96ec0ULL, 0xba1f80b73988152cULL,
0x364ff12182ac8dc1ULL, 0xe3457a3c4871db31ULL, 0x6ae9cadf92fd7e84ULL,
0x9621ba3d6ca15186ULL, 0x00ff5af878c144ceULL, 0x918464dc130101a4ULL,
0x036511e6b187efa6ULL, 0x06667d66550ff260ULL, 0x7fd18913f9b51bc1ULL,
0x3740e6b27af77aa8ULL, 0x1f546c2fd358ff8aULL, 0x42f1424e3115c891ULL,
0x03767db4e3a1bb33ULL, 0xa171a1c564345060ULL, 0x0afcf632fd7b1324ULL,
0xb59508d933ffb7d0ULL, 0x57d766c42071be83ULL, 0x659f0447546114a2ULL,
0x4070364481c460aeULL, 0xa2b9752280644d52ULL, 0x04ab884bea5771bdULL,
0x87cd135602a232b4ULL, 0x15e54cd9a8155313ULL, 0x1e8005efaa3e1047ULL,
0x696b93f4ab15d39fULL, 0x0855a8e540de863aULL, 0x0bb11799e79f9426ULL,
0xeffa61e5c1b579baULL, 0x1e060a1d11808219ULL, 0x10e219205667c599ULL,
0x2f7b206091c49498ULL, 0xb48854c820064860ULL, 0x21c4aaa3bfbe4a38ULL,
0x8f4a032a3fa67e9cULL, 0x3146b3823401e2acULL, 0x3afee26f19d88400ULL,
0x167087c485791d38ULL, 0xb67a1ed945b0fb4bULL, 0x02436eb17e27f1c0ULL,
0xe05afce2ce2d2790ULL, 0x49c536fc6224cfebULL, 0x178865b3b862b856ULL,
0x1ce530de26acde5bULL, 0x87312c0b30a06f38ULL, 0x03e653b578558d76ULL,
0x4d3663c21d8b3accULL, 0x038003c23626914aULL, 0xd9d5a2c052a09451ULL,
0x39b5acfe08a49384ULL, 0x40f349956d5800e4ULL, 0x0968b6950b1bd8feULL,
0xd60b2ca030f3779cULL, 0x7c8bc11a23ce18edULL, 0xcc23374e27630bc2ULL,
0x2e38fc2a8bb33210ULL, 0xe421357814ee5c44ULL, 0x315fb65ea71ec671ULL,
0xfb1b0223f70ed290ULL, 0x30556c9f983eaf07ULL, 0x8dd438c3d0cd625aULL,
0x05a8fd0c7ffde71bULL, 0x764d1313b5aeec7aULL, 0x2036af5de9622f47ULL,
0x508a5bfadda292feULL, 0x3f77f04ba2830e90ULL, 0x9047cd9c66ca66d2ULL,
0x1168b5318a54eb21ULL, 0xc93462d221da2e15ULL, 0x4c2c7cc54abc066eULL,
0x767a56fec478240eULL, 0x095de72546595bd3ULL, 0xc9da535865158558ULL,
0x1baccf36f33e73fbULL, 0xf3d7dbe64df77f18ULL, 0x1f8ebbb7be4850b8ULL,
0x043c5ed77bce25a1ULL, 0x07d401041b2a178aULL, 0x9181ebb8bd8d5618ULL,
0x078b935dc3e4034aULL, 0x7b59c08954214300ULL, 0x03570dc2a4f84421ULL,
0xdd8715b82f6b4078ULL, 0x2bb49c8bb544163bULL, 0xc9eb125564d59686ULL,
0x5fdc7a38f80b810aULL, 0x3a4a6d8fff686544ULL, 0x28360e2418627d3aULL,
0x60874244c95ed992ULL, 0x2115cc1dd9c34ed3ULL, 0xfaa3ef61f55e9efcULL,
0x27ac9b1ef1adc7e6ULL, 0x95ea00478fec3f54ULL, 0x5aea808b2d99ab43ULL,
0xc8f79e51fe43a580ULL, 0x5dbccd714236ce25ULL, 0x783fa76ed0753458ULL,
0x48cb290f19d84655ULL, 0xc86a832f7696099aULL, 0x52f30c6fec0e71d3ULL,
0x77d4e91e8cdeb886ULL, 0x7169a703c6a79ccdULL, 0x98208145b9596f74ULL,
0x0945695c761c0796ULL, 0x0be897830d17bae0ULL, 0x033ad3924caeeeb4ULL,
0xedecb6cfa2d303a8ULL, 0x3f86b074818642e7ULL, 0xeefa7c878a8b03f4ULL,
0x093c101b80922551ULL, 0xfb3b4e6c26ac0034ULL, 0x162bf87999b94f5eULL,
0xeaedae76e975b17cULL, 0x1852aa090effe18eULL};
static constexpr uint64_t kCLMulUpper[kCLMulNum] = {
0xbb41199b1d587c69ULL, 0x514d94d55894ee29ULL, 0xebc6cd4d2efd5d16ULL,
0x042044ad2de477fdULL, 0xb865c8b0fcdf4b15ULL, 0x0724d7e551cc40f3ULL,
0xb15a16f39edb0bccULL, 0x37d64419ede7a171ULL, 0x2aa01bb80c753401ULL,
0x06ff3f8a95fdaf4dULL, 0x79898cc0838546deULL, 0x776acbd1b237c60aULL,
0x4c1753be4f4e0064ULL, 0x0ba9243601206ed3ULL, 0xd567c3b1bf3ec557ULL,
0x043fac7bcff61fb3ULL, 0x49356232b159fb2fULL, 0x3910c82038102d4dULL,
0x30592fef753eb300ULL, 0x7b2660e0c92a9e9aULL, 0x8246c9248d671ef0ULL,
0x5a0dcd95147af5faULL, 0x43fde953909cc0eaULL, 0x06147b972cb96e1bULL,
0xd84193a6b2411d80ULL, 0x00cd7711b950196fULL, 0x1088f9f4ade7fa64ULL,
0x05a13096ec113cfbULL, 0x958d816d53b00edcULL, 0x3846154a7cdba9cbULL,
0x8af516db6b27d1e6ULL, 0x1a1d462ab8a33b13ULL, 0x4040b0ac1b2c754cULL,
0x05127fe9af2fe1d6ULL, 0x9f96e79374321fa6ULL, 0x06ff64a4d9c326f3ULL,
0x28709566e158ac15ULL, 0x301701d7111ca51cULL, 0x31e0445d1b9d9544ULL,
0x0a95aff69bf1d03eULL, 0x7c298c8414ecb879ULL, 0x00801499b4143195ULL,
0x91521a00dd676a5cULL, 0x2777526a14c2f723ULL, 0xfa26aac6a6357dddULL,
0x1d265889b0187a4bULL, 0xcd6e70fa8ed283e4ULL, 0x18a815aa50ea92caULL,
0xc01e082694a263c6ULL, 0x4b40163ba53daf25ULL, 0xbc658caff6501673ULL,
0x3ba35359586b9652ULL, 0x74f96acc97a4936cULL, 0x3989dfdb0cf1d2cfULL,
0x358a01eaa50dda32ULL, 0x01109a5ed8f0802bULL, 0x55b84922e63c2958ULL,
0x55b14843d87551d5ULL, 0x1db8ec61b1b578d8ULL, 0x79a2d49ef8c3658fULL,
0xa304516816b3fbe0ULL, 0x163ecc09cc7b82f9ULL, 0xab91e8d22aabef00ULL,
0x0ed6b09262de8354ULL, 0xcfd47d34cf73f6f2ULL, 0x7dbd1db2390bc6c3ULL,
0x5ae789d3875e7b00ULL, 0x1d60fd0e70fe8fa4ULL, 0x690bc15d5ae4f6f5ULL,
0x121ef5565104fb44ULL, 0x6e98e89297353b54ULL, 0x42554949249d62edULL,
0xd6d6d16b12df78d2ULL, 0x320b33549b74975dULL, 0xd2a0618763d22e00ULL,
0x0808deb93cba2017ULL, 0x01bd3b2302a2cc70ULL, 0x0b7b8dd4d71c8dd6ULL,
0x34d60a3382a0756cULL, 0x40984584c8219629ULL, 0xf1152cba10093a66ULL,
0x068001c6b2159ccbULL, 0x3d70f13c6cda0800ULL, 0x0e6b6746a322b956ULL,
0x83a494319d8c770bULL, 0x0faecf64a8553e9aULL, 0xa34919222c39b1bcULL,
0x0c63850d89e71c6fULL, 0x585f0bee92e53dc8ULL, 0x10f222b13b4fa5deULL,
0x61573114f94252f2ULL, 0x09d59c311fba6c27ULL, 0x014effa7da49ed4eULL,
0x4a400a1bc1c31d26ULL, 0xc9091c047b484972ULL, 0x3989f341ec2230ccULL,
0xdcb03a98b3aee41eULL, 0x4a54a676a33a95e1ULL, 0xe499b7753951ef7cULL,
0x2f43b1d1061d8b48ULL, 0xc3313bdc68ceb146ULL, 0x5159f6bc0e99227fULL,
0x98128e6d9c05efcaULL, 0x15ea32b27f77815bULL, 0xe882c054e2654eecULL,
0x003d2cdb8faee8c6ULL, 0xb416dd333a9fe1dfULL, 0x73f6746aefcfc98bULL,
0x93dc114c10a38d70ULL, 0x05055941657845eaULL, 0x2ed7351347349334ULL,
0x26fb1ee2c69ae690ULL, 0xa4575d10dc5b28e0ULL, 0x3395b11295e485ebULL,
0xe840f198a224551cULL, 0x78e6e5a431d941d4ULL, 0xa1fee3ceab27f391ULL,
0x07d35b3c5698d0dcULL, 0x983c67fca9174a29ULL, 0x2bb6bbae72b5144aULL,
0xa7730b8d13ce58efULL, 0x51b5272883de1998ULL, 0xb334e128bb55e260ULL,
0x1cacf5fbbe1b9974ULL, 0x71a9df4bb743de60ULL, 0x5176fe545c2d0d7aULL,
0xbe592ecf1a16d672ULL, 0x27aa8a30c3efe460ULL, 0x4c78a32f47991e06ULL,
0x383459294312f26aULL, 0x97ba789127f1490cULL, 0x51c9aa8a3abd1ef1ULL,
0xcc7355188121e50fULL, 0x0ecb3a178ae334c1ULL, 0x84879a5e574b7160ULL,
0x0765298f6389e8f3ULL, 0x5c6750435539bb22ULL, 0x11a05cf056c937b5ULL,
0xb5dc2172dbfb7662ULL, 0x3ffc17915d9f40e8ULL, 0xbc7904daf3b431b0ULL,
0x71f2088490930a7cULL, 0xa89505fd9efb53c4ULL, 0x02e194afd61c5671ULL,
0x99a97f4abf35fcecULL, 0x26830aad30fae96fULL, 0x4b2abc16b25cf0b0ULL,
0x07ec6fffa1cafbdbULL, 0xf38188fde97a280cULL, 0x121335701afff64dULL,
0xea5ef38b4e672a64ULL, 0x477edbcae3eabf03ULL, 0xa32813cc0e0d244dULL,
0x13346d2af4972eefULL, 0xcbc18357af1cfa9aULL, 0x561b630316e73fa6ULL,
0xe9dfb53249249305ULL, 0x5d2b9dd1479312eeULL, 0x3458008119b56d04ULL,
0x50e6790b49801385ULL, 0x5bb9febe2349492bULL, 0x0c2813954299098fULL,
0xf747b0c890a071d5ULL, 0x417e8f82cc028d77ULL, 0xa134fee611d804f8ULL,
0x24c99ee9a0408761ULL, 0x3ebb224e727137f3ULL, 0x0686022073ceb846ULL,
0xa05e901fb82ad7daULL, 0x0ece7dc43ab470fcULL, 0x2d334ecc58f7d6a3ULL,
0x23166fadacc54e40ULL, 0x9c3a4472f839556eULL, 0x071717ab5267a4adULL,
0xb6600ac351ba3ea0ULL, 0x30ec748313bb63d4ULL, 0xb5374e39287b23ccULL,
0x074d75e784238aebULL, 0x77315879243914a4ULL, 0x3bbb1971490865f1ULL,
0xa355c21f4fbe02d3ULL, 0x0027f4bb38c8f402ULL, 0xeef8708e652bc5f0ULL,
0x7b9aa56cf9440050ULL, 0x113ac03c16cfc924ULL, 0x395db36d3e4bef9fULL,
0x5d826fabcaa597aeULL, 0x2a77d3c58786d7e0ULL, 0x85996859a3ba19d4ULL,
0x01e7e3c904c2d97fULL, 0x34f90b9b98d51fd0ULL, 0x243aa97fd2e99bb7ULL,
0x40a0cebc4f65c1e8ULL, 0x46d3922ed4a5503eULL, 0x446e7ecaf1f9c0a4ULL,
0x49dc11558bc2e6aeULL, 0xe7a9f20881793af8ULL, 0x5771cc4bc98103f1ULL,
0x2446ea6e718fce90ULL, 0x25d14aca7f7da198ULL, 0x4347af186f9af964ULL,
0x10cb44fc9146363aULL, 0x8a35587afce476b4ULL, 0x575144662fee3d3aULL,
0x69f41177a6bc7a05ULL, 0x02ff8c38d6b3c898ULL, 0x57c73589a226ca40ULL,
0x732f6b5baae66683ULL, 0x00c008bbedd4bb34ULL, 0x7412ff09524d6cadULL,
0xb8fd0b5ad8c145a8ULL, 0x74bd9f94b6cdc7dfULL, 0x68233b317ca6c19cULL,
0x314b9c2c08b15c54ULL, 0x5bd1ad72072ebd08ULL, 0x6610e6a6c07030e4ULL,
0xa4fc38e885ead7ceULL, 0x36975d1ca439e034ULL, 0xa358f0fe358ffb1aULL,
0x38e247ad663acf7dULL, 0x77daed3643b5deb8ULL, 0x5507c2aeae1ec3d0ULL,
0xfdec226c73acf775ULL, 0x1b87ff5f5033492dULL, 0xa832dee545d9033fULL,
0x1cee43a61e41783bULL, 0xdff82b2e2d822f69ULL, 0x2bbc9a376cb38cf2ULL,
0x117b1cdaf765dc02ULL, 0x26a407f5682be270ULL, 0x8eb664cf5634af28ULL,
0x17cb4513bec68551ULL, 0xb0df6527900cbfd0ULL, 0x335a2dc79c5afdfcULL,
0xa2f0ca4cd38dca88ULL, 0x1c370713b81a2de1ULL, 0x849d5df654d1adfcULL,
0x2fd1f7675ae14e44ULL, 0x4ff64dfc02247f7bULL, 0x3a2bcf40e395a48dULL,
0x436248c821b187c1ULL, 0x29f4337b1c7104c0ULL, 0xfc317c46e6630ec4ULL,
0x2774bccc4e3264c7ULL, 0x2d03218d9d5bee23ULL, 0x36a0ed04d659058aULL,
0x452484461573cab6ULL, 0x0708edf87ed6272bULL, 0xf07960a1587446cbULL,
0x3660167b067d84e0ULL, 0x65990a6993ddf8c4ULL, 0x0b197cd3d0b40b3fULL,
0x1dcec4ab619f3a05ULL, 0x722ab223a84f9182ULL, 0x0822d61a81e7c38fULL,
0x3d22ad75da563201ULL, 0x93cef6979fd35e0fULL, 0x05c3c25ae598b14cULL,
0x1338df97dd496377ULL, 0x15bc324dc9c20acfULL, 0x96397c6127e6e8cfULL,
0x004d01069ef2050fULL, 0x2fcf2e27893fdcbcULL, 0x072f77c3e44f4a5cULL,
0x5eb1d80b3fe44918ULL, 0x1f59e7c28cc21f22ULL, 0x3390ce5df055c1f8ULL,
0x4c0ef11df92cb6bfULL, 0x50f82f9e0848c900ULL, 0x08d0fde3ffc0ae38ULL,
0xbd8d0089a3fbfb73ULL, 0x118ba5b0f311ef59ULL, 0x9be9a8407b926a61ULL,
0x4ea04fbb21318f63ULL, 0xa1c8e7bb07b871ffULL, 0x1253a7262d5d3b02ULL,
0x13e997a0512e5b29ULL, 0x54318460ce9055baULL, 0x4e1d8a4db0054798ULL,
0x0b235226e2cade32ULL, 0x2588732c1476b315ULL, 0x16a378750ba8ac68ULL,
0xba0b116c04448731ULL, 0x4dd02bd47694c2f1ULL, 0x16d6797b218b6b25ULL,
0x769eb3709cfbf936ULL, 0x197746a0ce396f38ULL, 0x7d17ad8465961d6eULL,
0xfe58f4998ae19bb4ULL, 0x36df24305233ce69ULL, 0xb88a4eb008f4ee72ULL,
0x302b2eb923334787ULL, 0x15a4e3edbe13d448ULL, 0x39a4bf64dd7730ceULL,
0xedf25421b31090c4ULL, 0x4d547fc131be3b69ULL, 0x2b316e120ca3b90eULL,
0x0faf2357bf18a169ULL, 0x71f34b54ee2c1d62ULL, 0x18eaf6e5c93a3824ULL,
0x7e168ba03c1b4c18ULL, 0x1a534dd586d9e871ULL, 0xa2cccd307f5f8c38ULL,
0x2999a6fb4dce30f6ULL, 0x8f6d3b02c1d549a6ULL, 0x5cf7f90d817aac5aULL,
0xd2a4ceefe66c8170ULL, 0x11560edc4ca959feULL, 0x89e517e6f0dc464dULL,
0x75bb8972dddd2085ULL, 0x13859ed1e459d65aULL, 0x057114653326fa84ULL,
0xe2e6f465173cc86cULL, 0x0ada4076497d7de4ULL, 0xa856fa10ec6dbf8aULL,
0x41505d9a7c25d875ULL, 0x3091b6278382eccdULL, 0x055737185b2c3f13ULL,
0x2f4df8ecd6f9c632ULL, 0x0633e89c33552d98ULL, 0xf7673724d16db440ULL,
0x7331bd08e636c391ULL, 0x0252f29672fee426ULL, 0x1fc384946b6b9ddeULL,
0x03460c12c901443aULL, 0x003a0792e10abcdaULL, 0x8dbec31f624e37d0ULL,
0x667420d5bfe4dcbeULL, 0xfbfa30e874ed7641ULL, 0x46d1ae14db7ecef6ULL,
0x216bd7e8f5448768ULL, 0x32bcd40d3d69cc88ULL, 0x2e991dbc39b65abeULL,
0x0e8fb123a502f553ULL, 0x3d2d486b2c7560c0ULL, 0x09aba1db3079fe03ULL,
0xcb540c59398c9bceULL, 0x363970e5339ed600ULL, 0x2caee457c28af00eULL,
0x005e7d7ee47f41a0ULL, 0x69fad3eb10f44100ULL, 0x048109388c75beb3ULL,
0x253dddf96c7a6fb8ULL, 0x4c47f705b9d47d09ULL, 0x6cec894228b5e978ULL,
0x04044bb9f8ff45c2ULL, 0x079e75704d775caeULL, 0x073bd54d2a9e2c33ULL,
0xcec7289270a364fbULL, 0x19e7486f19cd9e4eULL, 0xb50ac15b86b76608ULL,
0x0620cf81f165c812ULL, 0x63eaaf13be7b11d4ULL, 0x0e0cf831948248c2ULL,
0xf0412df8f46e7957ULL, 0x671c1fe752517e3fULL, 0x8841bfb04dd3f540ULL,
0x122de4142249f353ULL, 0x40a4959fb0e76870ULL, 0x25cfd3d4b4bbc459ULL,
0x78a07c82930c60d0ULL, 0x12c2de24d4cbc969ULL, 0x85d44866096ad7f4ULL,
0x1fd917ca66b2007bULL, 0x01fbbb0751764764ULL, 0x3d2a4953c6fe0fdcULL,
0xcc1489c5737afd94ULL, 0x1817c5b6a5346f41ULL, 0xe605a6a7e9985644ULL,
0x3c50412328ff1946ULL, 0xd8c7fd65817f1291ULL, 0x0bd66975ab66339bULL,
0x2baf8fa1c7d10fa9ULL, 0x24abdf06ddef848dULL, 0x14df0c9b2ea4f6c2ULL,
0x2be950edfd2cb1f7ULL, 0x21911e21094178b6ULL, 0x0fa54d518a93b379ULL,
0xb52508e0ac01ab42ULL, 0x0e035b5fd8cb79beULL, 0x1c1c6d1a3b3c8648ULL,
0x286037b42ea9871cULL, 0xfe67bf311e48a340ULL, 0x02324131e932a472ULL,
0x2486dc2dd919e2deULL, 0x008aec7f1da1d2ebULL, 0x63269ba0e8d3eb3aULL,
0x23c0f11154adb62fULL, 0xc6052393ecd4c018ULL, 0x523585b7d2f5b9fcULL,
0xf7e6f8c1e87564c9ULL, 0x09eb9fe5dd32c1a3ULL, 0x4d4f86886e055472ULL,
0x67ea17b58a37966bULL, 0x3d3ce8c23b1ed1a8ULL, 0x0df97c5ac48857ceULL,
0x9b6992623759eb12ULL, 0x275aa9551ae091f2ULL, 0x08855e19ac5e62e5ULL,
0x1155fffe0ae083ccULL, 0xbc9c78db7c570240ULL, 0x074560c447dd2418ULL,
0x3bf78d330bcf1e70ULL, 0x49867cd4b7ed134bULL, 0x8e6eee0cb4470accULL,
0x1dabafdf59233dd6ULL, 0xea3a50d844fc3fb8ULL, 0x4f03f4454764cb87ULL,
0x1f2f41cc36c9e6ecULL, 0x53cba4df42963441ULL, 0x10883b70a88d91fbULL,
0x62b1fc77d4eb9481ULL, 0x893d8f2604b362e1ULL, 0x0933b7855368b440ULL,
0x9351b545703b2fceULL, 0x59c1d489b9bdd3b4ULL, 0xe72a9c4311417b18ULL,
0x5355df77e88eb226ULL, 0xe802c37aa963d7e1ULL, 0x381c3747bd6c3bc3ULL,
0x378565573444258cULL, 0x37848b1e52b43c18ULL, 0x5da2cd32bdce12b6ULL,
0x13166c5da615f6fdULL, 0xa51ef95efcc66ac8ULL, 0x640c95e473f1e541ULL,
0x6ec68def1f217500ULL, 0x49ce3543c76a4079ULL, 0x5fc6fd3cddc706b5ULL,
0x05c3c0f0f6a1fb0dULL, 0xe7820c0996ad1bddULL, 0x21f0d752a088f35cULL,
0x755405b51d6fc4a0ULL, 0x7ec7649ca4b0e351ULL, 0x3d2b6a46a251f790ULL,
0x23e1176b19f418adULL, 0x06056575efe8ac05ULL, 0x0f75981b6966e477ULL,
0x06e87ec41ad437e4ULL, 0x43f6c255d5e1cb84ULL, 0xe4e67d1120ceb580ULL,
0x2cd67b9e12c26d7bULL, 0xcd00b5ff7fd187f1ULL, 0x3f6cd40accdc4106ULL,
0x3e895c835459b330ULL, 0x0814d53a217c0850ULL, 0xc9111fe78bc3a62dULL,
0x719967e351473204ULL, 0xe757707d24282aa4ULL, 0x7226b7f5607f98e6ULL,
0x7b268ffae3c08d96ULL, 0x16d3917c8b86020eULL, 0x5128bca51c49ea64ULL,
0x345ffea02bb1698dULL, 0x9460f5111fe4fbc8ULL, 0x60dd1aa5762852cbULL,
0xbb7440ed3c81667cULL, 0x0a4b12affa7f6f5cULL, 0x95cbcb0ae03861b6ULL,
0x07ab3b0591db6070ULL, 0xc6476a4c3de78982ULL, 0x204e82e8623ad725ULL,
0x569a5b4e8ac2a5ccULL, 0x425a1d77d72ebae2ULL, 0xcdaad5551ab33830ULL,
0x0b7c68fd8422939eULL, 0x46d9a01f53ec3020ULL, 0x102871edbb29e852ULL,
0x7a8e8084039075a5ULL, 0x40eaede8615e376aULL, 0x4dc67d757a1c751fULL,
0x1176ef33063f9145ULL, 0x4ea230285b1c8156ULL, 0x6b2aa46ce0027392ULL,
0x32b13230fba1b068ULL, 0x0e69796851bb984fULL, 0xb749f4542db698c0ULL,
0x19ad0241ffffd49cULL, 0x2f41e92ef6caff52ULL, 0x4d0b068576747439ULL,
0x14d607aef7463e00ULL, 0x1443d00d85fb440eULL, 0x529b43bf68688780ULL,
0x21133a6bc3a3e378ULL, 0x865b6436dae0e7e5ULL, 0x6b4fe83dc1d6defcULL,
0x03a5858a0ca0be46ULL, 0x1e841b187e67f312ULL, 0x61ee22ef40a66940ULL,
0x0494bd2e9e741ef8ULL, 0x4eb59e323010e72cULL, 0x19f2abcfb749810eULL,
0xb30f1e4f994ef9bcULL, 0x53cf6cdd51bd2d96ULL, 0x263943036497a514ULL,
0x0d4b52170aa2edbaULL, 0x0c4758a1c7b4f758ULL, 0x178dadb1b502b51aULL,
0x1ddbb20a602eb57aULL, 0x1fc2e2564a9f27fdULL, 0xd5f8c50a0e3d6f90ULL,
0x0081da3bbe72ac09ULL, 0xcf140d002ccdb200ULL, 0x0ae8389f09b017feULL,
0x17cc9ffdc03f4440ULL, 0x04eb921d704bcdddULL, 0x139a0ce4cdc521abULL,
0x0bfce00c145cb0f0ULL, 0x99925ff132eff707ULL, 0x063f6e5da50c3d35ULL,
0xa0c25dea3f0e6e29ULL, 0x0c7a9048cc8e040fULL,
};
const size_t padded = RoundUpTo(kCLMulNum, N);
auto expected_lower = AllocateAligned<T>(padded);
auto expected_upper = AllocateAligned<T>(padded);
memcpy(expected_lower.get(), kCLMulLower, kCLMulNum * sizeof(T));
memcpy(expected_upper.get(), kCLMulUpper, kCLMulNum * sizeof(T));
const size_t padding_size = (padded - kCLMulNum) * sizeof(T);
memset(expected_lower.get() + kCLMulNum, 0, padding_size);
memset(expected_upper.get() + kCLMulNum, 0, padding_size);
// Random inputs in each lane
RandomState rng;
for (size_t rep = 0; rep < kCLMulNum / N; ++rep) {
for (size_t i = 0; i < N; ++i) {
in1[i] = Random64(&rng);
in2[i] = Random64(&rng);
}
const auto a = Load(d, in1.get());
const auto b = Load(d, in2.get());
#if HWY_PRINT_CLMUL_GOLDEN
Store(CLMulLower(a, b), d, expected_lower.get() + rep * N);
Store(CLMulUpper(a, b), d, expected_upper.get() + rep * N);
#else
HWY_ASSERT_VEC_EQ(d, expected_lower.get() + rep * N, CLMulLower(a, b));
HWY_ASSERT_VEC_EQ(d, expected_upper.get() + rep * N, CLMulUpper(a, b));
#endif
}
#if HWY_PRINT_CLMUL_GOLDEN
// RVV lacks PRIu64, so print 32-bit halves.
for (size_t i = 0; i < kCLMulNum; ++i) {
printf("0x%08x%08xULL,", static_cast<uint32_t>(expected_lower[i] >> 32),
static_cast<uint32_t>(expected_lower[i] & 0xFFFFFFFFU));
}
printf("\n");
for (size_t i = 0; i < kCLMulNum; ++i) {
printf("0x%08x%08xULL,", static_cast<uint32_t>(expected_upper[i] >> 32),
static_cast<uint32_t>(expected_upper[i] & 0xFFFFFFFFU));
}
#endif // HWY_PRINT_CLMUL_GOLDEN
#else
(void)d;
#endif
}
};
HWY_NOINLINE void TestAllCLMul() { ForGE128Vectors<TestCLMul>()(uint64_t()); }
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
HWY_BEFORE_TEST(HwyCryptoTest);
HWY_EXPORT_AND_TEST_P(HwyCryptoTest, TestAllAES);
HWY_EXPORT_AND_TEST_P(HwyCryptoTest, TestAllCLMul);
} // namespace hwy
// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char **argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
#endif
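Reviewer note (illustrative, not part of the patch): the kCLMulLower/kCLMulUpper golden tables above hold expected vector outputs of CLMulLower/CLMulUpper for the pseudo-random inputs generated in TestCLMul. For spot-checking the arithmetic by hand, a minimal scalar sketch of the underlying 64x64 -> 128-bit carry-less multiplication:

#include <cstdint>
#include <utility>

// Carry-less (XOR) multiplication of two 64-bit values.
// Returns {low 64 bits, high 64 bits} of the 128-bit product.
static std::pair<uint64_t, uint64_t> CLMul64(uint64_t a, uint64_t b) {
  uint64_t lo = 0, hi = 0;
  for (int i = 0; i < 64; ++i) {
    if ((b >> i) & 1) {
      lo ^= a << i;                     // contribution to bits 0..63
      if (i != 0) hi ^= a >> (64 - i);  // contribution to bits 64..127
    }
  }
  return {lo, hi};
}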

third_party/highway/hwy/tests/hwy_gtest.h (vendored, new file, 156 lines)

@ -0,0 +1,156 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef HWY_TESTS_HWY_GTEST_H_
#define HWY_TESTS_HWY_GTEST_H_
// Adapters for GUnit to run tests for all targets.
#include <stddef.h>
#include <stdint.h>
#include <string>
#include <utility> // std::tuple
#include "gtest/gtest.h"
#include "hwy/highway.h"
namespace hwy {
// googletest before 1.10 didn't define INSTANTIATE_TEST_SUITE_P() but instead
// used INSTANTIATE_TEST_CASE_P which is now deprecated.
#ifdef INSTANTIATE_TEST_SUITE_P
#define HWY_GTEST_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_SUITE_P
#else
#define HWY_GTEST_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_CASE_P
#endif
// Helper class to run parametric tests using the hwy target as parameter. To
// use this define the following in your test:
// class MyTestSuite : public TestWithParamTarget {
// ...
// };
// HWY_TARGET_INSTANTIATE_TEST_SUITE_P(MyTestSuite);
// TEST_P(MyTestSuite, MyTest) { ... }
class TestWithParamTarget : public testing::TestWithParam<uint32_t> {
protected:
void SetUp() override { SetSupportedTargetsForTest(GetParam()); }
void TearDown() override {
// Check that the parametric test calls SupportedTargets() when the source
// was compiled with more than one target. In the single-target case only
// static dispatch will be used anyway.
#if (HWY_TARGETS & (HWY_TARGETS - 1)) != 0
EXPECT_TRUE(SupportedTargetsCalledForTest())
<< "This hwy target parametric test doesn't use dynamic-dispatch and "
"doesn't need to be parametric.";
#endif
SetSupportedTargetsForTest(0);
}
};
// Function to convert the test parameter of a TestWithParamTarget for
// displaying it in the gtest test name.
static inline std::string TestParamTargetName(
const testing::TestParamInfo<uint32_t>& info) {
return TargetName(info.param);
}
#define HWY_TARGET_INSTANTIATE_TEST_SUITE_P(suite) \
HWY_GTEST_INSTANTIATE_TEST_SUITE_P( \
suite##Group, suite, \
testing::ValuesIn(::hwy::SupportedAndGeneratedTargets()), \
::hwy::TestParamTargetName)
// Helper class similar to TestWithParamTarget to run parametric tests that
// depend on the target and another parametric test. If you need to use multiple
// extra parameters use a std::tuple<> of them and ::testing::Generate(...) as
// the generator. To use this class define the following in your test:
// class MyTestSuite : public TestWithParamTargetT<int> {
// ...
// };
// HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T(MyTestSuite, ::testing::Range(0, 9));
// TEST_P(MyTestSuite, MyTest) { ... GetParam() .... }
template <typename T>
class TestWithParamTargetAndT
: public ::testing::TestWithParam<std::tuple<uint32_t, T>> {
public:
// Expose the parametric type here so it can be used by the
// HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T macro.
using HwyParamType = T;
protected:
void SetUp() override {
SetSupportedTargetsForTest(std::get<0>(
::testing::TestWithParam<std::tuple<uint32_t, T>>::GetParam()));
}
void TearDown() override {
// Check that the parametric test calls SupportedTargets() when the source
// was compiled with more than one target. In the single-target case only
// static dispatch will be used anyway.
#if (HWY_TARGETS & (HWY_TARGETS - 1)) != 0
EXPECT_TRUE(SupportedTargetsCalledForTest())
<< "This hwy target parametric test doesn't use dynamic-dispatch and "
"doesn't need to be parametric.";
#endif
SetSupportedTargetsForTest(0);
}
T GetParam() {
return std::get<1>(
::testing::TestWithParam<std::tuple<uint32_t, T>>::GetParam());
}
};
template <typename T>
std::string TestParamTargetNameAndT(
const testing::TestParamInfo<std::tuple<uint32_t, T>>& info) {
return std::string(TargetName(std::get<0>(info.param))) + "_" +
::testing::PrintToString(std::get<1>(info.param));
}
#define HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T(suite, generator) \
HWY_GTEST_INSTANTIATE_TEST_SUITE_P( \
suite##Group, suite, \
::testing::Combine( \
testing::ValuesIn(::hwy::SupportedAndGeneratedTargets()), \
generator), \
::hwy::TestParamTargetNameAndT<suite::HwyParamType>)
// Helper macro to export a function and define a test that tests it. This is
// equivalent to do a HWY_EXPORT of a void(void) function and run it in a test:
// class MyTestSuite : public TestWithParamTarget {
// ...
// };
// HWY_TARGET_INSTANTIATE_TEST_SUITE_P(MyTestSuite);
// HWY_EXPORT_AND_TEST_P(MyTestSuite, MyTest);
#define HWY_EXPORT_AND_TEST_P(suite, func_name) \
HWY_EXPORT(func_name); \
TEST_P(suite, func_name) { HWY_DYNAMIC_DISPATCH(func_name)(); } \
static_assert(true, "For requiring trailing semicolon")
#define HWY_EXPORT_AND_TEST_P_T(suite, func_name) \
HWY_EXPORT(func_name); \
TEST_P(suite, func_name) { HWY_DYNAMIC_DISPATCH(func_name)(GetParam()); } \
static_assert(true, "For requiring trailing semicolon")
#define HWY_BEFORE_TEST(suite) \
class suite : public hwy::TestWithParamTarget {}; \
HWY_TARGET_INSTANTIATE_TEST_SUITE_P(suite); \
static_assert(true, "For requiring trailing semicolon")
} // namespace hwy
#endif // HWY_TESTS_HWY_GTEST_H_
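Reviewer note (illustrative, not part of the patch): a minimal sketch of how a test file uses these adapters, following the same pattern as the test files later in this diff. The file name "tests/my_test.cc", the functor names and the suite name HwyMyTest are hypothetical.

#include "hwy/base.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/my_test.cc"
#include "hwy/foreach_target.h"
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

struct TestZeroPlusZero {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    // 0 + 0 == 0 for every lane type and vector length.
    HWY_ASSERT_VEC_EQ(d, Zero(d), Add(Zero(d), Zero(d)));
  }
};

HWY_NOINLINE void TestAllZeroPlusZero() {
  ForAllTypes(ForPartialVectors<TestZeroPlusZero>());
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
HWY_BEFORE_TEST(HwyMyTest);
HWY_EXPORT_AND_TEST_P(HwyMyTest, TestAllZeroPlusZero);
}  // namespace hwy

int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}
#endif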


@ -21,7 +21,9 @@
void PrintTargets(const char* msg, uint32_t targets) {
fprintf(stderr, "%s", msg);
for (unsigned x = targets; x != 0; x = x & (x - 1)) {
// For each bit:
for (uint32_t x = targets; x != 0; x = x & (x - 1)) {
// Extract value of least-significant bit.
fprintf(stderr, " %s", hwy::TargetName(x & (~x + 1)));
}
fprintf(stderr, "\n");
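// Illustrative note (not part of the patch): the loop above visits each set
// bit of `targets` exactly once. x & (x - 1) clears the lowest set bit at the
// end of each iteration, and x & (~x + 1) isolates it so TargetName() receives
// a single target bit. E.g. for x = 0b0110, the first iteration handles bit
// value 0b0010 and the second 0b0100, after which x is 0.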
@ -30,5 +32,6 @@ void PrintTargets(const char* msg, uint32_t targets) {
int main() {
PrintTargets("Compiled HWY_TARGETS:", HWY_TARGETS);
PrintTargets("HWY_BASELINE_TARGETS:", HWY_BASELINE_TARGETS);
PrintTargets("Current CPU supports:", hwy::SupportedTargets());
return 0;
}

third_party/highway/hwy/tests/logical_test.cc (vendored, 528 lines changed)

@ -16,12 +16,12 @@
#include <stdint.h>
#include <string.h> // memcmp
#include "hwy/aligned_allocator.h"
#include "hwy/base.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/logical_test.cc"
#include "hwy/foreach_target.h"
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
@ -36,7 +36,7 @@ struct TestLogicalInteger {
const auto vi = Iota(d, 0);
const auto ones = VecFromMask(d, Eq(v0, v0));
const auto v1 = Set(d, 1);
const auto vnot1 = Set(d, ~T(1));
const auto vnot1 = Set(d, T(~T(1)));
HWY_ASSERT_VEC_EQ(d, v0, Not(ones));
HWY_ASSERT_VEC_EQ(d, ones, Not(v0));
@ -160,308 +160,6 @@ HWY_NOINLINE void TestAllCopySign() {
ForFloatTypes(ForPartialVectors<TestCopySign>());
}
struct TestFirstN {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const size_t N = Lanes(d);
auto mask_lanes = AllocateAligned<T>(N);
// GCC workaround: we previously used zero to indicate true because we can
// safely compare with that value. However, that hits an ICE for u64x1 on
// GCC 8.3 but not 8.4, even if the implementation of operator== is
// simplified to return zero. Using MaskFromVec avoids this, and requires
// FF..FF and 0 constants.
T on;
memset(&on, 0xFF, sizeof(on));
const T off = 0;
for (size_t len = 0; len <= N; ++len) {
for (size_t i = 0; i < N; ++i) {
mask_lanes[i] = i < len ? on : off;
}
const auto mask_vals = Load(d, mask_lanes.get());
const auto mask = MaskFromVec(mask_vals);
HWY_ASSERT_MASK_EQ(d, mask, FirstN(d, len));
}
}
};
HWY_NOINLINE void TestAllFirstN() {
ForAllTypes(ForPartialVectors<TestFirstN>());
}
struct TestIfThenElse {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
RandomState rng;
const size_t N = Lanes(d);
auto in1 = AllocateAligned<T>(N);
auto in2 = AllocateAligned<T>(N);
auto mask_lanes = AllocateAligned<T>(N);
auto expected = AllocateAligned<T>(N);
// NOTE: reverse polarity (mask is true iff lane == 0) because we cannot
// reliably compare against all bits set (NaN for float types).
const T off = 1;
// Each lane should have a chance of having mask=true.
for (size_t rep = 0; rep < 50; ++rep) {
for (size_t i = 0; i < N; ++i) {
in1[i] = static_cast<T>(Random32(&rng));
in2[i] = static_cast<T>(Random32(&rng));
mask_lanes[i] = (Random32(&rng) & 1024) ? off : T(0);
}
const auto v1 = Load(d, in1.get());
const auto v2 = Load(d, in2.get());
const auto mask = Eq(Load(d, mask_lanes.get()), Zero(d));
for (size_t i = 0; i < N; ++i) {
expected[i] = (mask_lanes[i] == off) ? in2[i] : in1[i];
}
HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenElse(mask, v1, v2));
for (size_t i = 0; i < N; ++i) {
expected[i] = mask_lanes[i] ? T(0) : in1[i];
}
HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenElseZero(mask, v1));
for (size_t i = 0; i < N; ++i) {
expected[i] = mask_lanes[i] ? in2[i] : T(0);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenZeroElse(mask, v2));
}
}
};
HWY_NOINLINE void TestAllIfThenElse() {
ForAllTypes(ForPartialVectors<TestIfThenElse>());
}
struct TestMaskVec {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
RandomState rng;
const size_t N = Lanes(d);
auto mask_lanes = AllocateAligned<T>(N);
// Each lane should have a chance of having mask=true.
for (size_t rep = 0; rep < 100; ++rep) {
for (size_t i = 0; i < N; ++i) {
mask_lanes[i] = static_cast<T>(Random32(&rng) & 1);
}
const auto mask = RebindMask(d, Eq(Load(d, mask_lanes.get()), Zero(d)));
HWY_ASSERT_MASK_EQ(d, mask, MaskFromVec(VecFromMask(d, mask)));
}
}
};
HWY_NOINLINE void TestAllMaskVec() {
const ForPartialVectors<TestMaskVec> test;
test(uint16_t());
test(int16_t());
// TODO(janwas): float16_t - cannot compare yet
test(uint32_t());
test(int32_t());
test(float());
#if HWY_CAP_INTEGER64
test(uint64_t());
test(int64_t());
#endif
#if HWY_CAP_FLOAT64
test(double());
#endif
}
struct TestCompress {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
RandomState rng;
using TU = MakeUnsigned<T>;
const Rebind<TU, D> du;
const size_t N = Lanes(d);
auto in_lanes = AllocateAligned<T>(N);
auto mask_lanes = AllocateAligned<TU>(N);
auto expected = AllocateAligned<T>(N);
auto actual = AllocateAligned<T>(N);
// Each lane should have a chance of having mask=true.
for (size_t rep = 0; rep < 100; ++rep) {
size_t expected_pos = 0;
for (size_t i = 0; i < N; ++i) {
const uint64_t bits = Random32(&rng);
in_lanes[i] = T(); // cannot initialize float16_t directly.
CopyBytes<sizeof(T)>(&bits, &in_lanes[i]);
mask_lanes[i] = static_cast<TU>(Random32(&rng) & 1);
if (mask_lanes[i] == 0) { // Zero means true (easier to compare)
expected[expected_pos++] = in_lanes[i];
}
}
const auto in = Load(d, in_lanes.get());
const auto mask = RebindMask(d, Eq(Load(du, mask_lanes.get()), Zero(du)));
Store(Compress(in, mask), d, actual.get());
// Upper lanes are undefined.
for (size_t i = 0; i < expected_pos; ++i) {
HWY_ASSERT(memcmp(&actual[i], &expected[i], sizeof(T)) == 0);
}
// Also check CompressStore in the same way.
memset(actual.get(), 0, N * sizeof(T));
const size_t num_written = CompressStore(in, mask, d, actual.get());
HWY_ASSERT_EQ(expected_pos, num_written);
for (size_t i = 0; i < expected_pos; ++i) {
HWY_ASSERT(memcmp(&actual[i], &expected[i], sizeof(T)) == 0);
}
}
}
};
#if 0
namespace detail { // for code folding
void PrintCompress16x8Tables() {
constexpr size_t N = 8; // 128-bit SIMD
for (uint64_t code = 0; code < 1ull << N; ++code) {
std::array<uint8_t, N> indices{0};
size_t pos = 0;
for (size_t i = 0; i < N; ++i) {
if (code & (1ull << i)) {
indices[pos++] = i;
}
}
// Doubled (for converting lane to byte indices)
for (size_t i = 0; i < N; ++i) {
printf("%d,", 2 * indices[i]);
}
}
printf("\n");
}
// Compressed to nibbles
void PrintCompress32x8Tables() {
constexpr size_t N = 8; // AVX2
for (uint64_t code = 0; code < 1ull << N; ++code) {
std::array<uint32_t, N> indices{0};
size_t pos = 0;
for (size_t i = 0; i < N; ++i) {
if (code & (1ull << i)) {
indices[pos++] = i;
}
}
// Convert to nibbles
uint64_t packed = 0;
for (size_t i = 0; i < N; ++i) {
HWY_ASSERT(indices[i] < 16);
packed += indices[i] << (i * 4);
}
HWY_ASSERT(packed < (1ull << 32));
printf("0x%08x,", static_cast<uint32_t>(packed));
}
printf("\n");
}
// Pairs of 32-bit lane indices
void PrintCompress64x4Tables() {
constexpr size_t N = 4; // AVX2
for (uint64_t code = 0; code < 1ull << N; ++code) {
std::array<uint32_t, N> indices{0};
size_t pos = 0;
for (size_t i = 0; i < N; ++i) {
if (code & (1ull << i)) {
indices[pos++] = i;
}
}
for (size_t i = 0; i < N; ++i) {
printf("%d,%d,", 2 * indices[i], 2 * indices[i] + 1);
}
}
printf("\n");
}
// 4-tuple of byte indices
void PrintCompress32x4Tables() {
using T = uint32_t;
constexpr size_t N = 4; // SSE4
for (uint64_t code = 0; code < 1ull << N; ++code) {
std::array<uint32_t, N> indices{0};
size_t pos = 0;
for (size_t i = 0; i < N; ++i) {
if (code & (1ull << i)) {
indices[pos++] = i;
}
}
for (size_t i = 0; i < N; ++i) {
for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
printf("%zu,", sizeof(T) * indices[i] + idx_byte);
}
}
}
printf("\n");
}
// 8-tuple of byte indices
void PrintCompress64x2Tables() {
using T = uint64_t;
constexpr size_t N = 2; // SSE4
for (uint64_t code = 0; code < 1ull << N; ++code) {
std::array<uint32_t, N> indices{0};
size_t pos = 0;
for (size_t i = 0; i < N; ++i) {
if (code & (1ull << i)) {
indices[pos++] = i;
}
}
for (size_t i = 0; i < N; ++i) {
for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
printf("%zu,", sizeof(T) * indices[i] + idx_byte);
}
}
}
printf("\n");
}
} // namespace detail
#endif
HWY_NOINLINE void TestAllCompress() {
// detail::PrintCompress32x8Tables();
// detail::PrintCompress64x4Tables();
// detail::PrintCompress32x4Tables();
// detail::PrintCompress64x2Tables();
// detail::PrintCompress16x8Tables();
const ForPartialVectors<TestCompress> test;
test(uint16_t());
test(int16_t());
test(float16_t());
test(uint32_t());
test(int32_t());
test(float());
#if HWY_CAP_INTEGER64
test(uint64_t());
test(int64_t());
#endif
#if HWY_CAP_FLOAT64
test(double());
#endif
}
struct TestZeroIfNegative {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
@ -488,7 +186,7 @@ struct TestBroadcastSignBit {
const auto s0 = Zero(d);
const auto s1 = Set(d, -1); // all bit set
const auto vpos = And(Iota(d, 0), Set(d, LimitsMax<T>()));
const auto vneg = s1 - vpos;
const auto vneg = Sub(s1, vpos);
HWY_ASSERT_VEC_EQ(d, s0, BroadcastSignBit(vpos));
HWY_ASSERT_VEC_EQ(d, s0, BroadcastSignBit(Set(d, LimitsMax<T>())));
@ -508,23 +206,23 @@ struct TestTestBit {
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const size_t kNumBits = sizeof(T) * 8;
for (size_t i = 0; i < kNumBits; ++i) {
const auto bit1 = Set(d, 1ull << i);
const auto bit2 = Set(d, 1ull << ((i + 1) % kNumBits));
const auto bit3 = Set(d, 1ull << ((i + 2) % kNumBits));
const auto bit1 = Set(d, T(1ull << i));
const auto bit2 = Set(d, T(1ull << ((i + 1) % kNumBits)));
const auto bit3 = Set(d, T(1ull << ((i + 2) % kNumBits)));
const auto bits12 = Or(bit1, bit2);
const auto bits23 = Or(bit2, bit3);
HWY_ASSERT(AllTrue(TestBit(bit1, bit1)));
HWY_ASSERT(AllTrue(TestBit(bits12, bit1)));
HWY_ASSERT(AllTrue(TestBit(bits12, bit2)));
HWY_ASSERT(AllTrue(d, TestBit(bit1, bit1)));
HWY_ASSERT(AllTrue(d, TestBit(bits12, bit1)));
HWY_ASSERT(AllTrue(d, TestBit(bits12, bit2)));
HWY_ASSERT(AllFalse(TestBit(bits12, bit3)));
HWY_ASSERT(AllFalse(TestBit(bits23, bit1)));
HWY_ASSERT(AllFalse(TestBit(bit1, bit2)));
HWY_ASSERT(AllFalse(TestBit(bit2, bit1)));
HWY_ASSERT(AllFalse(TestBit(bit1, bit3)));
HWY_ASSERT(AllFalse(TestBit(bit3, bit1)));
HWY_ASSERT(AllFalse(TestBit(bit2, bit3)));
HWY_ASSERT(AllFalse(TestBit(bit3, bit2)));
HWY_ASSERT(AllFalse(d, TestBit(bits12, bit3)));
HWY_ASSERT(AllFalse(d, TestBit(bits23, bit1)));
HWY_ASSERT(AllFalse(d, TestBit(bit1, bit2)));
HWY_ASSERT(AllFalse(d, TestBit(bit2, bit1)));
HWY_ASSERT(AllFalse(d, TestBit(bit1, bit3)));
HWY_ASSERT(AllFalse(d, TestBit(bit3, bit1)));
HWY_ASSERT(AllFalse(d, TestBit(bit2, bit3)));
HWY_ASSERT(AllFalse(d, TestBit(bit3, bit2)));
}
}
};
@ -533,198 +231,54 @@ HWY_NOINLINE void TestAllTestBit() {
ForIntegerTypes(ForPartialVectors<TestTestBit>());
}
struct TestAllTrueFalse {
struct TestPopulationCount {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const auto zero = Zero(d);
auto v = zero;
const size_t N = Lanes(d);
auto lanes = AllocateAligned<T>(N);
std::fill(lanes.get(), lanes.get() + N, T(0));
auto mask_lanes = AllocateAligned<T>(N);
HWY_ASSERT(AllTrue(Eq(v, zero)));
HWY_ASSERT(!AllFalse(Eq(v, zero)));
// Single lane implies AllFalse = !AllTrue. Otherwise, there are multiple
// lanes and one is nonzero.
const bool expected_all_false = (N != 1);
// Set each lane to nonzero and back to zero
for (size_t i = 0; i < N; ++i) {
lanes[i] = T(1);
v = Load(d, lanes.get());
// GCC 10.2.1 workaround: AllTrue(Eq(v, zero)) is true but should not be.
// Assigning to an lvalue is insufficient but storing to memory prevents
// the bug; so does Print of VecFromMask(d, Eq(v, zero)).
Store(VecFromMask(d, Eq(v, zero)), d, mask_lanes.get());
HWY_ASSERT(!AllTrue(MaskFromVec(Load(d, mask_lanes.get()))));
HWY_ASSERT(expected_all_false ^ AllFalse(Eq(v, zero)));
lanes[i] = T(-1);
v = Load(d, lanes.get());
HWY_ASSERT(!AllTrue(Eq(v, zero)));
HWY_ASSERT(expected_all_false ^ AllFalse(Eq(v, zero)));
// Reset to all zero
lanes[i] = T(0);
v = Load(d, lanes.get());
HWY_ASSERT(AllTrue(Eq(v, zero)));
HWY_ASSERT(!AllFalse(Eq(v, zero)));
}
}
};
HWY_NOINLINE void TestAllAllTrueFalse() {
ForAllTypes(ForPartialVectors<TestAllTrueFalse>());
}
class TestStoreMaskBits {
public:
template <class T, class D>
HWY_NOINLINE void operator()(T /*t*/, D d) {
// TODO(janwas): remove once implemented (cast or vse1)
#if HWY_TARGET != HWY_RVV
RandomState rng;
const size_t N = Lanes(d);
auto lanes = AllocateAligned<T>(N);
const size_t expected_bytes = (N + 7) / 8;
auto bits = AllocateAligned<uint8_t>(expected_bytes);
for (size_t rep = 0; rep < 100; ++rep) {
// Generate random mask pattern.
for (size_t i = 0; i < N; ++i) {
lanes[i] = static_cast<T>((rng() & 1024) ? 1 : 0);
}
const auto mask = Load(d, lanes.get()) == Zero(d);
const size_t bytes_written = StoreMaskBits(mask, bits.get());
HWY_ASSERT_EQ(expected_bytes, bytes_written);
size_t i = 0;
// Stored bits must match original mask
for (; i < N; ++i) {
const bool bit = (bits[i / 8] & (1 << (i % 8))) != 0;
HWY_ASSERT_EQ(bit, lanes[i] == 0);
}
// Any partial bits in the last byte must be zero
for (; i < 8 * bytes_written; ++i) {
const int bit = (bits[i / 8] & (1 << (i % 8)));
HWY_ASSERT_EQ(bit, 0);
}
}
#if HWY_TARGET == HWY_RVV || HWY_IS_DEBUG_BUILD
constexpr size_t kNumTests = 1 << 14;
#else
(void)d;
constexpr size_t kNumTests = 1 << 20;
#endif
}
};
HWY_NOINLINE void TestAllStoreMaskBits() {
ForAllTypes(ForPartialVectors<TestStoreMaskBits>());
}
struct TestCountTrue {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const size_t N = Lanes(d);
// For all combinations of zero/nonzero state of subset of lanes:
const size_t max_lanes = std::min(N, size_t(10));
auto lanes = AllocateAligned<T>(N);
std::fill(lanes.get(), lanes.get() + N, T(1));
for (size_t code = 0; code < (1ull << max_lanes); ++code) {
// Number of zeros written = number of mask lanes that are true.
size_t expected = 0;
for (size_t i = 0; i < max_lanes; ++i) {
lanes[i] = T(1);
if (code & (1ull << i)) {
++expected;
lanes[i] = T(0);
}
RandomState rng;
size_t N = Lanes(d);
auto data = AllocateAligned<T>(N);
auto popcnt = AllocateAligned<T>(N);
for (size_t i = 0; i < kNumTests / N; i++) {
for (size_t i = 0; i < N; i++) {
data[i] = static_cast<T>(rng());
popcnt[i] = static_cast<T>(PopCount(data[i]));
}
const auto mask = Eq(Load(d, lanes.get()), Zero(d));
const size_t actual = CountTrue(mask);
HWY_ASSERT_EQ(expected, actual);
HWY_ASSERT_VEC_EQ(d, popcnt.get(), PopulationCount(Load(d, data.get())));
}
}
};
HWY_NOINLINE void TestAllCountTrue() {
ForAllTypes(ForPartialVectors<TestCountTrue>());
HWY_NOINLINE void TestAllPopulationCount() {
ForUnsignedTypes(ForPartialVectors<TestPopulationCount>());
}
struct TestLogicalMask {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const auto m0 = MaskFalse(d);
const auto m_all = MaskTrue(d);
const size_t N = Lanes(d);
auto lanes = AllocateAligned<T>(N);
std::fill(lanes.get(), lanes.get() + N, T(1));
HWY_ASSERT_MASK_EQ(d, m0, Not(m_all));
HWY_ASSERT_MASK_EQ(d, m_all, Not(m0));
// For all combinations of zero/nonzero state of subset of lanes:
const size_t max_lanes = std::min(N, size_t(6));
for (size_t code = 0; code < (1ull << max_lanes); ++code) {
for (size_t i = 0; i < max_lanes; ++i) {
lanes[i] = T(1);
if (code & (1ull << i)) {
lanes[i] = T(0);
}
}
const auto m = Eq(Load(d, lanes.get()), Zero(d));
HWY_ASSERT_MASK_EQ(d, m0, Xor(m, m));
HWY_ASSERT_MASK_EQ(d, m0, AndNot(m, m));
HWY_ASSERT_MASK_EQ(d, m0, AndNot(m_all, m));
HWY_ASSERT_MASK_EQ(d, m, Or(m, m));
HWY_ASSERT_MASK_EQ(d, m, Or(m0, m));
HWY_ASSERT_MASK_EQ(d, m, Or(m, m0));
HWY_ASSERT_MASK_EQ(d, m, Xor(m0, m));
HWY_ASSERT_MASK_EQ(d, m, Xor(m, m0));
HWY_ASSERT_MASK_EQ(d, m, And(m, m));
HWY_ASSERT_MASK_EQ(d, m, And(m_all, m));
HWY_ASSERT_MASK_EQ(d, m, And(m, m_all));
HWY_ASSERT_MASK_EQ(d, m, AndNot(m0, m));
}
}
};
HWY_NOINLINE void TestAllLogicalMask() {
ForAllTypes(ForPartialVectors<TestLogicalMask>());
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
HWY_BEFORE_TEST(HwyLogicalTest);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalInteger);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalFloat);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCopySign);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllFirstN);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllIfThenElse);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllMaskVec);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCompress);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllZeroIfNegative);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllBroadcastSignBit);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllTestBit);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllAllTrueFalse);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllStoreMaskBits);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCountTrue);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalMask);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllPopulationCount);
} // namespace hwy
// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char **argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
#endif
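Reviewer note (illustrative, not part of the patch): a change that recurs throughout this update is that mask queries such as AllTrue, AllFalse and CountTrue now take the type descriptor as their first argument. A minimal sketch of the new calling convention, assuming static dispatch and a uint32_t tag:

#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void DescriptorFirstSketch() {
  const ScalableTag<uint32_t> d;
  const auto m_all = MaskTrue(d);
  const bool all = AllTrue(d, m_all);         // previously AllTrue(m_all)
  const bool none = AllFalse(d, m_all);       // previously AllFalse(m_all)
  const size_t count = CountTrue(d, m_all);   // previously CountTrue(m_all)
  (void)all;
  (void)none;
  (void)count;
}

}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();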

third_party/highway/hwy/tests/mask_test.cc (vendored, new file, 465 lines)

@ -0,0 +1,465 @@
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h> // memcmp
#include "hwy/base.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/mask_test.cc"
#include "hwy/foreach_target.h"
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// All types.
struct TestFromVec {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const size_t N = Lanes(d);
auto lanes = AllocateAligned<T>(N);
memset(lanes.get(), 0, N * sizeof(T));
const auto actual_false = MaskFromVec(Load(d, lanes.get()));
HWY_ASSERT_MASK_EQ(d, MaskFalse(d), actual_false);
memset(lanes.get(), 0xFF, N * sizeof(T));
const auto actual_true = MaskFromVec(Load(d, lanes.get()));
HWY_ASSERT_MASK_EQ(d, MaskTrue(d), actual_true);
}
};
HWY_NOINLINE void TestAllFromVec() {
ForAllTypes(ForPartialVectors<TestFromVec>());
}
struct TestFirstN {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const size_t N = Lanes(d);
const RebindToSigned<D> di;
using TI = TFromD<decltype(di)>;
using TN = SignedFromSize<HWY_MIN(sizeof(size_t), sizeof(TI))>;
const size_t max_len = static_cast<size_t>(LimitsMax<TN>());
for (size_t len = 0; len <= HWY_MIN(2 * N, max_len); ++len) {
const auto expected =
RebindMask(d, Lt(Iota(di, 0), Set(di, static_cast<TI>(len))));
const auto actual = FirstN(d, len);
HWY_ASSERT_MASK_EQ(d, expected, actual);
}
// Also ensure huge values yield all-true.
HWY_ASSERT_MASK_EQ(d, MaskTrue(d), FirstN(d, max_len));
}
};
HWY_NOINLINE void TestAllFirstN() {
ForAllTypes(ForPartialVectors<TestFirstN>());
}
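// Illustrative example (not part of the patch): with N = 4 lanes and len = 2,
// Iota(di, 0) is {0, 1, 2, 3}, so Lt(Iota(di, 0), Set(di, 2)) and therefore
// FirstN(d, 2) yield the mask {true, true, false, false}.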
struct TestIfThenElse {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
RandomState rng;
using TI = MakeSigned<T>; // For mask > 0 comparison
const Rebind<TI, D> di;
const size_t N = Lanes(d);
auto in1 = AllocateAligned<T>(N);
auto in2 = AllocateAligned<T>(N);
auto bool_lanes = AllocateAligned<TI>(N);
auto expected = AllocateAligned<T>(N);
// Each lane should have a chance of having mask=true.
for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
for (size_t i = 0; i < N; ++i) {
in1[i] = static_cast<T>(Random32(&rng));
in2[i] = static_cast<T>(Random32(&rng));
bool_lanes[i] = (Random32(&rng) & 16) ? TI(1) : TI(0);
}
const auto v1 = Load(d, in1.get());
const auto v2 = Load(d, in2.get());
const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
for (size_t i = 0; i < N; ++i) {
expected[i] = bool_lanes[i] ? in1[i] : in2[i];
}
HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenElse(mask, v1, v2));
for (size_t i = 0; i < N; ++i) {
expected[i] = bool_lanes[i] ? in1[i] : T(0);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenElseZero(mask, v1));
for (size_t i = 0; i < N; ++i) {
expected[i] = bool_lanes[i] ? T(0) : in2[i];
}
HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenZeroElse(mask, v2));
}
}
};
HWY_NOINLINE void TestAllIfThenElse() {
ForAllTypes(ForPartialVectors<TestIfThenElse>());
}
struct TestMaskVec {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
RandomState rng;
using TI = MakeSigned<T>; // For mask > 0 comparison
const Rebind<TI, D> di;
const size_t N = Lanes(d);
auto bool_lanes = AllocateAligned<TI>(N);
// Each lane should have a chance of having mask=true.
for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
for (size_t i = 0; i < N; ++i) {
bool_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0);
}
const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
HWY_ASSERT_MASK_EQ(d, mask, MaskFromVec(VecFromMask(d, mask)));
}
}
};
HWY_NOINLINE void TestAllMaskVec() {
const ForPartialVectors<TestMaskVec> test;
test(uint16_t());
test(int16_t());
// TODO(janwas): float16_t - cannot compare yet
ForUIF3264(test);
}
struct TestMaskedLoad {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
RandomState rng;
using TI = MakeSigned<T>; // For mask > 0 comparison
const Rebind<TI, D> di;
const size_t N = Lanes(d);
auto bool_lanes = AllocateAligned<TI>(N);
auto lanes = AllocateAligned<T>(N);
Store(Iota(d, T{1}), d, lanes.get());
// Each lane should have a chance of having mask=true.
for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
for (size_t i = 0; i < N; ++i) {
bool_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0);
}
const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
const auto expected = IfThenElseZero(mask, Load(d, lanes.get()));
const auto actual = MaskedLoad(mask, d, lanes.get());
HWY_ASSERT_VEC_EQ(d, expected, actual);
}
}
};
HWY_NOINLINE void TestAllMaskedLoad() {
ForAllTypes(ForPartialVectors<TestMaskedLoad>());
}
struct TestAllTrueFalse {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const auto zero = Zero(d);
auto v = zero;
const size_t N = Lanes(d);
auto lanes = AllocateAligned<T>(N);
std::fill(lanes.get(), lanes.get() + N, T(0));
auto mask_lanes = AllocateAligned<T>(N);
HWY_ASSERT(AllTrue(d, Eq(v, zero)));
HWY_ASSERT(!AllFalse(d, Eq(v, zero)));
// Single lane implies AllFalse = !AllTrue. Otherwise, there are multiple
// lanes and one is nonzero.
const bool expected_all_false = (N != 1);
// Set each lane to nonzero and back to zero
for (size_t i = 0; i < N; ++i) {
lanes[i] = T(1);
v = Load(d, lanes.get());
// GCC 10.2.1 workaround: AllTrue(Eq(v, zero)) is true but should not be.
// Assigning to an lvalue is insufficient but storing to memory prevents
// the bug; so does Print of VecFromMask(d, Eq(v, zero)).
Store(VecFromMask(d, Eq(v, zero)), d, mask_lanes.get());
HWY_ASSERT(!AllTrue(d, MaskFromVec(Load(d, mask_lanes.get()))));
HWY_ASSERT(expected_all_false ^ AllFalse(d, Eq(v, zero)));
lanes[i] = T(-1);
v = Load(d, lanes.get());
HWY_ASSERT(!AllTrue(d, Eq(v, zero)));
HWY_ASSERT(expected_all_false ^ AllFalse(d, Eq(v, zero)));
// Reset to all zero
lanes[i] = T(0);
v = Load(d, lanes.get());
HWY_ASSERT(AllTrue(d, Eq(v, zero)));
HWY_ASSERT(!AllFalse(d, Eq(v, zero)));
}
}
};
HWY_NOINLINE void TestAllAllTrueFalse() {
ForAllTypes(ForPartialVectors<TestAllTrueFalse>());
}
class TestStoreMaskBits {
public:
template <class T, class D>
HWY_NOINLINE void operator()(T /*t*/, D /*d*/) {
// TODO(janwas): remove once implemented (cast or vse1)
#if HWY_TARGET != HWY_RVV
RandomState rng;
using TI = MakeSigned<T>; // For mask > 0 comparison
const Rebind<TI, D> di;
const size_t N = Lanes(di);
auto bool_lanes = AllocateAligned<TI>(N);
const ScalableTag<uint8_t, -3> d_bits;
const size_t expected_num_bytes = (N + 7) / 8;
auto expected = AllocateAligned<uint8_t>(expected_num_bytes);
auto actual = AllocateAligned<uint8_t>(HWY_MAX(8, expected_num_bytes));
for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
// Generate random mask pattern.
for (size_t i = 0; i < N; ++i) {
bool_lanes[i] = static_cast<TI>((rng() & 1024) ? 1 : 0);
}
const auto bools = Load(di, bool_lanes.get());
const auto mask = Gt(bools, Zero(di));
// Requires at least 8 bytes, ensured above.
const size_t bytes_written = StoreMaskBits(di, mask, actual.get());
if (bytes_written != expected_num_bytes) {
fprintf(stderr, "%s expected %" PRIu64 " bytes, actual %" PRIu64 "\n",
TypeName(T(), N).c_str(),
static_cast<uint64_t>(expected_num_bytes),
static_cast<uint64_t>(bytes_written));
HWY_ASSERT(false);
}
// TODO(janwas): enable after implemented
#if HWY_TARGET != HWY_RVV
// Requires at least 8 bytes, ensured above.
const auto mask2 = LoadMaskBits(di, actual.get());
HWY_ASSERT_MASK_EQ(di, mask, mask2);
#endif
memset(expected.get(), 0, expected_num_bytes);
for (size_t i = 0; i < N; ++i) {
expected[i / 8] = uint8_t(expected[i / 8] | (bool_lanes[i] << (i % 8)));
}
size_t i = 0;
// Stored bits must match original mask
for (; i < N; ++i) {
const TI is_set = (actual[i / 8] & (1 << (i % 8))) ? 1 : 0;
if (is_set != bool_lanes[i]) {
fprintf(stderr, "%s lane %" PRIu64 ": expected %d, actual %d\n",
TypeName(T(), N).c_str(), static_cast<uint64_t>(i),
int(bool_lanes[i]), int(is_set));
Print(di, "bools", bools, 0, N);
Print(d_bits, "expected bytes", Load(d_bits, expected.get()), 0,
expected_num_bytes);
Print(d_bits, "actual bytes", Load(d_bits, actual.get()), 0,
expected_num_bytes);
HWY_ASSERT(false);
}
}
// Any partial bits in the last byte must be zero
for (; i < 8 * bytes_written; ++i) {
const int bit = (actual[i / 8] & (1 << (i % 8)));
if (bit != 0) {
fprintf(stderr, "%s: bit #%" PRIu64 " should be zero\n",
TypeName(T(), N).c_str(), static_cast<uint64_t>(i));
Print(di, "bools", bools, 0, N);
Print(d_bits, "expected bytes", Load(d_bits, expected.get()), 0,
expected_num_bytes);
Print(d_bits, "actual bytes", Load(d_bits, actual.get()), 0,
expected_num_bytes);
HWY_ASSERT(false);
}
}
}
#endif
}
};
HWY_NOINLINE void TestAllStoreMaskBits() {
ForAllTypes(ForPartialVectors<TestStoreMaskBits>());
}
struct TestCountTrue {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
using TI = MakeSigned<T>; // For mask > 0 comparison
const Rebind<TI, D> di;
const size_t N = Lanes(di);
auto bool_lanes = AllocateAligned<TI>(N);
memset(bool_lanes.get(), 0, N * sizeof(TI));
// For all combinations of zero/nonzero state of subset of lanes:
const size_t max_lanes = HWY_MIN(N, size_t(10));
for (size_t code = 0; code < (1ull << max_lanes); ++code) {
// Number of zeros written = number of mask lanes that are true.
size_t expected = 0;
for (size_t i = 0; i < max_lanes; ++i) {
const bool is_true = (code & (1ull << i)) != 0;
bool_lanes[i] = is_true ? TI(1) : TI(0);
expected += is_true;
}
const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
const size_t actual = CountTrue(d, mask);
HWY_ASSERT_EQ(expected, actual);
}
}
};
HWY_NOINLINE void TestAllCountTrue() {
ForAllTypes(ForPartialVectors<TestCountTrue>());
}
struct TestFindFirstTrue {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
using TI = MakeSigned<T>; // For mask > 0 comparison
const Rebind<TI, D> di;
const size_t N = Lanes(di);
auto bool_lanes = AllocateAligned<TI>(N);
memset(bool_lanes.get(), 0, N * sizeof(TI));
// For all combinations of zero/nonzero state of subset of lanes:
const size_t max_lanes = HWY_MIN(N, size_t(10));
HWY_ASSERT_EQ(intptr_t(-1), FindFirstTrue(d, MaskFalse(d)));
HWY_ASSERT_EQ(intptr_t(0), FindFirstTrue(d, MaskTrue(d)));
for (size_t code = 1; code < (1ull << max_lanes); ++code) {
for (size_t i = 0; i < max_lanes; ++i) {
bool_lanes[i] = (code & (1ull << i)) ? TI(1) : TI(0);
}
const intptr_t expected =
static_cast<intptr_t>(Num0BitsBelowLS1Bit_Nonzero32(uint32_t(code)));
const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
const intptr_t actual = FindFirstTrue(d, mask);
HWY_ASSERT_EQ(expected, actual);
}
}
};
HWY_NOINLINE void TestAllFindFirstTrue() {
ForAllTypes(ForPartialVectors<TestFindFirstTrue>());
}
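// Illustrative example (not part of the patch): for code = 0b0100 the loop
// above sets bool_lanes = {0, 0, 1, 0, ...}, so FindFirstTrue returns 2,
// matching Num0BitsBelowLS1Bit_Nonzero32(0b0100); an all-false mask yields -1.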
struct TestLogicalMask {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const auto m0 = MaskFalse(d);
const auto m_all = MaskTrue(d);
using TI = MakeSigned<T>; // For mask > 0 comparison
const Rebind<TI, D> di;
const size_t N = Lanes(di);
auto bool_lanes = AllocateAligned<TI>(N);
memset(bool_lanes.get(), 0, N * sizeof(TI));
HWY_ASSERT_MASK_EQ(d, m0, Not(m_all));
HWY_ASSERT_MASK_EQ(d, m_all, Not(m0));
// For all combinations of zero/nonzero state of subset of lanes:
const size_t max_lanes = HWY_MIN(N, size_t(6));
for (size_t code = 0; code < (1ull << max_lanes); ++code) {
for (size_t i = 0; i < max_lanes; ++i) {
bool_lanes[i] = (code & (1ull << i)) ? TI(1) : TI(0);
}
const auto m = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
HWY_ASSERT_MASK_EQ(d, m0, Xor(m, m));
HWY_ASSERT_MASK_EQ(d, m0, AndNot(m, m));
HWY_ASSERT_MASK_EQ(d, m0, AndNot(m_all, m));
HWY_ASSERT_MASK_EQ(d, m, Or(m, m));
HWY_ASSERT_MASK_EQ(d, m, Or(m0, m));
HWY_ASSERT_MASK_EQ(d, m, Or(m, m0));
HWY_ASSERT_MASK_EQ(d, m, Xor(m0, m));
HWY_ASSERT_MASK_EQ(d, m, Xor(m, m0));
HWY_ASSERT_MASK_EQ(d, m, And(m, m));
HWY_ASSERT_MASK_EQ(d, m, And(m_all, m));
HWY_ASSERT_MASK_EQ(d, m, And(m, m_all));
HWY_ASSERT_MASK_EQ(d, m, AndNot(m0, m));
}
}
};
HWY_NOINLINE void TestAllLogicalMask() {
ForAllTypes(ForPartialVectors<TestLogicalMask>());
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
HWY_BEFORE_TEST(HwyMaskTest);
HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFromVec);
HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFirstN);
HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllIfThenElse);
HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllMaskVec);
HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllMaskedLoad);
HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllAllTrueFalse);
HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllStoreMaskBits);
HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllCountTrue);
HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFindFirstTrue);
HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllLogicalMask);
} // namespace hwy
// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char **argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
#endif
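Reviewer note (illustrative, not part of the patch): TestStoreMaskBits above checks that lane i of the mask lands in bit (i % 8) of output byte (i / 8), with any padding bits in the final byte left at zero. A standalone scalar sketch of that packing:

#include <cstddef>
#include <cstdint>
#include <vector>

// Packs one bool per lane into bytes, least-significant bit first within each
// byte; mirrors the `expected` array built in TestStoreMaskBits.
static std::vector<uint8_t> PackMaskBits(const std::vector<bool>& lanes) {
  std::vector<uint8_t> bytes((lanes.size() + 7) / 8, 0);
  for (size_t i = 0; i < lanes.size(); ++i) {
    bytes[i / 8] =
        static_cast<uint8_t>(bytes[i / 8] | (uint8_t(lanes[i]) << (i % 8)));
  }
  return bytes;
}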

third_party/highway/hwy/tests/memory_test.cc (vendored, 54 lines changed)

@ -85,6 +85,8 @@ HWY_NOINLINE void TestAllLoadStore() {
struct TestStoreInterleaved3 {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
// TODO(janwas): restore once segment intrinsics are available
#if HWY_TARGET != HWY_RVV
const size_t N = Lanes(d);
RandomState rng;
@ -124,6 +126,9 @@ struct TestStoreInterleaved3 {
HWY_ASSERT(false);
}
}
#else
(void)d;
#endif
}
};
@ -140,6 +145,8 @@ HWY_NOINLINE void TestAllStoreInterleaved3() {
struct TestStoreInterleaved4 {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
// TODO(janwas): restore once segment intrinsics are available
#if HWY_TARGET != HWY_RVV
const size_t N = Lanes(d);
RandomState rng;
@ -182,6 +189,9 @@ struct TestStoreInterleaved4 {
HWY_ASSERT(false);
}
}
#else
(void)d;
#endif
}
};
@ -287,8 +297,8 @@ struct TestScatter {
std::fill(expected.get(), expected.get() + range, T(0));
std::fill(actual.get(), actual.get() + range, T(0));
for (size_t i = 0; i < N; ++i) {
offsets[i] =
static_cast<Offset>(Random32(&rng) % (max_bytes - sizeof(T)));
// Must be aligned
offsets[i] = static_cast<Offset>((Random32(&rng) % range) * sizeof(T));
CopyBytes<sizeof(T)>(
bytes.get() + i * sizeof(T),
reinterpret_cast<uint8_t*>(expected.get()) + offsets[i]);
@ -307,7 +317,7 @@ struct TestScatter {
for (size_t i = 0; i < N; ++i) {
offsets[i] = static_cast<Offset>(Random32(&rng) % range);
CopyBytes<sizeof(T)>(bytes.get() + i * sizeof(T),
&expected[offsets[i]]);
&expected[size_t(offsets[i])]);
}
const auto vindices = Load(d_offsets, offsets.get());
ScatterIndex(data, d, actual.get(), vindices);
@ -321,17 +331,7 @@ struct TestScatter {
};
HWY_NOINLINE void TestAllScatter() {
// No u8,u16,i8,i16.
const ForPartialVectors<TestScatter> test;
test(uint32_t());
test(int32_t());
#if HWY_CAP_INTEGER64
test(uint64_t());
test(int64_t());
#endif
ForFloatTypes(test);
ForUIF3264(ForPartialVectors<TestScatter>());
}
struct TestGather {
@ -340,11 +340,12 @@ struct TestGather {
using Offset = MakeSigned<T>;
const size_t N = Lanes(d);
const size_t range = 4 * N; // number of items to gather
const size_t max_bytes = range * sizeof(T); // upper bound on offset
RandomState rng;
// Data to be gathered from
const size_t max_bytes = 4 * N * sizeof(T); // upper bound on offset
auto bytes = AllocateAligned<uint8_t>(max_bytes);
for (size_t i = 0; i < max_bytes; ++i) {
bytes[i] = static_cast<uint8_t>(Random32(&rng) & 0xFF);
@ -357,8 +358,8 @@ struct TestGather {
for (size_t rep = 0; rep < 100; ++rep) {
// Offsets
for (size_t i = 0; i < N; ++i) {
offsets[i] =
static_cast<Offset>(Random32(&rng) % (max_bytes - sizeof(T)));
// Must be aligned
offsets[i] = static_cast<Offset>((Random32(&rng) % range) * sizeof(T));
CopyBytes<sizeof(T)>(bytes.get() + offsets[i], &expected[i]);
}
@ -380,16 +381,7 @@ struct TestGather {
};
HWY_NOINLINE void TestAllGather() {
// No u8,u16,i8,i16.
const ForPartialVectors<TestGather> test;
test(uint32_t());
test(int32_t());
#if HWY_CAP_INTEGER64
test(uint64_t());
test(int64_t());
#endif
ForFloatTypes(test);
ForUIF3264(ForPartialVectors<TestGather>());
}
HWY_NOINLINE void TestAllCache() {
@ -407,6 +399,7 @@ HWY_NOINLINE void TestAllCache() {
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
HWY_BEFORE_TEST(HwyMemoryTest);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadStore);
@ -418,4 +411,11 @@ HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllScatter);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllGather);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllCache);
} // namespace hwy
// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
#endif
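Reviewer note (illustrative, not part of the patch): TestGather above builds its expected values by copying sizeof(T) bytes from `bytes + offsets[i]`, where each offset is now required to be a multiple of sizeof(T). A scalar sketch of that reference computation:

#include <cstddef>
#include <cstdint>
#include <cstring>

// Scalar reference for an offset-based gather: `offsets` holds byte offsets
// into `base`, each assumed to be a multiple of sizeof(T).
template <typename T, typename Offset>
void GatherOffsetScalar(const uint8_t* base, const Offset* offsets, size_t n,
                        T* out) {
  for (size_t i = 0; i < n; ++i) {
    std::memcpy(&out[i], base + offsets[i], sizeof(T));
  }
}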

third_party/highway/hwy/tests/swizzle_test.cc (vendored, 1015 lines changed; diff not shown because of its size)

third_party/highway/hwy/tests/test_util-inl.h (vendored, 722 lines changed)

@ -12,254 +12,18 @@
// See the License for the specific language governing permissions and
// limitations under the License.
// Normal include guard for non-SIMD portion of this header.
#ifndef HWY_TESTS_TEST_UTIL_H_
#define HWY_TESTS_TEST_UTIL_H_
// Target-specific helper functions for use by *_test.cc.
// Helper functions for use by *_test.cc.
#include <stddef.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <cstddef>
#include <string>
#include <utility> // std::forward
#include "hwy/aligned_allocator.h"
#include "hwy/base.h"
#include "hwy/highway.h"
#include "gtest/gtest.h"
namespace hwy {
// The maximum vector size used in tests when defining test data. DEPRECATED.
constexpr size_t kTestMaxVectorSize = 64;
// googletest before 1.10 didn't define INSTANTIATE_TEST_SUITE_P() but instead
// used INSTANTIATE_TEST_CASE_P which is now deprecated.
#ifdef INSTANTIATE_TEST_SUITE_P
#define HWY_GTEST_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_SUITE_P
#else
#define HWY_GTEST_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_CASE_P
#endif
// Helper class to run parametric tests using the hwy target as parameter. To
// use this define the following in your test:
// class MyTestSuite : public TestWithParamTarget {
// ...
// };
// HWY_TARGET_INSTANTIATE_TEST_SUITE_P(MyTestSuite);
// TEST_P(MyTestSuite, MyTest) { ... }
class TestWithParamTarget : public testing::TestWithParam<uint32_t> {
protected:
void SetUp() override { SetSupportedTargetsForTest(GetParam()); }
void TearDown() override {
// Check that the parametric test calls SupportedTargets() when the source
// was compiled with more than one target. In the single-target case only
// static dispatch will be used anyway.
#if (HWY_TARGETS & (HWY_TARGETS - 1)) != 0
EXPECT_TRUE(SupportedTargetsCalledForTest())
<< "This hwy target parametric test doesn't use dynamic-dispatch and "
"doesn't need to be parametric.";
#endif
SetSupportedTargetsForTest(0);
}
};
// Function to convert the test parameter of a TestWithParamTarget for
// displaying it in the gtest test name.
static inline std::string TestParamTargetName(
const testing::TestParamInfo<uint32_t>& info) {
return TargetName(info.param);
}
#define HWY_TARGET_INSTANTIATE_TEST_SUITE_P(suite) \
HWY_GTEST_INSTANTIATE_TEST_SUITE_P( \
suite##Group, suite, \
testing::ValuesIn(::hwy::SupportedAndGeneratedTargets()), \
::hwy::TestParamTargetName)
// Helper class similar to TestWithParamTarget to run parametric tests that
// depend on the target and another parametric test. If you need to use multiple
// extra parameters use a std::tuple<> of them and ::testing::Generate(...) as
// the generator. To use this class define the following in your test:
// class MyTestSuite : public TestWithParamTargetT<int> {
// ...
// };
// HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T(MyTestSuite, ::testing::Range(0, 9));
// TEST_P(MyTestSuite, MyTest) { ... GetParam() .... }
template <typename T>
class TestWithParamTargetAndT
: public ::testing::TestWithParam<std::tuple<uint32_t, T>> {
public:
// Expose the parametric type here so it can be used by the
// HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T macro.
using HwyParamType = T;
protected:
void SetUp() override {
SetSupportedTargetsForTest(std::get<0>(
::testing::TestWithParam<std::tuple<uint32_t, T>>::GetParam()));
}
void TearDown() override {
// Check that the parametric test calls SupportedTargets() when the source
// was compiled with more than one target. In the single-target case only
// static dispatch will be used anyway.
#if (HWY_TARGETS & (HWY_TARGETS - 1)) != 0
EXPECT_TRUE(SupportedTargetsCalledForTest())
<< "This hwy target parametric test doesn't use dynamic-dispatch and "
"doesn't need to be parametric.";
#endif
SetSupportedTargetsForTest(0);
}
T GetParam() {
return std::get<1>(
::testing::TestWithParam<std::tuple<uint32_t, T>>::GetParam());
}
};
template <typename T>
std::string TestParamTargetNameAndT(
const testing::TestParamInfo<std::tuple<uint32_t, T>>& info) {
return std::string(TargetName(std::get<0>(info.param))) + "_" +
::testing::PrintToString(std::get<1>(info.param));
}
#define HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T(suite, generator) \
HWY_GTEST_INSTANTIATE_TEST_SUITE_P( \
suite##Group, suite, \
::testing::Combine( \
testing::ValuesIn(::hwy::SupportedAndGeneratedTargets()), \
generator), \
::hwy::TestParamTargetNameAndT<suite::HwyParamType>)
// Helper macro to export a function and define a test that tests it. This is
// equivalent to do a HWY_EXPORT of a void(void) function and run it in a test:
// class MyTestSuite : public TestWithParamTarget {
// ...
// };
// HWY_TARGET_INSTANTIATE_TEST_SUITE_P(MyTestSuite);
// HWY_EXPORT_AND_TEST_P(MyTestSuite, MyTest);
#define HWY_EXPORT_AND_TEST_P(suite, func_name) \
HWY_EXPORT(func_name); \
TEST_P(suite, func_name) { HWY_DYNAMIC_DISPATCH(func_name)(); } \
static_assert(true, "For requiring trailing semicolon")
#define HWY_EXPORT_AND_TEST_P_T(suite, func_name) \
HWY_EXPORT(func_name); \
TEST_P(suite, func_name) { HWY_DYNAMIC_DISPATCH(func_name)(GetParam()); } \
static_assert(true, "For requiring trailing semicolon")
#define HWY_BEFORE_TEST(suite) \
class suite : public hwy::TestWithParamTarget {}; \
HWY_TARGET_INSTANTIATE_TEST_SUITE_P(suite); \
static_assert(true, "For requiring trailing semicolon")
// 64-bit random generator (Xorshift128+). Much smaller state than std::mt19937,
// which triggers a compiler bug.
class RandomState {
public:
explicit RandomState(const uint64_t seed = 0x123456789ull) {
s0_ = SplitMix64(seed + 0x9E3779B97F4A7C15ull);
s1_ = SplitMix64(s0_);
}
HWY_INLINE uint64_t operator()() {
uint64_t s1 = s0_;
const uint64_t s0 = s1_;
const uint64_t bits = s1 + s0;
s0_ = s0;
s1 ^= s1 << 23;
s1 ^= s0 ^ (s1 >> 18) ^ (s0 >> 5);
s1_ = s1;
return bits;
}
private:
static uint64_t SplitMix64(uint64_t z) {
z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull;
z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
return z ^ (z >> 31);
}
uint64_t s0_;
uint64_t s1_;
};
static HWY_INLINE uint32_t Random32(RandomState* rng) {
return static_cast<uint32_t>((*rng)());
}
// Prevents the compiler from eliding the computations that led to "output".
// Works by indicating to the compiler that "output" is being read and modified.
// The +r constraint avoids unnecessary writes to memory, but only works for
// built-in types.
template <class T>
inline void PreventElision(T&& output) {
#if HWY_COMPILER_MSVC
(void)output;
#else // HWY_COMPILER_MSVC
asm volatile("" : "+r"(output) : : "memory");
#endif // HWY_COMPILER_MSVC
}
// Returns a name for the vector/part/scalar. The type prefix is u/i/f for
// unsigned/signed/floating point, followed by the number of bits per lane;
// then 'x' followed by the number of lanes. Example: u8x16. This is useful for
// understanding which instantiation of a generic test failed.
template <typename T>
static inline std::string TypeName(T /*unused*/, size_t N) {
const char prefix = IsFloat<T>() ? 'f' : (IsSigned<T>() ? 'i' : 'u');
char name[64];
// Omit the xN suffix for scalars.
if (N == 1) {
snprintf(name, sizeof(name), "%c%zu", prefix, sizeof(T) * 8);
} else {
snprintf(name, sizeof(name), "%c%zux%zu", prefix, sizeof(T) * 8, N);
}
return name;
}
// String comparison
template <typename T1, typename T2>
inline bool BytesEqual(const T1* p1, const T2* p2, const size_t size,
size_t* pos = nullptr) {
const uint8_t* bytes1 = reinterpret_cast<const uint8_t*>(p1);
const uint8_t* bytes2 = reinterpret_cast<const uint8_t*>(p2);
for (size_t i = 0; i < size; ++i) {
if (bytes1[i] != bytes2[i]) {
fprintf(stderr, "Mismatch at byte %zu of %zu: %d != %d (%s, %s)\n", i,
size, bytes1[i], bytes2[i], TypeName(T1(), 1).c_str(),
TypeName(T2(), 1).c_str());
if (pos != nullptr) {
*pos = i;
}
return false;
}
}
return true;
}
inline bool StringsEqual(const char* s1, const char* s2) {
while (*s1 == *s2++) {
if (*s1++ == '\0') return true;
}
return false;
}
} // namespace hwy
#endif // HWY_TESTS_TEST_UTIL_H_
#include "hwy/tests/hwy_gtest.h"
#include "hwy/tests/test_util.h"
// Per-target include guard
#if defined(HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_) == defined(HWY_TARGET_TOGGLE)
#if defined(HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_) == \
defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_
#undef HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_
#else
@ -270,164 +34,139 @@ HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_NOINLINE void PrintValue(T value) {
uint8_t byte;
CopyBytes<1>(&value, &byte); // endian-safe: we ensured sizeof(T)=1.
fprintf(stderr, "0x%02X,", byte);
}
#if HWY_CAP_FLOAT16
HWY_NOINLINE void PrintValue(float16_t value) {
uint16_t bits;
CopyBytes<2>(&value, &bits);
fprintf(stderr, "0x%02X,", bits);
}
#endif
template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
HWY_NOINLINE void PrintValue(T value) {
fprintf(stderr, "%g,", double(value));
}
// Prints lanes around `lane`, in memory order.
template <class D>
HWY_NOINLINE void Print(const D d, const char* caption, const Vec<D> v,
intptr_t lane = 0) {
template <class D, class V = Vec<D>>
void Print(const D d, const char* caption, VecArg<V> v, size_t lane_u = 0,
size_t max_lanes = 7) {
using T = TFromD<D>;
const size_t N = Lanes(d);
auto lanes = AllocateAligned<T>(N);
Store(v, d, lanes.get());
const size_t begin = static_cast<size_t>(std::max<intptr_t>(0, lane - 2));
const size_t end = std::min(begin + 7, N);
fprintf(stderr, "%s %s [%zu+ ->]:\n ", TypeName(T(), N).c_str(), caption,
begin);
for (size_t i = begin; i < end; ++i) {
fprintf(stderr, "%g,", double(lanes[i]));
}
if (begin >= end) fprintf(stderr, "(out of bounds)");
fprintf(stderr, "\n");
}
static HWY_NORETURN HWY_NOINLINE void NotifyFailure(
const char* filename, const int line, const char* type_name,
const size_t lane, const char* expected, const char* actual) {
hwy::Abort(filename, line,
"%s, %s lane %zu mismatch: expected '%s', got '%s'.\n",
hwy::TargetName(HWY_TARGET), type_name, lane, expected, actual);
}
template <class Out, class In>
inline Out BitCast(const In& in) {
static_assert(sizeof(Out) == sizeof(In), "");
Out out;
CopyBytes<sizeof(out)>(&in, &out);
return out;
}
// Computes the difference in units of last place between x and y.
template <typename TF>
MakeUnsigned<TF> ComputeUlpDelta(TF x, TF y) {
static_assert(IsFloat<TF>(), "Only makes sense for floating-point");
using TU = MakeUnsigned<TF>;
// Handle -0 == 0 and infinities.
if (x == y) return 0;
// Consider "equal" if both are NaN, so we can verify an expected NaN.
// Needs a special case because there are many possible NaN representations.
if (std::isnan(x) && std::isnan(y)) return 0;
// NOTE: no need to check for differing signs; they will result in large
// differences, which is fine, and we avoid overflow.
const TU ux = BitCast<TU>(x);
const TU uy = BitCast<TU>(y);
// Avoid unsigned->signed cast: 2's complement is only guaranteed by C++20.
return std::max(ux, uy) - std::min(ux, uy);
}
template <typename T, HWY_IF_NOT_FLOAT(T)>
HWY_NOINLINE bool IsEqual(const T expected, const T actual) {
return expected == actual;
}
template <typename T, HWY_IF_FLOAT(T)>
HWY_NOINLINE bool IsEqual(const T expected, const T actual) {
return ComputeUlpDelta(expected, actual) <= 1;
}
// Compare non-vector, non-string T.
template <typename T>
HWY_NOINLINE void AssertEqual(const T expected, const T actual,
const std::string& type_name,
const char* filename = "", const int line = -1,
const size_t lane = 0) {
if (!IsEqual(expected, actual)) {
char expected_str[100];
snprintf(expected_str, sizeof(expected_str), "%g", double(expected));
char actual_str[100];
snprintf(actual_str, sizeof(actual_str), "%g", double(actual));
NotifyFailure(filename, line, type_name.c_str(), lane, expected_str,
actual_str);
}
}
static HWY_NOINLINE HWY_MAYBE_UNUSED void AssertStringEqual(
const char* expected, const char* actual, const char* filename = "",
const int line = -1, const size_t lane = 0) {
if (!hwy::StringsEqual(expected, actual)) {
NotifyFailure(filename, line, "string", lane, expected, actual);
}
const auto info = hwy::detail::MakeTypeInfo<T>();
hwy::detail::PrintArray(info, caption, lanes.get(), N, lane_u, max_lanes);
}
// Compare expected vector to vector.
template <class D, class V>
HWY_NOINLINE void AssertVecEqual(D d, const V expected, const V actual,
const char* filename, const int line) {
using T = TFromD<D>;
template <class D, typename T = TFromD<D>, class V = Vec<D>>
void AssertVecEqual(D d, const T* expected, VecArg<V> actual,
const char* filename, const int line) {
const size_t N = Lanes(d);
auto expected_lanes = AllocateAligned<T>(N);
auto actual_lanes = AllocateAligned<T>(N);
Store(expected, d, expected_lanes.get());
Store(actual, d, actual_lanes.get());
for (size_t i = 0; i < N; ++i) {
if (!IsEqual(expected_lanes[i], actual_lanes[i])) {
fprintf(stderr, "\n\n");
Print(d, "expect", expected, i);
Print(d, "actual", actual, i);
char expected_str[100];
snprintf(expected_str, sizeof(expected_str), "%g",
double(expected_lanes[i]));
char actual_str[100];
snprintf(actual_str, sizeof(actual_str), "%g", double(actual_lanes[i]));
NotifyFailure(filename, line, hwy::TypeName(T(), N).c_str(), i,
expected_str, actual_str);
}
}
const auto info = hwy::detail::MakeTypeInfo<T>();
const char* target_name = hwy::TargetName(HWY_TARGET);
hwy::detail::AssertArrayEqual(info, expected, actual_lanes.get(), N,
target_name, filename, line);
}
// Compare expected lanes to vector.
template <class D>
HWY_NOINLINE void AssertVecEqual(D d, const TFromD<D>* expected, Vec<D> actual,
template <class D, typename T = TFromD<D>, class V = Vec<D>>
HWY_NOINLINE void AssertVecEqual(D d, VecArg<V> expected, VecArg<V> actual,
const char* filename, int line) {
AssertVecEqual(d, LoadU(d, expected), actual, filename, line);
auto expected_lanes = AllocateAligned<T>(Lanes(d));
Store(expected, d, expected_lanes.get());
AssertVecEqual(d, expected_lanes.get(), actual, filename, line);
}
// Only checks the valid mask elements (those whose index < Lanes(d)).
template <class D>
HWY_NOINLINE void AssertMaskEqual(D d, Mask<D> a, Mask<D> b,
HWY_NOINLINE void AssertMaskEqual(D d, VecArg<Mask<D>> a, VecArg<Mask<D>> b,
const char* filename, int line) {
AssertVecEqual(d, VecFromMask(d, a), VecFromMask(d, b), filename, line);
const std::string type_name = TypeName(TFromD<D>(), Lanes(d));
AssertEqual(CountTrue(a), CountTrue(b), type_name, filename, line, 0);
AssertEqual(AllTrue(a), AllTrue(b), type_name, filename, line, 0);
AssertEqual(AllFalse(a), AllFalse(b), type_name, filename, line, 0);
const char* target_name = hwy::TargetName(HWY_TARGET);
AssertEqual(CountTrue(d, a), CountTrue(d, b), target_name, filename, line);
AssertEqual(AllTrue(d, a), AllTrue(d, b), target_name, filename, line);
AssertEqual(AllFalse(d, a), AllFalse(d, b), target_name, filename, line);
// TODO(janwas): StoreMaskBits
// TODO(janwas): remove RVV once implemented (cast or vse1)
#if HWY_TARGET != HWY_RVV && HWY_TARGET != HWY_SCALAR
const size_t N = Lanes(d);
const Repartition<uint8_t, D> d8;
const size_t N8 = Lanes(d8);
auto bits_a = AllocateAligned<uint8_t>(HWY_MAX(8, N8));
auto bits_b = AllocateAligned<uint8_t>(HWY_MAX(8, N8));
memset(bits_a.get(), 0, N8);
memset(bits_b.get(), 0, N8);
const size_t num_bytes_a = StoreMaskBits(d, a, bits_a.get());
const size_t num_bytes_b = StoreMaskBits(d, b, bits_b.get());
AssertEqual(num_bytes_a, num_bytes_b, target_name, filename, line);
size_t i = 0;
// First check whole bytes (if that many elements are still valid)
for (; i < N / 8; ++i) {
if (bits_a[i] != bits_b[i]) {
fprintf(stderr, "Mismatch in byte %" PRIu64 ": %d != %d\n",
static_cast<uint64_t>(i), bits_a[i], bits_b[i]);
Print(d8, "expect", Load(d8, bits_a.get()), 0, N8);
Print(d8, "actual", Load(d8, bits_b.get()), 0, N8);
hwy::Abort(filename, line, "Masks not equal");
}
}
// Then the valid bit(s) in the last byte.
const size_t remainder = N % 8;
if (remainder != 0) {
const int mask = (1 << remainder) - 1;
const int valid_a = bits_a[i] & mask;
const int valid_b = bits_b[i] & mask;
if (valid_a != valid_b) {
fprintf(stderr, "Mismatch in last byte %" PRIu64 ": %d != %d\n",
static_cast<uint64_t>(i), valid_a, valid_b);
Print(d8, "expect", Load(d8, bits_a.get()), 0, N8);
Print(d8, "actual", Load(d8, bits_b.get()), 0, N8);
hwy::Abort(filename, line, "Masks not equal");
}
}
#endif
}
// Only sets valid elements (those whose index < Lanes(d)). This helps catch
// tests that are not masking off the (undefined) upper mask elements.
//
// TODO(janwas): with HWY_NOINLINE GCC zeros the upper half of AVX2 masks.
template <class D>
HWY_INLINE Mask<D> MaskTrue(const D d) {
return FirstN(d, Lanes(d));
}
template <class D>
HWY_NOINLINE Mask<D> MaskTrue(const D d) {
const auto v0 = Zero(d);
return Eq(v0, v0);
}
template <class D>
HWY_NOINLINE Mask<D> MaskFalse(const D d) {
// Lt is only for signed types and we cannot yet cast mask types.
return Eq(Zero(d), Set(d, 1));
HWY_INLINE Mask<D> MaskFalse(const D d) {
const auto zero = Zero(RebindToSigned<D>());
return RebindMask(d, Lt(zero, zero));
}
#ifndef HWY_ASSERT_EQ
#define HWY_ASSERT_EQ(expected, actual) \
AssertEqual(expected, actual, hwy::TypeName(expected, 1), __FILE__, __LINE__)
#define HWY_ASSERT_EQ(expected, actual) \
hwy::AssertEqual(expected, actual, hwy::TargetName(HWY_TARGET), __FILE__, \
__LINE__)
#define HWY_ASSERT_STRING_EQ(expected, actual) \
AssertStringEqual(expected, actual, __FILE__, __LINE__)
#define HWY_ASSERT_STRING_EQ(expected, actual) \
hwy::AssertStringEqual(expected, actual, hwy::TargetName(HWY_TARGET), \
__FILE__, __LINE__)
#define HWY_ASSERT_VEC_EQ(d, expected, actual) \
AssertVecEqual(d, expected, actual, __FILE__, __LINE__)
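// Example (sketch) of a per-lane-type test functor that these assertion macros
// are used with; Iota/Add/Zero are standard Highway ops, the functor name is
// illustrative:
//   struct TestAddZero {
//     template <typename T, class D>
//     HWY_NOINLINE void operator()(T /*unused*/, D d) {
//       const auto v = Iota(d, 1);                 // lanes 1, 2, 3, ...
//       HWY_ASSERT_VEC_EQ(d, v, Add(v, Zero(d)));  // v + 0 == v
//     }
//   };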
@ -439,83 +178,182 @@ HWY_NOINLINE Mask<D> MaskFalse(const D d) {
// Helpers for instantiating tests with combinations of lane types / counts.
// For all powers of two in [kMinLanes, N * kMinLanes] (so that recursion stops
// at N == 0)
template <typename T, size_t N, size_t kMinLanes, class Test>
// For ensuring we do not call tests with D such that widening D results in 0
// lanes. Example: assume T=u32, VLEN=256, and fraction=1/8: there is no 1/8th
// of a u64 vector in this case.
template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)>
HWY_INLINE size_t PromotedLanes(const D d) {
return Lanes(RepartitionToWide<decltype(d)>());
}
// Already the widest possible T, cannot widen.
template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
HWY_INLINE size_t PromotedLanes(const D d) {
return Lanes(d);
}
// For all power of two N in [kMinLanes, kMul * kMinLanes] (so that recursion
// stops at kMul == 0). Note that N may be capped or a fraction.
template <typename T, size_t kMul, size_t kMinLanes, class Test,
bool kPromote = false>
struct ForeachSizeR {
static void Do() {
static_assert(N != 0, "End of recursion");
Test()(T(), Simd<T, N * kMinLanes>());
ForeachSizeR<T, N / 2, kMinLanes, Test>::Do();
const Simd<T, kMul * kMinLanes> d;
// Skip invalid fractions (e.g. 1/8th of u32x4).
const size_t lanes = kPromote ? PromotedLanes(d) : Lanes(d);
if (lanes < kMinLanes) return;
Test()(T(), d);
static_assert(kMul != 0, "Recursion should have ended already");
ForeachSizeR<T, kMul / 2, kMinLanes, Test, kPromote>::Do();
}
};
// Base case to stop the recursion.
template <typename T, size_t kMinLanes, class Test>
struct ForeachSizeR<T, 0, kMinLanes, Test> {
template <typename T, size_t kMinLanes, class Test, bool kPromote>
struct ForeachSizeR<T, 0, kMinLanes, Test, kPromote> {
static void Do() {}
};
// These adapters may be called directly, or via For*Types:
// Calls Test for all powers of two in [kMinLanes, HWY_LANES(T) / kDivLanes].
template <class Test, size_t kDivLanes = 1, size_t kMinLanes = 1>
struct ForPartialVectors {
template <typename T>
void operator()(T /*unused*/) const {
#if HWY_TARGET == HWY_RVV
// Only m1..8 for now, can ignore kMaxLanes because HWY_*_LANES are full.
ForeachSizeR<T, 8 / kDivLanes, HWY_LANES(T), Test>::Do();
#else
ForeachSizeR<T, HWY_LANES(T) / kDivLanes / kMinLanes, kMinLanes,
Test>::Do();
#endif
}
};
// Calls Test for all vectors that can be demoted log2(kFactor) times.
template <class Test, size_t kFactor>
struct ForDemoteVectors {
template <typename T>
void operator()(T /*unused*/) const {
#if HWY_TARGET == HWY_RVV
// Only m1..8 for now.
ForeachSizeR<T, 8 / kFactor, kFactor * HWY_LANES(T), Test>::Do();
#else
ForeachSizeR<T, HWY_LANES(T), 1, Test>::Do();
#endif
}
};
// Calls Test for all powers of two in [128 bits, max bits].
template <class Test>
struct ForGE128Vectors {
template <typename T>
void operator()(T /*unused*/) const {
#if HWY_TARGET == HWY_RVV
ForeachSizeR<T, 8, HWY_LANES(T), Test>::Do();
#else
ForeachSizeR<T, HWY_LANES(T) / (16 / sizeof(T)), (16 / sizeof(T)),
Test>::Do();
#endif
}
};
// Calls Test for all vectors that can be expanded by kFactor.
// Calls Test for all power of two N in [1, Lanes(d) / kFactor]. This is for
// ops that widen their input, e.g. Combine (not supported by HWY_SCALAR).
template <class Test, size_t kFactor = 2>
struct ForExtendableVectors {
template <typename T>
void operator()(T /*unused*/) const {
#if HWY_TARGET == HWY_RVV
ForeachSizeR<T, 8 / kFactor, HWY_LANES(T), Test>::Do();
#if HWY_TARGET == HWY_SCALAR
// not supported
#else
ForeachSizeR<T, HWY_LANES(T) / kFactor / (16 / sizeof(T)), (16 / sizeof(T)),
constexpr bool kPromote = true;
#if HWY_TARGET == HWY_RVV
ForeachSizeR<T, 8 / kFactor, HWY_LANES(T), Test, kPromote>::Do();
// TODO(janwas): also capped
// ForeachSizeR<T, (16 / sizeof(T)) / kFactor, 1, Test, kPromote>::Do();
#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2
// Capped
ForeachSizeR<T, (16 / sizeof(T)) / kFactor, 1, Test, kPromote>::Do();
// Fractions
ForeachSizeR<T, 8 / kFactor, HWY_LANES(T) / 8, Test, kPromote>::Do();
#else
ForeachSizeR<T, HWY_LANES(T) / kFactor, 1, Test, kPromote>::Do();
#endif
#endif // HWY_SCALAR
}
};
// Calls Test for all power of two N in [kFactor, Lanes(d)]. This is for ops
// that narrow their input, e.g. UpperHalf.
template <class Test, size_t kFactor = 2>
struct ForShrinkableVectors {
template <typename T>
void operator()(T /*unused*/) const {
#if HWY_TARGET == HWY_SCALAR
// not supported
#elif HWY_TARGET == HWY_RVV
ForeachSizeR<T, 8 / kFactor, kFactor * HWY_LANES(T), Test>::Do();
// TODO(janwas): also capped
#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2
// Capped
ForeachSizeR<T, (16 / sizeof(T)) / kFactor, kFactor, Test>::Do();
// Fractions
ForeachSizeR<T, 8 / kFactor, kFactor * HWY_LANES(T) / 8, Test>::Do();
#elif HWY_TARGET == HWY_SCALAR
// not supported
#else
ForeachSizeR<T, HWY_LANES(T) / kFactor, kFactor, Test>::Do();
#endif
}
};
// Calls Test for all power of two N in [16 / sizeof(T), Lanes(d)]. This is for
// ops that require at least 128 bits, e.g. AES or 64x64 = 128 mul.
template <class Test>
struct ForGE128Vectors {
template <typename T>
void operator()(T /*unused*/) const {
#if HWY_TARGET == HWY_SCALAR
// not supported
#elif HWY_TARGET == HWY_RVV
ForeachSizeR<T, 8, HWY_LANES(T), Test>::Do();
// TODO(janwas): also capped
// ForeachSizeR<T, 1, (16 / sizeof(T)), Test>::Do();
#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2
// Capped
ForeachSizeR<T, 1, 16 / sizeof(T), Test>::Do();
// Fractions
ForeachSizeR<T, 8, HWY_LANES(T) / 8, Test>::Do();
#else
ForeachSizeR<T, HWY_LANES(T) / (16 / sizeof(T)), (16 / sizeof(T)),
Test>::Do();
#endif
}
};
// Calls Test for all power of two N in [8 / sizeof(T), Lanes(d)]. This is for
// ops that require at least 64 bits, e.g. casts.
template <class Test>
struct ForGE64Vectors {
template <typename T>
void operator()(T /*unused*/) const {
#if HWY_TARGET == HWY_SCALAR
// not supported
#elif HWY_TARGET == HWY_RVV
ForeachSizeR<T, 8, HWY_LANES(T), Test>::Do();
// TODO(janwas): also capped
// ForeachSizeR<T, 1, (8 / sizeof(T)), Test>::Do();
#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2
// Capped
ForeachSizeR<T, 1, 8 / sizeof(T), Test>::Do();
// Fractions
ForeachSizeR<T, 8, HWY_LANES(T) / 8, Test>::Do();
#else
ForeachSizeR<T, HWY_LANES(T) / (8 / sizeof(T)), (8 / sizeof(T)),
Test>::Do();
#endif
}
};
// Calls Test for all N that can be promoted (not the same as Extendable because
// HWY_SCALAR has one lane). Also used for ZipLower, but not ZipUpper.
template <class Test, size_t kFactor = 2>
struct ForPromoteVectors {
template <typename T>
void operator()(T /*unused*/) const {
#if HWY_TARGET == HWY_SCALAR
ForeachSizeR<T, 1, 1, Test, /*kPromote=*/true>::Do();
#else
return ForExtendableVectors<Test, kFactor>()(T());
#endif
}
};
// Calls Test for all N that can be demoted (not the same as Shrinkable because
// HWY_SCALAR has one lane). Also used for LowerHalf, but not UpperHalf.
template <class Test, size_t kFactor = 2>
struct ForDemoteVectors {
template <typename T>
void operator()(T /*unused*/) const {
#if HWY_TARGET == HWY_SCALAR
ForeachSizeR<T, 1, 1, Test>::Do();
#else
return ForShrinkableVectors<Test, kFactor>()(T());
#endif
}
};
// Calls Test for all power of two N in [1, Lanes(d)]. This is the default
// for ops that do not narrow nor widen their input, nor require 128 bits.
template <class Test>
struct ForPartialVectors {
template <typename T>
void operator()(T t) const {
ForExtendableVectors<Test, 1>()(t);
}
};
// Type lists to shorten call sites:
template <class Func>
@ -558,6 +396,42 @@ void ForAllTypes(const Func& func) {
ForFloatTypes(func);
}
template <class Func>
void ForUIF3264(const Func& func) {
func(uint32_t());
func(int32_t());
#if HWY_CAP_INTEGER64
func(uint64_t());
func(int64_t());
#endif
ForFloatTypes(func);
}
template <class Func>
void ForUIF163264(const Func& func) {
ForUIF3264(func);
func(uint16_t());
func(int16_t());
#if HWY_CAP_FLOAT16
func(float16_t());
#endif
}
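// Example (sketch): running a functor such as the hypothetical TestAddZero
// above over all lane types and all supported power-of-two lane counts:
//   HWY_NOINLINE void TestAllAddZero() {
//     ForAllTypes(ForPartialVectors<TestAddZero>());
//   }
// Ops that narrow or widen their input would instead use ForShrinkableVectors
// or ForExtendableVectors so that the halved/doubled descriptor remains valid.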
// For tests that involve loops, adjust the trip count so that emulated tests
// finish quickly (but always at least 2 iterations to ensure some diversity).
constexpr size_t AdjustedReps(size_t max_reps) {
#if HWY_ARCH_RVV
return HWY_MAX(max_reps / 16, 2);
#elif HWY_ARCH_ARM
return HWY_MAX(max_reps / 4, 2);
#elif HWY_IS_DEBUG_BUILD
return HWY_MAX(max_reps / 8, 2);
#else
return HWY_MAX(max_reps, 2);
#endif
}
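// Example (sketch): a randomized test loop whose trip count shrinks on
// emulated or debug targets:
//   RandomState rng;
//   for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
//     const uint64_t bits = rng();
//     // ... derive inputs from `bits` and check the op under test ...
//   }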
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy

198
third_party/highway/hwy/tests/test_util.cc vendored Normal file

@ -0,0 +1,198 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/tests/test_util.h"
#include <inttypes.h>
#include <stddef.h>
#include <stdio.h>
#include <cmath>
#include "hwy/base.h"
namespace hwy {
bool BytesEqual(const void* p1, const void* p2, const size_t size,
size_t* pos) {
const uint8_t* bytes1 = reinterpret_cast<const uint8_t*>(p1);
const uint8_t* bytes2 = reinterpret_cast<const uint8_t*>(p2);
for (size_t i = 0; i < size; ++i) {
if (bytes1[i] != bytes2[i]) {
fprintf(stderr, "Mismatch at byte %" PRIu64 " of %" PRIu64 ": %d != %d\n",
static_cast<uint64_t>(i), static_cast<uint64_t>(size), bytes1[i],
bytes2[i]);
if (pos != nullptr) {
*pos = i;
}
return false;
}
}
return true;
}
void AssertStringEqual(const char* expected, const char* actual,
const char* target_name, const char* filename,
int line) {
while (*expected == *actual++) {
if (*expected++ == '\0') return;
}
Abort(filename, line, "%s string mismatch: expected '%s', got '%s'.\n",
target_name, expected, actual);
}
namespace detail {
bool IsEqual(const TypeInfo& info, const void* expected_ptr,
const void* actual_ptr) {
if (!info.is_float) {
return BytesEqual(expected_ptr, actual_ptr, info.sizeof_t);
}
if (info.sizeof_t == 4) {
float expected, actual;
CopyBytes<4>(expected_ptr, &expected);
CopyBytes<4>(actual_ptr, &actual);
return ComputeUlpDelta(expected, actual) <= 1;
} else if (info.sizeof_t == 8) {
double expected, actual;
CopyBytes<8>(expected_ptr, &expected);
CopyBytes<8>(actual_ptr, &actual);
return ComputeUlpDelta(expected, actual) <= 1;
} else {
HWY_ABORT("Unexpected float size %" PRIu64 "\n",
static_cast<uint64_t>(info.sizeof_t));
return false;
}
}
void TypeName(const TypeInfo& info, size_t N, char* string100) {
const char prefix = info.is_float ? 'f' : (info.is_signed ? 'i' : 'u');
// Omit the xN suffix for scalars.
if (N == 1) {
snprintf(string100, 64, "%c%" PRIu64, prefix,
static_cast<uint64_t>(info.sizeof_t * 8));
} else {
snprintf(string100, 64, "%c%" PRIu64 "x%" PRIu64, prefix,
static_cast<uint64_t>(info.sizeof_t * 8),
static_cast<uint64_t>(N));
}
}
void ToString(const TypeInfo& info, const void* ptr, char* string100) {
if (info.sizeof_t == 1) {
uint8_t byte;
CopyBytes<1>(ptr, &byte); // endian-safe: we ensured sizeof(T)=1.
snprintf(string100, 100, "0x%02X", byte);
} else if (info.sizeof_t == 2) {
uint16_t bits;
CopyBytes<2>(ptr, &bits);
snprintf(string100, 100, "0x%04X", bits);
} else if (info.sizeof_t == 4) {
if (info.is_float) {
float value;
CopyBytes<4>(ptr, &value);
snprintf(string100, 100, "%g", double(value));
} else if (info.is_signed) {
int32_t value;
CopyBytes<4>(ptr, &value);
snprintf(string100, 100, "%d", value);
} else {
uint32_t value;
CopyBytes<4>(ptr, &value);
snprintf(string100, 100, "%u", value);
}
} else {
HWY_ASSERT(info.sizeof_t == 8);
if (info.is_float) {
double value;
CopyBytes<8>(ptr, &value);
snprintf(string100, 100, "%g", value);
} else if (info.is_signed) {
int64_t value;
CopyBytes<8>(ptr, &value);
snprintf(string100, 100, "%" PRIi64 "", value);
} else {
uint64_t value;
CopyBytes<8>(ptr, &value);
snprintf(string100, 100, "%" PRIu64 "", value);
}
}
}
void PrintArray(const TypeInfo& info, const char* caption,
const void* array_void, size_t N, size_t lane_u,
size_t max_lanes) {
const uint8_t* array_bytes = reinterpret_cast<const uint8_t*>(array_void);
char type_name[100];
TypeName(info, N, type_name);
const intptr_t lane = intptr_t(lane_u);
const size_t begin = static_cast<size_t>(HWY_MAX(0, lane - 2));
const size_t end = HWY_MIN(begin + max_lanes, N);
fprintf(stderr, "%s %s [%" PRIu64 "+ ->]:\n ", type_name, caption,
static_cast<uint64_t>(begin));
for (size_t i = begin; i < end; ++i) {
const void* ptr = array_bytes + i * info.sizeof_t;
char str[100];
ToString(info, ptr, str);
fprintf(stderr, "%s,", str);
}
if (begin >= end) fprintf(stderr, "(out of bounds)");
fprintf(stderr, "\n");
}
HWY_NORETURN void PrintMismatchAndAbort(const TypeInfo& info,
const void* expected_ptr,
const void* actual_ptr,
const char* target_name,
const char* filename, int line,
size_t lane, size_t num_lanes) {
char type_name[100];
TypeName(info, 1, type_name);
char expected_str[100];
ToString(info, expected_ptr, expected_str);
char actual_str[100];
ToString(info, actual_ptr, actual_str);
Abort(filename, line,
"%s, %sx%" PRIu64 " lane %" PRIu64
" mismatch: expected '%s', got '%s'.\n",
target_name, type_name, static_cast<uint64_t>(num_lanes),
static_cast<uint64_t>(lane), expected_str, actual_str);
}
void AssertArrayEqual(const TypeInfo& info, const void* expected_void,
const void* actual_void, size_t N,
const char* target_name, const char* filename, int line) {
const uint8_t* expected_array =
reinterpret_cast<const uint8_t*>(expected_void);
const uint8_t* actual_array = reinterpret_cast<const uint8_t*>(actual_void);
for (size_t i = 0; i < N; ++i) {
const void* expected_ptr = expected_array + i * info.sizeof_t;
const void* actual_ptr = actual_array + i * info.sizeof_t;
if (!IsEqual(info, expected_ptr, actual_ptr)) {
fprintf(stderr, "\n\n");
PrintArray(info, "expect", expected_array, N, i);
PrintArray(info, "actual", actual_array, N, i);
PrintMismatchAndAbort(info, expected_ptr, actual_ptr, target_name,
filename, line, i, N);
}
}
}
} // namespace detail
} // namespace hwy

185
third_party/highway/hwy/tests/test_util.h vendored Normal file

@ -0,0 +1,185 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef HWY_TESTS_TEST_UTIL_H_
#define HWY_TESTS_TEST_UTIL_H_
// Target-independent helper functions for use by *_test.cc.
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <string>
#include "hwy/aligned_allocator.h"
#include "hwy/base.h"
#include "hwy/highway.h"
namespace hwy {
// The maximum vector size used in tests when defining test data. DEPRECATED.
constexpr size_t kTestMaxVectorSize = 64;
// 64-bit random generator (Xorshift128+). Much smaller state than std::mt19937,
// which triggers a compiler bug.
class RandomState {
public:
explicit RandomState(const uint64_t seed = 0x123456789ull) {
s0_ = SplitMix64(seed + 0x9E3779B97F4A7C15ull);
s1_ = SplitMix64(s0_);
}
HWY_INLINE uint64_t operator()() {
uint64_t s1 = s0_;
const uint64_t s0 = s1_;
const uint64_t bits = s1 + s0;
s0_ = s0;
s1 ^= s1 << 23;
s1 ^= s0 ^ (s1 >> 18) ^ (s0 >> 5);
s1_ = s1;
return bits;
}
private:
static uint64_t SplitMix64(uint64_t z) {
z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull;
z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
return z ^ (z >> 31);
}
uint64_t s0_;
uint64_t s1_;
};
static HWY_INLINE uint32_t Random32(RandomState* rng) {
return static_cast<uint32_t>((*rng)());
}
static HWY_INLINE uint64_t Random64(RandomState* rng) {
return (*rng)();
}
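// Example (sketch) of how tests typically fill input lanes; `d` is any Simd<>
// tag and T its lane type (the mask keeps values representable in every T):
//   RandomState rng;
//   const size_t N = Lanes(d);
//   auto in = AllocateAligned<T>(N);
//   for (size_t i = 0; i < N; ++i) {
//     in[i] = static_cast<T>(Random32(&rng) & 0x7F);
//   }
//   const auto v = Load(d, in.get());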
// Prevents the compiler from eliding the computations that led to "output".
// Works by indicating to the compiler that "output" is being read and modified.
// The +r constraint avoids unnecessary writes to memory, but only works for
// built-in types.
template <class T>
inline void PreventElision(T&& output) {
#if HWY_COMPILER_MSVC
(void)output;
#else // HWY_COMPILER_MSVC
asm volatile("" : "+r"(output) : : "memory");
#endif // HWY_COMPILER_MSVC
}
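// Example (sketch): keeping a benchmarked result alive; `Compute` stands in
// for the code under measurement:
//   uint64_t sum = 0;
//   for (size_t rep = 0; rep < 100; ++rep) {
//     sum += Compute(rep);
//   }
//   PreventElision(sum);  // forces the compiler to materialize `sum`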
bool BytesEqual(const void* p1, const void* p2, const size_t size,
size_t* pos = nullptr);
void AssertStringEqual(const char* expected, const char* actual,
const char* target_name, const char* filename, int line);
namespace detail {
template <typename T, typename TU = MakeUnsigned<T>>
TU ComputeUlpDelta(const T expected, const T actual) {
// Handle -0 == 0 and infinities.
if (expected == actual) return 0;
// Consider "equal" if both are NaN, so we can verify an expected NaN.
// Needs a special case because there are many possible NaN representations.
if (std::isnan(expected) && std::isnan(actual)) return 0;
// Compute the difference in units of last place. We do not need to check for
// differing signs; they will result in large differences, which is fine.
TU ux, uy;
CopyBytes<sizeof(T)>(&expected, &ux);
CopyBytes<sizeof(T)>(&actual, &uy);
// Avoid unsigned->signed cast: 2's complement is only guaranteed by C++20.
const TU ulp = HWY_MAX(ux, uy) - HWY_MIN(ux, uy);
return ulp;
}
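// Example: adjacent floats differ by exactly 1 ULP, so the "<= 1" tolerance
// used by detail::IsEqual accepts results that differ only in the last bit
// (std::nextafter requires <cmath>):
//   const float x = 1.0f;
//   const float y = std::nextafter(x, 2.0f);
//   // ComputeUlpDelta(x, x) == 0, ComputeUlpDelta(x, y) == 1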
// For implementing value comparisons etc. as type-erased functions to reduce
// template bloat.
struct TypeInfo {
size_t sizeof_t;
bool is_float;
bool is_signed;
};
template <typename T>
HWY_INLINE TypeInfo MakeTypeInfo() {
TypeInfo info;
info.sizeof_t = sizeof(T);
info.is_float = IsFloat<T>();
info.is_signed = IsSigned<T>();
return info;
}
bool IsEqual(const TypeInfo& info, const void* expected_ptr,
const void* actual_ptr);
void TypeName(const TypeInfo& info, size_t N, char* string100);
void PrintArray(const TypeInfo& info, const char* caption,
const void* array_void, size_t N, size_t lane_u = 0,
size_t max_lanes = 7);
HWY_NORETURN void PrintMismatchAndAbort(const TypeInfo& info,
const void* expected_ptr,
const void* actual_ptr,
const char* target_name,
const char* filename, int line,
size_t lane = 0, size_t num_lanes = 1);
void AssertArrayEqual(const TypeInfo& info, const void* expected_void,
const void* actual_void, size_t N,
const char* target_name, const char* filename, int line);
} // namespace detail
// Returns a name for the vector/part/scalar. The type prefix is u/i/f for
// unsigned/signed/floating point, followed by the number of bits per lane;
// then 'x' followed by the number of lanes. Example: u8x16. This is useful for
// understanding which instantiation of a generic test failed.
template <typename T>
std::string TypeName(T /*unused*/, size_t N) {
char string100[100];
detail::TypeName(detail::MakeTypeInfo<T>(), N, string100);
return string100;
}
// Compare non-vector, non-string T.
template <typename T>
HWY_INLINE bool IsEqual(const T expected, const T actual) {
const auto info = detail::MakeTypeInfo<T>();
return detail::IsEqual(info, &expected, &actual);
}
template <typename T>
HWY_INLINE void AssertEqual(const T expected, const T actual,
const char* target_name, const char* filename,
int line, size_t lane = 0) {
const auto info = detail::MakeTypeInfo<T>();
if (!detail::IsEqual(info, &expected, &actual)) {
detail::PrintMismatchAndAbort(info, &expected, &actual, target_name,
filename, line, lane);
}
}
} // namespace hwy
#endif // HWY_TESTS_TEST_UTIL_H_


@ -30,19 +30,19 @@ struct TestName {
HWY_NOINLINE void operator()(T t, D d) {
char num[10];
std::string expected = IsFloat<T>() ? "f" : (IsSigned<T>() ? "i" : "u");
snprintf(num, sizeof(num), "%zu", sizeof(T) * 8);
snprintf(num, sizeof(num), "%u" , static_cast<unsigned>(sizeof(T) * 8));
expected += num;
const size_t N = Lanes(d);
if (N != 1) {
expected += 'x';
snprintf(num, sizeof(num), "%zu", N);
snprintf(num, sizeof(num), "%u", static_cast<unsigned>(N));
expected += num;
}
const std::string actual = TypeName(t, N);
if (expected != actual) {
NotifyFailure(__FILE__, __LINE__, expected.c_str(), 0, expected.c_str(),
actual.c_str());
HWY_ABORT("%s mismatch: expected '%s', got '%s'.\n",
hwy::TargetName(HWY_TARGET), expected.c_str(), actual.c_str());
}
}
};
@ -94,9 +94,17 @@ HWY_NOINLINE void TestAllEqual() {
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
HWY_BEFORE_TEST(TestUtilTest);
HWY_EXPORT_AND_TEST_P(TestUtilTest, TestAllName);
HWY_EXPORT_AND_TEST_P(TestUtilTest, TestAllEqual);
} // namespace hwy
// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char **argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
#endif

2
third_party/highway/libhwy-test.pc.in vendored

@ -1,8 +1,10 @@
prefix=@CMAKE_INSTALL_PREFIX@
libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@
includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
Name: libhwy-test
Description: Efficient and performance-portable SIMD wrapper, test helpers.
Requires: gtest
Version: @HWY_LIBRARY_VERSION@
Libs: -L${libdir} -lhwy_test
Cflags: -I${includedir}

2
third_party/highway/run_tests.bat vendored

@ -5,7 +5,7 @@ cd %~dp0
if not exist build_win mkdir build_win
cd build_win
cmake .. -G Ninja || goto error
cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON -G Ninja || goto error
ninja || goto error
ctest -j || goto error

69
third_party/highway/run_tests.sh vendored

@ -7,9 +7,74 @@ cd "${MYDIR}"
# Exit if anything fails
set -e
mkdir -p build
#######################################
echo RELEASE
rm -rf build
mkdir build
cd build
cmake ..
cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON
make -j
ctest -j
cd ..
rm -rf build
#######################################
echo DEBUG Clang 7
rm -rf build_dbg
mkdir build_dbg
cd build_dbg
CXX=clang++-7 CC=clang-7 cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON -DCMAKE_BUILD_TYPE=Debug
make -j
ctest -j
cd ..
rm -rf build_dbg
#######################################
echo 32-bit GCC
rm -rf build_32
mkdir build_32
cd build_32
CFLAGS=-m32 CXXFLAGS=-m32 LDFLAGS=-m32 CXX=g++ CC=gcc cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON
make -j
ctest -j
cd ..
rm -rf build_32
#######################################
for VER in 8 9 10; do
echo GCC $VER
rm -rf build_g$VER
mkdir build_g$VER
cd build_g$VER
CC=gcc-$VER CXX=g++-$VER cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON
make -j
make test
cd ..
rm -rf build_g$VER
done
#######################################
echo ARMv7 GCC
export QEMU_LD_PREFIX=/usr/arm-linux-gnueabihf
rm -rf build_arm7
mkdir build_arm7
cd build_arm7
CC=arm-linux-gnueabihf-gcc CXX=arm-linux-gnueabihf-g++ cmake .. -DHWY_CMAKE_ARM7:BOOL=ON -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON
make -j8
ctest
cd ..
rm -rf build_arm7
#######################################
echo ARMv8 GCC
export QEMU_LD_PREFIX=/usr/aarch64-linux-gnu
rm -rf build_arm8
mkdir build_arm8
cd build_arm8
CC=aarch64-linux-gnu-gcc CXX=aarch64-linux-gnu-g++ cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON
make -j8
ctest
cd ..
rm -rf build_arm8
echo Success

131
third_party/highway/test.py vendored

@ -1,131 +0,0 @@
#!/usr/bin/env python3
"""Helper for running tests for all platforms.
One-time setup:
sudo apt-get install npm
sudo npm install -g npm jsvu
Usage:
third_party/highway/test.py [--test=memory_test]
"""
import os
import sys
import tarfile
import tempfile
import argparse
import subprocess
def run_subprocess(args, work_dir):
"""Runs subprocess and checks for success."""
process = subprocess.Popen(args, cwd=work_dir)
process.communicate()
assert process.returncode == 0
def print_status(name):
print("=" * 60, name)
def run_blaze_tests(work_dir, target, desired_config, config_name, blazerc,
config):
"""Builds and runs via blaze or returns 0 if skipped."""
if desired_config is not None and desired_config != config_name:
return 0
print_status(config_name)
default_config = ["-c", "opt", "--copt=-DHWY_COMPILE_ALL_ATTAINABLE"]
args = ["blaze"] + blazerc + ["test", ":" + target] + config + default_config
run_subprocess(args, work_dir)
return 1
def run_wasm_tests(work_dir, target, desired_config, config_name, options):
"""Runs wasm via blaze/v8, or returns 0 if skipped."""
if desired_config is not None and desired_config != config_name:
return 0
args = [options.v8, "--no-liftoff", "--experimental-wasm-simd"]
# Otherwise v8 returns 0 even after compile failures!
args.append("--no-wasm-async-compilation")
if options.profile:
args.append("--perf-basic-prof-only-functions")
num_tests_run = 0
skipped = []
# Build (no blazerc to avoid failures caused by local .blazerc)
run_subprocess([
"blaze", "--blazerc=/dev/null", "build", "--config=wasm",
"--features=wasm_simd", ":" + target
], work_dir)
path = "blaze-bin/third_party/highway/"
TEST_SUFFIX = "_test"
for test in os.listdir(path):
# Only test binaries (avoids non-tar files)
if not test.endswith(TEST_SUFFIX):
continue
# Skip directories
tar_pathname = os.path.join(path, test)
if os.path.isdir(tar_pathname):
continue
# Only the desired test (if given)
if options.test is not None and options.test != test:
skipped.append(test[0:-len(TEST_SUFFIX)])
continue
with tempfile.TemporaryDirectory() as extract_dir:
with tarfile.open(tar_pathname, mode="r:") as tar:
tar.extractall(extract_dir)
test_args = args + [os.path.join(extract_dir, test) + ".js"]
run_subprocess(test_args, extract_dir)
num_tests_run += 1
print("Finished", num_tests_run, "; skipped", ",".join(skipped))
assert (num_tests_run != 0)
return 1
def main(args):
parser = argparse.ArgumentParser(description="Run test(s)")
parser.add_argument("--v8", help="Pathname to v8 (default ~/jsvu/v8)")
parser.add_argument("--test", help="Which test to run (defaults to all)")
parser.add_argument("--config", help="Which config to run (defaults to all)")
parser.add_argument("--profile", action="store_true", help="Enable profiling")
options = parser.parse_args(args)
if options.v8 is None:
options.v8 = os.path.join(os.getenv("HOME"), ".jsvu", "v8")
work_dir = os.path.dirname(os.path.realpath(__file__))
target = "hwy_ops_tests" if options.test is None else options.test
num_config = 0
num_config += run_blaze_tests(work_dir, target, options.config, "x86", [], [])
num_config += run_blaze_tests(
work_dir, target, options.config, "rvv",
["--blazerc=../../third_party/unsupported_toolchains/mpu/blazerc"],
["--config=mpu64_gcc"])
num_config += run_blaze_tests(work_dir, target, options.config, "arm8", [],
["--config=android_arm64"])
num_config += run_blaze_tests(work_dir, target, options.config, "arm7", [], [
"--config=android_arm", "--copt=-mfpu=neon-vfpv4",
"--copt=-mfloat-abi=softfp"
])
num_config += run_blaze_tests(work_dir, target, options.config, "msvc", [],
["--config=msvc"])
num_config += run_wasm_tests(work_dir, target, options.config, "wasm",
options)
if num_config == 0:
print_status("ERROR: unknown --config=%s, omit to see valid names" %
(options.config,))
else:
print_status("done")
if __name__ == "__main__":
main(sys.argv[1:])

16
third_party/jpeg-xl/AUTHORS vendored

@ -16,19 +16,21 @@ Cloudinary Ltd. <*@cloudinary.com>
Google LLC <*@google.com>
# Individuals:
Alex Xu (Hello71) <alex_y_xu@yahoo.ca>
Alexander Sago <cagelight@gmail.com>
Andrius Lukas Narbutas <andrius4669@gmail.com>
Artem Selishchev
Dirk Lemstra <dirk@lemstra.org>
Don Olmstead <don.j.olmstead@gmail.com>
Jon Sneyers <jon@cloudinary.com>
Lovell Fuller
Kleis Auke Wolthuizen <github@kleisauke.nl>
Leo Izen <leo.izen@gmail.com>
Lovell Fuller
Marcin Konicki <ahwayakchih@gmail.com>
Misaki Kasumi <misakikasumi@outlook.com>
Petr Diblík
Pieter Wuille
Samuel Leong <wvvwvvvvwvvw@gmail.com>
Vincent Torri <vincent.torri@gmail.com>
xiota
Ziemowit Zabawa <ziemek.zabawa@outlook.com>
Andrius Lukas Narbutas <andrius4669@gmail.com>
Misaki Kasumi <misakikasumi@outlook.com>
Samuel Leong <wvvwvvvvwvvw@gmail.com>
Alex Xu (Hello71) <alex_y_xu@yahoo.ca>
Vincent Torri <vincent.torri@gmail.com>
Artem Selishchev

18
third_party/jpeg-xl/CHANGELOG.md vendored

@ -12,11 +12,23 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
`JxlDecoderGetBoxType`, `JxlDecoderGetBoxSizeRaw` and
`JxlDecoderSetDecompressBoxes`
- decoder API: ability to mark the input is finished: `JxlDecoderCloseInput`
- encoder API: ability to add metadata boxes, added new functions
`JxlEncoderAddBox`, `JxlEncoderUseBoxes`, `JxlEncoderCloseBoxes` and
`JxlEncoderCloseFrames`.
- decoder API: new function `JxlDecoderSetCoalesced` to allow decoding
non-coalesced (unblended) frames, e.g. layers of a composite still image
or the cropped frames of a recompressed GIF/APNG.
- decoder API: field added to `JxlFrameHeader`: a `JxlLayerInfo` struct
that contains crop dimensions and offsets and blending information for
the non-coalesced case.
- decoder API: new function `JxlDecoderGetExtraChannelBlendInfo` to get
the blending information for extra channels in the non-coalesced case.
### Changed
- decoder API: `JxlDecoderCloseInput` is required when using JXL_DEC_BOX, and
also encouraged, but not required for backwards compatibility, for any other
usage of the decoder.
- decoder API: using `JxlDecoderCloseInput` at the end of all input is required
when using JXL_DEC_BOX, and is now also encouraged in other cases, but not
required in those other cases for backwards compatibility.
- encoder API: `JxlEncoderCloseInput` now closes both frames and boxes input.
## [0.6.1] - 2021-10-29
### Changed

11
third_party/jpeg-xl/CMakeLists.txt vendored

@ -5,6 +5,7 @@
# Ubuntu bionic ships with cmake 3.10.
cmake_minimum_required(VERSION 3.10)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
# Honor VISIBILITY_INLINES_HIDDEN on all types of targets.
if(POLICY CMP0063)
@ -122,10 +123,12 @@ set(JPEGXL_FORCE_NEON false CACHE BOOL
# Force system dependencies.
set(JPEGXL_FORCE_SYSTEM_GTEST false CACHE BOOL
"Force using system installed googletest (gtest/gmock) instead of third_party/googletest source.")
set(JPEGXL_FORCE_SYSTEM_BROTLI false CACHE BOOL
"Force using system installed brotli instead of third_party/brotli source.")
set(JPEGXL_FORCE_SYSTEM_GTEST false CACHE BOOL
"Force using system installed googletest (gtest/gmock) instead of third_party/googletest source.")
set(JPEGXL_FORCE_SYSTEM_LCMS2 false CACHE BOOL
"Force using system installed lcms2 instead of third_party/lcms source.")
set(JPEGXL_FORCE_SYSTEM_HWY false CACHE BOOL
"Force using system installed highway (libhwy-dev) instead of third_party/highway source.")
@ -158,12 +161,12 @@ message(STATUS
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
if(JPEGXL_STATIC)
set(CMAKE_FIND_LIBRARY_SUFFIXES .a)
set(BUILD_SHARED_LIBS 0)
# Clang developers say that in case to use "static" we have to build stdlib
# ourselves; for real use case we don't care about stdlib, as it is "granted",
# so just linking all other libraries is fine.
if (NOT APPLE)
if (NOT MSVC AND NOT APPLE)
set(CMAKE_FIND_LIBRARY_SUFFIXES .a)
set(CMAKE_EXE_LINKER_FLAGS
"${CMAKE_EXE_LINKER_FLAGS} -static -static-libgcc -static-libstdc++")
endif()

94
third_party/jpeg-xl/SECURITY.md vendored

@ -1,37 +1,73 @@
# Security and Vulnerability Policy for JPEG XL
# Security and Vulnerability Policy for libjxl
The current focus of the reference implementation is to provide a vehicle for
evaluating the JPEG XL codec compression density, quality, features and its
actual performance on different platforms. With this focus in mind we provide
source code releases with improvements on performance and quality so developers
can evaluate the codec.
## TL;DR:
At this time, **we don't provide security and vulnerability support** for any
of these releases. This means that the source code may contain bugs, including
security bugs, that may be added or fixed between releases and will **not** be
individually documented. All of these
[releases](https://gitlab.com/wg1/jpeg-xl/-/releases) include the following
note to that effect:
CPE prefix: `cpe:2.3:a:libjxl_project:libjxl`
* Note: This release is for evaluation purposes and may contain bugs, including
security bugs, that will *not* be individually documented when fixed. Always
prefer to use the latest release. Please provide feedback and report bugs
[here](https://gitlab.com/wg1/jpeg-xl/-/issues).
To report a security issue, please email libjxl-security@google.com.
To be clear, this means that because a release doesn't mention any CVE it
doesn't mean that no security issues in previous versions were fixed. You should
assume that any previous release contains security issues if that's a concern
for your use case.
Include in your email a description of the issue, the steps you took to create
the issue, affected versions, and if known, mitigations for the issue. Our
vulnerability management team will acknowledge receiving your email within 3
working days.
This however doesn't impede you from evaluating the codec with your own trusted
inputs, such as `.jxl` you encoded yourself, or when taking appropriate measures
for your application like sandboxing if processing untrusted inputs.
This project follows a 90 day disclosure timeline.
## Future plans
For all other bugs, where there are no security implications about disclosing
the unpatched bug, open a [new issue](https://github.com/libjxl/libjxl/issues)
checking first for existing similar issues. If in doubt about the security
impact of a bug you discovered, email first.
To help our users and developers integrating this implementation into their
software we plan to provide support for security and vulnerability tracking of
this implementation in the future.
## Policy overview
When we can provide such support we will update this Policy with the details and
expectations and clearly mention that fact in the release notes.
libjxl's Security Policy is based on the [Google Open Source program
guidelines](https://github.com/google/oss-vulnerability-guide) for coordinated
vulnerability disclosure.
Early versions of `libjxl` had a different security policy that didn't provide
security and vulnerability disclosure support. Versions up to and including
0.3.7 are not covered and won't receive any security advisory.
Only released versions, starting from version 0.5, are covered by this policy.
Development branches, arbitrary commits from `main` branch or even releases with
backported features externally patched on top are not covered. Only those
versions with a release tag in `libjxl`'s repository are covered, starting from
version 0.5.
## What's a "Security bug"
A security bug is a bug that can potentially be exploited to let an attacker
gain unauthorized access or privileges such as disclosing information or
arbitrary code execution. Not all fuzzer-found bugs and not all assert()
failures are considered security bugs in libjxl. For a detailed explanation and
examples see our [Security Vulnerabilities Playbook](doc/vuln_playbook.md).
## What to expect
To report a security issue, please email libjxl-security@google.com with all the
details about the bug you encountered.
* Include a description of the issue, steps to reproduce, etc. Compiler
versions, flags, exact version used and even CPU are often relevant given our
usage of SIMD and run-time dispatch of SIMD instructions.
* A member of our security team will reply to you within 3 business days. Note
that business days are different in different countries.
* We will evaluate the issue and we may require more input from your side to
reproduce it.
* If the issue fits in the description of a security bug, we will issue a
CVE, publish a fix and make a new minor or patch release with it. There is
a maximum 90-day disclosure timeline; we ask you not to publish the
details before the 90-day deadline or the release date (whichever comes
first).
* In the case that we publish a CVE we will credit the external researcher who
reported the issue. When reporting security issues please let us know if you
need to include specific information while doing so, like for example a
company affiliation.
Our security team follows the [Security Vulnerabilities
Playbook](doc/vuln_playbook.md). For more details about the process and policies
please take a look at it.

1
third_party/jpeg-xl/ci.sh vendored

@ -558,6 +558,7 @@ cmd_coverage_report() {
--filter '.*jxl/.*'
--exclude '.*_test.cc'
--exclude '.*_testonly..*'
--exclude '.*_debug.*'
--exclude '.*test_utils..*'
--object-directory "${real_build_dir}"
)

66
third_party/jpeg-xl/cmake/FindHWY.cmake vendored Normal file

@ -0,0 +1,66 @@
# Copyright (c) the JPEG XL Project Authors. All rights reserved.
#
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
find_package(PkgConfig QUIET)
if (PkgConfig_FOUND)
pkg_check_modules(PC_HWY QUIET libhwy)
set(HWY_VERSION ${PC_HWY_VERSION})
endif ()
find_path(HWY_INCLUDE_DIR
NAMES hwy/highway.h
HINTS ${PC_HWY_INCLUDEDIR} ${PC_HWY_INCLUDE_DIRS}
)
find_library(HWY_LIBRARY
NAMES ${HWY_NAMES} hwy
HINTS ${PC_HWY_LIBDIR} ${PC_HWY_LIBRARY_DIRS}
)
if (HWY_INCLUDE_DIR AND NOT HWY_VERSION)
if (EXISTS "${HWY_INCLUDE_DIR}/hwy/highway.h")
file(READ "${HWY_INCLUDE_DIR}/hwy/highway.h" HWY_VERSION_CONTENT)
string(REGEX MATCH "#define HWY_MAJOR +([0-9]+)" _dummy "${HWY_VERSION_CONTENT}")
set(HWY_VERSION_MAJOR "${CMAKE_MATCH_1}")
string(REGEX MATCH "#define +HWY_MINOR +([0-9]+)" _dummy "${HWY_VERSION_CONTENT}")
set(HWY_VERSION_MINOR "${CMAKE_MATCH_1}")
string(REGEX MATCH "#define +HWY_PATCH +([0-9]+)" _dummy "${HWY_VERSION_CONTENT}")
set(HWY_VERSION_PATCH "${CMAKE_MATCH_1}")
set(HWY_VERSION "${HWY_VERSION_MAJOR}.${HWY_VERSION_MINOR}.${HWY_VERSION_PATCH}")
endif ()
endif ()
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(HWY
FOUND_VAR HWY_FOUND
REQUIRED_VARS HWY_LIBRARY HWY_INCLUDE_DIR
VERSION_VAR HWY_VERSION
)
if (HWY_LIBRARY AND NOT TARGET hwy)
add_library(hwy INTERFACE IMPORTED GLOBAL)
if(${CMAKE_VERSION} VERSION_LESS "3.13.5")
set_property(TARGET hwy PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${HWY_INCLUDE_DIR})
target_link_libraries(hwy INTERFACE ${HWY_LIBRARY})
set_property(TARGET hwy PROPERTY INTERFACE_COMPILE_OPTIONS ${PC_HWY_CFLAGS_OTHER})
else()
target_include_directories(hwy INTERFACE ${HWY_INCLUDE_DIR})
target_link_libraries(hwy INTERFACE ${HWY_LIBRARY})
target_link_options(hwy INTERFACE ${PC_HWY_LDFLAGS_OTHER})
target_compile_options(hwy INTERFACE ${PC_HWY_CFLAGS_OTHER})
endif()
endif()
mark_as_advanced(HWY_INCLUDE_DIR HWY_LIBRARY)
if (HWY_FOUND)
set(HWY_LIBRARIES ${HWY_LIBRARY})
set(HWY_INCLUDE_DIRS ${HWY_INCLUDE_DIR})
endif ()

59
third_party/jpeg-xl/cmake/FindLCMS2.cmake vendored Normal file

@ -0,0 +1,59 @@
# Copyright (c) the JPEG XL Project Authors. All rights reserved.
#
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
find_package(PkgConfig QUIET)
if (PkgConfig_FOUND)
pkg_check_modules(PC_LCMS2 QUIET libLCMS2)
set(LCMS2_VERSION ${PC_LCMS2_VERSION})
endif ()
find_path(LCMS2_INCLUDE_DIR
NAMES lcms2.h
HINTS ${PC_LCMS2_INCLUDEDIR} ${PC_LCMS2_INCLUDE_DIRS}
)
find_library(LCMS2_LIBRARY
NAMES ${LCMS2_NAMES} lcms2 liblcms2 lcms-2 liblcms-2
HINTS ${PC_LCMS2_LIBDIR} ${PC_LCMS2_LIBRARY_DIRS}
)
if (LCMS2_INCLUDE_DIR AND NOT LCMS_VERSION)
file(READ ${LCMS2_INCLUDE_DIR}/lcms2.h LCMS2_VERSION_CONTENT)
string(REGEX MATCH "#define[ \t]+LCMS_VERSION[ \t]+([0-9]+)[ \t]*\n" LCMS2_VERSION_MATCH ${LCMS2_VERSION_CONTENT})
if (LCMS2_VERSION_MATCH)
string(SUBSTRING ${CMAKE_MATCH_1} 0 1 LCMS2_VERSION_MAJOR)
string(SUBSTRING ${CMAKE_MATCH_1} 1 2 LCMS2_VERSION_MINOR)
set(LCMS2_VERSION "${LCMS2_VERSION_MAJOR}.${LCMS2_VERSION_MINOR}")
endif ()
endif ()
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(LCMS2
FOUND_VAR LCMS2_FOUND
REQUIRED_VARS LCMS2_LIBRARY LCMS2_INCLUDE_DIR
VERSION_VAR LCMS2_VERSION
)
if (LCMS2_LIBRARY AND NOT TARGET lcms2)
add_library(lcms2 INTERFACE IMPORTED GLOBAL)
if(${CMAKE_VERSION} VERSION_LESS "3.13.5")
set_property(TARGET lcms2 PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${LCMS2_INCLUDE_DIR})
target_link_libraries(lcms2 INTERFACE ${LCMS2_LIBRARY})
set_property(TARGET lcms2 PROPERTY INTERFACE_COMPILE_OPTIONS ${PC_LCMS2_CFLAGS_OTHER})
else()
target_include_directories(lcms2 INTERFACE ${LCMS2_INCLUDE_DIR})
target_link_libraries(lcms2 INTERFACE ${LCMS2_LIBRARY})
target_link_options(lcms2 INTERFACE ${PC_LCMS2_LDFLAGS_OTHER})
target_compile_options(lcms2 INTERFACE ${PC_LCMS2_CFLAGS_OTHER})
endif()
endif()
mark_as_advanced(LCMS2_INCLUDE_DIR LCMS2_LIBRARY)
if (LCMS2_FOUND)
set(LCMS2_LIBRARIES ${LCMS2_LIBRARY})
set(LCMS2_INCLUDE_DIRS ${LCMS2_INCLUDE_DIR})
endif ()

2
third_party/jpeg-xl/debian/control vendored

@ -14,7 +14,7 @@ Build-Depends:
libgmock-dev,
libgoogle-perftools-dev,
libgtest-dev,
libhwy-dev,
libhwy-dev (>= 0.15.0),
libjpeg-dev,
libopenexr-dev,
libpng-dev,

4
third_party/jpeg-xl/deps.sh vendored

@ -13,7 +13,7 @@ MYDIR=$(dirname $(realpath "$0"))
# Git revisions we use for the given submodules. Update these whenever you
# update a git submodule.
THIRD_PARTY_HIGHWAY="e2397743fe092df68b760d358253773699a16c93"
THIRD_PARTY_HIGHWAY="e69083a12a05caf037cabecdf1b248b7579705a5"
THIRD_PARTY_LODEPNG="8c6a9e30576f07bf470ad6f09458a2dcd7a6a84a"
THIRD_PARTY_SKCMS="64374756e03700d649f897dbd98c95e78c30c7da"
THIRD_PARTY_SJPEG="868ab558fad70fcbe8863ba4e85179eeb81cc840"
@ -23,7 +23,7 @@ download_github() {
local path="$1"
local project="$2"
local varname="${path^^}"
local varname=`echo "$path" | tr '[:lower:]' '[:upper:]'`
varname="${varname/\//_}"
local sha
eval "sha=\${${varname}}"

3
third_party/jpeg-xl/examples/jxlinfo.c vendored

@ -67,6 +67,7 @@ int PrintBasicInfo(FILE* file) {
}
data_size = remaining + read_size;
JxlDecoderSetInput(dec, data, data_size);
if (feof(file)) JxlDecoderCloseInput(dec);
} else if (status == JXL_DEC_SUCCESS) {
// Finished all processing.
break;
@ -319,9 +320,11 @@ int main(int argc, char* argv[]) {
}
if (!PrintBasicInfo(file)) {
fclose(file);
fprintf(stderr, "Couldn't print basic info\n");
return 1;
}
fclose(file);
return 0;
}

2
third_party/jpeg-xl/lib/CMakeLists.txt vendored

@ -15,7 +15,7 @@ set(JPEGXL_LIBRARY_VERSION
# It is important to update this value when making incompatible API/ABI changes
# so that programs that depend on libjxl can update their dependencies. Semantic
# versioning allows 0.y.z to have incompatible changes in minor versions.
set(JPEGXL_SO_MINOR_VERSION 6)
set(JPEGXL_SO_MINOR_VERSION 7)
if (JPEGXL_MAJOR_VERSION EQUAL 0)
set(JPEGXL_LIBRARY_SOVERSION
"${JPEGXL_MAJOR_VERSION}.${JPEGXL_SO_MINOR_VERSION}")


@ -297,6 +297,8 @@ Status DecodeImageAPNG(const Span<const uint8_t> bytes,
// TODO(veluca): this could in principle be implemented.
if (last_base_was_none && !all_dispose_bg &&
(x0 != 0 || y0 != 0 || w0 != w || h0 != h || bop != 0)) {
delete[] frameRaw.rows;
delete[] frameRaw.p;
return JXL_FAILURE(
"APNG with dispose-to-0 is not supported for non-full or "
"blended frames");

15
third_party/jpeg-xl/lib/extras/codec_jpg.cc vendored

@ -301,22 +301,23 @@ Status DecodeImageJPG(const Span<const uint8_t> bytes,
if (cinfo.arith_code) {
return failure("arithmetic code JPEGs are not supported");
}
int nbcomp = cinfo.num_components;
if (nbcomp != 1 && nbcomp != 3) {
return failure("unsupported number of components in JPEG");
}
if (!ReadICCProfile(&cinfo, &ppf->icc)) {
ppf->icc.clear();
// Default to SRGB
ppf->color_encoding.color_space = cinfo.output_components == 1
? JXL_COLOR_SPACE_GRAY
: JXL_COLOR_SPACE_RGB;
// Actually, (cinfo.output_components == nbcomp) will be checked after
// `jpeg_start_decompress`.
ppf->color_encoding.color_space =
(nbcomp == 1) ? JXL_COLOR_SPACE_GRAY : JXL_COLOR_SPACE_RGB;
ppf->color_encoding.white_point = JXL_WHITE_POINT_D65;
ppf->color_encoding.primaries = JXL_PRIMARIES_SRGB;
ppf->color_encoding.transfer_function = JXL_TRANSFER_FUNCTION_SRGB;
ppf->color_encoding.rendering_intent = JXL_RENDERING_INTENT_PERCEPTUAL;
}
ReadExif(&cinfo, &ppf->metadata.exif);
int nbcomp = cinfo.num_components;
if (nbcomp != 1 && nbcomp != 3) {
return failure("unsupported number of components in JPEG");
}
if (!ApplyColorHints(color_hints, /*color_already_set=*/true,
/*is_gray=*/false, ppf)) {
return failure("ApplyColorHints failed");


@ -289,7 +289,69 @@ typedef struct {
uint64_t extensions;
} JxlHeaderExtensions;
/** The header of one displayed frame. */
/** Frame blend modes.
* If coalescing is enabled (default), this can be ignored.
*/
typedef enum {
JXL_BLEND_REPLACE = 0,
JXL_BLEND_ADD = 1,
JXL_BLEND_BLEND = 2,
JXL_BLEND_MULADD = 3,
JXL_BLEND_MUL = 4,
} JxlBlendMode;
/** The information about blending the color channels or a single extra channel.
* If coalescing is enabled (default), this can be ignored.
*/
typedef struct {
/** Blend mode.
* Always equal to JXL_BLEND_REPLACE if coalescing is enabled.
*/
JxlBlendMode blendmode;
/** Reference frame ID to use as the 'bottom' layer (0-3).
*/
uint32_t source;
/** Which extra channel to use as the 'alpha' channel for blend modes
* JXL_BLEND_BLEND and JXL_BLEND_MULADD.
*/
uint32_t alpha;
/** Clamp values to [0,1] for the purpose of blending.
*/
JXL_BOOL clamp;
} JxlBlendInfo;
/** The information about layers.
* If coalescing is enabled (default), this can be ignored.
*/
typedef struct {
/** Horizontal offset of the frame (can be negative, always zero if coalescing
* is enabled)
*/
int32_t crop_x0;
/** Vertical offset of the frame (can be negative, always zero if coalescing
* is enabled)
*/
int32_t crop_y0;
/** Width of the frame (number of columns, always equal to image width if
* coalescing is enabled)
*/
uint32_t xsize;
/** Height of the frame (number of rows, always equal to image height if
* coalescing is enabled)
*/
uint32_t ysize;
/** The blending info for the color channels. Blending info for extra channels
* has to be retrieved separately using JxlDecoderGetExtraChannelBlendInfo.
*/
JxlBlendInfo blend_info;
/** After blending, save the frame as reference frame with this ID (0-3).
* Special case: if the frame duration is nonzero, ID 0 means "will not be
* referenced in the future".
*/
uint32_t save_as_reference;
} JxlLayerInfo;
/** The header of one displayed frame or non-coalesced layer. */
typedef struct {
/** How long to wait after rendering in ticks. The duration in seconds of a
* tick is given by tps_numerator and tps_denominator in JxlAnimationHeader.
@ -314,6 +376,10 @@ typedef struct {
/** Indicates this is the last animation frame.
*/
JXL_BOOL is_last;
/** Information about the layer in case of no coalescing.
*/
JxlLayerInfo layer_info;
} JxlFrameHeader;
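
To make the new layer_info member concrete, here is a minimal sketch of reading it, assuming a decoder `dec` that has just signaled JXL_DEC_FRAME; the helper name PrintLayerInfo is hypothetical and the fields are only meaningful when coalescing was disabled:

#include <cstdio>
#include <jxl/decode.h>

// Hypothetical helper: dump the per-layer information of the current frame.
void PrintLayerInfo(const JxlDecoder* dec) {
  JxlFrameHeader header;
  if (JxlDecoderGetFrameHeader(dec, &header) != JXL_DEC_SUCCESS) return;
  const JxlLayerInfo& layer = header.layer_info;
  std::printf("layer %ux%u at (%d,%d), blend mode %d, save_as_reference %u\n",
              layer.xsize, layer.ysize, layer.crop_x0, layer.crop_y0,
              static_cast<int>(layer.blend_info.blendmode),
              layer.save_as_reference);
}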
#if defined(__cplusplus) || defined(c_plusplus)

65
third_party/jpeg-xl/lib/include/jxl/decode.h vendored
View file

@ -206,12 +206,17 @@ typedef enum {
* JxlDecoderGetFrameHeader can be used at this point. A note on frames:
* a JPEG XL image can have internal frames that are not intended to be
* displayed (e.g. used for compositing a final frame), but this only returns
* displayed frames. A displayed frame either has an animation duration or is
* the only or last frame in the image. This event occurs max once per
* displayed frame, always later than JXL_DEC_COLOR_ENCODING, and always
* earlier than any pixel data. While JPEG XL supports encoding a single frame
* as the composition of multiple internal sub-frames also called frames, this
* event is not indicated for the internal frames.
* displayed frames, unless JxlDecoderSetCoalescing was set to JXL_FALSE: in
* that case, the individual layers are returned, without blending. Note that
* even when coalescing is disabled, only frames of type kRegularFrame are
* returned; frames of type kReferenceOnly and kLfFrame are always for
* internal purposes only and cannot be accessed. A displayed frame either has
* an animation duration or is the only or last frame in the image. This event
* occurs max once per displayed frame, always later than
* JXL_DEC_COLOR_ENCODING, and always earlier than any pixel data. While JPEG
* XL supports encoding a single frame as the composition of multiple internal
* sub-frames also called frames, this event is not indicated for the internal
* frames.
*/
JXL_DEC_FRAME = 0x400,
@ -228,11 +233,12 @@ typedef enum {
*/
JXL_DEC_DC_IMAGE = 0x800,
/** Informative event by JxlDecoderProcessInput: full frame decoded.
* JxlDecoderSetImageOutBuffer must be used after getting the basic image
* information to be able to get the image pixels, if not this return status
* only indicates we're past this point in the codestream. This event occurs
* max once per frame and always later than JXL_DEC_DC_IMAGE.
/** Informative event by JxlDecoderProcessInput: full frame (or layer, in case
* coalescing is disabled) is decoded. JxlDecoderSetImageOutBuffer must be
* used after getting the basic image information to be able to get the image
* pixels, if not this return status only indicates we're past this point in
* the codestream. This event occurs max once per frame and always later than
* JXL_DEC_DC_IMAGE.
*/
JXL_DEC_FULL_IMAGE = 0x1000,
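
A minimal sketch of the event loop in which these events are consumed, assuming the whole file is already in memory and coalescing is left at its default; error handling and JXL_DEC_NEED_MORE_INPUT are not covered because the complete input is given up front:

#include <cstddef>
#include <cstdint>
#include <vector>
#include <jxl/decode.h>

void DecodeAllFrames(const uint8_t* data, size_t size) {
  JxlDecoder* dec = JxlDecoderCreate(nullptr);
  JxlDecoderSubscribeEvents(
      dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE);
  JxlDecoderSetInput(dec, data, size);
  JxlDecoderCloseInput(dec);  // all input is provided up front in this sketch
  JxlPixelFormat format = {4, JXL_TYPE_UINT8, JXL_NATIVE_ENDIAN, 0};
  std::vector<uint8_t> pixels;
  for (;;) {
    JxlDecoderStatus status = JxlDecoderProcessInput(dec);
    if (status == JXL_DEC_BASIC_INFO) {
      JxlBasicInfo info;
      JxlDecoderGetBasicInfo(dec, &info);
    } else if (status == JXL_DEC_FRAME) {
      // JxlDecoderGetFrameHeader can be queried here, once per displayed frame.
    } else if (status == JXL_DEC_NEED_IMAGE_OUT_BUFFER) {
      size_t buffer_size;
      JxlDecoderImageOutBufferSize(dec, &format, &buffer_size);
      pixels.resize(buffer_size);
      JxlDecoderSetImageOutBuffer(dec, &format, pixels.data(), pixels.size());
    } else if (status == JXL_DEC_FULL_IMAGE) {
      // `pixels` now holds one displayed frame (or one layer when coalescing
      // is disabled); consume it before the next frame overwrites it.
    } else {
      break;  // JXL_DEC_SUCCESS, or a condition this sketch does not handle
    }
  }
  JxlDecoderDestroy(dec);
}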
@ -418,6 +424,22 @@ JxlDecoderSetKeepOrientation(JxlDecoder* dec, JXL_BOOL keep_orientation);
JXL_EXPORT JxlDecoderStatus
JxlDecoderSetRenderSpotcolors(JxlDecoder* dec, JXL_BOOL render_spotcolors);
/** Enables or disables coalescing of zero-duration frames. By default, frames
* are returned with coalescing enabled, i.e. all frames have the image
* dimensions, and are blended if needed. When coalescing is disabled, frames
* can have arbitrary dimensions, a non-zero crop offset, and blending is not
* performed. For display, coalescing is recommended. For loading a multi-layer
* still image as separate layers (as opposed to the merged image), coalescing
* has to be disabled.
*
* @param dec decoder object
* @param coalescing JXL_TRUE to enable coalescing (default), JXL_FALSE to
* disable it.
* @return JXL_DEC_SUCCESS if no error, JXL_DEC_ERROR otherwise.
*/
JXL_EXPORT JxlDecoderStatus JxlDecoderSetCoalescing(JxlDecoder* dec,
JXL_BOOL coalescing);
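
A sketch of how the new call might be combined with per-layer output; JxlDecoderSetCoalescing has to happen before the first JxlDecoderProcessInput, and with coalescing disabled the output buffer size is only known once JXL_DEC_FRAME has occurred:

// `dec` is a freshly created JxlDecoder* (assumed); error handling elided.
JxlDecoderSetCoalescing(dec, JXL_FALSE);   // request unblended layers
JxlDecoderSubscribeEvents(dec, JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE);
// ... feed input and run JxlDecoderProcessInput as usual ...
// After JXL_DEC_FRAME, the buffer size refers to the possibly cropped layer,
// not to the full image dimensions:
JxlPixelFormat format = {4, JXL_TYPE_UINT8, JXL_NATIVE_ENDIAN, 0};
size_t layer_buffer_size;
JxlDecoderImageOutBufferSize(dec, &format, &layer_buffer_size);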
/**
* Decodes JPEG XL file using the available bytes. Requires input has been
* set with JxlDecoderSetInput. After JxlDecoderProcessInput, input can
@ -770,6 +792,21 @@ JXL_EXPORT JxlDecoderStatus JxlDecoderGetFrameHeader(const JxlDecoder* dec,
JXL_EXPORT JxlDecoderStatus JxlDecoderGetFrameName(const JxlDecoder* dec,
char* name, size_t size);
/**
* Outputs the blend information for the current frame for a specific extra
* channel. This function can be called when JXL_DEC_FRAME occurred for the
* current frame, even when have_animation in the JxlBasicInfo is JXL_FALSE.
* This information is only useful if coalescing is disabled; otherwise the
* decoder will have performed blending already.
*
* @param dec decoder object
* @param index the index of the extra channel
* @param blend_info struct to copy the information into
* @return JXL_DEC_SUCCESS on success, JXL_DEC_ERROR on error
*/
JXL_EXPORT JxlDecoderStatus JxlDecoderGetExtraChannelBlendInfo(
const JxlDecoder* dec, size_t index, JxlBlendInfo* blend_info);
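
A sketch of iterating over the extra channels once JXL_DEC_FRAME has occurred; `dec` and the JxlBasicInfo `info` obtained earlier are assumed to be available:

// `dec` has already produced JXL_DEC_BASIC_INFO (stored in `info`) and
// JXL_DEC_FRAME for the current frame.
for (uint32_t i = 0; i < info.num_extra_channels; ++i) {
  JxlBlendInfo blend_info;
  if (JxlDecoderGetExtraChannelBlendInfo(dec, i, &blend_info) == JXL_DEC_SUCCESS) {
    // blend_info.blendmode, .source, .alpha and .clamp describe how this
    // channel would have been blended had coalescing been enabled.
  }
}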
/**
* Returns the minimum size in bytes of the DC image output buffer
* for the given format. This is the buffer for JxlDecoderSetDCOutBuffer.
@ -811,7 +848,11 @@ JXL_EXPORT JXL_DEPRECATED JxlDecoderStatus JxlDecoderSetDCOutBuffer(
/**
* Returns the minimum size in bytes of the image output pixel buffer for the
* given format. This is the buffer for JxlDecoderSetImageOutBuffer. Requires
* the basic image information is available in the decoder.
* that the basic image information is available in the decoder in the case of
* coalescing enabled (default). In case coalescing is disabled, this can only
* be called after the JXL_DEC_FRAME event occurs. In that case, it will return
* the size required to store the possibly cropped frame (which can be larger or
* smaller than the image dimensions).
*
* @param dec decoder object
* @param format format of the pixels.

201
third_party/jpeg-xl/lib/include/jxl/encode.h vendored
View file

@ -110,127 +110,165 @@ typedef enum {
*/
JXL_ENC_OPTION_EXTRA_CHANNEL_RESAMPLING = 3,
/** Indicates the frame added with @ref JxlEncoderAddImageFrame is already
* downsampled by the downsampling factor set with @ref
* JXL_ENC_OPTION_RESAMPLING. The input frame must then be given in the
* downsampled resolution, not the full image resolution. The downsampled
* resolution is given by ceil(xsize / resampling), ceil(ysize / resampling)
* with xsize and ysize the dimensions given in the basic info, and resampling
* the factor set with @ref JXL_ENC_OPTION_RESAMPLING.
* Use 0 to disable, 1 to enable. Default value is 0.
*/
JXL_ENC_OPTION_ALREADY_DOWNSAMPLED = 4,
/** Adds noise to the image emulating photographic film noise, the higher the
* given number, the grainier the image will be. As an example, a value of 100
* gives low noise whereas a value of 3200 gives a lot of noise. The default
* value is 0.
*/
JXL_ENC_OPTION_PHOTON_NOISE = 4,
JXL_ENC_OPTION_PHOTON_NOISE = 5,
/** Enables adaptive noise generation. This setting is not recommended for
* use, please use JXL_ENC_OPTION_PHOTON_NOISE instead. Use -1 for the default
* (encoder chooses), 0 to disable, 1 to enable.
*/
JXL_ENC_OPTION_NOISE = 5,
JXL_ENC_OPTION_NOISE = 6,
/** Enables or disables dots generation. Use -1 for the default (encoder
* chooses), 0 to disable, 1 to enable.
*/
JXL_ENC_OPTION_DOTS = 6,
JXL_ENC_OPTION_DOTS = 7,
/** Enables or disables patches generation. Use -1 for the default (encoder
* chooses), 0 to disable, 1 to enable.
*/
JXL_ENC_OPTION_PATCHES = 7,
JXL_ENC_OPTION_PATCHES = 8,
/** Edge preserving filter level, -1 to 3. Use -1 for the default (encoder
* chooses), 0 to 3 to set a strength.
*/
JXL_ENC_OPTION_EPF = 8,
JXL_ENC_OPTION_EPF = 9,
/** Enables or disables the gaborish filter. Use -1 for the default (encoder
* chooses), 0 to disable, 1 to enable.
*/
JXL_ENC_OPTION_GABORISH = 9,
JXL_ENC_OPTION_GABORISH = 10,
/** Enables modular encoding. Use -1 for default (encoder
* chooses), 0 to enforce VarDCT mode (e.g. for photographic images), 1 to
* enforce modular mode (e.g. for lossless images).
*/
JXL_ENC_OPTION_MODULAR = 10,
JXL_ENC_OPTION_MODULAR = 11,
/** Enables or disables preserving color of invisible pixels. Use -1 for the
* default (1 if lossless, 0 if lossy), 0 to disable, 1 to enable.
*/
JXL_ENC_OPTION_KEEP_INVISIBLE = 11,
JXL_ENC_OPTION_KEEP_INVISIBLE = 12,
/** Determines the order in which 256x256 regions are stored in the codestream
* for progressive rendering. Use -1 for the encoder
* default, 0 for scanline order, 1 for center-first order.
*/
JXL_ENC_OPTION_GROUP_ORDER = 12,
JXL_ENC_OPTION_GROUP_ORDER = 13,
/** Determines the horizontal position of center for the center-first group
* order. Use -1 to automatically use the middle of the image, 0..xsize to
* specifically set it.
*/
JXL_ENC_OPTION_GROUP_ORDER_CENTER_X = 13,
JXL_ENC_OPTION_GROUP_ORDER_CENTER_X = 14,
/** Determines the center for the center-first group order. Use -1 to
* automatically use the middle of the image, 0..ysize to specifically set it.
*/
JXL_ENC_OPTION_GROUP_ORDER_CENTER_Y = 14,
JXL_ENC_OPTION_GROUP_ORDER_CENTER_Y = 15,
/** Enables or disables progressive encoding for modular mode. Use -1 for the
* encoder default, 0 to disable, 1 to enable.
*/
JXL_ENC_OPTION_RESPONSIVE = 15,
JXL_ENC_OPTION_RESPONSIVE = 16,
/** Set the progressive mode for the AC coefficients of VarDCT, using spectral
* progression from the DCT coefficients. Use -1 for the encoder default, 0 to
* disable, 1 to enable.
*/
JXL_ENC_OPTION_PROGRESSIVE_AC = 16,
JXL_ENC_OPTION_PROGRESSIVE_AC = 17,
/** Set the progressive mode for the AC coefficients of VarDCT, using
* quantization of the least significant bits. Use -1 for the encoder default,
* 0 to disable, 1 to enable.
*/
JXL_ENC_OPTION_QPROGRESSIVE_AC = 17,
JXL_ENC_OPTION_QPROGRESSIVE_AC = 18,
/** Set the progressive mode using lower-resolution DC images for VarDCT. Use
* -1 for the encoder default, 0 to disable, 1 to have an extra 64x64 lower
* resolution pass, 2 to have a 512x512 and 64x64 lower resolution pass.
*/
JXL_ENC_OPTION_PROGRESSIVE_DC = 18,
JXL_ENC_OPTION_PROGRESSIVE_DC = 19,
/** Use Global channel palette if the amount of colors is smaller than this
* percentage of range. Use 0-100 to set an explicit percentage, -1 to use the
* encoder default. Used for modular encoding.
*/
JXL_ENC_OPTION_CHANNEL_COLORS_PRE_TRANSFORM_PERCENT = 19,
JXL_ENC_OPTION_CHANNEL_COLORS_GLOBAL_PERCENT = 20,
/** Use Local channel palette if the amount of colors is smaller than this
* percentage of range. Use 0-100 to set an explicit percentage, -1 to use the
* encoder default. Used for modular encoding.
/** Use Local (per-group) channel palette if the amount of colors is smaller
* than this percentage of range. Use 0-100 to set an explicit percentage, -1
* to use the encoder default. Used for modular encoding.
*/
JXL_ENC_OPTION_CHANNEL_COLORS_PERCENT = 20,
JXL_ENC_OPTION_CHANNEL_COLORS_GROUP_PERCENT = 21,
/** Use color palette if amount of colors is smaller than or equal to this
* amount, or -1 to use the encoder default. Used for modular encoding.
*/
JXL_ENC_OPTION_PALETTE_COLORS = 21,
JXL_ENC_OPTION_PALETTE_COLORS = 22,
/** Enables or disables delta palette. Use -1 for the default (encoder
* chooses), 0 to disable, 1 to enable. Used in modular mode.
*/
JXL_ENC_OPTION_LOSSY_PALETTE = 22,
JXL_ENC_OPTION_LOSSY_PALETTE = 23,
/** Color space for modular encoding: 0=RGB, 1=YCoCg, 2-37=RCT, -1=default:
* try several, depending on speed.
/** Color transform for internal encoding: -1 = default, 0=XYB, 1=none (RGB),
* 2=YCbCr. The XYB setting performs the forward XYB transform. None and
* YCbCr both perform no transform, but YCbCr is used to indicate that the
* encoded data losslessly represents YCbCr values.
*/
JXL_ENC_OPTION_MODULAR_COLOR_SPACE = 23,
JXL_ENC_OPTION_COLOR_TRANSFORM = 24,
/** Color space for modular encoding: -1=default, 0-35=reverse color transform
* index, e.g. index 0 = none, index 6 = YCoCg.
* The default behavior is to try several, depending on the speed setting.
*/
JXL_ENC_OPTION_MODULAR_COLOR_SPACE = 25,
/** Group size for modular encoding: -1=default, 0=128, 1=256, 2=512, 3=1024.
*/
JXL_ENC_OPTION_MODULAR_GROUP_SIZE = 24,
JXL_ENC_OPTION_MODULAR_GROUP_SIZE = 26,
/** Predictor for modular encoding. -1 = default, 0=zero, 1=left, 2=top,
* 3=avg0, 4=select, 5=gradient, 6=weighted, 7=topright, 8=topleft,
* 9=leftleft, 10=avg1, 11=avg2, 12=avg3, 13=toptop predictive average 14=mix
* 5 and 6, 15=mix everything.
*/
JXL_ENC_OPTION_MODULAR_PREDICTOR = 25,
JXL_ENC_OPTION_MODULAR_PREDICTOR = 27,
/** Fraction of pixels used to learn MA trees as a percentage. -1 = default,
* 0 = no MA and fast decode, 50 = default value, 100 = all, values above
* 100 are also permitted. Higher values use more encoder memory.
*/
JXL_ENC_OPTION_MODULAR_MA_TREE_LEARNING_PERCENT = 28,
/** Number of extra (previous-channel) MA tree properties to use. -1 =
* default, 0-11 = valid values. Recommended values are in the range 0 to 3,
* or 0 to amount of channels minus 1 (including all extra channels, and
* excluding color channels when using VarDCT mode). Higher value gives slower
* encoding and slower decoding.
*/
JXL_ENC_OPTION_MODULAR_NB_PREV_CHANNELS = 29,
/** Enable or disable CFL (chroma-from-luma) for lossless JPEG recompression.
* -1 = default, 0 = disable CFL, 1 = enable CFL.
*/
JXL_ENC_OPTION_JPEG_RECON_CFL = 30,
/** Enum value not to be used as an option. This value is added to force the
* C compiler to have the enum to take a known size.
@ -337,11 +375,22 @@ JXL_EXPORT JxlEncoderStatus JxlEncoderAddJPEGFrame(
* JxlEncoderSetBasicInfo before JxlEncoderAddImageFrame.
*
* Currently only some data types for pixel formats are supported:
* - JXL_TYPE_UINT8
* - JXL_TYPE_UINT16
* - JXL_TYPE_UINT8, with range 0..255
* - JXL_TYPE_UINT16, with range 0..65535
* - JXL_TYPE_FLOAT16, with nominal range 0..1
* - JXL_TYPE_FLOAT, with nominal range 0..1
*
* Note: the sample data type in pixel_format is allowed to be different from
* what is described in the JxlBasicInfo. The type in pixel_format describes the
* format of the uncompressed pixel buffer. The bits_per_sample and
* exponent_bits_per_sample in the JxlBasicInfo describes what will actually be
* encoded in the JPEG XL codestream. For example, to encode a 12-bit image, you
* would set bits_per_sample to 12, and you could use e.g. JXL_TYPE_UINT16
* (where the values are rescaled to 16-bit, i.e. multiplied by 65535/4095) or
* JXL_TYPE_FLOAT (where the values are rescaled to 0..1, i.e. multiplied
* by 1.f/4095.f). While it is allowed, it is obviously not recommended to use a
* pixel_format with lower precision than what is specified in the JxlBasicInfo.
*
* We support interleaved channels as described by the JxlPixelFormat:
* - single-channel data, e.g. grayscale
* - single-channel + alpha
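
For the 12-bit case described above, a small sketch of the rescaling into a JXL_TYPE_UINT16 buffer (the helper and its input vector are hypothetical):

#include <cstddef>
#include <cstdint>
#include <vector>

// Map 12-bit samples (0..4095) onto the full 16-bit range (0..65535) by
// multiplying by 65535/4095, with rounding. bits_per_sample in JxlBasicInfo
// stays 12; only the in-memory buffer passed to JxlEncoderAddImageFrame uses
// the 16-bit range.
std::vector<uint16_t> Rescale12To16(const std::vector<uint16_t>& src12) {
  std::vector<uint16_t> out(src12.size());
  for (std::size_t i = 0; i < src12.size(); ++i) {
    out[i] = static_cast<uint16_t>(
        (static_cast<uint32_t>(src12[i]) * 65535u + 2047u) / 4095u);
  }
  return out;
}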
@ -357,6 +406,10 @@ JXL_EXPORT JxlEncoderStatus JxlEncoderAddJPEGFrame(
* JxlEncoderSetICCProfile. If false, the pixels are assumed to be nonlinear
* sRGB for integer data types (JXL_TYPE_UINT8, JXL_TYPE_UINT16), and linear
* sRGB for floating point data types (JXL_TYPE_FLOAT16, JXL_TYPE_FLOAT).
* Sample values in floating-point pixel formats are allowed to be outside the
* nominal range, e.g. to represent out-of-sRGB-gamut colors in the
* uses_original_profile=false case. They are however not allowed to be NaN or
* +-infinity.
*
* @param options set of encoder options to use when encoding the frame.
* @param pixel_format format for pixels. Object owned by the caller and its
@ -396,7 +449,7 @@ JXL_EXPORT JxlEncoderStatus JxlEncoderSetExtraChannelBuffer(
const void* buffer, size_t size, uint32_t index);
/** Adds a metadata box to the file format. JxlEncoderProcessOutput must be used
* to effectively write the box to the output. @ref JxlEncoderUseContainer must
* to effectively write the box to the output. @ref JxlEncoderUseBoxes must
* be enabled before using this function.
*
* Background information about the container format and boxes follows here:
@ -488,20 +541,14 @@ JXL_EXPORT JxlEncoderStatus JxlEncoderSetExtraChannelBuffer(
* multiple boxes of type jxlp are used. This is handled by the encoder when
* needed.
*
* For now metadata boxes can only be added before or after the codestream with
* all frames, so using JxlEncoderAddBox is only possible before the first
* JxlEncoderAddImageFrame call, and/or after the last JxlEncoderAddImageFrame
* call and JxlEncoderCloseInput. Support for adding boxes in-between the
* codestream, and/or in-between image frames may be added later, and would
* cause the encoder to use jxlp boxes for the codestream.
*
* @param enc encoder object.
* @param type the box type, e.g. "Exif" for EXIF metadata, "XML " for XMP or
* IPTC metadata, "jumb" for JUMBF metadata.
* @param contents the full contents of the box, for example EXIF
* data. For an "Exif" box, the EXIF data must be prepended by a 4-byte tiff
* header offset, which may be 4 zero-bytes. The ISO BMFF box header must not
* be included, only the contents.
* be included, only the contents. Owned by the caller and its contents are
* copied internally.
* @param size size of the box contents.
* @param compress_box Whether to compress this box as a "brob" box. Requires
* Brotli support.
@ -509,18 +556,62 @@ JXL_EXPORT JxlEncoderStatus JxlEncoderSetExtraChannelBuffer(
* using this function without JxlEncoderUseContainer, or adding a box type
* that would result in an invalid file format.
*/
JXL_EXPORT JxlEncoderStatus JxlEncoderAddBox(JxlEncoder* enc, JxlBoxType type,
JXL_EXPORT JxlEncoderStatus JxlEncoderAddBox(JxlEncoder* enc,
const JxlBoxType type,
const uint8_t* contents,
size_t size,
JXL_BOOL compress_box);
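
A sketch of preparing the contents for an "Exif" box as described above, with the 4-byte tiff header offset set to zero; the helper name and the raw Exif input are hypothetical, and JxlEncoderUseBoxes must already have been called on `enc`:

#include <cstddef>
#include <cstdint>
#include <vector>
#include <jxl/encode.h>

JxlEncoderStatus AddExifBox(JxlEncoder* enc, const uint8_t* exif,
                            size_t exif_size) {
  std::vector<uint8_t> contents(4, 0);  // 4-byte tiff header offset, all zero
  contents.insert(contents.end(), exif, exif + exif_size);
  return JxlEncoderAddBox(enc, "Exif", contents.data(), contents.size(),
                          /*compress_box=*/JXL_FALSE);
}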
/**
* Declares that this encoder will not encode any further frames. Further
* metadata boxes may still be added.
* Indicates the intention to add metadata boxes. This allows @ref
* JxlEncoderAddBox to be used. When using this function, then it is required
* to use @ref JxlEncoderCloseBoxes at the end.
*
* Must be called between JxlEncoderAddImageFrame/JPEGFrame of the last frame
* By default the encoder assumes no metadata boxes will be added.
*
* This setting can only be set at the beginning, before encoding starts.
*
* @param enc encoder object.
*/
JXL_EXPORT JxlEncoderStatus JxlEncoderUseBoxes(JxlEncoder* enc);
/**
* Declares that no further boxes will be added with @ref JxlEncoderAddBox.
* This function must be called after the last box is added so the encoder knows
* the stream will be finished. It is not necessary to use this function if
* @ref JxlEncoderUseBoxes is not used. Further frames may still be added.
*
* Must be called between JxlEncoderAddBox of the last box
* and the next call to JxlEncoderProcessOutput, or JxlEncoderProcessOutput
* won't output the last frame correctly.
* won't output the last box correctly.
*
* NOTE: if you don't need to close frames and boxes at separate times, you can
* use @ref JxlEncoderCloseInput instead to close both at once.
*
* @param enc encoder object.
*/
JXL_EXPORT void JxlEncoderCloseBoxes(JxlEncoder* enc);
/**
* Declares that no frames will be added and @ref JxlEncoderAddImageFrame and
* @ref JxlEncoderAddJPEGFrame won't be called anymore. Further metadata boxes
* may still be added.
*
* NOTE: if you don't need to close frames and boxes at separate times, you can
* use @ref JxlEncoderCloseInput instead to close both at once.
*
* @param enc encoder object.
*/
JXL_EXPORT void JxlEncoderCloseFrames(JxlEncoder* enc);
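
A sketch of the resulting encoder lifecycle, with basic info, color encoding and frame preparation elided; `enc`, `options`, `format`, `pixels`, `xmp` and the various sizes are assumed to have been set up beforehand:

JxlEncoderUseBoxes(enc);                                  // boxes will be added
JxlEncoderAddBox(enc, "XML ", xmp, xmp_size, JXL_FALSE);  // e.g. XMP metadata
JxlEncoderAddImageFrame(options, &format, pixels, pixels_size);
JxlEncoderCloseFrames(enc);                               // no further frames
JxlEncoderCloseBoxes(enc);                                // no further boxes
// (JxlEncoderCloseInput(enc) would close both at once.)
uint8_t out[4096];
uint8_t* next_out = out;
size_t avail_out = sizeof(out);
JxlEncoderStatus result;
do {
  result = JxlEncoderProcessOutput(enc, &next_out, &avail_out);
  // ... write the bytes between `out` and `next_out`, then reset next_out
  // and avail_out before the next iteration ...
} while (result == JXL_ENC_NEED_MORE_OUTPUT);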
/**
* Closes any input to the encoder, equivalent to calling JxlEncoderCloseFrames
* as well as calling JxlEncoderCloseBoxes if needed. No further input of any
* kind may be given to the encoder, but further @ref JxlEncoderProcessOutput
* calls should be done to create the final output.
*
* The requirements of both @ref JxlEncoderCloseFrames and @ref
* JxlEncoderCloseBoxes apply to this function.
*
* @param enc encoder object.
*/
@ -618,7 +709,8 @@ JXL_EXPORT JxlEncoderStatus JxlEncoderSetExtraChannelInfo(
* @param enc encoder object
* @param index index of the extra channel to set.
* @param name buffer with the name of the extra channel.
* @param size size of the name buffer in bytes.
* @param size size of the name buffer in bytes, not counting the terminating
* character.
* @return JXL_ENC_SUCCESS on success, JXL_ENC_ERROR on error
*/
JXL_EXPORT JxlEncoderStatus JxlEncoderSetExtraChannelName(JxlEncoder* enc,
@ -642,16 +734,13 @@ JXL_EXPORT JxlEncoderStatus JxlEncoderSetExtraChannelName(JxlEncoder* enc,
JXL_EXPORT JxlEncoderStatus JxlEncoderOptionsSetInteger(
JxlEncoderOptions* options, JxlEncoderOptionId option, int32_t value);
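
A sketch of driving a few of the renumbered options through JxlEncoderOptionsSetInteger; `options` comes from JxlEncoderOptionsCreate and the chosen values are only illustrative:

// Film-grain-like noise roughly matching ISO 800 (per the description above).
JxlEncoderOptionsSetInteger(options, JXL_ENC_OPTION_PHOTON_NOISE, 800);
// Force modular mode, e.g. for lossless content.
JxlEncoderOptionsSetInteger(options, JXL_ENC_OPTION_MODULAR, 1);
// Leave the edge-preserving filter strength to the encoder.
JxlEncoderOptionsSetInteger(options, JXL_ENC_OPTION_EPF, -1);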
/** Indicates the encoder should use the box-based JPEG XL container format
* (BMFF) instead of outputting the codestream bytes directly. Both with and
* without container are valid JPEG XL files, but the container is necessary
* when metadata, level 10 features or JPEG reconstruction is used.
/** Forces the encoder to use the box-based container format (BMFF) even
* when not necessary.
*
* If enabled, the encoder always uses the container format, even if not
* necessary. If disabled, the encoder will still use the container format if
* required (such as for JPEG metadata @ref JxlEncoderStoreJPEGMetadata).
*
* This setting must be explicitely enabled before using @ref JxlEncoderAddBox.
* When using @ref JxlEncoderUseBoxes, @ref JxlEncoderStoreJPEGMetadata or @ref
* JxlEncoderSetCodestreamLevel with level 10, the encoder will automatically
* also use the container format, it is not necessary to use
* JxlEncoderUseContainer for those use cases.
*
* By default this setting is disabled.
*
@ -659,7 +748,7 @@ JXL_EXPORT JxlEncoderStatus JxlEncoderOptionsSetInteger(
*
* @param enc encoder object.
* @param use_container true if the encoder should always output the JPEG XL
* container format.
* container format, false to only output it when necessary.
* @return JXL_ENC_SUCCESS if the operation was successful, JXL_ENC_ERROR
* otherwise.
*/
@ -715,8 +804,8 @@ JXL_EXPORT JxlEncoderStatus JxlEncoderSetCodestreamLevel(JxlEncoder* enc,
* Enables lossless encoding.
*
* This is not an option like the others on itself, but rather while enabled it
* overrides a set of existing options (such as distance and modular mode) that
* enables bit-for-bit lossless encoding.
* overrides a set of existing options (such as distance, modular mode and
* color transform) that enables bit-for-bit lossless encoding.
*
* When disabled, those options are not overridden, but since those options
* could still have been manually set to a combination that operates losslessly,

9
third_party/jpeg-xl/lib/jxl.cmake vendored
View file

@ -110,6 +110,15 @@ set(JPEGXL_INTERNAL_SOURCES_DEC
jxl/entropy_coder.h
jxl/epf.cc
jxl/epf.h
jxl/fast_dct-inl.h
jxl/fast_dct.cc
jxl/fast_dct.h
jxl/fast_dct128-inl.h
jxl/fast_dct16-inl.h
jxl/fast_dct256-inl.h
jxl/fast_dct32-inl.h
jxl/fast_dct64-inl.h
jxl/fast_dct8-inl.h
jxl/fast_math-inl.h
jxl/field_encodings.h
jxl/fields.cc

17
third_party/jpeg-xl/lib/jxl/aux_out.cc vendored
View file

@ -59,23 +59,6 @@ void AuxOut::Print(size_t num_inputs) const {
}
}
void AuxOut::DumpCoeffImage(const char* label,
const Image3S& coeff_image) const {
JXL_ASSERT(coeff_image.xsize() % 64 == 0);
Image3S reshuffled(coeff_image.xsize() / 8, coeff_image.ysize() * 8);
for (size_t c = 0; c < 3; c++) {
for (size_t y = 0; y < coeff_image.ysize(); y++) {
for (size_t x = 0; x < coeff_image.xsize(); x += 64) {
for (size_t i = 0; i < 64; i++) {
reshuffled.PlaneRow(c, 8 * y + i / 8)[x / 8 + i % 8] =
coeff_image.PlaneRow(c, y)[x + i];
}
}
}
}
DumpImage(label, reshuffled);
}
void ReclaimAndCharge(BitWriter* JXL_RESTRICT writer,
BitWriter::Allotment* JXL_RESTRICT allotment,
size_t layer, AuxOut* JXL_RESTRICT aux_out) {

6
third_party/jpeg-xl/lib/jxl/aux_out.h vendored
View file

@ -251,12 +251,6 @@ struct AuxOut {
DumpImage(label, normalized);
}
// This dumps coefficients as a 16-bit PNG with coefficients of a block placed
// in the area that would contain that block in a normal image. To view the
// resulting image manually, rescale intensities by using:
// $ convert -auto-level IMAGE.PNG - | display -
void DumpCoeffImage(const char* label, const Image3S& coeff_image) const;
void SetInspectorImage3F(const jxl::InspectorImage3F& inspector) {
inspector_image3f_ = inspector;
}

View file

@ -233,5 +233,27 @@ TEST_F(ColorManagementTest, D2700ToSRGB) {
FloatNear(0.601, 1e-3)));
}
TEST_F(ColorManagementTest, P3HLGTo2020HLG) {
ColorEncoding p3_hlg;
p3_hlg.SetColorSpace(ColorSpace::kRGB);
p3_hlg.white_point = WhitePoint::kD65;
p3_hlg.primaries = Primaries::kP3;
p3_hlg.tf.SetTransferFunction(TransferFunction::kHLG);
ASSERT_TRUE(p3_hlg.CreateICC());
ColorEncoding rec2020_hlg = p3_hlg;
rec2020_hlg.primaries = Primaries::k2100;
ASSERT_TRUE(rec2020_hlg.CreateICC());
ColorSpaceTransform transform;
ASSERT_TRUE(transform.Init(p3_hlg, rec2020_hlg, 1000, 1, 1));
const float p3_hlg_values[3] = {0., 0.75, 0.};
float rec2020_hlg_values[3];
DoColorSpaceTransform(&transform, 0, p3_hlg_values, rec2020_hlg_values);
EXPECT_THAT(rec2020_hlg_values,
ElementsAre(FloatNear(0.3973, 1e-4), FloatNear(0.7382, 1e-4),
FloatNear(0.1183, 1e-4)));
}
} // namespace
} // namespace jxl

67
third_party/jpeg-xl/lib/jxl/dec_frame.cc vendored
View file

@ -153,8 +153,8 @@ Status DecodeFrame(const DecompressParams& dparams,
true));
// Handling of progressive decoding.
const FrameHeader& frame_header = frame_decoder.GetFrameHeader();
{
const FrameHeader& frame_header = frame_decoder.GetFrameHeader();
size_t max_passes = dparams.max_passes;
size_t max_downsampling = std::max(
dparams.max_downsampling >> (frame_header.dc_level * 3), size_t(1));
@ -177,6 +177,7 @@ Status DecodeFrame(const DecompressParams& dparams,
frame_decoder.SetMaxPasses(max_passes);
}
frame_decoder.SetRenderSpotcolors(dparams.render_spotcolors);
frame_decoder.SetCoalescing(dparams.coalescing);
size_t processed_bytes = reader->TotalBitsConsumed() / kBitsPerByte;
@ -192,9 +193,18 @@ Status DecodeFrame(const DecompressParams& dparams,
size_t e = b + frame_decoder.SectionSizes()[i];
bytes_to_skip += e - b;
size_t pos = reader->TotalBitsConsumed() / kBitsPerByte;
if (pos + e <= reader->TotalBytes()) {
auto br = make_unique<BitReader>(
Span<const uint8_t>(reader->FirstByte() + b + pos, e - b));
if (pos + (dparams.allow_more_progressive_steps &&
(i == 0 ||
frame_header.encoding == FrameEncoding::kModular)
? b
: e) <=
reader->TotalBytes() ||
(i == 0 && dparams.allow_more_progressive_steps)) {
auto br = make_unique<BitReader>(Span<const uint8_t>(
reader->FirstByte() + b + pos,
(pos + b > reader->TotalBytes()
? 0
: std::min(reader->TotalBytes() - pos - b, e - b))));
section_info.emplace_back(FrameDecoder::SectionInfo{br.get(), i});
section_closers.emplace_back(
make_unique<BitReaderScopedCloser>(br.get(), &close_ok));
@ -249,7 +259,23 @@ Status FrameDecoder::InitFrame(BitReader* JXL_RESTRICT br, ImageBundle* decoded,
dec_state_->shared_storage.matrices = DequantMatrices();
frame_header_.nonserialized_is_preview = is_preview;
JXL_RETURN_IF_ERROR(DecodeFrameHeader(br, &frame_header_));
size_t pos = br->TotalBitsConsumed() / kBitsPerByte;
Status have_frameheader =
br->TotalBytes() > pos && DecodeFrameHeader(br, &frame_header_);
JXL_RETURN_IF_ERROR(have_frameheader || allow_partial_frames);
if (!have_frameheader) {
if (dec_state_->shared_storage.dc_frames[0].xsize() > 0) {
// If we have a (partial) DC frame available, but we don't have the next
// frame header (so allow_partial_frames is true), then we'll assume the
// next frame uses that DC frame (which may not be true, e.g. there might
// first be a ReferenceOnly patch frame, but it's reasonable to assume
// that the DC frame is a good progressive preview)
frame_header_.flags |= FrameHeader::kUseDcFrame;
frame_header_.encoding = FrameEncoding::kVarDCT;
frame_header_.dc_level = 0;
} else
return JXL_FAILURE("Couldn't read frame header");
}
frame_dim_ = frame_header_.ToFrameDimensions();
const size_t num_passes = frame_header_.passes.num_passes;
@ -271,7 +297,8 @@ Status FrameDecoder::InitFrame(BitReader* JXL_RESTRICT br, ImageBundle* decoded,
const size_t toc_entries = NumTocEntries(num_groups, frame_dim_.num_dc_groups,
num_passes, has_ac_global);
JXL_RETURN_IF_ERROR(ReadGroupOffsets(toc_entries, br, &section_offsets_,
&section_sizes_, &groups_total_size));
&section_sizes_, &groups_total_size) ||
allow_partial_frames);
JXL_DASSERT((br->TotalBitsConsumed() % kBitsPerByte) == 0);
const size_t group_codes_begin = br->TotalBitsConsumed() / kBitsPerByte;
@ -371,11 +398,14 @@ Status FrameDecoder::ProcessDCGlobal(BitReader* br) {
if (shared.frame_header.flags & FrameHeader::kNoise) {
JXL_RETURN_IF_ERROR(DecodeNoise(br, &shared.image_features.noise_params));
}
if (!allow_partial_dc_global_ ||
br->TotalBitsConsumed() < br->TotalBytes() * kBitsPerByte) {
JXL_RETURN_IF_ERROR(dec_state_->shared_storage.matrices.DecodeDC(br));
JXL_RETURN_IF_ERROR(dec_state_->shared_storage.matrices.DecodeDC(br));
if (frame_header_.encoding == FrameEncoding::kVarDCT) {
JXL_RETURN_IF_ERROR(
jxl::DecodeGlobalDCInfo(br, decoded_->IsJPEG(), dec_state_, pool_));
if (frame_header_.encoding == FrameEncoding::kVarDCT) {
JXL_RETURN_IF_ERROR(
jxl::DecodeGlobalDCInfo(br, decoded_->IsJPEG(), dec_state_, pool_));
}
}
// Splines' draw cache uses the color correlation map.
if (shared.frame_header.flags & FrameHeader::kSplines) {
@ -406,7 +436,7 @@ Status FrameDecoder::ProcessDCGroup(size_t dc_group_id, BitReader* br) {
frame_dim_.dc_group_dim, frame_dim_.dc_group_dim);
JXL_RETURN_IF_ERROR(modular_frame_decoder_.DecodeGroup(
mrect, br, 3, 1000, ModularStreamId::ModularDC(dc_group_id),
/*zerofill=*/false, nullptr, nullptr));
/*zerofill=*/false, nullptr, nullptr, allow_partial_frames_));
if (frame_header_.encoding == FrameEncoding::kVarDCT) {
JXL_RETURN_IF_ERROR(
modular_frame_decoder_.DecodeAcMetadata(dc_group_id, br, dec_state_));
@ -608,13 +638,13 @@ Status FrameDecoder::ProcessACGroup(size_t ac_group_id,
JXL_RETURN_IF_ERROR(modular_frame_decoder_.DecodeGroup(
mrect, br[i - decoded_passes_per_ac_group_[ac_group_id]], minShift,
maxShift, ModularStreamId::ModularAC(ac_group_id, i),
/*zerofill=*/false, dec_state_, decoded_));
/*zerofill=*/false, dec_state_, decoded_, allow_partial_frames_));
} else if (i >= decoded_passes_per_ac_group_[ac_group_id] + num_passes &&
force_draw) {
JXL_RETURN_IF_ERROR(modular_frame_decoder_.DecodeGroup(
mrect, nullptr, minShift, maxShift,
ModularStreamId::ModularAC(ac_group_id, i), /*zerofill=*/true,
dec_state_, decoded_));
dec_state_, decoded_, allow_partial_frames_));
}
}
decoded_passes_per_ac_group_[ac_group_id] += num_passes;
@ -846,7 +876,7 @@ Status FrameDecoder::Flush() {
dec_state_, pool_, decoded_, is_finalized_));
JXL_RETURN_IF_ERROR(FinalizeFrameDecoding(decoded_, dec_state_, pool_,
/*force_fir=*/false,
/*skip_blending=*/false,
/*skip_blending=*/!coalescing_,
/*move_ec=*/is_finalized_));
num_renders_++;
@ -941,7 +971,8 @@ Status FrameDecoder::FinalizeFrame() {
JXL_RETURN_IF_ERROR(Flush());
if (dec_state_->shared->frame_header.CanBeReferenced()) {
if (dec_state_->shared->frame_header.CanBeReferenced() &&
(frame_header_.frame_type != kRegularFrame || coalescing_)) {
size_t id = dec_state_->shared->frame_header.save_as_reference;
auto& reference_frame = dec_state_->shared_storage.reference_frames[id];
if (dec_state_->pre_color_transform_frame.xsize() == 0) {
@ -949,7 +980,7 @@ Status FrameDecoder::FinalizeFrame() {
} else {
reference_frame.storage = ImageBundle(decoded_->metadata());
reference_frame.storage.SetFromImage(
std::move(dec_state_->pre_color_transform_frame),
CopyImage(dec_state_->pre_color_transform_frame),
decoded_->c_current());
if (decoded_->HasExtraChannels()) {
const std::vector<ImageF>* ecs = &dec_state_->pre_color_transform_ec;
@ -989,8 +1020,8 @@ Status FrameDecoder::FinalizeFrame() {
// coalesced frame of size equal to image dimensions. Other frames are not
// blended, thus their final size is the size that was defined in the
// frame_header.
if (frame_header_.frame_type == kRegularFrame ||
frame_header_.frame_type == kSkipProgressive) {
if (coalescing_ && (frame_header_.frame_type == kRegularFrame ||
frame_header_.frame_type == kSkipProgressive)) {
decoded_->ShrinkTo(
dec_state_->shared->frame_header.nonserialized_metadata->xsize(),
dec_state_->shared->frame_header.nonserialized_metadata->ysize());

2
third_party/jpeg-xl/lib/jxl/dec_frame.h vendored
View file

@ -63,6 +63,7 @@ class FrameDecoder {
constraints_ = constraints;
}
void SetRenderSpotcolors(bool rsc) { render_spotcolors_ = rsc; }
void SetCoalescing(bool c) { coalescing_ = c; }
// Read FrameHeader and table of contents from the given BitReader.
// Also checks frame dimensions for their limits, and sets the output
@ -254,6 +255,7 @@ class FrameDecoder {
bool allow_partial_frames_;
bool allow_partial_dc_global_;
bool render_spotcolors_ = true;
bool coalescing_ = true;
std::vector<uint8_t> processed_section_;
std::vector<uint8_t> decoded_passes_per_ac_group_;

38
third_party/jpeg-xl/lib/jxl/dec_modular.cc vendored
View file

@ -160,17 +160,21 @@ Status ModularFrameDecoder::DecodeGlobalInfo(BitReader* reader,
nb_chans = 1;
}
do_color = decode_color;
if (!do_color) nb_chans = 0;
size_t nb_extra = metadata.extra_channel_info.size();
bool has_tree = reader->ReadBits(1);
if (has_tree) {
size_t tree_size_limit = std::min(
static_cast<size_t>(1 << 22),
1024 + frame_dim.xsize * frame_dim.ysize * (nb_chans + nb_extra) / 16);
JXL_RETURN_IF_ERROR(DecodeTree(reader, &tree, tree_size_limit));
JXL_RETURN_IF_ERROR(
DecodeHistograms(reader, (tree.size() + 1) / 2, &code, &context_map));
if (!allow_truncated_group ||
reader->TotalBitsConsumed() < reader->TotalBytes() * kBitsPerByte) {
if (has_tree) {
size_t tree_size_limit =
std::min(static_cast<size_t>(1 << 22),
1024 + frame_dim.xsize * frame_dim.ysize *
(nb_chans + nb_extra) / 16);
JXL_RETURN_IF_ERROR(DecodeTree(reader, &tree, tree_size_limit));
JXL_RETURN_IF_ERROR(
DecodeHistograms(reader, (tree.size() + 1) / 2, &code, &context_map));
}
}
if (!do_color) nb_chans = 0;
bool fp = metadata.bit_depth.floating_point_sample;
@ -257,12 +261,10 @@ void ModularFrameDecoder::MaybeDropFullImage() {
}
}
Status ModularFrameDecoder::DecodeGroup(const Rect& rect, BitReader* reader,
int minShift, int maxShift,
const ModularStreamId& stream,
bool zerofill,
PassesDecoderState* dec_state,
ImageBundle* output) {
Status ModularFrameDecoder::DecodeGroup(
const Rect& rect, BitReader* reader, int minShift, int maxShift,
const ModularStreamId& stream, bool zerofill, PassesDecoderState* dec_state,
ImageBundle* output, bool allow_truncated) {
JXL_DASSERT(stream.kind == ModularStreamId::kModularDC ||
stream.kind == ModularStreamId::kModularAC);
const size_t xsize = rect.xsize();
@ -302,9 +304,11 @@ Status ModularFrameDecoder::DecodeGroup(const Rect& rect, BitReader* reader,
if (gi.channel.empty()) return true;
ModularOptions options;
if (!zerofill) {
if (!ModularGenericDecompress(
reader, gi, /*header=*/nullptr, stream.ID(frame_dim), &options,
/*undo_transforms=*/true, &tree, &code, &context_map)) {
if (!ModularGenericDecompress(reader, gi, /*header=*/nullptr,
stream.ID(frame_dim), &options,
/*undo_transforms=*/true, &tree, &code,
&context_map, allow_truncated) &&
!allow_truncated) {
return JXL_FAILURE("Failed to decode modular group");
}
}

3
third_party/jpeg-xl/lib/jxl/dec_modular.h vendored
View file

@ -91,7 +91,8 @@ class ModularFrameDecoder {
bool allow_truncated_group);
Status DecodeGroup(const Rect& rect, BitReader* reader, int minShift,
int maxShift, const ModularStreamId& stream, bool zerofill,
PassesDecoderState* dec_state, ImageBundle* output);
PassesDecoderState* dec_state, ImageBundle* output,
bool allow_truncated);
// Decodes a VarDCT DC group (`group_id`) from the given `reader`.
Status DecodeVarDCTDC(size_t group_id, BitReader* reader,
PassesDecoderState* dec_state);

2
third_party/jpeg-xl/lib/jxl/dec_params.h vendored
View file

@ -26,6 +26,8 @@ struct DecompressParams {
bool keep_dct = false;
// If true, render spot colors (otherwise only returned as extra channels)
bool render_spotcolors = true;
// If true, coalesce frames (otherwise return unblended frames)
bool coalescing = true;
// These cannot be kOn because they need encoder support.
Override preview = Override::kDefault;

View file

@ -114,8 +114,8 @@ Status PatchDictionary::Decode(BitReader* br, size_t xsize, size_t ysize,
" > %" PRIuS,
pos.y, ref_pos.ysize, ysize);
}
for (size_t i = 0; i < shared_->metadata->m.extra_channel_info.size() + 1;
i++) {
for (size_t j = 0; j < shared_->metadata->m.extra_channel_info.size() + 1;
j++) {
uint32_t blend_mode = read_num(kPatchBlendModeContext);
if (blend_mode >= uint32_t(PatchBlendMode::kNumBlendModes)) {
return JXL_FAILURE("Invalid patch blend mode: %u", blend_mode);
@ -125,7 +125,7 @@ Status PatchDictionary::Decode(BitReader* br, size_t xsize, size_t ysize,
if (UsesAlpha(info.mode)) {
*uses_extra_channels = true;
}
if (info.mode != PatchBlendMode::kNone && i > 0) {
if (info.mode != PatchBlendMode::kNone && j > 0) {
*uses_extra_channels = true;
}
if (UsesAlpha(info.mode) &&

View file

@ -357,8 +357,8 @@ void DoYCbCrUpsampling(size_t hs, size_t vs, ImageF* plane_in, const Rect& rect,
Store(left, d, out + x);
Store(right, d, out + x + 1);
#else
Store(InterleaveLower(left, right), d, out + x);
Store(InterleaveUpper(left, right), d, out + x + Lanes(d));
Store(InterleaveLower(d, left, right), d, out + x);
Store(InterleaveUpper(d, left, right), d, out + x + Lanes(d));
#endif
}
}

4
third_party/jpeg-xl/lib/jxl/dec_upsample.cc vendored
View file

@ -176,8 +176,8 @@ void Upsample(const ImageF& src, const Rect& src_rect, ImageF* dst,
min = Min(LoadU(df, raw_min_row + sx + fx), min);
max = Max(LoadU(df, raw_max_row + sx + fx), max);
}
min = MinOfLanes(min);
max = MaxOfLanes(max);
min = MinOfLanes(df, min);
max = MaxOfLanes(df, max);
for (size_t lx = 0; lx < N; lx += V) {
StoreU(min, df, min_row + N * sx + lx);
StoreU(max, df, max_row + N * sx + lx);

189
third_party/jpeg-xl/lib/jxl/decode.cc vendored
View file

@ -175,10 +175,15 @@ size_t BitsPerChannel(JxlDataType data_type) {
}
enum class DecoderStage : uint32_t {
kInited, // Decoder created, no JxlDecoderProcessInput called yet
kStarted, // Running JxlDecoderProcessInput calls
kFinished, // Codestream done, but other boxes could still occur
kError, // Error occurred, decoder object no longer usable
kInited, // Decoder created, no JxlDecoderProcessInput called yet
kStarted, // Running JxlDecoderProcessInput calls
kCodestreamFinished, // Codestream done, but other boxes could still occur.
// This stage can also occur before having seen the
// entire codestream if the user didn't subscribe to any
// codestream events at all, e.g. only to box events,
// or, the user only subscribed to basic info, and only
// the header of the codestream was parsed.
kError, // Error occurred, decoder object no longer usable
};
enum class FrameStage : uint32_t {
@ -403,10 +408,11 @@ struct JxlDecoderStruct {
// Status of progression, internal.
bool got_signature;
bool first_codestream_seen;
// Indicates we know that we've seen the last codestream, however this is not
// guaranteed to be true for the last box because a jxl file may have multiple
// "jxlp" boxes and it is possible (and permitted) that the last one is not a
// final box that uses size 0 to indicate the end.
// Indicates we know that we've seen the last codestream box: either this
// was a jxlc box, or a jxlp box that has its index indicated as last by
// having its most significant bit set, or no boxes are used at all. This
// does not indicate the full codestream has already been seen, only the
// last box of it has been initiated.
bool last_codestream_seen;
bool got_basic_info;
size_t header_except_icc_bits = 0; // To skip everything before ICC.
@ -453,6 +459,7 @@ struct JxlDecoderStruct {
// Settings
bool keep_orientation;
bool render_spotcolors;
bool coalescing;
// Bitfield, for which informative events (JXL_DEC_BASIC_INFO, etc...) the
// decoder returns a status. By default, do not return for any of the events,
@ -594,6 +601,17 @@ struct JxlDecoderStruct {
avail_in -= size;
file_pos += size;
}
// Whether the decoder can use more codestream input for a purpose it needs.
// This returns false if the user didn't subscribe to any events that
// require the codestream (e.g. only subscribed to metadata boxes), or all
// parts of the codestream that are subscribed to (e.g. only basic info) have
already occurred.
bool CanUseMoreCodestreamInput() const {
// The decoder can set this to finished early if all relevant events were
// processed, so this check works.
return stage != DecoderStage::kCodestreamFinished;
}
};
// TODO(zond): Make this depend on the data loaded into the decoder.
@ -687,6 +705,7 @@ void JxlDecoderReset(JxlDecoder* dec) {
dec->thread_pool.reset();
dec->keep_orientation = false;
dec->render_spotcolors = true;
dec->coalescing = true;
dec->orig_events_wanted = 0;
dec->frame_references.clear();
dec->frame_saved_as.clear();
@ -797,6 +816,14 @@ JxlDecoderStatus JxlDecoderSetRenderSpotcolors(JxlDecoder* dec,
return JXL_DEC_SUCCESS;
}
JxlDecoderStatus JxlDecoderSetCoalescing(JxlDecoder* dec, JXL_BOOL coalescing) {
if (dec->stage != DecoderStage::kInited) {
return JXL_API_ERROR("Must set coalescing option before starting");
}
dec->coalescing = !!coalescing;
return JXL_DEC_SUCCESS;
}
namespace jxl {
namespace {
@ -1171,6 +1198,7 @@ JxlDecoderStatus JxlDecoderProcessCodestream(JxlDecoder* dec, const uint8_t* in,
jxl::DecompressParams dparams;
dparams.preview = want_preview ? jxl::Override::kOn : jxl::Override::kOff;
dparams.render_spotcolors = dec->render_spotcolors;
dparams.coalescing = true;
jxl::ImageBundle ib(&dec->metadata.m);
PassesDecoderState preview_dec_state;
JXL_API_RETURN_IF_ERROR(preview_dec_state.output_encoding_info.Set(
@ -1236,6 +1264,10 @@ JxlDecoderStatus JxlDecoderProcessCodestream(JxlDecoder* dec, const uint8_t* in,
// is last of current still
dec->is_last_of_still =
dec->is_last_total || dec->frame_header->animation_frame.duration > 0;
// is kRegularFrame and coalescing is disabled
dec->is_last_of_still |=
(!dec->coalescing &&
dec->frame_header->frame_type == FrameType::kRegularFrame);
const size_t internal_frame_index = dec->internal_frames;
const size_t external_frame_index = dec->external_frames;
@ -1319,6 +1351,7 @@ JxlDecoderStatus JxlDecoderProcessCodestream(JxlDecoder* dec, const uint8_t* in,
dec->frame_dec.reset(new FrameDecoder(
dec->passes_state.get(), dec->metadata, dec->thread_pool.get()));
dec->frame_dec->SetRenderSpotcolors(dec->render_spotcolors);
dec->frame_dec->SetCoalescing(dec->coalescing);
// If JPEG reconstruction is wanted and possible, set the jpeg_data of
// the ImageBundle.
@ -1516,7 +1549,7 @@ JxlDecoderStatus JxlDecoderProcessCodestream(JxlDecoder* dec, const uint8_t* in,
}
}
dec->stage = DecoderStage::kFinished;
dec->stage = DecoderStage::kCodestreamFinished;
// Return success, this means there is nothing more to do.
return JXL_DEC_SUCCESS;
}
@ -1710,13 +1743,14 @@ static JxlDecoderStatus HandleBoxes(JxlDecoder* dec) {
if (dec->box_stage == BoxStage::kHeader) {
if (!dec->have_container) {
if (dec->stage == DecoderStage::kFinished) return JXL_DEC_SUCCESS;
if (dec->stage == DecoderStage::kCodestreamFinished)
return JXL_DEC_SUCCESS;
dec->box_stage = BoxStage::kCodestream;
dec->box_contents_unbounded = true;
continue;
}
if (dec->avail_in == 0) {
if (dec->stage != DecoderStage::kFinished) {
if (dec->stage != DecoderStage::kCodestreamFinished) {
// Not yet seen (all) codestream boxes.
return JXL_DEC_NEED_MORE_INPUT;
}
@ -1813,6 +1847,10 @@ static JxlDecoderStatus HandleBoxes(JxlDecoder* dec) {
if (memcmp(dec->box_type, "ftyp", 4) == 0) {
dec->box_stage = BoxStage::kFtyp;
} else if (memcmp(dec->box_type, "jxlc", 4) == 0) {
if (dec->last_codestream_seen) {
return JXL_API_ERROR("there can only be one jxlc box");
}
dec->last_codestream_seen = true;
dec->box_stage = BoxStage::kCodestream;
} else if (memcmp(dec->box_type, "jxlp", 4) == 0) {
dec->box_stage = BoxStage::kPartialCodestream;
@ -1844,7 +1882,7 @@ static JxlDecoderStatus HandleBoxes(JxlDecoder* dec) {
dec->box_stage = BoxStage::kSkip;
} else if (dec->box_stage == BoxStage::kPartialCodestream) {
if (dec->last_codestream_seen) {
return JXL_API_ERROR("cannot have codestream after last codestream");
return JXL_API_ERROR("cannot have jxlp box after last jxlp box");
}
// TODO(lode): error if box is unbounded but last bit not set
if (dec->avail_in < 4) return JXL_DEC_NEED_MORE_INPUT;
@ -1913,6 +1951,15 @@ static JxlDecoderStatus HandleBoxes(JxlDecoder* dec) {
// Codestream done, but there may be more other boxes.
dec->box_stage = BoxStage::kSkip;
continue;
} else if (!dec->last_codestream_seen &&
dec->CanUseMoreCodestreamInput()) {
// Even though the codestream was successfully decoded, the last seen
// jxlp box was not marked as last, so more jxlp boxes are expected.
// Since the codestream already successfully finished, the only valid
// case where this could happen is if there are empty jxlp boxes after
// this.
dec->box_stage = BoxStage::kSkip;
continue;
} else {
// Codestream decoded, and no box output requested, skip all further
// input and return success.
@ -2031,6 +2078,8 @@ JxlDecoderStatus JxlDecoderProcessInput(JxlDecoder* dec) {
if (sig == JXL_SIG_CONTAINER) {
dec->have_container = 1;
} else {
dec->last_codestream_seen = true;
}
}
@ -2043,10 +2092,13 @@ JxlDecoderStatus JxlDecoderProcessInput(JxlDecoder* dec) {
// Even if the box handling returns success, certain types of
// data may be missing.
if (status == JXL_DEC_SUCCESS) {
if (dec->stage != DecoderStage::kFinished) {
// TODO(lode): consider not returning this error if only subscribed to
// the JXL_DEC_BOX event and so finishing the image frames is not
// required.
if (dec->CanUseMoreCodestreamInput()) {
if (!dec->last_codestream_seen) {
// In case of jxlp boxes, this means no jxlp box marked as final was
// seen yet. Perhaps there is an empty jxlp box after this, this is not
// an error but will require more input.
return JXL_DEC_NEED_MORE_INPUT;
}
return JXL_API_ERROR("codestream never finished");
}
@ -2274,6 +2326,10 @@ JxlDecoderStatus PrepareSizeCheck(const JxlDecoder* dec,
// Don't know image dimensions yet, cannot check for valid size.
return JXL_DEC_NEED_MORE_INPUT;
}
if (!dec->coalescing &&
(!dec->frame_header || dec->frame_stage == FrameStage::kHeader)) {
return JXL_API_ERROR("Don't know frame dimensions yet");
}
if (format->num_channels > 4) {
return JXL_API_ERROR("More than 4 channels not supported");
}
@ -2292,6 +2348,22 @@ JxlDecoderStatus PrepareSizeCheck(const JxlDecoder* dec,
return JXL_DEC_SUCCESS;
}
// helper function to get the dimensions of the current image buffer
void GetCurrentDimensions(const JxlDecoder* dec, size_t& xsize, size_t& ysize,
bool oriented) {
xsize = dec->metadata.oriented_xsize(dec->keep_orientation || !oriented);
ysize = dec->metadata.oriented_ysize(dec->keep_orientation || !oriented);
if (!dec->coalescing) {
xsize = dec->frame_header->ToFrameDimensions().xsize;
ysize = dec->frame_header->ToFrameDimensions().ysize;
if (!dec->keep_orientation && oriented &&
static_cast<int>(dec->metadata.m.GetOrientation()) > 4) {
std::swap(xsize, ysize);
}
}
}
} // namespace
JxlDecoderStatus JxlDecoderFlushImage(JxlDecoder* dec) {
@ -2320,7 +2392,9 @@ JxlDecoderStatus JxlDecoderFlushImage(JxlDecoder* dec) {
// ConvertImageInternal.
size_t xsize = dec->ib->xsize();
size_t ysize = dec->ib->ysize();
dec->ib->ShrinkTo(dec->metadata.size.xsize(), dec->metadata.size.ysize());
size_t xsize_nopadding, ysize_nopadding;
GetCurrentDimensions(dec, xsize_nopadding, ysize_nopadding, false);
dec->ib->ShrinkTo(xsize_nopadding, ysize_nopadding);
JxlDecoderStatus status = jxl::ConvertImageInternal(
dec, *dec->ib, dec->image_out_format,
/*want_extra_channel=*/false,
@ -2413,15 +2487,14 @@ JXL_EXPORT JxlDecoderStatus JxlDecoderImageOutBufferSize(
if (format->num_channels < 3 && !dec->metadata.m.color_encoding.IsGray()) {
return JXL_API_ERROR("Grayscale output not possible for color image");
}
size_t xsize, ysize;
GetCurrentDimensions(dec, xsize, ysize, true);
size_t row_size =
jxl::DivCeil(dec->metadata.oriented_xsize(dec->keep_orientation) *
format->num_channels * bits,
jxl::kBitsPerByte);
jxl::DivCeil(xsize * format->num_channels * bits, jxl::kBitsPerByte);
if (format->align > 1) {
row_size = jxl::DivCeil(row_size, format->align) * format->align;
}
*size = row_size * dec->metadata.oriented_ysize(dec->keep_orientation);
*size = row_size * ysize;
return JXL_DEC_SUCCESS;
}
@ -2474,13 +2547,14 @@ JxlDecoderStatus JxlDecoderExtraChannelBufferSize(const JxlDecoder* dec,
JxlDecoderStatus status = PrepareSizeCheck(dec, format, &bits);
if (status != JXL_DEC_SUCCESS) return status;
size_t row_size = jxl::DivCeil(
dec->metadata.oriented_xsize(dec->keep_orientation) * num_channels * bits,
jxl::kBitsPerByte);
size_t xsize, ysize;
GetCurrentDimensions(dec, xsize, ysize, true);
size_t row_size =
jxl::DivCeil(xsize * num_channels * bits, jxl::kBitsPerByte);
if (format->align > 1) {
row_size = jxl::DivCeil(row_size, format->align) * format->align;
}
*size = row_size * dec->metadata.oriented_ysize(dec->keep_orientation);
*size = row_size * ysize;
return JXL_DEC_SUCCESS;
}
@ -2549,7 +2623,70 @@ JxlDecoderStatus JxlDecoderGetFrameHeader(const JxlDecoder* dec,
}
header->name_length = dec->frame_header->name.size();
header->is_last = dec->frame_header->is_last;
size_t xsize, ysize;
GetCurrentDimensions(dec, xsize, ysize, true);
header->layer_info.xsize = xsize;
header->layer_info.ysize = ysize;
if (!dec->coalescing && dec->frame_header->custom_size_or_origin) {
header->layer_info.crop_x0 = dec->frame_header->frame_origin.x0;
header->layer_info.crop_y0 = dec->frame_header->frame_origin.y0;
} else {
header->layer_info.crop_x0 = 0;
header->layer_info.crop_y0 = 0;
}
if (!dec->keep_orientation && !dec->coalescing) {
// orient the crop offset
size_t W = dec->metadata.oriented_xsize(false);
size_t H = dec->metadata.oriented_ysize(false);
if (metadata.orientation > 4) {
std::swap(header->layer_info.crop_x0, header->layer_info.crop_y0);
}
size_t o = (metadata.orientation - 1) & 3;
if (o > 0 && o < 3) {
header->layer_info.crop_x0 = W - xsize - header->layer_info.crop_x0;
}
if (o > 1) {
header->layer_info.crop_y0 = H - ysize - header->layer_info.crop_y0;
}
}
if (dec->coalescing) {
header->layer_info.blend_info.blendmode = JXL_BLEND_REPLACE;
header->layer_info.blend_info.source = 0;
header->layer_info.blend_info.alpha = 0;
header->layer_info.blend_info.clamp = JXL_FALSE;
header->layer_info.save_as_reference = 0;
} else {
header->layer_info.blend_info.blendmode =
static_cast<JxlBlendMode>(dec->frame_header->blending_info.mode);
header->layer_info.blend_info.source =
dec->frame_header->blending_info.source;
header->layer_info.blend_info.alpha =
dec->frame_header->blending_info.alpha_channel;
header->layer_info.blend_info.clamp =
dec->frame_header->blending_info.clamp;
header->layer_info.save_as_reference = dec->frame_header->save_as_reference;
}
return JXL_DEC_SUCCESS;
}
JxlDecoderStatus JxlDecoderGetExtraChannelBlendInfo(const JxlDecoder* dec,
size_t index,
JxlBlendInfo* blend_info) {
if (!dec->frame_header || dec->frame_stage == FrameStage::kHeader) {
return JXL_API_ERROR("no frame header available");
}
const auto& metadata = dec->metadata.m;
if (index >= metadata.num_extra_channels) {
return JXL_API_ERROR("Invalid extra channel index");
}
blend_info->blendmode = static_cast<JxlBlendMode>(
dec->frame_header->extra_channel_blending_info[index].mode);
blend_info->source =
dec->frame_header->extra_channel_blending_info[index].source;
blend_info->alpha =
dec->frame_header->extra_channel_blending_info[index].alpha_channel;
blend_info->clamp =
dec->frame_header->extra_channel_blending_info[index].clamp;
return JXL_DEC_SUCCESS;
}

441
third_party/jpeg-xl/lib/jxl/decode_test.cc vendored
View file

@ -1496,6 +1496,7 @@ TEST_P(DecodeTestParam, PixelTest) {
0};
jxl::CompressParams cparams;
cparams.SetLossless(); // Lossless to verify pixels exactly after roundtrip.
cparams.speed_tier = jxl::SpeedTier::kThunder;
cparams.resampling = config.upsampling;
cparams.ec_resampling = config.upsampling;
jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
@ -1747,6 +1748,7 @@ TEST(DecodeTest, PixelTestWithICCProfileLossless) {
JxlPixelFormat format_orig = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
jxl::CompressParams cparams;
cparams.SetLossless(); // Lossless to verify pixels exactly after roundtrip.
cparams.speed_tier = jxl::SpeedTier::kThunder;
// For variation: some have container and no preview, others have preview
// and no container.
jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
@ -2264,6 +2266,7 @@ TEST(DecodeTest, AlignTest) {
jxl::CompressParams cparams;
cparams.SetLossless(); // Lossless to verify pixels exactly after roundtrip.
cparams.speed_tier = jxl::SpeedTier::kThunder;
jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 4,
cparams, kCSBF_None, JXL_ORIENT_IDENTITY, false);
@ -2321,6 +2324,7 @@ TEST(DecodeTest, AnimationTest) {
jxl::CompressParams cparams;
cparams.SetLossless(); // Lossless to verify pixels exactly after roundtrip.
cparams.speed_tier = jxl::SpeedTier::kThunder;
jxl::AuxOut aux_out;
jxl::PaddedBytes compressed;
jxl::PassesEncoderState enc_state;
@ -2424,6 +2428,7 @@ TEST(DecodeTest, AnimationTestStreaming) {
jxl::CompressParams cparams;
cparams.SetLossless(); // Lossless to verify pixels exactly after roundtrip.
cparams.speed_tier = jxl::SpeedTier::kThunder;
jxl::AuxOut aux_out;
jxl::PaddedBytes compressed;
jxl::PassesEncoderState enc_state;
@ -2529,6 +2534,7 @@ TEST(DecodeTest, ExtraChannelTest) {
jxl::CompressParams cparams;
cparams.SetLossless(); // Lossless to verify pixels exactly after roundtrip.
cparams.speed_tier = jxl::SpeedTier::kThunder;
jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 4,
cparams, kCSBF_None, JXL_ORIENT_IDENTITY, false);
@ -2645,6 +2651,7 @@ TEST(DecodeTest, SkipFrameTest) {
jxl::CompressParams cparams;
cparams.SetLossless(); // Lossless to verify pixels exactly after roundtrip.
cparams.speed_tier = jxl::SpeedTier::kThunder;
jxl::AuxOut aux_out;
jxl::PaddedBytes compressed;
jxl::PassesEncoderState enc_state;
@ -2807,6 +2814,7 @@ TEST(DecodeTest, SkipFrameWithBlendingTest) {
jxl::CompressParams cparams;
cparams.SetLossless(); // Lossless to verify pixels exactly after roundtrip.
cparams.speed_tier = jxl::SpeedTier::kThunder;
jxl::AuxOut aux_out;
jxl::PaddedBytes compressed;
jxl::PassesEncoderState enc_state;
@ -2962,6 +2970,431 @@ TEST(DecodeTest, SkipFrameWithBlendingTest) {
JxlDecoderDestroy(dec);
}
TEST(DecodeTest, SkipFrameWithAlphaBlendingTest) {
size_t xsize = 90, ysize = 120;
constexpr size_t num_frames = 16;
std::vector<uint8_t> frames[num_frames + 5];
JxlPixelFormat format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
jxl::CodecInOut io;
io.SetSize(xsize, ysize);
io.metadata.m.SetUintSamples(16);
io.metadata.m.color_encoding = jxl::ColorEncoding::SRGB(false);
io.metadata.m.have_animation = true;
io.frames.clear();
io.frames.reserve(num_frames + 5);
io.SetSize(xsize, ysize);
std::vector<uint32_t> frame_durations_c;
std::vector<uint32_t> frame_durations_nc;
std::vector<uint32_t> frame_xsize, frame_ysize, frame_x0, frame_y0;
for (size_t i = 0; i < num_frames; ++i) {
size_t cropxsize = 1 + xsize * 2 / (i + 1);
size_t cropysize = 1 + ysize * 3 / (i + 2);
int cropx0 = i * 3 - 8;
int cropy0 = i * 4 - 7;
if (i < 5) {
std::vector<uint8_t> frame_internal =
jxl::test::GetSomeTestImage(xsize / 2, ysize / 2, 4, i * 2 + 1);
// An internal frame with 0 duration, and use_for_next_frame, this is a
// frame that is not rendered and not output by default by the API, but on
// which the rendered frames depend
jxl::ImageBundle bundle_internal(&io.metadata.m);
EXPECT_TRUE(ConvertFromExternal(
jxl::Span<const uint8_t>(frame_internal.data(),
frame_internal.size()),
xsize / 2, ysize / 2, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
/*has_alpha=*/true,
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
JXL_BIG_ENDIAN, /*flipped_y=*/false, /*pool=*/nullptr,
&bundle_internal, /*float_in=*/false));
bundle_internal.duration = 0;
bundle_internal.use_for_next_frame = true;
bundle_internal.origin = {13, 17};
io.frames.push_back(std::move(bundle_internal));
frame_durations_nc.push_back(0);
frame_xsize.push_back(xsize / 2);
frame_ysize.push_back(ysize / 2);
frame_x0.push_back(13);
frame_y0.push_back(17);
}
std::vector<uint8_t> frame =
jxl::test::GetSomeTestImage(cropxsize, cropysize, 4, i * 2);
// Actual rendered frame
jxl::ImageBundle bundle(&io.metadata.m);
EXPECT_TRUE(ConvertFromExternal(
jxl::Span<const uint8_t>(frame.data(), frame.size()), cropxsize,
cropysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
/*has_alpha=*/true,
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
JXL_BIG_ENDIAN, /*flipped_y=*/false, /*pool=*/nullptr, &bundle,
/*float_in=*/false));
bundle.duration = 5 + i;
frame_durations_nc.push_back(5 + i);
frame_durations_c.push_back(5 + i);
frame_xsize.push_back(cropxsize);
frame_ysize.push_back(cropysize);
frame_x0.push_back(cropx0);
frame_y0.push_back(cropy0);
bundle.origin = {cropx0, cropy0};
// Create some variation in which frames depend on which others.
if (i != 3 && i != 9 && i != 10) {
bundle.use_for_next_frame = true;
}
if (i != 12) {
bundle.blend = true;
bundle.blendmode = jxl::BlendMode::kBlend;
}
io.frames.push_back(std::move(bundle));
}
jxl::CompressParams cparams;
cparams.SetLossless(); // Lossless to verify pixels exactly after roundtrip.
cparams.speed_tier = jxl::SpeedTier::kThunder;
jxl::AuxOut aux_out;
jxl::PaddedBytes compressed;
jxl::PassesEncoderState enc_state;
EXPECT_TRUE(jxl::EncodeFile(cparams, &io, &enc_state, &compressed, &aux_out,
nullptr));
// try both with and without coalescing
for (auto coalescing : {JXL_TRUE, JXL_FALSE}) {
// Independently decode all frames without any skipping, to create the
// expected blended frames that the actual tests below compare against.
{
JxlDecoder* dec = JxlDecoderCreate(NULL);
const uint8_t* next_in = compressed.data();
size_t avail_in = compressed.size();
EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetCoalescing(dec, coalescing));
void* runner = JxlThreadParallelRunnerCreate(
NULL, JxlThreadParallelRunnerDefaultNumWorkerThreads());
EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetParallelRunner(
dec, JxlThreadParallelRunner, runner));
EXPECT_EQ(JXL_DEC_SUCCESS,
JxlDecoderSubscribeEvents(dec, JXL_DEC_FULL_IMAGE));
EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
for (size_t i = 0; i < num_frames + (coalescing ? 0 : 5); ++i) {
EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
size_t buffer_size;
EXPECT_EQ(JXL_DEC_SUCCESS,
JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
if (coalescing)
EXPECT_EQ(xsize * ysize * 8, buffer_size);
else
EXPECT_EQ(frame_xsize[i] * frame_ysize[i] * 8, buffer_size);
frames[i].resize(buffer_size);
EXPECT_EQ(JXL_DEC_SUCCESS,
JxlDecoderSetImageOutBuffer(dec, &format, frames[i].data(),
frames[i].size()));
EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
}
// After all frames were decoded, JxlDecoderProcessInput should return
// success to indicate all is done.
EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
JxlThreadParallelRunnerDestroy(runner);
JxlDecoderDestroy(dec);
}
JxlDecoder* dec = JxlDecoderCreate(NULL);
const uint8_t* next_in = compressed.data();
size_t avail_in = compressed.size();
EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetCoalescing(dec, coalescing));
void* runner = JxlThreadParallelRunnerCreate(
NULL, JxlThreadParallelRunnerDefaultNumWorkerThreads());
EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetParallelRunner(
dec, JxlThreadParallelRunner, runner));
EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(
dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME |
JXL_DEC_FULL_IMAGE));
EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
JxlBasicInfo info;
EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
for (size_t i = 0; i < num_frames; ++i) {
EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
size_t buffer_size;
EXPECT_EQ(JXL_DEC_SUCCESS,
JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
std::vector<uint8_t> pixels(buffer_size);
JxlFrameHeader frame_header;
EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header));
EXPECT_EQ((coalescing ? frame_durations_c[i] : frame_durations_nc[i]),
frame_header.duration);
EXPECT_EQ(i + 1 == num_frames, frame_header.is_last);
EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
EXPECT_EQ(JXL_DEC_SUCCESS,
JxlDecoderSetImageOutBuffer(dec, &format, pixels.data(),
pixels.size()));
EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
if (coalescing)
EXPECT_EQ(frame_header.layer_info.xsize, xsize);
else
EXPECT_EQ(frame_header.layer_info.xsize, frame_xsize[i]);
if (coalescing)
EXPECT_EQ(frame_header.layer_info.ysize, ysize);
else
EXPECT_EQ(frame_header.layer_info.ysize, frame_ysize[i]);
EXPECT_EQ(0u,
ComparePixels(frames[i].data(), pixels.data(),
frame_header.layer_info.xsize,
frame_header.layer_info.ysize, format, format));
// Test rewinding mid-way, not decoding all frames.
if (i == 8) {
break;
}
}
JxlDecoderRewind(dec);
EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(
dec, JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE));
EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
for (size_t i = 0; i < num_frames + (coalescing ? 0 : 5); ++i) {
if (i == 3) {
JxlDecoderSkipFrames(dec, 5);
i += 5;
}
EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
size_t buffer_size;
EXPECT_EQ(JXL_DEC_SUCCESS,
JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
std::vector<uint8_t> pixels(buffer_size);
JxlFrameHeader frame_header;
EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header));
EXPECT_EQ((coalescing ? frame_durations_c[i] : frame_durations_nc[i]),
frame_header.duration);
EXPECT_EQ(i + 1 == num_frames + (coalescing ? 0 : 5),
frame_header.is_last);
EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
EXPECT_EQ(JXL_DEC_SUCCESS,
JxlDecoderSetImageOutBuffer(dec, &format, pixels.data(),
pixels.size()));
EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
if (coalescing) {
EXPECT_EQ(frame_header.layer_info.xsize, xsize);
EXPECT_EQ(frame_header.layer_info.ysize, ysize);
EXPECT_EQ(frame_header.layer_info.crop_x0, 0);
EXPECT_EQ(frame_header.layer_info.crop_y0, 0);
} else {
EXPECT_EQ(frame_header.layer_info.xsize, frame_xsize[i]);
EXPECT_EQ(frame_header.layer_info.ysize, frame_ysize[i]);
EXPECT_EQ(frame_header.layer_info.crop_x0, frame_x0[i]);
EXPECT_EQ(frame_header.layer_info.crop_y0, frame_y0[i]);
EXPECT_EQ(frame_header.layer_info.blend_info.blendmode,
i != 12 + 5 && frame_header.duration != 0
? 2
: 0); // kBlend or the default kReplace
}
EXPECT_EQ(0u,
ComparePixels(frames[i].data(), pixels.data(),
frame_header.layer_info.xsize,
frame_header.layer_info.ysize, format, format));
}
// After all frames were decoded, JxlDecoderProcessInput should return
// success to indicate all is done.
EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
// Test rewinding the decoder and skipping different frames
JxlDecoderRewind(dec);
EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(
dec, JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE));
EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
for (size_t i = 0; i < num_frames + (coalescing ? 0 : 5); ++i) {
int test_skipping = (i == 9) ? 3 : 0;
EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
size_t buffer_size;
EXPECT_EQ(JXL_DEC_SUCCESS,
JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
std::vector<uint8_t> pixels(buffer_size);
// Since this is after JXL_DEC_FRAME but before JXL_DEC_FULL_IMAGE, this
// should only skip the next frame, not the currently processed one.
if (test_skipping) JxlDecoderSkipFrames(dec, test_skipping);
JxlFrameHeader frame_header;
EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header));
EXPECT_EQ((coalescing ? frame_durations_c[i] : frame_durations_nc[i]),
frame_header.duration);
EXPECT_EQ(i + 1 == num_frames + (coalescing ? 0 : 5),
frame_header.is_last);
EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
EXPECT_EQ(JXL_DEC_SUCCESS,
JxlDecoderSetImageOutBuffer(dec, &format, pixels.data(),
pixels.size()));
EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
EXPECT_EQ(0u,
ComparePixels(frames[i].data(), pixels.data(),
frame_header.layer_info.xsize,
frame_header.layer_info.ysize, format, format));
if (test_skipping) i += test_skipping;
}
JxlThreadParallelRunnerDestroy(runner);
JxlDecoderDestroy(dec);
}
}
TEST(DecodeTest, OrientedCroppedFrameTest) {
size_t xsize = 90, ysize = 120;
JxlPixelFormat format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
for (bool keep_orientation : {true, false}) {
for (uint32_t orientation = 1; orientation <= 8; orientation++) {
size_t oxsize = (!keep_orientation && orientation > 4 ? ysize : xsize);
size_t oysize = (!keep_orientation && orientation > 4 ? xsize : ysize);
jxl::CodecInOut io;
io.SetSize(xsize, ysize);
io.metadata.m.SetUintSamples(16);
io.metadata.m.color_encoding = jxl::ColorEncoding::SRGB(false);
io.metadata.m.orientation = orientation;
io.frames.clear();
io.SetSize(xsize, ysize);
for (size_t i = 0; i < 3; ++i) {
size_t cropxsize = 1 + xsize * 2 / (i + 1);
size_t cropysize = 1 + ysize * 3 / (i + 2);
int cropx0 = i * 3 - 8;
int cropy0 = i * 4 - 7;
std::vector<uint8_t> frame =
jxl::test::GetSomeTestImage(cropxsize, cropysize, 4, i * 2);
jxl::ImageBundle bundle(&io.metadata.m);
EXPECT_TRUE(ConvertFromExternal(
jxl::Span<const uint8_t>(frame.data(), frame.size()), cropxsize,
cropysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
/*has_alpha=*/true,
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
JXL_BIG_ENDIAN, /*flipped_y=*/false, /*pool=*/nullptr, &bundle,
/*float_in=*/false));
bundle.origin = {cropx0, cropy0};
bundle.use_for_next_frame = true;
io.frames.push_back(std::move(bundle));
}
jxl::CompressParams cparams;
cparams
.SetLossless(); // Lossless to verify pixels exactly after roundtrip.
cparams.speed_tier = jxl::SpeedTier::kThunder;
jxl::AuxOut aux_out;
jxl::PaddedBytes compressed;
jxl::PassesEncoderState enc_state;
EXPECT_TRUE(jxl::EncodeFile(cparams, &io, &enc_state, &compressed,
&aux_out, nullptr));
// 0 is the merged frame as decoded with coalescing enabled (the default),
// 1-3 are the non-coalesced frames as decoded with coalescing disabled,
// 4 is the manually merged frame
std::vector<uint8_t> frames[5];
frames[4].resize(xsize * ysize * 8, 0);
// try both with and without coalescing
for (auto coalescing : {JXL_TRUE, JXL_FALSE}) {
// Independently decode all frames without any skipping, to create the
// expected blended frames that the actual tests below compare against.
{
JxlDecoder* dec = JxlDecoderCreate(NULL);
const uint8_t* next_in = compressed.data();
size_t avail_in = compressed.size();
EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetCoalescing(dec, coalescing));
EXPECT_EQ(JXL_DEC_SUCCESS,
JxlDecoderSetKeepOrientation(dec, keep_orientation));
void* runner = JxlThreadParallelRunnerCreate(
NULL, JxlThreadParallelRunnerDefaultNumWorkerThreads());
EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetParallelRunner(
dec, JxlThreadParallelRunner, runner));
EXPECT_EQ(JXL_DEC_SUCCESS,
JxlDecoderSubscribeEvents(dec, JXL_DEC_FULL_IMAGE));
EXPECT_EQ(JXL_DEC_SUCCESS,
JxlDecoderSetInput(dec, next_in, avail_in));
for (size_t i = (coalescing ? 0 : 1); i < (coalescing ? 1 : 4); ++i) {
EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER,
JxlDecoderProcessInput(dec));
JxlFrameHeader frame_header;
EXPECT_EQ(JXL_DEC_SUCCESS,
JxlDecoderGetFrameHeader(dec, &frame_header));
size_t buffer_size;
EXPECT_EQ(JXL_DEC_SUCCESS,
JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
if (coalescing)
EXPECT_EQ(xsize * ysize * 8, buffer_size);
else
EXPECT_EQ(frame_header.layer_info.xsize *
frame_header.layer_info.ysize * 8,
buffer_size);
frames[i].resize(buffer_size);
EXPECT_EQ(JXL_DEC_SUCCESS,
JxlDecoderSetImageOutBuffer(
dec, &format, frames[i].data(), frames[i].size()));
EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
EXPECT_EQ(frame_header.layer_info.blend_info.blendmode,
JXL_BLEND_REPLACE);
if (coalescing) {
EXPECT_EQ(frame_header.layer_info.xsize, oxsize);
EXPECT_EQ(frame_header.layer_info.ysize, oysize);
EXPECT_EQ(frame_header.layer_info.crop_x0, 0);
EXPECT_EQ(frame_header.layer_info.crop_y0, 0);
} else {
// manually merge this layer
int x0 = frame_header.layer_info.crop_x0;
int y0 = frame_header.layer_info.crop_y0;
int w = frame_header.layer_info.xsize;
int h = frame_header.layer_info.ysize;
for (int y = 0; y < static_cast<int>(oysize); y++) {
if (y < y0 || y >= y0 + h) continue;
// the pointers address whole 16-bit RGBA pixels (8 bytes) at a time
uint64_t* row_merged = static_cast<uint64_t*>(
(void*)(frames[4].data() + y * oxsize * 8));
uint64_t* row_layer = static_cast<uint64_t*>(
(void*)(frames[i].data() + (y - y0) * w * 8));
for (int x = 0; x < static_cast<int>(oxsize); x++) {
if (x < x0 || x >= x0 + w) continue;
row_merged[x] = row_layer[x - x0];
}
}
}
}
// After all frames were decoded, JxlDecoderProcessInput should return
// success to indicate all is done.
EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
JxlThreadParallelRunnerDestroy(runner);
JxlDecoderDestroy(dec);
}
}
EXPECT_EQ(0u, ComparePixels(frames[0].data(), frames[4].data(), oxsize,
oysize, format, format));
}
}
}
TEST(DecodeTest, FlushTest) {
// Size large enough for multiple groups, required to have progressive
// stages
@ -3187,6 +3620,7 @@ TEST(DecodeTest, FlushTestLosslessProgressiveAlpha) {
jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0);
jxl::CompressParams cparams;
cparams.SetLossless();
cparams.speed_tier = jxl::SpeedTier::kThunder;
cparams.responsive = 1;
jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
@ -3235,7 +3669,7 @@ TEST(DecodeTest, FlushTestLosslessProgressiveAlpha) {
EXPECT_LE(ComparePixels(pixels2.data(), pixels.data(), xsize, ysize, format,
format, 2560.0),
1700u);
2700u);
EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec));
@ -3371,6 +3805,10 @@ TEST(DecodeTest, ContinueFinalNonEssentialBoxTest) {
EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
// The decoder returns success despite not yet having seen the final unknown
// box. Calling JxlDecoderCloseInput is not mandatory (for backwards
// compatibility), so the decoder cannot know that more bytes follow; the
// bytes consumed so far end at a perfectly valid place.
EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
size_t remaining = JxlDecoderReleaseInput(dec);
@ -3648,6 +4086,7 @@ TEST(DecodeTest, PartialCodestreamBoxTest) {
JxlPixelFormat format_orig = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
jxl::CompressParams cparams;
cparams.SetLossless(); // Lossless to verify pixels exactly after roundtrip.
cparams.speed_tier = jxl::SpeedTier::kThunder;
jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 4,
cparams, kCSBF_Multi, JXL_ORIENT_IDENTITY, false, true);
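
As a quick orientation to the decoder entry points these new tests exercise
(JxlDecoderSetCoalescing, JxlDecoderSkipFrames, per-frame output buffers), the
following is only an illustrative sketch, not part of the patch; the function
name CountDecodedFrames and the data/size inputs are made up for the example,
and it assumes a complete JPEG XL animation in memory.

#include <jxl/decode.h>

#include <cstddef>
#include <cstdint>
#include <vector>

// Hedged sketch: walk every frame with coalescing disabled, and once two
// frames have been output, skip the next two.
size_t CountDecodedFrames(const uint8_t* data, size_t size) {
  JxlDecoder* dec = JxlDecoderCreate(nullptr);
  if (!dec) return 0;
  JxlPixelFormat format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
  size_t decoded = 0;
  std::vector<uint8_t> pixels;  // output buffer, reused for every frame
  if (JxlDecoderSetCoalescing(dec, JXL_FALSE) == JXL_DEC_SUCCESS &&
      JxlDecoderSubscribeEvents(dec, JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE) ==
          JXL_DEC_SUCCESS &&
      JxlDecoderSetInput(dec, data, size) == JXL_DEC_SUCCESS) {
    for (;;) {
      JxlDecoderStatus status = JxlDecoderProcessInput(dec);
      if (status == JXL_DEC_FRAME) {
        // Called between JXL_DEC_FRAME and JXL_DEC_FULL_IMAGE, this skips the
        // *next* frames, not the one currently being decoded.
        if (decoded == 2) JxlDecoderSkipFrames(dec, 2);
      } else if (status == JXL_DEC_NEED_IMAGE_OUT_BUFFER) {
        size_t buffer_size;
        if (JxlDecoderImageOutBufferSize(dec, &format, &buffer_size) !=
            JXL_DEC_SUCCESS) {
          break;
        }
        pixels.resize(buffer_size);
        if (JxlDecoderSetImageOutBuffer(dec, &format, pixels.data(),
                                        pixels.size()) != JXL_DEC_SUCCESS) {
          break;
        }
      } else if (status == JXL_DEC_FULL_IMAGE) {
        ++decoded;  // pixels now holds this frame (or layer, when uncoalesced)
      } else {
        break;  // JXL_DEC_SUCCESS when done, JXL_DEC_ERROR otherwise
      }
    }
  }
  JxlDecoderDestroy(dec);
  return decoded;
}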

View file

@ -429,8 +429,8 @@ float EstimateEntropy(const AcStrategy& acs, size_t x, size_t y,
}
entropy_v += nzeros_v * cost1;
entropy += GetLane(SumOfLanes(entropy_v));
size_t num_nzeros = GetLane(SumOfLanes(nzeros_v));
entropy += GetLane(SumOfLanes(df, entropy_v));
size_t num_nzeros = GetLane(SumOfLanes(df, nzeros_v));
// Add #bit of num_nonzeros, as an estimate of the cost for encoding the
// number of non-zeros of the block.
size_t nbits = CeilLog2Nonzero(num_nzeros + 1) + 1;
@ -441,9 +441,9 @@ float EstimateEntropy(const AcStrategy& acs, size_t x, size_t y,
float ret =
entropy +
masking *
((config.info_loss_multiplier * GetLane(SumOfLanes(info_loss))) +
((config.info_loss_multiplier * GetLane(SumOfLanes(df, info_loss))) +
(config.info_loss_multiplier2 *
sqrt(num_blocks * GetLane(SumOfLanes(info_loss2)))));
sqrt(num_blocks * GetLane(SumOfLanes(df, info_loss2)))));
return ret;
}
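
The SumOfLanes changes in this hunk (and in the adaptive-quantization hunks
below) reflect the updated highway reduction API, which now takes the SIMD
descriptor as its first argument. The snippet below is a minimal
static-dispatch sketch of the new call pattern, not code from this patch; the
function name SumArray is illustrative, and only the baseline target and
hwy/highway.h are assumed.

#include <cstddef>

#include "hwy/highway.h"

namespace hwy {
namespace HWY_NAMESPACE {

// Sum a float array with the statically selected target; note that
// SumOfLanes now takes the descriptor `d` before the vector to reduce.
float SumArray(const float* HWY_RESTRICT data, size_t n) {
  const HWY_FULL(float) d;
  auto acc = Zero(d);
  size_t i = 0;
  for (; i + Lanes(d) <= n; i += Lanes(d)) {
    acc += LoadU(d, data + i);
  }
  float total = GetLane(SumOfLanes(d, acc));  // sum broadcast to every lane
  for (; i < n; ++i) total += data[i];        // scalar tail
  return total;
}

}  // namespace HWY_NAMESPACE
}  // namespace hwy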

View file

@ -189,7 +189,7 @@ V GammaModulation(const D d, const size_t x, const size_t y,
overall_ratio += avg_ratio;
}
}
overall_ratio = SumOfLanes(overall_ratio);
overall_ratio = SumOfLanes(d, overall_ratio);
overall_ratio *= Set(d, 1.0f / 64);
// ideally -1.0, but likely optimal correction adds some entropy, so slightly
// less than that.
@ -246,12 +246,12 @@ V ColorModulation(const D d, const size_t x, const size_t y,
// blue we consider as if it was fully red or blue.
static const float ratio = 30.610615782142737f; // out of 64 pixels.
auto overall_red_coverage = SumOfLanes(red_coverage);
auto overall_red_coverage = SumOfLanes(d, red_coverage);
overall_red_coverage =
Min(overall_red_coverage, Set(d, ratio * kRedRampLength));
overall_red_coverage *= Set(d, red_strength / ratio);
auto overall_blue_coverage = SumOfLanes(blue_coverage);
auto overall_blue_coverage = SumOfLanes(d, blue_coverage);
overall_blue_coverage =
Min(overall_blue_coverage, Set(d, ratio * kBlueRampLength));
overall_blue_coverage *= Set(d, blue_strength / ratio);
@ -295,7 +295,7 @@ V HfModulation(const D d, const size_t x, const size_t y, const ImageF& xyb,
}
}
sum = SumOfLanes(sum);
sum = SumOfLanes(d, sum);
return MulAdd(sum, Set(d, -2.0052193233688884f / 112), out_val);
}

View file

@ -157,7 +157,7 @@ void ProcessTile(const Image3F& opsin, PassesEncoderState* enc_state,
sum += LoadU(df4, rows_in[iy] + x * 4 + ix + 2);
}
}
row_out[x] = GetLane(Sqrt(SumOfLanes(sum))) * (1.0f / 4.0f);
row_out[x] = GetLane(Sqrt(SumOfLanes(df4, sum))) * (1.0f / 4.0f);
}
}
// Indexing iy and ix is a bit tricky as we include a 2 pixel border
@ -193,7 +193,7 @@ void ProcessTile(const Image3F& opsin, PassesEncoderState* enc_state,
sum += Load(df4, rows_in[iy] + sx + ix);
}
}
row_out[x] = GetLane(Sqrt(SumOfLanes(sum))) * (1.0f / 4.0f);
row_out[x] = GetLane(Sqrt(SumOfLanes(df4, sum))) * (1.0f / 4.0f);
} else {
float sum = 0;
for (size_t iy = sy; iy < ey; iy++) {

Some files were not shown because too many files have changed in this diff.