Bug 1707590 - Part 1: Add vendored libjxl source r=aosmond

Differential Revision: https://phabricator.services.mozilla.com/D113358
2021-05-05 17:33:52 +00:00 · 2021-05-05 17:33:52 +00:00 · 784f1185b9
--- a/config/external/moz.build
+++ b/config/external/moz.build
@ -55,6 +55,9 @@ if CONFIG["CPU_ARCH"] == "arm":
 if CONFIG["MOZ_FFVPX"]:
    external_dirs += ["media/ffvpx"]

+if CONFIG["MOZ_JXL"]:
+    external_dirs += ["media/libjxl", "media/highway"]
+
 external_dirs += [
    "media/kiss_fft",
    "media/libcubeb",
--- a/media/highway/moz.build
+++ b/media/highway/moz.build
@ -0,0 +1,41 @@
+# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+LOCAL_INCLUDES += [
+    "/third_party/highway/",
+]
+
+SOURCES += [
+    "/third_party/highway/contrib/image/image.cc",
+    "/third_party/highway/hwy/aligned_allocator.cc",
+    "/third_party/highway/hwy/targets.cc",
+]
+
+EXPORTS.hwy += [
+    "/third_party/highway/hwy/aligned_allocator.h",
+    "/third_party/highway/hwy/base.h",
+    "/third_party/highway/hwy/cache_control.h",
+    "/third_party/highway/hwy/foreach_target.h",
+    "/third_party/highway/hwy/highway.h",
+    "/third_party/highway/hwy/targets.h",
+]
+
+EXPORTS.hwy.ops += [
+    "/third_party/highway/hwy/ops/arm_neon-inl.h",
+    "/third_party/highway/hwy/ops/rvv-inl.h",
+    "/third_party/highway/hwy/ops/scalar-inl.h",
+    "/third_party/highway/hwy/ops/set_macros-inl.h",
+    "/third_party/highway/hwy/ops/shared-inl.h",
+    "/third_party/highway/hwy/ops/wasm_128-inl.h",
+    "/third_party/highway/hwy/ops/x86_128-inl.h",
+    "/third_party/highway/hwy/ops/x86_256-inl.h",
+    "/third_party/highway/hwy/ops/x86_512-inl.h",
+]
+
+FINAL_LIBRARY = "gkmedias"
+
+# We allow warnings for third-party code that can be updated from upstream.
+AllowCompilerWarnings()
--- a/media/highway/moz.yaml
+++ b/media/highway/moz.yaml
@ -0,0 +1,43 @@
+# Version of this schema
+schema: 1
+
+bugzilla:
+  # Bugzilla product and component for this directory and subdirectories
+  product: Core
+  component: "ImageLib"
+
+# Document the source of externally hosted code
+origin:
+
+  # Short name of the package/library
+  name: highway
+
+  description: Performance-portable, length-agnostic SIMD with runtime dispatch
+
+  # Full URL for the package's homepage/etc
+  # Usually different from repository url
+  url: https://github.com/google/highway
+
+  # Human-readable identifier for this version/release
+  # Generally "version NNN", "tag SSS", "bookmark SSS"
+  release: commit ca1a57c342cd815053abfcffa29b44eaead4f20b (2021-04-15T17:54:35Z).
+
+  # Revision to pull in
+  # Must be a long or short commit SHA (long preferred)
+  revision: ca1a57c342cd815053abfcffa29b44eaead4f20b
+
+  # The package's license, where possible using the mnemonic from
+  # https://spdx.org/licenses/
+  # Multiple licenses can be specified (as a YAML list)
+  # A "LICENSE" file must exist containing the full license text
+  license: Apache-2.0
+
+  license-file: LICENSE
+
+vendoring:
+  url: https://github.com/google/highway.git
+  source-hosting: github
+  vendor-directory: third_party/jpeg-xl/third_party/highway
+
+  exclude:
+    - g3doc/
--- a/media/libjxl/include/jxl/jxl_export.h
+++ b/media/libjxl/include/jxl/jxl_export.h
@ -0,0 +1,12 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim: set ts=8 sts=2 et sw=2 tw=80: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef JXL_EXPORT_H
+#define JXL_EXPORT_H
+
+#define JXL_EXPORT
+
+#endif /* JXL_EXPORT_H */
--- a/media/libjxl/include/jxl/jxl_threads_export.h
+++ b/media/libjxl/include/jxl/jxl_threads_export.h
@ -0,0 +1,12 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim: set ts=8 sts=2 et sw=2 tw=80: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef JXL_THREADS_EXPORT_H
+#define JXL_THREADS_EXPORT_H
+
+#define JXL_THREADS_EXPORT
+
+#endif /* JXL_THREADS_EXPORT_H */
--- a/media/libjxl/moz.build
+++ b/media/libjxl/moz.build
@ -0,0 +1,114 @@
+# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
+# vim: set filetype=python:
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+LOCAL_INCLUDES += [
+    "./include/",
+    "/third_party/jpeg-xl/",
+    "/third_party/jpeg-xl/lib/include/",
+]
+
+SOURCES += [
+    "/third_party/jpeg-xl/lib/jxl/ac_strategy.cc",
+    "/third_party/jpeg-xl/lib/jxl/alpha.cc",
+    "/third_party/jpeg-xl/lib/jxl/ans_common.cc",
+    "/third_party/jpeg-xl/lib/jxl/aux_out.cc",
+    "/third_party/jpeg-xl/lib/jxl/base/cache_aligned.cc",
+    "/third_party/jpeg-xl/lib/jxl/base/data_parallel.cc",
+    "/third_party/jpeg-xl/lib/jxl/base/descriptive_statistics.cc",
+    "/third_party/jpeg-xl/lib/jxl/base/padded_bytes.cc",
+    "/third_party/jpeg-xl/lib/jxl/base/status.cc",
+    "/third_party/jpeg-xl/lib/jxl/base/time.cc",
+    "/third_party/jpeg-xl/lib/jxl/blending.cc",
+    "/third_party/jpeg-xl/lib/jxl/chroma_from_luma.cc",
+    "/third_party/jpeg-xl/lib/jxl/coeff_order.cc",
+    "/third_party/jpeg-xl/lib/jxl/color_encoding_internal.cc",
+    "/third_party/jpeg-xl/lib/jxl/color_management.cc",
+    "/third_party/jpeg-xl/lib/jxl/compressed_dc.cc",
+    "/third_party/jpeg-xl/lib/jxl/convolve.cc",
+    "/third_party/jpeg-xl/lib/jxl/dct_scales.cc",
+    "/third_party/jpeg-xl/lib/jxl/dec_ans.cc",
+    "/third_party/jpeg-xl/lib/jxl/dec_context_map.cc",
+    "/third_party/jpeg-xl/lib/jxl/dec_external_image.cc",
+    "/third_party/jpeg-xl/lib/jxl/dec_frame.cc",
+    "/third_party/jpeg-xl/lib/jxl/dec_group.cc",
+    "/third_party/jpeg-xl/lib/jxl/dec_group_border.cc",
+    "/third_party/jpeg-xl/lib/jxl/dec_huffman.cc",
+    "/third_party/jpeg-xl/lib/jxl/dec_modular.cc",
+    "/third_party/jpeg-xl/lib/jxl/dec_noise.cc",
+    "/third_party/jpeg-xl/lib/jxl/dec_patch_dictionary.cc",
+    "/third_party/jpeg-xl/lib/jxl/dec_reconstruct.cc",
+    "/third_party/jpeg-xl/lib/jxl/dec_upsample.cc",
+    "/third_party/jpeg-xl/lib/jxl/dec_xyb.cc",
+    "/third_party/jpeg-xl/lib/jxl/decode.cc",
+    "/third_party/jpeg-xl/lib/jxl/enc_bit_writer.cc",
+    "/third_party/jpeg-xl/lib/jxl/entropy_coder.cc",
+    "/third_party/jpeg-xl/lib/jxl/epf.cc",
+    "/third_party/jpeg-xl/lib/jxl/fields.cc",
+    "/third_party/jpeg-xl/lib/jxl/filters.cc",
+    "/third_party/jpeg-xl/lib/jxl/frame_header.cc",
+    "/third_party/jpeg-xl/lib/jxl/gauss_blur.cc",
+    "/third_party/jpeg-xl/lib/jxl/headers.cc",
+    "/third_party/jpeg-xl/lib/jxl/huffman_table.cc",
+    "/third_party/jpeg-xl/lib/jxl/icc_codec.cc",
+    "/third_party/jpeg-xl/lib/jxl/icc_codec_common.cc",
+    "/third_party/jpeg-xl/lib/jxl/image.cc",
+    "/third_party/jpeg-xl/lib/jxl/image_bundle.cc",
+    "/third_party/jpeg-xl/lib/jxl/image_metadata.cc",
+    "/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data.cc",
+    "/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data_writer.cc",
+    "/third_party/jpeg-xl/lib/jxl/jpeg/jpeg_data.cc",
+    "/third_party/jpeg-xl/lib/jxl/loop_filter.cc",
+    "/third_party/jpeg-xl/lib/jxl/luminance.cc",
+    "/third_party/jpeg-xl/lib/jxl/memory_manager_internal.cc",
+    "/third_party/jpeg-xl/lib/jxl/modular/encoding/dec_ma.cc",
+    "/third_party/jpeg-xl/lib/jxl/modular/encoding/encoding.cc",
+    "/third_party/jpeg-xl/lib/jxl/modular/modular_image.cc",
+    "/third_party/jpeg-xl/lib/jxl/modular/transform/transform.cc",
+    "/third_party/jpeg-xl/lib/jxl/opsin_params.cc",
+    "/third_party/jpeg-xl/lib/jxl/passes_state.cc",
+    "/third_party/jpeg-xl/lib/jxl/quant_weights.cc",
+    "/third_party/jpeg-xl/lib/jxl/quantizer.cc",
+    "/third_party/jpeg-xl/lib/jxl/splines.cc",
+    "/third_party/jpeg-xl/lib/jxl/toc.cc",
+]
+
+SOURCES += [
+    "/third_party/jpeg-xl/lib/threads/thread_parallel_runner.cc",
+    "/third_party/jpeg-xl/lib/threads/thread_parallel_runner_internal.cc",
+]
+
+DEFINES["JPEGXL_MAJOR_VERSION"] = "0"
+DEFINES["JPEGXL_MINOR_VERSION"] = "0"
+DEFINES["JPEGXL_PATCH_VERSION"] = "0"
+
+EXPORTS.jxl += [
+    "./include/jxl/jxl_export.h",
+    "./include/jxl/jxl_threads_export.h",
+    "/third_party/jpeg-xl/lib/include/jxl/butteraugli.h",
+    "/third_party/jpeg-xl/lib/include/jxl/butteraugli_cxx.h",
+    "/third_party/jpeg-xl/lib/include/jxl/codestream_header.h",
+    "/third_party/jpeg-xl/lib/include/jxl/color_encoding.h",
+    "/third_party/jpeg-xl/lib/include/jxl/decode.h",
+    "/third_party/jpeg-xl/lib/include/jxl/decode_cxx.h",
+    "/third_party/jpeg-xl/lib/include/jxl/encode.h",
+    "/third_party/jpeg-xl/lib/include/jxl/encode_cxx.h",
+    "/third_party/jpeg-xl/lib/include/jxl/memory_manager.h",
+    "/third_party/jpeg-xl/lib/include/jxl/parallel_runner.h",
+    "/third_party/jpeg-xl/lib/include/jxl/thread_parallel_runner.h",
+    "/third_party/jpeg-xl/lib/include/jxl/thread_parallel_runner_cxx.h",
+    "/third_party/jpeg-xl/lib/include/jxl/types.h",
+]
+
+FINAL_LIBRARY = "gkmedias"
+
+# We allow warnings for third-party code that can be updated from upstream.
+AllowCompilerWarnings()
+
+# Clang 5.0 has a compiler bug that prevents build in c++17
+# See https://gitlab.com/wg1/jpeg-xl/-/issues/227
+# This should be okay since we are using the C API.
+if CONFIG["CC_TYPE"] == "clang":
+    CXXFLAGS += ["-std=c++11"]
--- a/media/libjxl/moz.yaml
+++ b/media/libjxl/moz.yaml
@ -0,0 +1,53 @@
+# Version of this schema
+schema: 1
+
+bugzilla:
+  # Bugzilla product and component for this directory and subdirectories
+  product: Core
+  component: "ImageLib"
+
+# Document the source of externally hosted code
+origin:
+
+  # Short name of the package/library
+  name: jpeg-xl
+
+  description: JPEG XL image format reference implementation
+
+  # Full URL for the package's homepage/etc
+  # Usually different from repository url
+  url: https://gitlab.com/wg1/jpeg-xl
+
+  # Human-readable identifier for this version/release
+  # Generally "version NNN", "tag SSS", "bookmark SSS"
+  release: commit 9a8f5195e4d1c45112fd65f184ebe115f4163ba2 (2021-05-04T13:15:00.000+02:00).
+
+  # Revision to pull in
+  # Must be a long or short commit SHA (long preferred)
+  # NOTE(krosylight): Update highway together when updating this!
+  revision: 9a8f5195e4d1c45112fd65f184ebe115f4163ba2
+
+  # The package's license, where possible using the mnemonic from
+  # https://spdx.org/licenses/
+  # Multiple licenses can be specified (as a YAML list)
+  # A "LICENSE" file must exist containing the full license text
+  license: Apache-2.0
+
+  license-file: LICENSE
+
+updatebot:
+    maintainer-phab: saschanaz
+    maintainer-bz: krosylight@mozilla.com
+    tasks:
+      - type: vendoring
+        enabled: True
+
+vendoring:
+  url: https://gitlab.com/wg1/jpeg-xl.git
+  source-hosting: gitlab
+  vendor-directory: third_party/jpeg-xl
+
+  exclude:
+    - doc/
+    - third_party/testdata/
+    - tools/
--- a/third_party/highway/CMakeLists.txt
+++ b/third_party/highway/CMakeLists.txt
@ -0,0 +1,300 @@
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+cmake_minimum_required(VERSION 3.10)
+
+# Set PIE flags for POSITION_INDEPENDENT_CODE targets, added in 3.14.
+if(POLICY CMP0083)
+  cmake_policy(SET CMP0083 NEW)
+endif()
+
+project(hwy VERSION 0.1)
+
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_EXTENSIONS OFF)
+set(CMAKE_CXX_STANDARD_REQUIRED YES)
+
+# Enabled PIE binaries by default if supported.
+include(CheckPIESupported OPTIONAL RESULT_VARIABLE CHECK_PIE_SUPPORTED)
+if(CHECK_PIE_SUPPORTED)
+  check_pie_supported(LANGUAGES CXX)
+  if(CMAKE_CXX_LINK_PIE_SUPPORTED)
+    set(CMAKE_POSITION_INDEPENDENT_CODE TRUE)
+  endif()
+endif()
+
+include(GNUInstallDirs)
+
+if (NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE RelWithDebInfo)
+endif()
+
+include(CheckCXXSourceCompiles)
+check_cxx_source_compiles(
+   "int main() {
+      #if !defined(__EMSCRIPTEN__)
+      static_assert(false, \"__EMSCRIPTEN__ is not defined\");
+      #endif
+      return 0;
+    }"
+  HWY_EMSCRIPTEN
+)
+
+set(HWY_SOURCES
+    contrib/image/image.cc
+    contrib/image/image.h
+    contrib/math/math-inl.h
+    hwy/aligned_allocator.cc
+    hwy/aligned_allocator.h
+    hwy/base.h
+    hwy/cache_control.h
+    hwy/foreach_target.h
+    hwy/highway.h
+    hwy/nanobenchmark.cc
+    hwy/nanobenchmark.h
+    hwy/ops/arm_neon-inl.h
+    hwy/ops/scalar-inl.h
+    hwy/ops/set_macros-inl.h
+    hwy/ops/shared-inl.h
+    hwy/ops/wasm_128-inl.h
+    hwy/ops/x86_128-inl.h
+    hwy/ops/x86_256-inl.h
+    hwy/ops/x86_512-inl.h
+    hwy/targets.cc
+    hwy/targets.h
+    hwy/tests/test_util-inl.h
+)
+
+if (MSVC)
+  # TODO(janwas): add flags
+else()
+  set(HWY_FLAGS
+    # Avoid changing binaries based on the current time and date.
+    -Wno-builtin-macro-redefined
+    -D__DATE__="redacted"
+    -D__TIMESTAMP__="redacted"
+    -D__TIME__="redacted"
+
+    # Optimizations
+    -fmerge-all-constants
+
+    # Warnings
+    -Wall
+    -Wextra
+    -Wformat-security
+    -Wno-unused-function
+    -Wnon-virtual-dtor
+    -Woverloaded-virtual
+    -Wvla
+  )
+
+  if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
+    list(APPEND HWY_FLAGS
+      -Wc++2a-extensions
+      -Wfloat-overflow-conversion
+      -Wfloat-zero-conversion
+      -Wfor-loop-analysis
+      -Wgnu-redeclared-enum
+      -Winfinite-recursion
+      -Wself-assign
+      -Wstring-conversion
+      -Wtautological-overlap-compare
+      -Wthread-safety-analysis
+      -Wundefined-func-template
+
+      -fno-cxx-exceptions
+      -fno-slp-vectorize
+      -fno-vectorize
+
+      # Use color in messages
+      -fdiagnostics-show-option -fcolor-diagnostics
+    )
+  endif()
+
+  if (WIN32)
+    list(APPEND HWY_FLAGS
+      -Wno-c++98-compat-pedantic
+      -Wno-cast-align
+      -Wno-double-promotion
+      -Wno-float-equal
+      -Wno-format-nonliteral
+      -Wno-global-constructors
+      -Wno-language-extension-token
+      -Wno-missing-prototypes
+      -Wno-shadow
+      -Wno-shadow-field-in-constructor
+      -Wno-sign-conversion
+      -Wno-unused-member-function
+      -Wno-unused-template
+      -Wno-used-but-marked-unused
+      -Wno-zero-as-null-pointer-constant
+    )
+  else()
+    list(APPEND HWY_FLAGS
+      -fmath-errno
+      -fno-exceptions
+    )
+  endif()
+endif()
+
+add_library(hwy STATIC ${HWY_SOURCES})
+target_compile_options(hwy PRIVATE ${HWY_FLAGS})
+set_property(TARGET hwy PROPERTY POSITION_INDEPENDENT_CODE ON)
+target_include_directories(hwy PUBLIC ${CMAKE_CURRENT_LIST_DIR})
+
+# -------------------------------------------------------- install library
+install(TARGETS hwy
+  DESTINATION "${CMAKE_INSTALL_LIBDIR}")
+# Install all the headers keeping the relative path to the current directory
+# when installing them.
+foreach (source ${HWY_SOURCES})
+  if ("${source}" MATCHES "\.h$")
+    get_filename_component(dirname "${source}" DIRECTORY)
+    install(FILES "${source}"
+        DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${dirname}")
+  endif()
+endforeach()
+
+# Add a pkg-config file for libhwy and the test library.
+set(HWY_LIBRARY_VERSION "${CMAKE_PROJECT_VERSION}")
+foreach (pc libhwy.pc libhwy-test.pc)
+  configure_file("${CMAKE_CURRENT_SOURCE_DIR}/${pc}.in" "${pc}" @ONLY)
+  install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${pc}"
+      DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
+endforeach()
+
+# -------------------------------------------------------- hwy_list_targets
+# Generate a tool to print the compiled-in targets as defined by the current
+# flags. This tool will print to stderr at build time, after building hwy.
+add_executable(hwy_list_targets hwy/tests/list_targets.cc)
+target_compile_options(hwy_list_targets PRIVATE ${HWY_FLAGS})
+target_include_directories(hwy_list_targets PRIVATE
+  $<TARGET_PROPERTY:hwy,INCLUDE_DIRECTORIES>)
+# TARGET_FILE always returns the path to executable
+# Naked target also not always could be run (due to the lack of '.\' prefix)
+# Thus effective command to run should contain the full path
+# and emulator prefix (if any).
+add_custom_command(TARGET hwy POST_BUILD
+    COMMAND ${CMAKE_CROSSCOMPILING_EMULATOR} $<TARGET_FILE:hwy_list_targets> || (exit 0))
+
+# -------------------------------------------------------- Examples
+
+# Avoids mismatch between GTest's static CRT and our dynamic.
+set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
+
+# Programming exercise with integrated benchmark
+add_executable(hwy_benchmark hwy/examples/benchmark.cc)
+target_sources(hwy_benchmark PRIVATE
+    hwy/nanobenchmark.cc
+    hwy/nanobenchmark.h)
+# Try adding either -DHWY_COMPILE_ONLY_SCALAR or -DHWY_COMPILE_ONLY_STATIC to
+# observe the difference in targets printed.
+target_compile_options(hwy_benchmark PRIVATE ${HWY_FLAGS})
+target_link_libraries(hwy_benchmark hwy)
+set_target_properties(hwy_benchmark
+    PROPERTIES RUNTIME_OUTPUT_DIRECTORY "examples/")
+
+# -------------------------------------------------------- Tests
+
+include(CTest)
+
+if(BUILD_TESTING)
+enable_testing()
+include(GoogleTest)
+
+set(HWY_SYSTEM_GTEST OFF CACHE BOOL "Use pre-installed googletest?")
+if(HWY_SYSTEM_GTEST)
+find_package(GTest REQUIRED)
+else()
+# Download and unpack googletest at configure time
+configure_file(CMakeLists.txt.in googletest-download/CMakeLists.txt)
+execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
+  RESULT_VARIABLE result
+  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download )
+if(result)
+  message(FATAL_ERROR "CMake step for googletest failed: ${result}")
+endif()
+execute_process(COMMAND ${CMAKE_COMMAND} --build .
+  RESULT_VARIABLE result
+  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download )
+if(result)
+  message(FATAL_ERROR "Build step for googletest failed: ${result}")
+endif()
+
+# Prevent overriding the parent project's compiler/linker
+# settings on Windows
+set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
+
+# Add googletest directly to our build. This defines
+# the gtest and gtest_main targets.
+add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/googletest-src
+                 ${CMAKE_CURRENT_BINARY_DIR}/googletest-build
+                 EXCLUDE_FROM_ALL)
+
+# The gtest/gtest_main targets carry header search path
+# dependencies automatically when using CMake 2.8.11 or
+# later. Otherwise we have to add them here ourselves.
+if (CMAKE_VERSION VERSION_LESS 2.8.11)
+  include_directories("${gtest_SOURCE_DIR}/include")
+endif()
+endif() # HWY_SYSTEM_GTEST
+
+set(HWY_TEST_FILES
+  contrib/image/image_test.cc
+  # contrib/math/math_test.cc
+  hwy/aligned_allocator_test.cc
+  hwy/base_test.cc
+  hwy/highway_test.cc
+  hwy/targets_test.cc
+  hwy/examples/skeleton_test.cc
+  hwy/tests/arithmetic_test.cc
+  hwy/tests/combine_test.cc
+  hwy/tests/compare_test.cc
+  hwy/tests/convert_test.cc
+  hwy/tests/logical_test.cc
+  hwy/tests/memory_test.cc
+  hwy/tests/swizzle_test.cc
+  hwy/tests/test_util_test.cc
+)
+
+file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tests)
+foreach (TESTFILE IN LISTS HWY_TEST_FILES)
+  # The TESTNAME is the name without the extension or directory.
+  get_filename_component(TESTNAME ${TESTFILE} NAME_WE)
+  add_executable(${TESTNAME} ${TESTFILE})
+  target_compile_options(${TESTNAME} PRIVATE ${HWY_FLAGS})
+
+  if(HWY_SYSTEM_GTEST)
+    target_link_libraries(${TESTNAME} hwy GTest::GTest GTest::Main)
+  else()
+    target_link_libraries(${TESTNAME} hwy gtest gtest_main)
+  endif()
+  # Output test targets in the test directory.
+  set_target_properties(${TESTNAME} PROPERTIES PREFIX "tests/")
+
+  if (HWY_EMSCRIPTEN)
+    set_target_properties(${TESTNAME} PROPERTIES LINK_FLAGS "-s SINGLE_FILE=1")
+  endif()
+
+  if(${CMAKE_VERSION} VERSION_LESS "3.10.3")
+    gtest_discover_tests(${TESTNAME} TIMEOUT 60)
+  else ()
+    gtest_discover_tests(${TESTNAME} DISCOVERY_TIMEOUT 60)
+  endif ()
+endforeach ()
+
+# The skeleton test uses the skeleton library code.
+target_sources(skeleton_test PRIVATE hwy/examples/skeleton.cc)
+
+endif() # BUILD_TESTING
--- a/third_party/highway/CMakeLists.txt.in
+++ b/third_party/highway/CMakeLists.txt.in
@ -0,0 +1,15 @@
+cmake_minimum_required(VERSION 2.8.2)
+
+project(googletest-download NONE)
+
+include(ExternalProject)
+ExternalProject_Add(googletest
+  GIT_REPOSITORY    https://github.com/google/googletest.git
+  GIT_TAG           master
+  SOURCE_DIR        "${CMAKE_CURRENT_BINARY_DIR}/googletest-src"
+  BINARY_DIR        "${CMAKE_CURRENT_BINARY_DIR}/googletest-build"
+  CONFIGURE_COMMAND ""
+  BUILD_COMMAND     ""
+  INSTALL_COMMAND   ""
+  TEST_COMMAND      ""
+)
--- a/third_party/highway/CONTRIBUTING
+++ b/third_party/highway/CONTRIBUTING
@ -0,0 +1,33 @@
+# How to Contribute
+
+We'd love to accept your patches and contributions to this project. There are
+just a few small guidelines you need to follow.
+
+## Contributor License Agreement
+
+Contributions to this project must be accompanied by a Contributor License
+Agreement. You (or your employer) retain the copyright to your contribution;
+this simply gives us permission to use and redistribute your contributions as
+part of the project. Head over to <https://cla.developers.google.com/> to see
+your current agreements on file or to sign a new one.
+
+You generally only need to submit a CLA once, so if you've already submitted one
+(even if it was for a different project), you probably don't need to do it
+again.
+
+## Code reviews
+
+All submissions, including submissions by project members, require review. We
+use GitHub pull requests for this purpose. Consult
+[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
+information on using pull requests.
+
+## Testing
+
+This repository is used by JPEG XL, so major API changes will require
+coordination. Please get in touch with us beforehand, e.g. by raising an issue.
+
+## Community Guidelines
+
+This project follows
+[Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
--- a/third_party/highway/LICENSE
+++ b/third_party/highway/LICENSE
@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/third_party/highway/Makefile
+++ b/third_party/highway/Makefile
@ -0,0 +1,44 @@
+.DELETE_ON_ERROR:
+
+OBJS := aligned_allocator.o nanobenchmark.o targets.o
+IMAGE_OBJS := image.o
+TEST_NAMES := arithmetic_test combine_test compare_test convert_test hwy_test logical_test memory_test swizzle_test
+TESTS := $(foreach i, $(TEST_NAMES), bin/$(i))
+ROOT_TEST_NAMES := aligned_allocator_test nanobenchmark_test
+ROOT_TESTS := $(foreach i, $(ROOT_TEST_NAMES), bin/$(i))
+
+CXXFLAGS += -I. -fmerge-all-constants -std=c++17 -O2 \
+    -Wno-builtin-macro-redefined -D__DATE__="redacted" \
+    -D__TIMESTAMP__="redacted" -D__TIME__="redacted"  \
+    -Wall -Wextra -Wformat-security -Wno-unused-function \
+    -Wnon-virtual-dtor -Woverloaded-virtual -Wvla
+
+.PHONY: all
+all: $(TESTS) $(ROOT_TESTS) benchmark test
+
+.PHONY: clean
+clean: ; @rm -rf $(OBJS) bin/ benchmark.o
+
+$(OBJS): %.o: hwy/%.cc
+	$(CXX) -c $(CXXFLAGS) $< -o $@
+
+$(IMAGE_OBJS): %.o: contrib/image/%.cc
+	$(CXX) -c $(CXXFLAGS) $< -o $@
+
+benchmark: $(OBJS) hwy/examples/benchmark.cc
+	mkdir -p bin && $(CXX) $(CXXFLAGS) $^ -o bin/benchmark
+
+bin/%: hwy/%.cc $(OBJS)
+	mkdir -p bin && $(CXX) $(CXXFLAGS) $< $(OBJS) -o $@ -lgtest -lgtest_main -lpthread
+
+bin/%: hwy/tests/%.cc $(OBJS)
+	mkdir -p bin && $(CXX) $(CXXFLAGS) $< $(OBJS) -o $@ -lgtest -lgtest_main -lpthread
+
+bin/%: contrib/image/%.cc $(OBJS)
+	mkdir -p bin && $(CXX) $(CXXFLAGS) $< $(OBJS) -o $@ -lgtest -lgtest_main -lpthread
+bin/%: contrib/math/%.cc $(OBJS)
+	mkdir -p bin && $(CXX) $(CXXFLAGS) $< $(OBJS) -o $@ -lgtest -lgtest_main -lpthread
+
+.PHONY: test
+test: $(TESTS) $(ROOT_TESTS)
+	for name in $^; do echo ---------------------$$name && $$name; done
--- a/third_party/highway/README.md
+++ b/third_party/highway/README.md
@ -0,0 +1,361 @@
+# Efficient and performance-portable SIMD
+
+Highway is a C++ library for SIMD (Single Instruction, Multiple Data), i.e.
+applying the same operation to 'lanes'.
+
+## Why Highway?
+
+- more portable (same source code) than platform-specific intrinsics,
+- works on a wider range of compilers than compiler-specific vector extensions,
+- more dependable than autovectorization,
+- easier to write/maintain than assembly language,
+- supports **runtime dispatch**,
+- supports **variable-length vector** architectures.
+
+## Current status
+
+Supported targets: scalar, SSE4, AVX2, AVX-512, NEON (ARMv7 and v8), WASM SIMD.
+A port to RVV is in progress.
+
+Version 0.11 is considered stable enough to use in other projects, and is
+expected to remain backwards compatible unless serious issues are discovered
+while implementing SVE/RVV targets. After these targets are added, Highway will
+reach version 1.0.
+
+Continuous integration tests build with a recent version of Clang (running on
+x86 and QEMU for ARM) and MSVC from VS2015 (running on x86). Also periodically
+tested on x86 with Clang 7-11 and GCC 8, 9 and 10.2.1.
+
+The `contrib` directory contains SIMD-related utilities: an image class with
+aligned rows, and a math library (16 functions already implemented, mostly
+trigonometry).
+
+## Installation
+
+This project uses cmake to generate and build. In a Debian-based system you can
+install it via:
+
+```bash
+sudo apt install cmake
+```
+
+Highway's unit tests use [googletest](https://github.com/google/googletest).
+By default, Highway's CMake downloads this dependency at configuration time.
+You can disable this by setting the `HWY_SYSTEM_GTEST` CMake variable to ON and
+installing gtest separately:
+
+```bash
+sudo apt install libgtest-dev
+```
+
+To build and test the library the standard cmake workflow can be used:
+
+```bash
+mkdir -p build && cd build
+cmake ..
+make -j && make test
+```
+
+Or you can run `run_tests.sh` (`run_tests.bat` on Windows).
+
+To test on all the attainable targets for your platform, use
+`cmake .. -DCMAKE_CXX_FLAGS="-DHWY_COMPILE_ALL_ATTAINABLE"`. Otherwise, the
+default configuration skips baseline targets (e.g. scalar) that are superseded
+by another baseline target.
+
+## Quick start
+
+You can use the `benchmark` inside examples/ as a starting point.
+
+A [quick-reference page](g3doc/quick_reference.md) briefly lists all operations
+and their parameters, and the [instruction_matrix][instmtx] indicates the
+number of instructions per operation.
+
+We recommend using full SIMD vectors whenever possible for maximum performance
+portability. To obtain them, pass a `HWY_FULL(float)` tag to functions such as
+`Zero/Set/Load`. There is also the option of a vector of up to `N` (a power of
+two) lanes: `HWY_CAPPED(T, N)`. 128-bit vectors are guaranteed to be available
+for lanes of type `T` if `HWY_TARGET != HWY_SCALAR` and `N == 16 / sizeof(T)`.
+
+Functions using Highway must be inside a namespace `namespace HWY_NAMESPACE {`
+(possibly nested in one or more other namespaces defined by the project), and
+additionally either prefixed with `HWY_ATTR`, or residing between
+`HWY_BEFORE_NAMESPACE()` and `HWY_AFTER_NAMESPACE()`.
+
+*   For static dispatch, `HWY_TARGET` will be the best available target among
+    `HWY_BASELINE_TARGETS`, i.e. those allowed for use by the compiler (see
+    [quick-reference](g3doc/quick_reference.md)). Functions inside `HWY_NAMESPACE`
+    can be called using `HWY_STATIC_DISPATCH(func)(args)` within the same module
+    they are defined in. You can call the function from other modules by
+    wrapping it in a regular function and declaring the regular function in a
+    header.
+
+*   For dynamic dispatch, a table of function pointers is generated via the
+    `HWY_EXPORT` macro that is used by `HWY_DYNAMIC_DISPATCH(func)(args)` to
+    call the best function pointer for the current CPU supported targets. A
+    module is automatically compiled for each target in `HWY_TARGETS` (see
+    [quick-reference](g3doc/quick_reference.md)) if `HWY_TARGET_INCLUDE` is
+    defined and foreach_target.h is included.
+
+## Strip-mining loops
+
+To vectorize a loop, "strip-mining" transforms it into an outer loop and inner
+loop with number of iterations matching the preferred vector width.
+
+In this section, let `T` denote the element type, `d = HWY_FULL(T)`, `count` the
+number of elements to process, and `N = Lanes(d)` the number of lanes in a full
+vector. Assume the loop body is given as a function `template<bool partial,
+class D> void LoopBody(D d, size_t max_n)`.
+
+Highway offers several ways to express loops where `N` need not divide `count`:
+
+*   Ensure all inputs/outputs are padded. Then the loop is simply
+
+    ```
+    for (size_t i = 0; i < count; i += N) LoopBody<false>(d, 0);
+    ```
+    Here, the template parameter and second function argument are not needed.
+
+    This is the preferred option, unless `N` is in the thousands and vector
+    operations are pipelined with long latencies. This was the case for
+    supercomputers in the 90s, but nowadays ALUs are cheap and we see most
+    implementations split vectors into 1, 2 or 4 parts, so there is little cost
+    to processing entire vectors even if we do not need all their lanes. Indeed
+    this avoids the (potentially large) cost of predication or partial
+    loads/stores on older targets, and does not duplicate code.
+
+*   Process whole vectors as above, followed by a scalar loop:
+
+    ```
+    size_t i = 0;
+    for (; i + N <= count; i += N) LoopBody<false>(d, 0);
+    for (; i < count; ++i) LoopBody<false>(HWY_CAPPED(T, 1)(), 0);
+    ```
+    The template parameter and second function arguments are again not needed.
+
+    This avoids duplicating code, and is reasonable if `count` is large.
+    Otherwise, multiple iterations may be slower than one `LoopBody` variant
+    with masking, especially because the `HWY_SCALAR` target selected by
+    `HWY_CAPPED(T, 1)` is slower for some operations due to workarounds for
+    undefined behavior in C++.
+
+*   Process whole vectors as above, followed by a single call to a modified
+    `LoopBody` with masking:
+
+    ```
+    size_t i = 0;
+    for (; i + N <= count; i += N) {
+      LoopBody<false>(d, 0);
+    }
+    if (i < count) {
+      LoopBody<true>(d, count - i);
+    }
+    ```
+    Now the template parameter and second function argument can be used inside
+    `LoopBody` to replace `Load/Store` of full aligned vectors with
+    `LoadN/StoreN(n)` that affect no more than `1 <= n <= N` aligned elements
+    (pending implementation).
+
+    This is a good default when it is infeasible to ensure vectors are padded.
+    In contrast to the scalar loop, only a single final iteration is needed.
+
+## Design philosophy
+
+*   Performance is important but not the sole consideration. Anyone who goes to
+    the trouble of using SIMD clearly cares about speed. However, portability,
+    maintainability and readability also matter, otherwise we would write in
+    assembly. We aim for performance within 10-20% of a hand-written assembly
+    implementation on the development platform.
+
+*   The guiding principles of C++ are "pay only for what you use" and "leave no
+    room for a lower-level language below C++". We apply these by defining a
+    SIMD API that ensures operation costs are visible, predictable and minimal.
+
+*   Performance portability is important, i.e. the API should be efficient on
+    all target platforms. Unfortunately, common idioms for one platform can be
+    inefficient on others. For example: summing lanes horizontally versus
+    shuffling. Documenting which operations are expensive does not prevent their
+    use, as evidenced by widespread use of `HADDPS`. Performance acceptance
+    tests may detect large regressions, but do not help choose the approach
+    during initial development. Analysis tools can warn about some potential
+    inefficiencies, but likely not all. We instead provide [a carefully chosen
+    set of vector types and operations that are efficient on all target
+    platforms][instmtx] (PPC8, SSE4/AVX2+, ARMv8).
+
+*   Future SIMD hardware features are difficult to predict. For example, AVX2
+    came with surprising semantics (almost no interaction between 128-bit
+    blocks) and AVX-512 added two kinds of predicates (writemask and zeromask).
+    To ensure the API reflects hardware realities, we suggest a flexible
+    approach that adds new operations as they become commonly available, with
+    scalar fallbacks where not supported.
+
+*   Masking is not yet widely supported on current CPUs. It is difficult to
+    define an interface that provides access to all platform features while
+    retaining performance portability. The P0214R5 proposal lacks support for
+    AVX-512/ARM SVE zeromasks. We suggest limiting usage of masks to the
+    `IfThen[Zero]Else[Zero]` functions until the community has gained more
+    experience with them.
+
+*   "Width-agnostic" SIMD is more future-proof than user-specified fixed sizes.
+    For example, valarray-like code can iterate over a 1D array with a
+    library-specified vector width. This will result in better code when vector
+    sizes increase, and matches the direction taken by
+    [ARM SVE](https://alastairreid.github.io/papers/sve-ieee-micro-2017.pdf) and
+    RiscV V as well as Agner Fog's
+    [ForwardCom instruction set proposal](https://goo.gl/CFizWu). However, some
+    applications may require fixed sizes, so we also guarantee support for
+    128-bit vectors in each instruction set.
+
+*   The API and its implementation should be usable and efficient with commonly
+    used compilers, including MSVC. For example, we write `ShiftLeft<3>(v)`
+    instead of `v << 3` because MSVC 2017 (ARM64) does not propagate the literal
+    (https://godbolt.org/g/rKx5Ga). Highway requires function-specific
+    target attributes, supported by GCC 4.9 / Clang 3.9 / MSVC 2015.
+
+*   Efficient and safe runtime dispatch is important. Modules such as image or
+    video codecs are typically embedded into larger applications such as
+    browsers, so they cannot require separate binaries for each CPU. Libraries
+    also cannot predict whether the application already uses AVX2 (and pays the
+    frequency throttling cost), so this decision must be left to the
+    application. Using only the lowest-common denominator instructions
+    sacrifices too much performance.
+    Therefore, we provide code paths for multiple instruction sets and choose
+    the most suitable at runtime. To reduce overhead, dispatch should be hoisted
+    to higher layers instead of checking inside every low-level function.
+    Highway supports inlining functions in the same file or in *-inl.h headers.
+    We generate all code paths from the same source to reduce implementation-
+    and debugging cost.
+
+*   Not every CPU need be supported. For example, pre-SSE4.1 CPUs are
+    increasingly rare and the AVX instruction set is limited to floating-point
+    operations. To reduce code size and compile time, we provide specializations
+    for SSE4, AVX2 and AVX-512 instruction sets on x86, plus a scalar fallback.
+
+*   Access to platform-specific intrinsics is necessary for acceptance in
+    performance-critical projects. We provide conversions to and from intrinsics
+    to allow utilizing specialized platform-specific functionality, and simplify
+    incremental porting of existing code.
+
+*   The core API should be compact and easy to learn. We provide only the few
+    dozen operations which are necessary and sufficient for most of the 150+
+    SIMD applications we examined.
+
+## Prior API designs
+
+The author has been writing SIMD code since 2002: first via assembly language,
+then intrinsics, later Intel's `F32vec4` wrapper, followed by three generations
+of custom vector classes. The first used macros to generate the classes, which
+reduces duplication but also readability. The second used templates instead.
+The third (used in highwayhash and PIK) added support for AVX2 and runtime
+dispatch. The current design (used in JPEG XL) enables code generation for
+multiple platforms and/or instruction sets from the same source, and improves
+runtime dispatch.
+
+## Differences versus [P0214R5 proposal](https://goo.gl/zKW4SA)
+
+1.  Adding widely used and portable operations such as `AndNot`, `AverageRound`,
+    bit-shift by immediates and `IfThenElse`.
+
+1.  Designing the API to avoid or minimize overhead on AVX2/AVX-512 caused by
+    crossing 128-bit 'block' boundaries.
+
+1.  Avoiding the need for non-native vectors. By contrast, P0214R5's `simd_cast`
+    returns `fixed_size<>` vectors which are more expensive to access because
+    they reside on the stack. We can avoid this plus additional overhead on
+    ARM/AVX2 by defining width-expanding operations as functions of a vector
+    part, e.g. promoting half a vector of `uint8_t` lanes to one full vector of
+    `uint16_t`, or demoting full vectors to half vectors with half-width lanes.
+
+1.  Guaranteeing access to the underlying intrinsic vector type. This ensures
+    all platform-specific capabilities can be used. P0214R5 instead only
+    'encourages' implementations to provide access.
+
+1.  Enabling safe runtime dispatch and inlining in the same binary. P0214R5 is
+    based on the Vc library, which does not provide assistance for linking
+    multiple instruction sets into the same binary. The Vc documentation
+    suggests compiling separate executables for each instruction set or using
+    GCC's ifunc (indirect functions). The latter is compiler-specific and risks
+    crashes due to ODR violations when compiling the same function with
+    different compiler flags. We solve this problem via target-specific
+    namespaces and attributes (see HOWTO section below). We also permit a mix of
+    static target selection and runtime dispatch for hotspots that may benefit
+    from newer instruction sets if available.
+
+1.  Using built-in PPC vector types without a wrapper class. This leads to much
+    better code generation with GCC 6.3: https://godbolt.org/z/pd2PNP.
+    By contrast, P0214R5 requires a wrapper. We avoid this by using only the
+    member operators provided by the PPC vectors; all other functions and
+    typedefs are non-members. 2019-04 update: Clang power64le does not have
+    this issue, so we simplified get_part(d, v) to GetLane(v).
+
+1.  Omitting inefficient or non-performance-portable operations such as `hmax`,
+    `operator[]`, and unsupported integer comparisons. Applications can often
+    replace these operations at lower cost than emulating that exact behavior.
+
+1.  Omitting `long double` types: these are not commonly available in hardware.
+
+1.  Ensuring signed integer overflow has well-defined semantics (wraparound).
+
+1.  Simple header-only implementation and less than a tenth of the size of the
+    Vc library from which P0214 was derived (98,000 lines in
+    https://github.com/VcDevel/Vc according to the gloc Chrome extension).
+
+1.  Avoiding hidden performance costs. P0214R5 allows implicit conversions from
+    integer to float, which costs 3-4 cycles on x86. We make these conversions
+    explicit to ensure their cost is visible.
+
+## Other related work
+
+*   [Neat SIMD](http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=7568423)
+    adopts a similar approach with interchangeable vector/scalar types and
+    a compact interface. It allows access to the underlying intrinsics, but
+    does not appear to be designed for other platforms than x86.
+
+*   UME::SIMD ([code](https://goo.gl/yPeVZx), [paper](https://goo.gl/2xpZrk))
+    also adopts an explicit vectorization model with vector classes.
+    However, it exposes the union of all platform capabilities, which makes the
+    API harder to learn (209-page spec) and implement (the estimated LOC count
+    is [500K](https://goo.gl/1THFRi)). The API is less performance-portable
+    because it allows applications to use operations that are inefficient on
+    other platforms.
+
+*   Inastemp ([code](https://goo.gl/hg3USM), [paper](https://goo.gl/YcTU7S))
+    is a vector library for scientific computing with some innovative features:
+    automatic FLOPS counting, and "if/else branches" using lambda functions.
+    It supports IBM Power8, but only provides float and double types.
+
+## Overloaded function API
+
+Most C++ vector APIs rely on class templates. However, the ARM SVE vector
+type is sizeless and cannot be wrapped in a class. We instead rely on overloaded
+functions. Overloading based on vector types is also undesirable because SVE
+vectors cannot be default-constructed. We instead use a dedicated 'descriptor'
+type `Simd` for overloading, abbreviated to `D` for template arguments and
+`d` in lvalues.
+
+Note that generic function templates are possible (see highway.h).
+
+## Masks
+
+AVX-512 introduced a major change to the SIMD interface: special mask registers
+(one bit per lane) that serve as predicates. It would be expensive to force
+AVX-512 implementations to conform to the prior model of full vectors with lanes
+set to all one or all zero bits. We instead provide a Mask type that emulates
+a subset of this functionality on other platforms at zero cost.
+
+Masks are returned by comparisons and `TestBit`; they serve as the input to
+`IfThen*`. We provide conversions between masks and vector lanes. For clarity
+and safety, we use FF..FF as the definition of true. To also benefit from
+x86 instructions that only require the sign bit of floating-point inputs to be
+set, we provide a special `ZeroIfNegative` function.
+
+## Additional resources
+
+*   [Highway introduction (slides)][intro]
+*   [Overview of instructions per operation on different architectures][instmtx]
+
+[intro]: g3doc/highway_intro.pdf
+[instmtx]: g3doc/instruction_matrix.pdf
+
+This is not an officially supported Google product.
+Contact: janwas@google.com
--- a/third_party/highway/contrib/image/image.cc
+++ b/third_party/highway/contrib/image/image.cc
@ -0,0 +1,145 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "contrib/image/image.h"
+
+#include <cstddef>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "contrib/image/image.cc"
+
+#include <algorithm>  // swap
+
+#include "hwy/foreach_target.h"
+#include "hwy/highway.h"
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+size_t GetVectorSize() { return Lanes(HWY_FULL(uint8_t)()); }
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+namespace {
+HWY_EXPORT(GetVectorSize);  // Local function.
+}  // namespace
+
+size_t ImageBase::VectorSize() {
+  // Do not cache result - must return the current value, which may be greater
+  // than the first call if it was subject to DisableTargets!
+  return HWY_DYNAMIC_DISPATCH(GetVectorSize)();
+}
+
+size_t ImageBase::BytesPerRow(const size_t xsize, const size_t sizeof_t) {
+  const size_t vec_size = VectorSize();
+  size_t valid_bytes = xsize * sizeof_t;
+
+  // Allow unaligned accesses starting at the last valid value - this may raise
+  // msan errors unless the user calls InitializePaddingForUnalignedAccesses.
+  // Skip for the scalar case because no extra lanes will be loaded.
+  if (vec_size != 1) {
+    HWY_DASSERT(vec_size >= sizeof_t);
+    valid_bytes += vec_size - sizeof_t;
+  }
+
+  // Round up to vector and cache line size.
+  const size_t align = std::max<size_t>(vec_size, HWY_ALIGNMENT);
+  size_t bytes_per_row = RoundUpTo(valid_bytes, align);
+
+  // During the lengthy window before writes are committed to memory, CPUs
+  // guard against read after write hazards by checking the address, but
+  // only the lower 11 bits. We avoid a false dependency between writes to
+  // consecutive rows by ensuring their sizes are not multiples of 2 KiB.
+  // Avoid2K prevents the same problem for the planes of an Image3.
+  if (bytes_per_row % HWY_ALIGNMENT == 0) {
+    bytes_per_row += align;
+  }
+
+  HWY_DASSERT(bytes_per_row % align == 0);
+  return bytes_per_row;
+}
+
+ImageBase::ImageBase(const size_t xsize, const size_t ysize,
+                     const size_t sizeof_t)
+    : xsize_(static_cast<uint32_t>(xsize)),
+      ysize_(static_cast<uint32_t>(ysize)),
+      bytes_(nullptr, AlignedFreer(&AlignedFreer::DoNothing, nullptr)) {
+  HWY_ASSERT(sizeof_t == 1 || sizeof_t == 2 || sizeof_t == 4 || sizeof_t == 8);
+
+  bytes_per_row_ = 0;
+  // Dimensions can be zero, e.g. for lazily-allocated images. Only allocate
+  // if nonzero, because "zero" bytes still have padding/bookkeeping overhead.
+  if (xsize != 0 && ysize != 0) {
+    bytes_per_row_ = BytesPerRow(xsize, sizeof_t);
+    bytes_ = AllocateAligned<uint8_t>(bytes_per_row_ * ysize);
+    HWY_ASSERT(bytes_.get() != nullptr);
+    InitializePadding(sizeof_t, Padding::kRoundUp);
+  }
+}
+
+ImageBase::ImageBase(const size_t xsize, const size_t ysize,
+                     const size_t bytes_per_row, void* const aligned)
+    : xsize_(static_cast<uint32_t>(xsize)),
+      ysize_(static_cast<uint32_t>(ysize)),
+      bytes_per_row_(bytes_per_row),
+      bytes_(static_cast<uint8_t*>(aligned),
+             AlignedFreer(&AlignedFreer::DoNothing, nullptr)) {
+  const size_t vec_size = VectorSize();
+  HWY_ASSERT(bytes_per_row % vec_size == 0);
+  HWY_ASSERT(reinterpret_cast<uintptr_t>(aligned) % vec_size == 0);
+}
+
+void ImageBase::InitializePadding(const size_t sizeof_t, Padding padding) {
+#if defined(MEMORY_SANITIZER) || HWY_IDE
+  if (xsize_ == 0 || ysize_ == 0) return;
+
+  const size_t vec_size = VectorSize();  // Bytes, independent of sizeof_t!
+  if (vec_size == 1) return;             // Scalar mode: no padding needed
+
+  const size_t valid_size = xsize_ * sizeof_t;
+  const size_t initialize_size = padding == Padding::kRoundUp
+                                     ? RoundUpTo(valid_size, vec_size)
+                                     : valid_size + vec_size - sizeof_t;
+  if (valid_size == initialize_size) return;
+
+  for (size_t y = 0; y < ysize_; ++y) {
+    uint8_t* HWY_RESTRICT row = static_cast<uint8_t*>(VoidRow(y));
+#if defined(__clang__) && (__clang_major__ <= 6)
+    // There's a bug in msan in clang-6 when handling AVX2 operations. This
+    // workaround allows tests to pass on msan, although it is slower and
+    // prevents msan warnings from uninitialized images.
+    memset(row, 0, initialize_size);
+#else
+    memset(row + valid_size, 0, initialize_size - valid_size);
+#endif  // clang6
+  }
+#else
+  (void)sizeof_t;
+  (void)padding;
+#endif  // MEMORY_SANITIZER
+}
+
+void ImageBase::Swap(ImageBase& other) {
+  std::swap(xsize_, other.xsize_);
+  std::swap(ysize_, other.ysize_);
+  std::swap(bytes_per_row_, other.bytes_per_row_);
+  std::swap(bytes_, other.bytes_);
+}
+
+}  // namespace hwy
+#endif  // HWY_ONCE
--- a/third_party/highway/contrib/image/image.h
+++ b/third_party/highway/contrib/image/image.h
@ -0,0 +1,468 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAY_CONTRIB_IMAGE_IMAGE_H_
+#define HIGHWAY_CONTRIB_IMAGE_IMAGE_H_
+
+// SIMD/multicore-friendly planar image representation with row accessors.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <cstddef>
+#include <utility>  // std::move
+
+#include "hwy/aligned_allocator.h"
+#include "hwy/base.h"
+
+namespace hwy {
+
+// Type-independent parts of Image<> - reduces code duplication and facilitates
+// moving member function implementations to cc file.
+struct ImageBase {
+  // Returns required alignment in bytes for externally allocated memory.
+  static size_t VectorSize();
+
+  // Returns distance [bytes] between the start of two consecutive rows, a
+  // multiple of VectorSize but NOT kAlias (see implementation).
+  static size_t BytesPerRow(const size_t xsize, const size_t sizeof_t);
+
+  // No allocation (for output params or unused images)
+  ImageBase()
+      : xsize_(0),
+        ysize_(0),
+        bytes_per_row_(0),
+        bytes_(nullptr, AlignedFreer(&AlignedFreer::DoNothing, nullptr)) {}
+
+  // Allocates memory (this is the common case)
+  ImageBase(size_t xsize, size_t ysize, size_t sizeof_t);
+
+  // References but does not take ownership of external memory. Useful for
+  // interoperability with other libraries. `aligned` must be aligned to a
+  // multiple of VectorSize() and `bytes_per_row` must also be a multiple of
+  // VectorSize() or preferably equal to BytesPerRow().
+  ImageBase(size_t xsize, size_t ysize, size_t bytes_per_row, void* aligned);
+
+  // Copy construction/assignment is forbidden to avoid inadvertent copies,
+  // which can be very expensive. Use CopyImageTo() instead.
+  ImageBase(const ImageBase& other) = delete;
+  ImageBase& operator=(const ImageBase& other) = delete;
+
+  // Move constructor (required for returning Image from function)
+  ImageBase(ImageBase&& other) noexcept = default;
+
+  // Move assignment (required for std::vector)
+  ImageBase& operator=(ImageBase&& other) noexcept = default;
+
+  void Swap(ImageBase& other);
+
+  // Useful for pre-allocating image with some padding for alignment purposes
+  // and later reporting the actual valid dimensions. Caller is responsible
+  // for ensuring xsize/ysize are <= the original dimensions.
+  void ShrinkTo(const size_t xsize, const size_t ysize) {
+    xsize_ = static_cast<uint32_t>(xsize);
+    ysize_ = static_cast<uint32_t>(ysize);
+    // NOTE: we can't recompute bytes_per_row for more compact storage and
+    // better locality because that would invalidate the image contents.
+  }
+
+  // How many pixels.
+  HWY_INLINE size_t xsize() const { return xsize_; }
+  HWY_INLINE size_t ysize() const { return ysize_; }
+
+  // NOTE: do not use this for copying rows - the valid xsize may be much less.
+  HWY_INLINE size_t bytes_per_row() const { return bytes_per_row_; }
+
+  // Raw access to byte contents, for interfacing with other libraries.
+  // Unsigned char instead of char to avoid surprises (sign extension).
+  HWY_INLINE uint8_t* bytes() {
+    void* p = bytes_.get();
+    return static_cast<uint8_t * HWY_RESTRICT>(HWY_ASSUME_ALIGNED(p, 64));
+  }
+  HWY_INLINE const uint8_t* bytes() const {
+    const void* p = bytes_.get();
+    return static_cast<const uint8_t * HWY_RESTRICT>(HWY_ASSUME_ALIGNED(p, 64));
+  }
+
+ protected:
+  // Returns pointer to the start of a row.
+  HWY_INLINE void* VoidRow(const size_t y) const {
+#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
+    defined(THREAD_SANITIZER)
+    if (y >= ysize_) {
+      HWY_ABORT("Row(%zu) >= %u\n", y, ysize_);
+    }
+#endif
+
+    void* row = bytes_.get() + y * bytes_per_row_;
+    return HWY_ASSUME_ALIGNED(row, 64);
+  }
+
+  enum class Padding {
+    // Allow Load(d, row + x) for x = 0; x < xsize(); x += Lanes(d). Default.
+    kRoundUp,
+    // Allow LoadU(d, row + x) for x <= xsize() - 1. This requires an extra
+    // vector to be initialized. If done by default, this would suppress
+    // legitimate msan warnings. We therefore require users to explicitly call
+    // InitializePadding before using unaligned loads (e.g. convolution).
+    kUnaligned
+  };
+
+  // Initializes the minimum bytes required to suppress msan warnings from
+  // legitimate (according to Padding mode) vector loads/stores on the right
+  // border, where some lanes are uninitialized and assumed to be unused.
+  void InitializePadding(size_t sizeof_t, Padding padding);
+
+  // (Members are non-const to enable assignment during move-assignment.)
+  uint32_t xsize_;  // In valid pixels, not including any padding.
+  uint32_t ysize_;
+  size_t bytes_per_row_;  // Includes padding.
+  AlignedFreeUniquePtr<uint8_t[]> bytes_;
+};
+
+// Single channel, aligned rows separated by padding. T must be POD.
+//
+// 'Single channel' (one 2D array per channel) simplifies vectorization
+// (repeating the same operation on multiple adjacent components) without the
+// complexity of a hybrid layout (8 R, 8 G, 8 B, ...). In particular, clients
+// can easily iterate over all components in a row and Image requires no
+// knowledge of the pixel format beyond the component type "T".
+//
+// 'Aligned' means each row is aligned to the L1 cache line size. This prevents
+// false sharing between two threads operating on adjacent rows.
+//
+// 'Padding' is still relevant because vectors could potentially be larger than
+// a cache line. By rounding up row sizes to the vector size, we allow
+// reading/writing ALIGNED vectors whose first lane is a valid sample. This
+// avoids needing a separate loop to handle remaining unaligned lanes.
+//
+// This image layout could also be achieved with a vector and a row accessor
+// function, but a class wrapper with support for "deleter" allows wrapping
+// existing memory allocated by clients without copying the pixels. It also
+// provides convenient accessors for xsize/ysize, which shortens function
+// argument lists. Supports move-construction so it can be stored in containers.
+template <typename ComponentType>
+class Image : public ImageBase {
+ public:
+  using T = ComponentType;
+
+  Image() = default;
+  Image(const size_t xsize, const size_t ysize)
+      : ImageBase(xsize, ysize, sizeof(T)) {}
+  Image(const size_t xsize, const size_t ysize, size_t bytes_per_row,
+        void* aligned)
+      : ImageBase(xsize, ysize, bytes_per_row, aligned) {}
+
+  void InitializePaddingForUnalignedAccesses() {
+    InitializePadding(sizeof(T), Padding::kUnaligned);
+  }
+
+  HWY_INLINE const T* ConstRow(const size_t y) const {
+    return static_cast<const T*>(VoidRow(y));
+  }
+  HWY_INLINE const T* ConstRow(const size_t y) {
+    return static_cast<const T*>(VoidRow(y));
+  }
+
+  // Returns pointer to non-const. This allows passing const Image* parameters
+  // when the callee is only supposed to fill the pixels, as opposed to
+  // allocating or resizing the image.
+  HWY_INLINE T* MutableRow(const size_t y) const {
+    return static_cast<T*>(VoidRow(y));
+  }
+  HWY_INLINE T* MutableRow(const size_t y) {
+    return static_cast<T*>(VoidRow(y));
+  }
+
+  // Returns number of pixels (some of which are padding) per row. Useful for
+  // computing other rows via pointer arithmetic. WARNING: this must
+  // NOT be used to determine xsize.
+  HWY_INLINE intptr_t PixelsPerRow() const {
+    return static_cast<intptr_t>(bytes_per_row_ / sizeof(T));
+  }
+};
+
+using ImageF = Image<float>;
+
+// A bundle of 3 same-sized images. To fill an existing Image3 using
+// single-channel producers, we also need access to each const Image*. Const
+// prevents breaking the same-size invariant, while still allowing pixels to be
+// changed via MutableRow.
+template <typename ComponentType>
+class Image3 {
+ public:
+  using T = ComponentType;
+  using ImageT = Image<T>;
+  static constexpr size_t kNumPlanes = 3;
+
+  Image3() : planes_{ImageT(), ImageT(), ImageT()} {}
+
+  Image3(const size_t xsize, const size_t ysize)
+      : planes_{ImageT(xsize, ysize), ImageT(xsize, ysize),
+                ImageT(xsize, ysize)} {}
+
+  Image3(Image3&& other) noexcept {
+    for (size_t i = 0; i < kNumPlanes; i++) {
+      planes_[i] = std::move(other.planes_[i]);
+    }
+  }
+
+  Image3(ImageT&& plane0, ImageT&& plane1, ImageT&& plane2) {
+    if (!SameSize(plane0, plane1) || !SameSize(plane0, plane2)) {
+      HWY_ABORT("Not same size: %zu x %zu, %zu x %zu, %zu x %zu\n",
+                plane0.xsize(), plane0.ysize(), plane1.xsize(), plane1.ysize(),
+                plane2.xsize(), plane2.ysize());
+    }
+    planes_[0] = std::move(plane0);
+    planes_[1] = std::move(plane1);
+    planes_[2] = std::move(plane2);
+  }
+
+  // Copy construction/assignment is forbidden to avoid inadvertent copies,
+  // which can be very expensive. Use CopyImageTo instead.
+  Image3(const Image3& other) = delete;
+  Image3& operator=(const Image3& other) = delete;
+
+  Image3& operator=(Image3&& other) noexcept {
+    for (size_t i = 0; i < kNumPlanes; i++) {
+      planes_[i] = std::move(other.planes_[i]);
+    }
+    return *this;
+  }
+
+  HWY_INLINE const T* ConstPlaneRow(const size_t c, const size_t y) const {
+    return static_cast<const T*>(VoidPlaneRow(c, y));
+  }
+  HWY_INLINE const T* ConstPlaneRow(const size_t c, const size_t y) {
+    return static_cast<const T*>(VoidPlaneRow(c, y));
+  }
+
+  HWY_INLINE T* MutablePlaneRow(const size_t c, const size_t y) const {
+    return static_cast<T*>(VoidPlaneRow(c, y));
+  }
+  HWY_INLINE T* MutablePlaneRow(const size_t c, const size_t y) {
+    return static_cast<T*>(VoidPlaneRow(c, y));
+  }
+
+  HWY_INLINE const ImageT& Plane(size_t idx) const { return planes_[idx]; }
+
+  void Swap(Image3& other) {
+    for (size_t c = 0; c < 3; ++c) {
+      other.planes_[c].Swap(planes_[c]);
+    }
+  }
+
+  void ShrinkTo(const size_t xsize, const size_t ysize) {
+    for (ImageT& plane : planes_) {
+      plane.ShrinkTo(xsize, ysize);
+    }
+  }
+
+  // Sizes of all three images are guaranteed to be equal.
+  HWY_INLINE size_t xsize() const { return planes_[0].xsize(); }
+  HWY_INLINE size_t ysize() const { return planes_[0].ysize(); }
+  // Returns offset [bytes] from one row to the next row of the same plane.
+  // WARNING: this must NOT be used to determine xsize, nor for copying rows -
+  // the valid xsize may be much less.
+  HWY_INLINE size_t bytes_per_row() const { return planes_[0].bytes_per_row(); }
+  // Returns number of pixels (some of which are padding) per row. Useful for
+  // computing other rows via pointer arithmetic. WARNING: this must NOT be used
+  // to determine xsize.
+  HWY_INLINE intptr_t PixelsPerRow() const { return planes_[0].PixelsPerRow(); }
+
+ private:
+  // Returns pointer to the start of a row.
+  HWY_INLINE void* VoidPlaneRow(const size_t c, const size_t y) const {
+#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
+    defined(THREAD_SANITIZER)
+    if (c >= kNumPlanes || y >= ysize()) {
+      HWY_ABORT("PlaneRow(%zu, %zu) >= %zu\n", c, y, ysize());
+    }
+#endif
+    // Use the first plane's stride because the compiler might not realize they
+    // are all equal. Thus we only need a single multiplication for all planes.
+    const size_t row_offset = y * planes_[0].bytes_per_row();
+    const void* row = planes_[c].bytes() + row_offset;
+    return static_cast<const T * HWY_RESTRICT>(
+        HWY_ASSUME_ALIGNED(row, HWY_ALIGNMENT));
+  }
+
+ private:
+  ImageT planes_[kNumPlanes];
+};
+
+using Image3F = Image3<float>;
+
+// Rectangular region in image(s). Factoring this out of Image instead of
+// shifting the pointer by x0/y0 allows this to apply to multiple images with
+// different resolutions. Can compare size via SameSize(rect1, rect2).
+class Rect {
+ public:
+  // Most windows are xsize_max * ysize_max, except those on the borders where
+  // begin + size_max > end.
+  constexpr Rect(size_t xbegin, size_t ybegin, size_t xsize_max,
+                 size_t ysize_max, size_t xend, size_t yend)
+      : x0_(xbegin),
+        y0_(ybegin),
+        xsize_(ClampedSize(xbegin, xsize_max, xend)),
+        ysize_(ClampedSize(ybegin, ysize_max, yend)) {}
+
+  // Construct with origin and known size (typically from another Rect).
+  constexpr Rect(size_t xbegin, size_t ybegin, size_t xsize, size_t ysize)
+      : x0_(xbegin), y0_(ybegin), xsize_(xsize), ysize_(ysize) {}
+
+  // Construct a rect that covers a whole image.
+  template <typename Image>
+  explicit Rect(const Image& image)
+      : Rect(0, 0, image.xsize(), image.ysize()) {}
+
+  Rect() : Rect(0, 0, 0, 0) {}
+
+  Rect(const Rect&) = default;
+  Rect& operator=(const Rect&) = default;
+
+  Rect Subrect(size_t xbegin, size_t ybegin, size_t xsize_max,
+               size_t ysize_max) {
+    return Rect(x0_ + xbegin, y0_ + ybegin, xsize_max, ysize_max, x0_ + xsize_,
+                y0_ + ysize_);
+  }
+
+  template <typename T>
+  const T* ConstRow(const Image<T>* image, size_t y) const {
+    return image->ConstRow(y + y0_) + x0_;
+  }
+
+  template <typename T>
+  T* MutableRow(const Image<T>* image, size_t y) const {
+    return image->MutableRow(y + y0_) + x0_;
+  }
+
+  template <typename T>
+  const T* ConstPlaneRow(const Image3<T>& image, size_t c, size_t y) const {
+    return image.ConstPlaneRow(c, y + y0_) + x0_;
+  }
+
+  template <typename T>
+  T* MutablePlaneRow(Image3<T>* image, const size_t c, size_t y) const {
+    return image->MutablePlaneRow(c, y + y0_) + x0_;
+  }
+
+  // Returns true if this Rect fully resides in the given image. ImageT could be
+  // Image<T> or Image3<T>; however if ImageT is Rect, results are nonsensical.
+  template <class ImageT>
+  bool IsInside(const ImageT& image) const {
+    return (x0_ + xsize_ <= image.xsize()) && (y0_ + ysize_ <= image.ysize());
+  }
+
+  size_t x0() const { return x0_; }
+  size_t y0() const { return y0_; }
+  size_t xsize() const { return xsize_; }
+  size_t ysize() const { return ysize_; }
+
+ private:
+  // Returns size_max, or whatever is left in [begin, end).
+  static constexpr size_t ClampedSize(size_t begin, size_t size_max,
+                                      size_t end) {
+    return (begin + size_max <= end) ? size_max
+                                     : (end > begin ? end - begin : 0);
+  }
+
+  size_t x0_;
+  size_t y0_;
+
+  size_t xsize_;
+  size_t ysize_;
+};
+
+// Works for any image-like input type(s).
+template <class Image1, class Image2>
+HWY_MAYBE_UNUSED bool SameSize(const Image1& image1, const Image2& image2) {
+  return image1.xsize() == image2.xsize() && image1.ysize() == image2.ysize();
+}
+
+// Mirrors out of bounds coordinates and returns valid coordinates unchanged.
+// We assume the radius (distance outside the image) is small compared to the
+// image size, otherwise this might not terminate.
+// The mirror is outside the last column (border pixel is also replicated).
+static HWY_INLINE HWY_MAYBE_UNUSED size_t Mirror(int64_t x,
+                                                 const int64_t xsize) {
+  HWY_DASSERT(xsize != 0);
+
+  // TODO(janwas): replace with branchless version
+  while (x < 0 || x >= xsize) {
+    if (x < 0) {
+      x = -x - 1;
+    } else {
+      x = 2 * xsize - 1 - x;
+    }
+  }
+  return static_cast<size_t>(x);
+}
+
+// Wrap modes for ensuring X/Y coordinates are in the valid range [0, size):
+
+// Mirrors (repeating the edge pixel once). Useful for convolutions.
+struct WrapMirror {
+  HWY_INLINE size_t operator()(const int64_t coord, const size_t size) const {
+    return Mirror(coord, static_cast<int64_t>(size));
+  }
+};
+
+// Returns the same coordinate, for when we know "coord" is already valid (e.g.
+// interior of an image).
+struct WrapUnchanged {
+  HWY_INLINE size_t operator()(const int64_t coord, size_t /*size*/) const {
+    return static_cast<size_t>(coord);
+  }
+};
+
+// Similar to Wrap* but for row pointers (reduces Row() multiplications).
+
+class WrapRowMirror {
+ public:
+  template <class View>
+  WrapRowMirror(const View& image, size_t ysize)
+      : first_row_(image.ConstRow(0)), last_row_(image.ConstRow(ysize - 1)) {}
+
+  const float* operator()(const float* const HWY_RESTRICT row,
+                          const int64_t stride) const {
+    if (row < first_row_) {
+      const int64_t num_before = first_row_ - row;
+      // Mirrored; one row before => row 0, two before = row 1, ...
+      return first_row_ + num_before - stride;
+    }
+    if (row > last_row_) {
+      const int64_t num_after = row - last_row_;
+      // Mirrored; one row after => last row, two after = last - 1, ...
+      return last_row_ - num_after + stride;
+    }
+    return row;
+  }
+
+ private:
+  const float* const HWY_RESTRICT first_row_;
+  const float* const HWY_RESTRICT last_row_;
+};
+
+struct WrapRowUnchanged {
+  HWY_INLINE const float* operator()(const float* const HWY_RESTRICT row,
+                                     int64_t /*stride*/) const {
+    return row;
+  }
+};
+
+}  // namespace hwy
+
+#endif  // HIGHWAY_CONTRIB_IMAGE_IMAGE_H_
--- a/third_party/highway/contrib/image/image_test.cc
+++ b/third_party/highway/contrib/image/image_test.cc
@ -0,0 +1,151 @@
+// Copyright (c) the JPEG XL Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "contrib/image/image.h"
+
+#include <cstddef>
+
+#include "hwy/base.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "contrib/image/image_test.cc"
+#include "hwy/foreach_target.h"
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <random>
+#include <utility>
+
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// Ensure we can always write full aligned vectors.
+struct TestAlignedT {
+  template <typename T>
+  void operator()(T /*unused*/) const {
+    std::mt19937 rng(129);
+    std::uniform_int_distribution<int> dist(0, 16);
+    const HWY_FULL(T) d;
+
+    for (size_t ysize = 1; ysize < 4; ++ysize) {
+      for (size_t xsize = 1; xsize < 64; ++xsize) {
+        Image<T> img(xsize, ysize);
+
+        for (size_t y = 0; y < ysize; ++y) {
+          T* HWY_RESTRICT row = img.MutableRow(y);
+          for (size_t x = 0; x < xsize; x += Lanes(d)) {
+            const auto values = Iota(d, dist(rng));
+            Store(values, d, row + x);
+          }
+        }
+
+        // Sanity check to prevent optimizing out the writes
+        const auto x = std::uniform_int_distribution<size_t>(0, xsize - 1)(rng);
+        const auto y = std::uniform_int_distribution<size_t>(0, ysize - 1)(rng);
+        HWY_ASSERT(img.ConstRow(y)[x] < 16 + Lanes(d));
+      }
+    }
+  }
+};
+
+void TestAligned() { ForUnsignedTypes(TestAlignedT()); }
+
+// Ensure we can write an unaligned vector starting at the last valid value.
+struct TestUnalignedT {
+  template <typename T>
+  void operator()(T /*unused*/) const {
+    std::mt19937 rng(129);
+    std::uniform_int_distribution<int> dist(0, 3);
+    const HWY_FULL(T) d;
+
+    for (size_t ysize = 1; ysize < 4; ++ysize) {
+      for (size_t xsize = 1; xsize < 128; ++xsize) {
+        Image<T> img(xsize, ysize);
+        img.InitializePaddingForUnalignedAccesses();
+
+// This test reads padding, which only works if it was initialized,
+// which only happens in MSAN builds.
+#if defined(MEMORY_SANITIZER) || HWY_IDE
+        // Initialize only the valid samples
+        for (size_t y = 0; y < ysize; ++y) {
+          T* HWY_RESTRICT row = img.MutableRow(y);
+          for (size_t x = 0; x < xsize; ++x) {
+            row[x] = 1 << dist(rng);
+          }
+        }
+
+        // Read padding bits
+        auto accum = Zero(d);
+        for (size_t y = 0; y < ysize; ++y) {
+          T* HWY_RESTRICT row = img.MutableRow(y);
+          for (size_t x = 0; x < xsize; ++x) {
+            accum |= LoadU(d, row + x);
+          }
+        }
+
+        // Ensure padding was zero
+        const size_t N = Lanes(d);
+        auto lanes = AllocateAligned<T>(N);
+        Store(accum, d, lanes.get());
+        for (size_t i = 0; i < N; ++i) {
+          HWY_ASSERT(lanes[i] < 16);
+        }
+#else  // Check that writing padding does not overwrite valid samples
+       // Initialize only the valid samples
+        for (size_t y = 0; y < ysize; ++y) {
+          T* HWY_RESTRICT row = img.MutableRow(y);
+          for (size_t x = 0; x < xsize; ++x) {
+            row[x] = static_cast<T>(x);
+          }
+        }
+
+        // Zero padding and rightmost sample
+        for (size_t y = 0; y < ysize; ++y) {
+          T* HWY_RESTRICT row = img.MutableRow(y);
+          StoreU(Zero(d), d, row + xsize - 1);
+        }
+
+        // Ensure no samples except the rightmost were overwritten
+        for (size_t y = 0; y < ysize; ++y) {
+          T* HWY_RESTRICT row = img.MutableRow(y);
+          for (size_t x = 0; x < xsize - 1; ++x) {
+            HWY_ASSERT_EQ(static_cast<T>(x), row[x]);
+          }
+        }
+#endif
+      }
+    }
+  }
+};
+
+void TestUnaligned() { ForUnsignedTypes(TestUnalignedT()); }
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+HWY_BEFORE_TEST(ImageTest);
+HWY_EXPORT_AND_TEST_P(ImageTest, TestAligned);
+HWY_EXPORT_AND_TEST_P(ImageTest, TestUnaligned);
+}  // namespace hwy
+#endif
--- a/third_party/highway/contrib/math/math-inl.h
+++ b/third_party/highway/contrib/math/math-inl.h
--- a/third_party/highway/contrib/math/math_test.cc
+++ b/third_party/highway/contrib/math/math_test.cc
@ -0,0 +1,188 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cfloat>  // FLT_MAX
+#include <iostream>
+#include <type_traits>
+
+// clang-format off
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "contrib/math/math_test.cc"
+#include "hwy/foreach_target.h"
+
+#include "contrib/math/math-inl.h"
+#include "hwy/tests/test_util-inl.h"
+// clang-format on
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+template <class T, class D>
+void TestMath(const std::string name, T (*fx1)(T), Vec<D> (*fxN)(D, Vec<D>),
+              D d, T min, T max, uint64_t max_error_ulp) {
+  constexpr bool kIsF32 = (sizeof(T) == 4);
+  using UintT = MakeUnsigned<T>;
+
+  const UintT min_bits = BitCast<UintT>(min);
+  const UintT max_bits = BitCast<UintT>(max);
+
+  // If min is negative and max is positive, the range needs to be broken into
+  // two pieces, [+0, max] and [-0, min], otherwise [min, max].
+  int range_count = 1;
+  UintT ranges[2][2] = {{min_bits, max_bits}, {0, 0}};
+  if ((min < 0.0) && (max > 0.0)) {
+    ranges[0][0] = BitCast<UintT>(static_cast<T>(+0.0));
+    ranges[0][1] = max_bits;
+    ranges[1][0] = BitCast<UintT>(static_cast<T>(-0.0));
+    ranges[1][1] = min_bits;
+    range_count = 2;
+  }
+
+  uint64_t max_ulp = 0;
+#if HWY_ARCH_ARM
+  // Emulation is slower, so cannot afford as many.
+  constexpr UintT kSamplesPerRange = 25000;
+#else
+  constexpr UintT kSamplesPerRange = 100000;
+#endif
+  for (int range_index = 0; range_index < range_count; ++range_index) {
+    const UintT start = ranges[range_index][0];
+    const UintT stop = ranges[range_index][1];
+    const UintT step = std::max<UintT>(1, ((stop - start) / kSamplesPerRange));
+    for (UintT value_bits = start; value_bits <= stop; value_bits += step) {
+      const T value = BitCast<T>(std::min(value_bits, stop));
+      const T actual = GetLane(fxN(d, Set(d, value)));
+      const T expected = fx1(value);
+
+      // Skip small inputs and outputs on armv7, it flushes subnormals to zero.
+#if HWY_TARGET == HWY_NEON && HWY_ARCH_ARM_V7
+      if ((std::abs(value) < 1e-37f) || (std::abs(expected) < 1e-37f)) {
+        continue;
+      }
+#endif
+
+      const auto ulp = ComputeUlpDelta(actual, expected);
+      max_ulp = std::max<uint64_t>(max_ulp, ulp);
+      if (ulp > max_error_ulp) {
+        std::cout << name << "<" << (kIsF32 ? "F32x" : "F64x") << Lanes(d)
+                  << ">(" << value << ") expected: " << expected
+                  << " actual: " << actual << std::endl;
+      }
+      HWY_ASSERT(ulp <= max_error_ulp);
+    }
+  }
+  std::cout << (kIsF32 ? "F32x" : "F64x") << Lanes(d)
+            << ", Max ULP: " << max_ulp << std::endl;
+}
+
+#define DEFINE_MATH_TEST(NAME, F32x1, F32xN, F32_MIN, F32_MAX, F32_ERROR, \
+                         F64x1, F64xN, F64_MIN, F64_MAX, F64_ERROR)       \
+  struct Test##NAME {                                                     \
+    template <class T, class D>                                           \
+    HWY_NOINLINE void operator()(T, D d) {                                \
+      if (sizeof(T) == 4) {                                               \
+        TestMath<T, D>(HWY_STR(NAME), F32x1, F32xN, d, F32_MIN, F32_MAX,  \
+                       F32_ERROR);                                        \
+      } else {                                                            \
+        TestMath<T, D>(HWY_STR(NAME), F64x1, F64xN, d, F64_MIN, F64_MAX,  \
+                       F64_ERROR);                                        \
+      }                                                                   \
+    }                                                                     \
+  };                                                                      \
+  HWY_NOINLINE void TestAll##NAME() {                                     \
+    ForFloatTypes(ForPartialVectors<Test##NAME>());                       \
+  }
+
+// Floating point values closest to but less than 1.0
+const float kNearOneF = BitCast<float>(0x3F7FFFFF);
+const double kNearOneD = BitCast<double>(0x3FEFFFFFFFFFFFFFULL);
+
+// clang-format off
+DEFINE_MATH_TEST(Acos,
+  std::acos,  CallAcos,  -1.0,       +1.0,        3,  // NEON is 3 instead of 2
+  std::acos,  CallAcos,  -1.0,       +1.0,        2)
+DEFINE_MATH_TEST(Acosh,
+  std::acosh, CallAcosh, +1.0,       +FLT_MAX,    3,
+  std::acosh, CallAcosh, +1.0,       +DBL_MAX,    3)
+DEFINE_MATH_TEST(Asin,
+  std::asin,  CallAsin,  -1.0,       +1.0,        3,  // NEON is 3 instead of 2
+  std::asin,  CallAsin,  -1.0,       +1.0,        2)
+DEFINE_MATH_TEST(Asinh,
+  std::asinh, CallAsinh, -FLT_MAX,   +FLT_MAX,    3,
+  std::asinh, CallAsinh, -DBL_MAX,   +DBL_MAX,    3)
+DEFINE_MATH_TEST(Atan,
+  std::atan,  CallAtan,  -FLT_MAX,   +FLT_MAX,    3,
+  std::atan,  CallAtan,  -DBL_MAX,   +DBL_MAX,    3)
+DEFINE_MATH_TEST(Atanh,
+  std::atanh, CallAtanh, -kNearOneF, +kNearOneF,  4,  // NEON is 4 instead of 3
+  std::atanh, CallAtanh, -kNearOneD, +kNearOneD,  3)
+DEFINE_MATH_TEST(Cos,
+  std::cos,   CallCos,   -39000.0,   +39000.0,    3,
+  std::cos,   CallCos,   -39000.0,   +39000.0,    3)
+DEFINE_MATH_TEST(Exp,
+  std::exp,   CallExp,   -FLT_MAX,   +104.0,      1,
+  std::exp,   CallExp,   -DBL_MAX,   +104.0,      1)
+DEFINE_MATH_TEST(Expm1,
+  std::expm1, CallExpm1, -FLT_MAX,   +104.0,      4,
+  std::expm1, CallExpm1, -DBL_MAX,   +104.0,      4)
+DEFINE_MATH_TEST(Log,
+  std::log,   CallLog,   +FLT_MIN,   +FLT_MAX,    1,
+  std::log,   CallLog,   +DBL_MIN,   +DBL_MAX,    1)
+DEFINE_MATH_TEST(Log10,
+  std::log10, CallLog10, +FLT_MIN,   +FLT_MAX,    2,
+  std::log10, CallLog10, +DBL_MIN,   +DBL_MAX,    2)
+DEFINE_MATH_TEST(Log1p,
+  std::log1p, CallLog1p, +0.0f,      +1e37,       3,  // NEON is 3 instead of 2
+  std::log1p, CallLog1p, +0.0,       +DBL_MAX,    2)
+DEFINE_MATH_TEST(Log2,
+  std::log2,  CallLog2,  +FLT_MIN,   +FLT_MAX,    2,
+  std::log2,  CallLog2,  +DBL_MIN,   +DBL_MAX,    2)
+DEFINE_MATH_TEST(Sin,
+  std::sin,   CallSin,   -39000.0,   +39000.0,    3,
+  std::sin,   CallSin,   -39000.0,   +39000.0,    3)
+DEFINE_MATH_TEST(Sinh,
+  std::sinh,  CallSinh,  -80.0f,     +80.0f,      4,
+  std::sinh,  CallSinh,  -709.0,     +709.0,      4)
+DEFINE_MATH_TEST(Tanh,
+  std::tanh,  CallTanh,  -FLT_MAX,   +FLT_MAX,    4,
+  std::tanh,  CallTanh,  -DBL_MAX,   +DBL_MAX,    4)
+// clang-format on
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+HWY_BEFORE_TEST(HwyMathTest);
+HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAcos);
+HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAcosh);
+HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAsin);
+HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAsinh);
+HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAtan);
+HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAtanh);
+HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllCos);
+HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllExp);
+HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllExpm1);
+HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog);
+HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog10);
+HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog1p);
+HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog2);
+HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllSin);
+HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllSinh);
+HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllTanh);
+}  // namespace hwy
+#endif
--- a/third_party/highway/debian/changelog
+++ b/third_party/highway/debian/changelog
@ -0,0 +1,31 @@
+highway (0.12.0-1) UNRELEASED; urgency=medium
+
+  * Add Shift*8, Compress16, emulated Scatter/Gather, StoreInterleaved3/4
+  * Remove deprecated HWY_*_LANES, deprecate HWY_GATHER_LANES
+  * Proper IEEE rounding, reduce libstdc++ usage, inlined math
+
+ -- Jan Wassenberg <janwas@google.com>  Thu, 15 Apr 2021 20:00:00 +0200
+
+highway (0.11.1-1) UNRELEASED; urgency=medium
+
+  * Fix clang7 asan error, finish f16 conversions and add test
+
+ -- Jan Wassenberg <janwas@google.com>  Thu, 25 Feb 2021 16:00:00 +0200
+
+highway (0.11.0-1) UNRELEASED; urgency=medium
+
+  * Add RVV+mask logical ops, allow Shl/ShiftLeftSame on all targets, more math
+
+ -- Jan Wassenberg <janwas@google.com>  Thu, 18 Feb 2021 20:00:00 +0200
+
+highway (0.7.0-1) UNRELEASED; urgency=medium
+
+  * Added API stability notice, Compress[Store], contrib/, SignBit, CopySign
+
+ -- Jan Wassenberg <janwas@google.com>  Tue, 5 Jan 2021 17:00:00 +0200
+
+highway (0.1-1) UNRELEASED; urgency=medium
+
+  * Initial debian package.
+
+ -- Alex Deymo <deymo@google.com>  Mon, 19 Oct 2020 16:48:07 +0200
--- a/third_party/highway/debian/compat
+++ b/third_party/highway/debian/compat
@ -0,0 +1 @@
+10
--- a/third_party/highway/debian/control
+++ b/third_party/highway/debian/control
@ -0,0 +1,23 @@
+Source: highway
+Maintainer: JPEG XL Maintainers <jpegxl@google.com>
+Section: misc
+Priority: optional
+Standards-Version: 3.9.8
+Build-Depends: cmake,
+               debhelper (>= 9),
+               libgtest-dev
+Homepage: https://github.com/google/highway
+
+Package: libhwy-dev
+Architecture: any
+Section: libdevel
+Depends: ${misc:Depends}
+Description: Efficient and performance-portable SIMD wrapper (developer files)
+ This library provides type-safe and source-code portable wrappers over
+ existing platform-specific intrinsics. Its design aims for simplicity,
+ reliable efficiency across platforms, and immediate usability with current
+ compilers.
+ .
+ This package installs the development files. There's no runtime library
+ since most of Highway is implemented in headers and only a very small
+ static library is needed.
--- a/third_party/highway/debian/copyright
+++ b/third_party/highway/debian/copyright
@ -0,0 +1,20 @@
+Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
+Upstream-Name: highway
+
+Files: *
+Copyright: 2020 Google LLC
+License: Apache-2.0
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ .
+      http://www.apache.org/licenses/LICENSE-2.0
+ .
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ .
+ On Debian systems, the complete text of the Apache License, Version 2
+ can be found in "/usr/share/common-licenses/Apache-2.0".
--- a/third_party/highway/debian/rules
+++ b/third_party/highway/debian/rules
@ -0,0 +1,6 @@
+#!/usr/bin/make -f
+%:
+	dh $@ --buildsystem=cmake
+
+override_dh_auto_configure:
+	dh_auto_configure -- -DHWY_SYSTEM_GTEST=ON
--- a/third_party/highway/debian/source/format
+++ b/third_party/highway/debian/source/format
@ -0,0 +1 @@
+3.0 (quilt)
--- a/third_party/highway/hwy/aligned_allocator.cc
+++ b/third_party/highway/hwy/aligned_allocator.cc
@ -0,0 +1,138 @@
+// Copyright 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/aligned_allocator.h"
+
+#include <stdarg.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>  // malloc
+
+#include <atomic>
+#include <limits>
+
+#include "hwy/base.h"
+
+namespace hwy {
+namespace {
+
+constexpr size_t kAlignment = HWY_MAX(HWY_ALIGNMENT, kMaxVectorSize);
+// On x86, aliasing can only occur at multiples of 2K, but that's too wasteful
+// if this is used for single-vector allocations. 256 is more reasonable.
+constexpr size_t kAlias = kAlignment * 4;
+
+#pragma pack(push, 1)
+struct AllocationHeader {
+  void* allocated;
+  size_t payload_size;
+};
+#pragma pack(pop)
+
+// Returns a 'random' (cyclical) offset for AllocateAlignedBytes.
+size_t NextAlignedOffset() {
+  static std::atomic<uint32_t> next{0};
+  constexpr uint32_t kGroups = kAlias / kAlignment;
+  const uint32_t group = next.fetch_add(1, std::memory_order_relaxed) % kGroups;
+  const size_t offset = kAlignment * group;
+  HWY_DASSERT((offset % kAlignment == 0) && offset <= kAlias);
+  return offset;
+}
+
+}  // namespace
+
+void* AllocateAlignedBytes(const size_t payload_size, AllocPtr alloc_ptr,
+                           void* opaque_ptr) {
+  if (payload_size >= std::numeric_limits<size_t>::max() / 2) {
+    HWY_DASSERT(false && "payload_size too large");
+    return nullptr;
+  }
+
+  size_t offset = NextAlignedOffset();
+
+  // What: | misalign | unused | AllocationHeader |payload
+  // Size: |<= kAlias | offset                    |payload_size
+  //       ^allocated.^aligned.^header............^payload
+  // The header must immediately precede payload, which must remain aligned.
+  // To avoid wasting space, the header resides at the end of `unused`,
+  // which therefore cannot be empty (offset == 0).
+  if (offset == 0) {
+    offset = kAlignment;  // = RoundUpTo(sizeof(AllocationHeader), kAlignment)
+    static_assert(sizeof(AllocationHeader) <= kAlignment, "Else: round up");
+  }
+
+  const size_t allocated_size = kAlias + offset + payload_size;
+  void* allocated;
+  if (alloc_ptr == nullptr) {
+    allocated = malloc(allocated_size);
+  } else {
+    allocated = (*alloc_ptr)(opaque_ptr, allocated_size);
+  }
+  if (allocated == nullptr) return nullptr;
+  // Always round up even if already aligned - we already asked for kAlias
+  // extra bytes and there's no way to give them back.
+  uintptr_t aligned = reinterpret_cast<uintptr_t>(allocated) + kAlias;
+  static_assert((kAlias & (kAlias - 1)) == 0, "kAlias must be a power of 2");
+  static_assert(kAlias >= kAlignment, "Cannot align to more than kAlias");
+  aligned &= ~(kAlias - 1);
+
+  const uintptr_t payload = aligned + offset;  // still aligned
+
+  // Stash `allocated` and payload_size inside header for FreeAlignedBytes().
+  // The allocated_size can be reconstructed from the payload_size.
+  AllocationHeader* header = reinterpret_cast<AllocationHeader*>(payload) - 1;
+  header->allocated = allocated;
+  header->payload_size = payload_size;
+
+  return HWY_ASSUME_ALIGNED(reinterpret_cast<void*>(payload), kMaxVectorSize);
+}
+
+void FreeAlignedBytes(const void* aligned_pointer, FreePtr free_ptr,
+                      void* opaque_ptr) {
+  if (aligned_pointer == nullptr) return;
+
+  const uintptr_t payload = reinterpret_cast<uintptr_t>(aligned_pointer);
+  HWY_DASSERT(payload % kAlignment == 0);
+  const AllocationHeader* header =
+      reinterpret_cast<const AllocationHeader*>(payload) - 1;
+
+  if (free_ptr == nullptr) {
+    free(header->allocated);
+  } else {
+    (*free_ptr)(opaque_ptr, header->allocated);
+  }
+}
+
+// static
+void AlignedDeleter::DeleteAlignedArray(void* aligned_pointer, FreePtr free_ptr,
+                                        void* opaque_ptr,
+                                        ArrayDeleter deleter) {
+  if (aligned_pointer == nullptr) return;
+
+  const uintptr_t payload = reinterpret_cast<uintptr_t>(aligned_pointer);
+  HWY_DASSERT(payload % kAlignment == 0);
+  const AllocationHeader* header =
+      reinterpret_cast<const AllocationHeader*>(payload) - 1;
+
+  if (deleter) {
+    (*deleter)(aligned_pointer, header->payload_size);
+  }
+
+  if (free_ptr == nullptr) {
+    free(header->allocated);
+  } else {
+    (*free_ptr)(opaque_ptr, header->allocated);
+  }
+}
+
+}  // namespace hwy
--- a/third_party/highway/hwy/aligned_allocator.h
+++ b/third_party/highway/hwy/aligned_allocator.h
@ -0,0 +1,179 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_
+#define HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_
+
+// Memory allocator with support for alignment and offsets.
+
+#include <stddef.h>
+#include <memory>
+
+namespace hwy {
+
+// Minimum alignment of allocated memory for use in HWY_ASSUME_ALIGNED, which
+// requires a literal. This matches typical L1 cache line sizes, which prevents
+// false sharing.
+#define HWY_ALIGNMENT 64
+
+// Pointers to functions equivalent to malloc/free with an opaque void* passed
+// to them.
+using AllocPtr = void* (*)(void* opaque, size_t bytes);
+using FreePtr = void (*)(void* opaque, void* memory);
+
+// Returns null or a pointer to at least `payload_size` (which can be zero)
+// bytes of newly allocated memory, aligned to the larger of HWY_ALIGNMENT and
+// the vector size. Calls `alloc` with the passed `opaque` pointer to obtain
+// memory or malloc() if it is null.
+void* AllocateAlignedBytes(size_t payload_size, AllocPtr alloc_ptr,
+                           void* opaque_ptr);
+
+// Frees all memory. No effect if `aligned_pointer` == nullptr, otherwise it
+// must have been returned from a previous call to `AllocateAlignedBytes`.
+// Calls `free_ptr` with the passed `opaque_ptr` pointer to free the memory; if
+// `free_ptr` function is null, uses the default free().
+void FreeAlignedBytes(const void* aligned_pointer, FreePtr free_ptr,
+                      void* opaque_ptr);
+
+// Class that deletes the aligned pointer passed to operator() calling the
+// destructor before freeing the pointer. This is equivalent to the
+// std::default_delete but for aligned objects. For a similar deleter equivalent
+// to free() for aligned memory see AlignedFreer().
+class AlignedDeleter {
+ public:
+  AlignedDeleter() : free_(nullptr), opaque_ptr_(nullptr) {}
+  AlignedDeleter(FreePtr free_ptr, void* opaque_ptr)
+      : free_(free_ptr), opaque_ptr_(opaque_ptr) {}
+
+  template <typename T>
+  void operator()(T* aligned_pointer) const {
+    return DeleteAlignedArray(aligned_pointer, free_, opaque_ptr_,
+                              TypedArrayDeleter<T>);
+  }
+
+ private:
+  template <typename T>
+  static void TypedArrayDeleter(void* ptr, size_t size_in_bytes) {
+    size_t elems = size_in_bytes / sizeof(T);
+    for (size_t i = 0; i < elems; i++) {
+      // Explicitly call the destructor on each element.
+      (static_cast<T*>(ptr) + i)->~T();
+    }
+  }
+
+  // Function prototype that calls the destructor for each element in a typed
+  // array. TypeArrayDeleter<T> would match this prototype.
+  using ArrayDeleter = void (*)(void* t_ptr, size_t t_size);
+
+  static void DeleteAlignedArray(void* aligned_pointer, FreePtr free_ptr,
+                                 void* opaque_ptr, ArrayDeleter deleter);
+
+  FreePtr free_;
+  void* opaque_ptr_;
+};
+
+// Unique pointer to T with custom aligned deleter. This can be a single
+// element U or an array of element if T is a U[]. The custom aligned deleter
+// will call the destructor on U or each element of a U[] in the array case.
+template <typename T>
+using AlignedUniquePtr = std::unique_ptr<T, AlignedDeleter>;
+
+// Aligned memory equivalent of make_unique<T> using the custom allocators
+// alloc/free with the passed `opaque` pointer. This function calls the
+// constructor with the passed Args... and calls the destructor of the object
+// when the AlignedUniquePtr is destroyed.
+template <typename T, typename... Args>
+AlignedUniquePtr<T> MakeUniqueAlignedWithAlloc(AllocPtr alloc, FreePtr free,
+                                               void* opaque, Args&&... args) {
+  T* ptr = static_cast<T*>(AllocateAlignedBytes(sizeof(T), alloc, opaque));
+  return AlignedUniquePtr<T>(new (ptr) T(std::forward<Args>(args)...),
+                             AlignedDeleter(free, opaque));
+}
+
+// Similar to MakeUniqueAlignedWithAlloc but using the default alloc/free
+// functions.
+template <typename T, typename... Args>
+AlignedUniquePtr<T> MakeUniqueAligned(Args&&... args) {
+  T* ptr = static_cast<T*>(AllocateAlignedBytes(
+      sizeof(T), /*alloc_ptr=*/nullptr, /*opaque_ptr=*/nullptr));
+  return AlignedUniquePtr<T>(
+      new (ptr) T(std::forward<Args>(args)...), AlignedDeleter());
+}
+
+// Aligned memory equivalent of make_unique<T[]> for array types using the
+// custom allocators alloc/free. This function calls the constructor with the
+// passed Args... on every created item. The destructor of each element will be
+// called when the AlignedUniquePtr is destroyed.
+template <typename T, typename... Args>
+AlignedUniquePtr<T[]> MakeUniqueAlignedArrayWithAlloc(
+    size_t items, AllocPtr alloc, FreePtr free, void* opaque, Args&&... args) {
+  T* ptr =
+      static_cast<T*>(AllocateAlignedBytes(items * sizeof(T), alloc, opaque));
+  for (size_t i = 0; i < items; i++) {
+    new (ptr + i) T(std::forward<Args>(args)...);
+  }
+  return AlignedUniquePtr<T[]>(ptr, AlignedDeleter(free, opaque));
+}
+
+template <typename T, typename... Args>
+AlignedUniquePtr<T[]> MakeUniqueAlignedArray(size_t items, Args&&... args) {
+  return MakeUniqueAlignedArrayWithAlloc<T, Args...>(
+      items, nullptr, nullptr, nullptr, std::forward<Args>(args)...);
+}
+
+// Custom deleter for std::unique_ptr equivalent to using free() as a deleter
+// but for aligned memory.
+class AlignedFreer {
+ public:
+  // Pass address of this to ctor to skip deleting externally-owned memory.
+  static void DoNothing(void* /*opaque*/, void* /*aligned_pointer*/) {}
+
+  AlignedFreer() : free_(nullptr), opaque_ptr_(nullptr) {}
+  AlignedFreer(FreePtr free_ptr, void* opaque_ptr)
+      : free_(free_ptr), opaque_ptr_(opaque_ptr) {}
+
+  template <typename T>
+  void operator()(T* aligned_pointer) const {
+    // TODO(deymo): assert that we are using a POD type T.
+    FreeAlignedBytes(aligned_pointer, free_, opaque_ptr_);
+  }
+
+ private:
+  FreePtr free_;
+  void* opaque_ptr_;
+};
+
+// Unique pointer to single POD, or (if T is U[]) an array of POD. For non POD
+// data use AlignedUniquePtr.
+template <typename T>
+using AlignedFreeUniquePtr = std::unique_ptr<T, AlignedFreer>;
+
+// Allocate an aligned and uninitialized array of POD values as a unique_ptr.
+// Upon destruction of the unique_ptr the aligned array will be freed.
+template <typename T>
+AlignedFreeUniquePtr<T[]> AllocateAligned(const size_t items, AllocPtr alloc,
+                                          FreePtr free, void* opaque) {
+  return AlignedFreeUniquePtr<T[]>(
+      static_cast<T*>(AllocateAlignedBytes(items * sizeof(T), alloc, opaque)),
+      AlignedFreer(free, opaque));
+}
+
+// Same as previous AllocateAligned(), using default allocate/free functions.
+template <typename T>
+AlignedFreeUniquePtr<T[]> AllocateAligned(const size_t items) {
+  return AllocateAligned<T>(items, nullptr, nullptr, nullptr);
+}
+
+}  // namespace hwy
+#endif  // HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_
--- a/third_party/highway/hwy/aligned_allocator_test.cc
+++ b/third_party/highway/hwy/aligned_allocator_test.cc
@ -0,0 +1,250 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/aligned_allocator.h"
+
+#include <stddef.h>
+
+#include <new>
+#include <random>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "hwy/base.h"
+
+namespace {
+
+// Sample object that keeps track on an external counter of how many times was
+// the explicit constructor and destructor called.
+template <size_t N>
+class SampleObject {
+ public:
+  SampleObject() { data_[0] = 'a'; }
+  explicit SampleObject(int* counter) : counter_(counter) {
+    if (counter) (*counter)++;
+    data_[0] = 'b';
+  }
+
+  ~SampleObject() {
+    if (counter_) (*counter_)--;
+  }
+
+  static_assert(N > sizeof(int*), "SampleObject size too small.");
+  int* counter_ = nullptr;
+  char data_[N - sizeof(int*)];
+};
+
+class FakeAllocator {
+ public:
+  // static AllocPtr and FreePtr member to be used with the alligned
+  // allocator. These functions calls the private non-static members.
+  static void* StaticAlloc(void* opaque, size_t bytes) {
+    return reinterpret_cast<FakeAllocator*>(opaque)->Alloc(bytes);
+  }
+  static void StaticFree(void* opaque, void* memory) {
+    return reinterpret_cast<FakeAllocator*>(opaque)->Free(memory);
+  }
+
+  // Returns the number of pending allocations to be freed.
+  size_t PendingAllocs() { return allocs_.size(); }
+
+ private:
+  void* Alloc(size_t bytes) {
+    void* ret = malloc(bytes);
+    allocs_.insert(ret);
+    return ret;
+  }
+  void Free(void* memory) {
+    if (!memory) return;
+    EXPECT_NE(allocs_.end(), allocs_.find(memory));
+    free(memory);
+    allocs_.erase(memory);
+  }
+
+  std::set<void*> allocs_;
+};
+
+}  // namespace
+
+namespace hwy {
+
+class AlignedAllocatorTest : public testing::Test {};
+
+TEST(AlignedAllocatorTest, FreeNullptr) {
+  // Calling free with a nullptr is always ok.
+  FreeAlignedBytes(/*aligned_pointer=*/nullptr, /*free_ptr=*/nullptr,
+                   /*opaque_ptr=*/nullptr);
+}
+
+TEST(AlignedAllocatorTest, AllocDefaultPointers) {
+  const size_t kSize = 7777;
+  void* ptr = AllocateAlignedBytes(kSize, /*alloc_ptr=*/nullptr,
+                                   /*opaque_ptr=*/nullptr);
+  ASSERT_NE(nullptr, ptr);
+  // Make sure the pointer is actually aligned.
+  EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr) % kMaxVectorSize);
+  char* p = static_cast<char*>(ptr);
+  size_t ret = 0;
+  for (size_t i = 0; i < kSize; i++) {
+    // Performs a computation using p[] to prevent it being optimized away.
+    p[i] = static_cast<char>(i & 0x7F);
+    if (i) ret += p[i] * p[i - 1];
+  }
+  EXPECT_NE(0U, ret);
+  FreeAlignedBytes(ptr, /*free_ptr=*/nullptr, /*opaque_ptr=*/nullptr);
+}
+
+TEST(AlignedAllocatorTest, EmptyAlignedUniquePtr) {
+  AlignedUniquePtr<SampleObject<32>> ptr(nullptr, AlignedDeleter());
+  AlignedUniquePtr<SampleObject<32>[]> arr(nullptr, AlignedDeleter());
+}
+
+TEST(AlignedAllocatorTest, EmptyAlignedFreeUniquePtr) {
+  AlignedFreeUniquePtr<SampleObject<32>> ptr(nullptr, AlignedFreer());
+  AlignedFreeUniquePtr<SampleObject<32>[]> arr(nullptr, AlignedFreer());
+}
+
+TEST(AlignedAllocatorTest, CustomAlloc) {
+  FakeAllocator fake_alloc;
+
+  const size_t kSize = 7777;
+  void* ptr =
+      AllocateAlignedBytes(kSize, &FakeAllocator::StaticAlloc, &fake_alloc);
+  ASSERT_NE(nullptr, ptr);
+  // We should have only requested one alloc from the allocator.
+  EXPECT_EQ(1U, fake_alloc.PendingAllocs());
+  // Make sure the pointer is actually aligned.
+  EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr) % kMaxVectorSize);
+  FreeAlignedBytes(ptr, &FakeAllocator::StaticFree, &fake_alloc);
+  EXPECT_EQ(0U, fake_alloc.PendingAllocs());
+}
+
+TEST(AlignedAllocatorTest, MakeUniqueAlignedDefaultConstructor) {
+  {
+    auto ptr = MakeUniqueAligned<SampleObject<24>>();
+    // Default constructor sets the data_[0] to 'a'.
+    EXPECT_EQ('a', ptr->data_[0]);
+    EXPECT_EQ(nullptr, ptr->counter_);
+  }
+}
+
+TEST(AlignedAllocatorTest, MakeUniqueAligned) {
+  int counter = 0;
+  {
+    // Creates the object, initializes it with the explicit constructor and
+    // returns an unique_ptr to it.
+    auto ptr = MakeUniqueAligned<SampleObject<24>>(&counter);
+    EXPECT_EQ(1, counter);
+    // Custom constructor sets the data_[0] to 'b'.
+    EXPECT_EQ('b', ptr->data_[0]);
+  }
+  EXPECT_EQ(0, counter);
+}
+
+TEST(AlignedAllocatorTest, MakeUniqueAlignedArray) {
+  int counter = 0;
+  {
+    // Creates the array of objects and initializes them with the explicit
+    // constructor.
+    auto arr = MakeUniqueAlignedArray<SampleObject<24>>(7, &counter);
+    EXPECT_EQ(7, counter);
+    for (size_t i = 0; i < 7; i++) {
+      // Custom constructor sets the data_[0] to 'b'.
+      EXPECT_EQ('b', arr[i].data_[0]) << "Where i = " << i;
+    }
+  }
+  EXPECT_EQ(0, counter);
+}
+
+TEST(AlignedAllocatorTest, AllocSingleInt) {
+  auto ptr = AllocateAligned<uint32_t>(1);
+  ASSERT_NE(nullptr, ptr.get());
+  EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr.get()) % kMaxVectorSize);
+  // Force delete of the unique_ptr now to check that it doesn't crash.
+  ptr.reset(nullptr);
+  EXPECT_EQ(nullptr, ptr.get());
+}
+
+TEST(AlignedAllocatorTest, AllocMultipleInt) {
+  const size_t kSize = 7777;
+  auto ptr = AllocateAligned<uint32_t>(kSize);
+  ASSERT_NE(nullptr, ptr.get());
+  EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr.get()) % kMaxVectorSize);
+  // ptr[i] is actually (*ptr.get())[i] which will use the operator[] of the
+  // underlying type chosen by AllocateAligned() for the std::unique_ptr.
+  EXPECT_EQ(&(ptr[0]) + 1, &(ptr[1]));
+
+  size_t ret = 0;
+  for (size_t i = 0; i < kSize; i++) {
+    // Performs a computation using ptr[] to prevent it being optimized away.
+    ptr[i] = static_cast<uint32_t>(i);
+    if (i) ret += ptr[i] * ptr[i - 1];
+  }
+  EXPECT_NE(0U, ret);
+}
+
+TEST(AlignedAllocatorTest, AllocateAlignedObjectWithoutDestructor) {
+  int counter = 0;
+  {
+    // This doesn't call the constructor.
+    auto obj = AllocateAligned<SampleObject<24>>(1);
+    obj[0].counter_ = &counter;
+  }
+  // Destroying the unique_ptr shouldn't have called the destructor of the
+  // SampleObject<24>.
+  EXPECT_EQ(0, counter);
+}
+
+TEST(AlignedAllocatorTest, MakeUniqueAlignedArrayWithCustomAlloc) {
+  FakeAllocator fake_alloc;
+  int counter = 0;
+  {
+    // Creates the array of objects and initializes them with the explicit
+    // constructor.
+    auto arr = MakeUniqueAlignedArrayWithAlloc<SampleObject<24>>(
+        7, FakeAllocator::StaticAlloc, FakeAllocator::StaticFree, &fake_alloc,
+        &counter);
+    // An array shold still only call a single allocation.
+    EXPECT_EQ(1u, fake_alloc.PendingAllocs());
+    EXPECT_EQ(7, counter);
+    for (size_t i = 0; i < 7; i++) {
+      // Custom constructor sets the data_[0] to 'b'.
+      EXPECT_EQ('b', arr[i].data_[0]) << "Where i = " << i;
+    }
+  }
+  EXPECT_EQ(0, counter);
+  EXPECT_EQ(0u, fake_alloc.PendingAllocs());
+}
+
+TEST(AlignedAllocatorTest, DefaultInit) {
+  // The test is whether this compiles. Default-init is useful for output params
+  // and per-thread storage.
+  std::vector<AlignedUniquePtr<int[]>> ptrs;
+  std::vector<AlignedFreeUniquePtr<double[]>> free_ptrs;
+  ptrs.resize(128);
+  free_ptrs.resize(128);
+  // The following is to prevent elision of the pointers.
+  std::mt19937 rng(129);  // Emscripten lacks random_device.
+  std::uniform_int_distribution<size_t> dist(0, 127);
+  ptrs[dist(rng)] = MakeUniqueAlignedArray<int>(123);
+  free_ptrs[dist(rng)] = AllocateAligned<double>(456);
+  // "Use" pointer without resorting to printf. 0 == 0. Can't shift by 64.
+  const auto addr1 = reinterpret_cast<uintptr_t>(ptrs[dist(rng)].get());
+  const auto addr2 = reinterpret_cast<uintptr_t>(free_ptrs[dist(rng)].get());
+  constexpr size_t kBits = sizeof(uintptr_t) * 8;
+  EXPECT_EQ((addr1 >> (kBits - 1)) >> (kBits - 1),
+            (addr2 >> (kBits - 1)) >> (kBits - 1));
+}
+
+}  // namespace hwy
--- a/third_party/highway/hwy/base.h
+++ b/third_party/highway/hwy/base.h
@ -0,0 +1,635 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAY_HWY_BASE_H_
+#define HIGHWAY_HWY_BASE_H_
+
+// For SIMD module implementations and their callers, target-independent.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <atomic>
+#include <cfloat>
+
+// Add to #if conditions to prevent IDE from graying out code.
+#if (defined __CDT_PARSER__) || (defined __INTELLISENSE__) || \
+    (defined Q_CREATOR_RUN) || (defined(__CLANGD__))
+#define HWY_IDE 1
+#else
+#define HWY_IDE 0
+#endif
+
+//------------------------------------------------------------------------------
+// Detect compiler using predefined macros
+
+// clang-cl defines _MSC_VER but doesn't behave like MSVC in other aspects like
+// used in HWY_DIAGNOSTICS(). We include a check that we are not clang for that
+// purpose.
+#if defined(_MSC_VER) && !defined(__clang__)
+#define HWY_COMPILER_MSVC _MSC_VER
+#else
+#define HWY_COMPILER_MSVC 0
+#endif
+
+#ifdef __INTEL_COMPILER
+#define HWY_COMPILER_ICC __INTEL_COMPILER
+#else
+#define HWY_COMPILER_ICC 0
+#endif
+
+#ifdef __GNUC__
+#define HWY_COMPILER_GCC (__GNUC__ * 100 + __GNUC_MINOR__)
+#else
+#define HWY_COMPILER_GCC 0
+#endif
+
+// Clang can masquerade as MSVC/GCC, in which case both are set.
+#ifdef __clang__
+#ifdef __APPLE__
+// Apple LLVM version is unrelated to the actual Clang version, which we need
+// for enabling workarounds. Use the presence of warning flags to deduce it.
+// Adapted from https://github.com/simd-everywhere/simde/ simde-detect-clang.h.
+#if __has_warning("-Wformat-insufficient-args")
+#define HWY_COMPILER_CLANG 1200
+#elif __has_warning("-Wimplicit-const-int-float-conversion")
+#define HWY_COMPILER_CLANG 1100
+#elif __has_warning("-Wmisleading-indentation")
+#define HWY_COMPILER_CLANG 1000
+#elif defined(__FILE_NAME__)
+#define HWY_COMPILER_CLANG 900
+#elif __has_warning("-Wextra-semi-stmt") || \
+    __has_builtin(__builtin_rotateleft32)
+#define HWY_COMPILER_CLANG 800
+#elif __has_warning("-Wc++98-compat-extra-semi")
+#define HWY_COMPILER_CLANG 700
+#else  // Anything older than 7.0 is not recommended for Highway.
+#define HWY_COMPILER_CLANG 600
+#endif  // __has_warning chain
+#else   // Non-Apple: normal version
+#define HWY_COMPILER_CLANG (__clang_major__ * 100 + __clang_minor__)
+#endif
+#else  // Not clang
+#define HWY_COMPILER_CLANG 0
+#endif
+
+// More than one may be nonzero, but we want at least one.
+#if !HWY_COMPILER_MSVC && !HWY_COMPILER_ICC && !HWY_COMPILER_GCC && \
+    !HWY_COMPILER_CLANG
+#error "Unsupported compiler"
+#endif
+
+//------------------------------------------------------------------------------
+// Compiler-specific definitions
+
+#define HWY_STR_IMPL(macro) #macro
+#define HWY_STR(macro) HWY_STR_IMPL(macro)
+
+#if HWY_COMPILER_MSVC
+
+#include <intrin.h>
+
+#define HWY_RESTRICT __restrict
+#define HWY_INLINE __forceinline
+#define HWY_NOINLINE __declspec(noinline)
+#define HWY_FLATTEN
+#define HWY_NORETURN __declspec(noreturn)
+#define HWY_LIKELY(expr) (expr)
+#define HWY_UNLIKELY(expr) (expr)
+#define HWY_PRAGMA(tokens) __pragma(tokens)
+#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(warning(tokens))
+#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(msc)
+#define HWY_MAYBE_UNUSED
+#define HWY_HAS_ASSUME_ALIGNED 0
+#if (_MSC_VER >= 1700)
+#define HWY_MUST_USE_RESULT _Check_return_
+#else
+#define HWY_MUST_USE_RESULT
+#endif
+
+#else
+
+#define HWY_RESTRICT __restrict__
+#define HWY_INLINE inline __attribute__((always_inline))
+#define HWY_NOINLINE __attribute__((noinline))
+#define HWY_FLATTEN __attribute__((flatten))
+#define HWY_NORETURN __attribute__((noreturn))
+#define HWY_LIKELY(expr) __builtin_expect(!!(expr), 1)
+#define HWY_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
+#define HWY_PRAGMA(tokens) _Pragma(#tokens)
+#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(GCC diagnostic tokens)
+#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(gcc)
+// Encountered "attribute list cannot appear here" when using the C++17
+// [[maybe_unused]], so only use the old style attribute for now.
+#define HWY_MAYBE_UNUSED __attribute__((unused))
+#define HWY_MUST_USE_RESULT __attribute__((warn_unused_result))
+
+#endif  // !HWY_COMPILER_MSVC
+
+//------------------------------------------------------------------------------
+// Builtin/attributes
+
+#ifdef __has_builtin
+#define HWY_HAS_BUILTIN(name) __has_builtin(name)
+#else
+#define HWY_HAS_BUILTIN(name) 0
+#endif
+
+#ifdef __has_attribute
+#define HWY_HAS_ATTRIBUTE(name) __has_attribute(name)
+#else
+#define HWY_HAS_ATTRIBUTE(name) 0
+#endif
+
+// Enables error-checking of format strings.
+#if HWY_HAS_ATTRIBUTE(__format__)
+#define HWY_FORMAT(idx_fmt, idx_arg) \
+  __attribute__((__format__(__printf__, idx_fmt, idx_arg)))
+#else
+#define HWY_FORMAT(idx_fmt, idx_arg)
+#endif
+
+// Returns a void* pointer which the compiler then assumes is N-byte aligned.
+// Example: float* HWY_RESTRICT aligned = (float*)HWY_ASSUME_ALIGNED(in, 32);
+//
+// The assignment semantics are required by GCC/Clang. ICC provides an in-place
+// __assume_aligned, whereas MSVC's __assume appears unsuitable.
+#if HWY_HAS_BUILTIN(__builtin_assume_aligned)
+#define HWY_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align))
+#else
+#define HWY_ASSUME_ALIGNED(ptr, align) (ptr) /* not supported */
+#endif
+
+// Clang and GCC require attributes on each function into which SIMD intrinsics
+// are inlined. Support both per-function annotation (HWY_ATTR) for lambdas and
+// automatic annotation via pragmas.
+#if HWY_COMPILER_CLANG
+#define HWY_PUSH_ATTRIBUTES(targets_str)                                     \
+  HWY_PRAGMA(clang attribute push(__attribute__((target(targets_str))), \
+                                       apply_to = function))
+#define HWY_POP_ATTRIBUTES HWY_PRAGMA(clang attribute pop)
+#elif HWY_COMPILER_GCC
+#define HWY_PUSH_ATTRIBUTES(targets_str) \
+  HWY_PRAGMA(GCC push_options) HWY_PRAGMA(GCC target targets_str)
+#define HWY_POP_ATTRIBUTES HWY_PRAGMA(GCC pop_options)
+#else
+#define HWY_PUSH_ATTRIBUTES(targets_str)
+#define HWY_POP_ATTRIBUTES
+#endif
+
+//------------------------------------------------------------------------------
+// Detect architecture using predefined macros
+
+#if defined(__i386__) || defined(_M_IX86)
+#define HWY_ARCH_X86_32 1
+#else
+#define HWY_ARCH_X86_32 0
+#endif
+
+#if defined(__x86_64__) || defined(_M_X64)
+#define HWY_ARCH_X86_64 1
+#else
+#define HWY_ARCH_X86_64 0
+#endif
+
+#if HWY_ARCH_X86_32 || HWY_ARCH_X86_64
+#define HWY_ARCH_X86 1
+#else
+#define HWY_ARCH_X86 0
+#endif
+
+#if defined(__powerpc64__) || defined(_M_PPC)
+#define HWY_ARCH_PPC 1
+#else
+#define HWY_ARCH_PPC 0
+#endif
+
+#if defined(__ARM_ARCH_ISA_A64) || defined(__aarch64__) || defined(_M_ARM64)
+#define HWY_ARCH_ARM_A64 1
+#else
+#define HWY_ARCH_ARM_A64 0
+#endif
+
+#if defined(__arm__) || defined(_M_ARM)
+#define HWY_ARCH_ARM_V7 1
+#else
+#define HWY_ARCH_ARM_V7 0
+#endif
+
+#if HWY_ARCH_ARM_A64 && HWY_ARCH_ARM_V7
+#error "Cannot have both A64 and V7"
+#endif
+
+#if HWY_ARCH_ARM_A64 || HWY_ARCH_ARM_V7
+#define HWY_ARCH_ARM 1
+#else
+#define HWY_ARCH_ARM 0
+#endif
+
+#if defined(__EMSCRIPTEN__) || defined(__wasm__) || defined(__WASM__)
+#define HWY_ARCH_WASM 1
+#else
+#define HWY_ARCH_WASM 0
+#endif
+
+#ifdef __riscv
+#define HWY_ARCH_RVV 1
+#else
+#define HWY_ARCH_RVV 0
+#endif
+
+#if (HWY_ARCH_X86 + HWY_ARCH_PPC + HWY_ARCH_ARM + HWY_ARCH_WASM + \
+     HWY_ARCH_RVV) != 1
+#error "Must detect exactly one platform"
+#endif
+
+//------------------------------------------------------------------------------
+// Macros
+
+#define HWY_API static HWY_INLINE HWY_FLATTEN HWY_MAYBE_UNUSED
+
+#define HWY_CONCAT_IMPL(a, b) a##b
+#define HWY_CONCAT(a, b) HWY_CONCAT_IMPL(a, b)
+
+#define HWY_MIN(a, b) ((a) < (b) ? (a) : (b))
+#define HWY_MAX(a, b) ((a) > (b) ? (a) : (b))
+
+// Compile-time fence to prevent undesirable code reordering. On Clang x86, the
+// typical asm volatile("" : : : "memory") has no effect, whereas atomic fence
+// does, without generating code.
+#if HWY_ARCH_X86
+#define HWY_FENCE std::atomic_thread_fence(std::memory_order_acq_rel)
+#else
+// TODO(janwas): investigate alternatives. On ARM, the above generates barriers.
+#define HWY_FENCE
+#endif
+
+// 4 instances of a given literal value, useful as input to LoadDup128.
+#define HWY_REP4(literal) literal, literal, literal, literal
+
+#define HWY_ABORT(format, ...) \
+  ::hwy::Abort(__FILE__, __LINE__, format, ##__VA_ARGS__)
+
+// Always enabled.
+#define HWY_ASSERT(condition)             \
+  do {                                    \
+    if (!(condition)) {                   \
+      HWY_ABORT("Assert %s", #condition); \
+    }                                     \
+  } while (0)
+
+// Only for "debug" builds
+#if !defined(NDEBUG) || defined(ADDRESS_SANITIZER) || \
+    defined(MEMORY_SANITIZER) || defined(THREAD_SANITIZER)
+#define HWY_DASSERT(condition) HWY_ASSERT(condition)
+#else
+#define HWY_DASSERT(condition) \
+  do {                         \
+  } while (0)
+#endif
+
+
+namespace hwy {
+
+//------------------------------------------------------------------------------
+// Alignment
+
+// Not guaranteed to be an upper bound, but the alignment established by
+// aligned_allocator is HWY_MAX(HWY_ALIGNMENT, kMaxVectorSize).
+#if HWY_ARCH_X86
+static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 64;  // AVX-512
+#define HWY_ALIGN_MAX alignas(64)
+#elif HWY_ARCH_RVV
+// Not actually an upper bound on the size, but this value prevents crossing a
+// 4K boundary (relevant on Andes).
+static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 4096;
+#define HWY_ALIGN_MAX alignas(8)  // only elements need be aligned
+#else
+static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
+#define HWY_ALIGN_MAX alignas(16)
+#endif
+
+//------------------------------------------------------------------------------
+// Lane types
+
+// Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
+// by concatenating base type and bits.
+
+// RVV already has a builtin type and the GCC intrinsics require it.
+#if HWY_ARCH_RVV && HWY_COMPILER_GCC
+using float16_t = __fp16;
+// Clang does not allow __fp16 arguments, but scalar.h requires LaneType
+// arguments, so use a wrapper.
+// TODO(janwas): replace with _Float16 when that is supported?
+#else
+#pragma pack(push, 1)
+struct float16_t {
+  uint16_t bits;
+};
+#pragma pack(pop)
+#endif
+
+using float32_t = float;
+using float64_t = double;
+
+//------------------------------------------------------------------------------
+// Controlling overload resolution (SFINAE)
+
+template <bool Condition, class T>
+struct EnableIfT {};
+template <class T>
+struct EnableIfT<true, T> {
+  using type = T;
+};
+
+template <bool Condition, class T = void>
+using EnableIf = typename EnableIfT<Condition, T>::type;
+
+// Insert into template/function arguments to enable this overload only for
+// vectors of AT MOST this many bits.
+//
+// Note that enabling for exactly 128 bits is unnecessary because a function can
+// simply be overloaded with Vec128<T> and Full128<T> descriptor. Enabling for
+// other sizes (e.g. 64 bit) can be achieved with Simd<T, 8 / sizeof(T)>.
+#define HWY_IF_LE128(T, N) hwy::EnableIf<N * sizeof(T) <= 16>* = nullptr
+#define HWY_IF_LE64(T, N) hwy::EnableIf<N * sizeof(T) <= 8>* = nullptr
+#define HWY_IF_LE32(T, N) hwy::EnableIf<N * sizeof(T) <= 4>* = nullptr
+
+#define HWY_IF_UNSIGNED(T) hwy::EnableIf<!IsSigned<T>()>* = nullptr
+#define HWY_IF_SIGNED(T) \
+  hwy::EnableIf<IsSigned<T>() && !IsFloat<T>()>* = nullptr
+#define HWY_IF_FLOAT(T) hwy::EnableIf<hwy::IsFloat<T>()>* = nullptr
+#define HWY_IF_NOT_FLOAT(T) hwy::EnableIf<!hwy::IsFloat<T>()>* = nullptr
+
+#define HWY_IF_LANE_SIZE(T, bytes) \
+  hwy::EnableIf<sizeof(T) == (bytes)>* = nullptr
+#define HWY_IF_NOT_LANE_SIZE(T, bytes) \
+  hwy::EnableIf<sizeof(T) != (bytes)>* = nullptr
+
+// Empty struct used as a size tag type.
+template <size_t N>
+struct SizeTag {};
+
+//------------------------------------------------------------------------------
+// Type traits
+
+template <typename T>
+constexpr bool IsFloat() {
+  return T(1.25) != T(1);
+}
+
+template <typename T>
+constexpr bool IsSigned() {
+  return T(0) > T(-1);
+}
+
+// Largest/smallest representable integer values.
+template <typename T>
+constexpr T LimitsMax() {
+  static_assert(!IsFloat<T>(), "Only for integer types");
+  return IsSigned<T>() ? T((1ULL << (sizeof(T) * 8 - 1)) - 1)
+                       : static_cast<T>(~0ull);
+}
+template <typename T>
+constexpr T LimitsMin() {
+  static_assert(!IsFloat<T>(), "Only for integer types");
+  return IsSigned<T>() ? T(-1) - LimitsMax<T>() : T(0);
+}
+
+// Largest/smallest representable value (integer or float). This naming avoids
+// confusion with numeric_limits<float>::min() (the smallest positive value).
+template <typename T>
+constexpr T LowestValue() {
+  return LimitsMin<T>();
+}
+template <>
+constexpr float LowestValue<float>() {
+  return -FLT_MAX;
+}
+template <>
+constexpr double LowestValue<double>() {
+  return -DBL_MAX;
+}
+
+template <typename T>
+constexpr T HighestValue() {
+  return LimitsMax<T>();
+}
+template <>
+constexpr float HighestValue<float>() {
+  return FLT_MAX;
+}
+template <>
+constexpr double HighestValue<double>() {
+  return DBL_MAX;
+}
+
+// Returns bitmask of the exponent field in IEEE binary32/64.
+template <typename T>
+constexpr T ExponentMask() {
+  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
+  return 0;
+}
+template <>
+constexpr uint32_t ExponentMask<uint32_t>() {
+  return 0x7F800000;
+}
+template <>
+constexpr uint64_t ExponentMask<uint64_t>() {
+  return 0x7FF0000000000000ULL;
+}
+
+// Returns 1 << mantissa_bits as a floating-point number. All integers whose
+// absolute value are less than this can be represented exactly.
+template <typename T>
+constexpr T MantissaEnd() {
+  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
+  return 0;
+}
+template <>
+constexpr float MantissaEnd<float>() {
+  return 8388608.0f;  // 1 << 23
+}
+template <>
+constexpr double MantissaEnd<double>() {
+  // floating point literal with p52 requires C++17.
+  return 4503599627370496.0;  // 1 << 52
+}
+
+//------------------------------------------------------------------------------
+// Type relations
+
+namespace detail {
+
+template <typename T>
+struct Relations;
+template <>
+struct Relations<uint8_t> {
+  using Unsigned = uint8_t;
+  using Signed = int8_t;
+  using Wide = uint16_t;
+};
+template <>
+struct Relations<int8_t> {
+  using Unsigned = uint8_t;
+  using Signed = int8_t;
+  using Wide = int16_t;
+};
+template <>
+struct Relations<uint16_t> {
+  using Unsigned = uint16_t;
+  using Signed = int16_t;
+  using Wide = uint32_t;
+  using Narrow = uint8_t;
+};
+template <>
+struct Relations<int16_t> {
+  using Unsigned = uint16_t;
+  using Signed = int16_t;
+  using Wide = int32_t;
+  using Narrow = int8_t;
+};
+template <>
+struct Relations<uint32_t> {
+  using Unsigned = uint32_t;
+  using Signed = int32_t;
+  using Float = float;
+  using Wide = uint64_t;
+  using Narrow = uint16_t;
+};
+template <>
+struct Relations<int32_t> {
+  using Unsigned = uint32_t;
+  using Signed = int32_t;
+  using Float = float;
+  using Wide = int64_t;
+  using Narrow = int16_t;
+};
+template <>
+struct Relations<uint64_t> {
+  using Unsigned = uint64_t;
+  using Signed = int64_t;
+  using Float = double;
+  using Narrow = uint32_t;
+};
+template <>
+struct Relations<int64_t> {
+  using Unsigned = uint64_t;
+  using Signed = int64_t;
+  using Float = double;
+  using Narrow = int32_t;
+};
+template <>
+struct Relations<float16_t> {
+  using Unsigned = uint16_t;
+  using Signed = int16_t;
+  using Float = float16_t;
+  using Wide = float;
+};
+template <>
+struct Relations<float> {
+  using Unsigned = uint32_t;
+  using Signed = int32_t;
+  using Float = float;
+  using Wide = double;
+};
+template <>
+struct Relations<double> {
+  using Unsigned = uint64_t;
+  using Signed = int64_t;
+  using Float = double;
+  using Narrow = float;
+};
+
+}  // namespace detail
+
+// Aliases for types of a different category, but the same size.
+template <typename T>
+using MakeUnsigned = typename detail::Relations<T>::Unsigned;
+template <typename T>
+using MakeSigned = typename detail::Relations<T>::Signed;
+template <typename T>
+using MakeFloat = typename detail::Relations<T>::Float;
+
+// Aliases for types of the same category, but different size.
+template <typename T>
+using MakeWide = typename detail::Relations<T>::Wide;
+template <typename T>
+using MakeNarrow = typename detail::Relations<T>::Narrow;
+
+//------------------------------------------------------------------------------
+// Helper functions
+
+template <typename T1, typename T2>
+constexpr inline T1 DivCeil(T1 a, T2 b) {
+  return (a + b - 1) / b;
+}
+
+// Works for any `align`; if a power of two, compiler emits ADD+AND.
+constexpr inline size_t RoundUpTo(size_t what, size_t align) {
+  return DivCeil(what, align) * align;
+}
+
+// Undefined results for x == 0.
+HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
+#if HWY_COMPILER_MSVC
+  unsigned long index;  // NOLINT
+  _BitScanForward(&index, x);
+  return index;
+#else  // HWY_COMPILER_MSVC
+  return static_cast<size_t>(__builtin_ctz(x));
+#endif  // HWY_COMPILER_MSVC
+}
+
+HWY_API size_t PopCount(uint64_t x) {
+#if HWY_COMPILER_CLANG || HWY_COMPILER_GCC
+  return static_cast<size_t>(__builtin_popcountll(x));
+#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
+  return _mm_popcnt_u64(x);
+#elif HWY_COMPILER_MSVC
+  return _mm_popcnt_u32(uint32_t(x)) + _mm_popcnt_u32(uint32_t(x >> 32));
+#else
+  x -= ((x >> 1) & 0x55555555U);
+  x = (((x >> 2) & 0x33333333U) + (x & 0x33333333U));
+  x = (((x >> 4) + x) & 0x0F0F0F0FU);
+  x += (x >> 8);
+  x += (x >> 16);
+  x += (x >> 32);
+  x = x & 0x0000007FU;
+  return (unsigned int)x;
+#endif
+}
+
+// The source/destination must not overlap/alias.
+template <size_t kBytes, typename From, typename To>
+HWY_API void CopyBytes(const From* from, To* to) {
+#if HWY_COMPILER_MSVC
+  const uint8_t* HWY_RESTRICT from_bytes =
+      reinterpret_cast<const uint8_t*>(from);
+  uint8_t* HWY_RESTRICT to_bytes = reinterpret_cast<uint8_t*>(to);
+  for (size_t i = 0; i < kBytes; ++i) {
+    to_bytes[i] = from_bytes[i];
+  }
+#else
+  // Avoids horrible codegen on Clang (series of PINSRB)
+  __builtin_memcpy(to, from, kBytes);
+#endif
+}
+
+HWY_NORETURN void HWY_FORMAT(3, 4)
+    Abort(const char* file, int line, const char* format, ...);
+
+}  // namespace hwy
+
+#endif  // HIGHWAY_HWY_BASE_H_
--- a/third_party/highway/hwy/base_test.cc
+++ b/third_party/highway/hwy/base_test.cc
@ -0,0 +1,123 @@
+// Copyright 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <limits>
+
+#include "hwy/base.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "base_test.cc"
+#include "hwy/foreach_target.h"
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+HWY_NOINLINE void TestAllLimits() {
+  HWY_ASSERT_EQ(uint8_t(0), LimitsMin<uint8_t>());
+  HWY_ASSERT_EQ(uint16_t(0), LimitsMin<uint16_t>());
+  HWY_ASSERT_EQ(uint32_t(0), LimitsMin<uint32_t>());
+  HWY_ASSERT_EQ(uint64_t(0), LimitsMin<uint64_t>());
+
+  HWY_ASSERT_EQ(int8_t(-128), LimitsMin<int8_t>());
+  HWY_ASSERT_EQ(int16_t(-32768), LimitsMin<int16_t>());
+  HWY_ASSERT_EQ(int32_t(0x80000000u), LimitsMin<int32_t>());
+  HWY_ASSERT_EQ(int64_t(0x8000000000000000ull), LimitsMin<int64_t>());
+
+  HWY_ASSERT_EQ(uint8_t(0xFF), LimitsMax<uint8_t>());
+  HWY_ASSERT_EQ(uint16_t(0xFFFF), LimitsMax<uint16_t>());
+  HWY_ASSERT_EQ(uint32_t(0xFFFFFFFFu), LimitsMax<uint32_t>());
+  HWY_ASSERT_EQ(uint64_t(0xFFFFFFFFFFFFFFFFull), LimitsMax<uint64_t>());
+
+  HWY_ASSERT_EQ(int8_t(0x7F), LimitsMax<int8_t>());
+  HWY_ASSERT_EQ(int16_t(0x7FFF), LimitsMax<int16_t>());
+  HWY_ASSERT_EQ(int32_t(0x7FFFFFFFu), LimitsMax<int32_t>());
+  HWY_ASSERT_EQ(int64_t(0x7FFFFFFFFFFFFFFFull), LimitsMax<int64_t>());
+}
+
+struct TestLowestHighest {
+  template <class T>
+  HWY_NOINLINE void operator()(T /*unused*/) const {
+    HWY_ASSERT_EQ(std::numeric_limits<T>::lowest(), LowestValue<T>());
+    HWY_ASSERT_EQ(std::numeric_limits<T>::max(), HighestValue<T>());
+  }
+};
+
+HWY_NOINLINE void TestAllLowestHighest() { ForAllTypes(TestLowestHighest()); }
+struct TestIsUnsigned {
+  template <class T>
+  HWY_NOINLINE void operator()(T /*unused*/) const {
+    static_assert(!IsFloat<T>(), "Expected !IsFloat");
+    static_assert(!IsSigned<T>(), "Expected !IsSigned");
+  }
+};
+
+struct TestIsSigned {
+  template <class T>
+  HWY_NOINLINE void operator()(T /*unused*/) const {
+    static_assert(!IsFloat<T>(), "Expected !IsFloat");
+    static_assert(IsSigned<T>(), "Expected IsSigned");
+  }
+};
+
+struct TestIsFloat {
+  template <class T>
+  HWY_NOINLINE void operator()(T /*unused*/) const {
+    static_assert(IsFloat<T>(), "Expected IsFloat");
+    static_assert(IsSigned<T>(), "Floats are also considered signed");
+  }
+};
+
+HWY_NOINLINE void TestAllType() {
+  ForUnsignedTypes(TestIsUnsigned());
+  ForSignedTypes(TestIsSigned());
+  ForFloatTypes(TestIsFloat());
+}
+
+HWY_NOINLINE void TestAllPopCount() {
+  HWY_ASSERT_EQ(size_t(0), PopCount(0u));
+  HWY_ASSERT_EQ(size_t(1), PopCount(1u));
+  HWY_ASSERT_EQ(size_t(1), PopCount(2u));
+  HWY_ASSERT_EQ(size_t(2), PopCount(3u));
+  HWY_ASSERT_EQ(size_t(1), PopCount(0x80000000u));
+  HWY_ASSERT_EQ(size_t(31), PopCount(0x7FFFFFFFu));
+  HWY_ASSERT_EQ(size_t(32), PopCount(0xFFFFFFFFu));
+
+  HWY_ASSERT_EQ(size_t(1), PopCount(0x80000000ull));
+  HWY_ASSERT_EQ(size_t(31), PopCount(0x7FFFFFFFull));
+  HWY_ASSERT_EQ(size_t(32), PopCount(0xFFFFFFFFull));
+  HWY_ASSERT_EQ(size_t(33), PopCount(0x10FFFFFFFFull));
+  HWY_ASSERT_EQ(size_t(63), PopCount(0xFFFEFFFFFFFFFFFFull));
+  HWY_ASSERT_EQ(size_t(64), PopCount(0xFFFFFFFFFFFFFFFFull));
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+HWY_BEFORE_TEST(BaseTest);
+HWY_EXPORT_AND_TEST_P(BaseTest, TestAllLimits);
+HWY_EXPORT_AND_TEST_P(BaseTest, TestAllLowestHighest);
+HWY_EXPORT_AND_TEST_P(BaseTest, TestAllType);
+HWY_EXPORT_AND_TEST_P(BaseTest, TestAllPopCount);
+}  // namespace hwy
+#endif
--- a/third_party/highway/hwy/cache_control.h
+++ b/third_party/highway/hwy/cache_control.h
@ -0,0 +1,88 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAY_HWY_CACHE_CONTROL_H_
+#define HIGHWAY_HWY_CACHE_CONTROL_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "hwy/base.h"
+
+// Requires SSE2; fails to compile on 32-bit Clang 7 (see
+// https://github.com/gperftools/gperftools/issues/946).
+#if !defined(__SSE2__) || (HWY_COMPILER_CLANG && HWY_ARCH_X86_32)
+#undef HWY_DISABLE_CACHE_CONTROL
+#define HWY_DISABLE_CACHE_CONTROL
+#endif
+
+// intrin.h is sufficient on MSVC and already included by base.h.
+#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) && !HWY_COMPILER_MSVC
+#include <emmintrin.h>  // SSE2
+#endif
+
+namespace hwy {
+
+// Even if N*sizeof(T) is smaller, Stream may write a multiple of this size.
+#define HWY_STREAM_MULTIPLE 16
+
+// The following functions may also require an attribute.
+#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) && !HWY_COMPILER_MSVC
+#define HWY_ATTR_CACHE __attribute__((target("sse2")))
+#else
+#define HWY_ATTR_CACHE
+#endif
+
+// Delays subsequent loads until prior loads are visible. On Intel CPUs, also
+// serves as a full fence (waits for all prior instructions to complete).
+// No effect on non-x86.
+HWY_INLINE HWY_ATTR_CACHE void LoadFence() {
+#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
+  _mm_lfence();
+#endif
+}
+
+// Ensures previous weakly-ordered stores are visible. No effect on non-x86.
+HWY_INLINE HWY_ATTR_CACHE void StoreFence() {
+#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
+  _mm_sfence();
+#endif
+}
+
+// Begins loading the cache line containing "p".
+template <typename T>
+HWY_INLINE HWY_ATTR_CACHE void Prefetch(const T* p) {
+#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
+  _mm_prefetch(reinterpret_cast<const char*>(p), _MM_HINT_T0);
+#elif HWY_COMPILER_GCC || HWY_COMPILER_CLANG
+  // Hint=0 (NTA) behavior differs, but skipping outer caches is probably not
+  // desirable, so use the default 3 (keep in caches).
+  __builtin_prefetch(p, /*write=*/0, /*hint=*/3);
+#else
+  (void)p;
+#endif
+}
+
+// Invalidates and flushes the cache line containing "p". No effect on non-x86.
+HWY_INLINE HWY_ATTR_CACHE void FlushCacheline(const void* p) {
+#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
+  _mm_clflush(p);
+#else
+  (void)p;
+#endif
+}
+
+}  // namespace hwy
+
+#endif  // HIGHWAY_HWY_CACHE_CONTROL_H_
--- a/third_party/highway/hwy/examples/benchmark.cc
+++ b/third_party/highway/hwy/examples/benchmark.cc
@ -0,0 +1,243 @@
+// Copyright 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "hwy/examples/benchmark.cc"
+#include "hwy/foreach_target.h"
+
+#include <stddef.h>
+#include <stdio.h>
+
+#include <memory>
+#include <numeric>  // iota
+
+#include "hwy/aligned_allocator.h"
+#include "hwy/highway.h"
+#include "hwy/nanobenchmark.h"
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+#if HWY_TARGET != HWY_SCALAR
+using hwy::HWY_NAMESPACE::CombineShiftRightBytes;
+#endif
+
+class TwoArray {
+ public:
+  // Passed to ctor as a value NOT known to the compiler. Must be a multiple of
+  // the vector lane count * 8.
+  static size_t NumItems() { return 3456; }
+
+  explicit TwoArray(const size_t num_items)
+      : a_(AllocateAligned<float>(num_items * 2)), b_(a_.get() + num_items) {
+    const float init = num_items / NumItems();  // 1, but compiler doesn't know
+    std::iota(a_.get(), a_.get() + num_items, init);
+    std::iota(b_, b_ + num_items, init);
+  }
+
+ protected:
+  AlignedFreeUniquePtr<float[]> a_;
+  float* b_;
+};
+
+// Measures durations, verifies results, prints timings.
+template <class Benchmark>
+void RunBenchmark(const char* caption) {
+  printf("%10s: ", caption);
+  const size_t kNumInputs = 1;
+  const size_t num_items = Benchmark::NumItems() * Unpredictable1();
+  const FuncInput inputs[kNumInputs] = {num_items};
+  Result results[kNumInputs];
+
+  Benchmark benchmark(num_items);
+
+  Params p;
+  p.verbose = false;
+  p.max_evals = 7;
+  p.target_rel_mad = 0.002;
+  const size_t num_results = MeasureClosure(
+      [&benchmark](const FuncInput input) { return benchmark(input); }, inputs,
+      kNumInputs, results, p);
+  if (num_results != kNumInputs) {
+    fprintf(stderr, "MeasureClosure failed.\n");
+  }
+
+  benchmark.Verify(num_items);
+
+  for (size_t i = 0; i < num_results; ++i) {
+    const double cycles_per_item = results[i].ticks / results[i].input;
+    const double mad = results[i].variability * cycles_per_item;
+    printf("%6zu: %6.3f (+/- %5.3f)\n", results[i].input, cycles_per_item, mad);
+  }
+}
+
+void Intro() {
+  HWY_ALIGN const float in[16] = {1, 2, 3, 4, 5, 6};
+  HWY_ALIGN float out[16];
+  HWY_FULL(float) d;  // largest possible vector
+  for (size_t i = 0; i < 16; i += Lanes(d)) {
+    const auto vec = Load(d, in + i);  // aligned!
+    auto result = vec * vec;
+    result += result;  // can update if not const
+    Store(result, d, out + i);
+  }
+  printf("\nF(x)->2*x^2, F(%.0f) = %.1f\n", in[2], out[2]);
+}
+
+// BEGINNER: dot product
+// 0.4 cyc/float = bronze, 0.25 = silver, 0.15 = gold!
+class BenchmarkDot : public TwoArray {
+ public:
+  explicit BenchmarkDot(size_t num_items) : TwoArray(num_items), dot_{-1.0f} {}
+
+  FuncOutput operator()(const size_t num_items) {
+    HWY_FULL(float) d;
+    const size_t N = Lanes(d);
+    using V = decltype(Zero(d));
+    constexpr int unroll = 8;
+    // Compiler doesn't make independent sum* accumulators, so unroll manually.
+    // Some older compilers might not be able to fit the 8 arrays in registers,
+    // so manual unrolling can be helpfull if you run into this issue.
+    // 2 FMA ports * 4 cycle latency = 8x unrolled.
+    V sum[unroll];
+    for (int i = 0; i < unroll; ++i) {
+      sum[i] = Zero(d);
+    }
+    const float* const HWY_RESTRICT pa = &a_[0];
+    const float* const HWY_RESTRICT pb = b_;
+    for (size_t i = 0; i < num_items; i += unroll * N) {
+      for (int j = 0; j < unroll; ++j) {
+        const auto a = Load(d, pa + i + j * N);
+        const auto b = Load(d, pb + i + j * N);
+        sum[j] = MulAdd(a, b, sum[j]);
+      }
+    }
+    // Reduction tree: sum of all accumulators by pairs into sum[0], then the
+    // lanes.
+    for (int power = 1; power < unroll; power *= 2) {
+      for (int i = 0; i < unroll; i += 2 * power) {
+        sum[i] += sum[i + power];
+      }
+    }
+    return dot_ = GetLane(SumOfLanes(sum[0]));
+  }
+  void Verify(size_t num_items) {
+    if (dot_ == -1.0f) {
+      fprintf(stderr, "Dot: must call Verify after benchmark");
+      abort();
+    }
+
+    const float expected =
+        std::inner_product(a_.get(), a_.get() + num_items, b_, 0.0f);
+    const float rel_err = std::abs(expected - dot_) / expected;
+    if (rel_err > 1.1E-6f) {
+      fprintf(stderr, "Dot: expected %e actual %e (%e)\n", expected, dot_,
+              rel_err);
+      abort();
+    }
+  }
+
+ private:
+  float dot_;  // for Verify
+};
+
+// INTERMEDIATE: delta coding
+// 1.0 cycles/float = bronze, 0.7 = silver, 0.4 = gold!
+struct BenchmarkDelta : public TwoArray {
+  explicit BenchmarkDelta(size_t num_items) : TwoArray(num_items) {}
+
+  FuncOutput operator()(const size_t num_items) const {
+#if HWY_TARGET == HWY_SCALAR
+    b_[0] = a_[0];
+    for (size_t i = 1; i < num_items; ++i) {
+      b_[i] = a_[i] - a_[i - 1];
+    }
+#elif HWY_CAP_GE256
+    // Larger vectors are split into 128-bit blocks, easiest to use the
+    // unaligned load support to shift between them.
+    const HWY_FULL(float) df;
+    const size_t N = Lanes(df);
+    size_t i;
+    b_[0] = a_[0];
+    for (i = 1; i < N; ++i) {
+      b_[i] = a_[i] - a_[i - 1];
+    }
+    for (; i < num_items; i += N) {
+      const auto a = Load(df, &a_[i]);
+      const auto shifted = LoadU(df, &a_[i - 1]);
+      Store(a - shifted, df, &b_[i]);
+    }
+#else  // 128-bit
+    // Slightly better than unaligned loads
+    const HWY_CAPPED(float, 4) df;
+    const size_t N = Lanes(df);
+    size_t i;
+    b_[0] = a_[0];
+    for (i = 1; i < N; ++i) {
+      b_[i] = a_[i] - a_[i - 1];
+    }
+    auto prev = Load(df, &a_[0]);
+    for (; i < num_items; i += Lanes(df)) {
+      const auto a = Load(df, &a_[i]);
+      const auto shifted = CombineShiftRightLanes<3>(a, prev);
+      prev = a;
+      Store(a - shifted, df, &b_[i]);
+    }
+#endif
+    return b_[num_items - 1];
+  }
+
+  void Verify(size_t num_items) {
+    for (size_t i = 0; i < num_items; ++i) {
+      const float expected = (i == 0) ? a_[0] : a_[i] - a_[i - 1];
+      const float err = std::abs(expected - b_[i]);
+      if (err > 1E-6f) {
+        fprintf(stderr, "Delta: expected %e, actual %e\n", expected, b_[i]);
+      }
+    }
+  }
+};
+
+void RunBenchmarks() {
+  Intro();
+  printf("------------------------ %s\n", TargetName(HWY_TARGET));
+  RunBenchmark<BenchmarkDot>("dot");
+  RunBenchmark<BenchmarkDelta>("delta");
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+HWY_EXPORT(RunBenchmarks);
+
+void Run() {
+  for (uint32_t target : SupportedAndGeneratedTargets()) {
+    SetSupportedTargetsForTest(target);
+    HWY_DYNAMIC_DISPATCH(RunBenchmarks)();
+  }
+  SetSupportedTargetsForTest(0);  // Reset the mask afterwards.
+}
+
+}  // namespace hwy
+
+int main(int /*argc*/, char** /*argv*/) {
+  hwy::Run();
+  return 0;
+}
+#endif  // HWY_ONCE
--- a/third_party/highway/hwy/examples/skeleton-inl.h
+++ b/third_party/highway/hwy/examples/skeleton-inl.h
@ -0,0 +1,62 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Demo of functions that might be called from multiple SIMD modules (either
+// other -inl.h files, or a .cc file between begin/end_target-inl). This is
+// optional - all SIMD code can reside in .cc files. However, this allows
+// splitting code into different files while still inlining instead of requiring
+// calling through function pointers.
+
+// Include guard (still compiled once per target)
+#if defined(HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
+#undef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
+#else
+#define HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
+#endif
+
+// It is fine to #include normal or *-inl headers.
+#include <stddef.h>
+
+#include "hwy/highway.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace skeleton {
+namespace HWY_NAMESPACE {
+
+using namespace hwy::HWY_NAMESPACE;
+
+// Example of a type-agnostic (caller-specified lane type) and width-agnostic
+// (uses best available instruction set) function in a header.
+//
+// Computes x[i] = mul_array[i] * x_array[i] + add_array[i] for i < size.
+template <class D, typename T>
+HWY_MAYBE_UNUSED void MulAddLoop(const D d, const T* HWY_RESTRICT mul_array,
+                                 const T* HWY_RESTRICT add_array,
+                                 const size_t size, T* HWY_RESTRICT x_array) {
+  for (size_t i = 0; i < size; i += Lanes(d)) {
+    const auto mul = Load(d, mul_array + i);
+    const auto add = Load(d, add_array + i);
+    auto x = Load(d, x_array + i);
+    x = MulAdd(mul, x, add);
+    Store(x, d, x_array + i);
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace skeleton
+HWY_AFTER_NAMESPACE();
+
+#endif  // include guard
--- a/third_party/highway/hwy/examples/skeleton.cc
+++ b/third_party/highway/hwy/examples/skeleton.cc
@ -0,0 +1,103 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/examples/skeleton.h"
+
+#include <assert.h>
+#include <stdio.h>
+
+// First undef to prevent error when re-included.
+#undef HWY_TARGET_INCLUDE
+// For runtime dispatch, specify the name of the current file (unfortunately
+// __FILE__ is not reliable) so that foreach_target.h can re-include it.
+#define HWY_TARGET_INCLUDE "hwy/examples/skeleton.cc"
+// Generates code for each enabled target by re-including this source file.
+#include "hwy/foreach_target.h"
+
+#include "hwy/highway.h"
+
+// Optional, can instead add HWY_ATTR to all functions.
+HWY_BEFORE_NAMESPACE();
+namespace skeleton {
+namespace HWY_NAMESPACE {
+
+// Highway ops reside here; ADL does not find templates nor builtins.
+using namespace hwy::HWY_NAMESPACE;
+
+// Computes log2 by converting to a vector of floats. Compiled once per target.
+template <class DF>
+HWY_NOINLINE void OneFloorLog2(const DF df, const uint8_t* HWY_RESTRICT values,
+                               uint8_t* HWY_RESTRICT log2) {
+  // Type tags for converting to other element types (Rebind = same count).
+  const Rebind<int32_t, DF> d32;
+  const Rebind<uint8_t, DF> d8;
+
+  const auto u8 = Load(d8, values);
+  const auto bits = BitCast(d32, ConvertTo(df, PromoteTo(d32, u8)));
+  const auto exponent = ShiftRight<23>(bits) - Set(d32, 127);
+  Store(DemoteTo(d8, exponent), d8, log2);
+}
+
+HWY_NOINLINE void CodepathDemo() {
+  // Highway defaults to portability, but per-target codepaths may be selected
+  // via #if HWY_TARGET == HWY_SSE4 or by testing capability macros:
+#if HWY_CAP_INTEGER64
+  const char* gather = "Has int64";
+#else
+  const char* gather = "No int64";
+#endif
+  printf("Target %s: %s\n", hwy::TargetName(HWY_TARGET), gather);
+}
+
+HWY_NOINLINE void FloorLog2(const uint8_t* HWY_RESTRICT values, size_t count,
+                            uint8_t* HWY_RESTRICT log2) {
+  CodepathDemo();
+
+  // Second argument is necessary on RVV until it supports fractional lengths.
+  HWY_FULL(float, 4) df;
+
+  // Caller padded memory to a multiple of Lanes(). If that is not possible,
+  // we could use blended or masked stores, see README.md.
+  const size_t N = Lanes(df);
+  for (size_t i = 0; i < count; i += N) {
+    OneFloorLog2(df, values + i, log2 + i);
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace skeleton
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace skeleton {
+
+// This macro declares a static array used for dynamic dispatch; it resides in
+// the same outer namespace that contains FloorLog2.
+HWY_EXPORT(FloorLog2);
+
+// This function is optional and only needed in the case of exposing it in the
+// header file. Otherwise using HWY_DYNAMIC_DISPATCH(FloorLog2) in this module
+// is equivalent to inlining this function.
+void CallFloorLog2(const uint8_t* HWY_RESTRICT in, const size_t count,
+                   uint8_t* HWY_RESTRICT out) {
+  return HWY_DYNAMIC_DISPATCH(FloorLog2)(in, count, out);
+}
+
+// Optional: anything to compile only once, e.g. non-SIMD implementations of
+// public functions provided by this module, can go inside #if HWY_ONCE.
+
+}  // namespace skeleton
+#endif  // HWY_ONCE
--- a/third_party/highway/hwy/examples/skeleton.h
+++ b/third_party/highway/hwy/examples/skeleton.h
@ -0,0 +1,35 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Demo interface to target-specific code in skeleton.cc
+
+// Normal header with include guard and namespace.
+#ifndef HIGHWAY_HWY_EXAMPLES_SKELETON_H_
+#define HIGHWAY_HWY_EXAMPLES_SKELETON_H_
+
+#include <stddef.h>
+
+// Platform-specific definitions used for declaring an interface, independent of
+// the SIMD instruction set.
+#include "hwy/base.h"  // HWY_RESTRICT
+
+namespace skeleton {
+
+// Computes base-2 logarithm by converting to float. Supports dynamic dispatch.
+void CallFloorLog2(const uint8_t* HWY_RESTRICT in, const size_t count,
+                   uint8_t* HWY_RESTRICT out);
+
+}  // namespace skeleton
+
+#endif  // HIGHWAY_HWY_EXAMPLES_SKELETON_H_
--- a/third_party/highway/hwy/examples/skeleton_test.cc
+++ b/third_party/highway/hwy/examples/skeleton_test.cc
@ -0,0 +1,104 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Example of unit test for the "skeleton" library.
+
+#include "hwy/examples/skeleton.h"
+
+#include <stdio.h>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "examples/skeleton_test.cc"
+#include "hwy/foreach_target.h"
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+// Optional: factor out parts of the implementation into *-inl.h
+#include "hwy/examples/skeleton-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace skeleton {
+namespace HWY_NAMESPACE {
+
+using namespace hwy::HWY_NAMESPACE;
+
+// Calls function defined in skeleton.cc.
+struct TestFloorLog2 {
+  template <class T, class DF>
+  HWY_NOINLINE void operator()(T /*unused*/, DF df) {
+    const size_t count = 5 * Lanes(df);
+    auto in = hwy::AllocateAligned<uint8_t>(count);
+    auto expected = hwy::AllocateAligned<uint8_t>(count);
+
+    hwy::RandomState rng;
+    for (size_t i = 0; i < count; ++i) {
+      expected[i] = Random32(&rng) & 7;
+      in[i] = static_cast<uint8_t>(1u << expected[i]);
+    }
+    auto out = hwy::AllocateAligned<uint8_t>(count);
+    CallFloorLog2(in.get(), count, out.get());
+    int sum = 0;
+    for (size_t i = 0; i < count; ++i) {
+      HWY_ASSERT_EQ(expected[i], out[i]);
+      sum += out[i];
+    }
+    hwy::PreventElision(sum);
+  }
+};
+
+HWY_NOINLINE void TestAllFloorLog2() {
+  ForFullVectors<TestFloorLog2>()(float());
+}
+
+// Calls function defined in skeleton-inl.h.
+struct TestSumMulAdd {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    hwy::RandomState rng;
+    const size_t count = 4096;
+    EXPECT_TRUE(count % Lanes(d) == 0);
+    auto mul = hwy::AllocateAligned<T>(count);
+    auto x = hwy::AllocateAligned<T>(count);
+    auto add = hwy::AllocateAligned<T>(count);
+    for (size_t i = 0; i < count; ++i) {
+      mul[i] = static_cast<T>(Random32(&rng) & 0xF);
+      x[i] = static_cast<T>(Random32(&rng) & 0xFF);
+      add[i] = static_cast<T>(Random32(&rng) & 0xFF);
+    }
+    double expected_sum = 0.0;
+    for (size_t i = 0; i < count; ++i) {
+      expected_sum += mul[i] * x[i] + add[i];
+    }
+
+    MulAddLoop(d, mul.get(), add.get(), count, x.get());
+    HWY_ASSERT_EQ(4344240.0, expected_sum);
+  }
+};
+
+HWY_NOINLINE void TestAllSumMulAdd() {
+  ForFloatTypes(ForPartialVectors<TestSumMulAdd>());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace skeleton
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace skeleton {
+HWY_BEFORE_TEST(SkeletonTest);
+HWY_EXPORT_AND_TEST_P(SkeletonTest, TestAllFloorLog2);
+HWY_EXPORT_AND_TEST_P(SkeletonTest, TestAllSumMulAdd);
+}  // namespace skeleton
+#endif
--- a/third_party/highway/hwy/foreach_target.h
+++ b/third_party/highway/hwy/foreach_target.h
@ -0,0 +1,161 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAY_HWY_FOREACH_TARGET_H_
+#define HIGHWAY_HWY_FOREACH_TARGET_H_
+
+// Re-includes the translation unit zero or more times to compile for any
+// targets except HWY_STATIC_TARGET. Defines unique HWY_TARGET each time so that
+// highway.h defines the corresponding macro/namespace.
+
+#include "hwy/targets.h"
+
+// *_inl.h may include other headers, which requires include guards to prevent
+// repeated inclusion. The guards must be reset after compiling each target, so
+// the header is again visible. This is done by flipping HWY_TARGET_TOGGLE,
+// defining it if undefined and vice versa. This macro is initially undefined
+// so that IDEs don't gray out the contents of each header.
+#ifdef HWY_TARGET_TOGGLE
+#error "This macro must not be defined outside foreach_target.h"
+#endif
+
+#ifdef HWY_HIGHWAY_INCLUDED  // highway.h include guard
+// Trigger fixup at the bottom of this header.
+#define HWY_ALREADY_INCLUDED
+
+// The next highway.h must re-include set_macros-inl.h because the first
+// highway.h chose the static target instead of what we will set below.
+#undef HWY_SET_MACROS_PER_TARGET
+#endif
+
+// Disable HWY_EXPORT in user code until we have generated all targets. Note
+// that a subsequent highway.h will not override this definition.
+#undef HWY_ONCE
+#define HWY_ONCE (0 || HWY_IDE)
+
+// Avoid warnings on #include HWY_TARGET_INCLUDE by hiding them from the IDE;
+// also skip if only 1 target defined (no re-inclusion will be necessary).
+#if !HWY_IDE && (HWY_TARGETS != HWY_STATIC_TARGET)
+
+#if !defined(HWY_TARGET_INCLUDE)
+#error ">1 target enabled => define HWY_TARGET_INCLUDE before foreach_target.h"
+#endif
+
+#if (HWY_TARGETS & HWY_SCALAR) && (HWY_STATIC_TARGET != HWY_SCALAR)
+#undef HWY_TARGET
+#define HWY_TARGET HWY_SCALAR
+#include HWY_TARGET_INCLUDE
+#ifdef HWY_TARGET_TOGGLE
+#undef HWY_TARGET_TOGGLE
+#else
+#define HWY_TARGET_TOGGLE
+#endif
+#endif
+
+#if (HWY_TARGETS & HWY_NEON) && (HWY_STATIC_TARGET != HWY_NEON)
+#undef HWY_TARGET
+#define HWY_TARGET HWY_NEON
+#include HWY_TARGET_INCLUDE
+#ifdef HWY_TARGET_TOGGLE
+#undef HWY_TARGET_TOGGLE
+#else
+#define HWY_TARGET_TOGGLE
+#endif
+#endif
+
+#if (HWY_TARGETS & HWY_SSE4) && (HWY_STATIC_TARGET != HWY_SSE4)
+#undef HWY_TARGET
+#define HWY_TARGET HWY_SSE4
+#include HWY_TARGET_INCLUDE
+#ifdef HWY_TARGET_TOGGLE
+#undef HWY_TARGET_TOGGLE
+#else
+#define HWY_TARGET_TOGGLE
+#endif
+#endif
+
+#if (HWY_TARGETS & HWY_AVX2) && (HWY_STATIC_TARGET != HWY_AVX2)
+#undef HWY_TARGET
+#define HWY_TARGET HWY_AVX2
+#include HWY_TARGET_INCLUDE
+#ifdef HWY_TARGET_TOGGLE
+#undef HWY_TARGET_TOGGLE
+#else
+#define HWY_TARGET_TOGGLE
+#endif
+#endif
+
+#if (HWY_TARGETS & HWY_AVX3) && (HWY_STATIC_TARGET != HWY_AVX3)
+#undef HWY_TARGET
+#define HWY_TARGET HWY_AVX3
+#include HWY_TARGET_INCLUDE
+#ifdef HWY_TARGET_TOGGLE
+#undef HWY_TARGET_TOGGLE
+#else
+#define HWY_TARGET_TOGGLE
+#endif
+#endif
+
+#if (HWY_TARGETS & HWY_WASM) && (HWY_STATIC_TARGET != HWY_WASM)
+#undef HWY_TARGET
+#define HWY_TARGET HWY_WASM
+#include HWY_TARGET_INCLUDE
+#ifdef HWY_TARGET_TOGGLE
+#undef HWY_TARGET_TOGGLE
+#else
+#define HWY_TARGET_TOGGLE
+#endif
+#endif
+
+#if (HWY_TARGETS & HWY_PPC8) && (HWY_STATIC_TARGET != HWY_PPC8)
+#undef HWY_TARGET
+#define HWY_TARGET HWY_PPC8
+#include HWY_TARGET_INCLUDE
+#ifdef HWY_TARGET_TOGGLE
+#undef HWY_TARGET_TOGGLE
+#else
+#define HWY_TARGET_TOGGLE
+#endif
+#endif
+
+#endif  // !HWY_IDE && (HWY_TARGETS != HWY_STATIC_TARGET)
+
+// Now that all but the static target have been generated, re-enable HWY_EXPORT.
+#undef HWY_ONCE
+#define HWY_ONCE 1
+
+// If we re-include once per enabled target, the translation unit's
+// implementation would have to be skipped via #if to avoid redefining symbols.
+// We instead skip the re-include for HWY_STATIC_TARGET, and generate its
+// implementation when resuming compilation of the translation unit.
+#undef HWY_TARGET
+#define HWY_TARGET HWY_STATIC_TARGET
+
+#ifdef HWY_ALREADY_INCLUDED
+// Revert the previous toggle to prevent redefinitions for the static target.
+#ifdef HWY_TARGET_TOGGLE
+#undef HWY_TARGET_TOGGLE
+#else
+#define HWY_TARGET_TOGGLE
+#endif
+
+// Force re-inclusion of set_macros-inl.h now that HWY_TARGET is restored.
+#ifdef HWY_SET_MACROS_PER_TARGET
+#undef HWY_SET_MACROS_PER_TARGET
+#else
+#define HWY_SET_MACROS_PER_TARGET
+#endif
+#endif
+
+#endif  // HIGHWAY_HWY_FOREACH_TARGET_H_
--- a/third_party/highway/hwy/highway.h
+++ b/third_party/highway/hwy/highway.h
@ -0,0 +1,337 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This include guard is checked by foreach_target, so avoid the usual _H_
+// suffix to prevent copybara from renaming it. NOTE: ops/*-inl.h are included
+// after/outside this include guard.
+#ifndef HWY_HIGHWAY_INCLUDED
+#define HWY_HIGHWAY_INCLUDED
+
+// Main header required before using vector types.
+
+#include "hwy/base.h"
+#include "hwy/targets.h"
+
+namespace hwy {
+
+// API version (https://semver.org/)
+#define HWY_MAJOR 0
+#define HWY_MINOR 12
+#define HWY_PATCH 0
+
+//------------------------------------------------------------------------------
+// Shorthand for descriptors (defined in shared-inl.h) used to select overloads.
+
+// Because Highway functions take descriptor and/or vector arguments, ADL finds
+// these functions without requiring users in project::HWY_NAMESPACE to
+// qualify Highway functions with hwy::HWY_NAMESPACE. However, ADL rules for
+// templates require `using hwy::HWY_NAMESPACE::ShiftLeft;` etc. declarations.
+
+// HWY_FULL(T[,LMUL=1]) is a native vector/group. LMUL is the number of
+// registers in the group, and is ignored on targets that do not support groups.
+#define HWY_FULL1(T) hwy::HWY_NAMESPACE::Simd<T, HWY_LANES(T)>
+#define HWY_3TH_ARG(arg1, arg2, arg3, ...) arg3
+// Workaround for MSVC grouping __VA_ARGS__ into a single argument
+#define HWY_FULL_RECOMPOSER(args_with_paren) HWY_3TH_ARG args_with_paren
+// Trailing comma avoids -pedantic false alarm
+#define HWY_CHOOSE_FULL(...) \
+  HWY_FULL_RECOMPOSER((__VA_ARGS__, HWY_FULL2, HWY_FULL1, ))
+#define HWY_FULL(...) HWY_CHOOSE_FULL(__VA_ARGS__())(__VA_ARGS__)
+
+// Vector of up to MAX_N lanes.
+#define HWY_CAPPED(T, MAX_N) \
+  hwy::HWY_NAMESPACE::Simd<T, HWY_MIN(MAX_N, HWY_LANES(T))>
+
+//------------------------------------------------------------------------------
+// Export user functions for static/dynamic dispatch
+
+// Evaluates to 0 inside a translation unit if it is generating anything but the
+// static target (the last one if multiple targets are enabled). Used to prevent
+// redefinitions of HWY_EXPORT. Unless foreach_target.h is included, we only
+// compile once anyway, so this is 1 unless it is or has been included.
+#ifndef HWY_ONCE
+#define HWY_ONCE 1
+#endif
+
+// HWY_STATIC_DISPATCH(FUNC_NAME) is the namespace-qualified FUNC_NAME for
+// HWY_STATIC_TARGET (the only defined namespace unless HWY_TARGET_INCLUDE is
+// defined), and can be used to deduce the return type of Choose*.
+#if HWY_STATIC_TARGET == HWY_SCALAR
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SCALAR::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_RVV
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_RVV::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_WASM
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_NEON
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_NEON::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_PPC8
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC8::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_SSE4
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSE4::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_AVX2
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX2::FUNC_NAME
+#elif HWY_STATIC_TARGET == HWY_AVX3
+#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3::FUNC_NAME
+#endif
+
+// Dynamic dispatch declarations.
+
+template <typename RetType, typename... Args>
+struct FunctionCache {
+ public:
+  typedef RetType(FunctionType)(Args...);
+
+  // A template function that when instantiated has the same signature as the
+  // function being called. This function initializes the global cache of the
+  // current supported targets mask used for dynamic dispatch and calls the
+  // appropriate function. Since this mask used for dynamic dispatch is a
+  // global cache, all the highway exported functions, even those exposed by
+  // different modules, will be initialized after this function runs for any one
+  // of those exported functions.
+  template <FunctionType* const table[]>
+  static RetType ChooseAndCall(Args... args) {
+    // If we are running here it means we need to update the chosen target.
+    chosen_target.Update();
+    return (table[chosen_target.GetIndex()])(args...);
+  }
+};
+
+// Factory function only used to infer the template parameters RetType and Args
+// from a function passed to the factory.
+template <typename RetType, typename... Args>
+FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
+  return FunctionCache<RetType, Args...>();
+}
+
+// HWY_CHOOSE_*(FUNC_NAME) expands to the function pointer for that target or
+// nullptr is that target was not compiled.
+#if HWY_TARGETS & HWY_SCALAR
+#define HWY_CHOOSE_SCALAR(FUNC_NAME) &N_SCALAR::FUNC_NAME
+#else
+// When scalar is not present and we try to use scalar because other targets
+// were disabled at runtime we fall back to the baseline with
+// HWY_STATIC_DISPATCH()
+#define HWY_CHOOSE_SCALAR(FUNC_NAME) &HWY_STATIC_DISPATCH(FUNC_NAME)
+#endif
+
+#if HWY_TARGETS & HWY_WASM
+#define HWY_CHOOSE_WASM(FUNC_NAME) &N_WASM::FUNC_NAME
+#else
+#define HWY_CHOOSE_WASM(FUNC_NAME) nullptr
+#endif
+
+#if HWY_TARGETS & HWY_RVV
+#define HWY_CHOOSE_RVV(FUNC_NAME) &N_RVV::FUNC_NAME
+#else
+#define HWY_CHOOSE_RVV(FUNC_NAME) nullptr
+#endif
+
+#if HWY_TARGETS & HWY_NEON
+#define HWY_CHOOSE_NEON(FUNC_NAME) &N_NEON::FUNC_NAME
+#else
+#define HWY_CHOOSE_NEON(FUNC_NAME) nullptr
+#endif
+
+#if HWY_TARGETS & HWY_PPC8
+#define HWY_CHOOSE_PCC8(FUNC_NAME) &N_PPC8::FUNC_NAME
+#else
+#define HWY_CHOOSE_PPC8(FUNC_NAME) nullptr
+#endif
+
+#if HWY_TARGETS & HWY_SSE4
+#define HWY_CHOOSE_SSE4(FUNC_NAME) &N_SSE4::FUNC_NAME
+#else
+#define HWY_CHOOSE_SSE4(FUNC_NAME) nullptr
+#endif
+
+#if HWY_TARGETS & HWY_AVX2
+#define HWY_CHOOSE_AVX2(FUNC_NAME) &N_AVX2::FUNC_NAME
+#else
+#define HWY_CHOOSE_AVX2(FUNC_NAME) nullptr
+#endif
+
+#if HWY_TARGETS & HWY_AVX3
+#define HWY_CHOOSE_AVX3(FUNC_NAME) &N_AVX3::FUNC_NAME
+#else
+#define HWY_CHOOSE_AVX3(FUNC_NAME) nullptr
+#endif
+
+#define HWY_DISPATCH_TABLE(FUNC_NAME) \
+  HWY_CONCAT(FUNC_NAME, HighwayDispatchTable)
+
+// HWY_EXPORT(FUNC_NAME); expands to a static array that is used by
+// HWY_DYNAMIC_DISPATCH() to call the appropriate function at runtime. This
+// static array must be defined at the same namespace level as the function
+// it is exporting.
+// After being exported, it can be called from other parts of the same source
+// file using HWY_DYNAMIC_DISTPATCH(), in particular from a function wrapper
+// like in the following example:
+//
+//   #include "hwy/highway.h"
+//   HWY_BEFORE_NAMESPACE();
+//   namespace skeleton {
+//   namespace HWY_NAMESPACE {
+//
+//   void MyFunction(int a, char b, const char* c) { ... }
+//
+//   // NOLINTNEXTLINE(google-readability-namespace-comments)
+//   }  // namespace HWY_NAMESPACE
+//   }  // namespace skeleton
+//   HWY_AFTER_NAMESPACE();
+//
+//   namespace skeleton {
+//   HWY_EXPORT(MyFunction);  // Defines the dispatch table in this scope.
+//
+//   void MyFunction(int a, char b, const char* c) {
+//     return HWY_DYNAMIC_DISPATCH(MyFunction)(a, b, c);
+//   }
+//   }  // namespace skeleton
+//
+
+#if HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0)
+
+// Simplified version for IDE or the dynamic dispatch case with only one target.
+// This case still uses a table, although of a single element, to provide the
+// same compile error conditions as with the dynamic dispatch case when multiple
+// targets are being compiled.
+#define HWY_EXPORT(FUNC_NAME)                                       \
+  HWY_MAYBE_UNUSED static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) \
+      const HWY_DISPATCH_TABLE(FUNC_NAME)[1] = {                    \
+          &HWY_STATIC_DISPATCH(FUNC_NAME)}
+#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) HWY_STATIC_DISPATCH(FUNC_NAME)
+
+#else
+
+// Dynamic dispatch case with one entry per dynamic target plus the scalar
+// mode and the initialization wrapper.
+#define HWY_EXPORT(FUNC_NAME)                                              \
+  static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME))                         \
+      const HWY_DISPATCH_TABLE(FUNC_NAME)[HWY_MAX_DYNAMIC_TARGETS + 2] = { \
+          /* The first entry in the table initializes the global cache and \
+           * calls the appropriate function. */                            \
+          &decltype(hwy::FunctionCacheFactory(&HWY_STATIC_DISPATCH(        \
+              FUNC_NAME)))::ChooseAndCall<HWY_DISPATCH_TABLE(FUNC_NAME)>,  \
+          HWY_CHOOSE_TARGET_LIST(FUNC_NAME),                               \
+          HWY_CHOOSE_SCALAR(FUNC_NAME),                                    \
+  }
+#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) \
+  (*(HWY_DISPATCH_TABLE(FUNC_NAME)[hwy::chosen_target.GetIndex()]))
+
+#endif  // HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0)
+
+}  // namespace hwy
+
+#endif  // HWY_HIGHWAY_INCLUDED
+
+//------------------------------------------------------------------------------
+
+// NOTE: the following definitions and ops/*.h depend on HWY_TARGET, so we want
+// to include them once per target, which is ensured by the toggle check.
+// Because ops/*.h are included under it, they do not need their own guard.
+#if defined(HWY_HIGHWAY_PER_TARGET) == defined(HWY_TARGET_TOGGLE)
+#ifdef HWY_HIGHWAY_PER_TARGET
+#undef HWY_HIGHWAY_PER_TARGET
+#else
+#define HWY_HIGHWAY_PER_TARGET
+#endif
+
+#undef HWY_FULL2
+#if HWY_TARGET == HWY_RVV
+#define HWY_FULL2(T, LMUL) hwy::HWY_NAMESPACE::Simd<T, HWY_LANES(T) * (LMUL)>
+#else
+#define HWY_FULL2(T, LMUL) hwy::HWY_NAMESPACE::Simd<T, HWY_LANES(T)>
+#endif
+
+// These define ops inside namespace hwy::HWY_NAMESPACE.
+#if HWY_TARGET == HWY_SSE4
+#include "hwy/ops/x86_128-inl.h"
+#elif HWY_TARGET == HWY_AVX2
+#include "hwy/ops/x86_256-inl.h"
+#elif HWY_TARGET == HWY_AVX3
+#include "hwy/ops/x86_512-inl.h"
+#elif HWY_TARGET == HWY_PPC8
+#elif HWY_TARGET == HWY_NEON
+#include "hwy/ops/arm_neon-inl.h"
+#elif HWY_TARGET == HWY_WASM
+#include "hwy/ops/wasm_128-inl.h"
+#elif HWY_TARGET == HWY_RVV
+#include "hwy/ops/rvv-inl.h"
+#elif HWY_TARGET == HWY_SCALAR
+#include "hwy/ops/scalar-inl.h"
+#else
+#pragma message("HWY_TARGET does not match any known target")
+#endif  // HWY_TARGET
+
+// Commonly used functions/types that must come after ops are defined.
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// The lane type of a vector type, e.g. float for Vec<Simd<float, 4>>.
+template <class V>
+using LaneType = decltype(GetLane(V()));
+
+// Vector type, e.g. Vec128<float> for Simd<float, 4>. Useful as the return type
+// of functions that do not take a vector argument, or as an argument type if
+// the function only has a template argument for D, or for explicit type names
+// instead of auto. This may be a built-in type.
+template <class D>
+using Vec = decltype(Zero(D()));
+
+// Mask type. Useful as the return type of functions that do not take a mask
+// argument, or as an argument type if the function only has a template argument
+// for D, or for explicit type names instead of auto.
+template <class D>
+using Mask = decltype(MaskFromVec(Zero(D())));
+
+// Returns the closest value to v within [lo, hi].
+template <class V>
+HWY_API V Clamp(const V v, const V lo, const V hi) {
+  return Min(Max(lo, v), hi);
+}
+
+// CombineShiftRightBytes (and ..Lanes) are not available for the scalar target.
+// TODO(janwas): implement for RVV
+#if HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_RVV
+
+template <size_t kLanes, class V>
+HWY_API V CombineShiftRightLanes(const V hi, const V lo) {
+  return CombineShiftRightBytes<kLanes * sizeof(LaneType<V>)>(hi, lo);
+}
+
+#endif
+
+// Returns lanes with the most significant bit set and all other bits zero.
+template <class D>
+HWY_API Vec<D> SignBit(D d) {
+  using Unsigned = MakeUnsigned<TFromD<D>>;
+  const Unsigned bit = Unsigned(1) << (sizeof(Unsigned) * 8 - 1);
+  return BitCast(d, Set(Rebind<Unsigned, D>(), bit));
+}
+
+// Returns quiet NaN.
+template <class D>
+HWY_API Vec<D> NaN(D d) {
+  const RebindToSigned<D> di;
+  // LimitsMax sets all exponent and mantissa bits to 1. The exponent plus
+  // mantissa MSB (to indicate quiet) would be sufficient.
+  return BitCast(d, Set(di, LimitsMax<TFromD<decltype(di)>>()));
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#endif  // HWY_HIGHWAY_PER_TARGET
--- a/third_party/highway/hwy/highway_test.cc
+++ b/third_party/highway/hwy/highway_test.cc
@ -0,0 +1,313 @@
+// Copyright 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "highway_test.cc"
+#include "hwy/foreach_target.h"
+#include "hwy/highway.h"
+#include "hwy/nanobenchmark.h"  // Unpredictable1
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestSet {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    // Zero
+    const auto v0 = Zero(d);
+    const size_t N = Lanes(d);
+    auto expected = AllocateAligned<T>(N);
+    std::fill(expected.get(), expected.get() + N, T(0));
+    HWY_ASSERT_VEC_EQ(d, expected.get(), v0);
+
+    // Set
+    const auto v2 = Set(d, T(2));
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = 2;
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), v2);
+
+    // Iota
+    const auto vi = Iota(d, T(5));
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = T(5 + i);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), vi);
+
+    // Undefined
+    const auto vu = Undefined(d);
+    Store(vu, d, expected.get());
+  }
+};
+
+HWY_NOINLINE void TestAllSet() { ForAllTypes(ForPartialVectors<TestSet>()); }
+
+// Ensures wraparound (mod 2^bits)
+struct TestOverflow {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v1 = Set(d, T(1));
+    const auto vmax = Set(d, LimitsMax<T>());
+    const auto vmin = Set(d, LimitsMin<T>());
+    // Unsigned underflow / negative -> positive
+    HWY_ASSERT_VEC_EQ(d, vmax, vmin - v1);
+    // Unsigned overflow / positive -> negative
+    HWY_ASSERT_VEC_EQ(d, vmin, vmax + v1);
+  }
+};
+
+HWY_NOINLINE void TestAllOverflow() {
+  ForIntegerTypes(ForPartialVectors<TestOverflow>());
+}
+
+struct TestSignBitInteger {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v0 = Zero(d);
+    const auto all = VecFromMask(d, Eq(v0, v0));
+    const auto vs = SignBit(d);
+    const auto other = Sub(vs, Set(d, 1));
+
+    // Shifting left by one => overflow, equal zero
+    HWY_ASSERT_VEC_EQ(d, v0, Add(vs, vs));
+    // Verify the lower bits are zero (only +/- and logical ops are available
+    // for all types)
+    HWY_ASSERT_VEC_EQ(d, all, Add(vs, other));
+  }
+};
+
+struct TestSignBitFloat {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v0 = Zero(d);
+    const auto vs = SignBit(d);
+    const auto vp = Set(d, 2.25);
+    const auto vn = Set(d, -2.25);
+    HWY_ASSERT_VEC_EQ(d, Or(vp, vs), vn);
+    HWY_ASSERT_VEC_EQ(d, AndNot(vs, vn), vp);
+    HWY_ASSERT_VEC_EQ(d, v0, vs);
+  }
+};
+
+HWY_NOINLINE void TestAllSignBit() {
+  ForIntegerTypes(ForPartialVectors<TestSignBitInteger>());
+  ForFloatTypes(ForPartialVectors<TestSignBitFloat>());
+}
+
+// std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY.
+template <typename TF>
+bool IsNaN(TF f) {
+  MakeUnsigned<TF> bits;
+  memcpy(&bits, &f, sizeof(TF));
+  bits += bits;
+  bits >>= 1;  // clear sign bit
+  // NaN if all exponent bits are set and the mantissa is not zero.
+  return bits > ExponentMask<decltype(bits)>();
+}
+
+template <class D, class V>
+HWY_NOINLINE void AssertNaN(const D d, const V v, const char* file, int line) {
+  using T = TFromD<D>;
+  const T lane = GetLane(v);
+  if (!IsNaN(lane)) {
+    const std::string type_name = TypeName(T(), Lanes(d));
+    MakeUnsigned<T> bits;
+    memcpy(&bits, &lane, sizeof(T));
+    // RVV lacks PRIu64, so use size_t; double will be truncated on 32-bit.
+    Abort(file, line, "Expected %s NaN, got %E (%zu)", type_name.c_str(), lane,
+          size_t(bits));
+  }
+}
+
+#define HWY_ASSERT_NAN(d, v) AssertNaN(d, v, __FILE__, __LINE__)
+
+struct TestNaN {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v1 = Set(d, T(Unpredictable1()));
+    const auto nan = IfThenElse(Eq(v1, Set(d, T(1))), NaN(d), v1);
+    HWY_ASSERT_NAN(d, nan);
+
+    // Arithmetic
+    HWY_ASSERT_NAN(d, Add(nan, v1));
+    HWY_ASSERT_NAN(d, Add(v1, nan));
+    HWY_ASSERT_NAN(d, Sub(nan, v1));
+    HWY_ASSERT_NAN(d, Sub(v1, nan));
+    HWY_ASSERT_NAN(d, Mul(nan, v1));
+    HWY_ASSERT_NAN(d, Mul(v1, nan));
+    HWY_ASSERT_NAN(d, Div(nan, v1));
+    HWY_ASSERT_NAN(d, Div(v1, nan));
+
+    // FMA
+    HWY_ASSERT_NAN(d, MulAdd(nan, v1, v1));
+    HWY_ASSERT_NAN(d, MulAdd(v1, nan, v1));
+    HWY_ASSERT_NAN(d, MulAdd(v1, v1, nan));
+    HWY_ASSERT_NAN(d, MulSub(nan, v1, v1));
+    HWY_ASSERT_NAN(d, MulSub(v1, nan, v1));
+    HWY_ASSERT_NAN(d, MulSub(v1, v1, nan));
+    HWY_ASSERT_NAN(d, NegMulAdd(nan, v1, v1));
+    HWY_ASSERT_NAN(d, NegMulAdd(v1, nan, v1));
+    HWY_ASSERT_NAN(d, NegMulAdd(v1, v1, nan));
+    HWY_ASSERT_NAN(d, NegMulSub(nan, v1, v1));
+    HWY_ASSERT_NAN(d, NegMulSub(v1, nan, v1));
+    HWY_ASSERT_NAN(d, NegMulSub(v1, v1, nan));
+
+    // Rcp/Sqrt
+    HWY_ASSERT_NAN(d, Sqrt(nan));
+
+    // Sign manipulation
+    HWY_ASSERT_NAN(d, Abs(nan));
+    HWY_ASSERT_NAN(d, Neg(nan));
+    HWY_ASSERT_NAN(d, CopySign(nan, v1));
+    HWY_ASSERT_NAN(d, CopySignToAbs(nan, v1));
+
+    // Rounding
+    HWY_ASSERT_NAN(d, Ceil(nan));
+    HWY_ASSERT_NAN(d, Floor(nan));
+    HWY_ASSERT_NAN(d, Round(nan));
+    HWY_ASSERT_NAN(d, Trunc(nan));
+
+    // Logical (And/AndNot/Xor will clear NaN!)
+    HWY_ASSERT_NAN(d, Or(nan, v1));
+
+    // Comparison
+    HWY_ASSERT(AllFalse(Eq(nan, v1)));
+    HWY_ASSERT(AllFalse(Gt(nan, v1)));
+    HWY_ASSERT(AllFalse(Lt(nan, v1)));
+    HWY_ASSERT(AllFalse(Ge(nan, v1)));
+    HWY_ASSERT(AllFalse(Le(nan, v1)));
+  }
+};
+
+// For functions only available for float32
+struct TestF32NaN {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v1 = Set(d, T(Unpredictable1()));
+    const auto nan = IfThenElse(Eq(v1, Set(d, T(1))), NaN(d), v1);
+    HWY_ASSERT_NAN(d, ApproximateReciprocal(nan));
+    HWY_ASSERT_NAN(d, ApproximateReciprocalSqrt(nan));
+    HWY_ASSERT_NAN(d, AbsDiff(nan, v1));
+    HWY_ASSERT_NAN(d, AbsDiff(v1, nan));
+  }
+};
+
+// TODO(janwas): move to TestNaN once supported for partial vectors
+struct TestFullNaN {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v1 = Set(d, T(Unpredictable1()));
+    const auto nan = IfThenElse(Eq(v1, Set(d, T(1))), NaN(d), v1);
+
+    HWY_ASSERT_NAN(d, SumOfLanes(nan));
+// Reduction (pending clarification on RVV)
+#if HWY_TARGET != HWY_RVV
+    HWY_ASSERT_NAN(d, MinOfLanes(nan));
+    HWY_ASSERT_NAN(d, MaxOfLanes(nan));
+#endif
+
+#if HWY_ARCH_X86 && HWY_TARGET != HWY_SCALAR
+    // x86 SIMD returns the second operand if any input is NaN.
+    HWY_ASSERT_VEC_EQ(d, v1, Min(nan, v1));
+    HWY_ASSERT_VEC_EQ(d, v1, Max(nan, v1));
+    HWY_ASSERT_NAN(d, Min(v1, nan));
+    HWY_ASSERT_NAN(d, Max(v1, nan));
+#elif HWY_ARCH_WASM
+    // Should return NaN if any input is NaN, but does not for scalar.
+    // TODO(janwas): remove once this is fixed.
+#elif HWY_TARGET == HWY_NEON && HWY_ARCH_ARM_V7
+    // ARMv7 NEON returns NaN if any input is NaN.
+    HWY_ASSERT_NAN(d, Min(v1, nan));
+    HWY_ASSERT_NAN(d, Max(v1, nan));
+    HWY_ASSERT_NAN(d, Min(nan, v1));
+    HWY_ASSERT_NAN(d, Max(nan, v1));
+#else
+    // IEEE 754-2019 minimumNumber is defined as the other argument if exactly
+    // one is NaN, and qNaN if both are.
+    HWY_ASSERT_VEC_EQ(d, v1, Min(nan, v1));
+    HWY_ASSERT_VEC_EQ(d, v1, Max(nan, v1));
+    HWY_ASSERT_VEC_EQ(d, v1, Min(v1, nan));
+    HWY_ASSERT_VEC_EQ(d, v1, Max(v1, nan));
+#endif
+    HWY_ASSERT_NAN(d, Min(nan, nan));
+    HWY_ASSERT_NAN(d, Max(nan, nan));
+
+    // Comparison
+    HWY_ASSERT(AllFalse(Eq(nan, v1)));
+    HWY_ASSERT(AllFalse(Gt(nan, v1)));
+    HWY_ASSERT(AllFalse(Lt(nan, v1)));
+    HWY_ASSERT(AllFalse(Ge(nan, v1)));
+    HWY_ASSERT(AllFalse(Le(nan, v1)));
+  }
+};
+
+HWY_NOINLINE void TestAllNaN() {
+  ForFloatTypes(ForPartialVectors<TestNaN>());
+  ForPartialVectors<TestF32NaN>()(float());
+  ForFloatTypes(ForFullVectors<TestFullNaN>());
+}
+
+struct TestCopyAndAssign {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    // copy V
+    const auto v3 = Iota(d, 3);
+    auto v3b(v3);
+    HWY_ASSERT_VEC_EQ(d, v3, v3b);
+
+    // assign V
+    auto v3c = Undefined(d);
+    v3c = v3;
+    HWY_ASSERT_VEC_EQ(d, v3, v3c);
+  }
+};
+
+HWY_NOINLINE void TestAllCopyAndAssign() {
+  ForAllTypes(ForPartialVectors<TestCopyAndAssign>());
+}
+
+struct TestGetLane {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    HWY_ASSERT_EQ(T(0), GetLane(Zero(d)));
+    HWY_ASSERT_EQ(T(1), GetLane(Set(d, 1)));
+  }
+};
+
+HWY_NOINLINE void TestAllGetLane() {
+  ForAllTypes(ForPartialVectors<TestGetLane>());
+}
+
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+HWY_BEFORE_TEST(HighwayTest);
+HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllSet);
+HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllOverflow);
+HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllSignBit);
+HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllNaN);
+HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllCopyAndAssign);
+HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllGetLane);
+}  // namespace hwy
+#endif
--- a/third_party/highway/hwy/nanobenchmark.cc
+++ b/third_party/highway/hwy/nanobenchmark.cc
@ -0,0 +1,722 @@
+// Copyright 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/nanobenchmark.h"
+
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>  // abort
+#include <string.h>  // memcpy
+#include <time.h>    // clock_gettime
+
+#include <algorithm>  // sort
+#include <array>
+#include <atomic>
+#include <limits>
+#include <numeric>  // iota
+#include <random>
+#include <string>
+#include <vector>
+
+#include "hwy/base.h"
+#if HWY_ARCH_PPC
+#include <sys/platform/ppc.h>  // NOLINT __ppc_get_timebase_freq
+#elif HWY_ARCH_X86
+
+#if HWY_COMPILER_MSVC
+#include <intrin.h>
+#else
+#include <cpuid.h>  // NOLINT
+#endif              // HWY_COMPILER_MSVC
+
+#endif  // HWY_ARCH_X86
+
+namespace hwy {
+namespace platform {
+namespace {
+
+#if HWY_ARCH_X86
+
+void Cpuid(const uint32_t level, const uint32_t count,
+           uint32_t* HWY_RESTRICT abcd) {
+#if HWY_COMPILER_MSVC
+  int regs[4];
+  __cpuidex(regs, level, count);
+  for (int i = 0; i < 4; ++i) {
+    abcd[i] = regs[i];
+  }
+#else
+  uint32_t a;
+  uint32_t b;
+  uint32_t c;
+  uint32_t d;
+  __cpuid_count(level, count, a, b, c, d);
+  abcd[0] = a;
+  abcd[1] = b;
+  abcd[2] = c;
+  abcd[3] = d;
+#endif
+}
+
+std::string BrandString() {
+  char brand_string[49];
+  std::array<uint32_t, 4> abcd;
+
+  // Check if brand string is supported (it is on all reasonable Intel/AMD)
+  Cpuid(0x80000000U, 0, abcd.data());
+  if (abcd[0] < 0x80000004U) {
+    return std::string();
+  }
+
+  for (size_t i = 0; i < 3; ++i) {
+    Cpuid(static_cast<uint32_t>(0x80000002U + i), 0, abcd.data());
+    memcpy(brand_string + i * 16, abcd.data(), sizeof(abcd));
+  }
+  brand_string[48] = 0;
+  return brand_string;
+}
+
+// Returns the frequency quoted inside the brand string. This does not
+// account for throttling nor Turbo Boost.
+double NominalClockRate() {
+  const std::string& brand_string = BrandString();
+  // Brand strings include the maximum configured frequency. These prefixes are
+  // defined by Intel CPUID documentation.
+  const char* prefixes[3] = {"MHz", "GHz", "THz"};
+  const double multipliers[3] = {1E6, 1E9, 1E12};
+  for (size_t i = 0; i < 3; ++i) {
+    const size_t pos_prefix = brand_string.find(prefixes[i]);
+    if (pos_prefix != std::string::npos) {
+      const size_t pos_space = brand_string.rfind(' ', pos_prefix - 1);
+      if (pos_space != std::string::npos) {
+        const std::string digits =
+            brand_string.substr(pos_space + 1, pos_prefix - pos_space - 1);
+        return std::stod(digits) * multipliers[i];
+      }
+    }
+  }
+
+  return 0.0;
+}
+
+#endif  // HWY_ARCH_X86
+
+}  // namespace
+
+// Returns tick rate. Invariant means the tick counter frequency is independent
+// of CPU throttling or sleep. May be expensive, caller should cache the result.
+double InvariantTicksPerSecond() {
+#if HWY_ARCH_PPC
+  return __ppc_get_timebase_freq();
+#elif HWY_ARCH_X86
+  // We assume the TSC is invariant; it is on all recent Intel/AMD CPUs.
+  return NominalClockRate();
+#else
+  // Fall back to clock_gettime nanoseconds.
+  return 1E9;
+#endif
+}
+
+}  // namespace platform
+namespace {
+
+// Prevents the compiler from eliding the computations that led to "output".
+template <class T>
+inline void PreventElision(T&& output) {
+#if HWY_COMPILER_MSVC == 0
+  // Works by indicating to the compiler that "output" is being read and
+  // modified. The +r constraint avoids unnecessary writes to memory, but only
+  // works for built-in types (typically FuncOutput).
+  asm volatile("" : "+r"(output) : : "memory");
+#else
+  // MSVC does not support inline assembly anymore (and never supported GCC's
+  // RTL constraints). Self-assignment with #pragma optimize("off") might be
+  // expected to prevent elision, but it does not with MSVC 2015. Type-punning
+  // with volatile pointers generates inefficient code on MSVC 2017.
+  static std::atomic<T> dummy(T{});
+  dummy.store(output, std::memory_order_relaxed);
+#endif
+}
+
+namespace timer {
+
+// Start/Stop return absolute timestamps and must be placed immediately before
+// and after the region to measure. We provide separate Start/Stop functions
+// because they use different fences.
+//
+// Background: RDTSC is not 'serializing'; earlier instructions may complete
+// after it, and/or later instructions may complete before it. 'Fences' ensure
+// regions' elapsed times are independent of such reordering. The only
+// documented unprivileged serializing instruction is CPUID, which acts as a
+// full fence (no reordering across it in either direction). Unfortunately
+// the latency of CPUID varies wildly (perhaps made worse by not initializing
+// its EAX input). Because it cannot reliably be deducted from the region's
+// elapsed time, it must not be included in the region to measure (i.e.
+// between the two RDTSC).
+//
+// The newer RDTSCP is sometimes described as serializing, but it actually
+// only serves as a half-fence with release semantics. Although all
+// instructions in the region will complete before the final timestamp is
+// captured, subsequent instructions may leak into the region and increase the
+// elapsed time. Inserting another fence after the final RDTSCP would prevent
+// such reordering without affecting the measured region.
+//
+// Fortunately, such a fence exists. The LFENCE instruction is only documented
+// to delay later loads until earlier loads are visible. However, Intel's
+// reference manual says it acts as a full fence (waiting until all earlier
+// instructions have completed, and delaying later instructions until it
+// completes). AMD assigns the same behavior to MFENCE.
+//
+// We need a fence before the initial RDTSC to prevent earlier instructions
+// from leaking into the region, and arguably another after RDTSC to avoid
+// region instructions from completing before the timestamp is recorded.
+// When surrounded by fences, the additional RDTSCP half-fence provides no
+// benefit, so the initial timestamp can be recorded via RDTSC, which has
+// lower overhead than RDTSCP because it does not read TSC_AUX. In summary,
+// we define Start = LFENCE/RDTSC/LFENCE; Stop = RDTSCP/LFENCE.
+//
+// Using Start+Start leads to higher variance and overhead than Stop+Stop.
+// However, Stop+Stop includes an LFENCE in the region measurements, which
+// adds a delay dependent on earlier loads. The combination of Start+Stop
+// is faster than Start+Start and more consistent than Stop+Stop because
+// the first LFENCE already delayed subsequent loads before the measured
+// region. This combination seems not to have been considered in prior work:
+// http://akaros.cs.berkeley.edu/lxr/akaros/kern/arch/x86/rdtsc_test.c
+//
+// Note: performance counters can measure 'exact' instructions-retired or
+// (unhalted) cycle counts. The RDPMC instruction is not serializing and also
+// requires fences. Unfortunately, it is not accessible on all OSes and we
+// prefer to avoid kernel-mode drivers. Performance counters are also affected
+// by several under/over-count errata, so we use the TSC instead.
+
+// Returns a 64-bit timestamp in unit of 'ticks'; to convert to seconds,
+// divide by InvariantTicksPerSecond.
+inline uint64_t Start64() {
+  uint64_t t;
+#if HWY_ARCH_PPC
+  asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
+#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
+  _ReadWriteBarrier();
+  _mm_lfence();
+  _ReadWriteBarrier();
+  t = __rdtsc();
+  _ReadWriteBarrier();
+  _mm_lfence();
+  _ReadWriteBarrier();
+#elif HWY_ARCH_X86_64
+  asm volatile(
+      "lfence\n\t"
+      "rdtsc\n\t"
+      "shl $32, %%rdx\n\t"
+      "or %%rdx, %0\n\t"
+      "lfence"
+      : "=a"(t)
+      :
+      // "memory" avoids reordering. rdx = TSC >> 32.
+      // "cc" = flags modified by SHL.
+      : "rdx", "memory", "cc");
+#elif HWY_ARCH_RVV
+  asm volatile("rdcycle %0" : "=r"(t));
+#else
+  // Fall back to OS - unsure how to reliably query cntvct_el0 frequency.
+  timespec ts;
+  clock_gettime(CLOCK_MONOTONIC, &ts);
+  t = ts.tv_sec * 1000000000LL + ts.tv_nsec;
+#endif
+  return t;
+}
+
+inline uint64_t Stop64() {
+  uint64_t t;
+#if HWY_ARCH_PPC
+  asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
+#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
+  _ReadWriteBarrier();
+  unsigned aux;
+  t = __rdtscp(&aux);
+  _ReadWriteBarrier();
+  _mm_lfence();
+  _ReadWriteBarrier();
+#elif HWY_ARCH_X86_64
+  // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
+  asm volatile(
+      "rdtscp\n\t"
+      "shl $32, %%rdx\n\t"
+      "or %%rdx, %0\n\t"
+      "lfence"
+      : "=a"(t)
+      :
+      // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
+      // "cc" = flags modified by SHL.
+      : "rcx", "rdx", "memory", "cc");
+#else
+  t = Start64();
+#endif
+  return t;
+}
+
+// Returns a 32-bit timestamp with about 4 cycles less overhead than
+// Start64. Only suitable for measuring very short regions because the
+// timestamp overflows about once a second.
+inline uint32_t Start32() {
+  uint32_t t;
+#if HWY_ARCH_X86 && HWY_COMPILER_MSVC
+  _ReadWriteBarrier();
+  _mm_lfence();
+  _ReadWriteBarrier();
+  t = static_cast<uint32_t>(__rdtsc());
+  _ReadWriteBarrier();
+  _mm_lfence();
+  _ReadWriteBarrier();
+#elif HWY_ARCH_X86_64
+  asm volatile(
+      "lfence\n\t"
+      "rdtsc\n\t"
+      "lfence"
+      : "=a"(t)
+      :
+      // "memory" avoids reordering. rdx = TSC >> 32.
+      : "rdx", "memory");
+#elif HWY_ARCH_RVV
+  asm volatile("rdcycle %0" : "=r"(t));
+#else
+  t = static_cast<uint32_t>(Start64());
+#endif
+  return t;
+}
+
+inline uint32_t Stop32() {
+  uint32_t t;
+#if HWY_ARCH_X86 && HWY_COMPILER_MSVC
+  _ReadWriteBarrier();
+  unsigned aux;
+  t = static_cast<uint32_t>(__rdtscp(&aux));
+  _ReadWriteBarrier();
+  _mm_lfence();
+  _ReadWriteBarrier();
+#elif HWY_ARCH_X86_64
+  // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
+  asm volatile(
+      "rdtscp\n\t"
+      "lfence"
+      : "=a"(t)
+      :
+      // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
+      : "rcx", "rdx", "memory");
+#else
+  t = static_cast<uint32_t>(Stop64());
+#endif
+  return t;
+}
+
+}  // namespace timer
+
+namespace robust_statistics {
+
+// Sorts integral values in ascending order (e.g. for Mode). About 3x faster
+// than std::sort for input distributions with very few unique values.
+template <class T>
+void CountingSort(T* values, size_t num_values) {
+  // Unique values and their frequency (similar to flat_map).
+  using Unique = std::pair<T, int>;
+  std::vector<Unique> unique;
+  for (size_t i = 0; i < num_values; ++i) {
+    const T value = values[i];
+    const auto pos =
+        std::find_if(unique.begin(), unique.end(),
+                     [value](const Unique u) { return u.first == value; });
+    if (pos == unique.end()) {
+      unique.push_back(std::make_pair(value, 1));
+    } else {
+      ++pos->second;
+    }
+  }
+
+  // Sort in ascending order of value (pair.first).
+  std::sort(unique.begin(), unique.end());
+
+  // Write that many copies of each unique value to the array.
+  T* HWY_RESTRICT p = values;
+  for (const auto& value_count : unique) {
+    std::fill(p, p + value_count.second, value_count.first);
+    p += value_count.second;
+  }
+  NANOBENCHMARK_CHECK(p == values + num_values);
+}
+
+// @return i in [idx_begin, idx_begin + half_count) that minimizes
+// sorted[i + half_count] - sorted[i].
+template <typename T>
+size_t MinRange(const T* const HWY_RESTRICT sorted, const size_t idx_begin,
+                const size_t half_count) {
+  T min_range = std::numeric_limits<T>::max();
+  size_t min_idx = 0;
+
+  for (size_t idx = idx_begin; idx < idx_begin + half_count; ++idx) {
+    NANOBENCHMARK_CHECK(sorted[idx] <= sorted[idx + half_count]);
+    const T range = sorted[idx + half_count] - sorted[idx];
+    if (range < min_range) {
+      min_range = range;
+      min_idx = idx;
+    }
+  }
+
+  return min_idx;
+}
+
+// Returns an estimate of the mode by calling MinRange on successively
+// halved intervals. "sorted" must be in ascending order. This is the
+// Half Sample Mode estimator proposed by Bickel in "On a fast, robust
+// estimator of the mode", with complexity O(N log N). The mode is less
+// affected by outliers in highly-skewed distributions than the median.
+// The averaging operation below assumes "T" is an unsigned integer type.
+template <typename T>
+T ModeOfSorted(const T* const HWY_RESTRICT sorted, const size_t num_values) {
+  size_t idx_begin = 0;
+  size_t half_count = num_values / 2;
+  while (half_count > 1) {
+    idx_begin = MinRange(sorted, idx_begin, half_count);
+    half_count >>= 1;
+  }
+
+  const T x = sorted[idx_begin + 0];
+  if (half_count == 0) {
+    return x;
+  }
+  NANOBENCHMARK_CHECK(half_count == 1);
+  const T average = (x + sorted[idx_begin + 1] + 1) / 2;
+  return average;
+}
+
+// Returns the mode. Side effect: sorts "values".
+template <typename T>
+T Mode(T* values, const size_t num_values) {
+  CountingSort(values, num_values);
+  return ModeOfSorted(values, num_values);
+}
+
+template <typename T, size_t N>
+T Mode(T (&values)[N]) {
+  return Mode(&values[0], N);
+}
+
+// Returns the median value. Side effect: sorts "values".
+template <typename T>
+T Median(T* values, const size_t num_values) {
+  NANOBENCHMARK_CHECK(!values->empty());
+  std::sort(values, values + num_values);
+  const size_t half = num_values / 2;
+  // Odd count: return middle
+  if (num_values % 2) {
+    return values[half];
+  }
+  // Even count: return average of middle two.
+  return (values[half] + values[half - 1] + 1) / 2;
+}
+
+// Returns a robust measure of variability.
+template <typename T>
+T MedianAbsoluteDeviation(const T* values, const size_t num_values,
+                          const T median) {
+  NANOBENCHMARK_CHECK(num_values != 0);
+  std::vector<T> abs_deviations;
+  abs_deviations.reserve(num_values);
+  for (size_t i = 0; i < num_values; ++i) {
+    const int64_t abs = std::abs(int64_t(values[i]) - int64_t(median));
+    abs_deviations.push_back(static_cast<T>(abs));
+  }
+  return Median(abs_deviations.data(), num_values);
+}
+
+}  // namespace robust_statistics
+
+// Ticks := platform-specific timer values (CPU cycles on x86). Must be
+// unsigned to guarantee wraparound on overflow. 32 bit timers are faster to
+// read than 64 bit.
+using Ticks = uint32_t;
+
+// Returns timer overhead / minimum measurable difference.
+Ticks TimerResolution() {
+  // Nested loop avoids exceeding stack/L1 capacity.
+  Ticks repetitions[Params::kTimerSamples];
+  for (size_t rep = 0; rep < Params::kTimerSamples; ++rep) {
+    Ticks samples[Params::kTimerSamples];
+    for (size_t i = 0; i < Params::kTimerSamples; ++i) {
+      const Ticks t0 = timer::Start32();
+      const Ticks t1 = timer::Stop32();
+      samples[i] = t1 - t0;
+    }
+    repetitions[rep] = robust_statistics::Mode(samples);
+  }
+  return robust_statistics::Mode(repetitions);
+}
+
+static const Ticks timer_resolution = TimerResolution();
+
+// Estimates the expected value of "lambda" values with a variable number of
+// samples until the variability "rel_mad" is less than "max_rel_mad".
+template <class Lambda>
+Ticks SampleUntilStable(const double max_rel_mad, double* rel_mad,
+                        const Params& p, const Lambda& lambda) {
+  // Choose initial samples_per_eval based on a single estimated duration.
+  Ticks t0 = timer::Start32();
+  lambda();
+  Ticks t1 = timer::Stop32();
+  Ticks est = t1 - t0;
+  static const double ticks_per_second = platform::InvariantTicksPerSecond();
+  const size_t ticks_per_eval =
+      static_cast<size_t>(ticks_per_second * p.seconds_per_eval);
+  size_t samples_per_eval =
+      est == 0 ? p.min_samples_per_eval : ticks_per_eval / est;
+  samples_per_eval = std::max(samples_per_eval, p.min_samples_per_eval);
+
+  std::vector<Ticks> samples;
+  samples.reserve(1 + samples_per_eval);
+  samples.push_back(est);
+
+  // Percentage is too strict for tiny differences, so also allow a small
+  // absolute "median absolute deviation".
+  const Ticks max_abs_mad = (timer_resolution + 99) / 100;
+  *rel_mad = 0.0;  // ensure initialized
+
+  for (size_t eval = 0; eval < p.max_evals; ++eval, samples_per_eval *= 2) {
+    samples.reserve(samples.size() + samples_per_eval);
+    for (size_t i = 0; i < samples_per_eval; ++i) {
+      t0 = timer::Start32();
+      lambda();
+      t1 = timer::Stop32();
+      samples.push_back(t1 - t0);
+    }
+
+    if (samples.size() >= p.min_mode_samples) {
+      est = robust_statistics::Mode(samples.data(), samples.size());
+    } else {
+      // For "few" (depends also on the variance) samples, Median is safer.
+      est = robust_statistics::Median(samples.data(), samples.size());
+    }
+    NANOBENCHMARK_CHECK(est != 0);
+
+    // Median absolute deviation (mad) is a robust measure of 'variability'.
+    const Ticks abs_mad = robust_statistics::MedianAbsoluteDeviation(
+        samples.data(), samples.size(), est);
+    *rel_mad = static_cast<double>(int(abs_mad)) / est;
+
+    if (*rel_mad <= max_rel_mad || abs_mad <= max_abs_mad) {
+      if (p.verbose) {
+        printf("%6zu samples => %5u (abs_mad=%4u, rel_mad=%4.2f%%)\n",
+               samples.size(), est, abs_mad, *rel_mad * 100.0);
+      }
+      return est;
+    }
+  }
+
+  if (p.verbose) {
+    printf(
+        "WARNING: rel_mad=%4.2f%% still exceeds %4.2f%% after %6zu samples.\n",
+        *rel_mad * 100.0, max_rel_mad * 100.0, samples.size());
+  }
+  return est;
+}
+
+using InputVec = std::vector<FuncInput>;
+
+// Returns vector of unique input values.
+InputVec UniqueInputs(const FuncInput* inputs, const size_t num_inputs) {
+  InputVec unique(inputs, inputs + num_inputs);
+  std::sort(unique.begin(), unique.end());
+  unique.erase(std::unique(unique.begin(), unique.end()), unique.end());
+  return unique;
+}
+
+// Returns how often we need to call func for sufficient precision, or zero
+// on failure (e.g. the elapsed time is too long for a 32-bit tick count).
+size_t NumSkip(const Func func, const uint8_t* arg, const InputVec& unique,
+               const Params& p) {
+  // Min elapsed ticks for any input.
+  Ticks min_duration = ~0u;
+
+  for (const FuncInput input : unique) {
+    // Make sure a 32-bit timer is sufficient.
+    const uint64_t t0 = timer::Start64();
+    PreventElision(func(arg, input));
+    const uint64_t t1 = timer::Stop64();
+    const uint64_t elapsed = t1 - t0;
+    if (elapsed >= (1ULL << 30)) {
+      fprintf(stderr, "Measurement failed: need 64-bit timer for input=%zu\n",
+              input);
+      return 0;
+    }
+
+    double rel_mad;
+    const Ticks total = SampleUntilStable(
+        p.target_rel_mad, &rel_mad, p,
+        [func, arg, input]() { PreventElision(func(arg, input)); });
+    min_duration = std::min(min_duration, total - timer_resolution);
+  }
+
+  // Number of repetitions required to reach the target resolution.
+  const size_t max_skip = p.precision_divisor;
+  // Number of repetitions given the estimated duration.
+  const size_t num_skip =
+      min_duration == 0 ? 0 : (max_skip + min_duration - 1) / min_duration;
+  if (p.verbose) {
+    printf("res=%u max_skip=%zu min_dur=%u num_skip=%zu\n", timer_resolution,
+           max_skip, min_duration, num_skip);
+  }
+  return num_skip;
+}
+
+// Replicates inputs until we can omit "num_skip" occurrences of an input.
+InputVec ReplicateInputs(const FuncInput* inputs, const size_t num_inputs,
+                         const size_t num_unique, const size_t num_skip,
+                         const Params& p) {
+  InputVec full;
+  if (num_unique == 1) {
+    full.assign(p.subset_ratio * num_skip, inputs[0]);
+    return full;
+  }
+
+  full.reserve(p.subset_ratio * num_skip * num_inputs);
+  for (size_t i = 0; i < p.subset_ratio * num_skip; ++i) {
+    full.insert(full.end(), inputs, inputs + num_inputs);
+  }
+  std::mt19937 rng;
+  std::shuffle(full.begin(), full.end(), rng);
+  return full;
+}
+
+// Copies the "full" to "subset" in the same order, but with "num_skip"
+// randomly selected occurrences of "input_to_skip" removed.
+void FillSubset(const InputVec& full, const FuncInput input_to_skip,
+                const size_t num_skip, InputVec* subset) {
+  const size_t count =
+      static_cast<size_t>(std::count(full.begin(), full.end(), input_to_skip));
+  // Generate num_skip random indices: which occurrence to skip.
+  std::vector<uint32_t> omit(count);
+  std::iota(omit.begin(), omit.end(), 0);
+  // omit[] is the same on every call, but that's OK because they identify the
+  // Nth instance of input_to_skip, so the position within full[] differs.
+  std::mt19937 rng;
+  std::shuffle(omit.begin(), omit.end(), rng);
+  omit.resize(num_skip);
+  std::sort(omit.begin(), omit.end());
+
+  uint32_t occurrence = ~0u;  // 0 after preincrement
+  size_t idx_omit = 0;        // cursor within omit[]
+  size_t idx_subset = 0;      // cursor within *subset
+  for (const FuncInput next : full) {
+    if (next == input_to_skip) {
+      ++occurrence;
+      // Haven't removed enough already
+      if (idx_omit < num_skip) {
+        // This one is up for removal
+        if (occurrence == omit[idx_omit]) {
+          ++idx_omit;
+          continue;
+        }
+      }
+    }
+    if (idx_subset < subset->size()) {
+      (*subset)[idx_subset++] = next;
+    }
+  }
+  NANOBENCHMARK_CHECK(idx_subset == subset->size());
+  NANOBENCHMARK_CHECK(idx_omit == omit.size());
+  NANOBENCHMARK_CHECK(occurrence == count - 1);
+}
+
+// Returns total ticks elapsed for all inputs.
+Ticks TotalDuration(const Func func, const uint8_t* arg, const InputVec* inputs,
+                    const Params& p, double* max_rel_mad) {
+  double rel_mad;
+  const Ticks duration =
+      SampleUntilStable(p.target_rel_mad, &rel_mad, p, [func, arg, inputs]() {
+        for (const FuncInput input : *inputs) {
+          PreventElision(func(arg, input));
+        }
+      });
+  *max_rel_mad = std::max(*max_rel_mad, rel_mad);
+  return duration;
+}
+
+// (Nearly) empty Func for measuring timer overhead/resolution.
+HWY_NOINLINE FuncOutput EmptyFunc(const void* /*arg*/, const FuncInput input) {
+  return input;
+}
+
+// Returns overhead of accessing inputs[] and calling a function; this will
+// be deducted from future TotalDuration return values.
+Ticks Overhead(const uint8_t* arg, const InputVec* inputs, const Params& p) {
+  double rel_mad;
+  // Zero tolerance because repeatability is crucial and EmptyFunc is fast.
+  return SampleUntilStable(0.0, &rel_mad, p, [arg, inputs]() {
+    for (const FuncInput input : *inputs) {
+      PreventElision(EmptyFunc(arg, input));
+    }
+  });
+}
+
+}  // namespace
+
+int Unpredictable1() { return timer::Start64() != ~0ULL; }
+
+size_t Measure(const Func func, const uint8_t* arg, const FuncInput* inputs,
+               const size_t num_inputs, Result* results, const Params& p) {
+  NANOBENCHMARK_CHECK(num_inputs != 0);
+  const InputVec& unique = UniqueInputs(inputs, num_inputs);
+
+  const size_t num_skip = NumSkip(func, arg, unique, p);  // never 0
+  if (num_skip == 0) return 0;  // NumSkip already printed error message
+  // (slightly less work on x86 to cast from signed integer)
+  const float mul = 1.0f / static_cast<float>(static_cast<int>(num_skip));
+
+  const InputVec& full =
+      ReplicateInputs(inputs, num_inputs, unique.size(), num_skip, p);
+  InputVec subset(full.size() - num_skip);
+
+  const Ticks overhead = Overhead(arg, &full, p);
+  const Ticks overhead_skip = Overhead(arg, &subset, p);
+  if (overhead < overhead_skip) {
+    fprintf(stderr, "Measurement failed: overhead %u < %u\n", overhead,
+            overhead_skip);
+    return 0;
+  }
+
+  if (p.verbose) {
+    printf("#inputs=%5zu,%5zu overhead=%5u,%5u\n", full.size(), subset.size(),
+           overhead, overhead_skip);
+  }
+
+  double max_rel_mad = 0.0;
+  const Ticks total = TotalDuration(func, arg, &full, p, &max_rel_mad);
+
+  for (size_t i = 0; i < unique.size(); ++i) {
+    FillSubset(full, unique[i], num_skip, &subset);
+    const Ticks total_skip = TotalDuration(func, arg, &subset, p, &max_rel_mad);
+
+    if (total < total_skip) {
+      fprintf(stderr, "Measurement failed: total %u < %u\n", total, total_skip);
+      return 0;
+    }
+
+    const Ticks duration = (total - overhead) - (total_skip - overhead_skip);
+    results[i].input = unique[i];
+    results[i].ticks = static_cast<float>(duration) * mul;
+    results[i].variability = static_cast<float>(max_rel_mad);
+  }
+
+  return unique.size();
+}
+
+}  // namespace hwy
--- a/third_party/highway/hwy/nanobenchmark.h
+++ b/third_party/highway/hwy/nanobenchmark.h
@ -0,0 +1,186 @@
+// Copyright 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAY_HWY_NANOBENCHMARK_H_
+#define HIGHWAY_HWY_NANOBENCHMARK_H_
+
+// Benchmarks functions of a single integer argument with realistic branch
+// prediction hit rates. Uses a robust estimator to summarize the measurements.
+// The precision is about 0.2%.
+//
+// Examples: see nanobenchmark_test.cc.
+//
+// Background: Microbenchmarks such as http://github.com/google/benchmark
+// can measure elapsed times on the order of a microsecond. Shorter functions
+// are typically measured by repeating them thousands of times and dividing
+// the total elapsed time by this count. Unfortunately, repetition (especially
+// with the same input parameter!) influences the runtime. In time-critical
+// code, it is reasonable to expect warm instruction/data caches and TLBs,
+// but a perfect record of which branches will be taken is unrealistic.
+// Unless the application also repeatedly invokes the measured function with
+// the same parameter, the benchmark is measuring something very different -
+// a best-case result, almost as if the parameter were made a compile-time
+// constant. This may lead to erroneous conclusions about branch-heavy
+// algorithms outperforming branch-free alternatives.
+//
+// Our approach differs in three ways. Adding fences to the timer functions
+// reduces variability due to instruction reordering, improving the timer
+// resolution to about 40 CPU cycles. However, shorter functions must still
+// be invoked repeatedly. For more realistic branch prediction performance,
+// we vary the input parameter according to a user-specified distribution.
+// Thus, instead of VaryInputs(Measure(Repeat(func))), we change the
+// loop nesting to Measure(Repeat(VaryInputs(func))). We also estimate the
+// central tendency of the measurement samples with the "half sample mode",
+// which is more robust to outliers and skewed data than the mean or median.
+
+// WARNING if included from multiple translation units compiled with distinct
+// flags: this header requires textual inclusion and a predefined NB_NAMESPACE
+// macro that is unique to the current compile flags. We must also avoid
+// standard library headers such as vector and functional that define functions.
+
+#include <stddef.h>
+#include <stdint.h>
+
+// Enables sanity checks that verify correct operation at the cost of
+// longer benchmark runs.
+#ifndef NANOBENCHMARK_ENABLE_CHECKS
+#define NANOBENCHMARK_ENABLE_CHECKS 0
+#endif
+
+#define NANOBENCHMARK_CHECK_ALWAYS(condition)                             \
+  while (!(condition)) {                                                  \
+    fprintf(stderr, "Nanobenchmark check failed at line %d\n", __LINE__); \
+    abort();                                                              \
+  }
+
+#if NANOBENCHMARK_ENABLE_CHECKS
+#define NANOBENCHMARK_CHECK(condition) NANOBENCHMARK_CHECK_ALWAYS(condition)
+#else
+#define NANOBENCHMARK_CHECK(condition)
+#endif
+
+namespace hwy {
+
+namespace platform {
+
+// Returns tick rate, useful for converting measurements to seconds. Invariant
+// means the tick counter frequency is independent of CPU throttling or sleep.
+// This call may be expensive, callers should cache the result.
+double InvariantTicksPerSecond();
+
+}  // namespace platform
+
+// Returns 1, but without the compiler knowing what the value is. This prevents
+// optimizing out code.
+int Unpredictable1();
+
+// Input influencing the function being measured (e.g. number of bytes to copy).
+using FuncInput = size_t;
+
+// "Proof of work" returned by Func to ensure the compiler does not elide it.
+using FuncOutput = uint64_t;
+
+// Function to measure: either 1) a captureless lambda or function with two
+// arguments or 2) a lambda with capture, in which case the first argument
+// is reserved for use by MeasureClosure.
+using Func = FuncOutput (*)(const void*, FuncInput);
+
+// Internal parameters that determine precision/resolution/measuring time.
+struct Params {
+  // For measuring timer overhead/resolution. Used in a nested loop =>
+  // quadratic time, acceptable because we know timer overhead is "low".
+  // constexpr because this is used to define array bounds.
+  static constexpr size_t kTimerSamples = 256;
+
+  // Best-case precision, expressed as a divisor of the timer resolution.
+  // Larger => more calls to Func and higher precision.
+  size_t precision_divisor = 1024;
+
+  // Ratio between full and subset input distribution sizes. Cannot be less
+  // than 2; larger values increase measurement time but more faithfully
+  // model the given input distribution.
+  size_t subset_ratio = 2;
+
+  // Together with the estimated Func duration, determines how many times to
+  // call Func before checking the sample variability. Larger values increase
+  // measurement time, memory/cache use and precision.
+  double seconds_per_eval = 4E-3;
+
+  // The minimum number of samples before estimating the central tendency.
+  size_t min_samples_per_eval = 7;
+
+  // The mode is better than median for estimating the central tendency of
+  // skewed/fat-tailed distributions, but it requires sufficient samples
+  // relative to the width of half-ranges.
+  size_t min_mode_samples = 64;
+
+  // Maximum permissible variability (= median absolute deviation / center).
+  double target_rel_mad = 0.002;
+
+  // Abort after this many evals without reaching target_rel_mad. This
+  // prevents infinite loops.
+  size_t max_evals = 9;
+
+  // Whether to print additional statistics to stdout.
+  bool verbose = true;
+};
+
+// Measurement result for each unique input.
+struct Result {
+  FuncInput input;
+
+  // Robust estimate (mode or median) of duration.
+  float ticks;
+
+  // Measure of variability (median absolute deviation relative to "ticks").
+  float variability;
+};
+
+// Precisely measures the number of ticks elapsed when calling "func" with the
+// given inputs, shuffled to ensure realistic branch prediction hit rates.
+//
+// "func" returns a 'proof of work' to ensure its computations are not elided.
+// "arg" is passed to Func, or reserved for internal use by MeasureClosure.
+// "inputs" is an array of "num_inputs" (not necessarily unique) arguments to
+//   "func". The values should be chosen to maximize coverage of "func". This
+//   represents a distribution, so a value's frequency should reflect its
+//   probability in the real application. Order does not matter; for example, a
+//   uniform distribution over [0, 4) could be represented as {3,0,2,1}.
+// Returns how many Result were written to "results": one per unique input, or
+//   zero if the measurement failed (an error message goes to stderr).
+size_t Measure(const Func func, const uint8_t* arg, const FuncInput* inputs,
+               const size_t num_inputs, Result* results,
+               const Params& p = Params());
+
+// Calls operator() of the given closure (lambda function).
+template <class Closure>
+static FuncOutput CallClosure(const Closure* f, const FuncInput input) {
+  return (*f)(input);
+}
+
+// Same as Measure, except "closure" is typically a lambda function of
+// FuncInput -> FuncOutput with a capture list.
+template <class Closure>
+static inline size_t MeasureClosure(const Closure& closure,
+                                    const FuncInput* inputs,
+                                    const size_t num_inputs, Result* results,
+                                    const Params& p = Params()) {
+  return Measure(reinterpret_cast<Func>(&CallClosure<Closure>),
+                 reinterpret_cast<const uint8_t*>(&closure), inputs, num_inputs,
+                 results, p);
+}
+
+}  // namespace hwy
+
+#endif  // HIGHWAY_HWY_NANOBENCHMARK_H_
--- a/third_party/highway/hwy/nanobenchmark_test.cc
+++ b/third_party/highway/hwy/nanobenchmark_test.cc
@ -0,0 +1,104 @@
+// Copyright 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/nanobenchmark.h"
+
+#include <stdio.h>
+#include <stdlib.h>  // strtol
+#include <unistd.h>  // sleep
+
+#include <random>
+
+namespace hwy {
+namespace {
+
+FuncOutput Div(const void*, FuncInput in) {
+  // Here we're measuring the throughput because benchmark invocations are
+  // independent. Any dividend will do; the divisor is nonzero.
+  return 0xFFFFF / in;
+}
+
+template <size_t N>
+void MeasureDiv(const FuncInput (&inputs)[N]) {
+  Result results[N];
+  Params params;
+  params.max_evals = 4;  // avoid test timeout
+  const size_t num_results = Measure(&Div, nullptr, inputs, N, results, params);
+  for (size_t i = 0; i < num_results; ++i) {
+    printf("%5zu: %6.2f ticks; MAD=%4.2f%%\n", results[i].input,
+           results[i].ticks, results[i].variability * 100.0);
+  }
+}
+
+std::mt19937 rng;
+
+// A function whose runtime depends on rng.
+FuncOutput Random(const void* /*arg*/, FuncInput in) {
+  const size_t r = rng() & 0xF;
+  uint32_t ret = in;
+  for (size_t i = 0; i < r; ++i) {
+    ret /= ((rng() & 1) + 2);
+  }
+  return ret;
+}
+
+// Ensure the measured variability is high.
+template <size_t N>
+void MeasureRandom(const FuncInput (&inputs)[N]) {
+  Result results[N];
+  Params p;
+  p.max_evals = 4;  // avoid test timeout
+  p.verbose = false;
+  const size_t num_results = Measure(&Random, nullptr, inputs, N, results, p);
+  for (size_t i = 0; i < num_results; ++i) {
+    NANOBENCHMARK_CHECK(results[i].variability > 1E-3);
+  }
+}
+
+template <size_t N>
+void EnsureLongMeasurementFails(const FuncInput (&inputs)[N]) {
+  printf("Expect a 'measurement failed' below:\n");
+  Result results[N];
+
+  const size_t num_results = Measure(
+      [](const void*, const FuncInput input) -> FuncOutput {
+        // Loop until the sleep succeeds (not interrupted by signal). We assume
+        // >= 512 MHz, so 2 seconds will exceed the 1 << 30 tick safety limit.
+        while (sleep(2) != 0) {
+        }
+        return input;
+      },
+      nullptr, inputs, N, results);
+  NANOBENCHMARK_CHECK(num_results == 0);
+  (void)num_results;
+}
+
+void RunAll(const int argc, char** /*argv*/) {
+  // unpredictable == 1 but the compiler doesn't know that.
+  const int unpredictable = argc != 999;
+  static const FuncInput inputs[] = {static_cast<FuncInput>(unpredictable) + 2,
+                                     static_cast<FuncInput>(unpredictable + 9)};
+
+  MeasureDiv(inputs);
+  MeasureRandom(inputs);
+  EnsureLongMeasurementFails(inputs);
+}
+
+}  // namespace
+}  // namespace hwy
+
+int main(int argc, char* argv[]) {
+  hwy::RunAll(argc, argv);
+  return 0;
+}
--- a/third_party/highway/hwy/ops/arm_neon-inl.h
+++ b/third_party/highway/hwy/ops/arm_neon-inl.h
--- a/third_party/highway/hwy/ops/rvv-inl.h
+++ b/third_party/highway/hwy/ops/rvv-inl.h
--- a/third_party/highway/hwy/ops/scalar-inl.h
+++ b/third_party/highway/hwy/ops/scalar-inl.h
--- a/third_party/highway/hwy/ops/set_macros-inl.h
+++ b/third_party/highway/hwy/ops/set_macros-inl.h
@ -0,0 +1,226 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Sets macros based on HWY_TARGET.
+
+// This include guard is toggled by foreach_target, so avoid the usual _H_
+// suffix to prevent copybara from renaming it.
+#if defined(HWY_SET_MACROS_PER_TARGET) == defined(HWY_TARGET_TOGGLE)
+#ifdef HWY_SET_MACROS_PER_TARGET
+#undef HWY_SET_MACROS_PER_TARGET
+#else
+#define HWY_SET_MACROS_PER_TARGET
+#endif
+
+#endif  // HWY_SET_MACROS_PER_TARGET
+
+#include "hwy/targets.h"
+
+#undef HWY_NAMESPACE
+#undef HWY_ALIGN
+#undef HWY_LANES
+
+#undef HWY_CAP_INTEGER64
+#undef HWY_CAP_FLOAT64
+#undef HWY_CAP_GE256
+#undef HWY_CAP_GE512
+
+#undef HWY_TARGET_STR
+
+// Before include guard so we redefine HWY_TARGET_STR on each include,
+// governed by the current HWY_TARGET.
+//-----------------------------------------------------------------------------
+// SSE4
+#if HWY_TARGET == HWY_SSE4
+
+#define HWY_NAMESPACE N_SSE4
+#define HWY_ALIGN alignas(16)
+#define HWY_LANES(T) (16 / sizeof(T))
+
+#define HWY_CAP_INTEGER64 1
+#define HWY_CAP_FLOAT64 1
+#define HWY_CAP_GE256 0
+#define HWY_CAP_GE512 0
+
+#define HWY_TARGET_STR "sse2,ssse3,sse4.1"
+
+//-----------------------------------------------------------------------------
+// AVX2
+#elif HWY_TARGET == HWY_AVX2
+
+#define HWY_NAMESPACE N_AVX2
+#define HWY_ALIGN alignas(32)
+#define HWY_LANES(T) (32 / sizeof(T))
+
+#define HWY_CAP_INTEGER64 1
+#define HWY_CAP_FLOAT64 1
+#define HWY_CAP_GE256 1
+#define HWY_CAP_GE512 0
+
+#if defined(HWY_DISABLE_BMI2_FMA)
+#define HWY_TARGET_STR "avx,avx2,f16c"
+#else
+#define HWY_TARGET_STR "avx,avx2,bmi,bmi2,fma,f16c"
+#endif
+
+//-----------------------------------------------------------------------------
+// AVX3
+#elif HWY_TARGET == HWY_AVX3
+
+#define HWY_ALIGN alignas(64)
+#define HWY_LANES(T) (64 / sizeof(T))
+
+#define HWY_CAP_INTEGER64 1
+#define HWY_CAP_FLOAT64 1
+#define HWY_CAP_GE256 1
+#define HWY_CAP_GE512 1
+
+#define HWY_NAMESPACE N_AVX3
+
+// Must include AVX2 because an AVX3 test may call AVX2 functions (e.g. when
+// converting to half-vectors). HWY_DISABLE_BMI2_FMA is not relevant because if
+// we have AVX3, we should also have BMI2/FMA.
+#define HWY_TARGET_STR \
+  "avx,avx2,bmi,bmi2,fma,f16c,avx512f,avx512vl,avx512dq,avx512bw"
+
+//-----------------------------------------------------------------------------
+// PPC8
+#elif HWY_TARGET == HWY_PPC8
+
+#define HWY_ALIGN alignas(16)
+#define HWY_LANES(T) (16 / sizeof(T))
+
+#define HWY_CAP_INTEGER64 1
+#define HWY_CAP_FLOAT64 1
+#define HWY_CAP_GE256 0
+#define HWY_CAP_GE512 0
+
+#define HWY_NAMESPACE N_PPC8
+
+#define HWY_TARGET_STR "altivec,vsx"
+
+//-----------------------------------------------------------------------------
+// NEON
+#elif HWY_TARGET == HWY_NEON
+
+#define HWY_ALIGN alignas(16)
+#define HWY_LANES(T) (16 / sizeof(T))
+
+#define HWY_CAP_INTEGER64 1
+#define HWY_CAP_GE256 0
+#define HWY_CAP_GE512 0
+
+#if HWY_ARCH_ARM_A64
+#define HWY_CAP_FLOAT64 1
+#else
+#define HWY_CAP_FLOAT64 0
+#endif
+
+#define HWY_NAMESPACE N_NEON
+
+// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
+
+//-----------------------------------------------------------------------------
+// WASM
+#elif HWY_TARGET == HWY_WASM
+
+#define HWY_ALIGN alignas(16)
+#define HWY_LANES(T) (16 / sizeof(T))
+
+#define HWY_CAP_INTEGER64 0
+#define HWY_CAP_FLOAT64 0
+#define HWY_CAP_GE256 0
+#define HWY_CAP_GE512 0
+
+#define HWY_NAMESPACE N_WASM
+
+#define HWY_TARGET_STR "simd128"
+
+//-----------------------------------------------------------------------------
+// RVV
+#elif HWY_TARGET == HWY_RVV
+
+// RVV only requires lane alignment, not natural alignment of the entire vector,
+// and the compiler already aligns builtin types, so nothing to do here.
+#define HWY_ALIGN
+
+// Arbitrary constant, not the actual lane count! Large enough that we can
+// mul/div by 8 for LMUL. Value matches kMaxVectorSize, see base.h.
+#define HWY_LANES(T) (4096 / sizeof(T))
+
+
+#define HWY_CAP_INTEGER64 1
+#define HWY_CAP_FLOAT64 1
+#define HWY_CAP_GE256 0
+#define HWY_CAP_GE512 0
+
+#define HWY_NAMESPACE N_RVV
+
+// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
+// (rv64gcv is not a valid target)
+
+//-----------------------------------------------------------------------------
+// SCALAR
+#elif HWY_TARGET == HWY_SCALAR
+
+#define HWY_ALIGN
+#define HWY_LANES(T) 1
+
+#define HWY_CAP_INTEGER64 1
+#define HWY_CAP_FLOAT64 1
+#define HWY_CAP_GE256 0
+#define HWY_CAP_GE512 0
+
+#define HWY_NAMESPACE N_SCALAR
+
+// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
+
+#else
+#pragma message("HWY_TARGET does not match any known target")
+#endif  // HWY_TARGET
+
+// Clang <9 requires this be invoked at file scope, before any namespace.
+#undef HWY_BEFORE_NAMESPACE
+#if defined(HWY_TARGET_STR)
+#define HWY_BEFORE_NAMESPACE()        \
+  HWY_PUSH_ATTRIBUTES(HWY_TARGET_STR) \
+  static_assert(true, "For requiring trailing semicolon")
+#else
+// avoids compiler warning if no HWY_TARGET_STR
+#define HWY_BEFORE_NAMESPACE() \
+  static_assert(true, "For requiring trailing semicolon")
+#endif
+
+// Clang <9 requires any namespaces be closed before this macro.
+#undef HWY_AFTER_NAMESPACE
+#if defined(HWY_TARGET_STR)
+#define HWY_AFTER_NAMESPACE() \
+  HWY_POP_ATTRIBUTES          \
+  static_assert(true, "For requiring trailing semicolon")
+#else
+// avoids compiler warning if no HWY_TARGET_STR
+#define HWY_AFTER_NAMESPACE() \
+  static_assert(true, "For requiring trailing semicolon")
+#endif
+
+#undef HWY_ATTR
+#if defined(HWY_TARGET_STR) && HWY_HAS_ATTRIBUTE(target)
+#define HWY_ATTR __attribute__((target(HWY_TARGET_STR)))
+#else
+#define HWY_ATTR
+#endif
+
+// DEPRECATED
+#undef HWY_GATHER_LANES
+#define HWY_GATHER_LANES(T) HWY_LANES(T)
--- a/third_party/highway/hwy/ops/shared-inl.h
+++ b/third_party/highway/hwy/ops/shared-inl.h
@ -0,0 +1,125 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Per-target definitions shared by ops/*.h and user code.
+
+#include <cmath>
+
+// Separate header because foreach_target.h re-enables its include guard.
+#include "hwy/ops/set_macros-inl.h"
+
+// Relies on the external include guard in highway.h.
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// SIMD operations are implemented as overloaded functions selected using a
+// "descriptor" D := Simd<T, N>. T is the lane type, N a number of lanes >= 1
+// (always a power of two). Users generally do not choose N directly, but
+// instead use HWY_FULL(T[, LMUL]) (the largest available size). N is not
+// necessarily the actual number of lanes, which is returned by Lanes(D()).
+//
+// Only HWY_FULL(T) and N <= 16 / sizeof(T) are guaranteed to be available - the
+// latter are useful if >128 bit vectors are unnecessary or undesirable.
+template <typename Lane, size_t N>
+struct Simd {
+  constexpr Simd() = default;
+  using T = Lane;
+  static_assert((N & (N - 1)) == 0 && N != 0, "N must be a power of two");
+
+  // Widening/narrowing ops change the number of lanes and/or their type.
+  // To initialize such vectors, we need the corresponding descriptor types:
+
+  // PromoteTo/DemoteTo() with another lane type, but same number of lanes.
+  template <typename NewLane>
+  using Rebind = Simd<NewLane, N>;
+
+  // MulEven() with another lane type, but same total size.
+  // Round up to correctly handle scalars with N=1.
+  template <typename NewLane>
+  using Repartition =
+      Simd<NewLane, (N * sizeof(Lane) + sizeof(NewLane) - 1) / sizeof(NewLane)>;
+
+  // LowerHalf() with the same lane type, but half the lanes.
+  // Round up to correctly handle scalars with N=1.
+  using Half = Simd<T, (N + 1) / 2>;
+
+  // Combine() with the same lane type, but twice the lanes.
+  using Twice = Simd<T, 2 * N>;
+};
+
+template <class D>
+using TFromD = typename D::T;
+
+// Descriptor for the same number of lanes as D, but with the LaneType T.
+template <class T, class D>
+using Rebind = typename D::template Rebind<T>;
+
+template <class D>
+using RebindToSigned = Rebind<MakeSigned<TFromD<D>>, D>;
+template <class D>
+using RebindToUnsigned = Rebind<MakeUnsigned<TFromD<D>>, D>;
+template <class D>
+using RebindToFloat = Rebind<MakeFloat<TFromD<D>>, D>;
+
+// Descriptor for the same total size as D, but with the LaneType T.
+template <class T, class D>
+using Repartition = typename D::template Repartition<T>;
+
+template <class D>
+using RepartitionToWide = Repartition<MakeWide<TFromD<D>>, D>;
+template <class D>
+using RepartitionToNarrow = Repartition<MakeNarrow<TFromD<D>>, D>;
+
+// Descriptor for the same lane type as D, but half the lanes.
+template <class D>
+using Half = typename D::Half;
+
+// Descriptor for the same lane type as D, but twice the lanes.
+template <class D>
+using Twice = typename D::Twice;
+
+// Same as base.h macros but with a Simd<T, N> argument instead of T.
+#define HWY_IF_UNSIGNED_D(D) HWY_IF_UNSIGNED(TFromD<D>)
+#define HWY_IF_SIGNED_D(D) HWY_IF_SIGNED(TFromD<D>)
+#define HWY_IF_FLOAT_D(D) HWY_IF_FLOAT(TFromD<D>)
+#define HWY_IF_NOT_FLOAT_D(D) HWY_IF_NOT_FLOAT(TFromD<D>)
+#define HWY_IF_LANE_SIZE_D(D, bytes) HWY_IF_LANE_SIZE(TFromD<D>, bytes)
+#define HWY_IF_NOT_LANE_SIZE_D(D, bytes) HWY_IF_NOT_LANE_SIZE(TFromD<D>, bytes)
+
+// Compile-time-constant, (typically but not guaranteed) an upper bound on the
+// number of lanes.
+// Prefer instead using Lanes() and dynamic allocation, or Rebind, or
+// `#if HWY_CAP_GE*`.
+template <typename T, size_t N>
+HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(Simd<T, N>) {
+  return N;
+}
+
+// Targets with non-constexpr Lanes define this themselves.
+#if HWY_TARGET != HWY_RVV
+
+// (Potentially) non-constant actual size of the vector at runtime, subject to
+// the limit imposed by the Simd. Useful for advancing loop counters.
+template <typename T, size_t N>
+HWY_INLINE HWY_MAYBE_UNUSED size_t Lanes(Simd<T, N>) {
+  return N;
+}
+
+#endif
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
--- a/third_party/highway/hwy/ops/wasm_128-inl.h
+++ b/third_party/highway/hwy/ops/wasm_128-inl.h
--- a/third_party/highway/hwy/ops/x86_128-inl.h
+++ b/third_party/highway/hwy/ops/x86_128-inl.h
--- a/third_party/highway/hwy/ops/x86_256-inl.h
+++ b/third_party/highway/hwy/ops/x86_256-inl.h
--- a/third_party/highway/hwy/ops/x86_512-inl.h
+++ b/third_party/highway/hwy/ops/x86_512-inl.h
--- a/third_party/highway/hwy/targets.cc
+++ b/third_party/highway/hwy/targets.cc
@ -0,0 +1,286 @@
+// Copyright 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/targets.h"
+
+#include <stdarg.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include <atomic>
+#include <limits>
+
+#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
+    defined(THREAD_SANITIZER)
+#include "sanitizer/common_interface_defs.h"  // __sanitizer_print_stack_trace
+#endif                                        // defined(*_SANITIZER)
+
+#if HWY_ARCH_X86
+#include <xmmintrin.h>
+#if HWY_COMPILER_MSVC
+#include <intrin.h>
+#else  // HWY_COMPILER_MSVC
+#include <cpuid.h>
+#endif // HWY_COMPILER_MSVC
+#endif
+
+namespace hwy {
+namespace {
+
+#if HWY_ARCH_X86
+
+bool IsBitSet(const uint32_t reg, const int index) {
+  return (reg & (1U << index)) != 0;
+}
+
+// Calls CPUID instruction with eax=level and ecx=count and returns the result
+// in abcd array where abcd = {eax, ebx, ecx, edx} (hence the name abcd).
+void Cpuid(const uint32_t level, const uint32_t count,
+           uint32_t* HWY_RESTRICT abcd) {
+#if HWY_COMPILER_MSVC
+  int regs[4];
+  __cpuidex(regs, level, count);
+  for (int i = 0; i < 4; ++i) {
+    abcd[i] = regs[i];
+  }
+#else  // HWY_COMPILER_MSVC
+  uint32_t a;
+  uint32_t b;
+  uint32_t c;
+  uint32_t d;
+  __cpuid_count(level, count, a, b, c, d);
+  abcd[0] = a;
+  abcd[1] = b;
+  abcd[2] = c;
+  abcd[3] = d;
+#endif  // HWY_COMPILER_MSVC
+}
+
+// Returns the lower 32 bits of extended control register 0.
+// Requires CPU support for "OSXSAVE" (see below).
+uint32_t ReadXCR0() {
+#if HWY_COMPILER_MSVC
+  return static_cast<uint32_t>(_xgetbv(0));
+#else  // HWY_COMPILER_MSVC
+  uint32_t xcr0, xcr0_high;
+  const uint32_t index = 0;
+  asm volatile(".byte 0x0F, 0x01, 0xD0"
+               : "=a"(xcr0), "=d"(xcr0_high)
+               : "c"(index));
+  return xcr0;
+#endif  // HWY_COMPILER_MSVC
+}
+
+#endif  // HWY_ARCH_X86
+
+// Not function-local => no compiler-generated locking.
+std::atomic<uint32_t> supported_{0};  // Not yet initialized
+
+// When running tests, this value can be set to the mocked supported targets
+// mask. Only written to from a single thread before the test starts.
+uint32_t supported_targets_for_test_ = 0;
+
+// Mask of targets disabled at runtime with DisableTargets.
+uint32_t supported_mask_{std::numeric_limits<uint32_t>::max()};
+
+#if HWY_ARCH_X86
+// Bits indicating which instruction set extensions are supported.
+constexpr uint32_t kSSE = 1 << 0;
+constexpr uint32_t kSSE2 = 1 << 1;
+constexpr uint32_t kSSE3 = 1 << 2;
+constexpr uint32_t kSSSE3 = 1 << 3;
+constexpr uint32_t kSSE41 = 1 << 4;
+constexpr uint32_t kSSE42 = 1 << 5;
+constexpr uint32_t kGroupSSE4 = kSSE | kSSE2 | kSSE3 | kSSSE3 | kSSE41 | kSSE42;
+
+constexpr uint32_t kAVX = 1u << 6;
+constexpr uint32_t kAVX2 = 1u << 7;
+constexpr uint32_t kFMA = 1u << 8;
+constexpr uint32_t kLZCNT = 1u << 9;
+constexpr uint32_t kBMI = 1u << 10;
+constexpr uint32_t kBMI2 = 1u << 11;
+
+// We normally assume BMI/BMI2/FMA are available if AVX2 is. This allows us to
+// use BZHI and (compiler-generated) MULX. However, VirtualBox lacks them
+// [https://www.virtualbox.org/ticket/15471]. Thus we provide the option of
+// avoiding using and requiring these so AVX2 can still be used.
+#ifdef HWY_DISABLE_BMI2_FMA
+constexpr uint32_t kGroupAVX2 = kAVX | kAVX2 | kLZCNT;
+#else
+constexpr uint32_t kGroupAVX2 = kAVX | kAVX2 | kFMA | kLZCNT | kBMI | kBMI2;
+#endif
+
+constexpr uint32_t kAVX512F = 1u << 12;
+constexpr uint32_t kAVX512VL = 1u << 13;
+constexpr uint32_t kAVX512DQ = 1u << 14;
+constexpr uint32_t kAVX512BW = 1u << 15;
+constexpr uint32_t kGroupAVX3 = kAVX512F | kAVX512VL | kAVX512DQ | kAVX512BW;
+#endif
+
+}  // namespace
+
+HWY_NORETURN void HWY_FORMAT(3, 4)
+    Abort(const char* file, int line, const char* format, ...) {
+  char buf[2000];
+  va_list args;
+  va_start(args, format);
+  vsnprintf(buf, sizeof(buf), format, args);
+  va_end(args);
+
+  fprintf(stderr, "Abort at %s:%d: %s\n", file, line, buf);
+#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
+    defined(THREAD_SANITIZER)
+  // If compiled with any sanitizer print a stack trace. This call doesn't crash
+  // the program, instead the trap below will crash it also allowing gdb to
+  // break there.
+  __sanitizer_print_stack_trace();
+#endif  // defined(*_SANITIZER)
+  fflush(stderr);
+
+#if HWY_COMPILER_MSVC
+  abort();  // Compile error without this due to HWY_NORETURN.
+#else
+  __builtin_trap();
+#endif
+}
+
+void DisableTargets(uint32_t disabled_targets) {
+  supported_mask_ = ~(disabled_targets & ~uint32_t(HWY_ENABLED_BASELINE));
+  // We can call Update() here to initialize the mask but that will trigger a
+  // call to SupportedTargets() which we use in tests to tell whether any of the
+  // highway dynamic dispatch functions were used.
+  chosen_target.DeInit();
+}
+
+void SetSupportedTargetsForTest(uint32_t targets) {
+  // Reset the cached supported_ value to 0 to force a re-evaluation in the
+  // next call to SupportedTargets() which will use the mocked value set here
+  // if not zero.
+  supported_.store(0, std::memory_order_release);
+  supported_targets_for_test_ = targets;
+  chosen_target.DeInit();
+}
+
+bool SupportedTargetsCalledForTest() {
+  return supported_.load(std::memory_order_acquire) != 0;
+}
+
+uint32_t SupportedTargets() {
+  uint32_t bits = supported_.load(std::memory_order_acquire);
+  // Already initialized?
+  if (HWY_LIKELY(bits != 0)) {
+    return bits & supported_mask_;
+  }
+
+  // When running tests, this allows to mock the current supported targets.
+  if (HWY_UNLIKELY(supported_targets_for_test_ != 0)) {
+    // Store the value to signal that this was used.
+    supported_.store(supported_targets_for_test_, std::memory_order_release);
+    return supported_targets_for_test_ & supported_mask_;
+  }
+
+  bits = HWY_SCALAR;
+
+#if HWY_ARCH_X86
+  uint32_t flags = 0;
+  uint32_t abcd[4];
+
+  Cpuid(0, 0, abcd);
+  const uint32_t max_level = abcd[0];
+
+  // Standard feature flags
+  Cpuid(1, 0, abcd);
+  flags |= IsBitSet(abcd[3], 25) ? kSSE : 0;
+  flags |= IsBitSet(abcd[3], 26) ? kSSE2 : 0;
+  flags |= IsBitSet(abcd[2], 0) ? kSSE3 : 0;
+  flags |= IsBitSet(abcd[2], 9) ? kSSSE3 : 0;
+  flags |= IsBitSet(abcd[2], 19) ? kSSE41 : 0;
+  flags |= IsBitSet(abcd[2], 20) ? kSSE42 : 0;
+  flags |= IsBitSet(abcd[2], 12) ? kFMA : 0;
+  flags |= IsBitSet(abcd[2], 28) ? kAVX : 0;
+  const bool has_osxsave = IsBitSet(abcd[2], 27);
+
+  // Extended feature flags
+  Cpuid(0x80000001U, 0, abcd);
+  flags |= IsBitSet(abcd[2], 5) ? kLZCNT : 0;
+
+  // Extended features
+  if (max_level >= 7) {
+    Cpuid(7, 0, abcd);
+    flags |= IsBitSet(abcd[1], 3) ? kBMI : 0;
+    flags |= IsBitSet(abcd[1], 5) ? kAVX2 : 0;
+    flags |= IsBitSet(abcd[1], 8) ? kBMI2 : 0;
+
+    flags |= IsBitSet(abcd[1], 16) ? kAVX512F : 0;
+    flags |= IsBitSet(abcd[1], 17) ? kAVX512DQ : 0;
+    flags |= IsBitSet(abcd[1], 30) ? kAVX512BW : 0;
+    flags |= IsBitSet(abcd[1], 31) ? kAVX512VL : 0;
+  }
+
+  // Verify OS support for XSAVE, without which XMM/YMM registers are not
+  // preserved across context switches and are not safe to use.
+  if (has_osxsave) {
+    const uint32_t xcr0 = ReadXCR0();
+    // XMM
+    if (!IsBitSet(xcr0, 1)) {
+      flags = 0;
+    }
+    // YMM
+    if (!IsBitSet(xcr0, 2)) {
+      flags &= ~kGroupAVX2;
+    }
+    // ZMM + opmask
+    if ((xcr0 & 0x70) != 0x70) {
+      flags &= ~kGroupAVX3;
+    }
+  }
+
+  // Set target bit(s) if all their group's flags are all set.
+  if ((flags & kGroupAVX3) == kGroupAVX3) {
+    bits |= HWY_AVX3;
+  }
+  if ((flags & kGroupAVX2) == kGroupAVX2) {
+    bits |= HWY_AVX2;
+  }
+  if ((flags & kGroupSSE4) == kGroupSSE4) {
+    bits |= HWY_SSE4;
+  }
+#else
+  // TODO(janwas): detect for other platforms
+  bits = HWY_ENABLED_BASELINE;
+#endif  // HWY_ARCH_X86
+
+  if ((bits & HWY_ENABLED_BASELINE) != HWY_ENABLED_BASELINE) {
+    fprintf(stderr, "WARNING: CPU supports %zx but software requires %x\n",
+            size_t(bits), HWY_ENABLED_BASELINE);
+  }
+
+  supported_.store(bits, std::memory_order_release);
+  return bits & supported_mask_;
+}
+
+// Declared in targets.h
+ChosenTarget chosen_target;
+
+void ChosenTarget::Update() {
+  // The supported variable contains the current CPU supported targets shifted
+  // to the location expected by the ChosenTarget mask. We enabled SCALAR
+  // regardless of whether it was compiled since it is also used as the
+  // fallback mechanism to the baseline target.
+  uint32_t supported = HWY_CHOSEN_TARGET_SHIFT(hwy::SupportedTargets()) |
+                       HWY_CHOSEN_TARGET_MASK_SCALAR;
+  mask_.store(supported);
+}
+
+}  // namespace hwy
--- a/third_party/highway/hwy/targets.h
+++ b/third_party/highway/hwy/targets.h
@ -0,0 +1,491 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HIGHWAY_HWY_TARGETS_H_
+#define HIGHWAY_HWY_TARGETS_H_
+
+#include <vector>
+
+// For SIMD module implementations and their callers. Defines which targets to
+// generate and call.
+
+#include "hwy/base.h"
+
+//------------------------------------------------------------------------------
+// Optional configuration
+
+// See ../quick_reference.md for documentation of these macros.
+
+// Uncomment to override the default baseline determined from predefined macros:
+// #define HWY_BASELINE_TARGETS (HWY_SSE4 | HWY_SCALAR)
+
+// Uncomment to override the default blocklist:
+// #define HWY_BROKEN_TARGETS HWY_AVX3
+
+// Uncomment to definitely avoid generating those target(s):
+// #define HWY_DISABLED_TARGETS HWY_SSE4
+
+// Uncomment to avoid emitting BMI/BMI2/FMA instructions (allows generating
+// AVX2 target for VMs which support AVX2 but not the other instruction sets)
+// #define HWY_DISABLE_BMI2_FMA
+
+//------------------------------------------------------------------------------
+// Targets
+
+// Unique bit value for each target. A lower value is "better" (e.g. more lanes)
+// than a higher value within the same group/platform - see HWY_STATIC_TARGET.
+//
+// All values are unconditionally defined so we can test HWY_TARGETS without
+// first checking the HWY_ARCH_*.
+//
+// The C99 preprocessor evaluates #if expressions using intmax_t types, so we
+// can use 32-bit literals.
+
+// 1,2,4: reserved
+#define HWY_AVX3 8
+#define HWY_AVX2 16
+// 32: reserved for AVX
+#define HWY_SSE4 64
+// 0x80, 0x100, 0x200: reserved for SSSE3, SSE3, SSE2
+
+// The highest bit in the HWY_TARGETS mask that a x86 target can have. Used for
+// dynamic dispatch. All x86 target bits must be lower or equal to
+// (1 << HWY_HIGHEST_TARGET_BIT_X86) and they can only use
+// HWY_MAX_DYNAMIC_TARGETS in total.
+#define HWY_HIGHEST_TARGET_BIT_X86 9
+
+// 0x400, 0x800, 0x1000 reserved for SVE, SVE2, Helium
+#define HWY_NEON 0x2000
+
+#define HWY_HIGHEST_TARGET_BIT_ARM 13
+
+// 0x4000, 0x8000 reserved
+#define HWY_PPC8 0x10000  // v2.07 or 3
+// 0x20000, 0x40000 reserved for prior VSX/AltiVec
+
+#define HWY_HIGHEST_TARGET_BIT_PPC 18
+
+// 0x80000 reserved
+#define HWY_WASM 0x100000
+
+#define HWY_HIGHEST_TARGET_BIT_WASM 20
+
+// 0x200000, 0x400000, 0x800000 reserved
+
+#define HWY_RVV 0x1000000
+
+#define HWY_HIGHEST_TARGET_BIT_RVV 24
+
+// 0x2000000, 0x4000000, 0x8000000, 0x10000000 reserved
+
+#define HWY_SCALAR 0x20000000
+// Cannot use higher values, otherwise HWY_TARGETS computation might overflow.
+
+//------------------------------------------------------------------------------
+// Set default blocklists
+
+// Disabled means excluded from enabled at user's request. A separate config
+// macro allows disabling without deactivating the blocklist below.
+#ifndef HWY_DISABLED_TARGETS
+#define HWY_DISABLED_TARGETS 0
+#endif
+
+// Broken means excluded from enabled due to known compiler issues. Allow the
+// user to override this blocklist without any guarantee of success.
+#ifndef HWY_BROKEN_TARGETS
+
+// x86 clang-6: we saw multiple AVX2/3 compile errors and in one case invalid
+// SSE4 codegen (msan failure), so disable all those targets.
+#if HWY_ARCH_X86 && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
+// TODO: Disable all non-scalar targets for every build target once we have
+// clang-7 enabled in our builders.
+#ifdef MEMORY_SANITIZER
+#define HWY_BROKEN_TARGETS (HWY_SSE4 | HWY_AVX2 | HWY_AVX3)
+#else
+#define HWY_BROKEN_TARGETS 0
+#endif
+// This entails a major speed reduction, so warn unless the user explicitly
+// opts in to scalar-only.
+#if !defined(HWY_COMPILE_ONLY_SCALAR)
+#pragma message("x86 Clang <= 6: define HWY_COMPILE_ONLY_SCALAR or upgrade.")
+#endif
+
+// MSVC, or 32-bit may fail to compile AVX2/3.
+#elif HWY_COMPILER_MSVC != 0 || HWY_ARCH_X86_32
+#define HWY_BROKEN_TARGETS (HWY_AVX2 | HWY_AVX3)
+#pragma message("Disabling AVX2/3 due to known issues with MSVC/32-bit builds")
+
+#else
+#define HWY_BROKEN_TARGETS 0
+#endif
+
+#endif  // HWY_BROKEN_TARGETS
+
+// Enabled means not disabled nor blocklisted.
+#define HWY_ENABLED(targets) \
+  ((targets) & ~((HWY_DISABLED_TARGETS) | (HWY_BROKEN_TARGETS)))
+
+//------------------------------------------------------------------------------
+// Detect baseline targets using predefined macros
+
+// Baseline means the targets for which the compiler is allowed to generate
+// instructions, implying the target CPU would have to support them. Do not use
+// this directly because it does not take the blocklist into account. Allow the
+// user to override this without any guarantee of success.
+#ifndef HWY_BASELINE_TARGETS
+
+#ifdef __wasm_simd128__
+#define HWY_BASELINE_WASM HWY_WASM
+#else
+#define HWY_BASELINE_WASM 0
+#endif
+
+#ifdef __VSX__
+#define HWY_BASELINE_PPC8 HWY_PPC8
+#else
+#define HWY_BASELINE_PPC8 0
+#endif
+
+// GCC 4.5.4 only defines the former; 5.4 defines both.
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#define HWY_BASELINE_NEON HWY_NEON
+#else
+#define HWY_BASELINE_NEON 0
+#endif
+
+#ifdef __SSE4_1__
+#define HWY_BASELINE_SSE4 HWY_SSE4
+#else
+#define HWY_BASELINE_SSE4 0
+#endif
+
+#ifdef __AVX2__
+#define HWY_BASELINE_AVX2 HWY_AVX2
+#else
+#define HWY_BASELINE_AVX2 0
+#endif
+
+#ifdef __AVX512F__
+#define HWY_BASELINE_AVX3 HWY_AVX3
+#else
+#define HWY_BASELINE_AVX3 0
+#endif
+
+#ifdef __riscv_vector
+#define HWY_BASELINE_RVV HWY_RVV
+#else
+#define HWY_BASELINE_RVV 0
+#endif
+
+#define HWY_BASELINE_TARGETS                                                \
+  (HWY_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 | HWY_BASELINE_NEON | \
+   HWY_BASELINE_SSE4 | HWY_BASELINE_AVX2 | HWY_BASELINE_AVX3 |              \
+   HWY_BASELINE_RVV)
+
+#endif  // HWY_BASELINE_TARGETS
+
+//------------------------------------------------------------------------------
+// Choose target for static dispatch
+
+#define HWY_ENABLED_BASELINE HWY_ENABLED(HWY_BASELINE_TARGETS)
+#if HWY_ENABLED_BASELINE == 0
+#error "At least one baseline target must be defined and enabled"
+#endif
+
+// Best baseline, used for static dispatch. This is the least-significant 1-bit
+// within HWY_ENABLED_BASELINE and lower bit values imply "better".
+#define HWY_STATIC_TARGET (HWY_ENABLED_BASELINE & -HWY_ENABLED_BASELINE)
+
+// Start by assuming static dispatch. If we later use dynamic dispatch, this
+// will be defined to other targets during the multiple-inclusion, and finally
+// return to the initial value. Defining this outside begin/end_target ensures
+// inl headers successfully compile by themselves (required by Bazel).
+#define HWY_TARGET HWY_STATIC_TARGET
+
+//------------------------------------------------------------------------------
+// Choose targets for dynamic dispatch according to one of four policies
+
+#if (defined(HWY_COMPILE_ONLY_SCALAR) + defined(HWY_COMPILE_ONLY_STATIC) + \
+     defined(HWY_COMPILE_ALL_ATTAINABLE)) > 1
+#error "Invalid config: can only define a single policy for targets"
+#endif
+
+// Attainable means enabled and the compiler allows intrinsics (even when not
+// allowed to autovectorize). Used in 3 and 4.
+#if HWY_ARCH_X86
+#define HWY_ATTAINABLE_TARGETS \
+  HWY_ENABLED(HWY_SCALAR | HWY_SSE4 | HWY_AVX2 | HWY_AVX3)
+#else
+#define HWY_ATTAINABLE_TARGETS HWY_ENABLED_BASELINE
+#endif
+
+// 1) For older compilers: disable all SIMD (could also set HWY_DISABLED_TARGETS
+// to ~HWY_SCALAR, but this is more explicit).
+#if defined(HWY_COMPILE_ONLY_SCALAR)
+#undef HWY_STATIC_TARGET
+#define HWY_STATIC_TARGET HWY_SCALAR  // override baseline
+#define HWY_TARGETS HWY_SCALAR
+
+// 2) For forcing static dispatch without code changes (removing HWY_EXPORT)
+#elif defined(HWY_COMPILE_ONLY_STATIC)
+#define HWY_TARGETS HWY_STATIC_TARGET
+
+// 3) For tests: include all attainable targets (in particular: scalar)
+#elif defined(HWY_COMPILE_ALL_ATTAINABLE)
+#define HWY_TARGETS HWY_ATTAINABLE_TARGETS
+
+// 4) Default: attainable WITHOUT non-best baseline. This reduces code size by
+// excluding superseded targets, in particular scalar.
+#else
+
+#define HWY_TARGETS (HWY_ATTAINABLE_TARGETS & (2 * HWY_STATIC_TARGET - 1))
+
+#endif  // target policy
+
+// HWY_ONCE and the multiple-inclusion mechanism rely on HWY_STATIC_TARGET being
+// one of the dynamic targets. This also implies HWY_TARGETS != 0 and
+// (HWY_TARGETS & HWY_ENABLED_BASELINE) != 0.
+#if (HWY_TARGETS & HWY_STATIC_TARGET) == 0
+#error "Logic error: best baseline should be included in dynamic targets"
+#endif
+
+//------------------------------------------------------------------------------
+
+namespace hwy {
+
+// Returns (cached) bitfield of enabled targets that are supported on this CPU.
+// Implemented in supported_targets.cc; unconditionally compiled to support the
+// use case of binary-only distributions. The HWY_SUPPORTED_TARGETS wrapper may
+// allow eliding calls to this function.
+uint32_t SupportedTargets();
+
+// Disable from runtime dispatch the mask of compiled in targets. Targets that
+// were not enabled at compile time are ignored. This function is useful to
+// disable a target supported by the CPU that is known to have bugs or when a
+// lower target is desired. For this reason, attempts to disable targets which
+// are in HWY_ENABLED_BASELINE have no effect so SupportedTargets() always
+// returns at least the baseline target.
+void DisableTargets(uint32_t disabled_targets);
+
+// Single target: reduce code size by eliding the call and conditional branches
+// inside Choose*() functions.
+#if (HWY_TARGETS & (HWY_TARGETS - 1)) == 0
+#define HWY_SUPPORTED_TARGETS HWY_TARGETS
+#else
+#define HWY_SUPPORTED_TARGETS hwy::SupportedTargets()
+#endif
+
+// Set the mock mask of CPU supported targets instead of the actual CPU
+// supported targets computed in SupportedTargets(). The return value of
+// SupportedTargets() will still be affected by the DisabledTargets() mask
+// regardless of this mock, to prevent accidentally adding targets that are
+// known to be buggy in the current CPU. Call with a mask of 0 to disable the
+// mock and use the actual CPU supported targets instead.
+void SetSupportedTargetsForTest(uint32_t targets);
+
+// Returns whether the SupportedTargets() function was called since the last
+// SetSupportedTargetsForTest() call.
+bool SupportedTargetsCalledForTest();
+
+// Return the list of targets in HWY_TARGETS supported by the CPU as a list of
+// individual HWY_* target macros such as HWY_SCALAR or HWY_NEON. This list
+// is affected by the current SetSupportedTargetsForTest() mock if any.
+HWY_INLINE std::vector<uint32_t> SupportedAndGeneratedTargets() {
+  std::vector<uint32_t> ret;
+  for (uint32_t targets = SupportedTargets() & HWY_TARGETS; targets != 0;
+       targets = targets & (targets - 1)) {
+    uint32_t current_target = targets & ~(targets - 1);
+    ret.push_back(current_target);
+  }
+  return ret;
+}
+
+static inline HWY_MAYBE_UNUSED const char* TargetName(uint32_t target) {
+  switch (target) {
+#if HWY_ARCH_X86
+    case HWY_SSE4:
+      return "SSE4";
+    case HWY_AVX2:
+      return "AVX2";
+    case HWY_AVX3:
+      return "AVX3";
+#endif
+
+#if HWY_ARCH_ARM
+    case HWY_NEON:
+      return "Neon";
+#endif
+
+#if HWY_ARCH_PPC
+    case HWY_PPC8:
+      return "Power8";
+#endif
+
+#if HWY_ARCH_WASM
+    case HWY_WASM:
+      return "Wasm";
+#endif
+
+#if HWY_ARCH_RVV
+    case HWY_RVV:
+      return "RVV";
+#endif
+
+    case HWY_SCALAR:
+      return "Scalar";
+
+    default:
+      return "?";
+  }
+}
+
+// The maximum number of dynamic targets on any architecture is defined by
+// HWY_MAX_DYNAMIC_TARGETS and depends on the arch.
+
+// For the ChosenTarget mask and index we use a different bit arrangement than
+// in the HWY_TARGETS mask. Only the targets involved in the current
+// architecture are used in this mask, and therefore only the least significant
+// (HWY_MAX_DYNAMIC_TARGETS + 2) bits of the uint32_t mask are used. The least
+// significant bit is set when the mask is not initialized, the next
+// HWY_MAX_DYNAMIC_TARGETS more significant bits are a range of bits from the
+// HWY_TARGETS or SupportedTargets() mask for the given architecture shifted to
+// that position and the next more significant bit is used for the scalar
+// target. Because of this we need to define equivalent values for HWY_TARGETS
+// in this representation.
+// This mask representation allows to use ctz() on this mask and obtain a small
+// number that's used as an index of the table for dynamic dispatch. In this
+// way the first entry is used when the mask is uninitialized, the following
+// HWY_MAX_DYNAMIC_TARGETS are for dynamic dispatch and the last one is for
+// scalar.
+
+// The HWY_SCALAR bit in the ChosenTarget mask format.
+#define HWY_CHOSEN_TARGET_MASK_SCALAR (1u << (HWY_MAX_DYNAMIC_TARGETS + 1))
+
+// Converts from a HWY_TARGETS mask to a ChosenTarget mask format for the
+// current architecture.
+#define HWY_CHOSEN_TARGET_SHIFT(X)                                    \
+  ((((X) >> (HWY_HIGHEST_TARGET_BIT + 1 - HWY_MAX_DYNAMIC_TARGETS)) & \
+    ((1u << HWY_MAX_DYNAMIC_TARGETS) - 1))                            \
+   << 1)
+
+// The HWY_TARGETS mask in the ChosenTarget mask format.
+#define HWY_CHOSEN_TARGET_MASK_TARGETS \
+  (HWY_CHOSEN_TARGET_SHIFT(HWY_TARGETS) | HWY_CHOSEN_TARGET_MASK_SCALAR | 1u)
+
+#if HWY_ARCH_X86
+// Maximum number of dynamic targets, changing this value is an ABI incompatible
+// change
+#define HWY_MAX_DYNAMIC_TARGETS 10
+#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_X86
+// These must match the order in which the HWY_TARGETS are defined
+// starting by the least significant (HWY_HIGHEST_TARGET_BIT + 1 -
+// HWY_MAX_DYNAMIC_TARGETS) bit. This list must contain exactly
+// HWY_MAX_DYNAMIC_TARGETS elements and does not include SCALAR. The first entry
+// corresponds to the best target. Don't include a "," at the end of the list.
+#define HWY_CHOOSE_TARGET_LIST(func_name)        \
+  nullptr,                        /* reserved */ \
+      nullptr,                    /* reserved */ \
+      nullptr,                    /* reserved */ \
+      HWY_CHOOSE_AVX3(func_name), /* AVX3 */     \
+      HWY_CHOOSE_AVX2(func_name), /* AVX2 */     \
+      nullptr,                    /* AVX */      \
+      HWY_CHOOSE_SSE4(func_name), /* SSE4 */     \
+      nullptr,                    /* SSSE3 */    \
+      nullptr,                    /* SSE3 */     \
+      nullptr                     /* SSE2 */
+
+#endif  // HWY_ARCH_X86
+
+#if HWY_ARCH_ARM
+// See HWY_ARCH_X86 above for details.
+#define HWY_MAX_DYNAMIC_TARGETS 4
+#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_ARM
+#define HWY_CHOOSE_TARGET_LIST(func_name)       \
+  nullptr,                       /* reserved */ \
+      nullptr,                   /* reserved */ \
+      nullptr,                   /* reserved */ \
+      HWY_CHOOSE_NEON(func_name) /* NEON */
+
+#endif  // HWY_ARCH_ARM
+
+#if HWY_ARCH_PPC
+// See HWY_ARCH_X86 above for details.
+#define HWY_MAX_DYNAMIC_TARGETS 5
+#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_PPC
+#define HWY_CHOOSE_TARGET_LIST(func_name)        \
+  nullptr,                        /* reserved */ \
+      nullptr,                    /* reserved */ \
+      HWY_CHOOSE_PPC8(func_name), /* PPC8 */     \
+      nullptr,                    /* VSX */      \
+      nullptr                     /* AltiVec */
+
+#endif  // HWY_ARCH_PPC
+
+#if HWY_ARCH_WASM
+// See HWY_ARCH_X86 above for details.
+#define HWY_MAX_DYNAMIC_TARGETS 4
+#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_WASM
+#define HWY_CHOOSE_TARGET_LIST(func_name)       \
+  nullptr,                       /* reserved */ \
+      nullptr,                   /* reserved */ \
+      nullptr,                   /* reserved */ \
+      HWY_CHOOSE_WASM(func_name) /* WASM */
+
+#endif  // HWY_ARCH_WASM
+
+#if HWY_ARCH_RVV
+// See HWY_ARCH_X86 above for details.
+#define HWY_MAX_DYNAMIC_TARGETS 4
+#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_RVV
+#define HWY_CHOOSE_TARGET_LIST(func_name)       \
+  nullptr,                       /* reserved */ \
+      nullptr,                   /* reserved */ \
+      nullptr,                   /* reserved */ \
+      HWY_CHOOSE_RVV(func_name) /* RVV */
+
+#endif  // HWY_ARCH_RVV
+
+struct ChosenTarget {
+ public:
+  // Update the ChosenTarget mask based on the current CPU supported
+  // targets.
+  void Update();
+
+  // Reset the ChosenTarget to the uninitialized state.
+  void DeInit() { mask_.store(1); }
+
+  // Whether the ChosenTarget was initialized. This is useful to know whether
+  // any HWY_DYNAMIC_DISPATCH function was called.
+  bool IsInitialized() const { return mask_.load() != 1; }
+
+  // Return the index in the dynamic dispatch table to be used by the current
+  // CPU. Note that this method must be in the header file so it uses the value
+  // of HWY_CHOSEN_TARGET_MASK_TARGETS defined in the translation unit that
+  // calls it, which may be different from others. This allows to only consider
+  // those targets that were actually compiled in this module.
+  size_t HWY_INLINE GetIndex() const {
+    return hwy::Num0BitsBelowLS1Bit_Nonzero32(mask_.load() &
+                                              HWY_CHOSEN_TARGET_MASK_TARGETS);
+  }
+
+ private:
+  // Initialized to 1 so GetChosenTargetIndex() returns 0.
+  std::atomic<uint32_t> mask_{1};
+};
+
+extern ChosenTarget chosen_target;
+
+}  // namespace hwy
+
+#endif  // HIGHWAY_HWY_TARGETS_H_
--- a/third_party/highway/hwy/targets_test.cc
+++ b/third_party/highway/hwy/targets_test.cc
@ -0,0 +1,102 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "hwy/targets.h"
+
+#include "hwy/tests/test_util-inl.h"
+
+namespace fake {
+
+#define DECLARE_FUNCTION(TGT)                        \
+  namespace N_##TGT {                                \
+    uint32_t FakeFunction(int) { return HWY_##TGT; } \
+  }
+
+DECLARE_FUNCTION(AVX3)
+DECLARE_FUNCTION(AVX2)
+DECLARE_FUNCTION(SSE4)
+DECLARE_FUNCTION(NEON)
+DECLARE_FUNCTION(PPC8)
+DECLARE_FUNCTION(WASM)
+DECLARE_FUNCTION(RVV)
+DECLARE_FUNCTION(SCALAR)
+
+HWY_EXPORT(FakeFunction);
+
+void CheckFakeFunction() {
+#define CHECK_ARRAY_ENTRY(TGT)                                              \
+  if ((HWY_TARGETS & HWY_##TGT) != 0) {                                     \
+    hwy::SetSupportedTargetsForTest(HWY_##TGT);                             \
+    /* Calling Update() first to make &HWY_DYNAMIC_DISPATCH() return */     \
+    /* the pointer to the already cached function. */                       \
+    hwy::chosen_target.Update();                                            \
+    EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \
+    /* Calling DeInit() will test that the initializer function */          \
+    /* also calls the right function. */                                    \
+    hwy::chosen_target.DeInit();                                            \
+    EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \
+    /* Second call uses the cached value from the previous call. */         \
+    EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \
+  }
+  CHECK_ARRAY_ENTRY(AVX3)
+  CHECK_ARRAY_ENTRY(AVX2)
+  CHECK_ARRAY_ENTRY(SSE4)
+  CHECK_ARRAY_ENTRY(NEON)
+  CHECK_ARRAY_ENTRY(PPC8)
+  CHECK_ARRAY_ENTRY(WASM)
+  CHECK_ARRAY_ENTRY(RVV)
+  CHECK_ARRAY_ENTRY(SCALAR)
+#undef CHECK_ARRAY_ENTRY
+}
+
+}  // namespace fake
+
+namespace hwy {
+
+class HwyTargetsTest : public testing::Test {
+ protected:
+  void TearDown() override {
+    SetSupportedTargetsForTest(0);
+    DisableTargets(0);  // Reset the mask.
+  }
+};
+
+// Test that the order in the HWY_EXPORT static array matches the expected
+// value of the target bits. This is only checked for the targets that are
+// enabled in the current compilation.
+TEST_F(HwyTargetsTest, ChosenTargetOrderTest) { fake::CheckFakeFunction(); }
+
+TEST_F(HwyTargetsTest, DisabledTargetsTest) {
+  DisableTargets(~0u);
+  // Check that the baseline can't be disabled.
+  HWY_ASSERT(HWY_ENABLED_BASELINE == SupportedTargets());
+
+  DisableTargets(0);  // Reset the mask.
+  uint32_t current_targets = SupportedTargets();
+  if ((current_targets & ~HWY_ENABLED_BASELINE) == 0) {
+    // We can't test anything else if the only compiled target is the baseline.
+    return;
+  }
+  // Get the lowest bit in the mask (the best target) and disable that one.
+  uint32_t lowest_target = current_targets & (~current_targets + 1);
+  // The lowest target shouldn't be one in the baseline.
+  HWY_ASSERT((lowest_target & ~HWY_ENABLED_BASELINE) != 0);
+  DisableTargets(lowest_target);
+
+  // Check that the other targets are still enabled.
+  HWY_ASSERT((lowest_target ^ current_targets) == SupportedTargets());
+  DisableTargets(0);  // Reset the mask.
+}
+
+}  // namespace hwy
--- a/third_party/highway/hwy/tests/arithmetic_test.cc
+++ b/third_party/highway/hwy/tests/arithmetic_test.cc
--- a/third_party/highway/hwy/tests/combine_test.cc
+++ b/third_party/highway/hwy/tests/combine_test.cc
@ -0,0 +1,287 @@
+// Copyright 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/combine_test.cc"
+#include "hwy/foreach_target.h"
+
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+// Not yet implemented
+#if HWY_TARGET != HWY_RVV
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestLowerHalf {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const Half<D> d2;
+
+    const size_t N = Lanes(d);
+    auto lanes = AllocateAligned<T>(N);
+    std::fill(lanes.get(), lanes.get() + N, T(0));
+    const auto v = Iota(d, 1);
+    Store(LowerHalf(v), d2, lanes.get());
+    size_t i = 0;
+    for (; i < Lanes(d2); ++i) {
+      HWY_ASSERT_EQ(T(1 + i), lanes[i]);
+    }
+    // Other half remains unchanged
+    for (; i < N; ++i) {
+      HWY_ASSERT_EQ(T(0), lanes[i]);
+    }
+  }
+};
+
+struct TestLowerQuarter {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const Half<Half<D>> d4;
+
+    const size_t N = Lanes(d);
+    auto lanes = AllocateAligned<T>(N);
+    std::fill(lanes.get(), lanes.get() + N, T(0));
+    const auto v = Iota(d, 1);
+    const auto lo = LowerHalf(LowerHalf(v));
+    Store(lo, d4, lanes.get());
+    size_t i = 0;
+    for (; i < Lanes(d4); ++i) {
+      HWY_ASSERT_EQ(T(i + 1), lanes[i]);
+    }
+    // Upper 3/4 remain unchanged
+    for (; i < N; ++i) {
+      HWY_ASSERT_EQ(T(0), lanes[i]);
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllLowerHalf() {
+  constexpr size_t kDiv = 1;
+  ForAllTypes(ForPartialVectors<TestLowerHalf, kDiv, /*kMinLanes=*/2>());
+  ForAllTypes(ForPartialVectors<TestLowerQuarter, kDiv, /*kMinLanes=*/4>());
+}
+
+struct TestUpperHalf {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    // Scalar does not define UpperHalf.
+#if HWY_TARGET != HWY_SCALAR
+    const Half<D> d2;
+
+    const auto v = Iota(d, 1);
+    const size_t N = Lanes(d);
+    auto lanes = AllocateAligned<T>(N);
+    std::fill(lanes.get(), lanes.get() + N, T(0));
+
+    Store(UpperHalf(v), d2, lanes.get());
+    size_t i = 0;
+    for (; i < Lanes(d2); ++i) {
+      HWY_ASSERT_EQ(T(Lanes(d2) + 1 + i), lanes[i]);
+    }
+    // Other half remains unchanged
+    for (; i < N; ++i) {
+      HWY_ASSERT_EQ(T(0), lanes[i]);
+    }
+#else
+    (void)d;
+#endif
+  }
+};
+
+HWY_NOINLINE void TestAllUpperHalf() {
+  ForAllTypes(ForGE128Vectors<TestUpperHalf>());
+}
+
+struct TestZeroExtendVector {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+#if HWY_CAP_GE256
+    const Twice<D> d2;
+
+    const auto v = Iota(d, 1);
+    const size_t N2 = Lanes(d2);
+    auto lanes = AllocateAligned<T>(N2);
+    Store(v, d, &lanes[0]);
+    Store(v, d, &lanes[N2 / 2]);
+
+    const auto ext = ZeroExtendVector(v);
+    Store(ext, d2, lanes.get());
+
+    size_t i = 0;
+    // Lower half is unchanged
+    for (; i < N2 / 2; ++i) {
+      HWY_ASSERT_EQ(T(1 + i), lanes[i]);
+    }
+    // Upper half is zero
+    for (; i < N2; ++i) {
+      HWY_ASSERT_EQ(T(0), lanes[i]);
+    }
+#else
+    (void)d;
+#endif
+  }
+};
+
+HWY_NOINLINE void TestAllZeroExtendVector() {
+  ForAllTypes(ForExtendableVectors<TestZeroExtendVector>());
+}
+
+struct TestCombine {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+#if HWY_CAP_GE256
+    const Twice<D> d2;
+    const size_t N2 = Lanes(d2);
+    auto lanes = AllocateAligned<T>(N2);
+
+    const auto lo = Iota(d, 1);
+    const auto hi = Iota(d, N2 / 2 + 1);
+    const auto combined = Combine(hi, lo);
+    Store(combined, d2, lanes.get());
+
+    const auto expected = Iota(d2, 1);
+    HWY_ASSERT_VEC_EQ(d2, expected, combined);
+#else
+    (void)d;
+#endif
+  }
+};
+
+HWY_NOINLINE void TestAllCombine() {
+  ForAllTypes(ForExtendableVectors<TestCombine>());
+}
+
+
+template <int kBytes>
+struct TestCombineShiftRightBytesR {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T t, D d) {
+// Scalar does not define CombineShiftRightBytes.
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
+    const Repartition<uint8_t, D> d8;
+    const size_t N8 = Lanes(d8);
+    const auto lo = BitCast(d, Iota(d8, 1));
+    const auto hi = BitCast(d, Iota(d8, 1 + N8));
+
+    auto expected = AllocateAligned<T>(Lanes(d));
+    uint8_t* expected_bytes = reinterpret_cast<uint8_t*>(expected.get());
+
+    const size_t kBlockSize = 16;
+    for (size_t i = 0; i < N8; ++i) {
+      const size_t block = i / kBlockSize;
+      const size_t lane = i % kBlockSize;
+      const size_t first_lo = block * kBlockSize;
+      const size_t idx = lane + kBytes;
+      const size_t offset = (idx < kBlockSize) ? 0 : N8 - kBlockSize;
+      const bool at_end = idx >= 2 * kBlockSize;
+      expected_bytes[i] =
+          at_end ? 0 : static_cast<uint8_t>(first_lo + idx + 1 + offset);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(),
+                      CombineShiftRightBytes<kBytes>(hi, lo));
+
+    TestCombineShiftRightBytesR<kBytes - 1>()(t, d);
+#else
+    (void)t;
+    (void)d;
+#endif  // #if HWY_TARGET != HWY_SCALAR
+  }
+};
+
+template <int kLanes>
+struct TestCombineShiftRightLanesR {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T t, D d) {
+// Scalar does not define CombineShiftRightBytes (needed for *Lanes).
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
+    const Repartition<uint8_t, D> d8;
+    const size_t N8 = Lanes(d8);
+    const auto lo = BitCast(d, Iota(d8, 1));
+    const auto hi = BitCast(d, Iota(d8, 1 + N8));
+
+    auto expected = AllocateAligned<T>(Lanes(d));
+
+    uint8_t* expected_bytes = reinterpret_cast<uint8_t*>(expected.get());
+
+    const size_t kBlockSize = 16;
+    for (size_t i = 0; i < N8; ++i) {
+      const size_t block = i / kBlockSize;
+      const size_t lane = i % kBlockSize;
+      const size_t first_lo = block * kBlockSize;
+      const size_t idx = lane + kLanes * sizeof(T);
+      const size_t offset = (idx < kBlockSize) ? 0 : N8 - kBlockSize;
+      const bool at_end = idx >= 2 * kBlockSize;
+      expected_bytes[i] =
+          at_end ? 0 : static_cast<uint8_t>(first_lo + idx + 1 + offset);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(),
+                      CombineShiftRightLanes<kLanes>(hi, lo));
+
+    TestCombineShiftRightBytesR<kLanes - 1>()(t, d);
+#else
+    (void)t;
+    (void)d;
+#endif  // #if HWY_TARGET != HWY_SCALAR
+  }
+};
+
+template <>
+struct TestCombineShiftRightBytesR<0> {
+  template <class T, class D>
+  void operator()(T /*unused*/, D /*unused*/) {}
+};
+
+template <>
+struct TestCombineShiftRightLanesR<0> {
+  template <class T, class D>
+  void operator()(T /*unused*/, D /*unused*/) {}
+};
+
+struct TestCombineShiftRight {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T t, D d) {
+    TestCombineShiftRightBytesR<15>()(t, d);
+    TestCombineShiftRightLanesR<16 / sizeof(T) - 1>()(t, d);
+  }
+};
+
+HWY_NOINLINE void TestAllCombineShiftRight() {
+  ForAllTypes(ForGE128Vectors<TestCombineShiftRight>());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+HWY_BEFORE_TEST(HwyCombineTest);
+HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllLowerHalf);
+HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllUpperHalf);
+HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllZeroExtendVector);
+HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllCombine);
+HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllCombineShiftRight);
+}  // namespace hwy
+#endif
+
+#else
+int main(int, char**) { return 0; }
+#endif  // HWY_TARGET != HWY_RVV
--- a/third_party/highway/hwy/tests/compare_test.cc
+++ b/third_party/highway/hwy/tests/compare_test.cc
@ -0,0 +1,217 @@
+// Copyright 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>  // memset
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/compare_test.cc"
+#include "hwy/foreach_target.h"
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// All types.
+struct TestMask {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    auto lanes = AllocateAligned<T>(N);
+
+    std::fill(lanes.get(), lanes.get() + N, T(0));
+    const auto actual_false = MaskFromVec(Load(d, lanes.get()));
+    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), actual_false);
+
+    memset(lanes.get(), 0xFF, N * sizeof(T));
+    const auto actual_true = MaskFromVec(Load(d, lanes.get()));
+    HWY_ASSERT_MASK_EQ(d, MaskTrue(d), actual_true);
+  }
+};
+
+HWY_NOINLINE void TestAllMask() { ForAllTypes(ForPartialVectors<TestMask>()); }
+
+// All types.
+struct TestEquality {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v2 = Iota(d, 2);
+    const auto v2b = Iota(d, 2);
+    const auto v3 = Iota(d, 3);
+
+    const auto mask_false = MaskFalse(d);
+    const auto mask_true = MaskTrue(d);
+
+    HWY_ASSERT_MASK_EQ(d, mask_false, Eq(v2, v3));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Eq(v2, v2));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Eq(v2, v2b));
+  }
+};
+
+HWY_NOINLINE void TestAllEquality() {
+  ForAllTypes(ForPartialVectors<TestEquality>());
+}
+
+// a > b should be true, verify that for Gt/Lt and with swapped args.
+template <class D>
+void EnsureGreater(D d, TFromD<D> a, TFromD<D> b, const char* file, int line) {
+  const auto mask_false = MaskFalse(d);
+  const auto mask_true = MaskTrue(d);
+
+  const auto va = Set(d, a);
+  const auto vb = Set(d, b);
+  AssertMaskEqual(d, mask_true, Gt(va, vb), file, line);
+  AssertMaskEqual(d, mask_false, Lt(va, vb), file, line);
+
+  // Swapped order
+  AssertMaskEqual(d, mask_false, Gt(vb, va), file, line);
+  AssertMaskEqual(d, mask_true, Lt(vb, va), file, line);
+
+  // Also ensure irreflexive
+  AssertMaskEqual(d, mask_false, Gt(va, va), file, line);
+  AssertMaskEqual(d, mask_false, Gt(vb, vb), file, line);
+  AssertMaskEqual(d, mask_false, Lt(va, va), file, line);
+  AssertMaskEqual(d, mask_false, Lt(vb, vb), file, line);
+}
+
+#define HWY_ENSURE_GREATER(d, a, b) EnsureGreater(d, a, b, __FILE__, __LINE__)
+
+struct TestStrictInt {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const T min = LimitsMin<T>();
+    const T max = LimitsMax<T>();
+    const auto v0 = Zero(d);
+    const auto v2 = And(Iota(d, T(2)), Set(d, 127));  // 0..127
+    const auto vn = Neg(v2) - Set(d, 1);              // -1..-128
+
+    const auto mask_false = MaskFalse(d);
+    const auto mask_true = MaskTrue(d);
+
+    // Individual values of interest
+    HWY_ENSURE_GREATER(d, 2, 1);
+    HWY_ENSURE_GREATER(d, 1, 0);
+    HWY_ENSURE_GREATER(d, 0, -1);
+    HWY_ENSURE_GREATER(d, -1, -2);
+    HWY_ENSURE_GREATER(d, max, max / 2);
+    HWY_ENSURE_GREATER(d, max, 1);
+    HWY_ENSURE_GREATER(d, max, 0);
+    HWY_ENSURE_GREATER(d, max, -1);
+    HWY_ENSURE_GREATER(d, max, min);
+    HWY_ENSURE_GREATER(d, 0, min);
+    HWY_ENSURE_GREATER(d, min / 2, min);
+
+    // Also use Iota to ensure lanes are independent
+    HWY_ASSERT_MASK_EQ(d, mask_true, Gt(v2, vn));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Lt(vn, v2));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, vn));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(vn, v2));
+
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v0, v0));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, v2));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(vn, vn));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v0, v0));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v2, v2));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(vn, vn));
+  }
+};
+
+HWY_NOINLINE void TestAllStrictInt() {
+  ForSignedTypes(ForExtendableVectors<TestStrictInt>());
+}
+
+struct TestStrictFloat {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const T huge_neg = -1E35;
+    const T huge_pos = 1E36;
+    const auto v0 = Zero(d);
+    const auto v2 = Iota(d, T(2));
+    const auto vn = Neg(v2);
+
+    const auto mask_false = MaskFalse(d);
+    const auto mask_true = MaskTrue(d);
+
+    // Individual values of interest
+    HWY_ENSURE_GREATER(d, 2, 1);
+    HWY_ENSURE_GREATER(d, 1, 0);
+    HWY_ENSURE_GREATER(d, 0, -1);
+    HWY_ENSURE_GREATER(d, -1, -2);
+    HWY_ENSURE_GREATER(d, huge_pos, 1);
+    HWY_ENSURE_GREATER(d, huge_pos, 0);
+    HWY_ENSURE_GREATER(d, huge_pos, -1);
+    HWY_ENSURE_GREATER(d, huge_pos, huge_neg);
+    HWY_ENSURE_GREATER(d, 0, huge_neg);
+
+    // Also use Iota to ensure lanes are independent
+    HWY_ASSERT_MASK_EQ(d, mask_true, Gt(v2, vn));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Lt(vn, v2));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, vn));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(vn, v2));
+
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v0, v0));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, v2));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(vn, vn));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v0, v0));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v2, v2));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(vn, vn));
+  }
+};
+
+HWY_NOINLINE void TestAllStrictFloat() {
+  ForFloatTypes(ForExtendableVectors<TestStrictFloat>());
+}
+
+struct TestWeakFloat {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v2 = Iota(d, 2);
+    const auto vn = Iota(d, -T(Lanes(d)));
+
+    const auto mask_false = MaskFalse(d);
+    const auto mask_true = MaskTrue(d);
+
+    HWY_ASSERT_MASK_EQ(d, mask_true, Ge(v2, v2));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Le(vn, vn));
+
+    HWY_ASSERT_MASK_EQ(d, mask_true, Ge(v2, vn));
+    HWY_ASSERT_MASK_EQ(d, mask_true, Le(vn, v2));
+
+    HWY_ASSERT_MASK_EQ(d, mask_false, Le(v2, vn));
+    HWY_ASSERT_MASK_EQ(d, mask_false, Ge(vn, v2));
+  }
+};
+
+HWY_NOINLINE void TestAllWeakFloat() {
+  ForFloatTypes(ForPartialVectors<TestWeakFloat>());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+HWY_BEFORE_TEST(HwyCompareTest);
+HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllMask);
+HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllEquality);
+HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictInt);
+HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictFloat);
+HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllWeakFloat);
+}  // namespace hwy
+#endif
--- a/third_party/highway/hwy/tests/convert_test.cc
+++ b/third_party/highway/hwy/tests/convert_test.cc
@ -0,0 +1,568 @@
+// Copyright 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/convert_test.cc"
+#include "hwy/foreach_target.h"
+
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// Cast and ensure bytes are the same. Called directly from TestAllBitCast or
+// via TestBitCastFrom.
+template <typename ToT>
+struct TestBitCast {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const Repartition<ToT, D> dto;
+    HWY_ASSERT_EQ(Lanes(d) * sizeof(T), Lanes(dto) * sizeof(ToT));
+    const auto vf = Iota(d, 1);
+    const auto vt = BitCast(dto, vf);
+    // Must return the same bits
+    auto from_lanes = AllocateAligned<T>(Lanes(d));
+    auto to_lanes = AllocateAligned<ToT>(Lanes(dto));
+    Store(vf, d, from_lanes.get());
+    Store(vt, dto, to_lanes.get());
+    HWY_ASSERT(
+        BytesEqual(from_lanes.get(), to_lanes.get(), Lanes(d) * sizeof(T)));
+  }
+};
+
+// From D to all types.
+struct TestBitCastFrom {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T t, D d) {
+    TestBitCast<uint8_t>()(t, d);
+    TestBitCast<uint16_t>()(t, d);
+    TestBitCast<uint32_t>()(t, d);
+#if HWY_CAP_INTEGER64
+    TestBitCast<uint64_t>()(t, d);
+#endif
+    TestBitCast<int8_t>()(t, d);
+    TestBitCast<int16_t>()(t, d);
+    TestBitCast<int32_t>()(t, d);
+#if HWY_CAP_INTEGER64
+    TestBitCast<int64_t>()(t, d);
+#endif
+    TestBitCast<float>()(t, d);
+#if HWY_CAP_FLOAT64
+    TestBitCast<double>()(t, d);
+#endif
+  }
+};
+
+HWY_NOINLINE void TestAllBitCast() {
+  // For HWY_SCALAR and partial vectors, we can only cast to same-sized types:
+  // the former can't partition its single lane, and the latter can be smaller
+  // than a destination type.
+  const ForPartialVectors<TestBitCast<uint8_t>> to_u8;
+  to_u8(uint8_t());
+  to_u8(int8_t());
+
+  const ForPartialVectors<TestBitCast<int8_t>> to_i8;
+  to_i8(uint8_t());
+  to_i8(int8_t());
+
+  const ForPartialVectors<TestBitCast<uint16_t>> to_u16;
+  to_u16(uint16_t());
+  to_u16(int16_t());
+
+  const ForPartialVectors<TestBitCast<int16_t>> to_i16;
+  to_i16(uint16_t());
+  to_i16(int16_t());
+
+  const ForPartialVectors<TestBitCast<uint32_t>> to_u32;
+  to_u32(uint32_t());
+  to_u32(int32_t());
+  to_u32(float());
+
+  const ForPartialVectors<TestBitCast<int32_t>> to_i32;
+  to_i32(uint32_t());
+  to_i32(int32_t());
+  to_i32(float());
+
+#if HWY_CAP_INTEGER64
+  const ForPartialVectors<TestBitCast<uint64_t>> to_u64;
+  to_u64(uint64_t());
+  to_u64(int64_t());
+#if HWY_CAP_FLOAT64
+  to_u64(double());
+#endif
+
+  const ForPartialVectors<TestBitCast<int64_t>> to_i64;
+  to_i64(uint64_t());
+  to_i64(int64_t());
+#if HWY_CAP_FLOAT64
+  to_i64(double());
+#endif
+#endif  // HWY_CAP_INTEGER64
+
+  const ForPartialVectors<TestBitCast<float>> to_float;
+  to_float(uint32_t());
+  to_float(int32_t());
+  to_float(float());
+
+#if HWY_CAP_FLOAT64
+  const ForPartialVectors<TestBitCast<double>> to_double;
+  to_double(double());
+#if HWY_CAP_INTEGER64
+  to_double(uint64_t());
+  to_double(int64_t());
+#endif  // HWY_CAP_INTEGER64
+#endif  // HWY_CAP_FLOAT64
+
+  // For non-scalar vectors, we can cast all types to all.
+  ForAllTypes(ForGE128Vectors<TestBitCastFrom>());
+}
+
+template <typename ToT>
+struct TestPromoteTo {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D from_d) {
+    static_assert(sizeof(T) < sizeof(ToT), "Input type must be narrower");
+    const Rebind<ToT, D> to_d;
+
+    const size_t N = Lanes(from_d);
+    auto from = AllocateAligned<T>(N);
+    auto expected = AllocateAligned<ToT>(N);
+
+    RandomState rng;
+    for (size_t rep = 0; rep < 200; ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        const uint64_t bits = rng();
+        memcpy(&from[i], &bits, sizeof(T));
+        expected[i] = from[i];
+      }
+
+      HWY_ASSERT_VEC_EQ(to_d, expected.get(),
+                        PromoteTo(to_d, Load(from_d, from.get())));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllPromoteTo() {
+  const ForPartialVectors<TestPromoteTo<uint16_t>, 2> to_u16div2;
+  to_u16div2(uint8_t());
+
+  const ForPartialVectors<TestPromoteTo<uint32_t>, 4> to_u32div4;
+  to_u32div4(uint8_t());
+
+  const ForPartialVectors<TestPromoteTo<uint32_t>, 2> to_u32div2;
+  to_u32div2(uint16_t());
+
+  const ForPartialVectors<TestPromoteTo<int16_t>, 2> to_i16div2;
+  to_i16div2(uint8_t());
+  to_i16div2(int8_t());
+
+  const ForPartialVectors<TestPromoteTo<int32_t>, 2> to_i32div2;
+  to_i32div2(uint16_t());
+  to_i32div2(int16_t());
+
+  const ForPartialVectors<TestPromoteTo<int32_t>, 4> to_i32div4;
+  to_i32div4(uint8_t());
+  to_i32div4(int8_t());
+
+  // Must test f16 separately because we can only load/store/convert them.
+
+#if HWY_CAP_INTEGER64
+  const ForPartialVectors<TestPromoteTo<uint64_t>, 2> to_u64div2;
+  to_u64div2(uint32_t());
+
+  const ForPartialVectors<TestPromoteTo<int64_t>, 2> to_i64div2;
+  to_i64div2(int32_t());
+#endif
+
+#if HWY_CAP_FLOAT64
+  const ForPartialVectors<TestPromoteTo<double>, 2> to_f64div2;
+  to_f64div2(int32_t());
+  to_f64div2(float());
+#endif
+}
+
+template <typename T, HWY_IF_FLOAT(T)>
+bool IsFinite(T t) {
+  return std::isfinite(t);
+}
+// Wrapper avoids calling std::isfinite for integer types (ambiguous).
+template <typename T, HWY_IF_NOT_FLOAT(T)>
+bool IsFinite(T /*unused*/) {
+  return true;
+}
+
+template <typename ToT>
+struct TestDemoteTo {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D from_d) {
+    static_assert(!IsFloat<ToT>(), "Use TestDemoteToFloat for float output");
+    static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider");
+    const Rebind<ToT, D> to_d;
+
+    const size_t N = Lanes(from_d);
+    auto from = AllocateAligned<T>(N);
+    auto expected = AllocateAligned<ToT>(N);
+
+    // Narrower range in the wider type, for clamping before we cast
+    const T min = LimitsMin<ToT>();
+    const T max = LimitsMax<ToT>();
+
+    RandomState rng;
+    for (size_t rep = 0; rep < 1000; ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        do {
+          const uint64_t bits = rng();
+          memcpy(&from[i], &bits, sizeof(T));
+        } while (!IsFinite(from[i]));
+        expected[i] = static_cast<ToT>(std::min(std::max(min, from[i]), max));
+      }
+
+      HWY_ASSERT_VEC_EQ(to_d, expected.get(),
+                        DemoteTo(to_d, Load(from_d, from.get())));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllDemoteToInt() {
+  ForDemoteVectors<TestDemoteTo<uint8_t>, 2>()(int16_t());
+  ForDemoteVectors<TestDemoteTo<uint8_t>, 4>()(int32_t());
+
+  ForDemoteVectors<TestDemoteTo<int8_t>, 2>()(int16_t());
+  ForDemoteVectors<TestDemoteTo<int8_t>, 4>()(int32_t());
+
+  const ForDemoteVectors<TestDemoteTo<uint16_t>, 2> to_u16;
+  to_u16(int32_t());
+
+  const ForDemoteVectors<TestDemoteTo<int16_t>, 2> to_i16;
+  to_i16(int32_t());
+}
+
+HWY_NOINLINE void TestAllDemoteToMixed() {
+#if HWY_CAP_FLOAT64
+  const ForDemoteVectors<TestDemoteTo<int32_t>, 2> to_i32;
+  to_i32(double());
+#endif
+}
+
+template <typename ToT>
+struct TestDemoteToFloat {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D from_d) {
+    // For floats, we clamp differently and cannot call LimitsMin.
+    static_assert(IsFloat<ToT>(), "Use TestDemoteTo for integer output");
+    static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider");
+    const Rebind<ToT, D> to_d;
+
+    const size_t N = Lanes(from_d);
+    auto from = AllocateAligned<T>(N);
+    auto expected = AllocateAligned<ToT>(N);
+
+    RandomState rng;
+    for (size_t rep = 0; rep < 1000; ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        do {
+          const uint64_t bits = rng();
+          memcpy(&from[i], &bits, sizeof(T));
+        } while (!IsFinite(from[i]));
+        const T magn = std::abs(from[i]);
+        const T max_abs = HighestValue<ToT>();
+        // NOTE: std:: version from C++11 cmath is not defined in RVV GCC, see
+        // https://lists.freebsd.org/pipermail/freebsd-current/2014-January/048130.html
+        const T clipped = copysign(std::min(magn, max_abs), from[i]);
+        expected[i] = static_cast<ToT>(clipped);
+      }
+
+      HWY_ASSERT_VEC_EQ(to_d, expected.get(),
+                        DemoteTo(to_d, Load(from_d, from.get())));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllDemoteToFloat() {
+  // Must test f16 separately because we can only load/store/convert them.
+
+#if HWY_CAP_FLOAT64
+  const ForDemoteVectors<TestDemoteToFloat<float>, 2> to_float;
+  to_float(double());
+#endif
+}
+
+template <class D>
+AlignedFreeUniquePtr<float[]> F16TestCases(D d, size_t& padded) {
+  const float test_cases[] = {
+      // +/- 1
+      1.0f, -1.0f,
+      // +/- 0
+      0.0f, -0.0f,
+      // near 0
+      0.25f, -0.25f,
+      // +/- integer
+      4.0f, -32.0f,
+      // positive near limit
+      65472.0f, 65504.0f,
+      // negative near limit
+      -65472.0f, -65504.0f,
+      // positive +/- delta
+      2.00390625f, 3.99609375f,
+      // negative +/- delta
+      -2.00390625f, -3.99609375f,
+      // No infinity/NaN - implementation-defined due to ARM.
+  };
+  const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
+  const size_t N = Lanes(d);
+  padded = RoundUpTo(kNumTestCases, N);  // allow loading whole vectors
+  auto in = AllocateAligned<float>(padded);
+  auto expected = AllocateAligned<float>(padded);
+  std::copy(test_cases, test_cases + kNumTestCases, in.get());
+  std::fill(in.get() + kNumTestCases, in.get() + padded, 0.0f);
+  return in;
+}
+
+struct TestF16 {
+  template <typename TF32, class DF32>
+  HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) {
+    size_t padded;
+    auto in = F16TestCases(d32, padded);
+    using TF16 = float16_t;
+    const Rebind<TF16, DF32> d16;
+    const size_t N = Lanes(d32);  // same count for f16
+    auto temp16 = AllocateAligned<TF16>(N);
+
+    for (size_t i = 0; i < padded; i += N) {
+      const auto loaded = Load(d32, &in[i]);
+      Store(DemoteTo(d16, loaded), d16, temp16.get());
+      HWY_ASSERT_VEC_EQ(d32, loaded, PromoteTo(d32, Load(d16, temp16.get())));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllF16() { ForDemoteVectors<TestF16, 2>()(float()); }
+
+struct TestConvertU8 {
+  template <typename T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, const D du32) {
+    const Rebind<uint8_t, D> du8;
+    auto lanes8 = AllocateAligned<uint8_t>(Lanes(du8));
+    Store(Iota(du8, 0), du8, lanes8.get());
+    HWY_ASSERT_VEC_EQ(du8, Iota(du8, 0), U8FromU32(Iota(du32, 0)));
+    HWY_ASSERT_VEC_EQ(du8, Iota(du8, 0x7F), U8FromU32(Iota(du32, 0x7F)));
+  }
+};
+
+HWY_NOINLINE void TestAllConvertU8() {
+  ForDemoteVectors<TestConvertU8, 4>()(uint32_t());
+}
+
+// Separate function to attempt to work around a compiler bug on ARM: when this
+// is merged with TestIntFromFloat, outputs match a previous Iota(-(N+1)) input.
+struct TestIntFromFloatHuge {
+  template <typename TF, class DF>
+  HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
+    // Still does not work, although ARMv7 manual says that float->int
+    // saturates, i.e. chooses the nearest representable value.
+#if HWY_TARGET != HWY_NEON
+    using TI = MakeSigned<TF>;
+    const Rebind<TI, DF> di;
+
+    // Huge positive (lvalue works around GCC bug, tested with 10.2.1, where
+    // the expected i32 value is otherwise 0x80..00).
+    const auto expected_max = Set(di, LimitsMax<TI>());
+    HWY_ASSERT_VEC_EQ(di, expected_max, ConvertTo(di, Set(df, TF(1E20))));
+
+    // Huge negative (also lvalue for safety, but GCC bug was not triggered)
+    const auto expected_min = Set(di, LimitsMin<TI>());
+    HWY_ASSERT_VEC_EQ(di, expected_min, ConvertTo(di, Set(df, TF(-1E20))));
+#else
+    (void)df;
+#endif
+  }
+};
+
+struct TestIntFromFloat {
+  template <typename TF, class DF>
+  HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
+    using TI = MakeSigned<TF>;
+    const Rebind<TI, DF> di;
+    const size_t N = Lanes(df);
+
+    // Integer positive
+    HWY_ASSERT_VEC_EQ(di, Iota(di, TI(4)), ConvertTo(di, Iota(df, TF(4.0))));
+
+    // Integer negative
+    HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), ConvertTo(di, Iota(df, -TF(N))));
+
+    // Above positive
+    HWY_ASSERT_VEC_EQ(di, Iota(di, TI(2)), ConvertTo(di, Iota(df, TF(2.001))));
+
+    // Below positive
+    HWY_ASSERT_VEC_EQ(di, Iota(di, TI(3)), ConvertTo(di, Iota(df, TF(3.9999))));
+
+    const TF eps = static_cast<TF>(0.0001);
+    // Above negative
+    HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)),
+                      ConvertTo(di, Iota(df, -TF(N + 1) + eps)));
+
+    // Below negative
+    HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N + 1)),
+                      ConvertTo(di, Iota(df, -TF(N + 1) - eps)));
+
+    // TF does not have enough precision to represent TI.
+    const double min = static_cast<double>(LimitsMin<TI>());
+    const double max = static_cast<double>(LimitsMax<TI>());
+
+    // Also check random values.
+    auto from = AllocateAligned<TF>(N);
+    auto expected = AllocateAligned<TI>(N);
+    RandomState rng;
+    for (size_t rep = 0; rep < 1000; ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        do {
+          const uint64_t bits = rng();
+          memcpy(&from[i], &bits, sizeof(TF));
+        } while (!std::isfinite(from[i]));
+        if (from[i] >= max) {
+          expected[i] = LimitsMax<TI>();
+        } else if (from[i] <= min) {
+          expected[i] = LimitsMin<TI>();
+        } else {
+          expected[i] = static_cast<TI>(from[i]);
+        }
+      }
+
+      HWY_ASSERT_VEC_EQ(di, expected.get(),
+                        ConvertTo(di, Load(df, from.get())));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllIntFromFloat() {
+  ForFloatTypes(ForPartialVectors<TestIntFromFloatHuge>());
+  ForFloatTypes(ForPartialVectors<TestIntFromFloat>());
+}
+
+struct TestFloatFromInt {
+  template <typename TI, class DI>
+  HWY_NOINLINE void operator()(TI /*unused*/, const DI di) {
+    using TF = MakeFloat<TI>;
+    const Rebind<TF, DI> df;
+    const size_t N = Lanes(df);
+
+    // Integer positive
+    HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4.0)), ConvertTo(df, Iota(di, TI(4))));
+
+    // Integer negative
+    HWY_ASSERT_VEC_EQ(df, Iota(df, -TF(N)), ConvertTo(df, Iota(di, -TI(N))));
+
+    // Max positive
+    HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMax<TI>())),
+                      ConvertTo(df, Set(di, LimitsMax<TI>())));
+
+    // Min negative
+    HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMin<TI>())),
+                      ConvertTo(df, Set(di, LimitsMin<TI>())));
+  }
+};
+
+HWY_NOINLINE void TestAllFloatFromInt() {
+  ForPartialVectors<TestFloatFromInt>()(int32_t());
+#if HWY_CAP_FLOAT64 && HWY_CAP_INTEGER64
+  ForPartialVectors<TestFloatFromInt>()(int64_t());
+#endif
+}
+
+struct TestI32F64 {
+  template <typename TF, class DF>
+  HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
+    using TI = int32_t;
+    const Rebind<TI, DF> di;
+    const size_t N = Lanes(df);
+
+    // Integer positive
+    HWY_ASSERT_VEC_EQ(di, Iota(di, TI(4)), DemoteTo(di, Iota(df, TF(4.0))));
+    HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4.0)), PromoteTo(df, Iota(di, TI(4))));
+
+    // Integer negative
+    HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), DemoteTo(di, Iota(df, -TF(N))));
+    HWY_ASSERT_VEC_EQ(df, Iota(df, -TF(N)), PromoteTo(df, Iota(di, -TI(N))));
+
+    // Above positive
+    HWY_ASSERT_VEC_EQ(di, Iota(di, TI(2)), DemoteTo(di, Iota(df, TF(2.001))));
+    HWY_ASSERT_VEC_EQ(df, Iota(df, TF(2.0)), PromoteTo(df, Iota(di, TI(2))));
+
+    // Below positive
+    HWY_ASSERT_VEC_EQ(di, Iota(di, TI(3)), DemoteTo(di, Iota(df, TF(3.9999))));
+    HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4.0)), PromoteTo(df, Iota(di, TI(4))));
+
+    const TF eps = static_cast<TF>(0.0001);
+    // Above negative
+    HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)),
+                      DemoteTo(di, Iota(df, -TF(N + 1) + eps)));
+    HWY_ASSERT_VEC_EQ(df, Iota(df, TF(-4.0)), PromoteTo(df, Iota(di, TI(-4))));
+
+    // Below negative
+    HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N + 1)),
+                      DemoteTo(di, Iota(df, -TF(N + 1) - eps)));
+    HWY_ASSERT_VEC_EQ(df, Iota(df, TF(-2.0)), PromoteTo(df, Iota(di, TI(-2))));
+
+    // Huge positive float
+    HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMax<TI>()),
+                      DemoteTo(di, Set(df, TF(1E12))));
+
+    // Huge negative float
+    HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMin<TI>()),
+                      DemoteTo(di, Set(df, TF(-1E12))));
+
+    // Max positive int
+    HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMax<TI>())),
+                      PromoteTo(df, Set(di, LimitsMax<TI>())));
+
+    // Min negative int
+    HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMin<TI>())),
+                      PromoteTo(df, Set(di, LimitsMin<TI>())));
+  }
+};
+
+HWY_NOINLINE void TestAllI32F64() {
+#if HWY_CAP_FLOAT64
+  ForDemoteVectors<TestI32F64, 2>()(double());
+#endif
+}
+
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+HWY_BEFORE_TEST(HwyConvertTest);
+HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllBitCast);
+HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllPromoteTo);
+HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllDemoteToInt);
+HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllDemoteToMixed);
+HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllDemoteToFloat);
+HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllF16);
+HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllConvertU8);
+HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllIntFromFloat);
+HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllFloatFromInt);
+HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllI32F64);
+}  // namespace hwy
+#endif
--- a/third_party/highway/hwy/tests/list_targets.cc
+++ b/third_party/highway/hwy/tests/list_targets.cc
@ -0,0 +1,34 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Simple tool to print the list of targets that were compiled in when building
+// this tool.
+
+#include <stdio.h>
+
+#include "hwy/highway.h"
+
+void PrintTargets(const char* msg, uint32_t targets) {
+  fprintf(stderr, "%s", msg);
+  for (unsigned x = targets; x != 0; x = x & (x - 1)) {
+    fprintf(stderr, " %s", hwy::TargetName(x & (~x + 1)));
+  }
+  fprintf(stderr, "\n");
+}
+
+int main() {
+  PrintTargets("Compiled HWY_TARGETS:", HWY_TARGETS);
+  PrintTargets("HWY_BASELINE_TARGETS:", HWY_BASELINE_TARGETS);
+  return 0;
+}
--- a/third_party/highway/hwy/tests/logical_test.cc
+++ b/third_party/highway/hwy/tests/logical_test.cc
@ -0,0 +1,691 @@
+// Copyright 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>  // memcmp
+
+#include "hwy/base.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/logical_test.cc"
+#include "hwy/foreach_target.h"
+
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestLogicalInteger {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v0 = Zero(d);
+    const auto vi = Iota(d, 0);
+    const auto ones = VecFromMask(d, Eq(v0, v0));
+    const auto v1 = Set(d, 1);
+    const auto vnot1 = Set(d, ~T(1));
+
+    HWY_ASSERT_VEC_EQ(d, v0, Not(ones));
+    HWY_ASSERT_VEC_EQ(d, ones, Not(v0));
+    HWY_ASSERT_VEC_EQ(d, v1, Not(vnot1));
+    HWY_ASSERT_VEC_EQ(d, vnot1, Not(v1));
+
+    HWY_ASSERT_VEC_EQ(d, v0, And(v0, vi));
+    HWY_ASSERT_VEC_EQ(d, v0, And(vi, v0));
+    HWY_ASSERT_VEC_EQ(d, vi, And(vi, vi));
+
+    HWY_ASSERT_VEC_EQ(d, vi, Or(v0, vi));
+    HWY_ASSERT_VEC_EQ(d, vi, Or(vi, v0));
+    HWY_ASSERT_VEC_EQ(d, vi, Or(vi, vi));
+
+    HWY_ASSERT_VEC_EQ(d, vi, Xor(v0, vi));
+    HWY_ASSERT_VEC_EQ(d, vi, Xor(vi, v0));
+    HWY_ASSERT_VEC_EQ(d, v0, Xor(vi, vi));
+
+    HWY_ASSERT_VEC_EQ(d, vi, AndNot(v0, vi));
+    HWY_ASSERT_VEC_EQ(d, v0, AndNot(vi, v0));
+    HWY_ASSERT_VEC_EQ(d, v0, AndNot(vi, vi));
+
+    auto v = vi;
+    v = And(v, vi);
+    HWY_ASSERT_VEC_EQ(d, vi, v);
+    v = And(v, v0);
+    HWY_ASSERT_VEC_EQ(d, v0, v);
+
+    v = Or(v, vi);
+    HWY_ASSERT_VEC_EQ(d, vi, v);
+    v = Or(v, v0);
+    HWY_ASSERT_VEC_EQ(d, vi, v);
+
+    v = Xor(v, vi);
+    HWY_ASSERT_VEC_EQ(d, v0, v);
+    v = Xor(v, v0);
+    HWY_ASSERT_VEC_EQ(d, v0, v);
+  }
+};
+
+HWY_NOINLINE void TestAllLogicalInteger() {
+  ForIntegerTypes(ForPartialVectors<TestLogicalInteger>());
+}
+
+struct TestLogicalFloat {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v0 = Zero(d);
+    const auto vi = Iota(d, 0);
+
+    HWY_ASSERT_VEC_EQ(d, v0, And(v0, vi));
+    HWY_ASSERT_VEC_EQ(d, v0, And(vi, v0));
+    HWY_ASSERT_VEC_EQ(d, vi, And(vi, vi));
+
+    HWY_ASSERT_VEC_EQ(d, vi, Or(v0, vi));
+    HWY_ASSERT_VEC_EQ(d, vi, Or(vi, v0));
+    HWY_ASSERT_VEC_EQ(d, vi, Or(vi, vi));
+
+    HWY_ASSERT_VEC_EQ(d, vi, Xor(v0, vi));
+    HWY_ASSERT_VEC_EQ(d, vi, Xor(vi, v0));
+    HWY_ASSERT_VEC_EQ(d, v0, Xor(vi, vi));
+
+    HWY_ASSERT_VEC_EQ(d, vi, AndNot(v0, vi));
+    HWY_ASSERT_VEC_EQ(d, v0, AndNot(vi, v0));
+    HWY_ASSERT_VEC_EQ(d, v0, AndNot(vi, vi));
+
+    auto v = vi;
+    v = And(v, vi);
+    HWY_ASSERT_VEC_EQ(d, vi, v);
+    v = And(v, v0);
+    HWY_ASSERT_VEC_EQ(d, v0, v);
+
+    v = Or(v, vi);
+    HWY_ASSERT_VEC_EQ(d, vi, v);
+    v = Or(v, v0);
+    HWY_ASSERT_VEC_EQ(d, vi, v);
+
+    v = Xor(v, vi);
+    HWY_ASSERT_VEC_EQ(d, v0, v);
+    v = Xor(v, v0);
+    HWY_ASSERT_VEC_EQ(d, v0, v);
+  }
+};
+
+HWY_NOINLINE void TestAllLogicalFloat() {
+  ForFloatTypes(ForPartialVectors<TestLogicalFloat>());
+}
+
+struct TestCopySign {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v0 = Zero(d);
+    const auto vp = Iota(d, 1);
+    const auto vn = Iota(d, T(-1E5));  // assumes N < 10^5
+
+    // Zero remains zero regardless of sign
+    HWY_ASSERT_VEC_EQ(d, v0, CopySign(v0, v0));
+    HWY_ASSERT_VEC_EQ(d, v0, CopySign(v0, vp));
+    HWY_ASSERT_VEC_EQ(d, v0, CopySign(v0, vn));
+    HWY_ASSERT_VEC_EQ(d, v0, CopySignToAbs(v0, v0));
+    HWY_ASSERT_VEC_EQ(d, v0, CopySignToAbs(v0, vp));
+    HWY_ASSERT_VEC_EQ(d, v0, CopySignToAbs(v0, vn));
+
+    // Positive input, positive sign => unchanged
+    HWY_ASSERT_VEC_EQ(d, vp, CopySign(vp, vp));
+    HWY_ASSERT_VEC_EQ(d, vp, CopySignToAbs(vp, vp));
+
+    // Positive input, negative sign => negated
+    HWY_ASSERT_VEC_EQ(d, Neg(vp), CopySign(vp, vn));
+    HWY_ASSERT_VEC_EQ(d, Neg(vp), CopySignToAbs(vp, vn));
+
+    // Negative input, negative sign => unchanged
+    HWY_ASSERT_VEC_EQ(d, vn, CopySign(vn, vn));
+
+    // Negative input, positive sign => negated
+    HWY_ASSERT_VEC_EQ(d, Neg(vn), CopySign(vn, vp));
+  }
+};
+
+HWY_NOINLINE void TestAllCopySign() {
+  ForFloatTypes(ForPartialVectors<TestCopySign>());
+}
+
+struct TestIfThenElse {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    RandomState rng;
+
+    const size_t N = Lanes(d);
+    auto in1 = AllocateAligned<T>(N);
+    auto in2 = AllocateAligned<T>(N);
+    auto mask_lanes = AllocateAligned<T>(N);
+    auto expected = AllocateAligned<T>(N);
+
+    // NOTE: reverse polarity (mask is true iff lane == 0) because we cannot
+    // reliably compare against all bits set (NaN for float types).
+    const T off = 1;
+
+    // Each lane should have a chance of having mask=true.
+    for (size_t rep = 0; rep < 50; ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        in1[i] = static_cast<T>(Random32(&rng));
+        in2[i] = static_cast<T>(Random32(&rng));
+        mask_lanes[i] = (Random32(&rng) & 1024) ? off : T(0);
+      }
+
+      const auto v1 = Load(d, in1.get());
+      const auto v2 = Load(d, in2.get());
+      const auto mask = Eq(Load(d, mask_lanes.get()), Zero(d));
+
+      for (size_t i = 0; i < N; ++i) {
+        expected[i] = (mask_lanes[i] == off) ? in2[i] : in1[i];
+      }
+      HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenElse(mask, v1, v2));
+
+      for (size_t i = 0; i < N; ++i) {
+        expected[i] = mask_lanes[i] ? T(0) : in1[i];
+      }
+      HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenElseZero(mask, v1));
+
+      for (size_t i = 0; i < N; ++i) {
+        expected[i] = mask_lanes[i] ? in2[i] : T(0);
+      }
+      HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenZeroElse(mask, v2));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllIfThenElse() {
+  ForAllTypes(ForPartialVectors<TestIfThenElse>());
+}
+
+struct TestMaskVec {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    RandomState rng;
+
+    const size_t N = Lanes(d);
+    auto mask_lanes = AllocateAligned<T>(N);
+
+    // Each lane should have a chance of having mask=true.
+    for (size_t rep = 0; rep < 100; ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        mask_lanes[i] = static_cast<T>(Random32(&rng) & 1);
+      }
+
+      const auto mask = RebindMask(d, Eq(Load(d, mask_lanes.get()), Zero(d)));
+      HWY_ASSERT_MASK_EQ(d, mask, MaskFromVec(VecFromMask(d, mask)));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllMaskVec() {
+  const ForPartialVectors<TestMaskVec> test;
+
+  test(uint16_t());
+  test(int16_t());
+  // TODO(janwas): float16_t - cannot compare yet
+
+  test(uint32_t());
+  test(int32_t());
+  test(float());
+
+#if HWY_CAP_INTEGER64
+  test(uint64_t());
+  test(int64_t());
+#endif
+#if HWY_CAP_FLOAT64
+  test(double());
+#endif
+}
+
+struct TestCompress {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    RandomState rng;
+
+    using TU = MakeUnsigned<T>;
+    const Rebind<TU, D> du;
+    const size_t N = Lanes(d);
+    auto in_lanes = AllocateAligned<T>(N);
+    auto mask_lanes = AllocateAligned<TU>(N);
+    auto expected = AllocateAligned<T>(N);
+    auto actual = AllocateAligned<T>(N);
+
+    // Each lane should have a chance of having mask=true.
+    for (size_t rep = 0; rep < 100; ++rep) {
+      size_t expected_pos = 0;
+      for (size_t i = 0; i < N; ++i) {
+        const uint64_t bits = Random32(&rng);
+        in_lanes[i] = T();  // cannot initialize float16_t directly.
+        CopyBytes<sizeof(T)>(&bits, &in_lanes[i]);
+        mask_lanes[i] = static_cast<TU>(Random32(&rng) & 1);
+        if (mask_lanes[i] == 0) {  // Zero means true (easier to compare)
+          expected[expected_pos++] = in_lanes[i];
+        }
+      }
+
+      const auto in = Load(d, in_lanes.get());
+      const auto mask = RebindMask(d, Eq(Load(du, mask_lanes.get()), Zero(du)));
+
+      Store(Compress(in, mask), d, actual.get());
+      // Upper lanes are undefined.
+      for (size_t i = 0; i < expected_pos; ++i) {
+        HWY_ASSERT(memcmp(&actual[i], &expected[i], sizeof(T)) == 0);
+      }
+
+      // Also check CompressStore in the same way.
+      memset(actual.get(), 0, N * sizeof(T));
+      const size_t num_written = CompressStore(in, mask, d, actual.get());
+      HWY_ASSERT_EQ(expected_pos, num_written);
+      for (size_t i = 0; i < expected_pos; ++i) {
+        HWY_ASSERT(memcmp(&actual[i], &expected[i], sizeof(T)) == 0);
+      }
+    }
+  }
+};
+
+#if 0
+namespace detail {  // for code folding
+void PrintCompress16x8Tables() {
+  constexpr size_t N = 8;  // 128-bit SIMD
+  for (uint64_t code = 0; code < 1ull << N; ++code) {
+    std::array<uint8_t, N> indices{0};
+    size_t pos = 0;
+    for (size_t i = 0; i < N; ++i) {
+      if (code & (1ull << i)) {
+        indices[pos++] = i;
+      }
+    }
+
+    // Doubled (for converting lane to byte indices)
+    for (size_t i = 0; i < N; ++i) {
+      printf("%d,", 2 * indices[i]);
+    }
+  }
+  printf("\n");
+}
+
+// Compressed to nibbles
+void PrintCompress32x8Tables() {
+  constexpr size_t N = 8;  // AVX2
+  for (uint64_t code = 0; code < 1ull << N; ++code) {
+    std::array<uint32_t, N> indices{0};
+    size_t pos = 0;
+    for (size_t i = 0; i < N; ++i) {
+      if (code & (1ull << i)) {
+        indices[pos++] = i;
+      }
+    }
+
+    // Convert to nibbles
+    uint64_t packed = 0;
+    for (size_t i = 0; i < N; ++i) {
+      HWY_ASSERT(indices[i] < 16);
+      packed += indices[i] << (i * 4);
+    }
+
+    HWY_ASSERT(packed < (1ull << 32));
+    printf("0x%08x,", static_cast<uint32_t>(packed));
+  }
+  printf("\n");
+}
+
+// Pairs of 32-bit lane indices
+void PrintCompress64x4Tables() {
+  constexpr size_t N = 4;  // AVX2
+  for (uint64_t code = 0; code < 1ull << N; ++code) {
+    std::array<uint32_t, N> indices{0};
+    size_t pos = 0;
+    for (size_t i = 0; i < N; ++i) {
+      if (code & (1ull << i)) {
+        indices[pos++] = i;
+      }
+    }
+
+    for (size_t i = 0; i < N; ++i) {
+      printf("%d,%d,", 2 * indices[i], 2 * indices[i] + 1);
+    }
+  }
+  printf("\n");
+}
+
+// 4-tuple of byte indices
+void PrintCompress32x4Tables() {
+  using T = uint32_t;
+  constexpr size_t N = 4;  // SSE4
+  for (uint64_t code = 0; code < 1ull << N; ++code) {
+    std::array<uint32_t, N> indices{0};
+    size_t pos = 0;
+    for (size_t i = 0; i < N; ++i) {
+      if (code & (1ull << i)) {
+        indices[pos++] = i;
+      }
+    }
+
+    for (size_t i = 0; i < N; ++i) {
+      for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
+        printf("%zu,", sizeof(T) * indices[i] + idx_byte);
+      }
+    }
+  }
+  printf("\n");
+}
+
+// 8-tuple of byte indices
+void PrintCompress64x2Tables() {
+  using T = uint64_t;
+  constexpr size_t N = 2;  // SSE4
+  for (uint64_t code = 0; code < 1ull << N; ++code) {
+    std::array<uint32_t, N> indices{0};
+    size_t pos = 0;
+    for (size_t i = 0; i < N; ++i) {
+      if (code & (1ull << i)) {
+        indices[pos++] = i;
+      }
+    }
+
+    for (size_t i = 0; i < N; ++i) {
+      for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
+        printf("%zu,", sizeof(T) * indices[i] + idx_byte);
+      }
+    }
+  }
+  printf("\n");
+}
+}  // namespace detail
+#endif
+
+HWY_NOINLINE void TestAllCompress() {
+  // detail::PrintCompress32x8Tables();
+  // detail::PrintCompress64x4Tables();
+  // detail::PrintCompress32x4Tables();
+  // detail::PrintCompress64x2Tables();
+  // detail::PrintCompress16x8Tables();
+
+  const ForPartialVectors<TestCompress> test;
+
+  test(uint16_t());
+  test(int16_t());
+  test(float16_t());
+
+  test(uint32_t());
+  test(int32_t());
+  test(float());
+
+#if HWY_CAP_INTEGER64
+  test(uint64_t());
+  test(int64_t());
+#endif
+#if HWY_CAP_FLOAT64
+  test(double());
+#endif
+}
+
+struct TestZeroIfNegative {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v0 = Zero(d);
+    const auto vp = Iota(d, 1);
+    const auto vn = Iota(d, T(-1E5));  // assumes N < 10^5
+
+    // Zero and positive remain unchanged
+    HWY_ASSERT_VEC_EQ(d, v0, ZeroIfNegative(v0));
+    HWY_ASSERT_VEC_EQ(d, vp, ZeroIfNegative(vp));
+
+    // Negative are all replaced with zero
+    HWY_ASSERT_VEC_EQ(d, v0, ZeroIfNegative(vn));
+  }
+};
+
+HWY_NOINLINE void TestAllZeroIfNegative() {
+  ForFloatTypes(ForPartialVectors<TestZeroIfNegative>());
+}
+
+struct TestBroadcastSignBit {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto s0 = Zero(d);
+    const auto s1 = Set(d, -1);  // all bit set
+    const auto vpos = And(Iota(d, 0), Set(d, LimitsMax<T>()));
+    const auto vneg = s1 - vpos;
+
+    HWY_ASSERT_VEC_EQ(d, s0, BroadcastSignBit(vpos));
+    HWY_ASSERT_VEC_EQ(d, s0, BroadcastSignBit(Set(d, LimitsMax<T>())));
+
+    HWY_ASSERT_VEC_EQ(d, s1, BroadcastSignBit(vneg));
+    HWY_ASSERT_VEC_EQ(d, s1, BroadcastSignBit(Set(d, LimitsMin<T>())));
+    HWY_ASSERT_VEC_EQ(d, s1, BroadcastSignBit(Set(d, LimitsMin<T>() / 2)));
+  }
+};
+
+HWY_NOINLINE void TestAllBroadcastSignBit() {
+  ForSignedTypes(ForPartialVectors<TestBroadcastSignBit>());
+}
+
+struct TestTestBit {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t kNumBits = sizeof(T) * 8;
+    for (size_t i = 0; i < kNumBits; ++i) {
+      const auto bit1 = Set(d, 1ull << i);
+      const auto bit2 = Set(d, 1ull << ((i + 1) % kNumBits));
+      const auto bit3 = Set(d, 1ull << ((i + 2) % kNumBits));
+      const auto bits12 = Or(bit1, bit2);
+      const auto bits23 = Or(bit2, bit3);
+      HWY_ASSERT(AllTrue(TestBit(bit1, bit1)));
+      HWY_ASSERT(AllTrue(TestBit(bits12, bit1)));
+      HWY_ASSERT(AllTrue(TestBit(bits12, bit2)));
+
+      HWY_ASSERT(AllFalse(TestBit(bits12, bit3)));
+      HWY_ASSERT(AllFalse(TestBit(bits23, bit1)));
+      HWY_ASSERT(AllFalse(TestBit(bit1, bit2)));
+      HWY_ASSERT(AllFalse(TestBit(bit2, bit1)));
+      HWY_ASSERT(AllFalse(TestBit(bit1, bit3)));
+      HWY_ASSERT(AllFalse(TestBit(bit3, bit1)));
+      HWY_ASSERT(AllFalse(TestBit(bit2, bit3)));
+      HWY_ASSERT(AllFalse(TestBit(bit3, bit2)));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllTestBit() {
+  ForIntegerTypes(ForFullVectors<TestTestBit>());
+}
+
+struct TestAllTrueFalse {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto zero = Zero(d);
+    auto v = zero;
+
+    const size_t N = Lanes(d);
+    auto lanes = AllocateAligned<T>(N);
+    std::fill(lanes.get(), lanes.get() + N, T(0));
+
+    HWY_ASSERT(AllTrue(Eq(v, zero)));
+    HWY_ASSERT(!AllFalse(Eq(v, zero)));
+
+    // Single lane implies AllFalse = !AllTrue. Otherwise, there are multiple
+    // lanes and one is nonzero.
+    const bool expected_all_false = (N != 1);
+
+    // Set each lane to nonzero and back to zero
+    for (size_t i = 0; i < N; ++i) {
+      lanes[i] = T(1);
+      v = Load(d, lanes.get());
+      HWY_ASSERT(!AllTrue(Eq(v, zero)));
+      HWY_ASSERT(expected_all_false ^ AllFalse(Eq(v, zero)));
+
+      lanes[i] = T(-1);
+      v = Load(d, lanes.get());
+      HWY_ASSERT(!AllTrue(Eq(v, zero)));
+      HWY_ASSERT(expected_all_false ^ AllFalse(Eq(v, zero)));
+
+      // Reset to all zero
+      lanes[i] = T(0);
+      v = Load(d, lanes.get());
+      HWY_ASSERT(AllTrue(Eq(v, zero)));
+      HWY_ASSERT(!AllFalse(Eq(v, zero)));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllAllTrueFalse() {
+  ForAllTypes(ForPartialVectors<TestAllTrueFalse>());
+}
+
+class TestStoreMaskBits {
+ public:
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*t*/, D d) {
+    // TODO(janwas): remove once implemented (cast or vse1)
+#if HWY_TARGET != HWY_RVV
+    RandomState rng;
+    const size_t N = Lanes(d);
+    auto lanes = AllocateAligned<T>(N);
+    const size_t expected_bytes = (N + 7) / 8;
+    auto bits = AllocateAligned<uint8_t>(expected_bytes);
+
+    for (size_t rep = 0; rep < 100; ++rep) {
+      // Generate random mask pattern.
+      for (size_t i = 0; i < N; ++i) {
+        lanes[i] = static_cast<T>((rng() & 1024) ? 1 : 0);
+      }
+      const auto mask = Load(d, lanes.get()) == Zero(d);
+
+      const size_t bytes_written = StoreMaskBits(mask, bits.get());
+
+      HWY_ASSERT_EQ(expected_bytes, bytes_written);
+      size_t i = 0;
+      // Stored bits must match original mask
+      for (; i < N; ++i) {
+        const bool bit = (bits[i / 8] & (1 << (i % 8))) != 0;
+        HWY_ASSERT_EQ(bit, lanes[i] == 0);
+      }
+      // Any partial bits in the last byte must be zero
+      for (; i < 8 * bytes_written; ++i) {
+        const int bit = (bits[i / 8] & (1 << (i % 8)));
+        HWY_ASSERT_EQ(bit, 0);
+      }
+    }
+#else
+    (void)d;
+#endif
+  }
+};
+
+HWY_NOINLINE void TestAllStoreMaskBits() {
+  ForAllTypes(ForPartialVectors<TestStoreMaskBits>());
+}
+
+struct TestCountTrue {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    // For all combinations of zero/nonzero state of subset of lanes:
+    const size_t max_lanes = std::min(N, size_t(10));
+
+    auto lanes = AllocateAligned<T>(N);
+    std::fill(lanes.get(), lanes.get() + N, T(1));
+
+    for (size_t code = 0; code < (1ull << max_lanes); ++code) {
+      // Number of zeros written = number of mask lanes that are true.
+      size_t expected = 0;
+      for (size_t i = 0; i < max_lanes; ++i) {
+        lanes[i] = T(1);
+        if (code & (1ull << i)) {
+          ++expected;
+          lanes[i] = T(0);
+        }
+      }
+
+      const auto mask = Eq(Load(d, lanes.get()), Zero(d));
+      const size_t actual = CountTrue(mask);
+      HWY_ASSERT_EQ(expected, actual);
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllCountTrue() {
+  ForAllTypes(ForPartialVectors<TestCountTrue>());
+}
+
+struct TestLogicalMask {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto m0 = MaskFalse(d);
+    const auto m_all = MaskTrue(d);
+
+    const size_t N = Lanes(d);
+    auto lanes = AllocateAligned<T>(N);
+    std::fill(lanes.get(), lanes.get() + N, T(1));
+
+    HWY_ASSERT_MASK_EQ(d, m0, Not(m_all));
+    HWY_ASSERT_MASK_EQ(d, m_all, Not(m0));
+
+    // For all combinations of zero/nonzero state of subset of lanes:
+    const size_t max_lanes = std::min(N, size_t(6));
+    for (size_t code = 0; code < (1ull << max_lanes); ++code) {
+      for (size_t i = 0; i < max_lanes; ++i) {
+        lanes[i] = T(1);
+        if (code & (1ull << i)) {
+          lanes[i] = T(0);
+        }
+      }
+
+      const auto m = Eq(Load(d, lanes.get()), Zero(d));
+
+      HWY_ASSERT_MASK_EQ(d, m0, Xor(m, m));
+      HWY_ASSERT_MASK_EQ(d, m0, AndNot(m, m));
+      HWY_ASSERT_MASK_EQ(d, m0, AndNot(m_all, m));
+
+      HWY_ASSERT_MASK_EQ(d, m, Or(m, m));
+      HWY_ASSERT_MASK_EQ(d, m, Or(m0, m));
+      HWY_ASSERT_MASK_EQ(d, m, Or(m, m0));
+      HWY_ASSERT_MASK_EQ(d, m, Xor(m0, m));
+      HWY_ASSERT_MASK_EQ(d, m, Xor(m, m0));
+      HWY_ASSERT_MASK_EQ(d, m, And(m, m));
+      HWY_ASSERT_MASK_EQ(d, m, And(m_all, m));
+      HWY_ASSERT_MASK_EQ(d, m, And(m, m_all));
+      HWY_ASSERT_MASK_EQ(d, m, AndNot(m0, m));
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllLogicalMask() {
+  ForAllTypes(ForFullVectors<TestLogicalMask>());
+}
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+HWY_BEFORE_TEST(HwyLogicalTest);
+HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalInteger);
+HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalFloat);
+HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCopySign);
+HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllIfThenElse);
+HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllMaskVec);
+HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCompress);
+HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllZeroIfNegative);
+HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllBroadcastSignBit);
+HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllTestBit);
+HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllAllTrueFalse);
+HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllStoreMaskBits);
+HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCountTrue);
+HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalMask);
+}  // namespace hwy
+#endif
--- a/third_party/highway/hwy/tests/memory_test.cc
+++ b/third_party/highway/hwy/tests/memory_test.cc
@ -0,0 +1,413 @@
+// Copyright 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/memory_test.cc"
+#include "hwy/cache_control.h"
+#include "hwy/foreach_target.h"
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestLoadStore {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    const auto hi = Iota(d, 1 + N);
+    const auto lo = Iota(d, 1);
+    auto lanes = AllocateAligned<T>(2 * N);
+    Store(hi, d, &lanes[N]);
+    Store(lo, d, &lanes[0]);
+
+    // Aligned load
+    const auto lo2 = Load(d, &lanes[0]);
+    HWY_ASSERT_VEC_EQ(d, lo2, lo);
+
+    // Aligned store
+    auto lanes2 = AllocateAligned<T>(2 * N);
+    Store(lo2, d, &lanes2[0]);
+    Store(hi, d, &lanes2[N]);
+    for (size_t i = 0; i < 2 * N; ++i) {
+      HWY_ASSERT_EQ(lanes[i], lanes2[i]);
+    }
+
+    // Unaligned load
+    const auto vu = LoadU(d, &lanes[1]);
+    auto lanes3 = AllocateAligned<T>(N);
+    Store(vu, d, lanes3.get());
+    for (size_t i = 0; i < N; ++i) {
+      HWY_ASSERT_EQ(T(i + 2), lanes3[i]);
+    }
+
+    // Unaligned store
+    StoreU(lo2, d, &lanes2[N / 2]);
+    size_t i = 0;
+    for (; i < N / 2; ++i) {
+      HWY_ASSERT_EQ(lanes[i], lanes2[i]);
+    }
+    for (; i < 3 * N / 2; ++i) {
+      HWY_ASSERT_EQ(T(i - N / 2 + 1), lanes2[i]);
+    }
+    // Subsequent values remain unchanged.
+    for (; i < 2 * N; ++i) {
+      HWY_ASSERT_EQ(T(i + 1), lanes2[i]);
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllLoadStore() {
+  ForAllTypes(ForPartialVectors<TestLoadStore>());
+}
+
+struct TestStoreInterleaved3 {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+
+    RandomState rng;
+
+    // Data to be interleaved
+    auto bytes = AllocateAligned<uint8_t>(3 * N);
+    for (size_t i = 0; i < 3 * N; ++i) {
+      bytes[i] = static_cast<uint8_t>(Random32(&rng) & 0xFF);
+    }
+    const auto in0 = Load(d, &bytes[0 * N]);
+    const auto in1 = Load(d, &bytes[1 * N]);
+    const auto in2 = Load(d, &bytes[2 * N]);
+
+    // Interleave here, ensure vector results match scalar
+    auto expected = AllocateAligned<T>(4 * N);
+    auto actual_aligned = AllocateAligned<T>(4 * N + 1);
+    T* actual = actual_aligned.get() + 1;
+
+    for (size_t rep = 0; rep < 100; ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        expected[3 * i + 0] = bytes[0 * N + i];
+        expected[3 * i + 1] = bytes[1 * N + i];
+        expected[3 * i + 2] = bytes[2 * N + i];
+        // Ensure we do not write more than 3*N bytes
+        expected[3 * N + i] = actual[3 * N + i] = 0;
+      }
+      StoreInterleaved3(in0, in1, in2, d, actual);
+      size_t pos = 0;
+      if (!BytesEqual(expected.get(), actual, 4 * N, &pos)) {
+        Print(d, "in0", in0, pos / 3);
+        Print(d, "in1", in1, pos / 3);
+        Print(d, "in2", in2, pos / 3);
+        const size_t i = pos - pos % 3;
+        fprintf(stderr, "interleaved %d %d %d  %d %d %d\n", actual[i],
+                actual[i + 1], actual[i + 2], actual[i + 3], actual[i + 4],
+                actual[i + 5]);
+        HWY_ASSERT(false);
+      }
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllStoreInterleaved3() {
+#if HWY_TARGET == HWY_RVV
+  // Segments are limited to 8 registers, so we can only go up to LMUL=2.
+  const ForExtendableVectors<TestStoreInterleaved3, 4> test;
+#else
+  const ForPartialVectors<TestStoreInterleaved3> test;
+#endif
+  test(uint8_t());
+}
+
+struct TestStoreInterleaved4 {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+
+    RandomState rng;
+
+    // Data to be interleaved
+    auto bytes = AllocateAligned<uint8_t>(4 * N);
+    for (size_t i = 0; i < 4 * N; ++i) {
+      bytes[i] = static_cast<uint8_t>(Random32(&rng) & 0xFF);
+    }
+    const auto in0 = Load(d, &bytes[0 * N]);
+    const auto in1 = Load(d, &bytes[1 * N]);
+    const auto in2 = Load(d, &bytes[2 * N]);
+    const auto in3 = Load(d, &bytes[3 * N]);
+
+    // Interleave here, ensure vector results match scalar
+    auto expected = AllocateAligned<T>(5 * N);
+    auto actual_aligned = AllocateAligned<T>(5 * N + 1);
+    T* actual = actual_aligned.get() + 1;
+
+    for (size_t rep = 0; rep < 100; ++rep) {
+      for (size_t i = 0; i < N; ++i) {
+        expected[4 * i + 0] = bytes[0 * N + i];
+        expected[4 * i + 1] = bytes[1 * N + i];
+        expected[4 * i + 2] = bytes[2 * N + i];
+        expected[4 * i + 3] = bytes[3 * N + i];
+        // Ensure we do not write more than 4*N bytes
+        expected[4 * N + i] = actual[4 * N + i] = 0;
+      }
+      StoreInterleaved4(in0, in1, in2, in3, d, actual);
+      size_t pos = 0;
+      if (!BytesEqual(expected.get(), actual, 5 * N, &pos)) {
+        Print(d, "in0", in0, pos / 4);
+        Print(d, "in1", in1, pos / 4);
+        Print(d, "in2", in2, pos / 4);
+        Print(d, "in3", in3, pos / 4);
+        const size_t i = pos;
+        fprintf(stderr, "interleaved %d %d %d %d  %d %d %d %d\n", actual[i],
+                actual[i + 1], actual[i + 2], actual[i + 3], actual[i + 4],
+                actual[i + 5], actual[i + 6], actual[i + 7]);
+        HWY_ASSERT(false);
+      }
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllStoreInterleaved4() {
+#if HWY_TARGET == HWY_RVV
+  // Segments are limited to 8 registers, so we can only go up to LMUL=2.
+  const ForExtendableVectors<TestStoreInterleaved4, 4> test;
+#else
+  const ForPartialVectors<TestStoreInterleaved4> test;
+#endif
+  test(uint8_t());
+}
+
+struct TestLoadDup128 {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    // Scalar does not define LoadDup128.
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
+    constexpr size_t N128 = 16 / sizeof(T);
+    alignas(16) T lanes[N128];
+    for (size_t i = 0; i < N128; ++i) {
+      lanes[i] = static_cast<T>(1 + i);
+    }
+    const auto v = LoadDup128(d, lanes);
+    const size_t N = Lanes(d);
+    auto out = AllocateAligned<T>(N);
+    Store(v, d, out.get());
+    for (size_t i = 0; i < N; ++i) {
+      HWY_ASSERT_EQ(T(i % N128 + 1), out[i]);
+    }
+#else
+    (void)d;
+#endif
+  }
+};
+
+HWY_NOINLINE void TestAllLoadDup128() {
+  ForAllTypes(ForGE128Vectors<TestLoadDup128>());
+}
+
+struct TestStream {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v = Iota(d, T(1));
+    const size_t affected_bytes =
+        (Lanes(d) * sizeof(T) + HWY_STREAM_MULTIPLE - 1) &
+        ~size_t(HWY_STREAM_MULTIPLE - 1);
+    const size_t affected_lanes = affected_bytes / sizeof(T);
+    auto out = AllocateAligned<T>(2 * affected_lanes);
+    std::fill(out.get(), out.get() + 2 * affected_lanes, T(0));
+
+    Stream(v, d, out.get());
+    StoreFence();
+    const auto actual = Load(d, out.get());
+    HWY_ASSERT_VEC_EQ(d, v, actual);
+    // Ensure Stream didn't modify more memory than expected
+    for (size_t i = affected_lanes; i < 2 * affected_lanes; ++i) {
+      HWY_ASSERT_EQ(T(0), out[i]);
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllStream() {
+  const ForPartialVectors<TestStream> test;
+  // No u8,u16.
+  test(uint32_t());
+  test(uint64_t());
+  // No i8,i16.
+  test(int32_t());
+  test(int64_t());
+  ForFloatTypes(test);
+}
+
+// Assumes little-endian byte order!
+struct TestScatter {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    using Offset = MakeSigned<T>;
+
+    const size_t N = Lanes(d);
+    const size_t range = 4 * N;                  // number of items to scatter
+    const size_t max_bytes = range * sizeof(T);  // upper bound on offset
+
+    RandomState rng;
+
+    // Data to be scattered
+    auto bytes = AllocateAligned<uint8_t>(max_bytes);
+    for (size_t i = 0; i < max_bytes; ++i) {
+      bytes[i] = static_cast<uint8_t>(Random32(&rng) & 0xFF);
+    }
+    const auto data = Load(d, reinterpret_cast<const T*>(bytes.get()));
+
+    // Scatter into these regions, ensure vector results match scalar
+    auto expected = AllocateAligned<T>(range);
+    auto actual = AllocateAligned<T>(range);
+
+    const Rebind<Offset, D> d_offsets;
+    auto offsets = AllocateAligned<Offset>(N);  // or indices
+
+    for (size_t rep = 0; rep < 100; ++rep) {
+      // Byte offsets
+      std::fill(expected.get(), expected.get() + range, T(0));
+      std::fill(actual.get(), actual.get() + range, T(0));
+      for (size_t i = 0; i < N; ++i) {
+        offsets[i] =
+            static_cast<Offset>(Random32(&rng) % (max_bytes - sizeof(T)));
+        CopyBytes<sizeof(T)>(
+            bytes.get() + i * sizeof(T),
+            reinterpret_cast<uint8_t*>(expected.get()) + offsets[i]);
+      }
+      const auto voffsets = Load(d_offsets, offsets.get());
+      ScatterOffset(data, d, actual.get(), voffsets);
+      if (!BytesEqual(expected.get(), actual.get(), max_bytes)) {
+        Print(d, "Data", data);
+        Print(d_offsets, "Offsets", voffsets);
+        HWY_ASSERT(false);
+      }
+
+      // Indices
+      std::fill(expected.get(), expected.get() + range, T(0));
+      std::fill(actual.get(), actual.get() + range, T(0));
+      for (size_t i = 0; i < N; ++i) {
+        offsets[i] = static_cast<Offset>(Random32(&rng) % range);
+        CopyBytes<sizeof(T)>(bytes.get() + i * sizeof(T),
+                             &expected[offsets[i]]);
+      }
+      const auto vindices = Load(d_offsets, offsets.get());
+      ScatterIndex(data, d, actual.get(), vindices);
+      if (!BytesEqual(expected.get(), actual.get(), max_bytes)) {
+        Print(d, "Data", data);
+        Print(d_offsets, "Indices", vindices);
+        HWY_ASSERT(false);
+      }
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllScatter() {
+  // No u8,u16,i8,i16.
+  const ForPartialVectors<TestScatter> test;
+  test(uint32_t());
+  test(int32_t());
+
+#if HWY_CAP_INTEGER64
+  test(uint64_t());
+  test(int64_t());
+#endif
+
+  ForFloatTypes(test);
+}
+
+struct TestGather {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    using Offset = MakeSigned<T>;
+
+    const size_t N = Lanes(d);
+
+    RandomState rng;
+
+    // Data to be gathered from
+    const size_t max_bytes = 4 * N * sizeof(T);  // upper bound on offset
+    auto bytes = AllocateAligned<uint8_t>(max_bytes);
+    for (size_t i = 0; i < max_bytes; ++i) {
+      bytes[i] = static_cast<uint8_t>(Random32(&rng) & 0xFF);
+    }
+
+    auto expected = AllocateAligned<T>(N);
+    auto offsets = AllocateAligned<Offset>(N);
+    auto indices = AllocateAligned<Offset>(N);
+
+    for (size_t rep = 0; rep < 100; ++rep) {
+      // Offsets
+      for (size_t i = 0; i < N; ++i) {
+        offsets[i] =
+            static_cast<Offset>(Random32(&rng) % (max_bytes - sizeof(T)));
+        CopyBytes<sizeof(T)>(bytes.get() + offsets[i], &expected[i]);
+      }
+
+      const Rebind<Offset, D> d_offset;
+      const T* base = reinterpret_cast<const T*>(bytes.get());
+      auto actual = GatherOffset(d, base, Load(d_offset, offsets.get()));
+      HWY_ASSERT_VEC_EQ(d, expected.get(), actual);
+
+      // Indices
+      for (size_t i = 0; i < N; ++i) {
+        indices[i] =
+            static_cast<Offset>(Random32(&rng) % (max_bytes / sizeof(T)));
+        CopyBytes<sizeof(T)>(base + indices[i], &expected[i]);
+      }
+      actual = GatherIndex(d, base, Load(d_offset, indices.get()));
+      HWY_ASSERT_VEC_EQ(d, expected.get(), actual);
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllGather() {
+  // No u8,u16,i8,i16.
+  const ForPartialVectors<TestGather> test;
+  test(uint32_t());
+  test(int32_t());
+
+#if HWY_CAP_INTEGER64
+  test(uint64_t());
+  test(int64_t());
+#endif
+  ForFloatTypes(test);
+}
+
+HWY_NOINLINE void TestAllCache() {
+  LoadFence();
+  StoreFence();
+  int test = 0;
+  Prefetch(&test);
+  FlushCacheline(&test);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+HWY_BEFORE_TEST(HwyMemoryTest);
+HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadStore);
+HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStoreInterleaved3);
+HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStoreInterleaved4);
+HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadDup128);
+HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStream);
+HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllScatter);
+HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllGather);
+HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllCache);
+}  // namespace hwy
+#endif
--- a/third_party/highway/hwy/tests/swizzle_test.cc
+++ b/third_party/highway/hwy/tests/swizzle_test.cc
@ -0,0 +1,642 @@
+// Copyright 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/swizzle_test.cc"
+#include "hwy/foreach_target.h"
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestShiftBytes {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    // Scalar does not define Shift*Bytes.
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
+    const Repartition<uint8_t, D> du8;
+    const size_t N8 = Lanes(du8);
+
+    // Zero remains zero
+    const auto v0 = Zero(d);
+    HWY_ASSERT_VEC_EQ(d, v0, ShiftLeftBytes<1>(v0));
+    HWY_ASSERT_VEC_EQ(d, v0, ShiftRightBytes<1>(v0));
+
+    // Zero after shifting out the high/low byte
+    auto bytes = AllocateAligned<uint8_t>(N8);
+    std::fill(bytes.get(), bytes.get() + N8, 0);
+    bytes[N8 - 1] = 0x7F;
+    const auto vhi = BitCast(d, Load(du8, bytes.get()));
+    bytes[N8 - 1] = 0;
+    bytes[0] = 0x7F;
+    const auto vlo = BitCast(d, Load(du8, bytes.get()));
+    HWY_ASSERT_VEC_EQ(d, v0, ShiftLeftBytes<1>(vhi));
+    HWY_ASSERT_VEC_EQ(d, v0, ShiftRightBytes<1>(vlo));
+
+    // Check expected result with Iota
+    const size_t N = Lanes(d);
+    auto in = AllocateAligned<T>(N);
+    const uint8_t* in_bytes = reinterpret_cast<const uint8_t*>(in.get());
+    const auto v = BitCast(d, Iota(du8, 1));
+    Store(v, d, in.get());
+
+    auto expected = AllocateAligned<T>(N);
+    uint8_t* expected_bytes = reinterpret_cast<uint8_t*>(expected.get());
+
+    const size_t kBlockSize = HWY_MIN(N8, 16);
+    for (size_t block = 0; block < N8; block += kBlockSize) {
+      expected_bytes[block] = 0;
+      memcpy(expected_bytes + block + 1, in_bytes + block, kBlockSize - 1);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftBytes<1>(v));
+
+    for (size_t block = 0; block < N8; block += kBlockSize) {
+      memcpy(expected_bytes + block, in_bytes + block + 1, kBlockSize - 1);
+      expected_bytes[block + kBlockSize - 1] = 0;
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightBytes<1>(v));
+#else
+    (void)d;
+#endif  // #if HWY_TARGET != HWY_SCALAR
+  }
+};
+
+HWY_NOINLINE void TestAllShiftBytes() {
+  ForIntegerTypes(ForGE128Vectors<TestShiftBytes>());
+}
+
+struct TestShiftLanes {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    // Scalar does not define Shift*Lanes.
+#if HWY_TARGET != HWY_SCALAR || HWY_IDE
+    const auto v = Iota(d, T(1));
+    const size_t N = Lanes(d);
+    auto expected = AllocateAligned<T>(N);
+
+    HWY_ASSERT_VEC_EQ(d, v, ShiftLeftLanes<0>(v));
+    HWY_ASSERT_VEC_EQ(d, v, ShiftRightLanes<0>(v));
+
+    constexpr size_t kLanesPerBlock = 16 / sizeof(T);
+
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = (i % kLanesPerBlock) == 0 ? T(0) : T(i);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftLanes<1>(v));
+
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] =
+          (i % kLanesPerBlock) == (kLanesPerBlock - 1) ? T(0) : T(2 + i);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightLanes<1>(v));
+#else
+    (void)d;
+#endif  // #if HWY_TARGET != HWY_SCALAR
+  }
+};
+
+HWY_NOINLINE void TestAllShiftLanes() {
+  ForAllTypes(ForGE128Vectors<TestShiftLanes>());
+}
+
+template <typename D, int kLane>
+struct TestBroadcastR {
+  HWY_NOINLINE void operator()() const {
+// TODO(janwas): fix failure
+#if HWY_TARGET != HWY_WASM
+    using T = typename D::T;
+    const D d;
+    const size_t N = Lanes(d);
+    auto in_lanes = AllocateAligned<T>(N);
+    std::fill(in_lanes.get(), in_lanes.get() + N, T(0));
+    const size_t blockN = HWY_MIN(N * sizeof(T), 16) / sizeof(T);
+    // Need to set within each 128-bit block
+    for (size_t block = 0; block < N; block += blockN) {
+      in_lanes[block + kLane] = static_cast<T>(block + 1);
+    }
+    const auto in = Load(d, in_lanes.get());
+    auto expected = AllocateAligned<T>(N);
+    for (size_t block = 0; block < N; block += blockN) {
+      for (size_t i = 0; i < blockN; ++i) {
+        expected[block + i] = T(block + 1);
+      }
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), Broadcast<kLane>(in));
+
+    TestBroadcastR<D, kLane - 1>()();
+#endif
+  }
+};
+
+template <class D>
+struct TestBroadcastR<D, -1> {
+  void operator()() const {}
+};
+
+struct TestBroadcast {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    TestBroadcastR<D, HWY_MIN(MaxLanes(d), 16 / sizeof(T)) - 1>()();
+  }
+};
+
+HWY_NOINLINE void TestAllBroadcast() {
+  const ForPartialVectors<TestBroadcast> test;
+  // No u8.
+  test(uint16_t());
+  test(uint32_t());
+#if HWY_CAP_INTEGER64
+  test(uint64_t());
+#endif
+
+  // No i8.
+  test(int16_t());
+  test(int32_t());
+#if HWY_CAP_INTEGER64
+  test(int64_t());
+#endif
+
+  ForFloatTypes(test);
+}
+
+struct TestTableLookupBytes {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    RandomState rng;
+    const size_t N = Lanes(d);
+    const size_t N8 = Lanes(Repartition<uint8_t, D>());
+    auto in_bytes = AllocateAligned<uint8_t>(N8);
+    for (size_t i = 0; i < N8; ++i) {
+      in_bytes[i] = Random32(&rng) & 0xFF;
+    }
+    const auto in =
+        BitCast(d, Load(d, reinterpret_cast<const T*>(in_bytes.get())));
+
+    // Enough test data; for larger vectors, upper lanes will be zero.
+    const uint8_t index_bytes_source[64] = {
+        // Same index as source, multiple outputs from same input,
+        // unused input (9), ascending/descending and nonconsecutive neighbors.
+        0,  2,  1, 2, 15, 12, 13, 14, 6,  7,  8,  5,  4,  3,  10, 11,
+        11, 10, 3, 4, 5,  8,  7,  6,  14, 13, 12, 15, 2,  1,  2,  0,
+        4,  3,  2, 2, 5,  6,  7,  7,  15, 15, 15, 15, 15, 15, 0,  1};
+    auto index_bytes = AllocateAligned<uint8_t>(N8);
+    for (size_t i = 0; i < N8; ++i) {
+      index_bytes[i] = (i < 64) ? index_bytes_source[i] : 0;
+      // Avoid undefined results / asan error for scalar by capping indices.
+      if (index_bytes[i] >= N * sizeof(T)) {
+        index_bytes[i] = static_cast<uint8_t>(N * sizeof(T) - 1);
+      }
+    }
+    const auto indices = Load(d, reinterpret_cast<const T*>(index_bytes.get()));
+    auto expected = AllocateAligned<T>(N);
+    uint8_t* expected_bytes = reinterpret_cast<uint8_t*>(expected.get());
+
+    // Byte indices wrap around
+    const size_t mod = HWY_MIN(N8, 256);
+    for (size_t block = 0; block < N8; block += 16) {
+      for (size_t i = 0; i < 16 && (block + i) < N8; ++i) {
+        const uint8_t index = index_bytes[block + i];
+        expected_bytes[block + i] = in_bytes[(block + index) % mod];
+      }
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), TableLookupBytes(in, indices));
+  }
+};
+
+HWY_NOINLINE void TestAllTableLookupBytes() {
+  ForIntegerTypes(ForPartialVectors<TestTableLookupBytes>());
+}
+struct TestTableLookupLanes {
+#if HWY_TARGET == HWY_RVV
+  using Index = uint32_t;
+#else
+  using Index = int32_t;
+#endif
+
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+#if HWY_TARGET != HWY_SCALAR
+    const size_t N = Lanes(d);
+    auto idx = AllocateAligned<Index>(N);
+    std::fill(idx.get(), idx.get() + N, Index(0));
+    auto expected = AllocateAligned<T>(N);
+    const auto v = Iota(d, 1);
+
+    if (N <= 8) {  // Test all permutations
+      for (size_t i0 = 0; i0 < N; ++i0) {
+        idx[0] = static_cast<Index>(i0);
+        for (size_t i1 = 0; i1 < N; ++i1) {
+          idx[1] = static_cast<Index>(i1);
+          for (size_t i2 = 0; i2 < N; ++i2) {
+            idx[2] = static_cast<Index>(i2);
+            for (size_t i3 = 0; i3 < N; ++i3) {
+              idx[3] = static_cast<Index>(i3);
+
+              for (size_t i = 0; i < N; ++i) {
+                expected[i] = static_cast<T>(idx[i] + 1);  // == v[idx[i]]
+              }
+
+              const auto opaque = SetTableIndices(d, idx.get());
+              const auto actual = TableLookupLanes(v, opaque);
+              HWY_ASSERT_VEC_EQ(d, expected.get(), actual);
+            }
+          }
+        }
+      }
+    } else {
+      // Too many permutations to test exhaustively; choose one with repeated
+      // and cross-block indices and ensure indices do not exceed #lanes.
+      // For larger vectors, upper lanes will be zero.
+      HWY_ALIGN Index idx_source[16] = {1,  3,  2,  2,  8, 1, 7, 6,
+                                        15, 14, 14, 15, 4, 9, 8, 5};
+      for (size_t i = 0; i < N; ++i) {
+        idx[i] = (i < 16) ? idx_source[i] : 0;
+        // Avoid undefined results / asan error for scalar by capping indices.
+        if (idx[i] >= static_cast<Index>(N)) {
+          idx[i] = static_cast<Index>(N - 1);
+        }
+        expected[i] = static_cast<T>(idx[i] + 1);  // == v[idx[i]]
+      }
+
+      const auto opaque = SetTableIndices(d, idx.get());
+      const auto actual = TableLookupLanes(v, opaque);
+      HWY_ASSERT_VEC_EQ(d, expected.get(), actual);
+    }
+#else
+    (void)d;
+#endif
+  }
+};
+
+HWY_NOINLINE void TestAllTableLookupLanes() {
+  const ForFullVectors<TestTableLookupLanes> test;
+  test(uint32_t());
+  test(int32_t());
+  test(float());
+}
+
+struct TestInterleave {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    using TU = MakeUnsigned<T>;
+    const size_t N = Lanes(d);
+    auto even_lanes = AllocateAligned<T>(N);
+    auto odd_lanes = AllocateAligned<T>(N);
+    auto expected = AllocateAligned<T>(N);
+    for (size_t i = 0; i < N; ++i) {
+      even_lanes[i] = static_cast<T>(2 * i + 0);
+      odd_lanes[i] = static_cast<T>(2 * i + 1);
+    }
+    const auto even = Load(d, even_lanes.get());
+    const auto odd = Load(d, odd_lanes.get());
+
+    const size_t blockN = 16 / sizeof(T);
+    for (size_t i = 0; i < Lanes(d); ++i) {
+      const size_t block = i / blockN;
+      const size_t index = (i % blockN) + block * 2 * blockN;
+      expected[i] = static_cast<T>(index & LimitsMax<TU>());
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveLower(even, odd));
+
+    for (size_t i = 0; i < Lanes(d); ++i) {
+      const size_t block = i / blockN;
+      expected[i] = T((i % blockN) + block * 2 * blockN + blockN);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveUpper(even, odd));
+  }
+};
+
+HWY_NOINLINE void TestAllInterleave() {
+  // Not supported by HWY_SCALAR: Interleave(f32, f32) would return f32x2.
+  ForAllTypes(ForGE128Vectors<TestInterleave>());
+}
+
+struct TestZipLower {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    using WideT = MakeWide<T>;
+    static_assert(sizeof(T) * 2 == sizeof(WideT), "Must be double-width");
+    static_assert(IsSigned<T>() == IsSigned<WideT>(), "Must have same sign");
+    const size_t N = Lanes(d);
+    auto even_lanes = AllocateAligned<T>(N);
+    auto odd_lanes = AllocateAligned<T>(N);
+    for (size_t i = 0; i < N; ++i) {
+      even_lanes[i] = static_cast<T>(2 * i + 0);
+      odd_lanes[i] = static_cast<T>(2 * i + 1);
+    }
+    const auto even = Load(d, even_lanes.get());
+    const auto odd = Load(d, odd_lanes.get());
+
+    const Repartition<WideT, D> dw;
+    auto expected = AllocateAligned<WideT>(Lanes(dw));
+    const WideT blockN = static_cast<WideT>(16 / sizeof(WideT));
+    for (size_t i = 0; i < Lanes(dw); ++i) {
+      const size_t block = i / blockN;
+      // Value of least-significant lane in lo-vector.
+      const WideT lo =
+          static_cast<WideT>(2 * (i % blockN) + 4 * block * blockN);
+      const WideT kBits = static_cast<WideT>(sizeof(T) * 8);
+      expected[i] =
+          static_cast<WideT>((static_cast<WideT>(lo + 1) << kBits) + lo);
+    }
+    HWY_ASSERT_VEC_EQ(dw, expected.get(), ZipLower(even, odd));
+  }
+};
+
+struct TestZipUpper {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    using WideT = MakeWide<T>;
+    static_assert(sizeof(T) * 2 == sizeof(WideT), "Must be double-width");
+    static_assert(IsSigned<T>() == IsSigned<WideT>(), "Must have same sign");
+    const size_t N = Lanes(d);
+    auto even_lanes = AllocateAligned<T>(N);
+    auto odd_lanes = AllocateAligned<T>(N);
+    for (size_t i = 0; i < Lanes(d); ++i) {
+      even_lanes[i] = static_cast<T>(2 * i + 0);
+      odd_lanes[i] = static_cast<T>(2 * i + 1);
+    }
+    const auto even = Load(d, even_lanes.get());
+    const auto odd = Load(d, odd_lanes.get());
+
+    const Repartition<WideT, D> dw;
+    auto expected = AllocateAligned<WideT>(Lanes(dw));
+
+    constexpr WideT blockN = static_cast<WideT>(16 / sizeof(WideT));
+    for (size_t i = 0; i < Lanes(dw); ++i) {
+      const size_t block = i / blockN;
+      const WideT lo =
+          static_cast<WideT>(2 * (i % blockN) + 4 * block * blockN);
+      const WideT kBits = static_cast<WideT>(sizeof(T) * 8);
+      expected[i] = static_cast<WideT>(
+          (static_cast<WideT>(lo + 2 * blockN + 1) << kBits) + lo + 2 * blockN);
+    }
+    HWY_ASSERT_VEC_EQ(dw, expected.get(), ZipUpper(even, odd));
+  }
+};
+
+HWY_NOINLINE void TestAllZip() {
+  const ForPartialVectors<TestZipLower, 2> lower_unsigned;
+  // TODO(janwas): fix
+#if HWY_TARGET != HWY_RVV
+  lower_unsigned(uint8_t());
+#endif
+  lower_unsigned(uint16_t());
+#if HWY_CAP_INTEGER64
+  lower_unsigned(uint32_t());  // generates u64
+#endif
+
+  const ForPartialVectors<TestZipLower, 2> lower_signed;
+#if HWY_TARGET != HWY_RVV
+  lower_signed(int8_t());
+#endif
+  lower_signed(int16_t());
+#if HWY_CAP_INTEGER64
+  lower_signed(int32_t());  // generates i64
+#endif
+
+  const ForGE128Vectors<TestZipUpper> upper_unsigned;
+#if HWY_TARGET != HWY_RVV
+  upper_unsigned(uint8_t());
+#endif
+  upper_unsigned(uint16_t());
+#if HWY_CAP_INTEGER64
+  upper_unsigned(uint32_t());  // generates u64
+#endif
+
+  const ForGE128Vectors<TestZipUpper> upper_signed;
+#if HWY_TARGET != HWY_RVV
+  upper_signed(int8_t());
+#endif
+  upper_signed(int16_t());
+#if HWY_CAP_INTEGER64
+  upper_signed(int32_t());  // generates i64
+#endif
+
+  // No float - concatenating f32 does not result in a f64
+}
+
+class TestSpecialShuffle32 {
+ public:
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v = Iota(d, 0);
+
+#define VERIFY_LANES_32(d, v, i3, i2, i1, i0) \
+  VerifyLanes32((d), (v), (i3), (i2), (i1), (i0), __FILE__, __LINE__)
+
+    VERIFY_LANES_32(d, Shuffle2301(v), 2, 3, 0, 1);
+    VERIFY_LANES_32(d, Shuffle1032(v), 1, 0, 3, 2);
+    VERIFY_LANES_32(d, Shuffle0321(v), 0, 3, 2, 1);
+    VERIFY_LANES_32(d, Shuffle2103(v), 2, 1, 0, 3);
+    VERIFY_LANES_32(d, Shuffle0123(v), 0, 1, 2, 3);
+
+#undef VERIFY_LANES_32
+  }
+
+ private:
+  template <class D, class V>
+  HWY_NOINLINE void VerifyLanes32(D d, V v, const int i3, const int i2,
+                                  const int i1, const int i0,
+                                  const char* filename, const int line) {
+    using T = typename D::T;
+    const size_t N = Lanes(d);
+    auto lanes = AllocateAligned<T>(N);
+    Store(v, d, lanes.get());
+    const std::string name = TypeName(lanes[0], N);
+    constexpr size_t kBlockN = 16 / sizeof(T);
+    for (int block = 0; block < static_cast<int>(N); block += kBlockN) {
+      AssertEqual(T(block + i3), lanes[block + 3], name, filename, line);
+      AssertEqual(T(block + i2), lanes[block + 2], name, filename, line);
+      AssertEqual(T(block + i1), lanes[block + 1], name, filename, line);
+      AssertEqual(T(block + i0), lanes[block + 0], name, filename, line);
+    }
+  }
+};
+
+class TestSpecialShuffle64 {
+ public:
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v = Iota(d, 0);
+    VerifyLanes64(d, Shuffle01(v), 0, 1, __FILE__, __LINE__);
+  }
+
+ private:
+  template <class D, class V>
+  HWY_NOINLINE void VerifyLanes64(D d, V v, const int i1, const int i0,
+                                  const char* filename, const int line) {
+    using T = typename D::T;
+    const size_t N = Lanes(d);
+    auto lanes = AllocateAligned<T>(N);
+    Store(v, d, lanes.get());
+    const std::string name = TypeName(lanes[0], N);
+    constexpr size_t kBlockN = 16 / sizeof(T);
+    for (int block = 0; block < static_cast<int>(N); block += kBlockN) {
+      AssertEqual(T(block + i1), lanes[block + 1], name, filename, line);
+      AssertEqual(T(block + i0), lanes[block + 0], name, filename, line);
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllSpecialShuffles() {
+  const ForGE128Vectors<TestSpecialShuffle32> test32;
+  test32(uint32_t());
+  test32(int32_t());
+  test32(float());
+
+#if HWY_CAP_INTEGER64
+  const ForGE128Vectors<TestSpecialShuffle64> test64;
+  test64(uint64_t());
+  test64(int64_t());
+#endif
+
+#if HWY_CAP_FLOAT64
+  const ForGE128Vectors<TestSpecialShuffle64> test_d;
+  test_d(double());
+#endif
+}
+
+struct TestConcatHalves {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    // TODO(janwas): fix
+#if HWY_TARGET != HWY_RVV
+    // Construct inputs such that interleaved halves == iota.
+    const auto expected = Iota(d, 1);
+
+    const size_t N = Lanes(d);
+    auto lo = AllocateAligned<T>(N);
+    auto hi = AllocateAligned<T>(N);
+    size_t i;
+    for (i = 0; i < N / 2; ++i) {
+      lo[i] = static_cast<T>(1 + i);
+      hi[i] = static_cast<T>(lo[i] + T(N) / 2);
+    }
+    for (; i < N; ++i) {
+      lo[i] = hi[i] = 0;
+    }
+
+    HWY_ASSERT_VEC_EQ(d, expected,
+                      ConcatLowerLower(Load(d, hi.get()), Load(d, lo.get())));
+
+    // Same for high blocks.
+    for (i = 0; i < N / 2; ++i) {
+      lo[i] = hi[i] = 0;
+    }
+    for (; i < N; ++i) {
+      lo[i] = static_cast<T>(1 + i - N / 2);
+      hi[i] = static_cast<T>(lo[i] + T(N) / 2);
+    }
+
+    HWY_ASSERT_VEC_EQ(d, expected,
+                      ConcatUpperUpper(Load(d, hi.get()), Load(d, lo.get())));
+#else
+    (void)d;
+#endif
+  }
+};
+
+HWY_NOINLINE void TestAllConcatHalves() {
+  ForAllTypes(ForGE128Vectors<TestConcatHalves>());
+}
+
+struct TestConcatLowerUpper {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    // TODO(janwas): fix
+#if HWY_TARGET != HWY_RVV
+    const size_t N = Lanes(d);
+    // Middle part of Iota(1) == Iota(1 + N / 2).
+    const auto lo = Iota(d, 1);
+    const auto hi = Iota(d, 1 + N);
+    HWY_ASSERT_VEC_EQ(d, Iota(d, 1 + N / 2), ConcatLowerUpper(hi, lo));
+#else
+    (void)d;
+#endif
+  }
+};
+
+HWY_NOINLINE void TestAllConcatLowerUpper() {
+  ForAllTypes(ForGE128Vectors<TestConcatLowerUpper>());
+}
+
+struct TestConcatUpperLower {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    const auto lo = Iota(d, 1);
+    const auto hi = Iota(d, 1 + N);
+    auto expected = AllocateAligned<T>(N);
+    size_t i = 0;
+    for (; i < N / 2; ++i) {
+      expected[i] = static_cast<T>(1 + i);
+    }
+    for (; i < N; ++i) {
+      expected[i] = static_cast<T>(1 + i + N);
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), ConcatUpperLower(hi, lo));
+  }
+};
+
+HWY_NOINLINE void TestAllConcatUpperLower() {
+  ForAllTypes(ForGE128Vectors<TestConcatUpperLower>());
+}
+
+struct TestOddEven {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const size_t N = Lanes(d);
+    const auto even = Iota(d, 1);
+    const auto odd = Iota(d, 1 + N);
+    auto expected = AllocateAligned<T>(N);
+    for (size_t i = 0; i < N; ++i) {
+      expected[i] = static_cast<T>(1 + i + ((i & 1) ? N : 0));
+    }
+    HWY_ASSERT_VEC_EQ(d, expected.get(), OddEven(odd, even));
+  }
+};
+
+HWY_NOINLINE void TestAllOddEven() {
+  ForAllTypes(ForGE128Vectors<TestOddEven>());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+HWY_BEFORE_TEST(HwySwizzleTest);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllShiftBytes);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllShiftLanes);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllBroadcast);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllTableLookupBytes);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllTableLookupLanes);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllInterleave);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllZip);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllSpecialShuffles);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllConcatHalves);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllConcatLowerUpper);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllConcatUpperLower);
+HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllOddEven);
+}  // namespace hwy
+#endif
--- a/third_party/highway/hwy/tests/test_util-inl.h
+++ b/third_party/highway/hwy/tests/test_util-inl.h
@ -0,0 +1,580 @@
+// Copyright 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Normal include guard for non-SIMD portion of this header.
+#ifndef HWY_TESTS_TEST_UTIL_H_
+#define HWY_TESTS_TEST_UTIL_H_
+
+// Helper functions for use by *_test.cc.
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <cstddef>
+#include <string>
+#include <utility>  // std::forward
+
+#include "hwy/aligned_allocator.h"
+#include "hwy/base.h"
+#include "hwy/highway.h"
+
+#include "gtest/gtest.h"
+
+namespace hwy {
+
+// The maximum vector size used in tests when defining test data. DEPRECATED.
+constexpr size_t kTestMaxVectorSize = 64;
+
+// googletest before 1.10 didn't define INSTANTIATE_TEST_SUITE_P() but instead
+// used INSTANTIATE_TEST_CASE_P which is now deprecated.
+#ifdef INSTANTIATE_TEST_SUITE_P
+#define HWY_GTEST_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_SUITE_P
+#else
+#define HWY_GTEST_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_CASE_P
+#endif
+
+// Helper class to run parametric tests using the hwy target as parameter. To
+// use this define the following in your test:
+//   class MyTestSuite : public TestWithParamTarget {
+//    ...
+//   };
+//   HWY_TARGET_INSTANTIATE_TEST_SUITE_P(MyTestSuite);
+//   TEST_P(MyTestSuite, MyTest) { ... }
+class TestWithParamTarget : public testing::TestWithParam<uint32_t> {
+ protected:
+  void SetUp() override { SetSupportedTargetsForTest(GetParam()); }
+
+  void TearDown() override {
+    // Check that the parametric test calls SupportedTargets() when the source
+    // was compiled with more than one target. In the single-target case only
+    // static dispatch will be used anyway.
+#if (HWY_TARGETS & (HWY_TARGETS - 1)) != 0
+    EXPECT_TRUE(SupportedTargetsCalledForTest())
+        << "This hwy target parametric test doesn't use dynamic-dispatch and "
+           "doesn't need to be parametric.";
+#endif
+    SetSupportedTargetsForTest(0);
+  }
+};
+
+// Function to convert the test parameter of a TestWithParamTarget for
+// displaying it in the gtest test name.
+static inline std::string TestParamTargetName(
+    const testing::TestParamInfo<uint32_t>& info) {
+  return TargetName(info.param);
+}
+
+#define HWY_TARGET_INSTANTIATE_TEST_SUITE_P(suite)              \
+  HWY_GTEST_INSTANTIATE_TEST_SUITE_P(                           \
+      suite##Group, suite,                                      \
+      testing::ValuesIn(::hwy::SupportedAndGeneratedTargets()), \
+      ::hwy::TestParamTargetName)
+
+// Helper class similar to TestWithParamTarget to run parametric tests that
+// depend on the target and another parametric test. If you need to use multiple
+// extra parameters use a std::tuple<> of them and ::testing::Generate(...) as
+// the generator. To use this class define the following in your test:
+//   class MyTestSuite : public TestWithParamTargetT<int> {
+//    ...
+//   };
+//   HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T(MyTestSuite, ::testing::Range(0, 9));
+//   TEST_P(MyTestSuite, MyTest) { ... GetParam() .... }
+template <typename T>
+class TestWithParamTargetAndT
+    : public ::testing::TestWithParam<std::tuple<uint32_t, T>> {
+ public:
+  // Expose the parametric type here so it can be used by the
+  // HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T macro.
+  using HwyParamType = T;
+
+ protected:
+  void SetUp() override {
+    SetSupportedTargetsForTest(std::get<0>(
+        ::testing::TestWithParam<std::tuple<uint32_t, T>>::GetParam()));
+  }
+
+  void TearDown() override {
+    // Check that the parametric test calls SupportedTargets() when the source
+    // was compiled with more than one target. In the single-target case only
+    // static dispatch will be used anyway.
+#if (HWY_TARGETS & (HWY_TARGETS - 1)) != 0
+    EXPECT_TRUE(SupportedTargetsCalledForTest())
+        << "This hwy target parametric test doesn't use dynamic-dispatch and "
+           "doesn't need to be parametric.";
+#endif
+    SetSupportedTargetsForTest(0);
+  }
+
+  T GetParam() {
+    return std::get<1>(
+        ::testing::TestWithParam<std::tuple<uint32_t, T>>::GetParam());
+  }
+};
+
+template <typename T>
+std::string TestParamTargetNameAndT(
+    const testing::TestParamInfo<std::tuple<uint32_t, T>>& info) {
+  return std::string(TargetName(std::get<0>(info.param))) + "_" +
+         ::testing::PrintToString(std::get<1>(info.param));
+}
+
+#define HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T(suite, generator)     \
+  HWY_GTEST_INSTANTIATE_TEST_SUITE_P(                               \
+      suite##Group, suite,                                          \
+      ::testing::Combine(                                           \
+          testing::ValuesIn(::hwy::SupportedAndGeneratedTargets()), \
+          generator),                                               \
+      ::hwy::TestParamTargetNameAndT<suite::HwyParamType>)
+
+// Helper macro to export a function and define a test that tests it. This is
+// equivalent to do a HWY_EXPORT of a void(void) function and run it in a test:
+//   class MyTestSuite : public TestWithParamTarget {
+//    ...
+//   };
+//   HWY_TARGET_INSTANTIATE_TEST_SUITE_P(MyTestSuite);
+//   HWY_EXPORT_AND_TEST_P(MyTestSuite, MyTest);
+#define HWY_EXPORT_AND_TEST_P(suite, func_name)                   \
+  HWY_EXPORT(func_name);                                          \
+  TEST_P(suite, func_name) { HWY_DYNAMIC_DISPATCH(func_name)(); } \
+  static_assert(true, "For requiring trailing semicolon")
+
+#define HWY_EXPORT_AND_TEST_P_T(suite, func_name)                           \
+  HWY_EXPORT(func_name);                                                    \
+  TEST_P(suite, func_name) { HWY_DYNAMIC_DISPATCH(func_name)(GetParam()); } \
+  static_assert(true, "For requiring trailing semicolon")
+
+#define HWY_BEFORE_TEST(suite)                      \
+  class suite : public hwy::TestWithParamTarget {}; \
+  HWY_TARGET_INSTANTIATE_TEST_SUITE_P(suite);       \
+  static_assert(true, "For requiring trailing semicolon")
+
+// 64-bit random generator (Xorshift128+). Much smaller state than std::mt19937,
+// which triggers a compiler bug.
+class RandomState {
+ public:
+  explicit RandomState(const uint64_t seed = 0x123456789ull) {
+    s0_ = SplitMix64(seed + 0x9E3779B97F4A7C15ull);
+    s1_ = SplitMix64(s0_);
+  }
+
+  HWY_INLINE uint64_t operator()() {
+    uint64_t s1 = s0_;
+    const uint64_t s0 = s1_;
+    const uint64_t bits = s1 + s0;
+    s0_ = s0;
+    s1 ^= s1 << 23;
+    s1 ^= s0 ^ (s1 >> 18) ^ (s0 >> 5);
+    s1_ = s1;
+    return bits;
+  }
+
+ private:
+  static uint64_t SplitMix64(uint64_t z) {
+    z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull;
+    z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
+    return z ^ (z >> 31);
+  }
+
+  uint64_t s0_;
+  uint64_t s1_;
+};
+
+static HWY_INLINE uint32_t Random32(RandomState* rng) {
+  return static_cast<uint32_t>((*rng)());
+}
+
+// Prevents the compiler from eliding the computations that led to "output".
+// Works by indicating to the compiler that "output" is being read and modified.
+// The +r constraint avoids unnecessary writes to memory, but only works for
+// built-in types.
+template <class T>
+inline void PreventElision(T&& output) {
+#if HWY_COMPILER_MSVC
+  (void)output;
+#else   // HWY_COMPILER_MSVC
+  asm volatile("" : "+r"(output) : : "memory");
+#endif  // HWY_COMPILER_MSVC
+}
+
+// Returns a name for the vector/part/scalar. The type prefix is u/i/f for
+// unsigned/signed/floating point, followed by the number of bits per lane;
+// then 'x' followed by the number of lanes. Example: u8x16. This is useful for
+// understanding which instantiation of a generic test failed.
+template <typename T>
+static inline std::string TypeName(T /*unused*/, size_t N) {
+  const char prefix = IsFloat<T>() ? 'f' : (IsSigned<T>() ? 'i' : 'u');
+  char name[64];
+  // Omit the xN suffix for scalars.
+  if (N == 1) {
+    snprintf(name, sizeof(name), "%c%zu", prefix, sizeof(T) * 8);
+  } else {
+    snprintf(name, sizeof(name), "%c%zux%zu", prefix, sizeof(T) * 8, N);
+  }
+  return name;
+}
+
+// String comparison
+
+template <typename T1, typename T2>
+inline bool BytesEqual(const T1* p1, const T2* p2, const size_t size,
+                       size_t* pos = nullptr) {
+  const uint8_t* bytes1 = reinterpret_cast<const uint8_t*>(p1);
+  const uint8_t* bytes2 = reinterpret_cast<const uint8_t*>(p2);
+  for (size_t i = 0; i < size; ++i) {
+    if (bytes1[i] != bytes2[i]) {
+      fprintf(stderr, "Mismatch at byte %zu of %zu: %d != %d (%s, %s)\n", i,
+              size, bytes1[i], bytes2[i], TypeName(T1(), 1).c_str(),
+              TypeName(T2(), 1).c_str());
+      if (pos != nullptr) {
+        *pos = i;
+      }
+      return false;
+    }
+  }
+  return true;
+}
+
+inline bool StringsEqual(const char* s1, const char* s2) {
+  while (*s1 == *s2++) {
+    if (*s1++ == '\0') return true;
+  }
+  return false;
+}
+
+}  // namespace hwy
+
+#endif  // HWY_TESTS_TEST_UTIL_H_
+
+// Per-target include guard
+#if defined(HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_
+#undef HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_
+#else
+#define HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_
+#endif
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+// Prints lanes around `lane`, in memory order.
+template <class D>
+HWY_NOINLINE void Print(const D d, const char* caption, const Vec<D> v,
+                        intptr_t lane = 0) {
+  using T = TFromD<D>;
+  const size_t N = Lanes(d);
+  auto lanes = AllocateAligned<T>(N);
+  Store(v, d, lanes.get());
+  const size_t begin = static_cast<size_t>(std::max<intptr_t>(0, lane - 2));
+  const size_t end = std::min(begin + 7, N);
+  fprintf(stderr, "%s %s [%zu+ ->]:\n  ", TypeName(T(), N).c_str(), caption,
+          begin);
+  for (size_t i = begin; i < end; ++i) {
+    fprintf(stderr, "%g,", double(lanes[i]));
+  }
+  if (begin >= end) fprintf(stderr, "(out of bounds)");
+  fprintf(stderr, "\n");
+}
+
+static HWY_NORETURN HWY_NOINLINE void NotifyFailure(
+    const char* filename, const int line, const char* type_name,
+    const size_t lane, const char* expected, const char* actual) {
+  hwy::Abort(filename, line,
+             "%s, %s lane %zu mismatch: expected '%s', got '%s'.\n",
+             hwy::TargetName(HWY_TARGET), type_name, lane, expected, actual);
+}
+
+template <class Out, class In>
+inline Out BitCast(const In& in) {
+  static_assert(sizeof(Out) == sizeof(In), "");
+  Out out;
+  CopyBytes<sizeof(out)>(&in, &out);
+  return out;
+}
+
+// Computes the difference in units of last place between x and y.
+template <typename TF>
+MakeUnsigned<TF> ComputeUlpDelta(TF x, TF y) {
+  static_assert(IsFloat<TF>(), "Only makes sense for floating-point");
+  using TU = MakeUnsigned<TF>;
+
+  // Handle -0 == 0 and infinities.
+  if (x == y) return 0;
+
+  // Consider "equal" if both are NaN, so we can verify an expected NaN.
+  // Needs a special case because there are many possible NaN representations.
+  if (std::isnan(x) && std::isnan(y)) return 0;
+
+  // NOTE: no need to check for differing signs; they will result in large
+  // differences, which is fine, and we avoid overflow.
+
+  const TU ux = BitCast<TU>(x);
+  const TU uy = BitCast<TU>(y);
+  // Avoid unsigned->signed cast: 2's complement is only guaranteed by C++20.
+  return std::max(ux, uy) - std::min(ux, uy);
+}
+
+template <typename T, HWY_IF_NOT_FLOAT(T)>
+HWY_NOINLINE bool IsEqual(const T expected, const T actual) {
+  return expected == actual;
+}
+
+template <typename T, HWY_IF_FLOAT(T)>
+HWY_NOINLINE bool IsEqual(const T expected, const T actual) {
+  return ComputeUlpDelta(expected, actual) <= 1;
+}
+
+// Compare non-vector, non-string T.
+template <typename T>
+HWY_NOINLINE void AssertEqual(const T expected, const T actual,
+                              const std::string& type_name,
+                              const char* filename = "", const int line = -1,
+                              const size_t lane = 0) {
+  if (!IsEqual(expected, actual)) {
+    char expected_str[100];
+    snprintf(expected_str, sizeof(expected_str), "%g", double(expected));
+    char actual_str[100];
+    snprintf(actual_str, sizeof(actual_str), "%g", double(actual));
+    NotifyFailure(filename, line, type_name.c_str(), lane, expected_str,
+                  actual_str);
+  }
+}
+
+static HWY_NOINLINE HWY_MAYBE_UNUSED void AssertStringEqual(
+    const char* expected, const char* actual, const char* filename = "",
+    const int line = -1, const size_t lane = 0) {
+  if (!hwy::StringsEqual(expected, actual)) {
+    NotifyFailure(filename, line, "string", lane, expected, actual);
+  }
+}
+
+// Compare expected vector to vector.
+template <class D, class V>
+HWY_NOINLINE void AssertVecEqual(D d, const V expected, const V actual,
+                                 const char* filename, const int line) {
+  using T = TFromD<D>;
+  const size_t N = Lanes(d);
+  auto expected_lanes = AllocateAligned<T>(N);
+  auto actual_lanes = AllocateAligned<T>(N);
+  Store(expected, d, expected_lanes.get());
+  Store(actual, d, actual_lanes.get());
+  for (size_t i = 0; i < N; ++i) {
+    if (!IsEqual(expected_lanes[i], actual_lanes[i])) {
+      fprintf(stderr, "\n\n");
+      Print(d, "expect", expected, i);
+      Print(d, "actual", actual, i);
+
+      char expected_str[100];
+      snprintf(expected_str, sizeof(expected_str), "%g",
+               double(expected_lanes[i]));
+      char actual_str[100];
+      snprintf(actual_str, sizeof(actual_str), "%g", double(actual_lanes[i]));
+
+      NotifyFailure(filename, line, hwy::TypeName(T(), N).c_str(), i,
+                    expected_str, actual_str);
+    }
+  }
+}
+
+// Compare expected lanes to vector.
+template <class D>
+HWY_NOINLINE void AssertVecEqual(D d, const TFromD<D>* expected, Vec<D> actual,
+                                 const char* filename, int line) {
+  AssertVecEqual(d, LoadU(d, expected), actual, filename, line);
+}
+
+template <class D>
+HWY_NOINLINE void AssertMaskEqual(D d, Mask<D> a, Mask<D> b,
+                                  const char* filename, int line) {
+  AssertVecEqual(d, VecFromMask(d, a), VecFromMask(d, b), filename, line);
+
+  const std::string type_name = TypeName(TFromD<D>(), Lanes(d));
+  AssertEqual(CountTrue(a), CountTrue(b), type_name, filename, line, 0);
+  AssertEqual(AllTrue(a), AllTrue(b), type_name, filename, line, 0);
+  AssertEqual(AllFalse(a), AllFalse(b), type_name, filename, line, 0);
+
+  // TODO(janwas): StoreMaskBits
+}
+
+template <class D>
+HWY_NOINLINE Mask<D> MaskTrue(const D d) {
+  const auto v0 = Zero(d);
+  return Eq(v0, v0);
+}
+
+template <class D>
+HWY_NOINLINE Mask<D> MaskFalse(const D d) {
+  // Lt is only for signed types and we cannot yet cast mask types.
+  return Eq(Zero(d), Set(d, 1));
+}
+
+#ifndef HWY_ASSERT_EQ
+
+#define HWY_ASSERT_EQ(expected, actual) \
+  AssertEqual(expected, actual, hwy::TypeName(expected, 1), __FILE__, __LINE__)
+
+#define HWY_ASSERT_STRING_EQ(expected, actual) \
+  AssertStringEqual(expected, actual, __FILE__, __LINE__)
+
+#define HWY_ASSERT_VEC_EQ(d, expected, actual) \
+  AssertVecEqual(d, expected, actual, __FILE__, __LINE__)
+
+#define HWY_ASSERT_MASK_EQ(d, expected, actual) \
+  AssertMaskEqual(d, expected, actual, __FILE__, __LINE__)
+
+#endif  // HWY_ASSERT_EQ
+
+// Helpers for instantiating tests with combinations of lane types / counts.
+
+// For all powers of two in [kMinLanes, N * kMinLanes] (so that recursion stops
+// at N == 0)
+template <typename T, size_t N, size_t kMinLanes, class Test>
+struct ForeachSizeR {
+  static void Do() {
+    static_assert(N != 0, "End of recursion");
+    Test()(T(), Simd<T, N * kMinLanes>());
+    ForeachSizeR<T, N / 2, kMinLanes, Test>::Do();
+  }
+};
+
+// Base case to stop the recursion.
+template <typename T, size_t kMinLanes, class Test>
+struct ForeachSizeR<T, 0, kMinLanes, Test> {
+  static void Do() {}
+};
+
+// These adapters may be called directly, or via For*Types:
+
+// Calls Test for all powers of two in [kMinLanes, HWY_LANES(T) / kDivLanes].
+template <class Test, size_t kDivLanes = 1, size_t kMinLanes = 1>
+struct ForPartialVectors {
+  template <typename T>
+  void operator()(T /*unused*/) const {
+#if HWY_TARGET == HWY_RVV
+    // Only m1..8 for now, can ignore kMaxLanes because HWY_*_LANES are full.
+    ForeachSizeR<T, 8 / kDivLanes, HWY_LANES(T), Test>::Do();
+#else
+    ForeachSizeR<T, HWY_LANES(T) / kDivLanes / kMinLanes, kMinLanes,
+                 Test>::Do();
+#endif
+  }
+};
+
+// Calls Test for all vectors that can be demoted log2(kFactor) times.
+template <class Test, size_t kFactor>
+struct ForDemoteVectors {
+  template <typename T>
+  void operator()(T /*unused*/) const {
+#if HWY_TARGET == HWY_RVV
+    // Only m1..8 for now.
+    ForeachSizeR<T, 8 / kFactor, kFactor * HWY_LANES(T), Test>::Do();
+#else
+    ForeachSizeR<T, HWY_LANES(T), 1, Test>::Do();
+#endif
+  }
+};
+
+// Calls Test for all powers of two in [128 bits, max bits].
+template <class Test>
+struct ForGE128Vectors {
+  template <typename T>
+  void operator()(T /*unused*/) const {
+#if HWY_TARGET == HWY_RVV
+    ForeachSizeR<T, 8, HWY_LANES(T), Test>::Do();
+#else
+    ForeachSizeR<T, HWY_LANES(T) / (16 / sizeof(T)), (16 / sizeof(T)),
+                 Test>::Do();
+
+#endif
+  }
+};
+
+// Calls Test for all vectors that can be expanded by kFactor.
+template <class Test, size_t kFactor = 2>
+struct ForExtendableVectors {
+  template <typename T>
+  void operator()(T /*unused*/) const {
+#if HWY_TARGET == HWY_RVV
+    ForeachSizeR<T, 8 / kFactor, HWY_LANES(T), Test>::Do();
+#else
+    ForeachSizeR<T, HWY_LANES(T) / kFactor / (16 / sizeof(T)), (16 / sizeof(T)),
+                 Test>::Do();
+#endif
+  }
+};
+
+// Calls Test for full vectors only.
+template <class Test>
+struct ForFullVectors {
+  template <typename T>
+  void operator()(T t) const {
+#if HWY_TARGET == HWY_RVV
+    ForeachSizeR<T, 8, HWY_LANES(T), Test>::Do();
+    (void)t;
+#else
+    Test()(t, HWY_FULL(T)());
+#endif
+  }
+};
+
+// Type lists to shorten call sites:
+
+template <class Func>
+void ForSignedTypes(const Func& func) {
+  func(int8_t());
+  func(int16_t());
+  func(int32_t());
+#if HWY_CAP_INTEGER64
+  func(int64_t());
+#endif
+}
+
+template <class Func>
+void ForUnsignedTypes(const Func& func) {
+  func(uint8_t());
+  func(uint16_t());
+  func(uint32_t());
+#if HWY_CAP_INTEGER64
+  func(uint64_t());
+#endif
+}
+
+template <class Func>
+void ForIntegerTypes(const Func& func) {
+  ForSignedTypes(func);
+  ForUnsignedTypes(func);
+}
+
+template <class Func>
+void ForFloatTypes(const Func& func) {
+  func(float());
+#if HWY_CAP_FLOAT64
+  func(double());
+#endif
+}
+
+template <class Func>
+void ForAllTypes(const Func& func) {
+  ForIntegerTypes(func);
+  ForFloatTypes(func);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#endif  // per-target include guard
--- a/third_party/highway/hwy/tests/test_util_test.cc
+++ b/third_party/highway/hwy/tests/test_util_test.cc
@ -0,0 +1,102 @@
+// Copyright 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "tests/test_util_test.cc"
+#include "hwy/foreach_target.h"
+#include "hwy/highway.h"
+#include "hwy/tests/test_util-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace hwy {
+namespace HWY_NAMESPACE {
+
+struct TestName {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T t, D d) {
+    char num[10];
+    std::string expected = IsFloat<T>() ? "f" : (IsSigned<T>() ? "i" : "u");
+    snprintf(num, sizeof(num), "%zu", sizeof(T) * 8);
+    expected += num;
+
+    const size_t N = Lanes(d);
+    if (N != 1) {
+      expected += 'x';
+      snprintf(num, sizeof(num), "%zu", N);
+      expected += num;
+    }
+    const std::string actual = TypeName(t, N);
+    if (expected != actual) {
+      NotifyFailure(__FILE__, __LINE__, expected.c_str(), 0, expected.c_str(),
+                    actual.c_str());
+    }
+  }
+};
+
+HWY_NOINLINE void TestAllName() { ForAllTypes(ForPartialVectors<TestName>()); }
+
+struct TestEqualInteger {
+  template <class T>
+  HWY_NOINLINE void operator()(T /*t*/) const {
+    HWY_ASSERT(IsEqual(T(0), T(0)));
+    HWY_ASSERT(IsEqual(T(1), T(1)));
+    HWY_ASSERT(IsEqual(T(-1), T(-1)));
+    HWY_ASSERT(IsEqual(LimitsMin<T>(), LimitsMin<T>()));
+
+    HWY_ASSERT(!IsEqual(T(0), T(1)));
+    HWY_ASSERT(!IsEqual(T(1), T(0)));
+    HWY_ASSERT(!IsEqual(T(1), T(-1)));
+    HWY_ASSERT(!IsEqual(T(-1), T(1)));
+    HWY_ASSERT(!IsEqual(LimitsMin<T>(), LimitsMax<T>()));
+    HWY_ASSERT(!IsEqual(LimitsMax<T>(), LimitsMin<T>()));
+  }
+};
+
+struct TestEqualFloat {
+  template <class T>
+  HWY_NOINLINE void operator()(T /*t*/) const {
+    HWY_ASSERT(IsEqual(T(0), T(0)));
+    HWY_ASSERT(IsEqual(T(1), T(1)));
+    HWY_ASSERT(IsEqual(T(-1), T(-1)));
+    HWY_ASSERT(IsEqual(MantissaEnd<T>(), MantissaEnd<T>()));
+
+    HWY_ASSERT(!IsEqual(T(0), T(1)));
+    HWY_ASSERT(!IsEqual(T(1), T(0)));
+    HWY_ASSERT(!IsEqual(T(1), T(-1)));
+    HWY_ASSERT(!IsEqual(T(-1), T(1)));
+    HWY_ASSERT(!IsEqual(LowestValue<T>(), HighestValue<T>()));
+    HWY_ASSERT(!IsEqual(HighestValue<T>(), LowestValue<T>()));
+  }
+};
+
+HWY_NOINLINE void TestAllEqual() {
+  ForIntegerTypes(TestEqualInteger());
+  ForFloatTypes(TestEqualFloat());
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace hwy
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace hwy {
+HWY_BEFORE_TEST(TestUtilTest);
+HWY_EXPORT_AND_TEST_P(TestUtilTest, TestAllName);
+HWY_EXPORT_AND_TEST_P(TestUtilTest, TestAllEqual);
+}  // namespace hwy
+#endif
--- a/third_party/highway/libhwy-test.pc.in
+++ b/third_party/highway/libhwy-test.pc.in
@ -0,0 +1,8 @@
+prefix=@CMAKE_INSTALL_PREFIX@
+includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
+
+Name: libhwy-test
+Description: Efficient and performance-portable SIMD wrapper, test helpers.
+Requires: gtest
+Version: @HWY_LIBRARY_VERSION@
+Cflags: -I${includedir}
--- a/third_party/highway/libhwy.pc.in
+++ b/third_party/highway/libhwy.pc.in
@ -0,0 +1,10 @@
+prefix=@CMAKE_INSTALL_PREFIX@
+exec_prefix=${prefix}
+libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@
+includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
+
+Name: libhwy
+Description: Efficient and performance-portable SIMD wrapper
+Version: @HWY_LIBRARY_VERSION@
+Libs: -L${libdir} -lhwy
+Cflags: -I${includedir}
--- a/third_party/highway/run_tests.bat
+++ b/third_party/highway/run_tests.bat
@ -0,0 +1,20 @@
+@echo off
+REM Switch directory of this batch file
+cd %~dp0
+
+if not exist build mkdir build
+
+cd build
+cmake .. -G Ninja || goto error
+ninja || goto error
+ctest -j || goto error
+
+cd ..
+echo Success
+goto end
+
+:error
+echo Failure
+exit /b 1
+
+:end
--- a/third_party/highway/run_tests.sh
+++ b/third_party/highway/run_tests.sh
@ -0,0 +1,15 @@
+#!/bin/bash
+
+# Switch to directory of this script
+MYDIR=$(dirname $(realpath "$0"))
+cd "${MYDIR}"
+
+# Exit if anything fails
+set -e
+
+mkdir -p build
+cd build
+cmake ..
+make -j
+ctest -j
+echo Success
--- a/third_party/jpeg-xl/.clang-format
+++ b/third_party/jpeg-xl/.clang-format
@ -0,0 +1,4 @@
+BasedOnStyle: Google
+IncludeCategories:
+  - Regex:           '^<hwy/'
+    Priority:        2
--- a/third_party/jpeg-xl/.clang-tidy
+++ b/third_party/jpeg-xl/.clang-tidy
@ -0,0 +1,70 @@
+# Disabled checks:
+# - google-readability-todo: We don't use the google TODO format.
+#
+# - modernize-deprecated-headers: We don't use std:: versions of the standard
+#   types and functions like size_t or printf, so we should include <stdio.h>
+#   instead <cstdio>.
+# - modernize-return-braced-init-list: this often doesn't improve readability.
+# - modernize-use-auto: is too aggressive towards using auto.
+# - modernize-use-default-member-init: with a mix of constructors and default
+#   member initialization this can be confusing if enforced.
+# - modernize-use-trailing-return-type: does not improve readability when used
+#   systematically.
+# - modernize-use-using: typedefs are ok.
+#
+# - readability-else-after-return: It doesn't always improve readability.
+# - readability-static-accessed-through-instance
+#   It is often more useful and readable to access a constant of a passed
+#   variable (like d.N) instead of using the type of the variable that could be
+#   long and complex.
+# - readability-uppercase-literal-suffix: we write 1.0f, not 1.0F.
+
+Checks: >-
+  bugprone-*,
+  clang-*,
+  -clang-diagnostic-unused-command-line-argument,
+  google-*,
+  modernize-*,
+  performance-*,
+  readability-*,
+  -google-readability-todo,
+  -modernize-deprecated-headers,
+  -modernize-return-braced-init-list,
+  -modernize-use-auto,
+  -modernize-use-default-member-init,
+  -modernize-use-trailing-return-type,
+  -modernize-use-using,
+  -readability-else-after-return,
+  -readability-function-cognitive-complexity,
+  -readability-static-accessed-through-instance,
+  -readability-uppercase-literal-suffix,
+
+
+WarningsAsErrors: >-
+  bugprone-argument-comment,
+  bugprone-macro-parentheses,
+  bugprone-suspicious-string-compare,
+  bugprone-use-after-move,
+  clang-*,
+  clang-analyzer-*,
+  -clang-diagnostic-unused-command-line-argument,
+  google-build-using-namespace,
+  google-explicit-constructor,
+  google-readability-braces-around-statements,
+  google-readability-namespace-comments,
+  modernize-use-override,
+  readability-inconsistent-declaration-parameter-name
+
+# We are only interested in the headers from this projects, excluding
+# third_party/ and build/.
+HeaderFilterRegex: '^.*/(lib|tools)/.*\.h$'
+
+CheckOptions:
+  - key:             readability-braces-around-statements.ShortStatementLines
+    value:           '2'
+  - key:             google-readability-braces-around-statements.ShortStatementLines
+    value:           '2'
+  - key:             readability-implicit-bool-conversion.AllowPointerConditions
+    value:           '1'
+  - key:             readability-implicit-bool-conversion.AllowIntegerConditions
+    value:           '1'
--- a/third_party/jpeg-xl/CMakeLists.txt
+++ b/third_party/jpeg-xl/CMakeLists.txt
@ -0,0 +1,334 @@
+# Copyright (c) the JPEG XL Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Ubuntu bionic ships with cmake 3.10.
+cmake_minimum_required(VERSION 3.10)
+
+# Honor VISIBILITY_INLINES_HIDDEN on all types of targets.
+if(POLICY CMP0063)
+  cmake_policy(SET CMP0063 NEW)
+endif()
+# Pass CMAKE_EXE_LINKER_FLAGS to CC and CXX compilers when testing if they work.
+if(POLICY CMP0065)
+  cmake_policy(SET CMP0065 NEW)
+endif()
+
+# Set PIE flags for POSITION_INDEPENDENT_CODE targets, added in 3.14.
+if(POLICY CMP0083)
+  cmake_policy(SET CMP0083 NEW)
+endif()
+
+project(JPEGXL LANGUAGES C CXX)
+
+include(CheckCXXSourceCompiles)
+check_cxx_source_compiles(
+   "int main() {
+      #if !defined(__EMSCRIPTEN__)
+      static_assert(false, \"__EMSCRIPTEN__ is not defined\");
+      #endif
+      return 0;
+    }"
+  JPEGXL_EMSCRIPTEN
+)
+
+message(STATUS "CMAKE_SYSTEM_PROCESSOR is ${CMAKE_SYSTEM_PROCESSOR}")
+include(CheckCXXCompilerFlag)
+check_cxx_compiler_flag("-fsanitize=fuzzer-no-link" CXX_FUZZERS_SUPPORTED)
+check_cxx_compiler_flag("-Xclang -mconstructor-aliases" CXX_CONSTRUCTOR_ALIASES_SUPPORTED)
+
+# Enabled PIE binaries by default if supported.
+include(CheckPIESupported OPTIONAL RESULT_VARIABLE CHECK_PIE_SUPPORTED)
+if(CHECK_PIE_SUPPORTED)
+  check_pie_supported(LANGUAGES CXX)
+  if(CMAKE_CXX_LINK_PIE_SUPPORTED)
+    set(CMAKE_POSITION_INDEPENDENT_CODE TRUE)
+  endif()
+endif()
+
+### Project build options:
+if(${CXX_FUZZERS_SUPPORTED})
+  # Enabled by default except on arm64, Windows and Apple builds.
+  set(ENABLE_FUZZERS_DEFAULT true)
+endif()
+find_package(PkgConfig)
+if(NOT APPLE AND NOT WIN32 AND NOT HAIKU AND ${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64")
+  pkg_check_modules(TCMallocMinimalVersionCheck QUIET IMPORTED_TARGET
+      libtcmalloc_minimal)
+  if(TCMallocMinimalVersionCheck_FOUND AND
+     NOT TCMallocMinimalVersionCheck_VERSION VERSION_EQUAL 2.8.0)
+    # Enabled by default except on Windows and Apple builds for
+    # tcmalloc != 2.8.0. tcmalloc 2.8.1 already has a fix for this issue.
+    set(ENABLE_TCMALLOC_DEFAULT true)
+  else()
+    message(STATUS
+        "tcmalloc version ${TCMallocMinimalVersionCheck_VERSION} -- "
+        "tcmalloc 2.8.0 disabled due to "
+        "https://github.com/gperftools/gperftools/issues/1204")
+  endif()
+endif()
+
+set(WARNINGS_AS_ERRORS_DEFAULT false)
+
+set(JPEGXL_ENABLE_FUZZERS ${ENABLE_FUZZERS_DEFAULT} CACHE BOOL
+    "Build JPEGXL fuzzer targets.")
+set(JPEGXL_ENABLE_DEVTOOLS false CACHE BOOL
+    "Build JPEGXL developer tools.")
+set(JPEGXL_ENABLE_MANPAGES true CACHE BOOL
+    "Build and install man pages for the command-line tools.")
+set(JPEGXL_ENABLE_BENCHMARK true CACHE BOOL
+    "Build JPEGXL benchmark tools.")
+set(JPEGXL_ENABLE_EXAMPLES true CACHE BOOL
+    "Build JPEGXL library usage examples.")
+set(JPEGXL_ENABLE_SJPEG true CACHE BOOL
+    "Build JPEGXL with support for encoding with sjpeg.")
+set(JPEGXL_ENABLE_OPENEXR true CACHE BOOL
+    "Build JPEGXL with support for OpenEXR if available.")
+set(JPEGXL_ENABLE_SKCMS true CACHE BOOL
+    "Build with skcms instead of lcms2.")
+set(JPEGXL_ENABLE_VIEWERS false CACHE BOOL
+    "Build JPEGXL viewer tools for evaluation.")
+set(JPEGXL_ENABLE_TCMALLOC ${ENABLE_TCMALLOC_DEFAULT} CACHE BOOL
+    "Build JPEGXL using gperftools (tcmalloc) allocator.")
+set(JPEGXL_ENABLE_PLUGINS false CACHE BOOL
+    "Build third-party plugings to support JPEG XL in other applications.")
+set(JPEGXL_ENABLE_COVERAGE false CACHE BOOL
+    "Enable code coverage tracking for libjxl. This also enables debug and disables optimizations.")
+set(JPEGXL_STATIC false CACHE BOOL
+    "Build tools as static binaries.")
+set(JPEGXL_WARNINGS_AS_ERRORS ${WARNINGS_AS_ERRORS_DEFAULT} CACHE BOOL
+    "Treat warnings as errors during compilation.")
+set(JPEGXL_DEP_LICENSE_DIR "" CACHE STRING
+    "Directory where to search for system dependencies \"copyright\" files.")
+set(JPEGXL_FORCE_NEON false CACHE BOOL
+    "Set flags to enable NEON in arm if not enabled by your toolchain.")
+
+
+# Force system dependencies.
+set(JPEGXL_FORCE_SYSTEM_GTEST false CACHE BOOL
+    "Force using system installed googletest (gtest/gmock) instead of third_party/googletest source.")
+set(JPEGXL_FORCE_SYSTEM_BROTLI false CACHE BOOL
+    "Force using system installed brotli instead of third_party/brotli source.")
+set(JPEGXL_FORCE_SYSTEM_HWY false CACHE BOOL
+    "Force using system installed highway (libhwy-dev) instead of third_party/highway source.")
+
+# Check minimum compiler versions. Older compilers are not supported and fail
+# with hard to understand errors.
+if (NOT ${CMAKE_C_COMPILER_ID} STREQUAL ${CMAKE_CXX_COMPILER_ID})
+  message(FATAL_ERROR "Different C/C++ compilers set: "
+          "${CMAKE_C_COMPILER_ID} vs ${CMAKE_CXX_COMPILER_ID}")
+endif()
+if (${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
+  # Android NDK's toolchain.cmake fakes the clang version in
+  # CMAKE_CXX_COMPILER_VERSION with an incorrect number, so ignore this.
+  if (NOT ${CMAKE_ANDROID_NDK_TOOLCHAIN_VERSION} MATCHES "clang"
+      AND ${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 6)
+    message(FATAL_ERROR
+      "Minimum Clang version required is Clang 6, please update.")
+  endif()
+elseif (${CMAKE_CXX_COMPILER_ID} MATCHES "GNU")
+  if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 7)
+    message(FATAL_ERROR
+      "Minimum GCC version required is 7, please update.")
+  endif()
+endif()
+
+message(STATUS
+    "Compiled IDs C:${CMAKE_C_COMPILER_ID}, C++:${CMAKE_CXX_COMPILER_ID}")
+
+# CMAKE_EXPORT_COMPILE_COMMANDS is used to generate the compilation database
+# used by clang-tidy.
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+if(JPEGXL_STATIC)
+  set(CMAKE_FIND_LIBRARY_SUFFIXES .a)
+  set(BUILD_SHARED_LIBS 0)
+  set(CMAKE_EXE_LINKER_FLAGS
+      "${CMAKE_EXE_LINKER_FLAGS} -static -static-libgcc -static-libstdc++")
+  if (MINGW)
+    # In MINGW libstdc++ uses pthreads directly. When building statically a
+    # program (regardless of whether the source code uses pthread or not) the
+    # toolchain will add stdc++ and pthread to the linking step but stdc++ will
+    # be linked statically while pthread will be linked dynamically.
+    # To avoid this and have pthread statically linked with need to pass it in
+    # the command line with "-Wl,-Bstatic -lpthread -Wl,-Bdynamic" but the
+    # linker will discard it if not used by anything else up to that point in
+    # the linker command line. If the program or any dependency don't use
+    # pthread directly -lpthread is discarded and libstdc++ (added by the
+    # toolchain later) will then use the dynamic version. For this we also need
+    # to pass -lstdc++ explicitly before -lpthread. For pure C programs -lstdc++
+    # will be discarded anyway.
+    # This adds these flags as dependencies for *all* targets. Adding this to
+    # CMAKE_EXE_LINKER_FLAGS instead would cause them to be included before any
+    # object files and therefore discarded.
+    link_libraries(-Wl,-Bstatic -lstdc++ -lpthread -Wl,-Bdynamic)
+  endif()  # MINGW
+endif()  # JPEGXL_STATIC
+
+if (MSVC)
+# TODO(janwas): add flags
+else ()
+
+# Global compiler flags for all targets here and in subdirectories.
+add_definitions(
+  # Avoid changing the binary based on the current time and date.
+  -D__DATE__="redacted"
+  -D__TIMESTAMP__="redacted"
+  -D__TIME__="redacted"
+)
+
+if("${JPEGXL_ENABLE_FUZZERS}" OR "${JPEGXL_ENABLE_COVERAGE}")
+  add_definitions(
+    -DJXL_ENABLE_FUZZERS
+  )
+endif()  # JPEGXL_ENABLE_FUZZERS
+
+# In CMake before 3.12 it is problematic to pass repeated flags like -Xclang.
+# For this reason we place them in CMAKE_CXX_FLAGS instead.
+# See https://gitlab.kitware.com/cmake/cmake/issues/15826
+
+# Machine flags.
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -funwind-tables")
+if (${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xclang -mrelax-all")
+endif()
+if ("${CXX_CONSTRUCTOR_ALIASES_SUPPORTED}")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xclang -mconstructor-aliases")
+endif()
+
+if(WIN32)
+# Not supported by clang-cl, but frame pointers are default on Windows
+else()
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-omit-frame-pointer")
+endif()
+
+# CPU flags - remove once we have NEON dynamic dispatch
+
+# TODO(janwas): this also matches M1, but only ARMv7 is intended/needed.
+if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm")
+if(JPEGXL_FORCE_NEON)
+# GCC requires these flags, otherwise __ARM_NEON is undefined.
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} \
+   -mfpu=neon-vfpv4 -mfloat-abi=hard")
+endif()
+endif()
+
+# Force build with optimizations in release mode.
+set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O2")
+
+add_compile_options(
+  # Ignore this to allow redefining __DATE__ and others.
+  -Wno-builtin-macro-redefined
+
+  # Global warning settings.
+  -Wall
+)
+
+if (JPEGXL_WARNINGS_AS_ERRORS)
+add_compile_options(-Werror)
+endif ()
+endif ()  # !MSVC
+
+include(GNUInstallDirs)
+
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_EXTENSIONS OFF)
+set(CMAKE_CXX_STANDARD_REQUIRED YES)
+
+add_subdirectory(third_party)
+
+set(THREADS_PREFER_PTHREAD_FLAG YES)
+find_package(Threads REQUIRED)
+
+# Copy the JXL license file to the output build directory.
+configure_file("${CMAKE_CURRENT_SOURCE_DIR}/LICENSE"
+               ${PROJECT_BINARY_DIR}/LICENSE.jpeg-xl COPYONLY)
+
+# Enable tests regardless of where are they defined.
+enable_testing()
+include(CTest)
+
+# Libraries.
+add_subdirectory(lib)
+
+if(BUILD_TESTING)
+# Script to run tests over the source code in bash.
+find_program (BASH_PROGRAM bash)
+if(BASH_PROGRAM)
+  add_test(
+    NAME bash_test
+    COMMAND ${BASH_PROGRAM} ${CMAKE_CURRENT_SOURCE_DIR}/bash_test.sh)
+endif()
+endif() # BUILD_TESTING
+
+# Documentation generated by Doxygen
+find_package(Doxygen)
+if(DOXYGEN_FOUND)
+set(DOXYGEN_GENERATE_HTML "YES")
+set(DOXYGEN_GENERATE_XML "NO")
+set(DOXYGEN_STRIP_FROM_PATH "${CMAKE_CURRENT_SOURCE_DIR}/include")
+set(DOXYGEN_USE_MDFILE_AS_MAINPAGE "README.md")
+set(DOXYGEN_WARN_AS_ERROR "YES")
+doxygen_add_docs(doc
+  "${CMAKE_CURRENT_SOURCE_DIR}/lib/include/jxl"
+  "${CMAKE_CURRENT_SOURCE_DIR}/doc/api.txt"
+  WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}"
+  COMMENT "Generating C API documentation")
+else()
+# Create a "doc" target for compatibility since "doc" is not otherwise added to
+# the build when doxygen is not installed.
+add_custom_target(doc false
+  COMMENT "Error: Can't generate doc since Doxygen not installed.")
+endif() # DOXYGEN_FOUND
+
+if(JPEGXL_ENABLE_MANPAGES)
+find_package(Python COMPONENTS Interpreter)
+if(Python_Interpreter_FOUND)
+  find_program(ASCIIDOC a2x)
+endif()
+if(NOT Python_Interpreter_FOUND OR "${ASCIIDOC}" STREQUAL "ASCIIDOC-NOTFOUND")
+  message(WARNING "asciidoc was not found, the man pages will not be installed.")
+else()
+  set(MANPAGE_FILES "")
+  set(MANPAGES "")
+  foreach(PAGE IN ITEMS cjxl djxl)
+    # Invoking the Python interpreter ourselves instead of running the a2x binary
+    # directly is necessary on MSYS2, otherwise it is run through cmd.exe which
+    # does not recognize it.
+    add_custom_command(
+      OUTPUT "${PAGE}.1"
+      COMMAND Python::Interpreter
+      ARGS "${ASCIIDOC}"
+        --format manpage --destination-dir="${CMAKE_CURRENT_BINARY_DIR}"
+        "${CMAKE_CURRENT_SOURCE_DIR}/doc/man/${PAGE}.txt"
+      MAIN_DEPENDENCY "${CMAKE_CURRENT_SOURCE_DIR}/doc/man/${PAGE}.txt")
+    list(APPEND MANPAGE_FILES "${CMAKE_CURRENT_BINARY_DIR}/${PAGE}.1")
+    list(APPEND MANPAGES "${PAGE}.1")
+  endforeach()
+  add_custom_target(manpages ALL DEPENDS ${MANPAGES})
+  install(FILES ${MANPAGE_FILES} DESTINATION share/man/man1)
+endif()
+endif()
+
+# Example usage code.
+if (${JPEGXL_ENABLE_EXAMPLES})
+add_subdirectory(examples)
+endif ()
+
+# Plugins for third-party software
+if (${JPEGXL_ENABLE_PLUGINS})
+add_subdirectory(plugins)
+endif ()
+
+# Binary tools
+add_subdirectory(tools)
--- a/third_party/jpeg-xl/CONTRIBUTING.md
+++ b/third_party/jpeg-xl/CONTRIBUTING.md
@ -0,0 +1,10 @@
+# How to Contribute
+
+We are currently unable to accept patches to this project, but we'd very much
+appreciate if you open a Gitlab issue for any bug reports/feature requests.
+We will be happy to investigate and hopefully fix any issues.
+
+# Community Guidelines
+
+This project follows
+[Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
--- a/third_party/jpeg-xl/CONTRIBUTORS
+++ b/third_party/jpeg-xl/CONTRIBUTORS
@ -0,0 +1,23 @@
+# This files lists individuals who made significant contributions to the JPEG XL
+# code base, such as design, adding features, performing experiments, ...
+# Small changes such as a small bugfix or fixing spelling errors are not
+# included. If you'd like to be included in this file thanks to a significant
+# contribution, feel free to send a pull request changing this file.
+Alex Deymo
+Alexander Rhatushnyak
+Evgenii Kliuchnikov
+Iulia-Maria Comșa
+Jan Wassenberg
+Jon Sneyers
+Jyrki Alakuijala
+Krzysztof Potempa
+Lode Vandevenne
+Luca Versari
+Martin Bruse
+Moritz Firsching
+Renata Khasanova
+Robert Obryk
+Sami Boukortt
+Sebastian Gomez-Gonzalez
+Thomas Fischbacher
+Zoltan Szabadka
--- a/third_party/jpeg-xl/LICENSE
+++ b/third_party/jpeg-xl/LICENSE
@ -0,0 +1,203 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
--- a/third_party/jpeg-xl/README.Haiku.md
+++ b/third_party/jpeg-xl/README.Haiku.md
@ -0,0 +1,20 @@
+## Disclaimer
+
+Haiku builds are not officially supported, i.e. the build might not work at all,
+some tests may fail and some sub-projects are excluded from build.
+
+This manual outlines Haiku-specific setup. For general building and testing
+instructions see "[README](README.md)" and
+"[Building and Testing changes](doc/building_and_testing.md)".
+
+## Dependencies
+
+```shell
+pkgman install llvm9_clang ninja cmake doxygen libjpeg_turbo_devel
+```
+
+## Building
+
+```shell
+TEST_STACK_LIMIT=none CMAKE_FLAGS="-I/boot/system/develop/tools/lib/gcc/x86_64-unknown-haiku/8.3.0/include/c++ -I/boot/system/develop/tools/lib/gcc/x86_64-unknown-haiku/8.3.0/include/c++/x86_64-unknown-haiku" CMAKE_SHARED_LINKER_FLAGS="-shared -Xlinker -soname=libjpegxl.so -lpthread" ./ci.sh opt
+```
--- a/third_party/jpeg-xl/README.OSX.md
+++ b/third_party/jpeg-xl/README.OSX.md
@ -0,0 +1,41 @@
+## Disclaimer
+
+OSX builds have "best effort" support, i.e. build might not work at all, some
+tests may fail and some sub-projects are excluded from build.
+
+This manual outlines OSX specific setup. For general building and testing
+instructions see "[README](README.md)" and
+"[Building and Testing changes](doc/building_and_testing.md)".
+
+[Homebrew](https://brew.sh/) is a popular package manager. JPEG XL library and
+binaries could be installed using it:
+
+```bash
+brew install jpeg-xl
+```
+
+## Dependencies
+
+Make sure that `brew doctor` does not report serious problems and up-to-date
+version of XCode is installed.
+
+Installing (actually, building) `clang` might take a couple hours.
+
+```bash
+brew install llvm
+```
+
+```bash
+brew install coreutils cmake giflib jpeg-turbo libpng ninja zlib
+```
+
+Before building the project check that `which clang` is
+`/usr/local/opt/llvm/bin/clang`, not the one provided by XCode. If not, update
+`PATH` environment variable.
+
+Also, setting `CMAKE_PREFIX_PATH` might be necessary for correct include paths
+resolving, e.g.:
+
+```bash
+export CMAKE_PREFIX_PATH=`brew --prefix giflib`:`brew --prefix jpeg-turbo`:`brew --prefix libpng`:`brew --prefix zlib`
+```
--- a/third_party/jpeg-xl/README.md
+++ b/third_party/jpeg-xl/README.md
@ -0,0 +1,224 @@
+# JPEG XL reference implementation
+
+<img src="doc/jxl.svg" width="100" align="right" alt="JXL logo">
+
+This repository contains a reference implementation of JPEG XL (encoder and
+decoder), called `libjxl`.
+
+JPEG XL is in the final stages of standardization and its codestream format is
+frozen.
+
+The libraries API, command line options and tools in this repository are subject
+to change, however files encoded with `cjxl` conform to the JPEG XL format
+specification and can be decoded with current and future `djxl` decoders or
+`libjxl` decoding library.
+
+## Quick start guide
+
+For more details and other workflows see the "Advanced guide" below.
+
+### Checking out the code
+
+```bash
+git clone https://gitlab.com/wg1/jpeg-xl.git --recursive
+```
+
+This repository uses git submodules to handle some third party dependencies
+under `third_party/`, that's why is important to pass `--recursive`. If you
+didn't check out with `--recursive`, or any submodule has changed, run:
+`git submodule update --init --recursive`.
+
+Important: If you downloaded a zip file or tarball from the web interface you
+won't get the needed submodules and the code will not compile. You can download
+these external dependencies from source running `./deps.sh`. The git workflow
+described above is recommended instead.
+
+### Installing dependencies
+
+Required dependencies for compiling the code, in a Debian/Ubuntu based
+distribution run:
+
+```bash
+sudo apt install cmake pkg-config libbrotli-dev
+```
+
+Optional dependencies for supporting other formats in the `cjxl`/`djxl` tools,
+in a Debian/Ubuntu based distribution run:
+
+```bash
+sudo apt install libgif-dev libjpeg-dev libopenexr-dev libpng-dev libwebp-dev
+```
+
+We recommend using a recent Clang compiler (version 7 or newer), for that
+install clang and set `CC` and `CXX` variables. For example, with clang-7:
+
+```bash
+sudo apt install clang-7
+export CC=clang-7 CXX=clang++-7
+```
+
+### Building
+
+```bash
+cd jpeg-xl
+mkdir build
+cd build
+cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTING=OFF ..
+cmake --build . -- -j$(nproc)
+```
+
+The encoder/decoder tools will be available in the `build/tools` directory.
+
+### <a name="installing"></a> Installing
+
+```bash
+sudo cmake --install .
+```
+
+### Basic encoder/decoder
+
+To encode a source image to JPEG XL with default settings:
+
+```bash
+build/tools/cjxl input.png output.jxl
+```
+
+For more settings run `build/tools/cjxl --help` or for a full list of options
+run `build/tools/cjxl -v -v --help`.
+
+To decode a JPEG XL file run:
+
+```bash
+build/tools/djxl input.jxl output.png
+```
+
+When possible `cjxl`/`djxl` are able to read/write the following
+image formats: .exr, .gif, .jpeg/.jpg, .pfm, .pgm/.ppm, .pgx, .png.
+
+### Benchmarking
+
+For speed benchmarks on single images in single or multi-threaded decoding
+`djxl` can print decoding speed information. See `djxl --help` for details
+on the decoding options and note that the output image is optional for
+benchmarking purposes.
+
+For a more comprehensive comparison of compression density between multiple
+options see "Benchmarking with benchmark_xl" section below.
+
+## Advanced guide
+
+### Building with Docker
+
+We build a common environment based on Debian/Ubuntu using Docker. Other
+systems may have different combinations of versions and dependencies that
+have not been tested and may not work. For those cases we recommend using the
+Docker environment as explained in the
+[step by step guide](doc/developing_in_docker.md).
+
+### Building JPEG XL for developers
+
+For experienced developers, we also provide build instructions for an [up to
+date Debian-based Linux](doc/developing_in_debian.md) and [64-bit
+Windows](doc/developing_in_windows.md). If you encounter any difficulties,
+please use Docker instead.
+
+## Benchmarking with benchmark_xl
+
+We recommend `build/tools/benchmark_xl` as a convenient method for reading
+images or image sequences, encoding them using various codecs (jpeg jxl png
+webp), decoding the result, and computing objective quality metrics. An example
+invocation is:
+
+```bash
+build/tools/benchmark_xl --input "/path/*.png" --codec jxl:wombat:d1,jxl:cheetah:d2
+```
+
+Multiple comma-separated codecs are allowed. The characters after : are
+parameters for the codec, separated by colons, in this case specifying maximum
+target psychovisual distances of 1 and 2 (higher implies lower quality) and
+the encoder effort (see below). Other common parameters are `r0.5` (target
+bitrate 0.5 bits per pixel) and `q92` (quality 92, on a scale of 0-100, where
+higher is better). The `jxl` codec supports the following additional parameters:
+
+Speed: `falcon`, `cheetah`, `hare`, `wombat`, `squirrel`, `kitten`, `tortoise`
+control the encoder effort in ascending order. This also affects memory usage:
+using lower effort will typically reduce memory consumption during encoding.
+
+*   `falcon` disables all of the following tools.
+*   `cheetah` enables coefficient reordering, context clustering, and heuristics
+    for selecting DCT sizes and quantization steps.
+*   `hare` enables Gaborish filtering, chroma from luma, and an initial estimate
+    of quantization steps.
+*   `wombat` enables error diffusion quantization and full DCT size selection
+    heuristics.
+*   `squirrel` (default) enables dots, patches, and spline detection, and full
+    context clustering.
+*   `kitten` optimizes the adaptive quantization for a psychovisual metric.
+*   `tortoise` enables a more thorough adaptive quantization search.
+
+Mode: JPEG XL has two modes. The default is Var-DCT mode, which is suitable for
+lossy compression. The other mode is Modular mode, which is suitable for lossless
+compression. Modular mode can also do lossy compression (e.g. `jxl:m:q50`).
+
+*   `m` activates modular mode.
+
+Other arguments to benchmark_xl include:
+
+*   `--save_compressed`: save codestreams to `output_dir`.
+*   `--save_decompressed`: save decompressed outputs to `output_dir`.
+*   `--output_extension`: selects the format used to output decoded images.
+*   `--num_threads`: number of codec instances that will independently
+    encode/decode images, or 0.
+*   `--inner_threads`: how many threads each instance should use for parallel
+    encoding/decoding, or 0.
+*   `--encode_reps`/`--decode_reps`: how many times to repeat encoding/decoding
+    each image, for more consistent measurements (we recommend 10).
+
+The benchmark output begins with a header:
+
+```
+Compr              Input    Compr            Compr       Compr  Decomp  Butteraugli
+Method            Pixels     Size              BPP   #    MP/s    MP/s     Distance    Error p norm           BPP*pnorm   Errors
+```
+
+`ComprMethod` lists each each comma-separated codec. `InputPixels` is the number
+of pixels in the input image. `ComprSize` is the codestream size in bytes and
+`ComprBPP` the bitrate. `Compr MP/s` and `Decomp MP/s` are the
+compress/decompress throughput, in units of Megapixels/second.
+`Butteraugli Distance` indicates the maximum psychovisual error in the decoded
+image (larger is worse). `Error p norm` is a similar summary of the psychovisual
+error, but closer to an average, giving less weight to small low-quality
+regions. `BPP*pnorm` is the product of `ComprBPP` and `Error p norm`, which is a
+figure of merit for the codec (lower is better). `Errors` is nonzero if errors
+occurred while loading or encoding/decoding the image.
+
+## License
+
+This software is available under Apache 2.0 license which can be found in the
+[LICENSE](LICENSE) file.
+
+## Additional documentation
+
+### Codec description
+
+*   [Introductory paper](https://www.spiedigitallibrary.org/proceedings/Download?fullDOI=10.1117%2F12.2529237) (open-access)
+*   [XL Overview](doc/xl_overview.md) - a brief introduction to the source code modules
+*   [JPEG XL white paper](http://ds.jpeg.org/whitepapers/jpeg-xl-whitepaper.pdf)
+*   [JPEG XL website](https://jpeg.org/jpegxl/)
+*   [Jon's JXL info page](https://sneyers.info/jxl/)
+
+### Development process
+*   [Docker setup - **start here**](doc/developing_in_docker.md)
+*   [Building on Debian](doc/developing_in_debian.md) - for experts only
+*   [Building on Windows](doc/developing_in_windows.md) - for experts only
+*   [More information on testing/build options](doc/building_and_testing.md)
+*   [Git guide for JPEG XL](doc/developing_in_gitlab.md) - for developers only
+*   [Building Web Assembly artifacts](doc/building_wasm.md)
+
+### Contact
+
+If you encounter a bug or other issue with the software, please open an Issue here.
+
+There is a [subreddit about JPEG XL](https://www.reddit.com/r/jpegxl/), and
+informal chatting with developers and early adopters of `libjxl` can be done on the
+[JPEG XL Discord server](https://discord.gg/DqkQgDRTFu).
--- a/third_party/jpeg-xl/SECURITY.md
+++ b/third_party/jpeg-xl/SECURITY.md
@ -0,0 +1,37 @@
+# Security and Vulnerability Policy for JPEG XL
+
+The current focus of the reference implementation is to provide a vehicle for
+evaluating the JPEG XL codec compression density, quality, features and its
+actual performance on different platforms. With this focus in mind we provide
+source code releases with improvements on performance and quality so developers
+can evaluate the codec.
+
+At this time, **we don't provide security and vulnerability support** for any
+of these releases. This means that the source code may contain bugs, including
+security bugs, that may be added or fixed between releases and will **not** be
+individually documented. All of these
+[releases](https://gitlab.com/wg1/jpeg-xl/-/releases) include the following
+note to that effect:
+
+* Note: This release is for evaluation purposes and may contain bugs, including
+  security bugs, that will *not* be individually documented when fixed. Always
+  prefer to use the latest release. Please provide feedback and report bugs
+  [here](https://gitlab.com/wg1/jpeg-xl/-/issues).
+
+To be clear, this means that because a release doesn't mention any CVE it
+doesn't mean that no security issues in previous versions were fixed. You should
+assume that any previous release contains security issues if that's a concern
+for your use case.
+
+This however doesn't impede you from evaluating the codec with your own trusted
+inputs, such as `.jxl` you encoded yourself, or when taking appropriate measures
+for your application like sandboxing if processing untrusted inputs.
+
+## Future plans
+
+To help our users and developers integrating this implementation into their
+software we plan to provide support for security and vulnerability tracking of
+this implementation in the future.
+
+When we can provide such support we will update this Policy with the details and
+expectations and clearly mention that fact in the release notes.
--- a/third_party/jpeg-xl/bash_test.sh
+++ b/third_party/jpeg-xl/bash_test.sh
@ -0,0 +1,229 @@
+#!/bin/bash
+# Copyright (c) the JPEG XL Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Tests implemented in bash. These typically will run checks about the source
+# code rather than the compiled one.
+
+MYDIR=$(dirname $(realpath "$0"))
+
+set -u
+
+test_includes() {
+  local ret=0
+  local f
+  for f in $(git ls-files | grep -E '(\.cc|\.cpp|\.h)$'); do
+    # Check that the public files (in lib/include/ directory) don't use the full
+    # path to the public header since users of the library will include the
+    # library as: #include "jxl/foobar.h".
+    if [[ "${f#lib/include/}" != "${f}" ]]; then
+      if grep -i -H -n -E '#include\s*[<"]lib/include/jxl' "$f" >&2; then
+        echo "Don't add \"include/\" to the include path of public headers." >&2
+        ret=1
+      fi
+    fi
+
+    if [[ "${f#third_party/}" == "$f" ]]; then
+      # $f is not in third_party/
+
+      # Check that local files don't use the full path to third_party/
+      # directory since the installed versions will not have that path.
+      # Add an exception for third_party/dirent.h.
+      if grep -v -F 'third_party/dirent.h' "$f" | \
+          grep -i -H -n -E '#include\s*[<"]third_party/' >&2 &&
+          [[ $ret -eq 0 ]]; then
+        cat >&2 <<EOF
+$f: Don't add third_party/ to the include path of third_party projects. This \
+makes it harder to use installed system libraries instead of the third_party/ \
+ones.
+EOF
+        ret=1
+      fi
+    fi
+
+  done
+  return ${ret}
+}
+
+test_include_collision() {
+  local ret=0
+  local f
+  for f in $(git ls-files | grep -E '^lib/include/'); do
+    local base=${f#lib/include/}
+    if [[ -e "lib/${base}" ]]; then
+      echo "$f: Name collision, both $f and lib/${base} exist." >&2
+      ret=1
+    fi
+  done
+  return ${ret}
+}
+
+test_copyright() {
+  local ret=0
+  local f
+  for f in $(git ls-files | grep -E '(\.cc|\.cpp|\.h|\.sh|\.m|\.py)$'); do
+    if [[ "${f#third_party/}" == "$f" ]]; then
+      # $f is not in third_party/
+      if ! head -n 10 "$f" |
+          grep -F 'Copyright (c) the JPEG XL Project' >/dev/null ; then
+        echo "$f: Missing Copyright blob near the top of the file." >&2
+        ret=1
+      fi
+    fi
+  done
+  return ${ret}
+}
+
+# Check that "dec_" code doesn't depend on "enc_" headers.
+test_dec_enc_deps() {
+  local ret=0
+  local f
+  for f in $(git ls-files | grep -E '/dec_'); do
+    if [[ "${f#third_party/}" == "$f" ]]; then
+      # $f is not in third_party/
+      if grep -n -H -E "#include.*/enc_" "$f" >&2; then
+        echo "$f: Don't include \"enc_*\" files from \"dec_*\" files." >&2
+        ret=1
+      fi
+    fi
+  done
+  return ${ret}
+}
+
+# Check for git merge conflict markers.
+test_merge_conflict() {
+  local ret=0
+  TEXT_FILES='(\.cc|\.cpp|\.h|\.sh|\.m|\.py|\.md|\.txt|\.cmake)$'
+  for f in $(git ls-files | grep -E "${TEXT_FILES}"); do
+    if grep -E '^<<<<<<< ' "$f"; then
+      echo "$f: Found git merge conflict marker. Please resolve." >&2
+      ret=1
+    fi
+  done
+  return ${ret}
+}
+
+# Check that the library and the package have the same version. This prevents
+# accidentally having them out of sync.
+get_version() {
+  local varname=$1
+  local line=$(grep -F "set(${varname} " lib/CMakeLists.txt | head -n 1)
+  [[ -n "${line}" ]]
+  line="${line#set(${varname} }"
+  line="${line%)}"
+  echo "${line}"
+}
+
+test_version() {
+  local major=$(get_version JPEGXL_MAJOR_VERSION)
+  local minor=$(get_version JPEGXL_MINOR_VERSION)
+  local patch=$(get_version JPEGXL_PATCH_VERSION)
+  # Check that the version is not empty
+  if [[ -z "${major}${minor}${patch}" ]]; then
+    echo "Couldn't parse version from CMakeLists.txt" >&2
+    return 1
+  fi
+  local pkg_version=$(head -n 1 debian/changelog)
+  # Get only the part between the first "jpeg-xl (" and the following ")".
+  pkg_version="${pkg_version#jpeg-xl (}"
+  pkg_version="${pkg_version%%)*}"
+  if [[ -z "${pkg_version}" ]]; then
+    echo "Couldn't parse version from debian package" >&2
+    return 1
+  fi
+
+  local lib_version="${major}.${minor}.${patch}"
+  lib_version="${lib_version%.0}"
+  if [[ "${pkg_version}" != "${lib_version}"* ]]; then
+    echo "Debian package version (${pkg_version}) doesn't match library" \
+      "version (${lib_version})." >&2
+    return 1
+  fi
+  return 0
+}
+
+# Check that the SHA versions in deps.sh matches the git submodules.
+test_deps_version() {
+  while IFS= read -r line; do
+    if [[ "${line:0:10}" != "[submodule" ]]; then
+      continue
+    fi
+    line="${line#[submodule \"}"
+    line="${line%\"]}"
+    local varname=$(tr '[:lower:]' '[:upper:]' <<< "${line}")
+    varname="${varname/\//_}"
+    if ! grep -F "${varname}=" deps.sh >/dev/null; then
+      # Ignoring submodule not in deps.sh
+      continue
+    fi
+    local deps_sha=$(grep -F "${varname}=" deps.sh | cut -f 2 -d '"')
+    [[ -n "${deps_sha}" ]]
+    local git_sha=$(git ls-tree -r HEAD "${line}" | cut -f 1 | cut -f 3 -d ' ')
+    if [[ "${deps_sha}" != "${git_sha}" ]]; then
+      cat >&2 <<EOF
+deps.sh: SHA for project ${line} is at ${deps_sha} but the git submodule is at
+${git_sha}. Please update deps.sh
+EOF
+      return 1
+    fi
+  done < .gitmodules
+}
+
+# Make sure that all the Fields objects are fuzzed directly.
+test_fuzz_fields() {
+  local ret=0
+  # List all the classes of the form "ClassName : public Fields".
+  # This doesn't catch class names that are too long to fit.
+  local field_classes=$( git ls-files |
+    grep -E '\.(cc|h)' | grep -v 'test\.cc$' |
+    xargs grep -h -o -E '\b[^ ]+ : public Fields' | cut -f 1 -d ' ')
+  local classname
+  for classname in ${field_classes}; do
+    if ! grep -E "\\b${classname}\\b" tools/fields_fuzzer.cc >/dev/null; then
+      cat >&2 <<EOF
+tools/fields_fuzzer.cc: Class ${classname} not found in the fields_fuzzer.
+EOF
+      ret=1
+    fi
+  done
+  return $ret
+}
+
+main() {
+  local ret=0
+  cd "${MYDIR}"
+
+  if ! git rev-parse >/dev/null 2>/dev/null; then
+    echo "Not a git checkout, skipping bash_test"
+    return 0
+  fi
+
+  IFS=$'\n'
+  for f in $(declare -F); do
+    local test_name=$(echo "$f" | cut -f 3 -d ' ')
+    # Runs all the local bash functions that start with "test_".
+    if [[ "${test_name}" == test_* ]]; then
+      echo "Test ${test_name}: Start"
+      if ${test_name}; then
+        echo "Test ${test_name}: PASS"
+      else
+        echo "Test ${test_name}: FAIL"
+        ret=1
+      fi
+    fi
+  done
+  return ${ret}
+}
+
+main "$@"
--- a/third_party/jpeg-xl/ci.sh
+++ b/third_party/jpeg-xl/ci.sh
--- a/third_party/jpeg-xl/debian/changelog
+++ b/third_party/jpeg-xl/debian/changelog
@ -0,0 +1,86 @@
+jpeg-xl (0.3.7) UNRELEASED; urgency=medium
+
+  * Bump JPEG XL version to 0.3.7.
+  * Fix a rounding issue in 8-bit decoding.
+
+ -- Sami Boukortt <sboukortt@google.com>  Mon, 29 Mar 2021 12:14:20 +0200
+
+jpeg-xl (0.3.6) UNRELEASED; urgency=medium
+
+  * Bump JPEG XL version to 0.3.6.
+  * Fix a bug that could result in the generation of invalid codestreams as
+    well as failure to decode valid streams.
+
+ -- Sami Boukortt <sboukortt@google.com>  Thu, 25 Mar 2021 17:40:58 +0100
+
+jpeg-xl (0.3.5) UNRELEASED; urgency=medium
+
+  * Bump JPEG XL version to 0.3.5.
+  * Memory usage improvements.
+  * New encode-time options for faster decoding at the cost of quality.
+  * Faster decoding to 8-bit output with the C API.
+  * GIMP plugin: avoid the sRGB conversion dialog for sRGB images, do not show
+    a console window on Windows.
+  * Various bug fixes.
+  * Man pages for cjxl and djxl.
+
+ -- Sami Boukortt <sboukortt@google.com>  Tue, 23 Mar 2021 15:20:44 +0100
+
+jpeg-xl (0.3.4) UNRELEASED; urgency=medium
+
+  * Bump JPEG XL version to 0.3.4.
+  * Improved box parsing.
+  * Improved metadata handling.
+  * Performance and memory usage improvements.
+
+ -- Sami Boukortt <sboukortt@google.com>  Tue, 16 Mar 2021 12:13:59 +0100
+
+jpeg-xl (0.3.3) UNRELEASED; urgency=medium
+
+  * Bump JPEG XL version to 0.3.3.
+  * Performance improvements for small images.
+  * Add a (flag-protected) non-high-precision mode with better speed.
+  * Significantly speed up the PQ EOTF.
+  * Allow optional HDR tone mapping in djxl (--tone_map, --display_nits).
+  * Change the behavior of djxl -j to make it consistent with cjxl (#153).
+  * Improve image quality.
+  * Improve EXIF handling.
+
+ -- Sami Boukortt <sboukortt@google.com>  Fri, 5 Mar 2021 19:15:26 +0100
+
+jpeg-xl (0.3.2) UNRELEASED; urgency=medium
+
+  * Bump JPEG XL version to 0.3.2.
+  * Fix embedded ICC encoding regression #149.
+
+ -- Alex Deymo <deymo@google.com>  Fri, 12 Feb 2021 21:00:12 +0100
+
+jpeg-xl (0.3.1) UNRELEASED; urgency=medium
+
+  * Bump JPEG XL version to 0.3.1.
+
+ -- Alex Deymo <deymo@google.com>  Tue, 09 Feb 2021 09:48:43 +0100
+
+jpeg-xl (0.3) UNRELEASED; urgency=medium
+
+  * Bump JPEG XL version to 0.3.
+
+ -- Alex Deymo <deymo@google.com>  Wed, 27 Jan 2021 22:36:32 +0100
+
+jpeg-xl (0.2) UNRELEASED; urgency=medium
+
+  * Bump JPEG XL version to 0.2.
+
+ -- Alex Deymo <deymo@google.com>  Wed, 23 Nov 2020 20:42:10 +0100
+
+jpeg-xl (0.1) UNRELEASED; urgency=medium
+
+  * JPEG XL format release candidate.
+
+ -- Alex Deymo <deymo@google.com>  Fri, 13 Nov 2020 17:42:24 +0100
+
+jpeg-xl (0.0.2-1) UNRELEASED; urgency=medium
+
+  * Initial debian package.
+
+ -- Alex Deymo <deymo@google.com>  Tue, 27 Oct 2020 15:27:59 +0100
--- a/third_party/jpeg-xl/debian/compat
+++ b/third_party/jpeg-xl/debian/compat
@ -0,0 +1 @@
+10
--- a/third_party/jpeg-xl/debian/control
+++ b/third_party/jpeg-xl/debian/control
@ -0,0 +1,61 @@
+Source: jpeg-xl
+Maintainer: JPEG XL Maintainers <jpegxl@google.com>
+Section: misc
+Priority: optional
+Standards-Version: 3.9.8
+Build-Depends: cmake,
+               debhelper (>= 9),
+               libbrotli-dev,
+               libgif-dev,
+               libgmock-dev,
+               libgoogle-perftools-dev,
+               libgtest-dev,
+               libhwy-dev,
+               libjpeg-dev,
+               libopenexr-dev,
+               libpng-dev,
+               libwebp-dev,
+               pkg-config,
+Homepage: https://gitlab.com/wg1/jpeg-xl
+Rules-Requires-Root: no
+
+Package: jxl
+Architecture: any
+Section: utils
+Depends: ${misc:Depends}, ${shlibs:Depends}
+Description: JPEG XL Image Coding System - "JXL" (command line utility)
+ The JPEG XL Image Coding System (ISO/IEC 18181) is a lossy and
+ lossless image compression format. It has a rich feature set and is
+ particularly optimized for responsive web environments, so that
+ content renders well on a wide range of devices. Moreover, it includes
+ several features that help transition from the legacy JPEG format.
+ .
+ This package installs the command line utilities.
+
+Package: libjxl-dev
+Architecture: any
+Section: libdevel
+Depends: libjxl (= ${binary:Version}), ${misc:Depends}
+Description: JPEG XL Image Coding System - "JXL" (development files)
+ The JPEG XL Image Coding System (ISO/IEC 18181) is a lossy and
+ lossless image compression format. It has a rich feature set and is
+ particularly optimized for responsive web environments, so that
+ content renders well on a wide range of devices. Moreover, it includes
+ several features that help transition from the legacy JPEG format.
+ .
+ This package installs development files.
+
+Package: libjxl
+Architecture: any
+Multi-Arch: same
+Section: libs
+Depends: ${shlibs:Depends}, ${misc:Depends}
+Pre-Depends: ${misc:Pre-Depends}
+Description: JPEG XL Image Coding System - "JXL" (shared libraries)
+ The JPEG XL Image Coding System (ISO/IEC 18181) is a lossy and
+ lossless image compression format. It has a rich feature set and is
+ particularly optimized for responsive web environments, so that
+ content renders well on a wide range of devices. Moreover, it includes
+ several features that help transition from the legacy JPEG format.
+ .
+ This package installs shared libraries.
--- a/third_party/jpeg-xl/debian/copyright
+++ b/third_party/jpeg-xl/debian/copyright
@ -0,0 +1,236 @@
+Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
+Upstream-Name: jpeg-xl
+
+Files: *
+Copyright: 2020 the JPEG XL Project
+License: Apache-2.0
+
+Files: third_party/sjpeg/*
+Copyright: 2017 Google, Inc
+License: Apache-2.0
+
+Files: third_party/lodepng/*
+Copyright: 2005-2018 Lode Vandevenne
+License: Zlib License
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the authors be held liable for any damages
+ arising from the use of this software.
+ .
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+ .
+     1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+ .
+     2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+ .
+     3. This notice may not be removed or altered from any source
+     distribution.
+
+Files: third_party/skcms/*
+Copyright: 2018 Google Inc.
+License: BSD-3-clause
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ .
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following disclaimer
+ in the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Google Inc. nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+ .
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Files: third_party/testdata/imagecompression.info/*
+Copyright: their respective owners.
+License: License without any prohibitive copyright restrictions.
+ See https://imagecompression.info/test_images/ for details.
+ .
+ These Images are available without any prohibitive copyright restrictions.
+ .
+ These images are (c) there respective owners. You are granted full
+ redistribution and publication rights on these images provided:
+ .
+ 1. The origin of the pictures must not be misrepresented; you must not claim
+ that you took the original pictures. If you use, publish or redistribute them,
+ an acknowledgment would be appreciated but is not required.
+ 2. Altered versions must be plainly marked as such, and must not be
+ misinterpreted as being the originals.
+ 3. No payment is required for distribution of this material, it must be
+ available freely under the conditions stated here. That is, it is prohibited to
+ sell the material.
+ 4. This notice may not be removed or altered from any distribution.
+
+Files: third_party/testdata/pngsuite/*
+Copyright: Willem van Schaik, 1996, 2011
+License: PngSuite License
+ See http://www.schaik.com/pngsuite/ for details.
+ .
+ Permission to use, copy, modify and distribute these images for any
+ purpose and without fee is hereby granted.
+
+Files: third_party/testdata/raw.pixls/*
+Copyright: their respective owners listed in https://raw.pixls.us/
+License: CC0-1.0
+
+Files: third_party/testdata/raw.pixls/*
+Copyright: their respective owners listed in https://www.wesaturate.com/
+License: CC0-1.0
+
+Files: third_party/testdata/wide-gamut-tests/
+Copyright: github.com/codelogic/wide-gamut-tests authors.
+License: Apache-2.0
+
+License: Apache-2.0
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ .
+      http://www.apache.org/licenses/LICENSE-2.0
+ .
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ .
+ On Debian systems, the complete text of the Apache License, Version 2
+ can be found in "/usr/share/common-licenses/Apache-2.0".
+
+License: CC0
+ Creative Commons Zero v1.0 Universal
+ .
+ CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE LEGAL
+ SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN ATTORNEY-CLIENT
+ RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS INFORMATION ON AN "AS-IS"
+ BASIS. CREATIVE COMMONS MAKES NO WARRANTIES REGARDING THE USE OF THIS
+ DOCUMENT OR THE INFORMATION OR WORKS PROVIDED HEREUNDER, AND DISCLAIMS
+ LIABILITY FOR DAMAGES RESULTING FROM THE USE OF THIS DOCUMENT OR THE
+ INFORMATION OR WORKS PROVIDED HEREUNDER.
+ .
+ Statement of Purpose
+ .
+ The laws of most jurisdictions throughout the world automatically confer
+ exclusive Copyright and Related Rights (defined below) upon the creator and
+ subsequent owner(s) (each and all, an "owner") of an original work of
+ authorship and/or a database (each, a "Work").
+ .
+ Certain owners wish to permanently relinquish those rights to a Work for the
+ purpose of contributing to a commons of creative, cultural and scientific
+ works ("Commons") that the public can reliably and without fear of later
+ claims of infringement build upon, modify, incorporate in other works, reuse
+ and redistribute as freely as possible in any form whatsoever and for any
+ purposes, including without limitation commercial purposes. These owners may
+ contribute to the Commons to promote the ideal of a free culture and the
+ further production of creative, cultural and scientific works, or to gain
+ reputation or greater distribution for their Work in part through the use
+ and efforts of others.
+ .
+ For these and/or other purposes and motivations, and without any expectation
+ of additional consideration or compensation, the person associating CC0 with
+ a Work (the "Affirmer"), to the extent that he or she is an owner of
+ Copyright and Related Rights in the Work, voluntarily elects to apply CC0 to
+ the Work and publicly distribute the Work under its terms, with knowledge of
+ his or her Copyright and Related Rights in the Work and the meaning and
+ intended legal effect of CC0 on those rights.
+ .
+ 1. Copyright and Related Rights. A Work made available under CC0 may be
+ protected by copyright and related or neighboring rights ("Copyright and
+ Related Rights"). Copyright and Related Rights include, but are not limited
+ to, the following:
+   i. the right to reproduce, adapt, distribute, perform, display,
+ communicate, and translate a Work;
+   ii. moral rights retained by the original author(s) and/or performer(s);
+   iii. publicity and privacy rights pertaining to a person's image or
+ likeness depicted in a Work;
+   iv. rights protecting against unfair competition in regards to a Work,
+ subject to the limitations in paragraph 4(a), below;
+   v. rights protecting the extraction, dissemination, use and reuse of data
+ in a Work;
+   vi. database rights (such as those arising under Directive 96/9/EC of the
+ European Parliament and of the Council of 11 March 1996 on the legal
+ protection of databases, and under any national implementation thereof,
+ including any amended or successor version of such directive); and
+   vii. other similar, equivalent or corresponding rights throughout the
+ world based on applicable law or treaty, and any national implementations
+ thereof.
+ .
+ 2. Waiver. To the greatest extent permitted by, but not in contravention of,
+ applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and
+ unconditionally waives, abandons, and surrenders all of Affirmer's Copyright
+ and Related Rights and associated claims and causes of action, whether now
+ known or unknown (including existing as well as future claims and causes of
+ action), in the Work (i) in all territories worldwide, (ii) for the maximum
+ duration provided by applicable law or treaty (including future time
+ extensions), (iii) in any current or future medium and for any number of
+ copies, and (iv) for any purpose whatsoever, including without limitation
+ commercial, advertising or promotional purposes (the "Waiver"). Affirmer
+ makes the Waiver for the benefit of each member of the public at large and
+ to the detriment of Affirmer's heirs and successors, fully intending that
+ such Waiver shall not be subject to revocation, rescission, cancellation,
+ termination, or any other legal or equitable action to disrupt the quiet
+ enjoyment of the Work by the public as contemplated by Affirmer's express
+ Statement of Purpose.
+ .
+ 3. Public License Fallback. Should any part of the Waiver for any reason be
+ judged legally invalid or ineffective under applicable law, then the Waiver
+ shall be preserved to the maximum extent permitted taking into account
+ Affirmer's express Statement of Purpose. In addition, to the extent the
+ Waiver is so judged Affirmer hereby grants to each affected person a
+ royalty-free, non transferable, non sublicensable, non exclusive,
+ irrevocable and unconditional license to exercise Affirmer's Copyright and
+ Related Rights in the Work (i) in all territories worldwide, (ii) for the
+ maximum duration provided by applicable law or treaty (including future time
+ extensions), (iii) in any current or future medium and for any number of
+ copies, and (iv) for any purpose whatsoever, including without limitation
+ commercial, advertising or promotional purposes (the "License"). The License
+ shall be deemed effective as of the date CC0 was applied by Affirmer to the
+ Work. Should any part of the License for any reason be judged legally
+ invalid or ineffective under applicable law, such partial invalidity or
+ ineffectiveness shall not invalidate the remainder of the License, and in
+ such case Affirmer hereby affirms that he or she will not (i) exercise any
+ of his or her remaining Copyright and Related Rights in the Work or (ii)
+ assert any associated claims and causes of action with respect to the Work,
+ in either case contrary to Affirmer's express Statement of Purpose.
+ .
+ 4. Limitations and Disclaimers.
+   a. No trademark or patent rights held by Affirmer are waived, abandoned,
+ surrendered, licensed or otherwise affected by this document.
+   b. Affirmer offers the Work as-is and makes no representations or
+ warranties of any kind concerning the Work, express, implied, statutory or
+ otherwise, including without limitation warranties of title,
+ merchantability, fitness for a particular purpose, non infringement, or the
+ absence of latent or other defects, accuracy, or the present or absence of
+ errors, whether or not discoverable, all to the greatest extent permissible
+ under applicable law.
+   c. Affirmer disclaims responsibility for clearing rights of other persons
+ that may apply to the Work or any use thereof, including without limitation
+ any person's Copyright and Related Rights in the Work. Further, Affirmer
+ disclaims responsibility for obtaining any necessary consents, permissions
+ or other rights required for any use of the Work.
+   d. Affirmer understands and acknowledges that Creative Commons is not a
+ party to this document and has no duty or obligation with respect to this
+ CC0 or use of the Work.
+ .
+ For more information, please see:
+ http://creativecommons.org/publicdomain/zero/1.0/>
+
--- a/third_party/jpeg-xl/debian/jxl.install
+++ b/third_party/jpeg-xl/debian/jxl.install
@ -0,0 +1 @@
+debian/tmp/usr/bin/*
--- a/third_party/jpeg-xl/debian/libjxl-dev.install
+++ b/third_party/jpeg-xl/debian/libjxl-dev.install
@ -0,0 +1,4 @@
+debian/tmp/usr/include/jxl/*.h
+debian/tmp/usr/lib/*/*.a
+debian/tmp/usr/lib/*/*.so
+debian/tmp/usr/lib/*/pkgconfig/*.pc
--- a/third_party/jpeg-xl/debian/libjxl.install
+++ b/third_party/jpeg-xl/debian/libjxl.install
@ -0,0 +1 @@
+debian/tmp/usr/lib/*/libjxl*.so.*
--- a/third_party/jpeg-xl/debian/rules
+++ b/third_party/jpeg-xl/debian/rules
@ -0,0 +1,12 @@
+#!/usr/bin/make -f
+%:
+	dh $@ --buildsystem=cmake
+
+override_dh_auto_configure:
+	# TODO(deymo): Remove the DCMAKE_BUILD_TYPE once builds without NDEBUG
+	# are as useful as Release builds.
+	dh_auto_configure -- \
+	  -DCMAKE_BUILD_TYPE=RelWithDebInfo \
+	  -DJPEGXL_FORCE_SYSTEM_GTEST=ON \
+	  -DJPEGXL_FORCE_SYSTEM_BROTLI=ON \
+	  -DJPEGXL_FORCE_SYSTEM_HWY=ON
--- a/third_party/jpeg-xl/debian/source/format
+++ b/third_party/jpeg-xl/debian/source/format
@ -0,0 +1 @@
+3.0 (quilt)
--- a/third_party/jpeg-xl/deps.sh
+++ b/third_party/jpeg-xl/deps.sh
@ -0,0 +1,89 @@
+#!/usr/bin/env bash
+# Copyright (c) the JPEG XL Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This file downloads the dependencies needed to build JPEG XL into third_party.
+# These dependencies are normally pulled by gtest.
+
+set -eu
+
+MYDIR=$(dirname $(realpath "$0"))
+
+# Git revisions we use for the given submodules. Update these whenever you
+# update a git submodule.
+THIRD_PARTY_HIGHWAY="ca1a57c342cd815053abfcffa29b44eaead4f20b"
+THIRD_PARTY_LODEPNG="48e5364ef48ec2408f44c727657ac1b6703185f8"
+THIRD_PARTY_SKCMS="64374756e03700d649f897dbd98c95e78c30c7da"
+THIRD_PARTY_SJPEG="868ab558fad70fcbe8863ba4e85179eeb81cc840"
+
+# Download the target revision from GitHub.
+download_github() {
+  local path="$1"
+  local project="$2"
+
+  local varname="${path^^}"
+  varname="${varname/\//_}"
+  local sha
+  eval "sha=\${${varname}}"
+
+  local down_dir="${MYDIR}/downloads"
+  local local_fn="${down_dir}/${sha}.tar.gz"
+  if [[ -e "${local_fn}" && -d "${MYDIR}/${path}" ]]; then
+    echo "${path} already up to date." >&2
+    return 0
+  fi
+
+  local url
+  local strip_components=0
+  if [[ "${project:0:4}" == "http" ]]; then
+    # "project" is a googlesource.com base url.
+    url="${project}${sha}.tar.gz"
+  else
+    # GitHub files have a top-level directory
+    strip_components=1
+    url="https://github.com/${project}/tarball/${sha}"
+  fi
+
+  echo "Downloading ${path} version ${sha}..." >&2
+  mkdir -p "${down_dir}"
+  curl -L --show-error -o "${local_fn}.tmp" "${url}"
+  mkdir -p "${MYDIR}/${path}"
+  tar -zxf "${local_fn}.tmp" -C "${MYDIR}/${path}" \
+    --strip-components="${strip_components}"
+  mv "${local_fn}.tmp" "${local_fn}"
+}
+
+
+main() {
+  if git -C "${MYDIR}" rev-parse; then
+    cat >&2 <<EOF
+Currenty directory is a git repository, downloading dependencies via git:
+
+  git submodule update --init --recursive
+
+EOF
+    git -C "${MYDIR}" submodule update --init --recursive
+    return 0
+  fi
+
+  # Sources downloaded from a tarball.
+  download_github third_party/highway google/highway
+  download_github third_party/lodepng lvandeve/lodepng
+  download_github third_party/sjpeg webmproject/sjpeg
+  download_github third_party/skcms \
+    "https://skia.googlesource.com/skcms/+archive/"
+  echo "Done."
+}
+
+main "$@"
--- a/third_party/jpeg-xl/docker/Dockerfile.jpegxl-builder
+++ b/third_party/jpeg-xl/docker/Dockerfile.jpegxl-builder
@ -0,0 +1,30 @@
+# Copyright (c) the JPEG XL Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Build an Ubuntu-based docker image with the installed software needed to
+# develop and test JPEG XL.
+
+FROM ubuntu:bionic
+
+# Set a prompt for when using it locally.
+ENV PS1="\[\033[01;33m\]\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ "
+
+COPY scripts/99_norecommends /etc/apt/apt.conf.d/99_norecommends
+
+COPY scripts /jpegxl_scripts
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+RUN /jpegxl_scripts/jpegxl_builder.sh && \
+  rm -rf /jpegxl_scripts
--- a/third_party/jpeg-xl/docker/Dockerfile.jpegxl-builder-run-aarch64
+++ b/third_party/jpeg-xl/docker/Dockerfile.jpegxl-builder-run-aarch64
@ -0,0 +1,46 @@
+# Copyright (c) the JPEG XL Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Build an Ubuntu-based docker image for aarch64 with the installed software
+# needed to run JPEG XL. This is only useful when running on actual aarch64
+# hardware.
+
+FROM arm64v8/ubuntu:bionic
+
+COPY scripts/99_norecommends /etc/apt/apt.conf.d/99_norecommends
+
+# Set a prompt for when using it locally.
+ENV PS1="\[\033[01;33m\]\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ "
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+RUN set -ex; \
+  apt-get update -y; \
+  apt-get install -y \
+    bsdmainutils \
+    cmake \
+    curl \
+    ca-certificates \
+    extra-cmake-modules \
+    git \
+    imagemagick \
+    libjpeg8 \
+    libgif7 \
+    libgoogle-perftools4 \
+    libopenexr22 \
+    libpng16-16 \
+    libqt5x11extras5 \
+    libsdl2-2.0-0 \
+    parallel; \
+  rm -rf /var/lib/apt/lists/*;
--- a/third_party/jpeg-xl/docker/README.md
+++ b/third_party/jpeg-xl/docker/README.md
@ -0,0 +1,7 @@
+### Docker container infrastructure for JPEG XL
+
+This directory contains the requirements to build a docker image for the
+JPEG XL project builder.
+
+Docker images need to be created and upload manually. See ./build.sh for
+details.
--- a/third_party/jpeg-xl/docker/build.sh
+++ b/third_party/jpeg-xl/docker/build.sh
@ -0,0 +1,92 @@
+#!/usr/bin/env bash
+# Copyright (c) the JPEG XL Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -eu
+
+MYDIR=$(dirname $(realpath "$0"))
+
+declare -a TARGETS
+
+load_targets() {
+  # Built-in OSX "find" does not support "-m".
+  FIND=$(which "gfind" || which "find")
+  for f in $(${FIND} -maxdepth 1 -name 'Dockerfile.*' | sort); do
+    local target="${f#*Dockerfile.}"
+    TARGETS+=("${target}")
+  done
+}
+
+usage() {
+    cat >&2 <<EOF
+Use: $1 [targets]
+
+Available targets:
+  * all
+EOF
+  for target in "${TARGETS[@]}"; do
+    echo "  * ${target}" >&2
+  done
+}
+
+build_target() {
+  local target="$1"
+
+  local dockerfile="${MYDIR}/Dockerfile.${target}"
+  # JPEG XL builder images are stored in the gcr.io/jpegxl project.
+  local tag="gcr.io/jpegxl/${target}"
+
+  echo "Building ${target}"
+  if ! sudo docker build --no-cache -t "${tag}" -f "${dockerfile}" "${MYDIR}" \
+      >"${target}.log" 2>&1; then
+    echo "${target} failed. See ${target}.log" >&2
+  else
+    echo "Done, to upload image run:" >&2
+    echo "  sudo docker push ${tag}"
+    if [[ "${JPEGXL_PUSH:-}" == "1" ]]; then
+      echo "sudo docker push ${tag}" >&2
+      sudo docker push "${tag}"
+      # The RepoDigest is only created after it is pushed.
+      local fulltag=$(sudo docker inspect --format="{{.RepoDigests}}" "${tag}")
+      fulltag="${fulltag#[}"
+      fulltag="${fulltag%]}"
+      echo "Updating .gitlab-ci.yml to ${fulltag}" >&2
+      sed -E "s;${tag}@sha256:[0-9a-f]+;${fulltag};" \
+        -i "${MYDIR}/../.gitlab-ci.yml"
+    fi
+  fi
+}
+
+main() {
+  cd "${MYDIR}"
+  local target="${1:-}"
+
+  load_targets
+  if [[ -z "${target}" ]]; then
+    usage $0
+    exit 1
+  fi
+
+  if [[ "${target}" == "all" ]]; then
+    for target in "${TARGETS[@]}"; do
+      build_target "${target}"
+    done
+  else
+    for target in "$@"; do
+      build_target "${target}"
+    done
+  fi
+}
+
+main "$@"
--- a/third_party/jpeg-xl/docker/scripts/99_norecommends
+++ b/third_party/jpeg-xl/docker/scripts/99_norecommends
@ -0,0 +1 @@
+APT::Install-Recommends "false";
--- a/third_party/jpeg-xl/docker/scripts/binutils_align_fix.patch
+++ b/third_party/jpeg-xl/docker/scripts/binutils_align_fix.patch
@ -0,0 +1,28 @@
+Description: fix lack of alignment in relocations (crashes on mingw)
+See https://sourceware.org/git/?p=binutils-gdb.git;a=patch;h=73af69e74974eaa155eec89867e3ccc77ab39f6d
+From: Marc <marc@groundctl.com>
+Date: Fri, 9 Nov 2018 11:13:50 +0000
+Subject: [PATCH] Allow for compilers that do not produce aligned .rdat
+ sections in PE format files.
+
+--- a/upstream/ld/scripttempl/pe.sc	2020-05-12 18:45:12.000000000 +0200
+++ b/upstream/ld/scripttempl/pe.sc	2020-05-12 18:47:12.000000000 +0200
+@@ -143,6 +143,7 @@
+   .rdata ${RELOCATING+BLOCK(__section_alignment__)} :
+   {
+     ${R_RDATA}
+    . = ALIGN(4);
+     ${RELOCATING+__rt_psrelocs_start = .;}
+     ${RELOCATING+KEEP(*(.rdata_runtime_pseudo_reloc))}
+     ${RELOCATING+__rt_psrelocs_end = .;}
+--- a/upstream/ld/scripttempl/pep.sc	2020-05-12 18:45:19.000000000 +0200
+++ b/upstream/ld/scripttempl/pep.sc	2020-05-12 18:47:18.000000000 +0200
+@@ -143,6 +143,7 @@
+   .rdata ${RELOCATING+BLOCK(__section_alignment__)} :
+   {
+     ${R_RDATA}
+    . = ALIGN(4);
+     ${RELOCATING+__rt_psrelocs_start = .;}
+     ${RELOCATING+KEEP(*(.rdata_runtime_pseudo_reloc))}
+     ${RELOCATING+__rt_psrelocs_end = .;}
+
--- a/third_party/jpeg-xl/docker/scripts/emsdk_install.sh
+++ b/third_party/jpeg-xl/docker/scripts/emsdk_install.sh
@ -0,0 +1,46 @@
+#!/usr/bin/env bash
+# Copyright (c) the JPEG XL Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+EMSDK_URL="https://github.com/emscripten-core/emsdk/archive/master.tar.gz"
+EMSDK_DIR="/opt/emsdk"
+
+EMSDK_RELEASE="2.0.4"
+
+set -eu -x
+
+# Temporary files cleanup hooks.
+CLEANUP_FILES=()
+cleanup() {
+  if [[ ${#CLEANUP_FILES[@]} -ne 0 ]]; then
+    rm -fr "${CLEANUP_FILES[@]}"
+  fi
+}
+trap "{ set +x; } 2>/dev/null; cleanup" INT TERM EXIT
+
+main() {
+  local workdir=$(mktemp -d --suffix=emsdk)
+  CLEANUP_FILES+=("${workdir}")
+
+  local emsdktar="${workdir}/emsdk.tar.gz"
+  curl --output "${emsdktar}" "${EMSDK_URL}" --location
+  mkdir -p "${EMSDK_DIR}"
+  tar -zxf "${emsdktar}" -C "${EMSDK_DIR}" --strip-components=1
+
+  cd "${EMSDK_DIR}"
+  ./emsdk install --shallow "${EMSDK_RELEASE}"
+  ./emsdk activate --embedded "${EMSDK_RELEASE}"
+}
+
+main "$@"
--- a/third_party/jpeg-xl/docker/scripts/jpegxl_builder.sh
+++ b/third_party/jpeg-xl/docker/scripts/jpegxl_builder.sh
@ -0,0 +1,515 @@
+#!/usr/bin/env bash
+# Copyright (c) the JPEG XL Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Main entry point for all the Dockerfile for jpegxl-builder. This centralized
+# file helps sharing code and configuration between Dockerfiles.
+
+set -eux
+
+MYDIR=$(dirname $(realpath "$0"))
+
+# libjpeg-turbo.
+JPEG_TURBO_RELEASE="2.0.4"
+JPEG_TURBO_URL="https://github.com/libjpeg-turbo/libjpeg-turbo/archive/${JPEG_TURBO_RELEASE}.tar.gz"
+JPEG_TURBO_SHA256="7777c3c19762940cff42b3ba4d7cd5c52d1671b39a79532050c85efb99079064"
+
+# zlib (dependency of libpng)
+ZLIB_RELEASE="1.2.11"
+ZLIB_URL="https://www.zlib.net/zlib-${ZLIB_RELEASE}.tar.gz"
+ZLIB_SHA256="c3e5e9fdd5004dcb542feda5ee4f0ff0744628baf8ed2dd5d66f8ca1197cb1a1"
+# The name in the .pc and the .dll generated don't match in zlib for Windows
+# because they use different .dll names in Windows. We avoid that by defining
+# UNIX=1. We also install all the .dll files to ${prefix}/lib instead of the
+# default ${prefix}/bin.
+ZLIB_FLAGS='-DUNIX=1 -DINSTALL_PKGCONFIG_DIR=/${CMAKE_INSTALL_PREFIX}/lib/pkgconfig -DINSTALL_BIN_DIR=/${CMAKE_INSTALL_PREFIX}/lib'
+
+# libpng
+LIBPNG_RELEASE="1.6.37"
+LIBPNG_URL="https://github.com/glennrp/libpng/archive/v${LIBPNG_RELEASE}.tar.gz"
+LIBPNG_SHA256="ca74a0dace179a8422187671aee97dd3892b53e168627145271cad5b5ac81307"
+
+# giflib
+GIFLIB_RELEASE="5.2.1"
+GIFLIB_URL="https://netcologne.dl.sourceforge.net/project/giflib/giflib-${GIFLIB_RELEASE}.tar.gz"
+GIFLIB_SHA256="31da5562f44c5f15d63340a09a4fd62b48c45620cd302f77a6d9acf0077879bd"
+
+# A patch needed to compile GIFLIB in mingw.
+GIFLIB_PATCH_URL="https://github.com/msys2/MINGW-packages/raw/3afde38fcee7b3ba2cafd97d76cca8f06934504f/mingw-w64-giflib/001-mingw-build.patch"
+GIFLIB_PATCH_SHA256="2b2262ddea87fc07be82e10aeb39eb699239f883c899aa18a16e4d4e40af8ec8"
+
+# webp
+WEBP_RELEASE="1.0.2"
+WEBP_URL="https://codeload.github.com/webmproject/libwebp/tar.gz/v${WEBP_RELEASE}"
+WEBP_SHA256="347cf85ddc3497832b5fa9eee62164a37b249c83adae0ba583093e039bf4881f"
+
+# Google benchmark
+BENCHMARK_RELEASE="1.5.2"
+BENCHMARK_URL="https://github.com/google/benchmark/archive/v${BENCHMARK_RELEASE}.tar.gz"
+BENCHMARK_SHA256="dccbdab796baa1043f04982147e67bb6e118fe610da2c65f88912d73987e700c"
+BENCHMARK_FLAGS="-DGOOGLETEST_PATH=${MYDIR}/../../third_party/googletest"
+# attribute(format(__MINGW_PRINTF_FORMAT, ...)) doesn't work in our
+# environment, so we disable the warning.
+BENCHMARK_FLAGS="-DCMAKE_BUILD_TYPE=Release -DBENCHMARK_ENABLE_TESTING=OFF \
+  -DCMAKE_CXX_FLAGS=-Wno-ignored-attributes \
+  -DCMAKE_POSITION_INDEPENDENT_CODE=ON"
+
+# V8
+V8_VERSION="8.7.230"
+
+# Temporary files cleanup hooks.
+CLEANUP_FILES=()
+cleanup() {
+  if [[ ${#CLEANUP_FILES[@]} -ne 0 ]]; then
+    rm -fr "${CLEANUP_FILES[@]}"
+  fi
+}
+trap "{ set +x; } 2>/dev/null; cleanup" INT TERM EXIT
+
+# List of Ubuntu arch names supported by the builder (such as "i386").
+LIST_ARCHS=(
+  amd64
+  i386
+  arm64
+  armhf
+)
+
+# List of target triplets supported by the builder.
+LIST_TARGETS=(
+  x86_64-linux-gnu
+  i686-linux-gnu
+  arm-linux-gnueabihf
+  aarch64-linux-gnu
+)
+LIST_MINGW_TARGETS=(
+  i686-w64-mingw32
+  x86_64-w64-mingw32
+)
+LIST_WASM_TARGETS=(
+  wasm32
+)
+
+# Setup the apt repositories and supported architectures.
+setup_apt() {
+  apt-get update -y
+  apt-get install -y curl gnupg ca-certificates
+
+  apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 1E9377A2BA9EF27F
+
+  # node sources.
+  cat >/etc/apt/sources.list.d/nodesource.list <<EOF
+  deb https://deb.nodesource.com/node_14.x bionic main
+  deb-src https://deb.nodesource.com/node_14.x bionic main
+EOF
+  curl -s https://deb.nodesource.com/gpgkey/nodesource.gpg.key | apt-key add -
+
+  local port_list=()
+  local main_list=()
+  local ubarch
+  for ubarch in "${LIST_ARCHS[@]}"; do
+    if [[ "${ubarch}" != "amd64" && "${ubarch}" != "i386" ]]; then
+      # other archs are not part of the main mirrors, but available in
+      # ports.ubuntu.com.
+      port_list+=("${ubarch}")
+    else
+      main_list+=("${ubarch}")
+    fi
+    # Add the arch to the system.
+    if [[ "${ubarch}" != "amd64" ]]; then
+      dpkg --add-architecture "${ubarch}"
+    fi
+  done
+
+  # Update the sources.list with the split of supported architectures.
+  local bkplist="/etc/apt/sources.list.bkp"
+  [[ -e "${bkplist}" ]] || \
+    mv /etc/apt/sources.list "${bkplist}"
+
+  local newlist="/etc/apt/sources.list.tmp"
+  rm -f "${newlist}"
+  port_list=$(echo "${port_list[@]}" | tr ' ' ,)
+  if [[ -n "${port_list}" ]]; then
+    local port_url="http://ports.ubuntu.com/ubuntu-ports/"
+    grep -v -E '^#' "${bkplist}" |
+      sed -E "s;^deb (http[^ ]+) (.*)\$;deb [arch=${port_list}] ${port_url} \\2;" \
+      >>"${newlist}"
+  fi
+
+  main_list=$(echo "${main_list[@]}" | tr ' ' ,)
+  grep -v -E '^#' "${bkplist}" |
+    sed -E "s;^deb (http[^ ]+) (.*)\$;deb [arch=${main_list}] \\1 \\2\ndeb-src [arch=${main_list}] \\1 \\2;" \
+    >>"${newlist}"
+  mv "${newlist}" /etc/apt/sources.list
+}
+
+install_pkgs() {
+  packages=(
+    # Native compilers (minimum for SIMD is clang-7)
+    clang-7 clang-format-7 clang-tidy-7
+
+    # TODO: Consider adding clang-8 to every builder:
+    #   clang-8 clang-format-8 clang-tidy-8
+
+    # For cross-compiling to Windows with mingw.
+    mingw-w64
+    wine64
+    wine-binfmt
+
+    # Native tools.
+    bsdmainutils
+    cmake
+    extra-cmake-modules
+    git
+    llvm
+    nasm
+    ninja-build
+    parallel
+    pkg-config
+
+    # These are used by the ./ci.sh lint in the native builder.
+    clang-format-7
+    clang-format-8
+
+    # For coverage builds
+    gcovr
+
+    # For compiling giflib documentation.
+    xmlto
+
+    # Common libraries.
+    libstdc++-8-dev
+
+    # We don't use tcmalloc on archs other than amd64. This installs
+    # libgoogle-perftools4:amd64.
+    google-perftools
+
+    # NodeJS for running WASM tests
+    nodejs
+
+    # To generate API documentation.
+    doxygen
+
+    # Freezes version that builds (passes tests). Newer version
+    # (2.30-21ubuntu1~18.04.4) claims to fix "On Intel Skylake
+    # (-march=native) generated avx512 instruction can be wrong",
+    # but newly added tests does not pass. Perhaps the problem is
+    # that mingw package is not updated.
+    binutils-source=2.30-15ubuntu1
+  )
+
+  # Install packages that are arch-dependent.
+  local ubarch
+  for ubarch in "${LIST_ARCHS[@]}"; do
+    packages+=(
+      # Library dependencies. These normally depend on the target architecture
+      # we are compiling for and can't usually be installed for multiple
+      # architectures at the same time.
+      libgif7:"${ubarch}"
+      libjpeg-dev:"${ubarch}"
+      libpng-dev:"${ubarch}"
+      libqt5x11extras5-dev:"${ubarch}"
+
+      libstdc++-8-dev:"${ubarch}"
+      qtbase5-dev:"${ubarch}"
+
+      # For OpenEXR:
+      libilmbase12:"${ubarch}"
+      libopenexr22:"${ubarch}"
+
+      # TCMalloc dependency
+      libunwind-dev:"${ubarch}"
+
+      # Cross-compiling tools per arch.
+      libc6-dev-"${ubarch}"-cross
+      libstdc++-8-dev-"${ubarch}"-cross
+    )
+  done
+
+  local target
+  for target in "${LIST_TARGETS[@]}"; do
+    # Per target cross-compiling tools.
+    if [[ "${target}" != "x86_64-linux-gnu" ]]; then
+      packages+=(
+        binutils-"${target}"
+        gcc-"${target}"
+      )
+    fi
+  done
+
+  # Install all the manual packages via "apt install" for the main arch. These
+  # will be installed for other archs via manual download and unpack.
+  apt install -y "${packages[@]}" "${UNPACK_PKGS[@]}"
+}
+
+# binutils <2.32 need a patch.
+install_binutils() {
+  local workdir=$(mktemp -d --suffix=_install)
+  CLEANUP_FILES+=("${workdir}")
+  pushd "${workdir}"
+  apt source binutils-mingw-w64
+  apt -y build-dep binutils-mingw-w64
+  cd binutils-mingw-w64-8ubuntu1
+  cp "${MYDIR}/binutils_align_fix.patch" debian/patches
+  echo binutils_align_fix.patch >> debian/patches/series
+  dpkg-buildpackage -b
+  cd ..
+  dpkg -i *deb
+  popd
+}
+
+# Install a library from the source code for multiple targets.
+# Usage: install_from_source <tar_url> <sha256> <target> [<target...>]
+install_from_source() {
+  local package="$1"
+  shift
+
+  local url
+  eval "url=\${${package}_URL}"
+  local sha256
+  eval "sha256=\${${package}_SHA256}"
+  # Optional package flags
+  local pkgflags
+  eval "pkgflags=\${${package}_FLAGS:-}"
+
+  local workdir=$(mktemp -d --suffix=_install)
+  CLEANUP_FILES+=("${workdir}")
+
+  local tarfile="${workdir}"/$(basename "${url}")
+  curl -L --output "${tarfile}" "${url}"
+  if ! echo "${sha256} ${tarfile}" | sha256sum -c --status -; then
+    echo "SHA256 mismatch for ${url}: expected ${sha256} but found:"
+    sha256sum "${tarfile}"
+    exit 1
+  fi
+
+  local target
+  for target in "$@"; do
+    echo "Installing ${package} for target ${target} from ${url}"
+
+    local srcdir="${workdir}/source-${target}"
+    mkdir -p "${srcdir}"
+    tar -zxf "${tarfile}" -C "${srcdir}" --strip-components=1
+
+    local prefix="/usr"
+    if [[ "${target}" != "x86_64-linux-gnu" ]]; then
+      prefix="/usr/${target}"
+    fi
+
+    # Apply patches to buildfiles.
+    if [[ "${package}" == "GIFLIB" && "${target}" == *mingw32 ]]; then
+      # GIFLIB Makefile has several problems so we need to fix them here. We are
+      # using a patch from MSYS2 that already fixes the compilation for mingw.
+      local make_patch="${srcdir}/libgif.patch"
+      curl -L "${GIFLIB_PATCH_URL}" -o "${make_patch}"
+      echo "${GIFLIB_PATCH_SHA256} ${make_patch}" | sha256sum -c --status -
+      patch "${srcdir}/Makefile" < "${make_patch}"
+    elif [[ "${package}" == "LIBPNG" && "${target}" == wasm* ]]; then
+      # Cut the dependency to libm; there is pull request to fix it, so this
+      # might not be needed in the future.
+      sed -i 's/APPLE/EMSCRIPTEN/g' "${srcdir}/CMakeLists.txt"
+    fi
+
+    local cmake_args=()
+    local export_args=("CC=clang-7" "CXX=clang++-7")
+    local cmake="cmake"
+    local make="make"
+    local system_name="Linux"
+    if [[ "${target}" == *mingw32 ]]; then
+      system_name="Windows"
+      # When compiling with clang, CMake doesn't detect that we are using mingw.
+      cmake_args+=(
+        -DMINGW=1
+        # Googletest needs this when cross-compiling to windows
+        -DCMAKE_CROSSCOMPILING=1
+        -DHAVE_STD_REGEX=0
+        -DHAVE_POSIX_REGEX=0
+        -DHAVE_GNU_POSIX_REGEX=0
+      )
+      local windres=$(which ${target}-windres || true)
+      if [[ -n "${windres}" ]]; then
+        cmake_args+=(-DCMAKE_RC_COMPILER="${windres}")
+      fi
+    fi
+    if [[ "${target}" == wasm* ]]; then
+      system_name="WASM"
+      cmake="emcmake cmake"
+      make="emmake make"
+      export_args=()
+      cmake_args+=(
+        -DCMAKE_FIND_ROOT_PATH="${prefix}"
+        -DCMAKE_PREFIX_PATH="${prefix}"
+      )
+      # Static and shared library link to the same file -> race condition.
+      nproc=1
+    else
+      nproc=`nproc --all`
+    fi
+    cmake_args+=(-DCMAKE_SYSTEM_NAME="${system_name}")
+
+    if [[ "${target}" != "x86_64-linux-gnu" ]]; then
+      # Cross-compiling.
+      cmake_args+=(
+        -DCMAKE_C_COMPILER_TARGET="${target}"
+        -DCMAKE_CXX_COMPILER_TARGET="${target}"
+        -DCMAKE_SYSTEM_PROCESSOR="${target%%-*}"
+      )
+    fi
+
+    if [[ -e "${srcdir}/CMakeLists.txt" ]]; then
+      # Most pacakges use cmake for building which is easier to configure for
+      # cross-compiling.
+      if [[ "${package}" == "JPEG_TURBO" && "${target}" == wasm* ]]; then
+        # JT erroneously detects WASM CPU as i386 and tries to use asm.
+        # Wasm/Emscripten support for dynamic linking is incomplete; disable
+        # to avoid CMake warning.
+        cmake_args+=(-DWITH_SIMD=0 -DENABLE_SHARED=OFF)
+      fi
+      (
+        cd "${srcdir}"
+        export ${export_args[@]}
+        ${cmake} \
+          -DCMAKE_INSTALL_PREFIX="${prefix}" \
+          "${cmake_args[@]}" ${pkgflags}
+        ${make} -j${nproc}
+        ${make} install
+      )
+    elif [[ "${package}" == "GIFLIB" ]]; then
+      # GIFLIB doesn't yet have a cmake build system. There is a pull
+      # request in giflib for adding CMakeLists.txt so this might not be
+      # needed in the future.
+      (
+        cd "${srcdir}"
+        local giflib_make_flags=(
+          CFLAGS="-O2 --target=${target} -std=gnu99"
+          PREFIX="${prefix}"
+        )
+        if [[ "${target}" != wasm* ]]; then
+          giflib_make_flags+=(CC=clang-7)
+        fi
+        # giflib make dependencies are not properly set up so parallel building
+        # doesn't work for everything.
+        ${make} -j${nproc} libgif.a "${giflib_make_flags[@]}"
+        ${make} -j${nproc} all "${giflib_make_flags[@]}"
+        ${make} install "${giflib_make_flags[@]}"
+      )
+    else
+      echo "Don't know how to install ${package}"
+      exit 1
+    fi
+
+  done
+}
+
+# Packages that are manually unpacked for each architecture.
+UNPACK_PKGS=(
+  libgif-dev
+  libclang-common-7-dev
+
+  # For OpenEXR:
+  libilmbase-dev
+  libopenexr-dev
+
+  # TCMalloc
+  libgoogle-perftools-dev
+  libtcmalloc-minimal4
+  libgoogle-perftools4
+)
+
+# Main script entry point.
+main() {
+  cd "${MYDIR}"
+
+  # Configure the repositories with the sources for multi-arch cross
+  # compilation.
+  setup_apt
+  apt-get update -y
+  apt-get dist-upgrade -y
+
+  install_pkgs
+  install_binutils
+  apt clean
+
+  # Manually extract packages for the target arch that can't install it directly
+  # at the same time as the native ones.
+  local ubarch
+  for ubarch in "${LIST_ARCHS[@]}"; do
+    if [[ "${ubarch}" != "amd64" ]]; then
+      local pkg
+      for pkg in "${UNPACK_PKGS[@]}"; do
+        apt download "${pkg}":"${ubarch}"
+        dpkg -x "${pkg}"_*_"${ubarch}".deb /
+      done
+    fi
+  done
+  # TODO: Add clang from the llvm repos. This is problematic since we are
+  # installing libclang-common-7-dev:"${ubarch}" from the ubuntu ports repos
+  # which is not available in the llvm repos so it might have a different
+  # version than the ubuntu ones.
+
+  # Remove the win32 libgcc version. The gcc-mingw-w64-x86-64 (and i686)
+  # packages install two libgcc versions:
+  #   /usr/lib/gcc/x86_64-w64-mingw32/7.3-posix
+  #   /usr/lib/gcc/x86_64-w64-mingw32/7.3-win32
+  # (exact libgcc version number depends on the package version).
+  #
+  # Clang will pick the best libgcc, sorting by version, but it doesn't
+  # seem to be a way to specify one or the other one, except by passing
+  # -nostdlib and setting all the include paths from the command line.
+  # To check which one is being used you can run:
+  #   clang++-7 --target=x86_64-w64-mingw32 -v -print-libgcc-file-name
+  # We need to use the "posix" versions for thread support, so here we
+  # just remove the other one.
+  local target
+  for target in "${LIST_MINGW_TARGETS[@]}"; do
+    update-alternatives --set "${target}-gcc" $(which "${target}-gcc-posix")
+    local gcc_win32_path=$("${target}-cpp-win32" -print-libgcc-file-name)
+    rm -rf $(dirname "${gcc_win32_path}")
+  done
+
+  # TODO: Add msan for the target when cross-compiling. This only installs it
+  # for amd64.
+  ./msan_install.sh
+
+  # Build and install qemu user-linux targets.
+  ./qemu_install.sh
+
+  # Install emscripten SDK.
+  ./emsdk_install.sh
+
+  # Setup environment for building WASM libraries from sources.
+  source /opt/emsdk/emsdk_env.sh
+
+  # Install some dependency libraries manually for the different targets.
+
+  install_from_source JPEG_TURBO "${LIST_MINGW_TARGETS[@]}" "${LIST_WASM_TARGETS[@]}"
+  install_from_source ZLIB "${LIST_MINGW_TARGETS[@]}" "${LIST_WASM_TARGETS[@]}"
+  install_from_source LIBPNG "${LIST_MINGW_TARGETS[@]}" "${LIST_WASM_TARGETS[@]}"
+  install_from_source GIFLIB "${LIST_MINGW_TARGETS[@]}" "${LIST_WASM_TARGETS[@]}"
+  # webp in Ubuntu is relatively old so we install it from source for everybody.
+  install_from_source WEBP "${LIST_TARGETS[@]}" "${LIST_MINGW_TARGETS[@]}"
+
+  install_from_source BENCHMARK "${LIST_TARGETS[@]}" "${LIST_MINGW_TARGETS[@]}"
+
+  # Install v8. v8 has better WASM SIMD support than NodeJS 14.
+  # First we need the installer to install v8.
+  npm install jsvu -g
+  # install specific version;
+  HOME=/opt jsvu --os=linux64 "v8@${V8_VERSION}"
+  ln -s "/opt/.jsvu/v8-${V8_VERSION}" "/opt/.jsvu/v8"
+
+  # Cleanup.
+  find /var/lib/apt/lists/ -mindepth 1 -delete
+}
+
+main "$@"
--- a/third_party/jpeg-xl/docker/scripts/msan_install.sh
+++ b/third_party/jpeg-xl/docker/scripts/msan_install.sh
@ -0,0 +1,140 @@
+#!/usr/bin/env bash
+# Copyright (c) the JPEG XL Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -eu
+
+MYDIR=$(dirname $(realpath "$0"))
+
+# Convenience flag to pass both CMAKE_C_FLAGS and CMAKE_CXX_FLAGS
+CMAKE_FLAGS=${CMAKE_FLAGS:-}
+CMAKE_C_FLAGS=${CMAKE_C_FLAGS:-${CMAKE_FLAGS}}
+CMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS:-${CMAKE_FLAGS}}
+CMAKE_EXE_LINKER_FLAGS=${CMAKE_EXE_LINKER_FLAGS:-}
+
+CLANG_VERSION="${CLANG_VERSION:-}"
+# Detect the clang version suffix and store it in CLANG_VERSION. For example,
+# "6.0" for clang 6 or "7" for clang 7.
+detect_clang_version() {
+  if [[ -n "${CLANG_VERSION}" ]]; then
+    return 0
+  fi
+  local clang_version=$("${CC:-clang}" --version | head -n1)
+  local llvm_tag
+  case "${clang_version}" in
+    "clang version 6."*)
+      CLANG_VERSION="6.0"
+      ;;
+    "clang version 7."*)
+      CLANG_VERSION="7"
+      ;;
+    "clang version 8."*)
+      CLANG_VERSION="8"
+      ;;
+    "clang version 9."*)
+      CLANG_VERSION="9"
+      ;;
+    *)
+      echo "Unknown clang version: ${clang_version}" >&2
+      return 1
+  esac
+}
+
+# Temporary files cleanup hooks.
+CLEANUP_FILES=()
+cleanup() {
+  if [[ ${#CLEANUP_FILES[@]} -ne 0 ]]; then
+    rm -fr "${CLEANUP_FILES[@]}"
+  fi
+}
+trap "{ set +x; } 2>/dev/null; cleanup" INT TERM EXIT
+
+# Install libc++ libraries compiled with msan in the msan_prefix for the current
+# compiler version.
+cmd_msan_install() {
+  local tmpdir=$(mktemp -d)
+  CLEANUP_FILES+=("${tmpdir}")
+  # Detect the llvm to install:
+  export CC="${CC:-clang}"
+  export CXX="${CXX:-clang++}"
+  detect_clang_version
+  local llvm_tag
+  case "${CLANG_VERSION}" in
+    "6.0")
+      llvm_tag="llvmorg-6.0.1"
+      ;;
+    "7")
+      llvm_tag="llvmorg-7.0.1"
+      ;;
+    "8")
+      llvm_tag="llvmorg-8.0.0"
+      ;;
+    *)
+      echo "Unknown clang version: ${clang_version}" >&2
+      return 1
+  esac
+  local llvm_targz="${tmpdir}/${llvm_tag}.tar.gz"
+  curl -L --show-error -o "${llvm_targz}" \
+    "https://github.com/llvm/llvm-project/archive/${llvm_tag}.tar.gz"
+  tar -C "${tmpdir}" -zxf "${llvm_targz}"
+  local llvm_root="${tmpdir}/llvm-project-${llvm_tag}"
+
+  local msan_prefix="${HOME}/.msan/${CLANG_VERSION}"
+  rm -rf "${msan_prefix}"
+
+  declare -A CMAKE_EXTRAS
+  CMAKE_EXTRAS[libcxx]="\
+    -DLIBCXX_CXX_ABI=libstdc++ \
+    -DLIBCXX_INSTALL_EXPERIMENTAL_LIBRARY=ON"
+
+  for project in libcxx; do
+    local proj_build="${tmpdir}/build-${project}"
+    local proj_dir="${llvm_root}/${project}"
+    mkdir -p "${proj_build}"
+    cmake -B"${proj_build}" -H"${proj_dir}" \
+      -G Ninja \
+      -DCMAKE_BUILD_TYPE=Release \
+      -DLLVM_USE_SANITIZER=Memory \
+      -DLLVM_PATH="${llvm_root}/llvm" \
+      -DLLVM_CONFIG_PATH="$(which llvm-config llvm-config-7 llvm-config-6.0 | \
+                            head -n1)" \
+      -DCMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS}" \
+      -DCMAKE_C_FLAGS="${CMAKE_C_FLAGS}" \
+      -DCMAKE_EXE_LINKER_FLAGS="${CMAKE_EXE_LINKER_FLAGS}" \
+      -DCMAKE_INSTALL_PREFIX="${msan_prefix}" \
+      ${CMAKE_EXTRAS[${project}]}
+    cmake --build "${proj_build}"
+    ninja -C "${proj_build}" install
+  done
+}
+
+main() {
+  set -x
+  for version in 6.0 7 8; do
+    if ! which "clang-${version}" >/dev/null; then
+      echo "Skipping msan install for clang version ${version}"
+      continue
+    fi
+    (
+     trap "{ set +x; } 2>/dev/null; cleanup" INT TERM EXIT
+     export CLANG_VERSION=${version}
+     export CC=clang-${version}
+     export CXX=clang++-${version}
+     cmd_msan_install
+    ) &
+  done
+  wait
+}
+
+main "$@"
--- a/third_party/jpeg-xl/docker/scripts/qemu_install.sh
+++ b/third_party/jpeg-xl/docker/scripts/qemu_install.sh
@ -0,0 +1,92 @@
+#!/usr/bin/env bash
+# Copyright (c) the JPEG XL Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+QEMU_RELEASE="4.1.0"
+QEMU_URL="https://download.qemu.org/qemu-${QEMU_RELEASE}.tar.xz"
+QEMU_ARCHS=(
+  aarch64
+  arm
+  i386
+  # TODO: Consider adding these:
+  # aarch64_be
+  # mips64el
+  # mips64
+  # mips
+  # ppc64
+  # ppc
+)
+
+# Ubuntu packages not installed that are needed to build qemu.
+QEMU_BUILD_DEPS=(
+  libglib2.0-dev
+  libpixman-1-dev
+  flex
+  bison
+)
+
+set -eu -x
+
+# Temporary files cleanup hooks.
+CLEANUP_FILES=()
+cleanup() {
+  if [[ ${#CLEANUP_FILES[@]} -ne 0 ]]; then
+    rm -fr "${CLEANUP_FILES[@]}"
+  fi
+}
+trap "{ set +x; } 2>/dev/null; cleanup" INT TERM EXIT
+
+main() {
+  local workdir=$(mktemp -d --suffix=qemu)
+  CLEANUP_FILES+=("${workdir}")
+
+  apt install -y "${QEMU_BUILD_DEPS[@]}"
+
+  local qemutar="${workdir}/qemu.tar.gz"
+  curl --output "${qemutar}" "${QEMU_URL}"
+  tar -Jxf "${qemutar}" -C "${workdir}"
+  local srcdir="${workdir}/qemu-${QEMU_RELEASE}"
+
+  local builddir="${workdir}/build"
+  local prefixdir="${workdir}/prefix"
+  mkdir -p "${builddir}"
+
+  # List of targets to build.
+  local targets=""
+  local make_targets=()
+  local target
+  for target in "${QEMU_ARCHS[@]}"; do
+    targets="${targets} ${target}-linux-user"
+    # Build just the linux-user targets.
+    make_targets+=("${target}-linux-user/all")
+  done
+
+  cd "${builddir}"
+  "${srcdir}/configure" \
+    --prefix="${prefixdir}" \
+    --static --disable-system --enable-linux-user \
+    --target-list="${targets}"
+
+  make -j $(nproc --all || echo 1) "${make_targets[@]}"
+
+  # Manually install these into the non-standard location. This script runs as
+  # root anyway.
+  for target in "${QEMU_ARCHS[@]}"; do
+    cp "${target}-linux-user/qemu-${target}" "/usr/bin/qemu-${target}-static"
+  done
+
+  apt autoremove -y --purge "${QEMU_BUILD_DEPS[@]}"
+}
+
+main "$@"
--- a/third_party/jpeg-xl/examples/CMakeLists.txt
+++ b/third_party/jpeg-xl/examples/CMakeLists.txt
@ -0,0 +1,28 @@
+# Copyright (c) the JPEG XL Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+add_executable(decode_oneshot decode_oneshot.cc)
+target_link_libraries(decode_oneshot jxl_dec jxl_threads)
+add_executable(encode_oneshot encode_oneshot.cc)
+target_link_libraries(encode_oneshot jxl jxl_threads)
+
+add_executable(jxlinfo jxlinfo.c)
+target_link_libraries(jxlinfo jxl)
+
+if(NOT ${SANITIZER} STREQUAL "none")
+  # Linking a C test binary with the C++ JPEG XL implementation when using
+  # address sanitizer is not well supported by clang 9, so force using clang++
+  # for linking this test if a sanitizer is used.
+  set_target_properties(jxlinfo PROPERTIES LINKER_LANGUAGE CXX)
+endif()  # SANITIZER != "none"
--- a/Показать больше
+++ b/Показать больше