Bug 1815790 - Replace intgemm by gemmology r=yury

gemmology is a rewrite of intgemm based on xsimd (which we already
vendor), with a focus on the API we actually use.

It also supports sse2 and has a decent implementation for arm32 and
aarch64.

Differential Revision: https://phabricator.services.mozilla.com/D171265
serge-sans-paille 2023-03-28 13:22:27 +00:00
Parent b3fa955c10
Commit 194cb113aa
90 changed files: 2238 additions and 23699 deletions
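
At call sites the change is mechanical: direct calls into ::intgemm::Int8 / ::intgemm::Int8Shift become calls through a runtime-dispatch macro over gemmology::Engine<Arch>. A condensed before/after sketch, adapted from the IntegerGemmIntrinsic.cpp hunks below (SUPPORTED_ARCHS and GEMMOLOGY_DISPATCH are defined in that file):

```C++
// Before: intgemm selects its kernel internally.
::intgemm::Int8::PrepareB((const float*)inputMatrixBPtr, (int8_t*)outputMatrixBPtr,
                          (float)scale, rowsB, colsB);

// After: the best compiled-in architecture is selected at run time via xsimd.
GEMMOLOGY_DISPATCH(PrepareB)
((const float*)inputMatrixBPtr, (int8_t*)outputMatrixBPtr, (float)scale, rowsB, colsB);
```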


@ -3174,3 +3174,4 @@ set_config("SSE2_FLAGS", ["-msse2"])
set_config("SSSE3_FLAGS", ["-mssse3"])
set_config("SSE4_2_FLAGS", ["-msse4.2"])
set_config("FMA_FLAGS", ["-mfma"])
set_config("AVX2_FLAGS", ["-mavx2"])


@ -11,7 +11,7 @@
#include "mozilla/CheckedInt.h"
#include "mozilla/IntegerPrintfMacros.h"
#include <intgemm.h>
#include <gemmology_fwd.h>
#include "js/ErrorReport.h"
#include "js/HeapAPI.h"
@ -20,6 +20,26 @@
#include "wasm/WasmInstance.h"
#include "wasm/WasmLog.h"
#if defined(USE_AVX2)
# define SUPPORTED_ARCHS \
xsimd::arch_list<xsimd::avx2, xsimd::ssse3, xsimd::sse2>
#elif defined(USE_SSSE3)
# define SUPPORTED_ARCHS xsimd::arch_list<xsimd::ssse3, xsimd::sse2>
#elif defined(USE_SSE2)
# define SUPPORTED_ARCHS xsimd::arch_list<xsimd::sse2>
#else
# error no supported architecture
#endif
// Dispatch *at runtime* based on run-time hardware and compile-time
// architectures.
//
// FIXME: Ideally we would not run the dispatch code at each function call.
#define GEMMOLOGY_DISPATCH(FUNC) \
xsimd::dispatch<SUPPORTED_ARCHS>([](auto arch, auto... args) { \
return gemmology::Engine<decltype(arch)>::FUNC(args...); \
})
struct JSContext;
static constexpr uint32_t ARRAY_ALIGNMENT = 64;
@ -118,10 +138,10 @@ int32_t js::intgemm::IntrI8PrepareB(wasm::Instance* instance,
// Actual call to the 3rd party library (intgemm) for PrepareB
uint8_t* inputMatrixBPtr = &memBase[inputMatrixB];
uint8_t* outputMatrixBPtr = &memBase[outputMatrixB];
::intgemm::Int8::PrepareB((const float*)inputMatrixBPtr,
(int8_t*)outputMatrixBPtr,
(float)scale, // Quant Mult
rowsB, colsB);
GEMMOLOGY_DISPATCH(PrepareB)
((const float*)inputMatrixBPtr, (int8_t*)outputMatrixBPtr,
(float)scale, // Quant Mult
rowsB, colsB);
return 0;
}
@ -160,10 +180,10 @@ int32_t js::intgemm::IntrI8PrepareBFromTransposed(
// Actual call to the 3rd party library (intgemm) for PrepareBTransposed
uint8_t* inputMatrixBTransposedPtr = &memBase[inputMatrixBTransposed];
uint8_t* outputMatrixBPtr = &memBase[outputMatrixB];
::intgemm::Int8::PrepareBTransposed((const float*)inputMatrixBTransposedPtr,
(int8_t*)outputMatrixBPtr,
(float)scale, // Quant Mult
rowsB, colsB);
GEMMOLOGY_DISPATCH(PrepareBTransposed)
((const float*)inputMatrixBTransposedPtr, (int8_t*)outputMatrixBPtr,
(float)scale, // Quant Mult
rowsB, colsB);
return 0;
}
@ -202,9 +222,9 @@ int32_t js::intgemm::IntrI8PrepareBFromQuantizedTransposed(
uint8_t* inputMatrixBQuantizedTransposedPtr =
&memBase[inputMatrixBQuantizedTransposed];
uint8_t* outputMatrixBPtr = &memBase[outputMatrixB];
::intgemm::Int8::PrepareBQuantizedTransposed(
(const int8_t*)inputMatrixBQuantizedTransposedPtr,
(int8_t*)outputMatrixBPtr, rowsB, colsB);
GEMMOLOGY_DISPATCH(PrepareBQuantizedTransposed)
((const int8_t*)inputMatrixBQuantizedTransposedPtr, (int8_t*)outputMatrixBPtr,
rowsB, colsB);
return 0;
}
@ -243,9 +263,8 @@ int32_t js::intgemm::IntrI8PrepareA(wasm::Instance* instance,
// Actual call to the 3rd party library (intgemm)
uint8_t* inputMatrixAPtr = &memBase[inputMatrixA];
uint8_t* outputMatrixAPtr = &memBase[outputMatrixA];
::intgemm::Int8Shift::PrepareA((const float*)inputMatrixAPtr,
(int8_t*)outputMatrixAPtr, scale, rowsA,
colsA);
GEMMOLOGY_DISPATCH(Shift::PrepareA)
((const float*)inputMatrixAPtr, outputMatrixAPtr, scale, rowsA, colsA);
return 0;
}
@ -290,10 +309,10 @@ int32_t js::intgemm::IntrI8PrepareBias(
uint8_t* outputPtr = &memBase[output];
float unquantFactor =
(-1) * ((127.0f / scaleA) * (127.0f / scaleB)) / (127.0f);
::intgemm::Int8Shift::PrepareBias(
(const int8_t*)inputMatrixBPreparedPtr, rowsB, colsB,
::intgemm::callbacks::UnquantizeAndAddBiasAndWrite(
unquantFactor, (const float*)inputBiasPtr, (float*)outputPtr));
GEMMOLOGY_DISPATCH(Shift::PrepareBias)
((const int8_t*)inputMatrixBPreparedPtr, rowsB, colsB,
gemmology::callbacks::UnquantizeAndAddBiasAndWrite(
unquantFactor, (const float*)inputBiasPtr, (float*)outputPtr));
return 0;
}
@ -347,12 +366,12 @@ int32_t js::intgemm::IntrI8MultiplyAndAddBias(
uint8_t* inputBiasPreparedPtr = &memBase[inputBiasPrepared];
uint8_t* outputPtr = &memBase[output];
float unquantFactor = unquantMultiplier / (scaleA * scaleB);
::intgemm::Int8Shift::Multiply(
(const int8_t*)inputMatrixAPreparedPtr,
(const int8_t*)inputMatrixBPreparedPtr, rowsA, width, colsB,
::intgemm::callbacks::UnquantizeAndAddBiasAndWrite(
unquantFactor, (const float*)inputBiasPreparedPtr,
(float*)outputPtr));
GEMMOLOGY_DISPATCH(Shift::Multiply)
(inputMatrixAPreparedPtr, (const int8_t*)inputMatrixBPreparedPtr, rowsA,
width, colsB,
gemmology::callbacks::UnquantizeAndAddBiasAndWrite(
unquantFactor, (const float*)inputBiasPreparedPtr, (float*)outputPtr));
return 0;
}
@ -401,9 +420,12 @@ int32_t js::intgemm::IntrI8SelectColumnsOfB(wasm::Instance* instance,
uint8_t* inputMatrixBPreparedPtr = &memBase[inputMatrixBPrepared];
uint8_t* colIndexListPtr = &memBase[colIndexList];
uint8_t* outputPtr = &memBase[output];
::intgemm::Int8::SelectColumnsB(
(const int8_t*)inputMatrixBPreparedPtr, (int8_t*)outputPtr, rowsB,
(const uint32_t*)colIndexListPtr,
(const uint32_t*)colIndexListPtr + sizeColIndexList);
GEMMOLOGY_DISPATCH(SelectColumnsB)
((const int8_t*)inputMatrixBPreparedPtr, (int8_t*)outputPtr, rowsB,
(const uint32_t*)colIndexListPtr,
(const uint32_t*)colIndexListPtr + sizeColIndexList);
return 0;
}
#undef GEMMOLOGY_DISPATCH
#undef SUPPORTED_ARCHS
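
For reference, GEMMOLOGY_DISPATCH leans on xsimd's generic dispatcher: xsimd::dispatch<ArchList>(f) probes the CPU at run time and invokes f with the best matching architecture tag from the compile-time list, which is also why the FIXME above notes that the dispatch currently runs on every call. A minimal, self-contained sketch of the same pattern (the QuantizeDispatch function and its arguments are illustrative, not part of this commit):

```C++
#include <cstddef>
#include <cstdint>

#include <gemmology_fwd.h>
#include <xsimd/xsimd.hpp>

// Architectures compiled into the binary; the dispatcher picks the best one
// that the running CPU supports.
using Archs = xsimd::arch_list<xsimd::avx2, xsimd::ssse3, xsimd::sse2>;

void QuantizeDispatch(const float* in, int8_t* out, float quant_mult, size_t n) {
  // The lambda receives the selected architecture as its first argument,
  // mirroring the GEMMOLOGY_DISPATCH macro above.
  xsimd::dispatch<Archs>([](auto arch, const float* in, int8_t* out,
                            float quant_mult, size_t n) {
    gemmology::Engine<decltype(arch)>::Quantize(in, out, quant_mult, n);
  })(in, out, quant_mult, n);
}
```

Because only gemmology_fwd.h is included here, the per-architecture definitions must be linked in separately; in this commit they come from the explicit instantiations in third_party/gemmology/kernels/*.cpp.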


@ -1,18 +1,18 @@
This directory contains build files for the intgemm reference implementation.
The actual library source is in $TOPSRCDIR/third_party/intgemm/
This directory contains build files for the gemmology reference implementation.
The actual library source is in $TOPSRCDIR/third_party/gemmology/
Any patches or additional configuration to be applied to the
upstream source should be kept in $TOPSRCDIR/third_party/intgemm/.
upstream source should be kept in $TOPSRCDIR/third_party/gemmology/.
To update the library source and build config files, execute
./mach vendor js/src/intgemm/moz.yaml
./mach vendor third_party/gemmology/moz.yaml
To update to a specific upstream git tag or commit, use
./mach vendor js/src/intgemm/moz.yaml -r <commit>
./mach vendor third_party/gemmology/moz.yaml -r <commit>
The upstream git repository is https://github.com/kpu/intgemm
The upstream git repository is https://github.com/serge-sans-paille/gemmology
To view the information about the current version, check the
'origin' section of moz.yaml.
'origin' section of moz.yaml.


@ -1,24 +0,0 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
def main(output, intgemm_config):
    with open(intgemm_config, "r") as f:
        config = f.read()
    # Enable intel AVX2 hardware extension specific code to allow using AVX2 at run time
    # if target cpu supports it
    config = config.replace(
        "#cmakedefine INTGEMM_COMPILER_SUPPORTS_AVX2",
        "#define INTGEMM_COMPILER_SUPPORTS_AVX2",
    )
    # Disable more advanced intel hardware extensions for now because base-toolchain compiler
    # versions aren't able to compile them
    config = config.replace("#cmakedefine", "#undef")
    output.write(config)
    output.close()
    return 0


@ -15,19 +15,32 @@ with Files("*"):
LOCAL_INCLUDES += [
    "!..",
    "..",
    "/third_party/intgemm/intgemm",
    "/third_party/gemmology",
    "/third_party/xsimd/include",
]

if CONFIG["INTEL_ARCHITECTURE"]:
    DEFINES["USE_SSE2"] = True
    SOURCES += ["/third_party/gemmology/kernels/GemmologyEngineSSE2.cpp"]
    SOURCES["/third_party/gemmology/kernels/GemmologyEngineSSE2.cpp"].flags += CONFIG[
        "SSE2_FLAGS"
    ]

    if CONFIG["SSSE3_FLAGS"]:
        DEFINES["USE_SSSE3"] = True
        SOURCES += ["/third_party/gemmology/kernels/GemmologyEngineSSSE3.cpp"]
        SOURCES[
            "/third_party/gemmology/kernels/GemmologyEngineSSSE3.cpp"
        ].flags += CONFIG["SSSE3_FLAGS"]

    if CONFIG["AVX2_FLAGS"]:
        DEFINES["USE_AVX2"] = True
        SOURCES += ["/third_party/gemmology/kernels/GemmologyEngineAVX2.cpp"]
        SOURCES[
            "/third_party/gemmology/kernels/GemmologyEngineAVX2.cpp"
        ].flags += CONFIG["AVX2_FLAGS"]

SOURCES += [
    "/third_party/intgemm/intgemm/intgemm.cc",
    "IntegerGemmIntrinsic.cpp",
]

GeneratedFile(
    "intgemm/intgemm_config.h",
    script="enable_intel_extensions.py",
    inputs=["/third_party/intgemm/intgemm/intgemm_config.h.in"],
)
# We allow warnings for third-party code that can be updated from upstream.
AllowCompilerWarnings()


@ -1,47 +0,0 @@
# Version of this schema
schema: 1

bugzilla:
  # Bugzilla product and component for this directory and subdirectories
  product: Core
  component: "JavaScript: WebAssembly"

# Document the source of externally hosted code
origin:
  # Short name of the package/library
  name: intgemm
  description: integer matrix multiplication
  # Full URL for the package's homepage/etc
  # Usually different from repository url
  url: https://github.com/kpu/intgemm
  # Human-readable identifier for this version/release
  # Generally "version NNN", "tag SSS", "bookmark SSS"
  release: commit fc3a614351ce6e667197307d97f45db5265c96af (2022-02-09T14:56:05Z).
  # Revision to pull in
  # Must be a long or short commit SHA (long preferred)
  revision: fc3a614351ce6e667197307d97f45db5265c96af
  # The package's license, where possible using the mnemonic from
  # https://spdx.org/licenses/
  # Multiple licenses can be specified (as a YAML list)
  # A "LICENSE" file must exist containing the full license text
  license: MIT

vendoring:
  url: https://github.com/kpu/intgemm
  source-hosting: github
  vendor-directory: third_party/intgemm
  exclude:
    - build/.gitattributes
    - build/.gitignore
  update-actions:
    - action: delete-path
      path: '{yaml_dir}/config'


@ -179,7 +179,8 @@ rsync_filter_list = """
- /third_party/python/gyp
+ /third_party/python/**
+ /third_party/rust/**
+ /third_party/intgemm/**
+ /third_party/gemmology/**
+ /third_party/xsimd/**
+ /layout/tools/reftest/reftest/**
+ /testing/mach_commands.py

third_party/gemmology/LICENSE (vendored, new file, 22 lines)

@ -0,0 +1,22 @@
MIT License
Copyright (c) 2023 Serge Guelton
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--
The original 8-bit code came from:
MIT License
Copyright (c) 2017--2019 University of Edinburgh, Nikolay Bogoychev, Mateusz Chudyk, Kenneth Heafield, and Microsoft Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

third_party/gemmology/gemmology.h (vendored, new file, 1335 lines)

Diff not shown because of its large size.

third_party/gemmology/gemmology_fwd.h (vendored, new file, 218 lines)

@ -0,0 +1,218 @@
/***************************************************************
* _ *
* | | *
* __ _ ___ _ __ ___ _ __ ___ ___ | | ___ __ _ _ _ *
* / _` |/ _ \ '_ ` _ \| '_ ` _ \ / _ \| |/ _ \ / _` | | | | *
* | (_| | __/ | | | | | | | | | | (_) | | (_) | (_| | |_| | *
* \__, |\___|_| |_| |_|_| |_| |_|\___/|_|\___/ \__, |\__, | *
* __/ | __/ | __/ | *
* |___/ |___/ |___/ *
* *
* version 0.1 *
***************************************************************/
#ifndef GEMMOLOGY_FWD_H
#define GEMMOLOGY_FWD_H
#include <cstdint>
#include <cstring>
#include <tuple>
#include <xsimd/xsimd.hpp>
namespace gemmology {
namespace callbacks {
struct Unquantize {
float unquant_mult;
template <class Arch>
xsimd::batch<float, Arch> operator()(xsimd::batch<int32_t, Arch> total, size_t, size_t, size_t);
template <class Arch>
std::tuple<xsimd::batch<float, Arch>, xsimd::batch<float, Arch>> operator()(
std::tuple<xsimd::batch<int32_t, Arch>, xsimd::batch<int32_t, Arch>>
total,
size_t, size_t, size_t);
};
struct AddBias {
const float *bias_addr;
template <class Arch>
xsimd::batch<float, Arch> operator()(xsimd::batch<float, Arch> total, size_t, size_t col_idx,
size_t);
template <class Arch>
std::tuple<xsimd::batch<float, Arch>, xsimd::batch<float, Arch>>
operator()(
std::tuple<xsimd::batch<float, Arch>, xsimd::batch<float, Arch>> total,
size_t, size_t col_idx, size_t);
};
struct Write {
float *output_addr;
Write(float *o) : output_addr(o) {}
template <class Arch>
void operator()(xsimd::batch<float, Arch> result, size_t row_idx,
size_t col_idx, size_t col_size);
template <class Arch>
void operator()(xsimd::batch<int32_t, Arch> result, size_t row_idx,
size_t col_idx, size_t col_size);
template <class Arch>
void operator()(
std::tuple<xsimd::batch<float, Arch>, xsimd::batch<float, Arch>> result,
size_t row_idx, size_t col_idx, size_t col_size);
template <class Arch>
void operator()(
std::tuple<xsimd::batch<int32_t, Arch>, xsimd::batch<int32_t, Arch>>
result,
size_t row_idx, size_t col_idx, size_t col_size);
};
struct UnquantizeAndWrite {
Unquantize unquantize;
Write write;
UnquantizeAndWrite(float factor, float *output)
: unquantize{factor}, write{output} {}
template <class T>
void operator()(T const &total, size_t row_idx, size_t col_idx,
size_t col_size);
};
struct UnquantizeAndAddBiasAndWrite {
Unquantize unquantize;
AddBias add_bias;
Write write;
UnquantizeAndAddBiasAndWrite(float factor, const float *bias, float *output)
: unquantize{factor}, add_bias{bias}, write{output} {}
template <class T>
void operator()(T const &total, size_t row_idx, size_t col_idx,
size_t col_size);
};
} // namespace callbacks
//
// Arch-specific implementation of each routine
//
template <class Arch> struct Engine {
static void QuantizeU(const float *input, uint8_t *output, float quant_mult,
size_t size);
static void Quantize(const float *const input, int8_t *const output,
float quant_mult, size_t size);
template <typename IntegerTy>
static void SelectColumnsB(const int8_t *input, int8_t *output, size_t rows,
const IntegerTy *cols_begin,
const IntegerTy *cols_end);
static void PrepareBTransposed(const float *input, int8_t *output,
float quant_mult, size_t cols, size_t rows);
static void PrepareBQuantizedTransposed(const int8_t *input, int8_t *output,
size_t cols, size_t rows);
static void PrepareB(const float *input, int8_t *output_shadow,
float quant_mult, size_t rows, size_t cols);
static void PrepareA(const float *input, int8_t *output, float quant_mult,
size_t rows, size_t cols);
struct Shift {
static void PrepareA(const float *input, uint8_t *output, float quant_mult,
size_t rows, size_t cols);
template <class Callback>
static void Multiply(const uint8_t *A, const int8_t *B, size_t A_rows,
size_t width, size_t B_cols, Callback callback);
template <class Callback>
static void PrepareBias(const int8_t *B, size_t width, size_t B_cols,
Callback C);
};
};
//
// Top-level wrappers that mostly match intgemm API
//
template <class Arch = xsimd::default_arch>
inline void QuantizeU(const float *input, uint8_t *output, float quant_mult,
size_t size) {
return Engine<Arch>::QuantizeU(input, output, quant_mult, size);
}
template <class Arch = xsimd::default_arch>
inline void Quantize(const float *const input, int8_t *const output,
float quant_mult, size_t size) {
return Engine<Arch>::Quantize(input, output, quant_mult, size);
}
template <class Arch = xsimd::default_arch, typename IntegerTy>
inline void SelectColumnsB(const int8_t *input, int8_t *output, size_t rows,
const IntegerTy *cols_begin,
const IntegerTy *cols_end) {
return Engine<Arch>::SelectColumnsB(input, output, rows, cols_begin,
cols_end);
}
template <class Arch = xsimd::default_arch>
inline void PrepareBTransposed(const float *input, int8_t *output,
float quant_mult, size_t cols, size_t rows) {
return Engine<Arch>::PrepareBTransposed(input, output, quant_mult, cols,
rows);
}
template <class Arch = xsimd::default_arch>
inline void PrepareBQuantizedTransposed(const int8_t *input, int8_t *output,
size_t cols, size_t rows) {
return Engine<Arch>::PrepareBQuantizedTransposed(input, output, cols, rows);
}
template <class Arch = xsimd::default_arch>
inline void PrepareB(const float *input, int8_t *output_shadow,
float quant_mult, size_t rows, size_t cols) {
return Engine<Arch>::PrepareB(input, output_shadow, quant_mult, rows, cols);
}
template <class Arch = xsimd::default_arch>
inline void PrepareA(const float *input, int8_t *output, float quant_mult,
size_t rows, size_t cols) {
return Engine<Arch>::PrepareA(input, output, quant_mult, rows, cols);
}
namespace Shift {
template <class Arch = xsimd::default_arch>
inline void PrepareA(const float *input, uint8_t *output, float quant_mult,
size_t rows, size_t cols) {
return Engine<Arch>::Shift::PrepareA(input, output, quant_mult, rows, cols);
}
template <class Arch = xsimd::default_arch, class Callback>
inline void Multiply(const uint8_t *A, const int8_t *B, size_t A_rows,
size_t width, size_t B_cols, Callback C) {
return Engine<Arch>::Shift::Multiply(A, B, A_rows, width, B_cols, C);
}
template <class Arch = xsimd::default_arch, class Callback>
inline void PrepareBias(const int8_t *B, size_t width, size_t B_cols,
Callback C) {
return Engine<Arch>::Shift::PrepareBias(B, width, B_cols, C);
}
} // namespace Shift
} // namespace gemmology
#endif
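
Taken together, these declarations expose essentially the workflow the wasm intrinsics above use: quantize A with the shifted uint8 path, quantize B, fold the shift correction into the bias once, then multiply with a callback that unquantizes, adds the bias, and writes float output. A hedged end-to-end sketch against this header (the TinyGemm name, the sizes, and the plain std::vector buffers are illustrative only; real callers, like IntegerGemmIntrinsic.cpp, use 64-byte-aligned buffers and dimensions that satisfy gemmology's granularity requirements):

```C++
#include <cstddef>
#include <cstdint>
#include <vector>

#include <gemmology.h>  // full definitions; gemmology_fwd.h only declares

void TinyGemm(const float* A, const float* B, const float* bias, float* C,
              size_t A_rows, size_t width, size_t B_cols) {
  const float alpha = 2.0f;                     // assumed dynamic range of the inputs
  const float quant_mult = 127.0f / alpha;

  std::vector<uint8_t> A_prep(A_rows * width);  // shifted path: A becomes uint8_t
  std::vector<int8_t> B_prep(width * B_cols);   // B becomes int8_t
  std::vector<float> bias_prep(B_cols);

  gemmology::Shift::PrepareA(A, A_prep.data(), quant_mult, A_rows, width);
  gemmology::PrepareB(B, B_prep.data(), quant_mult, width, B_cols);

  // Fold the +127 shift correction into the bias, mirroring the unquantFactor
  // computed in IntrI8PrepareBias above.
  const float unquant_prep = -(alpha * alpha) / 127.0f;
  gemmology::Shift::PrepareBias(
      B_prep.data(), width, B_cols,
      gemmology::callbacks::UnquantizeAndAddBiasAndWrite(unquant_prep, bias,
                                                         bias_prep.data()));

  // Multiply, unquantize, add the prepared bias, and write the float result.
  const float unquant_mult = 1.0f / (quant_mult * quant_mult);
  gemmology::Shift::Multiply(
      A_prep.data(), B_prep.data(), A_rows, width, B_cols,
      gemmology::callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult,
                                                         bias_prep.data(), C));
}
```

Note that these top-level wrappers default to xsimd::default_arch, a compile-time choice; the wasm intrinsics instead dispatch at run time through GEMMOLOGY_DISPATCH.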

third_party/gemmology/kernels/GemmologyEngineAVX2.cpp (vendored, new file, 19 lines)

@ -0,0 +1,19 @@
/* -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* this source code form is subject to the terms of the mozilla public
* license, v. 2.0. if a copy of the mpl was not distributed with this file,
* You can obtain one at http://mozilla.org/MPL/2.0/. */
#include <gemmology.h>
namespace gemmology {
template struct Engine<xsimd::avx2>;
template void Engine<xsimd::avx2>::SelectColumnsB(int8_t const*, int8_t*,
size_t, uint32_t const*,
uint32_t const*);
template void Engine<xsimd::avx2>::Shift::Multiply(
uint8_t const*, int8_t const*, size_t, size_t, size_t,
gemmology::callbacks::UnquantizeAndAddBiasAndWrite);
template void Engine<xsimd::avx2>::Shift::PrepareBias(
int8_t const*, size_t, size_t,
gemmology::callbacks::UnquantizeAndAddBiasAndWrite);
} // namespace gemmology

third_party/gemmology/kernels/GemmologyEngineSSE2.cpp (vendored, new file, 19 lines)

@ -0,0 +1,19 @@
/* -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* this source code form is subject to the terms of the mozilla public
* license, v. 2.0. if a copy of the mpl was not distributed with this file,
* You can obtain one at http://mozilla.org/MPL/2.0/. */
#include <gemmology.h>
namespace gemmology {
template struct Engine<xsimd::sse2>;
template void Engine<xsimd::sse2>::SelectColumnsB(int8_t const*, int8_t*,
size_t, uint32_t const*,
uint32_t const*);
template void Engine<xsimd::sse2>::Shift::Multiply(
uint8_t const*, int8_t const*, size_t, size_t, size_t,
gemmology::callbacks::UnquantizeAndAddBiasAndWrite);
template void Engine<xsimd::sse2>::Shift::PrepareBias(
int8_t const*, size_t, size_t,
gemmology::callbacks::UnquantizeAndAddBiasAndWrite);
} // namespace gemmology

third_party/gemmology/kernels/GemmologyEngineSSSE3.cpp (vendored, new file, 19 lines)

@ -0,0 +1,19 @@
/* -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* this source code form is subject to the terms of the mozilla public
* license, v. 2.0. if a copy of the mpl was not distributed with this file,
* You can obtain one at http://mozilla.org/MPL/2.0/. */
#include <gemmology.h>
namespace gemmology {
template struct Engine<xsimd::ssse3>;
template void Engine<xsimd::ssse3>::SelectColumnsB(int8_t const*, int8_t*,
size_t, uint32_t const*,
uint32_t const*);
template void Engine<xsimd::ssse3>::Shift::Multiply(
uint8_t const*, int8_t const*, size_t, size_t, size_t,
gemmology::callbacks::UnquantizeAndAddBiasAndWrite);
template void Engine<xsimd::ssse3>::Shift::PrepareBias(
int8_t const*, size_t, size_t,
gemmology::callbacks::UnquantizeAndAddBiasAndWrite);
} // namespace gemmology
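
The three kernel translation units above all follow the same pattern: include the full gemmology.h, explicitly instantiate Engine<Arch> for one xsimd architecture, and additionally instantiate the member templates (SelectColumnsB and the Shift routines) for the one callback type the intrinsics use. Since each file is compiled only with that architecture's flags (see the moz.build hunk earlier), AVX2 code never leaks into the SSE2 object file. As a hedged illustration of how another architecture would slot in, a hypothetical aarch64 kernel, assuming xsimd's xsimd::neon64 tag and not part of this commit, would look like:

```C++
/* Hypothetical third_party/gemmology/kernels/GemmologyEngineNEON64.cpp,
 * following the same shape as the SSE2/SSSE3/AVX2 kernels above. */
#include <gemmology.h>

namespace gemmology {

template struct Engine<xsimd::neon64>;

template void Engine<xsimd::neon64>::SelectColumnsB(int8_t const*, int8_t*,
                                                    size_t, uint32_t const*,
                                                    uint32_t const*);

template void Engine<xsimd::neon64>::Shift::Multiply(
    uint8_t const*, int8_t const*, size_t, size_t, size_t,
    gemmology::callbacks::UnquantizeAndAddBiasAndWrite);

template void Engine<xsimd::neon64>::Shift::PrepareBias(
    int8_t const*, size_t, size_t,
    gemmology::callbacks::UnquantizeAndAddBiasAndWrite);

}  // namespace gemmology
```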

third_party/gemmology/moz.yaml (vendored, new file, 29 lines)

@ -0,0 +1,29 @@
schema: 1

bugzilla:
  product: Core
  component: "JavaScript: WebAssembly"

origin:
  name: gemmology
  description: small integer matrix multiply
  url: https://github.com/serge-sans-paille/gemmology
  release: e1167c52cbbfd989390e4d9515c84c88878bfe80 (2023-03-28T11:32:43Z).
  revision: e1167c52cbbfd989390e4d9515c84c88878bfe80
  license: MIT

vendoring:
  url: https://github.com/serge-sans-paille/gemmology
  source-hosting: github
  tracking: commit
  exclude:
    - ".*"
    - "*.rst"
    - test
  keep:
    - kernels/*.cpp

third_party/intgemm/CMake/Catch.cmake (vendored, 175 lines removed)

@ -1,175 +0,0 @@
# Distributed under the OSI-approved BSD 3-Clause License. See accompanying
# file Copyright.txt or https://cmake.org/licensing for details.
#[=======================================================================[.rst:
Catch
-----
This module defines a function to help use the Catch test framework.
The :command:`catch_discover_tests` discovers tests by asking the compiled test
executable to enumerate its tests. This does not require CMake to be re-run
when tests change. However, it may not work in a cross-compiling environment,
and setting test properties is less convenient.
This command is intended to replace use of :command:`add_test` to register
tests, and will create a separate CTest test for each Catch test case. Note
that this is in some cases less efficient, as common set-up and tear-down logic
cannot be shared by multiple test cases executing in the same instance.
However, it provides more fine-grained pass/fail information to CTest, which is
usually considered as more beneficial. By default, the CTest test name is the
same as the Catch name; see also ``TEST_PREFIX`` and ``TEST_SUFFIX``.
.. command:: catch_discover_tests
Automatically add tests with CTest by querying the compiled test executable
for available tests::
catch_discover_tests(target
[TEST_SPEC arg1...]
[EXTRA_ARGS arg1...]
[WORKING_DIRECTORY dir]
[TEST_PREFIX prefix]
[TEST_SUFFIX suffix]
[PROPERTIES name1 value1...]
[TEST_LIST var]
)
``catch_discover_tests`` sets up a post-build command on the test executable
that generates the list of tests by parsing the output from running the test
with the ``--list-test-names-only`` argument. This ensures that the full
list of tests is obtained. Since test discovery occurs at build time, it is
not necessary to re-run CMake when the list of tests changes.
However, it requires that :prop_tgt:`CROSSCOMPILING_EMULATOR` is properly set
in order to function in a cross-compiling environment.
Additionally, setting properties on tests is somewhat less convenient, since
the tests are not available at CMake time. Additional test properties may be
assigned to the set of tests as a whole using the ``PROPERTIES`` option. If
more fine-grained test control is needed, custom content may be provided
through an external CTest script using the :prop_dir:`TEST_INCLUDE_FILES`
directory property. The set of discovered tests is made accessible to such a
script via the ``<target>_TESTS`` variable.
The options are:
``target``
Specifies the Catch executable, which must be a known CMake executable
target. CMake will substitute the location of the built executable when
running the test.
``TEST_SPEC arg1...``
Specifies test cases, wildcarded test cases, tags and tag expressions to
pass to the Catch executable with the ``--list-test-names-only`` argument.
``EXTRA_ARGS arg1...``
Any extra arguments to pass on the command line to each test case.
``WORKING_DIRECTORY dir``
Specifies the directory in which to run the discovered test cases. If this
option is not provided, the current binary directory is used.
``TEST_PREFIX prefix``
Specifies a ``prefix`` to be prepended to the name of each discovered test
case. This can be useful when the same test executable is being used in
multiple calls to ``catch_discover_tests()`` but with different
``TEST_SPEC`` or ``EXTRA_ARGS``.
``TEST_SUFFIX suffix``
Similar to ``TEST_PREFIX`` except the ``suffix`` is appended to the name of
every discovered test case. Both ``TEST_PREFIX`` and ``TEST_SUFFIX`` may
be specified.
``PROPERTIES name1 value1...``
Specifies additional properties to be set on all tests discovered by this
invocation of ``catch_discover_tests``.
``TEST_LIST var``
Make the list of tests available in the variable ``var``, rather than the
default ``<target>_TESTS``. This can be useful when the same test
executable is being used in multiple calls to ``catch_discover_tests()``.
Note that this variable is only available in CTest.
#]=======================================================================]
#------------------------------------------------------------------------------
function(catch_discover_tests TARGET)
cmake_parse_arguments(
""
""
"TEST_PREFIX;TEST_SUFFIX;WORKING_DIRECTORY;TEST_LIST"
"TEST_SPEC;EXTRA_ARGS;PROPERTIES"
${ARGN}
)
if(NOT _WORKING_DIRECTORY)
set(_WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}")
endif()
if(NOT _TEST_LIST)
set(_TEST_LIST ${TARGET}_TESTS)
endif()
## Generate a unique name based on the extra arguments
string(SHA1 args_hash "${_TEST_SPEC} ${_EXTRA_ARGS}")
string(SUBSTRING ${args_hash} 0 7 args_hash)
# Define rule to generate test list for aforementioned test executable
set(ctest_include_file "${CMAKE_CURRENT_BINARY_DIR}/${TARGET}_include-${args_hash}.cmake")
set(ctest_tests_file "${CMAKE_CURRENT_BINARY_DIR}/${TARGET}_tests-${args_hash}.cmake")
get_property(crosscompiling_emulator
TARGET ${TARGET}
PROPERTY CROSSCOMPILING_EMULATOR
)
add_custom_command(
TARGET ${TARGET} POST_BUILD
BYPRODUCTS "${ctest_tests_file}"
COMMAND "${CMAKE_COMMAND}"
-D "TEST_TARGET=${TARGET}"
-D "TEST_EXECUTABLE=$<TARGET_FILE:${TARGET}>"
-D "TEST_EXECUTOR=${crosscompiling_emulator}"
-D "TEST_WORKING_DIR=${_WORKING_DIRECTORY}"
-D "TEST_SPEC=${_TEST_SPEC}"
-D "TEST_EXTRA_ARGS=${_EXTRA_ARGS}"
-D "TEST_PROPERTIES=${_PROPERTIES}"
-D "TEST_PREFIX=${_TEST_PREFIX}"
-D "TEST_SUFFIX=${_TEST_SUFFIX}"
-D "TEST_LIST=${_TEST_LIST}"
-D "CTEST_FILE=${ctest_tests_file}"
-P "${_CATCH_DISCOVER_TESTS_SCRIPT}"
VERBATIM
)
file(WRITE "${ctest_include_file}"
"if(EXISTS \"${ctest_tests_file}\")\n"
" include(\"${ctest_tests_file}\")\n"
"else()\n"
" add_test(${TARGET}_NOT_BUILT-${args_hash} ${TARGET}_NOT_BUILT-${args_hash})\n"
"endif()\n"
)
if(NOT ${CMAKE_VERSION} VERSION_LESS "3.10.0")
# Add discovered tests to directory TEST_INCLUDE_FILES
set_property(DIRECTORY
APPEND PROPERTY TEST_INCLUDE_FILES "${ctest_include_file}"
)
else()
# Add discovered tests as directory TEST_INCLUDE_FILE if possible
get_property(test_include_file_set DIRECTORY PROPERTY TEST_INCLUDE_FILE SET)
if (NOT ${test_include_file_set})
set_property(DIRECTORY
PROPERTY TEST_INCLUDE_FILE "${ctest_include_file}"
)
else()
message(FATAL_ERROR
"Cannot set more than one TEST_INCLUDE_FILE"
)
endif()
endif()
endfunction()
###############################################################################
set(_CATCH_DISCOVER_TESTS_SCRIPT
${CMAKE_CURRENT_LIST_DIR}/CatchAddTests.cmake
)

third_party/intgemm/CMake/CatchAddTests.cmake (vendored, 78 lines removed)

@ -1,78 +0,0 @@
# Distributed under the OSI-approved BSD 3-Clause License. See accompanying
# file Copyright.txt or https://cmake.org/licensing for details.
set(prefix "${TEST_PREFIX}")
set(suffix "${TEST_SUFFIX}")
set(spec ${TEST_SPEC})
set(extra_args ${TEST_EXTRA_ARGS})
set(properties ${TEST_PROPERTIES})
set(script)
set(suite)
set(tests)
function(add_command NAME)
set(_args "")
foreach(_arg ${ARGN})
if(_arg MATCHES "[^-./:a-zA-Z0-9_]")
set(_args "${_args} [==[${_arg}]==]") # form a bracket_argument
else()
set(_args "${_args} ${_arg}")
endif()
endforeach()
set(script "${script}${NAME}(${_args})\n" PARENT_SCOPE)
endfunction()
# Run test executable to get list of available tests
if(NOT EXISTS "${TEST_EXECUTABLE}")
message(FATAL_ERROR
"Specified test executable '${TEST_EXECUTABLE}' does not exist"
)
endif()
execute_process(
COMMAND ${TEST_EXECUTOR} "${TEST_EXECUTABLE}" ${spec} --list-test-names-only
OUTPUT_VARIABLE output
RESULT_VARIABLE result
)
# Catch --list-test-names-only reports the number of tests, so 0 is... surprising
if(${result} EQUAL 0)
message(WARNING
"Test executable '${TEST_EXECUTABLE}' contains no tests!\n"
)
elseif(${result} LESS 0)
message(FATAL_ERROR
"Error running test executable '${TEST_EXECUTABLE}':\n"
" Result: ${result}\n"
" Output: ${output}\n"
)
endif()
string(REPLACE "\n" ";" output "${output}")
# Parse output
foreach(line ${output})
set(test ${line})
# use escape commas to handle properly test cases with commans inside the name
string(REPLACE "," "\\," test_name ${test})
# ...and add to script
add_command(add_test
"${prefix}${test}${suffix}"
${TEST_EXECUTOR}
"${TEST_EXECUTABLE}"
"${test_name}"
${extra_args}
)
add_command(set_tests_properties
"${prefix}${test}${suffix}"
PROPERTIES
WORKING_DIRECTORY "${TEST_WORKING_DIR}"
${properties}
)
list(APPEND tests "${prefix}${test}${suffix}")
endforeach()
# Create a list of all discovered tests, which users may use to e.g. set
# properties on the tests
add_command(set ${TEST_LIST} ${tests})
# Write CTest script
file(WRITE "${CTEST_FILE}" "${script}")

third_party/intgemm/CMakeLists.txt (vendored, 136 lines removed)

@ -1,136 +0,0 @@
cmake_minimum_required(VERSION 3.5)
project(intgemm)
string(ASCII 27 Esc)
set(Orange "${Esc}[33m")
set(ColourReset "${Esc}[m")
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif()
set(CMAKE_CXX_STANDARD 11)
if(MSVC)
add_compile_options(/W4 /WX)
else()
add_compile_options(-Wall -Wextra -pedantic -Werror -Wno-unknown-pragmas)
if (COMPILE_WASM)
# Disabling Pthreads + memory growth warning to be an error for WASM
# Pthreads + memory growth causes JS accessing the wasm memory to be slow
# https://github.com/WebAssembly/design/issues/1271
add_compile_options(-Wno-error=pthreads-mem-growth)
endif()
endif()
# Check if compiler supports AVX2 (this should only catch emscripten)
try_compile(INTGEMM_COMPILER_SUPPORTS_AVX2
${CMAKE_CURRENT_BINARY_DIR}/compile_tests
${CMAKE_CURRENT_SOURCE_DIR}/compile_test/avx2.cc)
# Check if compiler supports AVX512BW
try_compile(INTGEMM_COMPILER_SUPPORTS_AVX512BW
${CMAKE_CURRENT_BINARY_DIR}/compile_tests
${CMAKE_CURRENT_SOURCE_DIR}/compile_test/avx512bw.cc)
# Check if the compiler supports AVX512VNNI
try_compile(INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
${CMAKE_CURRENT_BINARY_DIR}/compile_tests
${CMAKE_CURRENT_SOURCE_DIR}/compile_test/avx512vnni.cc)
if (NOT INTGEMM_COMPILER_SUPPORTS_AVX2 OR NOT INTGEMM_COMPILER_SUPPORTS_AVX512BW OR NOT INTGEMM_COMPILER_SUPPORTS_AVX512VNNI)
set(UNSUPPORTED "Your compiler is too old to support")
if (NOT INTGEMM_COMPILER_SUPPORTS_AVX2)
set(UNSUPPORTED "${UNSUPPORTED} AVX2")
endif()
if (NOT INTGEMM_COMPILER_SUPPORTS_AVX512BW)
set(UNSUPPORTED "${UNSUPPORTED} AVX512BW")
endif()
if (NOT INTGEMM_COMPILER_SUPPORTS_AVX512VNNI)
set(UNSUPPORTED "${UNSUPPORTED} AVX512VNNI")
endif()
message(WARNING "${Orange}${UNSUPPORTED}. Multiplication will be slower on CPUs that support these instructions. For details rerun cmake with --debug-trycompile then try to build in compile_tests/CMakeFiles/CMakeTmp.${ColourReset}")
endif()
add_library(intgemm STATIC intgemm/intgemm.cc)
# Generate configure file
configure_file(intgemm/intgemm_config.h.in intgemm/intgemm_config.h)
#Ensure it is included by users.
include_directories(${CMAKE_CURRENT_BINARY_DIR})
target_include_directories(intgemm PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
# This isn't necessary since intgemm uses entirely relative paths but source code depending on it may want to #include <intgemm/intgemm.h>
target_include_directories(intgemm INTERFACE .)
option(USE_OPENMP "Use OpenMP" OFF)
if (USE_OPENMP)
message(STATUS "Compiling with OpenMP")
find_package(OpenMP)
if (NOT ${OpenMP_CXX_FOUND})
message(SEND_ERROR "OpenMP requested but C++ support not found")
endif()
add_compile_options(${OpenMP_CXX_FLAGS})
target_link_libraries(intgemm PUBLIC OpenMP::OpenMP_CXX)
endif()
if (COMPILE_WASM)
# A compile defintion to compile intgemm on WASM platform
target_compile_definitions(intgemm PUBLIC WASM)
endif()
option(WORMHOLE "Use WASM wormhole https://bugzilla.mozilla.org/show_bug.cgi?id=1672160" OFF)
if (WORMHOLE)
target_compile_definitions(intgemm PUBLIC INTGEMM_WORMHOLE)
endif()
option(INTGEMM_CPUID_ENVIRONMENT "Allow INTGEMM_CPUID environment variable to downgrade CPU model, which is mainly for testing." ON)
if (INTGEMM_CPUID_ENVIRONMENT)
target_compile_definitions(intgemm PRIVATE INTGEMM_CPUID_ENVIRONMENT)
endif()
if(INTGEMM_DONT_BUILD_TESTS)
return()
endif()
foreach(exe benchmark biasmultiply benchmark_quantizer)
add_executable(${exe} benchmarks/${exe}.cc)
target_link_libraries(${exe} intgemm)
endforeach()
add_executable(example example.cc)
target_link_libraries(example intgemm)
add_executable(tests
test/test.cc
# General tests
test/add127_test.cc
test/multiply_test.cc
test/prepare_b_quantized_transposed.cc
test/prepare_b_transposed.cc
test/quantize_test.cc
test/utils_test.cc
# Kernels tests
test/kernels/add_bias_test.cc
test/kernels/bitwise_not_test.cc
test/kernels/downcast_test.cc
test/kernels/exp_test.cc
test/kernels/floor_test.cc
test/kernels/multiply_test.cc
test/kernels/quantize_test.cc
test/kernels/relu_test.cc
test/kernels/rescale_test.cc
test/kernels/sigmoid_test.cc
test/kernels/tanh_test.cc
test/kernels/unquantize_test.cc
test/kernels/upcast_test.cc
test/kernels/write_test.cc
)
target_link_libraries(tests intgemm)
#CTest integration with Catch2
include(${CMAKE_CURRENT_SOURCE_DIR}/CMake/Catch.cmake)
include(CTest)
catch_discover_tests(tests)

third_party/intgemm/LICENSE (vendored, 70 lines removed)

@ -1,70 +0,0 @@
MIT License
Copyright (c) 2017--2019 University of Edinburgh, Nikolay Bogoychev, Mateusz Chudyk, Kenneth Heafield, and Microsoft Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
test/3rd_party/catch.hpp
Copyright (c) 2019 Two Blue Cubes Ltd. All rights reserved.
Distributed under the Boost Software License, Version 1.0. (See accompanying
file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
Boost Software License - Version 1.0 - August 17th, 2003
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
The original 16-bit SSE2 code came from:
Sharp Models on Dull Hardware: Fast and Accurate Neural Machine Translation Decoding on the CPU by Jacob Devlin
https://arxiv.org/abs/1705.01991
Under a license:
Copyright (c) 2017 Microsoft Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

third_party/intgemm/README.md (vendored, 91 lines removed)

@ -1,91 +0,0 @@
[![Build SSE](https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/intgemm/job/intgemm-SSE.svg?label=SSE)](http://vali.inf.ed.ac.uk/jenkins/job/intgemm-SSE/)
[![Build AVX2](https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/intgemm/job/intgemm-AVX2.svg?label=AVX2)](http://vali.inf.ed.ac.uk/jenkins/job/intgemm-AVX2/)
[![Build AVX512BW](https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/intgemm/job/intgemm-AVX512BW.svg?label=AVX512BW)](http://vali.inf.ed.ac.uk/jenkins/job/intgemm-AVX512BW/)
![Build Ubuntu](https://github.com/kpu/intgemm/workflows/Ubuntu/badge.svg)
![Build Ubuntu debug](https://github.com/kpu/intgemm/workflows/Ubuntu%20debug/badge.svg)
![Build Ubuntu OpenMP](https://github.com/kpu/intgemm/workflows/Ubuntu%20OpenMP/badge.svg)
![Build Windows](https://github.com/kpu/intgemm/workflows/Windows/badge.svg)
![Build Mac](https://github.com/kpu/intgemm/workflows/Mac/badge.svg)
[![Intel Compiler](https://github.com/kpu/intgemm/actions/workflows/intel-19.yml/badge.svg)](https://github.com/kpu/intgemm/actions/workflows/intel-19.yml)
# Integer Matrix Multiplication
This repository implements 8-bit and 16-bit matrix multiplication:
C = A * B
It's designed with neural network inference in mind: A is typically activations, B is typically fixed parameters, and C is activations for the next layer.
A can have any number of rows. Typically this is a batch size.
The shared dimension, A's columns and B's rows, must be a multiple of 32 (for 16-bit) or 64 (for 8-bit).
B's columns must be a multiple of 8.
## Accuracy
16-bit multiplication accumulates into 32-bit integers WITHOUT SATURATION (because there is no 32-bit add with saturation). If width is too large (i.e. >2048) or many 16-bit values are large, there is substantial risk of overflow. Choose a smaller quantization multiplier to scale things down or implement periodic upcasting to 64-bit for me.
8-bit multiplication accumulates into 16-bit integers with saturation. This saturates for larger widths (~1024) and is worst on SSSE3 because it accumulates in fewer values. It's possible to upcast to 32-bit every so often, but this has not been implemented yet.
## Usage
A full example appears in [example.cc](example.cc).
Both A and B should be prepared before multiplication.
```C++
#include "intgemm/intgemm.h"
/* Not shown: allocate 64-byte aligned memory with e.g. aligned_alloc.
* A is A_rows x width.
* B is width x B_cols.
*/
/* Prepare A for multiplication. This might be offline or on the fly. */
intgemm::Int16::PrepareA(A.begin(), A_prepared.begin(), quant_mult, A_rows, width);
/* Prepare B for multiplication. This is typically done offline. */
intgemm::Int16::PrepareB(B.begin(), B_prepared.begin(), quant_mult, width, B_cols);
/* Multiply and produce results in C */
intgemm::Int16::Multiply(A_prepared.begin(), B_prepared.begin(), A_rows, width, B_cols, intgemm::callbacks::UnquantizeAndWrite(1.0 / (quant_mult * quant_mult), C.begin()));
```
For 8-bit, use `Int8` instead of `Int16`.
When repesented as floats, all of A, B, and C are in row-major format.
The last argument of `Multiply` is a callback which is usually used to performs postprocessing on the output matrix (C). Full set of built-in callbacks can be found in [callbacks/configs.h](callbacks/configs.h). You can also write your own callback. To do that you just need to:
1. Add configuration structure for your callback in [callbacks/configs.h](callbacks/configs.h).
2. Add your callback implementation:
- in [callbacks/implementations.inl](callbacks/implementations.inl) if you want to implement it for all architecturs at the same time.
- in `callbacks/ARCHITECTURE.h` (e.g. [callbacks/sse2.h](callbacks/sse2.h)) if you want to implement it only for the specific architecture.
For 8-bit, you can make use a of a slightly faster implementation, assuming you can determine tha quantization multipliers and prepare the biases offline:
```C++
#include "intgemm/intgemm.h"
/* Not shown: allocate 64-byte aligned memory with e.g. aligned_alloc.
* A is A_rows x width.
* B is width x B_cols.
* If you want to make use of the slightly faster 8bit codepath (assuming you can cache biases and quantization multipliers)
* This routine only supports C = A*B + Bias
* In practise it computes C = (A+127)*B + Bias - |127|*B
* Prepare A and B first:
*/
float alpha = 25;
float quant_mult = 127/alpha;
intgemm::Int8Shift::PrepareA(A.begin(), A_prepared.begin(), quant_mult, A_rows, width);
intgemm::Int8Shift::PrepareB(B.begin(), B_prepared.begin(), quant_mult, width, B_cols);
/* Prepare the bias (inplace) */
float unquant_mult_forprep = (-1)*(alpha)*(alpha)/(127.0f);
intgemm::Int8Shift::PrepareBias(B_prepared.begin(), width, B_cols, callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult_forprep, inputBias.begin(), inputBias.begin()));
/* Multiply */
intgemm::Int8Shift::Multiply(A_prepared.begin(), B_prepared.begin(), A_rows, width, B_cols, callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult_forprep, bias.begin(), C.begin()));
```
## Quantization
Floating-point values are multiplied by a user-specified constant then rounded to an integer.
In 16 bit, Jacob Devlin recommends 1024.0 for neural networks to prevent the aforementioned overflow.
In 8 bit, use 127.0 / the largest value (use MaxAbsolute). Quantization will saturate so it's possible to use larger multipliers to obtain clipping.
## Acknowledgments
The original 16-bit SSE2 code came from:
Sharp Models on Dull Hardware: Fast and Accurate Neural Machine Translation Decoding on the CPU by Jacob Devlin https://arxiv.org/abs/1705.01991 under the MIT license.

third_party/intgemm/benchmarks/benchmark.cc (vendored, 214 lines removed)

@ -1,214 +0,0 @@
#include "../intgemm/aligned.h"
#include "intgemm/intgemm_config.h"
#include "../intgemm/avx512_gemm.h"
#include "../intgemm/sse2_gemm.h"
#include "../intgemm/avx2_gemm.h"
#include "../intgemm/ssse3_gemm.h"
#include "../intgemm/intgemm.h"
#include "../intgemm/stats.h"
#include "../intgemm/callbacks.h"
#include <algorithm>
#include <cassert>
#include <chrono>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <random>
namespace intgemm {
namespace {
struct RandomMatrices {
RandomMatrices(Index A_rows_in, Index width_in, Index B_cols_in) :
A_rows(A_rows_in), width(width_in), B_cols(B_cols_in),
A(A_rows * width), B(width * B_cols) {
std::mt19937 gen;
std::uniform_real_distribution<float> dist(-1.f, 1.f);
gen.seed(45678);
for (auto& it : A) {
it = dist(gen);
}
for (auto& it : B) {
it = dist(gen);
}
}
const Index A_rows, width, B_cols;
AlignedVector<float> A, B;
};
template <class Backend> double Run(const RandomMatrices &m) {
using Integer = typename Backend::Integer;
float quant_mult = 127.0f / 2.0f;
float unquant_mult = 1.0f / (quant_mult * quant_mult);
AlignedVector<Integer> A_prepared(m.A_rows * m.width);
Backend::PrepareA(m.A.begin(), A_prepared.begin(), quant_mult, m.A_rows, m.width);
AlignedVector<Integer> B_prepared(m.width * m.B_cols);
Backend::PrepareB(m.B.begin(), B_prepared.begin(), quant_mult, m.width, m.B_cols);
AlignedVector<float> output(m.A_rows * m.B_cols);
// Burn in
Backend::Multiply(A_prepared.begin(), B_prepared.begin(), m.A_rows, m.width, m.B_cols, callbacks::UnquantizeAndWrite(unquant_mult, output.begin()));
auto start = std::chrono::steady_clock::now();
Backend::Multiply(A_prepared.begin(), B_prepared.begin(), m.A_rows, m.width, m.B_cols, callbacks::UnquantizeAndWrite(unquant_mult, output.begin()));
return std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count();
}
template <class Backend> void RunAll(RandomMatrices *matrices, RandomMatrices *matrices_end, std::vector<std::vector<double>> &stats) {
if (Backend::kUses > kCPU) return;
std::size_t size = matrices_end - matrices;
if (stats.size() < size)
stats.resize(size);
for (std::size_t i = 0; i < size; ++i) {
stats[i].push_back(Run<Backend>(matrices[i]));
}
}
struct BackendStats {
std::vector<std::vector<double>> ssse3_8bit;
std::vector<std::vector<double>> avx2_8bit;
std::vector<std::vector<double>> avx512_8bit;
std::vector<std::vector<double>> avx512vnni_8bit;
std::vector<std::vector<double>> sse2_16bit;
std::vector<std::vector<double>> avx2_16bit;
std::vector<std::vector<double>> avx512_16bit;
};
const float kOutlierThreshold = 0.75;
void Summarize(std::vector<double> &stats) {
// Throw out outliers.
std::vector<double>::iterator keep = stats.begin() + static_cast<std::size_t>(static_cast<float>(stats.size()) * kOutlierThreshold);
std::nth_element(stats.begin(), keep, stats.end());
double avg = 0.0;
for (std::vector<double>::const_iterator i = stats.begin(); i != keep; ++i) {
avg += *i;
}
avg /= (keep - stats.begin());
double stddev = 0.0;
for (std::vector<double>::const_iterator i = stats.begin(); i != keep; ++i) {
double off = (double)*i - avg;
stddev += off * off;
}
stddev = sqrt(stddev / (keep - stats.begin() - 1));
std::cout << std::setw(10) << *std::min_element(stats.begin(), stats.end()) << '\t' << std::setw(8) << avg << '\t' << std::setw(8) << stddev;
}
template <class Backend> void Print(std::vector<std::vector<double>> &stats, std::size_t index) {
if (stats.empty()) return;
std::cout << std::setw(16) << Backend::kName << '\t';
Summarize(stats[index]);
std::cout << '\n';
}
} // namespace intgemm
} // namespace
// Program takes no input
int main(int, char ** argv) {
std::cerr << "Remember to run this on a specific core:\ntaskset --cpu-list 0 " << argv[0] << std::endl;
using namespace intgemm;
RandomMatrices matrices[] = {
{1, 64, 8},
{8, 256, 256},
{8, 2048, 256},
{8, 256, 2048},
{320, 256, 256},
{472, 256, 256},
{248, 256, 256},
{200, 256, 256},
// Additional stuff
{256, 256, 256},
{512, 512, 512},
{1024, 1024, 1024},
/* {4096, 4096, 4096},
{4096, 4096, 2048},
{4096, 4096, 1024},
{4096, 4096, 512},
{4096, 4096, 256},*/
{4096, 4096, 128}
};
RandomMatrices *matrices_end = (RandomMatrices*)matrices + sizeof(matrices) / sizeof(RandomMatrices);
// Only do full sampling for <1024 rows.
RandomMatrices *full_sample;
for (full_sample = matrices_end - 1; full_sample >= matrices && full_sample->A_rows >= 1024; --full_sample) {}
++full_sample;
BackendStats stats;
const int kSamples = 100;
// Realistically, we don't expect different architectures or different precisions to run in the
// same run of an application. Benchmark per architecture and per precision level.
std::cerr << "SSSE3 8bit, 100 samples..." << std::endl;
for (int samples = 0; samples < kSamples; ++samples) {
RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
RunAll<SSSE3::Kernels8>(matrices, end, stats.ssse3_8bit);
}
std::cerr << "SSE2 16bit, 100 samples..." << std::endl;
for (int samples = 0; samples < kSamples; ++samples) {
RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
RunAll<SSE2::Kernels16>(matrices, end, stats.sse2_16bit);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
std::cerr << "AVX2 8bit, 100 samples..." << std::endl;
for (int samples = 0; samples < kSamples; ++samples) {
RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
RunAll<AVX2::Kernels8>(matrices, end, stats.avx2_8bit);
}
std::cerr << "AVX2 16bit, 100 samples..." << std::endl;
for (int samples = 0; samples < kSamples; ++samples) {
RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
RunAll<AVX2::Kernels16>(matrices, end, stats.avx2_16bit);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
std::cerr << "AVX512 8bit, 100 samples..." << std::endl;
for (int samples = 0; samples < kSamples; ++samples) {
RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
RunAll<AVX512BW::Kernels8>(matrices, end, stats.avx512_8bit);
}
std::cerr << "AVX512 16bit, 100 samples..." << std::endl;
for (int samples = 0; samples < kSamples; ++samples) {
RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
RunAll<AVX512BW::Kernels16>(matrices, end, stats.avx512_16bit);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
std::cerr << "AVX512VNNI 8bit, 100 samples..." << std::endl;
for (int samples = 0; samples < kSamples; ++samples) {
RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
RunAll<AVX512VNNI::Kernels8>(matrices, end, stats.avx512vnni_8bit);
}
#endif
if (stats.sse2_16bit.empty()) {
std::cerr << "No CPU support." << std::endl;
return 1;
}
for (std::size_t i = 0; i < sizeof(matrices) / sizeof(RandomMatrices); ++i) {
std::cout << "Multiply\t" << matrices[i].A_rows << '\t' << matrices[i].width << '\t' << matrices[i].B_cols << '\t' << "Samples=" << (kOutlierThreshold * stats.sse2_16bit[i].size()) << '\n';
Print<SSSE3::Kernels8>(stats.ssse3_8bit, i);
Print<AVX2::Kernels8>(stats.avx2_8bit, i);
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
Print<AVX512BW::Kernels8>(stats.avx512_8bit, i);
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
Print<AVX512VNNI::Kernels8>(stats.avx512vnni_8bit, i);
#endif
Print<SSE2::Kernels16>(stats.sse2_16bit, i);
Print<AVX2::Kernels16>(stats.avx2_16bit, i);
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
Print<AVX512BW::Kernels16>(stats.avx512_16bit, i);
#endif
}
return 0;
}


@ -1,74 +0,0 @@
#include "../intgemm/intgemm.h"
#include "../intgemm/aligned.h"
#include "../intgemm/ssse3_gemm.h"
#include "../intgemm/avx2_gemm.h"
#include "../intgemm/avx512_gemm.h"
#include <chrono>
#include <iomanip>
#include <iostream>
#include <random>
#include <vector>
namespace {
float MaxAbsoluteBaseline(const float *begin, const float *end) {
auto res = std::minmax_element(begin, end);
return std::max(std::fabs(*res.first), std::fabs(*res.second));
}
void BenchmarkMaxAbsolute() {
std::mt19937 gen;
std::uniform_real_distribution<float> dist(0.f, 1.f);
gen.seed(45678);
intgemm::AlignedVector<float> v(4096 * 4096);
for (auto& it : v) {
it = dist(gen);
}
// Hopefully these don't get optimized out...
MaxAbsoluteBaseline(v.begin(), v.end());
auto start = std::chrono::steady_clock::now();
MaxAbsoluteBaseline(v.begin(), v.end());
double baseline = std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count();
intgemm::MaxAbsolute(v.begin(), v.end());
start = std::chrono::steady_clock::now();
intgemm::MaxAbsolute(v.begin(), v.end());
double optimized = std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count();
std::cout << "MaxAbsolute baseline = " << baseline << " optimized = " << optimized << " speedup = " << (optimized / baseline) << '\n';
}
template <class Backend> void QuantizerBench(const float *in, int8_t *out, intgemm::Index count) {
if (intgemm::kCPU < Backend::kUses) return;
Backend::Quantize(in, out, 1.0, count);
const std::size_t kTries = 60;
auto start = std::chrono::steady_clock::now();
for (std::size_t t = 0; t < kTries; ++t) {
Backend::Quantize(in, out, 1.0, count);
}
auto end = std::chrono::steady_clock::now();
double took = std::chrono::duration<double>(end - start).count() / kTries;
std::cout << std::setw(9) << count << ' ' << std::fixed << std::setw(9) << std::setprecision(7) << took << ' ' << Backend::kName << std::endl;
}
} // namespace
int main() {
BenchmarkMaxAbsolute();
for (std::size_t count = 1; count < (1ULL<<30); count *= 2) {
intgemm::AlignedVector<float> in(count);
intgemm::AlignedVector<int8_t> out(count);
std::mt19937 gen;
std::uniform_real_distribution<float> dist(-129.0, 129.0);
for (float &element : in) {
element = dist(gen);
}
QuantizerBench<intgemm::SSSE3::Kernels8>(in.begin(), out.begin(), static_cast<intgemm::Index>(count));
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
QuantizerBench<intgemm::AVX2::Kernels8>(in.begin(), out.begin(), static_cast<intgemm::Index>(count));
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
QuantizerBench<intgemm::AVX512BW::Kernels8>(in.begin(), out.begin(), static_cast<intgemm::Index>(count));
#endif
}
}

third_party/intgemm/benchmarks/biasmultiply.cc (vendored, 278 lines removed)

@ -1,278 +0,0 @@
#include "../intgemm/intgemm.h"
#include "../intgemm/aligned.h"
#include <chrono>
#include <random>
#include <iostream>
using namespace intgemm;
template <class Routine>
void testOld(Index /*rows*/, Index /*cols*/) {
}
template <class Routine>
std::chrono::duration<double> testNew(Index A_rows, Index width, Index B_cols) {
AlignedVector<float> A(A_rows * width);
AlignedVector<float> B(width * B_cols);
AlignedVector<float> bias(B_cols);
std::mt19937 gen;
std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
for (auto& it : A) {
it = dist(gen);
}
for (auto& it : B) {
it = dist(gen);
}
for (auto& it : bias) {
it = dist(gen);
}
float alpha = 2.0f;
float quant_mult = 127.0f / alpha;
float unquant_mult = 1.0f / (quant_mult*quant_mult);
AlignedVector<uint8_t> A_prep(A.size());
AlignedVector<int8_t> B_prep(B.size());
Routine::PrepareA(A.begin(), A_prep.begin(), quant_mult, A_rows, width);
Routine::PrepareB(B.begin(), B_prep.begin(), quant_mult, width, B_cols);
AlignedVector<float> test_C(A_rows * B_cols);
float unquant_mult_forprep = (-1)*(alpha)*(alpha)/(127.0f); //Minus one to invert add_ps later on
Routine::PrepareBias(B_prep.begin(), width, B_cols, callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult_forprep, bias.begin(), bias.begin()));
auto start = std::chrono::system_clock::now();
Routine::Multiply8Shift(A_prep.begin(), B_prep.begin(), A_rows, width, B_cols, callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult, bias.begin(), test_C.begin()));
auto end = std::chrono::system_clock::now();
std::chrono::duration<double> elapsed_seconds = end-start;
return elapsed_seconds;
}
template <class Routine>
std::chrono::duration<double> testOld(Index A_rows, Index width, Index B_cols) {
AlignedVector<float> A(A_rows * width);
AlignedVector<float> B(width * B_cols);
AlignedVector<float> bias(B_cols);
std::mt19937 gen;
std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
for (auto& it : A) {
it = dist(gen);
}
for (auto& it : B) {
it = dist(gen);
}
for (auto& it : bias) {
it = dist(gen);
}
float alpha = 2.0f;
float quant_mult = 127.0f / alpha;
float unquant_mult = 1.0f / (quant_mult*quant_mult);
AlignedVector<int8_t> A_prep(A.size());
AlignedVector<int8_t> B_prep(B.size());
Routine::PrepareA(A.begin(), A_prep.begin(), quant_mult, A_rows, width);
Routine::PrepareB(B.begin(), B_prep.begin(), quant_mult, width, B_cols);
AlignedVector<float> test_C(A_rows * B_cols);
auto start = std::chrono::system_clock::now();
Routine::Multiply(A_prep.begin(), B_prep.begin(), A_rows, width, B_cols, callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult, bias.begin(), test_C.begin()));
auto end = std::chrono::system_clock::now();
std::chrono::duration<double> elapsed_seconds = end-start;
return elapsed_seconds;
}
template <class Routine>
std::chrono::duration<double> testOld_nobias(Index A_rows, Index width, Index B_cols) {
AlignedVector<float> A(A_rows * width);
AlignedVector<float> B(width * B_cols);
std::mt19937 gen;
std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
for (auto& it : A) {
it = dist(gen);
}
for (auto& it : B) {
it = dist(gen);
}
float alpha = 2.0f;
float quant_mult = 127.0f / alpha;
float unquant_mult = 1.0f / (quant_mult*quant_mult);
AlignedVector<int8_t> A_prep(A.size());
AlignedVector<int8_t> B_prep(B.size());
Routine::PrepareA(A.begin(), A_prep.begin(), quant_mult, A_rows, width);
Routine::PrepareB(B.begin(), B_prep.begin(), quant_mult, width, B_cols);
AlignedVector<float> test_C(A_rows * B_cols);
auto start = std::chrono::system_clock::now();
Routine::Multiply(A_prep.begin(), B_prep.begin(), A_rows, width, B_cols, callbacks::UnquantizeAndWrite(unquant_mult, test_C.begin()));
auto end = std::chrono::system_clock::now();
std::chrono::duration<double> elapsed_seconds = end-start;
return elapsed_seconds;
}
int main(int argc, char ** argv) {
int repeat = 1000;
if (argc > 1) {
repeat = atoi(argv[1]);
}
std::chrono::duration<double> oldSSSE3_nobias = testOld_nobias<SSSE3::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
oldSSSE3_nobias += testOld_nobias<SSSE3::Kernels8>(8, 256, 256);
oldSSSE3_nobias += testOld_nobias<SSSE3::Kernels8>(8, 2048, 256);
oldSSSE3_nobias += testOld_nobias<SSSE3::Kernels8>(320, 256, 256);
oldSSSE3_nobias += testOld_nobias<SSSE3::Kernels8>(472, 256, 256);
oldSSSE3_nobias += testOld_nobias<SSSE3::Kernels8>(248, 256, 256);
oldSSSE3_nobias += testOld_nobias<SSSE3::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of SSSE3 without bias took: " << oldSSSE3_nobias.count() << " seconds." << std::endl;
std::chrono::duration<double> oldSSSE3 = testOld<SSSE3::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
oldSSSE3 += testOld<SSSE3::Kernels8>(8, 256, 256);
oldSSSE3 += testOld<SSSE3::Kernels8>(8, 2048, 256);
oldSSSE3 += testOld<SSSE3::Kernels8>(320, 256, 256);
oldSSSE3 += testOld<SSSE3::Kernels8>(472, 256, 256);
oldSSSE3 += testOld<SSSE3::Kernels8>(248, 256, 256);
oldSSSE3 += testOld<SSSE3::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of SSSE3 took: " << oldSSSE3.count() << " seconds." << std::endl;
std::chrono::duration<double> newTimeSSSE3 = testNew<SSSE3::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
newTimeSSSE3 += testNew<SSSE3::Kernels8>(8, 256, 256);
newTimeSSSE3 += testNew<SSSE3::Kernels8>(8, 2048, 256);
newTimeSSSE3 += testNew<SSSE3::Kernels8>(320, 256, 256);
newTimeSSSE3 += testNew<SSSE3::Kernels8>(472, 256, 256);
newTimeSSSE3 += testNew<SSSE3::Kernels8>(248, 256, 256);
newTimeSSSE3 += testNew<SSSE3::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of Shifted SSSE3 took: " << newTimeSSSE3.count() << " seconds." << std::endl;
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
std::chrono::duration<double> oldAVX2_nobias = testOld_nobias<AVX2::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
oldAVX2_nobias += testOld_nobias<AVX2::Kernels8>(8, 256, 256);
oldAVX2_nobias += testOld_nobias<AVX2::Kernels8>(8, 2048, 256);
oldAVX2_nobias += testOld_nobias<AVX2::Kernels8>(320, 256, 256);
oldAVX2_nobias += testOld_nobias<AVX2::Kernels8>(472, 256, 256);
oldAVX2_nobias += testOld_nobias<AVX2::Kernels8>(248, 256, 256);
oldAVX2_nobias += testOld_nobias<AVX2::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of AVX2 without bias took: " << oldAVX2_nobias.count() << " seconds." << std::endl;
std::chrono::duration<double> oldAVX2 = testOld<AVX2::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
oldAVX2 += testOld<AVX2::Kernels8>(8, 256, 256);
oldAVX2 += testOld<AVX2::Kernels8>(8, 2048, 256);
oldAVX2 += testOld<AVX2::Kernels8>(320, 256, 256);
oldAVX2 += testOld<AVX2::Kernels8>(472, 256, 256);
oldAVX2 += testOld<AVX2::Kernels8>(248, 256, 256);
oldAVX2 += testOld<AVX2::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of AVX2 took: " << oldAVX2.count() << " seconds." << std::endl;
std::chrono::duration<double> newTimeAVX2 = testNew<AVX2::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
newTimeAVX2 += testNew<AVX2::Kernels8>(8, 256, 256);
newTimeAVX2 += testNew<AVX2::Kernels8>(8, 2048, 256);
newTimeAVX2 += testNew<AVX2::Kernels8>(320, 256, 256);
newTimeAVX2 += testNew<AVX2::Kernels8>(472, 256, 256);
newTimeAVX2 += testNew<AVX2::Kernels8>(248, 256, 256);
newTimeAVX2 += testNew<AVX2::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of Shifted AVX2 took: " << newTimeAVX2.count() << " seconds." << std::endl;
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
if (kCPU < CPUType::AVX512BW) return 0;
std::chrono::duration<double> oldAVX512_nobias = testOld_nobias<AVX512BW::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
oldAVX512_nobias += testOld_nobias<AVX512BW::Kernels8>(8, 256, 256);
oldAVX512_nobias += testOld_nobias<AVX512BW::Kernels8>(8, 2048, 256);
oldAVX512_nobias += testOld_nobias<AVX512BW::Kernels8>(320, 256, 256);
oldAVX512_nobias += testOld_nobias<AVX512BW::Kernels8>(472, 256, 256);
oldAVX512_nobias += testOld_nobias<AVX512BW::Kernels8>(248, 256, 256);
oldAVX512_nobias += testOld_nobias<AVX512BW::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of AVX512 without bias took: " << oldAVX512_nobias.count() << " seconds." << std::endl;
std::chrono::duration<double> oldAVX512 = testOld<AVX512BW::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
oldAVX512 += testOld<AVX512BW::Kernels8>(8, 256, 256);
oldAVX512 += testOld<AVX512BW::Kernels8>(8, 2048, 256);
oldAVX512 += testOld<AVX512BW::Kernels8>(320, 256, 256);
oldAVX512 += testOld<AVX512BW::Kernels8>(472, 256, 256);
oldAVX512 += testOld<AVX512BW::Kernels8>(248, 256, 256);
oldAVX512 += testOld<AVX512BW::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of AVX512 took: " << oldAVX512.count() << " seconds." << std::endl;
std::chrono::duration<double> newTimeAVX512 = testNew<AVX512BW::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
newTimeAVX512 += testNew<AVX512BW::Kernels8>(8, 256, 256);
newTimeAVX512 += testNew<AVX512BW::Kernels8>(8, 2048, 256);
newTimeAVX512 += testNew<AVX512BW::Kernels8>(320, 256, 256);
newTimeAVX512 += testNew<AVX512BW::Kernels8>(472, 256, 256);
newTimeAVX512 += testNew<AVX512BW::Kernels8>(248, 256, 256);
newTimeAVX512 += testNew<AVX512BW::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of Shifted AVX512 took: " << newTimeAVX512.count() << " seconds." << std::endl;
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
if (kCPU < CPUType::AVX512VNNI) return 0;
std::chrono::duration<double> oldAVX512VNNI_nobias = testOld_nobias<AVX512VNNI::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI::Kernels8>(8, 256, 256);
oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI::Kernels8>(8, 2048, 256);
oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI::Kernels8>(320, 256, 256);
oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI::Kernels8>(472, 256, 256);
oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI::Kernels8>(248, 256, 256);
oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of AVX512VNNI without bias took: " << oldAVX512VNNI_nobias.count() << " seconds." << std::endl;
std::chrono::duration<double> oldAVX512VNNI = testOld<AVX512VNNI::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
oldAVX512VNNI += testOld<AVX512VNNI::Kernels8>(8, 256, 256);
oldAVX512VNNI += testOld<AVX512VNNI::Kernels8>(8, 2048, 256);
oldAVX512VNNI += testOld<AVX512VNNI::Kernels8>(320, 256, 256);
oldAVX512VNNI += testOld<AVX512VNNI::Kernels8>(472, 256, 256);
oldAVX512VNNI += testOld<AVX512VNNI::Kernels8>(248, 256, 256);
oldAVX512VNNI += testOld<AVX512VNNI::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of AVX512VNNI took: " << oldAVX512VNNI.count() << " seconds." << std::endl;
std::chrono::duration<double> newTimeAVX512VNNI = testNew<AVX512VNNI::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
newTimeAVX512VNNI += testNew<AVX512VNNI::Kernels8>(8, 256, 256);
newTimeAVX512VNNI += testNew<AVX512VNNI::Kernels8>(8, 2048, 256);
newTimeAVX512VNNI += testNew<AVX512VNNI::Kernels8>(320, 256, 256);
newTimeAVX512VNNI += testNew<AVX512VNNI::Kernels8>(472, 256, 256);
newTimeAVX512VNNI += testNew<AVX512VNNI::Kernels8>(248, 256, 256);
newTimeAVX512VNNI += testNew<AVX512VNNI::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of Shifted AVX512VNNI took: " << newTimeAVX512VNNI.count() << " seconds." << std::endl;
#endif
}
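For context on why testNew calls PrepareBias with a negative unquant_mult_forprep before timing Multiply8Shift: in the shifted path A is quantized to unsigned bytes as a_s + 127, so every dot product picks up an extra 127 * sum(column of B) that must be cancelled through the bias. The scalar sketch below spells out that identity; it is illustrative only, not intgemm code, and the names are hypothetical.

#include <cstddef>
#include <cstdint>

// One output element of the shifted multiply. a_u holds a_s + 127 as unsigned
// bytes, so the raw accumulator equals dot(a_s, b) + 127 * sum(b); the
// corrected bias prepared beforehand cancels the second term.
float ShiftedDot(const uint8_t* a_u, const int8_t* b, std::size_t width,
                 float unquant_mult, float corrected_bias) {
  int32_t acc = 0;
  for (std::size_t k = 0; k < width; ++k) {
    acc += static_cast<int32_t>(a_u[k]) * static_cast<int32_t>(b[k]);
  }
  // corrected_bias = original_bias - unquant_mult * 127 * sum_k b[k],
  // which is what PrepareBias folds into the bias array above.
  return static_cast<float>(acc) * unquant_mult + corrected_bias;
}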

25
third_party/intgemm/compile_test/avx2.cc vendored
View file

@@ -1,25 +0,0 @@
// Some compilers don't have AVX2 support. Test for them.
#include <immintrin.h>
// clang-cl bug doesn't include these headers when pretending to be MSVC
// https://github.com/llvm/llvm-project/blob/e9a294449575a1e1a0daca470f64914695dc9adc/clang/lib/Headers/immintrin.h#L69-L72
#if defined(_MSC_VER) && defined(__clang__)
#include <avxintrin.h>
#include <avx2intrin.h>
#include <smmintrin.h>
#endif
#if defined(_MSC_VER) && !defined(__clang__)
#define INTGEMM_AVX2
#else
#define INTGEMM_AVX2 __attribute__ ((target ("avx2")))
#endif
INTGEMM_AVX2 int Test() {
__m256i value = _mm256_set1_epi32(1);
value = _mm256_abs_epi8(value);
return *(int*)&value;
}
int main() {
}

31
third_party/intgemm/compile_test/avx512bw.cc vendored
View file

@@ -1,31 +0,0 @@
// Some compilers don't have AVX512BW support. Test for them.
#include <immintrin.h>
// clang-cl bug doesn't include these headers when pretending to be MSVC
// https://github.com/llvm/llvm-project/blob/e9a294449575a1e1a0daca470f64914695dc9adc/clang/lib/Headers/immintrin.h#L69-L72
#if defined(_MSC_VER) && defined(__clang__)
#include <avxintrin.h>
#include <avx2intrin.h>
#include <smmintrin.h>
#include <avx512fintrin.h>
#include <avx512dqintrin.h>
#include <avx512bwintrin.h>
#endif
#if defined(_MSC_VER) && !defined(__clang__)
#define INTGEMM_AVX512BW
#elif defined(__INTEL_COMPILER)
#define INTGEMM_AVX512BW __attribute__ ((target ("avx512f")))
#else
#define INTGEMM_AVX512BW __attribute__ ((target ("avx512bw")))
#endif
INTGEMM_AVX512BW int Test() {
// AVX512BW
__m512i value = _mm512_set1_epi32(1);
value = _mm512_maddubs_epi16(value, value);
return *(int*)&value;
}
int main() {
}

View file

@@ -1,36 +0,0 @@
#include <immintrin.h>
// clang-cl bug doesn't include these headers when pretending to be MSVC
// https://github.com/llvm/llvm-project/blob/e9a294449575a1e1a0daca470f64914695dc9adc/clang/lib/Headers/immintrin.h#L69-L72
#if defined(_MSC_VER) && defined(__clang__)
#include <avxintrin.h>
#include <avx2intrin.h>
#include <smmintrin.h>
#include <avx512fintrin.h>
#include <avx512dqintrin.h>
#include <avx512bwintrin.h>
#include <avx512vnniintrin.h>
#endif
#if defined(_MSC_VER) && !defined(__clang__)
#elif defined(__INTEL_COMPILER)
__attribute__ ((target ("avx512f")))
#else
__attribute__ ((target ("avx512f,avx512bw,avx512dq,avx512vnni")))
#endif
bool Foo() {
// AVX512F
__m512i value = _mm512_set1_epi32(1);
// AVX512BW
value = _mm512_maddubs_epi16(value, value);
// AVX512DQ
__m256i value2 = _mm256_set1_epi8(1);
value = _mm512_inserti32x8(value, value2, 1);
// AVX512VNNI
value = _mm512_dpbusd_epi32(value, value, value);
return *(int*)&value;
}
int main() {
return Foo();
}

79
third_party/intgemm/example.cc vendored
View file

@@ -1,79 +0,0 @@
#include "intgemm/intgemm.h"
// This is just for AlignedVector, which helps manage 64-byte aligned memory.
// Feel free to manage memory yourself.
#include "intgemm/aligned.h"
#include "intgemm/callbacks.h"
#include <cassert>
#include <cmath>
#include <random>
int main() {
using intgemm::Index;
const Index A_rows = 1;
// The shared dimension: A's columns and B's rows.
const Index width = 64;
const Index B_cols = 8;
// This is a simple vector class that allocates memory aligned to 64 bytes.
// You don't have to use it; just use aligned_alloc and friends directly.
using intgemm::AlignedVector;
AlignedVector<float> A(A_rows * width);
AlignedVector<float> B(width * B_cols);
// Fill with random values in range [-2, 2].
std::mt19937 gen;
std::uniform_real_distribution<float> dist(-2.f, 2.f);
gen.seed(1);
for (auto& it : A) {
it = dist(gen);
}
for (auto& it : B) {
it = dist(gen);
}
// Compute the top left corner of C as a sanity check.
float top_left_reference = 0.0f;
for (Index w = 0; w < width; ++w) {
top_left_reference += A[w] * B[w * B_cols];
}
// 16-bit multiplication.
{
// For 16-bit, Jacob Devlin recommends 1024 so as to not overflow in 32-bit accumulation.
float quant_mult = 1024.0f;
AlignedVector<int16_t> A_prepared(A.size());
AlignedVector<int16_t> B_prepared(B.size());
// Quantize A.
intgemm::Int16::PrepareA(A.begin(), A_prepared.begin(), quant_mult, A_rows, width);
// Quantize and reshape B.
// Typically you will do this once when parameters are loaded, not every time.
intgemm::Int16::PrepareB(B.begin(), B_prepared.begin(), quant_mult, width, B_cols);
AlignedVector<float> C(A_rows * B_cols);
// Do the actual multiply.
intgemm::Int16::Multiply(A_prepared.begin(), B_prepared.begin(), A_rows, width, B_cols, intgemm::callbacks::UnquantizeAndWrite(1.0f / (quant_mult * quant_mult), C.begin()));
// Sanity check. C will be row major.
assert(std::fabs(C[0] - top_left_reference) < 0.05f);
}
// 8-bit multiplication.
{
// For 8-bit, a good quantization multiplier is 127 / largest absolute value.
float quant_mult = 127.0f / 2.0f;
AlignedVector<int8_t> A_prepared(A.size());
AlignedVector<int8_t> B_prepared(B.size());
// Quantize A.
intgemm::Int8::PrepareA(A.begin(), A_prepared.begin(), quant_mult, A_rows, width);
// Quantize and reshape B.
// Typically you will do this once when parameters are loaded, not every time.
intgemm::Int8::PrepareB(B.begin(), B_prepared.begin(), quant_mult, width, B_cols);
AlignedVector<float> C(A_rows * B_cols);
// Do the actual multiply.
intgemm::Int8::Multiply(A_prepared.begin(), B_prepared.begin(), A_rows, width, B_cols, intgemm::callbacks::UnquantizeAndWrite(1.0f / (quant_mult * quant_mult), C.begin()));
// Sanity check. C will be row major.
assert(std::fabs(C[0] - top_left_reference) < 0.05f);
}
return 0;
}
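The sanity checks above rely on the basic unquantization identity: with both operands scaled by quant_mult, the integer dot product is scaled by quant_mult squared, which is why the example passes 1.0f / (quant_mult * quant_mult) to UnquantizeAndWrite. A scalar sketch of one output element (illustrative only, not part of the example; the name is hypothetical):

#include <cstddef>
#include <cstdint>

// Reference for what UnquantizeAndWrite recovers: divide the int32 dot
// product by quant_mult * quant_mult to get back an approximate float result.
float UnquantizedDot(const int8_t* a_q, const int8_t* b_q, std::size_t width,
                     float quant_mult) {
  int32_t acc = 0;
  for (std::size_t k = 0; k < width; ++k) {
    acc += static_cast<int32_t>(a_q[k]) * static_cast<int32_t>(b_q[k]);
  }
  return static_cast<float>(acc) / (quant_mult * quant_mult);
}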

90
third_party/intgemm/intgemm/aligned.h vendored
View file

@@ -1,90 +0,0 @@
#pragma once
#include <cstdlib>
#include <new>
#ifdef _MSC_VER
// Ensure _HAS_EXCEPTIONS is defined
#include <vcruntime.h>
#include <malloc.h>
#endif
#if !((defined(_MSC_VER) && !defined(__clang__)) ? (_HAS_EXCEPTIONS) : (__EXCEPTIONS))
#include <cstdlib>
#endif
// Aligned simple vector.
namespace intgemm {
template <class T> class AlignedVector {
public:
AlignedVector() : mem_(nullptr), size_(0) {}
explicit AlignedVector(std::size_t size, std::size_t alignment = 64 /* CPU cares about this */)
: size_(size) {
#ifdef _MSC_VER
mem_ = static_cast<T*>(_aligned_malloc(size * sizeof(T), alignment));
if (!mem_) {
# if (defined(_MSC_VER) && !defined(__clang__)) ? (_HAS_EXCEPTIONS) : (__EXCEPTIONS)
throw std::bad_alloc();
# else
std::abort();
# endif
}
#else
if (posix_memalign(reinterpret_cast<void **>(&mem_), alignment, size * sizeof(T))) {
# if (defined(_MSC_VER) && !defined(__clang__)) ? (_HAS_EXCEPTIONS) : (__EXCEPTIONS)
throw std::bad_alloc();
# else
std::abort();
# endif
}
#endif
}
AlignedVector(AlignedVector &&from) : mem_(from.mem_), size_(from.size_) {
from.mem_ = nullptr;
from.size_ = 0;
}
AlignedVector &operator=(AlignedVector &&from) {
if (this == &from) return *this;
release();
mem_ = from.mem_;
size_ = from.size_;
from.mem_ = nullptr;
from.size_ = 0;
return *this;
}
AlignedVector(const AlignedVector&) = delete;
AlignedVector& operator=(const AlignedVector&) = delete;
~AlignedVector() { release(); }
std::size_t size() const { return size_; }
T &operator[](std::size_t offset) { return mem_[offset]; }
const T &operator[](std::size_t offset) const { return mem_[offset]; }
T *begin() { return mem_; }
const T *begin() const { return mem_; }
T *end() { return mem_ + size_; }
const T *end() const { return mem_ + size_; }
template <typename ReturnType>
ReturnType *as() { return reinterpret_cast<ReturnType*>(mem_); }
private:
T *mem_;
std::size_t size_;
void release() {
#ifdef _MSC_VER
_aligned_free(mem_);
#else
std::free(mem_);
#endif
}
};
} // namespace intgemm
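A short usage sketch for the class above (illustrative only; the function is hypothetical): allocation is 64-byte aligned by default, which is the alignment the Quantize and Multiply routines in this tree assert on their inputs.

#include <cstddef>

// Fill an aligned buffer and take the raw pointer a kernel would consume.
void AlignedVectorUsage() {
  intgemm::AlignedVector<float> input(1024);  // 1024 floats, 64-byte aligned
  for (std::size_t i = 0; i < input.size(); ++i) {
    input[i] = static_cast<float>(i) * 0.25f;
  }
  float* aligned_ptr = input.begin();  // suitable for aligned SIMD loads
  (void)aligned_ptr;
}  // memory is released by ~AlignedVector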

232
third_party/intgemm/intgemm/avx2_gemm.h vendored
View file

@@ -1,232 +0,0 @@
#pragma once
#include "intgemm/intgemm_config.h"
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
#include "interleave.h"
#include "kernels.h"
#include "multiply.h"
#include "types.h"
#include <cstdint>
#include <cstring>
namespace intgemm {
namespace AVX2 {
INTGEMM_AVX2 inline Register QuantizerGrab(const float *input, const __m256 quant_mult_reg) {
return kernels::quantize(loadu_ps<FRegister>(input), quant_mult_reg);
}
INTGEMM_SELECT_COL_B(INTGEMM_AVX2, __m256i)
class QuantizeTile16 {
public:
INTGEMM_AVX2 static inline Register Consecutive(FRegister mult_reg, const float *input) {
return Tile(mult_reg, input, input + 8);
}
INTGEMM_AVX2 static inline Register ConsecutiveWithWrapping(FRegister mult_reg, const float *input, Index cols_left, Index cols, Index row_step) {
return Tile(mult_reg,
input,
input + 8 + (cols_left <= 8 ? cols * (row_step - 1) : 0));
}
INTGEMM_AVX2 static inline Register ForReshape(FRegister mult_reg, const float *input, Index cols) {
// 8 rows in the first 128-bit register, 8 in the second register.
return Tile(mult_reg, input, input + 8 * cols);
}
private:
INTGEMM_AVX2 static inline Register Tile(FRegister mult_reg, const float *input0, const float *input1) {
Register g0 = QuantizerGrab(input0, mult_reg);
Register g1 = QuantizerGrab(input1, mult_reg);
Register packed = _mm256_packs_epi32(g0, g1);
// Reorder the packed values because Intel does 0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15.
// Technically this could be removed if the PrepareB did the same reordering internally.
return _mm256_permute4x64_epi64(packed, 0xd8 /* 0, 2, 1, 3 */);
}
};
struct Kernels16 {
typedef int16_t Integer;
// Currently A is prepared by quantization but this could theoretically change.
INTGEMM_AVX2 static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
Quantize(input, output, quant_mult, rows * cols);
}
// Just quantize everything in order.
INTGEMM_AVX2 static void Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
assert(size % 16 == 0);
assert(reinterpret_cast<uintptr_t>(input) % 32 == 0);
FRegister q = set1_ps<FRegister>(quant_mult);
const float *end = input + size;
for (; input != end; input += 16, output += 16) {
*reinterpret_cast<__m256i*>(output) = QuantizeTile16::Consecutive(q, input);
}
}
// Tile size for B; B must be a multiple of this block size.
static const Index kBTileRow = 16;
static const Index kBTileCol = 8;
/*
INTGEMM_AVX2 static void PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
PrepareBFor16(input, output, AVX2::QuantizeTile16(quant_mult), rows, cols);
}*/
INTGEMM_PREPARE_B_16(INTGEMM_AVX2, AVX2::QuantizeTile16)
INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX2, int16_t)
INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX2, AVX2::QuantizeTile16, int16_t)
INTGEMM_AVX2 static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
AVX2::SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows * 2, cols_begin, cols_end);
}
INTGEMM_MULTIPLY16(__m256i, INTGEMM_AVX2, CPUType::AVX2)
constexpr static const char *const kName = "16-bit AVX2";
static const CPUType kUses = CPUType::AVX2;
};
/* Read 8 floats at a time from input0, input1, input2, and input3. Quantize
* them to 8-bit by multiplying with quant_mult_reg then rounding. Concatenate
* the result into one register and return it.
*/
class QuantizeTile8 {
public:
INTGEMM_AVX2 static inline Register Consecutive(FRegister quant_mult, const float *input) {
return Tile(quant_mult, input, input + 8, input + 16, input + 24);
}
INTGEMM_AVX2 static inline Register ConsecutiveU(FRegister quant_mult, const float *input) {
return TileU(quant_mult, input, input + 8, input + 16, input + 24);
}
INTGEMM_AVX2 static inline Register ConsecutiveWithWrapping(FRegister quant_mult, const float *input, Index cols_left, Index cols, Index row_step) {
const float* inputs[4];
for (Index i = 0; i < sizeof(inputs) / sizeof(inputs[0]); ++i) {
while (cols_left < sizeof(Register) / sizeof(float)) {
input += cols * (row_step - 1);
cols_left += cols;
}
inputs[i] = input;
input += sizeof(Register) / sizeof(float);
cols_left -= sizeof(Register) / sizeof(float);
}
return Tile(quant_mult, inputs[0], inputs[1], inputs[2], inputs[3]);
}
INTGEMM_AVX2 static inline Register ForReshape(FRegister quant_mult, const float *input, Index cols) {
// Put higher rows in the second half of the register. These will jumble
// around in the same way then conveniently land in the right place.
return Tile(quant_mult, input, input + 2 * cols, input + 16 * cols, input + 18 * cols);
}
INTGEMM_AVX2 static inline __m256i Tile(FRegister quant_mult, const float *input0, const float *input1, const float *input2, const float *input3) {
// Looking at the assembly, gcc has pulled this outside the loops calling this.
const __m256i neg127 = _mm256_set1_epi8(-127);
const __m256i shuffle_param = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
// Grab 4 registers at a time in 32-bit format.
__m256i g0 = AVX2::QuantizerGrab(input0, quant_mult);
__m256i g1 = AVX2::QuantizerGrab(input1, quant_mult);
__m256i g2 = AVX2::QuantizerGrab(input2, quant_mult);
__m256i g3 = AVX2::QuantizerGrab(input3, quant_mult);
// Pack 32-bit to 16-bit.
__m256i packed0 = _mm256_packs_epi32(g0, g1);
__m256i packed1 = _mm256_packs_epi32(g2, g3);
// Pack 16-bit to 8-bit.
__m256i packed = _mm256_packs_epi16(packed0, packed1);
// Ban -128.
packed = _mm256_max_epi8(packed, neg127);
// Currently in 0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27 4 5 6 7 12 13 14 15 20 21 22 23 28 29 30 31
// Or as 32-bit integers 0 2 4 6 1 3 5 7
// Technically this could be removed so long as the rows are bigger than 16
// and the values are only used for GEMM.
return _mm256_permutevar8x32_epi32(packed, shuffle_param);
}
private:
// A version that produces uint8_t output.
INTGEMM_AVX2 static inline Register TileU(FRegister quant_mult, const float *input0, const float *input1, const float *input2, const float *input3) {
// Looking at the assembly, gcc has pulled this outside the loops calling this.
const __m256i neg127 = _mm256_set1_epi8(-127);
const __m256i pos127 = _mm256_set1_epi8(127);
const __m256i shuffle_param = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
// Grab 4 registers at a time in 32-bit format.
__m256i g0 = AVX2::QuantizerGrab(input0, quant_mult);
__m256i g1 = AVX2::QuantizerGrab(input1, quant_mult);
__m256i g2 = AVX2::QuantizerGrab(input2, quant_mult);
__m256i g3 = AVX2::QuantizerGrab(input3, quant_mult);
// Pack 32-bit to 16-bit.
__m256i packed0 = _mm256_packs_epi32(g0, g1);
__m256i packed1 = _mm256_packs_epi32(g2, g3);
// Pack 16-bit to 8-bit.
__m256i packed = _mm256_packs_epi16(packed0, packed1);
// Ban -128.
packed = _mm256_max_epi8(packed, neg127); //Could be removed if we use +128
packed = _mm256_add_epi8(packed, pos127);
// Currently in 0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27 4 5 6 7 12 13 14 15 20 21 22 23 28 29 30 31
// Or as 32-bit integers 0 2 4 6 1 3 5 7
// Technically this could be removed so long as the rows are bigger than 16
// and the values are only used for GEMM.
return _mm256_permutevar8x32_epi32(packed, shuffle_param);
}
};
struct Kernels8 {
typedef int8_t Integer;
// Currently A is prepared by quantization but this could theoretically change.
INTGEMM_AVX2 static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
Quantize(input, output, quant_mult, rows * cols);
}
private:
INTGEMM_QUANTIZE_THREAD(INTGEMM_AVX2)
public:
INTGEMM_QUANTIZE(INTGEMM_AVX2)
// Currently A is prepared by quantization but this could theoretically change.
INTGEMM_AVX2 static inline void PrepareA(const float *input, uint8_t *output, float quant_mult, Index rows, Index cols) {
QuantizeU(input, output, quant_mult, rows * cols);
}
// Just quantize everything in order.
INTGEMM_AVX2 static void QuantizeU(const float *input, uint8_t *output, float quant_mult, Index size) {
assert(size % 32 == 0);
assert(reinterpret_cast<uintptr_t>(input) % 32 == 0);
FRegister q = set1_ps<FRegister>(quant_mult);
const float *end = input + size;
for (; input != end; input += 32, output += 32) {
*reinterpret_cast<__m256i*>(output) = QuantizeTile8::ConsecutiveU(q, input);
}
}
// Tile size for B; B must be a multiple of this block size.
static const Index kBTileRow = 32;
static const Index kBTileCol = 8;
INTGEMM_PREPARE_B_8(INTGEMM_AVX2, AVX2::QuantizeTile8)
INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX2, int8_t)
INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX2, AVX2::QuantizeTile8, int8_t)
INTGEMM_AVX2 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
AVX2::SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows, cols_begin, cols_end);
}
INTGEMM_MULTIPLY8(__m256i, INTGEMM_AVX2, CPUType::AVX2)
INTGEMM_MULTIPLY8SHIFT(__m256i, INTGEMM_AVX2, CPUType::AVX2)
INTGEMM_PREPAREBIASFOR8(__m256i, INTGEMM_AVX2, CPUType::AVX2)
constexpr static const char *const kName = "8-bit AVX2";
static const CPUType kUses = CPUType::AVX2;
};
} // namespace AVX2
} // namespace intgemm
#endif
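Net effect of QuantizeTile16::Consecutive above, as a scalar model (illustrative only, not intgemm code): _mm256_packs_epi32 packs within 128-bit lanes, and the following _mm256_permute4x64_epi64 with 0xd8 restores consecutive order, so one tile is simply 16 saturated int16 values in input order.

#include <algorithm>
#include <cmath>
#include <cstdint>

void QuantizeTile16Model(const float* input, float quant_mult, int16_t out[16]) {
  for (int i = 0; i < 16; ++i) {
    // Multiply and round like _mm256_cvtps_epi32, then saturate like packs_epi32.
    float scaled = std::nearbyint(input[i] * quant_mult);
    scaled = std::min(32767.0f, std::max(-32768.0f, scaled));
    out[i] = static_cast<int16_t>(scaled);
  }
}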

411
third_party/intgemm/intgemm/avx512_gemm.h vendored
View file

@@ -1,411 +0,0 @@
#pragma once
#include "intgemm/intgemm_config.h"
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
#include "interleave.h"
#include "kernels.h"
#include "multiply.h"
#include "types.h"
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
/* AVX512 implementation.
* This uses INTGEMM_AVX512BW, INTGEMM_AVX512DQ, and might use AVX512VL
* That means it supports mainstream CPUs with AVX512, starting with Skylake
* Xeons.
* It does not support any Knights / Xeon Phi processors.
*
* All memory must be 64-byte aligned.
*/
namespace intgemm {
// AVX512 has combined collapse and store instructions:
// _mm512_mask_cvtsepi32_storeu_epi16
// _mm512_mask_cvtsepi32_storeu_epi8
// So conversion in memory uses these, but I also implement a wider version for
// rearranging B.
namespace AVX512BW {
// Load from memory, multiply, and convert to int32_t.
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_AVX512BW inline __m512i QuantizerGrab(const float *input, const __m512 quant_mult_reg) {
return kernels::quantize(loadu_ps<__m512>(input), quant_mult_reg);
}
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_SELECT_COL_B(INTGEMM_AVX512BW, __m512i)
// For PrepareB we want to read 8 columns at a time. When converting 32-bit
// floats to 8-bit values, that's 32 bytes of floats. But AVX512 is 64 bytes
// wide so it reads off the edge of the tile. We could expand the tile size
// but then the memory written to won't be contiguous anyway, so we'd be doing a
// scatter regardless. Easier to just read the 8 columns we want as 256 bits and
// concatenate them.
INTGEMM_AVX512DQ inline __m512 Concat(const __m256 first, const __m256 second) {
// INTGEMM_AVX512DQ but that goes with INTGEMM_AVX512BW anyway.
return _mm512_insertf32x8(_mm512_castps256_ps512(first), second, 1);
}
// Like QuantizerGrab, but allows 32-byte halves (i.e. 8 columns) to be controlled independently.
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_AVX512BW inline __m512i QuantizerGrabHalves(const float *input0, const float *input1, const __m512 quant_mult_reg) {
__m512 appended = Concat(loadu_ps<__m256>(input0), loadu_ps<__m256>(input1));
appended = _mm512_mul_ps(appended, quant_mult_reg);
return _mm512_cvtps_epi32(appended);
}
// These are only used for reshaping due to the AVX512 instructions
// _mm512_mask_cvtsepi32_storeu_epi16 and _mm512_mask_cvtsepi32_storeu_epi8
// being used for the quantizer.
class QuantizeTile16 {
public:
INTGEMM_AVX512BW static inline Register ConsecutiveWithWrapping(FRegister quant_mult, const float *input, Index cols_left, Index cols, Index row_step) {
auto input0 = input;
auto input1 = input + 16 + (cols_left <= 16 ? cols * (row_step - 1) : 0);
auto g0 = QuantizerGrabHalves(input0, input1, quant_mult);
auto g1 = QuantizerGrabHalves(input0 + 8, input1 + 8, quant_mult);
auto packed = packs_epi32(g0, g1);
return _mm512_permutex_epi64(packed, 0xd8 /* 0, 2, 1, 3 */);
}
INTGEMM_AVX512BW static inline Register ForReshape(FRegister quant_mult, const float *input, Index cols) {
__m512i g0 = QuantizerGrabHalves(input, input + 16 * cols, quant_mult);
__m512i g1 = QuantizerGrabHalves(input + 8 * cols, input + 24 * cols, quant_mult);
__m512i packed = packs_epi32(g0, g1);
// Permute within 256-bit lanes, so same as INTGEMM_AVX2
return _mm512_permutex_epi64(packed, 0xd8 /* 0, 2, 1, 3 */);
}
};
class QuantizeTile8 {
public:
INTGEMM_AVX512BW static inline Register ConsecutiveWithWrapping(FRegister quant_mult, const float *input, Index cols_left, Index cols, Index row_step) {
static const __m512i neg127 = _mm512_set1_epi8(-127);
static const __m512i shuffle_param = _mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
const float* inputs[4];
for (Index i = 0; i < sizeof(inputs) / sizeof(inputs[0]); ++i) {
while (cols_left < sizeof(Register) / sizeof(float)) {
input += cols * (row_step - 1);
cols_left += cols;
}
inputs[i] = input;
input += sizeof(Register) / sizeof(float);
cols_left -= sizeof(Register) / sizeof(float);
}
auto g0 = QuantizerGrab(inputs[0], quant_mult);
auto g1 = QuantizerGrab(inputs[1], quant_mult);
auto g2 = QuantizerGrab(inputs[2], quant_mult);
auto g3 = QuantizerGrab(inputs[3], quant_mult);
auto packed0 = packs_epi32(g0, g1);
auto packed1 = packs_epi32(g2, g3);
auto packed = _mm512_packs_epi16(packed0, packed1);
packed = _mm512_max_epi8(packed, neg127);
return _mm512_permutexvar_epi32(shuffle_param, packed);
}
INTGEMM_AVX512BW static inline __m512i ForReshape(FRegister quant_mult, const float *input, Index cols) {
// TODO: try alternative: _mm512_cvtsepi32_epi8 ?
const __m512i neg127 = _mm512_set1_epi8(-127);
// In reverse order: grabbing the first 32-bit values from each 128-bit register, then the second 32-bit values, etc.
const __m512i shuffle_param = _mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
// 32-bit format.
__m512i g0 = QuantizerGrabHalves(input, input + 2 * cols, quant_mult);
__m512i g1 = QuantizerGrabHalves(input + 16 * cols, input + 18 * cols, quant_mult);
__m512i g2 = QuantizerGrabHalves(input + 32 * cols, input + 34 * cols, quant_mult);
__m512i g3 = QuantizerGrabHalves(input + 48 * cols, input + 50 * cols, quant_mult);
// Pack 32-bit to 16-bit.
__m512i packed0 = packs_epi32(g0, g1);
__m512i packed1 = packs_epi32(g2, g3);
// Pack 16-bit to 8-bit.
__m512i packed = _mm512_packs_epi16(packed0, packed1);
// Ban -128.
packed = _mm512_max_epi8(packed, neg127);
// 0 1 2 3 16 17 18 19 32 33 34 35 48 49 50 51 4 5 6 7 20 21 22 23 36 37 38 39 52 53 54 55 8 9 10 11 24 25 26 27 40 41 42 43 56 57 58 59 12 13 14 15 28 29 30 31 44 45 46 47 60 61 62 63
return _mm512_permutexvar_epi32(shuffle_param, packed);
}
};
struct Kernels16 {
typedef int16_t Integer;
// Currently A is prepared by quantization but this could theoretically change.
// rows * cols must be a multiple of 16.
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_AVX512BW static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
Quantize(input, output, quant_mult, rows * cols);
}
// Technically output can be unaligned in Quantize.
// But then it will need to be aligned for Multiply.
// size must be a multiple of 16.
// Convert to 16-bit signed integers.
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_AVX512BW static void Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
assert(size % 16 == 0);
assert(reinterpret_cast<uintptr_t>(input) % 64 == 0);
// Fill with the quantization multiplier.
const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult);
const float *end = input + size;
for (; input != end; input += 16, output += 16) {
// There doesn't seem to be an unmasked version.
_mm512_mask_cvtsepi32_storeu_epi16(output, 0xffff, QuantizerGrab(input, quant_mult_reg));
}
}
// Tile size for B; B must be a multiple of this block size.
static const Index kBTileRow = 32;
static const Index kBTileCol = 8;
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_PREPARE_B_16(INTGEMM_AVX512BW, QuantizeTile16)
INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX512BW, int16_t)
INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX512BW, QuantizeTile16, int16_t)
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_AVX512BW static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows * 2, cols_begin, cols_end);
}
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_MULTIPLY16(__m512i, INTGEMM_AVX512BW, CPUType::AVX2)
constexpr static const char *const kName = "16-bit AVX512";
static const CPUType kUses = CPUType::AVX512BW;
};
struct Kernels8 {
typedef int8_t Integer;
// Currently A is prepared by quantization but this could theoretically change.
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_AVX512BW static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
Quantize(input, output, quant_mult, rows * cols);
}
private:
/* g++ (Ubuntu 7.4.0-1ubuntu1~18.04.1) 7.4.0 does not carry target attributes
* to the hidden function it creates in implementing #pragma omp parallel for.
* So intrinsics were not working inside the for loop when compiled with
* OMP. Also, passing register types across #pragma omp parallel for
* generated an internal compiler error.
* The problem does not occur in g++-8 (Ubuntu 8.3.0-6ubuntu1~18.04.1) 8.3.0.
* As a workaround, I split into #pragma omp parallel with boring types
* passed across the boundary then call this function with target attributes.
*/
INTGEMM_AVX512BW static void QuantizeThread(const float *input, int8_t *output, float quant_mult, std::size_t count) {
const __m512i neg127 = _mm512_set1_epi32(-127);
const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult);
const std::size_t kBatch = sizeof(__m512i) / sizeof(float);
#pragma omp for
for (std::size_t i = 0; i < count; i += kBatch) {
__m512i asint = QuantizerGrab(input + i, quant_mult_reg);
asint = _mm512_max_epi32(asint, neg127);
// There doesn't seem to be an unmasked version.
_mm512_mask_cvtsepi32_storeu_epi8(output + i, 0xffff, asint);
}
}
public:
// Technically output can be unaligned in Quantize.
// But then it will need to be aligned for Multiply.
// Convert to 8-bit signed integers.
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_AVX512BW static void Quantize(const float *input, int8_t *output, float quant_mult, Index size) {
assert(reinterpret_cast<uintptr_t>(input) % sizeof(__m512i) == 0);
const std::size_t kBatch = sizeof(__m512i) / sizeof(float);
std::size_t fast_size = (size & ~(kBatch - 1));
const float *fast_input_end = input + fast_size;
int8_t *fast_output_end = output + fast_size;
#pragma omp parallel
{
QuantizeThread(input, output, quant_mult, fast_size);
}
std::size_t overhang = size & (kBatch - 1);
if (!overhang) return; // We needed a branch anyway for the empty case.
const __m512i neg127 = _mm512_set1_epi32(-127);
const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult);
__m512i asint = QuantizerGrab(fast_input_end, quant_mult_reg);
asint = _mm512_max_epi32(asint, neg127);
_mm512_mask_cvtsepi32_storeu_epi8(fast_output_end, (1 << overhang) - 1, asint);
}
// Preparing A for the signed/unsigned multiplication. Using add 127
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_AVX512BW static inline void PrepareA(const float *input, uint8_t *output, float quant_mult, Index rows, Index cols) {
QuantizeU(input, output, quant_mult, rows * cols);
}
// Technically output can be unaligned in Quantize.
// But then it will need to be aligned for Multiply.
// Convert to 8-bit signed integers.
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_AVX512BW static void QuantizeU(const float *input, uint8_t *output, float quant_mult, Index size) {
assert(size % 16 == 0);
assert(reinterpret_cast<uintptr_t>(input) % 64 == 0);
const __m512i pos127 = _mm512_set1_epi32(127);
const __m512i zero = _mm512_setzero_si512();
const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult);
const float *end = input + size;
for (; input < end; input += 16, output += 16) {
__m512i asint = QuantizerGrab(input, quant_mult_reg);
asint = _mm512_min_epi32(asint, pos127);
asint = _mm512_add_epi32(asint, pos127);
asint = _mm512_max_epi32(asint, zero);
_mm512_mask_cvtusepi32_storeu_epi8(output, 0xffff, asint);
}
}
// Tile size for B; B must be a multiple of this block size.
static const Index kBTileRow = 64;
static const Index kBTileCol = 8;
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_PREPARE_B_8(INTGEMM_AVX512BW, QuantizeTile8)
INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX512BW, int8_t)
INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX512BW, QuantizeTile8, int8_t)
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_AVX512BW static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows, cols_begin, cols_end);
}
// Special AVX512 implementation due to having 32 registers (so I don't have to
// allocate registers manually) and no sign instruction.
template <typename Callback>
INTGEMM_AVX512BW static void Multiply(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) {
// This is copy-paste from Multiply8_SSE2OrAVX2.
assert(width % sizeof(Register) == 0);
assert(B_cols % 8 == 0);
assert(reinterpret_cast<uintptr_t>(A) % sizeof(Register) == 0);
assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0);
// There's 8 results for INTGEMM_AVX2 to handle.
auto callback_impl = callbacks::CallbackImpl<CPUType::AVX2, Callback>(callback);
const Index simd_width = width / sizeof(Register);
// Added for AVX512.
Register zeros = setzero_si<Register>();
// Go over 8 columns of B at a time.
#pragma omp for
for (Index B0_colidx = 0; B0_colidx < B_cols; B0_colidx += 8) {
const Register *B0_col = reinterpret_cast<const Register*>(B) + B0_colidx * simd_width;
// Process one row of A at a time. Doesn't seem to be faster to do multiple rows of A at once.
for (Index A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) {
// Iterate over shared (inner) dimension.
const Register *A_live = reinterpret_cast<const Register *>(A + A_rowidx * width);
const Register *A_end = A_live + simd_width;
const Register *B_live = B0_col;
// Do the first iteration to initialize the sums.
__m512i a = *A_live;
__mmask64 neg_mask = _mm512_test_epi8_mask(a, _mm512_set1_epi8(-128));
__m512i a_positive = _mm512_abs_epi8(a);
// These will be packed 16-bit integers containing sums for each column of B multiplied by the row of A.
Register sum0 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[0], neg_mask, zeros, B_live[0]));
Register sum1 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[1], neg_mask, zeros, B_live[1]));
Register sum2 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[2], neg_mask, zeros, B_live[2]));
Register sum3 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[3], neg_mask, zeros, B_live[3]));
Register sum4 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[4], neg_mask, zeros, B_live[4]));
Register sum5 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[5], neg_mask, zeros, B_live[5]));
Register sum6 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[6], neg_mask, zeros, B_live[6]));
Register sum7 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[7], neg_mask, zeros, B_live[7]));
++A_live;
B_live += 8;
// Use A as the loop variable so the add can be done where gcc likes it
// for branch prediction.
for (; A_live != A_end; ++A_live, B_live += 8) {
// Unique code here: can we do an inline function?
// Retrieve a. We will use this as the unsigned part.
a = *A_live;
// Retrieve the conveniently consecutive values of B.
__m512i b0 = *B_live;
__m512i b1 = *(B_live + 1);
__m512i b2 = *(B_live + 2);
__m512i b3 = *(B_live + 3);
__m512i b4 = *(B_live + 4);
__m512i b5 = *(B_live + 5);
__m512i b6 = *(B_live + 6);
__m512i b7 = *(B_live + 7);
// Get a mask where a is negative.
// Didn't seem to make a difference defining sign bits here vs at the top.
neg_mask = _mm512_test_epi8_mask(a, _mm512_set1_epi8(-128));
a_positive = _mm512_abs_epi8(a);
// Negate by subtracting from zero with a mask.
b0 = _mm512_mask_sub_epi8(b0, neg_mask, zeros, b0);
b1 = _mm512_mask_sub_epi8(b1, neg_mask, zeros, b1);
b2 = _mm512_mask_sub_epi8(b2, neg_mask, zeros, b2);
b3 = _mm512_mask_sub_epi8(b3, neg_mask, zeros, b3);
b4 = _mm512_mask_sub_epi8(b4, neg_mask, zeros, b4);
b5 = _mm512_mask_sub_epi8(b5, neg_mask, zeros, b5);
b6 = _mm512_mask_sub_epi8(b6, neg_mask, zeros, b6);
b7 = _mm512_mask_sub_epi8(b7, neg_mask, zeros, b7);
// The magic 8-bit multiply then horizontal sum into 16-bit.
b0 = _mm512_maddubs_epi16(a_positive, b0);
b1 = _mm512_maddubs_epi16(a_positive, b1);
b2 = _mm512_maddubs_epi16(a_positive, b2);
b3 = _mm512_maddubs_epi16(a_positive, b3);
b4 = _mm512_maddubs_epi16(a_positive, b4);
b5 = _mm512_maddubs_epi16(a_positive, b5);
b6 = _mm512_maddubs_epi16(a_positive, b6);
b7 = _mm512_maddubs_epi16(a_positive, b7);
// Now we have 16-bit results that are the sum of two multiplies.
// Choosing to approximate and do adds.
// Perhaps every so often we could accumulate by upcasting.
sum0 = _mm512_adds_epi16(sum0, b0);
sum1 = _mm512_adds_epi16(sum1, b1);
sum2 = _mm512_adds_epi16(sum2, b2);
sum3 = _mm512_adds_epi16(sum3, b3);
sum4 = _mm512_adds_epi16(sum4, b4);
sum5 = _mm512_adds_epi16(sum5, b5);
sum6 = _mm512_adds_epi16(sum6, b6);
sum7 = _mm512_adds_epi16(sum7, b7);
// Unique code ends: can we do an inline function?
}
// Upcast to 32-bit and horizontally add.
Register ones = set1_epi16<Register>(1);
sum0 = madd_epi16(sum0, ones);
sum1 = madd_epi16(sum1, ones);
sum2 = madd_epi16(sum2, ones);
sum3 = madd_epi16(sum3, ones);
sum4 = madd_epi16(sum4, ones);
sum5 = madd_epi16(sum5, ones);
sum6 = madd_epi16(sum6, ones);
sum7 = madd_epi16(sum7, ones);
Register pack0123 = Pack0123(sum0, sum1, sum2, sum3);
Register pack4567 = Pack0123(sum4, sum5, sum6, sum7);
auto total = PermuteSummer(pack0123, pack4567);
callback_impl.Run(total, callbacks::OutputBufferInfo(A_rowidx, B0_colidx, A_rows, B_cols));
}
}
}
INTGEMM_MULTIPLY8SHIFT(__m512i, INTGEMM_AVX512BW, CPUType::AVX2)
INTGEMM_PREPAREBIASFOR8(__m512i, INTGEMM_AVX512BW, CPUType::AVX2)
constexpr static const char *const kName = "8-bit AVX512BW";
static const CPUType kUses = CPUType::AVX512BW;
};
} // namespace AVX512BW
} // namespace intgemm
#endif
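The Multiply kernel above leans on the fact that _mm512_maddubs_epi16 multiplies an unsigned byte by a signed byte: it feeds the instruction |a| together with a copy of b that has been negated wherever a was negative. A scalar sketch of that identity follows (illustrative only, hypothetical name; banning -128 at quantization time keeps the negation of b representable in an int8 in the vectorized code).

#include <cstdint>
#include <cstdlib>

// a * b == |a| * (a < 0 ? -b : b); the left factor is then a valid unsigned
// byte for the maddubs-style multiply.
int32_t SignedProductViaUnsigned(int8_t a, int8_t b) {
  const uint8_t a_positive = static_cast<uint8_t>(std::abs(static_cast<int>(a)));
  const int32_t b_adjusted = (a < 0) ? -static_cast<int32_t>(b) : static_cast<int32_t>(b);
  return static_cast<int32_t>(a_positive) * b_adjusted;
}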

168
third_party/intgemm/intgemm/avx512vnni_gemm.h vendored
View file

@@ -1,168 +0,0 @@
#pragma once
#include "intgemm/intgemm_config.h"
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
#include "avx512_gemm.h"
#include "types.h"
namespace intgemm {
namespace AVX512VNNI {
// Workaround extra vmovdqa64 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94663
INTGEMM_AVX512VNNI static inline void VNNI8(__m512i &c, __m512i a, __m512i b) {
#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)
asm ("vpdpbusds %2, %1, %0" : "+x"(c) : "x"(a), "mx"(b));
#else
c = _mm512_dpbusds_epi32(c, a, b);
#endif
}
struct Kernels8 : public AVX512BW::Kernels8 {
template <typename Callback>
INTGEMM_AVX512VNNI static void Multiply(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) {
assert(width % sizeof(Register) == 0);
assert(B_cols % 8 == 0);
assert(reinterpret_cast<uintptr_t>(A) % sizeof(Register) == 0);
assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0);
auto callback_impl = callbacks::CallbackImpl<CPUType::AVX2, Callback>(callback);
const Index simd_width = width / sizeof(Register);
Register zeros = setzero_si<Register>();
// Go over 8 columns of B at a time.
#pragma omp for
for (Index B0_colidx = 0; B0_colidx < B_cols; B0_colidx += 8) {
const Register *B0_col = reinterpret_cast<const Register*>(B) + B0_colidx * simd_width;
// Process one row of A at a time. Doesn't seem to be faster to do multiple rows of A at once.
for (Index A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) {
// Iterate over shared (inner) dimension.
const Register *A_live = reinterpret_cast<const Register *>(A + A_rowidx * width);
const Register *A_end = A_live + simd_width;
const Register *B_live = B0_col;
// TODO: separate first step.
Register sum0 = zeros, sum1 = zeros, sum2 = zeros, sum3 = zeros, sum4 = zeros, sum5 = zeros, sum6 = zeros, sum7 = zeros;
for (; A_live != A_end; ++A_live, B_live += 8) {
Register a = *A_live;
// Retrieve the conveniently consecutive values of B.
Register b0 = *B_live;
Register b1 = *(B_live + 1);
Register b2 = *(B_live + 2);
Register b3 = *(B_live + 3);
Register b4 = *(B_live + 4);
Register b5 = *(B_live + 5);
Register b6 = *(B_live + 6);
Register b7 = *(B_live + 7);
// Get a mask where a is negative.
__mmask64 neg_mask = _mm512_test_epi8_mask(a, _mm512_set1_epi8(-128));
Register a_positive = _mm512_abs_epi8(a);
// Negate by subtracting from zero with a mask.
b0 = _mm512_mask_sub_epi8(b0, neg_mask, zeros, b0);
b1 = _mm512_mask_sub_epi8(b1, neg_mask, zeros, b1);
b2 = _mm512_mask_sub_epi8(b2, neg_mask, zeros, b2);
b3 = _mm512_mask_sub_epi8(b3, neg_mask, zeros, b3);
b4 = _mm512_mask_sub_epi8(b4, neg_mask, zeros, b4);
b5 = _mm512_mask_sub_epi8(b5, neg_mask, zeros, b5);
b6 = _mm512_mask_sub_epi8(b6, neg_mask, zeros, b6);
b7 = _mm512_mask_sub_epi8(b7, neg_mask, zeros, b7);
VNNI8(sum0, a_positive, b0);
VNNI8(sum1, a_positive, b1);
VNNI8(sum2, a_positive, b2);
VNNI8(sum3, a_positive, b3);
VNNI8(sum4, a_positive, b4);
VNNI8(sum5, a_positive, b5);
VNNI8(sum6, a_positive, b6);
VNNI8(sum7, a_positive, b7);
}
Register pack0123 = Pack0123(sum0, sum1, sum2, sum3);
Register pack4567 = Pack0123(sum4, sum5, sum6, sum7);
auto total = PermuteSummer(pack0123, pack4567);
callback_impl.Run(total, callbacks::OutputBufferInfo(A_rowidx, B0_colidx, A_rows, B_cols));
}
}
}
template <typename Callback>
INTGEMM_AVX512VNNI static void Multiply8Shift(const uint8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) {
assert(width % sizeof(Register) == 0);
assert(B_cols % 8 == 0);
assert(reinterpret_cast<uintptr_t>(A) % sizeof(Register) == 0);
assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0);
auto callback_impl = callbacks::CallbackImpl<CPUType::AVX2, Callback>(callback);
const Index simd_width = width / sizeof(Register);
Register zeros = setzero_si<Register>();
// Go over 8 columns of B at a time.
#pragma omp for
for (Index B0_colidx = 0; B0_colidx < B_cols; B0_colidx += 8) {
const Register *B0_col = reinterpret_cast<const Register*>(B) + B0_colidx * simd_width;
// Process one row of A at a time. Doesn't seem to be faster to do multiple rows of A at once.
for (Index A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) {
// Iterate over shared (inner) dimension.
const Register *A_live = reinterpret_cast<const Register *>(A + A_rowidx * width);
const Register *A_end = A_live + simd_width;
const Register *B_live = B0_col;
// TODO: separate first step.
Register sum0 = zeros, sum1 = zeros, sum2 = zeros, sum3 = zeros, sum4 = zeros, sum5 = zeros, sum6 = zeros, sum7 = zeros;
for (; A_live != A_end; ++A_live, B_live += 8) {
Register a = *A_live;
// Multiply-add
VNNI8(sum0, a, *B_live);
VNNI8(sum1, a, *(B_live + 1));
VNNI8(sum2, a, *(B_live + 2));
VNNI8(sum3, a, *(B_live + 3));
VNNI8(sum4, a, *(B_live + 4));
VNNI8(sum5, a, *(B_live + 5));
VNNI8(sum6, a, *(B_live + 6));
VNNI8(sum7, a, *(B_live + 7));
}
Register pack0123 = Pack0123(sum0, sum1, sum2, sum3);
Register pack4567 = Pack0123(sum4, sum5, sum6, sum7);
auto total = PermuteSummer(pack0123, pack4567);
callback_impl.Run(total, callbacks::OutputBufferInfo(A_rowidx, B0_colidx, A_rows, B_cols));
}
}
}
template <typename Callback>
INTGEMM_AVX512VNNI static void PrepareBias(const int8_t *B, Index width, Index B_cols, Callback callback) {
assert(width % sizeof(Register) == 0);
assert(B_cols % 8 == 0);
assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0);
auto callback_impl = callbacks::CallbackImpl<CPUType::AVX2, Callback>(callback);
Index simd_width = width / sizeof(Register);
Register zeros = setzero_si<Register>();
const Register a = set1_epi8<Register>(1);
// Go over 8 columns of B at a time.
#pragma omp for
for (Index B0_colidx = 0; B0_colidx < B_cols; B0_colidx += 8) {
const Register *B0_col = reinterpret_cast<const Register*>(B) + B0_colidx * simd_width;
const Register *B_live = B0_col; // Kept so this code mirrors the functions above as closely as possible
const Register *B_end = B_live + simd_width*8;
// TODO: separate first step.
Register sum0 = zeros, sum1 = zeros, sum2 = zeros, sum3 = zeros, sum4 = zeros, sum5 = zeros, sum6 = zeros, sum7 = zeros;
for (; B_live != B_end; B_live += 8) {
// Retrieve the conveniently consecutive values of B.
VNNI8(sum0, a, *B_live);
VNNI8(sum1, a, *(B_live + 1));
VNNI8(sum2, a, *(B_live + 2));
VNNI8(sum3, a, *(B_live + 3));
VNNI8(sum4, a, *(B_live + 4));
VNNI8(sum5, a, *(B_live + 5));
VNNI8(sum6, a, *(B_live + 6));
VNNI8(sum7, a, *(B_live + 7));
}
Register pack0123 = Pack0123(sum0, sum1, sum2, sum3);
Register pack4567 = Pack0123(sum4, sum5, sum6, sum7);
auto total = PermuteSummer(pack0123, pack4567);
callback_impl.Run(total, callbacks::OutputBufferInfo(0, B0_colidx, 1, B_cols));
}
}
constexpr static const char *const kName = "8-bit AVX512VNNI";
static const CPUType kUses = CPUType::AVX512VNNI;
};
} // namespace AVX512VNNI
} // namespace intgemm
#endif
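For reference, one 32-bit lane of the vpdpbusds instruction wrapped by VNNI8 above behaves roughly like the sketch below: four unsigned-by-signed byte products are summed into the accumulator, which is why Multiply8Shift can feed it the shifted unsigned A directly. Illustrative only, hypothetical name; the real instruction also saturates the accumulation, which this sketch omits.

#include <cstdint>

int32_t Vnni8Lane(int32_t c, const uint8_t a[4], const int8_t b[4]) {
  for (int i = 0; i < 4; ++i) {
    c += static_cast<int32_t>(a[i]) * static_cast<int32_t>(b[i]);
  }
  return c;
}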

28
third_party/intgemm/intgemm/callbacks.h vendored
View file

@@ -1,28 +0,0 @@
#pragma once
#include "callbacks/configs.h"
#include "callbacks/output_buffer_info.h"
#include "intgemm/intgemm_config.h"
#include "intrinsics.h"
#include "kernels.h"
#include "types.h"
#include "utils.h"
#include "vec_traits.h"
#define CALLBACKS_THIS_IS_SSE2
#include "callbacks/implementations.inl"
#undef CALLBACKS_THIS_IS_SSE2
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
#define CALLBACKS_THIS_IS_AVX2
#include "callbacks/implementations.inl"
#undef CALLBACKS_THIS_IS_AVX2
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
#define CALLBACKS_THIS_IS_AVX512BW
#include "callbacks/implementations.inl"
#undef CALLBACKS_THIS_IS_AVX512BW
#endif

View file

@@ -1,73 +0,0 @@
#pragma once
#include <tuple>
namespace intgemm {
namespace callbacks {
/*
* Sequence meta-config
*/
template <typename... Configs>
std::tuple<Configs...> Sequence(const Configs&... configs) {
return std::make_tuple(configs...);
}
/*
* Configs
*/
struct Dummy {
};
template <typename Type>
struct Write {
Type* output_addr;
Write(Type* output_addr) : output_addr(output_addr) {}
};
struct Unquantize {
float unquant_mult;
Unquantize(float unquant_mult) : unquant_mult(unquant_mult) {}
};
struct UnquantizeAndWrite {
float unquant_mult;
float* output_addr;
UnquantizeAndWrite(float unquant_mult, float* output_addr) : unquant_mult(unquant_mult), output_addr(output_addr) {}
};
struct UnquantizeAndWriteRelu {
float unquant_mult;
float* output_addr;
UnquantizeAndWriteRelu(float unquant_mult, float* output_addr) : unquant_mult(unquant_mult), output_addr(output_addr) {}
};
struct AddBiasAndWrite {
const int* bias_addr;
int* output_addr;
AddBiasAndWrite(const int* bias_addr, int* output_addr) : bias_addr(bias_addr), output_addr(output_addr) {}
};
struct UnquantizeAndAddBiasAndWrite {
float unquant_mult;
const float* bias_addr;
float* output_addr;
UnquantizeAndAddBiasAndWrite(float unquant_mult, const float* bias_addr, float* output_addr) : unquant_mult(unquant_mult), bias_addr(bias_addr), output_addr(output_addr) {}
};
struct UnquantizeAndAddBiasAndWriteRelu {
float unquant_mult;
const float* bias_addr;
float* output_addr;
UnquantizeAndAddBiasAndWriteRelu(float unquant_mult, const float* bias_addr, float* output_addr) : unquant_mult(unquant_mult), bias_addr(bias_addr), output_addr(output_addr) {}
};
}
}
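Usage sketch for the configs above (illustrative only; the wrapper function is hypothetical): they are plain parameter holders, Multiply instantiates the matching CallbackImpl for the active architecture, and Sequence can chain several of them into a pipeline, as in the callbacks::UnquantizeAndAddBiasAndWrite calls in the benchmarks earlier in this diff.

// Build a callback config the way the benchmarks do; it would be passed as
// the final argument of an Int8::Multiply / Multiply8Shift call.
void MakeCallbackConfig(float unquant_mult, const float* bias, float* output) {
  auto write_cb = intgemm::callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult, bias, output);
  (void)write_cb;
}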

View file

@@ -1,258 +0,0 @@
/* This file is included multiple times, once per architecture. */
#if defined(CALLBACKS_THIS_IS_SSE2)
#define CPU_NAME SSE2
#define INTGEMM_TARGET INTGEMM_SSE2
#elif defined(CALLBACKS_THIS_IS_AVX2)
#define CPU_NAME AVX2
#define INTGEMM_TARGET INTGEMM_AVX2
#elif defined(CALLBACKS_THIS_IS_AVX512BW)
#define CPU_NAME AVX512BW
#define INTGEMM_TARGET INTGEMM_AVX512BW
#else
#error "Only SSE2, AVX2 and AVX512BW are supported"
#endif
#if defined(CALLBACKS_THIS_IS_SSE2)
#define vi vector_t<CPUType::SSE2, int>
#define vf vector_t<CPUType::SSE2, float>
#define vd vector_t<CPUType::SSE2, double>
#else
#define vi vector_t<CPUType::AVX2, int>
#define vf vector_t<CPUType::AVX2, float>
#define vd vector_t<CPUType::AVX2, double>
#endif
/* Intel compiler 19.1.0.166 20191121 fails to link constructors with target attributes */
#ifdef __INTEL_COMPILER
#define INTGEMM_TARGET_CONSTRUCTOR
#else
#define INTGEMM_TARGET_CONSTRUCTOR INTGEMM_TARGET
#endif
namespace intgemm {
namespace callbacks {
template <CPUType CpuType, typename CallbackConfig>
class CallbackImpl;
}}
/*
* Callbacks implementations....
*/
namespace intgemm {
namespace callbacks {
/*
* Sequence
*/
template <typename... Configs>
class CallbackImpl<CPUType::CPU_NAME, std::tuple<Configs...>> {
public:
explicit CallbackImpl(const std::tuple<Configs...>& configs) : callbacks(init_callbacks(configs, make_sequence<sizeof...(Configs)>())) {}
INTGEMM_TARGET void Run(vi input, const OutputBufferInfo& info) {
run_callbacks(input, info, callbacks, make_sequence<sizeof...(Configs)>());
}
private:
using CallbacksTupleType = std::tuple<CallbackImpl<CPUType::CPU_NAME, Configs>...>;
CallbacksTupleType callbacks;
template <unsigned... Indices>
CallbacksTupleType init_callbacks(const std::tuple<Configs...>& configs, sequence<Indices...>) {
return std::make_tuple(CallbackImpl<CPUType::CPU_NAME, typename std::tuple_element<Indices, std::tuple<Configs...>>::type>(std::get<Indices>(configs))...);
}
#define RUN_CALLBACKS_PIPELINE_IMPL(vtype) \
template <unsigned FirstIndex> \
INTGEMM_TARGET static inline void run_callbacks(vtype input, const OutputBufferInfo& info, CallbacksTupleType& tuple, sequence<FirstIndex>) { \
std::get<FirstIndex>(tuple)(input, info); \
} \
template <unsigned FirstIndex, unsigned SecondIndex, unsigned... RestIndices> \
INTGEMM_TARGET static inline void run_callbacks(vtype input, const OutputBufferInfo& info, CallbacksTupleType& tuple, sequence<FirstIndex, SecondIndex, RestIndices...>) { \
auto output = std::get<FirstIndex>(tuple)(input, info); \
run_callbacks(output, info, tuple, sequence<SecondIndex, RestIndices...>()); \
}
RUN_CALLBACKS_PIPELINE_IMPL(vi)
RUN_CALLBACKS_PIPELINE_IMPL(vf)
RUN_CALLBACKS_PIPELINE_IMPL(vd)
#undef RUN_CALLBACKS_PIPELINE_IMPL
};
/*
* Dummy
*/
template <> class CallbackImpl<CPUType::CPU_NAME, Dummy> {
public:
explicit INTGEMM_TARGET_CONSTRUCTOR CallbackImpl(const Dummy&) {}
INTGEMM_TARGET void Run(vi, const OutputBufferInfo&) {}
};
/*
* Write
*/
template <typename Type>
class CallbackImpl<CPUType::CPU_NAME, Write<Type>> {
public:
explicit INTGEMM_TARGET_CONSTRUCTOR CallbackImpl(const Write<Type>& config) : config(config) {}
INTGEMM_TARGET void Run(vector_t<CPUType::CPU_NAME, Type> input, const OutputBufferInfo& info) {
kernels::write(input, config.output_addr, info.row_idx * info.cols + info.col_idx);
}
private:
Write<Type> config;
};
/*
* Unquantize
*/
template <> class CallbackImpl<CPUType::CPU_NAME, Unquantize> {
public:
explicit INTGEMM_TARGET_CONSTRUCTOR CallbackImpl(const Unquantize& config) : config(config) {
unquant_mult = set1_ps<vf>(config.unquant_mult);
}
INTGEMM_TARGET vf Run(vi input, const OutputBufferInfo&) {
return kernels::unquantize(input, unquant_mult);
}
private:
vf unquant_mult;
Unquantize config;
};
/*
* UnquantizeAndWrite
*/
template <> class CallbackImpl<CPUType::CPU_NAME, UnquantizeAndWrite> {
public:
explicit INTGEMM_TARGET_CONSTRUCTOR CallbackImpl(const UnquantizeAndWrite& config) : config(config) {
unquant_mult = set1_ps<vf>(config.unquant_mult);
}
INTGEMM_TARGET void Run(vi input, const OutputBufferInfo& info) {
// Work around a gcc 5 internal compiler error when reading register members in debug builds.
vf mult_reg;
#if !defined(__OPTIMIZE__) && (__GNUC__ == 5) && !defined(__clang__) && !defined(__INTEL_COMPILER)
asm ("vmovdqa %1, %0" : "=x" (mult_reg) : "m" (unquant_mult));
#else
mult_reg = unquant_mult;
#endif
auto result = kernels::unquantize(input, mult_reg);
kernels::write(result, config.output_addr, info.row_idx * info.cols + info.col_idx);
}
private:
vf unquant_mult;
UnquantizeAndWrite config;
};
/*
* UnquantizeAndWriteRelu
*/
template <> class CallbackImpl<CPUType::CPU_NAME, UnquantizeAndWriteRelu> {
public:
explicit INTGEMM_TARGET_CONSTRUCTOR CallbackImpl(const UnquantizeAndWriteRelu& config) : config(config) {
unquant_mult = set1_ps<vf>(config.unquant_mult);
}
INTGEMM_TARGET void Run(vi input, const OutputBufferInfo& info) {
// Work around a gcc 5 internal compiler error when reading register members in debug builds.
vf mult_reg;
#if !defined(__OPTIMIZE__) && (__GNUC__ == 5) && !defined(__clang__) && !defined(__INTEL_COMPILER)
asm ("vmovdqa %1, %0" : "=x" (mult_reg) : "m" (unquant_mult));
#else
mult_reg = unquant_mult;
#endif
auto result = kernels::relu<float>(kernels::unquantize(input, mult_reg));
kernels::write(result, config.output_addr, info.row_idx * info.cols + info.col_idx);
}
private:
vf unquant_mult;
UnquantizeAndWriteRelu config;
};
/*
* AddBiasAndWrite
*/
template <> class CallbackImpl<CPUType::CPU_NAME, AddBiasAndWrite> {
public:
explicit INTGEMM_TARGET_CONSTRUCTOR CallbackImpl(const AddBiasAndWrite& config) : config(config) {}
INTGEMM_TARGET void Run(vi input, const OutputBufferInfo& info) {
auto result = kernels::add_bias(input, config.bias_addr, info.col_idx);
kernels::write(result, config.output_addr, info.row_idx * info.cols + info.col_idx);
}
private:
AddBiasAndWrite config;
};
/*
* UnquantizeAndAddBiasAndWrite
*/
template <> class CallbackImpl<CPUType::CPU_NAME, UnquantizeAndAddBiasAndWrite> {
public:
explicit INTGEMM_TARGET_CONSTRUCTOR CallbackImpl(const UnquantizeAndAddBiasAndWrite& config) : config(config) {
unquant_mult = set1_ps<vf>(config.unquant_mult);
}
INTGEMM_TARGET void Run(vi input, const OutputBufferInfo& info) {
// Work around a gcc 5 internal compiler error when reading register members in debug builds.
vf mult_reg;
#if !defined(__OPTIMIZE__) && (__GNUC__ == 5) && !defined(__clang__) && !defined(__INTEL_COMPILER)
asm ("vmovdqa %1, %0" : "=x" (mult_reg) : "m" (unquant_mult));
#else
mult_reg = unquant_mult;
#endif
auto result = kernels::unquantize(input, mult_reg);
result = kernels::add_bias(result, config.bias_addr, info.col_idx);
kernels::write(result, config.output_addr, info.row_idx * info.cols + info.col_idx);
}
private:
vf unquant_mult;
UnquantizeAndAddBiasAndWrite config;
};
/*
* UnquantizeAndAddBiasAndWriteRelu
*/
template <> class CallbackImpl<CPUType::CPU_NAME, UnquantizeAndAddBiasAndWriteRelu> {
public:
explicit INTGEMM_TARGET_CONSTRUCTOR CallbackImpl(const UnquantizeAndAddBiasAndWriteRelu& config) : config(config) {
unquant_mult = set1_ps<vf>(config.unquant_mult);
}
INTGEMM_TARGET void Run(vi input, const OutputBufferInfo& info) {
// Work around a gcc 5 internal compiler error when reading register members in debug builds.
vf mult_reg;
#if !defined(__OPTIMIZE__) && (__GNUC__ == 5) && !defined(__clang__) && !defined(__INTEL_COMPILER)
asm ("vmovdqa %1, %0" : "=x" (mult_reg) : "m" (unquant_mult));
#else
mult_reg = unquant_mult;
#endif
auto result = kernels::unquantize(input, mult_reg);
result = kernels::add_bias(result, config.bias_addr, info.col_idx);
result = kernels::relu<float>(result);
kernels::write(result, config.output_addr, info.row_idx * info.cols + info.col_idx);
}
private:
vf unquant_mult;
UnquantizeAndAddBiasAndWriteRelu config;
};
}
}
#undef CPU_NAME
#undef INTGEMM_TARGET
#undef vi
#undef vf
#undef vd

View file

@@ -1,20 +0,0 @@
#pragma once
#include "../types.h"
namespace intgemm {
namespace callbacks {
struct OutputBufferInfo {
Index row_idx;
Index col_idx;
Index rows; // = A_rows
Index cols; // = B_cols
OutputBufferInfo(Index row_idx, Index col_idx, Index rows, Index cols)
: row_idx(row_idx), col_idx(col_idx), rows(rows), cols(cols) {}
};
}
}

317
third_party/intgemm/intgemm/interleave.h vendored
View file

@@ -1,317 +0,0 @@
#pragma once
#include "intgemm/intgemm_config.h"
#include "intrinsics.h"
#include "types.h"
#include <algorithm>
#include <cassert>
namespace intgemm {
/*
* Interleave vectors.
*/
#define INTGEMM_INTERLEAVE_N(target, type, N) \
target static inline void Interleave##N(type &first, type &second) { \
type temp = unpacklo_epi##N(first, second); \
second = unpackhi_epi##N(first, second); \
first = temp; \
}
#define INTGEMM_INTERLEAVE(target, type) \
INTGEMM_INTERLEAVE_N(target, type, 8) \
INTGEMM_INTERLEAVE_N(target, type, 16) \
INTGEMM_INTERLEAVE_N(target, type, 32) \
INTGEMM_INTERLEAVE_N(target, type, 64)
INTGEMM_INTERLEAVE(INTGEMM_SSE2, __m128i)
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
INTGEMM_INTERLEAVE(INTGEMM_AVX2, __m256i)
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
INTGEMM_INTERLEAVE(INTGEMM_AVX512BW, __m512i)
#endif
/*
* Swap vectors.
*/
#define INTGEMM_SWAP(target, Register) \
target static inline void Swap(Register &a, Register &b) { \
Register tmp = a; \
a = b; \
b = tmp; \
}
INTGEMM_SWAP(INTGEMM_SSE2, __m128i)
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
INTGEMM_SWAP(INTGEMM_AVX2, __m256i)
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_SWAP(INTGEMM_AVX512BW, __m512i)
#endif
/* Transpose registers containing 8 packed 16-bit integers.
* Each 128-bit lane is handled independently.
*/
#define INTGEMM_TRANSPOSE16(target, Register) \
target static inline void Transpose16InLane(Register &r0, Register &r1, Register &r2, Register &r3, Register &r4, Register &r5, Register &r6, Register &r7) { \
/* r0: columns 0 1 2 3 4 5 6 7 from row 0
r1: columns 0 1 2 3 4 5 6 7 from row 1*/ \
Interleave16(r0, r1); \
Interleave16(r2, r3); \
Interleave16(r4, r5); \
Interleave16(r6, r7); \
/* r0: columns 0 0 1 1 2 2 3 3 from rows 0 and 1
r1: columns 4 4 5 5 6 6 7 7 from rows 0 and 1
r2: columns 0 0 1 1 2 2 3 3 from rows 2 and 3
r3: columns 4 4 5 5 6 6 7 7 from rows 2 and 3
r4: columns 0 0 1 1 2 2 3 3 from rows 4 and 5
r5: columns 4 4 5 5 6 6 7 7 from rows 4 and 5
r6: columns 0 0 1 1 2 2 3 3 from rows 6 and 7
r7: columns 4 4 5 5 6 6 7 7 from rows 6 and 7*/ \
Interleave32(r0, r2); \
Interleave32(r1, r3); \
Interleave32(r4, r6); \
Interleave32(r5, r7); \
/* r0: columns 0 0 0 0 1 1 1 1 from rows 0, 1, 2, and 3
r1: columns 4 4 4 4 5 5 5 5 from rows 0, 1, 2, and 3
r2: columns 2 2 2 2 3 3 3 3 from rows 0, 1, 2, and 3
r3: columns 6 6 6 6 7 7 7 7 from rows 0, 1, 2, and 3
r4: columns 0 0 0 0 1 1 1 1 from rows 4, 5, 6, and 7
r5: columns 4 4 4 4 5 5 5 5 from rows 4, 5, 6, and 7
r6: columns 2 2 2 2 3 3 3 3 from rows 4, 5, 6, and 7
r7: columns 6 6 6 6 7 7 7 7 from rows 4, 5, 6, and 7*/ \
Interleave64(r0, r4); \
Interleave64(r1, r5); \
Interleave64(r2, r6); \
Interleave64(r3, r7); \
/* r0: columns 0 0 0 0 0 0 0 0 from rows 0 through 7
r1: columns 4 4 4 4 4 4 4 4 from rows 0 through 7
r2: columns 2 2 2 2 2 2 2 2 from rows 0 through 7
r3: columns 6 6 6 6 6 6 6 6 from rows 0 through 7
r4: columns 1 1 1 1 1 1 1 1 from rows 0 through 7
r5: columns 5 5 5 5 5 5 5 5 from rows 0 through 7*/ \
/* Empirically gcc is able to remove these movs and just rename the outputs of Interleave64. */ \
Swap(r1, r4); \
Swap(r3, r6); \
}
INTGEMM_TRANSPOSE16(INTGEMM_SSE2, __m128i)
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
INTGEMM_TRANSPOSE16(INTGEMM_AVX2, __m256i)
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_TRANSPOSE16(INTGEMM_AVX512BW, __m512i)
#endif
/* Transpose registers containing 16 packed 8-bit integers.
* Each 128-bit lane is handled independently.
*/
template <class Register> static inline void Transpose8InLane(
Register &r0, Register &r1, Register &r2, Register &r3, Register &r4, Register &r5, Register &r6, Register &r7,
Register &r8, Register &r9, Register &r10, Register &r11, Register &r12, Register &r13, Register &r14, Register &r15) {
// Get 8-bit values to 16-bit values so they can travel together.
Interleave8(r0, r1);
// r0: columns 0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 from rows 0 and 1.
// r1: columns 8 8 9 9 10 10 11 11 12 12 13 13 14 14 15 15 from rows 0 and 1.
Interleave8(r2, r3);
// r2: columns 0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 from rows 2 and 3.
Interleave8(r4, r5);
Interleave8(r6, r7);
Interleave8(r8, r9);
Interleave8(r10, r11);
Interleave8(r12, r13);
Interleave8(r14, r15);
Transpose16InLane(r0, r2, r4, r6, r8, r10, r12, r14);
Transpose16InLane(r1, r3, r5, r7, r9, r11, r13, r15);
// Permute into correct order. This is free because the outputs just get permuted.
Register tmp;
tmp = r2;
r2 = r4;
r4 = r8;
r8 = r1;
r1 = tmp;
tmp = r3;
r3 = r6;
r6 = r12;
r12 = r9;
r9 = tmp;
tmp = r5;
r5 = r10;
r10 = tmp;
tmp = r7;
r7 = r14;
r14 = r13;
r13 = r11;
r11 = tmp;
}
// PREPARE B: quantize and rearrange. B is presumed to be constant parameters,
// so we can take our time rearranging it in order to save time during the multiply.
//
// We presume B starts in row-major order.
//
// In INTGEMM_AVX2, a register holds 32 8-bit values or 16 16-bit values and we want
// that many values from the same column in the register.
//
// The multiplier reads 8 rows at a time and we want these reads to be
// contiguous.
//
// Each 8x32 (for 8-bit) or 8x16 (for 16-bit) tile of B is transposed.
// The tiles are stored in column major order.
//
// For INTGEMM_AVX2, this matrix shows what index each value of B will be stored at:
// 0 16 ... 240
// 1 17 ... 241
// 2 18 ... 242
// 3 19 ... 243
// 4 20 ... 244
// 5 21 ... 245
// 6 22 ... 246
// 7 23 ... 247
// 8 24 ... 248
// 9 25 ... 249
// 10 26 ... 250
// 11 27 ... 251
// 12 28 ... 252
// 13 29 ... 253
// 14 30 ... 254
// 15 31 ... 255
// 256 272
// 257 273
// ... ...
#define INTGEMM_PREPARE_B_8(target, QuantClass) \
target static inline void PrepareB(const float *input, int8_t *output_shadow, float quant_mult, Index rows, Index cols) { \
FRegister q = set1_ps<FRegister>(quant_mult); \
/* Currently all multipliers have a stride of 8 columns.*/ \
const Index kColStride = 8; \
assert(cols % kColStride == 0); \
assert(rows % sizeof(Register) == 0); \
assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \
Register *output = reinterpret_cast<Register*>(output_shadow); \
assert(reinterpret_cast<uintptr_t>(output) % sizeof(Register) == 0); \
for (Index c = 0; c < cols; c += kColStride) { \
for (Index r = 0; r < rows; r += sizeof(Register), output += 8) { \
/* Quantize and perform a transpose with height sizeof(Register) and width 8. \
This isn't quite Transpose8InLane because it's half the number of columns, \
so each register starts with two rows instead of being one row. \
The quantizers know to skip a row.*/ \
output[0] = QuantClass::ForReshape(q, input + cols * (r ) + c, cols); \
output[1] = QuantClass::ForReshape(q, input + cols * (r + 1) + c, cols); \
output[2] = QuantClass::ForReshape(q, input + cols * (r + 4) + c, cols); \
output[3] = QuantClass::ForReshape(q, input + cols * (r + 5) + c, cols); \
output[4] = QuantClass::ForReshape(q, input + cols * (r + 8) + c, cols); \
output[5] = QuantClass::ForReshape(q, input + cols * (r + 9) + c, cols); \
output[6] = QuantClass::ForReshape(q, input + cols * (r + 12) + c, cols); \
output[7] = QuantClass::ForReshape(q, input + cols * (r + 13) + c, cols); \
Interleave8(output[0], output[1]); \
Interleave8(output[2], output[3]); \
Interleave8(output[4], output[5]); \
Interleave8(output[6], output[7]); \
Transpose16InLane(output[0], output[1], output[2], output[3], output[4], output[5], output[6], output[7]); \
} \
} \
}
#define INTGEMM_PREPARE_B_16(target, QuantClass) \
target static inline void PrepareB(const float *input, int16_t *output_shadow, float quant_mult, Index rows, Index cols) { \
FRegister q = set1_ps<FRegister>(quant_mult); \
assert(cols % 8 == 0); \
assert(rows % (sizeof(Register) / sizeof(int16_t)) == 0); \
assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \
Register *output = reinterpret_cast<Register*>(output_shadow); \
assert(reinterpret_cast<uintptr_t>(output) % sizeof(Register) == 0); \
for (Index c = 0; c < cols; c += 8) { \
for (Index r = 0; r < rows; r += (sizeof(Register) / sizeof(int16_t)), output += 8) { \
/* gcc unrolls this loop and uses registers for output[k]*/ \
for (Index k = 0; k < 8; ++k) { \
output[k] = QuantClass::ForReshape(q, input + cols * (r + k) + c, cols); \
} \
Transpose16InLane(output[0], output[1], output[2], output[3], output[4], output[5], output[6], output[7]); \
} \
} \
}
/*
* Prepare B matrix.
* B matrix has to be transposed and quantized.
* Cols has to be a multiple of sizeof(Register) / sizeof(Integer).
*
* cols and rows describe size of transposed B.
*/
#define INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(target, Integer) \
target static inline void PrepareBQuantizedTransposed(const Integer* input, Integer* output, Index cols, Index rows) { \
const Index RegisterElems = sizeof(Register) / sizeof(Integer); \
const Index kColStride = 8; \
\
assert(cols % RegisterElems == 0); \
assert(rows % kColStride == 0); \
assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \
assert(reinterpret_cast<uintptr_t>(output) % sizeof(Register) == 0); \
\
Register* output_it = reinterpret_cast<Register*>(output); \
for (Index r = 0; r < rows; r += kColStride) \
for (Index c = 0; c < cols; c += RegisterElems) \
for (Index ri = 0; ri < 8; ++ri) \
*output_it++ = *reinterpret_cast<const Register*>(input + (r + ri) * cols + c); \
}
/*
* Prepare B matrix.
* B matrix has to be transposed.
* Cols has to be a multiple of sizeof(Register) / sizeof(float).
*
* cols and rows describe size of transposed B.
*/
#define INTGEMM_PREPARE_B_TRANSPOSED(target, Quantizer, Integer) \
target static inline void PrepareBTransposed(const float* input, Integer* output, float quant_mult, Index cols, Index rows) { \
const Index RegisterElemsInt = sizeof(Register) / sizeof(Integer); \
const Index kColStride = 8; \
\
assert(cols % (sizeof(Register) / sizeof(float)) == 0); \
assert(rows % kColStride == 0); \
assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \
assert(reinterpret_cast<uintptr_t>(output) % sizeof(Register) == 0); \
\
FRegister q = set1_ps<FRegister>(quant_mult); \
Register* output_it = reinterpret_cast<Register*>(output); \
Index r = 0; \
Index c = 0; \
while (r < rows) { \
for (Index ri = 0; ri < 8; ++ri) \
*output_it++ = Quantizer::ConsecutiveWithWrapping(q, input + (r + ri) * cols + c, cols - c, cols, 8); \
c += RegisterElemsInt; \
while (c >= cols) { \
r += kColStride; \
c -= cols; \
} \
} \
}
/* Select columns of B from PrepareB format to PrepareB format.
*/
#define INTGEMM_SELECT_COL_B(target, Register) \
target static inline void SelectColumnsOfB(const Register *input, Register *output, Index rows_bytes /* number of bytes in a row */, const Index *cols_begin, const Index *cols_end) { \
assert(rows_bytes % sizeof(Register) == 0); \
assert((cols_end - cols_begin) % 8 == 0); \
/* Do columns for multiples of 8.*/ \
Index register_rows = rows_bytes / sizeof(Register); \
const Register *starts[8]; \
for (; cols_begin != cols_end; cols_begin += 8) { \
for (Index k = 0; k < 8; ++k) { \
starts[k] = input + (cols_begin[k] & 7) + (cols_begin[k] & ~7) * register_rows; \
} \
for (Index r = 0; r < register_rows; ++r) { \
for (Index k = 0; k < 8; ++k) { \
*(output++) = *starts[k]; \
starts[k] += 8; \
} \
} \
} \
}
} // namespace intgemm

207
third_party/intgemm/intgemm/intgemm.cc vendored
View file

@@ -1,207 +0,0 @@
#if defined(WASM)
// No header for CPUID since it's hard-coded.
#elif defined(__INTEL_COMPILER)
#include <immintrin.h>
#elif defined(_MSC_VER)
#include <intrin.h>
#else
// Assume GCC and clang style.
#include <cpuid.h>
#endif
#include "intgemm.h"
#include "stats.h"
#include <stdio.h>
#include <stdlib.h>
namespace intgemm {
namespace {
// Return the maximum CPU model that's found and supported at compile time.
CPUType RealCPUID() {
#if defined(WASM)
// emscripten does SSE4.1 but we only use up to SSSE3.
return CPUType::SSSE3;
#elif defined(__INTEL_COMPILER)
# ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
if (_may_i_use_cpu_feature(_FEATURE_AVX512_VNNI)) return CPUType::AVX512VNNI;
# endif
# ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
if (_may_i_use_cpu_feature(_FEATURE_AVX512BW)) return CPUType::AVX512BW;
# endif
# ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
if (_may_i_use_cpu_feature(_FEATURE_AVX2)) return CPUType::AVX2;
# endif
if (_may_i_use_cpu_feature(_FEATURE_SSSE3)) return CPUType::SSSE3;
if (_may_i_use_cpu_feature(_FEATURE_SSE2)) return CPUType::SSE2;
return CPUType::UNSUPPORTED;
#else
// Not emscripten, not Intel compiler
# if defined(_MSC_VER)
int regs[4];
int &eax = regs[0], &ebx = regs[1], &ecx = regs[2], &edx = regs[3];
__cpuid(regs, 0);
int m = eax;
# else
/* gcc and clang.
* If intgemm is compiled by gcc 6.4.1 then dlopened into an executable
* compiled by gcc 7.3.0, there will be an undefined symbol __cpu_info.
* Work around this by calling the intrinsics more directly instead of
* __builtin_cpu_supports.
*
* clang 6.0.0-1ubuntu2 supports vnni but doesn't have
* __builtin_cpu_supports("avx512vnni")
* so use the hand-coded CPUID for clang.
*/
unsigned int m = __get_cpuid_max(0, 0);
unsigned int eax, ebx, ecx, edx;
# endif
if (m >= 7) {
# if defined(_MSC_VER)
__cpuid(regs, 7);
# else
__cpuid_count(7, 0, eax, ebx, ecx, edx);
# endif
# ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
if (ecx & (1 << 11)) return CPUType::AVX512VNNI;
# endif
# ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
if (ebx & (1 << 30)) return CPUType::AVX512BW;
# endif
# ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
if (ebx & (1 << 5)) return CPUType::AVX2;
# endif
}
if (m >= 1) {
# if defined(_MSC_VER)
__cpuid(regs, 1);
# else
__cpuid_count(1, 0, eax, ebx, ecx, edx);
# endif
if (ecx & (1 << 9)) return CPUType::SSSE3;
if (edx & (1 << 26)) return CPUType::SSE2;
}
return CPUType::UNSUPPORTED;
#endif
}
#ifdef INTGEMM_CPUID_ENVIRONMENT
CPUType EnvironmentCPUID() {
# if defined(_MSC_VER)
char env_override[11];
size_t len = 0;
if (getenv_s(&len, env_override, sizeof(env_override), "INTGEMM_CPUID")) return CPUType::AVX512VNNI;
if (!len) return CPUType::AVX512VNNI;
# else
const char *env_override = getenv("INTGEMM_CPUID");
if (!env_override) return CPUType::AVX512VNNI; /* This will be capped to actual ID */
# endif
if (!strcmp(env_override, "AVX512VNNI")) return CPUType::AVX512VNNI;
if (!strcmp(env_override, "AVX512BW")) return CPUType::AVX512BW;
if (!strcmp(env_override, "AVX2")) return CPUType::AVX2;
if (!strcmp(env_override, "SSSE3")) return CPUType::SSSE3;
if (!strcmp(env_override, "SSE2")) return CPUType::SSE2;
fprintf(stderr, "Ignoring unrecognized INTGEMM_CPUID %s\n", env_override);
return CPUType::AVX512VNNI;
}
#endif
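// Example (illustrative): when built with INTGEMM_CPUID_ENVIRONMENT, dispatch
// can be capped from the environment, e.g. running a binary as
//   INTGEMM_CPUID=SSSE3 ./my_program
// forces the SSSE3 code path even on hardware that supports AVX2 or AVX512.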
} // namespace
CPUType GetCPUID() {
static const CPUType kLocalCPU =
#ifdef INTGEMM_CPUID_ENVIRONMENT
std::min(RealCPUID(), EnvironmentCPUID());
#else
RealCPUID();
#endif
return kLocalCPU;
}
const CPUType kCPU = GetCPUID();
void UnsupportedCPUError() {
#if (defined(_MSC_VER) && !defined(__clang__)) ? (_HAS_EXCEPTIONS) : (__EXCEPTIONS)
throw UnsupportedCPU();
#else
fprintf(stderr, "intgemm does not support this CPU.\n");
abort();
#endif
}
float Unsupported_MaxAbsolute(const float * /*begin*/, const float * /*end*/) {
UnsupportedCPUError();
return 0.0f;
}
MeanStd Unsupported_VectorMeanStd(const float * /*begin*/, const float * /*end*/, bool /*absolute*/) {
UnsupportedCPUError();
return MeanStd();
}
void (*Int16::Quantize)(const float *input, int16_t *output, float quant_mult, Index size) = ChooseCPU(AVX512BW::Kernels16::Quantize, AVX512BW::Kernels16::Quantize, AVX2::Kernels16::Quantize, SSE2::Kernels16::Quantize, SSE2::Kernels16::Quantize, Unsupported_16bit::Quantize);
void (*Int16::PrepareB)(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(AVX512BW::Kernels16::PrepareB, AVX512BW::Kernels16::PrepareB, AVX2::Kernels16::PrepareB, SSE2::Kernels16::PrepareB, SSE2::Kernels16::PrepareB, Unsupported_16bit::PrepareB);
void (*Int16::PrepareBQuantizedTransposed)(const int16_t *input, int16_t *output, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512BW::Kernels16::PrepareBQuantizedTransposed, AVX512BW::Kernels16::PrepareBQuantizedTransposed, AVX2::Kernels16::PrepareBQuantizedTransposed, SSE2::Kernels16::PrepareBQuantizedTransposed, SSE2::Kernels16::PrepareBQuantizedTransposed, Unsupported_16bit::PrepareBQuantizedTransposed);
void (*Int16::PrepareBTransposed)(const float *input, int16_t *output, float quant_mult, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512BW::Kernels16::PrepareBTransposed, AVX512BW::Kernels16::PrepareBTransposed, AVX2::Kernels16::PrepareBTransposed, SSE2::Kernels16::PrepareBTransposed, SSE2::Kernels16::PrepareBTransposed, Unsupported_16bit::PrepareBTransposed);
void (*Int16::SelectColumnsB)(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(AVX512BW::Kernels16::SelectColumnsB, AVX512BW::Kernels16::SelectColumnsB, AVX2::Kernels16::SelectColumnsB, SSE2::Kernels16::SelectColumnsB, SSE2::Kernels16::SelectColumnsB, Unsupported_16bit::SelectColumnsB);
const char *const Int16::kName = ChooseCPU(AVX512BW::Kernels16::kName, AVX512BW::Kernels16::kName, AVX2::Kernels16::kName, SSE2::Kernels16::kName, SSE2::Kernels16::kName, Unsupported_16bit::kName);
void (*Int8::Quantize)(const float *input, int8_t *output, float quant_mult, Index size) = ChooseCPU(AVX512VNNI::Kernels8::Quantize, AVX512BW::Kernels8::Quantize, AVX2::Kernels8::Quantize, SSSE3::Kernels8::Quantize, Unsupported_8bit::Quantize, Unsupported_8bit::Quantize);
void (*Int8::QuantizeU)(const float *input, uint8_t *output, float quant_mult, Index size) = ChooseCPU(AVX512VNNI::Kernels8::QuantizeU, AVX512BW::Kernels8::QuantizeU, AVX2::Kernels8::QuantizeU, SSSE3::Kernels8::QuantizeU, Unsupported_8bit::QuantizeU, Unsupported_8bit::QuantizeU);
void (*Int8::PrepareB)(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(AVX512VNNI::Kernels8::PrepareB, AVX512BW::Kernels8::PrepareB, AVX2::Kernels8::PrepareB, SSSE3::Kernels8::PrepareB, Unsupported_8bit::PrepareB, Unsupported_8bit::PrepareB);
void (*Int8::PrepareBQuantizedTransposed)(const int8_t *input, int8_t *output, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512BW::Kernels8::PrepareBQuantizedTransposed, AVX512BW::Kernels8::PrepareBQuantizedTransposed, AVX2::Kernels8::PrepareBQuantizedTransposed, SSSE3::Kernels8::PrepareBQuantizedTransposed, Unsupported_8bit::PrepareBQuantizedTransposed, Unsupported_8bit::PrepareBQuantizedTransposed);
void (*Int8::PrepareBTransposed)(const float *input, int8_t *output, float quant_mult, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512BW::Kernels8::PrepareBTransposed, AVX512BW::Kernels8::PrepareBTransposed, AVX2::Kernels8::PrepareBTransposed, SSSE3::Kernels8::PrepareBTransposed, Unsupported_8bit::PrepareBTransposed, Unsupported_8bit::PrepareBTransposed);
void (*Int8::SelectColumnsB)(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(AVX512VNNI::Kernels8::SelectColumnsB, AVX512BW::Kernels8::SelectColumnsB, AVX2::Kernels8::SelectColumnsB, SSSE3::Kernels8::SelectColumnsB, Unsupported_8bit::SelectColumnsB, Unsupported_8bit::SelectColumnsB);
const char *const Int8::kName = ChooseCPU(AVX512VNNI::Kernels8::kName, AVX512BW::Kernels8::kName, AVX2::Kernels8::kName, SSSE3::Kernels8::kName, Unsupported_8bit::kName, Unsupported_8bit::kName);
void (*Int8Shift::QuantizeU)(const float *input, uint8_t *output, float quant_mult, Index size) = ChooseCPU(AVX512VNNI::Kernels8::QuantizeU, AVX512BW::Kernels8::QuantizeU, AVX2::Kernels8::QuantizeU, SSSE3::Kernels8::QuantizeU, Unsupported_8bit::QuantizeU, Unsupported_8bit::QuantizeU);
const char *const Int8Shift::kName = ChooseCPU(AVX512VNNI::Kernels8::kName, AVX512BW::Kernels8::kName, AVX2::Kernels8::kName, SSSE3::Kernels8::kName, Unsupported_8bit::kName, Unsupported_8bit::kName);
#if !defined(INTGEMM_COMPILER_SUPPORTS_AVX2)
namespace AVX2{
using SSE2::MaxAbsolute;
using SSE2::VectorMeanStd;
} // namespace AVX2
#endif
#if !defined(INTGEMM_COMPILER_SUPPORTS_AVX512BW)
namespace AVX512BW {
using AVX2::MaxAbsolute;
using AVX2::VectorMeanStd;
} // namespace AVX512BW
#endif
float (*MaxAbsolute)(const float *begin, const float *end) = ChooseCPU(AVX512BW::MaxAbsolute, AVX512BW::MaxAbsolute, AVX2::MaxAbsolute, SSE2::MaxAbsolute, SSE2::MaxAbsolute, Unsupported_MaxAbsolute);
MeanStd (*VectorMeanStd)(const float *begin, const float *end, bool absolute) = ChooseCPU(AVX512BW::VectorMeanStd, AVX512BW::VectorMeanStd, AVX2::VectorMeanStd, SSE2::VectorMeanStd, SSE2::VectorMeanStd, Unsupported_VectorMeanStd);
constexpr const char *const Unsupported_16bit::kName;
constexpr const char *const Unsupported_8bit::kName;
constexpr const char *const SSE2::Kernels16::kName;
constexpr const char *const SSSE3::Kernels8::kName;
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
constexpr const char *const AVX2::Kernels8::kName;
constexpr const char *const AVX2::Kernels16::kName;
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
constexpr const char *const AVX512BW::Kernels8::kName;
constexpr const char *const AVX512BW::Kernels16::kName;
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
constexpr const char *const AVX512VNNI::Kernels8::kName;
#endif
}

365
third_party/intgemm/intgemm/intgemm.h vendored
View file

@@ -1,365 +0,0 @@
#pragma once
/* Main interface for integer matrix multiplication.
*
* We are computing C = A * B with an optional scaling factor.
*
* A is typically activations.
* Rows a multiple of 1 (no restriction)
* Columns a multiple of 64 for 8-bit or 32 for 16-bit.
* Use PrepareA to prepare A for multiplication. This is meant to be fast.
*
* B is typically fixed model parameters.
* Rows a multiple of 64 for 8-bit or 32 for 16-bit.
* Columns a multiple of 8
* Use PrepareB to prepare B for multiplication. This is slower, with the
* intention that it will be prepared once and remembered.
*
* C is row major.
*
* Once both A and B are prepared, call Multiply.
*
* All memory (A, B, and C in float or prepared form) must be 64-byte aligned.
* It's easy to write code that works on your CPU with lower alignment, but
* breaks on AVX512.
*
* When preparing, you provide a quantization multiplier. Values will be
* multiplied by this then rounded to an integer.
* For 16-bit neural networks, Jacob Devlin recommends 1024.0.
* For 8-bit, use 127 / largest absolute value.
*
* Note that quantization saturates. However, 16-bit does accumulation in
* 32-bit which can overflow if you use too big of a multiplier.
*
* The multiply routine expects an unquantization multiplier.
* This should be unquant_mult = 1.0 / (A_quant_mult * B_quant_mult).
* Where A_quant_mult is what you passed to PrepareA and B_quant_mult is what you
* passed to PrepareB.
*
* Feel free to multiply in a scaling factor to compute C = \lambda A * B by
* passing unquant_mult = \lambda / (A_quant_mult * B_quant_mult).
*/
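/* A minimal end-to-end sketch under the constraints above (illustrative only;
 * A, B, C are assumed to be suitably sized, 64-byte aligned float buffers,
 * A_prepared / B_prepared 64-byte aligned int8_t buffers, and alpha_A /
 * alpha_B the largest absolute values found in A and B):
 *
 *   float a_quant = 127.0f / alpha_A, b_quant = 127.0f / alpha_B;
 *   intgemm::Int8::PrepareA(A, A_prepared, a_quant, A_rows, width);
 *   intgemm::Int8::PrepareB(B, B_prepared, b_quant, width, B_cols);
 *   intgemm::Int8::Multiply(A_prepared, B_prepared, A_rows, width, B_cols,
 *       intgemm::callbacks::UnquantizeAndWrite(1.0f / (a_quant * b_quant), C));
 */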
#include <cstdint>
#include "types.h"
#include "sse2_gemm.h"
#include "ssse3_gemm.h"
#include "avx2_gemm.h"
#include "avx512_gemm.h"
#include "avx512vnni_gemm.h"
/* Dispatch to functions based on runtime CPUID. This adds one call-by-variable to each call. */
namespace intgemm {
void UnsupportedCPUError();
struct Unsupported_16bit {
static void Quantize(const float *, int16_t *, float, Index) {
UnsupportedCPUError();
}
static void PrepareB(const float *, int16_t *, float, Index, Index) {
UnsupportedCPUError();
}
static void PrepareBQuantizedTransposed(const int16_t *, int16_t *, Index, Index) {
UnsupportedCPUError();
}
static void PrepareBTransposed(const float *, int16_t *, float, Index, Index) {
UnsupportedCPUError();
}
static void SelectColumnsB(const int16_t *, int16_t *, Index, const Index *, const Index *) {
UnsupportedCPUError();
}
template <typename Callback>
static void Multiply(const int16_t *, const int16_t *, Index, Index, Index, Callback) {
UnsupportedCPUError();
}
constexpr static const char *const kName = "16-bit Unsupported";
};
struct Unsupported_8bit {
static void Quantize(const float *, int8_t *, float, Index) {
UnsupportedCPUError();
}
static void QuantizeU(const float *, uint8_t *, float, Index) {
UnsupportedCPUError();
}
static void PrepareA(const float *, int8_t *, float, Index, Index) {
UnsupportedCPUError();
}
static void PrepareBQuantizedTransposed(const int8_t *, int8_t *, Index, Index) {
UnsupportedCPUError();
}
static void PrepareBTransposed(const float *, int8_t *, float, Index, Index) {
UnsupportedCPUError();
}
static void PrepareB(const float *, int8_t *, float, Index, Index) {
UnsupportedCPUError();
}
template<class Callback>
static void PrepareBias(const int8_t *, Index, Index, Callback) {
UnsupportedCPUError();
}
static void SelectColumnsB(const int8_t *, int8_t *, Index, const Index *, const Index *) {
UnsupportedCPUError();
}
template <typename Callback>
static void Multiply(const int8_t *, const int8_t *, Index, Index, Index, Callback) {
UnsupportedCPUError();
}
template<class Callback>
static void Multiply8Shift(const uint8_t *, const int8_t *, Index, Index, Index, Callback) {
UnsupportedCPUError();
}
constexpr static const char *const kName = "8-bit Unsupported";
};
#ifndef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
// These won't ever be called in this capacity, but it does let the code below compile.
namespace AVX512VNNI {
typedef Unsupported_8bit Kernels8;
} // namespace AVX512VNNI
#endif
#ifndef INTGEMM_COMPILER_SUPPORTS_AVX512BW
namespace AVX512BW {
typedef Unsupported_8bit Kernels8;
typedef Unsupported_16bit Kernels16;
} // namespace AVX512BW
#endif
#ifndef INTGEMM_COMPILER_SUPPORTS_AVX2
namespace AVX2 {
typedef Unsupported_8bit Kernels8;
typedef Unsupported_16bit Kernels16;
} // namespace AVX2
#endif
CPUType GetCPUID();
/* Returns:
* avx512vnni if the CPU supports AVX512VNNI
*
* avx512bw if the CPU supports AVX512BW
*
* avx2 if the CPU supports AVX2
*
* ssse3 if the CPU supports SSSE3 (this distinction from SSE2 matters for 8-bit)
*
* sse2 if the CPU supports SSE2
*
* unsupported otherwise
*/
template <class T> T ChooseCPU(T avx512vnni, T avx512bw, T avx2, T ssse3, T sse2, T unsupported) {
const T ret[] = {unsupported, sse2, ssse3, avx2, avx512bw, avx512vnni};
return ret[(int)GetCPUID()];
}
struct TileInfo {
const Index a_rows;
const Index a_cols;
const Index b_rows;
const Index b_cols;
};
/*
* 8-bit matrix multiplication
*/
struct Int8 {
using Integer = int8_t;
// A's size must be a multiple of 1x64, B's size must be a multiple of 64x8.
static constexpr TileInfo tile_info{1, 64, 64, 8};
// Currently A is prepared by quantization but this could theoretically change.
// A's columns must be a multiple of 8.
// The number of rows is anything.
static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
Quantize(input, output, quant_mult, rows * cols);
}
// Multiply floats by quant_mult then convert to 8-bit integers with saturation.
static void (*Quantize)(const float *input, int8_t *output, float quant_mult, Index size);
// Multiply floats by quant_mult then convert to 8-bit integers with saturation.
// A version that adds 127 to each number, making sure that all numbers are positive
static void (*QuantizeU)(const float *input, uint8_t *output, float quant_mult, Index size);
// Warning: the output of PrepareB depends on the CPU.
// It will match the Multiply function on the same CPU though.
static void (*PrepareB)(const float *input, int8_t *output, float quant_mult, Index rows, Index cols);
// Convert from a B that was already transposed (routine not provided) and
// quantized (e.g. with Quantize) to the CPU-dependent format used for
// Multiply. This is useful for storing a quantized model on disk in a
// CPU-independent fashion.
static void (*PrepareBQuantizedTransposed)(const int8_t *input, int8_t *output, Index inner, Index B_untransposed_cols);
// Convert from a B that was already transposed (routine not provided) to
// the CPU-dependent format used for Multiply. This is useful for storing
// a quantized model on disk in a CPU-independent fashion.
static void (*PrepareBTransposed)(const float *input, int8_t *output, float quant_mul, Index inner, Index B_untransposed_cols);
// Select columns from a prepared B matrix. The number of selected columns must be a multiple of 8.
static void (*SelectColumnsB)(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end);
// Multiply C = A * B, presuming A and B have been prepared.
template <typename Callback>
static void Multiply(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) {
MultiplyImpl<Callback>::run(A, B, A_rows, width, B_cols, callback);
}
static const char *const kName;
private:
template <typename Callback>
struct MultiplyImpl {
static void (*run)(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback);
};
};
template <typename Callback>
void (*Int8::MultiplyImpl<Callback>::run)(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(OMPParallelWrap<Callback, AVX512VNNI::Kernels8>, OMPParallelWrap<Callback, AVX512BW::Kernels8>, OMPParallelWrap<Callback, AVX2::Kernels8>, OMPParallelWrap<Callback, SSSE3::Kernels8>, Unsupported_8bit::Multiply<Callback>, Unsupported_8bit::Multiply<Callback>);
/*
* 8-bit matrix multiplication with shifting A by 127
*/
struct Int8Shift {
using Integer = int8_t;
// A's size must be a multiple of 1x64, B's size must be a multiple of 64x8.
static constexpr TileInfo tile_info{1, 64, 64, 8};
// Identical to the Int8 Version, except it adds 127 to each number, making sure that all numbers are positive.
static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
QuantizeU(input, reinterpret_cast<uint8_t *>(output), quant_mult, rows * cols);
}
// Multiply floats by quant_mult then convert to 8-bit integers with saturation.
// A version that adds 127 to each number, making sure that all numbers are positive
static void (*QuantizeU)(const float *input, uint8_t *output, float quant_mult, Index size);
// Warning: the output of PrepareB depends on the CPU.
// It will match the Multiply function on the same CPU though.
static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
Int8::PrepareB(input, output, quant_mult, rows, cols);
}
// Select columns from a prepared B matrix. The number of selected columns must be a multiple of 8.
static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
Int8::SelectColumnsB(input, output, rows, cols_begin, cols_end);
}
// A slightly faster version than the Int8 one (assuming a bias is used), thanks to better handling of the sign bit.
// Multiply C = A * B + Bias, presuming A, B and Bias have all been prepared (for A, this struct's PrepareA should be used).
template<class Callback>
static void Multiply(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) {
MultiplyImpl<Callback>::run((const uint8_t *)A, B, A_rows, width, B_cols, callback);
}
// This function prepares the bias for the Multiply routine that does unsigned * signed multiplication.
// The function takes:
// a prepared B matrix, width, B_cols and
// the callback UnquantizeAndAddBiasAndWrite(unquant_mult, Bias_matrix, Bias_matrix)
// unquant_mult is computed by (-1)*(alpha)*(alpha)/(127.0f);
template<class Callback>
static void PrepareBias(const int8_t *B, Index width, Index B_cols, Callback callback) {
PrepareBiasImpl<Callback>::run(B, width, B_cols, callback);
}
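// Illustrative call (an assumption about typical use, mirroring the comment
// above rather than anything defined here): with alpha the clipping value used
// when quantizing,
//   float unquant_mult = (-1) * alpha * alpha / 127.0f;
//   Int8Shift::PrepareBias(B_prepared, width, B_cols,
//       callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult, bias, bias));
// folds the +127 shift applied to A into the bias before Multiply is called.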
static const char *const kName;
private:
template <typename Callback>
struct MultiplyImpl {
static void (*run)(const uint8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback);
};
template <typename Callback>
struct PrepareBiasImpl {
static void (*run)(const int8_t *B, Index width, Index B_cols, Callback callback);
};
};
template <class Callback>
void (*Int8Shift::MultiplyImpl<Callback>::run)(const uint8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(
OMPParallelWrap8Shift<Callback, AVX512VNNI::Kernels8>,
OMPParallelWrap8Shift<Callback, AVX512BW::Kernels8>,
OMPParallelWrap8Shift<Callback, AVX2::Kernels8>,
OMPParallelWrap8Shift<Callback, SSSE3::Kernels8>,
Unsupported_8bit::Multiply8Shift<Callback>, Unsupported_8bit::Multiply8Shift<Callback>);
template <class Callback>
void (*Int8Shift::PrepareBiasImpl<Callback>::run)(const int8_t *B, Index width, Index B_cols, Callback callback) = ChooseCPU(AVX512VNNI::Kernels8::PrepareBias<Callback>, AVX512BW::Kernels8::PrepareBias<Callback>, AVX2::Kernels8::PrepareBias<Callback>, SSSE3::Kernels8::PrepareBias<Callback>, SSSE3::Kernels8::PrepareBias<Callback>, Unsupported_8bit::PrepareBias);
/*
* 16-bit matrix multiplication
*/
struct Int16 {
using Integer = int16_t;
// A's size must be a multiple of 1x32, B's size must be a multiple of 32x8.
static constexpr TileInfo tile_info{1, 32, 32, 8};
// Currently A is prepared by quantization but this could theoretically change.
// A's columns must be a multiple of 8.
// The number of rows is anything.
static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
Quantize(input, output, quant_mult, rows * cols);
}
// Multiply floats by quant_mult then convert to 16-bit integers with saturation.
static void (*Quantize)(const float *input, int16_t *output, float quant_mult, Index size);
// Warning: the output of PrepareB depends on the CPU.
// It will match the Multiply function on the same CPU though.
static void (*PrepareB)(const float *input, int16_t *output, float quant_mult, Index rows, Index cols);
// Convert from a B that was already transposed (routine not provided) and
// quantized (e.g. with Quantize) to the CPU-dependent format used for
// Multiply. This is useful for storing a quantized model on disk in a
// CPU-independent fashion.
static void (*PrepareBQuantizedTransposed)(const int16_t *input, int16_t *output, Index inner, Index B_untransposed_cols);
// Convert from a B that was already transposed (routine not provided) to
// the CPU-dependent format used for Multiply. This is useful for storing
// a quantized model on disk in a CPU-independent fashion.
static void (*PrepareBTransposed)(const float *input, int16_t *output, float quant_mul, Index inner, Index B_untransposed_cols);
// Select columns from a prepared B matrix. The number of selected columns must be a multiple of 8.
static void (*SelectColumnsB)(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end);
// Multiply C = A * B, presuming A and B have been prepared.
template <typename Callback>
static void Multiply(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback) {
MultiplyImpl<Callback>::run(A, B, A_rows, width, B_cols, callback);
}
static const char *const kName;
private:
template <typename Callback>
struct MultiplyImpl {
static void (*run)(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback);
};
};
template <typename Callback>
void (*Int16::MultiplyImpl<Callback>::run)(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(OMPParallelWrap<Callback, AVX512BW::Kernels16> /*TODO VNNI 16-bit. */, OMPParallelWrap<Callback, AVX512BW::Kernels16>, OMPParallelWrap<Callback, AVX2::Kernels16>, OMPParallelWrap<Callback, SSE2::Kernels16>, OMPParallelWrap<Callback, SSE2::Kernels16>, Unsupported_16bit::Multiply<Callback>);
extern const CPUType kCPU;
// Get the maximum absolute value of an array of floats. The number of floats must be a multiple of 16 and the pointers 64-byte aligned.
extern float (*MaxAbsolute)(const float *begin, const float *end);
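// For example (illustrative), the 8-bit quantization multiplier recommended at
// the top of this header can be derived from the data as
//   float quant_mult = 127.0f / MaxAbsolute(begin, end);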
// Get a quantization value that is equal to the mean of the data plus N standard deviations. N defaults to 2.
extern MeanStd (*VectorMeanStd)(const float *begin, const float *end, bool);
/* Returns the Mean and the Standard deviation of a vector.
* If "absolute" is set to true, it computes the mean and the standard deviation of the absolute values of the vector */
static inline MeanStd GetVectorMeanStd(const float * begin, const float * end, bool absolute=false) {
return VectorMeanStd(begin, end, absolute);
}
} // namespace intgemm

View file

@@ -1,5 +0,0 @@
#pragma once
#cmakedefine INTGEMM_COMPILER_SUPPORTS_AVX2
#cmakedefine INTGEMM_COMPILER_SUPPORTS_AVX512BW
#cmakedefine INTGEMM_COMPILER_SUPPORTS_AVX512VNNI

611
third_party/intgemm/intgemm/intrinsics.h vendored
View file

@@ -1,611 +0,0 @@
#pragma once
#include "intgemm/intgemm_config.h"
#include "types.h"
#include <tmmintrin.h>
#include <emmintrin.h>
#include <xmmintrin.h>
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
#include <immintrin.h>
#endif
#ifdef INTGEMM_WORMHOLE
#include <wasm_simd128.h>
#endif
#include <cstdint>
/*
* NOTE: Please keep intrinsics in alphabetical order.
*/
namespace intgemm {
/*
* Define a bunch of intrinsics as overloaded functions so they work with
* templates.
*/
template <class Register> static inline Register load_ps(float const* from);
template <class Register> static inline Register loadu_ps(const float* mem_addr);
template <class Register> static inline Register set1_epi16(int16_t to);
template <class Register> static inline Register set1_epi32(int32_t to);
template <class Register> static inline Register set1_epi8(int8_t to);
template <class Register> static inline Register set1_pd(double to);
template <class Register> static inline Register set1_ps(float to);
template <class Register> static inline Register setzero_pd();
template <class Register> static inline Register setzero_ps();
template <class Register> static inline Register setzero_si();
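// Illustrative (an assumption about call sites elsewhere in intgemm): templated
// kernels select the register width through the template argument, e.g.
//   auto ones_sse = set1_ps<__m128>(1.0f); // SSE2 path
//   auto ones_avx = set1_ps<__m256>(1.0f); // AVX2 path (if compiled in)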
/*
*
* SSE2
*
*/
INTGEMM_SSSE3 static inline __m128i abs_epi8(__m128i arg) {
return _mm_abs_epi8(arg);
}
INTGEMM_SSE2 static inline __m128i add_epi8(__m128i a, __m128i b) {
return _mm_add_epi8(a, b);
}
INTGEMM_SSE2 static inline __m128i add_epi16(__m128i a, __m128i b) {
return _mm_add_epi16(a, b);
}
INTGEMM_SSE2 static inline __m128i add_epi32(__m128i first, __m128i second) {
return _mm_add_epi32(first, second);
}
INTGEMM_SSE2 static inline __m128i adds_epi16(__m128i first, __m128i second) {
return _mm_adds_epi16(first, second);
}
INTGEMM_SSE2 static inline __m128d add_pd(__m128d a, __m128d b) {
return _mm_add_pd(a, b);
}
INTGEMM_SSE2 static inline __m128 add_ps(__m128 a, __m128 b) {
return _mm_add_ps(a, b);
}
INTGEMM_SSE2 static inline __m128 and_ps(__m128 first, __m128 second) {
return _mm_and_ps(first, second);
}
INTGEMM_SSE2 static inline __m128 andnot_ps(__m128 a, __m128 b) {
return _mm_andnot_ps(a, b);
}
INTGEMM_SSE2 static inline __m128i and_si(__m128i a, __m128i b) {
return _mm_and_si128(a, b);
}
INTGEMM_SSE2 static inline __m128 cast_ps(__m128i a) {
return _mm_castsi128_ps(a);
}
INTGEMM_SSE2 static inline __m128 cvtepi32_ps(__m128i arg) {
return _mm_cvtepi32_ps(arg);
}
INTGEMM_SSE2 static inline __m128i cvtps_epi32(__m128 arg) {
return _mm_cvtps_epi32(arg);
}
INTGEMM_SSE2 static inline __m128i cvttps_epi32(__m128 a) {
return _mm_cvttps_epi32(a);
}
INTGEMM_SSE2 static inline __m128 div_ps(__m128 a, __m128 b) {
return _mm_div_ps(a, b);
}
/*
* Missing i32gather_ps for SSE2
*/
template <> INTGEMM_SSE2 inline __m128 load_ps<__m128>(const float* from) {
return _mm_load_ps(from);
}
template <> INTGEMM_SSE2 inline __m128 loadu_ps(const float* mem_addr) {
return _mm_loadu_ps(mem_addr);
}
INTGEMM_SSE2 static inline __m128i madd_epi16(__m128i first, __m128i second) {
// https://bugzilla.mozilla.org/show_bug.cgi?id=1672160
#ifdef INTGEMM_WORMHOLE
return wasm_v8x16_shuffle(first, second, 31, 0, 30, 2, 29, 4, 28, 6, 27, 8, 26, 10, 25, 12, 24, 2 /* PMADDWD */);
#else
return _mm_madd_epi16(first, second);
#endif
}
INTGEMM_SSSE3 static inline __m128i maddubs_epi16(__m128i first, __m128i second) {
// https://bugzilla.mozilla.org/show_bug.cgi?id=1672160
#ifdef INTGEMM_WORMHOLE
return wasm_v8x16_shuffle(first, second, 31, 0, 30, 2, 29, 4, 28, 6, 27, 8, 26, 10, 25, 12, 24, 1 /* PMADDUBSW */);
#else
return _mm_maddubs_epi16(first, second);
#endif
}
/*
* Missing max_epi8 for SSE2
*/
INTGEMM_SSE2 static inline __m128i max_epi16(__m128i first, __m128i second) {
return _mm_max_epi16(first, second);
}
INTGEMM_SSE2 static inline __m128d max_pd(__m128d first, __m128d second) {
return _mm_max_pd(first, second);
}
INTGEMM_SSE2 static inline __m128 max_ps(__m128 first, __m128 second) {
return _mm_max_ps(first, second);
}
INTGEMM_SSE2 static inline __m128 min_ps(__m128 a, __m128 b) {
return _mm_min_ps(a, b);
}
INTGEMM_SSE2 static inline __m128i mul_epu32(__m128i a, __m128i b) {
return _mm_mul_epu32(a, b);
}
INTGEMM_SSE2 static inline __m128d mul_pd(__m128d a, __m128d b) {
return _mm_mul_pd(a, b);
}
INTGEMM_SSE2 static inline __m128 mul_ps(__m128 a, __m128 b) {
return _mm_mul_ps(a, b);
}
INTGEMM_SSE2 static inline __m128i mulhi_epi16(__m128i a, __m128i b) {
return _mm_mulhi_epi16(a, b);
}
INTGEMM_SSE2 static inline __m128i mullo_epi16(__m128i a, __m128i b) {
return _mm_mullo_epi16(a, b);
}
INTGEMM_SSE2 static inline __m128i or_si(__m128i a, __m128i b) {
return _mm_or_si128(a, b);
}
INTGEMM_SSE2 static inline __m128i packs_epi16(__m128i a, __m128i b) {
return _mm_packs_epi16(a, b);
}
INTGEMM_SSE2 static inline __m128i packs_epi32(__m128i a, __m128i b) {
return _mm_packs_epi32(a, b);
}
template <> INTGEMM_SSE2 inline __m128i set1_epi8<__m128i>(int8_t to) {
return _mm_set1_epi8(to);
}
template <> INTGEMM_SSE2 inline __m128i set1_epi16<__m128i>(int16_t to) {
return _mm_set1_epi16(to);
}
template <> INTGEMM_SSE2 inline __m128i set1_epi32<__m128i>(int32_t to) {
return _mm_set1_epi32(to);
}
template <> INTGEMM_SSE2 inline __m128d set1_pd<__m128d>(double to) {
return _mm_set1_pd(to);
}
template <> INTGEMM_SSE2 inline __m128 set1_ps<__m128>(float to) {
return _mm_set1_ps(to);
}
template <> INTGEMM_SSE2 inline __m128d setzero_pd<__m128d>() {
return _mm_setzero_pd();
}
template <> INTGEMM_SSE2 inline __m128 setzero_ps<__m128>() {
return _mm_setzero_ps();
}
template <> INTGEMM_SSE2 inline __m128i setzero_si<__m128i>() {
return _mm_setzero_si128();
}
INTGEMM_SSSE3 static inline __m128i sign_epi8(__m128i first, __m128i second) {
return _mm_sign_epi8(first, second);
}
template <int imm8> INTGEMM_SSE2 static inline __m128i slli_epi16(__m128i a) {
return _mm_slli_epi16(a, imm8);
}
template <int imm8> INTGEMM_SSE2 static inline __m128i srai_epi16(__m128i a) {
return _mm_srai_epi16(a, imm8);
}
template <int imm8> INTGEMM_SSE2 static inline __m128i srai_epi32(__m128i a) {
return _mm_srai_epi32(a, imm8);
}
template <int imm8> INTGEMM_SSE2 static inline __m128i srli_epi16(__m128i a) {
return _mm_srli_epi16(a, imm8);
}
INTGEMM_SSE2 static inline void storeu_ps(float* mem_addr, __m128 a) {
_mm_storeu_ps(mem_addr, a);
}
INTGEMM_SSE2 static inline __m128d sub_pd(__m128d a, __m128d b) {
return _mm_sub_pd(a, b);
}
INTGEMM_SSE2 static inline __m128 sub_ps(__m128 a, __m128 b) {
return _mm_sub_ps(a, b);
}
INTGEMM_SSE2 static inline __m128i unpacklo_epi8(__m128i a, __m128i b) {
return _mm_unpacklo_epi8(a, b);
}
INTGEMM_SSE2 static inline __m128i unpackhi_epi8(__m128i a, __m128i b) {
return _mm_unpackhi_epi8(a, b);
}
INTGEMM_SSE2 static inline __m128i unpacklo_epi16(__m128i a, __m128i b) {
return _mm_unpacklo_epi16(a, b);
}
INTGEMM_SSE2 static inline __m128i unpackhi_epi16(__m128i a, __m128i b) {
return _mm_unpackhi_epi16(a, b);
}
INTGEMM_SSE2 static inline __m128i unpacklo_epi32(__m128i a, __m128i b) {
return _mm_unpacklo_epi32(a, b);
}
INTGEMM_SSE2 static inline __m128i unpackhi_epi32(__m128i a, __m128i b) {
return _mm_unpackhi_epi32(a, b);
}
INTGEMM_SSE2 static inline __m128i unpacklo_epi64(__m128i a, __m128i b) {
return _mm_unpacklo_epi64(a, b);
}
INTGEMM_SSE2 static inline __m128i unpackhi_epi64(__m128i a, __m128i b) {
return _mm_unpackhi_epi64(a, b);
}
INTGEMM_SSE2 static inline __m128i xor_si(__m128i a, __m128i b) {
return _mm_xor_si128(a, b);
}
/*
*
* AVX2
*
*/
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
INTGEMM_AVX2 static inline __m256i abs_epi8(__m256i arg) {
return _mm256_abs_epi8(arg);
}
INTGEMM_AVX2 static inline __m256i add_epi8(__m256i a, __m256i b) {
return _mm256_add_epi8(a, b);
}
INTGEMM_AVX2 static inline __m256i add_epi16(__m256i a, __m256i b) {
return _mm256_add_epi16(a, b);
}
INTGEMM_AVX2 static inline __m256i add_epi32(__m256i first, __m256i second) {
return _mm256_add_epi32(first, second);
}
INTGEMM_AVX2 static inline __m256i adds_epi16(__m256i first, __m256i second) {
return _mm256_adds_epi16(first, second);
}
INTGEMM_AVX2 static inline __m256d add_pd(__m256d a, __m256d b) {
return _mm256_add_pd(a, b);
}
INTGEMM_AVX2 static inline __m256 add_ps(__m256 a, __m256 b) {
return _mm256_add_ps(a, b);
}
INTGEMM_AVX2 static inline __m256 and_ps(__m256 first, __m256 second) {
return _mm256_and_ps(first, second);
}
INTGEMM_AVX2 static inline __m256 andnot_ps(__m256 a, __m256 b) {
return _mm256_andnot_ps(a, b);
}
INTGEMM_AVX2 static inline __m256i and_si(__m256i a, __m256i b) {
return _mm256_and_si256(a, b);
}
INTGEMM_AVX2 static inline __m256 cast_ps(__m256i a) {
return _mm256_castsi256_ps(a);
}
INTGEMM_AVX2 static inline __m256 cvtepi32_ps(__m256i arg) {
return _mm256_cvtepi32_ps(arg);
}
INTGEMM_AVX2 static inline __m256i cvtps_epi32(__m256 arg) {
return _mm256_cvtps_epi32(arg);
}
INTGEMM_AVX2 static inline __m256i cvttps_epi32(__m256 a) {
return _mm256_cvttps_epi32(a);
}
INTGEMM_AVX2 static inline __m256 div_ps(__m256 a, __m256 b) {
return _mm256_div_ps(a, b);
}
template <unsigned Scale>
INTGEMM_AVX2 static inline __m256 i32gather_ps(float const *base_addr, __m256i vindex) {
return _mm256_i32gather_ps(base_addr, vindex, Scale);
}
template <> INTGEMM_AVX2 inline __m256 loadu_ps(const float* mem_addr) {
return _mm256_loadu_ps(mem_addr);
}
template <> INTGEMM_AVX2 inline __m256 load_ps<__m256>(const float* from) {
return _mm256_load_ps(from);
}
INTGEMM_AVX2 static inline __m256i madd_epi16(__m256i first, __m256i second) {
return _mm256_madd_epi16(first, second);
}
INTGEMM_AVX2 static inline __m256i maddubs_epi16(__m256i first, __m256i second) {
return _mm256_maddubs_epi16(first, second);
}
INTGEMM_AVX2 static inline __m256i max_epi8(__m256i first, __m256i second) {
return _mm256_max_epi8(first, second);
}
INTGEMM_AVX2 static inline __m256i max_epi16(__m256i first, __m256i second) {
return _mm256_max_epi16(first, second);
}
INTGEMM_AVX2 static inline __m256d max_pd(__m256d first, __m256d second) {
return _mm256_max_pd(first, second);
}
INTGEMM_AVX2 static inline __m256 max_ps(__m256 first, __m256 second) {
return _mm256_max_ps(first, second);
}
INTGEMM_AVX2 static inline __m256 min_ps(__m256 a, __m256 b) {
return _mm256_min_ps(a, b);
}
INTGEMM_AVX2 static inline __m256i mul_epu32(__m256i a, __m256i b) {
return _mm256_mul_epu32(a, b);
}
INTGEMM_AVX2 static inline __m256d mul_pd(__m256d a, __m256d b) {
return _mm256_mul_pd(a, b);
}
INTGEMM_AVX2 static inline __m256 mul_ps(__m256 a, __m256 b) {
return _mm256_mul_ps(a, b);
}
INTGEMM_AVX2 static inline __m256i mulhi_epi16(__m256i a, __m256i b) {
return _mm256_mulhi_epi16(a, b);
}
INTGEMM_AVX2 static inline __m256i mullo_epi16(__m256i a, __m256i b) {
return _mm256_mullo_epi16(a, b);
}
INTGEMM_AVX2 static inline __m256i or_si(__m256i a, __m256i b) {
return _mm256_or_si256(a, b);
}
INTGEMM_AVX2 static inline __m256i packs_epi16(__m256i a, __m256i b) {
return _mm256_packs_epi16(a, b);
}
INTGEMM_AVX2 static inline __m256i packs_epi32(__m256i a, __m256i b) {
return _mm256_packs_epi32(a, b);
}
template <> INTGEMM_AVX2 inline __m256i set1_epi8<__m256i>(int8_t to) {
return _mm256_set1_epi8(to);
}
template <> INTGEMM_AVX2 inline __m256i set1_epi16<__m256i>(int16_t to) {
return _mm256_set1_epi16(to);
}
template <> INTGEMM_AVX2 inline __m256i set1_epi32<__m256i>(int32_t to) {
return _mm256_set1_epi32(to);
}
template <> INTGEMM_AVX2 inline __m256d set1_pd<__m256d>(double to) {
return _mm256_set1_pd(to);
}
template <> INTGEMM_AVX2 inline __m256 set1_ps<__m256>(float to) {
return _mm256_set1_ps(to);
}
template <> INTGEMM_AVX2 inline __m256d setzero_pd<__m256d>() {
return _mm256_setzero_pd();
}
template <> INTGEMM_AVX2 inline __m256 setzero_ps<__m256>() {
return _mm256_setzero_ps();
}
template <> INTGEMM_AVX2 inline __m256i setzero_si<__m256i>() {
return _mm256_setzero_si256();
}
INTGEMM_AVX2 static inline __m256i sign_epi8(__m256i first, __m256i second) {
return _mm256_sign_epi8(first, second);
}
template <int imm8> INTGEMM_AVX2 static inline __m256i slli_epi16(__m256i a) {
return _mm256_slli_epi16(a, imm8);
}
template <int imm8> INTGEMM_AVX2 static inline __m256i srai_epi16(__m256i a) {
return _mm256_srai_epi16(a, imm8);
}
template <int imm8> INTGEMM_AVX2 static inline __m256i srai_epi32(__m256i a) {
return _mm256_srai_epi32(a, imm8);
}
template <int imm8> INTGEMM_AVX2 static inline __m256i srli_epi16(__m256i a) {
return _mm256_srli_epi16(a, imm8);
}
INTGEMM_AVX2 static inline void storeu_ps(float* mem_addr, __m256 a) {
_mm256_storeu_ps(mem_addr, a);
}
INTGEMM_AVX2 static inline __m256d sub_pd(__m256d a, __m256d b) {
return _mm256_sub_pd(a, b);
}
INTGEMM_AVX2 static inline __m256 sub_ps(__m256 a, __m256 b) {
return _mm256_sub_ps(a, b);
}
INTGEMM_AVX2 static inline __m256i unpacklo_epi8(__m256i a, __m256i b) {
return _mm256_unpacklo_epi8(a, b);
}
INTGEMM_AVX2 static inline __m256i unpackhi_epi8(__m256i a, __m256i b) {
return _mm256_unpackhi_epi8(a, b);
}
INTGEMM_AVX2 static inline __m256i unpacklo_epi16(__m256i a, __m256i b) {
return _mm256_unpacklo_epi16(a, b);
}
INTGEMM_AVX2 static inline __m256i unpackhi_epi16(__m256i a, __m256i b) {
return _mm256_unpackhi_epi16(a, b);
}
INTGEMM_AVX2 static inline __m256i unpacklo_epi32(__m256i a, __m256i b) {
return _mm256_unpacklo_epi32(a, b);
}
INTGEMM_AVX2 static inline __m256i unpackhi_epi32(__m256i a, __m256i b) {
return _mm256_unpackhi_epi32(a, b);
}
INTGEMM_AVX2 static inline __m256i unpacklo_epi64(__m256i a, __m256i b) {
return _mm256_unpacklo_epi64(a, b);
}
INTGEMM_AVX2 static inline __m256i unpackhi_epi64(__m256i a, __m256i b) {
return _mm256_unpackhi_epi64(a, b);
}
INTGEMM_AVX2 static inline __m256i xor_si(__m256i a, __m256i b) {
return _mm256_xor_si256(a, b);
}
#endif
/*
*
* AVX512
*
*/
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
INTGEMM_AVX512BW static inline __m512i abs_epi8(__m512i arg) {
return _mm512_abs_epi8(arg);
}
INTGEMM_AVX512BW static inline __m512i add_epi8(__m512i a, __m512i b) {
return _mm512_add_epi8(a, b);
}
INTGEMM_AVX512BW static inline __m512i add_epi16(__m512i a, __m512i b) {
return _mm512_add_epi16(a, b);
}
INTGEMM_AVX512BW static inline __m512i add_epi32(__m512i first, __m512i second) {
return _mm512_add_epi32(first, second);
}
INTGEMM_AVX512BW static inline __m512i adds_epi16(__m512i first, __m512i second) {
return _mm512_adds_epi16(first, second);
}
INTGEMM_AVX512BW static inline __m512d add_pd(__m512d a, __m512d b) {
return _mm512_add_pd(a, b);
}
INTGEMM_AVX512BW static inline __m512 add_ps(__m512 a, __m512 b) {
return _mm512_add_ps(a, b);
}
INTGEMM_AVX512DQ static inline __m512 and_ps(__m512 first, __m512 second) {
return _mm512_and_ps(first, second);
}
INTGEMM_AVX512DQ static inline __m512 andnot_ps(__m512 a, __m512 b) {
return _mm512_andnot_ps(a, b);
}
INTGEMM_AVX512BW static inline __m512i and_si(__m512i a, __m512i b) {
return _mm512_and_si512(a, b);
}
INTGEMM_AVX512F static inline __m512 cast_ps(__m512i a) {
return _mm512_castsi512_ps(a);
}
INTGEMM_AVX512BW static inline __m512 cvtepi32_ps(__m512i arg) {
return _mm512_cvtepi32_ps(arg);
}
INTGEMM_AVX512BW static inline __m512i cvtps_epi32(__m512 arg) {
return _mm512_cvtps_epi32(arg);
}
INTGEMM_AVX512BW static inline __m512i cvttps_epi32(__m512 a) {
return _mm512_cvttps_epi32(a);
}
INTGEMM_AVX512BW static inline __m512 div_ps(__m512 a, __m512 b) {
return _mm512_div_ps(a, b);
}
template <unsigned Scale>
INTGEMM_AVX512BW static inline __m512 i32gather_ps(float const *base_addr, __m512i vindex) {
return _mm512_i32gather_ps(vindex, base_addr, Scale);
}
template <> INTGEMM_AVX512BW inline __m512 loadu_ps(const float* mem_addr) {
return _mm512_loadu_ps(mem_addr);
}
INTGEMM_AVX512BW static inline __m512i madd_epi16(__m512i first, __m512i second) {
return _mm512_madd_epi16(first, second);
}
INTGEMM_AVX512BW static inline __m512i maddubs_epi16(__m512i first, __m512i second) {
return _mm512_maddubs_epi16(first, second);
}
INTGEMM_AVX512BW static inline __m512i max_epi8(__m512i first, __m512i second) {
return _mm512_max_epi8(first, second);
}
INTGEMM_AVX512BW static inline __m512i max_epi16(__m512i first, __m512i second) {
return _mm512_max_epi16(first, second);
}
INTGEMM_AVX512BW static inline __m512d max_pd(__m512d first, __m512d second) {
return _mm512_max_pd(first, second);
}
INTGEMM_AVX512BW static inline __m512 max_ps(__m512 first, __m512 second) {
return _mm512_max_ps(first, second);
}
INTGEMM_AVX512BW static inline __m512 min_ps(__m512 a, __m512 b) {
return _mm512_min_ps(a, b);
}
INTGEMM_AVX512BW static inline __m512i mul_epu32(__m512i a, __m512i b) {
return _mm512_mul_epu32(a, b);
}
INTGEMM_AVX512BW static inline __m512d mul_pd(__m512d a, __m512d b) {
return _mm512_mul_pd(a, b);
}
INTGEMM_AVX512BW static inline __m512 mul_ps(__m512 a, __m512 b) {
return _mm512_mul_ps(a, b);
}
INTGEMM_AVX512BW static inline __m512i mulhi_epi16(__m512i a, __m512i b) {
return _mm512_mulhi_epi16(a, b);
}
INTGEMM_AVX512BW static inline __m512i mullo_epi16(__m512i a, __m512i b) {
return _mm512_mullo_epi16(a, b);
}
INTGEMM_AVX512BW static inline __m512i or_si(__m512i a, __m512i b) {
return _mm512_or_si512(a, b);
}
INTGEMM_AVX512BW static inline __m512i packs_epi16(__m512i a, __m512i b) {
return _mm512_packs_epi16(a, b);
}
/* g++ (Ubuntu 5.4.0-6ubuntu1~16.04.12) 5.4.0 20160609 has a bug:
* /usr/lib/gcc/x86_64-linux-gnu/5/include/avx512bwintrin.h is missing
* _mm512_packs_epi32 when compiled with debugging.
*/
#if !defined(__OPTIMIZE__) && (__GNUC__ == 5) && (__GNUC_MINOR__ == 4)
INTGEMM_AVX512BW static inline __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) __m512i packs_epi32(__m512i a, __m512i b) {
return reinterpret_cast<__m512i>(__builtin_ia32_packssdw512_mask(
reinterpret_cast<__v16si>(a),
reinterpret_cast<__v16si>(b),
reinterpret_cast<__v32hi>(_mm512_setzero_si512()),
0xffffffff));
}
#else
INTGEMM_AVX512BW static inline __m512i packs_epi32(__m512i a, __m512i b) {
return _mm512_packs_epi32(a, b);
}
#endif
template <> inline INTGEMM_AVX512BW __m512i set1_epi8<__m512i>(int8_t to) {
return _mm512_set1_epi8(to);
}
template <> inline INTGEMM_AVX512BW __m512i set1_epi16<__m512i>(int16_t to) {
return _mm512_set1_epi16(to);
}
template <> inline INTGEMM_AVX512BW __m512i set1_epi32<__m512i>(int32_t to) {
return _mm512_set1_epi32(to);
}
template <> inline INTGEMM_AVX512BW __m512d set1_pd<__m512d>(double to) {
return _mm512_set1_pd(to);
}
template <> inline INTGEMM_AVX512BW __m512 set1_ps<__m512>(float to) {
return _mm512_set1_ps(to);
}
template <> INTGEMM_AVX512BW inline __m512d setzero_pd<__m512d>() {
return _mm512_setzero_pd();
}
template <> INTGEMM_AVX512BW inline __m512 setzero_ps<__m512>() {
return _mm512_setzero_ps();
}
template <> INTGEMM_AVX512BW inline __m512i setzero_si<__m512i>() {
return _mm512_setzero_si512();
}
template <> INTGEMM_AVX512BW inline __m512 load_ps<__m512>(const float* from) {
return _mm512_load_ps(from);
}
/*
* Missing sign_epi8
*/
template <int imm8> INTGEMM_AVX512BW static inline __m512i slli_epi16(__m512i a) {
return _mm512_slli_epi16(a, imm8);
}
template <int imm8> INTGEMM_AVX512BW static inline __m512i srai_epi16(__m512i a) {
return _mm512_srai_epi16(a, imm8);
}
template <int imm8> INTGEMM_AVX512BW static inline __m512i srai_epi32(__m512i a) {
return _mm512_srai_epi32(a, imm8);
}
template <int imm8> INTGEMM_AVX512BW static inline __m512i srli_epi16(__m512i a) {
return _mm512_srli_epi16(a, imm8);
}
INTGEMM_AVX512BW static inline void storeu_ps(float* mem_addr, __m512 a) {
_mm512_storeu_ps(mem_addr, a);
}
INTGEMM_AVX512BW static inline __m512d sub_pd(__m512d a, __m512d b) {
return _mm512_sub_pd(a, b);
}
INTGEMM_AVX512BW static inline __m512 sub_ps(__m512 a, __m512 b) {
return _mm512_sub_ps(a, b);
}
INTGEMM_AVX512BW static inline __m512i unpacklo_epi8(__m512i a, __m512i b) {
return _mm512_unpacklo_epi8(a, b);
}
INTGEMM_AVX512BW static inline __m512i unpackhi_epi8(__m512i a, __m512i b) {
return _mm512_unpackhi_epi8(a, b);
}
INTGEMM_AVX512BW static inline __m512i unpacklo_epi16(__m512i a, __m512i b) {
return _mm512_unpacklo_epi16(a, b);
}
INTGEMM_AVX512BW static inline __m512i unpackhi_epi16(__m512i a, __m512i b) {
return _mm512_unpackhi_epi16(a, b);
}
INTGEMM_AVX512BW static inline __m512i unpacklo_epi32(__m512i a, __m512i b) {
return _mm512_unpacklo_epi32(a, b);
}
INTGEMM_AVX512BW static inline __m512i unpackhi_epi32(__m512i a, __m512i b) {
return _mm512_unpackhi_epi32(a, b);
}
INTGEMM_AVX512BW static inline __m512i unpacklo_epi64(__m512i a, __m512i b) {
return _mm512_unpacklo_epi64(a, b);
}
INTGEMM_AVX512BW static inline __m512i unpackhi_epi64(__m512i a, __m512i b) {
return _mm512_unpackhi_epi64(a, b);
}
INTGEMM_AVX512BW static inline __m512i xor_si(__m512i a, __m512i b) {
return _mm512_xor_si512(a, b);
}
#endif
}

26
third_party/intgemm/intgemm/kernels.h vendored
View file

@ -1,26 +0,0 @@
#pragma once
#include "intgemm/intgemm_config.h"
#include "intrinsics.h"
#include "types.h"
#include "utils.h"
#include "vec_traits.h"
#include <cstdlib>
#define KERNELS_THIS_IS_SSE2
#include "kernels/implementations.inl"
#undef KERNELS_THIS_IS_SSE2
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
#define KERNELS_THIS_IS_AVX2
#include "kernels/implementations.inl"
#undef KERNELS_THIS_IS_AVX2
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
#define KERNELS_THIS_IS_AVX512BW
#include "kernels/implementations.inl"
#undef KERNELS_THIS_IS_AVX512BW
#endif

456
third_party/intgemm/intgemm/kernels/implementations.inl vendored
View file

@ -1,456 +0,0 @@
/* This file is included multiple times, once for each backend instruction set. */
#if defined(KERNELS_THIS_IS_SSE2)
#define CPU_NAME SSE2
#define CPU_ATTR INTGEMM_SSE2
#elif defined(KERNELS_THIS_IS_AVX2)
#define CPU_NAME AVX2
#define CPU_ATTR INTGEMM_AVX2
#elif defined(KERNELS_THIS_IS_AVX512BW)
#define CPU_NAME AVX512BW
#define CPU_ATTR INTGEMM_AVX512BW
#else
#error "Only SSE2, AVX2 and AVX512BW are supported"
#endif
#define vi vector_t<CPUType::CPU_NAME, int>
#define vf vector_t<CPUType::CPU_NAME, float>
#define vd vector_t<CPUType::CPU_NAME, double>
/*
* Kernels implementations....
*/
namespace intgemm {
namespace kernels {
/*
* Write
*/
CPU_ATTR static inline void write(vi input, int8_t* output, Index offset) {
*reinterpret_cast<vi*>(output + offset) = input;
}
CPU_ATTR static inline void write(vi input, int16_t* output, Index offset) {
*reinterpret_cast<vi*>(output + offset) = input;
}
CPU_ATTR static inline void write(vi input, int* output, Index offset) {
*reinterpret_cast<vi*>(output + offset) = input;
}
CPU_ATTR static inline void write(vf input, float* output, Index offset) {
*reinterpret_cast<vf*>(output + offset) = input;
}
CPU_ATTR static inline void write(vd input, double* output, Index offset) {
*reinterpret_cast<vd*>(output + offset) = input;
}
/*
* Quantize
*/
CPU_ATTR static inline vi quantize(vf input, vf quant_mult) {
return cvtps_epi32(mul_ps(input, quant_mult));
}
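/* quant_mult is typically chosen by the caller as 127 / max|input|; with that
 * choice an input of 0.5 * max|input| maps to round(63.5) = 64 here, and
 * unquantize below multiplies by a matching unquant_mult to map back to floats. */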
/*
* Unquantize
*/
CPU_ATTR static inline vf unquantize(vi input, vf unquant_mult) {
return mul_ps(cvtepi32_ps(input), unquant_mult);
}
/*
* Add a bias term
*/
CPU_ATTR static inline vi add_bias(vi input, const int8_t* bias_addr, Index bias_offset) {
auto bias_term = *reinterpret_cast<const vi*>(bias_addr + bias_offset);
return add_epi8(input, bias_term);
}
CPU_ATTR static inline vi add_bias(vi input, const int16_t* bias_addr, Index bias_offset) {
auto bias_term = *reinterpret_cast<const vi*>(bias_addr + bias_offset);
return add_epi16(input, bias_term);
}
CPU_ATTR static inline vi add_bias(vi input, const int* bias_addr, Index bias_offset) {
auto bias_term = *reinterpret_cast<const vi*>(bias_addr + bias_offset);
return add_epi32(input, bias_term);
}
CPU_ATTR static inline vf add_bias(vf input, const float* bias_addr, Index bias_offset) {
auto bias_term = *reinterpret_cast<const vf*>(bias_addr + bias_offset);
return add_ps(input, bias_term);
}
CPU_ATTR static inline vd add_bias(vd input, const double* bias_addr, Index bias_offset) {
auto bias_term = *reinterpret_cast<const vd*>(bias_addr + bias_offset);
return add_pd(input, bias_term);
}
/*
* ReLU
*/
template <typename Type>
CPU_ATTR static inline vector_t<CPUType::CPU_NAME, Type> relu(vector_t<CPUType::CPU_NAME, Type> input);
template <>
CPU_ATTR inline vi relu<int8_t>(vi input) {
static const auto vconst_zero = set1_epi8<vi>(0);
#if defined(KERNELS_THIS_IS_SSE2)
return and_si(input, _mm_cmplt_epi8(vconst_zero, input));
#elif defined(KERNELS_THIS_IS_AVX2)
return _mm256_max_epi8(input, vconst_zero);
#else
return _mm512_max_epi8(input, vconst_zero);
#endif
}
template <>
CPU_ATTR inline vi relu<int16_t>(vi input) {
static const auto vconst_zero = set1_epi16<vi>(0);
return max_epi16(input, vconst_zero);
}
template <>
CPU_ATTR inline vi relu<int>(vi input) {
static const auto vconst_zero = set1_epi32<vi>(0);
#if defined(KERNELS_THIS_IS_SSE2)
return and_si(input, _mm_cmplt_epi32(vconst_zero, input));
#elif defined(KERNELS_THIS_IS_AVX2)
return _mm256_max_epi32(input, vconst_zero);
#else
return _mm512_max_epi32(input, vconst_zero);
#endif
}
template <>
CPU_ATTR inline vf relu<float>(vf input) {
static const auto vconst_zero = setzero_ps<vf>();
return max_ps(input, vconst_zero);
}
template <>
CPU_ATTR inline vd relu<double>(vd input) {
static const auto vconst_zero = setzero_pd<vd>();
return max_pd(input, vconst_zero);
}
/*
* Multiply (elemwise)
*/
template <typename Type>
CPU_ATTR static inline vector_t<CPUType::CPU_NAME, Type> multiply(vector_t<CPUType::CPU_NAME, Type> a, vector_t<CPUType::CPU_NAME, Type> b);
template <>
CPU_ATTR inline vi multiply<int8_t>(vi a, vi b) {
auto even = mullo_epi16(a, b);
auto odd = mullo_epi16(srli_epi16<8>(a), srli_epi16<8>(b));
return or_si(slli_epi16<8>(odd), srli_epi16<8>(slli_epi16<8>(even)));
}
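/* The int8 path above emulates a byte multiply with two 16-bit multiplies:
 * even-indexed bytes are multiplied in place, odd-indexed bytes after shifting
 * them into the low byte of each 16-bit lane; only the low byte of each product
 * is kept and the two halves are merged back together with shifts and an OR. */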
template <>
CPU_ATTR inline vi multiply<int16_t>(vi a, vi b) {
return mullo_epi16(a, b);
}
template <>
CPU_ATTR inline vi multiply<int>(vi a, vi b) {
#if defined(KERNELS_THIS_IS_SSE2)
auto even = mul_epu32(a, b);
auto odd = mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));
return unpacklo_epi32(_mm_shuffle_epi32(even, 0x8 /* = 0 0 2 0 */), _mm_shuffle_epi32(odd, 0x8 /* = 0 0 2 0 */));
#elif defined(KERNELS_THIS_IS_AVX2)
return _mm256_mullo_epi32(a, b);
#else
return _mm512_mullo_epi32(a, b);
#endif
}
template <>
CPU_ATTR inline vf multiply<float>(vf a, vf b) {
return mul_ps(a, b);
}
template <>
CPU_ATTR inline vd multiply<double>(vd a, vd b) {
return mul_pd(a, b);
}
/*
* Downcast
*/
CPU_ATTR static inline vi downcast32to8(vi input1, vi input2, vi input3, vi input4) {
auto result = packs_epi16(packs_epi32(input1, input2), packs_epi32(input3, input4));
#if defined(KERNELS_THIS_IS_SSE2)
return result;
#elif defined(KERNELS_THIS_IS_AVX2)
return _mm256_shuffle_epi32(_mm256_permute4x64_epi64(result, 0xd8 /* = 0 2 1 3 */), 0xd8 /* = 0 2 1 3 */);
#else
static const auto permutation_indices = _mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
return _mm512_castps_si512(_mm512_permutexvar_ps(permutation_indices, _mm512_castsi512_ps(result)));
#endif
}
CPU_ATTR static inline vi downcast32to16(vi input1, vi input2) {
auto result = packs_epi32(input1, input2);
#if defined(KERNELS_THIS_IS_SSE2)
return result;
#elif defined(KERNELS_THIS_IS_AVX2)
return _mm256_permute4x64_epi64(result, 0xd8 /* = 0 2 1 3 */);
#else
static const auto permutation_indices = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);
return _mm512_castpd_si512(_mm512_permutexvar_pd(permutation_indices, _mm512_castsi512_pd(result)));
#endif
}
CPU_ATTR static inline vi downcast16to8(vi input1, vi input2) {
auto result = packs_epi16(input1, input2);
#if defined(KERNELS_THIS_IS_SSE2)
return result;
#elif defined(KERNELS_THIS_IS_AVX2)
return _mm256_permute4x64_epi64(result, 0xd8 /* = 0 2 1 3 */);
#else
static const auto permutation_indices = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);
return _mm512_castpd_si512(_mm512_permutexvar_pd(permutation_indices, _mm512_castsi512_pd(result)));
#endif
}
/*
* Upcast
*/
CPU_ATTR static inline dvector_t<CPUType::CPU_NAME, int16_t> upcast8to16(vi input) {
static const auto vzero = set1_epi8<vi>(0);
#if defined(KERNELS_THIS_IS_SSE2)
auto higher_byte = _mm_cmpgt_epi8(vzero, input);
#elif defined(KERNELS_THIS_IS_AVX2)
input = _mm256_permute4x64_epi64(input, 0xd8 /* = 0 2 1 3 */);
auto higher_byte = _mm256_cmpgt_epi8(vzero, input);
#else
static const auto vmax_negative = set1_epi8<vi>(-1 /* 0xff */);
static const auto permutation_indices = _mm512_set_epi64(7, 3, 6, 2, 5, 1, 4, 0);
input = _mm512_castpd_si512(_mm512_permutexvar_pd(permutation_indices, _mm512_castsi512_pd(input)));
auto negatives = _mm512_cmp_epi8_mask(input, vzero, 1 /* _MM_CMPINT_LT */);
auto higher_byte = _mm512_mask_blend_epi8(negatives, vzero, vmax_negative);
#endif
return {
unpacklo_epi8(input, higher_byte),
unpackhi_epi8(input, higher_byte),
};
}
CPU_ATTR static inline dvector_t<CPUType::CPU_NAME, int> upcast16to32(vi input) {
static const auto vzero = set1_epi16<vi>(0);
#if defined(KERNELS_THIS_IS_SSE2)
auto higher_byte = _mm_cmpgt_epi16(vzero, input);
#elif defined(KERNELS_THIS_IS_AVX2)
input = _mm256_permute4x64_epi64(input, 0xd8 /* = 0 2 1 3 */);
auto higher_byte = _mm256_cmpgt_epi16(vzero, input);
#else
static const auto vmax_negative = set1_epi16<vi>(-1 /* 0xffff */);
static const auto permutation_indices = _mm512_set_epi64(7, 3, 6, 2, 5, 1, 4, 0);
input = _mm512_castpd_si512(_mm512_permutexvar_pd(permutation_indices, _mm512_castsi512_pd(input)));
auto negatives = _mm512_cmp_epi16_mask(input, vzero, 1 /* _MM_CMPINT_LT */);
auto higher_byte = _mm512_mask_blend_epi16(negatives, vzero, vmax_negative);
#endif
return {
unpacklo_epi16(input, higher_byte),
unpackhi_epi16(input, higher_byte),
};
}
CPU_ATTR static inline qvector_t<CPUType::CPU_NAME, int> upcast8to32(vi input) {
auto result16 = upcast8to16(input);
auto result32a = upcast16to32(result16.first);
auto result32b = upcast16to32(result16.second);
return {
result32a.first,
result32a.second,
result32b.first,
result32b.second,
};
}
/*
* Rescale int32
*/
CPU_ATTR static inline vi rescale(vi input, vf scale) {
return cvtps_epi32(mul_ps(cvtepi32_ps(input), scale));
}
/*
* Bitwise not
*/
CPU_ATTR static inline vi bitwise_not(vi v) {
return xor_si(v, set1_epi32<vi>(0xffffffff));
}
/*
* Floor
*/
CPU_ATTR static inline vf floor(vf input) {
#if defined(KERNELS_THIS_IS_SSE2)
static const auto vconst_zero = setzero_ps<vf>();
static const auto vconst_one = set1_ps<vf>(1.f);
auto result = cvtepi32_ps(cvttps_epi32(input));
auto negatives = _mm_cmplt_ps(input, vconst_zero);
auto nonintegers = _mm_cmpneq_ps(input, result);
return sub_ps(result, and_ps(vconst_one, and_ps(negatives, nonintegers)));
#elif defined(KERNELS_THIS_IS_AVX2)
return _mm256_floor_ps(input);
#else
// TODO: It should work but the compiler throws the error "incorrect rounding operand"
// return _mm512_roundscale_round_ps(input, 0, _MM_FROUND_FLOOR);
static const auto vconst_zero = setzero_ps<vf>();
static const auto vconst_one = set1_ps<vf>(1.f);
auto result = cvtepi32_ps(cvttps_epi32(input));
auto negatives = _mm512_cmp_ps_mask(input, vconst_zero, _CMP_LT_OQ);
auto nonintegers = _mm512_cmp_ps_mask(input, result, _CMP_NEQ_OQ);
return _mm512_mask_blend_ps(_mm512_kand(negatives, nonintegers), result, sub_ps(result, vconst_one));
#endif
}
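/* Why the SSE2/AVX512 emulation subtracts 1: cvttps truncates toward zero, so
 * for a negative non-integer such as -1.5 the truncated value is -1.0; the
 * input is negative and differs from the truncation, so 1 is subtracted to get
 * the true floor of -2.0. Non-negative or integral inputs are left unchanged. */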
/*
* Calculate approximation of e^x using Taylor series and lookup table
*/
#if defined(KERNELS_THIS_IS_SSE2)
CPU_ATTR static inline vf exp_approx_taylor(vf) {
std::abort();
}
#else
CPU_ATTR static inline vf exp_approx_taylor(vf x) {
static constexpr int EXP_MIN = -20;
static constexpr int EXP_MAX = 20;
static constexpr float EXP_LOOKUP[EXP_MAX - EXP_MIN + 1] = {
expif(-20), expif(-19), expif(-18), expif(-17), expif(-16), expif(-15),
expif(-14), expif(-13), expif(-12), expif(-11), expif(-10), expif(-9),
expif(-8), expif(-7), expif(-6), expif(-5), expif(-4), expif(-3), expif(-2),
expif(-1), expif(0), expif(1), expif(2), expif(3), expif(4), expif(5),
expif(6), expif(7), expif(8), expif(9), expif(10), expif(11), expif(12),
expif(13), expif(14), expif(15), expif(16), expif(17), expif(18), expif(19),
expif(20),
};
static const vf dividers[] = {
set1_ps<vf>(1.f / factorial(7)),
set1_ps<vf>(1.f / factorial(6)),
set1_ps<vf>(1.f / factorial(5)),
set1_ps<vf>(1.f / factorial(4)),
set1_ps<vf>(1.f / factorial(3)),
set1_ps<vf>(1.f / factorial(2)),
set1_ps<vf>(1.f / factorial(1)),
};
static const auto const_one = set1_ps<vf>(1.f);
static const auto const_min_x = set1_ps<vf>(EXP_MIN);
static const auto const_max_x = set1_ps<vf>(EXP_MAX);
x = max_ps(x, const_min_x);
x = min_ps(x, const_max_x);
auto a = floor(x);
auto xa = sub_ps(x, a);
auto result = mul_ps(dividers[0], xa);
result = add_ps(result, dividers[1]);
result = mul_ps(result, xa);
result = add_ps(result, dividers[2]);
result = mul_ps(result, xa);
result = add_ps(result, dividers[3]);
result = mul_ps(result, xa);
result = add_ps(result, dividers[4]);
result = mul_ps(result, xa);
result = add_ps(result, dividers[5]);
result = mul_ps(result, xa);
result = add_ps(result, dividers[6]);
result = mul_ps(result, xa);
result = add_ps(result, const_one);
auto ea = i32gather_ps<4>(EXP_LOOKUP + EXP_MAX, cvtps_epi32(a));
return mul_ps(ea, result);
}
#endif
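/* Sketch of the math above: x is clamped to [EXP_MIN, EXP_MAX] and split as
 * x = a + r with a = floor(x) and r in [0, 1). e^r is approximated by the
 * degree-7 Taylor polynomial 1 + r + r^2/2! + ... + r^7/7!, evaluated by
 * Horner's rule (the dividers array holds the reciprocal factorials), and the
 * result is scaled by e^a gathered from EXP_LOOKUP + EXP_MAX, so a = EXP_MIN
 * maps to element 0 of the table. */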
/*
* Sigmoid
*/
CPU_ATTR static inline vf sigmoid(vf
#ifndef KERNELS_THIS_IS_SSE2
input
#endif
) {
#if defined(KERNELS_THIS_IS_SSE2)
std::abort(); // TODO: missing exp_approx_taylor for SSE2
#elif defined(KERNELS_THIS_IS_AVX2)
static const auto vconst_zero = setzero_ps<vf>();
static const auto vconst_one = set1_ps<vf>(1.f);
auto x = input;
auto minus_x = sub_ps(vconst_zero, x);
auto e_x = exp_approx_taylor(x);
auto e_minus_x = exp_approx_taylor(minus_x);
auto sigmoid_case1 = _mm256_rcp_ps(add_ps(vconst_one, e_minus_x));
auto sigmoid_case2 = mul_ps(e_x, _mm256_rcp_ps(add_ps(vconst_one, e_x)));
auto nonnegative_x_mask = _mm256_cmp_ps(vconst_zero, x, _CMP_LT_OS);
return _mm256_blendv_ps(sigmoid_case1, sigmoid_case2, nonnegative_x_mask);
#else
static const auto vconst_zero = setzero_ps<vf>();
static const auto vconst_one = set1_ps<vf>(1.f);
auto x = input;
auto minus_x = sub_ps(vconst_zero, x);
auto e_x = exp_approx_taylor(x);
auto e_minus_x = exp_approx_taylor(minus_x);
auto sigmoid_case1 = _mm512_rcp14_ps(add_ps(vconst_one, e_minus_x));
auto sigmoid_case2 = mul_ps(e_x, _mm512_rcp14_ps(add_ps(vconst_one, e_x)));
auto nonnegative_x_mask = _mm512_cmp_ps_mask(vconst_zero, x, _CMP_LT_OS);
return _mm512_mask_blend_ps(nonnegative_x_mask, sigmoid_case1, sigmoid_case2);
#endif
}
/*
* Tanh
*/
#if defined(KERNELS_THIS_IS_SSE2)
CPU_ATTR static inline vf tanh(vf) {
std::abort(); // TODO: missing exp_approx_taylor for SSE2
}
#else
CPU_ATTR static inline vf tanh(vf input) {
const static auto vconst_zero = setzero_ps<vf>();
auto e_x = exp_approx_taylor(input);
auto e_minus_x = exp_approx_taylor(sub_ps(vconst_zero, input));
return div_ps(sub_ps(e_x, e_minus_x), add_ps(e_x, e_minus_x));
}
#endif
}
}
#undef CPU_NAME
#undef CPU_ATTR
#undef vi
#undef vf
#undef vd

626
third_party/intgemm/intgemm/multiply.h vendored
View file

@ -1,626 +0,0 @@
#pragma once
#include "intgemm/intgemm_config.h"
#include "interleave.h"
#include "intrinsics.h"
#include "vec_traits.h"
#include "callbacks.h"
namespace intgemm {
INTGEMM_SSE2 static inline dvector_t<CPUType::SSE2, int> PermuteSummer(__m128i pack0123, __m128i pack4567) {
// No op for 128 bits: already reduced fully.
return { pack0123, pack4567 };
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
INTGEMM_AVX2 static inline __m256i PermuteSummer(__m256i pack0123, __m256i pack4567) {
// This instruction generates 1s 2s 3s 4s 5f 6f 7f 8f
__m256i rev = _mm256_permute2f128_si256(pack0123, pack4567, 0x21);
// This instruction generates 1f 2f 3f 4f 5s 6s 7s 8s
__m256i blended = _mm256_blend_epi32(pack0123, pack4567, 0xf0);
return _mm256_add_epi32(rev, blended);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
/* Only INTGEMM_AVX512F is necessary but due to a GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_AVX512BW static inline __m256i PermuteSummer(__m512i pack0123, __m512i pack4567) {
// Form [0th 128-bit register of pack0123, 0th 128-bit register of pack4567, 2nd 128-bit register of pack0123, 2nd 128-bit register of pack4567]
__m512i mix0 = _mm512_mask_permutex_epi64(pack0123, 0xcc, pack4567, (0 << 4) | (1 << 6));
// Form [1st 128-bit register of pack0123, 1st 128-bit register of pack4567, 3rd 128-bit register of pack0123, 3rd 128-bit register of pack4567]
__m512i mix1 = _mm512_mask_permutex_epi64(pack4567, 0x33, pack0123, 2 | (3 << 2));
__m512i added = _mm512_add_epi32(mix0, mix1);
// Now we have 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7.
// Fold register over itself.
return _mm256_add_epi32(_mm512_castsi512_si256(added), _mm512_extracti64x4_epi64(added, 1));
}
#endif
#ifdef _MSC_VER
#define INTGEMM_OMP_FOR __pragma(omp for)
#define INTGEMM_OMP_PARALLEL __pragma(omp parallel)
#else
#define INTGEMM_OMP_FOR _Pragma("omp for")
#define INTGEMM_OMP_PARALLEL _Pragma("omp parallel")
#endif
// Quantize function used for SSSE3 and AVX2.
// Separate function for the thread body, to work around a gcc 7 bug that doesn't imbue
// target attributes across #pragma omp parallel.
#define INTGEMM_QUANTIZE_THREAD(target) \
target static void QuantizeThread(const float *input, int8_t *output, float quant_mult, std::size_t count) { \
FRegister q = set1_ps<FRegister>(quant_mult); \
INTGEMM_OMP_FOR \
for (std::size_t i = 0; i < count; i += sizeof(Register)) { \
*reinterpret_cast<Register*>(output + i) = QuantizeTile8::Consecutive(q, input + i); \
} \
}
#define INTGEMM_QUANTIZE(target) \
target static void Quantize(const float *const input, int8_t *const output, float quant_mult, Index size) { \
assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \
assert(reinterpret_cast<uintptr_t>(output) % sizeof(Register) == 0); \
const std::size_t kBatch = sizeof(Register); \
const std::size_t fast_end = size & ~(kBatch - 1); \
INTGEMM_OMP_PARALLEL \
{ \
QuantizeThread(input, output, quant_mult, fast_end); \
} \
std::size_t overhang = size & (kBatch - 1); \
if (!overhang) return; \
FRegister q = set1_ps<FRegister>(quant_mult); \
/* Each input pointer covers sizeof(Register) / 4 == kBatch / 4 floats at a time.
* If we're allowed to read one of them, then we can read the whole register. */ \
const float *inputs[4]; \
std::size_t i; \
for (i = 0; i < (overhang + (kBatch / 4) - 1) / (kBatch / 4); ++i) { \
inputs[i] = &input[fast_end + i * (kBatch / 4)]; \
} \
/* These will be clipped off. */ \
for (; i < 4; ++i) { \
inputs[i] = &input[fast_end]; \
} \
Register result = QuantizeTile8::Tile(q, inputs[0], inputs[1], inputs[2], inputs[3]); \
std::memcpy(output + (size & ~(kBatch - 1)), &result, overhang); \
}
/* Take 4 registers with 32-bit values to be horizontally added. Reduce them
* to one register with 32-bit values in the pattern 1 2 3 4 1 2 3 4, leaving
* the final addition (which crosses 128-bit lanes) to the caller.
*/
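/* Worked example for one 128-bit lane: with sum0 = [a0 a1 a2 a3],
 * sum1 = [b0 b1 b2 b3], sum2 = [c0 c1 c2 c3], sum3 = [d0 d1 d2 d3], the
 * interleave/add steps pair the registers up so the result is
 * [a0+a1+a2+a3, b0+b1+b2+b3, c0+c1+c2+c3, d0+d1+d2+d3], i.e. one horizontal
 * sum per input register, repeated per lane for the wider register types. */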
#define INTGEMM_PACK0123(target, Register) \
target inline Register Pack0123(Register sum0, Register sum1, Register sum2, Register sum3) { \
Interleave32(sum0, sum1); \
Register pack01 = add_epi32(sum0, sum1); \
Interleave32(sum2, sum3); \
Register pack23 = add_epi32(sum2, sum3); \
Interleave64(pack01, pack23); \
return add_epi32(pack01, pack23); \
} \
INTGEMM_PACK0123(INTGEMM_SSE2, __m128i)
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
INTGEMM_PACK0123(INTGEMM_AVX2, __m256i)
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
/* Only INTGEMM_AVX512F is necessary but due to a GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_PACK0123(INTGEMM_AVX512BW, __m512i)
#endif
template <typename Callback>
INTGEMM_SSE2 static inline void RunCallback(Callback& callback_impl, dvector_t<CPUType::SSE2, int> total, Index row_idx, Index col_idx, Index rows, Index cols) {
callback_impl.Run(total.first, callbacks::OutputBufferInfo(row_idx, col_idx, rows, cols));
callback_impl.Run(total.second, callbacks::OutputBufferInfo(row_idx, col_idx + 4, rows, cols));
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
template <typename Callback>
INTGEMM_AVX2 static inline void RunCallback(Callback& callback_impl, vector_t<CPUType::AVX2, int> total, Index row_idx, Index col_idx, Index rows, Index cols) {
callback_impl.Run(total, callbacks::OutputBufferInfo(row_idx, col_idx, rows, cols));
}
#endif
// 16-bit multiplier for INTGEMM_SSE2, INTGEMM_AVX2, and AVX512.
// C = A * B * unquant_mult
//
// This has been substantially revised from Jacob Devlin's SSE code which is:
// Copyright (c) 2017 Microsoft Corporation
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
// A is a row-major quantized matrix (from PrepareA)
// B is a rearranged quantized matrix (from PrepareB)
// C is output in row-major form.
//
// All of A, B, and C must be aligned to a multiple of the register size:
// INTGEMM_SSE2: 16 bytes
// INTGEMM_AVX2: 32 bytes
// AVX512: 64 bytes.
//
// A_rows can be anything non-negative.
// width must be a multiple of the register size.
// B_cols must be a multiple of 8.
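// For example, with the SSE2 16-bit kernel (16-byte registers) this means:
// A, B, and C aligned to 16 bytes, width a multiple of 8 (= 16 / sizeof(int16_t)),
// and B_cols a multiple of 8.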
// Multiply16
#define INTGEMM_MULTIPLY16(Register, target, cpu_type) \
template <typename Callback> target static void Multiply(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback) { \
assert(width % (sizeof(Register) / sizeof(int16_t)) == 0); \
assert(B_cols % 8 == 0); \
assert(reinterpret_cast<uintptr_t>(A) % sizeof(Register) == 0); \
assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0); \
const Index simd_width = width / (sizeof(Register) / sizeof(int16_t)); \
auto callback_impl = callbacks::CallbackImpl<cpu_type, Callback>(callback); \
INTGEMM_OMP_FOR \
for (Index B0_colidx = 0; B0_colidx < B_cols; B0_colidx += 8) { \
const Register *B0_col = reinterpret_cast<const Register *>(B) + simd_width * B0_colidx; \
/* Process one row of A at a time. Doesn't seem to be faster to do multiple rows of A at once.*/ \
for (Index A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) { \
const Register *A_row = reinterpret_cast<const Register*>(A + A_rowidx * width); \
/* These will be packed 32-bit integers containing sums for each row of B multiplied by the row of A. \
Iterate over shared (inner) dimension.*/ \
Index k = 0; \
Register a = *(A_row + k); \
Register sum0 = madd_epi16(a, *(B0_col + k * 8)); \
Register sum1 = madd_epi16(a, *(B0_col + k * 8 + 1)); \
Register sum2 = madd_epi16(a, *(B0_col + k * 8 + 2)); \
Register sum3 = madd_epi16(a, *(B0_col + k * 8 + 3)); \
Register sum4 = madd_epi16(a, *(B0_col + k * 8 + 4)); \
Register sum5 = madd_epi16(a, *(B0_col + k * 8 + 5)); \
Register sum6 = madd_epi16(a, *(B0_col + k * 8 + 6)); \
Register sum7 = madd_epi16(a, *(B0_col + k * 8 + 7)); \
for (k = 1; k < simd_width; ++k) { \
a = *(A_row + k); \
/* Multiply 16-bit, horizontally add to packed 32-bit integers.*/ \
Register mult0 = madd_epi16(a, *(B0_col + k * 8)); \
Register mult1 = madd_epi16(a, *(B0_col + k * 8 + 1)); \
Register mult2 = madd_epi16(a, *(B0_col + k * 8 + 2)); \
Register mult3 = madd_epi16(a, *(B0_col + k * 8 + 3)); \
Register mult4 = madd_epi16(a, *(B0_col + k * 8 + 4)); \
Register mult5 = madd_epi16(a, *(B0_col + k * 8 + 5)); \
Register mult6 = madd_epi16(a, *(B0_col + k * 8 + 6)); \
Register mult7 = madd_epi16(a, *(B0_col + k * 8 + 7)); \
/* Sum packed 32-bit integers with danger of overflow. TODO: accumulate in 64-bit every so often.*/ \
sum0 = add_epi32(sum0, mult0); \
sum1 = add_epi32(sum1, mult1); \
sum2 = add_epi32(sum2, mult2); \
sum3 = add_epi32(sum3, mult3); \
sum4 = add_epi32(sum4, mult4); \
sum5 = add_epi32(sum5, mult5); \
sum6 = add_epi32(sum6, mult6); \
sum7 = add_epi32(sum7, mult7); \
} \
/* Reduce sums within 128-bit lanes.*/ \
Register pack0123 = Pack0123(sum0, sum1, sum2, sum3); \
Register pack4567 = Pack0123(sum4, sum5, sum6, sum7); \
/*The specific implementation may need to reduce further.*/ \
auto total = PermuteSummer(pack0123, pack4567); \
RunCallback(callback_impl, total, A_rowidx, B0_colidx, A_rows, B_cols); \
} \
} \
} \
//An int8_prepbias version of the above code, using the add 127 technique
#define INTGEMM_PREPAREBIASFOR8(Register, target, cpu_type) \
template <class Callback> target static void PrepareBias(const int8_t *B, Index width, Index B_cols, Callback callback) { \
assert(width % (sizeof(Register) / sizeof(int8_t)) == 0); \
assert(B_cols % 8 == 0); \
assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0); \
const Index simd_width = width / (sizeof(Register) / sizeof(int8_t)); \
auto callback_impl = callbacks::CallbackImpl<cpu_type, Callback>(callback); \
const Register a = set1_epi8<Register>(1); \
INTGEMM_OMP_FOR \
for (Index B0_colidx = 0; B0_colidx < B_cols; B0_colidx += 8) { \
const Register *B0_col = reinterpret_cast<const Register *>(B) + simd_width * B0_colidx; \
/*const Register *A_row = reinterpret_cast<const Register*>(A + A_rowidx * width);*/ \
/* These will be packed 16-bit integers containing sums for each row of B multiplied by the row of A. \
Iterate over shared (inner) dimension.*/ \
Index k = 0; \
Register sum0 = maddubs_epi16(a, *(B0_col + k * 8)); \
Register sum1 = maddubs_epi16(a, *(B0_col + k * 8 + 1)); \
Register sum2 = maddubs_epi16(a, *(B0_col + k * 8 + 2)); \
Register sum3 = maddubs_epi16(a, *(B0_col + k * 8 + 3)); \
Register sum4 = maddubs_epi16(a, *(B0_col + k * 8 + 4)); \
Register sum5 = maddubs_epi16(a, *(B0_col + k * 8 + 5)); \
Register sum6 = maddubs_epi16(a, *(B0_col + k * 8 + 6)); \
Register sum7 = maddubs_epi16(a, *(B0_col + k * 8 + 7)); \
/* Upcast to 32-bit and horizontally add. Seems a bit faster if this is declared here.*/ \
Register ones = set1_epi16<Register>(1); \
sum0 = madd_epi16(sum0, ones); \
sum1 = madd_epi16(sum1, ones); \
sum2 = madd_epi16(sum2, ones); \
sum3 = madd_epi16(sum3, ones); \
sum4 = madd_epi16(sum4, ones); \
sum5 = madd_epi16(sum5, ones); \
sum6 = madd_epi16(sum6, ones); \
sum7 = madd_epi16(sum7, ones); \
for (k = 1; k < simd_width; ++k) { \
/*Register a = *(A_row + k);*/ \
/* Multiply 8-bit, horizontally add to packed 16-bit integers.*/ \
Register mult0 = maddubs_epi16(a, *(B0_col + k * 8)); \
Register mult1 = maddubs_epi16(a, *(B0_col + k * 8 + 1)); \
Register mult2 = maddubs_epi16(a, *(B0_col + k * 8 + 2)); \
Register mult3 = maddubs_epi16(a, *(B0_col + k * 8 + 3)); \
Register mult4 = maddubs_epi16(a, *(B0_col + k * 8 + 4)); \
Register mult5 = maddubs_epi16(a, *(B0_col + k * 8 + 5)); \
Register mult6 = maddubs_epi16(a, *(B0_col + k * 8 + 6)); \
Register mult7 = maddubs_epi16(a, *(B0_col + k * 8 + 7)); \
/* Upcast to 32-bit and horizontally add.*/ \
mult0 = madd_epi16(mult0, ones); \
mult1 = madd_epi16(mult1, ones); \
mult2 = madd_epi16(mult2, ones); \
mult3 = madd_epi16(mult3, ones); \
mult4 = madd_epi16(mult4, ones); \
mult5 = madd_epi16(mult5, ones); \
mult6 = madd_epi16(mult6, ones); \
mult7 = madd_epi16(mult7, ones); \
/*Add in 32bit*/ \
sum0 = add_epi32(sum0, mult0); \
sum1 = add_epi32(sum1, mult1); \
sum2 = add_epi32(sum2, mult2); \
sum3 = add_epi32(sum3, mult3); \
sum4 = add_epi32(sum4, mult4); \
sum5 = add_epi32(sum5, mult5); \
sum6 = add_epi32(sum6, mult6); \
sum7 = add_epi32(sum7, mult7); \
\
} \
/* Reduce sums within 128-bit lanes.*/ \
Register pack0123 = Pack0123(sum0, sum1, sum2, sum3); \
Register pack4567 = Pack0123(sum4, sum5, sum6, sum7); \
/*The specific implementation may need to reduce further.*/ \
auto total = PermuteSummer(pack0123, pack4567); \
RunCallback(callback_impl, total, 0, B0_colidx, 1, B_cols); \
} \
} \
//An int8 version of the above code, using the add 127 technique
#define INTGEMM_MULTIPLY8SHIFT(Register, target, cpu_type) \
template <class Callback> target static void Multiply8Shift(const uint8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) { \
assert(width % (sizeof(Register) / sizeof(int8_t)) == 0); \
assert(B_cols % 8 == 0); \
assert(reinterpret_cast<uintptr_t>(A) % sizeof(Register) == 0); \
assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0); \
const Index simd_width = width / (sizeof(Register) / sizeof(int8_t)); \
auto callback_impl = callbacks::CallbackImpl<cpu_type, Callback>(callback); \
INTGEMM_OMP_FOR \
for (Index B0_colidx = 0; B0_colidx < B_cols; B0_colidx += 8) { \
const Register *B0_col = reinterpret_cast<const Register *>(B) + simd_width * B0_colidx; \
/* Process one row of A at a time. Doesn't seem to be faster to do multiple rows of A at once.*/ \
for (Index A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) { \
const Register *A_row = reinterpret_cast<const Register*>(A + A_rowidx * width); \
/* These will be packed 16-bit integers containing sums for each row of B multiplied by the row of A. \
Iterate over shared (inner) dimension.*/ \
Index k = 0; \
Register a = *(A_row + k); \
Register sum0 = maddubs_epi16(a, *(B0_col + k * 8)); \
Register sum1 = maddubs_epi16(a, *(B0_col + k * 8 + 1)); \
Register sum2 = maddubs_epi16(a, *(B0_col + k * 8 + 2)); \
Register sum3 = maddubs_epi16(a, *(B0_col + k * 8 + 3)); \
Register sum4 = maddubs_epi16(a, *(B0_col + k * 8 + 4)); \
Register sum5 = maddubs_epi16(a, *(B0_col + k * 8 + 5)); \
Register sum6 = maddubs_epi16(a, *(B0_col + k * 8 + 6)); \
Register sum7 = maddubs_epi16(a, *(B0_col + k * 8 + 7)); \
/* Upcast to 32-bit and horizontally add. Seems a bit faster if this is declared here.*/ \
Register ones = set1_epi16<Register>(1); \
sum0 = madd_epi16(sum0, ones); \
sum1 = madd_epi16(sum1, ones); \
sum2 = madd_epi16(sum2, ones); \
sum3 = madd_epi16(sum3, ones); \
sum4 = madd_epi16(sum4, ones); \
sum5 = madd_epi16(sum5, ones); \
sum6 = madd_epi16(sum6, ones); \
sum7 = madd_epi16(sum7, ones); \
for (k = 1; k < simd_width; ++k) { \
a = *(A_row + k); \
/* Multiply 8-bit, horizontally add to packed 16-bit integers.*/ \
Register mult0 = maddubs_epi16(a, *(B0_col + k * 8)); \
Register mult1 = maddubs_epi16(a, *(B0_col + k * 8 + 1)); \
Register mult2 = maddubs_epi16(a, *(B0_col + k * 8 + 2)); \
Register mult3 = maddubs_epi16(a, *(B0_col + k * 8 + 3)); \
Register mult4 = maddubs_epi16(a, *(B0_col + k * 8 + 4)); \
Register mult5 = maddubs_epi16(a, *(B0_col + k * 8 + 5)); \
Register mult6 = maddubs_epi16(a, *(B0_col + k * 8 + 6)); \
Register mult7 = maddubs_epi16(a, *(B0_col + k * 8 + 7)); \
/* Upcast to 32-bit and horizontally add.*/ \
mult0 = madd_epi16(mult0, ones); \
mult1 = madd_epi16(mult1, ones); \
mult2 = madd_epi16(mult2, ones); \
mult3 = madd_epi16(mult3, ones); \
mult4 = madd_epi16(mult4, ones); \
mult5 = madd_epi16(mult5, ones); \
mult6 = madd_epi16(mult6, ones); \
mult7 = madd_epi16(mult7, ones); \
/*Add in 32bit*/ \
sum0 = add_epi32(sum0, mult0); \
sum1 = add_epi32(sum1, mult1); \
sum2 = add_epi32(sum2, mult2); \
sum3 = add_epi32(sum3, mult3); \
sum4 = add_epi32(sum4, mult4); \
sum5 = add_epi32(sum5, mult5); \
sum6 = add_epi32(sum6, mult6); \
sum7 = add_epi32(sum7, mult7); \
\
} \
/* Reduce sums within 128-bit lanes.*/ \
Register pack0123 = Pack0123(sum0, sum1, sum2, sum3); \
Register pack4567 = Pack0123(sum4, sum5, sum6, sum7); \
/*The specific implementation may need to reduce further.*/ \
auto total = PermuteSummer(pack0123, pack4567); \
RunCallback(callback_impl, total, A_rowidx, B0_colidx, A_rows, B_cols); \
} \
} \
} \
/* 8-bit matrix multiply used by AVX and AVX2.
* These have three peculiar properties:
* 1. The sign instructions don't exist in AVX512.
* 2. 16 registers means gcc's register allocation failed so I wrote it in my
* own asm.
* 3. They support 3-argument vpsignb and vpmaddubsw.
*
* Fun fact: AVX introduced the three-argument vpsignb and vpmaddubsw but only
* for 128-bit, despite the primary change in AVX being the addition of
* 256-bit. We had to wait for INTGEMM_AVX2 to get 256-bit versions of vpsignb and
* vpmaddubsw. That's why this code is generic over 128-bit or 256-bit.
*/
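/* The sign trick in numbers: vpmaddubsw multiplies an unsigned byte by a signed
 * byte, so a signed*signed product a*b is computed as |a| * sign(b, a).
 * For a = -3, b = 5: |a| = 3, sign(5, -3) = -5, and 3 * -5 = -15 = a * b.
 * For a = 0 the sign instruction zeroes b, so the product is 0 either way. */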
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
INTGEMM_AVX2 inline static void InnerINTGEMM_AVX2(
__m256i a, const __m256i *b,
__m256i &sum0, __m256i &sum1, __m256i &sum2, __m256i &sum3,
__m256i &sum4, __m256i &sum5, __m256i &sum6, __m256i &sum7) {
// Annoyingly the only 8-bit multiply is signed * unsigned (maddubs).
// So we take the sign bits off of a and apply them to each b in a * b.
//
// We have only 16 YMM registers but we want to store:
// 1 for a (or |a|)
// 8 temporaries for applying sign to each column of B.
// 8 sums.
#if defined(__GNUC__) && !defined(__clang__)
// Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94663
// gcc's register allocator does:
// 1 for a, do all the sign application, then overwrite with |a|
// 8 temporaries
// 7 sums in registers + 1 on the stack
//
// But it's possible to complete an operation early, freeing up its
// temporary register for reuse. But completing an operation early
// requires us to have |a| for vpmaddubsw while completing the later
// operation needs a again to apply sign.
//
// So we do two columns, 0 and 1, early. This allows b0_b6 and b1_b7
// to be reused by columns 6 and 7, respectively. And there's enough
// registers to store both a and |a|.
//
// These are the temporary variables used to process each column of b.
// We let the compiler choose which register number is which, but force
// it to allocate all registers.
__m256i absa;
__m256i b0_b6, b1_b7, b2, b3, b4, b5;
// Maybe this will tell gcc that we're accessing 8 registers starting
// at B_live. Though I doubt it because we're passing the address as a
// register.
typedef struct { __m256i x[8]; } B_range;
asm(
// Copy the first 6 columns of b to registers. We assume B has
// been rearranged so that these 8 columns are consecutive.
// vpsignb does not take a memory address as its second argument,
// so this can't be inlined into vsignb.
"vmovdqa (%[B]), %[b0_b6]\n"
"vmovdqa %c[size](%[B]), %[b1_b7]\n"
// These multiplies are executed by the assembler, not by the CPU
// at run time.
// I would have liked to just initialize b2 etc above but that
// would make it an input argument "+x" instead of "=&x". And +x
// counts as two operands for purposes of gcc's annoying 30-operand
// limit.
"vmovdqa 2*%c[size](%[B]), %[b2]\n"
"vmovdqa 3*%c[size](%[B]), %[b3]\n"
"vmovdqa 4*%c[size](%[B]), %[b4]\n"
"vmovdqa 5*%c[size](%[B]), %[b5]\n"
// Store the absolute value of a in absa.
"vpabsb %[a], %[absa]\n"
// If a byte of a is negative, negate the corresponding byte in
// b0_b6 etc.
"vpsignb %[a], %[b0_b6], %[b0_b6]\n"
"vpsignb %[a], %[b1_b7], %[b1_b7]\n"
// Multiply signed * unsigned then horizontally add to form packed
// 16-bit integers:
// b0[0] * |a|[0] + b0[1] * |a|[1], b0[2] * |a|[2] + b0[3] * |a|[3], ...
"vpmaddubsw %[b0_b6], %[absa], %[b0_b6]\n"
"vpmaddubsw %[b1_b7], %[absa], %[b1_b7]\n"
// vpmaddubsw has latency 5 so work on some other sign bits while
// we're at it.
"vpsignb %[a], %[b2], %[b2]\n"
"vpsignb %[a], %[b3], %[b3]\n"
"vpsignb %[a], %[b4], %[b4]\n"
"vpsignb %[a], %[b5], %[b5]\n"
// Perform a 16-bit add with saturation to accumulate sums.
"vpaddsw %[b0_b6], %[sum0], %[sum0]\n"
// Now we can reuse b0_b6 for b6
"vmovdqa 6*%c[size](%[B]), %[b0_b6]\n"
"vpaddsw %[b1_b7], %[sum1], %[sum1]\n"
// Now we can reuse b1_b7 for b7
"vmovdqa 7*%c[size](%[B]), %[b1_b7]\n"
// More crunching while the load happens.
"vpmaddubsw %[b2], %[absa], %[b2]\n"
"vpmaddubsw %[b3], %[absa], %[b3]\n"
"vpmaddubsw %[b4], %[absa], %[b4]\n"
"vpsignb %[a], %[b0_b6], %[b0_b6]\n"
"vpsignb %[a], %[b1_b7], %[b1_b7]\n"
"vpmaddubsw %[b5], %[absa], %[b5]\n"
"vpmaddubsw %[b0_b6], %[absa], %[b0_b6]\n"
"vpmaddubsw %[b1_b7], %[absa], %[b1_b7]\n"
"vpaddsw %[b2], %[sum2], %[sum2]\n"
"vpaddsw %[b3], %[sum3], %[sum3]\n"
"vpaddsw %[b4], %[sum4], %[sum4]\n"
"vpaddsw %[b5], %[sum5], %[sum5]\n"
"vpaddsw %[b0_b6], %[sum6], %[sum6]\n"
"vpaddsw %[b1_b7], %[sum7], %[sum7]\n"
: [sum0] "+x" (sum0),
[sum1] "+x" (sum1),
[sum2] "+x" (sum2),
[sum3] "+x" (sum3),
[sum4] "+x" (sum4),
[sum5] "+x" (sum5),
[sum6] "+x" (sum6),
[sum7] "+x" (sum7),
[b0_b6] "=&x" (b0_b6),
[b1_b7] "=&x" (b1_b7),
[b2] "=&x" (b2),
[b3] "=&x" (b3),
[b4] "=&x" (b4),
[b5] "=&x" (b5),
[absa] "=&x" (absa)
:
// I would like to use m here but that non-deterministically
// chooses %(eax) or -256$(eax) and there's no way to add to that
// memory address:
// https://gcc.gnu.org/ml/gcc-help/2011-04/msg00518.html
//
[B] "r" (reinterpret_cast<const B_range*>(b)),
[a] "x" (a),
[size] "i" (sizeof(__m256i))
);
#else
// https://bugs.llvm.org/show_bug.cgi?id=41482
// clang has a bug: target attribute avx2 doesn't allow inline assembly with
// +x for YMM registers. For example, this will not compile with default
// arguments:
// __attribute__ ((target ("avx2"))) void Foo(__m256i sum0) {
// asm("" : [sum0] "+x" (sum0));
// }
// but it will compile with -mavx2.
// However, clang does allow intrinsics and has a better register allocator
// than gcc. So here we just use intrinsics.
__m256i a_positive = abs_epi8(a);
sum0 = adds_epi16(sum0, maddubs_epi16(a_positive, sign_epi8(b[0], a)));
sum1 = adds_epi16(sum1, maddubs_epi16(a_positive, sign_epi8(b[1], a)));
sum2 = adds_epi16(sum2, maddubs_epi16(a_positive, sign_epi8(b[2], a)));
sum3 = adds_epi16(sum3, maddubs_epi16(a_positive, sign_epi8(b[3], a)));
sum4 = adds_epi16(sum4, maddubs_epi16(a_positive, sign_epi8(b[4], a)));
sum5 = adds_epi16(sum5, maddubs_epi16(a_positive, sign_epi8(b[5], a)));
sum6 = adds_epi16(sum6, maddubs_epi16(a_positive, sign_epi8(b[6], a)));
sum7 = adds_epi16(sum7, maddubs_epi16(a_positive, sign_epi8(b[7], a)));
#endif
}
#endif
// For INTGEMM_SSSE3 without AVX
INTGEMM_SSSE3 inline static void InnerINTGEMM_SSSE3(
__m128i a, const __m128i *b,
__m128i &sum0, __m128i &sum1, __m128i &sum2, __m128i &sum3,
__m128i &sum4, __m128i &sum5, __m128i &sum6, __m128i &sum7) {
__m128i a_positive = abs_epi8(a);
sum0 = adds_epi16(sum0, maddubs_epi16(a_positive, sign_epi8(b[0], a)));
sum1 = adds_epi16(sum1, maddubs_epi16(a_positive, sign_epi8(b[1], a)));
sum2 = adds_epi16(sum2, maddubs_epi16(a_positive, sign_epi8(b[2], a)));
sum3 = adds_epi16(sum3, maddubs_epi16(a_positive, sign_epi8(b[3], a)));
sum4 = adds_epi16(sum4, maddubs_epi16(a_positive, sign_epi8(b[4], a)));
sum5 = adds_epi16(sum5, maddubs_epi16(a_positive, sign_epi8(b[5], a)));
sum6 = adds_epi16(sum6, maddubs_epi16(a_positive, sign_epi8(b[6], a)));
sum7 = adds_epi16(sum7, maddubs_epi16(a_positive, sign_epi8(b[7], a)));
}
//INTGEMM_AVX2 or INTGEMM_SSSE3 multiply
#define INTGEMM_MULTIPLY8(Register, target, cpu_type) \
template <typename Callback> target static void Multiply(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) { \
assert(width % sizeof(Register) == 0); \
assert(B_cols % 8 == 0); \
assert(reinterpret_cast<uintptr_t>(A) % sizeof(Register) == 0); \
assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0); \
const Index simd_width = width / sizeof(Register); \
auto callback_impl = callbacks::CallbackImpl<cpu_type, Callback>(callback); \
INTGEMM_OMP_FOR \
for (Index B0_colidx = 0; B0_colidx < B_cols; B0_colidx += 8) { \
const Register *B0_col = reinterpret_cast<const Register *>(B) + simd_width * B0_colidx; \
/*Process one row of A at a time. Doesn't seem to be faster to do multiple rows of A at once.*/ \
for (Index A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) { \
/*Iterate over shared (inner) dimension.*/ \
const Register *A_live = reinterpret_cast<const Register *>(A + A_rowidx * width); \
const Register *A_end = A_live + simd_width; \
const Register *B_live = B0_col; \
/* Rather than initializing as zeros and adding, just initialize the first.*/ \
Register a = *(A_live++); \
Register a_positive = abs_epi8(a); \
/* These will be packed 16-bit integers containing sums for each column of B multiplied by the row of A.*/ \
Register sum0 = maddubs_epi16(a_positive, sign_epi8(B_live[0], a)); \
Register sum1 = maddubs_epi16(a_positive, sign_epi8(B_live[1], a)); \
Register sum2 = maddubs_epi16(a_positive, sign_epi8(B_live[2], a)); \
Register sum3 = maddubs_epi16(a_positive, sign_epi8(B_live[3], a)); \
Register sum4 = maddubs_epi16(a_positive, sign_epi8(B_live[4], a)); \
Register sum5 = maddubs_epi16(a_positive, sign_epi8(B_live[5], a)); \
Register sum6 = maddubs_epi16(a_positive, sign_epi8(B_live[6], a)); \
Register sum7 = maddubs_epi16(a_positive, sign_epi8(B_live[7], a)); \
B_live += 8; \
/* Use A as the loop variable so the add can be done where gcc likes it for branch prediction.*/ \
for (; A_live != A_end; ++A_live, B_live += 8) { \
Inner##target(*A_live, B_live, sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7); \
} \
/* Convert 16-bit to 32-bit and add, not caring what parts are added.
* Implementations:
* 1. https://github.com/tesseract-ocr/tesseract/blob/master/src/arch/intsimdmatrixavx2.cpp#L67 under Apache license:
* This does a multiply by 1 and horizontal add:
* _mm512_madd_epi16(sum, _mm512_set1_epi16(1))
* Current fastest.
*
* 2. Signed extension and fold halves:
* sum = _mm512_add_epi32(
* _mm512_cvtepi16_epi32(_mm512_castsi512_si256(sum)),
* _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(sum, 1)));
*
* 3. Sign extend by abuse of bitshift, then add.
* sum = _mm512_add_epi32(
* _mm512_srai_epi32(_mm512_slli_epi32(sum, 16), 16),
* _mm512_srai_epi32(sum, 16));
*/ \
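/* Option 1 is used below: madd_epi16 against a vector of ones widens each 16-bit sum to 32 bits and adds adjacent pairs. */ \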
Register ones = set1_epi16<Register>(1); \
sum0 = madd_epi16(sum0, ones); \
sum1 = madd_epi16(sum1, ones); \
sum2 = madd_epi16(sum2, ones); \
sum3 = madd_epi16(sum3, ones); \
sum4 = madd_epi16(sum4, ones); \
sum5 = madd_epi16(sum5, ones); \
sum6 = madd_epi16(sum6, ones); \
sum7 = madd_epi16(sum7, ones); \
Register pack0123 = Pack0123(sum0, sum1, sum2, sum3); \
Register pack4567 = Pack0123(sum4, sum5, sum6, sum7); \
auto total = PermuteSummer(pack0123, pack4567); \
RunCallback(callback_impl, total, A_rowidx, B0_colidx, A_rows, B_cols); \
} \
} \
}
/* Wrap a multiply call in OMP parallelism. Here it launches threads then
* inside the implementation there is a pragma omp for. In gcc >= 8 these
* could have been the same but older compilers don't imbue target attributes
* on the hidden function created by pragma omp parallel.
*
* Also, gcc 7 is unable to deduce the function pointer type (for ChooseCPU) if
* I use typename Backend::Integer directly in the arguments. As a workaround,
* have a default template argument Integer then use that so it's resolved.
*/
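/* Illustrative call, with the backend and callback names assumed from the rest
 * of the library rather than checked here:
 *   OMPParallelWrap<callbacks::UnquantizeAndWrite, AVX2::Kernels16>(
 *       A_prepared, B_prepared, A_rows, width, B_cols,
 *       callbacks::UnquantizeAndWrite(unquant_mult, C));
 * This opens the parallel region here and lets the backend's Multiply spread
 * the column loop over the threads via its omp for. */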
template <class Callback, class Backend, class Integer = typename Backend::Integer> static inline void OMPParallelWrap(const Integer *A, const Integer *B, Index A_rows, Index width, Index B_cols, Callback callback) {
#pragma omp parallel
Backend::template Multiply<Callback>(A, B, A_rows, width, B_cols, callback);
}
template <class Callback, class Backend> static inline void OMPParallelWrap8Shift(const uint8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) {
#pragma omp parallel
Backend::template Multiply8Shift<Callback>(A, B, A_rows, width, B_cols, callback);
}
} // namespace intgemm

84
third_party/intgemm/intgemm/sse2_gemm.h vendored
View file

@ -1,84 +0,0 @@
#pragma once
#include "kernels.h"
#include "multiply.h"
#include "types.h"
#include <cstdint>
// 8 bit is in ssse3_gemm.h
namespace intgemm {
namespace SSE2 {
INTGEMM_SSE2 inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) {
return kernels::quantize(loadu_ps<__m128>(input), quant_mult_reg);
}
INTGEMM_SELECT_COL_B(INTGEMM_SSE2, __m128i)
class QuantizeTile16 {
public:
INTGEMM_SSE2 static inline Register Consecutive(__m128 mult_reg, const float *input) {
return Tile(mult_reg, input, input + 4);
}
INTGEMM_SSE2 static inline Register ConsecutiveWithWrapping(__m128 mult_reg, const float *input, Index cols_left, Index cols, Index row_step) {
return Tile(mult_reg,
input,
input + 4 + (cols_left <= 4 ? cols * (row_step - 1) : 0));
}
INTGEMM_SSE2 static inline Register ForReshape(__m128 mult_reg, const float *input, int) {
return Consecutive(mult_reg, input);
}
private:
INTGEMM_SSE2 static inline Register Tile(__m128 mult_reg, const float *input0, const float *input1) {
__m128i g0 = kernels::quantize(loadu_ps<__m128>(input0), mult_reg);
__m128i g1 = kernels::quantize(loadu_ps<__m128>(input1), mult_reg);
return _mm_packs_epi32(g0, g1);
}
};
// This should be pure SSE2 (and below).
struct Kernels16 {
typedef int16_t Integer;
// Currently A is prepared by quantization but this could theoretically change.
INTGEMM_SSE2 static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
Quantize(input, output, quant_mult, rows * cols);
}
INTGEMM_SSE2 static void Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
assert(size % 8 == 0);
assert(reinterpret_cast<uintptr_t>(input) % 16 == 0);
assert(reinterpret_cast<uintptr_t>(output) % 16 == 0);
FRegister q = set1_ps<FRegister>(quant_mult);
const float *end = input + size;
for (; input != end; input += 8, output += 8) {
*reinterpret_cast<__m128i*>(output) = QuantizeTile16::Consecutive(q, input);
}
}
// Tile size for B; B must be a multiple of this block size.
static const Index kBTileRow = 8;
static const Index kBTileCol = 8;
INTGEMM_PREPARE_B_16(INTGEMM_SSE2, QuantizeTile16)
INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_SSE2, int16_t)
INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_SSE2, QuantizeTile16, int16_t)
INTGEMM_SSE2 static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
//TODO #DEFINE
SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows * 2, cols_begin, cols_end);
}
INTGEMM_MULTIPLY16(__m128i, INTGEMM_SSE2, CPUType::SSE2)
constexpr static const char *const kName = "16-bit SSE2";
static const CPUType kUses = CPUType::SSE2;
};
} // namespace SSE2
} // namespace intgemm

154
third_party/intgemm/intgemm/ssse3_gemm.h vendored
View file

@ -1,154 +0,0 @@
#pragma once
#include "interleave.h"
#include "kernels.h"
#include "multiply.h"
#include "types.h"
#include <cstdint>
#include <cstring>
// 16-bit is in sse2_gemm.h
namespace intgemm {
namespace SSSE3 {
INTGEMM_SSSE3 inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) {
return kernels::quantize(loadu_ps<__m128>(input), quant_mult_reg);
}
INTGEMM_SELECT_COL_B(INTGEMM_SSSE3, __m128i)
class QuantizeTile8 {
public:
INTGEMM_SSSE3 static inline Register ForReshape(FRegister mult_reg, const float *input, Index cols) {
// Skip a row.
return Tile(mult_reg, input, input + 4, input + 2 * cols, input + 2 * cols + 4);
}
INTGEMM_SSSE3 static inline Register Consecutive(FRegister mult_reg, const float *input) {
return Tile(mult_reg, input, input + 4, input + 8, input + 12);
}
INTGEMM_SSSE3 static inline Register ConsecutiveU(FRegister mult_reg, const float *input) {
return TileU(mult_reg, input, input + 4, input + 8, input + 12);
}
INTGEMM_SSSE3 static inline Register ConsecutiveWithWrapping(FRegister mult_reg, const float *input, Index cols_left, Index cols, Index row_step) {
const float* inputs[4];
for (Index i = 0; i < sizeof(inputs) / sizeof(inputs[0]); ++i) {
while (cols_left < sizeof(Register) / sizeof(float)) {
input += cols * (row_step - 1);
cols_left += cols;
}
inputs[i] = input;
input += sizeof(Register) / sizeof(float);
cols_left -= sizeof(Register) / sizeof(float);
}
return Tile(mult_reg, inputs[0], inputs[1], inputs[2], inputs[3]);
}
// Quantize 16xfloat into 16xint8_t
INTGEMM_SSSE3 static inline __m128i Tile(FRegister mult_reg, const float *input0, const float *input1, const float *input2, const float *input3) {
const __m128i neg128 = _mm_set1_epi8(-128);
__m128i g0 = QuantizerGrab(input0, mult_reg);
__m128i g1 = QuantizerGrab(input1, mult_reg);
__m128i g2 = QuantizerGrab(input2, mult_reg);
__m128i g3 = QuantizerGrab(input3, mult_reg);
__m128i packed0 = _mm_packs_epi32(g0, g1);
__m128i packed1 = _mm_packs_epi32(g2, g3);
__m128i packed = _mm_packs_epi16(packed0, packed1);
/* Ban -128.
* Don't use the SSE4.1 instruction _mm_max_epi8(packed, neg127). Instead,
* use SSE2 instructions _mm_cmpeq_epi8 and _mm_sub_epi8.
* The first generates 0xff for fields -128.
* The second subtracts 0xff from -128 which has the effect of converting
* to -127.
*/
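// Concretely: _mm_cmpeq_epi8 yields 0xff (i.e. -1) only in lanes equal to -128,
// and subtracting -1 bumps exactly those lanes to -127, leaving all other lanes
// untouched.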
// packed = _mm_max_epi8(packed, neg127);
__m128i evils = _mm_cmpeq_epi8(packed, neg128);
return _mm_sub_epi8(packed, evils);
// No permute needed. packs is in order for SSE.
}
private:
INTGEMM_SSSE3 static inline __m128i TileU(FRegister mult_reg, const float *input0, const float *input1, const float *input2, const float *input3) {
const __m128i neg128 = _mm_set1_epi8(-128);
const __m128i pos127 = _mm_set1_epi8(127);
__m128i g0 = QuantizerGrab(input0, mult_reg);
__m128i g1 = QuantizerGrab(input1, mult_reg);
__m128i g2 = QuantizerGrab(input2, mult_reg);
__m128i g3 = QuantizerGrab(input3, mult_reg);
__m128i packed0 = _mm_packs_epi32(g0, g1);
__m128i packed1 = _mm_packs_epi32(g2, g3);
__m128i packed = _mm_packs_epi16(packed0, packed1);
/* Ban -128.
* Don't use the SSE4.1 instruction _mm_max_epi8(packed, neg127). Instead,
* use SSE2 instructions _mm_cmpeq_epi8 and _mm_sub_epi8.
* The first generates 0xff for fields -128.
* The second subtracts 0xff from -128 which has the effect of converting
* to -127.
*/
// packed = _mm_max_epi8(packed, neg127);
__m128i evils = _mm_cmpeq_epi8(packed, neg128);
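// Adding 127 shifts the signed result from [-127, 127] into the unsigned range
// [0, 254] used by the add-127 path (PrepareA for uint8_t below).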
return _mm_add_epi8(_mm_sub_epi8(packed, evils), pos127);
// No permute needed. packs is in order for SSE.
}
};
// pmaddubsw (the 8-bit multiply) is SSSE3, so pedantically that's the version we need.
struct Kernels8 {
typedef int8_t Integer;
// Currently A is prepared by quantization but this could theoretically change.
INTGEMM_SSSE3 static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
Quantize(input, output, quant_mult, rows * cols);
}
private:
INTGEMM_QUANTIZE_THREAD(INTGEMM_SSSE3)
public:
INTGEMM_QUANTIZE(INTGEMM_SSSE3)
// Version with unsigned int + 127
// Currently A is prepared by quantization but this could theoretically change.
INTGEMM_SSSE3 static inline void PrepareA(const float *input, uint8_t *output, float quant_mult, Index rows, Index cols) {
QuantizeU(input, output, quant_mult, rows * cols);
}
INTGEMM_SSSE3 static void QuantizeU(const float *input, uint8_t *output, float quant_mult, Index size) {
assert(size % 16 == 0);
assert(reinterpret_cast<uintptr_t>(input) % 16 == 0);
assert(reinterpret_cast<uintptr_t>(output) % 16 == 0);
FRegister q = set1_ps<FRegister>(quant_mult);
const float *end = input + size;
for (; input != end; input += 16, output += 16) {
*reinterpret_cast<__m128i*>(output) = QuantizeTile8::ConsecutiveU(q, input);
}
}
// Tile size for B; B must be a multiple of this block size.
static const Index kBTileRow = 16;
static const Index kBTileCol = 8;
INTGEMM_PREPARE_B_8(INTGEMM_SSSE3, SSSE3::QuantizeTile8)
INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_SSSE3, int8_t)
INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_SSSE3, QuantizeTile8, int8_t)
INTGEMM_SSSE3 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
SSSE3::SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows, cols_begin, cols_end);
}
INTGEMM_MULTIPLY8(__m128i, INTGEMM_SSSE3, CPUType::SSE2)
INTGEMM_MULTIPLY8SHIFT(__m128i, INTGEMM_SSSE3, CPUType::SSE2)
INTGEMM_PREPAREBIASFOR8(__m128i, INTGEMM_SSSE3, CPUType::SSE2)
constexpr static const char *const kName = "8-bit SSSE3";
static const CPUType kUses = CPUType::SSSE3;
};
} // namespace SSSE3
} // namespace intgemm
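An aside on the -128 remapping in QuantizeTile8::Tile and TileU above: the saturating packs can emit -128, which the pmaddubsw-based kernels must avoid (TileU then shifts into unsigned range by adding 127), and the compare-and-subtract pair is an SSE2-only way to clamp that one value. Below is a minimal scalar sketch of the same remapping; the function name ban_neg128 is ours, for illustration only.

#include <cassert>
#include <cstdint>

// Scalar model of the SSE2 trick: _mm_cmpeq_epi8 yields 0xff (-1) in every
// lane equal to -128, and subtracting that -1 bumps those lanes to -127
// while leaving all other lanes untouched.
static int8_t ban_neg128(int8_t x) {
  int8_t is_neg128 = (x == -128) ? -1 : 0;   // models _mm_cmpeq_epi8(packed, neg128)
  return static_cast<int8_t>(x - is_neg128); // models _mm_sub_epi8(packed, evils)
}

int main() {
  assert(ban_neg128(-128) == -127);
  assert(ban_neg128(-127) == -127);
  assert(ban_neg128(0) == 0);
  assert(ban_neg128(127) == 127);
  return 0;
}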

76
third_party/intgemm/intgemm/stats.h vendored

@ -1,76 +0,0 @@
#pragma once
#include <cmath>
#include "intrinsics.h"
#ifdef _OPENMP
#include <omp.h>
#endif
namespace intgemm {
/* Horizontal max and sums. TODO make a template argument? */
INTGEMM_SSE2 static inline float MaxFloat32(__m128 a) {
// Fold to just using the first 64 bits.
__m128 second_half = _mm_shuffle_ps(a, a, 3 * 4 + 2);
a = _mm_max_ps(a, second_half);
// Fold to just using the first 32 bits.
second_half = _mm_shuffle_ps(a, a, 1);
a = _mm_max_ps(a, second_half);
// This casting compiles to nothing.
return *reinterpret_cast<float*>(&a);
}
INTGEMM_SSE2 static inline float AddFloat32(__m128 a) {
// Fold to just using the first 64 bits.
__m128 second_half = _mm_shuffle_ps(a, a, 3 * 4 + 2);
a = _mm_add_ps(a, second_half);
// Fold to just using the first 32 bits.
second_half = _mm_shuffle_ps(a, a, 1);
a = _mm_add_ps(a, second_half);
// This casting compiles to nothing.
return *reinterpret_cast<float*>(&a);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
INTGEMM_AVX2 static inline float MaxFloat32(__m256 a) {
return MaxFloat32(max_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 1)));
}
INTGEMM_AVX2 static inline float AddFloat32(__m256 a) {
return AddFloat32(add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 1)));
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
// Find the maximum float.
INTGEMM_AVX512F static inline float MaxFloat32(__m512 a) {
// _mm512_extractf32x8_ps is AVX512DQ but we don't care about masking.
// So cast to pd, do AVX512F _mm512_extractf64x4_pd, then cast to ps.
__m256 upper = _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(a), 1));
return MaxFloat32(max_ps(_mm512_castps512_ps256(a), upper));
}
INTGEMM_AVX512F static inline float AddFloat32(__m512 a) {
__m256 upper = _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(a), 1));
return AddFloat32(add_ps(_mm512_castps512_ps256(a), upper));
}
#endif
constexpr int32_t kFloatAbsoluteMask = 0x7fffffff;
} // namespace intgemm
#define INTGEMM_THIS_IS_SSE2
#include "stats.inl"
#undef INTGEMM_THIS_IS_SSE2
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
#define INTGEMM_THIS_IS_AVX2
#include "stats.inl"
#undef INTGEMM_THIS_IS_AVX2
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
#define INTGEMM_THIS_IS_AVX512DQ
#include "stats.inl"
#undef INTGEMM_THIS_IS_AVX512DQ
#endif
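For readers decoding the shuffle constants in MaxFloat32 and AddFloat32 above: 3 * 4 + 2 is the immediate 14 (0b1110), which moves lanes 2 and 3 into positions 0 and 1, and the immediate 1 then moves lane 1 into lane 0, so two folds reduce four lanes to one. A standalone sketch of the same fold follows, assuming an SSE-enabled build; horizontal_max is our name, and it reads the scalar back with _mm_cvtss_f32 rather than a cast.

#include <cassert>
#include <xmmintrin.h>

// Two-step horizontal fold, mirroring MaxFloat32: fold the upper half onto
// the lower half, then lane 1 onto lane 0, and read back lane 0.
static float horizontal_max(__m128 a) {
  __m128 upper = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 3, 2)); // lanes {2,3} -> {0,1}
  a = _mm_max_ps(a, upper);
  upper = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 1));        // lane 1 -> lane 0
  a = _mm_max_ss(a, upper);
  return _mm_cvtss_f32(a);
}

int main() {
  __m128 v = _mm_setr_ps(1.0f, -2.0f, 7.5f, 3.0f);
  assert(horizontal_max(v) == 7.5f);
  return 0;
}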

98
third_party/intgemm/intgemm/stats.inl vendored

@ -1,98 +0,0 @@
/* This file is included multiple times, once per architecture. */
#if defined(INTGEMM_THIS_IS_AVX512DQ)
#define INTGEMM_ARCH AVX512BW
#define INTGEMM_TARGET INTGEMM_AVX512DQ
#elif defined(INTGEMM_THIS_IS_AVX2)
#define INTGEMM_ARCH AVX2
#define INTGEMM_TARGET INTGEMM_AVX2
#elif defined(INTGEMM_THIS_IS_SSE2)
#define INTGEMM_ARCH SSE2
#define INTGEMM_TARGET INTGEMM_SSE2
#else
#error Included with unexpected architecture
#endif
namespace intgemm {
namespace INTGEMM_ARCH {
/* Compute the maximum absolute value over floats aligned to register size.
* Do not call this function directly; it's a subroutine of MaxAbsolute.
*/
INTGEMM_TARGET static inline float MaxAbsoluteThread(const FRegister *begin, const FRegister *end) {
FRegister highest = setzero_ps<FRegister>();
const FRegister abs_mask = cast_ps(set1_epi32<Register>(kFloatAbsoluteMask));
#pragma omp for
for (const FRegister *i = begin; i < end; ++i) {
FRegister reg = and_ps(abs_mask, *i);
highest = max_ps(highest, reg);
}
return MaxFloat32(highest);
}
/* Compute the maximum absolute value of an array of floats.
* begin_float must be aligned to a multiple of the register size.
*/
INTGEMM_TARGET static inline float MaxAbsolute(const float *begin_float, const float *end_float) {
assert(reinterpret_cast<uintptr_t>(begin_float) % sizeof(FRegister) == 0);
const float *end_reg = end_float - (reinterpret_cast<uintptr_t>(end_float) % sizeof(FRegister)) / sizeof(float);
float ret = 0.0;
#pragma omp parallel reduction(max:ret) num_threads(std::max<int>(1, std::min<int>(omp_get_max_threads(), (end_float - begin_float) / 16384)))
{
float shard_max = MaxAbsoluteThread(
reinterpret_cast<const FRegister*>(begin_float),
reinterpret_cast<const FRegister*>(end_reg));
ret = std::max(ret, shard_max);
}
/* Overhang. The beginning was aligned so if there's any overhang we're
* allowed to read the next full register. Then mask that to 0. */
#if defined(INTGEMM_THIS_IS_AVX512DQ)
if (end_float != end_reg) {
const FRegister abs_mask = cast_ps(set1_epi32<Register>(kFloatAbsoluteMask));
__mmask16 mask = (1 << (end_float - end_reg)) - 1;
FRegister masked = _mm512_maskz_and_ps(mask, abs_mask, *reinterpret_cast<const FRegister*>(end_reg));
ret = std::max(ret, MaxFloat32(masked));
}
#else
for (const float *i = end_reg; i < end_float; ++i) {
ret = std::max(ret, std::fabs(*i));
}
#endif
return ret;
}
/* Returns the mean and standard deviation of the values; if absolute is set, the mean and standard deviation of their absolute values. */
INTGEMM_TARGET static inline MeanStd VectorMeanStd(const float *begin_float, const float *end_float, bool absolute) {
assert(end_float > begin_float);
assert((end_float - begin_float) % (sizeof(FRegister) / sizeof(float)) == 0);
size_t num_items = end_float - begin_float;
const FRegister *begin = reinterpret_cast<const FRegister*>(begin_float);
const FRegister *end = reinterpret_cast<const FRegister*>(end_float);
FRegister squares = set1_ps<FRegister>(0);
FRegister sums = set1_ps<FRegister>(0);
if (absolute) {
const FRegister abs_mask = cast_ps(set1_epi32<Register>(kFloatAbsoluteMask));
for (; begin != end; begin++) {
FRegister vec = and_ps(abs_mask, *begin);
squares = add_ps(squares, mul_ps(vec, vec));
sums = add_ps(sums, vec);
}
} else {
for (; begin != end; begin++) {
FRegister vec = *begin;
squares = add_ps(squares, mul_ps(vec, vec));
sums = add_ps(sums, vec);
}
}
float squares_sum = AddFloat32(squares);
float normal_sums = AddFloat32(sums);
MeanStd ret;
ret.mean = normal_sums/num_items;
ret.stddev = std::sqrt((squares_sum/num_items) - (ret.mean*ret.mean));
return ret;
}
} // namespace INTGEMM_ARCH
} // namespace intgemm
#undef INTGEMM_ARCH
#undef INTGEMM_TARGET
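The VectorMeanStd loop above accumulates per-lane sums and sums of squares, then finishes with the identity stddev = sqrt(E[x^2] - mean^2) after the horizontal AddFloat32 reductions. A plain scalar reference of the same computation is sketched below for checking intuition; the names are ours and MeanStd is redeclared locally so the snippet stands alone.

#include <cassert>
#include <cmath>
#include <cstddef>

struct MeanStd { float mean; float stddev; };  // local stand-in for intgemm::MeanStd

// Scalar reference for the vectorized loop: accumulate the sum and the sum of
// squares, then apply stddev = sqrt(E[x^2] - mean^2) (population form).
static MeanStd mean_std(const float* begin, const float* end, bool absolute) {
  double sum = 0.0, squares = 0.0;
  const std::size_t n = static_cast<std::size_t>(end - begin);
  for (const float* i = begin; i != end; ++i) {
    const double v = absolute ? std::fabs(*i) : *i;
    sum += v;
    squares += v * v;
  }
  MeanStd ret;
  ret.mean = static_cast<float>(sum / n);
  ret.stddev = static_cast<float>(std::sqrt(squares / n - ret.mean * ret.mean));
  return ret;
}

int main() {
  const float data[] = {1.0f, 2.0f, 3.0f, 4.0f};
  const MeanStd ms = mean_std(data, data + 4, /*absolute=*/false);
  assert(std::fabs(ms.mean - 2.5f) < 1e-6f);
  assert(std::fabs(ms.stddev - std::sqrt(1.25f)) < 1e-5f);
  return 0;
}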

118
third_party/intgemm/intgemm/types.h vendored

@ -1,118 +0,0 @@
#pragma once
#include "intgemm/intgemm_config.h"
#include <exception>
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
#include <immintrin.h>
#endif
#include <emmintrin.h>
// clang-cl bug doesn't include these headers when pretending to be MSVC
// https://github.com/llvm/llvm-project/blob/e9a294449575a1e1a0daca470f64914695dc9adc/clang/lib/Headers/immintrin.h#L69-L72
#if defined(_MSC_VER) && defined(__clang__)
#include <avxintrin.h>
#include <avx2intrin.h>
#include <smmintrin.h>
#include <avx512fintrin.h>
#include <avx512dqintrin.h>
#include <avx512bwintrin.h>
#include <avx512vnniintrin.h>
#endif
#if (defined(_MSC_VER) && !defined(__clang__)) || defined(__INTEL_COMPILER)
/* Real MSVC does not appear to have target attributes but is also fine with
* just using intrinsics anywhere. clang-cl pretending to be MSVC requires
* target attributes, so it's excluded from the above.
*
* The Intel compiler has a bug whereby constructors with target attributes do
* not link. Like this program doesn't compile with icpc:
* class Foo {
* public:
* __attribute__ ((target ("avx2"))) Foo() {}
* };
* int main() { Foo a; }
*
* It appears to be erroneously activating function multiversioning when only
* one version of a constructor with target attributes is defined. Normal
* methods with one target attribute work fine. The Intel compiler also allows
* intrinsics without any target attributes so we just leave them blank.
*/
#define INTGEMM_SSE2
#define INTGEMM_SSSE3
#define INTGEMM_AVX2
#define INTGEMM_AVX512F
#define INTGEMM_AVX512BW
#define INTGEMM_AVX512DQ
#define INTGEMM_AVX512VNNI
#else
/* gcc and clang take lists of all the flavors */
#define INTGEMM_SSE2 __attribute__ ((target ("sse2")))
#define INTGEMM_SSSE3 __attribute__ ((target ("ssse3")))
#define INTGEMM_AVX2 __attribute__ ((target ("avx2")))
#define INTGEMM_AVX512F __attribute__ ((target ("avx512f")))
#define INTGEMM_AVX512BW __attribute__ ((target ("avx512f,avx512bw,avx512dq")))
#define INTGEMM_AVX512DQ __attribute__ ((target ("avx512f,avx512bw,avx512dq")))
#define INTGEMM_AVX512VNNI __attribute__ ((target ("avx512f,avx512bw,avx512dq,avx512vnni")))
#endif
namespace intgemm {
// This will be thrown if a CPU isn't supported by the routines (16-bit without SSE2 or 8-bit without SSSE3).
class UnsupportedCPU : public std::exception {
public:
UnsupportedCPU() {}
~UnsupportedCPU() throw() {}
const char *what() const throw() override {
return "Integer matrix multiplication has not been efficiently implemented for your CPU.";
}
};
typedef unsigned int Index;
// If you want to detect the CPU and dispatch yourself, here's what to use:
enum class CPUType {
UNSUPPORTED = 0,
SSE2 = 1,
SSSE3 = 2,
AVX2 = 3,
AVX512BW = 4,
AVX512VNNI = 5
};
// Running CPU type. This is defined in intgemm.cc (as the dispatcher).
extern const CPUType kCPU;
struct MeanStd {
float mean;
float stddev;
};
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
namespace AVX512VNNI {
typedef __m512i Register;
typedef __m512 FRegister;
} // namespace AVX512VNNI
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
namespace AVX512BW {
typedef __m512i Register;
typedef __m512 FRegister;
} // namespace AVX512BW
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
namespace AVX2 {
typedef __m256i Register;
typedef __m256 FRegister;
} // namespace AVX2
#endif
namespace SSSE3 {
typedef __m128i Register;
typedef __m128 FRegister;
} // namespace SSSE3
namespace SSE2 {
typedef __m128i Register;
typedef __m128 FRegister;
} // namespace SSE2
} // namespace intgemm
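The CPUType enum above is deliberately ordered from least to most capable, so callers (including the tests further down) can gate work with a single comparison such as "if (kCPU < CPUType::SSSE3) return;". Here is a hedged sketch of that dispatch pattern using a local copy of the enum; the real dispatcher lives in intgemm.cc, which is not part of this hunk, and Pick8BitKernel is our name.

#include <iostream>

// Local copy of the ordering idea: a larger enum value means a superset of
// SIMD features, so ">=" answers "is this kernel usable here?".
enum class CPUType { UNSUPPORTED = 0, SSE2, SSSE3, AVX2, AVX512BW, AVX512VNNI };

static const char* Pick8BitKernel(CPUType cpu) {
  if (cpu >= CPUType::AVX512BW) return "8-bit AVX512BW";
  if (cpu >= CPUType::AVX2) return "8-bit AVX2";
  if (cpu >= CPUType::SSSE3) return "8-bit SSSE3";  // pmaddubsw needs SSSE3
  return "unsupported";
}

int main() {
  std::cout << Pick8BitKernel(CPUType::AVX2) << '\n';  // prints "8-bit AVX2"
  return 0;
}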

82
third_party/intgemm/intgemm/utils.h vendored

@ -1,82 +0,0 @@
#pragma once
#include <tuple>
namespace intgemm {
/*
* Sequence of unsigned integers
*
* Examples:
* sequence<1, 2, 3>()
* sequence_pushback<4, sequence<1, 2, 3>>() = sequence<1, 2, 3, 4>()
* sequence_popfront<sequence<1, 2, 3>>() = sequence<2, 3>()
* make_sequence<3>() = sequence<0, 1, 2>()
*/
template <unsigned... Indices>
struct sequence { using type = sequence; };
template <unsigned I, typename Sequence>
struct sequence_pushback;
template <unsigned I, unsigned... Indices>
struct sequence_pushback<I, sequence<Indices...>> : sequence<Indices..., I> {};
template <typename Sequence>
struct sequence_popfront;
template <unsigned FirstIndex, unsigned... RestIndices>
struct sequence_popfront<sequence<FirstIndex, RestIndices...>> : sequence<RestIndices...> {};
namespace { // anonymous namespace
template <unsigned N>
struct make_sequence_impl : sequence_pushback<N - 1, typename make_sequence_impl<N - 1>::type> {};
template <>
struct make_sequence_impl<0> : sequence<> {};
} // anonymous namespace
template <unsigned N>
using make_sequence = typename make_sequence_impl<N>::type;
/*
* Make a subtuple
*/
template <typename Tuple, unsigned... Indices>
using subtuple_t = typename std::tuple<typename std::tuple_element<Indices, Tuple>::type...>;
template <typename Tuple, unsigned... Indices>
constexpr subtuple_t<Tuple, Indices...> make_subtuple(const Tuple& tuple, sequence<Indices...>) {
return std::make_tuple(std::get<Indices>(tuple)...);
}
/*
* Factorial
*/
static constexpr unsigned long long factorial(unsigned n) {
return n <= 1 ? 1 : n * factorial(n - 1);
}
/*
* e^n, where n is integer
*/
static constexpr double expi_nonnegative(unsigned n) {
return n == 0 ? 1.0 : (n == 1 ? 2.718281828459045 : expi_nonnegative(n / 2) * expi_nonnegative((n + 1) / 2));
}
static constexpr double expi(int n) {
return (n >= 0 ? expi_nonnegative(n) : 1.0 / expi_nonnegative(-n));
}
// Version that returns float.
static constexpr float expif(int n) {
return static_cast<float>(expi(n));
}
/*
* Round up
*/
static constexpr Index round_up(Index value, Index factor) {
return (value + factor - 1) / factor * factor;
}
}
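To make the metaprogramming above concrete: make_sequence<3>() expands to sequence<0, 1, 2>(), and make_subtuple uses such a sequence to keep only the listed tuple fields. Below is a trimmed, self-contained copy of those helpers with a usage check; it is ours, for illustration only, and omits popfront, factorial, expi and round_up from the vendored header.

#include <tuple>
#include <type_traits>

// Trimmed copies of sequence, make_sequence and make_subtuple, just to show
// how they compose.
template <unsigned... Indices> struct sequence { using type = sequence; };

template <unsigned I, typename Sequence> struct sequence_pushback;
template <unsigned I, unsigned... Indices>
struct sequence_pushback<I, sequence<Indices...>> : sequence<Indices..., I> {};

template <unsigned N>
struct make_sequence_impl
    : sequence_pushback<N - 1, typename make_sequence_impl<N - 1>::type> {};
template <> struct make_sequence_impl<0> : sequence<> {};
template <unsigned N> using make_sequence = typename make_sequence_impl<N>::type;

template <typename Tuple, unsigned... Indices>
std::tuple<typename std::tuple_element<Indices, Tuple>::type...>
make_subtuple(const Tuple& tuple, sequence<Indices...>) {
  return std::make_tuple(std::get<Indices>(tuple)...);
}

int main() {
  // make_sequence<3>() is sequence<0, 1, 2>(), so only the first three fields survive.
  auto sub = make_subtuple(std::make_tuple(1, 2.5, 'c', "dropped"), make_sequence<3>());
  static_assert(std::is_same<decltype(sub), std::tuple<int, double, char>>::value,
                "first three element types kept");
  return (std::get<0>(sub) == 1 && std::get<2>(sub) == 'c') ? 0 : 1;
}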

57
third_party/intgemm/intgemm/vec_traits.h vendored

@ -1,57 +0,0 @@
#pragma once
#include "types.h"
namespace intgemm {
/*
* Vector traits
*/
template <CPUType CPUType_, typename ElemType_> struct vector_s;
template <> struct vector_s<CPUType::SSE2, int8_t> { using type = __m128i; };
template <> struct vector_s<CPUType::SSE2, int16_t> { using type = __m128i; };
template <> struct vector_s<CPUType::SSE2, int> { using type = __m128i; };
template <> struct vector_s<CPUType::SSE2, float> { using type = __m128; };
template <> struct vector_s<CPUType::SSE2, double> { using type = __m128d; };
template <> struct vector_s<CPUType::SSSE3, int8_t> { using type = __m128i; };
template <> struct vector_s<CPUType::SSSE3, int16_t> { using type = __m128i; };
template <> struct vector_s<CPUType::SSSE3, int> { using type = __m128i; };
template <> struct vector_s<CPUType::SSSE3, float> { using type = __m128; };
template <> struct vector_s<CPUType::SSSE3, double> { using type = __m128d; };
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
template <> struct vector_s<CPUType::AVX2, int8_t> { using type = __m256i; };
template <> struct vector_s<CPUType::AVX2, int16_t> { using type = __m256i; };
template <> struct vector_s<CPUType::AVX2, int> { using type = __m256i; };
template <> struct vector_s<CPUType::AVX2, float> { using type = __m256; };
template <> struct vector_s<CPUType::AVX2, double> { using type = __m256d; };
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
template <> struct vector_s<CPUType::AVX512BW, int8_t> { using type = __m512i; };
template <> struct vector_s<CPUType::AVX512BW, int16_t> { using type = __m512i; };
template <> struct vector_s<CPUType::AVX512BW, int> { using type = __m512i; };
template <> struct vector_s<CPUType::AVX512BW, float> { using type = __m512; };
template <> struct vector_s<CPUType::AVX512BW, double> { using type = __m512d; };
#endif
template <CPUType CPUType_, typename ElemType_>
using vector_t = typename vector_s<CPUType_, ElemType_>::type;
template <CPUType CPUType_, typename ElemType_>
struct dvector_t {
using type = vector_t<CPUType_, ElemType_>;
type first;
type second;
};
template <CPUType CPUType_, typename ElemType_>
struct qvector_t {
using type = vector_t<CPUType_, ElemType_>;
type first;
type second;
type third;
type fourth;
};
}
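The payoff of vector_s and vector_t above is that a kernel written against an element type can derive its register type, and therefore its lane count, at compile time. A reduced restatement with only two SSE2 specializations follows, purely to illustrate the pattern; it is not the vendored header.

#include <emmintrin.h>
#include <cstdint>
#include <type_traits>

// Map (ISA, element type) -> register type; sizeof then gives the lane count.
enum class CPUType { SSE2 };
template <CPUType CPUType_, typename ElemType_> struct vector_s;
template <> struct vector_s<CPUType::SSE2, float> { using type = __m128; };
template <> struct vector_s<CPUType::SSE2, int8_t> { using type = __m128i; };
template <CPUType CPUType_, typename ElemType_>
using vector_t = typename vector_s<CPUType_, ElemType_>::type;

static_assert(std::is_same<vector_t<CPUType::SSE2, float>, __m128>::value,
              "float maps to __m128");
static_assert(sizeof(vector_t<CPUType::SSE2, int8_t>) / sizeof(int8_t) == 16,
              "16 int8 lanes per SSE register");

int main() { return 0; }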


@ -1,24 +0,0 @@
Boost Software License - Version 1.0 - August 17th, 2003
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

14934
third_party/intgemm/test/3rd_party/catch.hpp vendored

Diff not shown because of its large size.

492
third_party/intgemm/test/add127_test.cc vendored

@ -1,492 +0,0 @@
#include "test.h"
namespace intgemm {
namespace {
void CompareAs(int8_t * output_old, uint8_t * output_new, Index rows, Index cols) {
for (Index r = 0; r<rows; r++) {
for (Index c = 0; c<cols; c++) {
int a = int(output_old[rows*c + r]);
int b = int(output_new[rows*c + r]);
INFO("Inaccurate at row: " << r << " column " << c << ' '
<< a << ' ' << b);
CHECK(a+127 == b);
}
}
}
template <class Routine> void TestPrepareA(Index rows, Index cols) {
std::mt19937 gen;
// Go somewhat out of range too.
std::uniform_real_distribution<float> dist(-2, 2);
// Create array.
AlignedVector<float> inputA(rows * cols);
for (auto& it : inputA) {
it = dist(gen);
}
AlignedVector<int8_t> oldA(rows * cols);
AlignedVector<uint8_t> newA(rows * cols);
float quant_mult = 64; //From example
Routine::PrepareA(inputA.begin(), oldA.begin(), quant_mult, rows, cols);
Routine::PrepareA(inputA.begin(), newA.begin(), quant_mult, rows, cols);
CompareAs(oldA.begin(), newA.begin(), rows, cols);
}
template <class Routine> void TestPrepareBias(Index rows, Index cols) {
std::mt19937 gen;
// Go somewhat out of range too.
std::uniform_real_distribution<float> dist(-30.0, 30.0);
// Create array.
AlignedVector<float> inputB(rows * cols);
for (auto& it : inputB) {
it = dist(gen);
}
float alpha = 25;
float quant_mult = 127/alpha;
AlignedVector<int8_t> B_prep(inputB.size());
AlignedVector<int8_t> B_quant(inputB.size());
Routine::PrepareB(inputB.begin(), B_prep.begin(), quant_mult, rows, cols);
Routine::Quantize(inputB.begin(), B_quant.begin(), quant_mult, static_cast<intgemm::Index>(inputB.size()));
AlignedVector<float> inputBias(cols);
AlignedVector<float> goldBias(cols);
for (auto& it : goldBias) {
it = dist(gen);
}
int i = 0;
for (auto& it : inputBias) {
it = goldBias[i];
i++;
}
float unquant_mult_forprep = (-1)*(alpha)*(alpha)/(127.0f);
Routine::PrepareBias(B_prep.begin(), rows, cols, callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult_forprep, inputBias.begin(), inputBias.begin()));
int A_rows = 1;
AlignedVector<int8_t> A_prep2(A_rows*rows);
for (auto& it : A_prep2) {
it =1;
}
//Routine::Multiply(A_prep2.begin(), B_prep.begin(), A_rows, rows, cols, callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult_forprep, goldBias.begin(), goldBias.begin()));
//CompareEps(goldBias.begin(), inputBias.begin(), cols, 0.0001f);
AlignedVector<float> slowint_C(cols);
references::Multiply(A_prep2.begin(), B_quant.begin(), slowint_C.begin(), A_rows, rows, cols, [&](int32_t sum, const callbacks::OutputBufferInfo& info) {
return sum * unquant_mult_forprep + goldBias[info.col_idx];
});
CompareEps(slowint_C.begin(), inputBias.begin(), cols, 0.0001f);
}
template <class Routine> void TestMultiplyBiasNew(Index A_rows, Index width, Index B_cols,
float int_tolerance=.1, float float_tolerance=1, float MSE_float_tolerance=0, float MSE_int_tolerance=0) {
std::ostringstream info;
info << Routine::kName << "\t" << A_rows << '\t' << width << '\t' << B_cols << '\n';
// Initialize A and B.
AlignedVector<float> A(A_rows * width);
AlignedVector<float> B(width * B_cols);
AlignedVector<float> bias(B_cols);
std::mt19937 gen;
std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
for (auto& it : A) {
it = dist(gen);
}
for (auto& it : B) {
it = dist(gen);
}
for (auto& it : bias) {
it = dist(gen);
}
float alpha = 2.0f;
float quant_mult = 127.0f / alpha;
float unquant_mult = 1.0f / (quant_mult*quant_mult);
AlignedVector<uint8_t> A_prep(A.size());
AlignedVector<int8_t> B_prep(B.size());
Routine::PrepareA(A.begin(), A_prep.begin(), quant_mult, A_rows, width);
Routine::PrepareB(B.begin(), B_prep.begin(), quant_mult, width, B_cols);
AlignedVector<float> test_C(A_rows * B_cols);
/*REFERENCE MULTIPLICATION
*
*
*/
AlignedVector<int8_t> B_quant(B.size());
Routine::Quantize(B.begin(), B_quant.begin(), quant_mult, static_cast<Index>(B.size()));
AlignedVector<float> slowint_C(test_C.size());
// Taking the original A_preparation which means A would be int8_t
AlignedVector<int8_t> A_prep2(A.size());
Routine::PrepareA(A.begin(), A_prep2.begin(), quant_mult, A_rows, width);
references::Multiply(A_prep2.begin(), B_quant.begin(), slowint_C.begin(), A_rows, width, B_cols, [&](int32_t sum, const callbacks::OutputBufferInfo& info) {
return sum * unquant_mult + bias[info.col_idx];
});
AlignedVector<float> float_C(test_C.size());
references::Multiply(A.begin(), B.begin(), float_C.begin(), A_rows, width, B_cols, [&](double sum, const callbacks::OutputBufferInfo& info) {
return static_cast<float>(sum) + bias[info.col_idx];
});
/*ACTUAL MULTIPLICATION
*
*/
float unquant_mult_forprep = (-1.0f)*(alpha)*(alpha)/(127.0f); //Minus one to invert add_ps later on
Routine::PrepareBias(B_prep.begin(), width, B_cols, callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult_forprep, bias.begin(), bias.begin()));
//Routine::PrepareBias(B.begin(), bias.begin(), alpha, width, B_cols);
Routine::Multiply8Shift(A_prep.begin(), B_prep.begin(), A_rows, width, B_cols, callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult, bias.begin(), test_C.begin()));
CompareMSE(float_C.begin(), slowint_C.begin(), test_C.begin(), test_C.size(), info.str(),
int_tolerance, float_tolerance, MSE_float_tolerance, MSE_int_tolerance);
}
template <class Routine> void TestMultiplyShiftNonShift(Index A_rows, Index width, Index B_cols,
float int_tolerance=.1, float float_tolerance=1, float MSE_float_tolerance=0, float MSE_int_tolerance=0) {
std::ostringstream info;
info << Routine::kName << "\t" << A_rows << '\t' << width << '\t' << B_cols << '\n';
// Initialize A and B.
AlignedVector<float> A(A_rows * width);
AlignedVector<float> B(width * B_cols);
AlignedVector<float> bias(B_cols);
std::mt19937 gen;
std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
for (auto& it : A) {
it = dist(gen);
}
for (auto& it : B) {
it = dist(gen);
}
for (auto& it : bias) {
it = 0;
}
float alpha = 2.0f;
float quant_mult = 127.0f / alpha;
float unquant_mult = 1.0f / (quant_mult*quant_mult);
AlignedVector<uint8_t> A_prep(A.size());
AlignedVector<int8_t> A_prep_old(A.size());
AlignedVector<int8_t> B_prep(B.size());
Routine::PrepareA(A.begin(), A_prep.begin(), quant_mult, A_rows, width);
Routine::PrepareA(A.begin(), A_prep_old.begin(), quant_mult, A_rows, width); //Non-shifted version
Routine::PrepareB(B.begin(), B_prep.begin(), quant_mult, width, B_cols);
AlignedVector<float> test_C(A_rows * B_cols);
/*
* Reference non shift multiplication instead of slowint
*/
AlignedVector<float> slowint_C(test_C.size());
Routine::Multiply(A_prep_old.begin(), B_prep.begin(), A_rows, width, B_cols, callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult, bias.begin(), slowint_C.begin()));
AlignedVector<float> float_C(test_C.size());
references::Multiply(A.begin(), B.begin(), float_C.begin(), A_rows, width, B_cols, [&](double sum, const callbacks::OutputBufferInfo& info) {
return static_cast<float>(sum) + bias[info.col_idx];
});
/*
* Multiply8 shift multiplication
*/
float unquant_mult_forprep = (-1.0f)*(alpha)*(alpha)/(127.0f); //Minus one to invert add_ps later on
Routine::PrepareBias(B_prep.begin(), width, B_cols, callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult_forprep, bias.begin(), bias.begin()));
Routine::Multiply8Shift(A_prep.begin(), B_prep.begin(), A_rows, width, B_cols, callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult, bias.begin(), test_C.begin()));
CompareMSE(float_C.begin(), slowint_C.begin(), test_C.begin(), test_C.size(), info.str(),
int_tolerance, float_tolerance, MSE_float_tolerance, MSE_int_tolerance);
}
template <class Routine> void TestMultiplyShiftInt(Index A_rows, Index width, Index B_cols,
float int_tolerance=.1, float float_tolerance=1, float MSE_float_tolerance=0, float MSE_int_tolerance=0) {
std::ostringstream info;
info << Routine::kName << "\t" << A_rows << '\t' << width << '\t' << B_cols << '\n';
// Initialize A and B.
AlignedVector<float> A(A_rows * width);
AlignedVector<float> B(width * B_cols);
AlignedVector<float> bias(B_cols);
std::mt19937 gen;
std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
for (auto& it : A) {
it = dist(gen);
}
for (auto& it : B) {
it = dist(gen);
}
for (auto& it : bias) {
it = 0;
}
float alpha = 2.0f;
float quant_mult = 127.0f / alpha;
float unquant_mult = 1.0f / (quant_mult*quant_mult);
AlignedVector<uint8_t> A_prep(A.size());
AlignedVector<int8_t> A_prep_old(A.size());
AlignedVector<int8_t> B_prep(B.size());
Routine::PrepareA(A.begin(), A_prep.begin(), quant_mult, A_rows, width);
Routine::PrepareA(A.begin(), A_prep_old.begin(), quant_mult, A_rows, width); //Non-shifted version
Routine::PrepareB(B.begin(), B_prep.begin(), quant_mult, width, B_cols);
AlignedVector<float> test_C(A_rows * B_cols);
/*
* Reference float multiplication
*/
AlignedVector<int8_t> B_quant(B.size());
Routine::Quantize(B.begin(), B_quant.begin(), quant_mult, static_cast<Index>(B.size()));
AlignedVector<float> slowint_C(test_C.size());
// Taking the original A_preparation which means A would be int8_t
// references::Multiply(A_prep.begin(), B_quant.begin(), slowint_C.begin(), A_rows, width, B_cols, [&](int32_t sum, const callbacks::OutputBufferInfo& info) {
// return sum * unquant_mult + bias[info.col_idx];
// });
AlignedVector<float> float_C(test_C.size());
references::Multiply(A.begin(), B.begin(), float_C.begin(), A_rows, width, B_cols, [&](double sum, const callbacks::OutputBufferInfo& info) {
return static_cast<float>(sum) + bias[info.col_idx];
});
/*
* Multiply8 shift multiplication
*/
//First prepare SlowInteger Bias:
AlignedVector<int8_t> A_prep2(1*width);
for (auto& it : A_prep2) {
it = 1;
}
AlignedVector<float> ShiftedBias(B_cols);
float unquant_mult_forprep = (-1)*(alpha)*(alpha)/(127.0f); //Minus one to invert add_ps later on
references::Multiply(A_prep2.begin(), B_quant.begin(), ShiftedBias.begin(), 1, width, B_cols, [&](int32_t sum, const callbacks::OutputBufferInfo& info) {
return sum * unquant_mult_forprep + bias[info.col_idx];
});
//Now prepare Fast integer Bias
Routine::PrepareBias(B_prep.begin(), width, B_cols, callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult_forprep, bias.begin(), bias.begin()));
Routine::Multiply8Shift(A_prep.begin(), B_prep.begin(), A_rows, width, B_cols, callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult, bias.begin(), test_C.begin()));
// Reference INT VERSION HERE with ADD127
// Taking the original A_preparation which means A would be int8_t
references::Multiply(A_prep.begin(), B_quant.begin(), slowint_C.begin(), A_rows, width, B_cols, [&](int32_t sum, const callbacks::OutputBufferInfo& info) {
return sum * unquant_mult + ShiftedBias[info.col_idx];
});
CompareMSE(float_C.begin(), slowint_C.begin(), test_C.begin(), test_C.size(), info.str(),
int_tolerance, float_tolerance, MSE_float_tolerance, MSE_int_tolerance);
}
// Bias
TEST_CASE("PrepareBias SSSE3", "[Add127]") {
if (kCPU < CPUType::SSSE3) return;
TestPrepareBias<SSSE3::Kernels8>(256,256);
TestPrepareBias<SSSE3::Kernels8>(2048,256);
TestPrepareBias<SSSE3::Kernels8>(512,512);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
TEST_CASE("PrepareBias AVX2", "[Add127]") {
if (kCPU < CPUType::AVX2) return;
TestPrepareBias<AVX2::Kernels8>(256,256);
TestPrepareBias<AVX2::Kernels8>(2048,256);
TestPrepareBias<AVX2::Kernels8>(512,512);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE("PrepareBias AVX512F", "[Add127]") {
if (kCPU < CPUType::AVX512BW) return;
TestPrepareBias<AVX512BW::Kernels8>(256,256);
TestPrepareBias<AVX512BW::Kernels8>(2048,256);
TestPrepareBias<AVX512BW::Kernels8>(512,512);
}
#endif
//A
TEST_CASE("PrepareA SSSE3", "[Add127]") {
if (kCPU < CPUType::SSSE3) return;
TestPrepareA<SSSE3::Kernels8>(64,64);
TestPrepareA<SSSE3::Kernels8>(256,256);
TestPrepareA<SSSE3::Kernels8>(512,512);
TestPrepareA<SSSE3::Kernels8>(2048,256);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
TEST_CASE("PrepareA AVX2", "[Add127]") {
if (kCPU < CPUType::AVX2) return;
TestPrepareA<AVX2::Kernels8>(64,64);
TestPrepareA<AVX2::Kernels8>(256,256);
TestPrepareA<AVX2::Kernels8>(512,512);
TestPrepareA<AVX2::Kernels8>(2048,256);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE("PrepareA AVX512F", "[Add127]") {
if (kCPU < CPUType::AVX512BW) return;
TestPrepareA<AVX512BW::Kernels8>(64,64);
TestPrepareA<AVX512BW::Kernels8>(256,256);
TestPrepareA<AVX512BW::Kernels8>(512,512);
TestPrepareA<AVX512BW::Kernels8>(2048,256);
}
#endif
// Multiply
TEST_CASE ("Multiply SSSE3 8bit Shift with bias", "[Add127]") {
if (kCPU < CPUType::SSSE3) return;
TestMultiplyBiasNew<SSSE3::Kernels8>(1, 64, 8, 0.11f, 0.1f, 0.06f, 0.05f);
TestMultiplyBiasNew<SSSE3::Kernels8>(8, 256, 256, 0.45f, 0.54f, 0.17f, 0.16f);
TestMultiplyBiasNew<SSSE3::Kernels8>(8, 2048, 256, 1.7f, 1.7f, 0.46f, 0.43f);
TestMultiplyBiasNew<SSSE3::Kernels8>(320, 256, 256, 0.56f, 0.64f, 0.16f, 0.15f);
TestMultiplyBiasNew<SSSE3::Kernels8>(472, 256, 256, 0.46f, 0.62f, 0.17f, 0.16f);
TestMultiplyBiasNew<SSSE3::Kernels8>(248, 256, 256, 0.48f, 0.64f, 0.16f, 0.15f);
TestMultiplyBiasNew<SSSE3::Kernels8>(200, 256, 256, 0.55f, 0.74f, 0.17f, 0.16f);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
TEST_CASE ("Multiply AVX2 8bit Shift with bias", "[Add127]") {
if (kCPU < CPUType::AVX2) return;
TestMultiplyBiasNew<AVX2::Kernels8>(1, 64, 8, 0.11f, 0.11f, 0.06f, 0.05f);
TestMultiplyBiasNew<AVX2::Kernels8>(8, 256, 256, 0.49f, 0.54f, 0.17f, 0.16f);
TestMultiplyBiasNew<AVX2::Kernels8>(8, 2048, 256, 1.57f, 1.66f, 0.46f, 0.46f);
TestMultiplyBiasNew<AVX2::Kernels8>(320, 256, 256, 0.49f, 0.64f, 0.16f, 0.15f);
TestMultiplyBiasNew<AVX2::Kernels8>(472, 256, 256, 0.46f, 0.62f, 0.17f, 0.16f);
TestMultiplyBiasNew<AVX2::Kernels8>(248, 256, 256, 0.48f, 0.64f, 0.16f, 0.15f);
TestMultiplyBiasNew<AVX2::Kernels8>(200, 256, 256, 0.55f, 0.74f, 0.17f, 0.16f);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE ("Multiply AVX512F 8bit Shift with bias", "[Add127]") {
if (kCPU < CPUType::AVX512BW) return;
TestMultiplyBiasNew<AVX512BW::Kernels8>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.001f);
TestMultiplyBiasNew<AVX512BW::Kernels8>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.001f);
TestMultiplyBiasNew<AVX512BW::Kernels8>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.001f);
TestMultiplyBiasNew<AVX512BW::Kernels8>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f);
TestMultiplyBiasNew<AVX512BW::Kernels8>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.001f);
TestMultiplyBiasNew<AVX512BW::Kernels8>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f);
TestMultiplyBiasNew<AVX512BW::Kernels8>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.001f);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
TEST_CASE ("Multiply AVX512VNNI 8bit Shift with bias", "[Add127]") {
if (kCPU < CPUType::AVX512VNNI) return;
TestMultiplyBiasNew<AVX512VNNI::Kernels8>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.001f);
TestMultiplyBiasNew<AVX512VNNI::Kernels8>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.001f);
TestMultiplyBiasNew<AVX512VNNI::Kernels8>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.001f);
TestMultiplyBiasNew<AVX512VNNI::Kernels8>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f);
TestMultiplyBiasNew<AVX512VNNI::Kernels8>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.001f);
TestMultiplyBiasNew<AVX512VNNI::Kernels8>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f);
TestMultiplyBiasNew<AVX512VNNI::Kernels8>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.001f);
}
#endif
//Multiply old vs new
TEST_CASE ("Multiply SSSE3 8bit Shift vs nonshift", "[Add127]") {
if (kCPU < CPUType::SSSE3) return;
TestMultiplyShiftNonShift<SSSE3::Kernels8>(1, 64, 8, 0.00001f, 0.1f, 0.06f, 0.00001f);
TestMultiplyShiftNonShift<SSSE3::Kernels8>(8, 256, 256, 0.00001f, 0.54f, 0.17f, 0.00001f);
TestMultiplyShiftNonShift<SSSE3::Kernels8>(8, 2048, 256, 17.9f, 1.7f, 0.46f, 4.2f); //Big difference here because the non-shift version is very bad
TestMultiplyShiftNonShift<SSSE3::Kernels8>(320, 256, 256, 1.2f, 0.64f, 0.16f, 0.006f);
TestMultiplyShiftNonShift<SSSE3::Kernels8>(472, 256, 256, 1.1f, 0.62f, 0.17f, 0.006f);
TestMultiplyShiftNonShift<SSSE3::Kernels8>(248, 256, 256, 0.9f, 0.64f, 0.16f, 0.007f);
TestMultiplyShiftNonShift<SSSE3::Kernels8>(200, 256, 256, 1, 0.74f, 0.17f, 0.006f);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
TEST_CASE ("Multiply AVX2 8bit Shift vs nonshift", "[Add127]") {
if (kCPU < CPUType::AVX2) return;
TestMultiplyShiftNonShift<AVX2::Kernels8>(1, 64, 8, 0.00001f, 0.11f, 0.06f, 0.00001f);
TestMultiplyShiftNonShift<AVX2::Kernels8>(8, 256, 256, 0.00001f, 0.54f, 0.17f, 0.00001f);
TestMultiplyShiftNonShift<AVX2::Kernels8>(8, 2048, 256, 9.4f, 1.66f, 0.46f, 1.67f); //Big difference here because the non-shift version is very bad
TestMultiplyShiftNonShift<AVX2::Kernels8>(320, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f);
TestMultiplyShiftNonShift<AVX2::Kernels8>(472, 256, 256, 0.0001f, 0.62f, 0.17f, 0.0001f);
TestMultiplyShiftNonShift<AVX2::Kernels8>(248, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f);
TestMultiplyShiftNonShift<AVX2::Kernels8>(200, 256, 256, 0.0001f, 0.74f, 0.17f, 0.0001f);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE ("Multiply AVX512F 8bit Shift vs nonshift", "[Add127]") {
if (kCPU < CPUType::AVX512BW) return;
TestMultiplyShiftNonShift<AVX512BW::Kernels8>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.001f);
TestMultiplyShiftNonShift<AVX512BW::Kernels8>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.001f);
TestMultiplyShiftNonShift<AVX512BW::Kernels8>(8, 2048, 256, 3.51f, 0.61f, 0.17f, 0.3f);
TestMultiplyShiftNonShift<AVX512BW::Kernels8>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f);
TestMultiplyShiftNonShift<AVX512BW::Kernels8>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.001f);
TestMultiplyShiftNonShift<AVX512BW::Kernels8>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f);
TestMultiplyShiftNonShift<AVX512BW::Kernels8>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.001f);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
TEST_CASE ("Multiply AVX512VNNI 8bit Shift vs nonshift", "[Add127]") {
if (kCPU < CPUType::AVX512VNNI) return;
TestMultiplyShiftNonShift<AVX512VNNI::Kernels8>(1, 64, 8, 0.00001f, 0.05f, 0.03f, 0.00001f);
TestMultiplyShiftNonShift<AVX512VNNI::Kernels8>(8, 256, 256, 0.00001f, 0.22f, 0.06f, 0.00001f);
TestMultiplyShiftNonShift<AVX512VNNI::Kernels8>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.0001f);
TestMultiplyShiftNonShift<AVX512VNNI::Kernels8>(320, 256, 256, 0.00001f, 0.27f, 0.06f, 0.00001f);
TestMultiplyShiftNonShift<AVX512VNNI::Kernels8>(472, 256, 256, 0.00001f, 0.33f, 0.06f, 0.00001f);
TestMultiplyShiftNonShift<AVX512VNNI::Kernels8>(248, 256, 256, 0.00001f, 0.27f, 0.06f, 0.00001f);
TestMultiplyShiftNonShift<AVX512VNNI::Kernels8>(200, 256, 256, 0.00001f, 0.28f, 0.06f, 0.00001f);
}
#endif
//Multiply Shift vs int shift implementation
TEST_CASE ("Multiply SSSE3 8bit Shift vs Int", "[Add127]") {
if (kCPU < CPUType::SSSE3) return;
TestMultiplyShiftInt<SSSE3::Kernels8>(1, 64, 8, 0.0001f, 0.1f, 0.06f, 0.0001f);
TestMultiplyShiftInt<SSSE3::Kernels8>(8, 256, 256, 0.0001f, 0.54f, 0.17f, 0.0001f);
TestMultiplyShiftInt<SSSE3::Kernels8>(8, 2048, 256, 0.0001f, 1.7f, 0.46f, 0.0001f);
TestMultiplyShiftInt<SSSE3::Kernels8>(320, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f);
TestMultiplyShiftInt<SSSE3::Kernels8>(472, 256, 256, 0.0001f, 0.62f, 0.17f, 0.0001f);
TestMultiplyShiftInt<SSSE3::Kernels8>(248, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f);
TestMultiplyShiftInt<SSSE3::Kernels8>(200, 256, 256, 0.0001f, 0.74f, 0.17f, 0.0001f);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
TEST_CASE ("Multiply AVX2 8bit Shift vs Int", "[Add127]") {
if (kCPU < CPUType::AVX2) return;
TestMultiplyShiftInt<AVX2::Kernels8>(1, 64, 8, 0.0001f, 0.11f, 0.06f, 0.0001f);
TestMultiplyShiftInt<AVX2::Kernels8>(8, 256, 256, 0.0001f, 0.54f, 0.17f, 0.0001f);
TestMultiplyShiftInt<AVX2::Kernels8>(8, 2048, 256, 0.0001f, 1.66f, 0.46f, 0.0001f);
TestMultiplyShiftInt<AVX2::Kernels8>(320, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f);
TestMultiplyShiftInt<AVX2::Kernels8>(472, 256, 256, 0.0001f, 0.62f, 0.17f, 0.0001f);
TestMultiplyShiftInt<AVX2::Kernels8>(248, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f);
TestMultiplyShiftInt<AVX2::Kernels8>(200, 256, 256, 0.0001f, 0.74f, 0.17f, 0.0001f);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE ("Multiply AVX512F 8bit Shift vs Int", "[Add127]") {
if (kCPU < CPUType::AVX512BW) return;
TestMultiplyShiftInt<AVX512BW::Kernels8>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.0001f);
TestMultiplyShiftInt<AVX512BW::Kernels8>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.0001f);
TestMultiplyShiftInt<AVX512BW::Kernels8>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.0001f);
TestMultiplyShiftInt<AVX512BW::Kernels8>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f);
TestMultiplyShiftInt<AVX512BW::Kernels8>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.0001f);
TestMultiplyShiftInt<AVX512BW::Kernels8>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f);
TestMultiplyShiftInt<AVX512BW::Kernels8>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.0001f);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
TEST_CASE ("Multiply AVX512VNNI 8bit Shift vs Int", "[Add127]") {
if (kCPU < CPUType::AVX512VNNI) return;
TestMultiplyShiftInt<AVX512VNNI::Kernels8>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.0001f);
TestMultiplyShiftInt<AVX512VNNI::Kernels8>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.0001f);
TestMultiplyShiftInt<AVX512VNNI::Kernels8>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.0001f);
TestMultiplyShiftInt<AVX512VNNI::Kernels8>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f);
TestMultiplyShiftInt<AVX512VNNI::Kernels8>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.0001f);
TestMultiplyShiftInt<AVX512VNNI::Kernels8>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f);
TestMultiplyShiftInt<AVX512VNNI::Kernels8>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.0001f);
}
#endif
} // namespace
} // namespace intgemm
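The arithmetic behind the Shift tests above: PrepareA for uint8_t produces a + 127 (as CompareAs checks), so Multiply8Shift accumulates sum_k (a_k + 127) * b_kj = sum_k a_k * b_kj + 127 * sum_k b_kj per output column. PrepareBias folds the 127 * sum_k b_kj correction, with the quantization scales applied (hence the -alpha*alpha/127 factor passed as unquant_mult_forprep), into the bias so the final result matches the signed product. Below is a scalar sketch of the underlying identity, ignoring the scale factors; all names are ours.

#include <cassert>
#include <cstddef>
#include <vector>

// Scalar model of the "+127 shift":
//   sum_k a_k * b_k == sum_k (a_k + 127) * b_k - 127 * sum_k b_k
// The per-column term 127 * sum_k b_k is what PrepareBias precomputes once.
int main() {
  const std::vector<int> a = {-3, 0, 5, 127};  // stands in for quantized int8 A
  const std::vector<int> b = {2, -7, 4, 3};    // stands in for one column of int8 B

  int reference = 0, shifted = 0, column_sum = 0;
  for (std::size_t k = 0; k < a.size(); ++k) {
    reference += a[k] * b[k];        // signed dot product we actually want
    shifted += (a[k] + 127) * b[k];  // what the unsigned "shifted" kernel sums
    column_sum += b[k];
  }
  assert(shifted - 127 * column_sum == reference);
  return 0;
}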


@ -1,66 +0,0 @@
#include "../test.h"
#include "../../intgemm/aligned.h"
#include "../../intgemm/kernels.h"
#include <numeric>
namespace intgemm {
template <CPUType CPUType_, typename ElemType_>
void kernel_add_bias_test() {
if (kCPU < CPUType_)
return;
using vec_t = vector_t<CPUType_, ElemType_>;
constexpr static auto VECTOR_LENGTH = sizeof(vec_t) / sizeof(ElemType_);
AlignedVector<ElemType_> input(VECTOR_LENGTH);
AlignedVector<ElemType_> bias(VECTOR_LENGTH);
AlignedVector<ElemType_> output(VECTOR_LENGTH);
std::iota(input.begin(), input.end(), static_cast<ElemType_>(0));
std::fill(bias.begin(), bias.end(), static_cast<ElemType_>(100));
*output.template as<vec_t>() = kernels::add_bias(*input.template as<vec_t>(), bias.begin(), 0);
for (std::size_t i = 0; i < output.size(); ++i)
CHECK(output[i] == ElemType_(100 + i));
}
template INTGEMM_SSE2 void kernel_add_bias_test<CPUType::SSE2, int8_t>();
template INTGEMM_SSE2 void kernel_add_bias_test<CPUType::SSE2, int16_t>();
template INTGEMM_SSE2 void kernel_add_bias_test<CPUType::SSE2, int>();
template INTGEMM_SSE2 void kernel_add_bias_test<CPUType::SSE2, float>();
template INTGEMM_SSE2 void kernel_add_bias_test<CPUType::SSE2, double>();
KERNEL_TEST_CASE("add_bias/int8 SSE2") { return kernel_add_bias_test<CPUType::SSE2, int8_t>(); }
KERNEL_TEST_CASE("add_bias/int16 SSE2") { return kernel_add_bias_test<CPUType::SSE2, int16_t>(); }
KERNEL_TEST_CASE("add_bias/int SSE2") { return kernel_add_bias_test<CPUType::SSE2, int>(); }
KERNEL_TEST_CASE("add_bias/float SSE2") { return kernel_add_bias_test<CPUType::SSE2, float>(); }
KERNEL_TEST_CASE("add_bias/double SSE2") { return kernel_add_bias_test<CPUType::SSE2, double>(); }
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
template INTGEMM_AVX2 void kernel_add_bias_test<CPUType::AVX2, int8_t>();
template INTGEMM_AVX2 void kernel_add_bias_test<CPUType::AVX2, int16_t>();
template INTGEMM_AVX2 void kernel_add_bias_test<CPUType::AVX2, int>();
template INTGEMM_AVX2 void kernel_add_bias_test<CPUType::AVX2, float>();
template INTGEMM_AVX2 void kernel_add_bias_test<CPUType::AVX2, double>();
KERNEL_TEST_CASE("add_bias/int8 AVX2") { return kernel_add_bias_test<CPUType::AVX2, int8_t>(); }
KERNEL_TEST_CASE("add_bias/int16 AVX2") { return kernel_add_bias_test<CPUType::AVX2, int16_t>(); }
KERNEL_TEST_CASE("add_bias/int AVX2") { return kernel_add_bias_test<CPUType::AVX2, int>(); }
KERNEL_TEST_CASE("add_bias/float AVX2") { return kernel_add_bias_test<CPUType::AVX2, float>(); }
KERNEL_TEST_CASE("add_bias/double AVX2") { return kernel_add_bias_test<CPUType::AVX2, double>(); }
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
template INTGEMM_AVX512BW void kernel_add_bias_test<CPUType::AVX512BW, int8_t>();
template INTGEMM_AVX512BW void kernel_add_bias_test<CPUType::AVX512BW, int16_t>();
template INTGEMM_AVX512BW void kernel_add_bias_test<CPUType::AVX512BW, int>();
template INTGEMM_AVX512BW void kernel_add_bias_test<CPUType::AVX512BW, float>();
template INTGEMM_AVX512BW void kernel_add_bias_test<CPUType::AVX512BW, double>();
KERNEL_TEST_CASE("add_bias/int8 AVX512BW") { return kernel_add_bias_test<CPUType::AVX512BW, int8_t>(); }
KERNEL_TEST_CASE("add_bias/int16 AVX512BW") { return kernel_add_bias_test<CPUType::AVX512BW, int16_t>(); }
KERNEL_TEST_CASE("add_bias/int AVX512BW") { return kernel_add_bias_test<CPUType::AVX512BW, int>(); }
KERNEL_TEST_CASE("add_bias/float AVX512BW") { return kernel_add_bias_test<CPUType::AVX512BW, float>(); }
KERNEL_TEST_CASE("add_bias/double AVX512BW") { return kernel_add_bias_test<CPUType::AVX512BW, double>(); }
#endif
}


@ -1,41 +0,0 @@
#include "../test.h"
#include "../../intgemm/aligned.h"
#include "../../intgemm/kernels.h"
#include <cstdlib>
#include <numeric>
namespace intgemm {
template <CPUType CPUType_>
void kernel_bitwise_not_test() {
if (kCPU < CPUType_)
return;
using vec_t = vector_t<CPUType_, int>;
constexpr static std::size_t VECTOR_LENGTH = sizeof(vec_t) / sizeof(int);
AlignedVector<int> input(VECTOR_LENGTH);
AlignedVector<int> output(VECTOR_LENGTH);
std::iota(input.begin(), input.end(), 0);
*output.template as<vec_t>() = kernels::bitwise_not(*input.template as<vec_t>());
for (std::size_t i = 0; i < output.size(); ++i)
CHECK(output[i] == ~input[i]);
}
template INTGEMM_SSE2 void kernel_bitwise_not_test<CPUType::SSE2>();
KERNEL_TEST_CASE("bitwise_not SSE2") { return kernel_bitwise_not_test<CPUType::SSE2>(); }
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
template INTGEMM_AVX2 void kernel_bitwise_not_test<CPUType::AVX2>();
KERNEL_TEST_CASE("bitwise_not AVX2") { return kernel_bitwise_not_test<CPUType::AVX2>(); }
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
template INTGEMM_AVX512BW void kernel_bitwise_not_test<CPUType::AVX512BW>();
KERNEL_TEST_CASE("bitwise_not AVX512BW") { return kernel_bitwise_not_test<CPUType::AVX512BW>(); }
#endif
}


@ -1,107 +0,0 @@
#include "../test.h"
#include "../../intgemm/aligned.h"
#include "../../intgemm/kernels.h"
#include <cstddef>
#include <numeric>
namespace intgemm {
template <CPUType CPUType_>
void kernel_downcast32to8_test() {
if (kCPU < CPUType_)
return;
using vi = vector_t<CPUType_, int>;
constexpr int LENGTH = sizeof(vi) / sizeof(int8_t);
AlignedVector<int32_t> input(LENGTH);
AlignedVector<int8_t> output(LENGTH);
std::iota(input.begin(), input.end(), static_cast<int32_t>(-LENGTH / 2));
*output.template as<vi>() = kernels::downcast32to8(
input.template as<vi>()[0], input.template as<vi>()[1],
input.template as<vi>()[2], input.template as<vi>()[3]);
for (std::size_t i = 0; i < output.size(); ++i)
CHECK(output[i] == int8_t(input[i]));
}
template INTGEMM_SSE2 void kernel_downcast32to8_test<CPUType::SSE2>();
KERNEL_TEST_CASE("downcast32to8 SSE2") { return kernel_downcast32to8_test<CPUType::SSE2>(); }
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
template INTGEMM_AVX2 void kernel_downcast32to8_test<CPUType::AVX2>();
KERNEL_TEST_CASE("downcast32to8 AVX2") { return kernel_downcast32to8_test<CPUType::AVX2>(); }
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
template INTGEMM_AVX512BW void kernel_downcast32to8_test<CPUType::AVX512BW>();
KERNEL_TEST_CASE("downcast32to8 AVX512BW") { return kernel_downcast32to8_test<CPUType::AVX512BW>(); }
#endif
template <CPUType CPUType_>
void kernel_downcast32to16_test() {
if (kCPU < CPUType_)
return;
using vi = vector_t<CPUType_, int>;
constexpr int LENGTH = sizeof(vi) / sizeof(int16_t);
AlignedVector<int32_t> input(LENGTH);
AlignedVector<int16_t> output(LENGTH);
std::iota(input.begin(), input.end(), static_cast<int32_t>(-LENGTH / 2));
*output.template as<vi>() = kernels::downcast32to16(
input.template as<vi>()[0], input.template as<vi>()[1]);
for (std::size_t i = 0; i < output.size(); ++i)
CHECK(output[i] == int16_t(input[i]));
}
template INTGEMM_SSE2 void kernel_downcast32to16_test<CPUType::SSE2>();
KERNEL_TEST_CASE("downcast32to16 SSE2") { return kernel_downcast32to16_test<CPUType::SSE2>(); }
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
template INTGEMM_AVX2 void kernel_downcast32to16_test<CPUType::AVX2>();
KERNEL_TEST_CASE("downcast32to16 AVX2") { return kernel_downcast32to16_test<CPUType::AVX2>(); }
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
template INTGEMM_AVX512BW void kernel_downcast32to16_test<CPUType::AVX512BW>();
KERNEL_TEST_CASE("downcast32to16 AVX512BW") { return kernel_downcast32to16_test<CPUType::AVX512BW>(); }
#endif
template <CPUType CPUType_>
void kernel_downcast16to8_test() {
if (kCPU < CPUType_)
return;
using vi = vector_t<CPUType_, int>;
constexpr int LENGTH = sizeof(vi) / sizeof(int8_t);
AlignedVector<int16_t> input(LENGTH);
AlignedVector<int8_t> output(LENGTH);
std::iota(input.begin(), input.end(), static_cast<int16_t>(-LENGTH / 2));
*output.template as<vi>() = kernels::downcast16to8(
input.template as<vi>()[0], input.template as<vi>()[1]);
for (std::size_t i = 0; i < output.size(); ++i)
CHECK(output[i] == int8_t(input[i]));
}
template INTGEMM_SSE2 void kernel_downcast16to8_test<CPUType::SSE2>();
KERNEL_TEST_CASE("downcast16to8 SSE2") { return kernel_downcast16to8_test<CPUType::SSE2>(); }
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
template INTGEMM_AVX2 void kernel_downcast16to8_test<CPUType::AVX2>();
KERNEL_TEST_CASE("downcast16to8 AVX2") { return kernel_downcast16to8_test<CPUType::AVX2>(); }
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
template INTGEMM_AVX512BW void kernel_downcast16to8_test<CPUType::AVX512BW>();
KERNEL_TEST_CASE("downcast16to8 AVX512BW") { return kernel_downcast16to8_test<CPUType::AVX512BW>(); }
#endif
}

38
third_party/intgemm/test/kernels/exp_test.cc vendored

@ -1,38 +0,0 @@
#include "../test.h"
#include "../../intgemm/aligned.h"
#include "../../intgemm/kernels.h"
#include <cstddef>
#include <numeric>
namespace intgemm {
template <CPUType CPUType_>
void kernel_exp_approx_taylor_test() {
if (kCPU < CPUType_)
return;
using vec_t = vector_t<CPUType_, float>;
constexpr static std::size_t VECTOR_LENGTH = sizeof(vec_t) / sizeof(float);
AlignedVector<float> input(VECTOR_LENGTH);
AlignedVector<float> output(VECTOR_LENGTH);
std::iota(input.begin(), input.end(), -static_cast<float>(VECTOR_LENGTH / 2));
*output.template as<vec_t>() = kernels::exp_approx_taylor(*input.template as<vec_t>());
for (std::size_t i = 0; i < output.size(); ++i)
CHECK_EPS(output[i], exp(input[i]), 0.001f);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
template INTGEMM_AVX2 void kernel_exp_approx_taylor_test<CPUType::AVX2>();
KERNEL_TEST_CASE("exp_approx_taylor AVX2") { return kernel_exp_approx_taylor_test<CPUType::AVX2>(); }
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
template INTGEMM_AVX512BW void kernel_exp_approx_taylor_test<CPUType::AVX512BW>();
KERNEL_TEST_CASE("exp_approx_taylor AVX512BW") { return kernel_exp_approx_taylor_test<CPUType::AVX512BW>(); }
#endif
}


@ -1,41 +0,0 @@
#include "../test.h"
#include "../../intgemm/aligned.h"
#include "../../intgemm/kernels.h"
#include <cstddef>
#include <numeric>
namespace intgemm {
template <CPUType CPUType_>
void kernel_floor_test() {
if (kCPU < CPUType_)
return;
using vec_t = vector_t<CPUType_, float>;
constexpr static std::size_t VECTOR_LENGTH = sizeof(vec_t) / sizeof(float);
AlignedVector<float> input(VECTOR_LENGTH);
AlignedVector<float> output(VECTOR_LENGTH);
std::iota(input.begin(), input.end(), -static_cast<float>(VECTOR_LENGTH / 2));
*output.template as<vec_t>() = kernels::floor(*input.template as<vec_t>());
for (std::size_t i = 0; i < output.size(); ++i)
CHECK(output[i] == std::floor(input[i]));
}
template INTGEMM_SSE2 void kernel_floor_test<CPUType::SSE2>();
KERNEL_TEST_CASE("floor SSE2") { return kernel_floor_test<CPUType::SSE2>(); }
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
template INTGEMM_AVX2 void kernel_floor_test<CPUType::AVX2>();
KERNEL_TEST_CASE("floor AVX2") { return kernel_floor_test<CPUType::AVX2>(); }
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
template INTGEMM_AVX512BW void kernel_floor_test<CPUType::AVX512BW>();
KERNEL_TEST_CASE("floor AVX512BW") { return kernel_floor_test<CPUType::AVX512BW>(); }
#endif
}


@ -1,67 +0,0 @@
#include "../test.h"
#include "../../intgemm/aligned.h"
#include "../../intgemm/kernels.h"
#include <cstdint>
#include <numeric>
namespace intgemm {
template <CPUType CPUType_, typename Type_>
void kernel_multiply_test() {
if (kCPU < CPUType_)
return;
using vec_t = vector_t<CPUType_, Type_>;
constexpr int VECTOR_LENGTH = sizeof(vec_t) / sizeof(Type_);
AlignedVector<Type_> input1(VECTOR_LENGTH);
AlignedVector<Type_> input2(VECTOR_LENGTH);
AlignedVector<Type_> output(VECTOR_LENGTH);
std::iota(input1.begin(), input1.end(), static_cast<Type_>(-VECTOR_LENGTH / 2));
std::iota(input2.begin(), input2.end(), static_cast<Type_>(-VECTOR_LENGTH / 3));
*output.template as<vec_t>() = kernels::multiply<Type_>(*input1.template as<vec_t>(), *input2.template as<vec_t>());
for (std::size_t i = 0; i < output.size(); ++i)
CHECK(output[i] == Type_(input1[i] * input2[i]));
}
template INTGEMM_SSE2 void kernel_multiply_test<CPUType::SSE2, int8_t>();
template INTGEMM_SSE2 void kernel_multiply_test<CPUType::SSE2, int16_t>();
template INTGEMM_SSE2 void kernel_multiply_test<CPUType::SSE2, int>();
template INTGEMM_SSE2 void kernel_multiply_test<CPUType::SSE2, float>();
template INTGEMM_SSE2 void kernel_multiply_test<CPUType::SSE2, double>();
KERNEL_TEST_CASE("multiply/int8 SSE2") { return kernel_multiply_test<CPUType::SSE2, int8_t>(); }
KERNEL_TEST_CASE("multiply/int16 SSE2") { return kernel_multiply_test<CPUType::SSE2, int16_t>(); }
KERNEL_TEST_CASE("multiply/int SSE2") { return kernel_multiply_test<CPUType::SSE2, int>(); }
KERNEL_TEST_CASE("multiply/float SSE2") { return kernel_multiply_test<CPUType::SSE2, float>(); }
KERNEL_TEST_CASE("multiply/double SSE2") { return kernel_multiply_test<CPUType::SSE2, double>(); }
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
template INTGEMM_AVX2 void kernel_multiply_test<CPUType::AVX2, int8_t>();
template INTGEMM_AVX2 void kernel_multiply_test<CPUType::AVX2, int16_t>();
template INTGEMM_AVX2 void kernel_multiply_test<CPUType::AVX2, int>();
template INTGEMM_AVX2 void kernel_multiply_test<CPUType::AVX2, float>();
template INTGEMM_AVX2 void kernel_multiply_test<CPUType::AVX2, double>();
KERNEL_TEST_CASE("multiply/int8 AVX2") { return kernel_multiply_test<CPUType::AVX2, int8_t>(); }
KERNEL_TEST_CASE("multiply/int16 AVX2") { return kernel_multiply_test<CPUType::AVX2, int16_t>(); }
KERNEL_TEST_CASE("multiply/int AVX2") { return kernel_multiply_test<CPUType::AVX2, int>(); }
KERNEL_TEST_CASE("multiply/float AVX2") { return kernel_multiply_test<CPUType::AVX2, float>(); }
KERNEL_TEST_CASE("multiply/double AVX2") { return kernel_multiply_test<CPUType::AVX2, double>(); }
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
template INTGEMM_AVX512BW void kernel_multiply_test<CPUType::AVX512BW, int8_t>();
template INTGEMM_AVX512BW void kernel_multiply_test<CPUType::AVX512BW, int16_t>();
template INTGEMM_AVX512BW void kernel_multiply_test<CPUType::AVX512BW, int>();
template INTGEMM_AVX512BW void kernel_multiply_test<CPUType::AVX512BW, float>();
template INTGEMM_AVX512BW void kernel_multiply_test<CPUType::AVX512BW, double>();
KERNEL_TEST_CASE("multiply/int8 AVX512BW") { return kernel_multiply_test<CPUType::AVX512BW, int8_t>(); }
KERNEL_TEST_CASE("multiply/int16 AVX512BW") { return kernel_multiply_test<CPUType::AVX512BW, int16_t>(); }
KERNEL_TEST_CASE("multiply/int AVX512BW") { return kernel_multiply_test<CPUType::AVX512BW, int>(); }
KERNEL_TEST_CASE("multiply/float AVX512BW") { return kernel_multiply_test<CPUType::AVX512BW, float>(); }
KERNEL_TEST_CASE("multiply/double AVX512BW") { return kernel_multiply_test<CPUType::AVX512BW, double>(); }
#endif
}


@ -1,41 +0,0 @@
#include "../test.h"
#include "../../intgemm/aligned.h"
#include "../../intgemm/kernels.h"
#include <numeric>
namespace intgemm {
template <CPUType CPUType_>
void kernel_quantize_test() {
if (kCPU < CPUType_)
return;
using input_vec_t = vector_t<CPUType_, float>;
using output_vec_t = vector_t<CPUType_, int>;
AlignedVector<float> input(sizeof(input_vec_t) / sizeof(float));
AlignedVector<int> output(sizeof(output_vec_t) / sizeof(int));
std::iota(input.begin(), input.end(), 0.0f);
auto quant_mult = set1_ps<input_vec_t>(2.f);
*output.template as<output_vec_t>() = kernels::quantize(*input.template as<input_vec_t>(), quant_mult);
for (std::size_t i = 0; i < output.size(); ++i)
CHECK(output[i] == int(i*2.f));
}
template INTGEMM_SSE2 void kernel_quantize_test<CPUType::SSE2>();
KERNEL_TEST_CASE("quantize SSE2") { return kernel_quantize_test<CPUType::SSE2>(); }
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
template INTGEMM_AVX2 void kernel_quantize_test<CPUType::AVX2>();
KERNEL_TEST_CASE("quantize AVX2") { return kernel_quantize_test<CPUType::AVX2>(); }
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
template INTGEMM_AVX512BW void kernel_quantize_test<CPUType::AVX512BW>();
KERNEL_TEST_CASE("quantize AVX512BW") { return kernel_quantize_test<CPUType::AVX512BW>(); }
#endif
}

65
third_party/intgemm/test/kernels/relu_test.cc vendored

@ -1,65 +0,0 @@
#include "../test.h"
#include "../../intgemm/aligned.h"
#include "../../intgemm/kernels.h"
#include <cstdint>
#include <numeric>
namespace intgemm {
template <CPUType CPUType_, typename ElemType_>
void kernel_relu_test() {
if (kCPU < CPUType_)
return;
using vec_t = vector_t<CPUType_, ElemType_>;
constexpr int VECTOR_LENGTH = sizeof(vec_t) / sizeof(ElemType_);
AlignedVector<ElemType_> input(VECTOR_LENGTH);
AlignedVector<ElemType_> output(VECTOR_LENGTH);
std::iota(input.begin(), input.end(), static_cast<ElemType_>(-VECTOR_LENGTH / 2));
*output.template as<vec_t>() = kernels::relu<ElemType_>(*input.template as<vec_t>());
for (std::size_t i = 0; i < output.size(); ++i)
CHECK(output[i] == (input[i] < 0 ? 0 : input[i]));
}
template INTGEMM_SSE2 void kernel_relu_test<CPUType::SSE2, int8_t>();
template INTGEMM_SSE2 void kernel_relu_test<CPUType::SSE2, int16_t>();
template INTGEMM_SSE2 void kernel_relu_test<CPUType::SSE2, int>();
template INTGEMM_SSE2 void kernel_relu_test<CPUType::SSE2, float>();
template INTGEMM_SSE2 void kernel_relu_test<CPUType::SSE2, double>();
KERNEL_TEST_CASE("relu/int8 SSE2") { return kernel_relu_test<CPUType::SSE2, int8_t>(); }
KERNEL_TEST_CASE("relu/int16 SSE2") { return kernel_relu_test<CPUType::SSE2, int16_t>(); }
KERNEL_TEST_CASE("relu/int SSE2") { return kernel_relu_test<CPUType::SSE2, int>(); }
KERNEL_TEST_CASE("relu/float SSE2") { return kernel_relu_test<CPUType::SSE2, float>(); }
KERNEL_TEST_CASE("relu/double SSE2") { return kernel_relu_test<CPUType::SSE2, double>(); }
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
template INTGEMM_AVX2 void kernel_relu_test<CPUType::AVX2, int8_t>();
template INTGEMM_AVX2 void kernel_relu_test<CPUType::AVX2, int16_t>();
template INTGEMM_AVX2 void kernel_relu_test<CPUType::AVX2, int>();
template INTGEMM_AVX2 void kernel_relu_test<CPUType::AVX2, float>();
template INTGEMM_AVX2 void kernel_relu_test<CPUType::AVX2, double>();
KERNEL_TEST_CASE("relu/int8 AVX2") { return kernel_relu_test<CPUType::AVX2, int8_t>(); }
KERNEL_TEST_CASE("relu/int16 AVX2") { return kernel_relu_test<CPUType::AVX2, int16_t>(); }
KERNEL_TEST_CASE("relu/int AVX2") { return kernel_relu_test<CPUType::AVX2, int>(); }
KERNEL_TEST_CASE("relu/float AVX2") { return kernel_relu_test<CPUType::AVX2, float>(); }
KERNEL_TEST_CASE("relu/double AVX2") { return kernel_relu_test<CPUType::AVX2, double>(); }
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
template INTGEMM_AVX512BW void kernel_relu_test<CPUType::AVX512BW, int8_t>();
template INTGEMM_AVX512BW void kernel_relu_test<CPUType::AVX512BW, int16_t>();
template INTGEMM_AVX512BW void kernel_relu_test<CPUType::AVX512BW, int>();
template INTGEMM_AVX512BW void kernel_relu_test<CPUType::AVX512BW, float>();
template INTGEMM_AVX512BW void kernel_relu_test<CPUType::AVX512BW, double>();
KERNEL_TEST_CASE("relu/int8 AVX512BW") { return kernel_relu_test<CPUType::AVX512BW, int8_t>(); }
KERNEL_TEST_CASE("relu/int16 AVX512BW") { return kernel_relu_test<CPUType::AVX512BW, int16_t>(); }
KERNEL_TEST_CASE("relu/int AVX512BW") { return kernel_relu_test<CPUType::AVX512BW, int>(); }
KERNEL_TEST_CASE("relu/float AVX512BW") { return kernel_relu_test<CPUType::AVX512BW, float>(); }
KERNEL_TEST_CASE("relu/double AVX512BW") { return kernel_relu_test<CPUType::AVX512BW, double>(); }
#endif
}


@ -1,43 +0,0 @@
#include "../test.h"
#include "../../intgemm/aligned.h"
#include "../../intgemm/kernels.h"
#include <cstdint>
#include <numeric>
namespace intgemm {
template <CPUType CPUType_>
void kernel_rescale_test() {
if (kCPU < CPUType_)
return;
using vi = vector_t<CPUType_, int>;
using vf = vector_t<CPUType_, float>;
constexpr int LENGTH = sizeof(vi) / sizeof(int);
AlignedVector<int32_t> input(LENGTH);
AlignedVector<int32_t> output(LENGTH);
std::iota(input.begin(), input.end(), static_cast<int32_t>(-LENGTH / 2));
float scale = 2;
*output.template as<vi>() = kernels::rescale(*input.template as<vi>(), intgemm::set1_ps<vf>(scale));
for (std::size_t i = 0; i < output.size(); ++i)
CHECK(output[i] == std::round(input[i] * scale));
}
template INTGEMM_SSE2 void kernel_rescale_test<CPUType::SSE2>();
KERNEL_TEST_CASE("rescale SSE2") { return kernel_rescale_test<CPUType::SSE2>(); }
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
template INTGEMM_AVX2 void kernel_rescale_test<CPUType::AVX2>();
KERNEL_TEST_CASE("rescale AVX2") { return kernel_rescale_test<CPUType::AVX2>(); }
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
template INTGEMM_AVX512BW void kernel_rescale_test<CPUType::AVX512BW>();
KERNEL_TEST_CASE("rescale AVX512BW") { return kernel_rescale_test<CPUType::AVX512BW>(); }
#endif
}


@ -1,45 +0,0 @@
#include "../test.h"
#include "../../intgemm/aligned.h"
#include "../../intgemm/kernels.h"
#include <cstddef>
#include <numeric>
namespace intgemm {
float sigmoid_ref(float x) {
if (x < 0)
return exp(x) / (1 + exp(x));
else
return 1 / (1 + exp(-x));
}
template <CPUType CPUType_>
void kernel_sigmoid_test() {
if (kCPU < CPUType_)
return;
using vec_t = vector_t<CPUType_, float>;
constexpr static std::size_t VECTOR_LENGTH = sizeof(vec_t) / sizeof(float);
AlignedVector<float> input(VECTOR_LENGTH);
AlignedVector<float> output(VECTOR_LENGTH);
std::iota(input.begin(), input.end(), -static_cast<float>(VECTOR_LENGTH / 2));
*output.template as<vec_t>() = kernels::sigmoid(*input.template as<vec_t>());
for (std::size_t i = 0; i < output.size(); ++i)
CHECK_EPS(output[i], sigmoid_ref(input[i]), 0.001f);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
template INTGEMM_AVX2 void kernel_sigmoid_test<CPUType::AVX2>();
KERNEL_TEST_CASE("sigmoid AVX2") { return kernel_sigmoid_test<CPUType::AVX2>(); }
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
template INTGEMM_AVX512BW void kernel_sigmoid_test<CPUType::AVX512BW>();
KERNEL_TEST_CASE("sigmoid AVX512BW") { return kernel_sigmoid_test<CPUType::AVX512BW>(); }
#endif
}

third_party/intgemm/test/kernels/tanh_test.cc (vendored)

@ -1,38 +0,0 @@
#include "../test.h"
#include "../../intgemm/aligned.h"
#include "../../intgemm/kernels.h"
#include <cstddef>
#include <numeric>
namespace intgemm {
template <CPUType CPUType_>
void kernel_tanh_test() {
if (kCPU < CPUType_)
return;
using vec_t = vector_t<CPUType_, float>;
constexpr static std::size_t VECTOR_LENGTH = sizeof(vec_t) / sizeof(float);
AlignedVector<float> input(VECTOR_LENGTH);
AlignedVector<float> output(VECTOR_LENGTH);
std::generate(input.begin(), input.end(), [] () { static int n = -int(VECTOR_LENGTH / 2); return n++ / float(VECTOR_LENGTH / 2); });
*output.template as<vec_t>() = kernels::tanh(*input.template as<vec_t>());
for (std::size_t i = 0; i < output.size(); ++i)
CHECK_EPS(output[i], tanh(input[i]), 0.001f);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
template INTGEMM_AVX2 void kernel_tanh_test<CPUType::AVX2>();
KERNEL_TEST_CASE("tanh AVX2") { return kernel_tanh_test<CPUType::AVX2>(); }
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
template INTGEMM_AVX512BW void kernel_tanh_test<CPUType::AVX512BW>();
KERNEL_TEST_CASE("tanh AVX512BW") { return kernel_tanh_test<CPUType::AVX512BW>(); }
#endif
}


@ -1,41 +0,0 @@
#include "../test.h"
#include "../../intgemm/aligned.h"
#include "../../intgemm/kernels.h"
#include <numeric>
namespace intgemm {
template <CPUType CPUType_>
void kernel_unquantize_test() {
if (kCPU < CPUType_)
return;
using input_vec_t = vector_t<CPUType_, int>;
using output_vec_t = vector_t<CPUType_, float>;
AlignedVector<int> input(sizeof(input_vec_t) / sizeof(int));
AlignedVector<float> output(sizeof(output_vec_t) / sizeof(float));
std::iota(input.begin(), input.end(), 0);
auto unquant_mult = set1_ps<output_vec_t>(0.5f);
*output.template as<output_vec_t>() = kernels::unquantize(*input.template as<input_vec_t>(), unquant_mult);
for (std::size_t i = 0; i < output.size(); ++i)
CHECK(output[i] == i * 0.5f);
}
template INTGEMM_SSE2 void kernel_unquantize_test<CPUType::SSE2>();
KERNEL_TEST_CASE("unquantize SSE2") { return kernel_unquantize_test<CPUType::SSE2>(); }
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
template INTGEMM_AVX2 void kernel_unquantize_test<CPUType::AVX2>();
KERNEL_TEST_CASE("unquantize AVX2") { return kernel_unquantize_test<CPUType::AVX2>(); }
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
template INTGEMM_AVX512BW void kernel_unquantize_test<CPUType::AVX512BW>();
KERNEL_TEST_CASE("unquantize AVX512BW") { return kernel_unquantize_test<CPUType::AVX512BW>(); }
#endif
}


@ -1,118 +0,0 @@
// This test triggers an internal compiler error in gcc 5.
#if defined(__OPTIMIZE__) || defined(__clang__) || defined(__INTEL_COMPILER) || !defined(__GNUC__) || (__GNUC__ != 5)
#include "../test.h"
#include "../../intgemm/aligned.h"
#include "../../intgemm/kernels.h"
#include <cstdint>
#include <numeric>
namespace intgemm {
template <CPUType CPUType_>
void kernel_upcast8to16_test() {
if (kCPU < CPUType_)
return;
using vi = vector_t<CPUType_, int>;
constexpr int LENGTH = sizeof(vi) / sizeof(int8_t);
AlignedVector<int8_t> input(LENGTH);
AlignedVector<int16_t> output(LENGTH);
std::iota(input.begin(), input.end(), static_cast<int8_t>(-LENGTH / 2));
auto result = kernels::upcast8to16(*input.template as<vi>());
output.template as<vi>()[0] = result.first;
output.template as<vi>()[1] = result.second;
for (std::size_t i = 0; i < output.size(); ++i)
CHECK(output[i] == int16_t(input[i]));
}
template INTGEMM_SSE2 void kernel_upcast8to16_test<CPUType::SSE2>();
KERNEL_TEST_CASE("upcast8to16 SSE2") { return kernel_upcast8to16_test<CPUType::SSE2>(); }
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
template INTGEMM_AVX2 void kernel_upcast8to16_test<CPUType::AVX2>();
KERNEL_TEST_CASE("upcast8to16 AVX2") { return kernel_upcast8to16_test<CPUType::AVX2>(); }
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
template INTGEMM_AVX512BW void kernel_upcast8to16_test<CPUType::AVX512BW>();
KERNEL_TEST_CASE("upcast8to16 AVX512BW") { return kernel_upcast8to16_test<CPUType::AVX512BW>(); }
#endif
template <CPUType CPUType_>
void kernel_upcast16to32_test() {
if (kCPU < CPUType_)
return;
using vi = vector_t<CPUType_, int>;
constexpr int LENGTH = sizeof(vi) / sizeof(int16_t);
AlignedVector<int16_t> input(LENGTH);
AlignedVector<int32_t> output(LENGTH);
std::iota(input.begin(), input.end(), static_cast<int16_t>(-LENGTH / 2));
auto result = kernels::upcast16to32(*input.template as<vi>());
output.template as<vi>()[0] = result.first;
output.template as<vi>()[1] = result.second;
for (std::size_t i = 0; i < output.size(); ++i)
CHECK(output[i] == int32_t(input[i]));
}
template INTGEMM_SSE2 void kernel_upcast16to32_test<CPUType::SSE2>();
KERNEL_TEST_CASE("upcast16to32 SSE2") { return kernel_upcast16to32_test<CPUType::SSE2>(); }
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
template INTGEMM_AVX2 void kernel_upcast16to32_test<CPUType::AVX2>();
KERNEL_TEST_CASE("upcast16to32 AVX2") { return kernel_upcast16to32_test<CPUType::AVX2>(); }
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
template INTGEMM_AVX512BW void kernel_upcast16to32_test<CPUType::AVX512BW>();
KERNEL_TEST_CASE("upcast16to32 AVX512BW") { return kernel_upcast16to32_test<CPUType::AVX512BW>(); }
#endif
template <CPUType CPUType_>
void kernel_upcast8to32_test() {
if (kCPU < CPUType_)
return;
using vi = vector_t<CPUType_, int>;
constexpr int LENGTH = sizeof(vi) / sizeof(int8_t);
AlignedVector<int8_t> input(LENGTH);
AlignedVector<int32_t> output(LENGTH);
std::iota(input.begin(), input.end(), static_cast<int8_t>(-LENGTH / 2));
auto result = kernels::upcast8to32(*input.template as<vi>());
output.template as<vi>()[0] = result.first;
output.template as<vi>()[1] = result.second;
output.template as<vi>()[2] = result.third;
output.template as<vi>()[3] = result.fourth;
for (std::size_t i = 0; i < output.size(); ++i)
CHECK(output[i] == int32_t(input[i]));
}
template INTGEMM_SSE2 void kernel_upcast8to32_test<CPUType::SSE2>();
KERNEL_TEST_CASE("upcast8to32 SSE2") { return kernel_upcast8to32_test<CPUType::SSE2>(); }
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
template INTGEMM_AVX2 void kernel_upcast8to32_test<CPUType::AVX2>();
KERNEL_TEST_CASE("upcast8to32 AVX2") { return kernel_upcast8to32_test<CPUType::AVX2>(); }
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
template INTGEMM_AVX512BW void kernel_upcast8to32_test<CPUType::AVX512BW>();
KERNEL_TEST_CASE("upcast8to32 AVX512BW") { return kernel_upcast8to32_test<CPUType::AVX512BW>(); }
#endif
}
#endif
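For reference, the widening contract all three upcast tests check is simply sign extension with element order preserved; a minimal scalar sketch (upcast8to16_ref is an illustrative name, not a symbol from either library):

#include <cstddef>
#include <cstdint>
#include <vector>

// Each int8_t element is sign-extended to int16_t in place, e.g. -5 -> -5;
// the 16->32 and 8->32 variants follow the same pattern with wider types.
std::vector<int16_t> upcast8to16_ref(const std::vector<int8_t>& in) {
  std::vector<int16_t> out(in.size());
  for (std::size_t i = 0; i < in.size(); ++i)
    out[i] = static_cast<int16_t>(in[i]);
  return out;
}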


@ -1,65 +0,0 @@
#include "../test.h"
#include "../../intgemm/aligned.h"
#include "../../intgemm/kernels.h"
#include <cstddef>
#include <numeric>
namespace intgemm {
template <CPUType CPUType_, typename ElemType_>
void kernel_write_test() {
if (kCPU < CPUType_)
return;
using vec_t = vector_t<CPUType_, ElemType_>;
constexpr static std::size_t VECTOR_LENGTH = sizeof(vec_t) / sizeof(ElemType_);
AlignedVector<ElemType_> input(VECTOR_LENGTH);
AlignedVector<ElemType_> output(VECTOR_LENGTH);
std::iota(input.begin(), input.end(), static_cast<ElemType_>(0));
kernels::write(*input.template as<vec_t>(), output.begin(), 0);
for (std::size_t i = 0; i < VECTOR_LENGTH; ++i)
CHECK(output[i] == ElemType_(i));
}
template INTGEMM_SSE2 void kernel_write_test<CPUType::SSE2, int8_t>();
template INTGEMM_SSE2 void kernel_write_test<CPUType::SSE2, int16_t>();
template INTGEMM_SSE2 void kernel_write_test<CPUType::SSE2, int>();
template INTGEMM_SSE2 void kernel_write_test<CPUType::SSE2, float>();
template INTGEMM_SSE2 void kernel_write_test<CPUType::SSE2, double>();
KERNEL_TEST_CASE("write/int8 SSE2") { return kernel_write_test<CPUType::SSE2, int8_t>(); }
KERNEL_TEST_CASE("write/int16 SSE2") { return kernel_write_test<CPUType::SSE2, int16_t>(); }
KERNEL_TEST_CASE("write/int SSE2") { return kernel_write_test<CPUType::SSE2, int>(); }
KERNEL_TEST_CASE("write/float SSE2") { return kernel_write_test<CPUType::SSE2, float>(); }
KERNEL_TEST_CASE("write/double SSE2") { return kernel_write_test<CPUType::SSE2, double>(); }
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
template INTGEMM_AVX2 void kernel_write_test<CPUType::AVX2, int8_t>();
template INTGEMM_AVX2 void kernel_write_test<CPUType::AVX2, int16_t>();
template INTGEMM_AVX2 void kernel_write_test<CPUType::AVX2, int>();
template INTGEMM_AVX2 void kernel_write_test<CPUType::AVX2, float>();
template INTGEMM_AVX2 void kernel_write_test<CPUType::AVX2, double>();
KERNEL_TEST_CASE("write/int8 AVX2") { return kernel_write_test<CPUType::AVX2, int8_t>(); }
KERNEL_TEST_CASE("write/int16 AVX2") { return kernel_write_test<CPUType::AVX2, int16_t>(); }
KERNEL_TEST_CASE("write/int AVX2") { return kernel_write_test<CPUType::AVX2, int>(); }
KERNEL_TEST_CASE("write/float AVX2") { return kernel_write_test<CPUType::AVX2, float>(); }
KERNEL_TEST_CASE("write/double AVX2") { return kernel_write_test<CPUType::AVX2, double>(); }
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
template INTGEMM_AVX512BW void kernel_write_test<CPUType::AVX512BW, int8_t>();
template INTGEMM_AVX512BW void kernel_write_test<CPUType::AVX512BW, int16_t>();
template INTGEMM_AVX512BW void kernel_write_test<CPUType::AVX512BW, int>();
template INTGEMM_AVX512BW void kernel_write_test<CPUType::AVX512BW, float>();
template INTGEMM_AVX512BW void kernel_write_test<CPUType::AVX512BW, double>();
KERNEL_TEST_CASE("write/int8 AVX512BW") { return kernel_write_test<CPUType::AVX512BW, int8_t>(); }
KERNEL_TEST_CASE("write/int16 AVX512BW") { return kernel_write_test<CPUType::AVX512BW, int16_t>(); }
KERNEL_TEST_CASE("write/int AVX512BW") { return kernel_write_test<CPUType::AVX512BW, int>(); }
KERNEL_TEST_CASE("write/float AVX512BW") { return kernel_write_test<CPUType::AVX512BW, float>(); }
KERNEL_TEST_CASE("write/double AVX512BW") { return kernel_write_test<CPUType::AVX512BW, double>(); }
#endif
}

third_party/intgemm/test/multiply_test.cc (vendored)

@ -1,761 +0,0 @@
#include "test.h"
#include "../intgemm/aligned.h"
#include "../intgemm/callbacks.h"
#include "../intgemm/interleave.h"
#include "../intgemm/intgemm.h"
#include "../intgemm/multiply.h"
#include "../intgemm/stats.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <memory>
#include <numeric>
#include <random>
namespace intgemm {
#ifndef __INTEL_COMPILER
INTGEMM_SSE2
#endif
TEST_CASE("Transpose 16", "[transpose]") {
if (kCPU < CPUType::SSE2) return;
const unsigned N = 8;
AlignedVector<int16_t> input(N * N);
std::iota(input.begin(), input.end(), static_cast<int16_t>(0));
AlignedVector<int16_t> ref(N * N);
references::Transpose(input.begin(), ref.begin(), N, N);
// Overwrite input.
__m128i *t = input.as<__m128i>();
Transpose16InLane(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7]);
for (std::size_t i = 0; i < input.size(); ++i) {
CHECK_MESSAGE(ref[i] == input[i], "16-bit transpose failure at: " << i << ": " << ref[i] << " != " << input[i]);
}
}
#ifndef __INTEL_COMPILER
INTGEMM_SSSE3
#endif
TEST_CASE("Transpose 8", "[transpose]") {
if (kCPU < CPUType::SSSE3) return;
const unsigned N = 16;
AlignedVector<int8_t> input(N * N);
std::iota(input.begin(), input.end(), static_cast<int8_t>(0));
AlignedVector<int8_t> ref(input.size());
references::Transpose(input.begin(), ref.begin(), N, N);
// Overwrite input.
__m128i *t = input.as<__m128i>();
Transpose8InLane(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], t[8], t[9], t[10], t[11], t[12], t[13], t[14], t[15]);
for (std::size_t i = 0; i < input.size(); ++i) {
CHECK_MESSAGE(ref[i] == input[i], "8-bit transpose failure at " << i << ": " << (int16_t)ref[i] << " != " << (int16_t)input[i]);
}
}
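references::Transpose, the ground truth for both transpose tests, lives in the removed test helpers and is not shown in this diff; its behaviour is presumably the plain element swap sketched below (transpose_ref is an illustrative name):

// Row/column swap of a rows x cols tile; Transpose16InLane and
// Transpose8InLane must reproduce this layout within SSE lanes.
template <class T>
void transpose_ref(const T* in, T* out, unsigned rows, unsigned cols) {
  for (unsigned r = 0; r < rows; ++r)
    for (unsigned c = 0; c < cols; ++c)
      out[c * rows + r] = in[r * cols + c];
}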
template <class Routine> void TestPrepare(Index rows = 32, Index cols = 16) {
std::mt19937 gen;
// Go somewhat out of range too.
std::uniform_real_distribution<float> dist(-129.0, 129.0);
// Create array.
AlignedVector<float> input(rows * cols);
for (auto& it : input) {
it = dist(gen);
}
using Integer = typename Routine::Integer;
// Call Prepare
AlignedVector<Integer> test(input.size());
Routine::PrepareB(input.begin(), test.begin(), 1, rows, cols);
// Compute reference output.
AlignedVector<Integer> quantized(input.size());
Routine::Quantize(input.begin(), quantized.begin(), 1, static_cast<Index>(input.size()));
AlignedVector<Integer> reference(input.size());
// Note this won't work for Int8/Int16 generic routines because tile sizes vary.
references::Rearragement(quantized.begin(), reference.begin(), Routine::kBTileRow, Routine::kBTileCol, rows, cols);
CHECK_MESSAGE(memcmp(reference.begin(), test.begin(), test.size() * sizeof(Integer)) == 0, Routine::kName << " Mismatch:\n" <<
"Quantized Input" << '\n' << PrintMatrix(quantized.begin(), rows, cols) << "Reference" << '\n' <<
PrintMatrix(reference.begin(), rows, cols) << "Routine" << '\n' << PrintMatrix(test.begin(), rows, cols));
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE("Prepare AVX512", "[prepare]") {
if (kCPU < CPUType::AVX512BW) return;
TestPrepare<AVX512BW::Kernels8>(64, 8);
TestPrepare<AVX512BW::Kernels8>(256, 32);
TestPrepare<AVX512BW::Kernels16>(64, 8);
TestPrepare<AVX512BW::Kernels16>(256, 32);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
TEST_CASE("Prepare AVX2", "[prepare]") {
if (kCPU < CPUType::AVX2) return;
TestPrepare<AVX2::Kernels8>(64, 32);
TestPrepare<AVX2::Kernels16>(64, 32);
}
#endif
TEST_CASE("Prepare SSSE3", "[prepare]") {
if (kCPU < CPUType::SSSE3) return;
TestPrepare<SSSE3::Kernels8>(16, 8);
TestPrepare<SSSE3::Kernels8>(32, 16);
TestPrepare<SSSE3::Kernels8>(32, 32);
}
TEST_CASE("Prepare SSE2", "[prepare]") {
if (kCPU < CPUType::SSE2) return;
TestPrepare<SSE2::Kernels16>(8, 8);
TestPrepare<SSE2::Kernels16>(32, 32);
}
template <class Routine> void TestSelectColumnsB(Index rows = 64, Index cols = 16) {
std::mt19937 gen;
// Go somewhat out of range too.
std::uniform_real_distribution<float> dist(-129.0, 129.0);
AlignedVector<float> input(rows * cols);
for (auto& it : input) {
it = dist(gen);
}
using Integer = typename Routine::Integer;
AlignedVector<Integer> prepared(input.size());
Routine::PrepareB(input.begin(), prepared.begin(), 1, rows, cols);
const int kSelectCols = 24;
Index select_cols[kSelectCols];
std::uniform_int_distribution<Index> col_dist(0, cols - 1);
for (auto& it : select_cols) {
it = col_dist(gen);
}
AlignedVector<Integer> test(rows * kSelectCols);
Routine::SelectColumnsB(prepared.begin(), test.begin(), rows, select_cols, select_cols + kSelectCols);
// Select columns manually in float space.
AlignedVector<float> selected(rows * kSelectCols);
for (Index r = 0; r < rows; ++r) {
for (int c = 0; c < kSelectCols; ++c) {
assert(c + r * kSelectCols < rows * kSelectCols);
selected[c + r * kSelectCols] = input[select_cols[c] + r * cols];
}
}
AlignedVector<Integer> ref(rows * kSelectCols);
Routine::PrepareB(selected.begin(), ref.begin(), 1, rows, kSelectCols);
CHECK_MESSAGE(memcmp(ref.begin(), test.begin(), sizeof(Integer) * rows * kSelectCols) == 0, "Reference:\n" <<
PrintMatrix(ref.begin(), rows, kSelectCols) << PrintMatrix(test.begin(), rows, kSelectCols));
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE("SelectColumnsB AVX512", "[select]") {
if (kCPU < CPUType::AVX512BW) return;
TestSelectColumnsB<AVX512BW::Kernels8>();
TestSelectColumnsB<AVX512BW::Kernels16>(256, 256);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
TEST_CASE("SelectColumnsB AVX2", "[select]") {
if (kCPU < CPUType::AVX2) return;
TestSelectColumnsB<AVX2::Kernels8>(256, 256);
TestSelectColumnsB<AVX2::Kernels16>(256, 256);
}
#endif
TEST_CASE("SelectColumnsB SSSE3", "[select]") {
if (kCPU < CPUType::SSSE3) return;
TestSelectColumnsB<SSSE3::Kernels8>();
TestSelectColumnsB<SSSE3::Kernels8>(256, 256);
}
TEST_CASE("SelectColumnsB SSE2", "[select]") {
if (kCPU < CPUType::SSE2) return;
TestSelectColumnsB<SSE2::Kernels16>();
TestSelectColumnsB<SSE2::Kernels16>(256, 256);
}
template <class Register> void TestMax() {
Register r = set1_ps<Register>(-2.0);
for (std::size_t i = 0; i < sizeof(Register) / sizeof(float); ++i) {
Register c = r;
reinterpret_cast<float*>(&c)[i] = -1.0;
CHECK_MESSAGE((MaxFloat32(c) == -1.0), "MaxFloat32 produced " << MaxFloat32(c));
}
}
TEST_CASE("Max", "[max]") {
TestMax<__m128>();
}
void CompareMaxAbs(const float *begin, const float *end, float test, std::size_t offset) {
float largest = std::fabs(*std::max_element(begin, end));
float smallest = std::fabs(*std::min_element(begin, end));
largest = std::max(largest, smallest);
CHECK_MESSAGE(largest == test, "Error: " << largest << " versus " << test << " in length " << (end - begin) << " offset " << offset);
}
template <float (*Backend) (const float *, const float *)> void TestMaxAbsolute() {
std::mt19937 gen;
std::uniform_real_distribution<float> dist(-8.0, 8.0);
const std::size_t kLengthMax = 65;
AlignedVector<float> test(kLengthMax);
for (std::size_t len = 1; len < kLengthMax; ++len) {
for (std::size_t t = 0; t < len; ++t) {
// Fill with [-8, 8).
for (auto& it : test) {
it = dist(gen);
}
CompareMaxAbs(test.begin(), test.begin() + len, Backend(test.begin(), test.begin() + len), t);
test[t] = -32.0;
CompareMaxAbs(test.begin(), test.begin() + len, Backend(test.begin(), test.begin() + len), t);
test[t] = 32.0;
CompareMaxAbs(test.begin(), test.begin() + len, Backend(test.begin(), test.begin() + len), t);
}
}
}
TEST_CASE("MaxAbsolute SSE2", "[max]") {
if (kCPU < CPUType::SSE2) return;
TestMaxAbsolute<SSE2::MaxAbsolute>();
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
TEST_CASE("MaxAbsolute AVX2", "[max]") {
if (kCPU < CPUType::AVX2) return;
TestMaxAbsolute<AVX2::MaxAbsolute>();
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE("MaxAbsolute AVX512BW", "[max]") {
if (kCPU < CPUType::AVX512BW) return;
TestMaxAbsolute<AVX512BW::MaxAbsolute>();
}
#endif
// Based on https://arxiv.org/abs/1705.01991
// Copyright (c) 2017 Microsoft Corporation
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
// Compute A*B slowly in floats.
template <class Routine> void TestMultiply(Index A_rows, Index width, Index B_cols,
float int_tolerance=.1, float float_tolerance=1, float MSE_float_tolerance=0, float MSE_int_tolerance=0) {
using Integer = typename Routine::Integer;
std::ostringstream info;
info << Routine::kName << "\t" << A_rows << '\t' << width << '\t' << B_cols << '\n';
// Initialize A and B.
AlignedVector<float> A(A_rows * width);
AlignedVector<float> B(width * B_cols);
std::mt19937 gen;
std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
for (auto& it : A) {
it = dist(gen);
}
for (auto& it : B) {
it = dist(gen);
}
float quant_mult = (sizeof(Integer) == 2) ? 1024 : 64;
float unquant_mult = 1.0f / (quant_mult*quant_mult);
AlignedVector<Integer> A_prep(A.size());
AlignedVector<Integer> B_prep(B.size());
Routine::PrepareA(A.begin(), A_prep.begin(), quant_mult, A_rows, width);
Routine::PrepareB(B.begin(), B_prep.begin(), quant_mult, width, B_cols);
AlignedVector<float> test_C(A_rows * B_cols);
OMPParallelWrap<callbacks::UnquantizeAndWrite, Routine>(A_prep.begin(), B_prep.begin(), A_rows, width, B_cols, callbacks::UnquantizeAndWrite(unquant_mult, test_C.begin()));
// Routine::Multiply(A_prep.begin(), B_prep.begin(), A_rows, width, B_cols, callbacks::Sequence(
// callbacks::Unquantize(unquant_mult),
// callbacks::Write<float>(test_C.begin())
// ));
AlignedVector<Integer> B_quant(B.size());
Routine::Quantize(B.begin(), B_quant.begin(), quant_mult, static_cast<Index>(B.size()));
AlignedVector<float> slowint_C(test_C.size());
// Assuming A is just quantization here.
references::Multiply(A_prep.begin(), B_quant.begin(), slowint_C.begin(), A_rows, width, B_cols, [&](int32_t sum, const callbacks::OutputBufferInfo&) {
return sum * unquant_mult;
});
AlignedVector<float> float_C(test_C.size());
references::Multiply(A.begin(), B.begin(), float_C.begin(), A_rows, width, B_cols, [&](double sum, const callbacks::OutputBufferInfo&) {
return static_cast<float>(sum);
});
CompareMSE(float_C.begin(), slowint_C.begin(), test_C.begin(), test_C.size(), info.str(),
int_tolerance, float_tolerance, MSE_float_tolerance, MSE_int_tolerance);
}
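The unquant_mult above follows from both operands having been scaled before the integer multiply, so every int32 accumulator carries a factor of quant_mult squared that has to be divided back out. A minimal worked example, illustration only (unquant_example is not a symbol from either library):

#include <cmath>
#include <cstdint>

inline float unquant_example() {
  float quant_mult = 64.0f;                                        // the 8-bit case used above
  float a = 0.5f, b = -0.25f;
  int32_t qa = static_cast<int32_t>(std::lround(a * quant_mult));  //  32
  int32_t qb = static_cast<int32_t>(std::lround(b * quant_mult));  // -16
  float unquant_mult = 1.0f / (quant_mult * quant_mult);
  return qa * qb * unquant_mult;                                   // -512 / 4096 = -0.125 = a * b
}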
template <class Routine> void TestMultiplyRelu(Index A_rows, Index width, Index B_cols,
float int_tolerance=.1, float float_tolerance=1, float MSE_float_tolerance=0, float MSE_int_tolerance=0) {
using Integer = typename Routine::Integer;
std::ostringstream info;
info << Routine::kName << "\t" << A_rows << '\t' << width << '\t' << B_cols << '\n';
// Initialize A and B.
AlignedVector<float> A(A_rows * width);
AlignedVector<float> B(width * B_cols);
std::mt19937 gen;
std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
for (auto& it : A) {
it = dist(gen);
}
for (auto& it : B) {
it = dist(gen);
}
float quant_mult = (sizeof(Integer) == 2) ? 1024 : 64;
float unquant_mult = 1.0f / (quant_mult*quant_mult);
AlignedVector<Integer> A_prep(A.size());
AlignedVector<Integer> B_prep(B.size());
Routine::PrepareA(A.begin(), A_prep.begin(), quant_mult, A_rows, width);
Routine::PrepareB(B.begin(), B_prep.begin(), quant_mult, width, B_cols);
AlignedVector<float> test_C(A_rows * B_cols);
OMPParallelWrap<callbacks::UnquantizeAndWriteRelu, Routine>(A_prep.begin(), B_prep.begin(), A_rows, width, B_cols, callbacks::UnquantizeAndWriteRelu(unquant_mult, test_C.begin()));
// Routine::Multiply(A_prep.begin(), B_prep.begin(), A_rows, width, B_cols, callbacks::Sequence(
// callbacks::Unquantize(unquant_mult),
// callbacks::Write<float>(test_C.begin())
// ));
AlignedVector<Integer> B_quant(B.size());
Routine::Quantize(B.begin(), B_quant.begin(), quant_mult, static_cast<Index>(B.size()));
AlignedVector<float> slowint_C(test_C.size());
// Assuming A is just quantization here.
references::Multiply(A_prep.begin(), B_quant.begin(), slowint_C.begin(), A_rows, width, B_cols, [&](int32_t sum, const callbacks::OutputBufferInfo&) {
float ret = std::max(0.0f, sum * unquant_mult);
return ret;
});
AlignedVector<float> float_C(test_C.size());
references::Multiply(A.begin(), B.begin(), float_C.begin(), A_rows, width, B_cols, [&](double sum, const callbacks::OutputBufferInfo&) {
return static_cast<float>(std::max(0.0,sum));
});
CompareMSE(float_C.begin(), slowint_C.begin(), test_C.begin(), test_C.size(), info.str(),
int_tolerance, float_tolerance, MSE_float_tolerance, MSE_int_tolerance);
}
//Code duplication may be avoided through some use of variadic templates, as the different WriteC symbols
//require a different number of arguments. I don't think the refactoring is worth it.
template <class Routine> void TestMultiplyBias(Index A_rows, Index width, Index B_cols,
float int_tolerance = 0.1f, float float_tolerance = 1.0f, float MSE_float_tolerance = 0.0f, float MSE_int_tolerance = 0.0f) {
using Integer = typename Routine::Integer;
std::ostringstream info;
info << Routine::kName << "\t" << A_rows << '\t' << width << '\t' << B_cols << '\n';
// Initialize A and B.
AlignedVector<float> A(A_rows * width);
AlignedVector<float> B(width * B_cols);
AlignedVector<float> bias(B_cols);
std::mt19937 gen;
std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
for (auto& it : A) {
it = dist(gen);
}
for (auto& it : B) {
it = dist(gen);
}
for (auto& it : bias) {
it = dist(gen);
}
float quant_mult = (sizeof(Integer) == 2) ? 1024 : 64;
float unquant_mult = 1.0f / (quant_mult*quant_mult);
AlignedVector<Integer> A_prep(A.size());
AlignedVector<Integer> B_prep(B.size());
Routine::PrepareA(A.begin(), A_prep.begin(), quant_mult, A_rows, width);
Routine::PrepareB(B.begin(), B_prep.begin(), quant_mult, width, B_cols);
AlignedVector<float> test_C(A_rows * B_cols);
Routine::Multiply(A_prep.begin(), B_prep.begin(), A_rows, width, B_cols, callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult, bias.begin(), test_C.begin()));
AlignedVector<Integer> B_quant(B.size());
Routine::Quantize(B.begin(), B_quant.begin(), quant_mult, static_cast<Index>(B.size()));
AlignedVector<float> slowint_C(test_C.size());
// Assuming A is just quantization here.
references::Multiply(A_prep.begin(), B_quant.begin(), slowint_C.begin(), A_rows, width, B_cols, [&](int32_t sum, const callbacks::OutputBufferInfo& info) {
return sum * unquant_mult + bias[info.col_idx];
});
AlignedVector<float> float_C(test_C.size());
references::Multiply(A.begin(), B.begin(), float_C.begin(), A_rows, width, B_cols, [&](double sum, const callbacks::OutputBufferInfo& info) {
return static_cast<float>(sum) + bias[info.col_idx];
});
CompareMSE(float_C.begin(), slowint_C.begin(), test_C.begin(), test_C.size(), info.str(),
int_tolerance, float_tolerance, MSE_float_tolerance, MSE_int_tolerance);
}
template <class Routine> void TestMultiplyBiasRelu(Index A_rows, Index width, Index B_cols,
float int_tolerance = 0.1f, float float_tolerance = 1.0f, float MSE_float_tolerance = 0.0f, float MSE_int_tolerance = 0.0f) {
using Integer = typename Routine::Integer;
std::ostringstream info;
info << Routine::kName << "\t" << A_rows << '\t' << width << '\t' << B_cols << '\n';
// Initialize A and B.
AlignedVector<float> A(A_rows * width);
AlignedVector<float> B(width * B_cols);
AlignedVector<float> bias(B_cols);
std::mt19937 gen;
std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
for (auto& it : A) {
it = dist(gen);
}
for (auto& it : B) {
it = dist(gen);
}
for (auto& it : bias) {
it = dist(gen);
}
float quant_mult = (sizeof(Integer) == 2) ? 1024 : 64;
float unquant_mult = 1.0f / (quant_mult*quant_mult);
AlignedVector<Integer> A_prep(A.size());
AlignedVector<Integer> B_prep(B.size());
Routine::PrepareA(A.begin(), A_prep.begin(), quant_mult, A_rows, width);
Routine::PrepareB(B.begin(), B_prep.begin(), quant_mult, width, B_cols);
AlignedVector<float> test_C(A_rows * B_cols);
Routine::Multiply(A_prep.begin(), B_prep.begin(), A_rows, width, B_cols, callbacks::UnquantizeAndAddBiasAndWriteRelu(unquant_mult, bias.begin(), test_C.begin()));
AlignedVector<Integer> B_quant(B.size());
Routine::Quantize(B.begin(), B_quant.begin(), quant_mult, static_cast<Index>(B.size()));
AlignedVector<float> slowint_C(test_C.size());
// Assuming A is just quantization here.
references::Multiply(A_prep.begin(), B_quant.begin(), slowint_C.begin(), A_rows, width, B_cols, [&](int32_t sum, const callbacks::OutputBufferInfo& info) {
return std::max(0.0f, sum * unquant_mult + bias[info.col_idx]);
});
AlignedVector<float> float_C(test_C.size());
references::Multiply(A.begin(), B.begin(), float_C.begin(), A_rows, width, B_cols, [&](double sum, const callbacks::OutputBufferInfo& info) {
return std::max(0.0f, static_cast<float>(sum) + bias[info.col_idx]);
});
CompareMSE(float_C.begin(), slowint_C.begin(), test_C.begin(), test_C.size(), info.str(),
int_tolerance, float_tolerance, MSE_float_tolerance, MSE_int_tolerance);
}
TEST_CASE ("Multiply SSE2 16bit", "[multiply]") {
if (kCPU < CPUType::SSE2) return;
TestMultiply<SSE2::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
TestMultiply<SSE2::Kernels16>(8, 2048, 256, .1f, 1, 0.02f);
TestMultiply<SSE2::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
TestMultiply<SSE2::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
TestMultiply<SSE2::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
TestMultiply<SSE2::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
}
TEST_CASE ("Multiply SSE2 16bit with relu", "[multiply_relu]") {
if (kCPU < CPUType::SSE2) return;
TestMultiplyRelu<SSE2::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
TestMultiplyRelu<SSE2::Kernels16>(8, 2048, 256, .1f, 1, 0.02f);
TestMultiplyRelu<SSE2::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
TestMultiplyRelu<SSE2::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
TestMultiplyRelu<SSE2::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
TestMultiplyRelu<SSE2::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
}
TEST_CASE ("Multiply SSE2 16bit with bias", "[biased_multiply]") {
if (kCPU < CPUType::SSE2) return;
TestMultiplyBias<SSE2::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
TestMultiplyBias<SSE2::Kernels16>(8, 2048, 256, .1f, 1, 0.02f);
TestMultiplyBias<SSE2::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
TestMultiplyBias<SSE2::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
TestMultiplyBias<SSE2::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
TestMultiplyBias<SSE2::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
}
TEST_CASE ("Multiply SSE2 16bit with bias and relu", "[biased_multiply_relu]") {
if (kCPU < CPUType::SSE2) return;
TestMultiplyBiasRelu<SSE2::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
TestMultiplyBiasRelu<SSE2::Kernels16>(8, 2048, 256, .1f, 1, 0.02f);
TestMultiplyBiasRelu<SSE2::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
TestMultiplyBiasRelu<SSE2::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
TestMultiplyBiasRelu<SSE2::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
TestMultiplyBiasRelu<SSE2::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
}
TEST_CASE ("Multiply SSSE3 8bit", "[multiply]") {
if (kCPU < CPUType::SSSE3) return;
TestMultiply<SSSE3::Kernels8>(8, 256, 256, 1.2f, 1.2f, 0.064f, 0.026f);
TestMultiply<SSSE3::Kernels8>(8, 2048, 256, 33, 33, 4.4f, 4.4f);
TestMultiply<SSSE3::Kernels8>(320, 256, 256, 1.9f, 1.9f, 0.1f, 0.01f);
TestMultiply<SSSE3::Kernels8>(472, 256, 256, 2.1f, 2.1f, 0.1f, 0.011f);
TestMultiply<SSSE3::Kernels8>(248, 256, 256, 1.7f, 1.7f, 0.1f, 0.012f);
TestMultiply<SSSE3::Kernels8>(200, 256, 256, 1.8f, 1.9f, 0.1f, 0.011f);
}
TEST_CASE ("Multiply SSSE3 8bit with relu", "[multiply_relu]") {
if (kCPU < CPUType::SSSE3) return;
TestMultiplyRelu<SSSE3::Kernels8>(8, 256, 256, 1.2f, 1.2f, 0.064f, 0.026f);
TestMultiplyRelu<SSSE3::Kernels8>(8, 2048, 256, 33, 33, 4.4f, 4.4f);
TestMultiplyRelu<SSSE3::Kernels8>(320, 256, 256, 1.9f, 1.9f, 0.1f, 0.01f);
TestMultiplyRelu<SSSE3::Kernels8>(472, 256, 256, 2.1f, 2.1f, 0.1f, 0.011f);
TestMultiplyRelu<SSSE3::Kernels8>(248, 256, 256, 1.7f, 1.7f, 0.1f, 0.012f);
TestMultiplyRelu<SSSE3::Kernels8>(200, 256, 256, 1.8f, 1.9f, 0.1f, 0.011f);
}
TEST_CASE ("Multiply SSSE3 8bit with bias", "[biased_multiply]") {
if (kCPU < CPUType::SSSE3) return;
TestMultiplyBias<SSSE3::Kernels8>(8, 256, 256, 1.2f, 1.2f, 0.064f, 0.026f);
TestMultiplyBias<SSSE3::Kernels8>(8, 2048, 256, 33, 33, 4.4f, 4.4f);
TestMultiplyBias<SSSE3::Kernels8>(320, 256, 256, 1.9f, 1.9f, 0.1f, 0.01f);
TestMultiplyBias<SSSE3::Kernels8>(472, 256, 256, 2.1f, 2.1f, 0.1f, 0.011f);
TestMultiplyBias<SSSE3::Kernels8>(248, 256, 256, 1.7f, 1.7f, 0.1f, 0.012f);
TestMultiplyBias<SSSE3::Kernels8>(200, 256, 256, 1.8f, 1.9f, 0.1f, 0.011f);
}
TEST_CASE ("Multiply SSSE3 8bit with bias and relu", "[biased_multiply_relu]") {
if (kCPU < CPUType::SSSE3) return;
TestMultiplyBiasRelu<SSSE3::Kernels8>(8, 256, 256, 1.2f, 1.2f, 0.064f, 0.026f);
TestMultiplyBiasRelu<SSSE3::Kernels8>(8, 2048, 256, 33, 33, 4.4f, 4.4f);
TestMultiplyBiasRelu<SSSE3::Kernels8>(320, 256, 256, 1.9f, 1.9f, 0.1f, 0.01f);
TestMultiplyBiasRelu<SSSE3::Kernels8>(472, 256, 256, 2.1f, 2.1f, 0.1f, 0.011f);
TestMultiplyBiasRelu<SSSE3::Kernels8>(248, 256, 256, 1.7f, 1.7f, 0.1f, 0.012f);
TestMultiplyBiasRelu<SSSE3::Kernels8>(200, 256, 256, 1.8f, 1.9f, 0.1f, 0.011f);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
TEST_CASE ("Multiply AVX2 8bit", "[multiply]") {
if (kCPU < CPUType::AVX2) return;
TestMultiply<AVX2::Kernels8>(8, 256, 256, .1f, 1, 0.1f);
TestMultiply<AVX2::Kernels8>(8, 2048, 256, 19, 19, 1.8f, 1.8f);
TestMultiply<AVX2::Kernels8>(320, 256, 256, .1f, 1, 0.1f);
TestMultiply<AVX2::Kernels8>(472, 256, 256, .1f, 1, 0.1f);
TestMultiply<AVX2::Kernels8>(248, 256, 256, .1f, 1, 0.1f);
TestMultiply<AVX2::Kernels8>(200, 256, 256, .1f, 1, 0.1f);
}
TEST_CASE ("Multiply AVX2 8bit with relu", "[multiply_relu]") {
if (kCPU < CPUType::AVX2) return;
TestMultiplyRelu<AVX2::Kernels8>(8, 256, 256, .1f, 1, 0.1f);
TestMultiplyRelu<AVX2::Kernels8>(8, 2048, 256, 19, 19, 1.8f, 1.8f);
TestMultiplyRelu<AVX2::Kernels8>(320, 256, 256, .1f, 1, 0.1f);
TestMultiplyRelu<AVX2::Kernels8>(472, 256, 256, .1f, 1, 0.1f);
TestMultiplyRelu<AVX2::Kernels8>(248, 256, 256, .1f, 1, 0.1f);
TestMultiplyRelu<AVX2::Kernels8>(200, 256, 256, .1f, 1, 0.1f);
}
TEST_CASE ("Multiply AVX2 8bit with bias", "[biased_multiply]") {
if (kCPU < CPUType::AVX2) return;
TestMultiplyBias<AVX2::Kernels8>(8, 256, 256, .1f, 1, 0.1f);
TestMultiplyBias<AVX2::Kernels8>(8, 2048, 256, 19, 19, 1.8f, 1.8f);
TestMultiplyBias<AVX2::Kernels8>(320, 256, 256, .1f, 1, 0.1f);
TestMultiplyBias<AVX2::Kernels8>(472, 256, 256, .1f, 1, 0.1f);
TestMultiplyBias<AVX2::Kernels8>(248, 256, 256, .1f, 1, 0.1f);
TestMultiplyBias<AVX2::Kernels8>(200, 256, 256, .1f, 1, 0.1f);
}
TEST_CASE ("Multiply AVX2 8bit with bias and relu", "[biased_multiply_relu]") {
if (kCPU < CPUType::AVX2) return;
TestMultiplyBiasRelu<AVX2::Kernels8>(8, 256, 256, .1f, 1, 0.1f);
TestMultiplyBiasRelu<AVX2::Kernels8>(8, 2048, 256, 19, 19, 1.8f, 1.8f);
TestMultiplyBiasRelu<AVX2::Kernels8>(320, 256, 256, .1f, 1, 0.1f);
TestMultiplyBiasRelu<AVX2::Kernels8>(472, 256, 256, .1f, 1, 0.1f);
TestMultiplyBiasRelu<AVX2::Kernels8>(248, 256, 256, .1f, 1, 0.1f);
TestMultiplyBiasRelu<AVX2::Kernels8>(200, 256, 256, .1f, 1, 0.1f);
}
TEST_CASE ("Multiply AVX2 16bit", "[multiply]") {
if (kCPU < CPUType::AVX2) return;
TestMultiply<AVX2::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
TestMultiply<AVX2::Kernels16>(8, 2048, 256, .1f, 1, 0.02f);
TestMultiply<AVX2::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
TestMultiply<AVX2::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
TestMultiply<AVX2::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
TestMultiply<AVX2::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
}
TEST_CASE ("Multiply AVX2 16bit with relu", "[multiply_relu]") {
if (kCPU < CPUType::AVX2) return;
TestMultiplyRelu<AVX2::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
TestMultiplyRelu<AVX2::Kernels16>(8, 2048, 256, .1f, 1, 0.02f);
TestMultiplyRelu<AVX2::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
TestMultiplyRelu<AVX2::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
TestMultiplyRelu<AVX2::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
TestMultiplyRelu<AVX2::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
}
TEST_CASE ("Multiply AVX2 16bit with bias", "[biased_multiply]") {
if (kCPU < CPUType::AVX2) return;
TestMultiplyBias<AVX2::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
TestMultiplyBias<AVX2::Kernels16>(8, 2048, 256, .1f, 1, 0.02f);
TestMultiplyBias<AVX2::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
TestMultiplyBias<AVX2::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
TestMultiplyBias<AVX2::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
TestMultiplyBias<AVX2::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
}
TEST_CASE ("Multiply AVX2 16bit with bias and relu", "[biased_multiply_relu]") {
if (kCPU < CPUType::AVX2) return;
TestMultiplyBiasRelu<AVX2::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
TestMultiplyBiasRelu<AVX2::Kernels16>(8, 2048, 256, .1f, 1, 0.02f);
TestMultiplyBiasRelu<AVX2::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
TestMultiplyBiasRelu<AVX2::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
TestMultiplyBiasRelu<AVX2::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
TestMultiplyBiasRelu<AVX2::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE ("Multiply AVX512 8bit", "[multiply]") {
if (kCPU < CPUType::AVX512BW) return;
TestMultiply<AVX512BW::Kernels8>(8, 256, 256, 0, 0.25f, 0.062f);
TestMultiply<AVX512BW::Kernels8>(8, 2048, 256, 3.7f, 4, 0.37f, 0.33f);
TestMultiply<AVX512BW::Kernels8>(320, 256, 256, 0, 0.26f, 0.059f);
TestMultiply<AVX512BW::Kernels8>(472, 256, 256, 0, 0.29f, 0.059f);
TestMultiply<AVX512BW::Kernels8>(248, 256, 256, 0, 0.29f, 0.059f);
TestMultiply<AVX512BW::Kernels8>(200, 256, 256, 0, 0.28f, 0.06f);
}
TEST_CASE ("Multiply AVX512 8bit with relu", "[multiply_relu]") {
if (kCPU < CPUType::AVX512BW) return;
TestMultiplyRelu<AVX512BW::Kernels8>(8, 256, 256, 0, 0.25f, 0.062f);
TestMultiplyRelu<AVX512BW::Kernels8>(8, 2048, 256, 3.7f, 4, 0.37f, 0.33f);
TestMultiplyRelu<AVX512BW::Kernels8>(320, 256, 256, 0, 0.26f, 0.059f);
TestMultiplyRelu<AVX512BW::Kernels8>(472, 256, 256, 0, 0.29f, 0.059f);
TestMultiplyRelu<AVX512BW::Kernels8>(248, 256, 256, 0, 0.29f, 0.059f);
TestMultiplyRelu<AVX512BW::Kernels8>(200, 256, 256, 0, 0.28f, 0.06f);
}
TEST_CASE ("Multiply AVX512 8bit with bias", "[biased_multiply]") {
if (kCPU < CPUType::AVX512BW) return;
TestMultiplyBias<AVX512BW::Kernels8>(8, 256, 256, 0, 0.25f, 0.062f);
TestMultiplyBias<AVX512BW::Kernels8>(8, 2048, 256, 3.7f, 4, 0.37f, 0.33f);
TestMultiplyBias<AVX512BW::Kernels8>(320, 256, 256, 0, 0.26f, 0.059f);
TestMultiplyBias<AVX512BW::Kernels8>(472, 256, 256, 0, 0.29f, 0.059f);
TestMultiplyBias<AVX512BW::Kernels8>(248, 256, 256, 0, 0.29f, 0.059f);
TestMultiplyBias<AVX512BW::Kernels8>(200, 256, 256, 0, 0.28f, 0.06f);
}
TEST_CASE ("Multiply AVX512 8bit with bias and relu", "[biased_multiply_relu]") {
if (kCPU < CPUType::AVX512BW) return;
TestMultiplyBiasRelu<AVX512BW::Kernels8>(8, 256, 256, 0, 0.25f, 0.062f);
TestMultiplyBiasRelu<AVX512BW::Kernels8>(8, 2048, 256, 3.7f, 4, 0.37f, 0.33f);
TestMultiplyBiasRelu<AVX512BW::Kernels8>(320, 256, 256, 0, 0.26f, 0.059f);
TestMultiplyBiasRelu<AVX512BW::Kernels8>(472, 256, 256, 0, 0.29f, 0.059f);
TestMultiplyBiasRelu<AVX512BW::Kernels8>(248, 256, 256, 0, 0.29f, 0.059f);
TestMultiplyBiasRelu<AVX512BW::Kernels8>(200, 256, 256, 0, 0.28f, 0.06f);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
TEST_CASE ("Multiply AVX512VNNI 8bit", "[multiply]") {
if (kCPU < CPUType::AVX512VNNI) return;
TestMultiply<AVX512VNNI::Kernels8>(8, 256, 256, 0, 0.25f, 0.062f);
TestMultiply<AVX512VNNI::Kernels8>(8, 2048, 256, 0, 0.55f, 0.25f);
TestMultiply<AVX512VNNI::Kernels8>(320, 256, 256, 0, 0.26f, 0.059f);
TestMultiply<AVX512VNNI::Kernels8>(472, 256, 256, 0, 0.29f, 0.059f);
TestMultiply<AVX512VNNI::Kernels8>(248, 256, 256, 0, 0.29f, 0.059f);
TestMultiply<AVX512VNNI::Kernels8>(200, 256, 256, 0, 0.28f, 0.06f);
}
TEST_CASE ("Multiply AVX512VNNI 8bit with relu", "[multiply_relu]") {
if (kCPU < CPUType::AVX512VNNI) return;
TestMultiplyRelu<AVX512VNNI::Kernels8>(8, 256, 256, 0, 0.25f, 0.062f);
TestMultiplyRelu<AVX512VNNI::Kernels8>(8, 2048, 256, 0, 0.55f, 0.25f);
TestMultiplyRelu<AVX512VNNI::Kernels8>(320, 256, 256, 0, 0.26f, 0.059f);
TestMultiplyRelu<AVX512VNNI::Kernels8>(472, 256, 256, 0, 0.29f, 0.059f);
TestMultiplyRelu<AVX512VNNI::Kernels8>(248, 256, 256, 0, 0.29f, 0.059f);
TestMultiplyRelu<AVX512VNNI::Kernels8>(200, 256, 256, 0, 0.28f, 0.06f);
}
TEST_CASE ("Multiply AVX512VNNI 8bit with bias", "[biased_multiply]") {
if (kCPU < CPUType::AVX512VNNI) return;
TestMultiplyBias<AVX512VNNI::Kernels8>(8, 256, 256, 0, 0.25f, 0.062f);
TestMultiplyBias<AVX512VNNI::Kernels8>(8, 2048, 256, 0, 0.55f, 0.25f);
TestMultiplyBias<AVX512VNNI::Kernels8>(320, 256, 256, 0, 0.26f, 0.059f);
TestMultiplyBias<AVX512VNNI::Kernels8>(472, 256, 256, 0, 0.29f, 0.059f);
TestMultiplyBias<AVX512VNNI::Kernels8>(248, 256, 256, 0, 0.29f, 0.059f);
TestMultiplyBias<AVX512VNNI::Kernels8>(200, 256, 256, 0, 0.28f, 0.06f);
}
TEST_CASE ("Multiply AVX512VNNI 8bit with bias and relu", "[biased_multiply_relu]") {
if (kCPU < CPUType::AVX512VNNI) return;
TestMultiplyBiasRelu<AVX512VNNI::Kernels8>(8, 256, 256, 0, 0.25f, 0.062f);
TestMultiplyBiasRelu<AVX512VNNI::Kernels8>(8, 2048, 256, 0, 0.55f, 0.25f);
TestMultiplyBiasRelu<AVX512VNNI::Kernels8>(320, 256, 256, 0, 0.26f, 0.059f);
TestMultiplyBiasRelu<AVX512VNNI::Kernels8>(472, 256, 256, 0, 0.29f, 0.059f);
TestMultiplyBiasRelu<AVX512VNNI::Kernels8>(248, 256, 256, 0, 0.29f, 0.059f);
TestMultiplyBiasRelu<AVX512VNNI::Kernels8>(200, 256, 256, 0, 0.28f, 0.06f);
}
#endif
TEST_CASE ("Multiply AVX512 16bit", "[multiply]") {
if (kCPU < CPUType::AVX512BW) return;
TestMultiply<AVX512BW::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
TestMultiply<AVX512BW::Kernels16>(8, 2048, 256, .1f, 1, 0.011f);
TestMultiply<AVX512BW::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
TestMultiply<AVX512BW::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
TestMultiply<AVX512BW::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
TestMultiply<AVX512BW::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
}
TEST_CASE ("Multiply AVX512 16bit with relu", "[multiply_relu]") {
if (kCPU < CPUType::AVX512BW) return;
TestMultiplyRelu<AVX512BW::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
TestMultiplyRelu<AVX512BW::Kernels16>(8, 2048, 256, .1f, 1, 0.011f);
TestMultiplyRelu<AVX512BW::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
TestMultiplyRelu<AVX512BW::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
TestMultiplyRelu<AVX512BW::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
TestMultiplyRelu<AVX512BW::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
}
TEST_CASE ("Multiply AVX512 16bit with bias", "[biased_multiply]") {
if (kCPU < CPUType::AVX512BW) return;
TestMultiplyBias<AVX512BW::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
TestMultiplyBias<AVX512BW::Kernels16>(8, 2048, 256, .1f, 1, 0.011f);
TestMultiplyBias<AVX512BW::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
TestMultiplyBias<AVX512BW::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
TestMultiplyBias<AVX512BW::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
TestMultiplyBias<AVX512BW::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
}
TEST_CASE ("Multiply AVX512 16bit with bias and relu", "[biased_multiply_relu]") {
if (kCPU < CPUType::AVX512BW) return;
TestMultiplyBiasRelu<AVX512BW::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
TestMultiplyBiasRelu<AVX512BW::Kernels16>(8, 2048, 256, .1f, 1, 0.011f);
TestMultiplyBiasRelu<AVX512BW::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
TestMultiplyBiasRelu<AVX512BW::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
TestMultiplyBiasRelu<AVX512BW::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
TestMultiplyBiasRelu<AVX512BW::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
}
#endif
} // namespace intgemm
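references::Multiply, used as the ground truth throughout this file, is defined in the removed test helpers rather than in this diff. Functionally it is a plain triple loop over int32 accumulators with a per-element callback; a minimal sketch under that assumption (the real helper also passes row/column information to the callback, and multiply_ref is an illustrative name):

#include <cstddef>
#include <cstdint>

// C[r][c] = callback(sum_k A[r][k] * B[k][c]) over already-quantized inputs,
// where the callback applies unquantization, bias and/or relu as in the tests.
template <class Callback>
void multiply_ref(const int8_t* A, const int8_t* B, float* C,
                  std::size_t A_rows, std::size_t width, std::size_t B_cols,
                  Callback callback) {
  for (std::size_t r = 0; r < A_rows; ++r)
    for (std::size_t c = 0; c < B_cols; ++c) {
      int32_t sum = 0;
      for (std::size_t k = 0; k < width; ++k)
        sum += static_cast<int32_t>(A[r * width + k]) *
               static_cast<int32_t>(B[k * B_cols + c]);
      C[r * B_cols + c] = callback(sum);
    }
}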


@ -1,96 +0,0 @@
#include "test.h"
#include "../intgemm/aligned.h"
#include "../intgemm/avx2_gemm.h"
#include "../intgemm/avx512_gemm.h"
#include "../intgemm/sse2_gemm.h"
#include "../intgemm/ssse3_gemm.h"
#include <cmath>
#include <cstring>
#include <iostream>
namespace intgemm {
namespace {
template <typename Backend>
void PrepareBQuantizedTransposedRef(const typename Backend::Integer* input, typename Backend::Integer* output, Index B_transposed_cols, Index B_transposed_rows) {
using vec_t = intgemm::vector_t<Backend::kUses, typename Backend::Integer>;
constexpr Index vec_len = sizeof(vec_t) / sizeof(typename Backend::Integer);
auto output_it = output;
for (Index r = 0; r < B_transposed_rows; r += 8)
for (Index c = 0; c < B_transposed_cols; c += vec_len)
for (Index ri = 0; ri < 8; ++ri)
for (Index ci = 0; ci < vec_len; ++ci)
*output_it++ = input[(r + ri) * B_transposed_cols + c + ci];
}
template <typename Backend>
bool Test(const AlignedVector<typename Backend::Integer>& input, Index B_rows, Index B_cols) {
bool success = true;
AlignedVector<typename Backend::Integer> output(input.size());
Backend::PrepareBQuantizedTransposed(input.begin(), output.begin(), B_rows, B_cols);
AlignedVector<typename Backend::Integer> reference(input.size());
PrepareBQuantizedTransposedRef<Backend>(input.begin(), reference.begin(), B_rows, B_cols);
for (std::size_t i = 0; i < output.size(); ++i) {
if (output[i] != reference[i]) {
UNSCOPED_INFO("Error at " << i << ", output = " << int(output[i]) << ", reference = " << int(reference[i]));
success = false;
break;
}
}
return success;
}
template <typename Backend>
bool TestMany(Index B_rows, Index B_cols) {
AlignedVector<typename Backend::Integer> input(B_rows * B_cols);
std::generate(input.begin(), input.end(), []() {
static constexpr int divider = sizeof(intgemm::vector_t<Backend::kUses, typename Backend::Integer>) / sizeof(typename Backend::Integer);
static int value = 0;
return static_cast<typename Backend::Integer>((value++) % divider);
});
return Test<Backend>(input, B_rows, B_cols);
}
TEST_CASE("PrepareBQuantizedTransposed SSE2", "") {
if (kCPU < CPUType::SSE2)
return;
CHECK(TestMany<SSE2::Kernels16>(32, 128));
}
TEST_CASE("PrepareBQuantizedTransposed SSSE3", "") {
if (kCPU < CPUType::SSSE3)
return;
CHECK(TestMany<SSSE3::Kernels8>(32, 128));
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
TEST_CASE("PrepareBQuantizedTransposed AVX2", "") {
if (kCPU < CPUType::AVX2)
return;
CHECK(TestMany<AVX2::Kernels8>(32, 128));
CHECK(TestMany<AVX2::Kernels16>(32, 128));
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE("PrepareBQuantizedTransposed AVX512", "") {
if (kCPU < CPUType::AVX512BW)
return;
CHECK(TestMany<AVX512BW::Kernels8>(64, 128));
CHECK(TestMany<AVX512BW::Kernels16>(64, 128));
}
#endif
}
}


@ -1,97 +0,0 @@
#include "test.h"
#include "../intgemm/aligned.h"
#include "../intgemm/avx2_gemm.h"
#include "../intgemm/avx512_gemm.h"
#include "../intgemm/sse2_gemm.h"
#include "../intgemm/ssse3_gemm.h"
#include <cmath>
#include <cstring>
#include <iostream>
namespace intgemm {
namespace {
template <typename Backend>
void PrepareBTransposedRef(const float* input, typename Backend::Integer* output, float quant_mult, Index B_transposed_cols, Index B_transposed_rows) {
using vec_t = intgemm::vector_t<Backend::kUses, typename Backend::Integer>;
constexpr Index vec_len = sizeof(vec_t) / sizeof(typename Backend::Integer);
for (Index i = 0; i < B_transposed_rows * B_transposed_cols / 8; i += vec_len)
for (Index j = 0; j < 8; ++j)
for (Index k = 0; k < vec_len; ++k) {
Index col = (i + k) % B_transposed_cols;
Index row = 8 * ((i + k) / B_transposed_cols) + j;
*output++ = static_cast<typename Backend::Integer>(input[row * B_transposed_cols + col] * quant_mult);
}
}
template <typename Backend>
bool Test(const AlignedVector<float>& input, Index B_rows, Index B_cols, float quant_mult) {
bool success = true;
AlignedVector<typename Backend::Integer> output(input.size());
Backend::PrepareBTransposed(input.begin(), output.begin(), quant_mult, B_rows, B_cols);
AlignedVector<typename Backend::Integer> reference(input.size());
PrepareBTransposedRef<Backend>(input.begin(), reference.begin(), quant_mult, B_rows, B_cols);
for (std::size_t i = 0; i < output.size(); ++i) {
if (output[i] != reference[i]) {
UNSCOPED_INFO("Error at " << i << ", output = " << int(output[i]) << ", reference = " << int(reference[i]));
success = false;
break;
}
}
return success;
}
template <typename Backend>
bool TestMany(Index B_rows, Index B_cols, float quant_mult) {
AlignedVector<float> input(B_rows * B_cols);
std::generate(input.begin(), input.end(), []() {
static constexpr int divider = sizeof(intgemm::vector_t<Backend::kUses, typename Backend::Integer>) / sizeof(typename Backend::Integer);
static int value = 0;
return static_cast<float>((value++) % divider);
});
return Test<Backend>(input, B_rows, B_cols, quant_mult);
}
TEST_CASE("PrepareBTransposed SSE2", "") {
if (kCPU < CPUType::SSE2)
return;
CHECK(TestMany<SSE2::Kernels16>(4, 128, 2.0f));
}
TEST_CASE("PrepareBTransposed SSSE3", "") {
if (kCPU < CPUType::SSSE3)
return;
CHECK(TestMany<SSSE3::Kernels8>(4, 128, 2.0f));
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
TEST_CASE("PrepareBTransposed AVX2", "") {
if (kCPU < CPUType::AVX2)
return;
CHECK(TestMany<AVX2::Kernels8>(8, 128, 2.0f));
CHECK(TestMany<AVX2::Kernels16>(8, 128, 2.0f));
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE("PrepareBTransposed AVX512", "") {
if (kCPU < CPUType::AVX512BW)
return;
CHECK(TestMany<AVX512BW::Kernels8>(16, 128, 2.0f));
CHECK(TestMany<AVX512BW::Kernels16>(16, 128, 2.0f));
}
#endif
}
}

third_party/intgemm/test/quantize_test.cc (vendored)

@ -1,199 +0,0 @@
#include "test.h"
#include "../intgemm/aligned.h"
#include "../intgemm/avx2_gemm.h"
#include "../intgemm/avx512_gemm.h"
#include "../intgemm/sse2_gemm.h"
#include "../intgemm/ssse3_gemm.h"
#include "../intgemm/stats.h"
#include <cmath>
#include <cstring>
#include <iostream>
namespace intgemm {
namespace {
void QuantizeRef(const float *input, int16_t *output, float quant_mult, std::size_t size) {
for (std::size_t i = 0; i < size; ++i) {
float value = roundf(input[i] * quant_mult);
value = std::max(-32768.0f, value);
value = std::min(32767.0f, value);
// float should be exact in this range.
output[i] = static_cast<int16_t>(value);
}
}
void QuantizeRef(const float *input, int8_t *output, float quant_mult, std::size_t size) {
for (std::size_t i = 0; i < size; ++i) {
float value = roundf(input[i] * quant_mult);
value = std::max(-127.0f, value);
value = std::min(127.0f, value);
output[i] = static_cast<int8_t>(value);
}
}
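The clamping in these two references is what the corners inputs further down probe; note the int8 path saturates to -127 rather than -128, which keeps the quantized range symmetric. A small sketch exercising the overloads defined above (quantize_saturation_sketch is an illustrative name):

#include <cassert>
#include <cstdint>

inline void quantize_saturation_sketch() {
  float in16[] = {32768.f, -32769.f};
  int16_t out16[2];
  QuantizeRef(in16, out16, 1.0f, 2);
  assert(out16[0] == 32767 && out16[1] == -32768);  // clamped to the int16 range

  float in8[] = {128.f, -129.f};
  int8_t out8[2];
  QuantizeRef(in8, out8, 1.0f, 2);
  assert(out8[0] == 127 && out8[1] == -127);        // -127, not -128
}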
MeanStd VectorMeanStd(AlignedVector<float>& vals, int num_items, bool absolute) {
float normal_sums = 0;
float squares_sum = 0;
if (absolute) {
std::for_each(vals.begin(), vals.end(), [&] (float n) {normal_sums+=std::abs(n);});
} else {
std::for_each(vals.begin(), vals.end(), [&] (float n) {normal_sums+=n;});
}
std::for_each(vals.begin(), vals.end(), [&] (float n) {squares_sum+=n*n;});
MeanStd ret;
ret.mean = normal_sums/num_items;
ret.stddev = std::sqrt((squares_sum/num_items) - (ret.mean*ret.mean));
return ret;
}
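The reference above uses the one-pass identity stddev = sqrt(E[x^2] - mean^2). An equivalent two-pass formulation, shown only to make that identity explicit (mean_std_two_pass and MeanStdSketch are illustrative names), trades a second sweep for better behaviour when the mean is large relative to the spread:

#include <cmath>
#include <cstddef>

struct MeanStdSketch { float mean; float stddev; };

inline MeanStdSketch mean_std_two_pass(const float* begin, const float* end) {
  std::size_t n = static_cast<std::size_t>(end - begin);
  float mean = 0.0f;
  for (const float* p = begin; p != end; ++p) mean += *p;
  mean /= n;
  float var = 0.0f;
  for (const float* p = begin; p != end; ++p) var += (*p - mean) * (*p - mean);
  return {mean, std::sqrt(var / n)};
}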
template <MeanStd (*Backend) (const float *, const float *, bool)>
void testVectorMeanStd(int num_items, bool absolute=false) {
std::mt19937 gen;
std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
AlignedVector<float> inputVec(num_items);
for (auto&& it : inputVec) {
it = dist(gen);
}
MeanStd reference = VectorMeanStd(inputVec, num_items, absolute);
MeanStd fast = Backend(inputVec.begin(), inputVec.end(), absolute);
float meanDifference = std::fabs(reference.mean - fast.mean);
float stdDifference = std::fabs(reference.stddev - fast.stddev);
float eps = 0.00002f; //Accumulating horizontal sums can lead to errors.
CHECK_MESSAGE(meanDifference <= eps, "Items: " << num_items << " Absolute: " << absolute << " Reference mean: " << reference.mean << " actual: " << fast.mean);
CHECK_MESSAGE(stdDifference <= eps, "Items: " << num_items << " Absolute: " << absolute << " Reference stddev: " << reference.stddev << " actual: " << fast.stddev);
}
template <class I> bool IsOff(float from, I ref, I test) {
if (ref == test) return false;
if (ref - test > 1 && test - ref > 1) return true;
float off_test = std::fabs(static_cast<float>(test) - from);
float off_ref = std::fabs(static_cast<float>(ref) - from);
// Allow 0.5 to round either way.
if (off_test > 0.49 && off_test < 0.51 && off_ref > 0.49 && off_ref < 0.51) return false;
return true;
}
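The 0.49/0.51 window exists because the scalar reference and the SIMD kernels may legitimately disagree on exact halves: roundf rounds halves away from zero while the SSE/AVX float-to-int conversions round to nearest even under the default rounding mode. A tiny scalar-only illustration (rounding_demo is an illustrative name):

#include <cmath>
#include <cstdio>

inline void rounding_demo() {
  std::printf("%ld\n", std::lroundf(2.5f));  // prints 3: half away from zero
  // _mm_cvtps_epi32 on 2.5f yields 2 under the default round-to-nearest-even
  // mode, so the quantize tests accept either neighbouring integer there.
}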
template <class Backend> bool Test(const float *input_unaligned, float quant_mult, std::size_t size) {
using Integer = typename Backend::Integer;
bool success = true;
AlignedVector<float> input(size);
std::memcpy(input.begin(), input_unaligned, sizeof(float) * size);
AlignedVector<Integer> ref(size);
AlignedVector<Integer> test(size);
QuantizeRef(input.begin(), ref.begin(), quant_mult, static_cast<Index>(size));
Backend::Quantize(input.begin(), test.begin(), quant_mult, static_cast<Index>(size));
for (std::size_t i = 0; i < size; ++i) {
if (IsOff(input[i] * quant_mult, ref[i], test[i])) {
UNSCOPED_INFO("Error at " << i << " from " << input[i] << '*' << quant_mult << '=' << (input[i]*quant_mult) << " ref = " << static_cast<int>(ref[i]) << " test = " << static_cast<int>(test[i]));
success = false;
}
}
return success;
}
template <class Backend> void TestMany(std::size_t grow) {
float input[33] = {
0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 13.f,
14.f, 15.f, 16.f, 17.f, 18.f, 19.f, 20.f, 21.f, 22.f, 23.f, 24.f, 25.f,
26.f, 27.f, 28.f, 29.f, 30.f, 31.f, 32.f};
float corners[33] = {
-32769.f, -32768.f, -32767.f, -129.f, -128.f, -127.f, -1.f, 0.f, 1.f,
126.f, 127.f, 128.f, 129.f, 32766.f, 32768.f, 32769.f, -1.9f, -1.5f, -1.1f,
-1.f, -0.9f, -0.5f, -0.1f, 0.0f, 0.1f, 0.5f, 0.9f, 1.0f, 1.1f, 1.5f, 1.9f,
16056.8f, 2.5f};
for (std::size_t len = 0; len <= 33; len += grow) {
CHECK(Test<Backend>(input, 1.0f, len));
CHECK(Test<Backend>(input, 32.0f, len));
CHECK(Test<Backend>(corners, 1.0f, len));
CHECK(Test<Backend>(corners, -1.0f, len));
CHECK(Test<Backend>(corners, -0.49f, len));
}
}
TEST_CASE ("Quantize SSE2", "[quantize]") {
if (kCPU < CPUType::SSE2) return;
TestMany<SSE2::Kernels16>(8);
}
TEST_CASE ("Quantize SSSE3", "[quantize]") {
if (kCPU < CPUType::SSSE3) return;
TestMany<SSSE3::Kernels8>(1);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
TEST_CASE ("Quantize AVX2", "[quantize]") {
if (kCPU < CPUType::AVX2) return;
TestMany<AVX2::Kernels8>(1);
TestMany<AVX2::Kernels16>(16);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE ("Quantize AVX512", "[quantize]") {
if (kCPU < CPUType::AVX512BW) return;
TestMany<AVX512BW::Kernels8>(1);
TestMany<AVX512BW::Kernels16>(16);
}
#endif
TEST_CASE("QuantizeStd SSSE3", "[VectorMeanStd]") {
if (kCPU < CPUType::SSSE3) return;
testVectorMeanStd<SSE2::VectorMeanStd>(64);
testVectorMeanStd<SSE2::VectorMeanStd>(64, true);
testVectorMeanStd<SSE2::VectorMeanStd>(256);
testVectorMeanStd<SSE2::VectorMeanStd>(256, true);
testVectorMeanStd<SSE2::VectorMeanStd>(2048);
testVectorMeanStd<SSE2::VectorMeanStd>(2048, true);
testVectorMeanStd<SSE2::VectorMeanStd>(65536);
testVectorMeanStd<SSE2::VectorMeanStd>(65536, true);
testVectorMeanStd<SSE2::VectorMeanStd>(81920);
testVectorMeanStd<SSE2::VectorMeanStd>(81920, true);
testVectorMeanStd<SSE2::VectorMeanStd>(120832);
testVectorMeanStd<SSE2::VectorMeanStd>(120832, true);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
TEST_CASE("QuantizeStd AVX2", "[VectorMeanStd]") {
if (kCPU < CPUType::AVX2) return;
testVectorMeanStd<AVX2::VectorMeanStd>(64);
testVectorMeanStd<AVX2::VectorMeanStd>(64, true);
testVectorMeanStd<AVX2::VectorMeanStd>(256);
testVectorMeanStd<AVX2::VectorMeanStd>(256, true);
testVectorMeanStd<AVX2::VectorMeanStd>(2048);
testVectorMeanStd<AVX2::VectorMeanStd>(2048, true);
testVectorMeanStd<AVX2::VectorMeanStd>(65536);
testVectorMeanStd<AVX2::VectorMeanStd>(65536, true);
testVectorMeanStd<AVX2::VectorMeanStd>(81920);
testVectorMeanStd<AVX2::VectorMeanStd>(81920, true);
testVectorMeanStd<AVX2::VectorMeanStd>(120832);
testVectorMeanStd<AVX2::VectorMeanStd>(120832, true);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE("QuantizeStd AVX512BW", "[VectorMeanStd]") {
if (kCPU < CPUType::AVX512BW) return;
testVectorMeanStd<AVX512BW::VectorMeanStd>(64);
testVectorMeanStd<AVX512BW::VectorMeanStd>(64, true);
testVectorMeanStd<AVX512BW::VectorMeanStd>(256);
testVectorMeanStd<AVX512BW::VectorMeanStd>(256, true);
testVectorMeanStd<AVX512BW::VectorMeanStd>(2048);
testVectorMeanStd<AVX512BW::VectorMeanStd>(2048, true);
testVectorMeanStd<AVX512BW::VectorMeanStd>(65536);
testVectorMeanStd<AVX512BW::VectorMeanStd>(65536, true);
testVectorMeanStd<AVX512BW::VectorMeanStd>(81920);
testVectorMeanStd<AVX512BW::VectorMeanStd>(81920, true);
testVectorMeanStd<AVX512BW::VectorMeanStd>(120832);
testVectorMeanStd<AVX512BW::VectorMeanStd>(120832, true);
}
#endif
} // namespace
} // namespace intgemm

third_party/intgemm/test/test.cc (vendored)

@ -1,27 +0,0 @@
#define CATCH_CONFIG_RUNNER
#include "test.h"
#include <cmath>
int main(int argc, char ** argv) {
return Catch::Session().run(argc, argv);
}
namespace intgemm {
void CompareMSE(const float *float_ref, const float *int_ref, const float *int_test, std::size_t size, std::string test_info,
float int_tolerance, float float_tolerance, float MSE_float_tolerance, float MSE_int_tolerance) {
float int_sum = 0.0, float_sum = 0.0;
for (std::size_t i = 0; i < size; ++i) {
float int_diff = int_ref[i] - int_test[i];
float float_diff = float_ref[i] - int_test[i];
CHECK_MESSAGE(std::fabs(int_diff) <= int_tolerance, test_info << "Inaccurate compared to int reference at " << i << ' ' << int_ref[i] << ' ' << int_test[i]);
CHECK_MESSAGE(std::fabs(float_diff) <= float_tolerance, test_info << "Inaccurate compared to float reference at " << i << ' ' << float_ref[i] << ' ' << int_test[i]);
int_sum += int_diff * int_diff;
float_sum += float_diff * float_diff;
}
CHECK_MESSAGE(std::fabs(sqrt(float_sum / size)) <= MSE_float_tolerance, test_info << "Float MSE = " << sqrt(float_sum / size));
CHECK_MESSAGE(std::fabs(sqrt(int_sum / size)) <= MSE_int_tolerance, test_info << "Int MSE = " << sqrt(int_sum / size));
}
} // namespace intgemm

third_party/intgemm/test/test.h (vendored)

@ -1,132 +0,0 @@
#pragma once
#include "intgemm/intgemm_config.h"
#include "3rd_party/catch.hpp"
#include "../intgemm/intgemm.h"
#include "../intgemm/aligned.h"
#include <cmath>
#include <sstream>
#include <iostream>
#include <iomanip>
#define CHECK_MESSAGE(cond, msg) do { INFO(msg); CHECK(cond); } while(0)
#define CHECK_FALSE_MESSAGE(cond, msg) do { INFO(msg); CHECK_FALSE(cond); } while(0)
#define REQUIRE_MESSAGE(cond, msg) do { INFO(msg); REQUIRE(cond); } while(0)
#define REQUIRE_FALSE_MESSAGE(cond, msg) do { INFO(msg); REQUIRE_FALSE(cond); } while(0)
#define CHECK_EPS(actual, expected, epsilon) \
do { \
if (std::fabs((actual) - (expected)) < epsilon) { SUCCEED(); } \
else { CHECK((actual) == (expected)); } \
} while(0)
#define KERNEL_TEST_CASE(name) TEST_CASE("Kernel: " name, "[kernel_test]")
namespace intgemm {
template <typename Type>
void Compare(const Type* reference, const Type* actual, Index size) {
for (Index i = 0; i < size; ++i) {
INFO("Inaccurate at " << i << ' ' << reference[i] << ' ' << actual[i]);
CHECK(reference[i] == actual[i]);
}
}
template <typename Type>
void CompareEps(const Type* reference, const Type* actual, Index size, Type epsilon) {
for (Index i = 0; i < size; ++i) {
INFO("Inaccurate at " << i << ' ' << reference[i] << ' ' << actual[i]);
// Ratio to maximum value.
float threshold = epsilon * std::max<float>(0.01f, std::fabs(reference[i]));
CHECK(std::fabs(reference[i] - actual[i]) < threshold);
}
}
void CompareMSE(const float *float_ref, const float *int_ref, const float *int_test,
std::size_t size, std::string test_info, float int_tolerance,
float float_tolerance, float MSE_float_tolerance, float MSE_int_tolerance);
template <typename Type>
std::string PrintMatrix(const Type *mem, Index rows, Index cols) {
std::ostringstream out;
for (Index r = 0; r < rows; ++r) {
for (Index c = 0; c < cols; ++c) {
out << std::setw(4) << (int64_t) mem[r * cols + c] << ' ';
}
out << '\n';
}
return out.str();
}
/*
* References
*/
namespace references {
// Quantize
template <typename Type>
void Quantize(const float* input, Type* output, float quant_mult, Index size) {
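// Worked example: with quant_mult = 32 and int8_t output, 3.9f becomes
// roundf(124.8f) = 125, while 5.f scales to 160.f and saturates to 127.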
for (Index i = 0; i < size; ++i) {
float value = roundf(input[i] * quant_mult);
value = std::max<float>(std::numeric_limits<Type>::min(), value);
value = std::min<float>(std::numeric_limits<Type>::max(), value);
output[i] = value;
}
}
/*
* Multiply C = A x B
*
* Note: A and B have to be either both integral or both floating-point types.
*
* Callback takes two arguments:
* - Intermediate value of multiplication 1 row times 1 column - it's int32_t or double based on types A and B.
* - Object containing information about position in output matrix - callbacks::OutputBufferInfo.
*/
template <typename TypeA, typename TypeB, typename TypeC, typename LambdaCallback,
typename std::enable_if<
(std::is_integral<TypeA>::value && std::is_integral<TypeB>::value) ||
(std::is_floating_point<TypeA>::value && std::is_floating_point<TypeB>::value)
>::type* = nullptr>
void Multiply(const TypeA* A, const TypeB* B, TypeC* C, Index A_rows, Index width, Index B_cols, LambdaCallback callback) {
using IntermediateType = typename std::conditional<std::is_integral<TypeA>::value, int32_t, double>::type;
for (Index r = 0; r < A_rows; ++r) {
for (Index c = 0; c < B_cols; ++c) {
IntermediateType sum = 0;
for (Index k = 0; k < width; ++k) {
sum += IntermediateType(A[r * width + k]) * IntermediateType(B[k * B_cols + c]);
}
C[r * B_cols + c] = callback(sum, {r, c, A_rows, B_cols});
}
}
}
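// A minimal sketch (not part of this header) of driving the reference Multiply
// above: the callback receives the raw int32_t dot product plus its position in
// C and decides what to store. The function name, matrix shapes and
// unquant_mult are hypothetical.
inline void MultiplyExample(const int8_t* A, const int8_t* B, float* C,
                            Index A_rows, Index width, Index B_cols,
                            float unquant_mult) {
  Multiply(A, B, C, A_rows, width, B_cols,
           [=](int32_t sum, const callbacks::OutputBufferInfo&) {
             return sum * unquant_mult; // dequantize on the fly
           });
}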
// Matrix rearrangement
template <typename Type>
void Rearragement(const Type* input, Type* output, Index simd, Index unroll, Index rows, Index cols) {
for (Index c = 0; c < cols; c += unroll) {
for (Index r = 0; r < rows; r += simd) {
for (Index i = 0; i < unroll; ++i)
for (Index j = 0; j < simd; ++j)
output[simd * i + j] = input[cols * r + c + cols * j + i];
output += unroll * simd;
}
}
}
// Transpose
template <typename Type>
void Transpose(const Type* input, Type* output, Index rows, Index cols) {
for (Index r = 0; r < rows; ++r) {
for (Index c = 0; c < cols; ++c) {
output[rows * c + r] = input[cols * r + c];
}
}
}
} // namespace references
} // namespace intgemm

third_party/intgemm/test/utils_test.cc (vendored)

@ -1,45 +0,0 @@
#include "test.h"
#include "../intgemm/utils.h"
namespace intgemm {
namespace {
TEST_CASE("Factorial",) {
CHECK(factorial(0) == 1);
CHECK(factorial(1) == 1);
CHECK(factorial(2) == 2);
CHECK(factorial(3) == 6);
CHECK(factorial(4) == 24);
// Maximum result that fits in unsigned long long
CHECK(factorial(20) == 2432902008176640000);
}
TEST_CASE("Expi (negative)",) {
const double eps = 0.0000001;
CHECK_EPS(expi(-1), 0.3678794411714423, eps);
CHECK_EPS(expi(-2), 0.1353352832366127, eps);
CHECK_EPS(expi(-10), 0.0000453999297625, eps);
}
TEST_CASE("Expi (zero)",) {
const double eps = 0.0000001;
CHECK_EPS(expi(0), 1.0, eps);
}
TEST_CASE("Expi (positive)",) {
const double eps = 0.0000001;
CHECK_EPS(expi(1), 2.7182818284590452, eps);
CHECK_EPS(expi(2), 7.3890560989306502, eps);
CHECK_EPS(expi(10), 22026.4657948067165170, eps);
}
TEST_CASE("Round up",) {
CHECK(round_up(0, 5) == 0);
CHECK(round_up(1, 5) == 5);
CHECK(round_up(4, 5) == 5);
CHECK(round_up(6, 5) == 10);
}
}
}

third_party/xsimd/Changelog.rst (vendored, new file)

@ -0,0 +1,151 @@
.. Copyright (c) Serge Guelton and Johan Mabille
Copyright (c) QuantStack
Distributed under the terms of the BSD 3-Clause License.
The full license is in the file LICENSE, distributed with this software.
Changelog
=========
9.0.1
-----
* Fix potential ABI issue in SVE support, making ``xsimd::sve`` a type alias to
a size-dependent type.
9.0.0
-----
* Support fixed size SVE
* Fix a bug in SSSE3 ``xsimd::swizzle`` implementation for ``int8`` and ``int16``
* Rename ``xsimd::hadd`` into ``xsimd::reduce_add``, provide ``xsimd::reduce_min`` and ``xsimd::reduce_max``
* Properly report unsupported double for neon on arm32
* Fill holes in xsimd scalar api
* Fix ``find_package(xsimd)`` for xtl enabled xsimd
* Replace ``xsimd::bool_cast`` by ``xsimd::batch_bool_cast``
* Native ``xsimd::hadd`` for float on arm64
* Properly static_assert when trying to instantiate an ``xsimd::batch`` of xtl complex
* Introduce ``xsimd::batch_bool::mask()`` and ``batch_bool::from_mask(...)``
* Flag some function with ``[[nodiscard]]``
* Accept both relative and absolute libdir and include dir in xsimd.pc
* Implement ``xsimd::nearbyint_as_int`` for NEON
* Add ``xsimd::polar``
* Speedup double -> F32/I32 gathers
* Add ``xsimd::slide_left`` and ``xsimd::slide_right``
* Support integral ``xsimd::swizzle`` on AVX
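
A small usage sketch of the reduction entries above (illustrative only; the
values are arbitrary):

.. code-block:: cpp

    xsimd::batch<float> v(2.f);          // broadcast 2 into every lane
    float sum = xsimd::reduce_add(v);    // 2 * v.size
    float lo = xsimd::reduce_min(v);     // 2
    float hi = xsimd::reduce_max(v);     // 2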
8.1.0
-----
* Add ``xsimd::gather`` and ``xsimd::scatter``
* Add ``xsimd::nearbyint_as_int``
* Add ``xsimd::none``
* Add ``xsimd::reciprocal``
* Remove batch constructor from memory address, use ``xsimd::batch<...>::load_(un)aligned`` instead
* Let MSVC users manually disable FMA3 on AVX
* Provide ``xsimd::insert`` to modify a single value from a vector
* Make ``xsimd::pow`` implementation resilient to ``FE_INVALID``
* Reciprocal square root support through ``xsimd::rsqrt``
* NEON: Improve ``xsimd::any`` and ``xsimd::all``
* Provide type utility to explicitly require a batch of given size and type
* Implement ``xsimd::swizzle`` on x86, neon and neon64
* AVX support for ``xsimd::zip_lo`` and ``xsimd::zip_hi``
* Only use ``_mm256_unpacklo_epi<N>`` on AVX2
* Provide neon/neon64 conversion function from ``uint(32|64)_t`` to ``(float|double)``
* Provide SSE/AVX/AVX2 conversion function from ``uint32_t`` to ``float``
* Provide AVX2 conversion function from ``(u)int64_t`` to ``double``
* Provide better SSE conversion function from ``uint64_t`` to ``double``
* Provide better SSE conversion function to ``double``
* Support logical xor for ``xsimd::batch_bool``
* Clarify fma support:
- FMA3 + SSE -> ``xsimd::fma3<sse4_2>``
- FMA3 + AVX -> ``xsimd::fma3<avx>``
- FMA3 + AVX2 -> ``xsimd::fma3<avx2>``
- FMA4 -> ``xsimd::fma4``
* Allow ``xsimd::transform`` to work with complex types
* Add missing scalar version of ``xsimd::norm`` and ``xsimd::conj``
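
A sketch of the FMA architecture tags listed above (illustrative only; assumes
the translation unit is built with AVX2 and FMA3 enabled):

.. code-block:: cpp

    using fma_avx2 = xsimd::fma3<xsimd::avx2>;
    xsimd::batch<float, fma_avx2> a(1.f), b(2.f), c(3.f);
    auto r = xsimd::fma(a, b, c);        // 1 * 2 + 3 = 5 in every lane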
8.0.5
-----
* Fix neon ``xsimd::hadd`` implementation
* Detect unsupported architectures and set ``XSIMD_NO_SUPPORTED_ARCHITECTURE``
if need be
8.0.4
-----
* Provide some conversion operators for ``float`` -> ``uint32``
* Improve code generated for AVX2 signed integer comparisons
* Enable detection of avx512cd and avx512dq, and fix avx512bw detection
* Enable detection of AVX2+FMA
* Pick the best compatible architecture in ``xsimd::dispatch``
* Enables support for FMA when AVX2 is detected on Windows
* Add missing includes / forward declaration
* Mark all functions inline and noexcept
* Assert when using incomplete ``std::initializer_list``
8.0.3
-----
* Improve CI & testing, no functional change
8.0.2
-----
* Do not use ``_mm256_srai_epi32`` under AVX, it's an AVX2 instruction
8.0.1
-----
* Fix invalid constexpr ``std::make_tuple`` usage in neon64


@ -34,8 +34,8 @@ namespace xsimd
inline batch<T_out, A> batch_cast(batch<T_in, A> const&, batch<T_out, A> const& out) noexcept;
template <class T, class A>
inline batch<T, A> bitofsign(batch<T, A> const& self) noexcept;
template <class B, class T, class A>
inline B bitwise_cast(batch<T, A> const& self) noexcept;
template <class T_out, class T_in, class A>
inline batch<T_out, A> bitwise_cast(batch<T_in, A> const& self) noexcept;
template <class T, class A>
inline batch<T, A> cos(batch<T, A> const& self) noexcept;
template <class T, class A>


@ -909,7 +909,7 @@ namespace xsimd
e = fms(x, e, hxs);
using i_type = as_integer_t<batch_type>;
i_type ik = to_int(k);
batch_type two2mk = ::xsimd::bitwise_cast<batch_type>((constants::maxexponent<batch_type>() - ik) << constants::nmb<batch_type>());
batch_type two2mk = ::xsimd::bitwise_cast<float>((constants::maxexponent<batch_type>() - ik) << constants::nmb<batch_type>());
batch_type y = batch_type(1.) - two2mk - (e - x);
return ldexp(y, ik);
}
@ -936,7 +936,7 @@ namespace xsimd
e = (x * (e - c) - c) - hxs;
using i_type = as_integer_t<batch_type>;
i_type ik = to_int(k);
batch_type two2mk = ::xsimd::bitwise_cast<batch_type>((constants::maxexponent<batch_type>() - ik) << constants::nmb<batch_type>());
batch_type two2mk = ::xsimd::bitwise_cast<double>((constants::maxexponent<batch_type>() - ik) << constants::nmb<batch_type>());
batch_type ct1 = batch_type(1.) - two2mk - (e - x);
batch_type ct2 = ++(x - (e + two2mk));
batch_type y = select(k < batch_type(20.), ct1, ct2);
@ -1004,13 +1004,14 @@ namespace xsimd
inline batch<T, A> frexp(const batch<T, A>& self, batch<as_integer_t<T>, A>& exp, requires_arch<generic>) noexcept
{
using batch_type = batch<T, A>;
using i_type = batch<as_integer_t<T>, A>;
using int_type = as_integer_t<T>;
using i_type = batch<int_type, A>;
i_type m1f = constants::mask1frexp<batch_type>();
i_type r1 = m1f & ::xsimd::bitwise_cast<i_type>(self);
batch_type x = self & ::xsimd::bitwise_cast<batch_type>(~m1f);
i_type r1 = m1f & ::xsimd::bitwise_cast<int_type>(self);
batch_type x = self & ::xsimd::bitwise_cast<T>(~m1f);
exp = (r1 >> constants::nmb<batch_type>()) - constants::maxexponentm1<batch_type>();
exp = select(batch_bool_cast<typename i_type::value_type>(self != batch_type(0.)), exp, i_type(typename i_type::value_type(0)));
return select((self != batch_type(0.)), x | ::xsimd::bitwise_cast<batch_type>(constants::mask2frexp<batch_type>()), batch_type(0.));
return select((self != batch_type(0.)), x | ::xsimd::bitwise_cast<T>(constants::mask2frexp<batch_type>()), batch_type(0.));
}
// from bool
@ -1058,7 +1059,7 @@ namespace xsimd
using itype = as_integer_t<batch_type>;
itype ik = other + constants::maxexponent<T>();
ik = ik << constants::nmb<T>();
return self * ::xsimd::bitwise_cast<batch_type>(ik);
return self * ::xsimd::bitwise_cast<T>(ik);
}
// lgamma
@ -1383,7 +1384,8 @@ namespace xsimd
inline batch<float, A> log(batch<float, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<float, A>;
using i_type = as_integer_t<batch_type>;
using int_type = as_integer_t<float>;
using i_type = batch<int_type, A>;
batch_type x = self;
i_type k(0);
auto isnez = (self != batch_type(0.));
@ -1391,15 +1393,15 @@ namespace xsimd
auto test = (self < constants::smallestposval<batch_type>()) && isnez;
if (any(test))
{
k = select(batch_bool_cast<typename i_type::value_type>(test), k - i_type(23), k);
k = select(batch_bool_cast<int_type>(test), k - i_type(23), k);
x = select(test, x * batch_type(8388608ul), x);
}
#endif
i_type ix = ::xsimd::bitwise_cast<i_type>(x);
i_type ix = ::xsimd::bitwise_cast<int_type>(x);
ix += 0x3f800000 - 0x3f3504f3;
k += (ix >> 23) - 0x7f;
ix = (ix & i_type(0x007fffff)) + 0x3f3504f3;
x = ::xsimd::bitwise_cast<batch_type>(ix);
x = ::xsimd::bitwise_cast<float>(ix);
batch_type f = --x;
batch_type s = f / (batch_type(2.) + f);
batch_type z = s * s;
@ -1422,17 +1424,18 @@ namespace xsimd
inline batch<double, A> log(batch<double, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<double, A>;
using i_type = as_integer_t<batch_type>;
using int_type = as_integer_t<double>;
using i_type = batch<int_type, A>;
batch_type x = self;
i_type hx = ::xsimd::bitwise_cast<i_type>(x) >> 32;
i_type hx = ::xsimd::bitwise_cast<int_type>(x) >> 32;
i_type k(0);
auto isnez = (self != batch_type(0.));
#ifndef XSIMD_NO_DENORMALS
auto test = (self < constants::smallestposval<batch_type>()) && isnez;
if (any(test))
{
k = select(batch_bool_cast<typename i_type::value_type>(test), k - i_type(54), k);
k = select(batch_bool_cast<int_type>(test), k - i_type(54), k);
x = select(test, x * batch_type(18014398509481984ull), x);
}
#endif
@ -1440,7 +1443,7 @@ namespace xsimd
k += (hx >> 20) - 0x3ff;
batch_type dk = to_float(k);
hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e;
x = ::xsimd::bitwise_cast<batch_type>(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast<i_type>(x)));
x = ::xsimd::bitwise_cast<double>(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast<int_type>(x)));
batch_type f = --x;
batch_type hfsq = batch_type(0.5) * f * f;
@ -1471,7 +1474,8 @@ namespace xsimd
inline batch<float, A> log2(batch<float, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<float, A>;
using i_type = as_integer_t<batch_type>;
using int_type = as_integer_t<float>;
using i_type = batch<int_type, A>;
batch_type x = self;
i_type k(0);
auto isnez = (self != batch_type(0.));
@ -1479,15 +1483,15 @@ namespace xsimd
auto test = (self < constants::smallestposval<batch_type>()) && isnez;
if (any(test))
{
k = select(batch_bool_cast<typename i_type::value_type>(test), k - i_type(25), k);
k = select(batch_bool_cast<int_type>(test), k - i_type(25), k);
x = select(test, x * batch_type(33554432ul), x);
}
#endif
i_type ix = ::xsimd::bitwise_cast<i_type>(x);
i_type ix = ::xsimd::bitwise_cast<int_type>(x);
ix += 0x3f800000 - 0x3f3504f3;
k += (ix >> 23) - 0x7f;
ix = (ix & i_type(0x007fffff)) + 0x3f3504f3;
x = ::xsimd::bitwise_cast<batch_type>(ix);
x = ::xsimd::bitwise_cast<float>(ix);
batch_type f = --x;
batch_type s = f / (batch_type(2.) + f);
batch_type z = s * s;
@ -1510,9 +1514,10 @@ namespace xsimd
inline batch<double, A> log2(batch<double, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<double, A>;
using i_type = as_integer_t<batch_type>;
using int_type = as_integer_t<double>;
using i_type = batch<int_type, A>;
batch_type x = self;
i_type hx = ::xsimd::bitwise_cast<i_type>(x) >> 32;
i_type hx = ::xsimd::bitwise_cast<int_type>(x) >> 32;
i_type k(0);
auto isnez = (self != batch_type(0.));
#ifndef XSIMD_NO_DENORMALS
@ -1526,7 +1531,7 @@ namespace xsimd
hx += 0x3ff00000 - 0x3fe6a09e;
k += (hx >> 20) - 0x3ff;
hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e;
x = ::xsimd::bitwise_cast<batch_type>(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast<i_type>(x)));
x = ::xsimd::bitwise_cast<double>(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast<int_type>(x)));
batch_type f = --x;
batch_type s = f / (batch_type(2.) + f);
batch_type z = s * s;
@ -1536,7 +1541,7 @@ namespace xsimd
batch_type R = t2 + t1;
batch_type hfsq = batch_type(0.5) * f * f;
batch_type hi = f - hfsq;
hi = hi & ::xsimd::bitwise_cast<batch_type>((constants::allbits<i_type>() << 32));
hi = hi & ::xsimd::bitwise_cast<double>((constants::allbits<i_type>() << 32));
batch_type lo = fma(s, hfsq + R, f - hi - hfsq);
batch_type val_hi = hi * constants::invlog_2hi<batch_type>();
batch_type val_lo = fma(lo + hi, constants::invlog_2lo<batch_type>(), lo * constants::invlog_2hi<batch_type>());
@ -1591,7 +1596,8 @@ namespace xsimd
ivln10lo(-3.1689971365e-05f),
log10_2hi(3.0102920532e-01f),
log10_2lo(7.9034151668e-07f);
using i_type = as_integer_t<batch_type>;
using int_type = as_integer_t<float>;
using i_type = batch<int_type, A>;
batch_type x = self;
i_type k(0);
auto isnez = (self != batch_type(0.));
@ -1599,15 +1605,15 @@ namespace xsimd
auto test = (self < constants::smallestposval<batch_type>()) && isnez;
if (any(test))
{
k = select(batch_bool_cast<typename i_type::value_type>(test), k - i_type(25), k);
k = select(batch_bool_cast<int_type>(test), k - i_type(25), k);
x = select(test, x * batch_type(33554432ul), x);
}
#endif
i_type ix = ::xsimd::bitwise_cast<i_type>(x);
i_type ix = ::xsimd::bitwise_cast<int_type>(x);
ix += 0x3f800000 - 0x3f3504f3;
k += (ix >> 23) - 0x7f;
ix = (ix & i_type(0x007fffff)) + 0x3f3504f3;
x = ::xsimd::bitwise_cast<batch_type>(ix);
x = ::xsimd::bitwise_cast<float>(ix);
batch_type f = --x;
batch_type s = f / (batch_type(2.) + f);
batch_type z = s * s;
@ -1618,7 +1624,7 @@ namespace xsimd
batch_type dk = to_float(k);
batch_type hfsq = batch_type(0.5) * f * f;
batch_type hibits = f - hfsq;
hibits &= ::xsimd::bitwise_cast<batch_type>(i_type(0xfffff000));
hibits &= ::xsimd::bitwise_cast<float>(i_type(0xfffff000));
batch_type lobits = fma(s, hfsq + R, f - hibits - hfsq);
batch_type r = fma(dk, log10_2hi,
fma(hibits, ivln10hi,
@ -1641,23 +1647,24 @@ namespace xsimd
ivln10lo(2.50829467116452752298e-11),
log10_2hi(3.01029995663611771306e-01),
log10_2lo(3.69423907715893078616e-13);
using i_type = as_integer_t<batch_type>;
using int_type = as_integer_t<double>;
using i_type = batch<int_type, A>;
batch_type x = self;
i_type hx = ::xsimd::bitwise_cast<i_type>(x) >> 32;
i_type hx = ::xsimd::bitwise_cast<int_type>(x) >> 32;
i_type k(0);
auto isnez = (self != batch_type(0.));
#ifndef XSIMD_NO_DENORMALS
auto test = (self < constants::smallestposval<batch_type>()) && isnez;
if (any(test))
{
k = select(batch_bool_cast<typename i_type::value_type>(test), k - i_type(54), k);
k = select(batch_bool_cast<int_type>(test), k - i_type(54), k);
x = select(test, x * batch_type(18014398509481984ull), x);
}
#endif
hx += 0x3ff00000 - 0x3fe6a09e;
k += (hx >> 20) - 0x3ff;
hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e;
x = ::xsimd::bitwise_cast<batch_type>(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast<i_type>(x)));
x = ::xsimd::bitwise_cast<double>(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast<int_type>(x)));
batch_type f = --x;
batch_type dk = to_float(k);
batch_type s = f / (batch_type(2.) + f);
@ -1668,7 +1675,7 @@ namespace xsimd
batch_type R = t2 + t1;
batch_type hfsq = batch_type(0.5) * f * f;
batch_type hi = f - hfsq;
hi = hi & ::xsimd::bitwise_cast<batch_type>(constants::allbits<i_type>() << 32);
hi = hi & ::xsimd::bitwise_cast<double>(constants::allbits<i_type>() << 32);
batch_type lo = f - hi - hfsq + s * (hfsq + R);
batch_type val_hi = hi * ivln10hi;
batch_type y = dk * log10_2hi;
@ -1705,14 +1712,15 @@ namespace xsimd
inline batch<float, A> log1p(batch<float, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<float, A>;
using i_type = as_integer_t<batch_type>;
using int_type = as_integer_t<float>;
using i_type = batch<int_type, A>;
const batch_type uf = self + batch_type(1.);
auto isnez = (uf != batch_type(0.));
i_type iu = ::xsimd::bitwise_cast<i_type>(uf);
i_type iu = ::xsimd::bitwise_cast<int_type>(uf);
iu += 0x3f800000 - 0x3f3504f3;
i_type k = (iu >> 23) - 0x7f;
iu = (iu & i_type(0x007fffff)) + 0x3f3504f3;
batch_type f = --(::xsimd::bitwise_cast<batch_type>(iu));
batch_type f = --(::xsimd::bitwise_cast<float>(iu));
batch_type s = f / (batch_type(2.) + f);
batch_type z = s * s;
batch_type w = z * z;
@ -1736,16 +1744,17 @@ namespace xsimd
inline batch<double, A> log1p(batch<double, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<double, A>;
using i_type = as_integer_t<batch_type>;
using int_type = as_integer_t<double>;
using i_type = batch<int_type, A>;
const batch_type uf = self + batch_type(1.);
auto isnez = (uf != batch_type(0.));
i_type hu = ::xsimd::bitwise_cast<i_type>(uf) >> 32;
i_type hu = ::xsimd::bitwise_cast<int_type>(uf) >> 32;
hu += 0x3ff00000 - 0x3fe6a09e;
i_type k = (hu >> 20) - 0x3ff;
/* correction term ~ log(1+x)-log(u), avoid underflow in c/u */
batch_type c = select(batch_bool_cast<double>(k >= i_type(2)), batch_type(1.) - (uf - self), self - (uf - batch_type(1.))) / uf;
hu = (hu & i_type(0x000fffff)) + 0x3fe6a09e;
batch_type f = ::xsimd::bitwise_cast<batch_type>((hu << 32) | (i_type(0xffffffff) & ::xsimd::bitwise_cast<i_type>(uf)));
batch_type f = ::xsimd::bitwise_cast<double>((hu << 32) | (i_type(0xffffffff) & ::xsimd::bitwise_cast<int_type>(uf)));
f = --f;
batch_type hfsq = batch_type(0.5) * f * f;
batch_type s = f / (batch_type(2.) + f);
@ -1897,13 +1906,13 @@ namespace xsimd
static inline batch_type next(const batch_type& b) noexcept
{
batch_type n = ::xsimd::bitwise_cast<batch_type>(::xsimd::bitwise_cast<int_batch>(b) + int_type(1));
batch_type n = ::xsimd::bitwise_cast<T>(::xsimd::bitwise_cast<int_type>(b) + int_type(1));
return select(b == constants::infinity<batch_type>(), b, n);
}
static inline batch_type prev(const batch_type& b) noexcept
{
batch_type p = ::xsimd::bitwise_cast<batch_type>(::xsimd::bitwise_cast<int_batch>(b) - int_type(1));
batch_type p = ::xsimd::bitwise_cast<T>(::xsimd::bitwise_cast<int_type>(b) - int_type(1));
return select(b == constants::minusinfinity<batch_type>(), b, p);
}
};


@ -159,7 +159,7 @@ namespace xsimd
template <class A, class T_out, class T_in>
inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<avx>) noexcept
{
return { bitwise_cast<batch<T_out, A>>(batch<T_in, A>(self.data)).data };
return { bitwise_cast<T_out>(batch<T_in, A>(self.data)).data };
}
// bitwise_and
@ -1493,8 +1493,8 @@ namespace xsimd
V7> const& mask,
requires_arch<avx>) noexcept
{
return bitwise_cast<batch<T, A>>(
swizzle(bitwise_cast<batch<float, A>>(self), mask));
return bitwise_cast<T>(
swizzle(bitwise_cast<float>(self), mask));
}
template <class A,
@ -1509,8 +1509,8 @@ namespace xsimd
batch_constant<batch<uint64_t, A>, V0, V1, V2, V3> const& mask,
requires_arch<avx>) noexcept
{
return bitwise_cast<batch<T, A>>(
swizzle(bitwise_cast<batch<double, A>>(self), mask));
return bitwise_cast<T>(
swizzle(bitwise_cast<double>(self), mask));
}
// trunc


@ -574,7 +574,17 @@ namespace xsimd
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
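// AVX2 has no 8-bit multiply: the two 16-bit multiplies below leave the
// correct low byte of each 16-bit lane in res_lo and the correct high byte in
// res_hi; the byte blend on mask_hi merges the two halves.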
__m256i mask_hi = _mm256_set1_epi32(0xFF00FF00);
__m256i res_lo = _mm256_mullo_epi16(self, other);
__m256i other_hi = _mm256_srli_epi16(other, 8);
__m256i self_hi = _mm256_and_si256(self, mask_hi);
__m256i res_hi = _mm256_mullo_epi16(self_hi, other_hi);
__m256i res = _mm256_blendv_epi8(res_lo, res_hi, mask_hi);
return res;
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm256_mullo_epi16(self, other);
}
@ -852,7 +862,7 @@ namespace xsimd
template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3> mask, requires_arch<avx2>) noexcept
{
return bitwise_cast<batch<int64_t, A>>(swizzle(bitwise_cast<batch<uint64_t, A>>(self), mask, avx2 {}));
return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, avx2 {}));
}
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
@ -862,7 +872,7 @@ namespace xsimd
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
{
return bitwise_cast<batch<int32_t, A>>(swizzle(bitwise_cast<batch<uint32_t, A>>(self), mask, avx2 {}));
return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, avx2 {}));
}
// zip_hi


@ -551,7 +551,7 @@ namespace xsimd
template <class A, uint16_t... Vs>
inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<batch<uint16_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
{
return bitwise_cast<batch<int16_t, A>>(swizzle(bitwise_cast<batch<uint16_t, A>>(self), mask, avx512bw {}));
return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, avx512bw {}));
}
template <class A, uint8_t... Vs>
@ -563,7 +563,7 @@ namespace xsimd
template <class A, uint8_t... Vs>
inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<batch<uint8_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
{
return bitwise_cast<batch<int8_t, A>>(swizzle(bitwise_cast<batch<uint8_t, A>>(self), mask, avx512bw {}));
return bitwise_cast<int8_t>(swizzle(bitwise_cast<uint8_t>(self), mask, avx512bw {}));
}
// zip_hi


@ -768,6 +768,45 @@ namespace xsimd
return _mm512_roundscale_pd(self, _MM_FROUND_TO_NEG_INF);
}
// fnma
template <class A>
inline batch<float, A> fnma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<avx512f>) noexcept
{
return _mm512_fnmadd_ps(x, y, z);
}
template <class A>
inline batch<double, A> fnma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<avx512f>) noexcept
{
return _mm512_fnmadd_pd(x, y, z);
}
// fma
template <class A>
inline batch<float, A> fma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<avx512f>) noexcept
{
return _mm512_fmadd_ps(x, y, z);
}
template <class A>
inline batch<double, A> fma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<avx512f>) noexcept
{
return _mm512_fmadd_pd(x, y, z);
}
// fms
template <class A>
inline batch<float, A> fms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<avx512f>) noexcept
{
return _mm512_fmsub_ps(x, y, z);
}
template <class A>
inline batch<double, A> fms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<avx512f>) noexcept
{
return _mm512_fmsub_pd(x, y, z);
}
// from bool
template <class A, class T>
inline batch<T, A> from_bool(batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
@ -1763,7 +1802,7 @@ namespace xsimd
template <class A, uint64_t... Vs>
inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<batch<uint64_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept
{
return bitwise_cast<batch<int64_t, A>>(swizzle(bitwise_cast<batch<uint64_t, A>>(self), mask, avx512f {}));
return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, avx512f {}));
}
template <class A, uint32_t... Vs>
@ -1775,7 +1814,7 @@ namespace xsimd
template <class A, uint32_t... Vs>
inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<batch<uint32_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept
{
return bitwise_cast<batch<int32_t, A>>(swizzle(bitwise_cast<batch<uint32_t, A>>(self), mask, avx512f {}));
return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, avx512f {}));
}
namespace detail
@ -1833,7 +1872,7 @@ namespace xsimd
inline batch<int16_t, A>
swizzle(batch<int16_t, A> const& self, batch_constant<batch<uint16_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept
{
return bitwise_cast<batch<int16_t, A>>(swizzle(bitwise_cast<batch<uint16_t, A>>(self), mask, avx512f {}));
return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, avx512f {}));
}
// trunc


@ -452,59 +452,114 @@ namespace xsimd
* load *
********/
// It is not possible to use a call to A::alignment() here, so use an
// immediate instead.
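// (16 bytes for __builtin_assume_aligned on GCC/Clang; the 128 passed to the
// MSVC *_ex load intrinsics is the same alignment hint expressed in bits.)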
#if defined(__clang__) || defined(__GNUC__)
#define xsimd_aligned_load(inst, type, expr) inst((type)__builtin_assume_aligned(expr, 16))
#elif defined(_MSC_VER)
#define xsimd_aligned_load(inst, type, expr) inst##_ex((type)expr, 128)
#else
#define xsimd_aligned_load(inst, type, expr) inst((type)expr)
#endif
template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return vld1q_u8((uint8_t*)src);
return xsimd_aligned_load(vld1q_u8, uint8_t*, src);
}
template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return vld1q_s8((int8_t*)src);
return xsimd_aligned_load(vld1q_s8, int8_t*, src);
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return vld1q_u16((uint16_t*)src);
return xsimd_aligned_load(vld1q_u16, uint16_t*, src);
}
template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return vld1q_s16((int16_t*)src);
return xsimd_aligned_load(vld1q_s16, int16_t*, src);
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return vld1q_u32((uint32_t*)src);
return xsimd_aligned_load(vld1q_u32, uint32_t*, src);
}
template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return vld1q_s32((int32_t*)src);
return xsimd_aligned_load(vld1q_s32, int32_t*, src);
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return vld1q_u64((uint64_t*)src);
return xsimd_aligned_load(vld1q_u64, uint64_t*, src);
}
template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return vld1q_s64((int64_t*)src);
return xsimd_aligned_load(vld1q_s64, int64_t*, src);
}
template <class A>
inline batch<float, A> load_aligned(float const* src, convert<float>, requires_arch<neon>) noexcept
{
return vld1q_f32(src);
return xsimd_aligned_load(vld1q_f32, float*, src);
}
template <class A, class T>
#undef xsimd_aligned_load
template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return load_aligned<A>(src, convert<T>(), A {});
return vld1q_u8((uint8_t*)src);
}
template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return vld1q_s8((int8_t*)src);
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return vld1q_u16((uint16_t*)src);
}
template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return vld1q_s16((int16_t*)src);
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return vld1q_u32((uint32_t*)src);
}
template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return vld1q_s32((int32_t*)src);
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return vld1q_u64((uint64_t*)src);
}
template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return vld1q_s64((int64_t*)src);
}
template <class A>
inline batch<float, A> load_unaligned(float const* src, convert<float>, requires_arch<neon>) noexcept
{
return vld1q_f32(src);
}
/*********
@ -2526,9 +2581,9 @@ namespace xsimd
inline batch<T, A> operator()(batch<T, A> const& x, requires_arch<neon>) noexcept
{
const auto left = vdupq_n_u8(0);
const auto right = bitwise_cast<batch<uint8_t, A>>(x).data;
const auto right = bitwise_cast<uint8_t>(x).data;
const batch<uint8_t, A> res(vextq_u8(left, right, 16 - N));
return bitwise_cast<batch<T, A>>(res);
return bitwise_cast<T>(res);
}
};
@ -2558,10 +2613,10 @@ namespace xsimd
template <class A, class T>
inline batch<T, A> operator()(batch<T, A> const& x, requires_arch<neon>) noexcept
{
const auto left = bitwise_cast<batch<uint8_t, A>>(x).data;
const auto left = bitwise_cast<uint8_t>(x).data;
const auto right = vdupq_n_u8(0);
const batch<uint8_t, A> res(vextq_u8(left, right, N));
return bitwise_cast<batch<T, A>>(res);
return bitwise_cast<T>(res);
}
};


@ -133,18 +133,26 @@ namespace xsimd
/********
* load *
********/
#if defined(__clang__) || defined(__GNUC__)
#define xsimd_aligned_load(inst, type, expr) inst((type)__builtin_assume_aligned(expr, 16))
#elif defined(_MSC_VER)
#define xsimd_aligned_load(inst, type, expr) inst##_ex((type)expr, 128)
#else
#define xsimd_aligned_load(inst, type, expr) inst((type)expr)
#endif
template <class A>
inline batch<double, A> load_aligned(double const* src, convert<double>, requires_arch<neon64>) noexcept
{
return vld1q_f64(src);
return xsimd_aligned_load(vld1q_f64, double*, src);
}
template <class A>
inline batch<double, A> load_unaligned(double const* src, convert<double>, requires_arch<neon64>) noexcept
{
return load_aligned<A>(src, convert<double>(), A {});
return vld1q_f64(src);
}
#undef xsimd_aligned_load
/*********
* store *


@ -441,7 +441,16 @@ namespace xsimd
return !(x0 == x1);
}
#if defined(_GNU_SOURCE) && !defined(__APPLE__) && !defined(__MINGW32__) && !defined(__ANDROID__)
#if defined(__APPLE__)
inline float exp10(const float& x) noexcept
{
return __exp10f(x);
}
inline double exp10(const double& x) noexcept
{
return __exp10(x);
}
#elif defined(__GLIBC__)
inline float exp10(const float& x) noexcept
{
return ::exp10f(x);
@ -450,14 +459,24 @@ namespace xsimd
{
return ::exp10(x);
}
#endif
#elif defined(_WIN32)
template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
inline T exp10(const T& x) noexcept
{
// FIXME: very inefficient
// Very inefficient but other implementations give incorrect results
// on Windows
return std::pow(T(10), x);
}
#else
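// Generic fallback: exp10(x) = exp(x * ln 10); the hexadecimal literals below
// are ln(10) rounded to float and double precision.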
inline float exp10(const float& x) noexcept
{
return std::exp(0x1.26bb1cp+1f * x);
}
inline double exp10(const double& x) noexcept
{
return std::exp(0x1.26bb1bbb55516p+1 * x);
}
#endif
template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
inline auto rsqrt(const T& x) noexcept -> decltype(std::sqrt(x))


@ -23,8 +23,8 @@ namespace xsimd
template <class batch_type, bool... Values>
struct batch_bool_constant;
template <class B, class T, class A>
inline B bitwise_cast(batch<T, A> const& x) noexcept;
template <class T_out, class T_in, class A>
inline batch<T_out, A> bitwise_cast(batch<T_in, A> const& x) noexcept;
template <class batch_type, typename batch_type::value_type... Values>
struct batch_constant;
@ -140,7 +140,7 @@ namespace xsimd
template <class A, class T_out, class T_in>
inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<sse2>) noexcept
{
return { bitwise_cast<batch<T_out, A>>(batch<T_in, A>(self.data)).data };
return { bitwise_cast<T_out>(batch<T_in, A>(self.data)).data };
}
// bitwise_and
@ -1185,7 +1185,7 @@ namespace xsimd
batch<T, A> acc2 = max(acc1, step2);
if (sizeof(T) == 2)
return acc2.get(0);
batch<T, A> step3 = bitwise_cast<batch<T, A>>(bitwise_cast<batch<uint16_t, A>>(acc2) >> 8);
batch<T, A> step3 = bitwise_cast<T>(bitwise_cast<uint16_t>(acc2) >> 8);
batch<T, A> acc3 = max(acc2, step3);
return acc3.get(0);
}
@ -1207,7 +1207,7 @@ namespace xsimd
batch<T, A> acc2 = min(acc1, step2);
if (sizeof(T) == 2)
return acc2.get(0);
batch<T, A> step3 = bitwise_cast<batch<T, A>>(bitwise_cast<batch<uint16_t, A>>(acc2) >> 8);
batch<T, A> step3 = bitwise_cast<T>(bitwise_cast<uint16_t>(acc2) >> 8);
batch<T, A> acc3 = min(acc2, step3);
return acc3.get(0);
}
@ -1600,7 +1600,7 @@ namespace xsimd
template <class A, uint64_t V0, uint64_t V1>
inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1> mask, requires_arch<sse2>) noexcept
{
return bitwise_cast<batch<int64_t, A>>(swizzle(bitwise_cast<batch<uint64_t, A>>(self), mask, sse2 {}));
return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, sse2 {}));
}
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
@ -1613,7 +1613,7 @@ namespace xsimd
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> mask, requires_arch<sse2>) noexcept
{
return bitwise_cast<batch<int32_t, A>>(swizzle(bitwise_cast<batch<uint32_t, A>>(self), mask, sse2 {}));
return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, sse2 {}));
}
// zip_hi


@ -118,7 +118,7 @@ namespace xsimd
template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<ssse3>) noexcept
{
return bitwise_cast<batch<int16_t, A>>(swizzle(bitwise_cast<batch<uint16_t, A>>(self), mask, ssse3 {}));
return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, ssse3 {}));
}
template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
@ -132,7 +132,7 @@ namespace xsimd
uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<ssse3>) noexcept
{
return bitwise_cast<batch<int8_t, A>>(swizzle(bitwise_cast<batch<uint8_t, A>>(self), mask, ssse3 {}));
return bitwise_cast<int8_t>(swizzle(bitwise_cast<uint8_t>(self), mask, ssse3 {}));
}
}


@ -289,45 +289,54 @@
#ifdef _MSC_VER
#if XSIMD_WITH_AVX512
#undef XSIMD_WITH_AVX2
#define XSIMD_WITH_AVX2 1
#endif
#if XSIMD_WITH_AVX2
#undef XSIMD_WITH_AVX
#define XSIMD_WITH_AVX 1
#undef XSIMD_WITH_FMA3_AVX
#define XSIMD_WITH_FMA3_AVX 1
#undef XSIMD_WITH_FMA3_AVX2
#define XSIMD_WITH_FMA3_AVX2 1
#endif
#if XSIMD_WITH_AVX
#undef XSIMD_WITH_SSE4_2
#define XSIMD_WITH_SSE4_2 1
#endif
#if !defined(__clang__) && (defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2))
#undef XSIMD_WITH_SSE4_2
#define XSIMD_WITH_SSE4_2 1
#endif
#if XSIMD_WITH_SSE4_2
#undef XSIMD_WITH_SSE4_1
#define XSIMD_WITH_SSE4_1 1
#endif
#if XSIMD_WITH_SSE4_1
#undef XSIMD_WITH_SSSE3
#define XSIMD_WITH_SSSE3 1
#endif
#if XSIMD_WITH_SSSE3
#undef XSIMD_WITH_SSE3
#define XSIMD_WITH_SSE3 1
#endif
#if XSIMD_WITH_SSE3 || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
#if XSIMD_WITH_SSE3 || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
#undef XSIMD_WITH_SSE2
#define XSIMD_WITH_SSE2 1
#endif


@ -71,6 +71,21 @@ namespace xsimd
template <class C>
using container_alignment_t = typename container_alignment<C>::type;
/*********************
* alignment checker *
*********************/
/**
* Checks whether pointer \c ptr is aligned according to the alignment
* requirements of \c Arch.
* @return true if the alignment requirements are met
*/
template <class Arch = default_arch>
inline bool is_aligned(void const* ptr)
{
return (reinterpret_cast<uintptr_t>(ptr) % static_cast<uintptr_t>(Arch::alignment())) == 0;
}
}
#endif
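// Usage sketch for the is_aligned helper added above (illustrative only, not
// part of the header): with no explicit Arch, the default architecture's
// alignment requirement is checked.
inline bool is_aligned_example()
{
    alignas(xsimd::default_arch::alignment()) float buf[xsimd::batch<float>::size] = {};
    return xsimd::is_aligned(buf); // true: buf is aligned for the default arch
}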


@ -314,11 +314,12 @@ namespace xsimd
* @param x batch of \c T_in
* @return \c x reinterpreted as \c T_out
*/
template <class B, class T, class A>
inline B bitwise_cast(batch<T, A> const& x) noexcept
template <class T_out, class T_in, class A>
inline batch<T_out, A> bitwise_cast(batch<T_in, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::bitwise_cast<A>(x, B {}, A {});
detail::static_check_supported_config<T_in, A>();
detail::static_check_supported_config<T_out, A>();
return kernel::bitwise_cast<A>(x, batch<T_out, A> {}, A {});
}
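// Call-site sketch of the new spelling (illustrative only): the destination is
// now named by its scalar type rather than by the full batch type, e.g.
// bitwise_cast<int32_t>(f) instead of bitwise_cast<batch<int32_t, A>>(f).
inline batch<int32_t> bitwise_cast_example(batch<float> const& f) noexcept
{
    return bitwise_cast<int32_t>(f); // reinterpret the bits, no value conversion
}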
/**
@ -886,10 +887,10 @@ namespace xsimd
* @return the result of the reduction, as a scalar.
*/
template <class T, class A, class F>
inline T reduce(F&& r, batch<T, A> const& x) noexcept
inline T reduce(F&& f, batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::detail::reduce(std::forward<F>(r), x, std::integral_constant<unsigned, batch<T, A>::size>());
return kernel::detail::reduce(std::forward<F>(f), x, std::integral_constant<unsigned, batch<T, A>::size>());
}
/**


@ -16,6 +16,13 @@
#include "xsimd_batch.hpp"
/**
* high level type traits
*
* @defgroup batch_traits Type traits
*
**/
namespace xsimd
{
@ -205,11 +212,18 @@ namespace xsimd
template <class T1, class T2, class A = default_arch>
using simd_return_type = typename detail::simd_return_type_impl<T1, T2, A>::type;
/************
* is_batch *
************/
/**
* @ingroup batch_traits
*
* type trait that inherits from @c std::true_type for @c batch<...> types and from
* @c std::false_type otherwise.
*
* @tparam T type to analyze.
*/
template <class T>
struct is_batch;
template <class V>
template <class T>
struct is_batch : std::false_type
{
};
@ -219,11 +233,16 @@ namespace xsimd
{
};
/*****************
* is_batch_bool *
*****************/
/**
* @ingroup batch_traits
*
* type trait that inherits from @c std::true_type for @c batch_bool<...> types and from
* @c std::false_type otherwise.
*
* @tparam T type to analyze.
*/
template <class V>
template <class T>
struct is_batch_bool : std::false_type
{
};
@ -233,11 +252,16 @@ namespace xsimd
{
};
/********************
* is_batch_complex *
********************/
/**
* @ingroup batch_traits
*
* type trait that inherits from @c std::true_type for @c batch<std::complex<...>>
* types and from @c std::false_type otherwise.
*
* @tparam T type to analyze.
*/
template <class V>
template <class T>
struct is_batch_complex : std::false_type
{
};
@ -246,6 +270,50 @@ namespace xsimd
struct is_batch_complex<batch<std::complex<T>, A>> : std::true_type
{
};
/**
* @ingroup batch_traits
*
* type trait whose @c type field is set to @c T::value_type if @c
* is_batch<T>::value and to @c T otherwise.
*
* @tparam T type to analyze.
*/
template <class T>
struct scalar_type
{
using type = T;
};
template <class T, class A>
struct scalar_type<batch<T, A>>
{
using type = T;
};
template <class T>
using scalar_type_t = typename scalar_type<T>::type;
/**
* @ingroup batch_traits
*
* type trait whose @c type field is set to the @c batch_bool type associated
* with @c T if @c is_batch<T>::value, and to @c bool otherwise.
*
* @tparam T type to analyze.
*/
template <class T>
struct mask_type
{
using type = bool;
};
template <class T, class A>
struct mask_type<batch<T, A>>
{
using type = typename batch<T, A>::batch_bool_type;
};
template <class T>
using mask_type_t = typename mask_type<T>::type;
}
#endif
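// Compile-time sketch of the traits documented above (illustrative only;
// assumes <type_traits> and the xsimd headers are available):
static_assert(xsimd::is_batch<xsimd::batch<float>>::value, "batch<float> is a batch");
static_assert(!xsimd::is_batch<float>::value, "a scalar is not a batch");
static_assert(std::is_same<xsimd::scalar_type_t<xsimd::batch<int32_t>>, int32_t>::value,
              "scalar_type_t unwraps the batch");
static_assert(std::is_same<xsimd::mask_type_t<float>, bool>::value,
              "mask_type_t of a scalar is bool");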

third_party/xsimd/moz.yaml (vendored)

@ -10,8 +10,8 @@ origin:
url: https://github.com/QuantStack/xsimd
release: 75b043b8e031f1ada8053fe80d5ba635e2a75588 (2023-01-05T06:45:23Z).
revision: 75b043b8e031f1ada8053fe80d5ba635e2a75588
release: e8f209c3397c8a866be2312682689a04e4abfd66 (2023-02-27T06:32:46Z).
revision: e8f209c3397c8a866be2312682689a04e4abfd66
license: BSD-3-Clause