Bug 1815790 - Replace intgemm by gemmology r=yury

gemmology is a rewrite of intgemm based on xsimd (which we already
vendor), with a focus on the API we actually use.

It also supports sse2 and has a decent implementation for arm32 and
aarch64.

Differential Revision: https://phabricator.services.mozilla.com/D171265
serge-sans-paille 2023-03-28 13:22:27 +00:00
Parent b3fa955c10
Commit 194cb113aa
90 changed files: 2238 additions and 23699 deletions
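
At call sites the change is mechanical: direct calls into ::intgemm::Int8 / ::intgemm::Int8Shift become calls through a runtime-dispatch macro over gemmology::Engine<Arch>. A condensed before/after sketch, adapted from the IntegerGemmIntrinsic.cpp hunks below (SUPPORTED_ARCHS and GEMMOLOGY_DISPATCH are defined in that file):

```C++
// Before: intgemm selects its kernel internally.
::intgemm::Int8::PrepareB((const float*)inputMatrixBPtr, (int8_t*)outputMatrixBPtr,
                          (float)scale, rowsB, colsB);

// After: the best compiled-in architecture is selected at run time via xsimd.
GEMMOLOGY_DISPATCH(PrepareB)
((const float*)inputMatrixBPtr, (int8_t*)outputMatrixBPtr, (float)scale, rowsB, colsB);
```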


@ -3174,3 +3174,4 @@ set_config("SSE2_FLAGS", ["-msse2"])
set_config("SSSE3_FLAGS", ["-mssse3"])
set_config("SSE4_2_FLAGS", ["-msse4.2"])
set_config("FMA_FLAGS", ["-mfma"])
set_config("AVX2_FLAGS", ["-mavx2"])


@ -11,7 +11,7 @@
#include "mozilla/CheckedInt.h"
#include "mozilla/IntegerPrintfMacros.h"
#include <intgemm.h>
#include <gemmology_fwd.h>
#include "js/ErrorReport.h"
#include "js/HeapAPI.h"
@ -20,6 +20,26 @@
#include "wasm/WasmInstance.h"
#include "wasm/WasmLog.h"
#if defined(USE_AVX2)
# define SUPPORTED_ARCHS \
xsimd::arch_list<xsimd::avx2, xsimd::ssse3, xsimd::sse2>
#elif defined(USE_SSSE3)
# define SUPPORTED_ARCHS xsimd::arch_list<xsimd::ssse3, xsimd::sse2>
#elif defined(USE_SSE2)
# define SUPPORTED_ARCHS xsimd::arch_list<xsimd::sse2>
#else
# error no supported architecture
#endif
// Dispatch *at runtime* based on run-time hardware and compile-time
// architectures.
//
// FIXME: Ideally we would not run the dispatch code at each function call.
#define GEMMOLOGY_DISPATCH(FUNC) \
xsimd::dispatch<SUPPORTED_ARCHS>([](auto arch, auto... args) { \
return gemmology::Engine<decltype(arch)>::FUNC(args...); \
})
struct JSContext;
static constexpr uint32_t ARRAY_ALIGNMENT = 64;
@ -118,10 +138,10 @@ int32_t js::intgemm::IntrI8PrepareB(wasm::Instance* instance,
// Actual call to the 3rd party library (intgemm) for PrepareB
uint8_t* inputMatrixBPtr = &memBase[inputMatrixB];
uint8_t* outputMatrixBPtr = &memBase[outputMatrixB];
::intgemm::Int8::PrepareB((const float*)inputMatrixBPtr,
(int8_t*)outputMatrixBPtr,
(float)scale, // Quant Mult
rowsB, colsB);
GEMMOLOGY_DISPATCH(PrepareB)
((const float*)inputMatrixBPtr, (int8_t*)outputMatrixBPtr,
(float)scale, // Quant Mult
rowsB, colsB);
return 0;
}
@ -160,10 +180,10 @@ int32_t js::intgemm::IntrI8PrepareBFromTransposed(
// Actual call to the 3rd party library (intgemm) for PrepareBTransposed
uint8_t* inputMatrixBTransposedPtr = &memBase[inputMatrixBTransposed];
uint8_t* outputMatrixBPtr = &memBase[outputMatrixB];
::intgemm::Int8::PrepareBTransposed((const float*)inputMatrixBTransposedPtr,
(int8_t*)outputMatrixBPtr,
(float)scale, // Quant Mult
rowsB, colsB);
GEMMOLOGY_DISPATCH(PrepareBTransposed)
((const float*)inputMatrixBTransposedPtr, (int8_t*)outputMatrixBPtr,
(float)scale, // Quant Mult
rowsB, colsB);
return 0;
}
@ -202,9 +222,9 @@ int32_t js::intgemm::IntrI8PrepareBFromQuantizedTransposed(
uint8_t* inputMatrixBQuantizedTransposedPtr =
&memBase[inputMatrixBQuantizedTransposed];
uint8_t* outputMatrixBPtr = &memBase[outputMatrixB];
::intgemm::Int8::PrepareBQuantizedTransposed(
(const int8_t*)inputMatrixBQuantizedTransposedPtr,
(int8_t*)outputMatrixBPtr, rowsB, colsB);
GEMMOLOGY_DISPATCH(PrepareBQuantizedTransposed)
((const int8_t*)inputMatrixBQuantizedTransposedPtr, (int8_t*)outputMatrixBPtr,
rowsB, colsB);
return 0;
}
@ -243,9 +263,8 @@ int32_t js::intgemm::IntrI8PrepareA(wasm::Instance* instance,
// Actual call to the 3rd party library (intgemm)
uint8_t* inputMatrixAPtr = &memBase[inputMatrixA];
uint8_t* outputMatrixAPtr = &memBase[outputMatrixA];
::intgemm::Int8Shift::PrepareA((const float*)inputMatrixAPtr,
(int8_t*)outputMatrixAPtr, scale, rowsA,
colsA);
GEMMOLOGY_DISPATCH(Shift::PrepareA)
((const float*)inputMatrixAPtr, outputMatrixAPtr, scale, rowsA, colsA);
return 0;
}
@ -290,10 +309,10 @@ int32_t js::intgemm::IntrI8PrepareBias(
uint8_t* outputPtr = &memBase[output];
float unquantFactor =
(-1) * ((127.0f / scaleA) * (127.0f / scaleB)) / (127.0f);
::intgemm::Int8Shift::PrepareBias(
(const int8_t*)inputMatrixBPreparedPtr, rowsB, colsB,
::intgemm::callbacks::UnquantizeAndAddBiasAndWrite(
unquantFactor, (const float*)inputBiasPtr, (float*)outputPtr));
GEMMOLOGY_DISPATCH(Shift::PrepareBias)
((const int8_t*)inputMatrixBPreparedPtr, rowsB, colsB,
gemmology::callbacks::UnquantizeAndAddBiasAndWrite(
unquantFactor, (const float*)inputBiasPtr, (float*)outputPtr));
return 0;
}
@ -347,12 +366,12 @@ int32_t js::intgemm::IntrI8MultiplyAndAddBias(
uint8_t* inputBiasPreparedPtr = &memBase[inputBiasPrepared];
uint8_t* outputPtr = &memBase[output];
float unquantFactor = unquantMultiplier / (scaleA * scaleB);
::intgemm::Int8Shift::Multiply(
(const int8_t*)inputMatrixAPreparedPtr,
(const int8_t*)inputMatrixBPreparedPtr, rowsA, width, colsB,
::intgemm::callbacks::UnquantizeAndAddBiasAndWrite(
unquantFactor, (const float*)inputBiasPreparedPtr,
(float*)outputPtr));
GEMMOLOGY_DISPATCH(Shift::Multiply)
(inputMatrixAPreparedPtr, (const int8_t*)inputMatrixBPreparedPtr, rowsA,
width, colsB,
gemmology::callbacks::UnquantizeAndAddBiasAndWrite(
unquantFactor, (const float*)inputBiasPreparedPtr, (float*)outputPtr));
return 0;
}
@ -401,9 +420,12 @@ int32_t js::intgemm::IntrI8SelectColumnsOfB(wasm::Instance* instance,
uint8_t* inputMatrixBPreparedPtr = &memBase[inputMatrixBPrepared];
uint8_t* colIndexListPtr = &memBase[colIndexList];
uint8_t* outputPtr = &memBase[output];
::intgemm::Int8::SelectColumnsB(
(const int8_t*)inputMatrixBPreparedPtr, (int8_t*)outputPtr, rowsB,
(const uint32_t*)colIndexListPtr,
(const uint32_t*)colIndexListPtr + sizeColIndexList);
GEMMOLOGY_DISPATCH(SelectColumnsB)
((const int8_t*)inputMatrixBPreparedPtr, (int8_t*)outputPtr, rowsB,
(const uint32_t*)colIndexListPtr,
(const uint32_t*)colIndexListPtr + sizeColIndexList);
return 0;
}
#undef GEMMOLOGY_DISPATCH
#undef SUPPORTED_ARCHS
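
For reference, GEMMOLOGY_DISPATCH leans on xsimd's generic dispatcher: xsimd::dispatch<ArchList>(f) probes the CPU at run time and invokes f with the best matching architecture tag from the compile-time list, which is also why the FIXME above notes that the dispatch currently runs on every call. A minimal, self-contained sketch of the same pattern (the QuantizeDispatch function and its arguments are illustrative, not part of this commit):

```C++
#include <cstddef>
#include <cstdint>

#include <gemmology_fwd.h>
#include <xsimd/xsimd.hpp>

// Architectures compiled into the binary; the dispatcher picks the best one
// that the running CPU supports.
using Archs = xsimd::arch_list<xsimd::avx2, xsimd::ssse3, xsimd::sse2>;

void QuantizeDispatch(const float* in, int8_t* out, float quant_mult, size_t n) {
  // The lambda receives the selected architecture as its first argument,
  // mirroring the GEMMOLOGY_DISPATCH macro above.
  xsimd::dispatch<Archs>([](auto arch, const float* in, int8_t* out,
                            float quant_mult, size_t n) {
    gemmology::Engine<decltype(arch)>::Quantize(in, out, quant_mult, n);
  })(in, out, quant_mult, n);
}
```

Because only gemmology_fwd.h is included here, the per-architecture definitions must be linked in separately; in this commit they come from the explicit instantiations in third_party/gemmology/kernels/*.cpp.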


@ -1,18 +1,18 @@
This directory contains build files for the intgemm reference implementation.
The actual library source is in $TOPSRCDIR/third_party/intgemm/
This directory contains build files for the gemmology reference implementation.
The actual library source is in $TOPSRCDIR/third_party/gemmology/
Any patches or additional configuration to be applied to the
upstream source should be kept in $TOPSRCDIR/third_party/intgemm/.
upstream source should be kept in $TOPSRCDIR/third_party/gemmology/.
To update the library source and build config files, execute
./mach vendor js/src/intgemm/moz.yaml
./mach vendor third_party/gemmology/moz.yaml
To update to a specific upstream git tag or commit, use
./mach vendor js/src/intgemm/moz.yaml -r <commit>
./mach vendor third_party/gemmology/moz.yaml -r <commit>
The upstream git repository is https://github.com/kpu/intgemm
The upstream git repository is https://github.com/serge-sans-paille/gemmology
To view the information about the current version, check the
'origin' section of moz.yaml.
'origin' section of moz.yaml.


@ -1,24 +0,0 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
def main(output, intgemm_config):
    with open(intgemm_config, "r") as f:
        config = f.read()
    # Enable intel AVX2 hardware extension specific code to allow using AVX2 at run time
    # if target cpu supports it
    config = config.replace(
        "#cmakedefine INTGEMM_COMPILER_SUPPORTS_AVX2",
        "#define INTGEMM_COMPILER_SUPPORTS_AVX2",
    )
    # Disable more advanced intel hardware extensions for now because base-toolchain compiler
    # versions aren't able to compile them
    config = config.replace("#cmakedefine", "#undef")
    output.write(config)
    output.close()
    return 0


@ -15,19 +15,32 @@ with Files("*"):
LOCAL_INCLUDES += [
    "!..",
    "..",
    "/third_party/intgemm/intgemm",
    "/third_party/gemmology",
    "/third_party/xsimd/include",
]

if CONFIG["INTEL_ARCHITECTURE"]:
    DEFINES["USE_SSE2"] = True
    SOURCES += ["/third_party/gemmology/kernels/GemmologyEngineSSE2.cpp"]
    SOURCES["/third_party/gemmology/kernels/GemmologyEngineSSE2.cpp"].flags += CONFIG[
        "SSE2_FLAGS"
    ]

    if CONFIG["SSSE3_FLAGS"]:
        DEFINES["USE_SSSE3"] = True
        SOURCES += ["/third_party/gemmology/kernels/GemmologyEngineSSSE3.cpp"]
        SOURCES[
            "/third_party/gemmology/kernels/GemmologyEngineSSSE3.cpp"
        ].flags += CONFIG["SSSE3_FLAGS"]

    if CONFIG["AVX2_FLAGS"]:
        DEFINES["USE_AVX2"] = True
        SOURCES += ["/third_party/gemmology/kernels/GemmologyEngineAVX2.cpp"]
        SOURCES[
            "/third_party/gemmology/kernels/GemmologyEngineAVX2.cpp"
        ].flags += CONFIG["AVX2_FLAGS"]

SOURCES += [
    "/third_party/intgemm/intgemm/intgemm.cc",
    "IntegerGemmIntrinsic.cpp",
]

GeneratedFile(
    "intgemm/intgemm_config.h",
    script="enable_intel_extensions.py",
    inputs=["/third_party/intgemm/intgemm/intgemm_config.h.in"],
)
# We allow warnings for third-party code that can be updated from upstream.
AllowCompilerWarnings()


@ -1,47 +0,0 @@
# Version of this schema
schema: 1

bugzilla:
  # Bugzilla product and component for this directory and subdirectories
  product: Core
  component: "JavaScript: WebAssembly"

# Document the source of externally hosted code
origin:
  # Short name of the package/library
  name: intgemm
  description: integer matrix multiplication
  # Full URL for the package's homepage/etc
  # Usually different from repository url
  url: https://github.com/kpu/intgemm
  # Human-readable identifier for this version/release
  # Generally "version NNN", "tag SSS", "bookmark SSS"
  release: commit fc3a614351ce6e667197307d97f45db5265c96af (2022-02-09T14:56:05Z).
  # Revision to pull in
  # Must be a long or short commit SHA (long preferred)
  revision: fc3a614351ce6e667197307d97f45db5265c96af
  # The package's license, where possible using the mnemonic from
  # https://spdx.org/licenses/
  # Multiple licenses can be specified (as a YAML list)
  # A "LICENSE" file must exist containing the full license text
  license: MIT

vendoring:
  url: https://github.com/kpu/intgemm
  source-hosting: github
  vendor-directory: third_party/intgemm
  exclude:
    - build/.gitattributes
    - build/.gitignore
  update-actions:
    - action: delete-path
      path: '{yaml_dir}/config'


@ -179,7 +179,8 @@ rsync_filter_list = """
- /third_party/python/gyp
+ /third_party/python/**
+ /third_party/rust/**
+ /third_party/intgemm/**
+ /third_party/gemmology/**
+ /third_party/xsimd/**
+ /layout/tools/reftest/reftest/**
+ /testing/mach_commands.py

third_party/gemmology/LICENSE (vendored, new file, 22 lines)

@ -0,0 +1,22 @@
MIT License
Copyright (c) 2023 Serge Guelton
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--
The original 8-bit code came from:
MIT License
Copyright (c) 2017--2019 University of Edinburgh, Nikolay Bogoychev, Mateusz Chudyk, Kenneth Heafield, and Microsoft Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

third_party/gemmology/gemmology.h (vendored, new file, 1335 lines)

Diff not shown because of its large size.

third_party/gemmology/gemmology_fwd.h (vendored, new file, 218 lines)

@ -0,0 +1,218 @@
/***************************************************************
* _ *
* | | *
* __ _ ___ _ __ ___ _ __ ___ ___ | | ___ __ _ _ _ *
* / _` |/ _ \ '_ ` _ \| '_ ` _ \ / _ \| |/ _ \ / _` | | | | *
* | (_| | __/ | | | | | | | | | | (_) | | (_) | (_| | |_| | *
* \__, |\___|_| |_| |_|_| |_| |_|\___/|_|\___/ \__, |\__, | *
* __/ | __/ | __/ | *
* |___/ |___/ |___/ *
* *
* version 0.1 *
***************************************************************/
#ifndef GEMMOLOGY_FWD_H
#define GEMMOLOGY_FWD_H
#include <cstdint>
#include <cstring>
#include <tuple>
#include <xsimd/xsimd.hpp>
namespace gemmology {
namespace callbacks {
struct Unquantize {
float unquant_mult;
template <class Arch>
xsimd::batch<float, Arch> operator()(xsimd::batch<int32_t, Arch> total, size_t, size_t, size_t);
template <class Arch>
std::tuple<xsimd::batch<float, Arch>, xsimd::batch<float, Arch>> operator()(
std::tuple<xsimd::batch<int32_t, Arch>, xsimd::batch<int32_t, Arch>>
total,
size_t, size_t, size_t);
};
struct AddBias {
const float *bias_addr;
template <class Arch>
xsimd::batch<float, Arch> operator()(xsimd::batch<float, Arch> total, size_t, size_t col_idx,
size_t);
template <class Arch>
std::tuple<xsimd::batch<float, Arch>, xsimd::batch<float, Arch>>
operator()(
std::tuple<xsimd::batch<float, Arch>, xsimd::batch<float, Arch>> total,
size_t, size_t col_idx, size_t);
};
struct Write {
float *output_addr;
Write(float *o) : output_addr(o) {}
template <class Arch>
void operator()(xsimd::batch<float, Arch> result, size_t row_idx,
size_t col_idx, size_t col_size);
template <class Arch>
void operator()(xsimd::batch<int32_t, Arch> result, size_t row_idx,
size_t col_idx, size_t col_size);
template <class Arch>
void operator()(
std::tuple<xsimd::batch<float, Arch>, xsimd::batch<float, Arch>> result,
size_t row_idx, size_t col_idx, size_t col_size);
template <class Arch>
void operator()(
std::tuple<xsimd::batch<int32_t, Arch>, xsimd::batch<int32_t, Arch>>
result,
size_t row_idx, size_t col_idx, size_t col_size);
};
struct UnquantizeAndWrite {
Unquantize unquantize;
Write write;
UnquantizeAndWrite(float factor, float *output)
: unquantize{factor}, write{output} {}
template <class T>
void operator()(T const &total, size_t row_idx, size_t col_idx,
size_t col_size);
};
struct UnquantizeAndAddBiasAndWrite {
Unquantize unquantize;
AddBias add_bias;
Write write;
UnquantizeAndAddBiasAndWrite(float factor, const float *bias, float *output)
: unquantize{factor}, add_bias{bias}, write{output} {}
template <class T>
void operator()(T const &total, size_t row_idx, size_t col_idx,
size_t col_size);
};
} // namespace callbacks
//
// Arch-specific implementation of each routine
//
template <class Arch> struct Engine {
static void QuantizeU(const float *input, uint8_t *output, float quant_mult,
size_t size);
static void Quantize(const float *const input, int8_t *const output,
float quant_mult, size_t size);
template <typename IntegerTy>
static void SelectColumnsB(const int8_t *input, int8_t *output, size_t rows,
const IntegerTy *cols_begin,
const IntegerTy *cols_end);
static void PrepareBTransposed(const float *input, int8_t *output,
float quant_mult, size_t cols, size_t rows);
static void PrepareBQuantizedTransposed(const int8_t *input, int8_t *output,
size_t cols, size_t rows);
static void PrepareB(const float *input, int8_t *output_shadow,
float quant_mult, size_t rows, size_t cols);
static void PrepareA(const float *input, int8_t *output, float quant_mult,
size_t rows, size_t cols);
struct Shift {
static void PrepareA(const float *input, uint8_t *output, float quant_mult,
size_t rows, size_t cols);
template <class Callback>
static void Multiply(const uint8_t *A, const int8_t *B, size_t A_rows,
size_t width, size_t B_cols, Callback callback);
template <class Callback>
static void PrepareBias(const int8_t *B, size_t width, size_t B_cols,
Callback C);
};
};
//
// Top-level wrappers that mostly match intgemm API
//
template <class Arch = xsimd::default_arch>
inline void QuantizeU(const float *input, uint8_t *output, float quant_mult,
size_t size) {
return Engine<Arch>::QuantizeU(input, output, quant_mult, size);
}
template <class Arch = xsimd::default_arch>
inline void Quantize(const float *const input, int8_t *const output,
float quant_mult, size_t size) {
return Engine<Arch>::Quantize(input, output, quant_mult, size);
}
template <class Arch = xsimd::default_arch, typename IntegerTy>
inline void SelectColumnsB(const int8_t *input, int8_t *output, size_t rows,
const IntegerTy *cols_begin,
const IntegerTy *cols_end) {
return Engine<Arch>::SelectColumnsB(input, output, rows, cols_begin,
cols_end);
}
template <class Arch = xsimd::default_arch>
inline void PrepareBTransposed(const float *input, int8_t *output,
float quant_mult, size_t cols, size_t rows) {
return Engine<Arch>::PrepareBTransposed(input, output, quant_mult, cols,
rows);
}
template <class Arch = xsimd::default_arch>
inline void PrepareBQuantizedTransposed(const int8_t *input, int8_t *output,
size_t cols, size_t rows) {
return Engine<Arch>::PrepareBQuantizedTransposed(input, output, cols, rows);
}
template <class Arch = xsimd::default_arch>
inline void PrepareB(const float *input, int8_t *output_shadow,
float quant_mult, size_t rows, size_t cols) {
return Engine<Arch>::PrepareB(input, output_shadow, quant_mult, rows, cols);
}
template <class Arch = xsimd::default_arch>
inline void PrepareA(const float *input, int8_t *output, float quant_mult,
size_t rows, size_t cols) {
return Engine<Arch>::PrepareA(input, output, quant_mult, rows, cols);
}
namespace Shift {
template <class Arch = xsimd::default_arch>
inline void PrepareA(const float *input, uint8_t *output, float quant_mult,
size_t rows, size_t cols) {
return Engine<Arch>::Shift::PrepareA(input, output, quant_mult, rows, cols);
}
template <class Arch = xsimd::default_arch, class Callback>
inline void Multiply(const uint8_t *A, const int8_t *B, size_t A_rows,
size_t width, size_t B_cols, Callback C) {
return Engine<Arch>::Shift::Multiply(A, B, A_rows, width, B_cols, C);
}
template <class Arch = xsimd::default_arch, class Callback>
inline void PrepareBias(const int8_t *B, size_t width, size_t B_cols,
Callback C) {
return Engine<Arch>::Shift::PrepareBias(B, width, B_cols, C);
}
} // namespace Shift
} // namespace gemmology
#endif
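
Taken together, these declarations expose essentially the workflow the wasm intrinsics above use: quantize A with the shifted uint8 path, quantize B, fold the shift correction into the bias once, then multiply with a callback that unquantizes, adds the bias, and writes float output. A hedged end-to-end sketch against this header (the TinyGemm name, the sizes, and the plain std::vector buffers are illustrative only; real callers, like IntegerGemmIntrinsic.cpp, use 64-byte-aligned buffers and dimensions that satisfy gemmology's granularity requirements):

```C++
#include <cstddef>
#include <cstdint>
#include <vector>

#include <gemmology.h>  // full definitions; gemmology_fwd.h only declares

void TinyGemm(const float* A, const float* B, const float* bias, float* C,
              size_t A_rows, size_t width, size_t B_cols) {
  const float alpha = 2.0f;                     // assumed dynamic range of the inputs
  const float quant_mult = 127.0f / alpha;

  std::vector<uint8_t> A_prep(A_rows * width);  // shifted path: A becomes uint8_t
  std::vector<int8_t> B_prep(width * B_cols);   // B becomes int8_t
  std::vector<float> bias_prep(B_cols);

  gemmology::Shift::PrepareA(A, A_prep.data(), quant_mult, A_rows, width);
  gemmology::PrepareB(B, B_prep.data(), quant_mult, width, B_cols);

  // Fold the +127 shift correction into the bias, mirroring the unquantFactor
  // computed in IntrI8PrepareBias above.
  const float unquant_prep = -(alpha * alpha) / 127.0f;
  gemmology::Shift::PrepareBias(
      B_prep.data(), width, B_cols,
      gemmology::callbacks::UnquantizeAndAddBiasAndWrite(unquant_prep, bias,
                                                         bias_prep.data()));

  // Multiply, unquantize, add the prepared bias, and write the float result.
  const float unquant_mult = 1.0f / (quant_mult * quant_mult);
  gemmology::Shift::Multiply(
      A_prep.data(), B_prep.data(), A_rows, width, B_cols,
      gemmology::callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult,
                                                         bias_prep.data(), C));
}
```

Note that these top-level wrappers default to xsimd::default_arch, a compile-time choice; the wasm intrinsics instead dispatch at run time through GEMMOLOGY_DISPATCH.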

third_party/gemmology/kernels/GemmologyEngineAVX2.cpp (vendored, new file, 19 lines)

@ -0,0 +1,19 @@
/* -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* this source code form is subject to the terms of the mozilla public
* license, v. 2.0. if a copy of the mpl was not distributed with this file,
* You can obtain one at http://mozilla.org/MPL/2.0/. */
#include <gemmology.h>
namespace gemmology {
template struct Engine<xsimd::avx2>;
template void Engine<xsimd::avx2>::SelectColumnsB(int8_t const*, int8_t*,
size_t, uint32_t const*,
uint32_t const*);
template void Engine<xsimd::avx2>::Shift::Multiply(
uint8_t const*, int8_t const*, size_t, size_t, size_t,
gemmology::callbacks::UnquantizeAndAddBiasAndWrite);
template void Engine<xsimd::avx2>::Shift::PrepareBias(
int8_t const*, size_t, size_t,
gemmology::callbacks::UnquantizeAndAddBiasAndWrite);
} // namespace gemmology

third_party/gemmology/kernels/GemmologyEngineSSE2.cpp (vendored, new file, 19 lines)

@ -0,0 +1,19 @@
/* -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* this source code form is subject to the terms of the mozilla public
* license, v. 2.0. if a copy of the mpl was not distributed with this file,
* You can obtain one at http://mozilla.org/MPL/2.0/. */
#include <gemmology.h>
namespace gemmology {
template struct Engine<xsimd::sse2>;
template void Engine<xsimd::sse2>::SelectColumnsB(int8_t const*, int8_t*,
size_t, uint32_t const*,
uint32_t const*);
template void Engine<xsimd::sse2>::Shift::Multiply(
uint8_t const*, int8_t const*, size_t, size_t, size_t,
gemmology::callbacks::UnquantizeAndAddBiasAndWrite);
template void Engine<xsimd::sse2>::Shift::PrepareBias(
int8_t const*, size_t, size_t,
gemmology::callbacks::UnquantizeAndAddBiasAndWrite);
} // namespace gemmology

third_party/gemmology/kernels/GemmologyEngineSSSE3.cpp (vendored, new file, 19 lines)

@ -0,0 +1,19 @@
/* -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* this source code form is subject to the terms of the mozilla public
* license, v. 2.0. if a copy of the mpl was not distributed with this file,
* You can obtain one at http://mozilla.org/MPL/2.0/. */
#include <gemmology.h>
namespace gemmology {
template struct Engine<xsimd::ssse3>;
template void Engine<xsimd::ssse3>::SelectColumnsB(int8_t const*, int8_t*,
size_t, uint32_t const*,
uint32_t const*);
template void Engine<xsimd::ssse3>::Shift::Multiply(
uint8_t const*, int8_t const*, size_t, size_t, size_t,
gemmology::callbacks::UnquantizeAndAddBiasAndWrite);
template void Engine<xsimd::ssse3>::Shift::PrepareBias(
int8_t const*, size_t, size_t,
gemmology::callbacks::UnquantizeAndAddBiasAndWrite);
} // namespace gemmology
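
The three kernel translation units above all follow the same pattern: include the full gemmology.h, explicitly instantiate Engine<Arch> for one xsimd architecture, and additionally instantiate the member templates (SelectColumnsB and the Shift routines) for the one callback type the intrinsics use. Since each file is compiled only with that architecture's flags (see the moz.build hunk earlier), AVX2 code never leaks into the SSE2 object file. As a hedged illustration of how another architecture would slot in, a hypothetical aarch64 kernel, assuming xsimd's xsimd::neon64 tag and not part of this commit, would look like:

```C++
/* Hypothetical third_party/gemmology/kernels/GemmologyEngineNEON64.cpp,
 * following the same shape as the SSE2/SSSE3/AVX2 kernels above. */
#include <gemmology.h>

namespace gemmology {

template struct Engine<xsimd::neon64>;

template void Engine<xsimd::neon64>::SelectColumnsB(int8_t const*, int8_t*,
                                                    size_t, uint32_t const*,
                                                    uint32_t const*);

template void Engine<xsimd::neon64>::Shift::Multiply(
    uint8_t const*, int8_t const*, size_t, size_t, size_t,
    gemmology::callbacks::UnquantizeAndAddBiasAndWrite);

template void Engine<xsimd::neon64>::Shift::PrepareBias(
    int8_t const*, size_t, size_t,
    gemmology::callbacks::UnquantizeAndAddBiasAndWrite);

}  // namespace gemmology
```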

third_party/gemmology/moz.yaml (vendored, new file, 29 lines)

@ -0,0 +1,29 @@
schema: 1

bugzilla:
  product: Core
  component: "JavaScript: WebAssembly"

origin:
  name: gemmology
  description: small integer matrix multiply
  url: https://github.com/serge-sans-paille/gemmology
  release: e1167c52cbbfd989390e4d9515c84c88878bfe80 (2023-03-28T11:32:43Z).
  revision: e1167c52cbbfd989390e4d9515c84c88878bfe80
  license: MIT

vendoring:
  url: https://github.com/serge-sans-paille/gemmology
  source-hosting: github
  tracking: commit
  exclude:
    - ".*"
    - "*.rst"
    - test
  keep:
    - kernels/*.cpp

third_party/intgemm/CMake/Catch.cmake (vendored, 175 lines removed)

@ -1,175 +0,0 @@
# Distributed under the OSI-approved BSD 3-Clause License. See accompanying
# file Copyright.txt or https://cmake.org/licensing for details.
#[=======================================================================[.rst:
Catch
-----
This module defines a function to help use the Catch test framework.
The :command:`catch_discover_tests` discovers tests by asking the compiled test
executable to enumerate its tests. This does not require CMake to be re-run
when tests change. However, it may not work in a cross-compiling environment,
and setting test properties is less convenient.
This command is intended to replace use of :command:`add_test` to register
tests, and will create a separate CTest test for each Catch test case. Note
that this is in some cases less efficient, as common set-up and tear-down logic
cannot be shared by multiple test cases executing in the same instance.
However, it provides more fine-grained pass/fail information to CTest, which is
usually considered as more beneficial. By default, the CTest test name is the
same as the Catch name; see also ``TEST_PREFIX`` and ``TEST_SUFFIX``.
.. command:: catch_discover_tests
Automatically add tests with CTest by querying the compiled test executable
for available tests::
catch_discover_tests(target
[TEST_SPEC arg1...]
[EXTRA_ARGS arg1...]
[WORKING_DIRECTORY dir]
[TEST_PREFIX prefix]
[TEST_SUFFIX suffix]
[PROPERTIES name1 value1...]
[TEST_LIST var]
)
``catch_discover_tests`` sets up a post-build command on the test executable
that generates the list of tests by parsing the output from running the test
with the ``--list-test-names-only`` argument. This ensures that the full
list of tests is obtained. Since test discovery occurs at build time, it is
not necessary to re-run CMake when the list of tests changes.
However, it requires that :prop_tgt:`CROSSCOMPILING_EMULATOR` is properly set
in order to function in a cross-compiling environment.
Additionally, setting properties on tests is somewhat less convenient, since
the tests are not available at CMake time. Additional test properties may be
assigned to the set of tests as a whole using the ``PROPERTIES`` option. If
more fine-grained test control is needed, custom content may be provided
through an external CTest script using the :prop_dir:`TEST_INCLUDE_FILES`
directory property. The set of discovered tests is made accessible to such a
script via the ``<target>_TESTS`` variable.
The options are:
``target``
Specifies the Catch executable, which must be a known CMake executable
target. CMake will substitute the location of the built executable when
running the test.
``TEST_SPEC arg1...``
Specifies test cases, wildcarded test cases, tags and tag expressions to
pass to the Catch executable with the ``--list-test-names-only`` argument.
``EXTRA_ARGS arg1...``
Any extra arguments to pass on the command line to each test case.
``WORKING_DIRECTORY dir``
Specifies the directory in which to run the discovered test cases. If this
option is not provided, the current binary directory is used.
``TEST_PREFIX prefix``
Specifies a ``prefix`` to be prepended to the name of each discovered test
case. This can be useful when the same test executable is being used in
multiple calls to ``catch_discover_tests()`` but with different
``TEST_SPEC`` or ``EXTRA_ARGS``.
``TEST_SUFFIX suffix``
Similar to ``TEST_PREFIX`` except the ``suffix`` is appended to the name of
every discovered test case. Both ``TEST_PREFIX`` and ``TEST_SUFFIX`` may
be specified.
``PROPERTIES name1 value1...``
Specifies additional properties to be set on all tests discovered by this
invocation of ``catch_discover_tests``.
``TEST_LIST var``
Make the list of tests available in the variable ``var``, rather than the
default ``<target>_TESTS``. This can be useful when the same test
executable is being used in multiple calls to ``catch_discover_tests()``.
Note that this variable is only available in CTest.
#]=======================================================================]
#------------------------------------------------------------------------------
function(catch_discover_tests TARGET)
cmake_parse_arguments(
""
""
"TEST_PREFIX;TEST_SUFFIX;WORKING_DIRECTORY;TEST_LIST"
"TEST_SPEC;EXTRA_ARGS;PROPERTIES"
${ARGN}
)
if(NOT _WORKING_DIRECTORY)
set(_WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}")
endif()
if(NOT _TEST_LIST)
set(_TEST_LIST ${TARGET}_TESTS)
endif()
## Generate a unique name based on the extra arguments
string(SHA1 args_hash "${_TEST_SPEC} ${_EXTRA_ARGS}")
string(SUBSTRING ${args_hash} 0 7 args_hash)
# Define rule to generate test list for aforementioned test executable
set(ctest_include_file "${CMAKE_CURRENT_BINARY_DIR}/${TARGET}_include-${args_hash}.cmake")
set(ctest_tests_file "${CMAKE_CURRENT_BINARY_DIR}/${TARGET}_tests-${args_hash}.cmake")
get_property(crosscompiling_emulator
TARGET ${TARGET}
PROPERTY CROSSCOMPILING_EMULATOR
)
add_custom_command(
TARGET ${TARGET} POST_BUILD
BYPRODUCTS "${ctest_tests_file}"
COMMAND "${CMAKE_COMMAND}"
-D "TEST_TARGET=${TARGET}"
-D "TEST_EXECUTABLE=$<TARGET_FILE:${TARGET}>"
-D "TEST_EXECUTOR=${crosscompiling_emulator}"
-D "TEST_WORKING_DIR=${_WORKING_DIRECTORY}"
-D "TEST_SPEC=${_TEST_SPEC}"
-D "TEST_EXTRA_ARGS=${_EXTRA_ARGS}"
-D "TEST_PROPERTIES=${_PROPERTIES}"
-D "TEST_PREFIX=${_TEST_PREFIX}"
-D "TEST_SUFFIX=${_TEST_SUFFIX}"
-D "TEST_LIST=${_TEST_LIST}"
-D "CTEST_FILE=${ctest_tests_file}"
-P "${_CATCH_DISCOVER_TESTS_SCRIPT}"
VERBATIM
)
file(WRITE "${ctest_include_file}"
"if(EXISTS \"${ctest_tests_file}\")\n"
" include(\"${ctest_tests_file}\")\n"
"else()\n"
" add_test(${TARGET}_NOT_BUILT-${args_hash} ${TARGET}_NOT_BUILT-${args_hash})\n"
"endif()\n"
)
if(NOT ${CMAKE_VERSION} VERSION_LESS "3.10.0")
# Add discovered tests to directory TEST_INCLUDE_FILES
set_property(DIRECTORY
APPEND PROPERTY TEST_INCLUDE_FILES "${ctest_include_file}"
)
else()
# Add discovered tests as directory TEST_INCLUDE_FILE if possible
get_property(test_include_file_set DIRECTORY PROPERTY TEST_INCLUDE_FILE SET)
if (NOT ${test_include_file_set})
set_property(DIRECTORY
PROPERTY TEST_INCLUDE_FILE "${ctest_include_file}"
)
else()
message(FATAL_ERROR
"Cannot set more than one TEST_INCLUDE_FILE"
)
endif()
endif()
endfunction()
###############################################################################
set(_CATCH_DISCOVER_TESTS_SCRIPT
${CMAKE_CURRENT_LIST_DIR}/CatchAddTests.cmake
)

third_party/intgemm/CMake/CatchAddTests.cmake (vendored, 78 lines removed)

@ -1,78 +0,0 @@
# Distributed under the OSI-approved BSD 3-Clause License. See accompanying
# file Copyright.txt or https://cmake.org/licensing for details.
set(prefix "${TEST_PREFIX}")
set(suffix "${TEST_SUFFIX}")
set(spec ${TEST_SPEC})
set(extra_args ${TEST_EXTRA_ARGS})
set(properties ${TEST_PROPERTIES})
set(script)
set(suite)
set(tests)
function(add_command NAME)
set(_args "")
foreach(_arg ${ARGN})
if(_arg MATCHES "[^-./:a-zA-Z0-9_]")
set(_args "${_args} [==[${_arg}]==]") # form a bracket_argument
else()
set(_args "${_args} ${_arg}")
endif()
endforeach()
set(script "${script}${NAME}(${_args})\n" PARENT_SCOPE)
endfunction()
# Run test executable to get list of available tests
if(NOT EXISTS "${TEST_EXECUTABLE}")
message(FATAL_ERROR
"Specified test executable '${TEST_EXECUTABLE}' does not exist"
)
endif()
execute_process(
COMMAND ${TEST_EXECUTOR} "${TEST_EXECUTABLE}" ${spec} --list-test-names-only
OUTPUT_VARIABLE output
RESULT_VARIABLE result
)
# Catch --list-test-names-only reports the number of tests, so 0 is... surprising
if(${result} EQUAL 0)
message(WARNING
"Test executable '${TEST_EXECUTABLE}' contains no tests!\n"
)
elseif(${result} LESS 0)
message(FATAL_ERROR
"Error running test executable '${TEST_EXECUTABLE}':\n"
" Result: ${result}\n"
" Output: ${output}\n"
)
endif()
string(REPLACE "\n" ";" output "${output}")
# Parse output
foreach(line ${output})
set(test ${line})
# use escape commas to handle properly test cases with commans inside the name
string(REPLACE "," "\\," test_name ${test})
# ...and add to script
add_command(add_test
"${prefix}${test}${suffix}"
${TEST_EXECUTOR}
"${TEST_EXECUTABLE}"
"${test_name}"
${extra_args}
)
add_command(set_tests_properties
"${prefix}${test}${suffix}"
PROPERTIES
WORKING_DIRECTORY "${TEST_WORKING_DIR}"
${properties}
)
list(APPEND tests "${prefix}${test}${suffix}")
endforeach()
# Create a list of all discovered tests, which users may use to e.g. set
# properties on the tests
add_command(set ${TEST_LIST} ${tests})
# Write CTest script
file(WRITE "${CTEST_FILE}" "${script}")

third_party/intgemm/CMakeLists.txt (vendored, 136 lines removed)

@ -1,136 +0,0 @@
cmake_minimum_required(VERSION 3.5)
project(intgemm)
string(ASCII 27 Esc)
set(Orange "${Esc}[33m")
set(ColourReset "${Esc}[m")
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif()
set(CMAKE_CXX_STANDARD 11)
if(MSVC)
add_compile_options(/W4 /WX)
else()
add_compile_options(-Wall -Wextra -pedantic -Werror -Wno-unknown-pragmas)
if (COMPILE_WASM)
# Disabling Pthreads + memory growth warning to be an error for WASM
# Pthreads + memory growth causes JS accessing the wasm memory to be slow
# https://github.com/WebAssembly/design/issues/1271
add_compile_options(-Wno-error=pthreads-mem-growth)
endif()
endif()
# Check if compiler supports AVX2 (this should only catch emscripten)
try_compile(INTGEMM_COMPILER_SUPPORTS_AVX2
${CMAKE_CURRENT_BINARY_DIR}/compile_tests
${CMAKE_CURRENT_SOURCE_DIR}/compile_test/avx2.cc)
# Check if compiler supports AVX512BW
try_compile(INTGEMM_COMPILER_SUPPORTS_AVX512BW
${CMAKE_CURRENT_BINARY_DIR}/compile_tests
${CMAKE_CURRENT_SOURCE_DIR}/compile_test/avx512bw.cc)
# Check if the compiler supports AVX512VNNI
try_compile(INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
${CMAKE_CURRENT_BINARY_DIR}/compile_tests
${CMAKE_CURRENT_SOURCE_DIR}/compile_test/avx512vnni.cc)
if (NOT INTGEMM_COMPILER_SUPPORTS_AVX2 OR NOT INTGEMM_COMPILER_SUPPORTS_AVX512BW OR NOT INTGEMM_COMPILER_SUPPORTS_AVX512VNNI)
set(UNSUPPORTED "Your compiler is too old to support")
if (NOT INTGEMM_COMPILER_SUPPORTS_AVX2)
set(UNSUPPORTED "${UNSUPPORTED} AVX2")
endif()
if (NOT INTGEMM_COMPILER_SUPPORTS_AVX512BW)
set(UNSUPPORTED "${UNSUPPORTED} AVX512BW")
endif()
if (NOT INTGEMM_COMPILER_SUPPORTS_AVX512VNNI)
set(UNSUPPORTED "${UNSUPPORTED} AVX512VNNI")
endif()
message(WARNING "${Orange}${UNSUPPORTED}. Multiplication will be slower on CPUs that support these instructions. For details rerun cmake with --debug-trycompile then try to build in compile_tests/CMakeFiles/CMakeTmp.${ColourReset}")
endif()
add_library(intgemm STATIC intgemm/intgemm.cc)
# Generate configure file
configure_file(intgemm/intgemm_config.h.in intgemm/intgemm_config.h)
#Ensure it is included by users.
include_directories(${CMAKE_CURRENT_BINARY_DIR})
target_include_directories(intgemm PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
# This isn't necessary since intgemm uses entirely relative paths but source code depending on it may want to #include <intgemm/intgemm.h>
target_include_directories(intgemm INTERFACE .)
option(USE_OPENMP "Use OpenMP" OFF)
if (USE_OPENMP)
message(STATUS "Compiling with OpenMP")
find_package(OpenMP)
if (NOT ${OpenMP_CXX_FOUND})
message(SEND_ERROR "OpenMP requested but C++ support not found")
endif()
add_compile_options(${OpenMP_CXX_FLAGS})
target_link_libraries(intgemm PUBLIC OpenMP::OpenMP_CXX)
endif()
if (COMPILE_WASM)
# A compile defintion to compile intgemm on WASM platform
target_compile_definitions(intgemm PUBLIC WASM)
endif()
option(WORMHOLE "Use WASM wormhole https://bugzilla.mozilla.org/show_bug.cgi?id=1672160" OFF)
if (WORMHOLE)
target_compile_definitions(intgemm PUBLIC INTGEMM_WORMHOLE)
endif()
option(INTGEMM_CPUID_ENVIRONMENT "Allow INTGEMM_CPUID environment variable to downgrade CPU model, which is mainly for testing." ON)
if (INTGEMM_CPUID_ENVIRONMENT)
target_compile_definitions(intgemm PRIVATE INTGEMM_CPUID_ENVIRONMENT)
endif()
if(INTGEMM_DONT_BUILD_TESTS)
return()
endif()
foreach(exe benchmark biasmultiply benchmark_quantizer)
add_executable(${exe} benchmarks/${exe}.cc)
target_link_libraries(${exe} intgemm)
endforeach()
add_executable(example example.cc)
target_link_libraries(example intgemm)
add_executable(tests
test/test.cc
# General tests
test/add127_test.cc
test/multiply_test.cc
test/prepare_b_quantized_transposed.cc
test/prepare_b_transposed.cc
test/quantize_test.cc
test/utils_test.cc
# Kernels tests
test/kernels/add_bias_test.cc
test/kernels/bitwise_not_test.cc
test/kernels/downcast_test.cc
test/kernels/exp_test.cc
test/kernels/floor_test.cc
test/kernels/multiply_test.cc
test/kernels/quantize_test.cc
test/kernels/relu_test.cc
test/kernels/rescale_test.cc
test/kernels/sigmoid_test.cc
test/kernels/tanh_test.cc
test/kernels/unquantize_test.cc
test/kernels/upcast_test.cc
test/kernels/write_test.cc
)
target_link_libraries(tests intgemm)
#CTest integration with Catch2
include(${CMAKE_CURRENT_SOURCE_DIR}/CMake/Catch.cmake)
include(CTest)
catch_discover_tests(tests)

third_party/intgemm/LICENSE (vendored, 70 lines removed)

@ -1,70 +0,0 @@
MIT License
Copyright (c) 2017--2019 University of Edinburgh, Nikolay Bogoychev, Mateusz Chudyk, Kenneth Heafield, and Microsoft Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
test/3rd_party/catch.hpp
Copyright (c) 2019 Two Blue Cubes Ltd. All rights reserved.
Distributed under the Boost Software License, Version 1.0. (See accompanying
file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
Boost Software License - Version 1.0 - August 17th, 2003
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
The original 16-bit SSE2 code came from:
Sharp Models on Dull Hardware: Fast and Accurate Neural Machine Translation Decoding on the CPU by Jacob Devlin
https://arxiv.org/abs/1705.01991
Under a license:
Copyright (c) 2017 Microsoft Corporation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

third_party/intgemm/README.md (vendored, 91 lines removed)

@ -1,91 +0,0 @@
[![Build SSE](https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/intgemm/job/intgemm-SSE.svg?label=SSE)](http://vali.inf.ed.ac.uk/jenkins/job/intgemm-SSE/)
[![Build AVX2](https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/intgemm/job/intgemm-AVX2.svg?label=AVX2)](http://vali.inf.ed.ac.uk/jenkins/job/intgemm-AVX2/)
[![Build AVX512BW](https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/intgemm/job/intgemm-AVX512BW.svg?label=AVX512BW)](http://vali.inf.ed.ac.uk/jenkins/job/intgemm-AVX512BW/)
![Build Ubuntu](https://github.com/kpu/intgemm/workflows/Ubuntu/badge.svg)
![Build Ubuntu debug](https://github.com/kpu/intgemm/workflows/Ubuntu%20debug/badge.svg)
![Build Ubuntu OpenMP](https://github.com/kpu/intgemm/workflows/Ubuntu%20OpenMP/badge.svg)
![Build Windows](https://github.com/kpu/intgemm/workflows/Windows/badge.svg)
![Build Mac](https://github.com/kpu/intgemm/workflows/Mac/badge.svg)
[![Intel Compiler](https://github.com/kpu/intgemm/actions/workflows/intel-19.yml/badge.svg)](https://github.com/kpu/intgemm/actions/workflows/intel-19.yml)
# Integer Matrix Multiplication
This repository implements 8-bit and 16-bit matrix multiplication:
C = A * B
It's designed with neural network inference in mind: A is typically activations, B is typically fixed parameters, and C is activations for the next layer.
A can have any number of rows. Typically this is a batch size.
The shared dimension, A's columns and B's rows, must be a multiple of 32 (for 16-bit) or 64 (for 8-bit).
B's columns must be a multiple of 8.
## Accuracy
16-bit multiplication accumulates into 32-bit integers WITHOUT SATURATION (because there is no 32-bit add with saturation). If width is too large (i.e. >2048) or many 16-bit values are large, there is substantial risk of overflow. Choose a smaller quantization multiplier to scale things down or implement periodic upcasting to 64-bit for me.
8-bit multiplication accumulates into 16-bit integers with saturation. This saturates for larger widths (~1024) and is worst on SSSE3 because it accumulates in fewer values. It's possible to upcast to 32-bit every so often, but this has not been implemented yet.
## Usage
A full example appears in [example.cc](example.cc).
Both A and B should be prepared before multiplication.
```C++
#include "intgemm/intgemm.h"
/* Not shown: allocate 64-byte aligned memory with e.g. aligned_alloc.
* A is A_rows x width.
* B is width x B_cols.
*/
/* Prepare A for multiplication. This might be offline or on the fly. */
intgemm::Int16::PrepareA(A.begin(), A_prepared.begin(), quant_mult, A_rows, width);
/* Prepare B for multiplication. This is typically done offline. */
intgemm::Int16::PrepareB(B.begin(), B_prepared.begin(), quant_mult, width, B_cols);
/* Multiply and produce results in C */
intgemm::Int16::Multiply(A_prepared.begin(), B_prepared.begin(), A_rows, width, B_cols, intgemm::callbacks::UnquantizeAndWrite(1.0 / (quant_mult * quant_mult), C.begin()));
```
For 8-bit, use `Int8` instead of `Int16`.
When repesented as floats, all of A, B, and C are in row-major format.
The last argument of `Multiply` is a callback which is usually used to performs postprocessing on the output matrix (C). Full set of built-in callbacks can be found in [callbacks/configs.h](callbacks/configs.h). You can also write your own callback. To do that you just need to:
1. Add configuration structure for your callback in [callbacks/configs.h](callbacks/configs.h).
2. Add your callback implementation:
- in [callbacks/implementations.inl](callbacks/implementations.inl) if you want to implement it for all architecturs at the same time.
- in `callbacks/ARCHITECTURE.h` (e.g. [callbacks/sse2.h](callbacks/sse2.h)) if you want to implement it only for the specific architecture.
For 8-bit, you can make use a of a slightly faster implementation, assuming you can determine tha quantization multipliers and prepare the biases offline:
```C++
#include "intgemm/intgemm.h"
/* Not shown: allocate 64-byte aligned memory with e.g. aligned_alloc.
* A is A_rows x width.
* B is width x B_cols.
* If you want to make use of the slightly faster 8bit codepath (assuming you can cache biases and quantization multipliers)
* This routine only supports C = A*B + Bias
* In practise it computes C = (A+127)*B + Bias - |127|*B
* Prepare A and B first:
*/
float alpha = 25;
float quant_mult = 127/alpha;
intgemm::Int8Shift::PrepareA(A.begin(), A_prepared.begin(), quant_mult, A_rows, width);
intgemm::Int8Shift::PrepareB(B.begin(), B_prepared.begin(), quant_mult, width, B_cols);
/* Prepare the bias (inplace) */
float unquant_mult_forprep = (-1)*(alpha)*(alpha)/(127.0f);
intgemm::Int8Shift::PrepareBias(B_prepared.begin(), width, B_cols, callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult_forprep, inputBias.begin(), inputBias.begin()));
/* Multiply */
intgemm::Int8Shift::Multiply(A_prepared.begin(), B_prepared.begin(), A_rows, width, B_cols, callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult_forprep, bias.begin(), C.begin()));
```
## Quantization
Floating-point values are multiplied by a user-specified constant then rounded to an integer.
In 16 bit, Jacob Devlin recommends 1024.0 for neural networks to prevent the aforementioned overflow.
In 8 bit, use 127.0 / the largest value (use MaxAbsolute). Quantization will saturate so it's possible to use larger multipliers to obtain clipping.
## Acknowledgments
The original 16-bit SSE2 code came from:
Sharp Models on Dull Hardware: Fast and Accurate Neural Machine Translation Decoding on the CPU by Jacob Devlin https://arxiv.org/abs/1705.01991 under the MIT license.

third_party/intgemm/benchmarks/benchmark.cc (vendored, 214 lines removed)

@ -1,214 +0,0 @@
#include "../intgemm/aligned.h"
#include "intgemm/intgemm_config.h"
#include "../intgemm/avx512_gemm.h"
#include "../intgemm/sse2_gemm.h"
#include "../intgemm/avx2_gemm.h"
#include "../intgemm/ssse3_gemm.h"
#include "../intgemm/intgemm.h"
#include "../intgemm/stats.h"
#include "../intgemm/callbacks.h"
#include <algorithm>
#include <cassert>
#include <chrono>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <random>
namespace intgemm {
namespace {
struct RandomMatrices {
RandomMatrices(Index A_rows_in, Index width_in, Index B_cols_in) :
A_rows(A_rows_in), width(width_in), B_cols(B_cols_in),
A(A_rows * width), B(width * B_cols) {
std::mt19937 gen;
std::uniform_real_distribution<float> dist(-1.f, 1.f);
gen.seed(45678);
for (auto& it : A) {
it = dist(gen);
}
for (auto& it : B) {
it = dist(gen);
}
}
const Index A_rows, width, B_cols;
AlignedVector<float> A, B;
};
template <class Backend> double Run(const RandomMatrices &m) {
using Integer = typename Backend::Integer;
float quant_mult = 127.0f / 2.0f;
float unquant_mult = 1.0f / (quant_mult * quant_mult);
AlignedVector<Integer> A_prepared(m.A_rows * m.width);
Backend::PrepareA(m.A.begin(), A_prepared.begin(), quant_mult, m.A_rows, m.width);
AlignedVector<Integer> B_prepared(m.width * m.B_cols);
Backend::PrepareB(m.B.begin(), B_prepared.begin(), quant_mult, m.width, m.B_cols);
AlignedVector<float> output(m.A_rows * m.B_cols);
// Burn in
Backend::Multiply(A_prepared.begin(), B_prepared.begin(), m.A_rows, m.width, m.B_cols, callbacks::UnquantizeAndWrite(unquant_mult, output.begin()));
auto start = std::chrono::steady_clock::now();
Backend::Multiply(A_prepared.begin(), B_prepared.begin(), m.A_rows, m.width, m.B_cols, callbacks::UnquantizeAndWrite(unquant_mult, output.begin()));
return std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count();
}
template <class Backend> void RunAll(RandomMatrices *matrices, RandomMatrices *matrices_end, std::vector<std::vector<double>> &stats) {
if (Backend::kUses > kCPU) return;
std::size_t size = matrices_end - matrices;
if (stats.size() < size)
stats.resize(size);
for (std::size_t i = 0; i < size; ++i) {
stats[i].push_back(Run<Backend>(matrices[i]));
}
}
struct BackendStats {
std::vector<std::vector<double>> ssse3_8bit;
std::vector<std::vector<double>> avx2_8bit;
std::vector<std::vector<double>> avx512_8bit;
std::vector<std::vector<double>> avx512vnni_8bit;
std::vector<std::vector<double>> sse2_16bit;
std::vector<std::vector<double>> avx2_16bit;
std::vector<std::vector<double>> avx512_16bit;
};
const float kOutlierThreshold = 0.75;
void Summarize(std::vector<double> &stats) {
// Throw out outliers.
std::vector<double>::iterator keep = stats.begin() + static_cast<std::size_t>(static_cast<float>(stats.size()) * kOutlierThreshold);
std::nth_element(stats.begin(), keep, stats.end());
double avg = 0.0;
for (std::vector<double>::const_iterator i = stats.begin(); i != keep; ++i) {
avg += *i;
}
avg /= (keep - stats.begin());
double stddev = 0.0;
for (std::vector<double>::const_iterator i = stats.begin(); i != keep; ++i) {
double off = (double)*i - avg;
stddev += off * off;
}
stddev = sqrt(stddev / (keep - stats.begin() - 1));
std::cout << std::setw(10) << *std::min_element(stats.begin(), stats.end()) << '\t' << std::setw(8) << avg << '\t' << std::setw(8) << stddev;
}
template <class Backend> void Print(std::vector<std::vector<double>> &stats, std::size_t index) {
if (stats.empty()) return;
std::cout << std::setw(16) << Backend::kName << '\t';
Summarize(stats[index]);
std::cout << '\n';
}
} // namespace intgemm
} // namespace
// Program takes no input
int main(int, char ** argv) {
std::cerr << "Remember to run this on a specific core:\ntaskset --cpu-list 0 " << argv[0] << std::endl;
using namespace intgemm;
RandomMatrices matrices[] = {
{1, 64, 8},
{8, 256, 256},
{8, 2048, 256},
{8, 256, 2048},
{320, 256, 256},
{472, 256, 256},
{248, 256, 256},
{200, 256, 256},
// Additional stuff
{256, 256, 256},
{512, 512, 512},
{1024, 1024, 1024},
/* {4096, 4096, 4096},
{4096, 4096, 2048},
{4096, 4096, 1024},
{4096, 4096, 512},
{4096, 4096, 256},*/
{4096, 4096, 128}
};
RandomMatrices *matrices_end = (RandomMatrices*)matrices + sizeof(matrices) / sizeof(RandomMatrices);
// Only do full sampling for <1024 rows.
RandomMatrices *full_sample;
for (full_sample = matrices_end - 1; full_sample >= matrices && full_sample->A_rows >= 1024; --full_sample) {}
++full_sample;
BackendStats stats;
const int kSamples = 100;
// Realistically, we don't expect different architectures or different precisions to run in the
// same run of an application. Benchmark per architecture and per precision level.
std::cerr << "SSSE3 8bit, 100 samples..." << std::endl;
for (int samples = 0; samples < kSamples; ++samples) {
RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
RunAll<SSSE3::Kernels8>(matrices, end, stats.ssse3_8bit);
}
std::cerr << "SSE2 16bit, 100 samples..." << std::endl;
for (int samples = 0; samples < kSamples; ++samples) {
RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
RunAll<SSE2::Kernels16>(matrices, end, stats.sse2_16bit);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
std::cerr << "AVX2 8bit, 100 samples..." << std::endl;
for (int samples = 0; samples < kSamples; ++samples) {
RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
RunAll<AVX2::Kernels8>(matrices, end, stats.avx2_8bit);
}
std::cerr << "AVX2 16bit, 100 samples..." << std::endl;
for (int samples = 0; samples < kSamples; ++samples) {
RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
RunAll<AVX2::Kernels16>(matrices, end, stats.avx2_16bit);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
std::cerr << "AVX512 8bit, 100 samples..." << std::endl;
for (int samples = 0; samples < kSamples; ++samples) {
RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
RunAll<AVX512BW::Kernels8>(matrices, end, stats.avx512_8bit);
}
std::cerr << "AVX512 16bit, 100 samples..." << std::endl;
for (int samples = 0; samples < kSamples; ++samples) {
RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
RunAll<AVX512BW::Kernels16>(matrices, end, stats.avx512_16bit);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
std::cerr << "AVX512VNNI 8bit, 100 samples..." << std::endl;
for (int samples = 0; samples < kSamples; ++samples) {
RandomMatrices *end = (samples < 4) ? matrices_end : full_sample;
RunAll<AVX512VNNI::Kernels8>(matrices, end, stats.avx512vnni_8bit);
}
#endif
if (stats.sse2_16bit.empty()) {
std::cerr << "No CPU support." << std::endl;
return 1;
}
for (std::size_t i = 0; i < sizeof(matrices) / sizeof(RandomMatrices); ++i) {
std::cout << "Multiply\t" << matrices[i].A_rows << '\t' << matrices[i].width << '\t' << matrices[i].B_cols << '\t' << "Samples=" << (kOutlierThreshold * stats.sse2_16bit[i].size()) << '\n';
Print<SSSE3::Kernels8>(stats.ssse3_8bit, i);
Print<AVX2::Kernels8>(stats.avx2_8bit, i);
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
Print<AVX512BW::Kernels8>(stats.avx512_8bit, i);
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
Print<AVX512VNNI::Kernels8>(stats.avx512vnni_8bit, i);
#endif
Print<SSE2::Kernels16>(stats.sse2_16bit, i);
Print<AVX2::Kernels16>(stats.avx2_16bit, i);
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
Print<AVX512BW::Kernels16>(stats.avx512_16bit, i);
#endif
}
return 0;
}


@ -1,74 +0,0 @@
#include "../intgemm/intgemm.h"
#include "../intgemm/aligned.h"
#include "../intgemm/ssse3_gemm.h"
#include "../intgemm/avx2_gemm.h"
#include "../intgemm/avx512_gemm.h"
#include <chrono>
#include <iomanip>
#include <iostream>
#include <random>
#include <vector>
namespace {
float MaxAbsoluteBaseline(const float *begin, const float *end) {
auto res = std::minmax_element(begin, end);
return std::max(std::fabs(*res.first), std::fabs(*res.second));
}
void BenchmarkMaxAbsolute() {
std::mt19937 gen;
std::uniform_real_distribution<float> dist(0.f, 1.f);
gen.seed(45678);
intgemm::AlignedVector<float> v(4096 * 4096);
for (auto& it : v) {
it = dist(gen);
}
// Hopefully these don't get optimized out...
MaxAbsoluteBaseline(v.begin(), v.end());
auto start = std::chrono::steady_clock::now();
MaxAbsoluteBaseline(v.begin(), v.end());
double baseline = std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count();
intgemm::MaxAbsolute(v.begin(), v.end());
start = std::chrono::steady_clock::now();
intgemm::MaxAbsolute(v.begin(), v.end());
double optimized = std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count();
std::cout << "MaxAbsolute baseline = " << baseline << " optimized = " << optimized << " speedup = " << (optimized / baseline) << '\n';
}
template <class Backend> void QuantizerBench(const float *in, int8_t *out, intgemm::Index count) {
if (intgemm::kCPU < Backend::kUses) return;
Backend::Quantize(in, out, 1.0, count);
const std::size_t kTries = 60;
auto start = std::chrono::steady_clock::now();
for (std::size_t t = 0; t < kTries; ++t) {
Backend::Quantize(in, out, 1.0, count);
}
auto end = std::chrono::steady_clock::now();
double took = std::chrono::duration<double>(end - start).count() / kTries;
std::cout << std::setw(9) << count << ' ' << std::fixed << std::setw(9) << std::setprecision(7) << took << ' ' << Backend::kName << std::endl;
}
} // namespace
int main() {
BenchmarkMaxAbsolute();
for (std::size_t count = 1; count < (1ULL<<30); count *= 2) {
intgemm::AlignedVector<float> in(count);
intgemm::AlignedVector<int8_t> out(count);
std::mt19937 gen;
std::uniform_real_distribution<float> dist(-129.0, 129.0);
for (float &element : in) {
element = dist(gen);
}
QuantizerBench<intgemm::SSSE3::Kernels8>(in.begin(), out.begin(), static_cast<intgemm::Index>(count));
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
QuantizerBench<intgemm::AVX2::Kernels8>(in.begin(), out.begin(), static_cast<intgemm::Index>(count));
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
QuantizerBench<intgemm::AVX512BW::Kernels8>(in.begin(), out.begin(), static_cast<intgemm::Index>(count));
#endif
}
}

third_party/intgemm/benchmarks/biasmultiply.cc (vendored, 278 lines removed)

@ -1,278 +0,0 @@
#include "../intgemm/intgemm.h"
#include "../intgemm/aligned.h"
#include <chrono>
#include <random>
#include <iostream>
using namespace intgemm;
template <class Routine>
void testOld(Index /*rows*/, Index /*cols*/) {
}
template <class Routine>
std::chrono::duration<double> testNew(Index A_rows, Index width, Index B_cols) {
AlignedVector<float> A(A_rows * width);
AlignedVector<float> B(width * B_cols);
AlignedVector<float> bias(B_cols);
std::mt19937 gen;
std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
for (auto& it : A) {
it = dist(gen);
}
for (auto& it : B) {
it = dist(gen);
}
for (auto& it : bias) {
it = dist(gen);
}
float alpha = 2.0f;
float quant_mult = 127.0f / alpha;
float unquant_mult = 1.0f / (quant_mult*quant_mult);
AlignedVector<uint8_t> A_prep(A.size());
AlignedVector<int8_t> B_prep(B.size());
Routine::PrepareA(A.begin(), A_prep.begin(), quant_mult, A_rows, width);
Routine::PrepareB(B.begin(), B_prep.begin(), quant_mult, width, B_cols);
AlignedVector<float> test_C(A_rows * B_cols);
float unquant_mult_forprep = (-1)*(alpha)*(alpha)/(127.0f); //Minus one to invert add_ps later on
Routine::PrepareBias(B_prep.begin(), width, B_cols, callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult_forprep, bias.begin(), bias.begin()));
auto start = std::chrono::system_clock::now();
Routine::Multiply8Shift(A_prep.begin(), B_prep.begin(), A_rows, width, B_cols, callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult, bias.begin(), test_C.begin()));
auto end = std::chrono::system_clock::now();
std::chrono::duration<double> elapsed_seconds = end-start;
return elapsed_seconds;
}
template <class Routine>
std::chrono::duration<double> testOld(Index A_rows, Index width, Index B_cols) {
AlignedVector<float> A(A_rows * width);
AlignedVector<float> B(width * B_cols);
AlignedVector<float> bias(B_cols);
std::mt19937 gen;
std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
for (auto& it : A) {
it = dist(gen);
}
for (auto& it : B) {
it = dist(gen);
}
for (auto& it : bias) {
it = dist(gen);
}
float alpha = 2.0f;
float quant_mult = 127.0f / alpha;
float unquant_mult = 1.0f / (quant_mult*quant_mult);
AlignedVector<int8_t> A_prep(A.size());
AlignedVector<int8_t> B_prep(B.size());
Routine::PrepareA(A.begin(), A_prep.begin(), quant_mult, A_rows, width);
Routine::PrepareB(B.begin(), B_prep.begin(), quant_mult, width, B_cols);
AlignedVector<float> test_C(A_rows * B_cols);
auto start = std::chrono::system_clock::now();
Routine::Multiply(A_prep.begin(), B_prep.begin(), A_rows, width, B_cols, callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult, bias.begin(), test_C.begin()));
auto end = std::chrono::system_clock::now();
std::chrono::duration<double> elapsed_seconds = end-start;
return elapsed_seconds;
}
template <class Routine>
std::chrono::duration<double> testOld_nobias(Index A_rows, Index width, Index B_cols) {
AlignedVector<float> A(A_rows * width);
AlignedVector<float> B(width * B_cols);
std::mt19937 gen;
std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
for (auto& it : A) {
it = dist(gen);
}
for (auto& it : B) {
it = dist(gen);
}
float alpha = 2.0f;
float quant_mult = 127.0f / alpha;
float unquant_mult = 1.0f / (quant_mult*quant_mult);
AlignedVector<int8_t> A_prep(A.size());
AlignedVector<int8_t> B_prep(B.size());
Routine::PrepareA(A.begin(), A_prep.begin(), quant_mult, A_rows, width);
Routine::PrepareB(B.begin(), B_prep.begin(), quant_mult, width, B_cols);
AlignedVector<float> test_C(A_rows * B_cols);
auto start = std::chrono::system_clock::now();
Routine::Multiply(A_prep.begin(), B_prep.begin(), A_rows, width, B_cols, callbacks::UnquantizeAndWrite(unquant_mult, test_C.begin()));
auto end = std::chrono::system_clock::now();
std::chrono::duration<double> elapsed_seconds = end-start;
return elapsed_seconds;
}
int main(int argc, char ** argv) {
int repeat = 1000;
if (argc > 1) {
repeat = atoi(argv[1]);
}
std::chrono::duration<double> oldSSSE3_nobias = testOld_nobias<SSSE3::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
oldSSSE3_nobias += testOld_nobias<SSSE3::Kernels8>(8, 256, 256);
oldSSSE3_nobias += testOld_nobias<SSSE3::Kernels8>(8, 2048, 256);
oldSSSE3_nobias += testOld_nobias<SSSE3::Kernels8>(320, 256, 256);
oldSSSE3_nobias += testOld_nobias<SSSE3::Kernels8>(472, 256, 256);
oldSSSE3_nobias += testOld_nobias<SSSE3::Kernels8>(248, 256, 256);
oldSSSE3_nobias += testOld_nobias<SSSE3::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of SSSE3 without bias took: " << oldSSSE3_nobias.count() << " seconds." << std::endl;
std::chrono::duration<double> oldSSSE3 = testOld<SSSE3::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
oldSSSE3 += testOld<SSSE3::Kernels8>(8, 256, 256);
oldSSSE3 += testOld<SSSE3::Kernels8>(8, 2048, 256);
oldSSSE3 += testOld<SSSE3::Kernels8>(320, 256, 256);
oldSSSE3 += testOld<SSSE3::Kernels8>(472, 256, 256);
oldSSSE3 += testOld<SSSE3::Kernels8>(248, 256, 256);
oldSSSE3 += testOld<SSSE3::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of SSSE3 took: " << oldSSSE3.count() << " seconds." << std::endl;
std::chrono::duration<double> newTimeSSSE3 = testNew<SSSE3::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
newTimeSSSE3 += testNew<SSSE3::Kernels8>(8, 256, 256);
newTimeSSSE3 += testNew<SSSE3::Kernels8>(8, 2048, 256);
newTimeSSSE3 += testNew<SSSE3::Kernels8>(320, 256, 256);
newTimeSSSE3 += testNew<SSSE3::Kernels8>(472, 256, 256);
newTimeSSSE3 += testNew<SSSE3::Kernels8>(248, 256, 256);
newTimeSSSE3 += testNew<SSSE3::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of Shifted SSSE3 took: " << newTimeSSSE3.count() << " seconds." << std::endl;
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
std::chrono::duration<double> oldAVX2_nobias = testOld_nobias<AVX2::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
oldAVX2_nobias += testOld_nobias<AVX2::Kernels8>(8, 256, 256);
oldAVX2_nobias += testOld_nobias<AVX2::Kernels8>(8, 2048, 256);
oldAVX2_nobias += testOld_nobias<AVX2::Kernels8>(320, 256, 256);
oldAVX2_nobias += testOld_nobias<AVX2::Kernels8>(472, 256, 256);
oldAVX2_nobias += testOld_nobias<AVX2::Kernels8>(248, 256, 256);
oldAVX2_nobias += testOld_nobias<AVX2::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of AVX2 without bias took: " << oldAVX2_nobias.count() << " seconds." << std::endl;
std::chrono::duration<double> oldAVX2 = testOld<AVX2::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
oldAVX2 += testOld<AVX2::Kernels8>(8, 256, 256);
oldAVX2 += testOld<AVX2::Kernels8>(8, 2048, 256);
oldAVX2 += testOld<AVX2::Kernels8>(320, 256, 256);
oldAVX2 += testOld<AVX2::Kernels8>(472, 256, 256);
oldAVX2 += testOld<AVX2::Kernels8>(248, 256, 256);
oldAVX2 += testOld<AVX2::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of AVX2 took: " << oldAVX2.count() << " seconds." << std::endl;
std::chrono::duration<double> newTimeAVX2 = testNew<AVX2::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
newTimeAVX2 += testNew<AVX2::Kernels8>(8, 256, 256);
newTimeAVX2 += testNew<AVX2::Kernels8>(8, 2048, 256);
newTimeAVX2 += testNew<AVX2::Kernels8>(320, 256, 256);
newTimeAVX2 += testNew<AVX2::Kernels8>(472, 256, 256);
newTimeAVX2 += testNew<AVX2::Kernels8>(248, 256, 256);
newTimeAVX2 += testNew<AVX2::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of Shifted AVX2 took: " << newTimeAVX2.count() << " seconds." << std::endl;
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
if (kCPU < CPUType::AVX512BW) return 0;
std::chrono::duration<double> oldAVX512_nobias = testOld_nobias<AVX512BW::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
oldAVX512_nobias += testOld_nobias<AVX512BW::Kernels8>(8, 256, 256);
oldAVX512_nobias += testOld_nobias<AVX512BW::Kernels8>(8, 2048, 256);
oldAVX512_nobias += testOld_nobias<AVX512BW::Kernels8>(320, 256, 256);
oldAVX512_nobias += testOld_nobias<AVX512BW::Kernels8>(472, 256, 256);
oldAVX512_nobias += testOld_nobias<AVX512BW::Kernels8>(248, 256, 256);
oldAVX512_nobias += testOld_nobias<AVX512BW::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of AVX512 without bias took: " << oldAVX512_nobias.count() << " seconds." << std::endl;
std::chrono::duration<double> oldAVX512 = testOld<AVX512BW::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
oldAVX512 += testOld<AVX512BW::Kernels8>(8, 256, 256);
oldAVX512 += testOld<AVX512BW::Kernels8>(8, 2048, 256);
oldAVX512 += testOld<AVX512BW::Kernels8>(320, 256, 256);
oldAVX512 += testOld<AVX512BW::Kernels8>(472, 256, 256);
oldAVX512 += testOld<AVX512BW::Kernels8>(248, 256, 256);
oldAVX512 += testOld<AVX512BW::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of AVX512 took: " << oldAVX512.count() << " seconds." << std::endl;
std::chrono::duration<double> newTimeAVX512 = testNew<AVX512BW::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
newTimeAVX512 += testNew<AVX512BW::Kernels8>(8, 256, 256);
newTimeAVX512 += testNew<AVX512BW::Kernels8>(8, 2048, 256);
newTimeAVX512 += testNew<AVX512BW::Kernels8>(320, 256, 256);
newTimeAVX512 += testNew<AVX512BW::Kernels8>(472, 256, 256);
newTimeAVX512 += testNew<AVX512BW::Kernels8>(248, 256, 256);
newTimeAVX512 += testNew<AVX512BW::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of Shifted AVX512 took: " << newTimeAVX512.count() << " seconds." << std::endl;
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
if (kCPU < CPUType::AVX512VNNI) return 0;
std::chrono::duration<double> oldAVX512VNNI_nobias = testOld_nobias<AVX512VNNI::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI::Kernels8>(8, 256, 256);
oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI::Kernels8>(8, 2048, 256);
oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI::Kernels8>(320, 256, 256);
oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI::Kernels8>(472, 256, 256);
oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI::Kernels8>(248, 256, 256);
oldAVX512VNNI_nobias += testOld_nobias<AVX512VNNI::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of AVX512VNNI without bias took: " << oldAVX512VNNI_nobias.count() << " seconds." << std::endl;
std::chrono::duration<double> oldAVX512VNNI = testOld<AVX512VNNI::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
oldAVX512VNNI += testOld<AVX512VNNI::Kernels8>(8, 256, 256);
oldAVX512VNNI += testOld<AVX512VNNI::Kernels8>(8, 2048, 256);
oldAVX512VNNI += testOld<AVX512VNNI::Kernels8>(320, 256, 256);
oldAVX512VNNI += testOld<AVX512VNNI::Kernels8>(472, 256, 256);
oldAVX512VNNI += testOld<AVX512VNNI::Kernels8>(248, 256, 256);
oldAVX512VNNI += testOld<AVX512VNNI::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of AVX512VNNI took: " << oldAVX512VNNI.count() << " seconds." << std::endl;
std::chrono::duration<double> newTimeAVX512VNNI = testNew<AVX512VNNI::Kernels8>(1, 64, 8);
for (int i = 0; i<repeat; i++) {
newTimeAVX512VNNI += testNew<AVX512VNNI::Kernels8>(8, 256, 256);
newTimeAVX512VNNI += testNew<AVX512VNNI::Kernels8>(8, 2048, 256);
newTimeAVX512VNNI += testNew<AVX512VNNI::Kernels8>(320, 256, 256);
newTimeAVX512VNNI += testNew<AVX512VNNI::Kernels8>(472, 256, 256);
newTimeAVX512VNNI += testNew<AVX512VNNI::Kernels8>(248, 256, 256);
newTimeAVX512VNNI += testNew<AVX512VNNI::Kernels8>(200, 256, 256);
}
std::cout << repeat << " iterations of Shifted AVX512VNNI took: " << newTimeAVX512VNNI.count() << " seconds." << std::endl;
#endif
}
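For context on why testNew calls PrepareBias with a negative unquant_mult_forprep before timing Multiply8Shift: in the shifted path A is quantized to unsigned bytes as a_s + 127, so every dot product picks up an extra 127 * sum(column of B) that must be cancelled through the bias. The scalar sketch below spells out that identity; it is illustrative only, not intgemm code, and the names are hypothetical.

#include <cstddef>
#include <cstdint>

// One output element of the shifted multiply. a_u holds a_s + 127 as unsigned
// bytes, so the raw accumulator equals dot(a_s, b) + 127 * sum(b); the
// corrected bias prepared beforehand cancels the second term.
float ShiftedDot(const uint8_t* a_u, const int8_t* b, std::size_t width,
                 float unquant_mult, float corrected_bias) {
  int32_t acc = 0;
  for (std::size_t k = 0; k < width; ++k) {
    acc += static_cast<int32_t>(a_u[k]) * static_cast<int32_t>(b[k]);
  }
  // corrected_bias = original_bias - unquant_mult * 127 * sum_k b[k],
  // which is what PrepareBias folds into the bias array above.
  return static_cast<float>(acc) * unquant_mult + corrected_bias;
}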

25
third_party/intgemm/compile_test/avx2.cc vendored
View file

@@ -1,25 +0,0 @@
// Some compilers don't have AVX2 support. Test for them.
#include <immintrin.h>
// clang-cl bug doesn't include these headers when pretending to be MSVC
// https://github.com/llvm/llvm-project/blob/e9a294449575a1e1a0daca470f64914695dc9adc/clang/lib/Headers/immintrin.h#L69-L72
#if defined(_MSC_VER) && defined(__clang__)
#include <avxintrin.h>
#include <avx2intrin.h>
#include <smmintrin.h>
#endif
#if defined(_MSC_VER) && !defined(__clang__)
#define INTGEMM_AVX2
#else
#define INTGEMM_AVX2 __attribute__ ((target ("avx2")))
#endif
INTGEMM_AVX2 int Test() {
__m256i value = _mm256_set1_epi32(1);
value = _mm256_abs_epi8(value);
return *(int*)&value;
}
int main() {
}

31
third_party/intgemm/compile_test/avx512bw.cc vendored
View file

@@ -1,31 +0,0 @@
// Some compilers don't have AVX512BW support. Test for them.
#include <immintrin.h>
// clang-cl bug doesn't include these headers when pretending to be MSVC
// https://github.com/llvm/llvm-project/blob/e9a294449575a1e1a0daca470f64914695dc9adc/clang/lib/Headers/immintrin.h#L69-L72
#if defined(_MSC_VER) && defined(__clang__)
#include <avxintrin.h>
#include <avx2intrin.h>
#include <smmintrin.h>
#include <avx512fintrin.h>
#include <avx512dqintrin.h>
#include <avx512bwintrin.h>
#endif
#if defined(_MSC_VER) && !defined(__clang__)
#define INTGEMM_AVX512BW
#elif defined(__INTEL_COMPILER)
#define INTGEMM_AVX512BW __attribute__ ((target ("avx512f")))
#else
#define INTGEMM_AVX512BW __attribute__ ((target ("avx512bw")))
#endif
INTGEMM_AVX512BW int Test() {
// AVX512BW
__m512i value = _mm512_set1_epi32(1);
value = _mm512_maddubs_epi16(value, value);
return *(int*)&value;
}
int main() {
}

View file

@@ -1,36 +0,0 @@
#include <immintrin.h>
// clang-cl bug doesn't include these headers when pretending to be MSVC
// https://github.com/llvm/llvm-project/blob/e9a294449575a1e1a0daca470f64914695dc9adc/clang/lib/Headers/immintrin.h#L69-L72
#if defined(_MSC_VER) && defined(__clang__)
#include <avxintrin.h>
#include <avx2intrin.h>
#include <smmintrin.h>
#include <avx512fintrin.h>
#include <avx512dqintrin.h>
#include <avx512bwintrin.h>
#include <avx512vnniintrin.h>
#endif
#if defined(_MSC_VER) && !defined(__clang__)
#elif defined(__INTEL_COMPILER)
__attribute__ ((target ("avx512f")))
#else
__attribute__ ((target ("avx512f,avx512bw,avx512dq,avx512vnni")))
#endif
bool Foo() {
// AVX512F
__m512i value = _mm512_set1_epi32(1);
// AVX512BW
value = _mm512_maddubs_epi16(value, value);
// AVX512DQ
__m256i value2 = _mm256_set1_epi8(1);
value = _mm512_inserti32x8(value, value2, 1);
// AVX512VNNI
value = _mm512_dpbusd_epi32(value, value, value);
return *(int*)&value;
}
int main() {
return Foo();
}

79
third_party/intgemm/example.cc vendored
View file

@@ -1,79 +0,0 @@
#include "intgemm/intgemm.h"
// This is just for AlignedVector, which helps manage 64-byte aligned memory.
// Feel free to manage memory yourself.
#include "intgemm/aligned.h"
#include "intgemm/callbacks.h"
#include <cassert>
#include <cmath>
#include <random>
int main() {
using intgemm::Index;
const Index A_rows = 1;
// The shared dimension: A's columns and B's rows.
const Index width = 64;
const Index B_cols = 8;
// This is a simple vector class that allocates memory aligned to 64 bytes.
// You don't have to use it; just use aligned_alloc and friends directly.
using intgemm::AlignedVector;
AlignedVector<float> A(A_rows * width);
AlignedVector<float> B(width * B_cols);
// Fill with random values in range [-2, 2].
std::mt19937 gen;
std::uniform_real_distribution<float> dist(-2.f, 2.f);
gen.seed(1);
for (auto& it : A) {
it = dist(gen);
}
for (auto& it : B) {
it = dist(gen);
}
// Compute the top left corner of C as a sanity check.
float top_left_reference = 0.0f;
for (Index w = 0; w < width; ++w) {
top_left_reference += A[w] * B[w * B_cols];
}
// 16-bit multiplication.
{
// For 16-bit, Jacob Devlin recommends 1024 so as to not overflow in 32-bit accumulation.
float quant_mult = 1024.0f;
AlignedVector<int16_t> A_prepared(A.size());
AlignedVector<int16_t> B_prepared(B.size());
// Quantize A.
intgemm::Int16::PrepareA(A.begin(), A_prepared.begin(), quant_mult, A_rows, width);
// Quantize and reshape B.
// Typically you will do this once when parameters are loaded, not every time.
intgemm::Int16::PrepareB(B.begin(), B_prepared.begin(), quant_mult, width, B_cols);
AlignedVector<float> C(A_rows * B_cols);
// Do the actual multiply.
intgemm::Int16::Multiply(A_prepared.begin(), B_prepared.begin(), A_rows, width, B_cols, intgemm::callbacks::UnquantizeAndWrite(1.0f / (quant_mult * quant_mult), C.begin()));
// Sanity check. C will be row major.
assert(std::fabs(C[0] - top_left_reference) < 0.05f);
}
// 8-bit multiplication.
{
// For 8-bit, a good quantization multiplier is 127 / largest absolute value.
float quant_mult = 127.0f / 2.0f;
AlignedVector<int8_t> A_prepared(A.size());
AlignedVector<int8_t> B_prepared(B.size());
// Quantize A.
intgemm::Int8::PrepareA(A.begin(), A_prepared.begin(), quant_mult, A_rows, width);
// Quantize and reshape B.
// Typically you will do this once when parameters are loaded, not every time.
intgemm::Int8::PrepareB(B.begin(), B_prepared.begin(), quant_mult, width, B_cols);
AlignedVector<float> C(A_rows * B_cols);
// Do the actual multiply.
intgemm::Int8::Multiply(A_prepared.begin(), B_prepared.begin(), A_rows, width, B_cols, intgemm::callbacks::UnquantizeAndWrite(1.0f / (quant_mult * quant_mult), C.begin()));
// Sanity check. C will be row major.
assert(std::fabs(C[0] - top_left_reference) < 0.05f);
}
return 0;
}
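The sanity checks above rely on the basic unquantization identity: with both operands scaled by quant_mult, the integer dot product is scaled by quant_mult squared, which is why the example passes 1.0f / (quant_mult * quant_mult) to UnquantizeAndWrite. A scalar sketch of one output element (illustrative only, not part of the example; the name is hypothetical):

#include <cstddef>
#include <cstdint>

// Reference for what UnquantizeAndWrite recovers: divide the int32 dot
// product by quant_mult * quant_mult to get back an approximate float result.
float UnquantizedDot(const int8_t* a_q, const int8_t* b_q, std::size_t width,
                     float quant_mult) {
  int32_t acc = 0;
  for (std::size_t k = 0; k < width; ++k) {
    acc += static_cast<int32_t>(a_q[k]) * static_cast<int32_t>(b_q[k]);
  }
  return static_cast<float>(acc) / (quant_mult * quant_mult);
}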

90
third_party/intgemm/intgemm/aligned.h vendored
View file

@@ -1,90 +0,0 @@
#pragma once
#include <cstdlib>
#include <new>
#ifdef _MSC_VER
// Ensure _HAS_EXCEPTIONS is defined
#include <vcruntime.h>
#include <malloc.h>
#endif
#if !((defined(_MSC_VER) && !defined(__clang__)) ? (_HAS_EXCEPTIONS) : (__EXCEPTIONS))
#include <cstdlib>
#endif
// Aligned simple vector.
namespace intgemm {
template <class T> class AlignedVector {
public:
AlignedVector() : mem_(nullptr), size_(0) {}
explicit AlignedVector(std::size_t size, std::size_t alignment = 64 /* CPU cares about this */)
: size_(size) {
#ifdef _MSC_VER
mem_ = static_cast<T*>(_aligned_malloc(size * sizeof(T), alignment));
if (!mem_) {
# if (defined(_MSC_VER) && !defined(__clang__)) ? (_HAS_EXCEPTIONS) : (__EXCEPTIONS)
throw std::bad_alloc();
# else
std::abort();
# endif
}
#else
if (posix_memalign(reinterpret_cast<void **>(&mem_), alignment, size * sizeof(T))) {
# if (defined(_MSC_VER) && !defined(__clang__)) ? (_HAS_EXCEPTIONS) : (__EXCEPTIONS)
throw std::bad_alloc();
# else
std::abort();
# endif
}
#endif
}
AlignedVector(AlignedVector &&from) : mem_(from.mem_), size_(from.size_) {
from.mem_ = nullptr;
from.size_ = 0;
}
AlignedVector &operator=(AlignedVector &&from) {
if (this == &from) return *this;
release();
mem_ = from.mem_;
size_ = from.size_;
from.mem_ = nullptr;
from.size_ = 0;
return *this;
}
AlignedVector(const AlignedVector&) = delete;
AlignedVector& operator=(const AlignedVector&) = delete;
~AlignedVector() { release(); }
std::size_t size() const { return size_; }
T &operator[](std::size_t offset) { return mem_[offset]; }
const T &operator[](std::size_t offset) const { return mem_[offset]; }
T *begin() { return mem_; }
const T *begin() const { return mem_; }
T *end() { return mem_ + size_; }
const T *end() const { return mem_ + size_; }
template <typename ReturnType>
ReturnType *as() { return reinterpret_cast<ReturnType*>(mem_); }
private:
T *mem_;
std::size_t size_;
void release() {
#ifdef _MSC_VER
_aligned_free(mem_);
#else
std::free(mem_);
#endif
}
};
} // namespace intgemm
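A short usage sketch for the class above (illustrative only; the function is hypothetical): allocation is 64-byte aligned by default, which is the alignment the Quantize and Multiply routines in this tree assert on their inputs.

#include <cstddef>

// Fill an aligned buffer and take the raw pointer a kernel would consume.
void AlignedVectorUsage() {
  intgemm::AlignedVector<float> input(1024);  // 1024 floats, 64-byte aligned
  for (std::size_t i = 0; i < input.size(); ++i) {
    input[i] = static_cast<float>(i) * 0.25f;
  }
  float* aligned_ptr = input.begin();  // suitable for aligned SIMD loads
  (void)aligned_ptr;
}  // memory is released by ~AlignedVector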

232
third_party/intgemm/intgemm/avx2_gemm.h vendored
View file

@@ -1,232 +0,0 @@
#pragma once
#include "intgemm/intgemm_config.h"
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
#include "interleave.h"
#include "kernels.h"
#include "multiply.h"
#include "types.h"
#include <cstdint>
#include <cstring>
namespace intgemm {
namespace AVX2 {
INTGEMM_AVX2 inline Register QuantizerGrab(const float *input, const __m256 quant_mult_reg) {
return kernels::quantize(loadu_ps<FRegister>(input), quant_mult_reg);
}
INTGEMM_SELECT_COL_B(INTGEMM_AVX2, __m256i)
class QuantizeTile16 {
public:
INTGEMM_AVX2 static inline Register Consecutive(FRegister mult_reg, const float *input) {
return Tile(mult_reg, input, input + 8);
}
INTGEMM_AVX2 static inline Register ConsecutiveWithWrapping(FRegister mult_reg, const float *input, Index cols_left, Index cols, Index row_step) {
return Tile(mult_reg,
input,
input + 8 + (cols_left <= 8 ? cols * (row_step - 1) : 0));
}
INTGEMM_AVX2 static inline Register ForReshape(FRegister mult_reg, const float *input, Index cols) {
// 8 rows in the first 128-bit register, 8 in the second register.
return Tile(mult_reg, input, input + 8 * cols);
}
private:
INTGEMM_AVX2 static inline Register Tile(FRegister mult_reg, const float *input0, const float *input1) {
Register g0 = QuantizerGrab(input0, mult_reg);
Register g1 = QuantizerGrab(input1, mult_reg);
Register packed = _mm256_packs_epi32(g0, g1);
// Reorder the packed values because Intel does 0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15.
// Technically this could be removed if the PrepareB did the same reordering internally.
return _mm256_permute4x64_epi64(packed, 0xd8 /* 0, 2, 1, 3 */);
}
};
struct Kernels16 {
typedef int16_t Integer;
// Currently A is prepared by quantization but this could theoretically change.
INTGEMM_AVX2 static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
Quantize(input, output, quant_mult, rows * cols);
}
// Just quantize everything in order.
INTGEMM_AVX2 static void Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
assert(size % 16 == 0);
assert(reinterpret_cast<uintptr_t>(input) % 32 == 0);
FRegister q = set1_ps<FRegister>(quant_mult);
const float *end = input + size;
for (; input != end; input += 16, output += 16) {
*reinterpret_cast<__m256i*>(output) = QuantizeTile16::Consecutive(q, input);
}
}
// Tile size for B; B must be a multiple of this block size.
static const Index kBTileRow = 16;
static const Index kBTileCol = 8;
/*
INTGEMM_AVX2 static void PrepareB(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
PrepareBFor16(input, output, AVX2::QuantizeTile16(quant_mult), rows, cols);
}*/
INTGEMM_PREPARE_B_16(INTGEMM_AVX2, AVX2::QuantizeTile16)
INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX2, int16_t)
INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX2, AVX2::QuantizeTile16, int16_t)
INTGEMM_AVX2 static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
AVX2::SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows * 2, cols_begin, cols_end);
}
INTGEMM_MULTIPLY16(__m256i, INTGEMM_AVX2, CPUType::AVX2)
constexpr static const char *const kName = "16-bit AVX2";
static const CPUType kUses = CPUType::AVX2;
};
/* Read 8 floats at a time from input0, input1, input2, and input3. Quantize
* them to 8-bit by multiplying with quant_mult_reg then rounding. Concatenate
* the result into one register and return it.
*/
class QuantizeTile8 {
public:
INTGEMM_AVX2 static inline Register Consecutive(FRegister quant_mult, const float *input) {
return Tile(quant_mult, input, input + 8, input + 16, input + 24);
}
INTGEMM_AVX2 static inline Register ConsecutiveU(FRegister quant_mult, const float *input) {
return TileU(quant_mult, input, input + 8, input + 16, input + 24);
}
INTGEMM_AVX2 static inline Register ConsecutiveWithWrapping(FRegister quant_mult, const float *input, Index cols_left, Index cols, Index row_step) {
const float* inputs[4];
for (Index i = 0; i < sizeof(inputs) / sizeof(inputs[0]); ++i) {
while (cols_left < sizeof(Register) / sizeof(float)) {
input += cols * (row_step - 1);
cols_left += cols;
}
inputs[i] = input;
input += sizeof(Register) / sizeof(float);
cols_left -= sizeof(Register) / sizeof(float);
}
return Tile(quant_mult, inputs[0], inputs[1], inputs[2], inputs[3]);
}
INTGEMM_AVX2 static inline Register ForReshape(FRegister quant_mult, const float *input, Index cols) {
// Put higher rows in the second half of the register. These will jumble
// around in the same way then conveniently land in the right place.
return Tile(quant_mult, input, input + 2 * cols, input + 16 * cols, input + 18 * cols);
}
INTGEMM_AVX2 static inline __m256i Tile(FRegister quant_mult, const float *input0, const float *input1, const float *input2, const float *input3) {
// Looking at the assembly, gcc has pulled this outside the loops calling this.
const __m256i neg127 = _mm256_set1_epi8(-127);
const __m256i shuffle_param = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
// Grab 4 registers at a time in 32-bit format.
__m256i g0 = AVX2::QuantizerGrab(input0, quant_mult);
__m256i g1 = AVX2::QuantizerGrab(input1, quant_mult);
__m256i g2 = AVX2::QuantizerGrab(input2, quant_mult);
__m256i g3 = AVX2::QuantizerGrab(input3, quant_mult);
// Pack 32-bit to 16-bit.
__m256i packed0 = _mm256_packs_epi32(g0, g1);
__m256i packed1 = _mm256_packs_epi32(g2, g3);
// Pack 16-bit to 8-bit.
__m256i packed = _mm256_packs_epi16(packed0, packed1);
// Ban -128.
packed = _mm256_max_epi8(packed, neg127);
// Currently in 0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27 4 5 6 7 12 13 14 15 20 21 22 23 28 29 30 31
// Or as 32-bit integers 0 2 4 6 1 3 5 7
// Technically this could be removed so long as the rows are bigger than 16
// and the values are only used for GEMM.
return _mm256_permutevar8x32_epi32(packed, shuffle_param);
}
private:
// A version that produces uint8_t output.
INTGEMM_AVX2 static inline Register TileU(FRegister quant_mult, const float *input0, const float *input1, const float *input2, const float *input3) {
// Looking at the assembly, gcc has pulled this outside the loops calling this.
const __m256i neg127 = _mm256_set1_epi8(-127);
const __m256i pos127 = _mm256_set1_epi8(127);
const __m256i shuffle_param = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
// Grab 4 registers at a time in 32-bit format.
__m256i g0 = AVX2::QuantizerGrab(input0, quant_mult);
__m256i g1 = AVX2::QuantizerGrab(input1, quant_mult);
__m256i g2 = AVX2::QuantizerGrab(input2, quant_mult);
__m256i g3 = AVX2::QuantizerGrab(input3, quant_mult);
// Pack 32-bit to 16-bit.
__m256i packed0 = _mm256_packs_epi32(g0, g1);
__m256i packed1 = _mm256_packs_epi32(g2, g3);
// Pack 16-bit to 8-bit.
__m256i packed = _mm256_packs_epi16(packed0, packed1);
// Ban -128.
packed = _mm256_max_epi8(packed, neg127); //Could be removed if we use +128
packed = _mm256_add_epi8(packed, pos127);
// Currently in 0 1 2 3 8 9 10 11 16 17 18 19 24 25 26 27 4 5 6 7 12 13 14 15 20 21 22 23 28 29 30 31
// Or as 32-bit integers 0 2 4 6 1 3 5 7
// Technically this could be removed so long as the rows are bigger than 16
// and the values are only used for GEMM.
return _mm256_permutevar8x32_epi32(packed, shuffle_param);
}
};
struct Kernels8 {
typedef int8_t Integer;
// Currently A is prepared by quantization but this could theoretically change.
INTGEMM_AVX2 static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
Quantize(input, output, quant_mult, rows * cols);
}
private:
INTGEMM_QUANTIZE_THREAD(INTGEMM_AVX2)
public:
INTGEMM_QUANTIZE(INTGEMM_AVX2)
// Currently A is prepared by quantization but this could theoretically change.
INTGEMM_AVX2 static inline void PrepareA(const float *input, uint8_t *output, float quant_mult, Index rows, Index cols) {
QuantizeU(input, output, quant_mult, rows * cols);
}
// Just quantize everything in order.
INTGEMM_AVX2 static void QuantizeU(const float *input, uint8_t *output, float quant_mult, Index size) {
assert(size % 32 == 0);
assert(reinterpret_cast<uintptr_t>(input) % 32 == 0);
FRegister q = set1_ps<FRegister>(quant_mult);
const float *end = input + size;
for (; input != end; input += 32, output += 32) {
*reinterpret_cast<__m256i*>(output) = QuantizeTile8::ConsecutiveU(q, input);
}
}
// Tile size for B; B must be a multiple of this block size.
static const Index kBTileRow = 32;
static const Index kBTileCol = 8;
INTGEMM_PREPARE_B_8(INTGEMM_AVX2, AVX2::QuantizeTile8)
INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX2, int8_t)
INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX2, AVX2::QuantizeTile8, int8_t)
INTGEMM_AVX2 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
AVX2::SelectColumnsOfB((const __m256i*)input, (__m256i*)output, rows, cols_begin, cols_end);
}
INTGEMM_MULTIPLY8(__m256i, INTGEMM_AVX2, CPUType::AVX2)
INTGEMM_MULTIPLY8SHIFT(__m256i, INTGEMM_AVX2, CPUType::AVX2)
INTGEMM_PREPAREBIASFOR8(__m256i, INTGEMM_AVX2, CPUType::AVX2)
constexpr static const char *const kName = "8-bit AVX2";
static const CPUType kUses = CPUType::AVX2;
};
} // namespace AVX2
} // namespace intgemm
#endif
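Net effect of QuantizeTile16::Consecutive above, as a scalar model (illustrative only, not intgemm code): _mm256_packs_epi32 packs within 128-bit lanes, and the following _mm256_permute4x64_epi64 with 0xd8 restores consecutive order, so one tile is simply 16 saturated int16 values in input order.

#include <algorithm>
#include <cmath>
#include <cstdint>

void QuantizeTile16Model(const float* input, float quant_mult, int16_t out[16]) {
  for (int i = 0; i < 16; ++i) {
    // Multiply and round like _mm256_cvtps_epi32, then saturate like packs_epi32.
    float scaled = std::nearbyint(input[i] * quant_mult);
    scaled = std::min(32767.0f, std::max(-32768.0f, scaled));
    out[i] = static_cast<int16_t>(scaled);
  }
}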

411
third_party/intgemm/intgemm/avx512_gemm.h vendored
View file

@@ -1,411 +0,0 @@
#pragma once
#include "intgemm/intgemm_config.h"
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
#include "interleave.h"
#include "kernels.h"
#include "multiply.h"
#include "types.h"
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
/* AVX512 implementation.
* This uses INTGEMM_AVX512BW, INTGEMM_AVX512DQ, and might use AVX512VL
* That means it supports mainstream CPUs with AVX512, starting with Skylake
* Xeons.
* It does not support any Knights / Xeon Phi processors.
*
* All memory must be 64-byte aligned.
*/
namespace intgemm {
// AVX512 has combined collapse and store instructions:
// _mm512_mask_cvtsepi32_storeu_epi16
// _mm512_mask_cvtsepi32_storeu_epi8
// So conversion in memory uses these, but I also implement a wider version for
// rearranging B.
namespace AVX512BW {
// Load from memory, multiply, and convert to int32_t.
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_AVX512BW inline __m512i QuantizerGrab(const float *input, const __m512 quant_mult_reg) {
return kernels::quantize(loadu_ps<__m512>(input), quant_mult_reg);
}
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_SELECT_COL_B(INTGEMM_AVX512BW, __m512i)
// For PrepareB we want to read 8 columns at a time. When converting 32-bit
// floats to 8-bit values, that's 32 bytes of floats. But AVX512 is 64 bytes
// wide so it reads off the edge of the tile. We could expand the tile size
// but then the memory written to won't be contiguous anyway, so we'd be doing a
// scatter regardless. Easier to just read the 8 columns we want as 256 bits and
// concatenate them.
INTGEMM_AVX512DQ inline __m512 Concat(const __m256 first, const __m256 second) {
// INTGEMM_AVX512DQ but that goes with INTGEMM_AVX512BW anyway.
return _mm512_insertf32x8(_mm512_castps256_ps512(first), second, 1);
}
// Like QuantizerGrab, but allows 32-byte halves (i.e. 8 columns) to be controlled independently.
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_AVX512BW inline __m512i QuantizerGrabHalves(const float *input0, const float *input1, const __m512 quant_mult_reg) {
__m512 appended = Concat(loadu_ps<__m256>(input0), loadu_ps<__m256>(input1));
appended = _mm512_mul_ps(appended, quant_mult_reg);
return _mm512_cvtps_epi32(appended);
}
// These are only used for reshaping due to the AVX512 instructions
// _mm512_mask_cvtsepi32_storeu_epi16 and _mm512_mask_cvtsepi32_storeu_epi8
// being used for the quantizer.
class QuantizeTile16 {
public:
INTGEMM_AVX512BW static inline Register ConsecutiveWithWrapping(FRegister quant_mult, const float *input, Index cols_left, Index cols, Index row_step) {
auto input0 = input;
auto input1 = input + 16 + (cols_left <= 16 ? cols * (row_step - 1) : 0);
auto g0 = QuantizerGrabHalves(input0, input1, quant_mult);
auto g1 = QuantizerGrabHalves(input0 + 8, input1 + 8, quant_mult);
auto packed = packs_epi32(g0, g1);
return _mm512_permutex_epi64(packed, 0xd8 /* 0, 2, 1, 3 */);
}
INTGEMM_AVX512BW static inline Register ForReshape(FRegister quant_mult, const float *input, Index cols) {
__m512i g0 = QuantizerGrabHalves(input, input + 16 * cols, quant_mult);
__m512i g1 = QuantizerGrabHalves(input + 8 * cols, input + 24 * cols, quant_mult);
__m512i packed = packs_epi32(g0, g1);
// Permute within 256-bit lanes, so same as INTGEMM_AVX2
return _mm512_permutex_epi64(packed, 0xd8 /* 0, 2, 1, 3 */);
}
};
class QuantizeTile8 {
public:
INTGEMM_AVX512BW static inline Register ConsecutiveWithWrapping(FRegister quant_mult, const float *input, Index cols_left, Index cols, Index row_step) {
static const __m512i neg127 = _mm512_set1_epi8(-127);
static const __m512i shuffle_param = _mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
const float* inputs[4];
for (Index i = 0; i < sizeof(inputs) / sizeof(inputs[0]); ++i) {
while (cols_left < sizeof(Register) / sizeof(float)) {
input += cols * (row_step - 1);
cols_left += cols;
}
inputs[i] = input;
input += sizeof(Register) / sizeof(float);
cols_left -= sizeof(Register) / sizeof(float);
}
auto g0 = QuantizerGrab(inputs[0], quant_mult);
auto g1 = QuantizerGrab(inputs[1], quant_mult);
auto g2 = QuantizerGrab(inputs[2], quant_mult);
auto g3 = QuantizerGrab(inputs[3], quant_mult);
auto packed0 = packs_epi32(g0, g1);
auto packed1 = packs_epi32(g2, g3);
auto packed = _mm512_packs_epi16(packed0, packed1);
packed = _mm512_max_epi8(packed, neg127);
return _mm512_permutexvar_epi32(shuffle_param, packed);
}
INTGEMM_AVX512BW static inline __m512i ForReshape(FRegister quant_mult, const float *input, Index cols) {
// TODO: try alternative: _mm512_cvtsepi32_epi8 ?
const __m512i neg127 = _mm512_set1_epi8(-127);
// In reverse order: grabbing the first 32-bit values from each 128-bit register, then the second 32-bit values, etc.
const __m512i shuffle_param = _mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
// 32-bit format.
__m512i g0 = QuantizerGrabHalves(input, input + 2 * cols, quant_mult);
__m512i g1 = QuantizerGrabHalves(input + 16 * cols, input + 18 * cols, quant_mult);
__m512i g2 = QuantizerGrabHalves(input + 32 * cols, input + 34 * cols, quant_mult);
__m512i g3 = QuantizerGrabHalves(input + 48 * cols, input + 50 * cols, quant_mult);
// Pack 32-bit to 16-bit.
__m512i packed0 = packs_epi32(g0, g1);
__m512i packed1 = packs_epi32(g2, g3);
// Pack 16-bit to 8-bit.
__m512i packed = _mm512_packs_epi16(packed0, packed1);
// Ban -128.
packed = _mm512_max_epi8(packed, neg127);
// 0 1 2 3 16 17 18 19 32 33 34 35 48 49 50 51 4 5 6 7 20 21 22 23 36 37 38 39 52 53 54 55 8 9 10 11 24 25 26 27 40 41 42 43 56 57 58 59 12 13 14 15 28 29 30 31 44 45 46 47 60 61 62 63
return _mm512_permutexvar_epi32(shuffle_param, packed);
}
};
struct Kernels16 {
typedef int16_t Integer;
// Currently A is prepared by quantization but this could theoretically change.
// rows * cols must be a multiple of 16.
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_AVX512BW static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
Quantize(input, output, quant_mult, rows * cols);
}
// Technically output can be unaligned in Quantize.
// But then it will need to be aligned for Multiply.
// size must be a multiple of 16.
// Convert to 16-bit signed integers.
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_AVX512BW static void Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
assert(size % 16 == 0);
assert(reinterpret_cast<uintptr_t>(input) % 64 == 0);
// Fill with the quantization multiplier.
const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult);
const float *end = input + size;
for (; input != end; input += 16, output += 16) {
// There doesn't seem to be an unmasked version.
_mm512_mask_cvtsepi32_storeu_epi16(output, 0xffff, QuantizerGrab(input, quant_mult_reg));
}
}
// Tile size for B; B must be a multiple of this block size.
static const Index kBTileRow = 32;
static const Index kBTileCol = 8;
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_PREPARE_B_16(INTGEMM_AVX512BW, QuantizeTile16)
INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX512BW, int16_t)
INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX512BW, QuantizeTile16, int16_t)
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_AVX512BW static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows * 2, cols_begin, cols_end);
}
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_MULTIPLY16(__m512i, INTGEMM_AVX512BW, CPUType::AVX2)
constexpr static const char *const kName = "16-bit AVX512";
static const CPUType kUses = CPUType::AVX512BW;
};
struct Kernels8 {
typedef int8_t Integer;
// Currently A is prepared by quantization but this could theoretically change.
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_AVX512BW static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
Quantize(input, output, quant_mult, rows * cols);
}
private:
/* g++ (Ubuntu 7.4.0-1ubuntu1~18.04.1) 7.4.0 does not carry target attributes
* to the hidden function it creates in implementing #pragma omp parallel for.
* So intrinsics were not working inside the for loop when compiled with
* OMP. Also, passing register types across #pragma omp parallel for
* generated an internal compiler error.
* The problem does not occur in g++-8 (Ubuntu 8.3.0-6ubuntu1~18.04.1) 8.3.0.
* As a workaround, I split into #pragma omp parallel with boring types
* passed across the boundary then call this function with target attributes.
*/
INTGEMM_AVX512BW static void QuantizeThread(const float *input, int8_t *output, float quant_mult, std::size_t count) {
const __m512i neg127 = _mm512_set1_epi32(-127);
const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult);
const std::size_t kBatch = sizeof(__m512i) / sizeof(float);
#pragma omp for
for (std::size_t i = 0; i < count; i += kBatch) {
__m512i asint = QuantizerGrab(input + i, quant_mult_reg);
asint = _mm512_max_epi32(asint, neg127);
// There doesn't seem to be an unmasked version.
_mm512_mask_cvtsepi32_storeu_epi8(output + i, 0xffff, asint);
}
}
public:
// Technically output can be unaligned in Quantize.
// But then it will need to be aligned for Multiply.
// Convert to 8-bit signed integers.
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_AVX512BW static void Quantize(const float *input, int8_t *output, float quant_mult, Index size) {
assert(reinterpret_cast<uintptr_t>(input) % sizeof(__m512i) == 0);
const std::size_t kBatch = sizeof(__m512i) / sizeof(float);
std::size_t fast_size = (size & ~(kBatch - 1));
const float *fast_input_end = input + fast_size;
int8_t *fast_output_end = output + fast_size;
#pragma omp parallel
{
QuantizeThread(input, output, quant_mult, fast_size);
}
std::size_t overhang = size & (kBatch - 1);
if (!overhang) return; // We needed a branch anyway for the empty case.
const __m512i neg127 = _mm512_set1_epi32(-127);
const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult);
__m512i asint = QuantizerGrab(fast_input_end, quant_mult_reg);
asint = _mm512_max_epi32(asint, neg127);
_mm512_mask_cvtsepi32_storeu_epi8(fast_output_end, (1 << overhang) - 1, asint);
}
// Preparing A for the signed/unsigned multiplication. Using add 127
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_AVX512BW static inline void PrepareA(const float *input, uint8_t *output, float quant_mult, Index rows, Index cols) {
QuantizeU(input, output, quant_mult, rows * cols);
}
// Technically output can be unaligned in Quantize.
// But then it will need to be aligned for Multiply.
// Convert to 8-bit signed integers.
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_AVX512BW static void QuantizeU(const float *input, uint8_t *output, float quant_mult, Index size) {
assert(size % 16 == 0);
assert(reinterpret_cast<uintptr_t>(input) % 64 == 0);
const __m512i pos127 = _mm512_set1_epi32(127);
const __m512i zero = _mm512_setzero_si512();
const __m512 quant_mult_reg = _mm512_set1_ps(quant_mult);
const float *end = input + size;
for (; input < end; input += 16, output += 16) {
__m512i asint = QuantizerGrab(input, quant_mult_reg);
asint = _mm512_min_epi32(asint, pos127);
asint = _mm512_add_epi32(asint, pos127);
asint = _mm512_max_epi32(asint, zero);
_mm512_mask_cvtusepi32_storeu_epi8(output, 0xffff, asint);
}
}
// Tile size for B; B must be a multiple of this block size.
static const Index kBTileRow = 64;
static const Index kBTileCol = 8;
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_PREPARE_B_8(INTGEMM_AVX512BW, QuantizeTile8)
INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_AVX512BW, int8_t)
INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_AVX512BW, QuantizeTile8, int8_t)
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_AVX512BW static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
SelectColumnsOfB((const __m512i*)input, (__m512i*)output, rows, cols_begin, cols_end);
}
// Special AVX512 implementation due to having 32 registers (so I don't have to
// allocate registers manually) and no sign instruction.
template <typename Callback>
INTGEMM_AVX512BW static void Multiply(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) {
// This is copy-paste from Multiply8_SSE2OrAVX2.
assert(width % sizeof(Register) == 0);
assert(B_cols % 8 == 0);
assert(reinterpret_cast<uintptr_t>(A) % sizeof(Register) == 0);
assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0);
// There's 8 results for INTGEMM_AVX2 to handle.
auto callback_impl = callbacks::CallbackImpl<CPUType::AVX2, Callback>(callback);
const Index simd_width = width / sizeof(Register);
// Added for AVX512.
Register zeros = setzero_si<Register>();
// Go over 8 columns of B at a time.
#pragma omp for
for (Index B0_colidx = 0; B0_colidx < B_cols; B0_colidx += 8) {
const Register *B0_col = reinterpret_cast<const Register*>(B) + B0_colidx * simd_width;
// Process one row of A at a time. Doesn't seem to be faster to do multiple rows of A at once.
for (Index A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) {
// Iterate over shared (inner) dimension.
const Register *A_live = reinterpret_cast<const Register *>(A + A_rowidx * width);
const Register *A_end = A_live + simd_width;
const Register *B_live = B0_col;
// Do the first iteration to initialize the sums.
__m512i a = *A_live;
__mmask64 neg_mask = _mm512_test_epi8_mask(a, _mm512_set1_epi8(-128));
__m512i a_positive = _mm512_abs_epi8(a);
// These will be packed 16-bit integers containing sums for each column of B multiplied by the row of A.
Register sum0 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[0], neg_mask, zeros, B_live[0]));
Register sum1 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[1], neg_mask, zeros, B_live[1]));
Register sum2 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[2], neg_mask, zeros, B_live[2]));
Register sum3 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[3], neg_mask, zeros, B_live[3]));
Register sum4 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[4], neg_mask, zeros, B_live[4]));
Register sum5 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[5], neg_mask, zeros, B_live[5]));
Register sum6 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[6], neg_mask, zeros, B_live[6]));
Register sum7 = maddubs_epi16(a_positive, _mm512_mask_sub_epi8(B_live[7], neg_mask, zeros, B_live[7]));
++A_live;
B_live += 8;
// Use A as the loop variable so the add can be done where gcc likes it
// for branch prediction.
for (; A_live != A_end; ++A_live, B_live += 8) {
// Unique code here: can we do an inline function?
// Retrieve a. We will use this as the unsigned part.
a = *A_live;
// Retrieve the conveniently consecutive values of B.
__m512i b0 = *B_live;
__m512i b1 = *(B_live + 1);
__m512i b2 = *(B_live + 2);
__m512i b3 = *(B_live + 3);
__m512i b4 = *(B_live + 4);
__m512i b5 = *(B_live + 5);
__m512i b6 = *(B_live + 6);
__m512i b7 = *(B_live + 7);
// Get a mask where a is negative.
// Didn't seem to make a difference defining sign bits here vs at the top.
neg_mask = _mm512_test_epi8_mask(a, _mm512_set1_epi8(-128));
a_positive = _mm512_abs_epi8(a);
// Negate by subtracting from zero with a mask.
b0 = _mm512_mask_sub_epi8(b0, neg_mask, zeros, b0);
b1 = _mm512_mask_sub_epi8(b1, neg_mask, zeros, b1);
b2 = _mm512_mask_sub_epi8(b2, neg_mask, zeros, b2);
b3 = _mm512_mask_sub_epi8(b3, neg_mask, zeros, b3);
b4 = _mm512_mask_sub_epi8(b4, neg_mask, zeros, b4);
b5 = _mm512_mask_sub_epi8(b5, neg_mask, zeros, b5);
b6 = _mm512_mask_sub_epi8(b6, neg_mask, zeros, b6);
b7 = _mm512_mask_sub_epi8(b7, neg_mask, zeros, b7);
// The magic 8-bit multiply then horizontal sum into 16-bit.
b0 = _mm512_maddubs_epi16(a_positive, b0);
b1 = _mm512_maddubs_epi16(a_positive, b1);
b2 = _mm512_maddubs_epi16(a_positive, b2);
b3 = _mm512_maddubs_epi16(a_positive, b3);
b4 = _mm512_maddubs_epi16(a_positive, b4);
b5 = _mm512_maddubs_epi16(a_positive, b5);
b6 = _mm512_maddubs_epi16(a_positive, b6);
b7 = _mm512_maddubs_epi16(a_positive, b7);
// Now we have 16-bit results that are the sum of two multiplies.
// Choosing to approximate and do adds.
// Perhaps every so often we could accumulate by upcasting.
sum0 = _mm512_adds_epi16(sum0, b0);
sum1 = _mm512_adds_epi16(sum1, b1);
sum2 = _mm512_adds_epi16(sum2, b2);
sum3 = _mm512_adds_epi16(sum3, b3);
sum4 = _mm512_adds_epi16(sum4, b4);
sum5 = _mm512_adds_epi16(sum5, b5);
sum6 = _mm512_adds_epi16(sum6, b6);
sum7 = _mm512_adds_epi16(sum7, b7);
// Unique code ends: can we do an inline function?
}
// Upcast to 32-bit and horizontally add.
Register ones = set1_epi16<Register>(1);
sum0 = madd_epi16(sum0, ones);
sum1 = madd_epi16(sum1, ones);
sum2 = madd_epi16(sum2, ones);
sum3 = madd_epi16(sum3, ones);
sum4 = madd_epi16(sum4, ones);
sum5 = madd_epi16(sum5, ones);
sum6 = madd_epi16(sum6, ones);
sum7 = madd_epi16(sum7, ones);
Register pack0123 = Pack0123(sum0, sum1, sum2, sum3);
Register pack4567 = Pack0123(sum4, sum5, sum6, sum7);
auto total = PermuteSummer(pack0123, pack4567);
callback_impl.Run(total, callbacks::OutputBufferInfo(A_rowidx, B0_colidx, A_rows, B_cols));
}
}
}
INTGEMM_MULTIPLY8SHIFT(__m512i, INTGEMM_AVX512BW, CPUType::AVX2)
INTGEMM_PREPAREBIASFOR8(__m512i, INTGEMM_AVX512BW, CPUType::AVX2)
constexpr static const char *const kName = "8-bit AVX512BW";
static const CPUType kUses = CPUType::AVX512BW;
};
} // namespace AVX512BW
} // namespace intgemm
#endif
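The Multiply kernel above leans on the fact that _mm512_maddubs_epi16 multiplies an unsigned byte by a signed byte: it feeds the instruction |a| together with a copy of b that has been negated wherever a was negative. A scalar sketch of that identity follows (illustrative only, hypothetical name; banning -128 at quantization time keeps the negation of b representable in an int8 in the vectorized code).

#include <cstdint>
#include <cstdlib>

// a * b == |a| * (a < 0 ? -b : b); the left factor is then a valid unsigned
// byte for the maddubs-style multiply.
int32_t SignedProductViaUnsigned(int8_t a, int8_t b) {
  const uint8_t a_positive = static_cast<uint8_t>(std::abs(static_cast<int>(a)));
  const int32_t b_adjusted = (a < 0) ? -static_cast<int32_t>(b) : static_cast<int32_t>(b);
  return static_cast<int32_t>(a_positive) * b_adjusted;
}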

168
third_party/intgemm/intgemm/avx512vnni_gemm.h vendored
View file

@@ -1,168 +0,0 @@
#pragma once
#include "intgemm/intgemm_config.h"
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
#include "avx512_gemm.h"
#include "types.h"
namespace intgemm {
namespace AVX512VNNI {
// Workaround extra vmovdqa64 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94663
INTGEMM_AVX512VNNI static inline void VNNI8(__m512i &c, __m512i a, __m512i b) {
#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)
asm ("vpdpbusds %2, %1, %0" : "+x"(c) : "x"(a), "mx"(b));
#else
c = _mm512_dpbusds_epi32(c, a, b);
#endif
}
struct Kernels8 : public AVX512BW::Kernels8 {
template <typename Callback>
INTGEMM_AVX512VNNI static void Multiply(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) {
assert(width % sizeof(Register) == 0);
assert(B_cols % 8 == 0);
assert(reinterpret_cast<uintptr_t>(A) % sizeof(Register) == 0);
assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0);
auto callback_impl = callbacks::CallbackImpl<CPUType::AVX2, Callback>(callback);
const Index simd_width = width / sizeof(Register);
Register zeros = setzero_si<Register>();
// Go over 8 columns of B at a time.
#pragma omp for
for (Index B0_colidx = 0; B0_colidx < B_cols; B0_colidx += 8) {
const Register *B0_col = reinterpret_cast<const Register*>(B) + B0_colidx * simd_width;
// Process one row of A at a time. Doesn't seem to be faster to do multiple rows of A at once.
for (Index A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) {
// Iterate over shared (inner) dimension.
const Register *A_live = reinterpret_cast<const Register *>(A + A_rowidx * width);
const Register *A_end = A_live + simd_width;
const Register *B_live = B0_col;
// TODO: separate first step.
Register sum0 = zeros, sum1 = zeros, sum2 = zeros, sum3 = zeros, sum4 = zeros, sum5 = zeros, sum6 = zeros, sum7 = zeros;
for (; A_live != A_end; ++A_live, B_live += 8) {
Register a = *A_live;
// Retrieve the conveniently consecutive values of B.
Register b0 = *B_live;
Register b1 = *(B_live + 1);
Register b2 = *(B_live + 2);
Register b3 = *(B_live + 3);
Register b4 = *(B_live + 4);
Register b5 = *(B_live + 5);
Register b6 = *(B_live + 6);
Register b7 = *(B_live + 7);
// Get a mask where a is negative.
__mmask64 neg_mask = _mm512_test_epi8_mask(a, _mm512_set1_epi8(-128));
Register a_positive = _mm512_abs_epi8(a);
// Negate by subtracting from zero with a mask.
b0 = _mm512_mask_sub_epi8(b0, neg_mask, zeros, b0);
b1 = _mm512_mask_sub_epi8(b1, neg_mask, zeros, b1);
b2 = _mm512_mask_sub_epi8(b2, neg_mask, zeros, b2);
b3 = _mm512_mask_sub_epi8(b3, neg_mask, zeros, b3);
b4 = _mm512_mask_sub_epi8(b4, neg_mask, zeros, b4);
b5 = _mm512_mask_sub_epi8(b5, neg_mask, zeros, b5);
b6 = _mm512_mask_sub_epi8(b6, neg_mask, zeros, b6);
b7 = _mm512_mask_sub_epi8(b7, neg_mask, zeros, b7);
VNNI8(sum0, a_positive, b0);
VNNI8(sum1, a_positive, b1);
VNNI8(sum2, a_positive, b2);
VNNI8(sum3, a_positive, b3);
VNNI8(sum4, a_positive, b4);
VNNI8(sum5, a_positive, b5);
VNNI8(sum6, a_positive, b6);
VNNI8(sum7, a_positive, b7);
}
Register pack0123 = Pack0123(sum0, sum1, sum2, sum3);
Register pack4567 = Pack0123(sum4, sum5, sum6, sum7);
auto total = PermuteSummer(pack0123, pack4567);
callback_impl.Run(total, callbacks::OutputBufferInfo(A_rowidx, B0_colidx, A_rows, B_cols));
}
}
}
template <typename Callback>
INTGEMM_AVX512VNNI static void Multiply8Shift(const uint8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) {
assert(width % sizeof(Register) == 0);
assert(B_cols % 8 == 0);
assert(reinterpret_cast<uintptr_t>(A) % sizeof(Register) == 0);
assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0);
auto callback_impl = callbacks::CallbackImpl<CPUType::AVX2, Callback>(callback);
const Index simd_width = width / sizeof(Register);
Register zeros = setzero_si<Register>();
// Go over 8 columns of B at a time.
#pragma omp for
for (Index B0_colidx = 0; B0_colidx < B_cols; B0_colidx += 8) {
const Register *B0_col = reinterpret_cast<const Register*>(B) + B0_colidx * simd_width;
// Process one row of A at a time. Doesn't seem to be faster to do multiple rows of A at once.
for (Index A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) {
// Iterate over shared (inner) dimension.
const Register *A_live = reinterpret_cast<const Register *>(A + A_rowidx * width);
const Register *A_end = A_live + simd_width;
const Register *B_live = B0_col;
// TODO: separate first step.
Register sum0 = zeros, sum1 = zeros, sum2 = zeros, sum3 = zeros, sum4 = zeros, sum5 = zeros, sum6 = zeros, sum7 = zeros;
for (; A_live != A_end; ++A_live, B_live += 8) {
Register a = *A_live;
// Multiply-add
VNNI8(sum0, a, *B_live);
VNNI8(sum1, a, *(B_live + 1));
VNNI8(sum2, a, *(B_live + 2));
VNNI8(sum3, a, *(B_live + 3));
VNNI8(sum4, a, *(B_live + 4));
VNNI8(sum5, a, *(B_live + 5));
VNNI8(sum6, a, *(B_live + 6));
VNNI8(sum7, a, *(B_live + 7));
}
Register pack0123 = Pack0123(sum0, sum1, sum2, sum3);
Register pack4567 = Pack0123(sum4, sum5, sum6, sum7);
auto total = PermuteSummer(pack0123, pack4567);
callback_impl.Run(total, callbacks::OutputBufferInfo(A_rowidx, B0_colidx, A_rows, B_cols));
}
}
}
template <typename Callback>
INTGEMM_AVX512VNNI static void PrepareBias(const int8_t *B, Index width, Index B_cols, Callback callback) {
assert(width % sizeof(Register) == 0);
assert(B_cols % 8 == 0);
assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0);
auto callback_impl = callbacks::CallbackImpl<CPUType::AVX2, Callback>(callback);
Index simd_width = width / sizeof(Register);
Register zeros = setzero_si<Register>();
const Register a = set1_epi8<Register>(1);
// Go over 8 columns of B at a time.
#pragma omp for
for (Index B0_colidx = 0; B0_colidx < B_cols; B0_colidx += 8) {
const Register *B0_col = reinterpret_cast<const Register*>(B) + B0_colidx * simd_width;
const Register *B_live = B0_col; // Kept so this code mirrors the functions above as closely as possible
const Register *B_end = B_live + simd_width*8;
// TODO: separate first step.
Register sum0 = zeros, sum1 = zeros, sum2 = zeros, sum3 = zeros, sum4 = zeros, sum5 = zeros, sum6 = zeros, sum7 = zeros;
for (; B_live != B_end; B_live += 8) {
// Retrieve the conveniently consecutive values of B.
VNNI8(sum0, a, *B_live);
VNNI8(sum1, a, *(B_live + 1));
VNNI8(sum2, a, *(B_live + 2));
VNNI8(sum3, a, *(B_live + 3));
VNNI8(sum4, a, *(B_live + 4));
VNNI8(sum5, a, *(B_live + 5));
VNNI8(sum6, a, *(B_live + 6));
VNNI8(sum7, a, *(B_live + 7));
}
Register pack0123 = Pack0123(sum0, sum1, sum2, sum3);
Register pack4567 = Pack0123(sum4, sum5, sum6, sum7);
auto total = PermuteSummer(pack0123, pack4567);
callback_impl.Run(total, callbacks::OutputBufferInfo(0, B0_colidx, 1, B_cols));
}
}
constexpr static const char *const kName = "8-bit AVX512VNNI";
static const CPUType kUses = CPUType::AVX512VNNI;
};
} // namespace AVX512VNNI
} // namespace intgemm
#endif
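For reference, one 32-bit lane of the vpdpbusds instruction wrapped by VNNI8 above behaves roughly like the sketch below: four unsigned-by-signed byte products are summed into the accumulator, which is why Multiply8Shift can feed it the shifted unsigned A directly. Illustrative only, hypothetical name; the real instruction also saturates the accumulation, which this sketch omits.

#include <cstdint>

int32_t Vnni8Lane(int32_t c, const uint8_t a[4], const int8_t b[4]) {
  for (int i = 0; i < 4; ++i) {
    c += static_cast<int32_t>(a[i]) * static_cast<int32_t>(b[i]);
  }
  return c;
}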

28
third_party/intgemm/intgemm/callbacks.h vendored
View file

@@ -1,28 +0,0 @@
#pragma once
#include "callbacks/configs.h"
#include "callbacks/output_buffer_info.h"
#include "intgemm/intgemm_config.h"
#include "intrinsics.h"
#include "kernels.h"
#include "types.h"
#include "utils.h"
#include "vec_traits.h"
#define CALLBACKS_THIS_IS_SSE2
#include "callbacks/implementations.inl"
#undef CALLBACKS_THIS_IS_SSE2
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
#define CALLBACKS_THIS_IS_AVX2
#include "callbacks/implementations.inl"
#undef CALLBACKS_THIS_IS_AVX2
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
#define CALLBACKS_THIS_IS_AVX512BW
#include "callbacks/implementations.inl"
#undef CALLBACKS_THIS_IS_AVX512BW
#endif

View file

@@ -1,73 +0,0 @@
#pragma once
#include <tuple>
namespace intgemm {
namespace callbacks {
/*
* Sequence meta-config
*/
template <typename... Configs>
std::tuple<Configs...> Sequence(const Configs&... configs) {
return std::make_tuple(configs...);
}
/*
* Configs
*/
struct Dummy {
};
template <typename Type>
struct Write {
Type* output_addr;
Write(Type* output_addr) : output_addr(output_addr) {}
};
struct Unquantize {
float unquant_mult;
Unquantize(float unquant_mult) : unquant_mult(unquant_mult) {}
};
struct UnquantizeAndWrite {
float unquant_mult;
float* output_addr;
UnquantizeAndWrite(float unquant_mult, float* output_addr) : unquant_mult(unquant_mult), output_addr(output_addr) {}
};
struct UnquantizeAndWriteRelu {
float unquant_mult;
float* output_addr;
UnquantizeAndWriteRelu(float unquant_mult, float* output_addr) : unquant_mult(unquant_mult), output_addr(output_addr) {}
};
struct AddBiasAndWrite {
const int* bias_addr;
int* output_addr;
AddBiasAndWrite(const int* bias_addr, int* output_addr) : bias_addr(bias_addr), output_addr(output_addr) {}
};
struct UnquantizeAndAddBiasAndWrite {
float unquant_mult;
const float* bias_addr;
float* output_addr;
UnquantizeAndAddBiasAndWrite(float unquant_mult, const float* bias_addr, float* output_addr) : unquant_mult(unquant_mult), bias_addr(bias_addr), output_addr(output_addr) {}
};
struct UnquantizeAndAddBiasAndWriteRelu {
float unquant_mult;
const float* bias_addr;
float* output_addr;
UnquantizeAndAddBiasAndWriteRelu(float unquant_mult, const float* bias_addr, float* output_addr) : unquant_mult(unquant_mult), bias_addr(bias_addr), output_addr(output_addr) {}
};
}
}
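Usage sketch for the configs above (illustrative only; the wrapper function is hypothetical): they are plain parameter holders, Multiply instantiates the matching CallbackImpl for the active architecture, and Sequence can chain several of them into a pipeline, as in the callbacks::UnquantizeAndAddBiasAndWrite calls in the benchmarks earlier in this diff.

// Build a callback config the way the benchmarks do; it would be passed as
// the final argument of an Int8::Multiply / Multiply8Shift call.
void MakeCallbackConfig(float unquant_mult, const float* bias, float* output) {
  auto write_cb = intgemm::callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult, bias, output);
  (void)write_cb;
}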

View file

@@ -1,258 +0,0 @@
/* This file is included multiple times, once per architecture. */
#if defined(CALLBACKS_THIS_IS_SSE2)
#define CPU_NAME SSE2
#define INTGEMM_TARGET INTGEMM_SSE2
#elif defined(CALLBACKS_THIS_IS_AVX2)
#define CPU_NAME AVX2
#define INTGEMM_TARGET INTGEMM_AVX2
#elif defined(CALLBACKS_THIS_IS_AVX512BW)
#define CPU_NAME AVX512BW
#define INTGEMM_TARGET INTGEMM_AVX512BW
#else
#error "Only SSE2, AVX2 and AVX512BW are supported"
#endif
#if defined(CALLBACKS_THIS_IS_SSE2)
#define vi vector_t<CPUType::SSE2, int>
#define vf vector_t<CPUType::SSE2, float>
#define vd vector_t<CPUType::SSE2, double>
#else
#define vi vector_t<CPUType::AVX2, int>
#define vf vector_t<CPUType::AVX2, float>
#define vd vector_t<CPUType::AVX2, double>
#endif
/* Intel compiler 19.1.0.166 20191121 fails to link constructors with target attributes */
#ifdef __INTEL_COMPILER
#define INTGEMM_TARGET_CONSTRUCTOR
#else
#define INTGEMM_TARGET_CONSTRUCTOR INTGEMM_TARGET
#endif
namespace intgemm {
namespace callbacks {
template <CPUType CpuType, typename CallbackConfig>
class CallbackImpl;
}}
/*
* Callbacks implementations....
*/
namespace intgemm {
namespace callbacks {
/*
* Sequence
*/
template <typename... Configs>
class CallbackImpl<CPUType::CPU_NAME, std::tuple<Configs...>> {
public:
explicit CallbackImpl(const std::tuple<Configs...>& configs) : callbacks(init_callbacks(configs, make_sequence<sizeof...(Configs)>())) {}
INTGEMM_TARGET void Run(vi input, const OutputBufferInfo& info) {
run_callbacks(input, info, callbacks, make_sequence<sizeof...(Configs)>());
}
private:
using CallbacksTupleType = std::tuple<CallbackImpl<CPUType::CPU_NAME, Configs>...>;
CallbacksTupleType callbacks;
template <unsigned... Indices>
CallbacksTupleType init_callbacks(const std::tuple<Configs...>& configs, sequence<Indices...>) {
return std::make_tuple(CallbackImpl<CPUType::CPU_NAME, typename std::tuple_element<Indices, std::tuple<Configs...>>::type>(std::get<Indices>(configs))...);
}
#define RUN_CALLBACKS_PIPELINE_IMPL(vtype) \
template <unsigned FirstIndex> \
INTGEMM_TARGET static inline void run_callbacks(vtype input, const OutputBufferInfo& info, CallbacksTupleType& tuple, sequence<FirstIndex>) { \
std::get<FirstIndex>(tuple)(input, info); \
} \
template <unsigned FirstIndex, unsigned SecondIndex, unsigned... RestIndices> \
INTGEMM_TARGET static inline void run_callbacks(vtype input, const OutputBufferInfo& info, CallbacksTupleType& tuple, sequence<FirstIndex, SecondIndex, RestIndices...>) { \
auto output = std::get<FirstIndex>(tuple)(input, info); \
run_callbacks(output, info, tuple, sequence<SecondIndex, RestIndices...>()); \
}
RUN_CALLBACKS_PIPELINE_IMPL(vi)
RUN_CALLBACKS_PIPELINE_IMPL(vf)
RUN_CALLBACKS_PIPELINE_IMPL(vd)
#undef RUN_CALLBACKS_PIPELINE_IMPL
};
/*
* Dummy
*/
template <> class CallbackImpl<CPUType::CPU_NAME, Dummy> {
public:
explicit INTGEMM_TARGET_CONSTRUCTOR CallbackImpl(const Dummy&) {}
INTGEMM_TARGET void Run(vi, const OutputBufferInfo&) {}
};
/*
* Write
*/
template <typename Type>
class CallbackImpl<CPUType::CPU_NAME, Write<Type>> {
public:
explicit INTGEMM_TARGET_CONSTRUCTOR CallbackImpl(const Write<Type>& config) : config(config) {}
INTGEMM_TARGET void Run(vector_t<CPUType::CPU_NAME, Type> input, const OutputBufferInfo& info) {
kernels::write(input, config.output_addr, info.row_idx * info.cols + info.col_idx);
}
private:
Write<Type> config;
};
/*
* Unquantize
*/
template <> class CallbackImpl<CPUType::CPU_NAME, Unquantize> {
public:
explicit INTGEMM_TARGET_CONSTRUCTOR CallbackImpl(const Unquantize& config) : config(config) {
unquant_mult = set1_ps<vf>(config.unquant_mult);
}
INTGEMM_TARGET vf Run(vi input, const OutputBufferInfo&) {
return kernels::unquantize(input, unquant_mult);
}
private:
vf unquant_mult;
Unquantize config;
};
/*
* UnquantizeAndWrite
*/
template <> class CallbackImpl<CPUType::CPU_NAME, UnquantizeAndWrite> {
public:
explicit INTGEMM_TARGET_CONSTRUCTOR CallbackImpl(const UnquantizeAndWrite& config) : config(config) {
unquant_mult = set1_ps<vf>(config.unquant_mult);
}
INTGEMM_TARGET void Run(vi input, const OutputBufferInfo& info) {
// Work around a gcc 5 internal compiler error when reading register members in debug builds.
vf mult_reg;
#if !defined(__OPTIMIZE__) && (__GNUC__ == 5) && !defined(__clang__) && !defined(__INTEL_COMPILER)
asm ("vmovdqa %1, %0" : "=x" (mult_reg) : "m" (unquant_mult));
#else
mult_reg = unquant_mult;
#endif
auto result = kernels::unquantize(input, mult_reg);
kernels::write(result, config.output_addr, info.row_idx * info.cols + info.col_idx);
}
private:
vf unquant_mult;
UnquantizeAndWrite config;
};
/*
* UnquantizeAndWriteRelu
*/
template <> class CallbackImpl<CPUType::CPU_NAME, UnquantizeAndWriteRelu> {
public:
explicit INTGEMM_TARGET_CONSTRUCTOR CallbackImpl(const UnquantizeAndWriteRelu& config) : config(config) {
unquant_mult = set1_ps<vf>(config.unquant_mult);
}
INTGEMM_TARGET void Run(vi input, const OutputBufferInfo& info) {
// Work around a gcc 5 internal compiler error when reading register members in debug builds.
vf mult_reg;
#if !defined(__OPTIMIZE__) && (__GNUC__ == 5) && !defined(__clang__) && !defined(__INTEL_COMPILER)
asm ("vmovdqa %1, %0" : "=x" (mult_reg) : "m" (unquant_mult));
#else
mult_reg = unquant_mult;
#endif
auto result = kernels::relu<float>(kernels::unquantize(input, mult_reg));
kernels::write(result, config.output_addr, info.row_idx * info.cols + info.col_idx);
}
private:
vf unquant_mult;
UnquantizeAndWriteRelu config;
};
/*
* AddBiasAndWrite
*/
template <> class CallbackImpl<CPUType::CPU_NAME, AddBiasAndWrite> {
public:
explicit INTGEMM_TARGET_CONSTRUCTOR CallbackImpl(const AddBiasAndWrite& config) : config(config) {}
INTGEMM_TARGET void Run(vi input, const OutputBufferInfo& info) {
auto result = kernels::add_bias(input, config.bias_addr, info.col_idx);
kernels::write(result, config.output_addr, info.row_idx * info.cols + info.col_idx);
}
private:
AddBiasAndWrite config;
};
/*
* UnquantizeAndAddBiasAndWrite
*/
template <> class CallbackImpl<CPUType::CPU_NAME, UnquantizeAndAddBiasAndWrite> {
public:
explicit INTGEMM_TARGET_CONSTRUCTOR CallbackImpl(const UnquantizeAndAddBiasAndWrite& config) : config(config) {
unquant_mult = set1_ps<vf>(config.unquant_mult);
}
INTGEMM_TARGET void Run(vi input, const OutputBufferInfo& info) {
// Work around a gcc 5 internal compiler error when reading register members in debug builds.
vf mult_reg;
#if !defined(__OPTIMIZE__) && (__GNUC__ == 5) && !defined(__clang__) && !defined(__INTEL_COMPILER)
asm ("vmovdqa %1, %0" : "=x" (mult_reg) : "m" (unquant_mult));
#else
mult_reg = unquant_mult;
#endif
auto result = kernels::unquantize(input, mult_reg);
result = kernels::add_bias(result, config.bias_addr, info.col_idx);
kernels::write(result, config.output_addr, info.row_idx * info.cols + info.col_idx);
}
private:
vf unquant_mult;
UnquantizeAndAddBiasAndWrite config;
};
/*
* UnquantizeAndAddBiasAndWriteRelu
*/
template <> class CallbackImpl<CPUType::CPU_NAME, UnquantizeAndAddBiasAndWriteRelu> {
public:
explicit INTGEMM_TARGET_CONSTRUCTOR CallbackImpl(const UnquantizeAndAddBiasAndWriteRelu& config) : config(config) {
unquant_mult = set1_ps<vf>(config.unquant_mult);
}
INTGEMM_TARGET void Run(vi input, const OutputBufferInfo& info) {
// Work around a gcc 5 internal compiler error when reading register members in debug builds.
vf mult_reg;
#if !defined(__OPTIMIZE__) && (__GNUC__ == 5) && !defined(__clang__) && !defined(__INTEL_COMPILER)
asm ("vmovdqa %1, %0" : "=x" (mult_reg) : "m" (unquant_mult));
#else
mult_reg = unquant_mult;
#endif
auto result = kernels::unquantize(input, mult_reg);
result = kernels::add_bias(result, config.bias_addr, info.col_idx);
result = kernels::relu<float>(result);
kernels::write(result, config.output_addr, info.row_idx * info.cols + info.col_idx);
}
private:
vf unquant_mult;
UnquantizeAndAddBiasAndWriteRelu config;
};
}
}
#undef CPU_NAME
#undef INTGEMM_TARGET
#undef vi
#undef vf
#undef vd

View file

@@ -1,20 +0,0 @@
#pragma once
#include "../types.h"
namespace intgemm {
namespace callbacks {
struct OutputBufferInfo {
Index row_idx;
Index col_idx;
Index rows; // = A_rows
Index cols; // = B_cols
OutputBufferInfo(Index row_idx, Index col_idx, Index rows, Index cols)
: row_idx(row_idx), col_idx(col_idx), rows(rows), cols(cols) {}
};
}
}

317
third_party/intgemm/intgemm/interleave.h vendored
View file

@@ -1,317 +0,0 @@
#pragma once
#include "intgemm/intgemm_config.h"
#include "intrinsics.h"
#include "types.h"
#include <algorithm>
#include <cassert>
namespace intgemm {
/*
* Interleave vectors.
*/
#define INTGEMM_INTERLEAVE_N(target, type, N) \
target static inline void Interleave##N(type &first, type &second) { \
type temp = unpacklo_epi##N(first, second); \
second = unpackhi_epi##N(first, second); \
first = temp; \
}
#define INTGEMM_INTERLEAVE(target, type) \
INTGEMM_INTERLEAVE_N(target, type, 8) \
INTGEMM_INTERLEAVE_N(target, type, 16) \
INTGEMM_INTERLEAVE_N(target, type, 32) \
INTGEMM_INTERLEAVE_N(target, type, 64)
INTGEMM_INTERLEAVE(INTGEMM_SSE2, __m128i)
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
INTGEMM_INTERLEAVE(INTGEMM_AVX2, __m256i)
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
INTGEMM_INTERLEAVE(INTGEMM_AVX512BW, __m512i)
#endif
/*
* Swap vectors.
*/
#define INTGEMM_SWAP(target, Register) \
target static inline void Swap(Register &a, Register &b) { \
Register tmp = a; \
a = b; \
b = tmp; \
}
INTGEMM_SWAP(INTGEMM_SSE2, __m128i)
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
INTGEMM_SWAP(INTGEMM_AVX2, __m256i)
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_SWAP(INTGEMM_AVX512BW, __m512i)
#endif
/* Transpose registers containing 8 packed 16-bit integers.
* Each 128-bit lane is handled independently.
*/
#define INTGEMM_TRANSPOSE16(target, Register) \
target static inline void Transpose16InLane(Register &r0, Register &r1, Register &r2, Register &r3, Register &r4, Register &r5, Register &r6, Register &r7) { \
/* r0: columns 0 1 2 3 4 5 6 7 from row 0
r1: columns 0 1 2 3 4 5 6 7 from row 1*/ \
Interleave16(r0, r1); \
Interleave16(r2, r3); \
Interleave16(r4, r5); \
Interleave16(r6, r7); \
/* r0: columns 0 0 1 1 2 2 3 3 from rows 0 and 1
r1: columns 4 4 5 5 6 6 7 7 from rows 0 and 1
r2: columns 0 0 1 1 2 2 3 3 from rows 2 and 3
r3: columns 4 4 5 5 6 6 7 7 from rows 2 and 3
r4: columns 0 0 1 1 2 2 3 3 from rows 4 and 5
r5: columns 4 4 5 5 6 6 7 7 from rows 4 and 5
r6: columns 0 0 1 1 2 2 3 3 from rows 6 and 7
r7: columns 4 4 5 5 6 6 7 7 from rows 6 and 7*/ \
Interleave32(r0, r2); \
Interleave32(r1, r3); \
Interleave32(r4, r6); \
Interleave32(r5, r7); \
/* r0: columns 0 0 0 0 1 1 1 1 from rows 0, 1, 2, and 3
r1: columns 4 4 4 4 5 5 5 5 from rows 0, 1, 2, and 3
r2: columns 2 2 2 2 3 3 3 3 from rows 0, 1, 2, and 3
r3: columns 6 6 6 6 7 7 7 7 from rows 0, 1, 2, and 3
r4: columns 0 0 0 0 1 1 1 1 from rows 4, 5, 6, and 7
r5: columns 4 4 4 4 5 5 5 5 from rows 4, 5, 6, and 7
r6: columns 2 2 2 2 3 3 3 3 from rows 4, 5, 6, and 7
r7: columns 6 6 6 6 7 7 7 7 from rows 4, 5, 6, and 7*/ \
Interleave64(r0, r4); \
Interleave64(r1, r5); \
Interleave64(r2, r6); \
Interleave64(r3, r7); \
/* r0: columns 0 0 0 0 0 0 0 0 from rows 0 through 7
r1: columns 4 4 4 4 4 4 4 4 from rows 0 through 7
r2: columns 2 2 2 2 2 2 2 2 from rows 0 through 7
r3: columns 6 6 6 6 6 6 6 6 from rows 0 through 7
r4: columns 1 1 1 1 1 1 1 1 from rows 0 through 7
r5: columns 5 5 5 5 5 5 5 5 from rows 0 through 7*/ \
/* Empirically gcc is able to remove these movs and just rename the outputs of Interleave64. */ \
Swap(r1, r4); \
Swap(r3, r6); \
}
INTGEMM_TRANSPOSE16(INTGEMM_SSE2, __m128i)
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
INTGEMM_TRANSPOSE16(INTGEMM_AVX2, __m256i)
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
/* Only INTGEMM_AVX512F is necessary but due to GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_TRANSPOSE16(INTGEMM_AVX512BW, __m512i)
#endif
/* Transpose registers containing 16 packed 8-bit integers.
* Each 128-bit lane is handled independently.
*/
template <class Register> static inline void Transpose8InLane(
Register &r0, Register &r1, Register &r2, Register &r3, Register &r4, Register &r5, Register &r6, Register &r7,
Register &r8, Register &r9, Register &r10, Register &r11, Register &r12, Register &r13, Register &r14, Register &r15) {
// Get 8-bit values to 16-bit values so they can travel together.
Interleave8(r0, r1);
// r0: columns 0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 from rows 0 and 1.
// r1: columns 8 8 9 9 10 10 11 11 12 12 13 13 14 14 15 15 from rows 0 and 1.
Interleave8(r2, r3);
// r2: columns 0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 from rows 2 and 3.
Interleave8(r4, r5);
Interleave8(r6, r7);
Interleave8(r8, r9);
Interleave8(r10, r11);
Interleave8(r12, r13);
Interleave8(r14, r15);
Transpose16InLane(r0, r2, r4, r6, r8, r10, r12, r14);
Transpose16InLane(r1, r3, r5, r7, r9, r11, r13, r15);
// Permute into correct order. This is free because the outputs just get permuted.
Register tmp;
tmp = r2;
r2 = r4;
r4 = r8;
r8 = r1;
r1 = tmp;
tmp = r3;
r3 = r6;
r6 = r12;
r12 = r9;
r9 = tmp;
tmp = r5;
r5 = r10;
r10 = tmp;
tmp = r7;
r7 = r14;
r14 = r13;
r13 = r11;
r11 = tmp;
}
// PREPARE B: quantize and rearrange. B is presumed to be constant parameters,
// so we can take our time rearranging it in order to save time during the multiply.
//
// We presume B starts in row-major order.
//
// In INTGEMM_AVX2, a register holds 32 8-bit values or 16 16-bit values and we want
// that many values from the same column in the register.
//
// The multiplier reads 8 rows at a time and we want these reads to be
// contiguous.
//
// Each 8x32 (for 8-bit) or 8x16 (for 16-bit) tile of B is transposed.
// The tiles are stored in column major order.
//
// For INTGEMM_AVX2, this matrix shows what index each value of B will be stored at:
// 0 16 ... 240
// 1 17 ... 241
// 2 18 ... 242
// 3 19 ... 243
// 4 20 ... 244
// 5 21 ... 245
// 6 22 ... 246
// 7 23 ... 247
// 8 24 ... 248
// 9 25 ... 249
// 10 26 ... 250
// 11 27 ... 251
// 12 28 ... 252
// 13 29 ... 253
// 14 30 ... 254
// 15 31 ... 255
// 256 272
// 257 273
// ... ...
#define INTGEMM_PREPARE_B_8(target, QuantClass) \
target static inline void PrepareB(const float *input, int8_t *output_shadow, float quant_mult, Index rows, Index cols) { \
FRegister q = set1_ps<FRegister>(quant_mult); \
/* Currently all multipliers have a stride of 8 columns.*/ \
const Index kColStride = 8; \
assert(cols % kColStride == 0); \
assert(rows % sizeof(Register) == 0); \
assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \
Register *output = reinterpret_cast<Register*>(output_shadow); \
assert(reinterpret_cast<uintptr_t>(output) % sizeof(Register) == 0); \
for (Index c = 0; c < cols; c += kColStride) { \
for (Index r = 0; r < rows; r += sizeof(Register), output += 8) { \
/* Quantize and perform a transpose with height sizeof(Register) and width 8. \
This isn't quite Transpose8InLane because it's half the number of columns, \
so each register starts with two rows instead of being one row. \
The quantizers know to skip a row.*/ \
output[0] = QuantClass::ForReshape(q, input + cols * (r ) + c, cols); \
output[1] = QuantClass::ForReshape(q, input + cols * (r + 1) + c, cols); \
output[2] = QuantClass::ForReshape(q, input + cols * (r + 4) + c, cols); \
output[3] = QuantClass::ForReshape(q, input + cols * (r + 5) + c, cols); \
output[4] = QuantClass::ForReshape(q, input + cols * (r + 8) + c, cols); \
output[5] = QuantClass::ForReshape(q, input + cols * (r + 9) + c, cols); \
output[6] = QuantClass::ForReshape(q, input + cols * (r + 12) + c, cols); \
output[7] = QuantClass::ForReshape(q, input + cols * (r + 13) + c, cols); \
Interleave8(output[0], output[1]); \
Interleave8(output[2], output[3]); \
Interleave8(output[4], output[5]); \
Interleave8(output[6], output[7]); \
Transpose16InLane(output[0], output[1], output[2], output[3], output[4], output[5], output[6], output[7]); \
} \
} \
}
#define INTGEMM_PREPARE_B_16(target, QuantClass) \
target static inline void PrepareB(const float *input, int16_t *output_shadow, float quant_mult, Index rows, Index cols) { \
FRegister q = set1_ps<FRegister>(quant_mult); \
assert(cols % 8 == 0); \
assert(rows % (sizeof(Register) / sizeof(int16_t)) == 0); \
assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \
Register *output = reinterpret_cast<Register*>(output_shadow); \
assert(reinterpret_cast<uintptr_t>(output) % sizeof(Register) == 0); \
for (Index c = 0; c < cols; c += 8) { \
for (Index r = 0; r < rows; r += (sizeof(Register) / sizeof(int16_t)), output += 8) { \
/* gcc unrolls this loop and uses registers for output[k]*/ \
for (Index k = 0; k < 8; ++k) { \
output[k] = QuantClass::ForReshape(q, input + cols * (r + k) + c, cols); \
} \
Transpose16InLane(output[0], output[1], output[2], output[3], output[4], output[5], output[6], output[7]); \
} \
} \
}
/*
* Prepare B matrix.
* B matrix has to be transposed and quantized.
* Cols has to be a multiple of sizeof(Register) / sizeof(Integer).
*
* cols and rows describe size of transposed B.
*/
#define INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(target, Integer) \
target static inline void PrepareBQuantizedTransposed(const Integer* input, Integer* output, Index cols, Index rows) { \
const Index RegisterElems = sizeof(Register) / sizeof(Integer); \
const Index kColStride = 8; \
\
assert(cols % RegisterElems == 0); \
assert(rows % kColStride == 0); \
assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \
assert(reinterpret_cast<uintptr_t>(output) % sizeof(Register) == 0); \
\
Register* output_it = reinterpret_cast<Register*>(output); \
for (Index r = 0; r < rows; r += kColStride) \
for (Index c = 0; c < cols; c += RegisterElems) \
for (Index ri = 0; ri < 8; ++ri) \
*output_it++ = *reinterpret_cast<const Register*>(input + (r + ri) * cols + c); \
}
/*
* Prepare B matrix.
* B matrix has to be transposed.
* Cols has to be a multiple of sizeof(Register) / sizeof(float).
*
* cols and rows describe size of transposed B.
*/
#define INTGEMM_PREPARE_B_TRANSPOSED(target, Quantizer, Integer) \
target static inline void PrepareBTransposed(const float* input, Integer* output, float quant_mult, Index cols, Index rows) { \
const Index RegisterElemsInt = sizeof(Register) / sizeof(Integer); \
const Index kColStride = 8; \
\
assert(cols % (sizeof(Register) / sizeof(float)) == 0); \
assert(rows % kColStride == 0); \
assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \
assert(reinterpret_cast<uintptr_t>(output) % sizeof(Register) == 0); \
\
FRegister q = set1_ps<FRegister>(quant_mult); \
Register* output_it = reinterpret_cast<Register*>(output); \
Index r = 0; \
Index c = 0; \
while (r < rows) { \
for (Index ri = 0; ri < 8; ++ri) \
*output_it++ = Quantizer::ConsecutiveWithWrapping(q, input + (r + ri) * cols + c, cols - c, cols, 8); \
c += RegisterElemsInt; \
while (c >= cols) { \
r += kColStride; \
c -= cols; \
} \
} \
}
/* Select columns of B from PrepareB format to PrepareB format.
*/
#define INTGEMM_SELECT_COL_B(target, Register) \
target static inline void SelectColumnsOfB(const Register *input, Register *output, Index rows_bytes /* number of bytes in a row */, const Index *cols_begin, const Index *cols_end) { \
assert(rows_bytes % sizeof(Register) == 0); \
assert((cols_end - cols_begin) % 8 == 0); \
/* Do columns for multiples of 8.*/ \
Index register_rows = rows_bytes / sizeof(Register); \
const Register *starts[8]; \
for (; cols_begin != cols_end; cols_begin += 8) { \
for (Index k = 0; k < 8; ++k) { \
starts[k] = input + (cols_begin[k] & 7) + (cols_begin[k] & ~7) * register_rows; \
} \
for (Index r = 0; r < register_rows; ++r) { \
for (Index k = 0; k < 8; ++k) { \
*(output++) = *starts[k]; \
starts[k] += 8; \
} \
} \
} \
}
} // namespace intgemm

207
third_party/intgemm/intgemm/intgemm.cc vendored
View file

@@ -1,207 +0,0 @@
#if defined(WASM)
// No header for CPUID since it's hard-coded.
#elif defined(__INTEL_COMPILER)
#include <immintrin.h>
#elif defined(_MSC_VER)
#include <intrin.h>
#else
// Assume GCC and clang style.
#include <cpuid.h>
#endif
#include "intgemm.h"
#include "stats.h"
#include <stdio.h>
#include <stdlib.h>
namespace intgemm {
namespace {
// Return the maximum CPU model that's found and supported at compile time.
CPUType RealCPUID() {
#if defined(WASM)
// emscripten does SSE4.1 but we only use up to SSSE3.
return CPUType::SSSE3;
#elif defined(__INTEL_COMPILER)
# ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
if (_may_i_use_cpu_feature(_FEATURE_AVX512_VNNI)) return CPUType::AVX512VNNI;
# endif
# ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
if (_may_i_use_cpu_feature(_FEATURE_AVX512BW)) return CPUType::AVX512BW;
# endif
# ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
if (_may_i_use_cpu_feature(_FEATURE_AVX2)) return CPUType::AVX2;
# endif
if (_may_i_use_cpu_feature(_FEATURE_SSSE3)) return CPUType::SSSE3;
if (_may_i_use_cpu_feature(_FEATURE_SSE2)) return CPUType::SSE2;
return CPUType::UNSUPPORTED;
#else
// Not emscripten, not Intel compiler
# if defined(_MSC_VER)
int regs[4];
int &eax = regs[0], &ebx = regs[1], &ecx = regs[2], &edx = regs[3];
__cpuid(regs, 0);
int m = eax;
# else
/* gcc and clang.
* If intgemm is compiled by gcc 6.4.1 then dlopened into an executable
* compiled by gcc 7.3.0, there will be an undefined symbol __cpu_info.
* Work around this by calling the intrinsics more directly instead of
* __builtin_cpu_supports.
*
* clang 6.0.0-1ubuntu2 supports vnni but doesn't have
* __builtin_cpu_supports("avx512vnni")
* so use the hand-coded CPUID for clang.
*/
unsigned int m = __get_cpuid_max(0, 0);
unsigned int eax, ebx, ecx, edx;
# endif
if (m >= 7) {
# if defined(_MSC_VER)
__cpuid(regs, 7);
# else
__cpuid_count(7, 0, eax, ebx, ecx, edx);
# endif
# ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
if (ecx & (1 << 11)) return CPUType::AVX512VNNI;
# endif
# ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
if (ebx & (1 << 30)) return CPUType::AVX512BW;
# endif
# ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
if (ebx & (1 << 5)) return CPUType::AVX2;
# endif
}
if (m >= 1) {
# if defined(_MSC_VER)
__cpuid(regs, 1);
# else
__cpuid_count(1, 0, eax, ebx, ecx, edx);
# endif
if (ecx & (1 << 9)) return CPUType::SSSE3;
if (edx & (1 << 26)) return CPUType::SSE2;
}
return CPUType::UNSUPPORTED;
#endif
}
#ifdef INTGEMM_CPUID_ENVIRONMENT
CPUType EnvironmentCPUID() {
# if defined(_MSC_VER)
char env_override[11];
size_t len = 0;
if (getenv_s(&len, env_override, sizeof(env_override), "INTGEMM_CPUID")) return CPUType::AVX512VNNI;
if (!len) return CPUType::AVX512VNNI;
# else
const char *env_override = getenv("INTGEMM_CPUID");
if (!env_override) return CPUType::AVX512VNNI; /* This will be capped to actual ID */
# endif
if (!strcmp(env_override, "AVX512VNNI")) return CPUType::AVX512VNNI;
if (!strcmp(env_override, "AVX512BW")) return CPUType::AVX512BW;
if (!strcmp(env_override, "AVX2")) return CPUType::AVX2;
if (!strcmp(env_override, "SSSE3")) return CPUType::SSSE3;
if (!strcmp(env_override, "SSE2")) return CPUType::SSE2;
fprintf(stderr, "Ignoring unrecognized INTGEMM_CPUID %s\n", env_override);
return CPUType::AVX512VNNI;
}
#endif
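// Example (illustrative): when built with INTGEMM_CPUID_ENVIRONMENT, dispatch
// can be capped from the environment, e.g. running a binary as
//   INTGEMM_CPUID=SSSE3 ./my_program
// forces the SSSE3 code path even on hardware that supports AVX2 or AVX512.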
} // namespace
CPUType GetCPUID() {
static const CPUType kLocalCPU =
#ifdef INTGEMM_CPUID_ENVIRONMENT
std::min(RealCPUID(), EnvironmentCPUID());
#else
RealCPUID();
#endif
return kLocalCPU;
}
const CPUType kCPU = GetCPUID();
void UnsupportedCPUError() {
#if (defined(_MSC_VER) && !defined(__clang__)) ? (_HAS_EXCEPTIONS) : (__EXCEPTIONS)
throw UnsupportedCPU();
#else
fprintf(stderr, "intgemm does not support this CPU.\n");
abort();
#endif
}
float Unsupported_MaxAbsolute(const float * /*begin*/, const float * /*end*/) {
UnsupportedCPUError();
return 0.0f;
}
MeanStd Unsupported_VectorMeanStd(const float * /*begin*/, const float * /*end*/, bool /*absolute*/) {
UnsupportedCPUError();
return MeanStd();
}
void (*Int16::Quantize)(const float *input, int16_t *output, float quant_mult, Index size) = ChooseCPU(AVX512BW::Kernels16::Quantize, AVX512BW::Kernels16::Quantize, AVX2::Kernels16::Quantize, SSE2::Kernels16::Quantize, SSE2::Kernels16::Quantize, Unsupported_16bit::Quantize);
void (*Int16::PrepareB)(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(AVX512BW::Kernels16::PrepareB, AVX512BW::Kernels16::PrepareB, AVX2::Kernels16::PrepareB, SSE2::Kernels16::PrepareB, SSE2::Kernels16::PrepareB, Unsupported_16bit::PrepareB);
void (*Int16::PrepareBQuantizedTransposed)(const int16_t *input, int16_t *output, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512BW::Kernels16::PrepareBQuantizedTransposed, AVX512BW::Kernels16::PrepareBQuantizedTransposed, AVX2::Kernels16::PrepareBQuantizedTransposed, SSE2::Kernels16::PrepareBQuantizedTransposed, SSE2::Kernels16::PrepareBQuantizedTransposed, Unsupported_16bit::PrepareBQuantizedTransposed);
void (*Int16::PrepareBTransposed)(const float *input, int16_t *output, float quant_mult, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512BW::Kernels16::PrepareBTransposed, AVX512BW::Kernels16::PrepareBTransposed, AVX2::Kernels16::PrepareBTransposed, SSE2::Kernels16::PrepareBTransposed, SSE2::Kernels16::PrepareBTransposed, Unsupported_16bit::PrepareBTransposed);
void (*Int16::SelectColumnsB)(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(AVX512BW::Kernels16::SelectColumnsB, AVX512BW::Kernels16::SelectColumnsB, AVX2::Kernels16::SelectColumnsB, SSE2::Kernels16::SelectColumnsB, SSE2::Kernels16::SelectColumnsB, Unsupported_16bit::SelectColumnsB);
const char *const Int16::kName = ChooseCPU(AVX512BW::Kernels16::kName, AVX512BW::Kernels16::kName, AVX2::Kernels16::kName, SSE2::Kernels16::kName, SSE2::Kernels16::kName, Unsupported_16bit::kName);
void (*Int8::Quantize)(const float *input, int8_t *output, float quant_mult, Index size) = ChooseCPU(AVX512VNNI::Kernels8::Quantize, AVX512BW::Kernels8::Quantize, AVX2::Kernels8::Quantize, SSSE3::Kernels8::Quantize, Unsupported_8bit::Quantize, Unsupported_8bit::Quantize);
void (*Int8::QuantizeU)(const float *input, uint8_t *output, float quant_mult, Index size) = ChooseCPU(AVX512VNNI::Kernels8::QuantizeU, AVX512BW::Kernels8::QuantizeU, AVX2::Kernels8::QuantizeU, SSSE3::Kernels8::QuantizeU, Unsupported_8bit::QuantizeU, Unsupported_8bit::QuantizeU);
void (*Int8::PrepareB)(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) = ChooseCPU(AVX512VNNI::Kernels8::PrepareB, AVX512BW::Kernels8::PrepareB, AVX2::Kernels8::PrepareB, SSSE3::Kernels8::PrepareB, Unsupported_8bit::PrepareB, Unsupported_8bit::PrepareB);
void (*Int8::PrepareBQuantizedTransposed)(const int8_t *input, int8_t *output, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512BW::Kernels8::PrepareBQuantizedTransposed, AVX512BW::Kernels8::PrepareBQuantizedTransposed, AVX2::Kernels8::PrepareBQuantizedTransposed, SSSE3::Kernels8::PrepareBQuantizedTransposed, Unsupported_8bit::PrepareBQuantizedTransposed, Unsupported_8bit::PrepareBQuantizedTransposed);
void (*Int8::PrepareBTransposed)(const float *input, int8_t *output, float quant_mult, Index inner, Index B_untransposed_cols) = ChooseCPU(AVX512BW::Kernels8::PrepareBTransposed, AVX512BW::Kernels8::PrepareBTransposed, AVX2::Kernels8::PrepareBTransposed, SSSE3::Kernels8::PrepareBTransposed, Unsupported_8bit::PrepareBTransposed, Unsupported_8bit::PrepareBTransposed);
void (*Int8::SelectColumnsB)(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) = ChooseCPU(AVX512VNNI::Kernels8::SelectColumnsB, AVX512BW::Kernels8::SelectColumnsB, AVX2::Kernels8::SelectColumnsB, SSSE3::Kernels8::SelectColumnsB, Unsupported_8bit::SelectColumnsB, Unsupported_8bit::SelectColumnsB);
const char *const Int8::kName = ChooseCPU(AVX512VNNI::Kernels8::kName, AVX512BW::Kernels8::kName, AVX2::Kernels8::kName, SSSE3::Kernels8::kName, Unsupported_8bit::kName, Unsupported_8bit::kName);
void (*Int8Shift::QuantizeU)(const float *input, uint8_t *output, float quant_mult, Index size) = ChooseCPU(AVX512VNNI::Kernels8::QuantizeU, AVX512BW::Kernels8::QuantizeU, AVX2::Kernels8::QuantizeU, SSSE3::Kernels8::QuantizeU, Unsupported_8bit::QuantizeU, Unsupported_8bit::QuantizeU);
const char *const Int8Shift::kName = ChooseCPU(AVX512VNNI::Kernels8::kName, AVX512BW::Kernels8::kName, AVX2::Kernels8::kName, SSSE3::Kernels8::kName, Unsupported_8bit::kName, Unsupported_8bit::kName);
#if !defined(INTGEMM_COMPILER_SUPPORTS_AVX2)
namespace AVX2{
using SSE2::MaxAbsolute;
using SSE2::VectorMeanStd;
} // namespace AVX2
#endif
#if !defined(INTGEMM_COMPILER_SUPPORTS_AVX512BW)
namespace AVX512BW {
using AVX2::MaxAbsolute;
using AVX2::VectorMeanStd;
} // namespace AVX512BW
#endif
float (*MaxAbsolute)(const float *begin, const float *end) = ChooseCPU(AVX512BW::MaxAbsolute, AVX512BW::MaxAbsolute, AVX2::MaxAbsolute, SSE2::MaxAbsolute, SSE2::MaxAbsolute, Unsupported_MaxAbsolute);
MeanStd (*VectorMeanStd)(const float *begin, const float *end, bool absolute) = ChooseCPU(AVX512BW::VectorMeanStd, AVX512BW::VectorMeanStd, AVX2::VectorMeanStd, SSE2::VectorMeanStd, SSE2::VectorMeanStd, Unsupported_VectorMeanStd);
constexpr const char *const Unsupported_16bit::kName;
constexpr const char *const Unsupported_8bit::kName;
constexpr const char *const SSE2::Kernels16::kName;
constexpr const char *const SSSE3::Kernels8::kName;
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
constexpr const char *const AVX2::Kernels8::kName;
constexpr const char *const AVX2::Kernels16::kName;
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
constexpr const char *const AVX512BW::Kernels8::kName;
constexpr const char *const AVX512BW::Kernels16::kName;
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
constexpr const char *const AVX512VNNI::Kernels8::kName;
#endif
}

365
third_party/intgemm/intgemm/intgemm.h vendored
View file

@@ -1,365 +0,0 @@
#pragma once
/* Main interface for integer matrix multiplication.
*
* We are computing C = A * B with an optional scaling factor.
*
* A is typically activations.
* Rows a multiple of 1 (no restriction)
* Columns a multiple of 64 for 8-bit or 32 for 16-bit.
* Use PrepareA to prepare A for multiplication. This is meant to be fast.
*
* B is typically fixed model parameters.
* Rows a multiple of 64 for 8-bit or 32 for 16-bit.
* Columns a multiple of 8
* Use PrepareB to prepare B for multiplication. This is slower, with the
* intention that it will be prepared once and remembered.
*
* C is row major.
*
* Once both A and B are prepared, call Multiply.
*
* All memory (A, B, and C in float or prepared form) must be 64-byte aligned.
* It's easy to write code that works on your CPU with lower alignment, but
* breaks on AVX512.
*
* When preparing, you provide a quantization multiplier. Values will be
* multiplied by this then rounded to an integer.
* For 16-bit neural networks, Jacob Devlin recommends 1024.0.
* For 8-bit, use 127 / largest absolute value.
*
* Note that quantization saturates. However, 16-bit does accumulation in
* 32-bit which can overflow if you use too big of a multiplier.
*
* The multiply routine expects an unquantization multiplier.
* This should be unquant_mult = 1.0 / (A_quant_mult * B_quant_mult).
* Where A_quant_mult is what you passed to PrepareA and B_quant_mult is what you
* passed to PrepareB.
*
* Feel free to multiply in a scaling factor to compute C = \lambda A * B by
* passing unquant_mult = \lambda / (A_quant_mult * B_quant_mult).
*/
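/* A minimal end-to-end sketch under the constraints above (illustrative only;
 * A, B, C are assumed to be suitably sized, 64-byte aligned float buffers,
 * A_prepared / B_prepared 64-byte aligned int8_t buffers, and alpha_A /
 * alpha_B the largest absolute values found in A and B):
 *
 *   float a_quant = 127.0f / alpha_A, b_quant = 127.0f / alpha_B;
 *   intgemm::Int8::PrepareA(A, A_prepared, a_quant, A_rows, width);
 *   intgemm::Int8::PrepareB(B, B_prepared, b_quant, width, B_cols);
 *   intgemm::Int8::Multiply(A_prepared, B_prepared, A_rows, width, B_cols,
 *       intgemm::callbacks::UnquantizeAndWrite(1.0f / (a_quant * b_quant), C));
 */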
#include <cstdint>
#include "types.h"
#include "sse2_gemm.h"
#include "ssse3_gemm.h"
#include "avx2_gemm.h"
#include "avx512_gemm.h"
#include "avx512vnni_gemm.h"
/* Dispatch to functions based on runtime CPUID. This adds one call-by-variable to each call. */
namespace intgemm {
void UnsupportedCPUError();
struct Unsupported_16bit {
static void Quantize(const float *, int16_t *, float, Index) {
UnsupportedCPUError();
}
static void PrepareB(const float *, int16_t *, float, Index, Index) {
UnsupportedCPUError();
}
static void PrepareBQuantizedTransposed(const int16_t *, int16_t *, Index, Index) {
UnsupportedCPUError();
}
static void PrepareBTransposed(const float *, int16_t *, float, Index, Index) {
UnsupportedCPUError();
}
static void SelectColumnsB(const int16_t *, int16_t *, Index, const Index *, const Index *) {
UnsupportedCPUError();
}
template <typename Callback>
static void Multiply(const int16_t *, const int16_t *, Index, Index, Index, Callback) {
UnsupportedCPUError();
}
constexpr static const char *const kName = "16-bit Unsupported";
};
struct Unsupported_8bit {
static void Quantize(const float *, int8_t *, float, Index) {
UnsupportedCPUError();
}
static void QuantizeU(const float *, uint8_t *, float, Index) {
UnsupportedCPUError();
}
static void PrepareA(const float *, int8_t *, float, Index, Index) {
UnsupportedCPUError();
}
static void PrepareBQuantizedTransposed(const int8_t *, int8_t *, Index, Index) {
UnsupportedCPUError();
}
static void PrepareBTransposed(const float *, int8_t *, float, Index, Index) {
UnsupportedCPUError();
}
static void PrepareB(const float *, int8_t *, float, Index, Index) {
UnsupportedCPUError();
}
template<class Callback>
static void PrepareBias(const int8_t *, Index, Index, Callback) {
UnsupportedCPUError();
}
static void SelectColumnsB(const int8_t *, int8_t *, Index, const Index *, const Index *) {
UnsupportedCPUError();
}
template <typename Callback>
static void Multiply(const int8_t *, const int8_t *, Index, Index, Index, Callback) {
UnsupportedCPUError();
}
template<class Callback>
static void Multiply8Shift(const uint8_t *, const int8_t *, Index, Index, Index, Callback) {
UnsupportedCPUError();
}
constexpr static const char *const kName = "8-bit Unsupported";
};
#ifndef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
// These won't ever be called in this capacity, but it does let the code below compile.
namespace AVX512VNNI {
typedef Unsupported_8bit Kernels8;
} // namespace AVX512VNNI
#endif
#ifndef INTGEMM_COMPILER_SUPPORTS_AVX512BW
namespace AVX512BW {
typedef Unsupported_8bit Kernels8;
typedef Unsupported_16bit Kernels16;
} // namespace AVX512BW
#endif
#ifndef INTGEMM_COMPILER_SUPPORTS_AVX2
namespace AVX2 {
typedef Unsupported_8bit Kernels8;
typedef Unsupported_16bit Kernels16;
} // namespace AVX2
#endif
CPUType GetCPUID();
/* Returns:
* avx512vnni if the CPU supports AVX512VNNI
*
* avx512bw if the CPU supports AVX512BW
*
* avx2 if the CPU supports AVX2
*
* ssse3 if the CPU supports SSSE3 (this distinction from SSE2 matters for 8-bit)
*
* sse2 if the CPU supports SSE2
*
* unsupported otherwise
*/
template <class T> T ChooseCPU(T avx512vnni, T avx512bw, T avx2, T ssse3, T sse2, T unsupported) {
const T ret[] = {unsupported, sse2, ssse3, avx2, avx512bw, avx512vnni};
return ret[(int)GetCPUID()];
}
struct TileInfo {
const Index a_rows;
const Index a_cols;
const Index b_rows;
const Index b_cols;
};
/*
* 8-bit matrix multiplication
*/
struct Int8 {
using Integer = int8_t;
// A's size must be a multiple of 1x64, B's size must be a multiple of 64x8.
static constexpr TileInfo tile_info{1, 64, 64, 8};
// Currently A is prepared by quantization but this could theoretically change.
// A's columns must be a multiple of 8.
// The number of rows is anything.
static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
Quantize(input, output, quant_mult, rows * cols);
}
// Multiply floats by quant_mult then convert to 8-bit integers with saturation.
static void (*Quantize)(const float *input, int8_t *output, float quant_mult, Index size);
// Multiply floats by quant_mult then convert to 8-bit integers with saturation.
// A version that adds 127 to each number, making sure that all numbers are positive
static void (*QuantizeU)(const float *input, uint8_t *output, float quant_mult, Index size);
// Warning: the output of PrepareB depends on the CPU.
// It will match the Multiply function on the same CPU though.
static void (*PrepareB)(const float *input, int8_t *output, float quant_mult, Index rows, Index cols);
// Convert from a B that was already transposed (routine not provided) and
// quantized (e.g. with Quantize) to the CPU-dependent format used for
// Multiply. This is useful for storing a quantized model on disk in a
// CPU-independent fashion.
static void (*PrepareBQuantizedTransposed)(const int8_t *input, int8_t *output, Index inner, Index B_untransposed_cols);
// Convert from a B that was already transposed (routine not provided) to
// the CPU-dependent format used for Multiply. This is useful for storing
// a quantized model on disk in a CPU-independent fashion.
static void (*PrepareBTransposed)(const float *input, int8_t *output, float quant_mul, Index inner, Index B_untransposed_cols);
// Select columns from a prepared B matrix. The number of selected columns must be a multiple of 8.
static void (*SelectColumnsB)(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end);
// Multiply C = A * B, presuming A and B have been prepared.
template <typename Callback>
static void Multiply(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) {
MultiplyImpl<Callback>::run(A, B, A_rows, width, B_cols, callback);
}
static const char *const kName;
private:
template <typename Callback>
struct MultiplyImpl {
static void (*run)(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback);
};
};
template <typename Callback>
void (*Int8::MultiplyImpl<Callback>::run)(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(OMPParallelWrap<Callback, AVX512VNNI::Kernels8>, OMPParallelWrap<Callback, AVX512BW::Kernels8>, OMPParallelWrap<Callback, AVX2::Kernels8>, OMPParallelWrap<Callback, SSSE3::Kernels8>, Unsupported_8bit::Multiply<Callback>, Unsupported_8bit::Multiply<Callback>);
/*
* 8-bit matrix multiplication with shifting A by 127
*/
struct Int8Shift {
using Integer = int8_t;
// A's size must be a multiple of 1x64, B's size must be a multiple of 64x8.
static constexpr TileInfo tile_info{1, 64, 64, 8};
// Identical to the Int8 Version, except it adds 127 to each number, making sure that all numbers are positive.
static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
QuantizeU(input, reinterpret_cast<uint8_t *>(output), quant_mult, rows * cols);
}
// Multiply floats by quant_mult then convert to 8-bit integers with saturation.
// A version that adds 127 to each number, making sure that all numbers are positive
static void (*QuantizeU)(const float *input, uint8_t *output, float quant_mult, Index size);
// Warning: the output of PrepareB depends on the CPU.
// It will match the Multiply function on the same CPU though.
static void PrepareB(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
Int8::PrepareB(input, output, quant_mult, rows, cols);
}
// Select columns from a prepared B matrix. The number of selected columns must be a multiple of 8.
static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
Int8::SelectColumnsB(input, output, rows, cols_begin, cols_end);
}
// A slightly faster version than the Int8 one (assuming a bias is used), thanks to better handling of the sign bit.
// Multiply C = A * B + Bias, presuming A, B and Bias have all been prepared (for A, this struct's PrepareA should be used).
template<class Callback>
static void Multiply(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) {
MultiplyImpl<Callback>::run((const uint8_t *)A, B, A_rows, width, B_cols, callback);
}
// This function prepares the bias for the Multiply routine that does unsigned * signed multiplication.
// The function takes:
// a prepared B matrix, width, B_cols and
// the callback UnquantizeAndAddBiasAndWrite(unquant_mult, Bias_matrix, Bias_matrix)
// unquant_mult is computed by (-1)*(alpha)*(alpha)/(127.0f);
template<class Callback>
static void PrepareBias(const int8_t *B, Index width, Index B_cols, Callback callback) {
PrepareBiasImpl<Callback>::run(B, width, B_cols, callback);
}
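// Illustrative call (an assumption about typical use, mirroring the comment
// above rather than anything defined here): with alpha the clipping value used
// when quantizing,
//   float unquant_mult = (-1) * alpha * alpha / 127.0f;
//   Int8Shift::PrepareBias(B_prepared, width, B_cols,
//       callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult, bias, bias));
// folds the +127 shift applied to A into the bias before Multiply is called.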
static const char *const kName;
private:
template <typename Callback>
struct MultiplyImpl {
static void (*run)(const uint8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback);
};
template <typename Callback>
struct PrepareBiasImpl {
static void (*run)(const int8_t *B, Index width, Index B_cols, Callback callback);
};
};
template <class Callback>
void (*Int8Shift::MultiplyImpl<Callback>::run)(const uint8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(
OMPParallelWrap8Shift<Callback, AVX512VNNI::Kernels8>,
OMPParallelWrap8Shift<Callback, AVX512BW::Kernels8>,
OMPParallelWrap8Shift<Callback, AVX2::Kernels8>,
OMPParallelWrap8Shift<Callback, SSSE3::Kernels8>,
Unsupported_8bit::Multiply8Shift<Callback>, Unsupported_8bit::Multiply8Shift<Callback>);
template <class Callback>
void (*Int8Shift::PrepareBiasImpl<Callback>::run)(const int8_t *B, Index width, Index B_cols, Callback callback) = ChooseCPU(AVX512VNNI::Kernels8::PrepareBias<Callback>, AVX512BW::Kernels8::PrepareBias<Callback>, AVX2::Kernels8::PrepareBias<Callback>, SSSE3::Kernels8::PrepareBias<Callback>, SSSE3::Kernels8::PrepareBias<Callback>, Unsupported_8bit::PrepareBias);
/*
* 16-bit matrix multiplication
*/
struct Int16 {
using Integer = int16_t;
// A's size must be a multiple of 1x32, B's size must be a multiple of 32x8.
static constexpr TileInfo tile_info{1, 32, 32, 8};
// Currently A is prepared by quantization but this could theoretically change.
// A's columns must be a multiple of 8.
// The number of rows is anything.
static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
Quantize(input, output, quant_mult, rows * cols);
}
// Multiply floats by quant_mult then convert to 16-bit integers with saturation.
static void (*Quantize)(const float *input, int16_t *output, float quant_mult, Index size);
// Warning: the output of PrepareB depends on the CPU.
// It will match the Multiply function on the same CPU though.
static void (*PrepareB)(const float *input, int16_t *output, float quant_mult, Index rows, Index cols);
// Convert from a B that was already transposed (routine not provided) and
// quantized (e.g. with Quantize) to the CPU-dependent format used for
// Multiply. This is useful for storing a quantized model on disk in a
// CPU-independent fashion.
static void (*PrepareBQuantizedTransposed)(const int16_t *input, int16_t *output, Index inner, Index B_untransposed_cols);
// Convert from a B that was already transposed (routine not provided) to
// the CPU-dependent format used for Multiply. This is useful for storing
// a quantized model on disk in a CPU-independent fashion.
static void (*PrepareBTransposed)(const float *input, int16_t *output, float quant_mul, Index inner, Index B_untransposed_cols);
// Select columns from a prepared B matrix. The number of selected columns must be a multiple of 8.
static void (*SelectColumnsB)(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end);
// Multiply C = A * B, presuming A and B have been prepared.
template <typename Callback>
static void Multiply(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback) {
MultiplyImpl<Callback>::run(A, B, A_rows, width, B_cols, callback);
}
static const char *const kName;
private:
template <typename Callback>
struct MultiplyImpl {
static void (*run)(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback);
};
};
template <typename Callback>
void (*Int16::MultiplyImpl<Callback>::run)(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback) = ChooseCPU(OMPParallelWrap<Callback, AVX512BW::Kernels16> /*TODO VNNI 16-bit. */, OMPParallelWrap<Callback, AVX512BW::Kernels16>, OMPParallelWrap<Callback, AVX2::Kernels16>, OMPParallelWrap<Callback, SSE2::Kernels16>, OMPParallelWrap<Callback, SSE2::Kernels16>, Unsupported_16bit::Multiply<Callback>);
extern const CPUType kCPU;
// Get the maximum absolute value of an array of floats. The number of floats must be a multiple of 16 and the pointers 64-byte aligned.
extern float (*MaxAbsolute)(const float *begin, const float *end);
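// For example (illustrative), the 8-bit quantization multiplier recommended at
// the top of this header can be derived from the data as
//   float quant_mult = 127.0f / MaxAbsolute(begin, end);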
// Get a quantization value that is equal to the mean of the data plus N standard deviations. N defaults to 2.
extern MeanStd (*VectorMeanStd)(const float *begin, const float *end, bool);
/* Returns the Mean and the Standard deviation of a vector.
* If "absolute" is set to true, it computes the mean and the standard deviation of the absolute values of the vector */
static inline MeanStd GetVectorMeanStd(const float * begin, const float * end, bool absolute=false) {
return VectorMeanStd(begin, end, absolute);
}
} // namespace intgemm

View file

@@ -1,5 +0,0 @@
#pragma once
#cmakedefine INTGEMM_COMPILER_SUPPORTS_AVX2
#cmakedefine INTGEMM_COMPILER_SUPPORTS_AVX512BW
#cmakedefine INTGEMM_COMPILER_SUPPORTS_AVX512VNNI

611
third_party/intgemm/intgemm/intrinsics.h vendored
View file

@@ -1,611 +0,0 @@
#pragma once
#include "intgemm/intgemm_config.h"
#include "types.h"
#include <tmmintrin.h>
#include <emmintrin.h>
#include <xmmintrin.h>
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
#include <immintrin.h>
#endif
#ifdef INTGEMM_WORMHOLE
#include <wasm_simd128.h>
#endif
#include <cstdint>
/*
* NOTE: Please keep intrinsics in alphabetical order.
*/
namespace intgemm {
/*
* Define a bunch of intrinsics as overloaded functions so they work with
* templates.
*/
template <class Register> static inline Register load_ps(float const* from);
template <class Register> static inline Register loadu_ps(const float* mem_addr);
template <class Register> static inline Register set1_epi16(int16_t to);
template <class Register> static inline Register set1_epi32(int32_t to);
template <class Register> static inline Register set1_epi8(int8_t to);
template <class Register> static inline Register set1_pd(double to);
template <class Register> static inline Register set1_ps(float to);
template <class Register> static inline Register setzero_pd();
template <class Register> static inline Register setzero_ps();
template <class Register> static inline Register setzero_si();
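// Illustrative (an assumption about call sites elsewhere in intgemm): templated
// kernels select the register width through the template argument, e.g.
//   auto ones_sse = set1_ps<__m128>(1.0f); // SSE2 path
//   auto ones_avx = set1_ps<__m256>(1.0f); // AVX2 path (if compiled in)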
/*
*
* SSE2
*
*/
INTGEMM_SSSE3 static inline __m128i abs_epi8(__m128i arg) {
return _mm_abs_epi8(arg);
}
INTGEMM_SSE2 static inline __m128i add_epi8(__m128i a, __m128i b) {
return _mm_add_epi8(a, b);
}
INTGEMM_SSE2 static inline __m128i add_epi16(__m128i a, __m128i b) {
return _mm_add_epi16(a, b);
}
INTGEMM_SSE2 static inline __m128i add_epi32(__m128i first, __m128i second) {
return _mm_add_epi32(first, second);
}
INTGEMM_SSE2 static inline __m128i adds_epi16(__m128i first, __m128i second) {
return _mm_adds_epi16(first, second);
}
INTGEMM_SSE2 static inline __m128d add_pd(__m128d a, __m128d b) {
return _mm_add_pd(a, b);
}
INTGEMM_SSE2 static inline __m128 add_ps(__m128 a, __m128 b) {
return _mm_add_ps(a, b);
}
INTGEMM_SSE2 static inline __m128 and_ps(__m128 first, __m128 second) {
return _mm_and_ps(first, second);
}
INTGEMM_SSE2 static inline __m128 andnot_ps(__m128 a, __m128 b) {
return _mm_andnot_ps(a, b);
}
INTGEMM_SSE2 static inline __m128i and_si(__m128i a, __m128i b) {
return _mm_and_si128(a, b);
}
INTGEMM_SSE2 static inline __m128 cast_ps(__m128i a) {
return _mm_castsi128_ps(a);
}
INTGEMM_SSE2 static inline __m128 cvtepi32_ps(__m128i arg) {
return _mm_cvtepi32_ps(arg);
}
INTGEMM_SSE2 static inline __m128i cvtps_epi32(__m128 arg) {
return _mm_cvtps_epi32(arg);
}
INTGEMM_SSE2 static inline __m128i cvttps_epi32(__m128 a) {
return _mm_cvttps_epi32(a);
}
INTGEMM_SSE2 static inline __m128 div_ps(__m128 a, __m128 b) {
return _mm_div_ps(a, b);
}
/*
* Missing i32gather_ps for SSE2
*/
template <> INTGEMM_SSE2 inline __m128 load_ps<__m128>(const float* from) {
return _mm_load_ps(from);
}
template <> INTGEMM_SSE2 inline __m128 loadu_ps(const float* mem_addr) {
return _mm_loadu_ps(mem_addr);
}
INTGEMM_SSE2 static inline __m128i madd_epi16(__m128i first, __m128i second) {
// https://bugzilla.mozilla.org/show_bug.cgi?id=1672160
#ifdef INTGEMM_WORMHOLE
return wasm_v8x16_shuffle(first, second, 31, 0, 30, 2, 29, 4, 28, 6, 27, 8, 26, 10, 25, 12, 24, 2 /* PMADDWD */);
#else
return _mm_madd_epi16(first, second);
#endif
}
INTGEMM_SSSE3 static inline __m128i maddubs_epi16(__m128i first, __m128i second) {
// https://bugzilla.mozilla.org/show_bug.cgi?id=1672160
#ifdef INTGEMM_WORMHOLE
return wasm_v8x16_shuffle(first, second, 31, 0, 30, 2, 29, 4, 28, 6, 27, 8, 26, 10, 25, 12, 24, 1 /* PMADDUBSW */);
#else
return _mm_maddubs_epi16(first, second);
#endif
}
/*
* Missing max_epi8 for SSE2
*/
INTGEMM_SSE2 static inline __m128i max_epi16(__m128i first, __m128i second) {
return _mm_max_epi16(first, second);
}
INTGEMM_SSE2 static inline __m128d max_pd(__m128d first, __m128d second) {
return _mm_max_pd(first, second);
}
INTGEMM_SSE2 static inline __m128 max_ps(__m128 first, __m128 second) {
return _mm_max_ps(first, second);
}
INTGEMM_SSE2 static inline __m128 min_ps(__m128 a, __m128 b) {
return _mm_min_ps(a, b);
}
INTGEMM_SSE2 static inline __m128i mul_epu32(__m128i a, __m128i b) {
return _mm_mul_epu32(a, b);
}
INTGEMM_SSE2 static inline __m128d mul_pd(__m128d a, __m128d b) {
return _mm_mul_pd(a, b);
}
INTGEMM_SSE2 static inline __m128 mul_ps(__m128 a, __m128 b) {
return _mm_mul_ps(a, b);
}
INTGEMM_SSE2 static inline __m128i mulhi_epi16(__m128i a, __m128i b) {
return _mm_mulhi_epi16(a, b);
}
INTGEMM_SSE2 static inline __m128i mullo_epi16(__m128i a, __m128i b) {
return _mm_mullo_epi16(a, b);
}
INTGEMM_SSE2 static inline __m128i or_si(__m128i a, __m128i b) {
return _mm_or_si128(a, b);
}
INTGEMM_SSE2 static inline __m128i packs_epi16(__m128i a, __m128i b) {
return _mm_packs_epi16(a, b);
}
INTGEMM_SSE2 static inline __m128i packs_epi32(__m128i a, __m128i b) {
return _mm_packs_epi32(a, b);
}
template <> INTGEMM_SSE2 inline __m128i set1_epi8<__m128i>(int8_t to) {
return _mm_set1_epi8(to);
}
template <> INTGEMM_SSE2 inline __m128i set1_epi16<__m128i>(int16_t to) {
return _mm_set1_epi16(to);
}
template <> INTGEMM_SSE2 inline __m128i set1_epi32<__m128i>(int32_t to) {
return _mm_set1_epi32(to);
}
template <> INTGEMM_SSE2 inline __m128d set1_pd<__m128d>(double to) {
return _mm_set1_pd(to);
}
template <> INTGEMM_SSE2 inline __m128 set1_ps<__m128>(float to) {
return _mm_set1_ps(to);
}
template <> INTGEMM_SSE2 inline __m128d setzero_pd<__m128d>() {
return _mm_setzero_pd();
}
template <> INTGEMM_SSE2 inline __m128 setzero_ps<__m128>() {
return _mm_setzero_ps();
}
template <> INTGEMM_SSE2 inline __m128i setzero_si<__m128i>() {
return _mm_setzero_si128();
}
INTGEMM_SSSE3 static inline __m128i sign_epi8(__m128i first, __m128i second) {
return _mm_sign_epi8(first, second);
}
template <int imm8> INTGEMM_SSE2 static inline __m128i slli_epi16(__m128i a) {
return _mm_slli_epi16(a, imm8);
}
template <int imm8> INTGEMM_SSE2 static inline __m128i srai_epi16(__m128i a) {
return _mm_srai_epi16(a, imm8);
}
template <int imm8> INTGEMM_SSE2 static inline __m128i srai_epi32(__m128i a) {
return _mm_srai_epi32(a, imm8);
}
template <int imm8> INTGEMM_SSE2 static inline __m128i srli_epi16(__m128i a) {
return _mm_srli_epi16(a, imm8);
}
INTGEMM_SSE2 static inline void storeu_ps(float* mem_addr, __m128 a) {
_mm_storeu_ps(mem_addr, a);
}
INTGEMM_SSE2 static inline __m128d sub_pd(__m128d a, __m128d b) {
return _mm_sub_pd(a, b);
}
INTGEMM_SSE2 static inline __m128 sub_ps(__m128 a, __m128 b) {
return _mm_sub_ps(a, b);
}
INTGEMM_SSE2 static inline __m128i unpacklo_epi8(__m128i a, __m128i b) {
return _mm_unpacklo_epi8(a, b);
}
INTGEMM_SSE2 static inline __m128i unpackhi_epi8(__m128i a, __m128i b) {
return _mm_unpackhi_epi8(a, b);
}
INTGEMM_SSE2 static inline __m128i unpacklo_epi16(__m128i a, __m128i b) {
return _mm_unpacklo_epi16(a, b);
}
INTGEMM_SSE2 static inline __m128i unpackhi_epi16(__m128i a, __m128i b) {
return _mm_unpackhi_epi16(a, b);
}
INTGEMM_SSE2 static inline __m128i unpacklo_epi32(__m128i a, __m128i b) {
return _mm_unpacklo_epi32(a, b);
}
INTGEMM_SSE2 static inline __m128i unpackhi_epi32(__m128i a, __m128i b) {
return _mm_unpackhi_epi32(a, b);
}
INTGEMM_SSE2 static inline __m128i unpacklo_epi64(__m128i a, __m128i b) {
return _mm_unpacklo_epi64(a, b);
}
INTGEMM_SSE2 static inline __m128i unpackhi_epi64(__m128i a, __m128i b) {
return _mm_unpackhi_epi64(a, b);
}
INTGEMM_SSE2 static inline __m128i xor_si(__m128i a, __m128i b) {
return _mm_xor_si128(a, b);
}
/*
*
* AVX2
*
*/
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
INTGEMM_AVX2 static inline __m256i abs_epi8(__m256i arg) {
return _mm256_abs_epi8(arg);
}
INTGEMM_AVX2 static inline __m256i add_epi8(__m256i a, __m256i b) {
return _mm256_add_epi8(a, b);
}
INTGEMM_AVX2 static inline __m256i add_epi16(__m256i a, __m256i b) {
return _mm256_add_epi16(a, b);
}
INTGEMM_AVX2 static inline __m256i add_epi32(__m256i first, __m256i second) {
return _mm256_add_epi32(first, second);
}
INTGEMM_AVX2 static inline __m256i adds_epi16(__m256i first, __m256i second) {
return _mm256_adds_epi16(first, second);
}
INTGEMM_AVX2 static inline __m256d add_pd(__m256d a, __m256d b) {
return _mm256_add_pd(a, b);
}
INTGEMM_AVX2 static inline __m256 add_ps(__m256 a, __m256 b) {
return _mm256_add_ps(a, b);
}
INTGEMM_AVX2 static inline __m256 and_ps(__m256 first, __m256 second) {
return _mm256_and_ps(first, second);
}
INTGEMM_AVX2 static inline __m256 andnot_ps(__m256 a, __m256 b) {
return _mm256_andnot_ps(a, b);
}
INTGEMM_AVX2 static inline __m256i and_si(__m256i a, __m256i b) {
return _mm256_and_si256(a, b);
}
INTGEMM_AVX2 static inline __m256 cast_ps(__m256i a) {
return _mm256_castsi256_ps(a);
}
INTGEMM_AVX2 static inline __m256 cvtepi32_ps(__m256i arg) {
return _mm256_cvtepi32_ps(arg);
}
INTGEMM_AVX2 static inline __m256i cvtps_epi32(__m256 arg) {
return _mm256_cvtps_epi32(arg);
}
INTGEMM_AVX2 static inline __m256i cvttps_epi32(__m256 a) {
return _mm256_cvttps_epi32(a);
}
INTGEMM_AVX2 static inline __m256 div_ps(__m256 a, __m256 b) {
return _mm256_div_ps(a, b);
}
template <unsigned Scale>
INTGEMM_AVX2 static inline __m256 i32gather_ps(float const *base_addr, __m256i vindex) {
return _mm256_i32gather_ps(base_addr, vindex, Scale);
}
template <> INTGEMM_AVX2 inline __m256 loadu_ps(const float* mem_addr) {
return _mm256_loadu_ps(mem_addr);
}
template <> INTGEMM_AVX2 inline __m256 load_ps<__m256>(const float* from) {
return _mm256_load_ps(from);
}
INTGEMM_AVX2 static inline __m256i madd_epi16(__m256i first, __m256i second) {
return _mm256_madd_epi16(first, second);
}
INTGEMM_AVX2 static inline __m256i maddubs_epi16(__m256i first, __m256i second) {
return _mm256_maddubs_epi16(first, second);
}
INTGEMM_AVX2 static inline __m256i max_epi8(__m256i first, __m256i second) {
return _mm256_max_epi8(first, second);
}
INTGEMM_AVX2 static inline __m256i max_epi16(__m256i first, __m256i second) {
return _mm256_max_epi16(first, second);
}
INTGEMM_AVX2 static inline __m256d max_pd(__m256d first, __m256d second) {
return _mm256_max_pd(first, second);
}
INTGEMM_AVX2 static inline __m256 max_ps(__m256 first, __m256 second) {
return _mm256_max_ps(first, second);
}
INTGEMM_AVX2 static inline __m256 min_ps(__m256 a, __m256 b) {
return _mm256_min_ps(a, b);
}
INTGEMM_AVX2 static inline __m256i mul_epu32(__m256i a, __m256i b) {
return _mm256_mul_epu32(a, b);
}
INTGEMM_AVX2 static inline __m256d mul_pd(__m256d a, __m256d b) {
return _mm256_mul_pd(a, b);
}
INTGEMM_AVX2 static inline __m256 mul_ps(__m256 a, __m256 b) {
return _mm256_mul_ps(a, b);
}
INTGEMM_AVX2 static inline __m256i mulhi_epi16(__m256i a, __m256i b) {
return _mm256_mulhi_epi16(a, b);
}
INTGEMM_AVX2 static inline __m256i mullo_epi16(__m256i a, __m256i b) {
return _mm256_mullo_epi16(a, b);
}
INTGEMM_AVX2 static inline __m256i or_si(__m256i a, __m256i b) {
return _mm256_or_si256(a, b);
}
INTGEMM_AVX2 static inline __m256i packs_epi16(__m256i a, __m256i b) {
return _mm256_packs_epi16(a, b);
}
INTGEMM_AVX2 static inline __m256i packs_epi32(__m256i a, __m256i b) {
return _mm256_packs_epi32(a, b);
}
template <> INTGEMM_AVX2 inline __m256i set1_epi8<__m256i>(int8_t to) {
return _mm256_set1_epi8(to);
}
template <> INTGEMM_AVX2 inline __m256i set1_epi16<__m256i>(int16_t to) {
return _mm256_set1_epi16(to);
}
template <> INTGEMM_AVX2 inline __m256i set1_epi32<__m256i>(int32_t to) {
return _mm256_set1_epi32(to);
}
template <> INTGEMM_AVX2 inline __m256d set1_pd<__m256d>(double to) {
return _mm256_set1_pd(to);
}
template <> INTGEMM_AVX2 inline __m256 set1_ps<__m256>(float to) {
return _mm256_set1_ps(to);
}
template <> INTGEMM_AVX2 inline __m256d setzero_pd<__m256d>() {
return _mm256_setzero_pd();
}
template <> INTGEMM_AVX2 inline __m256 setzero_ps<__m256>() {
return _mm256_setzero_ps();
}
template <> INTGEMM_AVX2 inline __m256i setzero_si<__m256i>() {
return _mm256_setzero_si256();
}
INTGEMM_AVX2 static inline __m256i sign_epi8(__m256i first, __m256i second) {
return _mm256_sign_epi8(first, second);
}
template <int imm8> INTGEMM_AVX2 static inline __m256i slli_epi16(__m256i a) {
return _mm256_slli_epi16(a, imm8);
}
template <int imm8> INTGEMM_AVX2 static inline __m256i srai_epi16(__m256i a) {
return _mm256_srai_epi16(a, imm8);
}
template <int imm8> INTGEMM_AVX2 static inline __m256i srai_epi32(__m256i a) {
return _mm256_srai_epi32(a, imm8);
}
template <int imm8> INTGEMM_AVX2 static inline __m256i srli_epi16(__m256i a) {
return _mm256_srli_epi16(a, imm8);
}
INTGEMM_AVX2 static inline void storeu_ps(float* mem_addr, __m256 a) {
_mm256_storeu_ps(mem_addr, a);
}
INTGEMM_AVX2 static inline __m256d sub_pd(__m256d a, __m256d b) {
return _mm256_sub_pd(a, b);
}
INTGEMM_AVX2 static inline __m256 sub_ps(__m256 a, __m256 b) {
return _mm256_sub_ps(a, b);
}
INTGEMM_AVX2 static inline __m256i unpacklo_epi8(__m256i a, __m256i b) {
return _mm256_unpacklo_epi8(a, b);
}
INTGEMM_AVX2 static inline __m256i unpackhi_epi8(__m256i a, __m256i b) {
return _mm256_unpackhi_epi8(a, b);
}
INTGEMM_AVX2 static inline __m256i unpacklo_epi16(__m256i a, __m256i b) {
return _mm256_unpacklo_epi16(a, b);
}
INTGEMM_AVX2 static inline __m256i unpackhi_epi16(__m256i a, __m256i b) {
return _mm256_unpackhi_epi16(a, b);
}
INTGEMM_AVX2 static inline __m256i unpacklo_epi32(__m256i a, __m256i b) {
return _mm256_unpacklo_epi32(a, b);
}
INTGEMM_AVX2 static inline __m256i unpackhi_epi32(__m256i a, __m256i b) {
return _mm256_unpackhi_epi32(a, b);
}
INTGEMM_AVX2 static inline __m256i unpacklo_epi64(__m256i a, __m256i b) {
return _mm256_unpacklo_epi64(a, b);
}
INTGEMM_AVX2 static inline __m256i unpackhi_epi64(__m256i a, __m256i b) {
return _mm256_unpackhi_epi64(a, b);
}
INTGEMM_AVX2 static inline __m256i xor_si(__m256i a, __m256i b) {
return _mm256_xor_si256(a, b);
}
#endif
/*
*
* AVX512
*
*/
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
INTGEMM_AVX512BW static inline __m512i abs_epi8(__m512i arg) {
return _mm512_abs_epi8(arg);
}
INTGEMM_AVX512BW static inline __m512i add_epi8(__m512i a, __m512i b) {
return _mm512_add_epi8(a, b);
}
INTGEMM_AVX512BW static inline __m512i add_epi16(__m512i a, __m512i b) {
return _mm512_add_epi16(a, b);
}
INTGEMM_AVX512BW static inline __m512i add_epi32(__m512i first, __m512i second) {
return _mm512_add_epi32(first, second);
}
INTGEMM_AVX512BW static inline __m512i adds_epi16(__m512i first, __m512i second) {
return _mm512_adds_epi16(first, second);
}
INTGEMM_AVX512BW static inline __m512d add_pd(__m512d a, __m512d b) {
return _mm512_add_pd(a, b);
}
INTGEMM_AVX512BW static inline __m512 add_ps(__m512 a, __m512 b) {
return _mm512_add_ps(a, b);
}
INTGEMM_AVX512DQ static inline __m512 and_ps(__m512 first, __m512 second) {
return _mm512_and_ps(first, second);
}
INTGEMM_AVX512DQ static inline __m512 andnot_ps(__m512 a, __m512 b) {
return _mm512_andnot_ps(a, b);
}
INTGEMM_AVX512BW static inline __m512i and_si(__m512i a, __m512i b) {
return _mm512_and_si512(a, b);
}
INTGEMM_AVX512F static inline __m512 cast_ps(__m512i a) {
return _mm512_castsi512_ps(a);
}
INTGEMM_AVX512BW static inline __m512 cvtepi32_ps(__m512i arg) {
return _mm512_cvtepi32_ps(arg);
}
INTGEMM_AVX512BW static inline __m512i cvtps_epi32(__m512 arg) {
return _mm512_cvtps_epi32(arg);
}
INTGEMM_AVX512BW static inline __m512i cvttps_epi32(__m512 a) {
return _mm512_cvttps_epi32(a);
}
INTGEMM_AVX512BW static inline __m512 div_ps(__m512 a, __m512 b) {
return _mm512_div_ps(a, b);
}
template <unsigned Scale>
INTGEMM_AVX512BW static inline __m512 i32gather_ps(float const *base_addr, __m512i vindex) {
return _mm512_i32gather_ps(vindex, base_addr, Scale);
}
template <> INTGEMM_AVX512BW inline __m512 loadu_ps(const float* mem_addr) {
return _mm512_loadu_ps(mem_addr);
}
INTGEMM_AVX512BW static inline __m512i madd_epi16(__m512i first, __m512i second) {
return _mm512_madd_epi16(first, second);
}
INTGEMM_AVX512BW static inline __m512i maddubs_epi16(__m512i first, __m512i second) {
return _mm512_maddubs_epi16(first, second);
}
INTGEMM_AVX512BW static inline __m512i max_epi8(__m512i first, __m512i second) {
return _mm512_max_epi8(first, second);
}
INTGEMM_AVX512BW static inline __m512i max_epi16(__m512i first, __m512i second) {
return _mm512_max_epi16(first, second);
}
INTGEMM_AVX512BW static inline __m512d max_pd(__m512d first, __m512d second) {
return _mm512_max_pd(first, second);
}
INTGEMM_AVX512BW static inline __m512 max_ps(__m512 first, __m512 second) {
return _mm512_max_ps(first, second);
}
INTGEMM_AVX512BW static inline __m512 min_ps(__m512 a, __m512 b) {
return _mm512_min_ps(a, b);
}
INTGEMM_AVX512BW static inline __m512i mul_epu32(__m512i a, __m512i b) {
return _mm512_mul_epu32(a, b);
}
INTGEMM_AVX512BW static inline __m512d mul_pd(__m512d a, __m512d b) {
return _mm512_mul_pd(a, b);
}
INTGEMM_AVX512BW static inline __m512 mul_ps(__m512 a, __m512 b) {
return _mm512_mul_ps(a, b);
}
INTGEMM_AVX512BW static inline __m512i mulhi_epi16(__m512i a, __m512i b) {
return _mm512_mulhi_epi16(a, b);
}
INTGEMM_AVX512BW static inline __m512i mullo_epi16(__m512i a, __m512i b) {
return _mm512_mullo_epi16(a, b);
}
INTGEMM_AVX512BW static inline __m512i or_si(__m512i a, __m512i b) {
return _mm512_or_si512(a, b);
}
INTGEMM_AVX512BW static inline __m512i packs_epi16(__m512i a, __m512i b) {
return _mm512_packs_epi16(a, b);
}
/* g++ (Ubuntu 5.4.0-6ubuntu1~16.04.12) 5.4.0 20160609 has a bug:
* /usr/lib/gcc/x86_64-linux-gnu/5/include/avx512bwintrin.h is missing
* _mm512_packs_epi32 when compiled with debugging.
*/
#if !defined(__OPTIMIZE__) && (__GNUC__ == 5) && (__GNUC_MINOR__ == 4)
INTGEMM_AVX512BW static inline __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) __m512i packs_epi32(__m512i a, __m512i b) {
return reinterpret_cast<__m512i>(__builtin_ia32_packssdw512_mask(
reinterpret_cast<__v16si>(a),
reinterpret_cast<__v16si>(b),
reinterpret_cast<__v32hi>(_mm512_setzero_si512()),
0xffffffff));
}
#else
INTGEMM_AVX512BW static inline __m512i packs_epi32(__m512i a, __m512i b) {
return _mm512_packs_epi32(a, b);
}
#endif
template <> inline INTGEMM_AVX512BW __m512i set1_epi8<__m512i>(int8_t to) {
return _mm512_set1_epi8(to);
}
template <> inline INTGEMM_AVX512BW __m512i set1_epi16<__m512i>(int16_t to) {
return _mm512_set1_epi16(to);
}
template <> inline INTGEMM_AVX512BW __m512i set1_epi32<__m512i>(int32_t to) {
return _mm512_set1_epi32(to);
}
template <> inline INTGEMM_AVX512BW __m512d set1_pd<__m512d>(double to) {
return _mm512_set1_pd(to);
}
template <> inline INTGEMM_AVX512BW __m512 set1_ps<__m512>(float to) {
return _mm512_set1_ps(to);
}
template <> INTGEMM_AVX512BW inline __m512d setzero_pd<__m512d>() {
return _mm512_setzero_pd();
}
template <> INTGEMM_AVX512BW inline __m512 setzero_ps<__m512>() {
return _mm512_setzero_ps();
}
template <> INTGEMM_AVX512BW inline __m512i setzero_si<__m512i>() {
return _mm512_setzero_si512();
}
template <> INTGEMM_AVX512BW inline __m512 load_ps<__m512>(const float* from) {
return _mm512_load_ps(from);
}
/*
* Missing sign_epi8
*/
template <int imm8> INTGEMM_AVX512BW static inline __m512i slli_epi16(__m512i a) {
return _mm512_slli_epi16(a, imm8);
}
template <int imm8> INTGEMM_AVX512BW static inline __m512i srai_epi16(__m512i a) {
return _mm512_srai_epi16(a, imm8);
}
template <int imm8> INTGEMM_AVX512BW static inline __m512i srai_epi32(__m512i a) {
return _mm512_srai_epi32(a, imm8);
}
template <int imm8> INTGEMM_AVX512BW static inline __m512i srli_epi16(__m512i a) {
return _mm512_srli_epi16(a, imm8);
}
INTGEMM_AVX512BW static inline void storeu_ps(float* mem_addr, __m512 a) {
_mm512_storeu_ps(mem_addr, a);
}
INTGEMM_AVX512BW static inline __m512d sub_pd(__m512d a, __m512d b) {
return _mm512_sub_pd(a, b);
}
INTGEMM_AVX512BW static inline __m512 sub_ps(__m512 a, __m512 b) {
return _mm512_sub_ps(a, b);
}
INTGEMM_AVX512BW static inline __m512i unpacklo_epi8(__m512i a, __m512i b) {
return _mm512_unpacklo_epi8(a, b);
}
INTGEMM_AVX512BW static inline __m512i unpackhi_epi8(__m512i a, __m512i b) {
return _mm512_unpackhi_epi8(a, b);
}
INTGEMM_AVX512BW static inline __m512i unpacklo_epi16(__m512i a, __m512i b) {
return _mm512_unpacklo_epi16(a, b);
}
INTGEMM_AVX512BW static inline __m512i unpackhi_epi16(__m512i a, __m512i b) {
return _mm512_unpackhi_epi16(a, b);
}
INTGEMM_AVX512BW static inline __m512i unpacklo_epi32(__m512i a, __m512i b) {
return _mm512_unpacklo_epi32(a, b);
}
INTGEMM_AVX512BW static inline __m512i unpackhi_epi32(__m512i a, __m512i b) {
return _mm512_unpackhi_epi32(a, b);
}
INTGEMM_AVX512BW static inline __m512i unpacklo_epi64(__m512i a, __m512i b) {
return _mm512_unpacklo_epi64(a, b);
}
INTGEMM_AVX512BW static inline __m512i unpackhi_epi64(__m512i a, __m512i b) {
return _mm512_unpackhi_epi64(a, b);
}
INTGEMM_AVX512BW static inline __m512i xor_si(__m512i a, __m512i b) {
return _mm512_xor_si512(a, b);
}
#endif
}

26
third_party/intgemm/intgemm/kernels.h vendored
View file

@ -1,26 +0,0 @@
#pragma once
#include "intgemm/intgemm_config.h"
#include "intrinsics.h"
#include "types.h"
#include "utils.h"
#include "vec_traits.h"
#include <cstdlib>
#define KERNELS_THIS_IS_SSE2
#include "kernels/implementations.inl"
#undef KERNELS_THIS_IS_SSE2
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
#define KERNELS_THIS_IS_AVX2
#include "kernels/implementations.inl"
#undef KERNELS_THIS_IS_AVX2
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
#define KERNELS_THIS_IS_AVX512BW
#include "kernels/implementations.inl"
#undef KERNELS_THIS_IS_AVX512BW
#endif

456
third_party/intgemm/intgemm/kernels/implementations.inl vendored
View file

@ -1,456 +0,0 @@
/* This file is included multiple times, once for each backend instruction set. */
#if defined(KERNELS_THIS_IS_SSE2)
#define CPU_NAME SSE2
#define CPU_ATTR INTGEMM_SSE2
#elif defined(KERNELS_THIS_IS_AVX2)
#define CPU_NAME AVX2
#define CPU_ATTR INTGEMM_AVX2
#elif defined(KERNELS_THIS_IS_AVX512BW)
#define CPU_NAME AVX512BW
#define CPU_ATTR INTGEMM_AVX512BW
#else
#error "Only SSE2, AVX2 and AVX512BW are supported"
#endif
#define vi vector_t<CPUType::CPU_NAME, int>
#define vf vector_t<CPUType::CPU_NAME, float>
#define vd vector_t<CPUType::CPU_NAME, double>
/*
* Kernels implementations....
*/
namespace intgemm {
namespace kernels {
/*
* Write
*/
CPU_ATTR static inline void write(vi input, int8_t* output, Index offset) {
*reinterpret_cast<vi*>(output + offset) = input;
}
CPU_ATTR static inline void write(vi input, int16_t* output, Index offset) {
*reinterpret_cast<vi*>(output + offset) = input;
}
CPU_ATTR static inline void write(vi input, int* output, Index offset) {
*reinterpret_cast<vi*>(output + offset) = input;
}
CPU_ATTR static inline void write(vf input, float* output, Index offset) {
*reinterpret_cast<vf*>(output + offset) = input;
}
CPU_ATTR static inline void write(vd input, double* output, Index offset) {
*reinterpret_cast<vd*>(output + offset) = input;
}
/*
* Quantize
*/
CPU_ATTR static inline vi quantize(vf input, vf quant_mult) {
return cvtps_epi32(mul_ps(input, quant_mult));
}
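/* quant_mult is typically chosen by the caller as 127 / max|input|; with that
 * choice an input of 0.5 * max|input| maps to round(63.5) = 64 here, and
 * unquantize below multiplies by a matching unquant_mult to map back to floats. */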
/*
* Unquantize
*/
CPU_ATTR static inline vf unquantize(vi input, vf unquant_mult) {
return mul_ps(cvtepi32_ps(input), unquant_mult);
}
/*
* Add a bias term
*/
CPU_ATTR static inline vi add_bias(vi input, const int8_t* bias_addr, Index bias_offset) {
auto bias_term = *reinterpret_cast<const vi*>(bias_addr + bias_offset);
return add_epi8(input, bias_term);
}
CPU_ATTR static inline vi add_bias(vi input, const int16_t* bias_addr, Index bias_offset) {
auto bias_term = *reinterpret_cast<const vi*>(bias_addr + bias_offset);
return add_epi16(input, bias_term);
}
CPU_ATTR static inline vi add_bias(vi input, const int* bias_addr, Index bias_offset) {
auto bias_term = *reinterpret_cast<const vi*>(bias_addr + bias_offset);
return add_epi32(input, bias_term);
}
CPU_ATTR static inline vf add_bias(vf input, const float* bias_addr, Index bias_offset) {
auto bias_term = *reinterpret_cast<const vf*>(bias_addr + bias_offset);
return add_ps(input, bias_term);
}
CPU_ATTR static inline vd add_bias(vd input, const double* bias_addr, Index bias_offset) {
auto bias_term = *reinterpret_cast<const vd*>(bias_addr + bias_offset);
return add_pd(input, bias_term);
}
/*
* ReLU
*/
template <typename Type>
CPU_ATTR static inline vector_t<CPUType::CPU_NAME, Type> relu(vector_t<CPUType::CPU_NAME, Type> input);
template <>
CPU_ATTR inline vi relu<int8_t>(vi input) {
static const auto vconst_zero = set1_epi8<vi>(0);
#if defined(KERNELS_THIS_IS_SSE2)
return and_si(input, _mm_cmplt_epi8(vconst_zero, input));
#elif defined(KERNELS_THIS_IS_AVX2)
return _mm256_max_epi8(input, vconst_zero);
#else
return _mm512_max_epi8(input, vconst_zero);
#endif
}
template <>
CPU_ATTR inline vi relu<int16_t>(vi input) {
static const auto vconst_zero = set1_epi16<vi>(0);
return max_epi16(input, vconst_zero);
}
template <>
CPU_ATTR inline vi relu<int>(vi input) {
static const auto vconst_zero = set1_epi32<vi>(0);
#if defined(KERNELS_THIS_IS_SSE2)
return and_si(input, _mm_cmplt_epi32(vconst_zero, input));
#elif defined(KERNELS_THIS_IS_AVX2)
return _mm256_max_epi32(input, vconst_zero);
#else
return _mm512_max_epi32(input, vconst_zero);
#endif
}
template <>
CPU_ATTR inline vf relu<float>(vf input) {
static const auto vconst_zero = setzero_ps<vf>();
return max_ps(input, vconst_zero);
}
template <>
CPU_ATTR inline vd relu<double>(vd input) {
static const auto vconst_zero = setzero_pd<vd>();
return max_pd(input, vconst_zero);
}
/*
* Multiply (elemwise)
*/
template <typename Type>
CPU_ATTR static inline vector_t<CPUType::CPU_NAME, Type> multiply(vector_t<CPUType::CPU_NAME, Type> a, vector_t<CPUType::CPU_NAME, Type> b);
template <>
CPU_ATTR inline vi multiply<int8_t>(vi a, vi b) {
auto even = mullo_epi16(a, b);
auto odd = mullo_epi16(srli_epi16<8>(a), srli_epi16<8>(b));
return or_si(slli_epi16<8>(odd), srli_epi16<8>(slli_epi16<8>(even)));
}
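/* The int8 path above emulates a byte multiply with two 16-bit multiplies:
 * even-indexed bytes are multiplied in place, odd-indexed bytes after shifting
 * them into the low byte of each 16-bit lane; only the low byte of each product
 * is kept and the two halves are merged back together with shifts and an OR. */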
template <>
CPU_ATTR inline vi multiply<int16_t>(vi a, vi b) {
return mullo_epi16(a, b);
}
template <>
CPU_ATTR inline vi multiply<int>(vi a, vi b) {
#if defined(KERNELS_THIS_IS_SSE2)
auto even = mul_epu32(a, b);
auto odd = mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));
return unpacklo_epi32(_mm_shuffle_epi32(even, 0x8 /* = 0 0 2 0 */), _mm_shuffle_epi32(odd, 0x8 /* = 0 0 2 0 */));
#elif defined(KERNELS_THIS_IS_AVX2)
return _mm256_mullo_epi32(a, b);
#else
return _mm512_mullo_epi32(a, b);
#endif
}
template <>
CPU_ATTR inline vf multiply<float>(vf a, vf b) {
return mul_ps(a, b);
}
template <>
CPU_ATTR inline vd multiply<double>(vd a, vd b) {
return mul_pd(a, b);
}
/*
* Downcast
*/
CPU_ATTR static inline vi downcast32to8(vi input1, vi input2, vi input3, vi input4) {
auto result = packs_epi16(packs_epi32(input1, input2), packs_epi32(input3, input4));
#if defined(KERNELS_THIS_IS_SSE2)
return result;
#elif defined(KERNELS_THIS_IS_AVX2)
return _mm256_shuffle_epi32(_mm256_permute4x64_epi64(result, 0xd8 /* = 0 2 1 3 */), 0xd8 /* = 0 2 1 3 */);
#else
static const auto permutation_indices = _mm512_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
return _mm512_castps_si512(_mm512_permutexvar_ps(permutation_indices, _mm512_castsi512_ps(result)));
#endif
}
CPU_ATTR static inline vi downcast32to16(vi input1, vi input2) {
auto result = packs_epi32(input1, input2);
#if defined(KERNELS_THIS_IS_SSE2)
return result;
#elif defined(KERNELS_THIS_IS_AVX2)
return _mm256_permute4x64_epi64(result, 0xd8 /* = 0 2 1 3 */);
#else
static const auto permutation_indices = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);
return _mm512_castpd_si512(_mm512_permutexvar_pd(permutation_indices, _mm512_castsi512_pd(result)));
#endif
}
CPU_ATTR static inline vi downcast16to8(vi input1, vi input2) {
auto result = packs_epi16(input1, input2);
#if defined(KERNELS_THIS_IS_SSE2)
return result;
#elif defined(KERNELS_THIS_IS_AVX2)
return _mm256_permute4x64_epi64(result, 0xd8 /* = 0 2 1 3 */);
#else
static const auto permutation_indices = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);
return _mm512_castpd_si512(_mm512_permutexvar_pd(permutation_indices, _mm512_castsi512_pd(result)));
#endif
}
/*
* Upcast
*/
CPU_ATTR static inline dvector_t<CPUType::CPU_NAME, int16_t> upcast8to16(vi input) {
static const auto vzero = set1_epi8<vi>(0);
#if defined(KERNELS_THIS_IS_SSE2)
auto higher_byte = _mm_cmpgt_epi8(vzero, input);
#elif defined(KERNELS_THIS_IS_AVX2)
input = _mm256_permute4x64_epi64(input, 0xd8 /* = 0 2 1 3 */);
auto higher_byte = _mm256_cmpgt_epi8(vzero, input);
#else
static const auto vmax_negative = set1_epi8<vi>(-1 /* 0xff */);
static const auto permutation_indices = _mm512_set_epi64(7, 3, 6, 2, 5, 1, 4, 0);
input = _mm512_castpd_si512(_mm512_permutexvar_pd(permutation_indices, _mm512_castsi512_pd(input)));
auto negatives = _mm512_cmp_epi8_mask(input, vzero, 1 /* _MM_CMPINT_LT */);
auto higher_byte = _mm512_mask_blend_epi8(negatives, vzero, vmax_negative);
#endif
return {
unpacklo_epi8(input, higher_byte),
unpackhi_epi8(input, higher_byte),
};
}
CPU_ATTR static inline dvector_t<CPUType::CPU_NAME, int> upcast16to32(vi input) {
static const auto vzero = set1_epi16<vi>(0);
#if defined(KERNELS_THIS_IS_SSE2)
auto higher_byte = _mm_cmpgt_epi16(vzero, input);
#elif defined(KERNELS_THIS_IS_AVX2)
input = _mm256_permute4x64_epi64(input, 0xd8 /* = 0 2 1 3 */);
auto higher_byte = _mm256_cmpgt_epi16(vzero, input);
#else
static const auto vmax_negative = set1_epi16<vi>(-1 /* 0xffff */);
static const auto permutation_indices = _mm512_set_epi64(7, 3, 6, 2, 5, 1, 4, 0);
input = _mm512_castpd_si512(_mm512_permutexvar_pd(permutation_indices, _mm512_castsi512_pd(input)));
auto negatives = _mm512_cmp_epi16_mask(input, vzero, 1 /* _MM_CMPINT_LT */);
auto higher_byte = _mm512_mask_blend_epi16(negatives, vzero, vmax_negative);
#endif
return {
unpacklo_epi16(input, higher_byte),
unpackhi_epi16(input, higher_byte),
};
}
CPU_ATTR static inline qvector_t<CPUType::CPU_NAME, int> upcast8to32(vi input) {
auto result16 = upcast8to16(input);
auto result32a = upcast16to32(result16.first);
auto result32b = upcast16to32(result16.second);
return {
result32a.first,
result32a.second,
result32b.first,
result32b.second,
};
}
/*
* Rescale int32
*/
CPU_ATTR static inline vi rescale(vi input, vf scale) {
return cvtps_epi32(mul_ps(cvtepi32_ps(input), scale));
}
/*
* Bitwise not
*/
CPU_ATTR static inline vi bitwise_not(vi v) {
return xor_si(v, set1_epi32<vi>(0xffffffff));
}
/*
* Floor
*/
CPU_ATTR static inline vf floor(vf input) {
#if defined(KERNELS_THIS_IS_SSE2)
static const auto vconst_zero = setzero_ps<vf>();
static const auto vconst_one = set1_ps<vf>(1.f);
auto result = cvtepi32_ps(cvttps_epi32(input));
auto negatives = _mm_cmplt_ps(input, vconst_zero);
auto nonintegers = _mm_cmpneq_ps(input, result);
return sub_ps(result, and_ps(vconst_one, and_ps(negatives, nonintegers)));
#elif defined(KERNELS_THIS_IS_AVX2)
return _mm256_floor_ps(input);
#else
// TODO: It should work but the compiler throws the error "incorrect rounding operand"
// return _mm512_roundscale_round_ps(input, 0, _MM_FROUND_FLOOR);
static const auto vconst_zero = setzero_ps<vf>();
static const auto vconst_one = set1_ps<vf>(1.f);
auto result = cvtepi32_ps(cvttps_epi32(input));
auto negatives = _mm512_cmp_ps_mask(input, vconst_zero, _CMP_LT_OQ);
auto nonintegers = _mm512_cmp_ps_mask(input, result, _CMP_NEQ_OQ);
return _mm512_mask_blend_ps(_mm512_kand(negatives, nonintegers), result, sub_ps(result, vconst_one));
#endif
}
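/* Why the SSE2/AVX512 emulation subtracts 1: cvttps truncates toward zero, so
 * for a negative non-integer such as -1.5 the truncated value is -1.0; the
 * input is negative and differs from the truncation, so 1 is subtracted to get
 * the true floor of -2.0. Non-negative or integral inputs are left unchanged. */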
/*
* Calculate approximation of e^x using Taylor series and lookup table
*/
#if defined(KERNELS_THIS_IS_SSE2)
CPU_ATTR static inline vf exp_approx_taylor(vf) {
std::abort();
}
#else
CPU_ATTR static inline vf exp_approx_taylor(vf x) {
static constexpr int EXP_MIN = -20;
static constexpr int EXP_MAX = 20;
static constexpr float EXP_LOOKUP[EXP_MAX - EXP_MIN + 1] = {
expif(-20), expif(-19), expif(-18), expif(-17), expif(-16), expif(-15),
expif(-14), expif(-13), expif(-12), expif(-11), expif(-10), expif(-9),
expif(-8), expif(-7), expif(-6), expif(-5), expif(-4), expif(-3), expif(-2),
expif(-1), expif(0), expif(1), expif(2), expif(3), expif(4), expif(5),
expif(6), expif(7), expif(8), expif(9), expif(10), expif(11), expif(12),
expif(13), expif(14), expif(15), expif(16), expif(17), expif(18), expif(19),
expif(20),
};
static const vf dividers[] = {
set1_ps<vf>(1.f / factorial(7)),
set1_ps<vf>(1.f / factorial(6)),
set1_ps<vf>(1.f / factorial(5)),
set1_ps<vf>(1.f / factorial(4)),
set1_ps<vf>(1.f / factorial(3)),
set1_ps<vf>(1.f / factorial(2)),
set1_ps<vf>(1.f / factorial(1)),
};
static const auto const_one = set1_ps<vf>(1.f);
static const auto const_min_x = set1_ps<vf>(EXP_MIN);
static const auto const_max_x = set1_ps<vf>(EXP_MAX);
x = max_ps(x, const_min_x);
x = min_ps(x, const_max_x);
auto a = floor(x);
auto xa = sub_ps(x, a);
auto result = mul_ps(dividers[0], xa);
result = add_ps(result, dividers[1]);
result = mul_ps(result, xa);
result = add_ps(result, dividers[2]);
result = mul_ps(result, xa);
result = add_ps(result, dividers[3]);
result = mul_ps(result, xa);
result = add_ps(result, dividers[4]);
result = mul_ps(result, xa);
result = add_ps(result, dividers[5]);
result = mul_ps(result, xa);
result = add_ps(result, dividers[6]);
result = mul_ps(result, xa);
result = add_ps(result, const_one);
auto ea = i32gather_ps<4>(EXP_LOOKUP + EXP_MAX, cvtps_epi32(a));
return mul_ps(ea, result);
}
#endif
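/* Sketch of the math above: x is clamped to [EXP_MIN, EXP_MAX] and split as
 * x = a + r with a = floor(x) and r in [0, 1). e^r is approximated by the
 * degree-7 Taylor polynomial 1 + r + r^2/2! + ... + r^7/7!, evaluated by
 * Horner's rule (the dividers array holds the reciprocal factorials), and the
 * result is scaled by e^a gathered from EXP_LOOKUP + EXP_MAX, so a = EXP_MIN
 * maps to element 0 of the table. */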
/*
* Sigmoid
*/
CPU_ATTR static inline vf sigmoid(vf
#ifndef KERNELS_THIS_IS_SSE2
input
#endif
) {
#if defined(KERNELS_THIS_IS_SSE2)
std::abort(); // TODO: missing exp_approx_taylor for SSE2
#elif defined(KERNELS_THIS_IS_AVX2)
static const auto vconst_zero = setzero_ps<vf>();
static const auto vconst_one = set1_ps<vf>(1.f);
auto x = input;
auto minus_x = sub_ps(vconst_zero, x);
auto e_x = exp_approx_taylor(x);
auto e_minus_x = exp_approx_taylor(minus_x);
auto sigmoid_case1 = _mm256_rcp_ps(add_ps(vconst_one, e_minus_x));
auto sigmoid_case2 = mul_ps(e_x, _mm256_rcp_ps(add_ps(vconst_one, e_x)));
auto nonnegative_x_mask = _mm256_cmp_ps(vconst_zero, x, _CMP_LT_OS);
return _mm256_blendv_ps(sigmoid_case1, sigmoid_case2, nonnegative_x_mask);
#else
static const auto vconst_zero = setzero_ps<vf>();
static const auto vconst_one = set1_ps<vf>(1.f);
auto x = input;
auto minus_x = sub_ps(vconst_zero, x);
auto e_x = exp_approx_taylor(x);
auto e_minus_x = exp_approx_taylor(minus_x);
auto sigmoid_case1 = _mm512_rcp14_ps(add_ps(vconst_one, e_minus_x));
auto sigmoid_case2 = mul_ps(e_x, _mm512_rcp14_ps(add_ps(vconst_one, e_x)));
auto nonnegative_x_mask = _mm512_cmp_ps_mask(vconst_zero, x, _CMP_LT_OS);
return _mm512_mask_blend_ps(nonnegative_x_mask, sigmoid_case1, sigmoid_case2);
#endif
}
/*
* Tanh
*/
#if defined(KERNELS_THIS_IS_SSE2)
CPU_ATTR static inline vf tanh(vf) {
std::abort(); // TODO: missing exp_approx_taylor for SSE2
}
#else
CPU_ATTR static inline vf tanh(vf input) {
const static auto vconst_zero = setzero_ps<vf>();
auto e_x = exp_approx_taylor(input);
auto e_minus_x = exp_approx_taylor(sub_ps(vconst_zero, input));
return div_ps(sub_ps(e_x, e_minus_x), add_ps(e_x, e_minus_x));
}
#endif
}
}
#undef CPU_NAME
#undef CPU_ATTR
#undef vi
#undef vf
#undef vd

626
third_party/intgemm/intgemm/multiply.h vendored
View file

@ -1,626 +0,0 @@
#pragma once
#include "intgemm/intgemm_config.h"
#include "interleave.h"
#include "intrinsics.h"
#include "vec_traits.h"
#include "callbacks.h"
namespace intgemm {
INTGEMM_SSE2 static inline dvector_t<CPUType::SSE2, int> PermuteSummer(__m128i pack0123, __m128i pack4567) {
// No op for 128 bits: already reduced fully.
return { pack0123, pack4567 };
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
INTGEMM_AVX2 static inline __m256i PermuteSummer(__m256i pack0123, __m256i pack4567) {
// This instruction generates 1s 2s 3s 4s 5f 6f 7f 8f
__m256i rev = _mm256_permute2f128_si256(pack0123, pack4567, 0x21);
// This instruction generates 1f 2f 3f 4f 5s 6s 7s 8s
__m256i blended = _mm256_blend_epi32(pack0123, pack4567, 0xf0);
return _mm256_add_epi32(rev, blended);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
/* Only INTGEMM_AVX512F is necessary but due to a GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_AVX512BW static inline __m256i PermuteSummer(__m512i pack0123, __m512i pack4567) {
// Form [0th 128-bit register of pack0123, 0th 128-bit register of pack4567, 2nd 128-bit register of pack0123, 2nd 128-bit register of pack4567]
__m512i mix0 = _mm512_mask_permutex_epi64(pack0123, 0xcc, pack4567, (0 << 4) | (1 << 6));
// Form [1st 128-bit register of pack0123, 1st 128-bit register of pack4567, 3rd 128-bit register of pack0123, 3rd 128-bit register of pack4567]
__m512i mix1 = _mm512_mask_permutex_epi64(pack4567, 0x33, pack0123, 2 | (3 << 2));
__m512i added = _mm512_add_epi32(mix0, mix1);
// Now we have 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7.
// Fold register over itself.
return _mm256_add_epi32(_mm512_castsi512_si256(added), _mm512_extracti64x4_epi64(added, 1));
}
#endif
#ifdef _MSC_VER
#define INTGEMM_OMP_FOR __pragma(omp for)
#define INTGEMM_OMP_PARALLEL __pragma(omp parallel)
#else
#define INTGEMM_OMP_FOR _Pragma("omp for")
#define INTGEMM_OMP_PARALLEL _Pragma("omp parallel")
#endif
// Quantize function used for SSSE3 and AVX2.
// Separate function for the thread body, to work around a gcc 7 bug that doesn't imbue
// target attributes across #pragma omp parallel.
#define INTGEMM_QUANTIZE_THREAD(target) \
target static void QuantizeThread(const float *input, int8_t *output, float quant_mult, std::size_t count) { \
FRegister q = set1_ps<FRegister>(quant_mult); \
INTGEMM_OMP_FOR \
for (std::size_t i = 0; i < count; i += sizeof(Register)) { \
*reinterpret_cast<Register*>(output + i) = QuantizeTile8::Consecutive(q, input + i); \
} \
}
#define INTGEMM_QUANTIZE(target) \
target static void Quantize(const float *const input, int8_t *const output, float quant_mult, Index size) { \
assert(reinterpret_cast<uintptr_t>(input) % sizeof(Register) == 0); \
assert(reinterpret_cast<uintptr_t>(output) % sizeof(Register) == 0); \
const std::size_t kBatch = sizeof(Register); \
const std::size_t fast_end = size & ~(kBatch - 1); \
INTGEMM_OMP_PARALLEL \
{ \
QuantizeThread(input, output, quant_mult, fast_end); \
} \
std::size_t overhang = size & (kBatch - 1); \
if (!overhang) return; \
FRegister q = set1_ps<FRegister>(quant_mult); \
/* Each input pointer covers sizeof(Register) / 4 == kBatch / 4 floats at a time.
* If we're allowed to read one of them, then we can read the whole register. */ \
const float *inputs[4]; \
std::size_t i; \
for (i = 0; i < (overhang + (kBatch / 4) - 1) / (kBatch / 4); ++i) { \
inputs[i] = &input[fast_end + i * (kBatch / 4)]; \
} \
/* These will be clipped off. */ \
for (; i < 4; ++i) { \
inputs[i] = &input[fast_end]; \
} \
Register result = QuantizeTile8::Tile(q, inputs[0], inputs[1], inputs[2], inputs[3]); \
std::memcpy(output + (size & ~(kBatch - 1)), &result, overhang); \
}
/* Take 4 registers with 32-bit values to be horizontally added. Reduce them
* to one register with 32-bit values in the pattern 1 2 3 4 1 2 3 4, leaving
* the final addition (which crosses 128-bit lanes) to the caller.
*/
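/* Worked example for one 128-bit lane: with sum0 = [a0 a1 a2 a3],
 * sum1 = [b0 b1 b2 b3], sum2 = [c0 c1 c2 c3], sum3 = [d0 d1 d2 d3], the
 * interleave/add steps pair the registers up so the result is
 * [a0+a1+a2+a3, b0+b1+b2+b3, c0+c1+c2+c3, d0+d1+d2+d3], i.e. one horizontal
 * sum per input register, repeated per lane for the wider register types. */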
#define INTGEMM_PACK0123(target, Register) \
target inline Register Pack0123(Register sum0, Register sum1, Register sum2, Register sum3) { \
Interleave32(sum0, sum1); \
Register pack01 = add_epi32(sum0, sum1); \
Interleave32(sum2, sum3); \
Register pack23 = add_epi32(sum2, sum3); \
Interleave64(pack01, pack23); \
return add_epi32(pack01, pack23); \
} \
INTGEMM_PACK0123(INTGEMM_SSE2, __m128i)
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
INTGEMM_PACK0123(INTGEMM_AVX2, __m256i)
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
/* Only INTGEMM_AVX512F is necessary but due to a GCC 5.4 bug we have to set INTGEMM_AVX512BW */
INTGEMM_PACK0123(INTGEMM_AVX512BW, __m512i)
#endif
template <typename Callback>
INTGEMM_SSE2 static inline void RunCallback(Callback& callback_impl, dvector_t<CPUType::SSE2, int> total, Index row_idx, Index col_idx, Index rows, Index cols) {
callback_impl.Run(total.first, callbacks::OutputBufferInfo(row_idx, col_idx, rows, cols));
callback_impl.Run(total.second, callbacks::OutputBufferInfo(row_idx, col_idx + 4, rows, cols));
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
template <typename Callback>
INTGEMM_AVX2 static inline void RunCallback(Callback& callback_impl, vector_t<CPUType::AVX2, int> total, Index row_idx, Index col_idx, Index rows, Index cols) {
callback_impl.Run(total, callbacks::OutputBufferInfo(row_idx, col_idx, rows, cols));
}
#endif
// 16-bit multiplier for INTGEMM_SSE2, INTGEMM_AVX2, and AVX512.
// C = A * B * unquant_mult
//
// This has been substantially revised from Jacob Devlin's SSE code which is:
// Copyright (c) 2017 Microsoft Corporation
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
// A is a row-major quantized matrix (from PrepareA)
// B is a rearranged quantized matrix (from PrepareB)
// C is output in row-major form.
//
// All of A, B, and C must be aligned to a multiple of the register size:
// INTGEMM_SSE2: 16 bytes
// INTGEMM_AVX2: 32 bytes
// AVX512: 64 bytes.
//
// A_rows can be anything non-negative.
// width must be a multiple of the register size.
// B_cols must be a multiple of 8.
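// For example, with the SSE2 16-bit kernel (16-byte registers) this means:
// A, B, and C aligned to 16 bytes, width a multiple of 8 (= 16 / sizeof(int16_t)),
// and B_cols a multiple of 8.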
// Multiply16
#define INTGEMM_MULTIPLY16(Register, target, cpu_type) \
template <typename Callback> target static void Multiply(const int16_t *A, const int16_t *B, Index A_rows, Index width, Index B_cols, Callback callback) { \
assert(width % (sizeof(Register) / sizeof(int16_t)) == 0); \
assert(B_cols % 8 == 0); \
assert(reinterpret_cast<uintptr_t>(A) % sizeof(Register) == 0); \
assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0); \
const Index simd_width = width / (sizeof(Register) / sizeof(int16_t)); \
auto callback_impl = callbacks::CallbackImpl<cpu_type, Callback>(callback); \
INTGEMM_OMP_FOR \
for (Index B0_colidx = 0; B0_colidx < B_cols; B0_colidx += 8) { \
const Register *B0_col = reinterpret_cast<const Register *>(B) + simd_width * B0_colidx; \
/* Process one row of A at a time. Doesn't seem to be faster to do multiple rows of A at once.*/ \
for (Index A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) { \
const Register *A_row = reinterpret_cast<const Register*>(A + A_rowidx * width); \
/* These will be packed 32-bit integers containing sums for each row of B multiplied by the row of A. \
Iterate over shared (inner) dimension.*/ \
Index k = 0; \
Register a = *(A_row + k); \
Register sum0 = madd_epi16(a, *(B0_col + k * 8)); \
Register sum1 = madd_epi16(a, *(B0_col + k * 8 + 1)); \
Register sum2 = madd_epi16(a, *(B0_col + k * 8 + 2)); \
Register sum3 = madd_epi16(a, *(B0_col + k * 8 + 3)); \
Register sum4 = madd_epi16(a, *(B0_col + k * 8 + 4)); \
Register sum5 = madd_epi16(a, *(B0_col + k * 8 + 5)); \
Register sum6 = madd_epi16(a, *(B0_col + k * 8 + 6)); \
Register sum7 = madd_epi16(a, *(B0_col + k * 8 + 7)); \
for (k = 1; k < simd_width; ++k) { \
a = *(A_row + k); \
/* Multiply 16-bit, horizontally add to packed 32-bit integers.*/ \
Register mult0 = madd_epi16(a, *(B0_col + k * 8)); \
Register mult1 = madd_epi16(a, *(B0_col + k * 8 + 1)); \
Register mult2 = madd_epi16(a, *(B0_col + k * 8 + 2)); \
Register mult3 = madd_epi16(a, *(B0_col + k * 8 + 3)); \
Register mult4 = madd_epi16(a, *(B0_col + k * 8 + 4)); \
Register mult5 = madd_epi16(a, *(B0_col + k * 8 + 5)); \
Register mult6 = madd_epi16(a, *(B0_col + k * 8 + 6)); \
Register mult7 = madd_epi16(a, *(B0_col + k * 8 + 7)); \
/* Sum packed 32-bit integers with danger of overflow. TODO: accumulate in 64-bit every so often.*/ \
sum0 = add_epi32(sum0, mult0); \
sum1 = add_epi32(sum1, mult1); \
sum2 = add_epi32(sum2, mult2); \
sum3 = add_epi32(sum3, mult3); \
sum4 = add_epi32(sum4, mult4); \
sum5 = add_epi32(sum5, mult5); \
sum6 = add_epi32(sum6, mult6); \
sum7 = add_epi32(sum7, mult7); \
} \
/* Reduce sums within 128-bit lanes.*/ \
Register pack0123 = Pack0123(sum0, sum1, sum2, sum3); \
Register pack4567 = Pack0123(sum4, sum5, sum6, sum7); \
/*The specific implementation may need to reduce further.*/ \
auto total = PermuteSummer(pack0123, pack4567); \
RunCallback(callback_impl, total, A_rowidx, B0_colidx, A_rows, B_cols); \
} \
} \
} \
//An int8_prepbias version of the above code, using the add 127 technique
#define INTGEMM_PREPAREBIASFOR8(Register, target, cpu_type) \
template <class Callback> target static void PrepareBias(const int8_t *B, Index width, Index B_cols, Callback callback) { \
assert(width % (sizeof(Register) / sizeof(int8_t)) == 0); \
assert(B_cols % 8 == 0); \
assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0); \
const Index simd_width = width / (sizeof(Register) / sizeof(int8_t)); \
auto callback_impl = callbacks::CallbackImpl<cpu_type, Callback>(callback); \
const Register a = set1_epi8<Register>(1); \
INTGEMM_OMP_FOR \
for (Index B0_colidx = 0; B0_colidx < B_cols; B0_colidx += 8) { \
const Register *B0_col = reinterpret_cast<const Register *>(B) + simd_width * B0_colidx; \
/*const Register *A_row = reinterpret_cast<const Register*>(A + A_rowidx * width);*/ \
/* These will be packed 16-bit integers containing sums for each row of B multiplied by the row of A. \
Iterate over shared (inner) dimension.*/ \
Index k = 0; \
Register sum0 = maddubs_epi16(a, *(B0_col + k * 8)); \
Register sum1 = maddubs_epi16(a, *(B0_col + k * 8 + 1)); \
Register sum2 = maddubs_epi16(a, *(B0_col + k * 8 + 2)); \
Register sum3 = maddubs_epi16(a, *(B0_col + k * 8 + 3)); \
Register sum4 = maddubs_epi16(a, *(B0_col + k * 8 + 4)); \
Register sum5 = maddubs_epi16(a, *(B0_col + k * 8 + 5)); \
Register sum6 = maddubs_epi16(a, *(B0_col + k * 8 + 6)); \
Register sum7 = maddubs_epi16(a, *(B0_col + k * 8 + 7)); \
/* Upcast to 32-bit and horizontally add. Seems a bit faster if this is declared here.*/ \
Register ones = set1_epi16<Register>(1); \
sum0 = madd_epi16(sum0, ones); \
sum1 = madd_epi16(sum1, ones); \
sum2 = madd_epi16(sum2, ones); \
sum3 = madd_epi16(sum3, ones); \
sum4 = madd_epi16(sum4, ones); \
sum5 = madd_epi16(sum5, ones); \
sum6 = madd_epi16(sum6, ones); \
sum7 = madd_epi16(sum7, ones); \
for (k = 1; k < simd_width; ++k) { \
/*Register a = *(A_row + k);*/ \
/* Multiply 8-bit, horizontally add to packed 16-bit integers.*/ \
Register mult0 = maddubs_epi16(a, *(B0_col + k * 8)); \
Register mult1 = maddubs_epi16(a, *(B0_col + k * 8 + 1)); \
Register mult2 = maddubs_epi16(a, *(B0_col + k * 8 + 2)); \
Register mult3 = maddubs_epi16(a, *(B0_col + k * 8 + 3)); \
Register mult4 = maddubs_epi16(a, *(B0_col + k * 8 + 4)); \
Register mult5 = maddubs_epi16(a, *(B0_col + k * 8 + 5)); \
Register mult6 = maddubs_epi16(a, *(B0_col + k * 8 + 6)); \
Register mult7 = maddubs_epi16(a, *(B0_col + k * 8 + 7)); \
/* Upcast to 32-bit and horizontally add.*/ \
mult0 = madd_epi16(mult0, ones); \
mult1 = madd_epi16(mult1, ones); \
mult2 = madd_epi16(mult2, ones); \
mult3 = madd_epi16(mult3, ones); \
mult4 = madd_epi16(mult4, ones); \
mult5 = madd_epi16(mult5, ones); \
mult6 = madd_epi16(mult6, ones); \
mult7 = madd_epi16(mult7, ones); \
/*Add in 32bit*/ \
sum0 = add_epi32(sum0, mult0); \
sum1 = add_epi32(sum1, mult1); \
sum2 = add_epi32(sum2, mult2); \
sum3 = add_epi32(sum3, mult3); \
sum4 = add_epi32(sum4, mult4); \
sum5 = add_epi32(sum5, mult5); \
sum6 = add_epi32(sum6, mult6); \
sum7 = add_epi32(sum7, mult7); \
\
} \
/* Reduce sums within 128-bit lanes.*/ \
Register pack0123 = Pack0123(sum0, sum1, sum2, sum3); \
Register pack4567 = Pack0123(sum4, sum5, sum6, sum7); \
/*The specific implementation may need to reduce further.*/ \
auto total = PermuteSummer(pack0123, pack4567); \
RunCallback(callback_impl, total, 0, B0_colidx, 1, B_cols); \
} \
} \
//An int8 version of the above code, using the add 127 technique
#define INTGEMM_MULTIPLY8SHIFT(Register, target, cpu_type) \
template <class Callback> target static void Multiply8Shift(const uint8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) { \
assert(width % (sizeof(Register) / sizeof(int8_t)) == 0); \
assert(B_cols % 8 == 0); \
assert(reinterpret_cast<uintptr_t>(A) % sizeof(Register) == 0); \
assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0); \
const Index simd_width = width / (sizeof(Register) / sizeof(int8_t)); \
auto callback_impl = callbacks::CallbackImpl<cpu_type, Callback>(callback); \
INTGEMM_OMP_FOR \
for (Index B0_colidx = 0; B0_colidx < B_cols; B0_colidx += 8) { \
const Register *B0_col = reinterpret_cast<const Register *>(B) + simd_width * B0_colidx; \
/* Process one row of A at a time. Doesn't seem to be faster to do multiple rows of A at once.*/ \
for (Index A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) { \
const Register *A_row = reinterpret_cast<const Register*>(A + A_rowidx * width); \
/* These will be packed 16-bit integers containing sums for each row of B multiplied by the row of A. \
Iterate over shared (inner) dimension.*/ \
Index k = 0; \
Register a = *(A_row + k); \
Register sum0 = maddubs_epi16(a, *(B0_col + k * 8)); \
Register sum1 = maddubs_epi16(a, *(B0_col + k * 8 + 1)); \
Register sum2 = maddubs_epi16(a, *(B0_col + k * 8 + 2)); \
Register sum3 = maddubs_epi16(a, *(B0_col + k * 8 + 3)); \
Register sum4 = maddubs_epi16(a, *(B0_col + k * 8 + 4)); \
Register sum5 = maddubs_epi16(a, *(B0_col + k * 8 + 5)); \
Register sum6 = maddubs_epi16(a, *(B0_col + k * 8 + 6)); \
Register sum7 = maddubs_epi16(a, *(B0_col + k * 8 + 7)); \
/* Upcast to 32-bit and horizontally add. Seems a bit faster if this is declared here.*/ \
Register ones = set1_epi16<Register>(1); \
sum0 = madd_epi16(sum0, ones); \
sum1 = madd_epi16(sum1, ones); \
sum2 = madd_epi16(sum2, ones); \
sum3 = madd_epi16(sum3, ones); \
sum4 = madd_epi16(sum4, ones); \
sum5 = madd_epi16(sum5, ones); \
sum6 = madd_epi16(sum6, ones); \
sum7 = madd_epi16(sum7, ones); \
for (k = 1; k < simd_width; ++k) { \
a = *(A_row + k); \
/* Multiply 8-bit, horizontally add to packed 16-bit integers.*/ \
Register mult0 = maddubs_epi16(a, *(B0_col + k * 8)); \
Register mult1 = maddubs_epi16(a, *(B0_col + k * 8 + 1)); \
Register mult2 = maddubs_epi16(a, *(B0_col + k * 8 + 2)); \
Register mult3 = maddubs_epi16(a, *(B0_col + k * 8 + 3)); \
Register mult4 = maddubs_epi16(a, *(B0_col + k * 8 + 4)); \
Register mult5 = maddubs_epi16(a, *(B0_col + k * 8 + 5)); \
Register mult6 = maddubs_epi16(a, *(B0_col + k * 8 + 6)); \
Register mult7 = maddubs_epi16(a, *(B0_col + k * 8 + 7)); \
/* Upcast to 32-bit and horizontally add.*/ \
mult0 = madd_epi16(mult0, ones); \
mult1 = madd_epi16(mult1, ones); \
mult2 = madd_epi16(mult2, ones); \
mult3 = madd_epi16(mult3, ones); \
mult4 = madd_epi16(mult4, ones); \
mult5 = madd_epi16(mult5, ones); \
mult6 = madd_epi16(mult6, ones); \
mult7 = madd_epi16(mult7, ones); \
/*Add in 32bit*/ \
sum0 = add_epi32(sum0, mult0); \
sum1 = add_epi32(sum1, mult1); \
sum2 = add_epi32(sum2, mult2); \
sum3 = add_epi32(sum3, mult3); \
sum4 = add_epi32(sum4, mult4); \
sum5 = add_epi32(sum5, mult5); \
sum6 = add_epi32(sum6, mult6); \
sum7 = add_epi32(sum7, mult7); \
\
} \
/* Reduce sums within 128-bit lanes.*/ \
Register pack0123 = Pack0123(sum0, sum1, sum2, sum3); \
Register pack4567 = Pack0123(sum4, sum5, sum6, sum7); \
/*The specific implementation may need to reduce further.*/ \
auto total = PermuteSummer(pack0123, pack4567); \
RunCallback(callback_impl, total, A_rowidx, B0_colidx, A_rows, B_cols); \
} \
} \
} \
/* 8-bit matrix multiply used by AVX and AVX2.
* These have three peculiar properties:
* 1. The sign instructions don't exist in AVX512.
* 2. 16 registers means gcc's register allocation failed so I wrote it in my
* own asm.
* 3. They support 3-argument vpsignb and vpmaddubsw.
*
* Fun fact: AVX introduced the three-argument vpsignb and vpmaddubsw but only
* for 128-bit, despite the primary change in AVX being the addition of
* 256-bit. We had to wait for INTGEMM_AVX2 to get 256-bit versions of vpsignb and
* vpmaddubsw. That's why this code is generic over 128-bit or 256-bit.
*/
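/* The sign trick in numbers: vpmaddubsw multiplies an unsigned byte by a signed
 * byte, so a signed*signed product a*b is computed as |a| * sign(b, a).
 * For a = -3, b = 5: |a| = 3, sign(5, -3) = -5, and 3 * -5 = -15 = a * b.
 * For a = 0 the sign instruction zeroes b, so the product is 0 either way. */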
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
INTGEMM_AVX2 inline static void InnerINTGEMM_AVX2(
__m256i a, const __m256i *b,
__m256i &sum0, __m256i &sum1, __m256i &sum2, __m256i &sum3,
__m256i &sum4, __m256i &sum5, __m256i &sum6, __m256i &sum7) {
// Annoyingly the only 8-bit multiply is signed * unsigned (maddubs).
// So we take the sign bits off of a and apply them to each b in a * b.
//
// We have only 16 YMM registers but we want to store:
// 1 for a (or |a|)
// 8 temporaries for applying sign to each column of B.
// 8 sums.
#if defined(__GNUC__) && !defined(__clang__)
// Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94663
// gcc's register allocator does:
// 1 for a, do all the sign application, then overwrite with |a|
// 8 temporaries
// 7 sums in registers + 1 on the stack
//
// But it's possible to complete an operation early, freeing up its
// temporary register for reuse. But completing an operation early
// requires us to have |a| for vpmaddubsw while completing the later
// operation needs a again to apply sign.
//
// So we do two columns, 0 and 1, early. This allows b0_b6 and b1_b7
// to be reused by columns 6 and 7, respectively. And there's enough
// registers to store both a and |a|.
//
// These are the temporary variables used to process each column of b.
// We let the compiler choose which register number is which, but force
// it to allocate all registers.
__m256i absa;
__m256i b0_b6, b1_b7, b2, b3, b4, b5;
// Maybe this will tell gcc that we're accessing 8 registers starting
// at B_live. Though I doubt it because we're passing the address as a
// register.
typedef struct { __m256i x[8]; } B_range;
asm(
// Copy the first 6 columns of b to registers. We assume B has
// been rearranged so that these 8 columns are consecutive.
// vpsignb does not take a memory address as its second argument,
// so this can't be inlined into vsignb.
"vmovdqa (%[B]), %[b0_b6]\n"
"vmovdqa %c[size](%[B]), %[b1_b7]\n"
// These multiplies are executed by the assembler, not by the CPU
// at run time.
// I would have liked to just initialize b2 etc above but that
// would make it an input argument "+x" instead of "=&x". And +x
// counts as two operands for purposes of gcc's annoying 30-operand
// limit.
"vmovdqa 2*%c[size](%[B]), %[b2]\n"
"vmovdqa 3*%c[size](%[B]), %[b3]\n"
"vmovdqa 4*%c[size](%[B]), %[b4]\n"
"vmovdqa 5*%c[size](%[B]), %[b5]\n"
// Store the absolute value of a in absa.
"vpabsb %[a], %[absa]\n"
// If a byte of a is negative, negate the corresponding byte in
// b0_b6 etc.
"vpsignb %[a], %[b0_b6], %[b0_b6]\n"
"vpsignb %[a], %[b1_b7], %[b1_b7]\n"
// Multiply signed * unsigned then horizontally add to form packed
// 16-bit integers:
// b0[0] * |a|[0] + b0[1] * |a|[1], b0[2] * |a|[2] + b0[3] * |a|[3], ...
"vpmaddubsw %[b0_b6], %[absa], %[b0_b6]\n"
"vpmaddubsw %[b1_b7], %[absa], %[b1_b7]\n"
// vpmaddubsw has latency 5 so work on some other sign bits while
// we're at it.
"vpsignb %[a], %[b2], %[b2]\n"
"vpsignb %[a], %[b3], %[b3]\n"
"vpsignb %[a], %[b4], %[b4]\n"
"vpsignb %[a], %[b5], %[b5]\n"
// Perform a 16-bit add with saturation to accumulate sums.
"vpaddsw %[b0_b6], %[sum0], %[sum0]\n"
// Now we can reuse b0_b6 for b6
"vmovdqa 6*%c[size](%[B]), %[b0_b6]\n"
"vpaddsw %[b1_b7], %[sum1], %[sum1]\n"
// Now we can reuse b1_b7 for b7
"vmovdqa 7*%c[size](%[B]), %[b1_b7]\n"
// More crunching while the load happens.
"vpmaddubsw %[b2], %[absa], %[b2]\n"
"vpmaddubsw %[b3], %[absa], %[b3]\n"
"vpmaddubsw %[b4], %[absa], %[b4]\n"
"vpsignb %[a], %[b0_b6], %[b0_b6]\n"
"vpsignb %[a], %[b1_b7], %[b1_b7]\n"
"vpmaddubsw %[b5], %[absa], %[b5]\n"
"vpmaddubsw %[b0_b6], %[absa], %[b0_b6]\n"
"vpmaddubsw %[b1_b7], %[absa], %[b1_b7]\n"
"vpaddsw %[b2], %[sum2], %[sum2]\n"
"vpaddsw %[b3], %[sum3], %[sum3]\n"
"vpaddsw %[b4], %[sum4], %[sum4]\n"
"vpaddsw %[b5], %[sum5], %[sum5]\n"
"vpaddsw %[b0_b6], %[sum6], %[sum6]\n"
"vpaddsw %[b1_b7], %[sum7], %[sum7]\n"
: [sum0] "+x" (sum0),
[sum1] "+x" (sum1),
[sum2] "+x" (sum2),
[sum3] "+x" (sum3),
[sum4] "+x" (sum4),
[sum5] "+x" (sum5),
[sum6] "+x" (sum6),
[sum7] "+x" (sum7),
[b0_b6] "=&x" (b0_b6),
[b1_b7] "=&x" (b1_b7),
[b2] "=&x" (b2),
[b3] "=&x" (b3),
[b4] "=&x" (b4),
[b5] "=&x" (b5),
[absa] "=&x" (absa)
:
// I would like to use m here but that non-deterministically
// chooses %(eax) or -256$(eax) and there's no way to add to that
// memory address:
// https://gcc.gnu.org/ml/gcc-help/2011-04/msg00518.html
//
[B] "r" (reinterpret_cast<const B_range*>(b)),
[a] "x" (a),
[size] "i" (sizeof(__m256i))
);
#else
// https://bugs.llvm.org/show_bug.cgi?id=41482
// clang has a bug: target attribute avx2 doesn't allow inline assembly with
// +x for YMM registers. For example, this will not compile with default
// arguments:
// __attribute__ ((target ("avx2"))) void Foo(__m256i sum0) {
// asm("" : [sum0] "+x" (sum0));
// }
// but it will compile with -mavx2.
// However, clang does allow intrinsics and has a better register allocator
// than gcc. So here we just use intrinsics.
__m256i a_positive = abs_epi8(a);
sum0 = adds_epi16(sum0, maddubs_epi16(a_positive, sign_epi8(b[0], a)));
sum1 = adds_epi16(sum1, maddubs_epi16(a_positive, sign_epi8(b[1], a)));
sum2 = adds_epi16(sum2, maddubs_epi16(a_positive, sign_epi8(b[2], a)));
sum3 = adds_epi16(sum3, maddubs_epi16(a_positive, sign_epi8(b[3], a)));
sum4 = adds_epi16(sum4, maddubs_epi16(a_positive, sign_epi8(b[4], a)));
sum5 = adds_epi16(sum5, maddubs_epi16(a_positive, sign_epi8(b[5], a)));
sum6 = adds_epi16(sum6, maddubs_epi16(a_positive, sign_epi8(b[6], a)));
sum7 = adds_epi16(sum7, maddubs_epi16(a_positive, sign_epi8(b[7], a)));
#endif
}
#endif
// For INTGEMM_SSSE3 without AVX
INTGEMM_SSSE3 inline static void InnerINTGEMM_SSSE3(
__m128i a, const __m128i *b,
__m128i &sum0, __m128i &sum1, __m128i &sum2, __m128i &sum3,
__m128i &sum4, __m128i &sum5, __m128i &sum6, __m128i &sum7) {
__m128i a_positive = abs_epi8(a);
sum0 = adds_epi16(sum0, maddubs_epi16(a_positive, sign_epi8(b[0], a)));
sum1 = adds_epi16(sum1, maddubs_epi16(a_positive, sign_epi8(b[1], a)));
sum2 = adds_epi16(sum2, maddubs_epi16(a_positive, sign_epi8(b[2], a)));
sum3 = adds_epi16(sum3, maddubs_epi16(a_positive, sign_epi8(b[3], a)));
sum4 = adds_epi16(sum4, maddubs_epi16(a_positive, sign_epi8(b[4], a)));
sum5 = adds_epi16(sum5, maddubs_epi16(a_positive, sign_epi8(b[5], a)));
sum6 = adds_epi16(sum6, maddubs_epi16(a_positive, sign_epi8(b[6], a)));
sum7 = adds_epi16(sum7, maddubs_epi16(a_positive, sign_epi8(b[7], a)));
}
//INTGEMM_AVX2 or INTGEMM_SSSE3 multiply
#define INTGEMM_MULTIPLY8(Register, target, cpu_type) \
template <typename Callback> target static void Multiply(const int8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) { \
assert(width % sizeof(Register) == 0); \
assert(B_cols % 8 == 0); \
assert(reinterpret_cast<uintptr_t>(A) % sizeof(Register) == 0); \
assert(reinterpret_cast<uintptr_t>(B) % sizeof(Register) == 0); \
const Index simd_width = width / sizeof(Register); \
auto callback_impl = callbacks::CallbackImpl<cpu_type, Callback>(callback); \
INTGEMM_OMP_FOR \
for (Index B0_colidx = 0; B0_colidx < B_cols; B0_colidx += 8) { \
const Register *B0_col = reinterpret_cast<const Register *>(B) + simd_width * B0_colidx; \
/*Process one row of A at a time. Doesn't seem to be faster to do multiple rows of A at once.*/ \
for (Index A_rowidx = 0; A_rowidx < A_rows; ++A_rowidx) { \
/*Iterate over shared (inner) dimension.*/ \
const Register *A_live = reinterpret_cast<const Register *>(A + A_rowidx * width); \
const Register *A_end = A_live + simd_width; \
const Register *B_live = B0_col; \
/* Rather than initializing as zeros and adding, just initialize the first.*/ \
Register a = *(A_live++); \
Register a_positive = abs_epi8(a); \
/* These will be packed 16-bit integers containing sums for each column of B multiplied by the row of A.*/ \
Register sum0 = maddubs_epi16(a_positive, sign_epi8(B_live[0], a)); \
Register sum1 = maddubs_epi16(a_positive, sign_epi8(B_live[1], a)); \
Register sum2 = maddubs_epi16(a_positive, sign_epi8(B_live[2], a)); \
Register sum3 = maddubs_epi16(a_positive, sign_epi8(B_live[3], a)); \
Register sum4 = maddubs_epi16(a_positive, sign_epi8(B_live[4], a)); \
Register sum5 = maddubs_epi16(a_positive, sign_epi8(B_live[5], a)); \
Register sum6 = maddubs_epi16(a_positive, sign_epi8(B_live[6], a)); \
Register sum7 = maddubs_epi16(a_positive, sign_epi8(B_live[7], a)); \
B_live += 8; \
/* Use A as the loop variable so the add can be done where gcc likes it for branch prediction.*/ \
for (; A_live != A_end; ++A_live, B_live += 8) { \
Inner##target(*A_live, B_live, sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7); \
} \
/* Convert 16-bit to 32-bit and add, not caring what parts are added.
* Implementations:
* 1. https://github.com/tesseract-ocr/tesseract/blob/master/src/arch/intsimdmatrixavx2.cpp#L67 under Apache license:
* This does a multiply by 1 and horizontal add:
* _mm512_madd_epi16(sum, _mm512_set1_epi16(1))
* Current fastest.
*
* 2. Signed extension and fold halves:
* sum = _mm512_add_epi32(
* _mm512_cvtepi16_epi32(_mm512_castsi512_si256(sum)),
* _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64(sum, 1)));
*
* 3. Sign extend by abuse of bitshift, then add.
* sum = _mm512_add_epi32(
* _mm512_srai_epi32(_mm512_slli_epi32(sum, 16), 16),
* _mm512_srai_epi32(sum, 16));
*/ \
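/* Option 1 is used below: madd_epi16 against a vector of ones widens each 16-bit sum to 32 bits and adds adjacent pairs. */ \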
Register ones = set1_epi16<Register>(1); \
sum0 = madd_epi16(sum0, ones); \
sum1 = madd_epi16(sum1, ones); \
sum2 = madd_epi16(sum2, ones); \
sum3 = madd_epi16(sum3, ones); \
sum4 = madd_epi16(sum4, ones); \
sum5 = madd_epi16(sum5, ones); \
sum6 = madd_epi16(sum6, ones); \
sum7 = madd_epi16(sum7, ones); \
Register pack0123 = Pack0123(sum0, sum1, sum2, sum3); \
Register pack4567 = Pack0123(sum4, sum5, sum6, sum7); \
auto total = PermuteSummer(pack0123, pack4567); \
RunCallback(callback_impl, total, A_rowidx, B0_colidx, A_rows, B_cols); \
} \
} \
}
/* Wrap a multiply call in OMP parallelism. Here it launches threads then
* inside the implementation there is a pragma omp for. In gcc >= 8 these
* could have been the same but older compilers don't imbue target attributes
* on the hidden function created by pragma omp parallel.
*
* Also, gcc 7 is unable to deduce the function pointer type (for ChooseCPU) if
* I use typename Backend::Integer directly in the arguments. As a workaround,
* have a default template argument Integer then use that so it's resolved.
*/
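/* Illustrative call, with the backend and callback names assumed from the rest
 * of the library rather than checked here:
 *   OMPParallelWrap<callbacks::UnquantizeAndWrite, AVX2::Kernels16>(
 *       A_prepared, B_prepared, A_rows, width, B_cols,
 *       callbacks::UnquantizeAndWrite(unquant_mult, C));
 * This opens the parallel region here and lets the backend's Multiply spread
 * the column loop over the threads via its omp for. */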
template <class Callback, class Backend, class Integer = typename Backend::Integer> static inline void OMPParallelWrap(const Integer *A, const Integer *B, Index A_rows, Index width, Index B_cols, Callback callback) {
#pragma omp parallel
Backend::template Multiply<Callback>(A, B, A_rows, width, B_cols, callback);
}
template <class Callback, class Backend> static inline void OMPParallelWrap8Shift(const uint8_t *A, const int8_t *B, Index A_rows, Index width, Index B_cols, Callback callback) {
#pragma omp parallel
Backend::template Multiply8Shift<Callback>(A, B, A_rows, width, B_cols, callback);
}
} // namespace intgemm

84
third_party/intgemm/intgemm/sse2_gemm.h vendored
View file

@ -1,84 +0,0 @@
#pragma once
#include "kernels.h"
#include "multiply.h"
#include "types.h"
#include <cstdint>
// 8 bit is in ssse3_gemm.h
namespace intgemm {
namespace SSE2 {
INTGEMM_SSE2 inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) {
return kernels::quantize(loadu_ps<__m128>(input), quant_mult_reg);
}
INTGEMM_SELECT_COL_B(INTGEMM_SSE2, __m128i)
class QuantizeTile16 {
public:
INTGEMM_SSE2 static inline Register Consecutive(__m128 mult_reg, const float *input) {
return Tile(mult_reg, input, input + 4);
}
INTGEMM_SSE2 static inline Register ConsecutiveWithWrapping(__m128 mult_reg, const float *input, Index cols_left, Index cols, Index row_step) {
return Tile(mult_reg,
input,
input + 4 + (cols_left <= 4 ? cols * (row_step - 1) : 0));
}
INTGEMM_SSE2 static inline Register ForReshape(__m128 mult_reg, const float *input, int) {
return Consecutive(mult_reg, input);
}
private:
INTGEMM_SSE2 static inline Register Tile(__m128 mult_reg, const float *input0, const float *input1) {
__m128i g0 = kernels::quantize(loadu_ps<__m128>(input0), mult_reg);
__m128i g1 = kernels::quantize(loadu_ps<__m128>(input1), mult_reg);
return _mm_packs_epi32(g0, g1);
}
};
// This should be pure SSE2 (and below).
struct Kernels16 {
typedef int16_t Integer;
// Currently A is prepared by quantization but this could theoretically change.
INTGEMM_SSE2 static inline void PrepareA(const float *input, int16_t *output, float quant_mult, Index rows, Index cols) {
Quantize(input, output, quant_mult, rows * cols);
}
INTGEMM_SSE2 static void Quantize(const float *input, int16_t *output, float quant_mult, Index size) {
assert(size % 8 == 0);
assert(reinterpret_cast<uintptr_t>(input) % 16 == 0);
assert(reinterpret_cast<uintptr_t>(output) % 16 == 0);
FRegister q = set1_ps<FRegister>(quant_mult);
const float *end = input + size;
for (; input != end; input += 8, output += 8) {
*reinterpret_cast<__m128i*>(output) = QuantizeTile16::Consecutive(q, input);
}
}
// Tile size for B; B must be a multiple of this block size.
static const Index kBTileRow = 8;
static const Index kBTileCol = 8;
INTGEMM_PREPARE_B_16(INTGEMM_SSE2, QuantizeTile16)
INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_SSE2, int16_t)
INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_SSE2, QuantizeTile16, int16_t)
INTGEMM_SSE2 static void SelectColumnsB(const int16_t *input, int16_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
//TODO #DEFINE
SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows * 2, cols_begin, cols_end);
}
INTGEMM_MULTIPLY16(__m128i, INTGEMM_SSE2, CPUType::SSE2)
constexpr static const char *const kName = "16-bit SSE2";
static const CPUType kUses = CPUType::SSE2;
};
} // namespace SSE2
} // namespace intgemm

154
third_party/intgemm/intgemm/ssse3_gemm.h vendored
View file

@ -1,154 +0,0 @@
#pragma once
#include "interleave.h"
#include "kernels.h"
#include "multiply.h"
#include "types.h"
#include <cstdint>
#include <cstring>
// 16-bit is in sse2_gemm.h
namespace intgemm {
namespace SSSE3 {
INTGEMM_SSSE3 inline __m128i QuantizerGrab(const float *input, const __m128 quant_mult_reg) {
return kernels::quantize(loadu_ps<__m128>(input), quant_mult_reg);
}
INTGEMM_SELECT_COL_B(INTGEMM_SSSE3, __m128i)
class QuantizeTile8 {
public:
INTGEMM_SSSE3 static inline Register ForReshape(FRegister mult_reg, const float *input, Index cols) {
// Skip a row.
return Tile(mult_reg, input, input + 4, input + 2 * cols, input + 2 * cols + 4);
}
INTGEMM_SSSE3 static inline Register Consecutive(FRegister mult_reg, const float *input) {
return Tile(mult_reg, input, input + 4, input + 8, input + 12);
}
INTGEMM_SSSE3 static inline Register ConsecutiveU(FRegister mult_reg, const float *input) {
return TileU(mult_reg, input, input + 4, input + 8, input + 12);
}
INTGEMM_SSSE3 static inline Register ConsecutiveWithWrapping(FRegister mult_reg, const float *input, Index cols_left, Index cols, Index row_step) {
const float* inputs[4];
for (Index i = 0; i < sizeof(inputs) / sizeof(inputs[0]); ++i) {
while (cols_left < sizeof(Register) / sizeof(float)) {
input += cols * (row_step - 1);
cols_left += cols;
}
inputs[i] = input;
input += sizeof(Register) / sizeof(float);
cols_left -= sizeof(Register) / sizeof(float);
}
return Tile(mult_reg, inputs[0], inputs[1], inputs[2], inputs[3]);
}
// Quantize 16xfloat into 16xint8_t
INTGEMM_SSSE3 static inline __m128i Tile(FRegister mult_reg, const float *input0, const float *input1, const float *input2, const float *input3) {
const __m128i neg128 = _mm_set1_epi8(-128);
__m128i g0 = QuantizerGrab(input0, mult_reg);
__m128i g1 = QuantizerGrab(input1, mult_reg);
__m128i g2 = QuantizerGrab(input2, mult_reg);
__m128i g3 = QuantizerGrab(input3, mult_reg);
__m128i packed0 = _mm_packs_epi32(g0, g1);
__m128i packed1 = _mm_packs_epi32(g2, g3);
__m128i packed = _mm_packs_epi16(packed0, packed1);
/* Ban -128.
* Don't use the SSE4.1 instruction _mm_max_epi8(packed, neg127). Instead,
* use SSE2 instructions _mm_cmpeq_epi8 and _mm_sub_epi8.
* The first generates 0xff for fields -128.
* The second subtracts 0xff from -128 which has the effect of converting
* to -127.
*/
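// Concretely: _mm_cmpeq_epi8 yields 0xff (i.e. -1) only in lanes equal to -128,
// and subtracting -1 bumps exactly those lanes to -127, leaving all other lanes
// untouched.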
// packed = _mm_max_epi8(packed, neg127);
__m128i evils = _mm_cmpeq_epi8(packed, neg128);
return _mm_sub_epi8(packed, evils);
// No permute needed. packs is in order for SSE.
}
private:
INTGEMM_SSSE3 static inline __m128i TileU(FRegister mult_reg, const float *input0, const float *input1, const float *input2, const float *input3) {
const __m128i neg128 = _mm_set1_epi8(-128);
const __m128i pos127 = _mm_set1_epi8(127);
__m128i g0 = QuantizerGrab(input0, mult_reg);
__m128i g1 = QuantizerGrab(input1, mult_reg);
__m128i g2 = QuantizerGrab(input2, mult_reg);
__m128i g3 = QuantizerGrab(input3, mult_reg);
__m128i packed0 = _mm_packs_epi32(g0, g1);
__m128i packed1 = _mm_packs_epi32(g2, g3);
__m128i packed = _mm_packs_epi16(packed0, packed1);
/* Ban -128.
* Don't use the SSE4.1 instruction _mm_max_epi8(packed, neg127). Instead,
* use SSE2 instructions _mm_cmpeq_epi8 and _mm_sub_epi8.
* The first generates 0xff for fields -128.
* The second subtracts 0xff from -128 which has the effect of converting
* to -127.
*/
// packed = _mm_max_epi8(packed, neg127);
__m128i evils = _mm_cmpeq_epi8(packed, neg128);
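// Adding 127 shifts the signed result from [-127, 127] into the unsigned range
// [0, 254] used by the add-127 path (PrepareA for uint8_t below).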
return _mm_add_epi8(_mm_sub_epi8(packed, evils), pos127);
// No permute needed. packs is in order for SSE.
}
};
// pmaddubsw (the 8-bit multiply) is SSSE3, so pedantically that's the version we need.
struct Kernels8 {
typedef int8_t Integer;
// Currently A is prepared by quantization but this could theoretically change.
INTGEMM_SSSE3 static inline void PrepareA(const float *input, int8_t *output, float quant_mult, Index rows, Index cols) {
Quantize(input, output, quant_mult, rows * cols);
}
private:
INTGEMM_QUANTIZE_THREAD(INTGEMM_SSSE3)
public:
INTGEMM_QUANTIZE(INTGEMM_SSSE3)
// Version with unsigned int + 127
// Currently A is prepared by quantization but this could theoretically change.
INTGEMM_SSSE3 static inline void PrepareA(const float *input, uint8_t *output, float quant_mult, Index rows, Index cols) {
QuantizeU(input, output, quant_mult, rows * cols);
}
INTGEMM_SSSE3 static void QuantizeU(const float *input, uint8_t *output, float quant_mult, Index size) {
assert(size % 16 == 0);
assert(reinterpret_cast<uintptr_t>(input) % 16 == 0);
assert(reinterpret_cast<uintptr_t>(output) % 16 == 0);
FRegister q = set1_ps<FRegister>(quant_mult);
const float *end = input + size;
for (; input != end; input += 16, output += 16) {
*reinterpret_cast<__m128i*>(output) = QuantizeTile8::ConsecutiveU(q, input);
}
}
// Tile size for B; B must be a multiple of this block size.
static const Index kBTileRow = 16;
static const Index kBTileCol = 8;
INTGEMM_PREPARE_B_8(INTGEMM_SSSE3, SSSE3::QuantizeTile8)
INTGEMM_PREPARE_B_QUANTIZED_TRANSPOSED(INTGEMM_SSSE3, int8_t)
INTGEMM_PREPARE_B_TRANSPOSED(INTGEMM_SSSE3, QuantizeTile8, int8_t)
INTGEMM_SSSE3 static void SelectColumnsB(const int8_t *input, int8_t *output, Index rows, const Index *cols_begin, const Index *cols_end) {
SSSE3::SelectColumnsOfB((const __m128i*)input, (__m128i*)output, rows, cols_begin, cols_end);
}
INTGEMM_MULTIPLY8(__m128i, INTGEMM_SSSE3, CPUType::SSE2)
INTGEMM_MULTIPLY8SHIFT(__m128i, INTGEMM_SSSE3, CPUType::SSE2)
INTGEMM_PREPAREBIASFOR8(__m128i, INTGEMM_SSSE3, CPUType::SSE2)
constexpr static const char *const kName = "8-bit SSSE3";
static const CPUType kUses = CPUType::SSSE3;
};
} // namespace SSSE3
} // namespace intgemm
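An aside on the -128 remapping in QuantizeTile8::Tile and TileU above: the saturating packs can emit -128, which the pmaddubsw-based kernels must avoid (TileU then shifts into unsigned range by adding 127), and the compare-and-subtract pair is an SSE2-only way to clamp that one value. Below is a minimal scalar sketch of the same remapping; the function name ban_neg128 is ours, for illustration only.

#include <cassert>
#include <cstdint>

// Scalar model of the SSE2 trick: _mm_cmpeq_epi8 yields 0xff (-1) in every
// lane equal to -128, and subtracting that -1 bumps those lanes to -127
// while leaving all other lanes untouched.
static int8_t ban_neg128(int8_t x) {
  int8_t is_neg128 = (x == -128) ? -1 : 0;   // models _mm_cmpeq_epi8(packed, neg128)
  return static_cast<int8_t>(x - is_neg128); // models _mm_sub_epi8(packed, evils)
}

int main() {
  assert(ban_neg128(-128) == -127);
  assert(ban_neg128(-127) == -127);
  assert(ban_neg128(0) == 0);
  assert(ban_neg128(127) == 127);
  return 0;
}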

76
third_party/intgemm/intgemm/stats.h vendored

@ -1,76 +0,0 @@
#pragma once
#include <cmath>
#include "intrinsics.h"
#ifdef _OPENMP
#include <omp.h>
#endif
namespace intgemm {
/* Horizontal max and sums. TODO make a template argument? */
INTGEMM_SSE2 static inline float MaxFloat32(__m128 a) {
// Fold to just using the first 64 bits.
__m128 second_half = _mm_shuffle_ps(a, a, 3 * 4 + 2);
a = _mm_max_ps(a, second_half);
// Fold to just using the first 32 bits.
second_half = _mm_shuffle_ps(a, a, 1);
a = _mm_max_ps(a, second_half);
// This casting compiles to nothing.
return *reinterpret_cast<float*>(&a);
}
INTGEMM_SSE2 static inline float AddFloat32(__m128 a) {
// Fold to just using the first 64 bits.
__m128 second_half = _mm_shuffle_ps(a, a, 3 * 4 + 2);
a = _mm_add_ps(a, second_half);
// Fold to just using the first 32 bits.
second_half = _mm_shuffle_ps(a, a, 1);
a = _mm_add_ps(a, second_half);
// This casting compiles to nothing.
return *reinterpret_cast<float*>(&a);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
INTGEMM_AVX2 static inline float MaxFloat32(__m256 a) {
return MaxFloat32(max_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 1)));
}
INTGEMM_AVX2 static inline float AddFloat32(__m256 a) {
return AddFloat32(add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 1)));
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
// Find the maximum float.
INTGEMM_AVX512F static inline float MaxFloat32(__m512 a) {
// _mm512_extractf32x8_ps is AVX512DQ but we don't care about masking.
// So cast to pd, do AVX512F _mm512_extractf64x4_pd, then cast to ps.
__m256 upper = _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(a), 1));
return MaxFloat32(max_ps(_mm512_castps512_ps256(a), upper));
}
INTGEMM_AVX512F static inline float AddFloat32(__m512 a) {
__m256 upper = _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(a), 1));
return AddFloat32(add_ps(_mm512_castps512_ps256(a), upper));
}
#endif
constexpr int32_t kFloatAbsoluteMask = 0x7fffffff;
} // namespace intgemm
#define INTGEMM_THIS_IS_SSE2
#include "stats.inl"
#undef INTGEMM_THIS_IS_SSE2
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
#define INTGEMM_THIS_IS_AVX2
#include "stats.inl"
#undef INTGEMM_THIS_IS_AVX2
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
#define INTGEMM_THIS_IS_AVX512DQ
#include "stats.inl"
#undef INTGEMM_THIS_IS_AVX512DQ
#endif
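For readers decoding the shuffle constants in MaxFloat32 and AddFloat32 above: 3 * 4 + 2 is the immediate 14 (0b1110), which moves lanes 2 and 3 into positions 0 and 1, and the immediate 1 then moves lane 1 into lane 0, so two folds reduce four lanes to one. A standalone sketch of the same fold follows, assuming an SSE-enabled build; horizontal_max is our name, and it reads the scalar back with _mm_cvtss_f32 rather than a cast.

#include <cassert>
#include <xmmintrin.h>

// Two-step horizontal fold, mirroring MaxFloat32: fold the upper half onto
// the lower half, then lane 1 onto lane 0, and read back lane 0.
static float horizontal_max(__m128 a) {
  __m128 upper = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 3, 2)); // lanes {2,3} -> {0,1}
  a = _mm_max_ps(a, upper);
  upper = _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 1));        // lane 1 -> lane 0
  a = _mm_max_ss(a, upper);
  return _mm_cvtss_f32(a);
}

int main() {
  __m128 v = _mm_setr_ps(1.0f, -2.0f, 7.5f, 3.0f);
  assert(horizontal_max(v) == 7.5f);
  return 0;
}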

98
third_party/intgemm/intgemm/stats.inl vendored

@ -1,98 +0,0 @@
/* This file is included multiple times, once per architecture. */
#if defined(INTGEMM_THIS_IS_AVX512DQ)
#define INTGEMM_ARCH AVX512BW
#define INTGEMM_TARGET INTGEMM_AVX512DQ
#elif defined(INTGEMM_THIS_IS_AVX2)
#define INTGEMM_ARCH AVX2
#define INTGEMM_TARGET INTGEMM_AVX2
#elif defined(INTGEMM_THIS_IS_SSE2)
#define INTGEMM_ARCH SSE2
#define INTGEMM_TARGET INTGEMM_SSE2
#else
#error Included with unexpected architecture
#endif
namespace intgemm {
namespace INTGEMM_ARCH {
/* Compute the maximum absolute value over floats aligned to register size.
* Do not call this function directly; it's a subroutine of MaxAbsolute.
*/
INTGEMM_TARGET static inline float MaxAbsoluteThread(const FRegister *begin, const FRegister *end) {
FRegister highest = setzero_ps<FRegister>();
const FRegister abs_mask = cast_ps(set1_epi32<Register>(kFloatAbsoluteMask));
#pragma omp for
for (const FRegister *i = begin; i < end; ++i) {
FRegister reg = and_ps(abs_mask, *i);
highest = max_ps(highest, reg);
}
return MaxFloat32(highest);
}
/* Compute the maximum absolute value of an array of floats.
* begin_float must be aligned to a multiple of the register size.
*/
INTGEMM_TARGET static inline float MaxAbsolute(const float *begin_float, const float *end_float) {
assert(reinterpret_cast<uintptr_t>(begin_float) % sizeof(FRegister) == 0);
const float *end_reg = end_float - (reinterpret_cast<uintptr_t>(end_float) % sizeof(FRegister)) / sizeof(float);
float ret = 0.0;
#pragma omp parallel reduction(max:ret) num_threads(std::max<int>(1, std::min<int>(omp_get_max_threads(), (end_float - begin_float) / 16384)))
{
float shard_max = MaxAbsoluteThread(
reinterpret_cast<const FRegister*>(begin_float),
reinterpret_cast<const FRegister*>(end_reg));
ret = std::max(ret, shard_max);
}
/* Overhang. The beginning was aligned so if there's any overhang we're
* allowed to read the next full register. Then mask that to 0. */
#if defined(INTGEMM_THIS_IS_AVX512DQ)
if (end_float != end_reg) {
const FRegister abs_mask = cast_ps(set1_epi32<Register>(kFloatAbsoluteMask));
__mmask16 mask = (1 << (end_float - end_reg)) - 1;
FRegister masked = _mm512_maskz_and_ps(mask, abs_mask, *reinterpret_cast<const FRegister*>(end_reg));
ret = std::max(ret, MaxFloat32(masked));
}
#else
for (const float *i = end_reg; i < end_float; ++i) {
ret = std::max(ret, std::fabs(*i));
}
#endif
return ret;
}
/* Returns the mean and standard deviation of the values; if absolute is set, the mean and standard deviation of their absolute values. */
INTGEMM_TARGET static inline MeanStd VectorMeanStd(const float *begin_float, const float *end_float, bool absolute) {
assert(end_float > begin_float);
assert((end_float - begin_float) % (sizeof(FRegister) / sizeof(float)) == 0);
size_t num_items = end_float - begin_float;
const FRegister *begin = reinterpret_cast<const FRegister*>(begin_float);
const FRegister *end = reinterpret_cast<const FRegister*>(end_float);
FRegister squares = set1_ps<FRegister>(0);
FRegister sums = set1_ps<FRegister>(0);
if (absolute) {
const FRegister abs_mask = cast_ps(set1_epi32<Register>(kFloatAbsoluteMask));
for (; begin != end; begin++) {
FRegister vec = and_ps(abs_mask, *begin);
squares = add_ps(squares, mul_ps(vec, vec));
sums = add_ps(sums, vec);
}
} else {
for (; begin != end; begin++) {
FRegister vec = *begin;
squares = add_ps(squares, mul_ps(vec, vec));
sums = add_ps(sums, vec);
}
}
float squares_sum = AddFloat32(squares);
float normal_sums = AddFloat32(sums);
MeanStd ret;
ret.mean = normal_sums/num_items;
ret.stddev = std::sqrt((squares_sum/num_items) - (ret.mean*ret.mean));
return ret;
}
} // namespace INTGEMM_ARCH
} // namespace intgemm
#undef INTGEMM_ARCH
#undef INTGEMM_TARGET
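The VectorMeanStd loop above accumulates per-lane sums and sums of squares, then finishes with the identity stddev = sqrt(E[x^2] - mean^2) after the horizontal AddFloat32 reductions. A plain scalar reference of the same computation is sketched below for checking intuition; the names are ours and MeanStd is redeclared locally so the snippet stands alone.

#include <cassert>
#include <cmath>
#include <cstddef>

struct MeanStd { float mean; float stddev; };  // local stand-in for intgemm::MeanStd

// Scalar reference for the vectorized loop: accumulate the sum and the sum of
// squares, then apply stddev = sqrt(E[x^2] - mean^2) (population form).
static MeanStd mean_std(const float* begin, const float* end, bool absolute) {
  double sum = 0.0, squares = 0.0;
  const std::size_t n = static_cast<std::size_t>(end - begin);
  for (const float* i = begin; i != end; ++i) {
    const double v = absolute ? std::fabs(*i) : *i;
    sum += v;
    squares += v * v;
  }
  MeanStd ret;
  ret.mean = static_cast<float>(sum / n);
  ret.stddev = static_cast<float>(std::sqrt(squares / n - ret.mean * ret.mean));
  return ret;
}

int main() {
  const float data[] = {1.0f, 2.0f, 3.0f, 4.0f};
  const MeanStd ms = mean_std(data, data + 4, /*absolute=*/false);
  assert(std::fabs(ms.mean - 2.5f) < 1e-6f);
  assert(std::fabs(ms.stddev - std::sqrt(1.25f)) < 1e-5f);
  return 0;
}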

118
third_party/intgemm/intgemm/types.h vendored

@ -1,118 +0,0 @@
#pragma once
#include "intgemm/intgemm_config.h"
#include <exception>
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
#include <immintrin.h>
#endif
#include <emmintrin.h>
// clang-cl bug doesn't include these headers when pretending to be MSVC
// https://github.com/llvm/llvm-project/blob/e9a294449575a1e1a0daca470f64914695dc9adc/clang/lib/Headers/immintrin.h#L69-L72
#if defined(_MSC_VER) && defined(__clang__)
#include <avxintrin.h>
#include <avx2intrin.h>
#include <smmintrin.h>
#include <avx512fintrin.h>
#include <avx512dqintrin.h>
#include <avx512bwintrin.h>
#include <avx512vnniintrin.h>
#endif
#if (defined(_MSC_VER) && !defined(__clang__)) || defined(__INTEL_COMPILER)
/* Real MSVC does not appear to have target attributes but is also fine with
* just using intrinsics anywhere. clang-cl pretending to be MSVC requires
* target attributes, so it's excluded from the above.
*
* The Intel compiler has a bug whereby constructors with target attributes do
* not link. Like this program doesn't compile with icpc:
* class Foo {
* public:
* __attribute__ ((target ("avx2"))) Foo() {}
* };
* int main() { Foo a; }
*
* It appears to be erroneously activating function multiversioning when only
* one version of a constructor with target attributes is defined. Normal
* methods with one target attribute work fine. The Intel compiler also allows
* intrinsics without any target attributes so we just leave them blank.
*/
#define INTGEMM_SSE2
#define INTGEMM_SSSE3
#define INTGEMM_AVX2
#define INTGEMM_AVX512F
#define INTGEMM_AVX512BW
#define INTGEMM_AVX512DQ
#define INTGEMM_AVX512VNNI
#else
/* gcc and clang take lists of all the flavors */
#define INTGEMM_SSE2 __attribute__ ((target ("sse2")))
#define INTGEMM_SSSE3 __attribute__ ((target ("ssse3")))
#define INTGEMM_AVX2 __attribute__ ((target ("avx2")))
#define INTGEMM_AVX512F __attribute__ ((target ("avx512f")))
#define INTGEMM_AVX512BW __attribute__ ((target ("avx512f,avx512bw,avx512dq")))
#define INTGEMM_AVX512DQ __attribute__ ((target ("avx512f,avx512bw,avx512dq")))
#define INTGEMM_AVX512VNNI __attribute__ ((target ("avx512f,avx512bw,avx512dq,avx512vnni")))
#endif
namespace intgemm {
// This will be thrown if a CPU isn't supported by the routines (16-bit without SSE2 or 8-bit without SSSE3).
class UnsupportedCPU : public std::exception {
public:
UnsupportedCPU() {}
~UnsupportedCPU() throw() {}
const char *what() const throw() override {
return "Integer matrix multiplication has not been efficiently implemented for your CPU.";
}
};
typedef unsigned int Index;
// If you want to detect the CPU and dispatch yourself, here's what to use:
enum class CPUType {
UNSUPPORTED = 0,
SSE2 = 1,
SSSE3 = 2,
AVX2 = 3,
AVX512BW = 4,
AVX512VNNI = 5
};
// Running CPU type. This is defined in intgemm.cc (as the dispatcher).
extern const CPUType kCPU;
struct MeanStd {
float mean;
float stddev;
};
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
namespace AVX512VNNI {
typedef __m512i Register;
typedef __m512 FRegister;
} // namespace AVX512VNNI
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
namespace AVX512BW {
typedef __m512i Register;
typedef __m512 FRegister;
} // namespace AVX512BW
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
namespace AVX2 {
typedef __m256i Register;
typedef __m256 FRegister;
} // namespace AVX2
#endif
namespace SSSE3 {
typedef __m128i Register;
typedef __m128 FRegister;
} // namespace SSSE3
namespace SSE2 {
typedef __m128i Register;
typedef __m128 FRegister;
} // namespace SSE2
} // namespace intgemm
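The CPUType enum above is deliberately ordered from least to most capable, so callers (including the tests further down) can gate work with a single comparison such as "if (kCPU < CPUType::SSSE3) return;". Here is a hedged sketch of that dispatch pattern using a local copy of the enum; the real dispatcher lives in intgemm.cc, which is not part of this hunk, and Pick8BitKernel is our name.

#include <iostream>

// Local copy of the ordering idea: a larger enum value means a superset of
// SIMD features, so ">=" answers "is this kernel usable here?".
enum class CPUType { UNSUPPORTED = 0, SSE2, SSSE3, AVX2, AVX512BW, AVX512VNNI };

static const char* Pick8BitKernel(CPUType cpu) {
  if (cpu >= CPUType::AVX512BW) return "8-bit AVX512BW";
  if (cpu >= CPUType::AVX2) return "8-bit AVX2";
  if (cpu >= CPUType::SSSE3) return "8-bit SSSE3";  // pmaddubsw needs SSSE3
  return "unsupported";
}

int main() {
  std::cout << Pick8BitKernel(CPUType::AVX2) << '\n';  // prints "8-bit AVX2"
  return 0;
}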

82
third_party/intgemm/intgemm/utils.h vendored

@ -1,82 +0,0 @@
#pragma once
#include <tuple>
namespace intgemm {
/*
* Sequence of unsigned integers
*
* Examples:
* sequence<1, 2, 3>()
* sequence_pushback<4, sequence<1, 2, 3>>() = sequence<1, 2, 3, 4>()
* sequence_popfront<sequence<1, 2, 3>>() = sequence<2, 3>()
* make_sequence<3>() = sequence<0, 1, 2>()
*/
template <unsigned... Indices>
struct sequence { using type = sequence; };
template <unsigned I, typename Sequence>
struct sequence_pushback;
template <unsigned I, unsigned... Indices>
struct sequence_pushback<I, sequence<Indices...>> : sequence<Indices..., I> {};
template <typename Sequence>
struct sequence_popfront;
template <unsigned FirstIndex, unsigned... RestIndices>
struct sequence_popfront<sequence<FirstIndex, RestIndices...>> : sequence<RestIndices...> {};
namespace { // anonymous namespace
template <unsigned N>
struct make_sequence_impl : sequence_pushback<N - 1, typename make_sequence_impl<N - 1>::type> {};
template <>
struct make_sequence_impl<0> : sequence<> {};
} // anonymous namespace
template <unsigned N>
using make_sequence = typename make_sequence_impl<N>::type;
/*
* Make a subtuple
*/
template <typename Tuple, unsigned... Indices>
using subtuple_t = typename std::tuple<typename std::tuple_element<Indices, Tuple>::type...>;
template <typename Tuple, unsigned... Indices>
constexpr subtuple_t<Tuple, Indices...> make_subtuple(const Tuple& tuple, sequence<Indices...>) {
return std::make_tuple(std::get<Indices>(tuple)...);
}
/*
* Factorial
*/
static constexpr unsigned long long factorial(unsigned n) {
return n <= 1 ? 1 : n * factorial(n - 1);
}
/*
* e^n, where n is integer
*/
static constexpr double expi_nonnegative(unsigned n) {
return n == 0 ? 1.0 : (n == 1 ? 2.718281828459045 : expi_nonnegative(n / 2) * expi_nonnegative((n + 1) / 2));
}
static constexpr double expi(int n) {
return (n >= 0 ? expi_nonnegative(n) : 1.0 / expi_nonnegative(-n));
}
// Version that returns float.
static constexpr float expif(int n) {
return static_cast<float>(expi(n));
}
/*
* Round up
*/
static constexpr Index round_up(Index value, Index factor) {
return (value + factor - 1) / factor * factor;
}
}
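To make the metaprogramming above concrete: make_sequence<3>() expands to sequence<0, 1, 2>(), and make_subtuple uses such a sequence to keep only the listed tuple fields. Below is a trimmed, self-contained copy of those helpers with a usage check; it is ours, for illustration only, and omits popfront, factorial, expi and round_up from the vendored header.

#include <tuple>
#include <type_traits>

// Trimmed copies of sequence, make_sequence and make_subtuple, just to show
// how they compose.
template <unsigned... Indices> struct sequence { using type = sequence; };

template <unsigned I, typename Sequence> struct sequence_pushback;
template <unsigned I, unsigned... Indices>
struct sequence_pushback<I, sequence<Indices...>> : sequence<Indices..., I> {};

template <unsigned N>
struct make_sequence_impl
    : sequence_pushback<N - 1, typename make_sequence_impl<N - 1>::type> {};
template <> struct make_sequence_impl<0> : sequence<> {};
template <unsigned N> using make_sequence = typename make_sequence_impl<N>::type;

template <typename Tuple, unsigned... Indices>
std::tuple<typename std::tuple_element<Indices, Tuple>::type...>
make_subtuple(const Tuple& tuple, sequence<Indices...>) {
  return std::make_tuple(std::get<Indices>(tuple)...);
}

int main() {
  // make_sequence<3>() is sequence<0, 1, 2>(), so only the first three fields survive.
  auto sub = make_subtuple(std::make_tuple(1, 2.5, 'c', "dropped"), make_sequence<3>());
  static_assert(std::is_same<decltype(sub), std::tuple<int, double, char>>::value,
                "first three element types kept");
  return (std::get<0>(sub) == 1 && std::get<2>(sub) == 'c') ? 0 : 1;
}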

57
third_party/intgemm/intgemm/vec_traits.h vendored

@ -1,57 +0,0 @@
#pragma once
#include "types.h"
namespace intgemm {
/*
* Vector traits
*/
template <CPUType CPUType_, typename ElemType_> struct vector_s;
template <> struct vector_s<CPUType::SSE2, int8_t> { using type = __m128i; };
template <> struct vector_s<CPUType::SSE2, int16_t> { using type = __m128i; };
template <> struct vector_s<CPUType::SSE2, int> { using type = __m128i; };
template <> struct vector_s<CPUType::SSE2, float> { using type = __m128; };
template <> struct vector_s<CPUType::SSE2, double> { using type = __m128d; };
template <> struct vector_s<CPUType::SSSE3, int8_t> { using type = __m128i; };
template <> struct vector_s<CPUType::SSSE3, int16_t> { using type = __m128i; };
template <> struct vector_s<CPUType::SSSE3, int> { using type = __m128i; };
template <> struct vector_s<CPUType::SSSE3, float> { using type = __m128; };
template <> struct vector_s<CPUType::SSSE3, double> { using type = __m128d; };
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
template <> struct vector_s<CPUType::AVX2, int8_t> { using type = __m256i; };
template <> struct vector_s<CPUType::AVX2, int16_t> { using type = __m256i; };
template <> struct vector_s<CPUType::AVX2, int> { using type = __m256i; };
template <> struct vector_s<CPUType::AVX2, float> { using type = __m256; };
template <> struct vector_s<CPUType::AVX2, double> { using type = __m256d; };
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
template <> struct vector_s<CPUType::AVX512BW, int8_t> { using type = __m512i; };
template <> struct vector_s<CPUType::AVX512BW, int16_t> { using type = __m512i; };
template <> struct vector_s<CPUType::AVX512BW, int> { using type = __m512i; };
template <> struct vector_s<CPUType::AVX512BW, float> { using type = __m512; };
template <> struct vector_s<CPUType::AVX512BW, double> { using type = __m512d; };
#endif
template <CPUType CPUType_, typename ElemType_>
using vector_t = typename vector_s<CPUType_, ElemType_>::type;
template <CPUType CPUType_, typename ElemType_>
struct dvector_t {
using type = vector_t<CPUType_, ElemType_>;
type first;
type second;
};
template <CPUType CPUType_, typename ElemType_>
struct qvector_t {
using type = vector_t<CPUType_, ElemType_>;
type first;
type second;
type third;
type fourth;
};
}
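The payoff of vector_s and vector_t above is that a kernel written against an element type can derive its register type, and therefore its lane count, at compile time. A reduced restatement with only two SSE2 specializations follows, purely to illustrate the pattern; it is not the vendored header.

#include <emmintrin.h>
#include <cstdint>
#include <type_traits>

// Map (ISA, element type) -> register type; sizeof then gives the lane count.
enum class CPUType { SSE2 };
template <CPUType CPUType_, typename ElemType_> struct vector_s;
template <> struct vector_s<CPUType::SSE2, float> { using type = __m128; };
template <> struct vector_s<CPUType::SSE2, int8_t> { using type = __m128i; };
template <CPUType CPUType_, typename ElemType_>
using vector_t = typename vector_s<CPUType_, ElemType_>::type;

static_assert(std::is_same<vector_t<CPUType::SSE2, float>, __m128>::value,
              "float maps to __m128");
static_assert(sizeof(vector_t<CPUType::SSE2, int8_t>) / sizeof(int8_t) == 16,
              "16 int8 lanes per SSE register");

int main() { return 0; }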


@ -1,24 +0,0 @@
Boost Software License - Version 1.0 - August 17th, 2003
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

14934
third_party/intgemm/test/3rd_party/catch.hpp vendored

Diff not shown because of its large size.

492
third_party/intgemm/test/add127_test.cc vendored

@ -1,492 +0,0 @@
#include "test.h"
namespace intgemm {
namespace {
void CompareAs(int8_t * output_old, uint8_t * output_new, Index rows, Index cols) {
for (Index r = 0; r<rows; r++) {
for (Index c = 0; c<cols; c++) {
int a = int(output_old[rows*c + r]);
int b = int(output_new[rows*c + r]);
INFO("Inaccurate at row: " << r << " column " << c << ' '
<< a << ' ' << b);
CHECK(a+127 == b);
}
}
}
template <class Routine> void TestPrepareA(Index rows, Index cols) {
std::mt19937 gen;
// Go somewhat out of range too.
std::uniform_real_distribution<float> dist(-2, 2);
// Create array.
AlignedVector<float> inputA(rows * cols);
for (auto& it : inputA) {
it = dist(gen);
}
AlignedVector<int8_t> oldA(rows * cols);
AlignedVector<uint8_t> newA(rows * cols);
float quant_mult = 64; //From example
Routine::PrepareA(inputA.begin(), oldA.begin(), quant_mult, rows, cols);
Routine::PrepareA(inputA.begin(), newA.begin(), quant_mult, rows, cols);
CompareAs(oldA.begin(), newA.begin(), rows, cols);
}
template <class Routine> void TestPrepareBias(Index rows, Index cols) {
std::mt19937 gen;
// Go somewhat out of range too.
std::uniform_real_distribution<float> dist(-30.0, 30.0);
// Create array.
AlignedVector<float> inputB(rows * cols);
for (auto& it : inputB) {
it = dist(gen);
}
float alpha = 25;
float quant_mult = 127/alpha;
AlignedVector<int8_t> B_prep(inputB.size());
AlignedVector<int8_t> B_quant(inputB.size());
Routine::PrepareB(inputB.begin(), B_prep.begin(), quant_mult, rows, cols);
Routine::Quantize(inputB.begin(), B_quant.begin(), quant_mult, static_cast<intgemm::Index>(inputB.size()));
AlignedVector<float> inputBias(cols);
AlignedVector<float> goldBias(cols);
for (auto& it : goldBias) {
it = dist(gen);
}
int i = 0;
for (auto& it : inputBias) {
it = goldBias[i];
i++;
}
float unquant_mult_forprep = (-1)*(alpha)*(alpha)/(127.0f);
Routine::PrepareBias(B_prep.begin(), rows, cols, callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult_forprep, inputBias.begin(), inputBias.begin()));
int A_rows = 1;
AlignedVector<int8_t> A_prep2(A_rows*rows);
for (auto& it : A_prep2) {
it =1;
}
//Routine::Multiply(A_prep2.begin(), B_prep.begin(), A_rows, rows, cols, callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult_forprep, goldBias.begin(), goldBias.begin()));
//CompareEps(goldBias.begin(), inputBias.begin(), cols, 0.0001f);
AlignedVector<float> slowint_C(cols);
references::Multiply(A_prep2.begin(), B_quant.begin(), slowint_C.begin(), A_rows, rows, cols, [&](int32_t sum, const callbacks::OutputBufferInfo& info) {
return sum * unquant_mult_forprep + goldBias[info.col_idx];
});
CompareEps(slowint_C.begin(), inputBias.begin(), cols, 0.0001f);
}
template <class Routine> void TestMultiplyBiasNew(Index A_rows, Index width, Index B_cols,
float int_tolerance=.1, float float_tolerance=1, float MSE_float_tolerance=0, float MSE_int_tolerance=0) {
std::ostringstream info;
info << Routine::kName << "\t" << A_rows << '\t' << width << '\t' << B_cols << '\n';
// Initialize A and B.
AlignedVector<float> A(A_rows * width);
AlignedVector<float> B(width * B_cols);
AlignedVector<float> bias(B_cols);
std::mt19937 gen;
std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
for (auto& it : A) {
it = dist(gen);
}
for (auto& it : B) {
it = dist(gen);
}
for (auto& it : bias) {
it = dist(gen);
}
float alpha = 2.0f;
float quant_mult = 127.0f / alpha;
float unquant_mult = 1.0f / (quant_mult*quant_mult);
AlignedVector<uint8_t> A_prep(A.size());
AlignedVector<int8_t> B_prep(B.size());
Routine::PrepareA(A.begin(), A_prep.begin(), quant_mult, A_rows, width);
Routine::PrepareB(B.begin(), B_prep.begin(), quant_mult, width, B_cols);
AlignedVector<float> test_C(A_rows * B_cols);
/*REFERENCE MULTIPLICATION
*
*
*/
AlignedVector<int8_t> B_quant(B.size());
Routine::Quantize(B.begin(), B_quant.begin(), quant_mult, static_cast<Index>(B.size()));
AlignedVector<float> slowint_C(test_C.size());
// Taking the original A_preparation which means A would be int8_t
AlignedVector<int8_t> A_prep2(A.size());
Routine::PrepareA(A.begin(), A_prep2.begin(), quant_mult, A_rows, width);
references::Multiply(A_prep2.begin(), B_quant.begin(), slowint_C.begin(), A_rows, width, B_cols, [&](int32_t sum, const callbacks::OutputBufferInfo& info) {
return sum * unquant_mult + bias[info.col_idx];
});
AlignedVector<float> float_C(test_C.size());
references::Multiply(A.begin(), B.begin(), float_C.begin(), A_rows, width, B_cols, [&](double sum, const callbacks::OutputBufferInfo& info) {
return static_cast<float>(sum) + bias[info.col_idx];
});
/*ACTUAL MULTIPLICATION
*
*/
float unquant_mult_forprep = (-1.0f)*(alpha)*(alpha)/(127.0f); //Minus one to invert add_ps later on
Routine::PrepareBias(B_prep.begin(), width, B_cols, callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult_forprep, bias.begin(), bias.begin()));
//Routine::PrepareBias(B.begin(), bias.begin(), alpha, width, B_cols);
Routine::Multiply8Shift(A_prep.begin(), B_prep.begin(), A_rows, width, B_cols, callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult, bias.begin(), test_C.begin()));
CompareMSE(float_C.begin(), slowint_C.begin(), test_C.begin(), test_C.size(), info.str(),
int_tolerance, float_tolerance, MSE_float_tolerance, MSE_int_tolerance);
}
template <class Routine> void TestMultiplyShiftNonShift(Index A_rows, Index width, Index B_cols,
float int_tolerance=.1, float float_tolerance=1, float MSE_float_tolerance=0, float MSE_int_tolerance=0) {
std::ostringstream info;
info << Routine::kName << "\t" << A_rows << '\t' << width << '\t' << B_cols << '\n';
// Initialize A and B.
AlignedVector<float> A(A_rows * width);
AlignedVector<float> B(width * B_cols);
AlignedVector<float> bias(B_cols);
std::mt19937 gen;
std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
for (auto& it : A) {
it = dist(gen);
}
for (auto& it : B) {
it = dist(gen);
}
for (auto& it : bias) {
it = 0;
}
float alpha = 2.0f;
float quant_mult = 127.0f / alpha;
float unquant_mult = 1.0f / (quant_mult*quant_mult);
AlignedVector<uint8_t> A_prep(A.size());
AlignedVector<int8_t> A_prep_old(A.size());
AlignedVector<int8_t> B_prep(B.size());
Routine::PrepareA(A.begin(), A_prep.begin(), quant_mult, A_rows, width);
Routine::PrepareA(A.begin(), A_prep_old.begin(), quant_mult, A_rows, width); //Non-shifted version
Routine::PrepareB(B.begin(), B_prep.begin(), quant_mult, width, B_cols);
AlignedVector<float> test_C(A_rows * B_cols);
/*
* Reference non shift multiplication instead of slowint
*/
AlignedVector<float> slowint_C(test_C.size());
Routine::Multiply(A_prep_old.begin(), B_prep.begin(), A_rows, width, B_cols, callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult, bias.begin(), slowint_C.begin()));
AlignedVector<float> float_C(test_C.size());
references::Multiply(A.begin(), B.begin(), float_C.begin(), A_rows, width, B_cols, [&](double sum, const callbacks::OutputBufferInfo& info) {
return static_cast<float>(sum) + bias[info.col_idx];
});
/*
* Multiply8 shift multiplication
*/
float unquant_mult_forprep = (-1.0f)*(alpha)*(alpha)/(127.0f); //Minus one to invert add_ps later on
Routine::PrepareBias(B_prep.begin(), width, B_cols, callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult_forprep, bias.begin(), bias.begin()));
Routine::Multiply8Shift(A_prep.begin(), B_prep.begin(), A_rows, width, B_cols, callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult, bias.begin(), test_C.begin()));
CompareMSE(float_C.begin(), slowint_C.begin(), test_C.begin(), test_C.size(), info.str(),
int_tolerance, float_tolerance, MSE_float_tolerance, MSE_int_tolerance);
}
template <class Routine> void TestMultiplyShiftInt(Index A_rows, Index width, Index B_cols,
float int_tolerance=.1, float float_tolerance=1, float MSE_float_tolerance=0, float MSE_int_tolerance=0) {
std::ostringstream info;
info << Routine::kName << "\t" << A_rows << '\t' << width << '\t' << B_cols << '\n';
// Initialize A and B.
AlignedVector<float> A(A_rows * width);
AlignedVector<float> B(width * B_cols);
AlignedVector<float> bias(B_cols);
std::mt19937 gen;
std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
for (auto& it : A) {
it = dist(gen);
}
for (auto& it : B) {
it = dist(gen);
}
for (auto& it : bias) {
it = 0;
}
float alpha = 2.0f;
float quant_mult = 127.0f / alpha;
float unquant_mult = 1.0f / (quant_mult*quant_mult);
AlignedVector<uint8_t> A_prep(A.size());
AlignedVector<int8_t> A_prep_old(A.size());
AlignedVector<int8_t> B_prep(B.size());
Routine::PrepareA(A.begin(), A_prep.begin(), quant_mult, A_rows, width);
Routine::PrepareA(A.begin(), A_prep_old.begin(), quant_mult, A_rows, width); //Non-shifted version
Routine::PrepareB(B.begin(), B_prep.begin(), quant_mult, width, B_cols);
AlignedVector<float> test_C(A_rows * B_cols);
/*
* Reference float multiplication
*/
AlignedVector<int8_t> B_quant(B.size());
Routine::Quantize(B.begin(), B_quant.begin(), quant_mult, static_cast<Index>(B.size()));
AlignedVector<float> slowint_C(test_C.size());
// Taking the original A_preparation which means A would be int8_t
// references::Multiply(A_prep.begin(), B_quant.begin(), slowint_C.begin(), A_rows, width, B_cols, [&](int32_t sum, const callbacks::OutputBufferInfo& info) {
// return sum * unquant_mult + bias[info.col_idx];
// });
AlignedVector<float> float_C(test_C.size());
references::Multiply(A.begin(), B.begin(), float_C.begin(), A_rows, width, B_cols, [&](double sum, const callbacks::OutputBufferInfo& info) {
return static_cast<float>(sum) + bias[info.col_idx];
});
/*
* Multiply8 shift multiplication
*/
//First prepare SlowInteger Bias:
AlignedVector<int8_t> A_prep2(1*width);
for (auto& it : A_prep2) {
it = 1;
}
AlignedVector<float> ShiftedBias(B_cols);
float unquant_mult_forprep = (-1)*(alpha)*(alpha)/(127.0f); //Minus one to invert add_ps later on
references::Multiply(A_prep2.begin(), B_quant.begin(), ShiftedBias.begin(), 1, width, B_cols, [&](int32_t sum, const callbacks::OutputBufferInfo& info) {
return sum * unquant_mult_forprep + bias[info.col_idx];
});
//Now prepare Fast integer Bias
Routine::PrepareBias(B_prep.begin(), width, B_cols, callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult_forprep, bias.begin(), bias.begin()));
Routine::Multiply8Shift(A_prep.begin(), B_prep.begin(), A_rows, width, B_cols, callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult, bias.begin(), test_C.begin()));
// Reference INT VERSION HERE with ADD127
// Taking the original A_preparation which means A would be int8_t
references::Multiply(A_prep.begin(), B_quant.begin(), slowint_C.begin(), A_rows, width, B_cols, [&](int32_t sum, const callbacks::OutputBufferInfo& info) {
return sum * unquant_mult + ShiftedBias[info.col_idx];
});
CompareMSE(float_C.begin(), slowint_C.begin(), test_C.begin(), test_C.size(), info.str(),
int_tolerance, float_tolerance, MSE_float_tolerance, MSE_int_tolerance);
}
// Bias
TEST_CASE("PrepareBias SSSE3", "[Add127]") {
if (kCPU < CPUType::SSSE3) return;
TestPrepareBias<SSSE3::Kernels8>(256,256);
TestPrepareBias<SSSE3::Kernels8>(2048,256);
TestPrepareBias<SSSE3::Kernels8>(512,512);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
TEST_CASE("PrepareBias AVX2", "[Add127]") {
if (kCPU < CPUType::AVX2) return;
TestPrepareBias<AVX2::Kernels8>(256,256);
TestPrepareBias<AVX2::Kernels8>(2048,256);
TestPrepareBias<AVX2::Kernels8>(512,512);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE("PrepareBias AVX512F", "[Add127]") {
if (kCPU < CPUType::AVX512BW) return;
TestPrepareBias<AVX512BW::Kernels8>(256,256);
TestPrepareBias<AVX512BW::Kernels8>(2048,256);
TestPrepareBias<AVX512BW::Kernels8>(512,512);
}
#endif
//A
TEST_CASE("PrepareA SSSE3", "[Add127]") {
if (kCPU < CPUType::SSSE3) return;
TestPrepareA<SSSE3::Kernels8>(64,64);
TestPrepareA<SSSE3::Kernels8>(256,256);
TestPrepareA<SSSE3::Kernels8>(512,512);
TestPrepareA<SSSE3::Kernels8>(2048,256);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
TEST_CASE("PrepareA AVX2", "[Add127]") {
if (kCPU < CPUType::AVX2) return;
TestPrepareA<AVX2::Kernels8>(64,64);
TestPrepareA<AVX2::Kernels8>(256,256);
TestPrepareA<AVX2::Kernels8>(512,512);
TestPrepareA<AVX2::Kernels8>(2048,256);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE("PrepareA AVX512F", "[Add127]") {
if (kCPU < CPUType::AVX512BW) return;
TestPrepareA<AVX512BW::Kernels8>(64,64);
TestPrepareA<AVX512BW::Kernels8>(256,256);
TestPrepareA<AVX512BW::Kernels8>(512,512);
TestPrepareA<AVX512BW::Kernels8>(2048,256);
}
#endif
// Multiply
TEST_CASE ("Multiply SSSE3 8bit Shift with bias", "[Add127]") {
if (kCPU < CPUType::SSSE3) return;
TestMultiplyBiasNew<SSSE3::Kernels8>(1, 64, 8, 0.11f, 0.1f, 0.06f, 0.05f);
TestMultiplyBiasNew<SSSE3::Kernels8>(8, 256, 256, 0.45f, 0.54f, 0.17f, 0.16f);
TestMultiplyBiasNew<SSSE3::Kernels8>(8, 2048, 256, 1.7f, 1.7f, 0.46f, 0.43f);
TestMultiplyBiasNew<SSSE3::Kernels8>(320, 256, 256, 0.56f, 0.64f, 0.16f, 0.15f);
TestMultiplyBiasNew<SSSE3::Kernels8>(472, 256, 256, 0.46f, 0.62f, 0.17f, 0.16f);
TestMultiplyBiasNew<SSSE3::Kernels8>(248, 256, 256, 0.48f, 0.64f, 0.16f, 0.15f);
TestMultiplyBiasNew<SSSE3::Kernels8>(200, 256, 256, 0.55f, 0.74f, 0.17f, 0.16f);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
TEST_CASE ("Multiply AVX2 8bit Shift with bias", "[Add127]") {
if (kCPU < CPUType::AVX2) return;
TestMultiplyBiasNew<AVX2::Kernels8>(1, 64, 8, 0.11f, 0.11f, 0.06f, 0.05f);
TestMultiplyBiasNew<AVX2::Kernels8>(8, 256, 256, 0.49f, 0.54f, 0.17f, 0.16f);
TestMultiplyBiasNew<AVX2::Kernels8>(8, 2048, 256, 1.57f, 1.66f, 0.46f, 0.46f);
TestMultiplyBiasNew<AVX2::Kernels8>(320, 256, 256, 0.49f, 0.64f, 0.16f, 0.15f);
TestMultiplyBiasNew<AVX2::Kernels8>(472, 256, 256, 0.46f, 0.62f, 0.17f, 0.16f);
TestMultiplyBiasNew<AVX2::Kernels8>(248, 256, 256, 0.48f, 0.64f, 0.16f, 0.15f);
TestMultiplyBiasNew<AVX2::Kernels8>(200, 256, 256, 0.55f, 0.74f, 0.17f, 0.16f);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE ("Multiply AVX512F 8bit Shift with bias", "[Add127]") {
if (kCPU < CPUType::AVX512BW) return;
TestMultiplyBiasNew<AVX512BW::Kernels8>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.001f);
TestMultiplyBiasNew<AVX512BW::Kernels8>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.001f);
TestMultiplyBiasNew<AVX512BW::Kernels8>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.001f);
TestMultiplyBiasNew<AVX512BW::Kernels8>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f);
TestMultiplyBiasNew<AVX512BW::Kernels8>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.001f);
TestMultiplyBiasNew<AVX512BW::Kernels8>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f);
TestMultiplyBiasNew<AVX512BW::Kernels8>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.001f);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
TEST_CASE ("Multiply AVX512VNNI 8bit Shift with bias", "[Add127]") {
if (kCPU < CPUType::AVX512VNNI) return;
TestMultiplyBiasNew<AVX512VNNI::Kernels8>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.001f);
TestMultiplyBiasNew<AVX512VNNI::Kernels8>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.001f);
TestMultiplyBiasNew<AVX512VNNI::Kernels8>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.001f);
TestMultiplyBiasNew<AVX512VNNI::Kernels8>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f);
TestMultiplyBiasNew<AVX512VNNI::Kernels8>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.001f);
TestMultiplyBiasNew<AVX512VNNI::Kernels8>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f);
TestMultiplyBiasNew<AVX512VNNI::Kernels8>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.001f);
}
#endif
//Multiply old vs new
TEST_CASE ("Multiply SSSE3 8bit Shift vs nonshift", "[Add127]") {
if (kCPU < CPUType::SSSE3) return;
TestMultiplyShiftNonShift<SSSE3::Kernels8>(1, 64, 8, 0.00001f, 0.1f, 0.06f, 0.00001f);
TestMultiplyShiftNonShift<SSSE3::Kernels8>(8, 256, 256, 0.00001f, 0.54f, 0.17f, 0.00001f);
TestMultiplyShiftNonShift<SSSE3::Kernels8>(8, 2048, 256, 17.9f, 1.7f, 0.46f, 4.2f); //Big difference here because the non-shift version is very bad
TestMultiplyShiftNonShift<SSSE3::Kernels8>(320, 256, 256, 1.2f, 0.64f, 0.16f, 0.006f);
TestMultiplyShiftNonShift<SSSE3::Kernels8>(472, 256, 256, 1.1f, 0.62f, 0.17f, 0.006f);
TestMultiplyShiftNonShift<SSSE3::Kernels8>(248, 256, 256, 0.9f, 0.64f, 0.16f, 0.007f);
TestMultiplyShiftNonShift<SSSE3::Kernels8>(200, 256, 256, 1, 0.74f, 0.17f, 0.006f);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
TEST_CASE ("Multiply AVX2 8bit Shift vs nonshift", "[Add127]") {
if (kCPU < CPUType::AVX2) return;
TestMultiplyShiftNonShift<AVX2::Kernels8>(1, 64, 8, 0.00001f, 0.11f, 0.06f, 0.00001f);
TestMultiplyShiftNonShift<AVX2::Kernels8>(8, 256, 256, 0.00001f, 0.54f, 0.17f, 0.00001f);
TestMultiplyShiftNonShift<AVX2::Kernels8>(8, 2048, 256, 9.4f, 1.66f, 0.46f, 1.67f); //Big difference here because the non-shift version is very bad
TestMultiplyShiftNonShift<AVX2::Kernels8>(320, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f);
TestMultiplyShiftNonShift<AVX2::Kernels8>(472, 256, 256, 0.0001f, 0.62f, 0.17f, 0.0001f);
TestMultiplyShiftNonShift<AVX2::Kernels8>(248, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f);
TestMultiplyShiftNonShift<AVX2::Kernels8>(200, 256, 256, 0.0001f, 0.74f, 0.17f, 0.0001f);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE ("Multiply AVX512F 8bit Shift vs nonshift", "[Add127]") {
if (kCPU < CPUType::AVX512BW) return;
TestMultiplyShiftNonShift<AVX512BW::Kernels8>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.001f);
TestMultiplyShiftNonShift<AVX512BW::Kernels8>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.001f);
TestMultiplyShiftNonShift<AVX512BW::Kernels8>(8, 2048, 256, 3.51f, 0.61f, 0.17f, 0.3f);
TestMultiplyShiftNonShift<AVX512BW::Kernels8>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f);
TestMultiplyShiftNonShift<AVX512BW::Kernels8>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.001f);
TestMultiplyShiftNonShift<AVX512BW::Kernels8>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.001f);
TestMultiplyShiftNonShift<AVX512BW::Kernels8>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.001f);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
TEST_CASE ("Multiply AVX512VNNI 8bit Shift vs nonshift", "[Add127]") {
if (kCPU < CPUType::AVX512VNNI) return;
TestMultiplyShiftNonShift<AVX512VNNI::Kernels8>(1, 64, 8, 0.00001f, 0.05f, 0.03f, 0.00001f);
TestMultiplyShiftNonShift<AVX512VNNI::Kernels8>(8, 256, 256, 0.00001f, 0.22f, 0.06f, 0.00001f);
TestMultiplyShiftNonShift<AVX512VNNI::Kernels8>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.0001f);
TestMultiplyShiftNonShift<AVX512VNNI::Kernels8>(320, 256, 256, 0.00001f, 0.27f, 0.06f, 0.00001f);
TestMultiplyShiftNonShift<AVX512VNNI::Kernels8>(472, 256, 256, 0.00001f, 0.33f, 0.06f, 0.00001f);
TestMultiplyShiftNonShift<AVX512VNNI::Kernels8>(248, 256, 256, 0.00001f, 0.27f, 0.06f, 0.00001f);
TestMultiplyShiftNonShift<AVX512VNNI::Kernels8>(200, 256, 256, 0.00001f, 0.28f, 0.06f, 0.00001f);
}
#endif
//Multiply Shift vs int shift implementation
TEST_CASE ("Multiply SSSE3 8bit Shift vs Int", "[Add127]") {
if (kCPU < CPUType::SSSE3) return;
TestMultiplyShiftInt<SSSE3::Kernels8>(1, 64, 8, 0.0001f, 0.1f, 0.06f, 0.0001f);
TestMultiplyShiftInt<SSSE3::Kernels8>(8, 256, 256, 0.0001f, 0.54f, 0.17f, 0.0001f);
TestMultiplyShiftInt<SSSE3::Kernels8>(8, 2048, 256, 0.0001f, 1.7f, 0.46f, 0.0001f);
TestMultiplyShiftInt<SSSE3::Kernels8>(320, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f);
TestMultiplyShiftInt<SSSE3::Kernels8>(472, 256, 256, 0.0001f, 0.62f, 0.17f, 0.0001f);
TestMultiplyShiftInt<SSSE3::Kernels8>(248, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f);
TestMultiplyShiftInt<SSSE3::Kernels8>(200, 256, 256, 0.0001f, 0.74f, 0.17f, 0.0001f);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
TEST_CASE ("Multiply AVX2 8bit Shift vs Int", "[Add127]") {
if (kCPU < CPUType::AVX2) return;
TestMultiplyShiftInt<AVX2::Kernels8>(1, 64, 8, 0.0001f, 0.11f, 0.06f, 0.0001f);
TestMultiplyShiftInt<AVX2::Kernels8>(8, 256, 256, 0.0001f, 0.54f, 0.17f, 0.0001f);
TestMultiplyShiftInt<AVX2::Kernels8>(8, 2048, 256, 0.0001f, 1.66f, 0.46f, 0.0001f);
TestMultiplyShiftInt<AVX2::Kernels8>(320, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f);
TestMultiplyShiftInt<AVX2::Kernels8>(472, 256, 256, 0.0001f, 0.62f, 0.17f, 0.0001f);
TestMultiplyShiftInt<AVX2::Kernels8>(248, 256, 256, 0.0001f, 0.64f, 0.16f, 0.0001f);
TestMultiplyShiftInt<AVX2::Kernels8>(200, 256, 256, 0.0001f, 0.74f, 0.17f, 0.0001f);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE ("Multiply AVX512F 8bit Shift vs Int", "[Add127]") {
if (kCPU < CPUType::AVX512BW) return;
TestMultiplyShiftInt<AVX512BW::Kernels8>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.0001f);
TestMultiplyShiftInt<AVX512BW::Kernels8>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.0001f);
TestMultiplyShiftInt<AVX512BW::Kernels8>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.0001f);
TestMultiplyShiftInt<AVX512BW::Kernels8>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f);
TestMultiplyShiftInt<AVX512BW::Kernels8>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.0001f);
TestMultiplyShiftInt<AVX512BW::Kernels8>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f);
TestMultiplyShiftInt<AVX512BW::Kernels8>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.0001f);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
TEST_CASE ("Multiply AVX512VNNI 8bit Shift vs Int", "[Add127]") {
if (kCPU < CPUType::AVX512VNNI) return;
TestMultiplyShiftInt<AVX512VNNI::Kernels8>(1, 64, 8, 0.0001f, 0.05f, 0.03f, 0.0001f);
TestMultiplyShiftInt<AVX512VNNI::Kernels8>(8, 256, 256, 0.0001f, 0.22f, 0.06f, 0.0001f);
TestMultiplyShiftInt<AVX512VNNI::Kernels8>(8, 2048, 256, 0.0001f, 0.61f, 0.17f, 0.0001f);
TestMultiplyShiftInt<AVX512VNNI::Kernels8>(320, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f);
TestMultiplyShiftInt<AVX512VNNI::Kernels8>(472, 256, 256, 0.0001f, 0.33f, 0.06f, 0.0001f);
TestMultiplyShiftInt<AVX512VNNI::Kernels8>(248, 256, 256, 0.0001f, 0.27f, 0.06f, 0.0001f);
TestMultiplyShiftInt<AVX512VNNI::Kernels8>(200, 256, 256, 0.0001f, 0.28f, 0.06f, 0.0001f);
}
#endif
} // namespace
} // namespace intgemm
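The arithmetic behind the Shift tests above: PrepareA for uint8_t produces a + 127 (as CompareAs checks), so Multiply8Shift accumulates sum_k (a_k + 127) * b_kj = sum_k a_k * b_kj + 127 * sum_k b_kj per output column. PrepareBias folds the 127 * sum_k b_kj correction, with the quantization scales applied (hence the -alpha*alpha/127 factor passed as unquant_mult_forprep), into the bias so the final result matches the signed product. Below is a scalar sketch of the underlying identity, ignoring the scale factors; all names are ours.

#include <cassert>
#include <cstddef>
#include <vector>

// Scalar model of the "+127 shift":
//   sum_k a_k * b_k == sum_k (a_k + 127) * b_k - 127 * sum_k b_k
// The per-column term 127 * sum_k b_k is what PrepareBias precomputes once.
int main() {
  const std::vector<int> a = {-3, 0, 5, 127};  // stands in for quantized int8 A
  const std::vector<int> b = {2, -7, 4, 3};    // stands in for one column of int8 B

  int reference = 0, shifted = 0, column_sum = 0;
  for (std::size_t k = 0; k < a.size(); ++k) {
    reference += a[k] * b[k];        // signed dot product we actually want
    shifted += (a[k] + 127) * b[k];  // what the unsigned "shifted" kernel sums
    column_sum += b[k];
  }
  assert(shifted - 127 * column_sum == reference);
  return 0;
}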


@ -1,66 +0,0 @@
#include "../test.h"
#include "../../intgemm/aligned.h"
#include "../../intgemm/kernels.h"
#include <numeric>
namespace intgemm {
template <CPUType CPUType_, typename ElemType_>
void kernel_add_bias_test() {
if (kCPU < CPUType_)
return;
using vec_t = vector_t<CPUType_, ElemType_>;
constexpr static auto VECTOR_LENGTH = sizeof(vec_t) / sizeof(ElemType_);
AlignedVector<ElemType_> input(VECTOR_LENGTH);
AlignedVector<ElemType_> bias(VECTOR_LENGTH);
AlignedVector<ElemType_> output(VECTOR_LENGTH);
std::iota(input.begin(), input.end(), static_cast<ElemType_>(0));
std::fill(bias.begin(), bias.end(), static_cast<ElemType_>(100));
*output.template as<vec_t>() = kernels::add_bias(*input.template as<vec_t>(), bias.begin(), 0);
for (std::size_t i = 0; i < output.size(); ++i)
CHECK(output[i] == ElemType_(100 + i));
}
template INTGEMM_SSE2 void kernel_add_bias_test<CPUType::SSE2, int8_t>();
template INTGEMM_SSE2 void kernel_add_bias_test<CPUType::SSE2, int16_t>();
template INTGEMM_SSE2 void kernel_add_bias_test<CPUType::SSE2, int>();
template INTGEMM_SSE2 void kernel_add_bias_test<CPUType::SSE2, float>();
template INTGEMM_SSE2 void kernel_add_bias_test<CPUType::SSE2, double>();
KERNEL_TEST_CASE("add_bias/int8 SSE2") { return kernel_add_bias_test<CPUType::SSE2, int8_t>(); }
KERNEL_TEST_CASE("add_bias/int16 SSE2") { return kernel_add_bias_test<CPUType::SSE2, int16_t>(); }
KERNEL_TEST_CASE("add_bias/int SSE2") { return kernel_add_bias_test<CPUType::SSE2, int>(); }
KERNEL_TEST_CASE("add_bias/float SSE2") { return kernel_add_bias_test<CPUType::SSE2, float>(); }
KERNEL_TEST_CASE("add_bias/double SSE2") { return kernel_add_bias_test<CPUType::SSE2, double>(); }
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
template INTGEMM_AVX2 void kernel_add_bias_test<CPUType::AVX2, int8_t>();
template INTGEMM_AVX2 void kernel_add_bias_test<CPUType::AVX2, int16_t>();
template INTGEMM_AVX2 void kernel_add_bias_test<CPUType::AVX2, int>();
template INTGEMM_AVX2 void kernel_add_bias_test<CPUType::AVX2, float>();
template INTGEMM_AVX2 void kernel_add_bias_test<CPUType::AVX2, double>();
KERNEL_TEST_CASE("add_bias/int8 AVX2") { return kernel_add_bias_test<CPUType::AVX2, int8_t>(); }
KERNEL_TEST_CASE("add_bias/int16 AVX2") { return kernel_add_bias_test<CPUType::AVX2, int16_t>(); }
KERNEL_TEST_CASE("add_bias/int AVX2") { return kernel_add_bias_test<CPUType::AVX2, int>(); }
KERNEL_TEST_CASE("add_bias/float AVX2") { return kernel_add_bias_test<CPUType::AVX2, float>(); }
KERNEL_TEST_CASE("add_bias/double AVX2") { return kernel_add_bias_test<CPUType::AVX2, double>(); }
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
template INTGEMM_AVX512BW void kernel_add_bias_test<CPUType::AVX512BW, int8_t>();
template INTGEMM_AVX512BW void kernel_add_bias_test<CPUType::AVX512BW, int16_t>();
template INTGEMM_AVX512BW void kernel_add_bias_test<CPUType::AVX512BW, int>();
template INTGEMM_AVX512BW void kernel_add_bias_test<CPUType::AVX512BW, float>();
template INTGEMM_AVX512BW void kernel_add_bias_test<CPUType::AVX512BW, double>();
KERNEL_TEST_CASE("add_bias/int8 AVX512BW") { return kernel_add_bias_test<CPUType::AVX512BW, int8_t>(); }
KERNEL_TEST_CASE("add_bias/int16 AVX512BW") { return kernel_add_bias_test<CPUType::AVX512BW, int16_t>(); }
KERNEL_TEST_CASE("add_bias/int AVX512BW") { return kernel_add_bias_test<CPUType::AVX512BW, int>(); }
KERNEL_TEST_CASE("add_bias/float AVX512BW") { return kernel_add_bias_test<CPUType::AVX512BW, float>(); }
KERNEL_TEST_CASE("add_bias/double AVX512BW") { return kernel_add_bias_test<CPUType::AVX512BW, double>(); }
#endif
}


@ -1,41 +0,0 @@
#include "../test.h"
#include "../../intgemm/aligned.h"
#include "../../intgemm/kernels.h"
#include <cstdlib>
#include <numeric>
namespace intgemm {
template <CPUType CPUType_>
void kernel_bitwise_not_test() {
if (kCPU < CPUType_)
return;
using vec_t = vector_t<CPUType_, int>;
constexpr static std::size_t VECTOR_LENGTH = sizeof(vec_t) / sizeof(int);
AlignedVector<int> input(VECTOR_LENGTH);
AlignedVector<int> output(VECTOR_LENGTH);
std::iota(input.begin(), input.end(), 0);
*output.template as<vec_t>() = kernels::bitwise_not(*input.template as<vec_t>());
for (std::size_t i = 0; i < output.size(); ++i)
CHECK(output[i] == ~input[i]);
}
template INTGEMM_SSE2 void kernel_bitwise_not_test<CPUType::SSE2>();
KERNEL_TEST_CASE("bitwise_not SSE2") { return kernel_bitwise_not_test<CPUType::SSE2>(); }
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
template INTGEMM_AVX2 void kernel_bitwise_not_test<CPUType::AVX2>();
KERNEL_TEST_CASE("bitwise_not AVX2") { return kernel_bitwise_not_test<CPUType::AVX2>(); }
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
template INTGEMM_AVX512BW void kernel_bitwise_not_test<CPUType::AVX512BW>();
KERNEL_TEST_CASE("bitwise_not AVX512BW") { return kernel_bitwise_not_test<CPUType::AVX512BW>(); }
#endif
}


@ -1,107 +0,0 @@
#include "../test.h"
#include "../../intgemm/aligned.h"
#include "../../intgemm/kernels.h"
#include <cstddef>
#include <numeric>
namespace intgemm {
template <CPUType CPUType_>
void kernel_downcast32to8_test() {
if (kCPU < CPUType_)
return;
using vi = vector_t<CPUType_, int>;
constexpr int LENGTH = sizeof(vi) / sizeof(int8_t);
AlignedVector<int32_t> input(LENGTH);
AlignedVector<int8_t> output(LENGTH);
std::iota(input.begin(), input.end(), static_cast<int32_t>(-LENGTH / 2));
*output.template as<vi>() = kernels::downcast32to8(
input.template as<vi>()[0], input.template as<vi>()[1],
input.template as<vi>()[2], input.template as<vi>()[3]);
for (std::size_t i = 0; i < output.size(); ++i)
CHECK(output[i] == int8_t(input[i]));
}
template INTGEMM_SSE2 void kernel_downcast32to8_test<CPUType::SSE2>();
KERNEL_TEST_CASE("downcast32to8 SSE2") { return kernel_downcast32to8_test<CPUType::SSE2>(); }
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
template INTGEMM_AVX2 void kernel_downcast32to8_test<CPUType::AVX2>();
KERNEL_TEST_CASE("downcast32to8 AVX2") { return kernel_downcast32to8_test<CPUType::AVX2>(); }
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
template INTGEMM_AVX512BW void kernel_downcast32to8_test<CPUType::AVX512BW>();
KERNEL_TEST_CASE("downcast32to8 AVX512BW") { return kernel_downcast32to8_test<CPUType::AVX512BW>(); }
#endif
template <CPUType CPUType_>
void kernel_downcast32to16_test() {
if (kCPU < CPUType_)
return;
using vi = vector_t<CPUType_, int>;
constexpr int LENGTH = sizeof(vi) / sizeof(int16_t);
AlignedVector<int32_t> input(LENGTH);
AlignedVector<int16_t> output(LENGTH);
std::iota(input.begin(), input.end(), static_cast<int32_t>(-LENGTH / 2));
*output.template as<vi>() = kernels::downcast32to16(
input.template as<vi>()[0], input.template as<vi>()[1]);
for (std::size_t i = 0; i < output.size(); ++i)
CHECK(output[i] == int16_t(input[i]));
}
template INTGEMM_SSE2 void kernel_downcast32to16_test<CPUType::SSE2>();
KERNEL_TEST_CASE("downcast32to16 SSE2") { return kernel_downcast32to16_test<CPUType::SSE2>(); }
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
template INTGEMM_AVX2 void kernel_downcast32to16_test<CPUType::AVX2>();
KERNEL_TEST_CASE("downcast32to16 AVX2") { return kernel_downcast32to16_test<CPUType::AVX2>(); }
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
template INTGEMM_AVX512BW void kernel_downcast32to16_test<CPUType::AVX512BW>();
KERNEL_TEST_CASE("downcast32to16 AVX512BW") { return kernel_downcast32to16_test<CPUType::AVX512BW>(); }
#endif
template <CPUType CPUType_>
void kernel_downcast16to8_test() {
if (kCPU < CPUType_)
return;
using vi = vector_t<CPUType_, int>;
constexpr int LENGTH = sizeof(vi) / sizeof(int8_t);
AlignedVector<int16_t> input(LENGTH);
AlignedVector<int8_t> output(LENGTH);
std::iota(input.begin(), input.end(), static_cast<int16_t>(-LENGTH / 2));
*output.template as<vi>() = kernels::downcast16to8(
input.template as<vi>()[0], input.template as<vi>()[1]);
for (std::size_t i = 0; i < output.size(); ++i)
CHECK(output[i] == int8_t(input[i]));
}
template INTGEMM_SSE2 void kernel_downcast16to8_test<CPUType::SSE2>();
KERNEL_TEST_CASE("downcast16to8 SSE2") { return kernel_downcast16to8_test<CPUType::SSE2>(); }
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
template INTGEMM_AVX2 void kernel_downcast16to8_test<CPUType::AVX2>();
KERNEL_TEST_CASE("downcast16to8 AVX2") { return kernel_downcast16to8_test<CPUType::AVX2>(); }
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
template INTGEMM_AVX512BW void kernel_downcast16to8_test<CPUType::AVX512BW>();
KERNEL_TEST_CASE("downcast16to8 AVX512BW") { return kernel_downcast16to8_test<CPUType::AVX512BW>(); }
#endif
}

38
third_party/intgemm/test/kernels/exp_test.cc vendored

@ -1,38 +0,0 @@
#include "../test.h"
#include "../../intgemm/aligned.h"
#include "../../intgemm/kernels.h"
#include <cstddef>
#include <numeric>
namespace intgemm {
template <CPUType CPUType_>
void kernel_exp_approx_taylor_test() {
if (kCPU < CPUType_)
return;
using vec_t = vector_t<CPUType_, float>;
constexpr static std::size_t VECTOR_LENGTH = sizeof(vec_t) / sizeof(float);
AlignedVector<float> input(VECTOR_LENGTH);
AlignedVector<float> output(VECTOR_LENGTH);
std::iota(input.begin(), input.end(), -static_cast<float>(VECTOR_LENGTH / 2));
*output.template as<vec_t>() = kernels::exp_approx_taylor(*input.template as<vec_t>());
for (std::size_t i = 0; i < output.size(); ++i)
CHECK_EPS(output[i], exp(input[i]), 0.001f);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
template INTGEMM_AVX2 void kernel_exp_approx_taylor_test<CPUType::AVX2>();
KERNEL_TEST_CASE("exp_approx_taylor AVX2") { return kernel_exp_approx_taylor_test<CPUType::AVX2>(); }
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
template INTGEMM_AVX512BW void kernel_exp_approx_taylor_test<CPUType::AVX512BW>();
KERNEL_TEST_CASE("exp_approx_taylor AVX512BW") { return kernel_exp_approx_taylor_test<CPUType::AVX512BW>(); }
#endif
}


@ -1,41 +0,0 @@
#include "../test.h"
#include "../../intgemm/aligned.h"
#include "../../intgemm/kernels.h"
#include <cstddef>
#include <numeric>
namespace intgemm {
template <CPUType CPUType_>
void kernel_floor_test() {
if (kCPU < CPUType_)
return;
using vec_t = vector_t<CPUType_, float>;
constexpr static std::size_t VECTOR_LENGTH = sizeof(vec_t) / sizeof(float);
AlignedVector<float> input(VECTOR_LENGTH);
AlignedVector<float> output(VECTOR_LENGTH);
std::iota(input.begin(), input.end(), -static_cast<float>(VECTOR_LENGTH / 2));
*output.template as<vec_t>() = kernels::floor(*input.template as<vec_t>());
for (std::size_t i = 0; i < output.size(); ++i)
CHECK(output[i] == std::floor(input[i]));
}
template INTGEMM_SSE2 void kernel_floor_test<CPUType::SSE2>();
KERNEL_TEST_CASE("floor SSE2") { return kernel_floor_test<CPUType::SSE2>(); }
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
template INTGEMM_AVX2 void kernel_floor_test<CPUType::AVX2>();
KERNEL_TEST_CASE("floor AVX2") { return kernel_floor_test<CPUType::AVX2>(); }
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
template INTGEMM_AVX512BW void kernel_floor_test<CPUType::AVX512BW>();
KERNEL_TEST_CASE("floor AVX512BW") { return kernel_floor_test<CPUType::AVX512BW>(); }
#endif
}


@ -1,67 +0,0 @@
#include "../test.h"
#include "../../intgemm/aligned.h"
#include "../../intgemm/kernels.h"
#include <cstdint>
#include <numeric>
namespace intgemm {
template <CPUType CPUType_, typename Type_>
void kernel_multiply_test() {
if (kCPU < CPUType_)
return;
using vec_t = vector_t<CPUType_, Type_>;
constexpr int VECTOR_LENGTH = sizeof(vec_t) / sizeof(Type_);
AlignedVector<Type_> input1(VECTOR_LENGTH);
AlignedVector<Type_> input2(VECTOR_LENGTH);
AlignedVector<Type_> output(VECTOR_LENGTH);
std::iota(input1.begin(), input1.end(), static_cast<Type_>(-VECTOR_LENGTH / 2));
std::iota(input2.begin(), input2.end(), static_cast<Type_>(-VECTOR_LENGTH / 3));
*output.template as<vec_t>() = kernels::multiply<Type_>(*input1.template as<vec_t>(), *input2.template as<vec_t>());
for (std::size_t i = 0; i < output.size(); ++i)
CHECK(output[i] == Type_(input1[i] * input2[i]));
}
template INTGEMM_SSE2 void kernel_multiply_test<CPUType::SSE2, int8_t>();
template INTGEMM_SSE2 void kernel_multiply_test<CPUType::SSE2, int16_t>();
template INTGEMM_SSE2 void kernel_multiply_test<CPUType::SSE2, int>();
template INTGEMM_SSE2 void kernel_multiply_test<CPUType::SSE2, float>();
template INTGEMM_SSE2 void kernel_multiply_test<CPUType::SSE2, double>();
KERNEL_TEST_CASE("multiply/int8 SSE2") { return kernel_multiply_test<CPUType::SSE2, int8_t>(); }
KERNEL_TEST_CASE("multiply/int16 SSE2") { return kernel_multiply_test<CPUType::SSE2, int16_t>(); }
KERNEL_TEST_CASE("multiply/int SSE2") { return kernel_multiply_test<CPUType::SSE2, int>(); }
KERNEL_TEST_CASE("multiply/float SSE2") { return kernel_multiply_test<CPUType::SSE2, float>(); }
KERNEL_TEST_CASE("multiply/double SSE2") { return kernel_multiply_test<CPUType::SSE2, double>(); }
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
template INTGEMM_AVX2 void kernel_multiply_test<CPUType::AVX2, int8_t>();
template INTGEMM_AVX2 void kernel_multiply_test<CPUType::AVX2, int16_t>();
template INTGEMM_AVX2 void kernel_multiply_test<CPUType::AVX2, int>();
template INTGEMM_AVX2 void kernel_multiply_test<CPUType::AVX2, float>();
template INTGEMM_AVX2 void kernel_multiply_test<CPUType::AVX2, double>();
KERNEL_TEST_CASE("multiply/int8 AVX2") { return kernel_multiply_test<CPUType::AVX2, int8_t>(); }
KERNEL_TEST_CASE("multiply/int16 AVX2") { return kernel_multiply_test<CPUType::AVX2, int16_t>(); }
KERNEL_TEST_CASE("multiply/int AVX2") { return kernel_multiply_test<CPUType::AVX2, int>(); }
KERNEL_TEST_CASE("multiply/float AVX2") { return kernel_multiply_test<CPUType::AVX2, float>(); }
KERNEL_TEST_CASE("multiply/double AVX2") { return kernel_multiply_test<CPUType::AVX2, double>(); }
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
template INTGEMM_AVX512BW void kernel_multiply_test<CPUType::AVX512BW, int8_t>();
template INTGEMM_AVX512BW void kernel_multiply_test<CPUType::AVX512BW, int16_t>();
template INTGEMM_AVX512BW void kernel_multiply_test<CPUType::AVX512BW, int>();
template INTGEMM_AVX512BW void kernel_multiply_test<CPUType::AVX512BW, float>();
template INTGEMM_AVX512BW void kernel_multiply_test<CPUType::AVX512BW, double>();
KERNEL_TEST_CASE("multiply/int8 AVX512BW") { return kernel_multiply_test<CPUType::AVX512BW, int8_t>(); }
KERNEL_TEST_CASE("multiply/int16 AVX512BW") { return kernel_multiply_test<CPUType::AVX512BW, int16_t>(); }
KERNEL_TEST_CASE("multiply/int AVX512BW") { return kernel_multiply_test<CPUType::AVX512BW, int>(); }
KERNEL_TEST_CASE("multiply/float AVX512BW") { return kernel_multiply_test<CPUType::AVX512BW, float>(); }
KERNEL_TEST_CASE("multiply/double AVX512BW") { return kernel_multiply_test<CPUType::AVX512BW, double>(); }
#endif
}


@ -1,41 +0,0 @@
#include "../test.h"
#include "../../intgemm/aligned.h"
#include "../../intgemm/kernels.h"
#include <numeric>
namespace intgemm {
template <CPUType CPUType_>
void kernel_quantize_test() {
if (kCPU < CPUType_)
return;
using input_vec_t = vector_t<CPUType_, float>;
using output_vec_t = vector_t<CPUType_, int>;
AlignedVector<float> input(sizeof(input_vec_t) / sizeof(float));
AlignedVector<int> output(sizeof(output_vec_t) / sizeof(int));
std::iota(input.begin(), input.end(), 0.0f);
auto quant_mult = set1_ps<input_vec_t>(2.f);
*output.template as<output_vec_t>() = kernels::quantize(*input.template as<input_vec_t>(), quant_mult);
for (std::size_t i = 0; i < output.size(); ++i)
CHECK(output[i] == int(i*2.f));
}
template INTGEMM_SSE2 void kernel_quantize_test<CPUType::SSE2>();
KERNEL_TEST_CASE("quantize SSE2") { return kernel_quantize_test<CPUType::SSE2>(); }
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
template INTGEMM_AVX2 void kernel_quantize_test<CPUType::AVX2>();
KERNEL_TEST_CASE("quantize AVX2") { return kernel_quantize_test<CPUType::AVX2>(); }
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
template INTGEMM_AVX512BW void kernel_quantize_test<CPUType::AVX512BW>();
KERNEL_TEST_CASE("quantize AVX512BW") { return kernel_quantize_test<CPUType::AVX512BW>(); }
#endif
}

65
third_party/intgemm/test/kernels/relu_test.cc vendored

@ -1,65 +0,0 @@
#include "../test.h"
#include "../../intgemm/aligned.h"
#include "../../intgemm/kernels.h"
#include <cstdint>
#include <numeric>
namespace intgemm {
template <CPUType CPUType_, typename ElemType_>
void kernel_relu_test() {
if (kCPU < CPUType_)
return;
using vec_t = vector_t<CPUType_, ElemType_>;
constexpr int VECTOR_LENGTH = sizeof(vec_t) / sizeof(ElemType_);
AlignedVector<ElemType_> input(VECTOR_LENGTH);
AlignedVector<ElemType_> output(VECTOR_LENGTH);
std::iota(input.begin(), input.end(), static_cast<ElemType_>(-VECTOR_LENGTH / 2));
*output.template as<vec_t>() = kernels::relu<ElemType_>(*input.template as<vec_t>());
for (std::size_t i = 0; i < output.size(); ++i)
CHECK(output[i] == (input[i] < 0 ? 0 : input[i]));
}
template INTGEMM_SSE2 void kernel_relu_test<CPUType::SSE2, int8_t>();
template INTGEMM_SSE2 void kernel_relu_test<CPUType::SSE2, int16_t>();
template INTGEMM_SSE2 void kernel_relu_test<CPUType::SSE2, int>();
template INTGEMM_SSE2 void kernel_relu_test<CPUType::SSE2, float>();
template INTGEMM_SSE2 void kernel_relu_test<CPUType::SSE2, double>();
KERNEL_TEST_CASE("relu/int8 SSE2") { return kernel_relu_test<CPUType::SSE2, int8_t>(); }
KERNEL_TEST_CASE("relu/int16 SSE2") { return kernel_relu_test<CPUType::SSE2, int16_t>(); }
KERNEL_TEST_CASE("relu/int SSE2") { return kernel_relu_test<CPUType::SSE2, int>(); }
KERNEL_TEST_CASE("relu/float SSE2") { return kernel_relu_test<CPUType::SSE2, float>(); }
KERNEL_TEST_CASE("relu/double SSE2") { return kernel_relu_test<CPUType::SSE2, double>(); }
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
template INTGEMM_AVX2 void kernel_relu_test<CPUType::AVX2, int8_t>();
template INTGEMM_AVX2 void kernel_relu_test<CPUType::AVX2, int16_t>();
template INTGEMM_AVX2 void kernel_relu_test<CPUType::AVX2, int>();
template INTGEMM_AVX2 void kernel_relu_test<CPUType::AVX2, float>();
template INTGEMM_AVX2 void kernel_relu_test<CPUType::AVX2, double>();
KERNEL_TEST_CASE("relu/int8 AVX2") { return kernel_relu_test<CPUType::AVX2, int8_t>(); }
KERNEL_TEST_CASE("relu/int16 AVX2") { return kernel_relu_test<CPUType::AVX2, int16_t>(); }
KERNEL_TEST_CASE("relu/int AVX2") { return kernel_relu_test<CPUType::AVX2, int>(); }
KERNEL_TEST_CASE("relu/float AVX2") { return kernel_relu_test<CPUType::AVX2, float>(); }
KERNEL_TEST_CASE("relu/double AVX2") { return kernel_relu_test<CPUType::AVX2, double>(); }
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
template INTGEMM_AVX512BW void kernel_relu_test<CPUType::AVX512BW, int8_t>();
template INTGEMM_AVX512BW void kernel_relu_test<CPUType::AVX512BW, int16_t>();
template INTGEMM_AVX512BW void kernel_relu_test<CPUType::AVX512BW, int>();
template INTGEMM_AVX512BW void kernel_relu_test<CPUType::AVX512BW, float>();
template INTGEMM_AVX512BW void kernel_relu_test<CPUType::AVX512BW, double>();
KERNEL_TEST_CASE("relu/int8 AVX512BW") { return kernel_relu_test<CPUType::AVX512BW, int8_t>(); }
KERNEL_TEST_CASE("relu/int16 AVX512BW") { return kernel_relu_test<CPUType::AVX512BW, int16_t>(); }
KERNEL_TEST_CASE("relu/int AVX512BW") { return kernel_relu_test<CPUType::AVX512BW, int>(); }
KERNEL_TEST_CASE("relu/float AVX512BW") { return kernel_relu_test<CPUType::AVX512BW, float>(); }
KERNEL_TEST_CASE("relu/double AVX512BW") { return kernel_relu_test<CPUType::AVX512BW, double>(); }
#endif
}


@ -1,43 +0,0 @@
#include "../test.h"
#include "../../intgemm/aligned.h"
#include "../../intgemm/kernels.h"
#include <cstdint>
#include <numeric>
namespace intgemm {
template <CPUType CPUType_>
void kernel_rescale_test() {
if (kCPU < CPUType_)
return;
using vi = vector_t<CPUType_, int>;
using vf = vector_t<CPUType_, float>;
constexpr int LENGTH = sizeof(vi) / sizeof(int);
AlignedVector<int32_t> input(LENGTH);
AlignedVector<int32_t> output(LENGTH);
std::iota(input.begin(), input.end(), static_cast<int32_t>(-LENGTH / 2));
float scale = 2;
*output.template as<vi>() = kernels::rescale(*input.template as<vi>(), intgemm::set1_ps<vf>(scale));
for (std::size_t i = 0; i < output.size(); ++i)
CHECK(output[i] == std::round(input[i] * scale));
}
template INTGEMM_SSE2 void kernel_rescale_test<CPUType::SSE2>();
KERNEL_TEST_CASE("rescale SSE2") { return kernel_rescale_test<CPUType::SSE2>(); }
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
template INTGEMM_AVX2 void kernel_rescale_test<CPUType::AVX2>();
KERNEL_TEST_CASE("rescale AVX2") { return kernel_rescale_test<CPUType::AVX2>(); }
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
template INTGEMM_AVX512BW void kernel_rescale_test<CPUType::AVX512BW>();
KERNEL_TEST_CASE("rescale AVX512BW") { return kernel_rescale_test<CPUType::AVX512BW>(); }
#endif
}


@ -1,45 +0,0 @@
#include "../test.h"
#include "../../intgemm/aligned.h"
#include "../../intgemm/kernels.h"
#include <cstddef>
#include <numeric>
namespace intgemm {
float sigmoid_ref(float x) {
if (x < 0)
return exp(x) / (1 + exp(x));
else
return 1 / (1 + exp(-x));
}
template <CPUType CPUType_>
void kernel_sigmoid_test() {
if (kCPU < CPUType_)
return;
using vec_t = vector_t<CPUType_, float>;
constexpr static std::size_t VECTOR_LENGTH = sizeof(vec_t) / sizeof(float);
AlignedVector<float> input(VECTOR_LENGTH);
AlignedVector<float> output(VECTOR_LENGTH);
std::iota(input.begin(), input.end(), -static_cast<float>(VECTOR_LENGTH / 2));
*output.template as<vec_t>() = kernels::sigmoid(*input.template as<vec_t>());
for (std::size_t i = 0; i < output.size(); ++i)
CHECK_EPS(output[i], sigmoid_ref(input[i]), 0.001f);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
template INTGEMM_AVX2 void kernel_sigmoid_test<CPUType::AVX2>();
KERNEL_TEST_CASE("sigmoid AVX2") { return kernel_sigmoid_test<CPUType::AVX2>(); }
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
template INTGEMM_AVX512BW void kernel_sigmoid_test<CPUType::AVX512BW>();
KERNEL_TEST_CASE("sigmoid AVX512BW") { return kernel_sigmoid_test<CPUType::AVX512BW>(); }
#endif
}

third_party/intgemm/test/kernels/tanh_test.cc (vendored)

@ -1,38 +0,0 @@
#include "../test.h"
#include "../../intgemm/aligned.h"
#include "../../intgemm/kernels.h"
#include <cstddef>
#include <numeric>
namespace intgemm {
template <CPUType CPUType_>
void kernel_tanh_test() {
if (kCPU < CPUType_)
return;
using vec_t = vector_t<CPUType_, float>;
constexpr static std::size_t VECTOR_LENGTH = sizeof(vec_t) / sizeof(float);
AlignedVector<float> input(VECTOR_LENGTH);
AlignedVector<float> output(VECTOR_LENGTH);
std::generate(input.begin(), input.end(), [] () { static int n = -int(VECTOR_LENGTH / 2); return n++ / float(VECTOR_LENGTH / 2); });
*output.template as<vec_t>() = kernels::tanh(*input.template as<vec_t>());
for (std::size_t i = 0; i < output.size(); ++i)
CHECK_EPS(output[i], tanh(input[i]), 0.001f);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
template INTGEMM_AVX2 void kernel_tanh_test<CPUType::AVX2>();
KERNEL_TEST_CASE("tanh AVX2") { return kernel_tanh_test<CPUType::AVX2>(); }
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
template INTGEMM_AVX512BW void kernel_tanh_test<CPUType::AVX512BW>();
KERNEL_TEST_CASE("tanh AVX512BW") { return kernel_tanh_test<CPUType::AVX512BW>(); }
#endif
}


@ -1,41 +0,0 @@
#include "../test.h"
#include "../../intgemm/aligned.h"
#include "../../intgemm/kernels.h"
#include <numeric>
namespace intgemm {
template <CPUType CPUType_>
void kernel_unquantize_test() {
if (kCPU < CPUType_)
return;
using input_vec_t = vector_t<CPUType_, int>;
using output_vec_t = vector_t<CPUType_, float>;
AlignedVector<int> input(sizeof(input_vec_t) / sizeof(int));
AlignedVector<float> output(sizeof(output_vec_t) / sizeof(float));
std::iota(input.begin(), input.end(), 0);
auto unquant_mult = set1_ps<output_vec_t>(0.5f);
*output.template as<output_vec_t>() = kernels::unquantize(*input.template as<input_vec_t>(), unquant_mult);
for (std::size_t i = 0; i < output.size(); ++i)
CHECK(output[i] == i * 0.5f);
}
template INTGEMM_SSE2 void kernel_unquantize_test<CPUType::SSE2>();
KERNEL_TEST_CASE("unquantize SSE2") { return kernel_unquantize_test<CPUType::SSE2>(); }
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
template INTGEMM_AVX2 void kernel_unquantize_test<CPUType::AVX2>();
KERNEL_TEST_CASE("unquantize AVX2") { return kernel_unquantize_test<CPUType::AVX2>(); }
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
template INTGEMM_AVX512BW void kernel_unquantize_test<CPUType::AVX512BW>();
KERNEL_TEST_CASE("unquantize AVX512BW") { return kernel_unquantize_test<CPUType::AVX512BW>(); }
#endif
}


@ -1,118 +0,0 @@
// This test triggers an internal compiler error in gcc 5.
#if defined(__OPTIMIZE__) || defined(__clang__) || defined(__INTEL_COMPILER) || !defined(__GNUC__) || (__GNUC__ != 5)
#include "../test.h"
#include "../../intgemm/aligned.h"
#include "../../intgemm/kernels.h"
#include <cstdint>
#include <numeric>
namespace intgemm {
template <CPUType CPUType_>
void kernel_upcast8to16_test() {
if (kCPU < CPUType_)
return;
using vi = vector_t<CPUType_, int>;
constexpr int LENGTH = sizeof(vi) / sizeof(int8_t);
AlignedVector<int8_t> input(LENGTH);
AlignedVector<int16_t> output(LENGTH);
std::iota(input.begin(), input.end(), static_cast<int8_t>(-LENGTH / 2));
auto result = kernels::upcast8to16(*input.template as<vi>());
output.template as<vi>()[0] = result.first;
output.template as<vi>()[1] = result.second;
for (std::size_t i = 0; i < output.size(); ++i)
CHECK(output[i] == int16_t(input[i]));
}
template INTGEMM_SSE2 void kernel_upcast8to16_test<CPUType::SSE2>();
KERNEL_TEST_CASE("upcast8to16 SSE2") { return kernel_upcast8to16_test<CPUType::SSE2>(); }
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
template INTGEMM_AVX2 void kernel_upcast8to16_test<CPUType::AVX2>();
KERNEL_TEST_CASE("upcast8to16 AVX2") { return kernel_upcast8to16_test<CPUType::AVX2>(); }
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
template INTGEMM_AVX512BW void kernel_upcast8to16_test<CPUType::AVX512BW>();
KERNEL_TEST_CASE("upcast8to16 AVX512BW") { return kernel_upcast8to16_test<CPUType::AVX512BW>(); }
#endif
template <CPUType CPUType_>
void kernel_upcast16to32_test() {
if (kCPU < CPUType_)
return;
using vi = vector_t<CPUType_, int>;
constexpr int LENGTH = sizeof(vi) / sizeof(int16_t);
AlignedVector<int16_t> input(LENGTH);
AlignedVector<int32_t> output(LENGTH);
std::iota(input.begin(), input.end(), static_cast<int16_t>(-LENGTH / 2));
auto result = kernels::upcast16to32(*input.template as<vi>());
output.template as<vi>()[0] = result.first;
output.template as<vi>()[1] = result.second;
for (std::size_t i = 0; i < output.size(); ++i)
CHECK(output[i] == int32_t(input[i]));
}
template INTGEMM_SSE2 void kernel_upcast16to32_test<CPUType::SSE2>();
KERNEL_TEST_CASE("upcast16to32 SSE2") { return kernel_upcast16to32_test<CPUType::SSE2>(); }
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
template INTGEMM_AVX2 void kernel_upcast16to32_test<CPUType::AVX2>();
KERNEL_TEST_CASE("upcast16to32 AVX2") { return kernel_upcast16to32_test<CPUType::AVX2>(); }
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
template INTGEMM_AVX512BW void kernel_upcast16to32_test<CPUType::AVX512BW>();
KERNEL_TEST_CASE("upcast16to32 AVX512BW") { return kernel_upcast16to32_test<CPUType::AVX512BW>(); }
#endif
template <CPUType CPUType_>
void kernel_upcast8to32_test() {
if (kCPU < CPUType_)
return;
using vi = vector_t<CPUType_, int>;
constexpr int LENGTH = sizeof(vi) / sizeof(int8_t);
AlignedVector<int8_t> input(LENGTH);
AlignedVector<int32_t> output(LENGTH);
std::iota(input.begin(), input.end(), static_cast<int8_t>(-LENGTH / 2));
auto result = kernels::upcast8to32(*input.template as<vi>());
output.template as<vi>()[0] = result.first;
output.template as<vi>()[1] = result.second;
output.template as<vi>()[2] = result.third;
output.template as<vi>()[3] = result.fourth;
for (std::size_t i = 0; i < output.size(); ++i)
CHECK(output[i] == int32_t(input[i]));
}
template INTGEMM_SSE2 void kernel_upcast8to32_test<CPUType::SSE2>();
KERNEL_TEST_CASE("upcast8to32 SSE2") { return kernel_upcast8to32_test<CPUType::SSE2>(); }
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
template INTGEMM_AVX2 void kernel_upcast8to32_test<CPUType::AVX2>();
KERNEL_TEST_CASE("upcast8to32 AVX2") { return kernel_upcast8to32_test<CPUType::AVX2>(); }
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
template INTGEMM_AVX512BW void kernel_upcast8to32_test<CPUType::AVX512BW>();
KERNEL_TEST_CASE("upcast8to32 AVX512BW") { return kernel_upcast8to32_test<CPUType::AVX512BW>(); }
#endif
}
#endif
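For reference, the widening contract all three upcast tests check is simply sign extension with element order preserved; a minimal scalar sketch (upcast8to16_ref is an illustrative name, not a symbol from either library):

#include <cstddef>
#include <cstdint>
#include <vector>

// Each int8_t element is sign-extended to int16_t in place, e.g. -5 -> -5;
// the 16->32 and 8->32 variants follow the same pattern with wider types.
std::vector<int16_t> upcast8to16_ref(const std::vector<int8_t>& in) {
  std::vector<int16_t> out(in.size());
  for (std::size_t i = 0; i < in.size(); ++i)
    out[i] = static_cast<int16_t>(in[i]);
  return out;
}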


@ -1,65 +0,0 @@
#include "../test.h"
#include "../../intgemm/aligned.h"
#include "../../intgemm/kernels.h"
#include <cstddef>
#include <numeric>
namespace intgemm {
template <CPUType CPUType_, typename ElemType_>
void kernel_write_test() {
if (kCPU < CPUType_)
return;
using vec_t = vector_t<CPUType_, ElemType_>;
constexpr static std::size_t VECTOR_LENGTH = sizeof(vec_t) / sizeof(ElemType_);
AlignedVector<ElemType_> input(VECTOR_LENGTH);
AlignedVector<ElemType_> output(VECTOR_LENGTH);
std::iota(input.begin(), input.end(), static_cast<ElemType_>(0));
kernels::write(*input.template as<vec_t>(), output.begin(), 0);
for (std::size_t i = 0; i < VECTOR_LENGTH; ++i)
CHECK(output[i] == ElemType_(i));
}
template INTGEMM_SSE2 void kernel_write_test<CPUType::SSE2, int8_t>();
template INTGEMM_SSE2 void kernel_write_test<CPUType::SSE2, int16_t>();
template INTGEMM_SSE2 void kernel_write_test<CPUType::SSE2, int>();
template INTGEMM_SSE2 void kernel_write_test<CPUType::SSE2, float>();
template INTGEMM_SSE2 void kernel_write_test<CPUType::SSE2, double>();
KERNEL_TEST_CASE("write/int8 SSE2") { return kernel_write_test<CPUType::SSE2, int8_t>(); }
KERNEL_TEST_CASE("write/int16 SSE2") { return kernel_write_test<CPUType::SSE2, int16_t>(); }
KERNEL_TEST_CASE("write/int SSE2") { return kernel_write_test<CPUType::SSE2, int>(); }
KERNEL_TEST_CASE("write/float SSE2") { return kernel_write_test<CPUType::SSE2, float>(); }
KERNEL_TEST_CASE("write/double SSE2") { return kernel_write_test<CPUType::SSE2, double>(); }
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
template INTGEMM_AVX2 void kernel_write_test<CPUType::AVX2, int8_t>();
template INTGEMM_AVX2 void kernel_write_test<CPUType::AVX2, int16_t>();
template INTGEMM_AVX2 void kernel_write_test<CPUType::AVX2, int>();
template INTGEMM_AVX2 void kernel_write_test<CPUType::AVX2, float>();
template INTGEMM_AVX2 void kernel_write_test<CPUType::AVX2, double>();
KERNEL_TEST_CASE("write/int8 AVX2") { return kernel_write_test<CPUType::AVX2, int8_t>(); }
KERNEL_TEST_CASE("write/int16 AVX2") { return kernel_write_test<CPUType::AVX2, int16_t>(); }
KERNEL_TEST_CASE("write/int AVX2") { return kernel_write_test<CPUType::AVX2, int>(); }
KERNEL_TEST_CASE("write/float AVX2") { return kernel_write_test<CPUType::AVX2, float>(); }
KERNEL_TEST_CASE("write/double AVX2") { return kernel_write_test<CPUType::AVX2, double>(); }
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
template INTGEMM_AVX512BW void kernel_write_test<CPUType::AVX512BW, int8_t>();
template INTGEMM_AVX512BW void kernel_write_test<CPUType::AVX512BW, int16_t>();
template INTGEMM_AVX512BW void kernel_write_test<CPUType::AVX512BW, int>();
template INTGEMM_AVX512BW void kernel_write_test<CPUType::AVX512BW, float>();
template INTGEMM_AVX512BW void kernel_write_test<CPUType::AVX512BW, double>();
KERNEL_TEST_CASE("write/int8 AVX512BW") { return kernel_write_test<CPUType::AVX512BW, int8_t>(); }
KERNEL_TEST_CASE("write/int16 AVX512BW") { return kernel_write_test<CPUType::AVX512BW, int16_t>(); }
KERNEL_TEST_CASE("write/int AVX512BW") { return kernel_write_test<CPUType::AVX512BW, int>(); }
KERNEL_TEST_CASE("write/float AVX512BW") { return kernel_write_test<CPUType::AVX512BW, float>(); }
KERNEL_TEST_CASE("write/double AVX512BW") { return kernel_write_test<CPUType::AVX512BW, double>(); }
#endif
}

third_party/intgemm/test/multiply_test.cc (vendored)

@ -1,761 +0,0 @@
#include "test.h"
#include "../intgemm/aligned.h"
#include "../intgemm/callbacks.h"
#include "../intgemm/interleave.h"
#include "../intgemm/intgemm.h"
#include "../intgemm/multiply.h"
#include "../intgemm/stats.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <memory>
#include <numeric>
#include <random>
namespace intgemm {
#ifndef __INTEL_COMPILER
INTGEMM_SSE2
#endif
TEST_CASE("Transpose 16", "[transpose]") {
if (kCPU < CPUType::SSE2) return;
const unsigned N = 8;
AlignedVector<int16_t> input(N * N);
std::iota(input.begin(), input.end(), static_cast<int16_t>(0));
AlignedVector<int16_t> ref(N * N);
references::Transpose(input.begin(), ref.begin(), N, N);
// Overwrite input.
__m128i *t = input.as<__m128i>();
Transpose16InLane(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7]);
for (std::size_t i = 0; i < input.size(); ++i) {
CHECK_MESSAGE(ref[i] == input[i], "16-bit transpose failure at: " << i << ": " << ref[i] << " != " << input[i]);
}
}
#ifndef __INTEL_COMPILER
INTGEMM_SSSE3
#endif
TEST_CASE("Transpose 8", "[transpose]") {
if (kCPU < CPUType::SSSE3) return;
const unsigned N = 16;
AlignedVector<int8_t> input(N * N);
std::iota(input.begin(), input.end(), static_cast<int8_t>(0));
AlignedVector<int8_t> ref(input.size());
references::Transpose(input.begin(), ref.begin(), N, N);
// Overwrite input.
__m128i *t = input.as<__m128i>();
Transpose8InLane(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], t[8], t[9], t[10], t[11], t[12], t[13], t[14], t[15]);
for (std::size_t i = 0; i < input.size(); ++i) {
CHECK_MESSAGE(ref[i] == input[i], "8-bit transpose failure at " << i << ": " << (int16_t)ref[i] << " != " << (int16_t)input[i]);
}
}
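references::Transpose, the ground truth for both transpose tests, lives in the removed test helpers and is not shown in this diff; its behaviour is presumably the plain element swap sketched below (transpose_ref is an illustrative name):

// Row/column swap of a rows x cols tile; Transpose16InLane and
// Transpose8InLane must reproduce this layout within SSE lanes.
template <class T>
void transpose_ref(const T* in, T* out, unsigned rows, unsigned cols) {
  for (unsigned r = 0; r < rows; ++r)
    for (unsigned c = 0; c < cols; ++c)
      out[c * rows + r] = in[r * cols + c];
}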
template <class Routine> void TestPrepare(Index rows = 32, Index cols = 16) {
std::mt19937 gen;
// Go somewhat out of range too.
std::uniform_real_distribution<float> dist(-129.0, 129.0);
// Create array.
AlignedVector<float> input(rows * cols);
for (auto& it : input) {
it = dist(gen);
}
using Integer = typename Routine::Integer;
// Call Prepare
AlignedVector<Integer> test(input.size());
Routine::PrepareB(input.begin(), test.begin(), 1, rows, cols);
// Compute reference output.
AlignedVector<Integer> quantized(input.size());
Routine::Quantize(input.begin(), quantized.begin(), 1, static_cast<Index>(input.size()));
AlignedVector<Integer> reference(input.size());
// Note this won't work for Int8/Int16 generic routines because tile sizes vary.
references::Rearragement(quantized.begin(), reference.begin(), Routine::kBTileRow, Routine::kBTileCol, rows, cols);
CHECK_MESSAGE(memcmp(reference.begin(), test.begin(), test.size() * sizeof(Integer)) == 0, Routine::kName << " Mismatch:\n" <<
"Quantized Input" << '\n' << PrintMatrix(quantized.begin(), rows, cols) << "Reference" << '\n' <<
PrintMatrix(reference.begin(), rows, cols) << "Routine" << '\n' << PrintMatrix(test.begin(), rows, cols));
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE("Prepare AVX512", "[prepare]") {
if (kCPU < CPUType::AVX512BW) return;
TestPrepare<AVX512BW::Kernels8>(64, 8);
TestPrepare<AVX512BW::Kernels8>(256, 32);
TestPrepare<AVX512BW::Kernels16>(64, 8);
TestPrepare<AVX512BW::Kernels16>(256, 32);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
TEST_CASE("Prepare AVX2", "[prepare]") {
if (kCPU < CPUType::AVX2) return;
TestPrepare<AVX2::Kernels8>(64, 32);
TestPrepare<AVX2::Kernels16>(64, 32);
}
#endif
TEST_CASE("Prepare SSSE3", "[prepare]") {
if (kCPU < CPUType::SSSE3) return;
TestPrepare<SSSE3::Kernels8>(16, 8);
TestPrepare<SSSE3::Kernels8>(32, 16);
TestPrepare<SSSE3::Kernels8>(32, 32);
}
TEST_CASE("Prepare SSE2", "[prepare]") {
if (kCPU < CPUType::SSE2) return;
TestPrepare<SSE2::Kernels16>(8, 8);
TestPrepare<SSE2::Kernels16>(32, 32);
}
template <class Routine> void TestSelectColumnsB(Index rows = 64, Index cols = 16) {
std::mt19937 gen;
// Go somewhat out of range too.
std::uniform_real_distribution<float> dist(-129.0, 129.0);
AlignedVector<float> input(rows * cols);
for (auto& it : input) {
it = dist(gen);
}
using Integer = typename Routine::Integer;
AlignedVector<Integer> prepared(input.size());
Routine::PrepareB(input.begin(), prepared.begin(), 1, rows, cols);
const int kSelectCols = 24;
Index select_cols[kSelectCols];
std::uniform_int_distribution<Index> col_dist(0, cols - 1);
for (auto& it : select_cols) {
it = col_dist(gen);
}
AlignedVector<Integer> test(rows * kSelectCols);
Routine::SelectColumnsB(prepared.begin(), test.begin(), rows, select_cols, select_cols + kSelectCols);
// Select columns manually in float space.
AlignedVector<float> selected(rows * kSelectCols);
for (Index r = 0; r < rows; ++r) {
for (int c = 0; c < kSelectCols; ++c) {
assert(c + r * kSelectCols < rows * kSelectCols);
selected[c + r * kSelectCols] = input[select_cols[c] + r * cols];
}
}
AlignedVector<Integer> ref(rows * kSelectCols);
Routine::PrepareB(selected.begin(), ref.begin(), 1, rows, kSelectCols);
CHECK_MESSAGE(memcmp(ref.begin(), test.begin(), sizeof(Integer) * rows * kSelectCols) == 0, "Reference:\n" <<
PrintMatrix(ref.begin(), rows, kSelectCols) << PrintMatrix(test.begin(), rows, kSelectCols));
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE("SelectColumnsB AVX512", "[select]") {
if (kCPU < CPUType::AVX512BW) return;
TestSelectColumnsB<AVX512BW::Kernels8>();
TestSelectColumnsB<AVX512BW::Kernels16>(256, 256);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
TEST_CASE("SelectColumnsB AVX2", "[select]") {
if (kCPU < CPUType::AVX2) return;
TestSelectColumnsB<AVX2::Kernels8>(256, 256);
TestSelectColumnsB<AVX2::Kernels16>(256, 256);
}
#endif
TEST_CASE("SelectColumnsB SSSE3", "[select]") {
if (kCPU < CPUType::SSSE3) return;
TestSelectColumnsB<SSSE3::Kernels8>();
TestSelectColumnsB<SSSE3::Kernels8>(256, 256);
}
TEST_CASE("SelectColumnsB SSE2", "[select]") {
if (kCPU < CPUType::SSE2) return;
TestSelectColumnsB<SSE2::Kernels16>();
TestSelectColumnsB<SSE2::Kernels16>(256, 256);
}
template <class Register> void TestMax() {
Register r = set1_ps<Register>(-2.0);
for (std::size_t i = 0; i < sizeof(Register) / sizeof(float); ++i) {
Register c = r;
reinterpret_cast<float*>(&c)[i] = -1.0;
CHECK_MESSAGE((MaxFloat32(c) == -1.0), "MaxFloat32 produced " << MaxFloat32(c));
}
}
TEST_CASE("Max", "[max]") {
TestMax<__m128>();
}
void CompareMaxAbs(const float *begin, const float *end, float test, std::size_t offset) {
float largest = std::fabs(*std::max_element(begin, end));
float smallest = std::fabs(*std::min_element(begin, end));
largest = std::max(largest, smallest);
CHECK_MESSAGE(largest == test, "Error: " << largest << " versus " << test << " in length " << (end - begin) << " offset " << offset);
}
template <float (*Backend) (const float *, const float *)> void TestMaxAbsolute() {
std::mt19937 gen;
std::uniform_real_distribution<float> dist(-8.0, 8.0);
const std::size_t kLengthMax = 65;
AlignedVector<float> test(kLengthMax);
for (std::size_t len = 1; len < kLengthMax; ++len) {
for (std::size_t t = 0; t < len; ++t) {
// Fill with [-8, 8).
for (auto& it : test) {
it = dist(gen);
}
CompareMaxAbs(test.begin(), test.begin() + len, Backend(test.begin(), test.begin() + len), t);
test[t] = -32.0;
CompareMaxAbs(test.begin(), test.begin() + len, Backend(test.begin(), test.begin() + len), t);
test[t] = 32.0;
CompareMaxAbs(test.begin(), test.begin() + len, Backend(test.begin(), test.begin() + len), t);
}
}
}
TEST_CASE("MaxAbsolute SSE2", "[max]") {
if (kCPU < CPUType::SSE2) return;
TestMaxAbsolute<SSE2::MaxAbsolute>();
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
TEST_CASE("MaxAbsolute AVX2", "[max]") {
if (kCPU < CPUType::AVX2) return;
TestMaxAbsolute<AVX2::MaxAbsolute>();
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE("MaxAbsolute AVX512BW", "[max]") {
if (kCPU < CPUType::AVX512BW) return;
TestMaxAbsolute<AVX512BW::MaxAbsolute>();
}
#endif
// Based on https://arxiv.org/abs/1705.01991
// Copyright (c) 2017 Microsoft Corporation
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
// Compute A*B slowly in floats.
template <class Routine> void TestMultiply(Index A_rows, Index width, Index B_cols,
float int_tolerance=.1, float float_tolerance=1, float MSE_float_tolerance=0, float MSE_int_tolerance=0) {
using Integer = typename Routine::Integer;
std::ostringstream info;
info << Routine::kName << "\t" << A_rows << '\t' << width << '\t' << B_cols << '\n';
// Initialize A and B.
AlignedVector<float> A(A_rows * width);
AlignedVector<float> B(width * B_cols);
std::mt19937 gen;
std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
for (auto& it : A) {
it = dist(gen);
}
for (auto& it : B) {
it = dist(gen);
}
float quant_mult = (sizeof(Integer) == 2) ? 1024 : 64;
float unquant_mult = 1.0f / (quant_mult*quant_mult);
AlignedVector<Integer> A_prep(A.size());
AlignedVector<Integer> B_prep(B.size());
Routine::PrepareA(A.begin(), A_prep.begin(), quant_mult, A_rows, width);
Routine::PrepareB(B.begin(), B_prep.begin(), quant_mult, width, B_cols);
AlignedVector<float> test_C(A_rows * B_cols);
OMPParallelWrap<callbacks::UnquantizeAndWrite, Routine>(A_prep.begin(), B_prep.begin(), A_rows, width, B_cols, callbacks::UnquantizeAndWrite(unquant_mult, test_C.begin()));
// Routine::Multiply(A_prep.begin(), B_prep.begin(), A_rows, width, B_cols, callbacks::Sequence(
// callbacks::Unquantize(unquant_mult),
// callbacks::Write<float>(test_C.begin())
// ));
AlignedVector<Integer> B_quant(B.size());
Routine::Quantize(B.begin(), B_quant.begin(), quant_mult, static_cast<Index>(B.size()));
AlignedVector<float> slowint_C(test_C.size());
// Assuming A is just quantization here.
references::Multiply(A_prep.begin(), B_quant.begin(), slowint_C.begin(), A_rows, width, B_cols, [&](int32_t sum, const callbacks::OutputBufferInfo&) {
return sum * unquant_mult;
});
AlignedVector<float> float_C(test_C.size());
references::Multiply(A.begin(), B.begin(), float_C.begin(), A_rows, width, B_cols, [&](double sum, const callbacks::OutputBufferInfo&) {
return static_cast<float>(sum);
});
CompareMSE(float_C.begin(), slowint_C.begin(), test_C.begin(), test_C.size(), info.str(),
int_tolerance, float_tolerance, MSE_float_tolerance, MSE_int_tolerance);
}
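The unquant_mult above follows from both operands having been scaled before the integer multiply, so every int32 accumulator carries a factor of quant_mult squared that has to be divided back out. A minimal worked example, illustration only (unquant_example is not a symbol from either library):

#include <cmath>
#include <cstdint>

inline float unquant_example() {
  float quant_mult = 64.0f;                                        // the 8-bit case used above
  float a = 0.5f, b = -0.25f;
  int32_t qa = static_cast<int32_t>(std::lround(a * quant_mult));  //  32
  int32_t qb = static_cast<int32_t>(std::lround(b * quant_mult));  // -16
  float unquant_mult = 1.0f / (quant_mult * quant_mult);
  return qa * qb * unquant_mult;                                   // -512 / 4096 = -0.125 = a * b
}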
template <class Routine> void TestMultiplyRelu(Index A_rows, Index width, Index B_cols,
float int_tolerance=.1, float float_tolerance=1, float MSE_float_tolerance=0, float MSE_int_tolerance=0) {
using Integer = typename Routine::Integer;
std::ostringstream info;
info << Routine::kName << "\t" << A_rows << '\t' << width << '\t' << B_cols << '\n';
// Initialize A and B.
AlignedVector<float> A(A_rows * width);
AlignedVector<float> B(width * B_cols);
std::mt19937 gen;
std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
for (auto& it : A) {
it = dist(gen);
}
for (auto& it : B) {
it = dist(gen);
}
float quant_mult = (sizeof(Integer) == 2) ? 1024 : 64;
float unquant_mult = 1.0f / (quant_mult*quant_mult);
AlignedVector<Integer> A_prep(A.size());
AlignedVector<Integer> B_prep(B.size());
Routine::PrepareA(A.begin(), A_prep.begin(), quant_mult, A_rows, width);
Routine::PrepareB(B.begin(), B_prep.begin(), quant_mult, width, B_cols);
AlignedVector<float> test_C(A_rows * B_cols);
OMPParallelWrap<callbacks::UnquantizeAndWriteRelu, Routine>(A_prep.begin(), B_prep.begin(), A_rows, width, B_cols, callbacks::UnquantizeAndWriteRelu(unquant_mult, test_C.begin()));
// Routine::Multiply(A_prep.begin(), B_prep.begin(), A_rows, width, B_cols, callbacks::Sequence(
// callbacks::Unquantize(unquant_mult),
// callbacks::Write<float>(test_C.begin())
// ));
AlignedVector<Integer> B_quant(B.size());
Routine::Quantize(B.begin(), B_quant.begin(), quant_mult, static_cast<Index>(B.size()));
AlignedVector<float> slowint_C(test_C.size());
// Assuming A is just quantization here.
references::Multiply(A_prep.begin(), B_quant.begin(), slowint_C.begin(), A_rows, width, B_cols, [&](int32_t sum, const callbacks::OutputBufferInfo&) {
float ret = std::max(0.0f, sum * unquant_mult);
return ret;
});
AlignedVector<float> float_C(test_C.size());
references::Multiply(A.begin(), B.begin(), float_C.begin(), A_rows, width, B_cols, [&](double sum, const callbacks::OutputBufferInfo&) {
return static_cast<float>(std::max(0.0,sum));
});
CompareMSE(float_C.begin(), slowint_C.begin(), test_C.begin(), test_C.size(), info.str(),
int_tolerance, float_tolerance, MSE_float_tolerance, MSE_int_tolerance);
}
//Code duplication may be avoided through some use of variadic templates, as the different WriteC symbols
//require a different number of arguments. I don't think the refactoring is worth it.
template <class Routine> void TestMultiplyBias(Index A_rows, Index width, Index B_cols,
float int_tolerance = 0.1f, float float_tolerance = 1.0f, float MSE_float_tolerance = 0.0f, float MSE_int_tolerance = 0.0f) {
using Integer = typename Routine::Integer;
std::ostringstream info;
info << Routine::kName << "\t" << A_rows << '\t' << width << '\t' << B_cols << '\n';
// Initialize A and B.
AlignedVector<float> A(A_rows * width);
AlignedVector<float> B(width * B_cols);
AlignedVector<float> bias(B_cols);
std::mt19937 gen;
std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
for (auto& it : A) {
it = dist(gen);
}
for (auto& it : B) {
it = dist(gen);
}
for (auto& it : bias) {
it = dist(gen);
}
float quant_mult = (sizeof(Integer) == 2) ? 1024 : 64;
float unquant_mult = 1.0f / (quant_mult*quant_mult);
AlignedVector<Integer> A_prep(A.size());
AlignedVector<Integer> B_prep(B.size());
Routine::PrepareA(A.begin(), A_prep.begin(), quant_mult, A_rows, width);
Routine::PrepareB(B.begin(), B_prep.begin(), quant_mult, width, B_cols);
AlignedVector<float> test_C(A_rows * B_cols);
Routine::Multiply(A_prep.begin(), B_prep.begin(), A_rows, width, B_cols, callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult, bias.begin(), test_C.begin()));
AlignedVector<Integer> B_quant(B.size());
Routine::Quantize(B.begin(), B_quant.begin(), quant_mult, static_cast<Index>(B.size()));
AlignedVector<float> slowint_C(test_C.size());
// Assuming A is just quantization here.
references::Multiply(A_prep.begin(), B_quant.begin(), slowint_C.begin(), A_rows, width, B_cols, [&](int32_t sum, const callbacks::OutputBufferInfo& info) {
return sum * unquant_mult + bias[info.col_idx];
});
AlignedVector<float> float_C(test_C.size());
references::Multiply(A.begin(), B.begin(), float_C.begin(), A_rows, width, B_cols, [&](double sum, const callbacks::OutputBufferInfo& info) {
return static_cast<float>(sum) + bias[info.col_idx];
});
CompareMSE(float_C.begin(), slowint_C.begin(), test_C.begin(), test_C.size(), info.str(),
int_tolerance, float_tolerance, MSE_float_tolerance, MSE_int_tolerance);
}
template <class Routine> void TestMultiplyBiasRelu(Index A_rows, Index width, Index B_cols,
float int_tolerance = 0.1f, float float_tolerance = 1.0f, float MSE_float_tolerance = 0.0f, float MSE_int_tolerance = 0.0f) {
using Integer = typename Routine::Integer;
std::ostringstream info;
info << Routine::kName << "\t" << A_rows << '\t' << width << '\t' << B_cols << '\n';
// Initialize A and B.
AlignedVector<float> A(A_rows * width);
AlignedVector<float> B(width * B_cols);
AlignedVector<float> bias(B_cols);
std::mt19937 gen;
std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
for (auto& it : A) {
it = dist(gen);
}
for (auto& it : B) {
it = dist(gen);
}
for (auto& it : bias) {
it = dist(gen);
}
float quant_mult = (sizeof(Integer) == 2) ? 1024 : 64;
float unquant_mult = 1.0f / (quant_mult*quant_mult);
AlignedVector<Integer> A_prep(A.size());
AlignedVector<Integer> B_prep(B.size());
Routine::PrepareA(A.begin(), A_prep.begin(), quant_mult, A_rows, width);
Routine::PrepareB(B.begin(), B_prep.begin(), quant_mult, width, B_cols);
AlignedVector<float> test_C(A_rows * B_cols);
Routine::Multiply(A_prep.begin(), B_prep.begin(), A_rows, width, B_cols, callbacks::UnquantizeAndAddBiasAndWriteRelu(unquant_mult, bias.begin(), test_C.begin()));
AlignedVector<Integer> B_quant(B.size());
Routine::Quantize(B.begin(), B_quant.begin(), quant_mult, static_cast<Index>(B.size()));
AlignedVector<float> slowint_C(test_C.size());
// Assuming A is just quantization here.
references::Multiply(A_prep.begin(), B_quant.begin(), slowint_C.begin(), A_rows, width, B_cols, [&](int32_t sum, const callbacks::OutputBufferInfo& info) {
return std::max(0.0f, sum * unquant_mult + bias[info.col_idx]);
});
AlignedVector<float> float_C(test_C.size());
references::Multiply(A.begin(), B.begin(), float_C.begin(), A_rows, width, B_cols, [&](double sum, const callbacks::OutputBufferInfo& info) {
return std::max(0.0f, static_cast<float>(sum) + bias[info.col_idx]);
});
CompareMSE(float_C.begin(), slowint_C.begin(), test_C.begin(), test_C.size(), info.str(),
int_tolerance, float_tolerance, MSE_float_tolerance, MSE_int_tolerance);
}
TEST_CASE ("Multiply SSE2 16bit", "[multiply]") {
if (kCPU < CPUType::SSE2) return;
TestMultiply<SSE2::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
TestMultiply<SSE2::Kernels16>(8, 2048, 256, .1f, 1, 0.02f);
TestMultiply<SSE2::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
TestMultiply<SSE2::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
TestMultiply<SSE2::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
TestMultiply<SSE2::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
}
TEST_CASE ("Multiply SSE2 16bit with relu", "[multiply_relu]") {
if (kCPU < CPUType::SSE2) return;
TestMultiplyRelu<SSE2::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
TestMultiplyRelu<SSE2::Kernels16>(8, 2048, 256, .1f, 1, 0.02f);
TestMultiplyRelu<SSE2::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
TestMultiplyRelu<SSE2::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
TestMultiplyRelu<SSE2::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
TestMultiplyRelu<SSE2::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
}
TEST_CASE ("Multiply SSE2 16bit with bias", "[biased_multiply]") {
if (kCPU < CPUType::SSE2) return;
TestMultiplyBias<SSE2::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
TestMultiplyBias<SSE2::Kernels16>(8, 2048, 256, .1f, 1, 0.02f);
TestMultiplyBias<SSE2::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
TestMultiplyBias<SSE2::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
TestMultiplyBias<SSE2::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
TestMultiplyBias<SSE2::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
}
TEST_CASE ("Multiply SSE2 16bit with bias and relu", "[biased_multiply_relu]") {
if (kCPU < CPUType::SSE2) return;
TestMultiplyBiasRelu<SSE2::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
TestMultiplyBiasRelu<SSE2::Kernels16>(8, 2048, 256, .1f, 1, 0.02f);
TestMultiplyBiasRelu<SSE2::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
TestMultiplyBiasRelu<SSE2::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
TestMultiplyBiasRelu<SSE2::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
TestMultiplyBiasRelu<SSE2::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
}
TEST_CASE ("Multiply SSSE3 8bit", "[multiply]") {
if (kCPU < CPUType::SSSE3) return;
TestMultiply<SSSE3::Kernels8>(8, 256, 256, 1.2f, 1.2f, 0.064f, 0.026f);
TestMultiply<SSSE3::Kernels8>(8, 2048, 256, 33, 33, 4.4f, 4.4f);
TestMultiply<SSSE3::Kernels8>(320, 256, 256, 1.9f, 1.9f, 0.1f, 0.01f);
TestMultiply<SSSE3::Kernels8>(472, 256, 256, 2.1f, 2.1f, 0.1f, 0.011f);
TestMultiply<SSSE3::Kernels8>(248, 256, 256, 1.7f, 1.7f, 0.1f, 0.012f);
TestMultiply<SSSE3::Kernels8>(200, 256, 256, 1.8f, 1.9f, 0.1f, 0.011f);
}
TEST_CASE ("Multiply SSSE3 8bit with relu", "[multiply_relu]") {
if (kCPU < CPUType::SSSE3) return;
TestMultiplyRelu<SSSE3::Kernels8>(8, 256, 256, 1.2f, 1.2f, 0.064f, 0.026f);
TestMultiplyRelu<SSSE3::Kernels8>(8, 2048, 256, 33, 33, 4.4f, 4.4f);
TestMultiplyRelu<SSSE3::Kernels8>(320, 256, 256, 1.9f, 1.9f, 0.1f, 0.01f);
TestMultiplyRelu<SSSE3::Kernels8>(472, 256, 256, 2.1f, 2.1f, 0.1f, 0.011f);
TestMultiplyRelu<SSSE3::Kernels8>(248, 256, 256, 1.7f, 1.7f, 0.1f, 0.012f);
TestMultiplyRelu<SSSE3::Kernels8>(200, 256, 256, 1.8f, 1.9f, 0.1f, 0.011f);
}
TEST_CASE ("Multiply SSSE3 8bit with bias", "[biased_multiply]") {
if (kCPU < CPUType::SSSE3) return;
TestMultiplyBias<SSSE3::Kernels8>(8, 256, 256, 1.2f, 1.2f, 0.064f, 0.026f);
TestMultiplyBias<SSSE3::Kernels8>(8, 2048, 256, 33, 33, 4.4f, 4.4f);
TestMultiplyBias<SSSE3::Kernels8>(320, 256, 256, 1.9f, 1.9f, 0.1f, 0.01f);
TestMultiplyBias<SSSE3::Kernels8>(472, 256, 256, 2.1f, 2.1f, 0.1f, 0.011f);
TestMultiplyBias<SSSE3::Kernels8>(248, 256, 256, 1.7f, 1.7f, 0.1f, 0.012f);
TestMultiplyBias<SSSE3::Kernels8>(200, 256, 256, 1.8f, 1.9f, 0.1f, 0.011f);
}
TEST_CASE ("Multiply SSSE3 8bit with bias and relu", "[biased_multiply_relu]") {
if (kCPU < CPUType::SSSE3) return;
TestMultiplyBiasRelu<SSSE3::Kernels8>(8, 256, 256, 1.2f, 1.2f, 0.064f, 0.026f);
TestMultiplyBiasRelu<SSSE3::Kernels8>(8, 2048, 256, 33, 33, 4.4f, 4.4f);
TestMultiplyBiasRelu<SSSE3::Kernels8>(320, 256, 256, 1.9f, 1.9f, 0.1f, 0.01f);
TestMultiplyBiasRelu<SSSE3::Kernels8>(472, 256, 256, 2.1f, 2.1f, 0.1f, 0.011f);
TestMultiplyBiasRelu<SSSE3::Kernels8>(248, 256, 256, 1.7f, 1.7f, 0.1f, 0.012f);
TestMultiplyBiasRelu<SSSE3::Kernels8>(200, 256, 256, 1.8f, 1.9f, 0.1f, 0.011f);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
TEST_CASE ("Multiply AVX2 8bit", "[multiply]") {
if (kCPU < CPUType::AVX2) return;
TestMultiply<AVX2::Kernels8>(8, 256, 256, .1f, 1, 0.1f);
TestMultiply<AVX2::Kernels8>(8, 2048, 256, 19, 19, 1.8f, 1.8f);
TestMultiply<AVX2::Kernels8>(320, 256, 256, .1f, 1, 0.1f);
TestMultiply<AVX2::Kernels8>(472, 256, 256, .1f, 1, 0.1f);
TestMultiply<AVX2::Kernels8>(248, 256, 256, .1f, 1, 0.1f);
TestMultiply<AVX2::Kernels8>(200, 256, 256, .1f, 1, 0.1f);
}
TEST_CASE ("Multiply AVX2 8bit with relu", "[multiply_relu]") {
if (kCPU < CPUType::AVX2) return;
TestMultiplyRelu<AVX2::Kernels8>(8, 256, 256, .1f, 1, 0.1f);
TestMultiplyRelu<AVX2::Kernels8>(8, 2048, 256, 19, 19, 1.8f, 1.8f);
TestMultiplyRelu<AVX2::Kernels8>(320, 256, 256, .1f, 1, 0.1f);
TestMultiplyRelu<AVX2::Kernels8>(472, 256, 256, .1f, 1, 0.1f);
TestMultiplyRelu<AVX2::Kernels8>(248, 256, 256, .1f, 1, 0.1f);
TestMultiplyRelu<AVX2::Kernels8>(200, 256, 256, .1f, 1, 0.1f);
}
TEST_CASE ("Multiply AVX2 8bit with bias", "[biased_multiply]") {
if (kCPU < CPUType::AVX2) return;
TestMultiplyBias<AVX2::Kernels8>(8, 256, 256, .1f, 1, 0.1f);
TestMultiplyBias<AVX2::Kernels8>(8, 2048, 256, 19, 19, 1.8f, 1.8f);
TestMultiplyBias<AVX2::Kernels8>(320, 256, 256, .1f, 1, 0.1f);
TestMultiplyBias<AVX2::Kernels8>(472, 256, 256, .1f, 1, 0.1f);
TestMultiplyBias<AVX2::Kernels8>(248, 256, 256, .1f, 1, 0.1f);
TestMultiplyBias<AVX2::Kernels8>(200, 256, 256, .1f, 1, 0.1f);
}
TEST_CASE ("Multiply AVX2 8bit with bias and relu", "[biased_multiply_relu]") {
if (kCPU < CPUType::AVX2) return;
TestMultiplyBiasRelu<AVX2::Kernels8>(8, 256, 256, .1f, 1, 0.1f);
TestMultiplyBiasRelu<AVX2::Kernels8>(8, 2048, 256, 19, 19, 1.8f, 1.8f);
TestMultiplyBiasRelu<AVX2::Kernels8>(320, 256, 256, .1f, 1, 0.1f);
TestMultiplyBiasRelu<AVX2::Kernels8>(472, 256, 256, .1f, 1, 0.1f);
TestMultiplyBiasRelu<AVX2::Kernels8>(248, 256, 256, .1f, 1, 0.1f);
TestMultiplyBiasRelu<AVX2::Kernels8>(200, 256, 256, .1f, 1, 0.1f);
}
TEST_CASE ("Multiply AVX2 16bit", "[multiply]") {
if (kCPU < CPUType::AVX2) return;
TestMultiply<AVX2::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
TestMultiply<AVX2::Kernels16>(8, 2048, 256, .1f, 1, 0.02f);
TestMultiply<AVX2::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
TestMultiply<AVX2::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
TestMultiply<AVX2::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
TestMultiply<AVX2::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
}
TEST_CASE ("Multiply AVX2 16bit with relu", "[multiply_relu]") {
if (kCPU < CPUType::AVX2) return;
TestMultiplyRelu<AVX2::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
TestMultiplyRelu<AVX2::Kernels16>(8, 2048, 256, .1f, 1, 0.02f);
TestMultiplyRelu<AVX2::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
TestMultiplyRelu<AVX2::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
TestMultiplyRelu<AVX2::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
TestMultiplyRelu<AVX2::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
}
TEST_CASE ("Multiply AVX2 16bit with bias", "[biased_multiply]") {
if (kCPU < CPUType::AVX2) return;
TestMultiplyBias<AVX2::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
TestMultiplyBias<AVX2::Kernels16>(8, 2048, 256, .1f, 1, 0.02f);
TestMultiplyBias<AVX2::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
TestMultiplyBias<AVX2::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
TestMultiplyBias<AVX2::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
TestMultiplyBias<AVX2::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
}
TEST_CASE ("Multiply AVX2 16bit with bias and relu", "[biased_multiply_relu]") {
if (kCPU < CPUType::AVX2) return;
TestMultiplyBiasRelu<AVX2::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
TestMultiplyBiasRelu<AVX2::Kernels16>(8, 2048, 256, .1f, 1, 0.02f);
TestMultiplyBiasRelu<AVX2::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
TestMultiplyBiasRelu<AVX2::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
TestMultiplyBiasRelu<AVX2::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
TestMultiplyBiasRelu<AVX2::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE ("Multiply AVX512 8bit", "[multiply]") {
if (kCPU < CPUType::AVX512BW) return;
TestMultiply<AVX512BW::Kernels8>(8, 256, 256, 0, 0.25f, 0.062f);
TestMultiply<AVX512BW::Kernels8>(8, 2048, 256, 3.7f, 4, 0.37f, 0.33f);
TestMultiply<AVX512BW::Kernels8>(320, 256, 256, 0, 0.26f, 0.059f);
TestMultiply<AVX512BW::Kernels8>(472, 256, 256, 0, 0.29f, 0.059f);
TestMultiply<AVX512BW::Kernels8>(248, 256, 256, 0, 0.29f, 0.059f);
TestMultiply<AVX512BW::Kernels8>(200, 256, 256, 0, 0.28f, 0.06f);
}
TEST_CASE ("Multiply AVX512 8bit with relu", "[multiply_relu]") {
if (kCPU < CPUType::AVX512BW) return;
TestMultiplyRelu<AVX512BW::Kernels8>(8, 256, 256, 0, 0.25f, 0.062f);
TestMultiplyRelu<AVX512BW::Kernels8>(8, 2048, 256, 3.7f, 4, 0.37f, 0.33f);
TestMultiplyRelu<AVX512BW::Kernels8>(320, 256, 256, 0, 0.26f, 0.059f);
TestMultiplyRelu<AVX512BW::Kernels8>(472, 256, 256, 0, 0.29f, 0.059f);
TestMultiplyRelu<AVX512BW::Kernels8>(248, 256, 256, 0, 0.29f, 0.059f);
TestMultiplyRelu<AVX512BW::Kernels8>(200, 256, 256, 0, 0.28f, 0.06f);
}
TEST_CASE ("Multiply AVX512 8bit with bias", "[biased_multiply]") {
if (kCPU < CPUType::AVX512BW) return;
TestMultiplyBias<AVX512BW::Kernels8>(8, 256, 256, 0, 0.25f, 0.062f);
TestMultiplyBias<AVX512BW::Kernels8>(8, 2048, 256, 3.7f, 4, 0.37f, 0.33f);
TestMultiplyBias<AVX512BW::Kernels8>(320, 256, 256, 0, 0.26f, 0.059f);
TestMultiplyBias<AVX512BW::Kernels8>(472, 256, 256, 0, 0.29f, 0.059f);
TestMultiplyBias<AVX512BW::Kernels8>(248, 256, 256, 0, 0.29f, 0.059f);
TestMultiplyBias<AVX512BW::Kernels8>(200, 256, 256, 0, 0.28f, 0.06f);
}
TEST_CASE ("Multiply AVX512 8bit with bias and relu", "[biased_multiply_relu]") {
if (kCPU < CPUType::AVX512BW) return;
TestMultiplyBiasRelu<AVX512BW::Kernels8>(8, 256, 256, 0, 0.25f, 0.062f);
TestMultiplyBiasRelu<AVX512BW::Kernels8>(8, 2048, 256, 3.7f, 4, 0.37f, 0.33f);
TestMultiplyBiasRelu<AVX512BW::Kernels8>(320, 256, 256, 0, 0.26f, 0.059f);
TestMultiplyBiasRelu<AVX512BW::Kernels8>(472, 256, 256, 0, 0.29f, 0.059f);
TestMultiplyBiasRelu<AVX512BW::Kernels8>(248, 256, 256, 0, 0.29f, 0.059f);
TestMultiplyBiasRelu<AVX512BW::Kernels8>(200, 256, 256, 0, 0.28f, 0.06f);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512VNNI
TEST_CASE ("Multiply AVX512VNNI 8bit", "[multiply]") {
if (kCPU < CPUType::AVX512VNNI) return;
TestMultiply<AVX512VNNI::Kernels8>(8, 256, 256, 0, 0.25f, 0.062f);
TestMultiply<AVX512VNNI::Kernels8>(8, 2048, 256, 0, 0.55f, 0.25f);
TestMultiply<AVX512VNNI::Kernels8>(320, 256, 256, 0, 0.26f, 0.059f);
TestMultiply<AVX512VNNI::Kernels8>(472, 256, 256, 0, 0.29f, 0.059f);
TestMultiply<AVX512VNNI::Kernels8>(248, 256, 256, 0, 0.29f, 0.059f);
TestMultiply<AVX512VNNI::Kernels8>(200, 256, 256, 0, 0.28f, 0.06f);
}
TEST_CASE ("Multiply AVX512VNNI 8bit with relu", "[multiply_relu]") {
if (kCPU < CPUType::AVX512VNNI) return;
TestMultiplyRelu<AVX512VNNI::Kernels8>(8, 256, 256, 0, 0.25f, 0.062f);
TestMultiplyRelu<AVX512VNNI::Kernels8>(8, 2048, 256, 0, 0.55f, 0.25f);
TestMultiplyRelu<AVX512VNNI::Kernels8>(320, 256, 256, 0, 0.26f, 0.059f);
TestMultiplyRelu<AVX512VNNI::Kernels8>(472, 256, 256, 0, 0.29f, 0.059f);
TestMultiplyRelu<AVX512VNNI::Kernels8>(248, 256, 256, 0, 0.29f, 0.059f);
TestMultiplyRelu<AVX512VNNI::Kernels8>(200, 256, 256, 0, 0.28f, 0.06f);
}
TEST_CASE ("Multiply AVX512VNNI 8bit with bias", "[biased_multiply]") {
if (kCPU < CPUType::AVX512VNNI) return;
TestMultiplyBias<AVX512VNNI::Kernels8>(8, 256, 256, 0, 0.25f, 0.062f);
TestMultiplyBias<AVX512VNNI::Kernels8>(8, 2048, 256, 0, 0.55f, 0.25f);
TestMultiplyBias<AVX512VNNI::Kernels8>(320, 256, 256, 0, 0.26f, 0.059f);
TestMultiplyBias<AVX512VNNI::Kernels8>(472, 256, 256, 0, 0.29f, 0.059f);
TestMultiplyBias<AVX512VNNI::Kernels8>(248, 256, 256, 0, 0.29f, 0.059f);
TestMultiplyBias<AVX512VNNI::Kernels8>(200, 256, 256, 0, 0.28f, 0.06f);
}
TEST_CASE ("Multiply AVX512VNNI 8bit with bias and relu", "[biased_multiply_relu]") {
if (kCPU < CPUType::AVX512VNNI) return;
TestMultiplyBiasRelu<AVX512VNNI::Kernels8>(8, 256, 256, 0, 0.25f, 0.062f);
TestMultiplyBiasRelu<AVX512VNNI::Kernels8>(8, 2048, 256, 0, 0.55f, 0.25f);
TestMultiplyBiasRelu<AVX512VNNI::Kernels8>(320, 256, 256, 0, 0.26f, 0.059f);
TestMultiplyBiasRelu<AVX512VNNI::Kernels8>(472, 256, 256, 0, 0.29f, 0.059f);
TestMultiplyBiasRelu<AVX512VNNI::Kernels8>(248, 256, 256, 0, 0.29f, 0.059f);
TestMultiplyBiasRelu<AVX512VNNI::Kernels8>(200, 256, 256, 0, 0.28f, 0.06f);
}
#endif
TEST_CASE ("Multiply AVX512 16bit", "[multiply]") {
if (kCPU < CPUType::AVX512BW) return;
TestMultiply<AVX512BW::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
TestMultiply<AVX512BW::Kernels16>(8, 2048, 256, .1f, 1, 0.011f);
TestMultiply<AVX512BW::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
TestMultiply<AVX512BW::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
TestMultiply<AVX512BW::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
TestMultiply<AVX512BW::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
}
TEST_CASE ("Multiply AVX512 16bit with relu", "[multiply_relu]") {
if (kCPU < CPUType::AVX512BW) return;
TestMultiplyRelu<AVX512BW::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
TestMultiplyRelu<AVX512BW::Kernels16>(8, 2048, 256, .1f, 1, 0.011f);
TestMultiplyRelu<AVX512BW::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
TestMultiplyRelu<AVX512BW::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
TestMultiplyRelu<AVX512BW::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
TestMultiplyRelu<AVX512BW::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
}
TEST_CASE ("Multiply AVX512 16bit with bias", "[biased_multiply]") {
if (kCPU < CPUType::AVX512BW) return;
TestMultiplyBias<AVX512BW::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
TestMultiplyBias<AVX512BW::Kernels16>(8, 2048, 256, .1f, 1, 0.011f);
TestMultiplyBias<AVX512BW::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
TestMultiplyBias<AVX512BW::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
TestMultiplyBias<AVX512BW::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
TestMultiplyBias<AVX512BW::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
}
TEST_CASE ("Multiply AVX512 16bit with bias and relu", "[biased_multiply_relu]") {
if (kCPU < CPUType::AVX512BW) return;
TestMultiplyBiasRelu<AVX512BW::Kernels16>(8, 256, 256, .1f, 1, 0.01f);
TestMultiplyBiasRelu<AVX512BW::Kernels16>(8, 2048, 256, .1f, 1, 0.011f);
TestMultiplyBiasRelu<AVX512BW::Kernels16>(320, 256, 256, .1f, 1, 0.01f);
TestMultiplyBiasRelu<AVX512BW::Kernels16>(472, 256, 256, .1f, 1, 0.01f);
TestMultiplyBiasRelu<AVX512BW::Kernels16>(248, 256, 256, .1f, 1, 0.01f);
TestMultiplyBiasRelu<AVX512BW::Kernels16>(200, 256, 256, .1f, 1, 0.01f);
}
#endif
} // namespace intgemm
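references::Multiply, used as the ground truth throughout this file, is defined in the removed test helpers rather than in this diff. Functionally it is a plain triple loop over int32 accumulators with a per-element callback; a minimal sketch under that assumption (the real helper also passes row/column information to the callback, and multiply_ref is an illustrative name):

#include <cstddef>
#include <cstdint>

// C[r][c] = callback(sum_k A[r][k] * B[k][c]) over already-quantized inputs,
// where the callback applies unquantization, bias and/or relu as in the tests.
template <class Callback>
void multiply_ref(const int8_t* A, const int8_t* B, float* C,
                  std::size_t A_rows, std::size_t width, std::size_t B_cols,
                  Callback callback) {
  for (std::size_t r = 0; r < A_rows; ++r)
    for (std::size_t c = 0; c < B_cols; ++c) {
      int32_t sum = 0;
      for (std::size_t k = 0; k < width; ++k)
        sum += static_cast<int32_t>(A[r * width + k]) *
               static_cast<int32_t>(B[k * B_cols + c]);
      C[r * B_cols + c] = callback(sum);
    }
}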


@ -1,96 +0,0 @@
#include "test.h"
#include "../intgemm/aligned.h"
#include "../intgemm/avx2_gemm.h"
#include "../intgemm/avx512_gemm.h"
#include "../intgemm/sse2_gemm.h"
#include "../intgemm/ssse3_gemm.h"
#include <cmath>
#include <cstring>
#include <iostream>
namespace intgemm {
namespace {
template <typename Backend>
void PrepareBQuantizedTransposedRef(const typename Backend::Integer* input, typename Backend::Integer* output, Index B_transposed_cols, Index B_transposed_rows) {
using vec_t = intgemm::vector_t<Backend::kUses, typename Backend::Integer>;
constexpr Index vec_len = sizeof(vec_t) / sizeof(typename Backend::Integer);
auto output_it = output;
for (Index r = 0; r < B_transposed_rows; r += 8)
for (Index c = 0; c < B_transposed_cols; c += vec_len)
for (Index ri = 0; ri < 8; ++ri)
for (Index ci = 0; ci < vec_len; ++ci)
*output_it++ = input[(r + ri) * B_transposed_cols + c + ci];
}
template <typename Backend>
bool Test(const AlignedVector<typename Backend::Integer>& input, Index B_rows, Index B_cols) {
bool success = true;
AlignedVector<typename Backend::Integer> output(input.size());
Backend::PrepareBQuantizedTransposed(input.begin(), output.begin(), B_rows, B_cols);
AlignedVector<typename Backend::Integer> reference(input.size());
PrepareBQuantizedTransposedRef<Backend>(input.begin(), reference.begin(), B_rows, B_cols);
for (std::size_t i = 0; i < output.size(); ++i) {
if (output[i] != reference[i]) {
UNSCOPED_INFO("Error at " << i << ", output = " << int(output[i]) << ", reference = " << int(reference[i]));
success = false;
break;
}
}
return success;
}
template <typename Backend>
bool TestMany(Index B_rows, Index B_cols) {
AlignedVector<typename Backend::Integer> input(B_rows * B_cols);
std::generate(input.begin(), input.end(), []() {
static constexpr int divider = sizeof(intgemm::vector_t<Backend::kUses, typename Backend::Integer>) / sizeof(typename Backend::Integer);
static int value = 0;
return static_cast<typename Backend::Integer>((value++) % divider);
});
return Test<Backend>(input, B_rows, B_cols);
}
TEST_CASE("PrepareBQuantizedTransposed SSE2", "") {
if (kCPU < CPUType::SSE2)
return;
CHECK(TestMany<SSE2::Kernels16>(32, 128));
}
TEST_CASE("PrepareBQuantizedTransposed SSSE3", "") {
if (kCPU < CPUType::SSSE3)
return;
CHECK(TestMany<SSSE3::Kernels8>(32, 128));
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
TEST_CASE("PrepareBQuantizedTransposed AVX2", "") {
if (kCPU < CPUType::AVX2)
return;
CHECK(TestMany<AVX2::Kernels8>(32, 128));
CHECK(TestMany<AVX2::Kernels16>(32, 128));
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE("PrepareBQuantizedTransposed AVX512", "") {
if (kCPU < CPUType::AVX512BW)
return;
CHECK(TestMany<AVX512BW::Kernels8>(64, 128));
CHECK(TestMany<AVX512BW::Kernels16>(64, 128));
}
#endif
}
}


@ -1,97 +0,0 @@
#include "test.h"
#include "../intgemm/aligned.h"
#include "../intgemm/avx2_gemm.h"
#include "../intgemm/avx512_gemm.h"
#include "../intgemm/sse2_gemm.h"
#include "../intgemm/ssse3_gemm.h"
#include <cmath>
#include <cstring>
#include <iostream>
namespace intgemm {
namespace {
template <typename Backend>
void PrepareBTransposedRef(const float* input, typename Backend::Integer* output, float quant_mult, Index B_transposed_cols, Index B_transposed_rows) {
using vec_t = intgemm::vector_t<Backend::kUses, typename Backend::Integer>;
constexpr Index vec_len = sizeof(vec_t) / sizeof(typename Backend::Integer);
for (Index i = 0; i < B_transposed_rows * B_transposed_cols / 8; i += vec_len)
for (Index j = 0; j < 8; ++j)
for (Index k = 0; k < vec_len; ++k) {
Index col = (i + k) % B_transposed_cols;
Index row = 8 * ((i + k) / B_transposed_cols) + j;
*output++ = static_cast<typename Backend::Integer>(input[row * B_transposed_cols + col] * quant_mult);
}
}
template <typename Backend>
bool Test(const AlignedVector<float>& input, Index B_rows, Index B_cols, float quant_mult) {
bool success = true;
AlignedVector<typename Backend::Integer> output(input.size());
Backend::PrepareBTransposed(input.begin(), output.begin(), quant_mult, B_rows, B_cols);
AlignedVector<typename Backend::Integer> reference(input.size());
PrepareBTransposedRef<Backend>(input.begin(), reference.begin(), quant_mult, B_rows, B_cols);
for (std::size_t i = 0; i < output.size(); ++i) {
if (output[i] != reference[i]) {
UNSCOPED_INFO("Error at " << i << ", output = " << int(output[i]) << ", reference = " << int(reference[i]));
success = false;
break;
}
}
return success;
}
template <typename Backend>
bool TestMany(Index B_rows, Index B_cols, float quant_mult) {
AlignedVector<float> input(B_rows * B_cols);
std::generate(input.begin(), input.end(), []() {
static constexpr int divider = sizeof(intgemm::vector_t<Backend::kUses, typename Backend::Integer>) / sizeof(typename Backend::Integer);
static int value = 0;
return static_cast<float>((value++) % divider);
});
return Test<Backend>(input, B_rows, B_cols, quant_mult);
}
TEST_CASE("PrepareBTransposed SSE2", "") {
if (kCPU < CPUType::SSE2)
return;
CHECK(TestMany<SSE2::Kernels16>(4, 128, 2.0f));
}
TEST_CASE("PrepareBTransposed SSSE3", "") {
if (kCPU < CPUType::SSSE3)
return;
CHECK(TestMany<SSSE3::Kernels8>(4, 128, 2.0f));
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
TEST_CASE("PrepareBTransposed AVX2", "") {
if (kCPU < CPUType::AVX2)
return;
CHECK(TestMany<AVX2::Kernels8>(8, 128, 2.0f));
CHECK(TestMany<AVX2::Kernels16>(8, 128, 2.0f));
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE("PrepareBTransposed AVX512", "") {
if (kCPU < CPUType::AVX512BW)
return;
CHECK(TestMany<AVX512BW::Kernels8>(16, 128, 2.0f));
CHECK(TestMany<AVX512BW::Kernels16>(16, 128, 2.0f));
}
#endif
}
}

third_party/intgemm/test/quantize_test.cc (vendored)

@ -1,199 +0,0 @@
#include "test.h"
#include "../intgemm/aligned.h"
#include "../intgemm/avx2_gemm.h"
#include "../intgemm/avx512_gemm.h"
#include "../intgemm/sse2_gemm.h"
#include "../intgemm/ssse3_gemm.h"
#include "../intgemm/stats.h"
#include <cmath>
#include <cstring>
#include <iostream>
namespace intgemm {
namespace {
void QuantizeRef(const float *input, int16_t *output, float quant_mult, std::size_t size) {
for (std::size_t i = 0; i < size; ++i) {
float value = roundf(input[i] * quant_mult);
value = std::max(-32768.0f, value);
value = std::min(32767.0f, value);
// float should be exact in this range.
output[i] = static_cast<int16_t>(value);
}
}
void QuantizeRef(const float *input, int8_t *output, float quant_mult, std::size_t size) {
for (std::size_t i = 0; i < size; ++i) {
float value = roundf(input[i] * quant_mult);
value = std::max(-127.0f, value);
value = std::min(127.0f, value);
output[i] = static_cast<int8_t>(value);
}
}
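The clamping in these two references is what the corners inputs further down probe; note the int8 path saturates to -127 rather than -128, which keeps the quantized range symmetric. A small sketch exercising the overloads defined above (quantize_saturation_sketch is an illustrative name):

#include <cassert>
#include <cstdint>

inline void quantize_saturation_sketch() {
  float in16[] = {32768.f, -32769.f};
  int16_t out16[2];
  QuantizeRef(in16, out16, 1.0f, 2);
  assert(out16[0] == 32767 && out16[1] == -32768);  // clamped to the int16 range

  float in8[] = {128.f, -129.f};
  int8_t out8[2];
  QuantizeRef(in8, out8, 1.0f, 2);
  assert(out8[0] == 127 && out8[1] == -127);        // -127, not -128
}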
MeanStd VectorMeanStd(AlignedVector<float>& vals, int num_items, bool absolute) {
float normal_sums = 0;
float squares_sum = 0;
if (absolute) {
std::for_each(vals.begin(), vals.end(), [&] (float n) {normal_sums+=std::abs(n);});
} else {
std::for_each(vals.begin(), vals.end(), [&] (float n) {normal_sums+=n;});
}
std::for_each(vals.begin(), vals.end(), [&] (float n) {squares_sum+=n*n;});
MeanStd ret;
ret.mean = normal_sums/num_items;
ret.stddev = std::sqrt((squares_sum/num_items) - (ret.mean*ret.mean));
return ret;
}
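The reference above uses the one-pass identity stddev = sqrt(E[x^2] - mean^2). An equivalent two-pass formulation, shown only to make that identity explicit (mean_std_two_pass and MeanStdSketch are illustrative names), trades a second sweep for better behaviour when the mean is large relative to the spread:

#include <cmath>
#include <cstddef>

struct MeanStdSketch { float mean; float stddev; };

inline MeanStdSketch mean_std_two_pass(const float* begin, const float* end) {
  std::size_t n = static_cast<std::size_t>(end - begin);
  float mean = 0.0f;
  for (const float* p = begin; p != end; ++p) mean += *p;
  mean /= n;
  float var = 0.0f;
  for (const float* p = begin; p != end; ++p) var += (*p - mean) * (*p - mean);
  return {mean, std::sqrt(var / n)};
}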
template <MeanStd (*Backend) (const float *, const float *, bool)>
void testVectorMeanStd(int num_items, bool absolute=false) {
std::mt19937 gen;
std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
AlignedVector<float> inputVec(num_items);
for (auto&& it : inputVec) {
it = dist(gen);
}
MeanStd reference = VectorMeanStd(inputVec, num_items, absolute);
MeanStd fast = Backend(inputVec.begin(), inputVec.end(), absolute);
float meanDifference = std::fabs(reference.mean - fast.mean);
float stdDifference = std::fabs(reference.stddev - fast.stddev);
float eps = 0.00002f; //Accumulating horizontal sums can lead to errors.
CHECK_MESSAGE(meanDifference <= eps, "Items: " << num_items << " Absolute: " << absolute << " Reference mean: " << reference.mean << " actual: " << fast.mean);
CHECK_MESSAGE(stdDifference <= eps, "Items: " << num_items << " Absolute: " << absolute << " Reference stddev: " << reference.stddev << " actual: " << fast.stddev);
}
template <class I> bool IsOff(float from, I ref, I test) {
if (ref == test) return false;
if (ref - test > 1 && test - ref > 1) return true;
float off_test = std::fabs(static_cast<float>(test) - from);
float off_ref = std::fabs(static_cast<float>(ref) - from);
// Allow 0.5 to round either way.
if (off_test > 0.49 && off_test < 0.51 && off_ref > 0.49 && off_ref < 0.51) return false;
return true;
}
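The 0.49/0.51 window exists because the scalar reference and the SIMD kernels may legitimately disagree on exact halves: roundf rounds halves away from zero while the SSE/AVX float-to-int conversions round to nearest even under the default rounding mode. A tiny scalar-only illustration (rounding_demo is an illustrative name):

#include <cmath>
#include <cstdio>

inline void rounding_demo() {
  std::printf("%ld\n", std::lroundf(2.5f));  // prints 3: half away from zero
  // _mm_cvtps_epi32 on 2.5f yields 2 under the default round-to-nearest-even
  // mode, so the quantize tests accept either neighbouring integer there.
}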
template <class Backend> bool Test(const float *input_unaligned, float quant_mult, std::size_t size) {
using Integer = typename Backend::Integer;
bool success = true;
AlignedVector<float> input(size);
std::memcpy(input.begin(), input_unaligned, sizeof(float) * size);
AlignedVector<Integer> ref(size);
AlignedVector<Integer> test(size);
QuantizeRef(input.begin(), ref.begin(), quant_mult, static_cast<Index>(size));
Backend::Quantize(input.begin(), test.begin(), quant_mult, static_cast<Index>(size));
for (std::size_t i = 0; i < size; ++i) {
if (IsOff(input[i] * quant_mult, ref[i], test[i])) {
UNSCOPED_INFO("Error at " << i << " from " << input[i] << '*' << quant_mult << '=' << (input[i]*quant_mult) << " ref = " << static_cast<int>(ref[i]) << " test = " << static_cast<int>(test[i]));
success = false;
}
}
return success;
}
template <class Backend> void TestMany(std::size_t grow) {
float input[33] = {
0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 13.f,
14.f, 15.f, 16.f, 17.f, 18.f, 19.f, 20.f, 21.f, 22.f, 23.f, 24.f, 25.f,
26.f, 27.f, 28.f, 29.f, 30.f, 31.f, 32.f};
float corners[33] = {
-32769.f, -32768.f, -32767.f, -129.f, -128.f, -127.f, -1.f, 0.f, 1.f,
126.f, 127.f, 128.f, 129.f, 32766.f, 32768.f, 32769.f, -1.9f, -1.5f, -1.1f,
-1.f, -0.9f, -0.5f, -0.1f, 0.0f, 0.1f, 0.5f, 0.9f, 1.0f, 1.1f, 1.5f, 1.9f,
16056.8f, 2.5f};
for (std::size_t len = 0; len <= 33; len += grow) {
CHECK(Test<Backend>(input, 1.0f, len));
CHECK(Test<Backend>(input, 32.0f, len));
CHECK(Test<Backend>(corners, 1.0f, len));
CHECK(Test<Backend>(corners, -1.0f, len));
CHECK(Test<Backend>(corners, -0.49f, len));
}
}
TEST_CASE ("Quantize SSE2", "[quantize]") {
if (kCPU < CPUType::SSE2) return;
TestMany<SSE2::Kernels16>(8);
}
TEST_CASE ("Quantize SSSE3", "[quantize]") {
if (kCPU < CPUType::SSSE3) return;
TestMany<SSSE3::Kernels8>(1);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
TEST_CASE ("Quantize AVX2", "[quantize]") {
if (kCPU < CPUType::AVX2) return;
TestMany<AVX2::Kernels8>(1);
TestMany<AVX2::Kernels16>(16);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE ("Quantize AVX512", "[quantize]") {
if (kCPU < CPUType::AVX512BW) return;
TestMany<AVX512BW::Kernels8>(1);
TestMany<AVX512BW::Kernels16>(16);
}
#endif
TEST_CASE("QuantizeStd SSSE3", "[VectorMeanStd]") {
if (kCPU < CPUType::SSSE3) return;
testVectorMeanStd<SSE2::VectorMeanStd>(64);
testVectorMeanStd<SSE2::VectorMeanStd>(64, true);
testVectorMeanStd<SSE2::VectorMeanStd>(256);
testVectorMeanStd<SSE2::VectorMeanStd>(256, true);
testVectorMeanStd<SSE2::VectorMeanStd>(2048);
testVectorMeanStd<SSE2::VectorMeanStd>(2048, true);
testVectorMeanStd<SSE2::VectorMeanStd>(65536);
testVectorMeanStd<SSE2::VectorMeanStd>(65536, true);
testVectorMeanStd<SSE2::VectorMeanStd>(81920);
testVectorMeanStd<SSE2::VectorMeanStd>(81920, true);
testVectorMeanStd<SSE2::VectorMeanStd>(120832);
testVectorMeanStd<SSE2::VectorMeanStd>(120832, true);
}
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX2
TEST_CASE("QuantizeStd AVX2", "[VectorMeanStd]") {
if (kCPU < CPUType::AVX2) return;
testVectorMeanStd<AVX2::VectorMeanStd>(64);
testVectorMeanStd<AVX2::VectorMeanStd>(64, true);
testVectorMeanStd<AVX2::VectorMeanStd>(256);
testVectorMeanStd<AVX2::VectorMeanStd>(256, true);
testVectorMeanStd<AVX2::VectorMeanStd>(2048);
testVectorMeanStd<AVX2::VectorMeanStd>(2048, true);
testVectorMeanStd<AVX2::VectorMeanStd>(65536);
testVectorMeanStd<AVX2::VectorMeanStd>(65536, true);
testVectorMeanStd<AVX2::VectorMeanStd>(81920);
testVectorMeanStd<AVX2::VectorMeanStd>(81920, true);
testVectorMeanStd<AVX2::VectorMeanStd>(120832);
testVectorMeanStd<AVX2::VectorMeanStd>(120832, true);
}
#endif
#ifdef INTGEMM_COMPILER_SUPPORTS_AVX512BW
TEST_CASE("QuantizeStd AVX512BW", "[VectorMeanStd]") {
if (kCPU < CPUType::AVX512BW) return;
testVectorMeanStd<AVX512BW::VectorMeanStd>(64);
testVectorMeanStd<AVX512BW::VectorMeanStd>(64, true);
testVectorMeanStd<AVX512BW::VectorMeanStd>(256);
testVectorMeanStd<AVX512BW::VectorMeanStd>(256, true);
testVectorMeanStd<AVX512BW::VectorMeanStd>(2048);
testVectorMeanStd<AVX512BW::VectorMeanStd>(2048, true);
testVectorMeanStd<AVX512BW::VectorMeanStd>(65536);
testVectorMeanStd<AVX512BW::VectorMeanStd>(65536, true);
testVectorMeanStd<AVX512BW::VectorMeanStd>(81920);
testVectorMeanStd<AVX512BW::VectorMeanStd>(81920, true);
testVectorMeanStd<AVX512BW::VectorMeanStd>(120832);
testVectorMeanStd<AVX512BW::VectorMeanStd>(120832, true);
}
#endif
} // namespace
} // namespace intgemm

third_party/intgemm/test/test.cc (vendored)

@ -1,27 +0,0 @@
#define CATCH_CONFIG_RUNNER
#include "test.h"
#include <cmath>
int main(int argc, char ** argv) {
return Catch::Session().run(argc, argv);
}
namespace intgemm {
void CompareMSE(const float *float_ref, const float *int_ref, const float *int_test, std::size_t size, std::string test_info,
float int_tolerance, float float_tolerance, float MSE_float_tolerance, float MSE_int_tolerance) {
float int_sum = 0.0, float_sum = 0.0;
for (std::size_t i = 0; i < size; ++i) {
float int_diff = int_ref[i] - int_test[i];
float float_diff = float_ref[i] - int_test[i];
CHECK_MESSAGE(std::fabs(int_diff) <= int_tolerance, test_info << "Inaccurate compared to int reference at " << i << ' ' << int_ref[i] << ' ' << int_test[i]);
CHECK_MESSAGE(std::fabs(float_diff) <= float_tolerance, test_info << "Inaccurate compared to float reference at " << i << ' ' << float_ref[i] << ' ' << int_test[i]);
int_sum += int_diff * int_diff;
float_sum += float_diff * float_diff;
}
CHECK_MESSAGE(std::fabs(sqrt(float_sum / size)) <= MSE_float_tolerance, test_info << "Float MSE = " << sqrt(float_sum / size));
CHECK_MESSAGE(std::fabs(sqrt(int_sum / size)) <= MSE_int_tolerance, test_info << "Int MSE = " << sqrt(int_sum / size));
}
} // namespace intgemm

third_party/intgemm/test/test.h (vendored)

@ -1,132 +0,0 @@
#pragma once
#include "intgemm/intgemm_config.h"
#include "3rd_party/catch.hpp"
#include "../intgemm/intgemm.h"
#include "../intgemm/aligned.h"
#include <cmath>
#include <sstream>
#include <iostream>
#include <iomanip>
#define CHECK_MESSAGE(cond, msg) do { INFO(msg); CHECK(cond); } while(0)
#define CHECK_FALSE_MESSAGE(cond, msg) do { INFO(msg); CHECK_FALSE(cond); } while(0)
#define REQUIRE_MESSAGE(cond, msg) do { INFO(msg); REQUIRE(cond); } while(0)
#define REQUIRE_FALSE_MESSAGE(cond, msg) do { INFO(msg); REQUIRE_FALSE(cond); } while(0)
#define CHECK_EPS(actual, expected, epsilon) \
do { \
if (std::fabs((actual) - (expected)) < epsilon) { SUCCEED(); } \
else { CHECK((actual) == (expected)); } \
} while(0)
#define KERNEL_TEST_CASE(name) TEST_CASE("Kernel: " name, "[kernel_test]")
namespace intgemm {
template <typename Type>
void Compare(const Type* reference, const Type* actual, Index size) {
for (Index i = 0; i < size; ++i) {
INFO("Inaccurate at " << i << ' ' << reference[i] << ' ' << actual[i]);
CHECK(reference[i] == actual[i]);
}
}
template <typename Type>
void CompareEps(const Type* reference, const Type* actual, Index size, Type epsilon) {
for (Index i = 0; i < size; ++i) {
INFO("Inaccurate at " << i << ' ' << reference[i] << ' ' << actual[i]);
// Ratio to maximum value.
float threshold = epsilon * std::max<float>(0.01f, std::fabs(reference[i]));
CHECK(std::fabs(reference[i] - actual[i]) < threshold);
}
}
void CompareMSE(const float *float_ref, const float *int_ref, const float *int_test,
std::size_t size, std::string test_info, float int_tolerance,
float float_tolerance, float MSE_float_tolerance, float MSE_int_tolerance);
template <typename Type>
std::string PrintMatrix(const Type *mem, Index rows, Index cols) {
std::ostringstream out;
for (Index r = 0; r < rows; ++r) {
for (Index c = 0; c < cols; ++c) {
out << std::setw(4) << (int64_t) mem[r * cols + c] << ' ';
}
out << '\n';
}
return out.str();
}
/*
* References
*/
namespace references {
// Quantize
template <typename Type>
void Quantize(const float* input, Type* output, float quant_mult, Index size) {
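// Worked example: with quant_mult = 32 and int8_t output, 3.9f becomes
// roundf(124.8f) = 125, while 5.f scales to 160.f and saturates to 127.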
for (Index i = 0; i < size; ++i) {
float value = roundf(input[i] * quant_mult);
value = std::max<float>(std::numeric_limits<Type>::min(), value);
value = std::min<float>(std::numeric_limits<Type>::max(), value);
output[i] = value;
}
}
/*
* Multiply C = A x B
*
* Note: A and B have to be either both integral or both floating-point types.
*
* Callback takes two arguments:
* - Intermediate value of multiplication 1 row times 1 column - it's int32_t or double based on types A and B.
* - Object containing information about position in output matrix - callbacks::OutputBufferInfo.
*/
template <typename TypeA, typename TypeB, typename TypeC, typename LambdaCallback,
typename std::enable_if<
(std::is_integral<TypeA>::value && std::is_integral<TypeB>::value) ||
(std::is_floating_point<TypeA>::value && std::is_floating_point<TypeB>::value)
>::type* = nullptr>
void Multiply(const TypeA* A, const TypeB* B, TypeC* C, Index A_rows, Index width, Index B_cols, LambdaCallback callback) {
using IntermediateType = typename std::conditional<std::is_integral<TypeA>::value, int32_t, double>::type;
for (Index r = 0; r < A_rows; ++r) {
for (Index c = 0; c < B_cols; ++c) {
IntermediateType sum = 0;
for (Index k = 0; k < width; ++k) {
sum += IntermediateType(A[r * width + k]) * IntermediateType(B[k * B_cols + c]);
}
C[r * B_cols + c] = callback(sum, {r, c, A_rows, B_cols});
}
}
}
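// A minimal sketch (not part of this header) of driving the reference Multiply
// above: the callback receives the raw int32_t dot product plus its position in
// C and decides what to store. The function name, matrix shapes and
// unquant_mult are hypothetical.
inline void MultiplyExample(const int8_t* A, const int8_t* B, float* C,
                            Index A_rows, Index width, Index B_cols,
                            float unquant_mult) {
  Multiply(A, B, C, A_rows, width, B_cols,
           [=](int32_t sum, const callbacks::OutputBufferInfo&) {
             return sum * unquant_mult; // dequantize on the fly
           });
}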
// Matrix rearrangement
template <typename Type>
void Rearragement(const Type* input, Type* output, Index simd, Index unroll, Index rows, Index cols) {
for (Index c = 0; c < cols; c += unroll) {
for (Index r = 0; r < rows; r += simd) {
for (Index i = 0; i < unroll; ++i)
for (Index j = 0; j < simd; ++j)
output[simd * i + j] = input[cols * r + c + cols * j + i];
output += unroll * simd;
}
}
}
// Transpose
template <typename Type>
void Transpose(const Type* input, Type* output, Index rows, Index cols) {
for (Index r = 0; r < rows; ++r) {
for (Index c = 0; c < cols; ++c) {
output[rows * c + r] = input[cols * r + c];
}
}
}
} // namespace references
} // namespace intgemm

third_party/intgemm/test/utils_test.cc (vendored)

@ -1,45 +0,0 @@
#include "test.h"
#include "../intgemm/utils.h"
namespace intgemm {
namespace {
TEST_CASE("Factorial",) {
CHECK(factorial(0) == 1);
CHECK(factorial(1) == 1);
CHECK(factorial(2) == 2);
CHECK(factorial(3) == 6);
CHECK(factorial(4) == 24);
// Maximum result that fits in unsigned long long
CHECK(factorial(20) == 2432902008176640000);
}
TEST_CASE("Expi (negative)",) {
const double eps = 0.0000001;
CHECK_EPS(expi(-1), 0.3678794411714423, eps);
CHECK_EPS(expi(-2), 0.1353352832366127, eps);
CHECK_EPS(expi(-10), 0.0000453999297625, eps);
}
TEST_CASE("Expi (zero)",) {
const double eps = 0.0000001;
CHECK_EPS(expi(0), 1.0, eps);
}
TEST_CASE("Expi (positive)",) {
const double eps = 0.0000001;
CHECK_EPS(expi(1), 2.7182818284590452, eps);
CHECK_EPS(expi(2), 7.3890560989306502, eps);
CHECK_EPS(expi(10), 22026.4657948067165170, eps);
}
TEST_CASE("Round up",) {
CHECK(round_up(0, 5) == 0);
CHECK(round_up(1, 5) == 5);
CHECK(round_up(4, 5) == 5);
CHECK(round_up(6, 5) == 10);
}
}
}

third_party/xsimd/Changelog.rst (vendored, new file)

@ -0,0 +1,151 @@
.. Copyright (c) Serge Guelton and Johan Mabille
Copyright (c) QuantStack
Distributed under the terms of the BSD 3-Clause License.
The full license is in the file LICENSE, distributed with this software.
Changelog
=========
9.0.1
-----
* Fix potential ABI issue in SVE support, making ``xsimd::sve`` a type alias to
a size-dependent type.
9.0.0
-----
* Support fixed size SVE
* Fix a bug in SSSE3 ``xsimd::swizzle`` implementation for ``int8`` and ``int16``
* Rename ``xsimd::hadd`` into ``xsimd::reduce_add``, provide ``xsimd::reduce_min`` and ``xsimd::reduce_max``
* Properly report unsupported double for neon on arm32
* Fill holes in xsimd scalar api
* Fix ``find_package(xsimd)`` for xtl enabled xsimd
* Replace ``xsimd::bool_cast`` by ``xsimd::batch_bool_cast``
* Native ``xsimd::hadd`` for float on arm64
* Properly static_assert when trying to instantiate an ``xsimd::batch`` of xtl complex
* Introduce ``xsimd::batch_bool::mask()`` and ``batch_bool::from_mask(...)``
* Flag some function with ``[[nodiscard]]``
* Accept both relative and absolute libdir and include dir in xsimd.pc
* Implement ``xsimd::nearbyint_as_int`` for NEON
* Add ``xsimd::polar``
* Speedup double -> F32/I32 gathers
* Add ``xsimd::slide_left`` and ``xsimd::slide_right``
* Support integral ``xsimd::swizzle`` on AVX
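
A small usage sketch of the reduction entries above (illustrative only; the
values are arbitrary):

.. code-block:: cpp

    xsimd::batch<float> v(2.f);          // broadcast 2 into every lane
    float sum = xsimd::reduce_add(v);    // 2 * v.size
    float lo = xsimd::reduce_min(v);     // 2
    float hi = xsimd::reduce_max(v);     // 2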
8.1.0
-----
* Add ``xsimd::gather`` and ``xsimd::scatter``
* Add ``xsimd::nearbyint_as_int``
* Add ``xsimd::none``
* Add ``xsimd::reciprocal``
* Remove batch constructor from memory address, use ``xsimd::batch<...>::load_(un)aligned`` instead
* Let MSVC users manually disable FMA3 on AVX
* Provide ``xsimd::insert`` to modify a single value from a vector
* Make ``xsimd::pow`` implementation resilient to ``FE_INVALID``
* Reciprocal square root support through ``xsimd::rsqrt``
* NEON: Improve ``xsimd::any`` and ``xsimd::all``
* Provide type utility to explicitly require a batch of given size and type
* Implement ``xsimd::swizzle`` on x86, neon and neon64
* AVX support for ``xsimd::zip_lo`` and ``xsimd::zip_hi``
* Only use ``_mm256_unpacklo_epi<N>`` on AVX2
* Provide neon/neon64 conversion function from ``uint(32|64)_t`` to ``(float|double)``
* Provide SSE/AVX/AVX2 conversion function from ``uint32_t`` to ``float``
* Provide AVX2 conversion function from ``(u)int64_t`` to ``double``
* Provide better SSE conversion function from ``uint64_t`` to ``double``
* Provide better SSE conversion function to ``double``
* Support logical xor for ``xsimd::batch_bool``
* Clarify fma support:
- FMA3 + SSE -> ``xsimd::fma3<sse4_2>``
- FMA3 + AVX -> ``xsimd::fma3<avx>``
- FMA3 + AVX2 -> ``xsimd::fma3<avx2>``
- FMA4 -> ``xsimd::fma4``
* Allow ``xsimd::transform`` to work with complex types
* Add missing scalar version of ``xsimd::norm`` and ``xsimd::conj``
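
A sketch of the FMA architecture tags listed above (illustrative only; assumes
the translation unit is built with AVX2 and FMA3 enabled):

.. code-block:: cpp

    using fma_avx2 = xsimd::fma3<xsimd::avx2>;
    xsimd::batch<float, fma_avx2> a(1.f), b(2.f), c(3.f);
    auto r = xsimd::fma(a, b, c);        // 1 * 2 + 3 = 5 in every lane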
8.0.5
-----
* Fix neon ``xsimd::hadd`` implementation
* Detect unsupported architectures and set ``XSIMD_NO_SUPPORTED_ARCHITECTURE``
if need be
8.0.4
-----
* Provide some conversion operators for ``float`` -> ``uint32``
* Improve code generated for AVX2 signed integer comparisons
* Enable detection of avx512cd and avx512dq, and fix avx512bw detection
* Enable detection of AVX2+FMA
* Pick the best compatible architecture in ``xsimd::dispatch``
* Enables support for FMA when AVX2 is detected on Windows
* Add missing includes / forward declaration
* Mark all functions inline and noexcept
* Assert when using incomplete ``std::initializer_list``
8.0.3
-----
* Improve CI & testing, no functional change
8.0.2
-----
* Do not use ``_mm256_srai_epi32`` under AVX, it's an AVX2 instruction
8.0.1
-----
* Fix invalid constexpr ``std::make_tuple`` usage in neon64


@ -34,8 +34,8 @@ namespace xsimd
inline batch<T_out, A> batch_cast(batch<T_in, A> const&, batch<T_out, A> const& out) noexcept;
template <class T, class A>
inline batch<T, A> bitofsign(batch<T, A> const& self) noexcept;
template <class B, class T, class A>
inline B bitwise_cast(batch<T, A> const& self) noexcept;
template <class T_out, class T_in, class A>
inline batch<T_out, A> bitwise_cast(batch<T_in, A> const& self) noexcept;
template <class T, class A>
inline batch<T, A> cos(batch<T, A> const& self) noexcept;
template <class T, class A>


@ -909,7 +909,7 @@ namespace xsimd
e = fms(x, e, hxs);
using i_type = as_integer_t<batch_type>;
i_type ik = to_int(k);
batch_type two2mk = ::xsimd::bitwise_cast<batch_type>((constants::maxexponent<batch_type>() - ik) << constants::nmb<batch_type>());
batch_type two2mk = ::xsimd::bitwise_cast<float>((constants::maxexponent<batch_type>() - ik) << constants::nmb<batch_type>());
batch_type y = batch_type(1.) - two2mk - (e - x);
return ldexp(y, ik);
}
@ -936,7 +936,7 @@ namespace xsimd
e = (x * (e - c) - c) - hxs;
using i_type = as_integer_t<batch_type>;
i_type ik = to_int(k);
batch_type two2mk = ::xsimd::bitwise_cast<batch_type>((constants::maxexponent<batch_type>() - ik) << constants::nmb<batch_type>());
batch_type two2mk = ::xsimd::bitwise_cast<double>((constants::maxexponent<batch_type>() - ik) << constants::nmb<batch_type>());
batch_type ct1 = batch_type(1.) - two2mk - (e - x);
batch_type ct2 = ++(x - (e + two2mk));
batch_type y = select(k < batch_type(20.), ct1, ct2);
@ -1004,13 +1004,14 @@ namespace xsimd
inline batch<T, A> frexp(const batch<T, A>& self, batch<as_integer_t<T>, A>& exp, requires_arch<generic>) noexcept
{
using batch_type = batch<T, A>;
using i_type = batch<as_integer_t<T>, A>;
using int_type = as_integer_t<T>;
using i_type = batch<int_type, A>;
i_type m1f = constants::mask1frexp<batch_type>();
i_type r1 = m1f & ::xsimd::bitwise_cast<i_type>(self);
batch_type x = self & ::xsimd::bitwise_cast<batch_type>(~m1f);
i_type r1 = m1f & ::xsimd::bitwise_cast<int_type>(self);
batch_type x = self & ::xsimd::bitwise_cast<T>(~m1f);
exp = (r1 >> constants::nmb<batch_type>()) - constants::maxexponentm1<batch_type>();
exp = select(batch_bool_cast<typename i_type::value_type>(self != batch_type(0.)), exp, i_type(typename i_type::value_type(0)));
return select((self != batch_type(0.)), x | ::xsimd::bitwise_cast<batch_type>(constants::mask2frexp<batch_type>()), batch_type(0.));
return select((self != batch_type(0.)), x | ::xsimd::bitwise_cast<T>(constants::mask2frexp<batch_type>()), batch_type(0.));
}
// from bool
@ -1058,7 +1059,7 @@ namespace xsimd
using itype = as_integer_t<batch_type>;
itype ik = other + constants::maxexponent<T>();
ik = ik << constants::nmb<T>();
return self * ::xsimd::bitwise_cast<batch_type>(ik);
return self * ::xsimd::bitwise_cast<T>(ik);
}
// lgamma
@ -1383,7 +1384,8 @@ namespace xsimd
inline batch<float, A> log(batch<float, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<float, A>;
using i_type = as_integer_t<batch_type>;
using int_type = as_integer_t<float>;
using i_type = batch<int_type, A>;
batch_type x = self;
i_type k(0);
auto isnez = (self != batch_type(0.));
@ -1391,15 +1393,15 @@ namespace xsimd
auto test = (self < constants::smallestposval<batch_type>()) && isnez;
if (any(test))
{
k = select(batch_bool_cast<typename i_type::value_type>(test), k - i_type(23), k);
k = select(batch_bool_cast<int_type>(test), k - i_type(23), k);
x = select(test, x * batch_type(8388608ul), x);
}
#endif
i_type ix = ::xsimd::bitwise_cast<i_type>(x);
i_type ix = ::xsimd::bitwise_cast<int_type>(x);
ix += 0x3f800000 - 0x3f3504f3;
k += (ix >> 23) - 0x7f;
ix = (ix & i_type(0x007fffff)) + 0x3f3504f3;
x = ::xsimd::bitwise_cast<batch_type>(ix);
x = ::xsimd::bitwise_cast<float>(ix);
batch_type f = --x;
batch_type s = f / (batch_type(2.) + f);
batch_type z = s * s;
@ -1422,17 +1424,18 @@ namespace xsimd
inline batch<double, A> log(batch<double, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<double, A>;
using i_type = as_integer_t<batch_type>;
using int_type = as_integer_t<double>;
using i_type = batch<int_type, A>;
batch_type x = self;
i_type hx = ::xsimd::bitwise_cast<i_type>(x) >> 32;
i_type hx = ::xsimd::bitwise_cast<int_type>(x) >> 32;
i_type k(0);
auto isnez = (self != batch_type(0.));
#ifndef XSIMD_NO_DENORMALS
auto test = (self < constants::smallestposval<batch_type>()) && isnez;
if (any(test))
{
k = select(batch_bool_cast<typename i_type::value_type>(test), k - i_type(54), k);
k = select(batch_bool_cast<int_type>(test), k - i_type(54), k);
x = select(test, x * batch_type(18014398509481984ull), x);
}
#endif
@ -1440,7 +1443,7 @@ namespace xsimd
k += (hx >> 20) - 0x3ff;
batch_type dk = to_float(k);
hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e;
x = ::xsimd::bitwise_cast<batch_type>(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast<i_type>(x)));
x = ::xsimd::bitwise_cast<double>(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast<int_type>(x)));
batch_type f = --x;
batch_type hfsq = batch_type(0.5) * f * f;
@ -1471,7 +1474,8 @@ namespace xsimd
inline batch<float, A> log2(batch<float, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<float, A>;
using i_type = as_integer_t<batch_type>;
using int_type = as_integer_t<float>;
using i_type = batch<int_type, A>;
batch_type x = self;
i_type k(0);
auto isnez = (self != batch_type(0.));
@ -1479,15 +1483,15 @@ namespace xsimd
auto test = (self < constants::smallestposval<batch_type>()) && isnez;
if (any(test))
{
k = select(batch_bool_cast<typename i_type::value_type>(test), k - i_type(25), k);
k = select(batch_bool_cast<int_type>(test), k - i_type(25), k);
x = select(test, x * batch_type(33554432ul), x);
}
#endif
i_type ix = ::xsimd::bitwise_cast<i_type>(x);
i_type ix = ::xsimd::bitwise_cast<int_type>(x);
ix += 0x3f800000 - 0x3f3504f3;
k += (ix >> 23) - 0x7f;
ix = (ix & i_type(0x007fffff)) + 0x3f3504f3;
x = ::xsimd::bitwise_cast<batch_type>(ix);
x = ::xsimd::bitwise_cast<float>(ix);
batch_type f = --x;
batch_type s = f / (batch_type(2.) + f);
batch_type z = s * s;
@ -1510,9 +1514,10 @@ namespace xsimd
inline batch<double, A> log2(batch<double, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<double, A>;
using i_type = as_integer_t<batch_type>;
using int_type = as_integer_t<double>;
using i_type = batch<int_type, A>;
batch_type x = self;
i_type hx = ::xsimd::bitwise_cast<i_type>(x) >> 32;
i_type hx = ::xsimd::bitwise_cast<int_type>(x) >> 32;
i_type k(0);
auto isnez = (self != batch_type(0.));
#ifndef XSIMD_NO_DENORMALS
@ -1526,7 +1531,7 @@ namespace xsimd
hx += 0x3ff00000 - 0x3fe6a09e;
k += (hx >> 20) - 0x3ff;
hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e;
x = ::xsimd::bitwise_cast<batch_type>(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast<i_type>(x)));
x = ::xsimd::bitwise_cast<double>(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast<int_type>(x)));
batch_type f = --x;
batch_type s = f / (batch_type(2.) + f);
batch_type z = s * s;
@ -1536,7 +1541,7 @@ namespace xsimd
batch_type R = t2 + t1;
batch_type hfsq = batch_type(0.5) * f * f;
batch_type hi = f - hfsq;
hi = hi & ::xsimd::bitwise_cast<batch_type>((constants::allbits<i_type>() << 32));
hi = hi & ::xsimd::bitwise_cast<double>((constants::allbits<i_type>() << 32));
batch_type lo = fma(s, hfsq + R, f - hi - hfsq);
batch_type val_hi = hi * constants::invlog_2hi<batch_type>();
batch_type val_lo = fma(lo + hi, constants::invlog_2lo<batch_type>(), lo * constants::invlog_2hi<batch_type>());
@ -1591,7 +1596,8 @@ namespace xsimd
ivln10lo(-3.1689971365e-05f),
log10_2hi(3.0102920532e-01f),
log10_2lo(7.9034151668e-07f);
using i_type = as_integer_t<batch_type>;
using int_type = as_integer_t<float>;
using i_type = batch<int_type, A>;
batch_type x = self;
i_type k(0);
auto isnez = (self != batch_type(0.));
@ -1599,15 +1605,15 @@ namespace xsimd
auto test = (self < constants::smallestposval<batch_type>()) && isnez;
if (any(test))
{
k = select(batch_bool_cast<typename i_type::value_type>(test), k - i_type(25), k);
k = select(batch_bool_cast<int_type>(test), k - i_type(25), k);
x = select(test, x * batch_type(33554432ul), x);
}
#endif
i_type ix = ::xsimd::bitwise_cast<i_type>(x);
i_type ix = ::xsimd::bitwise_cast<int_type>(x);
ix += 0x3f800000 - 0x3f3504f3;
k += (ix >> 23) - 0x7f;
ix = (ix & i_type(0x007fffff)) + 0x3f3504f3;
x = ::xsimd::bitwise_cast<batch_type>(ix);
x = ::xsimd::bitwise_cast<float>(ix);
batch_type f = --x;
batch_type s = f / (batch_type(2.) + f);
batch_type z = s * s;
@ -1618,7 +1624,7 @@ namespace xsimd
batch_type dk = to_float(k);
batch_type hfsq = batch_type(0.5) * f * f;
batch_type hibits = f - hfsq;
hibits &= ::xsimd::bitwise_cast<batch_type>(i_type(0xfffff000));
hibits &= ::xsimd::bitwise_cast<float>(i_type(0xfffff000));
batch_type lobits = fma(s, hfsq + R, f - hibits - hfsq);
batch_type r = fma(dk, log10_2hi,
fma(hibits, ivln10hi,
@ -1641,23 +1647,24 @@ namespace xsimd
ivln10lo(2.50829467116452752298e-11),
log10_2hi(3.01029995663611771306e-01),
log10_2lo(3.69423907715893078616e-13);
using i_type = as_integer_t<batch_type>;
using int_type = as_integer_t<double>;
using i_type = batch<int_type, A>;
batch_type x = self;
i_type hx = ::xsimd::bitwise_cast<i_type>(x) >> 32;
i_type hx = ::xsimd::bitwise_cast<int_type>(x) >> 32;
i_type k(0);
auto isnez = (self != batch_type(0.));
#ifndef XSIMD_NO_DENORMALS
auto test = (self < constants::smallestposval<batch_type>()) && isnez;
if (any(test))
{
k = select(batch_bool_cast<typename i_type::value_type>(test), k - i_type(54), k);
k = select(batch_bool_cast<int_type>(test), k - i_type(54), k);
x = select(test, x * batch_type(18014398509481984ull), x);
}
#endif
hx += 0x3ff00000 - 0x3fe6a09e;
k += (hx >> 20) - 0x3ff;
hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e;
x = ::xsimd::bitwise_cast<batch_type>(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast<i_type>(x)));
x = ::xsimd::bitwise_cast<double>(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast<int_type>(x)));
batch_type f = --x;
batch_type dk = to_float(k);
batch_type s = f / (batch_type(2.) + f);
@ -1668,7 +1675,7 @@ namespace xsimd
batch_type R = t2 + t1;
batch_type hfsq = batch_type(0.5) * f * f;
batch_type hi = f - hfsq;
hi = hi & ::xsimd::bitwise_cast<batch_type>(constants::allbits<i_type>() << 32);
hi = hi & ::xsimd::bitwise_cast<double>(constants::allbits<i_type>() << 32);
batch_type lo = f - hi - hfsq + s * (hfsq + R);
batch_type val_hi = hi * ivln10hi;
batch_type y = dk * log10_2hi;
@ -1705,14 +1712,15 @@ namespace xsimd
inline batch<float, A> log1p(batch<float, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<float, A>;
using i_type = as_integer_t<batch_type>;
using int_type = as_integer_t<float>;
using i_type = batch<int_type, A>;
const batch_type uf = self + batch_type(1.);
auto isnez = (uf != batch_type(0.));
i_type iu = ::xsimd::bitwise_cast<i_type>(uf);
i_type iu = ::xsimd::bitwise_cast<int_type>(uf);
iu += 0x3f800000 - 0x3f3504f3;
i_type k = (iu >> 23) - 0x7f;
iu = (iu & i_type(0x007fffff)) + 0x3f3504f3;
batch_type f = --(::xsimd::bitwise_cast<batch_type>(iu));
batch_type f = --(::xsimd::bitwise_cast<float>(iu));
batch_type s = f / (batch_type(2.) + f);
batch_type z = s * s;
batch_type w = z * z;
@ -1736,16 +1744,17 @@ namespace xsimd
inline batch<double, A> log1p(batch<double, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<double, A>;
using i_type = as_integer_t<batch_type>;
using int_type = as_integer_t<double>;
using i_type = batch<int_type, A>;
const batch_type uf = self + batch_type(1.);
auto isnez = (uf != batch_type(0.));
i_type hu = ::xsimd::bitwise_cast<i_type>(uf) >> 32;
i_type hu = ::xsimd::bitwise_cast<int_type>(uf) >> 32;
hu += 0x3ff00000 - 0x3fe6a09e;
i_type k = (hu >> 20) - 0x3ff;
/* correction term ~ log(1+x)-log(u), avoid underflow in c/u */
batch_type c = select(batch_bool_cast<double>(k >= i_type(2)), batch_type(1.) - (uf - self), self - (uf - batch_type(1.))) / uf;
hu = (hu & i_type(0x000fffff)) + 0x3fe6a09e;
batch_type f = ::xsimd::bitwise_cast<batch_type>((hu << 32) | (i_type(0xffffffff) & ::xsimd::bitwise_cast<i_type>(uf)));
batch_type f = ::xsimd::bitwise_cast<double>((hu << 32) | (i_type(0xffffffff) & ::xsimd::bitwise_cast<int_type>(uf)));
f = --f;
batch_type hfsq = batch_type(0.5) * f * f;
batch_type s = f / (batch_type(2.) + f);
@ -1897,13 +1906,13 @@ namespace xsimd
static inline batch_type next(const batch_type& b) noexcept
{
batch_type n = ::xsimd::bitwise_cast<batch_type>(::xsimd::bitwise_cast<int_batch>(b) + int_type(1));
batch_type n = ::xsimd::bitwise_cast<T>(::xsimd::bitwise_cast<int_type>(b) + int_type(1));
return select(b == constants::infinity<batch_type>(), b, n);
}
static inline batch_type prev(const batch_type& b) noexcept
{
batch_type p = ::xsimd::bitwise_cast<batch_type>(::xsimd::bitwise_cast<int_batch>(b) - int_type(1));
batch_type p = ::xsimd::bitwise_cast<T>(::xsimd::bitwise_cast<int_type>(b) - int_type(1));
return select(b == constants::minusinfinity<batch_type>(), b, p);
}
};


@ -159,7 +159,7 @@ namespace xsimd
template <class A, class T_out, class T_in>
inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<avx>) noexcept
{
return { bitwise_cast<batch<T_out, A>>(batch<T_in, A>(self.data)).data };
return { bitwise_cast<T_out>(batch<T_in, A>(self.data)).data };
}
// bitwise_and
@ -1493,8 +1493,8 @@ namespace xsimd
V7> const& mask,
requires_arch<avx>) noexcept
{
return bitwise_cast<batch<T, A>>(
swizzle(bitwise_cast<batch<float, A>>(self), mask));
return bitwise_cast<T>(
swizzle(bitwise_cast<float>(self), mask));
}
template <class A,
@ -1509,8 +1509,8 @@ namespace xsimd
batch_constant<batch<uint64_t, A>, V0, V1, V2, V3> const& mask,
requires_arch<avx>) noexcept
{
return bitwise_cast<batch<T, A>>(
swizzle(bitwise_cast<batch<double, A>>(self), mask));
return bitwise_cast<T>(
swizzle(bitwise_cast<double>(self), mask));
}
// trunc


@ -574,7 +574,17 @@ namespace xsimd
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
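// AVX2 has no 8-bit multiply: the two 16-bit multiplies below leave the
// correct low byte of each 16-bit lane in res_lo and the correct high byte in
// res_hi; the byte blend on mask_hi merges the two halves.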
__m256i mask_hi = _mm256_set1_epi32(0xFF00FF00);
__m256i res_lo = _mm256_mullo_epi16(self, other);
__m256i other_hi = _mm256_srli_epi16(other, 8);
__m256i self_hi = _mm256_and_si256(self, mask_hi);
__m256i res_hi = _mm256_mullo_epi16(self_hi, other_hi);
__m256i res = _mm256_blendv_epi8(res_lo, res_hi, mask_hi);
return res;
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm256_mullo_epi16(self, other);
}
@ -852,7 +862,7 @@ namespace xsimd
template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3> mask, requires_arch<avx2>) noexcept
{
return bitwise_cast<batch<int64_t, A>>(swizzle(bitwise_cast<batch<uint64_t, A>>(self), mask, avx2 {}));
return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, avx2 {}));
}
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
@ -862,7 +872,7 @@ namespace xsimd
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
{
return bitwise_cast<batch<int32_t, A>>(swizzle(bitwise_cast<batch<uint32_t, A>>(self), mask, avx2 {}));
return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, avx2 {}));
}
// zip_hi


@ -551,7 +551,7 @@ namespace xsimd
template <class A, uint16_t... Vs>
inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<batch<uint16_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
{
return bitwise_cast<batch<int16_t, A>>(swizzle(bitwise_cast<batch<uint16_t, A>>(self), mask, avx512bw {}));
return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, avx512bw {}));
}
template <class A, uint8_t... Vs>
@ -563,7 +563,7 @@ namespace xsimd
template <class A, uint8_t... Vs>
inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<batch<uint8_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
{
return bitwise_cast<batch<int8_t, A>>(swizzle(bitwise_cast<batch<uint8_t, A>>(self), mask, avx512bw {}));
return bitwise_cast<int8_t>(swizzle(bitwise_cast<uint8_t>(self), mask, avx512bw {}));
}
// zip_hi


@ -768,6 +768,45 @@ namespace xsimd
return _mm512_roundscale_pd(self, _MM_FROUND_TO_NEG_INF);
}
// fnma
template <class A>
inline batch<float, A> fnma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<avx512f>) noexcept
{
return _mm512_fnmadd_ps(x, y, z);
}
template <class A>
inline batch<double, A> fnma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<avx512f>) noexcept
{
return _mm512_fnmadd_pd(x, y, z);
}
// fma
template <class A>
inline batch<float, A> fma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<avx512f>) noexcept
{
return _mm512_fmadd_ps(x, y, z);
}
template <class A>
inline batch<double, A> fma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<avx512f>) noexcept
{
return _mm512_fmadd_pd(x, y, z);
}
// fms
template <class A>
inline batch<float, A> fms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<avx512f>) noexcept
{
return _mm512_fmsub_ps(x, y, z);
}
template <class A>
inline batch<double, A> fms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<avx512f>) noexcept
{
return _mm512_fmsub_pd(x, y, z);
}
// from bool
template <class A, class T>
inline batch<T, A> from_bool(batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
@ -1763,7 +1802,7 @@ namespace xsimd
template <class A, uint64_t... Vs>
inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<batch<uint64_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept
{
return bitwise_cast<batch<int64_t, A>>(swizzle(bitwise_cast<batch<uint64_t, A>>(self), mask, avx512f {}));
return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, avx512f {}));
}
template <class A, uint32_t... Vs>
@ -1775,7 +1814,7 @@ namespace xsimd
template <class A, uint32_t... Vs>
inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<batch<uint32_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept
{
return bitwise_cast<batch<int32_t, A>>(swizzle(bitwise_cast<batch<uint32_t, A>>(self), mask, avx512f {}));
return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, avx512f {}));
}
namespace detail
@ -1833,7 +1872,7 @@ namespace xsimd
inline batch<int16_t, A>
swizzle(batch<int16_t, A> const& self, batch_constant<batch<uint16_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept
{
return bitwise_cast<batch<int16_t, A>>(swizzle(bitwise_cast<batch<uint16_t, A>>(self), mask, avx512f {}));
return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, avx512f {}));
}
// trunc


@ -452,59 +452,114 @@ namespace xsimd
* load *
********/
// It is not possible to use a call to A::alignment() here, so use an
// immediate instead.
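// (16 bytes for __builtin_assume_aligned on GCC/Clang; the 128 passed to the
// MSVC *_ex load intrinsics is the same alignment hint expressed in bits.)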
#if defined(__clang__) || defined(__GNUC__)
#define xsimd_aligned_load(inst, type, expr) inst((type)__builtin_assume_aligned(expr, 16))
#elif defined(_MSC_VER)
#define xsimd_aligned_load(inst, type, expr) inst##_ex((type)expr, 128)
#else
#define xsimd_aligned_load(inst, type, expr) inst((type)expr)
#endif
template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return vld1q_u8((uint8_t*)src);
return xsimd_aligned_load(vld1q_u8, uint8_t*, src);
}
template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return vld1q_s8((int8_t*)src);
return xsimd_aligned_load(vld1q_s8, int8_t*, src);
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return vld1q_u16((uint16_t*)src);
return xsimd_aligned_load(vld1q_u16, uint16_t*, src);
}
template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return vld1q_s16((int16_t*)src);
return xsimd_aligned_load(vld1q_s16, int16_t*, src);
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return vld1q_u32((uint32_t*)src);
return xsimd_aligned_load(vld1q_u32, uint32_t*, src);
}
template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return vld1q_s32((int32_t*)src);
return xsimd_aligned_load(vld1q_s32, int32_t*, src);
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return vld1q_u64((uint64_t*)src);
return xsimd_aligned_load(vld1q_u64, uint64_t*, src);
}
template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return vld1q_s64((int64_t*)src);
return xsimd_aligned_load(vld1q_s64, int64_t*, src);
}
template <class A>
inline batch<float, A> load_aligned(float const* src, convert<float>, requires_arch<neon>) noexcept
{
return vld1q_f32(src);
return xsimd_aligned_load(vld1q_f32, float*, src);
}
template <class A, class T>
#undef xsimd_aligned_load
template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return load_aligned<A>(src, convert<T>(), A {});
return vld1q_u8((uint8_t*)src);
}
template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return vld1q_s8((int8_t*)src);
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return vld1q_u16((uint16_t*)src);
}
template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return vld1q_s16((int16_t*)src);
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return vld1q_u32((uint32_t*)src);
}
template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return vld1q_s32((int32_t*)src);
}
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return vld1q_u64((uint64_t*)src);
}
template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
{
return vld1q_s64((int64_t*)src);
}
template <class A>
inline batch<float, A> load_unaligned(float const* src, convert<float>, requires_arch<neon>) noexcept
{
return vld1q_f32(src);
}
/*********
@ -2526,9 +2581,9 @@ namespace xsimd
inline batch<T, A> operator()(batch<T, A> const& x, requires_arch<neon>) noexcept
{
const auto left = vdupq_n_u8(0);
const auto right = bitwise_cast<batch<uint8_t, A>>(x).data;
const auto right = bitwise_cast<uint8_t>(x).data;
const batch<uint8_t, A> res(vextq_u8(left, right, 16 - N));
return bitwise_cast<batch<T, A>>(res);
return bitwise_cast<T>(res);
}
};
@ -2558,10 +2613,10 @@ namespace xsimd
template <class A, class T>
inline batch<T, A> operator()(batch<T, A> const& x, requires_arch<neon>) noexcept
{
const auto left = bitwise_cast<batch<uint8_t, A>>(x).data;
const auto left = bitwise_cast<uint8_t>(x).data;
const auto right = vdupq_n_u8(0);
const batch<uint8_t, A> res(vextq_u8(left, right, N));
return bitwise_cast<batch<T, A>>(res);
return bitwise_cast<T>(res);
}
};


@ -133,18 +133,26 @@ namespace xsimd
/********
* load *
********/
#if defined(__clang__) || defined(__GNUC__)
#define xsimd_aligned_load(inst, type, expr) inst((type)__builtin_assume_aligned(expr, 16))
#elif defined(_MSC_VER)
#define xsimd_aligned_load(inst, type, expr) inst##_ex((type)expr, 128)
#else
#define xsimd_aligned_load(inst, type, expr) inst((type)expr)
#endif
template <class A>
inline batch<double, A> load_aligned(double const* src, convert<double>, requires_arch<neon64>) noexcept
{
return vld1q_f64(src);
return xsimd_aligned_load(vld1q_f64, double*, src);
}
template <class A>
inline batch<double, A> load_unaligned(double const* src, convert<double>, requires_arch<neon64>) noexcept
{
return load_aligned<A>(src, convert<double>(), A {});
return vld1q_f64(src);
}
#undef xsimd_aligned_load
/*********
* store *


@ -441,7 +441,16 @@ namespace xsimd
return !(x0 == x1);
}
#if defined(_GNU_SOURCE) && !defined(__APPLE__) && !defined(__MINGW32__) && !defined(__ANDROID__)
#if defined(__APPLE__)
inline float exp10(const float& x) noexcept
{
return __exp10f(x);
}
inline double exp10(const double& x) noexcept
{
return __exp10(x);
}
#elif defined(__GLIBC__)
inline float exp10(const float& x) noexcept
{
return ::exp10f(x);
@ -450,14 +459,24 @@ namespace xsimd
{
return ::exp10(x);
}
#endif
#elif defined(_WIN32)
template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
inline T exp10(const T& x) noexcept
{
// FIXME: very inefficient
// Very inefficient but other implementations give incorrect results
// on Windows
return std::pow(T(10), x);
}
#else
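// Generic fallback: exp10(x) = exp(x * ln 10); the hexadecimal literals below
// are ln(10) rounded to float and double precision.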
inline float exp10(const float& x) noexcept
{
return std::exp(0x1.26bb1cp+1f * x);
}
inline double exp10(const double& x) noexcept
{
return std::exp(0x1.26bb1bbb55516p+1 * x);
}
#endif
template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
inline auto rsqrt(const T& x) noexcept -> decltype(std::sqrt(x))


@ -23,8 +23,8 @@ namespace xsimd
template <class batch_type, bool... Values>
struct batch_bool_constant;
template <class B, class T, class A>
inline B bitwise_cast(batch<T, A> const& x) noexcept;
template <class T_out, class T_in, class A>
inline batch<T_out, A> bitwise_cast(batch<T_in, A> const& x) noexcept;
template <class batch_type, typename batch_type::value_type... Values>
struct batch_constant;
@ -140,7 +140,7 @@ namespace xsimd
template <class A, class T_out, class T_in>
inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<sse2>) noexcept
{
return { bitwise_cast<batch<T_out, A>>(batch<T_in, A>(self.data)).data };
return { bitwise_cast<T_out>(batch<T_in, A>(self.data)).data };
}
// bitwise_and
@ -1185,7 +1185,7 @@ namespace xsimd
batch<T, A> acc2 = max(acc1, step2);
if (sizeof(T) == 2)
return acc2.get(0);
batch<T, A> step3 = bitwise_cast<batch<T, A>>(bitwise_cast<batch<uint16_t, A>>(acc2) >> 8);
batch<T, A> step3 = bitwise_cast<T>(bitwise_cast<uint16_t>(acc2) >> 8);
batch<T, A> acc3 = max(acc2, step3);
return acc3.get(0);
}
@ -1207,7 +1207,7 @@ namespace xsimd
batch<T, A> acc2 = min(acc1, step2);
if (sizeof(T) == 2)
return acc2.get(0);
batch<T, A> step3 = bitwise_cast<batch<T, A>>(bitwise_cast<batch<uint16_t, A>>(acc2) >> 8);
batch<T, A> step3 = bitwise_cast<T>(bitwise_cast<uint16_t>(acc2) >> 8);
batch<T, A> acc3 = min(acc2, step3);
return acc3.get(0);
}
@ -1600,7 +1600,7 @@ namespace xsimd
template <class A, uint64_t V0, uint64_t V1>
inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1> mask, requires_arch<sse2>) noexcept
{
return bitwise_cast<batch<int64_t, A>>(swizzle(bitwise_cast<batch<uint64_t, A>>(self), mask, sse2 {}));
return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, sse2 {}));
}
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
@ -1613,7 +1613,7 @@ namespace xsimd
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> mask, requires_arch<sse2>) noexcept
{
return bitwise_cast<batch<int32_t, A>>(swizzle(bitwise_cast<batch<uint32_t, A>>(self), mask, sse2 {}));
return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, sse2 {}));
}
// zip_hi


@ -118,7 +118,7 @@ namespace xsimd
template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<ssse3>) noexcept
{
return bitwise_cast<batch<int16_t, A>>(swizzle(bitwise_cast<batch<uint16_t, A>>(self), mask, ssse3 {}));
return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, ssse3 {}));
}
template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
@ -132,7 +132,7 @@ namespace xsimd
uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<ssse3>) noexcept
{
return bitwise_cast<batch<int8_t, A>>(swizzle(bitwise_cast<batch<uint8_t, A>>(self), mask, ssse3 {}));
return bitwise_cast<int8_t>(swizzle(bitwise_cast<uint8_t>(self), mask, ssse3 {}));
}
}


@ -289,45 +289,54 @@
#ifdef _MSC_VER
#if XSIMD_WITH_AVX512
#undef XSIMD_WITH_AVX2
#define XSIMD_WITH_AVX2 1
#endif
#if XSIMD_WITH_AVX2
#undef XSIMD_WITH_AVX
#define XSIMD_WITH_AVX 1
#undef XSIMD_WITH_FMA3_AVX
#define XSIMD_WITH_FMA3_AVX 1
#undef XSIMD_WITH_FMA3_AVX2
#define XSIMD_WITH_FMA3_AVX2 1
#endif
#if XSIMD_WITH_AVX
#undef XSIMD_WITH_SSE4_2
#define XSIMD_WITH_SSE4_2 1
#endif
#if !defined(__clang__) && (defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2))
#undef XSIMD_WITH_SSE4_2
#define XSIMD_WITH_SSE4_2 1
#endif
#if XSIMD_WITH_SSE4_2
#undef XSIMD_WITH_SSE4_1
#define XSIMD_WITH_SSE4_1 1
#endif
#if XSIMD_WITH_SSE4_1
#undef XSIMD_WITH_SSSE3
#define XSIMD_WITH_SSSE3 1
#endif
#if XSIMD_WITH_SSSE3
#undef XSIMD_WITH_SSE3
#define XSIMD_WITH_SSE3 1
#endif
#if XSIMD_WITH_SSE3 || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
#if XSIMD_WITH_SSE3 || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
#undef XSIMD_WITH_SSE2
#define XSIMD_WITH_SSE2 1
#endif


@ -71,6 +71,21 @@ namespace xsimd
template <class C>
using container_alignment_t = typename container_alignment<C>::type;
/*********************
* alignment checker *
*********************/
/**
* Checks whether pointer \c ptr is aligned according to the alignment
* requirements of \c Arch.
* @return true if the alignment requirements are met
*/
template <class Arch = default_arch>
inline bool is_aligned(void const* ptr)
{
return (reinterpret_cast<uintptr_t>(ptr) % static_cast<uintptr_t>(Arch::alignment())) == 0;
}
}
#endif
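// Usage sketch for the is_aligned helper added above (illustrative only, not
// part of the header): with no explicit Arch, the default architecture's
// alignment requirement is checked.
inline bool is_aligned_example()
{
    alignas(xsimd::default_arch::alignment()) float buf[xsimd::batch<float>::size] = {};
    return xsimd::is_aligned(buf); // true: buf is aligned for the default arch
}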


@ -314,11 +314,12 @@ namespace xsimd
* @param x batch of \c T_in
* @return \c x reinterpreted as \c T_out
*/
template <class B, class T, class A>
inline B bitwise_cast(batch<T, A> const& x) noexcept
template <class T_out, class T_in, class A>
inline batch<T_out, A> bitwise_cast(batch<T_in, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::bitwise_cast<A>(x, B {}, A {});
detail::static_check_supported_config<T_in, A>();
detail::static_check_supported_config<T_out, A>();
return kernel::bitwise_cast<A>(x, batch<T_out, A> {}, A {});
}
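// Call-site sketch of the new spelling (illustrative only): the destination is
// now named by its scalar type rather than by the full batch type, e.g.
// bitwise_cast<int32_t>(f) instead of bitwise_cast<batch<int32_t, A>>(f).
inline batch<int32_t> bitwise_cast_example(batch<float> const& f) noexcept
{
    return bitwise_cast<int32_t>(f); // reinterpret the bits, no value conversion
}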
/**
@ -886,10 +887,10 @@ namespace xsimd
* @return the result of the reduction, as a scalar.
*/
template <class T, class A, class F>
inline T reduce(F&& r, batch<T, A> const& x) noexcept
inline T reduce(F&& f, batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::detail::reduce(std::forward<F>(r), x, std::integral_constant<unsigned, batch<T, A>::size>());
return kernel::detail::reduce(std::forward<F>(f), x, std::integral_constant<unsigned, batch<T, A>::size>());
}
/**


@ -16,6 +16,13 @@
#include "xsimd_batch.hpp"
/**
* high level type traits
*
* @defgroup batch_traits Type traits
*
**/
namespace xsimd
{
@ -205,11 +212,18 @@ namespace xsimd
template <class T1, class T2, class A = default_arch>
using simd_return_type = typename detail::simd_return_type_impl<T1, T2, A>::type;
/************
* is_batch *
************/
/**
* @ingroup batch_traits
*
* type trait that inherits from @c std::true_type for @c batch<...> types and from
* @c std::false_type otherwise.
*
* @tparam T type to analyze.
*/
template <class T>
struct is_batch;
template <class V>
template <class T>
struct is_batch : std::false_type
{
};
@ -219,11 +233,16 @@ namespace xsimd
{
};
/*****************
* is_batch_bool *
*****************/
/**
* @ingroup batch_traits
*
* type trait that inherits from @c std::true_type for @c batch_bool<...> types and from
* @c std::false_type otherwise.
*
* @tparam T type to analyze.
*/
template <class V>
template <class T>
struct is_batch_bool : std::false_type
{
};
@ -233,11 +252,16 @@ namespace xsimd
{
};
/********************
* is_batch_complex *
********************/
/**
* @ingroup batch_traits
*
* type trait that inherits from @c std::true_type for @c batch<std::complex<...>>
* types and from @c std::false_type otherwise.
*
* @tparam T type to analyze.
*/
template <class V>
template <class T>
struct is_batch_complex : std::false_type
{
};
@ -246,6 +270,50 @@ namespace xsimd
struct is_batch_complex<batch<std::complex<T>, A>> : std::true_type
{
};
/**
* @ingroup batch_traits
*
* type trait whose @c type field is set to @c T::value_type if @c
* is_batch<T>::value and to @c T otherwise.
*
* @tparam T type to analyze.
*/
template <class T>
struct scalar_type
{
using type = T;
};
template <class T, class A>
struct scalar_type<batch<T, A>>
{
using type = T;
};
template <class T>
using scalar_type_t = typename scalar_type<T>::type;
/**
* @ingroup batch_traits
*
* type trait whose @c type field is set to the @c batch_bool type associated
* with @c T if @c is_batch<T>::value, and to @c bool otherwise.
*
* @tparam T type to analyze.
*/
template <class T>
struct mask_type
{
using type = bool;
};
template <class T, class A>
struct mask_type<batch<T, A>>
{
using type = typename batch<T, A>::batch_bool_type;
};
template <class T>
using mask_type_t = typename mask_type<T>::type;
}
#endif
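// Compile-time sketch of the traits documented above (illustrative only;
// assumes <type_traits> and the xsimd headers are available):
static_assert(xsimd::is_batch<xsimd::batch<float>>::value, "batch<float> is a batch");
static_assert(!xsimd::is_batch<float>::value, "a scalar is not a batch");
static_assert(std::is_same<xsimd::scalar_type_t<xsimd::batch<int32_t>>, int32_t>::value,
              "scalar_type_t unwraps the batch");
static_assert(std::is_same<xsimd::mask_type_t<float>, bool>::value,
              "mask_type_t of a scalar is bool");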

third_party/xsimd/moz.yaml (vendored)

@ -10,8 +10,8 @@ origin:
url: https://github.com/QuantStack/xsimd
release: 75b043b8e031f1ada8053fe80d5ba635e2a75588 (2023-01-05T06:45:23Z).
revision: 75b043b8e031f1ada8053fe80d5ba635e2a75588
release: e8f209c3397c8a866be2312682689a04e4abfd66 (2023-02-27T06:32:46Z).
revision: e8f209c3397c8a866be2312682689a04e4abfd66
license: BSD-3-Clause