Mirror of https://github.com/mozilla/gecko-dev.git

Bug 1786069 - Update libjxl and highway r=tnikkel

Differential Revision: https://phabricator.services.mozilla.com/D155272

Parent: ddc22eb2ef
Commit: ca5565dea6

media/highway/moz.build:

@@ -11,6 +11,7 @@ LOCAL_INCLUDES += [
 SOURCES += [
     "/third_party/highway/hwy/aligned_allocator.cc",
     "/third_party/highway/hwy/contrib/image/image.cc",
+    "/third_party/highway/hwy/per_target.cc",
     "/third_party/highway/hwy/targets.cc",
 ]

@@ -29,6 +30,7 @@ EXPORTS.hwy += [
 EXPORTS.hwy.ops += [
     "/third_party/highway/hwy/ops/arm_neon-inl.h",
     "/third_party/highway/hwy/ops/arm_sve-inl.h",
+    "/third_party/highway/hwy/ops/emu128-inl.h",
     "/third_party/highway/hwy/ops/generic_ops-inl.h",
     "/third_party/highway/hwy/ops/rvv-inl.h",
     "/third_party/highway/hwy/ops/scalar-inl.h",

media/highway/moz.yaml:

@@ -20,11 +20,11 @@ origin:
   # Human-readable identifier for this version/release
   # Generally "version NNN", "tag SSS", "bookmark SSS"
-  release: commit f13e3b956eb226561ac79427893ec0afd66f91a8 (2022-02-15T18:19:21Z).
+  release: 7f2e26854086fba4255220fd6c77e9141f1f87cc

   # Revision to pull in
   # Must be a long or short commit SHA (long preferred)
-  revision: f13e3b956eb226561ac79427893ec0afd66f91a8
+  revision: 7f2e26854086fba4255220fd6c77e9141f1f87cc

   # The package's license, where possible using the mnemonic from
   # https://spdx.org/licenses/

media/libjxl/moz.build:

@@ -49,7 +49,6 @@ SOURCES += [
     "/third_party/jpeg-xl/lib/jxl/enc_bit_writer.cc",
     "/third_party/jpeg-xl/lib/jxl/entropy_coder.cc",
     "/third_party/jpeg-xl/lib/jxl/epf.cc",
     "/third_party/jpeg-xl/lib/jxl/exif.cc",
     "/third_party/jpeg-xl/lib/jxl/fast_dct.cc",
     "/third_party/jpeg-xl/lib/jxl/fields.cc",
     "/third_party/jpeg-xl/lib/jxl/frame_header.cc",

media/libjxl/moz.yaml:

@@ -10,9 +10,9 @@ origin:
   url: https://github.com/libjxl/libjxl

-  release: 59f7d19e454bc5e1edd692187e1178f5926fdfd9 (2022-07-28T11:13:01Z).
+  release: bb8eac5d6acec223e44cf8cc72ae02f0816de311

-  revision: 59f7d19e454bc5e1edd692187e1178f5926fdfd9
+  revision: bb8eac5d6acec223e44cf8cc72ae02f0816de311

   license: Apache-2.0

third_party/highway/BUILD:

@@ -1,7 +1,9 @@
+load("@bazel_skylib//lib:selects.bzl", "selects")
 load("@rules_cc//cc:defs.bzl", "cc_test")

-package(default_visibility = ["//visibility:public"])
+package(
+    default_visibility = ["//visibility:public"],
+)

 licenses(["notice"])

@@ -14,10 +16,33 @@ config_setting(
 )

 config_setting(
-    name = "compiler_msvc",
+    name = "compiler_clangcl",
     flag_values = {"@bazel_tools//tools/cpp:compiler": "lexan"},
 )

+config_setting(
+    name = "compiler_msvc_actual",
+    flag_values = {"@bazel_tools//tools/cpp:compiler": "msvc"},
+)
+
+# The above is insufficient for Bazel on Windows, which does not seem to
+# detect/set a compiler flag. This workaround prevents compile errors due to
+# passing clang-only warning flags to MSVC.
+config_setting(
+    name = "compiler_msvc_cpu",
+    values = {
+        "cpu": "x64_windows",
+    },
+)
+
+selects.config_setting_group(
+    name = "compiler_msvc",
+    match_any = [
+        ":compiler_msvc_actual",
+        ":compiler_msvc_cpu",
+    ],
+)
+
 config_setting(
     name = "compiler_emscripten",
     values = {"cpu": "wasm32"},

@@ -54,8 +79,8 @@ CLANG_GCC_COPTS = [
     "-Wunreachable-code",
 ]

-# Additional warnings only supported by Clang
-CLANG_ONLY_COPTS = [
+# Warnings supported by Clang and Clang-cl
+CLANG_OR_CLANGCL_OPTS = CLANG_GCC_COPTS + [
     "-Wfloat-overflow-conversion",
     "-Wfloat-zero-conversion",
     "-Wfor-loop-analysis",

@@ -73,11 +98,19 @@ CLANG_ONLY_COPTS = [
     "-Wunused-comparison",
 ]

+# Warnings only supported by Clang, but not Clang-cl
+CLANG_ONLY_COPTS = CLANG_OR_CLANGCL_OPTS + [
+    # Do not treat the third_party headers as system headers when building
+    # highway - the errors are pertinent.
+    "--no-system-header-prefix=third_party/highway",
+]
+
 COPTS = select({
     ":compiler_msvc": [],
     ":compiler_gcc": CLANG_GCC_COPTS,
+    ":compiler_clangcl": CLANG_OR_CLANGCL_OPTS,
     # Default to clang because compiler detection only works in Bazel
-    "//conditions:default": CLANG_GCC_COPTS + CLANG_ONLY_COPTS,
+    "//conditions:default": CLANG_ONLY_COPTS,
 }) + select({
     "@platforms//cpu:riscv64": [
         "-march=rv64gcv1p0",

@@ -87,6 +120,12 @@ COPTS = select({
     ],
 })

+DEFINES = select({
+    ":compiler_msvc": ["HWY_SHARED_DEFINE"],
+    ":compiler_clangcl": ["HWY_SHARED_DEFINE"],
+    "//conditions:default": [],
+})
+
 # Unused on Bazel builds, where this is not defined/known; Copybara replaces
 # usages with an empty list.
 COMPAT = [

@@ -102,6 +141,8 @@ cc_library(
     name = "hwy",
     srcs = [
         "hwy/aligned_allocator.cc",
+        "hwy/per_target.cc",
+        "hwy/print.cc",
         "hwy/targets.cc",
     ],
     # Normal headers with include guards

@@ -110,10 +151,12 @@ cc_library(
         "hwy/base.h",
         "hwy/cache_control.h",
         "hwy/detect_compiler_arch.h",  # private
         "hwy/highway_export.h",
+        "hwy/print.h",
     ],
     compatible_with = [],
     copts = COPTS,
+    defines = DEFINES,
     local_defines = ["hwy_EXPORTS"],
     textual_hdrs = [
         # These are textual because config macros influence them:
         "hwy/detect_targets.h",  # private

@@ -121,8 +164,12 @@ cc_library(
         # End of list
         "hwy/highway.h",  # public
+        "hwy/foreach_target.h",  # public
+        "hwy/per_target.h",  # public
+        "hwy/print-inl.h",  # public
         "hwy/highway_export.h",  # public
         "hwy/ops/arm_neon-inl.h",
         "hwy/ops/arm_sve-inl.h",
+        "hwy/ops/emu128-inl.h",
         "hwy/ops/generic_ops-inl.h",
         "hwy/ops/scalar-inl.h",
         "hwy/ops/set_macros-inl.h",

@@ -140,9 +187,24 @@ cc_library(
     }),
 )

+cc_library(
+    name = "algo",
+    compatible_with = [],
+    copts = COPTS,
+    textual_hdrs = [
+        "hwy/contrib/algo/copy-inl.h",
+        "hwy/contrib/algo/find-inl.h",
+        "hwy/contrib/algo/transform-inl.h",
+    ],
+    deps = [
+        ":hwy",
+    ],
+)
+
 cc_library(
     name = "dot",
     compatible_with = [],
     copts = COPTS,
     textual_hdrs = [
         "hwy/contrib/dot/dot-inl.h",
     ],

@@ -160,6 +222,8 @@ cc_library(
         "hwy/contrib/image/image.h",
     ],
+    compatible_with = [],
+    copts = COPTS,
     local_defines = ["hwy_contrib_EXPORTS"],
     deps = [
         ":hwy",
     ],

@@ -168,6 +232,7 @@ cc_library(
 cc_library(
     name = "math",
+    compatible_with = [],
     copts = COPTS,
     textual_hdrs = [
         "hwy/contrib/math/math-inl.h",
     ],

@@ -181,6 +246,9 @@ cc_library(
     name = "hwy_test_util",
     srcs = ["hwy/tests/test_util.cc"],
     hdrs = ["hwy/tests/test_util.h"],
+    compatible_with = [],
+    copts = COPTS,
+    local_defines = ["hwy_test_EXPORTS"],
     textual_hdrs = [
         "hwy/tests/test_util-inl.h",
         "hwy/tests/hwy_gtest.h",

@@ -196,12 +264,16 @@ cc_library(
     name = "nanobenchmark",
     srcs = ["hwy/nanobenchmark.cc"],
     hdrs = ["hwy/nanobenchmark.h"],
+    compatible_with = [],
+    copts = COPTS,
+    local_defines = ["hwy_EXPORTS"],
     deps = [":hwy"],
 )

 cc_binary(
     name = "benchmark",
     srcs = ["hwy/examples/benchmark.cc"],
+    copts = COPTS,
     deps = [
         ":hwy",
         ":nanobenchmark",

@@ -212,6 +284,8 @@ cc_library(
     name = "skeleton",
     srcs = ["hwy/examples/skeleton.cc"],
     hdrs = ["hwy/examples/skeleton.h"],
+    copts = COPTS,
+    local_defines = ["hwy_EXPORTS"],
     textual_hdrs = ["hwy/examples/skeleton-inl.h"],
     deps = [
         ":hwy",

@@ -226,6 +300,9 @@ cc_binary(

 # path, name
 HWY_TESTS = [
+    ("hwy/contrib/algo/", "copy_test"),
+    ("hwy/contrib/algo/", "find_test"),
+    ("hwy/contrib/algo/", "transform_test"),
     ("hwy/contrib/dot/", "dot_test"),
     ("hwy/contrib/image/", "image_test"),
     ("hwy/contrib/math/", "math_test"),

@@ -238,20 +315,40 @@ HWY_TESTS = [
     ("hwy/", "targets_test"),
     ("hwy/tests/", "arithmetic_test"),
     ("hwy/tests/", "blockwise_test"),
+    ("hwy/tests/", "blockwise_shift_test"),
     ("hwy/tests/", "combine_test"),
     ("hwy/tests/", "compare_test"),
+    ("hwy/tests/", "compress_test"),
     ("hwy/tests/", "convert_test"),
     ("hwy/tests/", "crypto_test"),
     ("hwy/tests/", "demote_test"),
+    ("hwy/tests/", "float_test"),
+    ("hwy/tests/", "if_test"),
+    ("hwy/tests/", "interleaved_test"),
     ("hwy/tests/", "logical_test"),
     ("hwy/tests/", "mask_test"),
+    ("hwy/tests/", "mask_mem_test"),
     ("hwy/tests/", "memory_test"),
+    ("hwy/tests/", "mul_test"),
+    ("hwy/tests/", "reduction_test"),
+    ("hwy/tests/", "reverse_test"),
     ("hwy/tests/", "shift_test"),
     ("hwy/tests/", "swizzle_test"),
     ("hwy/tests/", "test_util_test"),
 ]

+HWY_TEST_COPTS = select({
+    ":compiler_msvc": [],
+    "//conditions:default": [
+        # gTest triggers this warning (which is enabled by the
+        # extra-semi in COPTS), so we need to disable it here,
+        # but it's still enabled for :hwy.
+        "-Wno-c++98-compat-extra-semi",
+    ],
+})
+
 HWY_TEST_DEPS = [
+    ":algo",
     ":dot",
     ":hwy",
     ":hwy_test_util",

@@ -272,12 +369,7 @@ HWY_TEST_DEPS = [
         srcs = [
             subdir + test + ".cc",
         ],
-        copts = COPTS + [
-            # gTest triggers this warning (which is enabled by the
-            # extra-semi in COPTS), so we need to disable it here,
-            # but it's still enabled for :hwy.
-            "-Wno-c++98-compat-extra-semi",
-        ],
+        copts = COPTS + HWY_TEST_COPTS,
         features = select({
             "@platforms//cpu:riscv64": ["fully_static_link"],
             "//conditions:default": [],

third_party/highway/CMakeLists.txt:

@@ -19,7 +19,7 @@ if(POLICY CMP0083)
   cmake_policy(SET CMP0083 NEW)
 endif()

-project(hwy VERSION 0.16.0)  # Keep in sync with highway.h version
+project(hwy VERSION 1.0.0)  # Keep in sync with highway.h version

 # Directly define the ABI version from the cmake project() version values:
 set(LIBRARY_VERSION "${hwy_VERSION}")

@@ -48,6 +48,7 @@ set(HWY_CMAKE_ARM7 OFF CACHE BOOL "Set copts for ARMv7 with NEON (requires vfpv4
 # arise due to compiler/platform changes. Enable this in CI/tests.
 set(HWY_WARNINGS_ARE_ERRORS OFF CACHE BOOL "Add -Werror flag?")

+set(HWY_ENABLE_CONTRIB ON CACHE BOOL "Include contrib/")
 set(HWY_ENABLE_EXAMPLES ON CACHE BOOL "Build examples")
 set(HWY_ENABLE_INSTALL ON CACHE BOOL "Install library")

@@ -62,12 +63,25 @@ check_cxx_source_compiles(
   HWY_EMSCRIPTEN
 )

-set(HWY_CONTRIB_SOURCES
+check_cxx_source_compiles(
+  "int main() {
+    #if !defined(__riscv)
+    static_assert(false, \"__riscv is not defined\");
+    #endif
+    return 0;
+  }"
+  HWY_RISCV
+)
+
+if (HWY_ENABLE_CONTRIB)
+# Glob all the traits so we don't need to modify this file when adding
+# additional special cases.
+file(GLOB HWY_CONTRIB_SOURCES "hwy/contrib/sort/vqsort_*.cc")
+list(APPEND HWY_CONTRIB_SOURCES
   hwy/contrib/dot/dot-inl.h
   hwy/contrib/image/image.cc
   hwy/contrib/image/image.h
   hwy/contrib/math/math-inl.h
   hwy/contrib/sort/disabled_targets.h
   hwy/contrib/sort/shared-inl.h
   hwy/contrib/sort/sorting_networks-inl.h
   hwy/contrib/sort/traits-inl.h

@@ -75,25 +89,8 @@ set(HWY_CONTRIB_SOURCES
   hwy/contrib/sort/vqsort-inl.h
   hwy/contrib/sort/vqsort.cc
   hwy/contrib/sort/vqsort.h
-  hwy/contrib/sort/vqsort_128a.cc
-  hwy/contrib/sort/vqsort_128d.cc
-  hwy/contrib/sort/vqsort_f32a.cc
-  hwy/contrib/sort/vqsort_f32d.cc
-  hwy/contrib/sort/vqsort_f64a.cc
-  hwy/contrib/sort/vqsort_f64d.cc
-  hwy/contrib/sort/vqsort_i16a.cc
-  hwy/contrib/sort/vqsort_i16d.cc
-  hwy/contrib/sort/vqsort_i32a.cc
-  hwy/contrib/sort/vqsort_i32d.cc
-  hwy/contrib/sort/vqsort_i64a.cc
-  hwy/contrib/sort/vqsort_i64d.cc
-  hwy/contrib/sort/vqsort_u16a.cc
-  hwy/contrib/sort/vqsort_u16d.cc
-  hwy/contrib/sort/vqsort_u32a.cc
-  hwy/contrib/sort/vqsort_u32d.cc
-  hwy/contrib/sort/vqsort_u64a.cc
-  hwy/contrib/sort/vqsort_u64d.cc
 )
+endif()  # HWY_ENABLE_CONTRIB

 set(HWY_SOURCES
   hwy/aligned_allocator.cc

@@ -109,6 +106,7 @@ set(HWY_SOURCES
   hwy/nanobenchmark.h
   hwy/ops/arm_neon-inl.h
   hwy/ops/arm_sve-inl.h
+  hwy/ops/emu128-inl.h
   hwy/ops/generic_ops-inl.h
   hwy/ops/scalar-inl.h
   hwy/ops/set_macros-inl.h

@@ -117,6 +115,11 @@ set(HWY_SOURCES
   hwy/ops/x86_128-inl.h
   hwy/ops/x86_256-inl.h
   hwy/ops/x86_512-inl.h
+  hwy/per_target.cc
+  hwy/per_target.h
+  hwy/print-inl.h
+  hwy/print.cc
+  hwy/print.h
   hwy/targets.cc
   hwy/targets.h
 )

@@ -129,7 +132,10 @@ set(HWY_TEST_SOURCES
 )

 if (MSVC)
-  # TODO(janwas): add flags
+  set(HWY_FLAGS
+    # fix build error C1128 in blockwise*_test & arithmetic_test
+    /bigobj
+  )
 else()
   set(HWY_FLAGS
     # Avoid changing binaries based on the current time and date.

@@ -215,10 +221,23 @@ else()
     )
   endif()  # HWY_CMAKE_ARM7

+  if(HWY_RISCV)
+    list(APPEND HWY_FLAGS -march=rv64gcv1p0)
+    if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
+      list(APPEND HWY_FLAGS -menable-experimental-extensions)
+    endif()
+  endif()
+
   if (HWY_WARNINGS_ARE_ERRORS)
     list(APPEND HWY_FLAGS -Werror)
   endif()

+  # Prevent "wasm-ld: error: --shared-memory is disallowed by targets.cc.o
+  # because it was not compiled with 'atomics' or 'bulk-memory' features."
+  if (HWY_EMSCRIPTEN)
+    list(APPEND HWY_FLAGS -matomics)
+  endif()
+
 endif()  # !MSVC

 # By default prefer STATIC build (legacy behavior)

@@ -249,31 +268,57 @@ target_compile_definitions(hwy PUBLIC "${DLLEXPORT_TO_DEFINE}")
 target_compile_options(hwy PRIVATE ${HWY_FLAGS})
 set_property(TARGET hwy PROPERTY POSITION_INDEPENDENT_CODE ON)
 set_target_properties(hwy PROPERTIES VERSION ${LIBRARY_VERSION} SOVERSION ${LIBRARY_SOVERSION})
-target_include_directories(hwy PUBLIC ${CMAKE_CURRENT_LIST_DIR})
+target_include_directories(hwy PUBLIC
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
 target_compile_features(hwy PUBLIC cxx_std_11)
 set_target_properties(hwy PROPERTIES
     LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version)
-# not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations)
 if(UNIX AND NOT APPLE)
+  if(NOT HWY_EMSCRIPTEN)
+    # For GCC __atomic_store_8, see #887
+    target_link_libraries(hwy atomic)
+  endif()
+  # not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations)
   set_property(TARGET hwy APPEND_STRING PROPERTY
     LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version")
 endif()

+if (HWY_ENABLE_CONTRIB)
 add_library(hwy_contrib ${HWY_LIBRARY_TYPE} ${HWY_CONTRIB_SOURCES})
 target_link_libraries(hwy_contrib hwy)
 target_compile_options(hwy_contrib PRIVATE ${HWY_FLAGS})
 set_property(TARGET hwy_contrib PROPERTY POSITION_INDEPENDENT_CODE ON)
 set_target_properties(hwy_contrib PROPERTIES VERSION ${LIBRARY_VERSION} SOVERSION ${LIBRARY_SOVERSION})
-target_include_directories(hwy_contrib PUBLIC ${CMAKE_CURRENT_LIST_DIR})
+target_include_directories(hwy_contrib PUBLIC
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
 target_compile_features(hwy_contrib PUBLIC cxx_std_11)
+set_target_properties(hwy_contrib PROPERTIES
+    LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version)
+# not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations)
+if(UNIX AND NOT APPLE)
+  set_property(TARGET hwy_contrib APPEND_STRING PROPERTY
+    LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version")
+endif()
+endif()  # HWY_ENABLE_CONTRIB

 add_library(hwy_test ${HWY_LIBRARY_TYPE} ${HWY_TEST_SOURCES})
 target_link_libraries(hwy_test hwy)
 target_compile_options(hwy_test PRIVATE ${HWY_FLAGS})
 set_property(TARGET hwy_test PROPERTY POSITION_INDEPENDENT_CODE ON)
 set_target_properties(hwy_test PROPERTIES VERSION ${LIBRARY_VERSION} SOVERSION ${LIBRARY_SOVERSION})
-target_include_directories(hwy_test PUBLIC ${CMAKE_CURRENT_LIST_DIR})
+target_include_directories(hwy_test PUBLIC
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
 target_compile_features(hwy_test PUBLIC cxx_std_11)
+set_target_properties(hwy_test PROPERTIES
+    LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version)
+# not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations)
+if(UNIX AND NOT APPLE)
+  set_property(TARGET hwy_test APPEND_STRING PROPERTY
+    LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version")
+endif()

 # -------------------------------------------------------- hwy_list_targets
 # Generate a tool to print the compiled-in targets as defined by the current

@@ -313,6 +358,7 @@ foreach (source ${HWY_SOURCES})
   endif()
 endforeach()

+if (HWY_ENABLE_CONTRIB)
 install(TARGETS hwy_contrib
         LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
         ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"

@@ -326,6 +372,7 @@ foreach (source ${HWY_CONTRIB_SOURCES})
             DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${dirname}")
   endif()
 endforeach()
+endif()  # HWY_ENABLE_CONTRIB

 install(TARGETS hwy_test
         LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"

@@ -343,13 +390,17 @@ endforeach()

 # Add a pkg-config file for libhwy and the contrib/test libraries.
 set(HWY_LIBRARY_VERSION "${CMAKE_PROJECT_VERSION}")
-foreach (pc libhwy.pc libhwy-contrib.pc libhwy-test.pc)
+set(HWY_PC_FILES libhwy.pc libhwy-test.pc)
+if (HWY_ENABLE_CONTRIB)
+  list(APPEND HWY_PC_FILES libhwy-contrib.pc)
+endif()  # HWY_ENABLE_CONTRIB
+foreach (pc ${HWY_PC_FILES})
   configure_file("${CMAKE_CURRENT_SOURCE_DIR}/${pc}.in" "${pc}" @ONLY)
   install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${pc}"
           DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
 endforeach()

 endif()  # HWY_ENABLE_INSTALL
 # -------------------------------------------------------- Examples
 if (HWY_ENABLE_EXAMPLES)

@@ -360,14 +411,14 @@ set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
 add_executable(hwy_benchmark hwy/examples/benchmark.cc)
 target_sources(hwy_benchmark PRIVATE
     hwy/nanobenchmark.h)
-# Try adding either -DHWY_COMPILE_ONLY_SCALAR or -DHWY_COMPILE_ONLY_STATIC to
-# observe the difference in targets printed.
+# Try adding one of -DHWY_COMPILE_ONLY_SCALAR, -DHWY_COMPILE_ONLY_EMU128 or
+# -DHWY_COMPILE_ONLY_STATIC to observe the difference in targets printed.
 target_compile_options(hwy_benchmark PRIVATE ${HWY_FLAGS})
 target_link_libraries(hwy_benchmark hwy)
 set_target_properties(hwy_benchmark
     PROPERTIES RUNTIME_OUTPUT_DIRECTORY "examples/")

 endif()  # HWY_ENABLE_EXAMPLES
 # -------------------------------------------------------- Tests

 include(CTest)

@@ -411,34 +462,62 @@ add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/googletest-src
 if (CMAKE_VERSION VERSION_LESS 2.8.11)
   include_directories("${gtest_SOURCE_DIR}/include")
 endif()
 endif()  # HWY_SYSTEM_GTEST

 set(HWY_TEST_FILES
+  hwy/contrib/algo/copy_test.cc
+  hwy/contrib/algo/find_test.cc
+  hwy/contrib/algo/transform_test.cc
   hwy/aligned_allocator_test.cc
   hwy/base_test.cc
   hwy/highway_test.cc
   hwy/nanobenchmark_test.cc
   hwy/targets_test.cc
   hwy/examples/skeleton_test.cc
   hwy/tests/arithmetic_test.cc
   hwy/tests/blockwise_test.cc
+  hwy/tests/blockwise_shift_test.cc
   hwy/tests/combine_test.cc
   hwy/tests/compare_test.cc
+  hwy/tests/compress_test.cc
   hwy/tests/convert_test.cc
   hwy/tests/crypto_test.cc
   hwy/tests/demote_test.cc
+  hwy/tests/float_test.cc
+  hwy/tests/if_test.cc
+  hwy/tests/interleaved_test.cc
   hwy/tests/logical_test.cc
   hwy/tests/mask_test.cc
+  hwy/tests/mask_mem_test.cc
   hwy/tests/memory_test.cc
+  hwy/tests/mul_test.cc
+  hwy/tests/reduction_test.cc
+  hwy/tests/reverse_test.cc
   hwy/tests/shift_test.cc
   hwy/tests/swizzle_test.cc
   hwy/tests/test_util_test.cc
 )

+set(HWY_TEST_LIBS hwy hwy_test)
+
+if (HWY_ENABLE_CONTRIB)
+list(APPEND HWY_TEST_LIBS hwy_contrib)
+
+list(APPEND HWY_TEST_FILES
   hwy/contrib/dot/dot_test.cc
   hwy/contrib/image/image_test.cc
   # Disabled due to SIGILL in clang7 debug build during gtest discovery phase,
   # not reproducible locally. Still tested via bazel build.
   # hwy/contrib/math/math_test.cc
   hwy/contrib/sort/sort_test.cc
-  hwy/aligned_allocator_test.cc
-  hwy/base_test.cc
-  hwy/highway_test.cc
-  hwy/targets_test.cc
-  hwy/examples/skeleton_test.cc
-  hwy/tests/arithmetic_test.cc
-  hwy/tests/blockwise_test.cc
-  hwy/tests/combine_test.cc
-  hwy/tests/compare_test.cc
-  hwy/tests/convert_test.cc
-  hwy/tests/crypto_test.cc
-  hwy/tests/demote_test.cc
-  hwy/tests/logical_test.cc
-  hwy/tests/mask_test.cc
-  hwy/tests/memory_test.cc
-  hwy/tests/shift_test.cc
-  hwy/tests/swizzle_test.cc
-  hwy/tests/test_util_test.cc
 )
+endif()  # HWY_ENABLE_CONTRIB

+if(HWY_SYSTEM_GTEST)
+set(HWY_GTEST_LIBS GTest::GTest GTest::Main)
+else()
+set(HWY_GTEST_LIBS gtest gtest_main)
+endif()
+
 file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tests)
 foreach (TESTFILE IN LISTS HWY_TEST_FILES)

@@ -452,11 +531,7 @@ foreach (TESTFILE IN LISTS HWY_TEST_FILES)
   # that include us may set them.
   target_compile_options(${TESTNAME} PRIVATE -DHWY_IS_TEST=1)

-  if(HWY_SYSTEM_GTEST)
-    target_link_libraries(${TESTNAME} hwy hwy_contrib hwy_test GTest::GTest GTest::Main)
-  else()
-    target_link_libraries(${TESTNAME} hwy hwy_contrib hwy_test gtest gtest_main)
-  endif()
+  target_link_libraries(${TESTNAME} ${HWY_TEST_LIBS} ${HWY_GTEST_LIBS})
   # Output test targets in the test directory.
   set_target_properties(${TESTNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "tests")

@@ -474,4 +549,4 @@ endforeach ()

 # The skeleton test uses the skeleton library code.
 target_sources(skeleton_test PRIVATE hwy/examples/skeleton.cc)

 endif()  # BUILD_TESTING

third_party/highway/README.md:

@@ -64,7 +64,7 @@ via the below email)
 *   [iresearch database index](https://github.com/iresearch-toolkit/iresearch/blob/e7638e7a4b99136ca41f82be6edccf01351a7223/core/utils/simd_utils.hpp)
 *   [JPEG XL image codec](https://github.com/libjxl/libjxl)
 *   [Grok JPEG 2000 image codec](https://github.com/GrokImageCompression/grok)
-*   [vectorized Quicksort](https://github.com/google/highway/tree/master/hwy/contrib/sort)
+*   [vectorized Quicksort](https://github.com/google/highway/tree/master/hwy/contrib/sort) ([paper](https://arxiv.org/abs/2205.05982))

 ## Current status

@@ -72,11 +72,9 @@ via the below email)

 Supported targets: scalar, S-SSE3, SSE4, AVX2, AVX-512, AVX3_DL (~Icelake,
 requires opt-in by defining `HWY_WANT_AVX3_DL`), NEON (ARMv7 and v8), SVE, SVE2,
-WASM SIMD.
+WASM SIMD, RISC-V V.

-SVE was initially tested using farm_sve (see acknowledgments). A subset of RVV
-is implemented and tested with LLVM and QEMU. Work is underway to add RVV ops
-which were not yet supported by GCC.
+SVE was initially tested using farm_sve (see acknowledgments).

 ### Versioning

@@ -85,18 +83,19 @@ incrementing MINOR after backward-compatible additions and PATCH after
 backward-compatible fixes. We recommend using releases (rather than the Git tip)
 because they are tested more extensively, see below.

-Version 0.11 is considered stable enough to use in other projects.
-Version 1.0 will signal an increased focus on backwards compatibility and will
-be reached after the RVV target is finished (planned for 2022H1).
+The current version 1.0 signals an increased focus on backwards compatibility.
+Applications using documented functionality will remain compatible with future
+updates that have the same major version number.

 ### Testing

 Continuous integration tests build with a recent version of Clang (running on
-x86 and QEMU for ARM) and MSVC from VS2015 (running on x86).
+native x86, or QEMU for RVV and ARM) and MSVC 2019 (v19.28, running on native
+x86).

-Before releases, we also test on x86 with Clang and GCC, and ARMv7/8 via
-GCC cross-compile and QEMU. See the
-[testing process](g3doc/release_testing_process.md) for details.
+Before releases, we also test on x86 with Clang and GCC, and ARMv7/8 via GCC
+cross-compile. See the [testing process](g3doc/release_testing_process.md) for
+details.

 ### Related modules

@@ -148,21 +147,29 @@ portability. To obtain them, pass a `ScalableTag<float>` (or equivalently
 `HWY_FULL(float)`) tag to functions such as `Zero/Set/Load`. There are two
 alternatives for use-cases requiring an upper bound on the lanes:

-*   For up to a power of two `N`, specify `CappedTag<T, N>` (or
-    equivalently `HWY_CAPPED(T, N)`). This is useful for data structures such as
-    a narrow matrix. A loop is still required because vectors may actually have
-    fewer than `N` lanes.
+*   For up to `N` lanes, specify `CappedTag<T, N>` or the equivalent
+    `HWY_CAPPED(T, N)`. The actual number of lanes will be `N` rounded down to
+    the nearest power of two, such as 4 if `N` is 5, or 8 if `N` is 8. This is
+    useful for data structures such as a narrow matrix. A loop is still required
+    because vectors may actually have fewer than `N` lanes.

 *   For exactly a power of two `N` lanes, specify `FixedTag<T, N>`. The largest
     supported `N` depends on the target, but is guaranteed to be at least
     `16/sizeof(T)`.

-Functions using Highway must either be inside `namespace HWY_NAMESPACE {`
-(possibly nested in one or more other namespaces defined by the project), OR
-each op must be prefixed with `hn::`, e.g. `namespace hn = hwy::HWY_NAMESPACE;
-hn::LoadDup128()`. Additionally, each function using Highway must either be
-prefixed with `HWY_ATTR`, OR reside between `HWY_BEFORE_NAMESPACE()` and
-`HWY_AFTER_NAMESPACE()`.
+Due to ADL restrictions, user code calling Highway ops must either:
+*   Reside inside `namespace hwy { namespace HWY_NAMESPACE {`; or
+*   prefix each op with an alias such as `namespace hn = hwy::HWY_NAMESPACE;
+    hn::Add()`; or
+*   add using-declarations for each op used: `using hwy::HWY_NAMESPACE::Add;`.
+
+Additionally, each function that calls Highway ops must either be prefixed with
+`HWY_ATTR`, OR reside between `HWY_BEFORE_NAMESPACE()` and
+`HWY_AFTER_NAMESPACE()`. Lambda functions currently require `HWY_ATTR` before
+their opening brace.

 The entry points into code using Highway differ slightly depending on whether
 they use static or dynamic dispatch.

 *   For static dispatch, `HWY_TARGET` will be the best available target among
     `HWY_BASELINE_TARGETS`, i.e. those allowed for use by the compiler (see
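
For context, a minimal sketch of the alias-plus-`HWY_ATTR` convention the new text describes (the function `MulByTwo` and its body are illustrative, not part of this diff):

```cpp
#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

// HWY_ATTR enables the target's instruction set for this function's codegen.
HWY_ATTR void MulByTwo(float* HWY_RESTRICT p, size_t count) {
  const hn::ScalableTag<float> d;  // full-width vector tag
  const size_t N = hn::Lanes(d);
  for (size_t i = 0; i + N <= count; i += N) {
    const auto v = hn::LoadU(d, p + i);   // unaligned load of N lanes
    hn::StoreU(hn::Add(v, v), d, p + i);  // v + v == 2 * v
  }
  // Remainder lanes omitted; see the loop strategies later in this README.
}
```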

@@ -222,12 +229,26 @@ Highway offers several ways to express loops where `N` need not divide `count`:
     this avoids the (potentially large) cost of predication or partial
     loads/stores on older targets, and does not duplicate code.

+*   Use the `Transform*` functions in hwy/contrib/algo/transform-inl.h. This
+    takes care of the loop and remainder handling and you simply define a
+    generic lambda function (C++14) or functor which receives the current vector
+    from the input/output array, plus optionally vectors from up to two extra
+    input arrays, and returns the value to write to the input/output array.
+
+    Here is an example implementing the BLAS function SAXPY (`alpha * x + y`):
+
+    ```
+    Transform1(d, x, n, y, [](auto d, const auto v, const auto v1) HWY_ATTR {
+      return MulAdd(Set(d, alpha), v, v1);
+    });
+    ```
+
 *   Process whole vectors as above, followed by a scalar loop:

     ```
     size_t i = 0;
     for (; i + N <= count; i += N) LoopBody<false>(d, i, 0);
-    for (; i < count; ++i) LoopBody<false>(HWY_CAPPED(T, 1)(), i, 0);
+    for (; i < count; ++i) LoopBody<false>(CappedTag<T, 1>(), i, 0);
     ```
     The template parameter and second function arguments are again not needed.
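
A hedged sketch of how the `Transform1` snippet above might be wrapped into a complete function; unlike the README snippet, `alpha` is captured explicitly because it is a local variable here:

```cpp
// Assumed to reside inside namespace hwy::HWY_NAMESPACE (see above).
HWY_ATTR void Saxpy(float alpha, float* HWY_RESTRICT x, size_t n,
                    const float* HWY_RESTRICT y) {
  const ScalableTag<float> d;
  // v is the current vector from x (the in/out array), v1 the one from y;
  // the returned vector is stored back to x. Remainders are handled inside.
  Transform1(d, x, n, y,
             [alpha](auto d, const auto v, const auto v1) HWY_ATTR {
               return MulAdd(Set(d, alpha), v, v1);
             });
}
```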

@@ -247,10 +268,14 @@ Highway offers several ways to express loops where `N` need not divide `count`:
     }
     ```
     Now the template parameter and third function argument can be used inside
-    `LoopBody` to 'blend' the new partial vector with previous memory contents:
-    `Store(IfThenElse(FirstN(d, N), partial, prev_full), d, aligned_pointer);`.
+    `LoopBody` to non-atomically 'blend' the first `num_remaining` lanes of `v`
+    with the previous contents of memory at subsequent locations:
+    `BlendedStore(v, FirstN(d, num_remaining), d, pointer);`. Similarly,
+    `MaskedLoad(FirstN(d, num_remaining), d, pointer)` loads the first
+    `num_remaining` elements and returns zero in other lanes.

-    This is a good default when it is infeasible to ensure vectors are padded.
+    This is a good default when it is infeasible to ensure vectors are padded,
+    but is only safe `#if !HWY_MEM_OPS_MIGHT_FAULT`!
     In contrast to the scalar loop, only a single final iteration is needed.
     The increased code size from two loop bodies is expected to be worthwhile
     because it avoids the cost of masking in all but the final iteration.
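
A sketch of the masked-remainder pattern described above (names are illustrative; as the text notes, this is only safe when `HWY_MEM_OPS_MIGHT_FAULT` is 0):

```cpp
HWY_ATTR void AddOne(float* HWY_RESTRICT p, size_t count) {
#if !HWY_MEM_OPS_MIGHT_FAULT
  const ScalableTag<float> d;
  const size_t N = Lanes(d);
  size_t i = 0;
  for (; i + N <= count; i += N) {  // full vectors: no masking cost
    StoreU(Add(LoadU(d, p + i), Set(d, 1.0f)), d, p + i);
  }
  if (i != count) {  // single masked iteration for the tail
    const size_t num_remaining = count - i;
    const auto m = FirstN(d, num_remaining);
    const auto v = MaskedLoad(m, d, p + i);  // zero in inactive lanes
    BlendedStore(Add(v, Set(d, 1.0f)), m, d, p + i);
  }
#endif
}
```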

@@ -260,6 +285,7 @@ Highway offers several ways to express loops where `N` need not divide `count`:

 *   [Highway introduction (slides)](g3doc/highway_intro.pdf)
 *   [Overview of instructions per operation on different architectures](g3doc/instruction_matrix.pdf)
 *   [Design philosophy and comparison](g3doc/design_philosophy.md)
+*   [Implementation details](g3doc/impl_details.md)

 ## Acknowledgments

third_party/highway/debian/changelog:

@@ -1,3 +1,37 @@
+highway (1.0.0-1) UNRELEASED; urgency=medium
+
+  * ABI change: 64-bit target values, more room for expansion
+  * Add CompressBlocksNot, CompressNot, Lt128Upper, Min/Max128Upper, TruncateTo
+  * Add HWY_SVE2_128 target
+  * Sort speedups especially for 128-bit
+  * Documentation clarifications
+  * Faster NEON CountTrue/FindFirstTrue/AllFalse/AllTrue
+  * Improved SVE codegen
+  * Fix u16x8 ConcatEven/Odd, SSSE3 i64 Lt
+  * MSVC 2017 workarounds
+  * Support for runtime dispatch on Arm/GCC/Linux
+
+ -- Jan Wassenberg <janwas@google.com>  Wed, 27 Jul 2022 10:00:00 +0200
+
+highway (0.17.0-1) UNRELEASED; urgency=medium
+
+  * Add ExtractLane, InsertLane, IsInf, IsFinite, IsNaN
+  * Add StoreInterleaved2, LoadInterleaved2/3/4, BlendedStore, SafeFillN
+  * Add MulFixedPoint15, Or3
+  * Add Copy[If], Find[If], Generate, Replace[If] algos
+  * Add HWY_EMU128 target (replaces HWY_SCALAR)
+  * HWY_RVV is feature-complete
+  * Add HWY_ENABLE_CONTRIB build flag, HWY_NATIVE_FMA, HWY_WANT_SSSE3/SSE4 macros
+  * Extend ConcatOdd/Even and StoreInterleaved* to all types
+  * Allow CappedTag<T, nonPowerOfTwo>
+  * Sort speedups: 2x for AVX2, 1.09x for AVX3; avoid x86 malloc
+  * Expand documentation
+  * Fix RDTSCP crash in nanobenchmark
+  * Fix XCR0 check (was ignoring AVX3 on ICL)
+  * Support Arm/RISC-V timers
+
+ -- Jan Wassenberg <janwas@google.com>  Fri, 20 May 2022 10:00:00 +0200
+
 highway (0.16.0-1) UNRELEASED; urgency=medium

   * Add contrib/sort (vectorized quicksort)

third_party/highway/hwy/aligned_allocator.cc:

@@ -1,4 +1,5 @@
 // Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.

@@ -62,8 +63,8 @@ size_t NextAlignedOffset() {

 }  // namespace

-void* AllocateAlignedBytes(const size_t payload_size, AllocPtr alloc_ptr,
-                           void* opaque_ptr) {
+HWY_DLLEXPORT void* AllocateAlignedBytes(const size_t payload_size,
+                                         AllocPtr alloc_ptr, void* opaque_ptr) {
   HWY_ASSERT(payload_size != 0);  // likely a bug in caller
   if (payload_size >= std::numeric_limits<size_t>::max() / 2) {
     HWY_DASSERT(false && "payload_size too large");

@@ -109,8 +110,8 @@ void* AllocateAlignedBytes(const size_t payload_size, AllocPtr alloc_ptr,
   return HWY_ASSUME_ALIGNED(reinterpret_cast<void*>(payload), kAlignment);
 }

-void FreeAlignedBytes(const void* aligned_pointer, FreePtr free_ptr,
-                      void* opaque_ptr) {
+HWY_DLLEXPORT void FreeAlignedBytes(const void* aligned_pointer,
+                                    FreePtr free_ptr, void* opaque_ptr) {
   if (aligned_pointer == nullptr) return;

   const uintptr_t payload = reinterpret_cast<uintptr_t>(aligned_pointer);

@@ -126,9 +127,10 @@ void FreeAlignedBytes(const void* aligned_pointer, FreePtr free_ptr,
 }

 // static
-void AlignedDeleter::DeleteAlignedArray(void* aligned_pointer, FreePtr free_ptr,
-                                        void* opaque_ptr,
-                                        ArrayDeleter deleter) {
+HWY_DLLEXPORT void AlignedDeleter::DeleteAlignedArray(void* aligned_pointer,
+                                                      FreePtr free_ptr,
+                                                      void* opaque_ptr,
+                                                      ArrayDeleter deleter) {
   if (aligned_pointer == nullptr) return;

   const uintptr_t payload = reinterpret_cast<uintptr_t>(aligned_pointer);
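
For reference, callers normally reach these (now `HWY_DLLEXPORT`-annotated) entry points through the `AllocateAligned` wrapper declared in aligned_allocator.h; a minimal usage sketch assuming that public API:

```cpp
#include "hwy/aligned_allocator.h"

void AllocExample() {
  // Unique pointer whose deleter routes through FreeAlignedBytes.
  hwy::AlignedFreeUniquePtr<float[]> data = hwy::AllocateAligned<float>(1024);
  if (data) data[0] = 1.0f;
}  // freed automatically here
```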

third_party/highway/hwy/aligned_allocator.h:

@@ -1,4 +1,5 @@
 // Copyright 2020 Google LLC
+// SPDX-License-Identifier: Apache-2.0
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.

third_party/highway/hwy/aligned_allocator_test.cc:

@@ -1,4 +1,5 @@
 // Copyright 2020 Google LLC
+// SPDX-License-Identifier: Apache-2.0
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.

@@ -22,7 +23,6 @@
 #include <vector>

-#include "gtest/gtest.h"
 #include "hwy/base.h"

 namespace {

@@ -69,8 +69,8 @@ class FakeAllocator {
   void Free(void* memory) {
     if (!memory) return;
     EXPECT_NE(allocs_.end(), allocs_.find(memory));
-    free(memory);
     allocs_.erase(memory);
+    free(memory);
   }

   std::set<void*> allocs_;

@@ -276,9 +276,3 @@ TEST(AlignedAllocatorTest, DefaultInit) {
 }

 }  // namespace hwy
-
-// Ought not to be necessary, but without this, no tests run on RVV.
-int main(int argc, char** argv) {
-  ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}

third_party/highway/hwy/base.h:

@@ -1,4 +1,5 @@
 // Copyright 2020 Google LLC
+// SPDX-License-Identifier: Apache-2.0
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.

@@ -20,12 +21,13 @@
 #include <stddef.h>
 #include <stdint.h>

-#include <atomic>
 #include <cfloat>

 #include "hwy/detect_compiler_arch.h"
 #include "hwy/highway_export.h"

+#if HWY_ARCH_X86
+#include <atomic>
+#endif
+
 //------------------------------------------------------------------------------
 // Compiler-specific definitions

@@ -57,7 +59,13 @@
 #else

 #define HWY_RESTRICT __restrict__
+// force inlining without optimization enabled creates very inefficient code
+// that can cause compiler timeout
+#ifdef __OPTIMIZE__
 #define HWY_INLINE inline __attribute__((always_inline))
+#else
+#define HWY_INLINE inline
+#endif
 #define HWY_NOINLINE __attribute__((noinline))
 #define HWY_FLATTEN __attribute__((flatten))
 #define HWY_NORETURN __attribute__((noreturn))

@@ -165,6 +173,14 @@
 #define HWY_IS_TSAN 0
 #endif

+// MSAN may cause lengthy build times or false positives e.g. in AVX3 DemoteTo.
+// You can disable MSAN by adding this attribute to the function that fails.
+#if HWY_IS_MSAN
+#define HWY_ATTR_NO_MSAN __attribute__((no_sanitize_memory))
+#else
+#define HWY_ATTR_NO_MSAN
+#endif
+
 // For enabling HWY_DASSERT and shortening tests in slower debug builds
 #if !defined(HWY_IS_DEBUG_BUILD)
 // Clang does not define NDEBUG, but it and GCC define __OPTIMIZE__, and recent
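
A brief illustration of applying the new attribute as the comment suggests (the function and its body are hypothetical):

```cpp
// Suppresses memory-sanitizer instrumentation for one function that is
// known to trigger MSAN false positives (e.g. in AVX3 DemoteTo).
HWY_ATTR_NO_MSAN void DemoteBlock(const double* in, float* out, size_t n) {
  for (size_t i = 0; i < n; ++i) out[i] = static_cast<float>(in[i]);
}
```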

@@ -219,19 +235,17 @@ static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
 // Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
 // by concatenating base type and bits.

+#if HWY_ARCH_ARM && (__ARM_FP & 2)
+#define HWY_NATIVE_FLOAT16 1
+#else
+#define HWY_NATIVE_FLOAT16 0
+#endif
+
 #pragma pack(push, 1)

+#if HWY_NATIVE_FLOAT16
 // ACLE (https://gcc.gnu.org/onlinedocs/gcc/Half-Precision.html):
 // always supported on aarch64, for v7 only if -mfp16-format is given.
-#if ((HWY_ARCH_ARM_A64 || (__ARM_FP & 2)) && HWY_COMPILER_GCC)
 using float16_t = __fp16;
-// Clang does not allow __fp16 arguments, but scalar.h requires LaneType
-// arguments, so use a wrapper.
-// TODO(janwas): replace with _Float16 when that is supported?
+// C11 extension ISO/IEC TS 18661-3:2015 but not supported on all targets.
+// Required for Clang RVV if the float16 extension is used.
+#elif HWY_ARCH_RVV && HWY_COMPILER_CLANG && defined(__riscv_zvfh)
+using float16_t = _Float16;
+// Otherwise emulate
 #else
 struct float16_t {
   uint16_t bits;

@@ -247,6 +261,44 @@ struct bfloat16_t {
 using float32_t = float;
 using float64_t = double;

+#pragma pack(push, 1)
+
+// Aligned 128-bit type. Cannot use __int128 because clang doesn't yet align it:
+// https://reviews.llvm.org/D86310
+struct alignas(16) uint128_t {
+  uint64_t lo;  // little-endian layout
+  uint64_t hi;
+};
+
+// 64 bit key plus 64 bit value. Faster than using uint128_t when only the key
+// field is to be compared (Lt128Upper instead of Lt128).
+struct alignas(16) K64V64 {
+  uint64_t value;  // little-endian layout
+  uint64_t key;
+};
+
+#pragma pack(pop)
+
+static inline HWY_MAYBE_UNUSED bool operator<(const uint128_t& a,
+                                              const uint128_t& b) {
+  return (a.hi == b.hi) ? a.lo < b.lo : a.hi < b.hi;
+}
+// Required for std::greater.
+static inline HWY_MAYBE_UNUSED bool operator>(const uint128_t& a,
+                                              const uint128_t& b) {
+  return b < a;
+}
+
+static inline HWY_MAYBE_UNUSED bool operator<(const K64V64& a,
+                                              const K64V64& b) {
+  return a.key < b.key;
+}
+// Required for std::greater.
+static inline HWY_MAYBE_UNUSED bool operator>(const K64V64& a,
+                                              const K64V64& b) {
+  return b < a;
+}
+
 //------------------------------------------------------------------------------
 // Controlling overload resolution (SFINAE)
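
The comparison operators above make these keys usable with standard algorithms and comparators such as `std::greater`; a small hedged illustration:

```cpp
#include <algorithm>
#include <vector>
#include "hwy/base.h"

void SortU128Example() {
  // Aggregate init is {lo, hi}; operator< compares hi first, then lo,
  // i.e. unsigned 128-bit numeric order.
  std::vector<hwy::uint128_t> v = {{2, 0}, {1, 5}, {9, 0}};
  std::sort(v.begin(), v.end());  // finds hwy::operator< via ADL
  // For K64V64, operator< compares only the key field and ignores value.
}
```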

@@ -299,6 +351,11 @@ HWY_API constexpr bool IsSame() {
   hwy::EnableIf<sizeof(T) == (bytes)>* = nullptr
 #define HWY_IF_NOT_LANE_SIZE(T, bytes) \
   hwy::EnableIf<sizeof(T) != (bytes)>* = nullptr
+#define HWY_IF_LANE_SIZE_LT(T, bytes) \
+  hwy::EnableIf<sizeof(T) < (bytes)>* = nullptr
+
+#define HWY_IF_LANES_PER_BLOCK(T, N, LANES) \
+  hwy::EnableIf<HWY_MIN(sizeof(T) * N, 16) / sizeof(T) == (LANES)>* = nullptr

 // Empty struct used as a size tag type.
 template <size_t N>

@@ -328,12 +385,16 @@ struct Relations<uint8_t> {
   using Unsigned = uint8_t;
   using Signed = int8_t;
   using Wide = uint16_t;
+  enum { is_signed = 0 };
+  enum { is_float = 0 };
 };
 template <>
 struct Relations<int8_t> {
   using Unsigned = uint8_t;
   using Signed = int8_t;
   using Wide = int16_t;
+  enum { is_signed = 1 };
+  enum { is_float = 0 };
 };
 template <>
 struct Relations<uint16_t> {

@@ -341,6 +402,8 @@ struct Relations<uint16_t> {
   using Signed = int16_t;
   using Wide = uint32_t;
   using Narrow = uint8_t;
+  enum { is_signed = 0 };
+  enum { is_float = 0 };
 };
 template <>
 struct Relations<int16_t> {

@@ -348,6 +411,8 @@ struct Relations<int16_t> {
   using Signed = int16_t;
   using Wide = int32_t;
   using Narrow = int8_t;
+  enum { is_signed = 1 };
+  enum { is_float = 0 };
 };
 template <>
 struct Relations<uint32_t> {

@@ -356,6 +421,8 @@ struct Relations<uint32_t> {
   using Float = float;
   using Wide = uint64_t;
   using Narrow = uint16_t;
+  enum { is_signed = 0 };
+  enum { is_float = 0 };
 };
 template <>
 struct Relations<int32_t> {

@@ -364,13 +431,18 @@ struct Relations<int32_t> {
   using Float = float;
   using Wide = int64_t;
   using Narrow = int16_t;
+  enum { is_signed = 1 };
+  enum { is_float = 0 };
 };
 template <>
 struct Relations<uint64_t> {
   using Unsigned = uint64_t;
   using Signed = int64_t;
   using Float = double;
+  using Wide = uint128_t;
   using Narrow = uint32_t;
+  enum { is_signed = 0 };
+  enum { is_float = 0 };
 };
 template <>
 struct Relations<int64_t> {

@@ -378,6 +450,15 @@ struct Relations<int64_t> {
   using Signed = int64_t;
   using Float = double;
   using Narrow = int32_t;
+  enum { is_signed = 1 };
+  enum { is_float = 0 };
+};
+template <>
+struct Relations<uint128_t> {
+  using Unsigned = uint128_t;
+  using Narrow = uint64_t;
+  enum { is_signed = 0 };
+  enum { is_float = 0 };
 };
 template <>
 struct Relations<float16_t> {

@@ -385,12 +466,16 @@ struct Relations<float16_t> {
   using Signed = int16_t;
   using Float = float16_t;
   using Wide = float;
+  enum { is_signed = 1 };
+  enum { is_float = 1 };
 };
 template <>
 struct Relations<bfloat16_t> {
   using Unsigned = uint16_t;
   using Signed = int16_t;
   using Wide = float;
+  enum { is_signed = 1 };
+  enum { is_float = 1 };
 };
 template <>
 struct Relations<float> {

@@ -399,6 +484,8 @@ struct Relations<float> {
   using Float = float;
   using Wide = double;
   using Narrow = float16_t;
+  enum { is_signed = 1 };
+  enum { is_float = 1 };
 };
 template <>
 struct Relations<double> {

@@ -406,6 +493,8 @@ struct Relations<double> {
   using Signed = int64_t;
   using Float = double;
   using Narrow = float;
+  enum { is_signed = 1 };
+  enum { is_float = 1 };
 };

 template <size_t N>

@@ -432,6 +521,10 @@ struct TypeFromSize<8> {
   using Signed = int64_t;
   using Float = double;
 };
+template <>
+struct TypeFromSize<16> {
+  using Unsigned = uint128_t;
+};

 }  // namespace detail

@@ -457,6 +550,24 @@ using SignedFromSize = typename detail::TypeFromSize<N>::Signed;
 template <size_t N>
 using FloatFromSize = typename detail::TypeFromSize<N>::Float;

+// Avoid confusion with SizeTag where the parameter is a lane size.
+using UnsignedTag = SizeTag<0>;
+using SignedTag = SizeTag<0x100>;  // integer
+using FloatTag = SizeTag<0x200>;
+
+template <typename T, class R = detail::Relations<T>>
+constexpr auto TypeTag() -> hwy::SizeTag<((R::is_signed + R::is_float) << 8)> {
+  return hwy::SizeTag<((R::is_signed + R::is_float) << 8)>();
+}
+
+// For when we only want to distinguish FloatTag from everything else.
+using NonFloatTag = SizeTag<0x400>;
+
+template <typename T, class R = detail::Relations<T>>
+constexpr auto IsFloatTag() -> hwy::SizeTag<(R::is_float ? 0x200 : 0x400)> {
+  return hwy::SizeTag<(R::is_float ? 0x200 : 0x400)>();
+}
+
 //------------------------------------------------------------------------------
 // Type traits

@@ -502,11 +613,11 @@ HWY_API constexpr T LowestValue() {
 }
 template <>
 constexpr float LowestValue<float>() {
-  return -FLT_MAX;
+  return -3.402823466e+38F;
 }
 template <>
 constexpr double LowestValue<double>() {
-  return -DBL_MAX;
+  return -1.7976931348623158e+308;
 }

 template <typename T>

@@ -515,41 +626,51 @@ HWY_API constexpr T HighestValue() {
 }
 template <>
 constexpr float HighestValue<float>() {
-  return FLT_MAX;
+  return 3.402823466e+38F;
 }
 template <>
 constexpr double HighestValue<double>() {
-  return DBL_MAX;
+  return 1.7976931348623158e+308;
 }

+// Returns width in bits of the mantissa field in IEEE binary32/64.
+template <typename T>
+constexpr int MantissaBits() {
+  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
+  return 0;
+}
+template <>
+constexpr int MantissaBits<float>() {
+  return 23;
+}
+template <>
+constexpr int MantissaBits<double>() {
+  return 52;
+}
+
+// Returns the (left-shifted by one bit) IEEE binary32/64 representation with
+// the largest possible (biased) exponent field. Used by IsInf.
+template <typename T>
+constexpr MakeSigned<T> MaxExponentTimes2() {
+  return -(MakeSigned<T>{1} << (MantissaBits<T>() + 1));
+}
+
+// Returns bitmask of the sign bit in IEEE binary32/64.
+template <typename T>
+constexpr MakeUnsigned<T> SignMask() {
+  return MakeUnsigned<T>{1} << (sizeof(T) * 8 - 1);
+}
+
 // Returns bitmask of the exponent field in IEEE binary32/64.
 template <typename T>
-constexpr T ExponentMask() {
-  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
-  return 0;
-}
-template <>
-constexpr uint32_t ExponentMask<uint32_t>() {
-  return 0x7F800000;
-}
-template <>
-constexpr uint64_t ExponentMask<uint64_t>() {
-  return 0x7FF0000000000000ULL;
-}
+constexpr MakeUnsigned<T> ExponentMask() {
+  return (~(MakeUnsigned<T>{1} << MantissaBits<T>()) + 1) & ~SignMask<T>();
+}

 // Returns bitmask of the mantissa field in IEEE binary32/64.
 template <typename T>
-constexpr T MantissaMask() {
-  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
-  return 0;
-}
-template <>
-constexpr uint32_t MantissaMask<uint32_t>() {
-  return 0x007FFFFF;
-}
-template <>
-constexpr uint64_t MantissaMask<uint64_t>() {
-  return 0x000FFFFFFFFFFFFFULL;
-}
+constexpr MakeUnsigned<T> MantissaMask() {
+  return (MakeUnsigned<T>{1} << MantissaBits<T>()) - 1;
+}

 // Returns 1 << mantissa_bits as a floating-point number. All integers whose

@@ -569,6 +690,21 @@ constexpr double MantissaEnd<double>() {
   return 4503599627370496.0;  // 1 << 52
 }

+// Returns width in bits of the exponent field in IEEE binary32/64.
+template <typename T>
+constexpr int ExponentBits() {
+  // Exponent := remaining bits after deducting sign and mantissa.
+  return 8 * sizeof(T) - 1 - MantissaBits<T>();
+}
+
+// Returns largest value of the biased exponent field in IEEE binary32/64,
+// right-shifted so that the LSB is bit zero. Example: 0xFF for float.
+// This is expressed as a signed integer for more efficient comparison.
+template <typename T>
+constexpr MakeSigned<T> MaxExponentField() {
+  return (MakeSigned<T>{1} << ExponentBits<T>()) - 1;
+}
+
 //------------------------------------------------------------------------------
 // Helper functions
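
The generic formulas reproduce the hard-coded constants they replace; a quick hedged check (assuming these helpers stay in namespace hwy and are instantiated with the float type, whose MakeUnsigned is uint32_t/uint64_t):

```cpp
static_assert(hwy::SignMask<float>() == 0x80000000u, "");
static_assert(hwy::ExponentMask<float>() == 0x7F800000u, "");  // old constant
static_assert(hwy::MantissaMask<float>() == 0x007FFFFFu, "");  // old constant
static_assert(hwy::ExponentBits<float>() == 8, "");            // 32 - 1 - 23
static_assert(hwy::MaxExponentField<float>() == 0xFF, "");
static_assert(hwy::ExponentMask<double>() == 0x7FF0000000000000ull, "");
static_assert(hwy::MantissaMask<double>() == 0x000FFFFFFFFFFFFFull, "");
```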

@@ -602,7 +738,7 @@ HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x) {
 #else   // HWY_ARCH_X86_64
   // _BitScanForward64 not available
   uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
-  unsigned long index;
+  unsigned long index;  // NOLINT
   if (lsb == 0) {
     uint32_t msb = static_cast<uint32_t>(x >> 32u);
     _BitScanForward(&index, msb);

@@ -637,7 +773,7 @@ HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) {
 #else   // HWY_ARCH_X86_64
   // _BitScanReverse64 not available
   const uint32_t msb = static_cast<uint32_t>(x >> 32u);
-  unsigned long index;
+  unsigned long index;  // NOLINT
   if (msb == 0) {
     const uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
     _BitScanReverse(&index, lsb);

@@ -653,7 +789,7 @@ HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) {
 }

 HWY_API size_t PopCount(uint64_t x) {
-#if HWY_COMPILER_CLANG || HWY_COMPILER_GCC
+#if HWY_COMPILER_GCC  // includes clang
   return static_cast<size_t>(__builtin_popcountll(x));
 // This instruction has a separate feature flag, but is often called from
 // non-SIMD code, so we don't want to require dynamic dispatch. It was first

@@ -662,7 +798,8 @@ HWY_API size_t PopCount(uint64_t x) {
 #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 && defined(__AVX__)
   return _mm_popcnt_u64(x);
 #elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
-  return _mm_popcnt_u32(uint32_t(x)) + _mm_popcnt_u32(uint32_t(x >> 32));
+  return _mm_popcnt_u32(static_cast<uint32_t>(x & 0xFFFFFFFFu)) +
+         _mm_popcnt_u32(static_cast<uint32_t>(x >> 32));
 #else
   x -= ((x >> 1) & 0x5555555555555555ULL);
   x = (((x >> 2) & 0x3333333333333333ULL) + (x & 0x3333333333333333ULL));
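
`PopCount` is plain scalar API, so the result is the same whichever branch is compiled: the builtin, the popcnt intrinsics, or the portable SWAR fallback above. A trivial usage sketch:

```cpp
#include "hwy/base.h"

size_t CountBits() {
  return hwy::PopCount(0xF0F0u);  // == 8 on every path
}
```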

@@ -715,22 +852,30 @@ HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT upper) {
 #endif
 }

+#if HWY_COMPILER_MSVC
+#pragma intrinsic(memcpy)
+#pragma intrinsic(memset)
+#endif
+
 // The source/destination must not overlap/alias.
 template <size_t kBytes, typename From, typename To>
 HWY_API void CopyBytes(const From* from, To* to) {
 #if HWY_COMPILER_MSVC
-  const uint8_t* HWY_RESTRICT from_bytes =
-      reinterpret_cast<const uint8_t*>(from);
-  uint8_t* HWY_RESTRICT to_bytes = reinterpret_cast<uint8_t*>(to);
-  for (size_t i = 0; i < kBytes; ++i) {
-    to_bytes[i] = from_bytes[i];
-  }
+  memcpy(to, from, kBytes);
 #else
   // Avoids horrible codegen on Clang (series of PINSRB)
   __builtin_memcpy(to, from, kBytes);
 #endif
 }

+template <size_t kBytes, typename To>
+HWY_API void ZeroBytes(To* to) {
+#if HWY_COMPILER_MSVC
+  memset(to, 0, kBytes);
+#else
+  __builtin_memset(to, 0, kBytes);
+#endif
+}
+
 HWY_API float F32FromBF16(bfloat16_t bf) {
   uint32_t bits = bf.bits;
   bits <<= 16;
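
`CopyBytes` doubles as a well-defined bit_cast-style type pun, which is why efficient codegen (intrinsic `memcpy` on MSVC, `__builtin_memcpy` elsewhere) matters; a small sketch of typical usage:

```cpp
#include <stdint.h>
#include "hwy/base.h"

float F32FromBits(uint32_t bits) {
  float f;
  hwy::CopyBytes<sizeof(f)>(&bits, &f);  // compiles to a single move
  return f;
}

void ClearCacheLine(uint8_t* p) {
  hwy::ZeroBytes<64>(p);  // memset(p, 0, 64) on every compiler path
}
```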

third_party/highway/hwy/base_test.cc:

@@ -1,4 +1,5 @@
 // Copyright 2019 Google LLC
+// SPDX-License-Identifier: Apache-2.0
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.

@@ -21,7 +22,7 @@

 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "base_test.cc"
-#include "hwy/foreach_target.h"
+#include "hwy/foreach_target.h"  // IWYU pragma: keep
 #include "hwy/highway.h"
 #include "hwy/tests/test_util-inl.h"
@ -30,25 +31,26 @@ namespace hwy {
|
|||
namespace HWY_NAMESPACE {
|
||||
|
||||
HWY_NOINLINE void TestAllLimits() {
|
||||
HWY_ASSERT_EQ(uint8_t(0), LimitsMin<uint8_t>());
|
||||
HWY_ASSERT_EQ(uint16_t(0), LimitsMin<uint16_t>());
|
||||
HWY_ASSERT_EQ(uint32_t(0), LimitsMin<uint32_t>());
|
||||
HWY_ASSERT_EQ(uint64_t(0), LimitsMin<uint64_t>());
|
||||
HWY_ASSERT_EQ(uint8_t{0}, LimitsMin<uint8_t>());
|
||||
HWY_ASSERT_EQ(uint16_t{0}, LimitsMin<uint16_t>());
|
||||
HWY_ASSERT_EQ(uint32_t{0}, LimitsMin<uint32_t>());
|
||||
HWY_ASSERT_EQ(uint64_t{0}, LimitsMin<uint64_t>());
|
||||
|
||||
HWY_ASSERT_EQ(int8_t(-128), LimitsMin<int8_t>());
|
||||
HWY_ASSERT_EQ(int16_t(-32768), LimitsMin<int16_t>());
|
||||
HWY_ASSERT_EQ(int32_t(0x80000000u), LimitsMin<int32_t>());
|
||||
HWY_ASSERT_EQ(int64_t(0x8000000000000000ull), LimitsMin<int64_t>());
|
||||
HWY_ASSERT_EQ(int8_t{-128}, LimitsMin<int8_t>());
|
||||
HWY_ASSERT_EQ(int16_t{-32768}, LimitsMin<int16_t>());
|
||||
HWY_ASSERT_EQ(static_cast<int32_t>(0x80000000u), LimitsMin<int32_t>());
|
||||
HWY_ASSERT_EQ(static_cast<int64_t>(0x8000000000000000ull),
|
||||
LimitsMin<int64_t>());
|
||||
|
||||
HWY_ASSERT_EQ(uint8_t(0xFF), LimitsMax<uint8_t>());
|
||||
HWY_ASSERT_EQ(uint16_t(0xFFFF), LimitsMax<uint16_t>());
|
||||
HWY_ASSERT_EQ(uint32_t(0xFFFFFFFFu), LimitsMax<uint32_t>());
|
||||
HWY_ASSERT_EQ(uint64_t(0xFFFFFFFFFFFFFFFFull), LimitsMax<uint64_t>());
|
||||
HWY_ASSERT_EQ(uint8_t{0xFF}, LimitsMax<uint8_t>());
|
||||
HWY_ASSERT_EQ(uint16_t{0xFFFF}, LimitsMax<uint16_t>());
|
||||
HWY_ASSERT_EQ(uint32_t{0xFFFFFFFFu}, LimitsMax<uint32_t>());
|
||||
HWY_ASSERT_EQ(uint64_t{0xFFFFFFFFFFFFFFFFull}, LimitsMax<uint64_t>());
|
||||
|
||||
HWY_ASSERT_EQ(int8_t(0x7F), LimitsMax<int8_t>());
|
||||
HWY_ASSERT_EQ(int16_t(0x7FFF), LimitsMax<int16_t>());
|
||||
HWY_ASSERT_EQ(int32_t(0x7FFFFFFFu), LimitsMax<int32_t>());
|
||||
HWY_ASSERT_EQ(int64_t(0x7FFFFFFFFFFFFFFFull), LimitsMax<int64_t>());
|
||||
HWY_ASSERT_EQ(int8_t{0x7F}, LimitsMax<int8_t>());
|
||||
HWY_ASSERT_EQ(int16_t{0x7FFF}, LimitsMax<int16_t>());
|
||||
HWY_ASSERT_EQ(int32_t{0x7FFFFFFFu}, LimitsMax<int32_t>());
|
||||
HWY_ASSERT_EQ(int64_t{0x7FFFFFFFFFFFFFFFull}, LimitsMax<int64_t>());
|
||||
}
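
// Note on the change above: brace initialization such as uint8_t{0} rejects
// narrowing conversions at compile time, unlike function-style casts such as
// int32_t(0x80000000u); values that do not fit the destination type therefore
// switch to an explicit static_cast.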

struct TestLowestHighest {

@ -88,6 +90,10 @@ HWY_NOINLINE void TestAllType() {
  ForUnsignedTypes(TestIsUnsigned());
  ForSignedTypes(TestIsSigned());
  ForFloatTypes(TestIsFloat());

  static_assert(sizeof(MakeUnsigned<hwy::uint128_t>) == 16, "");
  static_assert(sizeof(MakeWide<uint64_t>) == 16, "Expected uint128_t");
  static_assert(sizeof(MakeNarrow<hwy::uint128_t>) == 8, "Expected uint64_t");
}

struct TestIsSame {

@ -102,54 +108,54 @@ struct TestIsSame {
HWY_NOINLINE void TestAllIsSame() { ForAllTypes(TestIsSame()); }

HWY_NOINLINE void TestAllBitScan() {
  HWY_ASSERT_EQ(size_t(0), Num0BitsAboveMS1Bit_Nonzero32(0x80000000u));
  HWY_ASSERT_EQ(size_t(0), Num0BitsAboveMS1Bit_Nonzero32(0xFFFFFFFFu));
  HWY_ASSERT_EQ(size_t(1), Num0BitsAboveMS1Bit_Nonzero32(0x40000000u));
  HWY_ASSERT_EQ(size_t(1), Num0BitsAboveMS1Bit_Nonzero32(0x40108210u));
  HWY_ASSERT_EQ(size_t(30), Num0BitsAboveMS1Bit_Nonzero32(2u));
  HWY_ASSERT_EQ(size_t(30), Num0BitsAboveMS1Bit_Nonzero32(3u));
  HWY_ASSERT_EQ(size_t(31), Num0BitsAboveMS1Bit_Nonzero32(1u));
  HWY_ASSERT_EQ(size_t{0}, Num0BitsAboveMS1Bit_Nonzero32(0x80000000u));
  HWY_ASSERT_EQ(size_t{0}, Num0BitsAboveMS1Bit_Nonzero32(0xFFFFFFFFu));
  HWY_ASSERT_EQ(size_t{1}, Num0BitsAboveMS1Bit_Nonzero32(0x40000000u));
  HWY_ASSERT_EQ(size_t{1}, Num0BitsAboveMS1Bit_Nonzero32(0x40108210u));
  HWY_ASSERT_EQ(size_t{30}, Num0BitsAboveMS1Bit_Nonzero32(2u));
  HWY_ASSERT_EQ(size_t{30}, Num0BitsAboveMS1Bit_Nonzero32(3u));
  HWY_ASSERT_EQ(size_t{31}, Num0BitsAboveMS1Bit_Nonzero32(1u));

  HWY_ASSERT_EQ(size_t(0),
  HWY_ASSERT_EQ(size_t{0},
                Num0BitsAboveMS1Bit_Nonzero64(0x8000000000000000ull));
  HWY_ASSERT_EQ(size_t(0),
  HWY_ASSERT_EQ(size_t{0},
                Num0BitsAboveMS1Bit_Nonzero64(0xFFFFFFFFFFFFFFFFull));
  HWY_ASSERT_EQ(size_t(1),
  HWY_ASSERT_EQ(size_t{1},
                Num0BitsAboveMS1Bit_Nonzero64(0x4000000000000000ull));
  HWY_ASSERT_EQ(size_t(1),
  HWY_ASSERT_EQ(size_t{1},
                Num0BitsAboveMS1Bit_Nonzero64(0x4010821004200011ull));
  HWY_ASSERT_EQ(size_t(62), Num0BitsAboveMS1Bit_Nonzero64(2ull));
  HWY_ASSERT_EQ(size_t(62), Num0BitsAboveMS1Bit_Nonzero64(3ull));
  HWY_ASSERT_EQ(size_t(63), Num0BitsAboveMS1Bit_Nonzero64(1ull));
  HWY_ASSERT_EQ(size_t{62}, Num0BitsAboveMS1Bit_Nonzero64(2ull));
  HWY_ASSERT_EQ(size_t{62}, Num0BitsAboveMS1Bit_Nonzero64(3ull));
  HWY_ASSERT_EQ(size_t{63}, Num0BitsAboveMS1Bit_Nonzero64(1ull));

  HWY_ASSERT_EQ(size_t(0), Num0BitsBelowLS1Bit_Nonzero32(1u));
  HWY_ASSERT_EQ(size_t(1), Num0BitsBelowLS1Bit_Nonzero32(2u));
  HWY_ASSERT_EQ(size_t(30), Num0BitsBelowLS1Bit_Nonzero32(0xC0000000u));
  HWY_ASSERT_EQ(size_t(31), Num0BitsBelowLS1Bit_Nonzero32(0x80000000u));
  HWY_ASSERT_EQ(size_t{0}, Num0BitsBelowLS1Bit_Nonzero32(1u));
  HWY_ASSERT_EQ(size_t{1}, Num0BitsBelowLS1Bit_Nonzero32(2u));
  HWY_ASSERT_EQ(size_t{30}, Num0BitsBelowLS1Bit_Nonzero32(0xC0000000u));
  HWY_ASSERT_EQ(size_t{31}, Num0BitsBelowLS1Bit_Nonzero32(0x80000000u));

  HWY_ASSERT_EQ(size_t(0), Num0BitsBelowLS1Bit_Nonzero64(1ull));
  HWY_ASSERT_EQ(size_t(1), Num0BitsBelowLS1Bit_Nonzero64(2ull));
  HWY_ASSERT_EQ(size_t(62),
  HWY_ASSERT_EQ(size_t{0}, Num0BitsBelowLS1Bit_Nonzero64(1ull));
  HWY_ASSERT_EQ(size_t{1}, Num0BitsBelowLS1Bit_Nonzero64(2ull));
  HWY_ASSERT_EQ(size_t{62},
                Num0BitsBelowLS1Bit_Nonzero64(0xC000000000000000ull));
  HWY_ASSERT_EQ(size_t(63),
  HWY_ASSERT_EQ(size_t{63},
                Num0BitsBelowLS1Bit_Nonzero64(0x8000000000000000ull));
}
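
// For reference: Num0BitsAboveMS1Bit_Nonzero* is a count-leading-zeros and
// Num0BitsBelowLS1Bit_Nonzero* a count-trailing-zeros; the _Nonzero suffix
// reflects that the underlying clz/ctz compiler intrinsics are undefined for
// an argument of 0.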

HWY_NOINLINE void TestAllPopCount() {
  HWY_ASSERT_EQ(size_t(0), PopCount(0u));
  HWY_ASSERT_EQ(size_t(1), PopCount(1u));
  HWY_ASSERT_EQ(size_t(1), PopCount(2u));
  HWY_ASSERT_EQ(size_t(2), PopCount(3u));
  HWY_ASSERT_EQ(size_t(1), PopCount(0x80000000u));
  HWY_ASSERT_EQ(size_t(31), PopCount(0x7FFFFFFFu));
  HWY_ASSERT_EQ(size_t(32), PopCount(0xFFFFFFFFu));
  HWY_ASSERT_EQ(size_t{0}, PopCount(0u));
  HWY_ASSERT_EQ(size_t{1}, PopCount(1u));
  HWY_ASSERT_EQ(size_t{1}, PopCount(2u));
  HWY_ASSERT_EQ(size_t{2}, PopCount(3u));
  HWY_ASSERT_EQ(size_t{1}, PopCount(0x80000000u));
  HWY_ASSERT_EQ(size_t{31}, PopCount(0x7FFFFFFFu));
  HWY_ASSERT_EQ(size_t{32}, PopCount(0xFFFFFFFFu));

  HWY_ASSERT_EQ(size_t(1), PopCount(0x80000000ull));
  HWY_ASSERT_EQ(size_t(31), PopCount(0x7FFFFFFFull));
  HWY_ASSERT_EQ(size_t(32), PopCount(0xFFFFFFFFull));
  HWY_ASSERT_EQ(size_t(33), PopCount(0x10FFFFFFFFull));
  HWY_ASSERT_EQ(size_t(63), PopCount(0xFFFEFFFFFFFFFFFFull));
  HWY_ASSERT_EQ(size_t(64), PopCount(0xFFFFFFFFFFFFFFFFull));
  HWY_ASSERT_EQ(size_t{1}, PopCount(0x80000000ull));
  HWY_ASSERT_EQ(size_t{31}, PopCount(0x7FFFFFFFull));
  HWY_ASSERT_EQ(size_t{32}, PopCount(0xFFFFFFFFull));
  HWY_ASSERT_EQ(size_t{33}, PopCount(0x10FFFFFFFFull));
  HWY_ASSERT_EQ(size_t{63}, PopCount(0xFFFEFFFFFFFFFFFFull));
  HWY_ASSERT_EQ(size_t{64}, PopCount(0xFFFFFFFFFFFFFFFFull));
}

// NOLINTNEXTLINE(google-readability-namespace-comments)

@ -169,10 +175,4 @@ HWY_EXPORT_AND_TEST_P(BaseTest, TestAllBitScan);
HWY_EXPORT_AND_TEST_P(BaseTest, TestAllPopCount);
}  // namespace hwy

// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char **argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}

#endif

@ -1,4 +1,5 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

@ -50,10 +51,10 @@ namespace hwy {
#define HWY_ATTR_CACHE
#endif

// Delays subsequent loads until prior loads are visible. On Intel CPUs, also
// serves as a full fence (waits for all prior instructions to complete).
// No effect on non-x86.
// DEPRECATED due to differing behavior across architectures AND vendors.
// Delays subsequent loads until prior loads are visible. Beware of potentially
// differing behavior across architectures and vendors: on Intel but not
// AMD CPUs, also serves as a full fence (waits for all prior instructions to
// complete).
HWY_INLINE HWY_ATTR_CACHE void LoadFence() {
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
  _mm_lfence();

@ -76,7 +77,7 @@ template <typename T>
HWY_INLINE HWY_ATTR_CACHE void Prefetch(const T* p) {
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
  _mm_prefetch(reinterpret_cast<const char*>(p), _MM_HINT_T0);
#elif HWY_COMPILER_GCC || HWY_COMPILER_CLANG
#elif HWY_COMPILER_GCC  // includes clang
  // Hint=0 (NTA) behavior differs, but skipping outer caches is probably not
  // desirable, so use the default 3 (keep in caches).
  __builtin_prefetch(p, /*write=*/0, /*hint=*/3);

@ -0,0 +1,138 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Per-target include guard
#if defined(HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_) == \
    defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
#undef HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
#else
#define HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
#endif
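
// How this guard works: foreach_target.h re-includes the translation unit
// once per enabled target and flips HWY_TARGET_TOGGLE between passes, so the
// #if above is true exactly when this header has not yet been seen for the
// current toggle state. Each target thus compiles its own copy, while double
// inclusion within a single pass is still prevented.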

#include <string.h>  // memcpy

#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// These functions avoid having to write a loop plus remainder handling in the
// (unfortunately still common) case where arrays are not aligned/padded. If the
// inputs are known to be aligned/padded, it is more efficient to write a single
// loop using Load(). We do not provide a CopyAlignedPadded because it
// would be more verbose than such a loop.

// Fills `to`[0, `count`) with `value`.
template <class D, typename T = TFromD<D>>
void Fill(D d, T value, size_t count, T* HWY_RESTRICT to) {
  const size_t N = Lanes(d);
  const Vec<D> v = Set(d, value);

  size_t idx = 0;
  for (; idx + N <= count; idx += N) {
    StoreU(v, d, to + idx);
  }

  // `count` was a multiple of the vector length `N`: already done.
  if (HWY_UNLIKELY(idx == count)) return;

  const size_t remaining = count - idx;
  HWY_DASSERT(0 != remaining && remaining < N);
  SafeFillN(remaining, value, d, to + idx);
}

// Copies `from`[0, `count`) to `to`, which must not overlap `from`.
template <class D, typename T = TFromD<D>>
void Copy(D d, const T* HWY_RESTRICT from, size_t count, T* HWY_RESTRICT to) {
  const size_t N = Lanes(d);

  size_t idx = 0;
  for (; idx + N <= count; idx += N) {
    const Vec<D> v = LoadU(d, from + idx);
    StoreU(v, d, to + idx);
  }

  // `count` was a multiple of the vector length `N`: already done.
  if (HWY_UNLIKELY(idx == count)) return;

  const size_t remaining = count - idx;
  HWY_DASSERT(0 != remaining && remaining < N);
  SafeCopyN(remaining, d, from + idx, to + idx);
}

// For idx in [0, count) in ascending order, appends `from[idx]` to `to` if the
// corresponding mask element of `func(d, v)` is true. Returns the STL-style end
// of the newly written elements in `to`.
//
// `func` is either a functor with a templated operator()(d, v) returning a
// mask, or a generic lambda if using C++14. Due to apparent limitations of
// Clang on Windows, it is currently necessary to add HWY_ATTR before the
// opening { of the lambda to avoid errors about "function .. requires target".
//
// NOTE: this is only supported for 16-, 32- or 64-bit types.
// NOTE: Func may be called a second time for elements it has already seen, but
// these elements will not be written to `to` again.
template <class D, class Func, typename T = TFromD<D>>
T* CopyIf(D d, const T* HWY_RESTRICT from, size_t count, T* HWY_RESTRICT to,
          const Func& func) {
  const size_t N = Lanes(d);

  size_t idx = 0;
  for (; idx + N <= count; idx += N) {
    const Vec<D> v = LoadU(d, from + idx);
    to += CompressBlendedStore(v, func(d, v), d, to);
  }

  // `count` was a multiple of the vector length `N`: already done.
  if (HWY_UNLIKELY(idx == count)) return to;

#if HWY_MEM_OPS_MIGHT_FAULT
  // Proceed one by one.
  const CappedTag<T, 1> d1;
  for (; idx < count; ++idx) {
    using V1 = Vec<decltype(d1)>;
    // Workaround for -Waggressive-loop-optimizations on GCC 8
    // (iteration 2305843009213693951 invokes undefined behavior for T=i64)
    const uintptr_t addr = reinterpret_cast<uintptr_t>(from);
    const T* HWY_RESTRICT from_idx =
        reinterpret_cast<const T * HWY_RESTRICT>(addr + (idx * sizeof(T)));
    const V1 v = LoadU(d1, from_idx);
    // Avoid storing to `to` unless we know it should be kept - otherwise, we
    // might overrun the end if it was allocated for the exact count.
    if (CountTrue(d1, func(d1, v)) == 0) continue;
    StoreU(v, d1, to);
    to += 1;
  }
#else
  // Start index of the last unaligned whole vector, ending at the array end.
  const size_t last = count - N;
  // Number of elements before `from` or already written.
  const size_t invalid = idx - last;
  HWY_DASSERT(0 != invalid && invalid < N);
  const Mask<D> mask = Not(FirstN(d, invalid));
  const Vec<D> v = MaskedLoad(mask, d, from + last);
  to += CompressBlendedStore(v, And(mask, func(d, v)), d, to);
#endif
  return to;
}
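
// Note on CopyIf's #else branch above: the final MaskedLoad reads the window
// of N elements ending exactly at from + count (i.e. starting at `last`).
// Its first `invalid` = idx - last = N - remaining lanes either precede
// `from` or were already compressed by the main loop, so Not(FirstN(d,
// invalid)) excludes them; memory is revisited, but each element is written
// at most once.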

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#endif  // HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
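
// Usage sketch (illustrative; static dispatch without foreach_target.h --
// `KeepNegative` and its parameters are hypothetical, not part of this diff):
#include "hwy/contrib/algo/copy-inl.h"

namespace hn = hwy::HWY_NAMESPACE;

// Compacts the negative elements of in[0, count) into out and returns how
// many were kept. Zeroes out[0, count) first so the tail stays defined; out
// needs one vector of padding beyond count (see TestCopyIf below).
size_t KeepNegative(const float* HWY_RESTRICT in, size_t count,
                    float* HWY_RESTRICT out) {
  const hn::ScalableTag<float> d;
  hn::Fill(d, 0.0f, count, out);
  float* const end = hn::CopyIf(d, in, count, out,
                                [](const auto d, const auto v) HWY_ATTR {
                                  return hn::Lt(v, hn::Zero(d));
                                });
  return static_cast<size_t>(end - out);
}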

@ -0,0 +1,199 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/aligned_allocator.h"

// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/algo/copy_test.cc"
#include "hwy/foreach_target.h"  // IWYU pragma: keep

#include "hwy/contrib/algo/copy-inl.h"
#include "hwy/tests/test_util-inl.h"
// clang-format on

// If your project requires C++14 or later, you can ignore this and pass
// lambdas directly to CopyIf, without requiring an lvalue as we do here for
// C++11.

#if __cplusplus < 201402L
#define HWY_GENERIC_LAMBDA 0
#else
#define HWY_GENERIC_LAMBDA 1
#endif

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// Returns random integer in [0, 128), which fits in any lane type.
template <typename T>
T Random7Bit(RandomState& rng) {
  return static_cast<T>(Random32(&rng) & 127);
}

// In C++14, we can instead define these as generic lambdas next to where they
// are invoked.
#if !HWY_GENERIC_LAMBDA

struct IsOdd {
  template <class D, class V>
  Mask<D> operator()(D d, V v) const {
    return TestBit(v, Set(d, TFromD<D>{1}));
  }
};

#endif  // !HWY_GENERIC_LAMBDA

// Invokes Test (e.g. TestCopyIf) with all arg combinations. T comes from
// ForFloatTypes.
template <class Test>
struct ForeachCountAndMisalign {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) const {
    RandomState rng;
    const size_t N = Lanes(d);
    const size_t misalignments[3] = {0, N / 4, 3 * N / 5};

    for (size_t count = 0; count < 2 * N; ++count) {
      for (size_t ma : misalignments) {
        for (size_t mb : misalignments) {
          Test()(d, count, ma, mb, rng);
        }
      }
    }
  }
};

struct TestFill {
  template <class D>
  void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
                  RandomState& rng) {
    using T = TFromD<D>;
    // HWY_MAX prevents error when misalign == count == 0.
    AlignedFreeUniquePtr<T[]> pa =
        AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
    T* expected = pa.get() + misalign_a;
    const T value = Random7Bit<T>(rng);
    for (size_t i = 0; i < count; ++i) {
      expected[i] = value;
    }
    AlignedFreeUniquePtr<T[]> pb = AllocateAligned<T>(misalign_b + count + 1);
    T* actual = pb.get() + misalign_b;

    actual[count] = T{0};  // sentinel
    Fill(d, value, count, actual);
    HWY_ASSERT_EQ(T{0}, actual[count]);  // did not write past end

    const auto info = hwy::detail::MakeTypeInfo<T>();
    const char* target_name = hwy::TargetName(HWY_TARGET);
    hwy::detail::AssertArrayEqual(info, expected, actual, count, target_name,
                                  __FILE__, __LINE__);
  }
};

void TestAllFill() {
  ForAllTypes(ForPartialVectors<ForeachCountAndMisalign<TestFill>>());
}

struct TestCopy {
  template <class D>
  void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
                  RandomState& rng) {
    using T = TFromD<D>;
    // Prevents error if size to allocate is zero.
    AlignedFreeUniquePtr<T[]> pa =
        AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
    T* a = pa.get() + misalign_a;
    for (size_t i = 0; i < count; ++i) {
      a[i] = Random7Bit<T>(rng);
    }
    AlignedFreeUniquePtr<T[]> pb =
        AllocateAligned<T>(HWY_MAX(1, misalign_b + count));
    T* b = pb.get() + misalign_b;

    Copy(d, a, count, b);

    const auto info = hwy::detail::MakeTypeInfo<T>();
    const char* target_name = hwy::TargetName(HWY_TARGET);
    hwy::detail::AssertArrayEqual(info, a, b, count, target_name, __FILE__,
                                  __LINE__);
  }
};

void TestAllCopy() {
  ForAllTypes(ForPartialVectors<ForeachCountAndMisalign<TestCopy>>());
}

struct TestCopyIf {
  template <class D>
  void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
                  RandomState& rng) {
    using T = TFromD<D>;
    // Prevents error if size to allocate is zero.
    AlignedFreeUniquePtr<T[]> pa =
        AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
    T* a = pa.get() + misalign_a;
    for (size_t i = 0; i < count; ++i) {
      a[i] = Random7Bit<T>(rng);
    }
    const size_t padding = Lanes(ScalableTag<T>());
    AlignedFreeUniquePtr<T[]> pb =
        AllocateAligned<T>(HWY_MAX(1, misalign_b + count + padding));
    T* b = pb.get() + misalign_b;

    AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(HWY_MAX(1, count));
    size_t num_odd = 0;
    for (size_t i = 0; i < count; ++i) {
      if (a[i] & 1) {
        expected[num_odd++] = a[i];
      }
    }

#if HWY_GENERIC_LAMBDA
    const auto is_odd = [](const auto d, const auto v) HWY_ATTR {
      return TestBit(v, Set(d, TFromD<decltype(d)>{1}));
    };
#else
    const IsOdd is_odd;
#endif
    T* end = CopyIf(d, a, count, b, is_odd);
    const size_t num_written = static_cast<size_t>(end - b);
    HWY_ASSERT_EQ(num_odd, num_written);

    const auto info = hwy::detail::MakeTypeInfo<T>();
    const char* target_name = hwy::TargetName(HWY_TARGET);
    hwy::detail::AssertArrayEqual(info, expected.get(), b, num_odd, target_name,
                                  __FILE__, __LINE__);
  }
};

void TestAllCopyIf() {
  ForUI163264(ForPartialVectors<ForeachCountAndMisalign<TestCopyIf>>());
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE

namespace hwy {
HWY_BEFORE_TEST(CopyTest);
HWY_EXPORT_AND_TEST_P(CopyTest, TestAllFill);
HWY_EXPORT_AND_TEST_P(CopyTest, TestAllCopy);
HWY_EXPORT_AND_TEST_P(CopyTest, TestAllCopyIf);
}  // namespace hwy

#endif

@ -0,0 +1,109 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Per-target include guard
#if defined(HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_) == \
    defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_
#undef HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_
#else
#define HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_
#endif

#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// Returns index of the first element equal to `value` in `in[0, count)`, or
// `count` if not found.
template <class D, typename T = TFromD<D>>
size_t Find(D d, T value, const T* HWY_RESTRICT in, size_t count) {
  const size_t N = Lanes(d);
  const Vec<D> broadcasted = Set(d, value);

  size_t i = 0;
  for (; i + N <= count; i += N) {
    const intptr_t pos = FindFirstTrue(d, Eq(broadcasted, LoadU(d, in + i)));
    if (pos >= 0) return i + static_cast<size_t>(pos);
  }

  if (i != count) {
#if HWY_MEM_OPS_MIGHT_FAULT
    // Scan single elements.
    const CappedTag<T, 1> d1;
    using V1 = Vec<decltype(d1)>;
    const V1 broadcasted1 = Set(d1, GetLane(broadcasted));
    for (; i < count; ++i) {
      if (AllTrue(d1, Eq(broadcasted1, LoadU(d1, in + i)))) {
        return i;
      }
    }
#else
    const size_t remaining = count - i;
    HWY_DASSERT(0 != remaining && remaining < N);
    const Mask<D> mask = FirstN(d, remaining);
    const Vec<D> v = MaskedLoad(mask, d, in + i);
    // Apply mask so that we don't 'find' the zero-padding from MaskedLoad.
    const intptr_t pos = FindFirstTrue(d, And(Eq(broadcasted, v), mask));
    if (pos >= 0) return i + static_cast<size_t>(pos);
#endif  // HWY_MEM_OPS_MIGHT_FAULT
  }

  return count;  // not found
}

// Returns index of the first element in `in[0, count)` for which `func(d, vec)`
// returns true, otherwise `count`.
template <class D, class Func, typename T = TFromD<D>>
size_t FindIf(D d, const T* HWY_RESTRICT in, size_t count, const Func& func) {
  const size_t N = Lanes(d);

  size_t i = 0;
  for (; i + N <= count; i += N) {
    const intptr_t pos = FindFirstTrue(d, func(d, LoadU(d, in + i)));
    if (pos >= 0) return i + static_cast<size_t>(pos);
  }

  if (i != count) {
#if HWY_MEM_OPS_MIGHT_FAULT
    // Scan single elements.
    const CappedTag<T, 1> d1;
    for (; i < count; ++i) {
      if (AllTrue(d1, func(d1, LoadU(d1, in + i)))) {
        return i;
      }
    }
#else
    const size_t remaining = count - i;
    HWY_DASSERT(0 != remaining && remaining < N);
    const Mask<D> mask = FirstN(d, remaining);
    const Vec<D> v = MaskedLoad(mask, d, in + i);
    // Apply mask so that we don't 'find' the zero-padding from MaskedLoad.
    const intptr_t pos = FindFirstTrue(d, And(func(d, v), mask));
    if (pos >= 0) return i + static_cast<size_t>(pos);
#endif  // HWY_MEM_OPS_MIGHT_FAULT
  }

  return count;  // not found
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#endif  // HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_
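
// Usage sketch (illustrative; static dispatch -- `FirstAbove` is a
// hypothetical caller, not part of this diff):
#include "hwy/contrib/algo/find-inl.h"

namespace hn = hwy::HWY_NAMESPACE;

// Returns the index of the first element exceeding `threshold`, or `count`.
size_t FirstAbove(const float* HWY_RESTRICT in, size_t count,
                  float threshold) {
  const hn::ScalableTag<float> d;
  return hn::FindIf(d, in, count,
                    [threshold](const auto d, const auto v) HWY_ATTR {
                      return hn::Gt(v, hn::Set(d, threshold));
                    });
}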

@ -0,0 +1,219 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <algorithm>
#include <vector>

#include "hwy/aligned_allocator.h"
#include "hwy/base.h"
#include "hwy/print.h"

// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/algo/find_test.cc"
#include "hwy/foreach_target.h"  // IWYU pragma: keep

#include "hwy/contrib/algo/find-inl.h"
#include "hwy/tests/test_util-inl.h"
// clang-format on

// If your project requires C++14 or later, you can ignore this and pass
// lambdas directly to FindIf, without requiring an lvalue as we do here for
// C++11.
#if __cplusplus < 201402L
#define HWY_GENERIC_LAMBDA 0
#else
#define HWY_GENERIC_LAMBDA 1
#endif

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// Returns random number in [-8, 8) - we use knowledge of the range to Find()
// values we know are not present.
template <typename T>
T Random(RandomState& rng) {
  const int32_t bits = static_cast<int32_t>(Random32(&rng)) & 1023;
  const double val = (bits - 512) / 64.0;
  // Clamp negative to zero for unsigned types.
  return static_cast<T>(HWY_MAX(hwy::LowestValue<T>(), val));
}

// In C++14, we can instead define these as generic lambdas next to where they
// are invoked.
#if !HWY_GENERIC_LAMBDA

class GreaterThan {
 public:
  GreaterThan(int val) : val_(val) {}
  template <class D, class V>
  Mask<D> operator()(D d, V v) const {
    return Gt(v, Set(d, static_cast<TFromD<D>>(val_)));
  }

 private:
  int val_;
};

#endif  // !HWY_GENERIC_LAMBDA

// Invokes Test (e.g. TestFind) with all arg combinations.
template <class Test>
struct ForeachCountAndMisalign {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) const {
    RandomState rng;
    const size_t N = Lanes(d);
    const size_t misalignments[3] = {0, N / 4, 3 * N / 5};

    // Find() checks 8 vectors at a time, so we want to cover a fairly large
    // range without oversampling (checking every possible count).
    std::vector<size_t> counts(AdjustedReps(512));
    for (size_t& count : counts) {
      count = static_cast<size_t>(rng()) % (16 * N + 1);
    }
    counts[0] = 0;  // ensure we test count=0.

    for (size_t count : counts) {
      for (size_t m : misalignments) {
        Test()(d, count, m, rng);
      }
    }
  }
};

struct TestFind {
  template <class D>
  void operator()(D d, size_t count, size_t misalign, RandomState& rng) {
    using T = TFromD<D>;
    // Must allocate at least one even if count is zero.
    AlignedFreeUniquePtr<T[]> storage =
        AllocateAligned<T>(HWY_MAX(1, misalign + count));
    T* in = storage.get() + misalign;
    for (size_t i = 0; i < count; ++i) {
      in[i] = Random<T>(rng);
    }

    // For each position, search for that element (which we know is there)
    for (size_t pos = 0; pos < count; ++pos) {
      const size_t actual = Find(d, in[pos], in, count);

      // We may have found an earlier occurrence of the same value; ensure the
      // value is the same, and that it is the first.
      if (!IsEqual(in[pos], in[actual])) {
        fprintf(stderr, "%s count %d, found %.15f at %d but wanted %.15f\n",
                hwy::TypeName(T(), Lanes(d)).c_str(), static_cast<int>(count),
                static_cast<double>(in[actual]), static_cast<int>(actual),
                static_cast<double>(in[pos]));
        HWY_ASSERT(false);
      }
      for (size_t i = 0; i < actual; ++i) {
        if (IsEqual(in[i], in[pos])) {
          fprintf(stderr, "%s count %d, found %f at %d but Find returned %d\n",
                  hwy::TypeName(T(), Lanes(d)).c_str(), static_cast<int>(count),
                  static_cast<double>(in[i]), static_cast<int>(i),
                  static_cast<int>(actual));
          HWY_ASSERT(false);
        }
      }
    }

    // Also search for values we know not to be present (out of range)
    HWY_ASSERT_EQ(count, Find(d, T{9}, in, count));
    HWY_ASSERT_EQ(count, Find(d, static_cast<T>(-9), in, count));
  }
};

void TestAllFind() {
  ForAllTypes(ForPartialVectors<ForeachCountAndMisalign<TestFind>>());
}

struct TestFindIf {
  template <class D>
  void operator()(D d, size_t count, size_t misalign, RandomState& rng) {
    using T = TFromD<D>;
    using TI = MakeSigned<T>;
    // Must allocate at least one even if count is zero.
    AlignedFreeUniquePtr<T[]> storage =
        AllocateAligned<T>(HWY_MAX(1, misalign + count));
    T* in = storage.get() + misalign;
    for (size_t i = 0; i < count; ++i) {
      in[i] = Random<T>(rng);
      HWY_ASSERT(in[i] < 8);
      HWY_ASSERT(!hwy::IsSigned<T>() || static_cast<TI>(in[i]) >= -8);
    }

    bool found_any = false;
    bool not_found_any = false;

    // unsigned T would be promoted to signed and compare greater than any
    // negative val, whereas Set() would just cast to an unsigned value and the
    // comparison remains unsigned, so avoid negative numbers there.
    const int min_val = IsSigned<T>() ? -9 : 0;
    // Includes out-of-range value 9 to test the not-found path.
    for (int val = min_val; val <= 9; ++val) {
#if HWY_GENERIC_LAMBDA
      const auto greater = [val](const auto d, const auto v) HWY_ATTR {
        return Gt(v, Set(d, static_cast<T>(val)));
      };
#else
      const GreaterThan greater(val);
#endif
      const size_t actual = FindIf(d, in, count, greater);
      found_any |= actual < count;
      not_found_any |= actual == count;

      const auto pos = std::find_if(
          in, in + count, [val](T x) { return x > static_cast<T>(val); });
      // Convert returned iterator to index.
      const size_t expected = static_cast<size_t>(pos - in);
      if (expected != actual) {
        fprintf(stderr, "%s count %d val %d, expected %d actual %d\n",
                hwy::TypeName(T(), Lanes(d)).c_str(), static_cast<int>(count),
                val, static_cast<int>(expected), static_cast<int>(actual));
        hwy::detail::PrintArray(hwy::detail::MakeTypeInfo<T>(), "in", in, count,
                                0, count);
        HWY_ASSERT(false);
      }
    }

    // We will always not-find something due to val=9.
    HWY_ASSERT(not_found_any);
    // We'll find something unless the input is empty or {0}, because 0 > val
    // is false for all val in [0, 9].
    if (count != 0 && in[0] != 0) {
      HWY_ASSERT(found_any);
    }
  }
};

void TestAllFindIf() {
  ForAllTypes(ForPartialVectors<ForeachCountAndMisalign<TestFindIf>>());
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE

namespace hwy {
HWY_BEFORE_TEST(FindTest);
HWY_EXPORT_AND_TEST_P(FindTest, TestAllFind);
HWY_EXPORT_AND_TEST_P(FindTest, TestAllFindIf);
}  // namespace hwy

#endif

@ -0,0 +1,262 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Per-target include guard
#if defined(HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_) == \
    defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_
#undef HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_
#else
#define HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_
#endif

#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// These functions avoid having to write a loop plus remainder handling in the
// (unfortunately still common) case where arrays are not aligned/padded. If the
// inputs are known to be aligned/padded, it is more efficient to write a single
// loop using Load(). We do not provide a TransformAlignedPadded because it
// would be more verbose than such a loop.
//
// Func is either a functor with a templated operator()(d, v[, v1[, v2]]), or a
// generic lambda if using C++14. Due to apparent limitations of Clang on
// Windows, it is currently necessary to add HWY_ATTR before the opening { of
// the lambda to avoid errors about "always_inline function .. requires target".
//
// If HWY_MEM_OPS_MIGHT_FAULT, we use scalar code instead of masking. Otherwise,
// we use `MaskedLoad` and `BlendedStore` to read/write the final partial
// vector.

// Fills `out[0, count)` with the vectors returned by `func(d, index_vec)`,
// where `index_vec` is `Vec<RebindToUnsigned<D>>`. On the first call to `func`,
// the value of its lane i is i, and increases by `Lanes(d)` after every call.
// Note that some of these indices may be `>= count`, but the elements that
// `func` returns in those lanes will not be written to `out`.
template <class D, class Func, typename T = TFromD<D>>
void Generate(D d, T* HWY_RESTRICT out, size_t count, const Func& func) {
  const RebindToUnsigned<D> du;
  using TU = TFromD<decltype(du)>;
  const size_t N = Lanes(d);

  size_t idx = 0;
  Vec<decltype(du)> vidx = Iota(du, 0);
  for (; idx + N <= count; idx += N) {
    StoreU(func(d, vidx), d, out + idx);
    vidx = Add(vidx, Set(du, static_cast<TU>(N)));
  }

  // `count` was a multiple of the vector length `N`: already done.
  if (HWY_UNLIKELY(idx == count)) return;

#if HWY_MEM_OPS_MIGHT_FAULT
  // Proceed one by one.
  const CappedTag<T, 1> d1;
  const RebindToUnsigned<decltype(d1)> du1;
  for (; idx < count; ++idx) {
    StoreU(func(d1, Set(du1, static_cast<TU>(idx))), d1, out + idx);
  }
#else
  const size_t remaining = count - idx;
  HWY_DASSERT(0 != remaining && remaining < N);
  const Mask<D> mask = FirstN(d, remaining);
  BlendedStore(func(d, vidx), mask, d, out + idx);
#endif
}

// Replaces `inout[idx]` with `func(d, inout[idx])`. Example usage: multiplying
// array elements by a constant.
template <class D, class Func, typename T = TFromD<D>>
void Transform(D d, T* HWY_RESTRICT inout, size_t count, const Func& func) {
  const size_t N = Lanes(d);

  size_t idx = 0;
  for (; idx + N <= count; idx += N) {
    const Vec<D> v = LoadU(d, inout + idx);
    StoreU(func(d, v), d, inout + idx);
  }

  // `count` was a multiple of the vector length `N`: already done.
  if (HWY_UNLIKELY(idx == count)) return;

#if HWY_MEM_OPS_MIGHT_FAULT
  // Proceed one by one.
  const CappedTag<T, 1> d1;
  for (; idx < count; ++idx) {
    using V1 = Vec<decltype(d1)>;
    const V1 v = LoadU(d1, inout + idx);
    StoreU(func(d1, v), d1, inout + idx);
  }
#else
  const size_t remaining = count - idx;
  HWY_DASSERT(0 != remaining && remaining < N);
  const Mask<D> mask = FirstN(d, remaining);
  const Vec<D> v = MaskedLoad(mask, d, inout + idx);
  BlendedStore(func(d, v), mask, d, inout + idx);
#endif
}

// Replaces `inout[idx]` with `func(d, inout[idx], in1[idx])`. Example usage:
// multiplying array elements by those of another array.
template <class D, class Func, typename T = TFromD<D>>
void Transform1(D d, T* HWY_RESTRICT inout, size_t count,
                const T* HWY_RESTRICT in1, const Func& func) {
  const size_t N = Lanes(d);

  size_t idx = 0;
  for (; idx + N <= count; idx += N) {
    const Vec<D> v = LoadU(d, inout + idx);
    const Vec<D> v1 = LoadU(d, in1 + idx);
    StoreU(func(d, v, v1), d, inout + idx);
  }

  // `count` was a multiple of the vector length `N`: already done.
  if (HWY_UNLIKELY(idx == count)) return;

#if HWY_MEM_OPS_MIGHT_FAULT
  // Proceed one by one.
  const CappedTag<T, 1> d1;
  for (; idx < count; ++idx) {
    using V1 = Vec<decltype(d1)>;
    const V1 v = LoadU(d1, inout + idx);
    const V1 v1 = LoadU(d1, in1 + idx);
    StoreU(func(d1, v, v1), d1, inout + idx);
  }
#else
  const size_t remaining = count - idx;
  HWY_DASSERT(0 != remaining && remaining < N);
  const Mask<D> mask = FirstN(d, remaining);
  const Vec<D> v = MaskedLoad(mask, d, inout + idx);
  const Vec<D> v1 = MaskedLoad(mask, d, in1 + idx);
  BlendedStore(func(d, v, v1), mask, d, inout + idx);
#endif
}

// Replaces `inout[idx]` with `func(d, inout[idx], in1[idx], in2[idx])`. Example
// usage: FMA of elements from three arrays, stored into the first array.
template <class D, class Func, typename T = TFromD<D>>
void Transform2(D d, T* HWY_RESTRICT inout, size_t count,
                const T* HWY_RESTRICT in1, const T* HWY_RESTRICT in2,
                const Func& func) {
  const size_t N = Lanes(d);

  size_t idx = 0;
  for (; idx + N <= count; idx += N) {
    const Vec<D> v = LoadU(d, inout + idx);
    const Vec<D> v1 = LoadU(d, in1 + idx);
    const Vec<D> v2 = LoadU(d, in2 + idx);
    StoreU(func(d, v, v1, v2), d, inout + idx);
  }

  // `count` was a multiple of the vector length `N`: already done.
  if (HWY_UNLIKELY(idx == count)) return;

#if HWY_MEM_OPS_MIGHT_FAULT
  // Proceed one by one.
  const CappedTag<T, 1> d1;
  for (; idx < count; ++idx) {
    using V1 = Vec<decltype(d1)>;
    const V1 v = LoadU(d1, inout + idx);
    const V1 v1 = LoadU(d1, in1 + idx);
    const V1 v2 = LoadU(d1, in2 + idx);
    StoreU(func(d1, v, v1, v2), d1, inout + idx);
  }
#else
  const size_t remaining = count - idx;
  HWY_DASSERT(0 != remaining && remaining < N);
  const Mask<D> mask = FirstN(d, remaining);
  const Vec<D> v = MaskedLoad(mask, d, inout + idx);
  const Vec<D> v1 = MaskedLoad(mask, d, in1 + idx);
  const Vec<D> v2 = MaskedLoad(mask, d, in2 + idx);
  BlendedStore(func(d, v, v1, v2), mask, d, inout + idx);
#endif
}

template <class D, typename T = TFromD<D>>
void Replace(D d, T* HWY_RESTRICT inout, size_t count, T new_t, T old_t) {
  const size_t N = Lanes(d);
  const Vec<D> old_v = Set(d, old_t);
  const Vec<D> new_v = Set(d, new_t);

  size_t idx = 0;
  for (; idx + N <= count; idx += N) {
    Vec<D> v = LoadU(d, inout + idx);
    StoreU(IfThenElse(Eq(v, old_v), new_v, v), d, inout + idx);
  }

  // `count` was a multiple of the vector length `N`: already done.
  if (HWY_UNLIKELY(idx == count)) return;

#if HWY_MEM_OPS_MIGHT_FAULT
  // Proceed one by one.
  const CappedTag<T, 1> d1;
  const Vec<decltype(d1)> old_v1 = Set(d1, old_t);
  const Vec<decltype(d1)> new_v1 = Set(d1, new_t);
  for (; idx < count; ++idx) {
    using V1 = Vec<decltype(d1)>;
    const V1 v1 = LoadU(d1, inout + idx);
    StoreU(IfThenElse(Eq(v1, old_v1), new_v1, v1), d1, inout + idx);
  }
#else
  const size_t remaining = count - idx;
  HWY_DASSERT(0 != remaining && remaining < N);
  const Mask<D> mask = FirstN(d, remaining);
  const Vec<D> v = MaskedLoad(mask, d, inout + idx);
  BlendedStore(IfThenElse(Eq(v, old_v), new_v, v), mask, d, inout + idx);
#endif
}

template <class D, class Func, typename T = TFromD<D>>
void ReplaceIf(D d, T* HWY_RESTRICT inout, size_t count, T new_t,
               const Func& func) {
  const size_t N = Lanes(d);
  const Vec<D> new_v = Set(d, new_t);

  size_t idx = 0;
  for (; idx + N <= count; idx += N) {
    Vec<D> v = LoadU(d, inout + idx);
    StoreU(IfThenElse(func(d, v), new_v, v), d, inout + idx);
  }

  // `count` was a multiple of the vector length `N`: already done.
  if (HWY_UNLIKELY(idx == count)) return;

#if HWY_MEM_OPS_MIGHT_FAULT
  // Proceed one by one.
  const CappedTag<T, 1> d1;
  const Vec<decltype(d1)> new_v1 = Set(d1, new_t);
  for (; idx < count; ++idx) {
    using V1 = Vec<decltype(d1)>;
    const V1 v = LoadU(d1, inout + idx);
    StoreU(IfThenElse(func(d1, v), new_v1, v), d1, inout + idx);
  }
#else
  const size_t remaining = count - idx;
  HWY_DASSERT(0 != remaining && remaining < N);
  const Mask<D> mask = FirstN(d, remaining);
  const Vec<D> v = MaskedLoad(mask, d, inout + idx);
  BlendedStore(IfThenElse(func(d, v), new_v, v), mask, d, inout + idx);
#endif
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#endif  // HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_
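
// Usage sketch (illustrative; static dispatch -- `Saxpy` is a hypothetical
// wrapper, not part of this diff):
#include "hwy/contrib/algo/transform-inl.h"

namespace hn = hwy::HWY_NAMESPACE;

// x[i] = alpha * x[i] + y[i], the BLAS AXPY pattern exercised by the tests
// below.
void Saxpy(float alpha, float* HWY_RESTRICT x, size_t count,
           const float* HWY_RESTRICT y) {
  const hn::ScalableTag<float> d;
  hn::Transform1(d, x, count, y,
                 [alpha](const auto d, const auto v, const auto v1) HWY_ATTR {
                   return hn::MulAdd(hn::Set(d, alpha), v, v1);
                 });
}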

@ -0,0 +1,372 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <string.h>

#include "hwy/aligned_allocator.h"

// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/algo/transform_test.cc"  //NOLINT
#include "hwy/foreach_target.h"  // IWYU pragma: keep

#include "hwy/contrib/algo/transform-inl.h"
#include "hwy/tests/test_util-inl.h"
// clang-format on

// If your project requires C++14 or later, you can ignore this and pass lambdas
// directly to Transform, without requiring an lvalue as we do here for C++11.
#if __cplusplus < 201402L
#define HWY_GENERIC_LAMBDA 0
#else
#define HWY_GENERIC_LAMBDA 1
#endif

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

template <typename T>
T Alpha() {
  return static_cast<T>(1.5);  // arbitrary scalar
}

// Returns random floating-point number in [-8, 8) to ensure computations do
// not exceed float32 precision.
template <typename T>
T Random(RandomState& rng) {
  const int32_t bits = static_cast<int32_t>(Random32(&rng)) & 1023;
  const double val = (bits - 512) / 64.0;
  // Clamp negative to zero for unsigned types.
  return static_cast<T>(HWY_MAX(hwy::LowestValue<T>(), val));
}

// SCAL, AXPY names are from BLAS.
template <typename T>
HWY_NOINLINE void SimpleSCAL(const T* x, T* out, size_t count) {
  for (size_t i = 0; i < count; ++i) {
    out[i] = Alpha<T>() * x[i];
  }
}

template <typename T>
HWY_NOINLINE void SimpleAXPY(const T* x, const T* y, T* out, size_t count) {
  for (size_t i = 0; i < count; ++i) {
    out[i] = Alpha<T>() * x[i] + y[i];
  }
}

template <typename T>
HWY_NOINLINE void SimpleFMA4(const T* x, const T* y, const T* z, T* out,
                             size_t count) {
  for (size_t i = 0; i < count; ++i) {
    out[i] = x[i] * y[i] + z[i];
  }
}

// In C++14, we can instead define these as generic lambdas next to where they
// are invoked.
#if !HWY_GENERIC_LAMBDA

// Generator that returns even numbers by doubling the output indices.
struct Gen2 {
  template <class D, class VU>
  Vec<D> operator()(D d, VU vidx) const {
    return BitCast(d, Add(vidx, vidx));
  }
};

struct SCAL {
  template <class D, class V>
  Vec<D> operator()(D d, V v) const {
    using T = TFromD<D>;
    return Mul(Set(d, Alpha<T>()), v);
  }
};

struct AXPY {
  template <class D, class V>
  Vec<D> operator()(D d, V v, V v1) const {
    using T = TFromD<D>;
    return MulAdd(Set(d, Alpha<T>()), v, v1);
  }
};

struct FMA4 {
  template <class D, class V>
  Vec<D> operator()(D /*d*/, V v, V v1, V v2) const {
    return MulAdd(v, v1, v2);
  }
};

#endif  // !HWY_GENERIC_LAMBDA

// Invokes Test (e.g. TestTransform1) with all arg combinations. T comes from
// ForFloatTypes.
template <class Test>
struct ForeachCountAndMisalign {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) const {
    RandomState rng;
    const size_t N = Lanes(d);
    const size_t misalignments[3] = {0, N / 4, 3 * N / 5};

    for (size_t count = 0; count < 2 * N; ++count) {
      for (size_t ma : misalignments) {
        for (size_t mb : misalignments) {
          Test()(d, count, ma, mb, rng);
        }
      }
    }
  }
};

// Output-only, no loads
struct TestGenerate {
  template <class D>
  void operator()(D d, size_t count, size_t misalign_a, size_t /*misalign_b*/,
                  RandomState& /*rng*/) {
    using T = TFromD<D>;
    AlignedFreeUniquePtr<T[]> pa = AllocateAligned<T>(misalign_a + count + 1);
    T* actual = pa.get() + misalign_a;

    AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(HWY_MAX(1, count));
    for (size_t i = 0; i < count; ++i) {
      expected[i] = static_cast<T>(2 * i);
    }

    // TODO(janwas): can we update the apply_to in HWY_PUSH_ATTRIBUTES so that
    // the attribute also applies to lambdas? If so, remove HWY_ATTR.
#if HWY_GENERIC_LAMBDA
    const auto gen2 = [](const auto d, const auto vidx)
                          HWY_ATTR { return BitCast(d, Add(vidx, vidx)); };
#else
    const Gen2 gen2;
#endif
    actual[count] = T{0};  // sentinel
    Generate(d, actual, count, gen2);
    HWY_ASSERT_EQ(T{0}, actual[count]);  // did not write past end

    const auto info = hwy::detail::MakeTypeInfo<T>();
    const char* target_name = hwy::TargetName(HWY_TARGET);
    hwy::detail::AssertArrayEqual(info, expected.get(), actual, count,
                                  target_name, __FILE__, __LINE__);
  }
};

// Zero extra input arrays
struct TestTransform {
  template <class D>
  void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
                  RandomState& rng) {
    if (misalign_b != 0) return;
    using T = TFromD<D>;
    // Prevents error if size to allocate is zero.
    AlignedFreeUniquePtr<T[]> pa =
        AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
    T* a = pa.get() + misalign_a;
    for (size_t i = 0; i < count; ++i) {
      a[i] = Random<T>(rng);
    }

    AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(HWY_MAX(1, count));
    SimpleSCAL(a, expected.get(), count);

    // TODO(janwas): can we update the apply_to in HWY_PUSH_ATTRIBUTES so that
    // the attribute also applies to lambdas? If so, remove HWY_ATTR.
#if HWY_GENERIC_LAMBDA
    const auto scal = [](const auto d, const auto v)
                          HWY_ATTR { return Mul(Set(d, Alpha<T>()), v); };
#else
    const SCAL scal;
#endif
    Transform(d, a, count, scal);

    const auto info = hwy::detail::MakeTypeInfo<T>();
    const char* target_name = hwy::TargetName(HWY_TARGET);
    hwy::detail::AssertArrayEqual(info, expected.get(), a, count, target_name,
                                  __FILE__, __LINE__);
  }
};

// One extra input array
struct TestTransform1 {
  template <class D>
  void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
                  RandomState& rng) {
    using T = TFromD<D>;
    // Prevents error if size to allocate is zero.
    AlignedFreeUniquePtr<T[]> pa =
        AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
    AlignedFreeUniquePtr<T[]> pb =
        AllocateAligned<T>(HWY_MAX(1, misalign_b + count));
    T* a = pa.get() + misalign_a;
    T* b = pb.get() + misalign_b;
    for (size_t i = 0; i < count; ++i) {
      a[i] = Random<T>(rng);
      b[i] = Random<T>(rng);
    }

    AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(HWY_MAX(1, count));
    SimpleAXPY(a, b, expected.get(), count);

#if HWY_GENERIC_LAMBDA
    const auto axpy = [](const auto d, const auto v, const auto v1) HWY_ATTR {
      return MulAdd(Set(d, Alpha<T>()), v, v1);
    };
#else
    const AXPY axpy;
#endif
    Transform1(d, a, count, b, axpy);

    const auto info = hwy::detail::MakeTypeInfo<T>();
    const char* target_name = hwy::TargetName(HWY_TARGET);
    hwy::detail::AssertArrayEqual(info, expected.get(), a, count, target_name,
                                  __FILE__, __LINE__);
  }
};

// Two extra input arrays
struct TestTransform2 {
  template <class D>
  void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
                  RandomState& rng) {
    using T = TFromD<D>;
    // Prevents error if size to allocate is zero.
    AlignedFreeUniquePtr<T[]> pa =
        AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
    AlignedFreeUniquePtr<T[]> pb =
        AllocateAligned<T>(HWY_MAX(1, misalign_b + count));
    AlignedFreeUniquePtr<T[]> pc =
        AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
    T* a = pa.get() + misalign_a;
    T* b = pb.get() + misalign_b;
    T* c = pc.get() + misalign_a;
    for (size_t i = 0; i < count; ++i) {
      a[i] = Random<T>(rng);
      b[i] = Random<T>(rng);
      c[i] = Random<T>(rng);
    }

    AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(HWY_MAX(1, count));
    SimpleFMA4(a, b, c, expected.get(), count);

#if HWY_GENERIC_LAMBDA
    const auto fma4 = [](auto /*d*/, auto v, auto v1, auto v2)
                          HWY_ATTR { return MulAdd(v, v1, v2); };
#else
    const FMA4 fma4;
#endif
    Transform2(d, a, count, b, c, fma4);

    const auto info = hwy::detail::MakeTypeInfo<T>();
    const char* target_name = hwy::TargetName(HWY_TARGET);
    hwy::detail::AssertArrayEqual(info, expected.get(), a, count, target_name,
                                  __FILE__, __LINE__);
  }
};

template <typename T>
class IfEq {
 public:
  IfEq(T val) : val_(val) {}

  template <class D, class V>
  Mask<D> operator()(D d, V v) const {
    return Eq(v, Set(d, val_));
  }

 private:
  T val_;
};

struct TestReplace {
  template <class D>
  void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
                  RandomState& rng) {
    if (misalign_b != 0) return;
    if (count == 0) return;
    using T = TFromD<D>;
    AlignedFreeUniquePtr<T[]> pa = AllocateAligned<T>(misalign_a + count);
    T* a = pa.get() + misalign_a;
    for (size_t i = 0; i < count; ++i) {
      a[i] = Random<T>(rng);
    }
    AlignedFreeUniquePtr<T[]> pb = AllocateAligned<T>(count);

    AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(count);

    std::vector<size_t> positions(AdjustedReps(count));
    for (size_t& pos : positions) {
      pos = static_cast<size_t>(rng()) % count;
    }

    for (size_t pos = 0; pos < count; ++pos) {
      const T old_t = a[pos];
      const T new_t = Random<T>(rng);
      for (size_t i = 0; i < count; ++i) {
        expected[i] = IsEqual(a[i], old_t) ? new_t : a[i];
      }

      // Copy so ReplaceIf gets the same input (and thus also outputs expected)
      memcpy(pb.get(), a, count * sizeof(T));

      Replace(d, a, count, new_t, old_t);
      HWY_ASSERT_ARRAY_EQ(expected.get(), a, count);

      ReplaceIf(d, pb.get(), count, new_t, IfEq<T>(old_t));
      HWY_ASSERT_ARRAY_EQ(expected.get(), pb.get(), count);
    }
  }
};

void TestAllGenerate() {
  // The test BitCast-s the indices, which does not work for floats.
  ForIntegerTypes(ForPartialVectors<ForeachCountAndMisalign<TestGenerate>>());
}

void TestAllTransform() {
  ForFloatTypes(ForPartialVectors<ForeachCountAndMisalign<TestTransform>>());
}

void TestAllTransform1() {
  ForFloatTypes(ForPartialVectors<ForeachCountAndMisalign<TestTransform1>>());
}

void TestAllTransform2() {
  ForFloatTypes(ForPartialVectors<ForeachCountAndMisalign<TestTransform2>>());
}

void TestAllReplace() {
  ForFloatTypes(ForPartialVectors<ForeachCountAndMisalign<TestReplace>>());
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE

namespace hwy {
HWY_BEFORE_TEST(TransformTest);
HWY_EXPORT_AND_TEST_P(TransformTest, TestAllGenerate);
HWY_EXPORT_AND_TEST_P(TransformTest, TestAllTransform);
HWY_EXPORT_AND_TEST_P(TransformTest, TestAllTransform1);
HWY_EXPORT_AND_TEST_P(TransformTest, TestAllTransform2);
HWY_EXPORT_AND_TEST_P(TransformTest, TestAllReplace);
}  // namespace hwy

#endif

@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

@ -34,23 +35,19 @@ struct Dot {
  // argument to Compute. Each one may improve performance or reduce code size,
  // at the cost of additional requirements on the arguments.
  enum Assumptions {
    // num_elements is at least N, which may be up to HWY_MAX_LANES(T).
    // num_elements is at least N, which may be up to HWY_MAX_BYTES / sizeof(T).
    kAtLeastOneVector = 1,
    // num_elements is divisible by N (a power of two, so this can be used if
    // the problem size is known to be a power of two >= HWY_MAX_LANES(T)).
    // the problem size is known to be a power of two >= HWY_MAX_BYTES /
    // sizeof(T)).
    kMultipleOfVector = 2,
    // RoundUpTo(num_elements, N) elements are accessible; their value does not
    // matter (will be treated as if they were zero).
    kPaddedToVector = 4,
    // Pointers pa and pb, respectively, are multiples of N * sizeof(T).
    // For example, aligned_allocator.h ensures this. Note that it is still
    // beneficial to ensure such alignment even if these flags are not set.
    // If not set, the pointers need only be aligned to alignof(T).
    kVectorAlignedA = 8,
    kVectorAlignedB = 16,
  };
|
||||
|
||||
// Returns sum{pa[i] * pb[i]} for float or double inputs.
|
||||
// Returns sum{pa[i] * pb[i]} for float or double inputs. Aligning the
|
||||
// pointers to a multiple of N elements is helpful but not required.
|
||||
template <int kAssumptions, class D, typename T = TFromD<D>,
|
||||
HWY_IF_NOT_LANE_SIZE_D(D, 2)>
|
||||
static HWY_INLINE T Compute(const D d, const T* const HWY_RESTRICT pa,
|
||||
|

@ -67,8 +64,6 @@ struct Dot {
    constexpr bool kIsMultipleOfVector =
        (kAssumptions & kMultipleOfVector) != 0;
    constexpr bool kIsPaddedToVector = (kAssumptions & kPaddedToVector) != 0;
    constexpr bool kIsAlignedA = (kAssumptions & kVectorAlignedA) != 0;
    constexpr bool kIsAlignedB = (kAssumptions & kVectorAlignedB) != 0;

    // Won't be able to do a full vector load without padding => scalar loop.
    if (!kIsAtLeastOneVector && !kIsMultipleOfVector && !kIsPaddedToVector &&

@ -98,28 +93,28 @@ struct Dot {

    // Main loop: unrolled
    for (; i + 4 * N <= num_elements; /* i += 4 * N */) {  // incr in loop
      const auto a0 = kIsAlignedA ? Load(d, pa + i) : LoadU(d, pa + i);
      const auto b0 = kIsAlignedB ? Load(d, pb + i) : LoadU(d, pb + i);
      const auto a0 = LoadU(d, pa + i);
      const auto b0 = LoadU(d, pb + i);
      i += N;
      sum0 = MulAdd(a0, b0, sum0);
      const auto a1 = kIsAlignedA ? Load(d, pa + i) : LoadU(d, pa + i);
      const auto b1 = kIsAlignedB ? Load(d, pb + i) : LoadU(d, pb + i);
      const auto a1 = LoadU(d, pa + i);
      const auto b1 = LoadU(d, pb + i);
      i += N;
      sum1 = MulAdd(a1, b1, sum1);
      const auto a2 = kIsAlignedA ? Load(d, pa + i) : LoadU(d, pa + i);
      const auto b2 = kIsAlignedB ? Load(d, pb + i) : LoadU(d, pb + i);
      const auto a2 = LoadU(d, pa + i);
      const auto b2 = LoadU(d, pb + i);
      i += N;
      sum2 = MulAdd(a2, b2, sum2);
      const auto a3 = kIsAlignedA ? Load(d, pa + i) : LoadU(d, pa + i);
      const auto b3 = kIsAlignedB ? Load(d, pb + i) : LoadU(d, pb + i);
      const auto a3 = LoadU(d, pa + i);
      const auto b3 = LoadU(d, pb + i);
      i += N;
      sum3 = MulAdd(a3, b3, sum3);
    }

    // Up to 3 iterations of whole vectors
    for (; i + N <= num_elements; i += N) {
      const auto a = kIsAlignedA ? Load(d, pa + i) : LoadU(d, pa + i);
      const auto b = kIsAlignedB ? Load(d, pb + i) : LoadU(d, pb + i);
      const auto a = LoadU(d, pa + i);
      const auto b = LoadU(d, pb + i);
      sum0 = MulAdd(a, b, sum0);
    }

@ -128,8 +123,8 @@ struct Dot {
    if (remaining != 0) {
      if (kIsPaddedToVector) {
        const auto mask = FirstN(d, remaining);
        const auto a = kIsAlignedA ? Load(d, pa + i) : LoadU(d, pa + i);
        const auto b = kIsAlignedB ? Load(d, pb + i) : LoadU(d, pb + i);
        const auto a = LoadU(d, pa + i);
        const auto b = LoadU(d, pb + i);
        sum1 = MulAdd(IfThenElseZero(mask, a), IfThenElseZero(mask, b), sum1);
      } else {
        // Unaligned load such that the last element is in the highest lane -

@ -152,7 +147,8 @@ struct Dot {
    return GetLane(SumOfLanes(d, sum0));
  }

  // Returns sum{pa[i] * pb[i]} for bfloat16 inputs.
  // Returns sum{pa[i] * pb[i]} for bfloat16 inputs. Aligning the pointers to a
  // multiple of N elements is helpful but not required.
  template <int kAssumptions, class D>
  static HWY_INLINE float Compute(const D d,
                                  const bfloat16_t* const HWY_RESTRICT pa,

@ -170,8 +166,6 @@ struct Dot {
    constexpr bool kIsMultipleOfVector =
        (kAssumptions & kMultipleOfVector) != 0;
    constexpr bool kIsPaddedToVector = (kAssumptions & kPaddedToVector) != 0;
    constexpr bool kIsAlignedA = (kAssumptions & kVectorAlignedA) != 0;
    constexpr bool kIsAlignedB = (kAssumptions & kVectorAlignedB) != 0;

    // Won't be able to do a full vector load without padding => scalar loop.
    if (!kIsAtLeastOneVector && !kIsMultipleOfVector && !kIsPaddedToVector &&

@ -197,20 +191,20 @@ struct Dot {

    // Main loop: unrolled
    for (; i + 2 * N <= num_elements; /* i += 2 * N */) {  // incr in loop
      const auto a0 = kIsAlignedA ? Load(d, pa + i) : LoadU(d, pa + i);
      const auto b0 = kIsAlignedB ? Load(d, pb + i) : LoadU(d, pb + i);
      const auto a0 = LoadU(d, pa + i);
      const auto b0 = LoadU(d, pb + i);
      i += N;
      sum0 = ReorderWidenMulAccumulate(df32, a0, b0, sum0, sum1);
      const auto a1 = kIsAlignedA ? Load(d, pa + i) : LoadU(d, pa + i);
      const auto b1 = kIsAlignedB ? Load(d, pb + i) : LoadU(d, pb + i);
      const auto a1 = LoadU(d, pa + i);
      const auto b1 = LoadU(d, pb + i);
      i += N;
      sum2 = ReorderWidenMulAccumulate(df32, a1, b1, sum2, sum3);
    }

    // Possibly one more iteration of whole vectors
    if (i + N <= num_elements) {
      const auto a0 = kIsAlignedA ? Load(d, pa + i) : LoadU(d, pa + i);
      const auto b0 = kIsAlignedB ? Load(d, pb + i) : LoadU(d, pb + i);
      const auto a0 = LoadU(d, pa + i);
      const auto b0 = LoadU(d, pb + i);
      i += N;
      sum0 = ReorderWidenMulAccumulate(df32, a0, b0, sum0, sum1);
    }

@ -220,8 +214,8 @@ struct Dot {
    if (remaining != 0) {
      if (kIsPaddedToVector) {
        const auto mask = FirstN(du16, remaining);
        const auto va = kIsAlignedA ? Load(d, pa + i) : LoadU(d, pa + i);
        const auto vb = kIsAlignedB ? Load(d, pb + i) : LoadU(d, pb + i);
        const auto va = LoadU(d, pa + i);
        const auto vb = LoadU(d, pb + i);
        const auto a16 = BitCast(d, IfThenElseZero(mask, BitCast(du16, va)));
        const auto b16 = BitCast(d, IfThenElseZero(mask, BitCast(du16, vb)));
        sum2 = ReorderWidenMulAccumulate(df32, a16, b16, sum2, sum3);
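
For context on the dot_test changes below, a minimal sketch of how Compute is
invoked (not part of this diff; it assumes a HWY_NAMESPACE block that includes
dot-inl.h, and kNum is an illustrative size):

```
const ScalableTag<float> d;
constexpr size_t kNum = 256;        // assumed to be a multiple of Lanes(d)
HWY_ALIGN float pa[kNum] = {1.0f};  // remaining elements zero-initialized
HWY_ALIGN float pb[kNum] = {2.0f};
// The flags promise properties of kNum; passing 0 instead falls back to a
// safe scalar loop, per the code above.
constexpr int kAssumptions = Dot::kAtLeastOneVector | Dot::kMultipleOfVector;
const float result = Dot::Compute<kAssumptions>(d, pa, pb, kNum);  // 2.0f
```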

@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

@ -21,7 +22,7 @@
// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/dot/dot_test.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h"  // IWYU pragma: keep

#include "hwy/contrib/dot/dot-inl.h"
#include "hwy/tests/test_util-inl.h"

@ -69,11 +70,6 @@ class TestDot {
      return static_cast<float>(bits - 512) * (1.0f / 64);
    };

    const bool kIsAlignedA = (kAssumptions & Dot::kVectorAlignedA) != 0;
    const bool kIsAlignedB = (kAssumptions & Dot::kVectorAlignedB) != 0;

    HWY_ASSERT(!kIsAlignedA || misalign_a == 0);
    HWY_ASSERT(!kIsAlignedB || misalign_b == 0);
    const size_t padded =
        (kAssumptions & Dot::kPaddedToVector) ? RoundUpTo(num, N) : num;
    AlignedFreeUniquePtr<T[]> pa = AllocateAligned<T>(misalign_a + padded);

@ -99,27 +95,11 @@ class TestDot {
    HWY_ASSERT(expected - 1E-4 <= actual && actual <= expected + 1E-4);
  }

  // Runs tests with various alignments compatible with the given assumptions.
  // Runs tests with various alignments.
  template <int kAssumptions, class D>
  void ForeachMisalign(D d, size_t num, RandomState& rng) {
    static_assert(
        (kAssumptions & (Dot::kVectorAlignedA | Dot::kVectorAlignedB)) == 0,
        "Alignment must not be specified by caller");

    const size_t N = Lanes(d);
    const size_t misalignments[3] = {0, N / 4, 3 * N / 5};

    // Both flags, both aligned
    Test<kAssumptions | Dot::kVectorAlignedA | Dot::kVectorAlignedB>(d, num, 0,
                                                                     0, rng);

    // One flag and aligned, other aligned/misaligned
    for (size_t m : misalignments) {
      Test<kAssumptions | Dot::kVectorAlignedA>(d, num, 0, m, rng);
      Test<kAssumptions | Dot::kVectorAlignedB>(d, num, m, 0, rng);
    }

    // Neither flag, all combinations of aligned/misaligned
    for (size_t ma : misalignments) {
      for (size_t mb : misalignments) {
        Test<kAssumptions>(d, num, ma, mb, rng);

@ -184,10 +164,4 @@ HWY_EXPORT_AND_TEST_P(DotTest, TestAllDot);
HWY_EXPORT_AND_TEST_P(DotTest, TestAllDotBF16);
}  // namespace hwy

// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}

#endif

@ -1,4 +1,5 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

@ -19,7 +20,7 @@

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/image/image.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h"  // IWYU pragma: keep
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();

@ -104,7 +105,7 @@ ImageBase::ImageBase(const size_t xsize, const size_t ysize,
}

void ImageBase::InitializePadding(const size_t sizeof_t, Padding padding) {
#if defined(MEMORY_SANITIZER) || HWY_IDE
#if HWY_IS_MSAN || HWY_IDE
  if (xsize_ == 0 || ysize_ == 0) return;

  const size_t vec_size = VectorSize();  // Bytes, independent of sizeof_t!

@ -130,7 +131,7 @@ void ImageBase::InitializePadding(const size_t sizeof_t, Padding padding) {
#else
  (void)sizeof_t;
  (void)padding;
#endif  // MEMORY_SANITIZER
#endif  // HWY_IS_MSAN
}

void ImageBase::Swap(ImageBase& other) {

@ -1,4 +1,5 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

@ -1,4 +1,5 @@
// Copyright (c) the JPEG XL Project
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

@ -14,14 +15,7 @@

#include "hwy/contrib/image/image.h"

#include <cstddef>

#include "hwy/base.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/image/image_test.cc"
#include "hwy/foreach_target.h"

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

@ -29,6 +23,11 @@
#include <random>
#include <utility>

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/image/image_test.cc"
#include "hwy/foreach_target.h"  // IWYU pragma: keep

// After foreach_target:
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"

@ -82,7 +81,7 @@ struct TestUnalignedT {

    // This test reads padding, which only works if it was initialized,
    // which only happens in MSAN builds.
#if defined(MEMORY_SANITIZER) || HWY_IDE
#if HWY_IS_MSAN || HWY_IDE
    // Initialize only the valid samples
    for (size_t y = 0; y < ysize; ++y) {
      T* HWY_RESTRICT row = img.MutableRow(y);

@ -150,10 +149,4 @@ HWY_EXPORT_AND_TEST_P(ImageTest, TestAligned);
HWY_EXPORT_AND_TEST_P(ImageTest, TestUnaligned);
}  // namespace hwy

// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}

#endif

@ -1,4 +1,5 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

@ -1,4 +1,5 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

@ -20,7 +21,7 @@
// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/math/math_test.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h"  // IWYU pragma: keep

#include "hwy/contrib/math/math-inl.h"
#include "hwy/tests/test_util-inl.h"

@ -219,10 +220,4 @@ HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllSinh);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllTanh);
}  // namespace hwy

// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char **argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}

#endif

@ -8,35 +8,91 @@ COMPAT = [
    "//buildenv/target:non_prod",  # includes mobile/vendor.
]

# cc_library(
#     name = "vxsort",
#     srcs = [
#         "vxsort/isa_detection.cpp",
#         "vxsort/isa_detection_msvc.cpp",
#         "vxsort/isa_detection_sane.cpp",
#         "vxsort/machine_traits.avx2.cpp",
#         "vxsort/smallsort/avx2_load_mask_tables.cpp",
#         "vxsort/smallsort/bitonic_sort.AVX2.double.generated.cpp",
#         "vxsort/smallsort/bitonic_sort.AVX2.float.generated.cpp",
#         "vxsort/smallsort/bitonic_sort.AVX2.int32_t.generated.cpp",
#         "vxsort/smallsort/bitonic_sort.AVX2.int64_t.generated.cpp",
#         "vxsort/smallsort/bitonic_sort.AVX2.uint32_t.generated.cpp",
#         "vxsort/smallsort/bitonic_sort.AVX2.uint64_t.generated.cpp",
#         "vxsort/smallsort/bitonic_sort.AVX512.double.generated.cpp",
#         "vxsort/smallsort/bitonic_sort.AVX512.float.generated.cpp",
#         "vxsort/smallsort/bitonic_sort.AVX512.int32_t.generated.cpp",
#         "vxsort/smallsort/bitonic_sort.AVX512.int64_t.generated.cpp",
#         "vxsort/smallsort/bitonic_sort.AVX512.uint32_t.generated.cpp",
#         "vxsort/smallsort/bitonic_sort.AVX512.uint64_t.generated.cpp",
#         "vxsort/vxsort_stats.cpp",
#     ],
#     hdrs = [
#         "vxsort/alignment.h",
#         "vxsort/defs.h",
#         "vxsort/isa_detection.h",
#         "vxsort/machine_traits.avx2.h",
#         "vxsort/machine_traits.avx512.h",
#         "vxsort/machine_traits.h",
#         "vxsort/packer.h",
#         "vxsort/smallsort/bitonic_sort.AVX2.double.generated.h",
#         "vxsort/smallsort/bitonic_sort.AVX2.float.generated.h",
#         "vxsort/smallsort/bitonic_sort.AVX2.int32_t.generated.h",
#         "vxsort/smallsort/bitonic_sort.AVX2.int64_t.generated.h",
#         "vxsort/smallsort/bitonic_sort.AVX2.uint32_t.generated.h",
#         "vxsort/smallsort/bitonic_sort.AVX2.uint64_t.generated.h",
#         "vxsort/smallsort/bitonic_sort.AVX512.double.generated.h",
#         "vxsort/smallsort/bitonic_sort.AVX512.float.generated.h",
#         "vxsort/smallsort/bitonic_sort.AVX512.int32_t.generated.h",
#         "vxsort/smallsort/bitonic_sort.AVX512.int64_t.generated.h",
#         "vxsort/smallsort/bitonic_sort.AVX512.uint32_t.generated.h",
#         "vxsort/smallsort/bitonic_sort.AVX512.uint64_t.generated.h",
#         "vxsort/smallsort/bitonic_sort.h",
#         "vxsort/vxsort.h",
#         "vxsort/vxsort_stats.h",
#     ],
#     compatible_with = [],
#     textual_hdrs = [
#         "vxsort/vxsort_targets_disable.h",
#         "vxsort/vxsort_targets_enable_avx2.h",
#         "vxsort/vxsort_targets_enable_avx512.h",
#     ],
# )

cc_library(
    name = "vqsort",
    srcs = [
        # Split into separate files to reduce MSVC build time.
        "vqsort.cc",
        "vqsort_i16a.cc",
        "vqsort_i16d.cc",
        "vqsort_u16a.cc",
        "vqsort_u16d.cc",
        "vqsort_f32a.cc",
        "vqsort_f32d.cc",
        "vqsort_i32a.cc",
        "vqsort_i32d.cc",
        "vqsort_u32a.cc",
        "vqsort_u32d.cc",
        "vqsort_f64a.cc",
        "vqsort_f64d.cc",
        "vqsort_i64a.cc",
        "vqsort_i64d.cc",
        "vqsort_u64a.cc",
        "vqsort_u64d.cc",
        "vqsort_128a.cc",
        "vqsort_128d.cc",
        "vqsort_f32a.cc",
        "vqsort_f32d.cc",
        "vqsort_f64a.cc",
        "vqsort_f64d.cc",
        "vqsort_i16a.cc",
        "vqsort_i16d.cc",
        "vqsort_i32a.cc",
        "vqsort_i32d.cc",
        "vqsort_i64a.cc",
        "vqsort_i64d.cc",
        "vqsort_kv128a.cc",
        "vqsort_kv128d.cc",
        "vqsort_u16a.cc",
        "vqsort_u16d.cc",
        "vqsort_u32a.cc",
        "vqsort_u32d.cc",
        "vqsort_u64a.cc",
        "vqsort_u64d.cc",
    ],
    hdrs = [
        "disabled_targets.h",
        "vqsort.h",  # public interface
    ],
    compatible_with = [],
    local_defines = ["hwy_contrib_EXPORTS"],
    textual_hdrs = [
        "shared-inl.h",
        "sorting_networks-inl.h",

@ -48,6 +104,7 @@ cc_library(
        # Only if VQSORT_SECURE_RNG is set.
        # "//third_party/absl/random",
        "//:hwy",
        # ":vxsort",  # required if HAVE_VXSORT
    ],
)

@ -86,8 +143,7 @@ cc_test(
    name = "sort_test",
    size = "medium",
    srcs = ["sort_test.cc"],
    features = ["fully_static_link"],
    linkstatic = True,
    # Do not enable fully_static_link (pthread crash on bazel)
    local_defines = ["HWY_IS_TEST"],
    # for test_suite.
    tags = ["hwy_ops_test"],

@ -104,8 +160,7 @@ cc_binary(
    name = "bench_sort",
    testonly = 1,
    srcs = ["bench_sort.cc"],
    features = ["fully_static_link"],
    linkstatic = True,
    # Do not enable fully_static_link (pthread crash on bazel)
    local_defines = ["HWY_IS_TEST"],
    deps = [
        ":helpers",

@ -120,8 +175,7 @@ cc_binary(
    name = "bench_parallel",
    testonly = 1,
    srcs = ["bench_parallel.cc"],
    features = ["fully_static_link"],
    linkstatic = True,
    # Do not enable fully_static_link (pthread crash on bazel)
    local_defines = ["HWY_IS_TEST"],
    deps = [
        ":helpers",

@ -0,0 +1,81 @@
# Vectorized and performance-portable Quicksort

## Introduction

As of 2022-06-07 this sorts large arrays of built-in types about ten times as
fast as `std::sort`. See also our
[blog post](https://opensource.googleblog.com/2022/06/Vectorized%20and%20performance%20portable%20Quicksort.html)
and [paper](https://arxiv.org/abs/2205.05982).
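
A minimal usage sketch (assuming the vqsort library target is linked; the
Sorter class and SortAscending tag are declared in the public header
hwy/contrib/sort/vqsort.h):

```
#include <stdio.h>
#include <stdlib.h>

#include "hwy/contrib/sort/vqsort.h"

int main() {
  const size_t num = 1000;
  int64_t* keys = static_cast<int64_t*>(malloc(num * sizeof(int64_t)));
  for (size_t i = 0; i < num; ++i) keys[i] = static_cast<int64_t>(rand());
  hwy::Sorter sorter;  // reusable; owns a small internal buffer
  sorter(keys, num, hwy::SortAscending());
  printf("%lld .. %lld\n", static_cast<long long>(keys[0]),
         static_cast<long long>(keys[num - 1]));
  free(keys);
  return 0;
}
```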

## Instructions

Here are instructions for reproducing our results on x86 Linux (AVX2, AVX-512)
and Arm V1 (NEON, SVE).

### x86 (Linux)

Please first ensure Go and Clang (tested with 13.0.1) are installed via your
system's package manager.

```
go install github.com/bazelbuild/bazelisk@latest
git clone https://github.com/google/highway
cd highway
CC=clang CXX=clang++ ~/go/bin/bazelisk build -c opt hwy/contrib/sort:all
bazel-bin/hwy/contrib/sort/sort_test
bazel-bin/hwy/contrib/sort/bench_sort
```

### AWS Graviton3

Instance config: Amazon Linux 5.10 arm64, c7g.8xlarge (the largest allowed
config is 32 vCPUs). The initial launch will fail; wait a few minutes for an
email saying the config is verified, then re-launch. The IPv4 hostname is shown
in the list of instances.

`ssh -i /path/key.pem ec2-user@hostname`

Note that the AWS CMake package is too old for building LLVM, so we build a
newer CMake first:
```
wget https://cmake.org/files/v3.23/cmake-3.23.2.tar.gz
tar -xvzf cmake-3.23.2.tar.gz && cd cmake-3.23.2/
./bootstrap -- -DCMAKE_USE_OPENSSL=OFF
make -j8 && sudo make install
cd ..
```

The AWS Clang package is at version 11.1, which generates unnecessary AND
instructions that slow down the sort by 1.15x. We tested with Clang trunk as of
June 13 (which reports Git hash 8f6512fea000c3a0d394864bb94e524bee375069). To
build it:
```
git clone --depth 1 https://github.com/llvm/llvm-project.git
cd llvm-project
mkdir -p build && cd build
/usr/local/bin/cmake ../llvm -DLLVM_ENABLE_PROJECTS="clang" -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi" -DCMAKE_BUILD_TYPE=Release
make -j32 && sudo make install
```

```
sudo yum install go
go install github.com/bazelbuild/bazelisk@latest
git clone https://github.com/google/highway
cd highway
CC=/usr/local/bin/clang CXX=/usr/local/bin/clang++ ~/go/bin/bazelisk build -c opt --copt=-march=armv8.2-a+sve hwy/contrib/sort:all
bazel-bin/hwy/contrib/sort/sort_test
bazel-bin/hwy/contrib/sort/bench_sort
```

## Results

`bench_sort` outputs the instruction set (AVX3 refers to AVX-512), the sort
algorithm (std for `std::sort`, vq for our vqsort), the type of keys being
sorted (f32 is float), the distribution of keys (uniform32 for uniform random
with range 0-2^32), the number of keys, then the throughput of sorted keys
(i.e. the number of key bytes output per second).

Example excerpt from a Xeon 6154 (Skylake-X) CPU clocked at 3 GHz:

```
[ RUN      ] BenchSortGroup/BenchSort.BenchAllSort/AVX3
AVX3:          std:     f32: uniform32: 1.00E+06 54 MB/s ( 1 threads)
AVX3:           vq:     f32: uniform32: 1.00E+06 1143 MB/s ( 1 threads)
```
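
Reading the excerpt above: vq sorts 4-byte f32 keys at 1143 MB/s, i.e. roughly
286 million keys per second, about 21 times the 54 MB/s of `std::sort` on the
same machine.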

@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

@ -29,16 +30,18 @@
// Third-party algorithms
#define HAVE_AVX2SORT 0
#define HAVE_IPS4O 0
// When enabling, consider changing max_threads (required for Table 1a)
#define HAVE_PARALLEL_IPS4O (HAVE_IPS4O && 1)
#define HAVE_PDQSORT 0
#define HAVE_SORT512 0
#define HAVE_VXSORT 0

#if HAVE_AVX2SORT
HWY_PUSH_ATTRIBUTES("avx2,avx")
#include "avx2sort.h"
#include "avx2sort.h"  //NOLINT
HWY_POP_ATTRIBUTES
#endif
#if HAVE_IPS4O
#if HAVE_IPS4O || HAVE_PARALLEL_IPS4O
#include "third_party/ips4o/include/ips4o.hpp"
#include "third_party/ips4o/include/ips4o/thread_pool.hpp"
#endif

@ -46,18 +49,59 @@ HWY_POP_ATTRIBUTES
#include "third_party/boost/allowed/sort/sort.hpp"
#endif
#if HAVE_SORT512
#include "sort512.h"
#include "sort512.h"  //NOLINT
#endif

// vxsort is difficult to compile for multiple targets because it also uses
// .cpp files, and we'd also have to #undef its include guards. Instead, compile
// only for AVX2 or AVX3 depending on this macro.
#define VXSORT_AVX3 1
#if HAVE_VXSORT
// inlined from vxsort_targets_enable_avx512 (must close before end of header)
#ifdef __GNUC__
#ifdef __clang__
#if VXSORT_AVX3
#pragma clang attribute push(__attribute__((target("avx512f,avx512dq"))), \
                             apply_to = any(function))
#else
#pragma clang attribute push(__attribute__((target("avx2"))), \
                             apply_to = any(function))
#endif  // VXSORT_AVX3

#else
#pragma GCC push_options
#if VXSORT_AVX3
#pragma GCC target("avx512f,avx512dq")
#else
#pragma GCC target("avx2")
#endif  // VXSORT_AVX3
#endif
#endif

#if VXSORT_AVX3
#include "vxsort/machine_traits.avx512.h"
#else
#include "vxsort/machine_traits.avx2.h"
#endif  // VXSORT_AVX3
#include "vxsort/vxsort.h"
#ifdef __GNUC__
#ifdef __clang__
#pragma clang attribute pop
#else
#pragma GCC pop_options
#endif
#endif
#endif  // HAVE_VXSORT

namespace hwy {

enum class Dist { kUniform8, kUniform16, kUniform32 };

std::vector<Dist> AllDist() {
  return {/*Dist::kUniform8,*/ Dist::kUniform16, Dist::kUniform32};
static inline std::vector<Dist> AllDist() {
  return {/*Dist::kUniform8, Dist::kUniform16,*/ Dist::kUniform32};
}

const char* DistName(Dist dist) {
static inline const char* DistName(Dist dist) {
  switch (dist) {
    case Dist::kUniform8:
      return "uniform8";

@ -75,7 +119,13 @@ class InputStats {
  void Notify(T value) {
    min_ = std::min(min_, value);
    max_ = std::max(max_, value);
    sumf_ += static_cast<double>(value);
    // Converting to integer would truncate floats, multiplying to save digits
    // risks overflow especially when casting, so instead take the sum of the
    // bit representations as the checksum.
    uint64_t bits = 0;
    static_assert(sizeof(T) <= 8, "Expected a built-in type");
    CopyBytes<sizeof(T)>(&value, &bits);
    sum_ += bits;
    count_ += 1;
  }

@ -86,20 +136,16 @@ class InputStats {
    }

    if (min_ != other.min_ || max_ != other.max_) {
      HWY_ABORT("minmax %f/%f vs %f/%f\n", double(min_), double(max_),
                double(other.min_), double(other.max_));
      HWY_ABORT("minmax %f/%f vs %f/%f\n", static_cast<double>(min_),
                static_cast<double>(max_), static_cast<double>(other.min_),
                static_cast<double>(other.max_));
    }

    // Sum helps detect duplicated/lost values
    if (sumf_ != other.sumf_) {
      // Allow some tolerance because kUniform32 * num can exceed double
      // precision.
      const double mul = 1E-9;  // prevent destructive cancellation
      const double err = std::abs(sumf_ * mul - other.sumf_ * mul);
      if (err > 1E-3) {
        HWY_ABORT("Sum mismatch %.15e %.15e (%f) min %g max %g\n", sumf_,
                  other.sumf_, err, double(min_), double(max_));
      }
    if (sum_ != other.sum_) {
      HWY_ABORT("Sum mismatch %g %g; min %g max %g\n",
                static_cast<double>(sum_), static_cast<double>(other.sum_),
                static_cast<double>(min_), static_cast<double>(max_));
    }

    return true;

@ -108,7 +154,7 @@ class InputStats {
 private:
  T min_ = hwy::HighestValue<T>();
  T max_ = hwy::LowestValue<T>();
  double sumf_ = 0.0;
  uint64_t sum_ = 0;
  size_t count_ = 0;
};

@ -127,13 +173,16 @@ enum class Algo {
#endif
#if HAVE_SORT512
  kSort512,
#endif
#if HAVE_VXSORT
  kVXSort,
#endif
  kStd,
  kVQSort,
  kHeap,
};

const char* AlgoName(Algo algo) {
static inline const char* AlgoName(Algo algo) {
  switch (algo) {
#if HAVE_AVX2SORT
    case Algo::kSEA:

@ -154,6 +203,10 @@ const char* AlgoName(Algo algo) {
#if HAVE_SORT512
    case Algo::kSort512:
      return "sort512";
#endif
#if HAVE_VXSORT
    case Algo::kVXSort:
      return "vxsort";
#endif
    case Algo::kStd:
      return "std";

@ -205,12 +258,11 @@ class Xorshift128Plus {
  }

  // Need to pass in the state because vector cannot be class members.
  template <class DU64>
  static Vec<DU64> RandomBits(DU64 /* tag */, Vec<DU64>& state0,
                              Vec<DU64>& state1) {
    Vec<DU64> s1 = state0;
    Vec<DU64> s0 = state1;
    const Vec<DU64> bits = Add(s1, s0);
  template <class VU64>
  static VU64 RandomBits(VU64& state0, VU64& state1) {
    VU64 s1 = state0;
    VU64 s0 = state1;
    const VU64 bits = Add(s1, s0);
    state0 = s0;
    s1 = Xor(s1, ShiftLeft<23>(s1));
    state1 = Xor(s1, Xor(s0, Xor(ShiftRight<18>(s1), ShiftRight<5>(s0))));
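
For reference, the vector RandomBits above applies the standard xorshift128+
update in each u64 lane. A scalar sketch of the same step (not part of this
diff):

```
#include <stdint.h>

// One xorshift128+ step: returns random bits and advances the state.
uint64_t Xorshift128PlusScalar(uint64_t& state0, uint64_t& state1) {
  uint64_t s1 = state0;
  const uint64_t s0 = state1;
  const uint64_t bits = s1 + s0;  // the value returned to the caller
  state0 = s0;
  s1 ^= s1 << 23;
  state1 = s1 ^ s0 ^ (s1 >> 18) ^ (s0 >> 5);
  return bits;
}
```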

@ -218,30 +270,34 @@ class Xorshift128Plus {
  }
};

template <typename T, class DU64, HWY_IF_NOT_FLOAT(T)>
Vec<DU64> RandomValues(DU64 du64, Vec<DU64>& s0, Vec<DU64>& s1,
                       const Vec<DU64> mask) {
  const Vec<DU64> bits = Xorshift128Plus::RandomBits(du64, s0, s1);
  return And(bits, mask);
template <class D, class VU64, HWY_IF_NOT_FLOAT_D(D)>
Vec<D> RandomValues(D d, VU64& s0, VU64& s1, const VU64 mask) {
  const VU64 bits = Xorshift128Plus::RandomBits(s0, s1);
  return BitCast(d, And(bits, mask));
}

// Important to avoid denormals, which are flushed to zero by SIMD but not
// It is important to avoid denormals, which are flushed to zero by SIMD but not
// scalar sorts, and NaN, which may be ordered differently in scalar vs. SIMD.
template <typename T, class DU64, HWY_IF_FLOAT(T)>
Vec<DU64> RandomValues(DU64 du64, Vec<DU64>& s0, Vec<DU64>& s1,
                       const Vec<DU64> mask) {
  const Vec<DU64> bits = Xorshift128Plus::RandomBits(du64, s0, s1);
  const Vec<DU64> values = And(bits, mask);
#if HWY_TARGET == HWY_SCALAR  // Cannot repartition u64 to i32
  const RebindToSigned<DU64> di;
template <class DF, class VU64, HWY_IF_FLOAT_D(DF)>
Vec<DF> RandomValues(DF df, VU64& s0, VU64& s1, const VU64 mask) {
  using TF = TFromD<DF>;
  const RebindToUnsigned<decltype(df)> du;
  using VU = Vec<decltype(du)>;

  const VU64 bits64 = And(Xorshift128Plus::RandomBits(s0, s1), mask);

#if HWY_TARGET == HWY_SCALAR  // Cannot repartition u64 to smaller types
  using TU = MakeUnsigned<TF>;
  const VU bits = Set(du, static_cast<TU>(GetLane(bits64) & LimitsMax<TU>()));
#else
  const Repartition<MakeSigned<T>, DU64> di;
  const VU bits = BitCast(du, bits64);
#endif
  const RebindToFloat<decltype(di)> df;
  // Avoid NaN/denormal by converting from (range-limited) integer.
  const Vec<DU64> no_nan =
      And(values, Set(du64, MantissaMask<MakeUnsigned<T>>()));
  return BitCast(du64, ConvertTo(df, BitCast(di, no_nan)));
  // Avoid NaN/denormal by only generating values in [1, 2), i.e. random
  // mantissas with the exponent taken from the representation of 1.0.
  const VU k1 = BitCast(du, Set(df, TF{1.0}));
  const VU mantissa_mask = Set(du, MantissaMask<TF>());
  const VU representation = OrAnd(k1, bits, mantissa_mask);
  return BitCast(df, representation);
}
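
The float path above avoids NaN and denormals by construction. A scalar sketch
of the same bit trick (not part of this diff): OR random mantissa bits into the
bit pattern of 1.0f, which fixes the exponent, so the result always lies in
[1, 2).

```
#include <stdint.h>
#include <string.h>

float RandomFloatIn1To2(uint32_t random_bits) {
  const uint32_t kOne = 0x3F800000u;           // bit pattern of 1.0f
  const uint32_t kMantissaMask = 0x007FFFFFu;  // low 23 bits
  const uint32_t representation = kOne | (random_bits & kMantissaMask);
  float result;
  memcpy(&result, &representation, sizeof(result));  // portable bit cast
  return result;
}
```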

template <class DU64>

@ -269,29 +325,29 @@ InputStats<T> GenerateInput(const Dist dist, T* v, size_t num) {
  SortTag<uint64_t> du64;
  using VU64 = Vec<decltype(du64)>;
  const size_t N64 = Lanes(du64);
  auto buf = hwy::AllocateAligned<uint64_t>(2 * N64);
  Xorshift128Plus::GenerateSeeds(du64, buf.get());
  auto s0 = Load(du64, buf.get());
  auto s1 = Load(du64, buf.get() + N64);

  const VU64 mask = MaskForDist(du64, dist, sizeof(T));
  auto seeds = hwy::AllocateAligned<uint64_t>(2 * N64);
  Xorshift128Plus::GenerateSeeds(du64, seeds.get());
  VU64 s0 = Load(du64, seeds.get());
  VU64 s1 = Load(du64, seeds.get() + N64);

#if HWY_TARGET == HWY_SCALAR
  const Sisd<T> d;
#else
  const Repartition<T, decltype(du64)> d;
#endif
  using V = Vec<decltype(d)>;
  const size_t N = Lanes(d);
  const VU64 mask = MaskForDist(du64, dist, sizeof(T));
  auto buf = hwy::AllocateAligned<T>(N);

  size_t i = 0;
  for (; i + N <= num; i += N) {
    const VU64 bits = RandomValues<T>(du64, s0, s1, mask);
#if HWY_ARCH_RVV
    // v may not be 64-bit aligned
    StoreU(bits, du64, buf.get());
    memcpy(v + i, buf.get(), N64 * sizeof(uint64_t));
#else
    StoreU(bits, du64, reinterpret_cast<uint64_t*>(v + i));
#endif
    const V values = RandomValues(d, s0, s1, mask);
    StoreU(values, d, v + i);
  }
  if (i < num) {
    const VU64 bits = RandomValues<T>(du64, s0, s1, mask);
    StoreU(bits, du64, buf.get());
    const V values = RandomValues(d, s0, s1, mask);
    StoreU(values, d, buf.get());
    memcpy(v + i, buf.get(), (num - i) * sizeof(T));
  }

@ -308,18 +364,65 @@ struct ThreadLocal {

struct SharedState {
#if HAVE_PARALLEL_IPS4O
  ips4o::StdThreadPool pool{
      HWY_MIN(16, static_cast<int>(std::thread::hardware_concurrency() / 2))};
  const unsigned max_threads = hwy::LimitsMax<unsigned>();  // 16 for Table 1a
  ips4o::StdThreadPool pool{static_cast<int>(
      HWY_MIN(max_threads, std::thread::hardware_concurrency() / 2))};
#endif
  std::vector<ThreadLocal> tls{1};
};

template <class Order, typename T>
void Run(Algo algo, T* HWY_RESTRICT inout, size_t num, SharedState& shared,
         size_t thread) {
  using detail::HeapSort;
  using detail::LaneTraits;
// Bridge from keys (passed to Run) to lanes as expected by HeapSort. For
// non-128-bit keys they are the same:
template <class Order, typename KeyType, HWY_IF_NOT_LANE_SIZE(KeyType, 16)>
void CallHeapSort(KeyType* HWY_RESTRICT keys, const size_t num_keys) {
  using detail::TraitsLane;
  using detail::SharedTraits;
  if (Order().IsAscending()) {
    const SharedTraits<TraitsLane<detail::OrderAscending<KeyType>>> st;
    return detail::HeapSort(st, keys, num_keys);
  } else {
    const SharedTraits<TraitsLane<detail::OrderDescending<KeyType>>> st;
    return detail::HeapSort(st, keys, num_keys);
  }
}

#if VQSORT_ENABLED
template <class Order>
void CallHeapSort(hwy::uint128_t* HWY_RESTRICT keys, const size_t num_keys) {
  using detail::SharedTraits;
  using detail::Traits128;
  uint64_t* lanes = reinterpret_cast<uint64_t*>(keys);
  const size_t num_lanes = num_keys * 2;
  if (Order().IsAscending()) {
    const SharedTraits<Traits128<detail::OrderAscending128>> st;
    return detail::HeapSort(st, lanes, num_lanes);
  } else {
    const SharedTraits<Traits128<detail::OrderDescending128>> st;
    return detail::HeapSort(st, lanes, num_lanes);
  }
}

template <class Order>
void CallHeapSort(K64V64* HWY_RESTRICT keys, const size_t num_keys) {
  using detail::SharedTraits;
  using detail::Traits128;
  uint64_t* lanes = reinterpret_cast<uint64_t*>(keys);
  const size_t num_lanes = num_keys * 2;
  if (Order().IsAscending()) {
    const SharedTraits<Traits128<detail::OrderAscendingKV128>> st;
    return detail::HeapSort(st, lanes, num_lanes);
  } else {
    const SharedTraits<Traits128<detail::OrderDescendingKV128>> st;
    return detail::HeapSort(st, lanes, num_lanes);
  }
}
#endif  // VQSORT_ENABLED

template <class Order, typename KeyType>
void Run(Algo algo, KeyType* HWY_RESTRICT inout, size_t num,
         SharedState& shared, size_t thread) {
  const std::less<KeyType> less;
  const std::greater<KeyType> greater;

  switch (algo) {
#if HAVE_AVX2SORT

@ -330,18 +433,18 @@ void Run(Algo algo, T* HWY_RESTRICT inout, size_t num, SharedState& shared,
#if HAVE_IPS4O
    case Algo::kIPS4O:
      if (Order().IsAscending()) {
        return ips4o::sort(inout, inout + num, std::less<T>());
        return ips4o::sort(inout, inout + num, less);
      } else {
        return ips4o::sort(inout, inout + num, std::greater<T>());
        return ips4o::sort(inout, inout + num, greater);
      }
#endif

#if HAVE_PARALLEL_IPS4O
    case Algo::kParallelIPS4O:
      if (Order().IsAscending()) {
        return ips4o::parallel::sort(inout, inout + num, std::less<T>());
        return ips4o::parallel::sort(inout, inout + num, less, shared.pool);
      } else {
        return ips4o::parallel::sort(inout, inout + num, std::greater<T>());
        return ips4o::parallel::sort(inout, inout + num, greater, shared.pool);
      }
#endif

@ -354,33 +457,47 @@ void Run(Algo algo, T* HWY_RESTRICT inout, size_t num, SharedState& shared,
#if HAVE_PDQSORT
    case Algo::kPDQ:
      if (Order().IsAscending()) {
        return boost::sort::pdqsort_branchless(inout, inout + num,
                                               std::less<T>());
        return boost::sort::pdqsort_branchless(inout, inout + num, less);
      } else {
        return boost::sort::pdqsort_branchless(inout, inout + num,
                                               std::greater<T>());
        return boost::sort::pdqsort_branchless(inout, inout + num, greater);
      }
#endif

#if HAVE_VXSORT
    case Algo::kVXSort: {
#if (VXSORT_AVX3 && HWY_TARGET != HWY_AVX3) || \
    (!VXSORT_AVX3 && HWY_TARGET != HWY_AVX2)
      fprintf(stderr, "Do not call for target %s\n",
              hwy::TargetName(HWY_TARGET));
      return;
#else
#if VXSORT_AVX3
      vxsort::vxsort<KeyType, vxsort::AVX512> vx;
#else
      vxsort::vxsort<KeyType, vxsort::AVX2> vx;
#endif
      if (Order().IsAscending()) {
        return vx.sort(inout, inout + num - 1);
      } else {
        fprintf(stderr, "Skipping VX - does not support descending order\n");
        return;
      }
#endif  // enabled for this target
    }
#endif  // HAVE_VXSORT

    case Algo::kStd:
      if (Order().IsAscending()) {
        return std::sort(inout, inout + num, std::less<T>());
        return std::sort(inout, inout + num, less);
      } else {
        return std::sort(inout, inout + num, std::greater<T>());
        return std::sort(inout, inout + num, greater);
      }

    case Algo::kVQSort:
      return shared.tls[thread].sorter(inout, num, Order());

    case Algo::kHeap:
      HWY_ASSERT(sizeof(T) < 16);
      if (Order().IsAscending()) {
        const SharedTraits<LaneTraits<detail::OrderAscending>> st;
        return HeapSort(st, inout, num);
      } else {
        const SharedTraits<LaneTraits<detail::OrderDescending>> st;
        return HeapSort(st, inout, num);
      }
      return CallHeapSort<Order>(inout, num);

    default:
      HWY_ABORT("Not implemented");

@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

@ -15,20 +16,6 @@
// Concurrent, independent sorts for generating more memory traffic and testing
// scalability.

// clang-format off
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/bench_parallel.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/algo-inl.h"
#include "hwy/contrib/sort/result-inl.h"
#include "hwy/aligned_allocator.h"
// Last
#include "hwy/tests/test_util-inl.h"
// clang-format on

#include <stdint.h>
#include <stdio.h>

@ -40,18 +27,29 @@
#include <utility>
#include <vector>

// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/bench_parallel.cc"  //NOLINT
#include "hwy/foreach_target.h"  // IWYU pragma: keep

// After foreach_target
#include "hwy/contrib/sort/algo-inl.h"
#include "hwy/contrib/sort/result-inl.h"
#include "hwy/aligned_allocator.h"
// Last
#include "hwy/tests/test_util-inl.h"
// clang-format on

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace {

#if HWY_TARGET != HWY_SCALAR

class ThreadPool {
 public:
  // Starts the given number of worker threads and blocks until they are ready.
  explicit ThreadPool(
      const size_t num_threads = std::thread::hardware_concurrency() / 2)
      const size_t num_threads = std::thread::hardware_concurrency())
      : num_threads_(num_threads) {
    HWY_ASSERT(num_threads_ > 0);
    threads_.reserve(num_threads_);

@ -168,36 +166,42 @@ class ThreadPool {
  const void* data_;  // points to caller's Func
};

template <class Order, typename T>
void RunWithoutVerify(const Dist dist, const size_t num, const Algo algo,
                      SharedState& shared, size_t thread) {
  auto aligned = hwy::AllocateAligned<T>(num);
template <class Traits>
void RunWithoutVerify(Traits st, const Dist dist, const size_t num_keys,
                      const Algo algo, SharedState& shared, size_t thread) {
  using LaneType = typename Traits::LaneType;
  using KeyType = typename Traits::KeyType;
  using Order = typename Traits::Order;
  const size_t num_lanes = num_keys * st.LanesPerKey();
  auto aligned = hwy::AllocateAligned<LaneType>(num_lanes);

  (void)GenerateInput(dist, aligned.get(), num);
  (void)GenerateInput(dist, aligned.get(), num_lanes);

  const Timestamp t0;
  Run<Order>(algo, aligned.get(), num, shared, thread);
  HWY_ASSERT(aligned[0] < aligned[num - 1]);
  Run<Order>(algo, reinterpret_cast<KeyType*>(aligned.get()), num_keys, shared,
             thread);
  HWY_ASSERT(aligned[0] < aligned[num_lanes - 1]);
}

void BenchParallel() {
  // Not interested in benchmark results for other targets
  if (HWY_TARGET != HWY_AVX3) return;
  // Not interested in benchmark results for other targets on x86
  if (HWY_ARCH_X86 && (HWY_TARGET != HWY_AVX2 && HWY_TARGET != HWY_AVX3)) {
    return;
  }

  ThreadPool pool;
  const size_t NT = pool.NumThreads();

  using T = int64_t;
  detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;

  size_t num = 100 * 1000 * 1000;
  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<int64_t>>> st;
  using KeyType = typename decltype(st)::KeyType;
  const size_t num_keys = size_t{100} * 1000 * 1000;

#if HAVE_IPS4O
  const Algo algo = Algo::kIPS4O;
#else
  const Algo algo = Algo::kVQSort;
#endif
  const Dist dist = Dist::kUniform16;
  const Dist dist = Dist::kUniform32;

  SharedState shared;
  shared.tls.resize(NT);

@ -207,18 +211,15 @@ void BenchParallel() {
    Timestamp t0;
    // Default capture because MSVC wants algo/dist but clang does not.
    pool.RunOnThreads(nt, [=, &shared](size_t thread) {
      RunWithoutVerify<SortAscending, T>(dist, num, algo, shared, thread);
      RunWithoutVerify(st, dist, num_keys, algo, shared, thread);
    });
    const double sec = SecondsSince(t0);
    results.push_back(MakeResult<T>(algo, dist, st, num, nt, sec));
    results.emplace_back(algo, dist, num_keys, nt, sec, sizeof(KeyType),
                         st.KeyString());
    results.back().Print();
  }
}

#else
void BenchParallel() {}
#endif

}  // namespace
// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE

@ -234,10 +235,4 @@ HWY_EXPORT_AND_TEST_P(BenchParallel, BenchParallel);
}  // namespace
}  // namespace hwy

// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}

#endif  // HWY_ONCE
@ -1,4 +1,5 @@
|
|||
// Copyright 2021 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
|
@ -12,71 +13,87 @@
|
|||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// clang-format off
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/bench_sort.cc"
|
||||
#include "hwy/foreach_target.h"
|
||||
|
||||
// After foreach_target
|
||||
#include "hwy/contrib/sort/algo-inl.h"
|
||||
#include "hwy/contrib/sort/result-inl.h"
|
||||
#include "hwy/contrib/sort/vqsort.h"
|
||||
#include "hwy/contrib/sort/sorting_networks-inl.h" // SharedTraits
|
||||
#include "hwy/contrib/sort/traits-inl.h"
|
||||
#include "hwy/contrib/sort/traits128-inl.h"
|
||||
#include "hwy/tests/test_util-inl.h"
|
||||
// clang-format on
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h> // memcpy
|
||||
|
||||
#include <vector>
|
||||
|
||||
// clang-format off
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/bench_sort.cc"
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
|
||||
// After foreach_target
|
||||
#include "hwy/contrib/sort/algo-inl.h"
|
||||
#include "hwy/contrib/sort/result-inl.h"
|
||||
#include "hwy/contrib/sort/sorting_networks-inl.h" // SharedTraits
|
||||
#include "hwy/contrib/sort/traits-inl.h"
|
||||
#include "hwy/contrib/sort/traits128-inl.h"
|
||||
#include "hwy/tests/test_util-inl.h"
|
||||
// clang-format on
|
||||
|
||||
// Mode for larger sorts because M1 is able to access more than the per-core
|
||||
// share of L2, so 1M elements might still be in cache.
|
||||
#define SORT_100M 0
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
// Defined within HWY_ONCE, used by BenchAllSort.
|
||||
extern int64_t first_sort_target;
|
||||
|
||||
namespace HWY_NAMESPACE {
|
||||
namespace {
|
||||
using detail::LaneTraits;
|
||||
using detail::TraitsLane;
|
||||
using detail::OrderAscending;
|
||||
using detail::OrderDescending;
|
||||
using detail::SharedTraits;
|
||||
|
||||
#if HWY_TARGET != HWY_SCALAR
|
||||
#if VQSORT_ENABLED || HWY_IDE
|
||||
using detail::OrderAscending128;
|
||||
using detail::OrderDescending128;
|
||||
using detail::Traits128;
|
||||
|
||||
template <class Traits, typename T>
|
||||
template <class Traits>
|
||||
HWY_NOINLINE void BenchPartition() {
|
||||
const SortTag<T> d;
|
||||
using LaneType = typename Traits::LaneType;
|
||||
using KeyType = typename Traits::KeyType;
|
||||
const SortTag<LaneType> d;
|
||||
detail::SharedTraits<Traits> st;
|
||||
const Dist dist = Dist::kUniform8;
|
||||
double sum = 0.0;
|
||||
|
||||
detail::Generator rng(&sum, 123); // for ChoosePivot
|
||||
|
||||
const size_t max_log2 = AdjustedLog2Reps(20);
|
||||
for (size_t log2 = max_log2; log2 < max_log2 + 1; ++log2) {
|
||||
const size_t num = 1ull << log2;
|
||||
auto aligned = hwy::AllocateAligned<T>(num);
|
||||
auto buf =
|
||||
hwy::AllocateAligned<T>(hwy::SortConstants::PartitionBufNum(Lanes(d)));
|
||||
const size_t num_lanes = 1ull << log2;
|
||||
const size_t num_keys = num_lanes / st.LanesPerKey();
|
||||
auto aligned = hwy::AllocateAligned<LaneType>(num_lanes);
|
||||
auto buf = hwy::AllocateAligned<LaneType>(
|
||||
HWY_MAX(hwy::SortConstants::PartitionBufNum(Lanes(d)),
|
||||
hwy::SortConstants::PivotBufNum(sizeof(LaneType), Lanes(d))));
|
||||
|
||||
std::vector<double> seconds;
|
||||
const size_t num_reps = (1ull << (14 - log2 / 2)) * kReps;
|
||||
const size_t num_reps = (1ull << (14 - log2 / 2)) * 30;
|
||||
for (size_t rep = 0; rep < num_reps; ++rep) {
|
||||
(void)GenerateInput(dist, aligned.get(), num);
|
||||
(void)GenerateInput(dist, aligned.get(), num_lanes);
|
||||
|
||||
// The pivot value can influence performance. Do exactly what vqsort will
|
||||
// do so that the performance (influenced by prefetching and branch
|
||||
// prediction) is likely to predict the actual performance inside vqsort.
|
||||
const auto pivot = detail::ChoosePivot(d, st, aligned.get(), 0, num_lanes,
|
||||
buf.get(), rng);
|
||||
|
||||
const Timestamp t0;
|
||||
|
||||
detail::Partition(d, st, aligned.get(), 0, num - 1, Set(d, T(128)),
|
||||
detail::Partition(d, st, aligned.get(), 0, num_lanes - 1, pivot,
|
||||
buf.get());
|
||||
seconds.push_back(SecondsSince(t0));
|
||||
// 'Use' the result to prevent optimizing out the partition.
|
||||
sum += static_cast<double>(aligned.get()[num / 2]);
|
||||
sum += static_cast<double>(aligned.get()[num_lanes / 2]);
|
||||
}
|
||||
|
||||
MakeResult<T>(Algo::kVQSort, dist, st, num, 1,
|
||||
SummarizeMeasurements(seconds))
|
||||
Result(Algo::kVQSort, dist, num_keys, 1, SummarizeMeasurements(seconds),
|
||||
sizeof(KeyType), st.KeyString())
|
||||
.Print();
|
||||
}
|
||||
HWY_ASSERT(sum != 999999); // Prevent optimizing out
|
||||
|
@ -84,52 +101,60 @@ HWY_NOINLINE void BenchPartition() {
|
|||
|
||||
HWY_NOINLINE void BenchAllPartition() {
|
||||
// Not interested in benchmark results for these targets
|
||||
if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4 ||
|
||||
HWY_TARGET == HWY_AVX2) {
|
||||
if (HWY_TARGET == HWY_SSSE3) {
|
||||
return;
|
||||
}
|
||||
|
||||
BenchPartition<LaneTraits<OrderDescending>, float>();
|
||||
BenchPartition<LaneTraits<OrderAscending>, int64_t>();
|
||||
BenchPartition<Traits128<OrderDescending128>, uint64_t>();
|
||||
BenchPartition<TraitsLane<OrderDescending<float>>>();
|
||||
BenchPartition<TraitsLane<OrderDescending<int32_t>>>();
|
||||
BenchPartition<TraitsLane<OrderDescending<int64_t>>>();
|
||||
BenchPartition<Traits128<OrderAscending128>>();
|
||||
// BenchPartition<Traits128<OrderDescending128>>();
|
||||
// BenchPartition<Traits128<OrderAscendingKV128>>();
|
||||
}
|
||||
|
||||
template <class Traits, typename T>
|
||||
template <class Traits>
|
||||
HWY_NOINLINE void BenchBase(std::vector<Result>& results) {
|
||||
// Not interested in benchmark results for these targets
|
||||
if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4) {
|
||||
return;
|
||||
}
|
||||
|
||||
const SortTag<T> d;
|
||||
using LaneType = typename Traits::LaneType;
|
||||
using KeyType = typename Traits::KeyType;
|
||||
const SortTag<LaneType> d;
|
||||
detail::SharedTraits<Traits> st;
|
||||
const Dist dist = Dist::kUniform32;
|
||||
|
||||
const size_t N = Lanes(d);
|
||||
const size_t num = SortConstants::BaseCaseNum(N);
|
||||
auto keys = hwy::AllocateAligned<T>(num);
|
||||
auto buf = hwy::AllocateAligned<T>(num + N);
|
||||
const size_t num_lanes = SortConstants::BaseCaseNum(N);
|
||||
const size_t num_keys = num_lanes / st.LanesPerKey();
|
||||
auto keys = hwy::AllocateAligned<LaneType>(num_lanes);
|
||||
auto buf = hwy::AllocateAligned<LaneType>(num_lanes + N);
|
||||
|
||||
std::vector<double> seconds;
|
||||
double sum = 0; // prevents elision
|
||||
constexpr size_t kMul = AdjustedReps(600); // ensures long enough to measure
|
||||
|
||||
for (size_t rep = 0; rep < kReps; ++rep) {
|
||||
InputStats<T> input_stats = GenerateInput(dist, keys.get(), num);
|
||||
for (size_t rep = 0; rep < 30; ++rep) {
|
||||
InputStats<LaneType> input_stats =
|
||||
GenerateInput(dist, keys.get(), num_lanes);
|
||||
|
||||
const Timestamp t0;
|
||||
for (size_t i = 0; i < kMul; ++i) {
|
||||
detail::BaseCase(d, st, keys.get(), num, buf.get());
|
||||
detail::BaseCase(d, st, keys.get(), keys.get() + num_lanes, num_lanes,
|
||||
buf.get());
|
||||
sum += static_cast<double>(keys[0]);
|
||||
}
|
||||
seconds.push_back(SecondsSince(t0));
|
||||
// printf("%f\n", seconds.back());
|
||||
|
||||
HWY_ASSERT(VerifySort(st, input_stats, keys.get(), num, "BenchBase"));
|
||||
HWY_ASSERT(VerifySort(st, input_stats, keys.get(), num_lanes, "BenchBase"));
|
||||
}
|
||||
HWY_ASSERT(sum < 1E99);
|
||||
results.push_back(MakeResult<T>(Algo::kVQSort, dist, st, num * kMul, 1,
|
||||
SummarizeMeasurements(seconds)));
|
||||
results.emplace_back(Algo::kVQSort, dist, num_keys * kMul, 1,
|
||||
SummarizeMeasurements(seconds), sizeof(KeyType),
|
||||
st.KeyString());
|
||||
}
|
||||
|
||||
HWY_NOINLINE void BenchAllBase() {
|
||||
|
@ -139,14 +164,19 @@ HWY_NOINLINE void BenchAllBase() {
|
|||
}
|
||||
|
||||
std::vector<Result> results;
|
||||
BenchBase<LaneTraits<OrderAscending>, float>(results);
|
||||
BenchBase<LaneTraits<OrderDescending>, int64_t>(results);
|
||||
BenchBase<Traits128<OrderAscending128>, uint64_t>(results);
|
||||
BenchBase<TraitsLane<OrderAscending<float>>>(results);
|
||||
BenchBase<TraitsLane<OrderDescending<int64_t>>>(results);
|
||||
BenchBase<Traits128<OrderAscending128>>(results);
|
||||
for (const Result& r : results) {
|
||||
r.Print();
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
void BenchAllPartition() {}
|
||||
void BenchAllBase() {}
|
||||
#endif // VQSORT_ENABLED
|
||||
|
||||
std::vector<Algo> AlgoForBench() {
|
||||
return {
|
||||
#if HAVE_AVX2SORT
|
||||
|
@ -154,8 +184,7 @@ std::vector<Algo> AlgoForBench() {
|
|||
#endif
|
||||
#if HAVE_PARALLEL_IPS4O
|
||||
Algo::kParallelIPS4O,
|
||||
#endif
|
||||
#if HAVE_IPS4O
|
||||
#elif HAVE_IPS4O
|
||||
Algo::kIPS4O,
|
||||
#endif
|
||||
#if HAVE_PDQSORT
|
||||
|
@ -164,33 +193,64 @@ std::vector<Algo> AlgoForBench() {
|
|||
#if HAVE_SORT512
|
||||
Algo::kSort512,
|
||||
#endif
|
||||
// Algo::kStd, // too slow to always benchmark
|
||||
// Algo::kHeap, // too slow to always benchmark
|
||||
Algo::kVQSort,
|
||||
// Only include if we're compiling for the target it supports.
|
||||
#if HAVE_VXSORT && ((VXSORT_AVX3 && HWY_TARGET == HWY_AVX3) || \
|
||||
(!VXSORT_AVX3 && HWY_TARGET == HWY_AVX2))
|
||||
Algo::kVXSort,
|
||||
#endif
|
||||
|
||||
#if !HAVE_PARALLEL_IPS4O
|
||||
#if !SORT_100M
|
||||
// These are 10-20x slower, but that's OK for the default size when we
|
||||
// are not testing the parallel nor 100M modes.
|
||||
Algo::kStd, Algo::kHeap,
|
||||
#endif
|
||||
|
||||
Algo::kVQSort, // only ~4x slower, but not required for Table 1a
|
||||
#endif
|
||||
};
|
||||
}
|
||||
|
||||
template <class Traits, typename T>
HWY_NOINLINE void BenchSort(size_t num) {
template <class Traits>
HWY_NOINLINE void BenchSort(size_t num_keys) {
if (first_sort_target == 0) first_sort_target = HWY_TARGET;

SharedState shared;
detail::SharedTraits<Traits> st;
auto aligned = hwy::AllocateAligned<T>(num);
using Order = typename Traits::Order;
using LaneType = typename Traits::LaneType;
using KeyType = typename Traits::KeyType;
const size_t num_lanes = num_keys * st.LanesPerKey();
auto aligned = hwy::AllocateAligned<LaneType>(num_lanes);

const size_t reps = num_keys > 1000 * 1000 ? 10 : 30;

for (Algo algo : AlgoForBench()) {
// Other algorithms don't depend on the vector instructions, so only run
// them for the first target.
#if !HAVE_VXSORT
if (algo != Algo::kVQSort && HWY_TARGET != first_sort_target) {
continue;
}
#endif

for (Dist dist : AllDist()) {
std::vector<double> seconds;
for (size_t rep = 0; rep < kReps; ++rep) {
InputStats<T> input_stats = GenerateInput(dist, aligned.get(), num);
for (size_t rep = 0; rep < reps; ++rep) {
InputStats<LaneType> input_stats =
GenerateInput(dist, aligned.get(), num_lanes);

const Timestamp t0;
Run<typename Traits::Order>(algo, aligned.get(), num, shared,
/*thread=*/0);
Run<Order>(algo, reinterpret_cast<KeyType*>(aligned.get()), num_keys,
shared, /*thread=*/0);
seconds.push_back(SecondsSince(t0));
// printf("%f\n", seconds.back());

HWY_ASSERT(
VerifySort(st, input_stats, aligned.get(), num, "BenchSort"));
VerifySort(st, input_stats, aligned.get(), num_lanes, "BenchSort"));
}
MakeResult<T>(algo, dist, st, num, 1, SummarizeMeasurements(seconds))
Result(algo, dist, num_keys, 1, SummarizeMeasurements(seconds),
sizeof(KeyType), st.KeyString())
.Print();
}  // dist
}  // algo
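
// Note on the key/lane distinction introduced above (an illustrative aside,
// not part of the patch): a 128-bit key occupies two 64-bit lanes, so
// LanesPerKey() is 2 for Traits128 and 1 for TraitsLane, and buffers are
// sized in lanes rather than keys:
//   const size_t num_lanes = num_keys * st.LanesPerKey();
// Sorting 1M 128-bit keys therefore allocates 2M uint64_t lanes (16 MiB).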
@ -198,41 +258,40 @@ HWY_NOINLINE void BenchSort(size_t num) {

HWY_NOINLINE void BenchAllSort() {
// Not interested in benchmark results for these targets
if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4) {
if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4 ||
HWY_TARGET == HWY_EMU128) {
return;
}
// Only enable EMU128 on x86 - it's slow on emulators.
if (!HWY_ARCH_X86 && (HWY_TARGET == HWY_EMU128)) return;

constexpr size_t K = 1000;
constexpr size_t M = K * K;
(void)K;
(void)M;
for (size_t num : {
#if HAVE_PARALLEL_IPS4O
for (size_t num_keys : {
#if HAVE_PARALLEL_IPS4O || SORT_100M
100 * M,
#else
AdjustedReps(1 * M),
1 * M,
#endif
}) {
// BenchSort<LaneTraits<OrderAscending>, float>(num);
// BenchSort<LaneTraits<OrderDescending>, double>(num);
// BenchSort<LaneTraits<OrderAscending>, int16_t>(num);
BenchSort<LaneTraits<OrderDescending>, int32_t>(num);
BenchSort<LaneTraits<OrderAscending>, int64_t>(num);
// BenchSort<LaneTraits<OrderDescending>, uint16_t>(num);
// BenchSort<LaneTraits<OrderDescending>, uint32_t>(num);
// BenchSort<LaneTraits<OrderAscending>, uint64_t>(num);
BenchSort<TraitsLane<OrderAscending<float>>>(num_keys);
// BenchSort<TraitsLane<OrderDescending<double>>>(num_keys);
// BenchSort<TraitsLane<OrderAscending<int16_t>>>(num_keys);
BenchSort<TraitsLane<OrderDescending<int32_t>>>(num_keys);
BenchSort<TraitsLane<OrderAscending<int64_t>>>(num_keys);
// BenchSort<TraitsLane<OrderDescending<uint16_t>>>(num_keys);
// BenchSort<TraitsLane<OrderDescending<uint32_t>>>(num_keys);
// BenchSort<TraitsLane<OrderAscending<uint64_t>>>(num_keys);

BenchSort<Traits128<OrderAscending128>, uint64_t>(num);
// BenchSort<Traits128<OrderAscending128>, uint64_t>(num);
#if !HAVE_VXSORT && VQSORT_ENABLED
BenchSort<Traits128<OrderAscending128>>(num_keys);
// BenchSort<Traits128<OrderAscendingKV128>>(num_keys);
#endif
}
}

#else
void BenchAllPartition() {}
void BenchAllBase() {}
void BenchAllSort() {}
#endif

}  // namespace
// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
@ -242,6 +301,7 @@ HWY_AFTER_NAMESPACE();
#if HWY_ONCE

namespace hwy {
int64_t first_sort_target = 0;  // none run yet
namespace {
HWY_BEFORE_TEST(BenchSort);
HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllPartition);
@ -250,10 +310,4 @@ HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllSort);
}  // namespace
}  // namespace hwy

// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}

#endif  // HWY_ONCE

@ -1,30 +0,0 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Speed up MSVC builds by building fewer targets. This header must be included
// from all TUs that contain a HWY_DYNAMIC_DISPATCH to vqsort, i.e. vqsort_*.cc.
// However, users of vqsort.h are unaffected.

#ifndef HIGHWAY_HWY_CONTRIB_SORT_DISABLED_TARGETS_H_
#define HIGHWAY_HWY_CONTRIB_SORT_DISABLED_TARGETS_H_

#include "hwy/base.h"

#if HWY_COMPILER_MSVC
#undef HWY_DISABLED_TARGETS
// HWY_SCALAR remains, so there will still be a valid target to call.
#define HWY_DISABLED_TARGETS (HWY_SSSE3 | HWY_SSE4)
#endif  // HWY_COMPILER_MSVC

#endif  // HIGHWAY_HWY_CONTRIB_SORT_DISABLED_TARGETS_H_
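
// Context sketch (an assumption based on the header's own comment, not part
// of the patch): before this deletion, a vqsort_*.cc translation unit would
// opt in by including the header before any Highway header, e.g.
//   #include "hwy/contrib/sort/disabled_targets.h"  // before highway.h
//   #include "hwy/highway.h"
// The VQSORT_ENABLED macro added to the sort headers below appears to
// subsume this mechanism.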

@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -33,20 +34,19 @@ struct Timestamp {
double t;
};

double SecondsSince(const Timestamp& t0) {
static inline double SecondsSince(const Timestamp& t0) {
const Timestamp t1;
return t1.t - t0.t;
}

constexpr size_t kReps = 30;

// Returns trimmed mean (we don't want to run an out-of-L3-cache sort often
// enough for the mode to be reliable).
double SummarizeMeasurements(std::vector<double>& seconds) {
static inline double SummarizeMeasurements(std::vector<double>& seconds) {
std::sort(seconds.begin(), seconds.end());
double sum = 0;
int count = 0;
for (size_t i = kReps / 4; i < seconds.size() - kReps / 2; ++i) {
const size_t num = seconds.size();
for (size_t i = num / 4; i < num / 2; ++i) {
sum += seconds[i];
count += 1;
}
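
// Worked example of the revised trimmed mean above (illustrative, not part
// of the patch): after sorting, only indices [num/4, num/2) are averaged,
// discarding the fastest quarter and the slowest half. With 8 sorted samples
// {1, 1, 2, 2, 3, 3, 9, 9}, indices 2..3 are kept: (2 + 2) / 2 = 2, so the
// two outliers of 9 do not skew the result.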
@ -71,72 +71,62 @@ namespace HWY_NAMESPACE {

struct Result {
Result() {}
Result(const uint32_t target, const Algo algo, Dist dist, bool is128,
size_t num, size_t num_threads, double sec, size_t sizeof_t,
const char* type_name)
: target(target),
Result(const Algo algo, Dist dist, size_t num_keys, size_t num_threads,
double sec, size_t sizeof_key, const std::string& key_name)
: target(HWY_TARGET),
algo(algo),
dist(dist),
is128(is128),
num(num),
num_keys(num_keys),
num_threads(num_threads),
sec(sec),
sizeof_t(sizeof_t),
type_name(type_name) {}
sizeof_key(sizeof_key),
key_name(key_name) {}

void Print() const {
const double bytes = static_cast<double>(num) *
const double bytes = static_cast<double>(num_keys) *
static_cast<double>(num_threads) *
static_cast<double>(sizeof_t);
static_cast<double>(sizeof_key);
printf("%10s: %12s: %7s: %9s: %.2E %4.0f MB/s (%2zu threads)\n",
hwy::TargetName(target), AlgoName(algo),
is128 ? "u128" : type_name.c_str(), DistName(dist),
static_cast<double>(num), bytes * 1E-6 / sec, num_threads);
hwy::TargetName(target), AlgoName(algo), key_name.c_str(),
DistName(dist), static_cast<double>(num_keys), bytes * 1E-6 / sec,
num_threads);
}

uint32_t target;
int64_t target;
Algo algo;
Dist dist;
bool is128;
size_t num = 0;
size_t num_keys = 0;
size_t num_threads = 0;
double sec = 0.0;
size_t sizeof_t = 0;
std::string type_name;
size_t sizeof_key = 0;
std::string key_name;
};

template <typename T, class Traits>
Result MakeResult(const Algo algo, Dist dist, Traits st, size_t num,
size_t num_threads, double sec) {
char string100[100];
hwy::detail::TypeName(hwy::detail::MakeTypeInfo<T>(), 1, string100);
return Result(HWY_TARGET, algo, dist, st.Is128(), num, num_threads, sec,
sizeof(T), string100);
}
template <class Traits, typename LaneType>
bool VerifySort(Traits st, const InputStats<LaneType>& input_stats,
const LaneType* out, size_t num_lanes, const char* caller) {
constexpr size_t N1 = st.LanesPerKey();
HWY_ASSERT(num_lanes >= N1);

template <class Traits, typename T>
bool VerifySort(Traits st, const InputStats<T>& input_stats, const T* out,
size_t num, const char* caller) {
constexpr size_t N1 = st.Is128() ? 2 : 1;
HWY_ASSERT(num >= N1);

InputStats<T> output_stats;
InputStats<LaneType> output_stats;
// Ensure it matches the sort order
for (size_t i = 0; i < num - N1; i += N1) {
for (size_t i = 0; i < num_lanes - N1; i += N1) {
output_stats.Notify(out[i]);
if (N1 == 2) output_stats.Notify(out[i + 1]);
// Reverse order instead of checking !Compare1 so we accept equal keys.
if (st.Compare1(out + i + N1, out + i)) {
printf("%s: i=%d of %d: N1=%d %5.0f %5.0f vs. %5.0f %5.0f\n\n", caller,
static_cast<int>(i), static_cast<int>(num), static_cast<int>(N1),
double(out[i + 1]), double(out[i + 0]), double(out[i + N1 + 1]),
double(out[i + N1]));
printf("%s: i=%d of %d lanes: N1=%d %5.0f %5.0f vs. %5.0f %5.0f\n\n",
caller, static_cast<int>(i), static_cast<int>(num_lanes),
static_cast<int>(N1), static_cast<double>(out[i + 1]),
static_cast<double>(out[i + 0]),
static_cast<double>(out[i + N1 + 1]),
static_cast<double>(out[i + N1]));
HWY_ABORT("%d-bit sort is incorrect\n",
static_cast<int>(sizeof(T) * 8 * N1));
static_cast<int>(sizeof(LaneType) * 8 * N1));
}
}
output_stats.Notify(out[num - N1]);
if (N1 == 2) output_stats.Notify(out[num - N1 + 1]);
output_stats.Notify(out[num_lanes - N1]);
if (N1 == 2) output_stats.Notify(out[num_lanes - N1 + 1]);

return input_stats == output_stats;
}
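
// Sketch of the throughput formula used by Result::Print above (derived from
// the code; the numbers are hypothetical): MB/s = num_keys * num_threads *
// sizeof_key * 1E-6 / sec. For example, sorting 1M 8-byte keys on one thread
// in 0.004 s reports 1E6 * 1 * 8 * 1E-6 / 0.004 = 2000 MB/s.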

@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -27,8 +28,8 @@ namespace hwy {
struct SortConstants {
// SortingNetwork reshapes its input into a matrix. This is the maximum number
// of *keys* per vector.
#if HWY_COMPILER_MSVC
static constexpr size_t kMaxCols = 8;  // avoids build timeout
#if HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
static constexpr size_t kMaxCols = 8;  // avoid build timeout/stack overflow
#else
static constexpr size_t kMaxCols = 16;  // enough for u32 in 512-bit vector
#endif
@ -41,7 +42,7 @@ struct SortConstants {
static constexpr size_t kMaxRowsLog2 = 4;
static constexpr size_t kMaxRows = size_t{1} << kMaxRowsLog2;

static HWY_INLINE size_t BaseCaseNum(size_t N) {
static constexpr HWY_INLINE size_t BaseCaseNum(size_t N) {
return kMaxRows * HWY_MIN(N, kMaxCols);
}
@ -52,7 +53,7 @@ struct SortConstants {
// To change, must also update left + 3 * N etc. in the loop.
static constexpr size_t kPartitionUnroll = 4;

static HWY_INLINE size_t PartitionBufNum(size_t N) {
static constexpr HWY_INLINE size_t PartitionBufNum(size_t N) {
// The main loop reads kPartitionUnroll vectors, and first loads from
// both left and right beforehand, so it requires min = 2 *
// kPartitionUnroll vectors. To handle smaller amounts (only guaranteed
@ -64,9 +65,26 @@ struct SortConstants {
// Chunk := group of keys loaded for sampling a pivot. Matches the typical
// cache line size of 64 bytes to get maximum benefit per L2 miss. If vectors
// are larger, use entire vectors to ensure we do not overrun the array.
static HWY_INLINE size_t LanesPerChunk(size_t sizeof_t, size_t N) {
static constexpr HWY_INLINE size_t LanesPerChunk(size_t sizeof_t, size_t N) {
return HWY_MAX(64 / sizeof_t, N);
}

static constexpr HWY_INLINE size_t PivotBufNum(size_t sizeof_t, size_t N) {
// 3 chunks of medians, 1 chunk of median medians plus two padding vectors.
return (3 + 1) * LanesPerChunk(sizeof_t, N) + 2 * N;
}

template <typename T>
static constexpr HWY_INLINE size_t BufNum(size_t N) {
// One extra for padding plus another for full-vector loads.
return HWY_MAX(BaseCaseNum(N) + 2 * N,
HWY_MAX(PartitionBufNum(N), PivotBufNum(sizeof(T), N)));
}

template <typename T>
static constexpr HWY_INLINE size_t BufBytes(size_t vector_size) {
return sizeof(T) * BufNum<T>(vector_size / sizeof(T));
}
};

}  // namespace hwy
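
// Worked example for the constexpr helpers above (assumes the non-MSVC
// constants; values are illustrative): for uint32_t in a 64-byte vector,
// N = 16 lanes, so
//   BaseCaseNum(16) = kMaxRows * HWY_MIN(16, kMaxCols) = 16 * 16 = 256
//   PivotBufNum(4, 16) = (3 + 1) * HWY_MAX(64 / 4, 16) + 2 * 16 = 96
// BufNum<uint32_t>(16) then takes the maximum of these (plus padding), and
// BufBytes<uint32_t>(64) converts it to bytes for sizing a caller's buffer.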
@ -84,12 +102,23 @@ struct SortConstants {

#include "hwy/highway.h"

// vqsort isn't available on HWY_SCALAR, and builds time out on MSVC opt and
// Arm v7 debug.
#undef VQSORT_ENABLED
#if (HWY_TARGET == HWY_SCALAR) || \
(HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD) || \
(HWY_ARCH_ARM_V7 && HWY_IS_DEBUG_BUILD)
#define VQSORT_ENABLED 0
#else
#define VQSORT_ENABLED 1
#endif

namespace hwy {
namespace HWY_NAMESPACE {

// Default tag / vector width selector.
// TODO(janwas): enable once LMUL < 1 is supported.
#if HWY_TARGET == HWY_RVV && 0
#if HWY_TARGET == HWY_RVV
// Use LMUL = 1/2; for SEW=64 this ends up emulated via vsetvl.
template <typename T>
using SortTag = ScalableTag<T, -1>;
#else

@ -1,909 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Per-target include guard

#if defined(HIGHWAY_HWY_CONTRIB_SORT_SORT_INL_H_) == \
defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_SORT_SORT_INL_H_
#undef HIGHWAY_HWY_CONTRIB_SORT_SORT_INL_H_
#else
#define HIGHWAY_HWY_CONTRIB_SORT_SORT_INL_H_
#endif

#include <inttypes.h>

#include "hwy/aligned_allocator.h"
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

enum class SortOrder { kAscending, kDescending };

#if HWY_TARGET != HWY_SCALAR && HWY_ARCH_X86

#define HWY_SORT_VERIFY 1

constexpr inline SortOrder Reverse(SortOrder order) {
return (order == SortOrder::kAscending) ? SortOrder::kDescending
: SortOrder::kAscending;
}

namespace verify {

template <typename T>
bool Compare(T a, T b, SortOrder kOrder) {
if (kOrder == SortOrder::kAscending) return a <= b;
return a >= b;
}

#if HWY_SORT_VERIFY

template <class D>
class Runs {
using T = TFromD<D>;

public:
Runs(D d, size_t num_regs, size_t run_length = 0, bool alternating = false) {
const size_t N = Lanes(d);

buf_ = AllocateAligned<T>(N);
consecutive_ = AllocateAligned<T>(num_regs * N);

num_regs_ = num_regs;
if (run_length) {
run_length_ = run_length;
num_runs_ = num_regs * N / run_length;
is_vector_ = true;
alternating_ = alternating;
} else {
run_length_ = num_regs * 4;
num_runs_ = N / 4;
is_vector_ = false;
alternating_ = false;
}
}

void ScatterQuartets(D d, const size_t idx_reg, Vec<D> v) {
HWY_ASSERT(idx_reg < num_regs_);
const size_t N = Lanes(d);
for (size_t i = 0; i < N; i += 4) {
Store(v, d, buf_.get());
const size_t idx_q = (i / 4) * num_regs_ + idx_reg;
CopyBytes<16>(buf_.get() + i, consecutive_.get() + idx_q * 4);
}
}

void StoreVector(D d, const size_t idx_reg, Vec<D> v) {
HWY_ASSERT(idx_reg < num_regs_);
Store(v, d, &consecutive_[idx_reg * Lanes(d)]);
}

bool IsBitonic() const {
HWY_ASSERT(!alternating_);
for (size_t ir = 0; ir < num_runs_; ++ir) {
const T* p = &consecutive_[ir * run_length_];
bool is_asc = true;
bool is_desc = true;
bool is_zero = true;

for (size_t i = 0; i < run_length_ / 2 - 1; ++i) {
is_asc &= (p[i] <= p[i + 1]);
is_desc &= (p[i] >= p[i + 1]);
}
for (size_t i = 0; i < run_length_; ++i) {
is_zero &= (p[i] == 0);
}

bool is_asc2 = true;
bool is_desc2 = true;
for (size_t i = run_length_ / 2; i < run_length_ - 1; ++i) {
is_asc2 &= (p[i] <= p[i + 1]);
is_desc2 &= (p[i] >= p[i + 1]);
}

if (is_zero) continue;
if (is_asc && is_desc2) continue;
if (is_desc && is_asc2) continue;
return false;
}
return true;
}

void CheckBitonic(int line, int caller) const {
if (IsBitonic()) return;
for (size_t ir = 0; ir < num_runs_; ++ir) {
const T* p = &consecutive_[ir * run_length_];
printf("run %" PRIu64 " (len %" PRIu64 ")\n", static_cast<uint64_t>(ir),
static_cast<uint64_t>(run_length_));
for (size_t i = 0; i < run_length_; ++i) {
printf("%.0f\n", static_cast<float>(p[i]));
}
}
printf("caller %d\n", caller);
hwy::Abort("", line, "not bitonic");
}

void CheckSorted(SortOrder kOrder, int line, int caller) const {
for (size_t ir = 0; ir < num_runs_; ++ir) {
const SortOrder order =
(alternating_ && (ir & 1)) ? Reverse(kOrder) : kOrder;
const T* p = &consecutive_[ir * run_length_];

for (size_t i = 0; i < run_length_ - 1; ++i) {
if (!Compare(p[i], p[i + 1], order)) {
printf("ir%" PRIu64 " run_length=%" PRIu64
" alt=%d original order=%d this order=%d\n",
static_cast<uint64_t>(ir), static_cast<uint64_t>(run_length_),
alternating_, static_cast<int>(kOrder),
static_cast<int>(order));
for (size_t i = 0; i < run_length_; ++i) {
printf(" %.0f\n", static_cast<float>(p[i]));
}
printf("caller %d\n", caller);
hwy::Abort("", line, "not sorted");
}
}
}
}

private:
AlignedFreeUniquePtr<T[]> buf_;
AlignedFreeUniquePtr<T[]> consecutive_;
size_t num_regs_;
size_t run_length_;
size_t num_runs_;
bool is_vector_;
bool alternating_;
};

template <class D>
Runs<D> StoreDeinterleavedQuartets(D d, Vec<D> v0) {
Runs<D> runs(d, 1);
runs.ScatterQuartets(d, 0, v0);
return runs;
}

template <class D>
Runs<D> StoreDeinterleavedQuartets(D d, Vec<D> v0, Vec<D> v1) {
Runs<D> runs(d, 2);
runs.ScatterQuartets(d, 0, v0);
runs.ScatterQuartets(d, 1, v1);
return runs;
}

template <class D>
Runs<D> StoreDeinterleavedQuartets(D d, Vec<D> v0, Vec<D> v1, Vec<D> v2,
Vec<D> v3) {
Runs<D> runs(d, 4);
runs.ScatterQuartets(d, 0, v0);
runs.ScatterQuartets(d, 1, v1);
runs.ScatterQuartets(d, 2, v2);
runs.ScatterQuartets(d, 3, v3);
return runs;
}

template <class D>
Runs<D> StoreDeinterleavedQuartets(D d, Vec<D> v0, Vec<D> v1, Vec<D> v2,
Vec<D> v3, Vec<D> v4, Vec<D> v5, Vec<D> v6,
Vec<D> v7) {
Runs<D> runs(d, 8);
runs.ScatterQuartets(d, 0, v0);
runs.ScatterQuartets(d, 1, v1);
runs.ScatterQuartets(d, 2, v2);
runs.ScatterQuartets(d, 3, v3);
runs.ScatterQuartets(d, 4, v4);
runs.ScatterQuartets(d, 5, v5);
runs.ScatterQuartets(d, 6, v6);
runs.ScatterQuartets(d, 7, v7);
return runs;
}

template <class D>
Runs<D> StoreDeinterleavedQuartets(D d, Vec<D> v0, Vec<D> v1, Vec<D> v2,
Vec<D> v3, Vec<D> v4, Vec<D> v5, Vec<D> v6,
Vec<D> v7, Vec<D> v8, Vec<D> v9, Vec<D> vA,
Vec<D> vB, Vec<D> vC, Vec<D> vD, Vec<D> vE,
Vec<D> vF) {
Runs<D> runs(d, 16);
runs.ScatterQuartets(d, 0x0, v0);
runs.ScatterQuartets(d, 0x1, v1);
runs.ScatterQuartets(d, 0x2, v2);
runs.ScatterQuartets(d, 0x3, v3);
runs.ScatterQuartets(d, 0x4, v4);
runs.ScatterQuartets(d, 0x5, v5);
runs.ScatterQuartets(d, 0x6, v6);
runs.ScatterQuartets(d, 0x7, v7);
runs.ScatterQuartets(d, 0x8, v8);
runs.ScatterQuartets(d, 0x9, v9);
runs.ScatterQuartets(d, 0xA, vA);
runs.ScatterQuartets(d, 0xB, vB);
runs.ScatterQuartets(d, 0xC, vC);
runs.ScatterQuartets(d, 0xD, vD);
runs.ScatterQuartets(d, 0xE, vE);
runs.ScatterQuartets(d, 0xF, vF);
return runs;
}

template <class D>
Runs<D> StoreDeinterleavedQuartets(
D d, const Vec<D>& v00, const Vec<D>& v01, const Vec<D>& v02,
const Vec<D>& v03, const Vec<D>& v04, const Vec<D>& v05, const Vec<D>& v06,
const Vec<D>& v07, const Vec<D>& v08, const Vec<D>& v09, const Vec<D>& v0A,
const Vec<D>& v0B, const Vec<D>& v0C, const Vec<D>& v0D, const Vec<D>& v0E,
const Vec<D>& v0F, const Vec<D>& v10, const Vec<D>& v11, const Vec<D>& v12,
const Vec<D>& v13, const Vec<D>& v14, const Vec<D>& v15, const Vec<D>& v16,
const Vec<D>& v17, const Vec<D>& v18, const Vec<D>& v19, const Vec<D>& v1A,
const Vec<D>& v1B, const Vec<D>& v1C, const Vec<D>& v1D, const Vec<D>& v1E,
const Vec<D>& v1F) {
Runs<D> runs(d, 32);
runs.ScatterQuartets(d, 0x00, v00);
runs.ScatterQuartets(d, 0x01, v01);
runs.ScatterQuartets(d, 0x02, v02);
runs.ScatterQuartets(d, 0x03, v03);
runs.ScatterQuartets(d, 0x04, v04);
runs.ScatterQuartets(d, 0x05, v05);
runs.ScatterQuartets(d, 0x06, v06);
runs.ScatterQuartets(d, 0x07, v07);
runs.ScatterQuartets(d, 0x08, v08);
runs.ScatterQuartets(d, 0x09, v09);
runs.ScatterQuartets(d, 0x0A, v0A);
runs.ScatterQuartets(d, 0x0B, v0B);
runs.ScatterQuartets(d, 0x0C, v0C);
runs.ScatterQuartets(d, 0x0D, v0D);
runs.ScatterQuartets(d, 0x0E, v0E);
runs.ScatterQuartets(d, 0x0F, v0F);
runs.ScatterQuartets(d, 0x10, v10);
runs.ScatterQuartets(d, 0x11, v11);
runs.ScatterQuartets(d, 0x12, v12);
runs.ScatterQuartets(d, 0x13, v13);
runs.ScatterQuartets(d, 0x14, v14);
runs.ScatterQuartets(d, 0x15, v15);
runs.ScatterQuartets(d, 0x16, v16);
runs.ScatterQuartets(d, 0x17, v17);
runs.ScatterQuartets(d, 0x18, v18);
runs.ScatterQuartets(d, 0x19, v19);
runs.ScatterQuartets(d, 0x1A, v1A);
runs.ScatterQuartets(d, 0x1B, v1B);
runs.ScatterQuartets(d, 0x1C, v1C);
runs.ScatterQuartets(d, 0x1D, v1D);
runs.ScatterQuartets(d, 0x1E, v1E);
runs.ScatterQuartets(d, 0x1F, v1F);
return runs;
}

template <class D>
Runs<D> StoreVectors(D d, Vec<D> v0, size_t run_length, bool alternating) {
Runs<D> runs(d, 1, run_length, alternating);
runs.StoreVector(d, 0, v0);
return runs;
}

template <class D>
Runs<D> StoreVectors(D d, Vec<D> v0, Vec<D> v1) {
constexpr size_t kRegs = 2;
Runs<D> runs(d, kRegs, /*run_length=*/kRegs * Lanes(d), /*alternating=*/false);
runs.StoreVector(d, 0, v0);
runs.StoreVector(d, 1, v1);
return runs;
}

template <class D>
Runs<D> StoreVectors(D d, Vec<D> v0, Vec<D> v1, Vec<D> v2, Vec<D> v3) {
constexpr size_t kRegs = 4;
Runs<D> runs(d, kRegs, /*run_length=*/kRegs * Lanes(d), /*alternating=*/false);
runs.StoreVector(d, 0, v0);
runs.StoreVector(d, 1, v1);
runs.StoreVector(d, 2, v2);
runs.StoreVector(d, 3, v3);
return runs;
}

template <class D>
Runs<D> StoreVectors(D d, Vec<D> v0, Vec<D> v1, Vec<D> v2, Vec<D> v3, Vec<D> v4,
Vec<D> v5, Vec<D> v6, Vec<D> v7) {
constexpr size_t kRegs = 8;
Runs<D> runs(d, kRegs, /*run_length=*/kRegs * Lanes(d), /*alternating=*/false);
runs.StoreVector(d, 0, v0);
runs.StoreVector(d, 1, v1);
runs.StoreVector(d, 2, v2);
runs.StoreVector(d, 3, v3);
runs.StoreVector(d, 4, v4);
runs.StoreVector(d, 5, v5);
runs.StoreVector(d, 6, v6);
runs.StoreVector(d, 7, v7);
return runs;
}

#endif  // HWY_SORT_VERIFY
}  // namespace verify

namespace detail {

// ------------------------------ Vector-length agnostic (quartets)

// For each lane i: replaces a[i] with the first and b[i] with the second
// according to kOrder.
// Corresponds to a conditional swap, which is one "node" of a sorting network.
// Min/Max are cheaper than compare + blend at least for integers.
template <SortOrder kOrder, class V>
HWY_INLINE void SortLanesIn2Vectors(V& a, V& b) {
V temp = a;
a = (kOrder == SortOrder::kAscending) ? Min(a, b) : Max(a, b);
b = (kOrder == SortOrder::kAscending) ? Max(temp, b) : Min(temp, b);
}

// For each lane: sorts the four values in that lane of the four vectors.
template <SortOrder kOrder, class D, class V = Vec<D>>
HWY_INLINE void SortLanesIn4Vectors(D d, const TFromD<D>* in, V& v0, V& v1,
V& v2, V& v3) {
const size_t N = Lanes(d);

// Bitonic and odd-even sorters both have 5 nodes. This one is from
// http://users.telenet.be/bertdobbelaere/SorterHunter/sorting_networks.html

// layer 1
v0 = Load(d, in + 0 * N);
v2 = Load(d, in + 2 * N);
SortLanesIn2Vectors<kOrder>(v0, v2);
v1 = Load(d, in + 1 * N);
v3 = Load(d, in + 3 * N);
SortLanesIn2Vectors<kOrder>(v1, v3);

// layer 2
SortLanesIn2Vectors<kOrder>(v0, v1);
SortLanesIn2Vectors<kOrder>(v2, v3);

// layer 3
SortLanesIn2Vectors<kOrder>(v1, v2);
}

// Inputs are vectors with columns in sorted order (from SortLanesIn4Vectors).
// Transposes so that output vectors are sorted quartets (128-bit blocks),
// and a quartet in v0 comes before its counterpart in v1, etc.
template <class D, class V = Vec<D>>
HWY_INLINE void Transpose4x4(D d, V& v0, V& v1, V& v2, V& v3) {
const RepartitionToWide<decltype(d)> dw;

// Input: first number is reg, second is lane (0 is lowest)
// 03 02 01 00 |
// 13 12 11 10 | columns are sorted
// 23 22 21 20 | (in this order)
// 33 32 31 30 V
const V t0 = InterleaveLower(d, v0, v1);  // 11 01 10 00
const V t1 = InterleaveLower(d, v2, v3);  // 31 21 30 20
const V t2 = InterleaveUpper(d, v0, v1);  // 13 03 12 02
const V t3 = InterleaveUpper(d, v2, v3);  // 33 23 32 22

// 30 20 10 00
v0 = BitCast(d, InterleaveLower(BitCast(dw, t0), BitCast(dw, t1)));
// 31 21 11 01
v1 = BitCast(d, InterleaveUpper(BitCast(dw, t0), BitCast(dw, t1)));
// 32 22 12 02
v2 = BitCast(d, InterleaveLower(BitCast(dw, t2), BitCast(dw, t3)));
// 33 23 13 03 --> sorted in descending order (03=smallest in lane 0).
v3 = BitCast(d, InterleaveUpper(BitCast(dw, t2), BitCast(dw, t3)));
}

// 12 ops (including 4 swizzle)
// Precondition: v0 and v1 are already sorted according to kOrder.
// Postcondition: concatenate(v0, v1) is sorted and v0 is the lower half.
template <SortOrder kOrder, class D, class V = Vec<D>>
HWY_INLINE void Merge2SortedQuartets(D d, V& v0, V& v1, int caller) {
#if HWY_SORT_VERIFY
const verify::Runs<D> input0 = verify::StoreDeinterleavedQuartets(d, v0);
const verify::Runs<D> input1 = verify::StoreDeinterleavedQuartets(d, v1);
input0.CheckSorted(kOrder, __LINE__, caller);
input1.CheckSorted(kOrder, __LINE__, caller);
#endif

// See figure 5 from https://www.vldb.org/pvldb/vol8/p1274-inoue.pdf.
// This requires 8 min/max vs 6 for bitonic merge (see Figure 2 in
// http://www.vldb.org/pvldb/vol1/1454171.pdf), but is faster overall because
// it needs less shuffling, and does not need a bitonic input.
SortLanesIn2Vectors<kOrder>(v0, v1);
v0 = Shuffle0321(v0);
SortLanesIn2Vectors<kOrder>(v0, v1);
v0 = Shuffle0321(v0);
SortLanesIn2Vectors<kOrder>(v0, v1);
v0 = Shuffle0321(v0);
SortLanesIn2Vectors<kOrder>(v0, v1);
v0 = Shuffle0321(v0);

#if HWY_SORT_VERIFY
auto output = verify::StoreDeinterleavedQuartets(d, v0, v1);
output.CheckSorted(kOrder, __LINE__, caller);
#endif
}

// ------------------------------ Bitonic merge (quartets)

// For the last layer of bitonic merge. Conditionally swaps even-numbered lanes
// with their odd-numbered neighbor. Works for both quartets and vectors.
template <SortOrder kOrder, class D>
HWY_INLINE void SortAdjacentLanesQV(D d, Vec<D>& q_or_v) {
(void)d;
// Optimization for 32-bit integers: swap via Shuffle and 64-bit Min/Max.
// (not worthwhile on SSE4/AVX2 because they lack 64-bit Min/Max)
#if !HWY_ARCH_X86 || HWY_TARGET <= HWY_AVX3
if (sizeof(TFromD<D>) == 4 && !IsFloat<TFromD<D>>()) {
const RepartitionToWide<decltype(d)> dw;
const auto wide = BitCast(dw, q_or_v);
const auto swap = BitCast(dw, Shuffle2301(q_or_v));
if (kOrder == SortOrder::kAscending) {
q_or_v = BitCast(d, Max(wide, swap));
} else {
q_or_v = BitCast(d, Min(wide, swap));
}
} else
#endif
{
Vec<D> swapped = Shuffle2301(q_or_v);
SortLanesIn2Vectors<kOrder>(q_or_v, swapped);
q_or_v = OddEven(swapped, q_or_v);
}
}

// Lane 0 with 2, 1 with 3 etc. Works for both quartets and vectors.
template <SortOrder kOrder, class D>
HWY_INLINE void SortDistance2LanesQV(D d, Vec<D>& q_or_v) {
const RepartitionToWide<decltype(d)> dw;
Vec<D> swapped = Shuffle1032(q_or_v);
SortLanesIn2Vectors<kOrder>(q_or_v, swapped);
q_or_v = BitCast(d, OddEven(BitCast(dw, swapped), BitCast(dw, q_or_v)));
}

// For all BitonicMerge*, and each block, the concatenation of those blocks from
// the first half and second half of the input vectors must be sorted in
// opposite orders.

// 14 ops (including 4 swizzle)
template <SortOrder kOrder, class D, class V = Vec<D>>
HWY_INLINE void BitonicMerge2Quartets(D d, V& q0, V& q1, int caller) {
#if HWY_SORT_VERIFY
const verify::Runs<D> input = verify::StoreDeinterleavedQuartets(d, q0, q1);
if (caller == -1) input.CheckBitonic(__LINE__, __LINE__);
#endif

// Layer 1: lane stride 4 (2 ops)
SortLanesIn2Vectors<kOrder>(q0, q1);

// Layer 2: lane stride 2 (6 ops)
SortDistance2LanesQV<kOrder>(d, q0);
SortDistance2LanesQV<kOrder>(d, q1);

// Layer 3: lane stride 1 (4 ops)
SortAdjacentLanesQV<kOrder>(d, q0);
SortAdjacentLanesQV<kOrder>(d, q1);

#if HWY_SORT_VERIFY
const verify::Runs<D> output = verify::StoreDeinterleavedQuartets(d, q0, q1);
output.CheckSorted(kOrder, __LINE__, caller);
#endif
}

// 32 ops, more efficient than three 4+4 merges (36 ops).
template <SortOrder kOrder, class D, class V = Vec<D>>
HWY_INLINE void BitonicMerge4Quartets(D d, V& q0, V& q1, V& q2, V& q3,
int caller) {
#if HWY_SORT_VERIFY
const verify::Runs<D> input =
verify::StoreDeinterleavedQuartets(d, q0, q1, q2, q3);
if (caller == -1) input.CheckBitonic(__LINE__, __LINE__);
#endif

// Layer 1: lane stride 8
SortLanesIn2Vectors<kOrder>(q0, q2);
SortLanesIn2Vectors<kOrder>(q1, q3);

// Layers 2 to 4
// Inputs are not fully sorted, so cannot use Merge2SortedQuartets.
BitonicMerge2Quartets<kOrder>(d, q0, q1, __LINE__);
BitonicMerge2Quartets<kOrder>(d, q2, q3, __LINE__);

#if HWY_SORT_VERIFY
const verify::Runs<D> output =
verify::StoreDeinterleavedQuartets(d, q0, q1, q2, q3);
output.CheckSorted(kOrder, __LINE__, caller);
#endif
}

// 72 ops.
template <SortOrder kOrder, class D, class V = Vec<D>>
HWY_INLINE void BitonicMerge8Quartets(D d, V& q0, V& q1, V& q2, V& q3, V& q4,
V& q5, V& q6, V& q7, int caller) {
#if HWY_SORT_VERIFY
const verify::Runs<D> input =
verify::StoreDeinterleavedQuartets(d, q0, q1, q2, q3, q4, q5, q6, q7);
if (caller == -1) input.CheckBitonic(__LINE__, __LINE__);
#endif

// Layer 1: lane stride 16
SortLanesIn2Vectors<kOrder>(q0, q4);
SortLanesIn2Vectors<kOrder>(q1, q5);
SortLanesIn2Vectors<kOrder>(q2, q6);
SortLanesIn2Vectors<kOrder>(q3, q7);

// Layers 2 to 5
BitonicMerge4Quartets<kOrder>(d, q0, q1, q2, q3, __LINE__);
BitonicMerge4Quartets<kOrder>(d, q4, q5, q6, q7, __LINE__);

#if HWY_SORT_VERIFY
const verify::Runs<D> output =
verify::StoreDeinterleavedQuartets(d, q0, q1, q2, q3, q4, q5, q6, q7);
output.CheckSorted(kOrder, __LINE__, caller);
#endif
}

// ------------------------------ Bitonic merge (vectors)

// Lane 0 with 4, 1 with 5 etc. Only used for vectors with at least 8 lanes.
#if HWY_TARGET <= HWY_AVX3

// TODO(janwas): move to op
template <typename T>
Vec512<T> Shuffle128_2020(Vec512<T> a, Vec512<T> b) {
return Vec512<T>{_mm512_shuffle_i32x4(a.raw, b.raw, _MM_SHUFFLE(2, 0, 2, 0))};
}

template <typename T>
Vec512<T> Shuffle128_3131(Vec512<T> a, Vec512<T> b) {
return Vec512<T>{_mm512_shuffle_i32x4(a.raw, b.raw, _MM_SHUFFLE(3, 1, 3, 1))};
}

template <typename T>
Vec512<T> Shuffle128_2301(Vec512<T> a, Vec512<T> b) {
return Vec512<T>{_mm512_shuffle_i32x4(a.raw, b.raw, _MM_SHUFFLE(2, 3, 0, 1))};
}

template <typename T>
Vec512<T> OddEven128(Vec512<T> odd, Vec512<T> even) {
return Vec512<T>{_mm512_mask_blend_epi64(__mmask8{0x33u}, odd.raw, even.raw)};
}

template <SortOrder kOrder, class T>
HWY_INLINE void SortDistance4LanesV(Simd<T, 16> d, Vec<decltype(d)>& v) {
// In: FEDCBA98 76543210
// Swap 128-bit halves of each 256 bits => BA98FEDC 32107654
Vec512<T> swapped = Shuffle128_2301(v, v);
SortLanesIn2Vectors<kOrder>(v, swapped);
v = OddEven128(swapped, v);
}

#endif

template <SortOrder kOrder, typename T>
HWY_INLINE void SortDistance4LanesV(Simd<T, 8> d, Vec<decltype(d)>& v) {
Vec<decltype(d)> swapped = ConcatLowerUpper(d, v, v);
SortLanesIn2Vectors<kOrder>(v, swapped);
v = ConcatUpperLower(swapped, v);
}

template <SortOrder kOrder, typename T>
HWY_INLINE void SortDistance4LanesV(Simd<T, 4> /* tag */, ...) {}

// Only used for vectors with at least 16 lanes.
template <SortOrder kOrder, class D>
HWY_INLINE void SortDistance8LanesV(D d, Vec<D>& v) {
Vec<D> swapped = ConcatLowerUpper(d, v, v);
SortLanesIn2Vectors<kOrder>(v, swapped);
v = ConcatUpperLower(swapped, v);
}

// 120 ops. Only used if vectors are at least 8 lanes.
template <SortOrder kOrder, class D, class V = Vec<D>>
HWY_INLINE void BitonicMergeTo64(D d, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
V& v6, V& v7, int caller) {
#if HWY_SORT_VERIFY
const verify::Runs<D> input =
verify::StoreVectors(d, v0, v1, v2, v3, v4, v5, v6, v7);
if (caller == -1) input.CheckBitonic(__LINE__, __LINE__);
#endif

// Layer 1: lane stride 32
SortLanesIn2Vectors<kOrder>(v0, v4);
SortLanesIn2Vectors<kOrder>(v1, v5);
SortLanesIn2Vectors<kOrder>(v2, v6);
SortLanesIn2Vectors<kOrder>(v3, v7);

// Layer 2: lane stride 16
SortLanesIn2Vectors<kOrder>(v0, v2);
SortLanesIn2Vectors<kOrder>(v1, v3);
SortLanesIn2Vectors<kOrder>(v4, v6);
SortLanesIn2Vectors<kOrder>(v5, v7);

// Layer 3: lane stride 8
SortLanesIn2Vectors<kOrder>(v0, v1);
SortLanesIn2Vectors<kOrder>(v2, v3);
SortLanesIn2Vectors<kOrder>(v4, v5);
SortLanesIn2Vectors<kOrder>(v6, v7);

// Layer 4: lane stride 4
SortDistance4LanesV<kOrder>(d, v0);
SortDistance4LanesV<kOrder>(d, v1);
SortDistance4LanesV<kOrder>(d, v2);
SortDistance4LanesV<kOrder>(d, v3);
SortDistance4LanesV<kOrder>(d, v4);
SortDistance4LanesV<kOrder>(d, v5);
SortDistance4LanesV<kOrder>(d, v6);
SortDistance4LanesV<kOrder>(d, v7);

// Layer 5: lane stride 2
SortDistance2LanesQV<kOrder>(d, v0);
SortDistance2LanesQV<kOrder>(d, v1);
SortDistance2LanesQV<kOrder>(d, v2);
SortDistance2LanesQV<kOrder>(d, v3);
SortDistance2LanesQV<kOrder>(d, v4);
SortDistance2LanesQV<kOrder>(d, v5);
SortDistance2LanesQV<kOrder>(d, v6);
SortDistance2LanesQV<kOrder>(d, v7);

// Layer 6: lane stride 1
SortAdjacentLanesQV<kOrder>(d, v0);
SortAdjacentLanesQV<kOrder>(d, v1);
SortAdjacentLanesQV<kOrder>(d, v2);
SortAdjacentLanesQV<kOrder>(d, v3);
SortAdjacentLanesQV<kOrder>(d, v4);
SortAdjacentLanesQV<kOrder>(d, v5);
SortAdjacentLanesQV<kOrder>(d, v6);
SortAdjacentLanesQV<kOrder>(d, v7);

#if HWY_SORT_VERIFY
const verify::Runs<D> output =
verify::StoreVectors(d, v0, v1, v2, v3, v4, v5, v6, v7);
output.CheckSorted(kOrder, __LINE__, caller);
#endif
}

// 60 ops. Only used if vectors are at least 16 lanes.
template <SortOrder kOrder, class D, class V = Vec<D>>
HWY_INLINE void BitonicMergeTo64(D d, V& v0, V& v1, V& v2, V& v3, int caller) {
#if HWY_SORT_VERIFY
const verify::Runs<D> input = verify::StoreVectors(d, v0, v1, v2, v3);
if (caller == -1) input.CheckBitonic(__LINE__, __LINE__);
#endif

// Layer 1: lane stride 32
SortLanesIn2Vectors<kOrder>(v0, v2);
SortLanesIn2Vectors<kOrder>(v1, v3);

// Layer 2: lane stride 16
SortLanesIn2Vectors<kOrder>(v0, v1);
SortLanesIn2Vectors<kOrder>(v2, v3);

// Layer 3: lane stride 8
SortDistance8LanesV<kOrder>(d, v0);
SortDistance8LanesV<kOrder>(d, v1);
SortDistance8LanesV<kOrder>(d, v2);
SortDistance8LanesV<kOrder>(d, v3);

// Layer 4: lane stride 4
SortDistance4LanesV<kOrder>(d, v0);
SortDistance4LanesV<kOrder>(d, v1);
SortDistance4LanesV<kOrder>(d, v2);
SortDistance4LanesV<kOrder>(d, v3);

// Layer 5: lane stride 2
SortDistance2LanesQV<kOrder>(d, v0);
SortDistance2LanesQV<kOrder>(d, v1);
SortDistance2LanesQV<kOrder>(d, v2);
SortDistance2LanesQV<kOrder>(d, v3);

// Layer 6: lane stride 1
SortAdjacentLanesQV<kOrder>(d, v0);
SortAdjacentLanesQV<kOrder>(d, v1);
SortAdjacentLanesQV<kOrder>(d, v2);
SortAdjacentLanesQV<kOrder>(d, v3);

#if HWY_SORT_VERIFY
const verify::Runs<D> output = verify::StoreVectors(d, v0, v1, v2, v3);
output.CheckSorted(kOrder, __LINE__, caller);
#endif
}

// 128 ops. Only used if vectors are at least 16 lanes.
template <SortOrder kOrder, class D, class V = Vec<D>>
HWY_INLINE void BitonicMergeTo128(D d, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
V& v6, V& v7, int caller) {
#if HWY_SORT_VERIFY
const verify::Runs<D> input =
verify::StoreVectors(d, v0, v1, v2, v3, v4, v5, v6, v7);
if (caller == -1) input.CheckBitonic(__LINE__, __LINE__);
#endif

// Layer 1: lane stride 64
SortLanesIn2Vectors<kOrder>(v0, v4);
SortLanesIn2Vectors<kOrder>(v1, v5);
SortLanesIn2Vectors<kOrder>(v2, v6);
SortLanesIn2Vectors<kOrder>(v3, v7);

BitonicMergeTo64<kOrder>(d, v0, v1, v2, v3, __LINE__);
BitonicMergeTo64<kOrder>(d, v4, v5, v6, v7, __LINE__);

#if HWY_SORT_VERIFY
const verify::Runs<D> output =
verify::StoreVectors(d, v0, v1, v2, v3, v4, v5, v6, v7);
output.CheckSorted(kOrder, __LINE__, caller);
#endif
}

// ------------------------------ Vector-length dependent

// Only called when N=4 (single block, so quartets can just be stored).
template <SortOrder kOrder, class D, class V>
HWY_API size_t SingleQuartetPerVector(D d, V& q0, V& q1, V& q2, V& q3, V& q4,
V& q5, V& q6, V& q7, TFromD<D>* inout) {
Store(q0, d, inout + 0 * 4);
Store(q1, d, inout + 1 * 4);
Store(q2, d, inout + 2 * 4);
Store(q3, d, inout + 3 * 4);
Store(q4, d, inout + 4 * 4);
Store(q5, d, inout + 5 * 4);
Store(q6, d, inout + 6 * 4);
Store(q7, d, inout + 7 * 4);
return 8 * 4;
}

// Only called when N=8.
template <SortOrder kOrder, class D, class V>
HWY_API size_t TwoQuartetsPerVector(D d, V& q0, V& q1, V& q2, V& q3, V& q4,
V& q5, V& q6, V& q7, TFromD<D>* inout) {
V v0 = ConcatLowerLower(d, q1, q0);
V v1 = ConcatLowerLower(d, q3, q2);
V v2 = ConcatLowerLower(d, q5, q4);
V v3 = ConcatLowerLower(d, q7, q6);
// TODO(janwas): merge into single table
V v4 = Reverse(d, ConcatUpperUpper(d, q7, q6));
V v5 = Reverse(d, ConcatUpperUpper(d, q5, q4));
V v6 = Reverse(d, ConcatUpperUpper(d, q3, q2));
V v7 = Reverse(d, ConcatUpperUpper(d, q1, q0));
detail::BitonicMergeTo64<kOrder>(d, v0, v1, v2, v3, v4, v5, v6, v7, -1);

Store(v0, d, inout + 0 * 8);
Store(v1, d, inout + 1 * 8);
Store(v2, d, inout + 2 * 8);
Store(v3, d, inout + 3 * 8);
Store(v4, d, inout + 4 * 8);
Store(v5, d, inout + 5 * 8);
Store(v6, d, inout + 6 * 8);
Store(v7, d, inout + 7 * 8);
return 8 * 8;
}

// Only called when N=16.
template <SortOrder kOrder, typename T, class V>
HWY_API size_t FourQuartetsPerVector(Simd<T, 16> d, V& q0, V& q1, V& q2, V& q3,
V& q4, V& q5, V& q6, V& q7, T* inout) {
const V q11_01_10_00 = Shuffle128_2020(q0, q1);
const V q13_03_12_02 = Shuffle128_2020(q2, q3);
V v0 = Shuffle128_2020(q11_01_10_00, q13_03_12_02);  // 3..0

const V q15_05_14_04 = Shuffle128_2020(q4, q5);
const V q17_07_16_06 = Shuffle128_2020(q6, q7);
V v1 = Shuffle128_2020(q15_05_14_04, q17_07_16_06);  // 7..4

const V q19_09_18_08 = Shuffle128_3131(q0, q1);
const V q1b_0b_1a_0a = Shuffle128_3131(q2, q3);
V v3 = Reverse(d, Shuffle128_2020(q19_09_18_08, q1b_0b_1a_0a));  // b..8

const V q1d_0d_1c_0c = Shuffle128_3131(q4, q5);
const V q1f_0f_1e_0e = Shuffle128_3131(q6, q7);
V v2 = Reverse(d, Shuffle128_2020(q1d_0d_1c_0c, q1f_0f_1e_0e));  // f..c

detail::BitonicMergeTo64<kOrder>(d, v0, v1, v2, v3, -1);

// TODO(janwas): merge into single table
V v4 = Shuffle128_3131(q11_01_10_00, q13_03_12_02);  // 13..10
V v5 = Shuffle128_3131(q15_05_14_04, q17_07_16_06);  // 17..14
V v7 = Reverse(d, Shuffle128_3131(q19_09_18_08, q1b_0b_1a_0a));  // 1b..18
V v6 = Reverse(d, Shuffle128_3131(q1d_0d_1c_0c, q1f_0f_1e_0e));  // 1f..1c

detail::BitonicMergeTo64<Reverse(kOrder)>(d, v4, v5, v6, v7, -1);

detail::BitonicMergeTo128<kOrder>(d, v0, v1, v2, v3, v4, v5, v6, v7, -1);

Store(v0, d, inout + 0 * 16);
Store(v1, d, inout + 1 * 16);
Store(v2, d, inout + 2 * 16);
Store(v3, d, inout + 3 * 16);
Store(v4, d, inout + 4 * 16);
Store(v5, d, inout + 5 * 16);
Store(v6, d, inout + 6 * 16);
Store(v7, d, inout + 7 * 16);
return 8 * 16;
}

// Avoid needing #if at the call sites.
template <SortOrder kOrder, typename T>
HWY_API size_t TwoQuartetsPerVector(Simd<T, 4> /* tag */, ...) {
return 0;
}

template <SortOrder kOrder, typename T>
HWY_API size_t FourQuartetsPerVector(Simd<T, 4> /* tag */, ...) {
return 0;
}
template <SortOrder kOrder, typename T>
HWY_API size_t FourQuartetsPerVector(Simd<T, 8> /* tag */, ...) {
return 0;
}

}  // namespace detail

template <class D>
HWY_API size_t SortBatchSize(D d) {
const size_t N = Lanes(d);
if (N == 4) return 32;
if (N == 8) return 64;
if (N == 16) return 128;
return 0;
}

template <SortOrder kOrder, class D>
HWY_API size_t SortBatch(D d, TFromD<D>* inout) {
const size_t N = Lanes(d);

Vec<D> q0, q1, q2, q3;
detail::SortLanesIn4Vectors<kOrder>(d, inout, q0, q1, q2, q3);
detail::Transpose4x4(d, q0, q1, q2, q3);
detail::Merge2SortedQuartets<kOrder>(d, q0, q1, -1);
detail::Merge2SortedQuartets<kOrder>(d, q2, q3, -1);

// Bitonic merges require one input to be in reverse order.
constexpr SortOrder kReverse = Reverse(kOrder);

Vec<D> q4, q5, q6, q7;
detail::SortLanesIn4Vectors<kReverse>(d, inout + 4 * N, q4, q5, q6, q7);
detail::Transpose4x4(d, q4, q5, q6, q7);
detail::Merge2SortedQuartets<kReverse>(d, q4, q5, -1);
detail::Merge2SortedQuartets<kReverse>(d, q6, q7, -1);

detail::BitonicMerge4Quartets<kOrder>(d, q0, q1, q4, q5, -1);
detail::BitonicMerge4Quartets<kReverse>(d, q2, q3, q6, q7, -1);

detail::BitonicMerge8Quartets<kOrder>(d, q0, q1, q4, q5, q2, q3, q6, q7,
__LINE__);

if (N == 4) {
return detail::SingleQuartetPerVector<kOrder>(d, q0, q1, q4, q5, q2, q3, q6,
q7, inout);
}

if (N == 8) {
return detail::TwoQuartetsPerVector<kOrder>(d, q0, q1, q4, q5, q2, q3, q6,
q7, inout);
}

return detail::FourQuartetsPerVector<kOrder>(d, q0, q1, q4, q5, q2, q3, q6,
q7, inout);
}

#else

// Avoids unused attribute warning
template <SortOrder kOrder, class D>
HWY_API size_t SortBatch(D /* tag */, TFromD<D>* /* inout */) {
return 0;
}

#endif  // HWY_TARGET != HWY_SCALAR && HWY_ARCH_X86

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#endif  // HIGHWAY_HWY_CONTRIB_SORT_SORT_INL_H_
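
// Scalar analogue (a sketch, not part of the library) of the conditional
// swap "node" that the deleted sorting network above was built from: after
// the call, a holds the element that should come first under the requested
// order and b the second.
template <typename T>
void CondSwapNode(bool ascending, T& a, T& b) {
  const T lo = a < b ? a : b;  // minimum of the pair
  const T hi = a < b ? b : a;  // maximum of the pair
  a = ascending ? lo : hi;
  b = ascending ? hi : lo;
}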

@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -12,97 +13,76 @@
// See the License for the specific language governing permissions and
// limitations under the License.

// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/sort_test.cc"
#include "hwy/foreach_target.h"

#include "hwy/contrib/sort/vqsort.h"
// After foreach_target
#include "hwy/contrib/sort/algo-inl.h"
#include "hwy/contrib/sort/result-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"  // BaseCase
#include "hwy/tests/test_util-inl.h"
// clang-format on

#include <stdint.h>
#include <stdio.h>
#include <string.h>  // memcpy

#include <algorithm>  // std::max
#include <vector>

#undef VQSORT_TEST_IMPL
#if (HWY_TARGET == HWY_SCALAR) || (defined(_MSC_VER) && !HWY_IS_DEBUG_BUILD)
// Scalar does not implement these, and MSVC non-debug builds time out.
#define VQSORT_TEST_IMPL 0
#else
#define VQSORT_TEST_IMPL 1
#endif
// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/sort_test.cc"
#include "hwy/foreach_target.h"  // IWYU pragma: keep

#undef VQSORT_TEST_SORT
// MSVC non-debug builds time out.
#if defined(_MSC_VER) && !HWY_IS_DEBUG_BUILD
#define VQSORT_TEST_SORT 0
#else
#define VQSORT_TEST_SORT 1
#endif
#include "hwy/contrib/sort/vqsort.h"
// After foreach_target
#include "hwy/contrib/sort/algo-inl.h"
#include "hwy/contrib/sort/traits128-inl.h"
#include "hwy/contrib/sort/result-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"  // BaseCase
#include "hwy/tests/test_util-inl.h"
// clang-format on

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace {

#if VQSORT_TEST_IMPL || VQSORT_TEST_SORT
using detail::LaneTraits;
using detail::OrderAscending;
using detail::OrderAscending128;
using detail::OrderDescending;
using detail::OrderDescending128;
using detail::SharedTraits;
using detail::TraitsLane;
#if VQSORT_ENABLED || HWY_IDE
using detail::OrderAscending128;
using detail::OrderAscendingKV128;
using detail::OrderDescending128;
using detail::OrderDescendingKV128;
using detail::Traits128;
#endif

#if !VQSORT_TEST_IMPL
static void TestAllMedian() {}
static void TestAllBaseCase() {}
static void TestAllPartition() {}
static void TestAllGenerator() {}
#else

template <class Traits>
static HWY_NOINLINE void TestMedian3() {
using T = uint64_t;
using D = CappedTag<T, 1>;
using LaneType = typename Traits::LaneType;
using D = CappedTag<LaneType, 1>;
SharedTraits<Traits> st;
const D d;
using V = Vec<D>;
for (uint32_t bits = 0; bits < 8; ++bits) {
const V v0 = Set(d, T{(bits & (1u << 0)) ? 1u : 0u});
const V v1 = Set(d, T{(bits & (1u << 1)) ? 1u : 0u});
const V v2 = Set(d, T{(bits & (1u << 2)) ? 1u : 0u});
const T m = GetLane(detail::MedianOf3(st, v0, v1, v2));
const V v0 = Set(d, LaneType{(bits & (1u << 0)) ? 1u : 0u});
const V v1 = Set(d, LaneType{(bits & (1u << 1)) ? 1u : 0u});
const V v2 = Set(d, LaneType{(bits & (1u << 2)) ? 1u : 0u});
const LaneType m = GetLane(detail::MedianOf3(st, v0, v1, v2));
// If at least half (rounded up) of bits are 1, so is the median.
const size_t count = PopCount(bits);
HWY_ASSERT_EQ((count >= 2) ? static_cast<T>(1) : 0, m);
HWY_ASSERT_EQ((count >= 2) ? static_cast<LaneType>(1) : 0, m);
}
}

HWY_NOINLINE void TestAllMedian() {
TestMedian3<LaneTraits<OrderAscending> >();
TestMedian3<TraitsLane<OrderAscending<uint64_t> > >();
}

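// The loop above exercises a property of 0/1 inputs (stated in the comment,
// worked out here): the median of three bits is 1 exactly when at least two
// of them are set, i.e. median == (PopCount(bits) >= 2). For bits = 0b011,
// median(1, 1, 0) = 1; for bits = 0b100, median(0, 0, 1) = 0.
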
template <class Traits, typename T>
template <class Traits>
static HWY_NOINLINE void TestBaseCaseAscDesc() {
using LaneType = typename Traits::LaneType;
SharedTraits<Traits> st;
const SortTag<T> d;
const SortTag<LaneType> d;
const size_t N = Lanes(d);
const size_t base_case_num = SortConstants::BaseCaseNum(N);
const size_t N1 = st.LanesPerKey();

constexpr int kDebug = 0;
auto aligned_keys = hwy::AllocateAligned<T>(N + base_case_num + N);
auto buf = hwy::AllocateAligned<T>(base_case_num + 2 * N);
auto aligned_lanes = hwy::AllocateAligned<LaneType>(N + base_case_num + N);
auto buf = hwy::AllocateAligned<LaneType>(base_case_num + 2 * N);

std::vector<size_t> lengths;
lengths.push_back(HWY_MAX(1, N1));
@ -123,43 +103,45 @@ static HWY_NOINLINE void TestBaseCaseAscDesc() {
for (bool asc : {false, true}) {
for (size_t len : lengths) {
for (size_t misalign : misalignments) {
T* HWY_RESTRICT keys = aligned_keys.get() + misalign;
LaneType* HWY_RESTRICT lanes = aligned_lanes.get() + misalign;
if (kDebug) {
printf("============%s asc %d N1 %d len %d misalign %d\n",
hwy::TypeName(T(), 1).c_str(), asc, static_cast<int>(N1),
st.KeyString().c_str(), asc, static_cast<int>(N1),
static_cast<int>(len), static_cast<int>(misalign));
}

for (size_t i = 0; i < misalign; ++i) {
aligned_keys[i] = hwy::LowestValue<T>();
aligned_lanes[i] = hwy::LowestValue<LaneType>();
}
InputStats<T> input_stats;
InputStats<LaneType> input_stats;
for (size_t i = 0; i < len; ++i) {
keys[i] =
asc ? static_cast<T>(T(i) + 1) : static_cast<T>(T(len) - T(i));
input_stats.Notify(keys[i]);
if (kDebug >= 2) printf("%3zu: %f\n", i, double(keys[i]));
lanes[i] = asc ? static_cast<LaneType>(LaneType(i) + 1)
: static_cast<LaneType>(LaneType(len) - LaneType(i));
input_stats.Notify(lanes[i]);
if (kDebug >= 2) {
printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
}
}
for (size_t i = len; i < base_case_num + N; ++i) {
keys[i] = hwy::LowestValue<T>();
lanes[i] = hwy::LowestValue<LaneType>();
}

detail::BaseCase(d, st, keys, len, buf.get());
detail::BaseCase(d, st, lanes, lanes + len, len, buf.get());

if (kDebug >= 2) {
printf("out>>>>>>\n");
for (size_t i = 0; i < len; ++i) {
printf("%3zu: %f\n", i, double(keys[i]));
printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
}
}

HWY_ASSERT(VerifySort(st, input_stats, keys, len, "BaseAscDesc"));
HWY_ASSERT(VerifySort(st, input_stats, lanes, len, "BaseAscDesc"));
for (size_t i = 0; i < misalign; ++i) {
if (aligned_keys[i] != hwy::LowestValue<T>())
if (aligned_lanes[i] != hwy::LowestValue<LaneType>())
HWY_ABORT("Overrun misalign at %d\n", static_cast<int>(i));
}
for (size_t i = len; i < base_case_num + N; ++i) {
if (keys[i] != hwy::LowestValue<T>())
if (lanes[i] != hwy::LowestValue<LaneType>())
HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
}
}  // misalign
@ -167,17 +149,18 @@ static HWY_NOINLINE void TestBaseCaseAscDesc() {
}  // asc
}

template <class Traits, typename T>
template <class Traits>
static HWY_NOINLINE void TestBaseCase01() {
using LaneType = typename Traits::LaneType;
SharedTraits<Traits> st;
const SortTag<T> d;
const SortTag<LaneType> d;
const size_t N = Lanes(d);
const size_t base_case_num = SortConstants::BaseCaseNum(N);
const size_t N1 = st.LanesPerKey();

constexpr int kDebug = 0;
auto keys = hwy::AllocateAligned<T>(base_case_num + N);
auto buf = hwy::AllocateAligned<T>(base_case_num + 2 * N);
auto lanes = hwy::AllocateAligned<LaneType>(base_case_num + N);
auto buf = hwy::AllocateAligned<LaneType>(base_case_num + 2 * N);

std::vector<size_t> lengths;
lengths.push_back(HWY_MAX(1, N1));
@ -189,65 +172,69 @@ static HWY_NOINLINE void TestBaseCase01() {

for (size_t len : lengths) {
if (kDebug) {
printf("============%s 01 N1 %d len %d\n", hwy::TypeName(T(), 1).c_str(),
printf("============%s 01 N1 %d len %d\n", st.KeyString().c_str(),
static_cast<int>(N1), static_cast<int>(len));
}
const uint64_t kMaxBits = AdjustedLog2Reps(HWY_MIN(len, size_t{14}));
for (uint64_t bits = 0; bits < ((1ull << kMaxBits) - 1); ++bits) {
InputStats<T> input_stats;
InputStats<LaneType> input_stats;
for (size_t i = 0; i < len; ++i) {
keys[i] = (i < 64 && (bits & (1ull << i))) ? 1 : 0;
input_stats.Notify(keys[i]);
if (kDebug >= 2) printf("%3zu: %f\n", i, double(keys[i]));
lanes[i] = (i < 64 && (bits & (1ull << i))) ? 1 : 0;
input_stats.Notify(lanes[i]);
if (kDebug >= 2) {
printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
}
}
for (size_t i = len; i < base_case_num + N; ++i) {
keys[i] = hwy::LowestValue<T>();
lanes[i] = hwy::LowestValue<LaneType>();
}

detail::BaseCase(d, st, keys.get(), len, buf.get());
detail::BaseCase(d, st, lanes.get(), lanes.get() + len, len, buf.get());

if (kDebug >= 2) {
printf("out>>>>>>\n");
for (size_t i = 0; i < len; ++i) {
printf("%3zu: %f\n", i, double(keys[i]));
printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
}
}

HWY_ASSERT(VerifySort(st, input_stats, keys.get(), len, "Base01"));
HWY_ASSERT(VerifySort(st, input_stats, lanes.get(), len, "Base01"));
for (size_t i = len; i < base_case_num + N; ++i) {
if (keys[i] != hwy::LowestValue<T>())
if (lanes[i] != hwy::LowestValue<LaneType>())
HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
}
}  // bits
}  // len
}

template <class Traits, typename T>
template <class Traits>
static HWY_NOINLINE void TestBaseCase() {
TestBaseCaseAscDesc<Traits, T>();
TestBaseCase01<Traits, T>();
TestBaseCaseAscDesc<Traits>();
TestBaseCase01<Traits>();
}

HWY_NOINLINE void TestAllBaseCase() {
// Workaround for stack overflow on MSVC debug.
#if defined(_MSC_VER) && HWY_IS_DEBUG_BUILD && (HWY_TARGET == HWY_AVX3)
#if defined(_MSC_VER)
return;
#endif
// Only enable EMU128 on x86 - it's slow on emulators.
if (!HWY_ARCH_X86 && (HWY_TARGET == HWY_EMU128)) return;

TestBaseCase<LaneTraits<OrderAscending>, int32_t>();
TestBaseCase<LaneTraits<OrderDescending>, int64_t>();
TestBaseCase<Traits128<OrderAscending128>, uint64_t>();
|
||||
TestBaseCase<Traits128<OrderDescending128>, uint64_t>();
|
||||
TestBaseCase<TraitsLane<OrderAscending<int32_t> > >();
|
||||
TestBaseCase<TraitsLane<OrderDescending<int64_t> > >();
|
||||
TestBaseCase<Traits128<OrderAscending128> >();
|
||||
TestBaseCase<Traits128<OrderDescending128> >();
|
||||
}
|
||||
|
||||
template <class Traits, typename T>
static HWY_NOINLINE void VerifyPartition(Traits st, T* HWY_RESTRICT keys,
size_t left, size_t border,
size_t right, const size_t N1,
const T* pivot) {
template <class Traits>
static HWY_NOINLINE void VerifyPartition(
Traits st, typename Traits::LaneType* HWY_RESTRICT lanes, size_t left,
size_t border, size_t right, const size_t N1,
const typename Traits::LaneType* pivot) {
/* for (size_t i = left; i < right; ++i) {
if (i == border) printf("--\n");
printf("%4zu: %3d\n", i, keys[i]);
printf("%4zu: %3d\n", i, lanes[i]);
}*/

HWY_ASSERT(left % N1 == 0);

@ -255,30 +242,33 @@ static HWY_NOINLINE void VerifyPartition(Traits st, T* HWY_RESTRICT keys,
HWY_ASSERT(right % N1 == 0);
const bool asc = typename Traits::Order().IsAscending();
for (size_t i = left; i < border; i += N1) {
if (st.Compare1(pivot, keys + i)) {
if (st.Compare1(pivot, lanes + i)) {
HWY_ABORT(
"%s: asc %d left[%d] piv %.0f %.0f compares before %.0f %.0f "
"border %d",
hwy::TypeName(T(), 1).c_str(), asc, static_cast<int>(i),
double(pivot[1]), double(pivot[0]), double(keys[i + 1]),
double(keys[i + 0]), static_cast<int>(border));
st.KeyString().c_str(), asc, static_cast<int>(i),
static_cast<double>(pivot[1]), static_cast<double>(pivot[0]),
static_cast<double>(lanes[i + 1]), static_cast<double>(lanes[i + 0]),
static_cast<int>(border));
}
}
for (size_t i = border; i < right; i += N1) {
if (!st.Compare1(pivot, keys + i)) {
if (!st.Compare1(pivot, lanes + i)) {
HWY_ABORT(
"%s: asc %d right[%d] piv %.0f %.0f compares after %.0f %.0f "
"border %d",
hwy::TypeName(T(), 1).c_str(), asc, static_cast<int>(i),
double(pivot[1]), double(pivot[0]), double(keys[i + 1]),
double(keys[i]), static_cast<int>(border));
st.KeyString().c_str(), asc, static_cast<int>(i),
static_cast<double>(pivot[1]), static_cast<double>(pivot[0]),
static_cast<double>(lanes[i + 1]), static_cast<double>(lanes[i]),
static_cast<int>(border));
}
}
}

template <class Traits, typename T>
template <class Traits>
static HWY_NOINLINE void TestPartition() {
const SortTag<T> d;
using LaneType = typename Traits::LaneType;
const SortTag<LaneType> d;
SharedTraits<Traits> st;
const bool asc = typename Traits::Order().IsAscending();
const size_t N = Lanes(d);

@ -286,8 +276,8 @@ static HWY_NOINLINE void TestPartition() {
const size_t base_case_num = SortConstants::BaseCaseNum(N);
// left + len + align
const size_t total = 32 + (base_case_num + 4 * HWY_MAX(N, 4)) + 2 * N;
auto aligned_keys = hwy::AllocateAligned<T>(total);
auto buf = hwy::AllocateAligned<T>(SortConstants::PartitionBufNum(N));
auto aligned_lanes = hwy::AllocateAligned<LaneType>(total);
auto buf = hwy::AllocateAligned<LaneType>(SortConstants::PartitionBufNum(N));

const size_t N1 = st.LanesPerKey();
for (bool in_asc : {false, true}) {

@ -296,61 +286,66 @@ static HWY_NOINLINE void TestPartition() {
for (size_t ofs : {N, N + 1, N + 2, N + 3, 2 * N, 2 * N + 1, 2 * N + 2,
2 * N + 3, 3 * N - 1, 4 * N - 3, 4 * N - 2}) {
const size_t len = (base_case_num + ofs) & ~(N1 - 1);
for (T pivot1 :
{T(0), T(len / 3), T(len / 2), T(2 * len / 3), T(len)}) {
const T pivot2[2] = {pivot1, 0};
for (LaneType pivot1 :
{LaneType(0), LaneType(len / 3), LaneType(len / 2),
LaneType(2 * len / 3), LaneType(len)}) {
const LaneType pivot2[2] = {pivot1, 0};
const auto pivot = st.SetKey(d, pivot2);
for (size_t misalign = 0; misalign < N;
misalign += st.LanesPerKey()) {
T* HWY_RESTRICT keys = aligned_keys.get() + misalign;
LaneType* HWY_RESTRICT lanes = aligned_lanes.get() + misalign;
const size_t right = left + len;
if (kDebug) {
printf(
"=========%s asc %d left %d len %d right %d piv %.0f %.0f\n",
hwy::TypeName(T(), 1).c_str(), asc, static_cast<int>(left),
st.KeyString().c_str(), asc, static_cast<int>(left),
static_cast<int>(len), static_cast<int>(right),
double(pivot2[1]), double(pivot2[0]));
static_cast<double>(pivot2[1]),
static_cast<double>(pivot2[0]));
}

for (size_t i = 0; i < misalign; ++i) {
aligned_keys[i] = hwy::LowestValue<T>();
aligned_lanes[i] = hwy::LowestValue<LaneType>();
}
for (size_t i = 0; i < left; ++i) {
keys[i] = hwy::LowestValue<T>();
lanes[i] = hwy::LowestValue<LaneType>();
}
for (size_t i = left; i < right; ++i) {
keys[i] = static_cast<T>(in_asc ? T(i + 1) - static_cast<T>(left)
: static_cast<T>(right) - T(i));
if (kDebug >= 2) printf("%3zu: %f\n", i, double(keys[i]));
lanes[i] = static_cast<LaneType>(
in_asc ? LaneType(i + 1) - static_cast<LaneType>(left)
: static_cast<LaneType>(right) - LaneType(i));
if (kDebug >= 2) {
printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
}
}
for (size_t i = right; i < total - misalign; ++i) {
keys[i] = hwy::LowestValue<T>();
lanes[i] = hwy::LowestValue<LaneType>();
}

size_t border =
detail::Partition(d, st, keys, left, right, pivot, buf.get());
detail::Partition(d, st, lanes, left, right, pivot, buf.get());

if (kDebug >= 2) {
printf("out>>>>>>\n");
for (size_t i = left; i < right; ++i) {
printf("%3zu: %f\n", i, double(keys[i]));
printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
}
for (size_t i = right; i < total - misalign; ++i) {
printf("%3zu: sentinel %f\n", i, double(keys[i]));
printf("%3zu: sentinel %f\n", i, static_cast<double>(lanes[i]));
}
}

VerifyPartition(st, keys, left, border, right, N1, pivot2);
VerifyPartition(st, lanes, left, border, right, N1, pivot2);
for (size_t i = 0; i < misalign; ++i) {
if (aligned_keys[i] != hwy::LowestValue<T>())
if (aligned_lanes[i] != hwy::LowestValue<LaneType>())
HWY_ABORT("Overrun misalign at %d\n", static_cast<int>(i));
}
for (size_t i = 0; i < left; ++i) {
if (keys[i] != hwy::LowestValue<T>())
if (lanes[i] != hwy::LowestValue<LaneType>())
HWY_ABORT("Overrun left at %d\n", static_cast<int>(i));
}
for (size_t i = right; i < total - misalign; ++i) {
if (keys[i] != hwy::LowestValue<T>())
if (lanes[i] != hwy::LowestValue<LaneType>())
HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
}
} // misalign

@ -361,15 +356,18 @@ static HWY_NOINLINE void TestPartition() {
}

HWY_NOINLINE void TestAllPartition() {
TestPartition<LaneTraits<OrderAscending>, int16_t>();
TestPartition<LaneTraits<OrderDescending>, int32_t>();
TestPartition<LaneTraits<OrderAscending>, int64_t>();
TestPartition<LaneTraits<OrderDescending>, float>();
// Only enable EMU128 on x86 - it's slow on emulators.
if (!HWY_ARCH_X86 && (HWY_TARGET == HWY_EMU128)) return;

TestPartition<TraitsLane<OrderAscending<int16_t> > >();
TestPartition<TraitsLane<OrderDescending<int32_t> > >();
TestPartition<TraitsLane<OrderAscending<int64_t> > >();
TestPartition<TraitsLane<OrderDescending<float> > >();
#if HWY_HAVE_FLOAT64
TestPartition<LaneTraits<OrderDescending>, double>();
TestPartition<TraitsLane<OrderDescending<double> > >();
#endif
TestPartition<Traits128<OrderAscending128>, uint64_t>();
TestPartition<Traits128<OrderDescending128>, uint64_t>();
TestPartition<Traits128<OrderAscending128> >();
TestPartition<Traits128<OrderDescending128> >();
}

// (used for sample selection for choosing a pivot)

@ -399,7 +397,7 @@ static HWY_NOINLINE void TestRandomGenerator() {

// Also ensure the mean is near the middle of the range
const double expected = (num_blocks - 1) / 2.0;
const double actual = double(sum) / kReps;
const double actual = static_cast<double>(sum) / kReps;
HWY_ASSERT(0.9 * expected <= actual && actual <= 1.1 * expected);
}
}

@ -409,22 +407,26 @@ HWY_NOINLINE void TestAllGenerator() {
TestRandomGenerator<uint64_t>();
}

#endif  // VQSORT_TEST_IMPL

#if !VQSORT_TEST_SORT
static void TestAllSort() {}
#else
static void TestAllMedian() {}
static void TestAllBaseCase() {}
static void TestAllPartition() {}
static void TestAllGenerator() {}
#endif  // VQSORT_ENABLED

// Remembers input, and compares results to that of a reference algorithm.
template <class Traits, typename T>
template <class Traits>
class CompareResults {
using LaneType = typename Traits::LaneType;
using KeyType = typename Traits::KeyType;

public:
void SetInput(const T* in, size_t num) {
copy_.resize(num);
memcpy(copy_.data(), in, num * sizeof(T));
CompareResults(const LaneType* in, size_t num_lanes) {
copy_.resize(num_lanes);
memcpy(copy_.data(), in, num_lanes * sizeof(LaneType));
}

bool Verify(const T* output) {
bool Verify(const LaneType* output) {
#if HAVE_PDQSORT
const Algo reference = Algo::kPDQ;
#else

@ -432,13 +434,28 @@ class CompareResults {
#endif
SharedState shared;
using Order = typename Traits::Order;
Run<Order>(reference, copy_.data(), copy_.size(), shared,
/*thread=*/0);
const Traits st;
const size_t num_keys = copy_.size() / st.LanesPerKey();
Run<Order>(reference, reinterpret_cast<KeyType*>(copy_.data()), num_keys,
shared, /*thread=*/0);

for (size_t i = 0; i < copy_.size(); ++i) {
if (copy_[i] != output[i]) {
fprintf(stderr, "Asc %d mismatch at %d: %A %A\n", Order().IsAscending(),
static_cast<int>(i), double(copy_[i]), double(output[i]));
if (sizeof(KeyType) == 16) {
fprintf(stderr,
"%s Asc %d mismatch at %d of %d: %" PRIu64 " %" PRIu64 "\n",
st.KeyString().c_str(), Order().IsAscending(),
static_cast<int>(i), static_cast<int>(copy_.size()),
static_cast<uint64_t>(copy_[i]),
static_cast<uint64_t>(output[i]));
} else {
fprintf(stderr, "Type %s Asc %d mismatch at %d of %d: ",
st.KeyString().c_str(), Order().IsAscending(),
static_cast<int>(i), static_cast<int>(copy_.size()));
PrintValue(copy_[i]);
PrintValue(output[i]);
fprintf(stderr, "\n");
}
return false;
}
}

@ -446,7 +463,7 @@ class CompareResults {
}

private:
std::vector<T> copy_;
std::vector<LaneType> copy_;
};

std::vector<Algo> AlgoForTest() {

@ -467,62 +484,65 @@ std::vector<Algo> AlgoForTest() {
};
}

template <class Traits, typename T>
void TestSort(size_t num) {
// TODO(janwas): fix
if (HWY_TARGET == HWY_SSSE3) return;
template <class Traits>
void TestSort(size_t num_lanes) {
// Workaround for stack overflow on clang-cl (/F 8388608 does not help).
#if defined(_MSC_VER) && HWY_IS_DEBUG_BUILD && (HWY_TARGET == HWY_AVX3)
#if defined(_MSC_VER)
return;
#endif
// Only enable EMU128 on x86 - it's slow on emulators.
if (!HWY_ARCH_X86 && (HWY_TARGET == HWY_EMU128)) return;

using Order = typename Traits::Order;
using LaneType = typename Traits::LaneType;
using KeyType = typename Traits::KeyType;
SharedState shared;
SharedTraits<Traits> st;

// Round up to a whole number of keys.
num_lanes += (st.Is128() && (num_lanes & 1));
const size_t num_keys = num_lanes / st.LanesPerKey();

constexpr size_t kMaxMisalign = 16;
auto aligned = hwy::AllocateAligned<T>(kMaxMisalign + num + kMaxMisalign);
auto aligned =
hwy::AllocateAligned<LaneType>(kMaxMisalign + num_lanes + kMaxMisalign);
for (Algo algo : AlgoForTest()) {
#if HAVE_IPS4O
if (st.Is128() && (algo == Algo::kIPS4O || algo == Algo::kParallelIPS4O)) {
continue;
}
#endif
for (Dist dist : AllDist()) {
for (size_t misalign : {size_t{0}, size_t{st.LanesPerKey()},
size_t{3 * st.LanesPerKey()}, kMaxMisalign / 2}) {
T* keys = aligned.get() + misalign;
LaneType* lanes = aligned.get() + misalign;

// Set up red zones before/after the keys to sort
for (size_t i = 0; i < misalign; ++i) {
aligned[i] = hwy::LowestValue<T>();
aligned[i] = hwy::LowestValue<LaneType>();
}
for (size_t i = 0; i < kMaxMisalign; ++i) {
keys[num + i] = hwy::HighestValue<T>();
lanes[num_lanes + i] = hwy::HighestValue<LaneType>();
}
#if HWY_IS_MSAN
__msan_poison(aligned.get(), misalign * sizeof(T));
__msan_poison(keys + num, kMaxMisalign * sizeof(T));
__msan_poison(aligned.get(), misalign * sizeof(LaneType));
__msan_poison(lanes + num_lanes, kMaxMisalign * sizeof(LaneType));
#endif
InputStats<T> input_stats = GenerateInput(dist, keys, num);
InputStats<LaneType> input_stats =
GenerateInput(dist, lanes, num_lanes);

CompareResults<Traits, T> compare;
compare.SetInput(keys, num);

Run<typename Traits::Order>(algo, keys, num, shared, /*thread=*/0);
HWY_ASSERT(compare.Verify(keys));
HWY_ASSERT(VerifySort(st, input_stats, keys, num, "TestSort"));
CompareResults<Traits> compare(lanes, num_lanes);
Run<Order>(algo, reinterpret_cast<KeyType*>(lanes), num_keys, shared,
/*thread=*/0);
HWY_ASSERT(compare.Verify(lanes));
HWY_ASSERT(VerifySort(st, input_stats, lanes, num_lanes, "TestSort"));

// Check red zones
#if HWY_IS_MSAN
__msan_unpoison(aligned.get(), misalign * sizeof(T));
__msan_unpoison(keys + num, kMaxMisalign * sizeof(T));
__msan_unpoison(aligned.get(), misalign * sizeof(LaneType));
__msan_unpoison(lanes + num_lanes, kMaxMisalign * sizeof(LaneType));
#endif
for (size_t i = 0; i < misalign; ++i) {
if (aligned[i] != hwy::LowestValue<T>())
if (aligned[i] != hwy::LowestValue<LaneType>())
HWY_ABORT("Overrun left at %d\n", static_cast<int>(i));
}
for (size_t i = num; i < num + kMaxMisalign; ++i) {
if (keys[i] != hwy::HighestValue<T>())
for (size_t i = num_lanes; i < num_lanes + kMaxMisalign; ++i) {
if (lanes[i] != hwy::HighestValue<LaneType>())
HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
}
} // misalign

@ -531,32 +551,37 @@ void TestSort(size_t num) {
}

void TestAllSort() {
const size_t num = 15 * 1000;
for (int num : {129, 504, 20 * 1000, 34567}) {
const size_t num_lanes = AdjustedReps(static_cast<size_t>(num));
TestSort<TraitsLane<OrderAscending<int16_t> > >(num_lanes);
TestSort<TraitsLane<OrderDescending<uint16_t> > >(num_lanes);

TestSort<LaneTraits<OrderAscending>, int16_t>(num);
TestSort<LaneTraits<OrderDescending>, uint16_t>(num);
TestSort<TraitsLane<OrderDescending<int32_t> > >(num_lanes);
TestSort<TraitsLane<OrderDescending<uint32_t> > >(num_lanes);

TestSort<LaneTraits<OrderDescending>, int32_t>(num);
TestSort<LaneTraits<OrderDescending>, uint32_t>(num);
TestSort<TraitsLane<OrderAscending<int64_t> > >(num_lanes);
TestSort<TraitsLane<OrderAscending<uint64_t> > >(num_lanes);

TestSort<LaneTraits<OrderAscending>, int64_t>(num);
TestSort<LaneTraits<OrderAscending>, uint64_t>(num);

// WARNING: for float types, SIMD comparisons will flush denormals to zero,
// causing mismatches with scalar sorts. In this test, we avoid generating
// denormal inputs.
TestSort<LaneTraits<OrderAscending>, float>(num);
// WARNING: for float types, SIMD comparisons will flush denormals to
// zero, causing mismatches with scalar sorts. In this test, we avoid
// generating denormal inputs.
TestSort<TraitsLane<OrderAscending<float> > >(num_lanes);
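// A minimal sketch of the denormal hazard mentioned above (illustrative
// assumption, not part of this patch): on targets that flush subnormals,
// SIMD Min/Lt treats two distinct denormals as equal zeros, so a scalar
// reference sort may order them differently. Keeping generated magnitudes
// at or above the smallest *normal* float avoids the divergence, e.g.:
//   std::uniform_real_distribution<float> dist(
//       std::numeric_limits<float>::min(), 1.0f);  // never denormal
//   const float x = dist(rng);  // `rng` is a hypothetical std::mt19937_64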
#if HWY_HAVE_FLOAT64  // protects algo-inl's GenerateRandom
if (Sorter::HaveFloat64()) {
TestSort<LaneTraits<OrderDescending>, double>(num);
}
if (Sorter::HaveFloat64()) {
TestSort<TraitsLane<OrderDescending<double> > >(num_lanes);
}
#endif

TestSort<Traits128<OrderAscending128>, uint64_t>(num);
TestSort<Traits128<OrderAscending128>, uint64_t>(num);
}
// Our HeapSort does not support 128-bit keys.
#if VQSORT_ENABLED
TestSort<Traits128<OrderAscending128> >(num_lanes);
TestSort<Traits128<OrderDescending128> >(num_lanes);

#endif  // VQSORT_TEST_SORT
TestSort<Traits128<OrderAscendingKV128> >(num_lanes);
TestSort<Traits128<OrderDescendingKV128> >(num_lanes);
#endif
}
}

} // namespace
// NOLINTNEXTLINE(google-readability-namespace-comments)

@ -577,10 +602,4 @@ HWY_EXPORT_AND_TEST_P(SortTest, TestAllSort);
} // namespace
} // namespace hwy

// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}

#endif // HWY_ONCE

@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

@ -21,7 +22,6 @@
#define HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
#endif

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/shared-inl.h" // SortConstants
#include "hwy/highway.h"

@ -30,6 +30,8 @@ namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {

#if VQSORT_ENABLED

using Constants = hwy::SortConstants;

// ------------------------------ SharedTraits

@ -589,17 +591,19 @@ HWY_INLINE void Merge16(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4,
// Reshapes `buf` into a matrix, sorts columns independently, and then merges
// into a sorted 1D array without transposing.
//
// `st` is SharedTraits<LaneTraits/Traits128<Order*>>. This abstraction layer
// bridges differences in sort order and single-lane vs 128-bit keys.
// `st` is SharedTraits<Traits*<Order*>>. This abstraction layer bridges
// differences in sort order and single-lane vs 128-bit keys.
// `buf` ensures full vectors are aligned, and enables loads/stores without
// bounds checks.
//
// NOINLINE because this is large and called twice from vqsort-inl.h.
//
// References:
// https://drops.dagstuhl.de/opus/volltexte/2021/13775/pdf/LIPIcs-SEA-2021-3.pdf
// https://github.com/simd-sorting/fast-and-robust/blob/master/avx2_sort_demo/avx2sort.h
// "Entwurf und Implementierung vektorisierter Sortieralgorithmen" (M. Blacher)
template <class Traits, typename T>
HWY_INLINE void SortingNetwork(Traits st, T* HWY_RESTRICT buf, size_t cols) {
HWY_NOINLINE void SortingNetwork(Traits st, T* HWY_RESTRICT buf, size_t cols) {
const CappedTag<T, Constants::kMaxCols> d;
using V = decltype(Zero(d));

@ -646,8 +650,8 @@ HWY_INLINE void SortingNetwork(Traits st, T* HWY_RESTRICT buf, size_t cols) {
Merge8(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd,
ve, vf);

// Avoids build timeout
#if !HWY_COMPILER_MSVC
// Avoids build timeout. Must match #if condition in kMaxCols.
#if !HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD
if (HWY_LIKELY(keys >= 16 && kMaxKeys >= 16)) {
Merge16(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd,
ve, vf);

@ -677,6 +681,11 @@ HWY_INLINE void SortingNetwork(Traits st, T* HWY_RESTRICT buf, size_t cols) {
StoreU(vf, d, buf + 0xf * cols);
}
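// A scalar model of the reshape-and-sort-columns idea described above
// (illustrative sketch only, not part of this patch; assumes <algorithm> and
// <vector>, with std::sort standing in for the in-register sorting network):
static inline void SortColumnsScalarModel(int* buf, size_t rows, size_t cols) {
  for (size_t c = 0; c < cols; ++c) {
    std::vector<int> column(rows);
    for (size_t r = 0; r < rows; ++r) column[r] = buf[r * cols + c];  // gather
    std::sort(column.begin(), column.end());
    for (size_t r = 0; r < rows; ++r) buf[r * cols + c] = column[r];  // scatter
  }
  // The vectorized code reaches the same state via Sort16 on vector lanes,
  // then merges the sorted columns with Merge8/Merge16 instead of transposing.
}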
#else
template <class Base>
struct SharedTraits : public Base {};
#endif // VQSORT_ENABLED

} // namespace detail
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE

@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

@ -21,36 +22,64 @@
#define HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE
#endif

#include "hwy/contrib/sort/disabled_targets.h"
#include <string>

#include "hwy/contrib/sort/shared-inl.h" // SortConstants
#include "hwy/contrib/sort/vqsort.h" // SortDescending
#include "hwy/highway.h"
#include "hwy/print.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {

#if VQSORT_ENABLED || HWY_IDE

// Highway does not provide a lane type for 128-bit keys, so we use uint64_t
// along with an abstraction layer for single-lane vs. lane-pair, which is
// independent of the order.
template <typename T>
struct KeyLane {
constexpr bool Is128() const { return false; }
constexpr size_t LanesPerKey() const { return 1; }

// What type bench_sort should allocate for generating inputs.
using LaneType = T;
// What type to pass to Sorter::operator().
using KeyType = T;

std::string KeyString() const {
char string100[100];
hwy::detail::TypeName(hwy::detail::MakeTypeInfo<KeyType>(), 1, string100);
return string100;
}

// For HeapSort
template <typename T>
HWY_INLINE void Swap(T* a, T* b) const {
const T temp = *a;
*a = *b;
*b = temp;
}

template <class V, class M>
HWY_INLINE V CompressKeys(V keys, M mask) const {
return CompressNot(keys, mask);
}

// Broadcasts one key into a vector
template <class D>
HWY_INLINE Vec<D> SetKey(D d, const TFromD<D>* key) const {
HWY_INLINE Vec<D> SetKey(D d, const T* key) const {
return Set(d, *key);
}

template <class D>
HWY_INLINE Mask<D> EqualKeys(D /*tag*/, Vec<D> a, Vec<D> b) const {
return Eq(a, b);
}

HWY_INLINE bool Equal1(const T* a, const T* b) { return *a == *b; }

template <class D>
HWY_INLINE Vec<D> ReverseKeys(D d, Vec<D> v) const {
return Reverse(d, v);

@ -148,10 +177,10 @@ struct KeyLane {
// We avoid overloaded functions because we want all functions to be callable
// from a SortTraits without per-function wrappers. Specializing would work, but
// we are anyway going to specialize at a higher level.
struct OrderAscending : public KeyLane {
template <typename T>
struct OrderAscending : public KeyLane<T> {
using Order = SortAscending;

template <typename T>
HWY_INLINE bool Compare1(const T* a, const T* b) {
return *a < *b;
}

@ -174,31 +203,31 @@ struct OrderAscending : public KeyLane {

template <class D>
HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
TFromD<D>* HWY_RESTRICT /* buf */) const {
T* HWY_RESTRICT /* buf */) const {
return MinOfLanes(d, v);
}

template <class D>
HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
TFromD<D>* HWY_RESTRICT /* buf */) const {
T* HWY_RESTRICT /* buf */) const {
return MaxOfLanes(d, v);
}

template <class D>
HWY_INLINE Vec<D> FirstValue(D d) const {
return Set(d, hwy::LowestValue<TFromD<D>>());
return Set(d, hwy::LowestValue<T>());
}

template <class D>
HWY_INLINE Vec<D> LastValue(D d) const {
return Set(d, hwy::HighestValue<TFromD<D>>());
return Set(d, hwy::HighestValue<T>());
}
};

struct OrderDescending : public KeyLane {
template <typename T>
struct OrderDescending : public KeyLane<T> {
using Order = SortDescending;

template <typename T>
HWY_INLINE bool Compare1(const T* a, const T* b) {
return *b < *a;
}

@ -220,32 +249,30 @@ struct OrderDescending : public KeyLane {

template <class D>
HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
TFromD<D>* HWY_RESTRICT /* buf */) const {
T* HWY_RESTRICT /* buf */) const {
return MaxOfLanes(d, v);
}

template <class D>
HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
TFromD<D>* HWY_RESTRICT /* buf */) const {
T* HWY_RESTRICT /* buf */) const {
return MinOfLanes(d, v);
}

template <class D>
HWY_INLINE Vec<D> FirstValue(D d) const {
return Set(d, hwy::HighestValue<TFromD<D>>());
return Set(d, hwy::HighestValue<T>());
}

template <class D>
HWY_INLINE Vec<D> LastValue(D d) const {
return Set(d, hwy::LowestValue<TFromD<D>>());
return Set(d, hwy::LowestValue<T>());
}
};

// Shared code that depends on Order.
template <class Base>
struct LaneTraits : public Base {
constexpr bool Is128() const { return false; }

struct TraitsLane : public Base {
// For each lane i: replaces a[i] with the first and b[i] with the second
// according to Base.
// Corresponds to a conditional swap, which is one "node" of a sorting

@ -315,6 +342,66 @@ struct LaneTraits : public Base {
}
};

#else

// Base class shared between OrderAscending, OrderDescending.
template <typename T>
struct KeyLane {
constexpr bool Is128() const { return false; }
constexpr size_t LanesPerKey() const { return 1; }

using LaneType = T;
using KeyType = T;

std::string KeyString() const {
char string100[100];
hwy::detail::TypeName(hwy::detail::MakeTypeInfo<KeyType>(), 1, string100);
return string100;
}
};

template <typename T>
struct OrderAscending : public KeyLane<T> {
using Order = SortAscending;

HWY_INLINE bool Compare1(const T* a, const T* b) { return *a < *b; }

template <class D>
HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) {
return Lt(a, b);
}
};

template <typename T>
struct OrderDescending : public KeyLane<T> {
using Order = SortDescending;

HWY_INLINE bool Compare1(const T* a, const T* b) { return *b < *a; }

template <class D>
HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) {
return Lt(b, a);
}
};

template <class Order>
struct TraitsLane : public Order {
// For HeapSort
template <typename T> // MSVC doesn't find typename Order::LaneType.
HWY_INLINE void Swap(T* a, T* b) const {
const T temp = *a;
*a = *b;
*b = temp;
}

template <class D>
HWY_INLINE Vec<D> SetKey(D d, const TFromD<D>* key) const {
return Set(d, *key);
}
};

#endif // VQSORT_ENABLED

} // namespace detail
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE

@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

@ -21,6 +22,9 @@
#define HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
#endif

#include <string>

#include "hwy/contrib/sort/shared-inl.h"
#include "hwy/contrib/sort/vqsort.h" // SortDescending
#include "hwy/highway.h"

@ -29,48 +33,31 @@ namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {

#if HWY_TARGET == HWY_SCALAR

struct OrderAscending128 {
using Order = SortAscending;

template <typename T>
HWY_INLINE bool Compare1(const T* a, const T* b) {
return (a[1] == b[1]) ? a[0] < b[0] : a[1] < b[1];
}
};

struct OrderDescending128 {
using Order = SortDescending;

template <typename T>
HWY_INLINE bool Compare1(const T* a, const T* b) {
return (a[1] == b[1]) ? b[0] < a[0] : b[1] < a[1];
}
};

template <class Order>
struct Traits128 : public Order {
constexpr bool Is128() const { return true; }
constexpr size_t LanesPerKey() const { return 2; }
};

#else
#if VQSORT_ENABLED || HWY_IDE

// Highway does not provide a lane type for 128-bit keys, so we use uint64_t
// along with an abstraction layer for single-lane vs. lane-pair, which is
// independent of the order.
struct Key128 {
struct KeyAny128 {
constexpr bool Is128() const { return true; }
constexpr size_t LanesPerKey() const { return 2; }

template <typename T>
HWY_INLINE void Swap(T* a, T* b) const {
const FixedTag<T, 2> d;
// What type bench_sort should allocate for generating inputs.
using LaneType = uint64_t;
// KeyType and KeyString are defined by derived classes.

HWY_INLINE void Swap(LaneType* a, LaneType* b) const {
const FixedTag<LaneType, 2> d;
const auto temp = LoadU(d, a);
StoreU(LoadU(d, b), d, a);
StoreU(temp, d, b);
}

template <class V, class M>
HWY_INLINE V CompressKeys(V keys, M mask) const {
return CompressBlocksNot(keys, mask);
}

template <class D>
HWY_INLINE Vec<D> SetKey(D d, const TFromD<D>* key) const {
return LoadDup128(d, key);

@ -135,6 +122,23 @@ struct Key128 {
}
};

// Base class shared between OrderAscending128, OrderDescending128.
struct Key128 : public KeyAny128 {
// What type to pass to Sorter::operator().
using KeyType = hwy::uint128_t;

std::string KeyString() const { return "U128"; }

template <class D>
HWY_INLINE Mask<D> EqualKeys(D /*tag*/, Vec<D> a, Vec<D> b) const {
return Eq128(a, b);
}

HWY_INLINE bool Equal1(const LaneType* a, const LaneType* b) {
return a[0] == b[0] && a[1] == b[1];
}
};
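// Scalar model of the lane-pair layout above (illustrative sketch only, not
// part of this patch): lanes[1] holds the upper 64 bits of a key and lanes[0]
// the lower 64, so a lexicographic less-than mirrors Compare1 below.
static inline bool Less128ScalarModel(const uint64_t* a, const uint64_t* b) {
  // Upper halves decide; only a tie falls through to the lower halves.
  return (a[1] == b[1]) ? (a[0] < b[0]) : (a[1] < b[1]);
}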
// Anything order-related depends on the key traits *and* the order (see
// FirstOfLanes). We cannot implement just one Compare function because Lt128
// only compiles if the lane type is u64. Thus we need either overloaded

@ -145,8 +149,7 @@ struct Key128 {
struct OrderAscending128 : public Key128 {
using Order = SortAscending;

template <typename T>
HWY_INLINE bool Compare1(const T* a, const T* b) {
HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) {
return (a[1] == b[1]) ? a[0] < b[0] : a[1] < b[1];
}

@ -171,30 +174,6 @@ struct OrderAscending128 : public Key128 {
return Max128(d, a, b);
}

template <class D>
HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
TFromD<D>* HWY_RESTRICT buf) const {
const size_t N = Lanes(d);
Store(v, d, buf);
v = SetKey(d, buf + 0); // result must be broadcasted
for (size_t i = LanesPerKey(); i < N; i += LanesPerKey()) {
v = First(d, v, SetKey(d, buf + i));
}
return v;
}

template <class D>
HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
TFromD<D>* HWY_RESTRICT buf) const {
const size_t N = Lanes(d);
Store(v, d, buf);
v = SetKey(d, buf + 0); // result must be broadcasted
for (size_t i = LanesPerKey(); i < N; i += LanesPerKey()) {
v = Last(d, v, SetKey(d, buf + i));
}
return v;
}

// Same as for regular lanes because 128-bit lanes are u64.
template <class D>
HWY_INLINE Vec<D> FirstValue(D d) const {

@ -210,8 +189,7 @@ struct OrderAscending128 : public Key128 {
struct OrderDescending128 : public Key128 {
using Order = SortDescending;

template <typename T>
HWY_INLINE bool Compare1(const T* a, const T* b) {
HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) {
return (a[1] == b[1]) ? b[0] < a[0] : b[1] < a[1];
}

@ -236,28 +214,101 @@ struct OrderDescending128 : public Key128 {
return Min128(d, a, b);
}

// Same as for regular lanes because 128-bit lanes are u64.
template <class D>
HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
TFromD<D>* HWY_RESTRICT buf) const {
const size_t N = Lanes(d);
Store(v, d, buf);
v = SetKey(d, buf + 0); // result must be broadcasted
for (size_t i = LanesPerKey(); i < N; i += LanesPerKey()) {
v = First(d, v, SetKey(d, buf + i));
}
return v;
HWY_INLINE Vec<D> FirstValue(D d) const {
return Set(d, hwy::HighestValue<TFromD<D> >());
}

template <class D>
HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
TFromD<D>* HWY_RESTRICT buf) const {
const size_t N = Lanes(d);
Store(v, d, buf);
v = SetKey(d, buf + 0); // result must be broadcasted
for (size_t i = LanesPerKey(); i < N; i += LanesPerKey()) {
v = Last(d, v, SetKey(d, buf + i));
}
return v;
HWY_INLINE Vec<D> LastValue(D d) const {
return Set(d, hwy::LowestValue<TFromD<D> >());
}
};

// Base class shared between OrderAscendingKV128, OrderDescendingKV128.
struct KeyValue128 : public KeyAny128 {
// What type to pass to Sorter::operator().
using KeyType = K64V64;

std::string KeyString() const { return "KV128"; }

template <class D>
HWY_INLINE Mask<D> EqualKeys(D /*tag*/, Vec<D> a, Vec<D> b) const {
return Eq128Upper(a, b);
}

HWY_INLINE bool Equal1(const LaneType* a, const LaneType* b) {
return a[1] == b[1];
}
};
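// Scalar model of the KV128 ordering (illustrative sketch only, not part of
// this patch): the upper u64 lane is the key and the lower lane is an opaque
// value that merely travels with it, so Compare1/Equal1 look at lanes[1] only.
static inline bool LessKV128ScalarModel(const uint64_t* a, const uint64_t* b) {
  return a[1] < b[1];  // a[0]/b[0] are payload and never affect the order
}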
struct OrderAscendingKV128 : public KeyValue128 {
using Order = SortAscending;

HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) {
return a[1] < b[1];
}

template <class D>
HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const {
return Lt128Upper(d, a, b);
}

// Used by CompareTop
template <class V>
HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const {
return Lt(a, b);
}

template <class D>
HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const {
return Min128Upper(d, a, b);
}

template <class D>
HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const {
return Max128Upper(d, a, b);
}

// Same as for regular lanes because 128-bit lanes are u64.
template <class D>
HWY_INLINE Vec<D> FirstValue(D d) const {
return Set(d, hwy::LowestValue<TFromD<D> >());
}

template <class D>
HWY_INLINE Vec<D> LastValue(D d) const {
return Set(d, hwy::HighestValue<TFromD<D> >());
}
};

struct OrderDescendingKV128 : public KeyValue128 {
using Order = SortDescending;

HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) {
return b[1] < a[1];
}

template <class D>
HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const {
return Lt128Upper(d, b, a);
}

// Used by CompareTop
template <class V>
HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const {
return Lt(b, a);
}

template <class D>
HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const {
return Max128Upper(d, a, b);
}

template <class D>
HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const {
return Min128Upper(d, a, b);
}

// Same as for regular lanes because 128-bit lanes are u64.

@ -275,16 +326,21 @@ struct OrderDescending128 : public Key128 {
// Shared code that depends on Order.
template <class Base>
class Traits128 : public Base {
#if HWY_TARGET <= HWY_AVX2
// Special case for >= 256 bit vectors
#if HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SVE_256
// Returns vector with only the top u64 lane valid. Useful when the next step
// is to replicate the mask anyway.
template <class D>
HWY_INLINE HWY_MAYBE_UNUSED Vec<D> CompareTop(D d, Vec<D> a, Vec<D> b) const {
const Base* base = static_cast<const Base*>(this);
const Vec<D> eqHL = VecFromMask(d, Eq(a, b));
const Mask<D> eqHL = Eq(a, b);
const Vec<D> ltHL = VecFromMask(d, base->CompareLanes(a, b));
#if HWY_TARGET == HWY_SVE_256
return IfThenElse(eqHL, DupEven(ltHL), ltHL);
#else
const Vec<D> ltLX = ShiftLeftLanes<1>(ltHL);
return OrAnd(ltHL, eqHL, ltLX);
return OrAnd(ltHL, VecFromMask(d, eqHL), ltLX);
#endif
}
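// Scalar reading of the ascending case (illustrative only): the top u64 lane
// of the result is a full lexicographic less-than,
//   top = (a[1] < b[1]) || (a[1] == b[1] && a[0] < b[0]),
// built from ltHL (per-lane less) and eqHL (per-lane equal), with the low
// lane's verdict shifted up (or duplicated, on SVE_256) into the top lane.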
// We want to swap 2 u128, i.e. 4 u64 lanes, based on the 0 or FF..FF mask in

@ -292,16 +348,42 @@ class Traits128 : public Base {
// replicate it 4x. Only called for >= 256-bit vectors.
template <class V>
HWY_INLINE V ReplicateTop4x(V v) const {
#if HWY_TARGET <= HWY_AVX3
#if HWY_TARGET == HWY_SVE_256
return svdup_lane_u64(v, 3);
#elif HWY_TARGET <= HWY_AVX3
return V{_mm512_permutex_epi64(v.raw, _MM_SHUFFLE(3, 3, 3, 3))};
#else // AVX2
return V{_mm256_permute4x64_epi64(v.raw, _MM_SHUFFLE(3, 3, 3, 3))};
#endif
}
#endif
#endif // HWY_TARGET

public:
constexpr bool Is128() const { return true; }
template <class D>
HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
TFromD<D>* HWY_RESTRICT buf) const {
const Base* base = static_cast<const Base*>(this);
const size_t N = Lanes(d);
Store(v, d, buf);
v = base->SetKey(d, buf + 0); // result must be broadcasted
for (size_t i = base->LanesPerKey(); i < N; i += base->LanesPerKey()) {
v = base->First(d, v, base->SetKey(d, buf + i));
}
return v;
}

template <class D>
HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
TFromD<D>* HWY_RESTRICT buf) const {
const Base* base = static_cast<const Base*>(this);
const size_t N = Lanes(d);
Store(v, d, buf);
v = base->SetKey(d, buf + 0); // result must be broadcasted
for (size_t i = base->LanesPerKey(); i < N; i += base->LanesPerKey()) {
v = base->Last(d, v, base->SetKey(d, buf + i));
}
return v;
}

template <class D>
HWY_INLINE void Sort2(D d, Vec<D>& a, Vec<D>& b) const {

@ -319,7 +401,7 @@ class Traits128 : public Base {
const Base* base = static_cast<const Base*>(this);
Vec<D> swapped = base->ReverseKeys2(d, v);

#if HWY_TARGET <= HWY_AVX2
#if HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SVE_256
const Vec<D> select = ReplicateTop4x(CompareTop(d, v, swapped));
return IfVecThenElse(select, swapped, v);
#else

@ -357,7 +439,7 @@ class Traits128 : public Base {
}
};

#endif // HWY_TARGET != HWY_SCALAR
#endif // VQSORT_ENABLED

} // namespace detail
// NOLINTNEXTLINE(google-readability-namespace-comments)

@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

@ -16,6 +17,10 @@
#ifndef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_INL_H_
#define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_INL_H_

#ifndef VQSORT_PRINT
#define VQSORT_PRINT 0
#endif

// Makes it harder for adversaries to predict our sampling locations, at the
// cost of 1-2% increased runtime.
#ifndef VQSORT_SECURE_RNG

@ -26,10 +31,13 @@
#include "third_party/absl/random/random.h"
#endif

#if VQSORT_PRINT
#include <stdio.h>
#endif

#include <string.h> // memcpy

#include "hwy/cache_control.h" // Prefetch
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/cache_control.h" // Prefetch
#include "hwy/contrib/sort/vqsort.h" // Fill24Bytes

#if HWY_IS_MSAN

@ -47,6 +55,10 @@
#define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE
#endif

#if VQSORT_PRINT
#include "hwy/print-inl.h"
#endif

#include "hwy/contrib/sort/shared-inl.h"
#include "hwy/contrib/sort/sorting_networks-inl.h"
#include "hwy/highway.h"

@ -56,117 +68,66 @@ namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {

#if HWY_TARGET == HWY_SCALAR

template <typename T>
void Swap(T* a, T* b) {
T t = *a;
*a = *b;
*b = t;
}

// Scalar version of HeapSort (see below)
template <class Traits, typename T>
void HeapSort(Traits st, T* HWY_RESTRICT keys, const size_t num) {
if (num < 2) return;

// Build heap.
for (size_t i = 1; i < num; i += 1) {
size_t j = i;
while (j != 0) {
const size_t idx_parent = ((j - 1) / 1 / 2);
if (!st.Compare1(keys + idx_parent, keys + j)) {
break;
}
Swap(keys + j, keys + idx_parent);
j = idx_parent;
}
}

for (size_t i = num - 1; i != 0; i -= 1) {
// Swap root with last
Swap(keys + 0, keys + i);

// Sift down the new root.
size_t j = 0;
while (j < i) {
const size_t left = 2 * j + 1;
const size_t right = 2 * j + 2;
if (left >= i) break;
size_t idx_larger = j;
if (st.Compare1(keys + j, keys + left)) {
idx_larger = left;
}
if (right < i && st.Compare1(keys + idx_larger, keys + right)) {
idx_larger = right;
}
if (idx_larger == j) break;
Swap(keys + j, keys + idx_larger);
j = idx_larger;
}
}
}

#else

using Constants = hwy::SortConstants;

// ------------------------------ HeapSort

// Heapsort: O(1) space, O(N*logN) worst-case comparisons.
// Based on LLVM sanitizer_common.h, licensed under Apache-2.0.
template <class Traits, typename T>
void HeapSort(Traits st, T* HWY_RESTRICT keys, const size_t num) {
void SiftDown(Traits st, T* HWY_RESTRICT lanes, const size_t num_lanes,
size_t start) {
constexpr size_t N1 = st.LanesPerKey();
const FixedTag<T, N1> d;

if (num < 2 * N1) return;

// Build heap.
for (size_t i = N1; i < num; i += N1) {
size_t j = i;
while (j != 0) {
const size_t idx_parent = ((j - N1) / N1 / 2) * N1;
if (AllFalse(d, st.Compare(d, st.SetKey(d, keys + idx_parent),
st.SetKey(d, keys + j)))) {
break;
}
st.Swap(keys + j, keys + idx_parent);
j = idx_parent;
while (start < num_lanes) {
const size_t left = 2 * start + N1;
const size_t right = 2 * start + 2 * N1;
if (left >= num_lanes) break;
size_t idx_larger = start;
const auto key_j = st.SetKey(d, lanes + start);
if (AllTrue(d, st.Compare(d, key_j, st.SetKey(d, lanes + left)))) {
idx_larger = left;
}
}

for (size_t i = num - N1; i != 0; i -= N1) {
// Swap root with last
st.Swap(keys + 0, keys + i);

// Sift down the new root.
size_t j = 0;
while (j < i) {
const size_t left = 2 * j + N1;
const size_t right = 2 * j + 2 * N1;
if (left >= i) break;
size_t idx_larger = j;
const auto key_j = st.SetKey(d, keys + j);
if (AllTrue(d, st.Compare(d, key_j, st.SetKey(d, keys + left)))) {
idx_larger = left;
}
if (right < i && AllTrue(d, st.Compare(d, st.SetKey(d, keys + idx_larger),
st.SetKey(d, keys + right)))) {
idx_larger = right;
}
if (idx_larger == j) break;
st.Swap(keys + j, keys + idx_larger);
j = idx_larger;
if (right < num_lanes &&
AllTrue(d, st.Compare(d, st.SetKey(d, lanes + idx_larger),
st.SetKey(d, lanes + right)))) {
idx_larger = right;
}
if (idx_larger == start) break;
st.Swap(lanes + start, lanes + idx_larger);
start = idx_larger;
}
}

// Heapsort: O(1) space, O(N*logN) worst-case comparisons.
// Based on LLVM sanitizer_common.h, licensed under Apache-2.0.
template <class Traits, typename T>
void HeapSort(Traits st, T* HWY_RESTRICT lanes, const size_t num_lanes) {
constexpr size_t N1 = st.LanesPerKey();

if (num_lanes < 2 * N1) return;

// Build heap.
for (size_t i = ((num_lanes - N1) / N1 / 2) * N1; i != (~N1 + 1); i -= N1) {
SiftDown(st, lanes, num_lanes, i);
}

for (size_t i = num_lanes - N1; i != 0; i -= N1) {
// Swap root with last
st.Swap(lanes + 0, lanes + i);

// Sift down the new root.
SiftDown(st, lanes, i, 0);
}
}
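// Usage sketch (illustrative assumption, not part of this patch; assumes the
// traits headers are in scope): HeapSort is the non-vqsort fallback and can
// be invoked directly with a traits object, e.g. for ascending lanes:
template <typename T>
void HeapSortAscendingModel(T* lanes, size_t num_lanes) {
  SharedTraits<TraitsLane<OrderAscending<T> > > st;
  HeapSort(st, lanes, num_lanes);  // sorts lanes[0, num_lanes) in place
}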
#if VQSORT_ENABLED || HWY_IDE

// ------------------------------ BaseCase

// Sorts `keys` within the range [0, num) via sorting network.
template <class D, class Traits, typename T>
HWY_NOINLINE void BaseCase(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
HWY_NOINLINE void BaseCase(D d, Traits st, T* HWY_RESTRICT keys,
T* HWY_RESTRICT keys_end, size_t num,
T* HWY_RESTRICT buf) {
const size_t N = Lanes(d);
using V = decltype(Zero(d));

@ -184,14 +145,25 @@ HWY_NOINLINE void BaseCase(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
HWY_MAX(st.LanesPerKey(), num_pow2 >> Constants::kMaxRowsLog2);
HWY_DASSERT(cols <= N);

// We can avoid padding and load/store directly to `keys` after checking the
// original input array has enough space. Except at the right border, it's OK
// to sort more than the current sub-array. Even if we sort across a previous
// partition point, we know that keys will not migrate across it. However, we
// must use the maximum size of the sorting network, because the StoreU of its
// last vector would otherwise write invalid data starting at kMaxRows * cols.
const size_t N_sn = Lanes(CappedTag<T, Constants::kMaxCols>());
if (HWY_LIKELY(keys + N_sn * Constants::kMaxRows <= keys_end)) {
SortingNetwork(st, keys, N_sn);
return;
}

// Copy `keys` to `buf`.
size_t i;
for (i = 0; i + N <= num; i += N) {
Store(LoadU(d, keys + i), d, buf + i);
}
for (; i < num; ++i) {
buf[i] = keys[i];
}
SafeCopyN(num - i, d, keys + i, buf + i);
i = num;

// Fill with padding - last in sort order, not copied to keys.
const V kPadding = st.LastValue(d);

@ -206,9 +178,7 @@ HWY_NOINLINE void BaseCase(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
for (i = 0; i + N <= num; i += N) {
StoreU(Load(d, buf + i), d, keys + i);
}
for (; i < num; ++i) {
keys[i] = buf[i];
}
SafeCopyN(num - i, d, buf + i, keys + i);
}
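// Scalar model of the pad-and-sort fallback above (illustrative sketch only,
// not part of this patch; std::sort stands in for SortingNetwork and assumes
// <algorithm>/<vector>; `padding` must compare last, e.g. INT_MAX ascending):
static inline void PadAndSortScalarModel(int* keys, size_t num,
                                         size_t padded_size, int padding) {
  std::vector<int> buf(padded_size, padding);  // padding sorts to the end
  std::copy(keys, keys + num, buf.begin());    // stage input in buf
  std::sort(buf.begin(), buf.end());           // stand-in for the network
  std::copy(buf.begin(), buf.begin() + num, keys);  // padding is not copied
}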
// ------------------------------ Partition

@ -268,15 +238,35 @@ HWY_NOINLINE void PartitionToMultipleOfUnroll(D d, Traits st,
template <class D, class Traits, typename T>
HWY_INLINE void StoreLeftRight(D d, Traits st, const Vec<D> v,
const Vec<D> pivot, T* HWY_RESTRICT keys,
size_t& writeL, size_t& writeR) {
size_t& writeL, size_t& remaining) {
const size_t N = Lanes(d);

const auto comp = st.Compare(d, pivot, v);
const size_t num_left = CompressBlendedStore(v, Not(comp), d, keys + writeL);
writeL += num_left;

writeR -= (N - num_left);
(void)CompressBlendedStore(v, comp, d, keys + writeR);
remaining -= N;
if (hwy::HWY_NAMESPACE::CompressIsPartition<T>::value ||
(HWY_MAX_BYTES == 16 && st.Is128())) {
// Non-native Compress (e.g. AVX2): we are able to partition a vector using
// a single Compress+two StoreU instead of two Compress[Blended]Store. The
// latter are more expensive. Because we store entire vectors, the contents
// between the updated writeL and writeR are ignored and will be overwritten
// by subsequent calls. This works because writeL and writeR are at least
// two vectors apart.
const auto lr = st.CompressKeys(v, comp);
const size_t num_left = N - CountTrue(d, comp);
StoreU(lr, d, keys + writeL);
// Now write the right-side elements (if any), such that the previous writeR
// is one past the end of the newly written right elements, then advance.
StoreU(lr, d, keys + remaining + writeL);
writeL += num_left;
} else {
// Native Compress[Store] (e.g. AVX3), which only keep the left or right
// side, not both, hence we require two calls.
const size_t num_left = CompressStore(v, Not(comp), d, keys + writeL);
writeL += num_left;

(void)CompressBlendedStore(v, comp, d, keys + remaining + writeL);
}
}

template <class D, class Traits, typename T>

@ -284,11 +274,11 @@ HWY_INLINE void StoreLeftRight4(D d, Traits st, const Vec<D> v0,
const Vec<D> v1, const Vec<D> v2,
const Vec<D> v3, const Vec<D> pivot,
T* HWY_RESTRICT keys, size_t& writeL,
size_t& writeR) {
StoreLeftRight(d, st, v0, pivot, keys, writeL, writeR);
StoreLeftRight(d, st, v1, pivot, keys, writeL, writeR);
StoreLeftRight(d, st, v2, pivot, keys, writeL, writeR);
StoreLeftRight(d, st, v3, pivot, keys, writeL, writeR);
size_t& remaining) {
StoreLeftRight(d, st, v0, pivot, keys, writeL, remaining);
StoreLeftRight(d, st, v1, pivot, keys, writeL, remaining);
StoreLeftRight(d, st, v2, pivot, keys, writeL, remaining);
StoreLeftRight(d, st, v3, pivot, keys, writeL, remaining);
}

// Moves "<= pivot" keys to the front, and others to the back. pivot is

@ -313,9 +303,39 @@ HWY_NOINLINE size_t Partition(D d, Traits st, T* HWY_RESTRICT keys, size_t left,
PartitionToMultipleOfUnroll(d, st, keys, left, right, pivot, buf);
constexpr size_t kUnroll = Constants::kPartitionUnroll;

// Invariant: [left, writeL) and [writeR, right) are already partitioned.
// Partition splits the vector into 3 sections, left to right: Elements
// smaller or equal to the pivot, unpartitioned elements and elements larger
// than the pivot. To write elements unconditionally on the loop body without
// overwriting existing data, we maintain two regions of the loop where all
// elements have been copied elsewhere (e.g. vector registers). I call these
// bufferL and bufferR, for left and right respectively.
//
// These regions are tracked by the indices (writeL, writeR, left, right) as
// presented in the diagram below.
//
// writeL writeR
// \/ \/
// | <= pivot | bufferL | unpartitioned | bufferR | > pivot |
// \/ \/
// left right
//
// In the main loop body below we choose a side, load some elements out of the
// vector and move either `left` or `right`. Next we call into StoreLeftRight
// to partition the data, and the partitioned elements will be written either
// to writeR or writeL and the corresponding index will be moved accordingly.
//
// Note that writeR is not explicitly tracked as an optimization for platforms
// with conditional operations. Instead we track writeL and the number of
// elements left to process (`remaining`). From the diagram above we can see
// that:
// writeR - writeL = remaining => writeR = remaining + writeL
//
// Tracking `remaining` is advantageous because each iteration reduces the
// number of unpartitioned elements by a fixed amount, so we can compute
// `remaining` without data dependencies.
//
size_t writeL = left;
size_t writeR = right;
size_t remaining = right - left;
|
||||
|
||||
const size_t num = right - left;
|
||||
// Cannot load if there were fewer than 2 * kUnroll * N.
|
||||
|
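To make the identity concrete with hypothetical numbers: for left = 8 and right = 40, the loop starts with writeL = 8 and remaining = 32, so the implicit writeR = 8 + 32 = 40 = right. If one StoreLeftRight call then consumes N = 4 keys of which 3 are <= pivot, writeL becomes 11 and remaining becomes 28, giving an implicit writeR of 39 — the same value the removed code would have computed as writeR - (N - num_left) = 40 - 1.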
@@ -339,12 +359,33 @@ HWY_NOINLINE size_t Partition(D d, Traits st, T* HWY_RESTRICT keys, size_t left,
  while (left != right) {
    V v0, v1, v2, v3;

    // Free up capacity for writing by loading from the side that has less.
    // Data-dependent but branching is faster than forcing branch-free.
    const size_t capacityL = left - writeL;
    const size_t capacityR = writeR - right;
    HWY_DASSERT(capacityL <= num && capacityR <= num);  // >= 0
    if (capacityR < capacityL) {
    HWY_DASSERT(capacityL <= num);  // >= 0
    // Load data from the end of the vector with less data (front or back).
    // The next paragraphs explain how this works.
    //
    // let block_size = (kUnroll * N)
    // On the loop prelude we load block_size elements from the front of the
    // vector and an additional block_size elements from the back. On each
    // iteration k elements are written to the front of the vector and
    // (block_size - k) to the back.
    //
    // This creates a loop invariant where the capacity on the front
    // (capacityL) and on the back (capacityR) always add to 2 * block_size.
    // In other words:
    //   capacityL + capacityR = 2 * block_size
    //   capacityR = 2 * block_size - capacityL
    //
    // This means that:
    //   capacityL < capacityR <=>
    //   capacityL < 2 * block_size - capacityL <=>
    //   2 * capacityL < 2 * block_size <=>
    //   capacityL < block_size
    //
    // Thus the check on the next line is equivalent to capacityL > capacityR.
    //
    if (kUnroll * N < capacityL) {
      right -= kUnroll * N;
      v0 = LoadU(d, keys + right + 0 * N);
      v1 = LoadU(d, keys + right + 1 * N);

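A worked instance of the invariant derived above, with hypothetical kUnroll = 4 and N = 4 lanes: block_size = 16 and the prelude guarantees capacityL + capacityR = 32. If at some iteration capacityL = 20, then capacityR = 32 - 20 = 12, so capacityL > capacityR, and the test 16 < 20 correctly selects the back of the array for the next load.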
@@ -360,16 +401,16 @@ HWY_NOINLINE size_t Partition(D d, Traits st, T* HWY_RESTRICT keys, size_t left,
      hwy::Prefetch(keys + left + 3 * kUnroll * N);
    }

    StoreLeftRight4(d, st, v0, v1, v2, v3, pivot, keys, writeL, writeR);
    StoreLeftRight4(d, st, v0, v1, v2, v3, pivot, keys, writeL, remaining);
  }

  // Now finish writing the initial left/right to the middle.
  StoreLeftRight4(d, st, vL0, vL1, vL2, vL3, pivot, keys, writeL, writeR);
  StoreLeftRight4(d, st, vR0, vR1, vR2, vR3, pivot, keys, writeL, writeR);
  StoreLeftRight4(d, st, vL0, vL1, vL2, vL3, pivot, keys, writeL, remaining);
  StoreLeftRight4(d, st, vR0, vR1, vR2, vR3, pivot, keys, writeL, remaining);
}

// We have partitioned [left, right) such that writeL is the boundary.
HWY_DASSERT(writeL == writeR);
HWY_DASSERT(remaining == 0);
// Make space for inserting vlast: move up to N of the first right-side keys
// into the unused space starting at last. If we have fewer, ensure they are
// the last items in that vector by subtracting from the *load* address,

@@ -406,41 +447,6 @@ HWY_INLINE V MedianOf3(Traits st, V v0, V v1, V v2) {
  return v1;
}

// Replaces triplets with their median and recurses until less than 3 keys
// remain. Ignores leftover values (non-whole triplets)!
template <class D, class Traits, typename T>
Vec<D> RecursiveMedianOf3(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
                          T* HWY_RESTRICT buf) {
  const size_t N = Lanes(d);
  constexpr size_t N1 = st.LanesPerKey();

  if (num < 3 * N1) return st.SetKey(d, keys);

  size_t read = 0;
  size_t written = 0;

  // Triplets of vectors
  for (; read + 3 * N <= num; read += 3 * N) {
    const auto v0 = Load(d, keys + read + 0 * N);
    const auto v1 = Load(d, keys + read + 1 * N);
    const auto v2 = Load(d, keys + read + 2 * N);
    Store(MedianOf3(st, v0, v1, v2), d, buf + written);
    written += N;
  }

  // Triplets of keys
  for (; read + 3 * N1 <= num; read += 3 * N1) {
    const auto v0 = st.SetKey(d, keys + read + 0 * N1);
    const auto v1 = st.SetKey(d, keys + read + 1 * N1);
    const auto v2 = st.SetKey(d, keys + read + 2 * N1);
    StoreU(MedianOf3(st, v0, v1, v2), d, buf + written);
    written += N1;
  }

  // Tail recursion; swap buffers
  return RecursiveMedianOf3(d, st, buf, written, keys);
}

#if VQSORT_SECURE_RNG
using Generator = absl::BitGen;
#else

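The triplet-median reduction removed above (and still used for pivot sampling) reads the same in scalar form; the following is a hedged sketch with plain int keys and invented names, assuming num >= 1:

#include <algorithm>
#include <cstddef>

// median(a, b, c) without branches on the comparison result.
inline int MedianOf3Scalar(int a, int b, int c) {
  return std::max(std::min(a, b), std::min(std::max(a, b), c));
}

// Each pass replaces whole triplets by their median and recurses on the
// medians; leftovers (non-whole triplets) are ignored, as in the vector code.
int RecursiveMedianOf3Scalar(int* keys, size_t num, int* buf) {
  if (num < 3) return keys[0];
  size_t written = 0;
  for (size_t read = 0; read + 3 <= num; read += 3) {
    buf[written++] = MedianOf3Scalar(keys[read], keys[read + 1], keys[read + 2]);
  }
  return RecursiveMedianOf3Scalar(buf, written, keys);  // swap buffers
}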
@@ -453,6 +459,11 @@ class Generator {
    k_ = 1;  // stream index: must be odd
  }

  explicit Generator(uint64_t seed) {
    a_ = b_ = w_ = seed;
    k_ = 1;
  }

  uint64_t operator()() {
    const uint64_t b = b_;
    w_ += k_;

@@ -481,19 +492,190 @@ HWY_INLINE size_t RandomChunkIndex(const uint32_t num_chunks, uint32_t bits) {
  return static_cast<size_t>(chunk_index);
}

template <class Traits, typename T>
HWY_INLINE void SortSamples(Traits st, T* HWY_RESTRICT buf) {
  // buf contains 192 bytes, so 16 128-bit vectors are necessary and sufficient.
  constexpr size_t kSampleLanes = 3 * 64 / sizeof(T);
  const CappedTag<T, 16 / sizeof(T)> d128;
  const size_t N128 = Lanes(d128);
  constexpr size_t kCols = HWY_MIN(16 / sizeof(T), Constants::kMaxCols);
  constexpr size_t kBytes = kCols * Constants::kMaxRows * sizeof(T);
  static_assert(192 <= kBytes, "");
  // Fill with padding - last in sort order.
  const auto kPadding = st.LastValue(d128);
  // Initialize an extra vector because SortingNetwork loads full vectors,
  // which may exceed cols*kMaxRows.
  for (size_t i = kSampleLanes; i <= kBytes / sizeof(T); i += N128) {
    StoreU(kPadding, d128, buf + i);
  }

  SortingNetwork(st, buf, kCols);
}

template <class Traits, typename T>
HWY_INLINE size_t PivotRank(Traits st, T* HWY_RESTRICT buf) {
  constexpr size_t kSampleLanes = 3 * 64 / sizeof(T);
  constexpr size_t N1 = st.LanesPerKey();

  constexpr size_t kRankMid = kSampleLanes / 2;
  static_assert(kRankMid % N1 == 0, "Mid is not an aligned key");

  // Find the previous value not equal to the median.
  size_t rank_prev = kRankMid - N1;
  for (; st.Equal1(buf + rank_prev, buf + kRankMid); rank_prev -= N1) {
    // All previous samples are equal to the median.
    if (rank_prev == 0) return 0;
  }

  size_t rank_next = rank_prev + N1;
  for (; st.Equal1(buf + rank_next, buf + kRankMid); rank_next += N1) {
    // The median is also the largest sample. If it is also the largest key,
    // we'd end up with an empty right partition, so choose the previous key.
    if (rank_next == kSampleLanes - N1) return rank_prev;
  }

  // If we choose the median as pivot, the ratio of keys ending in the left
  // partition will likely be rank_next/kSampleLanes (if the sample is
  // representative). This is because equal-to-pivot values also land in the
  // left - it's infeasible to do an in-place vectorized 3-way partition.
  // Check whether prev would lead to a more balanced partition.
  const size_t excess_if_median = rank_next - kRankMid;
  const size_t excess_if_prev = kRankMid - rank_prev;
  return excess_if_median < excess_if_prev ? kRankMid : rank_prev;
}

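A hypothetical walk-through of PivotRank: for uint32_t keys, kSampleLanes = 3 * 64 / 4 = 48 and kRankMid = 24 (with N1 = 1). If ranks 20..30 all hold the median value, the two scans stop at rank_prev = 19 and rank_next = 31, so excess_if_median = 31 - 24 = 7 and excess_if_prev = 24 - 19 = 5; because 7 < 5 is false, the function returns rank_prev, since splitting just before the run of equal keys is closer to a 50/50 partition.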
#if VQSORT_PRINT
// Compute exact min/max.
template <class D, class Traits, typename T>
HWY_NOINLINE Vec<D> ChoosePivot(D d, Traits st, T* HWY_RESTRICT keys,
                                const size_t begin, const size_t end,
                                T* HWY_RESTRICT buf, Generator& rng) {
HWY_NOINLINE void ScanMinMax(D d, Traits st, const T* HWY_RESTRICT keys,
                             size_t num, T* HWY_RESTRICT buf, Vec<D>& first,
                             Vec<D>& last) {
  const size_t N = Lanes(d);

  first = st.LastValue(d);
  last = st.FirstValue(d);

  size_t i = 0;
  for (; i + N <= num; i += N) {
    const Vec<D> v = LoadU(d, keys + i);
    first = st.First(d, v, first);
    last = st.Last(d, v, last);
  }
  if (HWY_LIKELY(i != num)) {
    HWY_DASSERT(num >= N);  // See HandleSpecialCases
    const Vec<D> v = LoadU(d, keys + num - N);
    first = st.First(d, v, first);
    last = st.Last(d, v, last);
  }

  first = st.FirstOfLanes(d, first, buf);
  last = st.LastOfLanes(d, last, buf);
}
#endif  // VQSORT_PRINT

template <class D, class Traits, typename T>
HWY_INLINE bool ScanEqual(D d, Traits st, const T* HWY_RESTRICT keys,
                          size_t num) {
  using V = Vec<decltype(d)>;
  const size_t N = Lanes(d);
  HWY_DASSERT(num >= N);  // See HandleSpecialCases
  const V reference = st.SetKey(d, keys);
  const V zero = Zero(d);
  // Sticky bits registering any difference between `keys` and the first key.
  // We use vector XOR because it may be cheaper than comparisons, especially
  // for 128-bit. 2x unrolled for more ILP.
  V diff0 = zero;
  V diff1 = zero;

  // We want to stop once a difference has been found, but without slowing down
  // the loop by comparing during each iteration. The compromise is to compare
  // after a 'group', which consists of kLoops times two vectors.
  constexpr size_t kLoops = 4;
  const size_t lanes_per_group = kLoops * 2 * N;
  size_t i = 0;
  for (; i + lanes_per_group <= num; i += lanes_per_group) {
    for (size_t loop = 0; loop < kLoops; ++loop) {
      const V v0 = LoadU(d, keys + i + loop * 2 * N);
      const V v1 = LoadU(d, keys + i + loop * 2 * N + N);
      // TODO(janwas): ternlog
      diff0 = Or(diff0, Xor(v0, reference));
      diff1 = Or(diff1, Xor(v1, reference));
    }
    diff0 = Or(diff0, diff1);
    if (!AllTrue(d, Eq(diff0, zero))) {
      return false;
    }
  }
  // Whole vectors, no unrolling
  for (; i + N <= num; i += N) {
    const V v0 = LoadU(d, keys + i);
    // TODO(janwas): ternlog
    diff0 = Or(diff0, Xor(v0, reference));
    if (!AllTrue(d, Eq(diff0, zero))) {
      return false;
    }
  }
  // If there are remainders, re-check the last whole vector.
  if (HWY_LIKELY(i != num)) {
    const V v0 = LoadU(d, keys + num - N);
    // TODO(janwas): ternlog
    diff0 = Or(diff0, Xor(v0, reference));
    if (!AllTrue(d, Eq(diff0, zero))) {
      return false;
    }
  }

  return true;
}

// Returns key prior to reference in sort order.
template <class D, class Traits, typename T>
HWY_INLINE Vec<D> ScanForPrev(D d, Traits st, const T* HWY_RESTRICT keys,
                              size_t num, Vec<D> reference,
                              T* HWY_RESTRICT buf) {
  const size_t N = Lanes(d);
  HWY_DASSERT(num >= N);  // See HandleSpecialCases

  Vec<D> prev = st.FirstValue(d);
  Mask<D> any_found = st.Compare(d, prev, prev);  // false

  size_t i = 0;
  // Whole vectors, no unrolling
  for (; i + N <= num; i += N) {
    const Vec<D> curr = LoadU(d, keys + i);
    const auto is_before = st.Compare(d, curr, reference);
    any_found = Or(any_found, is_before);
    prev = IfThenElse(is_before, st.Last(d, prev, curr), prev);
  }
  // If there are remainders, re-check the last whole vector.
  if (HWY_LIKELY(i != num)) {
    const Vec<D> curr = LoadU(d, keys + num - N);
    const auto is_before = st.Compare(d, curr, reference);
    any_found = Or(any_found, is_before);
    prev = IfThenElse(is_before, st.Last(d, prev, curr), prev);
  }

  const Vec<D> candidate = st.LastOfLanes(d, prev, buf);
  // If we didn't find any key less than reference, we're still stuck with
  // FirstValue; replace that with reference. (We cannot compare directly to
  // FirstValue because that might be the desired value of prev.)
  return IfThenElse(any_found, candidate, reference);
}

enum class PivotResult {
  kNormal,    // use partition
  kAllEqual,  // already done
};

template <class D, class Traits, typename T>
HWY_INLINE void DrawSamples(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
                            T* HWY_RESTRICT buf, Generator& rng) {
  using V = decltype(Zero(d));
  const size_t N = Lanes(d);

  // Power of two
  const size_t lanes_per_chunk = Constants::LanesPerChunk(sizeof(T), N);

  keys += begin;
  size_t num = end - begin;

  // Align start of keys to chunks. We always have at least 2 chunks because the
  // base case would have handled anything up to 16 vectors, i.e. >= 4 chunks.
  HWY_DASSERT(num >= 2 * lanes_per_chunk);

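ScanEqual's sticky-XOR trick is easiest to see in scalar form. The following is an illustrative sketch, not part of the patch (names and the group size are invented): XOR against the first key is OR-accumulated, and the accumulator is tested once per group so the hot loop has no data-dependent branch per key.

#include <cstddef>
#include <cstdint>

bool ScanEqualScalar(const uint64_t* keys, size_t num) {
  const uint64_t reference = keys[0];
  uint64_t diff = 0;  // sticky bits: nonzero once any key differs
  constexpr size_t kGroup = 8;
  size_t i = 0;
  for (; i + kGroup <= num; i += kGroup) {
    for (size_t j = 0; j < kGroup; ++j) diff |= keys[i + j] ^ reference;
    if (diff != 0) return false;  // check once per group, not per key
  }
  for (; i < num; ++i) diff |= keys[i] ^ reference;
  return diff == 0;
}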
@@ -548,49 +730,92 @@ HWY_NOINLINE Vec<D> ChoosePivot(D d, Traits st, T* HWY_RESTRICT keys,
    const V medians2 = MedianOf3(st, v6, v7, v8);
    Store(medians2, d, buf + i + lanes_per_chunk * 2);
  }

  return RecursiveMedianOf3(d, st, buf, 3 * lanes_per_chunk,
                            buf + 3 * lanes_per_chunk);
}

// Compute exact min/max to detect all-equal partitions. Only called after a
// degenerate Partition (none in the right partition).
// Returns pivot, which is never the largest key (thus the right partition will
// never be empty).
template <class D, class Traits, typename T>
HWY_NOINLINE void ScanMinMax(D d, Traits st, const T* HWY_RESTRICT keys,
                             size_t num, T* HWY_RESTRICT buf, Vec<D>& first,
                             Vec<D>& last) {
HWY_NOINLINE Vec<D> ChoosePivot(D d, Traits st, T* HWY_RESTRICT keys,
                                const size_t begin, const size_t end,
                                T* HWY_RESTRICT buf, Generator& rng,
                                PivotResult& result) {
  using V = decltype(Zero(d));
  const size_t N = Lanes(d);

  first = st.LastValue(d);
  last = st.FirstValue(d);
  constexpr size_t kSampleLanes = 3 * 64 / sizeof(T);
  constexpr size_t N1 = st.LanesPerKey();

  size_t i = 0;
  for (; i + N <= num; i += N) {
    const Vec<D> v = LoadU(d, keys + i);
    first = st.First(d, v, first);
    last = st.Last(d, v, last);
  const size_t num = end - begin;
#if VQSORT_PRINT
  fprintf(stderr, "\nChoosePivot num %zu:\n", num);
#endif
  DrawSamples(d, st, keys + begin, num, buf, rng);

  SortSamples(st, buf);
#if VQSORT_PRINT
  for (size_t i = 0; i < kSampleLanes; i += N) {
    Print(d, "", Load(d, buf + i), 0, N);
  }
  if (HWY_LIKELY(i != num)) {
    HWY_DASSERT(num >= N);  // See HandleSpecialCases
    const Vec<D> v = LoadU(d, keys + num - N);
    first = st.First(d, v, first);
    last = st.Last(d, v, last);
#endif

  // All samples are equal.
  if (st.Equal1(buf, buf + kSampleLanes - N1)) {
    const bool all_eq = ScanEqual(d, st, keys + begin, num);
#if VQSORT_PRINT
    fprintf(stderr, "Pivot num=%zu all eq samples, keys also: %d\n", num,
            all_eq);
#endif
    if (all_eq) {
      result = PivotResult::kAllEqual;
      return Zero(d);
    }

    // If the sample is indeed the most common key and it is the largest, then
    // the right partition will be empty. Prevent this by replacing the pivot
    // with the previous key in sort order. By contrast, selecting the first key
    // in sort order would guarantee (minimal) progress. We instead do a full
    // scan to maximize load balance in case there are numerous keys that
    // precede the most common key.
    result = PivotResult::kNormal;
    const V reference = st.SetKey(d, buf);
    const V pivot = ScanForPrev(d, st, keys + begin, num, reference, buf);
#if VQSORT_PRINT
    Print(d, "PREV pivot", pivot, 0, st.LanesPerKey());
#endif
    return pivot;
  }

  first = st.FirstOfLanes(d, first, buf);
  last = st.LastOfLanes(d, last, buf);
  const size_t pivot_rank = PivotRank(st, buf);
  const Vec<D> pivot = st.SetKey(d, buf + pivot_rank);
#if VQSORT_PRINT
  fprintf(stderr, "  Pivot rank %zu = %.0f\n", pivot_rank,
          static_cast<double>(GetLane(pivot)));
#endif
  result = PivotResult::kNormal;
  return pivot;
}

template <class D, class Traits, typename T>
void Recurse(D d, Traits st, T* HWY_RESTRICT keys, const size_t begin,
             const size_t end, const Vec<D> pivot, T* HWY_RESTRICT buf,
             Generator& rng, size_t remaining_levels) {
void Recurse(D d, Traits st, T* HWY_RESTRICT keys, T* HWY_RESTRICT keys_end,
             const size_t begin, const size_t end, const Vec<D> pivot,
             T* HWY_RESTRICT buf, Generator& rng, size_t remaining_levels) {
  HWY_DASSERT(begin + 1 < end);
  const size_t num = end - begin;  // >= 2
#if VQSORT_PRINT
  fprintf(stderr, "- Recurse remaining %zu [%zu %zu) len %zu\n",
          remaining_levels, begin, end, num);
  Vec<D> first, last;
  ScanMinMax(d, st, keys + begin, num, buf, first, last);
  Print(d, "first", first, 0, st.LanesPerKey());
  Print(d, "last", last, 0, st.LanesPerKey());
#endif

  // Too many degenerate partitions. This is extremely unlikely to happen
  // because we select pivots from large (though still O(1)) samples.
  // Too many recursions. This is unlikely to happen because we select pivots
  // from large (though still O(1)) samples.
  if (HWY_UNLIKELY(remaining_levels == 0)) {
#if VQSORT_PRINT
    fprintf(stderr, "HeapSort reached, size=%zu\n", num);
#endif
    HeapSort(st, keys + begin, num);  // Slow but N*logN.
    return;
  }

@@ -604,35 +829,31 @@ void Recurse(D d, Traits st, T* HWY_RESTRICT keys, const size_t begin,
  const ptrdiff_t num_right =
      static_cast<ptrdiff_t>(end) - static_cast<ptrdiff_t>(bound);

  // Check for degenerate partitions (i.e. Partition did not move any keys):
  if (HWY_UNLIKELY(num_right == 0)) {
    // Because the pivot is one of the keys, it must have been equal to the
    // first or last key in sort order. Scan for the actual min/max:
    // passing the current pivot as the new bound is insufficient because one of
    // the partitions might not actually include that key.
    Vec<D> first, last;
    ScanMinMax(d, st, keys + begin, num, buf, first, last);
    if (AllTrue(d, Eq(first, last))) return;

    // Separate recursion to make sure that we don't pick `last` as the
    // pivot - that would again lead to a degenerate partition.
    Recurse(d, st, keys, begin, end, first, buf, rng, remaining_levels - 1);
    return;
  }
  // ChoosePivot ensures pivot != largest key, so this should never happen.
  HWY_ASSERT(num_right != 0);

  if (HWY_UNLIKELY(num_left <= base_case_num)) {
    BaseCase(d, st, keys + begin, static_cast<size_t>(num_left), buf);
    BaseCase(d, st, keys + begin, keys_end, static_cast<size_t>(num_left), buf);
  } else {
    const Vec<D> next_pivot = ChoosePivot(d, st, keys, begin, bound, buf, rng);
    Recurse(d, st, keys, begin, bound, next_pivot, buf, rng,
            remaining_levels - 1);
    PivotResult result;
    const Vec<D> next_pivot =
        ChoosePivot(d, st, keys, begin, bound, buf, rng, result);
    if (result != PivotResult::kAllEqual) {
      Recurse(d, st, keys, keys_end, begin, bound, next_pivot, buf, rng,
              remaining_levels - 1);
    }
  }
  if (HWY_UNLIKELY(num_right <= base_case_num)) {
    BaseCase(d, st, keys + bound, static_cast<size_t>(num_right), buf);
    BaseCase(d, st, keys + bound, keys_end, static_cast<size_t>(num_right),
             buf);
  } else {
    const Vec<D> next_pivot = ChoosePivot(d, st, keys, bound, end, buf, rng);
    Recurse(d, st, keys, bound, end, next_pivot, buf, rng,
            remaining_levels - 1);
    PivotResult result;
    const Vec<D> next_pivot =
        ChoosePivot(d, st, keys, bound, end, buf, rng, result);
    if (result != PivotResult::kAllEqual) {
      Recurse(d, st, keys, keys_end, bound, end, next_pivot, buf, rng,
              remaining_levels - 1);
    }
  }
}

@@ -646,10 +867,11 @@ bool HandleSpecialCases(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
  // 128-bit keys require vectors with at least two u64 lanes, which is always
  // the case unless `d` requests partial vectors (e.g. fraction = 1/2) AND the
  // hardware vector width is less than 128bit / fraction.
  const bool partial_128 = N < 2 && st.Is128();
  const bool partial_128 = !IsFull(d) && N < 2 && st.Is128();
  // Partition assumes its input is at least two vectors. If vectors are huge,
  // base_case_num may actually be smaller. If so, which is only possible on
  // RVV, pass a capped or partial d (LMUL < 1).
  // RVV, pass a capped or partial d (LMUL < 1). Use HWY_MAX_BYTES instead of
  // HWY_LANES to account for the largest possible LMUL.
  constexpr bool kPotentiallyHuge =
      HWY_MAX_BYTES / sizeof(T) > Constants::kMaxRows * Constants::kMaxCols;
  const bool huge_vec = kPotentiallyHuge && (2 * N > base_case_num);

@@ -661,7 +883,7 @@ bool HandleSpecialCases(D d, Traits st, T* HWY_RESTRICT keys, size_t num,

  // Small arrays: use sorting network, no need for other checks.
  if (HWY_UNLIKELY(num <= base_case_num)) {
    BaseCase(d, st, keys, num, buf);
    BaseCase(d, st, keys, keys + num, num, buf);
    return true;
  }

@@ -671,7 +893,7 @@ bool HandleSpecialCases(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
  return false;  // not finished sorting
}

#endif  // HWY_TARGET != HWY_SCALAR
#endif  // VQSORT_ENABLED
}  // namespace detail

// Sorts `keys[0..num-1]` according to the order defined by `st.Compare`.

@@ -683,17 +905,26 @@ bool HandleSpecialCases(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
// sampling only from the first 256 GiB.
//
// `d` is typically SortTag<T> (chooses between full and partial vectors).
// `st` is SharedTraits<{LaneTraits|Traits128}<Order*>>. This abstraction layer
// bridges differences in sort order and single-lane vs 128-bit keys.
// `st` is SharedTraits<Traits*<Order*>>. This abstraction layer bridges
// differences in sort order and single-lane vs 128-bit keys.
template <class D, class Traits, typename T>
void Sort(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
          T* HWY_RESTRICT buf) {
#if HWY_TARGET == HWY_SCALAR
  (void)d;
  (void)buf;
  // PERFORMANCE WARNING: vqsort is not enabled for the non-SIMD target
  return detail::HeapSort(st, keys, num);
#else
#if VQSORT_PRINT
  fprintf(stderr, "=============== Sort num %zu\n", num);
#endif

#if VQSORT_ENABLED || HWY_IDE
#if !HWY_HAVE_SCALABLE
  // On targets with fixed-size vectors, avoid _using_ the allocated memory.
  // We avoid (potentially expensive for small input sizes) allocations on
  // platforms where no targets are scalable. For 512-bit vectors, this fits on
  // the stack (several KiB).
  HWY_ALIGN T storage[SortConstants::BufNum<T>(HWY_LANES(T))] = {};
  static_assert(sizeof(storage) <= 8192, "Unexpectedly large, check size");
  buf = storage;
#endif  // !HWY_HAVE_SCALABLE

  if (detail::HandleSpecialCases(d, st, keys, num, buf)) return;

#if HWY_MAX_BYTES > 64

@@ -705,13 +936,22 @@ void Sort(D d, Traits st, T* HWY_RESTRICT keys, size_t num,

  // Pulled out of the recursion so we can special-case degenerate partitions.
  detail::Generator rng(keys, num);
  const Vec<D> pivot = detail::ChoosePivot(d, st, keys, 0, num, buf, rng);
  detail::PivotResult result;
  const Vec<D> pivot =
      detail::ChoosePivot(d, st, keys, 0, num, buf, rng, result);

  // Introspection: switch to worst-case N*logN heapsort after this many.
  const size_t max_levels = 2 * hwy::CeilLog2(num) + 4;

  detail::Recurse(d, st, keys, 0, num, pivot, buf, rng, max_levels);
#endif  // HWY_TARGET == HWY_SCALAR
  if (result != detail::PivotResult::kAllEqual) {
    // Introspection: switch to worst-case N*logN heapsort after this many.
    const size_t max_levels = 2 * hwy::CeilLog2(num) + 4;
    detail::Recurse(d, st, keys, keys + num, 0, num, pivot, buf, rng,
                    max_levels);
  }
#else
  (void)d;
  (void)buf;
  // PERFORMANCE WARNING: vqsort is not enabled for the non-SIMD target
  return detail::HeapSort(st, keys, num);
#endif  // VQSORT_ENABLED
}

// NOLINTNEXTLINE(google-readability-namespace-comments)

@@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

@@ -16,21 +17,64 @@

#include <string.h>  // memset

#include "hwy/aligned_allocator.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h"  // IWYU pragma: keep

// After foreach_target
#include "hwy/contrib/sort/shared-inl.h"

// Architectures for which we know HWY_HAVE_SCALABLE == 0. This opts into an
// optimization that replaces dynamic allocation with stack storage.
#ifndef VQSORT_STACK
#if HWY_ARCH_X86 || HWY_ARCH_WASM
#define VQSORT_STACK 1
#else
#define VQSORT_STACK 0
#endif
#endif  // VQSORT_STACK

#if !VQSORT_STACK
#include "hwy/aligned_allocator.h"
#endif

// Check if we have sys/random.h. First skip some systems on which the check
// itself (features.h) might be problematic.
#if defined(ANDROID) || defined(__ANDROID__) || HWY_ARCH_RVV
#define VQSORT_GETRANDOM 0
#endif

#if !defined(VQSORT_GETRANDOM) && HWY_OS_LINUX
#include <features.h>

// ---- which libc
#if defined(__UCLIBC__)
#define VQSORT_GETRANDOM 1  // added Mar 2015, before uclibc-ng 1.0

#elif defined(__GLIBC__) && defined(__GLIBC_PREREQ)
#if __GLIBC_PREREQ(2, 25)
#define VQSORT_GETRANDOM 1
#else
#define VQSORT_GETRANDOM 0
#endif

#else
// Assume MUSL, which has getrandom since 2018. There is no macro to test, see
// https://www.openwall.com/lists/musl/2013/03/29/13.
#define VQSORT_GETRANDOM 1

#endif  // ---- which libc
#endif  // linux

#if !defined(VQSORT_GETRANDOM)
#define VQSORT_GETRANDOM 0
#endif

// Seed source for SFC generator: 1=getrandom, 2=CryptGenRandom
// (not all Android support the getrandom wrapper)
#ifndef VQSORT_SECURE_SEED

#if (defined(linux) || defined(__linux__)) && \
    !(defined(ANDROID) || defined(__ANDROID__) || HWY_ARCH_RVV)
#if VQSORT_GETRANDOM
#define VQSORT_SECURE_SEED 1
#elif defined(_WIN32) || defined(_WIN64)
#define VQSORT_SECURE_SEED 2

@@ -47,7 +91,7 @@
#include <sys/random.h>
#elif VQSORT_SECURE_SEED == 2
#include <windows.h>
#pragma comment(lib, "Advapi32.lib")
#pragma comment(lib, "advapi32.lib")
// Must come after windows.h.
#include <wincrypt.h>
#endif  // VQSORT_SECURE_SEED

@@ -72,40 +116,32 @@ namespace {
HWY_EXPORT(VectorSize);
HWY_EXPORT(HaveFloat64);

HWY_INLINE size_t PivotBufNum(size_t sizeof_t, size_t N) {
  // 3 chunks of medians, 1 chunk of median medians plus two padding vectors.
  const size_t lpc = SortConstants::LanesPerChunk(sizeof_t, N);
  return (3 + 1) * lpc + 2 * N;
}

}  // namespace

Sorter::Sorter() {
#if VQSORT_STACK
  ptr_ = nullptr;  // Sort will use stack storage instead
#else
  // Determine the largest buffer size required for any type by trying them all.
  // (The capping of N in BaseCaseNum means that smaller N but larger sizeof_t
  // may require a larger buffer.)
  const size_t vector_size = HWY_DYNAMIC_DISPATCH(VectorSize)();
  size_t max_bytes = 0;
  for (size_t sizeof_t :
       {sizeof(uint16_t), sizeof(uint32_t), sizeof(uint64_t)}) {
    const size_t N = vector_size / sizeof_t;
    // One extra for padding plus another for full-vector loads.
    const size_t base_case = SortConstants::BaseCaseNum(N) + 2 * N;
    const size_t partition_num = SortConstants::PartitionBufNum(N);
    const size_t buf_lanes =
        HWY_MAX(base_case, HWY_MAX(partition_num, PivotBufNum(sizeof_t, N)));
    max_bytes = HWY_MAX(max_bytes, buf_lanes * sizeof_t);
  }

  const size_t max_bytes =
      HWY_MAX(HWY_MAX(SortConstants::BufBytes<uint16_t>(vector_size),
                      SortConstants::BufBytes<uint32_t>(vector_size)),
              SortConstants::BufBytes<uint64_t>(vector_size));
  ptr_ = hwy::AllocateAlignedBytes(max_bytes, nullptr, nullptr);

  // Prevent msan errors by initializing.
  memset(ptr_, 0, max_bytes);
#endif
}

void Sorter::Delete() {
#if !VQSORT_STACK
  FreeAlignedBytes(ptr_, nullptr, nullptr);
  ptr_ = nullptr;
#endif
}

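A condensed view of the allocation policy above, as a hedged sketch (AllocSortBuf is an invented name; the real logic is split across Sorter::Sorter and Sorter::Delete):

#include <cstddef>
#include <cstring>  // memset
#include "hwy/aligned_allocator.h"

// Returns nullptr on VQSORT_STACK targets (Sort() then substitutes stack
// storage), otherwise one zero-initialized aligned allocation sized for the
// worst case across key types.
void* AllocSortBuf(size_t max_bytes) {
#if VQSORT_STACK
  (void)max_bytes;
  return nullptr;
#else
  void* p = hwy::AllocateAlignedBytes(max_bytes, nullptr, nullptr);
  memset(p, 0, max_bytes);  // avoid msan reports for padding lanes
  return p;
#endif
}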
#if !VQSORT_SECURE_RNG

@@ -1,4 +1,5 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

@@ -13,6 +14,12 @@
// limitations under the License.

// Interface to vectorized quicksort with dynamic dispatch.
// Blog post: https://tinyurl.com/vqsort-blog
// Paper with measurements: https://arxiv.org/abs/2205.05982
//
// To ensure the overhead of using wide vectors (e.g. AVX2 or AVX-512) is
// worthwhile, we recommend using this code for sorting arrays whose size is at
// least 512 KiB.

#ifndef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_
#define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_

@@ -21,15 +28,6 @@

namespace hwy {

// Aligned 128-bit type. Cannot use __int128 because clang doesn't yet align it:
// https://reviews.llvm.org/D86310
#pragma pack(push, 1)
struct alignas(16) uint128_t {
  uint64_t lo;  // little-endian layout
  uint64_t hi;
};
#pragma pack(pop)

// Tag arguments that determine the sort order.
struct SortAscending {
  constexpr bool IsAscending() const { return true; }

@@ -84,6 +82,9 @@ class HWY_CONTRIB_DLLEXPORT Sorter {
  void operator()(uint128_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
  void operator()(uint128_t* HWY_RESTRICT keys, size_t n, SortDescending) const;

  void operator()(K64V64* HWY_RESTRICT keys, size_t n, SortAscending) const;
  void operator()(K64V64* HWY_RESTRICT keys, size_t n, SortDescending) const;

  // For internal use only
  static void Fill24Bytes(const void* seed_heap, size_t seed_num, void* bytes);
  static bool HaveFloat64();
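A possible call site for this interface, sketched under the assumption that the uint64_t overloads declared elsewhere in this class are available (the wrapper function name is illustrative):

#include <cstddef>
#include <cstdint>
#include "hwy/contrib/sort/vqsort.h"

// Sorts keys[0, num) ascending; dynamic dispatch picks the best enabled target.
void SortU64Ascending(uint64_t* keys, size_t num) {
  hwy::Sorter sorter;  // allocates (or skips) the shared buffer once
  sorter(keys, num, hwy::SortAscending());
}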

@@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

@@ -12,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_128a.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h"  // IWYU pragma: keep

// After foreach_target
#include "hwy/contrib/sort/traits128-inl.h"

@@ -29,9 +29,16 @@ namespace HWY_NAMESPACE {

void Sort128Asc(uint64_t* HWY_RESTRICT keys, size_t num,
                uint64_t* HWY_RESTRICT buf) {
#if VQSORT_ENABLED
  SortTag<uint64_t> d;
  detail::SharedTraits<detail::Traits128<detail::OrderAscending128>> st;
  Sort(d, st, keys, num, buf);
#else
  (void) keys;
  (void) num;
  (void) buf;
  HWY_ASSERT(0);
#endif
}

// NOLINTNEXTLINE(google-readability-namespace-comments)

@@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

@@ -12,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_128d.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h"  // IWYU pragma: keep

// After foreach_target
#include "hwy/contrib/sort/traits128-inl.h"

@@ -29,9 +29,16 @@ namespace HWY_NAMESPACE {

void Sort128Desc(uint64_t* HWY_RESTRICT keys, size_t num,
                 uint64_t* HWY_RESTRICT buf) {
#if VQSORT_ENABLED
  SortTag<uint64_t> d;
  detail::SharedTraits<detail::Traits128<detail::OrderDescending128>> st;
  Sort(d, st, keys, num, buf);
#else
  (void) keys;
  (void) num;
  (void) buf;
  HWY_ASSERT(0);
#endif
}

// NOLINTNEXTLINE(google-readability-namespace-comments)

@@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

@@ -12,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f32a.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h"  // IWYU pragma: keep

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"

@@ -29,7 +29,7 @@ namespace HWY_NAMESPACE {

void SortF32Asc(float* HWY_RESTRICT keys, size_t num, float* HWY_RESTRICT buf) {
  SortTag<float> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<float>>> st;
  Sort(d, st, keys, num, buf);
}

@@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

@@ -12,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f32d.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h"  // IWYU pragma: keep

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"

@@ -30,7 +30,7 @@ namespace HWY_NAMESPACE {
void SortF32Desc(float* HWY_RESTRICT keys, size_t num,
                 float* HWY_RESTRICT buf) {
  SortTag<float> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st;
  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<float>>> st;
  Sort(d, st, keys, num, buf);
}

@@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

@@ -12,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f64a.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h"  // IWYU pragma: keep

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"

@@ -31,7 +31,7 @@ void SortF64Asc(double* HWY_RESTRICT keys, size_t num,
                double* HWY_RESTRICT buf) {
#if HWY_HAVE_FLOAT64
  SortTag<double> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<double>>> st;
  Sort(d, st, keys, num, buf);
#else
  (void)keys;

@@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

@@ -12,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f64d.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h"  // IWYU pragma: keep

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"

@@ -31,7 +31,7 @@ void SortF64Desc(double* HWY_RESTRICT keys, size_t num,
                 double* HWY_RESTRICT buf) {
#if HWY_HAVE_FLOAT64
  SortTag<double> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st;
  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<double>>> st;
  Sort(d, st, keys, num, buf);
#else
  (void)keys;

@@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

@@ -12,20 +13,16 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i16a.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h"  // IWYU pragma: keep

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

// Workaround for build timeout
#if !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

@@ -33,7 +30,7 @@ namespace HWY_NAMESPACE {
void SortI16Asc(int16_t* HWY_RESTRICT keys, size_t num,
                int16_t* HWY_RESTRICT buf) {
  SortTag<int16_t> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<int16_t>>> st;
  Sort(d, st, keys, num, buf);
}

@@ -55,5 +52,3 @@ void Sorter::operator()(int16_t* HWY_RESTRICT keys, size_t n,

}  // namespace hwy
#endif  // HWY_ONCE

#endif  // !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD

@@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

@@ -12,20 +13,16 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i16d.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h"  // IWYU pragma: keep

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

// Workaround for build timeout
#if !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

@@ -33,7 +30,7 @@ namespace HWY_NAMESPACE {
void SortI16Desc(int16_t* HWY_RESTRICT keys, size_t num,
                 int16_t* HWY_RESTRICT buf) {
  SortTag<int16_t> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st;
  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<int16_t>>> st;
  Sort(d, st, keys, num, buf);
}

@@ -55,5 +52,3 @@ void Sorter::operator()(int16_t* HWY_RESTRICT keys, size_t n,

}  // namespace hwy
#endif  // HWY_ONCE

#endif  // !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD

@@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

@@ -12,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i32a.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h"  // IWYU pragma: keep

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"

@@ -30,7 +30,7 @@ namespace HWY_NAMESPACE {
void SortI32Asc(int32_t* HWY_RESTRICT keys, size_t num,
                int32_t* HWY_RESTRICT buf) {
  SortTag<int32_t> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<int32_t>>> st;
  Sort(d, st, keys, num, buf);
}

@@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

@@ -12,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i32d.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h"  // IWYU pragma: keep

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"

@@ -30,7 +30,7 @@ namespace HWY_NAMESPACE {
void SortI32Desc(int32_t* HWY_RESTRICT keys, size_t num,
                 int32_t* HWY_RESTRICT buf) {
  SortTag<int32_t> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st;
  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<int32_t>>> st;
  Sort(d, st, keys, num, buf);
}

@@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

@@ -12,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i64a.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h"  // IWYU pragma: keep

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"

@@ -30,7 +30,7 @@ namespace HWY_NAMESPACE {
void SortI64Asc(int64_t* HWY_RESTRICT keys, size_t num,
                int64_t* HWY_RESTRICT buf) {
  SortTag<int64_t> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<int64_t>>> st;
  Sort(d, st, keys, num, buf);
}

@@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

@@ -12,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i64d.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h"  // IWYU pragma: keep

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"

@@ -30,7 +30,7 @@ namespace HWY_NAMESPACE {
void SortI64Desc(int64_t* HWY_RESTRICT keys, size_t num,
                 int64_t* HWY_RESTRICT buf) {
  SortTag<int64_t> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st;
  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<int64_t>>> st;
  Sort(d, st, keys, num, buf);
}

@@ -0,0 +1,65 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
// clang-format off
// (avoid line break, which would prevent Copybara rules from matching)
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv128a.cc"  //NOLINT
// clang-format on
#include "hwy/foreach_target.h"  // IWYU pragma: keep

// After foreach_target
#include "hwy/contrib/sort/traits128-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void SortKV128Asc(uint64_t* HWY_RESTRICT keys, size_t num,
                  uint64_t* HWY_RESTRICT buf) {
#if VQSORT_ENABLED
  SortTag<uint64_t> d;
  detail::SharedTraits<detail::Traits128<detail::OrderAscendingKV128>> st;
  Sort(d, st, keys, num, buf);
#else
  (void) keys;
  (void) num;
  (void) buf;
  HWY_ASSERT(0);
#endif
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortKV128Asc);
}  // namespace

void Sorter::operator()(K64V64* HWY_RESTRICT keys, size_t n,
                        SortAscending) const {
  HWY_DYNAMIC_DISPATCH(SortKV128Asc)
  (reinterpret_cast<uint64_t*>(keys), n * 2, Get<uint64_t>());
}

}  // namespace hwy
#endif  // HWY_ONCE

@@ -0,0 +1,65 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
// clang-format off
// (avoid line break, which would prevent Copybara rules from matching)
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv128d.cc"  //NOLINT
// clang-format on
#include "hwy/foreach_target.h"  // IWYU pragma: keep

// After foreach_target
#include "hwy/contrib/sort/traits128-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void SortKV128Desc(uint64_t* HWY_RESTRICT keys, size_t num,
                   uint64_t* HWY_RESTRICT buf) {
#if VQSORT_ENABLED
  SortTag<uint64_t> d;
  detail::SharedTraits<detail::Traits128<detail::OrderDescendingKV128>> st;
  Sort(d, st, keys, num, buf);
#else
  (void) keys;
  (void) num;
  (void) buf;
  HWY_ASSERT(0);
#endif
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortKV128Desc);
}  // namespace

void Sorter::operator()(K64V64* HWY_RESTRICT keys, size_t n,
                        SortDescending) const {
  HWY_DYNAMIC_DISPATCH(SortKV128Desc)
  (reinterpret_cast<uint64_t*>(keys), n * 2, Get<uint64_t>());
}

}  // namespace hwy
#endif  // HWY_ONCE

@ -1,4 +1,5 @@
|
|||
// Copyright 2021 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
|
@ -12,20 +13,16 @@
|
|||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "hwy/contrib/sort/disabled_targets.h"
|
||||
#include "hwy/contrib/sort/vqsort.h"
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u16a.cc"
|
||||
#include "hwy/foreach_target.h"
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
|
||||
// After foreach_target
|
||||
#include "hwy/contrib/sort/traits-inl.h"
|
||||
#include "hwy/contrib/sort/vqsort-inl.h"
|
||||
|
||||
// Workaround for build timeout
|
||||
#if !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
@ -33,7 +30,7 @@ namespace HWY_NAMESPACE {
|
|||
void SortU16Asc(uint16_t* HWY_RESTRICT keys, size_t num,
|
||||
uint16_t* HWY_RESTRICT buf) {
|
||||
SortTag<uint16_t> d;
|
||||
detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
|
||||
detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<uint16_t>>> st;
|
||||
Sort(d, st, keys, num, buf);
|
||||
}
|
||||
|
||||
|
@ -55,5 +52,3 @@ void Sorter::operator()(uint16_t* HWY_RESTRICT keys, size_t n,
|
|||
|
||||
} // namespace hwy
|
||||
#endif // HWY_ONCE
|
||||
|
||||
#endif // !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
|
||||
|
|
|
@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -12,20 +13,16 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u16d.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h"  // IWYU pragma: keep

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

// Workaround for build timeout
#if !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
@ -33,7 +30,8 @@ namespace HWY_NAMESPACE {
void SortU16Desc(uint16_t* HWY_RESTRICT keys, size_t num,
                 uint16_t* HWY_RESTRICT buf) {
  SortTag<uint16_t> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st;
  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<uint16_t>>>
      st;
  Sort(d, st, keys, num, buf);
}

@ -55,5 +53,3 @@ void Sorter::operator()(uint16_t* HWY_RESTRICT keys, size_t n,

}  // namespace hwy
#endif  // HWY_ONCE

#endif  // !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD

@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -12,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u32a.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h"  // IWYU pragma: keep

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
@ -30,7 +30,7 @@ namespace HWY_NAMESPACE {
void SortU32Asc(uint32_t* HWY_RESTRICT keys, size_t num,
                uint32_t* HWY_RESTRICT buf) {
  SortTag<uint32_t> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<uint32_t>>> st;
  Sort(d, st, keys, num, buf);
}

@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -12,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u32d.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h"  // IWYU pragma: keep

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
@ -30,7 +30,8 @@ namespace HWY_NAMESPACE {
void SortU32Desc(uint32_t* HWY_RESTRICT keys, size_t num,
                 uint32_t* HWY_RESTRICT buf) {
  SortTag<uint32_t> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st;
  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<uint32_t>>>
      st;
  Sort(d, st, keys, num, buf);
}

@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -12,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u64a.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h"  // IWYU pragma: keep

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
@ -30,7 +30,7 @@ namespace HWY_NAMESPACE {
void SortU64Asc(uint64_t* HWY_RESTRICT keys, size_t num,
                uint64_t* HWY_RESTRICT buf) {
  SortTag<uint64_t> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<uint64_t>>> st;
  Sort(d, st, keys, num, buf);
}

@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -12,12 +13,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u64d.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h"  // IWYU pragma: keep

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
@ -30,7 +30,8 @@ namespace HWY_NAMESPACE {
void SortU64Desc(uint64_t* HWY_RESTRICT keys, size_t num,
                 uint64_t* HWY_RESTRICT buf) {
  SortTag<uint64_t> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st;
  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<uint64_t>>>
      st;
  Sort(d, st, keys, num, buf);
}

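Editor's note: the six vqsort_u{16,32,64}{a,d}.cc hunks above share one shape: a per-type, per-order translation unit that foreach_target.h recompiles for every enabled target, with the LaneTraits-to-TraitsLane rename also making the order tag generic over the key type. A hedged caller-side sketch (the thread_local reuse is an illustrative choice, not something the patch requires):

#include <stddef.h>
#include <stdint.h>

#include "hwy/contrib/sort/vqsort.h"

void SortU64InPlace(uint64_t* data, size_t n) {
  // Reusing one Sorter amortizes its internal buffer allocation.
  static thread_local hwy::Sorter sorter;
  sorter(data, n, hwy::SortAscending());
}
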
@ -1,4 +1,5 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -29,34 +30,45 @@
//------------------------------------------------------------------------------
// Compiler

// clang-cl defines _MSC_VER but doesn't behave like MSVC in other aspects like
// used in HWY_DIAGNOSTICS(). We include a check that we are not clang for that
// purpose.
// Actual MSVC, not clang-cl, which defines _MSC_VER but doesn't behave like
// MSVC in other aspects (e.g. HWY_DIAGNOSTICS).
#if defined(_MSC_VER) && !defined(__clang__)
#define HWY_COMPILER_MSVC _MSC_VER
#else
#define HWY_COMPILER_MSVC 0
#endif

#if defined(_MSC_VER) && defined(__clang__)
#define HWY_COMPILER_CLANGCL _MSC_VER
#else
#define HWY_COMPILER_CLANGCL 0
#endif

#ifdef __INTEL_COMPILER
#define HWY_COMPILER_ICC __INTEL_COMPILER
#else
#define HWY_COMPILER_ICC 0
#endif

// HWY_COMPILER_GCC is a generic macro for all compilers implementing the GNU
// compiler extensions (e.g. Clang, Intel...)
#ifdef __GNUC__
#define HWY_COMPILER_GCC (__GNUC__ * 100 + __GNUC_MINOR__)
#else
#define HWY_COMPILER_GCC 0
#endif

// Clang can masquerade as MSVC/GCC, in which case both are set.
// Clang or clang-cl, not GCC.
#ifdef __clang__
#ifdef __APPLE__
// Apple LLVM version is unrelated to the actual Clang version, which we need
// for enabling workarounds. Use the presence of warning flags to deduce it.
// In case of Apple LLVM (whose version number is unrelated to that of LLVM) or
// an invalid version number, deduce it from the presence of warnings.
// Adapted from https://github.com/simd-everywhere/simde/ simde-detect-clang.h.
#if __has_warning("-Wformat-insufficient-args")
#if defined(__APPLE__) || __clang_major__ >= 999
#if __has_warning("-Wbitwise-instead-of-logical")
#define HWY_COMPILER_CLANG 1400
#elif __has_warning("-Wreserved-identifier")
#define HWY_COMPILER_CLANG 1300
#elif __has_warning("-Wformat-insufficient-args")
#define HWY_COMPILER_CLANG 1200
#elif __has_warning("-Wimplicit-const-int-float-conversion")
#define HWY_COMPILER_CLANG 1100
@ -72,19 +84,32 @@
#else  // Anything older than 7.0 is not recommended for Highway.
#define HWY_COMPILER_CLANG 600
#endif  // __has_warning chain
#else  // Non-Apple: normal version
#else  // use normal version
#define HWY_COMPILER_CLANG (__clang_major__ * 100 + __clang_minor__)
#endif
#else  // Not clang
#define HWY_COMPILER_CLANG 0
#endif

#if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG
#define HWY_COMPILER_GCC_ACTUAL HWY_COMPILER_GCC
#else
#define HWY_COMPILER_GCC_ACTUAL 0
#endif

// More than one may be nonzero, but we want at least one.
#if !HWY_COMPILER_MSVC && !HWY_COMPILER_ICC && !HWY_COMPILER_GCC && \
    !HWY_COMPILER_CLANG
#if 0 == (HWY_COMPILER_MSVC + HWY_COMPILER_CLANGCL + HWY_COMPILER_ICC + \
          HWY_COMPILER_GCC + HWY_COMPILER_CLANG)
#error "Unsupported compiler"
#endif

// We should only detect one of these (only clang/clangcl overlap)
#if 1 < \
    (!!HWY_COMPILER_MSVC + !!HWY_COMPILER_ICC + !!HWY_COMPILER_GCC_ACTUAL + \
     !!(HWY_COMPILER_CLANGCL | HWY_COMPILER_CLANG))
#error "Detected multiple compilers"
#endif

#ifdef __has_builtin
#define HWY_HAS_BUILTIN(name) __has_builtin(name)
#else
@ -140,7 +165,7 @@
#define HWY_ARCH_ARM_A64 0
#endif

#if defined(__arm__) || defined(_M_ARM)
#if (defined(__ARM_ARCH) && __ARM_ARCH == 7) || (defined(_M_ARM) && _M_ARM == 7)
#define HWY_ARCH_ARM_V7 1
#else
#define HWY_ARCH_ARM_V7 0
@ -150,12 +175,20 @@
#error "Cannot have both A64 and V7"
#endif

// Any *supported* version of Arm, i.e. 7 or later
#if HWY_ARCH_ARM_A64 || HWY_ARCH_ARM_V7
#define HWY_ARCH_ARM 1
#else
#define HWY_ARCH_ARM 0
#endif

// Older than v7 (e.g. armel aka Arm v5), in which case we do not support SIMD.
#if (defined(__arm__) || defined(_M_ARM)) && !HWY_ARCH_ARM
#define HWY_ARCH_ARM_OLD 1
#else
#define HWY_ARCH_ARM_OLD 0
#endif

#if defined(__EMSCRIPTEN__) || defined(__wasm__) || defined(__WASM__)
#define HWY_ARCH_WASM 1
#else
@ -170,9 +203,21 @@

// It is an error to detect multiple architectures at the same time, but OK to
// detect none of the above.
#if (HWY_ARCH_X86 + HWY_ARCH_PPC + HWY_ARCH_ARM + HWY_ARCH_WASM + \
     HWY_ARCH_RVV) > 1
#if (HWY_ARCH_X86 + HWY_ARCH_PPC + HWY_ARCH_ARM + HWY_ARCH_ARM_OLD + \
     HWY_ARCH_WASM + HWY_ARCH_RVV) > 1
#error "Must not detect more than one architecture"
#endif

#if defined(_WIN32) || defined(_WIN64)
#define HWY_OS_WIN 1
#else
#define HWY_OS_WIN 0
#endif

#if defined(linux) || defined(__linux__)
#define HWY_OS_LINUX 1
#else
#define HWY_OS_LINUX 0
#endif

#endif  // HIGHWAY_HWY_DETECT_COMPILER_ARCH_H_

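Editor's note: a small consumer sketch (illustrative only, not part of the patch) may help with the macro scheme above. Every HWY_COMPILER_* and HWY_ARCH_* macro is unconditionally defined as zero or a nonzero version/flag, so they compose both in #if conditions and in ordinary arithmetic:

#include <stdio.h>

#include "hwy/detect_compiler_arch.h"

void PrintToolchain() {
#if HWY_COMPILER_CLANG
  printf("Clang %d.%d\n", HWY_COMPILER_CLANG / 100, HWY_COMPILER_CLANG % 100);
#elif HWY_COMPILER_GCC_ACTUAL
  printf("GCC %d.%d\n", HWY_COMPILER_GCC_ACTUAL / 100,
         HWY_COMPILER_GCC_ACTUAL % 100);
#elif HWY_COMPILER_MSVC
  printf("MSVC %d\n", HWY_COMPILER_MSVC);  // e.g. 1929
#endif
  // At most one HWY_ARCH_* is 1; none is set on unrecognized platforms.
  printf("Arm=%d ArmOld=%d\n", HWY_ARCH_ARM, HWY_ARCH_ARM_OLD);
}
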
@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -37,6 +38,10 @@
// AVX2 target for VMs which support AVX2 but not the other instruction sets)
// #define HWY_DISABLE_BMI2_FMA

// Uncomment to enable SSSE3/SSE4 on MSVC even if AVX is not enabled
// #define HWY_WANT_SSSE3
// #define HWY_WANT_SSE4

//------------------------------------------------------------------------------
// Targets

@ -46,59 +51,71 @@
// All values are unconditionally defined so we can test HWY_TARGETS without
// first checking the HWY_ARCH_*.
//
// The C99 preprocessor evaluates #if expressions using intmax_t types, so we
// can use 32-bit literals.

// 1,2: reserved

// Currently satisfiable by Ice Lake (VNNI, VPCLMULQDQ, VBMI2, VAES). Later to
// be added: BF16 (Cooper Lake). VP2INTERSECT is only in Tiger Lake? We do not
// yet have uses for VBMI, VPOPCNTDQ, BITALG, GFNI.
#define HWY_AVX3_DL 4  // see HWY_WANT_AVX3_DL below
#define HWY_AVX3 8
#define HWY_AVX2 16
// 32: reserved for AVX
#define HWY_SSE4 64
#define HWY_SSSE3 128
// 0x100, 0x200: reserved for SSE3, SSE2
// The C99 preprocessor evaluates #if expressions using intmax_t types. This
// holds at least 64 bits in practice (verified 2022-07-18 via Godbolt on
// 32-bit clang/GCC/MSVC compilers for x86/Arm7/AArch32/RISC-V/WASM). We now
// avoid overflow when computing HWY_TARGETS (subtracting one instead of
// left-shifting 2^62), but still do not use bit 63 because it is the sign bit.

// --------------------------- x86: 15 targets (+ one fallback)
// Bits 0..6 reserved (7 targets)
// Currently satisfiable by Ice Lake (VNNI, VPCLMULQDQ, VPOPCNTDQ, VBMI, VBMI2,
// VAES, BITALG). Later to be added: BF16 (Cooper Lake). VP2INTERSECT is only in
// Tiger Lake? We do not yet have uses for GFNI.
#define HWY_AVX3_DL (1LL << 7)  // see HWY_WANT_AVX3_DL below
#define HWY_AVX3 (1LL << 8)
#define HWY_AVX2 (1LL << 9)
// Bit 10: reserved for AVX
#define HWY_SSE4 (1LL << 11)
#define HWY_SSSE3 (1LL << 12)
// Bits 13..14 reserved for SSE3 or SSE2 (2 targets)
// The highest bit in the HWY_TARGETS mask that a x86 target can have. Used for
// dynamic dispatch. All x86 target bits must be lower or equal to
// (1 << HWY_HIGHEST_TARGET_BIT_X86) and they can only use
// HWY_MAX_DYNAMIC_TARGETS in total.
#define HWY_HIGHEST_TARGET_BIT_X86 9
#define HWY_HIGHEST_TARGET_BIT_X86 14

#define HWY_SVE2 0x400
#define HWY_SVE 0x800
// 0x1000 reserved for Helium
#define HWY_NEON 0x2000
// --------------------------- Arm: 15 targets (+ one fallback)
// Bits 15..23 reserved (9 targets)
#define HWY_SVE2_128 (1LL << 24)  // specialized target (e.g. Arm N2)
#define HWY_SVE_256 (1LL << 25)  // specialized target (e.g. Arm V1)
#define HWY_SVE2 (1LL << 26)
#define HWY_SVE (1LL << 27)
#define HWY_NEON (1LL << 28)  // On A64, includes/requires AES
// Bit 29 reserved (Helium?)
#define HWY_HIGHEST_TARGET_BIT_ARM 29

#define HWY_HIGHEST_TARGET_BIT_ARM 13
// --------------------------- RISC-V: 9 targets (+ one fallback)
// Bits 30..36 reserved (7 targets)
#define HWY_RVV (1LL << 37)
// Bit 38 reserved
#define HWY_HIGHEST_TARGET_BIT_RVV 38

// 0x4000, 0x8000 reserved
#define HWY_PPC8 0x10000  // v2.07 or 3
// 0x20000, 0x40000 reserved for prior VSX/AltiVec
// --------------------------- Future expansion: 4 targets
// Bits 39..42 reserved

#define HWY_HIGHEST_TARGET_BIT_PPC 18

#define HWY_WASM2 0x80000  // Experimental
#define HWY_WASM 0x100000
// --------------------------- IBM Power: 9 targets (+ one fallback)
// Bits 43..48 reserved (6 targets)
#define HWY_PPC8 (1LL << 49)  // v2.07 or 3
// Bits 50..51 reserved for prior VSX/AltiVec (2 targets)
#define HWY_HIGHEST_TARGET_BIT_PPC 51

#define HWY_HIGHEST_TARGET_BIT_WASM 20
// --------------------------- WebAssembly: 9 targets (+ one fallback)
// Bits 52..57 reserved (6 targets)
#define HWY_WASM_EMU256 (1LL << 58)  // Experimental
#define HWY_WASM (1LL << 59)
// Bits 60 reserved
#define HWY_HIGHEST_TARGET_BIT_WASM 60

// 0x200000, 0x400000, 0x800000 reserved
// --------------------------- Emulation: 2 targets

#define HWY_RVV 0x1000000
#define HWY_EMU128 (1LL << 61)
// We do not add/left-shift, so this will not overflow to a negative number.
#define HWY_SCALAR (1LL << 62)
#define HWY_HIGHEST_TARGET_BIT_SCALAR 62

#define HWY_HIGHEST_TARGET_BIT_RVV 24

// 0x2000000, 0x4000000, 0x8000000, 0x10000000 reserved

#define HWY_SCALAR 0x20000000

#define HWY_HIGHEST_TARGET_BIT_SCALAR 29

// Cannot use higher values, otherwise HWY_TARGETS computation might overflow.
// Do not use bit 63 - would be confusing to have negative numbers.

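Editor's note: the switch from 32-bit literals to (1LL << n) is the heart of this hunk. A few static_asserts (a sketch, not part of the patch) make the resulting layout concrete: numerically smaller values denote better targets within an architecture, and nothing uses sign bit 63.

#include "hwy/detect_targets.h"

static_assert(HWY_AVX3_DL < HWY_AVX3 && HWY_AVX3 < HWY_AVX2, "x86 ordering");
static_assert(HWY_SVE2_128 < HWY_SVE2 && HWY_SVE2 < HWY_NEON, "Arm ordering");
static_assert(HWY_SCALAR == (1LL << 62), "highest bit in use is 62, not 63");
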
//------------------------------------------------------------------------------
// Set default blocklists
@ -138,9 +155,9 @@
#define HWY_BROKEN_TARGETS (HWY_NEON)

// SVE[2] require recent clang or gcc versions.
#elif (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1100) ||\
    (!HWY_COMPILER_CLANG && HWY_COMPILER_GCC && HWY_COMPILER_GCC < 1000)
#define HWY_BROKEN_TARGETS (HWY_SVE | HWY_SVE2)
#elif (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1100) || \
    (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1000)
#define HWY_BROKEN_TARGETS (HWY_SVE | HWY_SVE2 | HWY_SVE_256 | HWY_SVE2_128)

#else
#define HWY_BROKEN_TARGETS 0
@ -152,24 +169,41 @@
#define HWY_ENABLED(targets) \
  ((targets) & ~((HWY_DISABLED_TARGETS) | (HWY_BROKEN_TARGETS)))

// Opt-out for EMU128 (affected by a GCC <12 bug on ARMv7: see
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106187). This is separate from
// HWY_BROKEN_TARGETS because it affects the fallback target, which must always
// be enabled. If 1, we instead choose HWY_SCALAR even without
// HWY_COMPILE_ONLY_SCALAR being set.
#if !defined(HWY_BROKEN_EMU128)  // allow overriding
#if HWY_ARCH_ARM_V7 && HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1140
#define HWY_BROKEN_EMU128 1
#else
#define HWY_BROKEN_EMU128 0
#endif
#endif  // HWY_BROKEN_EMU128

//------------------------------------------------------------------------------
// Detect baseline targets using predefined macros

// Baseline means the targets for which the compiler is allowed to generate
// instructions, implying the target CPU would have to support them. Do not use
// this directly because it does not take the blocklist into account. Allow the
// user to override this without any guarantee of success.
#ifndef HWY_BASELINE_TARGETS
// instructions, implying the target CPU would have to support them. This does
// not take the blocklist into account.

#if defined(HWY_COMPILE_ONLY_SCALAR) || HWY_BROKEN_EMU128
#define HWY_BASELINE_SCALAR HWY_SCALAR
#else
#define HWY_BASELINE_SCALAR HWY_EMU128
#endif

// Also check HWY_ARCH to ensure that simulating unknown platforms ends up with
// HWY_TARGET == HWY_SCALAR.
// HWY_TARGET == HWY_BASELINE_SCALAR.

#if HWY_ARCH_WASM && defined(__wasm_simd128__)
#if defined(HWY_WANT_WASM2)
#define HWY_BASELINE_WASM HWY_WASM2
#define HWY_BASELINE_WASM HWY_WASM_EMU256
#else
#define HWY_BASELINE_WASM HWY_WASM
#endif  // HWY_WANT_WASM2
#endif  // HWY_WANT_WASM2
#else
#define HWY_BASELINE_WASM 0
#endif
@ -181,7 +215,6 @@
#define HWY_BASELINE_PPC8 0
#endif

// SVE2 compiles, but is not yet tested.
#if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE2)
#define HWY_BASELINE_SVE2 HWY_SVE2
#else
@ -189,6 +222,11 @@
#endif

#if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE)
// Baseline targets can be used unconditionally, which does not apply to
// HWY_SVE_256 because it requires a vector size of 256 bits. Including SVE_256
// in the baseline would also disable all 'worse' targets (including SVE and
// SVE2) in non-test builds. Therefore we instead add HWY_SVE_256 to
// HWY_ATTAINABLE_TARGETS below.
#define HWY_BASELINE_SVE HWY_SVE
#else
#define HWY_BASELINE_SVE 0
@ -201,11 +239,11 @@
#define HWY_BASELINE_NEON 0
#endif

// Special handling for MSVC because it has fewer predefined macros
#if HWY_COMPILER_MSVC && !HWY_COMPILER_CLANG
// Special handling for MSVC because it has fewer predefined macros:
#if HWY_COMPILER_MSVC

// We can only be sure SSSE3/SSE4 are enabled if AVX is
// (https://stackoverflow.com/questions/18563978/)
// 1) We can only be sure SSSE3/SSE4 are enabled if AVX is:
// https://stackoverflow.com/questions/18563978/.
#if defined(__AVX__)
#define HWY_CHECK_SSSE3 1
#define HWY_CHECK_SSE4 1
@ -214,8 +252,8 @@
#define HWY_CHECK_SSE4 0
#endif

// Cannot check for PCLMUL/AES and BMI2/FMA/F16C individually; we assume
// PCLMUL/AES are available if SSE4 is, and BMI2/FMA/F16C if AVX2 is.
// 2) Cannot check for PCLMUL/AES and BMI2/FMA/F16C individually; we assume
// PCLMUL/AES are available if SSE4 is, and BMI2/FMA/F16C if AVX2 is.
#define HWY_CHECK_PCLMUL_AES 1
#define HWY_CHECK_BMI2_FMA 1
#define HWY_CHECK_F16C 1
@ -255,13 +293,13 @@

#endif  // non-MSVC

#if HWY_ARCH_X86 && HWY_CHECK_SSSE3
#if HWY_ARCH_X86 && (HWY_WANT_SSSE3 || HWY_CHECK_SSSE3)
#define HWY_BASELINE_SSSE3 HWY_SSSE3
#else
#define HWY_BASELINE_SSSE3 0
#endif

#if HWY_ARCH_X86 && HWY_CHECK_SSE4 && HWY_CHECK_PCLMUL_AES
#if HWY_ARCH_X86 && (HWY_WANT_SSE4 || (HWY_CHECK_SSE4 && HWY_CHECK_PCLMUL_AES))
#define HWY_BASELINE_SSE4 HWY_SSE4
#else
#define HWY_BASELINE_SSE4 0
@ -284,7 +322,9 @@

// TODO(janwas): not yet known whether these will be set by MSVC
#if HWY_BASELINE_AVX3 != 0 && defined(__AVXVNNI__) && defined(__VAES__) && \
    defined(__VPCLMULQDQ__)
    defined(__VPCLMULQDQ__) && defined(__AVX512VBMI__) && \
    defined(__AVX512VBMI2__) && defined(__AVX512VPOPCNTDQ__) && \
    defined(__AVX512BITALG__)
#define HWY_BASELINE_AVX3_DL HWY_AVX3_DL
#else
#define HWY_BASELINE_AVX3_DL 0
@ -296,16 +336,13 @@
#define HWY_BASELINE_RVV 0
#endif

#define HWY_BASELINE_TARGETS \
  (HWY_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 | HWY_BASELINE_SVE2 | \
   HWY_BASELINE_SVE | HWY_BASELINE_NEON | HWY_BASELINE_SSSE3 | \
   HWY_BASELINE_SSE4 | HWY_BASELINE_AVX2 | HWY_BASELINE_AVX3 | \
   HWY_BASELINE_AVX3_DL | HWY_BASELINE_RVV)

#else
// User already defined HWY_BASELINE_TARGETS, but we still need to define
// HWY_BASELINE_AVX3 (matching user's definition) for HWY_CHECK_AVX3_DL.
#define HWY_BASELINE_AVX3_DL (HWY_BASELINE_TARGETS & HWY_AVX3_DL)
// Allow the user to override this without any guarantee of success.
#ifndef HWY_BASELINE_TARGETS
#define HWY_BASELINE_TARGETS \
  (HWY_BASELINE_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 | \
   HWY_BASELINE_SVE2 | HWY_BASELINE_SVE | HWY_BASELINE_NEON | \
   HWY_BASELINE_SSSE3 | HWY_BASELINE_SSE4 | HWY_BASELINE_AVX2 | \
   HWY_BASELINE_AVX3 | HWY_BASELINE_AVX3_DL | HWY_BASELINE_RVV)
#endif  // HWY_BASELINE_TARGETS

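Editor's note: because HWY_BASELINE_TARGETS is now guarded by #ifndef, builds can pre-seed it instead of patching the header. A hedged sketch of the override points introduced above (the -D spelling is the usual GCC/Clang form, MSVC uses /D; the SSE4 choice is purely an assumption for illustration):

// Hypothetical build-level overrides (macro names are from the hunks above):
//   -DHWY_BASELINE_TARGETS=HWY_SSE4   pin the baseline to a single target
//   -DHWY_BROKEN_EMU128=0             keep EMU128 despite the GCC/ARMv7 bug
// In-source equivalent; must precede the first Highway include:
#ifndef HWY_BASELINE_TARGETS
#define HWY_BASELINE_TARGETS HWY_SSE4  // assumption: x86 build with SSE4 flags
#endif
#include "hwy/detect_targets.h"
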
//------------------------------------------------------------------------------
@ -329,33 +366,68 @@
//------------------------------------------------------------------------------
// Choose targets for dynamic dispatch according to one of four policies

#if (defined(HWY_COMPILE_ONLY_SCALAR) + defined(HWY_COMPILE_ONLY_STATIC) + \
     defined(HWY_COMPILE_ALL_ATTAINABLE)) > 1
#error "Invalid config: can only define a single policy for targets"
#if 1 < (defined(HWY_COMPILE_ONLY_SCALAR) + defined(HWY_COMPILE_ONLY_EMU128) + \
         defined(HWY_COMPILE_ONLY_STATIC))
#error "Can only define one of HWY_COMPILE_ONLY_{SCALAR|EMU128|STATIC} - bug?"
#endif
// Defining one of HWY_COMPILE_ONLY_* will trump HWY_COMPILE_ALL_ATTAINABLE.

// x86 compilers generally allow runtime dispatch. On Arm, currently only GCC
// does, and we require Linux to detect CPU capabilities.
#if HWY_ARCH_X86 || (HWY_ARCH_ARM && HWY_COMPILER_GCC_ACTUAL && HWY_OS_LINUX)
#define HWY_HAVE_RUNTIME_DISPATCH 1
#else
#define HWY_HAVE_RUNTIME_DISPATCH 0
#endif

// Further to checking for disabled/broken targets, we only use AVX3_DL after
// explicit opt-in (via this macro OR baseline compiler flags) to avoid
// generating a codepath which is only helpful if the app uses AVX3_DL features.
#if defined(HWY_WANT_AVX3_DL)
#define HWY_CHECK_AVX3_DL HWY_AVX3_DL
// AVX3_DL is not widely available yet. To reduce code size and compile time,
// only include it in the set of attainable targets (for dynamic dispatch) if
// the user opts in, OR it is in the baseline (we check whether enabled below).
#if defined(HWY_WANT_AVX3_DL) || (HWY_BASELINE & HWY_AVX3_DL)
#define HWY_ATTAINABLE_AVX3_DL HWY_AVX3_DL
#else
#define HWY_CHECK_AVX3_DL HWY_BASELINE_AVX3_DL
#define HWY_ATTAINABLE_AVX3_DL 0
#endif

#if HWY_ARCH_ARM_A64 && \
    ((HWY_ENABLED_BASELINE & HWY_SVE) || HWY_HAVE_RUNTIME_DISPATCH)
#define HWY_ATTAINABLE_SVE HWY_ENABLED(HWY_SVE | HWY_SVE_256)
#else
#define HWY_ATTAINABLE_SVE 0
#endif

#if HWY_ARCH_ARM_A64 && \
    ((HWY_ENABLED_BASELINE & HWY_SVE2) || HWY_HAVE_RUNTIME_DISPATCH)
#define HWY_ATTAINABLE_SVE2 HWY_ENABLED(HWY_SVE2 | HWY_SVE2_128)
#else
#define HWY_ATTAINABLE_SVE2 0
#endif

// Attainable means enabled and the compiler allows intrinsics (even when not
// allowed to autovectorize). Used in 3 and 4.
#if HWY_ARCH_X86
#define HWY_ATTAINABLE_TARGETS \
  HWY_ENABLED(HWY_SCALAR | HWY_SSSE3 | HWY_SSE4 | HWY_AVX2 | HWY_AVX3 | \
              HWY_CHECK_AVX3_DL)
#define HWY_ATTAINABLE_TARGETS \
  HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_SSSE3 | HWY_SSE4 | HWY_AVX2 | \
              HWY_AVX3 | HWY_ATTAINABLE_AVX3_DL)
#elif HWY_ARCH_ARM && HWY_HAVE_RUNTIME_DISPATCH
#define HWY_ATTAINABLE_TARGETS \
  HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_NEON | HWY_ATTAINABLE_SVE | \
              HWY_ATTAINABLE_SVE2)
#else
#define HWY_ATTAINABLE_TARGETS HWY_ENABLED_BASELINE
#define HWY_ATTAINABLE_TARGETS \
  (HWY_ENABLED_BASELINE | HWY_ATTAINABLE_SVE | HWY_ATTAINABLE_SVE2)
#endif

// 1) For older compilers: disable all SIMD (could also set HWY_DISABLED_TARGETS
// to ~HWY_SCALAR, but this is more explicit).
#if defined(HWY_COMPILE_ONLY_SCALAR)
// 1) For older compilers: avoid SIMD intrinsics, but still support all ops.
#if defined(HWY_COMPILE_ONLY_EMU128) && !HWY_BROKEN_EMU128
#undef HWY_STATIC_TARGET
#define HWY_STATIC_TARGET HWY_EMU128  // override baseline
#define HWY_TARGETS HWY_EMU128

// 1b) HWY_SCALAR is less capable than HWY_EMU128 (which supports all ops), but
// we currently still support it for backwards compatibility.
#elif defined(HWY_COMPILE_ONLY_SCALAR) || \
    (defined(HWY_COMPILE_ONLY_EMU128) && HWY_BROKEN_EMU128)
#undef HWY_STATIC_TARGET
#define HWY_STATIC_TARGET HWY_SCALAR  // override baseline
#define HWY_TARGETS HWY_SCALAR
@ -369,9 +441,12 @@
#define HWY_TARGETS HWY_ATTAINABLE_TARGETS

// 4) Default: attainable WITHOUT non-best baseline. This reduces code size by
// excluding superseded targets, in particular scalar.
// excluding superseded targets, in particular scalar. Note: HWY_STATIC_TARGET
// may be 2^62 (HWY_SCALAR), so we must not left-shift/add it. Subtracting one
// sets all lower bits (better targets), then we also include the static target.
#else
#define HWY_TARGETS (HWY_ATTAINABLE_TARGETS & (2 * HWY_STATIC_TARGET - 1))
#define HWY_TARGETS \
  (HWY_ATTAINABLE_TARGETS & ((HWY_STATIC_TARGET - 1LL) | HWY_STATIC_TARGET))

#endif  // target policy

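Editor's note: a worked example of the overflow fix in the default policy above (a sketch, not part of the patch). With HWY_STATIC_TARGET == HWY_AVX2 == 1LL << 9, subtracting one sets bits 0..8 - exactly the better x86 targets - and OR-ing the static target back in yields bits 0..9; the old 2 * HWY_STATIC_TARGET - 1 would overflow int64_t when the static target is HWY_SCALAR (1LL << 62).

#include <cstdint>

// (1 << 9) - 1 sets bits 0..8; OR-ing (1 << 9) adds the static target itself.
static_assert((((int64_t{1} << 9) - 1) | (int64_t{1} << 9)) == 0x3FF,
              "mask covers the static target plus all better (lower) bits");
// By contrast, 2 * (1LL << 62) - 1 would overflow; subtracting first cannot.
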
@ -1,4 +1,5 @@
// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -12,10 +13,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/examples/benchmark.cc"
#include "hwy/foreach_target.h"

#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
@ -24,8 +21,12 @@
#include <memory>
#include <numeric>  // iota

#include "hwy/aligned_allocator.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/examples/benchmark.cc"
#include "hwy/foreach_target.h"  // IWYU pragma: keep

// Must come after foreach_target.h to avoid redefinition errors.
#include "hwy/aligned_allocator.h"
#include "hwy/highway.h"
#include "hwy/nanobenchmark.h"

@ -81,7 +82,8 @@ void RunBenchmark(const char* caption) {
  benchmark.Verify(num_items);

  for (size_t i = 0; i < num_results; ++i) {
    const double cycles_per_item = results[i].ticks / double(results[i].input);
    const double cycles_per_item =
        results[i].ticks / static_cast<double>(results[i].input);
    const double mad = results[i].variability * cycles_per_item;
    printf("%6" PRIu64 ": %6.3f (+/- %5.3f)\n",
           static_cast<uint64_t>(results[i].input), cycles_per_item, mad);
@ -233,7 +235,7 @@ namespace hwy {
HWY_EXPORT(RunBenchmarks);

void Run() {
  for (uint32_t target : SupportedAndGeneratedTargets()) {
  for (int64_t target : SupportedAndGeneratedTargets()) {
    SetSupportedTargetsForTest(target);
    HWY_DYNAMIC_DISPATCH(RunBenchmarks)();
  }

@ -1,4 +1,5 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

@ -1,4 +1,5 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -22,7 +23,7 @@
// __FILE__ is not reliable) so that foreach_target.h can re-include it.
#define HWY_TARGET_INCLUDE "hwy/examples/skeleton.cc"
// Generates code for each enabled target by re-including this source file.
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h"  // IWYU pragma: keep

// Must come after foreach_target.h to avoid redefinition errors.
#include "hwy/highway.h"
@ -35,28 +36,21 @@ namespace skeleton {
namespace HWY_NAMESPACE {

// Highway ops reside here; ADL does not find templates nor builtins.
using namespace hwy::HWY_NAMESPACE;

// For reasons unknown, optimized msan builds encounter long build times here;
// work around it until a cause is found.
#if HWY_COMPILER_CLANG && defined(MEMORY_SANITIZER) && defined(__OPTIMIZE__)
#define ATTR_MSAN __attribute__((optnone))
#else
#define ATTR_MSAN
#endif
namespace hn = hwy::HWY_NAMESPACE;

// Computes log2 by converting to a vector of floats. Compiled once per target.
template <class DF>
ATTR_MSAN void OneFloorLog2(const DF df, const uint8_t* HWY_RESTRICT values,
                            uint8_t* HWY_RESTRICT log2) {
HWY_ATTR_NO_MSAN void OneFloorLog2(const DF df,
                                   const uint8_t* HWY_RESTRICT values,
                                   uint8_t* HWY_RESTRICT log2) {
  // Type tags for converting to other element types (Rebind = same count).
  const RebindToSigned<DF> d32;
  const Rebind<uint8_t, DF> d8;
  const hn::RebindToSigned<DF> d32;
  const hn::Rebind<uint8_t, DF> d8;

  const auto u8 = Load(d8, values);
  const auto bits = BitCast(d32, ConvertTo(df, PromoteTo(d32, u8)));
  const auto exponent = Sub(ShiftRight<23>(bits), Set(d32, 127));
  Store(DemoteTo(d8, exponent), d8, log2);
  const auto u8 = hn::Load(d8, values);
  const auto bits = hn::BitCast(d32, hn::ConvertTo(df, hn::PromoteTo(d32, u8)));
  const auto exponent = hn::Sub(hn::ShiftRight<23>(bits), hn::Set(d32, 127));
  hn::Store(hn::DemoteTo(d8, exponent), d8, log2);
}

void CodepathDemo() {
@ -74,14 +68,14 @@ void FloorLog2(const uint8_t* HWY_RESTRICT values, size_t count,
               uint8_t* HWY_RESTRICT log2) {
  CodepathDemo();

  const ScalableTag<float> df;
  const size_t N = Lanes(df);
  const hn::ScalableTag<float> df;
  const size_t N = hn::Lanes(df);
  size_t i = 0;
  for (; i + N <= count; i += N) {
    OneFloorLog2(df, values + i, log2 + i);
  }
  for (; i < count; ++i) {
    CappedTag<float, 1> d1;
    hn::CappedTag<float, 1> d1;
    OneFloorLog2(d1, values + i, log2 + i);
  }
}
@ -105,8 +99,9 @@ HWY_EXPORT(FloorLog2);
// This function is optional and only needed in the case of exposing it in the
// header file. Otherwise using HWY_DYNAMIC_DISPATCH(FloorLog2) in this module
// is equivalent to inlining this function.
void CallFloorLog2(const uint8_t* HWY_RESTRICT in, const size_t count,
                   uint8_t* HWY_RESTRICT out) {
HWY_DLLEXPORT void CallFloorLog2(const uint8_t* HWY_RESTRICT in,
                                 const size_t count,
                                 uint8_t* HWY_RESTRICT out) {
  // This must reside outside of HWY_NAMESPACE because it references (calls the
  // appropriate one from) the per-target implementations there.
  return HWY_DYNAMIC_DISPATCH(FloorLog2)(in, count, out);

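Editor's note: a minimal caller of the dispatching wrapper above (a sketch, not part of the patch; buffer size and contents are arbitrary):

#include <stdint.h>
#include <stdio.h>

#include "hwy/examples/skeleton.h"

int main() {
  uint8_t in[32], out[32];
  for (int i = 0; i < 32; ++i) in[i] = static_cast<uint8_t>(i + 1);
  // The first call chooses the best target; later calls dispatch directly.
  skeleton::CallFloorLog2(in, 32, out);
  printf("floor(log2(8)) = %d\n", out[7]);  // in[7] == 8, so prints 3
  return 0;
}
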
@ -1,4 +1,5 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -27,8 +28,8 @@
namespace skeleton {

// Computes base-2 logarithm by converting to float. Supports dynamic dispatch.
void CallFloorLog2(const uint8_t* HWY_RESTRICT in, const size_t count,
                   uint8_t* HWY_RESTRICT out);
HWY_DLLEXPORT void CallFloorLog2(const uint8_t* HWY_RESTRICT in,
                                 const size_t count, uint8_t* HWY_RESTRICT out);

}  // namespace skeleton

@ -1,4 +1,5 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -20,7 +21,7 @@

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "examples/skeleton_test.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h"  // IWYU pragma: keep

// Must come after foreach_target.h to avoid redefinition errors.
#include "hwy/highway.h"
@ -34,13 +35,13 @@ HWY_BEFORE_NAMESPACE();
namespace skeleton {
namespace HWY_NAMESPACE {

using namespace hwy::HWY_NAMESPACE;
namespace hn = hwy::HWY_NAMESPACE;

// Calls function defined in skeleton.cc.
struct TestFloorLog2 {
  template <class T, class DF>
  HWY_NOINLINE void operator()(T /*unused*/, DF df) {
    const size_t count = 5 * Lanes(df);
    const size_t count = 5 * hn::Lanes(df);
    auto in = hwy::AllocateAligned<uint8_t>(count);
    auto expected = hwy::AllocateAligned<uint8_t>(count);

@ -70,7 +71,7 @@ struct TestSumMulAdd {
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    hwy::RandomState rng;
    const size_t count = 4096;
    EXPECT_TRUE(count % Lanes(d) == 0);
    EXPECT_EQ(0, count % hn::Lanes(d));
    auto mul = hwy::AllocateAligned<T>(count);
    auto x = hwy::AllocateAligned<T>(count);
    auto add = hwy::AllocateAligned<T>(count);
@ -106,10 +107,4 @@ HWY_EXPORT_AND_TEST_P(SkeletonTest, TestAllFloorLog2);
HWY_EXPORT_AND_TEST_P(SkeletonTest, TestAllSumMulAdd);
}  // namespace skeleton

// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char **argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}

#endif

@ -1,4 +1,5 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -52,6 +53,17 @@
#error ">1 target enabled => define HWY_TARGET_INCLUDE before foreach_target.h"
#endif

#if (HWY_TARGETS & HWY_EMU128) && (HWY_STATIC_TARGET != HWY_EMU128)
#undef HWY_TARGET
#define HWY_TARGET HWY_EMU128
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif

#if (HWY_TARGETS & HWY_SCALAR) && (HWY_STATIC_TARGET != HWY_SCALAR)
#undef HWY_TARGET
#define HWY_TARGET HWY_SCALAR
@ -74,6 +86,17 @@
#endif
#endif

#if (HWY_TARGETS & HWY_RVV) && (HWY_STATIC_TARGET != HWY_RVV)
#undef HWY_TARGET
#define HWY_TARGET HWY_RVV
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif

#if (HWY_TARGETS & HWY_SVE) && (HWY_STATIC_TARGET != HWY_SVE)
#undef HWY_TARGET
#define HWY_TARGET HWY_SVE
@ -96,6 +119,28 @@
#endif
#endif

#if (HWY_TARGETS & HWY_SVE_256) && (HWY_STATIC_TARGET != HWY_SVE_256)
#undef HWY_TARGET
#define HWY_TARGET HWY_SVE_256
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif

#if (HWY_TARGETS & HWY_SVE2_128) && (HWY_STATIC_TARGET != HWY_SVE2_128)
#undef HWY_TARGET
#define HWY_TARGET HWY_SVE2_128
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif

#if (HWY_TARGETS & HWY_SSSE3) && (HWY_STATIC_TARGET != HWY_SSSE3)
#undef HWY_TARGET
#define HWY_TARGET HWY_SSSE3
@ -151,9 +196,9 @@
#endif
#endif

#if (HWY_TARGETS & HWY_WASM2) && (HWY_STATIC_TARGET != HWY_WASM2)
#if (HWY_TARGETS & HWY_WASM_EMU256) && (HWY_STATIC_TARGET != HWY_WASM_EMU256)
#undef HWY_TARGET
#define HWY_TARGET HWY_WASM2
#define HWY_TARGET HWY_WASM_EMU256
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE

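Editor's note: every block added above repeats the same re-include-and-toggle pattern. For reference, this is the translation-unit structure those blocks expect (it mirrors skeleton.cc elsewhere in this commit; my_lib/my_ops.cc is a hypothetical path):

// my_ops.cc - compiled once per enabled target via foreach_target.h.
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "my_lib/my_ops.cc"  // hypothetical path
#include "hwy/foreach_target.h"  // IWYU pragma: keep

// Must come after foreach_target.h to avoid redefinition errors.
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace my_lib {
namespace HWY_NAMESPACE {
// Per-target code goes here; HWY_TARGET is redefined on each re-inclusion,
// which the toggle blocks above make possible.
}  // namespace HWY_NAMESPACE
}  // namespace my_lib
HWY_AFTER_NAMESPACE();
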
@ -1,4 +1,5 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -26,8 +27,8 @@
namespace hwy {

// API version (https://semver.org/); keep in sync with CMakeLists.txt.
#define HWY_MAJOR 0
#define HWY_MINOR 16
#define HWY_MAJOR 1
#define HWY_MINOR 0
#define HWY_PATCH 0

//------------------------------------------------------------------------------
@ -39,7 +40,7 @@ namespace hwy {
// registers in the group, and is ignored on targets that do not support groups.
#define HWY_FULL1(T) hwy::HWY_NAMESPACE::ScalableTag<T>
#define HWY_FULL2(T, LMUL) \
  hwy::HWY_NAMESPACE::ScalableTag<T, CeilLog2(HWY_MAX(0, LMUL))>
  hwy::HWY_NAMESPACE::ScalableTag<T, hwy::CeilLog2(HWY_MAX(0, LMUL))>
#define HWY_3TH_ARG(arg1, arg2, arg3, ...) arg3
// Workaround for MSVC grouping __VA_ARGS__ into a single argument
#define HWY_FULL_RECOMPOSER(args_with_paren) HWY_3TH_ARG args_with_paren
@ -49,8 +50,7 @@ namespace hwy {
#define HWY_FULL(...) HWY_CHOOSE_FULL(__VA_ARGS__())(__VA_ARGS__)

// Vector of up to MAX_N lanes. It's better to use full vectors where possible.
#define HWY_CAPPED(T, MAX_N) \
  hwy::HWY_NAMESPACE::CappedTag<T, HWY_MIN(MAX_N, HWY_LANES(T))>
#define HWY_CAPPED(T, MAX_N) hwy::HWY_NAMESPACE::CappedTag<T, MAX_N>

//------------------------------------------------------------------------------
// Export user functions for static/dynamic dispatch
@ -68,10 +68,12 @@ namespace hwy {
// defined), and can be used to deduce the return type of Choose*.
#if HWY_STATIC_TARGET == HWY_SCALAR
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SCALAR::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_EMU128
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_EMU128::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_RVV
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_RVV::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_WASM2
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM2::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_WASM_EMU256
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM_EMU256::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_WASM
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_NEON
@ -80,6 +82,10 @@ namespace hwy {
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SVE2
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SVE_256
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE_256::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SVE2_128
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2_128::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_PPC8
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC8::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SSSE3
@ -94,51 +100,22 @@ namespace hwy {
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3_DL::FUNC_NAME
#endif

// Dynamic dispatch declarations.

template <typename RetType, typename... Args>
struct FunctionCache {
 public:
  typedef RetType(FunctionType)(Args...);

  // A template function that when instantiated has the same signature as the
  // function being called. This function initializes the global cache of the
  // current supported targets mask used for dynamic dispatch and calls the
  // appropriate function. Since this mask used for dynamic dispatch is a
  // global cache, all the highway exported functions, even those exposed by
  // different modules, will be initialized after this function runs for any one
  // of those exported functions.
  template <FunctionType* const table[]>
  static RetType ChooseAndCall(Args... args) {
    // If we are running here it means we need to update the chosen target.
    ChosenTarget& chosen_target = GetChosenTarget();
    chosen_target.Update();
    return (table[chosen_target.GetIndex()])(args...);
  }
};

// Factory function only used to infer the template parameters RetType and Args
// from a function passed to the factory.
template <typename RetType, typename... Args>
FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
  return FunctionCache<RetType, Args...>();
}

// HWY_CHOOSE_*(FUNC_NAME) expands to the function pointer for that target or
// nullptr if that target was not compiled.
#if HWY_TARGETS & HWY_SCALAR
#define HWY_CHOOSE_SCALAR(FUNC_NAME) &N_SCALAR::FUNC_NAME
#if HWY_TARGETS & HWY_EMU128
#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &N_EMU128::FUNC_NAME
#elif HWY_TARGETS & HWY_SCALAR
#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &N_SCALAR::FUNC_NAME
#else
// When scalar is not present and we try to use scalar because other targets
// were disabled at runtime we fall back to the baseline with
// HWY_STATIC_DISPATCH()
#define HWY_CHOOSE_SCALAR(FUNC_NAME) &HWY_STATIC_DISPATCH(FUNC_NAME)
// When HWY_SCALAR/HWY_EMU128 are not present and other targets were disabled at
// runtime, fall back to the baseline with HWY_STATIC_DISPATCH().
#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &HWY_STATIC_DISPATCH(FUNC_NAME)
#endif

#if HWY_TARGETS & HWY_WASM2
#define HWY_CHOOSE_WASM2(FUNC_NAME) &N_WASM2::FUNC_NAME
#if HWY_TARGETS & HWY_WASM_EMU256
#define HWY_CHOOSE_WASM_EMU256(FUNC_NAME) &N_WASM_EMU256::FUNC_NAME
#else
#define HWY_CHOOSE_WASM2(FUNC_NAME) nullptr
#define HWY_CHOOSE_WASM_EMU256(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_WASM
@ -171,6 +148,18 @@ FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
#define HWY_CHOOSE_SVE2(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_SVE_256
#define HWY_CHOOSE_SVE_256(FUNC_NAME) &N_SVE_256::FUNC_NAME
#else
#define HWY_CHOOSE_SVE_256(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_SVE2_128
#define HWY_CHOOSE_SVE2_128(FUNC_NAME) &N_SVE2_128::FUNC_NAME
#else
#define HWY_CHOOSE_SVE2_128(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_PPC8
#define HWY_CHOOSE_PCC8(FUNC_NAME) &N_PPC8::FUNC_NAME
#else
@ -207,6 +196,53 @@ FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
#define HWY_CHOOSE_AVX3_DL(FUNC_NAME) nullptr
#endif

// MSVC 2017 workaround: the non-type template parameter to ChooseAndCall
// apparently cannot be an array. Use a function pointer instead, which has the
// disadvantage that we call the static (not best) target on the first call to
// any HWY_DYNAMIC_DISPATCH.
#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1915
#define HWY_DISPATCH_WORKAROUND 1
#else
#define HWY_DISPATCH_WORKAROUND 0
#endif

// Provides a static member function which is what is called during the first
// HWY_DYNAMIC_DISPATCH, where GetIndex is still zero, and instantiations of
// this function are the first entry in the tables created by HWY_EXPORT.
template <typename RetType, typename... Args>
struct FunctionCache {
 public:
  typedef RetType(FunctionType)(Args...);

#if HWY_DISPATCH_WORKAROUND
  template <FunctionType* const func>
  static RetType ChooseAndCall(Args... args) {
    ChosenTarget& chosen_target = GetChosenTarget();
    chosen_target.Update(SupportedTargets());
    return (*func)(args...);
  }
#else
  // A template function that when instantiated has the same signature as the
  // function being called. This function initializes the bit array of targets
  // supported by the current CPU and then calls the appropriate entry within
  // the HWY_EXPORT table. Subsequent calls via HWY_DYNAMIC_DISPATCH to any
  // exported functions, even those defined by different translation units,
  // will dispatch directly to the best available target.
  template <FunctionType* const table[]>
  static RetType ChooseAndCall(Args... args) {
    ChosenTarget& chosen_target = GetChosenTarget();
    chosen_target.Update(SupportedTargets());
    return (table[chosen_target.GetIndex()])(args...);
  }
#endif  // HWY_DISPATCH_WORKAROUND
};

// Used to deduce the template parameters RetType and Args from a function.
template <typename RetType, typename... Args>
FunctionCache<RetType, Args...> DeduceFunctionCache(RetType (*)(Args...)) {
  return FunctionCache<RetType, Args...>();
}

#define HWY_DISPATCH_TABLE(FUNC_NAME) \
  HWY_CONCAT(FUNC_NAME, HighwayDispatchTable)

@ -215,7 +251,7 @@ FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
// static array must be defined at the same namespace level as the function
// it is exporting.
// After being exported, it can be called from other parts of the same source
// file using HWY_DYNAMIC_DISTPATCH(), in particular from a function wrapper
// file using HWY_DYNAMIC_DISPATCH(), in particular from a function wrapper
// like in the following example:
//
// #include "hwy/highway.h"
@ -245,26 +281,44 @@ FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
// This case still uses a table, although of a single element, to provide the
// same compile error conditions as with the dynamic dispatch case when multiple
// targets are being compiled.
#define HWY_EXPORT(FUNC_NAME) \
  HWY_MAYBE_UNUSED static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) \
      const HWY_DISPATCH_TABLE(FUNC_NAME)[1] = { \
          &HWY_STATIC_DISPATCH(FUNC_NAME)}
#define HWY_EXPORT(FUNC_NAME) \
  HWY_MAYBE_UNUSED static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const \
      HWY_DISPATCH_TABLE(FUNC_NAME)[1] = {&HWY_STATIC_DISPATCH(FUNC_NAME)}
#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) HWY_STATIC_DISPATCH(FUNC_NAME)

#else

// Dynamic dispatch case with one entry per dynamic target plus the scalar
// mode and the initialization wrapper.
#define HWY_EXPORT(FUNC_NAME) \
  static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) \
      const HWY_DISPATCH_TABLE(FUNC_NAME)[HWY_MAX_DYNAMIC_TARGETS + 2] = { \
          /* The first entry in the table initializes the global cache and \
           * calls the appropriate function. */ \
          &decltype(hwy::FunctionCacheFactory(&HWY_STATIC_DISPATCH( \
              FUNC_NAME)))::ChooseAndCall<HWY_DISPATCH_TABLE(FUNC_NAME)>, \
          HWY_CHOOSE_TARGET_LIST(FUNC_NAME), \
          HWY_CHOOSE_SCALAR(FUNC_NAME), \
// Simplified version for MSVC 2017: function pointer instead of table.
#if HWY_DISPATCH_WORKAROUND

#define HWY_EXPORT(FUNC_NAME) \
  static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const HWY_DISPATCH_TABLE( \
      FUNC_NAME)[HWY_MAX_DYNAMIC_TARGETS + 2] = { \
      /* The first entry in the table initializes the global cache and \
       * calls the function from HWY_STATIC_TARGET. */ \
      &decltype(hwy::DeduceFunctionCache(&HWY_STATIC_DISPATCH( \
          FUNC_NAME)))::ChooseAndCall<&HWY_STATIC_DISPATCH(FUNC_NAME)>, \
      HWY_CHOOSE_TARGET_LIST(FUNC_NAME), \
      HWY_CHOOSE_FALLBACK(FUNC_NAME), \
  }

#else

// Dynamic dispatch case with one entry per dynamic target plus the fallback
// target and the initialization wrapper.
#define HWY_EXPORT(FUNC_NAME) \
  static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const HWY_DISPATCH_TABLE( \
      FUNC_NAME)[HWY_MAX_DYNAMIC_TARGETS + 2] = { \
      /* The first entry in the table initializes the global cache and \
       * calls the appropriate function. */ \
      &decltype(hwy::DeduceFunctionCache(&HWY_STATIC_DISPATCH( \
          FUNC_NAME)))::ChooseAndCall<HWY_DISPATCH_TABLE(FUNC_NAME)>, \
      HWY_CHOOSE_TARGET_LIST(FUNC_NAME), \
      HWY_CHOOSE_FALLBACK(FUNC_NAME), \
  }

#endif  // HWY_DISPATCH_WORKAROUND

#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) \
  (*(HWY_DISPATCH_TABLE(FUNC_NAME)[hwy::GetChosenTarget().GetIndex()]))

@ -302,14 +356,17 @@ FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
#error "PPC is not yet supported"
#elif HWY_TARGET == HWY_NEON
#include "hwy/ops/arm_neon-inl.h"
#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2
#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2 || \
    HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
#include "hwy/ops/arm_sve-inl.h"
#elif HWY_TARGET == HWY_WASM2
#elif HWY_TARGET == HWY_WASM_EMU256
#include "hwy/ops/wasm_256-inl.h"
#elif HWY_TARGET == HWY_WASM
#include "hwy/ops/wasm_128-inl.h"
#elif HWY_TARGET == HWY_RVV
#include "hwy/ops/rvv-inl.h"
#elif HWY_TARGET == HWY_EMU128
#include "hwy/ops/emu128-inl.h"
#elif HWY_TARGET == HWY_SCALAR
#include "hwy/ops/scalar-inl.h"
#else

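Editor's note: the table layout defined above - entry 0 an initialization thunk, then one slot per dynamic target, then the fallback - can be illustrated standalone. A deliberately simplified sketch with hypothetical names (two entries, no real CPU detection; not Highway code):

#include <cstdio>

using Fn = void (*)(int);

void ImplBest(int x) { std::printf("best impl: %d\n", x); }

extern Fn g_table[2];  // forward declaration so the thunk can index it
int g_chosen = 0;      // like ChosenTarget::GetIndex(): 0 until first call

// Stand-in for FunctionCache::ChooseAndCall: runs once, then is bypassed.
void ChooseAndCall(int x) {
  g_chosen = 1;  // pretend we queried CPU features and picked entry 1
  g_table[g_chosen](x);
}

Fn g_table[2] = {&ChooseAndCall, &ImplBest};

int main() {
  g_table[g_chosen](0);  // routes through the thunk the first time
  g_table[g_chosen](1);  // now calls ImplBest directly
  return 0;
}
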
@ -9,19 +9,11 @@
#ifndef HWY_DLLEXPORT_H
#define HWY_DLLEXPORT_H

// Bazel build are always static:
#if !defined(HWY_SHARED_DEFINE) && !defined(HWY_STATIC_DEFINE)
#define HWY_STATIC_DEFINE
#endif

#ifdef HWY_STATIC_DEFINE
#if !defined(HWY_SHARED_DEFINE)
#define HWY_DLLEXPORT
#define HWY_NO_EXPORT
#define HWY_CONTRIB_DLLEXPORT
#define HWY_CONTRIB_NO_EXPORT
#define HWY_TEST_DLLEXPORT
#define HWY_TEST_NO_EXPORT
#else
#else  // !HWY_SHARED_DEFINE

#ifndef HWY_DLLEXPORT
#if defined(hwy_EXPORTS)
@ -31,23 +23,15 @@
#else
#define HWY_DLLEXPORT __attribute__((visibility("default")))
#endif
#else
#else  // defined(hwy_EXPORTS)
/* We are using this library */
#ifdef _WIN32
#define HWY_DLLEXPORT __declspec(dllimport)
#else
#define HWY_DLLEXPORT __attribute__((visibility("default")))
#endif
#endif
#endif

#ifndef HWY_NO_EXPORT
#ifdef _WIN32
#define HWY_NO_EXPORT
#else
#define HWY_NO_EXPORT __attribute__((visibility("hidden")))
#endif
#endif
#endif  // defined(hwy_EXPORTS)
#endif  // HWY_DLLEXPORT

#ifndef HWY_CONTRIB_DLLEXPORT
#if defined(hwy_contrib_EXPORTS)
@ -57,23 +41,15 @@
#else
#define HWY_CONTRIB_DLLEXPORT __attribute__((visibility("default")))
#endif
#else
#else  // defined(hwy_contrib_EXPORTS)
/* We are using this library */
#ifdef _WIN32
#define HWY_CONTRIB_DLLEXPORT __declspec(dllimport)
#else
#define HWY_CONTRIB_DLLEXPORT __attribute__((visibility("default")))
#endif
#endif
#endif

#ifndef HWY_CONTRIB_NO_EXPORT
#ifdef _WIN32
#define HWY_CONTRIB_NO_EXPORT
#else
#define HWY_CONTRIB_NO_EXPORT __attribute__((visibility("hidden")))
#endif
#endif
#endif  // defined(hwy_contrib_EXPORTS)
#endif  // HWY_CONTRIB_DLLEXPORT

#ifndef HWY_TEST_DLLEXPORT
#if defined(hwy_test_EXPORTS)
@ -83,24 +59,16 @@
#else
#define HWY_TEST_DLLEXPORT __attribute__((visibility("default")))
#endif
#else
#else  // defined(hwy_test_EXPORTS)
/* We are using this library */
#ifdef _WIN32
#define HWY_TEST_DLLEXPORT __declspec(dllimport)
#else
#define HWY_TEST_DLLEXPORT __attribute__((visibility("default")))
#endif
#endif
#endif
#endif  // defined(hwy_test_EXPORTS)
#endif  // HWY_TEST_DLLEXPORT

#ifndef HWY_TEST_NO_EXPORT
#ifdef _WIN32
#define HWY_TEST_NO_EXPORT
#else
#define HWY_TEST_NO_EXPORT __attribute__((visibility("hidden")))
#endif
#endif

#endif
#endif  // !HWY_SHARED_DEFINE

#endif /* HWY_DLLEXPORT_H */

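Editor's note: a hedged sketch of how the simplified visibility macro is meant to be applied (the function name is hypothetical, and the header path is assumed from this hunk's guard; HWY_SHARED_DEFINE and the per-library hwy_EXPORTS defines come from the build system, as the conditionals above show):

#include "hwy/highway_export.h"  // assumed path; its guard is HWY_DLLEXPORT_H

// Expands to __declspec(dllexport/dllimport) on Windows shared builds,
// __attribute__((visibility("default"))) on ELF shared builds, and nothing
// when HWY_SHARED_DEFINE is absent (static builds, e.g. Bazel).
HWY_DLLEXPORT void MyPublicEntryPoint(int value);  // hypothetical API
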
@ -1,4 +1,5 @@
// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -14,12 +15,15 @@

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <bitset>

#include "hwy/base.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "highway_test.cc"
#include "hwy/foreach_target.h"
#include "hwy/foreach_target.h"  // IWYU pragma: keep
#include "hwy/highway.h"
#include "hwy/nanobenchmark.h"  // Unpredictable1
#include "hwy/tests/test_util-inl.h"
@ -28,6 +32,38 @@ HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

template <size_t kLimit, typename T>
HWY_NOINLINE void TestCappedLimit(T /* tag */) {
  CappedTag<T, kLimit> d;
  // Ensure two ops compile
  HWY_ASSERT_VEC_EQ(d, Zero(d), Set(d, T{0}));

  // Ensure we do not write more than kLimit lanes
  const size_t N = Lanes(d);
  if (kLimit < N) {
    auto lanes = AllocateAligned<T>(N);
    std::fill(lanes.get(), lanes.get() + N, T{0});
    Store(Set(d, T{1}), d, lanes.get());
    for (size_t i = kLimit; i < N; ++i) {
      HWY_ASSERT_EQ(lanes[i], T{0});
    }
  }
}

// Adapter for ForAllTypes - we are constructing our own Simd<> and thus do not
// use ForPartialVectors etc.
struct TestCapped {
  template <typename T>
  void operator()(T t) const {
    TestCappedLimit<1>(t);
    TestCappedLimit<3>(t);
    TestCappedLimit<5>(t);
    TestCappedLimit<1ull << 15>(t);
  }
};

HWY_NOINLINE void TestAllCapped() { ForAllTypes(TestCapped()); }

// For testing that ForPartialVectors reaches every possible size:
using NumLanesSet = std::bitset<HWY_MAX_BYTES + 1>;

@ -48,7 +84,7 @@ struct TestMaxLanes {
|
|||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
const size_t N = Lanes(d);
|
||||
const size_t kMax = MaxLanes(d);
|
||||
const size_t kMax = MaxLanes(d); // for RVV, includes LMUL
|
||||
HWY_ASSERT(N <= kMax);
|
||||
HWY_ASSERT(kMax <= (HWY_MAX_BYTES / sizeof(T)));
|
||||
|
||||
|
@ -175,26 +211,19 @@ HWY_NOINLINE void TestAllSignBit() {
|
|||
ForFloatTypes(ForPartialVectors<TestSignBitFloat>());
|
||||
}
|
||||
|
||||
// std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY.
|
||||
template <typename TF>
|
||||
bool IsNaN(TF f) {
|
||||
MakeUnsigned<TF> bits;
|
||||
memcpy(&bits, &f, sizeof(TF));
|
||||
bits += bits;
|
||||
bits >>= 1; // clear sign bit
|
||||
// NaN if all exponent bits are set and the mantissa is not zero.
|
||||
return bits > ExponentMask<decltype(bits)>();
|
||||
}
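// [Editorial sketch, not part of the diff] For float, the add/shift pair
// clears the sign bit, and "greater than the exponent mask" separates NaN
// (nonzero mantissa) from infinity (zero mantissa):
static_assert(((0x7FC00000u + 0x7FC00000u) >> 1) > 0x7F800000u,
              "quiet NaN exceeds the f32 exponent mask, so IsNaN is true");
static_assert(!(((0x7F800000u + 0x7F800000u) >> 1) > 0x7F800000u),
              "+inf equals the f32 exponent mask, so IsNaN is false");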

// inline to work around incorrect SVE codegen (only first 128 bits used).
template <class D, class V>
HWY_NOINLINE void AssertNaN(D d, VecArg<V> v, const char* file, int line) {
HWY_INLINE void AssertNaN(D d, VecArg<V> v, const char* file, int line) {
  using T = TFromD<D>;
  const T lane = GetLane(v);
  if (!IsNaN(lane)) {
    const std::string type_name = TypeName(T(), Lanes(d));
  const size_t N = Lanes(d);
  if (!AllTrue(d, IsNaN(v))) {
    Print(d, "not all NaN", v, 0, N);
    Print(d, "mask", VecFromMask(d, IsNaN(v)), 0, N);
    const std::string type_name = TypeName(T(), N);
    // RVV lacks PRIu64 and MSYS still has problems with %zu, so print bytes to
    // avoid truncating doubles.
    uint8_t bytes[HWY_MAX(sizeof(T), 8)] = {0};
    const T lane = GetLane(v);
    memcpy(bytes, &lane, sizeof(T));
    Abort(file, line,
          "Expected %s NaN, got %E (bytes %02x %02x %02x %02x %02x %02x %02x "
@@ -264,14 +293,14 @@ struct TestNaN {

    // Reduction
    HWY_ASSERT_NAN(d, SumOfLanes(d, nan));
    // TODO(janwas): re-enable after QEMU is fixed
    // TODO(janwas): re-enable after QEMU/Spike are fixed
#if HWY_TARGET != HWY_RVV
    HWY_ASSERT_NAN(d, MinOfLanes(d, nan));
    HWY_ASSERT_NAN(d, MaxOfLanes(d, nan));
#endif

    // Min
#if HWY_ARCH_X86 && HWY_TARGET != HWY_SCALAR
#if HWY_ARCH_X86 && (HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_EMU128)
    // x86 SIMD returns the second operand if any input is NaN.
    HWY_ASSERT_VEC_EQ(d, v1, Min(nan, v1));
    HWY_ASSERT_VEC_EQ(d, v1, Max(nan, v1));
@@ -317,6 +346,74 @@ HWY_NOINLINE void TestAllNaN() {
  ForPartialVectors<TestF32NaN>()(float());
}

struct TestIsNaN {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto v1 = Set(d, T(Unpredictable1()));
    const auto inf = IfThenElse(Eq(v1, Set(d, T(1))), Inf(d), v1);
    const auto nan = IfThenElse(Eq(v1, Set(d, T(1))), NaN(d), v1);
    const auto neg = Set(d, T{-1});
    HWY_ASSERT_NAN(d, nan);
    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsNaN(inf));
    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsNaN(CopySign(inf, neg)));
    HWY_ASSERT_MASK_EQ(d, MaskTrue(d), IsNaN(nan));
    HWY_ASSERT_MASK_EQ(d, MaskTrue(d), IsNaN(CopySign(nan, neg)));
    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsNaN(v1));
    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsNaN(Zero(d)));
    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsNaN(Set(d, hwy::LowestValue<T>())));
    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsNaN(Set(d, hwy::HighestValue<T>())));
  }
};

HWY_NOINLINE void TestAllIsNaN() {
  ForFloatTypes(ForPartialVectors<TestIsNaN>());
}

struct TestIsInf {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto v1 = Set(d, T(Unpredictable1()));
    const auto inf = IfThenElse(Eq(v1, Set(d, T(1))), Inf(d), v1);
    const auto nan = IfThenElse(Eq(v1, Set(d, T(1))), NaN(d), v1);
    const auto neg = Set(d, T{-1});
    HWY_ASSERT_MASK_EQ(d, MaskTrue(d), IsInf(inf));
    HWY_ASSERT_MASK_EQ(d, MaskTrue(d), IsInf(CopySign(inf, neg)));
    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsInf(nan));
    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsInf(CopySign(nan, neg)));
    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsInf(v1));
    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsInf(Zero(d)));
    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsInf(Set(d, hwy::LowestValue<T>())));
    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsInf(Set(d, hwy::HighestValue<T>())));
  }
};

HWY_NOINLINE void TestAllIsInf() {
  ForFloatTypes(ForPartialVectors<TestIsInf>());
}

struct TestIsFinite {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto v1 = Set(d, T(Unpredictable1()));
    const auto inf = IfThenElse(Eq(v1, Set(d, T(1))), Inf(d), v1);
    const auto nan = IfThenElse(Eq(v1, Set(d, T(1))), NaN(d), v1);
    const auto neg = Set(d, T{-1});
    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsFinite(inf));
    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsFinite(CopySign(inf, neg)));
    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsFinite(nan));
    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsFinite(CopySign(nan, neg)));
    HWY_ASSERT_MASK_EQ(d, MaskTrue(d), IsFinite(v1));
    HWY_ASSERT_MASK_EQ(d, MaskTrue(d), IsFinite(Zero(d)));
    HWY_ASSERT_MASK_EQ(d, MaskTrue(d), IsFinite(Set(d, hwy::LowestValue<T>())));
    HWY_ASSERT_MASK_EQ(d, MaskTrue(d),
                       IsFinite(Set(d, hwy::HighestValue<T>())));
  }
};

HWY_NOINLINE void TestAllIsFinite() {
  ForFloatTypes(ForPartialVectors<TestIsFinite>());
}

struct TestCopyAndAssign {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
@@ -371,21 +468,19 @@ HWY_AFTER_NAMESPACE();

namespace hwy {
HWY_BEFORE_TEST(HighwayTest);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllCapped);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllMaxLanes);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllSet);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllOverflow);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllClamp);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllSignBit);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllNaN);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllIsNaN);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllIsInf);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllIsFinite);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllCopyAndAssign);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllGetLane);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllDFromV);
}  // namespace hwy

// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}

#endif
@@ -1,4 +1,5 @@
// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -24,6 +25,7 @@
#include <algorithm>  // sort
#include <array>
#include <atomic>
#include <chrono>  //NOLINT
#include <limits>
#include <numeric>  // iota
#include <random>
@@ -122,6 +124,9 @@ inline Ticks Start() {
  Ticks t;
#if HWY_ARCH_PPC && defined(__GLIBC__)
  asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
#elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC
  // pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU.
  asm volatile("mrs %0, cntvct_el0" : "=r"(t));
#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
  _ReadWriteBarrier();
  _mm_lfence();
@@ -160,10 +165,14 @@ inline Ticks Start() {
  return t;
}

// WARNING: on x86, caller must check HasRDTSCP before using this!
inline Ticks Stop() {
  uint64_t t;
#if HWY_ARCH_PPC && defined(__GLIBC__)
  asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
#elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC
  // pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU.
  asm volatile("mrs %0, cntvct_el0" : "=r"(t));
#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
  _ReadWriteBarrier();
  unsigned aux;
@@ -302,7 +311,8 @@ T MedianAbsoluteDeviation(const T* values, const size_t num_values,
  std::vector<T> abs_deviations;
  abs_deviations.reserve(num_values);
  for (size_t i = 0; i < num_values; ++i) {
    const int64_t abs = std::abs(int64_t(values[i]) - int64_t(median));
    const int64_t abs = std::abs(static_cast<int64_t>(values[i]) -
                                 static_cast<int64_t>(median));
    abs_deviations.push_back(static_cast<T>(abs));
  }
  return Median(abs_deviations.data(), num_values);
@@ -331,6 +341,38 @@ inline void PreventElision(T&& output) {
#endif
}

// Measures the actual current frequency of Ticks. We cannot rely on the nominal
// frequency encoded in x86 BrandString because it is misleading on M1 Rosetta,
// and not reported by AMD. CPUID 0x15 is also not yet widely supported. Also
// used on RISC-V and ARM64.
HWY_MAYBE_UNUSED double MeasureNominalClockRate() {
  double max_ticks_per_sec = 0.0;
  // Arbitrary, enough to ignore 2 outliers without excessive init time.
  for (int rep = 0; rep < 3; ++rep) {
    auto time0 = std::chrono::steady_clock::now();
    using Time = decltype(time0);
    const timer::Ticks ticks0 = timer::Start();
    const Time time_min = time0 + std::chrono::milliseconds(10);

    Time time1;
    timer::Ticks ticks1;
    for (;;) {
      time1 = std::chrono::steady_clock::now();
      // Ideally this would be Stop, but that requires RDTSCP on x86. To avoid
      // another codepath, just use Start instead. now() presumably has its own
      // fence-like behavior.
      ticks1 = timer::Start();  // Do not use Stop, see comment above
      if (time1 >= time_min) break;
    }

    const double dticks = static_cast<double>(ticks1 - ticks0);
    std::chrono::duration<double, std::ratio<1>> dtime = time1 - time0;
    const double ticks_per_sec = dticks / dtime.count();
    max_ticks_per_sec = std::max(max_ticks_per_sec, ticks_per_sec);
  }
  return max_ticks_per_sec;
}

#if HWY_ARCH_X86

void Cpuid(const uint32_t level, const uint32_t count,
@@ -378,50 +420,27 @@ std::string BrandString() {
  return brand_string;
}

// Returns the frequency quoted inside the brand string. This does not
// account for throttling nor Turbo Boost.
double NominalClockRate() {
  const std::string& brand_string = BrandString();
  // Brand strings include the maximum configured frequency. These prefixes are
  // defined by Intel CPUID documentation.
  const char* prefixes[3] = {"MHz", "GHz", "THz"};
  const double multipliers[3] = {1E6, 1E9, 1E12};
  for (size_t i = 0; i < 3; ++i) {
    const size_t pos_prefix = brand_string.find(prefixes[i]);
    if (pos_prefix != std::string::npos) {
      const size_t pos_space = brand_string.rfind(' ', pos_prefix - 1);
      if (pos_space != std::string::npos) {
        const std::string digits =
            brand_string.substr(pos_space + 1, pos_prefix - pos_space - 1);
        return std::stod(digits) * multipliers[i];
      }
    }
  }

  return 0.0;
}

#endif  // HWY_ARCH_X86

}  // namespace

HWY_DLLEXPORT double InvariantTicksPerSecond() {
#if HWY_ARCH_PPC && defined(__GLIBC__)
  return double(__ppc_get_timebase_freq());
#elif HWY_ARCH_X86
  // We assume the TSC is invariant; it is on all recent Intel/AMD CPUs.
  return NominalClockRate();
  return static_cast<double>(__ppc_get_timebase_freq());
#elif HWY_ARCH_X86 || HWY_ARCH_RVV || (HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC)
  // We assume the x86 TSC is invariant; it is on all recent Intel/AMD CPUs.
  static const double freq = MeasureNominalClockRate();
  return freq;
#elif defined(_WIN32) || defined(_WIN64)
  LARGE_INTEGER freq;
  (void)QueryPerformanceFrequency(&freq);
  return double(freq.QuadPart);
  return static_cast<double>(freq.QuadPart);
#elif defined(__APPLE__)
  // https://developer.apple.com/library/mac/qa/qa1398/_index.html
  mach_timebase_info_data_t timebase;
  (void)mach_timebase_info(&timebase);
  return double(timebase.denom) / timebase.numer * 1E9;
  return static_cast<double>(timebase.denom) / timebase.numer * 1E9;
#else
  // TODO(janwas): ARM? Unclear how to reliably query cntvct_el0 frequency.
  return 1E9;  // Haiku and clock_gettime return nanoseconds.
#endif
}
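// [Editorial sketch, not part of the diff] Converting a tick interval to
// seconds with the frequency returned above; the platform:: qualification
// matches its use elsewhere in this file.
static inline double ElapsedSeconds(uint64_t ticks0, uint64_t ticks1) {
  static const double ticks_per_sec = platform::InvariantTicksPerSecond();
  return static_cast<double>(ticks1 - ticks0) / ticks_per_sec;
}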
@@ -432,14 +451,28 @@ HWY_DLLEXPORT double Now() {
}

HWY_DLLEXPORT uint64_t TimerResolution() {
#if HWY_ARCH_X86
  bool can_use_stop = platform::HasRDTSCP();
#else
  constexpr bool can_use_stop = true;
#endif

  // Nested loop avoids exceeding stack/L1 capacity.
  timer::Ticks repetitions[Params::kTimerSamples];
  for (size_t rep = 0; rep < Params::kTimerSamples; ++rep) {
    timer::Ticks samples[Params::kTimerSamples];
    for (size_t i = 0; i < Params::kTimerSamples; ++i) {
      const timer::Ticks t0 = timer::Start();
      const timer::Ticks t1 = timer::Stop();
      samples[i] = t1 - t0;
    if (can_use_stop) {
      for (size_t i = 0; i < Params::kTimerSamples; ++i) {
        const timer::Ticks t0 = timer::Start();
        const timer::Ticks t1 = timer::Stop();  // we checked HasRDTSCP above
        samples[i] = t1 - t0;
      }
    } else {
      for (size_t i = 0; i < Params::kTimerSamples; ++i) {
        const timer::Ticks t0 = timer::Start();
        const timer::Ticks t1 = timer::Start();  // do not use Stop, see above
        samples[i] = t1 - t0;
      }
    }
    repetitions[rep] = robust_statistics::Mode(samples);
  }
@@ -459,7 +492,7 @@ timer::Ticks SampleUntilStable(const double max_rel_mad, double* rel_mad,
  // Choose initial samples_per_eval based on a single estimated duration.
  timer::Ticks t0 = timer::Start();
  lambda();
  timer::Ticks t1 = timer::Stop();
  timer::Ticks t1 = timer::Stop();  // Caller checks HasRDTSCP
  timer::Ticks est = t1 - t0;
  static const double ticks_per_second = platform::InvariantTicksPerSecond();
  const size_t ticks_per_eval =
@@ -483,7 +516,7 @@ timer::Ticks SampleUntilStable(const double max_rel_mad, double* rel_mad,
  for (size_t i = 0; i < samples_per_eval; ++i) {
    t0 = timer::Start();
    lambda();
    t1 = timer::Stop();
    t1 = timer::Stop();  // Caller checks HasRDTSCP
    samples.push_back(t1 - t0);
  }
@@ -1,4 +1,5 @@
// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -1,4 +1,5 @@
// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -57,7 +58,7 @@ std::mt19937 rng;
// A function whose runtime depends on rng.
FuncOutput Random(const void* /*arg*/, FuncInput in) {
  const size_t r = rng() & 0xF;
  uint32_t ret = in;
  FuncOutput ret = static_cast<FuncOutput>(in);
  for (size_t i = 0; i < r; ++i) {
    ret /= ((rng() & 1) + 2);
  }
@@ -88,9 +89,3 @@ TEST(NanobenchmarkTest, RunAll) {

}  // namespace
}  // namespace hwy

// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}
(Diff not shown for three files because of their large size.)
@@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -58,9 +59,8 @@ HWY_API V CombineShiftRightLanes(D d, const V hi, const V lo) {
// Returns lanes with the most significant bit set and all other bits zero.
template <class D>
HWY_API Vec<D> SignBit(D d) {
  using Unsigned = MakeUnsigned<TFromD<D>>;
  const Unsigned bit = Unsigned(1) << (sizeof(Unsigned) * 8 - 1);
  return BitCast(d, Set(Rebind<Unsigned, D>(), bit));
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Set(du, SignMask<TFromD<D>>()));
}
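// [Editorial sketch, not part of the diff] The rewrite replaces the
// hand-rolled shift with the named SignMask constant; assuming SignMask
// returns the MSB-only pattern, both forms agree for 32-bit lanes:
static_assert((1u << 31) == 0x80000000u, "MSB-only pattern for 32-bit lanes");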

// Returns quiet NaN.
@@ -72,6 +72,906 @@ HWY_API Vec<D> NaN(D d) {
  return BitCast(d, Set(di, LimitsMax<TFromD<decltype(di)>>()));
}

// Returns positive infinity.
template <class D>
HWY_API Vec<D> Inf(D d) {
  const RebindToUnsigned<D> du;
  using T = TFromD<D>;
  using TU = TFromD<decltype(du)>;
  const TU max_x2 = static_cast<TU>(MaxExponentTimes2<T>());
  return BitCast(d, Set(du, max_x2 >> 1));
}
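// [Editorial sketch, not part of the diff] Assuming MaxExponentTimes2<float>()
// is 0xFF000000 (the exponent field shifted into the top bits), halving it
// yields the canonical +inf bit pattern:
static_assert((0xFF000000u >> 1) == 0x7F800000u,
              "f32 +inf: exponent bits set, mantissa zero");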

// ------------------------------ SafeFillN

template <class D, typename T = TFromD<D>>
HWY_API void SafeFillN(const size_t num, const T value, D d,
                       T* HWY_RESTRICT to) {
#if HWY_MEM_OPS_MIGHT_FAULT
  (void)d;
  for (size_t i = 0; i < num; ++i) {
    to[i] = value;
  }
#else
  BlendedStore(Set(d, value), FirstN(d, num), d, to);
#endif
}

// ------------------------------ SafeCopyN

template <class D, typename T = TFromD<D>>
HWY_API void SafeCopyN(const size_t num, D d, const T* HWY_RESTRICT from,
                       T* HWY_RESTRICT to) {
#if HWY_MEM_OPS_MIGHT_FAULT
  (void)d;
  for (size_t i = 0; i < num; ++i) {
    to[i] = from[i];
  }
#else
  const Mask<D> mask = FirstN(d, num);
  BlendedStore(MaskedLoad(mask, d, from), mask, d, to);
#endif
}
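// [Editorial sketch, not part of the diff] The intended use is the tail of a
// strip-mined loop, where fewer than Lanes(d) elements remain and a full
// vector store could write, or fault, past the end of the buffer.
template <class D, typename T = TFromD<D>>
void CopyArray(D d, const T* HWY_RESTRICT from, T* HWY_RESTRICT to, size_t n) {
  size_t i = 0;
  for (; i + Lanes(d) <= n; i += Lanes(d)) {
    StoreU(LoadU(d, from + i), d, to + i);  // full vectors for the bulk
  }
  SafeCopyN(n - i, d, from + i, to + i);  // tail: never writes past to + n
}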

// "Include guard": skip if native instructions are available. The generic
// implementation is currently shared between x86_* and wasm_*, and is too large
// to duplicate.

#if (defined(HWY_NATIVE_LOAD_STORE_INTERLEAVED) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#else
#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
#endif

// ------------------------------ LoadInterleaved2

template <typename T, size_t N, class V>
HWY_API void LoadInterleaved2(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
                              V& v0, V& v1) {
  const V A = LoadU(d, unaligned + 0 * N);  // v1[1] v0[1] v1[0] v0[0]
  const V B = LoadU(d, unaligned + 1 * N);
  v0 = ConcatEven(d, B, A);
  v1 = ConcatOdd(d, B, A);
}

template <typename T, class V>
HWY_API void LoadInterleaved2(Simd<T, 1, 0> d, const T* HWY_RESTRICT unaligned,
                              V& v0, V& v1) {
  v0 = LoadU(d, unaligned + 0);
  v1 = LoadU(d, unaligned + 1);
}
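// [Editorial sketch, not part of the diff] The scalar loop these overloads
// vectorize: deinterleaving pairs x0 y0 x1 y1 ... into two planar arrays.
template <typename T>
void LoadInterleaved2Scalar(const T* in, size_t pairs, T* x, T* y) {
  for (size_t i = 0; i < pairs; ++i) {
    x[i] = in[2 * i + 0];  // even lanes -> v0 (ConcatEven above)
    y[i] = in[2 * i + 1];  // odd lanes -> v1 (ConcatOdd above)
  }
}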

// ------------------------------ LoadInterleaved3 (CombineShiftRightBytes)

namespace detail {

// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
template <typename T, size_t N, class V, HWY_IF_LE128(T, N)>
HWY_API void LoadTransposedBlocks3(Simd<T, N, 0> d,
                                   const T* HWY_RESTRICT unaligned, V& A, V& B,
                                   V& C) {
  A = LoadU(d, unaligned + 0 * N);
  B = LoadU(d, unaligned + 1 * N);
  C = LoadU(d, unaligned + 2 * N);
}

}  // namespace detail

template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 16)>
HWY_API void LoadInterleaved3(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
                              V& v0, V& v1, V& v2) {
  const RebindToUnsigned<decltype(d)> du;
  // Compact notation so these fit on one line: 12 := v1[2].
  V A;  // 05 24 14 04 23 13 03 22 12 02 21 11 01 20 10 00
  V B;  // 1a 0a 29 19 09 28 18 08 27 17 07 26 16 06 25 15
  V C;  // 2f 1f 0f 2e 1e 0e 2d 1d 0d 2c 1c 0c 2b 1b 0b 2a
  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
  // Compress all lanes belonging to v0 into consecutive lanes.
  constexpr uint8_t Z = 0x80;
  alignas(16) constexpr uint8_t kIdx_v0A[16] = {0, 3, 6, 9, 12, 15, Z, Z,
                                                Z, Z, Z, Z, Z, Z, Z, Z};
  alignas(16) constexpr uint8_t kIdx_v0B[16] = {Z, Z, Z, Z, Z, Z, 2, 5,
                                                8, 11, 14, Z, Z, Z, Z, Z};
  alignas(16) constexpr uint8_t kIdx_v0C[16] = {Z, Z, Z, Z, Z, Z, Z, Z,
                                                Z, Z, Z, 1, 4, 7, 10, 13};
  alignas(16) constexpr uint8_t kIdx_v1A[16] = {1, 4, 7, 10, 13, Z, Z, Z,
                                                Z, Z, Z, Z, Z, Z, Z, Z};
  alignas(16) constexpr uint8_t kIdx_v1B[16] = {Z, Z, Z, Z, Z, 0, 3, 6,
                                                9, 12, 15, Z, Z, Z, Z, Z};
  alignas(16) constexpr uint8_t kIdx_v1C[16] = {Z, Z, Z, Z, Z, Z, Z, Z,
                                                Z, Z, Z, 2, 5, 8, 11, 14};
  alignas(16) constexpr uint8_t kIdx_v2A[16] = {2, 5, 8, 11, 14, Z, Z, Z,
                                                Z, Z, Z, Z, Z, Z, Z, Z};
  alignas(16) constexpr uint8_t kIdx_v2B[16] = {Z, Z, Z, Z, Z, 1, 4, 7,
                                                10, 13, Z, Z, Z, Z, Z, Z};
  alignas(16) constexpr uint8_t kIdx_v2C[16] = {Z, Z, Z, Z, Z, Z, Z, Z,
                                                Z, Z, 0, 3, 6, 9, 12, 15};
  const V v0L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v0A)));
  const V v0M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v0B)));
  const V v0U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v0C)));
  const V v1L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v1A)));
  const V v1M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v1B)));
  const V v1U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v1C)));
  const V v2L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v2A)));
  const V v2M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v2B)));
  const V v2U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v2C)));
  v0 = Or3(v0L, v0M, v0U);
  v1 = Or3(v1L, v1M, v1U);
  v2 = Or3(v2L, v2M, v2U);
}

// 8-bit lanes x8
template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 1),
          HWY_IF_LANES_PER_BLOCK(T, N, 8)>
HWY_API void LoadInterleaved3(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
                              V& v0, V& v1, V& v2) {
  const RebindToUnsigned<decltype(d)> du;
  V A;  // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0]
  V B;  // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2]
  V C;  // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5]
  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
  // Compress all lanes belonging to v0 into consecutive lanes.
  constexpr uint8_t Z = 0x80;
  alignas(16) constexpr uint8_t kIdx_v0A[16] = {0, 3, 6, Z, Z, Z, Z, Z};
  alignas(16) constexpr uint8_t kIdx_v0B[16] = {Z, Z, Z, 1, 4, 7, Z, Z};
  alignas(16) constexpr uint8_t kIdx_v0C[16] = {Z, Z, Z, Z, Z, Z, 2, 5};
  alignas(16) constexpr uint8_t kIdx_v1A[16] = {1, 4, 7, Z, Z, Z, Z, Z};
  alignas(16) constexpr uint8_t kIdx_v1B[16] = {Z, Z, Z, 2, 5, Z, Z, Z};
  alignas(16) constexpr uint8_t kIdx_v1C[16] = {Z, Z, Z, Z, Z, 0, 3, 6};
  alignas(16) constexpr uint8_t kIdx_v2A[16] = {2, 5, Z, Z, Z, Z, Z, Z};
  alignas(16) constexpr uint8_t kIdx_v2B[16] = {Z, Z, 0, 3, 6, Z, Z, Z};
  alignas(16) constexpr uint8_t kIdx_v2C[16] = {Z, Z, Z, Z, Z, 1, 4, 7};
  const V v0L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v0A)));
  const V v0M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v0B)));
  const V v0U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v0C)));
  const V v1L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v1A)));
  const V v1M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v1B)));
  const V v1U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v1C)));
  const V v2L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v2A)));
  const V v2M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v2B)));
  const V v2U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v2C)));
  v0 = Or3(v0L, v0M, v0U);
  v1 = Or3(v1L, v1M, v1U);
  v2 = Or3(v2L, v2M, v2U);
}

// 16-bit lanes x8
template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 2),
          HWY_IF_LANES_PER_BLOCK(T, N, 8)>
HWY_API void LoadInterleaved3(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
                              V& v0, V& v1, V& v2) {
  const RebindToUnsigned<decltype(d)> du;
  V A;  // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0]
  V B;  // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2]
  V C;  // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5]
  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
  // Compress all lanes belonging to v0 into consecutive lanes. Same as above,
  // but each element of the array contains two byte indices for a lane.
  constexpr uint16_t Z = 0x8080;
  alignas(16) constexpr uint16_t kIdx_v0A[8] = {0x0100, 0x0706, 0x0D0C, Z,
                                                Z, Z, Z, Z};
  alignas(16) constexpr uint16_t kIdx_v0B[8] = {Z, Z, Z, 0x0302,
                                                0x0908, 0x0F0E, Z, Z};
  alignas(16) constexpr uint16_t kIdx_v0C[8] = {Z, Z, Z, Z,
                                                Z, Z, 0x0504, 0x0B0A};
  alignas(16) constexpr uint16_t kIdx_v1A[8] = {0x0302, 0x0908, 0x0F0E, Z,
                                                Z, Z, Z, Z};
  alignas(16) constexpr uint16_t kIdx_v1B[8] = {Z, Z, Z, 0x0504,
                                                0x0B0A, Z, Z, Z};
  alignas(16) constexpr uint16_t kIdx_v1C[8] = {Z, Z, Z, Z,
                                                Z, 0x0100, 0x0706, 0x0D0C};
  alignas(16) constexpr uint16_t kIdx_v2A[8] = {0x0504, 0x0B0A, Z, Z,
                                                Z, Z, Z, Z};
  alignas(16) constexpr uint16_t kIdx_v2B[8] = {Z, Z, 0x0100, 0x0706,
                                                0x0D0C, Z, Z, Z};
  alignas(16) constexpr uint16_t kIdx_v2C[8] = {Z, Z, Z, Z,
                                                Z, 0x0302, 0x0908, 0x0F0E};
  const V v0L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v0A)));
  const V v0M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v0B)));
  const V v0U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v0C)));
  const V v1L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v1A)));
  const V v1M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v1B)));
  const V v1U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v1C)));
  const V v2L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v2A)));
  const V v2M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v2B)));
  const V v2U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v2C)));
  v0 = Or3(v0L, v0M, v0U);
  v1 = Or3(v1L, v1M, v1U);
  v2 = Or3(v2L, v2M, v2U);
}

template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 4)>
HWY_API void LoadInterleaved3(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
                              V& v0, V& v1, V& v2) {
  V A;  // v0[1] v2[0] v1[0] v0[0]
  V B;  // v1[2] v0[2] v2[1] v1[1]
  V C;  // v2[3] v1[3] v0[3] v2[2]
  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);

  const V vxx_02_03_xx = OddEven(C, B);
  v0 = detail::Shuffle1230(A, vxx_02_03_xx);

  // Shuffle2301 takes the upper/lower halves of the output from one input, so
  // we cannot just combine 13 and 10 with 12 and 11 (similar to v0/v2). Use
  // OddEven because it may have higher throughput than Shuffle.
  const V vxx_xx_10_11 = OddEven(A, B);
  const V v12_13_xx_xx = OddEven(B, C);
  v1 = detail::Shuffle2301(vxx_xx_10_11, v12_13_xx_xx);

  const V vxx_20_21_xx = OddEven(B, A);
  v2 = detail::Shuffle3012(vxx_20_21_xx, C);
}

template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 2)>
HWY_API void LoadInterleaved3(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
                              V& v0, V& v1, V& v2) {
  V A;  // v1[0] v0[0]
  V B;  // v0[1] v2[0]
  V C;  // v2[1] v1[1]
  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
  v0 = OddEven(B, A);
  v1 = CombineShiftRightBytes<sizeof(T)>(d, C, A);
  v2 = OddEven(C, B);
}

template <typename T, class V>
HWY_API void LoadInterleaved3(Simd<T, 1, 0> d, const T* HWY_RESTRICT unaligned,
                              V& v0, V& v1, V& v2) {
  v0 = LoadU(d, unaligned + 0);
  v1 = LoadU(d, unaligned + 1);
  v2 = LoadU(d, unaligned + 2);
}
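// [Editorial sketch, not part of the diff] All LoadInterleaved3 overloads
// above compute the permutation of this scalar loop, e.g. unpacking packed
// RGB into three planes; only the shuffle strategy differs per lane count.
template <typename T>
void LoadInterleaved3Scalar(const T* in, size_t n, T* r, T* g, T* b) {
  for (size_t i = 0; i < n; ++i) {
    r[i] = in[3 * i + 0];
    g[i] = in[3 * i + 1];
    b[i] = in[3 * i + 2];
  }
}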

// ------------------------------ LoadInterleaved4

namespace detail {

// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
template <typename T, size_t N, class V, HWY_IF_LE128(T, N)>
HWY_API void LoadTransposedBlocks4(Simd<T, N, 0> d,
                                   const T* HWY_RESTRICT unaligned, V& A, V& B,
                                   V& C, V& D) {
  A = LoadU(d, unaligned + 0 * N);
  B = LoadU(d, unaligned + 1 * N);
  C = LoadU(d, unaligned + 2 * N);
  D = LoadU(d, unaligned + 3 * N);
}

}  // namespace detail

template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 16)>
HWY_API void LoadInterleaved4(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
                              V& v0, V& v1, V& v2, V& v3) {
  const Repartition<uint64_t, decltype(d)> d64;
  using V64 = VFromD<decltype(d64)>;
  // 16 lanes per block; the lowest four blocks are at the bottom of A,B,C,D.
  // Here int[i] means the four interleaved values of the i-th 4-tuple and
  // int[3..0] indicates four consecutive 4-tuples (0 = least-significant).
  V A;  // int[13..10] int[3..0]
  V B;  // int[17..14] int[7..4]
  V C;  // int[1b..18] int[b..8]
  V D;  // int[1f..1c] int[f..c]
  detail::LoadTransposedBlocks4(d, unaligned, A, B, C, D);

  // For brevity, the comments only list the lower block (upper = lower + 0x10)
  const V v5140 = InterleaveLower(d, A, B);  // int[5,1,4,0]
  const V vd9c8 = InterleaveLower(d, C, D);  // int[d,9,c,8]
  const V v7362 = InterleaveUpper(d, A, B);  // int[7,3,6,2]
  const V vfbea = InterleaveUpper(d, C, D);  // int[f,b,e,a]

  const V v6420 = InterleaveLower(d, v5140, v7362);  // int[6,4,2,0]
  const V veca8 = InterleaveLower(d, vd9c8, vfbea);  // int[e,c,a,8]
  const V v7531 = InterleaveUpper(d, v5140, v7362);  // int[7,5,3,1]
  const V vfdb9 = InterleaveUpper(d, vd9c8, vfbea);  // int[f,d,b,9]

  const V64 v10L = BitCast(d64, InterleaveLower(d, v6420, v7531));  // v10[7..0]
  const V64 v10U = BitCast(d64, InterleaveLower(d, veca8, vfdb9));  // v10[f..8]
  const V64 v32L = BitCast(d64, InterleaveUpper(d, v6420, v7531));  // v32[7..0]
  const V64 v32U = BitCast(d64, InterleaveUpper(d, veca8, vfdb9));  // v32[f..8]

  v0 = BitCast(d, InterleaveLower(d64, v10L, v10U));
  v1 = BitCast(d, InterleaveUpper(d64, v10L, v10U));
  v2 = BitCast(d, InterleaveLower(d64, v32L, v32U));
  v3 = BitCast(d, InterleaveUpper(d64, v32L, v32U));
}
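// [Editorial sketch, not part of the diff] The three rounds of
// InterleaveLower/Upper above amount to transposing a matrix whose rows are
// the loaded vectors A..D and whose columns are 4-tuples; the effect on one
// 4x4 block, in plain C++:
inline void Transpose4x4(const unsigned char in[4][4],
                         unsigned char out[4][4]) {
  for (int r = 0; r < 4; ++r) {
    for (int c = 0; c < 4; ++c) {
      out[c][r] = in[r][c];  // element c of tuple r becomes lane r of output c
    }
  }
}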

template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 8)>
HWY_API void LoadInterleaved4(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
                              V& v0, V& v1, V& v2, V& v3) {
  // In the last step, we interleave by half of the block size, which is usually
  // 8 bytes but half that for 8-bit x8 vectors.
  using TW = hwy::UnsignedFromSize<sizeof(T) * N == 8 ? 4 : 8>;
  const Repartition<TW, decltype(d)> dw;
  using VW = VFromD<decltype(dw)>;

  // (Comments are for 256-bit vectors.)
  // 8 lanes per block; the lowest four blocks are at the bottom of A,B,C,D.
  V A;  // v3210[9]v3210[8] v3210[1]v3210[0]
  V B;  // v3210[b]v3210[a] v3210[3]v3210[2]
  V C;  // v3210[d]v3210[c] v3210[5]v3210[4]
  V D;  // v3210[f]v3210[e] v3210[7]v3210[6]
  detail::LoadTransposedBlocks4(d, unaligned, A, B, C, D);

  const V va820 = InterleaveLower(d, A, B);  // v3210[a,8] v3210[2,0]
  const V vec64 = InterleaveLower(d, C, D);  // v3210[e,c] v3210[6,4]
  const V vb931 = InterleaveUpper(d, A, B);  // v3210[b,9] v3210[3,1]
  const V vfd75 = InterleaveUpper(d, C, D);  // v3210[f,d] v3210[7,5]

  const VW v10_b830 =  // v10[b..8] v10[3..0]
      BitCast(dw, InterleaveLower(d, va820, vb931));
  const VW v10_fc74 =  // v10[f..c] v10[7..4]
      BitCast(dw, InterleaveLower(d, vec64, vfd75));
  const VW v32_b830 =  // v32[b..8] v32[3..0]
      BitCast(dw, InterleaveUpper(d, va820, vb931));
  const VW v32_fc74 =  // v32[f..c] v32[7..4]
      BitCast(dw, InterleaveUpper(d, vec64, vfd75));

  v0 = BitCast(d, InterleaveLower(dw, v10_b830, v10_fc74));
  v1 = BitCast(d, InterleaveUpper(dw, v10_b830, v10_fc74));
  v2 = BitCast(d, InterleaveLower(dw, v32_b830, v32_fc74));
  v3 = BitCast(d, InterleaveUpper(dw, v32_b830, v32_fc74));
}

template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 4)>
HWY_API void LoadInterleaved4(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
                              V& v0, V& v1, V& v2, V& v3) {
  V A;  // v3210[4] v3210[0]
  V B;  // v3210[5] v3210[1]
  V C;  // v3210[6] v3210[2]
  V D;  // v3210[7] v3210[3]
  detail::LoadTransposedBlocks4(d, unaligned, A, B, C, D);
  const V v10_ev = InterleaveLower(d, A, C);  // v1[6,4] v0[6,4] v1[2,0] v0[2,0]
  const V v10_od = InterleaveLower(d, B, D);  // v1[7,5] v0[7,5] v1[3,1] v0[3,1]
  const V v32_ev = InterleaveUpper(d, A, C);  // v3[6,4] v2[6,4] v3[2,0] v2[2,0]
  const V v32_od = InterleaveUpper(d, B, D);  // v3[7,5] v2[7,5] v3[3,1] v2[3,1]

  v0 = InterleaveLower(d, v10_ev, v10_od);
  v1 = InterleaveUpper(d, v10_ev, v10_od);
  v2 = InterleaveLower(d, v32_ev, v32_od);
  v3 = InterleaveUpper(d, v32_ev, v32_od);
}

template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 2)>
HWY_API void LoadInterleaved4(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
                              V& v0, V& v1, V& v2, V& v3) {
  V A, B, C, D;
  detail::LoadTransposedBlocks4(d, unaligned, A, B, C, D);
  v0 = InterleaveLower(d, A, C);
  v1 = InterleaveUpper(d, A, C);
  v2 = InterleaveLower(d, B, D);
  v3 = InterleaveUpper(d, B, D);
}

// Any T x1
template <typename T, class V>
HWY_API void LoadInterleaved4(Simd<T, 1, 0> d, const T* HWY_RESTRICT unaligned,
                              V& v0, V& v1, V& v2, V& v3) {
  v0 = LoadU(d, unaligned + 0);
  v1 = LoadU(d, unaligned + 1);
  v2 = LoadU(d, unaligned + 2);
  v3 = LoadU(d, unaligned + 3);
}

// ------------------------------ StoreInterleaved2

namespace detail {

// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
template <typename T, size_t N, class V, HWY_IF_LE128(T, N)>
HWY_API void StoreTransposedBlocks2(const V A, const V B, Simd<T, N, 0> d,
                                    T* HWY_RESTRICT unaligned) {
  StoreU(A, d, unaligned + 0 * N);
  StoreU(B, d, unaligned + 1 * N);
}

}  // namespace detail

// >= 128 bit vector
template <typename T, size_t N, class V, HWY_IF_GE128(T, N)>
HWY_API void StoreInterleaved2(const V v0, const V v1, Simd<T, N, 0> d,
                               T* HWY_RESTRICT unaligned) {
  const auto v10L = InterleaveLower(d, v0, v1);  // .. v1[0] v0[0]
  const auto v10U = InterleaveUpper(d, v0, v1);  // .. v1[N/2] v0[N/2]
  detail::StoreTransposedBlocks2(v10L, v10U, d, unaligned);
}

// 64 bits
template <typename T>
HWY_API void StoreInterleaved2(const Vec64<T> part0, const Vec64<T> part1,
                               Full64<T> /*tag*/, T* HWY_RESTRICT unaligned) {
  // Use full vectors to reduce the number of stores.
  const Full128<T> d_full;
  const Vec128<T> v0{part0.raw};
  const Vec128<T> v1{part1.raw};
  const auto v10 = InterleaveLower(d_full, v0, v1);
  StoreU(v10, d_full, unaligned);
}

// <= 32 bits
template <typename T, size_t N, HWY_IF_LE32(T, N)>
HWY_API void StoreInterleaved2(const Vec128<T, N> part0,
                               const Vec128<T, N> part1, Simd<T, N, 0> /*tag*/,
                               T* HWY_RESTRICT unaligned) {
  // Use full vectors to reduce the number of stores.
  const Full128<T> d_full;
  const Vec128<T> v0{part0.raw};
  const Vec128<T> v1{part1.raw};
  const auto v10 = InterleaveLower(d_full, v0, v1);
  alignas(16) T buf[16 / sizeof(T)];
  StoreU(v10, d_full, buf);
  CopyBytes<2 * N * sizeof(T)>(buf, unaligned);
}
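// [Editorial sketch, not part of the diff] Staging through an aligned buffer
// and copying exactly 2 * N * sizeof(T) bytes is the portable idiom for
// storing less than a full vector without touching bytes past the output; a
// byte-wise model of CopyBytes<>:
inline void StorePrefix(const unsigned char* staging, size_t bytes,
                        unsigned char* out) {
  for (size_t i = 0; i < bytes; ++i) out[i] = staging[i];
}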

// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
// TableLookupBytes)

namespace detail {

// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
template <typename T, size_t N, class V, HWY_IF_LE128(T, N)>
HWY_API void StoreTransposedBlocks3(const V A, const V B, const V C,
                                    Simd<T, N, 0> d,
                                    T* HWY_RESTRICT unaligned) {
  StoreU(A, d, unaligned + 0 * N);
  StoreU(B, d, unaligned + 1 * N);
  StoreU(C, d, unaligned + 2 * N);
}

}  // namespace detail

// >= 128-bit vector, 8-bit lanes
template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 1),
          HWY_IF_GE128(T, N)>
HWY_API void StoreInterleaved3(const V v0, const V v1, const V v2,
                               Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
  const RebindToUnsigned<decltype(d)> du;
  const auto k5 = Set(du, 5);
  const auto k6 = Set(du, 6);

  // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
  // v0[5], v2[4],v1[4],v0[4] .. v2[0],v1[0],v0[0]. We're expanding v0 lanes
  // to their place, with 0x80 so lanes to be filled from other vectors are 0
  // to enable blending by ORing together.
  alignas(16) static constexpr uint8_t tbl_v0[16] = {
      0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,  //
      3, 0x80, 0x80, 4, 0x80, 0x80, 5};
  alignas(16) static constexpr uint8_t tbl_v1[16] = {
      0x80, 0, 0x80, 0x80, 1, 0x80,  //
      0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
  // The interleaved vectors will be named A, B, C; temporaries with suffix
  // 0..2 indicate which input vector's lanes they hold.
  const auto shuf_A0 = LoadDup128(du, tbl_v0);
  const auto shuf_A1 = LoadDup128(du, tbl_v1);  // cannot reuse shuf_A0 (has 5)
  const auto shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1);
  const auto A0 = TableLookupBytesOr0(v0, shuf_A0);  // 5..4..3..2..1..0
  const auto A1 = TableLookupBytesOr0(v1, shuf_A1);  // ..4..3..2..1..0.
  const auto A2 = TableLookupBytesOr0(v2, shuf_A2);  // .4..3..2..1..0..
  const V A = BitCast(d, A0 | A1 | A2);

  // B: v1[10],v0[10], v2[9],v1[9],v0[9] .. , v2[6],v1[6],v0[6], v2[5],v1[5]
  const auto shuf_B0 = shuf_A2 + k6;  // .A..9..8..7..6..
  const auto shuf_B1 = shuf_A0 + k5;  // A..9..8..7..6..5
  const auto shuf_B2 = shuf_A1 + k5;  // ..9..8..7..6..5.
  const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
  const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
  const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
  const V B = BitCast(d, B0 | B1 | B2);

  // C: v2[15],v1[15],v0[15], v2[11],v1[11],v0[11], v2[10]
  const auto shuf_C0 = shuf_B2 + k6;  // ..F..E..D..C..B.
  const auto shuf_C1 = shuf_B0 + k5;  // .F..E..D..C..B..
  const auto shuf_C2 = shuf_B1 + k5;  // F..E..D..C..B..A
  const auto C0 = TableLookupBytesOr0(v0, shuf_C0);
  const auto C1 = TableLookupBytesOr0(v1, shuf_C1);
  const auto C2 = TableLookupBytesOr0(v2, shuf_C2);
  const V C = BitCast(d, C0 | C1 | C2);

  detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
}

// >= 128-bit vector, 16-bit lanes
template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 2),
          HWY_IF_GE128(T, N)>
HWY_API void StoreInterleaved3(const V v0, const V v1, const V v2,
                               Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
  const Repartition<uint8_t, decltype(d)> du8;
  const auto k2 = Set(du8, 2 * sizeof(T));
  const auto k3 = Set(du8, 3 * sizeof(T));

  // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
  // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be
  // filled from other vectors are 0 for blending. Note that these are byte
  // indices for 16-bit lanes.
  alignas(16) static constexpr uint8_t tbl_v1[16] = {
      0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80,
      2, 3, 0x80, 0x80, 0x80, 0x80, 4, 5};
  alignas(16) static constexpr uint8_t tbl_v2[16] = {
      0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80,
      0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80};

  // The interleaved vectors will be named A, B, C; temporaries with suffix
  // 0..2 indicate which input vector's lanes they hold.
  const auto shuf_A1 = LoadDup128(du8, tbl_v1);  // 2..1..0.
  // .2..1..0
  const auto shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);
  const auto shuf_A2 = LoadDup128(du8, tbl_v2);  // ..1..0..

  const auto A0 = TableLookupBytesOr0(v0, shuf_A0);
  const auto A1 = TableLookupBytesOr0(v1, shuf_A1);
  const auto A2 = TableLookupBytesOr0(v2, shuf_A2);
  const V A = BitCast(d, A0 | A1 | A2);

  // B: v0[5] v2[4],v1[4],v0[4], v2[3],v1[3],v0[3], v2[2]
  const auto shuf_B0 = shuf_A1 + k3;  // 5..4..3.
  const auto shuf_B1 = shuf_A2 + k3;  // ..4..3..
  const auto shuf_B2 = shuf_A0 + k2;  // .4..3..2
  const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
  const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
  const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
  const V B = BitCast(d, B0 | B1 | B2);

  // C: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5]
  const auto shuf_C0 = shuf_B1 + k3;  // ..7..6..
  const auto shuf_C1 = shuf_B2 + k3;  // .7..6..5
  const auto shuf_C2 = shuf_B0 + k2;  // 7..6..5.
  const auto C0 = TableLookupBytesOr0(v0, shuf_C0);
  const auto C1 = TableLookupBytesOr0(v1, shuf_C1);
  const auto C2 = TableLookupBytesOr0(v2, shuf_C2);
  const V C = BitCast(d, C0 | C1 | C2);

  detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
}

// >= 128-bit vector, 32-bit lanes
template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 4),
          HWY_IF_GE128(T, N)>
HWY_API void StoreInterleaved3(const V v0, const V v1, const V v2,
                               Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
  const RepartitionToWide<decltype(d)> dw;

  const V v10_v00 = InterleaveLower(d, v0, v1);
  const V v01_v20 = OddEven(v0, v2);
  // A: v0[1], v2[0],v1[0],v0[0] (<- lane 0)
  const V A = BitCast(
      d, InterleaveLower(dw, BitCast(dw, v10_v00), BitCast(dw, v01_v20)));

  const V v1_321 = ShiftRightLanes<1>(d, v1);
  const V v0_32 = ShiftRightLanes<2>(d, v0);
  const V v21_v11 = OddEven(v2, v1_321);
  const V v12_v02 = OddEven(v1_321, v0_32);
  // B: v1[2],v0[2], v2[1],v1[1]
  const V B = BitCast(
      d, InterleaveLower(dw, BitCast(dw, v21_v11), BitCast(dw, v12_v02)));

  // Notation refers to the upper 2 lanes of the vector for InterleaveUpper.
  const V v23_v13 = OddEven(v2, v1_321);
  const V v03_v22 = OddEven(v0, v2);
  // C: v2[3],v1[3],v0[3], v2[2]
  const V C = BitCast(
      d, InterleaveUpper(dw, BitCast(dw, v03_v22), BitCast(dw, v23_v13)));

  detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
}

// >= 128-bit vector, 64-bit lanes
template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 8),
          HWY_IF_GE128(T, N)>
HWY_API void StoreInterleaved3(const V v0, const V v1, const V v2,
                               Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
  const V A = InterleaveLower(d, v0, v1);
  const V B = OddEven(v0, v2);
  const V C = InterleaveUpper(d, v1, v2);
  detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
}

// 64-bit vector, 8-bit lanes
template <typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_API void StoreInterleaved3(const Vec64<T> part0, const Vec64<T> part1,
                               const Vec64<T> part2, Full64<T> d,
                               T* HWY_RESTRICT unaligned) {
  constexpr size_t N = 16 / sizeof(T);
  // Use full vectors for the shuffles and first result.
  const Full128<uint8_t> du;
  const Full128<T> d_full;
  const auto k5 = Set(du, 5);
  const auto k6 = Set(du, 6);

  const Vec128<T> v0{part0.raw};
  const Vec128<T> v1{part1.raw};
  const Vec128<T> v2{part2.raw};

  // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
  // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be
  // filled from other vectors are 0 for blending.
  alignas(16) static constexpr uint8_t tbl_v0[16] = {
      0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,  //
      3, 0x80, 0x80, 4, 0x80, 0x80, 5};
  alignas(16) static constexpr uint8_t tbl_v1[16] = {
      0x80, 0, 0x80, 0x80, 1, 0x80,  //
      0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
  // The interleaved vectors will be named A, B, C; temporaries with suffix
  // 0..2 indicate which input vector's lanes they hold.
  const auto shuf_A0 = Load(du, tbl_v0);
  const auto shuf_A1 = Load(du, tbl_v1);  // cannot reuse shuf_A0 (5 in MSB)
  const auto shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1);
  const auto A0 = TableLookupBytesOr0(v0, shuf_A0);  // 5..4..3..2..1..0
  const auto A1 = TableLookupBytesOr0(v1, shuf_A1);  // ..4..3..2..1..0.
  const auto A2 = TableLookupBytesOr0(v2, shuf_A2);  // .4..3..2..1..0..
  const auto A = BitCast(d_full, A0 | A1 | A2);
  StoreU(A, d_full, unaligned + 0 * N);

  // Second (HALF) vector: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5]
  const auto shuf_B0 = shuf_A2 + k6;  // ..7..6..
  const auto shuf_B1 = shuf_A0 + k5;  // .7..6..5
  const auto shuf_B2 = shuf_A1 + k5;  // 7..6..5.
  const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
  const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
  const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
  const Vec64<T> B{(B0 | B1 | B2).raw};
  StoreU(B, d, unaligned + 1 * N);
}

// 64-bit vector, 16-bit lanes
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API void StoreInterleaved3(const Vec64<T> part0, const Vec64<T> part1,
                               const Vec64<T> part2, Full64<T> dh,
                               T* HWY_RESTRICT unaligned) {
  const Full128<T> d;
  const Full128<uint8_t> du8;
  constexpr size_t N = 16 / sizeof(T);
  const auto k2 = Set(du8, 2 * sizeof(T));
  const auto k3 = Set(du8, 3 * sizeof(T));

  const Vec128<T> v0{part0.raw};
  const Vec128<T> v1{part1.raw};
  const Vec128<T> v2{part2.raw};

  // Interleave part (v0,v1,v2) to full (MSB on left, lane 0 on right):
  // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. We're expanding v0 lanes
  // to their place, with 0x80 so lanes to be filled from other vectors are 0
  // to enable blending by ORing together.
  alignas(16) static constexpr uint8_t tbl_v1[16] = {
      0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80,
      2, 3, 0x80, 0x80, 0x80, 0x80, 4, 5};
  alignas(16) static constexpr uint8_t tbl_v2[16] = {
      0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80,
      0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80};

  // The interleaved vectors will be named A, B; temporaries with suffix
  // 0..2 indicate which input vector's lanes they hold.
  const auto shuf_A1 = Load(du8, tbl_v1);  // 2..1..0.
  // .2..1..0
  const auto shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);
  const auto shuf_A2 = Load(du8, tbl_v2);  // ..1..0..

  const auto A0 = TableLookupBytesOr0(v0, shuf_A0);
  const auto A1 = TableLookupBytesOr0(v1, shuf_A1);
  const auto A2 = TableLookupBytesOr0(v2, shuf_A2);
  const Vec128<T> A = BitCast(d, A0 | A1 | A2);
  StoreU(A, d, unaligned + 0 * N);

  // Second (HALF) vector: v2[3],v1[3],v0[3], v2[2]
  const auto shuf_B0 = shuf_A1 + k3;  // ..3.
  const auto shuf_B1 = shuf_A2 + k3;  // .3..
  const auto shuf_B2 = shuf_A0 + k2;  // 3..2
  const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
  const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
  const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
  const Vec128<T> B = BitCast(d, B0 | B1 | B2);
  StoreU(Vec64<T>{B.raw}, dh, unaligned + 1 * N);
}

// 64-bit vector, 32-bit lanes
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API void StoreInterleaved3(const Vec64<T> v0, const Vec64<T> v1,
                               const Vec64<T> v2, Full64<T> d,
                               T* HWY_RESTRICT unaligned) {
  // (same code as 128-bit vector, 64-bit lanes)
  constexpr size_t N = 2;
  const Vec64<T> v10_v00 = InterleaveLower(d, v0, v1);
  const Vec64<T> v01_v20 = OddEven(v0, v2);
  const Vec64<T> v21_v11 = InterleaveUpper(d, v1, v2);
  StoreU(v10_v00, d, unaligned + 0 * N);
  StoreU(v01_v20, d, unaligned + 1 * N);
  StoreU(v21_v11, d, unaligned + 2 * N);
}

// 64-bit lanes are handled by the N=1 case below.

// <= 32-bit vector, 8-bit lanes
template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1), HWY_IF_LE32(T, N)>
HWY_API void StoreInterleaved3(const Vec128<T, N> part0,
                               const Vec128<T, N> part1,
                               const Vec128<T, N> part2, Simd<T, N, 0> /*tag*/,
                               T* HWY_RESTRICT unaligned) {
  // Use full vectors for the shuffles and result.
  const Full128<uint8_t> du;
  const Full128<T> d_full;

  const Vec128<T> v0{part0.raw};
  const Vec128<T> v1{part1.raw};
  const Vec128<T> v2{part2.raw};

  // Interleave (v0,v1,v2). We're expanding v0 lanes to their place, with 0x80
  // so lanes to be filled from other vectors are 0 to enable blending by ORing
  // together.
  alignas(16) static constexpr uint8_t tbl_v0[16] = {
      0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80,
      0x80, 3, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
  // The interleaved vector will be named A; temporaries with suffix
  // 0..2 indicate which input vector's lanes they hold.
  const auto shuf_A0 = Load(du, tbl_v0);
  const auto shuf_A1 = CombineShiftRightBytes<15>(du, shuf_A0, shuf_A0);
  const auto shuf_A2 = CombineShiftRightBytes<14>(du, shuf_A0, shuf_A0);
  const auto A0 = TableLookupBytesOr0(v0, shuf_A0);  // ......3..2..1..0
  const auto A1 = TableLookupBytesOr0(v1, shuf_A1);  // .....3..2..1..0.
  const auto A2 = TableLookupBytesOr0(v2, shuf_A2);  // ....3..2..1..0..
  const Vec128<T> A = BitCast(d_full, A0 | A1 | A2);
  alignas(16) T buf[16 / sizeof(T)];
  StoreU(A, d_full, buf);
  CopyBytes<N * 3 * sizeof(T)>(buf, unaligned);
}

// 32-bit vector, 16-bit lanes
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API void StoreInterleaved3(const Vec128<T, 2> part0,
                               const Vec128<T, 2> part1,
                               const Vec128<T, 2> part2, Simd<T, 2, 0> /*tag*/,
                               T* HWY_RESTRICT unaligned) {
  constexpr size_t N = 4 / sizeof(T);
  // Use full vectors for the shuffles and result.
  const Full128<uint8_t> du8;
  const Full128<T> d_full;

  const Vec128<T> v0{part0.raw};
  const Vec128<T> v1{part1.raw};
  const Vec128<T> v2{part2.raw};

  // Interleave (v0,v1,v2). We're expanding v0 lanes to their place, with 0x80
  // so lanes to be filled from other vectors are 0 to enable blending by ORing
  // together.
  alignas(16) static constexpr uint8_t tbl_v2[16] = {
      0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80,
      0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80};
  // The interleaved vector will be named A; temporaries with suffix
  // 0..2 indicate which input vector's lanes they hold.
  const auto shuf_A2 =  // ..1..0..
      Load(du8, tbl_v2);
  const auto shuf_A1 =  // ...1..0.
      CombineShiftRightBytes<2>(du8, shuf_A2, shuf_A2);
  const auto shuf_A0 =  // ....1..0
      CombineShiftRightBytes<4>(du8, shuf_A2, shuf_A2);
  const auto A0 = TableLookupBytesOr0(v0, shuf_A0);  // ..1..0
  const auto A1 = TableLookupBytesOr0(v1, shuf_A1);  // .1..0.
  const auto A2 = TableLookupBytesOr0(v2, shuf_A2);  // 1..0..
  const auto A = BitCast(d_full, A0 | A1 | A2);
  alignas(16) T buf[16 / sizeof(T)];
  StoreU(A, d_full, buf);
  CopyBytes<N * 3 * sizeof(T)>(buf, unaligned);
}

// Single-element vector, any lane size: just store directly
template <typename T>
HWY_API void StoreInterleaved3(const Vec128<T, 1> v0, const Vec128<T, 1> v1,
                               const Vec128<T, 1> v2, Simd<T, 1, 0> d,
                               T* HWY_RESTRICT unaligned) {
  StoreU(v0, d, unaligned + 0);
  StoreU(v1, d, unaligned + 1);
  StoreU(v2, d, unaligned + 2);
}
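// [Editorial sketch, not part of the diff] Every StoreInterleaved3 overload
// above emits the output order of this scalar loop, e.g. three color planes
// into packed RGB; the table lookups only accelerate the byte movement.
template <typename T>
void StoreInterleaved3Scalar(const T* r, const T* g, const T* b, size_t n,
                             T* out) {
  for (size_t i = 0; i < n; ++i) {
    out[3 * i + 0] = r[i];
    out[3 * i + 1] = g[i];
    out[3 * i + 2] = b[i];
  }
}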
|
||||
|
||||
// ------------------------------ StoreInterleaved4
|
||||
|
||||
namespace detail {
|
||||
|
||||
// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
|
||||
template <typename T, size_t N, class V, HWY_IF_LE128(T, N)>
|
||||
HWY_API void StoreTransposedBlocks4(const V A, const V B, const V C, const V D,
|
||||
Simd<T, N, 0> d,
|
||||
T* HWY_RESTRICT unaligned) {
|
||||
StoreU(A, d, unaligned + 0 * N);
|
||||
StoreU(B, d, unaligned + 1 * N);
|
||||
StoreU(C, d, unaligned + 2 * N);
|
||||
StoreU(D, d, unaligned + 3 * N);
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
// >= 128-bit vector, 8..32-bit lanes
|
||||
template <typename T, size_t N, class V, HWY_IF_NOT_LANE_SIZE(T, 8),
|
||||
HWY_IF_GE128(T, N)>
|
||||
HWY_API void StoreInterleaved4(const V v0, const V v1, const V v2, const V v3,
|
||||
Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
|
||||
const RepartitionToWide<decltype(d)> dw;
|
||||
const auto v10L = ZipLower(dw, v0, v1); // .. v1[0] v0[0]
|
||||
const auto v32L = ZipLower(dw, v2, v3);
|
||||
const auto v10U = ZipUpper(dw, v0, v1);
|
||||
const auto v32U = ZipUpper(dw, v2, v3);
|
||||
// The interleaved vectors are A, B, C, D.
|
||||
const auto A = BitCast(d, InterleaveLower(dw, v10L, v32L)); // 3210
|
||||
const auto B = BitCast(d, InterleaveUpper(dw, v10L, v32L));
|
||||
const auto C = BitCast(d, InterleaveLower(dw, v10U, v32U));
|
||||
const auto D = BitCast(d, InterleaveUpper(dw, v10U, v32U));
|
||||
detail::StoreTransposedBlocks4(A, B, C, D, d, unaligned);
|
||||
}
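// Example: with 8-bit lanes, ZipLower/ZipUpper pair bytes from two inputs
// into 16-bit lanes; interleaving those pairs again forms 4-byte groups that
// hold v0[i] v1[i] v2[i] v3[i] in memory order, i.e. a 4xN transpose built
// from two zip passes.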

// >= 128-bit vector, 64-bit lanes
template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 8),
          HWY_IF_GE128(T, N)>
HWY_API void StoreInterleaved4(const V v0, const V v1, const V v2, const V v3,
                               Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
  // The interleaved vectors are A, B, C, D.
  const auto A = InterleaveLower(d, v0, v1);  // v1[0] v0[0]
  const auto B = InterleaveLower(d, v2, v3);
  const auto C = InterleaveUpper(d, v0, v1);
  const auto D = InterleaveUpper(d, v2, v3);
  detail::StoreTransposedBlocks4(A, B, C, D, d, unaligned);
}

// 64-bit vector, 8..32-bit lanes
template <typename T, HWY_IF_NOT_LANE_SIZE(T, 8)>
HWY_API void StoreInterleaved4(const Vec64<T> part0, const Vec64<T> part1,
                               const Vec64<T> part2, const Vec64<T> part3,
                               Full64<T> /*tag*/, T* HWY_RESTRICT unaligned) {
  constexpr size_t N = 16 / sizeof(T);
  // Use full vectors to reduce the number of stores.
  const Full128<T> d_full;
  const RepartitionToWide<decltype(d_full)> dw;
  const Vec128<T> v0{part0.raw};
  const Vec128<T> v1{part1.raw};
  const Vec128<T> v2{part2.raw};
  const Vec128<T> v3{part3.raw};
  const auto v10 = ZipLower(dw, v0, v1);  // v1[0] v0[0]
  const auto v32 = ZipLower(dw, v2, v3);
  const auto A = BitCast(d_full, InterleaveLower(dw, v10, v32));
  const auto B = BitCast(d_full, InterleaveUpper(dw, v10, v32));
  StoreU(A, d_full, unaligned + 0 * N);
  StoreU(B, d_full, unaligned + 1 * N);
}

// 64-bit vector, 64-bit lane
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API void StoreInterleaved4(const Vec64<T> part0, const Vec64<T> part1,
                               const Vec64<T> part2, const Vec64<T> part3,
                               Full64<T> /*tag*/, T* HWY_RESTRICT unaligned) {
  constexpr size_t N = 16 / sizeof(T);
  // Use full vectors to reduce the number of stores.
  const Full128<T> d_full;
  const Vec128<T> v0{part0.raw};
  const Vec128<T> v1{part1.raw};
  const Vec128<T> v2{part2.raw};
  const Vec128<T> v3{part3.raw};
  const auto A = InterleaveLower(d_full, v0, v1);  // v1[0] v0[0]
  const auto B = InterleaveLower(d_full, v2, v3);
  StoreU(A, d_full, unaligned + 0 * N);
  StoreU(B, d_full, unaligned + 1 * N);
}

// <= 32-bit vectors
template <typename T, size_t N, HWY_IF_LE32(T, N)>
HWY_API void StoreInterleaved4(const Vec128<T, N> part0,
                               const Vec128<T, N> part1,
                               const Vec128<T, N> part2,
                               const Vec128<T, N> part3, Simd<T, N, 0> /*tag*/,
                               T* HWY_RESTRICT unaligned) {
  // Use full vectors to reduce the number of stores.
  const Full128<T> d_full;
  const RepartitionToWide<decltype(d_full)> dw;
  const Vec128<T> v0{part0.raw};
  const Vec128<T> v1{part1.raw};
  const Vec128<T> v2{part2.raw};
  const Vec128<T> v3{part3.raw};
  const auto v10 = ZipLower(dw, v0, v1);  // .. v1[0] v0[0]
  const auto v32 = ZipLower(dw, v2, v3);
  const auto v3210 = BitCast(d_full, InterleaveLower(dw, v10, v32));
  alignas(16) T buf[16 / sizeof(T)];
  StoreU(v3210, d_full, buf);
  CopyBytes<4 * N * sizeof(T)>(buf, unaligned);
}

#endif  // HWY_NATIVE_LOAD_STORE_INTERLEAVED

// ------------------------------ AESRound

// Cannot implement on scalar: need at least 16 bytes for TableLookupBytes.

@ -281,6 +1181,7 @@ HWY_API V CLMulUpper(V a, V b) {
#define HWY_NATIVE_POPCNT
#endif

#undef HWY_MIN_POW2_FOR_128
#if HWY_TARGET == HWY_RVV
#define HWY_MIN_POW2_FOR_128 1
#else

@ -291,10 +1192,11 @@ HWY_API V CLMulUpper(V a, V b) {

// This algorithm requires vectors to be at least 16 bytes, which is the case
// for LMUL >= 2. If not, use the fallback below.
template <typename V, HWY_IF_LANES_ARE(uint8_t, V), HWY_IF_GE128_D(DFromV<V>),
          HWY_IF_POW2_GE(DFromV<V>, HWY_MIN_POW2_FOR_128)>
template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 1),
          HWY_IF_GE128_D(D), HWY_IF_POW2_GE(D, HWY_MIN_POW2_FOR_128)>
HWY_API V PopulationCount(V v) {
  const DFromV<V> d;
  static_assert(IsSame<TFromD<D>, uint8_t>(), "V must be u8");
  const D d;
  HWY_ALIGN constexpr uint8_t kLookup[16] = {
      0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
  };

@ -307,9 +1209,10 @@ HWY_API V PopulationCount(V v) {
// RVV has a specialization that avoids the Set().
#if HWY_TARGET != HWY_RVV
// Slower fallback for capped vectors.
template <typename V, HWY_IF_LANES_ARE(uint8_t, V), HWY_IF_LT128_D(DFromV<V>)>
template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 1),
          HWY_IF_LT128_D(D)>
HWY_API V PopulationCount(V v) {
  const DFromV<V> d;
  static_assert(IsSame<TFromD<D>, uint8_t>(), "V must be u8");
  const D d;
  // See https://arxiv.org/pdf/1611.07612.pdf, Figure 3
  v = Sub(v, And(ShiftRight<1>(v), Set(d, 0x55)));
  v = Add(And(ShiftRight<2>(v), Set(d, 0x33)), And(v, Set(d, 0x33)));
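  // Each byte now holds four 2-bit sums; the elided final step adds adjacent
  // 4-bit fields, so e.g. an all-ones byte (0xFF) yields 8.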

@ -317,26 +1220,29 @@ HWY_API V PopulationCount(V v) {
}
#endif  // HWY_TARGET != HWY_RVV

template <typename V, HWY_IF_LANES_ARE(uint16_t, V)>
template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 2)>
HWY_API V PopulationCount(V v) {
  const DFromV<V> d;
  static_assert(IsSame<TFromD<D>, uint16_t>(), "V must be u16");
  const D d;
  const Repartition<uint8_t, decltype(d)> d8;
  const auto vals = BitCast(d, PopulationCount(BitCast(d8, v)));
  return Add(ShiftRight<8>(vals), And(vals, Set(d, 0xFF)));
}

template <typename V, HWY_IF_LANES_ARE(uint32_t, V)>
template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 4)>
HWY_API V PopulationCount(V v) {
  const DFromV<V> d;
  static_assert(IsSame<TFromD<D>, uint32_t>(), "V must be u32");
  const D d;
  Repartition<uint16_t, decltype(d)> d16;
  auto vals = BitCast(d, PopulationCount(BitCast(d16, v)));
  return Add(ShiftRight<16>(vals), And(vals, Set(d, 0xFF)));
}

#if HWY_HAVE_INTEGER64
template <typename V, HWY_IF_LANES_ARE(uint64_t, V)>
template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 8)>
HWY_API V PopulationCount(V v) {
  const DFromV<V> d;
  static_assert(IsSame<TFromD<D>, uint64_t>(), "V must be u64");
  const D d;
  Repartition<uint32_t, decltype(d)> d32;
  auto vals = BitCast(d, PopulationCount(BitCast(d32, v)));
  return Add(ShiftRight<32>(vals), And(vals, Set(d, 0xFF)));

(Diff not shown because it is too large.)
@ -1,4 +1,5 @@
// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

@ -70,7 +71,7 @@ class Mask1 {
 public:
  static HWY_INLINE Mask1<T> FromBool(bool b) {
    Mask1<T> mask;
    mask.bits = b ? ~Raw(0) : 0;
    mask.bits = b ? static_cast<Raw>(~Raw{0}) : 0;
    return mask;
  }

@ -127,6 +128,9 @@ HWY_API Vec1<T> Iota(const Sisd<T> /* tag */, const T2 first) {
  return Vec1<T>(static_cast<T>(first));
}

template <class D>
using VFromD = decltype(Zero(D()));

// ================================================== LOGICAL

// ------------------------------ Not

@ -187,6 +191,13 @@ HWY_API Vec1<T> operator^(const Vec1<T> a, const Vec1<T> b) {
  return Xor(a, b);
}

// ------------------------------ Or3

template <typename T>
HWY_API Vec1<T> Or3(Vec1<T> o1, Vec1<T> o2, Vec1<T> o3) {
  return Or(o1, Or(o2, o3));
}

// ------------------------------ OrAnd

template <typename T>

@ -337,7 +348,8 @@ HWY_API Mask1<T> Xor(const Mask1<T> a, Mask1<T> b) {
template <int kBits, typename T>
HWY_API Vec1<T> ShiftLeft(const Vec1<T> v) {
  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
  return Vec1<T>(static_cast<hwy::MakeUnsigned<T>>(v.raw) << kBits);
  return Vec1<T>(
      static_cast<T>(static_cast<hwy::MakeUnsigned<T>>(v.raw) << kBits));
}

template <int kBits, typename T>

@ -346,7 +358,7 @@ HWY_API Vec1<T> ShiftRight(const Vec1<T> v) {
#if __cplusplus >= 202002L
  // Signed right shift is now guaranteed to be arithmetic (rounding toward
  // negative infinity, i.e. shifting in the sign bit).
  return Vec1<T>(v.raw >> kBits);
  return Vec1<T>(static_cast<T>(v.raw >> kBits));
#else
  if (IsSigned<T>()) {
    // Emulate arithmetic shift using only logical (unsigned) shifts, because

@ -355,28 +367,51 @@ HWY_API Vec1<T> ShiftRight(const Vec1<T> v) {
    const Sisd<TU> du;
    const TU shifted = BitCast(du, v).raw >> kBits;
    const TU sign = BitCast(du, BroadcastSignBit(v)).raw;
    const TU upper = sign << (sizeof(TU) * 8 - 1 - kBits);
    const size_t sign_shift =
        static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - kBits);
    const TU upper = static_cast<TU>(sign << sign_shift);
    return BitCast(Sisd<T>(), Vec1<TU>(shifted | upper));
  } else {
    return Vec1<T>(v.raw >> kBits);  // unsigned, logical shift
  } else {  // T is unsigned
    return Vec1<T>(static_cast<T>(v.raw >> kBits));
  }
#endif
}

// ------------------------------ RotateRight (ShiftRight)

namespace detail {

// For partial specialization: kBits == 0 results in an invalid shift count
template <int kBits>
struct RotateRight {
  template <typename T>
  HWY_INLINE Vec1<T> operator()(const Vec1<T> v) const {
    return Or(ShiftRight<kBits>(v), ShiftLeft<sizeof(T) * 8 - kBits>(v));
  }
};

template <>
struct RotateRight<0> {
  template <typename T>
  HWY_INLINE Vec1<T> operator()(const Vec1<T> v) const {
    return v;
  }
};

}  // namespace detail

template <int kBits, typename T>
HWY_API Vec1<T> RotateRight(const Vec1<T> v) {
  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
  if (kBits == 0) return v;
  return Or(ShiftRight<kBits>(v), ShiftLeft<sizeof(T) * 8 - kBits>(v));
  return detail::RotateRight<kBits>()(v);
}
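// Example: RotateRight<8>(Vec1<uint32_t>(0x11223344u)) yields 0x44112233; the
// kBits == 0 specialization avoids instantiating ShiftLeft with the invalid
// count sizeof(T) * 8.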

// ------------------------------ ShiftLeftSame (BroadcastSignBit)

template <typename T>
HWY_API Vec1<T> ShiftLeftSame(const Vec1<T> v, int bits) {
  return Vec1<T>(static_cast<hwy::MakeUnsigned<T>>(v.raw) << bits);
  return Vec1<T>(
      static_cast<T>(static_cast<hwy::MakeUnsigned<T>>(v.raw) << bits));
}

template <typename T>

@ -384,7 +419,7 @@ HWY_API Vec1<T> ShiftRightSame(const Vec1<T> v, int bits) {
#if __cplusplus >= 202002L
  // Signed right shift is now guaranteed to be arithmetic (rounding toward
  // negative infinity, i.e. shifting in the sign bit).
  return Vec1<T>(v.raw >> bits);
  return Vec1<T>(static_cast<T>(v.raw >> bits));
#else
  if (IsSigned<T>()) {
    // Emulate arithmetic shift using only logical (unsigned) shifts, because

@ -393,10 +428,12 @@ HWY_API Vec1<T> ShiftRightSame(const Vec1<T> v, int bits) {
    const Sisd<TU> du;
    const TU shifted = BitCast(du, v).raw >> bits;
    const TU sign = BitCast(du, BroadcastSignBit(v)).raw;
    const TU upper = sign << (sizeof(TU) * 8 - 1 - bits);
    const size_t sign_shift =
        static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - bits);
    const TU upper = static_cast<TU>(sign << sign_shift);
    return BitCast(Sisd<T>(), Vec1<TU>(shifted | upper));
  } else {
    return Vec1<T>(v.raw >> bits);  // unsigned, logical shift
  } else {  // T is unsigned
    return Vec1<T>(static_cast<T>(v.raw >> bits));
  }
#endif
}

@ -601,6 +638,10 @@ HWY_API Vec1<uint16_t> MulHigh(const Vec1<uint16_t> a, const Vec1<uint16_t> b) {
      (static_cast<uint32_t>(a.raw) * static_cast<uint32_t>(b.raw)) >> 16));
}

HWY_API Vec1<int16_t> MulFixedPoint15(Vec1<int16_t> a, Vec1<int16_t> b) {
  return Vec1<int16_t>(static_cast<int16_t>((2 * a.raw * b.raw + 32768) >> 16));
}

// Multiplies even lanes (0, 2 ..) and returns the double-wide result.
HWY_API Vec1<int64_t> MulEven(const Vec1<int32_t> a, const Vec1<int32_t> b) {
  const int64_t a64 = a.raw;

@ -684,7 +725,7 @@ HWY_API Vec1<T> Round(const Vec1<T> v) {
  const TI rounded = static_cast<TI>(v.raw + bias);
  if (rounded == 0) return CopySignToAbs(Vec1<T>(0), v);
  // Round to even
  if ((rounded & 1) && std::abs(rounded - v.raw) == T(0.5)) {
  if ((rounded & 1) && std::abs(static_cast<T>(rounded) - v.raw) == T(0.5)) {
    return Vec1<T>(static_cast<T>(rounded - (v.raw < T(0) ? -1 : 1)));
  }
  return Vec1<T>(static_cast<T>(rounded));

@ -842,6 +883,45 @@ HWY_API Mask1<T> operator>=(const Vec1<T> a, const Vec1<T> b) {
  return Mask1<T>::FromBool(a.raw >= b.raw);
}

// ------------------------------ Floating-point classification (==)

template <typename T>
HWY_API Mask1<T> IsNaN(const Vec1<T> v) {
  // std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY.
  MakeUnsigned<T> bits;
  memcpy(&bits, &v, sizeof(v));
  bits += bits;
  bits >>= 1;  // clear sign bit
  // NaN if all exponent bits are set and the mantissa is not zero.
  return Mask1<T>::FromBool(bits > ExponentMask<T>());
}
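// Example: for float, ExponentMask<float>() is 0x7F800000. A quiet NaN such
// as 0x7FC00000 still exceeds the mask after the sign bit is cleared via
// bits += bits; bits >>= 1, whereas +/-inf compares equal and is not NaN.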

HWY_API Mask1<float> IsInf(const Vec1<float> v) {
  const Sisd<float> d;
  const RebindToUnsigned<decltype(d)> du;
  const Vec1<uint32_t> vu = BitCast(du, v);
  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
  return RebindMask(d, (vu + vu) == Set(du, 0xFF000000u));
}
HWY_API Mask1<double> IsInf(const Vec1<double> v) {
  const Sisd<double> d;
  const RebindToUnsigned<decltype(d)> du;
  const Vec1<uint64_t> vu = BitCast(du, v);
  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
  return RebindMask(d, (vu + vu) == Set(du, 0xFFE0000000000000ull));
}

HWY_API Mask1<float> IsFinite(const Vec1<float> v) {
  const Vec1<uint32_t> vu = BitCast(Sisd<uint32_t>(), v);
  // Shift left to clear the sign bit, check whether exponent != max value.
  return Mask1<float>::FromBool((vu.raw << 1) < 0xFF000000u);
}
HWY_API Mask1<double> IsFinite(const Vec1<double> v) {
  const Vec1<uint64_t> vu = BitCast(Sisd<uint64_t>(), v);
  // Shift left to clear the sign bit, check whether exponent != max value.
  return Mask1<double>::FromBool((vu.raw << 1) < 0xFFE0000000000000ull);
}

// ================================================== MEMORY

// ------------------------------ Load

@ -883,20 +963,69 @@ HWY_API void StoreU(const Vec1<T> v, Sisd<T> d, T* HWY_RESTRICT p) {
  return Store(v, d, p);
}

// ------------------------------ StoreInterleaved3
template <typename T>
HWY_API void BlendedStore(const Vec1<T> v, Mask1<T> m, Sisd<T> d,
                          T* HWY_RESTRICT p) {
  if (!m.bits) return;
  StoreU(v, d, p);
}

HWY_API void StoreInterleaved3(const Vec1<uint8_t> v0, const Vec1<uint8_t> v1,
                               const Vec1<uint8_t> v2, Sisd<uint8_t> d,
                               uint8_t* HWY_RESTRICT unaligned) {
// ------------------------------ LoadInterleaved2/3/4

// Per-target flag to prevent generic_ops-inl.h from defining StoreInterleaved2.
#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#else
#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
#endif

template <typename T>
HWY_API void LoadInterleaved2(Sisd<T> d, const T* HWY_RESTRICT unaligned,
                              Vec1<T>& v0, Vec1<T>& v1) {
  v0 = LoadU(d, unaligned + 0);
  v1 = LoadU(d, unaligned + 1);
}

template <typename T>
HWY_API void LoadInterleaved3(Sisd<T> d, const T* HWY_RESTRICT unaligned,
                              Vec1<T>& v0, Vec1<T>& v1, Vec1<T>& v2) {
  v0 = LoadU(d, unaligned + 0);
  v1 = LoadU(d, unaligned + 1);
  v2 = LoadU(d, unaligned + 2);
}

template <typename T>
HWY_API void LoadInterleaved4(Sisd<T> d, const T* HWY_RESTRICT unaligned,
                              Vec1<T>& v0, Vec1<T>& v1, Vec1<T>& v2,
                              Vec1<T>& v3) {
  v0 = LoadU(d, unaligned + 0);
  v1 = LoadU(d, unaligned + 1);
  v2 = LoadU(d, unaligned + 2);
  v3 = LoadU(d, unaligned + 3);
}

// ------------------------------ StoreInterleaved2/3/4

template <typename T>
HWY_API void StoreInterleaved2(const Vec1<T> v0, const Vec1<T> v1, Sisd<T> d,
                               T* HWY_RESTRICT unaligned) {
  StoreU(v0, d, unaligned + 0);
  StoreU(v1, d, unaligned + 1);
}

template <typename T>
HWY_API void StoreInterleaved3(const Vec1<T> v0, const Vec1<T> v1,
                               const Vec1<T> v2, Sisd<T> d,
                               T* HWY_RESTRICT unaligned) {
  StoreU(v0, d, unaligned + 0);
  StoreU(v1, d, unaligned + 1);
  StoreU(v2, d, unaligned + 2);
}

HWY_API void StoreInterleaved4(const Vec1<uint8_t> v0, const Vec1<uint8_t> v1,
                               const Vec1<uint8_t> v2, const Vec1<uint8_t> v3,
                               Sisd<uint8_t> d,
                               uint8_t* HWY_RESTRICT unaligned) {
template <typename T>
HWY_API void StoreInterleaved4(const Vec1<T> v0, const Vec1<T> v1,
                               const Vec1<T> v2, const Vec1<T> v3, Sisd<T> d,
                               T* HWY_RESTRICT unaligned) {
  StoreU(v0, d, unaligned + 0);
  StoreU(v1, d, unaligned + 1);
  StoreU(v2, d, unaligned + 2);

@ -933,7 +1062,8 @@ template <typename T, typename Offset>
HWY_API Vec1<T> GatherOffset(Sisd<T> d, const T* base,
                             const Vec1<Offset> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
  const uintptr_t addr = reinterpret_cast<uintptr_t>(base) + offset.raw;
  const intptr_t addr =
      reinterpret_cast<intptr_t>(base) + static_cast<intptr_t>(offset.raw);
  return Load(d, reinterpret_cast<const T*>(addr));
}
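// Note: offsets are byte offsets and may be negative; signed intptr_t
// arithmetic means e.g. an offset of -4 with T = float addresses base[-1].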

@ -988,12 +1118,8 @@ HWY_API Vec1<ToT> DemoteTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
}

HWY_API Vec1<float> PromoteTo(Sisd<float> /* tag */, const Vec1<float16_t> v) {
#if HWY_NATIVE_FLOAT16
  uint16_t bits16;
  CopyBytes<2>(&v.raw, &bits16);
#else
  const uint16_t bits16 = v.raw.bits;
#endif
  const uint32_t sign = static_cast<uint32_t>(bits16 >> 15);
  const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
  const uint32_t mantissa = bits16 & 0x3FF;

@ -1031,12 +1157,8 @@ HWY_API Vec1<float16_t> DemoteTo(Sisd<float16_t> /* tag */,
  // Tiny or zero => zero.
  Vec1<float16_t> out;
  if (exp < -24) {
#if HWY_NATIVE_FLOAT16
    const uint16_t zero = 0;
    CopyBytes<2>(&zero, &out.raw);
#else
    out.raw.bits = 0;
#endif
    return out;
  }

@ -1059,12 +1181,8 @@ HWY_API Vec1<float16_t> DemoteTo(Sisd<float16_t> /* tag */,
  HWY_DASSERT(mantissa16 < 1024);
  const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
  HWY_DASSERT(bits16 < 0x10000);
#if HWY_NATIVE_FLOAT16
  const uint16_t narrowed = static_cast<uint16_t>(bits16);  // big-endian safe
  CopyBytes<2>(&narrowed, &out.raw);
#else
  out.raw.bits = static_cast<uint16_t>(bits16);
#endif
  return out;
}

@ -1097,6 +1215,38 @@ HWY_API Vec1<uint8_t> U8FromU32(const Vec1<uint32_t> v) {
  return DemoteTo(Sisd<uint8_t>(), v);
}

// ------------------------------ Truncations

HWY_API Vec1<uint8_t> TruncateTo(Sisd<uint8_t> /* tag */,
                                 const Vec1<uint64_t> v) {
  return Vec1<uint8_t>{static_cast<uint8_t>(v.raw & 0xFF)};
}

HWY_API Vec1<uint16_t> TruncateTo(Sisd<uint16_t> /* tag */,
                                  const Vec1<uint64_t> v) {
  return Vec1<uint16_t>{static_cast<uint16_t>(v.raw & 0xFFFF)};
}

HWY_API Vec1<uint32_t> TruncateTo(Sisd<uint32_t> /* tag */,
                                  const Vec1<uint64_t> v) {
  return Vec1<uint32_t>{static_cast<uint32_t>(v.raw & 0xFFFFFFFFu)};
}

HWY_API Vec1<uint8_t> TruncateTo(Sisd<uint8_t> /* tag */,
                                 const Vec1<uint32_t> v) {
  return Vec1<uint8_t>{static_cast<uint8_t>(v.raw & 0xFF)};
}

HWY_API Vec1<uint16_t> TruncateTo(Sisd<uint16_t> /* tag */,
                                  const Vec1<uint32_t> v) {
  return Vec1<uint16_t>{static_cast<uint16_t>(v.raw & 0xFFFF)};
}

HWY_API Vec1<uint8_t> TruncateTo(Sisd<uint8_t> /* tag */,
                                 const Vec1<uint16_t> v) {
  return Vec1<uint8_t>{static_cast<uint8_t>(v.raw & 0xFF)};
}
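// Example: TruncateTo(Sisd<uint8_t>(), Vec1<uint64_t>(0x1234)) keeps only the
// low byte, 0x34; unlike DemoteTo, no saturation is applied.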

// ================================================== COMBINE
// UpperHalf, ZeroExtendVector, Combine, Concat* are unsupported.

@ -1117,6 +1267,21 @@ HWY_API T GetLane(const Vec1<T> v) {
  return v.raw;
}

template <typename T>
HWY_API T ExtractLane(const Vec1<T> v, size_t i) {
  HWY_DASSERT(i == 0);
  (void)i;
  return v.raw;
}

template <typename T>
HWY_API Vec1<T> InsertLane(Vec1<T> v, size_t i, T t) {
  HWY_DASSERT(i == 0);
  (void)i;
  v.raw = t;
  return v;
}

template <typename T>
HWY_API Vec1<T> DupEven(Vec1<T> v) {
  return v;

@ -1157,7 +1322,7 @@ HWY_API Indices1<T> IndicesFromVec(Sisd<T>, Vec1<TI> vec) {

template <typename T, typename TI>
HWY_API Indices1<T> SetTableIndices(Sisd<T> d, const TI* idx) {
  return IndicesFromVec(d, LoadU(idx));
  return IndicesFromVec(d, LoadU(Sisd<TI>(), idx));
}

template <typename T>

@ -1180,6 +1345,7 @@ HWY_API Vec1<T> Reverse(Sisd<T> /* tag */, const Vec1<T> v) {
  return v;
}

// Must not be called:
template <typename T>
HWY_API Vec1<T> Reverse2(Sisd<T> /* tag */, const Vec1<T> v) {
  return v;

@ -1304,19 +1470,24 @@ HWY_API intptr_t FindFirstTrue(Sisd<T> /* tag */, const Mask1<T> mask) {

// ------------------------------ Compress, CompressBits

template <typename T>
struct CompressIsPartition {
  enum { value = 1 };
};
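// value = 1 indicates that Compress places the lanes not selected by the mask
// after the selected ones (a partition) instead of leaving them undefined;
// this holds trivially for a single lane.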

template <typename T>
HWY_API Vec1<T> Compress(Vec1<T> v, const Mask1<T> /* mask */) {
  // Upper lanes are undefined, so result is the same independent of mask.
  // A single lane is already partitioned by definition.
  return v;
}

template <typename T>
HWY_API Vec1<T> Compress(Vec1<T> v, const uint8_t* HWY_RESTRICT /* bits */) {
HWY_API Vec1<T> CompressNot(Vec1<T> v, const Mask1<T> /* mask */) {
  // A single lane is already partitioned by definition.
  return v;
}

// ------------------------------ CompressStore

template <typename T>
HWY_API size_t CompressStore(Vec1<T> v, const Mask1<T> mask, Sisd<T> d,
                             T* HWY_RESTRICT unaligned) {

@ -1325,7 +1496,6 @@ HWY_API size_t CompressStore(Vec1<T> v, const Mask1<T> mask, Sisd<T> d,
}

// ------------------------------ CompressBlendedStore

template <typename T>
HWY_API size_t CompressBlendedStore(Vec1<T> v, const Mask1<T> mask, Sisd<T> d,
                                    T* HWY_RESTRICT unaligned) {

@ -1334,8 +1504,13 @@ HWY_API size_t CompressBlendedStore(Vec1<T> v, const Mask1<T> mask, Sisd<T> d,
  return 1;
}

// ------------------------------ CompressBitsStore
// ------------------------------ CompressBits
template <typename T>
HWY_API Vec1<T> CompressBits(Vec1<T> v, const uint8_t* HWY_RESTRICT /*bits*/) {
  return v;
}

// ------------------------------ CompressBitsStore
template <typename T>
HWY_API size_t CompressBitsStore(Vec1<T> v, const uint8_t* HWY_RESTRICT bits,
                                 Sisd<T> d, T* HWY_RESTRICT unaligned) {
@ -1,4 +1,5 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

@ -36,6 +37,8 @@
#undef HWY_HAVE_INTEGER64
#undef HWY_HAVE_FLOAT16
#undef HWY_HAVE_FLOAT64
#undef HWY_MEM_OPS_MIGHT_FAULT
#undef HWY_NATIVE_FMA
#undef HWY_CAP_GE256
#undef HWY_CAP_GE512

@ -71,6 +74,7 @@

// Before include guard so we redefine HWY_TARGET_STR on each include,
// governed by the current HWY_TARGET.

//-----------------------------------------------------------------------------
// SSSE3
#if HWY_TARGET == HWY_SSSE3

@ -84,11 +88,13 @@
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 1
#define HWY_CAP_AES 0
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

#define HWY_TARGET_STR HWY_TARGET_STR_SSSE3

//-----------------------------------------------------------------------------
// SSE4
#elif HWY_TARGET == HWY_SSE4

@ -102,6 +108,8 @@
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

@ -120,6 +128,14 @@
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 1

#ifdef HWY_DISABLE_BMI2_FMA
#define HWY_NATIVE_FMA 0
#else
#define HWY_NATIVE_FMA 1
#endif

#define HWY_CAP_GE256 1
#define HWY_CAP_GE512 0

@ -137,6 +153,8 @@
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 0
#define HWY_NATIVE_FMA 1
#define HWY_CAP_GE256 1
#define HWY_CAP_GE512 1

@ -148,9 +166,10 @@
#elif HWY_TARGET == HWY_AVX3_DL

#define HWY_NAMESPACE N_AVX3_DL
#define HWY_TARGET_STR \
  HWY_TARGET_STR_AVX3 \
  ",vpclmulqdq,avx512vbmi2,vaes,avxvnni,avx512bitalg,avx512vpopcntdq"
#define HWY_TARGET_STR \
  HWY_TARGET_STR_AVX3 \
  ",vpclmulqdq,avx512vbmi,avx512vbmi2,vaes,avxvnni,avx512bitalg," \
  "avx512vpopcntdq"

#else
#error "Logic error"

@ -168,6 +187,8 @@
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 0
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

@ -186,8 +207,6 @@
#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

#if HWY_ARCH_ARM_A64
#define HWY_HAVE_FLOAT64 1

@ -195,19 +214,38 @@
#define HWY_HAVE_FLOAT64 0
#endif

#define HWY_MEM_OPS_MIGHT_FAULT 1

#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
#define HWY_NATIVE_FMA 1
#else
#define HWY_NATIVE_FMA 0
#endif

#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

#define HWY_NAMESPACE N_NEON

// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
// Can use pragmas instead of -march compiler flag
#if HWY_HAVE_RUNTIME_DISPATCH
#if HWY_ARCH_ARM_V7
#define HWY_TARGET_STR "+neon-vfpv4"
#else
#define HWY_TARGET_STR "+crypto"
#endif  // HWY_ARCH_ARM_V7
#else
// HWY_TARGET_STR remains undefined
#endif

//-----------------------------------------------------------------------------
// SVE[2]
#elif HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE
#elif HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE || \
    HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128

// SVE only requires lane alignment, not natural alignment of the entire vector.
#define HWY_ALIGN alignas(8)

#define HWY_MAX_BYTES 256

// Value ensures MaxLanes() is the tightest possible upper bound to reduce
// overallocation.
#define HWY_LANES(T) ((HWY_MAX_BYTES) / sizeof(T))

@ -216,16 +254,35 @@
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 0
#define HWY_NATIVE_FMA 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

#if HWY_TARGET == HWY_SVE2
#define HWY_NAMESPACE N_SVE2
#define HWY_MAX_BYTES 256
#elif HWY_TARGET == HWY_SVE_256
#define HWY_NAMESPACE N_SVE_256
#define HWY_MAX_BYTES 32
#elif HWY_TARGET == HWY_SVE2_128
#define HWY_NAMESPACE N_SVE2_128
#define HWY_MAX_BYTES 16
#else
#define HWY_NAMESPACE N_SVE
#define HWY_MAX_BYTES 256
#endif

// Can use pragmas instead of -march compiler flag
#if HWY_HAVE_RUNTIME_DISPATCH
#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
#define HWY_TARGET_STR "+sve2-aes"
#else
#define HWY_TARGET_STR "+sve"
#endif
#else
// HWY_TARGET_STR remains undefined
#endif

//-----------------------------------------------------------------------------
// WASM

@ -239,6 +296,8 @@
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 0
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

@ -247,8 +306,8 @@
#define HWY_TARGET_STR "simd128"

//-----------------------------------------------------------------------------
// WASM2
#elif HWY_TARGET == HWY_WASM2
// WASM_EMU256
#elif HWY_TARGET == HWY_WASM_EMU256

#define HWY_ALIGN alignas(32)
#define HWY_MAX_BYTES 32

@ -258,10 +317,12 @@
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 0
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

#define HWY_NAMESPACE N_WASM2
#define HWY_NAMESPACE N_WASM_EMU256

#define HWY_TARGET_STR "simd128"

@ -283,10 +344,12 @@
#define HWY_HAVE_SCALABLE 1
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 0
#define HWY_NATIVE_FMA 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

#if defined(__riscv_zfh)
#if defined(__riscv_zvfh)
#define HWY_HAVE_FLOAT16 1
#else
#define HWY_HAVE_FLOAT16 0

@ -297,6 +360,27 @@
// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
// (rv64gcv is not a valid target)

//-----------------------------------------------------------------------------
// EMU128
#elif HWY_TARGET == HWY_EMU128

#define HWY_ALIGN alignas(16)
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 1
#define HWY_NATIVE_FMA 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

#define HWY_NAMESPACE N_EMU128

// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.

//-----------------------------------------------------------------------------
// SCALAR
#elif HWY_TARGET == HWY_SCALAR

@ -309,6 +393,8 @@
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 1
#define HWY_MEM_OPS_MIGHT_FAULT 0
#define HWY_NATIVE_FMA 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

@ -320,6 +406,12 @@
#pragma message("HWY_TARGET does not match any known target")
#endif  // HWY_TARGET

// Override this to 1 in asan/msan builds, which will still fault.
#if HWY_IS_ASAN || HWY_IS_MSAN
#undef HWY_MEM_OPS_MIGHT_FAULT
#define HWY_MEM_OPS_MIGHT_FAULT 1
#endif

// Clang <9 requires this be invoked at file scope, before any namespace.
#undef HWY_BEFORE_NAMESPACE
#if defined(HWY_TARGET_STR)

@ -1,4 +1,5 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

@ -98,21 +99,21 @@ struct Simd {

namespace detail {

#if HWY_HAVE_SCALABLE

template <typename T, size_t N, int kPow2>
constexpr bool IsFull(Simd<T, N, kPow2> /* d */) {
  return N == HWY_LANES(T) && kPow2 == 0;
}

#endif

// Returns the number of lanes (possibly zero) after applying a shift:
// - 0: no change;
// - [1,3]: a group of 2,4,8 [fractional] vectors;
// - [-3,-1]: a fraction of a vector from 1/8 to 1/2.
constexpr size_t ScaleByPower(size_t N, int pow2) {
#if HWY_TARGET == HWY_RVV
  return pow2 >= 0 ? (N << pow2) : (N >> (-pow2));
#else
  return pow2 >= 0 ? N : (N >> (-pow2));
#endif
}
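// Example: on RVV, ScaleByPower(4, 2) == 16 (register grouping), whereas
// other targets ignore positive pow2; ScaleByPower(16, -2) == 4 everywhere
// (a quarter vector).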

// Struct wrappers enable validation of arguments via static_assert.

@ -136,17 +137,16 @@ struct ScalableTagChecker {
template <typename T, size_t kLimit>
struct CappedTagChecker {
  static_assert(kLimit != 0, "Does not make sense to have zero lanes");
  using type = Simd<T, HWY_MIN(kLimit, HWY_MAX_BYTES / sizeof(T)), 0>;
  // Safely handle non-power-of-two inputs by rounding down, which is allowed by
  // CappedTag. Otherwise, Simd<T, 3, 0> would static_assert.
  static constexpr size_t kLimitPow2 = size_t{1} << hwy::FloorLog2(kLimit);
  using type = Simd<T, HWY_MIN(kLimitPow2, HWY_LANES(T)), 0>;
};
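// Example: CappedTag<float, 7> rounds the limit down to 4 lanes because
// FloorLog2(7) == 2 and lane counts must be powers of two.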

template <typename T, size_t kNumLanes>
struct FixedTagChecker {
  static_assert(kNumLanes != 0, "Does not make sense to have zero lanes");
  static_assert(kNumLanes * sizeof(T) <= HWY_MAX_BYTES, "Too many lanes");
#if HWY_TARGET == HWY_SCALAR
  // HWY_MAX_BYTES would still allow uint8x8, which is not supported.
  static_assert(kNumLanes == 1, "Scalar only supports one lane");
#endif
  static_assert(kNumLanes <= HWY_LANES(T), "Too many lanes");
  using type = Simd<T, kNumLanes, 0>;
};

@ -172,8 +172,8 @@ template <typename T, size_t kLimit>
using CappedTag = typename detail::CappedTagChecker<T, kLimit>::type;

// Alias for a tag describing a vector with *exactly* kNumLanes active lanes,
// even on targets with scalable vectors. HWY_SCALAR only supports one lane.
// All other targets allow kNumLanes up to HWY_MAX_BYTES / sizeof(T).
// even on targets with scalable vectors. Requires `kNumLanes` to be a power of
// two not exceeding `HWY_LANES(T)`.
//
// NOTE: if the application does not need to support HWY_SCALAR (+), use this
// instead of CappedTag to emphasize that there will be exactly kNumLanes lanes.

@ -218,6 +218,15 @@ using Half = typename D::Half;
template <class D>
using Twice = typename D::Twice;

template <typename T>
using Full32 = Simd<T, 4 / sizeof(T), 0>;

template <typename T>
using Full64 = Simd<T, 8 / sizeof(T), 0>;

template <typename T>
using Full128 = Simd<T, 16 / sizeof(T), 0>;

// Same as base.h macros but with a Simd<T, N, kPow2> argument instead of T.
#define HWY_IF_UNSIGNED_D(D) HWY_IF_UNSIGNED(TFromD<D>)
#define HWY_IF_SIGNED_D(D) HWY_IF_SIGNED(TFromD<D>)

@ -232,15 +241,12 @@ using Twice = typename D::Twice;
#define HWY_IF_GE128_D(D) \
  hwy::EnableIf<D::kPrivateN * sizeof(TFromD<D>) >= 16>* = nullptr

// Same, but with a vector argument.
// Same, but with a vector argument. ops/*-inl.h define their own TFromV.
#define HWY_IF_UNSIGNED_V(V) HWY_IF_UNSIGNED(TFromV<V>)
#define HWY_IF_SIGNED_V(V) HWY_IF_SIGNED(TFromV<V>)
#define HWY_IF_FLOAT_V(V) HWY_IF_FLOAT(TFromV<V>)
#define HWY_IF_LANE_SIZE_V(V, bytes) HWY_IF_LANE_SIZE(TFromV<V>, bytes)

// For implementing functions for a specific type.
// IsSame<...>() in template arguments is broken on MSVC2015.
#define HWY_IF_LANES_ARE(T, V) EnableIf<IsSameT<T, TFromV<V>>::value>* = nullptr
#define HWY_IF_NOT_LANE_SIZE_V(V, bytes) HWY_IF_NOT_LANE_SIZE(TFromV<V>, bytes)

template <class D>
HWY_INLINE HWY_MAYBE_UNUSED constexpr int Pow2(D /* d */) {

@ -291,8 +297,7 @@ HWY_INLINE HWY_MAYBE_UNUSED size_t Lanes(Simd<T, N, kPow2>) {
// We therefore pass by const& only on GCC and (Windows or ARM64). This alias
// must be used for all vector/mask parameters of functions marked HWY_NOINLINE,
// and possibly also other functions that are not inlined.
#if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG && \
    ((defined(_WIN32) || defined(_WIN64)) || HWY_ARCH_ARM_A64)
#if HWY_COMPILER_GCC_ACTUAL && (HWY_OS_WIN || HWY_ARCH_ARM_A64)
template <class V>
using VecArg = const V&;
#else

(Diff not shown because it is too large.)
@ -1,4 +1,5 @@
// Copyright 2021 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

@ -590,6 +591,10 @@ HWY_API Vec256<int16_t> MulHigh(const Vec256<int16_t> a,
  return Vec256<int16_t>{wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
}

HWY_API Vec256<int16_t> MulFixedPoint15(Vec256<int16_t>, Vec256<int16_t>) {
  HWY_ASSERT(0);
}

// Multiplies even lanes (0, 2 ..) and returns the double-width result.
HWY_API Vec256<int64_t> MulEven(const Vec256<int32_t> a,
                                const Vec256<int32_t> b) {

@ -717,6 +722,37 @@ HWY_API Vec256<float> Floor(const Vec256<float> v) {
  return Vec256<float>{wasm_f32x4_floor(v.raw)};
}

// ------------------------------ Floating-point classification

template <typename T>
HWY_API Mask256<T> IsNaN(const Vec256<T> v) {
  return v != v;
}

template <typename T, HWY_IF_FLOAT(T)>
HWY_API Mask256<T> IsInf(const Vec256<T> v) {
  const Full256<T> d;
  const RebindToSigned<decltype(d)> di;
  const VFromD<decltype(di)> vi = BitCast(di, v);
  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
  return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
}

// Returns whether normal/subnormal/zero.
template <typename T, HWY_IF_FLOAT(T)>
HWY_API Mask256<T> IsFinite(const Vec256<T> v) {
  const Full256<T> d;
  const RebindToUnsigned<decltype(d)> du;
  const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
  const VFromD<decltype(du)> vu = BitCast(du, v);
  // 'Shift left' to clear the sign bit, then right so we can compare with the
  // max exponent (cannot compare with MaxExponentTimes2 directly because it is
  // negative and non-negative floats would be greater).
  const VFromD<decltype(di)> exp =
      BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
  return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
}

// ================================================== COMPARE

// Comparisons fill a lane with 1-bits if the condition is true, else 0.

@ -910,6 +946,13 @@ HWY_API Vec256<T> Xor(Vec256<T> a, Vec256<T> b) {
  return Vec256<T>{wasm_v128_xor(a.raw, b.raw)};
}

// ------------------------------ Or3

template <typename T>
HWY_API Vec256<T> Or3(Vec256<T> o1, Vec256<T> o2, Vec256<T> o3) {
  return Or(o1, Or(o2, o3));
}

// ------------------------------ OrAnd

template <typename T>

@ -1201,6 +1244,12 @@ HWY_API void StoreU(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT p) {
  Store(v, d, p);
}

template <typename T>
HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
                          T* HWY_RESTRICT p) {
  StoreU(IfThenElse(m, v, LoadU(d, p)), d, p);
}

// ------------------------------ Non-temporal stores

// Same as aligned stores on non-x86.

@ -1281,8 +1330,19 @@ HWY_API Vec256<T> GatherIndex(const Full256<T> d, const T* HWY_RESTRICT base,

// ================================================== SWIZZLE

// ------------------------------ Extract lane
// ------------------------------ ExtractLane
template <typename T, size_t N>
HWY_API T ExtractLane(const Vec128<T, N> v, size_t i) {
  HWY_ASSERT(0);
}

// ------------------------------ InsertLane
template <typename T, size_t N>
HWY_API Vec128<T, N> InsertLane(const Vec128<T, N> v, size_t i, T t) {
  HWY_ASSERT(0);
}

// ------------------------------ GetLane
// Gets the single value stored in a vector/part.
HWY_API uint8_t GetLane(const Vec256<uint8_t> v) {
  return wasm_i8x16_extract_lane(v.raw, 0);

@ -2244,6 +2304,50 @@ HWY_API Vec256<uint8_t> U8FromU32(const Vec256<uint32_t> v) {
  return Vec256<uint8_t>{wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
}

// ------------------------------ Truncations

HWY_API Vec256<uint8_t, 4> TruncateTo(Simd<uint8_t, 4, 0> /* tag */,
                                      const Vec256<uint64_t> v) {
  return Vec256<uint8_t, 4>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 8, 16, 24,
                                               0, 8, 16, 24, 0, 8, 16, 24, 0, 8,
                                               16, 24)};
}

HWY_API Vec256<uint16_t, 4> TruncateTo(Simd<uint16_t, 4, 0> /* tag */,
                                       const Vec256<uint64_t> v) {
  return Vec256<uint16_t, 4>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 1, 8, 9,
                                                16, 17, 24, 25, 0, 1, 8, 9, 16,
                                                17, 24, 25)};
}

HWY_API Vec256<uint32_t, 4> TruncateTo(Simd<uint32_t, 4, 0> /* tag */,
                                       const Vec256<uint64_t> v) {
  return Vec256<uint32_t, 4>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 1, 2, 3,
                                                8, 9, 10, 11, 16, 17, 18, 19,
                                                24, 25, 26, 27)};
}

HWY_API Vec256<uint8_t, 8> TruncateTo(Simd<uint8_t, 8, 0> /* tag */,
                                      const Vec256<uint32_t> v) {
  return Vec256<uint8_t, 8>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 4, 8, 12,
                                               16, 20, 24, 28, 0, 4, 8, 12, 16,
                                               20, 24, 28)};
}

HWY_API Vec256<uint16_t, 8> TruncateTo(Simd<uint16_t, 8, 0> /* tag */,
                                       const Vec256<uint32_t> v) {
  return Vec256<uint16_t, 8>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 1, 4, 5,
                                                8, 9, 12, 13, 16, 17, 20, 21,
                                                24, 25, 28, 29)};
}

HWY_API Vec256<uint8_t, 16> TruncateTo(Simd<uint8_t, 16, 0> /* tag */,
                                       const Vec256<uint16_t> v) {
  return Vec256<uint8_t, 16>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 2, 4, 6,
                                                8, 10, 12, 14, 16, 18, 20, 22,
                                                24, 26, 28, 30)};
}
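// In the shuffles above, indices 0..15 select bytes from the first 128-bit
// half (v0) and 16..31 from the second (v1); keeping only the low bytes of
// each wide lane implements the truncation.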

// ------------------------------ Convert i32 <=> f32 (Round)

HWY_API Vec256<float> ConvertTo(Full256<float> /* tag */,

@ -2687,12 +2791,29 @@ HWY_INLINE Vec256<uint64_t> Compress(hwy::SizeTag<8> /*tag*/,

}  // namespace detail

template <typename T>
struct CompressIsPartition {
  enum { value = 1 };
};

template <typename T>
HWY_API Vec256<T> Compress(Vec256<T> v, const Mask256<T> mask) {
  const uint64_t mask_bits = detail::BitsFromMask(mask);
  return detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
}

// ------------------------------ CompressNot
template <typename T>
HWY_API Vec256<T> CompressNot(Vec256<T> v, const Mask256<T> mask) {
  return Compress(v, Not(mask));
}

// ------------------------------ CompressBlocksNot
HWY_API Vec256<uint64_t> CompressBlocksNot(Vec256<uint64_t> v,
                                           Mask256<uint64_t> mask) {
  HWY_ASSERT(0);
}

// ------------------------------ CompressBits

template <typename T>

@ -2750,76 +2871,10 @@ HWY_API size_t CompressBitsStore(Vec256<T> v, const uint8_t* HWY_RESTRICT bits,
  return PopCount(mask_bits);
}

// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
// TableLookupBytes)
// ------------------------------ StoreInterleaved2/3/4

HWY_API void StoreInterleaved3(const Vec256<uint8_t> a, const Vec256<uint8_t> b,
                               const Vec256<uint8_t> c, Full256<uint8_t> d,
                               uint8_t* HWY_RESTRICT unaligned) {
  const auto k5 = Set(d, 5);
  const auto k6 = Set(d, 6);

  // Shuffle (a,b,c) vector bytes to (MSB on left): r5, bgr[4:0].
  // 0x80 so lanes to be filled from other vectors are 0 for blending.
  alignas(32) static constexpr uint8_t tbl_r0[16] = {
      0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,  //
      3, 0x80, 0x80, 4, 0x80, 0x80, 5};
  alignas(32) static constexpr uint8_t tbl_g0[16] = {
      0x80, 0, 0x80, 0x80, 1, 0x80,  //
      0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
  const auto shuf_r0 = Load(d, tbl_r0);
  const auto shuf_g0 = Load(d, tbl_g0);  // cannot reuse r0 due to 5 in MSB
  const auto shuf_b0 = CombineShiftRightBytes<15>(d, shuf_g0, shuf_g0);
  const auto r0 = TableLookupBytes(a, shuf_r0);  // 5..4..3..2..1..0
  const auto g0 = TableLookupBytes(b, shuf_g0);  // ..4..3..2..1..0.
  const auto b0 = TableLookupBytes(c, shuf_b0);  // .4..3..2..1..0..
  const auto int0 = r0 | g0 | b0;
  StoreU(int0, d, unaligned + 0 * 16);

  // Second vector: g10,r10, bgr[9:6], b5,g5
  const auto shuf_r1 = shuf_b0 + k6;  // .A..9..8..7..6..
  const auto shuf_g1 = shuf_r0 + k5;  // A..9..8..7..6..5
  const auto shuf_b1 = shuf_g0 + k5;  // ..9..8..7..6..5.
  const auto r1 = TableLookupBytes(a, shuf_r1);
  const auto g1 = TableLookupBytes(b, shuf_g1);
  const auto b1 = TableLookupBytes(c, shuf_b1);
  const auto int1 = r1 | g1 | b1;
  StoreU(int1, d, unaligned + 1 * 16);

  // Third vector: bgr[15:11], b10
  const auto shuf_r2 = shuf_b1 + k6;  // ..F..E..D..C..B.
  const auto shuf_g2 = shuf_r1 + k5;  // .F..E..D..C..B..
  const auto shuf_b2 = shuf_g1 + k5;  // F..E..D..C..B..A
  const auto r2 = TableLookupBytes(a, shuf_r2);
  const auto g2 = TableLookupBytes(b, shuf_g2);
  const auto b2 = TableLookupBytes(c, shuf_b2);
  const auto int2 = r2 | g2 | b2;
  StoreU(int2, d, unaligned + 2 * 16);
}

// ------------------------------ StoreInterleaved4

HWY_API void StoreInterleaved4(const Vec256<uint8_t> v0,
                               const Vec256<uint8_t> v1,
                               const Vec256<uint8_t> v2,
                               const Vec256<uint8_t> v3, Full256<uint8_t> d8,
                               uint8_t* HWY_RESTRICT unaligned) {
  const RepartitionToWide<decltype(d8)> d16;
  const RepartitionToWide<decltype(d16)> d32;
  // let a,b,c,d denote v0..3.
  const auto ba0 = ZipLower(d16, v0, v1);  // b7 a7 .. b0 a0
  const auto dc0 = ZipLower(d16, v2, v3);  // d7 c7 .. d0 c0
  const auto ba8 = ZipUpper(d16, v0, v1);
  const auto dc8 = ZipUpper(d16, v2, v3);
  const auto dcba_0 = ZipLower(d32, ba0, dc0);  // d..a3 d..a0
  const auto dcba_4 = ZipUpper(d32, ba0, dc0);  // d..a7 d..a4
  const auto dcba_8 = ZipLower(d32, ba8, dc8);  // d..aB d..a8
  const auto dcba_C = ZipUpper(d32, ba8, dc8);  // d..aF d..aC
  StoreU(BitCast(d8, dcba_0), d8, unaligned + 0 * 16);
  StoreU(BitCast(d8, dcba_4), d8, unaligned + 1 * 16);
  StoreU(BitCast(d8, dcba_8), d8, unaligned + 2 * 16);
  StoreU(BitCast(d8, dcba_C), d8, unaligned + 3 * 16);
}
// HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in
// generic_ops-inl.h.

// ------------------------------ MulEven/Odd (Load)

@ -2952,12 +3007,27 @@ HWY_API Vec256<T> MaxOfLanes(Full256<T> /* tag */, const Vec256<T> v) {
template <typename T>
HWY_INLINE Mask256<T> Lt128(Full256<T> d, Vec256<T> a, Vec256<T> b) {}

template <typename T>
HWY_INLINE Mask256<T> Lt128Upper(Full256<T> d, Vec256<T> a, Vec256<T> b) {}

template <typename T>
HWY_INLINE Mask256<T> Eq128(Full256<T> d, Vec256<T> a, Vec256<T> b) {}

template <typename T>
HWY_INLINE Mask256<T> Eq128Upper(Full256<T> d, Vec256<T> a, Vec256<T> b) {}

template <typename T>
HWY_INLINE Vec256<T> Min128(Full256<T> d, Vec256<T> a, Vec256<T> b) {}

template <typename T>
HWY_INLINE Vec256<T> Max128(Full256<T> d, Vec256<T> a, Vec256<T> b) {}

template <typename T>
HWY_INLINE Vec256<T> Min128Upper(Full256<T> d, Vec256<T> a, Vec256<T> b) {}

template <typename T>
HWY_INLINE Vec256<T> Max128Upper(Full256<T> d, Vec256<T> a, Vec256<T> b) {}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
(Diff not shown because it is too large.)
(Diff not shown because it is too large.)
(Diff not shown because it is too large.)

@ -0,0 +1,50 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/per_target.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/per_target.cc"
#include "hwy/foreach_target.h"  // IWYU pragma: keep
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// On SVE, Lanes rounds down to a power of two, but we want to know the actual
// size here. Otherwise, hypothetical SVE with 48 bytes would round down to 32
// and we'd enable HWY_SVE_256, and then fail reverse_test because Reverse on
// HWY_SVE_256 requires the actual vector to be a power of two.
#if HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE_256
size_t GetVectorBytes() { return detail::AllHardwareLanes(hwy::SizeTag<1>()); }
#else
size_t GetVectorBytes() { return Lanes(ScalableTag<uint8_t>()); }
#endif
// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE

}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(GetVectorBytes);  // Local function.
}  // namespace

size_t VectorBytes() { return HWY_DYNAMIC_DISPATCH(GetVectorBytes)(); }
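// Example usage: `const size_t bytes = hwy::VectorBytes();`. Re-query after
// DisableTargets or an SME transition instead of caching the result.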
|
||||
|
||||
} // namespace hwy
|
||||
#endif // HWY_ONCE
|
|
@@ -0,0 +1,37 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef HIGHWAY_HWY_PER_TARGET_H_
#define HIGHWAY_HWY_PER_TARGET_H_

#include <stddef.h>

// Per-target functions.

namespace hwy {

// Returns size in bytes of a vector, i.e. `Lanes(ScalableTag<uint8_t>())`.
//
// Do not cache the result, which may change after calling DisableTargets, or
// if software requests a different vector size (e.g. when entering/exiting SME
// streaming mode). Instead call this right before the code that depends on the
// result, without any DisableTargets or SME transition in-between. Note that
// this involves an indirect call, so prefer not to call this frequently nor
// unnecessarily.
size_t VectorBytes();

}  // namespace hwy

#endif  // HIGHWAY_HWY_PER_TARGET_H_
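A minimal usage sketch for the declaration above; per the header comment, the value is queried immediately before use rather than cached:

#include <stdio.h>

#include "hwy/per_target.h"

int main() {
  // Query right before use; DisableTargets or an SME streaming-mode switch
  // could change the answer, so callers should not cache it.
  printf("vector width: %zu bytes\n", hwy::VectorBytes());
  return 0;
}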
@@ -0,0 +1,56 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Print() function

#include <inttypes.h>
#include <stdint.h>

#include "hwy/aligned_allocator.h"
#include "hwy/highway.h"
#include "hwy/print.h"

// Per-target include guard
#if defined(HIGHWAY_HWY_PRINT_INL_H_) == \
    defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_PRINT_INL_H_
#undef HIGHWAY_HWY_PRINT_INL_H_
#else
#define HIGHWAY_HWY_PRINT_INL_H_
#endif

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// Prints lanes around `lane`, in memory order.
template <class D, class V = Vec<D>>
void Print(const D d, const char* caption, VecArg<V> v, size_t lane_u = 0,
           size_t max_lanes = 7) {
  const size_t N = Lanes(d);
  using T = TFromD<D>;
  auto lanes = AllocateAligned<T>(N);
  Store(v, d, lanes.get());

  const auto info = hwy::detail::MakeTypeInfo<T>();
  hwy::detail::PrintArray(info, caption, lanes.get(), N, lane_u, max_lanes);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#endif  // per-target include guard
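A sketch of calling Print from static-dispatch code (assumption: the static target suffices, so no foreach_target pass is needed; hn aliases the target namespace):

#include "hwy/highway.h"
#include "hwy/print-inl.h"

namespace hn = hwy::HWY_NAMESPACE;

void DemoPrint() {
  const hn::ScalableTag<int32_t> d;
  // On a 128-bit target this prints roughly:
  //   i32x4 iota [0+ ->]:
  //     0,1,2,3,
  hn::Print(d, "iota", hn::Iota(d, 0));
}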
@@ -0,0 +1,107 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/print.h"

#include <inttypes.h>
#include <stddef.h>
#include <stdio.h>

#include "hwy/base.h"

namespace hwy {
namespace detail {

HWY_DLLEXPORT void TypeName(const TypeInfo& info, size_t N, char* string100) {
  const char prefix = info.is_float ? 'f' : (info.is_signed ? 'i' : 'u');
  // Omit the xN suffix for scalars.
  if (N == 1) {
    // NOLINTNEXTLINE
    snprintf(string100, 64, "%c%d", prefix,
             static_cast<int>(info.sizeof_t * 8));
  } else {
    // NOLINTNEXTLINE
    snprintf(string100, 64, "%c%dx%d", prefix,
             static_cast<int>(info.sizeof_t * 8), static_cast<int>(N));
  }
}

HWY_DLLEXPORT void ToString(const TypeInfo& info, const void* ptr,
                            char* string100) {
  if (info.sizeof_t == 1) {
    uint8_t byte;
    CopyBytes<1>(ptr, &byte);  // endian-safe: we ensured sizeof(T)=1.
    snprintf(string100, 100, "0x%02X", byte);  // NOLINT
  } else if (info.sizeof_t == 2) {
    uint16_t bits;
    CopyBytes<2>(ptr, &bits);
    snprintf(string100, 100, "0x%04X", bits);  // NOLINT
  } else if (info.sizeof_t == 4) {
    if (info.is_float) {
      float value;
      CopyBytes<4>(ptr, &value);
      snprintf(string100, 100, "%g", static_cast<double>(value));  // NOLINT
    } else if (info.is_signed) {
      int32_t value;
      CopyBytes<4>(ptr, &value);
      snprintf(string100, 100, "%d", value);  // NOLINT
    } else {
      uint32_t value;
      CopyBytes<4>(ptr, &value);
      snprintf(string100, 100, "%u", value);  // NOLINT
    }
  } else {
    HWY_ASSERT(info.sizeof_t == 8);
    if (info.is_float) {
      double value;
      CopyBytes<8>(ptr, &value);
      snprintf(string100, 100, "%g", value);  // NOLINT
    } else if (info.is_signed) {
      int64_t value;
      CopyBytes<8>(ptr, &value);
      snprintf(string100, 100, "%" PRIi64 "", value);  // NOLINT
    } else {
      uint64_t value;
      CopyBytes<8>(ptr, &value);
      snprintf(string100, 100, "%" PRIu64 "", value);  // NOLINT
    }
  }
}

HWY_DLLEXPORT void PrintArray(const TypeInfo& info, const char* caption,
                              const void* array_void, size_t N, size_t lane_u,
                              size_t max_lanes) {
  const uint8_t* array_bytes = reinterpret_cast<const uint8_t*>(array_void);

  char type_name[100];
  TypeName(info, N, type_name);

  const intptr_t lane = intptr_t(lane_u);
  const size_t begin = static_cast<size_t>(HWY_MAX(0, lane - 2));
  const size_t end = HWY_MIN(begin + max_lanes, N);
  fprintf(stderr, "%s %s [%" PRIu64 "+ ->]:\n  ", type_name, caption,
          static_cast<uint64_t>(begin));
  for (size_t i = begin; i < end; ++i) {
    const void* ptr = array_bytes + i * info.sizeof_t;
    char str[100];
    ToString(info, ptr, str);
    fprintf(stderr, "%s,", str);
  }
  if (begin >= end) fprintf(stderr, "(out of bounds)");
  fprintf(stderr, "\n");
}

}  // namespace detail
}  // namespace hwy
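Worked examples of the type-erased formatting above; the expected strings follow from the snprintf format specifiers (values here are arbitrary):

#include <stdio.h>

#include "hwy/print.h"

int main() {
  char name[100];
  char buf[100];
  const auto info = hwy::detail::MakeTypeInfo<uint16_t>();
  hwy::detail::TypeName(info, 8, name);  // "u16x8": 'u' prefix, 16 bits, 8 lanes
  const uint16_t x = 0xBEEF;
  hwy::detail::ToString(info, &x, buf);  // "0xBEEF": sizeof_t == 2 -> "0x%04X"
  printf("%s: %s\n", name, buf);
  return 0;
}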
@@ -0,0 +1,73 @@
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef HWY_PRINT_H_
#define HWY_PRINT_H_

// Helpers for printing vector lanes.

#include <stddef.h>
#include <stdio.h>

#include "hwy/base.h"
#include "hwy/highway_export.h"

namespace hwy {

namespace detail {

// For implementing value comparisons etc. as type-erased functions to reduce
// template bloat.
struct TypeInfo {
  size_t sizeof_t;
  bool is_float;
  bool is_signed;
};

template <typename T>
HWY_INLINE TypeInfo MakeTypeInfo() {
  TypeInfo info;
  info.sizeof_t = sizeof(T);
  info.is_float = IsFloat<T>();
  info.is_signed = IsSigned<T>();
  return info;
}

HWY_DLLEXPORT void TypeName(const TypeInfo& info, size_t N, char* string100);
HWY_DLLEXPORT void ToString(const TypeInfo& info, const void* ptr,
                            char* string100);

HWY_DLLEXPORT void PrintArray(const TypeInfo& info, const char* caption,
                              const void* array_void, size_t N,
                              size_t lane_u = 0, size_t max_lanes = 7);

}  // namespace detail

template <typename T>
HWY_NOINLINE void PrintValue(T value) {
  char str[100];
  detail::ToString(hwy::detail::MakeTypeInfo<T>(), &value, str);
  fprintf(stderr, "%s,", str);
}

template <typename T>
HWY_NOINLINE void PrintArray(const T* value, size_t count) {
  detail::PrintArray(hwy::detail::MakeTypeInfo<T>(), "", value, count, 0,
                     count);
}

}  // namespace hwy

#endif  // HWY_PRINT_H_
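The public helpers above need no SIMD context at all; a small sketch (values arbitrary, output goes to stderr):

#include "hwy/print.h"

int main() {
  const float data[4] = {1.5f, -2.0f, 3.25f, 4.0f};
  hwy::PrintArray(data, 4);  // roughly: f32x4  [0+ ->]: 1.5,-2,3.25,4,
  hwy::PrintValue(-7);       // prints "-7,"
  return 0;
}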
@@ -1,4 +1,5 @@
// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -14,6 +15,7 @@

#include "hwy/targets.h"

#include <inttypes.h>  // PRIx64
#include <stdarg.h>
#include <stddef.h>
#include <stdint.h>
@@ -21,7 +23,7 @@

#include <atomic>

#include "hwy/base.h"
#include "hwy/per_target.h"

#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
#include "sanitizer/common_interface_defs.h"  // __sanitizer_print_stack_trace
@@ -36,7 +38,11 @@
#else  // !HWY_COMPILER_MSVC
#include <cpuid.h>
#endif  // HWY_COMPILER_MSVC
#endif  // HWY_ARCH_X86

#elif HWY_ARCH_ARM && HWY_OS_LINUX
#include <asm/hwcap.h>
#include <sys/auxv.h>
#endif  // HWY_ARCH_*

namespace hwy {
namespace {
@@ -57,7 +63,7 @@ HWY_INLINE void Cpuid(const uint32_t level, const uint32_t count,
  for (int i = 0; i < 4; ++i) {
    abcd[i] = regs[i];
  }
#else // HWY_COMPILER_MSVC
#else  // HWY_COMPILER_MSVC
  uint32_t a;
  uint32_t b;
  uint32_t c;
@@ -75,7 +81,7 @@ HWY_INLINE void Cpuid(const uint32_t level, const uint32_t count,
uint32_t ReadXCR0() {
#if HWY_COMPILER_MSVC
  return static_cast<uint32_t>(_xgetbv(0));
#else // HWY_COMPILER_MSVC
#else  // HWY_COMPILER_MSVC
  uint32_t xcr0, xcr0_high;
  const uint32_t index = 0;
  asm volatile(".byte 0x0F, 0x01, 0xD0"
@@ -87,15 +93,12 @@ uint32_t ReadXCR0() {

#endif  // HWY_ARCH_X86

// Not function-local => no compiler-generated locking.
std::atomic<uint32_t> supported_{0};  // Not yet initialized

// When running tests, this value can be set to the mocked supported targets
// mask. Only written to from a single thread before the test starts.
uint32_t supported_targets_for_test_ = 0;
int64_t supported_targets_for_test_ = 0;

// Mask of targets disabled at runtime with DisableTargets.
uint32_t supported_mask_{LimitsMax<uint32_t>()};
int64_t supported_mask_ = LimitsMax<int64_t>();

#if HWY_ARCH_X86
// Arbitrary bit indices indicating which instruction set extensions are
@@ -126,6 +129,7 @@ enum class FeatureIndex : uint32_t {

  kVNNI,
  kVPCLMULQDQ,
  kVBMI,
  kVBMI2,
  kVAES,
  kPOPCNTDQ,
@@ -176,77 +180,19 @@ constexpr uint64_t kGroupAVX3 =

constexpr uint64_t kGroupAVX3_DL =
    Bit(FeatureIndex::kVNNI) | Bit(FeatureIndex::kVPCLMULQDQ) |
    Bit(FeatureIndex::kVBMI2) | Bit(FeatureIndex::kVAES) |
    Bit(FeatureIndex::kPOPCNTDQ) | Bit(FeatureIndex::kBITALG) | kGroupAVX3;
    Bit(FeatureIndex::kVBMI) | Bit(FeatureIndex::kVBMI2) |
    Bit(FeatureIndex::kVAES) | Bit(FeatureIndex::kPOPCNTDQ) |
    Bit(FeatureIndex::kBITALG) | kGroupAVX3;

#endif  // HWY_ARCH_X86

}  // namespace

HWY_NORETURN void HWY_FORMAT(3, 4)
    Abort(const char* file, int line, const char* format, ...) {
  char buf[2000];
  va_list args;
  va_start(args, format);
  vsnprintf(buf, sizeof(buf), format, args);
  va_end(args);

  fprintf(stderr, "Abort at %s:%d: %s\n", file, line, buf);

  // If compiled with any sanitizer, they can also print a stack trace.
#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
  __sanitizer_print_stack_trace();
#endif  // HWY_IS_*
  fflush(stderr);

  // Now terminate the program:
#if HWY_ARCH_RVV
  exit(1);  // trap/abort just freeze Spike.
#elif HWY_IS_DEBUG_BUILD && !HWY_COMPILER_MSVC
  // Facilitates breaking into a debugger, but don't use this in non-debug
  // builds because it looks like "illegal instruction", which is misleading.
  __builtin_trap();
#else
  abort();  // Compile error without this due to HWY_NORETURN.
#endif
}

void DisableTargets(uint32_t disabled_targets) {
  supported_mask_ = ~(disabled_targets & ~uint32_t(HWY_ENABLED_BASELINE));
  // We can call Update() here to initialize the mask but that will trigger a
  // call to SupportedTargets() which we use in tests to tell whether any of the
  // highway dynamic dispatch functions were used.
  GetChosenTarget().DeInit();
}

void SetSupportedTargetsForTest(uint32_t targets) {
  // Reset the cached supported_ value to 0 to force a re-evaluation in the
  // next call to SupportedTargets() which will use the mocked value set here
  // if not zero.
  supported_.store(0, std::memory_order_release);
  supported_targets_for_test_ = targets;
  GetChosenTarget().DeInit();
}

bool SupportedTargetsCalledForTest() {
  return supported_.load(std::memory_order_acquire) != 0;
}

uint32_t SupportedTargets() {
  uint32_t bits = supported_.load(std::memory_order_acquire);
  // Already initialized?
  if (HWY_LIKELY(bits != 0)) {
    return bits & supported_mask_;
  }

  // When running tests, this allows to mock the current supported targets.
  if (HWY_UNLIKELY(supported_targets_for_test_ != 0)) {
    // Store the value to signal that this was used.
    supported_.store(supported_targets_for_test_, std::memory_order_release);
    return supported_targets_for_test_ & supported_mask_;
  }

  bits = HWY_SCALAR;
// Returns targets supported by the CPU, independently of DisableTargets.
// Factored out of SupportedTargets to make its structure more obvious. Note
// that x86 CPUID may take several hundred cycles.
int64_t DetectTargets() {
  // Apps will use only one of these (the default is EMU128), but compile flags
  // for this TU may differ from that of the app, so allow both.
  int64_t bits = HWY_SCALAR | HWY_EMU128;

#if HWY_ARCH_X86
  bool has_osxsave = false;
@@ -288,6 +234,7 @@ uint32_t SupportedTargets() {
    flags |= IsBitSet(abcd[1], 30) ? Bit(FeatureIndex::kAVX512BW) : 0;
    flags |= IsBitSet(abcd[1], 31) ? Bit(FeatureIndex::kAVX512VL) : 0;

    flags |= IsBitSet(abcd[2], 1) ? Bit(FeatureIndex::kVBMI) : 0;
    flags |= IsBitSet(abcd[2], 6) ? Bit(FeatureIndex::kVBMI2) : 0;
    flags |= IsBitSet(abcd[2], 9) ? Bit(FeatureIndex::kVAES) : 0;
    flags |= IsBitSet(abcd[2], 10) ? Bit(FeatureIndex::kVPCLMULQDQ) : 0;
@@ -318,33 +265,162 @@ uint32_t SupportedTargets() {
  // are not preserved across context switches.
  if (has_osxsave) {
    const uint32_t xcr0 = ReadXCR0();
    const int64_t min_avx3 = HWY_AVX3 | HWY_AVX3_DL;
    const int64_t min_avx2 = HWY_AVX2 | min_avx3;
    // XMM
    if (!IsBitSet(xcr0, 1)) {
      bits &=
          ~uint32_t(HWY_SSSE3 | HWY_SSE4 | HWY_AVX2 | HWY_AVX3 | HWY_AVX3_DL);
      bits &= ~(HWY_SSSE3 | HWY_SSE4 | min_avx2);
    }
    // YMM
    if (!IsBitSet(xcr0, 2)) {
      bits &= ~uint32_t(HWY_AVX2 | HWY_AVX3 | HWY_AVX3_DL);
      bits &= ~min_avx2;
    }
    // ZMM + opmask
    if ((xcr0 & 0x70) != 0x70) {
      bits &= ~uint32_t(HWY_AVX3 | HWY_AVX3_DL);
    // opmask, ZMM lo/hi
    if (!IsBitSet(xcr0, 5) || !IsBitSet(xcr0, 6) || !IsBitSet(xcr0, 7)) {
      bits &= ~min_avx3;
    }
  }

#else
  // TODO(janwas): detect for other platforms
  bits = HWY_ENABLED_BASELINE;
#endif  // HWY_ARCH_X86

  if ((bits & HWY_ENABLED_BASELINE) != HWY_ENABLED_BASELINE) {
    fprintf(stderr, "WARNING: CPU supports %zx but software requires %x\n",
            size_t(bits), HWY_ENABLED_BASELINE);
    fprintf(stderr,
            "WARNING: CPU supports %" PRIx64 " but software requires %" PRIx64
            "\n",
            bits, static_cast<int64_t>(HWY_ENABLED_BASELINE));
  }

  supported_.store(bits, std::memory_order_release);
  return bits & supported_mask_;
#elif HWY_ARCH_ARM && HWY_HAVE_RUNTIME_DISPATCH
  using CapBits = unsigned long;  // NOLINT
  const CapBits hw = getauxval(AT_HWCAP);
  (void)hw;

#if HWY_ARCH_ARM_A64

#if defined(HWCAP_AES)
  // aarch64 always has NEON and VFPv4, but not necessarily AES, which we
  // require and thus must still check for.
  if (hw & HWCAP_AES) {
    bits |= HWY_NEON;
  }
#endif  // HWCAP_AES

#if defined(HWCAP_SVE)
  if (hw & HWCAP_SVE) {
    bits |= HWY_SVE;
  }
#endif

#if defined(HWCAP2_SVE2) && defined(HWCAP2_SVEAES)
  const CapBits hw2 = getauxval(AT_HWCAP2);
  if ((hw2 & HWCAP2_SVE2) && (hw2 & HWCAP2_SVEAES)) {
    bits |= HWY_SVE2;
  }
#endif

#else  // HWY_ARCH_ARM_A64

// Some old auxv.h / hwcap.h do not define these. If not, treat as unsupported.
// Note that AES has a different HWCAP bit compared to aarch64.
#if defined(HWCAP_NEON) && defined(HWCAP_VFPv4)
  if ((hw & HWCAP_NEON) && (hw & HWCAP_VFPv4)) {
    bits |= HWY_NEON;
  }
#endif

#endif  // HWY_ARCH_ARM_A64
  if ((bits & HWY_ENABLED_BASELINE) != HWY_ENABLED_BASELINE) {
    fprintf(stderr,
            "WARNING: CPU supports %" PRIx64 " but software requires %" PRIx64
            "\n",
            bits, static_cast<int64_t>(HWY_ENABLED_BASELINE));
  }
#else  // HWY_ARCH_ARM && HWY_HAVE_RUNTIME_DISPATCH
  // TODO(janwas): detect for other platforms and check for baseline
  // This file is typically compiled without HWY_IS_TEST, but targets_test has
  // it set, and will expect all of its HWY_TARGETS (= all attainable) to be
  // supported.
  bits |= HWY_ENABLED_BASELINE;
#endif  // HWY_ARCH_X86

  return bits;
}

}  // namespace

HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
    Abort(const char* file, int line, const char* format, ...) {
  char buf[2000];
  va_list args;
  va_start(args, format);
  vsnprintf(buf, sizeof(buf), format, args);
  va_end(args);

  fprintf(stderr, "Abort at %s:%d: %s\n", file, line, buf);

  // If compiled with any sanitizer, they can also print a stack trace.
#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
  __sanitizer_print_stack_trace();
#endif  // HWY_IS_*
  fflush(stderr);

  // Now terminate the program:
#if HWY_ARCH_RVV
  exit(1);  // trap/abort just freeze Spike.
#elif HWY_IS_DEBUG_BUILD && !HWY_COMPILER_MSVC
  // Facilitates breaking into a debugger, but don't use this in non-debug
  // builds because it looks like "illegal instruction", which is misleading.
  __builtin_trap();
#else
  abort();  // Compile error without this due to HWY_NORETURN.
#endif
}

HWY_DLLEXPORT void DisableTargets(int64_t disabled_targets) {
  supported_mask_ = static_cast<int64_t>(~disabled_targets);
  // This will take effect on the next call to SupportedTargets, which is
  // called right before GetChosenTarget::Update. However, calling Update here
  // would make it appear that HWY_DYNAMIC_DISPATCH was called, which we want
  // to check in tests. We instead de-initialize such that the next
  // HWY_DYNAMIC_DISPATCH calls GetChosenTarget::Update via FunctionCache.
  GetChosenTarget().DeInit();
}

HWY_DLLEXPORT void SetSupportedTargetsForTest(int64_t targets) {
  supported_targets_for_test_ = targets;
  GetChosenTarget().DeInit();  // see comment above
}

HWY_DLLEXPORT int64_t SupportedTargets() {
  int64_t targets = supported_targets_for_test_;
  if (HWY_LIKELY(targets == 0)) {
    // Mock not active. Re-detect instead of caching just in case we're on a
    // heterogeneous ISA (also requires some app support to pin threads). This
    // is only reached on the first HWY_DYNAMIC_DISPATCH or after each call to
    // DisableTargets or SetSupportedTargetsForTest.
    targets = DetectTargets();

    // VectorBytes invokes HWY_DYNAMIC_DISPATCH. To prevent infinite recursion,
    // first set up ChosenTarget. No need to Update() again afterwards with the
    // final targets - that will be done by a caller of this function.
    GetChosenTarget().Update(targets);

    // Now that we can call VectorBytes, check for targets with specific sizes.
    if (HWY_ARCH_ARM_A64) {
      const size_t vec_bytes = VectorBytes();  // uncached, see declaration
      if ((targets & HWY_SVE) && vec_bytes == 32) {
        targets = static_cast<int64_t>(targets | HWY_SVE_256);
      } else {
        targets = static_cast<int64_t>(targets & ~HWY_SVE_256);
      }
      if ((targets & HWY_SVE2) && vec_bytes == 16) {
        targets = static_cast<int64_t>(targets | HWY_SVE2_128);
      } else {
        targets = static_cast<int64_t>(targets & ~HWY_SVE2_128);
      }
    }  // HWY_ARCH_ARM_A64
  }

  targets &= supported_mask_;
  return targets == 0 ? HWY_STATIC_TARGET : targets;
}

HWY_DLLEXPORT ChosenTarget& GetChosenTarget() {
@@ -352,14 +428,4 @@ HWY_DLLEXPORT ChosenTarget& GetChosenTarget() {
  return chosen_target;
}

void ChosenTarget::Update() {
  // The supported variable contains the current CPU supported targets shifted
  // to the location expected by the ChosenTarget mask. We enabled SCALAR
  // regardless of whether it was compiled since it is also used as the
  // fallback mechanism to the baseline target.
  uint32_t supported = HWY_CHOSEN_TARGET_SHIFT(hwy::SupportedTargets()) |
                       HWY_CHOSEN_TARGET_MASK_SCALAR;
  mask_.store(supported);
}

}  // namespace hwy
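How the pieces above fit together from an application's point of view; a sketch, with HWY_AVX3 merely an example of a target bit to disable:

#include <stdio.h>

#include "hwy/targets.h"  // also pulls in the HWY_* target bits

int main() {
  // Suppose AVX3 is suspected of causing downclocking in this workload.
  hwy::DisableTargets(HWY_AVX3);
  // Re-detects (plus SVE vector-length probing on aarch64), then applies the
  // disabled mask; never returns 0 thanks to the HWY_STATIC_TARGET fallback.
  const int64_t targets = hwy::SupportedTargets();
  // Lowest set bit is the best remaining target.
  printf("best: %s\n", hwy::TargetName(targets & (~targets + 1)));
  return 0;
}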
@@ -1,4 +1,5 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -24,13 +25,18 @@
#include "hwy/detect_targets.h"
#include "hwy/highway_export.h"

#if !HWY_ARCH_RVV
#include <atomic>
#endif

namespace hwy {

// Returns (cached) bitfield of enabled targets that are supported on this CPU.
// Implemented in targets.cc; unconditionally compiled to support the use case
// of binary-only distributions. The HWY_SUPPORTED_TARGETS wrapper may allow
// eliding calls to this function.
HWY_DLLEXPORT uint32_t SupportedTargets();
// Returns bitfield of enabled targets that are supported on this CPU; there is
// always at least one such target, hence the return value is never 0. The
// targets returned may change after calling DisableTargets. This function is
// always defined, but the HWY_SUPPORTED_TARGETS wrapper may allow eliding
// calls to it if there is only a single target enabled.
HWY_DLLEXPORT int64_t SupportedTargets();

// Evaluates to a function call, or literal if there is a single target.
#if (HWY_TARGETS & (HWY_TARGETS - 1)) == 0
@@ -39,40 +45,36 @@ HWY_DLLEXPORT uint32_t SupportedTargets();
#define HWY_SUPPORTED_TARGETS hwy::SupportedTargets()
#endif

// Disable from runtime dispatch the mask of compiled in targets. Targets that
// were not enabled at compile time are ignored. This function is useful to
// disable a target supported by the CPU that is known to have bugs or when a
// lower target is desired. For this reason, attempts to disable targets which
// are in HWY_ENABLED_BASELINE have no effect so SupportedTargets() always
// returns at least the baseline target.
HWY_DLLEXPORT void DisableTargets(uint32_t disabled_targets);
// Subsequent SupportedTargets will not return targets whose bit(s) are set in
// `disabled_targets`. Exception: if SupportedTargets would return 0, it will
// instead return HWY_STATIC_TARGET (there must always be one target to call).
//
// This function is useful for disabling targets known to be buggy, or if the
// best available target is undesirable (perhaps due to throttling or memory
// bandwidth limitations). Use SetSupportedTargetsForTest instead of this
// function for iteratively enabling specific targets for testing.
HWY_DLLEXPORT void DisableTargets(int64_t disabled_targets);

// Set the mock mask of CPU supported targets instead of the actual CPU
// supported targets computed in SupportedTargets(). The return value of
// SupportedTargets() will still be affected by the DisableTargets() mask
// regardless of this mock, to prevent accidentally adding targets that are
// known to be buggy in the current CPU. Call with a mask of 0 to disable the
// mock and use the actual CPU supported targets instead.
HWY_DLLEXPORT void SetSupportedTargetsForTest(uint32_t targets);

// Returns whether the SupportedTargets() function was called since the last
// SetSupportedTargetsForTest() call.
HWY_DLLEXPORT bool SupportedTargetsCalledForTest();
// Subsequent SupportedTargets will return the given set of targets, except
// those disabled via DisableTargets. Call with a mask of 0 to disable the mock
// and return to the normal SupportedTargets behavior. Used to run tests for
// all targets.
HWY_DLLEXPORT void SetSupportedTargetsForTest(int64_t targets);

// Return the list of targets in HWY_TARGETS supported by the CPU as a list of
// individual HWY_* target macros such as HWY_SCALAR or HWY_NEON. This list
// is affected by the current SetSupportedTargetsForTest() mock if any.
HWY_INLINE std::vector<uint32_t> SupportedAndGeneratedTargets() {
  std::vector<uint32_t> ret;
  for (uint32_t targets = SupportedTargets() & HWY_TARGETS; targets != 0;
HWY_INLINE std::vector<int64_t> SupportedAndGeneratedTargets() {
  std::vector<int64_t> ret;
  for (int64_t targets = SupportedTargets() & HWY_TARGETS; targets != 0;
       targets = targets & (targets - 1)) {
    uint32_t current_target = targets & ~(targets - 1);
    int64_t current_target = targets & ~(targets - 1);
    ret.push_back(current_target);
  }
  return ret;
}

static inline HWY_MAYBE_UNUSED const char* TargetName(uint32_t target) {
static inline HWY_MAYBE_UNUSED const char* TargetName(int64_t target) {
  switch (target) {
#if HWY_ARCH_X86
    case HWY_SSSE3:
@@ -88,22 +90,28 @@ static inline HWY_MAYBE_UNUSED const char* TargetName(uint32_t target) {
#endif

#if HWY_ARCH_ARM
    case HWY_SVE2_128:
      return "SVE2_128";
    case HWY_SVE_256:
      return "SVE_256";
    case HWY_SVE2:
      return "SVE2";
    case HWY_SVE:
      return "SVE";
    case HWY_NEON:
      return "Neon";
      return "NEON";
#endif

#if HWY_ARCH_PPC
    case HWY_PPC8:
      return "Power8";
      return "PPC8";
#endif

#if HWY_ARCH_WASM
    case HWY_WASM:
      return "Wasm";
      return "WASM";
    case HWY_WASM_EMU256:
      return "WASM_EMU256";
#endif

#if HWY_ARCH_RVV
@@ -111,8 +119,10 @@ static inline HWY_MAYBE_UNUSED const char* TargetName(uint32_t target) {
      return "RVV";
#endif

    case HWY_EMU128:
      return "EMU128";
    case HWY_SCALAR:
      return "Scalar";
      return "SCALAR";

    default:
      return "Unknown";  // must satisfy gtest IsValidParamName()
@@ -125,95 +135,125 @@ static inline HWY_MAYBE_UNUSED const char* TargetName(uint32_t target) {
// For the ChosenTarget mask and index we use a different bit arrangement than
// in the HWY_TARGETS mask. Only the targets involved in the current
// architecture are used in this mask, and therefore only the least significant
// (HWY_MAX_DYNAMIC_TARGETS + 2) bits of the uint32_t mask are used. The least
// (HWY_MAX_DYNAMIC_TARGETS + 2) bits of the int64_t mask are used. The least
// significant bit is set when the mask is not initialized, the next
// HWY_MAX_DYNAMIC_TARGETS more significant bits are a range of bits from the
// HWY_TARGETS or SupportedTargets() mask for the given architecture shifted to
// that position and the next more significant bit is used for the scalar
// target. Because of this we need to define equivalent values for HWY_TARGETS
// in this representation.
// that position and the next more significant bit is used for HWY_SCALAR (if
// HWY_COMPILE_ONLY_SCALAR is defined) or HWY_EMU128. Because of this we need to
// define equivalent values for HWY_TARGETS in this representation.
// This mask representation allows using ctz() on this mask to obtain a small
// number that's used as an index of the table for dynamic dispatch. In this
// way the first entry is used when the mask is uninitialized, the following
// HWY_MAX_DYNAMIC_TARGETS are for dynamic dispatch and the last one is for
// scalar.

// The HWY_SCALAR bit in the ChosenTarget mask format.
#define HWY_CHOSEN_TARGET_MASK_SCALAR (1u << (HWY_MAX_DYNAMIC_TARGETS + 1))
// The HWY_SCALAR/HWY_EMU128 bit in the ChosenTarget mask format.
#define HWY_CHOSEN_TARGET_MASK_SCALAR (1LL << (HWY_MAX_DYNAMIC_TARGETS + 1))

// Converts from a HWY_TARGETS mask to a ChosenTarget mask format for the
// current architecture.
#define HWY_CHOSEN_TARGET_SHIFT(X) \
  ((((X) >> (HWY_HIGHEST_TARGET_BIT + 1 - HWY_MAX_DYNAMIC_TARGETS)) & \
    ((1u << HWY_MAX_DYNAMIC_TARGETS) - 1)) \
    ((1LL << HWY_MAX_DYNAMIC_TARGETS) - 1)) \
   << 1)

// The HWY_TARGETS mask in the ChosenTarget mask format.
#define HWY_CHOSEN_TARGET_MASK_TARGETS \
  (HWY_CHOSEN_TARGET_SHIFT(HWY_TARGETS) | HWY_CHOSEN_TARGET_MASK_SCALAR | 1u)
  (HWY_CHOSEN_TARGET_SHIFT(HWY_TARGETS) | HWY_CHOSEN_TARGET_MASK_SCALAR | 1LL)

#if HWY_ARCH_X86
// Maximum number of dynamic targets, changing this value is an ABI incompatible
// change
#define HWY_MAX_DYNAMIC_TARGETS 10
#define HWY_MAX_DYNAMIC_TARGETS 15
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_X86
// These must match the order in which the HWY_TARGETS are defined
// starting by the least significant (HWY_HIGHEST_TARGET_BIT + 1 -
// HWY_MAX_DYNAMIC_TARGETS) bit. This list must contain exactly
// HWY_MAX_DYNAMIC_TARGETS elements and does not include SCALAR. The first entry
// corresponds to the best target. Don't include a "," at the end of the list.
#define HWY_CHOOSE_TARGET_LIST(func_name) \
  nullptr, /* reserved */ \
  nullptr, /* reserved */ \
  HWY_CHOOSE_AVX3_DL(func_name), /* AVX3_DL */ \
  HWY_CHOOSE_AVX3(func_name), /* AVX3 */ \
  HWY_CHOOSE_AVX2(func_name), /* AVX2 */ \
  nullptr, /* AVX */ \
  HWY_CHOOSE_SSE4(func_name), /* SSE4 */ \
  HWY_CHOOSE_SSSE3(func_name), /* SSSE3 */ \
  nullptr, /* SSE3 */ \
  nullptr /* SSE2 */
#define HWY_CHOOSE_TARGET_LIST(func_name) \
  nullptr, /* reserved */ \
  nullptr, /* reserved */ \
  nullptr, /* reserved */ \
  nullptr, /* reserved */ \
  nullptr, /* reserved */ \
  nullptr, /* reserved */ \
  nullptr, /* reserved */ \
  HWY_CHOOSE_AVX3_DL(func_name), /* AVX3_DL */ \
  HWY_CHOOSE_AVX3(func_name), /* AVX3 */ \
  HWY_CHOOSE_AVX2(func_name), /* AVX2 */ \
  nullptr, /* AVX */ \
  HWY_CHOOSE_SSE4(func_name), /* SSE4 */ \
  HWY_CHOOSE_SSSE3(func_name), /* SSSE3 */ \
  nullptr, /* reserved - SSE3? */ \
  nullptr /* reserved - SSE2? */

#elif HWY_ARCH_ARM
// See HWY_ARCH_X86 above for details.
#define HWY_MAX_DYNAMIC_TARGETS 4
#define HWY_MAX_DYNAMIC_TARGETS 15
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_ARM
#define HWY_CHOOSE_TARGET_LIST(func_name) \
  HWY_CHOOSE_SVE2(func_name), /* SVE2 */ \
  HWY_CHOOSE_SVE(func_name), /* SVE */ \
  nullptr, /* reserved */ \
  HWY_CHOOSE_NEON(func_name) /* NEON */

#elif HWY_ARCH_PPC
// See HWY_ARCH_X86 above for details.
#define HWY_MAX_DYNAMIC_TARGETS 5
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_PPC
#define HWY_CHOOSE_TARGET_LIST(func_name) \
  nullptr, /* reserved */ \
  nullptr, /* reserved */ \
  HWY_CHOOSE_PPC8(func_name), /* PPC8 */ \
  nullptr, /* VSX */ \
  nullptr /* AltiVec */

#elif HWY_ARCH_WASM
// See HWY_ARCH_X86 above for details.
#define HWY_MAX_DYNAMIC_TARGETS 4
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_WASM
#define HWY_CHOOSE_TARGET_LIST(func_name) \
  nullptr, /* reserved */ \
  nullptr, /* reserved */ \
  HWY_CHOOSE_WASM2(func_name), /* WASM2 */ \
  HWY_CHOOSE_WASM(func_name) /* WASM */
#define HWY_CHOOSE_TARGET_LIST(func_name) \
  nullptr, /* reserved */ \
  nullptr, /* reserved */ \
  nullptr, /* reserved */ \
  nullptr, /* reserved */ \
  nullptr, /* reserved */ \
  nullptr, /* reserved */ \
  nullptr, /* reserved */ \
  nullptr, /* reserved */ \
  nullptr, /* reserved */ \
  HWY_CHOOSE_SVE2_128(func_name), /* SVE2 128-bit */ \
  HWY_CHOOSE_SVE_256(func_name), /* SVE 256-bit */ \
  HWY_CHOOSE_SVE2(func_name), /* SVE2 */ \
  HWY_CHOOSE_SVE(func_name), /* SVE */ \
  HWY_CHOOSE_NEON(func_name), /* NEON */ \
  nullptr /* reserved - Helium? */

#elif HWY_ARCH_RVV
// See HWY_ARCH_X86 above for details.
#define HWY_MAX_DYNAMIC_TARGETS 4
#define HWY_MAX_DYNAMIC_TARGETS 9
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_RVV
#define HWY_CHOOSE_TARGET_LIST(func_name) \
  nullptr, /* reserved */ \
  nullptr, /* reserved */ \
  nullptr, /* reserved */ \
  HWY_CHOOSE_RVV(func_name) /* RVV */
  nullptr, /* reserved */ \
  nullptr, /* reserved */ \
  nullptr, /* reserved */ \
  nullptr, /* reserved */ \
  HWY_CHOOSE_RVV(func_name), /* RVV */ \
  nullptr /* reserved */

#elif HWY_ARCH_PPC
// See HWY_ARCH_X86 above for details.
#define HWY_MAX_DYNAMIC_TARGETS 9
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_PPC
#define HWY_CHOOSE_TARGET_LIST(func_name) \
  nullptr, /* reserved */ \
  nullptr, /* reserved */ \
  nullptr, /* reserved */ \
  nullptr, /* reserved */ \
  nullptr, /* reserved */ \
  nullptr, /* reserved */ \
  HWY_CHOOSE_PPC8(func_name), /* PPC8 */ \
  nullptr, /* reserved (VSX or AltiVec) */ \
  nullptr /* reserved (VSX or AltiVec) */

#elif HWY_ARCH_WASM
// See HWY_ARCH_X86 above for details.
#define HWY_MAX_DYNAMIC_TARGETS 9
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_WASM
#define HWY_CHOOSE_TARGET_LIST(func_name) \
  nullptr, /* reserved */ \
  nullptr, /* reserved */ \
  nullptr, /* reserved */ \
  nullptr, /* reserved */ \
  nullptr, /* reserved */ \
  nullptr, /* reserved */ \
  HWY_CHOOSE_WASM_EMU256(func_name), /* WASM_EMU256 */ \
  HWY_CHOOSE_WASM(func_name), /* WASM */ \
  nullptr /* reserved */

#else
// Unknown architecture, will use HWY_SCALAR without dynamic dispatch, though
@@ -222,32 +262,52 @@ static inline HWY_MAYBE_UNUSED const char* TargetName(uint32_t target) {
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_SCALAR
#endif

// Bitfield of supported and enabled targets. The format differs from that of
// HWY_TARGETS; the lowest bit governs the first function pointer (which is
// special in that it calls FunctionCache, then Update, then dispatches to the
// actual implementation) in the tables created by HWY_EXPORT. Monostate (see
// GetChosenTarget), thread-safe except on RVV.
struct ChosenTarget {
 public:
  // Update the ChosenTarget mask based on the current CPU supported
  // targets.
  HWY_DLLEXPORT void Update();
  // Reset bits according to `targets` (typically the return value of
  // SupportedTargets()). Postcondition: IsInitialized() == true.
  void Update(int64_t targets) {
    // These are `targets` shifted downwards, see above. Also include SCALAR
    // (corresponds to the last entry in the function table) as fallback.
    StoreMask(HWY_CHOSEN_TARGET_SHIFT(targets) | HWY_CHOSEN_TARGET_MASK_SCALAR);
  }

  // Reset the ChosenTarget to the uninitialized state.
  void DeInit() { mask_.store(1); }
  // Reset to the uninitialized state, so that FunctionCache will call Update
  // during the next HWY_DYNAMIC_DISPATCH, and IsInitialized returns false.
  void DeInit() { StoreMask(1); }

  // Whether the ChosenTarget was initialized. This is useful to know whether
  // any HWY_DYNAMIC_DISPATCH function was called.
  bool IsInitialized() const { return mask_.load() != 1; }
  // Whether Update was called. This indicates whether any HWY_DYNAMIC_DISPATCH
  // function was called, which we check in tests.
  bool IsInitialized() const { return LoadMask() != 1; }

  // Return the index in the dynamic dispatch table to be used by the current
  // CPU. Note that this method must be in the header file so it uses the value
  // of HWY_CHOSEN_TARGET_MASK_TARGETS defined in the translation unit that
  // calls it, which may be different from others. This allows to only consider
  // calls it, which may be different from others. This means we only enable
  // those targets that were actually compiled in this module.
  size_t HWY_INLINE GetIndex() const {
    return hwy::Num0BitsBelowLS1Bit_Nonzero32(mask_.load() &
                                              HWY_CHOSEN_TARGET_MASK_TARGETS);
    return hwy::Num0BitsBelowLS1Bit_Nonzero64(
        static_cast<uint64_t>(LoadMask() & HWY_CHOSEN_TARGET_MASK_TARGETS));
  }

 private:
  // Initialized to 1 so GetIndex() returns 0.
  std::atomic<uint32_t> mask_{1};
  // TODO(janwas): remove #if once <atomic> is available
#if HWY_ARCH_RVV
  int64_t LoadMask() const { return mask_; }
  void StoreMask(int64_t mask) { mask_ = mask; }

  int64_t mask_{1};  // Initialized to 1 so GetIndex() returns 0.
#else
  int64_t LoadMask() const { return mask_.load(); }
  void StoreMask(int64_t mask) { mask_.store(mask); }

  std::atomic<int64_t> mask_{1};  // Initialized to 1 so GetIndex() returns 0.
#endif  // HWY_ARCH_RVV
};

// For internal use (e.g. by FunctionCache and DisableTargets).
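A worked example of the index computation described above, using the new x86 layout (HWY_MAX_DYNAMIC_TARGETS = 15); this is a standalone re-statement of the GetIndex logic for illustration only, and the chosen bit positions are assumptions derived from the table order:

#include <stdint.h>
#include <stdio.h>

int main() {
  // HWY_CHOSEN_TARGET_MASK_SCALAR on x86: bit (15 + 1), the last table entry.
  const int64_t kScalarBit = int64_t{1} << (15 + 1);
  // Suppose AVX2 was chosen: it is the 10th entry in HWY_CHOOSE_TARGET_LIST,
  // i.e. bit 10 here (bit 0 is the "uninitialized" FunctionCache entry).
  const int64_t mask = (int64_t{1} << 10) | kScalarBit;
  // Count trailing zeros = index into the function-pointer table.
  int index = 0;
  for (int64_t m = mask; (m & 1) == 0; m >>= 1) ++index;
  printf("dispatch index: %d\n", index);  // 10 -> the AVX2 entry
  return 0;
}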
@@ -1,4 +1,5 @@
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -18,9 +19,10 @@

namespace fake {

#define DECLARE_FUNCTION(TGT) \
  namespace N_##TGT { \
  uint32_t FakeFunction(int) { return HWY_##TGT; } \
#define DECLARE_FUNCTION(TGT) \
  namespace N_##TGT { \
  /* Function argument is just to ensure/demonstrate they are possible. */ \
  int64_t FakeFunction(int) { return HWY_##TGT; } \
  }

DECLARE_FUNCTION(AVX3_DL)
@@ -31,41 +33,62 @@ DECLARE_FUNCTION(SSSE3)
DECLARE_FUNCTION(NEON)
DECLARE_FUNCTION(SVE)
DECLARE_FUNCTION(SVE2)
DECLARE_FUNCTION(SVE_256)
DECLARE_FUNCTION(SVE2_128)
DECLARE_FUNCTION(PPC8)
DECLARE_FUNCTION(WASM)
DECLARE_FUNCTION(RVV)
DECLARE_FUNCTION(SCALAR)
DECLARE_FUNCTION(EMU128)

HWY_EXPORT(FakeFunction);

void CallFunctionForTarget(int64_t target, int line) {
  if ((HWY_TARGETS & target) == 0) return;
  hwy::SetSupportedTargetsForTest(target);

  // Call Update() first to make &HWY_DYNAMIC_DISPATCH() return
  // the pointer to the already cached function.
  hwy::GetChosenTarget().Update(hwy::SupportedTargets());

  EXPECT_EQ(target, HWY_DYNAMIC_DISPATCH(FakeFunction)(42)) << line;

  // Calling DeInit() will test that the initializer function
  // also calls the right function.
  hwy::GetChosenTarget().DeInit();

#if HWY_DISPATCH_WORKAROUND
  EXPECT_EQ(HWY_STATIC_TARGET, HWY_DYNAMIC_DISPATCH(FakeFunction)(42)) << line;
#else
  EXPECT_EQ(target, HWY_DYNAMIC_DISPATCH(FakeFunction)(42)) << line;
#endif

  // Second call uses the cached value from the previous call.
  EXPECT_EQ(target, HWY_DYNAMIC_DISPATCH(FakeFunction)(42)) << line;
}

void CheckFakeFunction() {
#define CHECK_ARRAY_ENTRY(TGT) \
  if ((HWY_TARGETS & HWY_##TGT) != 0) { \
    hwy::SetSupportedTargetsForTest(HWY_##TGT); \
    /* Calling Update() first to make &HWY_DYNAMIC_DISPATCH() return */ \
    /* the pointer to the already cached function. */ \
    hwy::GetChosenTarget().Update(); \
    EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \
    /* Calling DeInit() will test that the initializer function */ \
    /* also calls the right function. */ \
    hwy::GetChosenTarget().DeInit(); \
    EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \
    /* Second call uses the cached value from the previous call. */ \
    EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \
  }
  CHECK_ARRAY_ENTRY(AVX3_DL)
  CHECK_ARRAY_ENTRY(AVX3)
  CHECK_ARRAY_ENTRY(AVX2)
  CHECK_ARRAY_ENTRY(SSE4)
  CHECK_ARRAY_ENTRY(SSSE3)
  CHECK_ARRAY_ENTRY(NEON)
  CHECK_ARRAY_ENTRY(SVE)
  CHECK_ARRAY_ENTRY(SVE2)
  CHECK_ARRAY_ENTRY(PPC8)
  CHECK_ARRAY_ENTRY(WASM)
  CHECK_ARRAY_ENTRY(RVV)
  CHECK_ARRAY_ENTRY(SCALAR)
#undef CHECK_ARRAY_ENTRY
  // When adding a target, also add to DECLARE_FUNCTION above.
  CallFunctionForTarget(HWY_AVX3_DL, __LINE__);
  CallFunctionForTarget(HWY_AVX3, __LINE__);
  CallFunctionForTarget(HWY_AVX2, __LINE__);
  CallFunctionForTarget(HWY_SSE4, __LINE__);
  CallFunctionForTarget(HWY_SSSE3, __LINE__);
  CallFunctionForTarget(HWY_NEON, __LINE__);
  CallFunctionForTarget(HWY_SVE, __LINE__);
  CallFunctionForTarget(HWY_SVE2, __LINE__);
  CallFunctionForTarget(HWY_SVE_256, __LINE__);
  CallFunctionForTarget(HWY_SVE2_128, __LINE__);
  CallFunctionForTarget(HWY_PPC8, __LINE__);
  CallFunctionForTarget(HWY_WASM, __LINE__);
  CallFunctionForTarget(HWY_RVV, __LINE__);
  // The tables only have space for either HWY_SCALAR or HWY_EMU128; the former
  // is opt-in only.
#if defined(HWY_COMPILE_ONLY_SCALAR) || HWY_BROKEN_EMU128
  CallFunctionForTarget(HWY_SCALAR, __LINE__);
#else
  CallFunctionForTarget(HWY_EMU128, __LINE__);
#endif
}

}  // namespace fake
@@ -86,31 +109,27 @@ class HwyTargetsTest : public testing::Test {
TEST_F(HwyTargetsTest, ChosenTargetOrderTest) { fake::CheckFakeFunction(); }

TEST_F(HwyTargetsTest, DisabledTargetsTest) {
  DisableTargets(~0u);
  // Check that the baseline can't be disabled.
  HWY_ASSERT(HWY_ENABLED_BASELINE == SupportedTargets());
  DisableTargets(~0LL);
  // Check that disabling everything at least leaves the static target.
  HWY_ASSERT(HWY_STATIC_TARGET == SupportedTargets());

  DisableTargets(0);  // Reset the mask.
  uint32_t current_targets = SupportedTargets();
  if ((current_targets & ~uint32_t(HWY_ENABLED_BASELINE)) == 0) {
  const int64_t current_targets = SupportedTargets();
  const int64_t enabled_baseline = static_cast<int64_t>(HWY_ENABLED_BASELINE);
  // Exclude these two because they are always returned by SupportedTargets.
  const int64_t fallback = HWY_SCALAR | HWY_EMU128;
  if ((current_targets & ~enabled_baseline & ~fallback) == 0) {
    // We can't test anything else if the only compiled target is the baseline.
    return;
  }

  // Get the lowest bit in the mask (the best target) and disable that one.
  uint32_t lowest_target = current_targets & (~current_targets + 1);
  // The lowest target shouldn't be one in the baseline.
  HWY_ASSERT((lowest_target & ~uint32_t(HWY_ENABLED_BASELINE)) != 0);
  DisableTargets(lowest_target);
  const int64_t best_target = current_targets & (~current_targets + 1);
  DisableTargets(best_target);

  // Check that the other targets are still enabled.
  HWY_ASSERT((lowest_target ^ current_targets) == SupportedTargets());
  HWY_ASSERT((best_target ^ current_targets) == SupportedTargets());
  DisableTargets(0);  // Reset the mask.
}

}  // namespace hwy

// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char **argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}
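The mocking pattern the test exercises can be reused in other test suites; a sketch, with ForeachSupportedTarget being a hypothetical helper name:

#include <stdint.h>

#include "hwy/targets.h"

// Runs `func` once per target that is both compiled in and supported by the
// CPU, mocking SupportedTargets so dynamic dispatch picks each one in turn.
template <class Func>
void ForeachSupportedTarget(const Func& func) {
  for (int64_t target : hwy::SupportedAndGeneratedTargets()) {
    hwy::SetSupportedTargetsForTest(target);
    func(target);
  }
  hwy::SetSupportedTargetsForTest(0);  // restore real detection
}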
@ -1,4 +1,5 @@
|
|||
// Copyright 2019 Google LLC
|
||||
// SPDX-License-Identifier: Apache-2.0
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
|
@ -16,12 +17,9 @@
|
|||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "tests/arithmetic_test.cc"
|
||||
#include "hwy/foreach_target.h"
|
||||
#include "hwy/foreach_target.h" // IWYU pragma: keep
|
||||
#include "hwy/highway.h"
|
||||
#include "hwy/tests/test_util-inl.h"
|
||||
|
||||
|
@ -177,6 +175,23 @@ HWY_NOINLINE void TestAllAbs() {
|
|||
ForFloatTypes(ForPartialVectors<TestFloatAbs>());
|
||||
}
|
||||
|
||||
struct TestNeg {
|
||||
template <typename T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
const auto v0 = Zero(d);
|
||||
const auto vn = Set(d, T(-3));
|
||||
const auto vp = Set(d, T(3));
|
||||
HWY_ASSERT_VEC_EQ(d, v0, Neg(v0));
|
||||
HWY_ASSERT_VEC_EQ(d, vp, Neg(vn));
|
||||
HWY_ASSERT_VEC_EQ(d, vn, Neg(vp));
|
||||
}
|
||||
};
|
||||
|
||||
HWY_NOINLINE void TestAllNeg() {
|
||||
ForSignedTypes(ForPartialVectors<TestNeg>());
|
||||
ForFloatTypes(ForPartialVectors<TestNeg>());
|
||||
}
|
||||
|
||||
struct TestUnsignedMinMax {
|
||||
template <typename T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
|
@ -263,16 +278,15 @@ HWY_NOINLINE void TestAllMinMax() {
|
|||
ForFloatTypes(ForPartialVectors<TestFloatMinMax>());
|
||||
}
|
||||
|
||||
class TestMinMax128 {
|
||||
template <class D>
|
||||
static HWY_NOINLINE Vec<D> Make128(D d, uint64_t hi, uint64_t lo) {
|
||||
alignas(16) uint64_t in[2];
|
||||
in[0] = lo;
|
||||
in[1] = hi;
|
||||
return LoadDup128(d, in);
|
||||
}
|
||||
template <class D>
|
||||
static HWY_NOINLINE Vec<D> Make128(D d, uint64_t hi, uint64_t lo) {
|
||||
alignas(16) uint64_t in[2];
|
||||
in[0] = lo;
|
||||
in[1] = hi;
|
||||
return LoadDup128(d, in);
|
||||
}
|
||||
|
||||
public:
|
||||
struct TestMinMax128 {
|
||||
template <typename T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
using V = Vec<D>;
|
||||
|
@ -341,779 +355,73 @@ HWY_NOINLINE void TestAllMinMax128() {
|
|||
ForGEVectors<128, TestMinMax128>()(uint64_t());
|
||||
}
|
||||
|
||||
struct TestUnsignedMul {
|
||||
struct TestMinMax128Upper {
|
||||
template <typename T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
const auto v0 = Zero(d);
|
||||
const auto v1 = Set(d, T(1));
|
||||
const auto vi = Iota(d, 1);
|
||||
const auto vj = Iota(d, 3);
|
||||
using V = Vec<D>;
|
||||
const size_t N = Lanes(d);
|
||||
auto expected = AllocateAligned<T>(N);
|
||||
|
||||
HWY_ASSERT_VEC_EQ(d, v0, Mul(v0, v0));
|
||||
HWY_ASSERT_VEC_EQ(d, v1, Mul(v1, v1));
|
||||
HWY_ASSERT_VEC_EQ(d, vi, Mul(v1, vi));
|
||||
HWY_ASSERT_VEC_EQ(d, vi, Mul(vi, v1));
|
||||
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
expected[i] = static_cast<T>((1 + i) * (1 + i));
|
||||
}
|
||||
HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vi, vi));
|
||||
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
expected[i] = static_cast<T>((1 + i) * (3 + i));
|
||||
}
|
||||
HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vi, vj));
|
||||
|
||||
const T max = LimitsMax<T>();
|
||||
const auto vmax = Set(d, max);
|
||||
HWY_ASSERT_VEC_EQ(d, vmax, Mul(vmax, v1));
|
||||
HWY_ASSERT_VEC_EQ(d, vmax, Mul(v1, vmax));
|
||||
|
||||
const size_t bits = sizeof(T) * 8;
|
||||
const uint64_t mask = (1ull << bits) - 1;
|
||||
const T max2 = (uint64_t(max) * max) & mask;
|
||||
HWY_ASSERT_VEC_EQ(d, Set(d, max2), Mul(vmax, vmax));
|
||||
}
|
||||
};
|
||||
|
||||
struct TestSignedMul {
|
||||
template <typename T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
const size_t N = Lanes(d);
|
||||
auto expected = AllocateAligned<T>(N);
|
||||
|
||||
const auto v0 = Zero(d);
|
||||
const auto v1 = Set(d, T(1));
|
||||
const auto vi = Iota(d, 1);
|
||||
const auto vn = Iota(d, -T(N)); // no i8 supported, so no wraparound
|
||||
HWY_ASSERT_VEC_EQ(d, v0, Mul(v0, v0));
|
||||
HWY_ASSERT_VEC_EQ(d, v1, Mul(v1, v1));
|
||||
HWY_ASSERT_VEC_EQ(d, vi, Mul(v1, vi));
|
||||
HWY_ASSERT_VEC_EQ(d, vi, Mul(vi, v1));
|
||||
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
expected[i] = static_cast<T>((1 + i) * (1 + i));
|
||||
}
|
||||
HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vi, vi));
|
||||
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
expected[i] = static_cast<T>((-T(N) + T(i)) * T(1u + i));
|
||||
}
|
||||
HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vn, vi));
|
||||
HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vi, vn));
|
||||
}
|
||||
};
|
||||
|
||||
HWY_NOINLINE void TestAllMul() {
|
||||
const ForPartialVectors<TestUnsignedMul> test_unsigned;
|
||||
// No u8.
|
||||
test_unsigned(uint16_t());
|
||||
test_unsigned(uint32_t());
|
||||
// No u64.
|
||||
|
||||
const ForPartialVectors<TestSignedMul> test_signed;
|
||||
// No i8.
|
||||
test_signed(int16_t());
|
||||
test_signed(int32_t());
|
||||
// No i64.
|
||||
}
|
||||
|
||||
struct TestMulHigh {
|
||||
template <typename T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
using Wide = MakeWide<T>;
|
||||
const size_t N = Lanes(d);
|
||||
auto in_lanes = AllocateAligned<T>(N);
|
||||
auto expected_lanes = AllocateAligned<T>(N);
|
||||
|
||||
const auto vi = Iota(d, 1);
|
||||
const auto vni = Iota(d, -T(N)); // no i8 supported, so no wraparound
|
||||
|
||||
const auto v0 = Zero(d);
|
||||
HWY_ASSERT_VEC_EQ(d, v0, MulHigh(v0, v0));
|
||||
HWY_ASSERT_VEC_EQ(d, v0, MulHigh(v0, vi));
|
||||
HWY_ASSERT_VEC_EQ(d, v0, MulHigh(vi, v0));
|
||||
|
||||
// Large positive squared
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
in_lanes[i] = T(LimitsMax<T>() >> i);
|
||||
expected_lanes[i] = T((Wide(in_lanes[i]) * in_lanes[i]) >> 16);
|
||||
}
|
||||
auto v = Load(d, in_lanes.get());
|
||||
HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(v, v));
|
||||
|
||||
// Large positive * small positive
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
expected_lanes[i] = T((Wide(in_lanes[i]) * T(1u + i)) >> 16);
|
||||
}
|
||||
HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(v, vi));
|
||||
HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(vi, v));
|
||||
|
||||
// Large positive * small negative
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
expected_lanes[i] = T((Wide(in_lanes[i]) * T(i - N)) >> 16);
|
||||
}
|
||||
HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(v, vni));
|
||||
HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(vni, v));
|
||||
}
|
||||
};
|
||||
|
||||
HWY_NOINLINE void TestAllMulHigh() {
|
||||
ForPartialVectors<TestMulHigh> test;
|
||||
test(int16_t());
|
||||
test(uint16_t());
|
||||
}
|
||||
|
||||
struct TestMulEven {
|
||||
template <typename T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
using Wide = MakeWide<T>;
|
||||
const Repartition<Wide, D> d2;
|
||||
const auto v0 = Zero(d);
|
||||
HWY_ASSERT_VEC_EQ(d2, Zero(d2), MulEven(v0, v0));
|
||||
|
||||
const size_t N = Lanes(d);
|
||||
auto in_lanes = AllocateAligned<T>(N);
|
||||
auto expected = AllocateAligned<Wide>(Lanes(d2));
|
||||
for (size_t i = 0; i < N; i += 2) {
|
||||
in_lanes[i + 0] = LimitsMax<T>() >> i;
|
||||
if (N != 1) {
|
||||
in_lanes[i + 1] = 1; // unused
|
||||
}
|
||||
expected[i / 2] = Wide(in_lanes[i + 0]) * in_lanes[i + 0];
|
||||
}
|
||||
|
||||
const auto v = Load(d, in_lanes.get());
|
||||
HWY_ASSERT_VEC_EQ(d2, expected.get(), MulEven(v, v));
|
||||
}
|
||||
};
|
||||
|

struct TestMulEvenOdd64 {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
#if HWY_TARGET != HWY_SCALAR
    const auto v0 = Zero(d);
    HWY_ASSERT_VEC_EQ(d, Zero(d), MulEven(v0, v0));
    HWY_ASSERT_VEC_EQ(d, Zero(d), MulOdd(v0, v0));

    const size_t N = Lanes(d);
    if (N == 1) return;

    auto in1 = AllocateAligned<T>(N);
    auto in2 = AllocateAligned<T>(N);
    auto expected_even = AllocateAligned<T>(N);
    auto expected_odd = AllocateAligned<T>(N);

    // Random inputs in each lane
    RandomState rng;
    for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
      for (size_t i = 0; i < N; ++i) {
        in1[i] = Random64(&rng);
        in2[i] = Random64(&rng);
      }

      for (size_t i = 0; i < N; i += 2) {
        expected_even[i] = Mul128(in1[i], in2[i], &expected_even[i + 1]);
        expected_odd[i] = Mul128(in1[i + 1], in2[i + 1], &expected_odd[i + 1]);
      }

      const auto a = Load(d, in1.get());
      const auto b = Load(d, in2.get());
      HWY_ASSERT_VEC_EQ(d, expected_even.get(), MulEven(a, b));
      HWY_ASSERT_VEC_EQ(d, expected_odd.get(), MulOdd(a, b));
    }
#else
    (void)d;
#endif  // HWY_TARGET != HWY_SCALAR
  }
};

HWY_NOINLINE void TestAllMulEven() {
  ForPartialVectors<TestMulEven> test;
  test(int32_t());
  test(uint32_t());

  ForGEVectors<128, TestMulEvenOdd64>()(uint64_t());
}
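
// Mul128Sketch is a hypothetical helper for illustration only; it assumes a
// compiler providing the non-standard unsigned __int128 (GCC/Clang). It
// shows the 64x64 -> 128-bit product against which the MulEven/MulOdd
// results above are checked via Mul128.
static inline uint64_t Mul128Sketch(uint64_t a, uint64_t b, uint64_t* hi) {
  const unsigned __int128 wide = static_cast<unsigned __int128>(a) * b;
  *hi = static_cast<uint64_t>(wide >> 64);
  return static_cast<uint64_t>(wide);  // low 64 bits
}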

struct TestMulAdd {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto k0 = Zero(d);
    const auto kNeg0 = Set(d, T(-0.0));
    const auto v1 = Iota(d, 1);
    const auto v2 = Iota(d, 2);
    const size_t N = Lanes(d);
    auto expected = AllocateAligned<T>(N);
    HWY_ASSERT_VEC_EQ(d, k0, MulAdd(k0, k0, k0));
    HWY_ASSERT_VEC_EQ(d, v2, MulAdd(k0, v1, v2));
    HWY_ASSERT_VEC_EQ(d, v2, MulAdd(v1, k0, v2));
    HWY_ASSERT_VEC_EQ(d, k0, NegMulAdd(k0, k0, k0));
    HWY_ASSERT_VEC_EQ(d, v2, NegMulAdd(k0, v1, v2));
    HWY_ASSERT_VEC_EQ(d, v2, NegMulAdd(v1, k0, v2));

    for (size_t i = 0; i < N; ++i) {
      expected[i] = static_cast<T>((i + 1) * (i + 2));
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), MulAdd(v2, v1, k0));
    HWY_ASSERT_VEC_EQ(d, expected.get(), MulAdd(v1, v2, k0));
    HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulAdd(Neg(v2), v1, k0));
    HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulAdd(v1, Neg(v2), k0));

    for (size_t i = 0; i < N; ++i) {
      expected[i] = static_cast<T>((i + 2) * (i + 2) + (i + 1));
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), MulAdd(v2, v2, v1));
    HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulAdd(Neg(v2), v2, v1));

    for (size_t i = 0; i < N; ++i) {
      expected[i] =
          T(-T(i + 2u) * static_cast<T>(i + 2) + static_cast<T>(1 + i));
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulAdd(v2, v2, v1));

    HWY_ASSERT_VEC_EQ(d, k0, MulSub(k0, k0, k0));
    HWY_ASSERT_VEC_EQ(d, kNeg0, NegMulSub(k0, k0, k0));

    for (size_t i = 0; i < N; ++i) {
      expected[i] = -T(i + 2);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), MulSub(k0, v1, v2));
    HWY_ASSERT_VEC_EQ(d, expected.get(), MulSub(v1, k0, v2));
    HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulSub(Neg(k0), v1, v2));
    HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulSub(v1, Neg(k0), v2));

    for (size_t i = 0; i < N; ++i) {
      expected[i] = static_cast<T>((i + 1) * (i + 2));
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), MulSub(v1, v2, k0));
    HWY_ASSERT_VEC_EQ(d, expected.get(), MulSub(v2, v1, k0));
    HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulSub(Neg(v1), v2, k0));
    HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulSub(v2, Neg(v1), k0));

    for (size_t i = 0; i < N; ++i) {
      expected[i] = static_cast<T>((i + 2) * (i + 2) - (1 + i));
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), MulSub(v2, v2, v1));
    HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulSub(Neg(v2), v2, v1));
  }
};

HWY_NOINLINE void TestAllMulAdd() {
  ForFloatTypes(ForPartialVectors<TestMulAdd>());
}
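
// FusedIdentitiesSketch is a hypothetical helper, shown only to spell out
// the identities the assertions above rely on: MulAdd(a, b, c) = a*b + c,
// NegMulAdd = -a*b + c, MulSub = a*b - c, NegMulSub = -a*b - c. All values
// below are exactly representable in float, so == comparisons are safe.
static inline bool FusedIdentitiesSketch() {
  const float a = 3.0f, b = 5.0f, c = 2.0f;
  return (a * b + c == 17.0f) &&    // MulAdd
         (-a * b + c == -13.0f) &&  // NegMulAdd
         (a * b - c == 13.0f) &&    // MulSub
         (-a * b - c == -17.0f);    // NegMulSub
}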

struct TestReorderWidenMulAccumulate {
  template <typename TN, class DN>
  HWY_NOINLINE void operator()(TN /*unused*/, DN dn) {
    using TW = MakeWide<TN>;
    const RepartitionToWide<DN> dw;
    const auto f0 = Zero(dw);
    const auto f1 = Set(dw, 1.0f);
    const auto fi = Iota(dw, 1);
    const auto bf0 = ReorderDemote2To(dn, f0, f0);
    const auto bf1 = ReorderDemote2To(dn, f1, f1);
    const auto bfi = ReorderDemote2To(dn, fi, fi);
    const size_t NW = Lanes(dw);
    auto delta = AllocateAligned<TW>(2 * NW);
    for (size_t i = 0; i < 2 * NW; ++i) {
      delta[i] = 0.0f;
    }

    // Any input zero => both outputs zero
    auto sum1 = f0;
    HWY_ASSERT_VEC_EQ(dw, f0,
                      ReorderWidenMulAccumulate(dw, bf0, bf0, f0, sum1));
    HWY_ASSERT_VEC_EQ(dw, f0, sum1);
    HWY_ASSERT_VEC_EQ(dw, f0,
                      ReorderWidenMulAccumulate(dw, bf0, bfi, f0, sum1));
    HWY_ASSERT_VEC_EQ(dw, f0, sum1);
    HWY_ASSERT_VEC_EQ(dw, f0,
                      ReorderWidenMulAccumulate(dw, bfi, bf0, f0, sum1));
    HWY_ASSERT_VEC_EQ(dw, f0, sum1);

    // delta[p] := 1.0, all others zero. For each p: Dot(delta, all-ones) == 1.
    for (size_t p = 0; p < 2 * NW; ++p) {
      delta[p] = 1.0f;
      const auto delta0 = Load(dw, delta.get() + 0);
      const auto delta1 = Load(dw, delta.get() + NW);
      delta[p] = 0.0f;
      const auto bf_delta = ReorderDemote2To(dn, delta0, delta1);

      {
        sum1 = f0;
        const auto sum0 =
            ReorderWidenMulAccumulate(dw, bf_delta, bf1, f0, sum1);
        HWY_ASSERT_EQ(1.0f, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
      }
      // Swapped arg order
      {
        sum1 = f0;
        const auto sum0 =
            ReorderWidenMulAccumulate(dw, bf1, bf_delta, f0, sum1);
        HWY_ASSERT_EQ(1.0f, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
      }
      // Start with nonzero sum0 or sum1
      {
        sum1 = delta1;
        const auto sum0 =
            ReorderWidenMulAccumulate(dw, bf_delta, bf1, delta0, sum1);
        HWY_ASSERT_EQ(2.0f, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
      }
      // Start with nonzero sum0 or sum1, and swap arg order
      {
        sum1 = delta1;
        const auto sum0 =
            ReorderWidenMulAccumulate(dw, bf1, bf_delta, delta0, sum1);
        HWY_ASSERT_EQ(2.0f, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
      }
    }
  }
};

HWY_NOINLINE void TestAllReorderWidenMulAccumulate() {
  ForShrinkableVectors<TestReorderWidenMulAccumulate>()(bfloat16_t());
}
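
// ReorderWidenDotSketch is a hypothetical scalar model, not a Highway API:
// ReorderWidenMulAccumulate widens each bf16 lane to f32 and accumulates the
// products into two sums whose lane order is unspecified, which is why the
// assertions above only inspect the reduction of sum0 + sum1.
static inline float ReorderWidenDotSketch(const float* a, const float* b,
                                          size_t num) {
  float sum0 = 0.0f, sum1 = 0.0f;
  for (size_t i = 0; i + 1 < num; i += 2) {
    sum0 += a[i] * b[i];          // "even" products
    sum1 += a[i + 1] * b[i + 1];  // "odd" products
  }
  return sum0 + sum1;  // the only quantity the tests depend on
}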

struct TestDiv {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto v = Iota(d, T(-2));
    const auto v1 = Set(d, T(1));

    // Unchanged after division by 1.
    HWY_ASSERT_VEC_EQ(d, v, Div(v, v1));

    const size_t N = Lanes(d);
    auto expected = AllocateAligned<T>(N);
    for (size_t i = 0; i < N; ++i) {
      expected[i] = (T(i) - 2) / T(2);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Div(v, Set(d, T(2))));
  }
};

HWY_NOINLINE void TestAllDiv() { ForFloatTypes(ForPartialVectors<TestDiv>()); }

struct TestApproximateReciprocal {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto v = Iota(d, T(-2));
    const auto nonzero = IfThenElse(Eq(v, Zero(d)), Set(d, T(1)), v);
    const size_t N = Lanes(d);
    auto input = AllocateAligned<T>(N);
    Store(nonzero, d, input.get());

    auto actual = AllocateAligned<T>(N);
    Store(ApproximateReciprocal(nonzero), d, actual.get());

    double max_l1 = 0.0;
    double worst_expected = 0.0;
    double worst_actual = 0.0;
    for (size_t i = 0; i < N; ++i) {
      const double expected = 1.0 / input[i];
      const double l1 = std::abs(expected - actual[i]);
      if (l1 > max_l1) {
        max_l1 = l1;
        worst_expected = expected;
        worst_actual = actual[i];
      }
    }
    const double abs_worst_expected = std::abs(worst_expected);
    if (abs_worst_expected > 1E-5) {
      const double max_rel = max_l1 / abs_worst_expected;
      fprintf(stderr, "max l1 %f rel %f (%f vs %f)\n", max_l1, max_rel,
              worst_expected, worst_actual);
      HWY_ASSERT(max_rel < 0.004);
    }
  }
};

HWY_NOINLINE void TestAllApproximateReciprocal() {
  ForPartialVectors<TestApproximateReciprocal>()(float());
}

struct TestSquareRoot {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto vi = Iota(d, 0);
    HWY_ASSERT_VEC_EQ(d, vi, Sqrt(Mul(vi, vi)));
  }
};

HWY_NOINLINE void TestAllSquareRoot() {
  ForFloatTypes(ForPartialVectors<TestSquareRoot>());
}

struct TestReciprocalSquareRoot {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto v = Set(d, 123.0f);
    const size_t N = Lanes(d);
    auto lanes = AllocateAligned<T>(N);
    Store(ApproximateReciprocalSqrt(v), d, lanes.get());
    for (size_t i = 0; i < N; ++i) {
      float err = lanes[i] - 0.090166f;
      if (err < 0.0f) err = -err;
      if (err >= 4E-4f) {
        HWY_ABORT("Lane %" PRIu64 "(%" PRIu64 "): actual %f err %f\n",
                  static_cast<uint64_t>(i), static_cast<uint64_t>(N), lanes[i],
                  err);
      }
    }
  }
};

HWY_NOINLINE void TestAllReciprocalSquareRoot() {
  ForPartialVectors<TestReciprocalSquareRoot>()(float());
}
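
// Rsqrt123Sketch is a hypothetical helper showing where the magic constant
// above comes from (assumes <cmath>, as used elsewhere in this file):
// 1 / sqrt(123.0) = 0.0901670..., so comparing against 0.090166 with a
// 4E-4 tolerance leaves headroom for reduced-precision rsqrt estimates.
static inline float Rsqrt123Sketch() {
  return 1.0f / std::sqrt(123.0f);  // ~0.0901670f
}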

template <typename T, class D>
AlignedFreeUniquePtr<T[]> RoundTestCases(T /*unused*/, D d, size_t& padded) {
  const T eps = std::numeric_limits<T>::epsilon();
  const T test_cases[] = {
      // +/- 1
      T(1),
      T(-1),
      // +/- 0
      T(0),
      T(-0),
      // near 0
      T(0.4),
      T(-0.4),
      // +/- integer
      T(4),
      T(-32),
      // positive near limit
      MantissaEnd<T>() - T(1.5),
      MantissaEnd<T>() + T(1.5),
      // negative near limit
      -MantissaEnd<T>() - T(1.5),
      -MantissaEnd<T>() + T(1.5),
      // positive tiebreak
      T(1.5),
      T(2.5),
      // negative tiebreak
      T(-1.5),
      T(-2.5),
      // positive +/- delta
      T(2.0001),
      T(3.9999),
      // negative +/- delta
      T(-999.9999),
      T(-998.0001),
      // positive +/- epsilon
      T(1) + eps,
      T(1) - eps,
      // negative +/- epsilon
      T(-1) + eps,
      T(-1) - eps,
      // +/- huge (but still fits in float)
      T(1E34),
      T(-1E35),
      // +/- infinity
      std::numeric_limits<T>::infinity(),
      -std::numeric_limits<T>::infinity(),
      // qNaN
      GetLane(NaN(d))};
  const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
  const size_t N = Lanes(d);
  padded = RoundUpTo(kNumTestCases, N);  // allow loading whole vectors
  auto in = AllocateAligned<T>(padded);
  auto expected = AllocateAligned<T>(padded);
  std::copy(test_cases, test_cases + kNumTestCases, in.get());
  std::fill(in.get() + kNumTestCases, in.get() + padded, T(0));
  return in;
}
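
// MantissaEndFloatSketch is a hypothetical helper noting why the "near
// limit" cases matter (constant shown for float only): at and beyond 2^23,
// a float can no longer represent a fractional part, so rounding behavior
// is probed right at that boundary.
static inline float MantissaEndFloatSketch() {
  return 8388608.0f;  // 2^23
}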

struct TestRound {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T t, D d) {
    size_t padded;
    auto in = RoundTestCases(t, d, padded);
    auto expected = AllocateAligned<T>(padded);

    for (size_t i = 0; i < padded; ++i) {
      // Avoid [std::]round, which does not round to nearest *even*.
      // NOTE: std:: version from C++11 cmath is not defined in RVV GCC, see
      // https://lists.freebsd.org/pipermail/freebsd-current/2014-January/048130.html
      expected[i] = static_cast<T>(nearbyint(in[i]));
    }
    for (size_t i = 0; i < padded; i += Lanes(d)) {
      HWY_ASSERT_VEC_EQ(d, &expected[i], Round(Load(d, &in[i])));
    }
  }
};

HWY_NOINLINE void TestAllRound() {
  ForFloatTypes(ForPartialVectors<TestRound>());
}
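
// RoundToEvenSketch is a hypothetical helper illustrating the tiebreak
// difference noted above (assumes the default FE_TONEAREST rounding mode):
// nearbyint rounds halfway cases to the nearest even integer, while
// std::round rounds them away from zero.
static inline bool RoundToEvenSketch() {
  return nearbyint(2.5) == 2.0 && nearbyint(3.5) == 4.0 &&
         std::round(2.5) == 3.0;  // all true
}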

struct TestNearestInt {
  template <typename TF, class DF>
  HWY_NOINLINE void operator()(TF tf, const DF df) {
    using TI = MakeSigned<TF>;
    const RebindToSigned<DF> di;

    size_t padded;
    auto in = RoundTestCases(tf, df, padded);
    auto expected = AllocateAligned<TI>(padded);

    constexpr double max = static_cast<double>(LimitsMax<TI>());
    for (size_t i = 0; i < padded; ++i) {
      if (std::isnan(in[i])) {
        // We replace NaN with 0 below (no_nan)
        expected[i] = 0;
      } else if (std::isinf(in[i]) || double(std::abs(in[i])) >= max) {
        // Avoid undefined result for lrintf
        expected[i] = std::signbit(in[i]) ? LimitsMin<TI>() : LimitsMax<TI>();
      } else {
        expected[i] = static_cast<TI>(lrintf(in[i]));
      }
    }
    for (size_t i = 0; i < padded; i += Lanes(df)) {
      const auto v = Load(df, &in[i]);
      const auto no_nan = IfThenElse(Eq(v, v), v, Zero(df));
      HWY_ASSERT_VEC_EQ(di, &expected[i], NearestInt(no_nan));
    }
  }
};

HWY_NOINLINE void TestAllNearestInt() {
  ForPartialVectors<TestNearestInt>()(float());
}
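
// ZeroIfNaNSketch is a hypothetical scalar model of the Eq(v, v) idiom
// above: IEEE comparisons involving NaN are false, so x == x fails exactly
// for NaN lanes, which the test then replaces with zero.
static inline float ZeroIfNaNSketch(float x) {
  return (x == x) ? x : 0.0f;
}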

struct TestTrunc {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T t, D d) {
    size_t padded;
    auto in = RoundTestCases(t, d, padded);
    auto expected = AllocateAligned<T>(padded);

    for (size_t i = 0; i < padded; ++i) {
      // NOTE: std:: version from C++11 cmath is not defined in RVV GCC, see
      // https://lists.freebsd.org/pipermail/freebsd-current/2014-January/048130.html
      expected[i] = static_cast<T>(trunc(in[i]));
    }
    for (size_t i = 0; i < padded; i += Lanes(d)) {
      HWY_ASSERT_VEC_EQ(d, &expected[i], Trunc(Load(d, &in[i])));
    }
  }
};

HWY_NOINLINE void TestAllTrunc() {
  ForFloatTypes(ForPartialVectors<TestTrunc>());
}

struct TestCeil {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T t, D d) {
    size_t padded;
    auto in = RoundTestCases(t, d, padded);
    auto expected = AllocateAligned<T>(padded);

    for (size_t i = 0; i < padded; ++i) {
      expected[i] = std::ceil(in[i]);
    }
    for (size_t i = 0; i < padded; i += Lanes(d)) {
      HWY_ASSERT_VEC_EQ(d, &expected[i], Ceil(Load(d, &in[i])));
    }
  }
};

HWY_NOINLINE void TestAllCeil() {
  ForFloatTypes(ForPartialVectors<TestCeil>());
}

struct TestFloor {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T t, D d) {
    size_t padded;
    auto in = RoundTestCases(t, d, padded);
    auto expected = AllocateAligned<T>(padded);

    for (size_t i = 0; i < padded; ++i) {
      expected[i] = std::floor(in[i]);
    }
    for (size_t i = 0; i < padded; i += Lanes(d)) {
      HWY_ASSERT_VEC_EQ(d, &expected[i], Floor(Load(d, &in[i])));
    }
  }
};

HWY_NOINLINE void TestAllFloor() {
  ForFloatTypes(ForPartialVectors<TestFloor>());
}

struct TestSumOfLanes {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    auto in_lanes = AllocateAligned<T>(N);

    // Lane i = bit i, higher lanes 0
    double sum = 0.0;
    // Avoid setting sign bit and cap at double precision
    constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51);
    for (size_t i = 0; i < N; ++i) {
      in_lanes[i] = i < kBits ? static_cast<T>(1ull << i) : 0;
      sum += static_cast<double>(in_lanes[i]);
    }
    HWY_ASSERT_VEC_EQ(d, Set(d, T(sum)),
                      SumOfLanes(d, Load(d, in_lanes.get())));

    // Lane i = i (iota) to include upper lanes
    sum = 0.0;
    for (size_t i = 0; i < N; ++i) {
      sum += static_cast<double>(i);
    }
    HWY_ASSERT_VEC_EQ(d, Set(d, T(sum)), SumOfLanes(d, Iota(d, 0)));
  }
};

HWY_NOINLINE void TestAllSumOfLanes() {
  ForUIF3264(ForPartialVectors<TestSumOfLanes>());
}
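
// SumOfBitsSketch is a hypothetical helper explaining the "lane i = bit i"
// inputs above: distinct powers of two below the 51-bit cap add without
// rounding even in a double accumulator, since the total 2^num_bits - 1
// still fits within the 52-bit double mantissa.
static inline double SumOfBitsSketch(size_t num_bits) {  // num_bits <= 51
  double sum = 0.0;
  for (size_t i = 0; i < num_bits; ++i) {
    sum += static_cast<double>(1ull << i);
  }
  return sum;  // exactly 2^num_bits - 1
}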

struct TestMinOfLanes {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    auto in_lanes = AllocateAligned<T>(N);

    // Lane i = bit i, higher lanes = 2 (not the minimum)
    T min = HighestValue<T>();
    // Avoid setting sign bit and cap at double precision
    constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51);
    for (size_t i = 0; i < N; ++i) {
      in_lanes[i] = i < kBits ? static_cast<T>(1ull << i) : 2;
      min = HWY_MIN(min, in_lanes[i]);
    }
    HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get())));

    // Lane i = N - i to include upper lanes
    min = HighestValue<T>();
    for (size_t i = 0; i < N; ++i) {
      in_lanes[i] = static_cast<T>(N - i);  // no 8-bit T so no wraparound
      min = HWY_MIN(min, in_lanes[i]);
    }
    HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get())));
  }
};

struct TestMaxOfLanes {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    auto in_lanes = AllocateAligned<T>(N);

    T max = LowestValue<T>();
    // Avoid setting sign bit and cap at double precision
    constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51);
    for (size_t i = 0; i < N; ++i) {
      in_lanes[i] = i < kBits ? static_cast<T>(1ull << i) : 0;
      max = HWY_MAX(max, in_lanes[i]);
    }
    HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get())));

    // Lane i = i to include upper lanes
    max = LowestValue<T>();
    for (size_t i = 0; i < N; ++i) {
      in_lanes[i] = static_cast<T>(i);  // no 8-bit T so no wraparound
      max = HWY_MAX(max, in_lanes[i]);
    }
    HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get())));
  }
};

HWY_NOINLINE void TestAllMinMaxOfLanes() {
  const ForPartialVectors<TestMinOfLanes> test_min;
  const ForPartialVectors<TestMaxOfLanes> test_max;
  ForUIF3264(test_min);
  ForUIF3264(test_max);
  test_min(uint16_t());
  test_max(uint16_t());
  test_min(int16_t());
  test_max(int16_t());
}

struct TestAbsDiff {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    auto in_lanes_a = AllocateAligned<T>(N);
    auto in_lanes_b = AllocateAligned<T>(N);
    auto out_lanes = AllocateAligned<T>(N);
    for (size_t i = 0; i < N; ++i) {
      in_lanes_a[i] = static_cast<T>((i ^ 1u) << i);
      in_lanes_b[i] = static_cast<T>(i << i);
      out_lanes[i] = std::abs(in_lanes_a[i] - in_lanes_b[i]);
    }
    const auto a = Load(d, in_lanes_a.get());
    const auto b = Load(d, in_lanes_b.get());
    const auto expected = Load(d, out_lanes.get());
    HWY_ASSERT_VEC_EQ(d, expected, AbsDiff(a, b));
    HWY_ASSERT_VEC_EQ(d, expected, AbsDiff(b, a));
  }
};

HWY_NOINLINE void TestAllAbsDiff() {
  ForPartialVectors<TestAbsDiff>()(float());
}

struct TestSumsOf8 {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    RandomState rng;

    const size_t N = Lanes(d);
    if (N < 8) return;
    const Repartition<uint64_t, D> du64;

    auto in_lanes = AllocateAligned<T>(N);
    auto sum_lanes = AllocateAligned<uint64_t>(N / 8);

    for (size_t rep = 0; rep < 100; ++rep) {
      for (size_t i = 0; i < N; ++i) {
        in_lanes[i] = Random64(&rng) & 0xFF;
      }

      for (size_t idx_sum = 0; idx_sum < N / 8; ++idx_sum) {
        uint64_t sum = 0;
        for (size_t i = 0; i < 8; ++i) {
          sum += in_lanes[idx_sum * 8 + i];
        }
        sum_lanes[idx_sum] = sum;
      }

      const Vec<D> in = Load(d, in_lanes.get());
      HWY_ASSERT_VEC_EQ(du64, sum_lanes.get(), SumsOf8(in));
    }
  }
};

HWY_NOINLINE void TestAllSumsOf8() {
  ForGEVectors<64, TestSumsOf8>()(uint8_t());
}

struct TestNeg {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto v0 = Zero(d);
    const auto vn = Set(d, T(-3));
    const auto vp = Set(d, T(3));
    HWY_ASSERT_VEC_EQ(d, v0, Neg(v0));
    HWY_ASSERT_VEC_EQ(d, vp, Neg(vn));
    HWY_ASSERT_VEC_EQ(d, vn, Neg(vp));
  }
};

HWY_NOINLINE void TestAllNeg() {
  ForSignedTypes(ForPartialVectors<TestNeg>());
  ForFloatTypes(ForPartialVectors<TestNeg>());
}

struct TestMinMax128Upper {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    using V = Vec<D>;
    const size_t N = Lanes(d);
    auto a_lanes = AllocateAligned<T>(N);
    auto b_lanes = AllocateAligned<T>(N);
    auto min_lanes = AllocateAligned<T>(N);
    auto max_lanes = AllocateAligned<T>(N);
    RandomState rng;

    const V v00 = Zero(d);
    const V v01 = Make128(d, 0, 1);
    const V v10 = Make128(d, 1, 0);
    const V v11 = Add(v01, v10);

    // Same arg
    HWY_ASSERT_VEC_EQ(d, v00, Min128Upper(d, v00, v00));
    HWY_ASSERT_VEC_EQ(d, v01, Min128Upper(d, v01, v01));
    HWY_ASSERT_VEC_EQ(d, v10, Min128Upper(d, v10, v10));
    HWY_ASSERT_VEC_EQ(d, v11, Min128Upper(d, v11, v11));
    HWY_ASSERT_VEC_EQ(d, v00, Max128Upper(d, v00, v00));
    HWY_ASSERT_VEC_EQ(d, v01, Max128Upper(d, v01, v01));
    HWY_ASSERT_VEC_EQ(d, v10, Max128Upper(d, v10, v10));
    HWY_ASSERT_VEC_EQ(d, v11, Max128Upper(d, v11, v11));

    // Equivalent but not equal (chooses second arg)
    HWY_ASSERT_VEC_EQ(d, v01, Min128Upper(d, v00, v01));
    HWY_ASSERT_VEC_EQ(d, v11, Min128Upper(d, v10, v11));
    HWY_ASSERT_VEC_EQ(d, v00, Min128Upper(d, v01, v00));
    HWY_ASSERT_VEC_EQ(d, v10, Min128Upper(d, v11, v10));
    HWY_ASSERT_VEC_EQ(d, v00, Max128Upper(d, v01, v00));
    HWY_ASSERT_VEC_EQ(d, v10, Max128Upper(d, v11, v10));
    HWY_ASSERT_VEC_EQ(d, v01, Max128Upper(d, v00, v01));
    HWY_ASSERT_VEC_EQ(d, v11, Max128Upper(d, v10, v11));

    // First arg less
    HWY_ASSERT_VEC_EQ(d, v01, Min128Upper(d, v01, v10));
    HWY_ASSERT_VEC_EQ(d, v10, Max128Upper(d, v01, v10));

    // Second arg less
    HWY_ASSERT_VEC_EQ(d, v01, Min128Upper(d, v10, v01));
    HWY_ASSERT_VEC_EQ(d, v10, Max128Upper(d, v10, v01));

    // Also check 128-bit blocks are independent
    for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
      for (size_t i = 0; i < N; ++i) {
        a_lanes[i] = Random64(&rng);
        b_lanes[i] = Random64(&rng);
      }
      const V a = Load(d, a_lanes.get());
      const V b = Load(d, b_lanes.get());
      for (size_t i = 0; i < N; i += 2) {
        const bool lt = a_lanes[i + 1] < b_lanes[i + 1];
        min_lanes[i + 0] = lt ? a_lanes[i + 0] : b_lanes[i + 0];
        min_lanes[i + 1] = lt ? a_lanes[i + 1] : b_lanes[i + 1];
        max_lanes[i + 0] = lt ? b_lanes[i + 0] : a_lanes[i + 0];
        max_lanes[i + 1] = lt ? b_lanes[i + 1] : a_lanes[i + 1];
      }
      HWY_ASSERT_VEC_EQ(d, min_lanes.get(), Min128Upper(d, a, b));
      HWY_ASSERT_VEC_EQ(d, max_lanes.get(), Max128Upper(d, a, b));
    }
  }
};

HWY_NOINLINE void TestAllMinMax128Upper() {
  ForGEVectors<128, TestMinMax128Upper>()(uint64_t());
}
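
// Min128UpperSketch is a hypothetical scalar model of one 128-bit block:
// only the upper u64 lane (index 1 within the block) is compared, the whole
// block of the lesser key is returned, and ties prefer the second argument.
static inline void Min128UpperSketch(const uint64_t a[2], const uint64_t b[2],
                                     uint64_t out[2]) {
  const bool lt = a[1] < b[1];
  out[0] = lt ? a[0] : b[0];
  out[1] = lt ? a[1] : b[1];
}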

// NOLINTNEXTLINE(google-readability-namespace-comments)

@ -1127,35 +435,12 @@ namespace hwy {
HWY_BEFORE_TEST(HwyArithmeticTest);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllPlusMinus);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSaturatingArithmetic);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAverage);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAbs);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMul);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMulHigh);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMulEven);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMulAdd);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllReorderWidenMulAccumulate);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllDiv);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllApproximateReciprocal);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSquareRoot);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllReciprocalSquareRoot);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSumOfLanes);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMaxOfLanes);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllRound);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllNearestInt);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllTrunc);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllCeil);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllFloor);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAbsDiff);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSumsOf8);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllNeg);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax128);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax128Upper);
}  // namespace hwy

// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}

#endif