Mirror of https://github.com/mozilla/gecko-dev.git

Bug 1743793 - Update libjxl and highway r=tnikkel

Differential Revision: https://phabricator.services.mozilla.com/D132952

This commit is contained in:
Parent: 8d68ea1d86
Commit: 59265056dd
third_party/highway/moz.build

@@ -18,6 +18,8 @@ EXPORTS.hwy += [
    "/third_party/highway/hwy/aligned_allocator.h",
    "/third_party/highway/hwy/base.h",
    "/third_party/highway/hwy/cache_control.h",
    "/third_party/highway/hwy/detect_compiler_arch.h",
    "/third_party/highway/hwy/detect_targets.h",
    "/third_party/highway/hwy/foreach_target.h",
    "/third_party/highway/hwy/highway.h",
    "/third_party/highway/hwy/targets.h",

@@ -25,6 +27,8 @@ EXPORTS.hwy += [

EXPORTS.hwy.ops += [
    "/third_party/highway/hwy/ops/arm_neon-inl.h",
    "/third_party/highway/hwy/ops/arm_sve-inl.h",
    "/third_party/highway/hwy/ops/generic_ops-inl.h",
    "/third_party/highway/hwy/ops/rvv-inl.h",
    "/third_party/highway/hwy/ops/scalar-inl.h",
    "/third_party/highway/hwy/ops/set_macros-inl.h",
third_party/highway/moz.yaml

@@ -1,43 +1,44 @@
 # Version of this schema
 schema: 1

 bugzilla:
   # Bugzilla product and component for this directory and subdirectories
   product: Core
   component: "ImageLib"

 # Document the source of externally hosted code
 origin:

   # Short name of the package/library
   name: highway

   description: Performance-portable, length-agnostic SIMD with runtime dispatch

   # Full URL for the package's homepage/etc
   # Usually different from repository url
   url: https://github.com/google/highway

   # Human-readable identifier for this version/release
   # Generally "version NNN", "tag SSS", "bookmark SSS"
-  release: commit e2397743fe092df68b760d358253773699a16c93 (2021-06-09T08:56:32Z).
+  release: commit e69083a12a05caf037cabecdf1b248b7579705a5 (2021-11-11T08:20:00Z).

   # Revision to pull in
   # Must be a long or short commit SHA (long preferred)
-  revision: e2397743fe092df68b760d358253773699a16c93
+  revision: e69083a12a05caf037cabecdf1b248b7579705a5

   # The package's license, where possible using the mnemonic from
   # https://spdx.org/licenses/
   # Multiple licenses can be specified (as a YAML list)
   # A "LICENSE" file must exist containing the full license text
   license: Apache-2.0

   license-file: LICENSE

 vendoring:
   url: https://github.com/google/highway
   source-hosting: github
   vendor-directory: third_party/highway

   exclude:
     - g3doc/
media/libjxl/moz.yaml

@@ -20,12 +20,12 @@ origin:

   # Human-readable identifier for this version/release
   # Generally "version NNN", "tag SSS", "bookmark SSS"
-  release: commit 9e8c5766ba32c008ddcaf11f0fac591aa7964f7b (2021-11-10T07:51:06Z).
+  release: commit a9100da7143e4f367d2e26f4d11e457e85343543 (2021-11-25T17:57:37Z).

   # Revision to pull in
   # Must be a long or short commit SHA (long preferred)
   # NOTE(krosylight): Update highway together when updating this!
-  revision: 9e8c5766ba32c008ddcaf11f0fac591aa7964f7b
+  revision: a9100da7143e4f367d2e26f4d11e457e85343543

   # The package's license, where possible using the mnemonic from
   # https://spdx.org/licenses/

@@ -53,3 +53,4 @@ vendoring:
     - third_party/testdata/
+    - tools/
third_party/highway/BUILD

@@ -41,9 +41,15 @@ selects.config_setting_group(
    ],
)

+config_setting(
+    name = "emulate_sve",
+    values = {
+        "copt": "-DHWY_EMULATE_SVE",
+    },
+)
+
# Additional warnings for Clang OR GCC (skip for MSVC)
CLANG_GCC_COPTS = [
    "-Werror",
    "-Wunused-parameter",
    "-Wunused-variable",
    "-Wextra-semi",

@@ -74,13 +80,19 @@ COPTS = select({
    ":compiler_gcc": CLANG_GCC_COPTS,
    # Default to clang because compiler detection only works in Bazel
    "//conditions:default": CLANG_GCC_COPTS + CLANG_ONLY_COPTS,
}) + select({
    "@platforms//cpu:riscv64": [
        "-march=rv64gcv0p10",
        "-menable-experimental-extensions",
    ],
    "//conditions:default": [
    ],
})

-# Unused on Bazel builds, where these are not defined/known; Copybara replaces
-# usages of this with an empty list.
+# Unused on Bazel builds, where this is not defined/known; Copybara replaces
+# usages with an empty list.
COMPAT = [
    "//buildenv/target:mobile",
    "//buildenv/target:vendor",
    "//buildenv/target:non_prod",  # includes mobile/vendor.
]

# WARNING: changing flags such as HWY_DISABLED_TARGETS may break users without
@@ -94,24 +106,23 @@ cc_library(
    srcs = [
        "hwy/aligned_allocator.cc",
        "hwy/targets.cc",
    ],
    # Normal headers with include guards
    hdrs = [
        "hwy/aligned_allocator.h",
        "hwy/base.h",
        "hwy/cache_control.h",
        "hwy/highway.h",
        "hwy/detect_compiler_arch.h",  # private
        "hwy/detect_targets.h",  # private
        "hwy/targets.h",
    ],
    compatible_with = [],
    copts = COPTS,
    # TODO(janwas): remove once WASM toolchain supports *extend instead of *widen
    defines = select({
        "//tools/cc_target_os:wasm": ["HWY_WASM_OLD_NAMES"],
        "//conditions:default": [],
    }),
    textual_hdrs = [
        "hwy/highway.h",  # public
        "hwy/foreach_target.h",  # public
        "hwy/ops/arm_neon-inl.h",
        "hwy/ops/arm_sve-inl.h",
        "hwy/ops/generic_ops-inl.h",
        "hwy/ops/rvv-inl.h",
        "hwy/ops/scalar-inl.h",
        "hwy/ops/set_macros-inl.h",

@@ -121,6 +132,19 @@ cc_library(
        "hwy/ops/x86_256-inl.h",
        "hwy/ops/x86_512-inl.h",
    ],
    deps = select({
        ":emulate_sve": ["//third_party/farm_sve"],
        "//conditions:default": [],
    }),
)

cc_library(
    name = "dot",
    compatible_with = [],
    textual_hdrs = [
        "hwy/contrib/dot/dot-inl.h",
    ],
    deps = [":hwy"],
)

cc_library(

@@ -144,9 +168,26 @@ cc_library(
    deps = [":hwy"],
)

cc_library(
    name = "sort",
    compatible_with = [],
    textual_hdrs = [
        "hwy/contrib/sort/sort-inl.h",
    ],
    deps = [":hwy"],
)

# Everything required for tests that use Highway.
cc_library(
    name = "hwy_test_util",
-   textual_hdrs = ["hwy/tests/test_util-inl.h"],
+   srcs = ["hwy/tests/test_util.cc"],
+   hdrs = ["hwy/tests/test_util.h"],
+   textual_hdrs = [
+       "hwy/tests/test_util-inl.h",
+       "hwy/tests/hwy_gtest.h",
+   ],
    # Must not depend on a gtest variant, which can conflict with the
    # GUNIT_INTERNAL_BUILD_MODE defined by the test.
    deps = [":hwy"],
)
|
|||
|
||||
# path, name
|
||||
HWY_TESTS = [
|
||||
("hwy/contrib/dot/", "dot_test"),
|
||||
("hwy/contrib/image/", "image_test"),
|
||||
("hwy/contrib/math/", "math_test"),
|
||||
("hwy/contrib/sort/", "sort_test"),
|
||||
("hwy/examples/", "skeleton_test"),
|
||||
("hwy/", "nanobenchmark_test"),
|
||||
("hwy/", "aligned_allocator_test"),
|
||||
|
@ -191,10 +234,13 @@ HWY_TESTS = [
|
|||
("hwy/", "highway_test"),
|
||||
("hwy/", "targets_test"),
|
||||
("hwy/tests/", "arithmetic_test"),
|
||||
("hwy/tests/", "blockwise_test"),
|
||||
("hwy/tests/", "combine_test"),
|
||||
("hwy/tests/", "compare_test"),
|
||||
("hwy/tests/", "convert_test"),
|
||||
("hwy/tests/", "crypto_test"),
|
||||
("hwy/tests/", "logical_test"),
|
||||
("hwy/tests/", "mask_test"),
|
||||
("hwy/tests/", "memory_test"),
|
||||
("hwy/tests/", "swizzle_test"),
|
||||
("hwy/tests/", "test_util_test"),
|
||||
|
@ -209,16 +255,32 @@ HWY_TESTS = [
|
|||
srcs = [
|
||||
subdir + test + ".cc",
|
||||
],
|
||||
copts = COPTS,
|
||||
# for test_suite. math_test is not yet supported on RVV.
|
||||
tags = ["hwy_ops_test"] if test != "math_test" else [],
|
||||
copts = COPTS + [
|
||||
# gTest triggers this warning (which is enabled by the
|
||||
# extra-semi in COPTS), so we need to disable it here,
|
||||
# but it's still enabled for :hwy.
|
||||
"-Wno-c++98-compat-extra-semi",
|
||||
],
|
||||
features = select({
|
||||
"@platforms//cpu:riscv64": ["fully_static_link"],
|
||||
"//conditions:default": [],
|
||||
}),
|
||||
linkstatic = select({
|
||||
"@platforms//cpu:riscv64": True,
|
||||
"//conditions:default": False,
|
||||
}),
|
||||
local_defines = ["HWY_IS_TEST"],
|
||||
# for test_suite.
|
||||
tags = ["hwy_ops_test"],
|
||||
deps = [
|
||||
":dot",
|
||||
":hwy",
|
||||
":hwy_test_util",
|
||||
":image",
|
||||
":math",
|
||||
":nanobenchmark",
|
||||
":skeleton",
|
||||
":sort",
|
||||
"@com_google_googletest//:gtest_main",
|
||||
],
|
||||
),
|
||||
|
|
|
third_party/highway/CMakeLists.txt

@@ -19,7 +19,7 @@ if(POLICY CMP0083)
  cmake_policy(SET CMP0083 NEW)
endif()

-project(hwy VERSION 0.12.2)  # Keep in sync with highway.h version
+project(hwy VERSION 0.15.0)  # Keep in sync with highway.h version

set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_EXTENSIONS OFF)

@@ -42,6 +42,12 @@ endif()

set(HWY_CMAKE_ARM7 OFF CACHE BOOL "Set copts for ARMv7 with NEON?")

# Unconditionally adding -Werror risks breaking the build when new warnings
# arise due to compiler/platform changes. Enable this in CI/tests.
set(HWY_WARNINGS_ARE_ERRORS OFF CACHE BOOL "Add -Werror flag?")

set(HWY_EXAMPLES_TESTS_INSTALL ON CACHE BOOL "Build examples, tests, install?")

include(CheckCXXSourceCompiles)
check_cxx_source_compiles(
  "int main() {

@@ -54,9 +60,11 @@ check_cxx_source_compiles(
)

set(HWY_CONTRIB_SOURCES
  hwy/contrib/dot/dot-inl.h
  hwy/contrib/image/image.cc
  hwy/contrib/image/image.h
  hwy/contrib/math/math-inl.h
  hwy/contrib/sort/sort-inl.h
)

set(HWY_SOURCES
@@ -64,12 +72,15 @@ set(HWY_SOURCES
  hwy/aligned_allocator.h
  hwy/base.h
  hwy/cache_control.h
  hwy/detect_compiler_arch.h  # private
  hwy/detect_targets.h  # private
  hwy/foreach_target.h
  hwy/highway.h
  hwy/nanobenchmark.cc
  hwy/nanobenchmark.h
  hwy/ops/arm_neon-inl.h
  hwy/ops/arm_sve-inl.h
  hwy/ops/generic_ops-inl.h
  hwy/ops/scalar-inl.h
  hwy/ops/set_macros-inl.h
  hwy/ops/shared-inl.h

@@ -79,7 +90,13 @@ set(HWY_SOURCES
  hwy/ops/x86_512-inl.h
  hwy/targets.cc
  hwy/targets.h
)

set(HWY_TEST_SOURCES
  hwy/tests/hwy_gtest.h
  hwy/tests/test_util-inl.h
  hwy/tests/test_util.cc
  hwy/tests/test_util.h
)

if (MSVC)
@@ -98,16 +115,15 @@ else()
    # Warnings
    -Wall
    -Wextra
    -Wformat-security
    -Wno-unused-function
    -Wnon-virtual-dtor
    -Woverloaded-virtual
    # These are not included in Wall nor Wextra:
    -Wconversion
    -Wsign-conversion
    -Wvla
    -Wnon-virtual-dtor
  )

  if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
    list(APPEND HWY_FLAGS
      -Wc++2a-extensions
      -Wfloat-overflow-conversion
      -Wfloat-zero-conversion
      -Wfor-loop-analysis

@@ -126,32 +142,40 @@ else()
      # Use color in messages
      -fdiagnostics-show-option -fcolor-diagnostics
    )
    if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 6.0)
      list(APPEND HWY_FLAGS -Wc++2a-extensions)
    endif()
  endif()

  if (WIN32)
    if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
      list(APPEND HWY_FLAGS
        -Wno-global-constructors
        -Wno-language-extension-token
        -Wno-used-but-marked-unused
        -Wno-shadow-field-in-constructor
        -Wno-unused-member-function
        -Wno-unused-template
        -Wno-c++98-compat-pedantic
        -Wno-used-but-marked-unused
        -Wno-zero-as-null-pointer-constant
      )
    endif()

    list(APPEND HWY_FLAGS
      -Wno-c++98-compat-pedantic
      -Wno-cast-align
      -Wno-double-promotion
      -Wno-float-equal
      -Wno-format-nonliteral
      -Wno-global-constructors
      -Wno-language-extension-token
      -Wno-missing-prototypes
      -Wno-shadow
      -Wno-shadow-field-in-constructor
      -Wno-sign-conversion
      -Wno-unused-member-function
      -Wno-unused-template
      -Wno-used-but-marked-unused
      -Wno-zero-as-null-pointer-constant
    )
  else()
    list(APPEND HWY_FLAGS
      -fmath-errno
      -fno-exceptions
    )
  endif()  # WIN32

  if (HWY_CMAKE_ARM7)
    list(APPEND HWY_FLAGS

@@ -162,6 +186,10 @@ else()
    )
  endif()  # HWY_CMAKE_ARM7

  if (HWY_WARNINGS_ARE_ERRORS)
    list(APPEND HWY_FLAGS -Werror)
  endif()

endif()  # !MSVC

add_library(hwy STATIC ${HWY_SOURCES})
@@ -174,6 +202,31 @@ target_compile_options(hwy_contrib PRIVATE ${HWY_FLAGS})
set_property(TARGET hwy_contrib PROPERTY POSITION_INDEPENDENT_CODE ON)
target_include_directories(hwy_contrib PUBLIC ${CMAKE_CURRENT_LIST_DIR})

add_library(hwy_test STATIC ${HWY_TEST_SOURCES})
target_compile_options(hwy_test PRIVATE ${HWY_FLAGS})
set_property(TARGET hwy_test PROPERTY POSITION_INDEPENDENT_CODE ON)
target_include_directories(hwy_test PUBLIC ${CMAKE_CURRENT_LIST_DIR})

# -------------------------------------------------------- hwy_list_targets
# Generate a tool to print the compiled-in targets as defined by the current
# flags. This tool will print to stderr at build time, after building hwy.
add_executable(hwy_list_targets hwy/tests/list_targets.cc)
target_compile_options(hwy_list_targets PRIVATE ${HWY_FLAGS})
target_link_libraries(hwy_list_targets hwy)
target_include_directories(hwy_list_targets PRIVATE
  $<TARGET_PROPERTY:hwy,INCLUDE_DIRECTORIES>)
# TARGET_FILE always returns the path to executable
# Naked target also not always could be run (due to the lack of '.\' prefix)
# Thus effective command to run should contain the full path
# and emulator prefix (if any).
add_custom_command(TARGET hwy_list_targets POST_BUILD
    COMMAND ${CMAKE_CROSSCOMPILING_EMULATOR} $<TARGET_FILE:hwy_list_targets> || (exit 0))

# --------------------------------------------------------
# Allow skipping the following sections for projects that do not need them:
# tests, examples, benchmarks and installation.
if (HWY_EXAMPLES_TESTS_INSTALL)

# -------------------------------------------------------- install library
install(TARGETS hwy
  DESTINATION "${CMAKE_INSTALL_LIBDIR}")

@@ -199,6 +252,18 @@ foreach (source ${HWY_CONTRIB_SOURCES})
  endif()
endforeach()

install(TARGETS hwy_test
  DESTINATION "${CMAKE_INSTALL_LIBDIR}")
# Install all the headers keeping the relative path to the current directory
# when installing them.
foreach (source ${HWY_TEST_SOURCES})
  if ("${source}" MATCHES "\.h$")
    get_filename_component(dirname "${source}" DIRECTORY)
    install(FILES "${source}"
      DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${dirname}")
  endif()
endforeach()

# Add a pkg-config file for libhwy and the contrib/test libraries.
set(HWY_LIBRARY_VERSION "${CMAKE_PROJECT_VERSION}")
foreach (pc libhwy.pc libhwy-contrib.pc libhwy-test.pc)
@@ -207,20 +272,6 @@ foreach (pc libhwy.pc libhwy-contrib.pc libhwy-test.pc)
    DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
endforeach()

# -------------------------------------------------------- hwy_list_targets
# Generate a tool to print the compiled-in targets as defined by the current
# flags. This tool will print to stderr at build time, after building hwy.
add_executable(hwy_list_targets hwy/tests/list_targets.cc)
target_compile_options(hwy_list_targets PRIVATE ${HWY_FLAGS})
target_include_directories(hwy_list_targets PRIVATE
  $<TARGET_PROPERTY:hwy,INCLUDE_DIRECTORIES>)
# TARGET_FILE always returns the path to executable
# Naked target also not always could be run (due to the lack of '.\' prefix)
# Thus effective command to run should contain the full path
# and emulator prefix (if any).
add_custom_command(TARGET hwy POST_BUILD
    COMMAND ${CMAKE_CROSSCOMPILING_EMULATOR} $<TARGET_FILE:hwy_list_targets> || (exit 0))

# -------------------------------------------------------- Examples

# Avoids mismatch between GTest's static CRT and our dynamic.

@@ -284,18 +335,25 @@ endif()
endif()  # HWY_SYSTEM_GTEST

set(HWY_TEST_FILES
  hwy/contrib/dot/dot_test.cc
  hwy/contrib/image/image_test.cc
  # Disabled due to SIGILL in clang7 debug build during gtest discovery phase,
  # not reproducible locally. Still tested via bazel build.
  # hwy/contrib/math/math_test.cc
  hwy/contrib/sort/sort_test.cc
  hwy/aligned_allocator_test.cc
  hwy/base_test.cc
  hwy/highway_test.cc
  hwy/targets_test.cc
  hwy/examples/skeleton_test.cc
  hwy/tests/arithmetic_test.cc
  hwy/tests/blockwise_test.cc
  hwy/tests/combine_test.cc
  hwy/tests/compare_test.cc
  hwy/tests/convert_test.cc
  hwy/tests/crypto_test.cc
  hwy/tests/logical_test.cc
  hwy/tests/mask_test.cc
  hwy/tests/memory_test.cc
  hwy/tests/swizzle_test.cc
  hwy/tests/test_util_test.cc
@@ -314,9 +372,9 @@ foreach (TESTFILE IN LISTS HWY_TEST_FILES)
  target_compile_options(${TESTNAME} PRIVATE -DHWY_IS_TEST=1)

  if(HWY_SYSTEM_GTEST)
-   target_link_libraries(${TESTNAME} hwy hwy_contrib GTest::GTest GTest::Main)
+   target_link_libraries(${TESTNAME} hwy hwy_contrib hwy_test GTest::GTest GTest::Main)
  else()
-   target_link_libraries(${TESTNAME} hwy hwy_contrib gtest gtest_main)
+   target_link_libraries(${TESTNAME} hwy hwy_contrib hwy_test gtest gtest_main)
  endif()
  # Output test targets in the test directory.
  set_target_properties(${TESTNAME} PROPERTIES PREFIX "tests/")

@@ -336,3 +394,5 @@ endforeach ()
target_sources(skeleton_test PRIVATE hwy/examples/skeleton.cc)

endif()  # BUILD_TESTING

endif()  # HWY_EXAMPLES_TESTS_INSTALL
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 2.8.2)
+cmake_minimum_required(VERSION 2.8.12)

project(googletest-download NONE)
third_party/highway/README.md

@@ -1,7 +1,7 @@
# Efficient and performance-portable SIMD

Highway is a C++ library for SIMD (Single Instruction, Multiple Data), i.e.
-applying the same operation to 'lanes'.
+applying the same operation to multiple 'lanes' using a single CPU instruction.

## Why Highway?

@@ -14,13 +14,17 @@ applying the same operation to 'lanes'.

## Current status

-Supported targets: scalar, SSE4, AVX2, AVX-512, NEON (ARMv7 and v8), WASM SIMD.
-Ports to RVV and SVE/SVE2 are in progress.
+Supported targets: scalar, S-SSE3, SSE4, AVX2, AVX-512, AVX3_DL (~Icelake,
+requires opt-in by defining `HWY_WANT_AVX3_DL`), NEON (ARMv7 and v8), SVE,
+WASM SIMD.
+
+SVE is tested using farm_sve (see acknowledgments). SVE2 is implemented but not
+yet validated. A subset of RVV is implemented and tested with GCC and QEMU.
+Work is underway to compile using LLVM, which has different intrinsics with AVL.

Version 0.11 is considered stable enough to use in other projects, and is
expected to remain backwards compatible unless serious issues are discovered
-while implementing SVE/RVV targets. After these targets are added, Highway will
-reach version 1.0.
+while finishing the RVV target. After that, Highway will reach version 1.0.

Continuous integration tests build with a recent version of Clang (running on
x86 and QEMU for ARM) and MSVC from VS2015 (running on x86).
@@ -61,11 +65,6 @@ make -j && make test

Or you can run `run_tests.sh` (`run_tests.bat` on Windows).

-To test on all the attainable targets for your platform, use
-`cmake .. -DCMAKE_CXX_FLAGS="-DHWY_COMPILE_ALL_ATTAINABLE"`. Otherwise, the
-default configuration skips baseline targets (e.g. scalar) that are superseded
-by another baseline target.

Bazel is also supported for building, but it is not as widely used/tested.

## Quick start
@@ -73,16 +72,17 @@ Bazel is also supported for building, but it is not as widely used/tested.

You can use the `benchmark` inside examples/ as a starting point.

A [quick-reference page](g3doc/quick_reference.md) briefly lists all operations
-and their parameters, and the [instruction_matrix][instmtx] indicates the
-number of instructions per operation.
+and their parameters, and the [instruction_matrix](g3doc/instruction_matrix.pdf)
+indicates the number of instructions per operation.

We recommend using full SIMD vectors whenever possible for maximum performance
portability. To obtain them, pass a `HWY_FULL(float)` tag to functions such as
`Zero/Set/Load`. There is also the option of a vector of up to `N` (a power of
-two) lanes: `HWY_CAPPED(T, N)`. 128-bit vectors are guaranteed to be available
-for lanes of type `T` if `HWY_TARGET != HWY_SCALAR` and `N == 16 / sizeof(T)`.
+two <= 16/sizeof(T)) lanes of type `T`: `HWY_CAPPED(T, N)`. If `HWY_TARGET ==
+HWY_SCALAR`, the vector always has one lane. For all other targets, up to
+128-bit vectors are guaranteed to be available.
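As a minimal sketch of these tags (not part of the commit; `Scale` and its
arguments are placeholders, the ops and annotations are the documented ones):

```
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace project {
namespace HWY_NAMESPACE {

namespace hn = hwy::HWY_NAMESPACE;

// Multiplies `size` floats by `factor`. Assumes `size` is a multiple of the
// lane count; see the strip-mining section below for remainders.
void Scale(const float* HWY_RESTRICT in, float factor,
           float* HWY_RESTRICT out, size_t size) {
  const HWY_FULL(float) d;  // tag: the largest available vector of float
  const auto vfactor = hn::Set(d, factor);
  for (size_t i = 0; i < size; i += hn::Lanes(d)) {
    hn::Store(hn::Mul(hn::Load(d, in + i), vfactor), d, out + i);
  }
}

}  // namespace HWY_NAMESPACE
}  // namespace project
HWY_AFTER_NAMESPACE();
```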
-Functions using Highway must be inside a namespace `namespace HWY_NAMESPACE {`
+Functions using Highway must be inside `namespace HWY_NAMESPACE {`
(possibly nested in one or more other namespaces defined by the project), and
additionally either prefixed with `HWY_ATTR`, or residing between
`HWY_BEFORE_NAMESPACE()` and `HWY_AFTER_NAMESPACE()`.

@@ -97,11 +97,27 @@ additionally either prefixed with `HWY_ATTR`, or residing between

*   For dynamic dispatch, a table of function pointers is generated via the
    `HWY_EXPORT` macro that is used by `HWY_DYNAMIC_DISPATCH(func)(args)` to
-   call the best function pointer for the current CPU supported targets. A
+   call the best function pointer for the current CPU's supported targets. A
    module is automatically compiled for each target in `HWY_TARGETS` (see
    [quick-reference](g3doc/quick_reference.md)) if `HWY_TARGET_INCLUDE` is
    defined and foreach_target.h is included.
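For example, a dispatch module might be sketched as follows (file and function
names are placeholders, not from the commit; the macros are those named above):

```
// scale.cc: recompiled once per target via foreach_target.h.
#include <stddef.h>

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "scale.cc"  // this very file
#include "hwy/foreach_target.h"
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace project {
namespace HWY_NAMESPACE {

namespace hn = hwy::HWY_NAMESPACE;

void ScaleImpl(float* HWY_RESTRICT p, float factor, size_t size) {
  const HWY_FULL(float) d;
  const auto vfactor = hn::Set(d, factor);
  for (size_t i = 0; i < size; i += hn::Lanes(d)) {
    hn::Store(hn::Mul(hn::Load(d, p + i), vfactor), d, p + i);
  }
}

}  // namespace HWY_NAMESPACE
}  // namespace project
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace project {

HWY_EXPORT(ScaleImpl);  // generates the table of per-target pointers

void Scale(float* p, float factor, size_t size) {
  HWY_DYNAMIC_DISPATCH(ScaleImpl)(p, factor, size);  // best supported target
}

}  // namespace project
#endif  // HWY_ONCE
```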

## Compiler flags

Applications should be compiled with optimizations enabled - without inlining,
SIMD code may slow down by factors of 10 to 100. For clang and GCC, `-O2` is
generally sufficient.

For MSVC, we recommend compiling with `/Gv` to allow non-inlined functions to
pass vector arguments in registers. If intending to use the AVX2 target together
with half-width vectors (e.g. for `PromoteTo`), it is also important to compile
with `/arch:AVX2`. This seems to be the only way to generate VEX-encoded SSE4
instructions on MSVC. Otherwise, mixing VEX-encoded AVX2 instructions and
non-VEX SSE4 may cause severe performance degradation. Unfortunately, the
resulting binary will then require AVX2. Note that no such flag is needed for
clang and GCC because they support target-specific attributes, which we use to
ensure proper VEX code generation for AVX2 targets.

## Strip-mining loops

To vectorize a loop, "strip-mining" transforms it into an outer loop and inner

@@ -139,10 +155,7 @@ Highway offers several ways to express loops where `N` need not divide `count`:
    The template parameter and second function arguments are again not needed.

-   This avoids duplicating code, and is reasonable if `count` is large.
-   Otherwise, multiple iterations may be slower than one `LoopBody` variant
-   with masking, especially because the `HWY_SCALAR` target selected by
-   `HWY_CAPPED(T, 1)` is slower for some operations due to workarounds for
-   undefined behavior in C++.
+   If `count` is small, the second loop may be slower than the next option.

*   Process whole vectors as above, followed by a single call to a modified
    `LoopBody` with masking:

@@ -157,210 +170,23 @@ Highway offers several ways to express loops where `N` need not divide `count`:
    }
    ```
    Now the template parameter and second function argument can be used inside
-   `LoopBody` to replace `Load/Store` of full aligned vectors with
-   `LoadN/StoreN(n)` that affect no more than `1 <= n <= N` aligned elements
-   (pending implementation).
+   `LoopBody` to 'blend' the new partial vector with previous memory contents:
+   `Store(IfThenElse(FirstN(d, N), partial, prev_full), d, aligned_pointer);`.

+   This is a good default when it is infeasible to ensure vectors are padded.
+   In contrast to the scalar loop, only a single final iteration is needed.
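    A sketch of that blend-store pattern (illustrative, not from the commit;
    assumes the allocation extends to a whole vector beyond the last element,
    as `hwy::AllocateAligned` provides):

    ```
    namespace hn = hwy::HWY_NAMESPACE;  // inside HWY_NAMESPACE, as usual

    // Adds 1 to `count` floats. The final Load/Store touch a whole vector,
    // hence the padding assumption above.
    HWY_ATTR void AddOne(float* HWY_RESTRICT aligned, size_t count) {
      const HWY_FULL(float) d;
      const size_t N = hn::Lanes(d);
      const auto one = hn::Set(d, 1.0f);
      size_t i = 0;
      for (; i + N <= count; i += N) {  // whole vectors
        hn::Store(hn::Add(hn::Load(d, aligned + i), one), d, aligned + i);
      }
      if (i != count) {  // single masked iteration for the remainder
        const auto prev_full = hn::Load(d, aligned + i);
        const auto partial = hn::Add(prev_full, one);
        // First (count - i) lanes get new values; the rest are rewritten
        // with their previous contents.
        hn::Store(hn::IfThenElse(hn::FirstN(d, count - i), partial, prev_full),
                  d, aligned + i);
      }
    }
    ```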

## Design philosophy

*   Performance is important but not the sole consideration. Anyone who goes to
    the trouble of using SIMD clearly cares about speed. However, portability,
    maintainability and readability also matter, otherwise we would write in
    assembly. We aim for performance within 10-20% of a hand-written assembly
    implementation on the development platform.

*   The guiding principles of C++ are "pay only for what you use" and "leave no
    room for a lower-level language below C++". We apply these by defining a
    SIMD API that ensures operation costs are visible, predictable and minimal.

*   Performance portability is important, i.e. the API should be efficient on
    all target platforms. Unfortunately, common idioms for one platform can be
    inefficient on others. For example: summing lanes horizontally versus
    shuffling. Documenting which operations are expensive does not prevent their
    use, as evidenced by widespread use of `HADDPS`. Performance acceptance
    tests may detect large regressions, but do not help choose the approach
    during initial development. Analysis tools can warn about some potential
    inefficiencies, but likely not all. We instead provide [a carefully chosen
    set of vector types and operations that are efficient on all target
    platforms][instmtx] (PPC8, SSE4/AVX2+, ARMv8).

*   Future SIMD hardware features are difficult to predict. For example, AVX2
    came with surprising semantics (almost no interaction between 128-bit
    blocks) and AVX-512 added two kinds of predicates (writemask and zeromask).
    To ensure the API reflects hardware realities, we suggest a flexible
    approach that adds new operations as they become commonly available, with
    scalar fallbacks where not supported.

*   Masking is not yet widely supported on current CPUs. It is difficult to
    define an interface that provides access to all platform features while
    retaining performance portability. The P0214R5 proposal lacks support for
    AVX-512/ARM SVE zeromasks. We suggest limiting usage of masks to the
    `IfThen[Zero]Else[Zero]` functions until the community has gained more
    experience with them.

*   "Width-agnostic" SIMD is more future-proof than user-specified fixed sizes.
    For example, valarray-like code can iterate over a 1D array with a
    library-specified vector width. This will result in better code when vector
    sizes increase, and matches the direction taken by
    [ARM SVE](https://alastairreid.github.io/papers/sve-ieee-micro-2017.pdf) and
    RiscV V as well as Agner Fog's
    [ForwardCom instruction set proposal](https://goo.gl/CFizWu). However, some
    applications may require fixed sizes, so we also guarantee support for
    128-bit vectors in each instruction set.

*   The API and its implementation should be usable and efficient with commonly
    used compilers, including MSVC. For example, we write `ShiftLeft<3>(v)`
    instead of `v << 3` because MSVC 2017 (ARM64) does not propagate the literal
    (https://godbolt.org/g/rKx5Ga). Highway requires function-specific
    target attributes, supported by GCC 4.9 / Clang 3.9 / MSVC 2015.

*   Efficient and safe runtime dispatch is important. Modules such as image or
    video codecs are typically embedded into larger applications such as
    browsers, so they cannot require separate binaries for each CPU. Libraries
    also cannot predict whether the application already uses AVX2 (and pays the
    frequency throttling cost), so this decision must be left to the
    application. Using only the lowest-common denominator instructions
    sacrifices too much performance.
    Therefore, we provide code paths for multiple instruction sets and choose
    the most suitable at runtime. To reduce overhead, dispatch should be hoisted
    to higher layers instead of checking inside every low-level function.
    Highway supports inlining functions in the same file or in *-inl.h headers.
    We generate all code paths from the same source to reduce implementation-
    and debugging cost.

*   Not every CPU need be supported. For example, pre-SSE4.1 CPUs are
    increasingly rare and the AVX instruction set is limited to floating-point
    operations. To reduce code size and compile time, we provide specializations
    for SSE4, AVX2 and AVX-512 instruction sets on x86, plus a scalar fallback.

*   Access to platform-specific intrinsics is necessary for acceptance in
    performance-critical projects. We provide conversions to and from intrinsics
    to allow utilizing specialized platform-specific functionality, and simplify
    incremental porting of existing code.

*   The core API should be compact and easy to learn. We provide only the few
    dozen operations which are necessary and sufficient for most of the 150+
    SIMD applications we examined.

## Prior API designs

The author has been writing SIMD code since 2002: first via assembly language,
then intrinsics, later Intel's `F32vec4` wrapper, followed by three generations
of custom vector classes. The first used macros to generate the classes, which
reduces duplication but also readability. The second used templates instead.
The third (used in highwayhash and PIK) added support for AVX2 and runtime
dispatch. The current design (used in JPEG XL) enables code generation for
multiple platforms and/or instruction sets from the same source, and improves
runtime dispatch.

## Differences versus [P0214R5 proposal](https://goo.gl/zKW4SA)

1.  Adding widely used and portable operations such as `AndNot`, `AverageRound`,
    bit-shift by immediates and `IfThenElse`.

1.  Designing the API to avoid or minimize overhead on AVX2/AVX-512 caused by
    crossing 128-bit 'block' boundaries.

1.  Avoiding the need for non-native vectors (see the sketch below). By
    contrast, P0214R5's `simd_cast` returns `fixed_size<>` vectors which are
    more expensive to access because they reside on the stack. We can avoid
    this plus additional overhead on ARM/AVX2 by defining width-expanding
    operations as functions of a vector part, e.g. promoting half a vector of
    `uint8_t` lanes to one full vector of `uint16_t`, or demoting full vectors
    to half vectors with half-width lanes.
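    A sketch of such a promotion (function name illustrative; assumes the
    usual includes and HWY_NAMESPACE setup):

    ```
    namespace hn = hwy::HWY_NAMESPACE;

    // Promotes a half-vector of bytes to a full vector of uint16_t lanes.
    void WidenBytes(const uint8_t* HWY_RESTRICT from,
                    uint16_t* HWY_RESTRICT to) {
      const HWY_FULL(uint16_t) d16;
      const hn::Rebind<uint8_t, decltype(d16)> d8;  // same lane count, half width
      hn::Store(hn::PromoteTo(d16, hn::Load(d8, from)), d16, to);
    }
    ```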

1.  Guaranteeing access to the underlying intrinsic vector type. This ensures
    all platform-specific capabilities can be used. P0214R5 instead only
    'encourages' implementations to provide access.

1.  Enabling safe runtime dispatch and inlining in the same binary. P0214R5 is
    based on the Vc library, which does not provide assistance for linking
    multiple instruction sets into the same binary. The Vc documentation
    suggests compiling separate executables for each instruction set or using
    GCC's ifunc (indirect functions). The latter is compiler-specific and risks
    crashes due to ODR violations when compiling the same function with
    different compiler flags. We solve this problem via target-specific
    namespaces and attributes (see HOWTO section below). We also permit a mix of
    static target selection and runtime dispatch for hotspots that may benefit
    from newer instruction sets if available.

1.  Using built-in PPC vector types without a wrapper class. This leads to much
    better code generation with GCC 6.3: https://godbolt.org/z/pd2PNP.
    By contrast, P0214R5 requires a wrapper. We avoid this by using only the
    member operators provided by the PPC vectors; all other functions and
    typedefs are non-members. 2019-04 update: Clang power64le does not have
    this issue, so we simplified get_part(d, v) to GetLane(v).

1.  Omitting inefficient or non-performance-portable operations such as `hmax`,
    `operator[]`, and unsupported integer comparisons. Applications can often
    replace these operations at lower cost than emulating that exact behavior.

1.  Omitting `long double` types: these are not commonly available in hardware.

1.  Ensuring signed integer overflow has well-defined semantics (wraparound).

1.  Simple header-only implementation and less than a tenth of the size of the
    Vc library from which P0214 was derived (98,000 lines in
    https://github.com/VcDevel/Vc according to the gloc Chrome extension).

1.  Avoiding hidden performance costs. P0214R5 allows implicit conversions from
    integer to float, which costs 3-4 cycles on x86. We make these conversions
    explicit to ensure their cost is visible.

## Other related work

*   [Neat SIMD](http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=7568423)
    adopts a similar approach with interchangeable vector/scalar types and
    a compact interface. It allows access to the underlying intrinsics, but
    does not appear to be designed for other platforms than x86.

*   UME::SIMD ([code](https://goo.gl/yPeVZx), [paper](https://goo.gl/2xpZrk))
    also adopts an explicit vectorization model with vector classes.
    However, it exposes the union of all platform capabilities, which makes the
    API harder to learn (209-page spec) and implement (the estimated LOC count
    is [500K](https://goo.gl/1THFRi)). The API is less performance-portable
    because it allows applications to use operations that are inefficient on
    other platforms.

*   Inastemp ([code](https://goo.gl/hg3USM), [paper](https://goo.gl/YcTU7S))
    is a vector library for scientific computing with some innovative features:
    automatic FLOPS counting, and "if/else branches" using lambda functions.
    It supports IBM Power8, but only provides float and double types.

## Overloaded function API

Most C++ vector APIs rely on class templates. However, the ARM SVE vector
type is sizeless and cannot be wrapped in a class. We instead rely on overloaded
functions. Overloading based on vector types is also undesirable because SVE
vectors cannot be default-constructed. We instead use a dedicated 'descriptor'
type `Simd` for overloading, abbreviated to `D` for template arguments and
`d` in lvalues.

Note that generic function templates are possible (see highway.h).
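A sketch of an operation overloaded on the descriptor (illustrative only; uses
`Vec128`, which exists on targets with 128-bit vector types):

```
namespace hn = hwy::HWY_NAMESPACE;

// The zero-sized descriptor argument selects the overload and carries the
// lane type and count; the vector itself need not be default-constructible.
template <typename T, size_t N>
HWY_ATTR hn::Vec128<T, N> Twice(hn::Simd<T, N> /*d*/, hn::Vec128<T, N> v) {
  return hn::Add(v, v);
}
```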

## Masks

AVX-512 introduced a major change to the SIMD interface: special mask registers
(one bit per lane) that serve as predicates. It would be expensive to force
AVX-512 implementations to conform to the prior model of full vectors with lanes
set to all one or all zero bits. We instead provide a Mask type that emulates
a subset of this functionality on other platforms at zero cost.

Masks are returned by comparisons and `TestBit`; they serve as the input to
`IfThen*`. We provide conversions between masks and vector lanes. For clarity
and safety, we use FF..FF as the definition of true. To also benefit from
x86 instructions that only require the sign bit of floating-point inputs to be
set, we provide a special `ZeroIfNegative` function.
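For example (a sketch; `ClampToZero` is illustrative and assumes `count` is a
multiple of the lane count):

```
namespace hn = hwy::HWY_NAMESPACE;

// Comparisons return a Mask, which IfThenZeroElse consumes; this is the
// explicit form of the ZeroIfNegative special case mentioned above.
HWY_ATTR void ClampToZero(float* HWY_RESTRICT aligned, size_t count) {
  const HWY_FULL(float) d;
  for (size_t i = 0; i < count; i += hn::Lanes(d)) {
    const auto v = hn::Load(d, aligned + i);
    const auto is_neg = hn::Lt(v, hn::Zero(d));
    hn::Store(hn::IfThenZeroElse(is_neg, v), d, aligned + i);
  }
}
```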

## Additional resources

-*   [Highway introduction (slides)][intro]
-*   [Overview of instructions per operation on different architectures][instmtx]
+*   [Highway introduction (slides)](g3doc/highway_intro.pdf)
+*   [Overview of instructions per operation on different architectures](g3doc/instruction_matrix.pdf)
+*   [Design philosophy and comparison](g3doc/design_philosophy.md)

-[intro]: g3doc/highway_intro.pdf
-[instmtx]: g3doc/instruction_matrix.pdf
+## Acknowledgments
+
+We have used [farm-sve](https://gitlab.inria.fr/bramas/farm-sve) by Berenger
+Bramas; it has proved useful for checking the SVE port on an x86 development
+machine.

This is not an officially supported Google product.
Contact: janwas@google.com
third_party/highway/debian/changelog

@@ -1,3 +1,41 @@
highway (0.15.0-1) UNRELEASED; urgency=medium

  * New ops: CompressBlendedStore, ConcatOdd/Even, IndicesFromVec
  * New ops: OddEvenBlocks, SwapAdjacentBlocks, Reverse, RotateRight
  * Add bf16, unsigned comparisons, more lane types for Reverse/TableLookupLanes
  * Contrib: add sort(ing network) and dot(product)
  * Targets: update RVV for LLVM, add experimental WASM2
  * Separate library hwy_test for test utils
  * Add non-macro Simd<> aliases
  * Fixes: const V& for GCC, AVX3 BZHI, POPCNT with AVX on MSVC, avoid %zu

 -- Jan Wassenberg <janwas@google.com>  Wed, 10 Nov 2021 10:00:00 +0100

highway (0.14.2-1) UNRELEASED; urgency=medium

  * Add MaskedLoad
  * Fix non-glibc PPC, Windows GCC, MSVC 19.14
  * Opt-in for -Werror; separate design_philosophy.md

 -- Jan Wassenberg <janwas@google.com>  Tue, 24 Aug 2021 15:00:00 +0200

highway (0.14.1-1) UNRELEASED; urgency=medium

  * Add LoadMaskBits, CompressBits[Store]
  * Fix CPU feature check (AES/F16C) and warnings
  * Improved DASSERT - disabled in optimized builds

 -- Jan Wassenberg <janwas@google.com>  Tue, 17 Aug 2021 14:00:00 +0200

highway (0.14.0-1) UNRELEASED; urgency=medium

  * Add SVE, S-SSE3, AVX3_DL targets
  * Support partial vectors in all ops
  * Add PopulationCount, FindFirstTrue, Ne, TableLookupBytesOr0
  * Add AESRound, CLMul, MulOdd, HWY_CAP_FLOAT16

 -- Jan Wassenberg <janwas@google.com>  Thu, 29 Jul 2021 15:00:00 +0200

highway (0.12.2-1) UNRELEASED; urgency=medium

  * fix scalar-only test and Windows macro conflict with Load/StoreFence
third_party/highway/hwy/aligned_allocator.cc

@@ -27,10 +27,21 @@
namespace hwy {
namespace {

-constexpr size_t kAlignment = HWY_MAX(HWY_ALIGNMENT, kMaxVectorSize);
+#if HWY_ARCH_RVV && defined(__riscv_vector)
+// Not actually an upper bound on the size, but this value prevents crossing a
+// 4K boundary (relevant on Andes).
+constexpr size_t kAlignment = HWY_MAX(HWY_ALIGNMENT, 4096);
+#else
+constexpr size_t kAlignment = HWY_ALIGNMENT;
+#endif

#if HWY_ARCH_X86
// On x86, aliasing can only occur at multiples of 2K, but that's too wasteful
// if this is used for single-vector allocations. 256 is more reasonable.
constexpr size_t kAlias = kAlignment * 4;
#else
constexpr size_t kAlias = kAlignment;
#endif

#pragma pack(push, 1)
struct AllocationHeader {

@@ -53,6 +64,7 @@ size_t NextAlignedOffset() {

void* AllocateAlignedBytes(const size_t payload_size, AllocPtr alloc_ptr,
                           void* opaque_ptr) {
  HWY_ASSERT(payload_size != 0);  // likely a bug in caller
  if (payload_size >= std::numeric_limits<size_t>::max() / 2) {
    HWY_DASSERT(false && "payload_size too large");
    return nullptr;

@@ -94,7 +106,7 @@ void* AllocateAlignedBytes(const size_t payload_size, AllocPtr alloc_ptr,
  header->allocated = allocated;
  header->payload_size = payload_size;

- return HWY_ASSUME_ALIGNED(reinterpret_cast<void*>(payload), kMaxVectorSize);
+ return HWY_ASSUME_ALIGNED(reinterpret_cast<void*>(payload), kAlignment);
}

void FreeAlignedBytes(const void* aligned_pointer, FreePtr free_ptr,
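(As an aside on the `kAlias` constant above: one way such an allocator can
stagger allocations across aliasing windows is sketched below. This is
illustrative only; the real `NextAlignedOffset()` in this file may differ.)

```
#include <atomic>
#include <cstddef>
#include <cstdint>

// Returns an offset in [0, kAlias) that is a multiple of kAlignment, cycling
// so that consecutive allocations begin in different aliasing windows.
size_t NextAlignedOffsetSketch() {
  static std::atomic<uint32_t> next{0};
  constexpr uint32_t kGroups = static_cast<uint32_t>(kAlias / kAlignment);
  const uint32_t group = next.fetch_add(1) % kGroups;
  return static_cast<size_t>(group) * kAlignment;
}
```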
third_party/highway/hwy/aligned_allocator_test.cc

@@ -120,13 +120,13 @@ TEST(AlignedAllocatorTest, AllocDefaultPointers) {
      /*opaque_ptr=*/nullptr);
  ASSERT_NE(nullptr, ptr);
  // Make sure the pointer is actually aligned.
- EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr) % kMaxVectorSize);
+ EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr) % HWY_ALIGNMENT);
  char* p = static_cast<char*>(ptr);
  size_t ret = 0;
  for (size_t i = 0; i < kSize; i++) {
    // Performs a computation using p[] to prevent it being optimized away.
    p[i] = static_cast<char>(i & 0x7F);
-   if (i) ret += p[i] * p[i - 1];
+   if (i) ret += static_cast<size_t>(p[i] * p[i - 1]);
  }
  EXPECT_NE(0U, ret);
  FreeAlignedBytes(ptr, /*free_ptr=*/nullptr, /*opaque_ptr=*/nullptr);

@@ -152,7 +152,7 @@ TEST(AlignedAllocatorTest, CustomAlloc) {
  // We should have only requested one alloc from the allocator.
  EXPECT_EQ(1U, fake_alloc.PendingAllocs());
  // Make sure the pointer is actually aligned.
- EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr) % kMaxVectorSize);
+ EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr) % HWY_ALIGNMENT);
  FreeAlignedBytes(ptr, &FakeAllocator::StaticFree, &fake_alloc);
  EXPECT_EQ(0U, fake_alloc.PendingAllocs());
}

@@ -197,7 +197,7 @@ TEST(AlignedAllocatorTest, MakeUniqueAlignedArray) {
TEST(AlignedAllocatorTest, AllocSingleInt) {
  auto ptr = AllocateAligned<uint32_t>(1);
  ASSERT_NE(nullptr, ptr.get());
- EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr.get()) % kMaxVectorSize);
+ EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr.get()) % HWY_ALIGNMENT);
  // Force delete of the unique_ptr now to check that it doesn't crash.
  ptr.reset(nullptr);
  EXPECT_EQ(nullptr, ptr.get());

@@ -207,7 +207,7 @@ TEST(AlignedAllocatorTest, AllocMultipleInt) {
  const size_t kSize = 7777;
  auto ptr = AllocateAligned<uint32_t>(kSize);
  ASSERT_NE(nullptr, ptr.get());
- EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr.get()) % kMaxVectorSize);
+ EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr.get()) % HWY_ALIGNMENT);
  // ptr[i] is actually (*ptr.get())[i] which will use the operator[] of the
  // underlying type chosen by AllocateAligned() for the std::unique_ptr.
  EXPECT_EQ(&(ptr[0]) + 1, &(ptr[1]));

@@ -276,3 +276,9 @@ TEST(AlignedAllocatorTest, DefaultInit) {
}

}  // namespace hwy

+// Ought not to be necessary, but without this, no tests run on RVV.
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
third_party/highway/hwy/base.h

@@ -23,72 +23,7 @@
#include <atomic>
#include <cfloat>

// Add to #if conditions to prevent IDE from graying out code.
#if (defined __CDT_PARSER__) || (defined __INTELLISENSE__) || \
    (defined Q_CREATOR_RUN) || (defined(__CLANGD__))
#define HWY_IDE 1
#else
#define HWY_IDE 0
#endif

//------------------------------------------------------------------------------
// Detect compiler using predefined macros

// clang-cl defines _MSC_VER but doesn't behave like MSVC in other aspects like
// used in HWY_DIAGNOSTICS(). We include a check that we are not clang for that
// purpose.
#if defined(_MSC_VER) && !defined(__clang__)
#define HWY_COMPILER_MSVC _MSC_VER
#else
#define HWY_COMPILER_MSVC 0
#endif

#ifdef __INTEL_COMPILER
#define HWY_COMPILER_ICC __INTEL_COMPILER
#else
#define HWY_COMPILER_ICC 0
#endif

#ifdef __GNUC__
#define HWY_COMPILER_GCC (__GNUC__ * 100 + __GNUC_MINOR__)
#else
#define HWY_COMPILER_GCC 0
#endif

// Clang can masquerade as MSVC/GCC, in which case both are set.
#ifdef __clang__
#ifdef __APPLE__
// Apple LLVM version is unrelated to the actual Clang version, which we need
// for enabling workarounds. Use the presence of warning flags to deduce it.
// Adapted from https://github.com/simd-everywhere/simde/ simde-detect-clang.h.
#if __has_warning("-Wformat-insufficient-args")
#define HWY_COMPILER_CLANG 1200
#elif __has_warning("-Wimplicit-const-int-float-conversion")
#define HWY_COMPILER_CLANG 1100
#elif __has_warning("-Wmisleading-indentation")
#define HWY_COMPILER_CLANG 1000
#elif defined(__FILE_NAME__)
#define HWY_COMPILER_CLANG 900
#elif __has_warning("-Wextra-semi-stmt") || \
    __has_builtin(__builtin_rotateleft32)
#define HWY_COMPILER_CLANG 800
#elif __has_warning("-Wc++98-compat-extra-semi")
#define HWY_COMPILER_CLANG 700
#else  // Anything older than 7.0 is not recommended for Highway.
#define HWY_COMPILER_CLANG 600
#endif  // __has_warning chain
#else  // Non-Apple: normal version
#define HWY_COMPILER_CLANG (__clang_major__ * 100 + __clang_minor__)
#endif
#else  // Not clang
#define HWY_COMPILER_CLANG 0
#endif

// More than one may be nonzero, but we want at least one.
#if !HWY_COMPILER_MSVC && !HWY_COMPILER_ICC && !HWY_COMPILER_GCC && \
    !HWY_COMPILER_CLANG
#error "Unsupported compiler"
#endif

#include "hwy/detect_compiler_arch.h"

//------------------------------------------------------------------------------
// Compiler-specific definitions
@@ -140,18 +75,6 @@
//------------------------------------------------------------------------------
// Builtin/attributes

#ifdef __has_builtin
#define HWY_HAS_BUILTIN(name) __has_builtin(name)
#else
#define HWY_HAS_BUILTIN(name) 0
#endif

#ifdef __has_attribute
#define HWY_HAS_ATTRIBUTE(name) __has_attribute(name)
#else
#define HWY_HAS_ATTRIBUTE(name) 0
#endif

// Enables error-checking of format strings.
#if HWY_HAS_ATTRIBUTE(__format__)
#define HWY_FORMAT(idx_fmt, idx_arg) \

@@ -175,9 +98,9 @@
// are inlined. Support both per-function annotation (HWY_ATTR) for lambdas and
// automatic annotation via pragmas.
#if HWY_COMPILER_CLANG
#define HWY_PUSH_ATTRIBUTES(targets_str) \
  HWY_PRAGMA(clang attribute push(__attribute__((target(targets_str))), \
                                  apply_to = function))
#define HWY_POP_ATTRIBUTES HWY_PRAGMA(clang attribute pop)
#elif HWY_COMPILER_GCC
#define HWY_PUSH_ATTRIBUTES(targets_str) \
@@ -188,78 +111,6 @@
#define HWY_POP_ATTRIBUTES
#endif

//------------------------------------------------------------------------------
// Detect architecture using predefined macros

#if defined(__i386__) || defined(_M_IX86)
#define HWY_ARCH_X86_32 1
#else
#define HWY_ARCH_X86_32 0
#endif

#if defined(__x86_64__) || defined(_M_X64)
#define HWY_ARCH_X86_64 1
#else
#define HWY_ARCH_X86_64 0
#endif

#if HWY_ARCH_X86_32 && HWY_ARCH_X86_64
#error "Cannot have both x86-32 and x86-64"
#endif

#if HWY_ARCH_X86_32 || HWY_ARCH_X86_64
#define HWY_ARCH_X86 1
#else
#define HWY_ARCH_X86 0
#endif

#if defined(__powerpc64__) || defined(_M_PPC)
#define HWY_ARCH_PPC 1
#else
#define HWY_ARCH_PPC 0
#endif

#if defined(__ARM_ARCH_ISA_A64) || defined(__aarch64__) || defined(_M_ARM64)
#define HWY_ARCH_ARM_A64 1
#else
#define HWY_ARCH_ARM_A64 0
#endif

#if defined(__arm__) || defined(_M_ARM)
#define HWY_ARCH_ARM_V7 1
#else
#define HWY_ARCH_ARM_V7 0
#endif

#if HWY_ARCH_ARM_A64 && HWY_ARCH_ARM_V7
#error "Cannot have both A64 and V7"
#endif

#if HWY_ARCH_ARM_A64 || HWY_ARCH_ARM_V7
#define HWY_ARCH_ARM 1
#else
#define HWY_ARCH_ARM 0
#endif

#if defined(__EMSCRIPTEN__) || defined(__wasm__) || defined(__WASM__)
#define HWY_ARCH_WASM 1
#else
#define HWY_ARCH_WASM 0
#endif

#ifdef __riscv
#define HWY_ARCH_RVV 1
#else
#define HWY_ARCH_RVV 0
#endif

// It is an error to detect multiple architectures at the same time, but OK to
// detect none of the above.
#if (HWY_ARCH_X86 + HWY_ARCH_PPC + HWY_ARCH_ARM + HWY_ARCH_WASM + \
     HWY_ARCH_RVV) > 1
#error "Must not detect more than one architecture"
#endif

//------------------------------------------------------------------------------
// Macros
@@ -295,9 +146,37 @@
  } \
} while (0)

// Only for "debug" builds
#if !defined(NDEBUG) || defined(ADDRESS_SANITIZER) || \
    defined(MEMORY_SANITIZER) || defined(THREAD_SANITIZER)
#if HWY_HAS_FEATURE(memory_sanitizer) || defined(MEMORY_SANITIZER)
#define HWY_IS_MSAN 1
#else
#define HWY_IS_MSAN 0
#endif

#if HWY_HAS_FEATURE(address_sanitizer) || defined(ADDRESS_SANITIZER)
#define HWY_IS_ASAN 1
#else
#define HWY_IS_ASAN 0
#endif

#if HWY_HAS_FEATURE(thread_sanitizer) || defined(THREAD_SANITIZER)
#define HWY_IS_TSAN 1
#else
#define HWY_IS_TSAN 0
#endif

// For enabling HWY_DASSERT and shortening tests in slower debug builds
#if !defined(HWY_IS_DEBUG_BUILD)
// Clang does not define NDEBUG, but it and GCC define __OPTIMIZE__, and recent
// MSVC defines NDEBUG (if not, could instead check _DEBUG).
#if (!defined(__OPTIMIZE__) && !defined(NDEBUG)) || HWY_IS_ASAN || \
    HWY_IS_MSAN || HWY_IS_TSAN || defined(__clang_analyzer__)
#define HWY_IS_DEBUG_BUILD 1
#else
#define HWY_IS_DEBUG_BUILD 0
#endif
#endif  // HWY_IS_DEBUG_BUILD

#if HWY_IS_DEBUG_BUILD
#define HWY_DASSERT(condition) HWY_ASSERT(condition)
#else
#define HWY_DASSERT(condition) \

@@ -305,24 +184,33 @@
} while (0)
#endif

#if defined(HWY_EMULATE_SVE)
class FarmFloat16;
#endif

namespace hwy {

//------------------------------------------------------------------------------
// Alignment
// kMaxVectorSize (undocumented, pending removal)

// Not guaranteed to be an upper bound, but the alignment established by
// aligned_allocator is HWY_MAX(HWY_ALIGNMENT, kMaxVectorSize).
#if HWY_ARCH_X86
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 64;  // AVX-512
#define HWY_ALIGN_MAX alignas(64)
#elif HWY_ARCH_RVV
// Not actually an upper bound on the size, but this value prevents crossing a
// 4K boundary (relevant on Andes).
#elif HWY_ARCH_RVV && defined(__riscv_vector)
// Not actually an upper bound on the size.
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 4096;
#define HWY_ALIGN_MAX alignas(8)  // only elements need be aligned
#else
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
#endif

//------------------------------------------------------------------------------
// Alignment

// For stack-allocated partial arrays or LoadDup128.
#if HWY_ARCH_X86
#define HWY_ALIGN_MAX alignas(64)
#elif HWY_ARCH_RVV && defined(__riscv_vector)
#define HWY_ALIGN_MAX alignas(8)  // only elements need be aligned
#else
#define HWY_ALIGN_MAX alignas(16)
#endif

@@ -332,26 +220,33 @@ static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
// Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
// by concatenating base type and bits.

// RVV already has a builtin type and the GCC intrinsics require it.
#if HWY_ARCH_RVV && HWY_COMPILER_GCC
#if HWY_ARCH_ARM && (__ARM_FP & 2)
#define HWY_NATIVE_FLOAT16 1
#else
#define HWY_NATIVE_FLOAT16 0
#endif

#if HWY_NATIVE_FLOAT16
#pragma pack(push, 1)

#if defined(HWY_EMULATE_SVE)
using float16_t = FarmFloat16;
#elif HWY_NATIVE_FLOAT16
using float16_t = __fp16;
// Clang does not allow __fp16 arguments, but scalar.h requires LaneType
// arguments, so use a wrapper.
// TODO(janwas): replace with _Float16 when that is supported?
#else
#pragma pack(push, 1)
struct float16_t {
  uint16_t bits;
};
#pragma pack(pop)
#endif

struct bfloat16_t {
  uint16_t bits;
};

#pragma pack(pop)

using float32_t = float;
using float64_t = double;
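(As an illustration of the struct-wrapper variant above, a sketch; the
function name is invented, and 0x3C00 is the IEEE binary16 encoding of 1.0:)

```
// Only meaningful on the wrapper path (neither HWY_EMULATE_SVE nor
// HWY_NATIVE_FLOAT16): the bits are stored and moved, never computed with.
hwy::float16_t One16() {
  hwy::float16_t h;
  h.bits = 0x3C00;  // sign 0, biased exponent 15, mantissa 0 => 1.0
  return h;
}
```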
|
||||
|
||||
|
@ -368,15 +263,34 @@ struct EnableIfT<true, T> {
|
|||
template <bool Condition, class T = void>
|
||||
using EnableIf = typename EnableIfT<Condition, T>::type;
|
||||
|
||||
template <typename T, typename U>
|
||||
struct IsSameT {
|
||||
enum { value = 0 };
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct IsSameT<T, T> {
|
||||
enum { value = 1 };
|
||||
};
|
||||
|
||||
template <typename T, typename U>
|
||||
HWY_API constexpr bool IsSame() {
|
||||
return IsSameT<T, U>::value;
|
||||
}
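
// Example (added for illustration): IsSame is constexpr, so it can drive
// static_assert as well as overload selection.
static_assert(IsSame<int32_t, int32_t>(), "identical types compare equal");
static_assert(!IsSame<int32_t, uint32_t>(), "signedness differs");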

// Insert into template/function arguments to enable this overload only for
// vectors of AT MOST this many bits.
//
// Note that enabling for exactly 128 bits is unnecessary because a function can
// simply be overloaded with Vec128<T> and Full128<T> descriptor. Enabling for
// other sizes (e.g. 64 bit) can be achieved with Simd<T, 8 / sizeof(T)>.
// simply be overloaded with Vec128<T> and/or Full128<T> tag. Enabling for other
// sizes (e.g. 64 bit) can be achieved via Simd<T, 8 / sizeof(T)>.
#define HWY_IF_LE128(T, N) hwy::EnableIf<N * sizeof(T) <= 16>* = nullptr
#define HWY_IF_LE64(T, N) hwy::EnableIf<N * sizeof(T) <= 8>* = nullptr
#define HWY_IF_LE32(T, N) hwy::EnableIf<N * sizeof(T) <= 4>* = nullptr
#define HWY_IF_GE32(T, N) hwy::EnableIf<N * sizeof(T) >= 4>* = nullptr
#define HWY_IF_GE64(T, N) hwy::EnableIf<N * sizeof(T) >= 8>* = nullptr
#define HWY_IF_GE128(T, N) hwy::EnableIf<N * sizeof(T) >= 16>* = nullptr
#define HWY_IF_GT128(T, N) hwy::EnableIf<(N * sizeof(T) > 16)>* = nullptr
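
// Usage sketch (hypothetical declaration, added for illustration): this
// overload only participates when the vector is at most 64 bits, i.e.
// N * sizeof(T) <= 8.
template <typename T, size_t N, HWY_IF_LE64(T, N)>
void ExampleOnlyForLE64();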

#define HWY_IF_UNSIGNED(T) hwy::EnableIf<!IsSigned<T>()>* = nullptr
#define HWY_IF_SIGNED(T) \

@@ -393,28 +307,50 @@ using EnableIf = typename EnableIfT<Condition, T>::type;
template <size_t N>
struct SizeTag {};

template <class T>
struct RemoveConstT {
  using type = T;
};
template <class T>
struct RemoveConstT<const T> {
  using type = T;
};

template <class T>
using RemoveConst = typename RemoveConstT<T>::type;

//------------------------------------------------------------------------------
// Type traits

template <typename T>
constexpr bool IsFloat() {
  return T(1.25) != T(1);
HWY_API constexpr bool IsFloat() {
  // Cannot use T(1.25) != T(1) for float16_t, which can only be converted to or
  // from a float, not compared.
  return IsSame<T, float>() || IsSame<T, double>();
}

template <typename T>
constexpr bool IsSigned() {
HWY_API constexpr bool IsSigned() {
  return T(0) > T(-1);
}
template <>
constexpr bool IsSigned<float16_t>() {
  return true;
}
template <>
constexpr bool IsSigned<bfloat16_t>() {
  return true;
}

// Largest/smallest representable integer values.
template <typename T>
constexpr T LimitsMax() {
HWY_API constexpr T LimitsMax() {
  static_assert(!IsFloat<T>(), "Only for integer types");
  return IsSigned<T>() ? T((1ULL << (sizeof(T) * 8 - 1)) - 1)
                       : static_cast<T>(~0ull);
}
template <typename T>
constexpr T LimitsMin() {
HWY_API constexpr T LimitsMin() {
  static_assert(!IsFloat<T>(), "Only for integer types");
  return IsSigned<T>() ? T(-1) - LimitsMax<T>() : T(0);
}
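
// Worked examples (values follow from the definitions above):
static_assert(LimitsMax<int16_t>() == 32767, "2^15 - 1");
static_assert(LimitsMin<int16_t>() == -32768, "-2^15");
static_assert(LimitsMax<uint8_t>() == 255, "2^8 - 1");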

@@ -422,7 +358,7 @@ constexpr T LimitsMin() {
// Largest/smallest representable value (integer or float). This naming avoids
// confusion with numeric_limits<float>::min() (the smallest positive value).
template <typename T>
constexpr T LowestValue() {
HWY_API constexpr T LowestValue() {
  return LimitsMin<T>();
}
template <>

@@ -435,7 +371,7 @@ constexpr double LowestValue<double>() {
}

template <typename T>
constexpr T HighestValue() {
HWY_API constexpr T HighestValue() {
  return LimitsMax<T>();
}
template <>

@@ -550,11 +486,18 @@ struct Relations<float16_t> {
  using Wide = float;
};
template <>
struct Relations<bfloat16_t> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Wide = float;
};
template <>
struct Relations<float> {
  using Unsigned = uint32_t;
  using Signed = int32_t;
  using Float = float;
  using Wide = double;
  using Narrow = float16_t;
};
template <>
struct Relations<double> {

@@ -564,6 +507,31 @@ struct Relations<double> {
  using Narrow = float;
};

template <size_t N>
struct TypeFromSize;
template <>
struct TypeFromSize<1> {
  using Unsigned = uint8_t;
  using Signed = int8_t;
};
template <>
struct TypeFromSize<2> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
};
template <>
struct TypeFromSize<4> {
  using Unsigned = uint32_t;
  using Signed = int32_t;
  using Float = float;
};
template <>
struct TypeFromSize<8> {
  using Unsigned = uint64_t;
  using Signed = int64_t;
  using Float = double;
};

}  // namespace detail

// Aliases for types of a different category, but the same size.

@@ -580,6 +548,14 @@ using MakeWide = typename detail::Relations<T>::Wide;
template <typename T>
using MakeNarrow = typename detail::Relations<T>::Narrow;

// Obtain type from its size [bytes].
template <size_t N>
using UnsignedFromSize = typename detail::TypeFromSize<N>::Unsigned;
template <size_t N>
using SignedFromSize = typename detail::TypeFromSize<N>::Signed;
template <size_t N>
using FloatFromSize = typename detail::TypeFromSize<N>::Float;
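
// Usage sketch (added for illustration): recover a concrete lane type when
// only its size in bytes is known.
static_assert(IsSame<UnsignedFromSize<4>, uint32_t>(), "4 bytes -> uint32_t");
static_assert(IsSame<SignedFromSize<8>, int64_t>(), "8 bytes -> int64_t");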

//------------------------------------------------------------------------------
// Helper functions

@@ -599,27 +575,123 @@ HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
  unsigned long index;  // NOLINT
  _BitScanForward(&index, x);
  return index;
#else   // HWY_COMPILER_MSVC
#else  // HWY_COMPILER_MSVC
  return static_cast<size_t>(__builtin_ctz(x));
#endif  // HWY_COMPILER_MSVC
}

HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x) {
#if HWY_COMPILER_MSVC
#if HWY_ARCH_X86_64
  unsigned long index;  // NOLINT
  _BitScanForward64(&index, x);
  return index;
#else  // HWY_ARCH_X86_64
  // _BitScanForward64 not available
  uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
  unsigned long index;
  if (lsb == 0) {
    uint32_t msb = static_cast<uint32_t>(x >> 32u);
    _BitScanForward(&index, msb);
    return 32 + index;
  } else {
    _BitScanForward(&index, lsb);
    return index;
  }
#endif  // HWY_ARCH_X86_64
#else  // HWY_COMPILER_MSVC
  return static_cast<size_t>(__builtin_ctzll(x));
#endif  // HWY_COMPILER_MSVC
}

// Undefined results for x == 0.
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x) {
#if HWY_COMPILER_MSVC
  unsigned long index;  // NOLINT
  _BitScanReverse(&index, x);
  return 31 - index;
#else  // HWY_COMPILER_MSVC
  return static_cast<size_t>(__builtin_clz(x));
#endif  // HWY_COMPILER_MSVC
}

HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) {
#if HWY_COMPILER_MSVC
#if HWY_ARCH_X86_64
  unsigned long index;  // NOLINT
  _BitScanReverse64(&index, x);
  return 63 - index;
#else  // HWY_ARCH_X86_64
  // _BitScanReverse64 not available
  const uint32_t msb = static_cast<uint32_t>(x >> 32u);
  unsigned long index;
  if (msb == 0) {
    const uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
    _BitScanReverse(&index, lsb);
    return 63 - index;
  } else {
    _BitScanReverse(&index, msb);
    return 31 - index;
  }
#endif  // HWY_ARCH_X86_64
#else  // HWY_COMPILER_MSVC
  return static_cast<size_t>(__builtin_clzll(x));
#endif  // HWY_COMPILER_MSVC
}
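
// Worked example (illustrative): 0x28 = 0b101000 has 3 trailing zero bits and,
// within 64 bits, 58 leading zero bits.
HWY_MAYBE_UNUSED static void ExampleBitScan() {
  HWY_DASSERT(Num0BitsBelowLS1Bit_Nonzero64(0x28ull) == 3);
  HWY_DASSERT(Num0BitsAboveMS1Bit_Nonzero64(0x28ull) == 58);
}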

HWY_API size_t PopCount(uint64_t x) {
#if HWY_COMPILER_CLANG || HWY_COMPILER_GCC
  return static_cast<size_t>(__builtin_popcountll(x));
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
  // This instruction has a separate feature flag, but is often called from
  // non-SIMD code, so we don't want to require dynamic dispatch. It was first
  // supported by Intel in Nehalem (SSE4.2), but MSVC only predefines a macro
  // for AVX, so check for that.
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 && defined(__AVX__)
  return _mm_popcnt_u64(x);
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
  return _mm_popcnt_u32(uint32_t(x)) + _mm_popcnt_u32(uint32_t(x >> 32));
#else
  x -= ((x >> 1) & 0x55555555U);
  x = (((x >> 2) & 0x33333333U) + (x & 0x33333333U));
  x = (((x >> 4) + x) & 0x0F0F0F0FU);
  x -= ((x >> 1) & 0x5555555555555555ULL);
  x = (((x >> 2) & 0x3333333333333333ULL) + (x & 0x3333333333333333ULL));
  x = (((x >> 4) + x) & 0x0F0F0F0F0F0F0F0FULL);
  x += (x >> 8);
  x += (x >> 16);
  x += (x >> 32);
  x = x & 0x0000007FU;
  return (unsigned int)x;
  return static_cast<size_t>(x & 0x7Fu);
#endif
}
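
// Worked example (illustrative): 0xF0F0 = 1111'0000'1111'0000, so 8 set bits.
HWY_MAYBE_UNUSED static void ExamplePopCount() {
  HWY_DASSERT(PopCount(0xF0F0ull) == 8);
}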

template <typename TI>
HWY_API constexpr size_t FloorLog2(TI x) {
  return x == 1 ? 0 : FloorLog2(x >> 1) + 1;
}

template <typename TI>
HWY_API constexpr size_t CeilLog2(TI x) {
  return x == 1 ? 0 : FloorLog2(x - 1) + 1;
}
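
// Worked examples: both are exact for powers of two and bracket other inputs.
static_assert(FloorLog2(8) == 3 && CeilLog2(8) == 3, "2^3");
static_assert(FloorLog2(9) == 3 && CeilLog2(9) == 4, "between 2^3 and 2^4");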

#if HWY_COMPILER_MSVC && HWY_ARCH_X86_64
#pragma intrinsic(_umul128)
#endif

// 64 x 64 = 128 bit multiplication
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT upper) {
#if defined(__SIZEOF_INT128__)
  __uint128_t product = (__uint128_t)a * (__uint128_t)b;
  *upper = (uint64_t)(product >> 64);
  return (uint64_t)(product & 0xFFFFFFFFFFFFFFFFULL);
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
  return _umul128(a, b, upper);
#else
  constexpr uint64_t kLo32 = 0xFFFFFFFFU;
  const uint64_t lo_lo = (a & kLo32) * (b & kLo32);
  const uint64_t hi_lo = (a >> 32) * (b & kLo32);
  const uint64_t lo_hi = (a & kLo32) * (b >> 32);
  const uint64_t hi_hi = (a >> 32) * (b >> 32);
  const uint64_t t = (lo_lo >> 32) + (hi_lo & kLo32) + lo_hi;
  *upper = (hi_lo >> 32) + (t >> 32) + hi_hi;
  return (t << 32) | (lo_lo & kLo32);
#endif
}
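
// Usage sketch (illustrative): the full 128-bit product of 2^32 * 2^32 is
// 2^64, i.e. upper = 1 and lower = 0.
HWY_MAYBE_UNUSED static void ExampleMul128() {
  uint64_t upper = 0;
  const uint64_t lower = Mul128(1ull << 32, 1ull << 32, &upper);
  HWY_DASSERT(lower == 0 && upper == 1);
  (void)lower;
}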

@@ -639,6 +711,22 @@ HWY_API void CopyBytes(const From* from, To* to) {
#endif
}

HWY_API float F32FromBF16(bfloat16_t bf) {
  uint32_t bits = bf.bits;
  bits <<= 16;
  float f;
  CopyBytes<4>(&bits, &f);
  return f;
}

HWY_API bfloat16_t BF16FromF32(float f) {
  uint32_t bits;
  CopyBytes<4>(&f, &bits);
  bfloat16_t bf;
  bf.bits = static_cast<uint16_t>(bits >> 16);
  return bf;
}
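
// Round-trip sketch: BF16 is the upper 16 bits of binary32, so values whose
// low 16 mantissa bits are zero (e.g. 1.0f) convert back exactly.
HWY_MAYBE_UNUSED static void ExampleBF16RoundTrip() {
  const bfloat16_t bf = BF16FromF32(1.0f);
  HWY_DASSERT(F32FromBF16(bf) == 1.0f);
  (void)bf;
}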

HWY_NORETURN void HWY_FORMAT(3, 4)
    Abort(const char* file, int line, const char* format, ...);

@@ -90,6 +90,51 @@ HWY_NOINLINE void TestAllType() {
  ForFloatTypes(TestIsFloat());
}

struct TestIsSame {
  template <class T>
  HWY_NOINLINE void operator()(T /*unused*/) const {
    static_assert(IsSame<T, T>(), "T == T");
    static_assert(!IsSame<MakeSigned<T>, MakeUnsigned<T>>(), "S != U");
    static_assert(!IsSame<MakeUnsigned<T>, MakeSigned<T>>(), "U != S");
  }
};

HWY_NOINLINE void TestAllIsSame() { ForAllTypes(TestIsSame()); }

HWY_NOINLINE void TestAllBitScan() {
  HWY_ASSERT_EQ(size_t(0), Num0BitsAboveMS1Bit_Nonzero32(0x80000000u));
  HWY_ASSERT_EQ(size_t(0), Num0BitsAboveMS1Bit_Nonzero32(0xFFFFFFFFu));
  HWY_ASSERT_EQ(size_t(1), Num0BitsAboveMS1Bit_Nonzero32(0x40000000u));
  HWY_ASSERT_EQ(size_t(1), Num0BitsAboveMS1Bit_Nonzero32(0x40108210u));
  HWY_ASSERT_EQ(size_t(30), Num0BitsAboveMS1Bit_Nonzero32(2u));
  HWY_ASSERT_EQ(size_t(30), Num0BitsAboveMS1Bit_Nonzero32(3u));
  HWY_ASSERT_EQ(size_t(31), Num0BitsAboveMS1Bit_Nonzero32(1u));

  HWY_ASSERT_EQ(size_t(0),
                Num0BitsAboveMS1Bit_Nonzero64(0x8000000000000000ull));
  HWY_ASSERT_EQ(size_t(0),
                Num0BitsAboveMS1Bit_Nonzero64(0xFFFFFFFFFFFFFFFFull));
  HWY_ASSERT_EQ(size_t(1),
                Num0BitsAboveMS1Bit_Nonzero64(0x4000000000000000ull));
  HWY_ASSERT_EQ(size_t(1),
                Num0BitsAboveMS1Bit_Nonzero64(0x4010821004200011ull));
  HWY_ASSERT_EQ(size_t(62), Num0BitsAboveMS1Bit_Nonzero64(2ull));
  HWY_ASSERT_EQ(size_t(62), Num0BitsAboveMS1Bit_Nonzero64(3ull));
  HWY_ASSERT_EQ(size_t(63), Num0BitsAboveMS1Bit_Nonzero64(1ull));

  HWY_ASSERT_EQ(size_t(0), Num0BitsBelowLS1Bit_Nonzero32(1u));
  HWY_ASSERT_EQ(size_t(1), Num0BitsBelowLS1Bit_Nonzero32(2u));
  HWY_ASSERT_EQ(size_t(30), Num0BitsBelowLS1Bit_Nonzero32(0xC0000000u));
  HWY_ASSERT_EQ(size_t(31), Num0BitsBelowLS1Bit_Nonzero32(0x80000000u));

  HWY_ASSERT_EQ(size_t(0), Num0BitsBelowLS1Bit_Nonzero64(1ull));
  HWY_ASSERT_EQ(size_t(1), Num0BitsBelowLS1Bit_Nonzero64(2ull));
  HWY_ASSERT_EQ(size_t(62),
                Num0BitsBelowLS1Bit_Nonzero64(0xC000000000000000ull));
  HWY_ASSERT_EQ(size_t(63),
                Num0BitsBelowLS1Bit_Nonzero64(0x8000000000000000ull));
}

HWY_NOINLINE void TestAllPopCount() {
  HWY_ASSERT_EQ(size_t(0), PopCount(0u));
  HWY_ASSERT_EQ(size_t(1), PopCount(1u));

@@ -113,11 +158,21 @@ HWY_NOINLINE void TestAllPopCount() {
HWY_AFTER_NAMESPACE();

#if HWY_ONCE

namespace hwy {
HWY_BEFORE_TEST(BaseTest);
HWY_EXPORT_AND_TEST_P(BaseTest, TestAllLimits);
HWY_EXPORT_AND_TEST_P(BaseTest, TestAllLowestHighest);
HWY_EXPORT_AND_TEST_P(BaseTest, TestAllType);
HWY_EXPORT_AND_TEST_P(BaseTest, TestAllIsSame);
HWY_EXPORT_AND_TEST_P(BaseTest, TestAllBitScan);
HWY_EXPORT_AND_TEST_P(BaseTest, TestAllPopCount);
}  // namespace hwy

// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}

#endif

@@ -55,20 +55,28 @@ namespace hwy {
// Delays subsequent loads until prior loads are visible. On Intel CPUs, also
// serves as a full fence (waits for all prior instructions to complete).
// No effect on non-x86.
// DEPRECATED due to differing behavior across architectures AND vendors.
HWY_INLINE HWY_ATTR_CACHE void LoadFence() {
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
  _mm_lfence();
#endif
}

// Ensures previous weakly-ordered stores are visible. No effect on non-x86.
HWY_INLINE HWY_ATTR_CACHE void StoreFence() {
// Ensures values written by previous `Stream` calls are visible on the current
// core. This is NOT sufficient for synchronizing across cores; when `Stream`
// outputs are to be consumed by other core(s), the producer must publish
// availability (e.g. via mutex or atomic_flag) after `FlushStream`.
HWY_INLINE HWY_ATTR_CACHE void FlushStream() {
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
  _mm_sfence();
#endif
}

// Begins loading the cache line containing "p".
// DEPRECATED, replace with `FlushStream`.
HWY_INLINE HWY_ATTR_CACHE void StoreFence() { FlushStream(); }
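
// Producer-side usage sketch (illustrative; Stream is defined in the
// per-target ops headers, and the publish step is a hypothetical placeholder):
//   for (size_t i = 0; i < n; i += Lanes(d)) Stream(v, d, out + i);
//   FlushStream();              // make the streamed stores visible
//   publish_flag.store(true);   // hypothetical std::atomic<bool> handoff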

// Optionally begins loading the cache line containing "p" to reduce latency of
// subsequent actual loads.
template <typename T>
HWY_INLINE HWY_ATTR_CACHE void Prefetch(const T* p) {
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)

@@ -82,7 +90,7 @@ HWY_INLINE HWY_ATTR_CACHE void Prefetch(const T* p) {
#endif
}

// Invalidates and flushes the cache line containing "p". No effect on non-x86.
// Invalidates and flushes the cache line containing "p", if possible.
HWY_INLINE HWY_ATTR_CACHE void FlushCacheline(const void* p) {
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
  _mm_clflush(p);

@@ -91,7 +99,7 @@ HWY_INLINE HWY_ATTR_CACHE void FlushCacheline(const void* p) {
#endif
}

// Reduces power consumption in spin-loops. No effect on non-x86.
// When called inside a spin-loop, may reduce power consumption.
HWY_INLINE HWY_ATTR_CACHE void Pause() {
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
  _mm_pause();

@@ -0,0 +1,258 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Include guard (still compiled once per target)
#include <cmath>

#if defined(HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_) == \
    defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_
#undef HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_
#else
#define HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_
#endif

#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

struct Dot {
  // Specify zero or more of these, ORed together, as the kAssumptions template
  // argument to Compute. Each one may improve performance or reduce code size,
  // at the cost of additional requirements on the arguments.
  enum Assumptions {
    // num_elements is at least N, which may be up to HWY_MAX_LANES(T).
    kAtLeastOneVector = 1,
    // num_elements is divisible by N (a power of two, so this can be used if
    // the problem size is known to be a power of two >= HWY_MAX_LANES(T)).
    kMultipleOfVector = 2,
    // RoundUpTo(num_elements, N) elements are accessible; their value does not
    // matter (will be treated as if they were zero).
    kPaddedToVector = 4,
    // Pointers pa and pb, respectively, are multiples of N * sizeof(T).
    // For example, aligned_allocator.h ensures this. Note that it is still
    // beneficial to ensure such alignment even if these flags are not set.
    // If not set, the pointers need only be aligned to alignof(T).
    kVectorAlignedA = 8,
    kVectorAlignedB = 16,
  };

  // Returns sum{pa[i] * pb[i]} for float or double inputs.
  template <int kAssumptions, class D, typename T = TFromD<D>,
            HWY_IF_NOT_LANE_SIZE_D(D, 2)>
  static HWY_INLINE T Compute(const D d, const T* const HWY_RESTRICT pa,
                              const T* const HWY_RESTRICT pb,
                              const size_t num_elements) {
    static_assert(IsFloat<T>(), "MulAdd requires float type");
    using V = decltype(Zero(d));

    const size_t N = Lanes(d);
    size_t i = 0;

    constexpr bool kIsAtLeastOneVector =
        (kAssumptions & kAtLeastOneVector) != 0;
    constexpr bool kIsMultipleOfVector =
        (kAssumptions & kMultipleOfVector) != 0;
    constexpr bool kIsPaddedToVector = (kAssumptions & kPaddedToVector) != 0;
    constexpr bool kIsAlignedA = (kAssumptions & kVectorAlignedA) != 0;
    constexpr bool kIsAlignedB = (kAssumptions & kVectorAlignedB) != 0;

    // Won't be able to do a full vector load without padding => scalar loop.
    if (!kIsAtLeastOneVector && !kIsMultipleOfVector && !kIsPaddedToVector &&
        HWY_UNLIKELY(num_elements < N)) {
      // Only 2x unroll to avoid excessive code size.
      T sum0 = T(0);
      T sum1 = T(0);
      for (; i + 2 <= num_elements; i += 2) {
        sum0 += pa[i + 0] * pb[i + 0];
        sum1 += pa[i + 1] * pb[i + 1];
      }
      if (i < num_elements) {
        sum1 += pa[i] * pb[i];
      }
      return sum0 + sum1;
    }

    // Compiler doesn't make independent sum* accumulators, so unroll manually.
    // 2 FMA ports * 4 cycle latency = up to 8 in-flight, but that is excessive
    // for unaligned inputs (each unaligned pointer halves the throughput
    // because it occupies both L1 load ports for a cycle). We cannot have
    // arrays of vectors on RVV/SVE, so always unroll 4x.
    V sum0 = Zero(d);
    V sum1 = Zero(d);
    V sum2 = Zero(d);
    V sum3 = Zero(d);

    // Main loop: unrolled
    for (; i + 4 * N <= num_elements; /* i += 4 * N */) {  // incr in loop
      const auto a0 = kIsAlignedA ? Load(d, pa + i) : LoadU(d, pa + i);
      const auto b0 = kIsAlignedB ? Load(d, pb + i) : LoadU(d, pb + i);
      i += N;
      sum0 = MulAdd(a0, b0, sum0);
      const auto a1 = kIsAlignedA ? Load(d, pa + i) : LoadU(d, pa + i);
      const auto b1 = kIsAlignedB ? Load(d, pb + i) : LoadU(d, pb + i);
      i += N;
      sum1 = MulAdd(a1, b1, sum1);
      const auto a2 = kIsAlignedA ? Load(d, pa + i) : LoadU(d, pa + i);
      const auto b2 = kIsAlignedB ? Load(d, pb + i) : LoadU(d, pb + i);
      i += N;
      sum2 = MulAdd(a2, b2, sum2);
      const auto a3 = kIsAlignedA ? Load(d, pa + i) : LoadU(d, pa + i);
      const auto b3 = kIsAlignedB ? Load(d, pb + i) : LoadU(d, pb + i);
      i += N;
      sum3 = MulAdd(a3, b3, sum3);
    }

    // Up to 3 iterations of whole vectors
    for (; i + N <= num_elements; i += N) {
      const auto a = kIsAlignedA ? Load(d, pa + i) : LoadU(d, pa + i);
      const auto b = kIsAlignedB ? Load(d, pb + i) : LoadU(d, pb + i);
      sum0 = MulAdd(a, b, sum0);
    }

    if (!kIsMultipleOfVector) {
      const size_t remaining = num_elements - i;
      if (remaining != 0) {
        if (kIsPaddedToVector) {
          const auto mask = FirstN(d, remaining);
          const auto a = kIsAlignedA ? Load(d, pa + i) : LoadU(d, pa + i);
          const auto b = kIsAlignedB ? Load(d, pb + i) : LoadU(d, pb + i);
          sum1 = MulAdd(IfThenElseZero(mask, a), IfThenElseZero(mask, b), sum1);
        } else {
          // Unaligned load such that the last element is in the highest lane -
          // ensures we do not touch any elements outside the valid range.
          // If we get here, then num_elements >= N.
          HWY_DASSERT(i >= N);
          i += remaining - N;
          const auto skip = FirstN(d, N - remaining);
          const auto a = LoadU(d, pa + i);  // always unaligned
          const auto b = LoadU(d, pb + i);
          sum1 = MulAdd(IfThenZeroElse(skip, a), IfThenZeroElse(skip, b), sum1);
        }
      }
    }  // kMultipleOfVector

    // Reduction tree: sum of all accumulators by pairs, then across lanes.
    sum0 = Add(sum0, sum1);
    sum2 = Add(sum2, sum3);
    sum0 = Add(sum0, sum2);
    return GetLane(SumOfLanes(d, sum0));
  }

  // Returns sum{pa[i] * pb[i]} for bfloat16 inputs.
  template <int kAssumptions, class D>
  static HWY_INLINE float Compute(const D d,
                                  const bfloat16_t* const HWY_RESTRICT pa,
                                  const bfloat16_t* const HWY_RESTRICT pb,
                                  const size_t num_elements) {
    const RebindToUnsigned<D> du16;
    const Repartition<float, D> df32;

    using V = decltype(Zero(df32));
    const size_t N = Lanes(d);
    size_t i = 0;

    constexpr bool kIsAtLeastOneVector =
        (kAssumptions & kAtLeastOneVector) != 0;
    constexpr bool kIsMultipleOfVector =
        (kAssumptions & kMultipleOfVector) != 0;
    constexpr bool kIsPaddedToVector = (kAssumptions & kPaddedToVector) != 0;
    constexpr bool kIsAlignedA = (kAssumptions & kVectorAlignedA) != 0;
    constexpr bool kIsAlignedB = (kAssumptions & kVectorAlignedB) != 0;

    // Won't be able to do a full vector load without padding => scalar loop.
    if (!kIsAtLeastOneVector && !kIsMultipleOfVector && !kIsPaddedToVector &&
        HWY_UNLIKELY(num_elements < N)) {
      float sum0 = 0.0f;  // Only 2x unroll to avoid excessive code size for..
      float sum1 = 0.0f;  // this unlikely(?) case.
      for (; i + 2 <= num_elements; i += 2) {
        sum0 += F32FromBF16(pa[i + 0]) * F32FromBF16(pb[i + 0]);
        sum1 += F32FromBF16(pa[i + 1]) * F32FromBF16(pb[i + 1]);
      }
      if (i < num_elements) {
        sum1 += F32FromBF16(pa[i]) * F32FromBF16(pb[i]);
      }
      return sum0 + sum1;
    }

    // See comment in the other Compute() overload. Unroll 2x, but we need
    // twice as many sums for ReorderWidenMulAccumulate.
    V sum0 = Zero(df32);
    V sum1 = Zero(df32);
    V sum2 = Zero(df32);
    V sum3 = Zero(df32);

    // Main loop: unrolled
    for (; i + 2 * N <= num_elements; /* i += 2 * N */) {  // incr in loop
      const auto a0 = kIsAlignedA ? Load(d, pa + i) : LoadU(d, pa + i);
      const auto b0 = kIsAlignedB ? Load(d, pb + i) : LoadU(d, pb + i);
      i += N;
      sum0 = ReorderWidenMulAccumulate(df32, a0, b0, sum0, sum1);
      const auto a1 = kIsAlignedA ? Load(d, pa + i) : LoadU(d, pa + i);
      const auto b1 = kIsAlignedB ? Load(d, pb + i) : LoadU(d, pb + i);
      i += N;
      sum2 = ReorderWidenMulAccumulate(df32, a1, b1, sum2, sum3);
    }

    // Possibly one more iteration of whole vectors
    if (i + N <= num_elements) {
      const auto a0 = kIsAlignedA ? Load(d, pa + i) : LoadU(d, pa + i);
      const auto b0 = kIsAlignedB ? Load(d, pb + i) : LoadU(d, pb + i);
      i += N;
      sum0 = ReorderWidenMulAccumulate(df32, a0, b0, sum0, sum1);
    }

    if (!kIsMultipleOfVector) {
      const size_t remaining = num_elements - i;
      if (remaining != 0) {
        if (kIsPaddedToVector) {
          const auto mask = FirstN(du16, remaining);
          const auto va = kIsAlignedA ? Load(d, pa + i) : LoadU(d, pa + i);
          const auto vb = kIsAlignedB ? Load(d, pb + i) : LoadU(d, pb + i);
          const auto a16 = BitCast(d, IfThenElseZero(mask, BitCast(du16, va)));
          const auto b16 = BitCast(d, IfThenElseZero(mask, BitCast(du16, vb)));
          sum2 = ReorderWidenMulAccumulate(df32, a16, b16, sum2, sum3);

        } else {
          // Unaligned load such that the last element is in the highest lane -
          // ensures we do not touch any elements outside the valid range.
          // If we get here, then num_elements >= N.
          HWY_DASSERT(i >= N);
          i += remaining - N;
          const auto skip = FirstN(du16, N - remaining);
          const auto va = LoadU(d, pa + i);  // always unaligned
          const auto vb = LoadU(d, pb + i);
          const auto a16 = BitCast(d, IfThenZeroElse(skip, BitCast(du16, va)));
          const auto b16 = BitCast(d, IfThenZeroElse(skip, BitCast(du16, vb)));
          sum2 = ReorderWidenMulAccumulate(df32, a16, b16, sum2, sum3);
        }
      }
    }  // kMultipleOfVector

    // Reduction tree: sum of all accumulators by pairs, then across lanes.
    sum0 = Add(sum0, sum1);
    sum2 = Add(sum2, sum3);
    sum0 = Add(sum0, sum2);
    return GetLane(SumOfLanes(df32, sum0));
  }
};
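
// Usage sketch (hypothetical caller, added for illustration): buffers from
// aligned_allocator.h, padded to a whole number of vectors.
HWY_MAYBE_UNUSED static HWY_INLINE float ExampleDot(
    const float* HWY_RESTRICT pa, const float* HWY_RESTRICT pb, size_t num) {
  const ScalableTag<float> d;
  constexpr int kAssumptions =
      Dot::kPaddedToVector | Dot::kVectorAlignedA | Dot::kVectorAlignedB;
  return Dot::Compute<kAssumptions>(d, pa, pb, num);
}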

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#endif  // HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_

@@ -0,0 +1,193 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include "hwy/aligned_allocator.h"

// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/dot/dot_test.cc"
#include "hwy/foreach_target.h"

#include "hwy/contrib/dot/dot-inl.h"
#include "hwy/tests/test_util-inl.h"
// clang-format on

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

template <typename T>
HWY_NOINLINE T SimpleDot(const T* pa, const T* pb, size_t num) {
  double sum = 0.0;
  for (size_t i = 0; i < num; ++i) {
    sum += pa[i] * pb[i];
  }
  return static_cast<T>(sum);
}

HWY_NOINLINE float SimpleDot(const bfloat16_t* pa, const bfloat16_t* pb,
                             size_t num) {
  float sum = 0.0f;
  for (size_t i = 0; i < num; ++i) {
    sum += F32FromBF16(pa[i]) * F32FromBF16(pb[i]);
  }
  return sum;
}

template <typename T>
void SetValue(const float value, T* HWY_RESTRICT ptr) {
  *ptr = static_cast<T>(value);
}
void SetValue(const float value, bfloat16_t* HWY_RESTRICT ptr) {
  *ptr = BF16FromF32(value);
}

class TestDot {
  // Computes/verifies one dot product.
  template <int kAssumptions, class D>
  void Test(D d, size_t num, size_t misalign_a, size_t misalign_b,
            RandomState& rng) {
    using T = TFromD<D>;
    const size_t N = Lanes(d);
    const auto random_t = [&rng]() {
      const int32_t bits = static_cast<int32_t>(Random32(&rng)) & 1023;
      return static_cast<float>(bits - 512) * (1.0f / 64);
    };

    const bool kIsAlignedA = (kAssumptions & Dot::kVectorAlignedA) != 0;
    const bool kIsAlignedB = (kAssumptions & Dot::kVectorAlignedB) != 0;

    HWY_ASSERT(!kIsAlignedA || misalign_a == 0);
    HWY_ASSERT(!kIsAlignedB || misalign_b == 0);
    const size_t padded =
        (kAssumptions & Dot::kPaddedToVector) ? RoundUpTo(num, N) : num;
    AlignedFreeUniquePtr<T[]> pa = AllocateAligned<T>(misalign_a + padded);
    AlignedFreeUniquePtr<T[]> pb = AllocateAligned<T>(misalign_b + padded);
    T* a = pa.get() + misalign_a;
    T* b = pb.get() + misalign_b;
    size_t i = 0;
    for (; i < num; ++i) {
      SetValue(random_t(), a + i);
      SetValue(random_t(), b + i);
    }
    // Fill padding with NaN - the values are not used, but avoids MSAN errors.
    for (; i < padded; ++i) {
      ScalableTag<float> df1;
      SetValue(GetLane(NaN(df1)), a + i);
      SetValue(GetLane(NaN(df1)), b + i);
    }

    const auto expected = SimpleDot(a, b, num);
    const auto actual = Dot::Compute<kAssumptions>(d, a, b, num);
    const auto max = static_cast<decltype(actual)>(8 * 8 * num);
    HWY_ASSERT(-max <= actual && actual <= max);
    HWY_ASSERT(expected - 1E-4 <= actual && actual <= expected + 1E-4);
  }

  // Runs tests with various alignments compatible with the given assumptions.
  template <int kAssumptions, class D>
  void ForeachMisalign(D d, size_t num, RandomState& rng) {
    static_assert(
        (kAssumptions & (Dot::kVectorAlignedA | Dot::kVectorAlignedB)) == 0,
        "Alignment must not be specified by caller");

    const size_t N = Lanes(d);
    const size_t misalignments[3] = {0, N / 4, 3 * N / 5};

    // Both flags, both aligned
    Test<kAssumptions | Dot::kVectorAlignedA | Dot::kVectorAlignedB>(d, num, 0,
                                                                     0, rng);

    // One flag and aligned, other aligned/misaligned
    for (size_t m : misalignments) {
      Test<kAssumptions | Dot::kVectorAlignedA>(d, num, 0, m, rng);
      Test<kAssumptions | Dot::kVectorAlignedB>(d, num, m, 0, rng);
    }

    // Neither flag, all combinations of aligned/misaligned
    for (size_t ma : misalignments) {
      for (size_t mb : misalignments) {
        Test<kAssumptions>(d, num, ma, mb, rng);
      }
    }
  }

  // Runs tests with various lengths compatible with the given assumptions.
  template <int kAssumptions, class D>
  void ForeachCount(D d, RandomState& rng) {
    const size_t N = Lanes(d);
    const size_t counts[] = {1,
                             3,
                             7,
                             16,
                             HWY_MAX(N / 2, 1),
                             HWY_MAX(2 * N / 3, 1),
                             N,
                             N + 1,
                             4 * N / 3,
                             3 * N,
                             8 * N,
                             8 * N + 2};
    for (size_t num : counts) {
      if ((kAssumptions & Dot::kAtLeastOneVector) && num < N) continue;
      if ((kAssumptions & Dot::kMultipleOfVector) && (num % N) != 0) continue;
      ForeachMisalign<kAssumptions>(d, num, rng);
    }
  }

 public:
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    RandomState rng;

    // All 8 combinations of the three length-related flags:
    ForeachCount<0>(d, rng);
    ForeachCount<Dot::kAtLeastOneVector>(d, rng);
    ForeachCount<Dot::kMultipleOfVector>(d, rng);
    ForeachCount<Dot::kMultipleOfVector | Dot::kAtLeastOneVector>(d, rng);
    ForeachCount<Dot::kPaddedToVector>(d, rng);
    ForeachCount<Dot::kPaddedToVector | Dot::kAtLeastOneVector>(d, rng);
    ForeachCount<Dot::kPaddedToVector | Dot::kMultipleOfVector>(d, rng);
    ForeachCount<Dot::kPaddedToVector | Dot::kMultipleOfVector |
                 Dot::kAtLeastOneVector>(d, rng);
  }
};

void TestAllDot() { ForFloatTypes(ForPartialVectors<TestDot>()); }
void TestAllDotBF16() { ForShrinkableVectors<TestDot>()(bfloat16_t()); }

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE

namespace hwy {
HWY_BEFORE_TEST(DotTest);
HWY_EXPORT_AND_TEST_P(DotTest, TestAllDot);
HWY_EXPORT_AND_TEST_P(DotTest, TestAllDotBF16);
}  // namespace hwy

// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}

#endif

@@ -26,7 +26,7 @@
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
size_t GetVectorSize() { return Lanes(HWY_FULL(uint8_t)()); }
size_t GetVectorSize() { return Lanes(ScalableTag<uint8_t>()); }
// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE

@@ -58,7 +58,7 @@ size_t ImageBase::BytesPerRow(const size_t xsize, const size_t sizeof_t) {
  }

  // Round up to vector and cache line size.
  const size_t align = std::max<size_t>(vec_size, HWY_ALIGNMENT);
  const size_t align = HWY_MAX(vec_size, HWY_ALIGNMENT);
  size_t bytes_per_row = RoundUpTo(valid_bytes, align);

  // During the lengthy window before writes are committed to memory, CPUs

@@ -17,6 +17,7 @@
// SIMD/multicore-friendly planar image representation with row accessors.

#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

@@ -102,7 +103,7 @@ struct ImageBase {
#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
    defined(THREAD_SANITIZER)
    if (y >= ysize_) {
      HWY_ABORT("Row(%zu) >= %u\n", y, ysize_);
      HWY_ABORT("Row(%" PRIu64 ") >= %u\n", static_cast<uint64_t>(y), ysize_);
    }
#endif

@@ -221,9 +222,14 @@ class Image3 {
  Image3(ImageT&& plane0, ImageT&& plane1, ImageT&& plane2) {
    if (!SameSize(plane0, plane1) || !SameSize(plane0, plane2)) {
      HWY_ABORT("Not same size: %zu x %zu, %zu x %zu, %zu x %zu\n",
                plane0.xsize(), plane0.ysize(), plane1.xsize(), plane1.ysize(),
                plane2.xsize(), plane2.ysize());
      HWY_ABORT("Not same size: %" PRIu64 " x %" PRIu64 ", %" PRIu64
                " x %" PRIu64 ", %" PRIu64 " x %" PRIu64 "\n",
                static_cast<uint64_t>(plane0.xsize()),
                static_cast<uint64_t>(plane0.ysize()),
                static_cast<uint64_t>(plane1.xsize()),
                static_cast<uint64_t>(plane1.ysize()),
                static_cast<uint64_t>(plane2.xsize()),
                static_cast<uint64_t>(plane2.ysize()));
    }
    planes_[0] = std::move(plane0);
    planes_[1] = std::move(plane1);

@@ -288,7 +294,9 @@ class Image3 {
#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
    defined(THREAD_SANITIZER)
    if (c >= kNumPlanes || y >= ysize()) {
      HWY_ABORT("PlaneRow(%zu, %zu) >= %zu\n", c, y, ysize());
      HWY_ABORT("PlaneRow(%" PRIu64 ", %" PRIu64 ") >= %" PRIu64 "\n",
                static_cast<uint64_t>(c), static_cast<uint64_t>(y),
                static_cast<uint64_t>(ysize()));
    }
#endif
    // Use the first plane's stride because the compiler might not realize they

@@ -42,7 +42,7 @@ struct TestAlignedT {
  void operator()(T /*unused*/) const {
    std::mt19937 rng(129);
    std::uniform_int_distribution<int> dist(0, 16);
    const HWY_FULL(T) d;
    const ScalableTag<T> d;

    for (size_t ysize = 1; ysize < 4; ++ysize) {
      for (size_t xsize = 1; xsize < 64; ++xsize) {

@@ -73,7 +73,7 @@ struct TestUnalignedT {
  void operator()(T /*unused*/) const {
    std::mt19937 rng(129);
    std::uniform_int_distribution<int> dist(0, 3);
    const HWY_FULL(T) d;
    const ScalableTag<T> d;

    for (size_t ysize = 1; ysize < 4; ++ysize) {
      for (size_t xsize = 1; xsize < 128; ++xsize) {

@@ -87,7 +87,7 @@ struct TestUnalignedT {
    for (size_t y = 0; y < ysize; ++y) {
      T* HWY_RESTRICT row = img.MutableRow(y);
      for (size_t x = 0; x < xsize; ++x) {
        row[x] = 1 << dist(rng);
        row[x] = static_cast<T>(1u << dist(rng));
      }
    }

@@ -96,7 +96,7 @@ struct TestUnalignedT {
    for (size_t y = 0; y < ysize; ++y) {
      T* HWY_RESTRICT row = img.MutableRow(y);
      for (size_t x = 0; x < xsize; ++x) {
        accum |= LoadU(d, row + x);
        accum = Or(accum, LoadU(d, row + x));
      }
    }

@@ -143,9 +143,17 @@ void TestUnaligned() { ForUnsignedTypes(TestUnalignedT()); }
HWY_AFTER_NAMESPACE();

#if HWY_ONCE

namespace hwy {
HWY_BEFORE_TEST(ImageTest);
HWY_EXPORT_AND_TEST_P(ImageTest, TestAligned);
HWY_EXPORT_AND_TEST_P(ImageTest, TestUnaligned);
}  // namespace hwy

// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}

#endif

@@ -38,7 +38,7 @@ namespace HWY_NAMESPACE {
template <class D, class V>
HWY_INLINE V Acos(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallAcos(const D d, V x) {
HWY_NOINLINE V CallAcos(const D d, VecArg<V> x) {
  return Acos(d, x);
}

@@ -53,7 +53,7 @@ HWY_NOINLINE V CallAcos(const D d, V x) {
template <class D, class V>
HWY_INLINE V Acosh(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallAcosh(const D d, V x) {
HWY_NOINLINE V CallAcosh(const D d, VecArg<V> x) {
  return Acosh(d, x);
}

@@ -68,7 +68,7 @@ HWY_NOINLINE V CallAcosh(const D d, V x) {
template <class D, class V>
HWY_INLINE V Asin(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallAsin(const D d, V x) {
HWY_NOINLINE V CallAsin(const D d, VecArg<V> x) {
  return Asin(d, x);
}

@@ -83,7 +83,7 @@ HWY_NOINLINE V CallAsin(const D d, V x) {
template <class D, class V>
HWY_INLINE V Asinh(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallAsinh(const D d, V x) {
HWY_NOINLINE V CallAsinh(const D d, VecArg<V> x) {
  return Asinh(d, x);
}

@@ -98,7 +98,7 @@ HWY_NOINLINE V CallAsinh(const D d, V x) {
template <class D, class V>
HWY_INLINE V Atan(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallAtan(const D d, V x) {
HWY_NOINLINE V CallAtan(const D d, VecArg<V> x) {
  return Atan(d, x);
}

@@ -113,7 +113,7 @@ HWY_NOINLINE V CallAtan(const D d, V x) {
template <class D, class V>
HWY_INLINE V Atanh(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallAtanh(const D d, V x) {
HWY_NOINLINE V CallAtanh(const D d, VecArg<V> x) {
  return Atanh(d, x);
}

@@ -128,7 +128,7 @@ HWY_NOINLINE V CallAtanh(const D d, V x) {
template <class D, class V>
HWY_INLINE V Cos(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallCos(const D d, V x) {
HWY_NOINLINE V CallCos(const D d, VecArg<V> x) {
  return Cos(d, x);
}

@@ -143,7 +143,7 @@ HWY_NOINLINE V CallCos(const D d, V x) {
template <class D, class V>
HWY_INLINE V Exp(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallExp(const D d, V x) {
HWY_NOINLINE V CallExp(const D d, VecArg<V> x) {
  return Exp(d, x);
}

@@ -158,7 +158,7 @@ HWY_NOINLINE V CallExp(const D d, V x) {
template <class D, class V>
HWY_INLINE V Expm1(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallExpm1(const D d, V x) {
HWY_NOINLINE V CallExpm1(const D d, VecArg<V> x) {
  return Expm1(d, x);
}

@@ -173,7 +173,7 @@ HWY_NOINLINE V CallExpm1(const D d, V x) {
template <class D, class V>
HWY_INLINE V Log(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallLog(const D d, V x) {
HWY_NOINLINE V CallLog(const D d, VecArg<V> x) {
  return Log(d, x);
}

@@ -188,7 +188,7 @@ HWY_NOINLINE V CallLog(const D d, V x) {
template <class D, class V>
HWY_INLINE V Log10(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallLog10(const D d, V x) {
HWY_NOINLINE V CallLog10(const D d, VecArg<V> x) {
  return Log10(d, x);
}

@@ -203,7 +203,7 @@ HWY_NOINLINE V CallLog10(const D d, V x) {
template <class D, class V>
HWY_INLINE V Log1p(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallLog1p(const D d, V x) {
HWY_NOINLINE V CallLog1p(const D d, VecArg<V> x) {
  return Log1p(d, x);
}

@@ -218,7 +218,7 @@ HWY_NOINLINE V CallLog1p(const D d, V x) {
template <class D, class V>
HWY_INLINE V Log2(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallLog2(const D d, V x) {
HWY_NOINLINE V CallLog2(const D d, VecArg<V> x) {
  return Log2(d, x);
}

@@ -233,7 +233,7 @@ HWY_NOINLINE V CallLog2(const D d, V x) {
template <class D, class V>
HWY_INLINE V Sin(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallSin(const D d, V x) {
HWY_NOINLINE V CallSin(const D d, VecArg<V> x) {
  return Sin(d, x);
}

@@ -248,7 +248,7 @@ HWY_NOINLINE V CallSin(const D d, V x) {
template <class D, class V>
HWY_INLINE V Sinh(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallSinh(const D d, V x) {
HWY_NOINLINE V CallSinh(const D d, VecArg<V> x) {
  return Sinh(d, x);
}

@@ -263,7 +263,7 @@ HWY_NOINLINE V CallSinh(const D d, V x) {
template <class D, class V>
HWY_INLINE V Tanh(const D d, V x);
template <class D, class V>
HWY_NOINLINE V CallTanh(const D d, V x) {
HWY_NOINLINE V CallTanh(const D d, VecArg<V> x) {
  return Tanh(d, x);
}

@@ -282,43 +282,49 @@ HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1) {
}
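
// Estrin's scheme pairs terms so the MulAdds can run in parallel: for example,
// c0 + c1*x + c2*x^2 + c3*x^3 is evaluated as (c0 + c1*x) + x2*(c2 + c3*x)
// with x2 = x*x, roughly halving the dependency chain relative to Horner.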
template <class T>
HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2) {
  T x2(x * x);
  T x2 = Mul(x, x);
  return MulAdd(x2, c2, MulAdd(c1, x, c0));
}
template <class T>
HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3) {
  T x2(x * x);
  T x2 = Mul(x, x);
  return MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0));
}
template <class T>
HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4) {
  T x2(x * x), x4(x2 * x2);
  T x2 = Mul(x, x);
  T x4 = Mul(x2, x2);
  return MulAdd(x4, c4, MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)));
}
template <class T>
HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5) {
  T x2(x * x), x4(x2 * x2);
  T x2 = Mul(x, x);
  T x4 = Mul(x2, x2);
  return MulAdd(x4, MulAdd(c5, x, c4),
                MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)));
}
template <class T>
HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
                                     T c6) {
  T x2(x * x), x4(x2 * x2);
  T x2 = Mul(x, x);
  T x4 = Mul(x2, x2);
  return MulAdd(x4, MulAdd(x2, c6, MulAdd(c5, x, c4)),
                MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)));
}
template <class T>
HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
                                     T c6, T c7) {
  T x2(x * x), x4(x2 * x2);
  T x2 = Mul(x, x);
  T x4 = Mul(x2, x2);
  return MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
                MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)));
}
template <class T>
HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
                                     T c6, T c7, T c8) {
  T x2(x * x), x4(x2 * x2), x8(x4 * x4);
  T x2 = Mul(x, x);
  T x4 = Mul(x2, x2);
  T x8 = Mul(x4, x4);
  return MulAdd(x8, c8,
                MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
                       MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));

@@ -326,7 +332,9 @@ HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
template <class T>
HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
                                     T c6, T c7, T c8, T c9) {
  T x2(x * x), x4(x2 * x2), x8(x4 * x4);
  T x2 = Mul(x, x);
  T x4 = Mul(x2, x2);
  T x8 = Mul(x4, x4);
  return MulAdd(x8, MulAdd(c9, x, c8),
                MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
                       MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));

@@ -334,7 +342,9 @@ HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
template <class T>
HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
                                     T c6, T c7, T c8, T c9, T c10) {
  T x2(x * x), x4(x2 * x2), x8(x4 * x4);
  T x2 = Mul(x, x);
  T x4 = Mul(x2, x2);
  T x8 = Mul(x4, x4);
  return MulAdd(x8, MulAdd(x2, c10, MulAdd(c9, x, c8)),
                MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
                       MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));

@@ -342,7 +352,9 @@ HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
template <class T>
HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
                                     T c6, T c7, T c8, T c9, T c10, T c11) {
  T x2(x * x), x4(x2 * x2), x8(x4 * x4);
  T x2 = Mul(x, x);
  T x4 = Mul(x2, x2);
  T x8 = Mul(x4, x4);
  return MulAdd(x8, MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8)),
                MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
                       MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));

@@ -351,7 +363,9 @@ template <class T>
HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
                                     T c6, T c7, T c8, T c9, T c10, T c11,
                                     T c12) {
  T x2(x * x), x4(x2 * x2), x8(x4 * x4);
  T x2 = Mul(x, x);
  T x4 = Mul(x2, x2);
  T x8 = Mul(x4, x4);
  return MulAdd(
      x8, MulAdd(x4, c12, MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
      MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),

@@ -361,7 +375,9 @@ template <class T>
HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
                                     T c6, T c7, T c8, T c9, T c10, T c11,
                                     T c12, T c13) {
  T x2(x * x), x4(x2 * x2), x8(x4 * x4);
  T x2 = Mul(x, x);
  T x4 = Mul(x2, x2);
  T x8 = Mul(x4, x4);
  return MulAdd(x8,
                MulAdd(x4, MulAdd(c13, x, c12),
                       MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),

@@ -372,7 +388,9 @@ template <class T>
HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
                                     T c6, T c7, T c8, T c9, T c10, T c11,
                                     T c12, T c13, T c14) {
  T x2(x * x), x4(x2 * x2), x8(x4 * x4);
  T x2 = Mul(x, x);
  T x4 = Mul(x2, x2);
  T x8 = Mul(x4, x4);
  return MulAdd(x8,
                MulAdd(x4, MulAdd(x2, c14, MulAdd(c13, x, c12)),
                       MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),

@@ -383,7 +401,9 @@ template <class T>
HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
                                     T c6, T c7, T c8, T c9, T c10, T c11,
                                     T c12, T c13, T c14, T c15) {
  T x2(x * x), x4(x2 * x2), x8(x4 * x4);
  T x2 = Mul(x, x);
  T x4 = Mul(x2, x2);
  T x8 = Mul(x4, x4);
  return MulAdd(x8,
                MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)),
                       MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),

@@ -394,7 +414,10 @@ template <class T>
HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
                                     T c6, T c7, T c8, T c9, T c10, T c11,
                                     T c12, T c13, T c14, T c15, T c16) {
  T x2(x * x), x4(x2 * x2), x8(x4 * x4), x16(x8 * x8);
  T x2 = Mul(x, x);
  T x4 = Mul(x2, x2);
  T x8 = Mul(x4, x4);
  T x16 = Mul(x8, x8);
  return MulAdd(
      x16, c16,
      MulAdd(x8,

@@ -407,7 +430,10 @@ template <class T>
HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
                                     T c6, T c7, T c8, T c9, T c10, T c11,
                                     T c12, T c13, T c14, T c15, T c16, T c17) {
  T x2(x * x), x4(x2 * x2), x8(x4 * x4), x16(x8 * x8);
  T x2 = Mul(x, x);
  T x4 = Mul(x2, x2);
  T x8 = Mul(x4, x4);
  T x16 = Mul(x8, x8);
  return MulAdd(
      x16, MulAdd(c17, x, c16),
      MulAdd(x8,

@@ -421,7 +447,10 @@ HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
                                     T c6, T c7, T c8, T c9, T c10, T c11,
                                     T c12, T c13, T c14, T c15, T c16, T c17,
                                     T c18) {
  T x2(x * x), x4(x2 * x2), x8(x4 * x4), x16(x8 * x8);
  T x2 = Mul(x, x);
  T x4 = Mul(x2, x2);
  T x8 = Mul(x4, x4);
  T x16 = Mul(x8, x8);
  return MulAdd(
      x16, MulAdd(x2, c18, MulAdd(c17, x, c16)),
      MulAdd(x8,

@@ -497,8 +526,8 @@ struct AtanImpl<float> {
    const auto k6 = Set(d, -0.0159569028764963150024414f);
    const auto k7 = Set(d, +0.00282363896258175373077393f);

    const auto y = (x * x);
    return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7), (y * x), x);
    const auto y = Mul(x, x);
    return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7), Mul(y, x), x);
  }
};

@@ -529,10 +558,10 @@ struct AtanImpl<double> {
    const auto k17 = Set(d, +0.000209850076645816976906797);
    const auto k18 = Set(d, -1.88796008463073496563746e-5);

    const auto y = (x * x);
    const auto y = Mul(x, x);
    return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11,
                         k12, k13, k14, k15, k16, k17, k18),
                  (y * x), x);
                  Mul(y, x), x);
  }
};

@@ -553,8 +582,8 @@ struct CosSinImpl<float> {
    const auto k2 = Set(d, -1.981069071916863322258e-4f);
    const auto k3 = Set(d, +2.6083159809786593541503e-6f);

    const auto y(x * x);
    return MulAdd(Estrin(y, k0, k1, k2, k3), (y * x), x);
    const auto y = Mul(x, x);
    return MulAdd(Estrin(y, k0, k1, k2, k3), Mul(y, x), x);
  }

  template <class D, class V, class VI32>

@@ -628,8 +657,8 @@ struct CosSinImpl<double> {
    const auto k7 = Set(d, +2.81009972710863200091251e-15);
    const auto k8 = Set(d, -7.97255955009037868891952e-18);

    const auto y(x * x);
    return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7, k8), (y * x), x);
    const auto y = Mul(x, x);
    return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7, k8), Mul(y, x), x);
  }

  template <class D, class V, class VI32>

@@ -702,7 +731,7 @@ struct ExpImpl<float> {
    const auto k4 = Set(d, +0.00139304355252534151077271f);
    const auto k5 = Set(d, +0.000198527617612853646278381f);

    return MulAdd(Estrin(x, k0, k1, k2, k3, k4, k5), (x * x), x);
    return MulAdd(Estrin(x, k0, k1, k2, k3, k4, k5), Mul(x, x), x);
  }

  // Computes 2^x, where x is an integer.

@@ -710,14 +739,14 @@ struct ExpImpl<float> {
  HWY_INLINE Vec<D> Pow2I(D d, VI32 x) {
    const Rebind<int32_t, D> di32;
    const VI32 kOffset = Set(di32, 0x7F);
    return BitCast(d, ShiftLeft<23>(x + kOffset));
    return BitCast(d, ShiftLeft<23>(Add(x, kOffset)));
  }
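
  // Bit trick above: for binary32, placing (x + 127) in the exponent field
  // (bits 23..30) with a zero mantissa yields exactly 2^x; e.g. x = 1 gives
  // bits 0x40000000 == 2.0f.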
|
||||
|
||||
// Sets the exponent of 'x' to 2^e.
|
||||
template <class D, class V, class VI32>
|
||||
HWY_INLINE V LoadExpShortRange(D d, V x, VI32 e) {
|
||||
const VI32 y = ShiftRight<1>(e);
|
||||
return x * Pow2I(d, y) * Pow2I(d, e - y);
|
||||
return Mul(Mul(x, Pow2I(d, y)), Pow2I(d, Sub(e, y)));
|
||||
}
|
||||
|
||||
template <class D, class V, class VI32>
|
||||
|
@ -740,7 +769,8 @@ struct LogImpl<float> {
|
|||
HWY_INLINE Vec<Rebind<int32_t, D>> Log2p1NoSubnormal(D /*d*/, V x) {
|
||||
const Rebind<int32_t, D> di32;
|
||||
const Rebind<uint32_t, D> du32;
|
||||
return BitCast(di32, ShiftRight<23>(BitCast(du32, x))) - Set(di32, 0x7F);
|
||||
const auto kBias = Set(di32, 0x7F);
|
||||
return Sub(BitCast(di32, ShiftRight<23>(BitCast(du32, x))), kBias);
|
||||
}
|
||||
|
||||
// Approximates Log(x) over the range [sqrt(2) / 2, sqrt(2)].
|
||||
|
@@ -751,9 +781,9 @@ struct LogImpl<float> {
     const V k2 = Set(d, 0.28498786688f);
     const V k3 = Set(d, 0.24279078841f);

-    const V x2 = (x * x);
-    const V x4 = (x2 * x2);
-    return MulAdd(MulAdd(k2, x4, k0), x2, (MulAdd(k3, x4, k1) * x4));
+    const V x2 = Mul(x, x);
+    const V x4 = Mul(x2, x2);
+    return MulAdd(MulAdd(k2, x4, k0), x2, Mul(MulAdd(k3, x4, k1), x4));
   }
 };

@@ -781,7 +811,7 @@ struct ExpImpl<double> {
     const auto k10 = Set(d, +2.08860621107283687536341e-9);

     return MulAdd(Estrin(x, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10),
-                  (x * x), x);
+                  Mul(x, x), x);
   }

   // Computes 2^x, where x is an integer.

@@ -790,14 +820,14 @@ struct ExpImpl<double> {
     const Rebind<int32_t, D> di32;
     const Rebind<int64_t, D> di64;
     const VI32 kOffset = Set(di32, 0x3FF);
-    return BitCast(d, ShiftLeft<52>(PromoteTo(di64, x + kOffset)));
+    return BitCast(d, ShiftLeft<52>(PromoteTo(di64, Add(x, kOffset))));
   }

   // Sets the exponent of 'x' to 2^e.
   template <class D, class V, class VI32>
   HWY_INLINE V LoadExpShortRange(D d, V x, VI32 e) {
     const VI32 y = ShiftRight<1>(e);
-    return (x * Pow2I(d, y) * Pow2I(d, e - y));
+    return Mul(Mul(x, Pow2I(d, y)), Pow2I(d, Sub(e, y)));
   }

   template <class D, class V, class VI32>

@@ -820,7 +850,8 @@ struct LogImpl<double> {
   HWY_INLINE Vec<Rebind<int64_t, D>> Log2p1NoSubnormal(D /*d*/, V x) {
     const Rebind<int64_t, D> di64;
     const Rebind<uint64_t, D> du64;
-    return BitCast(di64, ShiftRight<52>(BitCast(du64, x))) - Set(di64, 0x3FF);
+    return Sub(BitCast(di64, ShiftRight<52>(BitCast(du64, x))),
+               Set(di64, 0x3FF));
   }

   // Approximates Log(x) over the range [sqrt(2) / 2, sqrt(2)].

@@ -834,10 +865,10 @@ struct LogImpl<double> {
     const V k5 = Set(d, 0.1531383769920937332);
     const V k6 = Set(d, 0.1479819860511658591);

-    const V x2 = (x * x);
-    const V x4 = (x2 * x2);
+    const V x2 = Mul(x, x);
+    const V x4 = Mul(x2, x2);
     return MulAdd(MulAdd(MulAdd(MulAdd(k6, x4, k4), x4, k2), x4, k0), x2,
-                  (MulAdd(MulAdd(k5, x4, k3), x4, k1) * x4));
+                  (Mul(MulAdd(MulAdd(k5, x4, k3), x4, k1), x4)));
   }
 };

@@ -846,200 +877,215 @@ struct LogImpl<double> {
 template <class D, class V, bool kAllowSubnormals = true>
 HWY_INLINE V Log(const D d, V x) {
   // http://git.musl-libc.org/cgit/musl/tree/src/math/log.c for more info.
-  using LaneType = LaneType<V>;
-  impl::LogImpl<LaneType> impl;
+  using T = TFromD<D>;
+  impl::LogImpl<T> impl;

-  // clang-format off
-  constexpr bool kIsF32 = (sizeof(LaneType) == 4);
+  constexpr bool kIsF32 = (sizeof(T) == 4);

   // Float Constants
-  const V kLn2Hi = Set(d, (kIsF32 ? 0.69313812256f :
-                                    0.693147180369123816490 ));
-  const V kLn2Lo = Set(d, (kIsF32 ? 9.0580006145e-6f :
-                                    1.90821492927058770002e-10));
-  const V kOne = Set(d, +1.0);
-  const V kMinNormal = Set(d, (kIsF32 ? 1.175494351e-38f :
-                                        2.2250738585072014e-308 ));
-  const V kScale = Set(d, (kIsF32 ? 3.355443200e+7f :
-                                    1.8014398509481984e+16 ));
+  const V kLn2Hi = Set(d, kIsF32 ? static_cast<T>(0.69313812256f)
+                                 : static_cast<T>(0.693147180369123816490));
+  const V kLn2Lo = Set(d, kIsF32 ? static_cast<T>(9.0580006145e-6f)
+                                 : static_cast<T>(1.90821492927058770002e-10));
+  const V kOne = Set(d, static_cast<T>(+1.0));
+  const V kMinNormal = Set(d, kIsF32 ? static_cast<T>(1.175494351e-38f)
+                                     : static_cast<T>(2.2250738585072014e-308));
+  const V kScale = Set(d, kIsF32 ? static_cast<T>(3.355443200e+7f)
+                                 : static_cast<T>(1.8014398509481984e+16));

   // Integer Constants
-  const Rebind<MakeSigned<LaneType>, D> di;
+  using TI = MakeSigned<T>;
+  const Rebind<TI, D> di;
   using VI = decltype(Zero(di));
-  const VI kLowerBits = Set(di, (kIsF32 ? 0x00000000L : 0xFFFFFFFFLL));
-  const VI kMagic = Set(di, (kIsF32 ? 0x3F3504F3L : 0x3FE6A09E00000000LL));
-  const VI kExpMask = Set(di, (kIsF32 ? 0x3F800000L : 0x3FF0000000000000LL));
-  const VI kExpScale = Set(di, (kIsF32 ? -25 : -54));
-  const VI kManMask = Set(di, (kIsF32 ? 0x7FFFFFL : 0xFFFFF00000000LL));
-  // clang-format on
+  const VI kLowerBits = Set(di, kIsF32 ? static_cast<TI>(0x00000000L)
+                                       : static_cast<TI>(0xFFFFFFFFLL));
+  const VI kMagic = Set(di, kIsF32 ? static_cast<TI>(0x3F3504F3L)
+                                   : static_cast<TI>(0x3FE6A09E00000000LL));
+  const VI kExpMask = Set(di, kIsF32 ? static_cast<TI>(0x3F800000L)
+                                     : static_cast<TI>(0x3FF0000000000000LL));
+  const VI kExpScale =
+      Set(di, kIsF32 ? static_cast<TI>(-25) : static_cast<TI>(-54));
+  const VI kManMask = Set(di, kIsF32 ? static_cast<TI>(0x7FFFFFL)
+                                     : static_cast<TI>(0xFFFFF00000000LL));

   // Scale up 'x' so that it is no longer denormalized.
   VI exp_bits;
   V exp;
   if (kAllowSubnormals == true) {
-    const auto is_denormal = (x < kMinNormal);
-    x = IfThenElse(is_denormal, (x * kScale), x);
+    const auto is_denormal = Lt(x, kMinNormal);
+    x = IfThenElse(is_denormal, Mul(x, kScale), x);

     // Compute the new exponent.
-    exp_bits = (BitCast(di, x) + (kExpMask - kMagic));
+    exp_bits = Add(BitCast(di, x), Sub(kExpMask, kMagic));
     const VI exp_scale =
         BitCast(di, IfThenElseZero(is_denormal, BitCast(d, kExpScale)));
     exp = ConvertTo(
-        d, exp_scale + impl.Log2p1NoSubnormal(d, BitCast(d, exp_bits)));
+        d, Add(exp_scale, impl.Log2p1NoSubnormal(d, BitCast(d, exp_bits))));
   } else {
     // Compute the new exponent.
-    exp_bits = (BitCast(di, x) + (kExpMask - kMagic));
+    exp_bits = Add(BitCast(di, x), Sub(kExpMask, kMagic));
     exp = ConvertTo(d, impl.Log2p1NoSubnormal(d, BitCast(d, exp_bits)));
   }

   // Renormalize.
   const V y = Or(And(x, BitCast(d, kLowerBits)),
-                 BitCast(d, ((exp_bits & kManMask) + kMagic)));
+                 BitCast(d, Add(And(exp_bits, kManMask), kMagic)));

   // Approximate and reconstruct.
-  const V ym1 = (y - kOne);
-  const V z = (ym1 / (y + kOne));
+  const V ym1 = Sub(y, kOne);
+  const V z = Div(ym1, Add(y, kOne));

-  return MulSub(exp, kLn2Hi,
-                (MulSub(z, (ym1 - impl.LogPoly(d, z)), (exp * kLn2Lo)) - ym1));
+  return MulSub(
+      exp, kLn2Hi,
+      Sub(MulSub(z, Sub(ym1, impl.LogPoly(d, z)), Mul(exp, kLn2Lo)), ym1));
 }

 }  // namespace impl
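The musl-derived Log above splits x into 2^e * m with the kMagic bias centering m in [sqrt(2)/2, sqrt(2)), then evaluates an atanh-style series in z = (m - 1)/(m + 1), so log(x) = e*ln2 + 2*atanh(z). A scalar sketch of the decomposition, assuming a normal positive binary64 input and using a short Taylor stub where the library uses tuned polynomial coefficients:

#include <cstdint>
#include <cstring>

double LogSketch(double x) {  // x must be normal and positive
  uint64_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  // Bias the exponent split so the mantissa m lands in [sqrt(2)/2, sqrt(2)).
  bits += 0x3FF0000000000000ull - 0x3FE6A09E00000000ull;  // kExpMask - kMagic
  const int64_t e = static_cast<int64_t>(bits >> 52) - 0x3FF;
  bits = (bits & 0x000FFFFFFFFFFFFFull) + 0x3FE6A09E00000000ull;  // rebuild m
  double m;
  std::memcpy(&m, &bits, sizeof(m));
  // log(m) = 2*atanh(z) with z = (m-1)/(m+1); three series terms as a stub.
  const double z = (m - 1.0) / (m + 1.0);
  const double z2 = z * z;
  const double log_m = 2.0 * z * (1.0 + z2 / 3.0 + z2 * z2 / 5.0);
  return static_cast<double>(e) * 0.6931471805599453 + log_m;
}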
 template <class D, class V>
 HWY_INLINE V Acos(const D d, V x) {
-  using LaneType = LaneType<V>;
+  using T = TFromD<D>;

   const V kZero = Zero(d);
-  const V kHalf = Set(d, +0.5);
-  const V kOne = Set(d, +1.0);
-  const V kTwo = Set(d, +2.0);
-  const V kPi = Set(d, +3.14159265358979323846264);
-  const V kPiOverTwo = Set(d, +1.57079632679489661923132169);
+  const V kHalf = Set(d, static_cast<T>(+0.5));
+  const V kPi = Set(d, static_cast<T>(+3.14159265358979323846264));
+  const V kPiOverTwo = Set(d, static_cast<T>(+1.57079632679489661923132169));

   const V sign_x = And(SignBit(d), x);
   const V abs_x = Xor(x, sign_x);
-  const auto mask = (abs_x < kHalf);
-  const V yy = IfThenElse(mask, (abs_x * abs_x), ((kOne - abs_x) * kHalf));
+  const auto mask = Lt(abs_x, kHalf);
+  const V yy =
+      IfThenElse(mask, Mul(abs_x, abs_x), NegMulAdd(abs_x, kHalf, kHalf));
   const V y = IfThenElse(mask, abs_x, Sqrt(yy));

-  impl::AsinImpl<LaneType> impl;
-  const V t = (impl.AsinPoly(d, yy, y) * (y * yy));
-  const V z = IfThenElse(mask, (kPiOverTwo - (Xor(y, sign_x) + Xor(t, sign_x))),
-                         ((t + y) * kTwo));
-  return IfThenElse(Or(mask, (x >= kZero)), z, (kPi - z));
+  impl::AsinImpl<T> impl;
+  const V t = Mul(impl.AsinPoly(d, yy, y), Mul(y, yy));
+
+  const V t_plus_y = Add(t, y);
+  const V z =
+      IfThenElse(mask, Sub(kPiOverTwo, Add(Xor(y, sign_x), Xor(t, sign_x))),
+                 Add(t_plus_y, t_plus_y));
+  return IfThenElse(Or(mask, Ge(x, kZero)), z, Sub(kPi, z));
 }

 template <class D, class V>
 HWY_INLINE V Acosh(const D d, V x) {
-  const V kLarge = Set(d, 268435456.0);
-  const V kLog2 = Set(d, 0.693147180559945286227);
-  const V kOne = Set(d, +1.0);
-  const V kTwo = Set(d, +2.0);
+  using T = TFromD<D>;

-  const auto is_x_large = (x > kLarge);
-  const auto is_x_gt_2 = (x > kTwo);
+  const V kLarge = Set(d, static_cast<T>(268435456.0));
+  const V kLog2 = Set(d, static_cast<T>(0.693147180559945286227));
+  const V kOne = Set(d, static_cast<T>(+1.0));
+  const V kTwo = Set(d, static_cast<T>(+2.0));

-  const V x_minus_1 = (x - kOne);
-  const V y0 = MulSub(kTwo, x, (kOne / (Sqrt(MulSub(x, x, kOne)) + x)));
+  const auto is_x_large = Gt(x, kLarge);
+  const auto is_x_gt_2 = Gt(x, kTwo);
+
+  const V x_minus_1 = Sub(x, kOne);
+  const V y0 = MulSub(kTwo, x, Div(kOne, Add(Sqrt(MulSub(x, x, kOne)), x)));
   const V y1 =
-      (Sqrt(MulAdd(x_minus_1, kTwo, (x_minus_1 * x_minus_1))) + x_minus_1);
+      Add(Sqrt(MulAdd(x_minus_1, kTwo, Mul(x_minus_1, x_minus_1))), x_minus_1);
   const V y2 =
-      IfThenElse(is_x_gt_2, IfThenElse(is_x_large, x, y0), (y1 + kOne));
+      IfThenElse(is_x_gt_2, IfThenElse(is_x_large, x, y0), Add(y1, kOne));
   const V z = impl::Log<D, V, /*kAllowSubnormals=*/false>(d, y2);

-  const auto is_pole = y2 == kOne;
-  const auto divisor = IfThenZeroElse(is_pole, y2) - kOne;
-  return IfThenElse(is_x_gt_2, z, IfThenElse(is_pole, y1, z * y1 / divisor)) +
-         IfThenElseZero(is_x_large, kLog2);
+  const auto is_pole = Eq(y2, kOne);
+  const auto divisor = Sub(IfThenZeroElse(is_pole, y2), kOne);
+  return Add(IfThenElse(is_x_gt_2, z,
+                        IfThenElse(is_pole, y1, Div(Mul(z, y1), divisor))),
+             IfThenElseZero(is_x_large, kLog2));
 }
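Acosh evaluates log(x + sqrt(x^2 - 1)) with three branches: for very large x the sum is effectively 2x, so it adds kLog2 to Log(x); for x > 2 it rewrites the argument so that x^2 never overflows; near x = 1 it switches to a log1p form to avoid cancellation. A scalar sketch of the same branch structure, with std::log1p standing in for the vector Log1p:

#include <cmath>

double AcoshSketch(double x) {  // requires x >= 1
  const double kLarge = 268435456.0;  // 2^28
  if (x > kLarge) {
    return std::log(x) + 0.693147180559945286227;  // log(2x) = log(x) + log(2)
  }
  if (x > 2.0) {
    // 2x - 1/(x + sqrt(x^2 - 1)) equals x + sqrt(x^2 - 1) exactly.
    return std::log(2.0 * x - 1.0 / (x + std::sqrt(x * x - 1.0)));
  }
  const double t = x - 1.0;  // near the pole at x == 1
  return std::log1p(t + std::sqrt(2.0 * t + t * t));
}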
 template <class D, class V>
 HWY_INLINE V Asin(const D d, V x) {
-  using LaneType = LaneType<V>;
+  using T = TFromD<D>;

-  const V kHalf = Set(d, +0.5);
-  const V kOne = Set(d, +1.0);
-  const V kTwo = Set(d, +2.0);
-  const V kPiOverTwo = Set(d, +1.57079632679489661923132169);
+  const V kHalf = Set(d, static_cast<T>(+0.5));
+  const V kTwo = Set(d, static_cast<T>(+2.0));
+  const V kPiOverTwo = Set(d, static_cast<T>(+1.57079632679489661923132169));

   const V sign_x = And(SignBit(d), x);
   const V abs_x = Xor(x, sign_x);
-  const auto mask = (abs_x < kHalf);
-  const V yy = IfThenElse(mask, (abs_x * abs_x), (kOne - abs_x) * kHalf);
+  const auto mask = Lt(abs_x, kHalf);
+  const V yy =
+      IfThenElse(mask, Mul(abs_x, abs_x), NegMulAdd(abs_x, kHalf, kHalf));
   const V y = IfThenElse(mask, abs_x, Sqrt(yy));

-  impl::AsinImpl<LaneType> impl;
-  const V z0 = MulAdd(impl.AsinPoly(d, yy, y), (yy * y), y);
-  const V z1 = (kPiOverTwo - (z0 * kTwo));
+  impl::AsinImpl<T> impl;
+  const V z0 = MulAdd(impl.AsinPoly(d, yy, y), Mul(yy, y), y);
+  const V z1 = NegMulAdd(z0, kTwo, kPiOverTwo);
   return Or(IfThenElse(mask, z0, z1), sign_x);
 }

 template <class D, class V>
 HWY_INLINE V Asinh(const D d, V x) {
-  const V kSmall = Set(d, 1.0 / 268435456.0);
-  const V kLarge = Set(d, 268435456.0);
-  const V kLog2 = Set(d, 0.693147180559945286227);
-  const V kOne = Set(d, +1.0);
-  const V kTwo = Set(d, +2.0);
+  using T = TFromD<D>;
+
+  const V kSmall = Set(d, static_cast<T>(1.0 / 268435456.0));
+  const V kLarge = Set(d, static_cast<T>(268435456.0));
+  const V kLog2 = Set(d, static_cast<T>(0.693147180559945286227));
+  const V kOne = Set(d, static_cast<T>(+1.0));
+  const V kTwo = Set(d, static_cast<T>(+2.0));

   const V sign_x = And(SignBit(d), x);  // Extract the sign bit
   const V abs_x = Xor(x, sign_x);

-  const auto is_x_large = (abs_x > kLarge);
-  const auto is_x_lt_2 = (abs_x < kTwo);
+  const auto is_x_large = Gt(abs_x, kLarge);
+  const auto is_x_lt_2 = Lt(abs_x, kTwo);

-  const V x2 = (x * x);
-  const V sqrt_x2_plus_1 = Sqrt(x2 + kOne);
+  const V x2 = Mul(x, x);
+  const V sqrt_x2_plus_1 = Sqrt(Add(x2, kOne));

-  const V y0 = MulAdd(abs_x, kTwo, (kOne / (sqrt_x2_plus_1 + abs_x)));
-  const V y1 = ((x2 / (sqrt_x2_plus_1 + kOne)) + abs_x);
+  const V y0 = MulAdd(abs_x, kTwo, Div(kOne, Add(sqrt_x2_plus_1, abs_x)));
+  const V y1 = Add(Div(x2, Add(sqrt_x2_plus_1, kOne)), abs_x);
   const V y2 =
-      IfThenElse(is_x_lt_2, (y1 + kOne), IfThenElse(is_x_large, abs_x, y0));
+      IfThenElse(is_x_lt_2, Add(y1, kOne), IfThenElse(is_x_large, abs_x, y0));
   const V z = impl::Log<D, V, /*kAllowSubnormals=*/false>(d, y2);

-  const auto is_pole = y2 == kOne;
-  const auto divisor = IfThenZeroElse(is_pole, y2) - kOne;
-  const auto large = IfThenElse(is_pole, y1, z * y1 / divisor);
-  const V y = IfThenElse(abs_x < kSmall, x, large);
-  return Or((IfThenElse(is_x_lt_2, y, z) + IfThenElseZero(is_x_large, kLog2)),
+  const auto is_pole = Eq(y2, kOne);
+  const auto divisor = Sub(IfThenZeroElse(is_pole, y2), kOne);
+  const auto large = IfThenElse(is_pole, y1, Div(Mul(z, y1), divisor));
+  const V y = IfThenElse(Lt(abs_x, kSmall), x, large);
+  return Or(Add(IfThenElse(is_x_lt_2, y, z), IfThenElseZero(is_x_large, kLog2)),
             sign_x);
 }

 template <class D, class V>
 HWY_INLINE V Atan(const D d, V x) {
-  using LaneType = LaneType<V>;
+  using T = TFromD<D>;

-  const V kOne = Set(d, +1.0);
-  const V kPiOverTwo = Set(d, +1.57079632679489661923132169);
+  const V kOne = Set(d, static_cast<T>(+1.0));
+  const V kPiOverTwo = Set(d, static_cast<T>(+1.57079632679489661923132169));

   const V sign = And(SignBit(d), x);
   const V abs_x = Xor(x, sign);
-  const auto mask = (abs_x > kOne);
+  const auto mask = Gt(abs_x, kOne);

-  impl::AtanImpl<LaneType> impl;
+  impl::AtanImpl<T> impl;
   const auto divisor = IfThenElse(mask, abs_x, kOne);
-  const V y = impl.AtanPoly(d, IfThenElse(mask, kOne / divisor, abs_x));
-  return Or(IfThenElse(mask, (kPiOverTwo - y), y), sign);
+  const V y = impl.AtanPoly(d, IfThenElse(mask, Div(kOne, divisor), abs_x));
+  return Or(IfThenElse(mask, Sub(kPiOverTwo, y), y), sign);
 }

 template <class D, class V>
 HWY_INLINE V Atanh(const D d, V x) {
-  const V kHalf = Set(d, +0.5);
-  const V kOne = Set(d, +1.0);
+  using T = TFromD<D>;
+
+  const V kHalf = Set(d, static_cast<T>(+0.5));
+  const V kOne = Set(d, static_cast<T>(+1.0));

   const V sign = And(SignBit(d), x);  // Extract the sign bit
   const V abs_x = Xor(x, sign);
-  return Log1p(d, ((abs_x + abs_x) / (kOne - abs_x))) * Xor(kHalf, sign);
+  return Mul(Log1p(d, Div(Add(abs_x, abs_x), Sub(kOne, abs_x))),
+             Xor(kHalf, sign));
 }
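Atanh relies on the identity atanh(x) = 0.5 * log((1+x)/(1-x)) = 0.5 * log1p(2x/(1-x)), with the sign bit folded back in via Xor(kHalf, sign). A scalar sketch of the same rearrangement:

#include <cmath>

double AtanhSketch(double x) {  // requires |x| < 1
  const double abs_x = std::fabs(x);
  const double half = std::copysign(0.5, x);  // plays the role of Xor(kHalf, sign)
  // (1+x)/(1-x) == 1 + 2x/(1-x), so log1p keeps precision for small |x|.
  return std::log1p((abs_x + abs_x) / (1.0 - abs_x)) * half;
}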
 template <class D, class V>
 HWY_INLINE V Cos(const D d, V x) {
-  using LaneType = LaneType<V>;
-  impl::CosSinImpl<LaneType> impl;
+  using T = TFromD<D>;
+  impl::CosSinImpl<T> impl;

   // Float Constants
-  const V kOneOverPi = Set(d, 0.31830988618379067153);
+  const V kOneOverPi = Set(d, static_cast<T>(0.31830988618379067153));

   // Integer Constants
   const Rebind<int32_t, D> di32;

@@ -1049,7 +1095,7 @@ HWY_INLINE V Cos(const D d, V x) {
   const V y = Abs(x);  // cos(x) == cos(|x|)

   // Compute the quadrant, q = int(|x| / pi) * 2 + 1
-  const VI32 q = (ShiftLeft<1>(impl.ToInt32(d, y * kOneOverPi)) + kOne);
+  const VI32 q = Add(ShiftLeft<1>(impl.ToInt32(d, Mul(y, kOneOverPi))), kOne);

   // Reduce range, apply sign, and approximate.
   return impl.Poly(

@@ -1058,17 +1104,16 @@ HWY_INLINE V Cos(const D d, V x) {

 template <class D, class V>
 HWY_INLINE V Exp(const D d, V x) {
-  using LaneType = LaneType<V>;
+  using T = TFromD<D>;

-  // clang-format off
-  const V kHalf = Set(d, +0.5);
-  const V kLowerBound = Set(d, (sizeof(LaneType) == 4 ? -104.0 : -1000.0));
-  const V kNegZero = Set(d, -0.0);
-  const V kOne = Set(d, +1.0);
-  const V kOneOverLog2 = Set(d, +1.442695040888963407359924681);
-  // clang-format on
+  const V kHalf = Set(d, static_cast<T>(+0.5));
+  const V kLowerBound =
+      Set(d, static_cast<T>((sizeof(T) == 4 ? -104.0 : -1000.0)));
+  const V kNegZero = Set(d, static_cast<T>(-0.0));
+  const V kOne = Set(d, static_cast<T>(+1.0));
+  const V kOneOverLog2 = Set(d, static_cast<T>(+1.442695040888963407359924681));

-  impl::ExpImpl<LaneType> impl;
+  impl::ExpImpl<T> impl;

   // q = static_cast<int32>((x / log(2)) + ((x < 0) ? -0.5 : +0.5))
   const auto q =

@@ -1076,25 +1121,24 @@ HWY_INLINE V Exp(const D d, V x) {

   // Reduce, approximate, and then reconstruct.
   const V y = impl.LoadExpShortRange(
-      d, (impl.ExpPoly(d, impl.ExpReduce(d, x, q)) + kOne), q);
-  return IfThenElseZero(x >= kLowerBound, y);
+      d, Add(impl.ExpPoly(d, impl.ExpReduce(d, x, q)), kOne), q);
+  return IfThenElseZero(Ge(x, kLowerBound), y);
 }

 template <class D, class V>
 HWY_INLINE V Expm1(const D d, V x) {
-  using LaneType = LaneType<V>;
+  using T = TFromD<D>;

-  // clang-format off
-  const V kHalf = Set(d, +0.5);
-  const V kLowerBound = Set(d, (sizeof(LaneType) == 4 ? -104.0 : -1000.0));
-  const V kLn2Over2 = Set(d, +0.346573590279972654708616);
-  const V kNegOne = Set(d, -1.0);
-  const V kNegZero = Set(d, -0.0);
-  const V kOne = Set(d, +1.0);
-  const V kOneOverLog2 = Set(d, +1.442695040888963407359924681);
-  // clang-format on
+  const V kHalf = Set(d, static_cast<T>(+0.5));
+  const V kLowerBound =
+      Set(d, static_cast<T>((sizeof(T) == 4 ? -104.0 : -1000.0)));
+  const V kLn2Over2 = Set(d, static_cast<T>(+0.346573590279972654708616));
+  const V kNegOne = Set(d, static_cast<T>(-1.0));
+  const V kNegZero = Set(d, static_cast<T>(-0.0));
+  const V kOne = Set(d, static_cast<T>(+1.0));
+  const V kOneOverLog2 = Set(d, static_cast<T>(+1.442695040888963407359924681));

-  impl::ExpImpl<LaneType> impl;
+  impl::ExpImpl<T> impl;

   // q = static_cast<int32>((x / log(2)) + ((x < 0) ? -0.5 : +0.5))
   const auto q =

@@ -1102,9 +1146,9 @@ HWY_INLINE V Expm1(const D d, V x) {

   // Reduce, approximate, and then reconstruct.
   const V y = impl.ExpPoly(d, impl.ExpReduce(d, x, q));
-  const V z = IfThenElse(Abs(x) < kLn2Over2, y,
-                         impl.LoadExpShortRange(d, (y + kOne), q) - kOne);
-  return IfThenElse(x < kLowerBound, kNegOne, z);
+  const V z = IfThenElse(Lt(Abs(x), kLn2Over2), y,
+                         Sub(impl.LoadExpShortRange(d, Add(y, kOne), q), kOne));
+  return IfThenElse(Lt(x, kLowerBound), kNegOne, z);
 }
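Exp and Expm1 share one reduction: q = round(x / ln 2) and r = x - q*ln 2, so e^x = 2^q * e^r with |r| <= (ln 2)/2, and only e^r needs a polynomial. A scalar sketch of this reduction, using a short Taylor stub where the library uses a tuned Estrin polynomial:

#include <cmath>

double ExpSketch(double x) {
  const double kOneOverLog2 = 1.442695040888963407359924681;
  const double kLn2 = 0.693147180559945286227;
  const double q = std::nearbyint(x * kOneOverLog2);  // nearest integer
  const double r = x - q * kLn2;                      // |r| <= ln(2)/2
  // expm1(r) via a degree-4 Taylor stub (the real code is more accurate).
  const double em1 = r * (1.0 + r * (0.5 + r * (1.0 / 6.0 + r / 24.0)));
  return std::ldexp(em1 + 1.0, static_cast<int>(q));  // scale by 2^q
}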
 template <class D, class V>

@@ -1114,34 +1158,37 @@ HWY_INLINE V Log(const D d, V x) {

 template <class D, class V>
 HWY_INLINE V Log10(const D d, V x) {
-  return Log(d, x) * Set(d, 0.4342944819032518276511);
+  using T = TFromD<D>;
+  return Mul(Log(d, x), Set(d, static_cast<T>(0.4342944819032518276511)));
 }

 template <class D, class V>
 HWY_INLINE V Log1p(const D d, V x) {
-  const V kOne = Set(d, +1.0);
+  using T = TFromD<D>;
+  const V kOne = Set(d, static_cast<T>(+1.0));

-  const V y = x + kOne;
-  const auto is_pole = y == kOne;
-  const auto divisor = IfThenZeroElse(is_pole, y) - kOne;
+  const V y = Add(x, kOne);
+  const auto is_pole = Eq(y, kOne);
+  const auto divisor = Sub(IfThenZeroElse(is_pole, y), kOne);
   const auto non_pole =
-      impl::Log<D, V, /*kAllowSubnormals=*/false>(d, y) * (x / divisor);
+      Mul(impl::Log<D, V, /*kAllowSubnormals=*/false>(d, y), Div(x, divisor));
   return IfThenElse(is_pole, x, non_pole);
 }

 template <class D, class V>
 HWY_INLINE V Log2(const D d, V x) {
-  return Log(d, x) * Set(d, 1.44269504088896340735992);
+  using T = TFromD<D>;
+  return Mul(Log(d, x), Set(d, static_cast<T>(1.44269504088896340735992)));
 }

 template <class D, class V>
 HWY_INLINE V Sin(const D d, V x) {
-  using LaneType = LaneType<V>;
-  impl::CosSinImpl<LaneType> impl;
+  using T = TFromD<D>;
+  impl::CosSinImpl<T> impl;

   // Float Constants
-  const V kOneOverPi = Set(d, 0.31830988618379067153);
-  const V kHalf = Set(d, 0.5);
+  const V kOneOverPi = Set(d, static_cast<T>(0.31830988618379067153));
+  const V kHalf = Set(d, static_cast<T>(0.5));

   // Integer Constants
   const Rebind<int32_t, D> di32;

@@ -1160,27 +1207,29 @@ HWY_INLINE V Sin(const D d, V x) {

 template <class D, class V>
 HWY_INLINE V Sinh(const D d, V x) {
-  const V kHalf = Set(d, +0.5);
-  const V kOne = Set(d, +1.0);
-  const V kTwo = Set(d, +2.0);
+  using T = TFromD<D>;
+  const V kHalf = Set(d, static_cast<T>(+0.5));
+  const V kOne = Set(d, static_cast<T>(+1.0));
+  const V kTwo = Set(d, static_cast<T>(+2.0));

   const V sign = And(SignBit(d), x);  // Extract the sign bit
   const V abs_x = Xor(x, sign);
   const V y = Expm1(d, abs_x);
-  const V z = ((y + kTwo) / (y + kOne) * (y * kHalf));
+  const V z = Mul(Div(Add(y, kTwo), Add(y, kOne)), Mul(y, kHalf));
   return Xor(z, sign);  // Reapply the sign bit
 }
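Sinh is built on Expm1: with y = expm1(|x|), sinh(|x|) = (e^x - e^-x)/2 = y*(y+2) / (2*(y+1)), which is exactly the (y+2)/(y+1) * (y*0.5) factoring above; using expm1 avoids the cancellation in e^x - 1 for small |x|. A scalar sketch:

#include <cmath>

double SinhSketch(double x) {
  const double abs_x = std::fabs(x);
  const double y = std::expm1(abs_x);  // e^|x| - 1
  // e^|x| = y+1 and e^-|x| = 1/(y+1), so (e^x - e^-x)/2 = y*(y+2)/(2*(y+1)).
  const double z = (y + 2.0) / (y + 1.0) * (y * 0.5);
  return std::copysign(z, x);  // reapply the sign bit
}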
 template <class D, class V>
 HWY_INLINE V Tanh(const D d, V x) {
-  const V kLimit = Set(d, 18.714973875);
-  const V kOne = Set(d, +1.0);
-  const V kTwo = Set(d, +2.0);
+  using T = TFromD<D>;
+  const V kLimit = Set(d, static_cast<T>(18.714973875));
+  const V kOne = Set(d, static_cast<T>(+1.0));
+  const V kTwo = Set(d, static_cast<T>(+2.0));

   const V sign = And(SignBit(d), x);  // Extract the sign bit
   const V abs_x = Xor(x, sign);
-  const V y = Expm1(d, abs_x * kTwo);
-  const V z = IfThenElse((abs_x > kLimit), kOne, (y / (y + kTwo)));
+  const V y = Expm1(d, Mul(abs_x, kTwo));
+  const V z = IfThenElse(Gt(abs_x, kLimit), kOne, Div(y, Add(y, kTwo)));
   return Xor(z, sign);  // Reapply the sign bit
 }
@@ -12,8 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include <stdio.h>

 #include <cfloat>  // FLT_MAX
-#include <iostream>
 #include <type_traits>

 // clang-format off

@@ -29,10 +30,18 @@ HWY_BEFORE_NAMESPACE();
 namespace hwy {
 namespace HWY_NAMESPACE {

+template <class Out, class In>
+inline Out BitCast(const In& in) {
+  static_assert(sizeof(Out) == sizeof(In), "");
+  Out out;
+  CopyBytes<sizeof(out)>(&in, &out);
+  return out;
+}
+
 template <class T, class D>
-void TestMath(const std::string name, T (*fx1)(T), Vec<D> (*fxN)(D, Vec<D>),
-              D d, T min, T max, uint64_t max_error_ulp) {
-  constexpr bool kIsF32 = (sizeof(T) == 4);
+HWY_NOINLINE void TestMath(const std::string name, T (*fx1)(T),
+                           Vec<D> (*fxN)(D, VecArg<Vec<D>>), D d, T min, T max,
+                           uint64_t max_error_ulp) {
   using UintT = MakeUnsigned<T>;

   const UintT min_bits = BitCast<UintT>(min);

@@ -51,18 +60,16 @@ void TestMath(const std::string name, T (*fx1)(T), Vec<D> (*fxN)(D, Vec<D>),
   }

   uint64_t max_ulp = 0;
-#if HWY_ARCH_ARM
-  // Emulation is slower, so cannot afford as many.
-  constexpr UintT kSamplesPerRange = 25000;
-#else
-  constexpr UintT kSamplesPerRange = 100000;
-#endif
+  constexpr UintT kSamplesPerRange = static_cast<UintT>(AdjustedReps(10000));
   for (int range_index = 0; range_index < range_count; ++range_index) {
     const UintT start = ranges[range_index][0];
     const UintT stop = ranges[range_index][1];
-    const UintT step = std::max<UintT>(1, ((stop - start) / kSamplesPerRange));
+    const UintT step = HWY_MAX(1, ((stop - start) / kSamplesPerRange));
     for (UintT value_bits = start; value_bits <= stop; value_bits += step) {
-      const T value = BitCast<T>(std::min(value_bits, stop));
+      // For reasons unknown, the HWY_MAX is necessary on RVV, otherwise
+      // value_bits can be less than start, and thus possibly NaN.
+      const T value = BitCast<T>(HWY_MIN(HWY_MAX(start, value_bits), stop));
       const T actual = GetLane(fxN(d, Set(d, value)));
       const T expected = fx1(value);

@@ -73,21 +80,41 @@ void TestMath(const std::string name, T (*fx1)(T), Vec<D> (*fxN)(D, Vec<D>),
       }
 #endif

-      const auto ulp = ComputeUlpDelta(actual, expected);
-      max_ulp = std::max<uint64_t>(max_ulp, ulp);
+      const auto ulp = hwy::detail::ComputeUlpDelta(actual, expected);
+      max_ulp = HWY_MAX(max_ulp, ulp);
       if (ulp > max_error_ulp) {
-        std::cout << name << "<" << (kIsF32 ? "F32x" : "F64x") << Lanes(d)
-                  << ">(" << value << ") expected: " << expected
-                  << " actual: " << actual << " ulp: " << ulp
-                  << " max: " << max_error_ulp << std::endl;
+        fprintf(stderr,
+                "%s: %s(%f) expected %f actual %f ulp %" PRIu64 " max ulp %u\n",
+                hwy::TypeName(T(), Lanes(d)).c_str(), name.c_str(), value,
+                expected, actual, static_cast<uint64_t>(ulp),
+                static_cast<uint32_t>(max_error_ulp));
       }
+      HWY_ASSERT(ulp <= max_error_ulp);
     }
   }
-  std::cout << (kIsF32 ? "F32x" : "F64x") << Lanes(d)
-            << ", Max ULP: " << max_ulp << std::endl;
+  fprintf(stderr, "%s: %s max_ulp %" PRIu64 "\n",
+          hwy::TypeName(T(), Lanes(d)).c_str(), name.c_str(), max_ulp);
+  HWY_ASSERT(max_ulp <= max_error_ulp);
 }
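TestMath compares scalar libm against the vector implementation in units of ULP. One common way to compute such a delta (shown here as a sketch; the harness itself calls hwy::detail::ComputeUlpDelta) is to map each float's bit pattern onto a monotonic integer line and subtract, so the result counts representable values between the two inputs:

#include <cstdint>
#include <cstring>

// Assumes neither input is NaN.
uint32_t UlpDeltaSketch(float a, float b) {
  auto to_ordered = [](float f) {
    uint32_t u;
    std::memcpy(&u, &f, sizeof(u));
    // Negative floats sort backwards in raw bit order; fold them onto a
    // single increasing line, with +0 and -0 mapping to the same point.
    return (u & 0x80000000u) ? (0x80000000u - (u & 0x7FFFFFFFu))
                             : (0x80000000u + u);
  };
  const uint32_t ua = to_ordered(a);
  const uint32_t ub = to_ordered(b);
  return ua > ub ? ua - ub : ub - ua;
}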
+// TODO(janwas): remove once RVV supports fractional LMUL
+#undef DEFINE_MATH_TEST_FUNC
+#if HWY_TARGET == HWY_RVV
+
+#define DEFINE_MATH_TEST_FUNC(NAME)                    \
+  HWY_NOINLINE void TestAll##NAME() {                  \
+    ForFloatTypes(ForShrinkableVectors<Test##NAME>()); \
+  }
+
+#else
+
+#define DEFINE_MATH_TEST_FUNC(NAME)                  \
+  HWY_NOINLINE void TestAll##NAME() {                \
+    ForFloatTypes(ForPartialVectors<Test##NAME>());  \
+  }
+
+#endif

 #undef DEFINE_MATH_TEST
 #define DEFINE_MATH_TEST(NAME, F32x1, F32xN, F32_MIN, F32_MAX, F32_ERROR, \
                          F64x1, F64xN, F64_MIN, F64_MAX, F64_ERROR)       \
   struct Test##NAME {                                                     \

@@ -97,28 +124,45 @@ void TestMath(const std::string name, T (*fx1)(T), Vec<D> (*fxN)(D, Vec<D>),
       TestMath<T, D>(HWY_STR(NAME), F32x1, F32xN, d, F32_MIN, F32_MAX,    \
                      F32_ERROR);                                          \
     } else {                                                              \
-      TestMath<T, D>(HWY_STR(NAME), F64x1, F64xN, d, F64_MIN, F64_MAX,    \
+      TestMath<T, D>(HWY_STR(NAME), F64x1, F64xN, d,                      \
+                     static_cast<T>(F64_MIN), static_cast<T>(F64_MAX),    \
                      F64_ERROR);                                          \
     }                                                                     \
   }                                                                       \
   };                                                                      \
-  HWY_NOINLINE void TestAll##NAME() {                                     \
-    ForFloatTypes(ForPartialVectors<Test##NAME>());                       \
-  }
+  DEFINE_MATH_TEST_FUNC(NAME)

 // Floating point values closest to but less than 1.0
 const float kNearOneF = BitCast<float>(0x3F7FFFFF);
 const double kNearOneD = BitCast<double>(0x3FEFFFFFFFFFFFFFULL);

+// The discrepancy is unacceptably large for MSYS2 (less accurate libm?), so
+// only increase the error tolerance there.
+constexpr uint64_t Cos64ULP() {
+#if defined(__MINGW32__)
+  return 23;
+#else
+  return 3;
+#endif
+}
+
+constexpr uint64_t ACosh32ULP() {
+#if defined(__MINGW32__)
+  return 8;
+#else
+  return 3;
+#endif
+}

 // clang-format off
 DEFINE_MATH_TEST(Acos,
-  std::acos, CallAcos, -1.0, +1.0, 3,  // NEON is 3 instead of 2
+  std::acos, CallAcos, -1.0f, +1.0f, 3,  // NEON is 3 instead of 2
   std::acos, CallAcos, -1.0, +1.0, 2)
 DEFINE_MATH_TEST(Acosh,
-  std::acosh, CallAcosh, +1.0, +FLT_MAX, 3,
+  std::acosh, CallAcosh, +1.0f, +FLT_MAX, ACosh32ULP(),
   std::acosh, CallAcosh, +1.0, +DBL_MAX, 3)
 DEFINE_MATH_TEST(Asin,
-  std::asin, CallAsin, -1.0, +1.0, 4,  // ARMv7 is 4 instead of 2
+  std::asin, CallAsin, -1.0f, +1.0f, 4,  // ARMv7 is 4 instead of 2
   std::asin, CallAsin, -1.0, +1.0, 2)
 DEFINE_MATH_TEST(Asinh,
   std::asinh, CallAsinh, -FLT_MAX, +FLT_MAX, 3,

@@ -130,13 +174,13 @@ DEFINE_MATH_TEST(Atanh,
   std::atanh, CallAtanh, -kNearOneF, +kNearOneF, 4,  // NEON is 4 instead of 3
   std::atanh, CallAtanh, -kNearOneD, +kNearOneD, 3)
 DEFINE_MATH_TEST(Cos,
-  std::cos, CallCos, -39000.0, +39000.0, 3,
-  std::cos, CallCos, -39000.0, +39000.0, 3)
+  std::cos, CallCos, -39000.0f, +39000.0f, 3,
+  std::cos, CallCos, -39000.0, +39000.0, Cos64ULP())
 DEFINE_MATH_TEST(Exp,
-  std::exp, CallExp, -FLT_MAX, +104.0, 1,
+  std::exp, CallExp, -FLT_MAX, +104.0f, 1,
   std::exp, CallExp, -DBL_MAX, +104.0, 1)
 DEFINE_MATH_TEST(Expm1,
-  std::expm1, CallExpm1, -FLT_MAX, +104.0, 4,
+  std::expm1, CallExpm1, -FLT_MAX, +104.0f, 4,
   std::expm1, CallExpm1, -DBL_MAX, +104.0, 4)
 DEFINE_MATH_TEST(Log,
   std::log, CallLog, +FLT_MIN, +FLT_MAX, 1,

@@ -145,14 +189,14 @@ DEFINE_MATH_TEST(Log10,
   std::log10, CallLog10, +FLT_MIN, +FLT_MAX, 2,
   std::log10, CallLog10, +DBL_MIN, +DBL_MAX, 2)
 DEFINE_MATH_TEST(Log1p,
-  std::log1p, CallLog1p, +0.0f, +1e37, 3,  // NEON is 3 instead of 2
+  std::log1p, CallLog1p, +0.0f, +1e37f, 3,  // NEON is 3 instead of 2
   std::log1p, CallLog1p, +0.0, +DBL_MAX, 2)
 DEFINE_MATH_TEST(Log2,
   std::log2, CallLog2, +FLT_MIN, +FLT_MAX, 2,
   std::log2, CallLog2, +DBL_MIN, +DBL_MAX, 2)
 DEFINE_MATH_TEST(Sin,
-  std::sin, CallSin, -39000.0, +39000.0, 3,
-  std::sin, CallSin, -39000.0, +39000.0, 3)
+  std::sin, CallSin, -39000.0f, +39000.0f, 3,
+  std::sin, CallSin, -39000.0, +39000.0, 4)  // MSYS is 4 instead of 3
 DEFINE_MATH_TEST(Sinh,
   std::sinh, CallSinh, -80.0f, +80.0f, 4,
   std::sinh, CallSinh, -709.0, +709.0, 4)

@@ -167,6 +211,7 @@ DEFINE_MATH_TEST(Tanh,
 HWY_AFTER_NAMESPACE();

 #if HWY_ONCE

 namespace hwy {
 HWY_BEFORE_TEST(HwyMathTest);
 HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAcos);

@@ -186,4 +231,11 @@ HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllSin);
 HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllSinh);
 HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllTanh);
 }  // namespace hwy
+
+// Ought not to be necessary, but without this, no tests run on RVV.
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
 #endif
@@ -0,0 +1,909 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Per-target include guard

#if defined(HIGHWAY_HWY_CONTRIB_SORT_SORT_INL_H_) == \
    defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_SORT_SORT_INL_H_
#undef HIGHWAY_HWY_CONTRIB_SORT_SORT_INL_H_
#else
#define HIGHWAY_HWY_CONTRIB_SORT_SORT_INL_H_
#endif

#include <inttypes.h>

#include "hwy/aligned_allocator.h"
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

enum class SortOrder { kAscending, kDescending };

#if HWY_TARGET != HWY_SCALAR && HWY_ARCH_X86

#define HWY_SORT_VERIFY 1

constexpr inline SortOrder Reverse(SortOrder order) {
  return (order == SortOrder::kAscending) ? SortOrder::kDescending
                                          : SortOrder::kAscending;
}

namespace verify {

template <typename T>
bool Compare(T a, T b, SortOrder kOrder) {
  if (kOrder == SortOrder::kAscending) return a <= b;
  return a >= b;
}

#if HWY_SORT_VERIFY

template <class D>
class Runs {
  using T = TFromD<D>;

 public:
  Runs(D d, size_t num_regs, size_t run_length = 0, bool alternating = false) {
    const size_t N = Lanes(d);

    buf_ = AllocateAligned<T>(N);
    consecutive_ = AllocateAligned<T>(num_regs * N);

    num_regs_ = num_regs;
    if (run_length) {
      run_length_ = run_length;
      num_runs_ = num_regs * N / run_length;
      is_vector_ = true;
      alternating_ = alternating;
    } else {
      run_length_ = num_regs * 4;
      num_runs_ = N / 4;
      is_vector_ = false;
      alternating_ = false;
    }
  }

  void ScatterQuartets(D d, const size_t idx_reg, Vec<D> v) {
    HWY_ASSERT(idx_reg < num_regs_);
    const size_t N = Lanes(d);
    for (size_t i = 0; i < N; i += 4) {
      Store(v, d, buf_.get());
      const size_t idx_q = (i / 4) * num_regs_ + idx_reg;
      CopyBytes<16>(buf_.get() + i, consecutive_.get() + idx_q * 4);
    }
  }

  void StoreVector(D d, const size_t idx_reg, Vec<D> v) {
    HWY_ASSERT(idx_reg < num_regs_);
    Store(v, d, &consecutive_[idx_reg * Lanes(d)]);
  }

  bool IsBitonic() const {
    HWY_ASSERT(!alternating_);
    for (size_t ir = 0; ir < num_runs_; ++ir) {
      const T* p = &consecutive_[ir * run_length_];
      bool is_asc = true;
      bool is_desc = true;
      bool is_zero = true;

      for (size_t i = 0; i < run_length_ / 2 - 1; ++i) {
        is_asc &= (p[i] <= p[i + 1]);
        is_desc &= (p[i] >= p[i + 1]);
      }
      for (size_t i = 0; i < run_length_; ++i) {
        is_zero &= (p[i] == 0);
      }

      bool is_asc2 = true;
      bool is_desc2 = true;
      for (size_t i = run_length_ / 2; i < run_length_ - 1; ++i) {
        is_asc2 &= (p[i] <= p[i + 1]);
        is_desc2 &= (p[i] >= p[i + 1]);
      }

      if (is_zero) continue;
      if (is_asc && is_desc2) continue;
      if (is_desc && is_asc2) continue;
      return false;
    }
    return true;
  }

  void CheckBitonic(int line, int caller) const {
    if (IsBitonic()) return;
    for (size_t ir = 0; ir < num_runs_; ++ir) {
      const T* p = &consecutive_[ir * run_length_];
      printf("run %" PRIu64 " (len %" PRIu64 ")\n", static_cast<uint64_t>(ir),
             static_cast<uint64_t>(run_length_));
      for (size_t i = 0; i < run_length_; ++i) {
        printf("%.0f\n", static_cast<float>(p[i]));
      }
    }
    printf("caller %d\n", caller);
    hwy::Abort("", line, "not bitonic");
  }

  void CheckSorted(SortOrder kOrder, int line, int caller) const {
    for (size_t ir = 0; ir < num_runs_; ++ir) {
      const SortOrder order =
          (alternating_ && (ir & 1)) ? Reverse(kOrder) : kOrder;
      const T* p = &consecutive_[ir * run_length_];

      for (size_t i = 0; i < run_length_ - 1; ++i) {
        if (!Compare(p[i], p[i + 1], order)) {
          printf("ir%" PRIu64 " run_length=%" PRIu64
                 " alt=%d original order=%d this order=%d\n",
                 static_cast<uint64_t>(ir), static_cast<uint64_t>(run_length_),
                 alternating_, static_cast<int>(kOrder),
                 static_cast<int>(order));
          for (size_t i = 0; i < run_length_; ++i) {
            printf(" %.0f\n", static_cast<float>(p[i]));
          }
          printf("caller %d\n", caller);
          hwy::Abort("", line, "not sorted");
        }
      }
    }
  }

 private:
  AlignedFreeUniquePtr<T[]> buf_;
  AlignedFreeUniquePtr<T[]> consecutive_;
  size_t num_regs_;
  size_t run_length_;
  size_t num_runs_;
  bool is_vector_;
  bool alternating_;
};

template <class D>
Runs<D> StoreDeinterleavedQuartets(D d, Vec<D> v0) {
  Runs<D> runs(d, 1);
  runs.ScatterQuartets(d, 0, v0);
  return runs;
}

template <class D>
Runs<D> StoreDeinterleavedQuartets(D d, Vec<D> v0, Vec<D> v1) {
  Runs<D> runs(d, 2);
  runs.ScatterQuartets(d, 0, v0);
  runs.ScatterQuartets(d, 1, v1);
  return runs;
}

template <class D>
Runs<D> StoreDeinterleavedQuartets(D d, Vec<D> v0, Vec<D> v1, Vec<D> v2,
                                   Vec<D> v3) {
  Runs<D> runs(d, 4);
  runs.ScatterQuartets(d, 0, v0);
  runs.ScatterQuartets(d, 1, v1);
  runs.ScatterQuartets(d, 2, v2);
  runs.ScatterQuartets(d, 3, v3);
  return runs;
}

template <class D>
Runs<D> StoreDeinterleavedQuartets(D d, Vec<D> v0, Vec<D> v1, Vec<D> v2,
                                   Vec<D> v3, Vec<D> v4, Vec<D> v5, Vec<D> v6,
                                   Vec<D> v7) {
  Runs<D> runs(d, 8);
  runs.ScatterQuartets(d, 0, v0);
  runs.ScatterQuartets(d, 1, v1);
  runs.ScatterQuartets(d, 2, v2);
  runs.ScatterQuartets(d, 3, v3);
  runs.ScatterQuartets(d, 4, v4);
  runs.ScatterQuartets(d, 5, v5);
  runs.ScatterQuartets(d, 6, v6);
  runs.ScatterQuartets(d, 7, v7);
  return runs;
}

template <class D>
Runs<D> StoreDeinterleavedQuartets(D d, Vec<D> v0, Vec<D> v1, Vec<D> v2,
                                   Vec<D> v3, Vec<D> v4, Vec<D> v5, Vec<D> v6,
                                   Vec<D> v7, Vec<D> v8, Vec<D> v9, Vec<D> vA,
                                   Vec<D> vB, Vec<D> vC, Vec<D> vD, Vec<D> vE,
                                   Vec<D> vF) {
  Runs<D> runs(d, 16);
  runs.ScatterQuartets(d, 0x0, v0);
  runs.ScatterQuartets(d, 0x1, v1);
  runs.ScatterQuartets(d, 0x2, v2);
  runs.ScatterQuartets(d, 0x3, v3);
  runs.ScatterQuartets(d, 0x4, v4);
  runs.ScatterQuartets(d, 0x5, v5);
  runs.ScatterQuartets(d, 0x6, v6);
  runs.ScatterQuartets(d, 0x7, v7);
  runs.ScatterQuartets(d, 0x8, v8);
  runs.ScatterQuartets(d, 0x9, v9);
  runs.ScatterQuartets(d, 0xA, vA);
  runs.ScatterQuartets(d, 0xB, vB);
  runs.ScatterQuartets(d, 0xC, vC);
  runs.ScatterQuartets(d, 0xD, vD);
  runs.ScatterQuartets(d, 0xE, vE);
  runs.ScatterQuartets(d, 0xF, vF);
  return runs;
}

template <class D>
Runs<D> StoreDeinterleavedQuartets(
    D d, const Vec<D>& v00, const Vec<D>& v01, const Vec<D>& v02,
    const Vec<D>& v03, const Vec<D>& v04, const Vec<D>& v05, const Vec<D>& v06,
    const Vec<D>& v07, const Vec<D>& v08, const Vec<D>& v09, const Vec<D>& v0A,
    const Vec<D>& v0B, const Vec<D>& v0C, const Vec<D>& v0D, const Vec<D>& v0E,
    const Vec<D>& v0F, const Vec<D>& v10, const Vec<D>& v11, const Vec<D>& v12,
    const Vec<D>& v13, const Vec<D>& v14, const Vec<D>& v15, const Vec<D>& v16,
    const Vec<D>& v17, const Vec<D>& v18, const Vec<D>& v19, const Vec<D>& v1A,
    const Vec<D>& v1B, const Vec<D>& v1C, const Vec<D>& v1D, const Vec<D>& v1E,
    const Vec<D>& v1F) {
  Runs<D> runs(d, 32);
  runs.ScatterQuartets(d, 0x00, v00);
  runs.ScatterQuartets(d, 0x01, v01);
  runs.ScatterQuartets(d, 0x02, v02);
  runs.ScatterQuartets(d, 0x03, v03);
  runs.ScatterQuartets(d, 0x04, v04);
  runs.ScatterQuartets(d, 0x05, v05);
  runs.ScatterQuartets(d, 0x06, v06);
  runs.ScatterQuartets(d, 0x07, v07);
  runs.ScatterQuartets(d, 0x08, v08);
  runs.ScatterQuartets(d, 0x09, v09);
  runs.ScatterQuartets(d, 0x0A, v0A);
  runs.ScatterQuartets(d, 0x0B, v0B);
  runs.ScatterQuartets(d, 0x0C, v0C);
  runs.ScatterQuartets(d, 0x0D, v0D);
  runs.ScatterQuartets(d, 0x0E, v0E);
  runs.ScatterQuartets(d, 0x0F, v0F);
  runs.ScatterQuartets(d, 0x10, v10);
  runs.ScatterQuartets(d, 0x11, v11);
  runs.ScatterQuartets(d, 0x12, v12);
  runs.ScatterQuartets(d, 0x13, v13);
  runs.ScatterQuartets(d, 0x14, v14);
  runs.ScatterQuartets(d, 0x15, v15);
  runs.ScatterQuartets(d, 0x16, v16);
  runs.ScatterQuartets(d, 0x17, v17);
  runs.ScatterQuartets(d, 0x18, v18);
  runs.ScatterQuartets(d, 0x19, v19);
  runs.ScatterQuartets(d, 0x1A, v1A);
  runs.ScatterQuartets(d, 0x1B, v1B);
  runs.ScatterQuartets(d, 0x1C, v1C);
  runs.ScatterQuartets(d, 0x1D, v1D);
  runs.ScatterQuartets(d, 0x1E, v1E);
  runs.ScatterQuartets(d, 0x1F, v1F);
  return runs;
}

template <class D>
Runs<D> StoreVectors(D d, Vec<D> v0, size_t run_length, bool alternating) {
  Runs<D> runs(d, 1, run_length, alternating);
  runs.StoreVector(d, 0, v0);
  return runs;
}

template <class D>
Runs<D> StoreVectors(D d, Vec<D> v0, Vec<D> v1) {
  constexpr size_t kRegs = 2;
  Runs<D> runs(d, kRegs, /*run_length=*/kRegs * Lanes(d),
               /*alternating=*/false);
  runs.StoreVector(d, 0, v0);
  runs.StoreVector(d, 1, v1);
  return runs;
}

template <class D>
Runs<D> StoreVectors(D d, Vec<D> v0, Vec<D> v1, Vec<D> v2, Vec<D> v3) {
  constexpr size_t kRegs = 4;
  Runs<D> runs(d, kRegs, /*run_length=*/kRegs * Lanes(d),
               /*alternating=*/false);
  runs.StoreVector(d, 0, v0);
  runs.StoreVector(d, 1, v1);
  runs.StoreVector(d, 2, v2);
  runs.StoreVector(d, 3, v3);
  return runs;
}

template <class D>
Runs<D> StoreVectors(D d, Vec<D> v0, Vec<D> v1, Vec<D> v2, Vec<D> v3, Vec<D> v4,
                     Vec<D> v5, Vec<D> v6, Vec<D> v7) {
  constexpr size_t kRegs = 8;
  Runs<D> runs(d, kRegs, /*run_length=*/kRegs * Lanes(d),
               /*alternating=*/false);
  runs.StoreVector(d, 0, v0);
  runs.StoreVector(d, 1, v1);
  runs.StoreVector(d, 2, v2);
  runs.StoreVector(d, 3, v3);
  runs.StoreVector(d, 4, v4);
  runs.StoreVector(d, 5, v5);
  runs.StoreVector(d, 6, v6);
  runs.StoreVector(d, 7, v7);
  return runs;
}

#endif  // HWY_SORT_VERIFY
}  // namespace verify

namespace detail {

// ------------------------------ Vector-length agnostic (quartets)

// For each lane i: replaces a[i] with the first and b[i] with the second
// according to kOrder.
// Corresponds to a conditional swap, which is one "node" of a sorting network.
// Min/Max are cheaper than compare + blend at least for integers.
template <SortOrder kOrder, class V>
HWY_INLINE void SortLanesIn2Vectors(V& a, V& b) {
  V temp = a;
  a = (kOrder == SortOrder::kAscending) ? Min(a, b) : Max(a, b);
  b = (kOrder == SortOrder::kAscending) ? Max(temp, b) : Min(temp, b);
}
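SortLanesIn2Vectors is the vectorized form of a single compare-exchange node: each lane position performs one conditional swap, so N lanes advance N independent sorting networks at once. A scalar sketch of one such node, assuming ascending order:

#include <algorithm>

// One "node" of a sorting network: after the call, a <= b.
template <typename T>
void CompareExchange(T& a, T& b) {
  const T lo = std::min(a, b);
  const T hi = std::max(a, b);
  a = lo;
  b = hi;
}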
// For each lane: sorts the four values in that lane of the four vectors.
template <SortOrder kOrder, class D, class V = Vec<D>>
HWY_INLINE void SortLanesIn4Vectors(D d, const TFromD<D>* in, V& v0, V& v1,
                                    V& v2, V& v3) {
  const size_t N = Lanes(d);

  // Bitonic and odd-even sorters both have 5 nodes. This one is from
  // http://users.telenet.be/bertdobbelaere/SorterHunter/sorting_networks.html

  // layer 1
  v0 = Load(d, in + 0 * N);
  v2 = Load(d, in + 2 * N);
  SortLanesIn2Vectors<kOrder>(v0, v2);
  v1 = Load(d, in + 1 * N);
  v3 = Load(d, in + 3 * N);
  SortLanesIn2Vectors<kOrder>(v1, v3);

  // layer 2
  SortLanesIn2Vectors<kOrder>(v0, v1);
  SortLanesIn2Vectors<kOrder>(v2, v3);

  // layer 3
  SortLanesIn2Vectors<kOrder>(v1, v2);
}
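For reference, here is the same 5-comparator network applied to one scalar quadruple; the vector version above runs this network once per lane, with each "wire" living in a different register. A sketch only, reusing the CompareExchange node from the previous snippet:

// Sorts v[0..3] ascending with 5 compare-exchange nodes:
// (0,2) (1,3), then (0,1) (2,3), then the middle pair (1,2).
template <typename T>
void SortFour(T v[4]) {
  CompareExchange(v[0], v[2]);  // layer 1
  CompareExchange(v[1], v[3]);
  CompareExchange(v[0], v[1]);  // layer 2
  CompareExchange(v[2], v[3]);
  CompareExchange(v[1], v[2]);  // layer 3
}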
// Inputs are vectors with columns in sorted order (from SortLanesIn4Vectors).
// Transposes so that output vectors are sorted quartets (128-bit blocks),
// and a quartet in v0 comes before its counterpart in v1, etc.
template <class D, class V = Vec<D>>
HWY_INLINE void Transpose4x4(D d, V& v0, V& v1, V& v2, V& v3) {
  const RepartitionToWide<decltype(d)> dw;

  // Input: first number is reg, second is lane (0 is lowest)
  // 03 02 01 00  |
  // 13 12 11 10  | columns are sorted
  // 23 22 21 20  | (in this order)
  // 33 32 31 30  V
  const V t0 = InterleaveLower(d, v0, v1);  // 11 01 10 00
  const V t1 = InterleaveLower(d, v2, v3);  // 31 21 30 20
  const V t2 = InterleaveUpper(d, v0, v1);  // 13 03 12 02
  const V t3 = InterleaveUpper(d, v2, v3);  // 33 23 32 22

  // 30 20 10 00
  v0 = BitCast(d, InterleaveLower(BitCast(dw, t0), BitCast(dw, t1)));
  // 31 21 11 01
  v1 = BitCast(d, InterleaveUpper(BitCast(dw, t0), BitCast(dw, t1)));
  // 32 22 12 02
  v2 = BitCast(d, InterleaveLower(BitCast(dw, t2), BitCast(dw, t3)));
  // 33 23 13 03 --> sorted in descending order (03=smallest in lane 0).
  v3 = BitCast(d, InterleaveUpper(BitCast(dw, t2), BitCast(dw, t3)));
}

// 12 ops (including 4 swizzle)
// Precondition: v0 and v1 are already sorted according to kOrder.
// Postcondition: concatenate(v0, v1) is sorted and v0 is the lower half.
template <SortOrder kOrder, class D, class V = Vec<D>>
HWY_INLINE void Merge2SortedQuartets(D d, V& v0, V& v1, int caller) {
#if HWY_SORT_VERIFY
  const verify::Runs<D> input0 = verify::StoreDeinterleavedQuartets(d, v0);
  const verify::Runs<D> input1 = verify::StoreDeinterleavedQuartets(d, v1);
  input0.CheckSorted(kOrder, __LINE__, caller);
  input1.CheckSorted(kOrder, __LINE__, caller);
#endif

  // See figure 5 from https://www.vldb.org/pvldb/vol8/p1274-inoue.pdf.
  // This requires 8 min/max vs 6 for bitonic merge (see Figure 2 in
  // http://www.vldb.org/pvldb/vol1/1454171.pdf), but is faster overall because
  // it needs less shuffling, and does not need a bitonic input.
  SortLanesIn2Vectors<kOrder>(v0, v1);
  v0 = Shuffle0321(v0);
  SortLanesIn2Vectors<kOrder>(v0, v1);
  v0 = Shuffle0321(v0);
  SortLanesIn2Vectors<kOrder>(v0, v1);
  v0 = Shuffle0321(v0);
  SortLanesIn2Vectors<kOrder>(v0, v1);
  v0 = Shuffle0321(v0);

#if HWY_SORT_VERIFY
  auto output = verify::StoreDeinterleavedQuartets(d, v0, v1);
  output.CheckSorted(kOrder, __LINE__, caller);
#endif
}
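A scalar model of that merge may help: four rounds of lanewise min/max, each followed by rotating the low quartet one lane, leave the four smallest values in a (sorted, once the final rotate restores their order) and the four largest in b. This sketches the access pattern only; it is not the library's code:

#include <algorithm>

// Merges two sorted quadruples; afterwards a and b are both sorted and
// every element of a is <= every element of b.
template <typename T>
void MergeSortedQuartets(T a[4], T b[4]) {
  for (int round = 0; round < 4; ++round) {
    for (int i = 0; i < 4; ++i) {  // lanewise conditional swap
      const T lo = std::min(a[i], b[i]);
      b[i] = std::max(a[i], b[i]);
      a[i] = lo;
    }
    // Scalar equivalent of Shuffle0321: rotate a left by one lane.
    const T t = a[0];
    a[0] = a[1];
    a[1] = a[2];
    a[2] = a[3];
    a[3] = t;
  }
}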
// ------------------------------ Bitonic merge (quartets)
|
||||
|
||||
// For the last layer of bitonic merge. Conditionally swaps even-numbered lanes
|
||||
// with their odd-numbered neighbor. Works for both quartets and vectors.
|
||||
template <SortOrder kOrder, class D>
|
||||
HWY_INLINE void SortAdjacentLanesQV(D d, Vec<D>& q_or_v) {
|
||||
(void)d;
|
||||
// Optimization for 32-bit integers: swap via Shuffle and 64-bit Min/Max.
|
||||
// (not worthwhile on SSE4/AVX2 because they lack 64-bit Min/Max)
|
||||
#if !HWY_ARCH_X86 || HWY_TARGET <= HWY_AVX3
|
||||
if (sizeof(TFromD<D>) == 4 && !IsFloat<TFromD<D>>()) {
|
||||
const RepartitionToWide<decltype(d)> dw;
|
||||
const auto wide = BitCast(dw, q_or_v);
|
||||
const auto swap = BitCast(dw, Shuffle2301(q_or_v));
|
||||
if (kOrder == SortOrder::kAscending) {
|
||||
q_or_v = BitCast(d, Max(wide, swap));
|
||||
} else {
|
||||
q_or_v = BitCast(d, Min(wide, swap));
|
||||
}
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
Vec<D> swapped = Shuffle2301(q_or_v);
|
||||
SortLanesIn2Vectors<kOrder>(q_or_v, swapped);
|
||||
q_or_v = OddEven(swapped, q_or_v);
|
||||
}
|
||||
}
|
||||
|
||||
// Lane 0 with 2, 1 with 3 etc. Works for both quartets and vectors.
|
||||
template <SortOrder kOrder, class D>
|
||||
HWY_INLINE void SortDistance2LanesQV(D d, Vec<D>& q_or_v) {
|
||||
const RepartitionToWide<decltype(d)> dw;
|
||||
Vec<D> swapped = Shuffle1032(q_or_v);
|
||||
SortLanesIn2Vectors<kOrder>(q_or_v, swapped);
|
||||
q_or_v = BitCast(d, OddEven(BitCast(dw, swapped), BitCast(dw, q_or_v)));
|
||||
}
|
||||
|
||||
// For all BitonicMerge*, and each block, the concatenation of those blocks from
|
||||
// the first half and second half of the input vectors must be sorted in
|
||||
// opposite orders.
|
||||
|
||||
// 14 ops (including 4 swizzle)
|
||||
template <SortOrder kOrder, class D, class V = Vec<D>>
|
||||
HWY_INLINE void BitonicMerge2Quartets(D d, V& q0, V& q1, int caller) {
|
||||
#if HWY_SORT_VERIFY
|
||||
const verify::Runs<D> input = verify::StoreDeinterleavedQuartets(d, q0, q1);
|
||||
if (caller == -1) input.CheckBitonic(__LINE__, __LINE__);
|
||||
#endif
|
||||
|
||||
// Layer 1: lane stride 4 (2 ops)
|
||||
SortLanesIn2Vectors<kOrder>(q0, q1);
|
||||
|
||||
// Layer 2: lane stride 2 (6 ops)
|
||||
SortDistance2LanesQV<kOrder>(d, q0);
|
||||
SortDistance2LanesQV<kOrder>(d, q1);
|
||||
|
||||
// Layer 3: lane stride 1 (4 ops)
|
||||
SortAdjacentLanesQV<kOrder>(d, q0);
|
||||
SortAdjacentLanesQV<kOrder>(d, q1);
|
||||
|
||||
#if HWY_SORT_VERIFY
|
||||
const verify::Runs<D> output = verify::StoreDeinterleavedQuartets(d, q0, q1);
|
||||
output.CheckSorted(kOrder, __LINE__, caller);
|
||||
#endif
|
||||
}
|
||||
|
||||
// 32 ops, more efficient than three 4+4 merges (36 ops).
|
||||
template <SortOrder kOrder, class D, class V = Vec<D>>
|
||||
HWY_INLINE void BitonicMerge4Quartets(D d, V& q0, V& q1, V& q2, V& q3,
|
||||
int caller) {
|
||||
#if HWY_SORT_VERIFY
|
||||
const verify::Runs<D> input =
|
||||
verify::StoreDeinterleavedQuartets(d, q0, q1, q2, q3);
|
||||
if (caller == -1) input.CheckBitonic(__LINE__, __LINE__);
|
||||
#endif
|
||||
|
||||
// Layer 1: lane stride 8
|
||||
SortLanesIn2Vectors<kOrder>(q0, q2);
|
||||
SortLanesIn2Vectors<kOrder>(q1, q3);
|
||||
|
||||
// Layers 2 to 4
|
||||
// Inputs are not fully sorted, so cannot use Merge2SortedQuartets.
|
||||
BitonicMerge2Quartets<kOrder>(d, q0, q1, __LINE__);
|
||||
BitonicMerge2Quartets<kOrder>(d, q2, q3, __LINE__);
|
||||
|
||||
#if HWY_SORT_VERIFY
|
||||
const verify::Runs<D> output =
|
||||
verify::StoreDeinterleavedQuartets(d, q0, q1, q2, q3);
|
||||
output.CheckSorted(kOrder, __LINE__, caller);
|
||||
#endif
|
||||
}
|
||||
|
||||
// 72 ops.
|
||||
template <SortOrder kOrder, class D, class V = Vec<D>>
|
||||
HWY_INLINE void BitonicMerge8Quartets(D d, V& q0, V& q1, V& q2, V& q3, V& q4,
|
||||
V& q5, V& q6, V& q7, int caller) {
|
||||
#if HWY_SORT_VERIFY
|
||||
const verify::Runs<D> input =
|
||||
verify::StoreDeinterleavedQuartets(d, q0, q1, q2, q3, q4, q5, q6, q7);
|
||||
if (caller == -1) input.CheckBitonic(__LINE__, __LINE__);
|
||||
#endif
|
||||
|
||||
// Layer 1: lane stride 16
|
||||
SortLanesIn2Vectors<kOrder>(q0, q4);
|
||||
SortLanesIn2Vectors<kOrder>(q1, q5);
|
||||
SortLanesIn2Vectors<kOrder>(q2, q6);
|
||||
SortLanesIn2Vectors<kOrder>(q3, q7);
|
||||
|
||||
// Layers 2 to 5
|
||||
BitonicMerge4Quartets<kOrder>(d, q0, q1, q2, q3, __LINE__);
|
||||
BitonicMerge4Quartets<kOrder>(d, q4, q5, q6, q7, __LINE__);
|
||||
|
||||
#if HWY_SORT_VERIFY
|
||||
const verify::Runs<D> output =
|
||||
verify::StoreDeinterleavedQuartets(d, q0, q1, q2, q3, q4, q5, q6, q7);
|
||||
output.CheckSorted(kOrder, __LINE__, caller);
|
||||
#endif
|
||||
}
|
||||
|
||||
// ------------------------------ Bitonic merge (vectors)
|
||||
|
||||
// Lane 0 with 4, 1 with 5 etc. Only used for vectors with at least 8 lanes.
|
||||
#if HWY_TARGET <= HWY_AVX3
|
||||
|
||||
// TODO(janwas): move to op
|
||||
template <typename T>
|
||||
Vec512<T> Shuffle128_2020(Vec512<T> a, Vec512<T> b) {
|
||||
return Vec512<T>{_mm512_shuffle_i32x4(a.raw, b.raw, _MM_SHUFFLE(2, 0, 2, 0))};
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Vec512<T> Shuffle128_3131(Vec512<T> a, Vec512<T> b) {
|
||||
return Vec512<T>{_mm512_shuffle_i32x4(a.raw, b.raw, _MM_SHUFFLE(3, 1, 3, 1))};
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Vec512<T> Shuffle128_2301(Vec512<T> a, Vec512<T> b) {
|
||||
return Vec512<T>{_mm512_shuffle_i32x4(a.raw, b.raw, _MM_SHUFFLE(2, 3, 0, 1))};
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
Vec512<T> OddEven128(Vec512<T> odd, Vec512<T> even) {
|
||||
return Vec512<T>{_mm512_mask_blend_epi64(__mmask8{0x33u}, odd.raw, even.raw)};
|
||||
}
|
||||
|
||||
template <SortOrder kOrder, class T>
|
||||
HWY_INLINE void SortDistance4LanesV(Simd<T, 16> d, Vec<decltype(d)>& v) {
|
||||
// In: FEDCBA98 76543210
|
||||
// Swap 128-bit halves of each 256 bits => BA98FEDC 32107654
|
||||
Vec512<T> swapped = Shuffle128_2301(v, v);
|
||||
SortLanesIn2Vectors<kOrder>(v, swapped);
|
||||
v = OddEven128(swapped, v);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
template <SortOrder kOrder, typename T>
|
||||
HWY_INLINE void SortDistance4LanesV(Simd<T, 8> d, Vec<decltype(d)>& v) {
|
||||
Vec<decltype(d)> swapped = ConcatLowerUpper(d, v, v);
|
||||
SortLanesIn2Vectors<kOrder>(v, swapped);
|
||||
v = ConcatUpperLower(swapped, v);
|
||||
}
|
||||
|
||||
template <SortOrder kOrder, typename T>
|
||||
HWY_INLINE void SortDistance4LanesV(Simd<T, 4> /* tag */, ...) {}
|
||||
|
||||
// Only used for vectors with at least 16 lanes.
|
||||
template <SortOrder kOrder, class D>
|
||||
HWY_INLINE void SortDistance8LanesV(D d, Vec<D>& v) {
|
||||
Vec<D> swapped = ConcatLowerUpper(d, v, v);
|
||||
SortLanesIn2Vectors<kOrder>(v, swapped);
|
||||
v = ConcatUpperLower(swapped, v);
|
||||
}
|
||||
|
||||
// 120 ops. Only used if vectors are at least 8 lanes.
|
||||
template <SortOrder kOrder, class D, class V = Vec<D>>
|
||||
HWY_INLINE void BitonicMergeTo64(D d, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
|
||||
V& v6, V& v7, int caller) {
|
||||
#if HWY_SORT_VERIFY
|
||||
const verify::Runs<D> input =
|
||||
verify::StoreVectors(d, v0, v1, v2, v3, v4, v5, v6, v7);
|
||||
if (caller == -1) input.CheckBitonic(__LINE__, __LINE__);
|
||||
#endif
|
||||
|
||||
// Layer 1: lane stride 32
|
||||
SortLanesIn2Vectors<kOrder>(v0, v4);
|
||||
SortLanesIn2Vectors<kOrder>(v1, v5);
|
||||
SortLanesIn2Vectors<kOrder>(v2, v6);
|
||||
SortLanesIn2Vectors<kOrder>(v3, v7);
|
||||
|
||||
// Layer 2: lane stride 16
|
||||
SortLanesIn2Vectors<kOrder>(v0, v2);
|
||||
SortLanesIn2Vectors<kOrder>(v1, v3);
|
||||
SortLanesIn2Vectors<kOrder>(v4, v6);
|
||||
SortLanesIn2Vectors<kOrder>(v5, v7);
|
||||
|
||||
// Layer 3: lane stride 8
|
||||
SortLanesIn2Vectors<kOrder>(v0, v1);
|
||||
SortLanesIn2Vectors<kOrder>(v2, v3);
|
||||
SortLanesIn2Vectors<kOrder>(v4, v5);
|
||||
SortLanesIn2Vectors<kOrder>(v6, v7);
|
||||
|
||||
// Layer 4: lane stride 4
|
||||
SortDistance4LanesV<kOrder>(d, v0);
|
||||
SortDistance4LanesV<kOrder>(d, v1);
|
||||
SortDistance4LanesV<kOrder>(d, v2);
|
||||
SortDistance4LanesV<kOrder>(d, v3);
|
||||
SortDistance4LanesV<kOrder>(d, v4);
|
||||
SortDistance4LanesV<kOrder>(d, v5);
|
||||
SortDistance4LanesV<kOrder>(d, v6);
|
||||
SortDistance4LanesV<kOrder>(d, v7);
|
||||
|
||||
// Layer 5: lane stride 2
|
||||
SortDistance2LanesQV<kOrder>(d, v0);
|
||||
SortDistance2LanesQV<kOrder>(d, v1);
|
||||
SortDistance2LanesQV<kOrder>(d, v2);
|
||||
SortDistance2LanesQV<kOrder>(d, v3);
|
||||
SortDistance2LanesQV<kOrder>(d, v4);
|
||||
SortDistance2LanesQV<kOrder>(d, v5);
|
||||
SortDistance2LanesQV<kOrder>(d, v6);
|
||||
SortDistance2LanesQV<kOrder>(d, v7);
|
||||
|
||||
// Layer 6: lane stride 1
|
||||
SortAdjacentLanesQV<kOrder>(d, v0);
|
||||
SortAdjacentLanesQV<kOrder>(d, v1);
|
||||
SortAdjacentLanesQV<kOrder>(d, v2);
|
||||
SortAdjacentLanesQV<kOrder>(d, v3);
|
||||
SortAdjacentLanesQV<kOrder>(d, v4);
|
||||
SortAdjacentLanesQV<kOrder>(d, v5);
|
||||
SortAdjacentLanesQV<kOrder>(d, v6);
|
||||
SortAdjacentLanesQV<kOrder>(d, v7);
|
||||
|
||||
#if HWY_SORT_VERIFY
|
||||
const verify::Runs<D> output =
|
||||
verify::StoreVectors(d, v0, v1, v2, v3, v4, v5, v6, v7);
|
||||
output.CheckSorted(kOrder, __LINE__, caller);
|
||||
#endif
|
||||
}

// 60 ops. Only used if vectors are at least 16 lanes.
template <SortOrder kOrder, class D, class V = Vec<D>>
HWY_INLINE void BitonicMergeTo64(D d, V& v0, V& v1, V& v2, V& v3, int caller) {
#if HWY_SORT_VERIFY
  const verify::Runs<D> input = verify::StoreVectors(d, v0, v1, v2, v3);
  if (caller == -1) input.CheckBitonic(__LINE__, __LINE__);
#endif

  // Layer 1: lane stride 32
  SortLanesIn2Vectors<kOrder>(v0, v2);
  SortLanesIn2Vectors<kOrder>(v1, v3);

  // Layer 2: lane stride 16
  SortLanesIn2Vectors<kOrder>(v0, v1);
  SortLanesIn2Vectors<kOrder>(v2, v3);

  // Layer 3: lane stride 8
  SortDistance8LanesV<kOrder>(d, v0);
  SortDistance8LanesV<kOrder>(d, v1);
  SortDistance8LanesV<kOrder>(d, v2);
  SortDistance8LanesV<kOrder>(d, v3);

  // Layer 4: lane stride 4
  SortDistance4LanesV<kOrder>(d, v0);
  SortDistance4LanesV<kOrder>(d, v1);
  SortDistance4LanesV<kOrder>(d, v2);
  SortDistance4LanesV<kOrder>(d, v3);

  // Layer 5: lane stride 2
  SortDistance2LanesQV<kOrder>(d, v0);
  SortDistance2LanesQV<kOrder>(d, v1);
  SortDistance2LanesQV<kOrder>(d, v2);
  SortDistance2LanesQV<kOrder>(d, v3);

  // Layer 6: lane stride 1
  SortAdjacentLanesQV<kOrder>(d, v0);
  SortAdjacentLanesQV<kOrder>(d, v1);
  SortAdjacentLanesQV<kOrder>(d, v2);
  SortAdjacentLanesQV<kOrder>(d, v3);

#if HWY_SORT_VERIFY
  const verify::Runs<D> output = verify::StoreVectors(d, v0, v1, v2, v3);
  output.CheckSorted(kOrder, __LINE__, caller);
#endif
}

// 128 ops. Only used if vectors are at least 16 lanes.
template <SortOrder kOrder, class D, class V = Vec<D>>
HWY_INLINE void BitonicMergeTo128(D d, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
                                  V& v6, V& v7, int caller) {
#if HWY_SORT_VERIFY
  const verify::Runs<D> input =
      verify::StoreVectors(d, v0, v1, v2, v3, v4, v5, v6, v7);
  if (caller == -1) input.CheckBitonic(__LINE__, __LINE__);
#endif

  // Layer 1: lane stride 64
  SortLanesIn2Vectors<kOrder>(v0, v4);
  SortLanesIn2Vectors<kOrder>(v1, v5);
  SortLanesIn2Vectors<kOrder>(v2, v6);
  SortLanesIn2Vectors<kOrder>(v3, v7);

  BitonicMergeTo64<kOrder>(d, v0, v1, v2, v3, __LINE__);
  BitonicMergeTo64<kOrder>(d, v4, v5, v6, v7, __LINE__);

#if HWY_SORT_VERIFY
  const verify::Runs<D> output =
      verify::StoreVectors(d, v0, v1, v2, v3, v4, v5, v6, v7);
  output.CheckSorted(kOrder, __LINE__, caller);
#endif
}

// ------------------------------ Vector-length dependent

// Only called when N=4 (single block, so quartets can just be stored).
template <SortOrder kOrder, class D, class V>
HWY_API size_t SingleQuartetPerVector(D d, V& q0, V& q1, V& q2, V& q3, V& q4,
                                      V& q5, V& q6, V& q7, TFromD<D>* inout) {
  Store(q0, d, inout + 0 * 4);
  Store(q1, d, inout + 1 * 4);
  Store(q2, d, inout + 2 * 4);
  Store(q3, d, inout + 3 * 4);
  Store(q4, d, inout + 4 * 4);
  Store(q5, d, inout + 5 * 4);
  Store(q6, d, inout + 6 * 4);
  Store(q7, d, inout + 7 * 4);
  return 8 * 4;
}

// Only called when N=8.
template <SortOrder kOrder, class D, class V>
HWY_API size_t TwoQuartetsPerVector(D d, V& q0, V& q1, V& q2, V& q3, V& q4,
                                    V& q5, V& q6, V& q7, TFromD<D>* inout) {
  V v0 = ConcatLowerLower(d, q1, q0);
  V v1 = ConcatLowerLower(d, q3, q2);
  V v2 = ConcatLowerLower(d, q5, q4);
  V v3 = ConcatLowerLower(d, q7, q6);
  // TODO(janwas): merge into single table
  V v4 = Reverse(d, ConcatUpperUpper(d, q7, q6));
  V v5 = Reverse(d, ConcatUpperUpper(d, q5, q4));
  V v6 = Reverse(d, ConcatUpperUpper(d, q3, q2));
  V v7 = Reverse(d, ConcatUpperUpper(d, q1, q0));
  detail::BitonicMergeTo64<kOrder>(d, v0, v1, v2, v3, v4, v5, v6, v7, -1);

  Store(v0, d, inout + 0 * 8);
  Store(v1, d, inout + 1 * 8);
  Store(v2, d, inout + 2 * 8);
  Store(v3, d, inout + 3 * 8);
  Store(v4, d, inout + 4 * 8);
  Store(v5, d, inout + 5 * 8);
  Store(v6, d, inout + 6 * 8);
  Store(v7, d, inout + 7 * 8);
  return 8 * 8;
}

// Only called when N=16.
template <SortOrder kOrder, typename T, class V>
HWY_API size_t FourQuartetsPerVector(Simd<T, 16> d, V& q0, V& q1, V& q2, V& q3,
                                     V& q4, V& q5, V& q6, V& q7, T* inout) {
  const V q11_01_10_00 = Shuffle128_2020(q0, q1);
  const V q13_03_12_02 = Shuffle128_2020(q2, q3);
  V v0 = Shuffle128_2020(q11_01_10_00, q13_03_12_02);  // 3..0

  const V q15_05_14_04 = Shuffle128_2020(q4, q5);
  const V q17_07_16_06 = Shuffle128_2020(q6, q7);
  V v1 = Shuffle128_2020(q15_05_14_04, q17_07_16_06);  // 7..4

  const V q19_09_18_08 = Shuffle128_3131(q0, q1);
  const V q1b_0b_1a_0a = Shuffle128_3131(q2, q3);
  V v3 = Reverse(d, Shuffle128_2020(q19_09_18_08, q1b_0b_1a_0a));  // b..8

  const V q1d_0d_1c_0c = Shuffle128_3131(q4, q5);
  const V q1f_0f_1e_0e = Shuffle128_3131(q6, q7);
  V v2 = Reverse(d, Shuffle128_2020(q1d_0d_1c_0c, q1f_0f_1e_0e));  // f..c

  detail::BitonicMergeTo64<kOrder>(d, v0, v1, v2, v3, -1);

  // TODO(janwas): merge into single table
  V v4 = Shuffle128_3131(q11_01_10_00, q13_03_12_02);  // 13..10
  V v5 = Shuffle128_3131(q15_05_14_04, q17_07_16_06);  // 17..14
  V v7 = Reverse(d, Shuffle128_3131(q19_09_18_08, q1b_0b_1a_0a));  // 1b..18
  V v6 = Reverse(d, Shuffle128_3131(q1d_0d_1c_0c, q1f_0f_1e_0e));  // 1f..1c

  detail::BitonicMergeTo64<Reverse(kOrder)>(d, v4, v5, v6, v7, -1);

  detail::BitonicMergeTo128<kOrder>(d, v0, v1, v2, v3, v4, v5, v6, v7, -1);

  Store(v0, d, inout + 0 * 16);
  Store(v1, d, inout + 1 * 16);
  Store(v2, d, inout + 2 * 16);
  Store(v3, d, inout + 3 * 16);
  Store(v4, d, inout + 4 * 16);
  Store(v5, d, inout + 5 * 16);
  Store(v6, d, inout + 6 * 16);
  Store(v7, d, inout + 7 * 16);
  return 8 * 16;
}

// Avoid needing #if at the call sites.
template <SortOrder kOrder, typename T>
HWY_API size_t TwoQuartetsPerVector(Simd<T, 4> /* tag */, ...) {
  return 0;
}

template <SortOrder kOrder, typename T>
HWY_API size_t FourQuartetsPerVector(Simd<T, 4> /* tag */, ...) {
  return 0;
}
template <SortOrder kOrder, typename T>
HWY_API size_t FourQuartetsPerVector(Simd<T, 8> /* tag */, ...) {
  return 0;
}

}  // namespace detail

template <class D>
HWY_API size_t SortBatchSize(D d) {
  const size_t N = Lanes(d);
  if (N == 4) return 32;
  if (N == 8) return 64;
  if (N == 16) return 128;
  return 0;
}

template <SortOrder kOrder, class D>
HWY_API size_t SortBatch(D d, TFromD<D>* inout) {
  const size_t N = Lanes(d);

  Vec<D> q0, q1, q2, q3;
  detail::SortLanesIn4Vectors<kOrder>(d, inout, q0, q1, q2, q3);
  detail::Transpose4x4(d, q0, q1, q2, q3);
  detail::Merge2SortedQuartets<kOrder>(d, q0, q1, -1);
  detail::Merge2SortedQuartets<kOrder>(d, q2, q3, -1);

  // Bitonic merges require one input to be in reverse order.
  constexpr SortOrder kReverse = Reverse(kOrder);

  Vec<D> q4, q5, q6, q7;
  detail::SortLanesIn4Vectors<kReverse>(d, inout + 4 * N, q4, q5, q6, q7);
  detail::Transpose4x4(d, q4, q5, q6, q7);
  detail::Merge2SortedQuartets<kReverse>(d, q4, q5, -1);
  detail::Merge2SortedQuartets<kReverse>(d, q6, q7, -1);

  detail::BitonicMerge4Quartets<kOrder>(d, q0, q1, q4, q5, -1);
  detail::BitonicMerge4Quartets<kReverse>(d, q2, q3, q6, q7, -1);

  detail::BitonicMerge8Quartets<kOrder>(d, q0, q1, q4, q5, q2, q3, q6, q7,
                                        __LINE__);

  if (N == 4) {
    return detail::SingleQuartetPerVector<kOrder>(d, q0, q1, q4, q5, q2, q3, q6,
                                                  q7, inout);
  }

  if (N == 8) {
    return detail::TwoQuartetsPerVector<kOrder>(d, q0, q1, q4, q5, q2, q3, q6,
                                                q7, inout);
  }

  return detail::FourQuartetsPerVector<kOrder>(d, q0, q1, q4, q5, q2, q3, q6,
                                               q7, inout);
}

#else

// Avoids unused attribute warning
template <SortOrder kOrder, class D>
HWY_API size_t SortBatch(D /* tag */, TFromD<D>* /* inout */) {
  return 0;
}

#endif  // HWY_TARGET != HWY_SCALAR && HWY_ARCH_X86

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#endif  // HIGHWAY_HWY_CONTRIB_SORT_SORT_INL_H_
@ -0,0 +1,188 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/sort_test.cc"
#include "hwy/foreach_target.h"

#include "hwy/contrib/sort/sort-inl.h"
#include "hwy/tests/test_util-inl.h"
// clang-format on

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

#if HWY_TARGET != HWY_SCALAR && HWY_ARCH_X86

template <class D>
size_t K(D d) {
  return SortBatchSize(d);
}

template <SortOrder kOrder, class D>
void Validate(D d, const TFromD<D>* in, const TFromD<D>* out) {
  const size_t N = Lanes(d);
  // Ensure it matches the sort order
  for (size_t i = 0; i < K(d) - 1; ++i) {
    if (!verify::Compare(out[i], out[i + 1], kOrder)) {
      printf("range=%" PRIu64 " lane=%" PRIu64 " N=%" PRIu64 " %.0f %.0f\n\n",
             static_cast<uint64_t>(i), static_cast<uint64_t>(i),
             static_cast<uint64_t>(N), static_cast<float>(out[i + 0]),
             static_cast<float>(out[i + 1]));
      for (size_t i = 0; i < K(d); ++i) {
        printf("%.0f\n", static_cast<float>(out[i]));
      }

      printf("\n\nin was:\n");
      for (size_t i = 0; i < K(d); ++i) {
        printf("%.0f\n", static_cast<float>(in[i]));
      }
      fflush(stdout);
      HWY_ABORT("Sort is incorrect");
    }
  }

  // Also verify sums match (detects duplicated/lost values)
  double expected_sum = 0.0;
  double actual_sum = 0.0;
  for (size_t i = 0; i < K(d); ++i) {
    expected_sum += in[i];
    actual_sum += out[i];
  }
  if (expected_sum != actual_sum) {
    for (size_t i = 0; i < K(d); ++i) {
      printf("%.0f %.0f\n", static_cast<float>(in[i]),
             static_cast<float>(out[i]));
    }
    HWY_ABORT("Mismatch");
  }
}

class TestReverse {
  template <SortOrder kOrder, class D>
  void TestOrder(D d, RandomState& /* rng */) {
    using T = TFromD<D>;
    const size_t N = Lanes(d);
    HWY_ASSERT((N % 4) == 0);
    auto in = AllocateAligned<T>(K(d));
    auto inout = AllocateAligned<T>(K(d));

    const size_t expected_size = SortBatchSize(d);

    for (size_t i = 0; i < K(d); ++i) {
      in[i] = static_cast<T>(K(d) - i);
      inout[i] = in[i];
    }

    const size_t actual_size = SortBatch<kOrder>(d, inout.get());
    HWY_ASSERT_EQ(expected_size, actual_size);
    Validate<kOrder>(d, in.get(), inout.get());
  }

 public:
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    RandomState rng;
    TestOrder<SortOrder::kAscending>(d, rng);
    TestOrder<SortOrder::kDescending>(d, rng);
  }
};

void TestAllReverse() {
  TestReverse test;
  test(int32_t(), CappedTag<int32_t, 16>());
  test(uint32_t(), CappedTag<uint32_t, 16>());
}

class TestRanges {
  template <SortOrder kOrder, class D>
  void TestOrder(D d, RandomState& rng) {
    using T = TFromD<D>;
    const size_t N = Lanes(d);
    HWY_ASSERT((N % 4) == 0);
    auto in = AllocateAligned<T>(K(d));
    auto inout = AllocateAligned<T>(K(d));

    const size_t expected_size = SortBatchSize(d);

    // For each range, try all 0/1 combinations and set any other lanes to
    // random inputs.
    constexpr size_t kRange = 8;
    for (size_t range = 0; range < K(d); range += kRange) {
      for (size_t bits = 0; bits < (1ull << kRange); ++bits) {
        // First set all to random, will later overwrite those for `range`
        for (size_t i = 0; i < K(d); ++i) {
          in[i] = inout[i] = static_cast<T>(Random32(&rng) & 0xFF);
        }
        // Now set the current combination of {0,1} for elements in the range.
        // This is sufficient to establish correctness (arbitrary inputs could
        // be mapped to 0/1 with a comparison predicate).
        for (size_t i = 0; i < kRange; ++i) {
          in[range + i] = inout[range + i] = (bits >> i) & 1;
        }

        const size_t actual_size = SortBatch<kOrder>(d, inout.get());
        HWY_ASSERT_EQ(expected_size, actual_size);
        Validate<kOrder>(d, in.get(), inout.get());
      }
    }
  }

 public:
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    RandomState rng;
    TestOrder<SortOrder::kAscending>(d, rng);
    TestOrder<SortOrder::kDescending>(d, rng);
  }
};

void TestAllRanges() {
  TestRanges test;
  test(int32_t(), CappedTag<int32_t, 16>());
  test(uint32_t(), CappedTag<uint32_t, 16>());
}

#else
void TestAllReverse() {}
void TestAllRanges() {}
#endif  // HWY_TARGET != HWY_SCALAR && HWY_ARCH_X86

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE

namespace hwy {
HWY_BEFORE_TEST(SortTest);
HWY_EXPORT_AND_TEST_P(SortTest, TestAllReverse);
HWY_EXPORT_AND_TEST_P(SortTest, TestAllRanges);
}  // namespace hwy

// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}

#endif
@ -0,0 +1,194 @@
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef HIGHWAY_HWY_DETECT_COMPILER_ARCH_H_
#define HIGHWAY_HWY_DETECT_COMPILER_ARCH_H_

// Detects compiler and arch from predefined macros. Zero dependencies for
// inclusion by foreach_target.h.

// Add to #if conditions to prevent IDE from graying out code.
#if (defined __CDT_PARSER__) || (defined __INTELLISENSE__) || \
    (defined Q_CREATOR_RUN) || (defined(__CLANGD__))
#define HWY_IDE 1
#else
#define HWY_IDE 0
#endif

//------------------------------------------------------------------------------
// Compiler

// clang-cl defines _MSC_VER but doesn't behave like MSVC in other aspects like
// used in HWY_DIAGNOSTICS(). We include a check that we are not clang for that
// purpose.
#if defined(_MSC_VER) && !defined(__clang__)
#define HWY_COMPILER_MSVC _MSC_VER
#else
#define HWY_COMPILER_MSVC 0
#endif

#ifdef __INTEL_COMPILER
#define HWY_COMPILER_ICC __INTEL_COMPILER
#else
#define HWY_COMPILER_ICC 0
#endif

#ifdef __GNUC__
#define HWY_COMPILER_GCC (__GNUC__ * 100 + __GNUC_MINOR__)
#else
#define HWY_COMPILER_GCC 0
#endif

// Clang can masquerade as MSVC/GCC, in which case both are set.
#ifdef __clang__
#ifdef __APPLE__
// Apple LLVM version is unrelated to the actual Clang version, which we need
// for enabling workarounds. Use the presence of warning flags to deduce it.
// Adapted from https://github.com/simd-everywhere/simde/ simde-detect-clang.h.
#if __has_warning("-Wformat-insufficient-args")
#define HWY_COMPILER_CLANG 1200
#elif __has_warning("-Wimplicit-const-int-float-conversion")
#define HWY_COMPILER_CLANG 1100
#elif __has_warning("-Wmisleading-indentation")
#define HWY_COMPILER_CLANG 1000
#elif defined(__FILE_NAME__)
#define HWY_COMPILER_CLANG 900
#elif __has_warning("-Wextra-semi-stmt") || \
    __has_builtin(__builtin_rotateleft32)
#define HWY_COMPILER_CLANG 800
#elif __has_warning("-Wc++98-compat-extra-semi")
#define HWY_COMPILER_CLANG 700
#else  // Anything older than 7.0 is not recommended for Highway.
#define HWY_COMPILER_CLANG 600
#endif  // __has_warning chain
#else  // Non-Apple: normal version
#define HWY_COMPILER_CLANG (__clang_major__ * 100 + __clang_minor__)
#endif
#else  // Not clang
#define HWY_COMPILER_CLANG 0
#endif
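[Editorial note: downstream code can then gate version-specific workarounds on the deduced value, regardless of whether the host is Apple Clang or upstream Clang. A hypothetical usage sketch; the macro name is invented for illustration.]

// Hypothetical user-side gate: apply a workaround only for Clang before 9.0.
// HWY_COMPILER_CLANG is 0 on non-Clang compilers, so the gate stays off there.
#if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 900
#define MYAPP_WORKAROUND_OLD_CLANG 1
#else
#define MYAPP_WORKAROUND_OLD_CLANG 0
#endif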

// More than one may be nonzero, but we want at least one.
#if !HWY_COMPILER_MSVC && !HWY_COMPILER_ICC && !HWY_COMPILER_GCC && \
    !HWY_COMPILER_CLANG
#error "Unsupported compiler"
#endif

#ifdef __has_builtin
#define HWY_HAS_BUILTIN(name) __has_builtin(name)
#else
#define HWY_HAS_BUILTIN(name) 0
#endif

#ifdef __has_attribute
#define HWY_HAS_ATTRIBUTE(name) __has_attribute(name)
#else
#define HWY_HAS_ATTRIBUTE(name) 0
#endif

#ifdef __has_feature
#define HWY_HAS_FEATURE(name) __has_feature(name)
#else
#define HWY_HAS_FEATURE(name) 0
#endif

//------------------------------------------------------------------------------
// Architecture

#if defined(HWY_EMULATE_SVE)

#define HWY_ARCH_X86_32 0
#define HWY_ARCH_X86_64 0
#define HWY_ARCH_X86 0
#define HWY_ARCH_PPC 0
#define HWY_ARCH_ARM_A64 1
#define HWY_ARCH_ARM_V7 0
#define HWY_ARCH_ARM 1
#define HWY_ARCH_WASM 0
#define HWY_ARCH_RVV 0

#else

#if defined(__i386__) || defined(_M_IX86)
#define HWY_ARCH_X86_32 1
#else
#define HWY_ARCH_X86_32 0
#endif

#if defined(__x86_64__) || defined(_M_X64)
#define HWY_ARCH_X86_64 1
#else
#define HWY_ARCH_X86_64 0
#endif

#if HWY_ARCH_X86_32 && HWY_ARCH_X86_64
#error "Cannot have both x86-32 and x86-64"
#endif

#if HWY_ARCH_X86_32 || HWY_ARCH_X86_64
#define HWY_ARCH_X86 1
#else
#define HWY_ARCH_X86 0
#endif

#if defined(__powerpc64__) || defined(_M_PPC)
#define HWY_ARCH_PPC 1
#else
#define HWY_ARCH_PPC 0
#endif

#if defined(__ARM_ARCH_ISA_A64) || defined(__aarch64__) || defined(_M_ARM64)
#define HWY_ARCH_ARM_A64 1
#else
#define HWY_ARCH_ARM_A64 0
#endif

#if defined(__arm__) || defined(_M_ARM)
#define HWY_ARCH_ARM_V7 1
#else
#define HWY_ARCH_ARM_V7 0
#endif

#if HWY_ARCH_ARM_A64 && HWY_ARCH_ARM_V7
#error "Cannot have both A64 and V7"
#endif

#if HWY_ARCH_ARM_A64 || HWY_ARCH_ARM_V7
#define HWY_ARCH_ARM 1
#else
#define HWY_ARCH_ARM 0
#endif

#if defined(__EMSCRIPTEN__) || defined(__wasm__) || defined(__WASM__)
#define HWY_ARCH_WASM 1
#else
#define HWY_ARCH_WASM 0
#endif

#ifdef __riscv
#define HWY_ARCH_RVV 1
#else
#define HWY_ARCH_RVV 0
#endif

#endif  // defined(HWY_EMULATE_SVE)

// It is an error to detect multiple architectures at the same time, but OK to
// detect none of the above.
#if (HWY_ARCH_X86 + HWY_ARCH_PPC + HWY_ARCH_ARM + HWY_ARCH_WASM + \
     HWY_ARCH_RVV) > 1
#error "Must not detect more than one architecture"
#endif

#endif  // HIGHWAY_HWY_DETECT_COMPILER_ARCH_H_
@ -0,0 +1,392 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef HIGHWAY_HWY_DETECT_TARGETS_H_
#define HIGHWAY_HWY_DETECT_TARGETS_H_

// Defines targets and chooses which to enable.

#include "hwy/detect_compiler_arch.h"

//------------------------------------------------------------------------------
// Optional configuration

// See ../quick_reference.md for documentation of these macros.

// Uncomment to override the default baseline determined from predefined macros:
// #define HWY_BASELINE_TARGETS (HWY_SSE4 | HWY_SCALAR)

// Uncomment to override the default blocklist:
// #define HWY_BROKEN_TARGETS HWY_AVX3

// Uncomment to definitely avoid generating those target(s):
// #define HWY_DISABLED_TARGETS HWY_SSE4

// Uncomment to avoid emitting BMI/BMI2/FMA instructions (allows generating
// AVX2 target for VMs which support AVX2 but not the other instruction sets)
// #define HWY_DISABLE_BMI2_FMA

//------------------------------------------------------------------------------
// Targets

// Unique bit value for each target. A lower value is "better" (e.g. more lanes)
// than a higher value within the same group/platform - see HWY_STATIC_TARGET.
//
// All values are unconditionally defined so we can test HWY_TARGETS without
// first checking the HWY_ARCH_*.
//
// The C99 preprocessor evaluates #if expressions using intmax_t types, so we
// can use 32-bit literals.

// 1,2: reserved

// Currently satisfiable by Ice Lake (VNNI, VPCLMULQDQ, VBMI2, VAES). Later to
// be added: BF16 (Cooper Lake). VP2INTERSECT is only in Tiger Lake? We do not
// yet have uses for VBMI, VPOPCNTDQ, BITALG, GFNI.
#define HWY_AVX3_DL 4  // see HWY_WANT_AVX3_DL below
#define HWY_AVX3 8
#define HWY_AVX2 16
// 32: reserved for AVX
#define HWY_SSE4 64
#define HWY_SSSE3 128
// 0x100, 0x200: reserved for SSE3, SSE2

// The highest bit in the HWY_TARGETS mask that an x86 target can have. Used for
// dynamic dispatch. All x86 target bits must be lower or equal to
// (1 << HWY_HIGHEST_TARGET_BIT_X86) and they can only use
// HWY_MAX_DYNAMIC_TARGETS in total.
#define HWY_HIGHEST_TARGET_BIT_X86 9

#define HWY_SVE2 0x400
#define HWY_SVE 0x800
// 0x1000 reserved for Helium
#define HWY_NEON 0x2000

#define HWY_HIGHEST_TARGET_BIT_ARM 13

// 0x4000, 0x8000 reserved
#define HWY_PPC8 0x10000  // v2.07 or 3
// 0x20000, 0x40000 reserved for prior VSX/AltiVec

#define HWY_HIGHEST_TARGET_BIT_PPC 18

#define HWY_WASM2 0x80000  // Experimental
#define HWY_WASM 0x100000

#define HWY_HIGHEST_TARGET_BIT_WASM 20

// 0x200000, 0x400000, 0x800000 reserved

#define HWY_RVV 0x1000000

#define HWY_HIGHEST_TARGET_BIT_RVV 24

// 0x2000000, 0x4000000, 0x8000000, 0x10000000 reserved

#define HWY_SCALAR 0x20000000

#define HWY_HIGHEST_TARGET_BIT_SCALAR 29

// Cannot use higher values, otherwise HWY_TARGETS computation might overflow.

//------------------------------------------------------------------------------
// Set default blocklists

// Disabled means excluded from enabled at user's request. A separate config
// macro allows disabling without deactivating the blocklist below.
#ifndef HWY_DISABLED_TARGETS
#define HWY_DISABLED_TARGETS 0
#endif

// Broken means excluded from enabled due to known compiler issues. Allow the
// user to override this blocklist without any guarantee of success.
#ifndef HWY_BROKEN_TARGETS

// x86 clang-6: we saw multiple AVX2/3 compile errors and in one case invalid
// SSE4 codegen (possibly only for msan), so disable all those targets.
#if HWY_ARCH_X86 && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
#define HWY_BROKEN_TARGETS (HWY_SSE4 | HWY_AVX2 | HWY_AVX3 | HWY_AVX3_DL)
// This entails a major speed reduction, so warn unless the user explicitly
// opts in to scalar-only.
#if !defined(HWY_COMPILE_ONLY_SCALAR)
#pragma message("x86 Clang <= 6: define HWY_COMPILE_ONLY_SCALAR or upgrade.")
#endif

// 32-bit may fail to compile AVX2/3.
#elif HWY_ARCH_X86_32
#define HWY_BROKEN_TARGETS (HWY_AVX2 | HWY_AVX3 | HWY_AVX3_DL)

// MSVC AVX3 support is buggy: https://github.com/Mysticial/Flops/issues/16
#elif HWY_COMPILER_MSVC != 0
#define HWY_BROKEN_TARGETS (HWY_AVX3 | HWY_AVX3_DL)

// armv7be has not been tested and is not yet supported.
#elif HWY_ARCH_ARM_V7 && \
    (defined(__ARM_BIG_ENDIAN) || \
     (defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN))
#define HWY_BROKEN_TARGETS (HWY_NEON)

// SVE[2] require recent clang or gcc versions.
#elif (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1100) || \
    (!HWY_COMPILER_CLANG && HWY_COMPILER_GCC && HWY_COMPILER_GCC < 1000)
#define HWY_BROKEN_TARGETS (HWY_SVE | HWY_SVE2)

#else
#define HWY_BROKEN_TARGETS 0
#endif

#endif  // HWY_BROKEN_TARGETS

// Enabled means not disabled nor blocklisted.
#define HWY_ENABLED(targets) \
  ((targets) & ~((HWY_DISABLED_TARGETS) | (HWY_BROKEN_TARGETS)))
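[Editorial note: a quick worked example of this mask, using the bit values defined above and a hypothetical configuration.]

// With HWY_DISABLED_TARGETS = HWY_SSE4 (64) and HWY_BROKEN_TARGETS = 0,
// HWY_ENABLED(HWY_SSE4 | HWY_AVX2) reduces to HWY_AVX2 (16).
static_assert(((64 | 16) & ~(64 | 0)) == 16, "SSE4 is masked out");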

//------------------------------------------------------------------------------
// Detect baseline targets using predefined macros

// Baseline means the targets for which the compiler is allowed to generate
// instructions, implying the target CPU would have to support them. Do not use
// this directly because it does not take the blocklist into account. Allow the
// user to override this without any guarantee of success.
#ifndef HWY_BASELINE_TARGETS

#if defined(HWY_EMULATE_SVE)
#define HWY_BASELINE_TARGETS HWY_SVE  // does not support SVE2
#define HWY_BASELINE_AVX3_DL 0
#else

// Also check HWY_ARCH to ensure that simulating unknown platforms ends up with
// HWY_TARGET == HWY_SCALAR.

#if HWY_ARCH_WASM && defined(__wasm_simd128__)
#if defined(HWY_WANT_WASM2)
#define HWY_BASELINE_WASM HWY_WASM2
#else
#define HWY_BASELINE_WASM HWY_WASM
#endif  // HWY_WANT_WASM2
#else
#define HWY_BASELINE_WASM 0
#endif

// Avoid choosing the PPC target until we have an implementation.
#if HWY_ARCH_PPC && defined(__VSX__) && 0
#define HWY_BASELINE_PPC8 HWY_PPC8
#else
#define HWY_BASELINE_PPC8 0
#endif

// SVE compiles, but is not yet tested.
#if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE2)
#define HWY_BASELINE_SVE2 HWY_SVE2
#else
#define HWY_BASELINE_SVE2 0
#endif

#if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE)
#define HWY_BASELINE_SVE HWY_SVE
#else
#define HWY_BASELINE_SVE 0
#endif

// GCC 4.5.4 only defines __ARM_NEON__; 5.4 defines both.
#if HWY_ARCH_ARM && (defined(__ARM_NEON__) || defined(__ARM_NEON))
#define HWY_BASELINE_NEON HWY_NEON
#else
#define HWY_BASELINE_NEON 0
#endif

// Special handling for MSVC because it has fewer predefined macros
#if HWY_COMPILER_MSVC && !HWY_COMPILER_CLANG

// We can only be sure SSSE3/SSE4 are enabled if AVX is
// (https://stackoverflow.com/questions/18563978/)
#if defined(__AVX__)
#define HWY_CHECK_SSSE3 1
#define HWY_CHECK_SSE4 1
#else
#define HWY_CHECK_SSSE3 0
#define HWY_CHECK_SSE4 0
#endif

// Cannot check for PCLMUL/AES and BMI2/FMA/F16C individually; we assume
// PCLMUL/AES are available if SSE4 is, and BMI2/FMA/F16C if AVX2 is.
#define HWY_CHECK_PCLMUL_AES 1
#define HWY_CHECK_BMI2_FMA 1
#define HWY_CHECK_F16C 1

#else  // non-MSVC

#if defined(__SSSE3__)
#define HWY_CHECK_SSSE3 1
#else
#define HWY_CHECK_SSSE3 0
#endif

#if defined(__SSE4_1__) && defined(__SSE4_2__)
#define HWY_CHECK_SSE4 1
#else
#define HWY_CHECK_SSE4 0
#endif

// If these are disabled, they should not gate the availability of SSE4/AVX2.
#if defined(HWY_DISABLE_PCLMUL_AES) || (defined(__PCLMUL__) && defined(__AES__))
#define HWY_CHECK_PCLMUL_AES 1
#else
#define HWY_CHECK_PCLMUL_AES 0
#endif

#if defined(HWY_DISABLE_BMI2_FMA) || (defined(__BMI2__) && defined(__FMA__))
#define HWY_CHECK_BMI2_FMA 1
#else
#define HWY_CHECK_BMI2_FMA 0
#endif

#if defined(HWY_DISABLE_F16C) || defined(__F16C__)
#define HWY_CHECK_F16C 1
#else
#define HWY_CHECK_F16C 0
#endif

#endif  // non-MSVC

#if HWY_ARCH_X86 && HWY_CHECK_SSSE3
#define HWY_BASELINE_SSSE3 HWY_SSSE3
#else
#define HWY_BASELINE_SSSE3 0
#endif

#if HWY_ARCH_X86 && HWY_CHECK_SSE4 && HWY_CHECK_PCLMUL_AES
#define HWY_BASELINE_SSE4 HWY_SSE4
#else
#define HWY_BASELINE_SSE4 0
#endif

#if HWY_BASELINE_SSE4 != 0 && HWY_CHECK_BMI2_FMA && HWY_CHECK_F16C && \
    defined(__AVX2__)
#define HWY_BASELINE_AVX2 HWY_AVX2
#else
#define HWY_BASELINE_AVX2 0
#endif

// Require everything in AVX2 plus AVX-512 flags (also set by MSVC)
#if HWY_BASELINE_AVX2 != 0 && defined(__AVX512F__) && defined(__AVX512BW__) && \
    defined(__AVX512DQ__) && defined(__AVX512VL__)
#define HWY_BASELINE_AVX3 HWY_AVX3
#else
#define HWY_BASELINE_AVX3 0
#endif

// TODO(janwas): not yet known whether these will be set by MSVC
#if HWY_BASELINE_AVX3 != 0 && defined(__AVXVNNI__) && defined(__VAES__) && \
    defined(__VPCLMULQDQ__)
#define HWY_BASELINE_AVX3_DL HWY_AVX3_DL
#else
#define HWY_BASELINE_AVX3_DL 0
#endif

#if HWY_ARCH_RVV && defined(__riscv_vector)
#define HWY_BASELINE_RVV HWY_RVV
#else
#define HWY_BASELINE_RVV 0
#endif

#define HWY_BASELINE_TARGETS \
  (HWY_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 | HWY_BASELINE_SVE2 | \
   HWY_BASELINE_SVE | HWY_BASELINE_NEON | HWY_BASELINE_SSSE3 | \
   HWY_BASELINE_SSE4 | HWY_BASELINE_AVX2 | HWY_BASELINE_AVX3 | \
   HWY_BASELINE_AVX3_DL | HWY_BASELINE_RVV)

#endif  // HWY_EMULATE_SVE

#else
// User already defined HWY_BASELINE_TARGETS, but we still need to define
// HWY_BASELINE_AVX3_DL (matching the user's definition) for HWY_CHECK_AVX3_DL.
#define HWY_BASELINE_AVX3_DL (HWY_BASELINE_TARGETS & HWY_AVX3_DL)
#endif  // HWY_BASELINE_TARGETS

//------------------------------------------------------------------------------
// Choose target for static dispatch

#define HWY_ENABLED_BASELINE HWY_ENABLED(HWY_BASELINE_TARGETS)
#if HWY_ENABLED_BASELINE == 0
#error "At least one baseline target must be defined and enabled"
#endif

// Best baseline, used for static dispatch. This is the least-significant 1-bit
// within HWY_ENABLED_BASELINE and lower bit values imply "better".
#define HWY_STATIC_TARGET (HWY_ENABLED_BASELINE & -HWY_ENABLED_BASELINE)
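[Editorial note: in two's complement, x & -x isolates the least-significant set bit, which here is the lowest-valued and therefore best enabled target. A standalone sketch with values copied from the target list above; not part of the header.]

#include <cassert>

int main() {
  const unsigned kAVX3 = 8, kSSE4 = 64;    // bit values from the target list
  const unsigned enabled = kAVX3 | kSSE4;  // two enabled baseline targets
  // 0u - enabled is the two's complement negation; AND keeps the lowest bit.
  assert((enabled & (0u - enabled)) == kAVX3);  // AVX3 wins (lower = better)
  return 0;
}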

// Start by assuming static dispatch. If we later use dynamic dispatch, this
// will be defined to other targets during the multiple-inclusion, and finally
// return to the initial value. Defining this outside begin/end_target ensures
// inl headers successfully compile by themselves (required by Bazel).
#define HWY_TARGET HWY_STATIC_TARGET

//------------------------------------------------------------------------------
// Choose targets for dynamic dispatch according to one of four policies

#if (defined(HWY_COMPILE_ONLY_SCALAR) + defined(HWY_COMPILE_ONLY_STATIC) + \
     defined(HWY_COMPILE_ALL_ATTAINABLE)) > 1
#error "Invalid config: can only define a single policy for targets"
#endif

// Further to checking for disabled/broken targets, we only use AVX3_DL after
// explicit opt-in (via this macro OR baseline compiler flags) to avoid
// generating a codepath which is only helpful if the app uses AVX3_DL features.
#if defined(HWY_WANT_AVX3_DL)
#define HWY_CHECK_AVX3_DL HWY_AVX3_DL
#else
#define HWY_CHECK_AVX3_DL HWY_BASELINE_AVX3_DL
#endif

// Attainable means enabled and the compiler allows intrinsics (even when not
// allowed to autovectorize). Used in 3 and 4.
#if HWY_ARCH_X86
#define HWY_ATTAINABLE_TARGETS \
  HWY_ENABLED(HWY_SCALAR | HWY_SSSE3 | HWY_SSE4 | HWY_AVX2 | HWY_AVX3 | \
              HWY_CHECK_AVX3_DL)
#else
#define HWY_ATTAINABLE_TARGETS HWY_ENABLED_BASELINE
#endif

// 1) For older compilers: disable all SIMD (could also set HWY_DISABLED_TARGETS
// to ~HWY_SCALAR, but this is more explicit).
#if defined(HWY_COMPILE_ONLY_SCALAR)
#undef HWY_STATIC_TARGET
#define HWY_STATIC_TARGET HWY_SCALAR  // override baseline
#define HWY_TARGETS HWY_SCALAR

// 2) For forcing static dispatch without code changes (removing HWY_EXPORT)
#elif defined(HWY_COMPILE_ONLY_STATIC)
#define HWY_TARGETS HWY_STATIC_TARGET

// 3) For tests: include all attainable targets (in particular: scalar)
#elif defined(HWY_COMPILE_ALL_ATTAINABLE) || defined(HWY_IS_TEST)
#define HWY_TARGETS HWY_ATTAINABLE_TARGETS

// 4) Default: attainable WITHOUT non-best baseline. This reduces code size by
// excluding superseded targets, in particular scalar.
#else
#define HWY_TARGETS (HWY_ATTAINABLE_TARGETS & (2 * HWY_STATIC_TARGET - 1))
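[Editorial note: 2 * HWY_STATIC_TARGET - 1 is a mask of the static target's bit plus every lower (better) bit, so this AND drops all targets weaker than the baseline. A standalone sketch using the bit values defined above; illustrative only.]

#include <cstdio>

int main() {
  const unsigned kStatic = 16;                  // HWY_AVX2
  const unsigned kAttainable = 8u | 16u | 64u;  // AVX3 | AVX2 | SSE4
  const unsigned kTargets = kAttainable & (2 * kStatic - 1);  // mask = 31
  std::printf("%u\n", kTargets);  // 24 = AVX3 (8) | AVX2 (16); SSE4 dropped
  return 0;
}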

#endif  // target policy

// HWY_ONCE and the multiple-inclusion mechanism rely on HWY_STATIC_TARGET being
// one of the dynamic targets. This also implies HWY_TARGETS != 0 and
// (HWY_TARGETS & HWY_ENABLED_BASELINE) != 0.
#if (HWY_TARGETS & HWY_STATIC_TARGET) == 0
#error "Logic error: best baseline should be included in dynamic targets"
#endif

#endif  // HIGHWAY_HWY_DETECT_TARGETS_H_
@ -16,7 +16,9 @@
#define HWY_TARGET_INCLUDE "hwy/examples/benchmark.cc"
#include "hwy/foreach_target.h"

#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#include <memory>
@ -57,7 +59,7 @@ template <class Benchmark>
void RunBenchmark(const char* caption) {
  printf("%10s: ", caption);
  const size_t kNumInputs = 1;
  const size_t num_items = Benchmark::NumItems() * Unpredictable1();
  const size_t num_items = Benchmark::NumItems() * size_t(Unpredictable1());
  const FuncInput inputs[kNumInputs] = {num_items};
  Result results[kNumInputs];

@ -77,16 +79,17 @@ void RunBenchmark(const char* caption) {
  benchmark.Verify(num_items);

  for (size_t i = 0; i < num_results; ++i) {
    const double cycles_per_item = results[i].ticks / results[i].input;
    const double cycles_per_item = results[i].ticks / double(results[i].input);
    const double mad = results[i].variability * cycles_per_item;
    printf("%6zu: %6.3f (+/- %5.3f)\n", results[i].input, cycles_per_item, mad);
    printf("%6" PRIu64 ": %6.3f (+/- %5.3f)\n",
           static_cast<uint64_t>(results[i].input), cycles_per_item, mad);
  }
}

void Intro() {
  HWY_ALIGN const float in[16] = {1, 2, 3, 4, 5, 6};
  HWY_ALIGN float out[16];
  HWY_FULL(float) d;  // largest possible vector
  const ScalableTag<float> d;  // largest possible vector
  for (size_t i = 0; i < 16; i += Lanes(d)) {
    const auto vec = Load(d, in + i);  // aligned!
    auto result = vec * vec;
@ -103,22 +106,22 @@ class BenchmarkDot : public TwoArray {
  BenchmarkDot() : dot_{-1.0f} {}

  FuncOutput operator()(const size_t num_items) {
    HWY_FULL(float) d;
    const ScalableTag<float> d;
    const size_t N = Lanes(d);
    using V = decltype(Zero(d));
    constexpr int unroll = 8;
    constexpr size_t unroll = 8;
    // Compiler doesn't make independent sum* accumulators, so unroll manually.
    // Some older compilers might not be able to fit the 8 arrays in registers,
    // so manual unrolling can be helpful if you run into this issue.
    // 2 FMA ports * 4 cycle latency = 8x unrolled.
    V sum[unroll];
    for (int i = 0; i < unroll; ++i) {
    for (size_t i = 0; i < unroll; ++i) {
      sum[i] = Zero(d);
    }
    const float* const HWY_RESTRICT pa = &a_[0];
    const float* const HWY_RESTRICT pb = b_;
    for (size_t i = 0; i < num_items; i += unroll * N) {
      for (int j = 0; j < unroll; ++j) {
      for (size_t j = 0; j < unroll; ++j) {
        const auto a = Load(d, pa + i + j * N);
        const auto b = Load(d, pb + i + j * N);
        sum[j] = MulAdd(a, b, sum[j]);
@ -126,12 +129,12 @@ class BenchmarkDot : public TwoArray {
    }
    // Reduction tree: sum of all accumulators by pairs into sum[0], then the
    // lanes.
    for (int power = 1; power < unroll; power *= 2) {
      for (int i = 0; i < unroll; i += 2 * power) {
    for (size_t power = 1; power < unroll; power *= 2) {
      for (size_t i = 0; i < unroll; i += 2 * power) {
        sum[i] += sum[i + power];
      }
    }
    dot_ = GetLane(SumOfLanes(sum[0]));
    dot_ = GetLane(SumOfLanes(d, sum[0]));
    return static_cast<FuncOutput>(dot_);
  }
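[Editorial note: the reduction tree folds the eight accumulators pairwise in log2(8) = 3 rounds rather than serially, shortening the dependency chain. A scalar sketch of the same loop structure; illustrative, not the benchmark code.]

#include <cassert>

int main() {
  int acc[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  for (int power = 1; power < 8; power *= 2) {  // rounds with stride 1, 2, 4
    for (int i = 0; i < 8; i += 2 * power) {
      acc[i] += acc[i + power];  // fold the partner accumulator into acc[i]
    }
  }
  assert(acc[0] == 36);  // 1 + 2 + ... + 8
  return 0;
}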
  void Verify(size_t num_items) {
@ -166,7 +169,7 @@ struct BenchmarkDelta : public TwoArray {
#elif HWY_CAP_GE256
  // Larger vectors are split into 128-bit blocks, easiest to use the
  // unaligned load support to shift between them.
  const HWY_FULL(float) df;
  const ScalableTag<float> df;
  const size_t N = Lanes(df);
  size_t i;
  b_[0] = a_[0];
@ -14,7 +14,6 @@

#include "hwy/examples/skeleton.h"

#include <assert.h>
#include <stdio.h>

// First undef to prevent error when re-included.
@ -35,21 +34,29 @@ namespace HWY_NAMESPACE {
// Highway ops reside here; ADL does not find templates nor builtins.
using namespace hwy::HWY_NAMESPACE;

// For reasons unknown, optimized msan builds encounter long build times here;
// work around it until a cause is found.
#if HWY_COMPILER_CLANG && defined(MEMORY_SANITIZER) && defined(__OPTIMIZE__)
#define ATTR_MSAN __attribute__((optnone))
#else
#define ATTR_MSAN
#endif

// Computes log2 by converting to a vector of floats. Compiled once per target.
template <class DF>
HWY_NOINLINE void OneFloorLog2(const DF df, const uint8_t* HWY_RESTRICT values,
                               uint8_t* HWY_RESTRICT log2) {
ATTR_MSAN void OneFloorLog2(const DF df, const uint8_t* HWY_RESTRICT values,
                            uint8_t* HWY_RESTRICT log2) {
  // Type tags for converting to other element types (Rebind = same count).
  const Rebind<int32_t, DF> d32;
  const Rebind<uint8_t, DF> d8;

  const auto u8 = Load(d8, values);
  const auto bits = BitCast(d32, ConvertTo(df, PromoteTo(d32, u8)));
  const auto exponent = ShiftRight<23>(bits) - Set(d32, 127);
  const auto exponent = Sub(ShiftRight<23>(bits), Set(d32, 127));
  Store(DemoteTo(d8, exponent), d8, log2);
}
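[Editorial note: the exponent trick above works because converting an integer to float makes floor(log2(x)) readable directly from the IEEE-754 exponent field once the bias of 127 is removed. A scalar sketch of the same computation; illustrative, not the library's API.]

#include <cassert>
#include <cstdint>
#include <cstring>

static int ScalarFloorLog2(uint8_t value) {  // requires value >= 1
  const float f = static_cast<float>(value);
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));       // same role as BitCast above
  return static_cast<int>(bits >> 23) - 127;  // biased exponent minus 127
}

int main() {
  assert(ScalarFloorLog2(1) == 0);
  assert(ScalarFloorLog2(8) == 3);
  assert(ScalarFloorLog2(200) == 7);  // 200 = 1.5625 * 2^7
  return 0;
}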

HWY_NOINLINE void CodepathDemo() {
void CodepathDemo() {
  // Highway defaults to portability, but per-target codepaths may be selected
  // via #if HWY_TARGET == HWY_SSE4 or by testing capability macros:
#if HWY_CAP_INTEGER64
@ -60,12 +67,12 @@ HWY_NOINLINE void CodepathDemo() {
  printf("Target %s: %s\n", hwy::TargetName(HWY_TARGET), gather);
}

HWY_NOINLINE void FloorLog2(const uint8_t* HWY_RESTRICT values, size_t count,
                            uint8_t* HWY_RESTRICT log2) {
void FloorLog2(const uint8_t* HWY_RESTRICT values, size_t count,
               uint8_t* HWY_RESTRICT log2) {
  CodepathDemo();

  // Second argument is necessary on RVV until it supports fractional lengths.
  HWY_FULL(float, 4) df;
  const ScalableTag<float, 2> df;

  const size_t N = Lanes(df);
  size_t i = 0;
@ -99,9 +99,17 @@ HWY_NOINLINE void TestAllSumMulAdd() {
HWY_AFTER_NAMESPACE();

#if HWY_ONCE

namespace skeleton {
HWY_BEFORE_TEST(SkeletonTest);
HWY_EXPORT_AND_TEST_P(SkeletonTest, TestAllFloorLog2);
HWY_EXPORT_AND_TEST_P(SkeletonTest, TestAllSumMulAdd);
}  // namespace skeleton

// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char **argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}

#endif
@ -19,7 +19,7 @@
// targets except HWY_STATIC_TARGET. Defines unique HWY_TARGET each time so that
// highway.h defines the corresponding macro/namespace.

#include "hwy/targets.h"
#include "hwy/detect_targets.h"

// *_inl.h may include other headers, which requires include guards to prevent
// repeated inclusion. The guards must be reset after compiling each target, so
@ -74,6 +74,17 @@
#endif
#endif

#if (HWY_TARGETS & HWY_SSSE3) && (HWY_STATIC_TARGET != HWY_SSSE3)
#undef HWY_TARGET
#define HWY_TARGET HWY_SSSE3
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif

#if (HWY_TARGETS & HWY_SSE4) && (HWY_STATIC_TARGET != HWY_SSE4)
#undef HWY_TARGET
#define HWY_TARGET HWY_SSE4
@ -107,6 +118,28 @@
#endif
#endif

#if (HWY_TARGETS & HWY_AVX3_DL) && (HWY_STATIC_TARGET != HWY_AVX3_DL)
#undef HWY_TARGET
#define HWY_TARGET HWY_AVX3_DL
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif

#if (HWY_TARGETS & HWY_WASM2) && (HWY_STATIC_TARGET != HWY_WASM2)
#undef HWY_TARGET
#define HWY_TARGET HWY_WASM2
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif

#if (HWY_TARGETS & HWY_WASM) && (HWY_STATIC_TARGET != HWY_WASM)
#undef HWY_TARGET
#define HWY_TARGET HWY_WASM
@ -27,16 +27,13 @@ namespace hwy {

// API version (https://semver.org/); keep in sync with CMakeLists.txt.
#define HWY_MAJOR 0
#define HWY_MINOR 12
#define HWY_PATCH 2
#define HWY_MINOR 15
#define HWY_PATCH 0

//------------------------------------------------------------------------------
// Shorthand for descriptors (defined in shared-inl.h) used to select overloads.

// Because Highway functions take descriptor and/or vector arguments, ADL finds
// these functions without requiring users in project::HWY_NAMESPACE to
// qualify Highway functions with hwy::HWY_NAMESPACE. However, ADL rules for
// templates require `using hwy::HWY_NAMESPACE::ShiftLeft;` etc. declarations.
// Shorthand for tags (defined in shared-inl.h) used to select overloads.
// Note that ScalableTag<T> is preferred over HWY_FULL, and CappedTag<T, N> over
// HWY_CAPPED(T, N).

// HWY_FULL(T[,LMUL=1]) is a native vector/group. LMUL is the number of
// registers in the group, and is ignored on targets that do not support groups.
@ -71,6 +68,8 @@ namespace hwy {
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SCALAR::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_RVV
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_RVV::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_WASM2
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM2::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_WASM
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_NEON
@ -81,12 +80,16 @@ namespace hwy {
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_PPC8
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC8::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SSSE3
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSSE3::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SSE4
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSE4::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_AVX2
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX2::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_AVX3
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_AVX3_DL
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3_DL::FUNC_NAME
#endif

// Dynamic dispatch declarations.
@ -129,6 +132,12 @@ FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
#define HWY_CHOOSE_SCALAR(FUNC_NAME) &HWY_STATIC_DISPATCH(FUNC_NAME)
#endif

#if HWY_TARGETS & HWY_WASM2
#define HWY_CHOOSE_WASM2(FUNC_NAME) &N_WASM2::FUNC_NAME
#else
#define HWY_CHOOSE_WASM2(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_WASM
#define HWY_CHOOSE_WASM(FUNC_NAME) &N_WASM::FUNC_NAME
#else
@ -165,6 +174,12 @@ FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
#define HWY_CHOOSE_PPC8(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_SSSE3
#define HWY_CHOOSE_SSSE3(FUNC_NAME) &N_SSSE3::FUNC_NAME
#else
#define HWY_CHOOSE_SSSE3(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_SSE4
#define HWY_CHOOSE_SSE4(FUNC_NAME) &N_SSE4::FUNC_NAME
#else
@ -183,6 +198,12 @@ FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
#define HWY_CHOOSE_AVX3(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_AVX3_DL
#define HWY_CHOOSE_AVX3_DL(FUNC_NAME) &N_AVX3_DL::FUNC_NAME
#else
#define HWY_CHOOSE_AVX3_DL(FUNC_NAME) nullptr
#endif

#define HWY_DISPATCH_TABLE(FUNC_NAME) \
  HWY_CONCAT(FUNC_NAME, HighwayDispatchTable)

@ -270,11 +291,11 @@ FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
#endif

// These define ops inside namespace hwy::HWY_NAMESPACE.
#if HWY_TARGET == HWY_SSE4
#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
#include "hwy/ops/x86_128-inl.h"
#elif HWY_TARGET == HWY_AVX2
#include "hwy/ops/x86_256-inl.h"
#elif HWY_TARGET == HWY_AVX3
#elif HWY_TARGET == HWY_AVX3 || HWY_TARGET == HWY_AVX3_DL
#include "hwy/ops/x86_512-inl.h"
#elif HWY_TARGET == HWY_PPC8
#error "PPC is not yet supported"
@ -282,6 +303,8 @@ FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
#include "hwy/ops/arm_neon-inl.h"
#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2
#include "hwy/ops/arm_sve-inl.h"
#elif HWY_TARGET == HWY_WASM2
#include "hwy/ops/wasm_256-inl.h"
#elif HWY_TARGET == HWY_WASM
#include "hwy/ops/wasm_128-inl.h"
#elif HWY_TARGET == HWY_RVV
@ -292,65 +315,6 @@ FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
#pragma message("HWY_TARGET does not match any known target")
#endif  // HWY_TARGET

// Commonly used functions/types that must come after ops are defined.
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// The lane type of a vector type, e.g. float for Vec<Simd<float, 4>>.
template <class V>
using LaneType = decltype(GetLane(V()));

// Vector type, e.g. Vec128<float> for Simd<float, 4>. Useful as the return type
// of functions that do not take a vector argument, or as an argument type if
// the function only has a template argument for D, or for explicit type names
// instead of auto. This may be a built-in type.
template <class D>
using Vec = decltype(Zero(D()));

// Mask type. Useful as the return type of functions that do not take a mask
// argument, or as an argument type if the function only has a template argument
// for D, or for explicit type names instead of auto.
template <class D>
using Mask = decltype(MaskFromVec(Zero(D())));

// Returns the closest value to v within [lo, hi].
template <class V>
HWY_API V Clamp(const V v, const V lo, const V hi) {
  return Min(Max(lo, v), hi);
}

// CombineShiftRightBytes (and ..Lanes) are not available for the scalar target.
// TODO(janwas): implement for RVV
#if HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_RVV

template <size_t kLanes, class V>
HWY_API V CombineShiftRightLanes(const V hi, const V lo) {
  return CombineShiftRightBytes<kLanes * sizeof(LaneType<V>)>(hi, lo);
}

#endif

// Returns lanes with the most significant bit set and all other bits zero.
template <class D>
HWY_API Vec<D> SignBit(D d) {
  using Unsigned = MakeUnsigned<TFromD<D>>;
  const Unsigned bit = Unsigned(1) << (sizeof(Unsigned) * 8 - 1);
  return BitCast(d, Set(Rebind<Unsigned, D>(), bit));
}

// Returns quiet NaN.
template <class D>
HWY_API Vec<D> NaN(D d) {
  const RebindToSigned<D> di;
  // LimitsMax sets all exponent and mantissa bits to 1. The exponent plus
  // mantissa MSB (to indicate quiet) would be sufficient.
  return BitCast(d, Set(di, LimitsMax<TFromD<decltype(di)>>()));
}
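[Editorial note: LimitsMax works here because 0x7FFFFFFF has an all-ones exponent field and a nonzero (all-ones) mantissa, which IEEE-754 classifies as a quiet NaN. A standalone scalar check; illustrative only.]

#include <cmath>
#include <cstdint>
#include <cstring>

int main() {
  const uint32_t bits = 0x7FFFFFFFu;  // LimitsMax<int32_t>: exponent 0xFF,
  float f;                            // mantissa all ones => quiet NaN
  std::memcpy(&f, &bits, sizeof(f));
  return std::isnan(f) ? 0 : 1;  // exits 0: the bit pattern is a NaN
}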

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();
#include "hwy/ops/generic_ops-inl.h"

#endif  // HWY_HIGHWAY_PER_TARGET
@ -66,9 +66,9 @@ struct TestOverflow {
|
|||
const auto vmax = Set(d, LimitsMax<T>());
|
||||
const auto vmin = Set(d, LimitsMin<T>());
|
||||
// Unsigned underflow / negative -> positive
|
||||
-    HWY_ASSERT_VEC_EQ(d, vmax, vmin - v1);
+    HWY_ASSERT_VEC_EQ(d, vmax, Sub(vmin, v1));
     // Unsigned overflow / positive -> negative
-    HWY_ASSERT_VEC_EQ(d, vmin, vmax + v1);
+    HWY_ASSERT_VEC_EQ(d, vmin, Add(vmax, v1));
   }
 };

@@ -76,6 +76,22 @@ HWY_NOINLINE void TestAllOverflow() {
   ForIntegerTypes(ForPartialVectors<TestOverflow>());
 }

+struct TestClamp {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v0 = Zero(d);
+    const auto v1 = Set(d, 1);
+    const auto v2 = Set(d, 2);
+
+    HWY_ASSERT_VEC_EQ(d, v1, Clamp(v2, v0, v1));
+    HWY_ASSERT_VEC_EQ(d, v1, Clamp(v0, v1, v2));
+  }
+};
+
+HWY_NOINLINE void TestAllClamp() {
+  ForAllTypes(ForPartialVectors<TestClamp>());
+}
+
 struct TestSignBitInteger {
   template <class T, class D>
   HWY_NOINLINE void operator()(T /*unused*/, D d) {

@@ -122,16 +138,20 @@ bool IsNaN(TF f) {
 }

 template <class D, class V>
-HWY_NOINLINE void AssertNaN(const D d, const V v, const char* file, int line) {
+HWY_NOINLINE void AssertNaN(D d, VecArg<V> v, const char* file, int line) {
   using T = TFromD<D>;
   const T lane = GetLane(v);
   if (!IsNaN(lane)) {
     const std::string type_name = TypeName(T(), Lanes(d));
-    MakeUnsigned<T> bits;
-    memcpy(&bits, &lane, sizeof(T));
-    // RVV lacks PRIu64, so use size_t; double will be truncated on 32-bit.
-    Abort(file, line, "Expected %s NaN, got %E (%zu)", type_name.c_str(), lane,
-          size_t(bits));
+    // RVV lacks PRIu64 and MSYS still has problems with %zu, so print bytes to
+    // avoid truncating doubles.
+    uint8_t bytes[HWY_MAX(sizeof(T), 8)] = {0};
+    memcpy(bytes, &lane, sizeof(T));
+    Abort(file, line,
+          "Expected %s NaN, got %E (bytes %02x %02x %02x %02x %02x %02x %02x "
+          "%02x)",
+          type_name.c_str(), lane, bytes[0], bytes[1], bytes[2], bytes[3],
+          bytes[4], bytes[5], bytes[6], bytes[7]);
   }
 }

@@ -187,18 +207,18 @@ struct TestNaN {
     HWY_ASSERT_NAN(d, Or(nan, v1));

     // Comparison
-    HWY_ASSERT(AllFalse(Eq(nan, v1)));
-    HWY_ASSERT(AllFalse(Gt(nan, v1)));
-    HWY_ASSERT(AllFalse(Lt(nan, v1)));
-    HWY_ASSERT(AllFalse(Ge(nan, v1)));
-    HWY_ASSERT(AllFalse(Le(nan, v1)));
+    HWY_ASSERT(AllFalse(d, Eq(nan, v1)));
+    HWY_ASSERT(AllFalse(d, Gt(nan, v1)));
+    HWY_ASSERT(AllFalse(d, Lt(nan, v1)));
+    HWY_ASSERT(AllFalse(d, Ge(nan, v1)));
+    HWY_ASSERT(AllFalse(d, Le(nan, v1)));

     // Reduction
-    HWY_ASSERT_NAN(d, SumOfLanes(nan));
+    HWY_ASSERT_NAN(d, SumOfLanes(d, nan));
     // TODO(janwas): re-enable after QEMU is fixed
 #if HWY_TARGET != HWY_RVV
-    HWY_ASSERT_NAN(d, MinOfLanes(nan));
-    HWY_ASSERT_NAN(d, MaxOfLanes(nan));
+    HWY_ASSERT_NAN(d, MinOfLanes(d, nan));
+    HWY_ASSERT_NAN(d, MaxOfLanes(d, nan));
 #endif

     // Min

@@ -227,13 +247,6 @@ struct TestNaN {
 #endif
     HWY_ASSERT_NAN(d, Min(nan, nan));
     HWY_ASSERT_NAN(d, Max(nan, nan));
-
-    // Comparison
-    HWY_ASSERT(AllFalse(Eq(nan, v1)));
-    HWY_ASSERT(AllFalse(Gt(nan, v1)));
-    HWY_ASSERT(AllFalse(Lt(nan, v1)));
-    HWY_ASSERT(AllFalse(Ge(nan, v1)));
-    HWY_ASSERT(AllFalse(Le(nan, v1)));
   }
 };

@@ -286,6 +299,19 @@ HWY_NOINLINE void TestAllGetLane() {
   ForAllTypes(ForPartialVectors<TestGetLane>());
 }

+struct TestDFromV {
+  template <class T, class D>
+  HWY_NOINLINE void operator()(T /*unused*/, D d) {
+    const auto v0 = Zero(d);
+    using D0 = DFromV<decltype(v0)>;         // not necessarily same as D
+    const auto v0b = And(v0, Set(D0(), 1));  // but vectors can interoperate
+    HWY_ASSERT_VEC_EQ(d, v0, v0b);
+  }
+};
+
+HWY_NOINLINE void TestAllDFromV() {
+  ForAllTypes(ForPartialVectors<TestDFromV>());
+}
+
 // NOLINTNEXTLINE(google-readability-namespace-comments)
 }  // namespace HWY_NAMESPACE

@@ -293,13 +319,23 @@ HWY_NOINLINE void TestAllGetLane() {
 HWY_AFTER_NAMESPACE();

 #if HWY_ONCE

 namespace hwy {
 HWY_BEFORE_TEST(HighwayTest);
 HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllSet);
 HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllOverflow);
+HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllClamp);
 HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllSignBit);
 HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllNaN);
 HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllCopyAndAssign);
 HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllGetLane);
+HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllDFromV);
 }  // namespace hwy

+// Ought not to be necessary, but without this, no tests run on RVV.
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
+
 #endif
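The hunks above appear to be from highway's per-op test suite: infix operators become the named Add/Sub ops, TestClamp/TestDFromV are added, and the tests gain an explicit main() for RVV. As a usage sketch only (not part of this commit; the function name, float type and remainder handling are my assumptions), the Clamp op exercised by TestClamp is typically applied to a buffer like this:

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;  // static-dispatch target namespace

// Clamps x[0..n) to [lo, hi]. Assumes n is a multiple of Lanes(d); real code
// would handle the remainder lanes separately.
void ClampArray(float* HWY_RESTRICT x, size_t n, float lo, float hi) {
  const hn::ScalableTag<float> d;  // full native vector of float
  const auto vlo = hn::Set(d, lo);
  const auto vhi = hn::Set(d, hi);
  for (size_t i = 0; i < n; i += hn::Lanes(d)) {
    const auto v = hn::LoadU(d, x + i);            // unaligned load
    hn::StoreU(hn::Clamp(v, vlo, vhi), d, x + i);  // Min(Max(lo, v), hi)
  }
}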
@@ -14,6 +14,7 @@
 #include "hwy/nanobenchmark.h"

+#include <inttypes.h>
 #include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>  // abort

@@ -46,7 +47,7 @@
 #endif

 #include "hwy/base.h"
-#if HWY_ARCH_PPC
+#if HWY_ARCH_PPC && defined(__GLIBC__)
 #include <sys/platform/ppc.h>  // NOLINT __ppc_get_timebase_freq
 #elif HWY_ARCH_X86

@@ -119,7 +120,7 @@ using Ticks = uint64_t;
 // divide by InvariantTicksPerSecond.
 inline Ticks Start() {
   Ticks t;
-#if HWY_ARCH_PPC
+#if HWY_ARCH_PPC && defined(__GLIBC__)
   asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
 #elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
   _ReadWriteBarrier();

@@ -154,14 +155,14 @@ inline Ticks Start() {
 #else  // POSIX
   timespec ts;
   clock_gettime(CLOCK_MONOTONIC, &ts);
-  t = ts.tv_sec * 1000000000LL + ts.tv_nsec;
+  t = static_cast<Ticks>(ts.tv_sec * 1000000000LL + ts.tv_nsec);
 #endif
   return t;
 }

 inline Ticks Stop() {
   uint64_t t;
-#if HWY_ARCH_PPC
+#if HWY_ARCH_PPC && defined(__GLIBC__)
   asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
 #elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
   _ReadWriteBarrier();

@@ -353,6 +354,12 @@ void Cpuid(const uint32_t level, const uint32_t count,
 #endif
 }

+bool HasRDTSCP() {
+  uint32_t abcd[4];
+  Cpuid(0x80000001U, 0, abcd);         // Extended feature flags
+  return (abcd[3] & (1u << 27)) != 0;  // RDTSCP
+}
+
 std::string BrandString() {
   char brand_string[49];
   std::array<uint32_t, 4> abcd;

@@ -399,8 +406,8 @@ double NominalClockRate() {
 }  // namespace

 double InvariantTicksPerSecond() {
-#if HWY_ARCH_PPC
-  return __ppc_get_timebase_freq();
+#if HWY_ARCH_PPC && defined(__GLIBC__)
+  return double(__ppc_get_timebase_freq());
 #elif HWY_ARCH_X86
   // We assume the TSC is invariant; it is on all recent Intel/AMD CPUs.
   return NominalClockRate();

@@ -457,9 +464,10 @@ timer::Ticks SampleUntilStable(const double max_rel_mad, double* rel_mad,
   static const double ticks_per_second = platform::InvariantTicksPerSecond();
   const size_t ticks_per_eval =
       static_cast<size_t>(ticks_per_second * p.seconds_per_eval);
-  size_t samples_per_eval =
-      est == 0 ? p.min_samples_per_eval : ticks_per_eval / est;
-  samples_per_eval = std::max(samples_per_eval, p.min_samples_per_eval);
+  size_t samples_per_eval = est == 0
+                                ? p.min_samples_per_eval
+                                : static_cast<size_t>(ticks_per_eval / est);
+  samples_per_eval = HWY_MAX(samples_per_eval, p.min_samples_per_eval);

   std::vector<timer::Ticks> samples;
   samples.reserve(1 + samples_per_eval);

@@ -494,17 +502,21 @@ timer::Ticks SampleUntilStable(const double max_rel_mad, double* rel_mad,

     if (*rel_mad <= max_rel_mad || abs_mad <= max_abs_mad) {
       if (p.verbose) {
-        printf("%6zu samples => %5zu (abs_mad=%4zu, rel_mad=%4.2f%%)\n",
-               samples.size(), size_t(est), size_t(abs_mad), *rel_mad * 100.0);
+        printf("%6" PRIu64 " samples => %5" PRIu64 " (abs_mad=%4" PRIu64
+               ", rel_mad=%4.2f%%)\n",
+               static_cast<uint64_t>(samples.size()),
+               static_cast<uint64_t>(est), static_cast<uint64_t>(abs_mad),
+               *rel_mad * 100.0);
       }
       return est;
     }
   }

   if (p.verbose) {
-    printf(
-        "WARNING: rel_mad=%4.2f%% still exceeds %4.2f%% after %6zu samples.\n",
-        *rel_mad * 100.0, max_rel_mad * 100.0, samples.size());
+    printf("WARNING: rel_mad=%4.2f%% still exceeds %4.2f%% after %6" PRIu64
+           " samples.\n",
+           *rel_mad * 100.0, max_rel_mad * 100.0,
+           static_cast<uint64_t>(samples.size()));
   }
   return est;
 }

@@ -530,17 +542,22 @@ size_t NumSkip(const Func func, const uint8_t* arg, const InputVec& unique,
     const timer::Ticks total = SampleUntilStable(
         p.target_rel_mad, &rel_mad, p,
         [func, arg, input]() { platform::PreventElision(func(arg, input)); });
-    min_duration = std::min(min_duration, total - timer_resolution);
+    min_duration = HWY_MIN(min_duration, total - timer_resolution);
   }

   // Number of repetitions required to reach the target resolution.
   const size_t max_skip = p.precision_divisor;
   // Number of repetitions given the estimated duration.
   const size_t num_skip =
-      min_duration == 0 ? 0 : (max_skip + min_duration - 1) / min_duration;
+      min_duration == 0
+          ? 0
+          : static_cast<size_t>((max_skip + min_duration - 1) / min_duration);
   if (p.verbose) {
-    printf("res=%zu max_skip=%zu min_dur=%zu num_skip=%zu\n",
-           size_t(timer_resolution), max_skip, size_t(min_duration), num_skip);
+    printf("res=%" PRIu64 " max_skip=%" PRIu64 " min_dur=%" PRIu64
+           " num_skip=%" PRIu64 "\n",
+           static_cast<uint64_t>(timer_resolution),
+           static_cast<uint64_t>(max_skip), static_cast<uint64_t>(min_duration),
+           static_cast<uint64_t>(num_skip));
   }
   return num_skip;
 }

@@ -615,7 +632,7 @@ timer::Ticks TotalDuration(const Func func, const uint8_t* arg,
       platform::PreventElision(func(arg, input));
     }
   });
-  *max_rel_mad = std::max(*max_rel_mad, rel_mad);
+  *max_rel_mad = HWY_MAX(*max_rel_mad, rel_mad);
   return duration;
 }

@@ -644,6 +661,15 @@ int Unpredictable1() { return timer::Start() != ~0ULL; }
 size_t Measure(const Func func, const uint8_t* arg, const FuncInput* inputs,
                const size_t num_inputs, Result* results, const Params& p) {
   NANOBENCHMARK_CHECK(num_inputs != 0);

+#if HWY_ARCH_X86
+  if (!platform::HasRDTSCP()) {
+    fprintf(stderr, "CPU '%s' does not support RDTSCP, skipping benchmark.\n",
+            platform::BrandString().c_str());
+    return 0;
+  }
+#endif
+
   const InputVec& unique = UniqueInputs(inputs, num_inputs);

   const size_t num_skip = NumSkip(func, arg, unique, p);  // never 0

@@ -658,14 +684,19 @@ size_t Measure(const Func func, const uint8_t* arg, const FuncInput* inputs,
   const timer::Ticks overhead = Overhead(arg, &full, p);
   const timer::Ticks overhead_skip = Overhead(arg, &subset, p);
   if (overhead < overhead_skip) {
-    fprintf(stderr, "Measurement failed: overhead %zu < %zu\n",
-            size_t(overhead), size_t(overhead_skip));
+    fprintf(stderr, "Measurement failed: overhead %" PRIu64 " < %" PRIu64 "\n",
+            static_cast<uint64_t>(overhead),
+            static_cast<uint64_t>(overhead_skip));
     return 0;
   }

   if (p.verbose) {
-    printf("#inputs=%5zu,%5zu overhead=%5zu,%5zu\n", full.size(), subset.size(),
-           size_t(overhead), size_t(overhead_skip));
+    printf("#inputs=%5" PRIu64 ",%5" PRIu64 " overhead=%5" PRIu64 ",%5" PRIu64
+           "\n",
+           static_cast<uint64_t>(full.size()),
+           static_cast<uint64_t>(subset.size()),
+           static_cast<uint64_t>(overhead),
+           static_cast<uint64_t>(overhead_skip));
   }

   double max_rel_mad = 0.0;

@@ -677,8 +708,8 @@ size_t Measure(const Func func, const uint8_t* arg, const FuncInput* inputs,
   TotalDuration(func, arg, &subset, p, &max_rel_mad);

   if (total < total_skip) {
-    fprintf(stderr, "Measurement failed: total %zu < %zu\n", size_t(total),
-            size_t(total_skip));
+    fprintf(stderr, "Measurement failed: total %" PRIu64 " < %" PRIu64 "\n",
+            static_cast<uint64_t>(total), static_cast<uint64_t>(total_skip));
     return 0;
   }
@@ -14,6 +14,8 @@
 #include "hwy/nanobenchmark.h"

+#include <inttypes.h>
+#include <stdint.h>
 #include <stdio.h>

 #include <random>

@@ -23,6 +25,13 @@
 namespace hwy {
 namespace {

+// Governs duration of test; avoid timeout in debug builds.
+#if HWY_IS_DEBUG_BUILD
+constexpr size_t kMaxEvals = 3;
+#else
+constexpr size_t kMaxEvals = 4;
+#endif
+
 FuncOutput Div(const void*, FuncInput in) {
   // Here we're measuring the throughput because benchmark invocations are
   // independent. Any dividend will do; the divisor is nonzero.

@@ -34,11 +43,12 @@ void MeasureDiv(const FuncInput (&inputs)[N]) {
   printf("Measuring integer division (output on final two lines)\n");
   Result results[N];
   Params params;
-  params.max_evals = 4;  // avoid test timeout
+  params.max_evals = kMaxEvals;
   const size_t num_results = Measure(&Div, nullptr, inputs, N, results, params);
   for (size_t i = 0; i < num_results; ++i) {
-    printf("%5zu: %6.2f ticks; MAD=%4.2f%%\n", results[i].input,
-           results[i].ticks, results[i].variability * 100.0);
+    printf("%5" PRIu64 ": %6.2f ticks; MAD=%4.2f%%\n",
+           static_cast<uint64_t>(results[i].input), results[i].ticks,
+           results[i].variability * 100.0);
   }
 }

@@ -59,7 +69,7 @@ template <size_t N>
 void MeasureRandom(const FuncInput (&inputs)[N]) {
   Result results[N];
   Params p;
-  p.max_evals = 4;  // avoid test timeout
+  p.max_evals = kMaxEvals;
   p.verbose = false;
   const size_t num_results = Measure(&Random, nullptr, inputs, N, results, p);
   for (size_t i = 0; i < num_results; ++i) {

@@ -78,3 +88,9 @@ TEST(NanobenchmarkTest, RunAll) {

 }  // namespace
 }  // namespace hwy

+// Ought not to be necessary, but without this, no tests run on RVV.
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
[File diff suppressed because it is too large]
[File diff suppressed because it is too large]
@@ -0,0 +1,324 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Target-independent types/functions defined after target-specific ops.

// Relies on the external include guard in highway.h.
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// The lane type of a vector type, e.g. float for Vec<Simd<float, 4>>.
template <class V>
using LaneType = decltype(GetLane(V()));

// Vector type, e.g. Vec128<float> for Simd<float, 4>. Useful as the return type
// of functions that do not take a vector argument, or as an argument type if
// the function only has a template argument for D, or for explicit type names
// instead of auto. This may be a built-in type.
template <class D>
using Vec = decltype(Zero(D()));

// Mask type. Useful as the return type of functions that do not take a mask
// argument, or as an argument type if the function only has a template argument
// for D, or for explicit type names instead of auto.
template <class D>
using Mask = decltype(MaskFromVec(Zero(D())));

// Returns the closest value to v within [lo, hi].
template <class V>
HWY_API V Clamp(const V v, const V lo, const V hi) {
  return Min(Max(lo, v), hi);
}

// CombineShiftRightBytes (and -Lanes) are not available for the scalar target,
// and RVV has its own implementation of -Lanes.
#if HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_RVV

template <size_t kLanes, class D, class V = VFromD<D>>
HWY_API V CombineShiftRightLanes(D d, const V hi, const V lo) {
  constexpr size_t kBytes = kLanes * sizeof(LaneType<V>);
  static_assert(kBytes < 16, "Shift count is per-block");
  return CombineShiftRightBytes<kBytes>(d, hi, lo);
}

// DEPRECATED
template <size_t kLanes, class V>
HWY_API V CombineShiftRightLanes(const V hi, const V lo) {
  return CombineShiftRightLanes<kLanes>(DFromV<V>(), hi, lo);
}

#endif

// Returns lanes with the most significant bit set and all other bits zero.
template <class D>
HWY_API Vec<D> SignBit(D d) {
  using Unsigned = MakeUnsigned<TFromD<D>>;
  const Unsigned bit = Unsigned(1) << (sizeof(Unsigned) * 8 - 1);
  return BitCast(d, Set(Rebind<Unsigned, D>(), bit));
}

// Returns quiet NaN.
template <class D>
HWY_API Vec<D> NaN(D d) {
  const RebindToSigned<D> di;
  // LimitsMax sets all exponent and mantissa bits to 1. The exponent plus
  // mantissa MSB (to indicate quiet) would be sufficient.
  return BitCast(d, Set(di, LimitsMax<TFromD<decltype(di)>>()));
}

// ------------------------------ AESRound

// Cannot implement on scalar: need at least 16 bytes for TableLookupBytes.
#if HWY_TARGET != HWY_SCALAR

// Define for white-box testing, even if native instructions are available.
namespace detail {

// Constant-time: computes inverse in GF(2^4) based on "Accelerating AES with
// Vector Permute Instructions" and the accompanying assembly language
// implementation: https://crypto.stanford.edu/vpaes/vpaes.tgz. See also Botan:
// https://botan.randombit.net/doxygen/aes__vperm_8cpp_source.html .
//
// A brute-force 256 byte table lookup can also be made constant-time, and
// possibly competitive on NEON, but this is more performance-portable
// especially for x86 and large vectors.
template <class V>  // u8
HWY_INLINE V SubBytes(V state) {
  const DFromV<V> du;
  const auto mask = Set(du, 0xF);

  // Change polynomial basis to GF(2^4)
  {
    alignas(16) static constexpr uint8_t basisL[16] = {
        0x00, 0x70, 0x2A, 0x5A, 0x98, 0xE8, 0xB2, 0xC2,
        0x08, 0x78, 0x22, 0x52, 0x90, 0xE0, 0xBA, 0xCA};
    alignas(16) static constexpr uint8_t basisU[16] = {
        0x00, 0x4D, 0x7C, 0x31, 0x7D, 0x30, 0x01, 0x4C,
        0x81, 0xCC, 0xFD, 0xB0, 0xFC, 0xB1, 0x80, 0xCD};
    const auto sL = And(state, mask);
    const auto sU = ShiftRight<4>(state);  // byte shift => upper bits are zero
    const auto gf4L = TableLookupBytes(LoadDup128(du, basisL), sL);
    const auto gf4U = TableLookupBytes(LoadDup128(du, basisU), sU);
    state = Xor(gf4L, gf4U);
  }

  // Inversion in GF(2^4). Elements 0 represent "infinity" (division by 0) and
  // cause TableLookupBytesOr0 to return 0.
  alignas(16) static constexpr uint8_t kZetaInv[16] = {
      0x80, 7, 11, 15, 6, 10, 4, 1, 9, 8, 5, 2, 12, 14, 13, 3};
  alignas(16) static constexpr uint8_t kInv[16] = {
      0x80, 1, 8, 13, 15, 6, 5, 14, 2, 12, 11, 10, 9, 3, 7, 4};
  const auto tbl = LoadDup128(du, kInv);
  const auto sL = And(state, mask);      // L=low nibble, U=upper
  const auto sU = ShiftRight<4>(state);  // byte shift => upper bits are zero
  const auto sX = Xor(sU, sL);
  const auto invL = TableLookupBytes(LoadDup128(du, kZetaInv), sL);
  const auto invU = TableLookupBytes(tbl, sU);
  const auto invX = TableLookupBytes(tbl, sX);
  const auto outL = Xor(sX, TableLookupBytesOr0(tbl, Xor(invL, invU)));
  const auto outU = Xor(sU, TableLookupBytesOr0(tbl, Xor(invL, invX)));

  // Linear skew (cannot bake 0x63 bias into the table because out* indices
  // may have the infinity flag set).
  alignas(16) static constexpr uint8_t kAffineL[16] = {
      0x00, 0xC7, 0xBD, 0x6F, 0x17, 0x6D, 0xD2, 0xD0,
      0x78, 0xA8, 0x02, 0xC5, 0x7A, 0xBF, 0xAA, 0x15};
  alignas(16) static constexpr uint8_t kAffineU[16] = {
      0x00, 0x6A, 0xBB, 0x5F, 0xA5, 0x74, 0xE4, 0xCF,
      0xFA, 0x35, 0x2B, 0x41, 0xD1, 0x90, 0x1E, 0x8E};
  const auto affL = TableLookupBytesOr0(LoadDup128(du, kAffineL), outL);
  const auto affU = TableLookupBytesOr0(LoadDup128(du, kAffineU), outU);
  return Xor(Xor(affL, affU), Set(du, 0x63));
}

}  // namespace detail

#endif  // HWY_TARGET != HWY_SCALAR

// "Include guard": skip if native AES instructions are available.
#if (defined(HWY_NATIVE_AES) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_AES
#undef HWY_NATIVE_AES
#else
#define HWY_NATIVE_AES
#endif

// (Must come after HWY_TARGET_TOGGLE, else we don't reset it for scalar)
#if HWY_TARGET != HWY_SCALAR

namespace detail {

template <class V>  // u8
HWY_API V ShiftRows(const V state) {
  const DFromV<V> du;
  alignas(16) static constexpr uint8_t kShiftRow[16] = {
      0,  5,  10, 15,  // transposed: state is column major
      4,  9,  14, 3,   //
      8,  13, 2,  7,   //
      12, 1,  6,  11};
  const auto shift_row = LoadDup128(du, kShiftRow);
  return TableLookupBytes(state, shift_row);
}

template <class V>  // u8
HWY_API V MixColumns(const V state) {
  const DFromV<V> du;
  // For each column, the rows are the sum of GF(2^8) matrix multiplication by:
  //   2 3 1 1  // Let s := state*1, d := state*2, t := state*3.
  //   1 2 3 1  // d are on diagonal, no permutation needed.
  //   1 1 2 3  // t1230 indicates column indices of threes for the 4 rows.
  //   3 1 1 2  // We also need to compute s2301 and s3012 (=1230 o 2301).
  alignas(16) static constexpr uint8_t k2301[16] = {
      2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13};
  alignas(16) static constexpr uint8_t k1230[16] = {
      1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12};
  const RebindToSigned<decltype(du)> di;  // can only do signed comparisons
  const auto msb = Lt(BitCast(di, state), Zero(di));
  const auto overflow = BitCast(du, IfThenElseZero(msb, Set(di, 0x1B)));
  const auto d = Xor(Add(state, state), overflow);  // = state*2 in GF(2^8).
  const auto s2301 = TableLookupBytes(state, LoadDup128(du, k2301));
  const auto d_s2301 = Xor(d, s2301);
  const auto t_s2301 = Xor(state, d_s2301);  // t(s*3) = XOR-sum {s, d(s*2)}
  const auto t1230_s3012 = TableLookupBytes(t_s2301, LoadDup128(du, k1230));
  return Xor(d_s2301, t1230_s3012);  // XOR-sum of 4 terms
}

}  // namespace detail

template <class V>  // u8
HWY_API V AESRound(V state, const V round_key) {
  // Intel docs swap the first two steps, but it does not matter because
  // ShiftRows is a permutation and SubBytes is independent of lane index.
  state = detail::SubBytes(state);
  state = detail::ShiftRows(state);
  state = detail::MixColumns(state);
  state = Xor(state, round_key);  // AddRoundKey
  return state;
}

// Constant-time implementation inspired by
// https://www.bearssl.org/constanttime.html, but about half the cost because we
// use 64x64 multiplies and 128-bit XORs.
template <class V>
HWY_API V CLMulLower(V a, V b) {
  const DFromV<V> d;
  static_assert(IsSame<TFromD<decltype(d)>, uint64_t>(), "V must be u64");
  const auto k1 = Set(d, 0x1111111111111111ULL);
  const auto k2 = Set(d, 0x2222222222222222ULL);
  const auto k4 = Set(d, 0x4444444444444444ULL);
  const auto k8 = Set(d, 0x8888888888888888ULL);
  const auto a0 = And(a, k1);
  const auto a1 = And(a, k2);
  const auto a2 = And(a, k4);
  const auto a3 = And(a, k8);
  const auto b0 = And(b, k1);
  const auto b1 = And(b, k2);
  const auto b2 = And(b, k4);
  const auto b3 = And(b, k8);

  auto m0 = Xor(MulEven(a0, b0), MulEven(a1, b3));
  auto m1 = Xor(MulEven(a0, b1), MulEven(a1, b0));
  auto m2 = Xor(MulEven(a0, b2), MulEven(a1, b1));
  auto m3 = Xor(MulEven(a0, b3), MulEven(a1, b2));
  m0 = Xor(m0, Xor(MulEven(a2, b2), MulEven(a3, b1)));
  m1 = Xor(m1, Xor(MulEven(a2, b3), MulEven(a3, b2)));
  m2 = Xor(m2, Xor(MulEven(a2, b0), MulEven(a3, b3)));
  m3 = Xor(m3, Xor(MulEven(a2, b1), MulEven(a3, b0)));
  return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8)));
}

template <class V>
HWY_API V CLMulUpper(V a, V b) {
  const DFromV<V> d;
  static_assert(IsSame<TFromD<decltype(d)>, uint64_t>(), "V must be u64");
  const auto k1 = Set(d, 0x1111111111111111ULL);
  const auto k2 = Set(d, 0x2222222222222222ULL);
  const auto k4 = Set(d, 0x4444444444444444ULL);
  const auto k8 = Set(d, 0x8888888888888888ULL);
  const auto a0 = And(a, k1);
  const auto a1 = And(a, k2);
  const auto a2 = And(a, k4);
  const auto a3 = And(a, k8);
  const auto b0 = And(b, k1);
  const auto b1 = And(b, k2);
  const auto b2 = And(b, k4);
  const auto b3 = And(b, k8);

  auto m0 = Xor(MulOdd(a0, b0), MulOdd(a1, b3));
  auto m1 = Xor(MulOdd(a0, b1), MulOdd(a1, b0));
  auto m2 = Xor(MulOdd(a0, b2), MulOdd(a1, b1));
  auto m3 = Xor(MulOdd(a0, b3), MulOdd(a1, b2));
  m0 = Xor(m0, Xor(MulOdd(a2, b2), MulOdd(a3, b1)));
  m1 = Xor(m1, Xor(MulOdd(a2, b3), MulOdd(a3, b2)));
  m2 = Xor(m2, Xor(MulOdd(a2, b0), MulOdd(a3, b3)));
  m3 = Xor(m3, Xor(MulOdd(a2, b1), MulOdd(a3, b0)));
  return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8)));
}

#endif  // HWY_NATIVE_AES
#endif  // HWY_TARGET != HWY_SCALAR

// "Include guard": skip if native POPCNT-related instructions are available.
#if (defined(HWY_NATIVE_POPCNT) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_POPCNT
#undef HWY_NATIVE_POPCNT
#else
#define HWY_NATIVE_POPCNT
#endif

template <typename V, HWY_IF_LANES_ARE(uint8_t, V)>
HWY_API V PopulationCount(V v) {
  constexpr DFromV<V> d;
  HWY_ALIGN constexpr uint8_t kLookup[16] = {
      0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
  };
  auto lo = And(v, Set(d, 0xF));
  auto hi = ShiftRight<4>(v);
  auto lookup = LoadDup128(Simd<uint8_t, HWY_MAX(16, MaxLanes(d))>(), kLookup);
  return Add(TableLookupBytes(lookup, hi), TableLookupBytes(lookup, lo));
}

template <typename V, HWY_IF_LANES_ARE(uint16_t, V)>
HWY_API V PopulationCount(V v) {
  const DFromV<V> d;
  Repartition<uint8_t, decltype(d)> d8;
  auto vals = BitCast(d, PopulationCount(BitCast(d8, v)));
  return Add(ShiftRight<8>(vals), And(vals, Set(d, 0xFF)));
}

template <typename V, HWY_IF_LANES_ARE(uint32_t, V)>
HWY_API V PopulationCount(V v) {
  const DFromV<V> d;
  Repartition<uint16_t, decltype(d)> d16;
  auto vals = BitCast(d, PopulationCount(BitCast(d16, v)));
  return Add(ShiftRight<16>(vals), And(vals, Set(d, 0xFF)));
}

#if HWY_CAP_INTEGER64
template <typename V, HWY_IF_LANES_ARE(uint64_t, V)>
HWY_API V PopulationCount(V v) {
  const DFromV<V> d;
  Repartition<uint32_t, decltype(d)> d32;
  auto vals = BitCast(d, PopulationCount(BitCast(d32, v)));
  return Add(ShiftRight<32>(vals), And(vals, Set(d, 0xFF)));
}
#endif

#endif  // HWY_NATIVE_POPCNT

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();
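The new file above (its @@ -0,0 +1,324 @@ header and contents suggest highway's generic_ops-inl.h) supplies portable fallbacks for AESRound, carry-less multiplication and PopulationCount. A minimal sketch of calling the CLMul ops, assuming a non-scalar target (the polynomial constants and function name are my illustrations, not part of the commit):

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

#if HWY_TARGET != HWY_SCALAR
// For each 128-bit block, CLMulLower multiplies the lower u64 lanes of a and b
// as GF(2) polynomials (XOR instead of carries); CLMulUpper uses the upper
// lanes. Example: 0x87 (x^7+x^2+x+1) times 0x02 (x) = 0x10E.
hn::Vec<hn::ScalableTag<uint64_t>> ClmulExample() {
  const hn::ScalableTag<uint64_t> du;
  const auto a = hn::Set(du, 0x87u);
  const auto b = hn::Set(du, 0x02u);
  return hn::CLMulLower(a, b);  // lane 0 of each block = 0x10E, lane 1 = 0
}
#endif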
[File diff suppressed because it is too large]
[File diff suppressed because it is too large]
@@ -25,35 +25,84 @@

 #endif  // HWY_SET_MACROS_PER_TARGET

-#include "hwy/targets.h"
+#include "hwy/detect_targets.h"

 #undef HWY_NAMESPACE
 #undef HWY_ALIGN
+#undef HWY_MAX_BYTES
 #undef HWY_LANES

 #undef HWY_CAP_INTEGER64
 #undef HWY_CAP_FLOAT16
 #undef HWY_CAP_FLOAT64
 #undef HWY_CAP_GE256
 #undef HWY_CAP_GE512

 #undef HWY_TARGET_STR

+#if defined(HWY_DISABLE_PCLMUL_AES)
+#define HWY_TARGET_STR_PCLMUL_AES ""
+#else
+#define HWY_TARGET_STR_PCLMUL_AES ",pclmul,aes"
+#endif
+
+#if defined(HWY_DISABLE_BMI2_FMA)
+#define HWY_TARGET_STR_BMI2_FMA ""
+#else
+#define HWY_TARGET_STR_BMI2_FMA ",bmi,bmi2,fma"
+#endif
+
+#if defined(HWY_DISABLE_F16C)
+#define HWY_TARGET_STR_F16C ""
+#else
+#define HWY_TARGET_STR_F16C ",f16c"
+#endif
+
+#define HWY_TARGET_STR_SSSE3 "sse2,ssse3"
+
+#define HWY_TARGET_STR_SSE4 \
+  HWY_TARGET_STR_SSSE3 ",sse4.1,sse4.2" HWY_TARGET_STR_PCLMUL_AES
+// Include previous targets, which are the half-vectors of the next target.
+#define HWY_TARGET_STR_AVX2 \
+  HWY_TARGET_STR_SSE4 ",avx,avx2" HWY_TARGET_STR_BMI2_FMA HWY_TARGET_STR_F16C
+#define HWY_TARGET_STR_AVX3 \
+  HWY_TARGET_STR_AVX2 ",avx512f,avx512vl,avx512dq,avx512bw"

 // Before include guard so we redefine HWY_TARGET_STR on each include,
 // governed by the current HWY_TARGET.
 //-----------------------------------------------------------------------------
-// SSE4
-#if HWY_TARGET == HWY_SSE4
+// SSSE3
+#if HWY_TARGET == HWY_SSSE3

-#define HWY_NAMESPACE N_SSE4
+#define HWY_NAMESPACE N_SSSE3
 #define HWY_ALIGN alignas(16)
+#define HWY_MAX_BYTES 16
 #define HWY_LANES(T) (16 / sizeof(T))

 #define HWY_CAP_INTEGER64 1
 #define HWY_CAP_FLOAT16 1
 #define HWY_CAP_FLOAT64 1
-#define HWY_CAP_AES 0
 #define HWY_CAP_GE256 0
 #define HWY_CAP_GE512 0

+#define HWY_TARGET_STR HWY_TARGET_STR_SSSE3
+//-----------------------------------------------------------------------------
+// SSE4
+#elif HWY_TARGET == HWY_SSE4
+
+#define HWY_NAMESPACE N_SSE4
+#define HWY_ALIGN alignas(16)
+#define HWY_MAX_BYTES 16
+#define HWY_LANES(T) (16 / sizeof(T))
+
+#define HWY_CAP_INTEGER64 1
+#define HWY_CAP_FLOAT16 1
+#define HWY_CAP_FLOAT64 1
+#define HWY_CAP_GE256 0
+#define HWY_CAP_GE512 0
+
-#define HWY_TARGET_STR "sse2,ssse3,sse4.1"
+#define HWY_TARGET_STR HWY_TARGET_STR_SSE4

 //-----------------------------------------------------------------------------
 // AVX2

@@ -61,47 +110,57 @@

 #define HWY_NAMESPACE N_AVX2
 #define HWY_ALIGN alignas(32)
+#define HWY_MAX_BYTES 32
 #define HWY_LANES(T) (32 / sizeof(T))

 #define HWY_CAP_INTEGER64 1
 #define HWY_CAP_FLOAT16 1
 #define HWY_CAP_FLOAT64 1
 #define HWY_CAP_GE256 1
 #define HWY_CAP_GE512 0

-#if defined(HWY_DISABLE_BMI2_FMA)
-#define HWY_TARGET_STR "avx,avx2,f16c"
-#else
-#define HWY_TARGET_STR "avx,avx2,bmi,bmi2,fma,f16c"
-#endif
+#define HWY_TARGET_STR HWY_TARGET_STR_AVX2

 //-----------------------------------------------------------------------------
-// AVX3
-#elif HWY_TARGET == HWY_AVX3
+// AVX3[_DL]
+#elif HWY_TARGET == HWY_AVX3 || HWY_TARGET == HWY_AVX3_DL

 #define HWY_ALIGN alignas(64)
+#define HWY_MAX_BYTES 64
 #define HWY_LANES(T) (64 / sizeof(T))

 #define HWY_CAP_INTEGER64 1
 #define HWY_CAP_FLOAT16 1
 #define HWY_CAP_FLOAT64 1
 #define HWY_CAP_GE256 1
 #define HWY_CAP_GE512 1

-#define HWY_NAMESPACE N_AVX3
+#if HWY_TARGET == HWY_AVX3

 // Must include AVX2 because an AVX3 test may call AVX2 functions (e.g. when
 // converting to half-vectors). HWY_DISABLE_BMI2_FMA is not relevant because if
 // we have AVX3, we should also have BMI2/FMA.
+#define HWY_NAMESPACE N_AVX3
+#define HWY_TARGET_STR HWY_TARGET_STR_AVX3
+
+#elif HWY_TARGET == HWY_AVX3_DL
+
+#define HWY_NAMESPACE N_AVX3_DL
 #define HWY_TARGET_STR \
-  "avx,avx2,bmi,bmi2,fma,f16c,avx512f,avx512vl,avx512dq,avx512bw"
+  HWY_TARGET_STR_AVX3 \
+  ",vpclmulqdq,avx512vbmi2,vaes,avxvnni,avx512bitalg,avx512vpopcntdq"
+
+#else
+#error "Logic error"
+#endif  // HWY_TARGET == HWY_AVX3_DL

 //-----------------------------------------------------------------------------
 // PPC8
 #elif HWY_TARGET == HWY_PPC8

 #define HWY_ALIGN alignas(16)
+#define HWY_MAX_BYTES 16
 #define HWY_LANES(T) (16 / sizeof(T))

 #define HWY_CAP_INTEGER64 1
 #define HWY_CAP_FLOAT16 0
 #define HWY_CAP_FLOAT64 1
 #define HWY_CAP_GE256 0
 #define HWY_CAP_GE512 0

@@ -115,9 +174,11 @@
 #elif HWY_TARGET == HWY_NEON

 #define HWY_ALIGN alignas(16)
+#define HWY_MAX_BYTES 16
 #define HWY_LANES(T) (16 / sizeof(T))

 #define HWY_CAP_INTEGER64 1
 #define HWY_CAP_FLOAT16 1
 #define HWY_CAP_GE256 0
 #define HWY_CAP_GE512 0

@@ -135,12 +196,22 @@
 // SVE[2]
 #elif HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE

+#if defined(HWY_EMULATE_SVE) && !defined(__F16C__)
+#error "Disable HWY_CAP_FLOAT16 or ensure farm_sve actually converts to f16"
+#endif
+
 // SVE only requires lane alignment, not natural alignment of the entire vector.
 #define HWY_ALIGN alignas(8)
-// Upper bound, not the actual lane count!
-#define HWY_LANES(T) (256 / sizeof(T))
+
+#define HWY_MAX_BYTES 256
+
+// <= HWY_MAX_BYTES / sizeof(T): exact size. Otherwise a fraction 1/div (div =
+// 1,2,4,8) is encoded as HWY_LANES(T) / div. This value leaves enough room for
+// div=8 and demoting to 1/8 the lane width while still exceeding HWY_MAX_BYTES.
+#define HWY_LANES(T) (32768 / sizeof(T))

 #define HWY_CAP_INTEGER64 1
 #define HWY_CAP_FLOAT16 1
 #define HWY_CAP_FLOAT64 1
 #define HWY_CAP_GE256 0
 #define HWY_CAP_GE512 0

@@ -151,16 +222,18 @@
 #define HWY_NAMESPACE N_SVE
 #endif

-// HWY_TARGET_STR remains undefined - TODO(janwas): attribute for SVE?
+// HWY_TARGET_STR remains undefined

 //-----------------------------------------------------------------------------
 // WASM
 #elif HWY_TARGET == HWY_WASM

 #define HWY_ALIGN alignas(16)
+#define HWY_MAX_BYTES 16
 #define HWY_LANES(T) (16 / sizeof(T))

 #define HWY_CAP_INTEGER64 0
 #define HWY_CAP_FLOAT16 1
 #define HWY_CAP_FLOAT64 0
 #define HWY_CAP_GE256 0
 #define HWY_CAP_GE512 0

@@ -169,6 +242,24 @@

 #define HWY_TARGET_STR "simd128"

+//-----------------------------------------------------------------------------
+// WASM2
+#elif HWY_TARGET == HWY_WASM2
+
+#define HWY_ALIGN alignas(32)
+#define HWY_MAX_BYTES 32
+#define HWY_LANES(T) (32 / sizeof(T))
+
+#define HWY_CAP_INTEGER64 0
+#define HWY_CAP_FLOAT16 1
+#define HWY_CAP_FLOAT64 0
+#define HWY_CAP_GE256 0
+#define HWY_CAP_GE512 0
+
+#define HWY_NAMESPACE N_WASM2
+
+#define HWY_TARGET_STR "simd128"
+
 //-----------------------------------------------------------------------------
 // RVV
 #elif HWY_TARGET == HWY_RVV

@@ -177,16 +268,25 @@
 // and the compiler already aligns builtin types, so nothing to do here.
 #define HWY_ALIGN

-// Arbitrary constant, not the actual lane count! Large enough that we can
-// mul/div by 8 for LMUL. Value matches kMaxVectorSize, see base.h.
-#define HWY_LANES(T) (4096 / sizeof(T))
+// The spec requires VLEN <= 2^16 bits, so the limit is 2^16 bytes (LMUL=8).
+#define HWY_MAX_BYTES 65536
+
+// <= HWY_MAX_BYTES / sizeof(T): exact size. Otherwise a fraction 1/div (div =
+// 1,2,4,8) is encoded as HWY_LANES(T) / div. This value leaves enough room for
+// div=8 and demoting to 1/8 the lane width while still exceeding HWY_MAX_BYTES.
+#define HWY_LANES(T) (8388608 / sizeof(T))

 #define HWY_CAP_INTEGER64 1
 #define HWY_CAP_FLOAT64 1
 #define HWY_CAP_GE256 0
 #define HWY_CAP_GE512 0

+#if defined(__riscv_zfh)
+#define HWY_CAP_FLOAT16 1
+#else
+#define HWY_CAP_FLOAT16 0
+#endif
+
 #define HWY_NAMESPACE N_RVV

 // HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.

@@ -197,10 +297,11 @@
 #elif HWY_TARGET == HWY_SCALAR

 #define HWY_ALIGN
+// For internal use only; use Lanes(d) instead.
+#define HWY_MAX_BYTES 8
 #define HWY_LANES(T) 1

 #define HWY_CAP_INTEGER64 1
 #define HWY_CAP_FLOAT16 1
 #define HWY_CAP_FLOAT64 1
 #define HWY_CAP_GE256 0
 #define HWY_CAP_GE512 0
@@ -16,6 +16,8 @@

 #include <cmath>

+#include "hwy/base.h"
+
 // Separate header because foreach_target.h re-enables its include guard.
 #include "hwy/ops/set_macros-inl.h"

@@ -24,14 +26,11 @@ HWY_BEFORE_NAMESPACE();
 namespace hwy {
 namespace HWY_NAMESPACE {

-// SIMD operations are implemented as overloaded functions selected using a
-// "descriptor" D := Simd<T, N>. T is the lane type, N a number of lanes >= 1
-// (always a power of two). Users generally do not choose N directly, but
-// instead use HWY_FULL(T[, LMUL]) (the largest available size). N is not
-// necessarily the actual number of lanes, which is returned by Lanes(D()).
-//
-// Only HWY_FULL(T) and N <= 16 / sizeof(T) are guaranteed to be available - the
-// latter are useful if >128 bit vectors are unnecessary or undesirable.
+// SIMD operations are implemented as overloaded functions selected using a tag
+// type D := Simd<T, N>. T is the lane type, N an opaque integer for internal
+// use only. Users create D via aliases ScalableTag<T>() (a full vector),
+// CappedTag<T, kLimit> or FixedTag<T, kNumLanes>. The actual number of lanes
+// (always a power of two) is Lanes(D()).
 template <typename Lane, size_t N>
 struct Simd {
   constexpr Simd() = default;

@@ -39,7 +38,7 @@ struct Simd {
   static_assert((N & (N - 1)) == 0 && N != 0, "N must be a power of two");

   // Widening/narrowing ops change the number of lanes and/or their type.
-  // To initialize such vectors, we need the corresponding descriptor types:
+  // To initialize such vectors, we need the corresponding tag types:

   // PromoteTo/DemoteTo() with another lane type, but same number of lanes.
   template <typename NewLane>

@@ -59,10 +58,88 @@ struct Simd {
   using Twice = Simd<T, 2 * N>;
 };

+namespace detail {
+
+// Given N from HWY_LANES(T), returns N for use in Simd<T, N> to describe:
+// - a full vector (pow2 = 0);
+// - 2,4,8 regs on RVV, otherwise a full vector (pow2 [1,3]);
+// - a fraction of a register from 1/8 to 1/2 (pow2 [-3,-1]).
+constexpr size_t ScaleByPower(size_t N, int pow2) {
+#if HWY_TARGET == HWY_RVV
+  // For fractions, if N == 1 ensure we still return at least one lane.
+  return pow2 >= 0 ? (N << pow2) : HWY_MAX(1, (N >> (-pow2)));
+#else
+  // If pow2 > 0, replace it with 0 (there is nothing wider than a full vector).
+  return HWY_MAX(1, N >> HWY_MAX(-pow2, 0));
+#endif
+}
+
+// Struct wrappers enable validation of arguments via static_assert.
+template <typename T, int kPow2>
+struct ScalableTagChecker {
+  static_assert(-3 <= kPow2 && kPow2 <= 3, "Fraction must be 1/8 to 8");
+  using type = Simd<T, ScaleByPower(HWY_LANES(T), kPow2)>;
+};
+
+template <typename T, size_t kLimit>
+struct CappedTagChecker {
+  static_assert(kLimit != 0, "Does not make sense to have zero lanes");
+  using type = Simd<T, HWY_MIN(kLimit, HWY_LANES(T))>;
+};
+
+template <typename T, size_t kNumLanes>
+struct FixedTagChecker {
+  static_assert(kNumLanes != 0, "Does not make sense to have zero lanes");
+  static_assert(kNumLanes * sizeof(T) <= HWY_MAX_BYTES, "Too many lanes");
+#if HWY_TARGET == HWY_SCALAR
+  // HWY_MAX_BYTES would still allow uint8x8, which is not supported.
+  static_assert(kNumLanes == 1, "Scalar only supports one lane");
+#endif
+  using type = Simd<T, kNumLanes>;
+};
+
+}  // namespace detail
+
+// Alias for a tag describing a full vector (kPow2 == 0: the most common usage,
+// e.g. 1D loops where the application does not care about the vector size) or a
+// fraction/multiple of one. Multiples are the same as full vectors for all
+// targets except RVV. Fractions (kPow2 < 0) are useful as the argument/return
+// value of type promotion and demotion.
+template <typename T, int kPow2 = 0>
+using ScalableTag = typename detail::ScalableTagChecker<T, kPow2>::type;
+
+// Alias for a tag describing a vector with *up to* kLimit active lanes, even on
+// targets with scalable vectors and HWY_SCALAR. The runtime lane count
+// `Lanes(tag)` may be less than kLimit, and is 1 on HWY_SCALAR. This alias is
+// typically used for 1D loops with a relatively low application-defined upper
+// bound, e.g. for 8x8 DCTs. However, it is better if data structures are
+// designed to be vector-length-agnostic (e.g. a hybrid SoA where there are
+// chunks of say 256 DC components followed by 256 AC1 and finally 256 AC63;
+// this would enable vector-length-agnostic loops using ScalableTag).
+template <typename T, size_t kLimit>
+using CappedTag = typename detail::CappedTagChecker<T, kLimit>::type;
+
+// Alias for a tag describing a vector with *exactly* kNumLanes active lanes,
+// even on targets with scalable vectors. All targets except HWY_SCALAR support
+// up to 16 / sizeof(T). Other targets may allow larger kNumLanes, but relying
+// on that is non-portable and discouraged.
+//
+// NOTE: if the application does not need to support HWY_SCALAR (+), use this
+// instead of CappedTag to emphasize that there will be exactly kNumLanes lanes.
+// This is useful for data structures that rely on exactly 128-bit SIMD, but
+// these are discouraged because they cannot benefit from wider vectors.
+// Instead, applications would ideally define a larger problem size and loop
+// over it with the (unknown size) vectors from ScalableTag.
+//
+// + e.g. if the baseline is known to support SIMD, or the application requires
+// ops such as TableLookupBytes not supported by HWY_SCALAR.
+template <typename T, size_t kNumLanes>
+using FixedTag = typename detail::FixedTagChecker<T, kNumLanes>::type;
+
 template <class D>
 using TFromD = typename D::T;

-// Descriptor for the same number of lanes as D, but with the LaneType T.
+// Tag for the same number of lanes as D, but with the LaneType T.
 template <class T, class D>
 using Rebind = typename D::template Rebind<T>;

@@ -73,7 +150,7 @@ using RebindToUnsigned = Rebind<MakeUnsigned<TFromD<D>>, D>;
 template <class D>
 using RebindToFloat = Rebind<MakeFloat<TFromD<D>>, D>;

-// Descriptor for the same total size as D, but with the LaneType T.
+// Tag for the same total size as D, but with the LaneType T.
 template <class T, class D>
 using Repartition = typename D::template Repartition<T>;

@@ -82,7 +159,7 @@ using RepartitionToWide = Repartition<MakeWide<TFromD<D>>, D>;
 template <class D>
 using RepartitionToNarrow = Repartition<MakeNarrow<TFromD<D>>, D>;

-// Descriptor for the same lane type as D, but half the lanes.
+// Tag for the same lane type as D, but half the lanes.
 template <class D>
 using Half = typename D::Half;

@@ -98,6 +175,17 @@ using Twice = typename D::Twice;
 #define HWY_IF_LANE_SIZE_D(D, bytes) HWY_IF_LANE_SIZE(TFromD<D>, bytes)
 #define HWY_IF_NOT_LANE_SIZE_D(D, bytes) HWY_IF_NOT_LANE_SIZE(TFromD<D>, bytes)

+// Same, but with a vector argument.
+#define HWY_IF_UNSIGNED_V(V) HWY_IF_UNSIGNED(TFromV<V>)
+#define HWY_IF_SIGNED_V(V) HWY_IF_SIGNED(TFromV<V>)
+#define HWY_IF_FLOAT_V(V) HWY_IF_FLOAT(TFromV<V>)
+#define HWY_IF_LANE_SIZE_V(V, bytes) HWY_IF_LANE_SIZE(TFromV<V>, bytes)
+
+// For implementing functions for a specific type.
+// IsSame<...>() in template arguments is broken on MSVC2015.
+#define HWY_IF_LANES_ARE(T, V) \
+  EnableIf<IsSameT<T, TFromD<DFromV<V>>>::value>* = nullptr
+
 // Compile-time-constant, (typically but not guaranteed) an upper bound on the
 // number of lanes.
 // Prefer instead using Lanes() and dynamic allocation, or Rebind, or

@@ -119,6 +207,25 @@ HWY_INLINE HWY_MAYBE_UNUSED size_t Lanes(Simd<T, N>) {

 #endif

+// NOTE: GCC generates incorrect code for vector arguments to non-inlined
+// functions in two situations:
+// - on Windows and GCC 10.3, passing by value crashes due to unaligned loads:
+//   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412.
+// - on ARM64 and GCC 9.3.0 or 11.2.1, passing by const& causes many (but not
+//   all) tests to fail.
+//
+// We therefore pass by const& only on GCC and (Windows or ARM64). This alias
+// must be used for all vector/mask parameters of functions marked HWY_NOINLINE,
+// and possibly also other functions that are not inlined.
+#if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG && \
+    ((defined(_WIN32) || defined(_WIN64)) || HWY_ARCH_ARM64)
+template <class V>
+using VecArg = const V&;
+#else
+template <class V>
+using VecArg = V;
+#endif
+
 // NOLINTNEXTLINE(google-readability-namespace-comments)
 }  // namespace HWY_NAMESPACE
 }  // namespace hwy
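The hunks above replace highway's "descriptor" terminology with the new tag aliases. A brief sketch of how the three aliases are intended to be instantiated (my illustration, not from the commit; FixedTag assumes a non-scalar target):

#include "hwy/highway.h"

namespace hn = hwy::HWY_NAMESPACE;

void TagExamples() {
  const hn::ScalableTag<float> df;    // full native vector; width varies
  const hn::CappedTag<float, 4> dc4;  // at most 4 lanes (1 on HWY_SCALAR)
  const hn::FixedTag<float, 4> df4;   // exactly 4 lanes (asserts on HWY_SCALAR)
  const size_t n = hn::Lanes(df);     // runtime lane count, a power of two
  const hn::Vec<decltype(df)> v = hn::Zero(df);  // explicit vector type
  (void)dc4; (void)df4; (void)n; (void)v;
}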
[File diff suppressed because it is too large]
[File diff suppressed because it is too large]
[File diff suppressed because it is too large]
[File diff suppressed because it is too large]
[File diff suppressed because it is too large]
@@ -19,6 +19,7 @@
 #include <stdio.h>

 #include <atomic>
+#include <cstddef>
 #include <limits>

 #if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \

@@ -40,14 +41,14 @@ namespace {

 #if HWY_ARCH_X86

-bool IsBitSet(const uint32_t reg, const int index) {
+HWY_INLINE bool IsBitSet(const uint32_t reg, const int index) {
   return (reg & (1U << index)) != 0;
 }

 // Calls CPUID instruction with eax=level and ecx=count and returns the result
 // in abcd array where abcd = {eax, ebx, ecx, edx} (hence the name abcd).
-void Cpuid(const uint32_t level, const uint32_t count,
-           uint32_t* HWY_RESTRICT abcd) {
+HWY_INLINE void Cpuid(const uint32_t level, const uint32_t count,
+                      uint32_t* HWY_RESTRICT abcd) {
 #if HWY_COMPILER_MSVC
   int regs[4];
   __cpuidex(regs, level, count);

@@ -95,37 +96,87 @@ uint32_t supported_targets_for_test_ = 0;
 uint32_t supported_mask_{std::numeric_limits<uint32_t>::max()};

 #if HWY_ARCH_X86
-// Bits indicating which instruction set extensions are supported.
-constexpr uint32_t kSSE = 1 << 0;
-constexpr uint32_t kSSE2 = 1 << 1;
-constexpr uint32_t kSSE3 = 1 << 2;
-constexpr uint32_t kSSSE3 = 1 << 3;
-constexpr uint32_t kSSE41 = 1 << 4;
-constexpr uint32_t kSSE42 = 1 << 5;
-constexpr uint32_t kGroupSSE4 = kSSE | kSSE2 | kSSE3 | kSSSE3 | kSSE41 | kSSE42;
+// Arbitrary bit indices indicating which instruction set extensions are
+// supported. Use enum to ensure values are distinct.
+enum class FeatureIndex : uint32_t {
+  kSSE = 0,
+  kSSE2,
+  kSSE3,
+  kSSSE3,

-constexpr uint32_t kAVX = 1u << 6;
-constexpr uint32_t kAVX2 = 1u << 7;
-constexpr uint32_t kFMA = 1u << 8;
-constexpr uint32_t kLZCNT = 1u << 9;
-constexpr uint32_t kBMI = 1u << 10;
-constexpr uint32_t kBMI2 = 1u << 11;
+  kSSE41,
+  kSSE42,
+  kCLMUL,
+  kAES,
+
+  kAVX,
+  kAVX2,
+  kF16C,
+  kFMA,
+  kLZCNT,
+  kBMI,
+  kBMI2,
+
+  kAVX512F,
+  kAVX512VL,
+  kAVX512DQ,
+  kAVX512BW,
+
+  kVNNI,
+  kVPCLMULQDQ,
+  kVBMI2,
+  kVAES,
+  kPOPCNTDQ,
+  kBITALG,
+
+  kSentinel
+};
+static_assert(static_cast<size_t>(FeatureIndex::kSentinel) < 64,
+              "Too many bits for u64");
+
+HWY_INLINE constexpr uint64_t Bit(FeatureIndex index) {
+  return 1ull << static_cast<size_t>(index);
+}
+
+constexpr uint64_t kGroupSSSE3 =
+    Bit(FeatureIndex::kSSE) | Bit(FeatureIndex::kSSE2) |
+    Bit(FeatureIndex::kSSE3) | Bit(FeatureIndex::kSSSE3);
+
+constexpr uint64_t kGroupSSE4 =
+    Bit(FeatureIndex::kSSE41) | Bit(FeatureIndex::kSSE42) |
+    Bit(FeatureIndex::kCLMUL) | Bit(FeatureIndex::kAES) | kGroupSSSE3;

 // We normally assume BMI/BMI2/FMA are available if AVX2 is. This allows us to
 // use BZHI and (compiler-generated) MULX. However, VirtualBox lacks them
 // [https://www.virtualbox.org/ticket/15471]. Thus we provide the option of
 // avoiding using and requiring these so AVX2 can still be used.
 #ifdef HWY_DISABLE_BMI2_FMA
-constexpr uint32_t kGroupAVX2 = kAVX | kAVX2 | kLZCNT;
+constexpr uint64_t kGroupBMI2_FMA = 0;
 #else
-constexpr uint32_t kGroupAVX2 = kAVX | kAVX2 | kFMA | kLZCNT | kBMI | kBMI2;
+constexpr uint64_t kGroupBMI2_FMA = Bit(FeatureIndex::kBMI) |
+                                    Bit(FeatureIndex::kBMI2) |
+                                    Bit(FeatureIndex::kFMA);
 #endif

-constexpr uint32_t kAVX512F = 1u << 12;
-constexpr uint32_t kAVX512VL = 1u << 13;
-constexpr uint32_t kAVX512DQ = 1u << 14;
-constexpr uint32_t kAVX512BW = 1u << 15;
-constexpr uint32_t kGroupAVX3 = kAVX512F | kAVX512VL | kAVX512DQ | kAVX512BW;
+#ifdef HWY_DISABLE_F16C
+constexpr uint64_t kGroupF16C = 0;
+#else
+constexpr uint64_t kGroupF16C = Bit(FeatureIndex::kF16C);
+#endif
+
+constexpr uint64_t kGroupAVX2 =
+    Bit(FeatureIndex::kAVX) | Bit(FeatureIndex::kAVX2) |
+    Bit(FeatureIndex::kLZCNT) | kGroupBMI2_FMA | kGroupF16C | kGroupSSE4;
+
+constexpr uint64_t kGroupAVX3 =
+    Bit(FeatureIndex::kAVX512F) | Bit(FeatureIndex::kAVX512VL) |
+    Bit(FeatureIndex::kAVX512DQ) | Bit(FeatureIndex::kAVX512BW) | kGroupAVX2;
+
+constexpr uint64_t kGroupAVX3_DL =
+    Bit(FeatureIndex::kVNNI) | Bit(FeatureIndex::kVPCLMULQDQ) |
+    Bit(FeatureIndex::kVBMI2) | Bit(FeatureIndex::kVAES) |
+    Bit(FeatureIndex::kPOPCNTDQ) | Bit(FeatureIndex::kBITALG) | kGroupAVX3;
+
 #endif  // HWY_ARCH_X86

 }  // namespace

@@ -150,6 +201,8 @@ HWY_NORETURN void HWY_FORMAT(3, 4)

 #if HWY_COMPILER_MSVC
   abort();  // Compile error without this due to HWY_NORETURN.
+#elif HWY_ARCH_RVV
+  exit(1);  // trap/abort just freeze Spike
 #else
   __builtin_trap();
 #endif

@@ -193,69 +246,90 @@ uint32_t SupportedTargets() {
   bits = HWY_SCALAR;

 #if HWY_ARCH_X86
-  uint32_t flags = 0;
-  uint32_t abcd[4];
+  bool has_osxsave = false;
+  {  // ensures we do not accidentally use flags outside this block
+    uint64_t flags = 0;
+    uint32_t abcd[4];

-  Cpuid(0, 0, abcd);
-  const uint32_t max_level = abcd[0];
+    Cpuid(0, 0, abcd);
+    const uint32_t max_level = abcd[0];

-  // Standard feature flags
-  Cpuid(1, 0, abcd);
-  flags |= IsBitSet(abcd[3], 25) ? kSSE : 0;
-  flags |= IsBitSet(abcd[3], 26) ? kSSE2 : 0;
-  flags |= IsBitSet(abcd[2], 0) ? kSSE3 : 0;
-  flags |= IsBitSet(abcd[2], 9) ? kSSSE3 : 0;
-  flags |= IsBitSet(abcd[2], 19) ? kSSE41 : 0;
-  flags |= IsBitSet(abcd[2], 20) ? kSSE42 : 0;
-  flags |= IsBitSet(abcd[2], 12) ? kFMA : 0;
-  flags |= IsBitSet(abcd[2], 28) ? kAVX : 0;
-  const bool has_osxsave = IsBitSet(abcd[2], 27);
+    // Standard feature flags
+    Cpuid(1, 0, abcd);
+    flags |= IsBitSet(abcd[3], 25) ? Bit(FeatureIndex::kSSE) : 0;
+    flags |= IsBitSet(abcd[3], 26) ? Bit(FeatureIndex::kSSE2) : 0;
+    flags |= IsBitSet(abcd[2], 0) ? Bit(FeatureIndex::kSSE3) : 0;
+    flags |= IsBitSet(abcd[2], 1) ? Bit(FeatureIndex::kCLMUL) : 0;
+    flags |= IsBitSet(abcd[2], 9) ? Bit(FeatureIndex::kSSSE3) : 0;
+    flags |= IsBitSet(abcd[2], 12) ? Bit(FeatureIndex::kFMA) : 0;
+    flags |= IsBitSet(abcd[2], 19) ? Bit(FeatureIndex::kSSE41) : 0;
+    flags |= IsBitSet(abcd[2], 20) ? Bit(FeatureIndex::kSSE42) : 0;
+    flags |= IsBitSet(abcd[2], 25) ? Bit(FeatureIndex::kAES) : 0;
+    flags |= IsBitSet(abcd[2], 28) ? Bit(FeatureIndex::kAVX) : 0;
+    flags |= IsBitSet(abcd[2], 29) ? Bit(FeatureIndex::kF16C) : 0;
+    has_osxsave = IsBitSet(abcd[2], 27);

-  // Extended feature flags
-  Cpuid(0x80000001U, 0, abcd);
-  flags |= IsBitSet(abcd[2], 5) ? kLZCNT : 0;
+    // Extended feature flags
+    Cpuid(0x80000001U, 0, abcd);
+    flags |= IsBitSet(abcd[2], 5) ? Bit(FeatureIndex::kLZCNT) : 0;

-  // Extended features
-  if (max_level >= 7) {
-    Cpuid(7, 0, abcd);
-    flags |= IsBitSet(abcd[1], 3) ? kBMI : 0;
-    flags |= IsBitSet(abcd[1], 5) ? kAVX2 : 0;
-    flags |= IsBitSet(abcd[1], 8) ? kBMI2 : 0;
+    // Extended features
+    if (max_level >= 7) {
+      Cpuid(7, 0, abcd);
+      flags |= IsBitSet(abcd[1], 3) ? Bit(FeatureIndex::kBMI) : 0;
+      flags |= IsBitSet(abcd[1], 5) ? Bit(FeatureIndex::kAVX2) : 0;
+      flags |= IsBitSet(abcd[1], 8) ? Bit(FeatureIndex::kBMI2) : 0;

-    flags |= IsBitSet(abcd[1], 16) ? kAVX512F : 0;
-    flags |= IsBitSet(abcd[1], 17) ? kAVX512DQ : 0;
-    flags |= IsBitSet(abcd[1], 30) ? kAVX512BW : 0;
-    flags |= IsBitSet(abcd[1], 31) ? kAVX512VL : 0;
+      flags |= IsBitSet(abcd[1], 16) ? Bit(FeatureIndex::kAVX512F) : 0;
+      flags |= IsBitSet(abcd[1], 17) ? Bit(FeatureIndex::kAVX512DQ) : 0;
+      flags |= IsBitSet(abcd[1], 30) ? Bit(FeatureIndex::kAVX512BW) : 0;
+      flags |= IsBitSet(abcd[1], 31) ? Bit(FeatureIndex::kAVX512VL) : 0;
+
+      flags |= IsBitSet(abcd[2], 6) ? Bit(FeatureIndex::kVBMI2) : 0;
+      flags |= IsBitSet(abcd[2], 9) ? Bit(FeatureIndex::kVAES) : 0;
+      flags |= IsBitSet(abcd[2], 10) ? Bit(FeatureIndex::kVPCLMULQDQ) : 0;
+      flags |= IsBitSet(abcd[2], 11) ? Bit(FeatureIndex::kVNNI) : 0;
+      flags |= IsBitSet(abcd[2], 12) ? Bit(FeatureIndex::kBITALG) : 0;
+      flags |= IsBitSet(abcd[2], 14) ? Bit(FeatureIndex::kPOPCNTDQ) : 0;
+    }
+
+    // Set target bit(s) if all their group's flags are all set.
+    if ((flags & kGroupAVX3_DL) == kGroupAVX3_DL) {
+      bits |= HWY_AVX3_DL;
+    }
+    if ((flags & kGroupAVX3) == kGroupAVX3) {
+      bits |= HWY_AVX3;
+    }
+    if ((flags & kGroupAVX2) == kGroupAVX2) {
+      bits |= HWY_AVX2;
+    }
+    if ((flags & kGroupSSE4) == kGroupSSE4) {
+      bits |= HWY_SSE4;
+    }
+    if ((flags & kGroupSSSE3) == kGroupSSSE3) {
+      bits |= HWY_SSSE3;
+    }
   }

-  // Verify OS support for XSAVE, without which XMM/YMM registers are not
-  // preserved across context switches and are not safe to use.
+  // Clear bits if the OS does not support XSAVE - otherwise, registers
+  // are not preserved across context switches.
   if (has_osxsave) {
     const uint32_t xcr0 = ReadXCR0();
     // XMM
     if (!IsBitSet(xcr0, 1)) {
-      flags = 0;
+      bits &=
+          ~uint32_t(HWY_SSSE3 | HWY_SSE4 | HWY_AVX2 | HWY_AVX3 | HWY_AVX3_DL);
     }
     // YMM
     if (!IsBitSet(xcr0, 2)) {
-      flags &= ~kGroupAVX2;
+      bits &= ~uint32_t(HWY_AVX2 | HWY_AVX3 | HWY_AVX3_DL);
     }
     // ZMM + opmask
     if ((xcr0 & 0x70) != 0x70) {
-      flags &= ~kGroupAVX3;
+      bits &= ~uint32_t(HWY_AVX3 | HWY_AVX3_DL);
     }
   }

-  // Set target bit(s) if all their group's flags are all set.
-  if ((flags & kGroupAVX3) == kGroupAVX3) {
-    bits |= HWY_AVX3;
-  }
-  if ((flags & kGroupAVX2) == kGroupAVX2) {
-    bits |= HWY_AVX2;
-  }
-  if ((flags & kGroupSSE4) == kGroupSSE4) {
-    bits |= HWY_SSE4;
-  }
 #else
   // TODO(janwas): detect for other platforms
   bits = HWY_ENABLED_BASELINE;
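The hunks above widen the x86 feature bitfield to 64 bits and add SSSE3/AVX3_DL detection groups. A hedged sketch of consuming the result (SupportedTargets and TargetName exist in hwy/targets.h to the best of my knowledge; the helper itself is my addition, not part of the commit):

#include <stdio.h>

#include "hwy/targets.h"

void PrintBestTarget() {
  // Bitfield of HWY_* targets usable on this CPU; lower bits denote "better"
  // (wider/newer) targets, and HWY_SCALAR is always included.
  const uint32_t bits = hwy::SupportedTargets();
  const uint32_t best = bits & (0u - bits);  // isolate lowest set bit
  printf("Best supported target: %s\n", hwy::TargetName(best));
}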
@ -21,281 +21,23 @@
|
|||
// generate and call.
|
||||
|
||||
#include "hwy/base.h"
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Optional configuration
|
||||
|
||||
// See ../quick_reference.md for documentation of these macros.
|
||||
|
||||
// Uncomment to override the default baseline determined from predefined macros:
|
||||
// #define HWY_BASELINE_TARGETS (HWY_SSE4 | HWY_SCALAR)
|
||||
|
||||
// Uncomment to override the default blocklist:
|
||||
// #define HWY_BROKEN_TARGETS HWY_AVX3
|
||||
|
||||
// Uncomment to definitely avoid generating those target(s):
|
||||
// #define HWY_DISABLED_TARGETS HWY_SSE4
|
||||
|
||||
// Uncomment to avoid emitting BMI/BMI2/FMA instructions (allows generating
|
||||
// AVX2 target for VMs which support AVX2 but not the other instruction sets)
|
||||
// #define HWY_DISABLE_BMI2_FMA
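//
// For example, a hypothetical project that wants to skip AVX3 entirely could
// define the override before including Highway:
//
// #define HWY_DISABLED_TARGETS HWY_AVX3
// #include "hwy/highway.h"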

//------------------------------------------------------------------------------
// Targets

// Unique bit value for each target. A lower value is "better" (e.g. more lanes)
// than a higher value within the same group/platform - see HWY_STATIC_TARGET.
//
// All values are unconditionally defined so we can test HWY_TARGETS without
// first checking the HWY_ARCH_*.
//
// The C99 preprocessor evaluates #if expressions using intmax_t types, so we
// can use 32-bit literals.

// 1,2,4: reserved
#define HWY_AVX3 8
#define HWY_AVX2 16
// 32: reserved for AVX
#define HWY_SSE4 64
// 0x80, 0x100, 0x200: reserved for SSSE3, SSE3, SSE2

// The highest bit in the HWY_TARGETS mask that an x86 target can have. Used for
// dynamic dispatch. All x86 target bits must be lower or equal to
// (1 << HWY_HIGHEST_TARGET_BIT_X86) and they can only use
// HWY_MAX_DYNAMIC_TARGETS in total.
#define HWY_HIGHEST_TARGET_BIT_X86 9

#define HWY_SVE2 0x400
#define HWY_SVE 0x800
// 0x1000 reserved for Helium
#define HWY_NEON 0x2000

#define HWY_HIGHEST_TARGET_BIT_ARM 13

// 0x4000, 0x8000 reserved
#define HWY_PPC8 0x10000 // v2.07 or 3
// 0x20000, 0x40000 reserved for prior VSX/AltiVec

#define HWY_HIGHEST_TARGET_BIT_PPC 18

// 0x80000 reserved
#define HWY_WASM 0x100000

#define HWY_HIGHEST_TARGET_BIT_WASM 20

// 0x200000, 0x400000, 0x800000 reserved

#define HWY_RVV 0x1000000

#define HWY_HIGHEST_TARGET_BIT_RVV 24

// 0x2000000, 0x4000000, 0x8000000, 0x10000000 reserved

#define HWY_SCALAR 0x20000000

#define HWY_HIGHEST_TARGET_BIT_SCALAR 29

// Cannot use higher values, otherwise HWY_TARGETS computation might overflow.
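// For example, HWY_AVX2 == 16 == 1 << 4 and HWY_HIGHEST_TARGET_BIT_X86 == 9,
// so x86 targets occupy the low 10 bits; HWY_SCALAR == 1 << 29 is the highest
// value so that expressions such as (2 * HWY_STATIC_TARGET - 1) still fit in
// 32 bits.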

//------------------------------------------------------------------------------
// Set default blocklists

// Disabled means excluded from the enabled set at the user's request. A separate config
// macro allows disabling without deactivating the blocklist below.
#ifndef HWY_DISABLED_TARGETS
#define HWY_DISABLED_TARGETS 0
#endif

// Broken means excluded from the enabled set due to known compiler issues. Allow the
// user to override this blocklist without any guarantee of success.
#ifndef HWY_BROKEN_TARGETS

// x86 clang-6: we saw multiple AVX2/3 compile errors and in one case invalid
// SSE4 codegen (possibly only for msan), so disable all those targets.
#if HWY_ARCH_X86 && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
#define HWY_BROKEN_TARGETS (HWY_SSE4 | HWY_AVX2 | HWY_AVX3)
// This entails a major speed reduction, so warn unless the user explicitly
// opts in to scalar-only.
#if !defined(HWY_COMPILE_ONLY_SCALAR)
#pragma message("x86 Clang <= 6: define HWY_COMPILE_ONLY_SCALAR or upgrade.")
#endif

// 32-bit may fail to compile AVX2/3.
#elif HWY_ARCH_X86_32
#define HWY_BROKEN_TARGETS (HWY_AVX2 | HWY_AVX3)

// MSVC AVX3 support is buggy: https://github.com/Mysticial/Flops/issues/16
#elif HWY_COMPILER_MSVC != 0
#define HWY_BROKEN_TARGETS (HWY_AVX3)

// armv7be has not been tested and is not yet supported.
#elif HWY_ARCH_ARM_V7 && (defined(__ARM_BIG_ENDIAN) || defined(__BIG_ENDIAN))
#define HWY_BROKEN_TARGETS (HWY_NEON)

#else
#define HWY_BROKEN_TARGETS 0
#endif

#endif // HWY_BROKEN_TARGETS

// Enabled means not disabled nor blocklisted.
#define HWY_ENABLED(targets) \
((targets) & ~((HWY_DISABLED_TARGETS) | (HWY_BROKEN_TARGETS)))
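// Worked example with hypothetical settings: if HWY_DISABLED_TARGETS ==
// HWY_SSE4 and HWY_BROKEN_TARGETS == HWY_AVX3, then
// HWY_ENABLED(HWY_SSE4 | HWY_AVX2 | HWY_AVX3)
// == (0x40 | 0x10 | 0x8) & ~(0x40 | 0x8) == 0x10 == HWY_AVX2.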

//------------------------------------------------------------------------------
// Detect baseline targets using predefined macros

// Baseline means the targets for which the compiler is allowed to generate
// instructions, implying the target CPU would have to support them. Do not use
// this directly because it does not take the blocklist into account. Allow the
// user to override this without any guarantee of success.
#ifndef HWY_BASELINE_TARGETS

// Also check HWY_ARCH to ensure that simulating unknown platforms ends up with
// HWY_TARGET == HWY_SCALAR.

#if HWY_ARCH_WASM && defined(__wasm_simd128__)
#define HWY_BASELINE_WASM HWY_WASM
#else
#define HWY_BASELINE_WASM 0
#endif

// Avoid choosing the PPC target until we have an implementation.
#if HWY_ARCH_PPC && defined(__VSX__) && 0
#define HWY_BASELINE_PPC8 HWY_PPC8
#else
#define HWY_BASELINE_PPC8 0
#endif

// Avoid choosing the SVE[2] targets until the implementation is ready.
#if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE2) && 0
#define HWY_BASELINE_SVE2 HWY_SVE2
#else
#define HWY_BASELINE_SVE2 0
#endif

#if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE) && 0
#define HWY_BASELINE_SVE HWY_SVE
#else
#define HWY_BASELINE_SVE 0
#endif

// GCC 4.5.4 only defines __ARM_NEON__; 5.4 defines both.
#if HWY_ARCH_ARM && (defined(__ARM_NEON__) || defined(__ARM_NEON))
#define HWY_BASELINE_NEON HWY_NEON
#else
#define HWY_BASELINE_NEON 0
#endif

// MSVC does not set SSE4_1, but it does set AVX; checking for the latter means
// we at least get SSE4 on machines supporting AVX but not AVX2.
// https://stackoverflow.com/questions/18563978/
#if HWY_ARCH_X86 && \
(defined(__SSE4_1__) || (HWY_COMPILER_MSVC != 0 && defined(__AVX__)))
#define HWY_BASELINE_SSE4 HWY_SSE4
#else
#define HWY_BASELINE_SSE4 0
#endif

#if HWY_ARCH_X86 && defined(__AVX2__)
#define HWY_BASELINE_AVX2 HWY_AVX2
#else
#define HWY_BASELINE_AVX2 0
#endif

#if HWY_ARCH_X86 && defined(__AVX512F__)
#define HWY_BASELINE_AVX3 HWY_AVX3
#else
#define HWY_BASELINE_AVX3 0
#endif

#if HWY_ARCH_RVV && defined(__riscv_vector)
#define HWY_BASELINE_RVV HWY_RVV
#else
#define HWY_BASELINE_RVV 0
#endif

#define HWY_BASELINE_TARGETS \
(HWY_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 | HWY_BASELINE_SVE2 | \
HWY_BASELINE_SVE | HWY_BASELINE_NEON | HWY_BASELINE_SSE4 | \
HWY_BASELINE_AVX2 | HWY_BASELINE_AVX3 | HWY_BASELINE_RVV)

#endif // HWY_BASELINE_TARGETS

//------------------------------------------------------------------------------
// Choose target for static dispatch

#define HWY_ENABLED_BASELINE HWY_ENABLED(HWY_BASELINE_TARGETS)
#if HWY_ENABLED_BASELINE == 0
#error "At least one baseline target must be defined and enabled"
#endif

// Best baseline, used for static dispatch. This is the least-significant 1-bit
// within HWY_ENABLED_BASELINE and lower bit values imply "better".
#define HWY_STATIC_TARGET (HWY_ENABLED_BASELINE & -HWY_ENABLED_BASELINE)
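// Worked example: if HWY_ENABLED_BASELINE were HWY_AVX2 | HWY_SSE4 |
// HWY_SCALAR == 0x20000050, then 0x20000050 & -0x20000050 == 0x10 ==
// HWY_AVX2, i.e. the lowest (best) enabled bit.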

// Start by assuming static dispatch. If we later use dynamic dispatch, this
// will be defined to other targets during the multiple-inclusion, and finally
// return to the initial value. Defining this outside begin/end_target ensures
// inl headers successfully compile by themselves (required by Bazel).
#define HWY_TARGET HWY_STATIC_TARGET

//------------------------------------------------------------------------------
// Choose targets for dynamic dispatch according to one of four policies

#if (defined(HWY_COMPILE_ONLY_SCALAR) + defined(HWY_COMPILE_ONLY_STATIC) + \
defined(HWY_COMPILE_ALL_ATTAINABLE)) > 1
#error "Invalid config: can only define a single policy for targets"
#endif

// Attainable means enabled and the compiler allows intrinsics (even when not
// allowed to autovectorize). Used in 3 and 4.
#if HWY_ARCH_X86
#define HWY_ATTAINABLE_TARGETS \
HWY_ENABLED(HWY_SCALAR | HWY_SSE4 | HWY_AVX2 | HWY_AVX3)
#else
#define HWY_ATTAINABLE_TARGETS HWY_ENABLED_BASELINE
#endif

// 1) For older compilers: disable all SIMD (could also set HWY_DISABLED_TARGETS
// to ~HWY_SCALAR, but this is more explicit).
#if defined(HWY_COMPILE_ONLY_SCALAR)
#undef HWY_STATIC_TARGET
#define HWY_STATIC_TARGET HWY_SCALAR // override baseline
#define HWY_TARGETS HWY_SCALAR

// 2) For forcing static dispatch without code changes (removing HWY_EXPORT)
#elif defined(HWY_COMPILE_ONLY_STATIC)
#define HWY_TARGETS HWY_STATIC_TARGET

// 3) For tests: include all attainable targets (in particular: scalar)
#elif defined(HWY_COMPILE_ALL_ATTAINABLE) || defined(HWY_IS_TEST)
#define HWY_TARGETS HWY_ATTAINABLE_TARGETS

// 4) Default: attainable WITHOUT non-best baseline. This reduces code size by
// excluding superseded targets, in particular scalar.
#else
#define HWY_TARGETS (HWY_ATTAINABLE_TARGETS & (2 * HWY_STATIC_TARGET - 1))

#endif // target policy
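// Worked example for the default policy: if HWY_STATIC_TARGET == HWY_SSE4
// (0x40), then (2 * HWY_STATIC_TARGET - 1) == 0x7F keeps SSE4 and the
// better (lower-valued) AVX2/AVX3 bits while masking out worse targets such
// as HWY_SCALAR (0x20000000).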

// HWY_ONCE and the multiple-inclusion mechanism rely on HWY_STATIC_TARGET being
// one of the dynamic targets. This also implies HWY_TARGETS != 0 and
// (HWY_TARGETS & HWY_ENABLED_BASELINE) != 0.
#if (HWY_TARGETS & HWY_STATIC_TARGET) == 0
#error "Logic error: best baseline should be included in dynamic targets"
#endif

//------------------------------------------------------------------------------
#include "hwy/detect_targets.h"

namespace hwy {

// Returns (cached) bitfield of enabled targets that are supported on this CPU.
// Implemented in supported_targets.cc; unconditionally compiled to support the
// use case of binary-only distributions. The HWY_SUPPORTED_TARGETS wrapper may
// allow eliding calls to this function.
// Implemented in targets.cc; unconditionally compiled to support the use case
// of binary-only distributions. The HWY_SUPPORTED_TARGETS wrapper may allow
// eliding calls to this function.
uint32_t SupportedTargets();

// Evaluates to a function call, or literal if there is a single target.
#if (HWY_TARGETS & (HWY_TARGETS - 1)) == 0
#define HWY_SUPPORTED_TARGETS HWY_TARGETS
#else
#define HWY_SUPPORTED_TARGETS hwy::SupportedTargets()
#endif
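// The #if above is the usual power-of-two test: x & (x - 1) == 0 exactly when
// at most one bit of x is set, i.e. a single target is compiled. In that case
// HWY_SUPPORTED_TARGETS is a compile-time constant and the SupportedTargets()
// call can be elided.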

// Disable from runtime dispatch the mask of compiled-in targets. Targets that
// were not enabled at compile time are ignored. This function is useful to
// disable a target supported by the CPU that is known to have bugs or when a
@@ -304,17 +46,9 @@ uint32_t SupportedTargets();
// returns at least the baseline target.
void DisableTargets(uint32_t disabled_targets);

// Single target: reduce code size by eliding the call and conditional branches
// inside Choose*() functions.
#if (HWY_TARGETS & (HWY_TARGETS - 1)) == 0
#define HWY_SUPPORTED_TARGETS HWY_TARGETS
#else
#define HWY_SUPPORTED_TARGETS hwy::SupportedTargets()
#endif

// Set the mock mask of CPU supported targets instead of the actual CPU
// supported targets computed in SupportedTargets(). The return value of
// SupportedTargets() will still be affected by the DisabledTargets() mask
// SupportedTargets() will still be affected by the DisableTargets() mask
// regardless of this mock, to prevent accidentally adding targets that are
// known to be buggy in the current CPU. Call with a mask of 0 to disable the
// mock and use the actual CPU supported targets instead.

@@ -340,12 +74,16 @@ HWY_INLINE std::vector<uint32_t> SupportedAndGeneratedTargets() {
static inline HWY_MAYBE_UNUSED const char* TargetName(uint32_t target) {
switch (target) {
#if HWY_ARCH_X86
case HWY_SSSE3:
return "SSSE3";
case HWY_SSE4:
return "SSE4";
case HWY_AVX2:
return "AVX2";
case HWY_AVX3:
return "AVX3";
case HWY_AVX3_DL:
return "AVX3_DL";
#endif

#if HWY_ARCH_ARM

@@ -423,17 +161,17 @@ static inline HWY_MAYBE_UNUSED const char* TargetName(uint32_t target) {
// HWY_MAX_DYNAMIC_TARGETS) bit. This list must contain exactly
// HWY_MAX_DYNAMIC_TARGETS elements and does not include SCALAR. The first entry
// corresponds to the best target. Don't include a "," at the end of the list.
#define HWY_CHOOSE_TARGET_LIST(func_name) \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
HWY_CHOOSE_AVX3(func_name), /* AVX3 */ \
HWY_CHOOSE_AVX2(func_name), /* AVX2 */ \
nullptr, /* AVX */ \
HWY_CHOOSE_SSE4(func_name), /* SSE4 */ \
nullptr, /* SSSE3 */ \
nullptr, /* SSE3 */ \
nullptr /* SSE2 */
#define HWY_CHOOSE_TARGET_LIST(func_name) \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
HWY_CHOOSE_AVX3_DL(func_name), /* AVX3_DL */ \
HWY_CHOOSE_AVX3(func_name), /* AVX3 */ \
HWY_CHOOSE_AVX2(func_name), /* AVX2 */ \
nullptr, /* AVX */ \
HWY_CHOOSE_SSE4(func_name), /* SSE4 */ \
HWY_CHOOSE_SSSE3(func_name), /* SSSE3 */ \
nullptr, /* SSE3 */ \
nullptr /* SSE2 */
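// The list appears to be indexed by target bit: entry i corresponds to the
// target whose value is 1 << i, so e.g. index 4 maps to HWY_AVX2 == 16 ==
// 1 << 4, and entries for reserved bit values stay nullptr.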

#elif HWY_ARCH_ARM
// See HWY_ARCH_X86 above for details.

@@ -460,11 +198,11 @@ static inline HWY_MAYBE_UNUSED const char* TargetName(uint32_t target) {
// See HWY_ARCH_X86 above for details.
#define HWY_MAX_DYNAMIC_TARGETS 4
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_WASM
#define HWY_CHOOSE_TARGET_LIST(func_name) \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
HWY_CHOOSE_WASM(func_name) /* WASM */
#define HWY_CHOOSE_TARGET_LIST(func_name) \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
HWY_CHOOSE_WASM2(func_name), /* WASM2 */ \
HWY_CHOOSE_WASM(func_name) /* WASM */

#elif HWY_ARCH_RVV
// See HWY_ARCH_X86 above for details.

@@ -23,10 +23,14 @@ namespace fake {
uint32_t FakeFunction(int) { return HWY_##TGT; } \
}

DECLARE_FUNCTION(AVX3_DL)
DECLARE_FUNCTION(AVX3)
DECLARE_FUNCTION(AVX2)
DECLARE_FUNCTION(SSE4)
DECLARE_FUNCTION(SSSE3)
DECLARE_FUNCTION(NEON)
DECLARE_FUNCTION(SVE)
DECLARE_FUNCTION(SVE2)
DECLARE_FUNCTION(PPC8)
DECLARE_FUNCTION(WASM)
DECLARE_FUNCTION(RVV)

@@ -49,10 +53,14 @@ void CheckFakeFunction() {
/* Second call uses the cached value from the previous call. */ \
EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \
}
CHECK_ARRAY_ENTRY(AVX3_DL)
CHECK_ARRAY_ENTRY(AVX3)
CHECK_ARRAY_ENTRY(AVX2)
CHECK_ARRAY_ENTRY(SSE4)
CHECK_ARRAY_ENTRY(SSSE3)
CHECK_ARRAY_ENTRY(NEON)
CHECK_ARRAY_ENTRY(SVE)
CHECK_ARRAY_ENTRY(SVE2)
CHECK_ARRAY_ENTRY(PPC8)
CHECK_ARRAY_ENTRY(WASM)
CHECK_ARRAY_ENTRY(RVV)

@@ -84,14 +92,14 @@ TEST_F(HwyTargetsTest, DisabledTargetsTest) {

DisableTargets(0); // Reset the mask.
uint32_t current_targets = SupportedTargets();
if ((current_targets & ~HWY_ENABLED_BASELINE) == 0) {
if ((current_targets & ~uint32_t(HWY_ENABLED_BASELINE)) == 0) {
// We can't test anything else if the only compiled target is the baseline.
return;
}
// Get the lowest bit in the mask (the best target) and disable that one.
uint32_t lowest_target = current_targets & (~current_targets + 1);
// The lowest target shouldn't be one in the baseline.
HWY_ASSERT((lowest_target & ~HWY_ENABLED_BASELINE) != 0);
HWY_ASSERT((lowest_target & ~uint32_t(HWY_ENABLED_BASELINE)) != 0);
DisableTargets(lowest_target);

// Check that the other targets are still enabled.

@@ -100,3 +108,9 @@ TEST_F(HwyTargetsTest, DisabledTargetsTest) {
}

} // namespace hwy

// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char **argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}

@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>

@@ -40,7 +41,7 @@ struct TestPlusMinus {
for (size_t i = 0; i < N; ++i) {
lanes[i] = static_cast<T>((2 + i) + (3 + i));
}
HWY_ASSERT_VEC_EQ(d, lanes.get(), v2 + v3);
HWY_ASSERT_VEC_EQ(d, lanes.get(), Add(v2, v3));
HWY_ASSERT_VEC_EQ(d, Set(d, 2), Sub(v4, v2));

for (size_t i = 0; i < N; ++i) {

@@ -199,7 +200,7 @@ struct TestLeftShifts {

// 1
for (size_t i = 0; i < N; ++i) {
const T value = kSigned ? T(i) - T(N) : T(i);
const T value = kSigned ? T(T(i) - T(N)) : T(i);
expected[i] = T(TU(value) << 1);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<1>(values));

@@ -207,7 +208,7 @@ struct TestLeftShifts {

// max
for (size_t i = 0; i < N; ++i) {
const T value = kSigned ? T(i) - T(N) : T(i);
const T value = kSigned ? T(T(i) - T(N)) : T(i);
expected[i] = T(TU(value) << kMaxShift);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<kMaxShift>(values));

@@ -301,6 +302,43 @@ struct TestUnsignedRightShifts {
}
};

struct TestRotateRight {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const size_t N = Lanes(d);
auto expected = AllocateAligned<T>(N);

constexpr size_t kBits = sizeof(T) * 8;
const auto mask_shift = Set(d, T{kBits});
// Cover as many bit positions as possible to test shifting out
const auto values = Shl(Set(d, T{1}), And(Iota(d, 0), mask_shift));

// Rotate by 0
HWY_ASSERT_VEC_EQ(d, values, RotateRight<0>(values));

// Rotate by 1
Store(values, d, expected.get());
for (size_t i = 0; i < N; ++i) {
expected[i] = (expected[i] >> 1) | (expected[i] << (kBits - 1));
}
HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<1>(values));

// Rotate by half
Store(values, d, expected.get());
for (size_t i = 0; i < N; ++i) {
expected[i] = (expected[i] >> (kBits / 2)) | (expected[i] << (kBits / 2));
}
HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<kBits / 2>(values));

// Rotate by max
Store(values, d, expected.get());
for (size_t i = 0; i < N; ++i) {
expected[i] = (expected[i] >> (kBits - 1)) | (expected[i] << 1);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<kBits - 1>(values));
}
};
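// For reference, a scalar model of the per-lane rotation the assertions above
// expect (a sketch with a hypothetical name, for unsigned T and
// 0 < kRot < kBits; rotation by 0 is the identity, as tested first):
template <typename T>
static constexpr T ScalarRotateRight(T x, int kRot) {
  return static_cast<T>((x >> kRot) | (x << (int(sizeof(T) * 8) - kRot)));
}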

struct TestVariableUnsignedRightShifts {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {

@@ -353,9 +391,9 @@ T RightShiftNegative(T val) {
TU bits;
CopyBytes<sizeof(T)>(&val, &bits);

const TU shifted = bits >> kAmount;
const TU shifted = TU(bits >> kAmount);

const TU all = ~TU(0);
const TU all = TU(~TU(0));
const size_t num_zero = sizeof(TU) * 8 - 1 - kAmount;
const TU sign_extended = static_cast<TU>((all << num_zero) & LimitsMax<TU>());

@@ -376,7 +414,7 @@ class TestSignedRightShifts {

// First test positive values, negative are checked below.
const auto v0 = Zero(d);
const auto values = Iota(d, 0) & Set(d, kMax);
const auto values = And(Iota(d, 0), Set(d, kMax));

// Shift by 0
HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values));

@@ -512,6 +550,14 @@ HWY_NOINLINE void TestAllVariableShifts() {
#endif
}

HWY_NOINLINE void TestAllRotateRight() {
const ForPartialVectors<TestRotateRight> test;
test(uint32_t());
#if HWY_CAP_INTEGER64
test(uint64_t());
#endif
}

struct TestUnsignedMinMax {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {

@@ -616,7 +662,7 @@ struct TestUnsignedMul {
for (size_t i = 0; i < N; ++i) {
expected[i] = static_cast<T>((1 + i) * (1 + i));
}
HWY_ASSERT_VEC_EQ(d, expected.get(), vi * vi);
HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vi, vi));

for (size_t i = 0; i < N; ++i) {
expected[i] = static_cast<T>((1 + i) * (3 + i));

@@ -655,8 +701,8 @@ struct TestSignedMul {
}
HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vi, vi));

for (int i = 0; i < static_cast<int>(N); ++i) {
expected[i] = static_cast<T>((-T(N) + i) * (1 + i));
for (size_t i = 0; i < N; ++i) {
expected[i] = static_cast<T>((-T(N) + T(i)) * T(1u + i));
}
HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vn, vi));
HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vi, vn));

@@ -695,22 +741,22 @@ struct TestMulHigh {

// Large positive squared
for (size_t i = 0; i < N; ++i) {
in_lanes[i] = LimitsMax<T>() >> i;
expected_lanes[i] = (Wide(in_lanes[i]) * in_lanes[i]) >> 16;
in_lanes[i] = T(LimitsMax<T>() >> i);
expected_lanes[i] = T((Wide(in_lanes[i]) * in_lanes[i]) >> 16);
}
auto v = Load(d, in_lanes.get());
HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(v, v));

// Large positive * small positive
for (int i = 0; i < static_cast<int>(N); ++i) {
expected_lanes[i] = static_cast<T>((Wide(in_lanes[i]) * T(1 + i)) >> 16);
for (size_t i = 0; i < N; ++i) {
expected_lanes[i] = T((Wide(in_lanes[i]) * T(1u + i)) >> 16);
}
HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(v, vi));
HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(vi, v));

// Large positive * small negative
for (size_t i = 0; i < N; ++i) {
expected_lanes[i] = (Wide(in_lanes[i]) * T(i - N)) >> 16;
expected_lanes[i] = T((Wide(in_lanes[i]) * T(i - N)) >> 16);
}
HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(v, vni));
HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(vni, v));

@@ -747,10 +793,52 @@ struct TestMulEven {
}
};

struct TestMulEvenOdd64 {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
#if HWY_TARGET != HWY_SCALAR
const auto v0 = Zero(d);
HWY_ASSERT_VEC_EQ(d, Zero(d), MulEven(v0, v0));
HWY_ASSERT_VEC_EQ(d, Zero(d), MulOdd(v0, v0));

const size_t N = Lanes(d);
if (N == 1) return;

auto in1 = AllocateAligned<T>(N);
auto in2 = AllocateAligned<T>(N);
auto expected_even = AllocateAligned<T>(N);
auto expected_odd = AllocateAligned<T>(N);

// Random inputs in each lane
RandomState rng;
for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
for (size_t i = 0; i < N; ++i) {
in1[i] = Random64(&rng);
in2[i] = Random64(&rng);
}

for (size_t i = 0; i < N; i += 2) {
expected_even[i] = Mul128(in1[i], in2[i], &expected_even[i + 1]);
expected_odd[i] = Mul128(in1[i + 1], in2[i + 1], &expected_odd[i + 1]);
}

const auto a = Load(d, in1.get());
const auto b = Load(d, in2.get());
HWY_ASSERT_VEC_EQ(d, expected_even.get(), MulEven(a, b));
HWY_ASSERT_VEC_EQ(d, expected_odd.get(), MulOdd(a, b));
}
#else
(void)d;
#endif // HWY_TARGET != HWY_SCALAR
}
};
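// Mul128 (from the test utilities) evidently returns the low 64 bits of the
// 128-bit product and stores the high 64 bits through the pointer argument,
// so each even/odd pair of lanes above holds one full 128-bit result.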

HWY_NOINLINE void TestAllMulEven() {
ForPartialVectors<TestMulEven> test;
ForExtendableVectors<TestMulEven> test;
test(int32_t());
test(uint32_t());

ForGE128Vectors<TestMulEvenOdd64>()(uint64_t());
}

struct TestMulAdd {

@@ -784,7 +872,8 @@ struct TestMulAdd {
HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulAdd(Neg(v2), v2, v1));

for (size_t i = 0; i < N; ++i) {
expected[i] = -T(i + 2) * (i + 2) + (1 + i);
expected[i] =
T(-T(i + 2u) * static_cast<T>(i + 2) + static_cast<T>(1 + i));
}
HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulAdd(v2, v2, v1));

@@ -819,6 +908,78 @@ HWY_NOINLINE void TestAllMulAdd() {
ForFloatTypes(ForPartialVectors<TestMulAdd>());
}

struct TestReorderWidenMulAccumulate {
template <typename TN, class DN>
HWY_NOINLINE void operator()(TN /*unused*/, DN dn) {
using TW = MakeWide<TN>;
const RepartitionToWide<DN> dw;
const auto f0 = Zero(dw);
const auto f1 = Set(dw, 1.0f);
const auto fi = Iota(dw, 1);
const auto bf0 = ReorderDemote2To(dn, f0, f0);
const auto bf1 = ReorderDemote2To(dn, f1, f1);
const auto bfi = ReorderDemote2To(dn, fi, fi);
const size_t NW = Lanes(dw);
auto delta = AllocateAligned<TW>(2 * NW);
for (size_t i = 0; i < 2 * NW; ++i) {
delta[i] = 0.0f;
}

// Any input zero => both outputs zero
auto sum1 = f0;
HWY_ASSERT_VEC_EQ(dw, f0,
ReorderWidenMulAccumulate(dw, bf0, bf0, f0, sum1));
HWY_ASSERT_VEC_EQ(dw, f0, sum1);
HWY_ASSERT_VEC_EQ(dw, f0,
ReorderWidenMulAccumulate(dw, bf0, bfi, f0, sum1));
HWY_ASSERT_VEC_EQ(dw, f0, sum1);
HWY_ASSERT_VEC_EQ(dw, f0,
ReorderWidenMulAccumulate(dw, bfi, bf0, f0, sum1));
HWY_ASSERT_VEC_EQ(dw, f0, sum1);

// delta[p] := 1.0, all others zero. For each p: Dot(delta, all-ones) == 1.
for (size_t p = 0; p < 2 * NW; ++p) {
delta[p] = 1.0f;
const auto delta0 = Load(dw, delta.get() + 0);
const auto delta1 = Load(dw, delta.get() + NW);
delta[p] = 0.0f;
const auto bf_delta = ReorderDemote2To(dn, delta0, delta1);

{
sum1 = f0;
const auto sum0 =
ReorderWidenMulAccumulate(dw, bf_delta, bf1, f0, sum1);
HWY_ASSERT_EQ(1.0f, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
}
// Swapped arg order
{
sum1 = f0;
const auto sum0 =
ReorderWidenMulAccumulate(dw, bf1, bf_delta, f0, sum1);
HWY_ASSERT_EQ(1.0f, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
}
// Start with nonzero sum0 or sum1
{
sum1 = delta1;
const auto sum0 =
ReorderWidenMulAccumulate(dw, bf_delta, bf1, delta0, sum1);
HWY_ASSERT_EQ(2.0f, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
}
// Start with nonzero sum0 or sum1, and swap arg order
{
sum1 = delta1;
const auto sum0 =
ReorderWidenMulAccumulate(dw, bf1, bf_delta, delta0, sum1);
HWY_ASSERT_EQ(2.0f, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
}
}
}
};
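// Note: only the horizontal total of sum0 + sum1 is checked because the lane
// order in which ReorderWidenMulAccumulate accumulates the widened products
// is apparently implementation-defined (hence "Reorder" in the name).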

HWY_NOINLINE void TestAllReorderWidenMulAccumulate() {
ForShrinkableVectors<TestReorderWidenMulAccumulate>()(bfloat16_t());
}

struct TestDiv {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {

@@ -852,13 +1013,24 @@ struct TestApproximateReciprocal {
Store(ApproximateReciprocal(nonzero), d, actual.get());

double max_l1 = 0.0;
double worst_expected = 0.0;
double worst_actual = 0.0;
for (size_t i = 0; i < N; ++i) {
max_l1 = std::max<double>(max_l1, std::abs((1.0 / input[i]) - actual[i]));
const double expected = 1.0 / input[i];
const double l1 = std::abs(expected - actual[i]);
if (l1 > max_l1) {
max_l1 = l1;
worst_expected = expected;
worst_actual = actual[i];
}
}
const double abs_worst_expected = std::abs(worst_expected);
if (abs_worst_expected > 1E-5) {
const double max_rel = max_l1 / abs_worst_expected;
fprintf(stderr, "max l1 %f rel %f (%f vs %f)\n", max_l1, max_rel,
worst_expected, worst_actual);
HWY_ASSERT(max_rel < 0.004);
}
const double max_rel = max_l1 / std::abs(1.0 / input[N - 1]);
printf("max err %f\n", max_rel);

HWY_ASSERT(max_rel < 0.002);
}
};

@@ -870,7 +1042,7 @@ struct TestSquareRoot {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const auto vi = Iota(d, 0);
HWY_ASSERT_VEC_EQ(d, vi, Sqrt(vi * vi));
HWY_ASSERT_VEC_EQ(d, vi, Sqrt(Mul(vi, vi)));
}
};

@@ -888,7 +1060,11 @@ struct TestReciprocalSquareRoot {
for (size_t i = 0; i < N; ++i) {
float err = lanes[i] - 0.090166f;
if (err < 0.0f) err = -err;
HWY_ASSERT(err < 1E-4f);
if (err >= 4E-4f) {
HWY_ABORT("Lane %" PRIu64 "(%" PRIu64 "): actual %f err %f\n",
static_cast<uint64_t>(i), static_cast<uint64_t>(N), lanes[i],
err);
}
}
}
};

@@ -901,36 +1077,53 @@ template <typename T, class D>
AlignedFreeUniquePtr<T[]> RoundTestCases(T /*unused*/, D d, size_t& padded) {
const T eps = std::numeric_limits<T>::epsilon();
const T test_cases[] = {
// +/- 1
T(1), T(-1),
// +/- 0
T(0), T(-0),
// near 0
T(0.4), T(-0.4),
// +/- integer
T(4), T(-32),
// positive near limit
MantissaEnd<T>() - T(1.5), MantissaEnd<T>() + T(1.5),
// negative near limit
-MantissaEnd<T>() - T(1.5), -MantissaEnd<T>() + T(1.5),
// +/- huge (but still fits in float)
T(1E34), T(-1E35),
// positive tiebreak
T(1.5), T(2.5),
// negative tiebreak
T(-1.5), T(-2.5),
// positive +/- delta
T(2.0001), T(3.9999),
// negative +/- delta
T(-999.9999), T(-998.0001),
// positive +/- epsilon
T(1) + eps, T(1) - eps,
// negative +/- epsilon
T(-1) + eps, T(-1) - eps,
// +/- infinity
std::numeric_limits<T>::infinity(), -std::numeric_limits<T>::infinity(),
// qNaN
GetLane(NaN(d))};
// +/- 1
T(1),
T(-1),
// +/- 0
T(0),
T(-0),
// near 0
T(0.4),
T(-0.4),
// +/- integer
T(4),
T(-32),
// positive near limit
MantissaEnd<T>() - T(1.5),
MantissaEnd<T>() + T(1.5),
// negative near limit
-MantissaEnd<T>() - T(1.5),
-MantissaEnd<T>() + T(1.5),
// positive tiebreak
T(1.5),
T(2.5),
// negative tiebreak
T(-1.5),
T(-2.5),
// positive +/- delta
T(2.0001),
T(3.9999),
// negative +/- delta
T(-999.9999),
T(-998.0001),
// positive +/- epsilon
T(1) + eps,
T(1) - eps,
// negative +/- epsilon
T(-1) + eps,
T(-1) - eps,
#if !defined(HWY_EMULATE_SVE) // these are not safe to just cast to int
// +/- huge (but still fits in float)
T(1E34),
T(-1E35),
// +/- infinity
std::numeric_limits<T>::infinity(),
-std::numeric_limits<T>::infinity(),
// qNaN
GetLane(NaN(d))
#endif
};
const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
const size_t N = Lanes(d);
padded = RoundUpTo(kNumTestCases, N); // allow loading whole vectors
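// For example, when the #if block above is included there are 29 test cases,
// so N == 8 yields padded == 32 (the next multiple of N).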

@@ -952,7 +1145,7 @@ struct TestRound {
// Avoid [std::]round, which does not round to nearest *even*.
// NOTE: std:: version from C++11 cmath is not defined in RVV GCC, see
// https://lists.freebsd.org/pipermail/freebsd-current/2014-January/048130.html
expected[i] = nearbyint(in[i]);
expected[i] = static_cast<T>(nearbyint(in[i]));
}
for (size_t i = 0; i < padded; i += Lanes(d)) {
HWY_ASSERT_VEC_EQ(d, &expected[i], Round(Load(d, &in[i])));

@@ -983,7 +1176,7 @@ struct TestNearestInt {
// Avoid undefined result for lrintf
expected[i] = std::signbit(in[i]) ? LimitsMin<TI>() : LimitsMax<TI>();
} else {
expected[i] = lrintf(in[i]);
expected[i] = static_cast<TI>(lrintf(in[i]));
}
}
for (size_t i = 0; i < padded; i += Lanes(df)) {

@@ -1008,7 +1201,7 @@ struct TestTrunc {
for (size_t i = 0; i < padded; ++i) {
// NOTE: std:: version from C++11 cmath is not defined in RVV GCC, see
// https://lists.freebsd.org/pipermail/freebsd-current/2014-January/048130.html
expected[i] = trunc(in[i]);
expected[i] = static_cast<T>(trunc(in[i]));
}
for (size_t i = 0; i < padded; i += Lanes(d)) {
HWY_ASSERT_VEC_EQ(d, &expected[i], Trunc(Load(d, &in[i])));

@@ -1074,30 +1267,20 @@ struct TestSumOfLanes {
in_lanes[i] = i < kBits ? static_cast<T>(1ull << i) : 0;
sum += static_cast<double>(in_lanes[i]);
}
HWY_ASSERT_VEC_EQ(d, Set(d, T(sum)), SumOfLanes(Load(d, in_lanes.get())));
HWY_ASSERT_VEC_EQ(d, Set(d, T(sum)),
SumOfLanes(d, Load(d, in_lanes.get())));

// Lane i = i (iota) to include upper lanes
sum = 0.0;
for (size_t i = 0; i < N; ++i) {
sum += static_cast<double>(i);
}
HWY_ASSERT_VEC_EQ(d, Set(d, T(sum)), SumOfLanes(Iota(d, 0)));
HWY_ASSERT_VEC_EQ(d, Set(d, T(sum)), SumOfLanes(d, Iota(d, 0)));
}
};

HWY_NOINLINE void TestAllSumOfLanes() {
const ForPartialVectors<TestSumOfLanes> sum;

// No u8/u16/i8/i16.
sum(uint32_t());
sum(int32_t());

#if HWY_CAP_INTEGER64
sum(uint64_t());
sum(int64_t());
#endif

ForFloatTypes(sum);
ForUIF3264(ForPartialVectors<TestSumOfLanes>());
}

struct TestMinOfLanes {

@@ -1112,17 +1295,17 @@ struct TestMinOfLanes {
constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51);
for (size_t i = 0; i < N; ++i) {
in_lanes[i] = i < kBits ? static_cast<T>(1ull << i) : 2;
min = std::min(min, in_lanes[i]);
min = HWY_MIN(min, in_lanes[i]);
}
HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(Load(d, in_lanes.get())));
HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get())));

// Lane i = N - i to include upper lanes
min = HighestValue<T>();
for (size_t i = 0; i < N; ++i) {
in_lanes[i] = static_cast<T>(N - i); // no 8-bit T so no wraparound
min = std::min(min, in_lanes[i]);
min = HWY_MIN(min, in_lanes[i]);
}
HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(Load(d, in_lanes.get())));
HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get())));
}
};

@@ -1137,39 +1320,29 @@ struct TestMaxOfLanes {
constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51);
for (size_t i = 0; i < N; ++i) {
in_lanes[i] = i < kBits ? static_cast<T>(1ull << i) : 0;
max = std::max(max, in_lanes[i]);
max = HWY_MAX(max, in_lanes[i]);
}
HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(Load(d, in_lanes.get())));
HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get())));

// Lane i = i to include upper lanes
max = LowestValue<T>();
for (size_t i = 0; i < N; ++i) {
in_lanes[i] = static_cast<T>(i); // no 8-bit T so no wraparound
max = std::max(max, in_lanes[i]);
max = HWY_MAX(max, in_lanes[i]);
}
HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(Load(d, in_lanes.get())));
HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get())));
}
};

HWY_NOINLINE void TestAllMinMaxOfLanes() {
const ForPartialVectors<TestMinOfLanes> min;
const ForPartialVectors<TestMaxOfLanes> max;

// No u8/u16/i8/i16.
min(uint32_t());
max(uint32_t());
min(int32_t());
max(int32_t());

#if HWY_CAP_INTEGER64
min(uint64_t());
max(uint64_t());
min(int64_t());
max(int64_t());
#endif

ForFloatTypes(min);
ForFloatTypes(max);
const ForPartialVectors<TestMinOfLanes> test_min;
const ForPartialVectors<TestMaxOfLanes> test_max;
ForUIF3264(test_min);
ForUIF3264(test_max);
test_min(uint16_t());
test_max(uint16_t());
test_min(int16_t());
test_max(int16_t());
}

struct TestAbsDiff {

@@ -1219,12 +1392,14 @@ HWY_NOINLINE void TestAllNeg() {
HWY_AFTER_NAMESPACE();

#if HWY_ONCE

namespace hwy {
HWY_BEFORE_TEST(HwyArithmeticTest);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllPlusMinus);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSaturatingArithmetic);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllShifts);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllVariableShifts);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllRotateRight);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAverage);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAbs);

@@ -1232,6 +1407,7 @@ HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMul);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMulHigh);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMulEven);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMulAdd);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllReorderWidenMulAccumulate);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllDiv);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllApproximateReciprocal);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSquareRoot);

@@ -1246,4 +1422,11 @@ HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllFloor);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAbsDiff);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllNeg);
} // namespace hwy

// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}

#endif

@@ -0,0 +1,645 @@
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/blockwise_test.cc"
#include "hwy/foreach_target.h"
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

struct TestShiftBytes {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
// Scalar does not define Shift*Bytes.
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
const Repartition<uint8_t, D> du8;
const size_t N8 = Lanes(du8);

// Zero remains zero
const auto v0 = Zero(d);
HWY_ASSERT_VEC_EQ(d, v0, ShiftLeftBytes<1>(v0));
HWY_ASSERT_VEC_EQ(d, v0, ShiftLeftBytes<1>(d, v0));
HWY_ASSERT_VEC_EQ(d, v0, ShiftRightBytes<1>(d, v0));

// Zero after shifting out the high/low byte
auto bytes = AllocateAligned<uint8_t>(N8);
std::fill(bytes.get(), bytes.get() + N8, 0);
bytes[N8 - 1] = 0x7F;
const auto vhi = BitCast(d, Load(du8, bytes.get()));
bytes[N8 - 1] = 0;
bytes[0] = 0x7F;
const auto vlo = BitCast(d, Load(du8, bytes.get()));
HWY_ASSERT_VEC_EQ(d, v0, ShiftLeftBytes<1>(vhi));
HWY_ASSERT_VEC_EQ(d, v0, ShiftLeftBytes<1>(d, vhi));
HWY_ASSERT_VEC_EQ(d, v0, ShiftRightBytes<1>(d, vlo));

// Check expected result with Iota
const size_t N = Lanes(d);
auto in = AllocateAligned<T>(N);
const uint8_t* in_bytes = reinterpret_cast<const uint8_t*>(in.get());
const auto v = BitCast(d, Iota(du8, 1));
Store(v, d, in.get());

auto expected = AllocateAligned<T>(N);
uint8_t* expected_bytes = reinterpret_cast<uint8_t*>(expected.get());

const size_t kBlockSize = HWY_MIN(N8, 16);
for (size_t block = 0; block < N8; block += kBlockSize) {
expected_bytes[block] = 0;
memcpy(expected_bytes + block + 1, in_bytes + block, kBlockSize - 1);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftBytes<1>(v));
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftBytes<1>(d, v));

for (size_t block = 0; block < N8; block += kBlockSize) {
memcpy(expected_bytes + block, in_bytes + block + 1, kBlockSize - 1);
expected_bytes[block + kBlockSize - 1] = 0;
}
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightBytes<1>(d, v));
#else
(void)d;
#endif // #if HWY_TARGET != HWY_SCALAR
}
};

HWY_NOINLINE void TestAllShiftBytes() {
ForIntegerTypes(ForPartialVectors<TestShiftBytes>());
}

struct TestShiftLanes {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
// Scalar does not define Shift*Lanes.
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
const auto v = Iota(d, T(1));
const size_t N = Lanes(d);
auto expected = AllocateAligned<T>(N);

HWY_ASSERT_VEC_EQ(d, v, ShiftLeftLanes<0>(v));
HWY_ASSERT_VEC_EQ(d, v, ShiftLeftLanes<0>(d, v));
HWY_ASSERT_VEC_EQ(d, v, ShiftRightLanes<0>(d, v));

constexpr size_t kLanesPerBlock = 16 / sizeof(T);

for (size_t i = 0; i < N; ++i) {
expected[i] = (i % kLanesPerBlock) == 0 ? T(0) : T(i);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftLanes<1>(v));
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftLanes<1>(d, v));

for (size_t i = 0; i < N; ++i) {
const size_t mod = i % kLanesPerBlock;
expected[i] = mod == (kLanesPerBlock - 1) || i >= N - 1 ? T(0) : T(2 + i);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightLanes<1>(d, v));
#else
(void)d;
#endif // #if HWY_TARGET != HWY_SCALAR
}
};

HWY_NOINLINE void TestAllShiftLanes() {
ForAllTypes(ForPartialVectors<TestShiftLanes>());
}

template <typename D, int kLane>
struct TestBroadcastR {
HWY_NOINLINE void operator()() const {
using T = typename D::T;
const D d;
const size_t N = Lanes(d);
if (kLane >= N) return;
auto in_lanes = AllocateAligned<T>(N);
std::fill(in_lanes.get(), in_lanes.get() + N, T(0));
const size_t blockN = HWY_MIN(N * sizeof(T), 16) / sizeof(T);
// Need to set within each 128-bit block
for (size_t block = 0; block < N; block += blockN) {
in_lanes[block + kLane] = static_cast<T>(block + 1);
}
const auto in = Load(d, in_lanes.get());
auto expected = AllocateAligned<T>(N);
for (size_t block = 0; block < N; block += blockN) {
for (size_t i = 0; i < blockN; ++i) {
expected[block + i] = T(block + 1);
}
}
HWY_ASSERT_VEC_EQ(d, expected.get(), Broadcast<kLane>(in));

TestBroadcastR<D, kLane - 1>()();
}
};

template <class D>
struct TestBroadcastR<D, -1> {
void operator()() const {}
};
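// The recursion in TestBroadcastR walks kLane from the highest tested lane
// down to 0; this kLane == -1 specialization terminates it at compile time.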

struct TestBroadcast {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
TestBroadcastR<D, HWY_MIN(MaxLanes(d), 16 / sizeof(T)) - 1>()();
}
};

HWY_NOINLINE void TestAllBroadcast() {
const ForPartialVectors<TestBroadcast> test;
// No u/i8.
test(uint16_t());
test(int16_t());
ForUIF3264(test);
}

template <bool kFull>
struct ChooseTableSize {
template <typename T, typename DIdx>
using type = DIdx;
};
template <>
struct ChooseTableSize<true> {
template <typename T, typename DIdx>
using type = ScalableTag<T>;
};

template <bool kFull>
struct TestTableLookupBytes {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
#if HWY_TARGET != HWY_SCALAR
RandomState rng;
const typename ChooseTableSize<kFull>::template type<T, D> d_tbl;
const Repartition<uint8_t, decltype(d_tbl)> d_tbl8;
const size_t NT8 = Lanes(d_tbl8);

const Repartition<uint8_t, D> d8;
const size_t N = Lanes(d);
const size_t N8 = Lanes(d8);

// Random input bytes
auto in_bytes = AllocateAligned<uint8_t>(NT8);
for (size_t i = 0; i < NT8; ++i) {
in_bytes[i] = Random32(&rng) & 0xFF;
}
const auto in = BitCast(d_tbl, Load(d_tbl8, in_bytes.get()));

// Enough test data; for larger vectors, upper lanes will be zero.
const uint8_t index_bytes_source[64] = {
// Same index as source, multiple outputs from same input,
// unused input (9), ascending/descending and nonconsecutive neighbors.
0, 2, 1, 2, 15, 12, 13, 14, 6, 7, 8, 5, 4, 3, 10, 11,
11, 10, 3, 4, 5, 8, 7, 6, 14, 13, 12, 15, 2, 1, 2, 0,
4, 3, 2, 2, 5, 6, 7, 7, 15, 15, 15, 15, 15, 15, 0, 1};
auto index_bytes = AllocateAligned<uint8_t>(N8);
const size_t max_index = HWY_MIN(N8, 16) - 1;
for (size_t i = 0; i < N8; ++i) {
index_bytes[i] = (i < 64) ? index_bytes_source[i] : 0;
// Avoid asan error for partial vectors.
index_bytes[i] = static_cast<uint8_t>(HWY_MIN(index_bytes[i], max_index));
}
const auto indices = Load(d, reinterpret_cast<const T*>(index_bytes.get()));

auto expected = AllocateAligned<T>(N);
uint8_t* expected_bytes = reinterpret_cast<uint8_t*>(expected.get());

for (size_t block = 0; block < N8; block += 16) {
for (size_t i = 0; i < 16 && (block + i) < N8; ++i) {
const uint8_t index = index_bytes[block + i];
HWY_ASSERT(block + index < N8); // indices were already capped to N8.
// For large vectors, the lane index may wrap around due to block.
expected_bytes[block + i] = in_bytes[(block & 0xFF) + index];
}
}
HWY_ASSERT_VEC_EQ(d, expected.get(), TableLookupBytes(in, indices));

// Individually test zeroing each byte position.
for (size_t i = 0; i < N8; ++i) {
const uint8_t prev_expected = expected_bytes[i];
const uint8_t prev_index = index_bytes[i];
expected_bytes[i] = 0;

const int idx = 0x80 + (int(Random32(&rng) & 7) << 4);
HWY_ASSERT(0x80 <= idx && idx < 256);
index_bytes[i] = static_cast<uint8_t>(idx);

const auto indices =
Load(d, reinterpret_cast<const T*>(index_bytes.get()));
HWY_ASSERT_VEC_EQ(d, expected.get(), TableLookupBytesOr0(in, indices));
expected_bytes[i] = prev_expected;
index_bytes[i] = prev_index;
}
#else
(void)d;
#endif
}
};
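// TableLookupBytesOr0 evidently treats index bytes with the high bit set
// (here 0x80..0xF0) as "select zero", which is why expected_bytes[i] is set
// to 0 above before each lookup.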

HWY_NOINLINE void TestAllTableLookupBytes() {
// Partial index, same-sized table.
ForIntegerTypes(ForPartialVectors<TestTableLookupBytes<false>>());

// TODO(janwas): requires LMUL trunc/ext, which is not yet implemented.
#if HWY_TARGET != HWY_RVV
// Partial index, full-size table.
ForIntegerTypes(ForPartialVectors<TestTableLookupBytes<true>>());
#endif
}

struct TestInterleaveLower {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
using TU = MakeUnsigned<T>;
const size_t N = Lanes(d);
auto even_lanes = AllocateAligned<T>(N);
auto odd_lanes = AllocateAligned<T>(N);
auto expected = AllocateAligned<T>(N);
for (size_t i = 0; i < N; ++i) {
even_lanes[i] = static_cast<T>(2 * i + 0);
odd_lanes[i] = static_cast<T>(2 * i + 1);
}
const auto even = Load(d, even_lanes.get());
const auto odd = Load(d, odd_lanes.get());

const size_t blockN = HWY_MIN(16 / sizeof(T), N);
for (size_t i = 0; i < Lanes(d); ++i) {
const size_t block = i / blockN;
const size_t index = (i % blockN) + block * 2 * blockN;
expected[i] = static_cast<T>(index & LimitsMax<TU>());
}
HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveLower(even, odd));
HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveLower(d, even, odd));
}
};

struct TestInterleaveUpper {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const size_t N = Lanes(d);
if (N == 1) return;
auto even_lanes = AllocateAligned<T>(N);
auto odd_lanes = AllocateAligned<T>(N);
auto expected = AllocateAligned<T>(N);
for (size_t i = 0; i < N; ++i) {
even_lanes[i] = static_cast<T>(2 * i + 0);
odd_lanes[i] = static_cast<T>(2 * i + 1);
}
const auto even = Load(d, even_lanes.get());
const auto odd = Load(d, odd_lanes.get());

const size_t blockN = HWY_MIN(16 / sizeof(T), N);
for (size_t i = 0; i < Lanes(d); ++i) {
const size_t block = i / blockN;
expected[i] = T((i % blockN) + block * 2 * blockN + blockN);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveUpper(d, even, odd));
}
};

HWY_NOINLINE void TestAllInterleave() {
// Not DemoteVectors because this cannot be supported by HWY_SCALAR.
ForAllTypes(ForShrinkableVectors<TestInterleaveLower>());
ForAllTypes(ForShrinkableVectors<TestInterleaveUpper>());
}

struct TestZipLower {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
using WideT = MakeWide<T>;
static_assert(sizeof(T) * 2 == sizeof(WideT), "Must be double-width");
static_assert(IsSigned<T>() == IsSigned<WideT>(), "Must have same sign");
const size_t N = Lanes(d);
auto even_lanes = AllocateAligned<T>(N);
auto odd_lanes = AllocateAligned<T>(N);
for (size_t i = 0; i < N; ++i) {
even_lanes[i] = static_cast<T>(2 * i + 0);
odd_lanes[i] = static_cast<T>(2 * i + 1);
}
const auto even = Load(d, even_lanes.get());
const auto odd = Load(d, odd_lanes.get());

const Repartition<WideT, D> dw;
const size_t NW = Lanes(dw);
auto expected = AllocateAligned<WideT>(NW);
const size_t blockN = HWY_MIN(size_t(16) / sizeof(WideT), NW);

for (size_t i = 0; i < NW; ++i) {
const size_t block = i / blockN;
// Value of least-significant lane in lo-vector.
const size_t lo = 2u * (i % blockN) + 4u * block * blockN;
const size_t kBits = sizeof(T) * 8;
expected[i] = static_cast<WideT>((static_cast<WideT>(lo + 1) << kBits) +
static_cast<WideT>(lo));
}
HWY_ASSERT_VEC_EQ(dw, expected.get(), ZipLower(even, odd));
HWY_ASSERT_VEC_EQ(dw, expected.get(), ZipLower(dw, even, odd));
}
};

struct TestZipUpper {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
using WideT = MakeWide<T>;
static_assert(sizeof(T) * 2 == sizeof(WideT), "Must be double-width");
static_assert(IsSigned<T>() == IsSigned<WideT>(), "Must have same sign");
const size_t N = Lanes(d);
if (N < 16 / sizeof(T)) return;
auto even_lanes = AllocateAligned<T>(N);
auto odd_lanes = AllocateAligned<T>(N);
for (size_t i = 0; i < Lanes(d); ++i) {
even_lanes[i] = static_cast<T>(2 * i + 0);
odd_lanes[i] = static_cast<T>(2 * i + 1);
}
const auto even = Load(d, even_lanes.get());
const auto odd = Load(d, odd_lanes.get());

const Repartition<WideT, D> dw;
const size_t NW = Lanes(dw);
auto expected = AllocateAligned<WideT>(NW);
const size_t blockN = HWY_MIN(size_t(16) / sizeof(WideT), NW);

for (size_t i = 0; i < NW; ++i) {
const size_t block = i / blockN;
const size_t lo = 2u * (i % blockN) + 4u * block * blockN;
const size_t kBits = sizeof(T) * 8;
expected[i] = static_cast<WideT>(
(static_cast<WideT>(lo + 2 * blockN + 1) << kBits) +
static_cast<WideT>(lo + 2 * blockN));
}
HWY_ASSERT_VEC_EQ(dw, expected.get(), ZipUpper(dw, even, odd));
}
};

HWY_NOINLINE void TestAllZip() {
const ForDemoteVectors<TestZipLower> lower_unsigned;
// TODO(janwas): enable after LowerHalf available
#if HWY_TARGET != HWY_RVV
lower_unsigned(uint8_t());
#endif
lower_unsigned(uint16_t());
#if HWY_CAP_INTEGER64
lower_unsigned(uint32_t()); // generates u64
#endif

const ForDemoteVectors<TestZipLower> lower_signed;
#if HWY_TARGET != HWY_RVV
lower_signed(int8_t());
#endif
lower_signed(int16_t());
#if HWY_CAP_INTEGER64
lower_signed(int32_t()); // generates i64
#endif

const ForShrinkableVectors<TestZipUpper> upper_unsigned;
#if HWY_TARGET != HWY_RVV
upper_unsigned(uint8_t());
#endif
upper_unsigned(uint16_t());
#if HWY_CAP_INTEGER64
upper_unsigned(uint32_t()); // generates u64
#endif

const ForShrinkableVectors<TestZipUpper> upper_signed;
#if HWY_TARGET != HWY_RVV
upper_signed(int8_t());
#endif
upper_signed(int16_t());
#if HWY_CAP_INTEGER64
upper_signed(int32_t()); // generates i64
#endif

// No float - concatenating f32 does not result in an f64
|
||||
}
|
||||
|
||||
template <int kBytes>
struct TestCombineShiftRightBytesR {
  template <class T, class D>
  HWY_NOINLINE void operator()(T t, D d) {
// Scalar does not define CombineShiftRightBytes.
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
    const size_t kBlockSize = 16;
    static_assert(kBytes < kBlockSize, "Shift count is per block");
    const Repartition<uint8_t, D> d8;
    const size_t N8 = Lanes(d8);
    if (N8 < 16) return;
    auto hi_bytes = AllocateAligned<uint8_t>(N8);
    auto lo_bytes = AllocateAligned<uint8_t>(N8);
    auto expected_bytes = AllocateAligned<uint8_t>(N8);
    uint8_t combined[2 * kBlockSize];

    // Random inputs in each lane
    RandomState rng;
    for (size_t rep = 0; rep < AdjustedReps(100); ++rep) {
      for (size_t i = 0; i < N8; ++i) {
        hi_bytes[i] = static_cast<uint8_t>(Random64(&rng) & 0xFF);
        lo_bytes[i] = static_cast<uint8_t>(Random64(&rng) & 0xFF);
      }
      for (size_t i = 0; i < N8; i += kBlockSize) {
        CopyBytes<kBlockSize>(&lo_bytes[i], combined);
        CopyBytes<kBlockSize>(&hi_bytes[i], combined + kBlockSize);
        CopyBytes<kBlockSize>(combined + kBytes, &expected_bytes[i]);
      }

      const auto hi = BitCast(d, Load(d8, hi_bytes.get()));
      const auto lo = BitCast(d, Load(d8, lo_bytes.get()));
      const auto expected = BitCast(d, Load(d8, expected_bytes.get()));
      HWY_ASSERT_VEC_EQ(d, expected, CombineShiftRightBytes<kBytes>(d, hi, lo));
    }

    TestCombineShiftRightBytesR<kBytes - 1>()(t, d);
#else
    (void)t;
    (void)d;
#endif  // #if HWY_TARGET != HWY_SCALAR
  }
};

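// For reference: CombineShiftRightBytes<kBytes> operates per 128-bit block,
// concatenating hi:lo and shifting right by kBytes bytes. E.g. for kBytes = 1
// with block bytes lo = [0..15] and hi = [16..31], the loop above expects the
// result block [1..16]: the last 15 bytes of lo followed by the first byte
// of hi.
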
template <int kLanes>
struct TestCombineShiftRightLanesR {
  template <class T, class D>
  HWY_NOINLINE void operator()(T t, D d) {
// Scalar does not define CombineShiftRightBytes (needed for *Lanes).
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
    const Repartition<uint8_t, D> d8;
    const size_t N8 = Lanes(d8);
    if (N8 < 16) return;

    auto hi_bytes = AllocateAligned<uint8_t>(N8);
    auto lo_bytes = AllocateAligned<uint8_t>(N8);
    auto expected_bytes = AllocateAligned<uint8_t>(N8);
    const size_t kBlockSize = 16;
    uint8_t combined[2 * kBlockSize];

    // Random inputs in each lane
    RandomState rng;
    for (size_t rep = 0; rep < AdjustedReps(100); ++rep) {
      for (size_t i = 0; i < N8; ++i) {
        hi_bytes[i] = static_cast<uint8_t>(Random64(&rng) & 0xFF);
        lo_bytes[i] = static_cast<uint8_t>(Random64(&rng) & 0xFF);
      }
      for (size_t i = 0; i < N8; i += kBlockSize) {
        CopyBytes<kBlockSize>(&lo_bytes[i], combined);
        CopyBytes<kBlockSize>(&hi_bytes[i], combined + kBlockSize);
        CopyBytes<kBlockSize>(combined + kLanes * sizeof(T),
                              &expected_bytes[i]);
      }

      const auto hi = BitCast(d, Load(d8, hi_bytes.get()));
      const auto lo = BitCast(d, Load(d8, lo_bytes.get()));
      const auto expected = BitCast(d, Load(d8, expected_bytes.get()));
      HWY_ASSERT_VEC_EQ(d, expected, CombineShiftRightLanes<kLanes>(d, hi, lo));
    }

    TestCombineShiftRightLanesR<kLanes - 1>()(t, d);
#else
    (void)t;
    (void)d;
#endif  // #if HWY_TARGET != HWY_SCALAR
  }
};

template <>
struct TestCombineShiftRightBytesR<0> {
  template <class T, class D>
  void operator()(T /*unused*/, D /*unused*/) {}
};

template <>
struct TestCombineShiftRightLanesR<0> {
  template <class T, class D>
  void operator()(T /*unused*/, D /*unused*/) {}
};

struct TestCombineShiftRight {
  template <class T, class D>
  HWY_NOINLINE void operator()(T t, D d) {
    constexpr int kMaxBytes = HWY_MIN(16, int(MaxLanes(d) * sizeof(T)));
    TestCombineShiftRightBytesR<kMaxBytes - 1>()(t, d);
    TestCombineShiftRightLanesR<kMaxBytes / int(sizeof(T)) - 1>()(t, d);
  }
};

HWY_NOINLINE void TestAllCombineShiftRight() {
  // Need at least 2 lanes.
  ForAllTypes(ForShrinkableVectors<TestCombineShiftRight>());
}

class TestSpecialShuffle32 {
 public:
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto v = Iota(d, 0);
    VerifyLanes32(d, Shuffle2301(v), 2, 3, 0, 1, __FILE__, __LINE__);
    VerifyLanes32(d, Shuffle1032(v), 1, 0, 3, 2, __FILE__, __LINE__);
    VerifyLanes32(d, Shuffle0321(v), 0, 3, 2, 1, __FILE__, __LINE__);
    VerifyLanes32(d, Shuffle2103(v), 2, 1, 0, 3, __FILE__, __LINE__);
    VerifyLanes32(d, Shuffle0123(v), 0, 1, 2, 3, __FILE__, __LINE__);
  }

 private:
  template <class D, class V>
  HWY_NOINLINE void VerifyLanes32(D d, VecArg<V> actual, const size_t i3,
                                  const size_t i2, const size_t i1,
                                  const size_t i0, const char* filename,
                                  const int line) {
    using T = TFromD<D>;
    constexpr size_t kBlockN = 16 / sizeof(T);
    const size_t N = Lanes(d);
    if (N < 4) return;
    auto expected = AllocateAligned<T>(N);
    for (size_t block = 0; block < N; block += kBlockN) {
      expected[block + 3] = static_cast<T>(block + i3);
      expected[block + 2] = static_cast<T>(block + i2);
      expected[block + 1] = static_cast<T>(block + i1);
      expected[block + 0] = static_cast<T>(block + i0);
    }
    AssertVecEqual(d, expected.get(), actual, filename, line);
  }
};

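// The digits in the Shuffle* names are the source lane indices for output
// lanes 3..0 within each 128-bit block. For example Shuffle2301 swaps
// adjacent lane pairs: with Iota input 0,1,2,3 it produces 1,0,3,2, which is
// exactly what the VerifyLanes32 arguments (i3=2, i2=3, i1=0, i0=1) encode.
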
class TestSpecialShuffle64 {
 public:
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto v = Iota(d, 0);
    VerifyLanes64(d, Shuffle01(v), 0, 1, __FILE__, __LINE__);
  }

 private:
  template <class D, class V>
  HWY_NOINLINE void VerifyLanes64(D d, VecArg<V> actual, const size_t i1,
                                  const size_t i0, const char* filename,
                                  const int line) {
    using T = TFromD<D>;
    constexpr size_t kBlockN = 16 / sizeof(T);
    const size_t N = Lanes(d);
    if (N < 2) return;
    auto expected = AllocateAligned<T>(N);
    for (size_t block = 0; block < N; block += kBlockN) {
      expected[block + 1] = static_cast<T>(block + i1);
      expected[block + 0] = static_cast<T>(block + i0);
    }
    AssertVecEqual(d, expected.get(), actual, filename, line);
  }
};

HWY_NOINLINE void TestAllSpecialShuffles() {
  const ForGE128Vectors<TestSpecialShuffle32> test32;
  test32(uint32_t());
  test32(int32_t());
  test32(float());

#if HWY_CAP_INTEGER64
  const ForGE128Vectors<TestSpecialShuffle64> test64;
  test64(uint64_t());
  test64(int64_t());
#endif

#if HWY_CAP_FLOAT64
  const ForGE128Vectors<TestSpecialShuffle64> test_d;
  test_d(double());
#endif
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE

namespace hwy {
HWY_BEFORE_TEST(HwyBlockwiseTest);
HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllShiftBytes);
HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllShiftLanes);
HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllBroadcast);
HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllTableLookupBytes);
HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllInterleave);
HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllZip);
HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllCombineShiftRight);
HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllSpecialShuffles);
}  // namespace hwy

// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}

#endif
@@ -36,16 +36,21 @@ struct TestLowerHalf {
    const size_t N = Lanes(d);
    auto lanes = AllocateAligned<T>(N);
    auto lanes2 = AllocateAligned<T>(N);
    std::fill(lanes.get(), lanes.get() + N, T(0));
    std::fill(lanes2.get(), lanes2.get() + N, T(0));
    const auto v = Iota(d, 1);
    Store(LowerHalf(v), d2, lanes.get());
    Store(LowerHalf(d2, v), d2, lanes.get());
    Store(LowerHalf(v), d2, lanes2.get());  // optionally without D
    size_t i = 0;
    for (; i < Lanes(d2); ++i) {
      HWY_ASSERT_EQ(T(1 + i), lanes[i]);
      HWY_ASSERT_EQ(T(1 + i), lanes2[i]);
    }
    // Other half remains unchanged
    for (; i < N; ++i) {
      HWY_ASSERT_EQ(T(0), lanes[i]);
      HWY_ASSERT_EQ(T(0), lanes2[i]);
    }
  }
};
@@ -53,29 +58,35 @@ struct TestLowerHalf {
struct TestLowerQuarter {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const Half<Half<D>> d4;
    const Half<D> d2;
    const Half<decltype(d2)> d4;

    const size_t N = Lanes(d);
    auto lanes = AllocateAligned<T>(N);
    auto lanes2 = AllocateAligned<T>(N);
    std::fill(lanes.get(), lanes.get() + N, T(0));
    std::fill(lanes2.get(), lanes2.get() + N, T(0));
    const auto v = Iota(d, 1);
    const auto lo = LowerHalf(LowerHalf(v));
    const auto lo = LowerHalf(d4, LowerHalf(d2, v));
    const auto lo2 = LowerHalf(LowerHalf(v));  // optionally without D
    Store(lo, d4, lanes.get());
    Store(lo2, d4, lanes2.get());
    size_t i = 0;
    for (; i < Lanes(d4); ++i) {
      HWY_ASSERT_EQ(T(i + 1), lanes[i]);
      HWY_ASSERT_EQ(T(i + 1), lanes2[i]);
    }
    // Upper 3/4 remain unchanged
    for (; i < N; ++i) {
      HWY_ASSERT_EQ(T(0), lanes[i]);
      HWY_ASSERT_EQ(T(0), lanes2[i]);
    }
  }
};

HWY_NOINLINE void TestAllLowerHalf() {
  constexpr size_t kDiv = 1;
  ForAllTypes(ForPartialVectors<TestLowerHalf, kDiv, /*kMinLanes=*/2>());
  ForAllTypes(ForPartialVectors<TestLowerQuarter, kDiv, /*kMinLanes=*/4>());
  ForAllTypes(ForDemoteVectors<TestLowerHalf>());
  ForAllTypes(ForDemoteVectors<TestLowerQuarter, 4>());
}

struct TestUpperHalf {
@@ -90,7 +101,7 @@ struct TestUpperHalf {
    auto lanes = AllocateAligned<T>(N);
    std::fill(lanes.get(), lanes.get() + N, T(0));

    Store(UpperHalf(v), d2, lanes.get());
    Store(UpperHalf(d2, v), d2, lanes.get());
    size_t i = 0;
    for (; i < Lanes(d2); ++i) {
      HWY_ASSERT_EQ(T(Lanes(d2) + 1 + i), lanes[i]);
@@ -106,13 +117,12 @@ struct TestUpperHalf {
};

HWY_NOINLINE void TestAllUpperHalf() {
  ForAllTypes(ForGE128Vectors<TestUpperHalf>());
  ForAllTypes(ForShrinkableVectors<TestUpperHalf>());
}

struct TestZeroExtendVector {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
#if HWY_CAP_GE256
    const Twice<D> d2;

    const auto v = Iota(d, 1);
@@ -121,7 +131,7 @@ struct TestZeroExtendVector {
    Store(v, d, &lanes[0]);
    Store(v, d, &lanes[N2 / 2]);

    const auto ext = ZeroExtendVector(v);
    const auto ext = ZeroExtendVector(d2, v);
    Store(ext, d2, lanes.get());

    size_t i = 0;
@@ -133,9 +143,6 @@ struct TestZeroExtendVector {
    for (; i < N2; ++i) {
      HWY_ASSERT_EQ(T(0), lanes[i]);
    }
#else
    (void)d;
#endif
  }
};

@@ -146,21 +153,17 @@ HWY_NOINLINE void TestAllZeroExtendVector() {
struct TestCombine {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
#if HWY_CAP_GE256
    const Twice<D> d2;
    const size_t N2 = Lanes(d2);
    auto lanes = AllocateAligned<T>(N2);

    const auto lo = Iota(d, 1);
    const auto hi = Iota(d, N2 / 2 + 1);
    const auto combined = Combine(hi, lo);
    const auto combined = Combine(d2, hi, lo);
    Store(combined, d2, lanes.get());

    const auto expected = Iota(d2, 1);
    HWY_ASSERT_VEC_EQ(d2, expected, combined);
#else
    (void)d;
#endif
  }
};

@@ -168,102 +171,81 @@ HWY_NOINLINE void TestAllCombine() {
  ForAllTypes(ForExtendableVectors<TestCombine>());
}

template <int kBytes>
struct TestCombineShiftRightBytesR {
struct TestConcat {
  template <class T, class D>
  HWY_NOINLINE void operator()(T t, D d) {
// Scalar does not define CombineShiftRightBytes.
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
    const Repartition<uint8_t, D> d8;
    const size_t N8 = Lanes(d8);
    const auto lo = BitCast(d, Iota(d8, 1));
    const auto hi = BitCast(d, Iota(d8, 1 + N8));
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    if (N == 1) return;
    const size_t half_bytes = N * sizeof(T) / 2;

    auto expected = AllocateAligned<T>(Lanes(d));
    uint8_t* expected_bytes = reinterpret_cast<uint8_t*>(expected.get());
    auto hi = AllocateAligned<T>(N);
    auto lo = AllocateAligned<T>(N);
    auto expected = AllocateAligned<T>(N);
    RandomState rng;
    for (size_t rep = 0; rep < 10; ++rep) {
      for (size_t i = 0; i < N; ++i) {
        hi[i] = static_cast<T>(Random64(&rng) & 0xFF);
        lo[i] = static_cast<T>(Random64(&rng) & 0xFF);
      }

    const size_t kBlockSize = 16;
    for (size_t i = 0; i < N8; ++i) {
      const size_t block = i / kBlockSize;
      const size_t lane = i % kBlockSize;
      const size_t first_lo = block * kBlockSize;
      const size_t idx = lane + kBytes;
      const size_t offset = (idx < kBlockSize) ? 0 : N8 - kBlockSize;
      const bool at_end = idx >= 2 * kBlockSize;
      expected_bytes[i] =
          at_end ? 0 : static_cast<uint8_t>(first_lo + idx + 1 + offset);
      {
        memcpy(&expected[N / 2], &hi[N / 2], half_bytes);
        memcpy(&expected[0], &lo[0], half_bytes);
        const auto vhi = Load(d, hi.get());
        const auto vlo = Load(d, lo.get());
        HWY_ASSERT_VEC_EQ(d, expected.get(), ConcatUpperLower(d, vhi, vlo));
      }

      {
        memcpy(&expected[N / 2], &hi[N / 2], half_bytes);
        memcpy(&expected[0], &lo[N / 2], half_bytes);
        const auto vhi = Load(d, hi.get());
        const auto vlo = Load(d, lo.get());
        HWY_ASSERT_VEC_EQ(d, expected.get(), ConcatUpperUpper(d, vhi, vlo));
      }

      {
        memcpy(&expected[N / 2], &hi[0], half_bytes);
        memcpy(&expected[0], &lo[N / 2], half_bytes);
        const auto vhi = Load(d, hi.get());
        const auto vlo = Load(d, lo.get());
        HWY_ASSERT_VEC_EQ(d, expected.get(), ConcatLowerUpper(d, vhi, vlo));
      }

      {
        memcpy(&expected[N / 2], &hi[0], half_bytes);
        memcpy(&expected[0], &lo[0], half_bytes);
        const auto vhi = Load(d, hi.get());
        const auto vlo = Load(d, lo.get());
        HWY_ASSERT_VEC_EQ(d, expected.get(), ConcatLowerLower(d, vhi, vlo));
      }
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(),
                      CombineShiftRightBytes<kBytes>(hi, lo));
  }
};

    TestCombineShiftRightBytesR<kBytes - 1>()(t, d);
HWY_NOINLINE void TestAllConcat() {
  ForAllTypes(ForShrinkableVectors<TestConcat>());
}

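// For reference: the memcpy pattern above spells out the Concat semantics.
// ConcatUpperLower keeps the upper half of hi and the lower half of lo in
// place; ConcatLowerUpper places the lower half of hi above the upper half
// of lo, and so on. E.g. with N = 4, hi = {h0..h3} and lo = {l0..l3},
// ConcatLowerUpper(d, hi, lo) == {l2, l3, h0, h1}.
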
struct TestConcatOddEven {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
#if HWY_TARGET != HWY_RVV && HWY_TARGET != HWY_SCALAR
    const size_t N = Lanes(d);
    const auto hi = Iota(d, N);
    const auto lo = Iota(d, 0);
    const auto even = Add(Iota(d, 0), Iota(d, 0));
    const auto odd = Add(even, Set(d, 1));
    HWY_ASSERT_VEC_EQ(d, odd, ConcatOdd(d, hi, lo));
    HWY_ASSERT_VEC_EQ(d, even, ConcatEven(d, hi, lo));
#else
    (void)t;
    (void)d;
#endif  // #if HWY_TARGET != HWY_SCALAR
#endif
  }
};

template <int kLanes>
struct TestCombineShiftRightLanesR {
  template <class T, class D>
  HWY_NOINLINE void operator()(T t, D d) {
// Scalar does not define CombineShiftRightBytes (needed for *Lanes).
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
    const Repartition<uint8_t, D> d8;
    const size_t N8 = Lanes(d8);
    const auto lo = BitCast(d, Iota(d8, 1));
    const auto hi = BitCast(d, Iota(d8, 1 + N8));

    auto expected = AllocateAligned<T>(Lanes(d));

    uint8_t* expected_bytes = reinterpret_cast<uint8_t*>(expected.get());

    const size_t kBlockSize = 16;
    for (size_t i = 0; i < N8; ++i) {
      const size_t block = i / kBlockSize;
      const size_t lane = i % kBlockSize;
      const size_t first_lo = block * kBlockSize;
      const size_t idx = lane + kLanes * sizeof(T);
      const size_t offset = (idx < kBlockSize) ? 0 : N8 - kBlockSize;
      const bool at_end = idx >= 2 * kBlockSize;
      expected_bytes[i] =
          at_end ? 0 : static_cast<uint8_t>(first_lo + idx + 1 + offset);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(),
                      CombineShiftRightLanes<kLanes>(hi, lo));

    TestCombineShiftRightBytesR<kLanes - 1>()(t, d);
#else
    (void)t;
    (void)d;
#endif  // #if HWY_TARGET != HWY_SCALAR
  }
};

template <>
struct TestCombineShiftRightBytesR<0> {
  template <class T, class D>
  void operator()(T /*unused*/, D /*unused*/) {}
};

template <>
struct TestCombineShiftRightLanesR<0> {
  template <class T, class D>
  void operator()(T /*unused*/, D /*unused*/) {}
};

struct TestCombineShiftRight {
  template <class T, class D>
  HWY_NOINLINE void operator()(T t, D d) {
    TestCombineShiftRightBytesR<15>()(t, d);
    TestCombineShiftRightLanesR<16 / sizeof(T) - 1>()(t, d);
  }
};

HWY_NOINLINE void TestAllCombineShiftRight() {
  ForAllTypes(ForGE128Vectors<TestCombineShiftRight>());
HWY_NOINLINE void TestAllConcatOddEven() {
  ForUIF3264(ForShrinkableVectors<TestConcatOddEven>());
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
@@ -272,15 +254,24 @@ HWY_NOINLINE void TestAllCombineShiftRight() {
HWY_AFTER_NAMESPACE();

#if HWY_ONCE

namespace hwy {
HWY_BEFORE_TEST(HwyCombineTest);
HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllLowerHalf);
HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllUpperHalf);
HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllZeroExtendVector);
HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllCombine);
HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllCombineShiftRight);
HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllConcat);
HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllConcatOddEven);
}  // namespace hwy
#endif

// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char **argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}

#endif  // HWY_ONCE

#else
int main(int, char**) { return 0; }
@@ -26,25 +26,6 @@ HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// All types.
struct TestMask {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    auto lanes = AllocateAligned<T>(N);

    std::fill(lanes.get(), lanes.get() + N, T(0));
    const auto actual_false = MaskFromVec(Load(d, lanes.get()));
    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), actual_false);

    memset(lanes.get(), 0xFF, N * sizeof(T));
    const auto actual_true = MaskFromVec(Load(d, lanes.get()));
    HWY_ASSERT_MASK_EQ(d, MaskTrue(d), actual_true);
  }
};

HWY_NOINLINE void TestAllMask() { ForAllTypes(ForPartialVectors<TestMask>()); }

// All types.
struct TestEquality {
  template <typename T, class D>
@@ -57,8 +38,14 @@ struct TestEquality {
    const auto mask_true = MaskTrue(d);

    HWY_ASSERT_MASK_EQ(d, mask_false, Eq(v2, v3));
    HWY_ASSERT_MASK_EQ(d, mask_false, Eq(v3, v2));
    HWY_ASSERT_MASK_EQ(d, mask_true, Eq(v2, v2));
    HWY_ASSERT_MASK_EQ(d, mask_true, Eq(v2, v2b));

    HWY_ASSERT_MASK_EQ(d, mask_true, Ne(v2, v3));
    HWY_ASSERT_MASK_EQ(d, mask_true, Ne(v3, v2));
    HWY_ASSERT_MASK_EQ(d, mask_false, Ne(v2, v2));
    HWY_ASSERT_MASK_EQ(d, mask_false, Ne(v2, v2b));
  }
};

@@ -90,6 +77,37 @@ void EnsureGreater(D d, TFromD<D> a, TFromD<D> b, const char* file, int line) {
#define HWY_ENSURE_GREATER(d, a, b) EnsureGreater(d, a, b, __FILE__, __LINE__)

struct TestStrictUnsigned {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const T max = LimitsMax<T>();
    const auto v0 = Zero(d);
    const auto v2 = And(Iota(d, T(2)), Set(d, 255));  // 0..255

    const auto mask_false = MaskFalse(d);

    // Individual values of interest
    HWY_ENSURE_GREATER(d, 2, 1);
    HWY_ENSURE_GREATER(d, 1, 0);
    HWY_ENSURE_GREATER(d, 128, 127);
    HWY_ENSURE_GREATER(d, max, max / 2);
    HWY_ENSURE_GREATER(d, max, 1);
    HWY_ENSURE_GREATER(d, max, 0);

    // Also use Iota to ensure lanes are independent
    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, v0));
    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v0, v2));
    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v0, v0));
    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v0, v0));
    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, v2));
    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v2, v2));
  }
};

HWY_NOINLINE void TestAllStrictUnsigned() {
  ForUnsignedTypes(ForPartialVectors<TestStrictUnsigned>());
}

struct TestStrictInt {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
@@ -97,7 +115,7 @@ struct TestStrictInt {
    const T max = LimitsMax<T>();
    const auto v0 = Zero(d);
    const auto v2 = And(Iota(d, T(2)), Set(d, 127));  // 0..127
    const auto vn = Neg(v2) - Set(d, 1);              // -1..-128
    const auto vn = Sub(Neg(v2), Set(d, 1));          // -1..-128

    const auto mask_false = MaskFalse(d);
    const auto mask_true = MaskTrue(d);
@@ -131,14 +149,14 @@ struct TestStrictInt {
};

HWY_NOINLINE void TestAllStrictInt() {
  ForSignedTypes(ForExtendableVectors<TestStrictInt>());
  ForSignedTypes(ForPartialVectors<TestStrictInt>());
}

struct TestStrictFloat {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const T huge_neg = -1E35;
    const T huge_pos = 1E36;
    const T huge_neg = T(-1E35);
    const T huge_pos = T(1E36);
    const auto v0 = Zero(d);
    const auto v2 = Iota(d, T(2));
    const auto vn = Neg(v2);
@@ -173,13 +191,13 @@ struct TestStrictFloat {
};

HWY_NOINLINE void TestAllStrictFloat() {
  ForFloatTypes(ForExtendableVectors<TestStrictFloat>());
  ForFloatTypes(ForPartialVectors<TestStrictFloat>());
}

struct TestWeakFloat {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto v2 = Iota(d, 2);
    const auto v2 = Iota(d, T(2));
    const auto vn = Iota(d, -T(Lanes(d)));

    const auto mask_false = MaskFalse(d);
@@ -206,12 +224,20 @@ HWY_NOINLINE void TestAllWeakFloat() {
HWY_AFTER_NAMESPACE();

#if HWY_ONCE

namespace hwy {
HWY_BEFORE_TEST(HwyCompareTest);
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllMask);
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllEquality);
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictUnsigned);
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictInt);
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictFloat);
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllWeakFloat);
}  // namespace hwy

// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}

#endif
@@ -34,7 +34,10 @@ struct TestBitCast {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const Repartition<ToT, D> dto;
    HWY_ASSERT_EQ(Lanes(d) * sizeof(T), Lanes(dto) * sizeof(ToT));
    const size_t N = Lanes(d);
    const size_t Nto = Lanes(dto);
    if (N == 0 || Nto == 0) return;
    HWY_ASSERT_EQ(N * sizeof(T), Nto * sizeof(ToT));
    const auto vf = Iota(d, 1);
    const auto vt = BitCast(dto, vf);
    // Must return the same bits
@@ -130,8 +133,10 @@ HWY_NOINLINE void TestAllBitCast() {
#endif  // HWY_CAP_INTEGER64
#endif  // HWY_CAP_FLOAT64

#if HWY_TARGET != HWY_SCALAR
  // For non-scalar vectors, we can cast all types to all.
  ForAllTypes(ForGE128Vectors<TestBitCastFrom>());
  ForAllTypes(ForGE64Vectors<TestBitCastFrom>());
#endif
}

template <typename ToT>
@@ -146,7 +151,7 @@ struct TestPromoteTo {
    auto expected = AllocateAligned<ToT>(N);

    RandomState rng;
    for (size_t rep = 0; rep < 200; ++rep) {
    for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
      for (size_t i = 0; i < N; ++i) {
        const uint64_t bits = rng();
        memcpy(&from[i], &bits, sizeof(T));
@@ -160,39 +165,39 @@ struct TestPromoteTo {
};

HWY_NOINLINE void TestAllPromoteTo() {
  const ForPartialVectors<TestPromoteTo<uint16_t>, 2> to_u16div2;
  const ForPromoteVectors<TestPromoteTo<uint16_t>, 2> to_u16div2;
  to_u16div2(uint8_t());

  const ForPartialVectors<TestPromoteTo<uint32_t>, 4> to_u32div4;
  const ForPromoteVectors<TestPromoteTo<uint32_t>, 4> to_u32div4;
  to_u32div4(uint8_t());

  const ForPartialVectors<TestPromoteTo<uint32_t>, 2> to_u32div2;
  const ForPromoteVectors<TestPromoteTo<uint32_t>, 2> to_u32div2;
  to_u32div2(uint16_t());

  const ForPartialVectors<TestPromoteTo<int16_t>, 2> to_i16div2;
  const ForPromoteVectors<TestPromoteTo<int16_t>, 2> to_i16div2;
  to_i16div2(uint8_t());
  to_i16div2(int8_t());

  const ForPartialVectors<TestPromoteTo<int32_t>, 2> to_i32div2;
  const ForPromoteVectors<TestPromoteTo<int32_t>, 2> to_i32div2;
  to_i32div2(uint16_t());
  to_i32div2(int16_t());

  const ForPartialVectors<TestPromoteTo<int32_t>, 4> to_i32div4;
  const ForPromoteVectors<TestPromoteTo<int32_t>, 4> to_i32div4;
  to_i32div4(uint8_t());
  to_i32div4(int8_t());

  // Must test f16 separately because we can only load/store/convert them.
  // Must test f16/bf16 separately because we can only load/store/convert them.

#if HWY_CAP_INTEGER64
  const ForPartialVectors<TestPromoteTo<uint64_t>, 2> to_u64div2;
  const ForPromoteVectors<TestPromoteTo<uint64_t>, 2> to_u64div2;
  to_u64div2(uint32_t());

  const ForPartialVectors<TestPromoteTo<int64_t>, 2> to_i64div2;
  const ForPromoteVectors<TestPromoteTo<int64_t>, 2> to_i64div2;
  to_i64div2(int32_t());
#endif

#if HWY_CAP_FLOAT64
  const ForPartialVectors<TestPromoteTo<double>, 2> to_f64div2;
  const ForPromoteVectors<TestPromoteTo<double>, 2> to_f64div2;
  to_f64div2(int32_t());
  to_f64div2(float());
#endif
@@ -224,14 +229,23 @@ struct TestDemoteTo {
    const T min = LimitsMin<ToT>();
    const T max = LimitsMax<ToT>();

    const auto value_ok = [&](T& value) {
      if (!IsFinite(value)) return false;
#if HWY_EMULATE_SVE
      // farm_sve just casts, which is undefined if the value is out of range.
      value = HWY_MIN(HWY_MAX(min, value), max);
#endif
      return true;
    };

    RandomState rng;
    for (size_t rep = 0; rep < 1000; ++rep) {
    for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
      for (size_t i = 0; i < N; ++i) {
        do {
          const uint64_t bits = rng();
          memcpy(&from[i], &bits, sizeof(T));
        } while (!IsFinite(from[i]));
        expected[i] = static_cast<ToT>(std::min(std::max(min, from[i]), max));
        } while (!value_ok(from[i]));
        expected[i] = static_cast<ToT>(HWY_MIN(HWY_MAX(min, from[i]), max));
      }

      HWY_ASSERT_VEC_EQ(to_d, expected.get(),
@@ -241,22 +255,22 @@ struct TestDemoteTo {
};

HWY_NOINLINE void TestAllDemoteToInt() {
  ForDemoteVectors<TestDemoteTo<uint8_t>, 2>()(int16_t());
  ForDemoteVectors<TestDemoteTo<uint8_t>>()(int16_t());
  ForDemoteVectors<TestDemoteTo<uint8_t>, 4>()(int32_t());

  ForDemoteVectors<TestDemoteTo<int8_t>, 2>()(int16_t());
  ForDemoteVectors<TestDemoteTo<int8_t>>()(int16_t());
  ForDemoteVectors<TestDemoteTo<int8_t>, 4>()(int32_t());

  const ForDemoteVectors<TestDemoteTo<uint16_t>, 2> to_u16;
  const ForDemoteVectors<TestDemoteTo<uint16_t>> to_u16;
  to_u16(int32_t());

  const ForDemoteVectors<TestDemoteTo<int16_t>, 2> to_i16;
  const ForDemoteVectors<TestDemoteTo<int16_t>> to_i16;
  to_i16(int32_t());
}

HWY_NOINLINE void TestAllDemoteToMixed() {
#if HWY_CAP_FLOAT64
  const ForDemoteVectors<TestDemoteTo<int32_t>, 2> to_i32;
  const ForDemoteVectors<TestDemoteTo<int32_t>> to_i32;
  to_i32(double());
#endif
}

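// For reference: DemoteTo is expected to saturate, matching the
// HWY_MIN(HWY_MAX(...)) clamp used to compute expected[] above. E.g.
// demoting int32 70000 to int16 yields 32767, and -70000 yields -32768.
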
@@ -275,7 +289,7 @@ struct TestDemoteToFloat {
    auto expected = AllocateAligned<ToT>(N);

    RandomState rng;
    for (size_t rep = 0; rep < 1000; ++rep) {
    for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
      for (size_t i = 0; i < N; ++i) {
        do {
          const uint64_t bits = rng();
@@ -285,7 +299,7 @@ struct TestDemoteToFloat {
        const T max_abs = HighestValue<ToT>();
        // NOTE: std:: version from C++11 cmath is not defined in RVV GCC, see
        // https://lists.freebsd.org/pipermail/freebsd-current/2014-January/048130.html
        const T clipped = copysign(std::min(magn, max_abs), from[i]);
        const T clipped = copysign(HWY_MIN(magn, max_abs), from[i]);
        expected[i] = static_cast<ToT>(clipped);
      }

@@ -338,6 +352,7 @@ AlignedFreeUniquePtr<float[]> F16TestCases(D d, size_t& padded) {
struct TestF16 {
  template <typename TF32, class DF32>
  HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) {
#if HWY_CAP_FLOAT16
    size_t padded;
    auto in = F16TestCases(d32, padded);
    using TF16 = float16_t;
@@ -350,10 +365,192 @@ struct TestF16 {
      Store(DemoteTo(d16, loaded), d16, temp16.get());
      HWY_ASSERT_VEC_EQ(d32, loaded, PromoteTo(d32, Load(d16, temp16.get())));
    }
#else
    (void)d32;
#endif
  }
};

HWY_NOINLINE void TestAllF16() { ForDemoteVectors<TestF16, 2>()(float()); }
HWY_NOINLINE void TestAllF16() { ForDemoteVectors<TestF16>()(float()); }

template <class D>
AlignedFreeUniquePtr<float[]> BF16TestCases(D d, size_t& padded) {
  const float test_cases[] = {
      // +/- 1
      1.0f, -1.0f,
      // +/- 0
      0.0f, -0.0f,
      // near 0
      0.25f, -0.25f,
      // +/- integer
      4.0f, -32.0f,
      // positive near limit
      3.389531389251535E38f, 1.99384199368e+38f,
      // negative near limit
      -3.389531389251535E38f, -1.99384199368e+38f,
      // positive +/- delta
      2.015625f, 3.984375f,
      // negative +/- delta
      -2.015625f, -3.984375f,
  };
  const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
  const size_t N = Lanes(d);
  padded = RoundUpTo(kNumTestCases, N);  // allow loading whole vectors
  auto in = AllocateAligned<float>(padded);
  auto expected = AllocateAligned<float>(padded);
  std::copy(test_cases, test_cases + kNumTestCases, in.get());
  std::fill(in.get() + kNumTestCases, in.get() + padded, 0.0f);
  return in;
}

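// These cases round-trip exactly because bfloat16 keeps the sign bit, the
// full 8-bit f32 exponent and the top 7 mantissa bits; values such as
// 2.015625 (= 2 + 1/64) need only a few mantissa bits, so DemoteTo followed
// by PromoteTo is lossless for them.
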
struct TestBF16 {
  template <typename TF32, class DF32>
  HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) {
#if HWY_TARGET != HWY_RVV
    size_t padded;
    auto in = BF16TestCases(d32, padded);
    using TBF16 = bfloat16_t;
#if HWY_TARGET == HWY_SCALAR
    const Rebind<TBF16, DF32> dbf16;  // avoid 4/2 = 2 lanes
#else
    const Repartition<TBF16, DF32> dbf16;
#endif
    const Half<decltype(dbf16)> dbf16_half;
    const size_t N = Lanes(d32);
    auto temp16 = AllocateAligned<TBF16>(N);

    for (size_t i = 0; i < padded; i += N) {
      const auto loaded = Load(d32, &in[i]);
      const auto v16 = DemoteTo(dbf16_half, loaded);
      Store(v16, dbf16_half, temp16.get());
      const auto v16_loaded = Load(dbf16_half, temp16.get());
      HWY_ASSERT_VEC_EQ(d32, loaded, PromoteTo(d32, v16_loaded));
    }
#else
    (void)d32;
#endif
  }
};

HWY_NOINLINE void TestAllBF16() { ForShrinkableVectors<TestBF16>()(float()); }

template <class D>
AlignedFreeUniquePtr<float[]> ReorderBF16TestCases(D d, size_t& padded) {
  const float test_cases[] = {
      // Same as BF16TestCases:
      // +/- 1
      1.0f,
      -1.0f,
      // +/- 0
      0.0f,
      -0.0f,
      // near 0
      0.25f,
      -0.25f,
      // +/- integer
      4.0f,
      -32.0f,
      // positive +/- delta
      2.015625f,
      3.984375f,
      // negative +/- delta
      -2.015625f,
      -3.984375f,

      // No huge values - would interfere with sum. But add more to fill 2 * N:
      -2.0f,
      -10.0f,
      0.03125f,
      1.03125f,
      1.5f,
      2.0f,
      4.0f,
      5.0f,
      6.0f,
      8.0f,
      10.0f,
      256.0f,
      448.0f,
      2080.0f,
  };
  const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
  const size_t N = Lanes(d);
  padded = RoundUpTo(kNumTestCases, 2 * N);  // allow loading pairs of vectors
  auto in = AllocateAligned<float>(padded);
  auto expected = AllocateAligned<float>(padded);
  std::copy(test_cases, test_cases + kNumTestCases, in.get());
  std::fill(in.get() + kNumTestCases, in.get() + padded, 0.0f);
  return in;
}

class TestReorderDemote2To {
  // In-place N^2 selection sort to avoid dependencies
  void Sort(float* p, size_t count) {
    for (size_t i = 0; i < count - 1; ++i) {
      // Find min_element
      size_t idx_min = i;
      for (size_t j = i + 1; j < count; j++) {
        if (p[j] < p[idx_min]) {
          idx_min = j;
        }
      }

      // Swap with current
      const float tmp = p[i];
      p[i] = p[idx_min];
      p[idx_min] = tmp;
    }
  }

 public:
  template <typename TF32, class DF32>
  HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) {
#if HWY_TARGET != HWY_SCALAR
    size_t padded;
    auto in = ReorderBF16TestCases(d32, padded);

    using TBF16 = bfloat16_t;
    const Repartition<TBF16, DF32> dbf16;
    const Half<decltype(dbf16)> dbf16_half;
    const size_t N = Lanes(d32);
    auto temp16 = AllocateAligned<TBF16>(2 * N);
    auto expected = AllocateAligned<float>(2 * N);
    auto actual = AllocateAligned<float>(2 * N);

    for (size_t i = 0; i < padded; i += 2 * N) {
      const auto f0 = Load(d32, &in[i + 0]);
      const auto f1 = Load(d32, &in[i + N]);
      const auto v16 = ReorderDemote2To(dbf16, f0, f1);
      Store(v16, dbf16, temp16.get());
      const auto promoted0 = PromoteTo(d32, Load(dbf16_half, temp16.get() + 0));
      const auto promoted1 = PromoteTo(d32, Load(dbf16_half, temp16.get() + N));

      // Smoke test: sum should be same (with tolerance for non-associativity)
      const auto sum_expected =
          GetLane(SumOfLanes(d32, Add(promoted0, promoted1)));
      const auto sum_actual = GetLane(SumOfLanes(d32, Add(f0, f1)));
      HWY_ASSERT(sum_expected - 1E-4 <= sum_actual &&
                 sum_expected <= sum_actual + 1E-4);

      // Ensure values are the same after sorting to undo the Reorder
      Store(f0, d32, expected.get() + 0);
      Store(f1, d32, expected.get() + N);
      Store(promoted0, d32, actual.get() + 0);
      Store(promoted1, d32, actual.get() + N);
      Sort(expected.get(), 2 * N);
      Sort(actual.get(), 2 * N);
      HWY_ASSERT_VEC_EQ(d32, expected.get() + 0, Load(d32, actual.get() + 0));
      HWY_ASSERT_VEC_EQ(d32, expected.get() + N, Load(d32, actual.get() + N));
    }
#else  // HWY_SCALAR
    (void)d32;
#endif
  }
};

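// Note on the test above: ReorderDemote2To is free to interleave the two
// inputs in an implementation-defined block order, so the outputs are
// compared as multisets: both sides are sorted (with the selection sort
// above) before the exact comparison.
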
HWY_NOINLINE void TestAllReorderDemote2To() {
  ForShrinkableVectors<TestReorderDemote2To>()(float());
}

struct TestConvertU8 {
  template <typename T, class D>
@@ -376,8 +573,9 @@ struct TestIntFromFloatHuge {
  template <typename TF, class DF>
  HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
    // Still does not work, although ARMv7 manual says that float->int
    // saturates, i.e. chooses the nearest representable value.
#if HWY_TARGET != HWY_NEON
    // saturates, i.e. chooses the nearest representable value. Also causes
    // out-of-memory for MSVC, and unsafe cast in farm_sve.
#if HWY_TARGET != HWY_NEON && !HWY_COMPILER_MSVC && !defined(HWY_EMULATE_SVE)
    using TI = MakeSigned<TF>;
    const Rebind<TI, DF> di;

@@ -395,9 +593,68 @@ struct TestIntFromFloatHuge {
  }
};

struct TestIntFromFloat {
class TestIntFromFloat {
  template <typename TF, class DF>
  HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
  static HWY_NOINLINE void TestPowers(TF /*unused*/, const DF df) {
    using TI = MakeSigned<TF>;
    const Rebind<TI, DF> di;
    constexpr size_t kBits = sizeof(TF) * 8;

    // Powers of two, plus offsets to set some mantissa bits.
    const int64_t ofs_table[3] = {0LL, 3LL << (kBits / 2), 1LL << (kBits - 15)};
    for (int sign = 0; sign < 2; ++sign) {
      for (size_t shift = 0; shift < kBits - 1; ++shift) {
        for (int64_t ofs : ofs_table) {
          const int64_t mag = (int64_t(1) << shift) + ofs;
          const int64_t val = sign ? mag : -mag;
          HWY_ASSERT_VEC_EQ(di, Set(di, static_cast<TI>(val)),
                            ConvertTo(di, Set(df, static_cast<TF>(val))));
        }
      }
    }
  }

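  // Worked example for the table above (f32, so kBits = 32): shift = 20 with
  // ofs = 3 << 16 gives 2^20 + 3*2^16 = 1245184, which needs few mantissa
  // bits and fits easily in f32's 24-bit significand, so ConvertTo must
  // recover it exactly in both signs.
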
  template <typename TF, class DF>
  static HWY_NOINLINE void TestRandom(TF /*unused*/, const DF df) {
    using TI = MakeSigned<TF>;
    const Rebind<TI, DF> di;
    const size_t N = Lanes(df);

    // TF does not have enough precision to represent TI.
    const double min = static_cast<double>(LimitsMin<TI>());
    const double max = static_cast<double>(LimitsMax<TI>());

    // Also check random values.
    auto from = AllocateAligned<TF>(N);
    auto expected = AllocateAligned<TI>(N);
    RandomState rng;
    for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
      for (size_t i = 0; i < N; ++i) {
        do {
          const uint64_t bits = rng();
          memcpy(&from[i], &bits, sizeof(TF));
        } while (!std::isfinite(from[i]));
#if defined(HWY_EMULATE_SVE)
        // farm_sve just casts, which is undefined if the value is out of range.
        from[i] = HWY_MIN(HWY_MAX(min / 2, from[i]), max / 2);
#endif
        if (from[i] >= max) {
          expected[i] = LimitsMax<TI>();
        } else if (from[i] <= min) {
          expected[i] = LimitsMin<TI>();
        } else {
          expected[i] = static_cast<TI>(from[i]);
        }
      }

      HWY_ASSERT_VEC_EQ(di, expected.get(),
                        ConvertTo(di, Load(df, from.get())));
    }
  }

 public:
  template <typename TF, class DF>
  HWY_NOINLINE void operator()(TF tf, const DF df) {
    using TI = MakeSigned<TF>;
    const Rebind<TI, DF> di;
    const size_t N = Lanes(df);
@@ -423,32 +680,8 @@ struct TestIntFromFloat {
    HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N + 1)),
                      ConvertTo(di, Iota(df, -TF(N + 1) - eps)));

    // TF does not have enough precision to represent TI.
    const double min = static_cast<double>(LimitsMin<TI>());
    const double max = static_cast<double>(LimitsMax<TI>());

    // Also check random values.
    auto from = AllocateAligned<TF>(N);
    auto expected = AllocateAligned<TI>(N);
    RandomState rng;
    for (size_t rep = 0; rep < 1000; ++rep) {
      for (size_t i = 0; i < N; ++i) {
        do {
          const uint64_t bits = rng();
          memcpy(&from[i], &bits, sizeof(TF));
        } while (!std::isfinite(from[i]));
        if (from[i] >= max) {
          expected[i] = LimitsMax<TI>();
        } else if (from[i] <= min) {
          expected[i] = LimitsMin<TI>();
        } else {
          expected[i] = static_cast<TI>(from[i]);
        }
      }

      HWY_ASSERT_VEC_EQ(di, expected.get(),
                        ConvertTo(di, Load(df, from.get())));
    }
    TestPowers(tf, df);
    TestRandom(tf, df);
  }
};

@@ -458,10 +691,10 @@ HWY_NOINLINE void TestAllIntFromFloat() {
}

struct TestFloatFromInt {
  template <typename TI, class DI>
  HWY_NOINLINE void operator()(TI /*unused*/, const DI di) {
    using TF = MakeFloat<TI>;
    const Rebind<TF, DI> df;
  template <typename TF, class DF>
  HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
    using TI = MakeSigned<TF>;
    const RebindToSigned<DF> di;
    const size_t N = Lanes(df);

    // Integer positive
@@ -481,10 +714,7 @@ struct TestFloatFromInt {
};

HWY_NOINLINE void TestAllFloatFromInt() {
  ForPartialVectors<TestFloatFromInt>()(int32_t());
#if HWY_CAP_FLOAT64 && HWY_CAP_INTEGER64
  ForPartialVectors<TestFloatFromInt>()(int64_t());
#endif
  ForFloatTypes(ForPartialVectors<TestFloatFromInt>());
}

struct TestI32F64 {
@@ -521,14 +751,6 @@ struct TestI32F64 {
                      DemoteTo(di, Iota(df, -TF(N + 1) - eps)));
    HWY_ASSERT_VEC_EQ(df, Iota(df, TF(-2.0)), PromoteTo(df, Iota(di, TI(-2))));

    // Huge positive float
    HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMax<TI>()),
                      DemoteTo(di, Set(df, TF(1E12))));

    // Huge negative float
    HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMin<TI>()),
                      DemoteTo(di, Set(df, TF(-1E12))));

    // Max positive int
    HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMax<TI>())),
                      PromoteTo(df, Set(di, LimitsMax<TI>())));
@@ -536,12 +758,23 @@ struct TestI32F64 {
    // Min negative int
    HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMin<TI>())),
                      PromoteTo(df, Set(di, LimitsMin<TI>())));

// farm_sve just casts, which is undefined if the value is out of range.
#if !defined(HWY_EMULATE_SVE)
    // Huge positive float
    HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMax<TI>()),
                      DemoteTo(di, Set(df, TF(1E12))));

    // Huge negative float
    HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMin<TI>()),
                      DemoteTo(di, Set(df, TF(-1E12))));
#endif
  }
};

HWY_NOINLINE void TestAllI32F64() {
#if HWY_CAP_FLOAT64
  ForDemoteVectors<TestI32F64, 2>()(double());
  ForDemoteVectors<TestI32F64>()(double());
#endif
}

@@ -552,6 +785,7 @@ HWY_NOINLINE void TestAllI32F64() {
HWY_AFTER_NAMESPACE();

#if HWY_ONCE

namespace hwy {
HWY_BEFORE_TEST(HwyConvertTest);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllBitCast);
@@ -560,9 +794,18 @@ HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllDemoteToInt);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllDemoteToMixed);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllDemoteToFloat);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllF16);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllBF16);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllReorderDemote2To);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllConvertU8);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllIntFromFloat);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllFloatFromInt);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllI32F64);
}  // namespace hwy

// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}

#endif
@@ -0,0 +1,549 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <stddef.h>
#include <stdint.h>
#include <string.h>  // memcpy

#include "hwy/aligned_allocator.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/crypto_test.cc"
#include "hwy/foreach_target.h"
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

#define HWY_PRINT_CLMUL_GOLDEN 0

#if HWY_TARGET != HWY_SCALAR

class TestAES {
  template <typename T, class D>
  HWY_NOINLINE void TestSBox(T /*unused*/, D d) {
    // The generic implementation of the S-box is difficult to verify by
    // inspection, so we add a white-box test that verifies it using enumeration
    // (outputs for 0..255 vs. https://en.wikipedia.org/wiki/Rijndael_S-box).
    const uint8_t sbox[256] = {
        0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b,
        0xfe, 0xd7, 0xab, 0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0,
        0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, 0xb7, 0xfd, 0x93, 0x26,
        0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
        0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2,
        0xeb, 0x27, 0xb2, 0x75, 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0,
        0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, 0x53, 0xd1, 0x00, 0xed,
        0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
        0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f,
        0x50, 0x3c, 0x9f, 0xa8, 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
        0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c, 0x13, 0xec,
        0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
        0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14,
        0xde, 0x5e, 0x0b, 0xdb, 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c,
        0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 0xe7, 0xc8, 0x37, 0x6d,
        0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
        0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f,
        0x4b, 0xbd, 0x8b, 0x8a, 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e,
        0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, 0xe1, 0xf8, 0x98, 0x11,
        0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
        0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f,
        0xb0, 0x54, 0xbb, 0x16};

    // Ensure it's safe to load an entire vector by padding.
    const size_t N = Lanes(d);
    const size_t padded = RoundUpTo(256, N);
    auto expected = AllocateAligned<T>(padded);
    // Must wrap around to match the input (Iota).
    for (size_t pos = 0; pos < padded;) {
      const size_t remaining = HWY_MIN(padded - pos, size_t(256));
      memcpy(expected.get() + pos, sbox, remaining);
      pos += remaining;
    }

    for (size_t i = 0; i < 256; i += N) {
      const auto in = Iota(d, i);
      HWY_ASSERT_VEC_EQ(d, expected.get() + i, detail::SubBytes(in));
    }
  }

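  // The checks below rely on AESRound computing one full AES round, i.e.
  // SubBytes, ShiftRows and MixColumns followed by an XOR with the round key.
  // Hence AESRound(test, Zero(d)) isolates the MixColumns output, and the
  // round key mapping expected0 to expected is recovered as their XOR.
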
 public:
  template <typename T, class D>
  HWY_NOINLINE void operator()(T t, D d) {
    // Test vector (after first KeyAddition) from
    // https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Standards-and-Guidelines/documents/examples/AES_Core128.pdf
    alignas(16) constexpr uint8_t test_lanes[16] = {
        0x40, 0xBF, 0xAB, 0xF4, 0x06, 0xEE, 0x4D, 0x30,
        0x42, 0xCA, 0x6B, 0x99, 0x7A, 0x5C, 0x58, 0x16};
    const auto test = LoadDup128(d, test_lanes);

    // = MixColumn result
    alignas(16) constexpr uint8_t expected0_lanes[16] = {
        0x52, 0x9F, 0x16, 0xC2, 0x97, 0x86, 0x15, 0xCA,
        0xE0, 0x1A, 0xAE, 0x54, 0xBA, 0x1A, 0x26, 0x59};
    const auto expected0 = LoadDup128(d, expected0_lanes);

    // = KeyAddition result
    alignas(16) constexpr uint8_t expected_lanes[16] = {
        0xF2, 0x65, 0xE8, 0xD5, 0x1F, 0xD2, 0x39, 0x7B,
        0xC3, 0xB9, 0x97, 0x6D, 0x90, 0x76, 0x50, 0x5C};
    const auto expected = LoadDup128(d, expected_lanes);

    alignas(16) uint8_t key_lanes[16];
    for (size_t i = 0; i < 16; ++i) {
      key_lanes[i] = expected0_lanes[i] ^ expected_lanes[i];
    }
    const auto round_key = LoadDup128(d, key_lanes);

    HWY_ASSERT_VEC_EQ(d, expected0, AESRound(test, Zero(d)));
    HWY_ASSERT_VEC_EQ(d, expected, AESRound(test, round_key));

    TestSBox(t, d);
  }
};
HWY_NOINLINE void TestAllAES() { ForGE128Vectors<TestAES>()(uint8_t()); }

#else
HWY_NOINLINE void TestAllAES() {}
#endif  // HWY_TARGET != HWY_SCALAR

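// CLMul computes a carry-less (polynomial over GF(2)) multiplication: partial
// products are XOR-accumulated instead of added, so each 64x64-bit multiply
// yields a 128-bit result split into CLMulLower/CLMulUpper halves. Tiny
// example: CLMul(0b11, 0b11) = 0b101, since (x+1)*(x+1) = x^2+1 over GF(2).
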
struct TestCLMul {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    // needs 64 bit lanes and 128-bit result
#if HWY_TARGET != HWY_SCALAR && HWY_CAP_INTEGER64
    const size_t N = Lanes(d);
    if (N == 1) return;

    auto in1 = AllocateAligned<T>(N);
    auto in2 = AllocateAligned<T>(N);

    constexpr size_t kCLMulNum = 512;
    // Depends on rng!
    static constexpr uint64_t kCLMulLower[kCLMulNum] = {
        0x24511d4ce34d6350ULL, 0x4ca582edde1236bbULL, 0x537e58f72dac25a8ULL,
        0x4e942d5e130b9225ULL, 0x75a906c519257a68ULL, 0x1df9f85126d96c5eULL,
        0x464e7c13f4ad286aULL, 0x138535ee35dabc40ULL, 0xb2f7477b892664ecULL,
        0x01557b077167c25dULL, 0xf32682490ee49624ULL, 0x0025bac603b9e140ULL,
        0xcaa86aca3e3daf40ULL, 0x1fbcfe4af73eb6c4ULL, 0x8ee8064dd0aae5dcULL,
        0x1248cb547858c213ULL, 0x37a55ee5b10fb34cULL, 0x6eb5c97b958f86e2ULL,
        0x4b1ab3eb655ea7cdULL, 0x1d66645a85627520ULL, 0xf8728e96daa36748ULL,
        0x38621043e6ff5e3bULL, 0xd1d28b5da5ffefb4ULL, 0x0a5cd65931546df7ULL,
        0x2a0639be3d844150ULL, 0x0e2d0f18c8d6f045ULL, 0xfacc770b963326c1ULL,
        0x19611b31ca2ef141ULL, 0xabea29510dd87518ULL, 0x18a7dc4b205f2768ULL,
        0x9d3975ea5612dc86ULL, 0x06319c139e374773ULL, 0x6641710400b4c390ULL,
        0x356c29b6001c3670ULL, 0xe9e04d851e040a00ULL, 0x21febe561222d79aULL,
        0xc071eaae6e148090ULL, 0x0eed351a0af94f5bULL, 0x04324eedb3c03688ULL,
        0x39e89b136e0d6ccdULL, 0x07d0fd2777a31600ULL, 0x44b8573827209822ULL,
        0x6d690229ea177d78ULL, 0x1b9749d960ba9f18ULL, 0x190945271c0fbb94ULL,
        0x189aea0e07d2c88eULL, 0xf18eab6b65a6beb2ULL, 0x57744b21c13d0d84ULL,
        0xf63050a613e95c2eULL, 0x12cd20d25f97102fULL, 0x5a5df0678dbcba60ULL,
        0x0b08fb80948bfafcULL, 0x44cf1cbe7c6fc3c8ULL, 0x166a470ef25da288ULL,
        0x2c498a609204e48cULL, 0x261b0a22585697ecULL, 0x737750574af7dde4ULL,
        0x4079959c60b01e0cULL, 0x06ed8aac13f782d6ULL, 0x019d454ba9b5ef20ULL,
        0xea1edbf96d49e858ULL, 0x17c2f3ebde9ac469ULL, 0x5cf72706e3d6f5e4ULL,
        0x16e856aa3c841516ULL, 0x256f7e3cef83368eULL, 0x47e17c8eb2774e77ULL,
        0x9b48ac150a804821ULL, 0x584523f61ccfdf22ULL, 0xedcb6a2a75d9e7f2ULL,
        0x1fe3d1838e537aa7ULL, 0x778872e9f64549caULL, 0x2f1cea6f0d3faf92ULL,
        0x0e8c4b6a9343f326ULL, 0x01902d1ba3048954ULL, 0xc5c1fd5269e91dc0ULL,
        0x0ef8a4707817eb9cULL, 0x1f696f09a5354ca4ULL, 0x369cd9de808b818cULL,
        0xf6917d1dd43fd784ULL, 0x7f4b76bf40dc166fULL, 0x4ce67698724ace12ULL,
        0x02c3bf60e6e9cd92ULL, 0xb8229e45b21458e8ULL, 0x415efd41e91adf49ULL,
        0x5edfcd516bb921cdULL, 0x5ff2c29429fd187eULL, 0x0af666b17103b3e0ULL,
        0x1f5e4ff8f54c9a5bULL, 0x429253d8a5544ba6ULL, 0x19de2fdf9f4d9dcaULL,
        0x29bf3d37ddc19a40ULL, 0x04d4513a879552baULL, 0x5cc7476cf71ee155ULL,
        0x40011f8c238784a5ULL, 0x1a3ae50b0fd2ee2bULL, 0x7db22f432ba462baULL,
        0x417290b0bee2284aULL, 0x055a6bd5bb853db2ULL, 0xaa667daeed8c2a34ULL,
        0x0d6b316bda7f3577ULL, 0x72d35598468e3d5dULL, 0x375b594804bfd33aULL,
        0x16ed3a319b540ae8ULL, 0x093bace4b4695afdULL, 0xc7118754ec2737ceULL,
        0x0fff361f0505c81aULL, 0x996e9e7291321af0ULL, 0x496b1d9b0b89ba8cULL,
        0x65a98b2e9181da9cULL, 0x70759c8dd45575dfULL, 0x3446fe727f5e2cbbULL,
        0x1121ae609d195e74ULL, 0x5ff5d68ce8a21018ULL, 0x0e27eca3825b60d6ULL,
        0x82f628bceca3d1daULL, 0x2756a0914e344047ULL, 0xa460406c1c708d50ULL,
        0x63ce32a0c083e491ULL, 0xc883e5a685c480e0ULL, 0x602c951891e600f9ULL,
        0x02ecb2e3911ca5f8ULL, 0x0d8675f4bb70781aULL, 0x43545cc3c78ea496ULL,
        0x04164b01d6b011c2ULL, 0x3acbb323dcab2c9bULL, 0x31c5ba4e22793082ULL,
        0x5a6484af5f7c2d10ULL, 0x1a929b16194e8078ULL, 0x7a6a75d03b313924ULL,
        0x0553c73a35b1d525ULL, 0xf18628c51142be34ULL, 0x1b51cf80d7efd8f5ULL,
        0x52e0ca4df63ee258ULL, 0x0e977099160650c9ULL, 0x6be1524e92024f70ULL,
        0x0ee2152625438b9dULL, 0xfa32af436f6d8eb4ULL, 0x5ecf49c2154287e5ULL,
        0x6b72f4ae3590569dULL, 0x086c5ee6e87bfb68ULL, 0x737a4f0dc04b6187ULL,
        0x08c3439280edea41ULL, 0x9547944f01636c5cULL, 0x6acfbfc2571cd71fULL,
        0x85d7842972449637ULL, 0x252ea5e5a7fad86aULL, 0x4e41468f99ba1632ULL,
        0x095e0c3ae63b25a2ULL, 0xb005ce88fd1c9425ULL, 0x748e668abbe09f03ULL,
        0xb2cfdf466b187d18ULL, 0x60b11e633d8fe845ULL, 0x07144c4d246db604ULL,
        0x139bcaac55e96125ULL, 0x118679b5a6176327ULL, 0x1cebe90fa4d9f83fULL,
        0x22244f52f0d312acULL, 0x669d4e17c9bfb713ULL, 0x96390e0b834bb0d0ULL,
        0x01f7f0e82ba08071ULL, 0x2dffeee31ca6d284ULL, 0x1f4738745ef039feULL,
        0x4ce0dd2b603b6420ULL, 0x0035fc905910a4d5ULL, 0x07df2b533df6fb04ULL,
        0x1cee2735c9b910ddULL, 0x2bc4af565f7809eaULL, 0x2f876c1f5cb1076cULL,
        0x33e079524099d056ULL, 0x169e0405d2f9efbaULL, 0x018643ab548a358cULL,
        0x1bb6fc4331cffe92ULL, 0x05111d3a04e92faaULL, 0x23c27ecf0d638b73ULL,
        0x1b79071dc1685d68ULL, 0x0662d20aba8e1e0cULL, 0xe7f6440277144c6fULL,
        0x4ca38b64c22196c0ULL, 0x43c05f6d1936fbeeULL, 0x0654199d4d1faf0fULL,
        0xf2014054e71c2d04ULL, 0x0a103e47e96b4c84ULL, 0x7986e691dd35b040ULL,
        0x4e1ebb53c306a341ULL, 0x2775bb3d75d65ba6ULL, 0x0562ab0adeff0f15ULL,
        0x3c2746ad5eba3eacULL, 0x1facdb5765680c60ULL, 0xb802a60027d81d00ULL,
        0x1191d0f6366ae3a9ULL, 0x81a97b5ae0ea5d14ULL, 0x06bee05b6178a770ULL,
        0xc7baeb2fe1d6aeb3ULL, 0x594cb5b867d04fdfULL, 0xf515a80138a4e350ULL,
        0x646417ad8073cf38ULL, 0x4a229a43373fb8d4ULL, 0x10fa6eafff1ca453ULL,
        0x9f060700895cc731ULL, 0x00521133d11d11f4ULL, 0xb940a2bb912a7a5cULL,
        0x3fab180670ad2a3cULL, 0x45a5f0e5b6fdb95dULL, 0x27c1baad6f946b15ULL,
        0x336c6bdbe527cf58ULL, 0x3b83aa602a5baea3ULL, 0xdf749153f9bcc376ULL,
        0x1a05513a6c0b4a90ULL, 0xb81e0b570a075c47ULL, 0x471fabb40bdc27ceULL,
        0x9dec9472f6853f60ULL, 0x361f71b88114193bULL, 0x3b550a8c4feeff00ULL,
        0x0f6cde5a68bc9bc0ULL, 0x3f50121a925703e0ULL, 0x6967ff66d6d343a9ULL,
        0xff6b5bd2ce7bc3ccULL, 0x05474cea08bf6cd8ULL, 0xf76eabbfaf108eb0ULL,
        0x067529be4fc6d981ULL, 0x4d766b137cf8a988ULL, 0x2f09c7395c5cfbbdULL,
        0x388793712da06228ULL, 0x02c9ff342c8f339aULL, 0x152c734139a860a3ULL,
        0x35776eb2b270c04dULL, 0x0f8d8b41f11c4608ULL, 0x0c2071665be6b288ULL,
        0xc034e212b3f71d88ULL, 0x071d961ef3276f99ULL, 0xf98598ee75b60773ULL,
        0x062062c58c6724e4ULL, 0xd156438e2125572cULL, 0x38552d59a7f0f7c8ULL,
        0x1a402178206e413cULL, 0x1f1f996c68293b26ULL, 0x8bce3cafe1730f7eULL,
        0x2d0480a0828f6bf5ULL, 0x6c99cffa171f92f6ULL, 0x0087f842bb0ac681ULL,
        0x11d7ed06e1e7fd3eULL, 0x07cb1186f2385dc6ULL, 0x5d7763ebff1e170fULL,
        0x2dacc870231ac292ULL, 0x8486317a9ffb390cULL, 0x1c3a6dd20c959ac6ULL,
        0x90dc96e3992e06b8ULL, 0x70d60bfa33e72b67ULL, 0x70c9bddd0985ee63ULL,
        0x012c9767b3673093ULL, 0xfcd3bc5580f6a88aULL, 0x0ac80017ef6308c3ULL,
        0xdb67d709ef4bba09ULL, 0x4c63e324f0e247ccULL, 0xa15481d3fe219d60ULL,
        0x094c4279cdccb501ULL, 0x965a28c72575cb82ULL, 0x022869db25e391ebULL,
        0x37f528c146023910ULL, 0x0c1290636917deceULL, 0x9aee25e96251ca9cULL,
        0x728ac5ba853b69c2ULL, 0x9f272c93c4be20c8ULL, 0x06c1aa6319d28124ULL,
        0x4324496b1ca8a4f7ULL, 0x0096ecfe7dfc0189ULL, 0x9e06131b19ae0020ULL,
        0x15278b15902f4597ULL, 0x2a9fece8c13842d8ULL, 0x1d4e6781f0e1355eULL,
        0x6855b712d3dbf7c0ULL, 0x06a07fad99be6f46ULL, 0x3ed9d7957e4d1d7cULL,
        0x0c326f7cbc248bb2ULL, 0xe6363ad2c537cf51ULL, 0x0e12eb1c40723f13ULL,
        0xf5c6ac850afba803ULL, 0x0322a79d615fa9f0ULL, 0x6116696ed97bd5f8ULL,
        0x0d438080fbbdc9f1ULL, 0x2e4dc42c38f1e243ULL, 0x64948e9104f3a5bfULL,
        0x9fd622371bdb5f00ULL, 0x0f12bf082b2a1b6eULL, 0x4b1f8d867d78031cULL,
        0x134392ea9f5ef832ULL, 0xf3d70472321bc23eULL, 0x05fcbe5e9eea268eULL,
        0x136dede7175a22cfULL, 0x1308f8baac2cbcccULL, 0xd691026f0915eb64ULL,
        0x0e49a668345c3a38ULL, 0x24ddbbe8bc96f331ULL, 0x4d2ec9479b640578ULL,
        0x450f0697327b359cULL, 0x32b45360f4488ee0ULL, 0x4f6d9ecec46a105aULL,
        0x5500c63401ae8e80ULL, 0x47dea495cf6f98baULL, 0x13dc9a2dfca80babULL,
        0xe6f8a93f7b24ca92ULL, 0x073f57a6d900a87fULL, 0x9ddb935fd3aa695aULL,
        0x101e98d24b39e8aaULL, 0x6b8d0eb95a507ddcULL, 0x45a908b3903d209bULL,
        0x6c96a3e119e617d4ULL, 0x2442787543d3be48ULL, 0xd3bc055c7544b364ULL,
        0x7693bb042ca8653eULL, 0xb95e3a4ea5d0101eULL, 0x116f0d459bb94a73ULL,
        0x841244b72cdc5e90ULL, 0x1271acced6cb34d3ULL, 0x07d289106524d638ULL,
        0x537c9cf49c01b5bbULL, 0x8a8e16706bb7a5daULL, 0x12e50a9c499dc3a9ULL,
        0x1cade520db2ba830ULL, 0x1add52f000d7db70ULL, 0x12cf15db2ce78e30ULL,
        0x0657eaf606bfc866ULL, 0x4026816d3b05b1d0ULL, 0x1ba0ebdf90128e4aULL,
        0xdfd649375996dd6eULL, 0x0f416e906c23d9aeULL, 0x384273cad0582a24ULL,
        0x2ff27b0378a46189ULL, 0xc4ecd18a2d7a7616ULL, 0x35cef0b5cd51d640ULL,
        0x7d582363643f48b7ULL, 0x0984ad746ad0ab7cULL, 0x2990a999835f9688ULL,
        0x2d4df66a97b19e05ULL, 0x592c79720af99aa2ULL, 0x052863c230602cd3ULL,
        0x5f5e2b15edcf2840ULL, 0x01dff1b694b978b0ULL, 0x14345a48b622025eULL,
        0x028fab3b6407f715ULL, 0x3455d188e6feca50ULL, 0x1d0d40288fb1b5fdULL,
        0x4685c5c2b6a1e5aeULL, 0x3a2077b1e5fe5adeULL, 0x1bc55d611445a0d8ULL,
        0x05480ae95f3f83feULL, 0xbbb59cfcf7e17fb6ULL, 0x13f7f10970bbb990ULL,
        0x6d00ac169425a352ULL, 0x7da0db397ef2d5d3ULL, 0x5b512a247f8d2479ULL,
        0x637eaa6a977c3c32ULL, 0x3720f0ae37cba89cULL, 0x443df6e6aa7f525bULL,
        0x28664c287dcef321ULL, 0x03c267c00cf35e49ULL, 0x690185572d4021deULL,
        0x2707ff2596e321c2ULL, 0xd865f5af7722c380ULL, 0x1ea285658e33aafbULL,
        0xc257c5e88755bef4ULL, 0x066f67275cfcc31eULL, 0xb09931945cc0fed0ULL,
        0x58c1dc38d6e3a03fULL, 0xf99489678fc94ee8ULL, 0x75045bb99be5758aULL,
        0x6c163bc34b40feefULL, 0x0420063ce7bdd3b4ULL, 0xf86ef10582bf2e28ULL,
        0x162c3449ca14858cULL, 0x94106aa61dfe3280ULL, 0x4073ae7a4e7e4941ULL,
        0x32b13fd179c250b4ULL, 0x0178fbb216a7e744ULL, 0xf840ae2f1cf92669ULL,
        0x18fc709acc80243dULL, 0x20ac2ebd69f4d558ULL, 0x6e580ad9c73ad46aULL,
0x18fc709acc80243dULL, 0x20ac2ebd69f4d558ULL, 0x6e580ad9c73ad46aULL,
|
||||
0x76d2b535b541c19dULL, 0x6c7a3fb9dd0ce0afULL, 0xc3481689b9754f28ULL,
|
||||
0x156e813b6557abdbULL, 0x6ee372e31276eb10ULL, 0x19cf37c038c8d381ULL,
|
||||
0x00d4d906c9ae3072ULL, 0x09f03cbb6dfbfd40ULL, 0x461ba31c4125f3cfULL,
|
||||
0x25b29fc63ad9f05bULL, 0x6808c95c2dddede9ULL, 0x0564224337066d9bULL,
|
||||
0xc87eb5f4a4d966f2ULL, 0x66fc66e1701f5847ULL, 0xc553a3559f74da28ULL,
|
||||
0x1dfd841be574df43ULL, 0x3ee2f100c3ebc082ULL, 0x1a2c4f9517b56e89ULL,
|
||||
0x502f65c4b535c8ffULL, 0x1da5663ab6f96ec0ULL, 0xba1f80b73988152cULL,
|
||||
0x364ff12182ac8dc1ULL, 0xe3457a3c4871db31ULL, 0x6ae9cadf92fd7e84ULL,
|
||||
0x9621ba3d6ca15186ULL, 0x00ff5af878c144ceULL, 0x918464dc130101a4ULL,
|
||||
0x036511e6b187efa6ULL, 0x06667d66550ff260ULL, 0x7fd18913f9b51bc1ULL,
|
||||
0x3740e6b27af77aa8ULL, 0x1f546c2fd358ff8aULL, 0x42f1424e3115c891ULL,
|
||||
0x03767db4e3a1bb33ULL, 0xa171a1c564345060ULL, 0x0afcf632fd7b1324ULL,
|
||||
0xb59508d933ffb7d0ULL, 0x57d766c42071be83ULL, 0x659f0447546114a2ULL,
|
||||
0x4070364481c460aeULL, 0xa2b9752280644d52ULL, 0x04ab884bea5771bdULL,
|
||||
0x87cd135602a232b4ULL, 0x15e54cd9a8155313ULL, 0x1e8005efaa3e1047ULL,
|
||||
0x696b93f4ab15d39fULL, 0x0855a8e540de863aULL, 0x0bb11799e79f9426ULL,
|
||||
0xeffa61e5c1b579baULL, 0x1e060a1d11808219ULL, 0x10e219205667c599ULL,
|
||||
0x2f7b206091c49498ULL, 0xb48854c820064860ULL, 0x21c4aaa3bfbe4a38ULL,
|
||||
0x8f4a032a3fa67e9cULL, 0x3146b3823401e2acULL, 0x3afee26f19d88400ULL,
|
||||
0x167087c485791d38ULL, 0xb67a1ed945b0fb4bULL, 0x02436eb17e27f1c0ULL,
|
||||
0xe05afce2ce2d2790ULL, 0x49c536fc6224cfebULL, 0x178865b3b862b856ULL,
|
||||
0x1ce530de26acde5bULL, 0x87312c0b30a06f38ULL, 0x03e653b578558d76ULL,
|
||||
0x4d3663c21d8b3accULL, 0x038003c23626914aULL, 0xd9d5a2c052a09451ULL,
|
||||
0x39b5acfe08a49384ULL, 0x40f349956d5800e4ULL, 0x0968b6950b1bd8feULL,
|
||||
0xd60b2ca030f3779cULL, 0x7c8bc11a23ce18edULL, 0xcc23374e27630bc2ULL,
|
||||
0x2e38fc2a8bb33210ULL, 0xe421357814ee5c44ULL, 0x315fb65ea71ec671ULL,
|
||||
0xfb1b0223f70ed290ULL, 0x30556c9f983eaf07ULL, 0x8dd438c3d0cd625aULL,
|
||||
0x05a8fd0c7ffde71bULL, 0x764d1313b5aeec7aULL, 0x2036af5de9622f47ULL,
|
||||
0x508a5bfadda292feULL, 0x3f77f04ba2830e90ULL, 0x9047cd9c66ca66d2ULL,
|
||||
0x1168b5318a54eb21ULL, 0xc93462d221da2e15ULL, 0x4c2c7cc54abc066eULL,
|
||||
0x767a56fec478240eULL, 0x095de72546595bd3ULL, 0xc9da535865158558ULL,
|
||||
0x1baccf36f33e73fbULL, 0xf3d7dbe64df77f18ULL, 0x1f8ebbb7be4850b8ULL,
|
||||
0x043c5ed77bce25a1ULL, 0x07d401041b2a178aULL, 0x9181ebb8bd8d5618ULL,
|
||||
0x078b935dc3e4034aULL, 0x7b59c08954214300ULL, 0x03570dc2a4f84421ULL,
|
||||
0xdd8715b82f6b4078ULL, 0x2bb49c8bb544163bULL, 0xc9eb125564d59686ULL,
|
||||
0x5fdc7a38f80b810aULL, 0x3a4a6d8fff686544ULL, 0x28360e2418627d3aULL,
|
||||
0x60874244c95ed992ULL, 0x2115cc1dd9c34ed3ULL, 0xfaa3ef61f55e9efcULL,
|
||||
0x27ac9b1ef1adc7e6ULL, 0x95ea00478fec3f54ULL, 0x5aea808b2d99ab43ULL,
|
||||
0xc8f79e51fe43a580ULL, 0x5dbccd714236ce25ULL, 0x783fa76ed0753458ULL,
|
||||
0x48cb290f19d84655ULL, 0xc86a832f7696099aULL, 0x52f30c6fec0e71d3ULL,
|
||||
0x77d4e91e8cdeb886ULL, 0x7169a703c6a79ccdULL, 0x98208145b9596f74ULL,
|
||||
0x0945695c761c0796ULL, 0x0be897830d17bae0ULL, 0x033ad3924caeeeb4ULL,
|
||||
0xedecb6cfa2d303a8ULL, 0x3f86b074818642e7ULL, 0xeefa7c878a8b03f4ULL,
|
||||
0x093c101b80922551ULL, 0xfb3b4e6c26ac0034ULL, 0x162bf87999b94f5eULL,
|
||||
0xeaedae76e975b17cULL, 0x1852aa090effe18eULL};
|
||||
|
||||
static constexpr uint64_t kCLMulUpper[kCLMulNum] = {
|
||||
0xbb41199b1d587c69ULL, 0x514d94d55894ee29ULL, 0xebc6cd4d2efd5d16ULL,
|
||||
0x042044ad2de477fdULL, 0xb865c8b0fcdf4b15ULL, 0x0724d7e551cc40f3ULL,
|
||||
0xb15a16f39edb0bccULL, 0x37d64419ede7a171ULL, 0x2aa01bb80c753401ULL,
|
||||
0x06ff3f8a95fdaf4dULL, 0x79898cc0838546deULL, 0x776acbd1b237c60aULL,
|
||||
0x4c1753be4f4e0064ULL, 0x0ba9243601206ed3ULL, 0xd567c3b1bf3ec557ULL,
|
||||
0x043fac7bcff61fb3ULL, 0x49356232b159fb2fULL, 0x3910c82038102d4dULL,
|
||||
0x30592fef753eb300ULL, 0x7b2660e0c92a9e9aULL, 0x8246c9248d671ef0ULL,
|
||||
0x5a0dcd95147af5faULL, 0x43fde953909cc0eaULL, 0x06147b972cb96e1bULL,
|
||||
0xd84193a6b2411d80ULL, 0x00cd7711b950196fULL, 0x1088f9f4ade7fa64ULL,
|
||||
0x05a13096ec113cfbULL, 0x958d816d53b00edcULL, 0x3846154a7cdba9cbULL,
|
||||
0x8af516db6b27d1e6ULL, 0x1a1d462ab8a33b13ULL, 0x4040b0ac1b2c754cULL,
|
||||
0x05127fe9af2fe1d6ULL, 0x9f96e79374321fa6ULL, 0x06ff64a4d9c326f3ULL,
|
||||
0x28709566e158ac15ULL, 0x301701d7111ca51cULL, 0x31e0445d1b9d9544ULL,
|
||||
0x0a95aff69bf1d03eULL, 0x7c298c8414ecb879ULL, 0x00801499b4143195ULL,
|
||||
0x91521a00dd676a5cULL, 0x2777526a14c2f723ULL, 0xfa26aac6a6357dddULL,
|
||||
0x1d265889b0187a4bULL, 0xcd6e70fa8ed283e4ULL, 0x18a815aa50ea92caULL,
|
||||
0xc01e082694a263c6ULL, 0x4b40163ba53daf25ULL, 0xbc658caff6501673ULL,
|
||||
0x3ba35359586b9652ULL, 0x74f96acc97a4936cULL, 0x3989dfdb0cf1d2cfULL,
|
||||
0x358a01eaa50dda32ULL, 0x01109a5ed8f0802bULL, 0x55b84922e63c2958ULL,
|
||||
0x55b14843d87551d5ULL, 0x1db8ec61b1b578d8ULL, 0x79a2d49ef8c3658fULL,
|
||||
0xa304516816b3fbe0ULL, 0x163ecc09cc7b82f9ULL, 0xab91e8d22aabef00ULL,
|
||||
0x0ed6b09262de8354ULL, 0xcfd47d34cf73f6f2ULL, 0x7dbd1db2390bc6c3ULL,
|
||||
0x5ae789d3875e7b00ULL, 0x1d60fd0e70fe8fa4ULL, 0x690bc15d5ae4f6f5ULL,
|
||||
0x121ef5565104fb44ULL, 0x6e98e89297353b54ULL, 0x42554949249d62edULL,
|
||||
0xd6d6d16b12df78d2ULL, 0x320b33549b74975dULL, 0xd2a0618763d22e00ULL,
|
||||
0x0808deb93cba2017ULL, 0x01bd3b2302a2cc70ULL, 0x0b7b8dd4d71c8dd6ULL,
|
||||
0x34d60a3382a0756cULL, 0x40984584c8219629ULL, 0xf1152cba10093a66ULL,
|
||||
0x068001c6b2159ccbULL, 0x3d70f13c6cda0800ULL, 0x0e6b6746a322b956ULL,
|
||||
0x83a494319d8c770bULL, 0x0faecf64a8553e9aULL, 0xa34919222c39b1bcULL,
|
||||
0x0c63850d89e71c6fULL, 0x585f0bee92e53dc8ULL, 0x10f222b13b4fa5deULL,
|
||||
0x61573114f94252f2ULL, 0x09d59c311fba6c27ULL, 0x014effa7da49ed4eULL,
|
||||
0x4a400a1bc1c31d26ULL, 0xc9091c047b484972ULL, 0x3989f341ec2230ccULL,
|
||||
0xdcb03a98b3aee41eULL, 0x4a54a676a33a95e1ULL, 0xe499b7753951ef7cULL,
|
||||
0x2f43b1d1061d8b48ULL, 0xc3313bdc68ceb146ULL, 0x5159f6bc0e99227fULL,
|
||||
0x98128e6d9c05efcaULL, 0x15ea32b27f77815bULL, 0xe882c054e2654eecULL,
|
||||
0x003d2cdb8faee8c6ULL, 0xb416dd333a9fe1dfULL, 0x73f6746aefcfc98bULL,
|
||||
0x93dc114c10a38d70ULL, 0x05055941657845eaULL, 0x2ed7351347349334ULL,
|
||||
0x26fb1ee2c69ae690ULL, 0xa4575d10dc5b28e0ULL, 0x3395b11295e485ebULL,
|
||||
0xe840f198a224551cULL, 0x78e6e5a431d941d4ULL, 0xa1fee3ceab27f391ULL,
|
||||
0x07d35b3c5698d0dcULL, 0x983c67fca9174a29ULL, 0x2bb6bbae72b5144aULL,
|
||||
0xa7730b8d13ce58efULL, 0x51b5272883de1998ULL, 0xb334e128bb55e260ULL,
|
||||
0x1cacf5fbbe1b9974ULL, 0x71a9df4bb743de60ULL, 0x5176fe545c2d0d7aULL,
|
||||
0xbe592ecf1a16d672ULL, 0x27aa8a30c3efe460ULL, 0x4c78a32f47991e06ULL,
|
||||
0x383459294312f26aULL, 0x97ba789127f1490cULL, 0x51c9aa8a3abd1ef1ULL,
|
||||
0xcc7355188121e50fULL, 0x0ecb3a178ae334c1ULL, 0x84879a5e574b7160ULL,
|
||||
0x0765298f6389e8f3ULL, 0x5c6750435539bb22ULL, 0x11a05cf056c937b5ULL,
|
||||
0xb5dc2172dbfb7662ULL, 0x3ffc17915d9f40e8ULL, 0xbc7904daf3b431b0ULL,
|
||||
0x71f2088490930a7cULL, 0xa89505fd9efb53c4ULL, 0x02e194afd61c5671ULL,
|
||||
0x99a97f4abf35fcecULL, 0x26830aad30fae96fULL, 0x4b2abc16b25cf0b0ULL,
|
||||
0x07ec6fffa1cafbdbULL, 0xf38188fde97a280cULL, 0x121335701afff64dULL,
|
||||
0xea5ef38b4e672a64ULL, 0x477edbcae3eabf03ULL, 0xa32813cc0e0d244dULL,
|
||||
0x13346d2af4972eefULL, 0xcbc18357af1cfa9aULL, 0x561b630316e73fa6ULL,
|
||||
0xe9dfb53249249305ULL, 0x5d2b9dd1479312eeULL, 0x3458008119b56d04ULL,
|
||||
0x50e6790b49801385ULL, 0x5bb9febe2349492bULL, 0x0c2813954299098fULL,
|
||||
0xf747b0c890a071d5ULL, 0x417e8f82cc028d77ULL, 0xa134fee611d804f8ULL,
|
||||
0x24c99ee9a0408761ULL, 0x3ebb224e727137f3ULL, 0x0686022073ceb846ULL,
|
||||
0xa05e901fb82ad7daULL, 0x0ece7dc43ab470fcULL, 0x2d334ecc58f7d6a3ULL,
|
||||
0x23166fadacc54e40ULL, 0x9c3a4472f839556eULL, 0x071717ab5267a4adULL,
|
||||
0xb6600ac351ba3ea0ULL, 0x30ec748313bb63d4ULL, 0xb5374e39287b23ccULL,
|
||||
0x074d75e784238aebULL, 0x77315879243914a4ULL, 0x3bbb1971490865f1ULL,
|
||||
0xa355c21f4fbe02d3ULL, 0x0027f4bb38c8f402ULL, 0xeef8708e652bc5f0ULL,
|
||||
0x7b9aa56cf9440050ULL, 0x113ac03c16cfc924ULL, 0x395db36d3e4bef9fULL,
|
||||
0x5d826fabcaa597aeULL, 0x2a77d3c58786d7e0ULL, 0x85996859a3ba19d4ULL,
|
||||
0x01e7e3c904c2d97fULL, 0x34f90b9b98d51fd0ULL, 0x243aa97fd2e99bb7ULL,
|
||||
0x40a0cebc4f65c1e8ULL, 0x46d3922ed4a5503eULL, 0x446e7ecaf1f9c0a4ULL,
|
||||
0x49dc11558bc2e6aeULL, 0xe7a9f20881793af8ULL, 0x5771cc4bc98103f1ULL,
|
||||
0x2446ea6e718fce90ULL, 0x25d14aca7f7da198ULL, 0x4347af186f9af964ULL,
|
||||
0x10cb44fc9146363aULL, 0x8a35587afce476b4ULL, 0x575144662fee3d3aULL,
|
||||
0x69f41177a6bc7a05ULL, 0x02ff8c38d6b3c898ULL, 0x57c73589a226ca40ULL,
|
||||
0x732f6b5baae66683ULL, 0x00c008bbedd4bb34ULL, 0x7412ff09524d6cadULL,
|
||||
0xb8fd0b5ad8c145a8ULL, 0x74bd9f94b6cdc7dfULL, 0x68233b317ca6c19cULL,
|
||||
0x314b9c2c08b15c54ULL, 0x5bd1ad72072ebd08ULL, 0x6610e6a6c07030e4ULL,
|
||||
0xa4fc38e885ead7ceULL, 0x36975d1ca439e034ULL, 0xa358f0fe358ffb1aULL,
|
||||
0x38e247ad663acf7dULL, 0x77daed3643b5deb8ULL, 0x5507c2aeae1ec3d0ULL,
|
||||
0xfdec226c73acf775ULL, 0x1b87ff5f5033492dULL, 0xa832dee545d9033fULL,
|
||||
0x1cee43a61e41783bULL, 0xdff82b2e2d822f69ULL, 0x2bbc9a376cb38cf2ULL,
|
||||
0x117b1cdaf765dc02ULL, 0x26a407f5682be270ULL, 0x8eb664cf5634af28ULL,
|
||||
0x17cb4513bec68551ULL, 0xb0df6527900cbfd0ULL, 0x335a2dc79c5afdfcULL,
|
||||
0xa2f0ca4cd38dca88ULL, 0x1c370713b81a2de1ULL, 0x849d5df654d1adfcULL,
|
||||
0x2fd1f7675ae14e44ULL, 0x4ff64dfc02247f7bULL, 0x3a2bcf40e395a48dULL,
|
||||
0x436248c821b187c1ULL, 0x29f4337b1c7104c0ULL, 0xfc317c46e6630ec4ULL,
|
||||
0x2774bccc4e3264c7ULL, 0x2d03218d9d5bee23ULL, 0x36a0ed04d659058aULL,
|
||||
0x452484461573cab6ULL, 0x0708edf87ed6272bULL, 0xf07960a1587446cbULL,
|
||||
0x3660167b067d84e0ULL, 0x65990a6993ddf8c4ULL, 0x0b197cd3d0b40b3fULL,
|
||||
0x1dcec4ab619f3a05ULL, 0x722ab223a84f9182ULL, 0x0822d61a81e7c38fULL,
|
||||
0x3d22ad75da563201ULL, 0x93cef6979fd35e0fULL, 0x05c3c25ae598b14cULL,
|
||||
0x1338df97dd496377ULL, 0x15bc324dc9c20acfULL, 0x96397c6127e6e8cfULL,
|
||||
0x004d01069ef2050fULL, 0x2fcf2e27893fdcbcULL, 0x072f77c3e44f4a5cULL,
|
||||
0x5eb1d80b3fe44918ULL, 0x1f59e7c28cc21f22ULL, 0x3390ce5df055c1f8ULL,
|
||||
0x4c0ef11df92cb6bfULL, 0x50f82f9e0848c900ULL, 0x08d0fde3ffc0ae38ULL,
|
||||
0xbd8d0089a3fbfb73ULL, 0x118ba5b0f311ef59ULL, 0x9be9a8407b926a61ULL,
|
||||
0x4ea04fbb21318f63ULL, 0xa1c8e7bb07b871ffULL, 0x1253a7262d5d3b02ULL,
|
||||
0x13e997a0512e5b29ULL, 0x54318460ce9055baULL, 0x4e1d8a4db0054798ULL,
|
||||
0x0b235226e2cade32ULL, 0x2588732c1476b315ULL, 0x16a378750ba8ac68ULL,
|
||||
0xba0b116c04448731ULL, 0x4dd02bd47694c2f1ULL, 0x16d6797b218b6b25ULL,
|
||||
0x769eb3709cfbf936ULL, 0x197746a0ce396f38ULL, 0x7d17ad8465961d6eULL,
|
||||
0xfe58f4998ae19bb4ULL, 0x36df24305233ce69ULL, 0xb88a4eb008f4ee72ULL,
|
||||
0x302b2eb923334787ULL, 0x15a4e3edbe13d448ULL, 0x39a4bf64dd7730ceULL,
|
||||
0xedf25421b31090c4ULL, 0x4d547fc131be3b69ULL, 0x2b316e120ca3b90eULL,
|
||||
0x0faf2357bf18a169ULL, 0x71f34b54ee2c1d62ULL, 0x18eaf6e5c93a3824ULL,
|
||||
0x7e168ba03c1b4c18ULL, 0x1a534dd586d9e871ULL, 0xa2cccd307f5f8c38ULL,
|
||||
0x2999a6fb4dce30f6ULL, 0x8f6d3b02c1d549a6ULL, 0x5cf7f90d817aac5aULL,
|
||||
0xd2a4ceefe66c8170ULL, 0x11560edc4ca959feULL, 0x89e517e6f0dc464dULL,
|
||||
0x75bb8972dddd2085ULL, 0x13859ed1e459d65aULL, 0x057114653326fa84ULL,
|
||||
0xe2e6f465173cc86cULL, 0x0ada4076497d7de4ULL, 0xa856fa10ec6dbf8aULL,
|
||||
0x41505d9a7c25d875ULL, 0x3091b6278382eccdULL, 0x055737185b2c3f13ULL,
|
||||
0x2f4df8ecd6f9c632ULL, 0x0633e89c33552d98ULL, 0xf7673724d16db440ULL,
|
||||
0x7331bd08e636c391ULL, 0x0252f29672fee426ULL, 0x1fc384946b6b9ddeULL,
|
||||
0x03460c12c901443aULL, 0x003a0792e10abcdaULL, 0x8dbec31f624e37d0ULL,
|
||||
0x667420d5bfe4dcbeULL, 0xfbfa30e874ed7641ULL, 0x46d1ae14db7ecef6ULL,
|
||||
0x216bd7e8f5448768ULL, 0x32bcd40d3d69cc88ULL, 0x2e991dbc39b65abeULL,
|
||||
0x0e8fb123a502f553ULL, 0x3d2d486b2c7560c0ULL, 0x09aba1db3079fe03ULL,
|
||||
0xcb540c59398c9bceULL, 0x363970e5339ed600ULL, 0x2caee457c28af00eULL,
|
||||
0x005e7d7ee47f41a0ULL, 0x69fad3eb10f44100ULL, 0x048109388c75beb3ULL,
|
||||
0x253dddf96c7a6fb8ULL, 0x4c47f705b9d47d09ULL, 0x6cec894228b5e978ULL,
|
||||
0x04044bb9f8ff45c2ULL, 0x079e75704d775caeULL, 0x073bd54d2a9e2c33ULL,
|
||||
0xcec7289270a364fbULL, 0x19e7486f19cd9e4eULL, 0xb50ac15b86b76608ULL,
|
||||
0x0620cf81f165c812ULL, 0x63eaaf13be7b11d4ULL, 0x0e0cf831948248c2ULL,
|
||||
0xf0412df8f46e7957ULL, 0x671c1fe752517e3fULL, 0x8841bfb04dd3f540ULL,
|
||||
0x122de4142249f353ULL, 0x40a4959fb0e76870ULL, 0x25cfd3d4b4bbc459ULL,
|
||||
0x78a07c82930c60d0ULL, 0x12c2de24d4cbc969ULL, 0x85d44866096ad7f4ULL,
|
||||
0x1fd917ca66b2007bULL, 0x01fbbb0751764764ULL, 0x3d2a4953c6fe0fdcULL,
|
||||
0xcc1489c5737afd94ULL, 0x1817c5b6a5346f41ULL, 0xe605a6a7e9985644ULL,
|
||||
0x3c50412328ff1946ULL, 0xd8c7fd65817f1291ULL, 0x0bd66975ab66339bULL,
|
||||
0x2baf8fa1c7d10fa9ULL, 0x24abdf06ddef848dULL, 0x14df0c9b2ea4f6c2ULL,
|
||||
0x2be950edfd2cb1f7ULL, 0x21911e21094178b6ULL, 0x0fa54d518a93b379ULL,
|
||||
0xb52508e0ac01ab42ULL, 0x0e035b5fd8cb79beULL, 0x1c1c6d1a3b3c8648ULL,
|
||||
0x286037b42ea9871cULL, 0xfe67bf311e48a340ULL, 0x02324131e932a472ULL,
|
||||
0x2486dc2dd919e2deULL, 0x008aec7f1da1d2ebULL, 0x63269ba0e8d3eb3aULL,
|
||||
0x23c0f11154adb62fULL, 0xc6052393ecd4c018ULL, 0x523585b7d2f5b9fcULL,
|
||||
0xf7e6f8c1e87564c9ULL, 0x09eb9fe5dd32c1a3ULL, 0x4d4f86886e055472ULL,
|
||||
0x67ea17b58a37966bULL, 0x3d3ce8c23b1ed1a8ULL, 0x0df97c5ac48857ceULL,
|
||||
0x9b6992623759eb12ULL, 0x275aa9551ae091f2ULL, 0x08855e19ac5e62e5ULL,
|
||||
0x1155fffe0ae083ccULL, 0xbc9c78db7c570240ULL, 0x074560c447dd2418ULL,
|
||||
0x3bf78d330bcf1e70ULL, 0x49867cd4b7ed134bULL, 0x8e6eee0cb4470accULL,
|
||||
0x1dabafdf59233dd6ULL, 0xea3a50d844fc3fb8ULL, 0x4f03f4454764cb87ULL,
|
||||
0x1f2f41cc36c9e6ecULL, 0x53cba4df42963441ULL, 0x10883b70a88d91fbULL,
|
||||
0x62b1fc77d4eb9481ULL, 0x893d8f2604b362e1ULL, 0x0933b7855368b440ULL,
|
||||
0x9351b545703b2fceULL, 0x59c1d489b9bdd3b4ULL, 0xe72a9c4311417b18ULL,
|
||||
0x5355df77e88eb226ULL, 0xe802c37aa963d7e1ULL, 0x381c3747bd6c3bc3ULL,
|
||||
0x378565573444258cULL, 0x37848b1e52b43c18ULL, 0x5da2cd32bdce12b6ULL,
|
||||
0x13166c5da615f6fdULL, 0xa51ef95efcc66ac8ULL, 0x640c95e473f1e541ULL,
|
||||
0x6ec68def1f217500ULL, 0x49ce3543c76a4079ULL, 0x5fc6fd3cddc706b5ULL,
|
||||
0x05c3c0f0f6a1fb0dULL, 0xe7820c0996ad1bddULL, 0x21f0d752a088f35cULL,
|
||||
0x755405b51d6fc4a0ULL, 0x7ec7649ca4b0e351ULL, 0x3d2b6a46a251f790ULL,
|
||||
0x23e1176b19f418adULL, 0x06056575efe8ac05ULL, 0x0f75981b6966e477ULL,
|
||||
0x06e87ec41ad437e4ULL, 0x43f6c255d5e1cb84ULL, 0xe4e67d1120ceb580ULL,
|
||||
0x2cd67b9e12c26d7bULL, 0xcd00b5ff7fd187f1ULL, 0x3f6cd40accdc4106ULL,
|
||||
0x3e895c835459b330ULL, 0x0814d53a217c0850ULL, 0xc9111fe78bc3a62dULL,
|
||||
0x719967e351473204ULL, 0xe757707d24282aa4ULL, 0x7226b7f5607f98e6ULL,
|
||||
0x7b268ffae3c08d96ULL, 0x16d3917c8b86020eULL, 0x5128bca51c49ea64ULL,
|
||||
0x345ffea02bb1698dULL, 0x9460f5111fe4fbc8ULL, 0x60dd1aa5762852cbULL,
|
||||
0xbb7440ed3c81667cULL, 0x0a4b12affa7f6f5cULL, 0x95cbcb0ae03861b6ULL,
|
||||
0x07ab3b0591db6070ULL, 0xc6476a4c3de78982ULL, 0x204e82e8623ad725ULL,
|
||||
0x569a5b4e8ac2a5ccULL, 0x425a1d77d72ebae2ULL, 0xcdaad5551ab33830ULL,
|
||||
0x0b7c68fd8422939eULL, 0x46d9a01f53ec3020ULL, 0x102871edbb29e852ULL,
|
||||
0x7a8e8084039075a5ULL, 0x40eaede8615e376aULL, 0x4dc67d757a1c751fULL,
|
||||
0x1176ef33063f9145ULL, 0x4ea230285b1c8156ULL, 0x6b2aa46ce0027392ULL,
|
||||
0x32b13230fba1b068ULL, 0x0e69796851bb984fULL, 0xb749f4542db698c0ULL,
|
||||
0x19ad0241ffffd49cULL, 0x2f41e92ef6caff52ULL, 0x4d0b068576747439ULL,
|
||||
0x14d607aef7463e00ULL, 0x1443d00d85fb440eULL, 0x529b43bf68688780ULL,
|
||||
0x21133a6bc3a3e378ULL, 0x865b6436dae0e7e5ULL, 0x6b4fe83dc1d6defcULL,
|
||||
0x03a5858a0ca0be46ULL, 0x1e841b187e67f312ULL, 0x61ee22ef40a66940ULL,
|
||||
0x0494bd2e9e741ef8ULL, 0x4eb59e323010e72cULL, 0x19f2abcfb749810eULL,
|
||||
0xb30f1e4f994ef9bcULL, 0x53cf6cdd51bd2d96ULL, 0x263943036497a514ULL,
|
||||
0x0d4b52170aa2edbaULL, 0x0c4758a1c7b4f758ULL, 0x178dadb1b502b51aULL,
|
||||
0x1ddbb20a602eb57aULL, 0x1fc2e2564a9f27fdULL, 0xd5f8c50a0e3d6f90ULL,
|
||||
0x0081da3bbe72ac09ULL, 0xcf140d002ccdb200ULL, 0x0ae8389f09b017feULL,
|
||||
0x17cc9ffdc03f4440ULL, 0x04eb921d704bcdddULL, 0x139a0ce4cdc521abULL,
|
||||
0x0bfce00c145cb0f0ULL, 0x99925ff132eff707ULL, 0x063f6e5da50c3d35ULL,
|
||||
0xa0c25dea3f0e6e29ULL, 0x0c7a9048cc8e040fULL,
|
||||
};
|
||||
|
||||
const size_t padded = RoundUpTo(kCLMulNum, N);
|
||||
auto expected_lower = AllocateAligned<T>(padded);
|
||||
auto expected_upper = AllocateAligned<T>(padded);
|
||||
memcpy(expected_lower.get(), kCLMulLower, kCLMulNum * sizeof(T));
|
||||
memcpy(expected_upper.get(), kCLMulUpper, kCLMulNum * sizeof(T));
|
||||
const size_t padding_size = (padded - kCLMulNum) * sizeof(T);
|
||||
memset(expected_lower.get() + kCLMulNum, 0, padding_size);
|
||||
memset(expected_upper.get() + kCLMulNum, 0, padding_size);
|
||||
|
||||
// Random inputs in each lane
|
||||
RandomState rng;
|
||||
for (size_t rep = 0; rep < kCLMulNum / N; ++rep) {
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
in1[i] = Random64(&rng);
|
||||
in2[i] = Random64(&rng);
|
||||
}
|
||||
|
||||
const auto a = Load(d, in1.get());
|
||||
const auto b = Load(d, in2.get());
|
||||
#if HWY_PRINT_CLMUL_GOLDEN
|
||||
Store(CLMulLower(a, b), d, expected_lower.get() + rep * N);
|
||||
Store(CLMulUpper(a, b), d, expected_upper.get() + rep * N);
|
||||
#else
|
||||
HWY_ASSERT_VEC_EQ(d, expected_lower.get() + rep * N, CLMulLower(a, b));
|
||||
HWY_ASSERT_VEC_EQ(d, expected_upper.get() + rep * N, CLMulUpper(a, b));
|
||||
#endif
|
||||
}
|
||||
|
||||
#if HWY_PRINT_CLMUL_GOLDEN
|
||||
// RVV lacks PRIu64, so print 32-bit halves.
|
||||
for (size_t i = 0; i < kCLMulNum; ++i) {
|
||||
printf("0x%08x%08xULL,", static_cast<uint32_t>(expected_lower[i] >> 32),
|
||||
static_cast<uint32_t>(expected_lower[i] & 0xFFFFFFFFU));
|
||||
}
|
||||
printf("\n");
|
||||
for (size_t i = 0; i < kCLMulNum; ++i) {
|
||||
printf("0x%08x%08xULL,", static_cast<uint32_t>(expected_upper[i] >> 32),
|
||||
static_cast<uint32_t>(expected_upper[i] & 0xFFFFFFFFU));
|
||||
}
|
||||
#endif // HWY_PRINT_CLMUL_GOLDEN
|
||||
#else
|
||||
(void)d;
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
HWY_NOINLINE void TestAllCLMul() { ForGE128Vectors<TestCLMul>()(uint64_t()); }
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
|
||||
namespace hwy {
|
||||
HWY_BEFORE_TEST(HwyCryptoTest);
|
||||
HWY_EXPORT_AND_TEST_P(HwyCryptoTest, TestAllAES);
|
||||
HWY_EXPORT_AND_TEST_P(HwyCryptoTest, TestAllCLMul);
|
||||
} // namespace hwy
|
||||
|
||||
// Ought not to be necessary, but without this, no tests run on RVV.
|
||||
int main(int argc, char **argv) {
|
||||
::testing::InitGoogleTest(&argc, argv);
|
||||
return RUN_ALL_TESTS();
|
||||
}
|
||||
|
||||
#endif
|
|
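Editorial note on the golden values above: CLMulLower and CLMulUpper return the low and high 64-bit halves of the 128-bit carry-less (polynomial, GF(2)) product of each pair of 64-bit lanes, i.e. a multiplication built from shifts and XOR rather than addition. A minimal scalar sketch of that product, useful for cross-checking individual table entries; the helper name CLMulScalar is hypothetical and not part of this commit:

// Sketch: scalar carry-less multiply, assuming nothing beyond <stdint.h>.
#include <stdint.h>

static inline void CLMulScalar(uint64_t a, uint64_t b, uint64_t* lo,
                               uint64_t* hi) {
  uint64_t result_lo = 0, result_hi = 0;
  for (int i = 0; i < 64; ++i) {
    if ((b >> i) & 1) {
      // XOR in (a << i) across the 128-bit result; no carries propagate.
      result_lo ^= a << i;
      if (i != 0) result_hi ^= a >> (64 - i);  // guard avoids shift-by-64 UB
    }
  }
  *lo = result_lo;  // corresponds to CLMulLower's lane result
  *hi = result_hi;  // corresponds to CLMulUpper's lane result
}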
@ -0,0 +1,156 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef HWY_TESTS_HWY_GTEST_H_
#define HWY_TESTS_HWY_GTEST_H_

// Adapters for GUnit to run tests for all targets.

#include <stddef.h>
#include <stdint.h>

#include <string>
#include <utility>  // std::tuple

#include "gtest/gtest.h"
#include "hwy/highway.h"

namespace hwy {

// googletest before 1.10 didn't define INSTANTIATE_TEST_SUITE_P() but instead
// used INSTANTIATE_TEST_CASE_P which is now deprecated.
#ifdef INSTANTIATE_TEST_SUITE_P
#define HWY_GTEST_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_SUITE_P
#else
#define HWY_GTEST_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_CASE_P
#endif

// Helper class to run parametric tests using the hwy target as parameter. To
// use this define the following in your test:
//   class MyTestSuite : public TestWithParamTarget {
//    ...
//   };
//   HWY_TARGET_INSTANTIATE_TEST_SUITE_P(MyTestSuite);
//   TEST_P(MyTestSuite, MyTest) { ... }
class TestWithParamTarget : public testing::TestWithParam<uint32_t> {
 protected:
  void SetUp() override { SetSupportedTargetsForTest(GetParam()); }

  void TearDown() override {
    // Check that the parametric test calls SupportedTargets() when the source
    // was compiled with more than one target. In the single-target case only
    // static dispatch will be used anyway.
#if (HWY_TARGETS & (HWY_TARGETS - 1)) != 0
    EXPECT_TRUE(SupportedTargetsCalledForTest())
        << "This hwy target parametric test doesn't use dynamic-dispatch and "
           "doesn't need to be parametric.";
#endif
    SetSupportedTargetsForTest(0);
  }
};

// Function to convert the test parameter of a TestWithParamTarget for
// displaying it in the gtest test name.
static inline std::string TestParamTargetName(
    const testing::TestParamInfo<uint32_t>& info) {
  return TargetName(info.param);
}

#define HWY_TARGET_INSTANTIATE_TEST_SUITE_P(suite)              \
  HWY_GTEST_INSTANTIATE_TEST_SUITE_P(                           \
      suite##Group, suite,                                      \
      testing::ValuesIn(::hwy::SupportedAndGeneratedTargets()), \
      ::hwy::TestParamTargetName)

// Helper class similar to TestWithParamTarget to run parametric tests that
// depend on the target and another parametric test. If you need to use multiple
// extra parameters use a std::tuple<> of them and ::testing::Generate(...) as
// the generator. To use this class define the following in your test:
//   class MyTestSuite : public TestWithParamTargetT<int> {
//    ...
//   };
//   HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T(MyTestSuite, ::testing::Range(0, 9));
//   TEST_P(MyTestSuite, MyTest) { ... GetParam() .... }
template <typename T>
class TestWithParamTargetAndT
    : public ::testing::TestWithParam<std::tuple<uint32_t, T>> {
 public:
  // Expose the parametric type here so it can be used by the
  // HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T macro.
  using HwyParamType = T;

 protected:
  void SetUp() override {
    SetSupportedTargetsForTest(std::get<0>(
        ::testing::TestWithParam<std::tuple<uint32_t, T>>::GetParam()));
  }

  void TearDown() override {
    // Check that the parametric test calls SupportedTargets() when the source
    // was compiled with more than one target. In the single-target case only
    // static dispatch will be used anyway.
#if (HWY_TARGETS & (HWY_TARGETS - 1)) != 0
    EXPECT_TRUE(SupportedTargetsCalledForTest())
        << "This hwy target parametric test doesn't use dynamic-dispatch and "
           "doesn't need to be parametric.";
#endif
    SetSupportedTargetsForTest(0);
  }

  T GetParam() {
    return std::get<1>(
        ::testing::TestWithParam<std::tuple<uint32_t, T>>::GetParam());
  }
};

template <typename T>
std::string TestParamTargetNameAndT(
    const testing::TestParamInfo<std::tuple<uint32_t, T>>& info) {
  return std::string(TargetName(std::get<0>(info.param))) + "_" +
         ::testing::PrintToString(std::get<1>(info.param));
}

#define HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T(suite, generator)     \
  HWY_GTEST_INSTANTIATE_TEST_SUITE_P(                               \
      suite##Group, suite,                                          \
      ::testing::Combine(                                           \
          testing::ValuesIn(::hwy::SupportedAndGeneratedTargets()), \
          generator),                                               \
      ::hwy::TestParamTargetNameAndT<suite::HwyParamType>)

// Helper macro to export a function and define a test that tests it. This is
// equivalent to do a HWY_EXPORT of a void(void) function and run it in a test:
//   class MyTestSuite : public TestWithParamTarget {
//    ...
//   };
//   HWY_TARGET_INSTANTIATE_TEST_SUITE_P(MyTestSuite);
//   HWY_EXPORT_AND_TEST_P(MyTestSuite, MyTest);
#define HWY_EXPORT_AND_TEST_P(suite, func_name)                   \
  HWY_EXPORT(func_name);                                          \
  TEST_P(suite, func_name) { HWY_DYNAMIC_DISPATCH(func_name)(); } \
  static_assert(true, "For requiring trailing semicolon")

#define HWY_EXPORT_AND_TEST_P_T(suite, func_name)                           \
  HWY_EXPORT(func_name);                                                    \
  TEST_P(suite, func_name) { HWY_DYNAMIC_DISPATCH(func_name)(GetParam()); } \
  static_assert(true, "For requiring trailing semicolon")

#define HWY_BEFORE_TEST(suite)                      \
  class suite : public hwy::TestWithParamTarget {}; \
  HWY_TARGET_INSTANTIATE_TEST_SUITE_P(suite);       \
  static_assert(true, "For requiring trailing semicolon")

}  // namespace hwy

#endif  // HWY_TESTS_HWY_GTEST_H_
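A sketch of how a test file consumes these adapters, assuming the suite and function names below (SkeletonTest, TestAllAdd) are hypothetical placeholders rather than anything in this commit. HWY_BEFORE_TEST declares a gtest suite parameterized by target; HWY_EXPORT_AND_TEST_P exports a void() function from the per-target namespace and runs it via dynamic dispatch for whichever target the test parameter selects:

#if HWY_ONCE
namespace hwy {
// Declares `class SkeletonTest : public hwy::TestWithParamTarget {};` and
// instantiates it once per supported-and-generated target.
HWY_BEFORE_TEST(SkeletonTest);
// Exports the (hypothetical) per-target TestAllAdd and runs it via
// HWY_DYNAMIC_DISPATCH, so the same body is exercised on every target.
HWY_EXPORT_AND_TEST_P(SkeletonTest, TestAllAdd);
}  // namespace hwy
#endif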
@ -21,7 +21,9 @@
void PrintTargets(const char* msg, uint32_t targets) {
  fprintf(stderr, "%s", msg);
  for (unsigned x = targets; x != 0; x = x & (x - 1)) {
  // For each bit:
  for (uint32_t x = targets; x != 0; x = x & (x - 1)) {
    // Extract value of least-significant bit.
    fprintf(stderr, " %s", hwy::TargetName(x & (~x + 1)));
  }
  fprintf(stderr, "\n");

@ -30,5 +32,6 @@ void PrintTargets(const char* msg, uint32_t targets) {
int main() {
  PrintTargets("Compiled HWY_TARGETS:", HWY_TARGETS);
  PrintTargets("HWY_BASELINE_TARGETS:", HWY_BASELINE_TARGETS);
  PrintTargets("Current CPU supports:", hwy::SupportedTargets());
  return 0;
}
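The loop above leans on two standard bit idioms: x & (x - 1) clears the lowest set bit (advancing to the next target in the bitmask), while x & (~x + 1), equivalently x & -x, isolates that bit so it can be named. A standalone sketch of both, not part of this commit:

#include <stdint.h>
#include <stdio.h>

int main() {
  uint32_t targets = 0x2C;  // bits 2, 3 and 5 set
  while (targets != 0) {
    const uint32_t lowest = targets & (~targets + 1);  // isolate lowest bit
    printf("0x%x\n", lowest);            // prints 0x4, 0x8, 0x20
    targets = targets & (targets - 1);   // clear lowest set bit
  }
  return 0;
}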
@ -16,12 +16,12 @@
#include <stdint.h>
#include <string.h>  // memcmp

#include "hwy/aligned_allocator.h"
#include "hwy/base.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/logical_test.cc"
#include "hwy/foreach_target.h"

#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"

@ -36,7 +36,7 @@ struct TestLogicalInteger {
    const auto vi = Iota(d, 0);
    const auto ones = VecFromMask(d, Eq(v0, v0));
    const auto v1 = Set(d, 1);
    const auto vnot1 = Set(d, ~T(1));
    const auto vnot1 = Set(d, T(~T(1)));

    HWY_ASSERT_VEC_EQ(d, v0, Not(ones));
    HWY_ASSERT_VEC_EQ(d, ones, Not(v0));

@ -160,308 +160,6 @@ HWY_NOINLINE void TestAllCopySign() {
  ForFloatTypes(ForPartialVectors<TestCopySign>());
}

struct TestFirstN {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    auto mask_lanes = AllocateAligned<T>(N);

    // GCC workaround: we previously used zero to indicate true because we can
    // safely compare with that value. However, that hits an ICE for u64x1 on
    // GCC 8.3 but not 8.4, even if the implementation of operator== is
    // simplified to return zero. Using MaskFromVec avoids this, and requires
    // FF..FF and 0 constants.
    T on;
    memset(&on, 0xFF, sizeof(on));
    const T off = 0;

    for (size_t len = 0; len <= N; ++len) {
      for (size_t i = 0; i < N; ++i) {
        mask_lanes[i] = i < len ? on : off;
      }
      const auto mask_vals = Load(d, mask_lanes.get());
      const auto mask = MaskFromVec(mask_vals);
      HWY_ASSERT_MASK_EQ(d, mask, FirstN(d, len));
    }
  }
};

HWY_NOINLINE void TestAllFirstN() {
  ForAllTypes(ForPartialVectors<TestFirstN>());
}

struct TestIfThenElse {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    RandomState rng;

    const size_t N = Lanes(d);
    auto in1 = AllocateAligned<T>(N);
    auto in2 = AllocateAligned<T>(N);
    auto mask_lanes = AllocateAligned<T>(N);
    auto expected = AllocateAligned<T>(N);

    // NOTE: reverse polarity (mask is true iff lane == 0) because we cannot
    // reliably compare against all bits set (NaN for float types).
    const T off = 1;

    // Each lane should have a chance of having mask=true.
    for (size_t rep = 0; rep < 50; ++rep) {
      for (size_t i = 0; i < N; ++i) {
        in1[i] = static_cast<T>(Random32(&rng));
        in2[i] = static_cast<T>(Random32(&rng));
        mask_lanes[i] = (Random32(&rng) & 1024) ? off : T(0);
      }

      const auto v1 = Load(d, in1.get());
      const auto v2 = Load(d, in2.get());
      const auto mask = Eq(Load(d, mask_lanes.get()), Zero(d));

      for (size_t i = 0; i < N; ++i) {
        expected[i] = (mask_lanes[i] == off) ? in2[i] : in1[i];
      }
      HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenElse(mask, v1, v2));

      for (size_t i = 0; i < N; ++i) {
        expected[i] = mask_lanes[i] ? T(0) : in1[i];
      }
      HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenElseZero(mask, v1));

      for (size_t i = 0; i < N; ++i) {
        expected[i] = mask_lanes[i] ? in2[i] : T(0);
      }
      HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenZeroElse(mask, v2));
    }
  }
};

HWY_NOINLINE void TestAllIfThenElse() {
  ForAllTypes(ForPartialVectors<TestIfThenElse>());
}

struct TestMaskVec {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    RandomState rng;

    const size_t N = Lanes(d);
    auto mask_lanes = AllocateAligned<T>(N);

    // Each lane should have a chance of having mask=true.
    for (size_t rep = 0; rep < 100; ++rep) {
      for (size_t i = 0; i < N; ++i) {
        mask_lanes[i] = static_cast<T>(Random32(&rng) & 1);
      }

      const auto mask = RebindMask(d, Eq(Load(d, mask_lanes.get()), Zero(d)));
      HWY_ASSERT_MASK_EQ(d, mask, MaskFromVec(VecFromMask(d, mask)));
    }
  }
};

HWY_NOINLINE void TestAllMaskVec() {
  const ForPartialVectors<TestMaskVec> test;

  test(uint16_t());
  test(int16_t());
  // TODO(janwas): float16_t - cannot compare yet

  test(uint32_t());
  test(int32_t());
  test(float());

#if HWY_CAP_INTEGER64
  test(uint64_t());
  test(int64_t());
#endif
#if HWY_CAP_FLOAT64
  test(double());
#endif
}

struct TestCompress {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    RandomState rng;

    using TU = MakeUnsigned<T>;
    const Rebind<TU, D> du;
    const size_t N = Lanes(d);
    auto in_lanes = AllocateAligned<T>(N);
    auto mask_lanes = AllocateAligned<TU>(N);
    auto expected = AllocateAligned<T>(N);
    auto actual = AllocateAligned<T>(N);

    // Each lane should have a chance of having mask=true.
    for (size_t rep = 0; rep < 100; ++rep) {
      size_t expected_pos = 0;
      for (size_t i = 0; i < N; ++i) {
        const uint64_t bits = Random32(&rng);
        in_lanes[i] = T();  // cannot initialize float16_t directly.
        CopyBytes<sizeof(T)>(&bits, &in_lanes[i]);
        mask_lanes[i] = static_cast<TU>(Random32(&rng) & 1);
        if (mask_lanes[i] == 0) {  // Zero means true (easier to compare)
          expected[expected_pos++] = in_lanes[i];
        }
      }

      const auto in = Load(d, in_lanes.get());
      const auto mask = RebindMask(d, Eq(Load(du, mask_lanes.get()), Zero(du)));

      Store(Compress(in, mask), d, actual.get());
      // Upper lanes are undefined.
      for (size_t i = 0; i < expected_pos; ++i) {
        HWY_ASSERT(memcmp(&actual[i], &expected[i], sizeof(T)) == 0);
      }

      // Also check CompressStore in the same way.
      memset(actual.get(), 0, N * sizeof(T));
      const size_t num_written = CompressStore(in, mask, d, actual.get());
      HWY_ASSERT_EQ(expected_pos, num_written);
      for (size_t i = 0; i < expected_pos; ++i) {
        HWY_ASSERT(memcmp(&actual[i], &expected[i], sizeof(T)) == 0);
      }
    }
  }
};

#if 0
namespace detail {  // for code folding
void PrintCompress16x8Tables() {
  constexpr size_t N = 8;  // 128-bit SIMD
  for (uint64_t code = 0; code < 1ull << N; ++code) {
    std::array<uint8_t, N> indices{0};
    size_t pos = 0;
    for (size_t i = 0; i < N; ++i) {
      if (code & (1ull << i)) {
        indices[pos++] = i;
      }
    }

    // Doubled (for converting lane to byte indices)
    for (size_t i = 0; i < N; ++i) {
      printf("%d,", 2 * indices[i]);
    }
  }
  printf("\n");
}

// Compressed to nibbles
void PrintCompress32x8Tables() {
  constexpr size_t N = 8;  // AVX2
  for (uint64_t code = 0; code < 1ull << N; ++code) {
    std::array<uint32_t, N> indices{0};
    size_t pos = 0;
    for (size_t i = 0; i < N; ++i) {
      if (code & (1ull << i)) {
        indices[pos++] = i;
      }
    }

    // Convert to nibbles
    uint64_t packed = 0;
    for (size_t i = 0; i < N; ++i) {
      HWY_ASSERT(indices[i] < 16);
      packed += indices[i] << (i * 4);
    }

    HWY_ASSERT(packed < (1ull << 32));
    printf("0x%08x,", static_cast<uint32_t>(packed));
  }
  printf("\n");
}

// Pairs of 32-bit lane indices
void PrintCompress64x4Tables() {
  constexpr size_t N = 4;  // AVX2
  for (uint64_t code = 0; code < 1ull << N; ++code) {
    std::array<uint32_t, N> indices{0};
    size_t pos = 0;
    for (size_t i = 0; i < N; ++i) {
      if (code & (1ull << i)) {
        indices[pos++] = i;
      }
    }

    for (size_t i = 0; i < N; ++i) {
      printf("%d,%d,", 2 * indices[i], 2 * indices[i] + 1);
    }
  }
  printf("\n");
}

// 4-tuple of byte indices
void PrintCompress32x4Tables() {
  using T = uint32_t;
  constexpr size_t N = 4;  // SSE4
  for (uint64_t code = 0; code < 1ull << N; ++code) {
    std::array<uint32_t, N> indices{0};
    size_t pos = 0;
    for (size_t i = 0; i < N; ++i) {
      if (code & (1ull << i)) {
        indices[pos++] = i;
      }
    }

    for (size_t i = 0; i < N; ++i) {
      for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
        printf("%zu,", sizeof(T) * indices[i] + idx_byte);
      }
    }
  }
  printf("\n");
}

// 8-tuple of byte indices
void PrintCompress64x2Tables() {
  using T = uint64_t;
  constexpr size_t N = 2;  // SSE4
  for (uint64_t code = 0; code < 1ull << N; ++code) {
    std::array<uint32_t, N> indices{0};
    size_t pos = 0;
    for (size_t i = 0; i < N; ++i) {
      if (code & (1ull << i)) {
        indices[pos++] = i;
      }
    }

    for (size_t i = 0; i < N; ++i) {
      for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
        printf("%zu,", sizeof(T) * indices[i] + idx_byte);
      }
    }
  }
  printf("\n");
}
}  // namespace detail
#endif

HWY_NOINLINE void TestAllCompress() {
  // detail::PrintCompress32x8Tables();
  // detail::PrintCompress64x4Tables();
  // detail::PrintCompress32x4Tables();
  // detail::PrintCompress64x2Tables();
  // detail::PrintCompress16x8Tables();

  const ForPartialVectors<TestCompress> test;

  test(uint16_t());
  test(int16_t());
  test(float16_t());

  test(uint32_t());
  test(int32_t());
  test(float());

#if HWY_CAP_INTEGER64
  test(uint64_t());
  test(int64_t());
#endif
#if HWY_CAP_FLOAT64
  test(double());
#endif
}
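For reference while reading TestCompress above: Compress packs the lanes whose mask is true toward lane 0, preserving their order, and CompressStore additionally reports how many lanes it wrote; the test encodes "true" as a zero sentinel only so masks compare reliably across float types. A scalar model of the semantics, with a hypothetical helper name, not part of this commit:

#include <stddef.h>

// Scalar model of Compress/CompressStore: returns the number of packed lanes.
static size_t CompressScalar(const int* in, const bool* mask, size_t n,
                             int* out) {
  size_t pos = 0;
  for (size_t i = 0; i < n; ++i) {
    if (mask[i]) out[pos++] = in[i];  // keep selected lanes, in order
  }
  return pos;  // matches CompressStore's num_written; lanes past pos are
               // unspecified in the vector version
}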
struct TestZeroIfNegative {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {

@ -488,7 +186,7 @@ struct TestBroadcastSignBit {
    const auto s0 = Zero(d);
    const auto s1 = Set(d, -1);  // all bit set
    const auto vpos = And(Iota(d, 0), Set(d, LimitsMax<T>()));
    const auto vneg = s1 - vpos;
    const auto vneg = Sub(s1, vpos);

    HWY_ASSERT_VEC_EQ(d, s0, BroadcastSignBit(vpos));
    HWY_ASSERT_VEC_EQ(d, s0, BroadcastSignBit(Set(d, LimitsMax<T>())));

@ -508,23 +206,23 @@ struct TestTestBit {
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t kNumBits = sizeof(T) * 8;
    for (size_t i = 0; i < kNumBits; ++i) {
      const auto bit1 = Set(d, 1ull << i);
      const auto bit2 = Set(d, 1ull << ((i + 1) % kNumBits));
      const auto bit3 = Set(d, 1ull << ((i + 2) % kNumBits));
      const auto bit1 = Set(d, T(1ull << i));
      const auto bit2 = Set(d, T(1ull << ((i + 1) % kNumBits)));
      const auto bit3 = Set(d, T(1ull << ((i + 2) % kNumBits)));
      const auto bits12 = Or(bit1, bit2);
      const auto bits23 = Or(bit2, bit3);
      HWY_ASSERT(AllTrue(TestBit(bit1, bit1)));
      HWY_ASSERT(AllTrue(TestBit(bits12, bit1)));
      HWY_ASSERT(AllTrue(TestBit(bits12, bit2)));
      HWY_ASSERT(AllTrue(d, TestBit(bit1, bit1)));
      HWY_ASSERT(AllTrue(d, TestBit(bits12, bit1)));
      HWY_ASSERT(AllTrue(d, TestBit(bits12, bit2)));

      HWY_ASSERT(AllFalse(TestBit(bits12, bit3)));
      HWY_ASSERT(AllFalse(TestBit(bits23, bit1)));
      HWY_ASSERT(AllFalse(TestBit(bit1, bit2)));
      HWY_ASSERT(AllFalse(TestBit(bit2, bit1)));
      HWY_ASSERT(AllFalse(TestBit(bit1, bit3)));
      HWY_ASSERT(AllFalse(TestBit(bit3, bit1)));
      HWY_ASSERT(AllFalse(TestBit(bit2, bit3)));
      HWY_ASSERT(AllFalse(TestBit(bit3, bit2)));
      HWY_ASSERT(AllFalse(d, TestBit(bits12, bit3)));
      HWY_ASSERT(AllFalse(d, TestBit(bits23, bit1)));
      HWY_ASSERT(AllFalse(d, TestBit(bit1, bit2)));
      HWY_ASSERT(AllFalse(d, TestBit(bit2, bit1)));
      HWY_ASSERT(AllFalse(d, TestBit(bit1, bit3)));
      HWY_ASSERT(AllFalse(d, TestBit(bit3, bit1)));
      HWY_ASSERT(AllFalse(d, TestBit(bit2, bit3)));
      HWY_ASSERT(AllFalse(d, TestBit(bit3, bit2)));
    }
  }
};

@ -533,198 +231,54 @@ HWY_NOINLINE void TestAllTestBit() {
  ForIntegerTypes(ForPartialVectors<TestTestBit>());
}

struct TestAllTrueFalse {
struct TestPopulationCount {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto zero = Zero(d);
    auto v = zero;

    const size_t N = Lanes(d);
    auto lanes = AllocateAligned<T>(N);
    std::fill(lanes.get(), lanes.get() + N, T(0));

    auto mask_lanes = AllocateAligned<T>(N);

    HWY_ASSERT(AllTrue(Eq(v, zero)));
    HWY_ASSERT(!AllFalse(Eq(v, zero)));

    // Single lane implies AllFalse = !AllTrue. Otherwise, there are multiple
    // lanes and one is nonzero.
    const bool expected_all_false = (N != 1);

    // Set each lane to nonzero and back to zero
    for (size_t i = 0; i < N; ++i) {
      lanes[i] = T(1);
      v = Load(d, lanes.get());

      // GCC 10.2.1 workaround: AllTrue(Eq(v, zero)) is true but should not be.
      // Assigning to an lvalue is insufficient but storing to memory prevents
      // the bug; so does Print of VecFromMask(d, Eq(v, zero)).
      Store(VecFromMask(d, Eq(v, zero)), d, mask_lanes.get());
      HWY_ASSERT(!AllTrue(MaskFromVec(Load(d, mask_lanes.get()))));

      HWY_ASSERT(expected_all_false ^ AllFalse(Eq(v, zero)));

      lanes[i] = T(-1);
      v = Load(d, lanes.get());
      HWY_ASSERT(!AllTrue(Eq(v, zero)));
      HWY_ASSERT(expected_all_false ^ AllFalse(Eq(v, zero)));

      // Reset to all zero
      lanes[i] = T(0);
      v = Load(d, lanes.get());
      HWY_ASSERT(AllTrue(Eq(v, zero)));
      HWY_ASSERT(!AllFalse(Eq(v, zero)));
    }
  }
};

HWY_NOINLINE void TestAllAllTrueFalse() {
  ForAllTypes(ForPartialVectors<TestAllTrueFalse>());
}

class TestStoreMaskBits {
 public:
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*t*/, D d) {
    // TODO(janwas): remove once implemented (cast or vse1)
#if HWY_TARGET != HWY_RVV
    RandomState rng;
    const size_t N = Lanes(d);
    auto lanes = AllocateAligned<T>(N);
    const size_t expected_bytes = (N + 7) / 8;
    auto bits = AllocateAligned<uint8_t>(expected_bytes);

    for (size_t rep = 0; rep < 100; ++rep) {
      // Generate random mask pattern.
      for (size_t i = 0; i < N; ++i) {
        lanes[i] = static_cast<T>((rng() & 1024) ? 1 : 0);
      }
      const auto mask = Load(d, lanes.get()) == Zero(d);

      const size_t bytes_written = StoreMaskBits(mask, bits.get());

      HWY_ASSERT_EQ(expected_bytes, bytes_written);
      size_t i = 0;
      // Stored bits must match original mask
      for (; i < N; ++i) {
        const bool bit = (bits[i / 8] & (1 << (i % 8))) != 0;
        HWY_ASSERT_EQ(bit, lanes[i] == 0);
      }
      // Any partial bits in the last byte must be zero
      for (; i < 8 * bytes_written; ++i) {
        const int bit = (bits[i / 8] & (1 << (i % 8)));
        HWY_ASSERT_EQ(bit, 0);
      }
    }
#if HWY_TARGET == HWY_RVV || HWY_IS_DEBUG_BUILD
    constexpr size_t kNumTests = 1 << 14;
#else
    (void)d;
    constexpr size_t kNumTests = 1 << 20;
#endif
  }
};

HWY_NOINLINE void TestAllStoreMaskBits() {
  ForAllTypes(ForPartialVectors<TestStoreMaskBits>());
}

struct TestCountTrue {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    // For all combinations of zero/nonzero state of subset of lanes:
    const size_t max_lanes = std::min(N, size_t(10));

    auto lanes = AllocateAligned<T>(N);
    std::fill(lanes.get(), lanes.get() + N, T(1));

    for (size_t code = 0; code < (1ull << max_lanes); ++code) {
      // Number of zeros written = number of mask lanes that are true.
      size_t expected = 0;
      for (size_t i = 0; i < max_lanes; ++i) {
        lanes[i] = T(1);
        if (code & (1ull << i)) {
          ++expected;
          lanes[i] = T(0);
        }
    RandomState rng;
    size_t N = Lanes(d);
    auto data = AllocateAligned<T>(N);
    auto popcnt = AllocateAligned<T>(N);
    for (size_t i = 0; i < kNumTests / N; i++) {
      for (size_t i = 0; i < N; i++) {
        data[i] = static_cast<T>(rng());
        popcnt[i] = static_cast<T>(PopCount(data[i]));
      }

      const auto mask = Eq(Load(d, lanes.get()), Zero(d));
      const size_t actual = CountTrue(mask);
      HWY_ASSERT_EQ(expected, actual);
      HWY_ASSERT_VEC_EQ(d, popcnt.get(), PopulationCount(Load(d, data.get())));
    }
  }
};

HWY_NOINLINE void TestAllCountTrue() {
  ForAllTypes(ForPartialVectors<TestCountTrue>());
HWY_NOINLINE void TestAllPopulationCount() {
  ForUnsignedTypes(ForPartialVectors<TestPopulationCount>());
}

struct TestLogicalMask {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto m0 = MaskFalse(d);
    const auto m_all = MaskTrue(d);

    const size_t N = Lanes(d);
    auto lanes = AllocateAligned<T>(N);
    std::fill(lanes.get(), lanes.get() + N, T(1));

    HWY_ASSERT_MASK_EQ(d, m0, Not(m_all));
    HWY_ASSERT_MASK_EQ(d, m_all, Not(m0));

    // For all combinations of zero/nonzero state of subset of lanes:
    const size_t max_lanes = std::min(N, size_t(6));
    for (size_t code = 0; code < (1ull << max_lanes); ++code) {
      for (size_t i = 0; i < max_lanes; ++i) {
        lanes[i] = T(1);
        if (code & (1ull << i)) {
          lanes[i] = T(0);
        }
      }

      const auto m = Eq(Load(d, lanes.get()), Zero(d));

      HWY_ASSERT_MASK_EQ(d, m0, Xor(m, m));
      HWY_ASSERT_MASK_EQ(d, m0, AndNot(m, m));
      HWY_ASSERT_MASK_EQ(d, m0, AndNot(m_all, m));

      HWY_ASSERT_MASK_EQ(d, m, Or(m, m));
      HWY_ASSERT_MASK_EQ(d, m, Or(m0, m));
      HWY_ASSERT_MASK_EQ(d, m, Or(m, m0));
      HWY_ASSERT_MASK_EQ(d, m, Xor(m0, m));
      HWY_ASSERT_MASK_EQ(d, m, Xor(m, m0));
      HWY_ASSERT_MASK_EQ(d, m, And(m, m));
      HWY_ASSERT_MASK_EQ(d, m, And(m_all, m));
      HWY_ASSERT_MASK_EQ(d, m, And(m, m_all));
      HWY_ASSERT_MASK_EQ(d, m, AndNot(m0, m));
    }
  }
};

HWY_NOINLINE void TestAllLogicalMask() {
  ForAllTypes(ForPartialVectors<TestLogicalMask>());
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE

namespace hwy {
HWY_BEFORE_TEST(HwyLogicalTest);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalInteger);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalFloat);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCopySign);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllFirstN);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllIfThenElse);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllMaskVec);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCompress);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllZeroIfNegative);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllBroadcastSignBit);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllTestBit);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllAllTrueFalse);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllStoreMaskBits);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCountTrue);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalMask);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllPopulationCount);
}  // namespace hwy

// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char **argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}

#endif
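The TestPopulationCount added above checks the vector PopulationCount op lane-by-lane against scalar PopCount. For readers unfamiliar with the idiom, a portable SWAR ("sideways addition") popcount of the kind such a scalar reference typically computes; the helper name is hypothetical and not part of this commit:

#include <stdint.h>

// Sums bits in pairs, then nibbles, then uses one multiply to accumulate all
// byte counts into the top byte.
static inline uint64_t PopCount64(uint64_t x) {
  x -= (x >> 1) & 0x5555555555555555ULL;  // 2-bit partial sums
  x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
  x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0FULL;  // per-byte sums
  return (x * 0x0101010101010101ULL) >> 56;    // total in top byte
}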
@ -0,0 +1,465 @@
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>  // memcmp

#include "hwy/base.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/mask_test.cc"
#include "hwy/foreach_target.h"

#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// All types.
struct TestFromVec {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    auto lanes = AllocateAligned<T>(N);

    memset(lanes.get(), 0, N * sizeof(T));
    const auto actual_false = MaskFromVec(Load(d, lanes.get()));
    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), actual_false);

    memset(lanes.get(), 0xFF, N * sizeof(T));
    const auto actual_true = MaskFromVec(Load(d, lanes.get()));
    HWY_ASSERT_MASK_EQ(d, MaskTrue(d), actual_true);
  }
};

HWY_NOINLINE void TestAllFromVec() {
  ForAllTypes(ForPartialVectors<TestFromVec>());
}

struct TestFirstN {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);

    const RebindToSigned<D> di;
    using TI = TFromD<decltype(di)>;
    using TN = SignedFromSize<HWY_MIN(sizeof(size_t), sizeof(TI))>;
    const size_t max_len = static_cast<size_t>(LimitsMax<TN>());

    for (size_t len = 0; len <= HWY_MIN(2 * N, max_len); ++len) {
      const auto expected =
          RebindMask(d, Lt(Iota(di, 0), Set(di, static_cast<TI>(len))));
      const auto actual = FirstN(d, len);
      HWY_ASSERT_MASK_EQ(d, expected, actual);
    }

    // Also ensure huge values yield all-true.
    HWY_ASSERT_MASK_EQ(d, MaskTrue(d), FirstN(d, max_len));
  }
};

HWY_NOINLINE void TestAllFirstN() {
  ForAllTypes(ForPartialVectors<TestFirstN>());
}

struct TestIfThenElse {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    RandomState rng;

    using TI = MakeSigned<T>;  // For mask > 0 comparison
    const Rebind<TI, D> di;
    const size_t N = Lanes(d);
    auto in1 = AllocateAligned<T>(N);
    auto in2 = AllocateAligned<T>(N);
    auto bool_lanes = AllocateAligned<TI>(N);
    auto expected = AllocateAligned<T>(N);

    // Each lane should have a chance of having mask=true.
    for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
      for (size_t i = 0; i < N; ++i) {
        in1[i] = static_cast<T>(Random32(&rng));
        in2[i] = static_cast<T>(Random32(&rng));
        bool_lanes[i] = (Random32(&rng) & 16) ? TI(1) : TI(0);
      }

      const auto v1 = Load(d, in1.get());
      const auto v2 = Load(d, in2.get());
      const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));

      for (size_t i = 0; i < N; ++i) {
        expected[i] = bool_lanes[i] ? in1[i] : in2[i];
      }
      HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenElse(mask, v1, v2));

      for (size_t i = 0; i < N; ++i) {
        expected[i] = bool_lanes[i] ? in1[i] : T(0);
      }
      HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenElseZero(mask, v1));

      for (size_t i = 0; i < N; ++i) {
        expected[i] = bool_lanes[i] ? T(0) : in2[i];
      }
      HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenZeroElse(mask, v2));
    }
  }
};

HWY_NOINLINE void TestAllIfThenElse() {
  ForAllTypes(ForPartialVectors<TestIfThenElse>());
}

struct TestMaskVec {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    RandomState rng;

    using TI = MakeSigned<T>;  // For mask > 0 comparison
    const Rebind<TI, D> di;
    const size_t N = Lanes(d);
    auto bool_lanes = AllocateAligned<TI>(N);

    // Each lane should have a chance of having mask=true.
    for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
      for (size_t i = 0; i < N; ++i) {
        bool_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0);
      }

      const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
      HWY_ASSERT_MASK_EQ(d, mask, MaskFromVec(VecFromMask(d, mask)));
    }
  }
};

HWY_NOINLINE void TestAllMaskVec() {
  const ForPartialVectors<TestMaskVec> test;

  test(uint16_t());
  test(int16_t());
  // TODO(janwas): float16_t - cannot compare yet

  ForUIF3264(test);
}

struct TestMaskedLoad {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    RandomState rng;

    using TI = MakeSigned<T>;  // For mask > 0 comparison
    const Rebind<TI, D> di;
    const size_t N = Lanes(d);
    auto bool_lanes = AllocateAligned<TI>(N);

    auto lanes = AllocateAligned<T>(N);
    Store(Iota(d, T{1}), d, lanes.get());

    // Each lane should have a chance of having mask=true.
    for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
      for (size_t i = 0; i < N; ++i) {
        bool_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0);
      }

      const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
      const auto expected = IfThenElseZero(mask, Load(d, lanes.get()));
      const auto actual = MaskedLoad(mask, d, lanes.get());
      HWY_ASSERT_VEC_EQ(d, expected, actual);
    }
  }
};

HWY_NOINLINE void TestAllMaskedLoad() {
  ForAllTypes(ForPartialVectors<TestMaskedLoad>());
}

struct TestAllTrueFalse {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto zero = Zero(d);
    auto v = zero;

    const size_t N = Lanes(d);
    auto lanes = AllocateAligned<T>(N);
    std::fill(lanes.get(), lanes.get() + N, T(0));

    auto mask_lanes = AllocateAligned<T>(N);

    HWY_ASSERT(AllTrue(d, Eq(v, zero)));
    HWY_ASSERT(!AllFalse(d, Eq(v, zero)));

    // Single lane implies AllFalse = !AllTrue. Otherwise, there are multiple
    // lanes and one is nonzero.
    const bool expected_all_false = (N != 1);

    // Set each lane to nonzero and back to zero
    for (size_t i = 0; i < N; ++i) {
      lanes[i] = T(1);
      v = Load(d, lanes.get());

      // GCC 10.2.1 workaround: AllTrue(Eq(v, zero)) is true but should not be.
      // Assigning to an lvalue is insufficient but storing to memory prevents
      // the bug; so does Print of VecFromMask(d, Eq(v, zero)).
      Store(VecFromMask(d, Eq(v, zero)), d, mask_lanes.get());
      HWY_ASSERT(!AllTrue(d, MaskFromVec(Load(d, mask_lanes.get()))));

      HWY_ASSERT(expected_all_false ^ AllFalse(d, Eq(v, zero)));

      lanes[i] = T(-1);
      v = Load(d, lanes.get());
      HWY_ASSERT(!AllTrue(d, Eq(v, zero)));
      HWY_ASSERT(expected_all_false ^ AllFalse(d, Eq(v, zero)));

      // Reset to all zero
      lanes[i] = T(0);
      v = Load(d, lanes.get());
      HWY_ASSERT(AllTrue(d, Eq(v, zero)));
      HWY_ASSERT(!AllFalse(d, Eq(v, zero)));
    }
  }
};

HWY_NOINLINE void TestAllAllTrueFalse() {
  ForAllTypes(ForPartialVectors<TestAllTrueFalse>());
}

class TestStoreMaskBits {
 public:
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*t*/, D /*d*/) {
    // TODO(janwas): remove once implemented (cast or vse1)
#if HWY_TARGET != HWY_RVV
    RandomState rng;
    using TI = MakeSigned<T>;  // For mask > 0 comparison
    const Rebind<TI, D> di;
    const size_t N = Lanes(di);
    auto bool_lanes = AllocateAligned<TI>(N);

    const ScalableTag<uint8_t, -3> d_bits;
    const size_t expected_num_bytes = (N + 7) / 8;
    auto expected = AllocateAligned<uint8_t>(expected_num_bytes);
    auto actual = AllocateAligned<uint8_t>(HWY_MAX(8, expected_num_bytes));

    for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
      // Generate random mask pattern.
      for (size_t i = 0; i < N; ++i) {
        bool_lanes[i] = static_cast<TI>((rng() & 1024) ? 1 : 0);
      }
      const auto bools = Load(di, bool_lanes.get());
      const auto mask = Gt(bools, Zero(di));

      // Requires at least 8 bytes, ensured above.
      const size_t bytes_written = StoreMaskBits(di, mask, actual.get());
      if (bytes_written != expected_num_bytes) {
        fprintf(stderr, "%s expected %" PRIu64 " bytes, actual %" PRIu64 "\n",
                TypeName(T(), N).c_str(),
                static_cast<uint64_t>(expected_num_bytes),
                static_cast<uint64_t>(bytes_written));

        HWY_ASSERT(false);
      }

      // TODO(janwas): enable after implemented
|
||||
#if HWY_TARGET != HWY_RVV
|
||||
// Requires at least 8 bytes, ensured above.
|
||||
const auto mask2 = LoadMaskBits(di, actual.get());
|
||||
HWY_ASSERT_MASK_EQ(di, mask, mask2);
|
||||
#endif
|
||||
|
||||
memset(expected.get(), 0, expected_num_bytes);
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
expected[i / 8] = uint8_t(expected[i / 8] | (bool_lanes[i] << (i % 8)));
|
||||
}
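      // (Packing is LSB-first: lane i maps to bit i % 8 of byte i / 8; e.g.
      // bool_lanes = {1,0,0,1,1,0,0,0} yields 0b00011001 = 0x19.)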

      size_t i = 0;
      // Stored bits must match original mask
      for (; i < N; ++i) {
        const TI is_set = (actual[i / 8] & (1 << (i % 8))) ? 1 : 0;
        if (is_set != bool_lanes[i]) {
          fprintf(stderr, "%s lane %" PRIu64 ": expected %d, actual %d\n",
                  TypeName(T(), N).c_str(), static_cast<uint64_t>(i),
                  int(bool_lanes[i]), int(is_set));
          Print(di, "bools", bools, 0, N);
          Print(d_bits, "expected bytes", Load(d_bits, expected.get()), 0,
                expected_num_bytes);
          Print(d_bits, "actual bytes", Load(d_bits, actual.get()), 0,
                expected_num_bytes);

          HWY_ASSERT(false);
        }
      }
      // Any partial bits in the last byte must be zero
      for (; i < 8 * bytes_written; ++i) {
        const int bit = (actual[i / 8] & (1 << (i % 8)));
        if (bit != 0) {
          fprintf(stderr, "%s: bit #%" PRIu64 " should be zero\n",
                  TypeName(T(), N).c_str(), static_cast<uint64_t>(i));
          Print(di, "bools", bools, 0, N);
          Print(d_bits, "expected bytes", Load(d_bits, expected.get()), 0,
                expected_num_bytes);
          Print(d_bits, "actual bytes", Load(d_bits, actual.get()), 0,
                expected_num_bytes);

          HWY_ASSERT(false);
        }
      }
    }
#endif
  }
};

HWY_NOINLINE void TestAllStoreMaskBits() {
  ForAllTypes(ForPartialVectors<TestStoreMaskBits>());
}

struct TestCountTrue {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    using TI = MakeSigned<T>;  // For mask > 0 comparison
    const Rebind<TI, D> di;
    const size_t N = Lanes(di);
    auto bool_lanes = AllocateAligned<TI>(N);
    memset(bool_lanes.get(), 0, N * sizeof(TI));

    // For all combinations of zero/nonzero state of subset of lanes:
    const size_t max_lanes = HWY_MIN(N, size_t(10));

    for (size_t code = 0; code < (1ull << max_lanes); ++code) {
      // Number of zeros written = number of mask lanes that are true.
      size_t expected = 0;
      for (size_t i = 0; i < max_lanes; ++i) {
        const bool is_true = (code & (1ull << i)) != 0;
        bool_lanes[i] = is_true ? TI(1) : TI(0);
        expected += is_true;
      }

      const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
      const size_t actual = CountTrue(d, mask);
      HWY_ASSERT_EQ(expected, actual);
    }
  }
};

HWY_NOINLINE void TestAllCountTrue() {
  ForAllTypes(ForPartialVectors<TestCountTrue>());
}

struct TestFindFirstTrue {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    using TI = MakeSigned<T>;  // For mask > 0 comparison
    const Rebind<TI, D> di;
    const size_t N = Lanes(di);
    auto bool_lanes = AllocateAligned<TI>(N);
    memset(bool_lanes.get(), 0, N * sizeof(TI));

    // For all combinations of zero/nonzero state of subset of lanes:
    const size_t max_lanes = HWY_MIN(N, size_t(10));

    HWY_ASSERT_EQ(intptr_t(-1), FindFirstTrue(d, MaskFalse(d)));
    HWY_ASSERT_EQ(intptr_t(0), FindFirstTrue(d, MaskTrue(d)));

    for (size_t code = 1; code < (1ull << max_lanes); ++code) {
      for (size_t i = 0; i < max_lanes; ++i) {
        bool_lanes[i] = (code & (1ull << i)) ? TI(1) : TI(0);
      }

      const intptr_t expected =
          static_cast<intptr_t>(Num0BitsBelowLS1Bit_Nonzero32(uint32_t(code)));
      const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
      const intptr_t actual = FindFirstTrue(d, mask);
      HWY_ASSERT_EQ(expected, actual);
    }
  }
};

HWY_NOINLINE void TestAllFindFirstTrue() {
  ForAllTypes(ForPartialVectors<TestFindFirstTrue>());
}

struct TestLogicalMask {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto m0 = MaskFalse(d);
    const auto m_all = MaskTrue(d);

    using TI = MakeSigned<T>;  // For mask > 0 comparison
    const Rebind<TI, D> di;
    const size_t N = Lanes(di);
    auto bool_lanes = AllocateAligned<TI>(N);
    memset(bool_lanes.get(), 0, N * sizeof(TI));

    HWY_ASSERT_MASK_EQ(d, m0, Not(m_all));
    HWY_ASSERT_MASK_EQ(d, m_all, Not(m0));

    // For all combinations of zero/nonzero state of subset of lanes:
    const size_t max_lanes = HWY_MIN(N, size_t(6));
    for (size_t code = 0; code < (1ull << max_lanes); ++code) {
      for (size_t i = 0; i < max_lanes; ++i) {
        bool_lanes[i] = (code & (1ull << i)) ? TI(1) : TI(0);
      }

      const auto m = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));

      HWY_ASSERT_MASK_EQ(d, m0, Xor(m, m));
      HWY_ASSERT_MASK_EQ(d, m0, AndNot(m, m));
      HWY_ASSERT_MASK_EQ(d, m0, AndNot(m_all, m));

      HWY_ASSERT_MASK_EQ(d, m, Or(m, m));
      HWY_ASSERT_MASK_EQ(d, m, Or(m0, m));
      HWY_ASSERT_MASK_EQ(d, m, Or(m, m0));
      HWY_ASSERT_MASK_EQ(d, m, Xor(m0, m));
      HWY_ASSERT_MASK_EQ(d, m, Xor(m, m0));
      HWY_ASSERT_MASK_EQ(d, m, And(m, m));
      HWY_ASSERT_MASK_EQ(d, m, And(m_all, m));
      HWY_ASSERT_MASK_EQ(d, m, And(m, m_all));
      HWY_ASSERT_MASK_EQ(d, m, AndNot(m0, m));
    }
  }
};

HWY_NOINLINE void TestAllLogicalMask() {
  ForAllTypes(ForPartialVectors<TestLogicalMask>());
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE

namespace hwy {
HWY_BEFORE_TEST(HwyMaskTest);
HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFromVec);
HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFirstN);
HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllIfThenElse);
HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllMaskVec);
HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllMaskedLoad);
HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllAllTrueFalse);
HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllStoreMaskBits);
HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllCountTrue);
HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFindFirstTrue);
HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllLogicalMask);
}  // namespace hwy

// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char **argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}

#endif

@@ -85,6 +85,8 @@ HWY_NOINLINE void TestAllLoadStore() {
struct TestStoreInterleaved3 {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    // TODO(janwas): restore once segment intrinsics are available
#if HWY_TARGET != HWY_RVV
    const size_t N = Lanes(d);

    RandomState rng;

@@ -124,6 +126,9 @@ struct TestStoreInterleaved3 {
      HWY_ASSERT(false);
    }
  }
#else
    (void)d;
#endif
  }
};

@@ -140,6 +145,8 @@ HWY_NOINLINE void TestAllStoreInterleaved3() {
struct TestStoreInterleaved4 {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    // TODO(janwas): restore once segment intrinsics are available
#if HWY_TARGET != HWY_RVV
    const size_t N = Lanes(d);

    RandomState rng;

@@ -182,6 +189,9 @@ struct TestStoreInterleaved4 {
      HWY_ASSERT(false);
    }
  }
#else
    (void)d;
#endif
  }
};

@@ -287,8 +297,8 @@ struct TestScatter {
    std::fill(expected.get(), expected.get() + range, T(0));
    std::fill(actual.get(), actual.get() + range, T(0));
    for (size_t i = 0; i < N; ++i) {
      offsets[i] =
          static_cast<Offset>(Random32(&rng) % (max_bytes - sizeof(T)));
      // Must be aligned
      offsets[i] = static_cast<Offset>((Random32(&rng) % range) * sizeof(T));
      CopyBytes<sizeof(T)>(
          bytes.get() + i * sizeof(T),
          reinterpret_cast<uint8_t*>(expected.get()) + offsets[i]);

@@ -307,7 +317,7 @@ struct TestScatter {
    for (size_t i = 0; i < N; ++i) {
      offsets[i] = static_cast<Offset>(Random32(&rng) % range);
      CopyBytes<sizeof(T)>(bytes.get() + i * sizeof(T),
                           &expected[offsets[i]]);
                           &expected[size_t(offsets[i])]);
    }
    const auto vindices = Load(d_offsets, offsets.get());
    ScatterIndex(data, d, actual.get(), vindices);

@@ -321,17 +331,7 @@ struct TestScatter {
};

HWY_NOINLINE void TestAllScatter() {
  // No u8,u16,i8,i16.
  const ForPartialVectors<TestScatter> test;
  test(uint32_t());
  test(int32_t());

#if HWY_CAP_INTEGER64
  test(uint64_t());
  test(int64_t());
#endif

  ForFloatTypes(test);
  ForUIF3264(ForPartialVectors<TestScatter>());
}

struct TestGather {

@@ -340,11 +340,12 @@ struct TestGather {
    using Offset = MakeSigned<T>;

    const size_t N = Lanes(d);
    const size_t range = 4 * N;                  // number of items to gather
    const size_t max_bytes = range * sizeof(T);  // upper bound on offset

    RandomState rng;

    // Data to be gathered from
    const size_t max_bytes = 4 * N * sizeof(T);  // upper bound on offset
    auto bytes = AllocateAligned<uint8_t>(max_bytes);
    for (size_t i = 0; i < max_bytes; ++i) {
      bytes[i] = static_cast<uint8_t>(Random32(&rng) & 0xFF);

@@ -357,8 +358,8 @@ struct TestGather {
    for (size_t rep = 0; rep < 100; ++rep) {
      // Offsets
      for (size_t i = 0; i < N; ++i) {
        offsets[i] =
            static_cast<Offset>(Random32(&rng) % (max_bytes - sizeof(T)));
        // Must be aligned
        offsets[i] = static_cast<Offset>((Random32(&rng) % range) * sizeof(T));
        CopyBytes<sizeof(T)>(bytes.get() + offsets[i], &expected[i]);
      }

@@ -380,16 +381,7 @@ struct TestGather {
};

HWY_NOINLINE void TestAllGather() {
  // No u8,u16,i8,i16.
  const ForPartialVectors<TestGather> test;
  test(uint32_t());
  test(int32_t());

#if HWY_CAP_INTEGER64
  test(uint64_t());
  test(int64_t());
#endif
  ForFloatTypes(test);
  ForUIF3264(ForPartialVectors<TestGather>());
}

HWY_NOINLINE void TestAllCache() {

@@ -407,6 +399,7 @@ HWY_NOINLINE void TestAllCache() {
HWY_AFTER_NAMESPACE();

#if HWY_ONCE

namespace hwy {
HWY_BEFORE_TEST(HwyMemoryTest);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadStore);

@@ -418,4 +411,11 @@ HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllScatter);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllGather);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllCache);
}  // namespace hwy

// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}

#endif

Diff not shown because of its large size.

@@ -12,254 +12,18 @@
// See the License for the specific language governing permissions and
// limitations under the License.

// Normal include guard for non-SIMD portion of this header.
#ifndef HWY_TESTS_TEST_UTIL_H_
#define HWY_TESTS_TEST_UTIL_H_
// Target-specific helper functions for use by *_test.cc.

// Helper functions for use by *_test.cc.

#include <stddef.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#include <cstddef>
#include <string>
#include <utility>  // std::forward

#include "hwy/aligned_allocator.h"
#include "hwy/base.h"
#include "hwy/highway.h"

#include "gtest/gtest.h"

namespace hwy {

// The maximum vector size used in tests when defining test data. DEPRECATED.
constexpr size_t kTestMaxVectorSize = 64;

// googletest before 1.10 didn't define INSTANTIATE_TEST_SUITE_P() but instead
// used INSTANTIATE_TEST_CASE_P which is now deprecated.
#ifdef INSTANTIATE_TEST_SUITE_P
#define HWY_GTEST_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_SUITE_P
#else
#define HWY_GTEST_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_CASE_P
#endif

// Helper class to run parametric tests using the hwy target as parameter. To
// use this define the following in your test:
// class MyTestSuite : public TestWithParamTarget {
//  ...
// };
// HWY_TARGET_INSTANTIATE_TEST_SUITE_P(MyTestSuite);
// TEST_P(MyTestSuite, MyTest) { ... }
class TestWithParamTarget : public testing::TestWithParam<uint32_t> {
 protected:
  void SetUp() override { SetSupportedTargetsForTest(GetParam()); }

  void TearDown() override {
    // Check that the parametric test calls SupportedTargets() when the source
    // was compiled with more than one target. In the single-target case only
    // static dispatch will be used anyway.
#if (HWY_TARGETS & (HWY_TARGETS - 1)) != 0
    EXPECT_TRUE(SupportedTargetsCalledForTest())
        << "This hwy target parametric test doesn't use dynamic-dispatch and "
           "doesn't need to be parametric.";
#endif
    SetSupportedTargetsForTest(0);
  }
};

// Function to convert the test parameter of a TestWithParamTarget for
// displaying it in the gtest test name.
static inline std::string TestParamTargetName(
    const testing::TestParamInfo<uint32_t>& info) {
  return TargetName(info.param);
}

#define HWY_TARGET_INSTANTIATE_TEST_SUITE_P(suite)              \
  HWY_GTEST_INSTANTIATE_TEST_SUITE_P(                           \
      suite##Group, suite,                                      \
      testing::ValuesIn(::hwy::SupportedAndGeneratedTargets()), \
      ::hwy::TestParamTargetName)

// Helper class similar to TestWithParamTarget to run parametric tests that
// depend on the target and another parametric test. If you need to use multiple
// extra parameters use a std::tuple<> of them and ::testing::Generate(...) as
// the generator. To use this class define the following in your test:
// class MyTestSuite : public TestWithParamTargetT<int> {
//  ...
// };
// HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T(MyTestSuite, ::testing::Range(0, 9));
// TEST_P(MyTestSuite, MyTest) { ... GetParam() .... }
template <typename T>
class TestWithParamTargetAndT
    : public ::testing::TestWithParam<std::tuple<uint32_t, T>> {
 public:
  // Expose the parametric type here so it can be used by the
  // HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T macro.
  using HwyParamType = T;

 protected:
  void SetUp() override {
    SetSupportedTargetsForTest(std::get<0>(
        ::testing::TestWithParam<std::tuple<uint32_t, T>>::GetParam()));
  }

  void TearDown() override {
    // Check that the parametric test calls SupportedTargets() when the source
    // was compiled with more than one target. In the single-target case only
    // static dispatch will be used anyway.
#if (HWY_TARGETS & (HWY_TARGETS - 1)) != 0
    EXPECT_TRUE(SupportedTargetsCalledForTest())
        << "This hwy target parametric test doesn't use dynamic-dispatch and "
           "doesn't need to be parametric.";
#endif
    SetSupportedTargetsForTest(0);
  }

  T GetParam() {
    return std::get<1>(
        ::testing::TestWithParam<std::tuple<uint32_t, T>>::GetParam());
  }
};

template <typename T>
std::string TestParamTargetNameAndT(
    const testing::TestParamInfo<std::tuple<uint32_t, T>>& info) {
  return std::string(TargetName(std::get<0>(info.param))) + "_" +
         ::testing::PrintToString(std::get<1>(info.param));
}

#define HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T(suite, generator)     \
  HWY_GTEST_INSTANTIATE_TEST_SUITE_P(                               \
      suite##Group, suite,                                          \
      ::testing::Combine(                                           \
          testing::ValuesIn(::hwy::SupportedAndGeneratedTargets()), \
          generator),                                               \
      ::hwy::TestParamTargetNameAndT<suite::HwyParamType>)

// Helper macro to export a function and define a test that tests it. This is
// equivalent to do a HWY_EXPORT of a void(void) function and run it in a test:
// class MyTestSuite : public TestWithParamTarget {
//  ...
// };
// HWY_TARGET_INSTANTIATE_TEST_SUITE_P(MyTestSuite);
// HWY_EXPORT_AND_TEST_P(MyTestSuite, MyTest);
#define HWY_EXPORT_AND_TEST_P(suite, func_name)                   \
  HWY_EXPORT(func_name);                                          \
  TEST_P(suite, func_name) { HWY_DYNAMIC_DISPATCH(func_name)(); } \
  static_assert(true, "For requiring trailing semicolon")

#define HWY_EXPORT_AND_TEST_P_T(suite, func_name)                           \
  HWY_EXPORT(func_name);                                                    \
  TEST_P(suite, func_name) { HWY_DYNAMIC_DISPATCH(func_name)(GetParam()); } \
  static_assert(true, "For requiring trailing semicolon")

#define HWY_BEFORE_TEST(suite)                      \
  class suite : public hwy::TestWithParamTarget {}; \
  HWY_TARGET_INSTANTIATE_TEST_SUITE_P(suite);       \
  static_assert(true, "For requiring trailing semicolon")
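
// Illustrative sketch (not part of this header) of how the macros above
// combine in a *_test.cc; "MySuite" and "MyFunc" are placeholder names:
//   HWY_BEFORE_TEST(MySuite);                // declare + instantiate suite
//   HWY_EXPORT_AND_TEST_P(MySuite, MyFunc);  // run MyFunc once per target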

// 64-bit random generator (Xorshift128+). Much smaller state than std::mt19937,
// which triggers a compiler bug.
class RandomState {
 public:
  explicit RandomState(const uint64_t seed = 0x123456789ull) {
    s0_ = SplitMix64(seed + 0x9E3779B97F4A7C15ull);
    s1_ = SplitMix64(s0_);
  }

  HWY_INLINE uint64_t operator()() {
    uint64_t s1 = s0_;
    const uint64_t s0 = s1_;
    const uint64_t bits = s1 + s0;
    s0_ = s0;
    s1 ^= s1 << 23;
    s1 ^= s0 ^ (s1 >> 18) ^ (s0 >> 5);
    s1_ = s1;
    return bits;
  }

 private:
  static uint64_t SplitMix64(uint64_t z) {
    z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull;
    z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
    return z ^ (z >> 31);
  }

  uint64_t s0_;
  uint64_t s1_;
};

static HWY_INLINE uint32_t Random32(RandomState* rng) {
  return static_cast<uint32_t>((*rng)());
}

// Prevents the compiler from eliding the computations that led to "output".
// Works by indicating to the compiler that "output" is being read and modified.
// The +r constraint avoids unnecessary writes to memory, but only works for
// built-in types.
template <class T>
inline void PreventElision(T&& output) {
#if HWY_COMPILER_MSVC
  (void)output;
#else   // HWY_COMPILER_MSVC
  asm volatile("" : "+r"(output) : : "memory");
#endif  // HWY_COMPILER_MSVC
}

// Returns a name for the vector/part/scalar. The type prefix is u/i/f for
// unsigned/signed/floating point, followed by the number of bits per lane;
// then 'x' followed by the number of lanes. Example: u8x16. This is useful for
// understanding which instantiation of a generic test failed.
template <typename T>
static inline std::string TypeName(T /*unused*/, size_t N) {
  const char prefix = IsFloat<T>() ? 'f' : (IsSigned<T>() ? 'i' : 'u');
  char name[64];
  // Omit the xN suffix for scalars.
  if (N == 1) {
    snprintf(name, sizeof(name), "%c%zu", prefix, sizeof(T) * 8);
  } else {
    snprintf(name, sizeof(name), "%c%zux%zu", prefix, sizeof(T) * 8, N);
  }
  return name;
}

// String comparison

template <typename T1, typename T2>
inline bool BytesEqual(const T1* p1, const T2* p2, const size_t size,
                       size_t* pos = nullptr) {
  const uint8_t* bytes1 = reinterpret_cast<const uint8_t*>(p1);
  const uint8_t* bytes2 = reinterpret_cast<const uint8_t*>(p2);
  for (size_t i = 0; i < size; ++i) {
    if (bytes1[i] != bytes2[i]) {
      fprintf(stderr, "Mismatch at byte %zu of %zu: %d != %d (%s, %s)\n", i,
              size, bytes1[i], bytes2[i], TypeName(T1(), 1).c_str(),
              TypeName(T2(), 1).c_str());
      if (pos != nullptr) {
        *pos = i;
      }
      return false;
    }
  }
  return true;
}

inline bool StringsEqual(const char* s1, const char* s2) {
  while (*s1 == *s2++) {
    if (*s1++ == '\0') return true;
  }
  return false;
}

}  // namespace hwy

#endif  // HWY_TESTS_TEST_UTIL_H_
#include "hwy/tests/hwy_gtest.h"
#include "hwy/tests/test_util.h"

// Per-target include guard
#if defined(HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_) == defined(HWY_TARGET_TOGGLE)
#if defined(HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_) == \
    defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_
#undef HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_
#else

@@ -270,164 +34,139 @@ HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

template <typename T, HWY_IF_LANE_SIZE(T, 1)>
HWY_NOINLINE void PrintValue(T value) {
  uint8_t byte;
  CopyBytes<1>(&value, &byte);  // endian-safe: we ensured sizeof(T)=1.
  fprintf(stderr, "0x%02X,", byte);
}

#if HWY_CAP_FLOAT16
HWY_NOINLINE void PrintValue(float16_t value) {
  uint16_t bits;
  CopyBytes<2>(&value, &bits);
  fprintf(stderr, "0x%02X,", bits);
}
#endif


template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
HWY_NOINLINE void PrintValue(T value) {
  fprintf(stderr, "%g,", double(value));
}

// Prints lanes around `lane`, in memory order.
template <class D>
HWY_NOINLINE void Print(const D d, const char* caption, const Vec<D> v,
                        intptr_t lane = 0) {
template <class D, class V = Vec<D>>
void Print(const D d, const char* caption, VecArg<V> v, size_t lane_u = 0,
           size_t max_lanes = 7) {
  using T = TFromD<D>;
  const size_t N = Lanes(d);
  auto lanes = AllocateAligned<T>(N);
  Store(v, d, lanes.get());
  const size_t begin = static_cast<size_t>(std::max<intptr_t>(0, lane - 2));
  const size_t end = std::min(begin + 7, N);
  fprintf(stderr, "%s %s [%zu+ ->]:\n  ", TypeName(T(), N).c_str(), caption,
          begin);
  for (size_t i = begin; i < end; ++i) {
    fprintf(stderr, "%g,", double(lanes[i]));
  }
  if (begin >= end) fprintf(stderr, "(out of bounds)");
  fprintf(stderr, "\n");
}

static HWY_NORETURN HWY_NOINLINE void NotifyFailure(
    const char* filename, const int line, const char* type_name,
    const size_t lane, const char* expected, const char* actual) {
  hwy::Abort(filename, line,
             "%s, %s lane %zu mismatch: expected '%s', got '%s'.\n",
             hwy::TargetName(HWY_TARGET), type_name, lane, expected, actual);
}

template <class Out, class In>
inline Out BitCast(const In& in) {
  static_assert(sizeof(Out) == sizeof(In), "");
  Out out;
  CopyBytes<sizeof(out)>(&in, &out);
  return out;
}

// Computes the difference in units of last place between x and y.
template <typename TF>
MakeUnsigned<TF> ComputeUlpDelta(TF x, TF y) {
  static_assert(IsFloat<TF>(), "Only makes sense for floating-point");
  using TU = MakeUnsigned<TF>;

  // Handle -0 == 0 and infinities.
  if (x == y) return 0;

  // Consider "equal" if both are NaN, so we can verify an expected NaN.
  // Needs a special case because there are many possible NaN representations.
  if (std::isnan(x) && std::isnan(y)) return 0;

  // NOTE: no need to check for differing signs; they will result in large
  // differences, which is fine, and we avoid overflow.

  const TU ux = BitCast<TU>(x);
  const TU uy = BitCast<TU>(y);
  // Avoid unsigned->signed cast: 2's complement is only guaranteed by C++20.
  return std::max(ux, uy) - std::min(ux, uy);
}

template <typename T, HWY_IF_NOT_FLOAT(T)>
HWY_NOINLINE bool IsEqual(const T expected, const T actual) {
  return expected == actual;
}

template <typename T, HWY_IF_FLOAT(T)>
HWY_NOINLINE bool IsEqual(const T expected, const T actual) {
  return ComputeUlpDelta(expected, actual) <= 1;
}

// Compare non-vector, non-string T.
template <typename T>
HWY_NOINLINE void AssertEqual(const T expected, const T actual,
                              const std::string& type_name,
                              const char* filename = "", const int line = -1,
                              const size_t lane = 0) {
  if (!IsEqual(expected, actual)) {
    char expected_str[100];
    snprintf(expected_str, sizeof(expected_str), "%g", double(expected));
    char actual_str[100];
    snprintf(actual_str, sizeof(actual_str), "%g", double(actual));
    NotifyFailure(filename, line, type_name.c_str(), lane, expected_str,
                  actual_str);
  }
}

static HWY_NOINLINE HWY_MAYBE_UNUSED void AssertStringEqual(
    const char* expected, const char* actual, const char* filename = "",
    const int line = -1, const size_t lane = 0) {
  if (!hwy::StringsEqual(expected, actual)) {
    NotifyFailure(filename, line, "string", lane, expected, actual);
  }
  const auto info = hwy::detail::MakeTypeInfo<T>();
  hwy::detail::PrintArray(info, caption, lanes.get(), N, lane_u, max_lanes);
}

// Compare expected vector to vector.
template <class D, class V>
HWY_NOINLINE void AssertVecEqual(D d, const V expected, const V actual,
                                 const char* filename, const int line) {
  using T = TFromD<D>;
template <class D, typename T = TFromD<D>, class V = Vec<D>>
void AssertVecEqual(D d, const T* expected, VecArg<V> actual,
                    const char* filename, const int line) {
  const size_t N = Lanes(d);
  auto expected_lanes = AllocateAligned<T>(N);
  auto actual_lanes = AllocateAligned<T>(N);
  Store(expected, d, expected_lanes.get());
  Store(actual, d, actual_lanes.get());
  for (size_t i = 0; i < N; ++i) {
    if (!IsEqual(expected_lanes[i], actual_lanes[i])) {
      fprintf(stderr, "\n\n");
      Print(d, "expect", expected, i);
      Print(d, "actual", actual, i);

      char expected_str[100];
      snprintf(expected_str, sizeof(expected_str), "%g",
               double(expected_lanes[i]));
      char actual_str[100];
      snprintf(actual_str, sizeof(actual_str), "%g", double(actual_lanes[i]));

      NotifyFailure(filename, line, hwy::TypeName(T(), N).c_str(), i,
                    expected_str, actual_str);
    }
  }
  const auto info = hwy::detail::MakeTypeInfo<T>();
  const char* target_name = hwy::TargetName(HWY_TARGET);
  hwy::detail::AssertArrayEqual(info, expected, actual_lanes.get(), N,
                                target_name, filename, line);
}

// Compare expected lanes to vector.
template <class D>
HWY_NOINLINE void AssertVecEqual(D d, const TFromD<D>* expected, Vec<D> actual,
template <class D, typename T = TFromD<D>, class V = Vec<D>>
HWY_NOINLINE void AssertVecEqual(D d, VecArg<V> expected, VecArg<V> actual,
                                 const char* filename, int line) {
  AssertVecEqual(d, LoadU(d, expected), actual, filename, line);
  auto expected_lanes = AllocateAligned<T>(Lanes(d));
  Store(expected, d, expected_lanes.get());
  AssertVecEqual(d, expected_lanes.get(), actual, filename, line);
}

// Only checks the valid mask elements (those whose index < Lanes(d)).
template <class D>
HWY_NOINLINE void AssertMaskEqual(D d, Mask<D> a, Mask<D> b,
HWY_NOINLINE void AssertMaskEqual(D d, VecArg<Mask<D>> a, VecArg<Mask<D>> b,
                                  const char* filename, int line) {
  AssertVecEqual(d, VecFromMask(d, a), VecFromMask(d, b), filename, line);

  const std::string type_name = TypeName(TFromD<D>(), Lanes(d));
  AssertEqual(CountTrue(a), CountTrue(b), type_name, filename, line, 0);
  AssertEqual(AllTrue(a), AllTrue(b), type_name, filename, line, 0);
  AssertEqual(AllFalse(a), AllFalse(b), type_name, filename, line, 0);
  const char* target_name = hwy::TargetName(HWY_TARGET);
  AssertEqual(CountTrue(d, a), CountTrue(d, b), target_name, filename, line);
  AssertEqual(AllTrue(d, a), AllTrue(d, b), target_name, filename, line);
  AssertEqual(AllFalse(d, a), AllFalse(d, b), target_name, filename, line);

  // TODO(janwas): StoreMaskBits
  // TODO(janwas): remove RVV once implemented (cast or vse1)
#if HWY_TARGET != HWY_RVV && HWY_TARGET != HWY_SCALAR
  const size_t N = Lanes(d);
  const Repartition<uint8_t, D> d8;
  const size_t N8 = Lanes(d8);
  auto bits_a = AllocateAligned<uint8_t>(HWY_MAX(8, N8));
  auto bits_b = AllocateAligned<uint8_t>(HWY_MAX(8, N8));
  memset(bits_a.get(), 0, N8);
  memset(bits_b.get(), 0, N8);
  const size_t num_bytes_a = StoreMaskBits(d, a, bits_a.get());
  const size_t num_bytes_b = StoreMaskBits(d, b, bits_b.get());
  AssertEqual(num_bytes_a, num_bytes_b, target_name, filename, line);
  size_t i = 0;
  // First check whole bytes (if that many elements are still valid)
  for (; i < N / 8; ++i) {
    if (bits_a[i] != bits_b[i]) {
      fprintf(stderr, "Mismatch in byte %" PRIu64 ": %d != %d\n",
              static_cast<uint64_t>(i), bits_a[i], bits_b[i]);
      Print(d8, "expect", Load(d8, bits_a.get()), 0, N8);
      Print(d8, "actual", Load(d8, bits_b.get()), 0, N8);
      hwy::Abort(filename, line, "Masks not equal");
    }
  }
  // Then the valid bit(s) in the last byte.
  const size_t remainder = N % 8;
  if (remainder != 0) {
    const int mask = (1 << remainder) - 1;
    const int valid_a = bits_a[i] & mask;
    const int valid_b = bits_b[i] & mask;
    if (valid_a != valid_b) {
      fprintf(stderr, "Mismatch in last byte %" PRIu64 ": %d != %d\n",
              static_cast<uint64_t>(i), valid_a, valid_b);
      Print(d8, "expect", Load(d8, bits_a.get()), 0, N8);
      Print(d8, "actual", Load(d8, bits_b.get()), 0, N8);
      hwy::Abort(filename, line, "Masks not equal");
    }
  }
#endif
}

// Only sets valid elements (those whose index < Lanes(d)). This helps catch
// tests that are not masking off the (undefined) upper mask elements.
//
// TODO(janwas): with HWY_NOINLINE GCC zeros the upper half of AVX2 masks.
template <class D>
HWY_INLINE Mask<D> MaskTrue(const D d) {
  return FirstN(d, Lanes(d));
}

template <class D>
HWY_NOINLINE Mask<D> MaskTrue(const D d) {
  const auto v0 = Zero(d);
  return Eq(v0, v0);
}

template <class D>
HWY_NOINLINE Mask<D> MaskFalse(const D d) {
  // Lt is only for signed types and we cannot yet cast mask types.
  return Eq(Zero(d), Set(d, 1));
HWY_INLINE Mask<D> MaskFalse(const D d) {
  const auto zero = Zero(RebindToSigned<D>());
  return RebindMask(d, Lt(zero, zero));
}

#ifndef HWY_ASSERT_EQ

#define HWY_ASSERT_EQ(expected, actual) \
  AssertEqual(expected, actual, hwy::TypeName(expected, 1), __FILE__, __LINE__)
#define HWY_ASSERT_EQ(expected, actual)                                     \
  hwy::AssertEqual(expected, actual, hwy::TargetName(HWY_TARGET), __FILE__, \
                   __LINE__)

#define HWY_ASSERT_STRING_EQ(expected, actual) \
  AssertStringEqual(expected, actual, __FILE__, __LINE__)
#define HWY_ASSERT_STRING_EQ(expected, actual)                          \
  hwy::AssertStringEqual(expected, actual, hwy::TargetName(HWY_TARGET), \
                         __FILE__, __LINE__)

#define HWY_ASSERT_VEC_EQ(d, expected, actual) \
  AssertVecEqual(d, expected, actual, __FILE__, __LINE__)

@@ -439,83 +178,182 @@ HWY_NOINLINE Mask<D> MaskFalse(const D d) {

// Helpers for instantiating tests with combinations of lane types / counts.

// For all powers of two in [kMinLanes, N * kMinLanes] (so that recursion stops
// at N == 0)
template <typename T, size_t N, size_t kMinLanes, class Test>
// For ensuring we do not call tests with D such that widening D results in 0
// lanes. Example: assume T=u32, VLEN=256, and fraction=1/8: there is no 1/8th
// of a u64 vector in this case.
template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)>
HWY_INLINE size_t PromotedLanes(const D d) {
  return Lanes(RepartitionToWide<decltype(d)>());
}
// Already the widest possible T, cannot widen.
template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
HWY_INLINE size_t PromotedLanes(const D d) {
  return Lanes(d);
}

// For all power of two N in [kMinLanes, kMul * kMinLanes] (so that recursion
// stops at kMul == 0). Note that N may be capped or a fraction.
template <typename T, size_t kMul, size_t kMinLanes, class Test,
          bool kPromote = false>
struct ForeachSizeR {
  static void Do() {
    static_assert(N != 0, "End of recursion");
    Test()(T(), Simd<T, N * kMinLanes>());
    ForeachSizeR<T, N / 2, kMinLanes, Test>::Do();
    const Simd<T, kMul * kMinLanes> d;

    // Skip invalid fractions (e.g. 1/8th of u32x4).
    const size_t lanes = kPromote ? PromotedLanes(d) : Lanes(d);
    if (lanes < kMinLanes) return;

    Test()(T(), d);

    static_assert(kMul != 0, "Recursion should have ended already");
    ForeachSizeR<T, kMul / 2, kMinLanes, Test, kPromote>::Do();
  }
};

// Base case to stop the recursion.
template <typename T, size_t kMinLanes, class Test>
struct ForeachSizeR<T, 0, kMinLanes, Test> {
template <typename T, size_t kMinLanes, class Test, bool kPromote>
struct ForeachSizeR<T, 0, kMinLanes, Test, kPromote> {
  static void Do() {}
};
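
// For example (new signature; assuming kMul is a power of two):
// ForeachSizeR<float, 4, 1, Test> runs Test for kMul = 4, 2, 1 (i.e. N = 4,
// 2, 1 lanes, where valid) and then stops at the kMul == 0 base case above.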

// These adapters may be called directly, or via For*Types:

// Calls Test for all powers of two in [kMinLanes, HWY_LANES(T) / kDivLanes].
template <class Test, size_t kDivLanes = 1, size_t kMinLanes = 1>
struct ForPartialVectors {
  template <typename T>
  void operator()(T /*unused*/) const {
#if HWY_TARGET == HWY_RVV
    // Only m1..8 for now, can ignore kMaxLanes because HWY_*_LANES are full.
    ForeachSizeR<T, 8 / kDivLanes, HWY_LANES(T), Test>::Do();
#else
    ForeachSizeR<T, HWY_LANES(T) / kDivLanes / kMinLanes, kMinLanes,
                 Test>::Do();
#endif
  }
};

// Calls Test for all vectors that can be demoted log2(kFactor) times.
template <class Test, size_t kFactor>
struct ForDemoteVectors {
  template <typename T>
  void operator()(T /*unused*/) const {
#if HWY_TARGET == HWY_RVV
    // Only m1..8 for now.
    ForeachSizeR<T, 8 / kFactor, kFactor * HWY_LANES(T), Test>::Do();
#else
    ForeachSizeR<T, HWY_LANES(T), 1, Test>::Do();
#endif
  }
};

// Calls Test for all powers of two in [128 bits, max bits].
template <class Test>
struct ForGE128Vectors {
  template <typename T>
  void operator()(T /*unused*/) const {
#if HWY_TARGET == HWY_RVV
    ForeachSizeR<T, 8, HWY_LANES(T), Test>::Do();
#else
    ForeachSizeR<T, HWY_LANES(T) / (16 / sizeof(T)), (16 / sizeof(T)),
                 Test>::Do();

#endif
  }
};

// Calls Test for all vectors that can be expanded by kFactor.
// Calls Test for all power of two N in [1, Lanes(d) / kFactor]. This is for
// ops that widen their input, e.g. Combine (not supported by HWY_SCALAR).
template <class Test, size_t kFactor = 2>
struct ForExtendableVectors {
  template <typename T>
  void operator()(T /*unused*/) const {
#if HWY_TARGET == HWY_RVV
    ForeachSizeR<T, 8 / kFactor, HWY_LANES(T), Test>::Do();
#if HWY_TARGET == HWY_SCALAR
    // not supported
#else
    ForeachSizeR<T, HWY_LANES(T) / kFactor / (16 / sizeof(T)), (16 / sizeof(T)),
    constexpr bool kPromote = true;
#if HWY_TARGET == HWY_RVV
    ForeachSizeR<T, 8 / kFactor, HWY_LANES(T), Test, kPromote>::Do();
    // TODO(janwas): also capped
    // ForeachSizeR<T, (16 / sizeof(T)) / kFactor, 1, Test, kPromote>::Do();
#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2
    // Capped
    ForeachSizeR<T, (16 / sizeof(T)) / kFactor, 1, Test, kPromote>::Do();
    // Fractions
    ForeachSizeR<T, 8 / kFactor, HWY_LANES(T) / 8, Test, kPromote>::Do();
#else
    ForeachSizeR<T, HWY_LANES(T) / kFactor, 1, Test, kPromote>::Do();
#endif
#endif  // HWY_SCALAR
  }
};

// Calls Test for all power of two N in [kFactor, Lanes(d)]. This is for ops
// that narrow their input, e.g. UpperHalf.
template <class Test, size_t kFactor = 2>
struct ForShrinkableVectors {
  template <typename T>
  void operator()(T /*unused*/) const {
#if HWY_TARGET == HWY_SCALAR
    // not supported
#elif HWY_TARGET == HWY_RVV
    ForeachSizeR<T, 8 / kFactor, kFactor * HWY_LANES(T), Test>::Do();
    // TODO(janwas): also capped
#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2
    // Capped
    ForeachSizeR<T, (16 / sizeof(T)) / kFactor, kFactor, Test>::Do();
    // Fractions
    ForeachSizeR<T, 8 / kFactor, kFactor * HWY_LANES(T) / 8, Test>::Do();
#elif HWY_TARGET == HWY_SCALAR
    // not supported
#else
    ForeachSizeR<T, HWY_LANES(T) / kFactor, kFactor, Test>::Do();
#endif
  }
};

// Calls Test for all power of two N in [16 / sizeof(T), Lanes(d)]. This is for
// ops that require at least 128 bits, e.g. AES or 64x64 = 128 mul.
template <class Test>
struct ForGE128Vectors {
  template <typename T>
  void operator()(T /*unused*/) const {
#if HWY_TARGET == HWY_SCALAR
    // not supported
#elif HWY_TARGET == HWY_RVV
    ForeachSizeR<T, 8, HWY_LANES(T), Test>::Do();
    // TODO(janwas): also capped
    // ForeachSizeR<T, 1, (16 / sizeof(T)), Test>::Do();
#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2
    // Capped
    ForeachSizeR<T, 1, 16 / sizeof(T), Test>::Do();
    // Fractions
    ForeachSizeR<T, 8, HWY_LANES(T) / 8, Test>::Do();
#else
    ForeachSizeR<T, HWY_LANES(T) / (16 / sizeof(T)), (16 / sizeof(T)),
                 Test>::Do();
#endif
  }
};

// Calls Test for all power of two N in [8 / sizeof(T), Lanes(d)]. This is for
// ops that require at least 64 bits, e.g. casts.
template <class Test>
struct ForGE64Vectors {
  template <typename T>
  void operator()(T /*unused*/) const {
#if HWY_TARGET == HWY_SCALAR
    // not supported
#elif HWY_TARGET == HWY_RVV
    ForeachSizeR<T, 8, HWY_LANES(T), Test>::Do();
    // TODO(janwas): also capped
    // ForeachSizeR<T, 1, (8 / sizeof(T)), Test>::Do();
#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2
    // Capped
    ForeachSizeR<T, 1, 8 / sizeof(T), Test>::Do();
    // Fractions
    ForeachSizeR<T, 8, HWY_LANES(T) / 8, Test>::Do();
#else
    ForeachSizeR<T, HWY_LANES(T) / (8 / sizeof(T)), (8 / sizeof(T)),
                 Test>::Do();
#endif
  }
};

// Calls Test for all N that can be promoted (not the same as Extendable because
// HWY_SCALAR has one lane). Also used for ZipLower, but not ZipUpper.
template <class Test, size_t kFactor = 2>
struct ForPromoteVectors {
  template <typename T>
  void operator()(T /*unused*/) const {
#if HWY_TARGET == HWY_SCALAR
    ForeachSizeR<T, 1, 1, Test, /*kPromote=*/true>::Do();
#else
    return ForExtendableVectors<Test, kFactor>()(T());
#endif
  }
};

// Calls Test for all N that can be demoted (not the same as Shrinkable because
// HWY_SCALAR has one lane). Also used for LowerHalf, but not UpperHalf.
template <class Test, size_t kFactor = 2>
struct ForDemoteVectors {
  template <typename T>
  void operator()(T /*unused*/) const {
#if HWY_TARGET == HWY_SCALAR
    ForeachSizeR<T, 1, 1, Test>::Do();
#else
    return ForShrinkableVectors<Test, kFactor>()(T());
#endif
  }
};

// Calls Test for all power of two N in [1, Lanes(d)]. This is the default
// for ops that do not narrow nor widen their input, nor require 128 bits.
template <class Test>
struct ForPartialVectors {
  template <typename T>
  void operator()(T t) const {
    ForExtendableVectors<Test, 1>()(t);
  }
};

// Type lists to shorten call sites:

template <class Func>

@@ -558,6 +396,42 @@ void ForAllTypes(const Func& func) {
  ForFloatTypes(func);
}

template <class Func>
void ForUIF3264(const Func& func) {
  func(uint32_t());
  func(int32_t());
#if HWY_CAP_INTEGER64
  func(uint64_t());
  func(int64_t());
#endif

  ForFloatTypes(func);
}

template <class Func>
void ForUIF163264(const Func& func) {
  ForUIF3264(func);
  func(uint16_t());
  func(int16_t());
#if HWY_CAP_FLOAT16
  func(float16_t());
#endif
}

// For tests that involve loops, adjust the trip count so that emulated tests
// finish quickly (but always at least 2 iterations to ensure some diversity).
constexpr size_t AdjustedReps(size_t max_reps) {
#if HWY_ARCH_RVV
  return HWY_MAX(max_reps / 16, 2);
#elif HWY_ARCH_ARM
  return HWY_MAX(max_reps / 4, 2);
#elif HWY_IS_DEBUG_BUILD
  return HWY_MAX(max_reps / 8, 2);
#else
  return HWY_MAX(max_reps, 2);
#endif
}
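// e.g. AdjustedReps(200) is 12 on RVV, 50 on Arm, 25 in debug builds, and 200
// otherwise.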

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy

@@ -0,0 +1,198 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/tests/test_util.h"

#include <inttypes.h>
#include <stddef.h>
#include <stdio.h>

#include <cmath>

#include "hwy/base.h"

namespace hwy {

bool BytesEqual(const void* p1, const void* p2, const size_t size,
                size_t* pos) {
  const uint8_t* bytes1 = reinterpret_cast<const uint8_t*>(p1);
  const uint8_t* bytes2 = reinterpret_cast<const uint8_t*>(p2);
  for (size_t i = 0; i < size; ++i) {
    if (bytes1[i] != bytes2[i]) {
      fprintf(stderr, "Mismatch at byte %" PRIu64 " of %" PRIu64 ": %d != %d\n",
              static_cast<uint64_t>(i), static_cast<uint64_t>(size), bytes1[i],
              bytes2[i]);
      if (pos != nullptr) {
        *pos = i;
      }
      return false;
    }
  }
  return true;
}

void AssertStringEqual(const char* expected, const char* actual,
                       const char* target_name, const char* filename,
                       int line) {
  while (*expected == *actual++) {
    if (*expected++ == '\0') return;
  }

  Abort(filename, line, "%s string mismatch: expected '%s', got '%s'.\n",
        target_name, expected, actual);
}

namespace detail {

bool IsEqual(const TypeInfo& info, const void* expected_ptr,
             const void* actual_ptr) {
  if (!info.is_float) {
    return BytesEqual(expected_ptr, actual_ptr, info.sizeof_t);
  }

  if (info.sizeof_t == 4) {
    float expected, actual;
    CopyBytes<4>(expected_ptr, &expected);
    CopyBytes<4>(actual_ptr, &actual);
    return ComputeUlpDelta(expected, actual) <= 1;
  } else if (info.sizeof_t == 8) {
    double expected, actual;
    CopyBytes<8>(expected_ptr, &expected);
    CopyBytes<8>(actual_ptr, &actual);
    return ComputeUlpDelta(expected, actual) <= 1;
  } else {
    HWY_ABORT("Unexpected float size %" PRIu64 "\n",
              static_cast<uint64_t>(info.sizeof_t));
    return false;
  }
}

void TypeName(const TypeInfo& info, size_t N, char* string100) {
  const char prefix = info.is_float ? 'f' : (info.is_signed ? 'i' : 'u');
  // Omit the xN suffix for scalars.
  if (N == 1) {
    snprintf(string100, 64, "%c%" PRIu64, prefix,
             static_cast<uint64_t>(info.sizeof_t * 8));
  } else {
    snprintf(string100, 64, "%c%" PRIu64 "x%" PRIu64, prefix,
             static_cast<uint64_t>(info.sizeof_t * 8),
             static_cast<uint64_t>(N));
  }
}

void ToString(const TypeInfo& info, const void* ptr, char* string100) {
  if (info.sizeof_t == 1) {
    uint8_t byte;
    CopyBytes<1>(ptr, &byte);  // endian-safe: we ensured sizeof(T)=1.
    snprintf(string100, 100, "0x%02X", byte);
  } else if (info.sizeof_t == 2) {
    uint16_t bits;
    CopyBytes<2>(ptr, &bits);
    snprintf(string100, 100, "0x%04X", bits);
  } else if (info.sizeof_t == 4) {
    if (info.is_float) {
      float value;
      CopyBytes<4>(ptr, &value);
      snprintf(string100, 100, "%g", double(value));
    } else if (info.is_signed) {
      int32_t value;
      CopyBytes<4>(ptr, &value);
      snprintf(string100, 100, "%d", value);
    } else {
      uint32_t value;
      CopyBytes<4>(ptr, &value);
      snprintf(string100, 100, "%u", value);
    }
  } else {
    HWY_ASSERT(info.sizeof_t == 8);
    if (info.is_float) {
      double value;
      CopyBytes<8>(ptr, &value);
      snprintf(string100, 100, "%g", value);
    } else if (info.is_signed) {
      int64_t value;
      CopyBytes<8>(ptr, &value);
      snprintf(string100, 100, "%" PRIi64 "", value);
    } else {
      uint64_t value;
      CopyBytes<8>(ptr, &value);
      snprintf(string100, 100, "%" PRIu64 "", value);
    }
  }
}

void PrintArray(const TypeInfo& info, const char* caption,
                const void* array_void, size_t N, size_t lane_u,
                size_t max_lanes) {
  const uint8_t* array_bytes = reinterpret_cast<const uint8_t*>(array_void);

  char type_name[100];
  TypeName(info, N, type_name);

  const intptr_t lane = intptr_t(lane_u);
  const size_t begin = static_cast<size_t>(HWY_MAX(0, lane - 2));
  const size_t end = HWY_MIN(begin + max_lanes, N);
  fprintf(stderr, "%s %s [%" PRIu64 "+ ->]:\n  ", type_name, caption,
          static_cast<uint64_t>(begin));
  for (size_t i = begin; i < end; ++i) {
    const void* ptr = array_bytes + i * info.sizeof_t;
    char str[100];
    ToString(info, ptr, str);
    fprintf(stderr, "%s,", str);
  }
  if (begin >= end) fprintf(stderr, "(out of bounds)");
  fprintf(stderr, "\n");
}

HWY_NORETURN void PrintMismatchAndAbort(const TypeInfo& info,
                                        const void* expected_ptr,
                                        const void* actual_ptr,
                                        const char* target_name,
                                        const char* filename, int line,
                                        size_t lane, size_t num_lanes) {
  char type_name[100];
  TypeName(info, 1, type_name);
  char expected_str[100];
  ToString(info, expected_ptr, expected_str);
  char actual_str[100];
  ToString(info, actual_ptr, actual_str);
  Abort(filename, line,
        "%s, %sx%" PRIu64 " lane %" PRIu64
        " mismatch: expected '%s', got '%s'.\n",
        target_name, type_name, static_cast<uint64_t>(num_lanes),
        static_cast<uint64_t>(lane), expected_str, actual_str);
}

void AssertArrayEqual(const TypeInfo& info, const void* expected_void,
                      const void* actual_void, size_t N,
                      const char* target_name, const char* filename, int line) {
  const uint8_t* expected_array =
      reinterpret_cast<const uint8_t*>(expected_void);
  const uint8_t* actual_array = reinterpret_cast<const uint8_t*>(actual_void);
  for (size_t i = 0; i < N; ++i) {
    const void* expected_ptr = expected_array + i * info.sizeof_t;
    const void* actual_ptr = actual_array + i * info.sizeof_t;
    if (!IsEqual(info, expected_ptr, actual_ptr)) {
      fprintf(stderr, "\n\n");
      PrintArray(info, "expect", expected_array, N, i);
      PrintArray(info, "actual", actual_array, N, i);

      PrintMismatchAndAbort(info, expected_ptr, actual_ptr, target_name,
                            filename, line, i, N);
    }
  }
}

}  // namespace detail
}  // namespace hwy

@@ -0,0 +1,185 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef HWY_TESTS_TEST_UTIL_H_
#define HWY_TESTS_TEST_UTIL_H_

// Target-independent helper functions for use by *_test.cc.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <string>

#include "hwy/aligned_allocator.h"
#include "hwy/base.h"
#include "hwy/highway.h"

namespace hwy {

// The maximum vector size used in tests when defining test data. DEPRECATED.
constexpr size_t kTestMaxVectorSize = 64;

// 64-bit random generator (Xorshift128+). Much smaller state than std::mt19937,
// which triggers a compiler bug.
class RandomState {
 public:
  explicit RandomState(const uint64_t seed = 0x123456789ull) {
    s0_ = SplitMix64(seed + 0x9E3779B97F4A7C15ull);
    s1_ = SplitMix64(s0_);
  }

  HWY_INLINE uint64_t operator()() {
    uint64_t s1 = s0_;
    const uint64_t s0 = s1_;
    const uint64_t bits = s1 + s0;
    s0_ = s0;
    s1 ^= s1 << 23;
    s1 ^= s0 ^ (s1 >> 18) ^ (s0 >> 5);
    s1_ = s1;
    return bits;
  }

 private:
  static uint64_t SplitMix64(uint64_t z) {
    z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull;
    z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
    return z ^ (z >> 31);
  }

  uint64_t s0_;
  uint64_t s1_;
};

static HWY_INLINE uint32_t Random32(RandomState* rng) {
  return static_cast<uint32_t>((*rng)());
}

static HWY_INLINE uint64_t Random64(RandomState* rng) {
  return (*rng)();
}
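
// Usage sketch (illustrative): deterministic pseudo-random test inputs.
//   RandomState rng;  // default seed, so the sequence is reproducible
//   for (size_t i = 0; i < N; ++i) lanes[i] = static_cast<T>(Random32(&rng));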

// Prevents the compiler from eliding the computations that led to "output".
// Works by indicating to the compiler that "output" is being read and modified.
// The +r constraint avoids unnecessary writes to memory, but only works for
// built-in types.
template <class T>
inline void PreventElision(T&& output) {
#if HWY_COMPILER_MSVC
  (void)output;
#else   // HWY_COMPILER_MSVC
  asm volatile("" : "+r"(output) : : "memory");
#endif  // HWY_COMPILER_MSVC
}
|
||||
|
||||
bool BytesEqual(const void* p1, const void* p2, const size_t size,
|
||||
size_t* pos = nullptr);
|
||||
|
||||
void AssertStringEqual(const char* expected, const char* actual,
|
||||
const char* target_name, const char* filename, int line);
|
||||
|
||||
namespace detail {
|
||||
|
||||
template <typename T, typename TU = MakeUnsigned<T>>
|
||||
TU ComputeUlpDelta(const T expected, const T actual) {
|
||||
// Handle -0 == 0 and infinities.
|
||||
if (expected == actual) return 0;
|
||||
|
||||
// Consider "equal" if both are NaN, so we can verify an expected NaN.
|
||||
// Needs a special case because there are many possible NaN representations.
|
||||
if (std::isnan(expected) && std::isnan(actual)) return 0;
|
||||
|
||||
// Compute the difference in units of last place. We do not need to check for
|
||||
// differing signs; they will result in large differences, which is fine.
|
||||
TU ux, uy;
|
||||
CopyBytes<sizeof(T)>(&expected, &ux);
|
||||
CopyBytes<sizeof(T)>(&actual, &uy);
|
||||
|
||||
// Avoid unsigned->signed cast: 2's complement is only guaranteed by C++20.
|
||||
const TU ulp = HWY_MAX(ux, uy) - HWY_MIN(ux, uy);
|
||||
return ulp;
|
||||
}
|
||||
|
||||
// For implementing value comparisons etc. as type-erased functions to reduce
|
||||
// template bloat.
|
||||
struct TypeInfo {
|
||||
size_t sizeof_t;
|
||||
bool is_float;
|
||||
bool is_signed;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
HWY_INLINE TypeInfo MakeTypeInfo() {
|
||||
TypeInfo info;
|
||||
info.sizeof_t = sizeof(T);
|
||||
info.is_float = IsFloat<T>();
|
||||
info.is_signed = IsSigned<T>();
|
||||
return info;
|
||||
}
|
||||
|
||||
bool IsEqual(const TypeInfo& info, const void* expected_ptr,
|
||||
const void* actual_ptr);
|
||||
|
||||
void TypeName(const TypeInfo& info, size_t N, char* string100);
|
||||
|
||||
void PrintArray(const TypeInfo& info, const char* caption,
|
||||
const void* array_void, size_t N, size_t lane_u = 0,
|
||||
size_t max_lanes = 7);
|
||||
|
||||
HWY_NORETURN void PrintMismatchAndAbort(const TypeInfo& info,
|
||||
const void* expected_ptr,
|
||||
const void* actual_ptr,
|
||||
const char* target_name,
|
||||
const char* filename, int line,
|
||||
size_t lane = 0, size_t num_lanes = 1);
|
||||
|
||||
void AssertArrayEqual(const TypeInfo& info, const void* expected_void,
|
||||
const void* actual_void, size_t N,
|
||||
const char* target_name, const char* filename, int line);
|
||||
|
||||
} // namespace detail
|
||||
|
||||
// Returns a name for the vector/part/scalar. The type prefix is u/i/f for
|
||||
// unsigned/signed/floating point, followed by the number of bits per lane;
|
||||
// then 'x' followed by the number of lanes. Example: u8x16. This is useful for
|
||||
// understanding which instantiation of a generic test failed.
|
||||
template <typename T>
|
||||
std::string TypeName(T /*unused*/, size_t N) {
|
||||
char string100[100];
|
||||
detail::TypeName(detail::MakeTypeInfo<T>(), N, string100);
|
||||
return string100;
|
||||
}
|
||||
|
||||
// Compare non-vector, non-string T.
|
||||
template <typename T>
|
||||
HWY_INLINE bool IsEqual(const T expected, const T actual) {
|
||||
const auto info = detail::MakeTypeInfo<T>();
|
||||
return detail::IsEqual(info, &expected, &actual);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
HWY_INLINE void AssertEqual(const T expected, const T actual,
|
||||
const char* target_name, const char* filename,
|
||||
int line, size_t lane = 0) {
|
||||
const auto info = detail::MakeTypeInfo<T>();
|
||||
if (!detail::IsEqual(info, &expected, &actual)) {
|
||||
detail::PrintMismatchAndAbort(info, &expected, &actual, target_name,
|
||||
filename, line, lane);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace hwy
|
||||
|
||||
#endif // HWY_TESTS_TEST_UTIL_H_
|
|
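A minimal sketch of how these helpers combine (not part of the patch; the
include path and the 2^-23 ULP constant are assumptions):

#include <cstdio>
#include "hwy/tests/test_util.h"

int main() {
  hwy::RandomState rng;                  // Xorshift128+, deterministic seed
  uint64_t bits = hwy::Random64(&rng);
  hwy::PreventElision(bits);             // keep the computation alive at -O2

  // 1.0f and the next representable float differ by exactly one ULP.
  const float next = 1.0f + 1.1920929e-7f;  // 2^-23 is the ULP of 1.0f
  printf("ulp = %u\n",
         static_cast<unsigned>(hwy::detail::ComputeUlpDelta(1.0f, next)));
  return 0;
}
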
@@ -30,19 +30,19 @@ struct TestName {
  HWY_NOINLINE void operator()(T t, D d) {
    char num[10];
    std::string expected = IsFloat<T>() ? "f" : (IsSigned<T>() ? "i" : "u");
    snprintf(num, sizeof(num), "%zu", sizeof(T) * 8);
    snprintf(num, sizeof(num), "%u", static_cast<unsigned>(sizeof(T) * 8));
    expected += num;

    const size_t N = Lanes(d);
    if (N != 1) {
      expected += 'x';
      snprintf(num, sizeof(num), "%zu", N);
      snprintf(num, sizeof(num), "%u", static_cast<unsigned>(N));
      expected += num;
    }
    const std::string actual = TypeName(t, N);
    if (expected != actual) {
      NotifyFailure(__FILE__, __LINE__, expected.c_str(), 0, expected.c_str(),
                    actual.c_str());
      HWY_ABORT("%s mismatch: expected '%s', got '%s'.\n",
                hwy::TargetName(HWY_TARGET), expected.c_str(), actual.c_str());
    }
  }
};

@@ -94,9 +94,17 @@ HWY_NOINLINE void TestAllEqual() {
HWY_AFTER_NAMESPACE();

#if HWY_ONCE

namespace hwy {
HWY_BEFORE_TEST(TestUtilTest);
HWY_EXPORT_AND_TEST_P(TestUtilTest, TestAllName);
HWY_EXPORT_AND_TEST_P(TestUtilTest, TestAllEqual);
}  // namespace hwy

// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char **argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}

#endif

@@ -1,8 +1,10 @@
prefix=@CMAKE_INSTALL_PREFIX@
libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@
includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@

Name: libhwy-test
Description: Efficient and performance-portable SIMD wrapper, test helpers.
Requires: gtest
Version: @HWY_LIBRARY_VERSION@
Libs: -L${libdir} -lhwy_test
Cflags: -I${includedir}

@@ -5,7 +5,7 @@ cd %~dp0
if not exist build_win mkdir build_win

cd build_win
cmake .. -G Ninja || goto error
cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON -G Ninja || goto error
ninja || goto error
ctest -j || goto error

@@ -7,9 +7,74 @@ cd "${MYDIR}"
# Exit if anything fails
set -e

mkdir -p build
#######################################
echo RELEASE
rm -rf build
mkdir build
cd build
cmake ..
cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON
make -j
ctest -j
cd ..
rm -rf build

#######################################
echo DEBUG Clang 7
rm -rf build_dbg
mkdir build_dbg
cd build_dbg
CXX=clang++-7 CC=clang-7 cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON -DCMAKE_BUILD_TYPE=Debug
make -j
ctest -j
cd ..
rm -rf build_dbg

#######################################
echo 32-bit GCC
rm -rf build_32
mkdir build_32
cd build_32
CFLAGS=-m32 CXXFLAGS=-m32 LDFLAGS=-m32 CXX=g++ CC=gcc cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON
make -j
ctest -j
cd ..
rm -rf build_32

#######################################
for VER in 8 9 10; do
  echo GCC $VER
  rm -rf build_g$VER
  mkdir build_g$VER
  cd build_g$VER
  CC=gcc-$VER CXX=g++-$VER cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON
  make -j
  make test
  cd ..
  rm -rf build_g$VER
done

#######################################
echo ARMv7 GCC
export QEMU_LD_PREFIX=/usr/arm-linux-gnueabihf
rm -rf build_arm7
mkdir build_arm7
cd build_arm7
CC=arm-linux-gnueabihf-gcc CXX=arm-linux-gnueabihf-g++ cmake .. -DHWY_CMAKE_ARM7:BOOL=ON -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON
make -j8
ctest
cd ..
rm -rf build_arm7

#######################################
echo ARMv8 GCC
export QEMU_LD_PREFIX=/usr/aarch64-linux-gnu
rm -rf build_arm8
mkdir build_arm8
cd build_arm8
CC=aarch64-linux-gnu-gcc CXX=aarch64-linux-gnu-g++ cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON
make -j8
ctest
cd ..
rm -rf build_arm8

echo Success

@@ -1,131 +0,0 @@
#!/usr/bin/env python3
"""Helper for running tests for all platforms.

One-time setup:
  sudo apt-get install npm
  sudo npm install -g npm jsvu

Usage:
  third_party/highway/test.py [--test=memory_test]

"""

import os
import sys
import tarfile
import tempfile
import argparse
import subprocess


def run_subprocess(args, work_dir):
  """Runs subprocess and checks for success."""
  process = subprocess.Popen(args, cwd=work_dir)
  process.communicate()
  assert process.returncode == 0


def print_status(name):
  print("=" * 60, name)


def run_blaze_tests(work_dir, target, desired_config, config_name, blazerc,
                    config):
  """Builds and runs via blaze or returns 0 if skipped."""
  if desired_config is not None and desired_config != config_name:
    return 0
  print_status(config_name)
  default_config = ["-c", "opt", "--copt=-DHWY_COMPILE_ALL_ATTAINABLE"]
  args = ["blaze"] + blazerc + ["test", ":" + target] + config + default_config
  run_subprocess(args, work_dir)
  return 1


def run_wasm_tests(work_dir, target, desired_config, config_name, options):
  """Runs wasm via blaze/v8, or returns 0 if skipped."""
  if desired_config is not None and desired_config != config_name:
    return 0
  args = [options.v8, "--no-liftoff", "--experimental-wasm-simd"]
  # Otherwise v8 returns 0 even after compile failures!
  args.append("--no-wasm-async-compilation")
  if options.profile:
    args.append("--perf-basic-prof-only-functions")

  num_tests_run = 0
  skipped = []

  # Build (no blazerc to avoid failures caused by local .blazerc)
  run_subprocess([
      "blaze", "--blazerc=/dev/null", "build", "--config=wasm",
      "--features=wasm_simd", ":" + target
  ], work_dir)

  path = "blaze-bin/third_party/highway/"
  TEST_SUFFIX = "_test"
  for test in os.listdir(path):
    # Only test binaries (avoids non-tar files)
    if not test.endswith(TEST_SUFFIX):
      continue

    # Skip directories
    tar_pathname = os.path.join(path, test)
    if os.path.isdir(tar_pathname):
      continue

    # Only the desired test (if given)
    if options.test is not None and options.test != test:
      skipped.append(test[0:-len(TEST_SUFFIX)])
      continue

    with tempfile.TemporaryDirectory() as extract_dir:
      with tarfile.open(tar_pathname, mode="r:") as tar:
        tar.extractall(extract_dir)

      test_args = args + [os.path.join(extract_dir, test) + ".js"]
      run_subprocess(test_args, extract_dir)
      num_tests_run += 1

  print("Finished", num_tests_run, "; skipped", ",".join(skipped))
  assert (num_tests_run != 0)
  return 1


def main(args):
  parser = argparse.ArgumentParser(description="Run test(s)")
  parser.add_argument("--v8", help="Pathname to v8 (default ~/jsvu/v8)")
  parser.add_argument("--test", help="Which test to run (defaults to all)")
  parser.add_argument("--config", help="Which config to run (defaults to all)")
  parser.add_argument("--profile", action="store_true", help="Enable profiling")
  options = parser.parse_args(args)
  if options.v8 is None:
    options.v8 = os.path.join(os.getenv("HOME"), ".jsvu", "v8")

  work_dir = os.path.dirname(os.path.realpath(__file__))
  target = "hwy_ops_tests" if options.test is None else options.test

  num_config = 0
  num_config += run_blaze_tests(work_dir, target, options.config, "x86", [], [])
  num_config += run_blaze_tests(
      work_dir, target, options.config, "rvv",
      ["--blazerc=../../third_party/unsupported_toolchains/mpu/blazerc"],
      ["--config=mpu64_gcc"])
  num_config += run_blaze_tests(work_dir, target, options.config, "arm8", [],
                                ["--config=android_arm64"])
  num_config += run_blaze_tests(work_dir, target, options.config, "arm7", [], [
      "--config=android_arm", "--copt=-mfpu=neon-vfpv4",
      "--copt=-mfloat-abi=softfp"
  ])
  num_config += run_blaze_tests(work_dir, target, options.config, "msvc", [],
                                ["--config=msvc"])
  num_config += run_wasm_tests(work_dir, target, options.config, "wasm",
                               options)

  if num_config == 0:
    print_status("ERROR: unknown --config=%s, omit to see valid names" %
                 (options.config,))
  else:
    print_status("done")


if __name__ == "__main__":
  main(sys.argv[1:])

@@ -16,19 +16,21 @@ Cloudinary Ltd. <*@cloudinary.com>
Google LLC <*@google.com>

# Individuals:
Alex Xu (Hello71) <alex_y_xu@yahoo.ca>
Alexander Sago <cagelight@gmail.com>
Andrius Lukas Narbutas <andrius4669@gmail.com>
Artem Selishchev
Dirk Lemstra <dirk@lemstra.org>
Don Olmstead <don.j.olmstead@gmail.com>
Jon Sneyers <jon@cloudinary.com>
Lovell Fuller
Kleis Auke Wolthuizen <github@kleisauke.nl>
Leo Izen <leo.izen@gmail.com>
Lovell Fuller
Marcin Konicki <ahwayakchih@gmail.com>
Misaki Kasumi <misakikasumi@outlook.com>
Petr Diblík
Pieter Wuille
Samuel Leong <wvvwvvvvwvvw@gmail.com>
Vincent Torri <vincent.torri@gmail.com>
xiota
Ziemowit Zabawa <ziemek.zabawa@outlook.com>
Andrius Lukas Narbutas <andrius4669@gmail.com>
Misaki Kasumi <misakikasumi@outlook.com>
Samuel Leong <wvvwvvvvwvvw@gmail.com>
Alex Xu (Hello71) <alex_y_xu@yahoo.ca>
Vincent Torri <vincent.torri@gmail.com>
Artem Selishchev

@@ -12,11 +12,23 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
  `JxlDecoderGetBoxType`, `JxlDecoderGetBoxSizeRaw` and
  `JxlDecoderSetDecompressBoxes`
- decoder API: ability to mark that the input is finished: `JxlDecoderCloseInput`
- encoder API: ability to add metadata boxes, added new functions
  `JxlEncoderAddBox`, `JxlEncoderUseBoxes`, `JxlEncoderCloseBoxes` and
  `JxlEncoderCloseFrames`.
- decoder API: new function `JxlDecoderSetCoalescing` to allow decoding
  non-coalesced (unblended) frames, e.g. layers of a composite still image
  or the cropped frames of a recompressed GIF/APNG.
- decoder API: field added to `JxlFrameHeader`: a `JxlLayerInfo` struct
  that contains crop dimensions and offsets and blending information for
  the non-coalesced case.
- decoder API: new function `JxlDecoderGetExtraChannelBlendInfo` to get
  the blending information for extra channels in the non-coalesced case.

### Changed
- decoder API: `JxlDecoderCloseInput` is required when using JXL_DEC_BOX, and
  also encouraged, but not required for backwards compatibility, for any other
  usage of the decoder.
- decoder API: using `JxlDecoderCloseInput` at the end of all input is required
  when using JXL_DEC_BOX, and is now also encouraged in other cases, but not
  required in those other cases for backwards compatibility.
- encoder API: `JxlEncoderCloseInput` now closes both frames and boxes input.

## [0.6.1] - 2021-10-29
### Changed

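A hedged sketch of the CloseInput flow described above (the functions are
those declared in decode.h later in this patch; the surrounding variables are
illustrative):

#include "jxl/decode.h"

// data/data_size hold the final chunk of the file.
void FinishInput(JxlDecoder* dec, const uint8_t* data, size_t data_size) {
  JxlDecoderSetInput(dec, data, data_size);
  JxlDecoderCloseInput(dec);  // required with JXL_DEC_BOX, else encouraged
  // ...keep calling JxlDecoderProcessInput() until JXL_DEC_SUCCESS.
}
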
@@ -5,6 +5,7 @@

# Ubuntu bionic ships with cmake 3.10.
cmake_minimum_required(VERSION 3.10)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")

# Honor VISIBILITY_INLINES_HIDDEN on all types of targets.
if(POLICY CMP0063)

@@ -122,10 +123,12 @@ set(JPEGXL_FORCE_NEON false CACHE BOOL

# Force system dependencies.
set(JPEGXL_FORCE_SYSTEM_GTEST false CACHE BOOL
    "Force using system installed googletest (gtest/gmock) instead of third_party/googletest source.")
set(JPEGXL_FORCE_SYSTEM_BROTLI false CACHE BOOL
    "Force using system installed brotli instead of third_party/brotli source.")
set(JPEGXL_FORCE_SYSTEM_GTEST false CACHE BOOL
    "Force using system installed googletest (gtest/gmock) instead of third_party/googletest source.")
set(JPEGXL_FORCE_SYSTEM_LCMS2 false CACHE BOOL
    "Force using system installed lcms2 instead of third_party/lcms source.")
set(JPEGXL_FORCE_SYSTEM_HWY false CACHE BOOL
    "Force using system installed highway (libhwy-dev) instead of third_party/highway source.")

@@ -158,12 +161,12 @@ message(STATUS
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

if(JPEGXL_STATIC)
  set(CMAKE_FIND_LIBRARY_SUFFIXES .a)
  set(BUILD_SHARED_LIBS 0)
  # Clang developers say that in case to use "static" we have to build stdlib
  # ourselves; for real use case we don't care about stdlib, as it is "granted",
  # so just linking all other libraries is fine.
  if (NOT APPLE)
  if (NOT MSVC AND NOT APPLE)
    set(CMAKE_FIND_LIBRARY_SUFFIXES .a)
    set(CMAKE_EXE_LINKER_FLAGS
        "${CMAKE_EXE_LINKER_FLAGS} -static -static-libgcc -static-libstdc++")
  endif()

@@ -1,37 +1,73 @@
# Security and Vulnerability Policy for JPEG XL
# Security and Vulnerability Policy for libjxl

The current focus of the reference implementation is to provide a vehicle for
evaluating the JPEG XL codec compression density, quality, features and its
actual performance on different platforms. With this focus in mind we provide
source code releases with improvements on performance and quality so developers
can evaluate the codec.
## TL;DR:

At this time, **we don't provide security and vulnerability support** for any
of these releases. This means that the source code may contain bugs, including
security bugs, that may be added or fixed between releases and will **not** be
individually documented. All of these
[releases](https://gitlab.com/wg1/jpeg-xl/-/releases) include the following
note to that effect:
CPE prefix: `cpe:2.3:a:libjxl_project:libjxl`

* Note: This release is for evaluation purposes and may contain bugs, including
  security bugs, that will *not* be individually documented when fixed. Always
  prefer to use the latest release. Please provide feedback and report bugs
  [here](https://gitlab.com/wg1/jpeg-xl/-/issues).
To report a security issue, please email libjxl-security@google.com.

To be clear, this means that because a release doesn't mention any CVE it
doesn't mean that no security issues in previous versions were fixed. You should
assume that any previous release contains security issues if that's a concern
for your use case.
Include in your email a description of the issue, the steps you took to create
the issue, affected versions, and if known, mitigations for the issue. Our
vulnerability management team will acknowledge receiving your email within 3
working days.

This however doesn't impede you from evaluating the codec with your own trusted
inputs, such as `.jxl` you encoded yourself, or when taking appropriate measures
for your application like sandboxing if processing untrusted inputs.
This project follows a 90 day disclosure timeline.

## Future plans
For all other bugs, where there are no security implications about disclosing
the unpatched bug, open a [new issue](https://github.com/libjxl/libjxl/issues)
checking first for existing similar issues. If in doubt about the security
impact of a bug you discovered, email first.

To help our users and developers integrating this implementation into their
software we plan to provide support for security and vulnerability tracking of
this implementation in the future.
## Policy overview

When we can provide such support we will update this Policy with the details and
expectations and clearly mention that fact in the release notes.
libjxl's Security Policy is based on the [Google Open Source program
guidelines](https://github.com/google/oss-vulnerability-guide) for coordinated
vulnerability disclosure.

Early versions of `libjxl` had a different security policy that didn't provide
security and vulnerability disclosure support. Versions up to and including
0.3.7 are not covered and won't receive any security advisory.

Only released versions, starting from version 0.5, are covered by this policy.
Development branches, arbitrary commits from `main` branch or even releases with
backported features externally patched on top are not covered. Only those
versions with a release tag in `libjxl`'s repository are covered, starting from
version 0.5.

## What's a "Security bug"

A security bug is a bug that can potentially be exploited to let an attacker
gain unauthorized access or privileges such as disclosing information or
arbitrary code execution. Not all fuzzer-found bugs and not all assert()
failures are considered security bugs in libjxl. For a detailed explanation and
examples see our [Security Vulnerabilities Playbook](doc/vuln_playbook.md).

## What to expect

To report a security issue, please email libjxl-security@google.com with all the
details about the bug you encountered.

* Include a description of the issue, steps to reproduce, etc. Compiler
  versions, flags, exact version used and even CPU are often relevant given our
  usage of SIMD and run-time dispatch of SIMD instructions.

* A member of our security team will reply to you within 3 business days. Note
  that business days are different in different countries.

* We will evaluate the issue and we may require more input from your side to
  reproduce it.

* If the issue fits in the description of a security bug, we will issue a
  CVE, publish a fix and make a new minor or patch release with it. There is
  a maximum 90-day disclosure timeline; we ask you not to publish the
  details before the 90-day deadline or the release date (whichever comes
  first).

* In the case that we publish a CVE we will credit the external researcher who
  reported the issue. When reporting security issues please let us know if you
  need to include specific information while doing so, like for example a
  company affiliation.

Our security team follows the [Security Vulnerabilities
Playbook](doc/vuln_playbook.md). For more details about the process and policies
please take a look at it.

@@ -558,6 +558,7 @@ cmd_coverage_report() {
    --filter '.*jxl/.*'
    --exclude '.*_test.cc'
    --exclude '.*_testonly..*'
    --exclude '.*_debug.*'
    --exclude '.*test_utils..*'
    --object-directory "${real_build_dir}"
  )

@@ -0,0 +1,66 @@
# Copyright (c) the JPEG XL Project Authors. All rights reserved.
#
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.

find_package(PkgConfig QUIET)
if (PkgConfig_FOUND)
  pkg_check_modules(PC_HWY QUIET libhwy)
  set(HWY_VERSION ${PC_HWY_VERSION})
endif ()

find_path(HWY_INCLUDE_DIR
  NAMES hwy/highway.h
  HINTS ${PC_HWY_INCLUDEDIR} ${PC_HWY_INCLUDE_DIRS}
)

find_library(HWY_LIBRARY
  NAMES ${HWY_NAMES} hwy
  HINTS ${PC_HWY_LIBDIR} ${PC_HWY_LIBRARY_DIRS}
)

if (HWY_INCLUDE_DIR AND NOT HWY_VERSION)
  if (EXISTS "${HWY_INCLUDE_DIR}/hwy/highway.h")
    file(READ "${HWY_INCLUDE_DIR}/hwy/highway.h" HWY_VERSION_CONTENT)

    string(REGEX MATCH "#define HWY_MAJOR +([0-9]+)" _dummy "${HWY_VERSION_CONTENT}")
    set(HWY_VERSION_MAJOR "${CMAKE_MATCH_1}")

    string(REGEX MATCH "#define +HWY_MINOR +([0-9]+)" _dummy "${HWY_VERSION_CONTENT}")
    set(HWY_VERSION_MINOR "${CMAKE_MATCH_1}")

    string(REGEX MATCH "#define +HWY_PATCH +([0-9]+)" _dummy "${HWY_VERSION_CONTENT}")
    set(HWY_VERSION_PATCH "${CMAKE_MATCH_1}")

    set(HWY_VERSION "${HWY_VERSION_MAJOR}.${HWY_VERSION_MINOR}.${HWY_VERSION_PATCH}")
  endif ()
endif ()

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(HWY
  FOUND_VAR HWY_FOUND
  REQUIRED_VARS HWY_LIBRARY HWY_INCLUDE_DIR
  VERSION_VAR HWY_VERSION
)

if (HWY_LIBRARY AND NOT TARGET hwy)
  add_library(hwy INTERFACE IMPORTED GLOBAL)

  if(${CMAKE_VERSION} VERSION_LESS "3.13.5")
    set_property(TARGET hwy PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${HWY_INCLUDE_DIR})
    target_link_libraries(hwy INTERFACE ${HWY_LIBRARY})
    set_property(TARGET hwy PROPERTY INTERFACE_COMPILE_OPTIONS ${PC_HWY_CFLAGS_OTHER})
  else()
    target_include_directories(hwy INTERFACE ${HWY_INCLUDE_DIR})
    target_link_libraries(hwy INTERFACE ${HWY_LIBRARY})
    target_link_options(hwy INTERFACE ${PC_HWY_LDFLAGS_OTHER})
    target_compile_options(hwy INTERFACE ${PC_HWY_CFLAGS_OTHER})
  endif()
endif()

mark_as_advanced(HWY_INCLUDE_DIR HWY_LIBRARY)

if (HWY_FOUND)
  set(HWY_LIBRARIES ${HWY_LIBRARY})
  set(HWY_INCLUDE_DIRS ${HWY_INCLUDE_DIR})
endif ()

@@ -0,0 +1,59 @@
# Copyright (c) the JPEG XL Project Authors. All rights reserved.
#
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.

find_package(PkgConfig QUIET)
if (PkgConfig_FOUND)
  pkg_check_modules(PC_LCMS2 QUIET libLCMS2)
  set(LCMS2_VERSION ${PC_LCMS2_VERSION})
endif ()

find_path(LCMS2_INCLUDE_DIR
  NAMES lcms2.h
  HINTS ${PC_LCMS2_INCLUDEDIR} ${PC_LCMS2_INCLUDE_DIRS}
)

find_library(LCMS2_LIBRARY
  NAMES ${LCMS2_NAMES} lcms2 liblcms2 lcms-2 liblcms-2
  HINTS ${PC_LCMS2_LIBDIR} ${PC_LCMS2_LIBRARY_DIRS}
)

if (LCMS2_INCLUDE_DIR AND NOT LCMS_VERSION)
  file(READ ${LCMS2_INCLUDE_DIR}/lcms2.h LCMS2_VERSION_CONTENT)
  string(REGEX MATCH "#define[ \t]+LCMS_VERSION[ \t]+([0-9]+)[ \t]*\n" LCMS2_VERSION_MATCH ${LCMS2_VERSION_CONTENT})
  if (LCMS2_VERSION_MATCH)
    string(SUBSTRING ${CMAKE_MATCH_1} 0 1 LCMS2_VERSION_MAJOR)
    string(SUBSTRING ${CMAKE_MATCH_1} 1 2 LCMS2_VERSION_MINOR)
    set(LCMS2_VERSION "${LCMS2_VERSION_MAJOR}.${LCMS2_VERSION_MINOR}")
  endif ()
endif ()

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(LCMS2
  FOUND_VAR LCMS2_FOUND
  REQUIRED_VARS LCMS2_LIBRARY LCMS2_INCLUDE_DIR
  VERSION_VAR LCMS2_VERSION
)

if (LCMS2_LIBRARY AND NOT TARGET lcms2)
  add_library(lcms2 INTERFACE IMPORTED GLOBAL)

  if(${CMAKE_VERSION} VERSION_LESS "3.13.5")
    set_property(TARGET lcms2 PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${LCMS2_INCLUDE_DIR})
    target_link_libraries(lcms2 INTERFACE ${LCMS2_LIBRARY})
    set_property(TARGET lcms2 PROPERTY INTERFACE_COMPILE_OPTIONS ${PC_LCMS2_CFLAGS_OTHER})
  else()
    target_include_directories(lcms2 INTERFACE ${LCMS2_INCLUDE_DIR})
    target_link_libraries(lcms2 INTERFACE ${LCMS2_LIBRARY})
    target_link_options(lcms2 INTERFACE ${PC_LCMS2_LDFLAGS_OTHER})
    target_compile_options(lcms2 INTERFACE ${PC_LCMS2_CFLAGS_OTHER})
  endif()
endif()

mark_as_advanced(LCMS2_INCLUDE_DIR LCMS2_LIBRARY)

if (LCMS2_FOUND)
  set(LCMS2_LIBRARIES ${LCMS2_LIBRARY})
  set(LCMS2_INCLUDE_DIRS ${LCMS2_INCLUDE_DIR})
endif ()

@@ -14,7 +14,7 @@ Build-Depends:
 libgmock-dev,
 libgoogle-perftools-dev,
 libgtest-dev,
 libhwy-dev,
 libhwy-dev (>= 0.15.0),
 libjpeg-dev,
 libopenexr-dev,
 libpng-dev,

@@ -13,7 +13,7 @@ MYDIR=$(dirname $(realpath "$0"))

# Git revisions we use for the given submodules. Update these whenever you
# update a git submodule.
THIRD_PARTY_HIGHWAY="e2397743fe092df68b760d358253773699a16c93"
THIRD_PARTY_HIGHWAY="e69083a12a05caf037cabecdf1b248b7579705a5"
THIRD_PARTY_LODEPNG="8c6a9e30576f07bf470ad6f09458a2dcd7a6a84a"
THIRD_PARTY_SKCMS="64374756e03700d649f897dbd98c95e78c30c7da"
THIRD_PARTY_SJPEG="868ab558fad70fcbe8863ba4e85179eeb81cc840"

@@ -23,7 +23,7 @@ download_github() {
  local path="$1"
  local project="$2"

  local varname="${path^^}"
  local varname=`echo "$path" | tr '[:lower:]' '[:upper:]'`
  varname="${varname/\//_}"
  local sha
  eval "sha=\${${varname}}"

@@ -67,6 +67,7 @@ int PrintBasicInfo(FILE* file) {
      }
      data_size = remaining + read_size;
      JxlDecoderSetInput(dec, data, data_size);
      if (feof(file)) JxlDecoderCloseInput(dec);
    } else if (status == JXL_DEC_SUCCESS) {
      // Finished all processing.
      break;

@@ -319,9 +320,11 @@ int main(int argc, char* argv[]) {
  }

  if (!PrintBasicInfo(file)) {
    fclose(file);
    fprintf(stderr, "Couldn't print basic info\n");
    return 1;
  }

  fclose(file);
  return 0;
}

@@ -15,7 +15,7 @@ set(JPEGXL_LIBRARY_VERSION
# It is important to update this value when making incompatible API/ABI changes
# so that programs that depend on libjxl can update their dependencies. Semantic
# versioning allows 0.y.z to have incompatible changes in minor versions.
set(JPEGXL_SO_MINOR_VERSION 6)
set(JPEGXL_SO_MINOR_VERSION 7)
if (JPEGXL_MAJOR_VERSION EQUAL 0)
  set(JPEGXL_LIBRARY_SOVERSION
      "${JPEGXL_MAJOR_VERSION}.${JPEGXL_SO_MINOR_VERSION}")

@@ -297,6 +297,8 @@ Status DecodeImageAPNG(const Span<const uint8_t> bytes,
      // TODO(veluca): this could in principle be implemented.
      if (last_base_was_none && !all_dispose_bg &&
          (x0 != 0 || y0 != 0 || w0 != w || h0 != h || bop != 0)) {
        delete[] frameRaw.rows;
        delete[] frameRaw.p;
        return JXL_FAILURE(
            "APNG with dispose-to-0 is not supported for non-full or "
            "blended frames");

@@ -301,22 +301,23 @@ Status DecodeImageJPG(const Span<const uint8_t> bytes,
    if (cinfo.arith_code) {
      return failure("arithmetic code JPEGs are not supported");
    }
    int nbcomp = cinfo.num_components;
    if (nbcomp != 1 && nbcomp != 3) {
      return failure("unsupported number of components in JPEG");
    }
    if (!ReadICCProfile(&cinfo, &ppf->icc)) {
      ppf->icc.clear();
      // Default to SRGB
      ppf->color_encoding.color_space = cinfo.output_components == 1
                                            ? JXL_COLOR_SPACE_GRAY
                                            : JXL_COLOR_SPACE_RGB;
      // Actually, (cinfo.output_components == nbcomp) will be checked after
      // `jpeg_start_decompress`.
      ppf->color_encoding.color_space =
          (nbcomp == 1) ? JXL_COLOR_SPACE_GRAY : JXL_COLOR_SPACE_RGB;
      ppf->color_encoding.white_point = JXL_WHITE_POINT_D65;
      ppf->color_encoding.primaries = JXL_PRIMARIES_SRGB;
      ppf->color_encoding.transfer_function = JXL_TRANSFER_FUNCTION_SRGB;
      ppf->color_encoding.rendering_intent = JXL_RENDERING_INTENT_PERCEPTUAL;
    }
    ReadExif(&cinfo, &ppf->metadata.exif);
    int nbcomp = cinfo.num_components;
    if (nbcomp != 1 && nbcomp != 3) {
      return failure("unsupported number of components in JPEG");
    }
    if (!ApplyColorHints(color_hints, /*color_already_set=*/true,
                         /*is_gray=*/false, ppf)) {
      return failure("ApplyColorHints failed");

@@ -289,7 +289,69 @@ typedef struct {
  uint64_t extensions;
} JxlHeaderExtensions;

/** The header of one displayed frame. */
/** Frame blend modes.
 * If coalescing is enabled (default), this can be ignored.
 */
typedef enum {
  JXL_BLEND_REPLACE = 0,
  JXL_BLEND_ADD = 1,
  JXL_BLEND_BLEND = 2,
  JXL_BLEND_MULADD = 3,
  JXL_BLEND_MUL = 4,
} JxlBlendMode;

/** The information about blending the color channels or a single extra channel.
 * If coalescing is enabled (default), this can be ignored.
 */
typedef struct {
  /** Blend mode.
   * Always equal to JXL_BLEND_REPLACE if coalescing is enabled.
   */
  JxlBlendMode blendmode;
  /** Reference frame ID to use as the 'bottom' layer (0-3).
   */
  uint32_t source;
  /** Which extra channel to use as the 'alpha' channel for blend modes
   * JXL_BLEND_BLEND and JXL_BLEND_MULADD.
   */
  uint32_t alpha;
  /** Clamp values to [0,1] for the purpose of blending.
   */
  JXL_BOOL clamp;
} JxlBlendInfo;

/** The information about layers.
 * If coalescing is enabled (default), this can be ignored.
 */
typedef struct {
  /** Horizontal offset of the frame (can be negative, always zero if coalescing
   * is enabled)
   */
  int32_t crop_x0;
  /** Vertical offset of the frame (can be negative, always zero if coalescing
   * is enabled)
   */
  int32_t crop_y0;
  /** Width of the frame (number of columns, always equal to image width if
   * coalescing is enabled)
   */
  uint32_t xsize;
  /** Height of the frame (number of rows, always equal to image height if
   * coalescing is enabled)
   */
  uint32_t ysize;
  /** The blending info for the color channels. Blending info for extra channels
   * has to be retrieved separately using JxlDecoderGetExtraChannelBlendInfo.
   */
  JxlBlendInfo blend_info;
  /** After blending, save the frame as reference frame with this ID (0-3).
   * Special case: if the frame duration is nonzero, ID 0 means "will not be
   * referenced in the future".
   */
  uint32_t save_as_reference;
} JxlLayerInfo;

/** The header of one displayed frame or non-coalesced layer. */
typedef struct {
  /** How long to wait after rendering in ticks. The duration in seconds of a
   * tick is given by tps_numerator and tps_denominator in JxlAnimationHeader.

@@ -314,6 +376,10 @@ typedef struct {
  /** Indicates this is the last animation frame.
   */
  JXL_BOOL is_last;

  /** Information about the layer in case of no coalescing.
   */
  JxlLayerInfo layer_info;
} JxlFrameHeader;

#if defined(__cplusplus) || defined(c_plusplus)

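A sketch of reading the new layer information (assumes a decoder in which a
JXL_DEC_FRAME event just occurred with coalescing disabled; the printing is
illustrative):

#include <cstdio>
#include "jxl/decode.h"

void PrintLayer(const JxlDecoder* dec) {
  JxlFrameHeader header;
  if (JxlDecoderGetFrameHeader(dec, &header) != JXL_DEC_SUCCESS) return;
  const JxlLayerInfo& layer = header.layer_info;
  printf("layer %ux%u at (%d,%d), blend mode %d\n", layer.xsize, layer.ysize,
         layer.crop_x0, layer.crop_y0,
         static_cast<int>(layer.blend_info.blendmode));
}
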
@@ -206,12 +206,17 @@ typedef enum {
   * JxlDecoderGetFrameHeader can be used at this point. A note on frames:
   * a JPEG XL image can have internal frames that are not intended to be
   * displayed (e.g. used for compositing a final frame), but this only returns
   * displayed frames. A displayed frame either has an animation duration or is
   * the only or last frame in the image. This event occurs max once per
   * displayed frame, always later than JXL_DEC_COLOR_ENCODING, and always
   * earlier than any pixel data. While JPEG XL supports encoding a single frame
   * as the composition of multiple internal sub-frames also called frames, this
   * event is not indicated for the internal frames.
   * displayed frames, unless JxlDecoderSetCoalescing was set to JXL_FALSE: in
   * that case, the individual layers are returned, without blending. Note that
   * even when coalescing is disabled, only frames of type kRegularFrame are
   * returned; frames of type kReferenceOnly and kLfFrame are always for
   * internal purposes only and cannot be accessed. A displayed frame either has
   * an animation duration or is the only or last frame in the image. This event
   * occurs max once per displayed frame, always later than
   * JXL_DEC_COLOR_ENCODING, and always earlier than any pixel data. While JPEG
   * XL supports encoding a single frame as the composition of multiple internal
   * sub-frames also called frames, this event is not indicated for the internal
   * frames.
   */
  JXL_DEC_FRAME = 0x400,

@@ -228,11 +233,12 @@ typedef enum {
   */
  JXL_DEC_DC_IMAGE = 0x800,

  /** Informative event by JxlDecoderProcessInput: full frame decoded.
   * JxlDecoderSetImageOutBuffer must be used after getting the basic image
   * information to be able to get the image pixels, if not this return status
   * only indicates we're past this point in the codestream. This event occurs
   * max once per frame and always later than JXL_DEC_DC_IMAGE.
  /** Informative event by JxlDecoderProcessInput: full frame (or layer, in case
   * coalescing is disabled) is decoded. JxlDecoderSetImageOutBuffer must be
   * used after getting the basic image information to be able to get the image
   * pixels, if not this return status only indicates we're past this point in
   * the codestream. This event occurs max once per frame and always later than
   * JXL_DEC_DC_IMAGE.
   */
  JXL_DEC_FULL_IMAGE = 0x1000,

@@ -418,6 +424,22 @@ JxlDecoderSetKeepOrientation(JxlDecoder* dec, JXL_BOOL keep_orientation);
JXL_EXPORT JxlDecoderStatus
JxlDecoderSetRenderSpotcolors(JxlDecoder* dec, JXL_BOOL render_spotcolors);

/** Enables or disables coalescing of zero-duration frames. By default, frames
 * are returned with coalescing enabled, i.e. all frames have the image
 * dimensions, and are blended if needed. When coalescing is disabled, frames
 * can have arbitrary dimensions, a non-zero crop offset, and blending is not
 * performed. For display, coalescing is recommended. For loading a multi-layer
 * still image as separate layers (as opposed to the merged image), coalescing
 * has to be disabled.
 *
 * @param dec decoder object
 * @param coalescing JXL_TRUE to enable coalescing (default), JXL_FALSE to
 * disable it.
 * @return JXL_DEC_SUCCESS if no error, JXL_DEC_ERROR otherwise.
 */
JXL_EXPORT JxlDecoderStatus JxlDecoderSetCoalescing(JxlDecoder* dec,
                                                    JXL_BOOL coalescing);

/**
 * Decodes JPEG XL file using the available bytes. Requires input has been
 * set with JxlDecoderSetInput. After JxlDecoderProcessInput, input can

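A non-coalesced decode loop might then look like this (a sketch, not from the
patch; JxlDecoderSubscribeEvents and JxlDecoderProcessInput are pre-existing
API, and buffer setup plus error handling are elided):

#include "jxl/decode.h"

void DecodeLayers(JxlDecoder* dec, const uint8_t* data, size_t size) {
  JxlDecoderSetCoalescing(dec, JXL_FALSE);  // request individual layers
  JxlDecoderSubscribeEvents(dec, JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE);
  JxlDecoderSetInput(dec, data, size);
  JxlDecoderCloseInput(dec);
  for (;;) {
    const JxlDecoderStatus status = JxlDecoderProcessInput(dec);
    if (status == JXL_DEC_FRAME) {
      // JxlDecoderGetFrameHeader now reports the layer crop via layer_info.
    } else if (status == JXL_DEC_FULL_IMAGE) {
      // One layer (not the blended image) is complete.
    } else {  // JXL_DEC_SUCCESS, JXL_DEC_ERROR, JXL_DEC_NEED_* (elided)
      break;
    }
  }
}
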
@@ -770,6 +792,21 @@ JXL_EXPORT JxlDecoderStatus JxlDecoderGetFrameHeader(const JxlDecoder* dec,
JXL_EXPORT JxlDecoderStatus JxlDecoderGetFrameName(const JxlDecoder* dec,
                                                   char* name, size_t size);

/**
 * Outputs the blend information for the current frame for a specific extra
 * channel. This function can be called when JXL_DEC_FRAME occurred for the
 * current frame, even when have_animation in the JxlBasicInfo is JXL_FALSE.
 * This information is only useful if coalescing is disabled; otherwise the
 * decoder will have performed blending already.
 *
 * @param dec decoder object
 * @param index the index of the extra channel
 * @param blend_info struct to copy the information into
 * @return JXL_DEC_SUCCESS on success, JXL_DEC_ERROR on error
 */
JXL_EXPORT JxlDecoderStatus JxlDecoderGetExtraChannelBlendInfo(
    const JxlDecoder* dec, size_t index, JxlBlendInfo* blend_info);

/**
 * Returns the minimum size in bytes of the DC image output buffer
 * for the given format. This is the buffer for JxlDecoderSetDCOutBuffer.

|
|||
/**
|
||||
* Returns the minimum size in bytes of the image output pixel buffer for the
|
||||
* given format. This is the buffer for JxlDecoderSetImageOutBuffer. Requires
|
||||
* the basic image information is available in the decoder.
|
||||
* that the basic image information is available in the decoder in the case of
|
||||
* coalescing enabled (default). In case coalescing is disabled, this can only
|
||||
* be called after the JXL_DEC_FRAME event occurs. In that case, it will return
|
||||
* the size required to store the possibly cropped frame (which can be larger or
|
||||
* smaller than the image dimensions).
|
||||
*
|
||||
* @param dec decoder object
|
||||
* @param format format of the pixels.
|
||||
|
|
|
@@ -110,127 +110,165 @@ typedef enum {
   */
  JXL_ENC_OPTION_EXTRA_CHANNEL_RESAMPLING = 3,

  /** Indicates the frame added with @ref JxlEncoderAddImageFrame is already
   * downsampled by the downsampling factor set with @ref
   * JXL_ENC_OPTION_RESAMPLING. The input frame must then be given in the
   * downsampled resolution, not the full image resolution. The downsampled
   * resolution is given by ceil(xsize / resampling), ceil(ysize / resampling)
   * with xsize and ysize the dimensions given in the basic info, and resampling
   * the factor set with @ref JXL_ENC_OPTION_RESAMPLING.
   * Use 0 to disable, 1 to enable. Default value is 0.
   */
  JXL_ENC_OPTION_ALREADY_DOWNSAMPLED = 4,

  /** Adds noise to the image emulating photographic film noise, the higher the
   * given number, the grainier the image will be. As an example, a value of 100
   * gives low noise whereas a value of 3200 gives a lot of noise. The default
   * value is 0.
   */
  JXL_ENC_OPTION_PHOTON_NOISE = 4,
  JXL_ENC_OPTION_PHOTON_NOISE = 5,

  /** Enables adaptive noise generation. This setting is not recommended for
   * use, please use JXL_ENC_OPTION_PHOTON_NOISE instead. Use -1 for the default
   * (encoder chooses), 0 to disable, 1 to enable.
   */
  JXL_ENC_OPTION_NOISE = 5,
  JXL_ENC_OPTION_NOISE = 6,

  /** Enables or disables dots generation. Use -1 for the default (encoder
   * chooses), 0 to disable, 1 to enable.
   */
  JXL_ENC_OPTION_DOTS = 6,
  JXL_ENC_OPTION_DOTS = 7,

  /** Enables or disables patches generation. Use -1 for the default (encoder
   * chooses), 0 to disable, 1 to enable.
   */
  JXL_ENC_OPTION_PATCHES = 7,
  JXL_ENC_OPTION_PATCHES = 8,

  /** Edge preserving filter level, -1 to 3. Use -1 for the default (encoder
   * chooses), 0 to 3 to set a strength.
   */
  JXL_ENC_OPTION_EPF = 8,
  JXL_ENC_OPTION_EPF = 9,

  /** Enables or disables the gaborish filter. Use -1 for the default (encoder
   * chooses), 0 to disable, 1 to enable.
   */
  JXL_ENC_OPTION_GABORISH = 9,
  JXL_ENC_OPTION_GABORISH = 10,

  /** Enables modular encoding. Use -1 for default (encoder
   * chooses), 0 to enforce VarDCT mode (e.g. for photographic images), 1 to
   * enforce modular mode (e.g. for lossless images).
   */
  JXL_ENC_OPTION_MODULAR = 10,
  JXL_ENC_OPTION_MODULAR = 11,

  /** Enables or disables preserving color of invisible pixels. Use -1 for the
   * default (1 if lossless, 0 if lossy), 0 to disable, 1 to enable.
   */
  JXL_ENC_OPTION_KEEP_INVISIBLE = 11,
  JXL_ENC_OPTION_KEEP_INVISIBLE = 12,

  /** Determines the order in which 256x256 regions are stored in the codestream
   * for progressive rendering. Use -1 for the encoder
   * default, 0 for scanline order, 1 for center-first order.
   */
  JXL_ENC_OPTION_GROUP_ORDER = 12,
  JXL_ENC_OPTION_GROUP_ORDER = 13,

  /** Determines the horizontal position of center for the center-first group
   * order. Use -1 to automatically use the middle of the image, 0..xsize to
   * specifically set it.
   */
  JXL_ENC_OPTION_GROUP_ORDER_CENTER_X = 13,
  JXL_ENC_OPTION_GROUP_ORDER_CENTER_X = 14,

  /** Determines the center for the center-first group order. Use -1 to
   * automatically use the middle of the image, 0..ysize to specifically set it.
   */
  JXL_ENC_OPTION_GROUP_ORDER_CENTER_Y = 14,
  JXL_ENC_OPTION_GROUP_ORDER_CENTER_Y = 15,

  /** Enables or disables progressive encoding for modular mode. Use -1 for the
   * encoder default, 0 to disable, 1 to enable.
   */
  JXL_ENC_OPTION_RESPONSIVE = 15,
  JXL_ENC_OPTION_RESPONSIVE = 16,

  /** Set the progressive mode for the AC coefficients of VarDCT, using spectral
   * progression from the DCT coefficients. Use -1 for the encoder default, 0 to
   * disable, 1 to enable.
   */
  JXL_ENC_OPTION_PROGRESSIVE_AC = 16,
  JXL_ENC_OPTION_PROGRESSIVE_AC = 17,

  /** Set the progressive mode for the AC coefficients of VarDCT, using
   * quantization of the least significant bits. Use -1 for the encoder default,
   * 0 to disable, 1 to enable.
   */
  JXL_ENC_OPTION_QPROGRESSIVE_AC = 17,
  JXL_ENC_OPTION_QPROGRESSIVE_AC = 18,

  /** Set the progressive mode using lower-resolution DC images for VarDCT. Use
   * -1 for the encoder default, 0 to disable, 1 to have an extra 64x64 lower
   * resolution pass, 2 to have a 512x512 and 64x64 lower resolution pass.
   */
  JXL_ENC_OPTION_PROGRESSIVE_DC = 18,
  JXL_ENC_OPTION_PROGRESSIVE_DC = 19,

  /** Use Global channel palette if the amount of colors is smaller than this
   * percentage of range. Use 0-100 to set an explicit percentage, -1 to use the
   * encoder default. Used for modular encoding.
   */
  JXL_ENC_OPTION_CHANNEL_COLORS_PRE_TRANSFORM_PERCENT = 19,
  JXL_ENC_OPTION_CHANNEL_COLORS_GLOBAL_PERCENT = 20,

  /** Use Local channel palette if the amount of colors is smaller than this
   * percentage of range. Use 0-100 to set an explicit percentage, -1 to use the
   * encoder default. Used for modular encoding.
  /** Use Local (per-group) channel palette if the amount of colors is smaller
   * than this percentage of range. Use 0-100 to set an explicit percentage, -1
   * to use the encoder default. Used for modular encoding.
   */
  JXL_ENC_OPTION_CHANNEL_COLORS_PERCENT = 20,
  JXL_ENC_OPTION_CHANNEL_COLORS_GROUP_PERCENT = 21,

  /** Use color palette if amount of colors is smaller than or equal to this
   * amount, or -1 to use the encoder default. Used for modular encoding.
   */
  JXL_ENC_OPTION_PALETTE_COLORS = 21,
  JXL_ENC_OPTION_PALETTE_COLORS = 22,

  /** Enables or disables delta palette. Use -1 for the default (encoder
   * chooses), 0 to disable, 1 to enable. Used in modular mode.
   */
  JXL_ENC_OPTION_LOSSY_PALETTE = 22,
  JXL_ENC_OPTION_LOSSY_PALETTE = 23,

  /** Color space for modular encoding: 0=RGB, 1=YCoCg, 2-37=RCT, -1=default:
   * try several, depending on speed.
  /** Color transform for internal encoding: -1 = default, 0=XYB, 1=none (RGB),
   * 2=YCbCr. The XYB setting performs the forward XYB transform. None and
   * YCbCr both perform no transform, but YCbCr is used to indicate that the
   * encoded data losslessly represents YCbCr values.
   */
  JXL_ENC_OPTION_MODULAR_COLOR_SPACE = 23,
  JXL_ENC_OPTION_COLOR_TRANSFORM = 24,

  /** Color space for modular encoding: -1=default, 0-35=reverse color transform
   * index, e.g. index 0 = none, index 6 = YCoCg.
   * The default behavior is to try several, depending on the speed setting.
   */
  JXL_ENC_OPTION_MODULAR_COLOR_SPACE = 25,

  /** Group size for modular encoding: -1=default, 0=128, 1=256, 2=512, 3=1024.
   */
  JXL_ENC_OPTION_MODULAR_GROUP_SIZE = 24,
  JXL_ENC_OPTION_MODULAR_GROUP_SIZE = 26,

  /** Predictor for modular encoding. -1 = default, 0=zero, 1=left, 2=top,
   * 3=avg0, 4=select, 5=gradient, 6=weighted, 7=topright, 8=topleft,
   * 9=leftleft, 10=avg1, 11=avg2, 12=avg3, 13=toptop predictive average 14=mix
   * 5 and 6, 15=mix everything.
   */
  JXL_ENC_OPTION_MODULAR_PREDICTOR = 25,
  JXL_ENC_OPTION_MODULAR_PREDICTOR = 27,

  /** Fraction of pixels used to learn MA trees as a percentage. -1 = default,
   * 0 = no MA and fast decode, 50 = default value, 100 = all, values above
   * 100 are also permitted. Higher values use more encoder memory.
   */
  JXL_ENC_OPTION_MODULAR_MA_TREE_LEARNING_PERCENT = 28,

  /** Number of extra (previous-channel) MA tree properties to use. -1 =
   * default, 0-11 = valid values. Recommended values are in the range 0 to 3,
   * or 0 to amount of channels minus 1 (including all extra channels, and
   * excluding color channels when using VarDCT mode). Higher value gives slower
   * encoding and slower decoding.
   */
  JXL_ENC_OPTION_MODULAR_NB_PREV_CHANNELS = 29,

  /** Enable or disable CFL (chroma-from-luma) for lossless JPEG recompression.
   * -1 = default, 0 = disable CFL, 1 = enable CFL.
   */
  JXL_ENC_OPTION_JPEG_RECON_CFL = 30,

  /** Enum value not to be used as an option. This value is added to force the
   * C compiler to have the enum to take a known size.

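The renumbered options are set through JxlEncoderOptionsSetInteger, declared
further below in this header. A sketch, with values following the documented
conventions:

#include "jxl/encode.h"

void ConfigureOptions(JxlEncoderOptions* options) {
  // Emulate fairly strong photographic film noise (3200 per the doc above).
  JxlEncoderOptionsSetInteger(options, JXL_ENC_OPTION_PHOTON_NOISE, 3200);
  // Force modular mode, e.g. for lossless content.
  JxlEncoderOptionsSetInteger(options, JXL_ENC_OPTION_MODULAR, 1);
  // -1 leaves the choice of patches generation to the encoder.
  JxlEncoderOptionsSetInteger(options, JXL_ENC_OPTION_PATCHES, -1);
}
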
@@ -337,11 +375,22 @@ JXL_EXPORT JxlEncoderStatus JxlEncoderAddJPEGFrame(
 * JxlEncoderSetBasicInfo before JxlEncoderAddImageFrame.
 *
 * Currently only some data types for pixel formats are supported:
 * - JXL_TYPE_UINT8
 * - JXL_TYPE_UINT16
 * - JXL_TYPE_UINT8, with range 0..255
 * - JXL_TYPE_UINT16, with range 0..65535
 * - JXL_TYPE_FLOAT16, with nominal range 0..1
 * - JXL_TYPE_FLOAT, with nominal range 0..1
 *
 * Note: the sample data type in pixel_format is allowed to be different from
 * what is described in the JxlBasicInfo. The type in pixel_format describes the
 * format of the uncompressed pixel buffer. The bits_per_sample and
 * exponent_bits_per_sample in the JxlBasicInfo describe what will actually be
 * encoded in the JPEG XL codestream. For example, to encode a 12-bit image, you
 * would set bits_per_sample to 12, and you could use e.g. JXL_TYPE_UINT16
 * (where the values are rescaled to 16-bit, i.e. multiplied by 65535/4095) or
 * JXL_TYPE_FLOAT (where the values are rescaled to 0..1, i.e. multiplied
 * by 1.f/4095.f). While it is allowed, it is obviously not recommended to use a
 * pixel_format with lower precision than what is specified in the JxlBasicInfo.
 *
 * We support interleaved channels as described by the JxlPixelFormat:
 * - single-channel data, e.g. grayscale
 * - single-channel + alpha

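For the 12-bit example above, the rescaling amounts to the following (a
sketch; the helper name is illustrative):

#include <cstdint>

// Rescale a 12-bit sample to the full JXL_TYPE_UINT16 range, i.e. multiply by
// 65535/4095, before passing the buffer to JxlEncoderAddImageFrame with
// bits_per_sample = 12 in the JxlBasicInfo.
uint16_t Rescale12To16(uint16_t sample12) {
  return static_cast<uint16_t>((uint32_t{sample12} * 65535u) / 4095u);
}
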
@@ -357,6 +406,10 @@ JXL_EXPORT JxlEncoderStatus JxlEncoderAddJPEGFrame(
 * JxlEncoderSetICCProfile. If false, the pixels are assumed to be nonlinear
 * sRGB for integer data types (JXL_TYPE_UINT8, JXL_TYPE_UINT16), and linear
 * sRGB for floating point data types (JXL_TYPE_FLOAT16, JXL_TYPE_FLOAT).
 * Sample values in floating-point pixel formats are allowed to be outside the
 * nominal range, e.g. to represent out-of-sRGB-gamut colors in the
 * uses_original_profile=false case. They are however not allowed to be NaN or
 * +-infinity.
 *
 * @param options set of encoder options to use when encoding the frame.
 * @param pixel_format format for pixels. Object owned by the caller and its

|
|||
const void* buffer, size_t size, uint32_t index);
|
||||
|
||||
/** Adds a metadata box to the file format. JxlEncoderProcessOutput must be used
|
||||
* to effectively write the box to the output. @ref JxlEncoderUseContainer must
|
||||
* to effectively write the box to the output. @ref JxlEncoderUseBoxes must
|
||||
* be enabled before using this function.
|
||||
*
|
||||
* Background information about the container format and boxes follows here:
|
||||
|
@@ -488,20 +541,14 @@ JXL_EXPORT JxlEncoderStatus JxlEncoderSetExtraChannelBuffer(
 * multiple boxes of type jxlp are used. This is handled by the encoder when
 * needed.
 *
 * For now metadata boxes can only be added before or after the codestream with
 * all frames, so using JxlEncoderAddBox is only possible before the first
 * JxlEncoderAddImageFrame call, and/or after the last JxlEncoderAddImageFrame
 * call and JxlEncoderCloseInput. Support for adding boxes in-between the
 * codestream, and/or in-between image frames may be added later, and would
 * cause the encoder to use jxlp boxes for the codestream.
 *
 * @param enc encoder object.
 * @param type the box type, e.g. "Exif" for EXIF metadata, "XML " for XMP or
 * IPTC metadata, "jumb" for JUMBF metadata.
 * @param contents the full contents of the box, for example EXIF
 * data. For an "Exif" box, the EXIF data must be prepended by a 4-byte tiff
 * header offset, which may be 4 zero-bytes. The ISO BMFF box header must not
 * be included, only the contents.
 * be included, only the contents. Owned by the caller and its contents are
 * copied internally.
 * @param size size of the box contents.
 * @param compress_box Whether to compress this box as a "brob" box. Requires
 * Brotli support.

@@ -509,18 +556,62 @@ JXL_EXPORT JxlEncoderStatus JxlEncoderSetExtraChannelBuffer(
 * using this function without JxlEncoderUseContainer, or adding a box type
 * that would result in an invalid file format.
 */
JXL_EXPORT JxlEncoderStatus JxlEncoderAddBox(JxlEncoder* enc, JxlBoxType type,
JXL_EXPORT JxlEncoderStatus JxlEncoderAddBox(JxlEncoder* enc,
                                             const JxlBoxType type,
                                             const uint8_t* contents,
                                             size_t size,
                                             JXL_BOOL compress_box);

/**
 * Declares that this encoder will not encode any further frames. Further
 * metadata boxes may still be added.
 * Indicates the intention to add metadata boxes. This allows @ref
 * JxlEncoderAddBox to be used. When using this function, it is required
 * to use @ref JxlEncoderCloseBoxes at the end.
 *
 * Must be called between JxlEncoderAddImageFrame/JPEGFrame of the last frame
 * By default the encoder assumes no metadata boxes will be added.
 *
 * This setting can only be set at the beginning, before encoding starts.
 *
 * @param enc encoder object.
 */
JXL_EXPORT JxlEncoderStatus JxlEncoderUseBoxes(JxlEncoder* enc);

/**
 * Declares that no further boxes will be added with @ref JxlEncoderAddBox.
 * This function must be called after the last box is added so the encoder knows
 * the stream will be finished. It is not necessary to use this function if
 * @ref JxlEncoderUseBoxes is not used. Further frames may still be added.
 *
 * Must be called between JxlEncoderAddBox of the last box
 * and the next call to JxlEncoderProcessOutput, or JxlEncoderProcessOutput
 * won't output the last frame correctly.
 * won't output the last box correctly.
 *
 * NOTE: if you don't need to close frames and boxes at separate times, you can
 * use @ref JxlEncoderCloseInput instead to close both at once.
 *
 * @param enc encoder object.
 */
JXL_EXPORT void JxlEncoderCloseBoxes(JxlEncoder* enc);

/**
 * Declares that no frames will be added and @ref JxlEncoderAddImageFrame and
 * @ref JxlEncoderAddJPEGFrame won't be called anymore. Further metadata boxes
 * may still be added.
 *
 * NOTE: if you don't need to close frames and boxes at separate times, you can
 * use @ref JxlEncoderCloseInput instead to close both at once.
 *
 * @param enc encoder object.
 */
JXL_EXPORT void JxlEncoderCloseFrames(JxlEncoder* enc);

/**
 * Closes any input to the encoder, equivalent to calling JxlEncoderCloseFrames
 * as well as calling JxlEncoderCloseBoxes if needed. No further input of any
 * kind may be given to the encoder, but further @ref JxlEncoderProcessOutput
 * calls should be done to create the final output.
 *
 * The requirements of both @ref JxlEncoderCloseFrames and @ref
 * JxlEncoderCloseBoxes apply to this function.
 *
 * @param enc encoder object.
 */
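Taken together, the functions documented above suggest the following call order; a minimal sketch, assuming `enc` is already configured and that a single uncompressed "Exif" box is added after the frames:

    /* Sketch only: error handling collapsed into early returns. */
    static JxlEncoderStatus encode_with_exif(JxlEncoder* enc,
                                             const uint8_t* exif,
                                             size_t exif_size) {
      /* Must be called at the beginning, before encoding starts. */
      if (JxlEncoderUseBoxes(enc) != JXL_ENC_SUCCESS) return JXL_ENC_ERROR;
      /* ... JxlEncoderAddImageFrame calls for all frames go here ... */
      JxlEncoderCloseFrames(enc); /* no more frames will be added */
      JxlBoxType type = {'E', 'x', 'i', 'f'};
      if (JxlEncoderAddBox(enc, type, exif, exif_size,
                           /*compress_box=*/JXL_FALSE) != JXL_ENC_SUCCESS) {
        return JXL_ENC_ERROR;
      }
      JxlEncoderCloseBoxes(enc); /* no more boxes; drain with ProcessOutput */
      return JXL_ENC_SUCCESS;
    }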
@@ -618,7 +709,8 @@ JXL_EXPORT JxlEncoderStatus JxlEncoderSetExtraChannelInfo(
 * @param enc encoder object
 * @param index index of the extra channel to set.
 * @param name buffer with the name of the extra channel.
 * @param size size of the name buffer in bytes.
 * @param size size of the name buffer in bytes, not counting the terminating
 * character.
 * @return JXL_ENC_SUCCESS on success, JXL_ENC_ERROR on error
 */
JXL_EXPORT JxlEncoderStatus JxlEncoderSetExtraChannelName(JxlEncoder* enc,
@@ -642,16 +734,13 @@ JXL_EXPORT JxlEncoderStatus JxlEncoderSetExtraChannelName(JxlEncoder* enc,
JXL_EXPORT JxlEncoderStatus JxlEncoderOptionsSetInteger(
    JxlEncoderOptions* options, JxlEncoderOptionId option, int32_t value);

/** Indicates the encoder should use the box-based JPEG XL container format
 * (BMFF) instead of outputting the codestream bytes directly. Both with and
 * without container are valid JPEG XL files, but the container is necessary
 * when metadata, level 10 features or JPEG reconstruction is used.
/** Forces the encoder to use the box-based container format (BMFF) even
 * when not necessary.
 *
 * If enabled, the encoder always uses the container format, even if not
 * necessary. If disabled, the encoder will still use the container format if
 * required (such as for JPEG metadata @ref JxlEncoderStoreJPEGMetadata).
 *
 * This setting must be explicitly enabled before using @ref JxlEncoderAddBox.
 * When using @ref JxlEncoderUseBoxes, @ref JxlEncoderStoreJPEGMetadata or @ref
 * JxlEncoderSetCodestreamLevel with level 10, the encoder will automatically
 * also use the container format; it is not necessary to use
 * JxlEncoderUseContainer for those use cases.
 *
 * By default this setting is disabled.
 *
@@ -659,7 +748,7 @@ JXL_EXPORT JxlEncoderStatus JxlEncoderOptionsSetInteger(
 *
 * @param enc encoder object.
 * @param use_container true if the encoder should always output the JPEG XL
 * container format.
 * container format, false to only output it when necessary.
 * @return JXL_ENC_SUCCESS if the operation was successful, JXL_ENC_ERROR
 * otherwise.
 */
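As a usage note, forcing the container is a one-call setup step; a minimal sketch, assuming a freshly created encoder:

    /* Sketch: request the BMFF container even when not strictly needed. */
    static int force_container(JxlEncoder* enc) {
      return JxlEncoderUseContainer(enc, JXL_TRUE) == JXL_ENC_SUCCESS;
    }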
@@ -715,8 +804,8 @@ JXL_EXPORT JxlEncoderStatus JxlEncoderSetCodestreamLevel(JxlEncoder* enc,
 * Enables lossless encoding.
 *
 * This is not an option like the others in itself, but rather while enabled it
 * overrides a set of existing options (such as distance and modular mode) that
 * enables bit-for-bit lossless encoding.
 * overrides a set of existing options (such as distance, modular mode and
 * color transform) that enables bit-for-bit lossless encoding.
 *
 * When disabled, those options are not overridden, but since those options
 * could still have been manually set to a combination that operates losslessly,
@@ -110,6 +110,15 @@ set(JPEGXL_INTERNAL_SOURCES_DEC
  jxl/entropy_coder.h
  jxl/epf.cc
  jxl/epf.h
  jxl/fast_dct-inl.h
  jxl/fast_dct.cc
  jxl/fast_dct.h
  jxl/fast_dct128-inl.h
  jxl/fast_dct16-inl.h
  jxl/fast_dct256-inl.h
  jxl/fast_dct32-inl.h
  jxl/fast_dct64-inl.h
  jxl/fast_dct8-inl.h
  jxl/fast_math-inl.h
  jxl/field_encodings.h
  jxl/fields.cc
@@ -59,23 +59,6 @@ void AuxOut::Print(size_t num_inputs) const {
  }
}

void AuxOut::DumpCoeffImage(const char* label,
                            const Image3S& coeff_image) const {
  JXL_ASSERT(coeff_image.xsize() % 64 == 0);
  Image3S reshuffled(coeff_image.xsize() / 8, coeff_image.ysize() * 8);
  for (size_t c = 0; c < 3; c++) {
    for (size_t y = 0; y < coeff_image.ysize(); y++) {
      for (size_t x = 0; x < coeff_image.xsize(); x += 64) {
        for (size_t i = 0; i < 64; i++) {
          reshuffled.PlaneRow(c, 8 * y + i / 8)[x / 8 + i % 8] =
              coeff_image.PlaneRow(c, y)[x + i];
        }
      }
    }
  }
  DumpImage(label, reshuffled);
}

void ReclaimAndCharge(BitWriter* JXL_RESTRICT writer,
                      BitWriter::Allotment* JXL_RESTRICT allotment,
                      size_t layer, AuxOut* JXL_RESTRICT aux_out) {
@@ -251,12 +251,6 @@ struct AuxOut {
    DumpImage(label, normalized);
  }

  // This dumps coefficients as a 16-bit PNG with coefficients of a block placed
  // in the area that would contain that block in a normal image. To view the
  // resulting image manually, rescale intensities by using:
  // $ convert -auto-level IMAGE.PNG - | display -
  void DumpCoeffImage(const char* label, const Image3S& coeff_image) const;

  void SetInspectorImage3F(const jxl::InspectorImage3F& inspector) {
    inspector_image3f_ = inspector;
  }
@@ -233,5 +233,27 @@ TEST_F(ColorManagementTest, D2700ToSRGB) {
                          FloatNear(0.601, 1e-3)));
}

TEST_F(ColorManagementTest, P3HLGTo2020HLG) {
  ColorEncoding p3_hlg;
  p3_hlg.SetColorSpace(ColorSpace::kRGB);
  p3_hlg.white_point = WhitePoint::kD65;
  p3_hlg.primaries = Primaries::kP3;
  p3_hlg.tf.SetTransferFunction(TransferFunction::kHLG);
  ASSERT_TRUE(p3_hlg.CreateICC());

  ColorEncoding rec2020_hlg = p3_hlg;
  rec2020_hlg.primaries = Primaries::k2100;
  ASSERT_TRUE(rec2020_hlg.CreateICC());

  ColorSpaceTransform transform;
  ASSERT_TRUE(transform.Init(p3_hlg, rec2020_hlg, 1000, 1, 1));
  const float p3_hlg_values[3] = {0., 0.75, 0.};
  float rec2020_hlg_values[3];
  DoColorSpaceTransform(&transform, 0, p3_hlg_values, rec2020_hlg_values);
  EXPECT_THAT(rec2020_hlg_values,
              ElementsAre(FloatNear(0.3973, 1e-4), FloatNear(0.7382, 1e-4),
                          FloatNear(0.1183, 1e-4)));
}

}  // namespace
}  // namespace jxl
@@ -153,8 +153,8 @@ Status DecodeFrame(const DecompressParams& dparams,
                                   true));

  // Handling of progressive decoding.
  const FrameHeader& frame_header = frame_decoder.GetFrameHeader();
  {
    const FrameHeader& frame_header = frame_decoder.GetFrameHeader();
    size_t max_passes = dparams.max_passes;
    size_t max_downsampling = std::max(
        dparams.max_downsampling >> (frame_header.dc_level * 3), size_t(1));
@@ -177,6 +177,7 @@ Status DecodeFrame(const DecompressParams& dparams,
    frame_decoder.SetMaxPasses(max_passes);
  }
  frame_decoder.SetRenderSpotcolors(dparams.render_spotcolors);
  frame_decoder.SetCoalescing(dparams.coalescing);

  size_t processed_bytes = reader->TotalBitsConsumed() / kBitsPerByte;
@@ -192,9 +193,18 @@ Status DecodeFrame(const DecompressParams& dparams,
      size_t e = b + frame_decoder.SectionSizes()[i];
      bytes_to_skip += e - b;
      size_t pos = reader->TotalBitsConsumed() / kBitsPerByte;
      if (pos + e <= reader->TotalBytes()) {
        auto br = make_unique<BitReader>(
            Span<const uint8_t>(reader->FirstByte() + b + pos, e - b));
      if (pos + (dparams.allow_more_progressive_steps &&
                         (i == 0 ||
                          frame_header.encoding == FrameEncoding::kModular)
                     ? b
                     : e) <=
              reader->TotalBytes() ||
          (i == 0 && dparams.allow_more_progressive_steps)) {
        auto br = make_unique<BitReader>(Span<const uint8_t>(
            reader->FirstByte() + b + pos,
            (pos + b > reader->TotalBytes()
                 ? 0
                 : std::min(reader->TotalBytes() - pos - b, e - b))));
        section_info.emplace_back(FrameDecoder::SectionInfo{br.get(), i});
        section_closers.emplace_back(
            make_unique<BitReaderScopedCloser>(br.get(), &close_ok));
@@ -249,7 +259,23 @@ Status FrameDecoder::InitFrame(BitReader* JXL_RESTRICT br, ImageBundle* decoded,
  dec_state_->shared_storage.matrices = DequantMatrices();

  frame_header_.nonserialized_is_preview = is_preview;
  JXL_RETURN_IF_ERROR(DecodeFrameHeader(br, &frame_header_));
  size_t pos = br->TotalBitsConsumed() / kBitsPerByte;
  Status have_frameheader =
      br->TotalBytes() > pos && DecodeFrameHeader(br, &frame_header_);
  JXL_RETURN_IF_ERROR(have_frameheader || allow_partial_frames);
  if (!have_frameheader) {
    if (dec_state_->shared_storage.dc_frames[0].xsize() > 0) {
      // If we have a (partial) DC frame available, but we don't have the next
      // frame header (so allow_partial_frames is true), then we'll assume the
      // next frame uses that DC frame (which may not be true, e.g. there might
      // first be a ReferenceOnly patch frame, but it's reasonable to assume
      // that the DC frame is a good progressive preview)
      frame_header_.flags |= FrameHeader::kUseDcFrame;
      frame_header_.encoding = FrameEncoding::kVarDCT;
      frame_header_.dc_level = 0;
    } else
      return JXL_FAILURE("Couldn't read frame header");
  }
  frame_dim_ = frame_header_.ToFrameDimensions();

  const size_t num_passes = frame_header_.passes.num_passes;
@@ -271,7 +297,8 @@ Status FrameDecoder::InitFrame(BitReader* JXL_RESTRICT br, ImageBundle* decoded,
  const size_t toc_entries = NumTocEntries(num_groups, frame_dim_.num_dc_groups,
                                           num_passes, has_ac_global);
  JXL_RETURN_IF_ERROR(ReadGroupOffsets(toc_entries, br, &section_offsets_,
                                       &section_sizes_, &groups_total_size));
                                       &section_sizes_, &groups_total_size) ||
                      allow_partial_frames);

  JXL_DASSERT((br->TotalBitsConsumed() % kBitsPerByte) == 0);
  const size_t group_codes_begin = br->TotalBitsConsumed() / kBitsPerByte;
@@ -371,11 +398,14 @@ Status FrameDecoder::ProcessDCGlobal(BitReader* br) {
  if (shared.frame_header.flags & FrameHeader::kNoise) {
    JXL_RETURN_IF_ERROR(DecodeNoise(br, &shared.image_features.noise_params));
  }
  if (!allow_partial_dc_global_ ||
      br->TotalBitsConsumed() < br->TotalBytes() * kBitsPerByte) {
    JXL_RETURN_IF_ERROR(dec_state_->shared_storage.matrices.DecodeDC(br));

  JXL_RETURN_IF_ERROR(dec_state_->shared_storage.matrices.DecodeDC(br));
  if (frame_header_.encoding == FrameEncoding::kVarDCT) {
    JXL_RETURN_IF_ERROR(
        jxl::DecodeGlobalDCInfo(br, decoded_->IsJPEG(), dec_state_, pool_));
    if (frame_header_.encoding == FrameEncoding::kVarDCT) {
      JXL_RETURN_IF_ERROR(
          jxl::DecodeGlobalDCInfo(br, decoded_->IsJPEG(), dec_state_, pool_));
    }
  }
  // Splines' draw cache uses the color correlation map.
  if (shared.frame_header.flags & FrameHeader::kSplines) {
@@ -406,7 +436,7 @@ Status FrameDecoder::ProcessDCGroup(size_t dc_group_id, BitReader* br) {
                     frame_dim_.dc_group_dim, frame_dim_.dc_group_dim);
    JXL_RETURN_IF_ERROR(modular_frame_decoder_.DecodeGroup(
        mrect, br, 3, 1000, ModularStreamId::ModularDC(dc_group_id),
        /*zerofill=*/false, nullptr, nullptr));
        /*zerofill=*/false, nullptr, nullptr, allow_partial_frames_));
  if (frame_header_.encoding == FrameEncoding::kVarDCT) {
    JXL_RETURN_IF_ERROR(
        modular_frame_decoder_.DecodeAcMetadata(dc_group_id, br, dec_state_));
|
@ -608,13 +638,13 @@ Status FrameDecoder::ProcessACGroup(size_t ac_group_id,
|
|||
JXL_RETURN_IF_ERROR(modular_frame_decoder_.DecodeGroup(
|
||||
mrect, br[i - decoded_passes_per_ac_group_[ac_group_id]], minShift,
|
||||
maxShift, ModularStreamId::ModularAC(ac_group_id, i),
|
||||
/*zerofill=*/false, dec_state_, decoded_));
|
||||
/*zerofill=*/false, dec_state_, decoded_, allow_partial_frames_));
|
||||
} else if (i >= decoded_passes_per_ac_group_[ac_group_id] + num_passes &&
|
||||
force_draw) {
|
||||
JXL_RETURN_IF_ERROR(modular_frame_decoder_.DecodeGroup(
|
||||
mrect, nullptr, minShift, maxShift,
|
||||
ModularStreamId::ModularAC(ac_group_id, i), /*zerofill=*/true,
|
||||
dec_state_, decoded_));
|
||||
dec_state_, decoded_, allow_partial_frames_));
|
||||
}
|
||||
}
|
||||
decoded_passes_per_ac_group_[ac_group_id] += num_passes;
|
||||
|
@@ -846,7 +876,7 @@ Status FrameDecoder::Flush() {
                          dec_state_, pool_, decoded_, is_finalized_));
  JXL_RETURN_IF_ERROR(FinalizeFrameDecoding(decoded_, dec_state_, pool_,
                                            /*force_fir=*/false,
                                            /*skip_blending=*/false,
                                            /*skip_blending=*/!coalescing_,
                                            /*move_ec=*/is_finalized_));

  num_renders_++;
@@ -941,7 +971,8 @@ Status FrameDecoder::FinalizeFrame() {

  JXL_RETURN_IF_ERROR(Flush());

  if (dec_state_->shared->frame_header.CanBeReferenced()) {
  if (dec_state_->shared->frame_header.CanBeReferenced() &&
      (frame_header_.frame_type != kRegularFrame || coalescing_)) {
    size_t id = dec_state_->shared->frame_header.save_as_reference;
    auto& reference_frame = dec_state_->shared_storage.reference_frames[id];
    if (dec_state_->pre_color_transform_frame.xsize() == 0) {
@@ -949,7 +980,7 @@ Status FrameDecoder::FinalizeFrame() {
    } else {
      reference_frame.storage = ImageBundle(decoded_->metadata());
      reference_frame.storage.SetFromImage(
          std::move(dec_state_->pre_color_transform_frame),
          CopyImage(dec_state_->pre_color_transform_frame),
          decoded_->c_current());
      if (decoded_->HasExtraChannels()) {
        const std::vector<ImageF>* ecs = &dec_state_->pre_color_transform_ec;
@@ -989,8 +1020,8 @@ Status FrameDecoder::FinalizeFrame() {
  // coalesced frame of size equal to image dimensions. Other frames are not
  // blended, thus their final size is the size that was defined in the
  // frame_header.
  if (frame_header_.frame_type == kRegularFrame ||
      frame_header_.frame_type == kSkipProgressive) {
  if (coalescing_ && (frame_header_.frame_type == kRegularFrame ||
                      frame_header_.frame_type == kSkipProgressive)) {
    decoded_->ShrinkTo(
        dec_state_->shared->frame_header.nonserialized_metadata->xsize(),
        dec_state_->shared->frame_header.nonserialized_metadata->ysize());
@@ -63,6 +63,7 @@ class FrameDecoder {
    constraints_ = constraints;
  }
  void SetRenderSpotcolors(bool rsc) { render_spotcolors_ = rsc; }
  void SetCoalescing(bool c) { coalescing_ = c; }

  // Read FrameHeader and table of contents from the given BitReader.
  // Also checks frame dimensions for their limits, and sets the output
@@ -254,6 +255,7 @@ class FrameDecoder {
  bool allow_partial_frames_;
  bool allow_partial_dc_global_;
  bool render_spotcolors_ = true;
  bool coalescing_ = true;

  std::vector<uint8_t> processed_section_;
  std::vector<uint8_t> decoded_passes_per_ac_group_;
@@ -160,17 +160,21 @@ Status ModularFrameDecoder::DecodeGlobalInfo(BitReader* reader,
    nb_chans = 1;
  }
  do_color = decode_color;
  if (!do_color) nb_chans = 0;
  size_t nb_extra = metadata.extra_channel_info.size();
  bool has_tree = reader->ReadBits(1);
  if (has_tree) {
    size_t tree_size_limit = std::min(
        static_cast<size_t>(1 << 22),
        1024 + frame_dim.xsize * frame_dim.ysize * (nb_chans + nb_extra) / 16);
    JXL_RETURN_IF_ERROR(DecodeTree(reader, &tree, tree_size_limit));
    JXL_RETURN_IF_ERROR(
        DecodeHistograms(reader, (tree.size() + 1) / 2, &code, &context_map));
  if (!allow_truncated_group ||
      reader->TotalBitsConsumed() < reader->TotalBytes() * kBitsPerByte) {
    if (has_tree) {
      size_t tree_size_limit =
          std::min(static_cast<size_t>(1 << 22),
                   1024 + frame_dim.xsize * frame_dim.ysize *
                              (nb_chans + nb_extra) / 16);
      JXL_RETURN_IF_ERROR(DecodeTree(reader, &tree, tree_size_limit));
      JXL_RETURN_IF_ERROR(
          DecodeHistograms(reader, (tree.size() + 1) / 2, &code, &context_map));
    }
  }
  if (!do_color) nb_chans = 0;

  bool fp = metadata.bit_depth.floating_point_sample;
@@ -257,12 +261,10 @@ void ModularFrameDecoder::MaybeDropFullImage() {
  }
}

Status ModularFrameDecoder::DecodeGroup(const Rect& rect, BitReader* reader,
                                        int minShift, int maxShift,
                                        const ModularStreamId& stream,
                                        bool zerofill,
                                        PassesDecoderState* dec_state,
                                        ImageBundle* output) {
Status ModularFrameDecoder::DecodeGroup(
    const Rect& rect, BitReader* reader, int minShift, int maxShift,
    const ModularStreamId& stream, bool zerofill, PassesDecoderState* dec_state,
    ImageBundle* output, bool allow_truncated) {
  JXL_DASSERT(stream.kind == ModularStreamId::kModularDC ||
              stream.kind == ModularStreamId::kModularAC);
  const size_t xsize = rect.xsize();
@@ -302,9 +304,11 @@ Status ModularFrameDecoder::DecodeGroup(const Rect& rect, BitReader* reader,
  if (gi.channel.empty()) return true;
  ModularOptions options;
  if (!zerofill) {
    if (!ModularGenericDecompress(
            reader, gi, /*header=*/nullptr, stream.ID(frame_dim), &options,
            /*undo_transforms=*/true, &tree, &code, &context_map)) {
    if (!ModularGenericDecompress(reader, gi, /*header=*/nullptr,
                                  stream.ID(frame_dim), &options,
                                  /*undo_transforms=*/true, &tree, &code,
                                  &context_map, allow_truncated) &&
        !allow_truncated) {
      return JXL_FAILURE("Failed to decode modular group");
    }
  }
@@ -91,7 +91,8 @@ class ModularFrameDecoder {
                          bool allow_truncated_group);
  Status DecodeGroup(const Rect& rect, BitReader* reader, int minShift,
                     int maxShift, const ModularStreamId& stream, bool zerofill,
                     PassesDecoderState* dec_state, ImageBundle* output);
                     PassesDecoderState* dec_state, ImageBundle* output,
                     bool allow_truncated);
  // Decodes a VarDCT DC group (`group_id`) from the given `reader`.
  Status DecodeVarDCTDC(size_t group_id, BitReader* reader,
                        PassesDecoderState* dec_state);
@@ -26,6 +26,8 @@ struct DecompressParams {
  bool keep_dct = false;
  // If true, render spot colors (otherwise only returned as extra channels)
  bool render_spotcolors = true;
  // If true, coalesce frames (otherwise return unblended frames)
  bool coalescing = true;

  // These cannot be kOn because they need encoder support.
  Override preview = Override::kDefault;
@@ -114,8 +114,8 @@ Status PatchDictionary::Decode(BitReader* br, size_t xsize, size_t ysize,
                         " > %" PRIuS,
                         pos.y, ref_pos.ysize, ysize);
    }
    for (size_t i = 0; i < shared_->metadata->m.extra_channel_info.size() + 1;
         i++) {
    for (size_t j = 0; j < shared_->metadata->m.extra_channel_info.size() + 1;
         j++) {
      uint32_t blend_mode = read_num(kPatchBlendModeContext);
      if (blend_mode >= uint32_t(PatchBlendMode::kNumBlendModes)) {
        return JXL_FAILURE("Invalid patch blend mode: %u", blend_mode);
@@ -125,7 +125,7 @@ Status PatchDictionary::Decode(BitReader* br, size_t xsize, size_t ysize,
      if (UsesAlpha(info.mode)) {
        *uses_extra_channels = true;
      }
      if (info.mode != PatchBlendMode::kNone && i > 0) {
      if (info.mode != PatchBlendMode::kNone && j > 0) {
        *uses_extra_channels = true;
      }
      if (UsesAlpha(info.mode) &&
@@ -357,8 +357,8 @@ void DoYCbCrUpsampling(size_t hs, size_t vs, ImageF* plane_in, const Rect& rect,
        Store(left, d, out + x);
        Store(right, d, out + x + 1);
#else
        Store(InterleaveLower(left, right), d, out + x);
        Store(InterleaveUpper(left, right), d, out + x + Lanes(d));
        Store(InterleaveLower(d, left, right), d, out + x);
        Store(InterleaveUpper(d, left, right), d, out + x + Lanes(d));
#endif
      }
    }
@@ -176,8 +176,8 @@ void Upsample(const ImageF& src, const Rect& src_rect, ImageF* dst,
          min = Min(LoadU(df, raw_min_row + sx + fx), min);
          max = Max(LoadU(df, raw_max_row + sx + fx), max);
        }
        min = MinOfLanes(min);
        max = MaxOfLanes(max);
        min = MinOfLanes(df, min);
        max = MaxOfLanes(df, max);
        for (size_t lx = 0; lx < N; lx += V) {
          StoreU(min, df, min_row + N * sx + lx);
          StoreU(max, df, max_row + N * sx + lx);
@@ -175,10 +175,15 @@ size_t BitsPerChannel(JxlDataType data_type) {
}

enum class DecoderStage : uint32_t {
  kInited,    // Decoder created, no JxlDecoderProcessInput called yet
  kStarted,   // Running JxlDecoderProcessInput calls
  kFinished,  // Codestream done, but other boxes could still occur
  kError,     // Error occurred, decoder object no longer usable
  kInited,              // Decoder created, no JxlDecoderProcessInput called yet
  kStarted,             // Running JxlDecoderProcessInput calls
  kCodestreamFinished,  // Codestream done, but other boxes could still occur.
                        // This stage can also occur before having seen the
                        // entire codestream if the user didn't subscribe to any
                        // codestream events at all, e.g. only to box events,
                        // or, the user only subscribed to basic info, and only
                        // the header of the codestream was parsed.
  kError,               // Error occurred, decoder object no longer usable
};

enum class FrameStage : uint32_t {
@@ -403,10 +408,11 @@ struct JxlDecoderStruct {
  // Status of progression, internal.
  bool got_signature;
  bool first_codestream_seen;
  // Indicates we know that we've seen the last codestream, however this is not
  // guaranteed to be true for the last box because a jxl file may have multiple
  // "jxlp" boxes and it is possible (and permitted) that the last one is not a
  // final box that uses size 0 to indicate the end.
  // Indicates we know that we've seen the last codestream box: either this
  // was a jxlc box, or a jxlp box that has its index indicated as last by
  // having its most significant bit set, or no boxes are used at all. This
  // does not indicate the full codestream has already been seen, only the
  // last box of it has been initiated.
  bool last_codestream_seen;
  bool got_basic_info;
  size_t header_except_icc_bits = 0;  // To skip everything before ICC.
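The "most significant bit marks the last jxlp box" convention in the comment above can be illustrated with a small sketch; this is not from the patch, and it only decodes the 4-byte big-endian index field of a jxlp box:

    #include <stdint.h>

    /* Illustrative: returns nonzero if this jxlp index flags the final box. */
    static int jxlp_index_is_last(const uint8_t index_be[4]) {
      uint32_t index = ((uint32_t)index_be[0] << 24) |
                       ((uint32_t)index_be[1] << 16) |
                       ((uint32_t)index_be[2] << 8) |
                       (uint32_t)index_be[3];
      return (index & 0x80000000u) != 0;
    }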
@@ -453,6 +459,7 @@ struct JxlDecoderStruct {
  // Settings
  bool keep_orientation;
  bool render_spotcolors;
  bool coalescing;

  // Bitfield, for which informative events (JXL_DEC_BASIC_INFO, etc...) the
  // decoder returns a status. By default, do not return for any of the events,
@@ -594,6 +601,17 @@ struct JxlDecoderStruct {
    avail_in -= size;
    file_pos += size;
  }

  // Whether the decoder can use more codestream input for a purpose it needs.
  // This returns false if the user didn't subscribe to any events that
  // require the codestream (e.g. only subscribed to metadata boxes), or all
  // parts of the codestream that are subscribed to (e.g. only basic info) have
  // already occurred.
  bool CanUseMoreCodestreamInput() const {
    // The decoder can set this to finished early if all relevant events were
    // processed, so this check works.
    return stage != DecoderStage::kCodestreamFinished;
  }
};

// TODO(zond): Make this depend on the data loaded into the decoder.
|
@ -687,6 +705,7 @@ void JxlDecoderReset(JxlDecoder* dec) {
|
|||
dec->thread_pool.reset();
|
||||
dec->keep_orientation = false;
|
||||
dec->render_spotcolors = true;
|
||||
dec->coalescing = true;
|
||||
dec->orig_events_wanted = 0;
|
||||
dec->frame_references.clear();
|
||||
dec->frame_saved_as.clear();
|
||||
|
@@ -797,6 +816,14 @@ JxlDecoderStatus JxlDecoderSetRenderSpotcolors(JxlDecoder* dec,
  return JXL_DEC_SUCCESS;
}

JxlDecoderStatus JxlDecoderSetCoalescing(JxlDecoder* dec, JXL_BOOL coalescing) {
  if (dec->stage != DecoderStage::kInited) {
    return JXL_API_ERROR("Must set coalescing option before starting");
  }
  dec->coalescing = !!coalescing;
  return JXL_DEC_SUCCESS;
}

namespace jxl {
namespace {
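Because JxlDecoderSetCoalescing only succeeds in the kInited stage, it belongs immediately after decoder creation; a minimal sketch:

    /* Sketch: create a decoder that returns unblended layers. */
    static JxlDecoder* create_layer_decoder(void) {
      JxlDecoder* dec = JxlDecoderCreate(NULL);
      if (dec != NULL &&
          JxlDecoderSetCoalescing(dec, JXL_FALSE) != JXL_DEC_SUCCESS) {
        /* Defensive: SetCoalescing only succeeds before decoding starts. */
        JxlDecoderDestroy(dec);
        return NULL;
      }
      return dec;
    }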
@@ -1171,6 +1198,7 @@ JxlDecoderStatus JxlDecoderProcessCodestream(JxlDecoder* dec, const uint8_t* in,
      jxl::DecompressParams dparams;
      dparams.preview = want_preview ? jxl::Override::kOn : jxl::Override::kOff;
      dparams.render_spotcolors = dec->render_spotcolors;
      dparams.coalescing = true;
      jxl::ImageBundle ib(&dec->metadata.m);
      PassesDecoderState preview_dec_state;
      JXL_API_RETURN_IF_ERROR(preview_dec_state.output_encoding_info.Set(
@@ -1236,6 +1264,10 @@ JxlDecoderStatus JxlDecoderProcessCodestream(JxlDecoder* dec, const uint8_t* in,
      // is last of current still
      dec->is_last_of_still =
          dec->is_last_total || dec->frame_header->animation_frame.duration > 0;
      // is kRegularFrame and coalescing is disabled
      dec->is_last_of_still |=
          (!dec->coalescing &&
           dec->frame_header->frame_type == FrameType::kRegularFrame);

      const size_t internal_frame_index = dec->internal_frames;
      const size_t external_frame_index = dec->external_frames;
@@ -1319,6 +1351,7 @@ JxlDecoderStatus JxlDecoderProcessCodestream(JxlDecoder* dec, const uint8_t* in,
        dec->frame_dec.reset(new FrameDecoder(
            dec->passes_state.get(), dec->metadata, dec->thread_pool.get()));
        dec->frame_dec->SetRenderSpotcolors(dec->render_spotcolors);
        dec->frame_dec->SetCoalescing(dec->coalescing);

        // If JPEG reconstruction is wanted and possible, set the jpeg_data of
        // the ImageBundle.
@@ -1516,7 +1549,7 @@ JxlDecoderStatus JxlDecoderProcessCodestream(JxlDecoder* dec, const uint8_t* in,
    }
  }

  dec->stage = DecoderStage::kFinished;
  dec->stage = DecoderStage::kCodestreamFinished;
  // Return success, this means there is nothing more to do.
  return JXL_DEC_SUCCESS;
}
@@ -1710,13 +1743,14 @@ static JxlDecoderStatus HandleBoxes(JxlDecoder* dec) {

    if (dec->box_stage == BoxStage::kHeader) {
      if (!dec->have_container) {
        if (dec->stage == DecoderStage::kFinished) return JXL_DEC_SUCCESS;
        if (dec->stage == DecoderStage::kCodestreamFinished)
          return JXL_DEC_SUCCESS;
        dec->box_stage = BoxStage::kCodestream;
        dec->box_contents_unbounded = true;
        continue;
      }
      if (dec->avail_in == 0) {
        if (dec->stage != DecoderStage::kFinished) {
        if (dec->stage != DecoderStage::kCodestreamFinished) {
          // Not yet seen (all) codestream boxes.
          return JXL_DEC_NEED_MORE_INPUT;
        }
@@ -1813,6 +1847,10 @@ static JxlDecoderStatus HandleBoxes(JxlDecoder* dec) {
      if (memcmp(dec->box_type, "ftyp", 4) == 0) {
        dec->box_stage = BoxStage::kFtyp;
      } else if (memcmp(dec->box_type, "jxlc", 4) == 0) {
        if (dec->last_codestream_seen) {
          return JXL_API_ERROR("there can only be one jxlc box");
        }
        dec->last_codestream_seen = true;
        dec->box_stage = BoxStage::kCodestream;
      } else if (memcmp(dec->box_type, "jxlp", 4) == 0) {
        dec->box_stage = BoxStage::kPartialCodestream;
@@ -1844,7 +1882,7 @@ static JxlDecoderStatus HandleBoxes(JxlDecoder* dec) {
      dec->box_stage = BoxStage::kSkip;
    } else if (dec->box_stage == BoxStage::kPartialCodestream) {
      if (dec->last_codestream_seen) {
        return JXL_API_ERROR("cannot have codestream after last codestream");
        return JXL_API_ERROR("cannot have jxlp box after last jxlp box");
      }
      // TODO(lode): error if box is unbounded but last bit not set
      if (dec->avail_in < 4) return JXL_DEC_NEED_MORE_INPUT;
@@ -1913,6 +1951,15 @@ static JxlDecoderStatus HandleBoxes(JxlDecoder* dec) {
        // Codestream done, but there may be more other boxes.
        dec->box_stage = BoxStage::kSkip;
        continue;
      } else if (!dec->last_codestream_seen &&
                 dec->CanUseMoreCodestreamInput()) {
        // Even though the codestream was successfully decoded, the last seen
        // jxlp box was not marked as last, so more jxlp boxes are expected.
        // Since the codestream already successfully finished, the only valid
        // case where this could happen is if there are empty jxlp boxes after
        // this.
        dec->box_stage = BoxStage::kSkip;
        continue;
      } else {
        // Codestream decoded, and no box output requested, skip all further
        // input and return success.
@@ -2031,6 +2078,8 @@ JxlDecoderStatus JxlDecoderProcessInput(JxlDecoder* dec) {

    if (sig == JXL_SIG_CONTAINER) {
      dec->have_container = 1;
    } else {
      dec->last_codestream_seen = true;
    }
  }

@@ -2043,10 +2092,13 @@ JxlDecoderStatus JxlDecoderProcessInput(JxlDecoder* dec) {
    // Even if the box handling returns success, certain types of
    // data may be missing.
    if (status == JXL_DEC_SUCCESS) {
      if (dec->stage != DecoderStage::kFinished) {
        // TODO(lode): consider not returning this error if only subscribed to
        // the JXL_DEC_BOX event and so finishing the image frames is not
        // required.
      if (dec->CanUseMoreCodestreamInput()) {
        if (!dec->last_codestream_seen) {
          // In case of jxlp boxes, this means no jxlp box marked as final was
          // seen yet. Perhaps there is an empty jxlp box after this; this is
          // not an error but will require more input.
          return JXL_DEC_NEED_MORE_INPUT;
        }
        return JXL_API_ERROR("codestream never finished");
      }

@@ -2274,6 +2326,10 @@ JxlDecoderStatus PrepareSizeCheck(const JxlDecoder* dec,
    // Don't know image dimensions yet, cannot check for valid size.
    return JXL_DEC_NEED_MORE_INPUT;
  }
  if (!dec->coalescing &&
      (!dec->frame_header || dec->frame_stage == FrameStage::kHeader)) {
    return JXL_API_ERROR("Don't know frame dimensions yet");
  }
  if (format->num_channels > 4) {
    return JXL_API_ERROR("More than 4 channels not supported");
  }
@@ -2292,6 +2348,22 @@ JxlDecoderStatus PrepareSizeCheck(const JxlDecoder* dec,

  return JXL_DEC_SUCCESS;
}

// helper function to get the dimensions of the current image buffer
void GetCurrentDimensions(const JxlDecoder* dec, size_t& xsize, size_t& ysize,
                          bool oriented) {
  xsize = dec->metadata.oriented_xsize(dec->keep_orientation || !oriented);
  ysize = dec->metadata.oriented_ysize(dec->keep_orientation || !oriented);
  if (!dec->coalescing) {
    xsize = dec->frame_header->ToFrameDimensions().xsize;
    ysize = dec->frame_header->ToFrameDimensions().ysize;
    if (!dec->keep_orientation && oriented &&
        static_cast<int>(dec->metadata.m.GetOrientation()) > 4) {
      std::swap(xsize, ysize);
    }
  }
}

}  // namespace

JxlDecoderStatus JxlDecoderFlushImage(JxlDecoder* dec) {
@@ -2320,7 +2392,9 @@ JxlDecoderStatus JxlDecoderFlushImage(JxlDecoder* dec) {
  // ConvertImageInternal.
  size_t xsize = dec->ib->xsize();
  size_t ysize = dec->ib->ysize();
  dec->ib->ShrinkTo(dec->metadata.size.xsize(), dec->metadata.size.ysize());
  size_t xsize_nopadding, ysize_nopadding;
  GetCurrentDimensions(dec, xsize_nopadding, ysize_nopadding, false);
  dec->ib->ShrinkTo(xsize_nopadding, ysize_nopadding);
  JxlDecoderStatus status = jxl::ConvertImageInternal(
      dec, *dec->ib, dec->image_out_format,
      /*want_extra_channel=*/false,
@@ -2413,15 +2487,14 @@ JXL_EXPORT JxlDecoderStatus JxlDecoderImageOutBufferSize(
  if (format->num_channels < 3 && !dec->metadata.m.color_encoding.IsGray()) {
    return JXL_API_ERROR("Grayscale output not possible for color image");
  }

  size_t xsize, ysize;
  GetCurrentDimensions(dec, xsize, ysize, true);
  size_t row_size =
      jxl::DivCeil(dec->metadata.oriented_xsize(dec->keep_orientation) *
                       format->num_channels * bits,
                   jxl::kBitsPerByte);
      jxl::DivCeil(xsize * format->num_channels * bits, jxl::kBitsPerByte);
  if (format->align > 1) {
    row_size = jxl::DivCeil(row_size, format->align) * format->align;
  }
  *size = row_size * dec->metadata.oriented_ysize(dec->keep_orientation);
  *size = row_size * ysize;

  return JXL_DEC_SUCCESS;
}
|
@ -2474,13 +2547,14 @@ JxlDecoderStatus JxlDecoderExtraChannelBufferSize(const JxlDecoder* dec,
|
|||
JxlDecoderStatus status = PrepareSizeCheck(dec, format, &bits);
|
||||
if (status != JXL_DEC_SUCCESS) return status;
|
||||
|
||||
size_t row_size = jxl::DivCeil(
|
||||
dec->metadata.oriented_xsize(dec->keep_orientation) * num_channels * bits,
|
||||
jxl::kBitsPerByte);
|
||||
size_t xsize, ysize;
|
||||
GetCurrentDimensions(dec, xsize, ysize, true);
|
||||
size_t row_size =
|
||||
jxl::DivCeil(xsize * num_channels * bits, jxl::kBitsPerByte);
|
||||
if (format->align > 1) {
|
||||
row_size = jxl::DivCeil(row_size, format->align) * format->align;
|
||||
}
|
||||
*size = row_size * dec->metadata.oriented_ysize(dec->keep_orientation);
|
||||
*size = row_size * ysize;
|
||||
|
||||
return JXL_DEC_SUCCESS;
|
||||
}
|
||||
|
@@ -2549,7 +2623,70 @@ JxlDecoderStatus JxlDecoderGetFrameHeader(const JxlDecoder* dec,
  }
  header->name_length = dec->frame_header->name.size();
  header->is_last = dec->frame_header->is_last;
  size_t xsize, ysize;
  GetCurrentDimensions(dec, xsize, ysize, true);
  header->layer_info.xsize = xsize;
  header->layer_info.ysize = ysize;
  if (!dec->coalescing && dec->frame_header->custom_size_or_origin) {
    header->layer_info.crop_x0 = dec->frame_header->frame_origin.x0;
    header->layer_info.crop_y0 = dec->frame_header->frame_origin.y0;
  } else {
    header->layer_info.crop_x0 = 0;
    header->layer_info.crop_y0 = 0;
  }
  if (!dec->keep_orientation && !dec->coalescing) {
    // orient the crop offset
    size_t W = dec->metadata.oriented_xsize(false);
    size_t H = dec->metadata.oriented_ysize(false);
    if (metadata.orientation > 4) {
      std::swap(header->layer_info.crop_x0, header->layer_info.crop_y0);
    }
    size_t o = (metadata.orientation - 1) & 3;
    if (o > 0 && o < 3) {
      header->layer_info.crop_x0 = W - xsize - header->layer_info.crop_x0;
    }
    if (o > 1) {
      header->layer_info.crop_y0 = H - ysize - header->layer_info.crop_y0;
    }
  }
  if (dec->coalescing) {
    header->layer_info.blend_info.blendmode = JXL_BLEND_REPLACE;
    header->layer_info.blend_info.source = 0;
    header->layer_info.blend_info.alpha = 0;
    header->layer_info.blend_info.clamp = JXL_FALSE;
    header->layer_info.save_as_reference = 0;
  } else {
    header->layer_info.blend_info.blendmode =
        static_cast<JxlBlendMode>(dec->frame_header->blending_info.mode);
    header->layer_info.blend_info.source =
        dec->frame_header->blending_info.source;
    header->layer_info.blend_info.alpha =
        dec->frame_header->blending_info.alpha_channel;
    header->layer_info.blend_info.clamp =
        dec->frame_header->blending_info.clamp;
    header->layer_info.save_as_reference = dec->frame_header->save_as_reference;
  }
  return JXL_DEC_SUCCESS;
}

JxlDecoderStatus JxlDecoderGetExtraChannelBlendInfo(const JxlDecoder* dec,
                                                    size_t index,
                                                    JxlBlendInfo* blend_info) {
  if (!dec->frame_header || dec->frame_stage == FrameStage::kHeader) {
    return JXL_API_ERROR("no frame header available");
  }
  const auto& metadata = dec->metadata.m;
  if (index >= metadata.num_extra_channels) {
    return JXL_API_ERROR("Invalid extra channel index");
  }
  blend_info->blendmode = static_cast<JxlBlendMode>(
      dec->frame_header->extra_channel_blending_info[index].mode);
  blend_info->source =
      dec->frame_header->extra_channel_blending_info[index].source;
  blend_info->alpha =
      dec->frame_header->extra_channel_blending_info[index].alpha_channel;
  blend_info->clamp =
      dec->frame_header->extra_channel_blending_info[index].clamp;
  return JXL_DEC_SUCCESS;
}
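To make the crop-offset reorientation above concrete, here is an illustrative restatement (not part of the patch) with a worked example: for orientation 6, which is transposed (orientation > 4), a layer origin (x0, y0) first swaps to (y0, x0); then o = (6 - 1) & 3 = 1 mirrors the x axis, giving x0' = W - w - y0, while the y coordinate is left alone. W, H, w and h below are the already-oriented image and layer sizes.

    /* Illustrative only: same mapping as JxlDecoderGetFrameHeader above. */
    static void orient_crop(uint32_t orientation, int W, int H, int w, int h,
                            int* x0, int* y0) {
      if (orientation > 4) { /* transposed orientations swap the axes */
        int tmp = *x0;
        *x0 = *y0;
        *y0 = tmp;
      }
      uint32_t o = (orientation - 1) & 3;
      if (o > 0 && o < 3) *x0 = W - w - *x0; /* mirror horizontally */
      if (o > 1) *y0 = H - h - *y0;          /* mirror vertically */
    }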
@@ -1496,6 +1496,7 @@ TEST_P(DecodeTestParam, PixelTest) {
                               0};
  jxl::CompressParams cparams;
  cparams.SetLossless();  // Lossless to verify pixels exactly after roundtrip.
  cparams.speed_tier = jxl::SpeedTier::kThunder;
  cparams.resampling = config.upsampling;
  cparams.ec_resampling = config.upsampling;
  jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
@@ -1747,6 +1748,7 @@ TEST(DecodeTest, PixelTestWithICCProfileLossless) {
  JxlPixelFormat format_orig = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
  jxl::CompressParams cparams;
  cparams.SetLossless();  // Lossless to verify pixels exactly after roundtrip.
  cparams.speed_tier = jxl::SpeedTier::kThunder;
  // For variation: some have container and no preview, others have preview
  // and no container.
  jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
@@ -2264,6 +2266,7 @@ TEST(DecodeTest, AlignTest) {

  jxl::CompressParams cparams;
  cparams.SetLossless();  // Lossless to verify pixels exactly after roundtrip.
  cparams.speed_tier = jxl::SpeedTier::kThunder;
  jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 4,
      cparams, kCSBF_None, JXL_ORIENT_IDENTITY, false);
@@ -2321,6 +2324,7 @@ TEST(DecodeTest, AnimationTest) {

  jxl::CompressParams cparams;
  cparams.SetLossless();  // Lossless to verify pixels exactly after roundtrip.
  cparams.speed_tier = jxl::SpeedTier::kThunder;
  jxl::AuxOut aux_out;
  jxl::PaddedBytes compressed;
  jxl::PassesEncoderState enc_state;
@@ -2424,6 +2428,7 @@ TEST(DecodeTest, AnimationTestStreaming) {

  jxl::CompressParams cparams;
  cparams.SetLossless();  // Lossless to verify pixels exactly after roundtrip.
  cparams.speed_tier = jxl::SpeedTier::kThunder;
  jxl::AuxOut aux_out;
  jxl::PaddedBytes compressed;
  jxl::PassesEncoderState enc_state;
@@ -2529,6 +2534,7 @@ TEST(DecodeTest, ExtraChannelTest) {

  jxl::CompressParams cparams;
  cparams.SetLossless();  // Lossless to verify pixels exactly after roundtrip.
  cparams.speed_tier = jxl::SpeedTier::kThunder;
  jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 4,
      cparams, kCSBF_None, JXL_ORIENT_IDENTITY, false);
@@ -2645,6 +2651,7 @@ TEST(DecodeTest, SkipFrameTest) {

  jxl::CompressParams cparams;
  cparams.SetLossless();  // Lossless to verify pixels exactly after roundtrip.
  cparams.speed_tier = jxl::SpeedTier::kThunder;
  jxl::AuxOut aux_out;
  jxl::PaddedBytes compressed;
  jxl::PassesEncoderState enc_state;
@@ -2807,6 +2814,7 @@ TEST(DecodeTest, SkipFrameWithBlendingTest) {

  jxl::CompressParams cparams;
  cparams.SetLossless();  // Lossless to verify pixels exactly after roundtrip.
  cparams.speed_tier = jxl::SpeedTier::kThunder;
  jxl::AuxOut aux_out;
  jxl::PaddedBytes compressed;
  jxl::PassesEncoderState enc_state;
@ -2962,6 +2970,431 @@ TEST(DecodeTest, SkipFrameWithBlendingTest) {
|
|||
JxlDecoderDestroy(dec);
|
||||
}
|
||||
|
||||
TEST(DecodeTest, SkipFrameWithAlphaBlendingTest) {
|
||||
size_t xsize = 90, ysize = 120;
|
||||
constexpr size_t num_frames = 16;
|
||||
std::vector<uint8_t> frames[num_frames + 5];
|
||||
JxlPixelFormat format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
|
||||
|
||||
jxl::CodecInOut io;
|
||||
io.SetSize(xsize, ysize);
|
||||
io.metadata.m.SetUintSamples(16);
|
||||
io.metadata.m.color_encoding = jxl::ColorEncoding::SRGB(false);
|
||||
io.metadata.m.have_animation = true;
|
||||
io.frames.clear();
|
||||
io.frames.reserve(num_frames + 5);
|
||||
io.SetSize(xsize, ysize);
|
||||
|
||||
std::vector<uint32_t> frame_durations_c;
|
||||
std::vector<uint32_t> frame_durations_nc;
|
||||
std::vector<uint32_t> frame_xsize, frame_ysize, frame_x0, frame_y0;
|
||||
|
||||
for (size_t i = 0; i < num_frames; ++i) {
|
||||
size_t cropxsize = 1 + xsize * 2 / (i + 1);
|
||||
size_t cropysize = 1 + ysize * 3 / (i + 2);
|
||||
int cropx0 = i * 3 - 8;
|
||||
int cropy0 = i * 4 - 7;
|
||||
if (i < 5) {
|
||||
std::vector<uint8_t> frame_internal =
|
||||
jxl::test::GetSomeTestImage(xsize / 2, ysize / 2, 4, i * 2 + 1);
|
||||
// An internal frame with 0 duration, and use_for_next_frame, this is a
|
||||
// frame that is not rendered and not output by default by the API, but on
|
||||
// which the rendered frames depend
|
||||
jxl::ImageBundle bundle_internal(&io.metadata.m);
|
||||
EXPECT_TRUE(ConvertFromExternal(
|
||||
jxl::Span<const uint8_t>(frame_internal.data(),
|
||||
frame_internal.size()),
|
||||
xsize / 2, ysize / 2, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
|
||||
/*has_alpha=*/true,
|
||||
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
|
||||
JXL_BIG_ENDIAN, /*flipped_y=*/false, /*pool=*/nullptr,
|
||||
&bundle_internal, /*float_in=*/false));
|
||||
bundle_internal.duration = 0;
|
||||
bundle_internal.use_for_next_frame = true;
|
||||
bundle_internal.origin = {13, 17};
|
||||
io.frames.push_back(std::move(bundle_internal));
|
||||
frame_durations_nc.push_back(0);
|
||||
frame_xsize.push_back(xsize / 2);
|
||||
frame_ysize.push_back(ysize / 2);
|
||||
frame_x0.push_back(13);
|
||||
frame_y0.push_back(17);
|
||||
}
|
||||
|
||||
std::vector<uint8_t> frame =
|
||||
jxl::test::GetSomeTestImage(cropxsize, cropysize, 4, i * 2);
|
||||
// Actual rendered frame
|
||||
jxl::ImageBundle bundle(&io.metadata.m);
|
||||
EXPECT_TRUE(ConvertFromExternal(
|
||||
jxl::Span<const uint8_t>(frame.data(), frame.size()), cropxsize,
|
||||
cropysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
|
||||
/*has_alpha=*/true,
|
||||
/*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
|
||||
JXL_BIG_ENDIAN, /*flipped_y=*/false, /*pool=*/nullptr, &bundle,
|
||||
/*float_in=*/false));
|
||||
bundle.duration = 5 + i;
|
||||
frame_durations_nc.push_back(5 + i);
|
||||
frame_durations_c.push_back(5 + i);
|
||||
frame_xsize.push_back(cropxsize);
|
||||
frame_ysize.push_back(cropysize);
|
||||
frame_x0.push_back(cropx0);
|
||||
frame_y0.push_back(cropy0);
|
||||
bundle.origin = {cropx0, cropy0};
|
||||
// Create some variation in which frames depend on which.
|
||||
if (i != 3 && i != 9 && i != 10) {
|
||||
bundle.use_for_next_frame = true;
|
||||
}
|
||||
if (i != 12) {
|
||||
bundle.blend = true;
|
||||
bundle.blendmode = jxl::BlendMode::kBlend;
|
||||
}
|
||||
io.frames.push_back(std::move(bundle));
|
||||
}
|
||||
|
||||
jxl::CompressParams cparams;
|
||||
cparams.SetLossless(); // Lossless to verify pixels exactly after roundtrip.
|
||||
cparams.speed_tier = jxl::SpeedTier::kThunder;
|
||||
jxl::AuxOut aux_out;
|
||||
jxl::PaddedBytes compressed;
|
||||
jxl::PassesEncoderState enc_state;
|
||||
EXPECT_TRUE(jxl::EncodeFile(cparams, &io, &enc_state, &compressed, &aux_out,
|
||||
nullptr));
|
||||
// try both with and without coalescing
|
||||
for (auto coalescing : {JXL_TRUE, JXL_FALSE}) {
|
||||
// Independently decode all frames without any skipping, to create the
|
||||
// expected blended frames, for the actual tests below to compare with.
|
||||
{
|
||||
JxlDecoder* dec = JxlDecoderCreate(NULL);
|
||||
const uint8_t* next_in = compressed.data();
|
||||
size_t avail_in = compressed.size();
|
||||
EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetCoalescing(dec, coalescing));
|
||||
void* runner = JxlThreadParallelRunnerCreate(
|
||||
NULL, JxlThreadParallelRunnerDefaultNumWorkerThreads());
|
||||
EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetParallelRunner(
|
||||
dec, JxlThreadParallelRunner, runner));
|
||||
EXPECT_EQ(JXL_DEC_SUCCESS,
|
||||
JxlDecoderSubscribeEvents(dec, JXL_DEC_FULL_IMAGE));
|
||||
EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
|
||||
for (size_t i = 0; i < num_frames + (coalescing ? 0 : 5); ++i) {
|
||||
EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
|
||||
size_t buffer_size;
|
||||
EXPECT_EQ(JXL_DEC_SUCCESS,
|
||||
JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
|
||||
if (coalescing)
|
||||
EXPECT_EQ(xsize * ysize * 8, buffer_size);
|
||||
else
|
||||
EXPECT_EQ(frame_xsize[i] * frame_ysize[i] * 8, buffer_size);
|
||||
frames[i].resize(buffer_size);
|
||||
EXPECT_EQ(JXL_DEC_SUCCESS,
|
||||
JxlDecoderSetImageOutBuffer(dec, &format, frames[i].data(),
|
||||
frames[i].size()));
|
||||
EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
|
||||
}
|
||||
|
||||
// After all frames were decoded, JxlDecoderProcessInput should return
|
||||
// success to indicate all is done.
|
||||
EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
|
||||
JxlThreadParallelRunnerDestroy(runner);
|
||||
JxlDecoderDestroy(dec);
|
||||
}
|
||||
|
||||
JxlDecoder* dec = JxlDecoderCreate(NULL);
|
||||
const uint8_t* next_in = compressed.data();
|
||||
size_t avail_in = compressed.size();
|
||||
|
||||
EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetCoalescing(dec, coalescing));
|
||||
void* runner = JxlThreadParallelRunnerCreate(
|
||||
NULL, JxlThreadParallelRunnerDefaultNumWorkerThreads());
|
||||
EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetParallelRunner(
|
||||
dec, JxlThreadParallelRunner, runner));
|
||||
|
||||
EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(
|
||||
dec, JXL_DEC_BASIC_INFO | JXL_DEC_FRAME |
|
||||
JXL_DEC_FULL_IMAGE));
|
||||
EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
|
||||
EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
|
||||
JxlBasicInfo info;
|
||||
EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
|
||||
|
||||
for (size_t i = 0; i < num_frames; ++i) {
|
||||
EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
|
||||
|
||||
size_t buffer_size;
|
||||
EXPECT_EQ(JXL_DEC_SUCCESS,
|
||||
JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
|
||||
std::vector<uint8_t> pixels(buffer_size);
|
||||
|
||||
JxlFrameHeader frame_header;
|
||||
EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header));
|
||||
EXPECT_EQ((coalescing ? frame_durations_c[i] : frame_durations_nc[i]),
|
||||
frame_header.duration);
|
||||
|
||||
EXPECT_EQ(i + 1 == num_frames, frame_header.is_last);
|
||||
|
||||
EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
|
||||
|
||||
EXPECT_EQ(JXL_DEC_SUCCESS,
|
||||
JxlDecoderSetImageOutBuffer(dec, &format, pixels.data(),
|
||||
pixels.size()));
|
||||
|
||||
EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
|
||||
if (coalescing)
|
||||
EXPECT_EQ(frame_header.layer_info.xsize, xsize);
|
||||
else
|
||||
EXPECT_EQ(frame_header.layer_info.xsize, frame_xsize[i]);
|
||||
if (coalescing)
|
||||
EXPECT_EQ(frame_header.layer_info.ysize, ysize);
|
||||
else
|
||||
EXPECT_EQ(frame_header.layer_info.ysize, frame_ysize[i]);
|
||||
EXPECT_EQ(0u,
|
||||
ComparePixels(frames[i].data(), pixels.data(),
|
||||
frame_header.layer_info.xsize,
|
||||
frame_header.layer_info.ysize, format, format));
|
||||
|
||||
// Test rewinding mid-way, not decoding all frames.
|
||||
if (i == 8) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
JxlDecoderRewind(dec);
|
||||
EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(
|
||||
dec, JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE));
|
||||
EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
|
||||
|
||||
for (size_t i = 0; i < num_frames + (coalescing ? 0 : 5); ++i) {
|
||||
if (i == 3) {
|
||||
JxlDecoderSkipFrames(dec, 5);
|
||||
i += 5;
|
||||
}
|
||||
|
||||
EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
|
||||
size_t buffer_size;
|
||||
EXPECT_EQ(JXL_DEC_SUCCESS,
|
||||
JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
|
||||
std::vector<uint8_t> pixels(buffer_size);
|
||||
|
||||
JxlFrameHeader frame_header;
|
||||
EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header));
|
||||
EXPECT_EQ((coalescing ? frame_durations_c[i] : frame_durations_nc[i]),
|
||||
frame_header.duration);
|
||||
|
||||
EXPECT_EQ(i + 1 == num_frames + (coalescing ? 0 : 5),
|
||||
frame_header.is_last);
|
||||
|
||||
EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
|
||||
|
||||
EXPECT_EQ(JXL_DEC_SUCCESS,
|
||||
JxlDecoderSetImageOutBuffer(dec, &format, pixels.data(),
|
||||
pixels.size()));
|
||||
|
||||
EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
|
||||
if (coalescing) {
|
||||
EXPECT_EQ(frame_header.layer_info.xsize, xsize);
|
||||
EXPECT_EQ(frame_header.layer_info.ysize, ysize);
|
||||
EXPECT_EQ(frame_header.layer_info.crop_x0, 0);
|
||||
EXPECT_EQ(frame_header.layer_info.crop_y0, 0);
|
||||
} else {
|
||||
EXPECT_EQ(frame_header.layer_info.xsize, frame_xsize[i]);
|
||||
EXPECT_EQ(frame_header.layer_info.ysize, frame_ysize[i]);
|
||||
EXPECT_EQ(frame_header.layer_info.crop_x0, frame_x0[i]);
|
||||
EXPECT_EQ(frame_header.layer_info.crop_y0, frame_y0[i]);
|
||||
EXPECT_EQ(frame_header.layer_info.blend_info.blendmode,
|
||||
i != 12 + 5 && frame_header.duration != 0
|
||||
? 2
|
||||
: 0); // kBlend or the default kReplace
|
||||
}
|
||||
EXPECT_EQ(0u,
|
||||
ComparePixels(frames[i].data(), pixels.data(),
|
||||
frame_header.layer_info.xsize,
|
||||
frame_header.layer_info.ysize, format, format));
|
||||
}
|
||||
|
||||
// After all frames were decoded, JxlDecoderProcessInput should return
|
||||
// success to indicate all is done.
|
||||
EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
|
||||
|
||||
// Test rewinding the decoder and skipping different frames
|
||||
|
||||
JxlDecoderRewind(dec);
|
||||
EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(
|
||||
dec, JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE));
|
||||
EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
|
||||
|
||||
for (size_t i = 0; i < num_frames + (coalescing ? 0 : 5); ++i) {
|
||||
int test_skipping = (i == 9) ? 3 : 0;
|
||||
|
||||
EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
|
||||
size_t buffer_size;
|
||||
EXPECT_EQ(JXL_DEC_SUCCESS,
|
||||
JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
|
||||
std::vector<uint8_t> pixels(buffer_size);
|
||||
|
||||
// Since this is after JXL_DEC_FRAME but before JXL_DEC_FULL_IMAGE, this
|
||||
// should only skip the next frame, not the currently processed one.
|
||||
if (test_skipping) JxlDecoderSkipFrames(dec, test_skipping);
|
||||
|
||||
JxlFrameHeader frame_header;
|
||||
EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetFrameHeader(dec, &frame_header));
|
||||
EXPECT_EQ((coalescing ? frame_durations_c[i] : frame_durations_nc[i]),
|
||||
frame_header.duration);
|
||||
|
||||
EXPECT_EQ(i + 1 == num_frames + (coalescing ? 0 : 5),
|
||||
frame_header.is_last);
|
||||
|
||||
EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
|
||||
|
||||
EXPECT_EQ(JXL_DEC_SUCCESS,
|
||||
JxlDecoderSetImageOutBuffer(dec, &format, pixels.data(),
|
||||
pixels.size()));
|
||||
|
||||
EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
|
||||
EXPECT_EQ(0u,
|
||||
ComparePixels(frames[i].data(), pixels.data(),
|
||||
frame_header.layer_info.xsize,
|
||||
frame_header.layer_info.ysize, format, format));
|
||||
|
||||
if (test_skipping) i += test_skipping;
|
||||
}
|
||||
|
||||
JxlThreadParallelRunnerDestroy(runner);
|
||||
JxlDecoderDestroy(dec);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(DecodeTest, OrientedCroppedFrameTest) {
  size_t xsize = 90, ysize = 120;
  JxlPixelFormat format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};

  for (bool keep_orientation : {true, false}) {
    for (uint32_t orientation = 1; orientation <= 8; orientation++) {
      size_t oxsize = (!keep_orientation && orientation > 4 ? ysize : xsize);
      size_t oysize = (!keep_orientation && orientation > 4 ? xsize : ysize);
      jxl::CodecInOut io;
      io.SetSize(xsize, ysize);
      io.metadata.m.SetUintSamples(16);
      io.metadata.m.color_encoding = jxl::ColorEncoding::SRGB(false);
      io.metadata.m.orientation = orientation;
      io.frames.clear();
      io.SetSize(xsize, ysize);

      for (size_t i = 0; i < 3; ++i) {
        size_t cropxsize = 1 + xsize * 2 / (i + 1);
        size_t cropysize = 1 + ysize * 3 / (i + 2);
        int cropx0 = i * 3 - 8;
        int cropy0 = i * 4 - 7;

        std::vector<uint8_t> frame =
            jxl::test::GetSomeTestImage(cropxsize, cropysize, 4, i * 2);
        jxl::ImageBundle bundle(&io.metadata.m);
        EXPECT_TRUE(ConvertFromExternal(
            jxl::Span<const uint8_t>(frame.data(), frame.size()), cropxsize,
            cropysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
            /*has_alpha=*/true,
            /*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
            JXL_BIG_ENDIAN, /*flipped_y=*/false, /*pool=*/nullptr, &bundle,
            /*float_in=*/false));
        bundle.origin = {cropx0, cropy0};
        bundle.use_for_next_frame = true;
        io.frames.push_back(std::move(bundle));
      }

      jxl::CompressParams cparams;
      cparams
          .SetLossless();  // Lossless to verify pixels exactly after roundtrip.
      cparams.speed_tier = jxl::SpeedTier::kThunder;
      jxl::AuxOut aux_out;
      jxl::PaddedBytes compressed;
      jxl::PassesEncoderState enc_state;
      EXPECT_TRUE(jxl::EncodeFile(cparams, &io, &enc_state, &compressed,
                                  &aux_out, nullptr));

      // 0 is merged frame as decoded with coalescing enabled (default)
      // 1-3 are non-coalesced frames as decoded with coalescing disabled
      // 4 is the manually merged frame
      std::vector<uint8_t> frames[5];
      frames[4].resize(xsize * ysize * 8, 0);

      // try both with and without coalescing
      for (auto coalescing : {JXL_TRUE, JXL_FALSE}) {
        // Independently decode all frames without any skipping, to create the
        // expected blended frames, for the actual tests below to compare with.
        {
          JxlDecoder* dec = JxlDecoderCreate(NULL);
          const uint8_t* next_in = compressed.data();
          size_t avail_in = compressed.size();
          EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetCoalescing(dec, coalescing));
          EXPECT_EQ(JXL_DEC_SUCCESS,
                    JxlDecoderSetKeepOrientation(dec, keep_orientation));
          void* runner = JxlThreadParallelRunnerCreate(
              NULL, JxlThreadParallelRunnerDefaultNumWorkerThreads());
          EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetParallelRunner(
                                         dec, JxlThreadParallelRunner, runner));
          EXPECT_EQ(JXL_DEC_SUCCESS,
                    JxlDecoderSubscribeEvents(dec, JXL_DEC_FULL_IMAGE));
          EXPECT_EQ(JXL_DEC_SUCCESS,
                    JxlDecoderSetInput(dec, next_in, avail_in));
          for (size_t i = (coalescing ? 0 : 1); i < (coalescing ? 1 : 4);
               ++i) {
            EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER,
                      JxlDecoderProcessInput(dec));
            JxlFrameHeader frame_header;
            EXPECT_EQ(JXL_DEC_SUCCESS,
                      JxlDecoderGetFrameHeader(dec, &frame_header));
            size_t buffer_size;
            EXPECT_EQ(JXL_DEC_SUCCESS,
                      JxlDecoderImageOutBufferSize(dec, &format, &buffer_size));
            if (coalescing)
              EXPECT_EQ(xsize * ysize * 8, buffer_size);
            else
              EXPECT_EQ(frame_header.layer_info.xsize *
                            frame_header.layer_info.ysize * 8,
                        buffer_size);
            frames[i].resize(buffer_size);
            EXPECT_EQ(JXL_DEC_SUCCESS,
                      JxlDecoderSetImageOutBuffer(
                          dec, &format, frames[i].data(), frames[i].size()));
            EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
            EXPECT_EQ(frame_header.layer_info.blend_info.blendmode,
                      JXL_BLEND_REPLACE);
            if (coalescing) {
              EXPECT_EQ(frame_header.layer_info.xsize, oxsize);
              EXPECT_EQ(frame_header.layer_info.ysize, oysize);
              EXPECT_EQ(frame_header.layer_info.crop_x0, 0);
              EXPECT_EQ(frame_header.layer_info.crop_y0, 0);
            } else {
              // manually merge this layer
              int x0 = frame_header.layer_info.crop_x0;
              int y0 = frame_header.layer_info.crop_y0;
              int w = frame_header.layer_info.xsize;
              int h = frame_header.layer_info.ysize;
              for (int y = 0; y < static_cast<int>(oysize); y++) {
                if (y < y0 || y >= y0 + h) continue;
                // pointers do whole 16-bit RGBA pixels at a time
                uint64_t* row_merged = static_cast<uint64_t*>(
                    (void*)(frames[4].data() + y * oxsize * 8));
                uint64_t* row_layer = static_cast<uint64_t*>(
                    (void*)(frames[i].data() + (y - y0) * w * 8));
                for (int x = 0; x < static_cast<int>(oxsize); x++) {
                  if (x < x0 || x >= x0 + w) continue;
                  row_merged[x] = row_layer[x - x0];
                }
              }
            }
          }

          // After all frames were decoded, JxlDecoderProcessInput should
          // return success to indicate all is done.
          EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
          JxlThreadParallelRunnerDestroy(runner);
          JxlDecoderDestroy(dec);
        }
      }

      EXPECT_EQ(0u, ComparePixels(frames[0].data(), frames[4].data(), oxsize,
                                  oysize, format, format));
    }
  }
}
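The new test above round-trips a multi-layer image twice: once coalesced (the default, yielding the blended canvas) and once non-coalesced (yielding each layer at its own size and crop offset), then re-composites the layers by hand and checks that the results match. The manual merge copies one uint64_t per pixel because 4 channels of 16 bits make exactly 64 bits. As a reading aid, a minimal sketch of the non-coalesced decode loop, reduced from the test (it assumes `compressed` and `format` as defined there, and omits all error handling):

  // Sketch only: decode each layer separately with coalescing disabled.
  JxlDecoder* dec = JxlDecoderCreate(nullptr);
  // Coalescing must be configured before the first JxlDecoderProcessInput.
  JxlDecoderSetCoalescing(dec, JXL_FALSE);
  JxlDecoderSubscribeEvents(dec, JXL_DEC_FULL_IMAGE);
  JxlDecoderSetInput(dec, compressed.data(), compressed.size());
  while (JxlDecoderProcessInput(dec) == JXL_DEC_NEED_IMAGE_OUT_BUFFER) {
    JxlFrameHeader header;
    JxlDecoderGetFrameHeader(dec, &header);
    // With coalescing off, layer_info carries the layer's own dimensions
    // and its crop offset on the canvas, not the full image size.
    size_t buffer_size;
    JxlDecoderImageOutBufferSize(dec, &format, &buffer_size);
    std::vector<uint8_t> layer(buffer_size);
    JxlDecoderSetImageOutBuffer(dec, &format, layer.data(), layer.size());
    JxlDecoderProcessInput(dec);  // returns JXL_DEC_FULL_IMAGE for this layer
  }
  JxlDecoderDestroy(dec);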

TEST(DecodeTest, FlushTest) {
  // Size large enough for multiple groups, required to have progressive
  // stages
@@ -3187,6 +3620,7 @@ TEST(DecodeTest, FlushTestLosslessProgressiveAlpha) {
      jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0);
  jxl::CompressParams cparams;
  cparams.SetLossless();
  cparams.speed_tier = jxl::SpeedTier::kThunder;
+  cparams.responsive = 1;
  jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
@@ -3235,7 +3669,7 @@ TEST(DecodeTest, FlushTestLosslessProgressiveAlpha) {

  EXPECT_LE(ComparePixels(pixels2.data(), pixels.data(), xsize, ysize, format,
                          format, 2560.0),
-            1700u);
+            2700u);

  EXPECT_EQ(JXL_DEC_NEED_MORE_INPUT, JxlDecoderProcessInput(dec));
@@ -3371,6 +3805,10 @@ TEST(DecodeTest, ContinueFinalNonEssentialBoxTest) {

  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
  EXPECT_EQ(JXL_DEC_FRAME, JxlDecoderProcessInput(dec));
+  // The decoder returns success despite not having seen the final unknown box
+  // yet. This is because calling JxlDecoderCloseInput is not mandatory for
+  // backwards compatibility, so it doesn't know more bytes follow; the current
+  // bytes ended at a perfectly valid place.
  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));

  size_t remaining = JxlDecoderReleaseInput(dec);
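On the comment introduced above: the decoder offers JxlDecoderCloseInput to remove exactly that ambiguity; calling it after the last JxlDecoderSetInput declares the input complete, so a truncated codestream can surface as an error rather than an early success. A hedged sketch of that optional pattern (the test deliberately leaves the call out; `data` and `size` are hypothetical names for the final chunk of the file):

  JxlDecoderSetInput(dec, data, size);
  JxlDecoderCloseInput(dec);  // optional: no more input will be set after this
  JxlDecoderStatus status = JxlDecoderProcessInput(dec);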

@@ -3648,6 +4086,7 @@ TEST(DecodeTest, PartialCodestreamBoxTest) {
  JxlPixelFormat format_orig = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
  jxl::CompressParams cparams;
  cparams.SetLossless();  // Lossless to verify pixels exactly after roundtrip.
  cparams.speed_tier = jxl::SpeedTier::kThunder;
  jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 4,
      cparams, kCSBF_Multi, JXL_ORIENT_IDENTITY, false, true);

@@ -429,8 +429,8 @@ float EstimateEntropy(const AcStrategy& acs, size_t x, size_t y,
    }
    entropy_v += nzeros_v * cost1;

-  entropy += GetLane(SumOfLanes(entropy_v));
-  size_t num_nzeros = GetLane(SumOfLanes(nzeros_v));
+  entropy += GetLane(SumOfLanes(df, entropy_v));
+  size_t num_nzeros = GetLane(SumOfLanes(df, nzeros_v));
  // Add #bit of num_nonzeros, as an estimate of the cost for encoding the
  // number of non-zeros of the block.
  size_t nbits = CeilLog2Nonzero(num_nzeros + 1) + 1;
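To make the estimate above concrete: a block with num_nzeros = 10 yields CeilLog2Nonzero(10 + 1) = 4 and hence nbits = 5, so the assumed signalling cost grows only logarithmically with the non-zero count.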

@@ -441,9 +441,9 @@ float EstimateEntropy(const AcStrategy& acs, size_t x, size_t y,
  float ret =
      entropy +
      masking *
-          ((config.info_loss_multiplier * GetLane(SumOfLanes(info_loss))) +
+          ((config.info_loss_multiplier * GetLane(SumOfLanes(df, info_loss))) +
          (config.info_loss_multiplier2 *
-           sqrt(num_blocks * GetLane(SumOfLanes(info_loss2)))));
+           sqrt(num_blocks * GetLane(SumOfLanes(df, info_loss2)))));
  return ret;
}
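The two hunks above, like the GammaModulation, ColorModulation, HfModulation, and ProcessTile hunks that follow, are one mechanical migration: the updated Highway API takes the descriptor (tag) as an explicit first argument to reductions, so SumOfLanes(v) becomes SumOfLanes(d, v). Sizeless vector types on targets such as Arm SVE and RISC-V V cannot carry the lane count in the vector object itself, which is what motivates the explicit tag. A standalone sketch of the new convention, assuming static dispatch and omitting libjxl's foreach_target boilerplate:

  #include <hwy/highway.h>

  namespace hn = hwy::HWY_NAMESPACE;

  // Sums n floats; n is assumed to be a multiple of the vector length.
  float SumArray(const float* HWY_RESTRICT p, size_t n) {
    const hn::ScalableTag<float> d;  // the tag: `df`, `d`, or `df4` in the hunks
    auto acc = hn::Zero(d);
    for (size_t i = 0; i < n; i += hn::Lanes(d)) {
      acc = hn::Add(acc, hn::LoadU(d, p + i));
    }
    // SumOfLanes now takes the tag first; it still broadcasts the total to
    // every lane, and GetLane extracts it as a scalar.
    return hn::GetLane(hn::SumOfLanes(d, acc));
  }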

@@ -189,7 +189,7 @@ V GammaModulation(const D d, const size_t x, const size_t y,
      overall_ratio += avg_ratio;
    }
  }
-  overall_ratio = SumOfLanes(overall_ratio);
+  overall_ratio = SumOfLanes(d, overall_ratio);
  overall_ratio *= Set(d, 1.0f / 64);
  // ideally -1.0, but likely optimal correction adds some entropy, so slightly
  // less than that.
@@ -246,12 +246,12 @@ V ColorModulation(const D d, const size_t x, const size_t y,
  // blue we consider as if it was fully red or blue.
  static const float ratio = 30.610615782142737f;  // out of 64 pixels.

-  auto overall_red_coverage = SumOfLanes(red_coverage);
+  auto overall_red_coverage = SumOfLanes(d, red_coverage);
  overall_red_coverage =
      Min(overall_red_coverage, Set(d, ratio * kRedRampLength));
  overall_red_coverage *= Set(d, red_strength / ratio);

-  auto overall_blue_coverage = SumOfLanes(blue_coverage);
+  auto overall_blue_coverage = SumOfLanes(d, blue_coverage);
  overall_blue_coverage =
      Min(overall_blue_coverage, Set(d, ratio * kBlueRampLength));
  overall_blue_coverage *= Set(d, blue_strength / ratio);
@@ -295,7 +295,7 @@ V HfModulation(const D d, const size_t x, const size_t y, const ImageF& xyb,
    }
  }

-  sum = SumOfLanes(sum);
+  sum = SumOfLanes(d, sum);
  return MulAdd(sum, Set(d, -2.0052193233688884f / 112), out_val);
}
@@ -157,7 +157,7 @@ void ProcessTile(const Image3F& opsin, PassesEncoderState* enc_state,
            sum += LoadU(df4, rows_in[iy] + x * 4 + ix + 2);
          }
        }
-        row_out[x] = GetLane(Sqrt(SumOfLanes(sum))) * (1.0f / 4.0f);
+        row_out[x] = GetLane(Sqrt(SumOfLanes(df4, sum))) * (1.0f / 4.0f);
      }
    }
    // Indexing iy and ix is a bit tricky as we include a 2 pixel border
@@ -193,7 +193,7 @@ void ProcessTile(const Image3F& opsin, PassesEncoderState* enc_state,
            sum += Load(df4, rows_in[iy] + sx + ix);
          }
        }
-        row_out[x] = GetLane(Sqrt(SumOfLanes(sum))) * (1.0f / 4.0f);
+        row_out[x] = GetLane(Sqrt(SumOfLanes(df4, sum))) * (1.0f / 4.0f);
      } else {
        float sum = 0;
        for (size_t iy = sy; iy < ey; iy++) {

Some files were not shown because too many files have changed in this diff.