Mirror of https://github.com/mozilla/gecko-dev.git

Backed out changeset f87312b156f6 (bug 1757483) for causing build bustages on highway_export.h CLOSED TREE

This commit is contained in:
Parent: 05225c58b9
Commit: b4d61b9a3f
@@ -20,11 +20,11 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: commit f13e3b956eb226561ac79427893ec0afd66f91a8 (2022-02-15T18:19:21Z).
release: commit e69083a12a05caf037cabecdf1b248b7579705a5 (2021-11-11T08:20:00Z).

# Revision to pull in
# Must be a long or short commit SHA (long preferred)
revision: f13e3b956eb226561ac79427893ec0afd66f91a8
revision: e69083a12a05caf037cabecdf1b248b7579705a5

# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/
@@ -10,9 +10,9 @@ origin:
url: https://github.com/libjxl/libjxl

release: commit 89875cba4d18485ec9692c80b747b59b73ce712e (2022-02-28T16:03:42Z).
release: commit 4322679b1c418addc2284c5ea84fc2c3935b4a75 (2022-02-07T20:56:39Z).

revision: 89875cba4d18485ec9692c80b747b59b73ce712e
revision: 4322679b1c418addc2284c5ea84fc2c3935b4a75

license: Apache-2.0
@@ -1,57 +0,0 @@
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: Build / test
on: [push, pull_request]
jobs:
  cmake:
    name: Build and test ${{ matrix.name }}
    runs-on: ubuntu-18.04
    strategy:
      matrix:
        include:
          - name: Clang-5.0
            extra_deps: clang-5.0
            c_compiler: clang-5.0
            cxx_compiler: clang++-5.0

          - name: Clang-6.0
            extra_deps: clang-6.0
            c_compiler: clang-6.0
            cxx_compiler: clang++-6.0

    steps:
      - uses: actions/checkout@v2

      - name: Install deps
        run: sudo apt-get install ${{ matrix.extra_deps }}

      - name: Build and test
        run: |
          export CMAKE_BUILD_PARALLEL_LEVEL=2
          export CTEST_PARALLEL_LEVEL=2
          CXXFLAGS=-Werror CC=${{ matrix.c_compiler }} CXX=${{ matrix.cxx_compiler }} cmake -B out .
          cmake --build out
          ctest --test-dir out

  bazel:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - uses: bazelbuild/setup-bazelisk@v1
      - uses: actions/cache@v2
        with:
          path: ~/.cache/bazel
          key: bazel-${{ runner.os }}
      - run: bazel build //...
@@ -1,6 +1,6 @@
load("@bazel_skylib//lib:selects.bzl", "selects")

load("@rules_cc//cc:defs.bzl", "cc_test")

package(default_visibility = ["//visibility:public"])

licenses(["notice"])

@@ -18,11 +18,6 @@ config_setting(
    flag_values = {"@bazel_tools//tools/cpp:compiler": "msvc"},
)

config_setting(
    name = "compiler_emscripten",
    values = {"cpu": "wasm32"},
)

# See https://github.com/bazelbuild/bazel/issues/12707
config_setting(
    name = "compiler_gcc_bug",

@@ -46,6 +41,13 @@ selects.config_setting_group(
    ],
)

config_setting(
    name = "emulate_sve",
    values = {
        "copt": "-DHWY_EMULATE_SVE",
    },
)

# Additional warnings for Clang OR GCC (skip for MSVC)
CLANG_GCC_COPTS = [
    "-Wunused-parameter",

@@ -80,7 +82,7 @@ COPTS = select({
    "//conditions:default": CLANG_GCC_COPTS + CLANG_ONLY_COPTS,
}) + select({
    "@platforms//cpu:riscv64": [
        "-march=rv64gcv1p0",
        "-march=rv64gcv0p10",
        "-menable-experimental-extensions",
    ],
    "//conditions:default": [

@@ -110,32 +112,28 @@ cc_library(
        "hwy/base.h",
        "hwy/cache_control.h",
        "hwy/detect_compiler_arch.h",  # private
        "hwy/highway_export.h",
        "hwy/detect_targets.h",  # private
        "hwy/targets.h",
    ],
    compatible_with = [],
    copts = COPTS,
    textual_hdrs = [
        # These are textual because config macros influence them:
        "hwy/detect_targets.h",  # private
        "hwy/targets.h",
        # End of list
        "hwy/highway.h",  # public
        "hwy/foreach_target.h",  # public
        "hwy/ops/arm_neon-inl.h",
        "hwy/ops/arm_sve-inl.h",
        "hwy/ops/generic_ops-inl.h",
        "hwy/ops/rvv-inl.h",
        "hwy/ops/scalar-inl.h",
        "hwy/ops/set_macros-inl.h",
        "hwy/ops/shared-inl.h",
        "hwy/ops/wasm_128-inl.h",
        "hwy/ops/x86_128-inl.h",
        "hwy/ops/x86_256-inl.h",
        "hwy/ops/x86_512-inl.h",
        # Select avoids recompiling native arch if only non-native changed
    ] + select({
        ":compiler_emscripten": ["hwy/ops/wasm_128-inl.h"],
        "//conditions:default": [],
    }) + select({
        "@platforms//cpu:riscv64": ["hwy/ops/rvv-inl.h"],
    ],
    deps = select({
        ":emulate_sve": ["//third_party/farm_sve"],
        "//conditions:default": [],
    }),
)

@@ -146,9 +144,7 @@ cc_library(
    textual_hdrs = [
        "hwy/contrib/dot/dot-inl.h",
    ],
    deps = [
        ":hwy",
    ],
    deps = [":hwy"],
)

cc_library(

@@ -160,9 +156,7 @@ cc_library(
        "hwy/contrib/image/image.h",
    ],
    compatible_with = [],
    deps = [
        ":hwy",
    ],
    deps = [":hwy"],
)

cc_library(

@@ -171,9 +165,16 @@ cc_library(
    textual_hdrs = [
        "hwy/contrib/math/math-inl.h",
    ],
    deps = [
        ":hwy",
    deps = [":hwy"],
)

cc_library(
    name = "sort",
    compatible_with = [],
    textual_hdrs = [
        "hwy/contrib/sort/sort-inl.h",
    ],
    deps = [":hwy"],
)

# Everything required for tests that use Highway.

@@ -187,9 +188,7 @@ cc_library(
    ],
    # Must not depend on a gtest variant, which can conflict with the
    # GUNIT_INTERNAL_BUILD_MODE defined by the test.
    deps = [
        ":hwy",
    ],
    deps = [":hwy"],
)

cc_library(

@@ -213,9 +212,7 @@ cc_library(
    srcs = ["hwy/examples/skeleton.cc"],
    hdrs = ["hwy/examples/skeleton.h"],
    textual_hdrs = ["hwy/examples/skeleton-inl.h"],
    deps = [
        ":hwy",
    ],
    deps = [":hwy"],
)

cc_binary(

@@ -229,7 +226,7 @@ HWY_TESTS = [
    ("hwy/contrib/dot/", "dot_test"),
    ("hwy/contrib/image/", "image_test"),
    ("hwy/contrib/math/", "math_test"),
    # contrib/sort has its own BUILD, we add it to GUITAR_TESTS.
    ("hwy/contrib/sort/", "sort_test"),
    ("hwy/examples/", "skeleton_test"),
    ("hwy/", "nanobenchmark_test"),
    ("hwy/", "aligned_allocator_test"),

@@ -242,27 +239,13 @@ HWY_TESTS = [
    ("hwy/tests/", "compare_test"),
    ("hwy/tests/", "convert_test"),
    ("hwy/tests/", "crypto_test"),
    ("hwy/tests/", "demote_test"),
    ("hwy/tests/", "logical_test"),
    ("hwy/tests/", "mask_test"),
    ("hwy/tests/", "memory_test"),
    ("hwy/tests/", "shift_test"),
    ("hwy/tests/", "swizzle_test"),
    ("hwy/tests/", "test_util_test"),
]

HWY_TEST_DEPS = [
    ":dot",
    ":hwy",
    ":hwy_test_util",
    ":image",
    ":math",
    ":nanobenchmark",
    ":skeleton",
    "//hwy/contrib/sort:vqsort",
    "@com_google_googletest//:gtest_main",
]

[
    [
        cc_test(

@@ -282,18 +265,6 @@ HWY_TEST_DEPS = [
            "@platforms//cpu:riscv64": ["fully_static_link"],
            "//conditions:default": [],
        }),
        linkopts = select({
            ":compiler_emscripten": [
                "-s ASSERTIONS=2",
                "-s ENVIRONMENT=node,shell,web",
                "-s ERROR_ON_UNDEFINED_SYMBOLS=1",
                "-s DEMANGLE_SUPPORT=1",
                "-s EXIT_RUNTIME=1",
                "-s ALLOW_MEMORY_GROWTH=1",
                "--pre-js $(location :preamble.js.lds)",
            ],
            "//conditions:default": [],
        }),
        linkstatic = select({
            "@platforms//cpu:riscv64": True,
            "//conditions:default": False,

@@ -301,10 +272,17 @@ HWY_TEST_DEPS = [
        local_defines = ["HWY_IS_TEST"],
        # for test_suite.
        tags = ["hwy_ops_test"],
        deps = HWY_TEST_DEPS + select({
            ":compiler_emscripten": [":preamble.js.lds"],
            "//conditions:default": [],
        }),
        deps = [
            ":dot",
            ":hwy",
            ":hwy_test_util",
            ":image",
            ":math",
            ":nanobenchmark",
            ":skeleton",
            ":sort",
            "@com_google_googletest//:gtest_main",
        ],
    ),
]
for subdir, test in HWY_TESTS

@@ -315,5 +293,3 @@ test_suite(
    name = "hwy_ops_tests",
    tags = ["hwy_ops_test"],
)

# Placeholder for integration test, do not remove
@@ -19,13 +19,11 @@ if(POLICY CMP0083)
  cmake_policy(SET CMP0083 NEW)
endif()

project(hwy VERSION 0.16.0) # Keep in sync with highway.h version

# Directly define the ABI version from the cmake project() version values:
set(LIBRARY_VERSION "${hwy_VERSION}")
set(LIBRARY_SOVERSION ${hwy_VERSION_MAJOR})
project(hwy VERSION 0.15.0) # Keep in sync with highway.h version

set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_CXX_STANDARD_REQUIRED YES)

# Enabled PIE binaries by default if supported.
include(CheckPIESupported OPTIONAL RESULT_VARIABLE CHECK_PIE_SUPPORTED)

@@ -42,14 +40,13 @@ if (NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE RelWithDebInfo)
endif()

set(HWY_CMAKE_ARM7 OFF CACHE BOOL "Set copts for ARMv7 with NEON (requires vfpv4)?")
set(HWY_CMAKE_ARM7 OFF CACHE BOOL "Set copts for ARMv7 with NEON?")

# Unconditionally adding -Werror risks breaking the build when new warnings
# arise due to compiler/platform changes. Enable this in CI/tests.
set(HWY_WARNINGS_ARE_ERRORS OFF CACHE BOOL "Add -Werror flag?")

set(HWY_ENABLE_EXAMPLES ON CACHE BOOL "Build examples")
set(HWY_ENABLE_INSTALL ON CACHE BOOL "Install library")
set(HWY_EXAMPLES_TESTS_INSTALL ON CACHE BOOL "Build examples, tests, install?")

include(CheckCXXSourceCompiles)
check_cxx_source_compiles(

@@ -67,32 +64,7 @@ set(HWY_CONTRIB_SOURCES
    hwy/contrib/image/image.cc
    hwy/contrib/image/image.h
    hwy/contrib/math/math-inl.h
    hwy/contrib/sort/disabled_targets.h
    hwy/contrib/sort/shared-inl.h
    hwy/contrib/sort/sorting_networks-inl.h
    hwy/contrib/sort/traits-inl.h
    hwy/contrib/sort/traits128-inl.h
    hwy/contrib/sort/vqsort-inl.h
    hwy/contrib/sort/vqsort.cc
    hwy/contrib/sort/vqsort.h
    hwy/contrib/sort/vqsort_128a.cc
    hwy/contrib/sort/vqsort_128d.cc
    hwy/contrib/sort/vqsort_f32a.cc
    hwy/contrib/sort/vqsort_f32d.cc
    hwy/contrib/sort/vqsort_f64a.cc
    hwy/contrib/sort/vqsort_f64d.cc
    hwy/contrib/sort/vqsort_i16a.cc
    hwy/contrib/sort/vqsort_i16d.cc
    hwy/contrib/sort/vqsort_i32a.cc
    hwy/contrib/sort/vqsort_i32d.cc
    hwy/contrib/sort/vqsort_i64a.cc
    hwy/contrib/sort/vqsort_i64d.cc
    hwy/contrib/sort/vqsort_u16a.cc
    hwy/contrib/sort/vqsort_u16d.cc
    hwy/contrib/sort/vqsort_u32a.cc
    hwy/contrib/sort/vqsort_u32d.cc
    hwy/contrib/sort/vqsort_u64a.cc
    hwy/contrib/sort/vqsort_u64d.cc
    hwy/contrib/sort/sort-inl.h
)

set(HWY_SOURCES

@@ -104,7 +76,6 @@ set(HWY_SOURCES
    hwy/detect_targets.h # private
    hwy/foreach_target.h
    hwy/highway.h
    hwy/highway_export.h
    hwy/nanobenchmark.cc
    hwy/nanobenchmark.h
    hwy/ops/arm_neon-inl.h

@@ -221,59 +192,20 @@ else()

endif() # !MSVC

# By default prefer STATIC build (legacy behavior)
option(BUILD_SHARED_LIBS "Build shared libraries" OFF)
option(HWY_FORCE_STATIC_LIBS "Ignore BUILD_SHARED_LIBS" OFF)
# only expose shared/static options to advanced users:
mark_as_advanced(BUILD_SHARED_LIBS)
mark_as_advanced(HWY_FORCE_STATIC_LIBS)
# Define visibility settings globally:
set(CMAKE_CXX_VISIBILITY_PRESET hidden)
set(CMAKE_VISIBILITY_INLINES_HIDDEN 1)

# Copy-cat "add_library" logic + add override.
set(HWY_LIBRARY_TYPE "SHARED")
if (NOT BUILD_SHARED_LIBS OR HWY_FORCE_STATIC_LIBS)
  set(HWY_LIBRARY_TYPE "STATIC")
endif()

# This preprocessor define will drive the build, also used in the *.pc files:
if("${HWY_LIBRARY_TYPE}" STREQUAL "SHARED")
  set(DLLEXPORT_TO_DEFINE "HWY_SHARED_DEFINE")
else()
  set(DLLEXPORT_TO_DEFINE "HWY_STATIC_DEFINE")
endif()

add_library(hwy ${HWY_LIBRARY_TYPE} ${HWY_SOURCES})
target_compile_definitions(hwy PUBLIC "${DLLEXPORT_TO_DEFINE}")
add_library(hwy STATIC ${HWY_SOURCES})
target_compile_options(hwy PRIVATE ${HWY_FLAGS})
set_property(TARGET hwy PROPERTY POSITION_INDEPENDENT_CODE ON)
set_target_properties(hwy PROPERTIES VERSION ${LIBRARY_VERSION} SOVERSION ${LIBRARY_SOVERSION})
target_include_directories(hwy PUBLIC ${CMAKE_CURRENT_LIST_DIR})
target_compile_features(hwy PUBLIC cxx_std_11)
set_target_properties(hwy PROPERTIES
  LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version)
# not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations)
if(UNIX AND NOT APPLE)
  set_property(TARGET hwy APPEND_STRING PROPERTY
    LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version")
endif()

add_library(hwy_contrib ${HWY_LIBRARY_TYPE} ${HWY_CONTRIB_SOURCES})
target_link_libraries(hwy_contrib hwy)
add_library(hwy_contrib STATIC ${HWY_CONTRIB_SOURCES})
target_compile_options(hwy_contrib PRIVATE ${HWY_FLAGS})
set_property(TARGET hwy_contrib PROPERTY POSITION_INDEPENDENT_CODE ON)
set_target_properties(hwy_contrib PROPERTIES VERSION ${LIBRARY_VERSION} SOVERSION ${LIBRARY_SOVERSION})
target_include_directories(hwy_contrib PUBLIC ${CMAKE_CURRENT_LIST_DIR})
target_compile_features(hwy_contrib PUBLIC cxx_std_11)

add_library(hwy_test ${HWY_LIBRARY_TYPE} ${HWY_TEST_SOURCES})
target_link_libraries(hwy_test hwy)
add_library(hwy_test STATIC ${HWY_TEST_SOURCES})
target_compile_options(hwy_test PRIVATE ${HWY_FLAGS})
set_property(TARGET hwy_test PROPERTY POSITION_INDEPENDENT_CODE ON)
set_target_properties(hwy_test PROPERTIES VERSION ${LIBRARY_VERSION} SOVERSION ${LIBRARY_SOVERSION})
target_include_directories(hwy_test PUBLIC ${CMAKE_CURRENT_LIST_DIR})
target_compile_features(hwy_test PUBLIC cxx_std_11)

# -------------------------------------------------------- hwy_list_targets
# Generate a tool to print the compiled-in targets as defined by the current

@@ -287,22 +219,17 @@ target_include_directories(hwy_list_targets PRIVATE
# Naked target also not always could be run (due to the lack of '.\' prefix)
# Thus effective command to run should contain the full path
# and emulator prefix (if any).
if (NOT CMAKE_CROSSCOMPILING OR CMAKE_CROSSCOMPILING_EMULATOR)
  add_custom_command(TARGET hwy_list_targets POST_BUILD
    COMMAND ${CMAKE_CROSSCOMPILING_EMULATOR} $<TARGET_FILE:hwy_list_targets> || (exit 0))
endif()

# --------------------------------------------------------
# Allow skipping the following sections for projects that do not need them:
# tests, examples, benchmarks and installation.
if (HWY_EXAMPLES_TESTS_INSTALL)

# -------------------------------------------------------- install library
if (HWY_ENABLE_INSTALL)

install(TARGETS hwy
  LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
  ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
  RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}")
  DESTINATION "${CMAKE_INSTALL_LIBDIR}")
# Install all the headers keeping the relative path to the current directory
# when installing them.
foreach (source ${HWY_SOURCES})

@@ -314,9 +241,7 @@ foreach (source ${HWY_SOURCES})
endforeach()

install(TARGETS hwy_contrib
  LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
  ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
  RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}")
  DESTINATION "${CMAKE_INSTALL_LIBDIR}")
# Install all the headers keeping the relative path to the current directory
# when installing them.
foreach (source ${HWY_CONTRIB_SOURCES})

@@ -328,9 +253,7 @@ foreach (source ${HWY_CONTRIB_SOURCES})
endforeach()

install(TARGETS hwy_test
  LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
  ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
  RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}")
  DESTINATION "${CMAKE_INSTALL_LIBDIR}")
# Install all the headers keeping the relative path to the current directory
# when installing them.
foreach (source ${HWY_TEST_SOURCES})

@@ -349,9 +272,7 @@ foreach (pc libhwy.pc libhwy-contrib.pc libhwy-test.pc)
    DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
endforeach()

endif() # HWY_ENABLE_INSTALL
# -------------------------------------------------------- Examples
if (HWY_ENABLE_EXAMPLES)

# Avoids mismatch between GTest's static CRT and our dynamic.
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)

@@ -359,6 +280,7 @@ set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
# Programming exercise with integrated benchmark
add_executable(hwy_benchmark hwy/examples/benchmark.cc)
target_sources(hwy_benchmark PRIVATE
  hwy/nanobenchmark.cc
  hwy/nanobenchmark.h)
# Try adding either -DHWY_COMPILE_ONLY_SCALAR or -DHWY_COMPILE_ONLY_STATIC to
# observe the difference in targets printed.

@@ -367,7 +289,6 @@ target_link_libraries(hwy_benchmark hwy)
set_target_properties(hwy_benchmark
  PROPERTIES RUNTIME_OUTPUT_DIRECTORY "examples/")

endif() # HWY_ENABLE_EXAMPLES
# -------------------------------------------------------- Tests

include(CTest)

@@ -431,11 +352,9 @@ set(HWY_TEST_FILES
  hwy/tests/compare_test.cc
  hwy/tests/convert_test.cc
  hwy/tests/crypto_test.cc
  hwy/tests/demote_test.cc
  hwy/tests/logical_test.cc
  hwy/tests/mask_test.cc
  hwy/tests/memory_test.cc
  hwy/tests/shift_test.cc
  hwy/tests/swizzle_test.cc
  hwy/tests/test_util_test.cc
)

@@ -458,7 +377,7 @@ foreach (TESTFILE IN LISTS HWY_TEST_FILES)
    target_link_libraries(${TESTNAME} hwy hwy_contrib hwy_test gtest gtest_main)
  endif()
  # Output test targets in the test directory.
  set_target_properties(${TESTNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "tests")
  set_target_properties(${TESTNAME} PROPERTIES PREFIX "tests/")

  if (HWY_EMSCRIPTEN)
    set_target_properties(${TESTNAME} PROPERTIES LINK_FLAGS "-s SINGLE_FILE=1")

@@ -475,3 +394,5 @@ endforeach ()
target_sources(skeleton_test PRIVATE hwy/examples/skeleton.cc)

endif() # BUILD_TESTING

endif() # HWY_EXAMPLES_TESTS_INSTALL
@@ -5,11 +5,11 @@ project(googletest-download NONE)
include(ExternalProject)
ExternalProject_Add(googletest
  GIT_REPOSITORY https://github.com/google/googletest.git
  GIT_TAG 43efa0a4efd40c78b9210d15373112081899a97c
  GIT_TAG master
  SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-src"
  BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-build"
  CONFIGURE_COMMAND ""
  BUILD_COMMAND ""
  INSTALL_COMMAND ""
  TEST_COMMAND ""
)
)
@@ -1,95 +1,30 @@
# Efficient and performance-portable vector software
# Efficient and performance-portable SIMD

[//]: # (placeholder, do not remove)
Highway is a C++ library for SIMD (Single Instruction, Multiple Data), i.e.
applying the same operation to multiple 'lanes' using a single CPU instruction.

Highway is a C++ library that provides portable SIMD/vector intrinsics.

## Why Highway?

## Why

We are passionate about high-performance software. We see major untapped
potential in CPUs (servers, mobile, desktops). Highway is for engineers who want
to reliably and economically push the boundaries of what is possible in
software.

## How

CPUs provide SIMD/vector instructions that apply the same operation to multiple
data items. This can reduce energy usage e.g. *fivefold* because fewer
instructions are executed. We also often see *5-10x* speedups.

Highway makes SIMD/vector programming practical and workable according to these
guiding principles:

**Does what you expect**: Highway is a C++ library with carefully-chosen
functions that map well to CPU instructions without extensive compiler
transformations. The resulting code is more predictable and robust to code
changes/compiler updates than autovectorization.

**Works on widely-used platforms**: Highway supports four architectures; the
same application code can target eight instruction sets, including those with
'scalable' vectors (size unknown at compile time). Highway only requires C++11
and supports four families of compilers. If you would like to use Highway on
other platforms, please raise an issue.

**Flexible to deploy**: Applications using Highway can run on heterogeneous
clouds or client devices, choosing the best available instruction set at
runtime. Alternatively, developers may choose to target a single instruction set
without any runtime overhead. In both cases, the application code is the same
except for swapping `HWY_STATIC_DISPATCH` with `HWY_DYNAMIC_DISPATCH` plus one
line of code.

**Suitable for a variety of domains**: Highway provides an extensive set of
operations, used for image processing (floating-point), compression, video
analysis, linear algebra, cryptography, sorting and random generation. We
recognise that new use-cases may require additional ops and are happy to add
them where it makes sense (e.g. no performance cliffs on some architectures). If
you would like to discuss, please file an issue.

**Rewards data-parallel design**: Highway provides tools such as Gather,
MaskedLoad, and FixedTag to enable speedups for legacy data structures. However,
the biggest gains are unlocked by designing algorithms and data structures for
scalable vectors. Helpful techniques include batching, structure-of-array
layouts, and aligned/padded allocations.

## Examples

Online demos using Compiler Explorer:

- [generating code for multiple targets](https://gcc.godbolt.org/z/n6rx6xK5h) (recommended)
- [single target using -m flags](https://gcc.godbolt.org/z/rGnjMevKG)

Projects using Highway: (to add yours, feel free to raise an issue or contact us
via the below email)

* [iresearch database index](https://github.com/iresearch-toolkit/iresearch/blob/e7638e7a4b99136ca41f82be6edccf01351a7223/core/utils/simd_utils.hpp)
* [JPEG XL image codec](https://github.com/libjxl/libjxl)
* [Grok JPEG 2000 image codec](https://github.com/GrokImageCompression/grok)
* [vectorized Quicksort](https://github.com/google/highway/tree/master/hwy/contrib/sort)
- more portable (same source code) than platform-specific intrinsics,
- works on a wider range of compilers than compiler-specific vector extensions,
- more dependable than autovectorization,
- easier to write/maintain than assembly language,
- supports **runtime dispatch**,
- supports **variable-length vector** architectures.

## Current status

### Targets

Supported targets: scalar, S-SSE3, SSE4, AVX2, AVX-512, AVX3_DL (~Icelake,
requires opt-in by defining `HWY_WANT_AVX3_DL`), NEON (ARMv7 and v8), SVE, SVE2,
requires opt-in by defining `HWY_WANT_AVX3_DL`), NEON (ARMv7 and v8), SVE,
WASM SIMD.

SVE was initially tested using farm_sve (see acknowledgments). A subset of RVV
is implemented and tested with LLVM and QEMU. Work is underway to add RVV ops
which were not yet supported by GCC.
SVE is tested using farm_sve (see acknowledgments). SVE2 is implemented but not
yet validated. A subset of RVV is implemented and tested with GCC and QEMU.
Work is underway to compile using LLVM, which has different intrinsics with AVL.

### Versioning

Highway releases aim to follow the semver.org system (MAJOR.MINOR.PATCH),
incrementing MINOR after backward-compatible additions and PATCH after
backward-compatible fixes. We recommend using releases (rather than the Git tip)
because they are tested more extensively, see below.

Version 0.11 is considered stable enough to use in other projects.
Version 1.0 will signal an increased focus on backwards compatibility and will
be reached after the RVV target is finished (planned for 2022H1).

### Testing
Version 0.11 is considered stable enough to use in other projects, and is
expected to remain backwards compatible unless serious issues are discovered
while finishing the RVV target. After that, Highway will reach version 1.0.

Continuous integration tests build with a recent version of Clang (running on
x86 and QEMU for ARM) and MSVC from VS2015 (running on x86).

@@ -98,15 +33,13 @@ Before releases, we also test on x86 with Clang and GCC, and ARMv7/8 via
GCC cross-compile and QEMU. See the
[testing process](g3doc/release_testing_process.md) for details.

### Related modules

The `contrib` directory contains SIMD-related utilities: an image class with
aligned rows, a math library (16 functions already implemented, mostly
trigonometry), and functions for computing dot products and sorting.
aligned rows, and a math library (16 functions already implemented, mostly
trigonometry).

## Installation

This project uses CMake to generate and build. In a Debian-based system you can
This project uses cmake to generate and build. In a Debian-based system you can
install it via:

```bash

@@ -122,8 +55,7 @@ installing gtest separately:
sudo apt install libgtest-dev
```

To build Highway as a shared or static library (depending on BUILD_SHARED_LIBS),
the standard CMake workflow can be used:
To build and test the library the standard cmake workflow can be used:

```bash
mkdir -p build && cd build

@@ -144,40 +76,31 @@ and their parameters, and the [instruction_matrix](g3doc/instruction_matrix.pdf)
indicates the number of instructions per operation.

We recommend using full SIMD vectors whenever possible for maximum performance
portability. To obtain them, pass a `ScalableTag<float>` (or equivalently
`HWY_FULL(float)`) tag to functions such as `Zero/Set/Load`. There are two
alternatives for use-cases requiring an upper bound on the lanes:
portability. To obtain them, pass a `HWY_FULL(float)` tag to functions such as
`Zero/Set/Load`. There is also the option of a vector of up to `N` (a power of
two <= 16/sizeof(T)) lanes of type `T`: `HWY_CAPPED(T, N)`. If `HWY_TARGET ==
HWY_SCALAR`, the vector always has one lane. For all other targets, up to
128-bit vectors are guaranteed to be available. A tag-usage sketch follows this
list.

*   For up to a power of two `N`, specify `CappedTag<T, N>` (or
    equivalently `HWY_CAPPED(T, N)`). This is useful for data structures such as
    a narrow matrix. A loop is still required because vectors may actually have
    fewer than `N` lanes.

*   For exactly a power of two `N` lanes, specify `FixedTag<T, N>`. The largest
    supported `N` depends on the target, but is guaranteed to be at least
    `16/sizeof(T)`.

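For context, a minimal sketch of how such a tag is used in application code. The
function, namespace and array names here are illustrative, not from the
repository; the ops (`Load`, `Store`, `MulAdd`, `Lanes`) are per the quick
reference, and `count` is assumed to be a multiple of the lane count (see the
padded-loop discussion further below):

```cpp
#include <cstddef>

#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace project {  // hypothetical application namespace
namespace HWY_NAMESPACE {

using namespace hwy::HWY_NAMESPACE;

// Computes x*x + x for each lane.
void ProcessArray(const float* HWY_RESTRICT in, float* HWY_RESTRICT out,
                  size_t count) {
  const HWY_FULL(float) d;  // or HWY_CAPPED(float, 4) for at most 4 lanes
  for (size_t i = 0; i < count; i += Lanes(d)) {
    const auto v = Load(d, in + i);
    Store(MulAdd(v, v, v), d, out + i);
  }
}

}  // namespace HWY_NAMESPACE
}  // namespace project
HWY_AFTER_NAMESPACE();
```

The sketch follows the namespace and attribute rules described next.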
Functions using Highway must either be inside `namespace HWY_NAMESPACE {`
(possibly nested in one or more other namespaces defined by the project), OR
each op must be prefixed with `hn::`, e.g. `namespace hn = hwy::HWY_NAMESPACE;
hn::LoadDup128()`. Additionally, each function using Highway must either be
prefixed with `HWY_ATTR`, OR reside between `HWY_BEFORE_NAMESPACE()` and
`HWY_AFTER_NAMESPACE()`.
Functions using Highway must be inside `namespace HWY_NAMESPACE {`
(possibly nested in one or more other namespaces defined by the project), and
additionally either prefixed with `HWY_ATTR`, or residing between
`HWY_BEFORE_NAMESPACE()` and `HWY_AFTER_NAMESPACE()`.

*   For static dispatch, `HWY_TARGET` will be the best available target among
    `HWY_BASELINE_TARGETS`, i.e. those allowed for use by the compiler (see
    [quick-reference](g3doc/quick_reference.md)). Functions inside
    `HWY_NAMESPACE` can be called using `HWY_STATIC_DISPATCH(func)(args)` within
    the same module they are defined in. You can call the function from other
    modules by wrapping it in a regular function and declaring the regular
    function in a header.
    [quick-reference](g3doc/quick_reference.md)). Functions inside `HWY_NAMESPACE`
    can be called using `HWY_STATIC_DISPATCH(func)(args)` within the same module
    they are defined in. You can call the function from other modules by
    wrapping it in a regular function and declaring the regular function in a
    header.

*   For dynamic dispatch, a table of function pointers is generated via the
    `HWY_EXPORT` macro that is used by `HWY_DYNAMIC_DISPATCH(func)(args)` to
    call the best function pointer for the current CPU's supported targets. A
    module is automatically compiled for each target in `HWY_TARGETS` (see
    [quick-reference](g3doc/quick_reference.md)) if `HWY_TARGET_INCLUDE` is
    defined and `foreach_target.h` is included. A condensed sketch of this
    pattern is shown below.
    defined and foreach_target.h is included.

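The following sketch of the dynamic-dispatch pattern is modeled on
`hwy/examples/skeleton.cc`; the file name, namespace and function names are
hypothetical placeholders, and only the macros named in the bullets above are
relied upon:

```cpp
#include <cstddef>

// foreach_target.h re-includes this translation unit once per target in
// HWY_TARGETS, so the file must name itself:
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "mul_add_loop.cc"  // hypothetical path
#include "hwy/foreach_target.h"
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace project {
namespace HWY_NAMESPACE {

using namespace hwy::HWY_NAMESPACE;

void MulAddLoop(const float* HWY_RESTRICT in, float* HWY_RESTRICT out,
                size_t count) {
  const HWY_FULL(float) d;
  for (size_t i = 0; i < count; i += Lanes(d)) {  // assumes padded arrays
    const auto v = Load(d, in + i);
    Store(MulAdd(v, v, v), d, out + i);
  }
}

}  // namespace HWY_NAMESPACE
}  // namespace project
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace project {

HWY_EXPORT(MulAddLoop);  // generates the table of per-target pointers

// Regular (non-SIMD) wrapper, declared in a normal header.
void CallMulAddLoop(const float* in, float* out, size_t count) {
  HWY_DYNAMIC_DISPATCH(MulAddLoop)(in, out, count);
}

}  // namespace project
#endif  // HWY_ONCE
```

`HWY_ONCE` guards the parts that must be compiled only once rather than once
per target.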

## Compiler flags

@@ -200,17 +123,17 @@ ensure proper VEX code generation for AVX2 targets.

To vectorize a loop, "strip-mining" transforms it into an outer loop and inner
loop with number of iterations matching the preferred vector width.

In this section, let `T` denote the element type, `d = ScalableTag<T>`, `count`
the number of elements to process, and `N = Lanes(d)` the number of lanes in a
full vector. Assume the loop body is given as a function `template<bool partial,
class D> void LoopBody(D d, size_t index, size_t max_n)`.
In this section, let `T` denote the element type, `d = HWY_FULL(T)`, `count` the
number of elements to process, and `N = Lanes(d)` the number of lanes in a full
vector. Assume the loop body is given as a function `template<bool partial,
class D> void LoopBody(D d, size_t max_n)`.

Highway offers several ways to express loops where `N` need not divide `count`:

*   Ensure all inputs/outputs are padded. Then the loop is simply

    ```
    for (size_t i = 0; i < count; i += N) LoopBody<false>(d, i, 0);
    for (size_t i = 0; i < count; i += N) LoopBody<false>(d, 0);
    ```
    Here, the template parameter and second function argument are not needed.

@@ -226,8 +149,8 @@ Highway offers several ways to express loops where `N` need not divide `count`:

    ```
    size_t i = 0;
    for (; i + N <= count; i += N) LoopBody<false>(d, i, 0);
    for (; i < count; ++i) LoopBody<false>(HWY_CAPPED(T, 1)(), i, 0);
    for (; i + N <= count; i += N) LoopBody<false>(d, 0);
    for (; i < count; ++i) LoopBody<false>(HWY_CAPPED(T, 1)(), 0);
    ```
    The template parameter and second function arguments are again not needed.

@@ -240,20 +163,18 @@ Highway offers several ways to express loops where `N` need not divide `count`:

    ```
    size_t i = 0;
    for (; i + N <= count; i += N) {
      LoopBody<false>(d, i, 0);
      LoopBody<false>(d, 0);
    }
    if (i < count) {
      LoopBody<true>(d, i, count - i);
      LoopBody<true>(d, count - i);
    }
    ```
    Now the template parameter and third function argument can be used inside
    Now the template parameter and second function argument can be used inside
    `LoopBody` to 'blend' the new partial vector with previous memory contents:
    `Store(IfThenElse(FirstN(d, N), partial, prev_full), d, aligned_pointer);`.

    This is a good default when it is infeasible to ensure vectors are padded.
    In contrast to the scalar loop, only a single final iteration is needed.
    The increased code size from two loop bodies is expected to be worthwhile
    because it avoids the cost of masking in all but the final iteration.
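To make the remainder-handling variant concrete, here is a self-contained
sketch. The explicit data pointer and index parameters are additions for
illustration (the README's `LoopBody` signature omits them), and the full-vector
load in the epilogue assumes the allocation is padded to a vector multiple,
e.g. via `aligned_allocator.h`:

```cpp
#include <cstddef>

#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace project {
namespace HWY_NAMESPACE {

using namespace hwy::HWY_NAMESPACE;

// kPartial selects the masked epilogue; max_n is the number of valid lanes.
template <bool kPartial, class D>
void LoopBody(D d, float* HWY_RESTRICT data, size_t i, size_t max_n) {
  const auto prev_full = Load(d, data + i);  // safe if allocation is padded
  const auto partial = MulAdd(prev_full, prev_full, prev_full);
  if (kPartial) {
    // Blend the new partial vector with previous memory contents:
    Store(IfThenElse(FirstN(d, max_n), partial, prev_full), d, data + i);
  } else {
    Store(partial, d, data + i);
  }
}

void Process(float* HWY_RESTRICT data, size_t count) {
  const HWY_FULL(float) d;
  const size_t N = Lanes(d);
  size_t i = 0;
  for (; i + N <= count; i += N) {
    LoopBody<false>(d, data, i, 0);
  }
  if (i < count) {
    LoopBody<true>(d, data, i, count - i);  // single remainder iteration
  }
}

}  // namespace HWY_NAMESPACE
}  // namespace project
HWY_AFTER_NAMESPACE();
```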

## Additional resources
@@ -1,15 +1,3 @@
highway (0.16.0-1) UNRELEASED; urgency=medium

  * Add contrib/sort (vectorized quicksort)
  * Add IfNegativeThenElse, IfVecThenElse
  * Add Reverse2,4,8, ReverseBlocks, DupEven/Odd, AESLastRound
  * Add OrAnd, Min128, Max128, Lt128, SumsOf8
  * Support capped/partial vectors on RVV/SVE, int64 in WASM
  * Support SVE2, shared library build
  * Remove deprecated overloads without the required d arg (UpperHalf etc.)

 -- Jan Wassenberg <janwas@google.com>  Thu, 03 Feb 2022 11:00:00 +0100

highway (0.15.0-1) UNRELEASED; urgency=medium

  * New ops: CompressBlendedStore, ConcatOdd/Even, IndicesFromVec
@@ -18,11 +18,8 @@
// Memory allocator with support for alignment and offsets.

#include <stddef.h>

#include <memory>

#include "hwy/highway_export.h"

namespace hwy {

// Minimum alignment of allocated memory for use in HWY_ASSUME_ALIGNED, which

@@ -39,15 +36,15 @@ using FreePtr = void (*)(void* opaque, void* memory);
// bytes of newly allocated memory, aligned to the larger of HWY_ALIGNMENT and
// the vector size. Calls `alloc` with the passed `opaque` pointer to obtain
// memory or malloc() if it is null.
HWY_DLLEXPORT void* AllocateAlignedBytes(size_t payload_size,
                                         AllocPtr alloc_ptr, void* opaque_ptr);
void* AllocateAlignedBytes(size_t payload_size, AllocPtr alloc_ptr,
                           void* opaque_ptr);

// Frees all memory. No effect if `aligned_pointer` == nullptr, otherwise it
// must have been returned from a previous call to `AllocateAlignedBytes`.
// Calls `free_ptr` with the passed `opaque_ptr` pointer to free the memory; if
// `free_ptr` function is null, uses the default free().
HWY_DLLEXPORT void FreeAlignedBytes(const void* aligned_pointer,
                                    FreePtr free_ptr, void* opaque_ptr);
void FreeAlignedBytes(const void* aligned_pointer, FreePtr free_ptr,
                      void* opaque_ptr);

// Class that deletes the aligned pointer passed to operator() calling the
// destructor before freeing the pointer. This is equivalent to the

@@ -79,10 +76,8 @@ class AlignedDeleter {
  // array. TypeArrayDeleter<T> would match this prototype.
  using ArrayDeleter = void (*)(void* t_ptr, size_t t_size);

  HWY_DLLEXPORT static void DeleteAlignedArray(void* aligned_pointer,
                                               FreePtr free_ptr,
                                               void* opaque_ptr,
                                               ArrayDeleter deleter);
  static void DeleteAlignedArray(void* aligned_pointer, FreePtr free_ptr,
                                 void* opaque_ptr, ArrayDeleter deleter);

  FreePtr free_;
  void* opaque_ptr_;

@@ -112,8 +107,8 @@ template <typename T, typename... Args>
AlignedUniquePtr<T> MakeUniqueAligned(Args&&... args) {
  T* ptr = static_cast<T*>(AllocateAlignedBytes(
      sizeof(T), /*alloc_ptr=*/nullptr, /*opaque_ptr=*/nullptr));
  return AlignedUniquePtr<T>(new (ptr) T(std::forward<Args>(args)...),
                             AlignedDeleter());
  return AlignedUniquePtr<T>(
      new (ptr) T(std::forward<Args>(args)...), AlignedDeleter());
}

// Helpers for array allocators (avoids overflow)
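For reference, a small usage sketch of this allocator's public helpers.
`AllocateAligned` and `AlignedFreeUniquePtr` are from the same header;
`MakeUniqueAligned` appears in the hunk above; the `Point` struct and `Demo`
function are illustrative:

```cpp
#include "hwy/aligned_allocator.h"

struct Point {  // illustrative payload type
  Point(int px, int py) : x(px), y(py) {}
  int x;
  int y;
};

void Demo() {
  // Array of 100 floats, aligned to the larger of HWY_ALIGNMENT and the
  // vector size; freed on scope exit by the deleter in the unique_ptr.
  hwy::AlignedFreeUniquePtr<float[]> arr = hwy::AllocateAligned<float>(100);
  arr[0] = 1.0f;

  // Single aligned object, constructed in place via MakeUniqueAligned;
  // AlignedDeleter runs ~Point() before freeing the memory.
  hwy::AlignedUniquePtr<Point> p = hwy::MakeUniqueAligned<Point>(2, 3);
}
```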
@@ -24,7 +24,6 @@
#include <cfloat>

#include "hwy/detect_compiler_arch.h"
#include "hwy/highway_export.h"

//------------------------------------------------------------------------------
// Compiler-specific definitions

@@ -185,6 +184,10 @@
  } while (0)
#endif

#if defined(HWY_EMULATE_SVE)
class FarmFloat16;
#endif

namespace hwy {

//------------------------------------------------------------------------------

@@ -202,9 +205,7 @@ static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
//------------------------------------------------------------------------------
// Alignment

// Potentially useful for LoadDup128 and capped vectors. In other cases, arrays
// should be allocated dynamically via aligned_allocator.h because Lanes() may
// exceed the stack size.
// For stack-allocated partial arrays or LoadDup128.
#if HWY_ARCH_X86
#define HWY_ALIGN_MAX alignas(64)
#elif HWY_ARCH_RVV && defined(__riscv_vector)

@@ -227,7 +228,9 @@ static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;

#pragma pack(push, 1)

#if HWY_NATIVE_FLOAT16
#if defined(HWY_EMULATE_SVE)
using float16_t = FarmFloat16;
#elif HWY_NATIVE_FLOAT16
using float16_t = __fp16;
// Clang does not allow __fp16 arguments, but scalar.h requires LaneType
// arguments, so use a wrapper.

@@ -250,15 +253,15 @@ using float64_t = double;
//------------------------------------------------------------------------------
// Controlling overload resolution (SFINAE)

template <bool Condition>
template <bool Condition, class T>
struct EnableIfT {};
template <>
struct EnableIfT<true> {
  using type = void;
template <class T>
struct EnableIfT<true, T> {
  using type = T;
};

template <bool Condition>
using EnableIf = typename EnableIfT<Condition>::type;
template <bool Condition, class T = void>
using EnableIf = typename EnableIfT<Condition, T>::type;

template <typename T, typename U>
struct IsSameT {

@@ -280,7 +283,7 @@ HWY_API constexpr bool IsSame() {
//
// Note that enabling for exactly 128 bits is unnecessary because a function can
// simply be overloaded with Vec128<T> and/or Full128<T> tag. Enabling for other
// sizes (e.g. 64 bit) can be achieved via Simd<T, 8 / sizeof(T), 0>.
// sizes (e.g. 64 bit) can be achieved via Simd<T, 8 / sizeof(T)>.
#define HWY_IF_LE128(T, N) hwy::EnableIf<N * sizeof(T) <= 16>* = nullptr
#define HWY_IF_LE64(T, N) hwy::EnableIf<N * sizeof(T) <= 8>* = nullptr
#define HWY_IF_LE32(T, N) hwy::EnableIf<N * sizeof(T) <= 4>* = nullptr
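A small self-contained illustration of how this `EnableIf` alias drives
overload selection, in the same style as the `HWY_IF_LE*` macros above (the
`FitsIn64Bits` function is made up for the demo):

```cpp
#include <cstddef>

#include "hwy/base.h"

// Exactly one overload survives template substitution for a given (T, N).
template <typename T, size_t N, hwy::EnableIf<(N * sizeof(T) <= 8)>* = nullptr>
constexpr bool FitsIn64Bits() { return true; }

template <typename T, size_t N, hwy::EnableIf<!(N * sizeof(T) <= 8)>* = nullptr>
constexpr bool FitsIn64Bits() { return false; }

static_assert(FitsIn64Bits<float, 2>(), "2 x 4 bytes fits in 64 bits");
static_assert(!FitsIn64Bits<float, 4>(), "4 x 4 bytes selects the other overload");
```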
@ -316,6 +319,102 @@ struct RemoveConstT<const T> {
|
|||
template <class T>
|
||||
using RemoveConst = typename RemoveConstT<T>::type;
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Type traits
|
||||
|
||||
template <typename T>
|
||||
HWY_API constexpr bool IsFloat() {
|
||||
// Cannot use T(1.25) != T(1) for float16_t, which can only be converted to or
|
||||
// from a float, not compared.
|
||||
return IsSame<T, float>() || IsSame<T, double>();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
HWY_API constexpr bool IsSigned() {
|
||||
return T(0) > T(-1);
|
||||
}
|
||||
template <>
|
||||
constexpr bool IsSigned<float16_t>() {
|
||||
return true;
|
||||
}
|
||||
template <>
|
||||
constexpr bool IsSigned<bfloat16_t>() {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Largest/smallest representable integer values.
|
||||
template <typename T>
|
||||
HWY_API constexpr T LimitsMax() {
|
||||
static_assert(!IsFloat<T>(), "Only for integer types");
|
||||
return IsSigned<T>() ? T((1ULL << (sizeof(T) * 8 - 1)) - 1)
|
||||
: static_cast<T>(~0ull);
|
||||
}
|
||||
template <typename T>
|
||||
HWY_API constexpr T LimitsMin() {
|
||||
static_assert(!IsFloat<T>(), "Only for integer types");
|
||||
return IsSigned<T>() ? T(-1) - LimitsMax<T>() : T(0);
|
||||
}
|
||||
|
||||
// Largest/smallest representable value (integer or float). This naming avoids
|
||||
// confusion with numeric_limits<float>::min() (the smallest positive value).
|
||||
template <typename T>
|
||||
HWY_API constexpr T LowestValue() {
|
||||
return LimitsMin<T>();
|
||||
}
|
||||
template <>
|
||||
constexpr float LowestValue<float>() {
|
||||
return -FLT_MAX;
|
||||
}
|
||||
template <>
|
||||
constexpr double LowestValue<double>() {
|
||||
return -DBL_MAX;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
HWY_API constexpr T HighestValue() {
|
||||
return LimitsMax<T>();
|
||||
}
|
||||
template <>
|
||||
constexpr float HighestValue<float>() {
|
||||
return FLT_MAX;
|
||||
}
|
||||
template <>
|
||||
constexpr double HighestValue<double>() {
|
||||
return DBL_MAX;
|
||||
}
|
||||
|
||||
// Returns bitmask of the exponent field in IEEE binary32/64.
|
||||
template <typename T>
|
||||
constexpr T ExponentMask() {
|
||||
static_assert(sizeof(T) == 0, "Only instantiate the specializations");
|
||||
return 0;
|
||||
}
|
||||
template <>
|
||||
constexpr uint32_t ExponentMask<uint32_t>() {
|
||||
return 0x7F800000;
|
||||
}
|
||||
template <>
|
||||
constexpr uint64_t ExponentMask<uint64_t>() {
|
||||
return 0x7FF0000000000000ULL;
|
||||
}
|
||||
|
||||
// Returns 1 << mantissa_bits as a floating-point number. All integers whose
|
||||
// absolute value are less than this can be represented exactly.
|
||||
template <typename T>
|
||||
constexpr T MantissaEnd() {
|
||||
static_assert(sizeof(T) == 0, "Only instantiate the specializations");
|
||||
return 0;
|
||||
}
|
||||
template <>
|
||||
constexpr float MantissaEnd<float>() {
|
||||
return 8388608.0f; // 1 << 23
|
||||
}
|
||||
template <>
|
||||
constexpr double MantissaEnd<double>() {
|
||||
// floating point literal with p52 requires C++17.
|
||||
return 4503599627370496.0; // 1 << 52
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Type relations
|
||||
|
||||
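A few spot-checks of these traits; all are compile-time constants, so plain
`static_assert`s suffice (the values follow from two's complement and IEEE-754
as described in the comments above):

```cpp
#include <cfloat>
#include <cstdint>

#include "hwy/base.h"

static_assert(hwy::LimitsMax<int8_t>() == 127, "2^7 - 1");
static_assert(hwy::LimitsMin<int8_t>() == -128, "-(2^7)");
static_assert(hwy::LimitsMax<uint16_t>() == 65535, "2^16 - 1");
static_assert(hwy::LowestValue<double>() == -DBL_MAX,
              "not numeric_limits<double>::min()");
static_assert(hwy::MantissaEnd<float>() == 8388608.0f, "1 << 23");
```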
@@ -457,118 +556,6 @@ using SignedFromSize = typename detail::TypeFromSize<N>::Signed;
template <size_t N>
using FloatFromSize = typename detail::TypeFromSize<N>::Float;

//------------------------------------------------------------------------------
// Type traits

template <typename T>
HWY_API constexpr bool IsFloat() {
  // Cannot use T(1.25) != T(1) for float16_t, which can only be converted to or
  // from a float, not compared.
  return IsSame<T, float>() || IsSame<T, double>();
}

template <typename T>
HWY_API constexpr bool IsSigned() {
  return T(0) > T(-1);
}
template <>
constexpr bool IsSigned<float16_t>() {
  return true;
}
template <>
constexpr bool IsSigned<bfloat16_t>() {
  return true;
}

// Largest/smallest representable integer values.
template <typename T>
HWY_API constexpr T LimitsMax() {
  static_assert(!IsFloat<T>(), "Only for integer types");
  using TU = MakeUnsigned<T>;
  return static_cast<T>(IsSigned<T>() ? (static_cast<TU>(~0ull) >> 1)
                                      : static_cast<TU>(~0ull));
}
template <typename T>
HWY_API constexpr T LimitsMin() {
  static_assert(!IsFloat<T>(), "Only for integer types");
  return IsSigned<T>() ? T(-1) - LimitsMax<T>() : T(0);
}

// Largest/smallest representable value (integer or float). This naming avoids
// confusion with numeric_limits<float>::min() (the smallest positive value).
template <typename T>
HWY_API constexpr T LowestValue() {
  return LimitsMin<T>();
}
template <>
constexpr float LowestValue<float>() {
  return -FLT_MAX;
}
template <>
constexpr double LowestValue<double>() {
  return -DBL_MAX;
}

template <typename T>
HWY_API constexpr T HighestValue() {
  return LimitsMax<T>();
}
template <>
constexpr float HighestValue<float>() {
  return FLT_MAX;
}
template <>
constexpr double HighestValue<double>() {
  return DBL_MAX;
}

// Returns bitmask of the exponent field in IEEE binary32/64.
template <typename T>
constexpr T ExponentMask() {
  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
  return 0;
}
template <>
constexpr uint32_t ExponentMask<uint32_t>() {
  return 0x7F800000;
}
template <>
constexpr uint64_t ExponentMask<uint64_t>() {
  return 0x7FF0000000000000ULL;
}

// Returns bitmask of the mantissa field in IEEE binary32/64.
template <typename T>
constexpr T MantissaMask() {
  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
  return 0;
}
template <>
constexpr uint32_t MantissaMask<uint32_t>() {
  return 0x007FFFFF;
}
template <>
constexpr uint64_t MantissaMask<uint64_t>() {
  return 0x000FFFFFFFFFFFFFULL;
}

// Returns 1 << mantissa_bits as a floating-point number. All integers whose
// absolute value are less than this can be represented exactly.
template <typename T>
constexpr T MantissaEnd() {
  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
  return 0;
}
template <>
constexpr float MantissaEnd<float>() {
  return 8388608.0f;  // 1 << 23
}
template <>
constexpr double MantissaEnd<double>() {
  // floating point literal with p52 requires C++17.
  return 4503599627370496.0;  // 1 << 52
}

//------------------------------------------------------------------------------
// Helper functions
@@ -674,21 +661,14 @@ HWY_API size_t PopCount(uint64_t x) {
#endif
}

// Skip HWY_API due to GCC "function not considered for inlining". Previously
// such errors were caused by underlying type mismatches, but it's not clear
// what is still mismatched despite all the casts.
template <typename TI>
/*HWY_API*/ constexpr size_t FloorLog2(TI x) {
  return x == TI{1}
             ? 0
             : static_cast<size_t>(FloorLog2(static_cast<TI>(x >> 1)) + 1);
HWY_API constexpr size_t FloorLog2(TI x) {
  return x == 1 ? 0 : FloorLog2(x >> 1) + 1;
}

template <typename TI>
/*HWY_API*/ constexpr size_t CeilLog2(TI x) {
  return x == TI{1}
             ? 0
             : static_cast<size_t>(FloorLog2(static_cast<TI>(x - 1)) + 1);
HWY_API constexpr size_t CeilLog2(TI x) {
  return x == 1 ? 0 : FloorLog2(x - 1) + 1;
}

#if HWY_COMPILER_MSVC && HWY_ARCH_X86_64
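Both recursions bottom out at x == 1; since the functions are `constexpr`, a
few example values can be checked at compile time:

```cpp
#include "hwy/base.h"

static_assert(hwy::FloorLog2(8u) == 3, "exact power of two");
static_assert(hwy::FloorLog2(9u) == 3, "rounds down");
static_assert(hwy::CeilLog2(8u) == 3, "exact power of two");
static_assert(hwy::CeilLog2(9u) == 4, "rounds up");
```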
@@ -747,7 +727,7 @@ HWY_API bfloat16_t BF16FromF32(float f) {
  return bf;
}

HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
HWY_NORETURN void HWY_FORMAT(3, 4)
    Abort(const char* file, int line, const char* format, ...);

}  // namespace hwy
@@ -36,7 +36,9 @@
// undefine them in this header; these functions are anyway deprecated.
// TODO(janwas): remove when these functions are removed.
#pragma push_macro("LoadFence")
#pragma push_macro("StoreFence")
#undef LoadFence
#undef StoreFence

namespace hwy {

@@ -70,6 +72,9 @@ HWY_INLINE HWY_ATTR_CACHE void FlushStream() {
#endif
}

// DEPRECATED, replace with `FlushStream`.
HWY_INLINE HWY_ATTR_CACHE void StoreFence() { FlushStream(); }

// Optionally begins loading the cache line containing "p" to reduce latency of
// subsequent actual loads.
template <typename T>

@@ -104,6 +109,7 @@ HWY_INLINE HWY_ATTR_CACHE void Pause() {
}  // namespace hwy

// TODO(janwas): remove when these functions are removed. (See above.)
#pragma pop_macro("StoreFence")
#pragma pop_macro("LoadFence")

#endif  // HIGHWAY_HWY_CACHE_CONTROL_H_
@@ -14,14 +14,15 @@

#include "hwy/contrib/image/image.h"

#include <algorithm>  // swap
#include <cstddef>

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/image/image.cc"

#include <algorithm>  // swap

#include "hwy/foreach_target.h"
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

@@ -27,13 +27,12 @@

#include "hwy/aligned_allocator.h"
#include "hwy/base.h"
#include "hwy/highway_export.h"

namespace hwy {

// Type-independent parts of Image<> - reduces code duplication and facilitates
// moving member function implementations to cc file.
struct HWY_CONTRIB_DLLEXPORT ImageBase {
struct ImageBase {
  // Returns required alignment in bytes for externally allocated memory.
  static size_t VectorSize();

@@ -101,7 +100,8 @@ struct HWY_CONTRIB_DLLEXPORT ImageBase {
 protected:
  // Returns pointer to the start of a row.
  HWY_INLINE void* VoidRow(const size_t y) const {
#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
    defined(THREAD_SANITIZER)
    if (y >= ysize_) {
      HWY_ABORT("Row(%" PRIu64 ") >= %u\n", static_cast<uint64_t>(y), ysize_);
    }

@@ -291,7 +291,8 @@ class Image3 {
 private:
  // Returns pointer to the start of a row.
  HWY_INLINE void* VoidPlaneRow(const size_t c, const size_t y) const {
#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
    defined(THREAD_SANITIZER)
    if (c >= kNumPlanes || y >= ysize()) {
      HWY_ABORT("PlaneRow(%" PRIu64 ", %" PRIu64 ") >= %" PRIu64 "\n",
                static_cast<uint64_t>(c), static_cast<uint64_t>(y),

@@ -51,7 +51,7 @@ struct TestAlignedT {
    for (size_t y = 0; y < ysize; ++y) {
      T* HWY_RESTRICT row = img.MutableRow(y);
      for (size_t x = 0; x < xsize; x += Lanes(d)) {
        const auto values = Iota(d, static_cast<T>(dist(rng)));
        const auto values = Iota(d, dist(rng));
        Store(values, d, row + x);
      }
    }

@@ -486,7 +486,7 @@ struct AsinImpl<float> {
  }
};

#if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
#if HWY_CAP_FLOAT64 && HWY_CAP_INTEGER64

template <>
struct AsinImpl<double> {

@@ -531,7 +531,7 @@ struct AtanImpl<float> {
  }
};

#if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
#if HWY_CAP_FLOAT64 && HWY_CAP_INTEGER64

template <>
struct AtanImpl<double> {

@@ -635,7 +635,7 @@ struct CosSinImpl<float> {
  }
};

#if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
#if HWY_CAP_FLOAT64 && HWY_CAP_INTEGER64

template <>
struct CosSinImpl<double> {

@@ -787,7 +787,7 @@ struct LogImpl<float> {
  }
};

#if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
#if HWY_CAP_FLOAT64 && HWY_CAP_INTEGER64
template <>
struct ExpImpl<double> {
  // Rounds double toward zero and returns as int32_t.

@@ -61,7 +61,7 @@ HWY_NOINLINE void TestMath(const std::string name, T (*fx1)(T),

  uint64_t max_ulp = 0;
  // Emulation is slower, so cannot afford as many.
  constexpr UintT kSamplesPerRange = static_cast<UintT>(AdjustedReps(4000));
  constexpr UintT kSamplesPerRange = static_cast<UintT>(AdjustedReps(10000));
  for (int range_index = 0; range_index < range_count; ++range_index) {
    const UintT start = ranges[range_index][0];
    const UintT stop = ranges[range_index][1];

@@ -96,11 +96,24 @@ HWY_NOINLINE void TestMath(const std::string name, T (*fx1)(T),
  HWY_ASSERT(max_ulp <= max_error_ulp);
}

// TODO(janwas): remove once RVV supports fractional LMUL
#undef DEFINE_MATH_TEST_FUNC
#if HWY_TARGET == HWY_RVV

#define DEFINE_MATH_TEST_FUNC(NAME)                    \
  HWY_NOINLINE void TestAll##NAME() {                  \
    ForFloatTypes(ForShrinkableVectors<Test##NAME>()); \
  }

#else

#define DEFINE_MATH_TEST_FUNC(NAME)                  \
  HWY_NOINLINE void TestAll##NAME() {                \
    ForFloatTypes(ForPartialVectors<Test##NAME>());  \
  }

#endif

#undef DEFINE_MATH_TEST
#define DEFINE_MATH_TEST(NAME, F32x1, F32xN, F32_MIN, F32_MAX, F32_ERROR, \
                         F64x1, F64xN, F64_MIN, F64_MAX, F64_ERROR)       \
@@ -1,133 +0,0 @@
package(default_visibility = ["//visibility:public"])

licenses(["notice"])

# Unused on Bazel builds, where this is not defined/known; Copybara replaces
# usages with an empty list.
COMPAT = [
    "//buildenv/target:non_prod",  # includes mobile/vendor.
]

cc_library(
    name = "vqsort",
    srcs = [
        # Split into separate files to reduce MSVC build time.
        "vqsort.cc",
        "vqsort_i16a.cc",
        "vqsort_i16d.cc",
        "vqsort_u16a.cc",
        "vqsort_u16d.cc",
        "vqsort_f32a.cc",
        "vqsort_f32d.cc",
        "vqsort_i32a.cc",
        "vqsort_i32d.cc",
        "vqsort_u32a.cc",
        "vqsort_u32d.cc",
        "vqsort_f64a.cc",
        "vqsort_f64d.cc",
        "vqsort_i64a.cc",
        "vqsort_i64d.cc",
        "vqsort_u64a.cc",
        "vqsort_u64d.cc",
        "vqsort_128a.cc",
        "vqsort_128d.cc",
    ],
    hdrs = [
        "disabled_targets.h",
        "vqsort.h",  # public interface
    ],
    compatible_with = [],
    textual_hdrs = [
        "shared-inl.h",
        "sorting_networks-inl.h",
        "traits-inl.h",
        "traits128-inl.h",
        "vqsort-inl.h",
    ],
    deps = [
        # Only if VQSORT_SECURE_RNG is set.
        # "//third_party/absl/random",
        "//:hwy",
    ],
)

# -----------------------------------------------------------------------------
# Internal-only targets

cc_library(
    name = "helpers",
    testonly = 1,
    textual_hdrs = [
        "algo-inl.h",
        "result-inl.h",
    ],
    deps = [
        ":vqsort",
        "//:nanobenchmark",
        # Required for HAVE_PDQSORT, but that is unused and this is
        # unavailable to Bazel builds, hence commented out.
        # "//third_party/boost/allowed",
        # Avoid ips4o and thus TBB to work around hwloc build failure.
    ],
)

cc_binary(
    name = "print_network",
    testonly = 1,
    srcs = ["print_network.cc"],
    deps = [
        ":helpers",
        ":vqsort",
        "//:hwy",
    ],
)

cc_test(
    name = "sort_test",
    size = "medium",
    srcs = ["sort_test.cc"],
    features = ["fully_static_link"],
    linkstatic = True,
    local_defines = ["HWY_IS_TEST"],
    # for test_suite.
    tags = ["hwy_ops_test"],
    deps = [
        ":helpers",
        ":vqsort",
        "@com_google_googletest//:gtest_main",
        "//:hwy",
        "//:hwy_test_util",
    ],
)

cc_binary(
    name = "bench_sort",
    testonly = 1,
    srcs = ["bench_sort.cc"],
    features = ["fully_static_link"],
    linkstatic = True,
    local_defines = ["HWY_IS_TEST"],
    deps = [
        ":helpers",
        ":vqsort",
        "@com_google_googletest//:gtest_main",
        "//:hwy",
        "//:hwy_test_util",
    ],
)

cc_binary(
    name = "bench_parallel",
    testonly = 1,
    srcs = ["bench_parallel.cc"],
    features = ["fully_static_link"],
    linkstatic = True,
    local_defines = ["HWY_IS_TEST"],
    deps = [
        ":helpers",
        ":vqsort",
        "@com_google_googletest//:gtest_main",
        "//:hwy",
        "//:hwy_test_util",
    ],
)
|
@@ -1,395 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Normal include guard for target-independent parts
#ifndef HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_
#define HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_

#include <stdint.h>
#include <string.h>  // memcpy

#include <algorithm>
#include <cmath>  // std::abs
#include <vector>

#include "hwy/base.h"
#include "hwy/contrib/sort/vqsort.h"

// Third-party algorithms
#define HAVE_AVX2SORT 0
#define HAVE_IPS4O 0
#define HAVE_PARALLEL_IPS4O (HAVE_IPS4O && 1)
#define HAVE_PDQSORT 0
#define HAVE_SORT512 0

#if HAVE_AVX2SORT
HWY_PUSH_ATTRIBUTES("avx2,avx")
#include "avx2sort.h"
HWY_POP_ATTRIBUTES
#endif
#if HAVE_IPS4O
#include "third_party/ips4o/include/ips4o.hpp"
#include "third_party/ips4o/include/ips4o/thread_pool.hpp"
#endif
#if HAVE_PDQSORT
#include "third_party/boost/allowed/sort/sort.hpp"
#endif
#if HAVE_SORT512
#include "sort512.h"
#endif

namespace hwy {

enum class Dist { kUniform8, kUniform16, kUniform32 };

std::vector<Dist> AllDist() {
  return {/*Dist::kUniform8,*/ Dist::kUniform16, Dist::kUniform32};
}

const char* DistName(Dist dist) {
  switch (dist) {
    case Dist::kUniform8:
      return "uniform8";
    case Dist::kUniform16:
      return "uniform16";
    case Dist::kUniform32:
      return "uniform32";
  }
  return "unreachable";
}

template <typename T>
class InputStats {
 public:
  void Notify(T value) {
    min_ = std::min(min_, value);
    max_ = std::max(max_, value);
    sumf_ += static_cast<double>(value);
    count_ += 1;
  }

  bool operator==(const InputStats& other) const {
    if (count_ != other.count_) {
      HWY_ABORT("count %d vs %d\n", static_cast<int>(count_),
                static_cast<int>(other.count_));
    }

    if (min_ != other.min_ || max_ != other.max_) {
      HWY_ABORT("minmax %f/%f vs %f/%f\n", double(min_), double(max_),
                double(other.min_), double(other.max_));
    }

    // Sum helps detect duplicated/lost values
    if (sumf_ != other.sumf_) {
      // Allow some tolerance because kUniform32 * num can exceed double
      // precision.
      const double mul = 1E-9;  // prevent destructive cancellation
      const double err = std::abs(sumf_ * mul - other.sumf_ * mul);
      if (err > 1E-3) {
        HWY_ABORT("Sum mismatch %.15e %.15e (%f) min %g max %g\n", sumf_,
                  other.sumf_, err, double(min_), double(max_));
      }
    }

    return true;
  }

 private:
  T min_ = hwy::HighestValue<T>();
  T max_ = hwy::LowestValue<T>();
  double sumf_ = 0.0;
  size_t count_ = 0;
};
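
A minimal sketch of the intended use, mirroring the GenerateInput/VerifySort pattern below: snapshot the stats before and after sorting, then compare to detect lost, duplicated, or corrupted keys. CheckSortPreservesKeys is a hypothetical helper, not part of this file:

// Hypothetical helper showing how InputStats validates a sort.
template <typename T>
void CheckSortPreservesKeys(T* keys, size_t num) {
  InputStats<T> before;
  for (size_t i = 0; i < num; ++i) before.Notify(keys[i]);
  std::sort(keys, keys + num);  // stand-in for the sort under test
  InputStats<T> after;
  for (size_t i = 0; i < num; ++i) after.Notify(keys[i]);
  // operator== aborts with a diagnostic on any count/min/max/sum mismatch.
  HWY_ASSERT(before == after);
}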

enum class Algo {
#if HAVE_AVX2SORT
  kSEA,
#endif
#if HAVE_IPS4O
  kIPS4O,
#endif
#if HAVE_PARALLEL_IPS4O
  kParallelIPS4O,
#endif
#if HAVE_PDQSORT
  kPDQ,
#endif
#if HAVE_SORT512
  kSort512,
#endif
  kStd,
  kVQSort,
  kHeap,
};

const char* AlgoName(Algo algo) {
  switch (algo) {
#if HAVE_AVX2SORT
    case Algo::kSEA:
      return "sea";
#endif
#if HAVE_IPS4O
    case Algo::kIPS4O:
      return "ips4o";
#endif
#if HAVE_PARALLEL_IPS4O
    case Algo::kParallelIPS4O:
      return "par_ips4o";
#endif
#if HAVE_PDQSORT
    case Algo::kPDQ:
      return "pdq";
#endif
#if HAVE_SORT512
    case Algo::kSort512:
      return "sort512";
#endif
    case Algo::kStd:
      return "std";
    case Algo::kVQSort:
      return "vq";
    case Algo::kHeap:
      return "heap";
  }
  return "unreachable";
}

}  // namespace hwy
#endif  // HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_

// Per-target
#if defined(HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE) == \
    defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
#undef HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
#else
#define HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
#endif

#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/traits128-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"  // HeapSort
#include "hwy/tests/test_util-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

class Xorshift128Plus {
  static HWY_INLINE uint64_t SplitMix64(uint64_t z) {
    z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull;
    z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
    return z ^ (z >> 31);
  }

 public:
  // Generates two vectors of 64-bit seeds via SplitMix64 and stores into
  // `seeds`. Generating these afresh in each ChoosePivot is too expensive.
  template <class DU64>
  static void GenerateSeeds(DU64 du64, TFromD<DU64>* HWY_RESTRICT seeds) {
    seeds[0] = SplitMix64(0x9E3779B97F4A7C15ull);
    for (size_t i = 1; i < 2 * Lanes(du64); ++i) {
      seeds[i] = SplitMix64(seeds[i - 1]);
    }
  }

  // Need to pass in the state because vectors cannot be class members.
  template <class DU64>
  static Vec<DU64> RandomBits(DU64 /* tag */, Vec<DU64>& state0,
                              Vec<DU64>& state1) {
    Vec<DU64> s1 = state0;
    Vec<DU64> s0 = state1;
    const Vec<DU64> bits = Add(s1, s0);
    state0 = s0;
    s1 = Xor(s1, ShiftLeft<23>(s1));
    state1 = Xor(s1, Xor(s0, Xor(ShiftRight<18>(s1), ShiftRight<5>(s0))));
    return bits;
  }
};
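
For comparison, a minimal scalar sketch of the same xorshift128+ recurrence (a standalone illustration, not part of the library; the vector version above simply advances every lane with this update in parallel):

// Scalar xorshift128+ step: returns the next 64 random bits, updates s0/s1.
static inline uint64_t Xorshift128PlusScalar(uint64_t& s0, uint64_t& s1) {
  uint64_t x = s0;
  const uint64_t y = s1;
  const uint64_t bits = x + y;  // output computed before the state update
  s0 = y;
  x ^= x << 23;
  s1 = x ^ y ^ (x >> 18) ^ (y >> 5);
  return bits;
}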

template <typename T, class DU64, HWY_IF_NOT_FLOAT(T)>
Vec<DU64> RandomValues(DU64 du64, Vec<DU64>& s0, Vec<DU64>& s1,
                       const Vec<DU64> mask) {
  const Vec<DU64> bits = Xorshift128Plus::RandomBits(du64, s0, s1);
  return And(bits, mask);
}

// Important to avoid denormals, which are flushed to zero by SIMD but not
// scalar sorts, and NaN, which may be ordered differently in scalar vs. SIMD.
template <typename T, class DU64, HWY_IF_FLOAT(T)>
Vec<DU64> RandomValues(DU64 du64, Vec<DU64>& s0, Vec<DU64>& s1,
                       const Vec<DU64> mask) {
  const Vec<DU64> bits = Xorshift128Plus::RandomBits(du64, s0, s1);
  const Vec<DU64> values = And(bits, mask);
#if HWY_TARGET == HWY_SCALAR  // Cannot repartition u64 to i32
  const RebindToSigned<DU64> di;
#else
  const Repartition<MakeSigned<T>, DU64> di;
#endif
  const RebindToFloat<decltype(di)> df;
  // Avoid NaN/denormal by converting from (range-limited) integer.
  const Vec<DU64> no_nan =
      And(values, Set(du64, MantissaMask<MakeUnsigned<T>>()));
  return BitCast(du64, ConvertTo(df, BitCast(di, no_nan)));
}

template <class DU64>
Vec<DU64> MaskForDist(DU64 du64, const Dist dist, size_t sizeof_t) {
  switch (sizeof_t) {
    case 2:
      return Set(du64, (dist == Dist::kUniform8) ? 0x00FF00FF00FF00FFull
                                                 : 0xFFFFFFFFFFFFFFFFull);
    case 4:
      return Set(du64, (dist == Dist::kUniform8)    ? 0x000000FF000000FFull
                       : (dist == Dist::kUniform16) ? 0x0000FFFF0000FFFFull
                                                    : 0xFFFFFFFFFFFFFFFFull);
    case 8:
      return Set(du64, (dist == Dist::kUniform8)    ? 0x00000000000000FFull
                       : (dist == Dist::kUniform16) ? 0x000000000000FFFFull
                                                    : 0x00000000FFFFFFFFull);
    default:
      HWY_ABORT("Logic error");
      return Zero(du64);
  }
}

template <typename T>
InputStats<T> GenerateInput(const Dist dist, T* v, size_t num) {
  SortTag<uint64_t> du64;
  using VU64 = Vec<decltype(du64)>;
  const size_t N64 = Lanes(du64);
  auto buf = hwy::AllocateAligned<uint64_t>(2 * N64);
  Xorshift128Plus::GenerateSeeds(du64, buf.get());
  auto s0 = Load(du64, buf.get());
  auto s1 = Load(du64, buf.get() + N64);

  const VU64 mask = MaskForDist(du64, dist, sizeof(T));

  const Repartition<T, decltype(du64)> d;
  const size_t N = Lanes(d);
  size_t i = 0;
  for (; i + N <= num; i += N) {
    const VU64 bits = RandomValues<T>(du64, s0, s1, mask);
#if HWY_ARCH_RVV
    // v may not be 64-bit aligned
    StoreU(bits, du64, buf.get());
    memcpy(v + i, buf.get(), N64 * sizeof(uint64_t));
#else
    StoreU(bits, du64, reinterpret_cast<uint64_t*>(v + i));
#endif
  }
  if (i < num) {
    const VU64 bits = RandomValues<T>(du64, s0, s1, mask);
    StoreU(bits, du64, buf.get());
    memcpy(v + i, buf.get(), (num - i) * sizeof(T));
  }

  InputStats<T> input_stats;
  for (size_t i = 0; i < num; ++i) {
    input_stats.Notify(v[i]);
  }
  return input_stats;
}

struct ThreadLocal {
  Sorter sorter;
};

struct SharedState {
#if HAVE_PARALLEL_IPS4O
  ips4o::StdThreadPool pool{
      HWY_MIN(16, static_cast<int>(std::thread::hardware_concurrency() / 2))};
#endif
  std::vector<ThreadLocal> tls{1};
};

template <class Order, typename T>
void Run(Algo algo, T* HWY_RESTRICT inout, size_t num, SharedState& shared,
         size_t thread) {
  using detail::HeapSort;
  using detail::LaneTraits;
  using detail::SharedTraits;

  switch (algo) {
#if HAVE_AVX2SORT
    case Algo::kSEA:
      return avx2::quicksort(inout, static_cast<int>(num));
#endif

#if HAVE_IPS4O
    case Algo::kIPS4O:
      if (Order().IsAscending()) {
        return ips4o::sort(inout, inout + num, std::less<T>());
      } else {
        return ips4o::sort(inout, inout + num, std::greater<T>());
      }
#endif

#if HAVE_PARALLEL_IPS4O
    case Algo::kParallelIPS4O:
      if (Order().IsAscending()) {
        return ips4o::parallel::sort(inout, inout + num, std::less<T>());
      } else {
        return ips4o::parallel::sort(inout, inout + num, std::greater<T>());
      }
#endif

#if HAVE_SORT512
    case Algo::kSort512:
      HWY_ABORT("not supported");
      // return Sort512::Sort(inout, num);
#endif

#if HAVE_PDQSORT
    case Algo::kPDQ:
      if (Order().IsAscending()) {
        return boost::sort::pdqsort_branchless(inout, inout + num,
                                               std::less<T>());
      } else {
        return boost::sort::pdqsort_branchless(inout, inout + num,
                                               std::greater<T>());
      }
#endif

    case Algo::kStd:
      if (Order().IsAscending()) {
        return std::sort(inout, inout + num, std::less<T>());
      } else {
        return std::sort(inout, inout + num, std::greater<T>());
      }

    case Algo::kVQSort:
      return shared.tls[thread].sorter(inout, num, Order());

    case Algo::kHeap:
      HWY_ASSERT(sizeof(T) < 16);
      if (Order().IsAscending()) {
        const SharedTraits<LaneTraits<detail::OrderAscending>> st;
        return HeapSort(st, inout, num);
      } else {
        const SharedTraits<LaneTraits<detail::OrderDescending>> st;
        return HeapSort(st, inout, num);
      }

    default:
      HWY_ABORT("Not implemented");
  }
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#endif  // HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
@@ -1,243 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Concurrent, independent sorts for generating more memory traffic and testing
// scalability.

// clang-format off
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/bench_parallel.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/algo-inl.h"
#include "hwy/contrib/sort/result-inl.h"
#include "hwy/aligned_allocator.h"
// Last
#include "hwy/tests/test_util-inl.h"
// clang-format on

#include <stdint.h>
#include <stdio.h>

#include <condition_variable>  //NOLINT
#include <functional>
#include <memory>
#include <mutex>   //NOLINT
#include <thread>  //NOLINT
#include <utility>
#include <vector>

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace {

#if HWY_TARGET != HWY_SCALAR

class ThreadPool {
 public:
  // Starts the given number of worker threads and blocks until they are ready.
  explicit ThreadPool(
      const size_t num_threads = std::thread::hardware_concurrency() / 2)
      : num_threads_(num_threads) {
    HWY_ASSERT(num_threads_ > 0);
    threads_.reserve(num_threads_);
    for (size_t i = 0; i < num_threads_; ++i) {
      threads_.emplace_back(ThreadFunc, this, i);
    }

    WorkersReadyBarrier();
  }

  ThreadPool(const ThreadPool&) = delete;
  ThreadPool& operator=(const ThreadPool&) = delete;

  // Waits for all threads to exit.
  ~ThreadPool() {
    StartWorkers(kWorkerExit);

    for (std::thread& thread : threads_) {
      thread.join();
    }
  }

  size_t NumThreads() const { return threads_.size(); }

  template <class Func>
  void RunOnThreads(size_t max_threads, const Func& func) {
    task_ = &CallClosure<Func>;
    data_ = &func;
    StartWorkers(max_threads);
    WorkersReadyBarrier();
  }

 private:
  // After construction and between calls to Run, workers are "ready", i.e.
  // waiting on worker_start_cv_. They are "started" by sending a "command"
  // and notifying all worker_start_cv_ waiters. (That is why all workers
  // must be ready/waiting - otherwise, the notification will not reach all of
  // them and the main thread waits in vain for them to report readiness.)
  using WorkerCommand = uint64_t;

  static constexpr WorkerCommand kWorkerWait = ~1ULL;
  static constexpr WorkerCommand kWorkerExit = ~2ULL;

  // Calls a closure (lambda with captures).
  template <class Closure>
  static void CallClosure(const void* f, size_t thread) {
    (*reinterpret_cast<const Closure*>(f))(thread);
  }

  void WorkersReadyBarrier() {
    std::unique_lock<std::mutex> lock(mutex_);
    // Typically only a single iteration.
    while (workers_ready_ != threads_.size()) {
      workers_ready_cv_.wait(lock);
    }
    workers_ready_ = 0;

    // Safely handle spurious worker wakeups.
    worker_start_command_ = kWorkerWait;
  }

  // Precondition: all workers are ready.
  void StartWorkers(const WorkerCommand worker_command) {
    std::unique_lock<std::mutex> lock(mutex_);
    worker_start_command_ = worker_command;
    // Workers will need this lock, so release it before they wake up.
    lock.unlock();
    worker_start_cv_.notify_all();
  }

  static void ThreadFunc(ThreadPool* self, size_t thread) {
    // Until kWorkerExit command received:
    for (;;) {
      std::unique_lock<std::mutex> lock(self->mutex_);
      // Notify main thread that this thread is ready.
      if (++self->workers_ready_ == self->num_threads_) {
        self->workers_ready_cv_.notify_one();
      }
    RESUME_WAIT:
      // Wait for a command.
      self->worker_start_cv_.wait(lock);
      const WorkerCommand command = self->worker_start_command_;
      switch (command) {
        case kWorkerWait:    // spurious wakeup:
          goto RESUME_WAIT;  // lock still held, avoid incrementing ready.
        case kWorkerExit:
          return;  // exits thread
        default:
          break;
      }

      lock.unlock();
      // Command is the maximum number of threads that should run the task.
      HWY_ASSERT(command < self->NumThreads());
      if (thread < command) {
        self->task_(self->data_, thread);
      }
    }
  }

  const size_t num_threads_;

  // Unmodified after ctor, but cannot be const because we call thread::join().
  std::vector<std::thread> threads_;

  std::mutex mutex_;  // guards both cv and their variables.
  std::condition_variable workers_ready_cv_;
  size_t workers_ready_ = 0;
  std::condition_variable worker_start_cv_;
  WorkerCommand worker_start_command_;

  // Written by main thread, read by workers (after mutex lock/unlock).
  std::function<void(const void*, size_t)> task_;  // points to CallClosure
  const void* data_;                               // points to caller's Func
};
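
A minimal usage sketch for this pool (a hypothetical example assuming <atomic> is also included; ExampleFanOut is not part of the benchmark):

// Hypothetical caller of the pool above.
void ExampleFanOut() {
  ThreadPool pool;  // defaults to half of hardware_concurrency
  std::atomic<size_t> calls{0};
  // ThreadFunc asserts command < NumThreads(), so request fewer threads
  // than exist; workers with thread >= max_threads skip the task.
  const size_t max_threads = pool.NumThreads() - 1;
  pool.RunOnThreads(max_threads, [&calls](size_t /*thread*/) {
    calls.fetch_add(1, std::memory_order_relaxed);
  });
  // RunOnThreads blocks until all workers are ready again, so this is safe.
  HWY_ASSERT(calls.load() == max_threads);
}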

template <class Order, typename T>
void RunWithoutVerify(const Dist dist, const size_t num, const Algo algo,
                      SharedState& shared, size_t thread) {
  auto aligned = hwy::AllocateAligned<T>(num);

  (void)GenerateInput(dist, aligned.get(), num);

  const Timestamp t0;
  Run<Order>(algo, aligned.get(), num, shared, thread);
  HWY_ASSERT(aligned[0] < aligned[num - 1]);
}

void BenchParallel() {
  // Not interested in benchmark results for other targets
  if (HWY_TARGET != HWY_AVX3) return;

  ThreadPool pool;
  const size_t NT = pool.NumThreads();

  using T = int64_t;
  detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;

  size_t num = 100 * 1000 * 1000;

#if HAVE_IPS4O
  const Algo algo = Algo::kIPS4O;
#else
  const Algo algo = Algo::kVQSort;
#endif
  const Dist dist = Dist::kUniform16;

  SharedState shared;
  shared.tls.resize(NT);

  std::vector<Result> results;
  for (size_t nt = 1; nt < NT; nt += HWY_MAX(1, NT / 16)) {
    Timestamp t0;
    // Default capture because MSVC wants algo/dist but clang does not.
    pool.RunOnThreads(nt, [=, &shared](size_t thread) {
      RunWithoutVerify<SortAscending, T>(dist, num, algo, shared, thread);
    });
    const double sec = SecondsSince(t0);
    results.push_back(MakeResult<T>(algo, dist, st, num, nt, sec));
    results.back().Print();
  }
}

#else
void BenchParallel() {}
#endif

}  // namespace
// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE

namespace hwy {
namespace {
HWY_BEFORE_TEST(BenchParallel);
HWY_EXPORT_AND_TEST_P(BenchParallel, BenchParallel);
}  // namespace
}  // namespace hwy

// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}

#endif  // HWY_ONCE
@@ -1,259 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/bench_sort.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/algo-inl.h"
#include "hwy/contrib/sort/result-inl.h"
#include "hwy/contrib/sort/vqsort.h"
#include "hwy/contrib/sort/sorting_networks-inl.h"  // SharedTraits
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/traits128-inl.h"
#include "hwy/tests/test_util-inl.h"
// clang-format on

#include <stdint.h>
#include <stdio.h>
#include <string.h>  // memcpy

#include <vector>

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace {
using detail::LaneTraits;
using detail::OrderAscending;
using detail::OrderDescending;
using detail::SharedTraits;

#if HWY_TARGET != HWY_SCALAR
using detail::OrderAscending128;
using detail::OrderDescending128;
using detail::Traits128;

template <class Traits, typename T>
HWY_NOINLINE void BenchPartition() {
  const SortTag<T> d;
  detail::SharedTraits<Traits> st;
  const Dist dist = Dist::kUniform8;
  double sum = 0.0;

  const size_t max_log2 = AdjustedLog2Reps(20);
  for (size_t log2 = max_log2; log2 < max_log2 + 1; ++log2) {
    const size_t num = 1ull << log2;
    auto aligned = hwy::AllocateAligned<T>(num);
    auto buf =
        hwy::AllocateAligned<T>(hwy::SortConstants::PartitionBufNum(Lanes(d)));

    std::vector<double> seconds;
    const size_t num_reps = (1ull << (14 - log2 / 2)) * kReps;
    for (size_t rep = 0; rep < num_reps; ++rep) {
      (void)GenerateInput(dist, aligned.get(), num);

      const Timestamp t0;

      detail::Partition(d, st, aligned.get(), 0, num - 1, Set(d, T(128)),
                        buf.get());
      seconds.push_back(SecondsSince(t0));
      // 'Use' the result to prevent optimizing out the partition.
      sum += static_cast<double>(aligned.get()[num / 2]);
    }

    MakeResult<T>(Algo::kVQSort, dist, st, num, 1,
                  SummarizeMeasurements(seconds))
        .Print();
  }
  HWY_ASSERT(sum != 999999);  // Prevent optimizing out
}

HWY_NOINLINE void BenchAllPartition() {
  // Not interested in benchmark results for these targets
  if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4 ||
      HWY_TARGET == HWY_AVX2) {
    return;
  }

  BenchPartition<LaneTraits<OrderDescending>, float>();
  BenchPartition<LaneTraits<OrderAscending>, int64_t>();
  BenchPartition<Traits128<OrderDescending128>, uint64_t>();
}

template <class Traits, typename T>
HWY_NOINLINE void BenchBase(std::vector<Result>& results) {
  // Not interested in benchmark results for these targets
  if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4) {
    return;
  }

  const SortTag<T> d;
  detail::SharedTraits<Traits> st;
  const Dist dist = Dist::kUniform32;

  const size_t N = Lanes(d);
  const size_t num = SortConstants::BaseCaseNum(N);
  auto keys = hwy::AllocateAligned<T>(num);
  auto buf = hwy::AllocateAligned<T>(num + N);

  std::vector<double> seconds;
  double sum = 0;                             // prevents elision
  constexpr size_t kMul = AdjustedReps(600);  // ensures long enough to measure

  for (size_t rep = 0; rep < kReps; ++rep) {
    InputStats<T> input_stats = GenerateInput(dist, keys.get(), num);

    const Timestamp t0;
    for (size_t i = 0; i < kMul; ++i) {
      detail::BaseCase(d, st, keys.get(), num, buf.get());
      sum += static_cast<double>(keys[0]);
    }
    seconds.push_back(SecondsSince(t0));
    // printf("%f\n", seconds.back());

    HWY_ASSERT(VerifySort(st, input_stats, keys.get(), num, "BenchBase"));
  }
  HWY_ASSERT(sum < 1E99);
  results.push_back(MakeResult<T>(Algo::kVQSort, dist, st, num * kMul, 1,
                                  SummarizeMeasurements(seconds)));
}

HWY_NOINLINE void BenchAllBase() {
  // Not interested in benchmark results for these targets
  if (HWY_TARGET == HWY_SSSE3) {
    return;
  }

  std::vector<Result> results;
  BenchBase<LaneTraits<OrderAscending>, float>(results);
  BenchBase<LaneTraits<OrderDescending>, int64_t>(results);
  BenchBase<Traits128<OrderAscending128>, uint64_t>(results);
  for (const Result& r : results) {
    r.Print();
  }
}

std::vector<Algo> AlgoForBench() {
  return {
#if HAVE_AVX2SORT
    Algo::kSEA,
#endif
#if HAVE_PARALLEL_IPS4O
    Algo::kParallelIPS4O,
#endif
#if HAVE_IPS4O
    Algo::kIPS4O,
#endif
#if HAVE_PDQSORT
    Algo::kPDQ,
#endif
#if HAVE_SORT512
    Algo::kSort512,
#endif
    // Algo::kStd,   // too slow to always benchmark
    // Algo::kHeap,  // too slow to always benchmark
    Algo::kVQSort,
  };
}

template <class Traits, typename T>
HWY_NOINLINE void BenchSort(size_t num) {
  SharedState shared;
  detail::SharedTraits<Traits> st;
  auto aligned = hwy::AllocateAligned<T>(num);
  for (Algo algo : AlgoForBench()) {
    for (Dist dist : AllDist()) {
      std::vector<double> seconds;
      for (size_t rep = 0; rep < kReps; ++rep) {
        InputStats<T> input_stats = GenerateInput(dist, aligned.get(), num);

        const Timestamp t0;
        Run<typename Traits::Order>(algo, aligned.get(), num, shared,
                                    /*thread=*/0);
        seconds.push_back(SecondsSince(t0));
        // printf("%f\n", seconds.back());

        HWY_ASSERT(
            VerifySort(st, input_stats, aligned.get(), num, "BenchSort"));
      }
      MakeResult<T>(algo, dist, st, num, 1, SummarizeMeasurements(seconds))
          .Print();
    }  // dist
  }    // algo
}

HWY_NOINLINE void BenchAllSort() {
  // Not interested in benchmark results for these targets
  if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4) {
    return;
  }

  constexpr size_t K = 1000;
  constexpr size_t M = K * K;
  (void)K;
  (void)M;
  for (size_t num : {
#if HAVE_PARALLEL_IPS4O
           100 * M,
#else
           AdjustedReps(1 * M),
#endif
       }) {
    // BenchSort<LaneTraits<OrderAscending>, float>(num);
    // BenchSort<LaneTraits<OrderDescending>, double>(num);
    // BenchSort<LaneTraits<OrderAscending>, int16_t>(num);
    BenchSort<LaneTraits<OrderDescending>, int32_t>(num);
    BenchSort<LaneTraits<OrderAscending>, int64_t>(num);
    // BenchSort<LaneTraits<OrderDescending>, uint16_t>(num);
    // BenchSort<LaneTraits<OrderDescending>, uint32_t>(num);
    // BenchSort<LaneTraits<OrderAscending>, uint64_t>(num);

    BenchSort<Traits128<OrderAscending128>, uint64_t>(num);
    // BenchSort<Traits128<OrderAscending128>, uint64_t>(num);
  }
}

#else
void BenchAllPartition() {}
void BenchAllBase() {}
void BenchAllSort() {}
#endif

}  // namespace
// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE

namespace hwy {
namespace {
HWY_BEFORE_TEST(BenchSort);
HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllPartition);
HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllBase);
HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllSort);
}  // namespace
}  // namespace hwy

// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}

#endif  // HWY_ONCE
@@ -1,30 +0,0 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Speed up MSVC builds by building fewer targets. This header must be included
// from all TUs that contain a HWY_DYNAMIC_DISPATCH to vqsort, i.e. vqsort_*.cc.
// However, users of vqsort.h are unaffected.

#ifndef HIGHWAY_HWY_CONTRIB_SORT_DISABLED_TARGETS_H_
#define HIGHWAY_HWY_CONTRIB_SORT_DISABLED_TARGETS_H_

#include "hwy/base.h"

#if HWY_COMPILER_MSVC
#undef HWY_DISABLED_TARGETS
// HWY_SCALAR remains, so there will still be a valid target to call.
#define HWY_DISABLED_TARGETS (HWY_SSSE3 | HWY_SSE4)
#endif  // HWY_COMPILER_MSVC

#endif  // HIGHWAY_HWY_CONTRIB_SORT_DISABLED_TARGETS_H_
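
A sketch of the intended include order in one of the vqsort_*.cc translation units named above. The exact file contents here are an assumption for illustration; the key point, implied by the comment above, is that this header must be seen before foreach_target.h so the HWY_DISABLED_TARGETS override takes effect:

// Hypothetical vqsort_f32a.cc-style TU (illustration only):
#include "hwy/contrib/sort/disabled_targets.h"  // first, to disable targets
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f32a.cc"
#include "hwy/foreach_target.h"
#include "hwy/contrib/sort/vqsort-inl.h"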

@@ -1,190 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <stdio.h>

#include <algorithm>

#include "hwy/base.h"

// Based on A.7 in "Entwurf und Implementierung vektorisierter
// Sortieralgorithmen" (Design and Implementation of Vectorized Sorting
// Algorithms) and code by Mark Blacher.
void PrintMergeNetwork16x2() {
  for (int i = 8; i < 16; ++i) {
    printf("v%x = st.SwapAdjacent(d, v%x);\n", i, i);
  }
  for (int i = 0; i < 8; ++i) {
    printf("st.Sort2(d, v%x, v%x);\n", i, 15 - i);
  }
  for (int i = 0; i < 4; ++i) {
    printf("v%x = st.SwapAdjacent(d, v%x);\n", i + 4, i + 4);
    printf("v%x = st.SwapAdjacent(d, v%x);\n", i + 12, i + 12);
  }
  for (int i = 0; i < 4; ++i) {
    printf("st.Sort2(d, v%x, v%x);\n", i, 7 - i);
    printf("st.Sort2(d, v%x, v%x);\n", i + 8, 15 - i);
  }
  for (int i = 0; i < 16; i += 4) {
    printf("v%x = st.SwapAdjacent(d, v%x);\n", i + 2, i + 2);
    printf("v%x = st.SwapAdjacent(d, v%x);\n", i + 3, i + 3);
  }
  for (int i = 0; i < 16; i += 4) {
    printf("st.Sort2(d, v%x, v%x);\n", i, i + 3);
    printf("st.Sort2(d, v%x, v%x);\n", i + 1, i + 2);
  }
  for (int i = 0; i < 16; i += 2) {
    printf("v%x = st.SwapAdjacent(d, v%x);\n", i + 1, i + 1);
  }
  for (int i = 0; i < 16; i += 2) {
    printf("st.Sort2(d, v%x, v%x);\n", i, i + 1);
  }
  for (int i = 0; i < 16; ++i) {
    printf("v%x = st.SortPairsDistance1<kOrder>(d, v%x);\n", i, i);
  }
  printf("\n");
}

void PrintMergeNetwork16x4() {
  printf("\n");

  for (int i = 8; i < 16; ++i) {
    printf("v%x = st.Reverse4(d, v%x);\n", i, i);
  }
  for (int i = 0; i < 8; ++i) {
    printf("st.Sort2(d, v%x, v%x);\n", i, 15 - i);
  }
  for (int i = 0; i < 4; ++i) {
    printf("v%x = st.Reverse4(d, v%x);\n", i + 4, i + 4);
    printf("v%x = st.Reverse4(d, v%x);\n", i + 12, i + 12);
  }
  for (int i = 0; i < 4; ++i) {
    printf("st.Sort2(d, v%x, v%x);\n", i, 7 - i);
    printf("st.Sort2(d, v%x, v%x);\n", i + 8, 15 - i);
  }
  for (int i = 0; i < 16; i += 4) {
    printf("v%x = st.Reverse4(d, v%x);\n", i + 2, i + 2);
    printf("v%x = st.Reverse4(d, v%x);\n", i + 3, i + 3);
  }
  for (int i = 0; i < 16; i += 4) {
    printf("st.Sort2(d, v%x, v%x);\n", i, i + 3);
    printf("st.Sort2(d, v%x, v%x);\n", i + 1, i + 2);
  }
  for (int i = 0; i < 16; i += 2) {
    printf("v%x = st.Reverse4(d, v%x);\n", i + 1, i + 1);
  }
  for (int i = 0; i < 16; i += 2) {
    printf("st.Sort2(d, v%x, v%x);\n", i, i + 1);
  }
  for (int i = 0; i < 16; ++i) {
    printf("v%x = st.SortPairsReverse4(d, v%x);\n", i, i);
  }
  for (int i = 0; i < 16; ++i) {
    printf("v%x = st.SortPairsDistance1<kOrder>(d, v%x);\n", i, i);
  }
}

void PrintMergeNetwork16x8() {
  printf("\n");

  for (int i = 8; i < 16; ++i) {
    printf("v%x = st.ReverseKeys8(d, v%x);\n", i, i);
  }
  for (int i = 0; i < 8; ++i) {
    printf("st.Sort2(d, v%x, v%x);\n", i, 15 - i);
  }
  for (int i = 0; i < 4; ++i) {
    printf("v%x = st.ReverseKeys8(d, v%x);\n", i + 4, i + 4);
    printf("v%x = st.ReverseKeys8(d, v%x);\n", i + 12, i + 12);
  }
  for (int i = 0; i < 4; ++i) {
    printf("st.Sort2(d, v%x, v%x);\n", i, 7 - i);
    printf("st.Sort2(d, v%x, v%x);\n", i + 8, 15 - i);
  }
  for (int i = 0; i < 16; i += 4) {
    printf("v%x = st.ReverseKeys8(d, v%x);\n", i + 2, i + 2);
    printf("v%x = st.ReverseKeys8(d, v%x);\n", i + 3, i + 3);
  }
  for (int i = 0; i < 16; i += 4) {
    printf("st.Sort2(d, v%x, v%x);\n", i, i + 3);
    printf("st.Sort2(d, v%x, v%x);\n", i + 1, i + 2);
  }
  for (int i = 0; i < 16; i += 2) {
    printf("v%x = st.ReverseKeys8(d, v%x);\n", i + 1, i + 1);
  }
  for (int i = 0; i < 16; i += 2) {
    printf("st.Sort2(d, v%x, v%x);\n", i, i + 1);
  }
  for (int i = 0; i < 16; ++i) {
    printf("v%x = st.SortPairsReverse8(d, v%x);\n", i, i);
  }
  for (int i = 0; i < 16; ++i) {
    printf("v%x = st.SortPairsDistance2<kOrder>(d, v%x);\n", i, i);
  }
  for (int i = 0; i < 16; ++i) {
    printf("v%x = st.SortPairsDistance1<kOrder>(d, v%x);\n", i, i);
  }
}

void PrintMergeNetwork16x16() {
  printf("\n");

  for (int i = 8; i < 16; ++i) {
    printf("v%x = st.ReverseKeys16(d, v%x);\n", i, i);
  }
  for (int i = 0; i < 8; ++i) {
    printf("st.Sort2(d, v%x, v%x);\n", i, 15 - i);
  }
  for (int i = 0; i < 4; ++i) {
    printf("v%x = st.ReverseKeys16(d, v%x);\n", i + 4, i + 4);
    printf("v%x = st.ReverseKeys16(d, v%x);\n", i + 12, i + 12);
  }
  for (int i = 0; i < 4; ++i) {
    printf("st.Sort2(d, v%x, v%x);\n", i, 7 - i);
    printf("st.Sort2(d, v%x, v%x);\n", i + 8, 15 - i);
  }
  for (int i = 0; i < 16; i += 4) {
    printf("v%x = st.ReverseKeys16(d, v%x);\n", i + 2, i + 2);
    printf("v%x = st.ReverseKeys16(d, v%x);\n", i + 3, i + 3);
  }
  for (int i = 0; i < 16; i += 4) {
    printf("st.Sort2(d, v%x, v%x);\n", i, i + 3);
    printf("st.Sort2(d, v%x, v%x);\n", i + 1, i + 2);
  }
  for (int i = 0; i < 16; i += 2) {
    printf("v%x = st.ReverseKeys16(d, v%x);\n", i + 1, i + 1);
  }
  for (int i = 0; i < 16; i += 2) {
    printf("st.Sort2(d, v%x, v%x);\n", i, i + 1);
  }
  for (int i = 0; i < 16; ++i) {
    printf("v%x = st.SortPairsReverse16<kOrder>(d, v%x);\n", i, i);
  }
  for (int i = 0; i < 16; ++i) {
    printf("v%x = st.SortPairsDistance4<kOrder>(d, v%x);\n", i, i);
  }
  for (int i = 0; i < 16; ++i) {
    printf("v%x = st.SortPairsDistance2<kOrder>(d, v%x);\n", i, i);
  }
  for (int i = 0; i < 16; ++i) {
    printf("v%x = st.SortPairsDistance1<kOrder>(d, v%x);\n", i, i);
  }
}

int main(int argc, char** argv) {
  PrintMergeNetwork16x2();
  PrintMergeNetwork16x4();
  PrintMergeNetwork16x8();
  PrintMergeNetwork16x16();
  return 0;
}
@@ -1,149 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/algo-inl.h"

// Normal include guard for non-SIMD parts
#ifndef HIGHWAY_HWY_CONTRIB_SORT_RESULT_INL_H_
#define HIGHWAY_HWY_CONTRIB_SORT_RESULT_INL_H_

#include <time.h>

#include <algorithm>  // std::sort
#include <string>
#include <vector>

#include "hwy/base.h"
#include "hwy/nanobenchmark.h"

namespace hwy {

struct Timestamp {
  Timestamp() { t = platform::Now(); }
  double t;
};

double SecondsSince(const Timestamp& t0) {
  const Timestamp t1;
  return t1.t - t0.t;
}

constexpr size_t kReps = 30;

// Returns trimmed mean (we don't want to run an out-of-L3-cache sort often
// enough for the mode to be reliable).
double SummarizeMeasurements(std::vector<double>& seconds) {
  std::sort(seconds.begin(), seconds.end());
  double sum = 0;
  int count = 0;
  for (size_t i = kReps / 4; i < seconds.size() - kReps / 2; ++i) {
    sum += seconds[i];
    count += 1;
  }
  return sum / count;
}
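
Taken together, a minimal sketch of how these helpers combine; this mirrors the measurement loops in the benchmarks, and MeasureTrimmedMeanSeconds is a hypothetical wrapper, not part of this file:

// Hypothetical wrapper: times `work` kReps times and summarizes.
template <class Func>
double MeasureTrimmedMeanSeconds(const Func& work) {
  std::vector<double> seconds;
  for (size_t rep = 0; rep < kReps; ++rep) {
    const Timestamp t0;
    work();
    seconds.push_back(SecondsSince(t0));
  }
  // Trimmed mean: after sorting, drops the fastest quarter and the
  // slowest half of the kReps measurements.
  return SummarizeMeasurements(seconds);
}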

}  // namespace hwy
#endif  // HIGHWAY_HWY_CONTRIB_SORT_RESULT_INL_H_

// Per-target
#if defined(HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE) == \
    defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE
#undef HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE
#else
#define HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE
#endif

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

struct Result {
  Result() {}
  Result(const uint32_t target, const Algo algo, Dist dist, bool is128,
         size_t num, size_t num_threads, double sec, size_t sizeof_t,
         const char* type_name)
      : target(target),
        algo(algo),
        dist(dist),
        is128(is128),
        num(num),
        num_threads(num_threads),
        sec(sec),
        sizeof_t(sizeof_t),
        type_name(type_name) {}

  void Print() const {
    const double bytes = static_cast<double>(num) *
                         static_cast<double>(num_threads) *
                         static_cast<double>(sizeof_t);
    printf("%10s: %12s: %7s: %9s: %.2E %4.0f MB/s (%2zu threads)\n",
           hwy::TargetName(target), AlgoName(algo),
           is128 ? "u128" : type_name.c_str(), DistName(dist),
           static_cast<double>(num), bytes * 1E-6 / sec, num_threads);
  }

  uint32_t target;
  Algo algo;
  Dist dist;
  bool is128;
  size_t num = 0;
  size_t num_threads = 0;
  double sec = 0.0;
  size_t sizeof_t = 0;
  std::string type_name;
};

template <typename T, class Traits>
Result MakeResult(const Algo algo, Dist dist, Traits st, size_t num,
                  size_t num_threads, double sec) {
  char string100[100];
  hwy::detail::TypeName(hwy::detail::MakeTypeInfo<T>(), 1, string100);
  return Result(HWY_TARGET, algo, dist, st.Is128(), num, num_threads, sec,
                sizeof(T), string100);
}

template <class Traits, typename T>
bool VerifySort(Traits st, const InputStats<T>& input_stats, const T* out,
                size_t num, const char* caller) {
  constexpr size_t N1 = st.Is128() ? 2 : 1;
  HWY_ASSERT(num >= N1);

  InputStats<T> output_stats;
  // Ensure it matches the sort order
  for (size_t i = 0; i < num - N1; i += N1) {
    output_stats.Notify(out[i]);
    if (N1 == 2) output_stats.Notify(out[i + 1]);
    // Reverse order instead of checking !Compare1 so we accept equal keys.
    if (st.Compare1(out + i + N1, out + i)) {
      printf("%s: i=%d of %d: N1=%d %5.0f %5.0f vs. %5.0f %5.0f\n\n", caller,
             static_cast<int>(i), static_cast<int>(num), static_cast<int>(N1),
             double(out[i + 1]), double(out[i + 0]), double(out[i + N1 + 1]),
             double(out[i + N1]));
      HWY_ABORT("%d-bit sort is incorrect\n",
                static_cast<int>(sizeof(T) * 8 * N1));
    }
  }
  output_stats.Notify(out[num - N1]);
  if (N1 == 2) output_stats.Notify(out[num - N1 + 1]);

  return input_stats == output_stats;
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#endif  // HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE
@@ -1,104 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Definitions shared between vqsort-inl and sorting_networks-inl.

// Normal include guard for target-independent parts
#ifndef HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_
#define HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_

#include "hwy/base.h"

namespace hwy {

// Internal constants - these are to avoid magic numbers/literals and cannot be
// changed without also changing the associated code.
struct SortConstants {
  // SortingNetwork reshapes its input into a matrix. This is the maximum
  // number of *keys* per vector.
#if HWY_COMPILER_MSVC
  static constexpr size_t kMaxCols = 8;  // avoids build timeout
#else
  static constexpr size_t kMaxCols = 16;  // enough for u32 in 512-bit vector
#endif

  // 16 rows is a compromise between using the 32 AVX-512/SVE/RVV registers,
  // fitting within 16 AVX2 registers with only a few spills, keeping BaseCase
  // code size reasonable (7 KiB for AVX-512 and 16 cols), and minimizing the
  // extra logN factor for larger networks (for which only loose upper bounds
  // on size are known).
  static constexpr size_t kMaxRowsLog2 = 4;
  static constexpr size_t kMaxRows = size_t{1} << kMaxRowsLog2;

  static HWY_INLINE size_t BaseCaseNum(size_t N) {
    return kMaxRows * HWY_MIN(N, kMaxCols);
  }

  // Unrolling is important (pipelining and amortizing branch mispredictions);
  // 2x is sufficient to reach full memory bandwidth on SKX in Partition, but
  // somewhat slower for sorting than 4x.
  //
  // To change, must also update left + 3 * N etc. in the loop.
  static constexpr size_t kPartitionUnroll = 4;

  static HWY_INLINE size_t PartitionBufNum(size_t N) {
    // The main loop reads kPartitionUnroll vectors, and first loads from
    // both left and right beforehand, so it requires min = 2 *
    // kPartitionUnroll vectors. To handle smaller amounts (only guaranteed
    // >= BaseCaseNum), we partition the right side into a buffer. We need
    // another vector at the end so CompressStore does not overwrite anything.
    return (2 * kPartitionUnroll + 1) * N;
  }

  // Chunk := group of keys loaded for sampling a pivot. Matches the typical
  // cache line size of 64 bytes to get maximum benefit per L2 miss. If vectors
  // are larger, use entire vectors to ensure we do not overrun the array.
  static HWY_INLINE size_t LanesPerChunk(size_t sizeof_t, size_t N) {
    return HWY_MAX(64 / sizeof_t, N);
  }
};
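
To make the constants concrete, a small worked example under an assumed vector width (N = 8 lanes, e.g. float on a 256-bit target; the numbers follow directly from the definitions above):

// Worked example for hypothetical N = 8 lanes (not part of the original file):
static_assert(SortConstants::kMaxRows == 16, "kMaxRows is 1 << 4");
// BaseCaseNum(8)     = 16 * HWY_MIN(8, kMaxCols) = 128 keys per base case.
// PartitionBufNum(8) = (2 * 4 + 1) * 8           = 72 lanes of scratch.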

}  // namespace hwy

#endif  // HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_

// Per-target
#if defined(HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE) == \
    defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
#undef HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
#else
#define HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
#endif

#include "hwy/highway.h"

namespace hwy {
namespace HWY_NAMESPACE {

// Default tag / vector width selector.
// TODO(janwas): enable once LMUL < 1 is supported.
#if HWY_TARGET == HWY_RVV && 0
template <typename T>
using SortTag = ScalableTag<T, -1>;
#else
template <typename T>
using SortTag = ScalableTag<T>;
#endif

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy

#endif  // HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
@ -12,553 +12,160 @@
|
|||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <inttypes.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// clang-format off
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/sort_test.cc"
|
||||
#include "hwy/foreach_target.h"
|
||||
|
||||
#include "hwy/contrib/sort/vqsort.h"
|
||||
// After foreach_target
|
||||
#include "hwy/contrib/sort/algo-inl.h"
|
||||
#include "hwy/contrib/sort/result-inl.h"
|
||||
#include "hwy/contrib/sort/vqsort-inl.h" // BaseCase
|
||||
#include "hwy/contrib/sort/sort-inl.h"
|
||||
#include "hwy/tests/test_util-inl.h"
|
||||
// clang-format on
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h> // memcpy
|
||||
|
||||
#include <algorithm> // std::max
|
||||
#include <vector>
|
||||
|
||||
#undef VQSORT_TEST_IMPL
|
||||
#if (HWY_TARGET == HWY_SCALAR) || (defined(_MSC_VER) && !HWY_IS_DEBUG_BUILD)
|
||||
// Scalar does not implement these, and MSVC non-debug builds time out.
|
||||
#define VQSORT_TEST_IMPL 0
|
||||
#else
|
||||
#define VQSORT_TEST_IMPL 1
|
||||
#endif
|
||||
|
||||
#undef VQSORT_TEST_SORT
|
||||
// MSVC non-debug builds time out.
|
||||
#if defined(_MSC_VER) && !HWY_IS_DEBUG_BUILD
|
||||
#define VQSORT_TEST_SORT 0
|
||||
#else
|
||||
#define VQSORT_TEST_SORT 1
|
||||
#endif
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
namespace {
|
||||
|
||||
#if VQSORT_TEST_IMPL || VQSORT_TEST_SORT
|
||||
using detail::LaneTraits;
|
||||
using detail::OrderAscending;
|
||||
using detail::OrderAscending128;
|
||||
using detail::OrderDescending;
|
||||
using detail::OrderDescending128;
|
||||
using detail::SharedTraits;
|
||||
using detail::Traits128;
|
||||
#endif
|
||||
#if HWY_TARGET != HWY_SCALAR && HWY_ARCH_X86
|
||||
|
||||
#if !VQSORT_TEST_IMPL
|
||||
static void TestAllMedian() {}
|
||||
static void TestAllBaseCase() {}
|
||||
static void TestAllPartition() {}
|
||||
static void TestAllGenerator() {}
|
||||
#else
|
||||
|
||||
template <class Traits>
|
||||
static HWY_NOINLINE void TestMedian3() {
|
||||
using T = uint64_t;
|
||||
using D = CappedTag<T, 1>;
|
||||
SharedTraits<Traits> st;
|
||||
const D d;
|
||||
using V = Vec<D>;
|
||||
for (uint32_t bits = 0; bits < 8; ++bits) {
|
||||
const V v0 = Set(d, T{(bits & (1u << 0)) ? 1u : 0u});
|
||||
const V v1 = Set(d, T{(bits & (1u << 1)) ? 1u : 0u});
|
||||
const V v2 = Set(d, T{(bits & (1u << 2)) ? 1u : 0u});
|
||||
const T m = GetLane(detail::MedianOf3(st, v0, v1, v2));
|
||||
// If at least half(rounded up) of bits are 1, so is the median.
|
||||
const size_t count = PopCount(bits);
|
||||
HWY_ASSERT_EQ((count >= 2) ? static_cast<T>(1) : 0, m);
|
||||
}
|
||||
template <class D>
|
||||
size_t K(D d) {
|
||||
return SortBatchSize(d);
|
||||
}
|
||||
|
||||
HWY_NOINLINE void TestAllMedian() {
|
||||
TestMedian3<LaneTraits<OrderAscending> >();
|
||||
}
|
||||
|
||||
template <class Traits, typename T>
|
||||
static HWY_NOINLINE void TestBaseCaseAscDesc() {
|
||||
SharedTraits<Traits> st;
|
||||
const SortTag<T> d;
|
||||
template <SortOrder kOrder, class D>
|
||||
void Validate(D d, const TFromD<D>* in, const TFromD<D>* out) {
|
||||
const size_t N = Lanes(d);
|
||||
const size_t base_case_num = SortConstants::BaseCaseNum(N);
|
||||
const size_t N1 = st.LanesPerKey();
|
||||
|
||||
constexpr int kDebug = 0;
|
||||
auto aligned_keys = hwy::AllocateAligned<T>(N + base_case_num + N);
|
||||
auto buf = hwy::AllocateAligned<T>(base_case_num + 2 * N);
|
||||
|
||||
std::vector<size_t> lengths;
|
||||
lengths.push_back(HWY_MAX(1, N1));
|
||||
lengths.push_back(3 * N1);
|
||||
lengths.push_back(base_case_num / 2);
|
||||
lengths.push_back(base_case_num / 2 + N1);
|
||||
lengths.push_back(base_case_num - N1);
|
||||
lengths.push_back(base_case_num);
|
||||
|
||||
std::vector<size_t> misalignments;
|
||||
misalignments.push_back(0);
|
||||
misalignments.push_back(1);
|
||||
if (N >= 6) misalignments.push_back(N / 2 - 1);
|
||||
misalignments.push_back(N / 2);
|
||||
misalignments.push_back(N / 2 + 1);
|
||||
misalignments.push_back(HWY_MIN(2 * N / 3 + 3, size_t{N - 1}));
|
||||
|
||||
for (bool asc : {false, true}) {
|
||||
for (size_t len : lengths) {
|
||||
for (size_t misalign : misalignments) {
|
||||
T* HWY_RESTRICT keys = aligned_keys.get() + misalign;
|
||||
if (kDebug) {
|
||||
printf("============%s asc %d N1 %d len %d misalign %d\n",
|
||||
hwy::TypeName(T(), 1).c_str(), asc, static_cast<int>(N1),
|
||||
static_cast<int>(len), static_cast<int>(misalign));
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < misalign; ++i) {
|
||||
aligned_keys[i] = hwy::LowestValue<T>();
|
||||
}
|
||||
InputStats<T> input_stats;
|
||||
for (size_t i = 0; i < len; ++i) {
|
||||
keys[i] =
|
||||
asc ? static_cast<T>(T(i) + 1) : static_cast<T>(T(len) - T(i));
|
||||
input_stats.Notify(keys[i]);
|
||||
if (kDebug >= 2) printf("%3zu: %f\n", i, double(keys[i]));
|
||||
}
|
||||
for (size_t i = len; i < base_case_num + N; ++i) {
|
||||
keys[i] = hwy::LowestValue<T>();
|
||||
}
|
||||
|
||||
detail::BaseCase(d, st, keys, len, buf.get());
|
||||
|
||||
if (kDebug >= 2) {
|
||||
printf("out>>>>>>\n");
|
||||
for (size_t i = 0; i < len; ++i) {
|
||||
printf("%3zu: %f\n", i, double(keys[i]));
|
||||
}
|
||||
}
|
||||
|
||||
HWY_ASSERT(VerifySort(st, input_stats, keys, len, "BaseAscDesc"));
|
||||
for (size_t i = 0; i < misalign; ++i) {
|
||||
if (aligned_keys[i] != hwy::LowestValue<T>())
|
||||
HWY_ABORT("Overrun misalign at %d\n", static_cast<int>(i));
|
||||
}
|
||||
for (size_t i = len; i < base_case_num + N; ++i) {
|
||||
if (keys[i] != hwy::LowestValue<T>())
|
||||
HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
|
||||
}
|
||||
} // misalign
|
||||
} // len
|
||||
} // asc
|
||||
}
|
||||
|
||||
template <class Traits, typename T>
static HWY_NOINLINE void TestBaseCase01() {
  SharedTraits<Traits> st;
  const SortTag<T> d;
  const size_t N = Lanes(d);
  const size_t base_case_num = SortConstants::BaseCaseNum(N);
  const size_t N1 = st.LanesPerKey();

  constexpr int kDebug = 0;
  auto keys = hwy::AllocateAligned<T>(base_case_num + N);
  auto buf = hwy::AllocateAligned<T>(base_case_num + 2 * N);

  std::vector<size_t> lengths;
  lengths.push_back(HWY_MAX(1, N1));
  lengths.push_back(3 * N1);
  lengths.push_back(base_case_num / 2);
  lengths.push_back(base_case_num / 2 + N1);
  lengths.push_back(base_case_num - N1);
  lengths.push_back(base_case_num);

  for (size_t len : lengths) {
    if (kDebug) {
      printf("============%s 01 N1 %d len %d\n", hwy::TypeName(T(), 1).c_str(),
             static_cast<int>(N1), static_cast<int>(len));
    }
    const uint64_t kMaxBits = AdjustedLog2Reps(HWY_MIN(len, size_t{14}));
    for (uint64_t bits = 0; bits < ((1ull << kMaxBits) - 1); ++bits) {
      InputStats<T> input_stats;
      for (size_t i = 0; i < len; ++i) {
        keys[i] = (i < 64 && (bits & (1ull << i))) ? 1 : 0;
        input_stats.Notify(keys[i]);
        if (kDebug >= 2) printf("%3zu: %f\n", i, double(keys[i]));
      }
      for (size_t i = len; i < base_case_num + N; ++i) {
        keys[i] = hwy::LowestValue<T>();
      }

      detail::BaseCase(d, st, keys.get(), len, buf.get());

      if (kDebug >= 2) {
        printf("out>>>>>>\n");
        for (size_t i = 0; i < len; ++i) {
          printf("%3zu: %f\n", i, double(keys[i]));
        }
      }

      HWY_ASSERT(VerifySort(st, input_stats, keys.get(), len, "Base01"));
      for (size_t i = len; i < base_case_num + N; ++i) {
        if (keys[i] != hwy::LowestValue<T>())
          HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
      }
    }  // bits
  }  // len
}

template <class Traits, typename T>
static HWY_NOINLINE void TestBaseCase() {
  TestBaseCaseAscDesc<Traits, T>();
  TestBaseCase01<Traits, T>();
}

HWY_NOINLINE void TestAllBaseCase() {
// Workaround for stack overflow on MSVC debug.
#if defined(_MSC_VER) && HWY_IS_DEBUG_BUILD && (HWY_TARGET == HWY_AVX3)
  return;
#endif

  TestBaseCase<LaneTraits<OrderAscending>, int32_t>();
  TestBaseCase<LaneTraits<OrderDescending>, int64_t>();
  TestBaseCase<Traits128<OrderAscending128>, uint64_t>();
  TestBaseCase<Traits128<OrderDescending128>, uint64_t>();
}

template <class Traits, typename T>
static HWY_NOINLINE void VerifyPartition(Traits st, T* HWY_RESTRICT keys,
                                         size_t left, size_t border,
                                         size_t right, const size_t N1,
                                         const T* pivot) {
  /* for (size_t i = left; i < right; ++i) {
    if (i == border) printf("--\n");
    printf("%4zu: %3d\n", i, keys[i]);
  }*/

  HWY_ASSERT(left % N1 == 0);
  HWY_ASSERT(border % N1 == 0);
  HWY_ASSERT(right % N1 == 0);
  const bool asc = typename Traits::Order().IsAscending();
  for (size_t i = left; i < border; i += N1) {
    if (st.Compare1(pivot, keys + i)) {
      printf(
          "%s: asc %d left[%d] piv %.0f %.0f compares before %.0f %.0f "
          "border %d",
          hwy::TypeName(T(), 1).c_str(), asc, static_cast<int>(i),
          double(pivot[1]), double(pivot[0]), double(keys[i + 1]),
          double(keys[i + 0]), static_cast<int>(border));
      fflush(stdout);
      HWY_ABORT("Sort is incorrect");
    }
  }
  for (size_t i = border; i < right; i += N1) {
    if (!st.Compare1(pivot, keys + i)) {
      HWY_ABORT(
          "%s: asc %d right[%d] piv %.0f %.0f compares after %.0f %.0f "
          "border %d",
          hwy::TypeName(T(), 1).c_str(), asc, static_cast<int>(i),
          double(pivot[1]), double(pivot[0]), double(keys[i + 1]),
          double(keys[i]), static_cast<int>(border));
    }
  }
}

template <class Traits, typename T>
static HWY_NOINLINE void TestPartition() {
  const SortTag<T> d;
  SharedTraits<Traits> st;
  const bool asc = typename Traits::Order().IsAscending();
  const size_t N = Lanes(d);
  constexpr int kDebug = 0;
  const size_t base_case_num = SortConstants::BaseCaseNum(N);
  // left + len + align
  const size_t total = 32 + (base_case_num + 4 * HWY_MAX(N, 4)) + 2 * N;
  auto aligned_keys = hwy::AllocateAligned<T>(total);
  auto buf = hwy::AllocateAligned<T>(SortConstants::PartitionBufNum(N));

  const size_t N1 = st.LanesPerKey();
  for (bool in_asc : {false, true}) {
    for (int left_i : {0, 1, 2, 3, 4, 5, 6, 7, 8, 12, 15, 22, 28, 29, 30, 31}) {
      const size_t left = static_cast<size_t>(left_i) & ~(N1 - 1);
      for (size_t ofs : {N, N + 1, N + 2, N + 3, 2 * N, 2 * N + 1, 2 * N + 2,
                         2 * N + 3, 3 * N - 1, 4 * N - 3, 4 * N - 2}) {
        const size_t len = (base_case_num + ofs) & ~(N1 - 1);
        for (T pivot1 :
             {T(0), T(len / 3), T(len / 2), T(2 * len / 3), T(len)}) {
          const T pivot2[2] = {pivot1, 0};
          const auto pivot = st.SetKey(d, pivot2);
          for (size_t misalign = 0; misalign < N;
               misalign += st.LanesPerKey()) {
            T* HWY_RESTRICT keys = aligned_keys.get() + misalign;
            const size_t right = left + len;
            if (kDebug) {
              printf(
                  "=========%s asc %d left %d len %d right %d piv %.0f %.0f\n",
                  hwy::TypeName(T(), 1).c_str(), asc, static_cast<int>(left),
                  static_cast<int>(len), static_cast<int>(right),
                  double(pivot2[1]), double(pivot2[0]));
            }

            for (size_t i = 0; i < misalign; ++i) {
              aligned_keys[i] = hwy::LowestValue<T>();
            }
            for (size_t i = 0; i < left; ++i) {
              keys[i] = hwy::LowestValue<T>();
            }
            for (size_t i = left; i < right; ++i) {
              keys[i] = static_cast<T>(in_asc ? T(i + 1) - static_cast<T>(left)
                                              : static_cast<T>(right) - T(i));
              if (kDebug >= 2) printf("%3zu: %f\n", i, double(keys[i]));
            }
            for (size_t i = right; i < total - misalign; ++i) {
              keys[i] = hwy::LowestValue<T>();
            }

            size_t border =
                detail::Partition(d, st, keys, left, right, pivot, buf.get());

            if (kDebug >= 2) {
              printf("out>>>>>>\n");
              for (size_t i = left; i < right; ++i) {
                printf("%3zu: %f\n", i, double(keys[i]));
              }
              for (size_t i = right; i < total - misalign; ++i) {
                printf("%3zu: sentinel %f\n", i, double(keys[i]));
              }
            }

            VerifyPartition(st, keys, left, border, right, N1, pivot2);
            for (size_t i = 0; i < misalign; ++i) {
              if (aligned_keys[i] != hwy::LowestValue<T>())
                HWY_ABORT("Overrun misalign at %d\n", static_cast<int>(i));
            }
            for (size_t i = 0; i < left; ++i) {
              if (keys[i] != hwy::LowestValue<T>())
                HWY_ABORT("Overrun left at %d\n", static_cast<int>(i));
            }
            for (size_t i = right; i < total - misalign; ++i) {
              if (keys[i] != hwy::LowestValue<T>())
                HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
            }
          }  // misalign
        }  // pivot
      }  // len
    }  // left
  }  // asc
}

HWY_NOINLINE void TestAllPartition() {
  TestPartition<LaneTraits<OrderAscending>, int16_t>();
  TestPartition<LaneTraits<OrderDescending>, int32_t>();
  TestPartition<LaneTraits<OrderAscending>, int64_t>();
  TestPartition<LaneTraits<OrderDescending>, float>();
#if HWY_HAVE_FLOAT64
  TestPartition<LaneTraits<OrderDescending>, double>();
#endif
  TestPartition<Traits128<OrderAscending128>, uint64_t>();
  TestPartition<Traits128<OrderDescending128>, uint64_t>();
}

// (used for sample selection for choosing a pivot)
template <typename TU>
static HWY_NOINLINE void TestRandomGenerator() {
  static_assert(!hwy::IsSigned<TU>(), "");
  SortTag<TU> du;
  const size_t N = Lanes(du);

  detail::Generator rng(&N, N);

  const size_t lanes_per_block = HWY_MAX(64 / sizeof(TU), N);  // power of two

  for (uint32_t num_blocks = 2; num_blocks < 100000;
       num_blocks = 3 * num_blocks / 2) {
    // Generate some numbers and ensure all are in range
    uint64_t sum = 0;
    constexpr size_t kReps = 10000;
    for (size_t rep = 0; rep < kReps; ++rep) {
      const uint32_t bits = rng() & 0xFFFFFFFF;
      const size_t index = detail::RandomChunkIndex(num_blocks, bits);
      HWY_ASSERT(((index + 1) * lanes_per_block) <=
                 num_blocks * lanes_per_block);
      sum += index;
    }

    // Also ensure the mean is near the middle of the range
    const double expected = (num_blocks - 1) / 2.0;
    const double actual = double(sum) / kReps;
    HWY_ASSERT(0.9 * expected <= actual && actual <= 1.1 * expected);
  }
}
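
// Sketch of the invariant above (hypothetical values, illustration only): with
// num_blocks = 10, RandomChunkIndex must return an index in [0, 10), so
// (index + 1) * lanes_per_block never exceeds the buffer; over kReps draws,
// the sample mean should approach (10 - 1) / 2 = 4.5 within +/-10%.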
HWY_NOINLINE void TestAllGenerator() {
  TestRandomGenerator<uint32_t>();
  TestRandomGenerator<uint64_t>();
}

#endif  // VQSORT_TEST_IMPL

#if !VQSORT_TEST_SORT
static void TestAllSort() {}
#else

// Remembers input, and compares results to that of a reference algorithm.
template <class Traits, typename T>
class CompareResults {
 public:
  void SetInput(const T* in, size_t num) {
    copy_.resize(num);
    memcpy(copy_.data(), in, num * sizeof(T));
  }

  bool Verify(const T* output) {
#if HAVE_PDQSORT
    const Algo reference = Algo::kPDQ;
#else
    const Algo reference = Algo::kStd;
#endif
    SharedState shared;
    using Order = typename Traits::Order;
    Run<Order>(reference, copy_.data(), copy_.size(), shared,
               /*thread=*/0);

    for (size_t i = 0; i < copy_.size(); ++i) {
      if (copy_[i] != output[i]) {
        fprintf(stderr, "Asc %d mismatch at %d: %A %A\n", Order().IsAscending(),
                static_cast<int>(i), double(copy_[i]), double(output[i]));
        return false;
      }
    }
    return true;
  }

 private:
  std::vector<T> copy_;
};
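
// Usage sketch (mirrors TestSort below; all names are from this file):
//   CompareResults<Traits, T> compare;
//   compare.SetInput(keys, num);  // snapshot before sorting
//   Run<typename Traits::Order>(algo, keys, num, shared, /*thread=*/0);
//   HWY_ASSERT(compare.Verify(keys));  // re-sorts the snapshot via the
//   reference algorithm and compares element-wise.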
std::vector<Algo> AlgoForTest() {
  return {
#if HAVE_AVX2SORT
    Algo::kSEA,
#endif
#if HAVE_IPS4O
    Algo::kIPS4O,
#endif
#if HAVE_PDQSORT
    Algo::kPDQ,
#endif
#if HAVE_SORT512
    Algo::kSort512,
#endif
    Algo::kHeap, Algo::kVQSort,
  };
}

template <class Traits, typename T>
void TestSort(size_t num) {
  // TODO(janwas): fix
  if (HWY_TARGET == HWY_SSSE3) return;
// Workaround for stack overflow on clang-cl (/F 8388608 does not help).
#if defined(_MSC_VER) && HWY_IS_DEBUG_BUILD && (HWY_TARGET == HWY_AVX3)
  return;
#endif

  SharedState shared;
  SharedTraits<Traits> st;

  constexpr size_t kMaxMisalign = 16;
  auto aligned = hwy::AllocateAligned<T>(kMaxMisalign + num + kMaxMisalign);
  for (Algo algo : AlgoForTest()) {
#if HAVE_IPS4O
    if (st.Is128() && (algo == Algo::kIPS4O || algo == Algo::kParallelIPS4O)) {
      continue;
    }
#endif
    for (Dist dist : AllDist()) {
      for (size_t misalign : {size_t{0}, size_t{st.LanesPerKey()},
                              size_t{3 * st.LanesPerKey()}, kMaxMisalign / 2}) {
        T* keys = aligned.get() + misalign;

        // Set up red zones before/after the keys to sort
        for (size_t i = 0; i < misalign; ++i) {
          aligned[i] = hwy::LowestValue<T>();
        }
        for (size_t i = 0; i < kMaxMisalign; ++i) {
          keys[num + i] = hwy::HighestValue<T>();
        }
#if HWY_IS_MSAN
        __msan_poison(aligned.get(), misalign * sizeof(T));
        __msan_poison(keys + num, kMaxMisalign * sizeof(T));
#endif
        InputStats<T> input_stats = GenerateInput(dist, keys, num);

        CompareResults<Traits, T> compare;
        compare.SetInput(keys, num);

        Run<typename Traits::Order>(algo, keys, num, shared, /*thread=*/0);
        HWY_ASSERT(compare.Verify(keys));
        HWY_ASSERT(VerifySort(st, input_stats, keys, num, "TestSort"));

        // Check red zones
#if HWY_IS_MSAN
        __msan_unpoison(aligned.get(), misalign * sizeof(T));
        __msan_unpoison(keys + num, kMaxMisalign * sizeof(T));
#endif
        for (size_t i = 0; i < misalign; ++i) {
          if (aligned[i] != hwy::LowestValue<T>())
            HWY_ABORT("Overrun left at %d\n", static_cast<int>(i));
        }
        for (size_t i = num; i < num + kMaxMisalign; ++i) {
          if (keys[i] != hwy::HighestValue<T>())
            HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
        }
      }  // misalign
    }  // dist
  }  // algo
}

void TestAllSort() {
  const size_t num = 15 * 1000;

  TestSort<LaneTraits<OrderAscending>, int16_t>(num);
  TestSort<LaneTraits<OrderDescending>, uint16_t>(num);

  TestSort<LaneTraits<OrderDescending>, int32_t>(num);
  TestSort<LaneTraits<OrderDescending>, uint32_t>(num);

  TestSort<LaneTraits<OrderAscending>, int64_t>(num);
  TestSort<LaneTraits<OrderAscending>, uint64_t>(num);

  // WARNING: for float types, SIMD comparisons will flush denormals to zero,
  // causing mismatches with scalar sorts. In this test, we avoid generating
  // denormal inputs.
  TestSort<LaneTraits<OrderAscending>, float>(num);
#if HWY_HAVE_FLOAT64  // protects algo-inl's GenerateRandom
  if (Sorter::HaveFloat64()) {
    TestSort<LaneTraits<OrderDescending>, double>(num);
  }
#endif

  TestSort<Traits128<OrderAscending128>, uint64_t>(num);
  TestSort<Traits128<OrderDescending128>, uint64_t>(num);
}

#endif  // VQSORT_TEST_SORT

}  // namespace
// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy

@@ -567,14 +174,9 @@ HWY_AFTER_NAMESPACE();
#if HWY_ONCE

namespace hwy {
namespace {
HWY_BEFORE_TEST(SortTest);
HWY_EXPORT_AND_TEST_P(SortTest, TestAllMedian);
HWY_EXPORT_AND_TEST_P(SortTest, TestAllBaseCase);
HWY_EXPORT_AND_TEST_P(SortTest, TestAllPartition);
HWY_EXPORT_AND_TEST_P(SortTest, TestAllGenerator);
HWY_EXPORT_AND_TEST_P(SortTest, TestAllSort);
}  // namespace
}  // namespace hwy

// Ought not to be necessary, but without this, no tests run on RVV.
@@ -583,4 +185,4 @@ int main(int argc, char** argv) {
  return RUN_ALL_TESTS();
}

#endif  // HWY_ONCE
#endif

@@ -1,686 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Per-target
#if defined(HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE) == \
    defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
#undef HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
#else
#define HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
#endif

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/shared-inl.h"  // SortConstants
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {

using Constants = hwy::SortConstants;

// ------------------------------ SharedTraits

// Code shared between all traits. It's unclear whether these can profitably be
// specialized for Lane vs Block, or optimized like SortPairsDistance1 using
// Compare/DupOdd.
template <class Base>
struct SharedTraits : public Base {
  // Conditionally swaps lane 0 with 2, 1 with 3 etc.
  template <class D>
  HWY_INLINE Vec<D> SortPairsDistance2(D d, Vec<D> v) const {
    const Base* base = static_cast<const Base*>(this);
    Vec<D> swapped = base->SwapAdjacentPairs(d, v);
    base->Sort2(d, v, swapped);
    return base->OddEvenPairs(d, swapped, v);
  }
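
  // Worked example (a sketch with hypothetical lane values, not part of the
  // source): for ascending order and v = [5, 9, 2, 7], SwapAdjacentPairs
  // yields [2, 7, 5, 9]; Sort2 then sets v = Min = [2, 7, 2, 7] and
  // swapped = Max = [5, 9, 5, 9]; OddEvenPairs keeps lanes {0,1} from v and
  // lanes {2,3} from swapped -> [2, 7, 5, 9], so lane i <= lane i+2 holds.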

  // Swaps with the vector formed by reversing contiguous groups of 8 keys.
  template <class D>
  HWY_INLINE Vec<D> SortPairsReverse8(D d, Vec<D> v) const {
    const Base* base = static_cast<const Base*>(this);
    Vec<D> swapped = base->ReverseKeys8(d, v);
    base->Sort2(d, v, swapped);
    return base->OddEvenQuads(d, swapped, v);
  }

  // Swaps with the vector formed by reversing contiguous groups of 16 keys.
  template <class D>
  HWY_INLINE Vec<D> SortPairsReverse16(D d, Vec<D> v) const {
    const Base* base = static_cast<const Base*>(this);
    static_assert(Constants::kMaxCols <= 16, "Need actual Reverse16");
    Vec<D> swapped = base->ReverseKeys(d, v);
    base->Sort2(d, v, swapped);
    return ConcatUpperLower(d, swapped, v);  // 8 = half of the vector
  }
};

// ------------------------------ Sorting network

// (Green's irregular) sorting network for independent columns in 16 vectors.
template <class D, class Traits, class V = Vec<D>>
HWY_INLINE void Sort16(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
                       V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd,
                       V& ve, V& vf) {
  st.Sort2(d, v0, v1);
  st.Sort2(d, v2, v3);
  st.Sort2(d, v4, v5);
  st.Sort2(d, v6, v7);
  st.Sort2(d, v8, v9);
  st.Sort2(d, va, vb);
  st.Sort2(d, vc, vd);
  st.Sort2(d, ve, vf);
  st.Sort2(d, v0, v2);
  st.Sort2(d, v1, v3);
  st.Sort2(d, v4, v6);
  st.Sort2(d, v5, v7);
  st.Sort2(d, v8, va);
  st.Sort2(d, v9, vb);
  st.Sort2(d, vc, ve);
  st.Sort2(d, vd, vf);
  st.Sort2(d, v0, v4);
  st.Sort2(d, v1, v5);
  st.Sort2(d, v2, v6);
  st.Sort2(d, v3, v7);
  st.Sort2(d, v8, vc);
  st.Sort2(d, v9, vd);
  st.Sort2(d, va, ve);
  st.Sort2(d, vb, vf);
  st.Sort2(d, v0, v8);
  st.Sort2(d, v1, v9);
  st.Sort2(d, v2, va);
  st.Sort2(d, v3, vb);
  st.Sort2(d, v4, vc);
  st.Sort2(d, v5, vd);
  st.Sort2(d, v6, ve);
  st.Sort2(d, v7, vf);
  st.Sort2(d, v5, va);
  st.Sort2(d, v6, v9);
  st.Sort2(d, v3, vc);
  st.Sort2(d, v7, vb);
  st.Sort2(d, vd, ve);
  st.Sort2(d, v4, v8);
  st.Sort2(d, v1, v2);
  st.Sort2(d, v1, v4);
  st.Sort2(d, v7, vd);
  st.Sort2(d, v2, v8);
  st.Sort2(d, vb, ve);
  st.Sort2(d, v2, v4);
  st.Sort2(d, v5, v6);
  st.Sort2(d, v9, va);
  st.Sort2(d, vb, vd);
  st.Sort2(d, v3, v8);
  st.Sort2(d, v7, vc);
  st.Sort2(d, v3, v5);
  st.Sort2(d, v6, v8);
  st.Sort2(d, v7, v9);
  st.Sort2(d, va, vc);
  st.Sort2(d, v3, v4);
  st.Sort2(d, v5, v6);
  st.Sort2(d, v7, v8);
  st.Sort2(d, v9, va);
  st.Sort2(d, vb, vc);
  st.Sort2(d, v6, v7);
  st.Sort2(d, v8, v9);
}

// ------------------------------ Merging networks

// Blacher's hybrid bitonic/odd-even networks, generated by print_network.cc.

template <class D, class Traits, class V = Vec<D>>
HWY_INLINE void Merge2(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
                       V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd,
                       V& ve, V& vf) {
  v8 = st.ReverseKeys2(d, v8);
  v9 = st.ReverseKeys2(d, v9);
  va = st.ReverseKeys2(d, va);
  vb = st.ReverseKeys2(d, vb);
  vc = st.ReverseKeys2(d, vc);
  vd = st.ReverseKeys2(d, vd);
  ve = st.ReverseKeys2(d, ve);
  vf = st.ReverseKeys2(d, vf);
  st.Sort2(d, v0, vf);
  st.Sort2(d, v1, ve);
  st.Sort2(d, v2, vd);
  st.Sort2(d, v3, vc);
  st.Sort2(d, v4, vb);
  st.Sort2(d, v5, va);
  st.Sort2(d, v6, v9);
  st.Sort2(d, v7, v8);
  v4 = st.ReverseKeys2(d, v4);
  vc = st.ReverseKeys2(d, vc);
  v5 = st.ReverseKeys2(d, v5);
  vd = st.ReverseKeys2(d, vd);
  v6 = st.ReverseKeys2(d, v6);
  ve = st.ReverseKeys2(d, ve);
  v7 = st.ReverseKeys2(d, v7);
  vf = st.ReverseKeys2(d, vf);
  st.Sort2(d, v0, v7);
  st.Sort2(d, v8, vf);
  st.Sort2(d, v1, v6);
  st.Sort2(d, v9, ve);
  st.Sort2(d, v2, v5);
  st.Sort2(d, va, vd);
  st.Sort2(d, v3, v4);
  st.Sort2(d, vb, vc);
  v2 = st.ReverseKeys2(d, v2);
  v3 = st.ReverseKeys2(d, v3);
  v6 = st.ReverseKeys2(d, v6);
  v7 = st.ReverseKeys2(d, v7);
  va = st.ReverseKeys2(d, va);
  vb = st.ReverseKeys2(d, vb);
  ve = st.ReverseKeys2(d, ve);
  vf = st.ReverseKeys2(d, vf);
  st.Sort2(d, v0, v3);
  st.Sort2(d, v1, v2);
  st.Sort2(d, v4, v7);
  st.Sort2(d, v5, v6);
  st.Sort2(d, v8, vb);
  st.Sort2(d, v9, va);
  st.Sort2(d, vc, vf);
  st.Sort2(d, vd, ve);
  v1 = st.ReverseKeys2(d, v1);
  v3 = st.ReverseKeys2(d, v3);
  v5 = st.ReverseKeys2(d, v5);
  v7 = st.ReverseKeys2(d, v7);
  v9 = st.ReverseKeys2(d, v9);
  vb = st.ReverseKeys2(d, vb);
  vd = st.ReverseKeys2(d, vd);
  vf = st.ReverseKeys2(d, vf);
  st.Sort2(d, v0, v1);
  st.Sort2(d, v2, v3);
  st.Sort2(d, v4, v5);
  st.Sort2(d, v6, v7);
  st.Sort2(d, v8, v9);
  st.Sort2(d, va, vb);
  st.Sort2(d, vc, vd);
  st.Sort2(d, ve, vf);
  v0 = st.SortPairsDistance1(d, v0);
  v1 = st.SortPairsDistance1(d, v1);
  v2 = st.SortPairsDistance1(d, v2);
  v3 = st.SortPairsDistance1(d, v3);
  v4 = st.SortPairsDistance1(d, v4);
  v5 = st.SortPairsDistance1(d, v5);
  v6 = st.SortPairsDistance1(d, v6);
  v7 = st.SortPairsDistance1(d, v7);
  v8 = st.SortPairsDistance1(d, v8);
  v9 = st.SortPairsDistance1(d, v9);
  va = st.SortPairsDistance1(d, va);
  vb = st.SortPairsDistance1(d, vb);
  vc = st.SortPairsDistance1(d, vc);
  vd = st.SortPairsDistance1(d, vd);
  ve = st.SortPairsDistance1(d, ve);
  vf = st.SortPairsDistance1(d, vf);
}

template <class D, class Traits, class V = Vec<D>>
HWY_INLINE void Merge4(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
                       V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd,
                       V& ve, V& vf) {
  v8 = st.ReverseKeys4(d, v8);
  v9 = st.ReverseKeys4(d, v9);
  va = st.ReverseKeys4(d, va);
  vb = st.ReverseKeys4(d, vb);
  vc = st.ReverseKeys4(d, vc);
  vd = st.ReverseKeys4(d, vd);
  ve = st.ReverseKeys4(d, ve);
  vf = st.ReverseKeys4(d, vf);
  st.Sort2(d, v0, vf);
  st.Sort2(d, v1, ve);
  st.Sort2(d, v2, vd);
  st.Sort2(d, v3, vc);
  st.Sort2(d, v4, vb);
  st.Sort2(d, v5, va);
  st.Sort2(d, v6, v9);
  st.Sort2(d, v7, v8);
  v4 = st.ReverseKeys4(d, v4);
  vc = st.ReverseKeys4(d, vc);
  v5 = st.ReverseKeys4(d, v5);
  vd = st.ReverseKeys4(d, vd);
  v6 = st.ReverseKeys4(d, v6);
  ve = st.ReverseKeys4(d, ve);
  v7 = st.ReverseKeys4(d, v7);
  vf = st.ReverseKeys4(d, vf);
  st.Sort2(d, v0, v7);
  st.Sort2(d, v8, vf);
  st.Sort2(d, v1, v6);
  st.Sort2(d, v9, ve);
  st.Sort2(d, v2, v5);
  st.Sort2(d, va, vd);
  st.Sort2(d, v3, v4);
  st.Sort2(d, vb, vc);
  v2 = st.ReverseKeys4(d, v2);
  v3 = st.ReverseKeys4(d, v3);
  v6 = st.ReverseKeys4(d, v6);
  v7 = st.ReverseKeys4(d, v7);
  va = st.ReverseKeys4(d, va);
  vb = st.ReverseKeys4(d, vb);
  ve = st.ReverseKeys4(d, ve);
  vf = st.ReverseKeys4(d, vf);
  st.Sort2(d, v0, v3);
  st.Sort2(d, v1, v2);
  st.Sort2(d, v4, v7);
  st.Sort2(d, v5, v6);
  st.Sort2(d, v8, vb);
  st.Sort2(d, v9, va);
  st.Sort2(d, vc, vf);
  st.Sort2(d, vd, ve);
  v1 = st.ReverseKeys4(d, v1);
  v3 = st.ReverseKeys4(d, v3);
  v5 = st.ReverseKeys4(d, v5);
  v7 = st.ReverseKeys4(d, v7);
  v9 = st.ReverseKeys4(d, v9);
  vb = st.ReverseKeys4(d, vb);
  vd = st.ReverseKeys4(d, vd);
  vf = st.ReverseKeys4(d, vf);
  st.Sort2(d, v0, v1);
  st.Sort2(d, v2, v3);
  st.Sort2(d, v4, v5);
  st.Sort2(d, v6, v7);
  st.Sort2(d, v8, v9);
  st.Sort2(d, va, vb);
  st.Sort2(d, vc, vd);
  st.Sort2(d, ve, vf);
  v0 = st.SortPairsReverse4(d, v0);
  v1 = st.SortPairsReverse4(d, v1);
  v2 = st.SortPairsReverse4(d, v2);
  v3 = st.SortPairsReverse4(d, v3);
  v4 = st.SortPairsReverse4(d, v4);
  v5 = st.SortPairsReverse4(d, v5);
  v6 = st.SortPairsReverse4(d, v6);
  v7 = st.SortPairsReverse4(d, v7);
  v8 = st.SortPairsReverse4(d, v8);
  v9 = st.SortPairsReverse4(d, v9);
  va = st.SortPairsReverse4(d, va);
  vb = st.SortPairsReverse4(d, vb);
  vc = st.SortPairsReverse4(d, vc);
  vd = st.SortPairsReverse4(d, vd);
  ve = st.SortPairsReverse4(d, ve);
  vf = st.SortPairsReverse4(d, vf);
  v0 = st.SortPairsDistance1(d, v0);
  v1 = st.SortPairsDistance1(d, v1);
  v2 = st.SortPairsDistance1(d, v2);
  v3 = st.SortPairsDistance1(d, v3);
  v4 = st.SortPairsDistance1(d, v4);
  v5 = st.SortPairsDistance1(d, v5);
  v6 = st.SortPairsDistance1(d, v6);
  v7 = st.SortPairsDistance1(d, v7);
  v8 = st.SortPairsDistance1(d, v8);
  v9 = st.SortPairsDistance1(d, v9);
  va = st.SortPairsDistance1(d, va);
  vb = st.SortPairsDistance1(d, vb);
  vc = st.SortPairsDistance1(d, vc);
  vd = st.SortPairsDistance1(d, vd);
  ve = st.SortPairsDistance1(d, ve);
  vf = st.SortPairsDistance1(d, vf);
}

template <class D, class Traits, class V = Vec<D>>
HWY_INLINE void Merge8(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
                       V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd,
                       V& ve, V& vf) {
  v8 = st.ReverseKeys8(d, v8);
  v9 = st.ReverseKeys8(d, v9);
  va = st.ReverseKeys8(d, va);
  vb = st.ReverseKeys8(d, vb);
  vc = st.ReverseKeys8(d, vc);
  vd = st.ReverseKeys8(d, vd);
  ve = st.ReverseKeys8(d, ve);
  vf = st.ReverseKeys8(d, vf);
  st.Sort2(d, v0, vf);
  st.Sort2(d, v1, ve);
  st.Sort2(d, v2, vd);
  st.Sort2(d, v3, vc);
  st.Sort2(d, v4, vb);
  st.Sort2(d, v5, va);
  st.Sort2(d, v6, v9);
  st.Sort2(d, v7, v8);
  v4 = st.ReverseKeys8(d, v4);
  vc = st.ReverseKeys8(d, vc);
  v5 = st.ReverseKeys8(d, v5);
  vd = st.ReverseKeys8(d, vd);
  v6 = st.ReverseKeys8(d, v6);
  ve = st.ReverseKeys8(d, ve);
  v7 = st.ReverseKeys8(d, v7);
  vf = st.ReverseKeys8(d, vf);
  st.Sort2(d, v0, v7);
  st.Sort2(d, v8, vf);
  st.Sort2(d, v1, v6);
  st.Sort2(d, v9, ve);
  st.Sort2(d, v2, v5);
  st.Sort2(d, va, vd);
  st.Sort2(d, v3, v4);
  st.Sort2(d, vb, vc);
  v2 = st.ReverseKeys8(d, v2);
  v3 = st.ReverseKeys8(d, v3);
  v6 = st.ReverseKeys8(d, v6);
  v7 = st.ReverseKeys8(d, v7);
  va = st.ReverseKeys8(d, va);
  vb = st.ReverseKeys8(d, vb);
  ve = st.ReverseKeys8(d, ve);
  vf = st.ReverseKeys8(d, vf);
  st.Sort2(d, v0, v3);
  st.Sort2(d, v1, v2);
  st.Sort2(d, v4, v7);
  st.Sort2(d, v5, v6);
  st.Sort2(d, v8, vb);
  st.Sort2(d, v9, va);
  st.Sort2(d, vc, vf);
  st.Sort2(d, vd, ve);
  v1 = st.ReverseKeys8(d, v1);
  v3 = st.ReverseKeys8(d, v3);
  v5 = st.ReverseKeys8(d, v5);
  v7 = st.ReverseKeys8(d, v7);
  v9 = st.ReverseKeys8(d, v9);
  vb = st.ReverseKeys8(d, vb);
  vd = st.ReverseKeys8(d, vd);
  vf = st.ReverseKeys8(d, vf);
  st.Sort2(d, v0, v1);
  st.Sort2(d, v2, v3);
  st.Sort2(d, v4, v5);
  st.Sort2(d, v6, v7);
  st.Sort2(d, v8, v9);
  st.Sort2(d, va, vb);
  st.Sort2(d, vc, vd);
  st.Sort2(d, ve, vf);
  v0 = st.SortPairsReverse8(d, v0);
  v1 = st.SortPairsReverse8(d, v1);
  v2 = st.SortPairsReverse8(d, v2);
  v3 = st.SortPairsReverse8(d, v3);
  v4 = st.SortPairsReverse8(d, v4);
  v5 = st.SortPairsReverse8(d, v5);
  v6 = st.SortPairsReverse8(d, v6);
  v7 = st.SortPairsReverse8(d, v7);
  v8 = st.SortPairsReverse8(d, v8);
  v9 = st.SortPairsReverse8(d, v9);
  va = st.SortPairsReverse8(d, va);
  vb = st.SortPairsReverse8(d, vb);
  vc = st.SortPairsReverse8(d, vc);
  vd = st.SortPairsReverse8(d, vd);
  ve = st.SortPairsReverse8(d, ve);
  vf = st.SortPairsReverse8(d, vf);
  v0 = st.SortPairsDistance2(d, v0);
  v1 = st.SortPairsDistance2(d, v1);
  v2 = st.SortPairsDistance2(d, v2);
  v3 = st.SortPairsDistance2(d, v3);
  v4 = st.SortPairsDistance2(d, v4);
  v5 = st.SortPairsDistance2(d, v5);
  v6 = st.SortPairsDistance2(d, v6);
  v7 = st.SortPairsDistance2(d, v7);
  v8 = st.SortPairsDistance2(d, v8);
  v9 = st.SortPairsDistance2(d, v9);
  va = st.SortPairsDistance2(d, va);
  vb = st.SortPairsDistance2(d, vb);
  vc = st.SortPairsDistance2(d, vc);
  vd = st.SortPairsDistance2(d, vd);
  ve = st.SortPairsDistance2(d, ve);
  vf = st.SortPairsDistance2(d, vf);
  v0 = st.SortPairsDistance1(d, v0);
  v1 = st.SortPairsDistance1(d, v1);
  v2 = st.SortPairsDistance1(d, v2);
  v3 = st.SortPairsDistance1(d, v3);
  v4 = st.SortPairsDistance1(d, v4);
  v5 = st.SortPairsDistance1(d, v5);
  v6 = st.SortPairsDistance1(d, v6);
  v7 = st.SortPairsDistance1(d, v7);
  v8 = st.SortPairsDistance1(d, v8);
  v9 = st.SortPairsDistance1(d, v9);
  va = st.SortPairsDistance1(d, va);
  vb = st.SortPairsDistance1(d, vb);
  vc = st.SortPairsDistance1(d, vc);
  vd = st.SortPairsDistance1(d, vd);
  ve = st.SortPairsDistance1(d, ve);
  vf = st.SortPairsDistance1(d, vf);
}

// Unused on MSVC, see below
#if !HWY_COMPILER_MSVC

template <class D, class Traits, class V = Vec<D>>
HWY_INLINE void Merge16(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4,
                        V& v5, V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc,
                        V& vd, V& ve, V& vf) {
  v8 = st.ReverseKeys16(d, v8);
  v9 = st.ReverseKeys16(d, v9);
  va = st.ReverseKeys16(d, va);
  vb = st.ReverseKeys16(d, vb);
  vc = st.ReverseKeys16(d, vc);
  vd = st.ReverseKeys16(d, vd);
  ve = st.ReverseKeys16(d, ve);
  vf = st.ReverseKeys16(d, vf);
  st.Sort2(d, v0, vf);
  st.Sort2(d, v1, ve);
  st.Sort2(d, v2, vd);
  st.Sort2(d, v3, vc);
  st.Sort2(d, v4, vb);
  st.Sort2(d, v5, va);
  st.Sort2(d, v6, v9);
  st.Sort2(d, v7, v8);
  v4 = st.ReverseKeys16(d, v4);
  vc = st.ReverseKeys16(d, vc);
  v5 = st.ReverseKeys16(d, v5);
  vd = st.ReverseKeys16(d, vd);
  v6 = st.ReverseKeys16(d, v6);
  ve = st.ReverseKeys16(d, ve);
  v7 = st.ReverseKeys16(d, v7);
  vf = st.ReverseKeys16(d, vf);
  st.Sort2(d, v0, v7);
  st.Sort2(d, v8, vf);
  st.Sort2(d, v1, v6);
  st.Sort2(d, v9, ve);
  st.Sort2(d, v2, v5);
  st.Sort2(d, va, vd);
  st.Sort2(d, v3, v4);
  st.Sort2(d, vb, vc);
  v2 = st.ReverseKeys16(d, v2);
  v3 = st.ReverseKeys16(d, v3);
  v6 = st.ReverseKeys16(d, v6);
  v7 = st.ReverseKeys16(d, v7);
  va = st.ReverseKeys16(d, va);
  vb = st.ReverseKeys16(d, vb);
  ve = st.ReverseKeys16(d, ve);
  vf = st.ReverseKeys16(d, vf);
  st.Sort2(d, v0, v3);
  st.Sort2(d, v1, v2);
  st.Sort2(d, v4, v7);
  st.Sort2(d, v5, v6);
  st.Sort2(d, v8, vb);
  st.Sort2(d, v9, va);
  st.Sort2(d, vc, vf);
  st.Sort2(d, vd, ve);
  v1 = st.ReverseKeys16(d, v1);
  v3 = st.ReverseKeys16(d, v3);
  v5 = st.ReverseKeys16(d, v5);
  v7 = st.ReverseKeys16(d, v7);
  v9 = st.ReverseKeys16(d, v9);
  vb = st.ReverseKeys16(d, vb);
  vd = st.ReverseKeys16(d, vd);
  vf = st.ReverseKeys16(d, vf);
  st.Sort2(d, v0, v1);
  st.Sort2(d, v2, v3);
  st.Sort2(d, v4, v5);
  st.Sort2(d, v6, v7);
  st.Sort2(d, v8, v9);
  st.Sort2(d, va, vb);
  st.Sort2(d, vc, vd);
  st.Sort2(d, ve, vf);
  v0 = st.SortPairsReverse16(d, v0);
  v1 = st.SortPairsReverse16(d, v1);
  v2 = st.SortPairsReverse16(d, v2);
  v3 = st.SortPairsReverse16(d, v3);
  v4 = st.SortPairsReverse16(d, v4);
  v5 = st.SortPairsReverse16(d, v5);
  v6 = st.SortPairsReverse16(d, v6);
  v7 = st.SortPairsReverse16(d, v7);
  v8 = st.SortPairsReverse16(d, v8);
  v9 = st.SortPairsReverse16(d, v9);
  va = st.SortPairsReverse16(d, va);
  vb = st.SortPairsReverse16(d, vb);
  vc = st.SortPairsReverse16(d, vc);
  vd = st.SortPairsReverse16(d, vd);
  ve = st.SortPairsReverse16(d, ve);
  vf = st.SortPairsReverse16(d, vf);
  v0 = st.SortPairsDistance4(d, v0);
  v1 = st.SortPairsDistance4(d, v1);
  v2 = st.SortPairsDistance4(d, v2);
  v3 = st.SortPairsDistance4(d, v3);
  v4 = st.SortPairsDistance4(d, v4);
  v5 = st.SortPairsDistance4(d, v5);
  v6 = st.SortPairsDistance4(d, v6);
  v7 = st.SortPairsDistance4(d, v7);
  v8 = st.SortPairsDistance4(d, v8);
  v9 = st.SortPairsDistance4(d, v9);
  va = st.SortPairsDistance4(d, va);
  vb = st.SortPairsDistance4(d, vb);
  vc = st.SortPairsDistance4(d, vc);
  vd = st.SortPairsDistance4(d, vd);
  ve = st.SortPairsDistance4(d, ve);
  vf = st.SortPairsDistance4(d, vf);
  v0 = st.SortPairsDistance2(d, v0);
  v1 = st.SortPairsDistance2(d, v1);
  v2 = st.SortPairsDistance2(d, v2);
  v3 = st.SortPairsDistance2(d, v3);
  v4 = st.SortPairsDistance2(d, v4);
  v5 = st.SortPairsDistance2(d, v5);
  v6 = st.SortPairsDistance2(d, v6);
  v7 = st.SortPairsDistance2(d, v7);
  v8 = st.SortPairsDistance2(d, v8);
  v9 = st.SortPairsDistance2(d, v9);
  va = st.SortPairsDistance2(d, va);
  vb = st.SortPairsDistance2(d, vb);
  vc = st.SortPairsDistance2(d, vc);
  vd = st.SortPairsDistance2(d, vd);
  ve = st.SortPairsDistance2(d, ve);
  vf = st.SortPairsDistance2(d, vf);
  v0 = st.SortPairsDistance1(d, v0);
  v1 = st.SortPairsDistance1(d, v1);
  v2 = st.SortPairsDistance1(d, v2);
  v3 = st.SortPairsDistance1(d, v3);
  v4 = st.SortPairsDistance1(d, v4);
  v5 = st.SortPairsDistance1(d, v5);
  v6 = st.SortPairsDistance1(d, v6);
  v7 = st.SortPairsDistance1(d, v7);
  v8 = st.SortPairsDistance1(d, v8);
  v9 = st.SortPairsDistance1(d, v9);
  va = st.SortPairsDistance1(d, va);
  vb = st.SortPairsDistance1(d, vb);
  vc = st.SortPairsDistance1(d, vc);
  vd = st.SortPairsDistance1(d, vd);
  ve = st.SortPairsDistance1(d, ve);
  vf = st.SortPairsDistance1(d, vf);
}

#endif  // !HWY_COMPILER_MSVC

// Reshapes `buf` into a matrix, sorts columns independently, and then merges
// into a sorted 1D array without transposing.
//
// `st` is SharedTraits<LaneTraits/Traits128<Order*>>. This abstraction layer
// bridges differences in sort order and single-lane vs 128-bit keys.
// `buf` ensures full vectors are aligned, and enables loads/stores without
// bounds checks.
//
// References:
// https://drops.dagstuhl.de/opus/volltexte/2021/13775/pdf/LIPIcs-SEA-2021-3.pdf
// https://github.com/simd-sorting/fast-and-robust/blob/master/avx2_sort_demo/avx2sort.h
// "Entwurf und Implementierung vektorisierter Sortieralgorithmen" (M. Blacher)
template <class Traits, typename T>
HWY_INLINE void SortingNetwork(Traits st, T* HWY_RESTRICT buf, size_t cols) {
  const CappedTag<T, Constants::kMaxCols> d;
  using V = decltype(Zero(d));

  HWY_DASSERT(cols <= Constants::kMaxCols);

  // The network width depends on the number of keys, not lanes.
  constexpr size_t kLanesPerKey = st.LanesPerKey();
  const size_t keys = cols / kLanesPerKey;
  constexpr size_t kMaxKeys = MaxLanes(d) / kLanesPerKey;

  // These are aligned iff cols == Lanes(d). We prefer unaligned/non-constexpr
  // offsets to duplicating this code for every value of cols.
  static_assert(Constants::kMaxRows == 16, "Update loads/stores/args");
  V v0 = LoadU(d, buf + 0x0 * cols);
  V v1 = LoadU(d, buf + 0x1 * cols);
  V v2 = LoadU(d, buf + 0x2 * cols);
  V v3 = LoadU(d, buf + 0x3 * cols);
  V v4 = LoadU(d, buf + 0x4 * cols);
  V v5 = LoadU(d, buf + 0x5 * cols);
  V v6 = LoadU(d, buf + 0x6 * cols);
  V v7 = LoadU(d, buf + 0x7 * cols);
  V v8 = LoadU(d, buf + 0x8 * cols);
  V v9 = LoadU(d, buf + 0x9 * cols);
  V va = LoadU(d, buf + 0xa * cols);
  V vb = LoadU(d, buf + 0xb * cols);
  V vc = LoadU(d, buf + 0xc * cols);
  V vd = LoadU(d, buf + 0xd * cols);
  V ve = LoadU(d, buf + 0xe * cols);
  V vf = LoadU(d, buf + 0xf * cols);

  Sort16(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, ve, vf);

  // Checking MaxLanes avoids generating HWY_ASSERT code for the unreachable
  // code paths: if MaxLanes < 2, then keys <= cols < 2.
  if (HWY_LIKELY(keys >= 2 && kMaxKeys >= 2)) {
    Merge2(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, ve,
           vf);

    if (HWY_LIKELY(keys >= 4 && kMaxKeys >= 4)) {
      Merge4(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, ve,
             vf);

      if (HWY_LIKELY(keys >= 8 && kMaxKeys >= 8)) {
        Merge8(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd,
               ve, vf);

// Avoids build timeout
#if !HWY_COMPILER_MSVC
        if (HWY_LIKELY(keys >= 16 && kMaxKeys >= 16)) {
          Merge16(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd,
                  ve, vf);

          static_assert(Constants::kMaxCols <= 16, "Add more branches");
        }
#endif
      }
    }
  }

  StoreU(v0, d, buf + 0x0 * cols);
  StoreU(v1, d, buf + 0x1 * cols);
  StoreU(v2, d, buf + 0x2 * cols);
  StoreU(v3, d, buf + 0x3 * cols);
  StoreU(v4, d, buf + 0x4 * cols);
  StoreU(v5, d, buf + 0x5 * cols);
  StoreU(v6, d, buf + 0x6 * cols);
  StoreU(v7, d, buf + 0x7 * cols);
  StoreU(v8, d, buf + 0x8 * cols);
  StoreU(v9, d, buf + 0x9 * cols);
  StoreU(va, d, buf + 0xa * cols);
  StoreU(vb, d, buf + 0xb * cols);
  StoreU(vc, d, buf + 0xc * cols);
  StoreU(vd, d, buf + 0xd * cols);
  StoreU(ve, d, buf + 0xe * cols);
  StoreU(vf, d, buf + 0xf * cols);
}

}  // namespace detail
// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#endif  // HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE

@@ -1,324 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Per-target
#if defined(HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE) == \
    defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE
#undef HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE
#else
#define HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE
#endif

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/shared-inl.h"  // SortConstants
#include "hwy/contrib/sort/vqsort.h"  // SortDescending
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {

// Highway does not provide a lane type for 128-bit keys, so we use uint64_t
// along with an abstraction layer for single-lane vs. lane-pair, which is
// independent of the order.
struct KeyLane {
  constexpr size_t LanesPerKey() const { return 1; }

  // For HeapSort
  template <typename T>
  HWY_INLINE void Swap(T* a, T* b) const {
    const T temp = *a;
    *a = *b;
    *b = temp;
  }

  // Broadcasts one key into a vector
  template <class D>
  HWY_INLINE Vec<D> SetKey(D d, const TFromD<D>* key) const {
    return Set(d, *key);
  }

  template <class D>
  HWY_INLINE Vec<D> ReverseKeys(D d, Vec<D> v) const {
    return Reverse(d, v);
  }

  template <class D>
  HWY_INLINE Vec<D> ReverseKeys2(D d, Vec<D> v) const {
    return Reverse2(d, v);
  }

  template <class D>
  HWY_INLINE Vec<D> ReverseKeys4(D d, Vec<D> v) const {
    return Reverse4(d, v);
  }

  template <class D>
  HWY_INLINE Vec<D> ReverseKeys8(D d, Vec<D> v) const {
    return Reverse8(d, v);
  }

  template <class D>
  HWY_INLINE Vec<D> ReverseKeys16(D d, Vec<D> v) const {
    static_assert(SortConstants::kMaxCols <= 16, "Assumes u32x16 = 512 bit");
    return ReverseKeys(d, v);
  }

  template <class V>
  HWY_INLINE V OddEvenKeys(const V odd, const V even) const {
    return OddEven(odd, even);
  }

  template <class D, HWY_IF_LANE_SIZE_D(D, 2)>
  HWY_INLINE Vec<D> SwapAdjacentPairs(D d, const Vec<D> v) const {
    const Repartition<uint32_t, D> du32;
    return BitCast(d, Shuffle2301(BitCast(du32, v)));
  }
  template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
  HWY_INLINE Vec<D> SwapAdjacentPairs(D /* tag */, const Vec<D> v) const {
    return Shuffle1032(v);
  }
  template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> SwapAdjacentPairs(D /* tag */, const Vec<D> v) const {
    return SwapAdjacentBlocks(v);
  }

  template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> SwapAdjacentQuads(D d, const Vec<D> v) const {
#if HWY_HAVE_FLOAT64  // in case D is float32
    const RepartitionToWide<D> dw;
#else
    const RepartitionToWide<RebindToUnsigned<D>> dw;
#endif
    return BitCast(d, SwapAdjacentPairs(dw, BitCast(dw, v)));
  }
  template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> SwapAdjacentQuads(D d, const Vec<D> v) const {
    // Assumes max vector size = 512
    return ConcatLowerUpper(d, v, v);
  }

  template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> OddEvenPairs(D d, const Vec<D> odd,
                                 const Vec<D> even) const {
#if HWY_HAVE_FLOAT64  // in case D is float32
    const RepartitionToWide<D> dw;
#else
    const RepartitionToWide<RebindToUnsigned<D>> dw;
#endif
    return BitCast(d, OddEven(BitCast(dw, odd), BitCast(dw, even)));
  }
  template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> OddEvenPairs(D /* tag */, Vec<D> odd, Vec<D> even) const {
    return OddEvenBlocks(odd, even);
  }

  template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> OddEvenQuads(D d, Vec<D> odd, Vec<D> even) const {
#if HWY_HAVE_FLOAT64  // in case D is float32
    const RepartitionToWide<D> dw;
#else
    const RepartitionToWide<RebindToUnsigned<D>> dw;
#endif
    return BitCast(d, OddEvenPairs(dw, BitCast(dw, odd), BitCast(dw, even)));
  }
  template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> OddEvenQuads(D d, Vec<D> odd, Vec<D> even) const {
    return ConcatUpperLower(d, odd, even);
  }
};

// Anything order-related depends on the key traits *and* the order (see
// FirstOfLanes). We cannot implement just one Compare function because Lt128
// only compiles if the lane type is u64. Thus we need either overloaded
// functions with a tag type, class specializations, or separate classes.
// We avoid overloaded functions because we want all functions to be callable
// from a SortTraits without per-function wrappers. Specializing would work, but
// we are anyway going to specialize at a higher level.
struct OrderAscending : public KeyLane {
  using Order = SortAscending;

  template <typename T>
  HWY_INLINE bool Compare1(const T* a, const T* b) {
    return *a < *b;
  }

  template <class D>
  HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
    return Lt(a, b);
  }

  // Two halves of Sort2, used in ScanMinMax.
  template <class D>
  HWY_INLINE Vec<D> First(D /* tag */, const Vec<D> a, const Vec<D> b) const {
    return Min(a, b);
  }

  template <class D>
  HWY_INLINE Vec<D> Last(D /* tag */, const Vec<D> a, const Vec<D> b) const {
    return Max(a, b);
  }

  template <class D>
  HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
                                 TFromD<D>* HWY_RESTRICT /* buf */) const {
    return MinOfLanes(d, v);
  }

  template <class D>
  HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
                                TFromD<D>* HWY_RESTRICT /* buf */) const {
    return MaxOfLanes(d, v);
  }

  template <class D>
  HWY_INLINE Vec<D> FirstValue(D d) const {
    return Set(d, hwy::LowestValue<TFromD<D>>());
  }

  template <class D>
  HWY_INLINE Vec<D> LastValue(D d) const {
    return Set(d, hwy::HighestValue<TFromD<D>>());
  }
};

struct OrderDescending : public KeyLane {
  using Order = SortDescending;

  template <typename T>
  HWY_INLINE bool Compare1(const T* a, const T* b) {
    return *b < *a;
  }

  template <class D>
  HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
    return Lt(b, a);
  }

  template <class D>
  HWY_INLINE Vec<D> First(D /* tag */, const Vec<D> a, const Vec<D> b) const {
    return Max(a, b);
  }

  template <class D>
  HWY_INLINE Vec<D> Last(D /* tag */, const Vec<D> a, const Vec<D> b) const {
    return Min(a, b);
  }

  template <class D>
  HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
                                 TFromD<D>* HWY_RESTRICT /* buf */) const {
    return MaxOfLanes(d, v);
  }

  template <class D>
  HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
                                TFromD<D>* HWY_RESTRICT /* buf */) const {
    return MinOfLanes(d, v);
  }

  template <class D>
  HWY_INLINE Vec<D> FirstValue(D d) const {
    return Set(d, hwy::HighestValue<TFromD<D>>());
  }

  template <class D>
  HWY_INLINE Vec<D> LastValue(D d) const {
    return Set(d, hwy::LowestValue<TFromD<D>>());
  }
};

// Shared code that depends on Order.
template <class Base>
struct LaneTraits : public Base {
  constexpr bool Is128() const { return false; }

  // For each lane i: replaces a[i] with the first and b[i] with the second
  // according to Base.
  // Corresponds to a conditional swap, which is one "node" of a sorting
  // network. Min/Max are cheaper than compare + blend at least for integers.
  template <class D>
  HWY_INLINE void Sort2(D d, Vec<D>& a, Vec<D>& b) const {
    const Base* base = static_cast<const Base*>(this);

    const Vec<D> a_copy = a;
    // Prior to AVX3, there is no native 64-bit Min/Max, so they compile to 4
    // instructions. We can reduce it to a compare + 2 IfThenElse.
#if HWY_AVX3 < HWY_TARGET && HWY_TARGET <= HWY_SSSE3
    if (sizeof(TFromD<D>) == 8) {
      const Mask<D> cmp = base->Compare(d, a, b);
      a = IfThenElse(cmp, a, b);
      b = IfThenElse(cmp, b, a_copy);
      return;
    }
#endif
    a = base->First(d, a, b);
    b = base->Last(d, a_copy, b);
  }
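
  // Per-lane semantics, as a scalar sketch (illustration only, not part of
  // the source): for ascending order, each lane i performs the classic
  // compare-exchange "node"
  //   T lo = HWY_MIN(a[i], b[i]), hi = HWY_MAX(a[i], b[i]);
  //   a[i] = lo; b[i] = hi;
  // Descending order swaps the roles of Min and Max via First/Last.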

  // Conditionally swaps even-numbered lanes with their odd-numbered neighbor.
  template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> SortPairsDistance1(D d, Vec<D> v) const {
    const Base* base = static_cast<const Base*>(this);
    Vec<D> swapped = base->ReverseKeys2(d, v);
    // Further to the above optimization, Sort2+OddEvenKeys compile to four
    // instructions; we can save one by combining two blends.
#if HWY_AVX3 < HWY_TARGET && HWY_TARGET <= HWY_SSSE3
    const Vec<D> cmp = VecFromMask(d, base->Compare(d, v, swapped));
    return IfVecThenElse(DupOdd(cmp), swapped, v);
#else
    Sort2(d, v, swapped);
    return base->OddEvenKeys(swapped, v);
#endif
  }

  // (See above - we use Sort2 for non-64-bit types.)
  template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)>
  HWY_INLINE Vec<D> SortPairsDistance1(D d, Vec<D> v) const {
    const Base* base = static_cast<const Base*>(this);
    Vec<D> swapped = base->ReverseKeys2(d, v);
    Sort2(d, v, swapped);
    return base->OddEvenKeys(swapped, v);
  }

  // Swaps with the vector formed by reversing contiguous groups of 4 keys.
  template <class D>
  HWY_INLINE Vec<D> SortPairsReverse4(D d, Vec<D> v) const {
    const Base* base = static_cast<const Base*>(this);
    Vec<D> swapped = base->ReverseKeys4(d, v);
    Sort2(d, v, swapped);
    return base->OddEvenPairs(d, swapped, v);
  }

  // Conditionally swaps lane 0 with 4, 1 with 5 etc.
  template <class D>
  HWY_INLINE Vec<D> SortPairsDistance4(D d, Vec<D> v) const {
    const Base* base = static_cast<const Base*>(this);
    Vec<D> swapped = base->SwapAdjacentQuads(d, v);
    // Only used in Merge16, so this will not be used on AVX2 (which only has 4
    // u64 lanes), so skip the above optimization for 64-bit AVX2.
    Sort2(d, v, swapped);
    return base->OddEvenQuads(d, swapped, v);
  }
};

}  // namespace detail
// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#endif  // HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE

@ -1,368 +0,0 @@
|
|||
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Per-target
#if defined(HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE) == \
    defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
#undef HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
#else
#define HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
#endif

#include "hwy/contrib/sort/vqsort.h"  // SortDescending
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {

#if HWY_TARGET == HWY_SCALAR

struct OrderAscending128 {
  using Order = SortAscending;

  template <typename T>
  HWY_INLINE bool Compare1(const T* a, const T* b) {
    return (a[1] == b[1]) ? a[0] < b[0] : a[1] < b[1];
  }
};

struct OrderDescending128 {
  using Order = SortDescending;

  template <typename T>
  HWY_INLINE bool Compare1(const T* a, const T* b) {
    return (a[1] == b[1]) ? b[0] < a[0] : b[1] < a[1];
  }
};

template <class Order>
struct Traits128 : public Order {
  constexpr bool Is128() const { return true; }
  constexpr size_t LanesPerKey() const { return 2; }
};

#else

// Highway does not provide a lane type for 128-bit keys, so we use uint64_t
// along with an abstraction layer for single-lane vs. lane-pair, which is
// independent of the order.
struct Key128 {
  constexpr size_t LanesPerKey() const { return 2; }

  template <typename T>
  HWY_INLINE void Swap(T* a, T* b) const {
    const FixedTag<T, 2> d;
    const auto temp = LoadU(d, a);
    StoreU(LoadU(d, b), d, a);
    StoreU(temp, d, b);
  }

  template <class D>
  HWY_INLINE Vec<D> SetKey(D d, const TFromD<D>* key) const {
    return LoadDup128(d, key);
  }

  template <class D>
  HWY_INLINE Vec<D> ReverseKeys(D d, Vec<D> v) const {
    return ReverseBlocks(d, v);
  }

  template <class D>
  HWY_INLINE Vec<D> ReverseKeys2(D /* tag */, const Vec<D> v) const {
    return SwapAdjacentBlocks(v);
  }

  // Only called for 4 keys because we do not support >512-bit vectors.
  template <class D>
  HWY_INLINE Vec<D> ReverseKeys4(D d, const Vec<D> v) const {
    HWY_DASSERT(Lanes(d) <= 64 / sizeof(TFromD<D>));
    return ReverseKeys(d, v);
  }

  // Only called for 4 keys because we do not support >512-bit vectors.
  template <class D>
  HWY_INLINE Vec<D> OddEvenPairs(D d, const Vec<D> odd,
                                 const Vec<D> even) const {
    HWY_DASSERT(Lanes(d) <= 64 / sizeof(TFromD<D>));
    return ConcatUpperLower(d, odd, even);
  }

  template <class V>
  HWY_INLINE V OddEvenKeys(const V odd, const V even) const {
    return OddEvenBlocks(odd, even);
  }

  template <class D>
  HWY_INLINE Vec<D> ReverseKeys8(D, Vec<D>) const {
    HWY_ASSERT(0);  // not supported: would require 1024-bit vectors
  }

  template <class D>
  HWY_INLINE Vec<D> ReverseKeys16(D, Vec<D>) const {
    HWY_ASSERT(0);  // not supported: would require 2048-bit vectors
  }

  // This is only called for 8/16 col networks (not supported).
  template <class D>
  HWY_INLINE Vec<D> SwapAdjacentPairs(D, Vec<D>) const {
    HWY_ASSERT(0);
  }

  // This is only called for 16 col networks (not supported).
  template <class D>
  HWY_INLINE Vec<D> SwapAdjacentQuads(D, Vec<D>) const {
    HWY_ASSERT(0);
  }

  // This is only called for 8 col networks (not supported).
  template <class D>
  HWY_INLINE Vec<D> OddEvenQuads(D, Vec<D>, Vec<D>) const {
    HWY_ASSERT(0);
  }
};

// Anything order-related depends on the key traits *and* the order (see
// FirstOfLanes). We cannot implement just one Compare function because Lt128
// only compiles if the lane type is u64. Thus we need either overloaded
// functions with a tag type, class specializations, or separate classes.
// We avoid overloaded functions because we want all functions to be callable
// from a SortTraits without per-function wrappers. Specializing would work, but
// we are anyway going to specialize at a higher level.
struct OrderAscending128 : public Key128 {
  using Order = SortAscending;

  template <typename T>
  HWY_INLINE bool Compare1(const T* a, const T* b) {
    return (a[1] == b[1]) ? a[0] < b[0] : a[1] < b[1];
  }

  template <class D>
  HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const {
    return Lt128(d, a, b);
  }

  // Used by CompareTop
  template <class V>
  HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const {
    return Lt(a, b);
  }

  template <class D>
  HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const {
    return Min128(d, a, b);
  }

  template <class D>
  HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const {
    return Max128(d, a, b);
  }

  template <class D>
  HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
                                 TFromD<D>* HWY_RESTRICT buf) const {
    const size_t N = Lanes(d);
    Store(v, d, buf);
    v = SetKey(d, buf + 0);  // result must be broadcasted
    for (size_t i = LanesPerKey(); i < N; i += LanesPerKey()) {
      v = First(d, v, SetKey(d, buf + i));
    }
    return v;
  }

  template <class D>
  HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
                                TFromD<D>* HWY_RESTRICT buf) const {
    const size_t N = Lanes(d);
    Store(v, d, buf);
    v = SetKey(d, buf + 0);  // result must be broadcasted
    for (size_t i = LanesPerKey(); i < N; i += LanesPerKey()) {
      v = Last(d, v, SetKey(d, buf + i));
    }
    return v;
  }

  // Same as for regular lanes because 128-bit lanes are u64.
  template <class D>
  HWY_INLINE Vec<D> FirstValue(D d) const {
    return Set(d, hwy::LowestValue<TFromD<D> >());
  }

  template <class D>
  HWY_INLINE Vec<D> LastValue(D d) const {
    return Set(d, hwy::HighestValue<TFromD<D> >());
  }
};

struct OrderDescending128 : public Key128 {
  using Order = SortDescending;

  template <typename T>
  HWY_INLINE bool Compare1(const T* a, const T* b) {
    return (a[1] == b[1]) ? b[0] < a[0] : b[1] < a[1];
  }

  template <class D>
  HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const {
    return Lt128(d, b, a);
  }

  // Used by CompareTop
  template <class V>
  HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const {
    return Lt(b, a);
  }

  template <class D>
  HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const {
    return Max128(d, a, b);
  }

  template <class D>
  HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const {
    return Min128(d, a, b);
  }

  template <class D>
  HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
                                 TFromD<D>* HWY_RESTRICT buf) const {
    const size_t N = Lanes(d);
    Store(v, d, buf);
    v = SetKey(d, buf + 0);  // result must be broadcasted
    for (size_t i = LanesPerKey(); i < N; i += LanesPerKey()) {
      v = First(d, v, SetKey(d, buf + i));
    }
    return v;
  }

  template <class D>
  HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
                                TFromD<D>* HWY_RESTRICT buf) const {
    const size_t N = Lanes(d);
    Store(v, d, buf);
    v = SetKey(d, buf + 0);  // result must be broadcasted
    for (size_t i = LanesPerKey(); i < N; i += LanesPerKey()) {
      v = Last(d, v, SetKey(d, buf + i));
    }
    return v;
  }

  // Same as for regular lanes because 128-bit lanes are u64.
  template <class D>
  HWY_INLINE Vec<D> FirstValue(D d) const {
    return Set(d, hwy::HighestValue<TFromD<D> >());
  }

  template <class D>
  HWY_INLINE Vec<D> LastValue(D d) const {
    return Set(d, hwy::LowestValue<TFromD<D> >());
  }
};

// Shared code that depends on Order.
template <class Base>
class Traits128 : public Base {
#if HWY_TARGET <= HWY_AVX2
  // Returns vector with only the top u64 lane valid. Useful when the next step
  // is to replicate the mask anyway.
  template <class D>
  HWY_INLINE HWY_MAYBE_UNUSED Vec<D> CompareTop(D d, Vec<D> a, Vec<D> b) const {
    const Base* base = static_cast<const Base*>(this);
    const Vec<D> eqHL = VecFromMask(d, Eq(a, b));
    const Vec<D> ltHL = VecFromMask(d, base->CompareLanes(a, b));
    const Vec<D> ltLX = ShiftLeftLanes<1>(ltHL);
    return OrAnd(ltHL, eqHL, ltLX);
  }

  // We want to swap 2 u128, i.e. 4 u64 lanes, based on the 0 or FF..FF mask in
  // the most-significant of those lanes (the result of CompareTop), so
  // replicate it 4x. Only called for >= 256-bit vectors.
  template <class V>
  HWY_INLINE V ReplicateTop4x(V v) const {
#if HWY_TARGET <= HWY_AVX3
    return V{_mm512_permutex_epi64(v.raw, _MM_SHUFFLE(3, 3, 3, 3))};
#else  // AVX2
    return V{_mm256_permute4x64_epi64(v.raw, _MM_SHUFFLE(3, 3, 3, 3))};
#endif
  }
#endif

 public:
  constexpr bool Is128() const { return true; }

  template <class D>
  HWY_INLINE void Sort2(D d, Vec<D>& a, Vec<D>& b) const {
    const Base* base = static_cast<const Base*>(this);

    const Vec<D> a_copy = a;
    const auto lt = base->Compare(d, a, b);
    a = IfThenElse(lt, a, b);
    b = IfThenElse(lt, b, a_copy);
  }

  // Conditionally swaps even-numbered lanes with their odd-numbered neighbor.
  template <class D>
  HWY_INLINE Vec<D> SortPairsDistance1(D d, Vec<D> v) const {
    const Base* base = static_cast<const Base*>(this);
    Vec<D> swapped = base->ReverseKeys2(d, v);

#if HWY_TARGET <= HWY_AVX2
    const Vec<D> select = ReplicateTop4x(CompareTop(d, v, swapped));
    return IfVecThenElse(select, swapped, v);
#else
    Sort2(d, v, swapped);
    return base->OddEvenKeys(swapped, v);
#endif
  }

  // Swaps with the vector formed by reversing contiguous groups of 4 keys.
  template <class D>
  HWY_INLINE Vec<D> SortPairsReverse4(D d, Vec<D> v) const {
    const Base* base = static_cast<const Base*>(this);
    Vec<D> swapped = base->ReverseKeys4(d, v);

    // Only specialize for AVX3 because this requires 512-bit vectors.
#if HWY_TARGET <= HWY_AVX3
    const Vec512<uint64_t> outHx = CompareTop(d, v, swapped);
    // Similar to ReplicateTop4x, we want to gang together 2 comparison results
    // (4 lanes). They are not contiguous, so use permute to replicate 4x.
    alignas(64) uint64_t kIndices[8] = {7, 7, 5, 5, 5, 5, 7, 7};
    const Vec512<uint64_t> select =
        TableLookupLanes(outHx, SetTableIndices(d, kIndices));
    return IfVecThenElse(select, swapped, v);
#else
    Sort2(d, v, swapped);
    return base->OddEvenPairs(d, swapped, v);
#endif
  }

  // Conditionally swaps lane 0 with 4, 1 with 5 etc.
  template <class D>
  HWY_INLINE Vec<D> SortPairsDistance4(D, Vec<D>) const {
    // Only used by Merge16, which would require 2048-bit vectors (unsupported).
    HWY_ASSERT(0);
  }
};

#endif  // HWY_TARGET != HWY_SCALAR

}  // namespace detail
// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#endif  // HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
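
For reference, the `Compare1` members above order 128-bit keys stored as two u64 lanes (lo at index 0, hi at index 1) lexicographically by (hi, lo); `Lt128` is the vectorized counterpart. A standalone scalar sketch (hypothetical helper name, not part of the library):

    #include <cstdint>

    // a and b each point at one 128-bit key: a[0] = low 64 bits, a[1] = high.
    static bool Less128(const uint64_t* a, const uint64_t* b) {
      return (a[1] == b[1]) ? a[0] < b[0] : a[1] < b[1];
    }

    // Example: {lo=5, hi=1} < {lo=0, hi=2} because the high words decide.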

@ -1,722 +0,0 @@

// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Normal include guard for target-independent parts
#ifndef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_INL_H_
#define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_INL_H_

// Makes it harder for adversaries to predict our sampling locations, at the
// cost of 1-2% increased runtime.
#ifndef VQSORT_SECURE_RNG
#define VQSORT_SECURE_RNG 0
#endif

#if VQSORT_SECURE_RNG
#include "third_party/absl/random/random.h"
#endif

#include <string.h>  // memcpy

#include "hwy/cache_control.h"  // Prefetch
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"  // Fill24Bytes

#if HWY_IS_MSAN
#include <sanitizer/msan_interface.h>
#endif

#endif  // HIGHWAY_HWY_CONTRIB_SORT_VQSORT_INL_H_

// Per-target
#if defined(HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE) == \
    defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE
#undef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE
#else
#define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE
#endif

#include "hwy/contrib/sort/shared-inl.h"
#include "hwy/contrib/sort/sorting_networks-inl.h"
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {

#if HWY_TARGET == HWY_SCALAR

template <typename T>
void Swap(T* a, T* b) {
  T t = *a;
  *a = *b;
  *b = t;
}

// Scalar version of HeapSort (see below)
template <class Traits, typename T>
void HeapSort(Traits st, T* HWY_RESTRICT keys, const size_t num) {
  if (num < 2) return;

  // Build heap.
  for (size_t i = 1; i < num; i += 1) {
    size_t j = i;
    while (j != 0) {
      const size_t idx_parent = ((j - 1) / 1 / 2);
      if (!st.Compare1(keys + idx_parent, keys + j)) {
        break;
      }
      Swap(keys + j, keys + idx_parent);
      j = idx_parent;
    }
  }

  for (size_t i = num - 1; i != 0; i -= 1) {
    // Swap root with last
    Swap(keys + 0, keys + i);

    // Sift down the new root.
    size_t j = 0;
    while (j < i) {
      const size_t left = 2 * j + 1;
      const size_t right = 2 * j + 2;
      if (left >= i) break;
      size_t idx_larger = j;
      if (st.Compare1(keys + j, keys + left)) {
        idx_larger = left;
      }
      if (right < i && st.Compare1(keys + idx_larger, keys + right)) {
        idx_larger = right;
      }
      if (idx_larger == j) break;
      Swap(keys + j, keys + idx_larger);
      j = idx_larger;
    }
  }
}

#else

using Constants = hwy::SortConstants;

// ------------------------------ HeapSort

// Heapsort: O(1) space, O(N*logN) worst-case comparisons.
// Based on LLVM sanitizer_common.h, licensed under Apache-2.0.
template <class Traits, typename T>
void HeapSort(Traits st, T* HWY_RESTRICT keys, const size_t num) {
  constexpr size_t N1 = st.LanesPerKey();
  const FixedTag<T, N1> d;

  if (num < 2 * N1) return;

  // Build heap.
  for (size_t i = N1; i < num; i += N1) {
    size_t j = i;
    while (j != 0) {
      const size_t idx_parent = ((j - N1) / N1 / 2) * N1;
      if (AllFalse(d, st.Compare(d, st.SetKey(d, keys + idx_parent),
                                 st.SetKey(d, keys + j)))) {
        break;
      }
      st.Swap(keys + j, keys + idx_parent);
      j = idx_parent;
    }
  }

  for (size_t i = num - N1; i != 0; i -= N1) {
    // Swap root with last
    st.Swap(keys + 0, keys + i);

    // Sift down the new root.
    size_t j = 0;
    while (j < i) {
      const size_t left = 2 * j + N1;
      const size_t right = 2 * j + 2 * N1;
      if (left >= i) break;
      size_t idx_larger = j;
      const auto key_j = st.SetKey(d, keys + j);
      if (AllTrue(d, st.Compare(d, key_j, st.SetKey(d, keys + left)))) {
        idx_larger = left;
      }
      if (right < i && AllTrue(d, st.Compare(d, st.SetKey(d, keys + idx_larger),
                                             st.SetKey(d, keys + right)))) {
        idx_larger = right;
      }
      if (idx_larger == j) break;
      st.Swap(keys + j, keys + idx_larger);
      j = idx_larger;
    }
  }
}

// ------------------------------ BaseCase

// Sorts `keys` within the range [0, num) via sorting network.
template <class D, class Traits, typename T>
HWY_NOINLINE void BaseCase(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
                           T* HWY_RESTRICT buf) {
  const size_t N = Lanes(d);
  using V = decltype(Zero(d));

  // _Nonzero32 requires num - 1 != 0.
  if (HWY_UNLIKELY(num <= 1)) return;

  // Reshape into a matrix with kMaxRows rows, and columns limited by the
  // 1D `num`, which is upper-bounded by the vector width (see BaseCaseNum).
  const size_t num_pow2 = size_t{1}
                          << (32 - Num0BitsAboveMS1Bit_Nonzero32(
                                       static_cast<uint32_t>(num - 1)));
  HWY_DASSERT(num <= num_pow2 && num_pow2 <= Constants::BaseCaseNum(N));
  const size_t cols =
      HWY_MAX(st.LanesPerKey(), num_pow2 >> Constants::kMaxRowsLog2);
  HWY_DASSERT(cols <= N);

  // Copy `keys` to `buf`.
  size_t i;
  for (i = 0; i + N <= num; i += N) {
    Store(LoadU(d, keys + i), d, buf + i);
  }
  for (; i < num; ++i) {
    buf[i] = keys[i];
  }

  // Fill with padding - last in sort order, not copied to keys.
  const V kPadding = st.LastValue(d);
  // Initialize an extra vector because SortingNetwork loads full vectors,
  // which may exceed cols*kMaxRows.
  for (; i < (cols * Constants::kMaxRows + N); i += N) {
    StoreU(kPadding, d, buf + i);
  }

  SortingNetwork(st, buf, cols);

  for (i = 0; i + N <= num; i += N) {
    StoreU(Load(d, buf + i), d, keys + i);
  }
  for (; i < num; ++i) {
    keys[i] = buf[i];
  }
}

// ------------------------------ Partition

// Consumes from `left` until a multiple of kUnroll*N remains.
// Temporarily stores the right side into `buf`, then moves behind `right`.
template <class D, class Traits, class T>
HWY_NOINLINE void PartitionToMultipleOfUnroll(D d, Traits st,
                                              T* HWY_RESTRICT keys,
                                              size_t& left, size_t& right,
                                              const Vec<D> pivot,
                                              T* HWY_RESTRICT buf) {
  constexpr size_t kUnroll = Constants::kPartitionUnroll;
  const size_t N = Lanes(d);
  size_t readL = left;
  size_t bufR = 0;
  const size_t num = right - left;
  // Partition requires both a multiple of kUnroll*N and at least
  // 2*kUnroll*N for the initial loads. If less, consume all here.
  const size_t num_rem =
      (num < 2 * kUnroll * N) ? num : (num & (kUnroll * N - 1));
  size_t i = 0;
  for (; i + N <= num_rem; i += N) {
    const Vec<D> vL = LoadU(d, keys + readL);
    readL += N;

    const auto comp = st.Compare(d, pivot, vL);
    left += CompressBlendedStore(vL, Not(comp), d, keys + left);
    bufR += CompressStore(vL, comp, d, buf + bufR);
  }
  // Last iteration: only use valid lanes.
  if (HWY_LIKELY(i != num_rem)) {
    const auto mask = FirstN(d, num_rem - i);
    const Vec<D> vL = LoadU(d, keys + readL);

    const auto comp = st.Compare(d, pivot, vL);
    left += CompressBlendedStore(vL, AndNot(comp, mask), d, keys + left);
    bufR += CompressStore(vL, And(comp, mask), d, buf + bufR);
  }

  // MSAN seems not to understand CompressStore. buf[0, bufR) are valid.
#if HWY_IS_MSAN
  __msan_unpoison(buf, bufR * sizeof(T));
#endif

  // Everything we loaded was put into buf, or behind the new `left`, after
  // which there is space for bufR items. First move items from `right` to
  // `left` to free up space, then copy `buf` into the vacated `right`.
  // A loop with masked loads from `buf` is insufficient - we would also need
  // to mask from `right`. Combining a loop with memcpy for the remainders is
  // slower than just memcpy, so we use that for simplicity.
  right -= bufR;
  memcpy(keys + left, keys + right, bufR * sizeof(T));
  memcpy(keys + right, buf, bufR * sizeof(T));
}

template <class D, class Traits, typename T>
HWY_INLINE void StoreLeftRight(D d, Traits st, const Vec<D> v,
                               const Vec<D> pivot, T* HWY_RESTRICT keys,
                               size_t& writeL, size_t& writeR) {
  const size_t N = Lanes(d);

  const auto comp = st.Compare(d, pivot, v);
  const size_t num_left = CompressBlendedStore(v, Not(comp), d, keys + writeL);
  writeL += num_left;

  writeR -= (N - num_left);
  (void)CompressBlendedStore(v, comp, d, keys + writeR);
}
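
// Worked example (editor's note, not in the original source): with N = 4 and
// comp = {0,1,1,0} for ascending order (lanes 1 and 2 compare greater than
// the pivot), the first CompressBlendedStore packs lanes 0 and 3 to
// keys[writeL] and num_left = 2; writeR then decreases by N - num_left = 2
// and the second store packs lanes 1 and 2 to keys[writeR]. The two output
// regions grow toward each other, so every input vector is written exactly
// once with no gaps.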

template <class D, class Traits, typename T>
HWY_INLINE void StoreLeftRight4(D d, Traits st, const Vec<D> v0,
                                const Vec<D> v1, const Vec<D> v2,
                                const Vec<D> v3, const Vec<D> pivot,
                                T* HWY_RESTRICT keys, size_t& writeL,
                                size_t& writeR) {
  StoreLeftRight(d, st, v0, pivot, keys, writeL, writeR);
  StoreLeftRight(d, st, v1, pivot, keys, writeL, writeR);
  StoreLeftRight(d, st, v2, pivot, keys, writeL, writeR);
  StoreLeftRight(d, st, v3, pivot, keys, writeL, writeR);
}

// Moves "<= pivot" keys to the front, and others to the back. pivot is
// broadcasted. Time-critical!
//
// Aligned loads do not seem to be worthwhile (not bottlenecked by load ports).
template <class D, class Traits, typename T>
HWY_NOINLINE size_t Partition(D d, Traits st, T* HWY_RESTRICT keys, size_t left,
                              size_t right, const Vec<D> pivot,
                              T* HWY_RESTRICT buf) {
  using V = decltype(Zero(d));
  const size_t N = Lanes(d);

  // StoreLeftRight will CompressBlendedStore ending at `writeR`. Unless all
  // lanes happen to be in the right-side partition, this will overrun `keys`,
  // which triggers asan errors. Avoid by special-casing the last vector.
  HWY_DASSERT(right - left > 2 * N);  // ensured by HandleSpecialCases
  right -= N;
  const size_t last = right;
  const V vlast = LoadU(d, keys + last);

  PartitionToMultipleOfUnroll(d, st, keys, left, right, pivot, buf);
  constexpr size_t kUnroll = Constants::kPartitionUnroll;

  // Invariant: [left, writeL) and [writeR, right) are already partitioned.
  size_t writeL = left;
  size_t writeR = right;

  const size_t num = right - left;
  // Cannot load if there were fewer than 2 * kUnroll * N.
  if (HWY_LIKELY(num != 0)) {
    HWY_DASSERT(num >= 2 * kUnroll * N);
    HWY_DASSERT((num & (kUnroll * N - 1)) == 0);

    // Make space for writing in-place by reading from left and right.
    const V vL0 = LoadU(d, keys + left + 0 * N);
    const V vL1 = LoadU(d, keys + left + 1 * N);
    const V vL2 = LoadU(d, keys + left + 2 * N);
    const V vL3 = LoadU(d, keys + left + 3 * N);
    left += kUnroll * N;
    right -= kUnroll * N;
    const V vR0 = LoadU(d, keys + right + 0 * N);
    const V vR1 = LoadU(d, keys + right + 1 * N);
    const V vR2 = LoadU(d, keys + right + 2 * N);
    const V vR3 = LoadU(d, keys + right + 3 * N);

    // The left/right updates may consume all inputs, so check before the loop.
    while (left != right) {
      V v0, v1, v2, v3;

      // Free up capacity for writing by loading from the side that has less.
      // Data-dependent but branching is faster than forcing branch-free.
      const size_t capacityL = left - writeL;
      const size_t capacityR = writeR - right;
      HWY_DASSERT(capacityL <= num && capacityR <= num);  // >= 0
      if (capacityR < capacityL) {
        right -= kUnroll * N;
        v0 = LoadU(d, keys + right + 0 * N);
        v1 = LoadU(d, keys + right + 1 * N);
        v2 = LoadU(d, keys + right + 2 * N);
        v3 = LoadU(d, keys + right + 3 * N);
        hwy::Prefetch(keys + right - 3 * kUnroll * N);
      } else {
        v0 = LoadU(d, keys + left + 0 * N);
        v1 = LoadU(d, keys + left + 1 * N);
        v2 = LoadU(d, keys + left + 2 * N);
        v3 = LoadU(d, keys + left + 3 * N);
        left += kUnroll * N;
        hwy::Prefetch(keys + left + 3 * kUnroll * N);
      }

      StoreLeftRight4(d, st, v0, v1, v2, v3, pivot, keys, writeL, writeR);
    }

    // Now finish writing the initial left/right to the middle.
    StoreLeftRight4(d, st, vL0, vL1, vL2, vL3, pivot, keys, writeL, writeR);
    StoreLeftRight4(d, st, vR0, vR1, vR2, vR3, pivot, keys, writeL, writeR);
  }

  // We have partitioned [left, right) such that writeL is the boundary.
  HWY_DASSERT(writeL == writeR);
  // Make space for inserting vlast: move up to N of the first right-side keys
  // into the unused space starting at last. If we have fewer, ensure they are
  // the last items in that vector by subtracting from the *load* address,
  // which is safe because we have at least two vectors (checked above).
  const size_t totalR = last - writeL;
  const size_t startR = totalR < N ? writeL + totalR - N : writeL;
  StoreU(LoadU(d, keys + startR), d, keys + last);

  // Partition vlast: write L, then R, into the single-vector gap at writeL.
  const auto comp = st.Compare(d, pivot, vlast);
  writeL += CompressBlendedStore(vlast, Not(comp), d, keys + writeL);
  (void)CompressBlendedStore(vlast, comp, d, keys + writeL);

  return writeL;
}

// ------------------------------ Pivot

template <class Traits, class V>
HWY_INLINE V MedianOf3(Traits st, V v0, V v1, V v2) {
  const DFromV<V> d;
  // Slightly faster for 128-bit, apparently because not serially dependent.
  if (st.Is128()) {
    // Median = XOR-sum 'minus' the first and last. Calling First twice is
    // slightly faster than Compare + 2 IfThenElse or even IfThenElse + XOR.
    const auto sum = Xor(Xor(v0, v1), v2);
    const auto first = st.First(d, st.First(d, v0, v1), v2);
    const auto last = st.Last(d, st.Last(d, v0, v1), v2);
    return Xor(Xor(sum, first), last);
  }
  st.Sort2(d, v0, v2);
  v1 = st.Last(d, v0, v1);
  v1 = st.First(d, v1, v2);
  return v1;
}

// Replaces triplets with their median and recurses until less than 3 keys
// remain. Ignores leftover values (non-whole triplets)!
template <class D, class Traits, typename T>
Vec<D> RecursiveMedianOf3(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
                          T* HWY_RESTRICT buf) {
  const size_t N = Lanes(d);
  constexpr size_t N1 = st.LanesPerKey();

  if (num < 3 * N1) return st.SetKey(d, keys);

  size_t read = 0;
  size_t written = 0;

  // Triplets of vectors
  for (; read + 3 * N <= num; read += 3 * N) {
    const auto v0 = Load(d, keys + read + 0 * N);
    const auto v1 = Load(d, keys + read + 1 * N);
    const auto v2 = Load(d, keys + read + 2 * N);
    Store(MedianOf3(st, v0, v1, v2), d, buf + written);
    written += N;
  }

  // Triplets of keys
  for (; read + 3 * N1 <= num; read += 3 * N1) {
    const auto v0 = st.SetKey(d, keys + read + 0 * N1);
    const auto v1 = st.SetKey(d, keys + read + 1 * N1);
    const auto v2 = st.SetKey(d, keys + read + 2 * N1);
    StoreU(MedianOf3(st, v0, v1, v2), d, buf + written);
    written += N1;
  }

  // Tail recursion; swap buffers
  return RecursiveMedianOf3(d, st, buf, written, keys);
}

#if VQSORT_SECURE_RNG
using Generator = absl::BitGen;
#else
// Based on https://github.com/numpy/numpy/issues/16313#issuecomment-641897028
#pragma pack(push, 1)
class Generator {
 public:
  Generator(const void* heap, size_t num) {
    Sorter::Fill24Bytes(heap, num, &a_);
    k_ = 1;  // stream index: must be odd
  }

  uint64_t operator()() {
    const uint64_t b = b_;
    w_ += k_;
    const uint64_t next = a_ ^ w_;
    a_ = (b + (b << 3)) ^ (b >> 11);
    const uint64_t rot = (b << 24) | (b >> 40);
    b_ = rot + next;
    return next;
  }

 private:
  uint64_t a_;
  uint64_t b_;
  uint64_t w_;
  uint64_t k_;  // increment
};
#pragma pack(pop)

#endif  // !VQSORT_SECURE_RNG

// Returns slightly biased random index of a chunk in [0, num_chunks).
// See https://www.pcg-random.org/posts/bounded-rands.html.
HWY_INLINE size_t RandomChunkIndex(const uint32_t num_chunks, uint32_t bits) {
  const uint64_t chunk_index = (static_cast<uint64_t>(bits) * num_chunks) >> 32;
  HWY_DASSERT(chunk_index < num_chunks);
  return static_cast<size_t>(chunk_index);
}
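
// Editor's note (not in the original source): this is the multiply-shift
// reduction from the post linked above. Treating bits / 2^32 as a fixed-point
// fraction in [0, 1), multiplying by num_chunks and keeping the high 32 bits
// maps it into [0, num_chunks). Example: num_chunks = 10, bits = 0x80000000
// (one half) gives (0x80000000 * 10) >> 32 = 5. The slight bias exists
// because 2^32 is generally not a multiple of num_chunks.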

template <class D, class Traits, typename T>
HWY_NOINLINE Vec<D> ChoosePivot(D d, Traits st, T* HWY_RESTRICT keys,
                                const size_t begin, const size_t end,
                                T* HWY_RESTRICT buf, Generator& rng) {
  using V = decltype(Zero(d));
  const size_t N = Lanes(d);

  // Power of two
  const size_t lanes_per_chunk = Constants::LanesPerChunk(sizeof(T), N);

  keys += begin;
  size_t num = end - begin;

  // Align start of keys to chunks. We always have at least 2 chunks because
  // the base case would have handled anything up to 16 vectors, i.e. >= 4
  // chunks.
  HWY_DASSERT(num >= 2 * lanes_per_chunk);
  const size_t misalign =
      (reinterpret_cast<uintptr_t>(keys) / sizeof(T)) & (lanes_per_chunk - 1);
  if (misalign != 0) {
    const size_t consume = lanes_per_chunk - misalign;
    keys += consume;
    num -= consume;
  }

  // Generate enough random bits for 9 uint32
  uint64_t* bits64 = reinterpret_cast<uint64_t*>(buf);
  for (size_t i = 0; i < 5; ++i) {
    bits64[i] = rng();
  }
  const uint32_t* bits = reinterpret_cast<const uint32_t*>(buf);

  const uint32_t lpc32 = static_cast<uint32_t>(lanes_per_chunk);
  // Avoid division
  const size_t log2_lpc = Num0BitsBelowLS1Bit_Nonzero32(lpc32);
  const size_t num_chunks64 = num >> log2_lpc;
  // Clamp to uint32 for RandomChunkIndex
  const uint32_t num_chunks =
      static_cast<uint32_t>(HWY_MIN(num_chunks64, 0xFFFFFFFFull));

  const size_t offset0 = RandomChunkIndex(num_chunks, bits[0]) << log2_lpc;
  const size_t offset1 = RandomChunkIndex(num_chunks, bits[1]) << log2_lpc;
  const size_t offset2 = RandomChunkIndex(num_chunks, bits[2]) << log2_lpc;
  const size_t offset3 = RandomChunkIndex(num_chunks, bits[3]) << log2_lpc;
  const size_t offset4 = RandomChunkIndex(num_chunks, bits[4]) << log2_lpc;
  const size_t offset5 = RandomChunkIndex(num_chunks, bits[5]) << log2_lpc;
  const size_t offset6 = RandomChunkIndex(num_chunks, bits[6]) << log2_lpc;
  const size_t offset7 = RandomChunkIndex(num_chunks, bits[7]) << log2_lpc;
  const size_t offset8 = RandomChunkIndex(num_chunks, bits[8]) << log2_lpc;
  for (size_t i = 0; i < lanes_per_chunk; i += N) {
    const V v0 = Load(d, keys + offset0 + i);
    const V v1 = Load(d, keys + offset1 + i);
    const V v2 = Load(d, keys + offset2 + i);
    const V medians0 = MedianOf3(st, v0, v1, v2);
    Store(medians0, d, buf + i);

    const V v3 = Load(d, keys + offset3 + i);
    const V v4 = Load(d, keys + offset4 + i);
    const V v5 = Load(d, keys + offset5 + i);
    const V medians1 = MedianOf3(st, v3, v4, v5);
    Store(medians1, d, buf + i + lanes_per_chunk);

    const V v6 = Load(d, keys + offset6 + i);
    const V v7 = Load(d, keys + offset7 + i);
    const V v8 = Load(d, keys + offset8 + i);
    const V medians2 = MedianOf3(st, v6, v7, v8);
    Store(medians2, d, buf + i + lanes_per_chunk * 2);
  }

  return RecursiveMedianOf3(d, st, buf, 3 * lanes_per_chunk,
                            buf + 3 * lanes_per_chunk);
}

// Compute exact min/max to detect all-equal partitions. Only called after a
// degenerate Partition (none in the right partition).
template <class D, class Traits, typename T>
HWY_NOINLINE void ScanMinMax(D d, Traits st, const T* HWY_RESTRICT keys,
                             size_t num, T* HWY_RESTRICT buf, Vec<D>& first,
                             Vec<D>& last) {
  const size_t N = Lanes(d);

  first = st.LastValue(d);
  last = st.FirstValue(d);

  size_t i = 0;
  for (; i + N <= num; i += N) {
    const Vec<D> v = LoadU(d, keys + i);
    first = st.First(d, v, first);
    last = st.Last(d, v, last);
  }
  if (HWY_LIKELY(i != num)) {
    HWY_DASSERT(num >= N);  // See HandleSpecialCases
    const Vec<D> v = LoadU(d, keys + num - N);
    first = st.First(d, v, first);
    last = st.Last(d, v, last);
  }

  first = st.FirstOfLanes(d, first, buf);
  last = st.LastOfLanes(d, last, buf);
}

template <class D, class Traits, typename T>
void Recurse(D d, Traits st, T* HWY_RESTRICT keys, const size_t begin,
             const size_t end, const Vec<D> pivot, T* HWY_RESTRICT buf,
             Generator& rng, size_t remaining_levels) {
  HWY_DASSERT(begin + 1 < end);
  const size_t num = end - begin;  // >= 2

  // Too many degenerate partitions. This is extremely unlikely to happen
  // because we select pivots from large (though still O(1)) samples.
  if (HWY_UNLIKELY(remaining_levels == 0)) {
    HeapSort(st, keys + begin, num);  // Slow but N*logN.
    return;
  }

  const ptrdiff_t base_case_num =
      static_cast<ptrdiff_t>(Constants::BaseCaseNum(Lanes(d)));
  const size_t bound = Partition(d, st, keys, begin, end, pivot, buf);

  const ptrdiff_t num_left =
      static_cast<ptrdiff_t>(bound) - static_cast<ptrdiff_t>(begin);
  const ptrdiff_t num_right =
      static_cast<ptrdiff_t>(end) - static_cast<ptrdiff_t>(bound);

  // Check for degenerate partitions (i.e. Partition did not move any keys):
  if (HWY_UNLIKELY(num_right == 0)) {
    // Because the pivot is one of the keys, it must have been equal to the
    // first or last key in sort order. Scan for the actual min/max:
    // passing the current pivot as the new bound is insufficient because one
    // of the partitions might not actually include that key.
    Vec<D> first, last;
    ScanMinMax(d, st, keys + begin, num, buf, first, last);
    if (AllTrue(d, Eq(first, last))) return;

    // Separate recursion to make sure that we don't pick `last` as the
    // pivot - that would again lead to a degenerate partition.
    Recurse(d, st, keys, begin, end, first, buf, rng, remaining_levels - 1);
    return;
  }

  if (HWY_UNLIKELY(num_left <= base_case_num)) {
    BaseCase(d, st, keys + begin, static_cast<size_t>(num_left), buf);
  } else {
    const Vec<D> next_pivot = ChoosePivot(d, st, keys, begin, bound, buf, rng);
    Recurse(d, st, keys, begin, bound, next_pivot, buf, rng,
            remaining_levels - 1);
  }
  if (HWY_UNLIKELY(num_right <= base_case_num)) {
    BaseCase(d, st, keys + bound, static_cast<size_t>(num_right), buf);
  } else {
    const Vec<D> next_pivot = ChoosePivot(d, st, keys, bound, end, buf, rng);
    Recurse(d, st, keys, bound, end, next_pivot, buf, rng,
            remaining_levels - 1);
  }
}

// Returns true if sorting is finished.
template <class D, class Traits, typename T>
bool HandleSpecialCases(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
                        T* HWY_RESTRICT buf) {
  const size_t N = Lanes(d);
  const size_t base_case_num = Constants::BaseCaseNum(N);

  // 128-bit keys require vectors with at least two u64 lanes, which is always
  // the case unless `d` requests partial vectors (e.g. fraction = 1/2) AND
  // the hardware vector width is less than 128 bit / fraction.
  const bool partial_128 = N < 2 && st.Is128();
  // Partition assumes its input is at least two vectors. If vectors are huge,
  // base_case_num may actually be smaller; if so (only possible on RVV), pass
  // a capped or partial d (LMUL < 1).
  constexpr bool kPotentiallyHuge =
      HWY_MAX_BYTES / sizeof(T) > Constants::kMaxRows * Constants::kMaxCols;
  const bool huge_vec = kPotentiallyHuge && (2 * N > base_case_num);
  if (partial_128 || huge_vec) {
    // PERFORMANCE WARNING: falling back to HeapSort.
    HeapSort(st, keys, num);
    return true;
  }

  // Small arrays: use sorting network, no need for other checks.
  if (HWY_UNLIKELY(num <= base_case_num)) {
    BaseCase(d, st, keys, num, buf);
    return true;
  }

  // We could also check for already sorted/reverse/equal, but that's probably
  // counterproductive if vqsort is used as a base case.

  return false;  // not finished sorting
}

#endif  // HWY_TARGET != HWY_SCALAR
}  // namespace detail

// Sorts `keys[0..num-1]` according to the order defined by `st.Compare`.
// In-place i.e. O(1) additional storage. Worst-case N*logN comparisons.
// Non-stable (order of equal keys may change), except for the common case
// where the upper bits of T are the key, and the lower bits are a sequential
// or at least unique ID.
// There is no upper limit on `num`, but note that pivots may be chosen by
// sampling only from the first 256 GiB.
//
// `d` is typically SortTag<T> (chooses between full and partial vectors).
// `st` is SharedTraits<{LaneTraits|Traits128}<Order*>>. This abstraction layer
// bridges differences in sort order and single-lane vs 128-bit keys.
template <class D, class Traits, typename T>
void Sort(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
          T* HWY_RESTRICT buf) {
#if HWY_TARGET == HWY_SCALAR
  (void)d;
  (void)buf;
  // PERFORMANCE WARNING: vqsort is not enabled for the non-SIMD target
  return detail::HeapSort(st, keys, num);
#else
  if (detail::HandleSpecialCases(d, st, keys, num, buf)) return;

#if HWY_MAX_BYTES > 64
  // sorting_networks-inl and traits assume no more than 512 bit vectors.
  if (Lanes(d) > 64 / sizeof(T)) {
    return Sort(CappedTag<T, 64 / sizeof(T)>(), st, keys, num, buf);
  }
#endif  // HWY_MAX_BYTES > 64

  // Pulled out of the recursion so we can special-case degenerate partitions.
  detail::Generator rng(keys, num);
  const Vec<D> pivot = detail::ChoosePivot(d, st, keys, 0, num, buf, rng);

  // Introspection: switch to worst-case N*logN heapsort after this many.
  const size_t max_levels = 2 * hwy::CeilLog2(num) + 4;

  detail::Recurse(d, st, keys, 0, num, pivot, buf, rng, max_levels);
#endif  // HWY_TARGET == HWY_SCALAR
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#endif  // HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE
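
MedianOf3 above exploits an XOR identity for 128-bit (u64-lane) keys: since {v0, v1, v2} equals {min, median, max} as a multiset, XOR-ing all three and then XOR-ing out the min and the max leaves the median, with no branches or blends. A scalar demonstration (hypothetical helper, assuming integer keys):

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    static uint64_t Median3Xor(uint64_t v0, uint64_t v1, uint64_t v2) {
      const uint64_t sum = v0 ^ v1 ^ v2;
      const uint64_t first = std::min(std::min(v0, v1), v2);
      const uint64_t last = std::max(std::max(v0, v1), v2);
      return sum ^ first ^ last;  // XOR removes min and max, keeping median
    }

    int main() {
      assert(Median3Xor(3, 1, 2) == 2);
      assert(Median3Xor(5, 5, 9) == 5);  // holds with duplicates, too
      return 0;
    }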

@ -1,148 +0,0 @@

// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/vqsort.h"

#include <string.h>  // memset

#include "hwy/aligned_allocator.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/shared-inl.h"

// Seed source for SFC generator: 1=getrandom, 2=CryptGenRandom
// (not all Android builds support the getrandom wrapper)
#ifndef VQSORT_SECURE_SEED

#if (defined(linux) || defined(__linux__)) && \
    !(defined(ANDROID) || defined(__ANDROID__) || HWY_ARCH_RVV)
#define VQSORT_SECURE_SEED 1
#elif defined(_WIN32) || defined(_WIN64)
#define VQSORT_SECURE_SEED 2
#else
#define VQSORT_SECURE_SEED 0
#endif

#endif  // VQSORT_SECURE_SEED

#if !VQSORT_SECURE_RNG

#include <time.h>
#if VQSORT_SECURE_SEED == 1
#include <sys/random.h>
#elif VQSORT_SECURE_SEED == 2
#include <windows.h>
#pragma comment(lib, "Advapi32.lib")
// Must come after windows.h.
#include <wincrypt.h>
#endif  // VQSORT_SECURE_SEED

#endif  // !VQSORT_SECURE_RNG

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

size_t VectorSize() { return Lanes(ScalableTag<uint8_t, 3>()); }
bool HaveFloat64() { return HWY_HAVE_FLOAT64; }

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(VectorSize);
HWY_EXPORT(HaveFloat64);

HWY_INLINE size_t PivotBufNum(size_t sizeof_t, size_t N) {
  // 3 chunks of medians, 1 chunk of median medians plus two padding vectors.
  const size_t lpc = SortConstants::LanesPerChunk(sizeof_t, N);
  return (3 + 1) * lpc + 2 * N;
}

}  // namespace

Sorter::Sorter() {
  // Determine the largest buffer size required for any type by trying them
  // all. (The capping of N in BaseCaseNum means that smaller N but larger
  // sizeof_t may require a larger buffer.)
  const size_t vector_size = HWY_DYNAMIC_DISPATCH(VectorSize)();
  size_t max_bytes = 0;
  for (size_t sizeof_t :
       {sizeof(uint16_t), sizeof(uint32_t), sizeof(uint64_t)}) {
    const size_t N = vector_size / sizeof_t;
    // One extra for padding plus another for full-vector loads.
    const size_t base_case = SortConstants::BaseCaseNum(N) + 2 * N;
    const size_t partition_num = SortConstants::PartitionBufNum(N);
    const size_t buf_lanes =
        HWY_MAX(base_case, HWY_MAX(partition_num, PivotBufNum(sizeof_t, N)));
    max_bytes = HWY_MAX(max_bytes, buf_lanes * sizeof_t);
  }

  ptr_ = hwy::AllocateAlignedBytes(max_bytes, nullptr, nullptr);

  // Prevent msan errors by initializing.
  memset(ptr_, 0, max_bytes);
}

void Sorter::Delete() {
  FreeAlignedBytes(ptr_, nullptr, nullptr);
  ptr_ = nullptr;
}

#if !VQSORT_SECURE_RNG

void Sorter::Fill24Bytes(const void* seed_heap, size_t seed_num, void* bytes) {
#if VQSORT_SECURE_SEED == 1
  // May block if urandom is not yet initialized.
  const ssize_t ret = getrandom(bytes, 24, /*flags=*/0);
  if (ret == 24) return;
#elif VQSORT_SECURE_SEED == 2
  HCRYPTPROV hProvider{};
  if (CryptAcquireContextA(&hProvider, nullptr, nullptr, PROV_RSA_FULL,
                           CRYPT_VERIFYCONTEXT)) {
    const BOOL ok =
        CryptGenRandom(hProvider, 24, reinterpret_cast<BYTE*>(bytes));
    CryptReleaseContext(hProvider, 0);
    if (ok) return;
  }
#endif

  // VQSORT_SECURE_SEED == 0, or one of the above failed. Get some entropy
  // from stack/heap/code addresses and the clock() timer.
  uint64_t* words = reinterpret_cast<uint64_t*>(bytes);
  uint64_t** seed_stack = &words;
  void (*seed_code)(const void*, size_t, void*) = &Fill24Bytes;
  const uintptr_t bits_stack = reinterpret_cast<uintptr_t>(seed_stack);
  const uintptr_t bits_heap = reinterpret_cast<uintptr_t>(seed_heap);
  const uintptr_t bits_code = reinterpret_cast<uintptr_t>(seed_code);
  const uint64_t bits_time = static_cast<uint64_t>(clock());
  words[0] = bits_stack ^ bits_time ^ seed_num;
  words[1] = bits_heap ^ bits_time ^ seed_num;
  words[2] = bits_code ^ bits_time ^ seed_num;
}

#endif  // !VQSORT_SECURE_RNG

bool Sorter::HaveFloat64() { return HWY_DYNAMIC_DISPATCH(HaveFloat64)(); }

}  // namespace hwy
#endif  // HWY_ONCE
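
The non-secure `Generator` in vqsort-inl.h (seeded by `Fill24Bytes` above) is an SFC-style construction: two words of "chaotic" state plus a Weyl sequence advanced by an odd increment. A standalone sketch of one step, mirroring `Generator::operator()` (struct and function names are illustrative):

    #include <cstdint>

    struct SfcState {
      uint64_t a, b, w, k;  // k is the Weyl increment and must be odd
    };

    static uint64_t SfcNext(SfcState& s) {
      const uint64_t b = s.b;
      s.w += s.k;                                  // Weyl sequence: long period
      const uint64_t next = s.a ^ s.w;             // output word
      s.a = (b + (b << 3)) ^ (b >> 11);            // i.e. b * 9, mixed by a shift
      const uint64_t rot = (b << 24) | (b >> 40);  // rotate left by 24
      s.b = rot + next;
      return next;
    }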

@ -1,104 +0,0 @@

// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Interface to vectorized quicksort with dynamic dispatch.

#ifndef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_
#define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_

#include "hwy/base.h"

namespace hwy {

// Aligned 128-bit type. Cannot use __int128 because clang doesn't yet align
// it: https://reviews.llvm.org/D86310
#pragma pack(push, 1)
struct alignas(16) uint128_t {
  uint64_t lo;  // little-endian layout
  uint64_t hi;
};
#pragma pack(pop)

// Tag arguments that determine the sort order.
struct SortAscending {
  constexpr bool IsAscending() const { return true; }
};
struct SortDescending {
  constexpr bool IsAscending() const { return false; }
};

// Allocates O(1) space. Type-erased RAII wrapper over hwy/aligned_allocator.h.
// This allows amortizing the allocation over multiple sorts.
class HWY_CONTRIB_DLLEXPORT Sorter {
 public:
  Sorter();
  ~Sorter() { Delete(); }

  // Move-only
  Sorter(const Sorter&) = delete;
  Sorter& operator=(const Sorter&) = delete;
  Sorter(Sorter&& other) {
    Delete();
    ptr_ = other.ptr_;
    other.ptr_ = nullptr;
  }
  Sorter& operator=(Sorter&& other) {
    Delete();
    ptr_ = other.ptr_;
    other.ptr_ = nullptr;
    return *this;
  }

  // Sorts keys[0, n). Dispatches to the best available instruction set,
  // and does not allocate memory.
  void operator()(uint16_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
  void operator()(uint16_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
  void operator()(uint32_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
  void operator()(uint32_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
  void operator()(uint64_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
  void operator()(uint64_t* HWY_RESTRICT keys, size_t n, SortDescending) const;

  void operator()(int16_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
  void operator()(int16_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
  void operator()(int32_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
  void operator()(int32_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
  void operator()(int64_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
  void operator()(int64_t* HWY_RESTRICT keys, size_t n, SortDescending) const;

  void operator()(float* HWY_RESTRICT keys, size_t n, SortAscending) const;
  void operator()(float* HWY_RESTRICT keys, size_t n, SortDescending) const;
  void operator()(double* HWY_RESTRICT keys, size_t n, SortAscending) const;
  void operator()(double* HWY_RESTRICT keys, size_t n, SortDescending) const;

  void operator()(uint128_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
  void operator()(uint128_t* HWY_RESTRICT keys, size_t n, SortDescending) const;

  // For internal use only
  static void Fill24Bytes(const void* seed_heap, size_t seed_num, void* bytes);
  static bool HaveFloat64();

 private:
  void Delete();

  template <typename T>
  T* Get() const {
    return static_cast<T*>(ptr_);
  }

  void* ptr_ = nullptr;
};

}  // namespace hwy

#endif  // HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_
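
A minimal usage sketch of the interface declared above; constructing one `Sorter` and reusing it amortizes the buffer allocation over multiple sorts, per the class comment:

    #include <cstdint>
    #include <vector>
    #include "hwy/contrib/sort/vqsort.h"

    int main() {
      std::vector<uint64_t> keys = {3, 1, 4, 1, 5, 9, 2, 6};
      hwy::Sorter sorter;  // allocates the reusable O(1) buffer once
      sorter(keys.data(), keys.size(), hwy::SortAscending());
      sorter(keys.data(), keys.size(), hwy::SortDescending());
      return 0;
    }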

@ -1,55 +0,0 @@

// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_128a.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits128-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void Sort128Asc(uint64_t* HWY_RESTRICT keys, size_t num,
                uint64_t* HWY_RESTRICT buf) {
  SortTag<uint64_t> d;
  detail::SharedTraits<detail::Traits128<detail::OrderAscending128>> st;
  Sort(d, st, keys, num, buf);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(Sort128Asc);
}  // namespace

void Sorter::operator()(uint128_t* HWY_RESTRICT keys, size_t n,
                        SortAscending) const {
  HWY_DYNAMIC_DISPATCH(Sort128Asc)
  (reinterpret_cast<uint64_t*>(keys), n * 2, Get<uint64_t>());
}

}  // namespace hwy
#endif  // HWY_ONCE
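
Each vqsort_* translation unit here follows the same three-part pattern: `foreach_target.h` recompiles the file once per instruction-set target, `HWY_EXPORT` records one table entry per compiled target, and `HWY_DYNAMIC_DISPATCH` picks the best entry at runtime. A condensed sketch (`MySort` is a hypothetical name standing in for Sort128Asc and friends):

    #undef HWY_TARGET_INCLUDE
    #define HWY_TARGET_INCLUDE "my_sort.cc"   // this file's own path
    #include "hwy/foreach_target.h"           // re-includes us once per target
    #include "hwy/highway.h"

    HWY_BEFORE_NAMESPACE();
    namespace hwy {
    namespace HWY_NAMESPACE {  // expands differently for each target
    void MySort(uint64_t* HWY_RESTRICT keys, size_t num) { /* per-target body */ }
    }  // namespace HWY_NAMESPACE
    }  // namespace hwy
    HWY_AFTER_NAMESPACE();

    #if HWY_ONCE  // compiled once, after all target passes
    namespace hwy {
    HWY_EXPORT(MySort);  // one function-pointer table entry per target
    void CallMySort(uint64_t* keys, size_t num) {
      HWY_DYNAMIC_DISPATCH(MySort)(keys, num);  // best target at runtime
    }
    }  // namespace hwy
    #endif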

@ -1,55 +0,0 @@

// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_128d.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits128-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void Sort128Desc(uint64_t* HWY_RESTRICT keys, size_t num,
                 uint64_t* HWY_RESTRICT buf) {
  SortTag<uint64_t> d;
  detail::SharedTraits<detail::Traits128<detail::OrderDescending128>> st;
  Sort(d, st, keys, num, buf);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(Sort128Desc);
}  // namespace

void Sorter::operator()(uint128_t* HWY_RESTRICT keys, size_t n,
                        SortDescending) const {
  HWY_DYNAMIC_DISPATCH(Sort128Desc)
  (reinterpret_cast<uint64_t*>(keys), n * 2, Get<uint64_t>());
}

}  // namespace hwy
#endif  // HWY_ONCE

@ -1,53 +0,0 @@

// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f32a.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void SortF32Asc(float* HWY_RESTRICT keys, size_t num, float* HWY_RESTRICT buf) {
  SortTag<float> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
  Sort(d, st, keys, num, buf);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortF32Asc);
}  // namespace

void Sorter::operator()(float* HWY_RESTRICT keys, size_t n,
                        SortAscending) const {
  HWY_DYNAMIC_DISPATCH(SortF32Asc)(keys, n, Get<float>());
}

}  // namespace hwy
#endif  // HWY_ONCE

@ -1,54 +0,0 @@

// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f32d.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void SortF32Desc(float* HWY_RESTRICT keys, size_t num,
                 float* HWY_RESTRICT buf) {
  SortTag<float> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st;
  Sort(d, st, keys, num, buf);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortF32Desc);
}  // namespace

void Sorter::operator()(float* HWY_RESTRICT keys, size_t n,
                        SortDescending) const {
  HWY_DYNAMIC_DISPATCH(SortF32Desc)(keys, n, Get<float>());
}

}  // namespace hwy
#endif  // HWY_ONCE
|
|
@@ -1,61 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f64a.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void SortF64Asc(double* HWY_RESTRICT keys, size_t num,
                double* HWY_RESTRICT buf) {
#if HWY_HAVE_FLOAT64
  SortTag<double> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
  Sort(d, st, keys, num, buf);
#else
  (void)keys;
  (void)num;
  (void)buf;
  HWY_ASSERT(0);
#endif
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortF64Asc);
}  // namespace

void Sorter::operator()(double* HWY_RESTRICT keys, size_t n,
                        SortAscending) const {
  HWY_DYNAMIC_DISPATCH(SortF64Asc)(keys, n, Get<double>());
}

}  // namespace hwy
#endif  // HWY_ONCE
@@ -1,61 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f64d.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void SortF64Desc(double* HWY_RESTRICT keys, size_t num,
                 double* HWY_RESTRICT buf) {
#if HWY_HAVE_FLOAT64
  SortTag<double> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st;
  Sort(d, st, keys, num, buf);
#else
  (void)keys;
  (void)num;
  (void)buf;
  HWY_ASSERT(0);
#endif
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortF64Desc);
}  // namespace

void Sorter::operator()(double* HWY_RESTRICT keys, size_t n,
                        SortDescending) const {
  HWY_DYNAMIC_DISPATCH(SortF64Desc)(keys, n, Get<double>());
}

}  // namespace hwy
#endif  // HWY_ONCE
@@ -1,59 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i16a.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

// Workaround for build timeout
#if !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void SortI16Asc(int16_t* HWY_RESTRICT keys, size_t num,
                int16_t* HWY_RESTRICT buf) {
  SortTag<int16_t> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
  Sort(d, st, keys, num, buf);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortI16Asc);
}  // namespace

void Sorter::operator()(int16_t* HWY_RESTRICT keys, size_t n,
                        SortAscending) const {
  HWY_DYNAMIC_DISPATCH(SortI16Asc)(keys, n, Get<int16_t>());
}

}  // namespace hwy
#endif  // HWY_ONCE

#endif  // !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
@@ -1,59 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i16d.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

// Workaround for build timeout
#if !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void SortI16Desc(int16_t* HWY_RESTRICT keys, size_t num,
                 int16_t* HWY_RESTRICT buf) {
  SortTag<int16_t> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st;
  Sort(d, st, keys, num, buf);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortI16Desc);
}  // namespace

void Sorter::operator()(int16_t* HWY_RESTRICT keys, size_t n,
                        SortDescending) const {
  HWY_DYNAMIC_DISPATCH(SortI16Desc)(keys, n, Get<int16_t>());
}

}  // namespace hwy
#endif  // HWY_ONCE

#endif  // !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
@@ -1,54 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i32a.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void SortI32Asc(int32_t* HWY_RESTRICT keys, size_t num,
                int32_t* HWY_RESTRICT buf) {
  SortTag<int32_t> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
  Sort(d, st, keys, num, buf);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortI32Asc);
}  // namespace

void Sorter::operator()(int32_t* HWY_RESTRICT keys, size_t n,
                        SortAscending) const {
  HWY_DYNAMIC_DISPATCH(SortI32Asc)(keys, n, Get<int32_t>());
}

}  // namespace hwy
#endif  // HWY_ONCE
@@ -1,54 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i32d.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void SortI32Desc(int32_t* HWY_RESTRICT keys, size_t num,
                 int32_t* HWY_RESTRICT buf) {
  SortTag<int32_t> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st;
  Sort(d, st, keys, num, buf);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortI32Desc);
}  // namespace

void Sorter::operator()(int32_t* HWY_RESTRICT keys, size_t n,
                        SortDescending) const {
  HWY_DYNAMIC_DISPATCH(SortI32Desc)(keys, n, Get<int32_t>());
}

}  // namespace hwy
#endif  // HWY_ONCE
@@ -1,54 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i64a.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void SortI64Asc(int64_t* HWY_RESTRICT keys, size_t num,
                int64_t* HWY_RESTRICT buf) {
  SortTag<int64_t> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
  Sort(d, st, keys, num, buf);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortI64Asc);
}  // namespace

void Sorter::operator()(int64_t* HWY_RESTRICT keys, size_t n,
                        SortAscending) const {
  HWY_DYNAMIC_DISPATCH(SortI64Asc)(keys, n, Get<int64_t>());
}

}  // namespace hwy
#endif  // HWY_ONCE
@@ -1,54 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i64d.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void SortI64Desc(int64_t* HWY_RESTRICT keys, size_t num,
                 int64_t* HWY_RESTRICT buf) {
  SortTag<int64_t> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st;
  Sort(d, st, keys, num, buf);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortI64Desc);
}  // namespace

void Sorter::operator()(int64_t* HWY_RESTRICT keys, size_t n,
                        SortDescending) const {
  HWY_DYNAMIC_DISPATCH(SortI64Desc)(keys, n, Get<int64_t>());
}

}  // namespace hwy
#endif  // HWY_ONCE
@@ -1,59 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u16a.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

// Workaround for build timeout
#if !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void SortU16Asc(uint16_t* HWY_RESTRICT keys, size_t num,
                uint16_t* HWY_RESTRICT buf) {
  SortTag<uint16_t> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
  Sort(d, st, keys, num, buf);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortU16Asc);
}  // namespace

void Sorter::operator()(uint16_t* HWY_RESTRICT keys, size_t n,
                        SortAscending) const {
  HWY_DYNAMIC_DISPATCH(SortU16Asc)(keys, n, Get<uint16_t>());
}

}  // namespace hwy
#endif  // HWY_ONCE

#endif  // !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
@@ -1,59 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u16d.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

// Workaround for build timeout
#if !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void SortU16Desc(uint16_t* HWY_RESTRICT keys, size_t num,
                 uint16_t* HWY_RESTRICT buf) {
  SortTag<uint16_t> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st;
  Sort(d, st, keys, num, buf);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortU16Desc);
}  // namespace

void Sorter::operator()(uint16_t* HWY_RESTRICT keys, size_t n,
                        SortDescending) const {
  HWY_DYNAMIC_DISPATCH(SortU16Desc)(keys, n, Get<uint16_t>());
}

}  // namespace hwy
#endif  // HWY_ONCE

#endif  // !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
@@ -1,54 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u32a.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void SortU32Asc(uint32_t* HWY_RESTRICT keys, size_t num,
                uint32_t* HWY_RESTRICT buf) {
  SortTag<uint32_t> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
  Sort(d, st, keys, num, buf);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortU32Asc);
}  // namespace

void Sorter::operator()(uint32_t* HWY_RESTRICT keys, size_t n,
                        SortAscending) const {
  HWY_DYNAMIC_DISPATCH(SortU32Asc)(keys, n, Get<uint32_t>());
}

}  // namespace hwy
#endif  // HWY_ONCE
@@ -1,54 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u32d.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void SortU32Desc(uint32_t* HWY_RESTRICT keys, size_t num,
                 uint32_t* HWY_RESTRICT buf) {
  SortTag<uint32_t> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st;
  Sort(d, st, keys, num, buf);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortU32Desc);
}  // namespace

void Sorter::operator()(uint32_t* HWY_RESTRICT keys, size_t n,
                        SortDescending) const {
  HWY_DYNAMIC_DISPATCH(SortU32Desc)(keys, n, Get<uint32_t>());
}

}  // namespace hwy
#endif  // HWY_ONCE
@@ -1,54 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u64a.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void SortU64Asc(uint64_t* HWY_RESTRICT keys, size_t num,
                uint64_t* HWY_RESTRICT buf) {
  SortTag<uint64_t> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
  Sort(d, st, keys, num, buf);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortU64Asc);
}  // namespace

void Sorter::operator()(uint64_t* HWY_RESTRICT keys, size_t n,
                        SortAscending) const {
  HWY_DYNAMIC_DISPATCH(SortU64Asc)(keys, n, Get<uint64_t>());
}

}  // namespace hwy
#endif  // HWY_ONCE
@@ -1,54 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u64d.cc"
#include "hwy/foreach_target.h"

// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

void SortU64Desc(uint64_t* HWY_RESTRICT keys, size_t num,
                 uint64_t* HWY_RESTRICT buf) {
  SortTag<uint64_t> d;
  detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st;
  Sort(d, st, keys, num, buf);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortU64Desc);
}  // namespace

void Sorter::operator()(uint64_t* HWY_RESTRICT keys, size_t n,
                        SortDescending) const {
  HWY_DYNAMIC_DISPATCH(SortU64Desc)(keys, n, Get<uint64_t>());
}

}  // namespace hwy
#endif  // HWY_ONCE
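Note: the sixteen deleted files above are stamped-out instances of one pattern: a translation unit per key type and sort order, each compiling the shared vqsort-inl.h implementation and exposing it behind a Sorter::operator() overload. A minimal usage sketch, assuming only the Sorter API visible in these hunks:

#include <cstdint>
#include <vector>

#include "hwy/contrib/sort/vqsort.h"

int main() {
  std::vector<int64_t> keys(1000);
  for (size_t i = 0; i < keys.size(); ++i) {
    keys[i] = static_cast<int64_t>(keys.size() - i);  // reverse order
  }
  // Each call dispatches to the best SIMD target via HWY_DYNAMIC_DISPATCH,
  // exactly as in the per-type translation units above.
  hwy::Sorter sorter;
  sorter(keys.data(), keys.size(), hwy::SortAscending());
  return keys.front() == 1 ? 0 : 1;
}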
@@ -106,6 +106,20 @@
//------------------------------------------------------------------------------
// Architecture

#if defined(HWY_EMULATE_SVE)

#define HWY_ARCH_X86_32 0
#define HWY_ARCH_X86_64 0
#define HWY_ARCH_X86 0
#define HWY_ARCH_PPC 0
#define HWY_ARCH_ARM_A64 1
#define HWY_ARCH_ARM_V7 0
#define HWY_ARCH_ARM 1
#define HWY_ARCH_WASM 0
#define HWY_ARCH_RVV 0

#else

#if defined(__i386__) || defined(_M_IX86)
#define HWY_ARCH_X86_32 1
#else

@@ -168,6 +182,8 @@
#define HWY_ARCH_RVV 0
#endif

#endif  // defined(HWY_EMULATE_SVE)

// It is an error to detect multiple architectures at the same time, but OK to
// detect none of the above.
#if (HWY_ARCH_X86 + HWY_ARCH_PPC + HWY_ARCH_ARM + HWY_ARCH_WASM + \
@@ -161,6 +161,11 @@
// user to override this without any guarantee of success.
#ifndef HWY_BASELINE_TARGETS

#if defined(HWY_EMULATE_SVE)
#define HWY_BASELINE_TARGETS HWY_SVE  // does not support SVE2
#define HWY_BASELINE_AVX3_DL 0
#else

// Also check HWY_ARCH to ensure that simulating unknown platforms ends up with
// HWY_TARGET == HWY_SCALAR.

@@ -181,7 +186,7 @@
#define HWY_BASELINE_PPC8 0
#endif

// SVE2 compiles, but is not yet tested.
// SVE compiles, but is not yet tested.
#if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE2)
#define HWY_BASELINE_SVE2 HWY_SVE2
#else

@@ -302,6 +307,8 @@
      HWY_BASELINE_SSE4 | HWY_BASELINE_AVX2 | HWY_BASELINE_AVX3 | \
      HWY_BASELINE_AVX3_DL | HWY_BASELINE_RVV)

#endif  // HWY_EMULATE_SVE

#else
// User already defined HWY_BASELINE_TARGETS, but we still need to define
// HWY_BASELINE_AVX3 (matching user's definition) for HWY_CHECK_AVX3_DL.
@@ -25,17 +25,15 @@
#include <numeric>  // iota

#include "hwy/aligned_allocator.h"
// Must come after foreach_target.h to avoid redefinition errors.
#include "hwy/highway.h"
#include "hwy/nanobenchmark.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// These templates are not found via ADL.
#if HWY_TARGET != HWY_SCALAR
using hwy::HWY_NAMESPACE::CombineShiftRightLanes;
using hwy::HWY_NAMESPACE::CombineShiftRightBytes;
#endif

class TwoArray {

@@ -89,14 +87,14 @@ void RunBenchmark(const char* caption) {
}

void Intro() {
  const float in[16] = {1, 2, 3, 4, 5, 6};
  float out[16];
  HWY_ALIGN const float in[16] = {1, 2, 3, 4, 5, 6};
  HWY_ALIGN float out[16];
  const ScalableTag<float> d;  // largest possible vector
  for (size_t i = 0; i < 16; i += Lanes(d)) {
    const auto vec = LoadU(d, in + i);  // no alignment requirement
    auto result = Mul(vec, vec);
    result = Add(result, result);  // can update if not const
    StoreU(result, d, out + i);
    const auto vec = Load(d, in + i);  // aligned!
    auto result = vec * vec;
    result += result;  // can update if not const
    Store(result, d, out + i);
  }
  printf("\nF(x)->2*x^2, F(%.0f) = %.1f\n", in[2], out[2]);
}

@@ -111,34 +109,32 @@ class BenchmarkDot : public TwoArray {
    const ScalableTag<float> d;
    const size_t N = Lanes(d);
    using V = decltype(Zero(d));
    constexpr size_t unroll = 8;
    // Compiler doesn't make independent sum* accumulators, so unroll manually.
    // We cannot use an array because V might be a sizeless type. For reasonable
    // code, we unroll 4x, but 8x might help (2 FMA ports * 4 cycle latency).
    V sum0 = Zero(d);
    V sum1 = Zero(d);
    V sum2 = Zero(d);
    V sum3 = Zero(d);
    // Some older compilers might not be able to fit the 8 arrays in registers,
    // so manual unrolling can be helpful if you run into this issue.
    // 2 FMA ports * 4 cycle latency = 8x unrolled.
    V sum[unroll];
    for (size_t i = 0; i < unroll; ++i) {
      sum[i] = Zero(d);
    }
    const float* const HWY_RESTRICT pa = &a_[0];
    const float* const HWY_RESTRICT pb = b_;
    for (size_t i = 0; i < num_items; i += 4 * N) {
      const auto a0 = Load(d, pa + i + 0 * N);
      const auto b0 = Load(d, pb + i + 0 * N);
      sum0 = MulAdd(a0, b0, sum0);
      const auto a1 = Load(d, pa + i + 1 * N);
      const auto b1 = Load(d, pb + i + 1 * N);
      sum1 = MulAdd(a1, b1, sum1);
      const auto a2 = Load(d, pa + i + 2 * N);
      const auto b2 = Load(d, pb + i + 2 * N);
      sum2 = MulAdd(a2, b2, sum2);
      const auto a3 = Load(d, pa + i + 3 * N);
      const auto b3 = Load(d, pb + i + 3 * N);
      sum3 = MulAdd(a3, b3, sum3);
    for (size_t i = 0; i < num_items; i += unroll * N) {
      for (size_t j = 0; j < unroll; ++j) {
        const auto a = Load(d, pa + i + j * N);
        const auto b = Load(d, pb + i + j * N);
        sum[j] = MulAdd(a, b, sum[j]);
      }
    }
    // Reduction tree: sum of all accumulators by pairs into sum0.
    sum0 = Add(sum0, sum1);
    sum2 = Add(sum2, sum3);
    sum0 = Add(sum0, sum2);
    dot_ = GetLane(SumOfLanes(d, sum0));
    // Reduction tree: sum of all accumulators by pairs into sum[0], then the
    // lanes.
    for (size_t power = 1; power < unroll; power *= 2) {
      for (size_t i = 0; i < unroll; i += 2 * power) {
        sum[i] += sum[i + power];
      }
    }
    dot_ = GetLane(SumOfLanes(d, sum[0]));
    return static_cast<FuncOutput>(dot_);
  }
  void Verify(size_t num_items) {

@@ -197,9 +193,9 @@ struct BenchmarkDelta : public TwoArray {
    auto prev = Load(df, &a_[0]);
    for (; i < num_items; i += Lanes(df)) {
      const auto a = Load(df, &a_[i]);
      const auto shifted = CombineShiftRightLanes<3>(df, a, prev);
      const auto shifted = CombineShiftRightLanes<3>(a, prev);
      prev = a;
      Store(Sub(a, shifted), df, &b_[i]);
      Store(a - shifted, df, &b_[i]);
    }
#endif
    return static_cast<FuncOutput>(b_[num_items - 1]);
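Note: the BenchmarkDot hunk above hinges on one comment: a single accumulator serializes every MulAdd on the previous one's latency, so several independent partial sums (2 FMA ports x 4-cycle latency suggests 8) keep the pipeline full, followed by a pairwise reduction. A scalar sketch of the same idea, independent of the Highway API (remainder handling omitted for brevity):

#include <cstddef>

// Dot product with kUnroll independent accumulators; the pairwise loop at the
// end mirrors the vector version's reduction tree into sum[0].
float DotUnrolled(const float* a, const float* b, size_t n) {
  constexpr size_t kUnroll = 8;  // 2 FMA ports * 4 cycle latency
  float sum[kUnroll] = {0};
  for (size_t i = 0; i + kUnroll <= n; i += kUnroll) {
    for (size_t j = 0; j < kUnroll; ++j) {
      sum[j] += a[i + j] * b[i + j];  // each j is its own dependency chain
    }
  }
  for (size_t power = 1; power < kUnroll; power *= 2) {
    for (size_t i = 0; i < kUnroll; i += 2 * power) {
      sum[i] += sum[i + power];
    }
  }
  return sum[0];
}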
@@ -24,14 +24,11 @@
// Generates code for each enabled target by re-including this source file.
#include "hwy/foreach_target.h"

// Must come after foreach_target.h to avoid redefinition errors.
#include "hwy/highway.h"

// Optional, can instead add HWY_ATTR to all functions.
HWY_BEFORE_NAMESPACE();
namespace skeleton {
// This namespace name is unique per target, which allows code for multiple
// targets to co-exist in the same translation unit.
namespace HWY_NAMESPACE {

// Highway ops reside here; ADL does not find templates nor builtins.

@@ -50,7 +47,7 @@ template <class DF>
ATTR_MSAN void OneFloorLog2(const DF df, const uint8_t* HWY_RESTRICT values,
                            uint8_t* HWY_RESTRICT log2) {
  // Type tags for converting to other element types (Rebind = same count).
  const RebindToSigned<DF> d32;
  const Rebind<int32_t, DF> d32;
  const Rebind<uint8_t, DF> d8;

  const auto u8 = Load(d8, values);

@@ -62,7 +59,7 @@ ATTR_MSAN void OneFloorLog2(const DF df, const uint8_t* HWY_RESTRICT values,
void CodepathDemo() {
  // Highway defaults to portability, but per-target codepaths may be selected
  // via #if HWY_TARGET == HWY_SSE4 or by testing capability macros:
#if HWY_HAVE_INTEGER64
#if HWY_CAP_INTEGER64
  const char* gather = "Has int64";
#else
  const char* gather = "No int64";

@@ -74,16 +71,20 @@ void FloorLog2(const uint8_t* HWY_RESTRICT values, size_t count,
               uint8_t* HWY_RESTRICT log2) {
  CodepathDemo();

  const ScalableTag<float> df;
  // Second argument is necessary on RVV until it supports fractional lengths.
  const ScalableTag<float, 2> df;

  const size_t N = Lanes(df);
  size_t i = 0;
  for (; i + N <= count; i += N) {
    OneFloorLog2(df, values + i, log2 + i);
  }
  // TODO(janwas): implement
#if HWY_TARGET != HWY_RVV
  for (; i < count; ++i) {
    CappedTag<float, 1> d1;
    OneFloorLog2(d1, values + i, log2 + i);
    OneFloorLog2(HWY_CAPPED(float, 1)(), values + i, log2 + i);
  }
#endif
}

// NOLINTNEXTLINE(google-readability-namespace-comments)

@@ -91,9 +92,6 @@ void FloorLog2(const uint8_t* HWY_RESTRICT values, size_t count,
}  // namespace skeleton
HWY_AFTER_NAMESPACE();

// The table of pointers to the various implementations in HWY_NAMESPACE must
// be compiled only once (foreach_target #includes this file multiple times).
// HWY_ONCE is true for only one of these 'compilation passes'.
#if HWY_ONCE

namespace skeleton {

@@ -107,8 +105,6 @@ HWY_EXPORT(FloorLog2);
// is equivalent to inlining this function.
void CallFloorLog2(const uint8_t* HWY_RESTRICT in, const size_t count,
                   uint8_t* HWY_RESTRICT out) {
  // This must reside outside of HWY_NAMESPACE because it references (calls the
  // appropriate one from) the per-target implementations there.
  return HWY_DYNAMIC_DISPATCH(FloorLog2)(in, count, out);
}
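Note: the export/dispatch pair in this hunk generalizes to any per-target function. A hedged sketch with a hypothetical MulBy2, assumed to be defined inside each HWY_NAMESPACE exactly like FloorLog2 above:

#if HWY_ONCE
namespace skeleton {

// Hypothetical function name; HWY_EXPORT builds its per-target pointer table
// on the single HWY_ONCE pass, and HWY_DYNAMIC_DISPATCH indexes that table by
// the detected CPU target.
HWY_EXPORT(MulBy2);

void CallMulBy2(const float* HWY_RESTRICT in, size_t count,
                float* HWY_RESTRICT out) {
  return HWY_DYNAMIC_DISPATCH(MulBy2)(in, count, out);
}

}  // namespace skeleton
#endif  // HWY_ONCE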
@@ -21,13 +21,10 @@
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "examples/skeleton_test.cc"
#include "hwy/foreach_target.h"

// Must come after foreach_target.h to avoid redefinition errors.
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"

// Optional: factor out parts of the implementation into *-inl.h
// (must also come after foreach_target.h to avoid redefinition errors)
#include "hwy/examples/skeleton-inl.h"

HWY_BEFORE_NAMESPACE();

@@ -53,7 +50,10 @@ struct TestFloorLog2 {
    CallFloorLog2(in.get(), count, out.get());
    int sum = 0;
    for (size_t i = 0; i < count; ++i) {
      // TODO(janwas): implement
#if HWY_TARGET != HWY_RVV
      HWY_ASSERT_EQ(expected[i], out[i]);
#endif
      sum += out[i];
    }
    hwy::PreventElision(sum);
@@ -74,28 +74,6 @@
#endif
#endif

#if (HWY_TARGETS & HWY_SVE) && (HWY_STATIC_TARGET != HWY_SVE)
#undef HWY_TARGET
#define HWY_TARGET HWY_SVE
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif

#if (HWY_TARGETS & HWY_SVE2) && (HWY_STATIC_TARGET != HWY_SVE2)
#undef HWY_TARGET
#define HWY_TARGET HWY_SVE2
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif

#if (HWY_TARGETS & HWY_SSSE3) && (HWY_STATIC_TARGET != HWY_SSSE3)
#undef HWY_TARGET
#define HWY_TARGET HWY_SSSE3
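Note: the deleted SVE/SVE2 blocks follow foreach_target.h's re-inclusion protocol: after each `#include HWY_TARGET_INCLUDE`, the toggle macro is flipped so that headers guarded by it are re-entered once per target pass. A sketch of the matching guard that a *-inl.h header uses (the guard name here is illustrative, not from this diff):

// Per-target include guard: re-include this header whenever the guard macro's
// definedness disagrees with HWY_TARGET_TOGGLE, i.e. once per target pass.
#if defined(MYLIB_OPS_INL_H_) == defined(HWY_TARGET_TOGGLE)
#ifdef MYLIB_OPS_INL_H_
#undef MYLIB_OPS_INL_H_
#else
#define MYLIB_OPS_INL_H_
#endif

// ... per-target ops, compiled once for every target in HWY_TARGETS ...

#endif  // per-target include guard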
@@ -27,7 +27,7 @@ namespace hwy {

// API version (https://semver.org/); keep in sync with CMakeLists.txt.
#define HWY_MAJOR 0
#define HWY_MINOR 16
#define HWY_MINOR 15
#define HWY_PATCH 0

//------------------------------------------------------------------------------

@@ -37,9 +37,7 @@ namespace hwy {

// HWY_FULL(T[,LMUL=1]) is a native vector/group. LMUL is the number of
// registers in the group, and is ignored on targets that do not support groups.
#define HWY_FULL1(T) hwy::HWY_NAMESPACE::ScalableTag<T>
#define HWY_FULL2(T, LMUL) \
  hwy::HWY_NAMESPACE::ScalableTag<T, CeilLog2(HWY_MAX(0, LMUL))>
#define HWY_FULL1(T) hwy::HWY_NAMESPACE::Simd<T, HWY_LANES(T)>
#define HWY_3TH_ARG(arg1, arg2, arg3, ...) arg3
// Workaround for MSVC grouping __VA_ARGS__ into a single argument
#define HWY_FULL_RECOMPOSER(args_with_paren) HWY_3TH_ARG args_with_paren

@@ -48,9 +46,9 @@ namespace hwy {
  HWY_FULL_RECOMPOSER((__VA_ARGS__, HWY_FULL2, HWY_FULL1, ))
#define HWY_FULL(...) HWY_CHOOSE_FULL(__VA_ARGS__())(__VA_ARGS__)

// Vector of up to MAX_N lanes. It's better to use full vectors where possible.
// Vector of up to MAX_N lanes. Discouraged, when possible, use Half<> instead.
#define HWY_CAPPED(T, MAX_N) \
  hwy::HWY_NAMESPACE::CappedTag<T, HWY_MIN(MAX_N, HWY_LANES(T))>
  hwy::HWY_NAMESPACE::Simd<T, HWY_MIN(MAX_N, HWY_LANES(T))>

//------------------------------------------------------------------------------
// Export user functions for static/dynamic dispatch

@@ -111,7 +109,6 @@ struct FunctionCache {
  template <FunctionType* const table[]>
  static RetType ChooseAndCall(Args... args) {
    // If we are running here it means we need to update the chosen target.
    ChosenTarget& chosen_target = GetChosenTarget();
    chosen_target.Update();
    return (table[chosen_target.GetIndex()])(args...);
  }

@@ -266,15 +263,10 @@ FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
      HWY_CHOOSE_SCALAR(FUNC_NAME), \
  }
#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) \
  (*(HWY_DISPATCH_TABLE(FUNC_NAME)[hwy::GetChosenTarget().GetIndex()]))
  (*(HWY_DISPATCH_TABLE(FUNC_NAME)[hwy::chosen_target.GetIndex()]))

#endif  // HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0)

// DEPRECATED names; please use HWY_HAVE_* instead.
#define HWY_CAP_INTEGER64 HWY_HAVE_INTEGER64
#define HWY_CAP_FLOAT16 HWY_HAVE_FLOAT16
#define HWY_CAP_FLOAT64 HWY_HAVE_FLOAT64

}  // namespace hwy

#endif  // HWY_HIGHWAY_INCLUDED

@@ -291,6 +283,13 @@ FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
#define HWY_HIGHWAY_PER_TARGET
#endif

#undef HWY_FULL2
#if HWY_TARGET == HWY_RVV
#define HWY_FULL2(T, LMUL) hwy::HWY_NAMESPACE::Simd<T, HWY_LANES(T) * (LMUL)>
#else
#define HWY_FULL2(T, LMUL) hwy::HWY_NAMESPACE::Simd<T, HWY_LANES(T)>
#endif

// These define ops inside namespace hwy::HWY_NAMESPACE.
#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
#include "hwy/ops/x86_128-inl.h"
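Note: the HWY_FULL hunk relies on a preprocessor arity-dispatch trick: appending candidate macro names to `__VA_ARGS__` and selecting the third argument picks HWY_FULL2 for two user arguments and HWY_FULL1 for one; the "recomposer" indirection works around MSVC treating `__VA_ARGS__` as a single token. A self-contained sketch of the same idiom (all names here are illustrative):

#include <cstdio>

#define PICK_3RD(arg1, arg2, arg3, ...) arg3
// MSVC workaround: force a re-scan so __VA_ARGS__ splits into arguments.
#define RECOMPOSE(args_with_paren) PICK_3RD args_with_paren
#define CHOOSE(...) RECOMPOSE((__VA_ARGS__, TWO_ARGS, ONE_ARG, ))
#define ONE_ARG(x) printf("one: %d\n", x)
#define TWO_ARGS(x, y) printf("two: %d %d\n", x, y)
#define DISPATCH(...) CHOOSE(__VA_ARGS__)(__VA_ARGS__)

int main() {
  DISPATCH(1);     // expands to ONE_ARG(1)
  DISPATCH(1, 2);  // expands to TWO_ARGS(1, 2)
}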
@@ -1,106 +0,0 @@
// Pseudo-generated file to handle both cmake & bazel build systems.

// Initial generation done using cmake code:
// include(GenerateExportHeader)
// generate_export_header(hwy EXPORT_MACRO_NAME HWY_DLLEXPORT EXPORT_FILE_NAME
// hwy/highway_export.h)
// code reformatted using clang-format --style=Google

#ifndef HWY_DLLEXPORT_H
#define HWY_DLLEXPORT_H

// Bazel builds are always static:
#if !defined(HWY_SHARED_DEFINE) && !defined(HWY_STATIC_DEFINE)
#define HWY_STATIC_DEFINE
#endif

#ifdef HWY_STATIC_DEFINE
#define HWY_DLLEXPORT
#define HWY_NO_EXPORT
#define HWY_CONTRIB_DLLEXPORT
#define HWY_CONTRIB_NO_EXPORT
#define HWY_TEST_DLLEXPORT
#define HWY_TEST_NO_EXPORT
#else

#ifndef HWY_DLLEXPORT
#if defined(hwy_EXPORTS)
/* We are building this library */
#ifdef _WIN32
#define HWY_DLLEXPORT __declspec(dllexport)
#else
#define HWY_DLLEXPORT __attribute__((visibility("default")))
#endif
#else
/* We are using this library */
#ifdef _WIN32
#define HWY_DLLEXPORT __declspec(dllimport)
#else
#define HWY_DLLEXPORT __attribute__((visibility("default")))
#endif
#endif
#endif

#ifndef HWY_NO_EXPORT
#ifdef _WIN32
#define HWY_NO_EXPORT
#else
#define HWY_NO_EXPORT __attribute__((visibility("hidden")))
#endif
#endif

#ifndef HWY_CONTRIB_DLLEXPORT
#if defined(hwy_contrib_EXPORTS)
/* We are building this library */
#ifdef _WIN32
#define HWY_CONTRIB_DLLEXPORT __declspec(dllexport)
#else
#define HWY_CONTRIB_DLLEXPORT __attribute__((visibility("default")))
#endif
#else
/* We are using this library */
#ifdef _WIN32
#define HWY_CONTRIB_DLLEXPORT __declspec(dllimport)
#else
#define HWY_CONTRIB_DLLEXPORT __attribute__((visibility("default")))
#endif
#endif
#endif

#ifndef HWY_CONTRIB_NO_EXPORT
#ifdef _WIN32
#define HWY_CONTRIB_NO_EXPORT
#else
#define HWY_CONTRIB_NO_EXPORT __attribute__((visibility("hidden")))
#endif
#endif

#ifndef HWY_TEST_DLLEXPORT
#if defined(hwy_test_EXPORTS)
/* We are building this library */
#ifdef _WIN32
#define HWY_TEST_DLLEXPORT __declspec(dllexport)
#else
#define HWY_TEST_DLLEXPORT __attribute__((visibility("default")))
#endif
#else
/* We are using this library */
#ifdef _WIN32
#define HWY_TEST_DLLEXPORT __declspec(dllimport)
#else
#define HWY_TEST_DLLEXPORT __attribute__((visibility("default")))
#endif
#endif
#endif

#ifndef HWY_TEST_NO_EXPORT
#ifdef _WIN32
#define HWY_TEST_NO_EXPORT
#else
#define HWY_TEST_NO_EXPORT __attribute__((visibility("hidden")))
#endif
#endif

#endif

#endif /* HWY_DLLEXPORT_H */
@@ -15,8 +15,6 @@
#include <stddef.h>
#include <stdint.h>

#include <bitset>

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "highway_test.cc"
#include "hwy/foreach_target.h"

@@ -28,53 +26,6 @@ HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// For testing that ForPartialVectors reaches every possible size:
using NumLanesSet = std::bitset<HWY_MAX_BYTES + 1>;

// Monostate pattern because ForPartialVectors takes a template argument, not a
// functor by reference.
static NumLanesSet* NumLanesForSize(size_t sizeof_t) {
  HWY_ASSERT(sizeof_t <= sizeof(uint64_t));
  static NumLanesSet num_lanes[sizeof(uint64_t) + 1];
  return num_lanes + sizeof_t;
}
static size_t* MaxLanesForSize(size_t sizeof_t) {
  HWY_ASSERT(sizeof_t <= sizeof(uint64_t));
  static size_t num_lanes[sizeof(uint64_t) + 1] = {0};
  return num_lanes + sizeof_t;
}

struct TestMaxLanes {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    const size_t kMax = MaxLanes(d);
    HWY_ASSERT(N <= kMax);
    HWY_ASSERT(kMax <= (HWY_MAX_BYTES / sizeof(T)));

    NumLanesForSize(sizeof(T))->set(N);
    *MaxLanesForSize(sizeof(T)) = HWY_MAX(*MaxLanesForSize(sizeof(T)), N);
  }
};

HWY_NOINLINE void TestAllMaxLanes() {
  ForAllTypes(ForPartialVectors<TestMaxLanes>());

  // Ensure ForPartialVectors visited all powers of two [1, N].
  for (size_t sizeof_t : {sizeof(uint8_t), sizeof(uint16_t), sizeof(uint32_t),
                          sizeof(uint64_t)}) {
    const size_t N = *MaxLanesForSize(sizeof_t);
    for (size_t i = 1; i <= N; i += i) {
      if (!NumLanesForSize(sizeof_t)->test(i)) {
        fprintf(stderr, "T=%d: did not visit for N=%d, max=%d\n",
                static_cast<int>(sizeof_t), static_cast<int>(i),
                static_cast<int>(N));
        HWY_ASSERT(false);
      }
    }
  }
}

struct TestSet {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {

@@ -371,7 +322,6 @@ HWY_AFTER_NAMESPACE();

namespace hwy {
HWY_BEFORE_TEST(HighwayTest);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllMaxLanes);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllSet);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllOverflow);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllClamp);
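Note: the deleted TestMaxLanes code demonstrates the monostate workaround it mentions: ForPartialVectors instantiates the functor from a template argument, so there is no object to hold state, and results are accumulated in function-local statics instead. A reduced sketch of that idiom:

#include <bitset>
#include <cstddef>

// All functor instances share this function-local static, so a framework that
// default-constructs the functor per call still sees one accumulated result.
static std::bitset<64>* VisitedLanes() {
  static std::bitset<64> visited;
  return &visited;
}

struct RecordLanes {
  void operator()(size_t num_lanes) const {
    if (num_lanes < 64) VisitedLanes()->set(num_lanes);
  }
};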
@@ -1,19 +0,0 @@
HWY_0 {
  global:
    extern "C++" {
      *hwy::*;
    };

  local:
    # Hide all the std namespace symbols. std namespace is explicitly marked
    # as visibility(default) and header-only functions or methods (such as those
    # from templates) should be exposed in shared libraries as weak symbols but
    # this is only needed when we expose those types in the shared library API
    # in any way. We don't use C++ std types in the API and we also don't
    # support exceptions in the library.
    # See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=36022 for a discussion
    # about this.
    extern "C++" {
      *std::*;
    };
};
@@ -37,7 +37,7 @@
#include <windows.h>
#endif

#if defined(__APPLE__)
#if defined(__MACH__)
#include <mach/mach.h>
#include <mach/mach_time.h>
#endif

@@ -148,7 +148,7 @@ inline Ticks Start() {
  LARGE_INTEGER counter;
  (void)QueryPerformanceCounter(&counter);
  t = counter.QuadPart;
#elif defined(__APPLE__)
#elif defined(__MACH__)
  t = mach_absolute_time();
#elif defined(__HAIKU__)
  t = system_time_nsecs();  // since boot

@@ -405,7 +405,7 @@ double NominalClockRate() {

}  // namespace

HWY_DLLEXPORT double InvariantTicksPerSecond() {
double InvariantTicksPerSecond() {
#if HWY_ARCH_PPC && defined(__GLIBC__)
  return double(__ppc_get_timebase_freq());
#elif HWY_ARCH_X86

@@ -415,7 +415,7 @@ HWY_DLLEXPORT double InvariantTicksPerSecond() {
  LARGE_INTEGER freq;
  (void)QueryPerformanceFrequency(&freq);
  return double(freq.QuadPart);
#elif defined(__APPLE__)
#elif defined(__MACH__)
  // https://developer.apple.com/library/mac/qa/qa1398/_index.html
  mach_timebase_info_data_t timebase;
  (void)mach_timebase_info(&timebase);

@@ -426,12 +426,12 @@ HWY_DLLEXPORT double InvariantTicksPerSecond() {
#endif
}

HWY_DLLEXPORT double Now() {
double Now() {
  static const double mul = 1.0 / InvariantTicksPerSecond();
  return static_cast<double>(timer::Start()) * mul;
}

HWY_DLLEXPORT uint64_t TimerResolution() {
uint64_t TimerResolution() {
  // Nested loop avoids exceeding stack/L1 capacity.
  timer::Ticks repetitions[Params::kTimerSamples];
  for (size_t rep = 0; rep < Params::kTimerSamples; ++rep) {

@@ -656,11 +656,10 @@ timer::Ticks Overhead(const uint8_t* arg, const InputVec* inputs,

}  // namespace

HWY_DLLEXPORT int Unpredictable1() { return timer::Start() != ~0ULL; }
int Unpredictable1() { return timer::Start() != ~0ULL; }

HWY_DLLEXPORT size_t Measure(const Func func, const uint8_t* arg,
                             const FuncInput* inputs, const size_t num_inputs,
                             Result* results, const Params& p) {
size_t Measure(const Func func, const uint8_t* arg, const FuncInput* inputs,
               const size_t num_inputs, Result* results, const Params& p) {
  NANOBENCHMARK_CHECK(num_inputs != 0);

#if HWY_ARCH_X86
@@ -47,8 +47,6 @@
#include <stddef.h>
#include <stdint.h>

#include "hwy/highway_export.h"

// Enables sanity checks that verify correct operation at the cost of
// longer benchmark runs.
#ifndef NANOBENCHMARK_ENABLE_CHECKS

@@ -74,23 +72,23 @@ namespace platform {
// Returns tick rate, useful for converting measurements to seconds. Invariant
// means the tick counter frequency is independent of CPU throttling or sleep.
// This call may be expensive, callers should cache the result.
HWY_DLLEXPORT double InvariantTicksPerSecond();
double InvariantTicksPerSecond();

// Returns current timestamp [in seconds] relative to an unspecified origin.
// Features: monotonic (no negative elapsed time), steady (unaffected by system
// time changes), high-resolution (on the order of microseconds).
HWY_DLLEXPORT double Now();
double Now();

// Returns ticks elapsed in back to back timer calls, i.e. a function of the
// timer resolution (minimum measurable difference) and overhead.
// This call is expensive, callers should cache the result.
HWY_DLLEXPORT uint64_t TimerResolution();
uint64_t TimerResolution();

}  // namespace platform

// Returns 1, but without the compiler knowing what the value is. This prevents
// optimizing out code.
HWY_DLLEXPORT int Unpredictable1();
int Unpredictable1();

// Input influencing the function being measured (e.g. number of bytes to copy).
using FuncInput = size_t;

@@ -166,9 +164,9 @@ struct Result {
// uniform distribution over [0, 4) could be represented as {3,0,2,1}.
// Returns how many Result were written to "results": one per unique input, or
// zero if the measurement failed (an error message goes to stderr).
HWY_DLLEXPORT size_t Measure(const Func func, const uint8_t* arg,
                             const FuncInput* inputs, const size_t num_inputs,
                             Result* results, const Params& p = Params());
size_t Measure(const Func func, const uint8_t* arg, const FuncInput* inputs,
               const size_t num_inputs, Result* results,
               const Params& p = Params());

// Calls operator() of the given closure (lambda function).
template <class Closure>
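Note: the header comments above give the caching advice directly ("This call is expensive, callers should cache the result"). A short usage sketch against the platform API declared in this hunk:

#include <cstdint>
#include <cstdio>

#include "hwy/nanobenchmark.h"

int main() {
  // Expensive calls: do them once and reuse.
  static const double ticks_per_sec = hwy::platform::InvariantTicksPerSecond();
  static const uint64_t resolution = hwy::platform::TimerResolution();

  const double t0 = hwy::platform::Now();
  double x = 0.0;
  for (int i = 0; i < 1000000; ++i) x += i * 1e-9;
  const double t1 = hwy::platform::Now();

  printf("%.6f s elapsed; %.0f ticks/s, resolution %llu ticks, x=%.3f\n",
         t1 - t0, ticks_per_sec, (unsigned long long)resolution, x);
  return 0;
}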
(File diff not shown because it is too large.)
(File diff not shown because it is too large.)
@ -19,14 +19,14 @@ HWY_BEFORE_NAMESPACE();
|
|||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
// The lane type of a vector type, e.g. float for Vec<ScalableTag<float>>.
|
||||
// The lane type of a vector type, e.g. float for Vec<Simd<float, 4>>.
|
||||
template <class V>
|
||||
using LaneType = decltype(GetLane(V()));
|
||||
|
||||
// Vector type, e.g. Vec128<float> for CappedTag<float, 4>. Useful as the return
|
||||
// type of functions that do not take a vector argument, or as an argument type
|
||||
// if the function only has a template argument for D, or for explicit type
|
||||
// names instead of auto. This may be a built-in type.
|
||||
// Vector type, e.g. Vec128<float> for Simd<float, 4>. Useful as the return type
|
||||
// of functions that do not take a vector argument, or as an argument type if
|
||||
// the function only has a template argument for D, or for explicit type names
|
||||
// instead of auto. This may be a built-in type.
|
||||
template <class D>
|
||||
using Vec = decltype(Zero(D()));
|
||||
|
||||
|
@ -53,6 +53,12 @@ HWY_API V CombineShiftRightLanes(D d, const V hi, const V lo) {
  return CombineShiftRightBytes<kBytes>(d, hi, lo);
}

// DEPRECATED
template <size_t kLanes, class V>
HWY_API V CombineShiftRightLanes(const V hi, const V lo) {
  return CombineShiftRightLanes<kLanes>(DFromV<V>(), hi, lo);
}

#endif

// Returns lanes with the most significant bit set and all other bits zero.

@ -202,15 +208,6 @@ HWY_API V AESRound(V state, const V round_key) {
  return state;
}

template <class V>  // u8
HWY_API V AESLastRound(V state, const V round_key) {
  // Like AESRound, but without MixColumns.
  state = detail::SubBytes(state);
  state = detail::ShiftRows(state);
  state = Xor(state, round_key);  // AddRoundKey
  return state;
}

// Constant-time implementation inspired by
// https://www.bearssl.org/constanttime.html, but about half the cost because we
// use 64x64 multiplies and 128-bit XORs.

@ -281,47 +278,23 @@ HWY_API V CLMulUpper(V a, V b) {
#define HWY_NATIVE_POPCNT
#endif

#if HWY_TARGET == HWY_RVV
#define HWY_MIN_POW2_FOR_128 1
#else
// All other targets except HWY_SCALAR (which is excluded by HWY_IF_GE128_D)
// guarantee 128 bits anyway.
#define HWY_MIN_POW2_FOR_128 0
#endif

// This algorithm requires vectors to be at least 16 bytes, which is the case
// for LMUL >= 2. If not, use the fallback below.
template <typename V, HWY_IF_LANES_ARE(uint8_t, V), HWY_IF_GE128_D(DFromV<V>),
          HWY_IF_POW2_GE(DFromV<V>, HWY_MIN_POW2_FOR_128)>
template <typename V, HWY_IF_LANES_ARE(uint8_t, V)>
HWY_API V PopulationCount(V v) {
  const DFromV<V> d;
  constexpr DFromV<V> d;
  HWY_ALIGN constexpr uint8_t kLookup[16] = {
      0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
  };
  const auto lo = And(v, Set(d, 0xF));
  const auto hi = ShiftRight<4>(v);
  const auto lookup = LoadDup128(d, kLookup);
  auto lo = And(v, Set(d, 0xF));
  auto hi = ShiftRight<4>(v);
  auto lookup = LoadDup128(Simd<uint8_t, HWY_MAX(16, MaxLanes(d))>(), kLookup);
  return Add(TableLookupBytes(lookup, hi), TableLookupBytes(lookup, lo));
}

// RVV has a specialization that avoids the Set().
#if HWY_TARGET != HWY_RVV
// Slower fallback for capped vectors.
template <typename V, HWY_IF_LANES_ARE(uint8_t, V), HWY_IF_LT128_D(DFromV<V>)>
HWY_API V PopulationCount(V v) {
  const DFromV<V> d;
  // See https://arxiv.org/pdf/1611.07612.pdf, Figure 3
  v = Sub(v, And(ShiftRight<1>(v), Set(d, 0x55)));
  v = Add(And(ShiftRight<2>(v), Set(d, 0x33)), And(v, Set(d, 0x33)));
  return And(Add(v, ShiftRight<4>(v)), Set(d, 0x0F));
}
#endif  // HWY_TARGET != HWY_RVV

template <typename V, HWY_IF_LANES_ARE(uint16_t, V)>
HWY_API V PopulationCount(V v) {
  const DFromV<V> d;
  const Repartition<uint8_t, decltype(d)> d8;
  const auto vals = BitCast(d, PopulationCount(BitCast(d8, v)));
  Repartition<uint8_t, decltype(d)> d8;
  auto vals = BitCast(d, PopulationCount(BitCast(d8, v)));
  return Add(ShiftRight<8>(vals), And(vals, Set(d, 0xFF)));
}

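Editor's aside, not part of the diff: the two strategies above in scalar form, so the arithmetic can be checked by hand.

#include <cstdint>

// 1) Nibble LUT, mirroring the 16-entry kLookup table:
inline uint8_t PopCountLut(uint8_t x) {
  static constexpr uint8_t kLookup[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                          1, 2, 2, 3, 2, 3, 3, 4};
  return kLookup[x & 0xF] + kLookup[x >> 4];
}

// 2) The bit trick from arxiv.org/pdf/1611.07612.pdf, Figure 3. For x = 0xFF:
// 0xAA (pairs of 2) -> 0x44 (nibbles of 4) -> (0x44 + 0x04) & 0x0F = 8.
inline uint8_t PopCountSwar(uint8_t x) {
  x = x - ((x >> 1) & 0x55);           // 2-bit partial counts
  x = ((x >> 2) & 0x33) + (x & 0x33);  // 4-bit partial counts
  return (x + (x >> 4)) & 0x0F;        // sum the two nibbles
}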
@ -333,7 +306,7 @@ HWY_API V PopulationCount(V v) {
  return Add(ShiftRight<16>(vals), And(vals, Set(d, 0xFF)));
}

#if HWY_HAVE_INTEGER64
#if HWY_CAP_INTEGER64
template <typename V, HWY_IF_LANES_ARE(uint64_t, V)>
HWY_API V PopulationCount(V v) {
  const DFromV<V> d;
Diff not shown because of its size.
@ -27,7 +27,7 @@ namespace HWY_NAMESPACE {

// Single instruction, single data.
template <typename T>
using Sisd = Simd<T, 1, 0>;
using Sisd = Simd<T, 1>;

// (Wrapper class required for overloading comparison operators.)
template <typename T>

@ -187,20 +187,6 @@ HWY_API Vec1<T> operator^(const Vec1<T> a, const Vec1<T> b) {
  return Xor(a, b);
}

// ------------------------------ OrAnd

template <typename T>
HWY_API Vec1<T> OrAnd(const Vec1<T> o, const Vec1<T> a1, const Vec1<T> a2) {
  return Or(o, And(a1, a2));
}

// ------------------------------ IfVecThenElse

template <typename T>
HWY_API Vec1<T> IfVecThenElse(Vec1<T> mask, Vec1<T> yes, Vec1<T> no) {
  return IfThenElse(MaskFromVec(mask), yes, no);
}

// ------------------------------ CopySign

template <typename T>

@ -289,11 +275,6 @@ HWY_API Vec1<T> IfThenZeroElse(const Mask1<T> mask, const Vec1<T> no) {
  return mask.bits ? Vec1<T>(0) : no;
}

template <typename T>
HWY_API Vec1<T> IfNegativeThenElse(Vec1<T> v, Vec1<T> yes, Vec1<T> no) {
  return v.raw < 0 ? yes : no;
}

template <typename T>
HWY_API Vec1<T> ZeroIfNegative(const Vec1<T> v) {
  return v.raw < 0 ? Vec1<T>(0) : v;

@ -442,13 +423,7 @@ HWY_API Vec1<double> operator-(const Vec1<double> a, const Vec1<double> b) {
  return Vec1<double>(a.raw - b.raw);
}

// ------------------------------ SumsOf8

HWY_API Vec1<uint64_t> SumsOf8(const Vec1<uint8_t> v) {
  return Vec1<uint64_t>(v.raw);
}

// ------------------------------ SaturatedAdd
// ------------------------------ Saturating addition

// Returns a + b clamped to the destination range.

@ -956,30 +931,21 @@ HWY_API Vec1<ToT> PromoteTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
  return Vec1<ToT>(static_cast<ToT>(from.raw));
}

// MSVC 19.10 cannot deduce the argument type if HWY_IF_FLOAT(FromT) is here,
// so we overload for FromT=double and ToT={float,int32_t}.
HWY_API Vec1<float> DemoteTo(Sisd<float> /* tag */, Vec1<double> from) {
template <typename FromT, typename ToT, HWY_IF_FLOAT(FromT)>
HWY_API Vec1<ToT> DemoteTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
  static_assert(sizeof(ToT) < sizeof(FromT), "Not demoting");

  // Prevent ubsan errors when converting float to narrower integer/float
  if (std::isinf(from.raw) ||
      std::fabs(from.raw) > static_cast<double>(HighestValue<float>())) {
    return Vec1<float>(std::signbit(from.raw) ? LowestValue<float>()
                                              : HighestValue<float>());
      std::fabs(from.raw) > static_cast<FromT>(HighestValue<ToT>())) {
    return Vec1<ToT>(std::signbit(from.raw) ? LowestValue<ToT>()
                                            : HighestValue<ToT>());
  }
  return Vec1<float>(static_cast<float>(from.raw));
}
HWY_API Vec1<int32_t> DemoteTo(Sisd<int32_t> /* tag */, Vec1<double> from) {
  // Prevent ubsan errors when converting double to narrower integer/int32_t
  if (std::isinf(from.raw) ||
      std::fabs(from.raw) > static_cast<double>(HighestValue<int32_t>())) {
    return Vec1<int32_t>(std::signbit(from.raw) ? LowestValue<int32_t>()
                                                : HighestValue<int32_t>());
  }
  return Vec1<int32_t>(static_cast<int32_t>(from.raw));
  return Vec1<ToT>(static_cast<ToT>(from.raw));
}

template <typename FromT, typename ToT>
template <typename FromT, typename ToT, HWY_IF_NOT_FLOAT(FromT)>
HWY_API Vec1<ToT> DemoteTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
  static_assert(!IsFloat<FromT>(), "FromT=double are handled above");
  static_assert(sizeof(ToT) < sizeof(FromT), "Not demoting");

  // Int to int: choose closest value in ToT to `from` (avoids UB)

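Editor's note, not part of the diff: both overloads saturate out-of-range inputs instead of invoking undefined behavior. For example, DemoteTo(Sisd<float>(), Vec1<double>(1e40)) yields HighestValue<float>(), and DemoteTo(Sisd<int32_t>(), Vec1<double>(-1e300)) yields LowestValue<int32_t>().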
@ -1117,12 +1083,6 @@ HWY_API T GetLane(const Vec1<T> v) {
  return v.raw;
}

template <typename T>
HWY_API Vec1<T> DupEven(Vec1<T> v) {
  return v;
}
// DupOdd is unsupported.

template <typename T>
HWY_API Vec1<T> OddEven(Vec1<T> /* odd */, Vec1<T> even) {
  return even;

@ -1165,14 +1125,6 @@ HWY_API Vec1<T> TableLookupLanes(const Vec1<T> v, const Indices1<T> /* idx */) {
  return v;
}

// ------------------------------ ReverseBlocks

// Single block: no change
template <typename T>
HWY_API Vec1<T> ReverseBlocks(Sisd<T> /* tag */, const Vec1<T> v) {
  return v;
}

// ------------------------------ Reverse

template <typename T>

@ -1180,21 +1132,6 @@ HWY_API Vec1<T> Reverse(Sisd<T> /* tag */, const Vec1<T> v) {
  return v;
}

template <typename T>
HWY_API Vec1<T> Reverse2(Sisd<T> /* tag */, const Vec1<T> v) {
  return v;
}

template <typename T>
HWY_API Vec1<T> Reverse4(Sisd<T> /* tag */, const Vec1<T> v) {
  return v;
}

template <typename T>
HWY_API Vec1<T> Reverse8(Sisd<T> /* tag */, const Vec1<T> v) {
  return v;
}

// ================================================== BLOCKWISE
// Shift*Bytes, CombineShiftRightBytes, Interleave*, Shuffle* are unsupported.

@ -1371,6 +1308,41 @@ HWY_API Vec1<T> MaxOfLanes(Sisd<T> /* tag */, const Vec1<T> v) {
  return v;
}

// ================================================== DEPRECATED

template <typename T>
HWY_API size_t StoreMaskBits(const Mask1<T> mask, uint8_t* bits) {
  return StoreMaskBits(Sisd<T>(), mask, bits);
}

template <typename T>
HWY_API bool AllTrue(const Mask1<T> mask) {
  return AllTrue(Sisd<T>(), mask);
}

template <typename T>
HWY_API bool AllFalse(const Mask1<T> mask) {
  return AllFalse(Sisd<T>(), mask);
}

template <typename T>
HWY_API size_t CountTrue(const Mask1<T> mask) {
  return CountTrue(Sisd<T>(), mask);
}

template <typename T>
HWY_API Vec1<T> SumOfLanes(const Vec1<T> v) {
  return SumOfLanes(Sisd<T>(), v);
}
template <typename T>
HWY_API Vec1<T> MinOfLanes(const Vec1<T> v) {
  return MinOfLanes(Sisd<T>(), v);
}
template <typename T>
HWY_API Vec1<T> MaxOfLanes(const Vec1<T> v) {
  return MaxOfLanes(Sisd<T>(), v);
}

// ================================================== Operator wrapper

template <class V>
@ -32,10 +32,9 @@
#undef HWY_MAX_BYTES
#undef HWY_LANES

#undef HWY_HAVE_SCALABLE
#undef HWY_HAVE_INTEGER64
#undef HWY_HAVE_FLOAT16
#undef HWY_HAVE_FLOAT64
#undef HWY_CAP_INTEGER64
#undef HWY_CAP_FLOAT16
#undef HWY_CAP_FLOAT64
#undef HWY_CAP_GE256
#undef HWY_CAP_GE512

@ -80,10 +79,9 @@
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 1
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT16 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_AES 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

@ -98,10 +96,9 @@
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 1
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT16 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

@ -116,10 +113,9 @@
#define HWY_MAX_BYTES 32
#define HWY_LANES(T) (32 / sizeof(T))

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 1
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT16 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 1
#define HWY_CAP_GE512 0

@ -133,10 +129,9 @@
#define HWY_MAX_BYTES 64
#define HWY_LANES(T) (64 / sizeof(T))

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 1
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT16 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 1
#define HWY_CAP_GE512 1

@ -164,10 +159,9 @@
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 0
#define HWY_HAVE_FLOAT64 1
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT16 0
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

@ -183,16 +177,15 @@
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT16 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

#if HWY_ARCH_ARM_A64
#define HWY_HAVE_FLOAT64 1
#define HWY_CAP_FLOAT64 1
#else
#define HWY_HAVE_FLOAT64 0
#define HWY_CAP_FLOAT64 0
#endif

#define HWY_NAMESPACE N_NEON

@ -203,19 +196,23 @@
// SVE[2]
#elif HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE

#if defined(HWY_EMULATE_SVE) && !defined(__F16C__)
#error "Disable HWY_CAP_FLOAT16 or ensure farm_sve actually converts to f16"
#endif

// SVE only requires lane alignment, not natural alignment of the entire vector.
#define HWY_ALIGN alignas(8)

#define HWY_MAX_BYTES 256

// Value ensures MaxLanes() is the tightest possible upper bound to reduce
// overallocation.
#define HWY_LANES(T) ((HWY_MAX_BYTES) / sizeof(T))
// <= HWY_MAX_BYTES / sizeof(T): exact size. Otherwise a fraction 1/div (div =
// 1,2,4,8) is encoded as HWY_LANES(T) / div. This value leaves enough room for
// div=8 and demoting to 1/8 the lane width while still exceeding HWY_MAX_BYTES.
#define HWY_LANES(T) (32768 / sizeof(T))

#define HWY_HAVE_SCALABLE 1
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 1
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT16 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
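Editor's gloss, not part of the diff: with the pre-backout HWY_LANES(T) = 32768 / sizeof(T), HWY_LANES(float) is 8192; a 1/2 fraction is encoded as 8192 / 2 = 4096 and a 1/8 fraction as 8192 / 8 = 1024. Exact sizes top out at HWY_MAX_BYTES / sizeof(float) = 64, so every encoded fraction stays above that range and the two encodings cannot collide.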
@ -235,10 +232,9 @@
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 0
#define HWY_CAP_INTEGER64 0
#define HWY_CAP_FLOAT16 1
#define HWY_CAP_FLOAT64 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

@ -254,10 +250,9 @@
#define HWY_MAX_BYTES 32
#define HWY_LANES(T) (32 / sizeof(T))

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 0
#define HWY_CAP_INTEGER64 0
#define HWY_CAP_FLOAT16 1
#define HWY_CAP_FLOAT64 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

@ -276,20 +271,20 @@
// The spec requires VLEN <= 2^16 bits, so the limit is 2^16 bytes (LMUL=8).
#define HWY_MAX_BYTES 65536

// = HWY_MAX_BYTES divided by max LMUL=8 because MaxLanes includes the actual
// LMUL. This is the tightest possible upper bound.
#define HWY_LANES(T) (8192 / sizeof(T))
// <= HWY_MAX_BYTES / sizeof(T): exact size. Otherwise a fraction 1/div (div =
// 1,2,4,8) is encoded as HWY_LANES(T) / div. This value leaves enough room for
// div=8 and demoting to 1/8 the lane width while still exceeding HWY_MAX_BYTES.
#define HWY_LANES(T) (8388608 / sizeof(T))

#define HWY_HAVE_SCALABLE 1
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT64 1
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

#if defined(__riscv_zfh)
#define HWY_HAVE_FLOAT16 1
#define HWY_CAP_FLOAT16 1
#else
#define HWY_HAVE_FLOAT16 0
#define HWY_CAP_FLOAT16 0
#endif

#define HWY_NAMESPACE N_RVV

@ -305,10 +300,9 @@
#define HWY_MAX_BYTES 8
#define HWY_LANES(T) 1

#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 1
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT16 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0

@ -350,3 +344,7 @@
#else
#define HWY_ATTR
#endif

// DEPRECATED
#undef HWY_GATHER_LANES
#define HWY_GATHER_LANES(T) HWY_LANES(T)
@ -26,117 +26,65 @@ HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// Highway operations are implemented as overloaded functions selected using an
// internal-only tag type D := Simd<T, N, kPow2>. T is the lane type. kPow2 is a
// shift count applied to scalable vectors. Instead of referring to Simd<>
// directly, users create D via aliases ScalableTag<T[, kPow2]>() (defaults to a
// full vector, or fractions/groups if the argument is negative/positive),
// CappedTag<T, kLimit> or FixedTag<T, kNumLanes>. The actual number of lanes is
// Lanes(D()), a power of two. For scalable vectors, N is either HWY_LANES or a
// cap. For constexpr-size vectors, N is the actual number of lanes. This
// ensures Half<Full512<T>> is the same type as Full256<T>, as required by x86.
template <typename Lane, size_t N, int kPow2>
// SIMD operations are implemented as overloaded functions selected using a tag
// type D := Simd<T, N>. T is the lane type, N an opaque integer for internal
// use only. Users create D via aliases ScalableTag<T>() (a full vector),
// CappedTag<T, kLimit> or FixedTag<T, kNumLanes>. The actual number of lanes
// (always a power of two) is Lanes(D()).
template <typename Lane, size_t N>
struct Simd {
  constexpr Simd() = default;
  using T = Lane;
  static_assert((N & (N - 1)) == 0 && N != 0, "N must be a power of two");

  // Only for use by MaxLanes, required by MSVC. Cannot be enum because GCC
  // warns when using enums and non-enums in the same expression. Cannot be
  // static constexpr function (another MSVC limitation).
  static constexpr size_t kPrivateN = N;
  static constexpr int kPrivatePow2 = kPow2;

  template <typename NewT>
  static constexpr size_t NewN() {
    // Round up to correctly handle scalars with N=1.
    return (N * sizeof(T) + sizeof(NewT) - 1) / sizeof(NewT);
  }

#if HWY_HAVE_SCALABLE
  template <typename NewT>
  static constexpr int Pow2Ratio() {
    return (sizeof(NewT) > sizeof(T))
               ? static_cast<int>(CeilLog2(sizeof(NewT) / sizeof(T)))
               : -static_cast<int>(CeilLog2(sizeof(T) / sizeof(NewT)));
  }
#endif

  // Widening/narrowing ops change the number of lanes and/or their type.
  // To initialize such vectors, we need the corresponding tag types:

  // PromoteTo/DemoteTo() with another lane type, but same number of lanes.
#if HWY_HAVE_SCALABLE
  template <typename NewT>
  using Rebind = Simd<NewT, N, kPow2 + Pow2Ratio<NewT>()>;
#else
  template <typename NewT>
  using Rebind = Simd<NewT, N, kPow2>;
#endif
  // PromoteTo/DemoteTo() with another lane type, but same number of lanes.
  template <typename NewLane>
  using Rebind = Simd<NewLane, N>;

  // Change lane type while keeping the same vector size, e.g. for MulEven.
  template <typename NewT>
  using Repartition = Simd<NewT, NewN<NewT>(), kPow2>;
  // MulEven() with another lane type, but same total size.
  // Round up to correctly handle scalars with N=1.
  template <typename NewLane>
  using Repartition =
      Simd<NewLane, (N * sizeof(Lane) + sizeof(NewLane) - 1) / sizeof(NewLane)>;

  // Half the lanes while keeping the same lane type, e.g. for LowerHalf.
  // Round up to correctly handle scalars with N=1.
#if HWY_HAVE_SCALABLE
  // Reducing the cap (N) is required for SVE - if N is the limiter for f32xN,
  // then we expect Half<Rebind<u16>> to have N/2 lanes (rounded up).
  using Half = Simd<T, (N + 1) / 2, kPow2 - 1>;
#else
  using Half = Simd<T, (N + 1) / 2, kPow2>;
#endif
  // LowerHalf() with the same lane type, but half the lanes.
  // Round up to correctly handle scalars with N=1.
  using Half = Simd<T, (N + 1) / 2>;

  // Twice the lanes while keeping the same lane type, e.g. for Combine.
#if HWY_HAVE_SCALABLE
  using Twice = Simd<T, 2 * N, kPow2 + 1>;
#else
  using Twice = Simd<T, 2 * N, kPow2>;
#endif
  // Combine() with the same lane type, but twice the lanes.
  using Twice = Simd<T, 2 * N>;
};
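Editor's sketch, not part of the diff: the tag aliases built on this struct, in the post-backout two-argument form; in, out and count are assumed to exist.

ScalableTag<float> d;    // full vector for the compiled target
CappedTag<float, 8> d8;  // at most 8 lanes
FixedTag<float, 4> d4;   // exactly 4 lanes (HWY_SCALAR supports only 1)
for (size_t i = 0; i + Lanes(d) <= count; i += Lanes(d)) {
  const auto v = LoadU(d, in + i);
  StoreU(Add(v, v), d, out + i);
}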

namespace detail {

#if HWY_HAVE_SCALABLE

template <typename T, size_t N, int kPow2>
constexpr bool IsFull(Simd<T, N, kPow2> /* d */) {
  return N == HWY_LANES(T) && kPow2 == 0;
}

#endif

// Returns the number of lanes (possibly zero) after applying a shift:
// - 0: no change;
// - [1,3]: a group of 2,4,8 [fractional] vectors;
// - [-3,-1]: a fraction of a vector from 1/8 to 1/2.
// Given N from HWY_LANES(T), returns N for use in Simd<T, N> to describe:
// - a full vector (pow2 = 0);
// - 2,4,8 regs on RVV, otherwise a full vector (pow2 [1,3]);
// - a fraction of a register from 1/8 to 1/2 (pow2 [-3,-1]).
constexpr size_t ScaleByPower(size_t N, int pow2) {
  return pow2 >= 0 ? (N << pow2) : (N >> (-pow2));
#if HWY_TARGET == HWY_RVV
  // For fractions, if N == 1 ensure we still return at least one lane.
  return pow2 >= 0 ? (N << pow2) : HWY_MAX(1, (N >> (-pow2)));
#else
  // If pow2 > 0, replace it with 0 (there is nothing wider than a full vector).
  return HWY_MAX(1, N >> HWY_MAX(-pow2, 0));
#endif
}
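Editor's gloss, not part of the diff: ScaleByPower in numbers, using the post-backout non-RVV branch:

ScaleByPower(16, 0)  == 16   (full vector)
ScaleByPower(16, 2)  == 16   (pow2 > 0 clamps to a full vector)
ScaleByPower(16, -3) == 2    (1/8 of a vector)
ScaleByPower(1, -1)  == 1    (HWY_MAX keeps at least one lane)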

// Struct wrappers enable validation of arguments via static_assert.
template <typename T, int kPow2>
struct ScalableTagChecker {
  static_assert(-3 <= kPow2 && kPow2 <= 3, "Fraction must be 1/8 to 8");
#if HWY_TARGET == HWY_RVV
  // Only RVV supports register groups.
  using type = Simd<T, HWY_LANES(T), kPow2>;
#elif HWY_HAVE_SCALABLE
  // For SVE[2], only allow full or fractions.
  using type = Simd<T, HWY_LANES(T), HWY_MIN(kPow2, 0)>;
#elif HWY_TARGET == HWY_SCALAR
  using type = Simd<T, /*N=*/1, 0>;
#else
  // Only allow full or fractions.
  using type = Simd<T, ScaleByPower(HWY_LANES(T), HWY_MIN(kPow2, 0)), 0>;
#endif
  using type = Simd<T, ScaleByPower(HWY_LANES(T), kPow2)>;
};

template <typename T, size_t kLimit>
struct CappedTagChecker {
  static_assert(kLimit != 0, "Does not make sense to have zero lanes");
  using type = Simd<T, HWY_MIN(kLimit, HWY_MAX_BYTES / sizeof(T)), 0>;
  using type = Simd<T, HWY_MIN(kLimit, HWY_LANES(T))>;
};

template <typename T, size_t kNumLanes>

@ -147,7 +95,7 @@ struct FixedTagChecker {
  // HWY_MAX_BYTES would still allow uint8x8, which is not supported.
  static_assert(kNumLanes == 1, "Scalar only supports one lane");
#endif
  using type = Simd<T, kNumLanes, 0>;
  using type = Simd<T, kNumLanes>;
};

}  // namespace detail
@ -166,14 +114,15 @@ using ScalableTag = typename detail::ScalableTagChecker<T, kPow2>::type;
// typically used for 1D loops with a relatively low application-defined upper
// bound, e.g. for 8x8 DCTs. However, it is better if data structures are
// designed to be vector-length-agnostic (e.g. a hybrid SoA where there are
// chunks of `M >= MaxLanes(d)` DC components followed by M AC1, .., and M AC63;
// chunks of say 256 DC components followed by 256 AC1 and finally 256 AC63;
// this would enable vector-length-agnostic loops using ScalableTag).
template <typename T, size_t kLimit>
using CappedTag = typename detail::CappedTagChecker<T, kLimit>::type;

// Alias for a tag describing a vector with *exactly* kNumLanes active lanes,
// even on targets with scalable vectors. HWY_SCALAR only supports one lane.
// All other targets allow kNumLanes up to HWY_MAX_BYTES / sizeof(T).
// even on targets with scalable vectors. All targets except HWY_SCALAR support
// up to 16 / sizeof(T). Other targets may allow larger kNumLanes, but relying
// on that is non-portable and discouraged.
//
// NOTE: if the application does not need to support HWY_SCALAR (+), use this
// instead of CappedTag to emphasize that there will be exactly kNumLanes lanes.

@ -214,11 +163,11 @@ using RepartitionToNarrow = Repartition<MakeNarrow<TFromD<D>>, D>;
template <class D>
using Half = typename D::Half;

// Tag for the same lane type as D, but twice the lanes.
// Descriptor for the same lane type as D, but twice the lanes.
template <class D>
using Twice = typename D::Twice;

// Same as base.h macros but with a Simd<T, N, kPow2> argument instead of T.
// Same as base.h macros but with a Simd<T, N> argument instead of T.
#define HWY_IF_UNSIGNED_D(D) HWY_IF_UNSIGNED(TFromD<D>)
#define HWY_IF_SIGNED_D(D) HWY_IF_SIGNED(TFromD<D>)
#define HWY_IF_FLOAT_D(D) HWY_IF_FLOAT(TFromD<D>)

@ -226,12 +175,6 @@ using Twice = typename D::Twice;
#define HWY_IF_LANE_SIZE_D(D, bytes) HWY_IF_LANE_SIZE(TFromD<D>, bytes)
#define HWY_IF_NOT_LANE_SIZE_D(D, bytes) HWY_IF_NOT_LANE_SIZE(TFromD<D>, bytes)

// MSVC workaround: use PrivateN directly instead of MaxLanes.
#define HWY_IF_LT128_D(D) \
  hwy::EnableIf<D::kPrivateN * sizeof(TFromD<D>) < 16>* = nullptr
#define HWY_IF_GE128_D(D) \
  hwy::EnableIf<D::kPrivateN * sizeof(TFromD<D>) >= 16>* = nullptr

// Same, but with a vector argument.
#define HWY_IF_UNSIGNED_V(V) HWY_IF_UNSIGNED(TFromV<V>)
#define HWY_IF_SIGNED_V(V) HWY_IF_SIGNED(TFromV<V>)

@ -240,59 +183,42 @@ using Twice = typename D::Twice;

// For implementing functions for a specific type.
// IsSame<...>() in template arguments is broken on MSVC2015.
#define HWY_IF_LANES_ARE(T, V) EnableIf<IsSameT<T, TFromV<V>>::value>* = nullptr
#define HWY_IF_LANES_ARE(T, V) \
  EnableIf<IsSameT<T, TFromD<DFromV<V>>>::value>* = nullptr

template <class D>
HWY_INLINE HWY_MAYBE_UNUSED constexpr int Pow2(D /* d */) {
  return D::kPrivatePow2;
}

// MSVC requires the explicit <D>.
#define HWY_IF_POW2_GE(D, MIN) hwy::EnableIf<Pow2<D>(D()) >= (MIN)>* = nullptr

#if HWY_HAVE_SCALABLE

// Upper bound on the number of lanes. Intended for template arguments and
// reducing code size (e.g. for SSE4, we know at compile-time that vectors will
// not exceed 16 bytes). WARNING: this may be a loose bound, use Lanes() as the
// actual size for allocating storage. WARNING: MSVC might not be able to deduce
// arguments if this is used in EnableIf. See HWY_IF_LT128_D above.
template <class D>
HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D) {
  return detail::ScaleByPower(HWY_MIN(D::kPrivateN, HWY_LANES(TFromD<D>)),
                              D::kPrivatePow2);
}

#else
// Workaround for MSVC 2017: T,N,kPow2 argument deduction fails, so returning N
// is not an option, nor does a member function work.
template <class D>
HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D) {
  return D::kPrivateN;
}

// (Potentially) non-constant actual size of the vector at runtime, subject to
// the limit imposed by the Simd. Useful for advancing loop counters.
// Targets with scalable vectors define this themselves.
template <typename T, size_t N, int kPow2>
HWY_INLINE HWY_MAYBE_UNUSED size_t Lanes(Simd<T, N, kPow2>) {
// Compile-time-constant, (typically but not guaranteed) an upper bound on the
// number of lanes.
// Prefer instead using Lanes() and dynamic allocation, or Rebind, or
// `#if HWY_CAP_GE*`.
template <typename T, size_t N>
HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(Simd<T, N>) {
  return N;
}

#endif  // !HWY_HAVE_SCALABLE
// Targets with non-constexpr Lanes define this themselves.
#if HWY_TARGET != HWY_RVV && HWY_TARGET != HWY_SVE2 && HWY_TARGET != HWY_SVE

// (Potentially) non-constant actual size of the vector at runtime, subject to
// the limit imposed by the Simd. Useful for advancing loop counters.
template <typename T, size_t N>
HWY_INLINE HWY_MAYBE_UNUSED size_t Lanes(Simd<T, N>) {
  return N;
}

#endif
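Editor's sketch, not part of the diff: the intended division of labor is that MaxLanes bounds compile-time sizes while Lanes advances runtime loops.

constexpr CappedTag<uint8_t, 16> d;
static_assert(MaxLanes(d) <= 16, "compile-time bound");
HWY_ALIGN uint8_t lanes[MaxLanes(d)];  // array extent from the constexpr bound
Store(Set(d, 1), d, lanes);            // writes Lanes(d) bytes at runtime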

// NOTE: GCC generates incorrect code for vector arguments to non-inlined
// functions in two situations:
// - on Windows and GCC 10.3, passing by value crashes due to unaligned loads:
//   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412.
// - on ARM64 and GCC 9.3.0 or 11.2.1, passing by value causes many (but not
// - on ARM64 and GCC 9.3.0 or 11.2.1, passing by const& causes many (but not
//   all) tests to fail.
//
// We therefore pass by const& only on GCC and (Windows or ARM64). This alias
// must be used for all vector/mask parameters of functions marked HWY_NOINLINE,
// and possibly also other functions that are not inlined.
#if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG && \
    ((defined(_WIN32) || defined(_WIN64)) || HWY_ARCH_ARM_A64)
    ((defined(_WIN32) || defined(_WIN64)) || HWY_ARCH_ARM64)
template <class V>
using VecArg = const V&;
#else
Diff not shown because of its size.
Diff not shown because of its size.
Diff not shown because of its size.
@ -44,6 +44,10 @@
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

template <typename T>
using Full256 = Simd<T, 32 / sizeof(T)>;

namespace detail {

template <typename T>

@ -322,38 +326,6 @@ HWY_API Vec256<T> Not(const Vec256<T> v) {
#endif
}

// ------------------------------ OrAnd

template <typename T>
HWY_API Vec256<T> OrAnd(Vec256<T> o, Vec256<T> a1, Vec256<T> a2) {
#if HWY_TARGET <= HWY_AVX3
  const Full256<T> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  const __m256i ret = _mm256_ternarylogic_epi64(
      BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8);
  return BitCast(d, VU{ret});
#else
  return Or(o, And(a1, a2));
#endif
}
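Editor's gloss, not part of the diff: the ternlog immediate is the truth table of the desired function, indexed by (o << 2) | (a1 << 1) | a2. For o | (a1 & a2), the output is 1 at indices 4 through 7 (o set) plus index 3 (a1 and a2 both set), i.e. bits 11111000 = 0xF8; the 0xCA used below likewise encodes mask ? yes : no.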

// ------------------------------ IfVecThenElse

template <typename T>
HWY_API Vec256<T> IfVecThenElse(Vec256<T> mask, Vec256<T> yes, Vec256<T> no) {
#if HWY_TARGET <= HWY_AVX3
  const Full256<T> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  return BitCast(d, VU{_mm256_ternarylogic_epi64(BitCast(du, mask).raw,
                                                 BitCast(du, yes).raw,
                                                 BitCast(du, no).raw, 0xCA)});
#else
  return IfThenElse(MaskFromVec(mask), yes, no);
#endif
}

// ------------------------------ Operator overloads (internal-only if float)

template <typename T>

@ -813,7 +785,6 @@ HWY_API Vec256<T> IfThenZeroElse(Mask256<T> mask, Vec256<T> no) {
template <typename T, HWY_IF_FLOAT(T)>
HWY_API Vec256<T> ZeroIfNegative(Vec256<T> v) {
  const auto zero = Zero(Full256<T>());
  // AVX2 IfThenElse only looks at the MSB for 32/64-bit lanes
  return IfThenElse(MaskFromVec(v), zero, v);
}

@ -1424,12 +1395,7 @@ HWY_API Vec256<double> operator-(const Vec256<double> a,
  return Vec256<double>{_mm256_sub_pd(a.raw, b.raw)};
}

// ------------------------------ SumsOf8
HWY_API Vec256<uint64_t> SumsOf8(const Vec256<uint8_t> v) {
  return Vec256<uint64_t>{_mm256_sad_epu8(v.raw, _mm256_setzero_si256())};
}
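Editor's gloss, not part of the diff: against a zero vector, _mm256_sad_epu8 computes the sum of absolute differences |x[i] - 0| over each group of eight bytes, leaving one u64 total per group; that is exactly the SumsOf8 contract.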

// ------------------------------ SaturatedAdd
// ------------------------------ Saturating addition

// Returns a + b clamped to the destination range.

@ -1453,7 +1419,7 @@ HWY_API Vec256<int16_t> SaturatedAdd(const Vec256<int16_t> a,
  return Vec256<int16_t>{_mm256_adds_epi16(a.raw, b.raw)};
}

// ------------------------------ SaturatedSub
// ------------------------------ Saturating subtraction

// Returns a - b clamped to the destination range.

@ -1719,35 +1685,6 @@ HWY_API Vec256<int64_t> Abs(const Vec256<int64_t> v) {
#endif
}

// ------------------------------ IfNegativeThenElse (BroadcastSignBit)
HWY_API Vec256<int8_t> IfNegativeThenElse(Vec256<int8_t> v, Vec256<int8_t> yes,
                                          Vec256<int8_t> no) {
  // int8: AVX2 IfThenElse only looks at the MSB.
  return IfThenElse(MaskFromVec(v), yes, no);
}

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec256<T> IfNegativeThenElse(Vec256<T> v, Vec256<T> yes, Vec256<T> no) {
  static_assert(IsSigned<T>(), "Only works for signed/float");
  const Full256<T> d;
  const RebindToSigned<decltype(d)> di;

  // 16-bit: no native blendv, so copy sign to lower byte's MSB.
  v = BitCast(d, BroadcastSignBit(BitCast(di, v)));
  return IfThenElse(MaskFromVec(v), yes, no);
}

template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
HWY_API Vec256<T> IfNegativeThenElse(Vec256<T> v, Vec256<T> yes, Vec256<T> no) {
  static_assert(IsSigned<T>(), "Only works for signed/float");
  const Full256<T> d;
  const RebindToFloat<decltype(d)> df;

  // 32/64-bit: use float IfThenElse, which only looks at the MSB.
  const MFromD<decltype(df)> msb = MaskFromVec(BitCast(df, v));
  return BitCast(d, IfThenElse(msb, BitCast(df, yes), BitCast(df, no)));
}

// ------------------------------ ShiftLeftSame

HWY_API Vec256<uint16_t> ShiftLeftSame(const Vec256<uint16_t> v,
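Editor's gloss, not part of the diff: vpblendvb selects per byte using each byte's MSB. For 16-bit lanes the sign lives only in the upper byte, so BroadcastSignBit (an arithmetic shift right by 15) first smears it across the whole lane, making both bytes of a lane agree before the blend.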
@ -2297,7 +2234,7 @@ HWY_API void ScatterOffset(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
  Store(v, d, lanes);

  alignas(32) Offset offset_lanes[N];
  Store(offset, Full256<Offset>(), offset_lanes);
  Store(offset, Simd<Offset, N>(), offset_lanes);

  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
  for (size_t i = 0; i < N; ++i) {

@ -2315,7 +2252,7 @@ HWY_API void ScatterIndex(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
  Store(v, d, lanes);

  alignas(32) Index index_lanes[N];
  Store(index, Full256<Index>(), index_lanes);
  Store(index, Simd<Index, N>(), index_lanes);

  for (size_t i = 0; i < N; ++i) {
    base[index_lanes[i]] = lanes[i];

@ -2536,7 +2473,7 @@ HWY_API Vec256<T> ShiftRightBytes(Full256<T> /* tag */, const Vec256<T> v) {
template <int kLanes, typename T>
HWY_API Vec256<T> ShiftRightLanes(Full256<T> d, const Vec256<T> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
  return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
}

// ------------------------------ CombineShiftRightBytes

@ -2796,81 +2733,6 @@ HWY_API Vec256<T> Reverse(Full256<T> d, const Vec256<T> v) {
#endif
}

// ------------------------------ Reverse2

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec256<T> Reverse2(Full256<T> d, const Vec256<T> v) {
  const Full256<uint32_t> du32;
  return BitCast(d, RotateRight<16>(BitCast(du32, v)));
}

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec256<T> Reverse2(Full256<T> /* tag */, const Vec256<T> v) {
  return Shuffle2301(v);
}

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec256<T> Reverse2(Full256<T> /* tag */, const Vec256<T> v) {
  return Shuffle01(v);
}

// ------------------------------ Reverse4

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec256<T> Reverse4(Full256<T> d, const Vec256<T> v) {
#if HWY_TARGET <= HWY_AVX3
  const RebindToSigned<decltype(d)> di;
  alignas(32) constexpr int16_t kReverse4[16] = {3,  2,  1, 0, 7,  6,  5,  4,
                                                 11, 10, 9, 8, 15, 14, 13, 12};
  const Vec256<int16_t> idx = Load(di, kReverse4);
  return BitCast(d, Vec256<int16_t>{
                        _mm256_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
#else
  const RepartitionToWide<decltype(d)> dw;
  return Reverse2(d, BitCast(d, Shuffle2301(BitCast(dw, v))));
#endif
}

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec256<T> Reverse4(Full256<T> /* tag */, const Vec256<T> v) {
  return Shuffle0123(v);
}

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec256<T> Reverse4(Full256<T> /* tag */, const Vec256<T> v) {
  return Vec256<T>{_mm256_permute4x64_epi64(v.raw, _MM_SHUFFLE(0, 1, 2, 3))};
}
HWY_API Vec256<double> Reverse4(Full256<double> /* tag */, Vec256<double> v) {
  return Vec256<double>{_mm256_permute4x64_pd(v.raw, _MM_SHUFFLE(0, 1, 2, 3))};
}

// ------------------------------ Reverse8

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec256<T> Reverse8(Full256<T> d, const Vec256<T> v) {
#if HWY_TARGET <= HWY_AVX3
  const RebindToSigned<decltype(d)> di;
  alignas(32) constexpr int16_t kReverse8[16] = {7,  6,  5,  4,  3,  2,  1, 0,
                                                 15, 14, 13, 12, 11, 10, 9, 8};
  const Vec256<int16_t> idx = Load(di, kReverse8);
  return BitCast(d, Vec256<int16_t>{
                        _mm256_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
#else
  const RepartitionToWide<decltype(d)> dw;
  return Reverse2(d, BitCast(d, Shuffle0123(BitCast(dw, v))));
#endif
}

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec256<T> Reverse8(Full256<T> d, const Vec256<T> v) {
  return Reverse(d, v);
}

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec256<T> Reverse8(Full256<T> /* tag */, const Vec256<T> /* v */) {
  HWY_ASSERT(0);  // AVX2 does not have 8 64-bit lanes
}

// ------------------------------ InterleaveLower

// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
@ -2920,6 +2782,12 @@ HWY_API Vec256<double> InterleaveLower(const Vec256<double> a,
  return Vec256<double>{_mm256_unpacklo_pd(a.raw, b.raw)};
}

// Additional overload for the optional Simd<> tag.
template <typename T, class V = Vec256<T>>
HWY_API V InterleaveLower(Full256<T> /* tag */, V a, V b) {
  return InterleaveLower(a, b);
}

// ------------------------------ InterleaveUpper

// All functions inside detail lack the required D parameter.

@ -2981,11 +2849,11 @@ HWY_API V InterleaveUpper(Full256<T> /* tag */, V a, V b) {
// this is necessary because the single-lane scalar cannot return two values.
template <typename T, typename TW = MakeWide<T>>
HWY_API Vec256<TW> ZipLower(Vec256<T> a, Vec256<T> b) {
  return BitCast(Full256<TW>(), InterleaveLower(a, b));
  return BitCast(Full256<TW>(), InterleaveLower(Full256<T>(), a, b));
}
template <typename T, typename TW = MakeWide<T>>
HWY_API Vec256<TW> ZipLower(Full256<TW> dw, Vec256<T> a, Vec256<T> b) {
  return BitCast(dw, InterleaveLower(a, b));
  return BitCast(dw, InterleaveLower(Full256<T>(), a, b));
}

template <typename T, typename TW = MakeWide<T>>

@ -3195,38 +3063,6 @@ HWY_API Vec256<double> ConcatEven(Full256<double> d, Vec256<double> hi,
#endif
}

// ------------------------------ DupEven (InterleaveLower)

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec256<T> DupEven(Vec256<T> v) {
  return Vec256<T>{_mm256_shuffle_epi32(v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
}
HWY_API Vec256<float> DupEven(Vec256<float> v) {
  return Vec256<float>{
      _mm256_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
}

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec256<T> DupEven(const Vec256<T> v) {
  return InterleaveLower(Full256<T>(), v, v);
}

// ------------------------------ DupOdd (InterleaveUpper)

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec256<T> DupOdd(Vec256<T> v) {
  return Vec256<T>{_mm256_shuffle_epi32(v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
}
HWY_API Vec256<float> DupOdd(Vec256<float> v) {
  return Vec256<float>{
      _mm256_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
}

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec256<T> DupOdd(const Vec256<T> v) {
  return InterleaveUpper(Full256<T>(), v, v);
}

// ------------------------------ OddEven

namespace detail {
@ -3304,13 +3140,6 @@ HWY_API Vec256<double> SwapAdjacentBlocks(Vec256<double> v) {
  return Vec256<double>{_mm256_permute4x64_pd(v.raw, _MM_SHUFFLE(1, 0, 3, 2))};
}

// ------------------------------ ReverseBlocks (ConcatLowerUpper)

template <typename T>
HWY_API Vec256<T> ReverseBlocks(Full256<T> d, Vec256<T> v) {
  return ConcatLowerUpper(d, v, v);
}

// ------------------------------ TableLookupBytes (ZeroExtendVector)

// Both full

@ -3607,7 +3436,7 @@ HWY_API Vec128<int16_t> DemoteTo(Full128<int16_t> /* tag */,
      _mm256_castsi256_si128(_mm256_permute4x64_epi64(i16, 0x88))};
}

HWY_API Vec128<uint8_t, 8> DemoteTo(Full64<uint8_t> /* tag */,
HWY_API Vec128<uint8_t, 8> DemoteTo(Simd<uint8_t, 8> /* tag */,
                                    const Vec256<int32_t> v) {
  const __m256i u16_blocks = _mm256_packus_epi32(v.raw, v.raw);
  // Concatenate lower 64 bits of each 128-bit block

@ -3626,7 +3455,7 @@ HWY_API Vec128<uint8_t> DemoteTo(Full128<uint8_t> /* tag */,
      _mm256_castsi256_si128(_mm256_permute4x64_epi64(u8, 0x88))};
}

HWY_API Vec128<int8_t, 8> DemoteTo(Full64<int8_t> /* tag */,
HWY_API Vec128<int8_t, 8> DemoteTo(Simd<int8_t, 8> /* tag */,
                                   const Vec256<int32_t> v) {
  const __m256i i16_blocks = _mm256_packs_epi32(v.raw, v.raw);
  // Concatenate lower 64 bits of each 128-bit block

@ -3724,7 +3553,7 @@ HWY_API Vec128<uint8_t, 8> U8FromU32(const Vec256<uint32_t> v) {
  const auto lo = LowerHalf(quad);
  const auto hi = UpperHalf(Full128<uint32_t>(), quad);
  const auto pair = LowerHalf(lo | hi);
  return BitCast(Full64<uint8_t>(), pair);
  return BitCast(Simd<uint8_t, 8>(), pair);
}

// ------------------------------ Integer <=> fp (ShiftRight, OddEven)

@ -3862,19 +3691,6 @@ HWY_API Vec256<uint8_t> AESRound(Vec256<uint8_t> state,
#endif
}

HWY_API Vec256<uint8_t> AESLastRound(Vec256<uint8_t> state,
                                     Vec256<uint8_t> round_key) {
#if HWY_TARGET == HWY_AVX3_DL
  return Vec256<uint8_t>{_mm256_aesenclast_epi128(state.raw, round_key.raw)};
#else
  const Full256<uint8_t> d;
  const Half<decltype(d)> d2;
  return Combine(d,
                 AESLastRound(UpperHalf(d2, state), UpperHalf(d2, round_key)),
                 AESLastRound(LowerHalf(state), LowerHalf(round_key)));
#endif
}

HWY_API Vec256<uint64_t> CLMulLower(Vec256<uint64_t> a, Vec256<uint64_t> b) {
#if HWY_TARGET == HWY_AVX3_DL
  return Vec256<uint64_t>{_mm256_clmulepi64_epi128(a.raw, b.raw, 0x00)};

@ -4203,7 +4019,7 @@ HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
#if HWY_TARGET <= HWY_AVX3_DL
  return CompressStore(v, m, d, unaligned);  // also native
#else
  const size_t count = CountTrue(d, m);
  const size_t count = CountTrue(m);
  const Vec256<T> compressed = Compress(v, m);
  // There is no 16-bit MaskedStore, so blend.
  const Vec256<T> prev = LoadU(d, unaligned);

@ -4428,7 +4244,7 @@ HWY_API intptr_t FindFirstTrue(const Full256<T> /* tag */,
namespace detail {

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
HWY_INLINE Indices256<uint32_t> IndicesFromBits(Simd<T, 8> d,
                                                uint64_t mask_bits) {
  const RebindToUnsigned<decltype(d)> d32;
  // We need a masked Iota(). With 8 lanes, there are 256 combinations and a LUT

@ -4491,7 +4307,7 @@ HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
}

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
HWY_INLINE Indices256<uint32_t> IndicesFromBits(Simd<T, 4> d,
                                                uint64_t mask_bits) {
  const Repartition<uint32_t, decltype(d)> d32;

@ -4537,8 +4353,8 @@ HWY_INLINE Vec256<T> Compress(Vec256<T> v, const uint64_t mask_bits) {
  const auto compressed1 = Compress(promoted1, mask_bits1);

  const Half<decltype(du)> dh;
  const auto demoted0 = ZeroExtendVector(du, DemoteTo(dh, compressed0));
  const auto demoted1 = ZeroExtendVector(du, DemoteTo(dh, compressed1));
  const auto demoted0 = ZeroExtendVector(DemoteTo(dh, compressed0));
  const auto demoted1 = ZeroExtendVector(DemoteTo(dh, compressed1));

  const size_t count0 = PopCount(mask_bits0);
  // Now combine by shifting demoted1 up. AVX2 lacks VPERMW, so start with

@ -4809,6 +4625,101 @@ HWY_API Vec256<T> MaxOfLanes(Full256<T> d, const Vec256<T> vHL) {
  return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), Max(vLH, vHL));
}

// ================================================== DEPRECATED

template <typename T>
HWY_API size_t StoreMaskBits(const Mask256<T> mask, uint8_t* bits) {
  return StoreMaskBits(Full256<T>(), mask, bits);
}

template <typename T>
HWY_API bool AllTrue(const Mask256<T> mask) {
  return AllTrue(Full256<T>(), mask);
}

template <typename T>
HWY_API bool AllFalse(const Mask256<T> mask) {
  return AllFalse(Full256<T>(), mask);
}

template <typename T>
HWY_API size_t CountTrue(const Mask256<T> mask) {
  return CountTrue(Full256<T>(), mask);
}

template <typename T>
HWY_API Vec256<T> SumOfLanes(const Vec256<T> vHL) {
  return SumOfLanes(Full256<T>(), vHL);
}
template <typename T>
HWY_API Vec256<T> MinOfLanes(const Vec256<T> vHL) {
  return MinOfLanes(Full256<T>(), vHL);
}
template <typename T>
HWY_API Vec256<T> MaxOfLanes(const Vec256<T> vHL) {
  return MaxOfLanes(Full256<T>(), vHL);
}

template <typename T>
HWY_API Vec128<T> UpperHalf(Vec256<T> v) {
  return UpperHalf(Full128<T>(), v);
}

template <int kBytes, typename T>
HWY_API Vec256<T> ShiftRightBytes(const Vec256<T> v) {
  return ShiftRightBytes<kBytes>(Full256<T>(), v);
}

template <int kLanes, typename T>
HWY_API Vec256<T> ShiftRightLanes(const Vec256<T> v) {
  return ShiftRightLanes<kLanes>(Full256<T>(), v);
}

template <size_t kBytes, typename T>
HWY_API Vec256<T> CombineShiftRightBytes(Vec256<T> hi, Vec256<T> lo) {
  return CombineShiftRightBytes<kBytes>(Full256<T>(), hi, lo);
}

template <typename T>
HWY_API Vec256<T> InterleaveUpper(Vec256<T> a, Vec256<T> b) {
  return InterleaveUpper(Full256<T>(), a, b);
}

template <typename T>
HWY_API Vec256<MakeWide<T>> ZipUpper(Vec256<T> a, Vec256<T> b) {
  return InterleaveUpper(Full256<MakeWide<T>>(), a, b);
}

template <typename T>
HWY_API Vec256<T> Combine(Vec128<T> hi, Vec128<T> lo) {
  return Combine(Full256<T>(), hi, lo);
}

template <typename T>
HWY_API Vec256<T> ZeroExtendVector(Vec128<T> lo) {
  return ZeroExtendVector(Full256<T>(), lo);
}

template <typename T>
HWY_API Vec256<T> ConcatLowerLower(Vec256<T> hi, Vec256<T> lo) {
  return ConcatLowerLower(Full256<T>(), hi, lo);
}

template <typename T>
HWY_API Vec256<T> ConcatLowerUpper(Vec256<T> hi, Vec256<T> lo) {
  return ConcatLowerUpper(Full256<T>(), hi, lo);
}

template <typename T>
HWY_API Vec256<T> ConcatUpperLower(Vec256<T> hi, Vec256<T> lo) {
  return ConcatUpperLower(Full256<T>(), hi, lo);
}

template <typename T>
HWY_API Vec256<T> ConcatUpperUpper(Vec256<T> hi, Vec256<T> lo) {
  return ConcatUpperUpper(Full256<T>(), hi, lo);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
@ -57,6 +57,9 @@ HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

template <typename T>
using Full512 = Simd<T, 64 / sizeof(T)>;

namespace detail {

template <typename T>

@ -310,30 +313,6 @@ HWY_API Vec512<double> Xor(const Vec512<double> a, const Vec512<double> b) {
  return Vec512<double>{_mm512_xor_pd(a.raw, b.raw)};
}

// ------------------------------ OrAnd

template <typename T>
HWY_API Vec512<T> OrAnd(Vec512<T> o, Vec512<T> a1, Vec512<T> a2) {
  const Full512<T> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  const __m512i ret = _mm512_ternarylogic_epi64(
      BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8);
  return BitCast(d, VU{ret});
}

// ------------------------------ IfVecThenElse

template <typename T>
HWY_API Vec512<T> IfVecThenElse(Vec512<T> mask, Vec512<T> yes, Vec512<T> no) {
  const Full512<T> d;
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  return BitCast(d, VU{_mm512_ternarylogic_epi64(BitCast(du, mask).raw,
                                                 BitCast(du, yes).raw,
                                                 BitCast(du, no).raw, 0xCA)});
}

// ------------------------------ Operator overloads (internal-only if float)

template <typename T>

@ -600,13 +579,6 @@ HWY_API Vec512<double> IfThenZeroElse(const Mask512<double> mask,
  return Vec512<double>{_mm512_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)};
}

template <typename T>
HWY_API Vec512<T> IfNegativeThenElse(Vec512<T> v, Vec512<T> yes, Vec512<T> no) {
  static_assert(IsSigned<T>(), "Only works for signed/float");
  // AVX3 MaskFromVec only looks at the MSB
  return IfThenElse(MaskFromVec(v), yes, no);
}

template <typename T, HWY_IF_FLOAT(T)>
HWY_API Vec512<T> ZeroIfNegative(const Vec512<T> v) {
  // AVX3 MaskFromVec only looks at the MSB

@ -709,12 +681,7 @@ HWY_API Vec512<double> operator-(const Vec512<double> a,
  return Vec512<double>{_mm512_sub_pd(a.raw, b.raw)};
}

// ------------------------------ SumsOf8
HWY_API Vec512<uint64_t> SumsOf8(const Vec512<uint8_t> v) {
  return Vec512<uint64_t>{_mm512_sad_epu8(v.raw, _mm512_setzero_si512())};
}

// ------------------------------ SaturatedAdd
// ------------------------------ Saturating addition

// Returns a + b clamped to the destination range.

@ -738,7 +705,7 @@ HWY_API Vec512<int16_t> SaturatedAdd(const Vec512<int16_t> a,
  return Vec512<int16_t>{_mm512_adds_epi16(a.raw, b.raw)};
}

// ------------------------------ SaturatedSub
// ------------------------------ Saturating subtraction

// Returns a - b clamped to the destination range.

@ -1853,7 +1820,7 @@ HWY_API Vec512<T> LoadDup128(Full512<T> /* tag */,
// https://gcc.godbolt.org/z/-Jt_-F
#if HWY_LOADDUP_ASM
  __m512i out;
  asm("vbroadcasti128 %1, %[reg]" : [reg] "=x"(out) : "m"(p[0]));
  asm("vbroadcasti128 %1, %[reg]" : [ reg ] "=x"(out) : "m"(p[0]));
  return Vec512<T>{out};
#else
  const auto x4 = LoadU(Full128<T>(), p);

@ -1864,7 +1831,7 @@ HWY_API Vec512<float> LoadDup128(Full512<float> /* tag */,
                                 const float* const HWY_RESTRICT p) {
#if HWY_LOADDUP_ASM
  __m512 out;
  asm("vbroadcastf128 %1, %[reg]" : [reg] "=x"(out) : "m"(p[0]));
  asm("vbroadcastf128 %1, %[reg]" : [ reg ] "=x"(out) : "m"(p[0]));
  return Vec512<float>{out};
#else
  const __m128 x4 = _mm_loadu_ps(p);

@ -1876,7 +1843,7 @@ HWY_API Vec512<double> LoadDup128(Full512<double> /* tag */,
                                  const double* const HWY_RESTRICT p) {
#if HWY_LOADDUP_ASM
  __m512d out;
  asm("vbroadcastf128 %1, %[reg]" : [reg] "=x"(out) : "m"(p[0]));
  asm("vbroadcastf128 %1, %[reg]" : [ reg ] "=x"(out) : "m"(p[0]));
  return Vec512<double>{out};
#else
  const __m128d x2 = _mm_loadu_pd(p);
@ -2040,7 +2007,7 @@ HWY_INLINE Vec512<T> GatherIndex(hwy::SizeTag<8> /* tag */,
template <typename T, typename Offset>
HWY_API Vec512<T> GatherOffset(Full512<T> d, const T* HWY_RESTRICT base,
                               const Vec512<Offset> offset) {
  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
  return detail::GatherOffset(hwy::SizeTag<sizeof(T)>(), d, base, offset);
}
template <typename T, typename Index>

@ -2206,7 +2173,7 @@ HWY_API Vec512<T> ShiftRightBytes(Full512<T> /* tag */, const Vec512<T> v) {
template <int kLanes, typename T>
HWY_API Vec512<T> ShiftRightLanes(Full512<T> d, const Vec512<T> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
  return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
}

// ------------------------------ CombineShiftRightBytes

@ -2429,78 +2396,6 @@ HWY_API Vec512<T> Reverse(Full512<T> d, const Vec512<T> v) {
  return TableLookupLanes(v, SetTableIndices(d, kReverse));
}

// ------------------------------ Reverse2

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec512<T> Reverse2(Full512<T> d, const Vec512<T> v) {
  const Full512<uint32_t> du32;
  return BitCast(d, RotateRight<16>(BitCast(du32, v)));
}

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec512<T> Reverse2(Full512<T> /* tag */, const Vec512<T> v) {
  return Shuffle2301(v);
}

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec512<T> Reverse2(Full512<T> /* tag */, const Vec512<T> v) {
  return Shuffle01(v);
}

// ------------------------------ Reverse4

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec512<T> Reverse4(Full512<T> d, const Vec512<T> v) {
  const RebindToSigned<decltype(d)> di;
  alignas(64) constexpr int16_t kReverse4[32] = {
      3,  2,  1,  0,  7,  6,  5,  4,  11, 10, 9,  8,  15, 14, 13, 12,
      19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28};
  const Vec512<int16_t> idx = Load(di, kReverse4);
  return BitCast(d, Vec512<int16_t>{
                        _mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
}

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec512<T> Reverse4(Full512<T> /* tag */, const Vec512<T> v) {
  return Shuffle0123(v);
}

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec512<T> Reverse4(Full512<T> /* tag */, const Vec512<T> v) {
  return Vec512<T>{_mm512_permutex_epi64(v.raw, _MM_SHUFFLE(0, 1, 2, 3))};
}
HWY_API Vec512<double> Reverse4(Full512<double> /* tag */, Vec512<double> v) {
  return Vec512<double>{_mm512_permutex_pd(v.raw, _MM_SHUFFLE(0, 1, 2, 3))};
}

// ------------------------------ Reverse8

template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec512<T> Reverse8(Full512<T> d, const Vec512<T> v) {
  const RebindToSigned<decltype(d)> di;
  alignas(64) constexpr int16_t kReverse8[32] = {
      7,  6,  5,  4,  3,  2,  1,  0,  15, 14, 13, 12, 11, 10, 9,  8,
      23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24};
  const Vec512<int16_t> idx = Load(di, kReverse8);
  return BitCast(d, Vec512<int16_t>{
                        _mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
}

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec512<T> Reverse8(Full512<T> d, const Vec512<T> v) {
  const RebindToSigned<decltype(d)> di;
  alignas(64) constexpr int32_t kReverse8[16] = {7,  6,  5,  4,  3,  2,  1, 0,
                                                 15, 14, 13, 12, 11, 10, 9, 8};
  const Vec512<int32_t> idx = Load(di, kReverse8);
  return BitCast(d, Vec512<int32_t>{
                        _mm512_permutexvar_epi32(idx.raw, BitCast(di, v).raw)});
}

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec512<T> Reverse8(Full512<T> d, const Vec512<T> v) {
  return Reverse(d, v);
}

// ------------------------------ InterleaveLower

// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides

@ -2550,6 +2445,12 @@ HWY_API Vec512<double> InterleaveLower(const Vec512<double> a,
  return Vec512<double>{_mm512_unpacklo_pd(a.raw, b.raw)};
}

// Additional overload for the optional Simd<> tag.
template <typename T, class V = Vec512<T>>
HWY_API V InterleaveLower(Full512<T> /* tag */, V a, V b) {
  return InterleaveLower(a, b);
}

// ------------------------------ InterleaveUpper

// All functions inside detail lack the required D parameter.

@ -2614,8 +2515,8 @@ HWY_API Vec512<TW> ZipLower(Vec512<T> a, Vec512<T> b) {
  return BitCast(Full512<TW>(), InterleaveLower(a, b));
}
template <typename T, typename TW = MakeWide<T>>
HWY_API Vec512<TW> ZipLower(Full512<TW> /* d */, Vec512<T> a, Vec512<T> b) {
  return BitCast(Full512<TW>(), InterleaveLower(a, b));
HWY_API Vec512<TW> ZipLower(Full512<TW> d, Vec512<T> a, Vec512<T> b) {
  return BitCast(Full512<TW>(), InterleaveLower(d, a, b));
|
||||
}
|
||||
|
||||
template <typename T, typename TW = MakeWide<T>>
|
||||
|
@ -2663,17 +2564,17 @@ HWY_API Vec512<double> ConcatUpperUpper(Full512<double> /* tag */,
|
|||
template <typename T>
|
||||
HWY_API Vec512<T> ConcatLowerUpper(Full512<T> /* tag */, const Vec512<T> hi,
|
||||
const Vec512<T> lo) {
|
||||
return Vec512<T>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_BADC)};
|
||||
return Vec512<T>{_mm512_shuffle_i32x4(lo.raw, hi.raw, 0x4E)};
|
||||
}
|
||||
HWY_API Vec512<float> ConcatLowerUpper(Full512<float> /* tag */,
|
||||
const Vec512<float> hi,
|
||||
const Vec512<float> lo) {
|
||||
return Vec512<float>{_mm512_shuffle_f32x4(lo.raw, hi.raw, _MM_PERM_BADC)};
|
||||
return Vec512<float>{_mm512_shuffle_f32x4(lo.raw, hi.raw, 0x4E)};
|
||||
}
|
||||
HWY_API Vec512<double> ConcatLowerUpper(Full512<double> /* tag */,
|
||||
const Vec512<double> hi,
|
||||
const Vec512<double> lo) {
|
||||
return Vec512<double>{_mm512_shuffle_f64x2(lo.raw, hi.raw, _MM_PERM_BADC)};
|
||||
return Vec512<double>{_mm512_shuffle_f64x2(lo.raw, hi.raw, 0x4E)};
|
||||
}
|
||||
|
||||
// hiH,hiL loH,loL |-> hiH,loL (= outer halves)
|
||||
|
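
Note that the immediates swapped in this hunk are numerically identical: _MM_PERM_BADC, _MM_SHUFFLE(1, 0, 3, 2) and 0x4E all encode the lane order B,A,D,C (indices 1,0,3,2), so the change is purely notational. A minimal standalone sketch of the encoding, assuming only <immintrin.h> (an illustration, not part of the patch):

#include <immintrin.h>

// _MM_SHUFFLE(z, y, x, w) packs four 2-bit selectors:
// (z << 6) | (y << 4) | (x << 2) | w.
// B,A,D,C == indices 1,0,3,2 == (1 << 6) | (0 << 4) | (3 << 2) | 2 == 0x4E.
static_assert(_MM_SHUFFLE(1, 0, 3, 2) == 0x4E, "BADC == 0x4E");
// Likewise C,D,A,B (used by SwapAdjacentBlocks below) is 0xB1.
static_assert(_MM_SHUFFLE(2, 3, 0, 1) == 0xB1, "CDAB == 0xB1");
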
@@ -2774,36 +2675,6 @@ HWY_API Vec512<double> ConcatEven(Full512<double> d, Vec512<double> hi,
                                                    __mmask8{0xFF}, hi.raw)};
}

// ------------------------------ DupEven (InterleaveLower)

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec512<T> DupEven(Vec512<T> v) {
  return Vec512<T>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CCAA)};
}
HWY_API Vec512<float> DupEven(Vec512<float> v) {
  return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_CCAA)};
}

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec512<T> DupEven(const Vec512<T> v) {
  return InterleaveLower(Full512<T>(), v, v);
}

// ------------------------------ DupOdd (InterleaveUpper)

template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec512<T> DupOdd(Vec512<T> v) {
  return Vec512<T>{_mm512_shuffle_epi32(v.raw, _MM_PERM_DDBB)};
}
HWY_API Vec512<float> DupOdd(Vec512<float> v) {
  return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_DDBB)};
}

template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec512<T> DupOdd(const Vec512<T> v) {
  return InterleaveUpper(Full512<T>(), v, v);
}

// ------------------------------ OddEven

template <typename T>

@@ -2834,29 +2705,17 @@ HWY_API Vec512<double> OddEvenBlocks(Vec512<double> odd, Vec512<double> even) {

template <typename T>
HWY_API Vec512<T> SwapAdjacentBlocks(Vec512<T> v) {
  return Vec512<T>{_mm512_shuffle_i32x4(v.raw, v.raw, _MM_PERM_CDAB)};
  return Vec512<T>{_mm512_shuffle_i32x4(v.raw, v.raw, _MM_SHUFFLE(2, 3, 0, 1))};
}

HWY_API Vec512<float> SwapAdjacentBlocks(Vec512<float> v) {
  return Vec512<float>{_mm512_shuffle_f32x4(v.raw, v.raw, _MM_PERM_CDAB)};
  return Vec512<float>{
      _mm512_shuffle_f32x4(v.raw, v.raw, _MM_SHUFFLE(2, 3, 0, 1))};
}

HWY_API Vec512<double> SwapAdjacentBlocks(Vec512<double> v) {
  return Vec512<double>{_mm512_shuffle_f64x2(v.raw, v.raw, _MM_PERM_CDAB)};
}

// ------------------------------ ReverseBlocks

template <typename T>
HWY_API Vec512<T> ReverseBlocks(Full512<T> /* tag */, Vec512<T> v) {
  return Vec512<T>{_mm512_shuffle_i32x4(v.raw, v.raw, _MM_PERM_ABCD)};
}
HWY_API Vec512<float> ReverseBlocks(Full512<float> /* tag */, Vec512<float> v) {
  return Vec512<float>{_mm512_shuffle_f32x4(v.raw, v.raw, _MM_PERM_ABCD)};
}
HWY_API Vec512<double> ReverseBlocks(Full512<double> /* tag */,
                                     Vec512<double> v) {
  return Vec512<double>{_mm512_shuffle_f64x2(v.raw, v.raw, _MM_PERM_ABCD)};
  return Vec512<double>{
      _mm512_shuffle_f64x2(v.raw, v.raw, _MM_SHUFFLE(2, 3, 0, 1))};
}

// ------------------------------ TableLookupBytes (ZeroExtendVector)

@@ -3153,23 +3012,17 @@ HWY_API Vec512<uint8_t> AESRound(Vec512<uint8_t> state,
#if HWY_TARGET == HWY_AVX3_DL
  return Vec512<uint8_t>{_mm512_aesenc_epi128(state.raw, round_key.raw)};
#else
  alignas(64) uint8_t a[64];
  alignas(64) uint8_t b[64];
  const Full512<uint8_t> d;
  const Half<decltype(d)> d2;
  return Combine(d, AESRound(UpperHalf(d2, state), UpperHalf(d2, round_key)),
                 AESRound(LowerHalf(state), LowerHalf(round_key)));
#endif
}

HWY_API Vec512<uint8_t> AESLastRound(Vec512<uint8_t> state,
                                     Vec512<uint8_t> round_key) {
#if HWY_TARGET == HWY_AVX3_DL
  return Vec512<uint8_t>{_mm512_aesenclast_epi128(state.raw, round_key.raw)};
#else
  const Full512<uint8_t> d;
  const Half<decltype(d)> d2;
  return Combine(d,
                 AESLastRound(UpperHalf(d2, state), UpperHalf(d2, round_key)),
                 AESLastRound(LowerHalf(state), LowerHalf(round_key)));
  const Full128<uint8_t> d128;
  Store(state, d, a);
  Store(round_key, d, b);
  for (size_t i = 0; i < 64; i += 16) {
    const auto enc = AESRound(Load(d128, a + i), Load(d128, b + i));
    Store(enc, d128, a + i);
  }
  return Load(d, a);
#endif
}

@@ -3411,8 +3264,8 @@ HWY_API Vec512<T> Compress(Vec512<T> v, const Mask512<T> mask) {
  const auto compressed0 = Compress(promoted0, mask0);
  const auto compressed1 = Compress(promoted1, mask1);

  const auto demoted0 = ZeroExtendVector(du, DemoteTo(duh, compressed0));
  const auto demoted1 = ZeroExtendVector(du, DemoteTo(duh, compressed1));
  const auto demoted0 = ZeroExtendVector(DemoteTo(duh, compressed0));
  const auto demoted1 = ZeroExtendVector(DemoteTo(duh, compressed1));

  // Concatenate into single vector by shifting upper with writemask.
  const size_t num0 = CountTrue(dw, mask0);

@@ -3510,7 +3363,7 @@ HWY_API size_t CompressBlendedStore(Vec512<T> v, Mask512<T> m, Full512<T> d,
  if (HWY_TARGET == HWY_AVX3_DL || sizeof(T) != 2) {
    return CompressStore(v, m, d, unaligned);
  } else {
    const size_t count = CountTrue(d, m);
    const size_t count = CountTrue(m);
    const Vec512<T> compressed = Compress(v, m);
    const Vec512<T> prev = LoadU(d, unaligned);
    StoreU(IfThenElse(FirstN(d, count), compressed, prev), d, unaligned);

@@ -3569,9 +3422,9 @@ HWY_API void StoreInterleaved3(const Vec512<uint8_t> a, const Vec512<uint8_t> b,
  const auto k = (r2 | g2 | b2).raw;  // low byte in each 128bit: 3A 2A 1A 0A

  // To obtain 10 0A 05 00 in one vector, transpose "rows" into "columns".
  const auto k3_k0_i3_i0 = _mm512_shuffle_i64x2(i, k, _MM_PERM_DADA);
  const auto i1_i2_j0_j1 = _mm512_shuffle_i64x2(j, i, _MM_PERM_BCAB);
  const auto j2_j3_k1_k2 = _mm512_shuffle_i64x2(k, j, _MM_PERM_CDBC);
  const auto k3_k0_i3_i0 = _mm512_shuffle_i64x2(i, k, _MM_SHUFFLE(3, 0, 3, 0));
  const auto i1_i2_j0_j1 = _mm512_shuffle_i64x2(j, i, _MM_SHUFFLE(1, 2, 0, 1));
  const auto j2_j3_k1_k2 = _mm512_shuffle_i64x2(k, j, _MM_SHUFFLE(2, 3, 1, 2));

  // Alternating order, most-significant 128 bits from the second arg.
  const __mmask8 m = 0xCC;

@@ -3603,12 +3456,12 @@ HWY_API void StoreInterleaved4(const Vec512<uint8_t> v0,
  const auto k = ZipLower(d32, ba8, dc8).raw;  // 4x128bit: d..aB d..a8
  const auto l = ZipUpper(d32, ba8, dc8).raw;  // 4x128bit: d..aF d..aC
  // 128-bit blocks were independent until now; transpose 4x4.
  const auto j1_j0_i1_i0 = _mm512_shuffle_i64x2(i, j, _MM_PERM_BABA);
  const auto l1_l0_k1_k0 = _mm512_shuffle_i64x2(k, l, _MM_PERM_BABA);
  const auto j3_j2_i3_i2 = _mm512_shuffle_i64x2(i, j, _MM_PERM_DCDC);
  const auto l3_l2_k3_k2 = _mm512_shuffle_i64x2(k, l, _MM_PERM_DCDC);
  constexpr _MM_PERM_ENUM k20 = _MM_PERM_CACA;
  constexpr _MM_PERM_ENUM k31 = _MM_PERM_DBDB;
  const auto j1_j0_i1_i0 = _mm512_shuffle_i64x2(i, j, _MM_SHUFFLE(1, 0, 1, 0));
  const auto l1_l0_k1_k0 = _mm512_shuffle_i64x2(k, l, _MM_SHUFFLE(1, 0, 1, 0));
  const auto j3_j2_i3_i2 = _mm512_shuffle_i64x2(i, j, _MM_SHUFFLE(3, 2, 3, 2));
  const auto l3_l2_k3_k2 = _mm512_shuffle_i64x2(k, l, _MM_SHUFFLE(3, 2, 3, 2));
  constexpr int k20 = _MM_SHUFFLE(2, 0, 2, 0);
  constexpr int k31 = _MM_SHUFFLE(3, 1, 3, 1);
  const auto l0_k0_j0_i0 = _mm512_shuffle_i64x2(j1_j0_i1_i0, l1_l0_k1_k0, k20);
  const auto l1_k1_j1_i1 = _mm512_shuffle_i64x2(j1_j0_i1_i0, l1_l0_k1_k0, k31);
  const auto l2_k2_j2_i2 = _mm512_shuffle_i64x2(j3_j2_i3_i2, l3_l2_k3_k2, k20);

@@ -3778,6 +3631,103 @@ HWY_API Vec512<T> MaxOfLanes(Full512<T> d, Vec512<T> v) {
  return BitCast(d, Or(min, ShiftLeft<16>(min)));
}

// ================================================== DEPRECATED

template <typename T>
HWY_API size_t StoreMaskBits(const Mask512<T> mask, uint8_t* bits) {
  return StoreMaskBits(Full512<T>(), mask, bits);
}

template <typename T>
HWY_API bool AllTrue(const Mask512<T> mask) {
  return AllTrue(Full512<T>(), mask);
}

template <typename T>
HWY_API bool AllFalse(const Mask512<T> mask) {
  return AllFalse(Full512<T>(), mask);
}

template <typename T>
HWY_API size_t CountTrue(const Mask512<T> mask) {
  return CountTrue(Full512<T>(), mask);
}

template <typename T>
HWY_API Vec512<T> SumOfLanes(Vec512<T> v) {
  return SumOfLanes(Full512<T>(), v);
}

template <typename T>
HWY_API Vec512<T> MinOfLanes(Vec512<T> v) {
  return MinOfLanes(Full512<T>(), v);
}

template <typename T>
HWY_API Vec512<T> MaxOfLanes(Vec512<T> v) {
  return MaxOfLanes(Full512<T>(), v);
}

template <typename T>
HWY_API Vec256<T> UpperHalf(Vec512<T> v) {
  return UpperHalf(Full256<T>(), v);
}

template <int kBytes, typename T>
HWY_API Vec512<T> ShiftRightBytes(const Vec512<T> v) {
  return ShiftRightBytes<kBytes>(Full512<T>(), v);
}

template <int kLanes, typename T>
HWY_API Vec512<T> ShiftRightLanes(const Vec512<T> v) {
  return ShiftRightBytes<kLanes>(Full512<T>(), v);
}

template <size_t kBytes, typename T>
HWY_API Vec512<T> CombineShiftRightBytes(Vec512<T> hi, Vec512<T> lo) {
  return CombineShiftRightBytes<kBytes>(Full512<T>(), hi, lo);
}

template <typename T>
HWY_API Vec512<T> InterleaveUpper(Vec512<T> a, Vec512<T> b) {
  return InterleaveUpper(Full512<T>(), a, b);
}

template <typename T>
HWY_API Vec512<MakeWide<T>> ZipUpper(Vec512<T> a, Vec512<T> b) {
  return InterleaveUpper(Full512<MakeWide<T>>(), a, b);
}

template <typename T>
HWY_API Vec512<T> Combine(Vec256<T> hi, Vec256<T> lo) {
  return Combine(Full512<T>(), hi, lo);
}

template <typename T>
HWY_API Vec512<T> ZeroExtendVector(Vec256<T> lo) {
  return ZeroExtendVector(Full512<T>(), lo);
}

template <typename T>
HWY_API Vec512<T> ConcatLowerLower(Vec512<T> hi, Vec512<T> lo) {
  return ConcatLowerLower(Full512<T>(), hi, lo);
}

template <typename T>
HWY_API Vec512<T> ConcatLowerUpper(Vec512<T> hi, Vec512<T> lo) {
  return ConcatLowerUpper(Full512<T>(), hi, lo);
}

template <typename T>
HWY_API Vec512<T> ConcatUpperLower(Vec512<T> hi, Vec512<T> lo) {
  return ConcatUpperLower(Full512<T>(), hi, lo);
}

template <typename T>
HWY_API Vec512<T> ConcatUpperUpper(Vec512<T> hi, Vec512<T> lo) {
  return ConcatUpperUpper(Full512<T>(), hi, lo);
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy

@@ -15,25 +15,23 @@
#include "hwy/targets.h"

#include <stdarg.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#include <atomic>
#include <cstddef>
#include <limits>

#include "hwy/base.h"

#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
    defined(THREAD_SANITIZER)
#include "sanitizer/common_interface_defs.h"  // __sanitizer_print_stack_trace
#endif

#include <stdlib.h>  // abort / exit
#endif  // defined(*_SANITIZER)

#if HWY_ARCH_X86
#include <xmmintrin.h>
#if HWY_COMPILER_MSVC
#include <intrin.h>
#else   // !HWY_COMPILER_MSVC
#else   // HWY_COMPILER_MSVC
#include <cpuid.h>
#endif  // HWY_COMPILER_MSVC
#endif  // HWY_ARCH_X86

@@ -95,7 +93,7 @@ std::atomic<uint32_t> supported_{0};  // Not yet initialized
uint32_t supported_targets_for_test_ = 0;

// Mask of targets disabled at runtime with DisableTargets.
uint32_t supported_mask_{LimitsMax<uint32_t>()};
uint32_t supported_mask_{std::numeric_limits<uint32_t>::max()};

#if HWY_ARCH_X86
// Arbritrary bit indices indicating which instruction set extensions are

@@ -192,22 +190,21 @@ HWY_NORETURN void HWY_FORMAT(3, 4)
  va_end(args);

  fprintf(stderr, "Abort at %s:%d: %s\n", file, line, buf);

  // If compiled with any sanitizer, they can also print a stack trace.
#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
    defined(THREAD_SANITIZER)
  // If compiled with any sanitizer print a stack trace. This call doesn't
  // crash the program, instead the trap below will crash it also allowing
  // gdb to break there.
  __sanitizer_print_stack_trace();
#endif  // HWY_IS_*
#endif  // defined(*_SANITIZER)
  fflush(stderr);

  // Now terminate the program:
#if HWY_ARCH_RVV
  exit(1);  // trap/abort just freeze Spike.
#elif HWY_IS_DEBUG_BUILD && !HWY_COMPILER_MSVC
  // Facilitates breaking into a debugger, but don't use this in non-debug
  // builds because it looks like "illegal instruction", which is misleading.
  __builtin_trap();
#else
#if HWY_COMPILER_MSVC
  abort();  // Compile error without this due to HWY_NORETURN.
#elif HWY_ARCH_RVV
  exit(1);  // trap/abort just freeze Spike
#else
  __builtin_trap();
#endif
}

@@ -216,7 +213,7 @@ void DisableTargets(uint32_t disabled_targets) {
  // We can call Update() here to initialize the mask but that will trigger a
  // call to SupportedTargets() which we use in tests to tell whether any of
  // the highway dynamic dispatch functions were used.
  GetChosenTarget().DeInit();
  chosen_target.DeInit();
}

void SetSupportedTargetsForTest(uint32_t targets) {

@@ -225,7 +222,7 @@ void SetSupportedTargetsForTest(uint32_t targets) {
  // if not zero.
  supported_.store(0, std::memory_order_release);
  supported_targets_for_test_ = targets;
  GetChosenTarget().DeInit();
  chosen_target.DeInit();
}

bool SupportedTargetsCalledForTest() {

@@ -347,10 +344,8 @@ uint32_t SupportedTargets() {
  return bits & supported_mask_;
}

HWY_DLLEXPORT ChosenTarget& GetChosenTarget() {
  static ChosenTarget chosen_target;
  return chosen_target;
}
// Declared in targets.h
ChosenTarget chosen_target;

void ChosenTarget::Update() {
  // The supported variable contains the current CPU supported targets shifted
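
The hunk above also swaps a function-local static accessor for a plain global: GetChosenTarget() guarantees initialization on first use and gives shared libraries a single exported access point, whereas the older code restored here names the global chosen_target directly. A reduced sketch of the two idioms, with simplified stand-in names (illustration only, not library code):

struct ChosenTargetSketch {
  void DeInit() {}
};

// Newer style: Meyers singleton behind an accessor; initialization is lazy
// and thread-safe since C++11, and only the accessor needs exporting.
ChosenTargetSketch& GetChosenTargetSketch() {
  static ChosenTargetSketch t;
  return t;
}

// Older style restored by this backout: a global that callers reference.
ChosenTargetSketch chosen_target_sketch;
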
@@ -22,7 +22,6 @@

#include "hwy/base.h"
#include "hwy/detect_targets.h"
#include "hwy/highway_export.h"

namespace hwy {

@@ -30,7 +29,7 @@ namespace hwy {
// Implemented in targets.cc; unconditionally compiled to support the use case
// of binary-only distributions. The HWY_SUPPORTED_TARGETS wrapper may allow
// eliding calls to this function.
HWY_DLLEXPORT uint32_t SupportedTargets();
uint32_t SupportedTargets();

// Evaluates to a function call, or literal if there is a single target.
#if (HWY_TARGETS & (HWY_TARGETS - 1)) == 0

@@ -45,7 +44,7 @@ HWY_DLLEXPORT uint32_t SupportedTargets();
// lower target is desired. For this reason, attempts to disable targets which
// are in HWY_ENABLED_BASELINE have no effect so SupportedTargets() always
// returns at least the baseline target.
HWY_DLLEXPORT void DisableTargets(uint32_t disabled_targets);
void DisableTargets(uint32_t disabled_targets);

// Set the mock mask of CPU supported targets instead of the actual CPU
// supported targets computed in SupportedTargets(). The return value of

@@ -53,11 +52,11 @@ HWY_DLLEXPORT void DisableTargets(uint32_t disabled_targets);
// regardless of this mock, to prevent accidentally adding targets that are
// known to be buggy in the current CPU. Call with a mask of 0 to disable the
// mock and use the actual CPU supported targets instead.
HWY_DLLEXPORT void SetSupportedTargetsForTest(uint32_t targets);
void SetSupportedTargetsForTest(uint32_t targets);

// Returns whether the SupportedTargets() function was called since the last
// SetSupportedTargetsForTest() call.
HWY_DLLEXPORT bool SupportedTargetsCalledForTest();
bool SupportedTargetsCalledForTest();

// Return the list of targets in HWY_TARGETS supported by the CPU as a list of
// individual HWY_* target macros such as HWY_SCALAR or HWY_NEON. This list

@@ -226,7 +225,7 @@ struct ChosenTarget {
 public:
  // Update the ChosenTarget mask based on the current CPU supported
  // targets.
  HWY_DLLEXPORT void Update();
  void Update();

  // Reset the ChosenTarget to the uninitialized state.
  void DeInit() { mask_.store(1); }

@@ -246,12 +245,11 @@ struct ChosenTarget {
  }

 private:
  // Initialized to 1 so GetIndex() returns 0.
  // Initialized to 1 so GetChosenTargetIndex() returns 0.
  std::atomic<uint32_t> mask_{1};
};

// For internal use (e.g. by FunctionCache and DisableTargets).
HWY_DLLEXPORT ChosenTarget& GetChosenTarget();
extern ChosenTarget chosen_target;

}  // namespace hwy

@@ -44,11 +44,11 @@ void CheckFakeFunction() {
    hwy::SetSupportedTargetsForTest(HWY_##TGT);                              \
    /* Calling Update() first to make &HWY_DYNAMIC_DISPATCH() return */     \
    /* the pointer to the already cached function. */                       \
    hwy::GetChosenTarget().Update();                                        \
    hwy::chosen_target.Update();                                            \
    EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \
    /* Calling DeInit() will test that the initializer function */          \
    /* also calls the right function. */                                    \
    hwy::GetChosenTarget().DeInit();                                        \
    hwy::chosen_target.DeInit();                                            \
    EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \
    /* Second call uses the cached value from the previous call. */         \
    EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \

@@ -177,6 +177,387 @@ HWY_NOINLINE void TestAllAbs() {
  ForFloatTypes(ForPartialVectors<TestFloatAbs>());
}

template <bool kSigned>
struct TestLeftShifts {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T t, D d) {
    if (kSigned) {
      // Also test positive values
      TestLeftShifts</*kSigned=*/false>()(t, d);
    }

    using TI = MakeSigned<T>;
    using TU = MakeUnsigned<T>;
    const size_t N = Lanes(d);
    auto expected = AllocateAligned<T>(N);

    const auto values = Iota(d, kSigned ? -TI(N) : TI(0));  // value to shift
    constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;

    // 0
    HWY_ASSERT_VEC_EQ(d, values, ShiftLeft<0>(values));
    HWY_ASSERT_VEC_EQ(d, values, ShiftLeftSame(values, 0));

    // 1
    for (size_t i = 0; i < N; ++i) {
      const T value = kSigned ? T(T(i) - T(N)) : T(i);
      expected[i] = T(TU(value) << 1);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<1>(values));
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, 1));

    // max
    for (size_t i = 0; i < N; ++i) {
      const T value = kSigned ? T(T(i) - T(N)) : T(i);
      expected[i] = T(TU(value) << kMaxShift);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<kMaxShift>(values));
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, kMaxShift));
  }
};

template <bool kSigned>
struct TestVariableLeftShifts {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T t, D d) {
    if (kSigned) {
      // Also test positive values
      TestVariableLeftShifts</*kSigned=*/false>()(t, d);
    }

    using TI = MakeSigned<T>;
    using TU = MakeUnsigned<T>;
    const size_t N = Lanes(d);
    auto expected = AllocateAligned<T>(N);

    const auto v0 = Zero(d);
    const auto v1 = Set(d, 1);
    const auto values = Iota(d, kSigned ? -TI(N) : TI(0));  // value to shift

    constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
    const auto max_shift = Set(d, kMaxShift);
    const auto small_shifts = And(Iota(d, 0), max_shift);
    const auto large_shifts = max_shift - small_shifts;

    // Same: 0
    HWY_ASSERT_VEC_EQ(d, values, Shl(values, v0));

    // Same: 1
    for (size_t i = 0; i < N; ++i) {
      const T value = kSigned ? T(i) - T(N) : T(i);
      expected[i] = T(TU(value) << 1);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, v1));

    // Same: max
    for (size_t i = 0; i < N; ++i) {
      const T value = kSigned ? T(i) - T(N) : T(i);
      expected[i] = T(TU(value) << kMaxShift);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, max_shift));

    // Variable: small
    for (size_t i = 0; i < N; ++i) {
      const T value = kSigned ? T(i) - T(N) : T(i);
      expected[i] = T(TU(value) << (i & kMaxShift));
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, small_shifts));

    // Variable: large
    for (size_t i = 0; i < N; ++i) {
      expected[i] = T(TU(1) << (kMaxShift - (i & kMaxShift)));
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(v1, large_shifts));
  }
};

struct TestUnsignedRightShifts {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    auto expected = AllocateAligned<T>(N);

    const auto values = Iota(d, 0);

    const T kMax = LimitsMax<T>();
    constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;

    // Shift by 0
    HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values));
    HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0));

    // Shift by 1
    for (size_t i = 0; i < N; ++i) {
      expected[i] = T(T(i & kMax) >> 1);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values));
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1));

    // max
    for (size_t i = 0; i < N; ++i) {
      expected[i] = T(T(i & kMax) >> kMaxShift);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<kMaxShift>(values));
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, kMaxShift));
  }
};

struct TestRotateRight {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    auto expected = AllocateAligned<T>(N);

    constexpr size_t kBits = sizeof(T) * 8;
    const auto mask_shift = Set(d, T{kBits});
    // Cover as many bit positions as possible to test shifting out
    const auto values = Shl(Set(d, T{1}), And(Iota(d, 0), mask_shift));

    // Rotate by 0
    HWY_ASSERT_VEC_EQ(d, values, RotateRight<0>(values));

    // Rotate by 1
    Store(values, d, expected.get());
    for (size_t i = 0; i < N; ++i) {
      expected[i] = (expected[i] >> 1) | (expected[i] << (kBits - 1));
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<1>(values));

    // Rotate by half
    Store(values, d, expected.get());
    for (size_t i = 0; i < N; ++i) {
      expected[i] = (expected[i] >> (kBits / 2)) | (expected[i] << (kBits / 2));
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<kBits / 2>(values));

    // Rotate by max
    Store(values, d, expected.get());
    for (size_t i = 0; i < N; ++i) {
      expected[i] = (expected[i] >> (kBits - 1)) | (expected[i] << 1);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<kBits - 1>(values));
  }
};

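The expected[] updates above implement the standard rotate identity: rotating right by k ORs the low k bits back in at the top. A standalone scalar version for uint32_t (my sketch; valid for 0 < k < 32, since shifting by the full width is undefined):

#include <cstdint>

uint32_t RotateRight32(uint32_t x, unsigned k) {
  // The low k bits wrap around to become the high k bits.
  return (x >> k) | (x << (32 - k));
}
// Example: RotateRight32(1u, 1) == 0x80000000u.
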
struct TestVariableUnsignedRightShifts {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    auto expected = AllocateAligned<T>(N);

    const auto v0 = Zero(d);
    const auto v1 = Set(d, 1);
    const auto values = Iota(d, 0);

    const T kMax = LimitsMax<T>();
    const auto max = Set(d, kMax);

    constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
    const auto max_shift = Set(d, kMaxShift);
    const auto small_shifts = And(Iota(d, 0), max_shift);
    const auto large_shifts = max_shift - small_shifts;

    // Same: 0
    HWY_ASSERT_VEC_EQ(d, values, Shr(values, v0));

    // Same: 1
    for (size_t i = 0; i < N; ++i) {
      expected[i] = T(T(i & kMax) >> 1);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(values, v1));

    // Same: max
    HWY_ASSERT_VEC_EQ(d, v0, Shr(values, max_shift));

    // Variable: small
    for (size_t i = 0; i < N; ++i) {
      expected[i] = T(i) >> (i & kMaxShift);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(values, small_shifts));

    // Variable: Large
    for (size_t i = 0; i < N; ++i) {
      expected[i] = kMax >> (kMaxShift - (i & kMaxShift));
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(max, large_shifts));
  }
};

template <int kAmount, typename T>
T RightShiftNegative(T val) {
  // C++ shifts are implementation-defined for negative numbers, and we have
  // seen divisions replaced with shifts, so resort to bit operations.
  using TU = hwy::MakeUnsigned<T>;
  TU bits;
  CopyBytes<sizeof(T)>(&val, &bits);

  const TU shifted = TU(bits >> kAmount);

  const TU all = TU(~TU(0));
  const size_t num_zero = sizeof(TU) * 8 - 1 - kAmount;
  const TU sign_extended = static_cast<TU>((all << num_zero) & LimitsMax<TU>());

  bits = shifted | sign_extended;
  CopyBytes<sizeof(T)>(&bits, &val);
  return val;
}

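RightShiftNegative above avoids shifting a negative value by operating on the unsigned bit pattern and then setting the vacated sign bits. An equivalent fixed-width sketch for int32_t (mine, not from the patch; assumes two's complement and 0 <= k < 32):

#include <cstdint>
#include <cstring>

int32_t ArithmeticShiftRight32(int32_t val, unsigned k) {
  if (k == 0) return val;
  uint32_t bits;
  std::memcpy(&bits, &val, sizeof(bits));  // reinterpret without UB
  uint32_t shifted = bits >> k;
  if (bits & 0x80000000u) {
    shifted |= ~0u << (32 - k);  // sign-extend the vacated top bits
  }
  std::memcpy(&val, &shifted, sizeof(val));
  return val;
}
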
class TestSignedRightShifts {
 public:
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    auto expected = AllocateAligned<T>(N);
    constexpr T kMin = LimitsMin<T>();
    constexpr T kMax = LimitsMax<T>();
    constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;

    // First test positive values, negative are checked below.
    const auto v0 = Zero(d);
    const auto values = And(Iota(d, 0), Set(d, kMax));

    // Shift by 0
    HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values));
    HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0));

    // Shift by 1
    for (size_t i = 0; i < N; ++i) {
      expected[i] = T(T(i & kMax) >> 1);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values));
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1));

    // max
    HWY_ASSERT_VEC_EQ(d, v0, ShiftRight<kMaxShift>(values));
    HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(values, kMaxShift));

    // Even negative value
    Test<0>(kMin, d, __LINE__);
    Test<1>(kMin, d, __LINE__);
    Test<2>(kMin, d, __LINE__);
    Test<kMaxShift>(kMin, d, __LINE__);

    const T odd = static_cast<T>(kMin + 1);
    Test<0>(odd, d, __LINE__);
    Test<1>(odd, d, __LINE__);
    Test<2>(odd, d, __LINE__);
    Test<kMaxShift>(odd, d, __LINE__);
  }

 private:
  template <int kAmount, typename T, class D>
  void Test(T val, D d, int line) {
    const auto expected = Set(d, RightShiftNegative<kAmount>(val));
    const auto in = Set(d, val);
    const char* file = __FILE__;
    AssertVecEqual(d, expected, ShiftRight<kAmount>(in), file, line);
    AssertVecEqual(d, expected, ShiftRightSame(in, kAmount), file, line);
  }
};

struct TestVariableSignedRightShifts {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    using TU = MakeUnsigned<T>;
    const size_t N = Lanes(d);
    auto expected = AllocateAligned<T>(N);

    constexpr T kMin = LimitsMin<T>();
    constexpr T kMax = LimitsMax<T>();

    constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;

    // First test positive values, negative are checked below.
    const auto v0 = Zero(d);
    const auto positive = Iota(d, 0) & Set(d, kMax);

    // Shift by 0
    HWY_ASSERT_VEC_EQ(d, positive, ShiftRight<0>(positive));
    HWY_ASSERT_VEC_EQ(d, positive, ShiftRightSame(positive, 0));

    // Shift by 1
    for (size_t i = 0; i < N; ++i) {
      expected[i] = T(T(i & kMax) >> 1);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(positive));
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(positive, 1));

    // max
    HWY_ASSERT_VEC_EQ(d, v0, ShiftRight<kMaxShift>(positive));
    HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(positive, kMaxShift));

    const auto max_shift = Set(d, kMaxShift);
    const auto small_shifts = And(Iota(d, 0), max_shift);
    const auto large_shifts = max_shift - small_shifts;

    const auto negative = Iota(d, kMin);

    // Test varying negative to shift
    for (size_t i = 0; i < N; ++i) {
      expected[i] = RightShiftNegative<1>(static_cast<T>(kMin + i));
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(negative, Set(d, 1)));

    // Shift MSB right by small amounts
    for (size_t i = 0; i < N; ++i) {
      const size_t amount = i & kMaxShift;
      const TU shifted = ~((1ull << (kMaxShift - amount)) - 1);
      CopyBytes<sizeof(T)>(&shifted, &expected[i]);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(Set(d, kMin), small_shifts));

    // Shift MSB right by large amounts
    for (size_t i = 0; i < N; ++i) {
      const size_t amount = kMaxShift - (i & kMaxShift);
      const TU shifted = ~((1ull << (kMaxShift - amount)) - 1);
      CopyBytes<sizeof(T)>(&shifted, &expected[i]);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(Set(d, kMin), large_shifts));
  }
};

HWY_NOINLINE void TestAllShifts() {
  ForUnsignedTypes(ForPartialVectors<TestLeftShifts</*kSigned=*/false>>());
  ForSignedTypes(ForPartialVectors<TestLeftShifts</*kSigned=*/true>>());
  ForUnsignedTypes(ForPartialVectors<TestUnsignedRightShifts>());
  ForSignedTypes(ForPartialVectors<TestSignedRightShifts>());
}

HWY_NOINLINE void TestAllVariableShifts() {
  const ForPartialVectors<TestLeftShifts</*kSigned=*/false>> shl_u;
  const ForPartialVectors<TestLeftShifts</*kSigned=*/true>> shl_s;
  const ForPartialVectors<TestUnsignedRightShifts> shr_u;
  const ForPartialVectors<TestSignedRightShifts> shr_s;

  shl_u(uint16_t());
  shr_u(uint16_t());

  shl_u(uint32_t());
  shr_u(uint32_t());

  shl_s(int16_t());
  shr_s(int16_t());

  shl_s(int32_t());
  shr_s(int32_t());

#if HWY_CAP_INTEGER64
  shl_u(uint64_t());
  shr_u(uint64_t());

  shl_s(int64_t());
  shr_s(int64_t());
#endif
}

HWY_NOINLINE void TestAllRotateRight() {
  const ForPartialVectors<TestRotateRight> test;
  test(uint32_t());
#if HWY_CAP_INTEGER64
  test(uint64_t());
#endif
}

struct TestUnsignedMinMax {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {

@@ -263,84 +644,6 @@ HWY_NOINLINE void TestAllMinMax() {
  ForFloatTypes(ForPartialVectors<TestFloatMinMax>());
}

class TestMinMax128 {
  template <class D>
  static HWY_NOINLINE Vec<D> Make128(D d, uint64_t hi, uint64_t lo) {
    alignas(16) uint64_t in[2];
    in[0] = lo;
    in[1] = hi;
    return LoadDup128(d, in);
  }

 public:
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    using V = Vec<D>;
    const size_t N = Lanes(d);
    auto a_lanes = AllocateAligned<T>(N);
    auto b_lanes = AllocateAligned<T>(N);
    auto min_lanes = AllocateAligned<T>(N);
    auto max_lanes = AllocateAligned<T>(N);
    RandomState rng;

    const V v00 = Zero(d);
    const V v01 = Make128(d, 0, 1);
    const V v10 = Make128(d, 1, 0);
    const V v11 = Add(v01, v10);

    // Same arg
    HWY_ASSERT_VEC_EQ(d, v00, Min128(d, v00, v00));
    HWY_ASSERT_VEC_EQ(d, v01, Min128(d, v01, v01));
    HWY_ASSERT_VEC_EQ(d, v10, Min128(d, v10, v10));
    HWY_ASSERT_VEC_EQ(d, v11, Min128(d, v11, v11));
    HWY_ASSERT_VEC_EQ(d, v00, Max128(d, v00, v00));
    HWY_ASSERT_VEC_EQ(d, v01, Max128(d, v01, v01));
    HWY_ASSERT_VEC_EQ(d, v10, Max128(d, v10, v10));
    HWY_ASSERT_VEC_EQ(d, v11, Max128(d, v11, v11));

    // First arg less
    HWY_ASSERT_VEC_EQ(d, v00, Min128(d, v00, v01));
    HWY_ASSERT_VEC_EQ(d, v01, Min128(d, v01, v10));
    HWY_ASSERT_VEC_EQ(d, v10, Min128(d, v10, v11));
    HWY_ASSERT_VEC_EQ(d, v01, Max128(d, v00, v01));
    HWY_ASSERT_VEC_EQ(d, v10, Max128(d, v01, v10));
    HWY_ASSERT_VEC_EQ(d, v11, Max128(d, v10, v11));

    // Second arg less
    HWY_ASSERT_VEC_EQ(d, v00, Min128(d, v01, v00));
    HWY_ASSERT_VEC_EQ(d, v01, Min128(d, v10, v01));
    HWY_ASSERT_VEC_EQ(d, v10, Min128(d, v11, v10));
    HWY_ASSERT_VEC_EQ(d, v01, Max128(d, v01, v00));
    HWY_ASSERT_VEC_EQ(d, v10, Max128(d, v10, v01));
    HWY_ASSERT_VEC_EQ(d, v11, Max128(d, v11, v10));

    // Also check 128-bit blocks are independent
    for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
      for (size_t i = 0; i < N; ++i) {
        a_lanes[i] = Random64(&rng);
        b_lanes[i] = Random64(&rng);
      }
      const V a = Load(d, a_lanes.get());
      const V b = Load(d, b_lanes.get());
      for (size_t i = 0; i < N; i += 2) {
        const bool lt = a_lanes[i + 1] == b_lanes[i + 1]
                            ? (a_lanes[i] < b_lanes[i])
                            : (a_lanes[i + 1] < b_lanes[i + 1]);
        min_lanes[i + 0] = lt ? a_lanes[i + 0] : b_lanes[i + 0];
        min_lanes[i + 1] = lt ? a_lanes[i + 1] : b_lanes[i + 1];
        max_lanes[i + 0] = lt ? b_lanes[i + 0] : a_lanes[i + 0];
        max_lanes[i + 1] = lt ? b_lanes[i + 1] : a_lanes[i + 1];
      }
      HWY_ASSERT_VEC_EQ(d, min_lanes.get(), Min128(d, a, b));
      HWY_ASSERT_VEC_EQ(d, max_lanes.get(), Max128(d, a, b));
    }
  }
};

HWY_NOINLINE void TestAllMinMax128() {
  ForGEVectors<128, TestMinMax128>()(uint64_t());
}

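The reference loop in TestMinMax128 treats lanes (i, i+1) as the low and high u64 halves of one 128-bit value and selects the whole operand per block. The per-block rule, as a scalar sketch (an illustration, not library code):

#include <cstdint>
#include <utility>

using U128 = std::pair<uint64_t, uint64_t>;  // (lo, hi)

U128 Min128Scalar(const U128& a, const U128& b) {
  // Compare high halves first; ties are decided by the low halves.
  const bool lt = (a.second == b.second) ? (a.first < b.first)
                                         : (a.second < b.second);
  return lt ? a : b;  // the entire 128-bit operand is selected
}
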
struct TestUnsignedMul {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {

@@ -531,11 +834,11 @@ struct TestMulEvenOdd64 {
};

HWY_NOINLINE void TestAllMulEven() {
  ForGEVectors<64, TestMulEven> test;
  ForExtendableVectors<TestMulEven> test;
  test(int32_t());
  test(uint32_t());

  ForGEVectors<128, TestMulEvenOdd64>()(uint64_t());
  ForGE128Vectors<TestMulEvenOdd64>()(uint64_t());
}

struct TestMulAdd {

@@ -810,6 +1113,7 @@ AlignedFreeUniquePtr<T[]> RoundTestCases(T /*unused*/, D d, size_t& padded) {
      // negative +/- epsilon
      T(-1) + eps,
      T(-1) - eps,
#if !defined(HWY_EMULATE_SVE)  // these are not safe to just cast to int
      // +/- huge (but still fits in float)
      T(1E34),
      T(-1E35),

@@ -818,6 +1122,7 @@ AlignedFreeUniquePtr<T[]> RoundTestCases(T /*unused*/, D d, size_t& padded) {
      -std::numeric_limits<T>::infinity(),
      // qNaN
      GetLane(NaN(d))
#endif
  };
  const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
  const size_t N = Lanes(d);

@@ -1064,41 +1369,6 @@ HWY_NOINLINE void TestAllAbsDiff() {
  ForPartialVectors<TestAbsDiff>()(float());
}

struct TestSumsOf8 {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    RandomState rng;

    const size_t N = Lanes(d);
    if (N < 8) return;
    const Repartition<uint64_t, D> du64;

    auto in_lanes = AllocateAligned<T>(N);
    auto sum_lanes = AllocateAligned<uint64_t>(N / 8);

    for (size_t rep = 0; rep < 100; ++rep) {
      for (size_t i = 0; i < N; ++i) {
        in_lanes[i] = Random64(&rng) & 0xFF;
      }

      for (size_t idx_sum = 0; idx_sum < N / 8; ++idx_sum) {
        uint64_t sum = 0;
        for (size_t i = 0; i < 8; ++i) {
          sum += in_lanes[idx_sum * 8 + i];
        }
        sum_lanes[idx_sum] = sum;
      }

      const Vec<D> in = Load(d, in_lanes.get());
      HWY_ASSERT_VEC_EQ(du64, sum_lanes.get(), SumsOf8(in));
    }
  }
};

HWY_NOINLINE void TestAllSumsOf8() {
  ForGEVectors<64, TestSumsOf8>()(uint8_t());
}

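SumsOf8 reduces each aligned group of eight u8 lanes to one u64 lane, which is exactly the nested reference loop above. As a standalone scalar model (my sketch):

#include <cstddef>
#include <cstdint>

// out[j] = in[8*j] + ... + in[8*j + 7]; n must be a multiple of 8.
void SumsOf8Scalar(const uint8_t* in, size_t n, uint64_t* out) {
  for (size_t j = 0; j < n / 8; ++j) {
    uint64_t sum = 0;
    for (size_t i = 0; i < 8; ++i) sum += in[8 * j + i];
    out[j] = sum;
  }
}
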
struct TestNeg {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {

@@ -1127,8 +1397,10 @@ namespace hwy {
HWY_BEFORE_TEST(HwyArithmeticTest);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllPlusMinus);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSaturatingArithmetic);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllShifts);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllVariableShifts);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllRotateRight);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax128);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAverage);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAbs);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMul);

@@ -1148,7 +1420,6 @@ HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllTrunc);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllCeil);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllFloor);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAbsDiff);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSumsOf8);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllNeg);
}  // namespace hwy

@@ -393,7 +393,7 @@ HWY_NOINLINE void TestAllZip() {
  lower_unsigned(uint8_t());
#endif
  lower_unsigned(uint16_t());
#if HWY_HAVE_INTEGER64
#if HWY_CAP_INTEGER64
  lower_unsigned(uint32_t());  // generates u64
#endif

@@ -402,7 +402,7 @@ HWY_NOINLINE void TestAllZip() {
  lower_signed(int8_t());
#endif
  lower_signed(int16_t());
#if HWY_HAVE_INTEGER64
#if HWY_CAP_INTEGER64
  lower_signed(int32_t());  // generates i64
#endif

@@ -411,7 +411,7 @@ HWY_NOINLINE void TestAllZip() {
  upper_unsigned(uint8_t());
#endif
  upper_unsigned(uint16_t());
#if HWY_HAVE_INTEGER64
#if HWY_CAP_INTEGER64
  upper_unsigned(uint32_t());  // generates u64
#endif

@@ -420,20 +420,19 @@ HWY_NOINLINE void TestAllZip() {
  upper_signed(int8_t());
#endif
  upper_signed(int16_t());
#if HWY_HAVE_INTEGER64
#if HWY_CAP_INTEGER64
  upper_signed(int32_t());  // generates i64
#endif

  // No float - concatenating f32 does not result in a f64
}

template <int kBytes>
struct TestCombineShiftRightBytesR {
  template <class T, class D>
  HWY_NOINLINE void operator()(T t, D d) {
    // Scalar does not define CombineShiftRightBytes.
#if HWY_TARGET != HWY_SCALAR || HWY_IDE

template <int kBytes>
struct TestCombineShiftRightBytes {
  template <class T, class D>
  HWY_NOINLINE void operator()(T, D d) {
    const size_t kBlockSize = 16;
    static_assert(kBytes < kBlockSize, "Shift count is per block");
    const Repartition<uint8_t, D> d8;

@@ -462,13 +461,21 @@ struct TestCombineShiftRightBytes {
    const auto expected = BitCast(d, Load(d8, expected_bytes.get()));
    HWY_ASSERT_VEC_EQ(d, expected, CombineShiftRightBytes<kBytes>(d, hi, lo));
  }

    TestCombineShiftRightBytesR<kBytes - 1>()(t, d);
#else
    (void)t;
    (void)d;
#endif  // #if HWY_TARGET != HWY_SCALAR
  }
};

template <int kLanes>
struct TestCombineShiftRightLanes {
struct TestCombineShiftRightLanesR {
  template <class T, class D>
  HWY_NOINLINE void operator()(T, D d) {
  HWY_NOINLINE void operator()(T t, D d) {
    // Scalar does not define CombineShiftRightBytes (needed for *Lanes).
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
    const Repartition<uint8_t, D> d8;
    const size_t N8 = Lanes(d8);
    if (N8 < 16) return;

@@ -498,29 +505,33 @@ struct TestCombineShiftRightLanes {
    const auto expected = BitCast(d, Load(d8, expected_bytes.get()));
    HWY_ASSERT_VEC_EQ(d, expected, CombineShiftRightLanes<kLanes>(d, hi, lo));
  }

    TestCombineShiftRightLanesR<kLanes - 1>()(t, d);
#else
    (void)t;
    (void)d;
#endif  // #if HWY_TARGET != HWY_SCALAR
  }
};

#endif  // #if HWY_TARGET != HWY_SCALAR
template <>
struct TestCombineShiftRightBytesR<0> {
  template <class T, class D>
  void operator()(T /*unused*/, D /*unused*/) {}
};

template <>
struct TestCombineShiftRightLanesR<0> {
  template <class T, class D>
  void operator()(T /*unused*/, D /*unused*/) {}
};

struct TestCombineShiftRight {
  template <class T, class D>
  HWY_NOINLINE void operator()(T t, D d) {
    // Scalar does not define CombineShiftRightBytes.
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
    constexpr int kMaxBytes = HWY_MIN(16, int(MaxLanes(d) * sizeof(T)));
    constexpr int kMaxLanes = kMaxBytes / static_cast<int>(sizeof(T));
    TestCombineShiftRightBytes<kMaxBytes - 1>()(t, d);
    TestCombineShiftRightBytes<HWY_MAX(kMaxBytes / 2, 1)>()(t, d);
    TestCombineShiftRightBytes<1>()(t, d);

    TestCombineShiftRightLanes<kMaxLanes - 1>()(t, d);
    TestCombineShiftRightLanes<HWY_MAX(kMaxLanes / 2, -1)>()(t, d);
    TestCombineShiftRightLanes<1>()(t, d);
#else
    (void)t;
    (void)d;
#endif
    TestCombineShiftRightBytesR<kMaxBytes - 1>()(t, d);
    TestCombineShiftRightLanesR<kMaxBytes / int(sizeof(T)) - 1>()(t, d);
  }
};

@@ -542,13 +553,11 @@ class TestSpecialShuffle32 {
  }

 private:
  // HWY_INLINE works around a Clang SVE compiler bug where all but the first
  // 128 bits (the NEON register) of actual are zero.
  template <class D, class V>
  HWY_INLINE void VerifyLanes32(D d, VecArg<V> actual, const size_t i3,
                                const size_t i2, const size_t i1,
                                const size_t i0, const char* filename,
                                const int line) {
  HWY_NOINLINE void VerifyLanes32(D d, VecArg<V> actual, const size_t i3,
                                  const size_t i2, const size_t i1,
                                  const size_t i0, const char* filename,
                                  const int line) {
    using T = TFromD<D>;
    constexpr size_t kBlockN = 16 / sizeof(T);
    const size_t N = Lanes(d);

@@ -573,12 +582,10 @@ class TestSpecialShuffle64 {
  }

 private:
  // HWY_INLINE works around a Clang SVE compiler bug where all but the first
  // 128 bits (the NEON register) of actual are zero.
  template <class D, class V>
  HWY_INLINE void VerifyLanes64(D d, VecArg<V> actual, const size_t i1,
                                const size_t i0, const char* filename,
                                const int line) {
  HWY_NOINLINE void VerifyLanes64(D d, VecArg<V> actual, const size_t i1,
                                  const size_t i0, const char* filename,
                                  const int line) {
    using T = TFromD<D>;
    constexpr size_t kBlockN = 16 / sizeof(T);
    const size_t N = Lanes(d);

@@ -593,19 +600,19 @@ class TestSpecialShuffle64 {
};

HWY_NOINLINE void TestAllSpecialShuffles() {
  const ForGEVectors<128, TestSpecialShuffle32> test32;
  const ForGE128Vectors<TestSpecialShuffle32> test32;
  test32(uint32_t());
  test32(int32_t());
  test32(float());

#if HWY_HAVE_INTEGER64
  const ForGEVectors<128, TestSpecialShuffle64> test64;
#if HWY_CAP_INTEGER64
  const ForGE128Vectors<TestSpecialShuffle64> test64;
  test64(uint64_t());
  test64(int64_t());
#endif

#if HWY_HAVE_FLOAT64
  const ForGEVectors<128, TestSpecialShuffle64> test_d;
#if HWY_CAP_FLOAT64
  const ForGE128Vectors<TestSpecialShuffle64> test_d;
  test_d(double());
#endif
}

@@ -22,6 +22,9 @@
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"

// Not yet implemented
#if HWY_TARGET != HWY_RVV

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

@@ -82,8 +85,8 @@ struct TestLowerQuarter {
};

HWY_NOINLINE void TestAllLowerHalf() {
  ForAllTypes(ForHalfVectors<TestLowerHalf>());
  ForAllTypes(ForHalfVectors<TestLowerQuarter, 2>());
  ForAllTypes(ForDemoteVectors<TestLowerHalf>());
  ForAllTypes(ForDemoteVectors<TestLowerQuarter, 4>());
}

struct TestUpperHalf {

@@ -92,14 +95,21 @@ struct TestUpperHalf {
    // Scalar does not define UpperHalf.
#if HWY_TARGET != HWY_SCALAR
    const Half<D> d2;
    const size_t N2 = Lanes(d2);
    HWY_ASSERT(N2 * 2 == Lanes(d));
    auto expected = AllocateAligned<T>(N2);

    const auto v = Iota(d, 1);
    const size_t N = Lanes(d);
    auto lanes = AllocateAligned<T>(N);
    std::fill(lanes.get(), lanes.get() + N, T(0));

    Store(UpperHalf(d2, v), d2, lanes.get());
    size_t i = 0;
    for (; i < N2; ++i) {
      expected[i] = static_cast<T>(N2 + 1 + i);
    for (; i < Lanes(d2); ++i) {
      HWY_ASSERT_EQ(T(Lanes(d2) + 1 + i), lanes[i]);
    }
    // Other half remains unchanged
    for (; i < N; ++i) {
      HWY_ASSERT_EQ(T(0), lanes[i]);
    }
    HWY_ASSERT_VEC_EQ(d2, expected.get(), UpperHalf(d2, Iota(d, 1)));
#else
    (void)d;
#endif

@@ -107,7 +117,7 @@ struct TestUpperHalf {
};

HWY_NOINLINE void TestAllUpperHalf() {
  ForAllTypes(ForHalfVectors<TestUpperHalf>());
  ForAllTypes(ForShrinkableVectors<TestUpperHalf>());
}

struct TestZeroExtendVector {

@@ -116,23 +126,23 @@ struct TestZeroExtendVector {
    const Twice<D> d2;

    const auto v = Iota(d, 1);
    const size_t N = Lanes(d);
    const size_t N2 = Lanes(d2);
    // If equal, then N was already MaxLanes(d) and it's not clear what
    // Combine or ZeroExtendVector should return.
    if (N2 == N) return;
    HWY_ASSERT(N2 == 2 * N);
    auto lanes = AllocateAligned<T>(N2);
    Store(v, d, &lanes[0]);
    Store(v, d, &lanes[N]);
    Store(v, d, &lanes[N2 / 2]);

    const auto ext = ZeroExtendVector(d2, v);
    Store(ext, d2, lanes.get());

    size_t i = 0;
    // Lower half is unchanged
    HWY_ASSERT_VEC_EQ(d, v, Load(d, &lanes[0]));
    for (; i < N2 / 2; ++i) {
      HWY_ASSERT_EQ(T(1 + i), lanes[i]);
    }
    // Upper half is zero
    HWY_ASSERT_VEC_EQ(d, Zero(d), Load(d, &lanes[N]));
    for (; i < N2; ++i) {
      HWY_ASSERT_EQ(T(0), lanes[i]);
    }
  }
};

@@ -148,7 +158,7 @@ struct TestCombine {
    auto lanes = AllocateAligned<T>(N2);

    const auto lo = Iota(d, 1);
    const auto hi = Iota(d, static_cast<T>(N2 / 2 + 1));
    const auto hi = Iota(d, N2 / 2 + 1);
    const auto combined = Combine(d2, hi, lo);
    Store(combined, d2, lanes.get());

@@ -222,7 +232,7 @@ struct TestConcatOddEven {
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
#if HWY_TARGET != HWY_RVV && HWY_TARGET != HWY_SCALAR
    const size_t N = Lanes(d);
    const auto hi = Iota(d, static_cast<T>(N));
    const auto hi = Iota(d, N);
    const auto lo = Iota(d, 0);
    const auto even = Add(Iota(d, 0), Iota(d, 0));
    const auto odd = Add(even, Set(d, 1));

@@ -262,3 +272,7 @@ int main(int argc, char **argv) {
}

#endif  // HWY_ONCE

#else
int main(int, char**) { return 0; }
#endif  // HWY_TARGET != HWY_RVV

@@ -218,63 +218,6 @@ HWY_NOINLINE void TestAllWeakFloat() {
  ForFloatTypes(ForPartialVectors<TestWeakFloat>());
}

class TestLt128 {
  template <class D>
  static HWY_NOINLINE Vec<D> Make128(D d, uint64_t hi, uint64_t lo) {
    alignas(16) uint64_t in[2];
    in[0] = lo;
    in[1] = hi;
    return LoadDup128(d, in);
  }

 public:
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    using V = Vec<D>;
    const V v00 = Zero(d);
    const V v01 = Make128(d, 0, 1);
    const V v10 = Make128(d, 1, 0);
    const V v11 = Add(v01, v10);

    const auto mask_false = MaskFalse(d);
    const auto mask_true = MaskTrue(d);

    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v00, v00));
    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v01, v01));
    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v10, v10));

    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v00, v01));
    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v01, v10));
    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v01, v11));

    // Reversed order
    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v01, v00));
    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v10, v01));
    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v11, v01));

    // Also check 128-bit blocks are independent
    const V iota = Iota(d, 1);
    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, iota, Add(iota, v01)));
    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, iota, Add(iota, v10)));
    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, Add(iota, v01), iota));
    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, Add(iota, v10), iota));

    // Max value
    const V vm = Make128(d, LimitsMax<T>(), LimitsMax<T>());
    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, vm));
    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, v00));
    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, v01));
    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, v10));
    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, v11));
    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v00, vm));
    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v01, vm));
    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v10, vm));
    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v11, vm));
  }
};

HWY_NOINLINE void TestAllLt128() { ForGEVectors<128, TestLt128>()(uint64_t()); }

// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
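
For reference, Lt128 treats each adjacent pair of u64 lanes as one 128-bit unsigned value (lane 0 the low half, lane 1 the high half, matching how Make128 above stores them) and broadcasts the comparison result to both lanes of the block. A minimal scalar sketch of that ordering, under the assumption just stated; Lt128Scalar is a hypothetical helper, not part of this change:

#include <cstdint>

// Scalar model of the per-block comparison exercised by TestLt128:
// compare (hi, lo) pairs as single unsigned 128-bit integers.
inline bool Lt128Scalar(uint64_t a_hi, uint64_t a_lo,
                        uint64_t b_hi, uint64_t b_lo) {
  // Lexicographic over (hi, lo): high words decide unless they are equal.
  return (a_hi < b_hi) || (a_hi == b_hi && a_lo < b_lo);
}

For example, Lt128Scalar(0, 1, 1, 0) is true and the reversed order is false, matching the v01/v10 assertions above.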

@@ -289,7 +232,6 @@ HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictUnsigned);
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictInt);
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictFloat);
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllWeakFloat);
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllLt128);
}  // namespace hwy

// Ought not to be necessary, but without this, no tests run on RVV.

@@ -57,17 +57,17 @@ struct TestBitCastFrom {
    TestBitCast<uint8_t>()(t, d);
    TestBitCast<uint16_t>()(t, d);
    TestBitCast<uint32_t>()(t, d);
#if HWY_HAVE_INTEGER64
#if HWY_CAP_INTEGER64
    TestBitCast<uint64_t>()(t, d);
#endif
    TestBitCast<int8_t>()(t, d);
    TestBitCast<int16_t>()(t, d);
    TestBitCast<int32_t>()(t, d);
#if HWY_HAVE_INTEGER64
#if HWY_CAP_INTEGER64
    TestBitCast<int64_t>()(t, d);
#endif
    TestBitCast<float>()(t, d);
#if HWY_HAVE_FLOAT64
#if HWY_CAP_FLOAT64
    TestBitCast<double>()(t, d);
#endif
  }

@@ -103,39 +103,39 @@ HWY_NOINLINE void TestAllBitCast() {
  to_i32(int32_t());
  to_i32(float());

#if HWY_HAVE_INTEGER64
#if HWY_CAP_INTEGER64
  const ForPartialVectors<TestBitCast<uint64_t>> to_u64;
  to_u64(uint64_t());
  to_u64(int64_t());
#if HWY_HAVE_FLOAT64
#if HWY_CAP_FLOAT64
  to_u64(double());
#endif

  const ForPartialVectors<TestBitCast<int64_t>> to_i64;
  to_i64(uint64_t());
  to_i64(int64_t());
#if HWY_HAVE_FLOAT64
#if HWY_CAP_FLOAT64
  to_i64(double());
#endif
#endif  // HWY_HAVE_INTEGER64
#endif  // HWY_CAP_INTEGER64

  const ForPartialVectors<TestBitCast<float>> to_float;
  to_float(uint32_t());
  to_float(int32_t());
  to_float(float());

#if HWY_HAVE_FLOAT64
#if HWY_CAP_FLOAT64
  const ForPartialVectors<TestBitCast<double>> to_double;
  to_double(double());
#if HWY_HAVE_INTEGER64
#if HWY_CAP_INTEGER64
  to_double(uint64_t());
  to_double(int64_t());
#endif  // HWY_HAVE_INTEGER64
#endif  // HWY_HAVE_FLOAT64
#endif  // HWY_CAP_INTEGER64
#endif  // HWY_CAP_FLOAT64

#if HWY_TARGET != HWY_SCALAR
  // For non-scalar vectors, we can cast all types to all.
  ForAllTypes(ForGEVectors<64, TestBitCastFrom>());
  ForAllTypes(ForGE64Vectors<TestBitCastFrom>());
#endif
}

@@ -165,39 +165,39 @@ struct TestPromoteTo {
};

HWY_NOINLINE void TestAllPromoteTo() {
  const ForPromoteVectors<TestPromoteTo<uint16_t>, 1> to_u16div2;
  const ForPromoteVectors<TestPromoteTo<uint16_t>, 2> to_u16div2;
  to_u16div2(uint8_t());

  const ForPromoteVectors<TestPromoteTo<uint32_t>, 2> to_u32div4;
  const ForPromoteVectors<TestPromoteTo<uint32_t>, 4> to_u32div4;
  to_u32div4(uint8_t());

  const ForPromoteVectors<TestPromoteTo<uint32_t>, 1> to_u32div2;
  const ForPromoteVectors<TestPromoteTo<uint32_t>, 2> to_u32div2;
  to_u32div2(uint16_t());

  const ForPromoteVectors<TestPromoteTo<int16_t>, 1> to_i16div2;
  const ForPromoteVectors<TestPromoteTo<int16_t>, 2> to_i16div2;
  to_i16div2(uint8_t());
  to_i16div2(int8_t());

  const ForPromoteVectors<TestPromoteTo<int32_t>, 1> to_i32div2;
  const ForPromoteVectors<TestPromoteTo<int32_t>, 2> to_i32div2;
  to_i32div2(uint16_t());
  to_i32div2(int16_t());

  const ForPromoteVectors<TestPromoteTo<int32_t>, 2> to_i32div4;
  const ForPromoteVectors<TestPromoteTo<int32_t>, 4> to_i32div4;
  to_i32div4(uint8_t());
  to_i32div4(int8_t());

  // Must test f16/bf16 separately because we can only load/store/convert them.

#if HWY_HAVE_INTEGER64
  const ForPromoteVectors<TestPromoteTo<uint64_t>, 1> to_u64div2;
#if HWY_CAP_INTEGER64
  const ForPromoteVectors<TestPromoteTo<uint64_t>, 2> to_u64div2;
  to_u64div2(uint32_t());

  const ForPromoteVectors<TestPromoteTo<int64_t>, 1> to_i64div2;
  const ForPromoteVectors<TestPromoteTo<int64_t>, 2> to_i64div2;
  to_i64div2(int32_t());
#endif

#if HWY_HAVE_FLOAT64
  const ForPromoteVectors<TestPromoteTo<double>, 1> to_f64div2;
#if HWY_CAP_FLOAT64
  const ForPromoteVectors<TestPromoteTo<double>, 2> to_f64div2;
  to_f64div2(int32_t());
  to_f64div2(float());
#endif

@@ -213,6 +213,111 @@ bool IsFinite(T /*unused*/) {
  return true;
}

template <typename ToT>
struct TestDemoteTo {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D from_d) {
    static_assert(!IsFloat<ToT>(), "Use TestDemoteToFloat for float output");
    static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider");
    const Rebind<ToT, D> to_d;

    const size_t N = Lanes(from_d);
    auto from = AllocateAligned<T>(N);
    auto expected = AllocateAligned<ToT>(N);

    // Narrower range in the wider type, for clamping before we cast
    const T min = LimitsMin<ToT>();
    const T max = LimitsMax<ToT>();

    const auto value_ok = [&](T& value) {
      if (!IsFinite(value)) return false;
#if HWY_EMULATE_SVE
      // farm_sve just casts, which is undefined if the value is out of range.
      value = HWY_MIN(HWY_MAX(min, value), max);
#endif
      return true;
    };

    RandomState rng;
    for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
      for (size_t i = 0; i < N; ++i) {
        do {
          const uint64_t bits = rng();
          memcpy(&from[i], &bits, sizeof(T));
        } while (!value_ok(from[i]));
        expected[i] = static_cast<ToT>(HWY_MIN(HWY_MAX(min, from[i]), max));
      }

      HWY_ASSERT_VEC_EQ(to_d, expected.get(),
                        DemoteTo(to_d, Load(from_d, from.get())));
    }
  }
};
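
The expected value above is computed by clamping in the wider type before the cast. A standalone scalar sketch of that saturating narrowing, with a hypothetical name (not part of this change):

#include <algorithm>
#include <cstdint>
#include <limits>

// Saturating demotion from a wide integer type to a narrow one, mirroring
// expected[i] = static_cast<ToT>(HWY_MIN(HWY_MAX(min, from[i]), max)) above.
template <typename ToT, typename FromT>
ToT SaturatingDemote(FromT v) {
  const FromT lo = static_cast<FromT>(std::numeric_limits<ToT>::min());
  const FromT hi = static_cast<FromT>(std::numeric_limits<ToT>::max());
  return static_cast<ToT>(std::min(std::max(v, lo), hi));
}

For example, SaturatingDemote<int8_t>(int16_t{1000}) yields 127 rather than wrapping.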

HWY_NOINLINE void TestAllDemoteToInt() {
  ForDemoteVectors<TestDemoteTo<uint8_t>>()(int16_t());
  ForDemoteVectors<TestDemoteTo<uint8_t>, 4>()(int32_t());

  ForDemoteVectors<TestDemoteTo<int8_t>>()(int16_t());
  ForDemoteVectors<TestDemoteTo<int8_t>, 4>()(int32_t());

  const ForDemoteVectors<TestDemoteTo<uint16_t>> to_u16;
  to_u16(int32_t());

  const ForDemoteVectors<TestDemoteTo<int16_t>> to_i16;
  to_i16(int32_t());
}

HWY_NOINLINE void TestAllDemoteToMixed() {
#if HWY_CAP_FLOAT64
  const ForDemoteVectors<TestDemoteTo<int32_t>> to_i32;
  to_i32(double());
#endif
}

template <typename ToT>
struct TestDemoteToFloat {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D from_d) {
    // For floats, we clamp differently and cannot call LimitsMin.
    static_assert(IsFloat<ToT>(), "Use TestDemoteTo for integer output");
    static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider");
    const Rebind<ToT, D> to_d;

    const size_t N = Lanes(from_d);
    auto from = AllocateAligned<T>(N);
    auto expected = AllocateAligned<ToT>(N);

    RandomState rng;
    for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
      for (size_t i = 0; i < N; ++i) {
        do {
          const uint64_t bits = rng();
          memcpy(&from[i], &bits, sizeof(T));
        } while (!IsFinite(from[i]));
        const T magn = std::abs(from[i]);
        const T max_abs = HighestValue<ToT>();
        // NOTE: std:: version from C++11 cmath is not defined in RVV GCC, see
        // https://lists.freebsd.org/pipermail/freebsd-current/2014-January/048130.html
        const T clipped = copysign(HWY_MIN(magn, max_abs), from[i]);
        expected[i] = static_cast<ToT>(clipped);
      }

      HWY_ASSERT_VEC_EQ(to_d, expected.get(),
                        DemoteTo(to_d, Load(from_d, from.get())));
    }
  }
};

HWY_NOINLINE void TestAllDemoteToFloat() {
  // Must test f16 separately because we can only load/store/convert them.

#if HWY_CAP_FLOAT64
  const ForDemoteVectors<TestDemoteToFloat<float>, 2> to_float;
  to_float(double());
#endif
}

template <class D>
AlignedFreeUniquePtr<float[]> F16TestCases(D d, size_t& padded) {
  const float test_cases[] = {

@@ -247,7 +352,7 @@ AlignedFreeUniquePtr<float[]> F16TestCases(D d, size_t& padded) {
struct TestF16 {
  template <typename TF32, class DF32>
  HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) {
#if HWY_HAVE_FLOAT16
#if HWY_CAP_FLOAT16
    size_t padded;
    auto in = F16TestCases(d32, padded);
    using TF16 = float16_t;

@@ -301,7 +406,7 @@ AlignedFreeUniquePtr<float[]> BF16TestCases(D d, size_t& padded) {
struct TestBF16 {
  template <typename TF32, class DF32>
  HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) {
#if !defined(HWY_EMULATE_SVE)
#if HWY_TARGET != HWY_RVV
    size_t padded;
    auto in = BF16TestCases(d32, padded);
    using TBF16 = bfloat16_t;

@@ -312,7 +417,6 @@ struct TestBF16 {
#endif
    const Half<decltype(dbf16)> dbf16_half;
    const size_t N = Lanes(d32);
    HWY_ASSERT(Lanes(dbf16_half) <= N);
    auto temp16 = AllocateAligned<TBF16>(N);

    for (size_t i = 0; i < padded; i += N) {

@@ -330,6 +434,124 @@ struct TestBF16 {

HWY_NOINLINE void TestAllBF16() { ForShrinkableVectors<TestBF16>()(float()); }
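
The bf16 test cases here (and in the Reorder tests below) are chosen to be exactly representable in bfloat16, which is essentially the upper 16 bits of an IEEE binary32. A minimal scalar sketch of that correspondence; these hypothetical helpers truncate rather than round to nearest even, and are not Highway's implementation:

#include <cstdint>
#include <cstring>

// bfloat16 <-> float via bit truncation/extension of the binary32 encoding.
uint16_t F32ToBF16(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  return static_cast<uint16_t>(bits >> 16);  // keep sign, exponent, top 7 mantissa bits
}

float BF16ToF32(uint16_t h) {
  const uint32_t bits = static_cast<uint32_t>(h) << 16;  // zero the low mantissa bits
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}

Values such as 2.015625f (2 + 2^-6) fit in the 7-bit bf16 mantissa, so their round trip is exact and the vector/scalar comparison below can demand equality.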

template <class D>
AlignedFreeUniquePtr<float[]> ReorderBF16TestCases(D d, size_t& padded) {
  const float test_cases[] = {
      // Same as BF16TestCases:
      // +/- 1
      1.0f,
      -1.0f,
      // +/- 0
      0.0f,
      -0.0f,
      // near 0
      0.25f,
      -0.25f,
      // +/- integer
      4.0f,
      -32.0f,
      // positive +/- delta
      2.015625f,
      3.984375f,
      // negative +/- delta
      -2.015625f,
      -3.984375f,

      // No huge values - would interfere with sum. But add more to fill 2 * N:
      -2.0f,
      -10.0f,
      0.03125f,
      1.03125f,
      1.5f,
      2.0f,
      4.0f,
      5.0f,
      6.0f,
      8.0f,
      10.0f,
      256.0f,
      448.0f,
      2080.0f,
  };
  const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
  const size_t N = Lanes(d);
  padded = RoundUpTo(kNumTestCases, 2 * N);  // allow loading pairs of vectors
  auto in = AllocateAligned<float>(padded);
  auto expected = AllocateAligned<float>(padded);
  std::copy(test_cases, test_cases + kNumTestCases, in.get());
  std::fill(in.get() + kNumTestCases, in.get() + padded, 0.0f);
  return in;
}

class TestReorderDemote2To {
  // In-place N^2 selection sort to avoid dependencies
  void Sort(float* p, size_t count) {
    for (size_t i = 0; i < count - 1; ++i) {
      // Find min_element
      size_t idx_min = i;
      for (size_t j = i + 1; j < count; j++) {
        if (p[j] < p[idx_min]) {
          idx_min = j;
        }
      }

      // Swap with current
      const float tmp = p[i];
      p[i] = p[idx_min];
      p[idx_min] = tmp;
    }
  }

 public:
  template <typename TF32, class DF32>
  HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) {
#if HWY_TARGET != HWY_SCALAR
    size_t padded;
    auto in = ReorderBF16TestCases(d32, padded);

    using TBF16 = bfloat16_t;
    const Repartition<TBF16, DF32> dbf16;
    const Half<decltype(dbf16)> dbf16_half;
    const size_t N = Lanes(d32);
    auto temp16 = AllocateAligned<TBF16>(2 * N);
    auto expected = AllocateAligned<float>(2 * N);
    auto actual = AllocateAligned<float>(2 * N);

    for (size_t i = 0; i < padded; i += 2 * N) {
      const auto f0 = Load(d32, &in[i + 0]);
      const auto f1 = Load(d32, &in[i + N]);
      const auto v16 = ReorderDemote2To(dbf16, f0, f1);
      Store(v16, dbf16, temp16.get());
      const auto promoted0 = PromoteTo(d32, Load(dbf16_half, temp16.get() + 0));
      const auto promoted1 = PromoteTo(d32, Load(dbf16_half, temp16.get() + N));

      // Smoke test: sum should be same (with tolerance for non-associativity)
      const auto sum_expected =
          GetLane(SumOfLanes(d32, Add(promoted0, promoted1)));
      const auto sum_actual = GetLane(SumOfLanes(d32, Add(f0, f1)));
      HWY_ASSERT(sum_expected - 1E-4 <= sum_actual &&
                 sum_expected <= sum_actual + 1E-4);

      // Ensure values are the same after sorting to undo the Reorder
      Store(f0, d32, expected.get() + 0);
      Store(f1, d32, expected.get() + N);
      Store(promoted0, d32, actual.get() + 0);
      Store(promoted1, d32, actual.get() + N);
      Sort(expected.get(), 2 * N);
      Sort(actual.get(), 2 * N);
      HWY_ASSERT_VEC_EQ(d32, expected.get() + 0, Load(d32, actual.get() + 0));
      HWY_ASSERT_VEC_EQ(d32, expected.get() + N, Load(d32, actual.get() + N));
    }
#else   // HWY_SCALAR
    (void)d32;
#endif
  }
};

HWY_NOINLINE void TestAllReorderDemote2To() {
  ForShrinkableVectors<TestReorderDemote2To>()(float());
}

struct TestConvertU8 {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, const D du32) {

@@ -342,7 +564,7 @@ struct TestConvertU8 {
};

HWY_NOINLINE void TestAllConvertU8() {
  ForDemoteVectors<TestConvertU8, 2>()(uint32_t());
  ForDemoteVectors<TestConvertU8, 4>()(uint32_t());
}

// Separate function to attempt to work around a compiler bug on ARM: when this

@@ -352,23 +574,19 @@ struct TestIntFromFloatHuge {
  HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
    // Still does not work, although ARMv7 manual says that float->int
    // saturates, i.e. chooses the nearest representable value. Also causes
    // out-of-memory for MSVC.
#if HWY_TARGET != HWY_NEON && !HWY_COMPILER_MSVC
    // out-of-memory for MSVC, and unsafe cast in farm_sve.
#if HWY_TARGET != HWY_NEON && !HWY_COMPILER_MSVC && !defined(HWY_EMULATE_SVE)
    using TI = MakeSigned<TF>;
    const Rebind<TI, DF> di;

    // Workaround for incorrect 32-bit GCC codegen for SSSE3 - Print-ing
    // the expected lvalue also seems to prevent the issue.
    const size_t N = Lanes(df);
    auto expected = AllocateAligned<TI>(N);
    // Huge positive (lvalue works around GCC bug, tested with 10.2.1, where
    // the expected i32 value is otherwise 0x80..00).
    const auto expected_max = Set(di, LimitsMax<TI>());
    HWY_ASSERT_VEC_EQ(di, expected_max, ConvertTo(di, Set(df, TF(1E20))));

    // Huge positive
    Store(Set(di, LimitsMax<TI>()), di, expected.get());
    HWY_ASSERT_VEC_EQ(di, expected.get(), ConvertTo(di, Set(df, TF(1E20))));

    // Huge negative
    Store(Set(di, LimitsMin<TI>()), di, expected.get());
    HWY_ASSERT_VEC_EQ(di, expected.get(), ConvertTo(di, Set(df, TF(-1E20))));
    // Huge negative (also lvalue for safety, but GCC bug was not triggered)
    const auto expected_min = Set(di, LimitsMin<TI>());
    HWY_ASSERT_VEC_EQ(di, expected_min, ConvertTo(di, Set(df, TF(-1E20))));
#else
    (void)df;
#endif

@@ -416,6 +634,10 @@ class TestIntFromFloat {
          const uint64_t bits = rng();
          memcpy(&from[i], &bits, sizeof(TF));
        } while (!std::isfinite(from[i]));
#if defined(HWY_EMULATE_SVE)
        // farm_sve just casts, which is undefined if the value is out of range.
        from[i] = HWY_MIN(HWY_MAX(min / 2, from[i]), max / 2);
#endif
        if (from[i] >= max) {
          expected[i] = LimitsMax<TI>();
        } else if (from[i] <= min) {

@@ -503,21 +725,30 @@ struct TestI32F64 {
    const size_t N = Lanes(df);

    // Integer positive
    HWY_ASSERT_VEC_EQ(di, Iota(di, TI(4)), DemoteTo(di, Iota(df, TF(4.0))));
    HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4.0)), PromoteTo(df, Iota(di, TI(4))));

    // Integer negative
    HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), DemoteTo(di, Iota(df, -TF(N))));
    HWY_ASSERT_VEC_EQ(df, Iota(df, -TF(N)), PromoteTo(df, Iota(di, -TI(N))));

    // Above positive
    HWY_ASSERT_VEC_EQ(di, Iota(di, TI(2)), DemoteTo(di, Iota(df, TF(2.001))));
    HWY_ASSERT_VEC_EQ(df, Iota(df, TF(2.0)), PromoteTo(df, Iota(di, TI(2))));

    // Below positive
    HWY_ASSERT_VEC_EQ(di, Iota(di, TI(3)), DemoteTo(di, Iota(df, TF(3.9999))));
    HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4.0)), PromoteTo(df, Iota(di, TI(4))));

    const TF eps = static_cast<TF>(0.0001);
    // Above negative
    HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)),
                      DemoteTo(di, Iota(df, -TF(N + 1) + eps)));
    HWY_ASSERT_VEC_EQ(df, Iota(df, TF(-4.0)), PromoteTo(df, Iota(di, TI(-4))));

    // Below negative
    HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N + 1)),
                      DemoteTo(di, Iota(df, -TF(N + 1) - eps)));
    HWY_ASSERT_VEC_EQ(df, Iota(df, TF(-2.0)), PromoteTo(df, Iota(di, TI(-2))));

    // Max positive int

@@ -527,11 +758,22 @@ struct TestI32F64 {
    // Min negative int
    HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMin<TI>())),
                      PromoteTo(df, Set(di, LimitsMin<TI>())));

    // farm_sve just casts, which is undefined if the value is out of range.
#if !defined(HWY_EMULATE_SVE)
    // Huge positive float
    HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMax<TI>()),
                      DemoteTo(di, Set(df, TF(1E12))));

    // Huge negative float
    HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMin<TI>()),
                      DemoteTo(di, Set(df, TF(-1E12))));
#endif
  }
};

HWY_NOINLINE void TestAllI32F64() {
#if HWY_HAVE_FLOAT64
#if HWY_CAP_FLOAT64
  ForDemoteVectors<TestI32F64>()(double());
#endif
}

@@ -548,8 +790,12 @@ namespace hwy {
HWY_BEFORE_TEST(HwyConvertTest);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllBitCast);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllPromoteTo);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllDemoteToInt);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllDemoteToMixed);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllDemoteToFloat);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllF16);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllBF16);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllReorderDemote2To);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllConvertU8);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllIntFromFloat);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllFloatFromInt);

@@ -74,7 +74,7 @@ class TestAES {
    }

    for (size_t i = 0; i < 256; i += N) {
      const auto in = Iota(d, static_cast<T>(i));
      const auto in = Iota(d, i);
      HWY_ASSERT_VEC_EQ(d, expected.get() + i, detail::SubBytes(in));
    }
  }

@@ -89,17 +89,11 @@ class TestAES {
        0x42, 0xCA, 0x6B, 0x99, 0x7A, 0x5C, 0x58, 0x16};
    const auto test = LoadDup128(d, test_lanes);

    // = ShiftRow result
    alignas(16) constexpr uint8_t expected_sr_lanes[16] = {
        0x09, 0x28, 0x7F, 0x47, 0x6F, 0x74, 0x6A, 0xBF,
        0x2C, 0x4A, 0x62, 0x04, 0xDA, 0x08, 0xE3, 0xEE};
    const auto expected_sr = LoadDup128(d, expected_sr_lanes);

    // = MixColumn result
    alignas(16) constexpr uint8_t expected_mc_lanes[16] = {
    alignas(16) constexpr uint8_t expected0_lanes[16] = {
        0x52, 0x9F, 0x16, 0xC2, 0x97, 0x86, 0x15, 0xCA,
        0xE0, 0x1A, 0xAE, 0x54, 0xBA, 0x1A, 0x26, 0x59};
    const auto expected_mc = LoadDup128(d, expected_mc_lanes);
    const auto expected0 = LoadDup128(d, expected0_lanes);

    // = KeyAddition result
    alignas(16) constexpr uint8_t expected_lanes[16] = {

@@ -109,20 +103,17 @@ class TestAES {

    alignas(16) uint8_t key_lanes[16];
    for (size_t i = 0; i < 16; ++i) {
      key_lanes[i] = expected_mc_lanes[i] ^ expected_lanes[i];
      key_lanes[i] = expected0_lanes[i] ^ expected_lanes[i];
    }
    const auto round_key = LoadDup128(d, key_lanes);

    HWY_ASSERT_VEC_EQ(d, expected_mc, AESRound(test, Zero(d)));
    HWY_ASSERT_VEC_EQ(d, expected0, AESRound(test, Zero(d)));
    HWY_ASSERT_VEC_EQ(d, expected, AESRound(test, round_key));
    HWY_ASSERT_VEC_EQ(d, expected_sr, AESLastRound(test, Zero(d)));
    HWY_ASSERT_VEC_EQ(d, Xor(expected_sr, round_key),
                      AESLastRound(test, round_key));

    TestSBox(t, d);
  }
};
HWY_NOINLINE void TestAllAES() { ForGEVectors<128, TestAES>()(uint8_t()); }
HWY_NOINLINE void TestAllAES() { ForGE128Vectors<TestAES>()(uint8_t()); }

#else
HWY_NOINLINE void TestAllAES() {}

@@ -132,7 +123,7 @@ struct TestCLMul {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    // needs 64 bit lanes and 128-bit result
#if HWY_TARGET != HWY_SCALAR && HWY_HAVE_INTEGER64
#if HWY_TARGET != HWY_SCALAR && HWY_CAP_INTEGER64
    const size_t N = Lanes(d);
    if (N == 1) return;

@@ -534,7 +525,7 @@ struct TestCLMul {
  }
};

HWY_NOINLINE void TestAllCLMul() { ForGEVectors<128, TestCLMul>()(uint64_t()); }
HWY_NOINLINE void TestAllCLMul() { ForGE128Vectors<TestCLMul>()(uint64_t()); }

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
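
For reference, CLMul is carry-less (polynomial) multiplication over GF(2): the product is the XOR of shifted copies of one operand, one per set bit of the other, with no carry propagation. A scalar sketch of the low 64 bits of the 128-bit product; CLMulLower64 is a hypothetical helper, not the test's own reference:

#include <cstdint>

// Carry-less multiply: XOR together a << i for every set bit i of b.
// Bits shifted past position 63 belong to the upper half of the product,
// so this returns only the low 64 bits.
uint64_t CLMulLower64(uint64_t a, uint64_t b) {
  uint64_t result = 0;
  for (int i = 0; i < 64; ++i) {
    if ((b >> i) & 1) result ^= a << i;
  }
  return result;
}
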
@@ -1,333 +0,0 @@
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/demote_test.cc"
#include "hwy/foreach_target.h"

#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"

// Causes build timeout.
#if !HWY_IS_MSAN

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

template <typename T, HWY_IF_FLOAT(T)>
bool IsFinite(T t) {
  return std::isfinite(t);
}
// Wrapper avoids calling std::isfinite for integer types (ambiguous).
template <typename T, HWY_IF_NOT_FLOAT(T)>
bool IsFinite(T /*unused*/) {
  return true;
}

template <typename ToT>
struct TestDemoteTo {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D from_d) {
    static_assert(!IsFloat<ToT>(), "Use TestDemoteToFloat for float output");
    static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider");
    const Rebind<ToT, D> to_d;

    const size_t N = Lanes(from_d);
    auto from = AllocateAligned<T>(N);
    auto expected = AllocateAligned<ToT>(N);

    // Narrower range in the wider type, for clamping before we cast
    const T min = LimitsMin<ToT>();
    const T max = LimitsMax<ToT>();

    const auto value_ok = [&](T& value) {
      if (!IsFinite(value)) return false;
      return true;
    };

    RandomState rng;
    for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
      for (size_t i = 0; i < N; ++i) {
        do {
          const uint64_t bits = rng();
          memcpy(&from[i], &bits, sizeof(T));
        } while (!value_ok(from[i]));
        expected[i] = static_cast<ToT>(HWY_MIN(HWY_MAX(min, from[i]), max));
      }

      const auto in = Load(from_d, from.get());
      HWY_ASSERT_VEC_EQ(to_d, expected.get(), DemoteTo(to_d, in));
    }
  }
};

HWY_NOINLINE void TestAllDemoteToInt() {
  ForDemoteVectors<TestDemoteTo<uint8_t>>()(int16_t());
  ForDemoteVectors<TestDemoteTo<uint8_t>, 2>()(int32_t());

  ForDemoteVectors<TestDemoteTo<int8_t>>()(int16_t());
  ForDemoteVectors<TestDemoteTo<int8_t>, 2>()(int32_t());

  const ForDemoteVectors<TestDemoteTo<uint16_t>> to_u16;
  to_u16(int32_t());

  const ForDemoteVectors<TestDemoteTo<int16_t>> to_i16;
  to_i16(int32_t());
}

HWY_NOINLINE void TestAllDemoteToMixed() {
#if HWY_HAVE_FLOAT64
  const ForDemoteVectors<TestDemoteTo<int32_t>> to_i32;
  to_i32(double());
#endif
}

template <typename ToT>
struct TestDemoteToFloat {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D from_d) {
    // For floats, we clamp differently and cannot call LimitsMin.
    static_assert(IsFloat<ToT>(), "Use TestDemoteTo for integer output");
    static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider");
    const Rebind<ToT, D> to_d;

    const size_t N = Lanes(from_d);
    auto from = AllocateAligned<T>(N);
    auto expected = AllocateAligned<ToT>(N);

    RandomState rng;
    for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
      for (size_t i = 0; i < N; ++i) {
        do {
          const uint64_t bits = rng();
          memcpy(&from[i], &bits, sizeof(T));
        } while (!IsFinite(from[i]));
        const T magn = std::abs(from[i]);
        const T max_abs = HighestValue<ToT>();
        // NOTE: std:: version from C++11 cmath is not defined in RVV GCC, see
        // https://lists.freebsd.org/pipermail/freebsd-current/2014-January/048130.html
        const T clipped = copysign(HWY_MIN(magn, max_abs), from[i]);
        expected[i] = static_cast<ToT>(clipped);
      }

      HWY_ASSERT_VEC_EQ(to_d, expected.get(),
                        DemoteTo(to_d, Load(from_d, from.get())));
    }
  }
};

HWY_NOINLINE void TestAllDemoteToFloat() {
  // Must test f16 separately because we can only load/store/convert them.

#if HWY_HAVE_FLOAT64
  const ForDemoteVectors<TestDemoteToFloat<float>, 1> to_float;
  to_float(double());
#endif
}

template <class D>
AlignedFreeUniquePtr<float[]> ReorderBF16TestCases(D d, size_t& padded) {
  const float test_cases[] = {
      // Same as BF16TestCases:
      // +/- 1
      1.0f,
      -1.0f,
      // +/- 0
      0.0f,
      -0.0f,
      // near 0
      0.25f,
      -0.25f,
      // +/- integer
      4.0f,
      -32.0f,
      // positive +/- delta
      2.015625f,
      3.984375f,
      // negative +/- delta
      -2.015625f,
      -3.984375f,

      // No huge values - would interfere with sum. But add more to fill 2 * N:
      -2.0f,
      -10.0f,
      0.03125f,
      1.03125f,
      1.5f,
      2.0f,
      4.0f,
      5.0f,
      6.0f,
      8.0f,
      10.0f,
      256.0f,
      448.0f,
      2080.0f,
  };
  const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
  const size_t N = Lanes(d);
  padded = RoundUpTo(kNumTestCases, 2 * N);  // allow loading pairs of vectors
  auto in = AllocateAligned<float>(padded);
  auto expected = AllocateAligned<float>(padded);
  std::copy(test_cases, test_cases + kNumTestCases, in.get());
  std::fill(in.get() + kNumTestCases, in.get() + padded, 0.0f);
  return in;
}

class TestReorderDemote2To {
  // In-place N^2 selection sort to avoid dependencies
  void Sort(float* p, size_t count) {
    for (size_t i = 0; i < count - 1; ++i) {
      // Find min_element
      size_t idx_min = i;
      for (size_t j = i + 1; j < count; j++) {
        if (p[j] < p[idx_min]) {
          idx_min = j;
        }
      }

      // Swap with current
      const float tmp = p[i];
      p[i] = p[idx_min];
      p[idx_min] = tmp;
    }
  }

 public:
  template <typename TF32, class DF32>
  HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) {
#if HWY_TARGET != HWY_SCALAR
    size_t padded;
    auto in = ReorderBF16TestCases(d32, padded);

    using TBF16 = bfloat16_t;
    const Repartition<TBF16, DF32> dbf16;
    const Half<decltype(dbf16)> dbf16_half;
    const size_t N = Lanes(d32);
    auto temp16 = AllocateAligned<TBF16>(2 * N);
    auto expected = AllocateAligned<float>(2 * N);
    auto actual = AllocateAligned<float>(2 * N);

    for (size_t i = 0; i < padded; i += 2 * N) {
      const auto f0 = Load(d32, &in[i + 0]);
      const auto f1 = Load(d32, &in[i + N]);
      const auto v16 = ReorderDemote2To(dbf16, f0, f1);
      Store(v16, dbf16, temp16.get());
      const auto promoted0 = PromoteTo(d32, Load(dbf16_half, temp16.get() + 0));
      const auto promoted1 = PromoteTo(d32, Load(dbf16_half, temp16.get() + N));

      // Smoke test: sum should be same (with tolerance for non-associativity)
      const auto sum_expected =
          GetLane(SumOfLanes(d32, Add(promoted0, promoted1)));
      const auto sum_actual = GetLane(SumOfLanes(d32, Add(f0, f1)));
      HWY_ASSERT(sum_expected - 1E-4 <= sum_actual &&
                 sum_expected <= sum_actual + 1E-4);

      // Ensure values are the same after sorting to undo the Reorder
      Store(f0, d32, expected.get() + 0);
      Store(f1, d32, expected.get() + N);
      Store(promoted0, d32, actual.get() + 0);
      Store(promoted1, d32, actual.get() + N);
      Sort(expected.get(), 2 * N);
      Sort(actual.get(), 2 * N);
      HWY_ASSERT_VEC_EQ(d32, expected.get() + 0, Load(d32, actual.get() + 0));
      HWY_ASSERT_VEC_EQ(d32, expected.get() + N, Load(d32, actual.get() + N));
    }
#else   // HWY_SCALAR
    (void)d32;
#endif
  }
};

HWY_NOINLINE void TestAllReorderDemote2To() {
  ForShrinkableVectors<TestReorderDemote2To>()(float());
}

struct TestI32F64 {
  template <typename TF, class DF>
  HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
    using TI = int32_t;
    const Rebind<TI, DF> di;
    const size_t N = Lanes(df);

    // Integer positive
    HWY_ASSERT_VEC_EQ(di, Iota(di, TI(4)), DemoteTo(di, Iota(df, TF(4.0))));

    // Integer negative
    HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), DemoteTo(di, Iota(df, -TF(N))));

    // Above positive
    HWY_ASSERT_VEC_EQ(di, Iota(di, TI(2)), DemoteTo(di, Iota(df, TF(2.001))));

    // Below positive
    HWY_ASSERT_VEC_EQ(di, Iota(di, TI(3)), DemoteTo(di, Iota(df, TF(3.9999))));

    const TF eps = static_cast<TF>(0.0001);
    // Above negative
    HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)),
                      DemoteTo(di, Iota(df, -TF(N + 1) + eps)));

    // Below negative
    HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N + 1)),
                      DemoteTo(di, Iota(df, -TF(N + 1) - eps)));

    // Huge positive float
    HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMax<TI>()),
                      DemoteTo(di, Set(df, TF(1E12))));

    // Huge negative float
    HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMin<TI>()),
                      DemoteTo(di, Set(df, TF(-1E12))));
  }
};

HWY_NOINLINE void TestAllI32F64() {
#if HWY_HAVE_FLOAT64
  ForDemoteVectors<TestI32F64>()(double());
#endif
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#endif  // !HWY_IS_MSAN

#if HWY_ONCE

namespace hwy {
#if !HWY_IS_MSAN
HWY_BEFORE_TEST(HwyDemoteTest);
HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToInt);
HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToMixed);
HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToFloat);
HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllReorderDemote2To);
HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllI32F64);
#endif  // !HWY_IS_MSAN
}  // namespace hwy

// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}

#endif

@@ -17,6 +17,7 @@
#include <string.h>  // memcmp

#include "hwy/aligned_allocator.h"
#include "hwy/base.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/logical_test.cc"

@@ -58,15 +59,6 @@ struct TestLogicalInteger {
    HWY_ASSERT_VEC_EQ(d, v0, AndNot(vi, v0));
    HWY_ASSERT_VEC_EQ(d, v0, AndNot(vi, vi));

    HWY_ASSERT_VEC_EQ(d, v0, OrAnd(v0, v0, v0));
    HWY_ASSERT_VEC_EQ(d, v0, OrAnd(v0, vi, v0));
    HWY_ASSERT_VEC_EQ(d, v0, OrAnd(v0, v0, vi));
    HWY_ASSERT_VEC_EQ(d, vi, OrAnd(v0, vi, vi));
    HWY_ASSERT_VEC_EQ(d, vi, OrAnd(vi, v0, v0));
    HWY_ASSERT_VEC_EQ(d, vi, OrAnd(vi, vi, v0));
    HWY_ASSERT_VEC_EQ(d, vi, OrAnd(vi, v0, vi));
    HWY_ASSERT_VEC_EQ(d, vi, OrAnd(vi, vi, vi));

    auto v = vi;
    v = And(v, vi);
    HWY_ASSERT_VEC_EQ(d, vi, v);

@@ -164,43 +156,6 @@ struct TestCopySign {
  }
};

struct TestIfVecThenElse {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    RandomState rng;

    using TU = MakeUnsigned<T>;  // For all-one mask
    const Rebind<TU, D> du;
    const size_t N = Lanes(d);
    auto in1 = AllocateAligned<T>(N);
    auto in2 = AllocateAligned<T>(N);
    auto vec_lanes = AllocateAligned<TU>(N);
    auto expected = AllocateAligned<T>(N);

    // Each lane should have a chance of having mask=true.
    for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
      for (size_t i = 0; i < N; ++i) {
        in1[i] = static_cast<T>(Random32(&rng));
        in2[i] = static_cast<T>(Random32(&rng));
        vec_lanes[i] = (Random32(&rng) & 16) ? static_cast<TU>(~TU(0)) : TU(0);
      }

      const auto v1 = Load(d, in1.get());
      const auto v2 = Load(d, in2.get());
      const auto vec = BitCast(d, Load(du, vec_lanes.get()));

      for (size_t i = 0; i < N; ++i) {
        expected[i] = vec_lanes[i] ? in1[i] : in2[i];
      }
      HWY_ASSERT_VEC_EQ(d, expected.get(), IfVecThenElse(vec, v1, v2));
    }
  }
};
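
IfVecThenElse is a bitwise select: each result bit comes from v1 where the first operand has a 1 and from v2 where it has a 0, which is why the test fills lanes with all-ones or all-zeros so that the bitwise select reduces to the lane-wise ?: used for `expected`. A scalar sketch of the identity; BitSelect is a hypothetical name, not part of this change:

#include <cstdint>

// Bitwise select: (mask & yes) | (~mask & no). With per-lane masks that are
// entirely ones or zeros, this picks whole lanes, matching the test's oracle.
uint32_t BitSelect(uint32_t mask, uint32_t yes, uint32_t no) {
  return (mask & yes) | (~mask & no);
}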

HWY_NOINLINE void TestAllIfVecThenElse() {
  ForAllTypes(ForPartialVectors<TestIfVecThenElse>());
}

HWY_NOINLINE void TestAllCopySign() {
  ForFloatTypes(ForPartialVectors<TestCopySign>());
}

@@ -225,31 +180,6 @@ HWY_NOINLINE void TestAllZeroIfNegative() {
  ForFloatTypes(ForPartialVectors<TestZeroIfNegative>());
}

struct TestIfNegative {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto v0 = Zero(d);
    const auto vp = Iota(d, 1);
    const auto vn = Or(vp, SignBit(d));

    // Zero and positive remain unchanged
    HWY_ASSERT_VEC_EQ(d, v0, IfNegativeThenElse(v0, vn, v0));
    HWY_ASSERT_VEC_EQ(d, vn, IfNegativeThenElse(v0, v0, vn));
    HWY_ASSERT_VEC_EQ(d, vp, IfNegativeThenElse(vp, vn, vp));
    HWY_ASSERT_VEC_EQ(d, vn, IfNegativeThenElse(vp, vp, vn));

    // Negative are replaced with 2nd arg
    HWY_ASSERT_VEC_EQ(d, v0, IfNegativeThenElse(vn, v0, vp));
    HWY_ASSERT_VEC_EQ(d, vn, IfNegativeThenElse(vn, vn, v0));
    HWY_ASSERT_VEC_EQ(d, vp, IfNegativeThenElse(vn, vp, vn));
  }
};

HWY_NOINLINE void TestAllIfNegative() {
  ForFloatTypes(ForPartialVectors<TestIfNegative>());
  ForSignedTypes(ForPartialVectors<TestIfNegative>());
}

struct TestBroadcastSignBit {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {

@@ -304,11 +234,16 @@ HWY_NOINLINE void TestAllTestBit() {
struct TestPopulationCount {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
#if HWY_TARGET == HWY_RVV || HWY_IS_DEBUG_BUILD
    constexpr size_t kNumTests = 1 << 14;
#else
    constexpr size_t kNumTests = 1 << 20;
#endif
    RandomState rng;
    size_t N = Lanes(d);
    auto data = AllocateAligned<T>(N);
    auto popcnt = AllocateAligned<T>(N);
    for (size_t i = 0; i < AdjustedReps(1 << 18) / N; i++) {
    for (size_t i = 0; i < kNumTests / N; i++) {
      for (size_t i = 0; i < N; i++) {
        data[i] = static_cast<T>(rng());
        popcnt[i] = static_cast<T>(PopCount(data[i]));

@@ -333,10 +268,8 @@ namespace hwy {
HWY_BEFORE_TEST(HwyLogicalTest);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalInteger);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalFloat);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllIfVecThenElse);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCopySign);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllZeroIfNegative);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllIfNegative);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllBroadcastSignBit);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllTestBit);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllPopulationCount);

@@ -17,6 +17,8 @@
#include <stdint.h>
#include <string.h>  // memcmp

#include "hwy/base.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/mask_test.cc"
#include "hwy/foreach_target.h"

@@ -53,18 +55,13 @@ struct TestFirstN {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);

    const RebindToSigned<D> di;
    using TI = TFromD<decltype(di)>;
    using TN = SignedFromSize<HWY_MIN(sizeof(size_t), sizeof(TI))>;
    const size_t max_len = static_cast<size_t>(LimitsMax<TN>());

    // TODO(janwas): 8-bit FirstN (using SlideUp) causes spike to freeze.
#if HWY_TARGET == HWY_RVV
    if (sizeof(T) == 1) return;
#endif

    const size_t max_lanes = AdjustedReps(HWY_MIN(2 * N, size_t(64)));
    for (size_t len = 0; len <= HWY_MIN(max_lanes, max_len); ++len) {
    for (size_t len = 0; len <= HWY_MIN(2 * N, max_len); ++len) {
      const auto expected =
          RebindMask(d, Lt(Iota(di, 0), Set(di, static_cast<TI>(len))));
      const auto actual = FirstN(d, len);

@@ -371,7 +368,7 @@ struct TestFindFirstTrue {
    memset(bool_lanes.get(), 0, N * sizeof(TI));

    // For all combinations of zero/nonzero state of subset of lanes:
    const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(9)));
    const size_t max_lanes = HWY_MIN(N, size_t(10));

    HWY_ASSERT_EQ(intptr_t(-1), FindFirstTrue(d, MaskFalse(d)));
    HWY_ASSERT_EQ(intptr_t(0), FindFirstTrue(d, MaskTrue(d)));

@@ -410,7 +407,7 @@ struct TestLogicalMask {
    HWY_ASSERT_MASK_EQ(d, m_all, Not(m0));

    // For all combinations of zero/nonzero state of subset of lanes:
    const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(6)));
    const size_t max_lanes = HWY_MIN(N, size_t(6));
    for (size_t code = 0; code < (1ull << max_lanes); ++code) {
      for (size_t i = 0; i < max_lanes; ++i) {
        bool_lanes[i] = (code & (1ull << i)) ? TI(1) : TI(0);

@@ -36,7 +36,7 @@ struct TestLoadStore {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    const auto hi = Iota(d, static_cast<T>(1 + N));
    const auto hi = Iota(d, 1 + N);
    const auto lo = Iota(d, 1);
    auto lanes = AllocateAligned<T>(2 * N);
    Store(hi, d, &lanes[N]);

@@ -135,7 +135,7 @@ struct TestStoreInterleaved3 {
HWY_NOINLINE void TestAllStoreInterleaved3() {
#if HWY_TARGET == HWY_RVV
  // Segments are limited to 8 registers, so we can only go up to LMUL=2.
  const ForExtendableVectors<TestStoreInterleaved3, 2> test;
  const ForExtendableVectors<TestStoreInterleaved3, 4> test;
#else
  const ForPartialVectors<TestStoreInterleaved3> test;
#endif

@@ -198,7 +198,7 @@ struct TestStoreInterleaved4 {
HWY_NOINLINE void TestAllStoreInterleaved4() {
#if HWY_TARGET == HWY_RVV
  // Segments are limited to 8 registers, so we can only go up to LMUL=2.
  const ForExtendableVectors<TestStoreInterleaved4, 2> test;
  const ForExtendableVectors<TestStoreInterleaved4, 4> test;
#else
  const ForPartialVectors<TestStoreInterleaved4> test;
#endif

@@ -230,7 +230,7 @@ struct TestLoadDup128 {
};

HWY_NOINLINE void TestAllLoadDup128() {
  ForAllTypes(ForGEVectors<128, TestLoadDup128>());
  ForAllTypes(ForGE128Vectors<TestLoadDup128>());
}

struct TestStream {

@@ -245,7 +245,7 @@ struct TestStream {
    std::fill(out.get(), out.get() + 2 * affected_lanes, T(0));

    Stream(v, d, out.get());
    FlushStream();
    StoreFence();
    const auto actual = Load(d, out.get());
    HWY_ASSERT_VEC_EQ(d, v, actual);
    // Ensure Stream didn't modify more memory than expected

@@ -386,7 +386,7 @@ HWY_NOINLINE void TestAllGather() {

HWY_NOINLINE void TestAllCache() {
  LoadFence();
  FlushStream();
  StoreFence();
  int test = 0;
  Prefetch(&test);
  FlushCacheline(&test);

@@ -1,433 +0,0 @@
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>

#include <algorithm>
#include <limits>

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/shift_test.cc"
#include "hwy/foreach_target.h"
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

template <bool kSigned>
struct TestLeftShifts {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T t, D d) {
    if (kSigned) {
      // Also test positive values
      TestLeftShifts</*kSigned=*/false>()(t, d);
    }

    using TI = MakeSigned<T>;
    using TU = MakeUnsigned<T>;
    const size_t N = Lanes(d);
    auto expected = AllocateAligned<T>(N);

    const auto values = Iota(d, kSigned ? -TI(N) : TI(0));  // value to shift
    constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;

    // 0
    HWY_ASSERT_VEC_EQ(d, values, ShiftLeft<0>(values));
    HWY_ASSERT_VEC_EQ(d, values, ShiftLeftSame(values, 0));

    // 1
    for (size_t i = 0; i < N; ++i) {
      const T value = kSigned ? T(T(i) - T(N)) : T(i);
      expected[i] = T(TU(value) << 1);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<1>(values));
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, 1));

    // max
    for (size_t i = 0; i < N; ++i) {
      const T value = kSigned ? T(T(i) - T(N)) : T(i);
      expected[i] = T(TU(value) << kMaxShift);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<kMaxShift>(values));
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, kMaxShift));
  }
};
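
Note how the expected values are computed as T(TU(value) << 1): shifting in the unsigned type avoids undefined behavior when a signed left shift would overflow, then converts back. A one-line scalar sketch of the pattern, with a hypothetical name:

#include <cstdint>

// Left shift without signed-overflow UB: widen to unsigned, shift, narrow.
// The conversion back to the signed type wraps modulo 2^32
// (implementation-defined pre-C++20, two's complement in practice).
int32_t ShiftLeftPortable(int32_t v, int amount) {
  return static_cast<int32_t>(static_cast<uint32_t>(v) << amount);
}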

template <bool kSigned>
struct TestVariableLeftShifts {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T t, D d) {
    if (kSigned) {
      // Also test positive values
      TestVariableLeftShifts</*kSigned=*/false>()(t, d);
    }

    using TI = MakeSigned<T>;
    using TU = MakeUnsigned<T>;
    const size_t N = Lanes(d);
    auto expected = AllocateAligned<T>(N);

    const auto v0 = Zero(d);
    const auto v1 = Set(d, 1);
    const auto values = Iota(d, kSigned ? -TI(N) : TI(0));  // value to shift

    constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
    const auto max_shift = Set(d, kMaxShift);
    const auto small_shifts = And(Iota(d, 0), max_shift);
    const auto large_shifts = max_shift - small_shifts;

    // Same: 0
    HWY_ASSERT_VEC_EQ(d, values, Shl(values, v0));

    // Same: 1
    for (size_t i = 0; i < N; ++i) {
      const T value = kSigned ? T(i) - T(N) : T(i);
      expected[i] = T(TU(value) << 1);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, v1));

    // Same: max
    for (size_t i = 0; i < N; ++i) {
      const T value = kSigned ? T(i) - T(N) : T(i);
      expected[i] = T(TU(value) << kMaxShift);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, max_shift));

    // Variable: small
    for (size_t i = 0; i < N; ++i) {
      const T value = kSigned ? T(i) - T(N) : T(i);
      expected[i] = T(TU(value) << (i & kMaxShift));
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, small_shifts));

    // Variable: large
    for (size_t i = 0; i < N; ++i) {
      expected[i] = T(TU(1) << (kMaxShift - (i & kMaxShift)));
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(v1, large_shifts));
  }
};

struct TestUnsignedRightShifts {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    auto expected = AllocateAligned<T>(N);

    const auto values = Iota(d, 0);

    const T kMax = LimitsMax<T>();
    constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;

    // Shift by 0
    HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values));
    HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0));

    // Shift by 1
    for (size_t i = 0; i < N; ++i) {
      expected[i] = T(T(i & kMax) >> 1);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values));
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1));

    // max
    for (size_t i = 0; i < N; ++i) {
      expected[i] = T(T(i & kMax) >> kMaxShift);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<kMaxShift>(values));
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, kMaxShift));
  }
};

struct TestRotateRight {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    auto expected = AllocateAligned<T>(N);

    constexpr size_t kBits = sizeof(T) * 8;
    const auto mask_shift = Set(d, T{kBits});
    // Cover as many bit positions as possible to test shifting out
    const auto values = Shl(Set(d, T{1}), And(Iota(d, 0), mask_shift));

    // Rotate by 0
    HWY_ASSERT_VEC_EQ(d, values, RotateRight<0>(values));

    // Rotate by 1
    Store(values, d, expected.get());
    for (size_t i = 0; i < N; ++i) {
      expected[i] = (expected[i] >> 1) | (expected[i] << (kBits - 1));
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<1>(values));

    // Rotate by half
    Store(values, d, expected.get());
    for (size_t i = 0; i < N; ++i) {
      expected[i] = (expected[i] >> (kBits / 2)) | (expected[i] << (kBits / 2));
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<kBits / 2>(values));

    // Rotate by max
    Store(values, d, expected.get());
    for (size_t i = 0; i < N; ++i) {
      expected[i] = (expected[i] >> (kBits - 1)) | (expected[i] << 1);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<kBits - 1>(values));
  }
};

struct TestVariableUnsignedRightShifts {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    auto expected = AllocateAligned<T>(N);

    const auto v0 = Zero(d);
    const auto v1 = Set(d, 1);
    const auto values = Iota(d, 0);

    const T kMax = LimitsMax<T>();
    const auto max = Set(d, kMax);

    constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
    const auto max_shift = Set(d, kMaxShift);
    const auto small_shifts = And(Iota(d, 0), max_shift);
    const auto large_shifts = max_shift - small_shifts;

    // Same: 0
    HWY_ASSERT_VEC_EQ(d, values, Shr(values, v0));

    // Same: 1
    for (size_t i = 0; i < N; ++i) {
      expected[i] = T(T(i & kMax) >> 1);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(values, v1));

    // Same: max
    HWY_ASSERT_VEC_EQ(d, v0, Shr(values, max_shift));

    // Variable: small
    for (size_t i = 0; i < N; ++i) {
      expected[i] = T(i) >> (i & kMaxShift);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(values, small_shifts));

    // Variable: Large
    for (size_t i = 0; i < N; ++i) {
      expected[i] = kMax >> (kMaxShift - (i & kMaxShift));
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(max, large_shifts));
  }
};

template <int kAmount, typename T>
T RightShiftNegative(T val) {
  // C++ shifts are implementation-defined for negative numbers, and we have
  // seen divisions replaced with shifts, so resort to bit operations.
  using TU = hwy::MakeUnsigned<T>;
  TU bits;
  CopyBytes<sizeof(T)>(&val, &bits);

  const TU shifted = TU(bits >> kAmount);

  const TU all = TU(~TU(0));
  const size_t num_zero = sizeof(TU) * 8 - 1 - kAmount;
  const TU sign_extended = static_cast<TU>((all << num_zero) & LimitsMax<TU>());

  bits = shifted | sign_extended;
  CopyBytes<sizeof(T)>(&bits, &val);
  return val;
}
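
Right shifts of negative signed values are implementation-defined before C++20 (only C++20 guarantees arithmetic shift), so the helper above rebuilds the arithmetic shift from unsigned operations plus manual sign extension. A quick sanity check, assuming the RightShiftNegative definition above is in scope; this hypothetical harness is not part of the diff:

#include <cassert>
#include <cstdint>

int main() {
  // Arithmetic shift of -8 by 1 is -4; the helper computes this portably.
  assert(RightShiftNegative<1>(int32_t{-8}) == -4);
  // Sign bits fill in from the left, so -1 stays -1 for any shift amount.
  assert(RightShiftNegative<2>(int32_t{-1}) == -1);
  return 0;
}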
|
||||
|
||||
class TestSignedRightShifts {
|
||||
public:
|
||||
template <typename T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
const size_t N = Lanes(d);
|
||||
auto expected = AllocateAligned<T>(N);
|
||||
constexpr T kMin = LimitsMin<T>();
|
||||
constexpr T kMax = LimitsMax<T>();
|
||||
constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
|
||||
|
||||
// First test positive values, negative are checked below.
|
||||
const auto v0 = Zero(d);
|
||||
const auto values = And(Iota(d, 0), Set(d, kMax));
|
||||
|
||||
// Shift by 0
|
||||
HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values));
|
||||
HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0));
|
||||
|
||||
// Shift by 1
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
expected[i] = T(T(i & kMax) >> 1);
|
||||
}
|
||||
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values));
|
||||
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1));
|
||||
|
||||
// max
|
||||
HWY_ASSERT_VEC_EQ(d, v0, ShiftRight<kMaxShift>(values));
|
||||
HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(values, kMaxShift));
|
||||
|
||||
// Even negative value
|
||||
Test<0>(kMin, d, __LINE__);
|
||||
Test<1>(kMin, d, __LINE__);
|
||||
Test<2>(kMin, d, __LINE__);
|
||||
Test<kMaxShift>(kMin, d, __LINE__);
|
||||
|
||||
const T odd = static_cast<T>(kMin + 1);
|
||||
Test<0>(odd, d, __LINE__);
|
||||
Test<1>(odd, d, __LINE__);
|
||||
Test<2>(odd, d, __LINE__);
|
||||
Test<kMaxShift>(odd, d, __LINE__);
|
||||
}
|
||||
 private:
  template <int kAmount, typename T, class D>
  void Test(T val, D d, int line) {
    const auto expected = Set(d, RightShiftNegative<kAmount>(val));
    const auto in = Set(d, val);
    const char* file = __FILE__;
    AssertVecEqual(d, expected, ShiftRight<kAmount>(in), file, line);
    AssertVecEqual(d, expected, ShiftRightSame(in, kAmount), file, line);
  }
};

struct TestVariableSignedRightShifts {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    using TU = MakeUnsigned<T>;
    const size_t N = Lanes(d);
    auto expected = AllocateAligned<T>(N);

    constexpr T kMin = LimitsMin<T>();
    constexpr T kMax = LimitsMax<T>();

    constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;

    // First test positive values, negative are checked below.
    const auto v0 = Zero(d);
    const auto positive = Iota(d, 0) & Set(d, kMax);

    // Shift by 0
    HWY_ASSERT_VEC_EQ(d, positive, ShiftRight<0>(positive));
    HWY_ASSERT_VEC_EQ(d, positive, ShiftRightSame(positive, 0));

    // Shift by 1
    for (size_t i = 0; i < N; ++i) {
      expected[i] = T(T(i & kMax) >> 1);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(positive));
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(positive, 1));

    // max
    HWY_ASSERT_VEC_EQ(d, v0, ShiftRight<kMaxShift>(positive));
    HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(positive, kMaxShift));

    const auto max_shift = Set(d, kMaxShift);
    const auto small_shifts = And(Iota(d, 0), max_shift);
    const auto large_shifts = max_shift - small_shifts;

    const auto negative = Iota(d, kMin);

    // Test varying negative to shift
    for (size_t i = 0; i < N; ++i) {
      expected[i] = RightShiftNegative<1>(static_cast<T>(kMin + i));
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(negative, Set(d, 1)));

    // Shift MSB right by small amounts
    for (size_t i = 0; i < N; ++i) {
      const size_t amount = i & kMaxShift;
      const TU shifted = ~((1ull << (kMaxShift - amount)) - 1);
      CopyBytes<sizeof(T)>(&shifted, &expected[i]);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(Set(d, kMin), small_shifts));

    // Shift MSB right by large amounts
    for (size_t i = 0; i < N; ++i) {
      const size_t amount = kMaxShift - (i & kMaxShift);
      const TU shifted = ~((1ull << (kMaxShift - amount)) - 1);
      CopyBytes<sizeof(T)>(&shifted, &expected[i]);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(Set(d, kMin), large_shifts));
  }
};

HWY_NOINLINE void TestAllShifts() {
  ForUnsignedTypes(ForPartialVectors<TestLeftShifts</*kSigned=*/false>>());
  ForSignedTypes(ForPartialVectors<TestLeftShifts</*kSigned=*/true>>());
  ForUnsignedTypes(ForPartialVectors<TestUnsignedRightShifts>());
  ForSignedTypes(ForPartialVectors<TestSignedRightShifts>());
}

HWY_NOINLINE void TestAllVariableShifts() {
  const ForPartialVectors<TestLeftShifts</*kSigned=*/false>> shl_u;
  const ForPartialVectors<TestLeftShifts</*kSigned=*/true>> shl_s;
  const ForPartialVectors<TestUnsignedRightShifts> shr_u;
  const ForPartialVectors<TestSignedRightShifts> shr_s;

  shl_u(uint16_t());
  shr_u(uint16_t());

  shl_u(uint32_t());
  shr_u(uint32_t());

  shl_s(int16_t());
  shr_s(int16_t());

  shl_s(int32_t());
  shr_s(int32_t());

#if HWY_HAVE_INTEGER64
  shl_u(uint64_t());
  shr_u(uint64_t());

  shl_s(int64_t());
  shr_s(int64_t());
#endif
}

HWY_NOINLINE void TestAllRotateRight() {
  const ForPartialVectors<TestRotateRight> test;
  test(uint32_t());
#if HWY_HAVE_INTEGER64
  test(uint64_t());
#endif
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE

namespace hwy {
HWY_BEFORE_TEST(HwyShiftTest);
HWY_EXPORT_AND_TEST_P(HwyShiftTest, TestAllShifts);
HWY_EXPORT_AND_TEST_P(HwyShiftTest, TestAllVariableShifts);
HWY_EXPORT_AND_TEST_P(HwyShiftTest, TestAllRotateRight);
}  // namespace hwy

// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}

#endif
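RightShiftNegative, the scalar reference these shift tests compare against, is defined elsewhere in the test file and is not part of this diff. A minimal sketch of such a reference, assuming only standard C++ (the name and signature here are illustrative, not Highway's actual helper): before C++20, >> on a negative signed value is implementation-defined, so the expected result is computed via unsigned (logical) shift plus explicit sign extension.

#include <cstring>
#include <type_traits>

// Sketch of a scalar reference for arithmetic right shift of a negative
// value (precondition: val < 0, as in the tests above).
template <int kAmount, typename T>
T RightShiftNegativeRef(T val) {  // illustrative name, not from this diff
  static_assert(std::is_signed<T>::value, "reference is for signed T");
  using TU = typename std::make_unsigned<T>::type;
  TU bits;
  std::memcpy(&bits, &val, sizeof(T));      // reinterpret without UB
  bits = static_cast<TU>(bits >> kAmount);  // logical (zero-fill) shift
  // Replicate the sign bit into each of the kAmount vacated high bits.
  const TU kSign = static_cast<TU>(TU{1} << (sizeof(T) * 8 - 1));
  for (int i = 0; i < kAmount; ++i) {
    bits |= static_cast<TU>(kSign >> i);
  }
  T out;
  std::memcpy(&out, &bits, sizeof(T));
  return out;
}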

@@ -19,8 +19,6 @@
#include <array>  // IWYU pragma: keep

#include "hwy/base.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/swizzle_test.cc"
#include "hwy/foreach_target.h"
@@ -46,48 +44,12 @@ HWY_NOINLINE void TestAllGetLane() {
  ForAllTypes(ForPartialVectors<TestGetLane>());
}

struct TestDupEven {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    auto expected = AllocateAligned<T>(N);
    for (size_t i = 0; i < N; ++i) {
      expected[i] = static_cast<T>((static_cast<int>(i) & ~1) + 1);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), DupEven(Iota(d, 1)));
  }
};

HWY_NOINLINE void TestAllDupEven() {
  ForUIF3264(ForShrinkableVectors<TestDupEven>());
}

struct TestDupOdd {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
#if HWY_TARGET != HWY_SCALAR
    const size_t N = Lanes(d);
    auto expected = AllocateAligned<T>(N);
    for (size_t i = 0; i < N; ++i) {
      expected[i] = static_cast<T>((static_cast<int>(i) & ~1) + 2);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), DupOdd(Iota(d, 1)));
#else
    (void)d;
#endif
  }
};

HWY_NOINLINE void TestAllDupOdd() {
  ForUIF3264(ForShrinkableVectors<TestDupOdd>());
}

struct TestOddEven {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    const auto even = Iota(d, 1);
    const auto odd = Iota(d, static_cast<T>(1 + N));
    const auto odd = Iota(d, 1 + N);
    auto expected = AllocateAligned<T>(N);
    for (size_t i = 0; i < N; ++i) {
      expected[i] = static_cast<T>(1 + i + ((i & 1) ? N : 0));
@@ -105,7 +67,7 @@ struct TestOddEvenBlocks {
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    const auto even = Iota(d, 1);
    const auto odd = Iota(d, static_cast<T>(1 + N));
    const auto odd = Iota(d, 1 + N);
    auto expected = AllocateAligned<T>(N);
    for (size_t i = 0; i < N; ++i) {
      const size_t idx_block = i / (16 / sizeof(T));
@@ -116,7 +78,7 @@ struct TestOddEvenBlocks {
};

HWY_NOINLINE void TestAllOddEvenBlocks() {
  ForAllTypes(ForGEVectors<128, TestOddEvenBlocks>());
  ForAllTypes(ForShrinkableVectors<TestOddEvenBlocks>());
}

struct TestSwapAdjacentBlocks {
@@ -138,7 +100,7 @@ struct TestSwapAdjacentBlocks {
};

HWY_NOINLINE void TestAllSwapAdjacentBlocks() {
  ForAllTypes(ForGEVectors<128, TestSwapAdjacentBlocks>());
  ForAllTypes(ForPartialVectors<TestSwapAdjacentBlocks>());
}

struct TestTableLookupLanes {
@@ -235,131 +197,23 @@ struct TestReverse {
  }
};

struct TestReverse2 {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    const RebindToUnsigned<D> du;  // Iota does not support float16_t.
    const auto v = BitCast(d, Iota(du, 1));
    auto expected = AllocateAligned<T>(N);

    // Can't set float16_t value directly, need to permute in memory.
    auto copy = AllocateAligned<T>(N);
    Store(v, d, copy.get());
    for (size_t i = 0; i < N; ++i) {
      expected[i] = copy[i ^ 1];
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Reverse2(d, v));
  }
};

struct TestReverse4 {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    const RebindToUnsigned<D> du;  // Iota does not support float16_t.
    const auto v = BitCast(d, Iota(du, 1));
    auto expected = AllocateAligned<T>(N);

    // Can't set float16_t value directly, need to permute in memory.
    auto copy = AllocateAligned<T>(N);
    Store(v, d, copy.get());
    for (size_t i = 0; i < N; ++i) {
      expected[i] = copy[i ^ 3];
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Reverse4(d, v));
  }
};

struct TestReverse8 {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    const RebindToUnsigned<D> du;  // Iota does not support float16_t.
    const auto v = BitCast(d, Iota(du, 1));
    auto expected = AllocateAligned<T>(N);

    // Can't set float16_t value directly, need to permute in memory.
    auto copy = AllocateAligned<T>(N);
    Store(v, d, copy.get());
    for (size_t i = 0; i < N; ++i) {
      expected[i] = copy[i ^ 7];
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Reverse8(d, v));
  }
};

HWY_NOINLINE void TestAllReverse() {
  // 8-bit is not supported because Risc-V uses rgather of Lanes - Iota,
  // which requires 16 bits.
  ForUIF163264(ForPartialVectors<TestReverse>());
}

HWY_NOINLINE void TestAllReverse2() {
  // 8-bit is not supported because Risc-V uses rgather of Lanes - Iota,
  // which requires 16 bits.
  ForUIF64(ForGEVectors<128, TestReverse2>());
  ForUIF32(ForGEVectors<64, TestReverse2>());
  ForUIF16(ForGEVectors<32, TestReverse2>());
}

HWY_NOINLINE void TestAllReverse4() {
  // 8-bit is not supported because Risc-V uses rgather of Lanes - Iota,
  // which requires 16 bits.
  ForUIF64(ForGEVectors<256, TestReverse4>());
  ForUIF32(ForGEVectors<128, TestReverse4>());
  ForUIF16(ForGEVectors<64, TestReverse4>());
}

HWY_NOINLINE void TestAllReverse8() {
  // 8-bit is not supported because Risc-V uses rgather of Lanes - Iota,
  // which requires 16 bits.
  ForUIF64(ForGEVectors<512, TestReverse8>());
  ForUIF32(ForGEVectors<256, TestReverse8>());
  ForUIF16(ForGEVectors<128, TestReverse8>());
}
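An aside on the indexing in TestReverse2/4/8 above: XOR-ing a lane index with k - 1 (for a power-of-two group size k) reverses lanes within each aligned group of k, which is exactly the permutation Reverse2/Reverse4/Reverse8 promise. A standalone sketch:

#include <cstdio>

int main() {
  const int k = 4;  // group size; must be a power of two
  // Lane i reads from lane i ^ (k - 1):
  // {0,1,2,3} -> {3,2,1,0} and {4,5,6,7} -> {7,6,5,4}.
  for (int i = 0; i < 8; ++i) {
    printf("lane %d reads from lane %d\n", i, i ^ (k - 1));
  }
  return 0;
}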

struct TestReverseBlocks {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    const RebindToUnsigned<D> du;  // Iota does not support float16_t.
    const auto v = BitCast(d, Iota(du, 1));
    auto expected = AllocateAligned<T>(N);

    constexpr size_t kLanesPerBlock = 16 / sizeof(T);
    const size_t num_blocks = N / kLanesPerBlock;
    HWY_ASSERT(num_blocks != 0);

    // Can't set float16_t value directly, need to permute in memory.
    auto copy = AllocateAligned<T>(N);
    Store(v, d, copy.get());
    for (size_t i = 0; i < N; ++i) {
      const size_t idx_block = i / kLanesPerBlock;
      const size_t base = (num_blocks - 1 - idx_block) * kLanesPerBlock;
      expected[i] = copy[base + (i % kLanesPerBlock)];
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), ReverseBlocks(d, v));
  }
};

HWY_NOINLINE void TestAllReverseBlocks() {
  ForAllTypes(ForGEVectors<128, TestReverseBlocks>());
}

class TestCompress {
  template <class D, class DI, typename T = TFromD<D>, typename TI = TFromD<DI>>
  void CheckStored(D d, DI di, size_t expected_pos, size_t actual_pos,
                   const AlignedFreeUniquePtr<T[]>& in,
  template <typename T, typename TI, size_t N>
  void CheckStored(Simd<T, N> d, Simd<TI, N> di, size_t expected_pos,
                   size_t actual_pos, const AlignedFreeUniquePtr<T[]>& in,
                   const AlignedFreeUniquePtr<TI[]>& mask_lanes,
                   const AlignedFreeUniquePtr<T[]>& expected, const T* actual_u,
                   int line) {
    if (expected_pos != actual_pos) {
      hwy::Abort(
          __FILE__, line,
          "Size mismatch for %s: expected %" PRIu64 ", actual %" PRIu64 "\n",
          TypeName(T(), Lanes(d)).c_str(), static_cast<uint64_t>(expected_pos),
          static_cast<uint64_t>(actual_pos));
      hwy::Abort(__FILE__, line,
                 "Size mismatch for %s: expected %" PRIu64 ", actual %" PRIu64 "\n",
                 TypeName(T(), N).c_str(), static_cast<uint64_t>(expected_pos),
                 static_cast<uint64_t>(actual_pos));
    }
    // Upper lanes are undefined. Modified from AssertVecEqual.
    for (size_t i = 0; i < expected_pos; ++i) {
@@ -368,7 +222,6 @@ class TestCompress {
                 "Mismatch at i=%" PRIu64 " of %" PRIu64 ", line %d:\n\n",
                 static_cast<uint64_t>(i), static_cast<uint64_t>(expected_pos),
                 line);
        const size_t N = Lanes(d);
        Print(di, "mask", Load(di, mask_lanes.get()), 0, N);
        Print(d, "in", Load(d, in.get()), 0, N);
        Print(d, "expect", Load(d, expected.get()), 0, N);
@@ -398,10 +251,7 @@ class TestCompress {
    auto expected = AllocateAligned<T>(N);
    auto actual_a = AllocateAligned<T>(misalign + N);
    T* actual_u = actual_a.get() + misalign;

    const size_t bits_size = RoundUpTo((N + 7) / 8, 8);
    auto bits = AllocateAligned<uint8_t>(bits_size);
    memset(bits.get(), 0, bits_size);  // for MSAN
    auto bits = AllocateAligned<uint8_t>(HWY_MAX(8, (N + 7) / 8));

    // Each lane should have a chance of having mask=true.
    for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
@@ -615,7 +465,7 @@ HWY_NOINLINE void TestAllCompress() {

  test(uint16_t());
  test(int16_t());
#if HWY_HAVE_FLOAT16
#if HWY_CAP_FLOAT16
  test(float16_t());
#endif

@@ -632,17 +482,11 @@ HWY_AFTER_NAMESPACE();
namespace hwy {
HWY_BEFORE_TEST(HwySwizzleTest);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllGetLane);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllDupEven);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllDupOdd);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllOddEven);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllOddEvenBlocks);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllSwapAdjacentBlocks);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllTableLookupLanes);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllReverse);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllReverse2);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllReverse4);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllReverse8);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllReverseBlocks);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllCompress);
}  // namespace hwy

@@ -41,7 +41,7 @@ HWY_NOINLINE void PrintValue(T value) {
  fprintf(stderr, "0x%02X,", byte);
}

#if HWY_HAVE_FLOAT16
#if HWY_CAP_FLOAT16
HWY_NOINLINE void PrintValue(float16_t value) {
  uint16_t bits;
  CopyBytes<2>(&value, &bits);
@@ -70,11 +70,9 @@ void Print(const D d, const char* caption, VecArg<V> v, size_t lane_u = 0,
}

// Compare expected vector to vector.
// HWY_INLINE works around a Clang SVE compiler bug where all but the first
// 128 bits (the NEON register) of actual are zero.
template <class D, typename T = TFromD<D>, class V = Vec<D>>
HWY_INLINE void AssertVecEqual(D d, const T* expected, VecArg<V> actual,
                               const char* filename, const int line) {
void AssertVecEqual(D d, const T* expected, VecArg<V> actual,
                    const char* filename, const int line) {
  const size_t N = Lanes(d);
  auto actual_lanes = AllocateAligned<T>(N);
  Store(actual, d, actual_lanes.get());
@@ -86,11 +84,9 @@ HWY_INLINE void AssertVecEqual(D d, const T* expected, VecArg<V> actual,
}

// Compare expected lanes to vector.
// HWY_INLINE works around a Clang SVE compiler bug where all but the first
// 128 bits (the NEON register) of actual are zero.
template <class D, typename T = TFromD<D>, class V = Vec<D>>
HWY_INLINE void AssertVecEqual(D d, VecArg<V> expected, VecArg<V> actual,
                               const char* filename, int line) {
HWY_NOINLINE void AssertVecEqual(D d, VecArg<V> expected, VecArg<V> actual,
                                 const char* filename, int line) {
  auto expected_lanes = AllocateAligned<T>(Lanes(d));
  Store(expected, d, expected_lanes.get());
  AssertVecEqual(d, expected_lanes.get(), actual, filename, line);
@@ -100,10 +96,7 @@ HWY_INLINE void AssertVecEqual(D d, VecArg<V> expected, VecArg<V> actual,
template <class D>
HWY_NOINLINE void AssertMaskEqual(D d, VecArg<Mask<D>> a, VecArg<Mask<D>> b,
                                  const char* filename, int line) {
  // lvalues prevented MSAN failure in farm_sve.
  const Vec<D> va = VecFromMask(d, a);
  const Vec<D> vb = VecFromMask(d, b);
  AssertVecEqual(d, va, vb, filename, line);
  AssertVecEqual(d, VecFromMask(d, a), VecFromMask(d, b), filename, line);

  const char* target_name = hwy::TargetName(HWY_TARGET);
  AssertEqual(CountTrue(d, a), CountTrue(d, b), target_name, filename, line);
@@ -185,269 +178,169 @@ HWY_INLINE Mask<D> MaskFalse(const D d) {

// Helpers for instantiating tests with combinations of lane types / counts.

// Calls Test for each CappedTag<T, N> where N is in [kMinLanes, kMul * kMinArg]
// and the resulting Lanes() is in [min_lanes, max_lanes]. The upper bound
// is required to ensure capped vectors remain extendable. Implemented by
// recursively halving kMul until it is zero.
template <typename T, size_t kMul, size_t kMinArg, class Test>
struct ForeachCappedR {
  static void Do(size_t min_lanes, size_t max_lanes) {
    const CappedTag<T, kMul * kMinArg> d;
// For ensuring we do not call tests with D such that widening D results in 0
// lanes. Example: assume T=u32, VLEN=256, and fraction=1/8: there is no 1/8th
// of a u64 vector in this case.
template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)>
HWY_INLINE size_t PromotedLanes(const D d) {
  return Lanes(RepartitionToWide<decltype(d)>());
}
// Already the widest possible T, cannot widen.
template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
HWY_INLINE size_t PromotedLanes(const D d) {
  return Lanes(d);
}

    // If we already don't have enough lanes, stop.
    const size_t lanes = Lanes(d);
    if (lanes < min_lanes) return;
// For all power of two N in [kMinLanes, kMul * kMinLanes] (so that recursion
// stops at kMul == 0). Note that N may be capped or a fraction.
template <typename T, size_t kMul, size_t kMinLanes, class Test,
          bool kPromote = false>
struct ForeachSizeR {
  static void Do() {
    const Simd<T, kMul * kMinLanes> d;

    if (lanes <= max_lanes) {
      Test()(T(), d);
    }
    ForeachCappedR<T, kMul / 2, kMinArg, Test>::Do(min_lanes, max_lanes);
    // Skip invalid fractions (e.g. 1/8th of u32x4).
    const size_t lanes = kPromote ? PromotedLanes(d) : Lanes(d);
    if (lanes < kMinLanes) return;

    Test()(T(), d);

    static_assert(kMul != 0, "Recursion should have ended already");
    ForeachSizeR<T, kMul / 2, kMinLanes, Test, kPromote>::Do();
  }
};

// Base case to stop the recursion.
template <typename T, size_t kMinArg, class Test>
struct ForeachCappedR<T, 0, kMinArg, Test> {
  static void Do(size_t, size_t) {}
template <typename T, size_t kMinLanes, class Test, bool kPromote>
struct ForeachSizeR<T, 0, kMinLanes, Test, kPromote> {
  static void Do() {}
};

#if HWY_HAVE_SCALABLE

constexpr int MinVectorSize() {
#if HWY_TARGET == HWY_RVV
  // Actually 16 for the application processor profile, but the intrinsics are
  // defined as if VLEN might be only 64: there is no vuint64mf2_t.
  return 8;
#else
  return 16;
#endif
}

template <typename T>
constexpr int MinPow2() {
  // Highway follows RVV LMUL in that the smallest fraction is 1/8th (encoded
  // as kPow2 == -3). The fraction also must not result in zero lanes for the
  // smallest possible vector size.
  return HWY_MAX(-3, -static_cast<int>(CeilLog2(MinVectorSize() / sizeof(T))));
}
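// Worked example, assuming the RVV case above (MinVectorSize() == 8): for
// T = uint32_t, CeilLog2(8 / 4) = 1, so MinPow2() returns HWY_MAX(-3, -1) =
// -1, i.e. half-vectors are the smallest valid fraction; for T = uint8_t,
// CeilLog2(8 / 1) = 3 and MinPow2() returns -3, the 1/8th LMUL fraction.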

// Iterates kPow2 upward through +3.
template <typename T, int kPow2, int kAddPow2, class Test>
struct ForeachShiftR {
  static void Do(size_t min_lanes) {
    const ScalableTag<T, kPow2 + kAddPow2> d;

    // Precondition: [kPow2, 3] + kAddPow2 is a valid fraction of the minimum
    // vector size, so we always have enough lanes, except ForGEVectors.
    if (Lanes(d) >= min_lanes) {
      Test()(T(), d);
    } else {
      fprintf(stderr, "%d lanes < %d: T=%d pow=%d\n",
              static_cast<int>(Lanes(d)), static_cast<int>(min_lanes),
              static_cast<int>(sizeof(T)), kPow2 + kAddPow2);
      HWY_ASSERT(min_lanes != 1);
    }

    ForeachShiftR<T, kPow2 + 1, kAddPow2, Test>::Do(min_lanes);
  }
};

// Base case to stop the recursion.
template <typename T, int kAddPow2, class Test>
struct ForeachShiftR<T, 4, kAddPow2, Test> {
  static void Do(size_t) {}
};
#else
// ForeachCappedR already handled all possible sizes.
#endif  // HWY_HAVE_SCALABLE

// These adapters may be called directly, or via For*Types:

// Calls Test for all power of two N in [1, Lanes(d) >> kPow2]. This is for
// Calls Test for all power of two N in [1, Lanes(d) / kFactor]. This is for
// ops that widen their input, e.g. Combine (not supported by HWY_SCALAR).
template <class Test, int kPow2 = 1>
template <class Test, size_t kFactor = 2>
struct ForExtendableVectors {
  template <typename T>
  void operator()(T /*unused*/) const {
    constexpr size_t kMaxCapped = HWY_LANES(T);
    // Skip CappedTag that are already full vectors.
    const size_t max_lanes = Lanes(ScalableTag<T>()) >> kPow2;
    (void)kMaxCapped;
    (void)max_lanes;
#if HWY_TARGET == HWY_SCALAR
    // not supported
#else
    ForeachCappedR<T, (kMaxCapped >> kPow2), 1, Test>::Do(1, max_lanes);
    constexpr bool kPromote = true;
#if HWY_TARGET == HWY_RVV
    // For each [MinPow2, 3 - kPow2]; counter is [MinPow2 + kPow2, 3].
    ForeachShiftR<T, MinPow2<T>() + kPow2, -kPow2, Test>::Do(1);
#elif HWY_HAVE_SCALABLE
    // For each [MinPow2, 0 - kPow2]; counter is [MinPow2 + kPow2 + 3, 3].
    ForeachShiftR<T, MinPow2<T>() + kPow2 + 3, -kPow2 - 3, Test>::Do(1);
    ForeachSizeR<T, 8 / kFactor, HWY_LANES(T), Test, kPromote>::Do();
    // TODO(janwas): also capped
    // ForeachSizeR<T, (16 / sizeof(T)) / kFactor, 1, Test, kPromote>::Do();
#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2
    // Capped
    ForeachSizeR<T, (16 / sizeof(T)) / kFactor, 1, Test, kPromote>::Do();
    // Fractions
    ForeachSizeR<T, 8 / kFactor, HWY_LANES(T) / 8, Test, kPromote>::Do();
#else
    ForeachSizeR<T, HWY_LANES(T) / kFactor, 1, Test, kPromote>::Do();
#endif
#endif  // HWY_SCALAR
  }
};

// Calls Test for all power of two N in [1 << kPow2, Lanes(d)]. This is for ops
// Calls Test for all power of two N in [kFactor, Lanes(d)]. This is for ops
// that narrow their input, e.g. UpperHalf.
template <class Test, int kPow2 = 1>
template <class Test, size_t kFactor = 2>
struct ForShrinkableVectors {
  template <typename T>
  void operator()(T /*unused*/) const {
    constexpr size_t kMinLanes = size_t{1} << kPow2;
    constexpr size_t kMaxCapped = HWY_LANES(T);
    // For shrinking, an upper limit is unnecessary.
    constexpr size_t max_lanes = kMaxCapped;

    (void)kMinLanes;
    (void)max_lanes;
    (void)max_lanes;
#if HWY_TARGET == HWY_SCALAR
    // not supported
#elif HWY_TARGET == HWY_RVV
    ForeachSizeR<T, 8 / kFactor, kFactor * HWY_LANES(T), Test>::Do();
    // TODO(janwas): also capped
#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2
    // Capped
    ForeachSizeR<T, (16 / sizeof(T)) / kFactor, kFactor, Test>::Do();
    // Fractions
    ForeachSizeR<T, 8 / kFactor, kFactor * HWY_LANES(T) / 8, Test>::Do();
#elif HWY_TARGET == HWY_SCALAR
    // not supported
#else
    ForeachCappedR<T, (kMaxCapped >> kPow2), kMinLanes, Test>::Do(kMinLanes,
                                                                  max_lanes);
#if HWY_TARGET == HWY_RVV
    // For each [MinPow2 + kPow2, 3]; counter is [MinPow2 + kPow2, 3].
    ForeachShiftR<T, MinPow2<T>() + kPow2, 0, Test>::Do(kMinLanes);
#elif HWY_HAVE_SCALABLE
    // For each [MinPow2 + kPow2, 0]; counter is [MinPow2 + kPow2 + 3, 3].
    ForeachShiftR<T, MinPow2<T>() + kPow2 + 3, -3, Test>::Do(kMinLanes);
    ForeachSizeR<T, HWY_LANES(T) / kFactor, kFactor, Test>::Do();
#endif
#endif  // HWY_TARGET == HWY_SCALAR
  }
};

// Calls Test for all supported power of two vectors of at least kMinBits.
// Examples: AES or 64x64 require 128 bits, casts may require 64 bits.
template <size_t kMinBits, class Test>
struct ForGEVectors {
// Calls Test for all power of two N in [16 / sizeof(T), Lanes(d)]. This is for
// ops that require at least 128 bits, e.g. AES or 64x64 = 128 mul.
template <class Test>
struct ForGE128Vectors {
  template <typename T>
  void operator()(T /*unused*/) const {
    constexpr size_t kMaxCapped = HWY_LANES(T);
    constexpr size_t kMinLanes = kMinBits / 8 / sizeof(T);
    // An upper limit is unnecessary.
    constexpr size_t max_lanes = kMaxCapped;
    (void)max_lanes;
#if HWY_TARGET == HWY_SCALAR
    (void)kMinLanes;  // not supported
    // not supported
#elif HWY_TARGET == HWY_RVV
    ForeachSizeR<T, 8, HWY_LANES(T), Test>::Do();
    // TODO(janwas): also capped
    // ForeachSizeR<T, 1, (16 / sizeof(T)), Test>::Do();
#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2
    // Capped
    ForeachSizeR<T, 1, 16 / sizeof(T), Test>::Do();
    // Fractions
    ForeachSizeR<T, 8, HWY_LANES(T) / 8, Test>::Do();
#else
    ForeachCappedR<T, HWY_LANES(T) / kMinLanes, kMinLanes, Test>::Do(kMinLanes,
                                                                     max_lanes);
#if HWY_TARGET == HWY_RVV
    // Can be 0 (handled below) if kMinBits > 64.
    constexpr size_t kRatio = MinVectorSize() * 8 / kMinBits;
    constexpr int kMinPow2 =
        kRatio == 0 ? 0 : -static_cast<int>(CeilLog2(kRatio));
    // For each [kMinPow2, 3]; counter is [kMinPow2, 3].
    ForeachShiftR<T, kMinPow2, 0, Test>::Do(kMinLanes);
#elif HWY_HAVE_SCALABLE
    // Can be 0 (handled below) if kMinBits > 128.
    constexpr size_t kRatio = MinVectorSize() * 8 / kMinBits;
    constexpr int kMinPow2 =
        kRatio == 0 ? 0 : -static_cast<int>(CeilLog2(kRatio));
    // For each [kMinPow2, 0]; counter is [kMinPow2 + 3, 3].
    ForeachShiftR<T, kMinPow2 + 3, -3, Test>::Do(kMinLanes);
    ForeachSizeR<T, HWY_LANES(T) / (16 / sizeof(T)), (16 / sizeof(T)),
                 Test>::Do();
#endif
#endif  // HWY_TARGET == HWY_SCALAR
  }
};

// Calls Test for all power of two N in [8 / sizeof(T), Lanes(d)]. This is for
// ops that require at least 64 bits, e.g. casts.
template <class Test>
using ForGE128Vectors = ForGEVectors<128, Test>;
struct ForGE64Vectors {
  template <typename T>
  void operator()(T /*unused*/) const {
#if HWY_TARGET == HWY_SCALAR
    // not supported
#elif HWY_TARGET == HWY_RVV
    ForeachSizeR<T, 8, HWY_LANES(T), Test>::Do();
    // TODO(janwas): also capped
    // ForeachSizeR<T, 1, (8 / sizeof(T)), Test>::Do();
#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2
    // Capped
    ForeachSizeR<T, 1, 8 / sizeof(T), Test>::Do();
    // Fractions
    ForeachSizeR<T, 8, HWY_LANES(T) / 8, Test>::Do();
#else
    ForeachSizeR<T, HWY_LANES(T) / (8 / sizeof(T)), (8 / sizeof(T)),
                 Test>::Do();
#endif
  }
};

// Calls Test for all N that can be promoted (not the same as Extendable because
// HWY_SCALAR has one lane). Also used for ZipLower, but not ZipUpper.
template <class Test, int kPow2 = 1>
template <class Test, size_t kFactor = 2>
struct ForPromoteVectors {
  template <typename T>
  void operator()(T /*unused*/) const {
    constexpr size_t kFactor = size_t{1} << kPow2;
    static_assert(kFactor >= 2 && kFactor * sizeof(T) <= sizeof(uint64_t), "");
    constexpr size_t kMaxCapped = HWY_LANES(T);
    constexpr size_t kMinLanes = kFactor;
    // Skip CappedTag that are already full vectors.
    const size_t max_lanes = Lanes(ScalableTag<T>()) >> kPow2;
    (void)kMaxCapped;
    (void)kMinLanes;
    (void)max_lanes;
#if HWY_TARGET == HWY_SCALAR
    ForeachCappedR<T, 1, 1, Test>::Do(1, 1);
    ForeachSizeR<T, 1, 1, Test, /*kPromote=*/true>::Do();
#else
    // TODO(janwas): call Extendable if kMinLanes check not required?
    ForeachCappedR<T, (kMaxCapped >> kPow2), 1, Test>::Do(kMinLanes, max_lanes);
#if HWY_TARGET == HWY_RVV
    // For each [MinPow2, 3 - kPow2]; counter is [MinPow2 + kPow2, 3].
    ForeachShiftR<T, MinPow2<T>() + kPow2, -kPow2, Test>::Do(kMinLanes);
#elif HWY_HAVE_SCALABLE
    // For each [MinPow2, 0 - kPow2]; counter is [MinPow2 + kPow2 + 3, 3].
    ForeachShiftR<T, MinPow2<T>() + kPow2 + 3, -kPow2 - 3, Test>::Do(kMinLanes);
    return ForExtendableVectors<Test, kFactor>()(T());
#endif
#endif  // HWY_SCALAR
  }
};

// Calls Test for all N than can be demoted (not the same as Shrinkable because
// HWY_SCALAR has one lane).
template <class Test, int kPow2 = 1>
// HWY_SCALAR has one lane). Also used for LowerHalf, but not UpperHalf.
template <class Test, size_t kFactor = 2>
struct ForDemoteVectors {
  template <typename T>
  void operator()(T /*unused*/) const {
    constexpr size_t kMinLanes = size_t{1} << kPow2;
    constexpr size_t kMaxCapped = HWY_LANES(T);
    // For shrinking, an upper limit is unnecessary.
    constexpr size_t max_lanes = kMaxCapped;

    (void)kMinLanes;
    (void)max_lanes;
    (void)max_lanes;
#if HWY_TARGET == HWY_SCALAR
    ForeachCappedR<T, 1, 1, Test>::Do(1, 1);
    ForeachSizeR<T, 1, 1, Test>::Do();
#else
    ForeachCappedR<T, (kMaxCapped >> kPow2), kMinLanes, Test>::Do(kMinLanes,
                                                                  max_lanes);

    // TODO(janwas): call Extendable if kMinLanes check not required?
#if HWY_TARGET == HWY_RVV
    // For each [MinPow2 + kPow2, 3]; counter is [MinPow2 + kPow2, 3].
    ForeachShiftR<T, MinPow2<T>() + kPow2, 0, Test>::Do(kMinLanes);
#elif HWY_HAVE_SCALABLE
    // For each [MinPow2 + kPow2, 0]; counter is [MinPow2 + kPow2 + 3, 3].
    ForeachShiftR<T, MinPow2<T>() + kPow2 + 3, -3, Test>::Do(kMinLanes);
    return ForShrinkableVectors<Test, kFactor>()(T());
#endif
#endif  // HWY_TARGET == HWY_SCALAR
  }
};

// For LowerHalf/Quarter.
template <class Test, int kPow2 = 1>
struct ForHalfVectors {
  template <typename T>
  void operator()(T /*unused*/) const {
    constexpr size_t kMinLanes = size_t{1} << kPow2;
    constexpr size_t kMaxCapped = HWY_LANES(T);
    // For shrinking, an upper limit is unnecessary.
    constexpr size_t max_lanes = kMaxCapped;

    (void)kMinLanes;
    (void)max_lanes;
    (void)max_lanes;
#if HWY_TARGET == HWY_SCALAR
    ForeachCappedR<T, 1, 1, Test>::Do(1, 1);
#else
    // ForeachCappedR<T, (kMaxCapped >> kPow2), kMinLanes, Test>::Do(kMinLanes,
    // max_lanes);

    // TODO(janwas): call Extendable if kMinLanes check not required?
#if HWY_TARGET == HWY_RVV
    // For each [MinPow2 + kPow2, 3]; counter is [MinPow2 + kPow2, 3].
    ForeachShiftR<T, MinPow2<T>() + kPow2, 0, Test>::Do(kMinLanes);
#elif HWY_HAVE_SCALABLE
    // For each [MinPow2 + kPow2, 0]; counter is [MinPow2 + kPow2 + 3, 3].
    ForeachShiftR<T, MinPow2<T>() + kPow2 + 3, -3, Test>::Do(kMinLanes);
#endif
#endif  // HWY_TARGET == HWY_SCALAR
  }
};

@@ -457,7 +350,7 @@ template <class Test>
struct ForPartialVectors {
  template <typename T>
  void operator()(T t) const {
    ForExtendableVectors<Test, 0>()(t);
    ForExtendableVectors<Test, 1>()(t);
  }
};

@@ -468,7 +361,7 @@ void ForSignedTypes(const Func& func) {
  func(int8_t());
  func(int16_t());
  func(int32_t());
#if HWY_HAVE_INTEGER64
#if HWY_CAP_INTEGER64
  func(int64_t());
#endif
}
@@ -478,7 +371,7 @@ void ForUnsignedTypes(const Func& func) {
  func(uint8_t());
  func(uint16_t());
  func(uint32_t());
#if HWY_HAVE_INTEGER64
#if HWY_CAP_INTEGER64
  func(uint64_t());
#endif
}
@@ -492,7 +385,7 @@ void ForIntegerTypes(const Func& func) {
template <class Func>
void ForFloatTypes(const Func& func) {
  func(float());
#if HWY_HAVE_FLOAT64
#if HWY_CAP_FLOAT64
  func(double());
#endif
}
@@ -504,49 +397,32 @@ void ForAllTypes(const Func& func) {
}

template <class Func>
void ForUIF16(const Func& func) {
  func(uint16_t());
  func(int16_t());
#if HWY_HAVE_FLOAT16
  func(float16_t());
#endif
}

template <class Func>
void ForUIF32(const Func& func) {
void ForUIF3264(const Func& func) {
  func(uint32_t());
  func(int32_t());
  func(float());
}

template <class Func>
void ForUIF64(const Func& func) {
#if HWY_HAVE_INTEGER64
#if HWY_CAP_INTEGER64
  func(uint64_t());
  func(int64_t());
#endif
#if HWY_HAVE_FLOAT64
  func(double());
#endif
}

template <class Func>
void ForUIF3264(const Func& func) {
  ForUIF32(func);
  ForUIF64(func);
  ForFloatTypes(func);
}

template <class Func>
void ForUIF163264(const Func& func) {
  ForUIF16(func);
  ForUIF3264(func);
  func(uint16_t());
  func(int16_t());
#if HWY_CAP_FLOAT16
  func(float16_t());
#endif
}

// For tests that involve loops, adjust the trip count so that emulated tests
// finish quickly (but always at least 2 iterations to ensure some diversity).
constexpr size_t AdjustedReps(size_t max_reps) {
#if HWY_ARCH_RVV
  return HWY_MAX(max_reps / 32, 2);
  return HWY_MAX(max_reps / 16, 2);
#elif HWY_ARCH_ARM
  return HWY_MAX(max_reps / 4, 2);
#elif HWY_IS_DEBUG_BUILD
@@ -556,20 +432,6 @@ constexpr size_t AdjustedReps(size_t max_reps) {
#endif
}

// Same as above, but the loop trip count will be 1 << max_pow2.
constexpr size_t AdjustedLog2Reps(size_t max_pow2) {
  // If "negative" (unsigned wraparound), use original.
#if HWY_ARCH_RVV
  return HWY_MIN(max_pow2 - 4, max_pow2);
#elif HWY_ARCH_ARM
  return HWY_MIN(max_pow2 - 1, max_pow2);
#elif HWY_IS_DEBUG_BUILD
  return HWY_MIN(max_pow2 - 1, max_pow2);
#else
  return max_pow2;
#endif
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
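Both the old ForeachSizeR and the new ForeachCappedR/ForeachShiftR helpers above rely on the same compile-time enumeration pattern: a template parameter is halved (or stepped) on each recursive instantiation, and an explicit specialization terminates the recursion, so every power-of-two lane count is visited. A minimal self-contained sketch of the pattern (the names here are illustrative, not Highway's):

#include <cstddef>
#include <cstdio>

// Visits kN, kN/2, ..., 1; the kN == 0 specialization ends the recursion,
// mirroring the ForeachSizeR<T, 0, ...> base case above.
template <size_t kN>
struct ForEachPow2 {
  static void Do() {
    printf("testing with %zu lanes\n", kN);
    ForEachPow2<kN / 2>::Do();
  }
};

template <>
struct ForEachPow2<0> {
  static void Do() {}  // base case: stop the recursion
};

int main() {
  ForEachPow2<8>::Do();  // prints 8, 4, 2, 1
  return 0;
}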

@@ -30,6 +30,9 @@ bool BytesEqual(const void* p1, const void* p2, const size_t size,
  const uint8_t* bytes2 = reinterpret_cast<const uint8_t*>(p2);
  for (size_t i = 0; i < size; ++i) {
    if (bytes1[i] != bytes2[i]) {
      fprintf(stderr, "Mismatch at byte %" PRIu64 " of %" PRIu64 ": %d != %d\n",
              static_cast<uint64_t>(i), static_cast<uint64_t>(size), bytes1[i],
              bytes2[i]);
      if (pos != nullptr) {
        *pos = i;
      }

@@ -26,7 +26,6 @@
#include "hwy/aligned_allocator.h"
#include "hwy/base.h"
#include "hwy/highway.h"
#include "hwy/highway_export.h"

namespace hwy {

@@ -68,7 +67,9 @@ static HWY_INLINE uint32_t Random32(RandomState* rng) {
  return static_cast<uint32_t>((*rng)());
}

static HWY_INLINE uint64_t Random64(RandomState* rng) { return (*rng)(); }
static HWY_INLINE uint64_t Random64(RandomState* rng) {
  return (*rng)();
}

// Prevents the compiler from eliding the computations that led to "output".
// Works by indicating to the compiler that "output" is being read and modified.
@@ -83,8 +84,8 @@ inline void PreventElision(T&& output) {
#endif  // HWY_COMPILER_MSVC
}

HWY_TEST_DLLEXPORT bool BytesEqual(const void* p1, const void* p2,
                                   const size_t size, size_t* pos = nullptr);
bool BytesEqual(const void* p1, const void* p2, const size_t size,
                size_t* pos = nullptr);

void AssertStringEqual(const char* expected, const char* actual,
                       const char* target_name, const char* filename, int line);
@@ -128,25 +129,25 @@ HWY_INLINE TypeInfo MakeTypeInfo() {
  return info;
}

HWY_TEST_DLLEXPORT bool IsEqual(const TypeInfo& info, const void* expected_ptr,
                                const void* actual_ptr);
bool IsEqual(const TypeInfo& info, const void* expected_ptr,
             const void* actual_ptr);

HWY_TEST_DLLEXPORT void TypeName(const TypeInfo& info, size_t N, char* string100);
void TypeName(const TypeInfo& info, size_t N, char* string100);

HWY_TEST_DLLEXPORT void PrintArray(const TypeInfo& info, const char* caption,
                                   const void* array_void, size_t N,
                                   size_t lane_u = 0, size_t max_lanes = 7);
void PrintArray(const TypeInfo& info, const char* caption,
                const void* array_void, size_t N, size_t lane_u = 0,
                size_t max_lanes = 7);

HWY_TEST_DLLEXPORT HWY_NORETURN void PrintMismatchAndAbort(
    const TypeInfo& info, const void* expected_ptr, const void* actual_ptr,
    const char* target_name, const char* filename, int line, size_t lane = 0,
    size_t num_lanes = 1);
HWY_NORETURN void PrintMismatchAndAbort(const TypeInfo& info,
                                        const void* expected_ptr,
                                        const void* actual_ptr,
                                        const char* target_name,
                                        const char* filename, int line,
                                        size_t lane = 0, size_t num_lanes = 1);

HWY_TEST_DLLEXPORT void AssertArrayEqual(const TypeInfo& info,
                                         const void* expected_void,
                                         const void* actual_void, size_t N,
                                         const char* target_name,
                                         const char* filename, int line);
void AssertArrayEqual(const TypeInfo& info, const void* expected_void,
                      const void* actual_void, size_t N,
                      const char* target_name, const char* filename, int line);

}  // namespace detail

@@ -52,10 +52,10 @@ HWY_NOINLINE void TestAllName() { ForAllTypes(ForPartialVectors<TestName>()); }
struct TestEqualInteger {
  template <class T>
  HWY_NOINLINE void operator()(T /*t*/) const {
    HWY_ASSERT_EQ(T(0), T(0));
    HWY_ASSERT_EQ(T(1), T(1));
    HWY_ASSERT_EQ(T(-1), T(-1));
    HWY_ASSERT_EQ(LimitsMin<T>(), LimitsMin<T>());
    HWY_ASSERT(IsEqual(T(0), T(0)));
    HWY_ASSERT(IsEqual(T(1), T(1)));
    HWY_ASSERT(IsEqual(T(-1), T(-1)));
    HWY_ASSERT(IsEqual(LimitsMin<T>(), LimitsMin<T>()));

    HWY_ASSERT(!IsEqual(T(0), T(1)));
    HWY_ASSERT(!IsEqual(T(1), T(0)));

@@ -4,7 +4,7 @@ libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@
includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@

Name: libhwy-contrib
Description: Additions to Highway: dot product, image, math, sort
Description: Additions to Highway: image and math library
Version: @HWY_LIBRARY_VERSION@
Libs: -L${libdir} -lhwy_contrib
Cflags: -I${includedir}

@@ -7,4 +7,4 @@ Name: libhwy
Description: Efficient and performance-portable SIMD wrapper
Version: @HWY_LIBRARY_VERSION@
Libs: -L${libdir} -lhwy
Cflags: -I${includedir} -D@DLLEXPORT_TO_DEFINE@
Cflags: -I${includedir}

@@ -1,9 +0,0 @@
/*
 * Copyright 2019 Google LLC
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

/* mock crypto module for benchmarks and unit tests or std::random_device fails at runtime */
var crypto = { getRandomValues: function(array) { for (var i = 0; i < array.length; i++) array[i] = (Math.random()*256)|0 } };

@@ -59,7 +59,7 @@ export QEMU_LD_PREFIX=/usr/arm-linux-gnueabihf
rm -rf build_arm7
mkdir build_arm7
cd build_arm7
CC=arm-linux-gnueabihf-gcc-11 CXX=arm-linux-gnueabihf-g++-11 cmake .. -DHWY_CMAKE_ARM7:BOOL=ON -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON
CC=arm-linux-gnueabihf-gcc CXX=arm-linux-gnueabihf-g++ cmake .. -DHWY_CMAKE_ARM7:BOOL=ON -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON
make -j8
ctest
cd ..
@@ -71,7 +71,7 @@ export QEMU_LD_PREFIX=/usr/aarch64-linux-gnu
rm -rf build_arm8
mkdir build_arm8
cd build_arm8
CC=aarch64-linux-gnu-gcc-11 CXX=aarch64-linux-gnu-g++-11 cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON
CC=aarch64-linux-gnu-gcc CXX=aarch64-linux-gnu-g++ cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON
make -j8
ctest
cd ..

@@ -34,7 +34,7 @@ jobs:
          env_stack_size: 1
          max_stack: 3000
          # Conformance tooling test requires numpy.
          apt_pkgs: graphviz python3-numpy
          apt_pkgs: python3-numpy
        - name: lowprecision
          mode: release
          test_in_pr: true
@@ -461,8 +461,8 @@ jobs:
    runs-on: ubuntu-latest
    env:
      CCACHE_DIR: ${{ github.workspace }}/.ccache
      EM_VERSION: 3.1.4
      V8_VERSION: 9.8.177
      EM_VERSION: 2.0.23
      V8_VERSION: 9.3.22
      V8: ${{ github.workspace }}/.jsvu/v8
      BUILD_TARGET: wasm32

@@ -506,7 +506,7 @@ jobs:
            ${{ runner.os }}-${{ steps.git-env.outputs.parent }}-${{ matrix.variant }}

      - name: Install emsdk
        uses: mymindstorm/setup-emsdk@v11
        uses: mymindstorm/setup-emsdk@v10
        # TODO(deymo): We could cache this action but it doesn't work when running
        # in a matrix.
        with:

@@ -14,7 +14,7 @@ MYDIR=$(dirname $(realpath "$0"))
# Git revisions we use for the given submodules. Update these whenever you
# update a git submodule.
THIRD_PARTY_GFLAGS="827c769e5fc98e0f2a34c47cef953cc6328abced"
THIRD_PARTY_HIGHWAY="f13e3b956eb226561ac79427893ec0afd66f91a8"
THIRD_PARTY_HIGHWAY="e69083a12a05caf037cabecdf1b248b7579705a5"
THIRD_PARTY_SKCMS="64374756e03700d649f897dbd98c95e78c30c7da"
THIRD_PARTY_SJPEG="868ab558fad70fcbe8863ba4e85179eeb81cc840"
THIRD_PARTY_ZLIB="cacf7f1d4e3d44d871b605da3b647f07d718623f"

@@ -5,12 +5,6 @@

#include "lib/extras/codec.h"

#include "jxl/decode.h"
#include "jxl/types.h"
#include "lib/extras/packed_image.h"
#include "lib/jxl/base/padded_bytes.h"
#include "lib/jxl/base/status.h"

#if JPEGXL_ENABLE_APNG
#include "lib/extras/enc/apng.h"
#endif
@@ -74,14 +68,6 @@ Status Encode(const CodecInOut& io, const extras::Codec codec,
    JXL_WARNING("Writing JPEG data as pixels");
  }

  extras::PackedPixelFile ppf;
  size_t num_channels = io.metadata.m.color_encoding.Channels();
  JxlPixelFormat format = {
      static_cast<uint32_t>(num_channels),
      bits_per_sample <= 8 ? JXL_TYPE_UINT8 : JXL_TYPE_UINT16,
      JXL_NATIVE_ENDIAN, 0};
  std::vector<uint8_t> bytes_vector;
  const bool floating_point = bits_per_sample > 16;
  switch (codec) {
    case extras::Codec::kPNG:
#if JPEGXL_ENABLE_APNG
@@ -101,24 +87,8 @@ Status Encode(const CodecInOut& io, const extras::Codec codec,
      return JXL_FAILURE("JPEG XL was built without JPEG support");
#endif
    case extras::Codec::kPNM:

      // Choose native for PFM; PGM/PPM require big-endian (N/A for PBM)
      format.endianness = floating_point ? JXL_NATIVE_ENDIAN : JXL_BIG_ENDIAN;
      if (floating_point) {
        format.data_type = JXL_TYPE_FLOAT;
      }
      if (!c_desired.IsSRGB()) {
        JXL_WARNING(
            "PNM encoder cannot store custom ICC profile; decoder\n"
            "will need hint key=color_space to get the same values");
      }
      JXL_RETURN_IF_ERROR(extras::ConvertCodecInOutToPackedPixelFile(
          io, format, c_desired, pool, &ppf));
      JXL_RETURN_IF_ERROR(
          extras::EncodeImagePNM(ppf, bits_per_sample, pool, &bytes_vector));
      bytes->assign(bytes_vector.data(),
                    bytes_vector.data() + bytes_vector.size());
      return true;
      return extras::EncodeImagePNM(&io, c_desired, bits_per_sample, pool,
                                    bytes);
    case extras::Codec::kPGX:
      return extras::EncodeImagePGX(&io, c_desired, bits_per_sample, pool,
                                    bytes);

@@ -61,6 +61,7 @@ const EnumName<JxlRenderingIntent> kJxlRenderingIntentNames[] = {
template <typename T>
Status ParseEnum(const std::string& token, const EnumName<T>* enum_values,
                 size_t enum_len, T* value) {
  std::string str;
  for (size_t i = 0; i < enum_len; i++) {
    if (enum_values[i].name == token) {
      *value = enum_values[i].value;