Backed out changeset f87312b156f6 (bug 1757483) for causing build bustages on highway_export.h CLOSED TREE

Norisz Fay 2022-03-01 13:31:39 +02:00
Parent 05225c58b9
Commit b4d61b9a3f
141 changed files: 6830 additions and 13828 deletions


@@ -20,11 +20,11 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: commit f13e3b956eb226561ac79427893ec0afd66f91a8 (2022-02-15T18:19:21Z).
release: commit e69083a12a05caf037cabecdf1b248b7579705a5 (2021-11-11T08:20:00Z).
# Revision to pull in
# Must be a long or short commit SHA (long preferred)
revision: f13e3b956eb226561ac79427893ec0afd66f91a8
revision: e69083a12a05caf037cabecdf1b248b7579705a5
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/


@@ -10,9 +10,9 @@ origin:
url: https://github.com/libjxl/libjxl
release: commit 89875cba4d18485ec9692c80b747b59b73ce712e (2022-02-28T16:03:42Z).
release: commit 4322679b1c418addc2284c5ea84fc2c3935b4a75 (2022-02-07T20:56:39Z).
revision: 89875cba4d18485ec9692c80b747b59b73ce712e
revision: 4322679b1c418addc2284c5ea84fc2c3935b4a75
license: Apache-2.0


@@ -1,57 +0,0 @@
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: Build / test
on: [push, pull_request]
jobs:
cmake:
name: Build and test ${{ matrix.name }}
runs-on: ubuntu-18.04
strategy:
matrix:
include:
- name: Clang-5.0
extra_deps: clang-5.0
c_compiler: clang-5.0
cxx_compiler: clang++-5.0
- name: Clang-6.0
extra_deps: clang-6.0
c_compiler: clang-6.0
cxx_compiler: clang++-6.0
steps:
- uses: actions/checkout@v2
- name: Install deps
run: sudo apt-get install ${{ matrix.extra_deps }}
- name: Build and test
run: |
export CMAKE_BUILD_PARALLEL_LEVEL=2
export CTEST_PARALLEL_LEVEL=2
CXXFLAGS=-Werror CC=${{ matrix.c_compiler }} CXX=${{ matrix.cxx_compiler }} cmake -B out .
cmake --build out
ctest --test-dir out
bazel:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: bazelbuild/setup-bazelisk@v1
- uses: actions/cache@v2
with:
path: ~/.cache/bazel
key: bazel-${{ runner.os }}
- run: bazel build //...

106 third_party/highway/BUILD vendored

@@ -1,6 +1,6 @@
load("@bazel_skylib//lib:selects.bzl", "selects")
load("@rules_cc//cc:defs.bzl", "cc_test")
package(default_visibility = ["//visibility:public"])
licenses(["notice"])
@@ -18,11 +18,6 @@ config_setting(
flag_values = {"@bazel_tools//tools/cpp:compiler": "msvc"},
)
config_setting(
name = "compiler_emscripten",
values = {"cpu": "wasm32"},
)
# See https://github.com/bazelbuild/bazel/issues/12707
config_setting(
name = "compiler_gcc_bug",
@@ -46,6 +41,13 @@ selects.config_setting_group(
],
)
config_setting(
name = "emulate_sve",
values = {
"copt": "-DHWY_EMULATE_SVE",
},
)
# Additional warnings for Clang OR GCC (skip for MSVC)
CLANG_GCC_COPTS = [
"-Wunused-parameter",
@@ -80,7 +82,7 @@ COPTS = select({
"//conditions:default": CLANG_GCC_COPTS + CLANG_ONLY_COPTS,
}) + select({
"@platforms//cpu:riscv64": [
"-march=rv64gcv1p0",
"-march=rv64gcv0p10",
"-menable-experimental-extensions",
],
"//conditions:default": [
@@ -110,32 +112,28 @@ cc_library(
"hwy/base.h",
"hwy/cache_control.h",
"hwy/detect_compiler_arch.h", # private
"hwy/highway_export.h",
"hwy/detect_targets.h", # private
"hwy/targets.h",
],
compatible_with = [],
copts = COPTS,
textual_hdrs = [
# These are textual because config macros influence them:
"hwy/detect_targets.h", # private
"hwy/targets.h",
# End of list
"hwy/highway.h", # public
"hwy/foreach_target.h", # public
"hwy/ops/arm_neon-inl.h",
"hwy/ops/arm_sve-inl.h",
"hwy/ops/generic_ops-inl.h",
"hwy/ops/rvv-inl.h",
"hwy/ops/scalar-inl.h",
"hwy/ops/set_macros-inl.h",
"hwy/ops/shared-inl.h",
"hwy/ops/wasm_128-inl.h",
"hwy/ops/x86_128-inl.h",
"hwy/ops/x86_256-inl.h",
"hwy/ops/x86_512-inl.h",
# Select avoids recompiling native arch if only non-native changed
] + select({
":compiler_emscripten": ["hwy/ops/wasm_128-inl.h"],
"//conditions:default": [],
}) + select({
"@platforms//cpu:riscv64": ["hwy/ops/rvv-inl.h"],
],
deps = select({
":emulate_sve": ["//third_party/farm_sve"],
"//conditions:default": [],
}),
)
@@ -146,9 +144,7 @@ cc_library(
textual_hdrs = [
"hwy/contrib/dot/dot-inl.h",
],
deps = [
":hwy",
],
deps = [":hwy"],
)
cc_library(
@@ -160,9 +156,7 @@ cc_library(
"hwy/contrib/image/image.h",
],
compatible_with = [],
deps = [
":hwy",
],
deps = [":hwy"],
)
cc_library(
@@ -171,9 +165,16 @@ cc_library(
textual_hdrs = [
"hwy/contrib/math/math-inl.h",
],
deps = [
":hwy",
deps = [":hwy"],
)
cc_library(
name = "sort",
compatible_with = [],
textual_hdrs = [
"hwy/contrib/sort/sort-inl.h",
],
deps = [":hwy"],
)
# Everything required for tests that use Highway.
@@ -187,9 +188,7 @@ cc_library(
],
# Must not depend on a gtest variant, which can conflict with the
# GUNIT_INTERNAL_BUILD_MODE defined by the test.
deps = [
":hwy",
],
deps = [":hwy"],
)
cc_library(
@@ -213,9 +212,7 @@ cc_library(
srcs = ["hwy/examples/skeleton.cc"],
hdrs = ["hwy/examples/skeleton.h"],
textual_hdrs = ["hwy/examples/skeleton-inl.h"],
deps = [
":hwy",
],
deps = [":hwy"],
)
cc_binary(
@@ -229,7 +226,7 @@ HWY_TESTS = [
("hwy/contrib/dot/", "dot_test"),
("hwy/contrib/image/", "image_test"),
("hwy/contrib/math/", "math_test"),
# contrib/sort has its own BUILD, we add it to GUITAR_TESTS.
("hwy/contrib/sort/", "sort_test"),
("hwy/examples/", "skeleton_test"),
("hwy/", "nanobenchmark_test"),
("hwy/", "aligned_allocator_test"),
@@ -242,27 +239,13 @@ HWY_TESTS = [
("hwy/tests/", "compare_test"),
("hwy/tests/", "convert_test"),
("hwy/tests/", "crypto_test"),
("hwy/tests/", "demote_test"),
("hwy/tests/", "logical_test"),
("hwy/tests/", "mask_test"),
("hwy/tests/", "memory_test"),
("hwy/tests/", "shift_test"),
("hwy/tests/", "swizzle_test"),
("hwy/tests/", "test_util_test"),
]
HWY_TEST_DEPS = [
":dot",
":hwy",
":hwy_test_util",
":image",
":math",
":nanobenchmark",
":skeleton",
"//hwy/contrib/sort:vqsort",
"@com_google_googletest//:gtest_main",
]
[
[
cc_test(
@@ -282,18 +265,6 @@ HWY_TEST_DEPS = [
"@platforms//cpu:riscv64": ["fully_static_link"],
"//conditions:default": [],
}),
linkopts = select({
":compiler_emscripten": [
"-s ASSERTIONS=2",
"-s ENVIRONMENT=node,shell,web",
"-s ERROR_ON_UNDEFINED_SYMBOLS=1",
"-s DEMANGLE_SUPPORT=1",
"-s EXIT_RUNTIME=1",
"-s ALLOW_MEMORY_GROWTH=1",
"--pre-js $(location :preamble.js.lds)",
],
"//conditions:default": [],
}),
linkstatic = select({
"@platforms//cpu:riscv64": True,
"//conditions:default": False,
@@ -301,10 +272,17 @@ HWY_TEST_DEPS = [
local_defines = ["HWY_IS_TEST"],
# for test_suite.
tags = ["hwy_ops_test"],
deps = HWY_TEST_DEPS + select({
":compiler_emscripten": [":preamble.js.lds"],
"//conditions:default": [],
}),
deps = [
":dot",
":hwy",
":hwy_test_util",
":image",
":math",
":nanobenchmark",
":skeleton",
":sort",
"@com_google_googletest//:gtest_main",
],
),
]
for subdir, test in HWY_TESTS
@@ -315,5 +293,3 @@ test_suite(
name = "hwy_ops_tests",
tags = ["hwy_ops_test"],
)
# Placeholder for integration test, do not remove

113 third_party/highway/CMakeLists.txt vendored

@@ -19,13 +19,11 @@ if(POLICY CMP0083)
cmake_policy(SET CMP0083 NEW)
endif()
project(hwy VERSION 0.16.0) # Keep in sync with highway.h version
# Directly define the ABI version from the cmake project() version values:
set(LIBRARY_VERSION "${hwy_VERSION}")
set(LIBRARY_SOVERSION ${hwy_VERSION_MAJOR})
project(hwy VERSION 0.15.0) # Keep in sync with highway.h version
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_CXX_STANDARD_REQUIRED YES)
# Enabled PIE binaries by default if supported.
include(CheckPIESupported OPTIONAL RESULT_VARIABLE CHECK_PIE_SUPPORTED)
@@ -42,14 +40,13 @@ if (NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE RelWithDebInfo)
endif()
set(HWY_CMAKE_ARM7 OFF CACHE BOOL "Set copts for ARMv7 with NEON (requires vfpv4)?")
set(HWY_CMAKE_ARM7 OFF CACHE BOOL "Set copts for ARMv7 with NEON?")
# Unconditionally adding -Werror risks breaking the build when new warnings
# arise due to compiler/platform changes. Enable this in CI/tests.
set(HWY_WARNINGS_ARE_ERRORS OFF CACHE BOOL "Add -Werror flag?")
set(HWY_ENABLE_EXAMPLES ON CACHE BOOL "Build examples")
set(HWY_ENABLE_INSTALL ON CACHE BOOL "Install library")
set(HWY_EXAMPLES_TESTS_INSTALL ON CACHE BOOL "Build examples, tests, install?")
include(CheckCXXSourceCompiles)
check_cxx_source_compiles(
@@ -67,32 +64,7 @@ set(HWY_CONTRIB_SOURCES
hwy/contrib/image/image.cc
hwy/contrib/image/image.h
hwy/contrib/math/math-inl.h
hwy/contrib/sort/disabled_targets.h
hwy/contrib/sort/shared-inl.h
hwy/contrib/sort/sorting_networks-inl.h
hwy/contrib/sort/traits-inl.h
hwy/contrib/sort/traits128-inl.h
hwy/contrib/sort/vqsort-inl.h
hwy/contrib/sort/vqsort.cc
hwy/contrib/sort/vqsort.h
hwy/contrib/sort/vqsort_128a.cc
hwy/contrib/sort/vqsort_128d.cc
hwy/contrib/sort/vqsort_f32a.cc
hwy/contrib/sort/vqsort_f32d.cc
hwy/contrib/sort/vqsort_f64a.cc
hwy/contrib/sort/vqsort_f64d.cc
hwy/contrib/sort/vqsort_i16a.cc
hwy/contrib/sort/vqsort_i16d.cc
hwy/contrib/sort/vqsort_i32a.cc
hwy/contrib/sort/vqsort_i32d.cc
hwy/contrib/sort/vqsort_i64a.cc
hwy/contrib/sort/vqsort_i64d.cc
hwy/contrib/sort/vqsort_u16a.cc
hwy/contrib/sort/vqsort_u16d.cc
hwy/contrib/sort/vqsort_u32a.cc
hwy/contrib/sort/vqsort_u32d.cc
hwy/contrib/sort/vqsort_u64a.cc
hwy/contrib/sort/vqsort_u64d.cc
hwy/contrib/sort/sort-inl.h
)
set(HWY_SOURCES
@@ -104,7 +76,6 @@ set(HWY_SOURCES
hwy/detect_targets.h # private
hwy/foreach_target.h
hwy/highway.h
hwy/highway_export.h
hwy/nanobenchmark.cc
hwy/nanobenchmark.h
hwy/ops/arm_neon-inl.h
@@ -221,59 +192,20 @@ else()
endif() # !MSVC
# By default prefer STATIC build (legacy behavior)
option(BUILD_SHARED_LIBS "Build shared libraries" OFF)
option(HWY_FORCE_STATIC_LIBS "Ignore BUILD_SHARED_LIBS" OFF)
# only expose shared/static options to advanced users:
mark_as_advanced(BUILD_SHARED_LIBS)
mark_as_advanced(HWY_FORCE_STATIC_LIBS)
# Define visibility settings globally:
set(CMAKE_CXX_VISIBILITY_PRESET hidden)
set(CMAKE_VISIBILITY_INLINES_HIDDEN 1)
# Copy-cat "add_library" logic + add override.
set(HWY_LIBRARY_TYPE "SHARED")
if (NOT BUILD_SHARED_LIBS OR HWY_FORCE_STATIC_LIBS)
set(HWY_LIBRARY_TYPE "STATIC")
endif()
# This preprocessor define will drive the build, also used in the *.pc files:
if("${HWY_LIBRARY_TYPE}" STREQUAL "SHARED")
set(DLLEXPORT_TO_DEFINE "HWY_SHARED_DEFINE")
else()
set(DLLEXPORT_TO_DEFINE "HWY_STATIC_DEFINE")
endif()
add_library(hwy ${HWY_LIBRARY_TYPE} ${HWY_SOURCES})
target_compile_definitions(hwy PUBLIC "${DLLEXPORT_TO_DEFINE}")
add_library(hwy STATIC ${HWY_SOURCES})
target_compile_options(hwy PRIVATE ${HWY_FLAGS})
set_property(TARGET hwy PROPERTY POSITION_INDEPENDENT_CODE ON)
set_target_properties(hwy PROPERTIES VERSION ${LIBRARY_VERSION} SOVERSION ${LIBRARY_SOVERSION})
target_include_directories(hwy PUBLIC ${CMAKE_CURRENT_LIST_DIR})
target_compile_features(hwy PUBLIC cxx_std_11)
set_target_properties(hwy PROPERTIES
LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version)
# not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations)
if(UNIX AND NOT APPLE)
set_property(TARGET hwy APPEND_STRING PROPERTY
LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version")
endif()
add_library(hwy_contrib ${HWY_LIBRARY_TYPE} ${HWY_CONTRIB_SOURCES})
target_link_libraries(hwy_contrib hwy)
add_library(hwy_contrib STATIC ${HWY_CONTRIB_SOURCES})
target_compile_options(hwy_contrib PRIVATE ${HWY_FLAGS})
set_property(TARGET hwy_contrib PROPERTY POSITION_INDEPENDENT_CODE ON)
set_target_properties(hwy_contrib PROPERTIES VERSION ${LIBRARY_VERSION} SOVERSION ${LIBRARY_SOVERSION})
target_include_directories(hwy_contrib PUBLIC ${CMAKE_CURRENT_LIST_DIR})
target_compile_features(hwy_contrib PUBLIC cxx_std_11)
add_library(hwy_test ${HWY_LIBRARY_TYPE} ${HWY_TEST_SOURCES})
target_link_libraries(hwy_test hwy)
add_library(hwy_test STATIC ${HWY_TEST_SOURCES})
target_compile_options(hwy_test PRIVATE ${HWY_FLAGS})
set_property(TARGET hwy_test PROPERTY POSITION_INDEPENDENT_CODE ON)
set_target_properties(hwy_test PROPERTIES VERSION ${LIBRARY_VERSION} SOVERSION ${LIBRARY_SOVERSION})
target_include_directories(hwy_test PUBLIC ${CMAKE_CURRENT_LIST_DIR})
target_compile_features(hwy_test PUBLIC cxx_std_11)
# -------------------------------------------------------- hwy_list_targets
# Generate a tool to print the compiled-in targets as defined by the current
@@ -287,22 +219,17 @@ target_include_directories(hwy_list_targets PRIVATE
# Naked target also not always could be run (due to the lack of '.\' prefix)
# Thus effective command to run should contain the full path
# and emulator prefix (if any).
if (NOT CMAKE_CROSSCOMPILING OR CMAKE_CROSSCOMPILING_EMULATOR)
add_custom_command(TARGET hwy_list_targets POST_BUILD
COMMAND ${CMAKE_CROSSCOMPILING_EMULATOR} $<TARGET_FILE:hwy_list_targets> || (exit 0))
endif()
# --------------------------------------------------------
# Allow skipping the following sections for projects that do not need them:
# tests, examples, benchmarks and installation.
if (HWY_EXAMPLES_TESTS_INSTALL)
# -------------------------------------------------------- install library
if (HWY_ENABLE_INSTALL)
install(TARGETS hwy
LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}")
DESTINATION "${CMAKE_INSTALL_LIBDIR}")
# Install all the headers keeping the relative path to the current directory
# when installing them.
foreach (source ${HWY_SOURCES})
@@ -314,9 +241,7 @@ foreach (source ${HWY_SOURCES})
endforeach()
install(TARGETS hwy_contrib
LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}")
DESTINATION "${CMAKE_INSTALL_LIBDIR}")
# Install all the headers keeping the relative path to the current directory
# when installing them.
foreach (source ${HWY_CONTRIB_SOURCES})
@@ -328,9 +253,7 @@ foreach (source ${HWY_CONTRIB_SOURCES})
endforeach()
install(TARGETS hwy_test
LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}")
DESTINATION "${CMAKE_INSTALL_LIBDIR}")
# Install all the headers keeping the relative path to the current directory
# when installing them.
foreach (source ${HWY_TEST_SOURCES})
@@ -349,9 +272,7 @@ foreach (pc libhwy.pc libhwy-contrib.pc libhwy-test.pc)
DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
endforeach()
endif() # HWY_ENABLE_INSTALL
# -------------------------------------------------------- Examples
if (HWY_ENABLE_EXAMPLES)
# Avoids mismatch between GTest's static CRT and our dynamic.
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
@@ -359,6 +280,7 @@ set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
# Programming exercise with integrated benchmark
add_executable(hwy_benchmark hwy/examples/benchmark.cc)
target_sources(hwy_benchmark PRIVATE
hwy/nanobenchmark.cc
hwy/nanobenchmark.h)
# Try adding either -DHWY_COMPILE_ONLY_SCALAR or -DHWY_COMPILE_ONLY_STATIC to
# observe the difference in targets printed.
@ -367,7 +289,6 @@ target_link_libraries(hwy_benchmark hwy)
set_target_properties(hwy_benchmark
PROPERTIES RUNTIME_OUTPUT_DIRECTORY "examples/")
endif() # HWY_ENABLE_EXAMPLES
# -------------------------------------------------------- Tests
include(CTest)
@@ -431,11 +352,9 @@ set(HWY_TEST_FILES
hwy/tests/compare_test.cc
hwy/tests/convert_test.cc
hwy/tests/crypto_test.cc
hwy/tests/demote_test.cc
hwy/tests/logical_test.cc
hwy/tests/mask_test.cc
hwy/tests/memory_test.cc
hwy/tests/shift_test.cc
hwy/tests/swizzle_test.cc
hwy/tests/test_util_test.cc
)
@@ -458,7 +377,7 @@ foreach (TESTFILE IN LISTS HWY_TEST_FILES)
target_link_libraries(${TESTNAME} hwy hwy_contrib hwy_test gtest gtest_main)
endif()
# Output test targets in the test directory.
set_target_properties(${TESTNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "tests")
set_target_properties(${TESTNAME} PROPERTIES PREFIX "tests/")
if (HWY_EMSCRIPTEN)
set_target_properties(${TESTNAME} PROPERTIES LINK_FLAGS "-s SINGLE_FILE=1")
@@ -475,3 +394,5 @@ endforeach ()
target_sources(skeleton_test PRIVATE hwy/examples/skeleton.cc)
endif() # BUILD_TESTING
endif() # HWY_EXAMPLES_TESTS_INSTALL

4 third_party/highway/CMakeLists.txt.in vendored

@@ -5,11 +5,11 @@ project(googletest-download NONE)
include(ExternalProject)
ExternalProject_Add(googletest
GIT_REPOSITORY https://github.com/google/googletest.git
GIT_TAG 43efa0a4efd40c78b9210d15373112081899a97c
GIT_TAG master
SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-src"
BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-build"
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
TEST_COMMAND ""
)
)

171 third_party/highway/README.md vendored

@@ -1,95 +1,30 @@
# Efficient and performance-portable vector software
# Efficient and performance-portable SIMD
[//]: # (placeholder, do not remove)
Highway is a C++ library for SIMD (Single Instruction, Multiple Data), i.e.
applying the same operation to multiple 'lanes' using a single CPU instruction.
Highway is a C++ library that provides portable SIMD/vector intrinsics.
## Why Highway?
## Why
We are passionate about high-performance software. We see major untapped
potential in CPUs (servers, mobile, desktops). Highway is for engineers who want
to reliably and economically push the boundaries of what is possible in
software.
## How
CPUs provide SIMD/vector instructions that apply the same operation to multiple
data items. This can reduce energy usage e.g. *fivefold* because fewer
instructions are executed. We also often see *5-10x* speedups.
Highway makes SIMD/vector programming practical and workable according to these
guiding principles:
**Does what you expect**: Highway is a C++ library with carefully-chosen
functions that map well to CPU instructions without extensive compiler
transformations. The resulting code is more predictable and robust to code
changes/compiler updates than autovectorization.
**Works on widely-used platforms**: Highway supports four architectures; the
same application code can target eight instruction sets, including those with
'scalable' vectors (size unknown at compile time). Highway only requires C++11
and supports four families of compilers. If you would like to use Highway on
other platforms, please raise an issue.
**Flexible to deploy**: Applications using Highway can run on heterogeneous
clouds or client devices, choosing the best available instruction set at
runtime. Alternatively, developers may choose to target a single instruction set
without any runtime overhead. In both cases, the application code is the same
except for swapping `HWY_STATIC_DISPATCH` with `HWY_DYNAMIC_DISPATCH` plus one
line of code.
**Suitable for a variety of domains**: Highway provides an extensive set of
operations, used for image processing (floating-point), compression, video
analysis, linear algebra, cryptography, sorting and random generation. We
recognise that new use-cases may require additional ops and are happy to add
them where it makes sense (e.g. no performance cliffs on some architectures). If
you would like to discuss, please file an issue.
**Rewards data-parallel design**: Highway provides tools such as Gather,
MaskedLoad, and FixedTag to enable speedups for legacy data structures. However,
the biggest gains are unlocked by designing algorithms and data structures for
scalable vectors. Helpful techniques include batching, structure-of-array
layouts, and aligned/padded allocations.
## Examples
Online demos using Compiler Explorer:
- [generating code for multiple targets](https://gcc.godbolt.org/z/n6rx6xK5h) (recommended)
- [single target using -m flags](https://gcc.godbolt.org/z/rGnjMevKG)
Projects using Highway: (to add yours, feel free to raise an issue or contact us
via the below email)
* [iresearch database index](https://github.com/iresearch-toolkit/iresearch/blob/e7638e7a4b99136ca41f82be6edccf01351a7223/core/utils/simd_utils.hpp)
* [JPEG XL image codec](https://github.com/libjxl/libjxl)
* [Grok JPEG 2000 image codec](https://github.com/GrokImageCompression/grok)
* [vectorized Quicksort](https://github.com/google/highway/tree/master/hwy/contrib/sort)
- more portable (same source code) than platform-specific intrinsics,
- works on a wider range of compilers than compiler-specific vector extensions,
- more dependable than autovectorization,
- easier to write/maintain than assembly language,
- supports **runtime dispatch**,
- supports **variable-length vector** architectures.
## Current status
### Targets
Supported targets: scalar, S-SSE3, SSE4, AVX2, AVX-512, AVX3_DL (~Icelake,
requires opt-in by defining `HWY_WANT_AVX3_DL`), NEON (ARMv7 and v8), SVE, SVE2,
requires opt-in by defining `HWY_WANT_AVX3_DL`), NEON (ARMv7 and v8), SVE,
WASM SIMD.
SVE was initially tested using farm_sve (see acknowledgments). A subset of RVV
is implemented and tested with LLVM and QEMU. Work is underway to add RVV ops
which were not yet supported by GCC.
SVE is tested using farm_sve (see acknowledgments). SVE2 is implemented but not
yet validated. A subset of RVV is implemented and tested with GCC and QEMU.
Work is underway to compile using LLVM, which has different intrinsics with AVL.
### Versioning
Highway releases aim to follow the semver.org system (MAJOR.MINOR.PATCH),
incrementing MINOR after backward-compatible additions and PATCH after
backward-compatible fixes. We recommend using releases (rather than the Git tip)
because they are tested more extensively, see below.
Version 0.11 is considered stable enough to use in other projects.
Version 1.0 will signal an increased focus on backwards compatibility and will
be reached after the RVV target is finished (planned for 2022H1).
### Testing
Version 0.11 is considered stable enough to use in other projects, and is
expected to remain backwards compatible unless serious issues are discovered
while finishing the RVV target. After that, Highway will reach version 1.0.
Continuous integration tests build with a recent version of Clang (running on
x86 and QEMU for ARM) and MSVC from VS2015 (running on x86).
@@ -98,15 +33,13 @@ Before releases, we also test on x86 with Clang and GCC, and ARMv7/8 via
GCC cross-compile and QEMU. See the
[testing process](g3doc/release_testing_process.md) for details.
### Related modules
The `contrib` directory contains SIMD-related utilities: an image class with
aligned rows, a math library (16 functions already implemented, mostly
trigonometry), and functions for computing dot products and sorting.
aligned rows, and a math library (16 functions already implemented, mostly
trigonometry).
## Installation
This project uses CMake to generate and build. In a Debian-based system you can
This project uses cmake to generate and build. In a Debian-based system you can
install it via:
```bash
@@ -122,8 +55,7 @@ installing gtest separately:
sudo apt install libgtest-dev
```
To build Highway as a shared or static library (depending on BUILD_SHARED_LIBS),
the standard CMake workflow can be used:
To build and test the library the standard cmake workflow can be used:
```bash
mkdir -p build && cd build
@@ -144,40 +76,31 @@ and their parameters, and the [instruction_matrix](g3doc/instruction_matrix.pdf)
indicates the number of instructions per operation.
We recommend using full SIMD vectors whenever possible for maximum performance
portability. To obtain them, pass a `ScalableTag<float>` (or equivalently
`HWY_FULL(float)`) tag to functions such as `Zero/Set/Load`. There are two
alternatives for use-cases requiring an upper bound on the lanes:
portability. To obtain them, pass a `HWY_FULL(float)` tag to functions such as
`Zero/Set/Load`. There is also the option of a vector of up to `N` (a power of
two <= 16/sizeof(T)) lanes of type `T`: `HWY_CAPPED(T, N)`. If `HWY_TARGET ==
HWY_SCALAR`, the vector always has one lane. For all other targets, up to
128-bit vectors are guaranteed to be available.
- For up to a power of two `N`, specify `CappedTag<T, N>` (or
equivalently `HWY_CAPPED(T, N)`). This is useful for data structures such as
a narrow matrix. A loop is still required because vectors may actually have
fewer than `N` lanes.
- For exactly a power of two `N` lanes, specify `FixedTag<T, N>`. The largest
supported `N` depends on the target, but is guaranteed to be at least
`16/sizeof(T)`.
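To make the tag choices above concrete, here is a minimal sketch (the names `dfull`, `dcap`, `dfix` and the lane counts are illustrative only):

```
namespace hn = hwy::HWY_NAMESPACE;

const hn::ScalableTag<float> dfull;  // full vector; hn::Lanes(dfull) is a runtime value
const hn::CappedTag<float, 8> dcap;  // at most 8 lanes; a target may provide fewer
const hn::FixedTag<float, 4> dfix;   // exactly 4 lanes (4 <= 16/sizeof(float))
const auto v = hn::Zero(dfull);      // tags are passed to ops such as Zero/Set/Load
```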
Functions using Highway must either be inside `namespace HWY_NAMESPACE {`
(possibly nested in one or more other namespaces defined by the project), OR
each op must be prefixed with `hn::`, e.g. `namespace hn = hwy::HWY_NAMESPACE;
hn::LoadDup128()`. Additionally, each function using Highway must either be
prefixed with `HWY_ATTR`, OR reside between `HWY_BEFORE_NAMESPACE()` and
`HWY_AFTER_NAMESPACE()`.
Functions using Highway must be inside `namespace HWY_NAMESPACE {`
(possibly nested in one or more other namespaces defined by the project), and
additionally either prefixed with `HWY_ATTR`, or residing between
`HWY_BEFORE_NAMESPACE()` and `HWY_AFTER_NAMESPACE()`.
* For static dispatch, `HWY_TARGET` will be the best available target among
`HWY_BASELINE_TARGETS`, i.e. those allowed for use by the compiler (see
[quick-reference](g3doc/quick_reference.md)). Functions inside
`HWY_NAMESPACE` can be called using `HWY_STATIC_DISPATCH(func)(args)` within
the same module they are defined in. You can call the function from other
modules by wrapping it in a regular function and declaring the regular
function in a header.
[quick-reference](g3doc/quick_reference.md)). Functions inside `HWY_NAMESPACE`
can be called using `HWY_STATIC_DISPATCH(func)(args)` within the same module
they are defined in. You can call the function from other modules by
wrapping it in a regular function and declaring the regular function in a
header.
* For dynamic dispatch, a table of function pointers is generated via the
`HWY_EXPORT` macro that is used by `HWY_DYNAMIC_DISPATCH(func)(args)` to
call the best function pointer for the current CPU's supported targets. A
module is automatically compiled for each target in `HWY_TARGETS` (see
[quick-reference](g3doc/quick_reference.md)) if `HWY_TARGET_INCLUDE` is
defined and `foreach_target.h` is included.
defined and foreach_target.h is included.
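As a hedged end-to-end sketch of both dispatch styles described above (the file path, namespace `project`, and function `MulBy2` are hypothetical; the macros and ops are those named in this README):

```
#include "hwy/base.h"

// Generate code for each target in HWY_TARGETS by re-including this file:
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "project/mul_by2.cc"  // hypothetical path of this file
#include "hwy/foreach_target.h"
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace project {
namespace HWY_NAMESPACE {
namespace hn = hwy::HWY_NAMESPACE;

// Assumes count is a multiple of the vector length (padded arrays).
void MulBy2(const float* HWY_RESTRICT in, float* HWY_RESTRICT out,
            size_t count) {
  const hn::ScalableTag<float> d;
  for (size_t i = 0; i < count; i += hn::Lanes(d)) {
    const auto v = hn::Load(d, in + i);
    hn::Store(hn::Add(v, v), d, out + i);
  }
}

}  // namespace HWY_NAMESPACE
}  // namespace project
HWY_AFTER_NAMESPACE();

#if HWY_ONCE
namespace project {
HWY_EXPORT(MulBy2);  // per-target function-pointer table for dynamic dispatch

void CallMulBy2(const float* in, float* out, size_t count) {
  // Static dispatch (baseline target, no runtime overhead):
  //   HWY_STATIC_DISPATCH(MulBy2)(in, out, count);
  // Dynamic dispatch (best target supported by the current CPU):
  HWY_DYNAMIC_DISPATCH(MulBy2)(in, out, count);
}
}  // namespace project
#endif  // HWY_ONCE
```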
## Compiler flags
@@ -200,17 +123,17 @@ ensure proper VEX code generation for AVX2 targets.
To vectorize a loop, "strip-mining" transforms it into an outer loop and inner
loop with number of iterations matching the preferred vector width.
In this section, let `T` denote the element type, `d = ScalableTag<T>`, `count`
the number of elements to process, and `N = Lanes(d)` the number of lanes in a
full vector. Assume the loop body is given as a function `template<bool partial,
class D> void LoopBody(D d, size_t index, size_t max_n)`.
In this section, let `T` denote the element type, `d = HWY_FULL(T)`, `count` the
number of elements to process, and `N = Lanes(d)` the number of lanes in a full
vector. Assume the loop body is given as a function `template<bool partial,
class D> void LoopBody(D d, size_t max_n)`.
Highway offers several ways to express loops where `N` need not divide `count`:
* Ensure all inputs/outputs are padded. Then the loop is simply
```
for (size_t i = 0; i < count; i += N) LoopBody<false>(d, i, 0);
for (size_t i = 0; i < count; i += N) LoopBody<false>(d, 0);
```
Here, the template parameter and second function argument are not needed.
@@ -226,8 +149,8 @@ Highway offers several ways to express loops where `N` need not divide `count`:
```
size_t i = 0;
for (; i + N <= count; i += N) LoopBody<false>(d, i, 0);
for (; i < count; ++i) LoopBody<false>(HWY_CAPPED(T, 1)(), i, 0);
for (; i + N <= count; i += N) LoopBody<false>(d, 0);
for (; i < count; ++i) LoopBody<false>(HWY_CAPPED(T, 1)(), 0);
```
The template parameter and second function arguments are again not needed.
@@ -240,20 +163,18 @@ Highway offers several ways to express loops where `N` need not divide `count`:
```
size_t i = 0;
for (; i + N <= count; i += N) {
LoopBody<false>(d, i, 0);
LoopBody<false>(d, 0);
}
if (i < count) {
LoopBody<true>(d, i, count - i);
LoopBody<true>(d, count - i);
}
```
Now the template parameter and third function argument can be used inside
Now the template parameter and second function argument can be used inside
`LoopBody` to 'blend' the new partial vector with previous memory contents:
`Store(IfThenElse(FirstN(d, N), partial, prev_full), d, aligned_pointer);`.
This is a good default when it is infeasible to ensure vectors are padded.
In contrast to the scalar loop, only a single final iteration is needed.
The increased code size from two loop bodies is expected to be worthwhile
because it avoids the cost of masking in all but the final iteration.
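A self-contained hedged sketch of this third pattern (the function `AddOne` is hypothetical; as noted above, the blend assumes memory up to the next vector boundary past `count` is readable and writable):

```
namespace hn = hwy::HWY_NAMESPACE;

void AddOne(float* HWY_RESTRICT p, size_t count) {
  const hn::ScalableTag<float> d;
  const size_t N = hn::Lanes(d);
  const auto one = hn::Set(d, 1.0f);
  size_t i = 0;
  for (; i + N <= count; i += N) {  // full vectors, no masking
    hn::Store(hn::Add(hn::Load(d, p + i), one), d, p + i);
  }
  if (i < count) {  // single final iteration
    const auto prev = hn::LoadU(d, p + i);  // previous memory contents
    const auto partial = hn::Add(prev, one);
    // Keep the first count-i lanes of the new value, restore the rest.
    hn::StoreU(hn::IfThenElse(hn::FirstN(d, count - i), partial, prev), d,
               p + i);
  }
}
```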
## Additional resources

12 third_party/highway/debian/changelog vendored

@@ -1,15 +1,3 @@
highway (0.16.0-1) UNRELEASED; urgency=medium
* Add contrib/sort (vectorized quicksort)
* Add IfNegativeThenElse, IfVecThenElse
* Add Reverse2,4,8, ReverseBlocks, DupEven/Odd, AESLastRound
* Add OrAnd, Min128, Max128, Lt128, SumsOf8
* Support capped/partial vectors on RVV/SVE, int64 in WASM
* Support SVE2, shared library build
* Remove deprecated overloads without the required d arg (UpperHalf etc.)
-- Jan Wassenberg <janwas@google.com> Thu, 03 Feb 2022 11:00:00 +0100
highway (0.15.0-1) UNRELEASED; urgency=medium
* New ops: CompressBlendedStore, ConcatOdd/Even, IndicesFromVec

21 third_party/highway/hwy/aligned_allocator.h vendored

@@ -18,11 +18,8 @@
// Memory allocator with support for alignment and offsets.
#include <stddef.h>
#include <memory>
#include "hwy/highway_export.h"
namespace hwy {
// Minimum alignment of allocated memory for use in HWY_ASSUME_ALIGNED, which
@@ -39,15 +36,15 @@ using FreePtr = void (*)(void* opaque, void* memory);
// bytes of newly allocated memory, aligned to the larger of HWY_ALIGNMENT and
// the vector size. Calls `alloc` with the passed `opaque` pointer to obtain
// memory or malloc() if it is null.
HWY_DLLEXPORT void* AllocateAlignedBytes(size_t payload_size,
AllocPtr alloc_ptr, void* opaque_ptr);
void* AllocateAlignedBytes(size_t payload_size, AllocPtr alloc_ptr,
void* opaque_ptr);
// Frees all memory. No effect if `aligned_pointer` == nullptr, otherwise it
// must have been returned from a previous call to `AllocateAlignedBytes`.
// Calls `free_ptr` with the passed `opaque_ptr` pointer to free the memory; if
// `free_ptr` function is null, uses the default free().
HWY_DLLEXPORT void FreeAlignedBytes(const void* aligned_pointer,
FreePtr free_ptr, void* opaque_ptr);
void FreeAlignedBytes(const void* aligned_pointer, FreePtr free_ptr,
void* opaque_ptr);
// Class that deletes the aligned pointer passed to operator() calling the
// destructor before freeing the pointer. This is equivalent to the
@@ -79,10 +76,8 @@ class AlignedDeleter {
// array. TypeArrayDeleter<T> would match this prototype.
using ArrayDeleter = void (*)(void* t_ptr, size_t t_size);
HWY_DLLEXPORT static void DeleteAlignedArray(void* aligned_pointer,
FreePtr free_ptr,
void* opaque_ptr,
ArrayDeleter deleter);
static void DeleteAlignedArray(void* aligned_pointer, FreePtr free_ptr,
void* opaque_ptr, ArrayDeleter deleter);
FreePtr free_;
void* opaque_ptr_;
@@ -112,8 +107,8 @@ template <typename T, typename... Args>
AlignedUniquePtr<T> MakeUniqueAligned(Args&&... args) {
T* ptr = static_cast<T*>(AllocateAlignedBytes(
sizeof(T), /*alloc_ptr=*/nullptr, /*opaque_ptr=*/nullptr));
return AlignedUniquePtr<T>(new (ptr) T(std::forward<Args>(args)...),
AlignedDeleter());
return AlignedUniquePtr<T>(
new (ptr) T(std::forward<Args>(args)...), AlignedDeleter());
}
// Helpers for array allocators (avoids overflow)
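For context, a brief hedged usage sketch of this allocator API (the function `Demo` is hypothetical; type names are as declared in this header):

```
#include "hwy/aligned_allocator.h"

void Demo(size_t n) {
  // Array of n floats aligned for SIMD; freed automatically on scope exit.
  hwy::AlignedFreeUniquePtr<float[]> arr = hwy::AllocateAligned<float>(n);
  arr[0] = 1.0f;

  // Single aligned object constructed in place; ~T runs before freeing.
  hwy::AlignedUniquePtr<int> num = hwy::MakeUniqueAligned<int>(42);
}
```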

252 third_party/highway/hwy/base.h vendored

@@ -24,7 +24,6 @@
#include <cfloat>
#include "hwy/detect_compiler_arch.h"
#include "hwy/highway_export.h"
//------------------------------------------------------------------------------
// Compiler-specific definitions
@@ -185,6 +184,10 @@
} while (0)
#endif
#if defined(HWY_EMULATE_SVE)
class FarmFloat16;
#endif
namespace hwy {
//------------------------------------------------------------------------------
@@ -202,9 +205,7 @@ static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
//------------------------------------------------------------------------------
// Alignment
// Potentially useful for LoadDup128 and capped vectors. In other cases, arrays
// should be allocated dynamically via aligned_allocator.h because Lanes() may
// exceed the stack size.
// For stack-allocated partial arrays or LoadDup128.
#if HWY_ARCH_X86
#define HWY_ALIGN_MAX alignas(64)
#elif HWY_ARCH_RVV && defined(__riscv_vector)
@@ -227,7 +228,9 @@ static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
#pragma pack(push, 1)
#if HWY_NATIVE_FLOAT16
#if defined(HWY_EMULATE_SVE)
using float16_t = FarmFloat16;
#elif HWY_NATIVE_FLOAT16
using float16_t = __fp16;
// Clang does not allow __fp16 arguments, but scalar.h requires LaneType
// arguments, so use a wrapper.
@@ -250,15 +253,15 @@ using float64_t = double;
//------------------------------------------------------------------------------
// Controlling overload resolution (SFINAE)
template <bool Condition>
template <bool Condition, class T>
struct EnableIfT {};
template <>
struct EnableIfT<true> {
using type = void;
template <class T>
struct EnableIfT<true, T> {
using type = T;
};
template <bool Condition>
using EnableIf = typename EnableIfT<Condition>::type;
template <bool Condition, class T = void>
using EnableIf = typename EnableIfT<Condition, T>::type;
template <typename T, typename U>
struct IsSameT {
@@ -280,7 +283,7 @@ HWY_API constexpr bool IsSame() {
//
// Note that enabling for exactly 128 bits is unnecessary because a function can
// simply be overloaded with Vec128<T> and/or Full128<T> tag. Enabling for other
// sizes (e.g. 64 bit) can be achieved via Simd<T, 8 / sizeof(T), 0>.
// sizes (e.g. 64 bit) can be achieved via Simd<T, 8 / sizeof(T)>.
#define HWY_IF_LE128(T, N) hwy::EnableIf<N * sizeof(T) <= 16>* = nullptr
#define HWY_IF_LE64(T, N) hwy::EnableIf<N * sizeof(T) <= 8>* = nullptr
#define HWY_IF_LE32(T, N) hwy::EnableIf<N * sizeof(T) <= 4>* = nullptr
@@ -316,6 +319,102 @@ struct RemoveConstT<const T> {
template <class T>
using RemoveConst = typename RemoveConstT<T>::type;
//------------------------------------------------------------------------------
// Type traits
template <typename T>
HWY_API constexpr bool IsFloat() {
// Cannot use T(1.25) != T(1) for float16_t, which can only be converted to or
// from a float, not compared.
return IsSame<T, float>() || IsSame<T, double>();
}
template <typename T>
HWY_API constexpr bool IsSigned() {
return T(0) > T(-1);
}
template <>
constexpr bool IsSigned<float16_t>() {
return true;
}
template <>
constexpr bool IsSigned<bfloat16_t>() {
return true;
}
// Largest/smallest representable integer values.
template <typename T>
HWY_API constexpr T LimitsMax() {
static_assert(!IsFloat<T>(), "Only for integer types");
return IsSigned<T>() ? T((1ULL << (sizeof(T) * 8 - 1)) - 1)
: static_cast<T>(~0ull);
}
template <typename T>
HWY_API constexpr T LimitsMin() {
static_assert(!IsFloat<T>(), "Only for integer types");
return IsSigned<T>() ? T(-1) - LimitsMax<T>() : T(0);
}
// Largest/smallest representable value (integer or float). This naming avoids
// confusion with numeric_limits<float>::min() (the smallest positive value).
template <typename T>
HWY_API constexpr T LowestValue() {
return LimitsMin<T>();
}
template <>
constexpr float LowestValue<float>() {
return -FLT_MAX;
}
template <>
constexpr double LowestValue<double>() {
return -DBL_MAX;
}
template <typename T>
HWY_API constexpr T HighestValue() {
return LimitsMax<T>();
}
template <>
constexpr float HighestValue<float>() {
return FLT_MAX;
}
template <>
constexpr double HighestValue<double>() {
return DBL_MAX;
}
// Returns bitmask of the exponent field in IEEE binary32/64.
template <typename T>
constexpr T ExponentMask() {
static_assert(sizeof(T) == 0, "Only instantiate the specializations");
return 0;
}
template <>
constexpr uint32_t ExponentMask<uint32_t>() {
return 0x7F800000;
}
template <>
constexpr uint64_t ExponentMask<uint64_t>() {
return 0x7FF0000000000000ULL;
}
// Returns 1 << mantissa_bits as a floating-point number. All integers whose
// absolute value are less than this can be represented exactly.
template <typename T>
constexpr T MantissaEnd() {
static_assert(sizeof(T) == 0, "Only instantiate the specializations");
return 0;
}
template <>
constexpr float MantissaEnd<float>() {
return 8388608.0f; // 1 << 23
}
template <>
constexpr double MantissaEnd<double>() {
// floating point literal with p52 requires C++17.
return 4503599627370496.0; // 1 << 52
}
//------------------------------------------------------------------------------
// Type relations
@@ -457,118 +556,6 @@ using SignedFromSize = typename detail::TypeFromSize<N>::Signed;
template <size_t N>
using FloatFromSize = typename detail::TypeFromSize<N>::Float;
//------------------------------------------------------------------------------
// Type traits
template <typename T>
HWY_API constexpr bool IsFloat() {
// Cannot use T(1.25) != T(1) for float16_t, which can only be converted to or
// from a float, not compared.
return IsSame<T, float>() || IsSame<T, double>();
}
template <typename T>
HWY_API constexpr bool IsSigned() {
return T(0) > T(-1);
}
template <>
constexpr bool IsSigned<float16_t>() {
return true;
}
template <>
constexpr bool IsSigned<bfloat16_t>() {
return true;
}
// Largest/smallest representable integer values.
template <typename T>
HWY_API constexpr T LimitsMax() {
static_assert(!IsFloat<T>(), "Only for integer types");
using TU = MakeUnsigned<T>;
return static_cast<T>(IsSigned<T>() ? (static_cast<TU>(~0ull) >> 1)
: static_cast<TU>(~0ull));
}
template <typename T>
HWY_API constexpr T LimitsMin() {
static_assert(!IsFloat<T>(), "Only for integer types");
return IsSigned<T>() ? T(-1) - LimitsMax<T>() : T(0);
}
// Largest/smallest representable value (integer or float). This naming avoids
// confusion with numeric_limits<float>::min() (the smallest positive value).
template <typename T>
HWY_API constexpr T LowestValue() {
return LimitsMin<T>();
}
template <>
constexpr float LowestValue<float>() {
return -FLT_MAX;
}
template <>
constexpr double LowestValue<double>() {
return -DBL_MAX;
}
template <typename T>
HWY_API constexpr T HighestValue() {
return LimitsMax<T>();
}
template <>
constexpr float HighestValue<float>() {
return FLT_MAX;
}
template <>
constexpr double HighestValue<double>() {
return DBL_MAX;
}
// Returns bitmask of the exponent field in IEEE binary32/64.
template <typename T>
constexpr T ExponentMask() {
static_assert(sizeof(T) == 0, "Only instantiate the specializations");
return 0;
}
template <>
constexpr uint32_t ExponentMask<uint32_t>() {
return 0x7F800000;
}
template <>
constexpr uint64_t ExponentMask<uint64_t>() {
return 0x7FF0000000000000ULL;
}
// Returns bitmask of the mantissa field in IEEE binary32/64.
template <typename T>
constexpr T MantissaMask() {
static_assert(sizeof(T) == 0, "Only instantiate the specializations");
return 0;
}
template <>
constexpr uint32_t MantissaMask<uint32_t>() {
return 0x007FFFFF;
}
template <>
constexpr uint64_t MantissaMask<uint64_t>() {
return 0x000FFFFFFFFFFFFFULL;
}
// Returns 1 << mantissa_bits as a floating-point number. All integers whose
// absolute value are less than this can be represented exactly.
template <typename T>
constexpr T MantissaEnd() {
static_assert(sizeof(T) == 0, "Only instantiate the specializations");
return 0;
}
template <>
constexpr float MantissaEnd<float>() {
return 8388608.0f; // 1 << 23
}
template <>
constexpr double MantissaEnd<double>() {
// floating point literal with p52 requires C++17.
return 4503599627370496.0; // 1 << 52
}
//------------------------------------------------------------------------------
// Helper functions
@@ -674,21 +661,14 @@ HWY_API size_t PopCount(uint64_t x) {
#endif
}
// Skip HWY_API due to GCC "function not considered for inlining". Previously
// such errors were caused by underlying type mismatches, but it's not clear
// what is still mismatched despite all the casts.
template <typename TI>
/*HWY_API*/ constexpr size_t FloorLog2(TI x) {
return x == TI{1}
? 0
: static_cast<size_t>(FloorLog2(static_cast<TI>(x >> 1)) + 1);
HWY_API constexpr size_t FloorLog2(TI x) {
return x == 1 ? 0 : FloorLog2(x >> 1) + 1;
}
template <typename TI>
/*HWY_API*/ constexpr size_t CeilLog2(TI x) {
return x == TI{1}
? 0
: static_cast<size_t>(FloorLog2(static_cast<TI>(x - 1)) + 1);
HWY_API constexpr size_t CeilLog2(TI x) {
return x == 1 ? 0 : FloorLog2(x - 1) + 1;
}
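As a quick sanity check of the recurrences above (a sketch; assumes `hwy/base.h` is included, and the values follow directly from the definitions):

```
static_assert(hwy::FloorLog2(8u) == 3, "2^3 == 8");
static_assert(hwy::FloorLog2(9u) == 3, "rounds down");
static_assert(hwy::CeilLog2(9u) == 4, "rounds up");
static_assert(hwy::CeilLog2(8u) == 3, "exact powers of two are unchanged");
```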
#if HWY_COMPILER_MSVC && HWY_ARCH_X86_64
@@ -747,7 +727,7 @@ HWY_API bfloat16_t BF16FromF32(float f) {
return bf;
}
HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
HWY_NORETURN void HWY_FORMAT(3, 4)
Abort(const char* file, int line, const char* format, ...);
} // namespace hwy

6 third_party/highway/hwy/cache_control.h vendored

@@ -36,7 +36,9 @@
// undefine them in this header; these functions are anyway deprecated.
// TODO(janwas): remove when these functions are removed.
#pragma push_macro("LoadFence")
#pragma push_macro("StoreFence")
#undef LoadFence
#undef StoreFence
namespace hwy {
@@ -70,6 +72,9 @@ HWY_INLINE HWY_ATTR_CACHE void FlushStream() {
#endif
}
// DEPRECATED, replace with `FlushStream`.
HWY_INLINE HWY_ATTR_CACHE void StoreFence() { FlushStream(); }
// Optionally begins loading the cache line containing "p" to reduce latency of
// subsequent actual loads.
template <typename T>
@@ -104,6 +109,7 @@ HWY_INLINE HWY_ATTR_CACHE void Pause() {
} // namespace hwy
// TODO(janwas): remove when these functions are removed. (See above.)
#pragma pop_macro("StoreFence")
#pragma pop_macro("LoadFence")
#endif // HIGHWAY_HWY_CACHE_CONTROL_H_


@@ -14,14 +14,15 @@
#include "hwy/contrib/image/image.h"
#include <algorithm> // swap
#include <cstddef>
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/image/image.cc"
#include <algorithm> // swap
#include "hwy/foreach_target.h"
#include "hwy/highway.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {


@@ -27,13 +27,12 @@
#include "hwy/aligned_allocator.h"
#include "hwy/base.h"
#include "hwy/highway_export.h"
namespace hwy {
// Type-independent parts of Image<> - reduces code duplication and facilitates
// moving member function implementations to cc file.
struct HWY_CONTRIB_DLLEXPORT ImageBase {
struct ImageBase {
// Returns required alignment in bytes for externally allocated memory.
static size_t VectorSize();
@@ -101,7 +100,8 @@ struct HWY_CONTRIB_DLLEXPORT ImageBase {
protected:
// Returns pointer to the start of a row.
HWY_INLINE void* VoidRow(const size_t y) const {
#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
defined(THREAD_SANITIZER)
if (y >= ysize_) {
HWY_ABORT("Row(%" PRIu64 ") >= %u\n", static_cast<uint64_t>(y), ysize_);
}
@@ -291,7 +291,8 @@ class Image3 {
private:
// Returns pointer to the start of a row.
HWY_INLINE void* VoidPlaneRow(const size_t c, const size_t y) const {
#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
defined(THREAD_SANITIZER)
if (c >= kNumPlanes || y >= ysize()) {
HWY_ABORT("PlaneRow(%" PRIu64 ", %" PRIu64 ") >= %" PRIu64 "\n",
static_cast<uint64_t>(c), static_cast<uint64_t>(y),


@@ -51,7 +51,7 @@ struct TestAlignedT {
for (size_t y = 0; y < ysize; ++y) {
T* HWY_RESTRICT row = img.MutableRow(y);
for (size_t x = 0; x < xsize; x += Lanes(d)) {
const auto values = Iota(d, static_cast<T>(dist(rng)));
const auto values = Iota(d, dist(rng));
Store(values, d, row + x);
}
}


@@ -486,7 +486,7 @@ struct AsinImpl<float> {
}
};
#if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
#if HWY_CAP_FLOAT64 && HWY_CAP_INTEGER64
template <>
struct AsinImpl<double> {
@@ -531,7 +531,7 @@ struct AtanImpl<float> {
}
};
#if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
#if HWY_CAP_FLOAT64 && HWY_CAP_INTEGER64
template <>
struct AtanImpl<double> {
@@ -635,7 +635,7 @@ struct CosSinImpl<float> {
}
};
#if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
#if HWY_CAP_FLOAT64 && HWY_CAP_INTEGER64
template <>
struct CosSinImpl<double> {
@@ -787,7 +787,7 @@ struct LogImpl<float> {
}
};
#if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
#if HWY_CAP_FLOAT64 && HWY_CAP_INTEGER64
template <>
struct ExpImpl<double> {
// Rounds double toward zero and returns as int32_t.


@@ -61,7 +61,7 @@ HWY_NOINLINE void TestMath(const std::string name, T (*fx1)(T),
uint64_t max_ulp = 0;
// Emulation is slower, so cannot afford as many.
constexpr UintT kSamplesPerRange = static_cast<UintT>(AdjustedReps(4000));
constexpr UintT kSamplesPerRange = static_cast<UintT>(AdjustedReps(10000));
for (int range_index = 0; range_index < range_count; ++range_index) {
const UintT start = ranges[range_index][0];
const UintT stop = ranges[range_index][1];
@@ -96,11 +96,24 @@ HWY_NOINLINE void TestMath(const std::string name, T (*fx1)(T),
HWY_ASSERT(max_ulp <= max_error_ulp);
}
// TODO(janwas): remove once RVV supports fractional LMUL
#undef DEFINE_MATH_TEST_FUNC
#if HWY_TARGET == HWY_RVV
#define DEFINE_MATH_TEST_FUNC(NAME) \
HWY_NOINLINE void TestAll##NAME() { \
ForFloatTypes(ForShrinkableVectors<Test##NAME>()); \
}
#else
#define DEFINE_MATH_TEST_FUNC(NAME) \
HWY_NOINLINE void TestAll##NAME() { \
ForFloatTypes(ForPartialVectors<Test##NAME>()); \
}
#endif
#undef DEFINE_MATH_TEST
#define DEFINE_MATH_TEST(NAME, F32x1, F32xN, F32_MIN, F32_MAX, F32_ERROR, \
F64x1, F64xN, F64_MIN, F64_MAX, F64_ERROR) \

133 third_party/highway/hwy/contrib/sort/BUILD vendored

@@ -1,133 +0,0 @@
package(default_visibility = ["//visibility:public"])
licenses(["notice"])
# Unused on Bazel builds, where this is not defined/known; Copybara replaces
# usages with an empty list.
COMPAT = [
"//buildenv/target:non_prod", # includes mobile/vendor.
]
cc_library(
name = "vqsort",
srcs = [
# Split into separate files to reduce MSVC build time.
"vqsort.cc",
"vqsort_i16a.cc",
"vqsort_i16d.cc",
"vqsort_u16a.cc",
"vqsort_u16d.cc",
"vqsort_f32a.cc",
"vqsort_f32d.cc",
"vqsort_i32a.cc",
"vqsort_i32d.cc",
"vqsort_u32a.cc",
"vqsort_u32d.cc",
"vqsort_f64a.cc",
"vqsort_f64d.cc",
"vqsort_i64a.cc",
"vqsort_i64d.cc",
"vqsort_u64a.cc",
"vqsort_u64d.cc",
"vqsort_128a.cc",
"vqsort_128d.cc",
],
hdrs = [
"disabled_targets.h",
"vqsort.h", # public interface
],
compatible_with = [],
textual_hdrs = [
"shared-inl.h",
"sorting_networks-inl.h",
"traits-inl.h",
"traits128-inl.h",
"vqsort-inl.h",
],
deps = [
# Only if VQSORT_SECURE_RNG is set.
# "//third_party/absl/random",
"//:hwy",
],
)
# -----------------------------------------------------------------------------
# Internal-only targets
cc_library(
name = "helpers",
testonly = 1,
textual_hdrs = [
"algo-inl.h",
"result-inl.h",
],
deps = [
":vqsort",
"//:nanobenchmark",
# Required for HAVE_PDQSORT, but that is unused and this is
# unavailable to Bazel builds, hence commented out.
# "//third_party/boost/allowed",
# Avoid ips4o and thus TBB to work around hwloc build failure.
],
)
cc_binary(
name = "print_network",
testonly = 1,
srcs = ["print_network.cc"],
deps = [
":helpers",
":vqsort",
"//:hwy",
],
)
cc_test(
name = "sort_test",
size = "medium",
srcs = ["sort_test.cc"],
features = ["fully_static_link"],
linkstatic = True,
local_defines = ["HWY_IS_TEST"],
# for test_suite.
tags = ["hwy_ops_test"],
deps = [
":helpers",
":vqsort",
"@com_google_googletest//:gtest_main",
"//:hwy",
"//:hwy_test_util",
],
)
cc_binary(
name = "bench_sort",
testonly = 1,
srcs = ["bench_sort.cc"],
features = ["fully_static_link"],
linkstatic = True,
local_defines = ["HWY_IS_TEST"],
deps = [
":helpers",
":vqsort",
"@com_google_googletest//:gtest_main",
"//:hwy",
"//:hwy_test_util",
],
)
cc_binary(
name = "bench_parallel",
testonly = 1,
srcs = ["bench_parallel.cc"],
features = ["fully_static_link"],
linkstatic = True,
local_defines = ["HWY_IS_TEST"],
deps = [
":helpers",
":vqsort",
"@com_google_googletest//:gtest_main",
"//:hwy",
"//:hwy_test_util",
],
)


@@ -1,395 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Normal include guard for target-independent parts
#ifndef HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_
#define HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_
#include <stdint.h>
#include <string.h> // memcpy
#include <algorithm>
#include <cmath> // std::abs
#include <vector>
#include "hwy/base.h"
#include "hwy/contrib/sort/vqsort.h"
// Third-party algorithms
#define HAVE_AVX2SORT 0
#define HAVE_IPS4O 0
#define HAVE_PARALLEL_IPS4O (HAVE_IPS4O && 1)
#define HAVE_PDQSORT 0
#define HAVE_SORT512 0
#if HAVE_AVX2SORT
HWY_PUSH_ATTRIBUTES("avx2,avx")
#include "avx2sort.h"
HWY_POP_ATTRIBUTES
#endif
#if HAVE_IPS4O
#include "third_party/ips4o/include/ips4o.hpp"
#include "third_party/ips4o/include/ips4o/thread_pool.hpp"
#endif
#if HAVE_PDQSORT
#include "third_party/boost/allowed/sort/sort.hpp"
#endif
#if HAVE_SORT512
#include "sort512.h"
#endif
namespace hwy {
enum class Dist { kUniform8, kUniform16, kUniform32 };
std::vector<Dist> AllDist() {
return {/*Dist::kUniform8,*/ Dist::kUniform16, Dist::kUniform32};
}
const char* DistName(Dist dist) {
switch (dist) {
case Dist::kUniform8:
return "uniform8";
case Dist::kUniform16:
return "uniform16";
case Dist::kUniform32:
return "uniform32";
}
return "unreachable";
}
template <typename T>
class InputStats {
public:
void Notify(T value) {
min_ = std::min(min_, value);
max_ = std::max(max_, value);
sumf_ += static_cast<double>(value);
count_ += 1;
}
bool operator==(const InputStats& other) const {
if (count_ != other.count_) {
HWY_ABORT("count %d vs %d\n", static_cast<int>(count_),
static_cast<int>(other.count_));
}
if (min_ != other.min_ || max_ != other.max_) {
HWY_ABORT("minmax %f/%f vs %f/%f\n", double(min_), double(max_),
double(other.min_), double(other.max_));
}
// Sum helps detect duplicated/lost values
if (sumf_ != other.sumf_) {
// Allow some tolerance because kUniform32 * num can exceed double
// precision.
const double mul = 1E-9; // prevent destructive cancellation
const double err = std::abs(sumf_ * mul - other.sumf_ * mul);
if (err > 1E-3) {
HWY_ABORT("Sum mismatch %.15e %.15e (%f) min %g max %g\n", sumf_,
other.sumf_, err, double(min_), double(max_));
}
}
return true;
}
private:
T min_ = hwy::HighestValue<T>();
T max_ = hwy::LowestValue<T>();
double sumf_ = 0.0;
size_t count_ = 0;
};
enum class Algo {
#if HAVE_AVX2SORT
kSEA,
#endif
#if HAVE_IPS4O
kIPS4O,
#endif
#if HAVE_PARALLEL_IPS4O
kParallelIPS4O,
#endif
#if HAVE_PDQSORT
kPDQ,
#endif
#if HAVE_SORT512
kSort512,
#endif
kStd,
kVQSort,
kHeap,
};
const char* AlgoName(Algo algo) {
switch (algo) {
#if HAVE_AVX2SORT
case Algo::kSEA:
return "sea";
#endif
#if HAVE_IPS4O
case Algo::kIPS4O:
return "ips4o";
#endif
#if HAVE_PARALLEL_IPS4O
case Algo::kParallelIPS4O:
return "par_ips4o";
#endif
#if HAVE_PDQSORT
case Algo::kPDQ:
return "pdq";
#endif
#if HAVE_SORT512
case Algo::kSort512:
return "sort512";
#endif
case Algo::kStd:
return "std";
case Algo::kVQSort:
return "vq";
case Algo::kHeap:
return "heap";
}
return "unreachable";
}
} // namespace hwy
#endif // HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_
// Per-target
#if defined(HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE) == \
defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
#undef HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
#else
#define HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
#endif
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/traits128-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h" // HeapSort
#include "hwy/tests/test_util-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
class Xorshift128Plus {
static HWY_INLINE uint64_t SplitMix64(uint64_t z) {
z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull;
z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
return z ^ (z >> 31);
}
public:
// Generates two vectors of 64-bit seeds via SplitMix64 and stores into
// `seeds`. Generating these afresh in each ChoosePivot is too expensive.
template <class DU64>
static void GenerateSeeds(DU64 du64, TFromD<DU64>* HWY_RESTRICT seeds) {
seeds[0] = SplitMix64(0x9E3779B97F4A7C15ull);
for (size_t i = 1; i < 2 * Lanes(du64); ++i) {
seeds[i] = SplitMix64(seeds[i - 1]);
}
}
// Need to pass in the state because vector cannot be class members.
template <class DU64>
static Vec<DU64> RandomBits(DU64 /* tag */, Vec<DU64>& state0,
Vec<DU64>& state1) {
Vec<DU64> s1 = state0;
Vec<DU64> s0 = state1;
const Vec<DU64> bits = Add(s1, s0);
state0 = s0;
s1 = Xor(s1, ShiftLeft<23>(s1));
state1 = Xor(s1, Xor(s0, Xor(ShiftRight<18>(s1), ShiftRight<5>(s0))));
return bits;
}
};
template <typename T, class DU64, HWY_IF_NOT_FLOAT(T)>
Vec<DU64> RandomValues(DU64 du64, Vec<DU64>& s0, Vec<DU64>& s1,
const Vec<DU64> mask) {
const Vec<DU64> bits = Xorshift128Plus::RandomBits(du64, s0, s1);
return And(bits, mask);
}
// Important to avoid denormals, which are flushed to zero by SIMD but not
// scalar sorts, and NaN, which may be ordered differently in scalar vs. SIMD.
template <typename T, class DU64, HWY_IF_FLOAT(T)>
Vec<DU64> RandomValues(DU64 du64, Vec<DU64>& s0, Vec<DU64>& s1,
const Vec<DU64> mask) {
const Vec<DU64> bits = Xorshift128Plus::RandomBits(du64, s0, s1);
const Vec<DU64> values = And(bits, mask);
#if HWY_TARGET == HWY_SCALAR // Cannot repartition u64 to i32
const RebindToSigned<DU64> di;
#else
const Repartition<MakeSigned<T>, DU64> di;
#endif
const RebindToFloat<decltype(di)> df;
// Avoid NaN/denormal by converting from (range-limited) integer.
const Vec<DU64> no_nan =
And(values, Set(du64, MantissaMask<MakeUnsigned<T>>()));
return BitCast(du64, ConvertTo(df, BitCast(di, no_nan)));
}
template <class DU64>
Vec<DU64> MaskForDist(DU64 du64, const Dist dist, size_t sizeof_t) {
switch (sizeof_t) {
case 2:
return Set(du64, (dist == Dist::kUniform8) ? 0x00FF00FF00FF00FFull
: 0xFFFFFFFFFFFFFFFFull);
case 4:
return Set(du64, (dist == Dist::kUniform8) ? 0x000000FF000000FFull
: (dist == Dist::kUniform16) ? 0x0000FFFF0000FFFFull
: 0xFFFFFFFFFFFFFFFFull);
case 8:
return Set(du64, (dist == Dist::kUniform8) ? 0x00000000000000FFull
: (dist == Dist::kUniform16) ? 0x000000000000FFFFull
: 0x00000000FFFFFFFFull);
default:
HWY_ABORT("Logic error");
return Zero(du64);
}
}
template <typename T>
InputStats<T> GenerateInput(const Dist dist, T* v, size_t num) {
SortTag<uint64_t> du64;
using VU64 = Vec<decltype(du64)>;
const size_t N64 = Lanes(du64);
auto buf = hwy::AllocateAligned<uint64_t>(2 * N64);
Xorshift128Plus::GenerateSeeds(du64, buf.get());
auto s0 = Load(du64, buf.get());
auto s1 = Load(du64, buf.get() + N64);
const VU64 mask = MaskForDist(du64, dist, sizeof(T));
const Repartition<T, decltype(du64)> d;
const size_t N = Lanes(d);
size_t i = 0;
for (; i + N <= num; i += N) {
const VU64 bits = RandomValues<T>(du64, s0, s1, mask);
#if HWY_ARCH_RVV
// v may not be 64-bit aligned
StoreU(bits, du64, buf.get());
memcpy(v + i, buf.get(), N64 * sizeof(uint64_t));
#else
StoreU(bits, du64, reinterpret_cast<uint64_t*>(v + i));
#endif
}
if (i < num) {
const VU64 bits = RandomValues<T>(du64, s0, s1, mask);
StoreU(bits, du64, buf.get());
memcpy(v + i, buf.get(), (num - i) * sizeof(T));
}
InputStats<T> input_stats;
for (size_t i = 0; i < num; ++i) {
input_stats.Notify(v[i]);
}
return input_stats;
}
struct ThreadLocal {
Sorter sorter;
};
struct SharedState {
#if HAVE_PARALLEL_IPS4O
ips4o::StdThreadPool pool{
HWY_MIN(16, static_cast<int>(std::thread::hardware_concurrency() / 2))};
#endif
std::vector<ThreadLocal> tls{1};
};
template <class Order, typename T>
void Run(Algo algo, T* HWY_RESTRICT inout, size_t num, SharedState& shared,
size_t thread) {
using detail::HeapSort;
using detail::LaneTraits;
using detail::SharedTraits;
switch (algo) {
#if HAVE_AVX2SORT
case Algo::kSEA:
return avx2::quicksort(inout, static_cast<int>(num));
#endif
#if HAVE_IPS4O
case Algo::kIPS4O:
if (Order().IsAscending()) {
return ips4o::sort(inout, inout + num, std::less<T>());
} else {
return ips4o::sort(inout, inout + num, std::greater<T>());
}
#endif
#if HAVE_PARALLEL_IPS4O
case Algo::kParallelIPS4O:
if (Order().IsAscending()) {
return ips4o::parallel::sort(inout, inout + num, std::less<T>());
} else {
return ips4o::parallel::sort(inout, inout + num, std::greater<T>());
}
#endif
#if HAVE_SORT512
case Algo::kSort512:
HWY_ABORT("not supported");
// return Sort512::Sort(inout, num);
#endif
#if HAVE_PDQSORT
case Algo::kPDQ:
if (Order().IsAscending()) {
return boost::sort::pdqsort_branchless(inout, inout + num,
std::less<T>());
} else {
return boost::sort::pdqsort_branchless(inout, inout + num,
std::greater<T>());
}
#endif
case Algo::kStd:
if (Order().IsAscending()) {
return std::sort(inout, inout + num, std::less<T>());
} else {
return std::sort(inout, inout + num, std::greater<T>());
}
case Algo::kVQSort:
return shared.tls[thread].sorter(inout, num, Order());
case Algo::kHeap:
HWY_ASSERT(sizeof(T) < 16);
if (Order().IsAscending()) {
const SharedTraits<LaneTraits<detail::OrderAscending>> st;
return HeapSort(st, inout, num);
} else {
const SharedTraits<LaneTraits<detail::OrderDescending>> st;
return HeapSort(st, inout, num);
}
default:
HWY_ABORT("Not implemented");
}
}
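A typical invocation, sketched under the assumption that the declarations above (GenerateInput, SharedState, SortAscending, Algo) are in scope; the function name is hypothetical:
void ExampleRunVQSort() {
  const size_t num = 1000;
  auto keys = hwy::AllocateAligned<int64_t>(num);
  (void)GenerateInput(Dist::kUniform32, keys.get(), num);
  SharedState shared;  // supplies the per-thread Sorter used by kVQSort
  Run<SortAscending>(Algo::kVQSort, keys.get(), num, shared, /*thread=*/0);
}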
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#endif // HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE


@ -1,243 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Concurrent, independent sorts for generating more memory traffic and testing
// scalability.
// clang-format off
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/bench_parallel.cc"
#include "hwy/foreach_target.h"
// After foreach_target
#include "hwy/contrib/sort/algo-inl.h"
#include "hwy/contrib/sort/result-inl.h"
#include "hwy/aligned_allocator.h"
// Last
#include "hwy/tests/test_util-inl.h"
// clang-format on
#include <stdint.h>
#include <stdio.h>
#include <condition_variable> //NOLINT
#include <functional>
#include <memory>
#include <mutex> //NOLINT
#include <thread> //NOLINT
#include <utility>
#include <vector>
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace {
#if HWY_TARGET != HWY_SCALAR
class ThreadPool {
public:
// Starts the given number of worker threads and blocks until they are ready.
explicit ThreadPool(
const size_t num_threads = std::thread::hardware_concurrency() / 2)
: num_threads_(num_threads) {
HWY_ASSERT(num_threads_ > 0);
threads_.reserve(num_threads_);
for (size_t i = 0; i < num_threads_; ++i) {
threads_.emplace_back(ThreadFunc, this, i);
}
WorkersReadyBarrier();
}
ThreadPool(const ThreadPool&) = delete;
ThreadPool& operator=(const ThreadPool&) = delete;
// Waits for all threads to exit.
~ThreadPool() {
StartWorkers(kWorkerExit);
for (std::thread& thread : threads_) {
thread.join();
}
}
size_t NumThreads() const { return threads_.size(); }
template <class Func>
void RunOnThreads(size_t max_threads, const Func& func) {
task_ = &CallClosure<Func>;
data_ = &func;
StartWorkers(max_threads);
WorkersReadyBarrier();
}
private:
// After construction and between calls to Run, workers are "ready", i.e.
// waiting on worker_start_cv_. They are "started" by sending a "command"
// and notifying all worker_start_cv_ waiters. (That is why all workers
// must be ready/waiting - otherwise, the notification will not reach all of
// them and the main thread waits in vain for them to report readiness.)
using WorkerCommand = uint64_t;
static constexpr WorkerCommand kWorkerWait = ~1ULL;
static constexpr WorkerCommand kWorkerExit = ~2ULL;
// Calls a closure (lambda with captures).
template <class Closure>
static void CallClosure(const void* f, size_t thread) {
(*reinterpret_cast<const Closure*>(f))(thread);
}
void WorkersReadyBarrier() {
std::unique_lock<std::mutex> lock(mutex_);
// Typically only a single iteration.
while (workers_ready_ != threads_.size()) {
workers_ready_cv_.wait(lock);
}
workers_ready_ = 0;
// Safely handle spurious worker wakeups.
worker_start_command_ = kWorkerWait;
}
// Precondition: all workers are ready.
void StartWorkers(const WorkerCommand worker_command) {
std::unique_lock<std::mutex> lock(mutex_);
worker_start_command_ = worker_command;
// Workers will need this lock, so release it before they wake up.
lock.unlock();
worker_start_cv_.notify_all();
}
static void ThreadFunc(ThreadPool* self, size_t thread) {
// Until kWorkerExit command received:
for (;;) {
std::unique_lock<std::mutex> lock(self->mutex_);
// Notify main thread that this thread is ready.
if (++self->workers_ready_ == self->num_threads_) {
self->workers_ready_cv_.notify_one();
}
RESUME_WAIT:
// Wait for a command.
self->worker_start_cv_.wait(lock);
const WorkerCommand command = self->worker_start_command_;
switch (command) {
case kWorkerWait: // spurious wakeup:
goto RESUME_WAIT; // lock still held, avoid incrementing ready.
case kWorkerExit:
return; // exits thread
default:
break;
}
lock.unlock();
// Command is the maximum number of threads that should run the task.
HWY_ASSERT(command < self->NumThreads());
if (thread < command) {
self->task_(self->data_, thread);
}
}
}
const size_t num_threads_;
// Unmodified after ctor, but cannot be const because we call thread::join().
std::vector<std::thread> threads_;
std::mutex mutex_;  // guards the condition variables and the state they signal.
std::condition_variable workers_ready_cv_;
size_t workers_ready_ = 0;
std::condition_variable worker_start_cv_;
WorkerCommand worker_start_command_;
// Written by main thread, read by workers (after mutex lock/unlock).
std::function<void(const void*, size_t)> task_; // points to CallClosure
const void* data_; // points to caller's Func
};
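Usage sketch (illustrative, not part of the benchmark): construct a pool, then hand it a closure. Note that RunOnThreads expects strictly fewer threads than the pool owns, per the assertion in ThreadFunc.
void ExamplePool() {
  ThreadPool pool(/*num_threads=*/4);
  pool.RunOnThreads(3, [](size_t thread) {
    printf("task ran on worker %d\n", static_cast<int>(thread));
  });
}  // ~ThreadPool sends kWorkerExit and joins all workers.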
template <class Order, typename T>
void RunWithoutVerify(const Dist dist, const size_t num, const Algo algo,
SharedState& shared, size_t thread) {
auto aligned = hwy::AllocateAligned<T>(num);
(void)GenerateInput(dist, aligned.get(), num);
const Timestamp t0;
Run<Order>(algo, aligned.get(), num, shared, thread);
HWY_ASSERT(aligned[0] < aligned[num - 1]);
}
void BenchParallel() {
// Not interested in benchmark results for other targets
if (HWY_TARGET != HWY_AVX3) return;
ThreadPool pool;
const size_t NT = pool.NumThreads();
using T = int64_t;
detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
size_t num = 100 * 1000 * 1000;
#if HAVE_IPS4O
const Algo algo = Algo::kIPS4O;
#else
const Algo algo = Algo::kVQSort;
#endif
const Dist dist = Dist::kUniform16;
SharedState shared;
shared.tls.resize(NT);
std::vector<Result> results;
for (size_t nt = 1; nt < NT; nt += HWY_MAX(1, NT / 16)) {
Timestamp t0;
    // Default capture: MSVC requires capturing algo/dist, whereas clang warns
    // that explicitly capturing them is unnecessary.
pool.RunOnThreads(nt, [=, &shared](size_t thread) {
RunWithoutVerify<SortAscending, T>(dist, num, algo, shared, thread);
});
const double sec = SecondsSince(t0);
results.push_back(MakeResult<T>(algo, dist, st, num, nt, sec));
results.back().Print();
}
}
#else
void BenchParallel() {}
#endif
} // namespace
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
HWY_BEFORE_TEST(BenchParallel);
HWY_EXPORT_AND_TEST_P(BenchParallel, BenchParallel);
} // namespace
} // namespace hwy
// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
#endif // HWY_ONCE


@ -1,259 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/bench_sort.cc"
#include "hwy/foreach_target.h"
// After foreach_target
#include "hwy/contrib/sort/algo-inl.h"
#include "hwy/contrib/sort/result-inl.h"
#include "hwy/contrib/sort/vqsort.h"
#include "hwy/contrib/sort/sorting_networks-inl.h" // SharedTraits
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/traits128-inl.h"
#include "hwy/tests/test_util-inl.h"
// clang-format on
#include <stdint.h>
#include <stdio.h>
#include <string.h> // memcpy
#include <vector>
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace {
using detail::LaneTraits;
using detail::OrderAscending;
using detail::OrderDescending;
using detail::SharedTraits;
#if HWY_TARGET != HWY_SCALAR
using detail::OrderAscending128;
using detail::OrderDescending128;
using detail::Traits128;
template <class Traits, typename T>
HWY_NOINLINE void BenchPartition() {
const SortTag<T> d;
detail::SharedTraits<Traits> st;
const Dist dist = Dist::kUniform8;
double sum = 0.0;
const size_t max_log2 = AdjustedLog2Reps(20);
for (size_t log2 = max_log2; log2 < max_log2 + 1; ++log2) {
const size_t num = 1ull << log2;
auto aligned = hwy::AllocateAligned<T>(num);
auto buf =
hwy::AllocateAligned<T>(hwy::SortConstants::PartitionBufNum(Lanes(d)));
std::vector<double> seconds;
const size_t num_reps = (1ull << (14 - log2 / 2)) * kReps;
for (size_t rep = 0; rep < num_reps; ++rep) {
(void)GenerateInput(dist, aligned.get(), num);
const Timestamp t0;
detail::Partition(d, st, aligned.get(), 0, num - 1, Set(d, T(128)),
buf.get());
seconds.push_back(SecondsSince(t0));
// 'Use' the result to prevent optimizing out the partition.
sum += static_cast<double>(aligned.get()[num / 2]);
}
MakeResult<T>(Algo::kVQSort, dist, st, num, 1,
SummarizeMeasurements(seconds))
.Print();
}
HWY_ASSERT(sum != 999999); // Prevent optimizing out
}
HWY_NOINLINE void BenchAllPartition() {
// Not interested in benchmark results for these targets
if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4 ||
HWY_TARGET == HWY_AVX2) {
return;
}
BenchPartition<LaneTraits<OrderDescending>, float>();
BenchPartition<LaneTraits<OrderAscending>, int64_t>();
BenchPartition<Traits128<OrderDescending128>, uint64_t>();
}
template <class Traits, typename T>
HWY_NOINLINE void BenchBase(std::vector<Result>& results) {
// Not interested in benchmark results for these targets
if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4) {
return;
}
const SortTag<T> d;
detail::SharedTraits<Traits> st;
const Dist dist = Dist::kUniform32;
const size_t N = Lanes(d);
const size_t num = SortConstants::BaseCaseNum(N);
auto keys = hwy::AllocateAligned<T>(num);
auto buf = hwy::AllocateAligned<T>(num + N);
std::vector<double> seconds;
double sum = 0; // prevents elision
constexpr size_t kMul = AdjustedReps(600); // ensures long enough to measure
for (size_t rep = 0; rep < kReps; ++rep) {
InputStats<T> input_stats = GenerateInput(dist, keys.get(), num);
const Timestamp t0;
for (size_t i = 0; i < kMul; ++i) {
detail::BaseCase(d, st, keys.get(), num, buf.get());
sum += static_cast<double>(keys[0]);
}
seconds.push_back(SecondsSince(t0));
// printf("%f\n", seconds.back());
HWY_ASSERT(VerifySort(st, input_stats, keys.get(), num, "BenchBase"));
}
HWY_ASSERT(sum < 1E99);
results.push_back(MakeResult<T>(Algo::kVQSort, dist, st, num * kMul, 1,
SummarizeMeasurements(seconds)));
}
HWY_NOINLINE void BenchAllBase() {
// Not interested in benchmark results for these targets
if (HWY_TARGET == HWY_SSSE3) {
return;
}
std::vector<Result> results;
BenchBase<LaneTraits<OrderAscending>, float>(results);
BenchBase<LaneTraits<OrderDescending>, int64_t>(results);
BenchBase<Traits128<OrderAscending128>, uint64_t>(results);
for (const Result& r : results) {
r.Print();
}
}
std::vector<Algo> AlgoForBench() {
return {
#if HAVE_AVX2SORT
Algo::kSEA,
#endif
#if HAVE_PARALLEL_IPS4O
Algo::kParallelIPS4O,
#endif
#if HAVE_IPS4O
Algo::kIPS4O,
#endif
#if HAVE_PDQSORT
Algo::kPDQ,
#endif
#if HAVE_SORT512
Algo::kSort512,
#endif
// Algo::kStd, // too slow to always benchmark
// Algo::kHeap, // too slow to always benchmark
Algo::kVQSort,
};
}
template <class Traits, typename T>
HWY_NOINLINE void BenchSort(size_t num) {
SharedState shared;
detail::SharedTraits<Traits> st;
auto aligned = hwy::AllocateAligned<T>(num);
for (Algo algo : AlgoForBench()) {
for (Dist dist : AllDist()) {
std::vector<double> seconds;
for (size_t rep = 0; rep < kReps; ++rep) {
InputStats<T> input_stats = GenerateInput(dist, aligned.get(), num);
const Timestamp t0;
Run<typename Traits::Order>(algo, aligned.get(), num, shared,
/*thread=*/0);
seconds.push_back(SecondsSince(t0));
// printf("%f\n", seconds.back());
HWY_ASSERT(
VerifySort(st, input_stats, aligned.get(), num, "BenchSort"));
}
MakeResult<T>(algo, dist, st, num, 1, SummarizeMeasurements(seconds))
.Print();
} // dist
} // algo
}
HWY_NOINLINE void BenchAllSort() {
// Not interested in benchmark results for these targets
if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4) {
return;
}
constexpr size_t K = 1000;
constexpr size_t M = K * K;
(void)K;
(void)M;
for (size_t num : {
#if HAVE_PARALLEL_IPS4O
100 * M,
#else
AdjustedReps(1 * M),
#endif
}) {
// BenchSort<LaneTraits<OrderAscending>, float>(num);
// BenchSort<LaneTraits<OrderDescending>, double>(num);
// BenchSort<LaneTraits<OrderAscending>, int16_t>(num);
BenchSort<LaneTraits<OrderDescending>, int32_t>(num);
BenchSort<LaneTraits<OrderAscending>, int64_t>(num);
// BenchSort<LaneTraits<OrderDescending>, uint16_t>(num);
// BenchSort<LaneTraits<OrderDescending>, uint32_t>(num);
// BenchSort<LaneTraits<OrderAscending>, uint64_t>(num);
BenchSort<Traits128<OrderAscending128>, uint64_t>(num);
    // BenchSort<Traits128<OrderDescending128>, uint64_t>(num);
}
}
#else
void BenchAllPartition() {}
void BenchAllBase() {}
void BenchAllSort() {}
#endif
} // namespace
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
HWY_BEFORE_TEST(BenchSort);
HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllPartition);
HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllBase);
HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllSort);
} // namespace
} // namespace hwy
// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
#endif // HWY_ONCE


@ -1,30 +0,0 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Speed up MSVC builds by building fewer targets. This header must be included
// from all TUs that contain a HWY_DYNAMIC_DISPATCH to vqsort, i.e. vqsort_*.cc.
// However, users of vqsort.h are unaffected.
#ifndef HIGHWAY_HWY_CONTRIB_SORT_DISABLED_TARGETS_H_
#define HIGHWAY_HWY_CONTRIB_SORT_DISABLED_TARGETS_H_
#include "hwy/base.h"
#if HWY_COMPILER_MSVC
#undef HWY_DISABLED_TARGETS
// HWY_SCALAR remains, so there will still be a valid target to call.
#define HWY_DISABLED_TARGETS (HWY_SSSE3 | HWY_SSE4)
#endif // HWY_COMPILER_MSVC
#endif // HIGHWAY_HWY_CONTRIB_SORT_DISABLED_TARGETS_H_
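A vqsort_*.cc translation unit would then begin roughly as follows (sketch; the HWY_TARGET_INCLUDE file name is hypothetical). The key point is that this header must come before foreach_target.h so HWY_DISABLED_TARGETS takes effect:
#include "hwy/contrib/sort/disabled_targets.h"  // must precede foreach_target.h
#include "hwy/contrib/sort/vqsort.h"

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_example.cc"  // hypothetical
#include "hwy/foreach_target.h"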

Просмотреть файл

@ -1,190 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdio.h>
#include <algorithm>
#include "hwy/base.h"
// Based on A.7 in "Entwurf und Implementierung vektorisierter
// Sortieralgorithmen" and code by Mark Blacher.
void PrintMergeNetwork16x2() {
for (int i = 8; i < 16; ++i) {
printf("v%x = st.SwapAdjacent(d, v%x);\n", i, i);
}
for (int i = 0; i < 8; ++i) {
printf("st.Sort2(d, v%x, v%x);\n", i, 15 - i);
}
for (int i = 0; i < 4; ++i) {
printf("v%x = st.SwapAdjacent(d, v%x);\n", i + 4, i + 4);
printf("v%x = st.SwapAdjacent(d, v%x);\n", i + 12, i + 12);
}
for (int i = 0; i < 4; ++i) {
printf("st.Sort2(d, v%x, v%x);\n", i, 7 - i);
printf("st.Sort2(d, v%x, v%x);\n", i + 8, 15 - i);
}
for (int i = 0; i < 16; i += 4) {
printf("v%x = st.SwapAdjacent(d, v%x);\n", i + 2, i + 2);
printf("v%x = st.SwapAdjacent(d, v%x);\n", i + 3, i + 3);
}
for (int i = 0; i < 16; i += 4) {
printf("st.Sort2(d, v%x, v%x);\n", i, i + 3);
printf("st.Sort2(d, v%x, v%x);\n", i + 1, i + 2);
}
for (int i = 0; i < 16; i += 2) {
printf("v%x = st.SwapAdjacent(d, v%x);\n", i + 1, i + 1);
}
for (int i = 0; i < 16; i += 2) {
printf("st.Sort2(d, v%x, v%x);\n", i, i + 1);
}
for (int i = 0; i < 16; ++i) {
printf("v%x = st.SortPairsDistance1<kOrder>(d, v%x);\n", i, i);
}
printf("\n");
}
void PrintMergeNetwork16x4() {
printf("\n");
for (int i = 8; i < 16; ++i) {
printf("v%x = st.Reverse4(d, v%x);\n", i, i);
}
for (int i = 0; i < 8; ++i) {
printf("st.Sort2(d, v%x, v%x);\n", i, 15 - i);
}
for (int i = 0; i < 4; ++i) {
printf("v%x = st.Reverse4(d, v%x);\n", i + 4, i + 4);
printf("v%x = st.Reverse4(d, v%x);\n", i + 12, i + 12);
}
for (int i = 0; i < 4; ++i) {
printf("st.Sort2(d, v%x, v%x);\n", i, 7 - i);
printf("st.Sort2(d, v%x, v%x);\n", i + 8, 15 - i);
}
for (int i = 0; i < 16; i += 4) {
printf("v%x = st.Reverse4(d, v%x);\n", i + 2, i + 2);
printf("v%x = st.Reverse4(d, v%x);\n", i + 3, i + 3);
}
for (int i = 0; i < 16; i += 4) {
printf("st.Sort2(d, v%x, v%x);\n", i, i + 3);
printf("st.Sort2(d, v%x, v%x);\n", i + 1, i + 2);
}
for (int i = 0; i < 16; i += 2) {
printf("v%x = st.Reverse4(d, v%x);\n", i + 1, i + 1);
}
for (int i = 0; i < 16; i += 2) {
printf("st.Sort2(d, v%x, v%x);\n", i, i + 1);
}
for (int i = 0; i < 16; ++i) {
printf("v%x = st.SortPairsReverse4(d, v%x);\n", i, i);
}
for (int i = 0; i < 16; ++i) {
printf("v%x = st.SortPairsDistance1<kOrder>(d, v%x);\n", i, i);
}
}
void PrintMergeNetwork16x8() {
printf("\n");
for (int i = 8; i < 16; ++i) {
printf("v%x = st.ReverseKeys8(d, v%x);\n", i, i);
}
for (int i = 0; i < 8; ++i) {
printf("st.Sort2(d, v%x, v%x);\n", i, 15 - i);
}
for (int i = 0; i < 4; ++i) {
printf("v%x = st.ReverseKeys8(d, v%x);\n", i + 4, i + 4);
printf("v%x = st.ReverseKeys8(d, v%x);\n", i + 12, i + 12);
}
for (int i = 0; i < 4; ++i) {
printf("st.Sort2(d, v%x, v%x);\n", i, 7 - i);
printf("st.Sort2(d, v%x, v%x);\n", i + 8, 15 - i);
}
for (int i = 0; i < 16; i += 4) {
printf("v%x = st.ReverseKeys8(d, v%x);\n", i + 2, i + 2);
printf("v%x = st.ReverseKeys8(d, v%x);\n", i + 3, i + 3);
}
for (int i = 0; i < 16; i += 4) {
printf("st.Sort2(d, v%x, v%x);\n", i, i + 3);
printf("st.Sort2(d, v%x, v%x);\n", i + 1, i + 2);
}
for (int i = 0; i < 16; i += 2) {
printf("v%x = st.ReverseKeys8(d, v%x);\n", i + 1, i + 1);
}
for (int i = 0; i < 16; i += 2) {
printf("st.Sort2(d, v%x, v%x);\n", i, i + 1);
}
for (int i = 0; i < 16; ++i) {
printf("v%x = st.SortPairsReverse8(d, v%x);\n", i, i);
}
for (int i = 0; i < 16; ++i) {
printf("v%x = st.SortPairsDistance2<kOrder>(d, v%x);\n", i, i);
}
for (int i = 0; i < 16; ++i) {
printf("v%x = st.SortPairsDistance1<kOrder>(d, v%x);\n", i, i);
}
}
void PrintMergeNetwork16x16() {
printf("\n");
for (int i = 8; i < 16; ++i) {
printf("v%x = st.ReverseKeys16(d, v%x);\n", i, i);
}
for (int i = 0; i < 8; ++i) {
printf("st.Sort2(d, v%x, v%x);\n", i, 15 - i);
}
for (int i = 0; i < 4; ++i) {
printf("v%x = st.ReverseKeys16(d, v%x);\n", i + 4, i + 4);
printf("v%x = st.ReverseKeys16(d, v%x);\n", i + 12, i + 12);
}
for (int i = 0; i < 4; ++i) {
printf("st.Sort2(d, v%x, v%x);\n", i, 7 - i);
printf("st.Sort2(d, v%x, v%x);\n", i + 8, 15 - i);
}
for (int i = 0; i < 16; i += 4) {
printf("v%x = st.ReverseKeys16(d, v%x);\n", i + 2, i + 2);
printf("v%x = st.ReverseKeys16(d, v%x);\n", i + 3, i + 3);
}
for (int i = 0; i < 16; i += 4) {
printf("st.Sort2(d, v%x, v%x);\n", i, i + 3);
printf("st.Sort2(d, v%x, v%x);\n", i + 1, i + 2);
}
for (int i = 0; i < 16; i += 2) {
printf("v%x = st.ReverseKeys16(d, v%x);\n", i + 1, i + 1);
}
for (int i = 0; i < 16; i += 2) {
printf("st.Sort2(d, v%x, v%x);\n", i, i + 1);
}
for (int i = 0; i < 16; ++i) {
printf("v%x = st.SortPairsReverse16<kOrder>(d, v%x);\n", i, i);
}
for (int i = 0; i < 16; ++i) {
printf("v%x = st.SortPairsDistance4<kOrder>(d, v%x);\n", i, i);
}
for (int i = 0; i < 16; ++i) {
printf("v%x = st.SortPairsDistance2<kOrder>(d, v%x);\n", i, i);
}
for (int i = 0; i < 16; ++i) {
printf("v%x = st.SortPairsDistance1<kOrder>(d, v%x);\n", i, i);
}
}
int main(int argc, char** argv) {
PrintMergeNetwork16x2();
PrintMergeNetwork16x4();
PrintMergeNetwork16x8();
PrintMergeNetwork16x16();
return 0;
}


@ -1,149 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/algo-inl.h"
// Normal include guard for non-SIMD parts
#ifndef HIGHWAY_HWY_CONTRIB_SORT_RESULT_INL_H_
#define HIGHWAY_HWY_CONTRIB_SORT_RESULT_INL_H_
#include <time.h>
#include <algorithm> // std::sort
#include <string>
#include <vector>
#include "hwy/base.h"
#include "hwy/nanobenchmark.h"
namespace hwy {
struct Timestamp {
Timestamp() { t = platform::Now(); }
double t;
};
double SecondsSince(const Timestamp& t0) {
const Timestamp t1;
return t1.t - t0.t;
}
constexpr size_t kReps = 30;
// Returns trimmed mean (we don't want to run an out-of-L3-cache sort often
// enough for the mode to be reliable).
double SummarizeMeasurements(std::vector<double>& seconds) {
std::sort(seconds.begin(), seconds.end());
double sum = 0;
int count = 0;
for (size_t i = kReps / 4; i < seconds.size() - kReps / 2; ++i) {
sum += seconds[i];
count += 1;
}
return sum / count;
}
} // namespace hwy
#endif // HIGHWAY_HWY_CONTRIB_SORT_RESULT_INL_H_
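Worked example: with kReps = 30 sorted measurements, the loop keeps indices [30/4, 30 - 30/2) = [7, 15), i.e. eight samples; more is trimmed from the slow end, where interference outliers accumulate. A standalone illustration with fabricated timings (not library code):
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  std::vector<double> seconds(30);
  for (int i = 0; i < 30; ++i) seconds[i] = 1.0 + 0.01 * i;  // fake timings
  std::sort(seconds.begin(), seconds.end());
  double sum = 0.0;
  int count = 0;
  for (size_t i = 30 / 4; i < seconds.size() - 30 / 2; ++i) {
    sum += seconds[i];
    count += 1;
  }
  std::printf("trimmed mean over %d samples: %f\n", count, sum / count);
  return 0;
}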
// Per-target
#if defined(HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE) == \
defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE
#undef HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE
#else
#define HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE
#endif
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
struct Result {
Result() {}
Result(const uint32_t target, const Algo algo, Dist dist, bool is128,
size_t num, size_t num_threads, double sec, size_t sizeof_t,
const char* type_name)
: target(target),
algo(algo),
dist(dist),
is128(is128),
num(num),
num_threads(num_threads),
sec(sec),
sizeof_t(sizeof_t),
type_name(type_name) {}
void Print() const {
const double bytes = static_cast<double>(num) *
static_cast<double>(num_threads) *
static_cast<double>(sizeof_t);
printf("%10s: %12s: %7s: %9s: %.2E %4.0f MB/s (%2zu threads)\n",
hwy::TargetName(target), AlgoName(algo),
is128 ? "u128" : type_name.c_str(), DistName(dist),
static_cast<double>(num), bytes * 1E-6 / sec, num_threads);
}
uint32_t target;
Algo algo;
Dist dist;
bool is128;
size_t num = 0;
size_t num_threads = 0;
double sec = 0.0;
size_t sizeof_t = 0;
std::string type_name;
};
template <typename T, class Traits>
Result MakeResult(const Algo algo, Dist dist, Traits st, size_t num,
size_t num_threads, double sec) {
char string100[100];
hwy::detail::TypeName(hwy::detail::MakeTypeInfo<T>(), 1, string100);
return Result(HWY_TARGET, algo, dist, st.Is128(), num, num_threads, sec,
sizeof(T), string100);
}
template <class Traits, typename T>
bool VerifySort(Traits st, const InputStats<T>& input_stats, const T* out,
size_t num, const char* caller) {
constexpr size_t N1 = st.Is128() ? 2 : 1;
HWY_ASSERT(num >= N1);
InputStats<T> output_stats;
// Ensure it matches the sort order
for (size_t i = 0; i < num - N1; i += N1) {
output_stats.Notify(out[i]);
if (N1 == 2) output_stats.Notify(out[i + 1]);
// Reverse order instead of checking !Compare1 so we accept equal keys.
if (st.Compare1(out + i + N1, out + i)) {
printf("%s: i=%d of %d: N1=%d %5.0f %5.0f vs. %5.0f %5.0f\n\n", caller,
static_cast<int>(i), static_cast<int>(num), static_cast<int>(N1),
double(out[i + 1]), double(out[i + 0]), double(out[i + N1 + 1]),
double(out[i + N1]));
HWY_ABORT("%d-bit sort is incorrect\n",
static_cast<int>(sizeof(T) * 8 * N1));
}
}
output_stats.Notify(out[num - N1]);
if (N1 == 2) output_stats.Notify(out[num - N1 + 1]);
return input_stats == output_stats;
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#endif // HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE


@ -1,104 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Definitions shared between vqsort-inl and sorting_networks-inl.
// Normal include guard for target-independent parts
#ifndef HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_
#define HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_
#include "hwy/base.h"
namespace hwy {
// Internal constants - these are to avoid magic numbers/literals and cannot be
// changed without also changing the associated code.
struct SortConstants {
// SortingNetwork reshapes its input into a matrix. This is the maximum number
// of *keys* per vector.
#if HWY_COMPILER_MSVC
static constexpr size_t kMaxCols = 8; // avoids build timeout
#else
static constexpr size_t kMaxCols = 16; // enough for u32 in 512-bit vector
#endif
// 16 rows is a compromise between using the 32 AVX-512/SVE/RVV registers,
// fitting within 16 AVX2 registers with only a few spills, keeping BaseCase
// code size reasonable (7 KiB for AVX-512 and 16 cols), and minimizing the
// extra logN factor for larger networks (for which only loose upper bounds
// on size are known).
static constexpr size_t kMaxRowsLog2 = 4;
static constexpr size_t kMaxRows = size_t{1} << kMaxRowsLog2;
static HWY_INLINE size_t BaseCaseNum(size_t N) {
return kMaxRows * HWY_MIN(N, kMaxCols);
}
// Unrolling is important (pipelining and amortizing branch mispredictions);
// 2x is sufficient to reach full memory bandwidth on SKX in Partition, but
// somewhat slower for sorting than 4x.
//
// To change, must also update left + 3 * N etc. in the loop.
static constexpr size_t kPartitionUnroll = 4;
static HWY_INLINE size_t PartitionBufNum(size_t N) {
// The main loop reads kPartitionUnroll vectors, and first loads from
// both left and right beforehand, so it requires min = 2 *
// kPartitionUnroll vectors. To handle smaller amounts (only guaranteed
// >= BaseCaseNum), we partition the right side into a buffer. We need
// another vector at the end so CompressStore does not overwrite anything.
return (2 * kPartitionUnroll + 1) * N;
}
// Chunk := group of keys loaded for sampling a pivot. Matches the typical
// cache line size of 64 bytes to get maximum benefit per L2 miss. If vectors
// are larger, use entire vectors to ensure we do not overrun the array.
static HWY_INLINE size_t LanesPerChunk(size_t sizeof_t, size_t N) {
return HWY_MAX(64 / sizeof_t, N);
}
};
} // namespace hwy
#endif // HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_
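Worked numbers (illustrative), assuming 32-bit keys in 256-bit vectors so N = 8 and the non-MSVC kMaxCols = 16 applies:
#include <cstddef>
#include <cstdio>

int main() {
  const size_t kMaxRows = 16, kMaxCols = 16, kPartitionUnroll = 4;
  const size_t N = 8;  // uint32_t lanes in a 256-bit vector
  const size_t base_case = kMaxRows * (N < kMaxCols ? N : kMaxCols);  // 128 keys
  const size_t partition_buf = (2 * kPartitionUnroll + 1) * N;        // 72 lanes
  const size_t lanes_per_chunk = (64 / 4 > N) ? (64 / 4) : N;         // 16 lanes
  std::printf("BaseCaseNum=%zu PartitionBufNum=%zu LanesPerChunk=%zu\n",
              base_case, partition_buf, lanes_per_chunk);
  return 0;
}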
// Per-target
#if defined(HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE) == \
defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
#undef HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
#else
#define HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
#endif
#include "hwy/highway.h"
namespace hwy {
namespace HWY_NAMESPACE {
// Default tag / vector width selector.
// TODO(janwas): enable once LMUL < 1 is supported.
#if HWY_TARGET == HWY_RVV && 0
template <typename T>
using SortTag = ScalableTag<T, -1>;
#else
template <typename T>
using SortTag = ScalableTag<T>;
#endif
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
#endif // HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE


@ -12,553 +12,160 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/sort_test.cc"
#include "hwy/foreach_target.h"
#include "hwy/contrib/sort/vqsort.h"
// After foreach_target
#include "hwy/contrib/sort/algo-inl.h"
#include "hwy/contrib/sort/result-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h" // BaseCase
#include "hwy/contrib/sort/sort-inl.h"
#include "hwy/tests/test_util-inl.h"
// clang-format on
#include <string.h> // memcpy
#include <algorithm> // std::max
#include <vector>
#undef VQSORT_TEST_IMPL
#if (HWY_TARGET == HWY_SCALAR) || (defined(_MSC_VER) && !HWY_IS_DEBUG_BUILD)
// Scalar does not implement these, and MSVC non-debug builds time out.
#define VQSORT_TEST_IMPL 0
#else
#define VQSORT_TEST_IMPL 1
#endif
#undef VQSORT_TEST_SORT
// MSVC non-debug builds time out.
#if defined(_MSC_VER) && !HWY_IS_DEBUG_BUILD
#define VQSORT_TEST_SORT 0
#else
#define VQSORT_TEST_SORT 1
#endif
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace {
#if VQSORT_TEST_IMPL || VQSORT_TEST_SORT
using detail::LaneTraits;
using detail::OrderAscending;
using detail::OrderAscending128;
using detail::OrderDescending;
using detail::OrderDescending128;
using detail::SharedTraits;
using detail::Traits128;
#endif
#if HWY_TARGET != HWY_SCALAR && HWY_ARCH_X86
#if !VQSORT_TEST_IMPL
static void TestAllMedian() {}
static void TestAllBaseCase() {}
static void TestAllPartition() {}
static void TestAllGenerator() {}
#else
template <class Traits>
static HWY_NOINLINE void TestMedian3() {
using T = uint64_t;
using D = CappedTag<T, 1>;
SharedTraits<Traits> st;
const D d;
using V = Vec<D>;
for (uint32_t bits = 0; bits < 8; ++bits) {
const V v0 = Set(d, T{(bits & (1u << 0)) ? 1u : 0u});
const V v1 = Set(d, T{(bits & (1u << 1)) ? 1u : 0u});
const V v2 = Set(d, T{(bits & (1u << 2)) ? 1u : 0u});
const T m = GetLane(detail::MedianOf3(st, v0, v1, v2));
    // If at least half (rounded up) of the bits are 1, so is the median.
    const size_t count = PopCount(bits);
    HWY_ASSERT_EQ((count >= 2) ? static_cast<T>(1) : 0, m);
  }
}
template <class D>
size_t K(D d) {
return SortBatchSize(d);
}
HWY_NOINLINE void TestAllMedian() {
TestMedian3<LaneTraits<OrderAscending> >();
}
template <class Traits, typename T>
static HWY_NOINLINE void TestBaseCaseAscDesc() {
SharedTraits<Traits> st;
const SortTag<T> d;
template <SortOrder kOrder, class D>
void Validate(D d, const TFromD<D>* in, const TFromD<D>* out) {
const size_t N = Lanes(d);
const size_t base_case_num = SortConstants::BaseCaseNum(N);
const size_t N1 = st.LanesPerKey();
constexpr int kDebug = 0;
auto aligned_keys = hwy::AllocateAligned<T>(N + base_case_num + N);
auto buf = hwy::AllocateAligned<T>(base_case_num + 2 * N);
std::vector<size_t> lengths;
lengths.push_back(HWY_MAX(1, N1));
lengths.push_back(3 * N1);
lengths.push_back(base_case_num / 2);
lengths.push_back(base_case_num / 2 + N1);
lengths.push_back(base_case_num - N1);
lengths.push_back(base_case_num);
std::vector<size_t> misalignments;
misalignments.push_back(0);
misalignments.push_back(1);
if (N >= 6) misalignments.push_back(N / 2 - 1);
misalignments.push_back(N / 2);
misalignments.push_back(N / 2 + 1);
misalignments.push_back(HWY_MIN(2 * N / 3 + 3, size_t{N - 1}));
for (bool asc : {false, true}) {
for (size_t len : lengths) {
for (size_t misalign : misalignments) {
T* HWY_RESTRICT keys = aligned_keys.get() + misalign;
if (kDebug) {
printf("============%s asc %d N1 %d len %d misalign %d\n",
hwy::TypeName(T(), 1).c_str(), asc, static_cast<int>(N1),
static_cast<int>(len), static_cast<int>(misalign));
}
for (size_t i = 0; i < misalign; ++i) {
aligned_keys[i] = hwy::LowestValue<T>();
}
InputStats<T> input_stats;
for (size_t i = 0; i < len; ++i) {
keys[i] =
asc ? static_cast<T>(T(i) + 1) : static_cast<T>(T(len) - T(i));
input_stats.Notify(keys[i]);
if (kDebug >= 2) printf("%3zu: %f\n", i, double(keys[i]));
}
for (size_t i = len; i < base_case_num + N; ++i) {
keys[i] = hwy::LowestValue<T>();
}
detail::BaseCase(d, st, keys, len, buf.get());
if (kDebug >= 2) {
printf("out>>>>>>\n");
for (size_t i = 0; i < len; ++i) {
printf("%3zu: %f\n", i, double(keys[i]));
}
}
HWY_ASSERT(VerifySort(st, input_stats, keys, len, "BaseAscDesc"));
for (size_t i = 0; i < misalign; ++i) {
if (aligned_keys[i] != hwy::LowestValue<T>())
HWY_ABORT("Overrun misalign at %d\n", static_cast<int>(i));
}
for (size_t i = len; i < base_case_num + N; ++i) {
if (keys[i] != hwy::LowestValue<T>())
HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
}
} // misalign
} // len
} // asc
}
template <class Traits, typename T>
static HWY_NOINLINE void TestBaseCase01() {
SharedTraits<Traits> st;
const SortTag<T> d;
const size_t N = Lanes(d);
const size_t base_case_num = SortConstants::BaseCaseNum(N);
const size_t N1 = st.LanesPerKey();
constexpr int kDebug = 0;
auto keys = hwy::AllocateAligned<T>(base_case_num + N);
auto buf = hwy::AllocateAligned<T>(base_case_num + 2 * N);
std::vector<size_t> lengths;
lengths.push_back(HWY_MAX(1, N1));
lengths.push_back(3 * N1);
lengths.push_back(base_case_num / 2);
lengths.push_back(base_case_num / 2 + N1);
lengths.push_back(base_case_num - N1);
lengths.push_back(base_case_num);
for (size_t len : lengths) {
if (kDebug) {
printf("============%s 01 N1 %d len %d\n", hwy::TypeName(T(), 1).c_str(),
static_cast<int>(N1), static_cast<int>(len));
}
const uint64_t kMaxBits = AdjustedLog2Reps(HWY_MIN(len, size_t{14}));
for (uint64_t bits = 0; bits < ((1ull << kMaxBits) - 1); ++bits) {
InputStats<T> input_stats;
for (size_t i = 0; i < len; ++i) {
keys[i] = (i < 64 && (bits & (1ull << i))) ? 1 : 0;
input_stats.Notify(keys[i]);
if (kDebug >= 2) printf("%3zu: %f\n", i, double(keys[i]));
}
for (size_t i = len; i < base_case_num + N; ++i) {
keys[i] = hwy::LowestValue<T>();
// Ensure it matches the sort order
for (size_t i = 0; i < K(d) - 1; ++i) {
if (!verify::Compare(out[i], out[i + 1], kOrder)) {
printf("range=%" PRIu64 " lane=%" PRIu64 " N=%" PRIu64 " %.0f %.0f\n\n",
static_cast<uint64_t>(i), static_cast<uint64_t>(i),
static_cast<uint64_t>(N), static_cast<float>(out[i + 0]),
static_cast<float>(out[i + 1]));
for (size_t i = 0; i < K(d); ++i) {
printf("%.0f\n", static_cast<float>(out[i]));
}
detail::BaseCase(d, st, keys.get(), len, buf.get());
if (kDebug >= 2) {
printf("out>>>>>>\n");
for (size_t i = 0; i < len; ++i) {
printf("%3zu: %f\n", i, double(keys[i]));
}
printf("\n\nin was:\n");
for (size_t i = 0; i < K(d); ++i) {
printf("%.0f\n", static_cast<float>(in[i]));
}
HWY_ASSERT(VerifySort(st, input_stats, keys.get(), len, "Base01"));
for (size_t i = len; i < base_case_num + N; ++i) {
if (keys[i] != hwy::LowestValue<T>())
HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
}
} // bits
} // len
}
template <class Traits, typename T>
static HWY_NOINLINE void TestBaseCase() {
TestBaseCaseAscDesc<Traits, T>();
TestBaseCase01<Traits, T>();
}
HWY_NOINLINE void TestAllBaseCase() {
// Workaround for stack overflow on MSVC debug.
#if defined(_MSC_VER) && HWY_IS_DEBUG_BUILD && (HWY_TARGET == HWY_AVX3)
return;
#endif
TestBaseCase<LaneTraits<OrderAscending>, int32_t>();
TestBaseCase<LaneTraits<OrderDescending>, int64_t>();
TestBaseCase<Traits128<OrderAscending128>, uint64_t>();
TestBaseCase<Traits128<OrderDescending128>, uint64_t>();
}
template <class Traits, typename T>
static HWY_NOINLINE void VerifyPartition(Traits st, T* HWY_RESTRICT keys,
size_t left, size_t border,
size_t right, const size_t N1,
const T* pivot) {
/* for (size_t i = left; i < right; ++i) {
if (i == border) printf("--\n");
printf("%4zu: %3d\n", i, keys[i]);
}*/
HWY_ASSERT(left % N1 == 0);
HWY_ASSERT(border % N1 == 0);
HWY_ASSERT(right % N1 == 0);
const bool asc = typename Traits::Order().IsAscending();
for (size_t i = left; i < border; i += N1) {
if (st.Compare1(pivot, keys + i)) {
      printf(
"%s: asc %d left[%d] piv %.0f %.0f compares before %.0f %.0f "
"border %d",
hwy::TypeName(T(), 1).c_str(), asc, static_cast<int>(i),
double(pivot[1]), double(pivot[0]), double(keys[i + 1]),
double(keys[i + 0]), static_cast<int>(border));
fflush(stdout);
HWY_ABORT("Sort is incorrect");
}
}
for (size_t i = border; i < right; i += N1) {
if (!st.Compare1(pivot, keys + i)) {
HWY_ABORT(
"%s: asc %d right[%d] piv %.0f %.0f compares after %.0f %.0f "
"border %d",
hwy::TypeName(T(), 1).c_str(), asc, static_cast<int>(i),
double(pivot[1]), double(pivot[0]), double(keys[i + 1]),
double(keys[i]), static_cast<int>(border));
// Also verify sums match (detects duplicated/lost values)
double expected_sum = 0.0;
double actual_sum = 0.0;
for (size_t i = 0; i < K(d); ++i) {
expected_sum += in[i];
actual_sum += out[i];
}
if (expected_sum != actual_sum) {
for (size_t i = 0; i < K(d); ++i) {
printf("%.0f %.0f\n", static_cast<float>(in[i]),
static_cast<float>(out[i]));
}
HWY_ABORT("Mismatch");
}
}
template <class Traits, typename T>
static HWY_NOINLINE void TestPartition() {
const SortTag<T> d;
SharedTraits<Traits> st;
const bool asc = typename Traits::Order().IsAscending();
const size_t N = Lanes(d);
constexpr int kDebug = 0;
const size_t base_case_num = SortConstants::BaseCaseNum(N);
// left + len + align
const size_t total = 32 + (base_case_num + 4 * HWY_MAX(N, 4)) + 2 * N;
auto aligned_keys = hwy::AllocateAligned<T>(total);
auto buf = hwy::AllocateAligned<T>(SortConstants::PartitionBufNum(N));
class TestReverse {
template <SortOrder kOrder, class D>
void TestOrder(D d, RandomState& /* rng */) {
using T = TFromD<D>;
const size_t N = Lanes(d);
HWY_ASSERT((N % 4) == 0);
auto in = AllocateAligned<T>(K(d));
auto inout = AllocateAligned<T>(K(d));
const size_t N1 = st.LanesPerKey();
for (bool in_asc : {false, true}) {
for (int left_i : {0, 1, 2, 3, 4, 5, 6, 7, 8, 12, 15, 22, 28, 29, 30, 31}) {
const size_t left = static_cast<size_t>(left_i) & ~(N1 - 1);
for (size_t ofs : {N, N + 1, N + 2, N + 3, 2 * N, 2 * N + 1, 2 * N + 2,
2 * N + 3, 3 * N - 1, 4 * N - 3, 4 * N - 2}) {
const size_t len = (base_case_num + ofs) & ~(N1 - 1);
for (T pivot1 :
{T(0), T(len / 3), T(len / 2), T(2 * len / 3), T(len)}) {
const T pivot2[2] = {pivot1, 0};
const auto pivot = st.SetKey(d, pivot2);
for (size_t misalign = 0; misalign < N;
misalign += st.LanesPerKey()) {
T* HWY_RESTRICT keys = aligned_keys.get() + misalign;
const size_t right = left + len;
if (kDebug) {
printf(
"=========%s asc %d left %d len %d right %d piv %.0f %.0f\n",
hwy::TypeName(T(), 1).c_str(), asc, static_cast<int>(left),
static_cast<int>(len), static_cast<int>(right),
double(pivot2[1]), double(pivot2[0]));
}
const size_t expected_size = SortBatchSize(d);
for (size_t i = 0; i < misalign; ++i) {
aligned_keys[i] = hwy::LowestValue<T>();
}
for (size_t i = 0; i < left; ++i) {
keys[i] = hwy::LowestValue<T>();
}
for (size_t i = left; i < right; ++i) {
keys[i] = static_cast<T>(in_asc ? T(i + 1) - static_cast<T>(left)
: static_cast<T>(right) - T(i));
if (kDebug >= 2) printf("%3zu: %f\n", i, double(keys[i]));
}
for (size_t i = right; i < total - misalign; ++i) {
keys[i] = hwy::LowestValue<T>();
}
size_t border =
detail::Partition(d, st, keys, left, right, pivot, buf.get());
if (kDebug >= 2) {
printf("out>>>>>>\n");
for (size_t i = left; i < right; ++i) {
printf("%3zu: %f\n", i, double(keys[i]));
}
for (size_t i = right; i < total - misalign; ++i) {
printf("%3zu: sentinel %f\n", i, double(keys[i]));
}
}
VerifyPartition(st, keys, left, border, right, N1, pivot2);
for (size_t i = 0; i < misalign; ++i) {
if (aligned_keys[i] != hwy::LowestValue<T>())
HWY_ABORT("Overrun misalign at %d\n", static_cast<int>(i));
}
for (size_t i = 0; i < left; ++i) {
if (keys[i] != hwy::LowestValue<T>())
HWY_ABORT("Overrun left at %d\n", static_cast<int>(i));
}
for (size_t i = right; i < total - misalign; ++i) {
if (keys[i] != hwy::LowestValue<T>())
HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
}
} // misalign
} // pivot
} // len
} // left
} // asc
}
HWY_NOINLINE void TestAllPartition() {
TestPartition<LaneTraits<OrderAscending>, int16_t>();
TestPartition<LaneTraits<OrderDescending>, int32_t>();
TestPartition<LaneTraits<OrderAscending>, int64_t>();
TestPartition<LaneTraits<OrderDescending>, float>();
#if HWY_HAVE_FLOAT64
TestPartition<LaneTraits<OrderDescending>, double>();
#endif
TestPartition<Traits128<OrderAscending128>, uint64_t>();
TestPartition<Traits128<OrderDescending128>, uint64_t>();
}
// (used for sample selection for choosing a pivot)
template <typename TU>
static HWY_NOINLINE void TestRandomGenerator() {
static_assert(!hwy::IsSigned<TU>(), "");
SortTag<TU> du;
const size_t N = Lanes(du);
detail::Generator rng(&N, N);
const size_t lanes_per_block = HWY_MAX(64 / sizeof(TU), N); // power of two
for (uint32_t num_blocks = 2; num_blocks < 100000;
num_blocks = 3 * num_blocks / 2) {
// Generate some numbers and ensure all are in range
uint64_t sum = 0;
constexpr size_t kReps = 10000;
for (size_t rep = 0; rep < kReps; ++rep) {
const uint32_t bits = rng() & 0xFFFFFFFF;
const size_t index = detail::RandomChunkIndex(num_blocks, bits);
HWY_ASSERT(((index + 1) * lanes_per_block) <=
num_blocks * lanes_per_block);
sum += index;
for (size_t i = 0; i < K(d); ++i) {
in[i] = static_cast<T>(K(d) - i);
inout[i] = in[i];
}
// Also ensure the mean is near the middle of the range
const double expected = (num_blocks - 1) / 2.0;
const double actual = double(sum) / kReps;
HWY_ASSERT(0.9 * expected <= actual && actual <= 1.1 * expected);
const size_t actual_size = SortBatch<kOrder>(d, inout.get());
HWY_ASSERT_EQ(expected_size, actual_size);
Validate<kOrder>(d, in.get(), inout.get());
}
}
HWY_NOINLINE void TestAllGenerator() {
TestRandomGenerator<uint32_t>();
TestRandomGenerator<uint64_t>();
}
#endif // VQSORT_TEST_IMPL
#if !VQSORT_TEST_SORT
static void TestAllSort() {}
#else
// Remembers input, and compares results to that of a reference algorithm.
template <class Traits, typename T>
class CompareResults {
public:
void SetInput(const T* in, size_t num) {
copy_.resize(num);
    memcpy(copy_.data(), in, num * sizeof(T));
  }
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
RandomState rng;
TestOrder<SortOrder::kAscending>(d, rng);
TestOrder<SortOrder::kDescending>(d, rng);
}
bool Verify(const T* output) {
#if HAVE_PDQSORT
const Algo reference = Algo::kPDQ;
#else
const Algo reference = Algo::kStd;
#endif
SharedState shared;
using Order = typename Traits::Order;
Run<Order>(reference, copy_.data(), copy_.size(), shared,
/*thread=*/0);
for (size_t i = 0; i < copy_.size(); ++i) {
if (copy_[i] != output[i]) {
fprintf(stderr, "Asc %d mismatch at %d: %A %A\n", Order().IsAscending(),
static_cast<int>(i), double(copy_[i]), double(output[i]));
return false;
}
}
return true;
}
private:
std::vector<T> copy_;
};
std::vector<Algo> AlgoForTest() {
return {
#if HAVE_AVX2SORT
Algo::kSEA,
#endif
#if HAVE_IPS4O
Algo::kIPS4O,
#endif
#if HAVE_PDQSORT
Algo::kPDQ,
#endif
#if HAVE_SORT512
Algo::kSort512,
#endif
Algo::kHeap, Algo::kVQSort,
  };
}
void TestAllReverse() {
TestReverse test;
test(int32_t(), CappedTag<int32_t, 16>());
test(uint32_t(), CappedTag<uint32_t, 16>());
}
template <class Traits, typename T>
void TestSort(size_t num) {
// TODO(janwas): fix
if (HWY_TARGET == HWY_SSSE3) return;
// Workaround for stack overflow on clang-cl (/F 8388608 does not help).
#if defined(_MSC_VER) && HWY_IS_DEBUG_BUILD && (HWY_TARGET == HWY_AVX3)
return;
#endif
class TestRanges {
template <SortOrder kOrder, class D>
void TestOrder(D d, RandomState& rng) {
using T = TFromD<D>;
const size_t N = Lanes(d);
HWY_ASSERT((N % 4) == 0);
auto in = AllocateAligned<T>(K(d));
auto inout = AllocateAligned<T>(K(d));
SharedState shared;
SharedTraits<Traits> st;
const size_t expected_size = SortBatchSize(d);
constexpr size_t kMaxMisalign = 16;
auto aligned = hwy::AllocateAligned<T>(kMaxMisalign + num + kMaxMisalign);
for (Algo algo : AlgoForTest()) {
#if HAVE_IPS4O
if (st.Is128() && (algo == Algo::kIPS4O || algo == Algo::kParallelIPS4O)) {
      continue;
    }
// For each range, try all 0/1 combinations and set any other lanes to
// random inputs.
constexpr size_t kRange = 8;
for (size_t range = 0; range < K(d); range += kRange) {
for (size_t bits = 0; bits < (1ull << kRange); ++bits) {
// First set all to random, will later overwrite those for `range`
for (size_t i = 0; i < K(d); ++i) {
in[i] = inout[i] = static_cast<T>(Random32(&rng) & 0xFF);
}
// Now set the current combination of {0,1} for elements in the range.
// This is sufficient to establish correctness (arbitrary inputs could
// be mapped to 0/1 with a comparison predicate).
for (size_t i = 0; i < kRange; ++i) {
in[range + i] = inout[range + i] = (bits >> i) & 1;
}
const size_t actual_size = SortBatch<kOrder>(d, inout.get());
HWY_ASSERT_EQ(expected_size, actual_size);
Validate<kOrder>(d, in.get(), inout.get());
}
}
#endif
for (Dist dist : AllDist()) {
for (size_t misalign : {size_t{0}, size_t{st.LanesPerKey()},
size_t{3 * st.LanesPerKey()}, kMaxMisalign / 2}) {
T* keys = aligned.get() + misalign;
// Set up red zones before/after the keys to sort
for (size_t i = 0; i < misalign; ++i) {
aligned[i] = hwy::LowestValue<T>();
}
for (size_t i = 0; i < kMaxMisalign; ++i) {
keys[num + i] = hwy::HighestValue<T>();
}
#if HWY_IS_MSAN
__msan_poison(aligned.get(), misalign * sizeof(T));
__msan_poison(keys + num, kMaxMisalign * sizeof(T));
#endif
InputStats<T> input_stats = GenerateInput(dist, keys, num);
CompareResults<Traits, T> compare;
compare.SetInput(keys, num);
Run<typename Traits::Order>(algo, keys, num, shared, /*thread=*/0);
HWY_ASSERT(compare.Verify(keys));
HWY_ASSERT(VerifySort(st, input_stats, keys, num, "TestSort"));
// Check red zones
#if HWY_IS_MSAN
__msan_unpoison(aligned.get(), misalign * sizeof(T));
__msan_unpoison(keys + num, kMaxMisalign * sizeof(T));
#endif
for (size_t i = 0; i < misalign; ++i) {
if (aligned[i] != hwy::LowestValue<T>())
HWY_ABORT("Overrun left at %d\n", static_cast<int>(i));
}
for (size_t i = num; i < num + kMaxMisalign; ++i) {
if (keys[i] != hwy::HighestValue<T>())
HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
}
} // misalign
} // dist
} // algo
}
void TestAllSort() {
const size_t num = 15 * 1000;
TestSort<LaneTraits<OrderAscending>, int16_t>(num);
TestSort<LaneTraits<OrderDescending>, uint16_t>(num);
TestSort<LaneTraits<OrderDescending>, int32_t>(num);
TestSort<LaneTraits<OrderDescending>, uint32_t>(num);
TestSort<LaneTraits<OrderAscending>, int64_t>(num);
TestSort<LaneTraits<OrderAscending>, uint64_t>(num);
// WARNING: for float types, SIMD comparisons will flush denormals to zero,
// causing mismatches with scalar sorts. In this test, we avoid generating
// denormal inputs.
TestSort<LaneTraits<OrderAscending>, float>(num);
#if HWY_HAVE_FLOAT64 // protects algo-inl's GenerateRandom
if (Sorter::HaveFloat64()) {
TestSort<LaneTraits<OrderDescending>, double>(num);
}
#endif
TestSort<Traits128<OrderAscending128>, uint64_t>(num);
  TestSort<Traits128<OrderDescending128>, uint64_t>(num);
public:
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
RandomState rng;
TestOrder<SortOrder::kAscending>(d, rng);
TestOrder<SortOrder::kDescending>(d, rng);
}
};
void TestAllRanges() {
TestRanges test;
test(int32_t(), CappedTag<int32_t, 16>());
test(uint32_t(), CappedTag<uint32_t, 16>());
}
#endif // VQSORT_TEST_SORT
#else
void TestAllReverse() {}
void TestAllRanges() {}
#endif // HWY_TARGET != HWY_SCALAR && HWY_ARCH_X86
} // namespace
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
@ -567,14 +174,9 @@ HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
HWY_BEFORE_TEST(SortTest);
HWY_EXPORT_AND_TEST_P(SortTest, TestAllMedian);
HWY_EXPORT_AND_TEST_P(SortTest, TestAllBaseCase);
HWY_EXPORT_AND_TEST_P(SortTest, TestAllPartition);
HWY_EXPORT_AND_TEST_P(SortTest, TestAllGenerator);
HWY_EXPORT_AND_TEST_P(SortTest, TestAllSort);
} // namespace
HWY_EXPORT_AND_TEST_P(SortTest, TestAllReverse);
HWY_EXPORT_AND_TEST_P(SortTest, TestAllRanges);
} // namespace hwy
// Ought not to be necessary, but without this, no tests run on RVV.
@ -583,4 +185,4 @@ int main(int argc, char** argv) {
return RUN_ALL_TESTS();
}
#endif // HWY_ONCE
#endif


@ -1,686 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Per-target
#if defined(HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE) == \
defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
#undef HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
#else
#define HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
#endif
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/shared-inl.h" // SortConstants
#include "hwy/highway.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {
using Constants = hwy::SortConstants;
// ------------------------------ SharedTraits
// Code shared between all traits. It's unclear whether these can profitably be
// specialized for Lane vs Block, or optimized like SortPairsDistance1 using
// Compare/DupOdd.
template <class Base>
struct SharedTraits : public Base {
// Conditionally swaps lane 0 with 2, 1 with 3 etc.
template <class D>
HWY_INLINE Vec<D> SortPairsDistance2(D d, Vec<D> v) const {
const Base* base = static_cast<const Base*>(this);
Vec<D> swapped = base->SwapAdjacentPairs(d, v);
base->Sort2(d, v, swapped);
return base->OddEvenPairs(d, swapped, v);
}
// Swaps with the vector formed by reversing contiguous groups of 8 keys.
template <class D>
HWY_INLINE Vec<D> SortPairsReverse8(D d, Vec<D> v) const {
const Base* base = static_cast<const Base*>(this);
Vec<D> swapped = base->ReverseKeys8(d, v);
base->Sort2(d, v, swapped);
return base->OddEvenQuads(d, swapped, v);
}
// Swaps with the vector formed by reversing contiguous groups of 8 keys.
template <class D>
HWY_INLINE Vec<D> SortPairsReverse16(D d, Vec<D> v) const {
const Base* base = static_cast<const Base*>(this);
static_assert(Constants::kMaxCols <= 16, "Need actual Reverse16");
Vec<D> swapped = base->ReverseKeys(d, v);
base->Sort2(d, v, swapped);
return ConcatUpperLower(d, swapped, v); // 8 = half of the vector
}
};
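The swap/Sort2/re-interleave pattern above implements a compare-exchange between lanes at a fixed distance. A scalar model of SortPairsDistance2 for one group of four keys (ascending order; illustrative only):
#include <algorithm>
#include <cstdio>

int main() {
  int v[4] = {3, 9, 1, 4};
  for (int i = 0; i < 2; ++i) {
    if (v[i] > v[i + 2]) std::swap(v[i], v[i + 2]);  // compare-exchange i, i+2
  }
  std::printf("%d %d %d %d\n", v[0], v[1], v[2], v[3]);  // 1 4 3 9
  return 0;
}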
// ------------------------------ Sorting network
// (Green's irregular) sorting network for independent columns in 16 vectors.
template <class D, class Traits, class V = Vec<D>>
HWY_INLINE void Sort16(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd,
V& ve, V& vf) {
st.Sort2(d, v0, v1);
st.Sort2(d, v2, v3);
st.Sort2(d, v4, v5);
st.Sort2(d, v6, v7);
st.Sort2(d, v8, v9);
st.Sort2(d, va, vb);
st.Sort2(d, vc, vd);
st.Sort2(d, ve, vf);
st.Sort2(d, v0, v2);
st.Sort2(d, v1, v3);
st.Sort2(d, v4, v6);
st.Sort2(d, v5, v7);
st.Sort2(d, v8, va);
st.Sort2(d, v9, vb);
st.Sort2(d, vc, ve);
st.Sort2(d, vd, vf);
st.Sort2(d, v0, v4);
st.Sort2(d, v1, v5);
st.Sort2(d, v2, v6);
st.Sort2(d, v3, v7);
st.Sort2(d, v8, vc);
st.Sort2(d, v9, vd);
st.Sort2(d, va, ve);
st.Sort2(d, vb, vf);
st.Sort2(d, v0, v8);
st.Sort2(d, v1, v9);
st.Sort2(d, v2, va);
st.Sort2(d, v3, vb);
st.Sort2(d, v4, vc);
st.Sort2(d, v5, vd);
st.Sort2(d, v6, ve);
st.Sort2(d, v7, vf);
st.Sort2(d, v5, va);
st.Sort2(d, v6, v9);
st.Sort2(d, v3, vc);
st.Sort2(d, v7, vb);
st.Sort2(d, vd, ve);
st.Sort2(d, v4, v8);
st.Sort2(d, v1, v2);
st.Sort2(d, v1, v4);
st.Sort2(d, v7, vd);
st.Sort2(d, v2, v8);
st.Sort2(d, vb, ve);
st.Sort2(d, v2, v4);
st.Sort2(d, v5, v6);
st.Sort2(d, v9, va);
st.Sort2(d, vb, vd);
st.Sort2(d, v3, v8);
st.Sort2(d, v7, vc);
st.Sort2(d, v3, v5);
st.Sort2(d, v6, v8);
st.Sort2(d, v7, v9);
st.Sort2(d, va, vc);
st.Sort2(d, v3, v4);
st.Sort2(d, v5, v6);
st.Sort2(d, v7, v8);
st.Sort2(d, v9, va);
st.Sort2(d, vb, vc);
st.Sort2(d, v6, v7);
st.Sort2(d, v8, v9);
}
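The same 60 compare-exchange pairs sort 16 plain integers; in the vector version, each lane is an independent column sorted in parallel. A standalone check (illustrative), transcribing the Sort2 calls above into a pair table:
#include <algorithm>
#include <cstdio>

// The 60 compare-exchange pairs of the network above, in order.
static const int kPairs[60][2] = {
    {0, 1},  {2, 3},  {4, 5},  {6, 7},  {8, 9},   {10, 11}, {12, 13}, {14, 15},
    {0, 2},  {1, 3},  {4, 6},  {5, 7},  {8, 10},  {9, 11},  {12, 14}, {13, 15},
    {0, 4},  {1, 5},  {2, 6},  {3, 7},  {8, 12},  {9, 13},  {10, 14}, {11, 15},
    {0, 8},  {1, 9},  {2, 10}, {3, 11}, {4, 12},  {5, 13},  {6, 14},  {7, 15},
    {5, 10}, {6, 9},  {3, 12}, {7, 11}, {13, 14}, {4, 8},   {1, 2},
    {1, 4},  {7, 13}, {2, 8},  {11, 14},
    {2, 4},  {5, 6},  {9, 10}, {11, 13}, {3, 8},  {7, 12},
    {3, 5},  {6, 8},  {7, 9},  {10, 12},
    {3, 4},  {5, 6},  {7, 8},  {9, 10},  {11, 12},
    {6, 7},  {8, 9}};

int main() {
  int v[16] = {9, 3, 14, 1, 7, 12, 0, 5, 11, 2, 15, 6, 13, 4, 10, 8};
  for (const auto& p : kPairs) {
    if (v[p[0]] > v[p[1]]) std::swap(v[p[0]], v[p[1]]);  // compare-exchange
  }
  for (int x : v) std::printf("%d ", x);  // prints the values in ascending order
  std::printf("\n");
  return 0;
}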
// ------------------------------ Merging networks
// Blacher's hybrid bitonic/odd-even networks, generated by print_network.cc.
template <class D, class Traits, class V = Vec<D>>
HWY_INLINE void Merge2(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd,
V& ve, V& vf) {
v8 = st.ReverseKeys2(d, v8);
v9 = st.ReverseKeys2(d, v9);
va = st.ReverseKeys2(d, va);
vb = st.ReverseKeys2(d, vb);
vc = st.ReverseKeys2(d, vc);
vd = st.ReverseKeys2(d, vd);
ve = st.ReverseKeys2(d, ve);
vf = st.ReverseKeys2(d, vf);
st.Sort2(d, v0, vf);
st.Sort2(d, v1, ve);
st.Sort2(d, v2, vd);
st.Sort2(d, v3, vc);
st.Sort2(d, v4, vb);
st.Sort2(d, v5, va);
st.Sort2(d, v6, v9);
st.Sort2(d, v7, v8);
v4 = st.ReverseKeys2(d, v4);
vc = st.ReverseKeys2(d, vc);
v5 = st.ReverseKeys2(d, v5);
vd = st.ReverseKeys2(d, vd);
v6 = st.ReverseKeys2(d, v6);
ve = st.ReverseKeys2(d, ve);
v7 = st.ReverseKeys2(d, v7);
vf = st.ReverseKeys2(d, vf);
st.Sort2(d, v0, v7);
st.Sort2(d, v8, vf);
st.Sort2(d, v1, v6);
st.Sort2(d, v9, ve);
st.Sort2(d, v2, v5);
st.Sort2(d, va, vd);
st.Sort2(d, v3, v4);
st.Sort2(d, vb, vc);
v2 = st.ReverseKeys2(d, v2);
v3 = st.ReverseKeys2(d, v3);
v6 = st.ReverseKeys2(d, v6);
v7 = st.ReverseKeys2(d, v7);
va = st.ReverseKeys2(d, va);
vb = st.ReverseKeys2(d, vb);
ve = st.ReverseKeys2(d, ve);
vf = st.ReverseKeys2(d, vf);
st.Sort2(d, v0, v3);
st.Sort2(d, v1, v2);
st.Sort2(d, v4, v7);
st.Sort2(d, v5, v6);
st.Sort2(d, v8, vb);
st.Sort2(d, v9, va);
st.Sort2(d, vc, vf);
st.Sort2(d, vd, ve);
v1 = st.ReverseKeys2(d, v1);
v3 = st.ReverseKeys2(d, v3);
v5 = st.ReverseKeys2(d, v5);
v7 = st.ReverseKeys2(d, v7);
v9 = st.ReverseKeys2(d, v9);
vb = st.ReverseKeys2(d, vb);
vd = st.ReverseKeys2(d, vd);
vf = st.ReverseKeys2(d, vf);
st.Sort2(d, v0, v1);
st.Sort2(d, v2, v3);
st.Sort2(d, v4, v5);
st.Sort2(d, v6, v7);
st.Sort2(d, v8, v9);
st.Sort2(d, va, vb);
st.Sort2(d, vc, vd);
st.Sort2(d, ve, vf);
v0 = st.SortPairsDistance1(d, v0);
v1 = st.SortPairsDistance1(d, v1);
v2 = st.SortPairsDistance1(d, v2);
v3 = st.SortPairsDistance1(d, v3);
v4 = st.SortPairsDistance1(d, v4);
v5 = st.SortPairsDistance1(d, v5);
v6 = st.SortPairsDistance1(d, v6);
v7 = st.SortPairsDistance1(d, v7);
v8 = st.SortPairsDistance1(d, v8);
v9 = st.SortPairsDistance1(d, v9);
va = st.SortPairsDistance1(d, va);
vb = st.SortPairsDistance1(d, vb);
vc = st.SortPairsDistance1(d, vc);
vd = st.SortPairsDistance1(d, vd);
ve = st.SortPairsDistance1(d, ve);
vf = st.SortPairsDistance1(d, vf);
}
template <class D, class Traits, class V = Vec<D>>
HWY_INLINE void Merge4(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd,
V& ve, V& vf) {
v8 = st.ReverseKeys4(d, v8);
v9 = st.ReverseKeys4(d, v9);
va = st.ReverseKeys4(d, va);
vb = st.ReverseKeys4(d, vb);
vc = st.ReverseKeys4(d, vc);
vd = st.ReverseKeys4(d, vd);
ve = st.ReverseKeys4(d, ve);
vf = st.ReverseKeys4(d, vf);
st.Sort2(d, v0, vf);
st.Sort2(d, v1, ve);
st.Sort2(d, v2, vd);
st.Sort2(d, v3, vc);
st.Sort2(d, v4, vb);
st.Sort2(d, v5, va);
st.Sort2(d, v6, v9);
st.Sort2(d, v7, v8);
v4 = st.ReverseKeys4(d, v4);
vc = st.ReverseKeys4(d, vc);
v5 = st.ReverseKeys4(d, v5);
vd = st.ReverseKeys4(d, vd);
v6 = st.ReverseKeys4(d, v6);
ve = st.ReverseKeys4(d, ve);
v7 = st.ReverseKeys4(d, v7);
vf = st.ReverseKeys4(d, vf);
st.Sort2(d, v0, v7);
st.Sort2(d, v8, vf);
st.Sort2(d, v1, v6);
st.Sort2(d, v9, ve);
st.Sort2(d, v2, v5);
st.Sort2(d, va, vd);
st.Sort2(d, v3, v4);
st.Sort2(d, vb, vc);
v2 = st.ReverseKeys4(d, v2);
v3 = st.ReverseKeys4(d, v3);
v6 = st.ReverseKeys4(d, v6);
v7 = st.ReverseKeys4(d, v7);
va = st.ReverseKeys4(d, va);
vb = st.ReverseKeys4(d, vb);
ve = st.ReverseKeys4(d, ve);
vf = st.ReverseKeys4(d, vf);
st.Sort2(d, v0, v3);
st.Sort2(d, v1, v2);
st.Sort2(d, v4, v7);
st.Sort2(d, v5, v6);
st.Sort2(d, v8, vb);
st.Sort2(d, v9, va);
st.Sort2(d, vc, vf);
st.Sort2(d, vd, ve);
v1 = st.ReverseKeys4(d, v1);
v3 = st.ReverseKeys4(d, v3);
v5 = st.ReverseKeys4(d, v5);
v7 = st.ReverseKeys4(d, v7);
v9 = st.ReverseKeys4(d, v9);
vb = st.ReverseKeys4(d, vb);
vd = st.ReverseKeys4(d, vd);
vf = st.ReverseKeys4(d, vf);
st.Sort2(d, v0, v1);
st.Sort2(d, v2, v3);
st.Sort2(d, v4, v5);
st.Sort2(d, v6, v7);
st.Sort2(d, v8, v9);
st.Sort2(d, va, vb);
st.Sort2(d, vc, vd);
st.Sort2(d, ve, vf);
v0 = st.SortPairsReverse4(d, v0);
v1 = st.SortPairsReverse4(d, v1);
v2 = st.SortPairsReverse4(d, v2);
v3 = st.SortPairsReverse4(d, v3);
v4 = st.SortPairsReverse4(d, v4);
v5 = st.SortPairsReverse4(d, v5);
v6 = st.SortPairsReverse4(d, v6);
v7 = st.SortPairsReverse4(d, v7);
v8 = st.SortPairsReverse4(d, v8);
v9 = st.SortPairsReverse4(d, v9);
va = st.SortPairsReverse4(d, va);
vb = st.SortPairsReverse4(d, vb);
vc = st.SortPairsReverse4(d, vc);
vd = st.SortPairsReverse4(d, vd);
ve = st.SortPairsReverse4(d, ve);
vf = st.SortPairsReverse4(d, vf);
v0 = st.SortPairsDistance1(d, v0);
v1 = st.SortPairsDistance1(d, v1);
v2 = st.SortPairsDistance1(d, v2);
v3 = st.SortPairsDistance1(d, v3);
v4 = st.SortPairsDistance1(d, v4);
v5 = st.SortPairsDistance1(d, v5);
v6 = st.SortPairsDistance1(d, v6);
v7 = st.SortPairsDistance1(d, v7);
v8 = st.SortPairsDistance1(d, v8);
v9 = st.SortPairsDistance1(d, v9);
va = st.SortPairsDistance1(d, va);
vb = st.SortPairsDistance1(d, vb);
vc = st.SortPairsDistance1(d, vc);
vd = st.SortPairsDistance1(d, vd);
ve = st.SortPairsDistance1(d, ve);
vf = st.SortPairsDistance1(d, vf);
}
template <class D, class Traits, class V = Vec<D>>
HWY_INLINE void Merge8(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd,
V& ve, V& vf) {
v8 = st.ReverseKeys8(d, v8);
v9 = st.ReverseKeys8(d, v9);
va = st.ReverseKeys8(d, va);
vb = st.ReverseKeys8(d, vb);
vc = st.ReverseKeys8(d, vc);
vd = st.ReverseKeys8(d, vd);
ve = st.ReverseKeys8(d, ve);
vf = st.ReverseKeys8(d, vf);
st.Sort2(d, v0, vf);
st.Sort2(d, v1, ve);
st.Sort2(d, v2, vd);
st.Sort2(d, v3, vc);
st.Sort2(d, v4, vb);
st.Sort2(d, v5, va);
st.Sort2(d, v6, v9);
st.Sort2(d, v7, v8);
v4 = st.ReverseKeys8(d, v4);
vc = st.ReverseKeys8(d, vc);
v5 = st.ReverseKeys8(d, v5);
vd = st.ReverseKeys8(d, vd);
v6 = st.ReverseKeys8(d, v6);
ve = st.ReverseKeys8(d, ve);
v7 = st.ReverseKeys8(d, v7);
vf = st.ReverseKeys8(d, vf);
st.Sort2(d, v0, v7);
st.Sort2(d, v8, vf);
st.Sort2(d, v1, v6);
st.Sort2(d, v9, ve);
st.Sort2(d, v2, v5);
st.Sort2(d, va, vd);
st.Sort2(d, v3, v4);
st.Sort2(d, vb, vc);
v2 = st.ReverseKeys8(d, v2);
v3 = st.ReverseKeys8(d, v3);
v6 = st.ReverseKeys8(d, v6);
v7 = st.ReverseKeys8(d, v7);
va = st.ReverseKeys8(d, va);
vb = st.ReverseKeys8(d, vb);
ve = st.ReverseKeys8(d, ve);
vf = st.ReverseKeys8(d, vf);
st.Sort2(d, v0, v3);
st.Sort2(d, v1, v2);
st.Sort2(d, v4, v7);
st.Sort2(d, v5, v6);
st.Sort2(d, v8, vb);
st.Sort2(d, v9, va);
st.Sort2(d, vc, vf);
st.Sort2(d, vd, ve);
v1 = st.ReverseKeys8(d, v1);
v3 = st.ReverseKeys8(d, v3);
v5 = st.ReverseKeys8(d, v5);
v7 = st.ReverseKeys8(d, v7);
v9 = st.ReverseKeys8(d, v9);
vb = st.ReverseKeys8(d, vb);
vd = st.ReverseKeys8(d, vd);
vf = st.ReverseKeys8(d, vf);
st.Sort2(d, v0, v1);
st.Sort2(d, v2, v3);
st.Sort2(d, v4, v5);
st.Sort2(d, v6, v7);
st.Sort2(d, v8, v9);
st.Sort2(d, va, vb);
st.Sort2(d, vc, vd);
st.Sort2(d, ve, vf);
v0 = st.SortPairsReverse8(d, v0);
v1 = st.SortPairsReverse8(d, v1);
v2 = st.SortPairsReverse8(d, v2);
v3 = st.SortPairsReverse8(d, v3);
v4 = st.SortPairsReverse8(d, v4);
v5 = st.SortPairsReverse8(d, v5);
v6 = st.SortPairsReverse8(d, v6);
v7 = st.SortPairsReverse8(d, v7);
v8 = st.SortPairsReverse8(d, v8);
v9 = st.SortPairsReverse8(d, v9);
va = st.SortPairsReverse8(d, va);
vb = st.SortPairsReverse8(d, vb);
vc = st.SortPairsReverse8(d, vc);
vd = st.SortPairsReverse8(d, vd);
ve = st.SortPairsReverse8(d, ve);
vf = st.SortPairsReverse8(d, vf);
v0 = st.SortPairsDistance2(d, v0);
v1 = st.SortPairsDistance2(d, v1);
v2 = st.SortPairsDistance2(d, v2);
v3 = st.SortPairsDistance2(d, v3);
v4 = st.SortPairsDistance2(d, v4);
v5 = st.SortPairsDistance2(d, v5);
v6 = st.SortPairsDistance2(d, v6);
v7 = st.SortPairsDistance2(d, v7);
v8 = st.SortPairsDistance2(d, v8);
v9 = st.SortPairsDistance2(d, v9);
va = st.SortPairsDistance2(d, va);
vb = st.SortPairsDistance2(d, vb);
vc = st.SortPairsDistance2(d, vc);
vd = st.SortPairsDistance2(d, vd);
ve = st.SortPairsDistance2(d, ve);
vf = st.SortPairsDistance2(d, vf);
v0 = st.SortPairsDistance1(d, v0);
v1 = st.SortPairsDistance1(d, v1);
v2 = st.SortPairsDistance1(d, v2);
v3 = st.SortPairsDistance1(d, v3);
v4 = st.SortPairsDistance1(d, v4);
v5 = st.SortPairsDistance1(d, v5);
v6 = st.SortPairsDistance1(d, v6);
v7 = st.SortPairsDistance1(d, v7);
v8 = st.SortPairsDistance1(d, v8);
v9 = st.SortPairsDistance1(d, v9);
va = st.SortPairsDistance1(d, va);
vb = st.SortPairsDistance1(d, vb);
vc = st.SortPairsDistance1(d, vc);
vd = st.SortPairsDistance1(d, vd);
ve = st.SortPairsDistance1(d, ve);
vf = st.SortPairsDistance1(d, vf);
}
// Unused on MSVC, see below
#if !HWY_COMPILER_MSVC
template <class D, class Traits, class V = Vec<D>>
HWY_INLINE void Merge16(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4,
V& v5, V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc,
V& vd, V& ve, V& vf) {
v8 = st.ReverseKeys16(d, v8);
v9 = st.ReverseKeys16(d, v9);
va = st.ReverseKeys16(d, va);
vb = st.ReverseKeys16(d, vb);
vc = st.ReverseKeys16(d, vc);
vd = st.ReverseKeys16(d, vd);
ve = st.ReverseKeys16(d, ve);
vf = st.ReverseKeys16(d, vf);
st.Sort2(d, v0, vf);
st.Sort2(d, v1, ve);
st.Sort2(d, v2, vd);
st.Sort2(d, v3, vc);
st.Sort2(d, v4, vb);
st.Sort2(d, v5, va);
st.Sort2(d, v6, v9);
st.Sort2(d, v7, v8);
v4 = st.ReverseKeys16(d, v4);
vc = st.ReverseKeys16(d, vc);
v5 = st.ReverseKeys16(d, v5);
vd = st.ReverseKeys16(d, vd);
v6 = st.ReverseKeys16(d, v6);
ve = st.ReverseKeys16(d, ve);
v7 = st.ReverseKeys16(d, v7);
vf = st.ReverseKeys16(d, vf);
st.Sort2(d, v0, v7);
st.Sort2(d, v8, vf);
st.Sort2(d, v1, v6);
st.Sort2(d, v9, ve);
st.Sort2(d, v2, v5);
st.Sort2(d, va, vd);
st.Sort2(d, v3, v4);
st.Sort2(d, vb, vc);
v2 = st.ReverseKeys16(d, v2);
v3 = st.ReverseKeys16(d, v3);
v6 = st.ReverseKeys16(d, v6);
v7 = st.ReverseKeys16(d, v7);
va = st.ReverseKeys16(d, va);
vb = st.ReverseKeys16(d, vb);
ve = st.ReverseKeys16(d, ve);
vf = st.ReverseKeys16(d, vf);
st.Sort2(d, v0, v3);
st.Sort2(d, v1, v2);
st.Sort2(d, v4, v7);
st.Sort2(d, v5, v6);
st.Sort2(d, v8, vb);
st.Sort2(d, v9, va);
st.Sort2(d, vc, vf);
st.Sort2(d, vd, ve);
v1 = st.ReverseKeys16(d, v1);
v3 = st.ReverseKeys16(d, v3);
v5 = st.ReverseKeys16(d, v5);
v7 = st.ReverseKeys16(d, v7);
v9 = st.ReverseKeys16(d, v9);
vb = st.ReverseKeys16(d, vb);
vd = st.ReverseKeys16(d, vd);
vf = st.ReverseKeys16(d, vf);
st.Sort2(d, v0, v1);
st.Sort2(d, v2, v3);
st.Sort2(d, v4, v5);
st.Sort2(d, v6, v7);
st.Sort2(d, v8, v9);
st.Sort2(d, va, vb);
st.Sort2(d, vc, vd);
st.Sort2(d, ve, vf);
v0 = st.SortPairsReverse16(d, v0);
v1 = st.SortPairsReverse16(d, v1);
v2 = st.SortPairsReverse16(d, v2);
v3 = st.SortPairsReverse16(d, v3);
v4 = st.SortPairsReverse16(d, v4);
v5 = st.SortPairsReverse16(d, v5);
v6 = st.SortPairsReverse16(d, v6);
v7 = st.SortPairsReverse16(d, v7);
v8 = st.SortPairsReverse16(d, v8);
v9 = st.SortPairsReverse16(d, v9);
va = st.SortPairsReverse16(d, va);
vb = st.SortPairsReverse16(d, vb);
vc = st.SortPairsReverse16(d, vc);
vd = st.SortPairsReverse16(d, vd);
ve = st.SortPairsReverse16(d, ve);
vf = st.SortPairsReverse16(d, vf);
v0 = st.SortPairsDistance4(d, v0);
v1 = st.SortPairsDistance4(d, v1);
v2 = st.SortPairsDistance4(d, v2);
v3 = st.SortPairsDistance4(d, v3);
v4 = st.SortPairsDistance4(d, v4);
v5 = st.SortPairsDistance4(d, v5);
v6 = st.SortPairsDistance4(d, v6);
v7 = st.SortPairsDistance4(d, v7);
v8 = st.SortPairsDistance4(d, v8);
v9 = st.SortPairsDistance4(d, v9);
va = st.SortPairsDistance4(d, va);
vb = st.SortPairsDistance4(d, vb);
vc = st.SortPairsDistance4(d, vc);
vd = st.SortPairsDistance4(d, vd);
ve = st.SortPairsDistance4(d, ve);
vf = st.SortPairsDistance4(d, vf);
v0 = st.SortPairsDistance2(d, v0);
v1 = st.SortPairsDistance2(d, v1);
v2 = st.SortPairsDistance2(d, v2);
v3 = st.SortPairsDistance2(d, v3);
v4 = st.SortPairsDistance2(d, v4);
v5 = st.SortPairsDistance2(d, v5);
v6 = st.SortPairsDistance2(d, v6);
v7 = st.SortPairsDistance2(d, v7);
v8 = st.SortPairsDistance2(d, v8);
v9 = st.SortPairsDistance2(d, v9);
va = st.SortPairsDistance2(d, va);
vb = st.SortPairsDistance2(d, vb);
vc = st.SortPairsDistance2(d, vc);
vd = st.SortPairsDistance2(d, vd);
ve = st.SortPairsDistance2(d, ve);
vf = st.SortPairsDistance2(d, vf);
v0 = st.SortPairsDistance1(d, v0);
v1 = st.SortPairsDistance1(d, v1);
v2 = st.SortPairsDistance1(d, v2);
v3 = st.SortPairsDistance1(d, v3);
v4 = st.SortPairsDistance1(d, v4);
v5 = st.SortPairsDistance1(d, v5);
v6 = st.SortPairsDistance1(d, v6);
v7 = st.SortPairsDistance1(d, v7);
v8 = st.SortPairsDistance1(d, v8);
v9 = st.SortPairsDistance1(d, v9);
va = st.SortPairsDistance1(d, va);
vb = st.SortPairsDistance1(d, vb);
vc = st.SortPairsDistance1(d, vc);
vd = st.SortPairsDistance1(d, vd);
ve = st.SortPairsDistance1(d, ve);
vf = st.SortPairsDistance1(d, vf);
}
#endif // !HWY_COMPILER_MSVC
// Reshapes `buf` into a matrix, sorts columns independently, and then merges
// into a sorted 1D array without transposing.
//
// `st` is SharedTraits<LaneTraits/Traits128<Order*>>. This abstraction layer
// bridges differences in sort order and single-lane vs 128-bit keys.
// `buf` ensures full vectors are aligned, and enables loads/stores without
// bounds checks.
//
// References:
// https://drops.dagstuhl.de/opus/volltexte/2021/13775/pdf/LIPIcs-SEA-2021-3.pdf
// https://github.com/simd-sorting/fast-and-robust/blob/master/avx2_sort_demo/avx2sort.h
// "Entwurf und Implementierung vektorisierter Sortieralgorithmen" (M. Blacher)
template <class Traits, typename T>
HWY_INLINE void SortingNetwork(Traits st, T* HWY_RESTRICT buf, size_t cols) {
const CappedTag<T, Constants::kMaxCols> d;
using V = decltype(Zero(d));
HWY_DASSERT(cols <= Constants::kMaxCols);
// The network width depends on the number of keys, not lanes.
constexpr size_t kLanesPerKey = st.LanesPerKey();
const size_t keys = cols / kLanesPerKey;
constexpr size_t kMaxKeys = MaxLanes(d) / kLanesPerKey;
// These are aligned iff cols == Lanes(d). We prefer unaligned/non-constexpr
// offsets to duplicating this code for every value of cols.
static_assert(Constants::kMaxRows == 16, "Update loads/stores/args");
V v0 = LoadU(d, buf + 0x0 * cols);
V v1 = LoadU(d, buf + 0x1 * cols);
V v2 = LoadU(d, buf + 0x2 * cols);
V v3 = LoadU(d, buf + 0x3 * cols);
V v4 = LoadU(d, buf + 0x4 * cols);
V v5 = LoadU(d, buf + 0x5 * cols);
V v6 = LoadU(d, buf + 0x6 * cols);
V v7 = LoadU(d, buf + 0x7 * cols);
V v8 = LoadU(d, buf + 0x8 * cols);
V v9 = LoadU(d, buf + 0x9 * cols);
V va = LoadU(d, buf + 0xa * cols);
V vb = LoadU(d, buf + 0xb * cols);
V vc = LoadU(d, buf + 0xc * cols);
V vd = LoadU(d, buf + 0xd * cols);
V ve = LoadU(d, buf + 0xe * cols);
V vf = LoadU(d, buf + 0xf * cols);
Sort16(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, ve, vf);
// Checking MaxLanes avoids generating HWY_ASSERT code for the unreachable
// code paths: if MaxLanes < 2, then keys <= cols < 2.
if (HWY_LIKELY(keys >= 2 && kMaxKeys >= 2)) {
Merge2(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, ve,
vf);
if (HWY_LIKELY(keys >= 4 && kMaxKeys >= 4)) {
Merge4(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, ve,
vf);
if (HWY_LIKELY(keys >= 8 && kMaxKeys >= 8)) {
Merge8(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd,
ve, vf);
// Avoids build timeout
#if !HWY_COMPILER_MSVC
if (HWY_LIKELY(keys >= 16 && kMaxKeys >= 16)) {
Merge16(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd,
ve, vf);
static_assert(Constants::kMaxCols <= 16, "Add more branches");
}
#endif
}
}
}
StoreU(v0, d, buf + 0x0 * cols);
StoreU(v1, d, buf + 0x1 * cols);
StoreU(v2, d, buf + 0x2 * cols);
StoreU(v3, d, buf + 0x3 * cols);
StoreU(v4, d, buf + 0x4 * cols);
StoreU(v5, d, buf + 0x5 * cols);
StoreU(v6, d, buf + 0x6 * cols);
StoreU(v7, d, buf + 0x7 * cols);
StoreU(v8, d, buf + 0x8 * cols);
StoreU(v9, d, buf + 0x9 * cols);
StoreU(va, d, buf + 0xa * cols);
StoreU(vb, d, buf + 0xb * cols);
StoreU(vc, d, buf + 0xc * cols);
StoreU(vd, d, buf + 0xd * cols);
StoreU(ve, d, buf + 0xe * cols);
StoreU(vf, d, buf + 0xf * cols);
}
} // namespace detail
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#endif // HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE

third_party/highway/hwy/contrib/sort/traits-inl.h
@@ -1,324 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Per-target
#if defined(HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE) == \
defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE
#undef HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE
#else
#define HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE
#endif
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/shared-inl.h" // SortConstants
#include "hwy/contrib/sort/vqsort.h" // SortDescending
#include "hwy/highway.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {
// Highway does not provide a lane type for 128-bit keys, so we use uint64_t
// along with an abstraction layer for single-lane vs. lane-pair, which is
// independent of the order.
struct KeyLane {
constexpr size_t LanesPerKey() const { return 1; }
// For HeapSort
template <typename T>
HWY_INLINE void Swap(T* a, T* b) const {
const T temp = *a;
*a = *b;
*b = temp;
}
// Broadcasts one key into a vector
template <class D>
HWY_INLINE Vec<D> SetKey(D d, const TFromD<D>* key) const {
return Set(d, *key);
}
template <class D>
HWY_INLINE Vec<D> ReverseKeys(D d, Vec<D> v) const {
return Reverse(d, v);
}
template <class D>
HWY_INLINE Vec<D> ReverseKeys2(D d, Vec<D> v) const {
return Reverse2(d, v);
}
template <class D>
HWY_INLINE Vec<D> ReverseKeys4(D d, Vec<D> v) const {
return Reverse4(d, v);
}
template <class D>
HWY_INLINE Vec<D> ReverseKeys8(D d, Vec<D> v) const {
return Reverse8(d, v);
}
template <class D>
HWY_INLINE Vec<D> ReverseKeys16(D d, Vec<D> v) const {
static_assert(SortConstants::kMaxCols <= 16, "Assumes u32x16 = 512 bit");
return ReverseKeys(d, v);
}
template <class V>
HWY_INLINE V OddEvenKeys(const V odd, const V even) const {
return OddEven(odd, even);
}
template <class D, HWY_IF_LANE_SIZE_D(D, 2)>
HWY_INLINE Vec<D> SwapAdjacentPairs(D d, const Vec<D> v) const {
const Repartition<uint32_t, D> du32;
return BitCast(d, Shuffle2301(BitCast(du32, v)));
}
template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
HWY_INLINE Vec<D> SwapAdjacentPairs(D /* tag */, const Vec<D> v) const {
return Shuffle1032(v);
}
template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
HWY_INLINE Vec<D> SwapAdjacentPairs(D /* tag */, const Vec<D> v) const {
return SwapAdjacentBlocks(v);
}
template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)>
HWY_INLINE Vec<D> SwapAdjacentQuads(D d, const Vec<D> v) const {
#if HWY_HAVE_FLOAT64 // in case D is float32
const RepartitionToWide<D> dw;
#else
const RepartitionToWide<RebindToUnsigned<D>> dw;
#endif
return BitCast(d, SwapAdjacentPairs(dw, BitCast(dw, v)));
}
template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
HWY_INLINE Vec<D> SwapAdjacentQuads(D d, const Vec<D> v) const {
// Assumes max vector size = 512
return ConcatLowerUpper(d, v, v);
}
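// Lane-order sketch for eight u32 lanes (a hypothetical 256-bit vector):
//   ReverseKeys2:      0 1 2 3 4 5 6 7 -> 1 0 3 2 5 4 7 6
//   SwapAdjacentPairs: 0 1 2 3 4 5 6 7 -> 2 3 0 1 6 7 4 5
//   SwapAdjacentQuads: 0 1 2 3 4 5 6 7 -> 4 5 6 7 0 1 2 3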
template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)>
HWY_INLINE Vec<D> OddEvenPairs(D d, const Vec<D> odd,
const Vec<D> even) const {
#if HWY_HAVE_FLOAT64 // in case D is float32
const RepartitionToWide<D> dw;
#else
const RepartitionToWide<RebindToUnsigned<D>> dw;
#endif
return BitCast(d, OddEven(BitCast(dw, odd), BitCast(dw, even)));
}
template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
HWY_INLINE Vec<D> OddEvenPairs(D /* tag */, Vec<D> odd, Vec<D> even) const {
return OddEvenBlocks(odd, even);
}
template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)>
HWY_INLINE Vec<D> OddEvenQuads(D d, Vec<D> odd, Vec<D> even) const {
#if HWY_HAVE_FLOAT64 // in case D is float32
const RepartitionToWide<D> dw;
#else
const RepartitionToWide<RebindToUnsigned<D>> dw;
#endif
return BitCast(d, OddEvenPairs(dw, BitCast(dw, odd), BitCast(dw, even)));
}
template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
HWY_INLINE Vec<D> OddEvenQuads(D d, Vec<D> odd, Vec<D> even) const {
return ConcatUpperLower(d, odd, even);
}
};
// Anything order-related depends on the key traits *and* the order (see
// FirstOfLanes). We cannot implement just one Compare function because Lt128
// only compiles if the lane type is u64. Thus we need either overloaded
// functions with a tag type, class specializations, or separate classes.
// We avoid overloaded functions because we want all functions to be callable
// from a SortTraits without per-function wrappers. Specializing would work, but
// we are anyway going to specialize at a higher level.
struct OrderAscending : public KeyLane {
using Order = SortAscending;
template <typename T>
HWY_INLINE bool Compare1(const T* a, const T* b) {
return *a < *b;
}
template <class D>
HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
return Lt(a, b);
}
// Two halves of Sort2, used in ScanMinMax.
template <class D>
HWY_INLINE Vec<D> First(D /* tag */, const Vec<D> a, const Vec<D> b) const {
return Min(a, b);
}
template <class D>
HWY_INLINE Vec<D> Last(D /* tag */, const Vec<D> a, const Vec<D> b) const {
return Max(a, b);
}
template <class D>
HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
TFromD<D>* HWY_RESTRICT /* buf */) const {
return MinOfLanes(d, v);
}
template <class D>
HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
TFromD<D>* HWY_RESTRICT /* buf */) const {
return MaxOfLanes(d, v);
}
template <class D>
HWY_INLINE Vec<D> FirstValue(D d) const {
return Set(d, hwy::LowestValue<TFromD<D>>());
}
template <class D>
HWY_INLINE Vec<D> LastValue(D d) const {
return Set(d, hwy::HighestValue<TFromD<D>>());
}
};
struct OrderDescending : public KeyLane {
using Order = SortDescending;
template <typename T>
HWY_INLINE bool Compare1(const T* a, const T* b) {
return *b < *a;
}
template <class D>
HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
return Lt(b, a);
}
template <class D>
HWY_INLINE Vec<D> First(D /* tag */, const Vec<D> a, const Vec<D> b) const {
return Max(a, b);
}
template <class D>
HWY_INLINE Vec<D> Last(D /* tag */, const Vec<D> a, const Vec<D> b) const {
return Min(a, b);
}
template <class D>
HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
TFromD<D>* HWY_RESTRICT /* buf */) const {
return MaxOfLanes(d, v);
}
template <class D>
HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
TFromD<D>* HWY_RESTRICT /* buf */) const {
return MinOfLanes(d, v);
}
template <class D>
HWY_INLINE Vec<D> FirstValue(D d) const {
return Set(d, hwy::HighestValue<TFromD<D>>());
}
template <class D>
HWY_INLINE Vec<D> LastValue(D d) const {
return Set(d, hwy::LowestValue<TFromD<D>>());
}
};
// Shared code that depends on Order.
template <class Base>
struct LaneTraits : public Base {
constexpr bool Is128() const { return false; }
// For each lane i: replaces a[i] with the first and b[i] with the second
// according to Base.
// Corresponds to a conditional swap, which is one "node" of a sorting
// network. Min/Max are cheaper than compare + blend at least for integers.
template <class D>
HWY_INLINE void Sort2(D d, Vec<D>& a, Vec<D>& b) const {
const Base* base = static_cast<const Base*>(this);
const Vec<D> a_copy = a;
// Prior to AVX3, there is no native 64-bit Min/Max, so they compile to 4
// instructions. We can reduce it to a compare + 2 IfThenElse.
#if HWY_AVX3 < HWY_TARGET && HWY_TARGET <= HWY_SSSE3
if (sizeof(TFromD<D>) == 8) {
const Mask<D> cmp = base->Compare(d, a, b);
a = IfThenElse(cmp, a, b);
b = IfThenElse(cmp, b, a_copy);
return;
}
#endif
a = base->First(d, a, b);
b = base->Last(d, a_copy, b);
}
// Conditionally swaps even-numbered lanes with their odd-numbered neighbor.
template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
HWY_INLINE Vec<D> SortPairsDistance1(D d, Vec<D> v) const {
const Base* base = static_cast<const Base*>(this);
Vec<D> swapped = base->ReverseKeys2(d, v);
// Further to the above optimization, Sort2+OddEvenKeys compile to four
// instructions; we can save one by combining two blends.
#if HWY_AVX3 < HWY_TARGET && HWY_TARGET <= HWY_SSSE3
const Vec<D> cmp = VecFromMask(d, base->Compare(d, v, swapped));
return IfVecThenElse(DupOdd(cmp), swapped, v);
#else
Sort2(d, v, swapped);
return base->OddEvenKeys(swapped, v);
#endif
}
// (See above - we use Sort2 for non-64-bit types.)
template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)>
HWY_INLINE Vec<D> SortPairsDistance1(D d, Vec<D> v) const {
const Base* base = static_cast<const Base*>(this);
Vec<D> swapped = base->ReverseKeys2(d, v);
Sort2(d, v, swapped);
return base->OddEvenKeys(swapped, v);
}
// Swaps with the vector formed by reversing contiguous groups of 4 keys.
template <class D>
HWY_INLINE Vec<D> SortPairsReverse4(D d, Vec<D> v) const {
const Base* base = static_cast<const Base*>(this);
Vec<D> swapped = base->ReverseKeys4(d, v);
Sort2(d, v, swapped);
return base->OddEvenPairs(d, swapped, v);
}
// Conditionally swaps lane 0 with 4, 1 with 5 etc.
template <class D>
HWY_INLINE Vec<D> SortPairsDistance4(D d, Vec<D> v) const {
const Base* base = static_cast<const Base*>(this);
Vec<D> swapped = base->SwapAdjacentQuads(d, v);
// Only used in Merge16, which AVX2 (only 4 u64 lanes) never reaches, so we
// skip the above optimization for 64-bit keys.
Sort2(d, v, swapped);
return base->OddEvenQuads(d, swapped, v);
}
};
} // namespace detail
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#endif // HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE

third_party/highway/hwy/contrib/sort/traits128-inl.h
@@ -1,368 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Per-target
#if defined(HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE) == \
defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
#undef HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
#else
#define HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
#endif
#include "hwy/contrib/sort/vqsort.h" // SortDescending
#include "hwy/highway.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {
#if HWY_TARGET == HWY_SCALAR
struct OrderAscending128 {
using Order = SortAscending;
template <typename T>
HWY_INLINE bool Compare1(const T* a, const T* b) {
return (a[1] == b[1]) ? a[0] < b[0] : a[1] < b[1];
}
};
struct OrderDescending128 {
using Order = SortDescending;
template <typename T>
HWY_INLINE bool Compare1(const T* a, const T* b) {
return (a[1] == b[1]) ? b[0] < a[0] : b[1] < a[1];
}
};
template <class Order>
struct Traits128 : public Order {
constexpr bool Is128() const { return true; }
constexpr size_t LanesPerKey() const { return 2; }
};
#else
// Highway does not provide a lane type for 128-bit keys, so we use uint64_t
// along with an abstraction layer for single-lane vs. lane-pair, which is
// independent of the order.
struct Key128 {
constexpr size_t LanesPerKey() const { return 2; }
template <typename T>
HWY_INLINE void Swap(T* a, T* b) const {
const FixedTag<T, 2> d;
const auto temp = LoadU(d, a);
StoreU(LoadU(d, b), d, a);
StoreU(temp, d, b);
}
template <class D>
HWY_INLINE Vec<D> SetKey(D d, const TFromD<D>* key) const {
return LoadDup128(d, key);
}
template <class D>
HWY_INLINE Vec<D> ReverseKeys(D d, Vec<D> v) const {
return ReverseBlocks(d, v);
}
template <class D>
HWY_INLINE Vec<D> ReverseKeys2(D /* tag */, const Vec<D> v) const {
return SwapAdjacentBlocks(v);
}
// Only called for 4 keys because we do not support >512-bit vectors.
template <class D>
HWY_INLINE Vec<D> ReverseKeys4(D d, const Vec<D> v) const {
HWY_DASSERT(Lanes(d) <= 64 / sizeof(TFromD<D>));
return ReverseKeys(d, v);
}
// Only called for 4 keys because we do not support >512-bit vectors.
template <class D>
HWY_INLINE Vec<D> OddEvenPairs(D d, const Vec<D> odd,
const Vec<D> even) const {
HWY_DASSERT(Lanes(d) <= 64 / sizeof(TFromD<D>));
return ConcatUpperLower(d, odd, even);
}
template <class V>
HWY_INLINE V OddEvenKeys(const V odd, const V even) const {
return OddEvenBlocks(odd, even);
}
template <class D>
HWY_INLINE Vec<D> ReverseKeys8(D, Vec<D>) const {
HWY_ASSERT(0); // not supported: would require 1024-bit vectors
}
template <class D>
HWY_INLINE Vec<D> ReverseKeys16(D, Vec<D>) const {
HWY_ASSERT(0); // not supported: would require 2048-bit vectors
}
// This is only called for 8/16 col networks (not supported).
template <class D>
HWY_INLINE Vec<D> SwapAdjacentPairs(D, Vec<D>) const {
HWY_ASSERT(0);
}
// This is only called for 16 col networks (not supported).
template <class D>
HWY_INLINE Vec<D> SwapAdjacentQuads(D, Vec<D>) const {
HWY_ASSERT(0);
}
// This is only called for 8 col networks (not supported).
template <class D>
HWY_INLINE Vec<D> OddEvenQuads(D, Vec<D>, Vec<D>) const {
HWY_ASSERT(0);
}
};
// Anything order-related depends on the key traits *and* the order (see
// FirstOfLanes). We cannot implement just one Compare function because Lt128
// only compiles if the lane type is u64. Thus we need either overloaded
// functions with a tag type, class specializations, or separate classes.
// We avoid overloaded functions because we want all functions to be callable
// from a SortTraits without per-function wrappers. Specializing would work, but
// we are anyway going to specialize at a higher level.
struct OrderAscending128 : public Key128 {
using Order = SortAscending;
template <typename T>
HWY_INLINE bool Compare1(const T* a, const T* b) {
return (a[1] == b[1]) ? a[0] < b[0] : a[1] < b[1];
}
template <class D>
HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const {
return Lt128(d, a, b);
}
// Used by CompareTop
template <class V>
HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const {
return Lt(a, b);
}
template <class D>
HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const {
return Min128(d, a, b);
}
template <class D>
HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const {
return Max128(d, a, b);
}
template <class D>
HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
TFromD<D>* HWY_RESTRICT buf) const {
const size_t N = Lanes(d);
Store(v, d, buf);
v = SetKey(d, buf + 0); // result must be broadcasted
for (size_t i = LanesPerKey(); i < N; i += LanesPerKey()) {
v = First(d, v, SetKey(d, buf + i));
}
return v;
}
template <class D>
HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
TFromD<D>* HWY_RESTRICT buf) const {
const size_t N = Lanes(d);
Store(v, d, buf);
v = SetKey(d, buf + 0); // result must be broadcasted
for (size_t i = LanesPerKey(); i < N; i += LanesPerKey()) {
v = Last(d, v, SetKey(d, buf + i));
}
return v;
}
// Same as for regular lanes because 128-bit lanes are u64.
template <class D>
HWY_INLINE Vec<D> FirstValue(D d) const {
return Set(d, hwy::LowestValue<TFromD<D> >());
}
template <class D>
HWY_INLINE Vec<D> LastValue(D d) const {
return Set(d, hwy::HighestValue<TFromD<D> >());
}
};
struct OrderDescending128 : public Key128 {
using Order = SortDescending;
template <typename T>
HWY_INLINE bool Compare1(const T* a, const T* b) {
return (a[1] == b[1]) ? b[0] < a[0] : b[1] < a[1];
}
template <class D>
HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const {
return Lt128(d, b, a);
}
// Used by CompareTop
template <class V>
HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const {
return Lt(b, a);
}
template <class D>
HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const {
return Max128(d, a, b);
}
template <class D>
HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const {
return Min128(d, a, b);
}
template <class D>
HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
TFromD<D>* HWY_RESTRICT buf) const {
const size_t N = Lanes(d);
Store(v, d, buf);
v = SetKey(d, buf + 0); // result must be broadcasted
for (size_t i = LanesPerKey(); i < N; i += LanesPerKey()) {
v = First(d, v, SetKey(d, buf + i));
}
return v;
}
template <class D>
HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
TFromD<D>* HWY_RESTRICT buf) const {
const size_t N = Lanes(d);
Store(v, d, buf);
v = SetKey(d, buf + 0); // result must be broadcasted
for (size_t i = LanesPerKey(); i < N; i += LanesPerKey()) {
v = Last(d, v, SetKey(d, buf + i));
}
return v;
}
// Same as for regular lanes because 128-bit lanes are u64.
template <class D>
HWY_INLINE Vec<D> FirstValue(D d) const {
return Set(d, hwy::HighestValue<TFromD<D> >());
}
template <class D>
HWY_INLINE Vec<D> LastValue(D d) const {
return Set(d, hwy::LowestValue<TFromD<D> >());
}
};
// Shared code that depends on Order.
template <class Base>
class Traits128 : public Base {
#if HWY_TARGET <= HWY_AVX2
// Returns vector with only the top u64 lane valid. Useful when the next step
// is to replicate the mask anyway.
template <class D>
HWY_INLINE HWY_MAYBE_UNUSED Vec<D> CompareTop(D d, Vec<D> a, Vec<D> b) const {
const Base* base = static_cast<const Base*>(this);
const Vec<D> eqHL = VecFromMask(d, Eq(a, b));
const Vec<D> ltHL = VecFromMask(d, base->CompareLanes(a, b));
const Vec<D> ltLX = ShiftLeftLanes<1>(ltHL);
return OrAnd(ltHL, eqHL, ltLX);
}
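// Per u128 key this computes (aH < bH) || (aH == bH && aL < bL) for
// ascending order (for descending, CompareLanes reverses the operands):
// ShiftLeftLanes moves the low-lane comparison up so the combined result
// lands in the upper (most-significant) u64 lane of each key.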
// We want to swap 2 u128, i.e. 4 u64 lanes, based on the 0 or FF..FF mask in
// the most-significant of those lanes (the result of CompareTop), so
// replicate it 4x. Only called for >= 256-bit vectors.
template <class V>
HWY_INLINE V ReplicateTop4x(V v) const {
#if HWY_TARGET <= HWY_AVX3
return V{_mm512_permutex_epi64(v.raw, _MM_SHUFFLE(3, 3, 3, 3))};
#else // AVX2
return V{_mm256_permute4x64_epi64(v.raw, _MM_SHUFFLE(3, 3, 3, 3))};
#endif
}
#endif
public:
constexpr bool Is128() const { return true; }
template <class D>
HWY_INLINE void Sort2(D d, Vec<D>& a, Vec<D>& b) const {
const Base* base = static_cast<const Base*>(this);
const Vec<D> a_copy = a;
const auto lt = base->Compare(d, a, b);
a = IfThenElse(lt, a, b);
b = IfThenElse(lt, b, a_copy);
}
// Conditionally swaps even-numbered lanes with their odd-numbered neighbor.
template <class D>
HWY_INLINE Vec<D> SortPairsDistance1(D d, Vec<D> v) const {
const Base* base = static_cast<const Base*>(this);
Vec<D> swapped = base->ReverseKeys2(d, v);
#if HWY_TARGET <= HWY_AVX2
const Vec<D> select = ReplicateTop4x(CompareTop(d, v, swapped));
return IfVecThenElse(select, swapped, v);
#else
Sort2(d, v, swapped);
return base->OddEvenKeys(swapped, v);
#endif
}
// Swaps with the vector formed by reversing contiguous groups of 4 keys.
template <class D>
HWY_INLINE Vec<D> SortPairsReverse4(D d, Vec<D> v) const {
const Base* base = static_cast<const Base*>(this);
Vec<D> swapped = base->ReverseKeys4(d, v);
// Only specialize for AVX3 because this requires 512-bit vectors.
#if HWY_TARGET <= HWY_AVX3
const Vec512<uint64_t> outHx = CompareTop(d, v, swapped);
// Similar to ReplicateTop4x, we want to gang together 2 comparison results
// (4 lanes). They are not contiguous, so use permute to replicate 4x.
alignas(64) uint64_t kIndices[8] = {7, 7, 5, 5, 5, 5, 7, 7};
const Vec512<uint64_t> select =
TableLookupLanes(outHx, SetTableIndices(d, kIndices));
return IfVecThenElse(select, swapped, v);
#else
Sort2(d, v, swapped);
return base->OddEvenPairs(d, swapped, v);
#endif
}
// Conditionally swaps lane 0 with 4, 1 with 5 etc.
template <class D>
HWY_INLINE Vec<D> SortPairsDistance4(D, Vec<D>) const {
// Only used by Merge16, which would require 2048 bit vectors (unsupported).
HWY_ASSERT(0);
}
};
#endif // HWY_TARGET != HWY_SCALAR
} // namespace detail
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#endif // HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE

third_party/highway/hwy/contrib/sort/vqsort-inl.h
@@ -1,722 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Normal include guard for target-independent parts
#ifndef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_INL_H_
#define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_INL_H_
// Makes it harder for adversaries to predict our sampling locations, at the
// cost of 1-2% increased runtime.
#ifndef VQSORT_SECURE_RNG
#define VQSORT_SECURE_RNG 0
#endif
#if VQSORT_SECURE_RNG
#include "third_party/absl/random/random.h"
#endif
#include <string.h> // memcpy
#include "hwy/cache_control.h" // Prefetch
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h" // Fill24Bytes
#if HWY_IS_MSAN
#include <sanitizer/msan_interface.h>
#endif
#endif // HIGHWAY_HWY_CONTRIB_SORT_VQSORT_INL_H_
// Per-target
#if defined(HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE) == \
defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE
#undef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE
#else
#define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE
#endif
#include "hwy/contrib/sort/shared-inl.h"
#include "hwy/contrib/sort/sorting_networks-inl.h"
#include "hwy/highway.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {
#if HWY_TARGET == HWY_SCALAR
template <typename T>
void Swap(T* a, T* b) {
T t = *a;
*a = *b;
*b = t;
}
// Scalar version of HeapSort (see below)
template <class Traits, typename T>
void HeapSort(Traits st, T* HWY_RESTRICT keys, const size_t num) {
if (num < 2) return;
// Build heap.
for (size_t i = 1; i < num; i += 1) {
size_t j = i;
while (j != 0) {
const size_t idx_parent = (j - 1) / 2;  // as below, with N1 == 1
if (!st.Compare1(keys + idx_parent, keys + j)) {
break;
}
Swap(keys + j, keys + idx_parent);
j = idx_parent;
}
}
for (size_t i = num - 1; i != 0; i -= 1) {
// Swap root with last
Swap(keys + 0, keys + i);
// Sift down the new root.
size_t j = 0;
while (j < i) {
const size_t left = 2 * j + 1;
const size_t right = 2 * j + 2;
if (left >= i) break;
size_t idx_larger = j;
if (st.Compare1(keys + j, keys + left)) {
idx_larger = left;
}
if (right < i && st.Compare1(keys + idx_larger, keys + right)) {
idx_larger = right;
}
if (idx_larger == j) break;
Swap(keys + j, keys + idx_larger);
j = idx_larger;
}
}
}
#else
using Constants = hwy::SortConstants;
// ------------------------------ HeapSort
// Heapsort: O(1) space, O(N*logN) worst-case comparisons.
// Based on LLVM sanitizer_common.h, licensed under Apache-2.0.
template <class Traits, typename T>
void HeapSort(Traits st, T* HWY_RESTRICT keys, const size_t num) {
constexpr size_t N1 = st.LanesPerKey();
const FixedTag<T, N1> d;
if (num < 2 * N1) return;
// Build heap.
for (size_t i = N1; i < num; i += N1) {
size_t j = i;
while (j != 0) {
const size_t idx_parent = ((j - N1) / N1 / 2) * N1;
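// Sketch: for 128-bit keys (N1 == 2), lanes [6, 7] hold key 3, whose parent
// is key 1 at lanes [2, 3]: idx_parent == ((6 - 2) / 2 / 2) * 2 == 2. With
// N1 == 1 this reduces to the familiar (j - 1) / 2.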
if (AllFalse(d, st.Compare(d, st.SetKey(d, keys + idx_parent),
st.SetKey(d, keys + j)))) {
break;
}
st.Swap(keys + j, keys + idx_parent);
j = idx_parent;
}
}
for (size_t i = num - N1; i != 0; i -= N1) {
// Swap root with last
st.Swap(keys + 0, keys + i);
// Sift down the new root.
size_t j = 0;
while (j < i) {
const size_t left = 2 * j + N1;
const size_t right = 2 * j + 2 * N1;
if (left >= i) break;
size_t idx_larger = j;
const auto key_j = st.SetKey(d, keys + j);
if (AllTrue(d, st.Compare(d, key_j, st.SetKey(d, keys + left)))) {
idx_larger = left;
}
if (right < i && AllTrue(d, st.Compare(d, st.SetKey(d, keys + idx_larger),
st.SetKey(d, keys + right)))) {
idx_larger = right;
}
if (idx_larger == j) break;
st.Swap(keys + j, keys + idx_larger);
j = idx_larger;
}
}
}
// ------------------------------ BaseCase
// Sorts `keys` within the range [0, num) via sorting network.
template <class D, class Traits, typename T>
HWY_NOINLINE void BaseCase(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
T* HWY_RESTRICT buf) {
const size_t N = Lanes(d);
using V = decltype(Zero(d));
// _Nonzero32 requires num - 1 != 0.
if (HWY_UNLIKELY(num <= 1)) return;
// Reshape into a matrix with kMaxRows rows, and columns limited by the
// 1D `num`, which is upper-bounded by the vector width (see BaseCaseNum).
const size_t num_pow2 = size_t{1}
<< (32 - Num0BitsAboveMS1Bit_Nonzero32(
static_cast<uint32_t>(num - 1)));
HWY_DASSERT(num <= num_pow2 && num_pow2 <= Constants::BaseCaseNum(N));
const size_t cols =
HWY_MAX(st.LanesPerKey(), num_pow2 >> Constants::kMaxRowsLog2);
HWY_DASSERT(cols <= N);
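// Example (hypothetical): N = 8 u32 lanes and num = 40 give num_pow2 = 64
// and cols = HWY_MAX(1, 64 >> 4) = 4, i.e. a 16x4 matrix whose last 24
// lanes are padding.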
// Copy `keys` to `buf`.
size_t i;
for (i = 0; i + N <= num; i += N) {
Store(LoadU(d, keys + i), d, buf + i);
}
for (; i < num; ++i) {
buf[i] = keys[i];
}
// Fill with padding - last in sort order, not copied to keys.
const V kPadding = st.LastValue(d);
// Initialize an extra vector because SortingNetwork loads full vectors,
// which may exceed cols*kMaxRows.
for (; i < (cols * Constants::kMaxRows + N); i += N) {
StoreU(kPadding, d, buf + i);
}
SortingNetwork(st, buf, cols);
for (i = 0; i + N <= num; i += N) {
StoreU(Load(d, buf + i), d, keys + i);
}
for (; i < num; ++i) {
keys[i] = buf[i];
}
}
// ------------------------------ Partition
// Consumes from `left` until a multiple of kUnroll*N remains.
// Temporarily stores the right side into `buf`, then moves behind `right`.
template <class D, class Traits, class T>
HWY_NOINLINE void PartitionToMultipleOfUnroll(D d, Traits st,
T* HWY_RESTRICT keys,
size_t& left, size_t& right,
const Vec<D> pivot,
T* HWY_RESTRICT buf) {
constexpr size_t kUnroll = Constants::kPartitionUnroll;
const size_t N = Lanes(d);
size_t readL = left;
size_t bufR = 0;
const size_t num = right - left;
// Partition requires both a multiple of kUnroll*N and at least
// 2*kUnroll*N for the initial loads. If less, consume all here.
const size_t num_rem =
(num < 2 * kUnroll * N) ? num : (num & (kUnroll * N - 1));
size_t i = 0;
for (; i + N <= num_rem; i += N) {
const Vec<D> vL = LoadU(d, keys + readL);
readL += N;
const auto comp = st.Compare(d, pivot, vL);
left += CompressBlendedStore(vL, Not(comp), d, keys + left);
bufR += CompressStore(vL, comp, d, buf + bufR);
}
// Last iteration: only use valid lanes.
if (HWY_LIKELY(i != num_rem)) {
const auto mask = FirstN(d, num_rem - i);
const Vec<D> vL = LoadU(d, keys + readL);
const auto comp = st.Compare(d, pivot, vL);
left += CompressBlendedStore(vL, AndNot(comp, mask), d, keys + left);
bufR += CompressStore(vL, And(comp, mask), d, buf + bufR);
}
// MSAN seems not to understand CompressStore. buf[0, bufR) are valid.
#if HWY_IS_MSAN
__msan_unpoison(buf, bufR * sizeof(T));
#endif
// Everything we loaded was put into buf, or behind the new `left`, after
// which there is space for bufR items. First move items from `right` to
// `left` to free up space, then copy `buf` into the vacated `right`.
// A loop with masked loads from `buf` is insufficient - we would also need to
// mask from `right`. Combining a loop with memcpy for the remainders is
// slower than just memcpy, so we use that for simplicity.
right -= bufR;
memcpy(keys + left, keys + right, bufR * sizeof(T));
memcpy(keys + right, buf, bufR * sizeof(T));
}
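// Writes the lanes of `v` to both ends: lanes not after the pivot are
// compressed to keys[writeL...], the rest to the block ending at writeR.
// Scalar sketch (the order within the right side differs):
//   for (each lane x of v) (pivot < x) ? keys[--writeR] = x : keys[writeL++] = x;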
template <class D, class Traits, typename T>
HWY_INLINE void StoreLeftRight(D d, Traits st, const Vec<D> v,
const Vec<D> pivot, T* HWY_RESTRICT keys,
size_t& writeL, size_t& writeR) {
const size_t N = Lanes(d);
const auto comp = st.Compare(d, pivot, v);
const size_t num_left = CompressBlendedStore(v, Not(comp), d, keys + writeL);
writeL += num_left;
writeR -= (N - num_left);
(void)CompressBlendedStore(v, comp, d, keys + writeR);
}
template <class D, class Traits, typename T>
HWY_INLINE void StoreLeftRight4(D d, Traits st, const Vec<D> v0,
const Vec<D> v1, const Vec<D> v2,
const Vec<D> v3, const Vec<D> pivot,
T* HWY_RESTRICT keys, size_t& writeL,
size_t& writeR) {
StoreLeftRight(d, st, v0, pivot, keys, writeL, writeR);
StoreLeftRight(d, st, v1, pivot, keys, writeL, writeR);
StoreLeftRight(d, st, v2, pivot, keys, writeL, writeR);
StoreLeftRight(d, st, v3, pivot, keys, writeL, writeR);
}
// Moves "<= pivot" keys to the front, and others to the back. pivot is
// broadcasted. Time-critical!
//
// Aligned loads do not seem to be worthwhile (not bottlenecked by load ports).
template <class D, class Traits, typename T>
HWY_NOINLINE size_t Partition(D d, Traits st, T* HWY_RESTRICT keys, size_t left,
size_t right, const Vec<D> pivot,
T* HWY_RESTRICT buf) {
using V = decltype(Zero(d));
const size_t N = Lanes(d);
// StoreLeftRight will CompressBlendedStore ending at `writeR`. Unless all
// lanes happen to be in the right-side partition, this will overrun `keys`,
// which triggers asan errors. Avoid by special-casing the last vector.
HWY_DASSERT(right - left > 2 * N); // ensured by HandleSpecialCases
right -= N;
const size_t last = right;
const V vlast = LoadU(d, keys + last);
PartitionToMultipleOfUnroll(d, st, keys, left, right, pivot, buf);
constexpr size_t kUnroll = Constants::kPartitionUnroll;
// Invariant: [left, writeL) and [writeR, right) are already partitioned.
size_t writeL = left;
size_t writeR = right;
const size_t num = right - left;
// Cannot load if there were fewer than 2 * kUnroll * N.
if (HWY_LIKELY(num != 0)) {
HWY_DASSERT(num >= 2 * kUnroll * N);
HWY_DASSERT((num & (kUnroll * N - 1)) == 0);
// Make space for writing in-place by reading from left and right.
const V vL0 = LoadU(d, keys + left + 0 * N);
const V vL1 = LoadU(d, keys + left + 1 * N);
const V vL2 = LoadU(d, keys + left + 2 * N);
const V vL3 = LoadU(d, keys + left + 3 * N);
left += kUnroll * N;
right -= kUnroll * N;
const V vR0 = LoadU(d, keys + right + 0 * N);
const V vR1 = LoadU(d, keys + right + 1 * N);
const V vR2 = LoadU(d, keys + right + 2 * N);
const V vR3 = LoadU(d, keys + right + 3 * N);
// The left/right updates may consume all inputs, so check before the loop.
while (left != right) {
V v0, v1, v2, v3;
// Free up capacity for writing by loading from the side that has less.
// The choice of side is data-dependent, but branching here is faster than
// forcing branch-free code.
const size_t capacityL = left - writeL;
const size_t capacityR = writeR - right;
HWY_DASSERT(capacityL <= num && capacityR <= num); // >= 0
if (capacityR < capacityL) {
right -= kUnroll * N;
v0 = LoadU(d, keys + right + 0 * N);
v1 = LoadU(d, keys + right + 1 * N);
v2 = LoadU(d, keys + right + 2 * N);
v3 = LoadU(d, keys + right + 3 * N);
hwy::Prefetch(keys + right - 3 * kUnroll * N);
} else {
v0 = LoadU(d, keys + left + 0 * N);
v1 = LoadU(d, keys + left + 1 * N);
v2 = LoadU(d, keys + left + 2 * N);
v3 = LoadU(d, keys + left + 3 * N);
left += kUnroll * N;
hwy::Prefetch(keys + left + 3 * kUnroll * N);
}
StoreLeftRight4(d, st, v0, v1, v2, v3, pivot, keys, writeL, writeR);
}
// Now finish writing the initial left/right to the middle.
StoreLeftRight4(d, st, vL0, vL1, vL2, vL3, pivot, keys, writeL, writeR);
StoreLeftRight4(d, st, vR0, vR1, vR2, vR3, pivot, keys, writeL, writeR);
}
// We have partitioned [left, right) such that writeL is the boundary.
HWY_DASSERT(writeL == writeR);
// Make space for inserting vlast: move up to N of the first right-side keys
// into the unused space starting at last. If we have fewer, ensure they are
// the last items in that vector by subtracting from the *load* address,
// which is safe because we have at least two vectors (checked above).
const size_t totalR = last - writeL;
const size_t startR = totalR < N ? writeL + totalR - N : writeL;
StoreU(LoadU(d, keys + startR), d, keys + last);
// Partition vlast: write L, then R, into the single-vector gap at writeL.
const auto comp = st.Compare(d, pivot, vlast);
writeL += CompressBlendedStore(vlast, Not(comp), d, keys + writeL);
(void)CompressBlendedStore(vlast, comp, d, keys + writeL);
return writeL;
}
// ------------------------------ Pivot
template <class Traits, class V>
HWY_INLINE V MedianOf3(Traits st, V v0, V v1, V v2) {
const DFromV<V> d;
// Slightly faster for 128-bit, apparently because not serially dependent.
if (st.Is128()) {
// Median = XOR-sum 'minus' the first and last. Calling First twice is
// slightly faster than Compare + 2 IfThenElse or even IfThenElse + XOR.
const auto sum = Xor(Xor(v0, v1), v2);
const auto first = st.First(d, st.First(d, v0, v1), v2);
const auto last = st.Last(d, st.Last(d, v0, v1), v2);
return Xor(Xor(sum, first), last);
}
st.Sort2(d, v0, v2);
v1 = st.Last(d, v0, v1);
v1 = st.First(d, v1, v2);
return v1;
}
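// Worked example of the XOR identity above: v0=3, v1=1, v2=2 give
// sum = 3^1^2 = 0, first = 1, last = 3, and 0 ^ 1 ^ 3 = 2, the median.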
// Replaces triplets with their median and recurses until less than 3 keys
// remain. Ignores leftover values (non-whole triplets)!
template <class D, class Traits, typename T>
Vec<D> RecursiveMedianOf3(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
T* HWY_RESTRICT buf) {
const size_t N = Lanes(d);
constexpr size_t N1 = st.LanesPerKey();
if (num < 3 * N1) return st.SetKey(d, keys);
size_t read = 0;
size_t written = 0;
// Triplets of vectors
for (; read + 3 * N <= num; read += 3 * N) {
const auto v0 = Load(d, keys + read + 0 * N);
const auto v1 = Load(d, keys + read + 1 * N);
const auto v2 = Load(d, keys + read + 2 * N);
Store(MedianOf3(st, v0, v1, v2), d, buf + written);
written += N;
}
// Triplets of keys
for (; read + 3 * N1 <= num; read += 3 * N1) {
const auto v0 = st.SetKey(d, keys + read + 0 * N1);
const auto v1 = st.SetKey(d, keys + read + 1 * N1);
const auto v2 = st.SetKey(d, keys + read + 2 * N1);
StoreU(MedianOf3(st, v0, v1, v2), d, buf + written);
written += N1;
}
// Tail recursion; swap buffers
return RecursiveMedianOf3(d, st, buf, written, keys);
}
#if VQSORT_SECURE_RNG
using Generator = absl::BitGen;
#else
// Based on https://github.com/numpy/numpy/issues/16313#issuecomment-641897028
#pragma pack(push, 1)
class Generator {
public:
Generator(const void* heap, size_t num) {
Sorter::Fill24Bytes(heap, num, &a_);
k_ = 1; // stream index: must be odd
}
uint64_t operator()() {
const uint64_t b = b_;
w_ += k_;
const uint64_t next = a_ ^ w_;
a_ = (b + (b << 3)) ^ (b >> 11);
const uint64_t rot = (b << 24) | (b >> 40);
b_ = rot + next;
return next;
}
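// a_ and b_ are the chaotic state; w_ is a Weyl counter whose odd increment
// k_ selects the stream (see the sfc64 discussion linked above).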
private:
uint64_t a_;
uint64_t b_;
uint64_t w_;
uint64_t k_; // increment
};
#pragma pack(pop)
#endif // !VQSORT_SECURE_RNG
// Returns a slightly biased random index of a chunk in [0, num_chunks).
// See https://www.pcg-random.org/posts/bounded-rands.html.
HWY_INLINE size_t RandomChunkIndex(const uint32_t num_chunks, uint32_t bits) {
const uint64_t chunk_index = (static_cast<uint64_t>(bits) * num_chunks) >> 32;
HWY_DASSERT(chunk_index < num_chunks);
return static_cast<size_t>(chunk_index);
}
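// Example: bits = 0x80000000 (= 2^31) and num_chunks = 5 yield
// (2^31 * 5) >> 32 = 2, i.e. the middle of the 32-bit range maps to the
// middle chunk.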
template <class D, class Traits, typename T>
HWY_NOINLINE Vec<D> ChoosePivot(D d, Traits st, T* HWY_RESTRICT keys,
const size_t begin, const size_t end,
T* HWY_RESTRICT buf, Generator& rng) {
using V = decltype(Zero(d));
const size_t N = Lanes(d);
// Power of two
const size_t lanes_per_chunk = Constants::LanesPerChunk(sizeof(T), N);
keys += begin;
size_t num = end - begin;
// Align start of keys to chunks. We always have at least 2 chunks because the
// base case would have handled anything up to 16 vectors, i.e. >= 4 chunks.
HWY_DASSERT(num >= 2 * lanes_per_chunk);
const size_t misalign =
(reinterpret_cast<uintptr_t>(keys) / sizeof(T)) & (lanes_per_chunk - 1);
if (misalign != 0) {
const size_t consume = lanes_per_chunk - misalign;
keys += consume;
num -= consume;
}
// Generate enough random bits for 9 uint32
uint64_t* bits64 = reinterpret_cast<uint64_t*>(buf);
for (size_t i = 0; i < 5; ++i) {
bits64[i] = rng();
}
const uint32_t* bits = reinterpret_cast<const uint32_t*>(buf);
const uint32_t lpc32 = static_cast<uint32_t>(lanes_per_chunk);
// Avoid division
const size_t log2_lpc = Num0BitsBelowLS1Bit_Nonzero32(lpc32);
const size_t num_chunks64 = num >> log2_lpc;
// Clamp to uint32 for RandomChunkIndex
const uint32_t num_chunks =
static_cast<uint32_t>(HWY_MIN(num_chunks64, 0xFFFFFFFFull));
const size_t offset0 = RandomChunkIndex(num_chunks, bits[0]) << log2_lpc;
const size_t offset1 = RandomChunkIndex(num_chunks, bits[1]) << log2_lpc;
const size_t offset2 = RandomChunkIndex(num_chunks, bits[2]) << log2_lpc;
const size_t offset3 = RandomChunkIndex(num_chunks, bits[3]) << log2_lpc;
const size_t offset4 = RandomChunkIndex(num_chunks, bits[4]) << log2_lpc;
const size_t offset5 = RandomChunkIndex(num_chunks, bits[5]) << log2_lpc;
const size_t offset6 = RandomChunkIndex(num_chunks, bits[6]) << log2_lpc;
const size_t offset7 = RandomChunkIndex(num_chunks, bits[7]) << log2_lpc;
const size_t offset8 = RandomChunkIndex(num_chunks, bits[8]) << log2_lpc;
for (size_t i = 0; i < lanes_per_chunk; i += N) {
const V v0 = Load(d, keys + offset0 + i);
const V v1 = Load(d, keys + offset1 + i);
const V v2 = Load(d, keys + offset2 + i);
const V medians0 = MedianOf3(st, v0, v1, v2);
Store(medians0, d, buf + i);
const V v3 = Load(d, keys + offset3 + i);
const V v4 = Load(d, keys + offset4 + i);
const V v5 = Load(d, keys + offset5 + i);
const V medians1 = MedianOf3(st, v3, v4, v5);
Store(medians1, d, buf + i + lanes_per_chunk);
const V v6 = Load(d, keys + offset6 + i);
const V v7 = Load(d, keys + offset7 + i);
const V v8 = Load(d, keys + offset8 + i);
const V medians2 = MedianOf3(st, v6, v7, v8);
Store(medians2, d, buf + i + lanes_per_chunk * 2);
}
return RecursiveMedianOf3(d, st, buf, 3 * lanes_per_chunk,
buf + 3 * lanes_per_chunk);
}
// Compute exact min/max to detect all-equal partitions. Only called after a
// degenerate Partition (none in the right partition).
template <class D, class Traits, typename T>
HWY_NOINLINE void ScanMinMax(D d, Traits st, const T* HWY_RESTRICT keys,
size_t num, T* HWY_RESTRICT buf, Vec<D>& first,
Vec<D>& last) {
const size_t N = Lanes(d);
first = st.LastValue(d);
last = st.FirstValue(d);
size_t i = 0;
for (; i + N <= num; i += N) {
const Vec<D> v = LoadU(d, keys + i);
first = st.First(d, v, first);
last = st.Last(d, v, last);
}
if (HWY_LIKELY(i != num)) {
HWY_DASSERT(num >= N); // See HandleSpecialCases
const Vec<D> v = LoadU(d, keys + num - N);
first = st.First(d, v, first);
last = st.Last(d, v, last);
}
first = st.FirstOfLanes(d, first, buf);
last = st.LastOfLanes(d, last, buf);
}
template <class D, class Traits, typename T>
void Recurse(D d, Traits st, T* HWY_RESTRICT keys, const size_t begin,
const size_t end, const Vec<D> pivot, T* HWY_RESTRICT buf,
Generator& rng, size_t remaining_levels) {
HWY_DASSERT(begin + 1 < end);
const size_t num = end - begin; // >= 2
// Too many degenerate partitions. This is extremely unlikely to happen
// because we select pivots from large (though still O(1)) samples.
if (HWY_UNLIKELY(remaining_levels == 0)) {
HeapSort(st, keys + begin, num); // Slow but N*logN.
return;
}
const ptrdiff_t base_case_num =
static_cast<ptrdiff_t>(Constants::BaseCaseNum(Lanes(d)));
const size_t bound = Partition(d, st, keys, begin, end, pivot, buf);
const ptrdiff_t num_left =
static_cast<ptrdiff_t>(bound) - static_cast<ptrdiff_t>(begin);
const ptrdiff_t num_right =
static_cast<ptrdiff_t>(end) - static_cast<ptrdiff_t>(bound);
// Check for degenerate partitions (i.e. Partition did not move any keys):
if (HWY_UNLIKELY(num_right == 0)) {
// Because the pivot is one of the keys, it must have been equal to the
// first or last key in sort order. Scan for the actual min/max:
// passing the current pivot as the new bound is insufficient because one of
// the partitions might not actually include that key.
Vec<D> first, last;
ScanMinMax(d, st, keys + begin, num, buf, first, last);
if (AllTrue(d, Eq(first, last))) return;
// Separate recursion to make sure that we don't pick `last` as the
// pivot - that would again lead to a degenerate partition.
Recurse(d, st, keys, begin, end, first, buf, rng, remaining_levels - 1);
return;
}
if (HWY_UNLIKELY(num_left <= base_case_num)) {
BaseCase(d, st, keys + begin, static_cast<size_t>(num_left), buf);
} else {
const Vec<D> next_pivot = ChoosePivot(d, st, keys, begin, bound, buf, rng);
Recurse(d, st, keys, begin, bound, next_pivot, buf, rng,
remaining_levels - 1);
}
if (HWY_UNLIKELY(num_right <= base_case_num)) {
BaseCase(d, st, keys + bound, static_cast<size_t>(num_right), buf);
} else {
const Vec<D> next_pivot = ChoosePivot(d, st, keys, bound, end, buf, rng);
Recurse(d, st, keys, bound, end, next_pivot, buf, rng,
remaining_levels - 1);
}
}
// Returns true if sorting is finished.
template <class D, class Traits, typename T>
bool HandleSpecialCases(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
T* HWY_RESTRICT buf) {
const size_t N = Lanes(d);
const size_t base_case_num = Constants::BaseCaseNum(N);
// 128-bit keys require vectors with at least two u64 lanes, which is always
// the case unless `d` requests partial vectors (e.g. fraction = 1/2) AND the
// hardware vector width is less than 128bit / fraction.
const bool partial_128 = N < 2 && st.Is128();
// Partition assumes its input is at least two vectors. If vectors are huge,
// base_case_num may actually be smaller. If so (only possible on RVV),
// pass a capped or partial d (LMUL < 1).
constexpr bool kPotentiallyHuge =
HWY_MAX_BYTES / sizeof(T) > Constants::kMaxRows * Constants::kMaxCols;
const bool huge_vec = kPotentiallyHuge && (2 * N > base_case_num);
if (partial_128 || huge_vec) {
// PERFORMANCE WARNING: falling back to HeapSort.
HeapSort(st, keys, num);
return true;
}
// Small arrays: use sorting network, no need for other checks.
if (HWY_UNLIKELY(num <= base_case_num)) {
BaseCase(d, st, keys, num, buf);
return true;
}
// We could also check for already sorted/reverse/equal, but that's probably
// counterproductive if vqsort is used as a base case.
return false; // not finished sorting
}
#endif // HWY_TARGET != HWY_SCALAR
} // namespace detail
// Sorts `keys[0..num-1]` according to the order defined by `st.Compare`.
// In-place i.e. O(1) additional storage. Worst-case N*logN comparisons.
// Non-stable (order of equal keys may change), except for the common case where
// the upper bits of T are the key, and the lower bits are a sequential or at
// least unique ID.
// There is no upper limit on `num`, but note that pivots may be chosen by
// sampling only from the first 256 GiB.
//
// `d` is typically SortTag<T> (chooses between full and partial vectors).
// `st` is SharedTraits<{LaneTraits|Traits128}<Order*>>. This abstraction layer
// bridges differences in sort order and single-lane vs 128-bit keys.
template <class D, class Traits, typename T>
void Sort(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
T* HWY_RESTRICT buf) {
#if HWY_TARGET == HWY_SCALAR
(void)d;
(void)buf;
// PERFORMANCE WARNING: vqsort is not enabled for the non-SIMD target
return detail::HeapSort(st, keys, num);
#else
if (detail::HandleSpecialCases(d, st, keys, num, buf)) return;
#if HWY_MAX_BYTES > 64
// sorting_networks-inl and traits assume no more than 512 bit vectors.
if (Lanes(d) > 64 / sizeof(T)) {
return Sort(CappedTag<T, 64 / sizeof(T)>(), st, keys, num, buf);
}
#endif // HWY_MAX_BYTES > 64
// Pulled out of the recursion so we can special-case degenerate partitions.
detail::Generator rng(keys, num);
const Vec<D> pivot = detail::ChoosePivot(d, st, keys, 0, num, buf, rng);
// Introspection: switch to worst-case N*logN heapsort after this many.
const size_t max_levels = 2 * hwy::CeilLog2(num) + 4;
detail::Recurse(d, st, keys, 0, num, pivot, buf, rng, max_levels);
#endif // HWY_TARGET == HWY_SCALAR
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#endif // HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE
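
The tag/traits pairing that the comment above describes is easiest to see in a concrete instantiation. The following sketch mirrors the per-type vqsort_*.cc translation units shown later in this commit; the function name is illustrative, and `buf` is assumed to be the scratch buffer preallocated by Sorter:

// Inside HWY_BEFORE_NAMESPACE() / namespace hwy::HWY_NAMESPACE.
void SortU32AscSketch(uint32_t* HWY_RESTRICT keys, size_t num,
                      uint32_t* HWY_RESTRICT buf) {
  SortTag<uint32_t> d;  // full or partial vectors, chosen per target
  detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
  Sort(d, st, keys, num, buf);
}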

third_party/highway/hwy/contrib/sort/vqsort.cc (vendored), 148 changed lines

@ -1,148 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/vqsort.h"
#include <string.h> // memset
#include "hwy/aligned_allocator.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort.cc"
#include "hwy/foreach_target.h"
// After foreach_target
#include "hwy/contrib/sort/shared-inl.h"
// Seed source for SFC generator: 1=getrandom, 2=CryptGenRandom
// (not all Android builds support the getrandom wrapper)
#ifndef VQSORT_SECURE_SEED
#if (defined(linux) || defined(__linux__)) && \
!(defined(ANDROID) || defined(__ANDROID__) || HWY_ARCH_RVV)
#define VQSORT_SECURE_SEED 1
#elif defined(_WIN32) || defined(_WIN64)
#define VQSORT_SECURE_SEED 2
#else
#define VQSORT_SECURE_SEED 0
#endif
#endif // VQSORT_SECURE_SEED
#if !VQSORT_SECURE_RNG
#include <time.h>
#if VQSORT_SECURE_SEED == 1
#include <sys/random.h>
#elif VQSORT_SECURE_SEED == 2
#include <windows.h>
#pragma comment(lib, "Advapi32.lib")
// Must come after windows.h.
#include <wincrypt.h>
#endif // VQSORT_SECURE_SEED
#endif // !VQSORT_SECURE_RNG
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
size_t VectorSize() { return Lanes(ScalableTag<uint8_t, 3>()); }
bool HaveFloat64() { return HWY_HAVE_FLOAT64; }
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(VectorSize);
HWY_EXPORT(HaveFloat64);
HWY_INLINE size_t PivotBufNum(size_t sizeof_t, size_t N) {
// 3 chunks of medians, 1 chunk of medians of medians, plus two padding
// vectors.
const size_t lpc = SortConstants::LanesPerChunk(sizeof_t, N);
return (3 + 1) * lpc + 2 * N;
}
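// Worked example with hypothetical numbers: if LanesPerChunk(sizeof_t, N)
// returned 16 and N were 8, the pivot buffer would need
// (3 + 1) * 16 + 2 * 8 = 80 lanes.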
} // namespace
Sorter::Sorter() {
// Determine the largest buffer size required for any type by trying them all.
// (The capping of N in BaseCaseNum means that smaller N but larger sizeof_t
// may require a larger buffer.)
const size_t vector_size = HWY_DYNAMIC_DISPATCH(VectorSize)();
size_t max_bytes = 0;
for (size_t sizeof_t :
{sizeof(uint16_t), sizeof(uint32_t), sizeof(uint64_t)}) {
const size_t N = vector_size / sizeof_t;
// One extra for padding plus another for full-vector loads.
const size_t base_case = SortConstants::BaseCaseNum(N) + 2 * N;
const size_t partition_num = SortConstants::PartitionBufNum(N);
const size_t buf_lanes =
HWY_MAX(base_case, HWY_MAX(partition_num, PivotBufNum(sizeof_t, N)));
max_bytes = HWY_MAX(max_bytes, buf_lanes * sizeof_t);
}
ptr_ = hwy::AllocateAlignedBytes(max_bytes, nullptr, nullptr);
// Prevent msan errors by initializing.
memset(ptr_, 0, max_bytes);
}
void Sorter::Delete() {
FreeAlignedBytes(ptr_, nullptr, nullptr);
ptr_ = nullptr;
}
#if !VQSORT_SECURE_RNG
void Sorter::Fill24Bytes(const void* seed_heap, size_t seed_num, void* bytes) {
#if VQSORT_SECURE_SEED == 1
// May block if urandom is not yet initialized.
const ssize_t ret = getrandom(bytes, 24, /*flags=*/0);
if (ret == 24) return;
#elif VQSORT_SECURE_SEED == 2
HCRYPTPROV hProvider{};
if (CryptAcquireContextA(&hProvider, nullptr, nullptr, PROV_RSA_FULL,
CRYPT_VERIFYCONTEXT)) {
const BOOL ok =
CryptGenRandom(hProvider, 24, reinterpret_cast<BYTE*>(bytes));
CryptReleaseContext(hProvider, 0);
if (ok) return;
}
#endif
// VQSORT_SECURE_SEED == 0, or one of the above failed. Get some entropy from
// stack/heap/code addresses and the clock() timer.
uint64_t* words = reinterpret_cast<uint64_t*>(bytes);
uint64_t** seed_stack = &words;
void (*seed_code)(const void*, size_t, void*) = &Fill24Bytes;
const uintptr_t bits_stack = reinterpret_cast<uintptr_t>(seed_stack);
const uintptr_t bits_heap = reinterpret_cast<uintptr_t>(seed_heap);
const uintptr_t bits_code = reinterpret_cast<uintptr_t>(seed_code);
const uint64_t bits_time = static_cast<uint64_t>(clock());
words[0] = bits_stack ^ bits_time ^ seed_num;
words[1] = bits_heap ^ bits_time ^ seed_num;
words[2] = bits_code ^ bits_time ^ seed_num;
}
#endif // !VQSORT_SECURE_RNG
bool Sorter::HaveFloat64() { return HWY_DYNAMIC_DISPATCH(HaveFloat64)(); }
} // namespace hwy
#endif // HWY_ONCE

third_party/highway/hwy/contrib/sort/vqsort.h (vendored), 104 changed lines

@ -1,104 +0,0 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Interface to vectorized quicksort with dynamic dispatch.
#ifndef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_
#define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_
#include "hwy/base.h"
namespace hwy {
// Aligned 128-bit type. Cannot use __int128 because clang doesn't yet align it:
// https://reviews.llvm.org/D86310
#pragma pack(push, 1)
struct alignas(16) uint128_t {
uint64_t lo; // little-endian layout
uint64_t hi;
};
#pragma pack(pop)
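// Illustration with hypothetical values: {lo=1, hi=2} represents 2 * 2^64 + 1,
// so in ascending order it sorts after {lo=0xFFFFFFFFFFFFFFFF, hi=1}, which
// represents 2^65 - 1.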
// Tag arguments that determine the sort order.
struct SortAscending {
constexpr bool IsAscending() const { return true; }
};
struct SortDescending {
constexpr bool IsAscending() const { return false; }
};
// Allocates O(1) space. Type-erased RAII wrapper over hwy/aligned_allocator.h.
// This allows amortizing the allocation over multiple sorts.
class HWY_CONTRIB_DLLEXPORT Sorter {
public:
Sorter();
~Sorter() { Delete(); }
// Move-only
Sorter(const Sorter&) = delete;
Sorter& operator=(const Sorter&) = delete;
Sorter(Sorter&& other) {
Delete();
ptr_ = other.ptr_;
other.ptr_ = nullptr;
}
Sorter& operator=(Sorter&& other) {
Delete();
ptr_ = other.ptr_;
other.ptr_ = nullptr;
return *this;
}
// Sorts keys[0, n). Dispatches to the best available instruction set,
// and does not allocate memory.
void operator()(uint16_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
void operator()(uint16_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
void operator()(uint32_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
void operator()(uint32_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
void operator()(uint64_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
void operator()(uint64_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
void operator()(int16_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
void operator()(int16_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
void operator()(int32_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
void operator()(int32_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
void operator()(int64_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
void operator()(int64_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
void operator()(float* HWY_RESTRICT keys, size_t n, SortAscending) const;
void operator()(float* HWY_RESTRICT keys, size_t n, SortDescending) const;
void operator()(double* HWY_RESTRICT keys, size_t n, SortAscending) const;
void operator()(double* HWY_RESTRICT keys, size_t n, SortDescending) const;
void operator()(uint128_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
void operator()(uint128_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
// For internal use only
static void Fill24Bytes(const void* seed_heap, size_t seed_num, void* bytes);
static bool HaveFloat64();
private:
void Delete();
template <typename T>
T* Get() const {
return static_cast<T*>(ptr_);
}
void* ptr_ = nullptr;
};
} // namespace hwy
#endif // HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_
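
A minimal usage sketch based only on the interface above (the key values are arbitrary):

#include <stdint.h>
#include <vector>
#include "hwy/contrib/sort/vqsort.h"

void SorterDemo() {
  std::vector<uint64_t> keys = {3, 1, 4, 1, 5};
  hwy::Sorter sorter;  // allocates the shared scratch buffer once
  sorter(keys.data(), keys.size(), hwy::SortAscending());
  // The same instance may be reused for other types and orders, amortizing
  // the allocation as the class comment describes.
  sorter(keys.data(), keys.size(), hwy::SortDescending());
}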

third_party/highway/hwy/contrib/sort/vqsort_128a.cc (vendored)

@ -1,55 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_128a.cc"
#include "hwy/foreach_target.h"
// After foreach_target
#include "hwy/contrib/sort/traits128-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
void Sort128Asc(uint64_t* HWY_RESTRICT keys, size_t num,
uint64_t* HWY_RESTRICT buf) {
SortTag<uint64_t> d;
detail::SharedTraits<detail::Traits128<detail::OrderAscending128>> st;
Sort(d, st, keys, num, buf);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(Sort128Asc);
} // namespace
void Sorter::operator()(uint128_t* HWY_RESTRICT keys, size_t n,
SortAscending) const {
HWY_DYNAMIC_DISPATCH(Sort128Asc)
(reinterpret_cast<uint64_t*>(keys), n * 2, Get<uint64_t>());
}
} // namespace hwy
#endif // HWY_ONCE

third_party/highway/hwy/contrib/sort/vqsort_128d.cc (vendored)

@ -1,55 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_128d.cc"
#include "hwy/foreach_target.h"
// After foreach_target
#include "hwy/contrib/sort/traits128-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
void Sort128Desc(uint64_t* HWY_RESTRICT keys, size_t num,
uint64_t* HWY_RESTRICT buf) {
SortTag<uint64_t> d;
detail::SharedTraits<detail::Traits128<detail::OrderDescending128>> st;
Sort(d, st, keys, num, buf);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(Sort128Desc);
} // namespace
void Sorter::operator()(uint128_t* HWY_RESTRICT keys, size_t n,
SortDescending) const {
HWY_DYNAMIC_DISPATCH(Sort128Desc)
(reinterpret_cast<uint64_t*>(keys), n * 2, Get<uint64_t>());
}
} // namespace hwy
#endif // HWY_ONCE

third_party/highway/hwy/contrib/sort/vqsort_f32a.cc (vendored)

@ -1,53 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f32a.cc"
#include "hwy/foreach_target.h"
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
void SortF32Asc(float* HWY_RESTRICT keys, size_t num, float* HWY_RESTRICT buf) {
SortTag<float> d;
detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
Sort(d, st, keys, num, buf);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortF32Asc);
} // namespace
void Sorter::operator()(float* HWY_RESTRICT keys, size_t n,
SortAscending) const {
HWY_DYNAMIC_DISPATCH(SortF32Asc)(keys, n, Get<float>());
}
} // namespace hwy
#endif // HWY_ONCE

third_party/highway/hwy/contrib/sort/vqsort_f32d.cc (vendored)

@ -1,54 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f32d.cc"
#include "hwy/foreach_target.h"
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
void SortF32Desc(float* HWY_RESTRICT keys, size_t num,
float* HWY_RESTRICT buf) {
SortTag<float> d;
detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st;
Sort(d, st, keys, num, buf);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortF32Desc);
} // namespace
void Sorter::operator()(float* HWY_RESTRICT keys, size_t n,
SortDescending) const {
HWY_DYNAMIC_DISPATCH(SortF32Desc)(keys, n, Get<float>());
}
} // namespace hwy
#endif // HWY_ONCE

third_party/highway/hwy/contrib/sort/vqsort_f64a.cc (vendored)

@ -1,61 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f64a.cc"
#include "hwy/foreach_target.h"
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
void SortF64Asc(double* HWY_RESTRICT keys, size_t num,
double* HWY_RESTRICT buf) {
#if HWY_HAVE_FLOAT64
SortTag<double> d;
detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
Sort(d, st, keys, num, buf);
#else
(void)keys;
(void)num;
(void)buf;
HWY_ASSERT(0);
#endif
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortF64Asc);
} // namespace
void Sorter::operator()(double* HWY_RESTRICT keys, size_t n,
SortAscending) const {
HWY_DYNAMIC_DISPATCH(SortF64Asc)(keys, n, Get<double>());
}
} // namespace hwy
#endif // HWY_ONCE

third_party/highway/hwy/contrib/sort/vqsort_f64d.cc (vendored)

@ -1,61 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f64d.cc"
#include "hwy/foreach_target.h"
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
void SortF64Desc(double* HWY_RESTRICT keys, size_t num,
double* HWY_RESTRICT buf) {
#if HWY_HAVE_FLOAT64
SortTag<double> d;
detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st;
Sort(d, st, keys, num, buf);
#else
(void)keys;
(void)num;
(void)buf;
HWY_ASSERT(0);
#endif
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortF64Desc);
} // namespace
void Sorter::operator()(double* HWY_RESTRICT keys, size_t n,
SortDescending) const {
HWY_DYNAMIC_DISPATCH(SortF64Desc)(keys, n, Get<double>());
}
} // namespace hwy
#endif // HWY_ONCE

third_party/highway/hwy/contrib/sort/vqsort_i16a.cc (vendored)

@ -1,59 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i16a.cc"
#include "hwy/foreach_target.h"
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
// Workaround for build timeout
#if !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
void SortI16Asc(int16_t* HWY_RESTRICT keys, size_t num,
int16_t* HWY_RESTRICT buf) {
SortTag<int16_t> d;
detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
Sort(d, st, keys, num, buf);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortI16Asc);
} // namespace
void Sorter::operator()(int16_t* HWY_RESTRICT keys, size_t n,
SortAscending) const {
HWY_DYNAMIC_DISPATCH(SortI16Asc)(keys, n, Get<int16_t>());
}
} // namespace hwy
#endif // HWY_ONCE
#endif // !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD

third_party/highway/hwy/contrib/sort/vqsort_i16d.cc (vendored)

@ -1,59 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i16d.cc"
#include "hwy/foreach_target.h"
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
// Workaround for build timeout
#if !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
void SortI16Desc(int16_t* HWY_RESTRICT keys, size_t num,
int16_t* HWY_RESTRICT buf) {
SortTag<int16_t> d;
detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st;
Sort(d, st, keys, num, buf);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortI16Desc);
} // namespace
void Sorter::operator()(int16_t* HWY_RESTRICT keys, size_t n,
SortDescending) const {
HWY_DYNAMIC_DISPATCH(SortI16Desc)(keys, n, Get<int16_t>());
}
} // namespace hwy
#endif // HWY_ONCE
#endif // !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD

third_party/highway/hwy/contrib/sort/vqsort_i32a.cc (vendored)

@ -1,54 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i32a.cc"
#include "hwy/foreach_target.h"
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
void SortI32Asc(int32_t* HWY_RESTRICT keys, size_t num,
int32_t* HWY_RESTRICT buf) {
SortTag<int32_t> d;
detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
Sort(d, st, keys, num, buf);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortI32Asc);
} // namespace
void Sorter::operator()(int32_t* HWY_RESTRICT keys, size_t n,
SortAscending) const {
HWY_DYNAMIC_DISPATCH(SortI32Asc)(keys, n, Get<int32_t>());
}
} // namespace hwy
#endif // HWY_ONCE

third_party/highway/hwy/contrib/sort/vqsort_i32d.cc (vendored)

@ -1,54 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i32d.cc"
#include "hwy/foreach_target.h"
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
void SortI32Desc(int32_t* HWY_RESTRICT keys, size_t num,
int32_t* HWY_RESTRICT buf) {
SortTag<int32_t> d;
detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st;
Sort(d, st, keys, num, buf);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortI32Desc);
} // namespace
void Sorter::operator()(int32_t* HWY_RESTRICT keys, size_t n,
SortDescending) const {
HWY_DYNAMIC_DISPATCH(SortI32Desc)(keys, n, Get<int32_t>());
}
} // namespace hwy
#endif // HWY_ONCE

third_party/highway/hwy/contrib/sort/vqsort_i64a.cc (vendored)

@ -1,54 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i64a.cc"
#include "hwy/foreach_target.h"
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
void SortI64Asc(int64_t* HWY_RESTRICT keys, size_t num,
int64_t* HWY_RESTRICT buf) {
SortTag<int64_t> d;
detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
Sort(d, st, keys, num, buf);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortI64Asc);
} // namespace
void Sorter::operator()(int64_t* HWY_RESTRICT keys, size_t n,
SortAscending) const {
HWY_DYNAMIC_DISPATCH(SortI64Asc)(keys, n, Get<int64_t>());
}
} // namespace hwy
#endif // HWY_ONCE

third_party/highway/hwy/contrib/sort/vqsort_i64d.cc (vendored)

@ -1,54 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i64d.cc"
#include "hwy/foreach_target.h"
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
void SortI64Desc(int64_t* HWY_RESTRICT keys, size_t num,
int64_t* HWY_RESTRICT buf) {
SortTag<int64_t> d;
detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st;
Sort(d, st, keys, num, buf);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortI64Desc);
} // namespace
void Sorter::operator()(int64_t* HWY_RESTRICT keys, size_t n,
SortDescending) const {
HWY_DYNAMIC_DISPATCH(SortI64Desc)(keys, n, Get<int64_t>());
}
} // namespace hwy
#endif // HWY_ONCE

third_party/highway/hwy/contrib/sort/vqsort_u16a.cc (vendored)

@ -1,59 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u16a.cc"
#include "hwy/foreach_target.h"
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
// Workaround for build timeout
#if !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
void SortU16Asc(uint16_t* HWY_RESTRICT keys, size_t num,
uint16_t* HWY_RESTRICT buf) {
SortTag<uint16_t> d;
detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
Sort(d, st, keys, num, buf);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortU16Asc);
} // namespace
void Sorter::operator()(uint16_t* HWY_RESTRICT keys, size_t n,
SortAscending) const {
HWY_DYNAMIC_DISPATCH(SortU16Asc)(keys, n, Get<uint16_t>());
}
} // namespace hwy
#endif // HWY_ONCE
#endif // !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD

third_party/highway/hwy/contrib/sort/vqsort_u16d.cc (vendored)

@ -1,59 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u16d.cc"
#include "hwy/foreach_target.h"
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
// Workaround for build timeout
#if !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
void SortU16Desc(uint16_t* HWY_RESTRICT keys, size_t num,
uint16_t* HWY_RESTRICT buf) {
SortTag<uint16_t> d;
detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st;
Sort(d, st, keys, num, buf);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortU16Desc);
} // namespace
void Sorter::operator()(uint16_t* HWY_RESTRICT keys, size_t n,
SortDescending) const {
HWY_DYNAMIC_DISPATCH(SortU16Desc)(keys, n, Get<uint16_t>());
}
} // namespace hwy
#endif // HWY_ONCE
#endif // !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD

third_party/highway/hwy/contrib/sort/vqsort_u32a.cc (vendored)

@ -1,54 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u32a.cc"
#include "hwy/foreach_target.h"
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
void SortU32Asc(uint32_t* HWY_RESTRICT keys, size_t num,
uint32_t* HWY_RESTRICT buf) {
SortTag<uint32_t> d;
detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
Sort(d, st, keys, num, buf);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortU32Asc);
} // namespace
void Sorter::operator()(uint32_t* HWY_RESTRICT keys, size_t n,
SortAscending) const {
HWY_DYNAMIC_DISPATCH(SortU32Asc)(keys, n, Get<uint32_t>());
}
} // namespace hwy
#endif // HWY_ONCE

third_party/highway/hwy/contrib/sort/vqsort_u32d.cc (vendored)

@ -1,54 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u32d.cc"
#include "hwy/foreach_target.h"
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
void SortU32Desc(uint32_t* HWY_RESTRICT keys, size_t num,
uint32_t* HWY_RESTRICT buf) {
SortTag<uint32_t> d;
detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st;
Sort(d, st, keys, num, buf);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortU32Desc);
} // namespace
void Sorter::operator()(uint32_t* HWY_RESTRICT keys, size_t n,
SortDescending) const {
HWY_DYNAMIC_DISPATCH(SortU32Desc)(keys, n, Get<uint32_t>());
}
} // namespace hwy
#endif // HWY_ONCE

third_party/highway/hwy/contrib/sort/vqsort_u64a.cc (vendored)

@ -1,54 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u64a.cc"
#include "hwy/foreach_target.h"
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
void SortU64Asc(uint64_t* HWY_RESTRICT keys, size_t num,
uint64_t* HWY_RESTRICT buf) {
SortTag<uint64_t> d;
detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st;
Sort(d, st, keys, num, buf);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortU64Asc);
} // namespace
void Sorter::operator()(uint64_t* HWY_RESTRICT keys, size_t n,
SortAscending) const {
HWY_DYNAMIC_DISPATCH(SortU64Asc)(keys, n, Get<uint64_t>());
}
} // namespace hwy
#endif // HWY_ONCE

third_party/highway/hwy/contrib/sort/vqsort_u64d.cc (vendored)

@ -1,54 +0,0 @@
// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/contrib/sort/disabled_targets.h"
#include "hwy/contrib/sort/vqsort.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u64d.cc"
#include "hwy/foreach_target.h"
// After foreach_target
#include "hwy/contrib/sort/traits-inl.h"
#include "hwy/contrib/sort/vqsort-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
void SortU64Desc(uint64_t* HWY_RESTRICT keys, size_t num,
uint64_t* HWY_RESTRICT buf) {
SortTag<uint64_t> d;
detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st;
Sort(d, st, keys, num, buf);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
namespace {
HWY_EXPORT(SortU64Desc);
} // namespace
void Sorter::operator()(uint64_t* HWY_RESTRICT keys, size_t n,
SortDescending) const {
HWY_DYNAMIC_DISPATCH(SortU64Desc)(keys, n, Get<uint64_t>());
}
} // namespace hwy
#endif // HWY_ONCE


@ -106,6 +106,20 @@
//------------------------------------------------------------------------------
// Architecture
#if defined(HWY_EMULATE_SVE)
#define HWY_ARCH_X86_32 0
#define HWY_ARCH_X86_64 0
#define HWY_ARCH_X86 0
#define HWY_ARCH_PPC 0
#define HWY_ARCH_ARM_A64 1
#define HWY_ARCH_ARM_V7 0
#define HWY_ARCH_ARM 1
#define HWY_ARCH_WASM 0
#define HWY_ARCH_RVV 0
#else
#if defined(__i386__) || defined(_M_IX86)
#define HWY_ARCH_X86_32 1
#else
@ -168,6 +182,8 @@
#define HWY_ARCH_RVV 0
#endif
#endif // defined(HWY_EMULATE_SVE)
// It is an error to detect multiple architectures at the same time, but OK to
// detect none of the above.
#if (HWY_ARCH_X86 + HWY_ARCH_PPC + HWY_ARCH_ARM + HWY_ARCH_WASM + \

third_party/highway/hwy/detect_targets.h (vendored), 9 changed lines

@ -161,6 +161,11 @@
// user to override this without any guarantee of success.
#ifndef HWY_BASELINE_TARGETS
#if defined(HWY_EMULATE_SVE)
#define HWY_BASELINE_TARGETS HWY_SVE // does not support SVE2
#define HWY_BASELINE_AVX3_DL 0
#else
// Also check HWY_ARCH to ensure that simulating unknown platforms ends up with
// HWY_TARGET == HWY_SCALAR.
@ -181,7 +186,7 @@
#define HWY_BASELINE_PPC8 0
#endif
// SVE2 compiles, but is not yet tested.
// SVE compiles, but is not yet tested.
#if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE2)
#define HWY_BASELINE_SVE2 HWY_SVE2
#else
@ -302,6 +307,8 @@
HWY_BASELINE_SSE4 | HWY_BASELINE_AVX2 | HWY_BASELINE_AVX3 | \
HWY_BASELINE_AVX3_DL | HWY_BASELINE_RVV)
#endif // HWY_EMULATE_SVE
#else
// User already defined HWY_BASELINE_TARGETS, but we still need to define
// HWY_BASELINE_AVX3 (matching user's definition) for HWY_CHECK_AVX3_DL.

third_party/highway/hwy/examples/benchmark.cc (vendored), 66 changed lines

@ -25,17 +25,15 @@
#include <numeric> // iota
#include "hwy/aligned_allocator.h"
// Must come after foreach_target.h to avoid redefinition errors.
#include "hwy/highway.h"
#include "hwy/nanobenchmark.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// These templates are not found via ADL.
#if HWY_TARGET != HWY_SCALAR
using hwy::HWY_NAMESPACE::CombineShiftRightLanes;
using hwy::HWY_NAMESPACE::CombineShiftRightBytes;
#endif
class TwoArray {
@ -89,14 +87,14 @@ void RunBenchmark(const char* caption) {
}
void Intro() {
const float in[16] = {1, 2, 3, 4, 5, 6};
float out[16];
HWY_ALIGN const float in[16] = {1, 2, 3, 4, 5, 6};
HWY_ALIGN float out[16];
const ScalableTag<float> d; // largest possible vector
for (size_t i = 0; i < 16; i += Lanes(d)) {
const auto vec = LoadU(d, in + i); // no alignment requirement
auto result = Mul(vec, vec);
result = Add(result, result); // can update if not const
StoreU(result, d, out + i);
const auto vec = Load(d, in + i); // aligned!
auto result = vec * vec;
result += result; // can update if not const
Store(result, d, out + i);
}
printf("\nF(x)->2*x^2, F(%.0f) = %.1f\n", in[2], out[2]);
}
@ -111,34 +109,32 @@ class BenchmarkDot : public TwoArray {
const ScalableTag<float> d;
const size_t N = Lanes(d);
using V = decltype(Zero(d));
constexpr size_t unroll = 8;
// Compiler doesn't make independent sum* accumulators, so unroll manually.
// We cannot use an array because V might be a sizeless type. For reasonable
// code, we unroll 4x, but 8x might help (2 FMA ports * 4 cycle latency).
V sum0 = Zero(d);
V sum1 = Zero(d);
V sum2 = Zero(d);
V sum3 = Zero(d);
// Some older compilers might not be able to keep the 8 accumulators in
// registers, so manual unrolling can be helpful if you run into this issue.
// 2 FMA ports * 4 cycle latency = 8x unrolled.
V sum[unroll];
for (size_t i = 0; i < unroll; ++i) {
sum[i] = Zero(d);
}
const float* const HWY_RESTRICT pa = &a_[0];
const float* const HWY_RESTRICT pb = b_;
for (size_t i = 0; i < num_items; i += 4 * N) {
const auto a0 = Load(d, pa + i + 0 * N);
const auto b0 = Load(d, pb + i + 0 * N);
sum0 = MulAdd(a0, b0, sum0);
const auto a1 = Load(d, pa + i + 1 * N);
const auto b1 = Load(d, pb + i + 1 * N);
sum1 = MulAdd(a1, b1, sum1);
const auto a2 = Load(d, pa + i + 2 * N);
const auto b2 = Load(d, pb + i + 2 * N);
sum2 = MulAdd(a2, b2, sum2);
const auto a3 = Load(d, pa + i + 3 * N);
const auto b3 = Load(d, pb + i + 3 * N);
sum3 = MulAdd(a3, b3, sum3);
for (size_t i = 0; i < num_items; i += unroll * N) {
for (size_t j = 0; j < unroll; ++j) {
const auto a = Load(d, pa + i + j * N);
const auto b = Load(d, pb + i + j * N);
sum[j] = MulAdd(a, b, sum[j]);
}
}
// Reduction tree: sum of all accumulators by pairs into sum0.
sum0 = Add(sum0, sum1);
sum2 = Add(sum2, sum3);
sum0 = Add(sum0, sum2);
dot_ = GetLane(SumOfLanes(d, sum0));
// Reduction tree: sum of all accumulators by pairs into sum[0], then the
// lanes.
for (size_t power = 1; power < unroll; power *= 2) {
for (size_t i = 0; i < unroll; i += 2 * power) {
sum[i] += sum[i + power];
}
}
dot_ = GetLane(SumOfLanes(d, sum[0]));
return static_cast<FuncOutput>(dot_);
}
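// To see the reduction tree concretely, take a hypothetical unroll = 4:
//   power = 1: sum[0] += sum[1]; sum[2] += sum[3];
//   power = 2: sum[0] += sum[2];
// After log2(unroll) rounds, sum[0] holds the total of all accumulators and
// SumOfLanes then reduces its lanes to the scalar dot product.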
void Verify(size_t num_items) {
@ -197,9 +193,9 @@ struct BenchmarkDelta : public TwoArray {
auto prev = Load(df, &a_[0]);
for (; i < num_items; i += Lanes(df)) {
const auto a = Load(df, &a_[i]);
const auto shifted = CombineShiftRightLanes<3>(df, a, prev);
const auto shifted = CombineShiftRightLanes<3>(a, prev);
prev = a;
Store(Sub(a, shifted), df, &b_[i]);
Store(a - shifted, df, &b_[i]);
}
#endif
return static_cast<FuncOutput>(b_[num_items - 1]);

third_party/highway/hwy/examples/skeleton.cc (vendored), 22 changed lines

@ -24,14 +24,11 @@
// Generates code for each enabled target by re-including this source file.
#include "hwy/foreach_target.h"
// Must come after foreach_target.h to avoid redefinition errors.
#include "hwy/highway.h"
// Optional, can instead add HWY_ATTR to all functions.
HWY_BEFORE_NAMESPACE();
namespace skeleton {
// This namespace name is unique per target, which allows code for multiple
// targets to co-exist in the same translation unit.
namespace HWY_NAMESPACE {
// Highway ops reside here; ADL does not find templates nor builtins.
@ -50,7 +47,7 @@ template <class DF>
ATTR_MSAN void OneFloorLog2(const DF df, const uint8_t* HWY_RESTRICT values,
uint8_t* HWY_RESTRICT log2) {
// Type tags for converting to other element types (Rebind = same count).
const RebindToSigned<DF> d32;
const Rebind<int32_t, DF> d32;
const Rebind<uint8_t, DF> d8;
const auto u8 = Load(d8, values);
@ -62,7 +59,7 @@ ATTR_MSAN void OneFloorLog2(const DF df, const uint8_t* HWY_RESTRICT values,
void CodepathDemo() {
// Highway defaults to portability, but per-target codepaths may be selected
// via #if HWY_TARGET == HWY_SSE4 or by testing capability macros:
#if HWY_HAVE_INTEGER64
#if HWY_CAP_INTEGER64
const char* gather = "Has int64";
#else
const char* gather = "No int64";
@ -74,16 +71,20 @@ void FloorLog2(const uint8_t* HWY_RESTRICT values, size_t count,
uint8_t* HWY_RESTRICT log2) {
CodepathDemo();
const ScalableTag<float> df;
// Second argument is necessary on RVV until it supports fractional lengths.
const ScalableTag<float, 2> df;
const size_t N = Lanes(df);
size_t i = 0;
for (; i + N <= count; i += N) {
OneFloorLog2(df, values + i, log2 + i);
}
// TODO(janwas): implement
#if HWY_TARGET != HWY_RVV
for (; i < count; ++i) {
CappedTag<float, 1> d1;
OneFloorLog2(d1, values + i, log2 + i);
OneFloorLog2(HWY_CAPPED(float, 1)(), values + i, log2 + i);
}
#endif
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
@ -91,9 +92,6 @@ void FloorLog2(const uint8_t* HWY_RESTRICT values, size_t count,
} // namespace skeleton
HWY_AFTER_NAMESPACE();
// The table of pointers to the various implementations in HWY_NAMESPACE must
// be compiled only once (foreach_target #includes this file multiple times).
// HWY_ONCE is true for only one of these 'compilation passes'.
#if HWY_ONCE
namespace skeleton {
@ -107,8 +105,6 @@ HWY_EXPORT(FloorLog2);
// is equivalent to inlining this function.
void CallFloorLog2(const uint8_t* HWY_RESTRICT in, const size_t count,
uint8_t* HWY_RESTRICT out) {
// This must reside outside of HWY_NAMESPACE because it references (calls the
// appropriate one from) the per-target implementations there.
return HWY_DYNAMIC_DISPATCH(FloorLog2)(in, count, out);
}
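
Client code only needs this exported wrapper. A minimal call site might look like the sketch below; the buffer size, its contents, and the "hwy/examples/skeleton.h" header name are assumptions:

#include <stddef.h>
#include <stdint.h>
#include "hwy/examples/skeleton.h"  // assumed to declare CallFloorLog2

void Caller() {
  uint8_t values[64];
  uint8_t log2[64];
  for (size_t i = 0; i < 64; ++i) values[i] = static_cast<uint8_t>(i + 1);
  // Fills log2[i] with floor(log2(values[i])) via the best available target.
  skeleton::CallFloorLog2(values, 64, log2);
}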

third_party/highway/hwy/examples/skeleton_test.cc (vendored)

@ -21,13 +21,10 @@
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "examples/skeleton_test.cc"
#include "hwy/foreach_target.h"
// Must come after foreach_target.h to avoid redefinition errors.
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
// Optional: factor out parts of the implementation into *-inl.h
// (must also come after foreach_target.h to avoid redefinition errors)
#include "hwy/examples/skeleton-inl.h"
HWY_BEFORE_NAMESPACE();
@ -53,7 +50,10 @@ struct TestFloorLog2 {
CallFloorLog2(in.get(), count, out.get());
int sum = 0;
for (size_t i = 0; i < count; ++i) {
// TODO(janwas): implement
#if HWY_TARGET != HWY_RVV
HWY_ASSERT_EQ(expected[i], out[i]);
#endif
sum += out[i];
}
hwy::PreventElision(sum);

third_party/highway/hwy/foreach_target.h (vendored), 22 changed lines

@ -74,28 +74,6 @@
#endif
#endif
#if (HWY_TARGETS & HWY_SVE) && (HWY_STATIC_TARGET != HWY_SVE)
#undef HWY_TARGET
#define HWY_TARGET HWY_SVE
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
#if (HWY_TARGETS & HWY_SVE2) && (HWY_STATIC_TARGET != HWY_SVE2)
#undef HWY_TARGET
#define HWY_TARGET HWY_SVE2
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif
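// (Each such block follows the same idiom: redefine HWY_TARGET, re-include
// the translation unit, and flip HWY_TARGET_TOGGLE so that the *-inl.h
// headers re-emit their per-target contents.)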
#if (HWY_TARGETS & HWY_SSSE3) && (HWY_STATIC_TARGET != HWY_SSSE3)
#undef HWY_TARGET
#define HWY_TARGET HWY_SSSE3

third_party/highway/hwy/highway.h (vendored), 25 changed lines

@ -27,7 +27,7 @@ namespace hwy {
// API version (https://semver.org/); keep in sync with CMakeLists.txt.
#define HWY_MAJOR 0
#define HWY_MINOR 16
#define HWY_MINOR 15
#define HWY_PATCH 0
//------------------------------------------------------------------------------
@ -37,9 +37,7 @@ namespace hwy {
// HWY_FULL(T[,LMUL=1]) is a native vector/group. LMUL is the number of
// registers in the group, and is ignored on targets that do not support groups.
#define HWY_FULL1(T) hwy::HWY_NAMESPACE::ScalableTag<T>
#define HWY_FULL2(T, LMUL) \
hwy::HWY_NAMESPACE::ScalableTag<T, CeilLog2(HWY_MAX(0, LMUL))>
#define HWY_FULL1(T) hwy::HWY_NAMESPACE::Simd<T, HWY_LANES(T)>
#define HWY_3TH_ARG(arg1, arg2, arg3, ...) arg3
// Workaround for MSVC grouping __VA_ARGS__ into a single argument
#define HWY_FULL_RECOMPOSER(args_with_paren) HWY_3TH_ARG args_with_paren
@ -48,9 +46,9 @@ namespace hwy {
HWY_FULL_RECOMPOSER((__VA_ARGS__, HWY_FULL2, HWY_FULL1, ))
#define HWY_FULL(...) HWY_CHOOSE_FULL(__VA_ARGS__())(__VA_ARGS__)
// Vector of up to MAX_N lanes. It's better to use full vectors where possible.
// Vector of up to MAX_N lanes. Discouraged, when possible, use Half<> instead.
#define HWY_CAPPED(T, MAX_N) \
hwy::HWY_NAMESPACE::CappedTag<T, HWY_MIN(MAX_N, HWY_LANES(T))>
hwy::HWY_NAMESPACE::Simd<T, HWY_MIN(MAX_N, HWY_LANES(T))>
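// Usage sketch (float is just an illustration): `const HWY_CAPPED(float, 4) d;`
// declares a tag for at most four lanes; the remainder loop in skeleton.cc
// earlier in this commit instantiates HWY_CAPPED(float, 1)() the same way.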
//------------------------------------------------------------------------------
// Export user functions for static/dynamic dispatch
@ -111,7 +109,6 @@ struct FunctionCache {
template <FunctionType* const table[]>
static RetType ChooseAndCall(Args... args) {
// If we are running here it means we need to update the chosen target.
ChosenTarget& chosen_target = GetChosenTarget();
chosen_target.Update();
return (table[chosen_target.GetIndex()])(args...);
}
@ -266,15 +263,10 @@ FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
HWY_CHOOSE_SCALAR(FUNC_NAME), \
}
#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) \
(*(HWY_DISPATCH_TABLE(FUNC_NAME)[hwy::GetChosenTarget().GetIndex()]))
(*(HWY_DISPATCH_TABLE(FUNC_NAME)[hwy::chosen_target.GetIndex()]))
#endif // HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0)
// DEPRECATED names; please use HWY_HAVE_* instead.
#define HWY_CAP_INTEGER64 HWY_HAVE_INTEGER64
#define HWY_CAP_FLOAT16 HWY_HAVE_FLOAT16
#define HWY_CAP_FLOAT64 HWY_HAVE_FLOAT64
} // namespace hwy
#endif // HWY_HIGHWAY_INCLUDED
@ -291,6 +283,13 @@ FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
#define HWY_HIGHWAY_PER_TARGET
#endif
#undef HWY_FULL2
#if HWY_TARGET == HWY_RVV
#define HWY_FULL2(T, LMUL) hwy::HWY_NAMESPACE::Simd<T, HWY_LANES(T) * (LMUL)>
#else
#define HWY_FULL2(T, LMUL) hwy::HWY_NAMESPACE::Simd<T, HWY_LANES(T)>
#endif
// These define ops inside namespace hwy::HWY_NAMESPACE.
#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
#include "hwy/ops/x86_128-inl.h"

third_party/highway/hwy/highway_export.h (vendored), 106 changed lines

@ -1,106 +0,0 @@
// Pseudo-generated file to handle both cmake & bazel build systems.
// Initial generation done using cmake code:
// include(GenerateExportHeader)
// generate_export_header(hwy EXPORT_MACRO_NAME HWY_DLLEXPORT EXPORT_FILE_NAME
// hwy/highway_export.h)
// code reformatted using clang-format --style=Google
#ifndef HWY_DLLEXPORT_H
#define HWY_DLLEXPORT_H
// Bazel builds are always static:
#if !defined(HWY_SHARED_DEFINE) && !defined(HWY_STATIC_DEFINE)
#define HWY_STATIC_DEFINE
#endif
#ifdef HWY_STATIC_DEFINE
#define HWY_DLLEXPORT
#define HWY_NO_EXPORT
#define HWY_CONTRIB_DLLEXPORT
#define HWY_CONTRIB_NO_EXPORT
#define HWY_TEST_DLLEXPORT
#define HWY_TEST_NO_EXPORT
#else
#ifndef HWY_DLLEXPORT
#if defined(hwy_EXPORTS)
/* We are building this library */
#ifdef _WIN32
#define HWY_DLLEXPORT __declspec(dllexport)
#else
#define HWY_DLLEXPORT __attribute__((visibility("default")))
#endif
#else
/* We are using this library */
#ifdef _WIN32
#define HWY_DLLEXPORT __declspec(dllimport)
#else
#define HWY_DLLEXPORT __attribute__((visibility("default")))
#endif
#endif
#endif
#ifndef HWY_NO_EXPORT
#ifdef _WIN32
#define HWY_NO_EXPORT
#else
#define HWY_NO_EXPORT __attribute__((visibility("hidden")))
#endif
#endif
#ifndef HWY_CONTRIB_DLLEXPORT
#if defined(hwy_contrib_EXPORTS)
/* We are building this library */
#ifdef _WIN32
#define HWY_CONTRIB_DLLEXPORT __declspec(dllexport)
#else
#define HWY_CONTRIB_DLLEXPORT __attribute__((visibility("default")))
#endif
#else
/* We are using this library */
#ifdef _WIN32
#define HWY_CONTRIB_DLLEXPORT __declspec(dllimport)
#else
#define HWY_CONTRIB_DLLEXPORT __attribute__((visibility("default")))
#endif
#endif
#endif
#ifndef HWY_CONTRIB_NO_EXPORT
#ifdef _WIN32
#define HWY_CONTRIB_NO_EXPORT
#else
#define HWY_CONTRIB_NO_EXPORT __attribute__((visibility("hidden")))
#endif
#endif
#ifndef HWY_TEST_DLLEXPORT
#if defined(hwy_test_EXPORTS)
/* We are building this library */
#ifdef _WIN32
#define HWY_TEST_DLLEXPORT __declspec(dllexport)
#else
#define HWY_TEST_DLLEXPORT __attribute__((visibility("default")))
#endif
#else
/* We are using this library */
#ifdef _WIN32
#define HWY_TEST_DLLEXPORT __declspec(dllimport)
#else
#define HWY_TEST_DLLEXPORT __attribute__((visibility("default")))
#endif
#endif
#endif
#ifndef HWY_TEST_NO_EXPORT
#ifdef _WIN32
#define HWY_TEST_NO_EXPORT
#else
#define HWY_TEST_NO_EXPORT __attribute__((visibility("hidden")))
#endif
#endif
#endif
#endif /* HWY_DLLEXPORT_H */

third_party/highway/hwy/highway_test.cc (vendored), 50 changed lines

@ -15,8 +15,6 @@
#include <stddef.h>
#include <stdint.h>
#include <bitset>
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "highway_test.cc"
#include "hwy/foreach_target.h"
@ -28,53 +26,6 @@ HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// For testing that ForPartialVectors reaches every possible size:
using NumLanesSet = std::bitset<HWY_MAX_BYTES + 1>;
// Monostate pattern because ForPartialVectors takes a template argument, not a
// functor by reference.
static NumLanesSet* NumLanesForSize(size_t sizeof_t) {
HWY_ASSERT(sizeof_t <= sizeof(uint64_t));
static NumLanesSet num_lanes[sizeof(uint64_t) + 1];
return num_lanes + sizeof_t;
}
static size_t* MaxLanesForSize(size_t sizeof_t) {
HWY_ASSERT(sizeof_t <= sizeof(uint64_t));
static size_t num_lanes[sizeof(uint64_t) + 1] = {0};
return num_lanes + sizeof_t;
}
struct TestMaxLanes {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const size_t N = Lanes(d);
const size_t kMax = MaxLanes(d);
HWY_ASSERT(N <= kMax);
HWY_ASSERT(kMax <= (HWY_MAX_BYTES / sizeof(T)));
NumLanesForSize(sizeof(T))->set(N);
*MaxLanesForSize(sizeof(T)) = HWY_MAX(*MaxLanesForSize(sizeof(T)), N);
}
};
HWY_NOINLINE void TestAllMaxLanes() {
ForAllTypes(ForPartialVectors<TestMaxLanes>());
// Ensure ForPartialVectors visited all powers of two [1, N].
for (size_t sizeof_t : {sizeof(uint8_t), sizeof(uint16_t), sizeof(uint32_t),
sizeof(uint64_t)}) {
const size_t N = *MaxLanesForSize(sizeof_t);
for (size_t i = 1; i <= N; i += i) {
if (!NumLanesForSize(sizeof_t)->test(i)) {
fprintf(stderr, "T=%d: did not visit for N=%d, max=%d\n",
static_cast<int>(sizeof_t), static_cast<int>(i),
static_cast<int>(N));
HWY_ASSERT(false);
}
}
}
}
struct TestSet {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
@ -371,7 +322,6 @@ HWY_AFTER_NAMESPACE();
namespace hwy {
HWY_BEFORE_TEST(HighwayTest);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllMaxLanes);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllSet);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllOverflow);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllClamp);

third_party/highway/hwy/hwy.version (vendored), 19 changed lines

@ -1,19 +0,0 @@
HWY_0 {
global:
extern "C++" {
*hwy::*;
};
local:
# Hide all the std namespace symbols. std namespace is explicitly marked
# as visibility(default) and header-only functions or methods (such as those
# from templates) should be exposed in shared libraries as weak symbols but
# this is only needed when we expose those types in the shared library API
# in any way. We don't use C++ std types in the API and we also don't
# support exceptions in the library.
# See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=36022 for a discussion
# about this.
extern "C++" {
*std::*;
};
};

third_party/highway/hwy/nanobenchmark.cc (vendored), 19 changed lines

@ -37,7 +37,7 @@
#include <windows.h>
#endif
#if defined(__APPLE__)
#if defined(__MACH__)
#include <mach/mach.h>
#include <mach/mach_time.h>
#endif
@ -148,7 +148,7 @@ inline Ticks Start() {
LARGE_INTEGER counter;
(void)QueryPerformanceCounter(&counter);
t = counter.QuadPart;
#elif defined(__APPLE__)
#elif defined(__MACH__)
t = mach_absolute_time();
#elif defined(__HAIKU__)
t = system_time_nsecs(); // since boot
@ -405,7 +405,7 @@ double NominalClockRate() {
} // namespace
HWY_DLLEXPORT double InvariantTicksPerSecond() {
double InvariantTicksPerSecond() {
#if HWY_ARCH_PPC && defined(__GLIBC__)
return double(__ppc_get_timebase_freq());
#elif HWY_ARCH_X86
@ -415,7 +415,7 @@ HWY_DLLEXPORT double InvariantTicksPerSecond() {
LARGE_INTEGER freq;
(void)QueryPerformanceFrequency(&freq);
return double(freq.QuadPart);
#elif defined(__APPLE__)
#elif defined(__MACH__)
// https://developer.apple.com/library/mac/qa/qa1398/_index.html
mach_timebase_info_data_t timebase;
(void)mach_timebase_info(&timebase);
@ -426,12 +426,12 @@ HWY_DLLEXPORT double InvariantTicksPerSecond() {
#endif
}
HWY_DLLEXPORT double Now() {
double Now() {
static const double mul = 1.0 / InvariantTicksPerSecond();
return static_cast<double>(timer::Start()) * mul;
}
HWY_DLLEXPORT uint64_t TimerResolution() {
uint64_t TimerResolution() {
// Nested loop avoids exceeding stack/L1 capacity.
timer::Ticks repetitions[Params::kTimerSamples];
for (size_t rep = 0; rep < Params::kTimerSamples; ++rep) {
@ -656,11 +656,10 @@ timer::Ticks Overhead(const uint8_t* arg, const InputVec* inputs,
} // namespace
HWY_DLLEXPORT int Unpredictable1() { return timer::Start() != ~0ULL; }
int Unpredictable1() { return timer::Start() != ~0ULL; }
HWY_DLLEXPORT size_t Measure(const Func func, const uint8_t* arg,
const FuncInput* inputs, const size_t num_inputs,
Result* results, const Params& p) {
size_t Measure(const Func func, const uint8_t* arg, const FuncInput* inputs,
const size_t num_inputs, Result* results, const Params& p) {
NANOBENCHMARK_CHECK(num_inputs != 0);
#if HWY_ARCH_X86

16
third_party/highway/hwy/nanobenchmark.h vendored
View file

@ -47,8 +47,6 @@
#include <stddef.h>
#include <stdint.h>
#include "hwy/highway_export.h"
// Enables sanity checks that verify correct operation at the cost of
// longer benchmark runs.
#ifndef NANOBENCHMARK_ENABLE_CHECKS
@ -74,23 +72,23 @@ namespace platform {
// Returns tick rate, useful for converting measurements to seconds. Invariant
// means the tick counter frequency is independent of CPU throttling or sleep.
// This call may be expensive; callers should cache the result.
HWY_DLLEXPORT double InvariantTicksPerSecond();
double InvariantTicksPerSecond();
// Returns current timestamp [in seconds] relative to an unspecified origin.
// Features: monotonic (no negative elapsed time), steady (unaffected by system
// time changes), high-resolution (on the order of microseconds).
HWY_DLLEXPORT double Now();
double Now();
// Returns ticks elapsed in back to back timer calls, i.e. a function of the
// timer resolution (minimum measurable difference) and overhead.
// This call is expensive, callers should cache the result.
HWY_DLLEXPORT uint64_t TimerResolution();
uint64_t TimerResolution();
} // namespace platform
// Returns 1, but without the compiler knowing what the value is. This prevents
// optimizing out code.
HWY_DLLEXPORT int Unpredictable1();
int Unpredictable1();
// Input influencing the function being measured (e.g. number of bytes to copy).
using FuncInput = size_t;
@ -166,9 +164,9 @@ struct Result {
// uniform distribution over [0, 4) could be represented as {3,0,2,1}.
// Returns how many Result were written to "results": one per unique input, or
// zero if the measurement failed (an error message goes to stderr).
HWY_DLLEXPORT size_t Measure(const Func func, const uint8_t* arg,
const FuncInput* inputs, const size_t num_inputs,
Result* results, const Params& p = Params());
size_t Measure(const Func func, const uint8_t* arg, const FuncInput* inputs,
const size_t num_inputs, Result* results,
const Params& p = Params());
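A hedged usage sketch of Measure (an aside, not part of the diff; it assumes the usual Func signature FuncOutput(const void*, FuncInput) and the Result fields input and ticks from this header):

#include <cstdio>
#include "hwy/nanobenchmark.h"

// Toy benchmark target; the return value must depend on the input so the
// harness can defeat constant propagation.
hwy::FuncOutput SquareIt(const void* /*arg*/, hwy::FuncInput in) {
  return static_cast<hwy::FuncOutput>(in * in);
}

int main() {
  const hwy::FuncInput inputs[3] = {10, 100, 1000};
  hwy::Result results[3];
  const size_t num_results =
      hwy::Measure(&SquareIt, /*arg=*/nullptr, inputs, 3, results);
  for (size_t i = 0; i < num_results; ++i) {
    std::printf("input %5d: %6.2f ticks\n", static_cast<int>(results[i].input),
                static_cast<double>(results[i].ticks));
  }
  return 0;
}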
// Calls operator() of the given closure (lambda function).
template <class Closure>

1630
third_party/highway/hwy/ops/arm_neon-inl.h vendored

Diff not shown due to its large size.

1024
third_party/highway/hwy/ops/arm_sve-inl.h vendored

Diff not shown due to its large size.

65
third_party/highway/hwy/ops/generic_ops-inl.h vendored
View file

@ -19,14 +19,14 @@ HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// The lane type of a vector type, e.g. float for Vec<ScalableTag<float>>.
// The lane type of a vector type, e.g. float for Vec<Simd<float, 4>>.
template <class V>
using LaneType = decltype(GetLane(V()));
// Vector type, e.g. Vec128<float> for CappedTag<float, 4>. Useful as the return
// type of functions that do not take a vector argument, or as an argument type
// if the function only has a template argument for D, or for explicit type
// names instead of auto. This may be a built-in type.
// Vector type, e.g. Vec128<float> for Simd<float, 4>. Useful as the return type
// of functions that do not take a vector argument, or as an argument type if
// the function only has a template argument for D, or for explicit type names
// instead of auto. This may be a built-in type.
template <class D>
using Vec = decltype(Zero(D()));
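A minimal usage sketch of these aliases (hypothetical helper; assumes the usual Highway ops in scope):

// Adds 1 to every lane; Vec<D> names the vector type, LaneType recovers T.
template <class D>
Vec<D> PlusOne(D d, Vec<D> v) {
  using T = LaneType<Vec<D>>;  // e.g. float for ScalableTag<float>
  return Add(v, Set(d, T(1)));
}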
@ -53,6 +53,12 @@ HWY_API V CombineShiftRightLanes(D d, const V hi, const V lo) {
return CombineShiftRightBytes<kBytes>(d, hi, lo);
}
// DEPRECATED
template <size_t kLanes, class V>
HWY_API V CombineShiftRightLanes(const V hi, const V lo) {
return CombineShiftRightLanes<kLanes>(DFromV<V>(), hi, lo);
}
#endif
// Returns lanes with the most significant bit set and all other bits zero.
@ -202,15 +208,6 @@ HWY_API V AESRound(V state, const V round_key) {
return state;
}
template <class V> // u8
HWY_API V AESLastRound(V state, const V round_key) {
// Like AESRound, but without MixColumns.
state = detail::SubBytes(state);
state = detail::ShiftRows(state);
state = Xor(state, round_key); // AddRoundKey
return state;
}
// Constant-time implementation inspired by
// https://www.bearssl.org/constanttime.html, but about half the cost because we
// use 64x64 multiplies and 128-bit XORs.
@ -281,47 +278,23 @@ HWY_API V CLMulUpper(V a, V b) {
#define HWY_NATIVE_POPCNT
#endif
#if HWY_TARGET == HWY_RVV
#define HWY_MIN_POW2_FOR_128 1
#else
// All other targets except HWY_SCALAR (which is excluded by HWY_IF_GE128_D)
// guarantee 128 bits anyway.
#define HWY_MIN_POW2_FOR_128 0
#endif
// This algorithm requires vectors to be at least 16 bytes, which is the case
// for LMUL >= 2. If not, use the fallback below.
template <typename V, HWY_IF_LANES_ARE(uint8_t, V), HWY_IF_GE128_D(DFromV<V>),
HWY_IF_POW2_GE(DFromV<V>, HWY_MIN_POW2_FOR_128)>
template <typename V, HWY_IF_LANES_ARE(uint8_t, V)>
HWY_API V PopulationCount(V v) {
const DFromV<V> d;
constexpr DFromV<V> d;
HWY_ALIGN constexpr uint8_t kLookup[16] = {
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
};
const auto lo = And(v, Set(d, 0xF));
const auto hi = ShiftRight<4>(v);
const auto lookup = LoadDup128(d, kLookup);
auto lo = And(v, Set(d, 0xF));
auto hi = ShiftRight<4>(v);
auto lookup = LoadDup128(Simd<uint8_t, HWY_MAX(16, MaxLanes(d))>(), kLookup);
return Add(TableLookupBytes(lookup, hi), TableLookupBytes(lookup, lo));
}
// RVV has a specialization that avoids the Set().
#if HWY_TARGET != HWY_RVV
// Slower fallback for capped vectors.
template <typename V, HWY_IF_LANES_ARE(uint8_t, V), HWY_IF_LT128_D(DFromV<V>)>
HWY_API V PopulationCount(V v) {
const DFromV<V> d;
// See https://arxiv.org/pdf/1611.07612.pdf, Figure 3
v = Sub(v, And(ShiftRight<1>(v), Set(d, 0x55)));
v = Add(And(ShiftRight<2>(v), Set(d, 0x33)), And(v, Set(d, 0x33)));
return And(Add(v, ShiftRight<4>(v)), Set(d, 0x0F));
}
#endif // HWY_TARGET != HWY_RVV
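The fallback above is the classic SWAR bit trick; a standalone scalar walk-through for one byte (an illustrative aside, not part of the library):

#include <cstdint>
#include <cstdio>

// Scalar walk-through of the SWAR fallback above, for a single uint8_t lane.
static uint8_t PopCount8(uint8_t v) {
  v = static_cast<uint8_t>(v - ((v >> 1) & 0x55));           // 2-bit partial sums
  v = static_cast<uint8_t>(((v >> 2) & 0x33) + (v & 0x33));  // 4-bit partial sums
  return static_cast<uint8_t>((v + (v >> 4)) & 0x0F);        // final 8-bit sum
}

int main() {
  std::printf("%d\n", PopCount8(0xB7));  // 0xB7 = 0b10110111 -> prints 6
  return 0;
}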
template <typename V, HWY_IF_LANES_ARE(uint16_t, V)>
HWY_API V PopulationCount(V v) {
const DFromV<V> d;
const Repartition<uint8_t, decltype(d)> d8;
const auto vals = BitCast(d, PopulationCount(BitCast(d8, v)));
Repartition<uint8_t, decltype(d)> d8;
auto vals = BitCast(d, PopulationCount(BitCast(d8, v)));
return Add(ShiftRight<8>(vals), And(vals, Set(d, 0xFF)));
}
@ -333,7 +306,7 @@ HWY_API V PopulationCount(V v) {
return Add(ShiftRight<16>(vals), And(vals, Set(d, 0xFF)));
}
#if HWY_HAVE_INTEGER64
#if HWY_CAP_INTEGER64
template <typename V, HWY_IF_LANES_ARE(uint64_t, V)>
HWY_API V PopulationCount(V v) {
const DFromV<V> d;

1765
third_party/highway/hwy/ops/rvv-inl.h vendored

Diff not shown due to its large size.

120
third_party/highway/hwy/ops/scalar-inl.h vendored
View file

@ -27,7 +27,7 @@ namespace HWY_NAMESPACE {
// Single instruction, single data.
template <typename T>
using Sisd = Simd<T, 1, 0>;
using Sisd = Simd<T, 1>;
// (Wrapper class required for overloading comparison operators.)
template <typename T>
@ -187,20 +187,6 @@ HWY_API Vec1<T> operator^(const Vec1<T> a, const Vec1<T> b) {
return Xor(a, b);
}
// ------------------------------ OrAnd
template <typename T>
HWY_API Vec1<T> OrAnd(const Vec1<T> o, const Vec1<T> a1, const Vec1<T> a2) {
return Or(o, And(a1, a2));
}
// ------------------------------ IfVecThenElse
template <typename T>
HWY_API Vec1<T> IfVecThenElse(Vec1<T> mask, Vec1<T> yes, Vec1<T> no) {
return IfThenElse(MaskFromVec(mask), yes, no);
}
// ------------------------------ CopySign
template <typename T>
@ -289,11 +275,6 @@ HWY_API Vec1<T> IfThenZeroElse(const Mask1<T> mask, const Vec1<T> no) {
return mask.bits ? Vec1<T>(0) : no;
}
template <typename T>
HWY_API Vec1<T> IfNegativeThenElse(Vec1<T> v, Vec1<T> yes, Vec1<T> no) {
return v.raw < 0 ? yes : no;
}
template <typename T>
HWY_API Vec1<T> ZeroIfNegative(const Vec1<T> v) {
return v.raw < 0 ? Vec1<T>(0) : v;
@ -442,13 +423,7 @@ HWY_API Vec1<double> operator-(const Vec1<double> a, const Vec1<double> b) {
return Vec1<double>(a.raw - b.raw);
}
// ------------------------------ SumsOf8
HWY_API Vec1<uint64_t> SumsOf8(const Vec1<uint8_t> v) {
return Vec1<uint64_t>(v.raw);
}
// ------------------------------ SaturatedAdd
// ------------------------------ Saturating addition
// Returns a + b clamped to the destination range.
@ -956,30 +931,21 @@ HWY_API Vec1<ToT> PromoteTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
return Vec1<ToT>(static_cast<ToT>(from.raw));
}
// MSVC 19.10 cannot deduce the argument type if HWY_IF_FLOAT(FromT) is here,
// so we overload for FromT=double and ToT={float,int32_t}.
HWY_API Vec1<float> DemoteTo(Sisd<float> /* tag */, Vec1<double> from) {
template <typename FromT, typename ToT, HWY_IF_FLOAT(FromT)>
HWY_API Vec1<ToT> DemoteTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
static_assert(sizeof(ToT) < sizeof(FromT), "Not demoting");
// Prevent ubsan errors when converting float to narrower integer/float
if (std::isinf(from.raw) ||
std::fabs(from.raw) > static_cast<double>(HighestValue<float>())) {
return Vec1<float>(std::signbit(from.raw) ? LowestValue<float>()
: HighestValue<float>());
std::fabs(from.raw) > static_cast<FromT>(HighestValue<ToT>())) {
return Vec1<ToT>(std::signbit(from.raw) ? LowestValue<ToT>()
: HighestValue<ToT>());
}
return Vec1<float>(static_cast<float>(from.raw));
}
HWY_API Vec1<int32_t> DemoteTo(Sisd<int32_t> /* tag */, Vec1<double> from) {
// Prevent ubsan errors when converting double to a narrower integer (int32_t)
if (std::isinf(from.raw) ||
std::fabs(from.raw) > static_cast<double>(HighestValue<int32_t>())) {
return Vec1<int32_t>(std::signbit(from.raw) ? LowestValue<int32_t>()
: HighestValue<int32_t>());
}
return Vec1<int32_t>(static_cast<int32_t>(from.raw));
return Vec1<ToT>(static_cast<ToT>(from.raw));
}
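For intuition, a worked example of the clamping above (values follow from HighestValue<float>, roughly 3.4e38): DemoteTo(Sisd<float>(), Vec1<double>(1e300)) is out of range, so it returns Vec1<float>(HighestValue<float>()); the same input negated returns LowestValue<float>(); in-range values are simply cast.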
template <typename FromT, typename ToT>
template <typename FromT, typename ToT, HWY_IF_NOT_FLOAT(FromT)>
HWY_API Vec1<ToT> DemoteTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
static_assert(!IsFloat<FromT>(), "FromT=double is handled above");
static_assert(sizeof(ToT) < sizeof(FromT), "Not demoting");
// Int to int: choose closest value in ToT to `from` (avoids UB)
@ -1117,12 +1083,6 @@ HWY_API T GetLane(const Vec1<T> v) {
return v.raw;
}
template <typename T>
HWY_API Vec1<T> DupEven(Vec1<T> v) {
return v;
}
// DupOdd is unsupported.
template <typename T>
HWY_API Vec1<T> OddEven(Vec1<T> /* odd */, Vec1<T> even) {
return even;
@ -1165,14 +1125,6 @@ HWY_API Vec1<T> TableLookupLanes(const Vec1<T> v, const Indices1<T> /* idx */) {
return v;
}
// ------------------------------ ReverseBlocks
// Single block: no change
template <typename T>
HWY_API Vec1<T> ReverseBlocks(Sisd<T> /* tag */, const Vec1<T> v) {
return v;
}
// ------------------------------ Reverse
template <typename T>
@ -1180,21 +1132,6 @@ HWY_API Vec1<T> Reverse(Sisd<T> /* tag */, const Vec1<T> v) {
return v;
}
template <typename T>
HWY_API Vec1<T> Reverse2(Sisd<T> /* tag */, const Vec1<T> v) {
return v;
}
template <typename T>
HWY_API Vec1<T> Reverse4(Sisd<T> /* tag */, const Vec1<T> v) {
return v;
}
template <typename T>
HWY_API Vec1<T> Reverse8(Sisd<T> /* tag */, const Vec1<T> v) {
return v;
}
// ================================================== BLOCKWISE
// Shift*Bytes, CombineShiftRightBytes, Interleave*, Shuffle* are unsupported.
@ -1371,6 +1308,41 @@ HWY_API Vec1<T> MaxOfLanes(Sisd<T> /* tag */, const Vec1<T> v) {
return v;
}
// ================================================== DEPRECATED
template <typename T>
HWY_API size_t StoreMaskBits(const Mask1<T> mask, uint8_t* bits) {
return StoreMaskBits(Sisd<T>(), mask, bits);
}
template <typename T>
HWY_API bool AllTrue(const Mask1<T> mask) {
return AllTrue(Sisd<T>(), mask);
}
template <typename T>
HWY_API bool AllFalse(const Mask1<T> mask) {
return AllFalse(Sisd<T>(), mask);
}
template <typename T>
HWY_API size_t CountTrue(const Mask1<T> mask) {
return CountTrue(Sisd<T>(), mask);
}
template <typename T>
HWY_API Vec1<T> SumOfLanes(const Vec1<T> v) {
return SumOfLanes(Sisd<T>(), v);
}
template <typename T>
HWY_API Vec1<T> MinOfLanes(const Vec1<T> v) {
return MinOfLanes(Sisd<T>(), v);
}
template <typename T>
HWY_API Vec1<T> MaxOfLanes(const Vec1<T> v) {
return MaxOfLanes(Sisd<T>(), v);
}
// ================================================== Operator wrapper
template <class V>

110
third_party/highway/hwy/ops/set_macros-inl.h vendored
View file

@ -32,10 +32,9 @@
#undef HWY_MAX_BYTES
#undef HWY_LANES
#undef HWY_HAVE_SCALABLE
#undef HWY_HAVE_INTEGER64
#undef HWY_HAVE_FLOAT16
#undef HWY_HAVE_FLOAT64
#undef HWY_CAP_INTEGER64
#undef HWY_CAP_FLOAT16
#undef HWY_CAP_FLOAT64
#undef HWY_CAP_GE256
#undef HWY_CAP_GE512
@ -80,10 +79,9 @@
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))
#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 1
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT16 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_AES 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
@ -98,10 +96,9 @@
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))
#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 1
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT16 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
@ -116,10 +113,9 @@
#define HWY_MAX_BYTES 32
#define HWY_LANES(T) (32 / sizeof(T))
#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 1
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT16 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 1
#define HWY_CAP_GE512 0
@ -133,10 +129,9 @@
#define HWY_MAX_BYTES 64
#define HWY_LANES(T) (64 / sizeof(T))
#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 1
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT16 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 1
#define HWY_CAP_GE512 1
@ -164,10 +159,9 @@
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))
#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 0
#define HWY_HAVE_FLOAT64 1
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT16 0
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
@ -183,16 +177,15 @@
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))
#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT16 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
#if HWY_ARCH_ARM_A64
#define HWY_HAVE_FLOAT64 1
#define HWY_CAP_FLOAT64 1
#else
#define HWY_HAVE_FLOAT64 0
#define HWY_CAP_FLOAT64 0
#endif
#define HWY_NAMESPACE N_NEON
@ -203,19 +196,23 @@
// SVE[2]
#elif HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE
#if defined(HWY_EMULATE_SVE) && !defined(__F16C__)
#error "Disable HWY_CAP_FLOAT16 or ensure farm_sve actually converts to f16"
#endif
// SVE only requires lane alignment, not natural alignment of the entire vector.
#define HWY_ALIGN alignas(8)
#define HWY_MAX_BYTES 256
// Value ensures MaxLanes() is the tightest possible upper bound to reduce
// overallocation.
#define HWY_LANES(T) ((HWY_MAX_BYTES) / sizeof(T))
// <= HWY_MAX_BYTES / sizeof(T): exact size. Otherwise a fraction 1/div (div =
// 1,2,4,8) is encoded as HWY_LANES(T) / div. This value leaves enough room for
// div=8 and demoting to 1/8 the lane width while still exceeding HWY_MAX_BYTES.
#define HWY_LANES(T) (32768 / sizeof(T))
#define HWY_HAVE_SCALABLE 1
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 1
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT16 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
@ -235,10 +232,9 @@
#define HWY_MAX_BYTES 16
#define HWY_LANES(T) (16 / sizeof(T))
#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 0
#define HWY_CAP_INTEGER64 0
#define HWY_CAP_FLOAT16 1
#define HWY_CAP_FLOAT64 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
@ -254,10 +250,9 @@
#define HWY_MAX_BYTES 32
#define HWY_LANES(T) (32 / sizeof(T))
#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 0
#define HWY_CAP_INTEGER64 0
#define HWY_CAP_FLOAT16 1
#define HWY_CAP_FLOAT64 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
@ -276,20 +271,20 @@
// The spec requires VLEN <= 2^16 bits, so the limit is 2^16 bytes (LMUL=8).
#define HWY_MAX_BYTES 65536
// = HWY_MAX_BYTES divided by max LMUL=8 because MaxLanes includes the actual
// LMUL. This is the tightest possible upper bound.
#define HWY_LANES(T) (8192 / sizeof(T))
// <= HWY_MAX_BYTES / sizeof(T): exact size. Otherwise a fraction 1/div (div =
// 1,2,4,8) is encoded as HWY_LANES(T) / div. This value leaves enough room for
// div=8 and demoting to 1/8 the lane width while still exceeding HWY_MAX_BYTES.
#define HWY_LANES(T) (8388608 / sizeof(T))
#define HWY_HAVE_SCALABLE 1
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT64 1
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
#if defined(__riscv_zfh)
#define HWY_HAVE_FLOAT16 1
#define HWY_CAP_FLOAT16 1
#else
#define HWY_HAVE_FLOAT16 0
#define HWY_CAP_FLOAT16 0
#endif
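A quick arithmetic check of this fraction encoding (constants from the lines above; a compile-time aside, not part of the header):

// For T = uint8_t, div = 8: HWY_LANES(T) / div = 8388608 / 8 = 1048576 lanes,
// still more than the HWY_MAX_BYTES = 65536 actual maximum, so fractional
// N values remain distinguishable from exact sizes.
static_assert(8388608 / 8 > 65536, "1/8 fraction still exceeds HWY_MAX_BYTES");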
#define HWY_NAMESPACE N_RVV
@ -305,10 +300,9 @@
#define HWY_MAX_BYTES 8
#define HWY_LANES(T) 1
#define HWY_HAVE_SCALABLE 0
#define HWY_HAVE_INTEGER64 1
#define HWY_HAVE_FLOAT16 1
#define HWY_HAVE_FLOAT64 1
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT16 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
@ -350,3 +344,7 @@
#else
#define HWY_ATTR
#endif
// DEPRECATED
#undef HWY_GATHER_LANES
#define HWY_GATHER_LANES(T) HWY_LANES(T)

194
third_party/highway/hwy/ops/shared-inl.h vendored
View file

@ -26,117 +26,65 @@ HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Highway operations are implemented as overloaded functions selected using an
// internal-only tag type D := Simd<T, N, kPow2>. T is the lane type. kPow2 is a
// shift count applied to scalable vectors. Instead of referring to Simd<>
// directly, users create D via aliases ScalableTag<T[, kPow2]>() (defaults to a
// full vector, or fractions/groups if the argument is negative/positive),
// CappedTag<T, kLimit> or FixedTag<T, kNumLanes>. The actual number of lanes is
// Lanes(D()), a power of two. For scalable vectors, N is either HWY_LANES or a
// cap. For constexpr-size vectors, N is the actual number of lanes. This
// ensures Half<Full512<T>> is the same type as Full256<T>, as required by x86.
template <typename Lane, size_t N, int kPow2>
// SIMD operations are implemented as overloaded functions selected using a tag
// type D := Simd<T, N>. T is the lane type, N an opaque integer for internal
// use only. Users create D via aliases ScalableTag<T>() (a full vector),
// CappedTag<T, kLimit> or FixedTag<T, kNumLanes>. The actual number of lanes
// (always a power of two) is Lanes(D()).
template <typename Lane, size_t N>
struct Simd {
constexpr Simd() = default;
using T = Lane;
static_assert((N & (N - 1)) == 0 && N != 0, "N must be a power of two");
// Only for use by MaxLanes, required by MSVC. Cannot be enum because GCC
// warns when using enums and non-enums in the same expression. Cannot be
// static constexpr function (another MSVC limitation).
static constexpr size_t kPrivateN = N;
static constexpr int kPrivatePow2 = kPow2;
template <typename NewT>
static constexpr size_t NewN() {
// Round up to correctly handle scalars with N=1.
return (N * sizeof(T) + sizeof(NewT) - 1) / sizeof(NewT);
}
#if HWY_HAVE_SCALABLE
template <typename NewT>
static constexpr int Pow2Ratio() {
return (sizeof(NewT) > sizeof(T))
? static_cast<int>(CeilLog2(sizeof(NewT) / sizeof(T)))
: -static_cast<int>(CeilLog2(sizeof(T) / sizeof(NewT)));
}
#endif
// Widening/narrowing ops change the number of lanes and/or their type.
// To initialize such vectors, we need the corresponding tag types:
// PromoteTo/DemoteTo() with another lane type, but same number of lanes.
#if HWY_HAVE_SCALABLE
template <typename NewT>
using Rebind = Simd<NewT, N, kPow2 + Pow2Ratio<NewT>()>;
#else
template <typename NewT>
using Rebind = Simd<NewT, N, kPow2>;
#endif
// PromoteTo/DemoteTo() with another lane type, but same number of lanes.
template <typename NewLane>
using Rebind = Simd<NewLane, N>;
// Change lane type while keeping the same vector size, e.g. for MulEven.
template <typename NewT>
using Repartition = Simd<NewT, NewN<NewT>(), kPow2>;
// MulEven() with another lane type, but same total size.
// Round up to correctly handle scalars with N=1.
template <typename NewLane>
using Repartition =
Simd<NewLane, (N * sizeof(Lane) + sizeof(NewLane) - 1) / sizeof(NewLane)>;
// Half the lanes while keeping the same lane type, e.g. for LowerHalf.
// Round up to correctly handle scalars with N=1.
#if HWY_HAVE_SCALABLE
// Reducing the cap (N) is required for SVE - if N is the limiter for f32xN,
// then we expect Half<Rebind<u16>> to have N/2 lanes (rounded up).
using Half = Simd<T, (N + 1) / 2, kPow2 - 1>;
#else
using Half = Simd<T, (N + 1) / 2, kPow2>;
#endif
// LowerHalf() with the same lane type, but half the lanes.
// Round up to correctly handle scalars with N=1.
using Half = Simd<T, (N + 1) / 2>;
// Twice the lanes while keeping the same lane type, e.g. for Combine.
#if HWY_HAVE_SCALABLE
using Twice = Simd<T, 2 * N, kPow2 + 1>;
#else
using Twice = Simd<T, 2 * N, kPow2>;
#endif
// Combine() with the same lane type, but twice the lanes.
using Twice = Simd<T, 2 * N>;
};
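To ground the tag mechanics, a minimal vector-length-agnostic loop (a sketch; AddArrays is hypothetical, and the usual Highway ops and the aliases defined below are assumed in scope):

// Processes full vectors of float; Lanes(d) is the runtime lane count.
void AddArrays(const float* HWY_RESTRICT a, const float* HWY_RESTRICT b,
               float* HWY_RESTRICT out, size_t count) {
  const ScalableTag<float> d;
  const size_t N = Lanes(d);
  for (size_t i = 0; i + N <= count; i += N) {
    StoreU(Add(LoadU(d, a + i), LoadU(d, b + i)), d, out + i);
  }
  // Remainder lanes (count % N) would need a capped tag or a scalar tail.
}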
namespace detail {
#if HWY_HAVE_SCALABLE
template <typename T, size_t N, int kPow2>
constexpr bool IsFull(Simd<T, N, kPow2> /* d */) {
return N == HWY_LANES(T) && kPow2 == 0;
}
#endif
// Returns the number of lanes (possibly zero) after applying a shift:
// - 0: no change;
// - [1,3]: a group of 2,4,8 [fractional] vectors;
// - [-3,-1]: a fraction of a vector from 1/8 to 1/2.
// Given N from HWY_LANES(T), returns N for use in Simd<T, N> to describe:
// - a full vector (pow2 = 0);
// - 2,4,8 regs on RVV, otherwise a full vector (pow2 [1,3]);
// - a fraction of a register from 1/8 to 1/2 (pow2 [-3,-1]).
constexpr size_t ScaleByPower(size_t N, int pow2) {
return pow2 >= 0 ? (N << pow2) : (N >> (-pow2));
#if HWY_TARGET == HWY_RVV
// For fractions, if N == 1 ensure we still return at least one lane.
return pow2 >= 0 ? (N << pow2) : HWY_MAX(1, (N >> (-pow2)));
#else
// If pow2 > 0, replace it with 0 (there is nothing wider than a full vector).
return HWY_MAX(1, N >> HWY_MAX(-pow2, 0));
#endif
}
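Worked values for the non-RVV branch (assuming HWY_LANES(float) == 4 on a 128-bit target; an illustrative aside):

// ScaleByPower(4, 0)  == 4  // full vector
// ScaleByPower(4, -1) == 2  // half vector
// ScaleByPower(4, -3) == 1  // 4 >> 3 would be 0; clamped to one lane
// ScaleByPower(4, 1)  == 4  // pow2 > 0 treated as 0: nothing wider than full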
// Struct wrappers enable validation of arguments via static_assert.
template <typename T, int kPow2>
struct ScalableTagChecker {
static_assert(-3 <= kPow2 && kPow2 <= 3, "Fraction must be 1/8 to 8");
#if HWY_TARGET == HWY_RVV
// Only RVV supports register groups.
using type = Simd<T, HWY_LANES(T), kPow2>;
#elif HWY_HAVE_SCALABLE
// For SVE[2], only allow full or fractions.
using type = Simd<T, HWY_LANES(T), HWY_MIN(kPow2, 0)>;
#elif HWY_TARGET == HWY_SCALAR
using type = Simd<T, /*N=*/1, 0>;
#else
// Only allow full or fractions.
using type = Simd<T, ScaleByPower(HWY_LANES(T), HWY_MIN(kPow2, 0)), 0>;
#endif
using type = Simd<T, ScaleByPower(HWY_LANES(T), kPow2)>;
};
template <typename T, size_t kLimit>
struct CappedTagChecker {
static_assert(kLimit != 0, "Does not make sense to have zero lanes");
using type = Simd<T, HWY_MIN(kLimit, HWY_MAX_BYTES / sizeof(T)), 0>;
using type = Simd<T, HWY_MIN(kLimit, HWY_LANES(T))>;
};
template <typename T, size_t kNumLanes>
@ -147,7 +95,7 @@ struct FixedTagChecker {
// HWY_MAX_BYTES would still allow uint8x8, which is not supported.
static_assert(kNumLanes == 1, "Scalar only supports one lane");
#endif
using type = Simd<T, kNumLanes, 0>;
using type = Simd<T, kNumLanes>;
};
} // namespace detail
@ -166,14 +114,15 @@ using ScalableTag = typename detail::ScalableTagChecker<T, kPow2>::type;
// typically used for 1D loops with a relatively low application-defined upper
// bound, e.g. for 8x8 DCTs. However, it is better if data structures are
// designed to be vector-length-agnostic (e.g. a hybrid SoA where there are
// chunks of `M >= MaxLanes(d)` DC components followed by M AC1, .., and M AC63;
// chunks of say 256 DC components followed by 256 AC1 and finally 256 AC63;
// this would enable vector-length-agnostic loops using ScalableTag).
template <typename T, size_t kLimit>
using CappedTag = typename detail::CappedTagChecker<T, kLimit>::type;
// Alias for a tag describing a vector with *exactly* kNumLanes active lanes,
// even on targets with scalable vectors. HWY_SCALAR only supports one lane.
// All other targets allow kNumLanes up to HWY_MAX_BYTES / sizeof(T).
// even on targets with scalable vectors. All targets except HWY_SCALAR support
// up to 16 / sizeof(T). Other targets may allow larger kNumLanes, but relying
// on that is non-portable and discouraged.
//
// NOTE: if the application does not need to support HWY_SCALAR (+), use this
// instead of CappedTag to emphasize that there will be exactly kNumLanes lanes.
@ -214,11 +163,11 @@ using RepartitionToNarrow = Repartition<MakeNarrow<TFromD<D>>, D>;
template <class D>
using Half = typename D::Half;
// Tag for the same lane type as D, but twice the lanes.
// Descriptor for the same lane type as D, but twice the lanes.
template <class D>
using Twice = typename D::Twice;
// Same as base.h macros but with a Simd<T, N, kPow2> argument instead of T.
// Same as base.h macros but with a Simd<T, N> argument instead of T.
#define HWY_IF_UNSIGNED_D(D) HWY_IF_UNSIGNED(TFromD<D>)
#define HWY_IF_SIGNED_D(D) HWY_IF_SIGNED(TFromD<D>)
#define HWY_IF_FLOAT_D(D) HWY_IF_FLOAT(TFromD<D>)
@ -226,12 +175,6 @@ using Twice = typename D::Twice;
#define HWY_IF_LANE_SIZE_D(D, bytes) HWY_IF_LANE_SIZE(TFromD<D>, bytes)
#define HWY_IF_NOT_LANE_SIZE_D(D, bytes) HWY_IF_NOT_LANE_SIZE(TFromD<D>, bytes)
// MSVC workaround: use PrivateN directly instead of MaxLanes.
#define HWY_IF_LT128_D(D) \
hwy::EnableIf<D::kPrivateN * sizeof(TFromD<D>) < 16>* = nullptr
#define HWY_IF_GE128_D(D) \
hwy::EnableIf<D::kPrivateN * sizeof(TFromD<D>) >= 16>* = nullptr
// Same, but with a vector argument.
#define HWY_IF_UNSIGNED_V(V) HWY_IF_UNSIGNED(TFromV<V>)
#define HWY_IF_SIGNED_V(V) HWY_IF_SIGNED(TFromV<V>)
@ -240,59 +183,42 @@ using Twice = typename D::Twice;
// For implementing functions for a specific type.
// IsSame<...>() in template arguments is broken on MSVC2015.
#define HWY_IF_LANES_ARE(T, V) EnableIf<IsSameT<T, TFromV<V>>::value>* = nullptr
#define HWY_IF_LANES_ARE(T, V) \
EnableIf<IsSameT<T, TFromD<DFromV<V>>>::value>* = nullptr
template <class D>
HWY_INLINE HWY_MAYBE_UNUSED constexpr int Pow2(D /* d */) {
return D::kPrivatePow2;
}
// MSVC requires the explicit <D>.
#define HWY_IF_POW2_GE(D, MIN) hwy::EnableIf<Pow2<D>(D()) >= (MIN)>* = nullptr
#if HWY_HAVE_SCALABLE
// Upper bound on the number of lanes. Intended for template arguments and
// reducing code size (e.g. for SSE4, we know at compile-time that vectors will
// not exceed 16 bytes). WARNING: this may be a loose bound; use Lanes() as the
// actual size for allocating storage. WARNING: MSVC might not be able to deduce
// arguments if this is used in EnableIf. See HWY_IF_LT128_D above.
template <class D>
HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D) {
return detail::ScaleByPower(HWY_MIN(D::kPrivateN, HWY_LANES(TFromD<D>)),
D::kPrivatePow2);
}
#else
// Workaround for MSVC 2017: T,N,kPow2 argument deduction fails, so returning N
// is not an option, nor does a member function work.
template <class D>
HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D) {
return D::kPrivateN;
}
// (Potentially) non-constant actual size of the vector at runtime, subject to
// the limit imposed by the Simd. Useful for advancing loop counters.
// Targets with scalable vectors define this themselves.
template <typename T, size_t N, int kPow2>
HWY_INLINE HWY_MAYBE_UNUSED size_t Lanes(Simd<T, N, kPow2>) {
// Compile-time constant; typically (but not guaranteed to be) an upper bound
// on the number of lanes.
// Prefer instead using Lanes() and dynamic allocation, or Rebind, or
// `#if HWY_CAP_GE*`.
template <typename T, size_t N>
HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(Simd<T, N>) {
return N;
}
#endif // !HWY_HAVE_SCALABLE
// Targets with non-constexpr Lanes define this themselves.
#if HWY_TARGET != HWY_RVV && HWY_TARGET != HWY_SVE2 && HWY_TARGET != HWY_SVE
// (Potentially) non-constant actual size of the vector at runtime, subject to
// the limit imposed by the Simd. Useful for advancing loop counters.
template <typename T, size_t N>
HWY_INLINE HWY_MAYBE_UNUSED size_t Lanes(Simd<T, N>) {
return N;
}
#endif
// NOTE: GCC generates incorrect code for vector arguments to non-inlined
// functions in two situations:
// - on Windows and GCC 10.3, passing by value crashes due to unaligned loads:
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412.
// - on ARM64 and GCC 9.3.0 or 11.2.1, passing by value causes many (but not
// - on ARM64 and GCC 9.3.0 or 11.2.1, passing by const& causes many (but not
// all) tests to fail.
//
// We therefore pass by const& only on GCC and (Windows or ARM64). This alias
// must be used for all vector/mask parameters of functions marked HWY_NOINLINE,
// and possibly also other functions that are not inlined.
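A sketch of the intended usage (hypothetical function; Add and HWY_NOINLINE as usual):

// Vector parameters of non-inlined functions go through VecArg, so the alias
// below can silently switch to pass-by-const& on the affected GCC targets.
template <class V>
HWY_NOINLINE V DoubleNoinline(VecArg<V> v) {
  return Add(v, v);
}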
#if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG && \
((defined(_WIN32) || defined(_WIN64)) || HWY_ARCH_ARM_A64)
((defined(_WIN32) || defined(_WIN64)) || HWY_ARCH_ARM64)
template <class V>
using VecArg = const V&;
#else

1057
third_party/highway/hwy/ops/wasm_128-inl.h vendored

Diff not shown due to its large size.

2171
third_party/highway/hwy/ops/wasm_256-inl.h vendored

Diff not shown due to its large size.

1097
third_party/highway/hwy/ops/x86_128-inl.h vendored

Diff not shown due to its large size.

329
third_party/highway/hwy/ops/x86_256-inl.h vendored
View file

@ -44,6 +44,10 @@
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
template <typename T>
using Full256 = Simd<T, 32 / sizeof(T)>;
namespace detail {
template <typename T>
@ -322,38 +326,6 @@ HWY_API Vec256<T> Not(const Vec256<T> v) {
#endif
}
// ------------------------------ OrAnd
template <typename T>
HWY_API Vec256<T> OrAnd(Vec256<T> o, Vec256<T> a1, Vec256<T> a2) {
#if HWY_TARGET <= HWY_AVX3
const Full256<T> d;
const RebindToUnsigned<decltype(d)> du;
using VU = VFromD<decltype(du)>;
const __m256i ret = _mm256_ternarylogic_epi64(
BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8);
return BitCast(d, VU{ret});
#else
return Or(o, And(a1, a2));
#endif
}
// ------------------------------ IfVecThenElse
template <typename T>
HWY_API Vec256<T> IfVecThenElse(Vec256<T> mask, Vec256<T> yes, Vec256<T> no) {
#if HWY_TARGET <= HWY_AVX3
const Full256<T> d;
const RebindToUnsigned<decltype(d)> du;
using VU = VFromD<decltype(du)>;
return BitCast(d, VU{_mm256_ternarylogic_epi64(BitCast(du, mask).raw,
BitCast(du, yes).raw,
BitCast(du, no).raw, 0xCA)});
#else
return IfThenElse(MaskFromVec(mask), yes, no);
#endif
}
// ------------------------------ Operator overloads (internal-only if float)
template <typename T>
@ -813,7 +785,6 @@ HWY_API Vec256<T> IfThenZeroElse(Mask256<T> mask, Vec256<T> no) {
template <typename T, HWY_IF_FLOAT(T)>
HWY_API Vec256<T> ZeroIfNegative(Vec256<T> v) {
const auto zero = Zero(Full256<T>());
// AVX2 IfThenElse only looks at the MSB for 32/64-bit lanes
return IfThenElse(MaskFromVec(v), zero, v);
}
@ -1424,12 +1395,7 @@ HWY_API Vec256<double> operator-(const Vec256<double> a,
return Vec256<double>{_mm256_sub_pd(a.raw, b.raw)};
}
// ------------------------------ SumsOf8
HWY_API Vec256<uint64_t> SumsOf8(const Vec256<uint8_t> v) {
return Vec256<uint64_t>{_mm256_sad_epu8(v.raw, _mm256_setzero_si256())};
}
// ------------------------------ SaturatedAdd
// ------------------------------ Saturating addition
// Returns a + b clamped to the destination range.
@ -1453,7 +1419,7 @@ HWY_API Vec256<int16_t> SaturatedAdd(const Vec256<int16_t> a,
return Vec256<int16_t>{_mm256_adds_epi16(a.raw, b.raw)};
}
// ------------------------------ SaturatedSub
// ------------------------------ Saturating subtraction
// Returns a - b clamped to the destination range.
@ -1719,35 +1685,6 @@ HWY_API Vec256<int64_t> Abs(const Vec256<int64_t> v) {
#endif
}
// ------------------------------ IfNegativeThenElse (BroadcastSignBit)
HWY_API Vec256<int8_t> IfNegativeThenElse(Vec256<int8_t> v, Vec256<int8_t> yes,
Vec256<int8_t> no) {
// int8: AVX2 IfThenElse only looks at the MSB.
return IfThenElse(MaskFromVec(v), yes, no);
}
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec256<T> IfNegativeThenElse(Vec256<T> v, Vec256<T> yes, Vec256<T> no) {
static_assert(IsSigned<T>(), "Only works for signed/float");
const Full256<T> d;
const RebindToSigned<decltype(d)> di;
// 16-bit: no native blendv, so copy sign to lower byte's MSB.
v = BitCast(d, BroadcastSignBit(BitCast(di, v)));
return IfThenElse(MaskFromVec(v), yes, no);
}
template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
HWY_API Vec256<T> IfNegativeThenElse(Vec256<T> v, Vec256<T> yes, Vec256<T> no) {
static_assert(IsSigned<T>(), "Only works for signed/float");
const Full256<T> d;
const RebindToFloat<decltype(d)> df;
// 32/64-bit: use float IfThenElse, which only looks at the MSB.
const MFromD<decltype(df)> msb = MaskFromVec(BitCast(df, v));
return BitCast(d, IfThenElse(msb, BitCast(df, yes), BitCast(df, no)));
}
// ------------------------------ ShiftLeftSame
HWY_API Vec256<uint16_t> ShiftLeftSame(const Vec256<uint16_t> v,
@ -2297,7 +2234,7 @@ HWY_API void ScatterOffset(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
Store(v, d, lanes);
alignas(32) Offset offset_lanes[N];
Store(offset, Full256<Offset>(), offset_lanes);
Store(offset, Simd<Offset, N>(), offset_lanes);
uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
for (size_t i = 0; i < N; ++i) {
@ -2315,7 +2252,7 @@ HWY_API void ScatterIndex(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
Store(v, d, lanes);
alignas(32) Index index_lanes[N];
Store(index, Full256<Index>(), index_lanes);
Store(index, Simd<Index, N>(), index_lanes);
for (size_t i = 0; i < N; ++i) {
base[index_lanes[i]] = lanes[i];
@ -2536,7 +2473,7 @@ HWY_API Vec256<T> ShiftRightBytes(Full256<T> /* tag */, const Vec256<T> v) {
template <int kLanes, typename T>
HWY_API Vec256<T> ShiftRightLanes(Full256<T> d, const Vec256<T> v) {
const Repartition<uint8_t, decltype(d)> d8;
return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
}
// ------------------------------ CombineShiftRightBytes
@ -2796,81 +2733,6 @@ HWY_API Vec256<T> Reverse(Full256<T> d, const Vec256<T> v) {
#endif
}
// ------------------------------ Reverse2
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec256<T> Reverse2(Full256<T> d, const Vec256<T> v) {
const Full256<uint32_t> du32;
return BitCast(d, RotateRight<16>(BitCast(du32, v)));
}
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec256<T> Reverse2(Full256<T> /* tag */, const Vec256<T> v) {
return Shuffle2301(v);
}
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec256<T> Reverse2(Full256<T> /* tag */, const Vec256<T> v) {
return Shuffle01(v);
}
// ------------------------------ Reverse4
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec256<T> Reverse4(Full256<T> d, const Vec256<T> v) {
#if HWY_TARGET <= HWY_AVX3
const RebindToSigned<decltype(d)> di;
alignas(32) constexpr int16_t kReverse4[16] = {3, 2, 1, 0, 7, 6, 5, 4,
11, 10, 9, 8, 15, 14, 13, 12};
const Vec256<int16_t> idx = Load(di, kReverse4);
return BitCast(d, Vec256<int16_t>{
_mm256_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
#else
const RepartitionToWide<decltype(d)> dw;
return Reverse2(d, BitCast(d, Shuffle2301(BitCast(dw, v))));
#endif
}
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec256<T> Reverse4(Full256<T> /* tag */, const Vec256<T> v) {
return Shuffle0123(v);
}
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec256<T> Reverse4(Full256<T> /* tag */, const Vec256<T> v) {
return Vec256<T>{_mm256_permute4x64_epi64(v.raw, _MM_SHUFFLE(0, 1, 2, 3))};
}
HWY_API Vec256<double> Reverse4(Full256<double> /* tag */, Vec256<double> v) {
return Vec256<double>{_mm256_permute4x64_pd(v.raw, _MM_SHUFFLE(0, 1, 2, 3))};
}
// ------------------------------ Reverse8
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec256<T> Reverse8(Full256<T> d, const Vec256<T> v) {
#if HWY_TARGET <= HWY_AVX3
const RebindToSigned<decltype(d)> di;
alignas(32) constexpr int16_t kReverse8[16] = {7, 6, 5, 4, 3, 2, 1, 0,
15, 14, 13, 12, 11, 10, 9, 8};
const Vec256<int16_t> idx = Load(di, kReverse8);
return BitCast(d, Vec256<int16_t>{
_mm256_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
#else
const RepartitionToWide<decltype(d)> dw;
return Reverse2(d, BitCast(d, Shuffle0123(BitCast(dw, v))));
#endif
}
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec256<T> Reverse8(Full256<T> d, const Vec256<T> v) {
return Reverse(d, v);
}
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec256<T> Reverse8(Full256<T> /* tag */, const Vec256<T> /* v */) {
HWY_ASSERT(0); // AVX2 does not have 8 64-bit lanes
}
// ------------------------------ InterleaveLower
// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
@ -2920,6 +2782,12 @@ HWY_API Vec256<double> InterleaveLower(const Vec256<double> a,
return Vec256<double>{_mm256_unpacklo_pd(a.raw, b.raw)};
}
// Additional overload for the optional Simd<> tag.
template <typename T, class V = Vec256<T>>
HWY_API V InterleaveLower(Full256<T> /* tag */, V a, V b) {
return InterleaveLower(a, b);
}
// ------------------------------ InterleaveUpper
// All functions inside detail lack the required D parameter.
@ -2981,11 +2849,11 @@ HWY_API V InterleaveUpper(Full256<T> /* tag */, V a, V b) {
// this is necessary because the single-lane scalar cannot return two values.
template <typename T, typename TW = MakeWide<T>>
HWY_API Vec256<TW> ZipLower(Vec256<T> a, Vec256<T> b) {
return BitCast(Full256<TW>(), InterleaveLower(a, b));
return BitCast(Full256<TW>(), InterleaveLower(Full256<T>(), a, b));
}
template <typename T, typename TW = MakeWide<T>>
HWY_API Vec256<TW> ZipLower(Full256<TW> dw, Vec256<T> a, Vec256<T> b) {
return BitCast(dw, InterleaveLower(a, b));
return BitCast(dw, InterleaveLower(Full256<T>(), a, b));
}
template <typename T, typename TW = MakeWide<T>>
@ -3195,38 +3063,6 @@ HWY_API Vec256<double> ConcatEven(Full256<double> d, Vec256<double> hi,
#endif
}
// ------------------------------ DupEven (InterleaveLower)
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec256<T> DupEven(Vec256<T> v) {
return Vec256<T>{_mm256_shuffle_epi32(v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
}
HWY_API Vec256<float> DupEven(Vec256<float> v) {
return Vec256<float>{
_mm256_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
}
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec256<T> DupEven(const Vec256<T> v) {
return InterleaveLower(Full256<T>(), v, v);
}
// ------------------------------ DupOdd (InterleaveUpper)
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec256<T> DupOdd(Vec256<T> v) {
return Vec256<T>{_mm256_shuffle_epi32(v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
}
HWY_API Vec256<float> DupOdd(Vec256<float> v) {
return Vec256<float>{
_mm256_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
}
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec256<T> DupOdd(const Vec256<T> v) {
return InterleaveUpper(Full256<T>(), v, v);
}
// ------------------------------ OddEven
namespace detail {
@ -3304,13 +3140,6 @@ HWY_API Vec256<double> SwapAdjacentBlocks(Vec256<double> v) {
return Vec256<double>{_mm256_permute4x64_pd(v.raw, _MM_SHUFFLE(1, 0, 3, 2))};
}
// ------------------------------ ReverseBlocks (ConcatLowerUpper)
template <typename T>
HWY_API Vec256<T> ReverseBlocks(Full256<T> d, Vec256<T> v) {
return ConcatLowerUpper(d, v, v);
}
// ------------------------------ TableLookupBytes (ZeroExtendVector)
// Both full
@ -3607,7 +3436,7 @@ HWY_API Vec128<int16_t> DemoteTo(Full128<int16_t> /* tag */,
_mm256_castsi256_si128(_mm256_permute4x64_epi64(i16, 0x88))};
}
HWY_API Vec128<uint8_t, 8> DemoteTo(Full64<uint8_t> /* tag */,
HWY_API Vec128<uint8_t, 8> DemoteTo(Simd<uint8_t, 8> /* tag */,
const Vec256<int32_t> v) {
const __m256i u16_blocks = _mm256_packus_epi32(v.raw, v.raw);
// Concatenate lower 64 bits of each 128-bit block
@ -3626,7 +3455,7 @@ HWY_API Vec128<uint8_t> DemoteTo(Full128<uint8_t> /* tag */,
_mm256_castsi256_si128(_mm256_permute4x64_epi64(u8, 0x88))};
}
HWY_API Vec128<int8_t, 8> DemoteTo(Full64<int8_t> /* tag */,
HWY_API Vec128<int8_t, 8> DemoteTo(Simd<int8_t, 8> /* tag */,
const Vec256<int32_t> v) {
const __m256i i16_blocks = _mm256_packs_epi32(v.raw, v.raw);
// Concatenate lower 64 bits of each 128-bit block
@ -3724,7 +3553,7 @@ HWY_API Vec128<uint8_t, 8> U8FromU32(const Vec256<uint32_t> v) {
const auto lo = LowerHalf(quad);
const auto hi = UpperHalf(Full128<uint32_t>(), quad);
const auto pair = LowerHalf(lo | hi);
return BitCast(Full64<uint8_t>(), pair);
return BitCast(Simd<uint8_t, 8>(), pair);
}
// ------------------------------ Integer <=> fp (ShiftRight, OddEven)
@ -3862,19 +3691,6 @@ HWY_API Vec256<uint8_t> AESRound(Vec256<uint8_t> state,
#endif
}
HWY_API Vec256<uint8_t> AESLastRound(Vec256<uint8_t> state,
Vec256<uint8_t> round_key) {
#if HWY_TARGET == HWY_AVX3_DL
return Vec256<uint8_t>{_mm256_aesenclast_epi128(state.raw, round_key.raw)};
#else
const Full256<uint8_t> d;
const Half<decltype(d)> d2;
return Combine(d,
AESLastRound(UpperHalf(d2, state), UpperHalf(d2, round_key)),
AESLastRound(LowerHalf(state), LowerHalf(round_key)));
#endif
}
HWY_API Vec256<uint64_t> CLMulLower(Vec256<uint64_t> a, Vec256<uint64_t> b) {
#if HWY_TARGET == HWY_AVX3_DL
return Vec256<uint64_t>{_mm256_clmulepi64_epi128(a.raw, b.raw, 0x00)};
@ -4203,7 +4019,7 @@ HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
#if HWY_TARGET <= HWY_AVX3_DL
return CompressStore(v, m, d, unaligned); // also native
#else
const size_t count = CountTrue(d, m);
const size_t count = CountTrue(m);
const Vec256<T> compressed = Compress(v, m);
// There is no 16-bit MaskedStore, so blend.
const Vec256<T> prev = LoadU(d, unaligned);
@ -4428,7 +4244,7 @@ HWY_API intptr_t FindFirstTrue(const Full256<T> /* tag */,
namespace detail {
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
HWY_INLINE Indices256<uint32_t> IndicesFromBits(Simd<T, 8> d,
uint64_t mask_bits) {
const RebindToUnsigned<decltype(d)> d32;
// We need a masked Iota(). With 8 lanes, there are 256 combinations and a LUT
@ -4491,7 +4307,7 @@ HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
}
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
HWY_INLINE Indices256<uint32_t> IndicesFromBits(Simd<T, 4> d,
uint64_t mask_bits) {
const Repartition<uint32_t, decltype(d)> d32;
@ -4537,8 +4353,8 @@ HWY_INLINE Vec256<T> Compress(Vec256<T> v, const uint64_t mask_bits) {
const auto compressed1 = Compress(promoted1, mask_bits1);
const Half<decltype(du)> dh;
const auto demoted0 = ZeroExtendVector(du, DemoteTo(dh, compressed0));
const auto demoted1 = ZeroExtendVector(du, DemoteTo(dh, compressed1));
const auto demoted0 = ZeroExtendVector(DemoteTo(dh, compressed0));
const auto demoted1 = ZeroExtendVector(DemoteTo(dh, compressed1));
const size_t count0 = PopCount(mask_bits0);
// Now combine by shifting demoted1 up. AVX2 lacks VPERMW, so start with
@ -4809,6 +4625,101 @@ HWY_API Vec256<T> MaxOfLanes(Full256<T> d, const Vec256<T> vHL) {
return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), Max(vLH, vHL));
}
// ================================================== DEPRECATED
template <typename T>
HWY_API size_t StoreMaskBits(const Mask256<T> mask, uint8_t* bits) {
return StoreMaskBits(Full256<T>(), mask, bits);
}
template <typename T>
HWY_API bool AllTrue(const Mask256<T> mask) {
return AllTrue(Full256<T>(), mask);
}
template <typename T>
HWY_API bool AllFalse(const Mask256<T> mask) {
return AllFalse(Full256<T>(), mask);
}
template <typename T>
HWY_API size_t CountTrue(const Mask256<T> mask) {
return CountTrue(Full256<T>(), mask);
}
template <typename T>
HWY_API Vec256<T> SumOfLanes(const Vec256<T> vHL) {
return SumOfLanes(Full256<T>(), vHL);
}
template <typename T>
HWY_API Vec256<T> MinOfLanes(const Vec256<T> vHL) {
return MinOfLanes(Full256<T>(), vHL);
}
template <typename T>
HWY_API Vec256<T> MaxOfLanes(const Vec256<T> vHL) {
return MaxOfLanes(Full256<T>(), vHL);
}
template <typename T>
HWY_API Vec128<T> UpperHalf(Vec256<T> v) {
return UpperHalf(Full128<T>(), v);
}
template <int kBytes, typename T>
HWY_API Vec256<T> ShiftRightBytes(const Vec256<T> v) {
return ShiftRightBytes<kBytes>(Full256<T>(), v);
}
template <int kLanes, typename T>
HWY_API Vec256<T> ShiftRightLanes(const Vec256<T> v) {
return ShiftRightLanes<kLanes>(Full256<T>(), v);
}
template <size_t kBytes, typename T>
HWY_API Vec256<T> CombineShiftRightBytes(Vec256<T> hi, Vec256<T> lo) {
return CombineShiftRightBytes<kBytes>(Full256<T>(), hi, lo);
}
template <typename T>
HWY_API Vec256<T> InterleaveUpper(Vec256<T> a, Vec256<T> b) {
return InterleaveUpper(Full256<T>(), a, b);
}
template <typename T>
HWY_API Vec256<MakeWide<T>> ZipUpper(Vec256<T> a, Vec256<T> b) {
return InterleaveUpper(Full256<MakeWide<T>>(), a, b);
}
template <typename T>
HWY_API Vec256<T> Combine(Vec128<T> hi, Vec128<T> lo) {
return Combine(Full256<T>(), hi, lo);
}
template <typename T>
HWY_API Vec256<T> ZeroExtendVector(Vec128<T> lo) {
return ZeroExtendVector(Full256<T>(), lo);
}
template <typename T>
HWY_API Vec256<T> ConcatLowerLower(Vec256<T> hi, Vec256<T> lo) {
return ConcatLowerLower(Full256<T>(), hi, lo);
}
template <typename T>
HWY_API Vec256<T> ConcatLowerUpper(Vec256<T> hi, Vec256<T> lo) {
return ConcatLowerUpper(Full256<T>(), hi, lo);
}
template <typename T>
HWY_API Vec256<T> ConcatUpperLower(Vec256<T> hi, Vec256<T> lo) {
return ConcatUpperLower(Full256<T>(), hi, lo);
}
template <typename T>
HWY_API Vec256<T> ConcatUpperUpper(Vec256<T> hi, Vec256<T> lo) {
return ConcatUpperUpper(Full256<T>(), hi, lo);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy

340
third_party/highway/hwy/ops/x86_512-inl.h vendored
View file

@ -57,6 +57,9 @@ HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
template <typename T>
using Full512 = Simd<T, 64 / sizeof(T)>;
namespace detail {
template <typename T>
@ -310,30 +313,6 @@ HWY_API Vec512<double> Xor(const Vec512<double> a, const Vec512<double> b) {
return Vec512<double>{_mm512_xor_pd(a.raw, b.raw)};
}
// ------------------------------ OrAnd
template <typename T>
HWY_API Vec512<T> OrAnd(Vec512<T> o, Vec512<T> a1, Vec512<T> a2) {
const Full512<T> d;
const RebindToUnsigned<decltype(d)> du;
using VU = VFromD<decltype(du)>;
const __m512i ret = _mm512_ternarylogic_epi64(
BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8);
return BitCast(d, VU{ret});
}
// ------------------------------ IfVecThenElse
template <typename T>
HWY_API Vec512<T> IfVecThenElse(Vec512<T> mask, Vec512<T> yes, Vec512<T> no) {
const Full512<T> d;
const RebindToUnsigned<decltype(d)> du;
using VU = VFromD<decltype(du)>;
return BitCast(d, VU{_mm512_ternarylogic_epi64(BitCast(du, mask).raw,
BitCast(du, yes).raw,
BitCast(du, no).raw, 0xCA)});
}
// ------------------------------ Operator overloads (internal-only if float)
template <typename T>
@ -600,13 +579,6 @@ HWY_API Vec512<double> IfThenZeroElse(const Mask512<double> mask,
return Vec512<double>{_mm512_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)};
}
template <typename T>
HWY_API Vec512<T> IfNegativeThenElse(Vec512<T> v, Vec512<T> yes, Vec512<T> no) {
static_assert(IsSigned<T>(), "Only works for signed/float");
// AVX3 MaskFromVec only looks at the MSB
return IfThenElse(MaskFromVec(v), yes, no);
}
template <typename T, HWY_IF_FLOAT(T)>
HWY_API Vec512<T> ZeroIfNegative(const Vec512<T> v) {
// AVX3 MaskFromVec only looks at the MSB
@ -709,12 +681,7 @@ HWY_API Vec512<double> operator-(const Vec512<double> a,
return Vec512<double>{_mm512_sub_pd(a.raw, b.raw)};
}
// ------------------------------ SumsOf8
HWY_API Vec512<uint64_t> SumsOf8(const Vec512<uint8_t> v) {
return Vec512<uint64_t>{_mm512_sad_epu8(v.raw, _mm512_setzero_si512())};
}
// ------------------------------ SaturatedAdd
// ------------------------------ Saturating addition
// Returns a + b clamped to the destination range.
@ -738,7 +705,7 @@ HWY_API Vec512<int16_t> SaturatedAdd(const Vec512<int16_t> a,
return Vec512<int16_t>{_mm512_adds_epi16(a.raw, b.raw)};
}
// ------------------------------ SaturatedSub
// ------------------------------ Saturating subtraction
// Returns a - b clamped to the destination range.
@ -1853,7 +1820,7 @@ HWY_API Vec512<T> LoadDup128(Full512<T> /* tag */,
// https://gcc.godbolt.org/z/-Jt_-F
#if HWY_LOADDUP_ASM
__m512i out;
asm("vbroadcasti128 %1, %[reg]" : [reg] "=x"(out) : "m"(p[0]));
asm("vbroadcasti128 %1, %[reg]" : [ reg ] "=x"(out) : "m"(p[0]));
return Vec512<T>{out};
#else
const auto x4 = LoadU(Full128<T>(), p);
@ -1864,7 +1831,7 @@ HWY_API Vec512<float> LoadDup128(Full512<float> /* tag */,
const float* const HWY_RESTRICT p) {
#if HWY_LOADDUP_ASM
__m512 out;
asm("vbroadcastf128 %1, %[reg]" : [reg] "=x"(out) : "m"(p[0]));
asm("vbroadcastf128 %1, %[reg]" : [ reg ] "=x"(out) : "m"(p[0]));
return Vec512<float>{out};
#else
const __m128 x4 = _mm_loadu_ps(p);
@ -1876,7 +1843,7 @@ HWY_API Vec512<double> LoadDup128(Full512<double> /* tag */,
const double* const HWY_RESTRICT p) {
#if HWY_LOADDUP_ASM
__m512d out;
asm("vbroadcastf128 %1, %[reg]" : [reg] "=x"(out) : "m"(p[0]));
asm("vbroadcastf128 %1, %[reg]" : [ reg ] "=x"(out) : "m"(p[0]));
return Vec512<double>{out};
#else
const __m128d x2 = _mm_loadu_pd(p);
@ -2040,7 +2007,7 @@ HWY_INLINE Vec512<T> GatherIndex(hwy::SizeTag<8> /* tag */,
template <typename T, typename Offset>
HWY_API Vec512<T> GatherOffset(Full512<T> d, const T* HWY_RESTRICT base,
const Vec512<Offset> offset) {
static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
return detail::GatherOffset(hwy::SizeTag<sizeof(T)>(), d, base, offset);
}
template <typename T, typename Index>
@ -2206,7 +2173,7 @@ HWY_API Vec512<T> ShiftRightBytes(Full512<T> /* tag */, const Vec512<T> v) {
template <int kLanes, typename T>
HWY_API Vec512<T> ShiftRightLanes(Full512<T> d, const Vec512<T> v) {
const Repartition<uint8_t, decltype(d)> d8;
return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
}
// ------------------------------ CombineShiftRightBytes
@ -2429,78 +2396,6 @@ HWY_API Vec512<T> Reverse(Full512<T> d, const Vec512<T> v) {
return TableLookupLanes(v, SetTableIndices(d, kReverse));
}
// ------------------------------ Reverse2
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec512<T> Reverse2(Full512<T> d, const Vec512<T> v) {
const Full512<uint32_t> du32;
return BitCast(d, RotateRight<16>(BitCast(du32, v)));
}
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec512<T> Reverse2(Full512<T> /* tag */, const Vec512<T> v) {
return Shuffle2301(v);
}
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec512<T> Reverse2(Full512<T> /* tag */, const Vec512<T> v) {
return Shuffle01(v);
}
// ------------------------------ Reverse4
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec512<T> Reverse4(Full512<T> d, const Vec512<T> v) {
const RebindToSigned<decltype(d)> di;
alignas(64) constexpr int16_t kReverse4[32] = {
3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28};
const Vec512<int16_t> idx = Load(di, kReverse4);
return BitCast(d, Vec512<int16_t>{
_mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
}
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec512<T> Reverse4(Full512<T> /* tag */, const Vec512<T> v) {
return Shuffle0123(v);
}
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec512<T> Reverse4(Full512<T> /* tag */, const Vec512<T> v) {
return Vec512<T>{_mm512_permutex_epi64(v.raw, _MM_SHUFFLE(0, 1, 2, 3))};
}
HWY_API Vec512<double> Reverse4(Full512<double> /* tag */, Vec512<double> v) {
return Vec512<double>{_mm512_permutex_pd(v.raw, _MM_SHUFFLE(0, 1, 2, 3))};
}
// ------------------------------ Reverse8
template <typename T, HWY_IF_LANE_SIZE(T, 2)>
HWY_API Vec512<T> Reverse8(Full512<T> d, const Vec512<T> v) {
const RebindToSigned<decltype(d)> di;
alignas(64) constexpr int16_t kReverse8[32] = {
7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8,
23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24};
const Vec512<int16_t> idx = Load(di, kReverse8);
return BitCast(d, Vec512<int16_t>{
_mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
}
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec512<T> Reverse8(Full512<T> d, const Vec512<T> v) {
const RebindToSigned<decltype(d)> di;
alignas(64) constexpr int32_t kReverse8[16] = {7, 6, 5, 4, 3, 2, 1, 0,
15, 14, 13, 12, 11, 10, 9, 8};
const Vec512<int32_t> idx = Load(di, kReverse8);
return BitCast(d, Vec512<int32_t>{
_mm512_permutexvar_epi32(idx.raw, BitCast(di, v).raw)});
}
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec512<T> Reverse8(Full512<T> d, const Vec512<T> v) {
return Reverse(d, v);
}
// ------------------------------ InterleaveLower
// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
@ -2550,6 +2445,12 @@ HWY_API Vec512<double> InterleaveLower(const Vec512<double> a,
return Vec512<double>{_mm512_unpacklo_pd(a.raw, b.raw)};
}
// Additional overload for the optional Simd<> tag.
template <typename T, class V = Vec512<T>>
HWY_API V InterleaveLower(Full512<T> /* tag */, V a, V b) {
return InterleaveLower(a, b);
}
// ------------------------------ InterleaveUpper
// All functions inside detail lack the required D parameter.
@ -2614,8 +2515,8 @@ HWY_API Vec512<TW> ZipLower(Vec512<T> a, Vec512<T> b) {
return BitCast(Full512<TW>(), InterleaveLower(a, b));
}
template <typename T, typename TW = MakeWide<T>>
HWY_API Vec512<TW> ZipLower(Full512<TW> /* d */, Vec512<T> a, Vec512<T> b) {
return BitCast(Full512<TW>(), InterleaveLower(a, b));
HWY_API Vec512<TW> ZipLower(Full512<TW> d, Vec512<T> a, Vec512<T> b) {
return BitCast(Full512<TW>(), InterleaveLower(d, a, b));
}
template <typename T, typename TW = MakeWide<T>>
@ -2663,17 +2564,17 @@ HWY_API Vec512<double> ConcatUpperUpper(Full512<double> /* tag */,
template <typename T>
HWY_API Vec512<T> ConcatLowerUpper(Full512<T> /* tag */, const Vec512<T> hi,
const Vec512<T> lo) {
return Vec512<T>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_BADC)};
return Vec512<T>{_mm512_shuffle_i32x4(lo.raw, hi.raw, 0x4E)};
}
HWY_API Vec512<float> ConcatLowerUpper(Full512<float> /* tag */,
const Vec512<float> hi,
const Vec512<float> lo) {
return Vec512<float>{_mm512_shuffle_f32x4(lo.raw, hi.raw, _MM_PERM_BADC)};
return Vec512<float>{_mm512_shuffle_f32x4(lo.raw, hi.raw, 0x4E)};
}
HWY_API Vec512<double> ConcatLowerUpper(Full512<double> /* tag */,
const Vec512<double> hi,
const Vec512<double> lo) {
return Vec512<double>{_mm512_shuffle_f64x2(lo.raw, hi.raw, _MM_PERM_BADC)};
return Vec512<double>{_mm512_shuffle_f64x2(lo.raw, hi.raw, 0x4E)};
}
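// Note (sketch): the immediate 0x4E equals _MM_SHUFFLE(1, 0, 3, 2) and
// _MM_PERM_BADC; it selects blocks lo.2, lo.3, hi.0, hi.1 (low to high),
// i.e. the upper half of lo below the lower half of hi.
// static_assert(_MM_SHUFFLE(1, 0, 3, 2) == 0x4E, "half-swap imm8");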
// hiH,hiL loH,loL |-> hiH,loL (= outer halves)
@ -2774,36 +2675,6 @@ HWY_API Vec512<double> ConcatEven(Full512<double> d, Vec512<double> hi,
__mmask8{0xFF}, hi.raw)};
}
// ------------------------------ DupEven (InterleaveLower)
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec512<T> DupEven(Vec512<T> v) {
return Vec512<T>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CCAA)};
}
HWY_API Vec512<float> DupEven(Vec512<float> v) {
return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_CCAA)};
}
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec512<T> DupEven(const Vec512<T> v) {
return InterleaveLower(Full512<T>(), v, v);
}
// ------------------------------ DupOdd (InterleaveUpper)
template <typename T, HWY_IF_LANE_SIZE(T, 4)>
HWY_API Vec512<T> DupOdd(Vec512<T> v) {
return Vec512<T>{_mm512_shuffle_epi32(v.raw, _MM_PERM_DDBB)};
}
HWY_API Vec512<float> DupOdd(Vec512<float> v) {
return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_DDBB)};
}
template <typename T, HWY_IF_LANE_SIZE(T, 8)>
HWY_API Vec512<T> DupOdd(const Vec512<T> v) {
return InterleaveUpper(Full512<T>(), v, v);
}
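// Lane model (illustrative, lane 0 rightmost):
// DupEven([3 2 1 0]) == [2 2 0 0]  (lane 2i copied to lanes 2i and 2i+1)
// DupOdd ([3 2 1 0]) == [3 3 1 1]  (lane 2i+1 copied to both)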
// ------------------------------ OddEven
template <typename T>
@ -2834,29 +2705,17 @@ HWY_API Vec512<double> OddEvenBlocks(Vec512<double> odd, Vec512<double> even) {
template <typename T>
HWY_API Vec512<T> SwapAdjacentBlocks(Vec512<T> v) {
return Vec512<T>{_mm512_shuffle_i32x4(v.raw, v.raw, _MM_PERM_CDAB)};
return Vec512<T>{_mm512_shuffle_i32x4(v.raw, v.raw, _MM_SHUFFLE(2, 3, 0, 1))};
}
HWY_API Vec512<float> SwapAdjacentBlocks(Vec512<float> v) {
return Vec512<float>{_mm512_shuffle_f32x4(v.raw, v.raw, _MM_PERM_CDAB)};
return Vec512<float>{
_mm512_shuffle_f32x4(v.raw, v.raw, _MM_SHUFFLE(2, 3, 0, 1))};
}
HWY_API Vec512<double> SwapAdjacentBlocks(Vec512<double> v) {
return Vec512<double>{_mm512_shuffle_f64x2(v.raw, v.raw, _MM_PERM_CDAB)};
}
// ------------------------------ ReverseBlocks
template <typename T>
HWY_API Vec512<T> ReverseBlocks(Full512<T> /* tag */, Vec512<T> v) {
return Vec512<T>{_mm512_shuffle_i32x4(v.raw, v.raw, _MM_PERM_ABCD)};
}
HWY_API Vec512<float> ReverseBlocks(Full512<float> /* tag */, Vec512<float> v) {
return Vec512<float>{_mm512_shuffle_f32x4(v.raw, v.raw, _MM_PERM_ABCD)};
}
HWY_API Vec512<double> ReverseBlocks(Full512<double> /* tag */,
Vec512<double> v) {
return Vec512<double>{_mm512_shuffle_f64x2(v.raw, v.raw, _MM_PERM_ABCD)};
return Vec512<double>{
_mm512_shuffle_f64x2(v.raw, v.raw, _MM_SHUFFLE(2, 3, 0, 1))};
}
// ------------------------------ TableLookupBytes (ZeroExtendVector)
@ -3153,23 +3012,17 @@ HWY_API Vec512<uint8_t> AESRound(Vec512<uint8_t> state,
#if HWY_TARGET == HWY_AVX3_DL
return Vec512<uint8_t>{_mm512_aesenc_epi128(state.raw, round_key.raw)};
#else
alignas(64) uint8_t a[64];
alignas(64) uint8_t b[64];
const Full512<uint8_t> d;
const Half<decltype(d)> d2;
return Combine(d, AESRound(UpperHalf(d2, state), UpperHalf(d2, round_key)),
AESRound(LowerHalf(state), LowerHalf(round_key)));
#endif
}
HWY_API Vec512<uint8_t> AESLastRound(Vec512<uint8_t> state,
Vec512<uint8_t> round_key) {
#if HWY_TARGET == HWY_AVX3_DL
return Vec512<uint8_t>{_mm512_aesenclast_epi128(state.raw, round_key.raw)};
#else
const Full512<uint8_t> d;
const Half<decltype(d)> d2;
return Combine(d,
AESLastRound(UpperHalf(d2, state), UpperHalf(d2, round_key)),
AESLastRound(LowerHalf(state), LowerHalf(round_key)));
const Full128<uint8_t> d128;
Store(state, d, a);
Store(round_key, d, b);
for (size_t i = 0; i < 64; i += 16) {
const auto enc = AESRound(Load(d128, a + i), Load(d128, b + i));
Store(enc, d128, a + i);
}
return Load(d, a);
#endif
}
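// Both fallbacks above are instances of the same pattern: without 512-bit
// VAES, apply the 128-bit AES primitive to each 16-byte block, either via
// aligned Store/Load round-trips or via UpperHalf/LowerHalf + Combine.
// The two formulations are interchangeable; results agree per block.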
@ -3411,8 +3264,8 @@ HWY_API Vec512<T> Compress(Vec512<T> v, const Mask512<T> mask) {
const auto compressed0 = Compress(promoted0, mask0);
const auto compressed1 = Compress(promoted1, mask1);
const auto demoted0 = ZeroExtendVector(du, DemoteTo(duh, compressed0));
const auto demoted1 = ZeroExtendVector(du, DemoteTo(duh, compressed1));
const auto demoted0 = ZeroExtendVector(DemoteTo(duh, compressed0));
const auto demoted1 = ZeroExtendVector(DemoteTo(duh, compressed1));
// Concatenate into single vector by shifting upper with writemask.
const size_t num0 = CountTrue(dw, mask0);
@ -3510,7 +3363,7 @@ HWY_API size_t CompressBlendedStore(Vec512<T> v, Mask512<T> m, Full512<T> d,
if (HWY_TARGET == HWY_AVX3_DL || sizeof(T) != 2) {
return CompressStore(v, m, d, unaligned);
} else {
const size_t count = CountTrue(d, m);
const size_t count = CountTrue(m);
const Vec512<T> compressed = Compress(v, m);
const Vec512<T> prev = LoadU(d, unaligned);
StoreU(IfThenElse(FirstN(d, count), compressed, prev), d, unaligned);
@ -3569,9 +3422,9 @@ HWY_API void StoreInterleaved3(const Vec512<uint8_t> a, const Vec512<uint8_t> b,
const auto k = (r2 | g2 | b2).raw; // low byte in each 128bit: 3A 2A 1A 0A
// To obtain 10 0A 05 00 in one vector, transpose "rows" into "columns".
const auto k3_k0_i3_i0 = _mm512_shuffle_i64x2(i, k, _MM_PERM_DADA);
const auto i1_i2_j0_j1 = _mm512_shuffle_i64x2(j, i, _MM_PERM_BCAB);
const auto j2_j3_k1_k2 = _mm512_shuffle_i64x2(k, j, _MM_PERM_CDBC);
const auto k3_k0_i3_i0 = _mm512_shuffle_i64x2(i, k, _MM_SHUFFLE(3, 0, 3, 0));
const auto i1_i2_j0_j1 = _mm512_shuffle_i64x2(j, i, _MM_SHUFFLE(1, 2, 0, 1));
const auto j2_j3_k1_k2 = _mm512_shuffle_i64x2(k, j, _MM_SHUFFLE(2, 3, 1, 2));
// Alternating order, most-significant 128 bits from the second arg.
const __mmask8 m = 0xCC;
@ -3603,12 +3456,12 @@ HWY_API void StoreInterleaved4(const Vec512<uint8_t> v0,
const auto k = ZipLower(d32, ba8, dc8).raw; // 4x128bit: d..aB d..a8
const auto l = ZipUpper(d32, ba8, dc8).raw; // 4x128bit: d..aF d..aC
// 128-bit blocks were independent until now; transpose 4x4.
const auto j1_j0_i1_i0 = _mm512_shuffle_i64x2(i, j, _MM_PERM_BABA);
const auto l1_l0_k1_k0 = _mm512_shuffle_i64x2(k, l, _MM_PERM_BABA);
const auto j3_j2_i3_i2 = _mm512_shuffle_i64x2(i, j, _MM_PERM_DCDC);
const auto l3_l2_k3_k2 = _mm512_shuffle_i64x2(k, l, _MM_PERM_DCDC);
constexpr _MM_PERM_ENUM k20 = _MM_PERM_CACA;
constexpr _MM_PERM_ENUM k31 = _MM_PERM_DBDB;
const auto j1_j0_i1_i0 = _mm512_shuffle_i64x2(i, j, _MM_SHUFFLE(1, 0, 1, 0));
const auto l1_l0_k1_k0 = _mm512_shuffle_i64x2(k, l, _MM_SHUFFLE(1, 0, 1, 0));
const auto j3_j2_i3_i2 = _mm512_shuffle_i64x2(i, j, _MM_SHUFFLE(3, 2, 3, 2));
const auto l3_l2_k3_k2 = _mm512_shuffle_i64x2(k, l, _MM_SHUFFLE(3, 2, 3, 2));
constexpr int k20 = _MM_SHUFFLE(2, 0, 2, 0);
constexpr int k31 = _MM_SHUFFLE(3, 1, 3, 1);
const auto l0_k0_j0_i0 = _mm512_shuffle_i64x2(j1_j0_i1_i0, l1_l0_k1_k0, k20);
const auto l1_k1_j1_i1 = _mm512_shuffle_i64x2(j1_j0_i1_i0, l1_l0_k1_k0, k31);
const auto l2_k2_j2_i2 = _mm512_shuffle_i64x2(j3_j2_i3_i2, l3_l2_k3_k2, k20);
@ -3778,6 +3631,103 @@ HWY_API Vec512<T> MaxOfLanes(Full512<T> d, Vec512<T> v) {
return BitCast(d, Or(min, ShiftLeft<16>(min)));
}
// ================================================== DEPRECATED
template <typename T>
HWY_API size_t StoreMaskBits(const Mask512<T> mask, uint8_t* bits) {
return StoreMaskBits(Full512<T>(), mask, bits);
}
template <typename T>
HWY_API bool AllTrue(const Mask512<T> mask) {
return AllTrue(Full512<T>(), mask);
}
template <typename T>
HWY_API bool AllFalse(const Mask512<T> mask) {
return AllFalse(Full512<T>(), mask);
}
template <typename T>
HWY_API size_t CountTrue(const Mask512<T> mask) {
return CountTrue(Full512<T>(), mask);
}
template <typename T>
HWY_API Vec512<T> SumOfLanes(Vec512<T> v) {
return SumOfLanes(Full512<T>(), v);
}
template <typename T>
HWY_API Vec512<T> MinOfLanes(Vec512<T> v) {
return MinOfLanes(Full512<T>(), v);
}
template <typename T>
HWY_API Vec512<T> MaxOfLanes(Vec512<T> v) {
return MaxOfLanes(Full512<T>(), v);
}
template <typename T>
HWY_API Vec256<T> UpperHalf(Vec512<T> v) {
return UpperHalf(Full256<T>(), v);
}
template <int kBytes, typename T>
HWY_API Vec512<T> ShiftRightBytes(const Vec512<T> v) {
return ShiftRightBytes<kBytes>(Full512<T>(), v);
}
template <int kLanes, typename T>
HWY_API Vec512<T> ShiftRightLanes(const Vec512<T> v) {
return ShiftRightLanes<kLanes>(Full512<T>(), v);
}
template <size_t kBytes, typename T>
HWY_API Vec512<T> CombineShiftRightBytes(Vec512<T> hi, Vec512<T> lo) {
return CombineShiftRightBytes<kBytes>(Full512<T>(), hi, lo);
}
template <typename T>
HWY_API Vec512<T> InterleaveUpper(Vec512<T> a, Vec512<T> b) {
return InterleaveUpper(Full512<T>(), a, b);
}
template <typename T>
HWY_API Vec512<MakeWide<T>> ZipUpper(Vec512<T> a, Vec512<T> b) {
return BitCast(Full512<MakeWide<T>>(), InterleaveUpper(Full512<T>(), a, b));
}
template <typename T>
HWY_API Vec512<T> Combine(Vec256<T> hi, Vec256<T> lo) {
return Combine(Full512<T>(), hi, lo);
}
template <typename T>
HWY_API Vec512<T> ZeroExtendVector(Vec256<T> lo) {
return ZeroExtendVector(Full512<T>(), lo);
}
template <typename T>
HWY_API Vec512<T> ConcatLowerLower(Vec512<T> hi, Vec512<T> lo) {
return ConcatLowerLower(Full512<T>(), hi, lo);
}
template <typename T>
HWY_API Vec512<T> ConcatLowerUpper(Vec512<T> hi, Vec512<T> lo) {
return ConcatLowerUpper(Full512<T>(), hi, lo);
}
template <typename T>
HWY_API Vec512<T> ConcatUpperLower(Vec512<T> hi, Vec512<T> lo) {
return ConcatUpperLower(Full512<T>(), hi, lo);
}
template <typename T>
HWY_API Vec512<T> ConcatUpperUpper(Vec512<T> hi, Vec512<T> lo) {
return ConcatUpperUpper(Full512<T>(), hi, lo);
}
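// Migration sketch (illustrative): these deprecated wrappers only forward
// to the tag-taking overloads, so new code should pass the tag explicitly:
// const Full512<int32_t> d;
// const size_t n = CountTrue(d, mask);  // preferred
// const size_t n_old = CountTrue(mask); // deprecated wrapper, same result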
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy

third_party/highway/hwy/targets.cc vendored

@ -15,25 +15,23 @@
#include "hwy/targets.h"
#include <stdarg.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <atomic>
#include <cstddef>
#include <limits>
#include "hwy/base.h"
#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
defined(THREAD_SANITIZER)
#include "sanitizer/common_interface_defs.h" // __sanitizer_print_stack_trace
#endif
#include <stdlib.h> // abort / exit
#endif // defined(*_SANITIZER)
#if HWY_ARCH_X86
#include <xmmintrin.h>
#if HWY_COMPILER_MSVC
#include <intrin.h>
#else // !HWY_COMPILER_MSVC
#else // HWY_COMPILER_MSVC
#include <cpuid.h>
#endif // HWY_COMPILER_MSVC
#endif // HWY_ARCH_X86
@ -95,7 +93,7 @@ std::atomic<uint32_t> supported_{0}; // Not yet initialized
uint32_t supported_targets_for_test_ = 0;
// Mask of targets disabled at runtime with DisableTargets.
uint32_t supported_mask_{LimitsMax<uint32_t>()};
uint32_t supported_mask_{std::numeric_limits<uint32_t>::max()};
#if HWY_ARCH_X86
// Arbitrary bit indices indicating which instruction set extensions are
@ -192,22 +190,21 @@ HWY_NORETURN void HWY_FORMAT(3, 4)
va_end(args);
fprintf(stderr, "Abort at %s:%d: %s\n", file, line, buf);
// If compiled with any sanitizer, they can also print a stack trace.
#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
defined(THREAD_SANITIZER)
// If compiled with any sanitizer, print a stack trace. This call doesn't
// crash the program; instead, the trap below will crash it while still
// allowing gdb to break there.
__sanitizer_print_stack_trace();
#endif // HWY_IS_*
#endif // defined(*_SANITIZER)
fflush(stderr);
// Now terminate the program:
#if HWY_ARCH_RVV
exit(1); // trap/abort just freeze Spike.
#elif HWY_IS_DEBUG_BUILD && !HWY_COMPILER_MSVC
// Facilitates breaking into a debugger, but don't use this in non-debug
// builds because it looks like "illegal instruction", which is misleading.
__builtin_trap();
#else
#if HWY_COMPILER_MSVC
abort(); // Compile error without this due to HWY_NORETURN.
#elif HWY_ARCH_RVV
exit(1); // trap/abort just freeze Spike
#else
__builtin_trap();
#endif
}
@ -216,7 +213,7 @@ void DisableTargets(uint32_t disabled_targets) {
// We can call Update() here to initialize the mask but that will trigger a
// call to SupportedTargets() which we use in tests to tell whether any of the
// highway dynamic dispatch functions were used.
GetChosenTarget().DeInit();
chosen_target.DeInit();
}
void SetSupportedTargetsForTest(uint32_t targets) {
@ -225,7 +222,7 @@ void SetSupportedTargetsForTest(uint32_t targets) {
// if not zero.
supported_.store(0, std::memory_order_release);
supported_targets_for_test_ = targets;
GetChosenTarget().DeInit();
chosen_target.DeInit();
}
bool SupportedTargetsCalledForTest() {
@ -347,10 +344,8 @@ uint32_t SupportedTargets() {
return bits & supported_mask_;
}
HWY_DLLEXPORT ChosenTarget& GetChosenTarget() {
static ChosenTarget chosen_target;
return chosen_target;
}
// Declared in targets.h
ChosenTarget chosen_target;
void ChosenTarget::Update() {
// The supported variable contains the current CPU supported targets shifted

third_party/highway/hwy/targets.h vendored

@ -22,7 +22,6 @@
#include "hwy/base.h"
#include "hwy/detect_targets.h"
#include "hwy/highway_export.h"
namespace hwy {
@ -30,7 +29,7 @@ namespace hwy {
// Implemented in targets.cc; unconditionally compiled to support the use case
// of binary-only distributions. The HWY_SUPPORTED_TARGETS wrapper may allow
// eliding calls to this function.
HWY_DLLEXPORT uint32_t SupportedTargets();
uint32_t SupportedTargets();
// Evaluates to a function call, or literal if there is a single target.
#if (HWY_TARGETS & (HWY_TARGETS - 1)) == 0
@ -45,7 +44,7 @@ HWY_DLLEXPORT uint32_t SupportedTargets();
// lower target is desired. For this reason, attempts to disable targets which
// are in HWY_ENABLED_BASELINE have no effect so SupportedTargets() always
// returns at least the baseline target.
HWY_DLLEXPORT void DisableTargets(uint32_t disabled_targets);
void DisableTargets(uint32_t disabled_targets);
// Set the mock mask of CPU supported targets instead of the actual CPU
// supported targets computed in SupportedTargets(). The return value of
@ -53,11 +52,11 @@ HWY_DLLEXPORT void DisableTargets(uint32_t disabled_targets);
// regardless of this mock, to prevent accidentally adding targets that are
// known to be buggy in the current CPU. Call with a mask of 0 to disable the
// mock and use the actual CPU supported targets instead.
HWY_DLLEXPORT void SetSupportedTargetsForTest(uint32_t targets);
void SetSupportedTargetsForTest(uint32_t targets);
// Returns whether the SupportedTargets() function was called since the last
// SetSupportedTargetsForTest() call.
HWY_DLLEXPORT bool SupportedTargetsCalledForTest();
bool SupportedTargetsCalledForTest();
// Return the list of targets in HWY_TARGETS supported by the CPU as a list of
// individual HWY_* target macros such as HWY_SCALAR or HWY_NEON. This list
@ -226,7 +225,7 @@ struct ChosenTarget {
public:
// Update the ChosenTarget mask based on the current CPU supported
// targets.
HWY_DLLEXPORT void Update();
void Update();
// Reset the ChosenTarget to the uninitialized state.
void DeInit() { mask_.store(1); }
@ -246,12 +245,11 @@ struct ChosenTarget {
}
private:
// Initialized to 1 so GetIndex() returns 0.
// Initialized to 1 so GetChosenTargetIndex() returns 0.
std::atomic<uint32_t> mask_{1};
};
// For internal use (e.g. by FunctionCache and DisableTargets).
HWY_DLLEXPORT ChosenTarget& GetChosenTarget();
extern ChosenTarget chosen_target;
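// Dispatch sketch (illustrative): mask_ starts at 1, so the index getter
// returns 0, and entry 0 of each per-function dispatch table is an
// initializer stub that calls Update() and re-dispatches to the best
// supported target; DeInit() restores that initial state.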
} // namespace hwy

third_party/highway/hwy/targets_test.cc vendored

@ -44,11 +44,11 @@ void CheckFakeFunction() {
hwy::SetSupportedTargetsForTest(HWY_##TGT); \
/* Calling Update() first to make &HWY_DYNAMIC_DISPATCH() return */ \
/* the pointer to the already cached function. */ \
hwy::GetChosenTarget().Update(); \
hwy::chosen_target.Update(); \
EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \
/* Calling DeInit() will test that the initializer function */ \
/* also calls the right function. */ \
hwy::GetChosenTarget().DeInit(); \
hwy::chosen_target.DeInit(); \
EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \
/* Second call uses the cached value from the previous call. */ \
EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \


@ -177,6 +177,387 @@ HWY_NOINLINE void TestAllAbs() {
ForFloatTypes(ForPartialVectors<TestFloatAbs>());
}
template <bool kSigned>
struct TestLeftShifts {
template <typename T, class D>
HWY_NOINLINE void operator()(T t, D d) {
if (kSigned) {
// Also test positive values
TestLeftShifts</*kSigned=*/false>()(t, d);
}
using TI = MakeSigned<T>;
using TU = MakeUnsigned<T>;
const size_t N = Lanes(d);
auto expected = AllocateAligned<T>(N);
const auto values = Iota(d, kSigned ? -TI(N) : TI(0)); // value to shift
constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
// 0
HWY_ASSERT_VEC_EQ(d, values, ShiftLeft<0>(values));
HWY_ASSERT_VEC_EQ(d, values, ShiftLeftSame(values, 0));
// 1
for (size_t i = 0; i < N; ++i) {
const T value = kSigned ? T(T(i) - T(N)) : T(i);
expected[i] = T(TU(value) << 1);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<1>(values));
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, 1));
// max
for (size_t i = 0; i < N; ++i) {
const T value = kSigned ? T(T(i) - T(N)) : T(i);
expected[i] = T(TU(value) << kMaxShift);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<kMaxShift>(values));
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, kMaxShift));
}
};
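// The TU cast above sidesteps undefined behavior when left-shifting
// negative values: e.g. int8_t(-3) becomes uint8_t 0xFD, 0xFD << 1 is
// 0xFA (mod 256), which converts back to -6 == -3 * 2.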
template <bool kSigned>
struct TestVariableLeftShifts {
template <typename T, class D>
HWY_NOINLINE void operator()(T t, D d) {
if (kSigned) {
// Also test positive values
TestVariableLeftShifts</*kSigned=*/false>()(t, d);
}
using TI = MakeSigned<T>;
using TU = MakeUnsigned<T>;
const size_t N = Lanes(d);
auto expected = AllocateAligned<T>(N);
const auto v0 = Zero(d);
const auto v1 = Set(d, 1);
const auto values = Iota(d, kSigned ? -TI(N) : TI(0)); // value to shift
constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
const auto max_shift = Set(d, kMaxShift);
const auto small_shifts = And(Iota(d, 0), max_shift);
const auto large_shifts = max_shift - small_shifts;
// Same: 0
HWY_ASSERT_VEC_EQ(d, values, Shl(values, v0));
// Same: 1
for (size_t i = 0; i < N; ++i) {
const T value = kSigned ? T(i) - T(N) : T(i);
expected[i] = T(TU(value) << 1);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, v1));
// Same: max
for (size_t i = 0; i < N; ++i) {
const T value = kSigned ? T(i) - T(N) : T(i);
expected[i] = T(TU(value) << kMaxShift);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, max_shift));
// Variable: small
for (size_t i = 0; i < N; ++i) {
const T value = kSigned ? T(i) - T(N) : T(i);
expected[i] = T(TU(value) << (i & kMaxShift));
}
HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, small_shifts));
// Variable: large
for (size_t i = 0; i < N; ++i) {
expected[i] = T(TU(1) << (kMaxShift - (i & kMaxShift)));
}
HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(v1, large_shifts));
}
};
struct TestUnsignedRightShifts {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const size_t N = Lanes(d);
auto expected = AllocateAligned<T>(N);
const auto values = Iota(d, 0);
const T kMax = LimitsMax<T>();
constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
// Shift by 0
HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values));
HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0));
// Shift by 1
for (size_t i = 0; i < N; ++i) {
expected[i] = T(T(i & kMax) >> 1);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values));
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1));
// max
for (size_t i = 0; i < N; ++i) {
expected[i] = T(T(i & kMax) >> kMaxShift);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<kMaxShift>(values));
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, kMaxShift));
}
};
struct TestRotateRight {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const size_t N = Lanes(d);
auto expected = AllocateAligned<T>(N);
constexpr size_t kBits = sizeof(T) * 8;
const auto mask_shift = Set(d, T{kBits});
// Cover as many bit positions as possible to test shifting out
const auto values = Shl(Set(d, T{1}), And(Iota(d, 0), mask_shift));
// Rotate by 0
HWY_ASSERT_VEC_EQ(d, values, RotateRight<0>(values));
// Rotate by 1
Store(values, d, expected.get());
for (size_t i = 0; i < N; ++i) {
expected[i] = (expected[i] >> 1) | (expected[i] << (kBits - 1));
}
HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<1>(values));
// Rotate by half
Store(values, d, expected.get());
for (size_t i = 0; i < N; ++i) {
expected[i] = (expected[i] >> (kBits / 2)) | (expected[i] << (kBits / 2));
}
HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<kBits / 2>(values));
// Rotate by max
Store(values, d, expected.get());
for (size_t i = 0; i < N; ++i) {
expected[i] = (expected[i] >> (kBits - 1)) | (expected[i] << 1);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<kBits - 1>(values));
}
};
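// The scalar model above is the usual rotate identity,
// rotr(x, k) == (x >> k) | (x << (kBits - k)), valid for 0 < k < kBits;
// k == 0 is checked separately because x << kBits would be undefined.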
struct TestVariableUnsignedRightShifts {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const size_t N = Lanes(d);
auto expected = AllocateAligned<T>(N);
const auto v0 = Zero(d);
const auto v1 = Set(d, 1);
const auto values = Iota(d, 0);
const T kMax = LimitsMax<T>();
const auto max = Set(d, kMax);
constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
const auto max_shift = Set(d, kMaxShift);
const auto small_shifts = And(Iota(d, 0), max_shift);
const auto large_shifts = max_shift - small_shifts;
// Same: 0
HWY_ASSERT_VEC_EQ(d, values, Shr(values, v0));
// Same: 1
for (size_t i = 0; i < N; ++i) {
expected[i] = T(T(i & kMax) >> 1);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(values, v1));
// Same: max
HWY_ASSERT_VEC_EQ(d, v0, Shr(values, max_shift));
// Variable: small
for (size_t i = 0; i < N; ++i) {
expected[i] = T(i) >> (i & kMaxShift);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(values, small_shifts));
// Variable: Large
for (size_t i = 0; i < N; ++i) {
expected[i] = kMax >> (kMaxShift - (i & kMaxShift));
}
HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(max, large_shifts));
}
};
template <int kAmount, typename T>
T RightShiftNegative(T val) {
// C++ shifts are implementation-defined for negative numbers, and we have
// seen divisions replaced with shifts, so resort to bit operations.
using TU = hwy::MakeUnsigned<T>;
TU bits;
CopyBytes<sizeof(T)>(&val, &bits);
const TU shifted = TU(bits >> kAmount);
const TU all = TU(~TU(0));
const size_t num_zero = sizeof(TU) * 8 - 1 - kAmount;
const TU sign_extended = static_cast<TU>((all << num_zero) & LimitsMax<TU>());
bits = shifted | sign_extended;
CopyBytes<sizeof(T)>(&bits, &val);
return val;
}
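// Worked example (int8_t, kAmount = 2): val = -128 has bits 0x80, so
// shifted = 0x20, num_zero = 5, sign_extended = 0xE0, and the result is
// 0x20 | 0xE0 = 0xE0 == -32, matching an arithmetic -128 >> 2.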
class TestSignedRightShifts {
public:
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const size_t N = Lanes(d);
auto expected = AllocateAligned<T>(N);
constexpr T kMin = LimitsMin<T>();
constexpr T kMax = LimitsMax<T>();
constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
// First test positive values, negative are checked below.
const auto v0 = Zero(d);
const auto values = And(Iota(d, 0), Set(d, kMax));
// Shift by 0
HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values));
HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0));
// Shift by 1
for (size_t i = 0; i < N; ++i) {
expected[i] = T(T(i & kMax) >> 1);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values));
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1));
// max
HWY_ASSERT_VEC_EQ(d, v0, ShiftRight<kMaxShift>(values));
HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(values, kMaxShift));
// Even negative value
Test<0>(kMin, d, __LINE__);
Test<1>(kMin, d, __LINE__);
Test<2>(kMin, d, __LINE__);
Test<kMaxShift>(kMin, d, __LINE__);
const T odd = static_cast<T>(kMin + 1);
Test<0>(odd, d, __LINE__);
Test<1>(odd, d, __LINE__);
Test<2>(odd, d, __LINE__);
Test<kMaxShift>(odd, d, __LINE__);
}
private:
template <int kAmount, typename T, class D>
void Test(T val, D d, int line) {
const auto expected = Set(d, RightShiftNegative<kAmount>(val));
const auto in = Set(d, val);
const char* file = __FILE__;
AssertVecEqual(d, expected, ShiftRight<kAmount>(in), file, line);
AssertVecEqual(d, expected, ShiftRightSame(in, kAmount), file, line);
}
};
struct TestVariableSignedRightShifts {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
using TU = MakeUnsigned<T>;
const size_t N = Lanes(d);
auto expected = AllocateAligned<T>(N);
constexpr T kMin = LimitsMin<T>();
constexpr T kMax = LimitsMax<T>();
constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
// First test positive values, negative are checked below.
const auto v0 = Zero(d);
const auto positive = Iota(d, 0) & Set(d, kMax);
// Shift by 0
HWY_ASSERT_VEC_EQ(d, positive, ShiftRight<0>(positive));
HWY_ASSERT_VEC_EQ(d, positive, ShiftRightSame(positive, 0));
// Shift by 1
for (size_t i = 0; i < N; ++i) {
expected[i] = T(T(i & kMax) >> 1);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(positive));
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(positive, 1));
// max
HWY_ASSERT_VEC_EQ(d, v0, ShiftRight<kMaxShift>(positive));
HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(positive, kMaxShift));
const auto max_shift = Set(d, kMaxShift);
const auto small_shifts = And(Iota(d, 0), max_shift);
const auto large_shifts = max_shift - small_shifts;
const auto negative = Iota(d, kMin);
// Test varying negative to shift
for (size_t i = 0; i < N; ++i) {
expected[i] = RightShiftNegative<1>(static_cast<T>(kMin + i));
}
HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(negative, Set(d, 1)));
// Shift MSB right by small amounts
for (size_t i = 0; i < N; ++i) {
const size_t amount = i & kMaxShift;
const TU shifted = ~((1ull << (kMaxShift - amount)) - 1);
CopyBytes<sizeof(T)>(&shifted, &expected[i]);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(Set(d, kMin), small_shifts));
// Shift MSB right by large amounts
for (size_t i = 0; i < N; ++i) {
const size_t amount = kMaxShift - (i & kMaxShift);
const TU shifted = ~((1ull << (kMaxShift - amount)) - 1);
CopyBytes<sizeof(T)>(&shifted, &expected[i]);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(Set(d, kMin), large_shifts));
}
};
HWY_NOINLINE void TestAllShifts() {
ForUnsignedTypes(ForPartialVectors<TestLeftShifts</*kSigned=*/false>>());
ForSignedTypes(ForPartialVectors<TestLeftShifts</*kSigned=*/true>>());
ForUnsignedTypes(ForPartialVectors<TestUnsignedRightShifts>());
ForSignedTypes(ForPartialVectors<TestSignedRightShifts>());
}
HWY_NOINLINE void TestAllVariableShifts() {
const ForPartialVectors<TestLeftShifts</*kSigned=*/false>> shl_u;
const ForPartialVectors<TestLeftShifts</*kSigned=*/true>> shl_s;
const ForPartialVectors<TestUnsignedRightShifts> shr_u;
const ForPartialVectors<TestSignedRightShifts> shr_s;
shl_u(uint16_t());
shr_u(uint16_t());
shl_u(uint32_t());
shr_u(uint32_t());
shl_s(int16_t());
shr_s(int16_t());
shl_s(int32_t());
shr_s(int32_t());
#if HWY_CAP_INTEGER64
shl_u(uint64_t());
shr_u(uint64_t());
shl_s(int64_t());
shr_s(int64_t());
#endif
}
HWY_NOINLINE void TestAllRotateRight() {
const ForPartialVectors<TestRotateRight> test;
test(uint32_t());
#if HWY_CAP_INTEGER64
test(uint64_t());
#endif
}
struct TestUnsignedMinMax {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
@ -263,84 +644,6 @@ HWY_NOINLINE void TestAllMinMax() {
ForFloatTypes(ForPartialVectors<TestFloatMinMax>());
}
class TestMinMax128 {
template <class D>
static HWY_NOINLINE Vec<D> Make128(D d, uint64_t hi, uint64_t lo) {
alignas(16) uint64_t in[2];
in[0] = lo;
in[1] = hi;
return LoadDup128(d, in);
}
public:
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
using V = Vec<D>;
const size_t N = Lanes(d);
auto a_lanes = AllocateAligned<T>(N);
auto b_lanes = AllocateAligned<T>(N);
auto min_lanes = AllocateAligned<T>(N);
auto max_lanes = AllocateAligned<T>(N);
RandomState rng;
const V v00 = Zero(d);
const V v01 = Make128(d, 0, 1);
const V v10 = Make128(d, 1, 0);
const V v11 = Add(v01, v10);
// Same arg
HWY_ASSERT_VEC_EQ(d, v00, Min128(d, v00, v00));
HWY_ASSERT_VEC_EQ(d, v01, Min128(d, v01, v01));
HWY_ASSERT_VEC_EQ(d, v10, Min128(d, v10, v10));
HWY_ASSERT_VEC_EQ(d, v11, Min128(d, v11, v11));
HWY_ASSERT_VEC_EQ(d, v00, Max128(d, v00, v00));
HWY_ASSERT_VEC_EQ(d, v01, Max128(d, v01, v01));
HWY_ASSERT_VEC_EQ(d, v10, Max128(d, v10, v10));
HWY_ASSERT_VEC_EQ(d, v11, Max128(d, v11, v11));
// First arg less
HWY_ASSERT_VEC_EQ(d, v00, Min128(d, v00, v01));
HWY_ASSERT_VEC_EQ(d, v01, Min128(d, v01, v10));
HWY_ASSERT_VEC_EQ(d, v10, Min128(d, v10, v11));
HWY_ASSERT_VEC_EQ(d, v01, Max128(d, v00, v01));
HWY_ASSERT_VEC_EQ(d, v10, Max128(d, v01, v10));
HWY_ASSERT_VEC_EQ(d, v11, Max128(d, v10, v11));
// Second arg less
HWY_ASSERT_VEC_EQ(d, v00, Min128(d, v01, v00));
HWY_ASSERT_VEC_EQ(d, v01, Min128(d, v10, v01));
HWY_ASSERT_VEC_EQ(d, v10, Min128(d, v11, v10));
HWY_ASSERT_VEC_EQ(d, v01, Max128(d, v01, v00));
HWY_ASSERT_VEC_EQ(d, v10, Max128(d, v10, v01));
HWY_ASSERT_VEC_EQ(d, v11, Max128(d, v11, v10));
// Also check 128-bit blocks are independent
for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
for (size_t i = 0; i < N; ++i) {
a_lanes[i] = Random64(&rng);
b_lanes[i] = Random64(&rng);
}
const V a = Load(d, a_lanes.get());
const V b = Load(d, b_lanes.get());
for (size_t i = 0; i < N; i += 2) {
const bool lt = a_lanes[i + 1] == b_lanes[i + 1]
? (a_lanes[i] < b_lanes[i])
: (a_lanes[i + 1] < b_lanes[i + 1]);
min_lanes[i + 0] = lt ? a_lanes[i + 0] : b_lanes[i + 0];
min_lanes[i + 1] = lt ? a_lanes[i + 1] : b_lanes[i + 1];
max_lanes[i + 0] = lt ? b_lanes[i + 0] : a_lanes[i + 0];
max_lanes[i + 1] = lt ? b_lanes[i + 1] : a_lanes[i + 1];
}
HWY_ASSERT_VEC_EQ(d, min_lanes.get(), Min128(d, a, b));
HWY_ASSERT_VEC_EQ(d, max_lanes.get(), Max128(d, a, b));
}
}
};
HWY_NOINLINE void TestAllMinMax128() {
ForGEVectors<128, TestMinMax128>()(uint64_t());
}
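// The scalar model in the loop above pins down the 128-bit order used by
// Min128/Max128: compare the upper u64 lane first, break ties with the
// lower lane, i.e. lexicographic over the (hi, lo) pair of each block.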
struct TestUnsignedMul {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
@ -531,11 +834,11 @@ struct TestMulEvenOdd64 {
};
HWY_NOINLINE void TestAllMulEven() {
ForGEVectors<64, TestMulEven> test;
ForExtendableVectors<TestMulEven> test;
test(int32_t());
test(uint32_t());
ForGEVectors<128, TestMulEvenOdd64>()(uint64_t());
ForGE128Vectors<TestMulEvenOdd64>()(uint64_t());
}
struct TestMulAdd {
@ -810,6 +1113,7 @@ AlignedFreeUniquePtr<T[]> RoundTestCases(T /*unused*/, D d, size_t& padded) {
// negative +/- epsilon
T(-1) + eps,
T(-1) - eps,
#if !defined(HWY_EMULATE_SVE) // these are not safe to just cast to int
// +/- huge (but still fits in float)
T(1E34),
T(-1E35),
@ -818,6 +1122,7 @@ AlignedFreeUniquePtr<T[]> RoundTestCases(T /*unused*/, D d, size_t& padded) {
-std::numeric_limits<T>::infinity(),
// qNaN
GetLane(NaN(d))
#endif
};
const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
const size_t N = Lanes(d);
@ -1064,41 +1369,6 @@ HWY_NOINLINE void TestAllAbsDiff() {
ForPartialVectors<TestAbsDiff>()(float());
}
struct TestSumsOf8 {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
RandomState rng;
const size_t N = Lanes(d);
if (N < 8) return;
const Repartition<uint64_t, D> du64;
auto in_lanes = AllocateAligned<T>(N);
auto sum_lanes = AllocateAligned<uint64_t>(N / 8);
for (size_t rep = 0; rep < 100; ++rep) {
for (size_t i = 0; i < N; ++i) {
in_lanes[i] = Random64(&rng) & 0xFF;
}
for (size_t idx_sum = 0; idx_sum < N / 8; ++idx_sum) {
uint64_t sum = 0;
for (size_t i = 0; i < 8; ++i) {
sum += in_lanes[idx_sum * 8 + i];
}
sum_lanes[idx_sum] = sum;
}
const Vec<D> in = Load(d, in_lanes.get());
HWY_ASSERT_VEC_EQ(du64, sum_lanes.get(), SumsOf8(in));
}
}
};
HWY_NOINLINE void TestAllSumsOf8() {
ForGEVectors<64, TestSumsOf8>()(uint8_t());
}
struct TestNeg {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
@ -1127,8 +1397,10 @@ namespace hwy {
HWY_BEFORE_TEST(HwyArithmeticTest);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllPlusMinus);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSaturatingArithmetic);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllShifts);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllVariableShifts);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllRotateRight);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax128);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAverage);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAbs);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMul);
@ -1148,7 +1420,6 @@ HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllTrunc);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllCeil);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllFloor);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAbsDiff);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSumsOf8);
HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllNeg);
} // namespace hwy


@ -393,7 +393,7 @@ HWY_NOINLINE void TestAllZip() {
lower_unsigned(uint8_t());
#endif
lower_unsigned(uint16_t());
#if HWY_HAVE_INTEGER64
#if HWY_CAP_INTEGER64
lower_unsigned(uint32_t()); // generates u64
#endif
@ -402,7 +402,7 @@ HWY_NOINLINE void TestAllZip() {
lower_signed(int8_t());
#endif
lower_signed(int16_t());
#if HWY_HAVE_INTEGER64
#if HWY_CAP_INTEGER64
lower_signed(int32_t()); // generates i64
#endif
@ -411,7 +411,7 @@ HWY_NOINLINE void TestAllZip() {
upper_unsigned(uint8_t());
#endif
upper_unsigned(uint16_t());
#if HWY_HAVE_INTEGER64
#if HWY_CAP_INTEGER64
upper_unsigned(uint32_t()); // generates u64
#endif
@ -420,20 +420,19 @@ HWY_NOINLINE void TestAllZip() {
upper_signed(int8_t());
#endif
upper_signed(int16_t());
#if HWY_HAVE_INTEGER64
#if HWY_CAP_INTEGER64
upper_signed(int32_t()); // generates i64
#endif
// No float - concatenating f32 does not result in an f64
}
template <int kBytes>
struct TestCombineShiftRightBytesR {
template <class T, class D>
HWY_NOINLINE void operator()(T t, D d) {
// Scalar does not define CombineShiftRightBytes.
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
template <int kBytes>
struct TestCombineShiftRightBytes {
template <class T, class D>
HWY_NOINLINE void operator()(T, D d) {
const size_t kBlockSize = 16;
static_assert(kBytes < kBlockSize, "Shift count is per block");
const Repartition<uint8_t, D> d8;
@ -462,13 +461,21 @@ struct TestCombineShiftRightBytes {
const auto expected = BitCast(d, Load(d8, expected_bytes.get()));
HWY_ASSERT_VEC_EQ(d, expected, CombineShiftRightBytes<kBytes>(d, hi, lo));
}
TestCombineShiftRightBytesR<kBytes - 1>()(t, d);
#else
(void)t;
(void)d;
#endif // #if HWY_TARGET != HWY_SCALAR
}
};
template <int kLanes>
struct TestCombineShiftRightLanes {
struct TestCombineShiftRightLanesR {
template <class T, class D>
HWY_NOINLINE void operator()(T, D d) {
HWY_NOINLINE void operator()(T t, D d) {
// Scalar does not define CombineShiftRightBytes (needed for *Lanes).
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
const Repartition<uint8_t, D> d8;
const size_t N8 = Lanes(d8);
if (N8 < 16) return;
@ -498,29 +505,33 @@ struct TestCombineShiftRightLanes {
const auto expected = BitCast(d, Load(d8, expected_bytes.get()));
HWY_ASSERT_VEC_EQ(d, expected, CombineShiftRightLanes<kLanes>(d, hi, lo));
}
TestCombineShiftRightLanesR<kLanes - 1>()(t, d);
#else
(void)t;
(void)d;
#endif // #if HWY_TARGET != HWY_SCALAR
}
};
#endif // #if HWY_TARGET != HWY_SCALAR
template <>
struct TestCombineShiftRightBytesR<0> {
template <class T, class D>
void operator()(T /*unused*/, D /*unused*/) {}
};
template <>
struct TestCombineShiftRightLanesR<0> {
template <class T, class D>
void operator()(T /*unused*/, D /*unused*/) {}
};
struct TestCombineShiftRight {
template <class T, class D>
HWY_NOINLINE void operator()(T t, D d) {
// Scalar does not define CombineShiftRightBytes.
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
constexpr int kMaxBytes = HWY_MIN(16, int(MaxLanes(d) * sizeof(T)));
constexpr int kMaxLanes = kMaxBytes / static_cast<int>(sizeof(T));
TestCombineShiftRightBytes<kMaxBytes - 1>()(t, d);
TestCombineShiftRightBytes<HWY_MAX(kMaxBytes / 2, 1)>()(t, d);
TestCombineShiftRightBytes<1>()(t, d);
TestCombineShiftRightLanes<kMaxLanes - 1>()(t, d);
TestCombineShiftRightLanes<HWY_MAX(kMaxLanes / 2, 1)>()(t, d);
TestCombineShiftRightLanes<1>()(t, d);
#else
(void)t;
(void)d;
#endif
TestCombineShiftRightBytesR<kMaxBytes - 1>()(t, d);
TestCombineShiftRightLanesR<kMaxBytes / int(sizeof(T)) - 1>()(t, d);
}
};
@ -542,13 +553,11 @@ class TestSpecialShuffle32 {
}
private:
// HWY_INLINE works around a Clang SVE compiler bug where all but the first
// 128 bits (the NEON register) of actual are zero.
template <class D, class V>
HWY_INLINE void VerifyLanes32(D d, VecArg<V> actual, const size_t i3,
const size_t i2, const size_t i1,
const size_t i0, const char* filename,
const int line) {
HWY_NOINLINE void VerifyLanes32(D d, VecArg<V> actual, const size_t i3,
const size_t i2, const size_t i1,
const size_t i0, const char* filename,
const int line) {
using T = TFromD<D>;
constexpr size_t kBlockN = 16 / sizeof(T);
const size_t N = Lanes(d);
@ -573,12 +582,10 @@ class TestSpecialShuffle64 {
}
private:
// HWY_INLINE works around a Clang SVE compiler bug where all but the first
// 128 bits (the NEON register) of actual are zero.
template <class D, class V>
HWY_INLINE void VerifyLanes64(D d, VecArg<V> actual, const size_t i1,
const size_t i0, const char* filename,
const int line) {
HWY_NOINLINE void VerifyLanes64(D d, VecArg<V> actual, const size_t i1,
const size_t i0, const char* filename,
const int line) {
using T = TFromD<D>;
constexpr size_t kBlockN = 16 / sizeof(T);
const size_t N = Lanes(d);
@ -593,19 +600,19 @@ class TestSpecialShuffle64 {
};
HWY_NOINLINE void TestAllSpecialShuffles() {
const ForGEVectors<128, TestSpecialShuffle32> test32;
const ForGE128Vectors<TestSpecialShuffle32> test32;
test32(uint32_t());
test32(int32_t());
test32(float());
#if HWY_HAVE_INTEGER64
const ForGEVectors<128, TestSpecialShuffle64> test64;
#if HWY_CAP_INTEGER64
const ForGE128Vectors<TestSpecialShuffle64> test64;
test64(uint64_t());
test64(int64_t());
#endif
#if HWY_HAVE_FLOAT64
const ForGEVectors<128, TestSpecialShuffle64> test_d;
#if HWY_CAP_FLOAT64
const ForGE128Vectors<TestSpecialShuffle64> test_d;
test_d(double());
#endif
}

third_party/highway/hwy/tests/combine_test.cc vendored

@ -22,6 +22,9 @@
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
// Not yet implemented
#if HWY_TARGET != HWY_RVV
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
@ -82,8 +85,8 @@ struct TestLowerQuarter {
};
HWY_NOINLINE void TestAllLowerHalf() {
ForAllTypes(ForHalfVectors<TestLowerHalf>());
ForAllTypes(ForHalfVectors<TestLowerQuarter, 2>());
ForAllTypes(ForDemoteVectors<TestLowerHalf>());
ForAllTypes(ForDemoteVectors<TestLowerQuarter, 4>());
}
struct TestUpperHalf {
@ -92,14 +95,21 @@ struct TestUpperHalf {
// Scalar does not define UpperHalf.
#if HWY_TARGET != HWY_SCALAR
const Half<D> d2;
const size_t N2 = Lanes(d2);
HWY_ASSERT(N2 * 2 == Lanes(d));
auto expected = AllocateAligned<T>(N2);
const auto v = Iota(d, 1);
const size_t N = Lanes(d);
auto lanes = AllocateAligned<T>(N);
std::fill(lanes.get(), lanes.get() + N, T(0));
Store(UpperHalf(d2, v), d2, lanes.get());
size_t i = 0;
for (; i < N2; ++i) {
expected[i] = static_cast<T>(N2 + 1 + i);
for (; i < Lanes(d2); ++i) {
HWY_ASSERT_EQ(T(Lanes(d2) + 1 + i), lanes[i]);
}
// Other half remains unchanged
for (; i < N; ++i) {
HWY_ASSERT_EQ(T(0), lanes[i]);
}
HWY_ASSERT_VEC_EQ(d2, expected.get(), UpperHalf(d2, Iota(d, 1)));
#else
(void)d;
#endif
@ -107,7 +117,7 @@ struct TestUpperHalf {
};
HWY_NOINLINE void TestAllUpperHalf() {
ForAllTypes(ForHalfVectors<TestUpperHalf>());
ForAllTypes(ForShrinkableVectors<TestUpperHalf>());
}
struct TestZeroExtendVector {
@ -116,23 +126,23 @@ struct TestZeroExtendVector {
const Twice<D> d2;
const auto v = Iota(d, 1);
const size_t N = Lanes(d);
const size_t N2 = Lanes(d2);
// If equal, then N was already MaxLanes(d) and it's not clear what
// Combine or ZeroExtendVector should return.
if (N2 == N) return;
HWY_ASSERT(N2 == 2 * N);
auto lanes = AllocateAligned<T>(N2);
Store(v, d, &lanes[0]);
Store(v, d, &lanes[N]);
Store(v, d, &lanes[N2 / 2]);
const auto ext = ZeroExtendVector(d2, v);
Store(ext, d2, lanes.get());
size_t i = 0;
// Lower half is unchanged
HWY_ASSERT_VEC_EQ(d, v, Load(d, &lanes[0]));
for (; i < N2 / 2; ++i) {
HWY_ASSERT_EQ(T(1 + i), lanes[i]);
}
// Upper half is zero
HWY_ASSERT_VEC_EQ(d, Zero(d), Load(d, &lanes[N]));
for (; i < N2; ++i) {
HWY_ASSERT_EQ(T(0), lanes[i]);
}
}
};
@ -148,7 +158,7 @@ struct TestCombine {
auto lanes = AllocateAligned<T>(N2);
const auto lo = Iota(d, 1);
const auto hi = Iota(d, static_cast<T>(N2 / 2 + 1));
const auto hi = Iota(d, N2 / 2 + 1);
const auto combined = Combine(d2, hi, lo);
Store(combined, d2, lanes.get());
@ -222,7 +232,7 @@ struct TestConcatOddEven {
HWY_NOINLINE void operator()(T /*unused*/, D d) {
#if HWY_TARGET != HWY_RVV && HWY_TARGET != HWY_SCALAR
const size_t N = Lanes(d);
const auto hi = Iota(d, static_cast<T>(N));
const auto hi = Iota(d, N);
const auto lo = Iota(d, 0);
const auto even = Add(Iota(d, 0), Iota(d, 0));
const auto odd = Add(even, Set(d, 1));
@ -262,3 +272,7 @@ int main(int argc, char **argv) {
}
#endif // HWY_ONCE
#else
int main(int, char**) { return 0; }
#endif // HWY_TARGET != HWY_RVV

third_party/highway/hwy/tests/compare_test.cc vendored

@ -218,63 +218,6 @@ HWY_NOINLINE void TestAllWeakFloat() {
ForFloatTypes(ForPartialVectors<TestWeakFloat>());
}
class TestLt128 {
template <class D>
static HWY_NOINLINE Vec<D> Make128(D d, uint64_t hi, uint64_t lo) {
alignas(16) uint64_t in[2];
in[0] = lo;
in[1] = hi;
return LoadDup128(d, in);
}
public:
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
using V = Vec<D>;
const V v00 = Zero(d);
const V v01 = Make128(d, 0, 1);
const V v10 = Make128(d, 1, 0);
const V v11 = Add(v01, v10);
const auto mask_false = MaskFalse(d);
const auto mask_true = MaskTrue(d);
HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v00, v00));
HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v01, v01));
HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v10, v10));
HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v00, v01));
HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v01, v10));
HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v01, v11));
// Reversed order
HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v01, v00));
HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v10, v01));
HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v11, v01));
// Also check 128-bit blocks are independent
const V iota = Iota(d, 1);
HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, iota, Add(iota, v01)));
HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, iota, Add(iota, v10)));
HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, Add(iota, v01), iota));
HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, Add(iota, v10), iota));
// Max value
const V vm = Make128(d, LimitsMax<T>(), LimitsMax<T>());
HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, vm));
HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, v00));
HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, v01));
HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, v10));
HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, v11));
HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v00, vm));
HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v01, vm));
HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v10, vm));
HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v11, vm));
}
};
HWY_NOINLINE void TestAllLt128() { ForGEVectors<128, TestLt128>()(uint64_t()); }
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
@ -289,7 +232,6 @@ HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictUnsigned);
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictInt);
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictFloat);
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllWeakFloat);
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllLt128);
} // namespace hwy
// Ought not to be necessary, but without this, no tests run on RVV.

third_party/highway/hwy/tests/convert_test.cc vendored

@ -57,17 +57,17 @@ struct TestBitCastFrom {
TestBitCast<uint8_t>()(t, d);
TestBitCast<uint16_t>()(t, d);
TestBitCast<uint32_t>()(t, d);
#if HWY_HAVE_INTEGER64
#if HWY_CAP_INTEGER64
TestBitCast<uint64_t>()(t, d);
#endif
TestBitCast<int8_t>()(t, d);
TestBitCast<int16_t>()(t, d);
TestBitCast<int32_t>()(t, d);
#if HWY_HAVE_INTEGER64
#if HWY_CAP_INTEGER64
TestBitCast<int64_t>()(t, d);
#endif
TestBitCast<float>()(t, d);
#if HWY_HAVE_FLOAT64
#if HWY_CAP_FLOAT64
TestBitCast<double>()(t, d);
#endif
}
@ -103,39 +103,39 @@ HWY_NOINLINE void TestAllBitCast() {
to_i32(int32_t());
to_i32(float());
#if HWY_HAVE_INTEGER64
#if HWY_CAP_INTEGER64
const ForPartialVectors<TestBitCast<uint64_t>> to_u64;
to_u64(uint64_t());
to_u64(int64_t());
#if HWY_HAVE_FLOAT64
#if HWY_CAP_FLOAT64
to_u64(double());
#endif
const ForPartialVectors<TestBitCast<int64_t>> to_i64;
to_i64(uint64_t());
to_i64(int64_t());
#if HWY_HAVE_FLOAT64
#if HWY_CAP_FLOAT64
to_i64(double());
#endif
#endif // HWY_HAVE_INTEGER64
#endif // HWY_CAP_INTEGER64
const ForPartialVectors<TestBitCast<float>> to_float;
to_float(uint32_t());
to_float(int32_t());
to_float(float());
#if HWY_HAVE_FLOAT64
#if HWY_CAP_FLOAT64
const ForPartialVectors<TestBitCast<double>> to_double;
to_double(double());
#if HWY_HAVE_INTEGER64
#if HWY_CAP_INTEGER64
to_double(uint64_t());
to_double(int64_t());
#endif // HWY_HAVE_INTEGER64
#endif // HWY_HAVE_FLOAT64
#endif // HWY_CAP_INTEGER64
#endif // HWY_CAP_FLOAT64
#if HWY_TARGET != HWY_SCALAR
// For non-scalar vectors, we can cast all types to all.
ForAllTypes(ForGEVectors<64, TestBitCastFrom>());
ForAllTypes(ForGE64Vectors<TestBitCastFrom>());
#endif
}
@ -165,39 +165,39 @@ struct TestPromoteTo {
};
HWY_NOINLINE void TestAllPromoteTo() {
const ForPromoteVectors<TestPromoteTo<uint16_t>, 1> to_u16div2;
const ForPromoteVectors<TestPromoteTo<uint16_t>, 2> to_u16div2;
to_u16div2(uint8_t());
const ForPromoteVectors<TestPromoteTo<uint32_t>, 2> to_u32div4;
const ForPromoteVectors<TestPromoteTo<uint32_t>, 4> to_u32div4;
to_u32div4(uint8_t());
const ForPromoteVectors<TestPromoteTo<uint32_t>, 1> to_u32div2;
const ForPromoteVectors<TestPromoteTo<uint32_t>, 2> to_u32div2;
to_u32div2(uint16_t());
const ForPromoteVectors<TestPromoteTo<int16_t>, 1> to_i16div2;
const ForPromoteVectors<TestPromoteTo<int16_t>, 2> to_i16div2;
to_i16div2(uint8_t());
to_i16div2(int8_t());
const ForPromoteVectors<TestPromoteTo<int32_t>, 1> to_i32div2;
const ForPromoteVectors<TestPromoteTo<int32_t>, 2> to_i32div2;
to_i32div2(uint16_t());
to_i32div2(int16_t());
const ForPromoteVectors<TestPromoteTo<int32_t>, 2> to_i32div4;
const ForPromoteVectors<TestPromoteTo<int32_t>, 4> to_i32div4;
to_i32div4(uint8_t());
to_i32div4(int8_t());
// Must test f16/bf16 separately because we can only load/store/convert them.
#if HWY_HAVE_INTEGER64
const ForPromoteVectors<TestPromoteTo<uint64_t>, 1> to_u64div2;
#if HWY_CAP_INTEGER64
const ForPromoteVectors<TestPromoteTo<uint64_t>, 2> to_u64div2;
to_u64div2(uint32_t());
const ForPromoteVectors<TestPromoteTo<int64_t>, 1> to_i64div2;
const ForPromoteVectors<TestPromoteTo<int64_t>, 2> to_i64div2;
to_i64div2(int32_t());
#endif
#if HWY_HAVE_FLOAT64
const ForPromoteVectors<TestPromoteTo<double>, 1> to_f64div2;
#if HWY_CAP_FLOAT64
const ForPromoteVectors<TestPromoteTo<double>, 2> to_f64div2;
to_f64div2(int32_t());
to_f64div2(float());
#endif
@ -213,6 +213,111 @@ bool IsFinite(T /*unused*/) {
return true;
}
template <typename ToT>
struct TestDemoteTo {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D from_d) {
static_assert(!IsFloat<ToT>(), "Use TestDemoteToFloat for float output");
static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider");
const Rebind<ToT, D> to_d;
const size_t N = Lanes(from_d);
auto from = AllocateAligned<T>(N);
auto expected = AllocateAligned<ToT>(N);
// Narrower range in the wider type, for clamping before we cast
const T min = LimitsMin<ToT>();
const T max = LimitsMax<ToT>();
const auto value_ok = [&](T& value) {
if (!IsFinite(value)) return false;
#if HWY_EMULATE_SVE
// farm_sve just casts, which is undefined if the value is out of range.
value = HWY_MIN(HWY_MAX(min, value), max);
#endif
return true;
};
RandomState rng;
for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
for (size_t i = 0; i < N; ++i) {
do {
const uint64_t bits = rng();
memcpy(&from[i], &bits, sizeof(T));
} while (!value_ok(from[i]));
expected[i] = static_cast<ToT>(HWY_MIN(HWY_MAX(min, from[i]), max));
}
HWY_ASSERT_VEC_EQ(to_d, expected.get(),
DemoteTo(to_d, Load(from_d, from.get())));
}
}
};
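// Worked example (illustrative): demoting int32_t to uint8_t saturates, so
// 300 -> 255, -5 -> 0 and 17 -> 17; `expected` above mirrors that with the
// same HWY_MIN/HWY_MAX clamp before the cast.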
HWY_NOINLINE void TestAllDemoteToInt() {
ForDemoteVectors<TestDemoteTo<uint8_t>>()(int16_t());
ForDemoteVectors<TestDemoteTo<uint8_t>, 4>()(int32_t());
ForDemoteVectors<TestDemoteTo<int8_t>>()(int16_t());
ForDemoteVectors<TestDemoteTo<int8_t>, 4>()(int32_t());
const ForDemoteVectors<TestDemoteTo<uint16_t>> to_u16;
to_u16(int32_t());
const ForDemoteVectors<TestDemoteTo<int16_t>> to_i16;
to_i16(int32_t());
}
HWY_NOINLINE void TestAllDemoteToMixed() {
#if HWY_CAP_FLOAT64
const ForDemoteVectors<TestDemoteTo<int32_t>> to_i32;
to_i32(double());
#endif
}
template <typename ToT>
struct TestDemoteToFloat {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D from_d) {
// For floats, we clamp differently and cannot call LimitsMin.
static_assert(IsFloat<ToT>(), "Use TestDemoteTo for integer output");
static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider");
const Rebind<ToT, D> to_d;
const size_t N = Lanes(from_d);
auto from = AllocateAligned<T>(N);
auto expected = AllocateAligned<ToT>(N);
RandomState rng;
for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
for (size_t i = 0; i < N; ++i) {
do {
const uint64_t bits = rng();
memcpy(&from[i], &bits, sizeof(T));
} while (!IsFinite(from[i]));
const T magn = std::abs(from[i]);
const T max_abs = HighestValue<ToT>();
// NOTE: std:: version from C++11 cmath is not defined in RVV GCC, see
// https://lists.freebsd.org/pipermail/freebsd-current/2014-January/048130.html
const T clipped = copysign(HWY_MIN(magn, max_abs), from[i]);
expected[i] = static_cast<ToT>(clipped);
}
HWY_ASSERT_VEC_EQ(to_d, expected.get(),
DemoteTo(to_d, Load(from_d, from.get())));
}
}
};
HWY_NOINLINE void TestAllDemoteToFloat() {
// Must test f16 separately because we can only load/store/convert them.
#if HWY_CAP_FLOAT64
const ForDemoteVectors<TestDemoteToFloat<float>, 2> to_float;
to_float(double());
#endif
}
template <class D>
AlignedFreeUniquePtr<float[]> F16TestCases(D d, size_t& padded) {
const float test_cases[] = {
@ -247,7 +352,7 @@ AlignedFreeUniquePtr<float[]> F16TestCases(D d, size_t& padded) {
struct TestF16 {
template <typename TF32, class DF32>
HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) {
#if HWY_HAVE_FLOAT16
#if HWY_CAP_FLOAT16
size_t padded;
auto in = F16TestCases(d32, padded);
using TF16 = float16_t;
@ -301,7 +406,7 @@ AlignedFreeUniquePtr<float[]> BF16TestCases(D d, size_t& padded) {
struct TestBF16 {
template <typename TF32, class DF32>
HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) {
#if !defined(HWY_EMULATE_SVE)
#if HWY_TARGET != HWY_RVV
size_t padded;
auto in = BF16TestCases(d32, padded);
using TBF16 = bfloat16_t;
@ -312,7 +417,6 @@ struct TestBF16 {
#endif
const Half<decltype(dbf16)> dbf16_half;
const size_t N = Lanes(d32);
HWY_ASSERT(Lanes(dbf16_half) <= N);
auto temp16 = AllocateAligned<TBF16>(N);
for (size_t i = 0; i < padded; i += N) {
@ -330,6 +434,124 @@ struct TestBF16 {
HWY_NOINLINE void TestAllBF16() { ForShrinkableVectors<TestBF16>()(float()); }
template <class D>
AlignedFreeUniquePtr<float[]> ReorderBF16TestCases(D d, size_t& padded) {
const float test_cases[] = {
// Same as BF16TestCases:
// +/- 1
1.0f,
-1.0f,
// +/- 0
0.0f,
-0.0f,
// near 0
0.25f,
-0.25f,
// +/- integer
4.0f,
-32.0f,
// positive +/- delta
2.015625f,
3.984375f,
// negative +/- delta
-2.015625f,
-3.984375f,
// No huge values - would interfere with sum. But add more to fill 2 * N:
-2.0f,
-10.0f,
0.03125f,
1.03125f,
1.5f,
2.0f,
4.0f,
5.0f,
6.0f,
8.0f,
10.0f,
256.0f,
448.0f,
2080.0f,
};
const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
const size_t N = Lanes(d);
padded = RoundUpTo(kNumTestCases, 2 * N); // allow loading pairs of vectors
auto in = AllocateAligned<float>(padded);
auto expected = AllocateAligned<float>(padded);
std::copy(test_cases, test_cases + kNumTestCases, in.get());
std::fill(in.get() + kNumTestCases, in.get() + padded, 0.0f);
return in;
}
class TestReorderDemote2To {
// In-place N^2 selection sort to avoid dependencies
void Sort(float* p, size_t count) {
for (size_t i = 0; i < count - 1; ++i) {
// Find min_element
size_t idx_min = i;
for (size_t j = i + 1; j < count; j++) {
if (p[j] < p[idx_min]) {
idx_min = j;
}
}
// Swap with current
const float tmp = p[i];
p[i] = p[idx_min];
p[idx_min] = tmp;
}
}
public:
template <typename TF32, class DF32>
HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) {
#if HWY_TARGET != HWY_SCALAR
size_t padded;
auto in = ReorderBF16TestCases(d32, padded);
using TBF16 = bfloat16_t;
const Repartition<TBF16, DF32> dbf16;
const Half<decltype(dbf16)> dbf16_half;
const size_t N = Lanes(d32);
auto temp16 = AllocateAligned<TBF16>(2 * N);
auto expected = AllocateAligned<float>(2 * N);
auto actual = AllocateAligned<float>(2 * N);
for (size_t i = 0; i < padded; i += 2 * N) {
const auto f0 = Load(d32, &in[i + 0]);
const auto f1 = Load(d32, &in[i + N]);
const auto v16 = ReorderDemote2To(dbf16, f0, f1);
Store(v16, dbf16, temp16.get());
const auto promoted0 = PromoteTo(d32, Load(dbf16_half, temp16.get() + 0));
const auto promoted1 = PromoteTo(d32, Load(dbf16_half, temp16.get() + N));
// Smoke test: sum should be same (with tolerance for non-associativity)
const auto sum_expected =
GetLane(SumOfLanes(d32, Add(promoted0, promoted1)));
const auto sum_actual = GetLane(SumOfLanes(d32, Add(f0, f1)));
HWY_ASSERT(sum_expected - 1E-4 <= sum_actual &&
sum_actual <= sum_expected + 1E-4);
// Ensure values are the same after sorting to undo the Reorder
Store(f0, d32, expected.get() + 0);
Store(f1, d32, expected.get() + N);
Store(promoted0, d32, actual.get() + 0);
Store(promoted1, d32, actual.get() + N);
Sort(expected.get(), 2 * N);
Sort(actual.get(), 2 * N);
HWY_ASSERT_VEC_EQ(d32, expected.get() + 0, Load(d32, actual.get() + 0));
HWY_ASSERT_VEC_EQ(d32, expected.get() + N, Load(d32, actual.get() + N));
}
#else // HWY_SCALAR
(void)d32;
#endif
}
};
HWY_NOINLINE void TestAllReorderDemote2To() {
ForShrinkableVectors<TestReorderDemote2To>()(float());
}
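// ReorderDemote2To may interleave the two source vectors in any
// target-dependent order, which is why the test checks only a sum (with
// tolerance) and the sorted multiset of values, never lane positions.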
struct TestConvertU8 {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, const D du32) {
@ -342,7 +564,7 @@ struct TestConvertU8 {
};
HWY_NOINLINE void TestAllConvertU8() {
ForDemoteVectors<TestConvertU8, 2>()(uint32_t());
ForDemoteVectors<TestConvertU8, 4>()(uint32_t());
}
// Separate function to attempt to work around a compiler bug on ARM: when this
@ -352,23 +574,19 @@ struct TestIntFromFloatHuge {
HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
// Still does not work, although ARMv7 manual says that float->int
// saturates, i.e. chooses the nearest representable value. Also causes
// out-of-memory for MSVC.
#if HWY_TARGET != HWY_NEON && !HWY_COMPILER_MSVC
// out-of-memory for MSVC, and unsafe cast in farm_sve.
#if HWY_TARGET != HWY_NEON && !HWY_COMPILER_MSVC && !defined(HWY_EMULATE_SVE)
using TI = MakeSigned<TF>;
const Rebind<TI, DF> di;
// Workaround for incorrect 32-bit GCC codegen for SSSE3 - Print-ing
// the expected lvalue also seems to prevent the issue.
const size_t N = Lanes(df);
auto expected = AllocateAligned<TI>(N);
// Huge positive (lvalue works around GCC bug, tested with 10.2.1, where
// the expected i32 value is otherwise 0x80..00).
const auto expected_max = Set(di, LimitsMax<TI>());
HWY_ASSERT_VEC_EQ(di, expected_max, ConvertTo(di, Set(df, TF(1E20))));
// Huge positive
Store(Set(di, LimitsMax<TI>()), di, expected.get());
HWY_ASSERT_VEC_EQ(di, expected.get(), ConvertTo(di, Set(df, TF(1E20))));
// Huge negative
Store(Set(di, LimitsMin<TI>()), di, expected.get());
HWY_ASSERT_VEC_EQ(di, expected.get(), ConvertTo(di, Set(df, TF(-1E20))));
// Huge negative (also lvalue for safety, but GCC bug was not triggered)
const auto expected_min = Set(di, LimitsMin<TI>());
HWY_ASSERT_VEC_EQ(di, expected_min, ConvertTo(di, Set(df, TF(-1E20))));
#else
(void)df;
#endif
@ -416,6 +634,10 @@ class TestIntFromFloat {
const uint64_t bits = rng();
memcpy(&from[i], &bits, sizeof(TF));
} while (!std::isfinite(from[i]));
#if defined(HWY_EMULATE_SVE)
// farm_sve just casts, which is undefined if the value is out of range.
from[i] = HWY_MIN(HWY_MAX(min / 2, from[i]), max / 2);
#endif
if (from[i] >= max) {
expected[i] = LimitsMax<TI>();
} else if (from[i] <= min) {
@ -503,21 +725,30 @@ struct TestI32F64 {
const size_t N = Lanes(df);
// Integer positive
HWY_ASSERT_VEC_EQ(di, Iota(di, TI(4)), DemoteTo(di, Iota(df, TF(4.0))));
HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4.0)), PromoteTo(df, Iota(di, TI(4))));
// Integer negative
HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), DemoteTo(di, Iota(df, -TF(N))));
HWY_ASSERT_VEC_EQ(df, Iota(df, -TF(N)), PromoteTo(df, Iota(di, -TI(N))));
// Above positive
HWY_ASSERT_VEC_EQ(di, Iota(di, TI(2)), DemoteTo(di, Iota(df, TF(2.001))));
HWY_ASSERT_VEC_EQ(df, Iota(df, TF(2.0)), PromoteTo(df, Iota(di, TI(2))));
// Below positive
HWY_ASSERT_VEC_EQ(di, Iota(di, TI(3)), DemoteTo(di, Iota(df, TF(3.9999))));
HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4.0)), PromoteTo(df, Iota(di, TI(4))));
const TF eps = static_cast<TF>(0.0001);
// Above negative
HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)),
DemoteTo(di, Iota(df, -TF(N + 1) + eps)));
HWY_ASSERT_VEC_EQ(df, Iota(df, TF(-4.0)), PromoteTo(df, Iota(di, TI(-4))));
// Below negative
HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N + 1)),
DemoteTo(di, Iota(df, -TF(N + 1) - eps)));
HWY_ASSERT_VEC_EQ(df, Iota(df, TF(-2.0)), PromoteTo(df, Iota(di, TI(-2))));
// Max positive int
@ -527,11 +758,22 @@ struct TestI32F64 {
// Min negative int
HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMin<TI>())),
PromoteTo(df, Set(di, LimitsMin<TI>())));
// farm_sve just casts, which is undefined if the value is out of range.
#if !defined(HWY_EMULATE_SVE)
// Huge positive float
HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMax<TI>()),
DemoteTo(di, Set(df, TF(1E12))));
// Huge negative float
HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMin<TI>()),
DemoteTo(di, Set(df, TF(-1E12))));
#endif
}
};
HWY_NOINLINE void TestAllI32F64() {
#if HWY_HAVE_FLOAT64
#if HWY_CAP_FLOAT64
ForDemoteVectors<TestI32F64>()(double());
#endif
}
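The Huge positive/negative checks above all encode the same saturating conversion contract; a scalar restatement for int32 (a sketch with a hypothetical helper name, matching the clamping the asserts expect):

#include <cstdint>

int32_t SaturatingI32FromF64(double f) {
  if (f >= 2147483647.0) return INT32_MAX;   // e.g. 1E12 clamps to LimitsMax
  if (f <= -2147483648.0) return INT32_MIN;  // e.g. -1E12 clamps to LimitsMin
  return static_cast<int32_t>(f);            // in range: truncate toward zero
}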
@ -548,8 +790,12 @@ namespace hwy {
HWY_BEFORE_TEST(HwyConvertTest);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllBitCast);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllPromoteTo);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllDemoteToInt);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllDemoteToMixed);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllDemoteToFloat);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllF16);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllBF16);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllReorderDemote2To);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllConvertU8);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllIntFromFloat);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllFloatFromInt);

25 third_party/highway/hwy/tests/crypto_test.cc (vendored)

@ -74,7 +74,7 @@ class TestAES {
}
for (size_t i = 0; i < 256; i += N) {
const auto in = Iota(d, static_cast<T>(i));
const auto in = Iota(d, i);
HWY_ASSERT_VEC_EQ(d, expected.get() + i, detail::SubBytes(in));
}
}
@ -89,17 +89,11 @@ class TestAES {
0x42, 0xCA, 0x6B, 0x99, 0x7A, 0x5C, 0x58, 0x16};
const auto test = LoadDup128(d, test_lanes);
// = ShiftRow result
alignas(16) constexpr uint8_t expected_sr_lanes[16] = {
0x09, 0x28, 0x7F, 0x47, 0x6F, 0x74, 0x6A, 0xBF,
0x2C, 0x4A, 0x62, 0x04, 0xDA, 0x08, 0xE3, 0xEE};
const auto expected_sr = LoadDup128(d, expected_sr_lanes);
// = MixColumn result
alignas(16) constexpr uint8_t expected_mc_lanes[16] = {
alignas(16) constexpr uint8_t expected0_lanes[16] = {
0x52, 0x9F, 0x16, 0xC2, 0x97, 0x86, 0x15, 0xCA,
0xE0, 0x1A, 0xAE, 0x54, 0xBA, 0x1A, 0x26, 0x59};
const auto expected_mc = LoadDup128(d, expected_mc_lanes);
const auto expected0 = LoadDup128(d, expected0_lanes);
// = KeyAddition result
alignas(16) constexpr uint8_t expected_lanes[16] = {
@ -109,20 +103,17 @@ class TestAES {
alignas(16) uint8_t key_lanes[16];
for (size_t i = 0; i < 16; ++i) {
key_lanes[i] = expected_mc_lanes[i] ^ expected_lanes[i];
key_lanes[i] = expected0_lanes[i] ^ expected_lanes[i];
}
const auto round_key = LoadDup128(d, key_lanes);
HWY_ASSERT_VEC_EQ(d, expected_mc, AESRound(test, Zero(d)));
HWY_ASSERT_VEC_EQ(d, expected0, AESRound(test, Zero(d)));
HWY_ASSERT_VEC_EQ(d, expected, AESRound(test, round_key));
HWY_ASSERT_VEC_EQ(d, expected_sr, AESLastRound(test, Zero(d)));
HWY_ASSERT_VEC_EQ(d, Xor(expected_sr, round_key),
AESLastRound(test, round_key));
TestSBox(t, d);
}
};
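The round-key derivation above works because KeyAddition is the final XOR of AESRound: calling it with Zero(d) exposes MixColumns(ShiftRows(SubBytes(state))), and XOR is self-inverse. As a per-byte sketch (hypothetical helper):

#include <cstdint>

// AESRound(s, k) == AESRound(s, 0) ^ k, so the key that maps the keyless
// output to any desired output is their XOR.
uint8_t RoundKeyByte(uint8_t keyless_out, uint8_t desired_out) {
  return static_cast<uint8_t>(keyless_out ^ desired_out);
}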
HWY_NOINLINE void TestAllAES() { ForGEVectors<128, TestAES>()(uint8_t()); }
HWY_NOINLINE void TestAllAES() { ForGE128Vectors<TestAES>()(uint8_t()); }
#else
HWY_NOINLINE void TestAllAES() {}
@ -132,7 +123,7 @@ struct TestCLMul {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
// needs 64 bit lanes and 128-bit result
#if HWY_TARGET != HWY_SCALAR && HWY_HAVE_INTEGER64
#if HWY_TARGET != HWY_SCALAR && HWY_CAP_INTEGER64
const size_t N = Lanes(d);
if (N == 1) return;
@ -534,7 +525,7 @@ struct TestCLMul {
}
};
HWY_NOINLINE void TestAllCLMul() { ForGEVectors<128, TestCLMul>()(uint64_t()); }
HWY_NOINLINE void TestAllCLMul() { ForGE128Vectors<TestCLMul>()(uint64_t()); }
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE

333 third_party/highway/hwy/tests/demote_test.cc (vendored)

@ -1,333 +0,0 @@
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/demote_test.cc"
#include "hwy/foreach_target.h"
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
// Causes build timeout.
#if !HWY_IS_MSAN
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
template <typename T, HWY_IF_FLOAT(T)>
bool IsFinite(T t) {
return std::isfinite(t);
}
// Wrapper avoids calling std::isfinite for integer types (ambiguous).
template <typename T, HWY_IF_NOT_FLOAT(T)>
bool IsFinite(T /*unused*/) {
return true;
}
template <typename ToT>
struct TestDemoteTo {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D from_d) {
static_assert(!IsFloat<ToT>(), "Use TestDemoteToFloat for float output");
static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider");
const Rebind<ToT, D> to_d;
const size_t N = Lanes(from_d);
auto from = AllocateAligned<T>(N);
auto expected = AllocateAligned<ToT>(N);
// Narrower range in the wider type, for clamping before we cast
const T min = LimitsMin<ToT>();
const T max = LimitsMax<ToT>();
const auto value_ok = [&](T& value) { return IsFinite(value); };
RandomState rng;
for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
for (size_t i = 0; i < N; ++i) {
do {
const uint64_t bits = rng();
memcpy(&from[i], &bits, sizeof(T));
} while (!value_ok(from[i]));
expected[i] = static_cast<ToT>(HWY_MIN(HWY_MAX(min, from[i]), max));
}
const auto in = Load(from_d, from.get());
HWY_ASSERT_VEC_EQ(to_d, expected.get(), DemoteTo(to_d, in));
}
}
};
HWY_NOINLINE void TestAllDemoteToInt() {
ForDemoteVectors<TestDemoteTo<uint8_t>>()(int16_t());
ForDemoteVectors<TestDemoteTo<uint8_t>, 2>()(int32_t());
ForDemoteVectors<TestDemoteTo<int8_t>>()(int16_t());
ForDemoteVectors<TestDemoteTo<int8_t>, 2>()(int32_t());
const ForDemoteVectors<TestDemoteTo<uint16_t>> to_u16;
to_u16(int32_t());
const ForDemoteVectors<TestDemoteTo<int16_t>> to_i16;
to_i16(int32_t());
}
HWY_NOINLINE void TestAllDemoteToMixed() {
#if HWY_HAVE_FLOAT64
const ForDemoteVectors<TestDemoteTo<int32_t>> to_i32;
to_i32(double());
#endif
}
template <typename ToT>
struct TestDemoteToFloat {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D from_d) {
// For floats, we clamp differently and cannot call LimitsMin.
static_assert(IsFloat<ToT>(), "Use TestDemoteTo for integer output");
static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider");
const Rebind<ToT, D> to_d;
const size_t N = Lanes(from_d);
auto from = AllocateAligned<T>(N);
auto expected = AllocateAligned<ToT>(N);
RandomState rng;
for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
for (size_t i = 0; i < N; ++i) {
do {
const uint64_t bits = rng();
memcpy(&from[i], &bits, sizeof(T));
} while (!IsFinite(from[i]));
const T magn = std::abs(from[i]);
const T max_abs = HighestValue<ToT>();
// NOTE: std:: version from C++11 cmath is not defined in RVV GCC, see
// https://lists.freebsd.org/pipermail/freebsd-current/2014-January/048130.html
const T clipped = copysign(HWY_MIN(magn, max_abs), from[i]);
expected[i] = static_cast<ToT>(clipped);
}
HWY_ASSERT_VEC_EQ(to_d, expected.get(),
DemoteTo(to_d, Load(from_d, from.get())));
}
}
};
HWY_NOINLINE void TestAllDemoteToFloat() {
// Must test f16 separately because we can only load/store/convert them.
#if HWY_HAVE_FLOAT64
const ForDemoteVectors<TestDemoteToFloat<float>, 1> to_float;
to_float(double());
#endif
}
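A scalar sketch of the expected[] computation above (hypothetical helper; clamp the magnitude to the largest finite float, keep the sign, then cast):

#include <algorithm>
#include <cmath>
#include <limits>

float DemoteToFloatRef(double x) {
  const double max_abs = static_cast<double>(std::numeric_limits<float>::max());
  const double clipped = std::copysign(std::min(std::abs(x), max_abs), x);
  return static_cast<float>(clipped);  // now guaranteed finite as a float
}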
template <class D>
AlignedFreeUniquePtr<float[]> ReorderBF16TestCases(D d, size_t& padded) {
const float test_cases[] = {
// Same as BF16TestCases:
// +/- 1
1.0f,
-1.0f,
// +/- 0
0.0f,
-0.0f,
// near 0
0.25f,
-0.25f,
// +/- integer
4.0f,
-32.0f,
// positive +/- delta
2.015625f,
3.984375f,
// negative +/- delta
-2.015625f,
-3.984375f,
// No huge values - would interfere with sum. But add more to fill 2 * N:
-2.0f,
-10.0f,
0.03125f,
1.03125f,
1.5f,
2.0f,
4.0f,
5.0f,
6.0f,
8.0f,
10.0f,
256.0f,
448.0f,
2080.0f,
};
const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
const size_t N = Lanes(d);
padded = RoundUpTo(kNumTestCases, 2 * N); // allow loading pairs of vectors
auto in = AllocateAligned<float>(padded);
auto expected = AllocateAligned<float>(padded);
std::copy(test_cases, test_cases + kNumTestCases, in.get());
std::fill(in.get() + kNumTestCases, in.get() + padded, 0.0f);
return in;
}
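For concreteness, the padding arithmetic with an illustrative lane count:

// e.g. N = 4 (so 2 * N = 8) and 25 test cases: padded = RoundUpTo(25, 8) = 32;
// in[25..31] stay 0.0f, so the padding cannot perturb the sums checked later.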
class TestReorderDemote2To {
// In-place N^2 selection sort to avoid dependencies
void Sort(float* p, size_t count) {
for (size_t i = 0; i < count - 1; ++i) {
// Find min_element
size_t idx_min = i;
for (size_t j = i + 1; j < count; j++) {
if (p[j] < p[idx_min]) {
idx_min = j;
}
}
// Swap with current
const float tmp = p[i];
p[i] = p[idx_min];
p[idx_min] = tmp;
}
}
public:
template <typename TF32, class DF32>
HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) {
#if HWY_TARGET != HWY_SCALAR
size_t padded;
auto in = ReorderBF16TestCases(d32, padded);
using TBF16 = bfloat16_t;
const Repartition<TBF16, DF32> dbf16;
const Half<decltype(dbf16)> dbf16_half;
const size_t N = Lanes(d32);
auto temp16 = AllocateAligned<TBF16>(2 * N);
auto expected = AllocateAligned<float>(2 * N);
auto actual = AllocateAligned<float>(2 * N);
for (size_t i = 0; i < padded; i += 2 * N) {
const auto f0 = Load(d32, &in[i + 0]);
const auto f1 = Load(d32, &in[i + N]);
const auto v16 = ReorderDemote2To(dbf16, f0, f1);
Store(v16, dbf16, temp16.get());
const auto promoted0 = PromoteTo(d32, Load(dbf16_half, temp16.get() + 0));
const auto promoted1 = PromoteTo(d32, Load(dbf16_half, temp16.get() + N));
// Smoke test: sum should be the same (with tolerance for non-associativity)
const auto sum_expected =
GetLane(SumOfLanes(d32, Add(promoted0, promoted1)));
const auto sum_actual = GetLane(SumOfLanes(d32, Add(f0, f1)));
HWY_ASSERT(sum_actual - 1E-4 <= sum_expected &&
sum_expected <= sum_actual + 1E-4);
// Ensure values are the same after sorting to undo the Reorder
Store(f0, d32, expected.get() + 0);
Store(f1, d32, expected.get() + N);
Store(promoted0, d32, actual.get() + 0);
Store(promoted1, d32, actual.get() + N);
Sort(expected.get(), 2 * N);
Sort(actual.get(), 2 * N);
HWY_ASSERT_VEC_EQ(d32, expected.get() + 0, Load(d32, actual.get() + 0));
HWY_ASSERT_VEC_EQ(d32, expected.get() + N, Load(d32, actual.get() + N));
}
#else // HWY_SCALAR
(void)d32;
#endif
}
};
HWY_NOINLINE void TestAllReorderDemote2To() {
ForShrinkableVectors<TestReorderDemote2To>()(float());
}
struct TestI32F64 {
template <typename TF, class DF>
HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
using TI = int32_t;
const Rebind<TI, DF> di;
const size_t N = Lanes(df);
// Integer positive
HWY_ASSERT_VEC_EQ(di, Iota(di, TI(4)), DemoteTo(di, Iota(df, TF(4.0))));
// Integer negative
HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), DemoteTo(di, Iota(df, -TF(N))));
// Above positive
HWY_ASSERT_VEC_EQ(di, Iota(di, TI(2)), DemoteTo(di, Iota(df, TF(2.001))));
// Below positive
HWY_ASSERT_VEC_EQ(di, Iota(di, TI(3)), DemoteTo(di, Iota(df, TF(3.9999))));
const TF eps = static_cast<TF>(0.0001);
// Above negative
HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)),
DemoteTo(di, Iota(df, -TF(N + 1) + eps)));
// Below negative
HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N + 1)),
DemoteTo(di, Iota(df, -TF(N + 1) - eps)));
// Huge positive float
HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMax<TI>()),
DemoteTo(di, Set(df, TF(1E12))));
// Huge negative float
HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMin<TI>()),
DemoteTo(di, Set(df, TF(-1E12))));
}
};
HWY_NOINLINE void TestAllI32F64() {
#if HWY_HAVE_FLOAT64
ForDemoteVectors<TestI32F64>()(double());
#endif
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#endif // !HWY_IS_MSAN
#if HWY_ONCE
namespace hwy {
#if !HWY_IS_MSAN
HWY_BEFORE_TEST(HwyDemoteTest);
HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToInt);
HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToMixed);
HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToFloat);
HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllReorderDemote2To);
HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllI32F64);
#endif // !HWY_IS_MSAN
} // namespace hwy
// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
#endif

81 third_party/highway/hwy/tests/logical_test.cc (vendored)

@ -17,6 +17,7 @@
#include <string.h> // memcmp
#include "hwy/aligned_allocator.h"
#include "hwy/base.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/logical_test.cc"
@ -58,15 +59,6 @@ struct TestLogicalInteger {
HWY_ASSERT_VEC_EQ(d, v0, AndNot(vi, v0));
HWY_ASSERT_VEC_EQ(d, v0, AndNot(vi, vi));
HWY_ASSERT_VEC_EQ(d, v0, OrAnd(v0, v0, v0));
HWY_ASSERT_VEC_EQ(d, v0, OrAnd(v0, vi, v0));
HWY_ASSERT_VEC_EQ(d, v0, OrAnd(v0, v0, vi));
HWY_ASSERT_VEC_EQ(d, vi, OrAnd(v0, vi, vi));
HWY_ASSERT_VEC_EQ(d, vi, OrAnd(vi, v0, v0));
HWY_ASSERT_VEC_EQ(d, vi, OrAnd(vi, vi, v0));
HWY_ASSERT_VEC_EQ(d, vi, OrAnd(vi, v0, vi));
HWY_ASSERT_VEC_EQ(d, vi, OrAnd(vi, vi, vi));
auto v = vi;
v = And(v, vi);
HWY_ASSERT_VEC_EQ(d, vi, v);
@ -164,43 +156,6 @@ struct TestCopySign {
}
};
struct TestIfVecThenElse {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
RandomState rng;
using TU = MakeUnsigned<T>; // For all-one mask
const Rebind<TU, D> du;
const size_t N = Lanes(d);
auto in1 = AllocateAligned<T>(N);
auto in2 = AllocateAligned<T>(N);
auto vec_lanes = AllocateAligned<TU>(N);
auto expected = AllocateAligned<T>(N);
// Each lane should have a chance of having mask=true.
for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
for (size_t i = 0; i < N; ++i) {
in1[i] = static_cast<T>(Random32(&rng));
in2[i] = static_cast<T>(Random32(&rng));
vec_lanes[i] = (Random32(&rng) & 16) ? static_cast<TU>(~TU(0)) : TU(0);
}
const auto v1 = Load(d, in1.get());
const auto v2 = Load(d, in2.get());
const auto vec = BitCast(d, Load(du, vec_lanes.get()));
for (size_t i = 0; i < N; ++i) {
expected[i] = vec_lanes[i] ? in1[i] : in2[i];
}
HWY_ASSERT_VEC_EQ(d, expected.get(), IfVecThenElse(vec, v1, v2));
}
}
};
HWY_NOINLINE void TestAllIfVecThenElse() {
ForAllTypes(ForPartialVectors<TestIfVecThenElse>());
}
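IfVecThenElse selects bits rather than lanes, which is why vec_lanes above holds only all-ones or all-zeros values. A scalar sketch of the per-lane semantics (illustrative, not the library's implementation):

#include <cstdint>

uint32_t IfVecThenElseScalar(uint32_t vec, uint32_t yes, uint32_t no) {
  return (vec & yes) | (~vec & no);  // bitwise select
}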
HWY_NOINLINE void TestAllCopySign() {
ForFloatTypes(ForPartialVectors<TestCopySign>());
}
@ -225,31 +180,6 @@ HWY_NOINLINE void TestAllZeroIfNegative() {
ForFloatTypes(ForPartialVectors<TestZeroIfNegative>());
}
struct TestIfNegative {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const auto v0 = Zero(d);
const auto vp = Iota(d, 1);
const auto vn = Or(vp, SignBit(d));
// Zero and positive remain unchanged
HWY_ASSERT_VEC_EQ(d, v0, IfNegativeThenElse(v0, vn, v0));
HWY_ASSERT_VEC_EQ(d, vn, IfNegativeThenElse(v0, v0, vn));
HWY_ASSERT_VEC_EQ(d, vp, IfNegativeThenElse(vp, vn, vp));
HWY_ASSERT_VEC_EQ(d, vn, IfNegativeThenElse(vp, vp, vn));
// Negative are replaced with 2nd arg
HWY_ASSERT_VEC_EQ(d, v0, IfNegativeThenElse(vn, v0, vp));
HWY_ASSERT_VEC_EQ(d, vn, IfNegativeThenElse(vn, vn, v0));
HWY_ASSERT_VEC_EQ(d, vp, IfNegativeThenElse(vn, vp, vn));
}
};
HWY_NOINLINE void TestAllIfNegative() {
ForFloatTypes(ForPartialVectors<TestIfNegative>());
ForSignedTypes(ForPartialVectors<TestIfNegative>());
}
struct TestBroadcastSignBit {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
@ -304,11 +234,16 @@ HWY_NOINLINE void TestAllTestBit() {
struct TestPopulationCount {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
#if HWY_TARGET == HWY_RVV || HWY_IS_DEBUG_BUILD
constexpr size_t kNumTests = 1 << 14;
#else
constexpr size_t kNumTests = 1 << 20;
#endif
RandomState rng;
size_t N = Lanes(d);
auto data = AllocateAligned<T>(N);
auto popcnt = AllocateAligned<T>(N);
for (size_t i = 0; i < AdjustedReps(1 << 18) / N; i++) {
for (size_t i = 0; i < kNumTests / N; i++) {
for (size_t i = 0; i < N; i++) {
data[i] = static_cast<T>(rng());
popcnt[i] = static_cast<T>(PopCount(data[i]));
@ -333,10 +268,8 @@ namespace hwy {
HWY_BEFORE_TEST(HwyLogicalTest);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalInteger);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalFloat);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllIfVecThenElse);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCopySign);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllZeroIfNegative);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllIfNegative);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllBroadcastSignBit);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllTestBit);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllPopulationCount);

15 third_party/highway/hwy/tests/mask_test.cc (vendored)

@ -17,6 +17,8 @@
#include <stdint.h>
#include <string.h> // memcmp
#include "hwy/base.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/mask_test.cc"
#include "hwy/foreach_target.h"
@ -53,18 +55,13 @@ struct TestFirstN {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const size_t N = Lanes(d);
const RebindToSigned<D> di;
using TI = TFromD<decltype(di)>;
using TN = SignedFromSize<HWY_MIN(sizeof(size_t), sizeof(TI))>;
const size_t max_len = static_cast<size_t>(LimitsMax<TN>());
// TODO(janwas): 8-bit FirstN (using SlideUp) causes Spike (the RISC-V ISA simulator) to freeze.
#if HWY_TARGET == HWY_RVV
if (sizeof(T) == 1) return;
#endif
const size_t max_lanes = AdjustedReps(HWY_MIN(2 * N, size_t(64)));
for (size_t len = 0; len <= HWY_MIN(max_lanes, max_len); ++len) {
for (size_t len = 0; len <= HWY_MIN(2 * N, max_len); ++len) {
const auto expected =
RebindMask(d, Lt(Iota(di, 0), Set(di, static_cast<TI>(len))));
const auto actual = FirstN(d, len);
@ -371,7 +368,7 @@ struct TestFindFirstTrue {
memset(bool_lanes.get(), 0, N * sizeof(TI));
// For all combinations of zero/nonzero state of subset of lanes:
const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(9)));
const size_t max_lanes = HWY_MIN(N, size_t(10));
HWY_ASSERT_EQ(intptr_t(-1), FindFirstTrue(d, MaskFalse(d)));
HWY_ASSERT_EQ(intptr_t(0), FindFirstTrue(d, MaskTrue(d)));
@ -410,7 +407,7 @@ struct TestLogicalMask {
HWY_ASSERT_MASK_EQ(d, m_all, Not(m0));
// For all combinations of zero/nonzero state of subset of lanes:
const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(6)));
const size_t max_lanes = HWY_MIN(N, size_t(6));
for (size_t code = 0; code < (1ull << max_lanes); ++code) {
for (size_t i = 0; i < max_lanes; ++i) {
bool_lanes[i] = (code & (1ull << i)) ? TI(1) : TI(0);

12 third_party/highway/hwy/tests/memory_test.cc (vendored)

@ -36,7 +36,7 @@ struct TestLoadStore {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const size_t N = Lanes(d);
const auto hi = Iota(d, static_cast<T>(1 + N));
const auto hi = Iota(d, 1 + N);
const auto lo = Iota(d, 1);
auto lanes = AllocateAligned<T>(2 * N);
Store(hi, d, &lanes[N]);
@ -135,7 +135,7 @@ struct TestStoreInterleaved3 {
HWY_NOINLINE void TestAllStoreInterleaved3() {
#if HWY_TARGET == HWY_RVV
// Segments are limited to 8 registers, so we can only go up to LMUL=2.
const ForExtendableVectors<TestStoreInterleaved3, 2> test;
const ForExtendableVectors<TestStoreInterleaved3, 4> test;
#else
const ForPartialVectors<TestStoreInterleaved3> test;
#endif
@ -198,7 +198,7 @@ struct TestStoreInterleaved4 {
HWY_NOINLINE void TestAllStoreInterleaved4() {
#if HWY_TARGET == HWY_RVV
// Segments are limited to 8 registers, so we can only go up to LMUL=2.
const ForExtendableVectors<TestStoreInterleaved4, 2> test;
const ForExtendableVectors<TestStoreInterleaved4, 4> test;
#else
const ForPartialVectors<TestStoreInterleaved4> test;
#endif
@ -230,7 +230,7 @@ struct TestLoadDup128 {
};
HWY_NOINLINE void TestAllLoadDup128() {
ForAllTypes(ForGEVectors<128, TestLoadDup128>());
ForAllTypes(ForGE128Vectors<TestLoadDup128>());
}
struct TestStream {
@ -245,7 +245,7 @@ struct TestStream {
std::fill(out.get(), out.get() + 2 * affected_lanes, T(0));
Stream(v, d, out.get());
FlushStream();
StoreFence();
const auto actual = Load(d, out.get());
HWY_ASSERT_VEC_EQ(d, v, actual);
// Ensure Stream didn't modify more memory than expected
@ -386,7 +386,7 @@ HWY_NOINLINE void TestAllGather() {
HWY_NOINLINE void TestAllCache() {
LoadFence();
FlushStream();
StoreFence();
int test = 0;
Prefetch(&test);
FlushCacheline(&test);

433 third_party/highway/hwy/tests/shift_test.cc (vendored)

@ -1,433 +0,0 @@
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#include <algorithm>
#include <limits>
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/shift_test.cc"
#include "hwy/foreach_target.h"
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
template <bool kSigned>
struct TestLeftShifts {
template <typename T, class D>
HWY_NOINLINE void operator()(T t, D d) {
if (kSigned) {
// Also test positive values
TestLeftShifts</*kSigned=*/false>()(t, d);
}
using TI = MakeSigned<T>;
using TU = MakeUnsigned<T>;
const size_t N = Lanes(d);
auto expected = AllocateAligned<T>(N);
const auto values = Iota(d, kSigned ? -TI(N) : TI(0)); // value to shift
constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
// 0
HWY_ASSERT_VEC_EQ(d, values, ShiftLeft<0>(values));
HWY_ASSERT_VEC_EQ(d, values, ShiftLeftSame(values, 0));
// 1
for (size_t i = 0; i < N; ++i) {
const T value = kSigned ? T(T(i) - T(N)) : T(i);
expected[i] = T(TU(value) << 1);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<1>(values));
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, 1));
// max
for (size_t i = 0; i < N; ++i) {
const T value = kSigned ? T(T(i) - T(N)) : T(i);
expected[i] = T(TU(value) << kMaxShift);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<kMaxShift>(values));
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, kMaxShift));
}
};
template <bool kSigned>
struct TestVariableLeftShifts {
template <typename T, class D>
HWY_NOINLINE void operator()(T t, D d) {
if (kSigned) {
// Also test positive values
TestVariableLeftShifts</*kSigned=*/false>()(t, d);
}
using TI = MakeSigned<T>;
using TU = MakeUnsigned<T>;
const size_t N = Lanes(d);
auto expected = AllocateAligned<T>(N);
const auto v0 = Zero(d);
const auto v1 = Set(d, 1);
const auto values = Iota(d, kSigned ? -TI(N) : TI(0)); // value to shift
constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
const auto max_shift = Set(d, kMaxShift);
const auto small_shifts = And(Iota(d, 0), max_shift);
const auto large_shifts = max_shift - small_shifts;
// Same: 0
HWY_ASSERT_VEC_EQ(d, values, Shl(values, v0));
// Same: 1
for (size_t i = 0; i < N; ++i) {
const T value = kSigned ? T(i) - T(N) : T(i);
expected[i] = T(TU(value) << 1);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, v1));
// Same: max
for (size_t i = 0; i < N; ++i) {
const T value = kSigned ? T(i) - T(N) : T(i);
expected[i] = T(TU(value) << kMaxShift);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, max_shift));
// Variable: small
for (size_t i = 0; i < N; ++i) {
const T value = kSigned ? T(i) - T(N) : T(i);
expected[i] = T(TU(value) << (i & kMaxShift));
}
HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, small_shifts));
// Variable: large
for (size_t i = 0; i < N; ++i) {
expected[i] = T(TU(1) << (kMaxShift - (i & kMaxShift)));
}
HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(v1, large_shifts));
}
};
struct TestUnsignedRightShifts {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const size_t N = Lanes(d);
auto expected = AllocateAligned<T>(N);
const auto values = Iota(d, 0);
const T kMax = LimitsMax<T>();
constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
// Shift by 0
HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values));
HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0));
// Shift by 1
for (size_t i = 0; i < N; ++i) {
expected[i] = T(T(i & kMax) >> 1);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values));
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1));
// max
for (size_t i = 0; i < N; ++i) {
expected[i] = T(T(i & kMax) >> kMaxShift);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<kMaxShift>(values));
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, kMaxShift));
}
};
struct TestRotateRight {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const size_t N = Lanes(d);
auto expected = AllocateAligned<T>(N);
constexpr size_t kBits = sizeof(T) * 8;
const auto mask_shift = Set(d, T{kBits - 1});  // kBits itself would allow an out-of-range Shl
// Cover as many bit positions as possible to test shifting out
const auto values = Shl(Set(d, T{1}), And(Iota(d, 0), mask_shift));
// Rotate by 0
HWY_ASSERT_VEC_EQ(d, values, RotateRight<0>(values));
// Rotate by 1
Store(values, d, expected.get());
for (size_t i = 0; i < N; ++i) {
expected[i] = (expected[i] >> 1) | (expected[i] << (kBits - 1));
}
HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<1>(values));
// Rotate by half
Store(values, d, expected.get());
for (size_t i = 0; i < N; ++i) {
expected[i] = (expected[i] >> (kBits / 2)) | (expected[i] << (kBits / 2));
}
HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<kBits / 2>(values));
// Rotate by max
Store(values, d, expected.get());
for (size_t i = 0; i < N; ++i) {
expected[i] = (expected[i] >> (kBits - 1)) | (expected[i] << 1);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<kBits - 1>(values));
}
};
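Worked values for the checks above, taking T = uint32_t (kBits = 32):

// RotateRight<1>(1) == 0x80000000: the shifted-out bit wraps to the MSB.
// RotateRight<31>(1) == 2: unlike a shift, rotation never discards bits.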
struct TestVariableUnsignedRightShifts {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const size_t N = Lanes(d);
auto expected = AllocateAligned<T>(N);
const auto v0 = Zero(d);
const auto v1 = Set(d, 1);
const auto values = Iota(d, 0);
const T kMax = LimitsMax<T>();
const auto max = Set(d, kMax);
constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
const auto max_shift = Set(d, kMaxShift);
const auto small_shifts = And(Iota(d, 0), max_shift);
const auto large_shifts = max_shift - small_shifts;
// Same: 0
HWY_ASSERT_VEC_EQ(d, values, Shr(values, v0));
// Same: 1
for (size_t i = 0; i < N; ++i) {
expected[i] = T(T(i & kMax) >> 1);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(values, v1));
// Same: max
HWY_ASSERT_VEC_EQ(d, v0, Shr(values, max_shift));
// Variable: small
for (size_t i = 0; i < N; ++i) {
expected[i] = T(i) >> (i & kMaxShift);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(values, small_shifts));
// Variable: Large
for (size_t i = 0; i < N; ++i) {
expected[i] = kMax >> (kMaxShift - (i & kMaxShift));
}
HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(max, large_shifts));
}
};
template <int kAmount, typename T>
T RightShiftNegative(T val) {
// C++ shifts are implementation-defined for negative numbers, and we have
// seen divisions replaced with shifts, so resort to bit operations.
using TU = hwy::MakeUnsigned<T>;
TU bits;
CopyBytes<sizeof(T)>(&val, &bits);
const TU shifted = TU(bits >> kAmount);
const TU all = TU(~TU(0));
const size_t num_zero = sizeof(TU) * 8 - 1 - kAmount;
const TU sign_extended = static_cast<TU>((all << num_zero) & LimitsMax<TU>());
bits = shifted | sign_extended;
CopyBytes<sizeof(T)>(&bits, &val);
return val;
}
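A worked example of the bit manipulation above, with T = int8_t and kAmount = 1:

// val = -128 -> bits = 0x80; shifted = 0x40; num_zero = 8 - 1 - 1 = 6;
// sign_extended = (0xFF << 6) & 0xFF = 0xC0; 0x40 | 0xC0 = 0xC0 = -64,
// which matches an arithmetic right shift of -128 by one.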
class TestSignedRightShifts {
public:
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const size_t N = Lanes(d);
auto expected = AllocateAligned<T>(N);
constexpr T kMin = LimitsMin<T>();
constexpr T kMax = LimitsMax<T>();
constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
// First test positive values, negative are checked below.
const auto v0 = Zero(d);
const auto values = And(Iota(d, 0), Set(d, kMax));
// Shift by 0
HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values));
HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0));
// Shift by 1
for (size_t i = 0; i < N; ++i) {
expected[i] = T(T(i & kMax) >> 1);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values));
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1));
// max
HWY_ASSERT_VEC_EQ(d, v0, ShiftRight<kMaxShift>(values));
HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(values, kMaxShift));
// Even negative value
Test<0>(kMin, d, __LINE__);
Test<1>(kMin, d, __LINE__);
Test<2>(kMin, d, __LINE__);
Test<kMaxShift>(kMin, d, __LINE__);
const T odd = static_cast<T>(kMin + 1);
Test<0>(odd, d, __LINE__);
Test<1>(odd, d, __LINE__);
Test<2>(odd, d, __LINE__);
Test<kMaxShift>(odd, d, __LINE__);
}
private:
template <int kAmount, typename T, class D>
void Test(T val, D d, int line) {
const auto expected = Set(d, RightShiftNegative<kAmount>(val));
const auto in = Set(d, val);
const char* file = __FILE__;
AssertVecEqual(d, expected, ShiftRight<kAmount>(in), file, line);
AssertVecEqual(d, expected, ShiftRightSame(in, kAmount), file, line);
}
};
struct TestVariableSignedRightShifts {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
using TU = MakeUnsigned<T>;
const size_t N = Lanes(d);
auto expected = AllocateAligned<T>(N);
constexpr T kMin = LimitsMin<T>();
constexpr T kMax = LimitsMax<T>();
constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
// First test positive values, negative are checked below.
const auto v0 = Zero(d);
const auto positive = Iota(d, 0) & Set(d, kMax);
// Shift by 0
HWY_ASSERT_VEC_EQ(d, positive, ShiftRight<0>(positive));
HWY_ASSERT_VEC_EQ(d, positive, ShiftRightSame(positive, 0));
// Shift by 1
for (size_t i = 0; i < N; ++i) {
expected[i] = T(T(i & kMax) >> 1);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(positive));
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(positive, 1));
// max
HWY_ASSERT_VEC_EQ(d, v0, ShiftRight<kMaxShift>(positive));
HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(positive, kMaxShift));
const auto max_shift = Set(d, kMaxShift);
const auto small_shifts = And(Iota(d, 0), max_shift);
const auto large_shifts = max_shift - small_shifts;
const auto negative = Iota(d, kMin);
// Test varying negative to shift
for (size_t i = 0; i < N; ++i) {
expected[i] = RightShiftNegative<1>(static_cast<T>(kMin + i));
}
HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(negative, Set(d, 1)));
// Shift MSB right by small amounts
for (size_t i = 0; i < N; ++i) {
const size_t amount = i & kMaxShift;
const TU shifted = ~((1ull << (kMaxShift - amount)) - 1);
CopyBytes<sizeof(T)>(&shifted, &expected[i]);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(Set(d, kMin), small_shifts));
// Shift MSB right by large amounts
for (size_t i = 0; i < N; ++i) {
const size_t amount = kMaxShift - (i & kMaxShift);
const TU shifted = ~((1ull << (kMaxShift - amount)) - 1);
CopyBytes<sizeof(T)>(&shifted, &expected[i]);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(Set(d, kMin), large_shifts));
}
};
HWY_NOINLINE void TestAllShifts() {
ForUnsignedTypes(ForPartialVectors<TestLeftShifts</*kSigned=*/false>>());
ForSignedTypes(ForPartialVectors<TestLeftShifts</*kSigned=*/true>>());
ForUnsignedTypes(ForPartialVectors<TestUnsignedRightShifts>());
ForSignedTypes(ForPartialVectors<TestSignedRightShifts>());
}
HWY_NOINLINE void TestAllVariableShifts() {
const ForPartialVectors<TestLeftShifts</*kSigned=*/false>> shl_u;
const ForPartialVectors<TestLeftShifts</*kSigned=*/true>> shl_s;
const ForPartialVectors<TestUnsignedRightShifts> shr_u;
const ForPartialVectors<TestSignedRightShifts> shr_s;
shl_u(uint16_t());
shr_u(uint16_t());
shl_u(uint32_t());
shr_u(uint32_t());
shl_s(int16_t());
shr_s(int16_t());
shl_s(int32_t());
shr_s(int32_t());
#if HWY_HAVE_INTEGER64
shl_u(uint64_t());
shr_u(uint64_t());
shl_s(int64_t());
shr_s(int64_t());
#endif
}
HWY_NOINLINE void TestAllRotateRight() {
const ForPartialVectors<TestRotateRight> test;
test(uint32_t());
#if HWY_HAVE_INTEGER64
test(uint64_t());
#endif
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
HWY_BEFORE_TEST(HwyShiftTest);
HWY_EXPORT_AND_TEST_P(HwyShiftTest, TestAllShifts);
HWY_EXPORT_AND_TEST_P(HwyShiftTest, TestAllVariableShifts);
HWY_EXPORT_AND_TEST_P(HwyShiftTest, TestAllRotateRight);
} // namespace hwy
// Ought not to be necessary, but without this, no tests run on RVV.
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
#endif

180 third_party/highway/hwy/tests/swizzle_test.cc (vendored)

@ -19,8 +19,6 @@
#include <array> // IWYU pragma: keep
#include "hwy/base.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/swizzle_test.cc"
#include "hwy/foreach_target.h"
@ -46,48 +44,12 @@ HWY_NOINLINE void TestAllGetLane() {
ForAllTypes(ForPartialVectors<TestGetLane>());
}
struct TestDupEven {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const size_t N = Lanes(d);
auto expected = AllocateAligned<T>(N);
for (size_t i = 0; i < N; ++i) {
expected[i] = static_cast<T>((static_cast<int>(i) & ~1) + 1);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), DupEven(Iota(d, 1)));
}
};
HWY_NOINLINE void TestAllDupEven() {
ForUIF3264(ForShrinkableVectors<TestDupEven>());
}
struct TestDupOdd {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
#if HWY_TARGET != HWY_SCALAR
const size_t N = Lanes(d);
auto expected = AllocateAligned<T>(N);
for (size_t i = 0; i < N; ++i) {
expected[i] = static_cast<T>((static_cast<int>(i) & ~1) + 2);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), DupOdd(Iota(d, 1)));
#else
(void)d;
#endif
}
};
HWY_NOINLINE void TestAllDupOdd() {
ForUIF3264(ForShrinkableVectors<TestDupOdd>());
}
struct TestOddEven {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const size_t N = Lanes(d);
const auto even = Iota(d, 1);
const auto odd = Iota(d, static_cast<T>(1 + N));
const auto odd = Iota(d, 1 + N);
auto expected = AllocateAligned<T>(N);
for (size_t i = 0; i < N; ++i) {
expected[i] = static_cast<T>(1 + i + ((i & 1) ? N : 0));
@ -105,7 +67,7 @@ struct TestOddEvenBlocks {
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const size_t N = Lanes(d);
const auto even = Iota(d, 1);
const auto odd = Iota(d, static_cast<T>(1 + N));
const auto odd = Iota(d, 1 + N);
auto expected = AllocateAligned<T>(N);
for (size_t i = 0; i < N; ++i) {
const size_t idx_block = i / (16 / sizeof(T));
@ -116,7 +78,7 @@ struct TestOddEvenBlocks {
};
HWY_NOINLINE void TestAllOddEvenBlocks() {
ForAllTypes(ForGEVectors<128, TestOddEvenBlocks>());
ForAllTypes(ForShrinkableVectors<TestOddEvenBlocks>());
}
struct TestSwapAdjacentBlocks {
@ -138,7 +100,7 @@ struct TestSwapAdjacentBlocks {
};
HWY_NOINLINE void TestAllSwapAdjacentBlocks() {
ForAllTypes(ForGEVectors<128, TestSwapAdjacentBlocks>());
ForAllTypes(ForPartialVectors<TestSwapAdjacentBlocks>());
}
struct TestTableLookupLanes {
@ -235,131 +197,23 @@ struct TestReverse {
}
};
struct TestReverse2 {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const size_t N = Lanes(d);
const RebindToUnsigned<D> du; // Iota does not support float16_t.
const auto v = BitCast(d, Iota(du, 1));
auto expected = AllocateAligned<T>(N);
// Can't set float16_t value directly, need to permute in memory.
auto copy = AllocateAligned<T>(N);
Store(v, d, copy.get());
for (size_t i = 0; i < N; ++i) {
expected[i] = copy[i ^ 1];
}
HWY_ASSERT_VEC_EQ(d, expected.get(), Reverse2(d, v));
}
};
struct TestReverse4 {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const size_t N = Lanes(d);
const RebindToUnsigned<D> du; // Iota does not support float16_t.
const auto v = BitCast(d, Iota(du, 1));
auto expected = AllocateAligned<T>(N);
// Can't set float16_t value directly, need to permute in memory.
auto copy = AllocateAligned<T>(N);
Store(v, d, copy.get());
for (size_t i = 0; i < N; ++i) {
expected[i] = copy[i ^ 3];
}
HWY_ASSERT_VEC_EQ(d, expected.get(), Reverse4(d, v));
}
};
struct TestReverse8 {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const size_t N = Lanes(d);
const RebindToUnsigned<D> du; // Iota does not support float16_t.
const auto v = BitCast(d, Iota(du, 1));
auto expected = AllocateAligned<T>(N);
// Can't set float16_t value directly, need to permute in memory.
auto copy = AllocateAligned<T>(N);
Store(v, d, copy.get());
for (size_t i = 0; i < N; ++i) {
expected[i] = copy[i ^ 7];
}
HWY_ASSERT_VEC_EQ(d, expected.get(), Reverse8(d, v));
}
};
HWY_NOINLINE void TestAllReverse() {
// 8-bit is not supported because RISC-V uses rgather of Lanes - Iota,
// which requires 16 bits.
ForUIF163264(ForPartialVectors<TestReverse>());
}
HWY_NOINLINE void TestAllReverse2() {
// 8-bit is not supported because RISC-V uses rgather of Lanes - Iota,
// which requires 16 bits.
ForUIF64(ForGEVectors<128, TestReverse2>());
ForUIF32(ForGEVectors<64, TestReverse2>());
ForUIF16(ForGEVectors<32, TestReverse2>());
}
HWY_NOINLINE void TestAllReverse4() {
// 8-bit is not supported because RISC-V uses rgather of Lanes - Iota,
// which requires 16 bits.
ForUIF64(ForGEVectors<256, TestReverse4>());
ForUIF32(ForGEVectors<128, TestReverse4>());
ForUIF16(ForGEVectors<64, TestReverse4>());
}
HWY_NOINLINE void TestAllReverse8() {
// 8-bit is not supported because RISC-V uses rgather of Lanes - Iota,
// which requires 16 bits.
ForUIF64(ForGEVectors<512, TestReverse8>());
ForUIF32(ForGEVectors<256, TestReverse8>());
ForUIF16(ForGEVectors<128, TestReverse8>());
}
struct TestReverseBlocks {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const size_t N = Lanes(d);
const RebindToUnsigned<D> du; // Iota does not support float16_t.
const auto v = BitCast(d, Iota(du, 1));
auto expected = AllocateAligned<T>(N);
constexpr size_t kLanesPerBlock = 16 / sizeof(T);
const size_t num_blocks = N / kLanesPerBlock;
HWY_ASSERT(num_blocks != 0);
// Can't set float16_t value directly, need to permute in memory.
auto copy = AllocateAligned<T>(N);
Store(v, d, copy.get());
for (size_t i = 0; i < N; ++i) {
const size_t idx_block = i / kLanesPerBlock;
const size_t base = (num_blocks - 1 - idx_block) * kLanesPerBlock;
expected[i] = copy[base + (i % kLanesPerBlock)];
}
HWY_ASSERT_VEC_EQ(d, expected.get(), ReverseBlocks(d, v));
}
};
HWY_NOINLINE void TestAllReverseBlocks() {
ForAllTypes(ForGEVectors<128, TestReverseBlocks>());
}
class TestCompress {
template <class D, class DI, typename T = TFromD<D>, typename TI = TFromD<DI>>
void CheckStored(D d, DI di, size_t expected_pos, size_t actual_pos,
const AlignedFreeUniquePtr<T[]>& in,
template <typename T, typename TI, size_t N>
void CheckStored(Simd<T, N> d, Simd<TI, N> di, size_t expected_pos,
size_t actual_pos, const AlignedFreeUniquePtr<T[]>& in,
const AlignedFreeUniquePtr<TI[]>& mask_lanes,
const AlignedFreeUniquePtr<T[]>& expected, const T* actual_u,
int line) {
if (expected_pos != actual_pos) {
hwy::Abort(
__FILE__, line,
"Size mismatch for %s: expected %" PRIu64 ", actual %" PRIu64 "\n",
TypeName(T(), Lanes(d)).c_str(), static_cast<uint64_t>(expected_pos),
static_cast<uint64_t>(actual_pos));
hwy::Abort(__FILE__, line,
"Size mismatch for %s: expected %" PRIu64 ", actual %" PRIu64 "\n",
TypeName(T(), N).c_str(), static_cast<uint64_t>(expected_pos),
static_cast<uint64_t>(actual_pos));
}
// Upper lanes are undefined. Modified from AssertVecEqual.
for (size_t i = 0; i < expected_pos; ++i) {
@ -368,7 +222,6 @@ class TestCompress {
"Mismatch at i=%" PRIu64 " of %" PRIu64 ", line %d:\n\n",
static_cast<uint64_t>(i), static_cast<uint64_t>(expected_pos),
line);
const size_t N = Lanes(d);
Print(di, "mask", Load(di, mask_lanes.get()), 0, N);
Print(d, "in", Load(d, in.get()), 0, N);
Print(d, "expect", Load(d, expected.get()), 0, N);
@ -398,10 +251,7 @@ class TestCompress {
auto expected = AllocateAligned<T>(N);
auto actual_a = AllocateAligned<T>(misalign + N);
T* actual_u = actual_a.get() + misalign;
const size_t bits_size = RoundUpTo((N + 7) / 8, 8);
auto bits = AllocateAligned<uint8_t>(bits_size);
memset(bits.get(), 0, bits_size); // for MSAN
auto bits = AllocateAligned<uint8_t>(HWY_MAX(8, (N + 7) / 8));
// Each lane should have a chance of having mask=true.
for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
@ -615,7 +465,7 @@ HWY_NOINLINE void TestAllCompress() {
test(uint16_t());
test(int16_t());
#if HWY_HAVE_FLOAT16
#if HWY_CAP_FLOAT16
test(float16_t());
#endif
@ -632,17 +482,11 @@ HWY_AFTER_NAMESPACE();
namespace hwy {
HWY_BEFORE_TEST(HwySwizzleTest);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllGetLane);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllDupEven);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllDupOdd);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllOddEven);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllOddEvenBlocks);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllSwapAdjacentBlocks);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllTableLookupLanes);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllReverse);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllReverse2);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllReverse4);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllReverse8);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllReverseBlocks);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllCompress);
} // namespace hwy

378 third_party/highway/hwy/tests/test_util-inl.h (vendored)

@ -41,7 +41,7 @@ HWY_NOINLINE void PrintValue(T value) {
fprintf(stderr, "0x%02X,", byte);
}
#if HWY_HAVE_FLOAT16
#if HWY_CAP_FLOAT16
HWY_NOINLINE void PrintValue(float16_t value) {
uint16_t bits;
CopyBytes<2>(&value, &bits);
@ -70,11 +70,9 @@ void Print(const D d, const char* caption, VecArg<V> v, size_t lane_u = 0,
}
// Compare expected vector to vector.
// HWY_INLINE works around a Clang SVE compiler bug where all but the first
// 128 bits (the NEON register) of actual are zero.
template <class D, typename T = TFromD<D>, class V = Vec<D>>
HWY_INLINE void AssertVecEqual(D d, const T* expected, VecArg<V> actual,
const char* filename, const int line) {
void AssertVecEqual(D d, const T* expected, VecArg<V> actual,
const char* filename, const int line) {
const size_t N = Lanes(d);
auto actual_lanes = AllocateAligned<T>(N);
Store(actual, d, actual_lanes.get());
@ -86,11 +84,9 @@ HWY_INLINE void AssertVecEqual(D d, const T* expected, VecArg<V> actual,
}
// Compare expected lanes to vector.
// HWY_INLINE works around a Clang SVE compiler bug where all but the first
// 128 bits (the NEON register) of actual are zero.
template <class D, typename T = TFromD<D>, class V = Vec<D>>
HWY_INLINE void AssertVecEqual(D d, VecArg<V> expected, VecArg<V> actual,
const char* filename, int line) {
HWY_NOINLINE void AssertVecEqual(D d, VecArg<V> expected, VecArg<V> actual,
const char* filename, int line) {
auto expected_lanes = AllocateAligned<T>(Lanes(d));
Store(expected, d, expected_lanes.get());
AssertVecEqual(d, expected_lanes.get(), actual, filename, line);
@ -100,10 +96,7 @@ HWY_INLINE void AssertVecEqual(D d, VecArg<V> expected, VecArg<V> actual,
template <class D>
HWY_NOINLINE void AssertMaskEqual(D d, VecArg<Mask<D>> a, VecArg<Mask<D>> b,
const char* filename, int line) {
// lvalues prevented MSAN failure in farm_sve.
const Vec<D> va = VecFromMask(d, a);
const Vec<D> vb = VecFromMask(d, b);
AssertVecEqual(d, va, vb, filename, line);
AssertVecEqual(d, VecFromMask(d, a), VecFromMask(d, b), filename, line);
const char* target_name = hwy::TargetName(HWY_TARGET);
AssertEqual(CountTrue(d, a), CountTrue(d, b), target_name, filename, line);
@ -185,269 +178,169 @@ HWY_INLINE Mask<D> MaskFalse(const D d) {
// Helpers for instantiating tests with combinations of lane types / counts.
// Calls Test for each CappedTag<T, N> where N is in [kMinLanes, kMul * kMinArg]
// and the resulting Lanes() is in [min_lanes, max_lanes]. The upper bound
// is required to ensure capped vectors remain extendable. Implemented by
// recursively halving kMul until it is zero.
template <typename T, size_t kMul, size_t kMinArg, class Test>
struct ForeachCappedR {
static void Do(size_t min_lanes, size_t max_lanes) {
const CappedTag<T, kMul * kMinArg> d;
// For ensuring we do not call tests with D such that widening D results in 0
// lanes. Example: assume T=u32, VLEN=256, and fraction=1/8: there is no 1/8th
// of a u64 vector in this case.
template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)>
HWY_INLINE size_t PromotedLanes(const D d) {
return Lanes(RepartitionToWide<decltype(d)>());
}
// Already the widest possible T, cannot widen.
template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
HWY_INLINE size_t PromotedLanes(const D d) {
return Lanes(d);
}
// If we already don't have enough lanes, stop.
const size_t lanes = Lanes(d);
if (lanes < min_lanes) return;
// For all power of two N in [kMinLanes, kMul * kMinLanes] (so that recursion
// stops at kMul == 0). Note that N may be capped or a fraction.
template <typename T, size_t kMul, size_t kMinLanes, class Test,
bool kPromote = false>
struct ForeachSizeR {
static void Do() {
const Simd<T, kMul * kMinLanes> d;
if (lanes <= max_lanes) {
Test()(T(), d);
}
ForeachCappedR<T, kMul / 2, kMinArg, Test>::Do(min_lanes, max_lanes);
// Skip invalid fractions (e.g. 1/8th of u32x4).
const size_t lanes = kPromote ? PromotedLanes(d) : Lanes(d);
if (lanes < kMinLanes) return;
Test()(T(), d);
static_assert(kMul != 0, "Recursion should have ended already");
ForeachSizeR<T, kMul / 2, kMinLanes, Test, kPromote>::Do();
}
};
// Base case to stop the recursion.
template <typename T, size_t kMinArg, class Test>
struct ForeachCappedR<T, 0, kMinArg, Test> {
static void Do(size_t, size_t) {}
template <typename T, size_t kMinLanes, class Test, bool kPromote>
struct ForeachSizeR<T, 0, kMinLanes, Test, kPromote> {
static void Do() {}
};
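Both ForeachCappedR and ForeachSizeR use the same compile-time pattern; reduced to a minimal sketch (names are illustrative):

template <size_t kMul>
struct HalvingR {
  static void Do() {
    // ... construct the tag for kMul lanes and run Test here ...
    HalvingR<kMul / 2>::Do();  // integer division reaches 0 and stops
  }
};
template <>
struct HalvingR<0> {  // explicit specialization ends the recursion
  static void Do() {}
};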
#if HWY_HAVE_SCALABLE
constexpr int MinVectorSize() {
#if HWY_TARGET == HWY_RVV
// Actually 16 for the application processor profile, but the intrinsics are
// defined as if VLEN might be only 64: there is no vuint64mf2_t.
return 8;
#else
return 16;
#endif
}
template <typename T>
constexpr int MinPow2() {
// Highway follows RVV LMUL in that the smallest fraction is 1/8th (encoded
// as kPow2 == -3). The fraction also must not result in zero lanes for the
// smallest possible vector size.
return HWY_MAX(-3, -static_cast<int>(CeilLog2(MinVectorSize() / sizeof(T))));
}
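Worked example of the bound:

// e.g. T = uint32_t with MinVectorSize() == 16: CeilLog2(16 / 4) == 2, so
// MinPow2() == HWY_MAX(-3, -2) == -2, i.e. quarter vectors are the smallest.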
// Iterates kPow2 upward through +3.
template <typename T, int kPow2, int kAddPow2, class Test>
struct ForeachShiftR {
static void Do(size_t min_lanes) {
const ScalableTag<T, kPow2 + kAddPow2> d;
// Precondition: [kPow2, 3] + kAddPow2 is a valid fraction of the minimum
// vector size, so we always have enough lanes, except ForGEVectors.
if (Lanes(d) >= min_lanes) {
Test()(T(), d);
} else {
fprintf(stderr, "%d lanes < %d: T=%d pow=%d\n",
static_cast<int>(Lanes(d)), static_cast<int>(min_lanes),
static_cast<int>(sizeof(T)), kPow2 + kAddPow2);
HWY_ASSERT(min_lanes != 1);
}
ForeachShiftR<T, kPow2 + 1, kAddPow2, Test>::Do(min_lanes);
}
};
// Base case to stop the recursion.
template <typename T, int kAddPow2, class Test>
struct ForeachShiftR<T, 4, kAddPow2, Test> {
static void Do(size_t) {}
};
#else
// ForeachCappedR already handled all possible sizes.
#endif // HWY_HAVE_SCALABLE
// These adapters may be called directly, or via For*Types:
// Calls Test for all power of two N in [1, Lanes(d) >> kPow2]. This is for
// Calls Test for all power of two N in [1, Lanes(d) / kFactor]. This is for
// ops that widen their input, e.g. Combine (not supported by HWY_SCALAR).
template <class Test, int kPow2 = 1>
template <class Test, size_t kFactor = 2>
struct ForExtendableVectors {
template <typename T>
void operator()(T /*unused*/) const {
constexpr size_t kMaxCapped = HWY_LANES(T);
// Skip CappedTag that are already full vectors.
const size_t max_lanes = Lanes(ScalableTag<T>()) >> kPow2;
(void)kMaxCapped;
(void)max_lanes;
#if HWY_TARGET == HWY_SCALAR
// not supported
#else
ForeachCappedR<T, (kMaxCapped >> kPow2), 1, Test>::Do(1, max_lanes);
constexpr bool kPromote = true;
#if HWY_TARGET == HWY_RVV
// For each [MinPow2, 3 - kPow2]; counter is [MinPow2 + kPow2, 3].
ForeachShiftR<T, MinPow2<T>() + kPow2, -kPow2, Test>::Do(1);
#elif HWY_HAVE_SCALABLE
// For each [MinPow2, 0 - kPow2]; counter is [MinPow2 + kPow2 + 3, 3].
ForeachShiftR<T, MinPow2<T>() + kPow2 + 3, -kPow2 - 3, Test>::Do(1);
ForeachSizeR<T, 8 / kFactor, HWY_LANES(T), Test, kPromote>::Do();
// TODO(janwas): also capped
// ForeachSizeR<T, (16 / sizeof(T)) / kFactor, 1, Test, kPromote>::Do();
#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2
// Capped
ForeachSizeR<T, (16 / sizeof(T)) / kFactor, 1, Test, kPromote>::Do();
// Fractions
ForeachSizeR<T, 8 / kFactor, HWY_LANES(T) / 8, Test, kPromote>::Do();
#else
ForeachSizeR<T, HWY_LANES(T) / kFactor, 1, Test, kPromote>::Do();
#endif
#endif // HWY_SCALAR
}
};
// Calls Test for all power of two N in [1 << kPow2, Lanes(d)]. This is for ops
// Calls Test for all power of two N in [kFactor, Lanes(d)]. This is for ops
// that narrow their input, e.g. UpperHalf.
template <class Test, int kPow2 = 1>
template <class Test, size_t kFactor = 2>
struct ForShrinkableVectors {
template <typename T>
void operator()(T /*unused*/) const {
constexpr size_t kMinLanes = size_t{1} << kPow2;
constexpr size_t kMaxCapped = HWY_LANES(T);
// For shrinking, an upper limit is unnecessary.
constexpr size_t max_lanes = kMaxCapped;
(void)kMinLanes;
(void)max_lanes;
#if HWY_TARGET == HWY_SCALAR
// not supported
#elif HWY_TARGET == HWY_RVV
ForeachSizeR<T, 8 / kFactor, kFactor * HWY_LANES(T), Test>::Do();
// TODO(janwas): also capped
#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2
// Capped
ForeachSizeR<T, (16 / sizeof(T)) / kFactor, kFactor, Test>::Do();
// Fractions
ForeachSizeR<T, 8 / kFactor, kFactor * HWY_LANES(T) / 8, Test>::Do();
#elif HWY_TARGET == HWY_SCALAR
// not supported
#else
ForeachCappedR<T, (kMaxCapped >> kPow2), kMinLanes, Test>::Do(kMinLanes,
max_lanes);
#if HWY_TARGET == HWY_RVV
// For each [MinPow2 + kPow2, 3]; counter is [MinPow2 + kPow2, 3].
ForeachShiftR<T, MinPow2<T>() + kPow2, 0, Test>::Do(kMinLanes);
#elif HWY_HAVE_SCALABLE
// For each [MinPow2 + kPow2, 0]; counter is [MinPow2 + kPow2 + 3, 3].
ForeachShiftR<T, MinPow2<T>() + kPow2 + 3, -3, Test>::Do(kMinLanes);
ForeachSizeR<T, HWY_LANES(T) / kFactor, kFactor, Test>::Do();
#endif
#endif // HWY_TARGET == HWY_SCALAR
}
};
// Calls Test for all supported power of two vectors of at least kMinBits.
// Examples: AES or 64x64 require 128 bits, casts may require 64 bits.
template <size_t kMinBits, class Test>
struct ForGEVectors {
// Calls Test for all power of two N in [16 / sizeof(T), Lanes(d)]. This is for
// ops that require at least 128 bits, e.g. AES or 64x64 = 128 mul.
template <class Test>
struct ForGE128Vectors {
template <typename T>
void operator()(T /*unused*/) const {
constexpr size_t kMaxCapped = HWY_LANES(T);
constexpr size_t kMinLanes = kMinBits / 8 / sizeof(T);
// An upper limit is unnecessary.
constexpr size_t max_lanes = kMaxCapped;
(void)max_lanes;
#if HWY_TARGET == HWY_SCALAR
(void)kMinLanes; // not supported
// not supported
#elif HWY_TARGET == HWY_RVV
ForeachSizeR<T, 8, HWY_LANES(T), Test>::Do();
// TODO(janwas): also capped
// ForeachSizeR<T, 1, (16 / sizeof(T)), Test>::Do();
#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2
// Capped
ForeachSizeR<T, 1, 16 / sizeof(T), Test>::Do();
// Fractions
ForeachSizeR<T, 8, HWY_LANES(T) / 8, Test>::Do();
#else
ForeachCappedR<T, HWY_LANES(T) / kMinLanes, kMinLanes, Test>::Do(kMinLanes,
max_lanes);
#if HWY_TARGET == HWY_RVV
// Can be 0 (handled below) if kMinBits > 64.
constexpr size_t kRatio = MinVectorSize() * 8 / kMinBits;
constexpr int kMinPow2 =
kRatio == 0 ? 0 : -static_cast<int>(CeilLog2(kRatio));
// For each [kMinPow2, 3]; counter is [kMinPow2, 3].
ForeachShiftR<T, kMinPow2, 0, Test>::Do(kMinLanes);
#elif HWY_HAVE_SCALABLE
// Can be 0 (handled below) if kMinBits > 128.
constexpr size_t kRatio = MinVectorSize() * 8 / kMinBits;
constexpr int kMinPow2 =
kRatio == 0 ? 0 : -static_cast<int>(CeilLog2(kRatio));
// For each [kMinPow2, 0]; counter is [kMinPow2 + 3, 3].
ForeachShiftR<T, kMinPow2 + 3, -3, Test>::Do(kMinLanes);
ForeachSizeR<T, HWY_LANES(T) / (16 / sizeof(T)), (16 / sizeof(T)),
Test>::Do();
#endif
#endif // HWY_TARGET == HWY_SCALAR
}
};
// Calls Test for all power of two N in [8 / sizeof(T), Lanes(d)]. This is for
// ops that require at least 64 bits, e.g. casts.
template <class Test>
using ForGE128Vectors = ForGEVectors<128, Test>;
struct ForGE64Vectors {
template <typename T>
void operator()(T /*unused*/) const {
#if HWY_TARGET == HWY_SCALAR
// not supported
#elif HWY_TARGET == HWY_RVV
ForeachSizeR<T, 8, HWY_LANES(T), Test>::Do();
// TODO(janwas): also capped
// ForeachSizeR<T, 1, (8 / sizeof(T)), Test>::Do();
#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2
// Capped
ForeachSizeR<T, 1, 8 / sizeof(T), Test>::Do();
// Fractions
ForeachSizeR<T, 8, HWY_LANES(T) / 8, Test>::Do();
#else
ForeachSizeR<T, HWY_LANES(T) / (8 / sizeof(T)), (8 / sizeof(T)),
Test>::Do();
#endif
}
};
// Calls Test for all N that can be promoted (not the same as Extendable because
// HWY_SCALAR has one lane). Also used for ZipLower, but not ZipUpper.
template <class Test, int kPow2 = 1>
template <class Test, size_t kFactor = 2>
struct ForPromoteVectors {
template <typename T>
void operator()(T /*unused*/) const {
constexpr size_t kFactor = size_t{1} << kPow2;
static_assert(kFactor >= 2 && kFactor * sizeof(T) <= sizeof(uint64_t), "");
constexpr size_t kMaxCapped = HWY_LANES(T);
constexpr size_t kMinLanes = kFactor;
// Skip CappedTag that are already full vectors.
const size_t max_lanes = Lanes(ScalableTag<T>()) >> kPow2;
(void)kMaxCapped;
(void)kMinLanes;
(void)max_lanes;
#if HWY_TARGET == HWY_SCALAR
ForeachCappedR<T, 1, 1, Test>::Do(1, 1);
ForeachSizeR<T, 1, 1, Test, /*kPromote=*/true>::Do();
#else
// TODO(janwas): call Extendable if kMinLanes check not required?
ForeachCappedR<T, (kMaxCapped >> kPow2), 1, Test>::Do(kMinLanes, max_lanes);
#if HWY_TARGET == HWY_RVV
// For each [MinPow2, 3 - kPow2]; counter is [MinPow2 + kPow2, 3].
ForeachShiftR<T, MinPow2<T>() + kPow2, -kPow2, Test>::Do(kMinLanes);
#elif HWY_HAVE_SCALABLE
// For each [MinPow2, 0 - kPow2]; counter is [MinPow2 + kPow2 + 3, 3].
ForeachShiftR<T, MinPow2<T>() + kPow2 + 3, -kPow2 - 3, Test>::Do(kMinLanes);
return ForExtendableVectors<Test, kFactor>()(T());
#endif
#endif // HWY_SCALAR
}
};
// Calls Test for all N that can be demoted (not the same as Shrinkable because
// HWY_SCALAR has one lane).
template <class Test, int kPow2 = 1>
// HWY_SCALAR has one lane). Also used for LowerHalf, but not UpperHalf.
template <class Test, size_t kFactor = 2>
struct ForDemoteVectors {
template <typename T>
void operator()(T /*unused*/) const {
constexpr size_t kMinLanes = size_t{1} << kPow2;
constexpr size_t kMaxCapped = HWY_LANES(T);
// For shrinking, an upper limit is unnecessary.
constexpr size_t max_lanes = kMaxCapped;
(void)kMinLanes;
(void)max_lanes;
#if HWY_TARGET == HWY_SCALAR
ForeachCappedR<T, 1, 1, Test>::Do(1, 1);
ForeachSizeR<T, 1, 1, Test>::Do();
#else
ForeachCappedR<T, (kMaxCapped >> kPow2), kMinLanes, Test>::Do(kMinLanes,
max_lanes);
// TODO(janwas): call Extendable if kMinLanes check not required?
#if HWY_TARGET == HWY_RVV
// For each [MinPow2 + kPow2, 3]; counter is [MinPow2 + kPow2, 3].
ForeachShiftR<T, MinPow2<T>() + kPow2, 0, Test>::Do(kMinLanes);
#elif HWY_HAVE_SCALABLE
// For each [MinPow2 + kPow2, 0]; counter is [MinPow2 + kPow2 + 3, 3].
ForeachShiftR<T, MinPow2<T>() + kPow2 + 3, -3, Test>::Do(kMinLanes);
return ForShrinkableVectors<Test, kFactor>()(T());
#endif
#endif // HWY_TARGET == HWY_SCALAR
}
};
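
The demotion counterpart follows the same shape; a sketch with a hypothetical functor:

struct TestDemoteToI8 {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D from_d) {
    const Rebind<int8_t, D> to_d;  // same lane count, half the lane width
    // Values within int8_t range survive the (saturating) demotion unchanged.
    HWY_ASSERT_VEC_EQ(to_d, Set(to_d, int8_t{100}),
                      DemoteTo(to_d, Set(from_d, T{100})));
  }
};

HWY_NOINLINE void TestAllDemoteToI8() {
  ForDemoteVectors<TestDemoteToI8>()(int16_t());
}
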
// For LowerHalf/Quarter.
template <class Test, int kPow2 = 1>
struct ForHalfVectors {
template <typename T>
void operator()(T /*unused*/) const {
constexpr size_t kMinLanes = size_t{1} << kPow2;
constexpr size_t kMaxCapped = HWY_LANES(T);
// For shrinking, an upper limit is unnecessary.
constexpr size_t max_lanes = kMaxCapped;
(void)kMinLanes;
(void)max_lanes;
#if HWY_TARGET == HWY_SCALAR
ForeachCappedR<T, 1, 1, Test>::Do(1, 1);
#else
// ForeachCappedR<T, (kMaxCapped >> kPow2), kMinLanes, Test>::Do(kMinLanes,
// max_lanes);
// TODO(janwas): call Extendable if kMinLanes check not required?
#if HWY_TARGET == HWY_RVV
// For each [MinPow2 + kPow2, 3]; counter is [MinPow2 + kPow2, 3].
ForeachShiftR<T, MinPow2<T>() + kPow2, 0, Test>::Do(kMinLanes);
#elif HWY_HAVE_SCALABLE
// For each [MinPow2 + kPow2, 0]; counter is [MinPow2 + kPow2 + 3, 3].
ForeachShiftR<T, MinPow2<T>() + kPow2 + 3, -3, Test>::Do(kMinLanes);
#endif
#endif // HWY_TARGET == HWY_SCALAR
}
};
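
A sketch of the LowerHalf use case this adapter targets (hypothetical functor; assumes the single-argument LowerHalf overload):

struct TestLowerHalfSketch {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const Half<D> dh;  // descriptor with half as many lanes
    // The lower half of [1, 2, ..., N] is [1, 2, ..., N/2].
    HWY_ASSERT_VEC_EQ(dh, Iota(dh, 1), LowerHalf(Iota(d, 1)));
  }
};

HWY_NOINLINE void TestAllLowerHalfSketch() {
  ForHalfVectors<TestLowerHalfSketch>()(uint32_t());
}
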
@@ -457,7 +350,7 @@ template <class Test>
struct ForPartialVectors {
template <typename T>
void operator()(T t) const {
ForExtendableVectors<Test, 0>()(t);
ForExtendableVectors<Test, 1>()(t);
}
};
@@ -468,7 +361,7 @@ void ForSignedTypes(const Func& func) {
func(int8_t());
func(int16_t());
func(int32_t());
#if HWY_HAVE_INTEGER64
#if HWY_CAP_INTEGER64
func(int64_t());
#endif
}
@@ -478,7 +371,7 @@ void ForUnsignedTypes(const Func& func) {
func(uint8_t());
func(uint16_t());
func(uint32_t());
#if HWY_HAVE_INTEGER64
#if HWY_CAP_INTEGER64
func(uint64_t());
#endif
}
@@ -492,7 +385,7 @@ void ForIntegerTypes(const Func& func) {
template <class Func>
void ForFloatTypes(const Func& func) {
func(float());
#if HWY_HAVE_FLOAT64
#if HWY_CAP_FLOAT64
func(double());
#endif
}
@@ -504,49 +397,32 @@ void ForAllTypes(const Func& func) {
}
template <class Func>
void ForUIF16(const Func& func) {
func(uint16_t());
func(int16_t());
#if HWY_HAVE_FLOAT16
func(float16_t());
#endif
}
template <class Func>
void ForUIF32(const Func& func) {
void ForUIF3264(const Func& func) {
func(uint32_t());
func(int32_t());
func(float());
}
template <class Func>
void ForUIF64(const Func& func) {
#if HWY_HAVE_INTEGER64
#if HWY_CAP_INTEGER64
func(uint64_t());
func(int64_t());
#endif
#if HWY_HAVE_FLOAT64
func(double());
#endif
}
template <class Func>
void ForUIF3264(const Func& func) {
ForUIF32(func);
ForUIF64(func);
ForFloatTypes(func);
}
template <class Func>
void ForUIF163264(const Func& func) {
ForUIF16(func);
ForUIF3264(func);
func(uint16_t());
func(int16_t());
#if HWY_CAP_FLOAT16
func(float16_t());
#endif
}
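
These type-list helpers compose with the vector adapters above; a typical entry point (hypothetical names) dispatches over both lane types and lane counts:

struct TestAddIdentity {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto v = Iota(d, 1);
    // Adding zero is the identity for every lane type visited here.
    HWY_ASSERT_VEC_EQ(d, v, Add(v, Zero(d)));
  }
};

HWY_NOINLINE void TestAllAddIdentity() {
  ForUIF3264(ForPartialVectors<TestAddIdentity>());
}
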
// For tests that involve loops, adjust the trip count so that emulated tests
// finish quickly (but always at least 2 iterations to ensure some diversity).
constexpr size_t AdjustedReps(size_t max_reps) {
#if HWY_ARCH_RVV
return HWY_MAX(max_reps / 32, 2);
return HWY_MAX(max_reps / 16, 2);
#elif HWY_ARCH_ARM
return HWY_MAX(max_reps / 4, 2);
#elif HWY_IS_DEBUG_BUILD
@@ -556,20 +432,6 @@ constexpr size_t AdjustedReps(size_t max_reps) {
#endif
}
// Same as above, but the loop trip count will be 1 << max_pow2.
constexpr size_t AdjustedLog2Reps(size_t max_pow2) {
// If "negative" (unsigned wraparound), use original.
#if HWY_ARCH_RVV
return HWY_MIN(max_pow2 - 4, max_pow2);
#elif HWY_ARCH_ARM
return HWY_MIN(max_pow2 - 1, max_pow2);
#elif HWY_IS_DEBUG_BUILD
return HWY_MIN(max_pow2 - 1, max_pow2);
#else
return max_pow2;
#endif
}
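
Usage sketch (hypothetical loop bodies): both helpers bound test loops so that emulated targets such as RVV under QEMU finish in reasonable time.

for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
  // ... randomized test body: up to 1000 iterations natively, fewer when
  // emulated, but never fewer than 2 ...
}
for (size_t rep = 0; rep < (size_t{1} << AdjustedLog2Reps(10)); ++rep) {
  // ... same idea for power-of-two trip counts ...
}
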
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy

third_party/highway/hwy/tests/test_util.cc (vendored, 3 changes)

@@ -30,6 +30,9 @@ bool BytesEqual(const void* p1, const void* p2, const size_t size,
const uint8_t* bytes2 = reinterpret_cast<const uint8_t*>(p2);
for (size_t i = 0; i < size; ++i) {
if (bytes1[i] != bytes2[i]) {
fprintf(stderr, "Mismatch at byte %" PRIu64 " of %" PRIu64 ": %d != %d\n",
static_cast<uint64_t>(i), static_cast<uint64_t>(size), bytes1[i],
bytes2[i]);
if (pos != nullptr) {
*pos = i;
}
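
The optional pos output keeps caller-side reporting simple; a hedged usage sketch (hypothetical buffers):

uint8_t a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
uint8_t b[8] = {1, 2, 3, 9, 5, 6, 7, 8};
size_t pos = 0;
if (!hwy::BytesEqual(a, b, sizeof(a), &pos)) {
  fprintf(stderr, "First mismatch at byte %d\n", static_cast<int>(pos));  // 3
}
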

third_party/highway/hwy/tests/test_util.h (vendored, 39 changes)

@@ -26,7 +26,6 @@
#include "hwy/aligned_allocator.h"
#include "hwy/base.h"
#include "hwy/highway.h"
#include "hwy/highway_export.h"
namespace hwy {
@@ -68,7 +67,9 @@ static HWY_INLINE uint32_t Random32(RandomState* rng) {
return static_cast<uint32_t>((*rng)());
}
static HWY_INLINE uint64_t Random64(RandomState* rng) { return (*rng)(); }
static HWY_INLINE uint64_t Random64(RandomState* rng) {
return (*rng)();
}
// Prevents the compiler from eliding the computations that led to "output".
// Works by indicating to the compiler that "output" is being read and modified.
@@ -83,8 +84,8 @@ inline void PreventElision(T&& output) {
#endif // HWY_COMPILER_MSVC
}
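
A short sketch of why this matters in benchmarks (hypothetical fragment): without the call, the compiler may delete the timed loop entirely because its result is otherwise unused.

uint64_t sum = 0;
for (size_t i = 0; i < 1000; ++i) {
  sum += i * i;  // the work being timed
}
hwy::PreventElision(sum);  // compels the compiler to keep the computation
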
HWY_TEST_DLLEXPORT bool BytesEqual(const void* p1, const void* p2,
const size_t size, size_t* pos = nullptr);
bool BytesEqual(const void* p1, const void* p2, const size_t size,
size_t* pos = nullptr);
void AssertStringEqual(const char* expected, const char* actual,
const char* target_name, const char* filename, int line);
@@ -128,25 +129,25 @@ HWY_INLINE TypeInfo MakeTypeInfo() {
return info;
}
HWY_TEST_DLLEXPORT bool IsEqual(const TypeInfo& info, const void* expected_ptr,
const void* actual_ptr);
bool IsEqual(const TypeInfo& info, const void* expected_ptr,
const void* actual_ptr);
HWY_TEST_DLLEXPORT void TypeName(const TypeInfo& info, size_t N, char* string100);
void TypeName(const TypeInfo& info, size_t N, char* string100);
HWY_TEST_DLLEXPORT void PrintArray(const TypeInfo& info, const char* caption,
const void* array_void, size_t N,
size_t lane_u = 0, size_t max_lanes = 7);
void PrintArray(const TypeInfo& info, const char* caption,
const void* array_void, size_t N, size_t lane_u = 0,
size_t max_lanes = 7);
HWY_TEST_DLLEXPORT HWY_NORETURN void PrintMismatchAndAbort(
const TypeInfo& info, const void* expected_ptr, const void* actual_ptr,
const char* target_name, const char* filename, int line, size_t lane = 0,
size_t num_lanes = 1);
HWY_NORETURN void PrintMismatchAndAbort(const TypeInfo& info,
const void* expected_ptr,
const void* actual_ptr,
const char* target_name,
const char* filename, int line,
size_t lane = 0, size_t num_lanes = 1);
HWY_TEST_DLLEXPORT void AssertArrayEqual(const TypeInfo& info,
const void* expected_void,
const void* actual_void, size_t N,
const char* target_name,
const char* filename, int line);
void AssertArrayEqual(const TypeInfo& info, const void* expected_void,
const void* actual_void, size_t N,
const char* target_name, const char* filename, int line);
} // namespace detail


@@ -52,10 +52,10 @@ HWY_NOINLINE void TestAllName() { ForAllTypes(ForPartialVectors<TestName>()); }
struct TestEqualInteger {
template <class T>
HWY_NOINLINE void operator()(T /*t*/) const {
HWY_ASSERT_EQ(T(0), T(0));
HWY_ASSERT_EQ(T(1), T(1));
HWY_ASSERT_EQ(T(-1), T(-1));
HWY_ASSERT_EQ(LimitsMin<T>(), LimitsMin<T>());
HWY_ASSERT(IsEqual(T(0), T(0)));
HWY_ASSERT(IsEqual(T(1), T(1)));
HWY_ASSERT(IsEqual(T(-1), T(-1)));
HWY_ASSERT(IsEqual(LimitsMin<T>(), LimitsMin<T>()));
HWY_ASSERT(!IsEqual(T(0), T(1)));
HWY_ASSERT(!IsEqual(T(1), T(0)));

third_party/highway/libhwy-contrib.pc.in (vendored, 2 changes)

@@ -4,7 +4,7 @@ libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@
includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
Name: libhwy-contrib
Description: Additions to Highway: dot product, image, math, sort
Description: Additions to Highway: image and math library
Version: @HWY_LIBRARY_VERSION@
Libs: -L${libdir} -lhwy_contrib
Cflags: -I${includedir}

third_party/highway/libhwy.pc.in (vendored, 2 changes)

@@ -7,4 +7,4 @@ Name: libhwy
Description: Efficient and performance-portable SIMD wrapper
Version: @HWY_LIBRARY_VERSION@
Libs: -L${libdir} -lhwy
Cflags: -I${includedir} -D@DLLEXPORT_TO_DEFINE@
Cflags: -I${includedir}

third_party/highway/preamble.js.lds (vendored, 9 changes)

@@ -1,9 +0,0 @@
/*
* Copyright 2019 Google LLC
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/
/* Mock crypto module for benchmarks and unit tests; without it, std::random_device fails at runtime. */
var crypto = { getRandomValues: function(array) { for (var i = 0; i < array.length; i++) array[i] = (Math.random()*256)|0 } };

third_party/highway/run_tests.sh (vendored, 4 changes)

@@ -59,7 +59,7 @@ export QEMU_LD_PREFIX=/usr/arm-linux-gnueabihf
rm -rf build_arm7
mkdir build_arm7
cd build_arm7
CC=arm-linux-gnueabihf-gcc-11 CXX=arm-linux-gnueabihf-g++-11 cmake .. -DHWY_CMAKE_ARM7:BOOL=ON -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON
CC=arm-linux-gnueabihf-gcc CXX=arm-linux-gnueabihf-g++ cmake .. -DHWY_CMAKE_ARM7:BOOL=ON -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON
make -j8
ctest
cd ..
@@ -71,7 +71,7 @@ export QEMU_LD_PREFIX=/usr/aarch64-linux-gnu
rm -rf build_arm8
mkdir build_arm8
cd build_arm8
CC=aarch64-linux-gnu-gcc-11 CXX=aarch64-linux-gnu-g++-11 cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON
CC=aarch64-linux-gnu-gcc CXX=aarch64-linux-gnu-g++ cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON
make -j8
ctest
cd ..


@@ -34,7 +34,7 @@ jobs:
env_stack_size: 1
max_stack: 3000
# Conformance tooling test requires numpy.
apt_pkgs: graphviz python3-numpy
apt_pkgs: python3-numpy
- name: lowprecision
mode: release
test_in_pr: true
@@ -461,8 +461,8 @@ jobs:
runs-on: ubuntu-latest
env:
CCACHE_DIR: ${{ github.workspace }}/.ccache
EM_VERSION: 3.1.4
V8_VERSION: 9.8.177
EM_VERSION: 2.0.23
V8_VERSION: 9.3.22
V8: ${{ github.workspace }}/.jsvu/v8
BUILD_TARGET: wasm32
@@ -506,7 +506,7 @@ jobs:
${{ runner.os }}-${{ steps.git-env.outputs.parent }}-${{ matrix.variant }}
- name: Install emsdk
uses: mymindstorm/setup-emsdk@v11
uses: mymindstorm/setup-emsdk@v10
# TODO(deymo): We could cache this action but it doesn't work when running
# in a matrix.
with:

third_party/jpeg-xl/deps.sh (vendored, 2 changes)

@@ -14,7 +14,7 @@ MYDIR=$(dirname $(realpath "$0"))
# Git revisions we use for the given submodules. Update these whenever you
# update a git submodule.
THIRD_PARTY_GFLAGS="827c769e5fc98e0f2a34c47cef953cc6328abced"
THIRD_PARTY_HIGHWAY="f13e3b956eb226561ac79427893ec0afd66f91a8"
THIRD_PARTY_HIGHWAY="e69083a12a05caf037cabecdf1b248b7579705a5"
THIRD_PARTY_SKCMS="64374756e03700d649f897dbd98c95e78c30c7da"
THIRD_PARTY_SJPEG="868ab558fad70fcbe8863ba4e85179eeb81cc840"
THIRD_PARTY_ZLIB="cacf7f1d4e3d44d871b605da3b647f07d718623f"

third_party/jpeg-xl/lib/extras/codec.cc (vendored, 34 changes)

@@ -5,12 +5,6 @@
#include "lib/extras/codec.h"
#include "jxl/decode.h"
#include "jxl/types.h"
#include "lib/extras/packed_image.h"
#include "lib/jxl/base/padded_bytes.h"
#include "lib/jxl/base/status.h"
#if JPEGXL_ENABLE_APNG
#include "lib/extras/enc/apng.h"
#endif
@@ -74,14 +68,6 @@ Status Encode(const CodecInOut& io, const extras::Codec codec,
JXL_WARNING("Writing JPEG data as pixels");
}
extras::PackedPixelFile ppf;
size_t num_channels = io.metadata.m.color_encoding.Channels();
JxlPixelFormat format = {
static_cast<uint32_t>(num_channels),
bits_per_sample <= 8 ? JXL_TYPE_UINT8 : JXL_TYPE_UINT16,
JXL_NATIVE_ENDIAN, 0};
std::vector<uint8_t> bytes_vector;
const bool floating_point = bits_per_sample > 16;
switch (codec) {
case extras::Codec::kPNG:
#if JPEGXL_ENABLE_APNG
@@ -101,24 +87,8 @@ Status Encode(const CodecInOut& io, const extras::Codec codec,
return JXL_FAILURE("JPEG XL was built without JPEG support");
#endif
case extras::Codec::kPNM:
// Choose native for PFM; PGM/PPM require big-endian (N/A for PBM)
format.endianness = floating_point ? JXL_NATIVE_ENDIAN : JXL_BIG_ENDIAN;
if (floating_point) {
format.data_type = JXL_TYPE_FLOAT;
}
if (!c_desired.IsSRGB()) {
JXL_WARNING(
"PNM encoder cannot store custom ICC profile; decoder\n"
"will need hint key=color_space to get the same values");
}
JXL_RETURN_IF_ERROR(extras::ConvertCodecInOutToPackedPixelFile(
io, format, c_desired, pool, &ppf));
JXL_RETURN_IF_ERROR(
extras::EncodeImagePNM(ppf, bits_per_sample, pool, &bytes_vector));
bytes->assign(bytes_vector.data(),
bytes_vector.data() + bytes_vector.size());
return true;
return extras::EncodeImagePNM(&io, c_desired, bits_per_sample, pool,
bytes);
case extras::Codec::kPGX:
return extras::EncodeImagePGX(&io, c_desired, bits_per_sample, pool,
bytes);


@@ -61,6 +61,7 @@ const EnumName<JxlRenderingIntent> kJxlRenderingIntentNames[] = {
template <typename T>
Status ParseEnum(const std::string& token, const EnumName<T>* enum_values,
size_t enum_len, T* value) {
std::string str;
for (size_t i = 0; i < enum_len; i++) {
if (enum_values[i].name == token) {
*value = enum_values[i].value;

Some files were not shown because too many files changed.