Bug 1707590 - Part 1: Add vendored libjxl source r=aosmond

Differential Revision: https://phabricator.services.mozilla.com/D113358
This commit is contained in:
Kagami Sascha Rosylight 2021-05-06 01:14:20 +00:00
Родитель 8fb52356f4
Коммит 626cb0e6e1
523 изменённых файлов: 135247 добавлений и 0 удалений

3
config/external/moz.build поставляемый
Просмотреть файл

@ -55,6 +55,9 @@ if CONFIG["CPU_ARCH"] == "arm":
if CONFIG["MOZ_FFVPX"]:
external_dirs += ["media/ffvpx"]
if CONFIG["MOZ_JXL"]:
external_dirs += ["media/libjxl", "media/highway"]
external_dirs += [
"media/kiss_fft",
"media/libcubeb",

41
media/highway/moz.build Normal file
Просмотреть файл

@ -0,0 +1,41 @@
# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
# vim: set filetype=python:
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# Build glue for the vendored Google Highway SIMD library.  The sources
# live under /third_party/highway/ and are linked into gkmedias.

LOCAL_INCLUDES += [
    "/third_party/highway/",
]

# Only the translation units needed by libjxl; highway is mostly headers.
SOURCES += [
    "/third_party/highway/contrib/image/image.cc",
    "/third_party/highway/hwy/aligned_allocator.cc",
    "/third_party/highway/hwy/targets.cc",
]

# Public API headers, exported under the "hwy/" include prefix.
EXPORTS.hwy += [
    "/third_party/highway/hwy/aligned_allocator.h",
    "/third_party/highway/hwy/base.h",
    "/third_party/highway/hwy/cache_control.h",
    "/third_party/highway/hwy/foreach_target.h",
    "/third_party/highway/hwy/highway.h",
    "/third_party/highway/hwy/targets.h",
]

# Per-target op implementations, exported under "hwy/ops/".
EXPORTS.hwy.ops += [
    "/third_party/highway/hwy/ops/arm_neon-inl.h",
    "/third_party/highway/hwy/ops/rvv-inl.h",
    "/third_party/highway/hwy/ops/scalar-inl.h",
    "/third_party/highway/hwy/ops/set_macros-inl.h",
    "/third_party/highway/hwy/ops/shared-inl.h",
    "/third_party/highway/hwy/ops/wasm_128-inl.h",
    "/third_party/highway/hwy/ops/x86_128-inl.h",
    "/third_party/highway/hwy/ops/x86_256-inl.h",
    "/third_party/highway/hwy/ops/x86_512-inl.h",
]

FINAL_LIBRARY = "gkmedias"

# We allow warnings for third-party code that can be updated from upstream.
AllowCompilerWarnings()

43
media/highway/moz.yaml Normal file
Просмотреть файл

@ -0,0 +1,43 @@
# Version of this schema
schema: 1

bugzilla:
  # Bugzilla product and component for this directory and subdirectories
  product: Core
  component: "ImageLib"

# Document the source of externally hosted code
origin:
  # Short name of the package/library
  name: highway

  description: Performance-portable, length-agnostic SIMD with runtime dispatch

  # Full URL for the package's homepage/etc
  # Usually different from repository url
  url: https://github.com/google/highway

  # Human-readable identifier for this version/release
  # Generally "version NNN", "tag SSS", "bookmark SSS"
  release: commit ca1a57c342cd815053abfcffa29b44eaead4f20b (2021-04-15T17:54:35Z).

  # Revision to pull in
  # Must be a long or short commit SHA (long preferred)
  revision: ca1a57c342cd815053abfcffa29b44eaead4f20b

  # The package's license, where possible using the mnemonic from
  # https://spdx.org/licenses/
  # Multiple licenses can be specified (as a YAML list)
  # A "LICENSE" file must exist containing the full license text
  license: Apache-2.0
  license-file: LICENSE

vendoring:
  url: https://github.com/google/highway.git
  source-hosting: github
  # NOTE(review): this vendors highway inside third_party/jpeg-xl/, but
  # media/highway/moz.build references /third_party/highway/ — confirm the
  # two locations are intentional and kept in sync.
  vendor-directory: third_party/jpeg-xl/third_party/highway

  exclude:
    - g3doc/

Просмотреть файл

@ -0,0 +1,12 @@
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/* Stub export header shipped as <jxl/jxl_export.h> (see EXPORTS.jxl in
 * media/libjxl/moz.build).  JXL_EXPORT expands to nothing, so libjxl API
 * declarations carry no shared-library visibility annotations here —
 * presumably because the library is linked statically into gkmedias;
 * upstream normally generates this header from its build system (confirm
 * when updating the vendored copy). */
#ifndef JXL_EXPORT_H
#define JXL_EXPORT_H

#define JXL_EXPORT

#endif /* JXL_EXPORT_H */

Просмотреть файл

@ -0,0 +1,12 @@
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/* Stub export header shipped as <jxl/jxl_threads_export.h> (see EXPORTS.jxl
 * in media/libjxl/moz.build).  JXL_THREADS_EXPORT expands to nothing, so the
 * threads API gets no visibility annotations — presumably because it is
 * linked statically; upstream normally generates this header (confirm when
 * updating the vendored copy). */
#ifndef JXL_THREADS_EXPORT_H
#define JXL_THREADS_EXPORT_H

#define JXL_THREADS_EXPORT

#endif /* JXL_THREADS_EXPORT_H */

114
media/libjxl/moz.build Normal file
Просмотреть файл

@ -0,0 +1,114 @@
# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
# vim: set filetype=python:
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# Build glue for the vendored libjxl (JPEG XL reference implementation)
# under /third_party/jpeg-xl/, linked into gkmedias.

LOCAL_INCLUDES += [
    # Local stub export headers (jxl_export.h / jxl_threads_export.h).
    "./include/",
    "/third_party/jpeg-xl/",
    "/third_party/jpeg-xl/lib/include/",
]

# Core libjxl sources.  Note that only a handful of enc_* files appear —
# this is the decode-side subset plus shared support code.
SOURCES += [
    "/third_party/jpeg-xl/lib/jxl/ac_strategy.cc",
    "/third_party/jpeg-xl/lib/jxl/alpha.cc",
    "/third_party/jpeg-xl/lib/jxl/ans_common.cc",
    "/third_party/jpeg-xl/lib/jxl/aux_out.cc",
    "/third_party/jpeg-xl/lib/jxl/base/cache_aligned.cc",
    "/third_party/jpeg-xl/lib/jxl/base/data_parallel.cc",
    "/third_party/jpeg-xl/lib/jxl/base/descriptive_statistics.cc",
    "/third_party/jpeg-xl/lib/jxl/base/padded_bytes.cc",
    "/third_party/jpeg-xl/lib/jxl/base/status.cc",
    "/third_party/jpeg-xl/lib/jxl/base/time.cc",
    "/third_party/jpeg-xl/lib/jxl/blending.cc",
    "/third_party/jpeg-xl/lib/jxl/chroma_from_luma.cc",
    "/third_party/jpeg-xl/lib/jxl/coeff_order.cc",
    "/third_party/jpeg-xl/lib/jxl/color_encoding_internal.cc",
    "/third_party/jpeg-xl/lib/jxl/color_management.cc",
    "/third_party/jpeg-xl/lib/jxl/compressed_dc.cc",
    "/third_party/jpeg-xl/lib/jxl/convolve.cc",
    "/third_party/jpeg-xl/lib/jxl/dct_scales.cc",
    "/third_party/jpeg-xl/lib/jxl/dec_ans.cc",
    "/third_party/jpeg-xl/lib/jxl/dec_context_map.cc",
    "/third_party/jpeg-xl/lib/jxl/dec_external_image.cc",
    "/third_party/jpeg-xl/lib/jxl/dec_frame.cc",
    "/third_party/jpeg-xl/lib/jxl/dec_group.cc",
    "/third_party/jpeg-xl/lib/jxl/dec_group_border.cc",
    "/third_party/jpeg-xl/lib/jxl/dec_huffman.cc",
    "/third_party/jpeg-xl/lib/jxl/dec_modular.cc",
    "/third_party/jpeg-xl/lib/jxl/dec_noise.cc",
    "/third_party/jpeg-xl/lib/jxl/dec_patch_dictionary.cc",
    "/third_party/jpeg-xl/lib/jxl/dec_reconstruct.cc",
    "/third_party/jpeg-xl/lib/jxl/dec_upsample.cc",
    "/third_party/jpeg-xl/lib/jxl/dec_xyb.cc",
    "/third_party/jpeg-xl/lib/jxl/decode.cc",
    "/third_party/jpeg-xl/lib/jxl/enc_bit_writer.cc",
    "/third_party/jpeg-xl/lib/jxl/entropy_coder.cc",
    "/third_party/jpeg-xl/lib/jxl/epf.cc",
    "/third_party/jpeg-xl/lib/jxl/fields.cc",
    "/third_party/jpeg-xl/lib/jxl/filters.cc",
    "/third_party/jpeg-xl/lib/jxl/frame_header.cc",
    "/third_party/jpeg-xl/lib/jxl/gauss_blur.cc",
    "/third_party/jpeg-xl/lib/jxl/headers.cc",
    "/third_party/jpeg-xl/lib/jxl/huffman_table.cc",
    "/third_party/jpeg-xl/lib/jxl/icc_codec.cc",
    "/third_party/jpeg-xl/lib/jxl/icc_codec_common.cc",
    "/third_party/jpeg-xl/lib/jxl/image.cc",
    "/third_party/jpeg-xl/lib/jxl/image_bundle.cc",
    "/third_party/jpeg-xl/lib/jxl/image_metadata.cc",
    "/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data.cc",
    "/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data_writer.cc",
    "/third_party/jpeg-xl/lib/jxl/jpeg/jpeg_data.cc",
    "/third_party/jpeg-xl/lib/jxl/loop_filter.cc",
    "/third_party/jpeg-xl/lib/jxl/luminance.cc",
    "/third_party/jpeg-xl/lib/jxl/memory_manager_internal.cc",
    "/third_party/jpeg-xl/lib/jxl/modular/encoding/dec_ma.cc",
    "/third_party/jpeg-xl/lib/jxl/modular/encoding/encoding.cc",
    "/third_party/jpeg-xl/lib/jxl/modular/modular_image.cc",
    "/third_party/jpeg-xl/lib/jxl/modular/transform/transform.cc",
    "/third_party/jpeg-xl/lib/jxl/opsin_params.cc",
    "/third_party/jpeg-xl/lib/jxl/passes_state.cc",
    "/third_party/jpeg-xl/lib/jxl/quant_weights.cc",
    "/third_party/jpeg-xl/lib/jxl/quantizer.cc",
    "/third_party/jpeg-xl/lib/jxl/splines.cc",
    "/third_party/jpeg-xl/lib/jxl/toc.cc",
]

# Thread-pool parallel runner (upstream's libjxl_threads).
SOURCES += [
    "/third_party/jpeg-xl/lib/threads/thread_parallel_runner.cc",
    "/third_party/jpeg-xl/lib/threads/thread_parallel_runner_internal.cc",
]

# Version macros normally generated by upstream's CMake build.
# NOTE(review): all zero here — confirm against the vendored release when
# updating (moz.yaml pins commit 9a8f5195e4d1c45112fd65f184ebe115f4163ba2).
DEFINES["JPEGXL_MAJOR_VERSION"] = "0"
DEFINES["JPEGXL_MINOR_VERSION"] = "0"
DEFINES["JPEGXL_PATCH_VERSION"] = "0"

# Public headers exported under the "jxl/" include prefix; the two local
# stubs replace upstream's generated export headers.
EXPORTS.jxl += [
    "./include/jxl/jxl_export.h",
    "./include/jxl/jxl_threads_export.h",
    "/third_party/jpeg-xl/lib/include/jxl/butteraugli.h",
    "/third_party/jpeg-xl/lib/include/jxl/butteraugli_cxx.h",
    "/third_party/jpeg-xl/lib/include/jxl/codestream_header.h",
    "/third_party/jpeg-xl/lib/include/jxl/color_encoding.h",
    "/third_party/jpeg-xl/lib/include/jxl/decode.h",
    "/third_party/jpeg-xl/lib/include/jxl/decode_cxx.h",
    "/third_party/jpeg-xl/lib/include/jxl/encode.h",
    "/third_party/jpeg-xl/lib/include/jxl/encode_cxx.h",
    "/third_party/jpeg-xl/lib/include/jxl/memory_manager.h",
    "/third_party/jpeg-xl/lib/include/jxl/parallel_runner.h",
    "/third_party/jpeg-xl/lib/include/jxl/thread_parallel_runner.h",
    "/third_party/jpeg-xl/lib/include/jxl/thread_parallel_runner_cxx.h",
    "/third_party/jpeg-xl/lib/include/jxl/types.h",
]

FINAL_LIBRARY = "gkmedias"

# We allow warnings for third-party code that can be updated from upstream.
AllowCompilerWarnings()

# Clang 5.0 has a compiler bug that prevents build in c++17
# See https://gitlab.com/wg1/jpeg-xl/-/issues/227
# This should be okay since we are using the C API.
if CONFIG["CC_TYPE"] == "clang":
    CXXFLAGS += ["-std=c++11"]

53
media/libjxl/moz.yaml Normal file
Просмотреть файл

@ -0,0 +1,53 @@
# Version of this schema
schema: 1

bugzilla:
  # Bugzilla product and component for this directory and subdirectories
  product: Core
  component: "ImageLib"

# Document the source of externally hosted code
origin:
  # Short name of the package/library
  name: jpeg-xl

  description: JPEG XL image format reference implementation

  # Full URL for the package's homepage/etc
  # Usually different from repository url
  url: https://gitlab.com/wg1/jpeg-xl

  # Human-readable identifier for this version/release
  # Generally "version NNN", "tag SSS", "bookmark SSS"
  release: commit 9a8f5195e4d1c45112fd65f184ebe115f4163ba2 (2021-05-04T13:15:00.000+02:00).

  # Revision to pull in
  # Must be a long or short commit SHA (long preferred)
  # NOTE(krosylight): Update highway together when updating this!
  revision: 9a8f5195e4d1c45112fd65f184ebe115f4163ba2

  # The package's license, where possible using the mnemonic from
  # https://spdx.org/licenses/
  # Multiple licenses can be specified (as a YAML list)
  # A "LICENSE" file must exist containing the full license text
  license: Apache-2.0
  license-file: LICENSE

# Automated update configuration for updatebot.
updatebot:
  maintainer-phab: saschanaz
  maintainer-bz: krosylight@mozilla.com
  tasks:
    - type: vendoring
      enabled: True

vendoring:
  url: https://gitlab.com/wg1/jpeg-xl.git
  source-hosting: gitlab
  vendor-directory: third_party/jpeg-xl

  # Documentation, test data and tools are not needed for the build.
  exclude:
    - doc/
    - third_party/testdata/
    - tools/

300
third_party/highway/CMakeLists.txt поставляемый Normal file
Просмотреть файл

@ -0,0 +1,300 @@
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

cmake_minimum_required(VERSION 3.10)

# Set PIE flags for POSITION_INDEPENDENT_CODE targets, added in 3.14.
if(POLICY CMP0083)
  cmake_policy(SET CMP0083 NEW)
endif()

project(hwy VERSION 0.1)

# Require plain C++11 (no GNU extensions) for every target in this project.
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_CXX_STANDARD_REQUIRED YES)

# Enabled PIE binaries by default if supported.
# CheckPIESupported only exists in CMake >= 3.14, hence OPTIONAL + the
# RESULT_VARIABLE guard below.
include(CheckPIESupported OPTIONAL RESULT_VARIABLE CHECK_PIE_SUPPORTED)
if(CHECK_PIE_SUPPORTED)
  check_pie_supported(LANGUAGES CXX)
  if(CMAKE_CXX_LINK_PIE_SUPPORTED)
    set(CMAKE_POSITION_INDEPENDENT_CODE TRUE)
  endif()
endif()

include(GNUInstallDirs)

# Default to an optimized build with debug info when the user chose nothing.
if (NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE RelWithDebInfo)
endif()

include(CheckCXXSourceCompiles)
# Detect Emscripten: the probe only compiles when __EMSCRIPTEN__ is
# predefined (the static_assert fires otherwise), so HWY_EMSCRIPTEN is set
# exactly when targeting WASM via emcc.
check_cxx_source_compiles(
   "int main() {
      #if !defined(__EMSCRIPTEN__)
      static_assert(false, \"__EMSCRIPTEN__ is not defined\");
      #endif
      return 0;
    }"
  HWY_EMSCRIPTEN
)
# All library sources.  Headers are listed too: the install loop below
# copies every entry matching .h, preserving its relative directory.
# NOTE(review): hwy/ops/rvv-inl.h is not listed although media/highway/
# moz.build exports it — confirm whether it should be installed here too.
set(HWY_SOURCES
    contrib/image/image.cc
    contrib/image/image.h
    contrib/math/math-inl.h
    hwy/aligned_allocator.cc
    hwy/aligned_allocator.h
    hwy/base.h
    hwy/cache_control.h
    hwy/foreach_target.h
    hwy/highway.h
    hwy/nanobenchmark.cc
    hwy/nanobenchmark.h
    hwy/ops/arm_neon-inl.h
    hwy/ops/scalar-inl.h
    hwy/ops/set_macros-inl.h
    hwy/ops/shared-inl.h
    hwy/ops/wasm_128-inl.h
    hwy/ops/x86_128-inl.h
    hwy/ops/x86_256-inl.h
    hwy/ops/x86_512-inl.h
    hwy/targets.cc
    hwy/targets.h
    hwy/tests/test_util-inl.h
)
# Select per-compiler warning/codegen flags into HWY_FLAGS; they are applied
# PRIVATE-ly to each target below via target_compile_options.
if (MSVC)
  # TODO(janwas): add flags
else()
  set(HWY_FLAGS
    # Avoid changing binaries based on the current time and date.
    -Wno-builtin-macro-redefined
    -D__DATE__="redacted"
    -D__TIMESTAMP__="redacted"
    -D__TIME__="redacted"
    # Optimizations
    -fmerge-all-constants
    # Warnings
    -Wall
    -Wextra
    -Wformat-security
    -Wno-unused-function
    -Wnon-virtual-dtor
    -Woverloaded-virtual
    -Wvla
  )

  # Fix: test the variable name, not its ${}-expansion — if() auto-
  # dereferences, and expanding first malforms the condition when the value
  # is empty and can re-dereference it otherwise.  MATCHES "Clang" also
  # covers AppleClang, which is intended here.
  if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
    list(APPEND HWY_FLAGS
      -Wc++2a-extensions
      -Wfloat-overflow-conversion
      -Wfloat-zero-conversion
      -Wfor-loop-analysis
      -Wgnu-redeclared-enum
      -Winfinite-recursion
      -Wself-assign
      -Wstring-conversion
      -Wtautological-overlap-compare
      -Wthread-safety-analysis
      -Wundefined-func-template

      -fno-cxx-exceptions
      -fno-slp-vectorize
      -fno-vectorize

      # Use color in messages
      -fdiagnostics-show-option -fcolor-diagnostics
    )
  endif()

  if (WIN32)
    list(APPEND HWY_FLAGS
      -Wno-c++98-compat-pedantic
      -Wno-cast-align
      -Wno-double-promotion
      -Wno-float-equal
      -Wno-format-nonliteral
      -Wno-global-constructors
      -Wno-language-extension-token
      -Wno-missing-prototypes
      -Wno-shadow
      -Wno-shadow-field-in-constructor
      -Wno-sign-conversion
      -Wno-unused-member-function
      -Wno-unused-template
      -Wno-used-but-marked-unused
      -Wno-zero-as-null-pointer-constant
    )
  else()
    list(APPEND HWY_FLAGS
      -fmath-errno
      -fno-exceptions
    )
  endif()
endif()
# The main static library; consumers inherit the repo root on their include
# path via the PUBLIC include directory.
add_library(hwy STATIC ${HWY_SOURCES})
target_compile_options(hwy PRIVATE ${HWY_FLAGS})
set_property(TARGET hwy PROPERTY POSITION_INDEPENDENT_CODE ON)
target_include_directories(hwy PUBLIC ${CMAKE_CURRENT_LIST_DIR})

# -------------------------------------------------------- install library
install(TARGETS hwy
  DESTINATION "${CMAKE_INSTALL_LIBDIR}")

# Install all the headers keeping the relative path to the current directory
# when installing them.
foreach (source ${HWY_SOURCES})
  if ("${source}" MATCHES "\.h$")
    get_filename_component(dirname "${source}" DIRECTORY)
    install(FILES "${source}"
            DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${dirname}")
  endif()
endforeach()

# Add a pkg-config file for libhwy and the test library.
# @ONLY keeps stray ${} in the .pc.in templates from being expanded.
set(HWY_LIBRARY_VERSION "${CMAKE_PROJECT_VERSION}")
foreach (pc libhwy.pc libhwy-test.pc)
  configure_file("${CMAKE_CURRENT_SOURCE_DIR}/${pc}.in" "${pc}" @ONLY)
  install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${pc}"
          DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
endforeach()
# -------------------------------------------------------- hwy_list_targets
# Generate a tool to print the compiled-in targets as defined by the current
# flags. This tool will print to stderr at build time, after building hwy.
add_executable(hwy_list_targets hwy/tests/list_targets.cc)
target_compile_options(hwy_list_targets PRIVATE ${HWY_FLAGS})
target_include_directories(hwy_list_targets PRIVATE
  $<TARGET_PROPERTY:hwy,INCLUDE_DIRECTORIES>)
# TARGET_FILE always returns the path to executable
# Naked target also not always could be run (due to the lack of '.\' prefix)
# Thus effective command to run should contain the full path
# and emulator prefix (if any).
# "|| (exit 0)" makes the listing purely informational: a failure to run the
# tool never fails the build.
add_custom_command(TARGET hwy POST_BUILD
    COMMAND ${CMAKE_CROSSCOMPILING_EMULATOR} $<TARGET_FILE:hwy_list_targets> || (exit 0))

# -------------------------------------------------------- Examples

# Avoids mismatch between GTest's static CRT and our dynamic.
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)

# Programming exercise with integrated benchmark
add_executable(hwy_benchmark hwy/examples/benchmark.cc)
target_sources(hwy_benchmark PRIVATE
    hwy/nanobenchmark.cc
    hwy/nanobenchmark.h)
# Try adding either -DHWY_COMPILE_ONLY_SCALAR or -DHWY_COMPILE_ONLY_STATIC to
# observe the difference in targets printed.
target_compile_options(hwy_benchmark PRIVATE ${HWY_FLAGS})
target_link_libraries(hwy_benchmark hwy)
set_target_properties(hwy_benchmark
    PROPERTIES RUNTIME_OUTPUT_DIRECTORY "examples/")
# -------------------------------------------------------- Tests
include(CTest)

if(BUILD_TESTING)
enable_testing()
include(GoogleTest)

# Either link against an installed googletest (HWY_SYSTEM_GTEST=ON) or
# download and build it at configure time via CMakeLists.txt.in.
set(HWY_SYSTEM_GTEST OFF CACHE BOOL "Use pre-installed googletest?")
if(HWY_SYSTEM_GTEST)
find_package(GTest REQUIRED)
else()
# Download and unpack googletest at configure time
configure_file(CMakeLists.txt.in googletest-download/CMakeLists.txt)
execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
  RESULT_VARIABLE result
  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download )
if(result)
  message(FATAL_ERROR "CMake step for googletest failed: ${result}")
endif()
execute_process(COMMAND ${CMAKE_COMMAND} --build .
  RESULT_VARIABLE result
  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download )
if(result)
  message(FATAL_ERROR "Build step for googletest failed: ${result}")
endif()

# Prevent overriding the parent project's compiler/linker
# settings on Windows
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)

# Add googletest directly to our build. This defines
# the gtest and gtest_main targets.
add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/googletest-src
                 ${CMAKE_CURRENT_BINARY_DIR}/googletest-build
                 EXCLUDE_FROM_ALL)

# The gtest/gtest_main targets carry header search path
# dependencies automatically when using CMake 2.8.11 or
# later. Otherwise we have to add them here ourselves.
if (CMAKE_VERSION VERSION_LESS 2.8.11)
  include_directories("${gtest_SOURCE_DIR}/include")
endif()
endif() # HWY_SYSTEM_GTEST

# One test executable per source file; math_test is deliberately disabled
# (kept commented out below).
set(HWY_TEST_FILES
  contrib/image/image_test.cc
  # contrib/math/math_test.cc
  hwy/aligned_allocator_test.cc
  hwy/base_test.cc
  hwy/highway_test.cc
  hwy/targets_test.cc
  hwy/examples/skeleton_test.cc
  hwy/tests/arithmetic_test.cc
  hwy/tests/combine_test.cc
  hwy/tests/compare_test.cc
  hwy/tests/convert_test.cc
  hwy/tests/logical_test.cc
  hwy/tests/memory_test.cc
  hwy/tests/swizzle_test.cc
  hwy/tests/test_util_test.cc
)

file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tests)
foreach (TESTFILE IN LISTS HWY_TEST_FILES)
  # The TESTNAME is the name without the extension or directory.
  get_filename_component(TESTNAME ${TESTFILE} NAME_WE)
  add_executable(${TESTNAME} ${TESTFILE})
  target_compile_options(${TESTNAME} PRIVATE ${HWY_FLAGS})
  if(HWY_SYSTEM_GTEST)
    target_link_libraries(${TESTNAME} hwy GTest::GTest GTest::Main)
  else()
    target_link_libraries(${TESTNAME} hwy gtest gtest_main)
  endif()
  # Output test targets in the test directory.
  set_target_properties(${TESTNAME} PROPERTIES PREFIX "tests/")

  if (HWY_EMSCRIPTEN)
    set_target_properties(${TESTNAME} PROPERTIES LINK_FLAGS "-s SINGLE_FILE=1")
  endif()

  # gtest_discover_tests gained DISCOVERY_TIMEOUT in CMake 3.10.3; earlier
  # versions spelled the discovery timeout as TIMEOUT.
  if(${CMAKE_VERSION} VERSION_LESS "3.10.3")
    gtest_discover_tests(${TESTNAME} TIMEOUT 60)
  else ()
    gtest_discover_tests(${TESTNAME} DISCOVERY_TIMEOUT 60)
  endif ()
endforeach ()

# The skeleton test uses the skeleton library code.
target_sources(skeleton_test PRIVATE hwy/examples/skeleton.cc)

endif() # BUILD_TESTING

15
third_party/highway/CMakeLists.txt.in поставляемый Normal file
Просмотреть файл

@ -0,0 +1,15 @@
# Helper project configured and built at configure time by the main
# CMakeLists.txt (HWY_SYSTEM_GTEST=OFF path) purely to download googletest
# into the build tree; all build/install/test steps are disabled.
cmake_minimum_required(VERSION 2.8.2)

project(googletest-download NONE)

include(ExternalProject)
# NOTE(review): GIT_TAG master is not reproducible — builds can change as
# upstream moves; consider pinning a release tag or commit SHA.
ExternalProject_Add(googletest
  GIT_REPOSITORY    https://github.com/google/googletest.git
  GIT_TAG           master
  SOURCE_DIR        "${CMAKE_CURRENT_BINARY_DIR}/googletest-src"
  BINARY_DIR        "${CMAKE_CURRENT_BINARY_DIR}/googletest-build"
  CONFIGURE_COMMAND ""
  BUILD_COMMAND     ""
  INSTALL_COMMAND   ""
  TEST_COMMAND      ""
)

33
third_party/highway/CONTRIBUTING поставляемый Normal file
Просмотреть файл

@ -0,0 +1,33 @@
# How to Contribute
We'd love to accept your patches and contributions to this project. There are
just a few small guidelines you need to follow.
## Contributor License Agreement
Contributions to this project must be accompanied by a Contributor License
Agreement. You (or your employer) retain the copyright to your contribution;
this simply gives us permission to use and redistribute your contributions as
part of the project. Head over to <https://cla.developers.google.com/> to see
your current agreements on file or to sign a new one.
You generally only need to submit a CLA once, so if you've already submitted one
(even if it was for a different project), you probably don't need to do it
again.
## Code reviews
All submissions, including submissions by project members, require review. We
use GitHub pull requests for this purpose. Consult
[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
information on using pull requests.
## Testing
This repository is used by JPEG XL, so major API changes will require
coordination. Please get in touch with us beforehand, e.g. by raising an issue.
## Community Guidelines
This project follows
[Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).

201
third_party/highway/LICENSE поставляемый Normal file
Просмотреть файл

@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

44
third_party/highway/Makefile поставляемый Normal file
Просмотреть файл

@ -0,0 +1,44 @@
# Standalone makefile for building and testing Highway without CMake.
# Delete partially-written outputs when their recipe fails.
.DELETE_ON_ERROR:
# Target-independent objects built from hwy/ sources.
OBJS := aligned_allocator.o nanobenchmark.o targets.o
# Object(s) for the contrib image library.
IMAGE_OBJS := image.o
TEST_NAMES := arithmetic_test combine_test compare_test convert_test hwy_test logical_test memory_test swizzle_test
TESTS := $(foreach i, $(TEST_NAMES), bin/$(i))
ROOT_TEST_NAMES := aligned_allocator_test nanobenchmark_test
ROOT_TESTS := $(foreach i, $(ROOT_TEST_NAMES), bin/$(i))
# C++17 is required; __DATE__/__TIME__ are redacted for reproducible builds.
CXXFLAGS += -I. -fmerge-all-constants -std=c++17 -O2 \
-Wno-builtin-macro-redefined -D__DATE__="redacted" \
-D__TIMESTAMP__="redacted" -D__TIME__="redacted" \
-Wall -Wextra -Wformat-security -Wno-unused-function \
-Wnon-virtual-dtor -Woverloaded-virtual -Wvla
.PHONY: all
all: $(TESTS) $(ROOT_TESTS) benchmark test
.PHONY: clean
clean: ; @rm -rf $(OBJS) bin/ benchmark.o
# Static pattern rules: compile each object from its source directory.
$(OBJS): %.o: hwy/%.cc
$(CXX) -c $(CXXFLAGS) $< -o $@
$(IMAGE_OBJS): %.o: contrib/image/%.cc
$(CXX) -c $(CXXFLAGS) $< -o $@
benchmark: $(OBJS) hwy/examples/benchmark.cc
mkdir -p bin && $(CXX) $(CXXFLAGS) $^ -o bin/benchmark
# Link each test binary against googletest; make uses the first pattern rule
# whose prerequisite exists.
bin/%: hwy/%.cc $(OBJS)
mkdir -p bin && $(CXX) $(CXXFLAGS) $< $(OBJS) -o $@ -lgtest -lgtest_main -lpthread
bin/%: hwy/tests/%.cc $(OBJS)
mkdir -p bin && $(CXX) $(CXXFLAGS) $< $(OBJS) -o $@ -lgtest -lgtest_main -lpthread
bin/%: contrib/image/%.cc $(OBJS)
mkdir -p bin && $(CXX) $(CXXFLAGS) $< $(OBJS) -o $@ -lgtest -lgtest_main -lpthread
bin/%: contrib/math/%.cc $(OBJS)
mkdir -p bin && $(CXX) $(CXXFLAGS) $< $(OBJS) -o $@ -lgtest -lgtest_main -lpthread
.PHONY: test
# Run every test binary, printing a separator with its name first.
test: $(TESTS) $(ROOT_TESTS)
for name in $^; do echo ---------------------$$name && $$name; done

361
third_party/highway/README.md поставляемый Normal file
Просмотреть файл

@ -0,0 +1,361 @@
# Efficient and performance-portable SIMD
Highway is a C++ library for SIMD (Single Instruction, Multiple Data), i.e.
applying the same operation to 'lanes'.
## Why Highway?
- more portable (same source code) than platform-specific intrinsics,
- works on a wider range of compilers than compiler-specific vector extensions,
- more dependable than autovectorization,
- easier to write/maintain than assembly language,
- supports **runtime dispatch**,
- supports **variable-length vector** architectures.
## Current status
Supported targets: scalar, SSE4, AVX2, AVX-512, NEON (ARMv7 and v8), WASM SIMD.
A port to RVV is in progress.
Version 0.11 is considered stable enough to use in other projects, and is
expected to remain backwards compatible unless serious issues are discovered
while implementing SVE/RVV targets. After these targets are added, Highway will
reach version 1.0.
Continuous integration tests build with a recent version of Clang (running on
x86 and QEMU for ARM) and MSVC from VS2015 (running on x86). Also periodically
tested on x86 with Clang 7-11 and GCC 8, 9 and 10.2.1.
The `contrib` directory contains SIMD-related utilities: an image class with
aligned rows, and a math library (16 functions already implemented, mostly
trigonometry).
## Installation
This project uses cmake to generate and build. In a Debian-based system you can
install it via:
```bash
sudo apt install cmake
```
Highway's unit tests use [googletest](https://github.com/google/googletest).
By default, Highway's CMake downloads this dependency at configuration time.
You can disable this by setting the `HWY_SYSTEM_GTEST` CMake variable to ON and
installing gtest separately:
```bash
sudo apt install libgtest-dev
```
To build and test the library the standard cmake workflow can be used:
```bash
mkdir -p build && cd build
cmake ..
make -j && make test
```
Or you can run `run_tests.sh` (`run_tests.bat` on Windows).
To test on all the attainable targets for your platform, use
`cmake .. -DCMAKE_CXX_FLAGS="-DHWY_COMPILE_ALL_ATTAINABLE"`. Otherwise, the
default configuration skips baseline targets (e.g. scalar) that are superseded
by another baseline target.
## Quick start
You can use the `benchmark` inside examples/ as a starting point.
A [quick-reference page](g3doc/quick_reference.md) briefly lists all operations
and their parameters, and the [instruction_matrix][instmtx] indicates the
number of instructions per operation.
We recommend using full SIMD vectors whenever possible for maximum performance
portability. To obtain them, pass a `HWY_FULL(float)` tag to functions such as
`Zero/Set/Load`. There is also the option of a vector of up to `N` (a power of
two) lanes: `HWY_CAPPED(T, N)`. 128-bit vectors are guaranteed to be available
for lanes of type `T` if `HWY_TARGET != HWY_SCALAR` and `N == 16 / sizeof(T)`.
Functions using Highway must be inside a namespace `namespace HWY_NAMESPACE {`
(possibly nested in one or more other namespaces defined by the project), and
additionally either prefixed with `HWY_ATTR`, or residing between
`HWY_BEFORE_NAMESPACE()` and `HWY_AFTER_NAMESPACE()`.
* For static dispatch, `HWY_TARGET` will be the best available target among
`HWY_BASELINE_TARGETS`, i.e. those allowed for use by the compiler (see
[quick-reference](g3doc/quick_reference.md)). Functions inside `HWY_NAMESPACE`
can be called using `HWY_STATIC_DISPATCH(func)(args)` within the same module
they are defined in. You can call the function from other modules by
wrapping it in a regular function and declaring the regular function in a
header.
* For dynamic dispatch, a table of function pointers is generated via the
`HWY_EXPORT` macro that is used by `HWY_DYNAMIC_DISPATCH(func)(args)` to
call the best function pointer for the current CPU supported targets. A
module is automatically compiled for each target in `HWY_TARGETS` (see
[quick-reference](g3doc/quick_reference.md)) if `HWY_TARGET_INCLUDE` is
defined and foreach_target.h is included.
## Strip-mining loops
To vectorize a loop, "strip-mining" transforms it into an outer loop and inner
loop with number of iterations matching the preferred vector width.
In this section, let `T` denote the element type, `d = HWY_FULL(T)`, `count` the
number of elements to process, and `N = Lanes(d)` the number of lanes in a full
vector. Assume the loop body is given as a function `template<bool partial,
class D> void LoopBody(D d, size_t max_n)`.
Highway offers several ways to express loops where `N` need not divide `count`:
* Ensure all inputs/outputs are padded. Then the loop is simply
```
for (size_t i = 0; i < count; i += N) LoopBody<false>(d, 0);
```
Here, the template parameter and second function argument are not needed.
This is the preferred option, unless `N` is in the thousands and vector
operations are pipelined with long latencies. This was the case for
supercomputers in the 90s, but nowadays ALUs are cheap and we see most
implementations split vectors into 1, 2 or 4 parts, so there is little cost
to processing entire vectors even if we do not need all their lanes. Indeed
this avoids the (potentially large) cost of predication or partial
loads/stores on older targets, and does not duplicate code.
* Process whole vectors as above, followed by a scalar loop:
```
size_t i = 0;
for (; i + N <= count; i += N) LoopBody<false>(d, 0);
for (; i < count; ++i) LoopBody<false>(HWY_CAPPED(T, 1)(), 0);
```
The template parameter and second function arguments are again not needed.
This avoids duplicating code, and is reasonable if `count` is large.
Otherwise, multiple iterations may be slower than one `LoopBody` variant
with masking, especially because the `HWY_SCALAR` target selected by
`HWY_CAPPED(T, 1)` is slower for some operations due to workarounds for
undefined behavior in C++.
* Process whole vectors as above, followed by a single call to a modified
`LoopBody` with masking:
```
size_t i = 0;
for (; i + N <= count; i += N) {
LoopBody<false>(d, 0);
}
if (i < count) {
LoopBody<true>(d, count - i);
}
```
Now the template parameter and second function argument can be used inside
`LoopBody` to replace `Load/Store` of full aligned vectors with
`LoadN/StoreN(n)` that affect no more than `1 <= n <= N` aligned elements
(pending implementation).
This is a good default when it is infeasible to ensure vectors are padded.
In contrast to the scalar loop, only a single final iteration is needed.
## Design philosophy
* Performance is important but not the sole consideration. Anyone who goes to
the trouble of using SIMD clearly cares about speed. However, portability,
maintainability and readability also matter, otherwise we would write in
assembly. We aim for performance within 10-20% of a hand-written assembly
implementation on the development platform.
* The guiding principles of C++ are "pay only for what you use" and "leave no
room for a lower-level language below C++". We apply these by defining a
SIMD API that ensures operation costs are visible, predictable and minimal.
* Performance portability is important, i.e. the API should be efficient on
all target platforms. Unfortunately, common idioms for one platform can be
inefficient on others. For example: summing lanes horizontally versus
shuffling. Documenting which operations are expensive does not prevent their
use, as evidenced by widespread use of `HADDPS`. Performance acceptance
tests may detect large regressions, but do not help choose the approach
during initial development. Analysis tools can warn about some potential
inefficiencies, but likely not all. We instead provide [a carefully chosen
set of vector types and operations that are efficient on all target
platforms][instmtx] (PPC8, SSE4/AVX2+, ARMv8).
* Future SIMD hardware features are difficult to predict. For example, AVX2
came with surprising semantics (almost no interaction between 128-bit
blocks) and AVX-512 added two kinds of predicates (writemask and zeromask).
To ensure the API reflects hardware realities, we suggest a flexible
approach that adds new operations as they become commonly available, with
scalar fallbacks where not supported.
* Masking is not yet widely supported on current CPUs. It is difficult to
define an interface that provides access to all platform features while
retaining performance portability. The P0214R5 proposal lacks support for
AVX-512/ARM SVE zeromasks. We suggest limiting usage of masks to the
`IfThen[Zero]Else[Zero]` functions until the community has gained more
experience with them.
* "Width-agnostic" SIMD is more future-proof than user-specified fixed sizes.
For example, valarray-like code can iterate over a 1D array with a
library-specified vector width. This will result in better code when vector
sizes increase, and matches the direction taken by
[ARM SVE](https://alastairreid.github.io/papers/sve-ieee-micro-2017.pdf) and
RISC-V V as well as Agner Fog's
[ForwardCom instruction set proposal](https://goo.gl/CFizWu). However, some
applications may require fixed sizes, so we also guarantee support for
128-bit vectors in each instruction set.
* The API and its implementation should be usable and efficient with commonly
used compilers, including MSVC. For example, we write `ShiftLeft<3>(v)`
instead of `v << 3` because MSVC 2017 (ARM64) does not propagate the literal
(https://godbolt.org/g/rKx5Ga). Highway requires function-specific
target attributes, supported by GCC 4.9 / Clang 3.9 / MSVC 2015.
* Efficient and safe runtime dispatch is important. Modules such as image or
video codecs are typically embedded into larger applications such as
browsers, so they cannot require separate binaries for each CPU. Libraries
also cannot predict whether the application already uses AVX2 (and pays the
frequency throttling cost), so this decision must be left to the
application. Using only the lowest-common denominator instructions
sacrifices too much performance.
Therefore, we provide code paths for multiple instruction sets and choose
the most suitable at runtime. To reduce overhead, dispatch should be hoisted
to higher layers instead of checking inside every low-level function.
Highway supports inlining functions in the same file or in *-inl.h headers.
We generate all code paths from the same source to reduce implementation-
and debugging cost.
* Not every CPU need be supported. For example, pre-SSE4.1 CPUs are
increasingly rare and the AVX instruction set is limited to floating-point
operations. To reduce code size and compile time, we provide specializations
for SSE4, AVX2 and AVX-512 instruction sets on x86, plus a scalar fallback.
* Access to platform-specific intrinsics is necessary for acceptance in
performance-critical projects. We provide conversions to and from intrinsics
to allow utilizing specialized platform-specific functionality, and simplify
incremental porting of existing code.
* The core API should be compact and easy to learn. We provide only the few
dozen operations which are necessary and sufficient for most of the 150+
SIMD applications we examined.
## Prior API designs
The author has been writing SIMD code since 2002: first via assembly language,
then intrinsics, later Intel's `F32vec4` wrapper, followed by three generations
of custom vector classes. The first used macros to generate the classes, which
reduces duplication but also readability. The second used templates instead.
The third (used in highwayhash and PIK) added support for AVX2 and runtime
dispatch. The current design (used in JPEG XL) enables code generation for
multiple platforms and/or instruction sets from the same source, and improves
runtime dispatch.
## Differences versus [P0214R5 proposal](https://goo.gl/zKW4SA)
1. Adding widely used and portable operations such as `AndNot`, `AverageRound`,
bit-shift by immediates and `IfThenElse`.
1. Designing the API to avoid or minimize overhead on AVX2/AVX-512 caused by
crossing 128-bit 'block' boundaries.
1. Avoiding the need for non-native vectors. By contrast, P0214R5's `simd_cast`
returns `fixed_size<>` vectors which are more expensive to access because
they reside on the stack. We can avoid this plus additional overhead on
ARM/AVX2 by defining width-expanding operations as functions of a vector
part, e.g. promoting half a vector of `uint8_t` lanes to one full vector of
`uint16_t`, or demoting full vectors to half vectors with half-width lanes.
1. Guaranteeing access to the underlying intrinsic vector type. This ensures
all platform-specific capabilities can be used. P0214R5 instead only
'encourages' implementations to provide access.
1. Enabling safe runtime dispatch and inlining in the same binary. P0214R5 is
based on the Vc library, which does not provide assistance for linking
multiple instruction sets into the same binary. The Vc documentation
suggests compiling separate executables for each instruction set or using
GCC's ifunc (indirect functions). The latter is compiler-specific and risks
crashes due to ODR violations when compiling the same function with
different compiler flags. We solve this problem via target-specific
namespaces and attributes (see HOWTO section below). We also permit a mix of
static target selection and runtime dispatch for hotspots that may benefit
from newer instruction sets if available.
1. Using built-in PPC vector types without a wrapper class. This leads to much
better code generation with GCC 6.3: https://godbolt.org/z/pd2PNP.
By contrast, P0214R5 requires a wrapper. We avoid this by using only the
member operators provided by the PPC vectors; all other functions and
typedefs are non-members. 2019-04 update: Clang power64le does not have
this issue, so we simplified get_part(d, v) to GetLane(v).
1. Omitting inefficient or non-performance-portable operations such as `hmax`,
`operator[]`, and unsupported integer comparisons. Applications can often
replace these operations at lower cost than emulating that exact behavior.
1. Omitting `long double` types: these are not commonly available in hardware.
1. Ensuring signed integer overflow has well-defined semantics (wraparound).
1. Simple header-only implementation and less than a tenth of the size of the
Vc library from which P0214 was derived (98,000 lines in
https://github.com/VcDevel/Vc according to the gloc Chrome extension).
1. Avoiding hidden performance costs. P0214R5 allows implicit conversions from
integer to float, which costs 3-4 cycles on x86. We make these conversions
explicit to ensure their cost is visible.
## Other related work
* [Neat SIMD](http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=7568423)
adopts a similar approach with interchangeable vector/scalar types and
a compact interface. It allows access to the underlying intrinsics, but
does not appear to be designed for other platforms than x86.
* UME::SIMD ([code](https://goo.gl/yPeVZx), [paper](https://goo.gl/2xpZrk))
also adopts an explicit vectorization model with vector classes.
However, it exposes the union of all platform capabilities, which makes the
API harder to learn (209-page spec) and implement (the estimated LOC count
is [500K](https://goo.gl/1THFRi)). The API is less performance-portable
because it allows applications to use operations that are inefficient on
other platforms.
* Inastemp ([code](https://goo.gl/hg3USM), [paper](https://goo.gl/YcTU7S))
is a vector library for scientific computing with some innovative features:
automatic FLOPS counting, and "if/else branches" using lambda functions.
It supports IBM Power8, but only provides float and double types.
## Overloaded function API
Most C++ vector APIs rely on class templates. However, the ARM SVE vector
type is sizeless and cannot be wrapped in a class. We instead rely on overloaded
functions. Overloading based on vector types is also undesirable because SVE
vectors cannot be default-constructed. We instead use a dedicated 'descriptor'
type `Simd` for overloading, abbreviated to `D` for template arguments and
`d` in lvalues.
Note that generic function templates are possible (see highway.h).
## Masks
AVX-512 introduced a major change to the SIMD interface: special mask registers
(one bit per lane) that serve as predicates. It would be expensive to force
AVX-512 implementations to conform to the prior model of full vectors with lanes
set to all one or all zero bits. We instead provide a Mask type that emulates
a subset of this functionality on other platforms at zero cost.
Masks are returned by comparisons and `TestBit`; they serve as the input to
`IfThen*`. We provide conversions between masks and vector lanes. For clarity
and safety, we use FF..FF as the definition of true. To also benefit from
x86 instructions that only require the sign bit of floating-point inputs to be
set, we provide a special `ZeroIfNegative` function.
## Additional resources
* [Highway introduction (slides)][intro]
* [Overview of instructions per operation on different architectures][instmtx]
[intro]: g3doc/highway_intro.pdf
[instmtx]: g3doc/instruction_matrix.pdf
This is not an officially supported Google product.
Contact: janwas@google.com

145
third_party/highway/contrib/image/image.cc поставляемый Normal file
Просмотреть файл

@ -0,0 +1,145 @@
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "contrib/image/image.h"
#include <cstddef>
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "contrib/image/image.cc"
#include <algorithm> // swap
#include "hwy/foreach_target.h"
#include "hwy/highway.h"
// Per-target section: recompiled once for each target listed in HWY_TARGETS
// via foreach_target.h (driven by the HWY_TARGET_INCLUDE define above).
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Returns the current target's full vector size in bytes (uint8_t lanes),
// i.e. the granularity used for row alignment and padding.
size_t GetVectorSize() { return Lanes(HWY_FULL(uint8_t)()); }
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
// Target-independent section: compiled exactly once (guarded by HWY_ONCE).
#if HWY_ONCE
namespace hwy {
namespace {
// Generates the per-target dispatch table used by HWY_DYNAMIC_DISPATCH below.
HWY_EXPORT(GetVectorSize); // Local function.
} // namespace
// Returns the vector size [bytes] of the best target enabled at this moment;
// externally allocated memory handed to ImageBase must honor this alignment.
size_t ImageBase::VectorSize() {
// Do not cache result - must return the current value, which may be greater
// than the first call if it was subject to DisableTargets!
return HWY_DYNAMIC_DISPATCH(GetVectorSize)();
}
size_t ImageBase::BytesPerRow(const size_t xsize, const size_t sizeof_t) {
const size_t vec_size = VectorSize();
size_t valid_bytes = xsize * sizeof_t;
// Allow unaligned accesses starting at the last valid value - this may raise
// msan errors unless the user calls InitializePaddingForUnalignedAccesses.
// Skip for the scalar case because no extra lanes will be loaded.
if (vec_size != 1) {
HWY_DASSERT(vec_size >= sizeof_t);
valid_bytes += vec_size - sizeof_t;
}
// Round up to vector and cache line size.
const size_t align = std::max<size_t>(vec_size, HWY_ALIGNMENT);
size_t bytes_per_row = RoundUpTo(valid_bytes, align);
// During the lengthy window before writes are committed to memory, CPUs
// guard against read after write hazards by checking the address, but
// only the lower 11 bits. We avoid a false dependency between writes to
// consecutive rows by ensuring their sizes are not multiples of 2 KiB.
// Avoid2K prevents the same problem for the planes of an Image3.
if (bytes_per_row % HWY_ALIGNMENT == 0) {
bytes_per_row += align;
}
HWY_DASSERT(bytes_per_row % align == 0);
return bytes_per_row;
}
// Allocating constructor: reserves padded, aligned storage for `ysize` rows
// of `xsize` samples, `sizeof_t` bytes each. sizeof_t must be 1, 2, 4 or 8.
ImageBase::ImageBase(const size_t xsize, const size_t ysize,
const size_t sizeof_t)
: xsize_(static_cast<uint32_t>(xsize)),
ysize_(static_cast<uint32_t>(ysize)),
bytes_(nullptr, AlignedFreer(&AlignedFreer::DoNothing, nullptr)) {
HWY_ASSERT(sizeof_t == 1 || sizeof_t == 2 || sizeof_t == 4 || sizeof_t == 8);
bytes_per_row_ = 0;
// Dimensions can be zero, e.g. for lazily-allocated images. Only allocate
// if nonzero, because "zero" bytes still have padding/bookkeeping overhead.
if (xsize != 0 && ysize != 0) {
bytes_per_row_ = BytesPerRow(xsize, sizeof_t);
bytes_ = AllocateAligned<uint8_t>(bytes_per_row_ * ysize);
HWY_ASSERT(bytes_.get() != nullptr);
// Zero-fill the right-hand row padding (msan builds only) so aligned
// vector loads past the last sample are not flagged as uninitialized.
InitializePadding(sizeof_t, Padding::kRoundUp);
}
}
// Non-owning constructor: wraps caller-allocated memory for interoperability
// with other libraries. The DoNothing deleter leaves ownership (and the
// eventual free) with the caller.
ImageBase::ImageBase(const size_t xsize, const size_t ysize,
const size_t bytes_per_row, void* const aligned)
: xsize_(static_cast<uint32_t>(xsize)),
ysize_(static_cast<uint32_t>(ysize)),
bytes_per_row_(bytes_per_row),
bytes_(static_cast<uint8_t*>(aligned),
AlignedFreer(&AlignedFreer::DoNothing, nullptr)) {
const size_t vec_size = VectorSize();
// Both the base pointer and the stride must permit aligned vector access.
HWY_ASSERT(bytes_per_row % vec_size == 0);
HWY_ASSERT(reinterpret_cast<uintptr_t>(aligned) % vec_size == 0);
}
// Zero-fills each row's right-hand padding so that (per the Padding mode)
// vector loads past the last valid sample are not reported by msan as reads
// of uninitialized memory. Compiles to a no-op outside msan/IDE builds.
void ImageBase::InitializePadding(const size_t sizeof_t, Padding padding) {
#if defined(MEMORY_SANITIZER) || HWY_IDE
if (xsize_ == 0 || ysize_ == 0) return;
const size_t vec_size = VectorSize(); // Bytes, independent of sizeof_t!
if (vec_size == 1) return; // Scalar mode: no padding needed
const size_t valid_size = xsize_ * sizeof_t;
// kRoundUp covers aligned loads up to the next vector boundary; kUnaligned
// additionally covers an unaligned load starting at the last valid sample.
const size_t initialize_size = padding == Padding::kRoundUp
? RoundUpTo(valid_size, vec_size)
: valid_size + vec_size - sizeof_t;
if (valid_size == initialize_size) return;
for (size_t y = 0; y < ysize_; ++y) {
uint8_t* HWY_RESTRICT row = static_cast<uint8_t*>(VoidRow(y));
#if defined(__clang__) && (__clang_major__ <= 6)
// There's a bug in msan in clang-6 when handling AVX2 operations. This
// workaround allows tests to pass on msan, although it is slower and
// prevents msan warnings from uninitialized images.
memset(row, 0, initialize_size);
#else
memset(row + valid_size, 0, initialize_size - valid_size);
#endif // clang6
}
#else
(void)sizeof_t;
(void)padding;
#endif // MEMORY_SANITIZER
}
// Exchanges the contents of two images in O(1); no pixel data is copied,
// only the dimensions, stride and storage pointer change hands.
void ImageBase::Swap(ImageBase& other) {
  using std::swap;
  swap(xsize_, other.xsize_);
  swap(ysize_, other.ysize_);
  swap(bytes_per_row_, other.bytes_per_row_);
  swap(bytes_, other.bytes_);
}
} // namespace hwy
#endif // HWY_ONCE

468
third_party/highway/contrib/image/image.h поставляемый Normal file
Просмотреть файл

@ -0,0 +1,468 @@
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef HIGHWAY_CONTRIB_IMAGE_IMAGE_H_
#define HIGHWAY_CONTRIB_IMAGE_IMAGE_H_
// SIMD/multicore-friendly planar image representation with row accessors.
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <cstddef>
#include <utility> // std::move
#include "hwy/aligned_allocator.h"
#include "hwy/base.h"
namespace hwy {
// Type-independent parts of Image<> - reduces code duplication and facilitates
// moving member function implementations to cc file.
struct ImageBase {
// Returns required alignment in bytes for externally allocated memory.
static size_t VectorSize();
// Returns distance [bytes] between the start of two consecutive rows, a
// multiple of VectorSize but NOT kAlias (see implementation).
static size_t BytesPerRow(const size_t xsize, const size_t sizeof_t);
// No allocation (for output params or unused images)
ImageBase()
: xsize_(0),
ysize_(0),
bytes_per_row_(0),
bytes_(nullptr, AlignedFreer(&AlignedFreer::DoNothing, nullptr)) {}
// Allocates memory (this is the common case)
ImageBase(size_t xsize, size_t ysize, size_t sizeof_t);
// References but does not take ownership of external memory. Useful for
// interoperability with other libraries. `aligned` must be aligned to a
// multiple of VectorSize() and `bytes_per_row` must also be a multiple of
// VectorSize() or preferably equal to BytesPerRow().
ImageBase(size_t xsize, size_t ysize, size_t bytes_per_row, void* aligned);
// Copy construction/assignment is forbidden to avoid inadvertent copies,
// which can be very expensive. Use CopyImageTo() instead.
ImageBase(const ImageBase& other) = delete;
ImageBase& operator=(const ImageBase& other) = delete;
// Move constructor (required for returning Image from function)
ImageBase(ImageBase&& other) noexcept = default;
// Move assignment (required for std::vector)
ImageBase& operator=(ImageBase&& other) noexcept = default;
// O(1) exchange of contents with another image; no pixels are copied.
void Swap(ImageBase& other);
// Useful for pre-allocating image with some padding for alignment purposes
// and later reporting the actual valid dimensions. Caller is responsible
// for ensuring xsize/ysize are <= the original dimensions.
void ShrinkTo(const size_t xsize, const size_t ysize) {
xsize_ = static_cast<uint32_t>(xsize);
ysize_ = static_cast<uint32_t>(ysize);
// NOTE: we can't recompute bytes_per_row for more compact storage and
// better locality because that would invalidate the image contents.
}
// How many pixels.
HWY_INLINE size_t xsize() const { return xsize_; }
HWY_INLINE size_t ysize() const { return ysize_; }
// NOTE: do not use this for copying rows - the valid xsize may be much less.
HWY_INLINE size_t bytes_per_row() const { return bytes_per_row_; }
// Raw access to byte contents, for interfacing with other libraries.
// Unsigned char instead of char to avoid surprises (sign extension).
// NOTE(review): both overloads assume 64-byte alignment of the allocation,
// which the allocating constructor guarantees; external memory should too.
HWY_INLINE uint8_t* bytes() {
void* p = bytes_.get();
return static_cast<uint8_t * HWY_RESTRICT>(HWY_ASSUME_ALIGNED(p, 64));
}
HWY_INLINE const uint8_t* bytes() const {
const void* p = bytes_.get();
return static_cast<const uint8_t * HWY_RESTRICT>(HWY_ASSUME_ALIGNED(p, 64));
}
protected:
// Returns pointer to the start of a row.
// Under sanitizer builds, aborts when y is out of range.
HWY_INLINE void* VoidRow(const size_t y) const {
#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
defined(THREAD_SANITIZER)
if (y >= ysize_) {
HWY_ABORT("Row(%zu) >= %u\n", y, ysize_);
}
#endif
void* row = bytes_.get() + y * bytes_per_row_;
return HWY_ASSUME_ALIGNED(row, 64);
}
enum class Padding {
// Allow Load(d, row + x) for x = 0; x < xsize(); x += Lanes(d). Default.
kRoundUp,
// Allow LoadU(d, row + x) for x <= xsize() - 1. This requires an extra
// vector to be initialized. If done by default, this would suppress
// legitimate msan warnings. We therefore require users to explicitly call
// InitializePadding before using unaligned loads (e.g. convolution).
kUnaligned
};
// Initializes the minimum bytes required to suppress msan warnings from
// legitimate (according to Padding mode) vector loads/stores on the right
// border, where some lanes are uninitialized and assumed to be unused.
void InitializePadding(size_t sizeof_t, Padding padding);
// (Members are non-const to enable assignment during move-assignment.)
uint32_t xsize_; // In valid pixels, not including any padding.
uint32_t ysize_;
size_t bytes_per_row_; // Includes padding.
AlignedFreeUniquePtr<uint8_t[]> bytes_;
};
// Single channel, aligned rows separated by padding. T must be POD.
//
// 'Single channel' (one 2D array per channel) simplifies vectorization
// (repeating the same operation on multiple adjacent components) without the
// complexity of a hybrid layout (8 R, 8 G, 8 B, ...). In particular, clients
// can easily iterate over all components in a row and Image requires no
// knowledge of the pixel format beyond the component type "T".
//
// 'Aligned' means each row is aligned to the L1 cache line size. This prevents
// false sharing between two threads operating on adjacent rows.
//
// 'Padding' is still relevant because vectors could potentially be larger than
// a cache line. By rounding up row sizes to the vector size, we allow
// reading/writing ALIGNED vectors whose first lane is a valid sample. This
// avoids needing a separate loop to handle remaining unaligned lanes.
//
// This image layout could also be achieved with a vector and a row accessor
// function, but a class wrapper with support for "deleter" allows wrapping
// existing memory allocated by clients without copying the pixels. It also
// provides convenient accessors for xsize/ysize, which shortens function
// argument lists. Supports move-construction so it can be stored in containers.
template <typename ComponentType>
class Image : public ImageBase {
public:
using T = ComponentType;
Image() = default;
// Allocating constructor; storage layout is determined by ImageBase.
Image(const size_t xsize, const size_t ysize)
: ImageBase(xsize, ysize, sizeof(T)) {}
// Non-owning constructor wrapping external, suitably aligned memory.
Image(const size_t xsize, const size_t ysize, size_t bytes_per_row,
void* aligned)
: ImageBase(xsize, ysize, bytes_per_row, aligned) {}
// Must be called before performing unaligned loads near row ends so msan
// does not flag the (deliberately readable) padding bytes.
void InitializePaddingForUnalignedAccesses() {
InitializePadding(sizeof(T), Padding::kUnaligned);
}
HWY_INLINE const T* ConstRow(const size_t y) const {
return static_cast<const T*>(VoidRow(y));
}
HWY_INLINE const T* ConstRow(const size_t y) {
return static_cast<const T*>(VoidRow(y));
}
// Returns pointer to non-const. This allows passing const Image* parameters
// when the callee is only supposed to fill the pixels, as opposed to
// allocating or resizing the image.
HWY_INLINE T* MutableRow(const size_t y) const {
return static_cast<T*>(VoidRow(y));
}
HWY_INLINE T* MutableRow(const size_t y) {
return static_cast<T*>(VoidRow(y));
}
// Returns number of pixels (some of which are padding) per row. Useful for
// computing other rows via pointer arithmetic. WARNING: this must
// NOT be used to determine xsize.
HWY_INLINE intptr_t PixelsPerRow() const {
return static_cast<intptr_t>(bytes_per_row_ / sizeof(T));
}
};
// Convenience alias for the most common sample type.
using ImageF = Image<float>;
// A bundle of 3 same-sized images. To fill an existing Image3 using
// single-channel producers, we also need access to each const Image*. Const
// prevents breaking the same-size invariant, while still allowing pixels to be
// changed via MutableRow.
// Three independently-allocated planes of identical dimensions; the
// plane-adopting constructor enforces the same-size invariant via SameSize.
template <typename ComponentType>
class Image3 {
public:
using T = ComponentType;
using ImageT = Image<T>;
static constexpr size_t kNumPlanes = 3;
// Default: three empty (0 x 0) planes.
Image3() : planes_{ImageT(), ImageT(), ImageT()} {}
// Allocates three xsize x ysize planes.
Image3(const size_t xsize, const size_t ysize)
: planes_{ImageT(xsize, ysize), ImageT(xsize, ysize),
ImageT(xsize, ysize)} {}
Image3(Image3&& other) noexcept {
for (size_t i = 0; i < kNumPlanes; i++) {
planes_[i] = std::move(other.planes_[i]);
}
}
// Takes ownership of three planes; aborts if their sizes differ.
Image3(ImageT&& plane0, ImageT&& plane1, ImageT&& plane2) {
if (!SameSize(plane0, plane1) || !SameSize(plane0, plane2)) {
HWY_ABORT("Not same size: %zu x %zu, %zu x %zu, %zu x %zu\n",
plane0.xsize(), plane0.ysize(), plane1.xsize(), plane1.ysize(),
plane2.xsize(), plane2.ysize());
}
planes_[0] = std::move(plane0);
planes_[1] = std::move(plane1);
planes_[2] = std::move(plane2);
}
// Copy construction/assignment is forbidden to avoid inadvertent copies,
// which can be very expensive. Use CopyImageTo instead.
Image3(const Image3& other) = delete;
Image3& operator=(const Image3& other) = delete;
Image3& operator=(Image3&& other) noexcept {
for (size_t i = 0; i < kNumPlanes; i++) {
planes_[i] = std::move(other.planes_[i]);
}
return *this;
}
// Returns pointer to the first (const) sample of row `y` in plane `c`.
HWY_INLINE const T* ConstPlaneRow(const size_t c, const size_t y) const {
return static_cast<const T*>(VoidPlaneRow(c, y));
}
// Overload for non-const objects; still returns a pointer-to-const row.
HWY_INLINE const T* ConstPlaneRow(const size_t c, const size_t y) {
return static_cast<const T*>(VoidPlaneRow(c, y));
}
// Returns pointer to non-const row; the const overload allows filling a
// const Image3* (same rationale as Image::MutableRow).
HWY_INLINE T* MutablePlaneRow(const size_t c, const size_t y) const {
return static_cast<T*>(VoidPlaneRow(c, y));
}
HWY_INLINE T* MutablePlaneRow(const size_t c, const size_t y) {
return static_cast<T*>(VoidPlaneRow(c, y));
}
// Read-only access to one plane; `idx` must be < kNumPlanes.
HWY_INLINE const ImageT& Plane(size_t idx) const { return planes_[idx]; }
// Exchanges all three planes with `other` (delegates to Image::Swap).
void Swap(Image3& other) {
for (size_t c = 0; c < 3; ++c) {
other.planes_[c].Swap(planes_[c]);
}
}
// Reduces the logical size of every plane without reallocating.
void ShrinkTo(const size_t xsize, const size_t ysize) {
for (ImageT& plane : planes_) {
plane.ShrinkTo(xsize, ysize);
}
}
// Sizes of all three images are guaranteed to be equal.
HWY_INLINE size_t xsize() const { return planes_[0].xsize(); }
HWY_INLINE size_t ysize() const { return planes_[0].ysize(); }
// Returns offset [bytes] from one row to the next row of the same plane.
// WARNING: this must NOT be used to determine xsize, nor for copying rows -
// the valid xsize may be much less.
HWY_INLINE size_t bytes_per_row() const { return planes_[0].bytes_per_row(); }
// Returns number of pixels (some of which are padding) per row. Useful for
// computing other rows via pointer arithmetic. WARNING: this must NOT be used
// to determine xsize.
HWY_INLINE intptr_t PixelsPerRow() const { return planes_[0].PixelsPerRow(); }
private:
// Returns pointer to the start of a row.
HWY_INLINE void* VoidPlaneRow(const size_t c, const size_t y) const {
#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
defined(THREAD_SANITIZER)
// Bounds check only under sanitizers, keeping release builds branch-free.
if (c >= kNumPlanes || y >= ysize()) {
HWY_ABORT("PlaneRow(%zu, %zu) >= %zu\n", c, y, ysize());
}
#endif
// Use the first plane's stride because the compiler might not realize they
// are all equal. Thus we only need a single multiplication for all planes.
const size_t row_offset = y * planes_[0].bytes_per_row();
const void* row = planes_[c].bytes() + row_offset;
// NOTE(review): the cast target is pointer-to-const yet the function
// returns void*; presumably the HWY_RESTRICT/HWY_ASSUME_ALIGNED macro
// expansion makes this well-formed — verify against upstream highway.
return static_cast<const T * HWY_RESTRICT>(
HWY_ASSUME_ALIGNED(row, HWY_ALIGNMENT));
}
private:
ImageT planes_[kNumPlanes];
};
using Image3F = Image3<float>;
// Rectangular region in image(s). Factoring this out of Image instead of
// shifting the pointer by x0/y0 allows this to apply to multiple images with
// different resolutions. Can compare size via SameSize(rect1, rect2).
class Rect {
 public:
  // Border-aware window: covers xsize_max * ysize_max samples unless that
  // would extend past (xend, yend), in which case the extent is clipped.
  constexpr Rect(size_t xbegin, size_t ybegin, size_t xsize_max,
                 size_t ysize_max, size_t xend, size_t yend)
      : x0_(xbegin),
        y0_(ybegin),
        xsize_(ClampedSize(xbegin, xsize_max, xend)),
        ysize_(ClampedSize(ybegin, ysize_max, yend)) {}

  // Origin plus an already-known extent (typically taken from another Rect).
  constexpr Rect(size_t xbegin, size_t ybegin, size_t xsize, size_t ysize)
      : x0_(xbegin), y0_(ybegin), xsize_(xsize), ysize_(ysize) {}

  // Covers an entire image.
  template <typename Image>
  explicit Rect(const Image& image)
      : Rect(0, 0, image.xsize(), image.ysize()) {}

  // Empty rect at the origin.
  Rect() : Rect(0, 0, 0, 0) {}

  Rect(const Rect&) = default;
  Rect& operator=(const Rect&) = default;

  // Window inside this rect, clipped against this rect's far edges.
  Rect Subrect(size_t xbegin, size_t ybegin, size_t xsize_max,
               size_t ysize_max) {
    const size_t xend = x0_ + xsize_;
    const size_t yend = y0_ + ysize_;
    return Rect(x0_ + xbegin, y0_ + ybegin, xsize_max, ysize_max, xend, yend);
  }

  // Row accessors that apply this rect's (x0, y0) offset to the given image.
  template <typename T>
  const T* ConstRow(const Image<T>* image, size_t y) const {
    return image->ConstRow(y + y0_) + x0_;
  }
  template <typename T>
  T* MutableRow(const Image<T>* image, size_t y) const {
    return image->MutableRow(y + y0_) + x0_;
  }
  template <typename T>
  const T* ConstPlaneRow(const Image3<T>& image, size_t c, size_t y) const {
    return image.ConstPlaneRow(c, y + y0_) + x0_;
  }
  template <typename T>
  T* MutablePlaneRow(Image3<T>* image, const size_t c, size_t y) const {
    return image->MutablePlaneRow(c, y + y0_) + x0_;
  }

  // True if this rect lies entirely within `image` (Image<T> or Image3<T>;
  // passing a Rect here yields nonsensical results).
  template <class ImageT>
  bool IsInside(const ImageT& image) const {
    return (x0_ + xsize_ <= image.xsize()) && (y0_ + ysize_ <= image.ysize());
  }

  size_t x0() const { return x0_; }
  size_t y0() const { return y0_; }
  size_t xsize() const { return xsize_; }
  size_t ysize() const { return ysize_; }

 private:
  // size_max, clipped to whatever is left of [begin, end); 0 if begin >= end.
  static constexpr size_t ClampedSize(size_t begin, size_t size_max,
                                      size_t end) {
    return (end >= begin + size_max) ? size_max
                                     : (end > begin ? end - begin : 0);
  }

  size_t x0_;
  size_t y0_;
  size_t xsize_;
  size_t ysize_;
};
// True when both image-like arguments have identical xsize() and ysize().
template <class Image1, class Image2>
HWY_MAYBE_UNUSED bool SameSize(const Image1& image1, const Image2& image2) {
  const bool same_width = image1.xsize() == image2.xsize();
  const bool same_height = image1.ysize() == image2.ysize();
  return same_width && same_height;
}
// Mirrors out of bounds coordinates and returns valid coordinates unchanged.
// The mirror is outside the last column (the border pixel is replicated):
// ... 1, 0 | 0, 1, ..., xsize-1 | xsize-1, xsize-2, ...
// That pattern is periodic with period 2*xsize, so a single remainder plus
// one reflection suffices. This replaces the previous iterative version
// (upstream TODO "replace with branchless") and runs in O(1) regardless of
// how far outside the image `x` lies.
static HWY_INLINE HWY_MAYBE_UNUSED size_t Mirror(int64_t x,
                                                 const int64_t xsize) {
  HWY_DASSERT(xsize != 0);
  const int64_t period = 2 * xsize;
  x %= period;             // C++ remainder keeps the sign of x, ...
  if (x < 0) x += period;  // ... so shift negatives into [0, period).
  // Second half of the period maps back onto the image, reversed.
  if (x >= xsize) x = period - 1 - x;
  return static_cast<size_t>(x);
}
// Wrap modes for ensuring X/Y coordinates are in the valid range [0, size):
// Mirrors (repeating the edge pixel once). Useful for convolutions.
struct WrapMirror {
// Functor: maps any coordinate to a valid one via Mirror() above.
HWY_INLINE size_t operator()(const int64_t coord, const size_t size) const {
return Mirror(coord, static_cast<int64_t>(size));
}
};
// Returns the same coordinate, for when we know "coord" is already valid (e.g.
// interior of an image).
struct WrapUnchanged {
// Cast-only; caller guarantees coord is in [0, size).
HWY_INLINE size_t operator()(const int64_t coord, size_t /*size*/) const {
return static_cast<size_t>(coord);
}
};
// Similar to Wrap* but for row pointers (reduces Row() multiplications).
class WrapRowMirror {
public:
// Captures pointers to the first and last valid rows of `image`.
template <class View>
WrapRowMirror(const View& image, size_t ysize)
: first_row_(image.ConstRow(0)), last_row_(image.ConstRow(ysize - 1)) {}
// Returns `row` unchanged when within [first_row_, last_row_], otherwise
// the mirrored in-bounds row. `stride` is the pointer distance (in floats)
// between adjacent rows; rows are assumed contiguous with that stride.
const float* operator()(const float* const HWY_RESTRICT row,
const int64_t stride) const {
if (row < first_row_) {
const int64_t num_before = first_row_ - row;
// Mirrored; one row before => row 0, two before = row 1, ...
return first_row_ + num_before - stride;
}
if (row > last_row_) {
const int64_t num_after = row - last_row_;
// Mirrored; one row after => last row, two after = last - 1, ...
return last_row_ - num_after + stride;
}
return row;
}
private:
const float* const HWY_RESTRICT first_row_;
const float* const HWY_RESTRICT last_row_;
};
// No-op row wrapper: `row` must already point at a valid image row.
struct WrapRowUnchanged {
HWY_INLINE const float* operator()(const float* const HWY_RESTRICT row,
int64_t /*stride*/) const {
return row;
}
};
} // namespace hwy
#endif // HIGHWAY_CONTRIB_IMAGE_IMAGE_H_

151
third_party/highway/contrib/image/image_test.cc поставляемый Normal file
Просмотреть файл

@ -0,0 +1,151 @@
// Copyright (c) the JPEG XL Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "contrib/image/image.h"
#include <cstddef>
#include "hwy/base.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "contrib/image/image_test.cc"
#include "hwy/foreach_target.h"
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <random>
#include <utility>
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Ensure we can always write full aligned vectors.
struct TestAlignedT {
template <typename T>
void operator()(T /*unused*/) const {
std::mt19937 rng(129);
std::uniform_int_distribution<int> dist(0, 16);
const HWY_FULL(T) d;
for (size_t ysize = 1; ysize < 4; ++ysize) {
for (size_t xsize = 1; xsize < 64; ++xsize) {
Image<T> img(xsize, ysize);
// Aligned full-vector stores at every multiple of Lanes(d); the final
// store may extend past xsize into the row padding, which Image must
// have reserved.
for (size_t y = 0; y < ysize; ++y) {
T* HWY_RESTRICT row = img.MutableRow(y);
for (size_t x = 0; x < xsize; x += Lanes(d)) {
const auto values = Iota(d, dist(rng));
Store(values, d, row + x);
}
}
// Sanity check to prevent optimizing out the writes
const auto x = std::uniform_int_distribution<size_t>(0, xsize - 1)(rng);
const auto y = std::uniform_int_distribution<size_t>(0, ysize - 1)(rng);
HWY_ASSERT(img.ConstRow(y)[x] < 16 + Lanes(d));
}
}
}
};
void TestAligned() { ForUnsignedTypes(TestAlignedT()); }
// Ensure we can write an unaligned vector starting at the last valid value.
struct TestUnalignedT {
template <typename T>
void operator()(T /*unused*/) const {
std::mt19937 rng(129);
std::uniform_int_distribution<int> dist(0, 3);
const HWY_FULL(T) d;
for (size_t ysize = 1; ysize < 4; ++ysize) {
for (size_t xsize = 1; xsize < 128; ++xsize) {
Image<T> img(xsize, ysize);
img.InitializePaddingForUnalignedAccesses();
// This test reads padding, which only works if it was initialized,
// which only happens in MSAN builds.
#if defined(MEMORY_SANITIZER) || HWY_IDE
// Initialize only the valid samples
for (size_t y = 0; y < ysize; ++y) {
T* HWY_RESTRICT row = img.MutableRow(y);
for (size_t x = 0; x < xsize; ++x) {
row[x] = 1 << dist(rng);
}
}
// Read padding bits
// (each LoadU may read up to Lanes(d)-1 padding samples past row[x])
auto accum = Zero(d);
for (size_t y = 0; y < ysize; ++y) {
T* HWY_RESTRICT row = img.MutableRow(y);
for (size_t x = 0; x < xsize; ++x) {
accum |= LoadU(d, row + x);
}
}
// Ensure padding was zero
// (valid samples are powers of two <= 8, so OR-ing stays < 16 unless a
// nonzero padding value leaked in)
const size_t N = Lanes(d);
auto lanes = AllocateAligned<T>(N);
Store(accum, d, lanes.get());
for (size_t i = 0; i < N; ++i) {
HWY_ASSERT(lanes[i] < 16);
}
#else // Check that writing padding does not overwrite valid samples
// Initialize only the valid samples
for (size_t y = 0; y < ysize; ++y) {
T* HWY_RESTRICT row = img.MutableRow(y);
for (size_t x = 0; x < xsize; ++x) {
row[x] = static_cast<T>(x);
}
}
// Zero padding and rightmost sample
for (size_t y = 0; y < ysize; ++y) {
T* HWY_RESTRICT row = img.MutableRow(y);
StoreU(Zero(d), d, row + xsize - 1);
}
// Ensure no samples except the rightmost were overwritten
for (size_t y = 0; y < ysize; ++y) {
T* HWY_RESTRICT row = img.MutableRow(y);
for (size_t x = 0; x < xsize - 1; ++x) {
HWY_ASSERT_EQ(static_cast<T>(x), row[x]);
}
}
#endif
}
}
}
};
void TestUnaligned() { ForUnsignedTypes(TestUnalignedT()); }
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
HWY_BEFORE_TEST(ImageTest);
HWY_EXPORT_AND_TEST_P(ImageTest, TestAligned);
HWY_EXPORT_AND_TEST_P(ImageTest, TestUnaligned);
} // namespace hwy
#endif

1192
third_party/highway/contrib/math/math-inl.h поставляемый Normal file

Разница между файлами не показана из-за своего большого размера Загрузить разницу

188
third_party/highway/contrib/math/math_test.cc поставляемый Normal file
Просмотреть файл

@ -0,0 +1,188 @@
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cfloat> // FLT_MAX
#include <iostream>
#include <type_traits>
// clang-format off
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "contrib/math/math_test.cc"
#include "hwy/foreach_target.h"
#include "contrib/math/math-inl.h"
#include "hwy/tests/test_util-inl.h"
// clang-format on
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Compares the vectorized function `fxN` against the scalar reference `fx1`
// over [min, max], sampling the range densely in bit space, and asserts the
// observed ULP error never exceeds `max_error_ulp`.
template <class T, class D>
void TestMath(const std::string name, T (*fx1)(T), Vec<D> (*fxN)(D, Vec<D>),
D d, T min, T max, uint64_t max_error_ulp) {
constexpr bool kIsF32 = (sizeof(T) == 4);
using UintT = MakeUnsigned<T>;
// Iterate over bit patterns so sampling density follows float density.
const UintT min_bits = BitCast<UintT>(min);
const UintT max_bits = BitCast<UintT>(max);
// If min is negative and max is positive, the range needs to be broken into
// two pieces, [+0, max] and [-0, min], otherwise [min, max].
int range_count = 1;
UintT ranges[2][2] = {{min_bits, max_bits}, {0, 0}};
if ((min < 0.0) && (max > 0.0)) {
ranges[0][0] = BitCast<UintT>(static_cast<T>(+0.0));
ranges[0][1] = max_bits;
ranges[1][0] = BitCast<UintT>(static_cast<T>(-0.0));
ranges[1][1] = min_bits;
range_count = 2;
}
uint64_t max_ulp = 0;
#if HWY_ARCH_ARM
// Emulation is slower, so cannot afford as many.
constexpr UintT kSamplesPerRange = 25000;
#else
constexpr UintT kSamplesPerRange = 100000;
#endif
for (int range_index = 0; range_index < range_count; ++range_index) {
const UintT start = ranges[range_index][0];
const UintT stop = ranges[range_index][1];
const UintT step = std::max<UintT>(1, ((stop - start) / kSamplesPerRange));
for (UintT value_bits = start; value_bits <= stop; value_bits += step) {
// Clamp so the final, possibly-overshooting step still tests `stop`.
const T value = BitCast<T>(std::min(value_bits, stop));
const T actual = GetLane(fxN(d, Set(d, value)));
const T expected = fx1(value);
// Skip small inputs and outputs on armv7, it flushes subnormals to zero.
#if HWY_TARGET == HWY_NEON && HWY_ARCH_ARM_V7
if ((std::abs(value) < 1e-37f) || (std::abs(expected) < 1e-37f)) {
continue;
}
#endif
const auto ulp = ComputeUlpDelta(actual, expected);
max_ulp = std::max<uint64_t>(max_ulp, ulp);
// Print the offending input before the assert below fires.
if (ulp > max_error_ulp) {
std::cout << name << "<" << (kIsF32 ? "F32x" : "F64x") << Lanes(d)
<< ">(" << value << ") expected: " << expected
<< " actual: " << actual << std::endl;
}
HWY_ASSERT(ulp <= max_error_ulp);
}
}
std::cout << (kIsF32 ? "F32x" : "F64x") << Lanes(d)
<< ", Max ULP: " << max_ulp << std::endl;
}
// Declares a Test##NAME functor that dispatches to TestMath with the f32 or
// f64 reference function, input range and ULP bound, plus a TestAll##NAME
// entry point that runs it for all float types and partial vectors.
// (Comments cannot appear inside the macro body due to line continuations.)
#define DEFINE_MATH_TEST(NAME, F32x1, F32xN, F32_MIN, F32_MAX, F32_ERROR, \
F64x1, F64xN, F64_MIN, F64_MAX, F64_ERROR) \
struct Test##NAME { \
template <class T, class D> \
HWY_NOINLINE void operator()(T, D d) { \
if (sizeof(T) == 4) { \
TestMath<T, D>(HWY_STR(NAME), F32x1, F32xN, d, F32_MIN, F32_MAX, \
F32_ERROR); \
} else { \
TestMath<T, D>(HWY_STR(NAME), F64x1, F64xN, d, F64_MIN, F64_MAX, \
F64_ERROR); \
} \
} \
}; \
HWY_NOINLINE void TestAll##NAME() { \
ForFloatTypes(ForPartialVectors<Test##NAME>()); \
}
// Floating point values closest to but less than 1.0
const float kNearOneF = BitCast<float>(0x3F7FFFFF);
const double kNearOneD = BitCast<double>(0x3FEFFFFFFFFFFFFFULL);
// clang-format off
DEFINE_MATH_TEST(Acos,
std::acos, CallAcos, -1.0, +1.0, 3, // NEON is 3 instead of 2
std::acos, CallAcos, -1.0, +1.0, 2)
DEFINE_MATH_TEST(Acosh,
std::acosh, CallAcosh, +1.0, +FLT_MAX, 3,
std::acosh, CallAcosh, +1.0, +DBL_MAX, 3)
DEFINE_MATH_TEST(Asin,
std::asin, CallAsin, -1.0, +1.0, 3, // NEON is 3 instead of 2
std::asin, CallAsin, -1.0, +1.0, 2)
DEFINE_MATH_TEST(Asinh,
std::asinh, CallAsinh, -FLT_MAX, +FLT_MAX, 3,
std::asinh, CallAsinh, -DBL_MAX, +DBL_MAX, 3)
DEFINE_MATH_TEST(Atan,
std::atan, CallAtan, -FLT_MAX, +FLT_MAX, 3,
std::atan, CallAtan, -DBL_MAX, +DBL_MAX, 3)
DEFINE_MATH_TEST(Atanh,
std::atanh, CallAtanh, -kNearOneF, +kNearOneF, 4, // NEON is 4 instead of 3
std::atanh, CallAtanh, -kNearOneD, +kNearOneD, 3)
DEFINE_MATH_TEST(Cos,
std::cos, CallCos, -39000.0, +39000.0, 3,
std::cos, CallCos, -39000.0, +39000.0, 3)
DEFINE_MATH_TEST(Exp,
std::exp, CallExp, -FLT_MAX, +104.0, 1,
std::exp, CallExp, -DBL_MAX, +104.0, 1)
DEFINE_MATH_TEST(Expm1,
std::expm1, CallExpm1, -FLT_MAX, +104.0, 4,
std::expm1, CallExpm1, -DBL_MAX, +104.0, 4)
DEFINE_MATH_TEST(Log,
std::log, CallLog, +FLT_MIN, +FLT_MAX, 1,
std::log, CallLog, +DBL_MIN, +DBL_MAX, 1)
DEFINE_MATH_TEST(Log10,
std::log10, CallLog10, +FLT_MIN, +FLT_MAX, 2,
std::log10, CallLog10, +DBL_MIN, +DBL_MAX, 2)
DEFINE_MATH_TEST(Log1p,
std::log1p, CallLog1p, +0.0f, +1e37, 3, // NEON is 3 instead of 2
std::log1p, CallLog1p, +0.0, +DBL_MAX, 2)
DEFINE_MATH_TEST(Log2,
std::log2, CallLog2, +FLT_MIN, +FLT_MAX, 2,
std::log2, CallLog2, +DBL_MIN, +DBL_MAX, 2)
DEFINE_MATH_TEST(Sin,
std::sin, CallSin, -39000.0, +39000.0, 3,
std::sin, CallSin, -39000.0, +39000.0, 3)
DEFINE_MATH_TEST(Sinh,
std::sinh, CallSinh, -80.0f, +80.0f, 4,
std::sinh, CallSinh, -709.0, +709.0, 4)
DEFINE_MATH_TEST(Tanh,
std::tanh, CallTanh, -FLT_MAX, +FLT_MAX, 4,
std::tanh, CallTanh, -DBL_MAX, +DBL_MAX, 4)
// clang-format on
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
HWY_BEFORE_TEST(HwyMathTest);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAcos);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAcosh);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAsin);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAsinh);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAtan);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAtanh);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllCos);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllExp);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllExpm1);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog10);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog1p);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog2);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllSin);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllSinh);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllTanh);
} // namespace hwy
#endif

31
third_party/highway/debian/changelog поставляемый Normal file
Просмотреть файл

@ -0,0 +1,31 @@
highway (0.12.0-1) UNRELEASED; urgency=medium
* Add Shift*8, Compress16, emulated Scatter/Gather, StoreInterleaved3/4
* Remove deprecated HWY_*_LANES, deprecate HWY_GATHER_LANES
* Proper IEEE rounding, reduce libstdc++ usage, inlined math
-- Jan Wassenberg <janwas@google.com> Thu, 15 Apr 2021 20:00:00 +0200
highway (0.11.1-1) UNRELEASED; urgency=medium
* Fix clang7 asan error, finish f16 conversions and add test
-- Jan Wassenberg <janwas@google.com> Thu, 25 Feb 2021 16:00:00 +0200
highway (0.11.0-1) UNRELEASED; urgency=medium
* Add RVV+mask logical ops, allow Shl/ShiftLeftSame on all targets, more math
-- Jan Wassenberg <janwas@google.com> Thu, 18 Feb 2021 20:00:00 +0200
highway (0.7.0-1) UNRELEASED; urgency=medium
* Added API stability notice, Compress[Store], contrib/, SignBit, CopySign
-- Jan Wassenberg <janwas@google.com> Tue, 5 Jan 2021 17:00:00 +0200
highway (0.1-1) UNRELEASED; urgency=medium
* Initial debian package.
-- Alex Deymo <deymo@google.com> Mon, 19 Oct 2020 16:48:07 +0200

1
third_party/highway/debian/compat поставляемый Normal file
Просмотреть файл

@ -0,0 +1 @@
10

23
third_party/highway/debian/control поставляемый Normal file
Просмотреть файл

@ -0,0 +1,23 @@
Source: highway
Maintainer: JPEG XL Maintainers <jpegxl@google.com>
Section: misc
Priority: optional
Standards-Version: 3.9.8
Build-Depends: cmake,
debhelper (>= 9),
libgtest-dev
Homepage: https://github.com/google/highway
Package: libhwy-dev
Architecture: any
Section: libdevel
Depends: ${misc:Depends}
Description: Efficient and performance-portable SIMD wrapper (developer files)
This library provides type-safe and source-code portable wrappers over
existing platform-specific intrinsics. Its design aims for simplicity,
reliable efficiency across platforms, and immediate usability with current
compilers.
.
This package installs the development files. There's no runtime library
since most of Highway is implemented in headers and only a very small
static library is needed.

20
third_party/highway/debian/copyright поставляемый Normal file
Просмотреть файл

@ -0,0 +1,20 @@
Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
Upstream-Name: highway
Files: *
Copyright: 2020 Google LLC
License: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
.
http://www.apache.org/licenses/LICENSE-2.0
.
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
.
On Debian systems, the complete text of the Apache License, Version 2
can be found in "/usr/share/common-licenses/Apache-2.0".

6
third_party/highway/debian/rules поставляемый Normal file
Просмотреть файл

@ -0,0 +1,6 @@
#!/usr/bin/make -f
%:
dh $@ --buildsystem=cmake
override_dh_auto_configure:
dh_auto_configure -- -DHWY_SYSTEM_GTEST=ON

1
third_party/highway/debian/source/format поставляемый Normal file
Просмотреть файл

@ -0,0 +1 @@
3.0 (quilt)

138
third_party/highway/hwy/aligned_allocator.cc поставляемый Normal file
Просмотреть файл

@ -0,0 +1,138 @@
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/aligned_allocator.h"
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h> // malloc
#include <atomic>
#include <limits>
#include "hwy/base.h"
namespace hwy {
namespace {
constexpr size_t kAlignment = HWY_MAX(HWY_ALIGNMENT, kMaxVectorSize);
// On x86, aliasing can only occur at multiples of 2K, but that's too wasteful
// if this is used for single-vector allocations. 256 is more reasonable.
constexpr size_t kAlias = kAlignment * 4;
#pragma pack(push, 1)
struct AllocationHeader {
void* allocated;
size_t payload_size;
};
#pragma pack(pop)
// Returns a 'random' (cyclical) offset for AllocateAlignedBytes: cycles
// through the kAlias/kAlignment possible alignment groups so successive
// allocations land in different cache/alias sets.
size_t NextAlignedOffset() {
  static std::atomic<uint32_t> counter{0};
  constexpr uint32_t kNumGroups = kAlias / kAlignment;
  const uint32_t group_index =
      counter.fetch_add(1, std::memory_order_relaxed) % kNumGroups;
  const size_t byte_offset = group_index * kAlignment;
  HWY_DASSERT((byte_offset % kAlignment == 0) && byte_offset <= kAlias);
  return byte_offset;
}
} // namespace
// Allocates >= payload_size bytes aligned to kAlignment, with the address
// offset by a per-call cyclic multiple of kAlignment (mod kAlias) to reduce
// aliasing, and an AllocationHeader stashed immediately before the payload.
// Returns nullptr on failure or if the size would overflow.
void* AllocateAlignedBytes(const size_t payload_size, AllocPtr alloc_ptr,
void* opaque_ptr) {
// Reject sizes for which allocated_size below could overflow.
if (payload_size >= std::numeric_limits<size_t>::max() / 2) {
HWY_DASSERT(false && "payload_size too large");
return nullptr;
}
size_t offset = NextAlignedOffset();
// What: | misalign | unused | AllocationHeader |payload
// Size: |<= kAlias | offset |payload_size
// ^allocated.^aligned.^header............^payload
// The header must immediately precede payload, which must remain aligned.
// To avoid wasting space, the header resides at the end of `unused`,
// which therefore cannot be empty (offset == 0).
if (offset == 0) {
offset = kAlignment; // = RoundUpTo(sizeof(AllocationHeader), kAlignment)
static_assert(sizeof(AllocationHeader) <= kAlignment, "Else: round up");
}
const size_t allocated_size = kAlias + offset + payload_size;
void* allocated;
if (alloc_ptr == nullptr) {
allocated = malloc(allocated_size);
} else {
allocated = (*alloc_ptr)(opaque_ptr, allocated_size);
}
if (allocated == nullptr) return nullptr;
// Always round up even if already aligned - we already asked for kAlias
// extra bytes and there's no way to give them back.
uintptr_t aligned = reinterpret_cast<uintptr_t>(allocated) + kAlias;
static_assert((kAlias & (kAlias - 1)) == 0, "kAlias must be a power of 2");
static_assert(kAlias >= kAlignment, "Cannot align to more than kAlias");
aligned &= ~(kAlias - 1);
const uintptr_t payload = aligned + offset; // still aligned
// Stash `allocated` and payload_size inside header for FreeAlignedBytes().
// The allocated_size can be reconstructed from the payload_size.
AllocationHeader* header = reinterpret_cast<AllocationHeader*>(payload) - 1;
header->allocated = allocated;
header->payload_size = payload_size;
return HWY_ASSUME_ALIGNED(reinterpret_cast<void*>(payload), kMaxVectorSize);
}
// Frees memory returned by AllocateAlignedBytes by recovering the original
// allocation pointer from the AllocationHeader just before the payload.
// No-op for nullptr; uses free() when `free_ptr` is null.
void FreeAlignedBytes(const void* aligned_pointer, FreePtr free_ptr,
void* opaque_ptr) {
if (aligned_pointer == nullptr) return;
const uintptr_t payload = reinterpret_cast<uintptr_t>(aligned_pointer);
HWY_DASSERT(payload % kAlignment == 0);
// Header was placed immediately before the payload by AllocateAlignedBytes.
const AllocationHeader* header =
reinterpret_cast<const AllocationHeader*>(payload) - 1;
if (free_ptr == nullptr) {
free(header->allocated);
} else {
(*free_ptr)(opaque_ptr, header->allocated);
}
}
// static
// Runs `deleter` (element destructors) over the payload, then releases the
// underlying allocation exactly like FreeAlignedBytes. No-op for nullptr.
void AlignedDeleter::DeleteAlignedArray(void* aligned_pointer, FreePtr free_ptr,
void* opaque_ptr,
ArrayDeleter deleter) {
if (aligned_pointer == nullptr) return;
const uintptr_t payload = reinterpret_cast<uintptr_t>(aligned_pointer);
HWY_DASSERT(payload % kAlignment == 0);
const AllocationHeader* header =
reinterpret_cast<const AllocationHeader*>(payload) - 1;
// Destroy elements before freeing; payload_size tells the typed deleter how
// many elements the array holds.
if (deleter) {
(*deleter)(aligned_pointer, header->payload_size);
}
if (free_ptr == nullptr) {
free(header->allocated);
} else {
(*free_ptr)(opaque_ptr, header->allocated);
}
}
} // namespace hwy

179
third_party/highway/hwy/aligned_allocator.h поставляемый Normal file
Просмотреть файл

@ -0,0 +1,179 @@
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_
#define HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_
// Memory allocator with support for alignment and offsets.
#include <stddef.h>
#include <memory>
namespace hwy {
// Minimum alignment of allocated memory for use in HWY_ASSUME_ALIGNED, which
// requires a literal. This matches typical L1 cache line sizes, which prevents
// false sharing.
#define HWY_ALIGNMENT 64
// Pointers to functions equivalent to malloc/free with an opaque void* passed
// to them.
using AllocPtr = void* (*)(void* opaque, size_t bytes);
using FreePtr = void (*)(void* opaque, void* memory);
// Returns null or a pointer to at least `payload_size` (which can be zero)
// bytes of newly allocated memory, aligned to the larger of HWY_ALIGNMENT and
// the vector size. Calls `alloc` with the passed `opaque` pointer to obtain
// memory or malloc() if it is null.
void* AllocateAlignedBytes(size_t payload_size, AllocPtr alloc_ptr,
void* opaque_ptr);
// Frees all memory. No effect if `aligned_pointer` == nullptr, otherwise it
// must have been returned from a previous call to `AllocateAlignedBytes`.
// Calls `free_ptr` with the passed `opaque_ptr` pointer to free the memory; if
// `free_ptr` function is null, uses the default free().
void FreeAlignedBytes(const void* aligned_pointer, FreePtr free_ptr,
void* opaque_ptr);
// Class that deletes the aligned pointer passed to operator() calling the
// destructor before freeing the pointer. This is equivalent to the
// std::default_delete but for aligned objects. For a similar deleter equivalent
// to free() for aligned memory see AlignedFreer().
class AlignedDeleter {
public:
// Default: release via free(); the two-argument form routes the release
// through `free_ptr(opaque_ptr, memory)`.
AlignedDeleter() : free_(nullptr), opaque_ptr_(nullptr) {}
AlignedDeleter(FreePtr free_ptr, void* opaque_ptr)
: free_(free_ptr), opaque_ptr_(opaque_ptr) {}
// Destroys every element (via TypedArrayDeleter<T>) and frees the block.
template <typename T>
void operator()(T* aligned_pointer) const {
return DeleteAlignedArray(aligned_pointer, free_, opaque_ptr_,
TypedArrayDeleter<T>);
}
private:
// Invokes ~T() on each of the size_in_bytes / sizeof(T) elements.
template <typename T>
static void TypedArrayDeleter(void* ptr, size_t size_in_bytes) {
size_t elems = size_in_bytes / sizeof(T);
for (size_t i = 0; i < elems; i++) {
// Explicitly call the destructor on each element.
(static_cast<T*>(ptr) + i)->~T();
}
}
// Function prototype that calls the destructor for each element in a typed
// array. TypeArrayDeleter<T> would match this prototype.
using ArrayDeleter = void (*)(void* t_ptr, size_t t_size);
// Out-of-line worker (aligned_allocator.cc) that destroys then frees.
static void DeleteAlignedArray(void* aligned_pointer, FreePtr free_ptr,
void* opaque_ptr, ArrayDeleter deleter);
FreePtr free_;
void* opaque_ptr_;
};
// Unique pointer to T with custom aligned deleter. This can be a single
// element U or an array of element if T is a U[]. The custom aligned deleter
// will call the destructor on U or each element of a U[] in the array case.
template <typename T>
using AlignedUniquePtr = std::unique_ptr<T, AlignedDeleter>;
// Aligned memory equivalent of make_unique<T> using the custom allocators
// alloc/free with the passed `opaque` pointer. This function calls the
// constructor with the passed Args... and calls the destructor of the object
// when the AlignedUniquePtr is destroyed. Returns nullptr (wrapped) if the
// allocation fails.
template <typename T, typename... Args>
AlignedUniquePtr<T> MakeUniqueAlignedWithAlloc(AllocPtr alloc, FreePtr free,
                                               void* opaque, Args&&... args) {
  T* ptr = static_cast<T*>(AllocateAlignedBytes(sizeof(T), alloc, opaque));
  // Placement-new on a null pointer is undefined behavior; propagate the
  // allocation failure to the caller as a null unique_ptr instead.
  if (ptr == nullptr) return AlignedUniquePtr<T>();
  return AlignedUniquePtr<T>(new (ptr) T(std::forward<Args>(args)...),
                             AlignedDeleter(free, opaque));
}
// Similar to MakeUniqueAlignedWithAlloc but using the default alloc/free
// functions. Returns nullptr (wrapped) if the allocation fails.
template <typename T, typename... Args>
AlignedUniquePtr<T> MakeUniqueAligned(Args&&... args) {
  T* ptr = static_cast<T*>(AllocateAlignedBytes(
      sizeof(T), /*alloc_ptr=*/nullptr, /*opaque_ptr=*/nullptr));
  // Avoid UB: never placement-new on a null pointer.
  if (ptr == nullptr) return AlignedUniquePtr<T>();
  return AlignedUniquePtr<T>(
      new (ptr) T(std::forward<Args>(args)...), AlignedDeleter());
}
// Aligned memory equivalent of make_unique<T[]> for array types using the
// custom allocators alloc/free. This function calls the constructor with the
// passed Args... on every created item. The destructor of each element will be
// called when the AlignedUniquePtr is destroyed. Returns nullptr (wrapped) if
// the byte count would overflow or the allocation fails.
// NOTE: `args` are forwarded once per element, so move-only arguments would
// be moved repeatedly; pass copyable arguments (pre-existing behavior).
template <typename T, typename... Args>
AlignedUniquePtr<T[]> MakeUniqueAlignedArrayWithAlloc(
    size_t items, AllocPtr alloc, FreePtr free, void* opaque, Args&&... args) {
  // Guard items * sizeof(T) against size_t overflow, which would otherwise
  // allocate a too-small block and overrun it in the construction loop.
  if (items > static_cast<size_t>(-1) / sizeof(T)) {
    return AlignedUniquePtr<T[]>();
  }
  T* ptr =
      static_cast<T*>(AllocateAlignedBytes(items * sizeof(T), alloc, opaque));
  // Avoid UB: never placement-new on a null pointer.
  if (ptr == nullptr) return AlignedUniquePtr<T[]>();
  for (size_t i = 0; i < items; i++) {
    new (ptr + i) T(std::forward<Args>(args)...);
  }
  return AlignedUniquePtr<T[]>(ptr, AlignedDeleter(free, opaque));
}
// Same as MakeUniqueAlignedArrayWithAlloc but using the default (malloc/free)
// allocator.
template <typename T, typename... Args>
AlignedUniquePtr<T[]> MakeUniqueAlignedArray(size_t items, Args&&... args) {
return MakeUniqueAlignedArrayWithAlloc<T, Args...>(
items, nullptr, nullptr, nullptr, std::forward<Args>(args)...);
}
// Custom deleter for std::unique_ptr equivalent to using free() as a deleter
// but for aligned memory. Unlike AlignedDeleter, element destructors are NOT
// run - only for trivially-destructible (POD) payloads.
class AlignedFreer {
public:
// Pass address of this to ctor to skip deleting externally-owned memory.
static void DoNothing(void* /*opaque*/, void* /*aligned_pointer*/) {}
// Default: release via free(); two-argument form routes through free_ptr.
AlignedFreer() : free_(nullptr), opaque_ptr_(nullptr) {}
AlignedFreer(FreePtr free_ptr, void* opaque_ptr)
: free_(free_ptr), opaque_ptr_(opaque_ptr) {}
template <typename T>
void operator()(T* aligned_pointer) const {
// TODO(deymo): assert that we are using a POD type T.
FreeAlignedBytes(aligned_pointer, free_, opaque_ptr_);
}
private:
FreePtr free_;
void* opaque_ptr_;
};
// Unique pointer to single POD, or (if T is U[]) an array of POD. For non POD
// data use AlignedUniquePtr.
template <typename T>
using AlignedFreeUniquePtr = std::unique_ptr<T, AlignedFreer>;
// Allocate an aligned and uninitialized array of POD values as a unique_ptr.
// Upon destruction of the unique_ptr the aligned array will be freed.
// Allocates an aligned, UNINITIALIZED array of trivially-destructible values
// via the custom alloc/free pair; the returned unique_ptr frees (but does not
// destroy) the array on destruction.
template <typename T>
AlignedFreeUniquePtr<T[]> AllocateAligned(const size_t items, AllocPtr alloc,
                                          FreePtr free, void* opaque) {
  void* const bytes = AllocateAlignedBytes(items * sizeof(T), alloc, opaque);
  return AlignedFreeUniquePtr<T[]>(static_cast<T*>(bytes),
                                   AlignedFreer(free, opaque));
}
// Same as previous AllocateAligned(), using default allocate/free functions.
// Same as previous AllocateAligned(), using default allocate/free functions
// (signalled by nullptr alloc/free/opaque).
template <typename T>
AlignedFreeUniquePtr<T[]> AllocateAligned(const size_t items) {
  return AllocateAligned<T>(items, nullptr, nullptr, nullptr);
}
} // namespace hwy
#endif // HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_

250
third_party/highway/hwy/aligned_allocator_test.cc поставляемый Normal file
Просмотреть файл

@ -0,0 +1,250 @@
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/aligned_allocator.h"

#include <stddef.h>
#include <stdlib.h>  // malloc/free, used by FakeAllocator

#include <new>
#include <random>
#include <set>  // std::set, used by FakeAllocator
#include <vector>

#include "gtest/gtest.h"
#include "hwy/base.h"
namespace {
// Sample object that keeps track on an external counter of how many times was
// the explicit constructor and destructor called.
// Test helper occupying N bytes that reports construction/destruction through
// an external counter, so tests can verify whether ctors/dtors actually ran.
template <size_t N>
class SampleObject {
 public:
  static_assert(N > sizeof(int*), "SampleObject size too small.");

  // Default construction: tags the payload with 'a', tracks nothing.
  SampleObject() { data_[0] = 'a'; }

  // Counted construction: increments *counter now, decrements it again on
  // destruction. Tags the payload with 'b' to distinguish from default.
  explicit SampleObject(int* counter) : counter_(counter) {
    if (counter != nullptr) {
      ++(*counter);
    }
    data_[0] = 'b';
  }

  ~SampleObject() {
    if (counter_ != nullptr) {
      --(*counter_);
    }
  }

  int* counter_ = nullptr;
  char data_[N - sizeof(int*)];
};
// Test allocator that records every outstanding allocation in a set, so tests
// can assert that exactly the expected number of alloc/free pairs occurred
// and that no foreign pointer is ever freed.
class FakeAllocator {
 public:
  // Static AllocPtr and FreePtr members to be used with the aligned
  // allocator. These functions call the private non-static members, using
  // `opaque` to recover the FakeAllocator instance.
  static void* StaticAlloc(void* opaque, size_t bytes) {
    return reinterpret_cast<FakeAllocator*>(opaque)->Alloc(bytes);
  }
  static void StaticFree(void* opaque, void* memory) {
    return reinterpret_cast<FakeAllocator*>(opaque)->Free(memory);
  }

  // Returns the number of pending allocations to be freed.
  size_t PendingAllocs() { return allocs_.size(); }

 private:
  void* Alloc(size_t bytes) {
    void* ret = malloc(bytes);
    allocs_.insert(ret);
    return ret;
  }
  void Free(void* memory) {
    if (!memory) return;
    // Freeing a pointer this allocator never handed out is a test failure.
    EXPECT_NE(allocs_.end(), allocs_.find(memory));
    free(memory);
    allocs_.erase(memory);
  }

  // Live (not yet freed) allocations.
  std::set<void*> allocs_;
};
} // namespace
namespace hwy {
// Empty fixture; present only so the tests share a suite name.
class AlignedAllocatorTest : public testing::Test {};
// FreeAlignedBytes must accept nullptr (mirrors free(nullptr) semantics).
TEST(AlignedAllocatorTest, FreeNullptr) {
  // Calling free with a nullptr is always ok.
  FreeAlignedBytes(/*aligned_pointer=*/nullptr, /*free_ptr=*/nullptr,
                   /*opaque_ptr=*/nullptr);
}

// Default (nullptr alloc/free) allocation returns usable, aligned memory.
TEST(AlignedAllocatorTest, AllocDefaultPointers) {
  const size_t kSize = 7777;
  void* ptr = AllocateAlignedBytes(kSize, /*alloc_ptr=*/nullptr,
                                   /*opaque_ptr=*/nullptr);
  ASSERT_NE(nullptr, ptr);
  // Make sure the pointer is actually aligned.
  EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr) % kMaxVectorSize);
  char* p = static_cast<char*>(ptr);
  size_t ret = 0;
  for (size_t i = 0; i < kSize; i++) {
    // Performs a computation using p[] to prevent it being optimized away.
    p[i] = static_cast<char>(i & 0x7F);
    if (i) ret += p[i] * p[i - 1];
  }
  EXPECT_NE(0U, ret);
  FreeAlignedBytes(ptr, /*free_ptr=*/nullptr, /*opaque_ptr=*/nullptr);
}

// Null-initialized smart pointers must be destructible without freeing.
TEST(AlignedAllocatorTest, EmptyAlignedUniquePtr) {
  AlignedUniquePtr<SampleObject<32>> ptr(nullptr, AlignedDeleter());
  AlignedUniquePtr<SampleObject<32>[]> arr(nullptr, AlignedDeleter());
}

TEST(AlignedAllocatorTest, EmptyAlignedFreeUniquePtr) {
  AlignedFreeUniquePtr<SampleObject<32>> ptr(nullptr, AlignedFreer());
  AlignedFreeUniquePtr<SampleObject<32>[]> arr(nullptr, AlignedFreer());
}
// Allocations must be routed through the user-provided alloc/free pair and
// balance out to zero pending allocations.
TEST(AlignedAllocatorTest, CustomAlloc) {
  FakeAllocator fake_alloc;

  const size_t kSize = 7777;
  void* ptr =
      AllocateAlignedBytes(kSize, &FakeAllocator::StaticAlloc, &fake_alloc);
  ASSERT_NE(nullptr, ptr);
  // We should have only requested one alloc from the allocator.
  EXPECT_EQ(1U, fake_alloc.PendingAllocs());
  // Make sure the pointer is actually aligned.
  EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr) % kMaxVectorSize);
  FreeAlignedBytes(ptr, &FakeAllocator::StaticFree, &fake_alloc);
  EXPECT_EQ(0U, fake_alloc.PendingAllocs());
}

TEST(AlignedAllocatorTest, MakeUniqueAlignedDefaultConstructor) {
  {
    auto ptr = MakeUniqueAligned<SampleObject<24>>();
    // Default constructor sets the data_[0] to 'a'.
    EXPECT_EQ('a', ptr->data_[0]);
    EXPECT_EQ(nullptr, ptr->counter_);
  }
}

// The explicit-constructor path must run the ctor once and the dtor once.
TEST(AlignedAllocatorTest, MakeUniqueAligned) {
  int counter = 0;
  {
    // Creates the object, initializes it with the explicit constructor and
    // returns an unique_ptr to it.
    auto ptr = MakeUniqueAligned<SampleObject<24>>(&counter);
    EXPECT_EQ(1, counter);
    // Custom constructor sets the data_[0] to 'b'.
    EXPECT_EQ('b', ptr->data_[0]);
  }
  EXPECT_EQ(0, counter);
}
// Array creation must construct every element; destruction must destroy all.
TEST(AlignedAllocatorTest, MakeUniqueAlignedArray) {
  int counter = 0;
  {
    // Creates the array of objects and initializes them with the explicit
    // constructor.
    auto arr = MakeUniqueAlignedArray<SampleObject<24>>(7, &counter);
    EXPECT_EQ(7, counter);
    for (size_t i = 0; i < 7; i++) {
      // Custom constructor sets the data_[0] to 'b'.
      EXPECT_EQ('b', arr[i].data_[0]) << "Where i = " << i;
    }
  }
  EXPECT_EQ(0, counter);
}

TEST(AlignedAllocatorTest, AllocSingleInt) {
  auto ptr = AllocateAligned<uint32_t>(1);
  ASSERT_NE(nullptr, ptr.get());
  EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr.get()) % kMaxVectorSize);
  // Force delete of the unique_ptr now to check that it doesn't crash.
  ptr.reset(nullptr);
  EXPECT_EQ(nullptr, ptr.get());
}

TEST(AlignedAllocatorTest, AllocMultipleInt) {
  const size_t kSize = 7777;
  auto ptr = AllocateAligned<uint32_t>(kSize);
  ASSERT_NE(nullptr, ptr.get());
  EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr.get()) % kMaxVectorSize);
  // ptr[i] is actually (*ptr.get())[i] which will use the operator[] of the
  // underlying type chosen by AllocateAligned() for the std::unique_ptr.
  EXPECT_EQ(&(ptr[0]) + 1, &(ptr[1]));

  size_t ret = 0;
  for (size_t i = 0; i < kSize; i++) {
    // Performs a computation using ptr[] to prevent it being optimized away.
    ptr[i] = static_cast<uint32_t>(i);
    if (i) ret += ptr[i] * ptr[i - 1];
  }
  EXPECT_NE(0U, ret);
}
// AllocateAligned() returns raw storage: constructors/destructors must NOT
// run (unlike MakeUniqueAligned*).
TEST(AlignedAllocatorTest, AllocateAlignedObjectWithoutDestructor) {
  int counter = 0;
  {
    // This doesn't call the constructor.
    auto obj = AllocateAligned<SampleObject<24>>(1);
    obj[0].counter_ = &counter;
  }
  // Destroying the unique_ptr shouldn't have called the destructor of the
  // SampleObject<24>.
  EXPECT_EQ(0, counter);
}

TEST(AlignedAllocatorTest, MakeUniqueAlignedArrayWithCustomAlloc) {
  FakeAllocator fake_alloc;
  int counter = 0;
  {
    // Creates the array of objects and initializes them with the explicit
    // constructor.
    auto arr = MakeUniqueAlignedArrayWithAlloc<SampleObject<24>>(
        7, FakeAllocator::StaticAlloc, FakeAllocator::StaticFree, &fake_alloc,
        &counter);
    // An array should still only trigger a single allocation.
    EXPECT_EQ(1u, fake_alloc.PendingAllocs());
    EXPECT_EQ(7, counter);
    for (size_t i = 0; i < 7; i++) {
      // Custom constructor sets the data_[0] to 'b'.
      EXPECT_EQ('b', arr[i].data_[0]) << "Where i = " << i;
    }
  }
  EXPECT_EQ(0, counter);
  EXPECT_EQ(0u, fake_alloc.PendingAllocs());
}

TEST(AlignedAllocatorTest, DefaultInit) {
  // The test is whether this compiles. Default-init is useful for output
  // params and per-thread storage.
  std::vector<AlignedUniquePtr<int[]>> ptrs;
  std::vector<AlignedFreeUniquePtr<double[]>> free_ptrs;
  ptrs.resize(128);
  free_ptrs.resize(128);
  // The following is to prevent elision of the pointers.
  std::mt19937 rng(129);  // Emscripten lacks random_device.
  std::uniform_int_distribution<size_t> dist(0, 127);
  ptrs[dist(rng)] = MakeUniqueAlignedArray<int>(123);
  free_ptrs[dist(rng)] = AllocateAligned<double>(456);
  // "Use" pointer without resorting to printf. 0 == 0. Can't shift by 64.
  const auto addr1 = reinterpret_cast<uintptr_t>(ptrs[dist(rng)].get());
  const auto addr2 = reinterpret_cast<uintptr_t>(free_ptrs[dist(rng)].get());
  constexpr size_t kBits = sizeof(uintptr_t) * 8;
  EXPECT_EQ((addr1 >> (kBits - 1)) >> (kBits - 1),
            (addr2 >> (kBits - 1)) >> (kBits - 1));
}
} // namespace hwy

635
third_party/highway/hwy/base.h поставляемый Normal file
Просмотреть файл

@ -0,0 +1,635 @@
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef HIGHWAY_HWY_BASE_H_
#define HIGHWAY_HWY_BASE_H_
// For SIMD module implementations and their callers, target-independent.
#include <stddef.h>
#include <stdint.h>
#include <atomic>
#include <cfloat>
// Add to #if conditions to prevent IDE from graying out code.
#if (defined __CDT_PARSER__) || (defined __INTELLISENSE__) || \
(defined Q_CREATOR_RUN) || (defined(__CLANGD__))
#define HWY_IDE 1
#else
#define HWY_IDE 0
#endif
//------------------------------------------------------------------------------
// Detect compiler using predefined macros
// clang-cl defines _MSC_VER but doesn't behave like MSVC in other aspects like
// used in HWY_DIAGNOSTICS(). We include a check that we are not clang for that
// purpose.
#if defined(_MSC_VER) && !defined(__clang__)
#define HWY_COMPILER_MSVC _MSC_VER
#else
#define HWY_COMPILER_MSVC 0
#endif
#ifdef __INTEL_COMPILER
#define HWY_COMPILER_ICC __INTEL_COMPILER
#else
#define HWY_COMPILER_ICC 0
#endif
#ifdef __GNUC__
#define HWY_COMPILER_GCC (__GNUC__ * 100 + __GNUC_MINOR__)
#else
#define HWY_COMPILER_GCC 0
#endif
// Clang can masquerade as MSVC/GCC, in which case both are set.
#ifdef __clang__
#ifdef __APPLE__
// Apple LLVM version is unrelated to the actual Clang version, which we need
// for enabling workarounds. Use the presence of warning flags to deduce it.
// Adapted from https://github.com/simd-everywhere/simde/ simde-detect-clang.h.
#if __has_warning("-Wformat-insufficient-args")
#define HWY_COMPILER_CLANG 1200
#elif __has_warning("-Wimplicit-const-int-float-conversion")
#define HWY_COMPILER_CLANG 1100
#elif __has_warning("-Wmisleading-indentation")
#define HWY_COMPILER_CLANG 1000
#elif defined(__FILE_NAME__)
#define HWY_COMPILER_CLANG 900
#elif __has_warning("-Wextra-semi-stmt") || \
__has_builtin(__builtin_rotateleft32)
#define HWY_COMPILER_CLANG 800
#elif __has_warning("-Wc++98-compat-extra-semi")
#define HWY_COMPILER_CLANG 700
#else // Anything older than 7.0 is not recommended for Highway.
#define HWY_COMPILER_CLANG 600
#endif // __has_warning chain
#else // Non-Apple: normal version
#define HWY_COMPILER_CLANG (__clang_major__ * 100 + __clang_minor__)
#endif
#else // Not clang
#define HWY_COMPILER_CLANG 0
#endif
// More than one may be nonzero, but we want at least one.
#if !HWY_COMPILER_MSVC && !HWY_COMPILER_ICC && !HWY_COMPILER_GCC && \
!HWY_COMPILER_CLANG
#error "Unsupported compiler"
#endif
//------------------------------------------------------------------------------
// Compiler-specific definitions
#define HWY_STR_IMPL(macro) #macro
#define HWY_STR(macro) HWY_STR_IMPL(macro)
#if HWY_COMPILER_MSVC
#include <intrin.h>
#define HWY_RESTRICT __restrict
#define HWY_INLINE __forceinline
#define HWY_NOINLINE __declspec(noinline)
#define HWY_FLATTEN
#define HWY_NORETURN __declspec(noreturn)
#define HWY_LIKELY(expr) (expr)
#define HWY_UNLIKELY(expr) (expr)
#define HWY_PRAGMA(tokens) __pragma(tokens)
#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(warning(tokens))
#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(msc)
#define HWY_MAYBE_UNUSED
#define HWY_HAS_ASSUME_ALIGNED 0
#if (_MSC_VER >= 1700)
#define HWY_MUST_USE_RESULT _Check_return_
#else
#define HWY_MUST_USE_RESULT
#endif
#else
#define HWY_RESTRICT __restrict__
#define HWY_INLINE inline __attribute__((always_inline))
#define HWY_NOINLINE __attribute__((noinline))
#define HWY_FLATTEN __attribute__((flatten))
#define HWY_NORETURN __attribute__((noreturn))
#define HWY_LIKELY(expr) __builtin_expect(!!(expr), 1)
#define HWY_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
#define HWY_PRAGMA(tokens) _Pragma(#tokens)
#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(GCC diagnostic tokens)
#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(gcc)
// Encountered "attribute list cannot appear here" when using the C++17
// [[maybe_unused]], so only use the old style attribute for now.
#define HWY_MAYBE_UNUSED __attribute__((unused))
#define HWY_MUST_USE_RESULT __attribute__((warn_unused_result))
#endif // !HWY_COMPILER_MSVC
//------------------------------------------------------------------------------
// Builtin/attributes
#ifdef __has_builtin
#define HWY_HAS_BUILTIN(name) __has_builtin(name)
#else
#define HWY_HAS_BUILTIN(name) 0
#endif
#ifdef __has_attribute
#define HWY_HAS_ATTRIBUTE(name) __has_attribute(name)
#else
#define HWY_HAS_ATTRIBUTE(name) 0
#endif
// Enables error-checking of format strings.
#if HWY_HAS_ATTRIBUTE(__format__)
#define HWY_FORMAT(idx_fmt, idx_arg) \
__attribute__((__format__(__printf__, idx_fmt, idx_arg)))
#else
#define HWY_FORMAT(idx_fmt, idx_arg)
#endif
// Returns a void* pointer which the compiler then assumes is N-byte aligned.
// Example: float* HWY_RESTRICT aligned = (float*)HWY_ASSUME_ALIGNED(in, 32);
//
// The assignment semantics are required by GCC/Clang. ICC provides an in-place
// __assume_aligned, whereas MSVC's __assume appears unsuitable.
#if HWY_HAS_BUILTIN(__builtin_assume_aligned)
#define HWY_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align))
#else
#define HWY_ASSUME_ALIGNED(ptr, align) (ptr) /* not supported */
#endif
// Clang and GCC require attributes on each function into which SIMD intrinsics
// are inlined. Support both per-function annotation (HWY_ATTR) for lambdas and
// automatic annotation via pragmas.
#if HWY_COMPILER_CLANG
#define HWY_PUSH_ATTRIBUTES(targets_str) \
HWY_PRAGMA(clang attribute push(__attribute__((target(targets_str))), \
apply_to = function))
#define HWY_POP_ATTRIBUTES HWY_PRAGMA(clang attribute pop)
#elif HWY_COMPILER_GCC
#define HWY_PUSH_ATTRIBUTES(targets_str) \
HWY_PRAGMA(GCC push_options) HWY_PRAGMA(GCC target targets_str)
#define HWY_POP_ATTRIBUTES HWY_PRAGMA(GCC pop_options)
#else
#define HWY_PUSH_ATTRIBUTES(targets_str)
#define HWY_POP_ATTRIBUTES
#endif
//------------------------------------------------------------------------------
// Detect architecture using predefined macros
#if defined(__i386__) || defined(_M_IX86)
#define HWY_ARCH_X86_32 1
#else
#define HWY_ARCH_X86_32 0
#endif
#if defined(__x86_64__) || defined(_M_X64)
#define HWY_ARCH_X86_64 1
#else
#define HWY_ARCH_X86_64 0
#endif
#if HWY_ARCH_X86_32 || HWY_ARCH_X86_64
#define HWY_ARCH_X86 1
#else
#define HWY_ARCH_X86 0
#endif
#if defined(__powerpc64__) || defined(_M_PPC)
#define HWY_ARCH_PPC 1
#else
#define HWY_ARCH_PPC 0
#endif
#if defined(__ARM_ARCH_ISA_A64) || defined(__aarch64__) || defined(_M_ARM64)
#define HWY_ARCH_ARM_A64 1
#else
#define HWY_ARCH_ARM_A64 0
#endif
#if defined(__arm__) || defined(_M_ARM)
#define HWY_ARCH_ARM_V7 1
#else
#define HWY_ARCH_ARM_V7 0
#endif
#if HWY_ARCH_ARM_A64 && HWY_ARCH_ARM_V7
#error "Cannot have both A64 and V7"
#endif
#if HWY_ARCH_ARM_A64 || HWY_ARCH_ARM_V7
#define HWY_ARCH_ARM 1
#else
#define HWY_ARCH_ARM 0
#endif
#if defined(__EMSCRIPTEN__) || defined(__wasm__) || defined(__WASM__)
#define HWY_ARCH_WASM 1
#else
#define HWY_ARCH_WASM 0
#endif
#ifdef __riscv
#define HWY_ARCH_RVV 1
#else
#define HWY_ARCH_RVV 0
#endif
#if (HWY_ARCH_X86 + HWY_ARCH_PPC + HWY_ARCH_ARM + HWY_ARCH_WASM + \
HWY_ARCH_RVV) != 1
#error "Must detect exactly one platform"
#endif
//------------------------------------------------------------------------------
// Macros
#define HWY_API static HWY_INLINE HWY_FLATTEN HWY_MAYBE_UNUSED
#define HWY_CONCAT_IMPL(a, b) a##b
#define HWY_CONCAT(a, b) HWY_CONCAT_IMPL(a, b)
#define HWY_MIN(a, b) ((a) < (b) ? (a) : (b))
#define HWY_MAX(a, b) ((a) > (b) ? (a) : (b))
// Compile-time fence to prevent undesirable code reordering. On Clang x86, the
// typical asm volatile("" : : : "memory") has no effect, whereas atomic fence
// does, without generating code.
#if HWY_ARCH_X86
#define HWY_FENCE std::atomic_thread_fence(std::memory_order_acq_rel)
#else
// TODO(janwas): investigate alternatives. On ARM, the above generates barriers.
#define HWY_FENCE
#endif
// 4 instances of a given literal value, useful as input to LoadDup128.
#define HWY_REP4(literal) literal, literal, literal, literal
#define HWY_ABORT(format, ...) \
::hwy::Abort(__FILE__, __LINE__, format, ##__VA_ARGS__)
// Always enabled.
#define HWY_ASSERT(condition) \
do { \
if (!(condition)) { \
HWY_ABORT("Assert %s", #condition); \
} \
} while (0)
// Only for "debug" builds
#if !defined(NDEBUG) || defined(ADDRESS_SANITIZER) || \
defined(MEMORY_SANITIZER) || defined(THREAD_SANITIZER)
#define HWY_DASSERT(condition) HWY_ASSERT(condition)
#else
#define HWY_DASSERT(condition) \
do { \
} while (0)
#endif
namespace hwy {
//------------------------------------------------------------------------------
// Alignment
// Not guaranteed to be an upper bound, but the alignment established by
// aligned_allocator is HWY_MAX(HWY_ALIGNMENT, kMaxVectorSize).
#if HWY_ARCH_X86
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 64; // AVX-512
#define HWY_ALIGN_MAX alignas(64)
#elif HWY_ARCH_RVV
// Not actually an upper bound on the size, but this value prevents crossing a
// 4K boundary (relevant on Andes).
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 4096;
#define HWY_ALIGN_MAX alignas(8) // only elements need be aligned
#else
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
#define HWY_ALIGN_MAX alignas(16)
#endif
//------------------------------------------------------------------------------
// Lane types

// Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
// by concatenating base type and bits.

// RVV already has a builtin type and the GCC intrinsics require it.
#if HWY_ARCH_RVV && HWY_COMPILER_GCC
using float16_t = __fp16;
// Clang does not allow __fp16 arguments, but scalar.h requires LaneType
// arguments, so use a wrapper.
// TODO(janwas): replace with _Float16 when that is supported?
#else
// Opaque 16-bit storage for half-precision lanes; packed so arrays of it
// have no padding and match the in-memory lane layout.
#pragma pack(push, 1)
struct float16_t {
  uint16_t bits;
};
#pragma pack(pop)
#endif

using float32_t = float;
using float64_t = double;
//------------------------------------------------------------------------------
// Controlling overload resolution (SFINAE)

// Minimal reimplementation of std::enable_if (avoids a <type_traits>
// dependency). Only the true case defines `type`, so substitution fails
// when Condition is false.
template <bool Condition, class T>
struct EnableIfT {};
template <class T>
struct EnableIfT<true, T> {
  using type = T;
};

template <bool Condition, class T = void>
using EnableIf = typename EnableIfT<Condition, T>::type;
// Insert into template/function arguments to enable this overload only for
// vectors of AT MOST this many bits.
//
// Note that enabling for exactly 128 bits is unnecessary because a function can
// simply be overloaded with Vec128<T> and Full128<T> descriptor. Enabling for
// other sizes (e.g. 64 bit) can be achieved with Simd<T, 8 / sizeof(T)>.
#define HWY_IF_LE128(T, N) hwy::EnableIf<N * sizeof(T) <= 16>* = nullptr
#define HWY_IF_LE64(T, N) hwy::EnableIf<N * sizeof(T) <= 8>* = nullptr
#define HWY_IF_LE32(T, N) hwy::EnableIf<N * sizeof(T) <= 4>* = nullptr
#define HWY_IF_UNSIGNED(T) hwy::EnableIf<!IsSigned<T>()>* = nullptr
#define HWY_IF_SIGNED(T) \
hwy::EnableIf<IsSigned<T>() && !IsFloat<T>()>* = nullptr
#define HWY_IF_FLOAT(T) hwy::EnableIf<hwy::IsFloat<T>()>* = nullptr
#define HWY_IF_NOT_FLOAT(T) hwy::EnableIf<!hwy::IsFloat<T>()>* = nullptr
#define HWY_IF_LANE_SIZE(T, bytes) \
hwy::EnableIf<sizeof(T) == (bytes)>* = nullptr
#define HWY_IF_NOT_LANE_SIZE(T, bytes) \
hwy::EnableIf<sizeof(T) != (bytes)>* = nullptr
// Empty struct used as a size tag type.
template <size_t N>
struct SizeTag {};

//------------------------------------------------------------------------------
// Type traits

// True for floating-point T: the fractional part of 1.25 survives conversion
// to T only if T is a float type (integer types truncate it to 1).
template <typename T>
constexpr bool IsFloat() {
  return T(1.25) != T(1);
}

// True for signed integer and floating-point T: converting -1 yields a value
// below zero only when T can represent negatives (unsigned wraps to a large
// positive value).
template <typename T>
constexpr bool IsSigned() {
  return T(0) > T(-1);
}
// Largest/smallest representable integer values.
template <typename T>
constexpr T LimitsMax() {
  static_assert(!IsFloat<T>(), "Only for integer types");
  // Signed: 2^(bits-1) - 1. Unsigned: all bits set.
  return IsSigned<T>() ? T((1ULL << (sizeof(T) * 8 - 1)) - 1)
                       : static_cast<T>(~0ull);
}
template <typename T>
constexpr T LimitsMin() {
  static_assert(!IsFloat<T>(), "Only for integer types");
  // Signed: -(2^(bits-1)), computed without overflowing T. Unsigned: zero.
  return IsSigned<T>() ? T(-1) - LimitsMax<T>() : T(0);
}
// Largest/smallest representable value (integer or float). This naming avoids
// confusion with numeric_limits<float>::min() (the smallest positive value).
// Integer case delegates to LimitsMin/LimitsMax; float specializations return
// the most negative / largest finite value.
template <typename T>
constexpr T LowestValue() {
  return LimitsMin<T>();
}
template <>
constexpr float LowestValue<float>() {
  return -FLT_MAX;
}
template <>
constexpr double LowestValue<double>() {
  return -DBL_MAX;
}

template <typename T>
constexpr T HighestValue() {
  return LimitsMax<T>();
}
template <>
constexpr float HighestValue<float>() {
  return FLT_MAX;
}
template <>
constexpr double HighestValue<double>() {
  return DBL_MAX;
}
// Returns bitmask of the exponent field in IEEE binary32/64.
template <typename T>
constexpr T ExponentMask() {
  // Primary template must never be instantiated; only the uint32_t/uint64_t
  // specializations below are valid.
  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
  return 0;
}
template <>
constexpr uint32_t ExponentMask<uint32_t>() {
  return 0x7F800000;  // 8 exponent bits of binary32 (bits 30..23)
}
template <>
constexpr uint64_t ExponentMask<uint64_t>() {
  return 0x7FF0000000000000ULL;  // 11 exponent bits of binary64 (bits 62..52)
}
// Returns 1 << mantissa_bits as a floating-point number. All integers whose
// absolute value are less than this can be represented exactly.
template <typename T>
constexpr T MantissaEnd() {
  // Primary template must never be instantiated; use the specializations.
  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
  return 0;
}
template <>
constexpr float MantissaEnd<float>() {
  return 8388608.0f;  // 1 << 23
}
template <>
constexpr double MantissaEnd<double>() {
  // floating point literal with p52 requires C++17.
  return 4503599627370496.0;  // 1 << 52
}
//------------------------------------------------------------------------------
// Type relations

namespace detail {

// Compile-time lookup table mapping each lane type to: the same-size
// unsigned/signed (and, where defined, float) types, plus the same-category
// type with double (Wide) or half (Narrow) the bits. Members are simply
// omitted where no such type is provided (e.g. no Wide for 64-bit integers).
template <typename T>
struct Relations;
template <>
struct Relations<uint8_t> {
  using Unsigned = uint8_t;
  using Signed = int8_t;
  using Wide = uint16_t;
};
template <>
struct Relations<int8_t> {
  using Unsigned = uint8_t;
  using Signed = int8_t;
  using Wide = int16_t;
};
template <>
struct Relations<uint16_t> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Wide = uint32_t;
  using Narrow = uint8_t;
};
template <>
struct Relations<int16_t> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Wide = int32_t;
  using Narrow = int8_t;
};
template <>
struct Relations<uint32_t> {
  using Unsigned = uint32_t;
  using Signed = int32_t;
  using Float = float;
  using Wide = uint64_t;
  using Narrow = uint16_t;
};
template <>
struct Relations<int32_t> {
  using Unsigned = uint32_t;
  using Signed = int32_t;
  using Float = float;
  using Wide = int64_t;
  using Narrow = int16_t;
};
template <>
struct Relations<uint64_t> {
  using Unsigned = uint64_t;
  using Signed = int64_t;
  using Float = double;
  using Narrow = uint32_t;
};
template <>
struct Relations<int64_t> {
  using Unsigned = uint64_t;
  using Signed = int64_t;
  using Float = double;
  using Narrow = int32_t;
};
template <>
struct Relations<float16_t> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Float = float16_t;
  using Wide = float;
};
template <>
struct Relations<float> {
  using Unsigned = uint32_t;
  using Signed = int32_t;
  using Float = float;
  using Wide = double;
};
template <>
struct Relations<double> {
  using Unsigned = uint64_t;
  using Signed = int64_t;
  using Float = double;
  using Narrow = float;
};

}  // namespace detail
// Aliases for types of a different category, but the same size.
template <typename T>
using MakeUnsigned = typename detail::Relations<T>::Unsigned;
template <typename T>
using MakeSigned = typename detail::Relations<T>::Signed;
template <typename T>
using MakeFloat = typename detail::Relations<T>::Float;

// Aliases for types of the same category, but different size.
template <typename T>
using MakeWide = typename detail::Relations<T>::Wide;  // double the bits
template <typename T>
using MakeNarrow = typename detail::Relations<T>::Narrow;  // half the bits
//------------------------------------------------------------------------------
// Helper functions
// Integer ceiling division: the smallest q such that q * b >= a, for
// non-negative a and positive b.
// NOTE(review): a + (b - 1) can wrap for values near the type's maximum.
template <typename T1, typename T2>
constexpr inline T1 DivCeil(T1 a, T2 b) {
  return (a + (b - 1)) / b;
}
// Rounds `what` up to the next multiple of `align`.
// Works for any `align`; if a power of two, compiler emits ADD+AND.
constexpr inline size_t RoundUpTo(size_t what, size_t align) {
  // Equivalent to DivCeil(what, align) * align, written out inline.
  return ((what + align - 1) / align) * align;
}
// Returns the number of trailing zero bits, i.e. the index of the lowest set
// bit. Undefined results for x == 0.
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
#if HWY_COMPILER_MSVC
  unsigned long index;  // NOLINT
  _BitScanForward(&index, x);
  return index;
#else   // HWY_COMPILER_MSVC
  return static_cast<size_t>(__builtin_ctz(x));
#endif  // HWY_COMPILER_MSVC
}
// Returns the number of set bits in x. Uses the compiler builtin or hardware
// popcount where available, otherwise a portable 64-bit SWAR bit count.
HWY_API size_t PopCount(uint64_t x) {
#if HWY_COMPILER_CLANG || HWY_COMPILER_GCC
  return static_cast<size_t>(__builtin_popcountll(x));
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
  return _mm_popcnt_u64(x);
#elif HWY_COMPILER_MSVC
  // No 64-bit popcount on 32-bit MSVC: count each half separately.
  return _mm_popcnt_u32(uint32_t(x)) + _mm_popcnt_u32(uint32_t(x >> 32));
#else
  // Portable SWAR fallback. The masks must be 64-bit: the previous 32-bit
  // constants (0x55555555U etc.) only counted bits in the lower half of x,
  // giving wrong results for inputs with bits set above bit 31.
  x -= ((x >> 1) & 0x5555555555555555ULL);
  x = (((x >> 2) & 0x3333333333333333ULL) + (x & 0x3333333333333333ULL));
  x = (((x >> 4) + x) & 0x0F0F0F0F0F0F0F0FULL);
  x += (x >> 8);
  x += (x >> 16);
  x += (x >> 32);
  // Result fits in 7 bits (max 64 set bits).
  return static_cast<size_t>(x & 0x7Fu);
#endif
}
// Copies kBytes (a compile-time constant) from `from` to `to`.
// The source/destination must not overlap/alias.
template <size_t kBytes, typename From, typename To>
HWY_API void CopyBytes(const From* from, To* to) {
#if HWY_COMPILER_MSVC
  // Byte-wise loop; kBytes is a constant, so the compiler can unroll it.
  const uint8_t* HWY_RESTRICT from_bytes =
      reinterpret_cast<const uint8_t*>(from);
  uint8_t* HWY_RESTRICT to_bytes = reinterpret_cast<uint8_t*>(to);
  for (size_t i = 0; i < kBytes; ++i) {
    to_bytes[i] = from_bytes[i];
  }
#else
  // Avoids horrible codegen on Clang (series of PINSRB)
  __builtin_memcpy(to, from, kBytes);
#endif
}
// Prints file:line plus the printf-style message and terminates; declaration
// only (definition lives elsewhere in the library). HWY_FORMAT enables
// compile-time checking of the format string/arguments.
HWY_NORETURN void HWY_FORMAT(3, 4)
    Abort(const char* file, int line, const char* format, ...);
} // namespace hwy
#endif // HIGHWAY_HWY_BASE_H_

123
third_party/highway/hwy/base_test.cc поставляемый Normal file
Просмотреть файл

@ -0,0 +1,123 @@
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stddef.h>
#include <stdint.h>
#include <limits>
#include "hwy/base.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "base_test.cc"
#include "hwy/foreach_target.h"
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Verifies LimitsMin/LimitsMax against the known two's-complement bounds for
// every standard integer width.
HWY_NOINLINE void TestAllLimits() {
  HWY_ASSERT_EQ(uint8_t(0), LimitsMin<uint8_t>());
  HWY_ASSERT_EQ(uint16_t(0), LimitsMin<uint16_t>());
  HWY_ASSERT_EQ(uint32_t(0), LimitsMin<uint32_t>());
  HWY_ASSERT_EQ(uint64_t(0), LimitsMin<uint64_t>());

  HWY_ASSERT_EQ(int8_t(-128), LimitsMin<int8_t>());
  HWY_ASSERT_EQ(int16_t(-32768), LimitsMin<int16_t>());
  HWY_ASSERT_EQ(int32_t(0x80000000u), LimitsMin<int32_t>());
  HWY_ASSERT_EQ(int64_t(0x8000000000000000ull), LimitsMin<int64_t>());

  HWY_ASSERT_EQ(uint8_t(0xFF), LimitsMax<uint8_t>());
  HWY_ASSERT_EQ(uint16_t(0xFFFF), LimitsMax<uint16_t>());
  HWY_ASSERT_EQ(uint32_t(0xFFFFFFFFu), LimitsMax<uint32_t>());
  HWY_ASSERT_EQ(uint64_t(0xFFFFFFFFFFFFFFFFull), LimitsMax<uint64_t>());

  HWY_ASSERT_EQ(int8_t(0x7F), LimitsMax<int8_t>());
  HWY_ASSERT_EQ(int16_t(0x7FFF), LimitsMax<int16_t>());
  HWY_ASSERT_EQ(int32_t(0x7FFFFFFFu), LimitsMax<int32_t>());
  HWY_ASSERT_EQ(int64_t(0x7FFFFFFFFFFFFFFFull), LimitsMax<int64_t>());
}
// LowestValue/HighestValue must agree with the standard library for every
// lane type.
struct TestLowestHighest {
  template <class T>
  HWY_NOINLINE void operator()(T /*unused*/) const {
    HWY_ASSERT_EQ(std::numeric_limits<T>::lowest(), LowestValue<T>());
    HWY_ASSERT_EQ(std::numeric_limits<T>::max(), HighestValue<T>());
  }
};

HWY_NOINLINE void TestAllLowestHighest() { ForAllTypes(TestLowestHighest()); }

// The functors below verify IsFloat/IsSigned at compile time for each
// category of lane type (driven by the For*Types helpers in TestAllType).
struct TestIsUnsigned {
  template <class T>
  HWY_NOINLINE void operator()(T /*unused*/) const {
    static_assert(!IsFloat<T>(), "Expected !IsFloat");
    static_assert(!IsSigned<T>(), "Expected !IsSigned");
  }
};

struct TestIsSigned {
  template <class T>
  HWY_NOINLINE void operator()(T /*unused*/) const {
    static_assert(!IsFloat<T>(), "Expected !IsFloat");
    static_assert(IsSigned<T>(), "Expected IsSigned");
  }
};

struct TestIsFloat {
  template <class T>
  HWY_NOINLINE void operator()(T /*unused*/) const {
    static_assert(IsFloat<T>(), "Expected IsFloat");
    static_assert(IsSigned<T>(), "Floats are also considered signed");
  }
};

HWY_NOINLINE void TestAllType() {
  ForUnsignedTypes(TestIsUnsigned());
  ForSignedTypes(TestIsSigned());
  ForFloatTypes(TestIsFloat());
}
// Spot-checks PopCount against known bit counts, including values with bits
// set above the low 32 bits (exercising the full 64-bit path).
HWY_NOINLINE void TestAllPopCount() {
  HWY_ASSERT_EQ(size_t(0), PopCount(0u));
  HWY_ASSERT_EQ(size_t(1), PopCount(1u));
  HWY_ASSERT_EQ(size_t(1), PopCount(2u));
  HWY_ASSERT_EQ(size_t(2), PopCount(3u));
  HWY_ASSERT_EQ(size_t(1), PopCount(0x80000000u));
  HWY_ASSERT_EQ(size_t(31), PopCount(0x7FFFFFFFu));
  HWY_ASSERT_EQ(size_t(32), PopCount(0xFFFFFFFFu));

  HWY_ASSERT_EQ(size_t(1), PopCount(0x80000000ull));
  HWY_ASSERT_EQ(size_t(31), PopCount(0x7FFFFFFFull));
  HWY_ASSERT_EQ(size_t(32), PopCount(0xFFFFFFFFull));
  HWY_ASSERT_EQ(size_t(33), PopCount(0x10FFFFFFFFull));
  HWY_ASSERT_EQ(size_t(63), PopCount(0xFFFEFFFFFFFFFFFFull));
  HWY_ASSERT_EQ(size_t(64), PopCount(0xFFFFFFFFFFFFFFFFull));
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
// Instantiates and registers the exported TestAll* functions; presumably the
// macros (from the test utilities) run each once per enabled target — confirm
// against test_util-inl.h.
HWY_BEFORE_TEST(BaseTest);
HWY_EXPORT_AND_TEST_P(BaseTest, TestAllLimits);
HWY_EXPORT_AND_TEST_P(BaseTest, TestAllLowestHighest);
HWY_EXPORT_AND_TEST_P(BaseTest, TestAllType);
HWY_EXPORT_AND_TEST_P(BaseTest, TestAllPopCount);
}  // namespace hwy
#endif

88
third_party/highway/hwy/cache_control.h поставляемый Normal file
Просмотреть файл

@ -0,0 +1,88 @@
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef HIGHWAY_HWY_CACHE_CONTROL_H_
#define HIGHWAY_HWY_CACHE_CONTROL_H_
#include <stddef.h>
#include <stdint.h>
#include "hwy/base.h"
// Requires SSE2; fails to compile on 32-bit Clang 7 (see
// https://github.com/gperftools/gperftools/issues/946).
#if !defined(__SSE2__) || (HWY_COMPILER_CLANG && HWY_ARCH_X86_32)
#undef HWY_DISABLE_CACHE_CONTROL
#define HWY_DISABLE_CACHE_CONTROL
#endif
// intrin.h is sufficient on MSVC and already included by base.h.
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) && !HWY_COMPILER_MSVC
#include <emmintrin.h> // SSE2
#endif
namespace hwy {
// Even if N*sizeof(T) is smaller, Stream may write a multiple of this size.
#define HWY_STREAM_MULTIPLE 16

// The following functions may also require an attribute.
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) && !HWY_COMPILER_MSVC
// Enables SSE2 intrinsics in these functions even if the translation unit is
// not itself compiled with SSE2 flags.
#define HWY_ATTR_CACHE __attribute__((target("sse2")))
#else
#define HWY_ATTR_CACHE
#endif
// Delays subsequent loads until prior loads are visible. On Intel CPUs, also
// serves as a full fence (waits for all prior instructions to complete).
// No effect on non-x86.
HWY_INLINE HWY_ATTR_CACHE void LoadFence() {
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
  // SSE2 load fence; compiled out when cache control is disabled.
  _mm_lfence();
#endif
}
// Ensures previous weakly-ordered stores are visible. No effect on non-x86.
HWY_INLINE HWY_ATTR_CACHE void StoreFence() {
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
  // SSE store fence; pairs with the non-temporal stores used by Stream.
  _mm_sfence();
#endif
}
// Begins loading the cache line containing "p". Best-effort hint only; the
// fallback for unknown compilers/architectures is a no-op.
template <typename T>
HWY_INLINE HWY_ATTR_CACHE void Prefetch(const T* p) {
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
  _mm_prefetch(reinterpret_cast<const char*>(p), _MM_HINT_T0);
#elif HWY_COMPILER_GCC || HWY_COMPILER_CLANG
  // Hint=0 (NTA) behavior differs, but skipping outer caches is probably not
  // desirable, so use the default 3 (keep in caches).
  __builtin_prefetch(p, /*write=*/0, /*hint=*/3);
#else
  (void)p;  // silence unused-parameter warnings
#endif
}
// Invalidates and flushes the cache line containing "p". No effect on non-x86.
HWY_INLINE HWY_ATTR_CACHE void FlushCacheline(const void* p) {
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
  _mm_clflush(p);
#else
  (void)p;  // not supported on this target; silence unused-parameter warnings
#endif
}
} // namespace hwy
#endif // HIGHWAY_HWY_CACHE_CONTROL_H_

243
third_party/highway/hwy/examples/benchmark.cc поставляемый Normal file
Просмотреть файл

@ -0,0 +1,243 @@
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "hwy/examples/benchmark.cc"
#include "hwy/foreach_target.h"
#include <stddef.h>
#include <stdio.h>
#include <memory>
#include <numeric> // iota
#include "hwy/aligned_allocator.h"
#include "hwy/highway.h"
#include "hwy/nanobenchmark.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// These templates are not found via ADL.
#if HWY_TARGET != HWY_SCALAR
using hwy::HWY_NAMESPACE::CombineShiftRightBytes;
#endif
// Base class for the benchmarks below: owns one aligned allocation holding
// two consecutive float arrays (a_ and b_), both initialized via iota.
class TwoArray {
 public:
  // Passed to ctor as a value NOT known to the compiler. Must be a multiple of
  // the vector lane count * 8.
  static size_t NumItems() { return 3456; }

  explicit TwoArray(const size_t num_items)
      : a_(AllocateAligned<float>(num_items * 2)), b_(a_.get() + num_items) {
    // Evaluates to 1, but the compiler cannot prove that.
    const float start = num_items / NumItems();
    std::iota(b_, b_ + num_items, start);
    std::iota(a_.get(), a_.get() + num_items, start);
  }

 protected:
  AlignedFreeUniquePtr<float[]> a_;  // first half of the allocation
  float* b_;                         // points at the second half of a_
};
// Measures durations, verifies results, prints timings.
// `Benchmark` must provide: NumItems(), a (size_t) constructor,
// operator()(FuncInput) and Verify(size_t).
template <class Benchmark>
void RunBenchmark(const char* caption) {
  printf("%10s: ", caption);
  const size_t kNumInputs = 1;
  // Multiplying by Unpredictable1() prevents constant-folding of the size.
  const size_t num_items = Benchmark::NumItems() * Unpredictable1();
  const FuncInput inputs[kNumInputs] = {num_items};
  Result results[kNumInputs];

  Benchmark benchmark(num_items);

  // Measurement parameters: up to 7 evaluations, stop early once relative
  // variability drops below 0.2%.
  Params p;
  p.verbose = false;
  p.max_evals = 7;
  p.target_rel_mad = 0.002;
  const size_t num_results = MeasureClosure(
      [&benchmark](const FuncInput input) { return benchmark(input); }, inputs,
      kNumInputs, results, p);
  if (num_results != kNumInputs) {
    fprintf(stderr, "MeasureClosure failed.\n");
  }

  benchmark.Verify(num_items);

  for (size_t i = 0; i < num_results; ++i) {
    // ticks / input size = cost per item; variability scales it to an
    // absolute deviation for the "+/-" column.
    const double cycles_per_item = results[i].ticks / results[i].input;
    const double mad = results[i].variability * cycles_per_item;
    printf("%6zu: %6.3f (+/- %5.3f)\n", results[i].input, cycles_per_item, mad);
  }
}
// Minimal demo: computes F(x) = 2*x^2 over a small aligned array and prints
// one sample value.
void Intro() {
  HWY_ALIGN const float in[16] = {1, 2, 3, 4, 5, 6};
  HWY_ALIGN float out[16];

  HWY_FULL(float) d;  // widest vector available on this target
  for (size_t i = 0; i < 16; i += Lanes(d)) {
    const auto v = Load(d, in + i);  // aligned load
    auto doubled_sq = v * v;
    doubled_sq += doubled_sq;  // non-const vectors may be updated in place
    Store(doubled_sq, d, out + i);
  }
  printf("\nF(x)->2*x^2, F(%.0f) = %.1f\n", in[2], out[2]);
}
// BEGINNER: dot product
// 0.4 cyc/float = bronze, 0.25 = silver, 0.15 = gold!
class BenchmarkDot : public TwoArray {
 public:
  // dot_ starts at -1 so Verify can detect it was called before operator().
  explicit BenchmarkDot(size_t num_items) : TwoArray(num_items), dot_{-1.0f} {}

  // Returns (and caches in dot_) the dot product of a_ and b_.
  FuncOutput operator()(const size_t num_items) {
    HWY_FULL(float) d;
    const size_t N = Lanes(d);
    using V = decltype(Zero(d));
    constexpr int unroll = 8;
    // Compiler doesn't make independent sum* accumulators, so unroll manually.
    // Some older compilers might not be able to fit the 8 arrays in registers,
    // so manual unrolling can be helpful if you run into this issue.
    // 2 FMA ports * 4 cycle latency = 8x unrolled.
    V sum[unroll];
    for (int i = 0; i < unroll; ++i) {
      sum[i] = Zero(d);
    }
    const float* const HWY_RESTRICT pa = &a_[0];
    const float* const HWY_RESTRICT pb = b_;
    // Each outer iteration consumes unroll * N items; num_items is a multiple
    // of the lane count * 8 (see TwoArray::NumItems).
    for (size_t i = 0; i < num_items; i += unroll * N) {
      for (int j = 0; j < unroll; ++j) {
        const auto a = Load(d, pa + i + j * N);
        const auto b = Load(d, pb + i + j * N);
        sum[j] = MulAdd(a, b, sum[j]);
      }
    }
    // Reduction tree: sum of all accumulators by pairs into sum[0], then the
    // lanes.
    for (int power = 1; power < unroll; power *= 2) {
      for (int i = 0; i < unroll; i += 2 * power) {
        sum[i] += sum[i + power];
      }
    }
    return dot_ = GetLane(SumOfLanes(sum[0]));
  }

  // Recomputes the dot product with a scalar reference and aborts on mismatch.
  void Verify(size_t num_items) {
    if (dot_ == -1.0f) {
      fprintf(stderr, "Dot: must call Verify after benchmark");
      abort();
    }

    const float expected =
        std::inner_product(a_.get(), a_.get() + num_items, b_, 0.0f);
    const float rel_err = std::abs(expected - dot_) / expected;
    if (rel_err > 1.1E-6f) {
      fprintf(stderr, "Dot: expected %e actual %e (%e)\n", expected, dot_,
              rel_err);
      abort();
    }
  }

 private:
  float dot_;  // result cache for Verify
};
// INTERMEDIATE: delta coding
// 1.0 cycles/float = bronze, 0.7 = silver, 0.4 = gold!
struct BenchmarkDelta : public TwoArray {
  explicit BenchmarkDelta(size_t num_items) : TwoArray(num_items) {}

  // Writes b_[i] = a_[i] - a_[i-1] (with b_[0] = a_[0]); returns the last
  // output so the computation is observable.
  FuncOutput operator()(const size_t num_items) const {
#if HWY_TARGET == HWY_SCALAR
    // Plain scalar loop.
    b_[0] = a_[0];
    for (size_t i = 1; i < num_items; ++i) {
      b_[i] = a_[i] - a_[i - 1];
    }
#elif HWY_CAP_GE256
    // Larger vectors are split into 128-bit blocks, easiest to use the
    // unaligned load support to shift between them.
    const HWY_FULL(float) df;
    const size_t N = Lanes(df);
    size_t i;
    // Scalar prologue: the first vector's worth of outputs.
    b_[0] = a_[0];
    for (i = 1; i < N; ++i) {
      b_[i] = a_[i] - a_[i - 1];
    }
    // Predecessors come from an unaligned load offset by one lane.
    for (; i < num_items; i += N) {
      const auto a = Load(df, &a_[i]);
      const auto shifted = LoadU(df, &a_[i - 1]);
      Store(a - shifted, df, &b_[i]);
    }
#else  // 128-bit
    // Slightly better than unaligned loads
    const HWY_CAPPED(float, 4) df;
    const size_t N = Lanes(df);
    size_t i;
    b_[0] = a_[0];
    for (i = 1; i < N; ++i) {
      b_[i] = a_[i] - a_[i - 1];
    }
    // Keep the previous vector; combining it with the current one yields the
    // lane-shifted predecessors without an unaligned load.
    auto prev = Load(df, &a_[0]);
    for (; i < num_items; i += Lanes(df)) {
      const auto a = Load(df, &a_[i]);
      const auto shifted = CombineShiftRightLanes<3>(a, prev);
      prev = a;
      Store(a - shifted, df, &b_[i]);
    }
#endif
    return b_[num_items - 1];
  }

  // Scalar re-check. Note: only prints on mismatch, does not abort.
  void Verify(size_t num_items) {
    for (size_t i = 0; i < num_items; ++i) {
      const float expected = (i == 0) ? a_[0] : a_[i] - a_[i - 1];
      const float err = std::abs(expected - b_[i]);
      if (err > 1E-6f) {
        fprintf(stderr, "Delta: expected %e, actual %e\n", expected, b_[i]);
      }
    }
  }
};
// Per-target entry point: runs the intro demo plus both benchmarks and
// prints a banner naming the current target.
void RunBenchmarks() {
  Intro();
  printf("------------------------ %s\n", TargetName(HWY_TARGET));
  RunBenchmark<BenchmarkDot>("dot");
  RunBenchmark<BenchmarkDelta>("delta");
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
HWY_EXPORT(RunBenchmarks);

// Runs the benchmarks once for every target both compiled in and supported
// by this CPU.
void Run() {
  for (uint32_t target : SupportedAndGeneratedTargets()) {
    // Restrict dynamic dispatch to `target` so each codepath is exercised.
    SetSupportedTargetsForTest(target);
    HWY_DYNAMIC_DISPATCH(RunBenchmarks)();
  }
  SetSupportedTargetsForTest(0);  // Reset the mask afterwards.
}
}  // namespace hwy

int main(int /*argc*/, char** /*argv*/) {
  hwy::Run();
  return 0;
}
#endif  // HWY_ONCE

62
third_party/highway/hwy/examples/skeleton-inl.h поставляемый Normal file
Просмотреть файл

@ -0,0 +1,62 @@
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Demo of functions that might be called from multiple SIMD modules (either
// other -inl.h files, or a .cc file between begin/end_target-inl). This is
// optional - all SIMD code can reside in .cc files. However, this allows
// splitting code into different files while still inlining instead of requiring
// calling through function pointers.
// Include guard (still compiled once per target)
#if defined(HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_) == defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
#undef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
#else
#define HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
#endif
// It is fine to #include normal or *-inl headers.
#include <stddef.h>
#include "hwy/highway.h"
HWY_BEFORE_NAMESPACE();
namespace skeleton {
namespace HWY_NAMESPACE {
using namespace hwy::HWY_NAMESPACE;
// Example of a type-agnostic (caller-specified lane type) and width-agnostic
// (uses best available instruction set) function in a header.
//
// Computes x_array[i] = mul_array[i] * x_array[i] + add_array[i] for i < size.
// size is assumed to be a multiple of Lanes(d) (caller pads the arrays).
template <class D, typename T>
HWY_MAYBE_UNUSED void MulAddLoop(const D d, const T* HWY_RESTRICT mul_array,
                                 const T* HWY_RESTRICT add_array,
                                 const size_t size, T* HWY_RESTRICT x_array) {
  for (size_t idx = 0; idx < size; idx += Lanes(d)) {
    const auto multipliers = Load(d, mul_array + idx);
    const auto addends = Load(d, add_array + idx);
    const auto updated = MulAdd(multipliers, Load(d, x_array + idx), addends);
    Store(updated, d, x_array + idx);
  }
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace skeleton
HWY_AFTER_NAMESPACE();
#endif // include guard

103
third_party/highway/hwy/examples/skeleton.cc поставляемый Normal file
Просмотреть файл

@ -0,0 +1,103 @@
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/examples/skeleton.h"
#include <assert.h>
#include <stdio.h>
// First undef to prevent error when re-included.
#undef HWY_TARGET_INCLUDE
// For runtime dispatch, specify the name of the current file (unfortunately
// __FILE__ is not reliable) so that foreach_target.h can re-include it.
#define HWY_TARGET_INCLUDE "hwy/examples/skeleton.cc"
// Generates code for each enabled target by re-including this source file.
#include "hwy/foreach_target.h"
#include "hwy/highway.h"
// Optional, can instead add HWY_ATTR to all functions.
HWY_BEFORE_NAMESPACE();
namespace skeleton {
namespace HWY_NAMESPACE {
// Highway ops reside here; ADL does not find templates nor builtins.
using namespace hwy::HWY_NAMESPACE;
// Computes log2 by converting to a vector of floats. Compiled once per target.
// Processes exactly one vector's worth of bytes starting at `values`.
template <class DF>
HWY_NOINLINE void OneFloorLog2(const DF df, const uint8_t* HWY_RESTRICT values,
                               uint8_t* HWY_RESTRICT log2) {
  // Type tags for converting to other element types (Rebind = same count).
  const Rebind<int32_t, DF> d32;
  const Rebind<uint8_t, DF> d8;

  const auto u8 = Load(d8, values);
  // Round-trip through float: the biased binary32 exponent (bits >> 23)
  // minus the bias 127 is floor(log2(x)) for the nonzero inputs used here.
  const auto bits = BitCast(d32, ConvertTo(df, PromoteTo(d32, u8)));
  const auto exponent = ShiftRight<23>(bits) - Set(d32, 127);
  Store(DemoteTo(d8, exponent), d8, log2);
}
// Demonstrates selecting a per-target codepath via a capability macro.
HWY_NOINLINE void CodepathDemo() {
  // Highway is portable by default, but capability macros (or
  // #if HWY_TARGET == HWY_SSE4) allow target-specific branches:
#if HWY_CAP_INTEGER64
  const char* msg = "Has int64";
#else
  const char* msg = "No int64";
#endif
  printf("Target %s: %s\n", hwy::TargetName(HWY_TARGET), msg);
}
// Computes floor(log2(values[i])) for i < count into `log2`.
// Both arrays must be padded to a multiple of the vector size.
HWY_NOINLINE void FloorLog2(const uint8_t* HWY_RESTRICT values, size_t count,
                            uint8_t* HWY_RESTRICT log2) {
  CodepathDemo();

  // Second argument is necessary on RVV until it supports fractional lengths.
  HWY_FULL(float, 4) df;

  // Caller padded memory to a multiple of Lanes(). If that is not possible,
  // we could use blended or masked stores, see README.md.
  const size_t N = Lanes(df);
  for (size_t i = 0; i < count; i += N) {
    OneFloorLog2(df, values + i, log2 + i);
  }
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace skeleton
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace skeleton {

// This macro declares a static array used for dynamic dispatch; it resides in
// the same outer namespace that contains FloorLog2.
HWY_EXPORT(FloorLog2);

// This function is optional and only needed in the case of exposing it in the
// header file. Otherwise using HWY_DYNAMIC_DISPATCH(FloorLog2) in this module
// is equivalent to inlining this function.
void CallFloorLog2(const uint8_t* HWY_RESTRICT in, const size_t count,
                   uint8_t* HWY_RESTRICT out) {
  return HWY_DYNAMIC_DISPATCH(FloorLog2)(in, count, out);
}

// Optional: anything to compile only once, e.g. non-SIMD implementations of
// public functions provided by this module, can go inside #if HWY_ONCE.

}  // namespace skeleton
#endif  // HWY_ONCE

35
third_party/highway/hwy/examples/skeleton.h поставляемый Normal file
Просмотреть файл

@ -0,0 +1,35 @@
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Demo interface to target-specific code in skeleton.cc
// Normal header with include guard and namespace.
#ifndef HIGHWAY_HWY_EXAMPLES_SKELETON_H_
#define HIGHWAY_HWY_EXAMPLES_SKELETON_H_
#include <stddef.h>
// Platform-specific definitions used for declaring an interface, independent of
// the SIMD instruction set.
#include "hwy/base.h" // HWY_RESTRICT
namespace skeleton {
// Computes base-2 logarithm by converting to float. Supports dynamic dispatch.
void CallFloorLog2(const uint8_t* HWY_RESTRICT in, const size_t count,
uint8_t* HWY_RESTRICT out);
} // namespace skeleton
#endif // HIGHWAY_HWY_EXAMPLES_SKELETON_H_

104
third_party/highway/hwy/examples/skeleton_test.cc поставляемый Normal file
Просмотреть файл

@ -0,0 +1,104 @@
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Example of unit test for the "skeleton" library.
#include "hwy/examples/skeleton.h"
#include <stdio.h>
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "examples/skeleton_test.cc"
#include "hwy/foreach_target.h"
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
// Optional: factor out parts of the implementation into *-inl.h
#include "hwy/examples/skeleton-inl.h"
HWY_BEFORE_NAMESPACE();
namespace skeleton {
namespace HWY_NAMESPACE {
using namespace hwy::HWY_NAMESPACE;
// Calls function defined in skeleton.cc.
struct TestFloorLog2 {
  template <class T, class DF>
  HWY_NOINLINE void operator()(T /*unused*/, DF df) {
    const size_t count = 5 * Lanes(df);
    auto in = hwy::AllocateAligned<uint8_t>(count);
    auto expected = hwy::AllocateAligned<uint8_t>(count);

    hwy::RandomState rng;
    for (size_t i = 0; i < count; ++i) {
      // Random exponent in [0, 7]; the input is the exact power of two, so
      // FloorLog2 must recover the exponent.
      expected[i] = Random32(&rng) & 7;
      in[i] = static_cast<uint8_t>(1u << expected[i]);
    }

    auto out = hwy::AllocateAligned<uint8_t>(count);
    CallFloorLog2(in.get(), count, out.get());
    int sum = 0;
    for (size_t i = 0; i < count; ++i) {
      HWY_ASSERT_EQ(expected[i], out[i]);
      sum += out[i];
    }
    // Keep `sum` observable so the checking loop is not optimized away.
    hwy::PreventElision(sum);
  }
};

HWY_NOINLINE void TestAllFloorLog2() {
  ForFullVectors<TestFloorLog2>()(float());
}
// Calls function defined in skeleton-inl.h.
struct TestSumMulAdd {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    hwy::RandomState rng;
    const size_t count = 4096;
    // MulAddLoop requires the size to be a multiple of the lane count.
    EXPECT_TRUE(count % Lanes(d) == 0);
    auto mul = hwy::AllocateAligned<T>(count);
    auto x = hwy::AllocateAligned<T>(count);
    auto add = hwy::AllocateAligned<T>(count);
    // Small random values so T=float accumulates exactly.
    for (size_t i = 0; i < count; ++i) {
      mul[i] = static_cast<T>(Random32(&rng) & 0xF);
      x[i] = static_cast<T>(Random32(&rng) & 0xFF);
      add[i] = static_cast<T>(Random32(&rng) & 0xFF);
    }
    // Scalar reference sum, accumulated in double.
    double expected_sum = 0.0;
    for (size_t i = 0; i < count; ++i) {
      expected_sum += mul[i] * x[i] + add[i];
    }

    MulAddLoop(d, mul.get(), add.get(), count, x.get());
    // NOTE(review): only the RNG-derived reference sum is asserted against a
    // hard-coded constant; the outputs MulAddLoop wrote into `x` are never
    // compared to the reference here — confirm whether that is intended.
    HWY_ASSERT_EQ(4344240.0, expected_sum);
  }
};

HWY_NOINLINE void TestAllSumMulAdd() {
  ForFloatTypes(ForPartialVectors<TestSumMulAdd>());
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace skeleton
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace skeleton {
HWY_BEFORE_TEST(SkeletonTest);
HWY_EXPORT_AND_TEST_P(SkeletonTest, TestAllFloorLog2);
HWY_EXPORT_AND_TEST_P(SkeletonTest, TestAllSumMulAdd);
} // namespace skeleton
#endif

161
third_party/highway/hwy/foreach_target.h поставляемый Normal file
Просмотреть файл

@ -0,0 +1,161 @@
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef HIGHWAY_HWY_FOREACH_TARGET_H_
#define HIGHWAY_HWY_FOREACH_TARGET_H_

// Re-includes the translation unit zero or more times to compile for any
// targets except HWY_STATIC_TARGET. Defines unique HWY_TARGET each time so that
// highway.h defines the corresponding macro/namespace.

#include "hwy/targets.h"

// *_inl.h may include other headers, which requires include guards to prevent
// repeated inclusion. The guards must be reset after compiling each target, so
// the header is again visible. This is done by flipping HWY_TARGET_TOGGLE,
// defining it if undefined and vice versa. This macro is initially undefined
// so that IDEs don't gray out the contents of each header.
#ifdef HWY_TARGET_TOGGLE
#error "This macro must not be defined outside foreach_target.h"
#endif

#ifdef HWY_HIGHWAY_INCLUDED  // highway.h include guard
// Trigger fixup at the bottom of this header.
#define HWY_ALREADY_INCLUDED

// The next highway.h must re-include set_macros-inl.h because the first
// highway.h chose the static target instead of what we will set below.
#undef HWY_SET_MACROS_PER_TARGET
#endif

// Disable HWY_EXPORT in user code until we have generated all targets. Note
// that a subsequent highway.h will not override this definition.
#undef HWY_ONCE
#define HWY_ONCE (0 || HWY_IDE)

// Avoid warnings on #include HWY_TARGET_INCLUDE by hiding them from the IDE;
// also skip if only 1 target defined (no re-inclusion will be necessary).
#if !HWY_IDE && (HWY_TARGETS != HWY_STATIC_TARGET)

#if !defined(HWY_TARGET_INCLUDE)
#error ">1 target enabled => define HWY_TARGET_INCLUDE before foreach_target.h"
#endif

// One section per dynamic target: set HWY_TARGET, re-include the translation
// unit, then flip HWY_TARGET_TOGGLE so *-inl.h include guards reset for the
// next pass.
#if (HWY_TARGETS & HWY_SCALAR) && (HWY_STATIC_TARGET != HWY_SCALAR)
#undef HWY_TARGET
#define HWY_TARGET HWY_SCALAR
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif

#if (HWY_TARGETS & HWY_NEON) && (HWY_STATIC_TARGET != HWY_NEON)
#undef HWY_TARGET
#define HWY_TARGET HWY_NEON
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif

#if (HWY_TARGETS & HWY_SSE4) && (HWY_STATIC_TARGET != HWY_SSE4)
#undef HWY_TARGET
#define HWY_TARGET HWY_SSE4
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif

#if (HWY_TARGETS & HWY_AVX2) && (HWY_STATIC_TARGET != HWY_AVX2)
#undef HWY_TARGET
#define HWY_TARGET HWY_AVX2
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif

#if (HWY_TARGETS & HWY_AVX3) && (HWY_STATIC_TARGET != HWY_AVX3)
#undef HWY_TARGET
#define HWY_TARGET HWY_AVX3
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif

#if (HWY_TARGETS & HWY_WASM) && (HWY_STATIC_TARGET != HWY_WASM)
#undef HWY_TARGET
#define HWY_TARGET HWY_WASM
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif

#if (HWY_TARGETS & HWY_PPC8) && (HWY_STATIC_TARGET != HWY_PPC8)
#undef HWY_TARGET
#define HWY_TARGET HWY_PPC8
#include HWY_TARGET_INCLUDE
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif
#endif

#endif  // !HWY_IDE && (HWY_TARGETS != HWY_STATIC_TARGET)

// Now that all but the static target have been generated, re-enable HWY_EXPORT.
#undef HWY_ONCE
#define HWY_ONCE 1

// If we re-include once per enabled target, the translation unit's
// implementation would have to be skipped via #if to avoid redefining symbols.
// We instead skip the re-include for HWY_STATIC_TARGET, and generate its
// implementation when resuming compilation of the translation unit.
#undef HWY_TARGET
#define HWY_TARGET HWY_STATIC_TARGET

#ifdef HWY_ALREADY_INCLUDED
// Revert the previous toggle to prevent redefinitions for the static target.
#ifdef HWY_TARGET_TOGGLE
#undef HWY_TARGET_TOGGLE
#else
#define HWY_TARGET_TOGGLE
#endif

// Force re-inclusion of set_macros-inl.h now that HWY_TARGET is restored.
#ifdef HWY_SET_MACROS_PER_TARGET
#undef HWY_SET_MACROS_PER_TARGET
#else
#define HWY_SET_MACROS_PER_TARGET
#endif
#endif

#endif  // HIGHWAY_HWY_FOREACH_TARGET_H_

337
third_party/highway/hwy/highway.h поставляемый Normal file
Просмотреть файл

@ -0,0 +1,337 @@
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This include guard is checked by foreach_target, so avoid the usual _H_
// suffix to prevent copybara from renaming it. NOTE: ops/*-inl.h are included
// after/outside this include guard.
#ifndef HWY_HIGHWAY_INCLUDED
#define HWY_HIGHWAY_INCLUDED
// Main header required before using vector types.
#include "hwy/base.h"
#include "hwy/targets.h"
namespace hwy {
// API version (https://semver.org/)
#define HWY_MAJOR 0
#define HWY_MINOR 12
#define HWY_PATCH 0
//------------------------------------------------------------------------------
// Shorthand for descriptors (defined in shared-inl.h) used to select overloads.
// Because Highway functions take descriptor and/or vector arguments, ADL finds
// these functions without requiring users in project::HWY_NAMESPACE to
// qualify Highway functions with hwy::HWY_NAMESPACE. However, ADL rules for
// templates require `using hwy::HWY_NAMESPACE::ShiftLeft;` etc. declarations.
// HWY_FULL(T[,LMUL=1]) is a native vector/group. LMUL is the number of
// registers in the group, and is ignored on targets that do not support groups.
#define HWY_FULL1(T) hwy::HWY_NAMESPACE::Simd<T, HWY_LANES(T)>
#define HWY_3TH_ARG(arg1, arg2, arg3, ...) arg3
// Workaround for MSVC grouping __VA_ARGS__ into a single argument
#define HWY_FULL_RECOMPOSER(args_with_paren) HWY_3TH_ARG args_with_paren
// Trailing comma avoids -pedantic false alarm
#define HWY_CHOOSE_FULL(...) \
  HWY_FULL_RECOMPOSER((__VA_ARGS__, HWY_FULL2, HWY_FULL1, ))
// Dispatches to HWY_FULL1 or HWY_FULL2 based on the argument count.
#define HWY_FULL(...) HWY_CHOOSE_FULL(__VA_ARGS__())(__VA_ARGS__)

// Vector of up to MAX_N lanes.
#define HWY_CAPPED(T, MAX_N) \
  hwy::HWY_NAMESPACE::Simd<T, HWY_MIN(MAX_N, HWY_LANES(T))>

//------------------------------------------------------------------------------
// Export user functions for static/dynamic dispatch

// Evaluates to 0 inside a translation unit if it is generating anything but the
// static target (the last one if multiple targets are enabled). Used to prevent
// redefinitions of HWY_EXPORT. Unless foreach_target.h is included, we only
// compile once anyway, so this is 1 unless it is or has been included.
#ifndef HWY_ONCE
#define HWY_ONCE 1
#endif

// HWY_STATIC_DISPATCH(FUNC_NAME) is the namespace-qualified FUNC_NAME for
// HWY_STATIC_TARGET (the only defined namespace unless HWY_TARGET_INCLUDE is
// defined), and can be used to deduce the return type of Choose*.
#if HWY_STATIC_TARGET == HWY_SCALAR
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SCALAR::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_RVV
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_RVV::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_WASM
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_NEON
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_NEON::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_PPC8
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC8::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SSE4
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSE4::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_AVX2
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX2::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_AVX3
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3::FUNC_NAME
#endif
// Dynamic dispatch declarations.
template <typename RetType, typename... Args>
struct FunctionCache {
 public:
  // Function type matching the exported function's signature.
  typedef RetType(FunctionType)(Args...);

  // A template function that when instantiated has the same signature as the
  // function being called. This function initializes the global cache of the
  // current supported targets mask used for dynamic dispatch and calls the
  // appropriate function. Since this mask used for dynamic dispatch is a
  // global cache, all the highway exported functions, even those exposed by
  // different modules, will be initialized after this function runs for any one
  // of those exported functions.
  template <FunctionType* const table[]>
  static RetType ChooseAndCall(Args... args) {
    // If we are running here it means we need to update the chosen target.
    chosen_target.Update();
    return (table[chosen_target.GetIndex()])(args...);
  }
};
// Factory function only used to infer the template parameters RetType and Args
// from a function passed to the factory; its return value is used only via
// decltype, so it is never called at runtime.
template <typename RetType, typename... Args>
FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
  return FunctionCache<RetType, Args...>();
}
// HWY_CHOOSE_*(FUNC_NAME) expands to the function pointer for that target or
// nullptr if that target was not compiled.
#if HWY_TARGETS & HWY_SCALAR
#define HWY_CHOOSE_SCALAR(FUNC_NAME) &N_SCALAR::FUNC_NAME
#else
// When scalar is not present and we try to use scalar because other targets
// were disabled at runtime we fall back to the baseline with
// HWY_STATIC_DISPATCH()
#define HWY_CHOOSE_SCALAR(FUNC_NAME) &HWY_STATIC_DISPATCH(FUNC_NAME)
#endif

#if HWY_TARGETS & HWY_WASM
#define HWY_CHOOSE_WASM(FUNC_NAME) &N_WASM::FUNC_NAME
#else
#define HWY_CHOOSE_WASM(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_RVV
#define HWY_CHOOSE_RVV(FUNC_NAME) &N_RVV::FUNC_NAME
#else
#define HWY_CHOOSE_RVV(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_NEON
#define HWY_CHOOSE_NEON(FUNC_NAME) &N_NEON::FUNC_NAME
#else
#define HWY_CHOOSE_NEON(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_PPC8
// Fixed typo: this was previously spelled HWY_CHOOSE_PCC8, which left
// HWY_CHOOSE_PPC8 undefined (and dispatch tables unable to expand) whenever
// the PPC8 target was enabled.
#define HWY_CHOOSE_PPC8(FUNC_NAME) &N_PPC8::FUNC_NAME
#else
#define HWY_CHOOSE_PPC8(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_SSE4
#define HWY_CHOOSE_SSE4(FUNC_NAME) &N_SSE4::FUNC_NAME
#else
#define HWY_CHOOSE_SSE4(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_AVX2
#define HWY_CHOOSE_AVX2(FUNC_NAME) &N_AVX2::FUNC_NAME
#else
#define HWY_CHOOSE_AVX2(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_AVX3
#define HWY_CHOOSE_AVX3(FUNC_NAME) &N_AVX3::FUNC_NAME
#else
#define HWY_CHOOSE_AVX3(FUNC_NAME) nullptr
#endif
// Name of the per-function dispatch table, derived from the function name.
#define HWY_DISPATCH_TABLE(FUNC_NAME) \
  HWY_CONCAT(FUNC_NAME, HighwayDispatchTable)

// HWY_EXPORT(FUNC_NAME); expands to a static array that is used by
// HWY_DYNAMIC_DISPATCH() to call the appropriate function at runtime. This
// static array must be defined at the same namespace level as the function
// it is exporting.
// After being exported, it can be called from other parts of the same source
// file using HWY_DYNAMIC_DISPATCH(), in particular from a function wrapper
// like in the following example:
//
//   #include "hwy/highway.h"
//   HWY_BEFORE_NAMESPACE();
//   namespace skeleton {
//   namespace HWY_NAMESPACE {
//
//   void MyFunction(int a, char b, const char* c) { ... }
//
//   // NOLINTNEXTLINE(google-readability-namespace-comments)
//   }  // namespace HWY_NAMESPACE
//   }  // namespace skeleton
//   HWY_AFTER_NAMESPACE();
//
//   namespace skeleton {
//   HWY_EXPORT(MyFunction);  // Defines the dispatch table in this scope.
//
//   void MyFunction(int a, char b, const char* c) {
//     return HWY_DYNAMIC_DISPATCH(MyFunction)(a, b, c);
//   }
//   }  // namespace skeleton
//
#if HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0)
// Simplified version for IDE or the dynamic dispatch case with only one target.
// This case still uses a table, although of a single element, to provide the
// same compile error conditions as with the dynamic dispatch case when multiple
// targets are being compiled.
#define HWY_EXPORT(FUNC_NAME)                                        \
  HWY_MAYBE_UNUSED static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) \
      const HWY_DISPATCH_TABLE(FUNC_NAME)[1] = {                     \
          &HWY_STATIC_DISPATCH(FUNC_NAME)}
#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) HWY_STATIC_DISPATCH(FUNC_NAME)
#else
// Dynamic dispatch case with one entry per dynamic target plus the scalar
// mode and the initialization wrapper.
#define HWY_EXPORT(FUNC_NAME)                                               \
  static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME))                          \
      const HWY_DISPATCH_TABLE(FUNC_NAME)[HWY_MAX_DYNAMIC_TARGETS + 2] = { \
          /* The first entry in the table initializes the global cache and  \
           * calls the appropriate function. */                             \
          &decltype(hwy::FunctionCacheFactory(&HWY_STATIC_DISPATCH(         \
              FUNC_NAME)))::ChooseAndCall<HWY_DISPATCH_TABLE(FUNC_NAME)>,   \
          HWY_CHOOSE_TARGET_LIST(FUNC_NAME),                                \
          HWY_CHOOSE_SCALAR(FUNC_NAME),                                     \
  }
#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) \
  (*(HWY_DISPATCH_TABLE(FUNC_NAME)[hwy::chosen_target.GetIndex()]))
#endif  // HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0)
} // namespace hwy
#endif // HWY_HIGHWAY_INCLUDED
//------------------------------------------------------------------------------
// NOTE: the following definitions and ops/*.h depend on HWY_TARGET, so we want
// to include them once per target, which is ensured by the toggle check.
// Because ops/*.h are included under it, they do not need their own guard.
#if defined(HWY_HIGHWAY_PER_TARGET) == defined(HWY_TARGET_TOGGLE)
#ifdef HWY_HIGHWAY_PER_TARGET
#undef HWY_HIGHWAY_PER_TARGET
#else
#define HWY_HIGHWAY_PER_TARGET
#endif
// Full-width descriptor with a register-group multiplier; the LMUL factor
// only has an effect on RVV, other targets ignore it.
#undef HWY_FULL2
#if HWY_TARGET == HWY_RVV
#define HWY_FULL2(T, LMUL) hwy::HWY_NAMESPACE::Simd<T, HWY_LANES(T) * (LMUL)>
#else
#define HWY_FULL2(T, LMUL) hwy::HWY_NAMESPACE::Simd<T, HWY_LANES(T)>
#endif
// These define ops inside namespace hwy::HWY_NAMESPACE.
#if HWY_TARGET == HWY_SSE4
#include "hwy/ops/x86_128-inl.h"
#elif HWY_TARGET == HWY_AVX2
#include "hwy/ops/x86_256-inl.h"
#elif HWY_TARGET == HWY_AVX3
#include "hwy/ops/x86_512-inl.h"
#elif HWY_TARGET == HWY_PPC8
#elif HWY_TARGET == HWY_NEON
#include "hwy/ops/arm_neon-inl.h"
#elif HWY_TARGET == HWY_WASM
#include "hwy/ops/wasm_128-inl.h"
#elif HWY_TARGET == HWY_RVV
#include "hwy/ops/rvv-inl.h"
#elif HWY_TARGET == HWY_SCALAR
#include "hwy/ops/scalar-inl.h"
#else
#pragma message("HWY_TARGET does not match any known target")
#endif  // HWY_TARGET
// Commonly used functions/types that must come after ops are defined.
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// The lane type of a vector type, e.g. float for Vec<Simd<float, 4>>.
template <class V>
using LaneType = decltype(GetLane(V()));
// Vector type, e.g. Vec128<float> for Simd<float, 4>. Useful as the return type
// of functions that do not take a vector argument, or as an argument type if
// the function only has a template argument for D, or for explicit type names
// instead of auto. This may be a built-in type.
template <class D>
using Vec = decltype(Zero(D()));
// Mask type. Useful as the return type of functions that do not take a mask
// argument, or as an argument type if the function only has a template argument
// for D, or for explicit type names instead of auto.
template <class D>
using Mask = decltype(MaskFromVec(Zero(D())));
// Returns the closest value to v within [lo, hi].
template <class V>
HWY_API V Clamp(const V v, const V lo, const V hi) {
  return Min(Max(lo, v), hi);
}
// CombineShiftRightBytes (and ..Lanes) are not available for the scalar target.
// TODO(janwas): implement for RVV
#if HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_RVV
// Lane-granularity wrapper over CombineShiftRightBytes: shifts the (hi, lo)
// pair right by kLanes lanes.
template <size_t kLanes, class V>
HWY_API V CombineShiftRightLanes(const V hi, const V lo) {
  return CombineShiftRightBytes<kLanes * sizeof(LaneType<V>)>(hi, lo);
}
#endif
// Returns lanes with the most significant bit set and all other bits zero.
template <class D>
HWY_API Vec<D> SignBit(D d) {
  using Unsigned = MakeUnsigned<TFromD<D>>;
  // MSB of the unsigned lane type; BitCast reinterprets it as the lane type.
  const Unsigned bit = Unsigned(1) << (sizeof(Unsigned) * 8 - 1);
  return BitCast(d, Set(Rebind<Unsigned, D>(), bit));
}
// Returns quiet NaN.
template <class D>
HWY_API Vec<D> NaN(D d) {
  const RebindToSigned<D> di;
  // LimitsMax sets all exponent and mantissa bits to 1. The exponent plus
  // mantissa MSB (to indicate quiet) would be sufficient.
  return BitCast(d, Set(di, LimitsMax<TFromD<decltype(di)>>()));
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();
#endif  // HWY_HIGHWAY_PER_TARGET

313
third_party/highway/hwy/highway_test.cc поставляемый Normal file
Просмотреть файл

@ -0,0 +1,313 @@
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stddef.h>
#include <stdint.h>
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "highway_test.cc"
#include "hwy/foreach_target.h"
#include "hwy/highway.h"
#include "hwy/nanobenchmark.h" // Unpredictable1
#include "hwy/tests/test_util-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Checks the basic vector constructors: Zero, Set, Iota and Undefined.
struct TestSet {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    auto lanes = AllocateAligned<T>(N);

    // Zero: every lane compares equal to T(0).
    std::fill(lanes.get(), lanes.get() + N, T(0));
    HWY_ASSERT_VEC_EQ(d, lanes.get(), Zero(d));

    // Set broadcasts one value to all lanes.
    std::fill(lanes.get(), lanes.get() + N, T(2));
    HWY_ASSERT_VEC_EQ(d, lanes.get(), Set(d, T(2)));

    // Iota: lane i holds first + i.
    for (size_t i = 0; i < N; ++i) {
      lanes[i] = T(5 + i);
    }
    HWY_ASSERT_VEC_EQ(d, lanes.get(), Iota(d, T(5)));

    // Undefined: contents are unspecified; only require that the vector is
    // usable (storable).
    Store(Undefined(d), d, lanes.get());
  }
};
HWY_NOINLINE void TestAllSet() { ForAllTypes(ForPartialVectors<TestSet>()); }
// Ensures integer arithmetic wraps modulo 2^bits in both directions.
struct TestOverflow {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto one = Set(d, T(1));
    const auto vmax = Set(d, LimitsMax<T>());
    const auto vmin = Set(d, LimitsMin<T>());
    // Decrementing the minimum wraps around to the maximum
    // (unsigned underflow / signed negative -> positive).
    HWY_ASSERT_VEC_EQ(d, vmax, vmin - one);
    // Incrementing the maximum wraps around to the minimum
    // (unsigned overflow / signed positive -> negative).
    HWY_ASSERT_VEC_EQ(d, vmin, vmax + one);
  }
};
HWY_NOINLINE void TestAllOverflow() {
  ForIntegerTypes(ForPartialVectors<TestOverflow>());
}
// For integers, SignBit must be exactly the MSB with all lower bits clear.
struct TestSignBitInteger {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto zero = Zero(d);
    const auto ones = VecFromMask(d, Eq(zero, zero));  // all bits set
    const auto sign = SignBit(d);
    const auto below = Sub(sign, Set(d, 1));  // every bit under the MSB
    // Doubling the MSB overflows to zero.
    HWY_ASSERT_VEC_EQ(d, zero, Add(sign, sign));
    // MSB plus all lower bits gives all-ones, which proves the low bits of
    // SignBit are zero (only +/- and logical ops are available for all types).
    HWY_ASSERT_VEC_EQ(d, ones, Add(sign, below));
  }
};
// For floats, SignBit is -0.0: OR negates, AndNot takes the absolute value.
struct TestSignBitFloat {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto zero = Zero(d);
    const auto sign = SignBit(d);
    const auto pos = Set(d, 2.25);
    const auto neg = Set(d, -2.25);
    HWY_ASSERT_VEC_EQ(d, Or(pos, sign), neg);
    HWY_ASSERT_VEC_EQ(d, AndNot(sign, neg), pos);
    // -0.0 compares equal to +0.0.
    HWY_ASSERT_VEC_EQ(d, zero, sign);
  }
};
HWY_NOINLINE void TestAllSignBit() {
  ForIntegerTypes(ForPartialVectors<TestSignBitInteger>());
  ForFloatTypes(ForPartialVectors<TestSignBitFloat>());
}
// std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY.
template <typename TF>
bool IsNaN(TF f) {
  MakeUnsigned<TF> bits;
  memcpy(&bits, &f, sizeof(TF));
  // Shift the sign bit out and back in (zero-filled), leaving |f|'s bits.
  bits <<= 1;
  bits >>= 1;
  // NaN iff all exponent bits are set and the mantissa is nonzero, i.e.
  // strictly greater than the exponent mask alone.
  return bits > ExponentMask<decltype(bits)>();
}
// Aborts with a diagnostic (type name, value and raw bits) if the first lane
// of v is not NaN.
template <class D, class V>
HWY_NOINLINE void AssertNaN(const D d, const V v, const char* file, int line) {
  using T = TFromD<D>;
  const T lane = GetLane(v);
  if (!IsNaN(lane)) {
    const std::string type_name = TypeName(T(), Lanes(d));
    MakeUnsigned<T> bits;
    memcpy(&bits, &lane, sizeof(T));
    // RVV lacks PRIu64, so use size_t; double will be truncated on 32-bit.
    Abort(file, line, "Expected %s NaN, got %E (%zu)", type_name.c_str(), lane,
          size_t(bits));
  }
}
#define HWY_ASSERT_NAN(d, v) AssertNaN(d, v, __FILE__, __LINE__)
// Verifies NaN propagates through arithmetic, FMA, Sqrt, sign manipulation,
// rounding and Or, and that all ordered comparisons involving NaN are false.
struct TestNaN {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    // Unpredictable1() == 1, but the compiler cannot prove it, so the NaN
    // below cannot be constant-folded away.
    const auto v1 = Set(d, T(Unpredictable1()));
    const auto nan = IfThenElse(Eq(v1, Set(d, T(1))), NaN(d), v1);
    HWY_ASSERT_NAN(d, nan);
    // Arithmetic
    HWY_ASSERT_NAN(d, Add(nan, v1));
    HWY_ASSERT_NAN(d, Add(v1, nan));
    HWY_ASSERT_NAN(d, Sub(nan, v1));
    HWY_ASSERT_NAN(d, Sub(v1, nan));
    HWY_ASSERT_NAN(d, Mul(nan, v1));
    HWY_ASSERT_NAN(d, Mul(v1, nan));
    HWY_ASSERT_NAN(d, Div(nan, v1));
    HWY_ASSERT_NAN(d, Div(v1, nan));
    // FMA
    HWY_ASSERT_NAN(d, MulAdd(nan, v1, v1));
    HWY_ASSERT_NAN(d, MulAdd(v1, nan, v1));
    HWY_ASSERT_NAN(d, MulAdd(v1, v1, nan));
    HWY_ASSERT_NAN(d, MulSub(nan, v1, v1));
    HWY_ASSERT_NAN(d, MulSub(v1, nan, v1));
    HWY_ASSERT_NAN(d, MulSub(v1, v1, nan));
    HWY_ASSERT_NAN(d, NegMulAdd(nan, v1, v1));
    HWY_ASSERT_NAN(d, NegMulAdd(v1, nan, v1));
    HWY_ASSERT_NAN(d, NegMulAdd(v1, v1, nan));
    HWY_ASSERT_NAN(d, NegMulSub(nan, v1, v1));
    HWY_ASSERT_NAN(d, NegMulSub(v1, nan, v1));
    HWY_ASSERT_NAN(d, NegMulSub(v1, v1, nan));
    // Rcp/Sqrt
    HWY_ASSERT_NAN(d, Sqrt(nan));
    // Sign manipulation
    HWY_ASSERT_NAN(d, Abs(nan));
    HWY_ASSERT_NAN(d, Neg(nan));
    HWY_ASSERT_NAN(d, CopySign(nan, v1));
    HWY_ASSERT_NAN(d, CopySignToAbs(nan, v1));
    // Rounding
    HWY_ASSERT_NAN(d, Ceil(nan));
    HWY_ASSERT_NAN(d, Floor(nan));
    HWY_ASSERT_NAN(d, Round(nan));
    HWY_ASSERT_NAN(d, Trunc(nan));
    // Logical (And/AndNot/Xor will clear NaN!)
    HWY_ASSERT_NAN(d, Or(nan, v1));
    // Comparison
    HWY_ASSERT(AllFalse(Eq(nan, v1)));
    HWY_ASSERT(AllFalse(Gt(nan, v1)));
    HWY_ASSERT(AllFalse(Lt(nan, v1)));
    HWY_ASSERT(AllFalse(Ge(nan, v1)));
    HWY_ASSERT(AllFalse(Le(nan, v1)));
  }
};
// For functions only available for float32
struct TestF32NaN {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto v1 = Set(d, T(Unpredictable1()));
    const auto nan = IfThenElse(Eq(v1, Set(d, T(1))), NaN(d), v1);
    HWY_ASSERT_NAN(d, ApproximateReciprocal(nan));
    HWY_ASSERT_NAN(d, ApproximateReciprocalSqrt(nan));
    HWY_ASSERT_NAN(d, AbsDiff(nan, v1));
    HWY_ASSERT_NAN(d, AbsDiff(v1, nan));
  }
};
// Reductions and Min/Max with NaN, whose behavior differs per architecture.
// TODO(janwas): move to TestNaN once supported for partial vectors
struct TestFullNaN {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto v1 = Set(d, T(Unpredictable1()));
    const auto nan = IfThenElse(Eq(v1, Set(d, T(1))), NaN(d), v1);
    HWY_ASSERT_NAN(d, SumOfLanes(nan));
    // Reduction (pending clarification on RVV)
#if HWY_TARGET != HWY_RVV
    HWY_ASSERT_NAN(d, MinOfLanes(nan));
    HWY_ASSERT_NAN(d, MaxOfLanes(nan));
#endif
#if HWY_ARCH_X86 && HWY_TARGET != HWY_SCALAR
    // x86 SIMD returns the second operand if any input is NaN.
    HWY_ASSERT_VEC_EQ(d, v1, Min(nan, v1));
    HWY_ASSERT_VEC_EQ(d, v1, Max(nan, v1));
    HWY_ASSERT_NAN(d, Min(v1, nan));
    HWY_ASSERT_NAN(d, Max(v1, nan));
#elif HWY_ARCH_WASM
    // Should return NaN if any input is NaN, but does not for scalar.
    // TODO(janwas): remove once this is fixed.
#elif HWY_TARGET == HWY_NEON && HWY_ARCH_ARM_V7
    // ARMv7 NEON returns NaN if any input is NaN.
    HWY_ASSERT_NAN(d, Min(v1, nan));
    HWY_ASSERT_NAN(d, Max(v1, nan));
    HWY_ASSERT_NAN(d, Min(nan, v1));
    HWY_ASSERT_NAN(d, Max(nan, v1));
#else
    // IEEE 754-2019 minimumNumber is defined as the other argument if exactly
    // one is NaN, and qNaN if both are.
    HWY_ASSERT_VEC_EQ(d, v1, Min(nan, v1));
    HWY_ASSERT_VEC_EQ(d, v1, Max(nan, v1));
    HWY_ASSERT_VEC_EQ(d, v1, Min(v1, nan));
    HWY_ASSERT_VEC_EQ(d, v1, Max(v1, nan));
#endif
    HWY_ASSERT_NAN(d, Min(nan, nan));
    HWY_ASSERT_NAN(d, Max(nan, nan));
    // Comparison
    HWY_ASSERT(AllFalse(Eq(nan, v1)));
    HWY_ASSERT(AllFalse(Gt(nan, v1)));
    HWY_ASSERT(AllFalse(Lt(nan, v1)));
    HWY_ASSERT(AllFalse(Ge(nan, v1)));
    HWY_ASSERT(AllFalse(Le(nan, v1)));
  }
};
// Runs all NaN tests: generic ops on partial vectors, f32-only ops, and the
// full-vector-only reductions.
HWY_NOINLINE void TestAllNaN() {
  ForFloatTypes(ForPartialVectors<TestNaN>());
  ForPartialVectors<TestF32NaN>()(float());
  ForFloatTypes(ForFullVectors<TestFullNaN>());
}
// Vectors must be copy-constructible and assignable with lanes preserved.
struct TestCopyAndAssign {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto src = Iota(d, 3);
    // Copy construction preserves all lanes.
    auto copied(src);
    HWY_ASSERT_VEC_EQ(d, src, copied);
    // Assignment overwrites a previously undefined vector.
    auto assigned = Undefined(d);
    assigned = src;
    HWY_ASSERT_VEC_EQ(d, src, assigned);
  }
};
HWY_NOINLINE void TestAllCopyAndAssign() {
  ForAllTypes(ForPartialVectors<TestCopyAndAssign>());
}
// GetLane extracts lane 0; check a zero and a broadcast constant.
struct TestGetLane {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    HWY_ASSERT_EQ(T(0), GetLane(Zero(d)));
    HWY_ASSERT_EQ(T(1), GetLane(Set(d, 1)));
  }
};
HWY_NOINLINE void TestAllGetLane() {
  ForAllTypes(ForPartialVectors<TestGetLane>());
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
// Compiled exactly once (HWY_ONCE): defines the dispatch tables and registers
// each TestAll* entry point with the test harness so it runs per target.
HWY_BEFORE_TEST(HighwayTest);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllSet);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllOverflow);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllSignBit);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllNaN);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllCopyAndAssign);
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllGetLane);
}  // namespace hwy
#endif

722
third_party/highway/hwy/nanobenchmark.cc поставляемый Normal file
Просмотреть файл

@ -0,0 +1,722 @@
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/nanobenchmark.h"
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h> // abort
#include <string.h> // memcpy
#include <time.h> // clock_gettime
#include <algorithm> // sort
#include <array>
#include <atomic>
#include <limits>
#include <numeric> // iota
#include <random>
#include <string>
#include <vector>
#include "hwy/base.h"
#if HWY_ARCH_PPC
#include <sys/platform/ppc.h> // NOLINT __ppc_get_timebase_freq
#elif HWY_ARCH_X86
#if HWY_COMPILER_MSVC
#include <intrin.h>
#else
#include <cpuid.h> // NOLINT
#endif // HWY_COMPILER_MSVC
#endif // HWY_ARCH_X86
namespace hwy {
namespace platform {
namespace {
#if HWY_ARCH_X86
// Executes CPUID for the given leaf ("level") and subleaf ("count") and
// writes EAX/EBX/ECX/EDX into abcd[0..3].
void Cpuid(const uint32_t level, const uint32_t count,
           uint32_t* HWY_RESTRICT abcd) {
#if HWY_COMPILER_MSVC
  int regs[4];
  __cpuidex(regs, level, count);
  // Copy element-wise: the intrinsic writes int, the interface is uint32_t.
  for (int i = 0; i < 4; ++i) {
    abcd[i] = regs[i];
  }
#else
  uint32_t a;
  uint32_t b;
  uint32_t c;
  uint32_t d;
  __cpuid_count(level, count, a, b, c, d);
  abcd[0] = a;
  abcd[1] = b;
  abcd[2] = c;
  abcd[3] = d;
#endif
}
// Returns the 48-character CPU brand string from CPUID, or an empty string if
// the required extended leaves are unsupported.
std::string BrandString() {
  std::array<uint32_t, 4> regs;
  // Leaf 0x80000000 reports the highest supported extended leaf; the brand
  // string occupies leaves 0x80000002..0x80000004.
  Cpuid(0x80000000U, 0, regs.data());
  if (regs[0] < 0x80000004U) {
    return std::string();
  }
  char brand[49];
  for (size_t leaf = 0; leaf < 3; ++leaf) {
    Cpuid(static_cast<uint32_t>(0x80000002U + leaf), 0, regs.data());
    // Each leaf contributes 16 of the 48 characters.
    memcpy(brand + leaf * 16, regs.data(), sizeof(regs));
  }
  brand[48] = 0;  // ensure termination
  return brand;
}
// Returns the frequency quoted inside the brand string. This does not
// account for throttling nor Turbo Boost.
double NominalClockRate() {
const std::string& brand_string = BrandString();
// Brand strings include the maximum configured frequency. These prefixes are
// defined by Intel CPUID documentation.
const char* prefixes[3] = {"MHz", "GHz", "THz"};
const double multipliers[3] = {1E6, 1E9, 1E12};
for (size_t i = 0; i < 3; ++i) {
const size_t pos_prefix = brand_string.find(prefixes[i]);
if (pos_prefix != std::string::npos) {
const size_t pos_space = brand_string.rfind(' ', pos_prefix - 1);
if (pos_space != std::string::npos) {
const std::string digits =
brand_string.substr(pos_space + 1, pos_prefix - pos_space - 1);
return std::stod(digits) * multipliers[i];
}
}
}
return 0.0;
}
#endif // HWY_ARCH_X86
} // namespace
// Returns tick rate. Invariant means the tick counter frequency is independent
// of CPU throttling or sleep. May be expensive, caller should cache the result.
double InvariantTicksPerSecond() {
#if HWY_ARCH_PPC
  return __ppc_get_timebase_freq();
#elif HWY_ARCH_X86
  // We assume the TSC is invariant; it is on all recent Intel/AMD CPUs.
  return NominalClockRate();
#else
  // Fall back to clock_gettime nanoseconds.
  return 1E9;
#endif
}
} // namespace platform
namespace {
// Prevents the compiler from eliding the computations that led to "output".
template <class T>
inline void PreventElision(T&& output) {
#if HWY_COMPILER_MSVC == 0
  // Works by indicating to the compiler that "output" is being read and
  // modified. The +r constraint avoids unnecessary writes to memory, but only
  // works for built-in types (typically FuncOutput).
  asm volatile("" : "+r"(output) : : "memory");
#else
  // MSVC does not support inline assembly anymore (and never supported GCC's
  // RTL constraints). Self-assignment with #pragma optimize("off") might be
  // expected to prevent elision, but it does not with MSVC 2015. Type-punning
  // with volatile pointers generates inefficient code on MSVC 2017.
  // A relaxed atomic store is a side effect the optimizer must preserve.
  static std::atomic<T> dummy(T{});
  dummy.store(output, std::memory_order_relaxed);
#endif
}
namespace timer {
// Start/Stop return absolute timestamps and must be placed immediately before
// and after the region to measure. We provide separate Start/Stop functions
// because they use different fences.
//
// Background: RDTSC is not 'serializing'; earlier instructions may complete
// after it, and/or later instructions may complete before it. 'Fences' ensure
// regions' elapsed times are independent of such reordering. The only
// documented unprivileged serializing instruction is CPUID, which acts as a
// full fence (no reordering across it in either direction). Unfortunately
// the latency of CPUID varies wildly (perhaps made worse by not initializing
// its EAX input). Because it cannot reliably be deducted from the region's
// elapsed time, it must not be included in the region to measure (i.e.
// between the two RDTSC).
//
// The newer RDTSCP is sometimes described as serializing, but it actually
// only serves as a half-fence with release semantics. Although all
// instructions in the region will complete before the final timestamp is
// captured, subsequent instructions may leak into the region and increase the
// elapsed time. Inserting another fence after the final RDTSCP would prevent
// such reordering without affecting the measured region.
//
// Fortunately, such a fence exists. The LFENCE instruction is only documented
// to delay later loads until earlier loads are visible. However, Intel's
// reference manual says it acts as a full fence (waiting until all earlier
// instructions have completed, and delaying later instructions until it
// completes). AMD assigns the same behavior to MFENCE.
//
// We need a fence before the initial RDTSC to prevent earlier instructions
// from leaking into the region, and arguably another after RDTSC to avoid
// region instructions from completing before the timestamp is recorded.
// When surrounded by fences, the additional RDTSCP half-fence provides no
// benefit, so the initial timestamp can be recorded via RDTSC, which has
// lower overhead than RDTSCP because it does not read TSC_AUX. In summary,
// we define Start = LFENCE/RDTSC/LFENCE; Stop = RDTSCP/LFENCE.
//
// Using Start+Start leads to higher variance and overhead than Stop+Stop.
// However, Stop+Stop includes an LFENCE in the region measurements, which
// adds a delay dependent on earlier loads. The combination of Start+Stop
// is faster than Start+Start and more consistent than Stop+Stop because
// the first LFENCE already delayed subsequent loads before the measured
// region. This combination seems not to have been considered in prior work:
// http://akaros.cs.berkeley.edu/lxr/akaros/kern/arch/x86/rdtsc_test.c
//
// Note: performance counters can measure 'exact' instructions-retired or
// (unhalted) cycle counts. The RDPMC instruction is not serializing and also
// requires fences. Unfortunately, it is not accessible on all OSes and we
// prefer to avoid kernel-mode drivers. Performance counters are also affected
// by several under/over-count errata, so we use the TSC instead.
// Returns a 64-bit timestamp in unit of 'ticks'; to convert to seconds,
// divide by InvariantTicksPerSecond. Start of a measured region: per the
// rationale above, fences on both sides keep surrounding instructions out.
inline uint64_t Start64() {
  uint64_t t;
#if HWY_ARCH_PPC
  asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
  _ReadWriteBarrier();
  _mm_lfence();
  _ReadWriteBarrier();
  t = __rdtsc();
  _ReadWriteBarrier();
  _mm_lfence();
  _ReadWriteBarrier();
#elif HWY_ARCH_X86_64
  asm volatile(
      "lfence\n\t"
      "rdtsc\n\t"
      "shl $32, %%rdx\n\t"
      "or %%rdx, %0\n\t"
      "lfence"
      : "=a"(t)
      :
      // "memory" avoids reordering. rdx = TSC >> 32.
      // "cc" = flags modified by SHL.
      : "rdx", "memory", "cc");
#elif HWY_ARCH_RVV
  asm volatile("rdcycle %0" : "=r"(t));
#else
  // Fall back to OS - unsure how to reliably query cntvct_el0 frequency.
  timespec ts;
  clock_gettime(CLOCK_MONOTONIC, &ts);
  t = ts.tv_sec * 1000000000LL + ts.tv_nsec;
#endif
  return t;
}
// End of a measured region: RDTSCP waits for prior instructions to complete
// and the trailing LFENCE (see rationale above) stops later instructions from
// leaking into the region.
inline uint64_t Stop64() {
  uint64_t t;
#if HWY_ARCH_PPC
  asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
  _ReadWriteBarrier();
  unsigned aux;
  t = __rdtscp(&aux);
  _ReadWriteBarrier();
  _mm_lfence();
  _ReadWriteBarrier();
#elif HWY_ARCH_X86_64
  // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
  asm volatile(
      "rdtscp\n\t"
      "shl $32, %%rdx\n\t"
      "or %%rdx, %0\n\t"
      "lfence"
      : "=a"(t)
      :
      // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
      // "cc" = flags modified by SHL.
      : "rcx", "rdx", "memory", "cc");
#else
  // Platforms without a dedicated stop primitive reuse Start64.
  t = Start64();
#endif
  return t;
}
// Returns a 32-bit timestamp with about 4 cycles less overhead than
// Start64. Only suitable for measuring very short regions because the
// timestamp overflows about once a second.
inline uint32_t Start32() {
  uint32_t t;
#if HWY_ARCH_X86 && HWY_COMPILER_MSVC
  _ReadWriteBarrier();
  _mm_lfence();
  _ReadWriteBarrier();
  t = static_cast<uint32_t>(__rdtsc());
  _ReadWriteBarrier();
  _mm_lfence();
  _ReadWriteBarrier();
#elif HWY_ARCH_X86_64
  // Like Start64 but keeps only EAX (low 32 bits), avoiding the SHL/OR.
  asm volatile(
      "lfence\n\t"
      "rdtsc\n\t"
      "lfence"
      : "=a"(t)
      :
      // "memory" avoids reordering. rdx = TSC >> 32.
      : "rdx", "memory");
#elif HWY_ARCH_RVV
  asm volatile("rdcycle %0" : "=r"(t));
#else
  t = static_cast<uint32_t>(Start64());
#endif
  return t;
}
// 32-bit counterpart of Stop64: same fencing, truncated timestamp.
inline uint32_t Stop32() {
  uint32_t t;
#if HWY_ARCH_X86 && HWY_COMPILER_MSVC
  _ReadWriteBarrier();
  unsigned aux;
  t = static_cast<uint32_t>(__rdtscp(&aux));
  _ReadWriteBarrier();
  _mm_lfence();
  _ReadWriteBarrier();
#elif HWY_ARCH_X86_64
  // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
  asm volatile(
      "rdtscp\n\t"
      "lfence"
      : "=a"(t)
      :
      // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
      : "rcx", "rdx", "memory");
#else
  t = static_cast<uint32_t>(Stop64());
#endif
  return t;
}
} // namespace timer
namespace robust_statistics {
// Sorts integral values in ascending order (e.g. for Mode). About 3x faster
// than std::sort for input distributions with very few unique values.
template <class T>
void CountingSort(T* values, size_t num_values) {
// Unique values and their frequency (similar to flat_map).
using Unique = std::pair<T, int>;
std::vector<Unique> unique;
for (size_t i = 0; i < num_values; ++i) {
const T value = values[i];
const auto pos =
std::find_if(unique.begin(), unique.end(),
[value](const Unique u) { return u.first == value; });
if (pos == unique.end()) {
unique.push_back(std::make_pair(value, 1));
} else {
++pos->second;
}
}
// Sort in ascending order of value (pair.first).
std::sort(unique.begin(), unique.end());
// Write that many copies of each unique value to the array.
T* HWY_RESTRICT p = values;
for (const auto& value_count : unique) {
std::fill(p, p + value_count.second, value_count.first);
p += value_count.second;
}
NANOBENCHMARK_CHECK(p == values + num_values);
}
// @return i in [idx_begin, idx_begin + half_count) that minimizes
// sorted[i + half_count] - sorted[i], i.e. the start of the densest
// half-interval.
template <typename T>
size_t MinRange(const T* const HWY_RESTRICT sorted, const size_t idx_begin,
                const size_t half_count) {
  T min_range = std::numeric_limits<T>::max();
  size_t min_idx = 0;
  for (size_t idx = idx_begin; idx < idx_begin + half_count; ++idx) {
    // Requires ascending order so the range is non-negative.
    NANOBENCHMARK_CHECK(sorted[idx] <= sorted[idx + half_count]);
    const T range = sorted[idx + half_count] - sorted[idx];
    if (range < min_range) {
      min_range = range;
      min_idx = idx;
    }
  }
  return min_idx;
}
// Returns an estimate of the mode by calling MinRange on successively
// halved intervals. "sorted" must be in ascending order. This is the
// Half Sample Mode estimator proposed by Bickel in "On a fast, robust
// estimator of the mode", with complexity O(N log N). The mode is less
// affected by outliers in highly-skewed distributions than the median.
// The averaging operation below assumes "T" is an unsigned integer type.
template <typename T>
T ModeOfSorted(const T* const HWY_RESTRICT sorted, const size_t num_values) {
  size_t idx_begin = 0;
  size_t half_count = num_values / 2;
  // Repeatedly narrow down to the densest half-interval.
  while (half_count > 1) {
    idx_begin = MinRange(sorted, idx_begin, half_count);
    half_count >>= 1;
  }
  const T x = sorted[idx_begin + 0];
  // num_values < 2: the single candidate is the mode.
  if (half_count == 0) {
    return x;
  }
  NANOBENCHMARK_CHECK(half_count == 1);
  // Round-to-nearest average of the final two candidates (+1 rounds up).
  const T average = (x + sorted[idx_begin + 1] + 1) / 2;
  return average;
}
// Returns the mode. Side effect: sorts "values".
template <typename T>
T Mode(T* values, const size_t num_values) {
  CountingSort(values, num_values);
  return ModeOfSorted(values, num_values);
}
// Convenience overload for fixed-size arrays; N is deduced.
template <typename T, size_t N>
T Mode(T (&values)[N]) {
  return Mode(&values[0], N);
}
// Returns the median value. Side effect: sorts "values".
template <typename T>
T Median(T* values, const size_t num_values) {
NANOBENCHMARK_CHECK(!values->empty());
std::sort(values, values + num_values);
const size_t half = num_values / 2;
// Odd count: return middle
if (num_values % 2) {
return values[half];
}
// Even count: return average of middle two.
return (values[half] + values[half - 1] + 1) / 2;
}
// Returns a robust measure of variability: the median of |x - median| over
// all samples.
template <typename T>
T MedianAbsoluteDeviation(const T* values, const size_t num_values,
                          const T median) {
  NANOBENCHMARK_CHECK(num_values != 0);
  std::vector<T> deviations;
  deviations.reserve(num_values);
  for (size_t i = 0; i < num_values; ++i) {
    // Compute in int64 to avoid unsigned wraparound before taking |.|.
    const int64_t diff = std::abs(int64_t(values[i]) - int64_t(median));
    deviations.push_back(static_cast<T>(diff));
  }
  return Median(deviations.data(), num_values);
}
} // namespace robust_statistics
// Ticks := platform-specific timer values (CPU cycles on x86). Must be
// unsigned to guarantee wraparound on overflow. 32 bit timers are faster to
// read than 64 bit.
using Ticks = uint32_t;
// Returns timer overhead / minimum measurable difference.
Ticks TimerResolution() {
  // Nested loop avoids exceeding stack/L1 capacity.
  Ticks repetitions[Params::kTimerSamples];
  for (size_t rep = 0; rep < Params::kTimerSamples; ++rep) {
    Ticks samples[Params::kTimerSamples];
    for (size_t i = 0; i < Params::kTimerSamples; ++i) {
      // Back-to-back timer reads: the difference is pure timer overhead.
      const Ticks t0 = timer::Start32();
      const Ticks t1 = timer::Stop32();
      samples[i] = t1 - t0;
    }
    // Mode is robust to occasional outliers (e.g. interrupts).
    repetitions[rep] = robust_statistics::Mode(samples);
  }
  return robust_statistics::Mode(repetitions);
}
// Cached once because TimerResolution() is relatively expensive.
static const Ticks timer_resolution = TimerResolution();
// Estimates the expected value of "lambda" values with a variable number of
// samples until the variability "rel_mad" is less than "max_rel_mad".
template <class Lambda>
Ticks SampleUntilStable(const double max_rel_mad, double* rel_mad,
                        const Params& p, const Lambda& lambda) {
  // Choose initial samples_per_eval based on a single estimated duration.
  Ticks t0 = timer::Start32();
  lambda();
  Ticks t1 = timer::Stop32();
  Ticks est = t1 - t0;
  static const double ticks_per_second = platform::InvariantTicksPerSecond();
  const size_t ticks_per_eval =
      static_cast<size_t>(ticks_per_second * p.seconds_per_eval);
  size_t samples_per_eval =
      est == 0 ? p.min_samples_per_eval : ticks_per_eval / est;
  samples_per_eval = std::max(samples_per_eval, p.min_samples_per_eval);
  std::vector<Ticks> samples;
  samples.reserve(1 + samples_per_eval);
  samples.push_back(est);
  // Percentage is too strict for tiny differences, so also allow a small
  // absolute "median absolute deviation".
  const Ticks max_abs_mad = (timer_resolution + 99) / 100;
  *rel_mad = 0.0;  // ensure initialized
  // Each evaluation doubles the sample count until the estimate stabilizes
  // or p.max_evals is exhausted.
  for (size_t eval = 0; eval < p.max_evals; ++eval, samples_per_eval *= 2) {
    samples.reserve(samples.size() + samples_per_eval);
    for (size_t i = 0; i < samples_per_eval; ++i) {
      t0 = timer::Start32();
      lambda();
      t1 = timer::Stop32();
      samples.push_back(t1 - t0);
    }
    if (samples.size() >= p.min_mode_samples) {
      est = robust_statistics::Mode(samples.data(), samples.size());
    } else {
      // For "few" (depends also on the variance) samples, Median is safer.
      est = robust_statistics::Median(samples.data(), samples.size());
    }
    NANOBENCHMARK_CHECK(est != 0);
    // Median absolute deviation (mad) is a robust measure of 'variability'.
    const Ticks abs_mad = robust_statistics::MedianAbsoluteDeviation(
        samples.data(), samples.size(), est);
    *rel_mad = static_cast<double>(int(abs_mad)) / est;
    if (*rel_mad <= max_rel_mad || abs_mad <= max_abs_mad) {
      if (p.verbose) {
        printf("%6zu samples => %5u (abs_mad=%4u, rel_mad=%4.2f%%)\n",
               samples.size(), est, abs_mad, *rel_mad * 100.0);
      }
      return est;
    }
  }
  if (p.verbose) {
    printf(
        "WARNING: rel_mad=%4.2f%% still exceeds %4.2f%% after %6zu samples.\n",
        *rel_mad * 100.0, max_rel_mad * 100.0, samples.size());
  }
  return est;
}
using InputVec = std::vector<FuncInput>;
// Returns the distinct input values, in ascending order.
InputVec UniqueInputs(const FuncInput* inputs, const size_t num_inputs) {
  InputVec unique(inputs, inputs + num_inputs);
  std::sort(unique.begin(), unique.end());
  // Drop adjacent duplicates (sorting made duplicates adjacent).
  const auto new_end = std::unique(unique.begin(), unique.end());
  unique.erase(new_end, unique.end());
  return unique;
}
// Returns how often we need to call func for sufficient precision, or zero
// on failure (e.g. the elapsed time is too long for a 32-bit tick count).
size_t NumSkip(const Func func, const uint8_t* arg, const InputVec& unique,
               const Params& p) {
  // Min elapsed ticks for any input.
  Ticks min_duration = ~0u;
  for (const FuncInput input : unique) {
    // Make sure a 32-bit timer is sufficient.
    const uint64_t t0 = timer::Start64();
    PreventElision(func(arg, input));
    const uint64_t t1 = timer::Stop64();
    const uint64_t elapsed = t1 - t0;
    if (elapsed >= (1ULL << 30)) {
      fprintf(stderr, "Measurement failed: need 64-bit timer for input=%zu\n",
              input);
      return 0;
    }
    double rel_mad;
    const Ticks total = SampleUntilStable(
        p.target_rel_mad, &rel_mad, p,
        [func, arg, input]() { PreventElision(func(arg, input)); });
    // NOTE(review): unsigned subtraction — if total < timer_resolution this
    // wraps to a huge value; presumably total always exceeds the timer
    // overhead, but confirm.
    min_duration = std::min(min_duration, total - timer_resolution);
  }
  // Number of repetitions required to reach the target resolution.
  const size_t max_skip = p.precision_divisor;
  // Number of repetitions given the estimated duration.
  const size_t num_skip =
      min_duration == 0 ? 0 : (max_skip + min_duration - 1) / min_duration;
  if (p.verbose) {
    printf("res=%u max_skip=%zu min_dur=%u num_skip=%zu\n", timer_resolution,
           max_skip, min_duration, num_skip);
  }
  return num_skip;
}
// Replicates inputs until we can omit "num_skip" occurrences of an input.
InputVec ReplicateInputs(const FuncInput* inputs, const size_t num_inputs,
                         const size_t num_unique, const size_t num_skip,
                         const Params& p) {
  InputVec full;
  const size_t num_copies = p.subset_ratio * num_skip;
  if (num_unique == 1) {
    // Only one distinct input: no interleaving/shuffling required.
    full.assign(num_copies, inputs[0]);
    return full;
  }
  full.reserve(num_copies * num_inputs);
  for (size_t copy = 0; copy < num_copies; ++copy) {
    full.insert(full.end(), inputs, inputs + num_inputs);
  }
  // Default-seeded engine: the shuffle is deterministic across runs.
  std::mt19937 rng;
  std::shuffle(full.begin(), full.end(), rng);
  return full;
}
// Copies the "full" to "subset" in the same order, but with "num_skip"
// randomly selected occurrences of "input_to_skip" removed.
void FillSubset(const InputVec& full, const FuncInput input_to_skip,
                const size_t num_skip, InputVec* subset) {
  const size_t count =
      static_cast<size_t>(std::count(full.begin(), full.end(), input_to_skip));
  // Generate num_skip random indices: which occurrence to skip.
  std::vector<uint32_t> omit(count);
  std::iota(omit.begin(), omit.end(), 0);
  // omit[] is the same on every call, but that's OK because they identify the
  // Nth instance of input_to_skip, so the position within full[] differs.
  std::mt19937 rng;
  std::shuffle(omit.begin(), omit.end(), rng);
  omit.resize(num_skip);
  std::sort(omit.begin(), omit.end());
  // Single pass over full[]: copy everything except the omitted occurrences.
  uint32_t occurrence = ~0u;  // 0 after preincrement
  size_t idx_omit = 0;        // cursor within omit[]
  size_t idx_subset = 0;      // cursor within *subset
  for (const FuncInput next : full) {
    if (next == input_to_skip) {
      ++occurrence;
      // Haven't removed enough already
      if (idx_omit < num_skip) {
        // This one is up for removal
        if (occurrence == omit[idx_omit]) {
          ++idx_omit;
          continue;
        }
      }
    }
    if (idx_subset < subset->size()) {
      (*subset)[idx_subset++] = next;
    }
  }
  NANOBENCHMARK_CHECK(idx_subset == subset->size());
  NANOBENCHMARK_CHECK(idx_omit == omit.size());
  NANOBENCHMARK_CHECK(occurrence == count - 1);
}
// Returns total ticks elapsed for all inputs. Also folds the sample
// variability into *max_rel_mad (caller accumulates the worst case).
Ticks TotalDuration(const Func func, const uint8_t* arg, const InputVec* inputs,
                    const Params& p, double* max_rel_mad) {
  double rel_mad;
  // One sample = calling func once per entry of *inputs, in order.
  const Ticks duration =
      SampleUntilStable(p.target_rel_mad, &rel_mad, p, [func, arg, inputs]() {
        for (const FuncInput input : *inputs) {
          PreventElision(func(arg, input));
        }
      });
  *max_rel_mad = std::max(*max_rel_mad, rel_mad);
  return duration;
}
// (Nearly) empty Func used to measure timer/harness overhead; simply returns
// its input as the proof-of-work. NOINLINE so the call itself is measured.
HWY_NOINLINE FuncOutput EmptyFunc(const void* /*arg*/, const FuncInput input) {
  const FuncOutput ret = input;
  return ret;
}
// Returns overhead of accessing inputs[] and calling a function; this will
// be deducted from future TotalDuration return values.
Ticks Overhead(const uint8_t* arg, const InputVec* inputs, const Params& p) {
  double rel_mad;  // required by SampleUntilStable; value discarded
  // Zero tolerance because repeatability is crucial and EmptyFunc is fast.
  return SampleUntilStable(0.0, &rel_mad, p, [arg, inputs]() {
    for (const FuncInput input : *inputs) {
      PreventElision(EmptyFunc(arg, input));
    }
  });
}
} // namespace
// Returns 1 (ticks are never all-ones in practice) via a runtime timer read,
// so the compiler cannot treat the result as a compile-time constant.
int Unpredictable1() {
  const uint64_t tick = timer::Start64();
  return (tick != ~0ULL) ? 1 : 0;
}
// Measures ticks for each unique input; returns the number of Results written,
// or 0 on failure (error already printed to stderr).
size_t Measure(const Func func, const uint8_t* arg, const FuncInput* inputs,
               const size_t num_inputs, Result* results, const Params& p) {
  NANOBENCHMARK_CHECK(num_inputs != 0);
  const InputVec& unique = UniqueInputs(inputs, num_inputs);

  const size_t num_skip = NumSkip(func, arg, unique, p);  // 0 on failure
  if (num_skip == 0) return 0;  // NumSkip already printed error message
  // (slightly less work on x86 to cast from signed integer)
  const float mul = 1.0f / static_cast<float>(static_cast<int>(num_skip));

  const InputVec& full =
      ReplicateInputs(inputs, num_inputs, unique.size(), num_skip, p);
  // subset holds full minus num_skip occurrences of one input (per iteration).
  InputVec subset(full.size() - num_skip);

  const Ticks overhead = Overhead(arg, &full, p);
  const Ticks overhead_skip = Overhead(arg, &subset, p);
  // The larger distribution must cost at least as much to iterate.
  if (overhead < overhead_skip) {
    fprintf(stderr, "Measurement failed: overhead %u < %u\n", overhead,
            overhead_skip);
    return 0;
  }

  if (p.verbose) {
    printf("#inputs=%5zu,%5zu overhead=%5u,%5u\n", full.size(), subset.size(),
           overhead, overhead_skip);
  }

  double max_rel_mad = 0.0;
  const Ticks total = TotalDuration(func, arg, &full, p, &max_rel_mad);

  for (size_t i = 0; i < unique.size(); ++i) {
    FillSubset(full, unique[i], num_skip, &subset);
    const Ticks total_skip = TotalDuration(func, arg, &subset, p, &max_rel_mad);

    if (total < total_skip) {
      fprintf(stderr, "Measurement failed: total %u < %u\n", total, total_skip);
      return 0;
    }

    // Difference of overhead-corrected totals = ticks spent on the num_skip
    // omitted calls; scale by 1/num_skip for a per-call estimate.
    const Ticks duration = (total - overhead) - (total_skip - overhead_skip);
    results[i].input = unique[i];
    results[i].ticks = static_cast<float>(duration) * mul;
    results[i].variability = static_cast<float>(max_rel_mad);
  }

  return unique.size();
}
} // namespace hwy

186
third_party/highway/hwy/nanobenchmark.h поставляемый Normal file
Просмотреть файл

@ -0,0 +1,186 @@
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef HIGHWAY_HWY_NANOBENCHMARK_H_
#define HIGHWAY_HWY_NANOBENCHMARK_H_
// Benchmarks functions of a single integer argument with realistic branch
// prediction hit rates. Uses a robust estimator to summarize the measurements.
// The precision is about 0.2%.
//
// Examples: see nanobenchmark_test.cc.
//
// Background: Microbenchmarks such as http://github.com/google/benchmark
// can measure elapsed times on the order of a microsecond. Shorter functions
// are typically measured by repeating them thousands of times and dividing
// the total elapsed time by this count. Unfortunately, repetition (especially
// with the same input parameter!) influences the runtime. In time-critical
// code, it is reasonable to expect warm instruction/data caches and TLBs,
// but a perfect record of which branches will be taken is unrealistic.
// Unless the application also repeatedly invokes the measured function with
// the same parameter, the benchmark is measuring something very different -
// a best-case result, almost as if the parameter were made a compile-time
// constant. This may lead to erroneous conclusions about branch-heavy
// algorithms outperforming branch-free alternatives.
//
// Our approach differs in three ways. Adding fences to the timer functions
// reduces variability due to instruction reordering, improving the timer
// resolution to about 40 CPU cycles. However, shorter functions must still
// be invoked repeatedly. For more realistic branch prediction performance,
// we vary the input parameter according to a user-specified distribution.
// Thus, instead of VaryInputs(Measure(Repeat(func))), we change the
// loop nesting to Measure(Repeat(VaryInputs(func))). We also estimate the
// central tendency of the measurement samples with the "half sample mode",
// which is more robust to outliers and skewed data than the mean or median.
// WARNING if included from multiple translation units compiled with distinct
// flags: this header requires textual inclusion and a predefined NB_NAMESPACE
// macro that is unique to the current compile flags. We must also avoid
// standard library headers such as vector and functional that define functions.
#include <stddef.h>
#include <stdint.h>
// Enables sanity checks that verify correct operation at the cost of
// longer benchmark runs.
#ifndef NANOBENCHMARK_ENABLE_CHECKS
#define NANOBENCHMARK_ENABLE_CHECKS 0
#endif
#define NANOBENCHMARK_CHECK_ALWAYS(condition) \
while (!(condition)) { \
fprintf(stderr, "Nanobenchmark check failed at line %d\n", __LINE__); \
abort(); \
}
#if NANOBENCHMARK_ENABLE_CHECKS
#define NANOBENCHMARK_CHECK(condition) NANOBENCHMARK_CHECK_ALWAYS(condition)
#else
#define NANOBENCHMARK_CHECK(condition)
#endif
namespace hwy {
namespace platform {
// Returns tick rate, useful for converting measurements to seconds. Invariant
// means the tick counter frequency is independent of CPU throttling or sleep.
// This call may be expensive, callers should cache the result.
double InvariantTicksPerSecond();
} // namespace platform
// Returns 1, but without the compiler knowing what the value is. This prevents
// optimizing out code.
int Unpredictable1();
// Input influencing the function being measured (e.g. number of bytes to copy).
using FuncInput = size_t;
// "Proof of work" returned by Func to ensure the compiler does not elide it.
using FuncOutput = uint64_t;
// Function to measure: either 1) a captureless lambda or function with two
// arguments or 2) a lambda with capture, in which case the first argument
// is reserved for use by MeasureClosure.
using Func = FuncOutput (*)(const void*, FuncInput);
// Internal parameters that determine precision/resolution/measuring time.
// All fields have sensible defaults; callers typically override only a few.
struct Params {
  // For measuring timer overhead/resolution. Used in a nested loop =>
  // quadratic time, acceptable because we know timer overhead is "low".
  // constexpr because this is used to define array bounds.
  static constexpr size_t kTimerSamples = 256;

  // Best-case precision, expressed as a divisor of the timer resolution.
  // Larger => more calls to Func and higher precision.
  size_t precision_divisor = 1024;

  // Ratio between full and subset input distribution sizes. Cannot be less
  // than 2; larger values increase measurement time but more faithfully
  // model the given input distribution.
  size_t subset_ratio = 2;

  // Together with the estimated Func duration, determines how many times to
  // call Func before checking the sample variability. Larger values increase
  // measurement time, memory/cache use and precision.
  double seconds_per_eval = 4E-3;

  // The minimum number of samples before estimating the central tendency.
  size_t min_samples_per_eval = 7;

  // The mode is better than median for estimating the central tendency of
  // skewed/fat-tailed distributions, but it requires sufficient samples
  // relative to the width of half-ranges.
  size_t min_mode_samples = 64;

  // Maximum permissible variability (= median absolute deviation / center).
  double target_rel_mad = 0.002;

  // Abort after this many evals without reaching target_rel_mad. This
  // prevents infinite loops.
  size_t max_evals = 9;

  // Whether to print additional statistics to stdout. Defaults to true.
  bool verbose = true;
};
// Measurement result for each unique input; Measure() writes one Result per
// unique FuncInput.
struct Result {
  // The input value this result corresponds to.
  FuncInput input;
  // Robust estimate (mode or median) of duration.
  float ticks;
  // Measure of variability (median absolute deviation relative to "ticks").
  float variability;
};
// Precisely measures the number of ticks elapsed when calling "func" with the
// given inputs, shuffled to ensure realistic branch prediction hit rates.
//
// "func" returns a 'proof of work' to ensure its computations are not elided.
// "arg" is passed to Func, or reserved for internal use by MeasureClosure.
// "inputs" is an array of "num_inputs" (not necessarily unique) arguments to
// "func". The values should be chosen to maximize coverage of "func". This
// represents a distribution, so a value's frequency should reflect its
// probability in the real application. Order does not matter; for example, a
// uniform distribution over [0, 4) could be represented as {3,0,2,1}.
// Returns how many Result were written to "results": one per unique input, or
// zero if the measurement failed (an error message goes to stderr).
size_t Measure(const Func func, const uint8_t* arg, const FuncInput* inputs,
const size_t num_inputs, Result* results,
const Params& p = Params());
// Calls operator() of the given closure (lambda function). Adapter that lets
// MeasureClosure route capturing lambdas through the C-style Func interface.
template <class Closure>
static FuncOutput CallClosure(const Closure* f, const FuncInput input) {
  return (*f)(input);
}
// Same as Measure, except "closure" is typically a lambda function of
// FuncInput -> FuncOutput with a capture list.
// NOTE(review): relies on reinterpret_cast between function pointer types
// (CallClosure<Closure>* -> Func) and casting &closure to const uint8_t*;
// the cast is undone inside CallClosure. Formally implementation-defined,
// but this is the library's intended dispatch mechanism.
template <class Closure>
static inline size_t MeasureClosure(const Closure& closure,
                                    const FuncInput* inputs,
                                    const size_t num_inputs, Result* results,
                                    const Params& p = Params()) {
  return Measure(reinterpret_cast<Func>(&CallClosure<Closure>),
                 reinterpret_cast<const uint8_t*>(&closure), inputs, num_inputs,
                 results, p);
}
} // namespace hwy
#endif // HIGHWAY_HWY_NANOBENCHMARK_H_

104
third_party/highway/hwy/nanobenchmark_test.cc поставляемый Normal file
Просмотреть файл

@ -0,0 +1,104 @@
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/nanobenchmark.h"
#include <stdio.h>
#include <stdlib.h> // strtol
#include <unistd.h> // sleep
#include <random>
namespace hwy {
namespace {
// Benchmarked function: integer division. Throughput is measured because
// benchmark invocations are independent. Any dividend will do; the divisor
// (the input) is nonzero for the inputs used by RunAll.
FuncOutput Div(const void*, FuncInput in) {
  const uint64_t dividend = 0xFFFFF;
  return dividend / in;
}
// Measures Div over the given inputs and prints per-input ticks and relative
// MAD (as a percentage) for manual inspection.
template <size_t N>
void MeasureDiv(const FuncInput (&inputs)[N]) {
  Result results[N];
  Params params;
  params.max_evals = 4;  // avoid test timeout
  const size_t num_results = Measure(&Div, nullptr, inputs, N, results, params);
  for (size_t i = 0; i < num_results; ++i) {
    printf("%5zu: %6.2f ticks; MAD=%4.2f%%\n", results[i].input,
           results[i].ticks, results[i].variability * 100.0);
  }
}
std::mt19937 rng;
// A function whose runtime depends on rng (the file-scope std::mt19937):
// performs 0..15 divisions, chosen randomly, so measured variability is high.
FuncOutput Random(const void* /*arg*/, FuncInput in) {
  const size_t r = rng() & 0xF;  // 0..15 iterations
  uint32_t ret = in;
  for (size_t i = 0; i < r; ++i) {
    ret /= ((rng() & 1) + 2);  // divide by 2 or 3
  }
  return ret;
}
// Ensure the measured variability is high: Random's runtime is intentionally
// noisy, so each result's relative MAD must exceed 0.1%.
template <size_t N>
void MeasureRandom(const FuncInput (&inputs)[N]) {
  Result results[N];
  Params p;
  p.max_evals = 4;  // avoid test timeout
  p.verbose = false;
  const size_t num_results = Measure(&Random, nullptr, inputs, N, results, p);
  for (size_t i = 0; i < num_results; ++i) {
    NANOBENCHMARK_CHECK(results[i].variability > 1E-3);
  }
}
// Verifies that Measure rejects functions that run too long for the 32-bit
// tick budget: the lambda sleeps 2 seconds per call, so Measure must return 0.
template <size_t N>
void EnsureLongMeasurementFails(const FuncInput (&inputs)[N]) {
  printf("Expect a 'measurement failed' below:\n");
  Result results[N];

  const size_t num_results = Measure(
      [](const void*, const FuncInput input) -> FuncOutput {
        // Loop until the sleep succeeds (not interrupted by signal). We assume
        // >= 512 MHz, so 2 seconds will exceed the 1 << 30 tick safety limit.
        while (sleep(2) != 0) {
        }
        return input;
      },
      nullptr, inputs, N, results);
  NANOBENCHMARK_CHECK(num_results == 0);
  (void)num_results;  // silence unused warning when checks are disabled
}
// Entry point for all nanobenchmark self-tests. Deriving the inputs from argc
// prevents the compiler from constant-folding them.
void RunAll(const int argc, char** /*argv*/) {
  // unpredictable == 1 but the compiler doesn't know that.
  const int unpredictable = argc != 999;
  // With unpredictable == 1 these evaluate to 3 and 10.
  static const FuncInput inputs[] = {static_cast<FuncInput>(unpredictable) + 2,
                                     static_cast<FuncInput>(unpredictable + 9)};

  MeasureDiv(inputs);
  MeasureRandom(inputs);
  EnsureLongMeasurementFails(inputs);
}
} // namespace
} // namespace hwy
// Runs the nanobenchmark self-tests; always reports success (the tests
// themselves abort via NANOBENCHMARK_CHECK on failure).
int main(int argc, char* argv[]) {
  hwy::RunAll(argc, argv);
  return 0;
}

4272
third_party/highway/hwy/ops/arm_neon-inl.h поставляемый Normal file

Разница между файлами не показана из-за своего большого размера Загрузить разницу

1762
third_party/highway/hwy/ops/rvv-inl.h поставляемый Normal file

Разница между файлами не показана из-за своего большого размера Загрузить разницу

1191
third_party/highway/hwy/ops/scalar-inl.h поставляемый Normal file

Разница между файлами не показана из-за своего большого размера Загрузить разницу

226
third_party/highway/hwy/ops/set_macros-inl.h поставляемый Normal file
Просмотреть файл

@ -0,0 +1,226 @@
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Sets macros based on HWY_TARGET.
// This include guard is toggled by foreach_target, so avoid the usual _H_
// suffix to prevent copybara from renaming it.
#if defined(HWY_SET_MACROS_PER_TARGET) == defined(HWY_TARGET_TOGGLE)
#ifdef HWY_SET_MACROS_PER_TARGET
#undef HWY_SET_MACROS_PER_TARGET
#else
#define HWY_SET_MACROS_PER_TARGET
#endif
#endif // HWY_SET_MACROS_PER_TARGET
#include "hwy/targets.h"
#undef HWY_NAMESPACE
#undef HWY_ALIGN
#undef HWY_LANES
#undef HWY_CAP_INTEGER64
#undef HWY_CAP_FLOAT64
#undef HWY_CAP_GE256
#undef HWY_CAP_GE512
#undef HWY_TARGET_STR
// Before include guard so we redefine HWY_TARGET_STR on each include,
// governed by the current HWY_TARGET.
//-----------------------------------------------------------------------------
// SSE4
#if HWY_TARGET == HWY_SSE4
#define HWY_NAMESPACE N_SSE4
#define HWY_ALIGN alignas(16)
#define HWY_LANES(T) (16 / sizeof(T))
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
#define HWY_TARGET_STR "sse2,ssse3,sse4.1"
//-----------------------------------------------------------------------------
// AVX2
#elif HWY_TARGET == HWY_AVX2
#define HWY_NAMESPACE N_AVX2
#define HWY_ALIGN alignas(32)
#define HWY_LANES(T) (32 / sizeof(T))
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 1
#define HWY_CAP_GE512 0
#if defined(HWY_DISABLE_BMI2_FMA)
#define HWY_TARGET_STR "avx,avx2,f16c"
#else
#define HWY_TARGET_STR "avx,avx2,bmi,bmi2,fma,f16c"
#endif
//-----------------------------------------------------------------------------
// AVX3
#elif HWY_TARGET == HWY_AVX3
#define HWY_ALIGN alignas(64)
#define HWY_LANES(T) (64 / sizeof(T))
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 1
#define HWY_CAP_GE512 1
#define HWY_NAMESPACE N_AVX3
// Must include AVX2 because an AVX3 test may call AVX2 functions (e.g. when
// converting to half-vectors). HWY_DISABLE_BMI2_FMA is not relevant because if
// we have AVX3, we should also have BMI2/FMA.
#define HWY_TARGET_STR \
"avx,avx2,bmi,bmi2,fma,f16c,avx512f,avx512vl,avx512dq,avx512bw"
//-----------------------------------------------------------------------------
// PPC8
#elif HWY_TARGET == HWY_PPC8
#define HWY_ALIGN alignas(16)
#define HWY_LANES(T) (16 / sizeof(T))
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
#define HWY_NAMESPACE N_PPC8
#define HWY_TARGET_STR "altivec,vsx"
//-----------------------------------------------------------------------------
// NEON
#elif HWY_TARGET == HWY_NEON
#define HWY_ALIGN alignas(16)
#define HWY_LANES(T) (16 / sizeof(T))
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
#if HWY_ARCH_ARM_A64
#define HWY_CAP_FLOAT64 1
#else
#define HWY_CAP_FLOAT64 0
#endif
#define HWY_NAMESPACE N_NEON
// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
//-----------------------------------------------------------------------------
// WASM
#elif HWY_TARGET == HWY_WASM
#define HWY_ALIGN alignas(16)
#define HWY_LANES(T) (16 / sizeof(T))
#define HWY_CAP_INTEGER64 0
#define HWY_CAP_FLOAT64 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
#define HWY_NAMESPACE N_WASM
#define HWY_TARGET_STR "simd128"
//-----------------------------------------------------------------------------
// RVV
#elif HWY_TARGET == HWY_RVV
// RVV only requires lane alignment, not natural alignment of the entire vector,
// and the compiler already aligns builtin types, so nothing to do here.
#define HWY_ALIGN
// Arbitrary constant, not the actual lane count! Large enough that we can
// mul/div by 8 for LMUL. Value matches kMaxVectorSize, see base.h.
#define HWY_LANES(T) (4096 / sizeof(T))
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
#define HWY_NAMESPACE N_RVV
// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
// (rv64gcv is not a valid target)
//-----------------------------------------------------------------------------
// SCALAR
#elif HWY_TARGET == HWY_SCALAR
#define HWY_ALIGN
#define HWY_LANES(T) 1
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
#define HWY_NAMESPACE N_SCALAR
// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
#else
#pragma message("HWY_TARGET does not match any known target")
#endif // HWY_TARGET
// Clang <9 requires this be invoked at file scope, before any namespace.
#undef HWY_BEFORE_NAMESPACE
#if defined(HWY_TARGET_STR)
#define HWY_BEFORE_NAMESPACE() \
HWY_PUSH_ATTRIBUTES(HWY_TARGET_STR) \
static_assert(true, "For requiring trailing semicolon")
#else
// avoids compiler warning if no HWY_TARGET_STR
#define HWY_BEFORE_NAMESPACE() \
static_assert(true, "For requiring trailing semicolon")
#endif
// Clang <9 requires any namespaces be closed before this macro.
#undef HWY_AFTER_NAMESPACE
#if defined(HWY_TARGET_STR)
#define HWY_AFTER_NAMESPACE() \
HWY_POP_ATTRIBUTES \
static_assert(true, "For requiring trailing semicolon")
#else
// avoids compiler warning if no HWY_TARGET_STR
#define HWY_AFTER_NAMESPACE() \
static_assert(true, "For requiring trailing semicolon")
#endif
#undef HWY_ATTR
#if defined(HWY_TARGET_STR) && HWY_HAS_ATTRIBUTE(target)
#define HWY_ATTR __attribute__((target(HWY_TARGET_STR)))
#else
#define HWY_ATTR
#endif
// DEPRECATED
#undef HWY_GATHER_LANES
#define HWY_GATHER_LANES(T) HWY_LANES(T)

125
third_party/highway/hwy/ops/shared-inl.h поставляемый Normal file
Просмотреть файл

@ -0,0 +1,125 @@
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Per-target definitions shared by ops/*.h and user code.
#include <cmath>
// Separate header because foreach_target.h re-enables its include guard.
#include "hwy/ops/set_macros-inl.h"
// Relies on the external include guard in highway.h.
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// SIMD operations are implemented as overloaded functions selected using a
// "descriptor" D := Simd<T, N>. T is the lane type, N a number of lanes >= 1
// (always a power of two). Users generally do not choose N directly, but
// instead use HWY_FULL(T[, LMUL]) (the largest available size). N is not
// necessarily the actual number of lanes, which is returned by Lanes(D()).
//
// Only HWY_FULL(T) and N <= 16 / sizeof(T) are guaranteed to be available - the
// latter are useful if >128 bit vectors are unnecessary or undesirable.
// Zero-sized tag type describing a vector: lane type and (power-of-two)
// lane count. Passed by value to select overloads of SIMD ops.
template <typename Lane, size_t N>
struct Simd {
  constexpr Simd() = default;
  using T = Lane;  // lane type
  static_assert((N & (N - 1)) == 0 && N != 0, "N must be a power of two");

  // Widening/narrowing ops change the number of lanes and/or their type.
  // To initialize such vectors, we need the corresponding descriptor types:

  // PromoteTo/DemoteTo() with another lane type, but same number of lanes.
  template <typename NewLane>
  using Rebind = Simd<NewLane, N>;

  // MulEven() with another lane type, but same total size.
  // Round up to correctly handle scalars with N=1.
  template <typename NewLane>
  using Repartition =
      Simd<NewLane, (N * sizeof(Lane) + sizeof(NewLane) - 1) / sizeof(NewLane)>;

  // LowerHalf() with the same lane type, but half the lanes.
  // Round up to correctly handle scalars with N=1.
  using Half = Simd<T, (N + 1) / 2>;

  // Combine() with the same lane type, but twice the lanes.
  using Twice = Simd<T, 2 * N>;
};
template <class D>
using TFromD = typename D::T;
// Descriptor for the same number of lanes as D, but with the LaneType T.
template <class T, class D>
using Rebind = typename D::template Rebind<T>;
template <class D>
using RebindToSigned = Rebind<MakeSigned<TFromD<D>>, D>;
template <class D>
using RebindToUnsigned = Rebind<MakeUnsigned<TFromD<D>>, D>;
template <class D>
using RebindToFloat = Rebind<MakeFloat<TFromD<D>>, D>;
// Descriptor for the same total size as D, but with the LaneType T.
template <class T, class D>
using Repartition = typename D::template Repartition<T>;
template <class D>
using RepartitionToWide = Repartition<MakeWide<TFromD<D>>, D>;
template <class D>
using RepartitionToNarrow = Repartition<MakeNarrow<TFromD<D>>, D>;
// Descriptor for the same lane type as D, but half the lanes.
template <class D>
using Half = typename D::Half;
// Descriptor for the same lane type as D, but twice the lanes.
template <class D>
using Twice = typename D::Twice;
// Same as base.h macros but with a Simd<T, N> argument instead of T.
#define HWY_IF_UNSIGNED_D(D) HWY_IF_UNSIGNED(TFromD<D>)
#define HWY_IF_SIGNED_D(D) HWY_IF_SIGNED(TFromD<D>)
#define HWY_IF_FLOAT_D(D) HWY_IF_FLOAT(TFromD<D>)
#define HWY_IF_NOT_FLOAT_D(D) HWY_IF_NOT_FLOAT(TFromD<D>)
#define HWY_IF_LANE_SIZE_D(D, bytes) HWY_IF_LANE_SIZE(TFromD<D>, bytes)
#define HWY_IF_NOT_LANE_SIZE_D(D, bytes) HWY_IF_NOT_LANE_SIZE(TFromD<D>, bytes)
// Compile-time-constant, (typically but not guaranteed) an upper bound on the
// number of lanes.
// Prefer instead using Lanes() and dynamic allocation, or Rebind, or
// `#if HWY_CAP_GE*`.
// N is encoded in the descriptor type, hence constexpr.
template <typename T, size_t N>
HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(Simd<T, N>) {
  return N;
}
// Targets with non-constexpr Lanes define this themselves.
#if HWY_TARGET != HWY_RVV
// (Potentially) non-constant actual size of the vector at runtime, subject to
// the limit imposed by the Simd. Useful for advancing loop counters.
// For these targets the lane count is fixed at compile time, so it simply
// returns N; RVV provides its own (runtime) definition.
template <typename T, size_t N>
HWY_INLINE HWY_MAYBE_UNUSED size_t Lanes(Simd<T, N>) {
  return N;
}
#endif
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();

3008
third_party/highway/hwy/ops/wasm_128-inl.h поставляемый Normal file

Разница между файлами не показана из-за своего большого размера Загрузить разницу

3694
third_party/highway/hwy/ops/x86_128-inl.h поставляемый Normal file

Разница между файлами не показана из-за своего большого размера Загрузить разницу

2907
third_party/highway/hwy/ops/x86_256-inl.h поставляемый Normal file

Разница между файлами не показана из-за своего большого размера Загрузить разницу

3050
third_party/highway/hwy/ops/x86_512-inl.h поставляемый Normal file

Разница между файлами не показана из-за своего большого размера Загрузить разницу

286
third_party/highway/hwy/targets.cc поставляемый Normal file
Просмотреть файл

@ -0,0 +1,286 @@
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/targets.h"
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <atomic>
#include <limits>
#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
defined(THREAD_SANITIZER)
#include "sanitizer/common_interface_defs.h" // __sanitizer_print_stack_trace
#endif // defined(*_SANITIZER)
#if HWY_ARCH_X86
#include <xmmintrin.h>
#if HWY_COMPILER_MSVC
#include <intrin.h>
#else // HWY_COMPILER_MSVC
#include <cpuid.h>
#endif // HWY_COMPILER_MSVC
#endif
namespace hwy {
namespace {
#if HWY_ARCH_X86
// True iff bit `index` (0-based from the LSB) of `reg` is set.
bool IsBitSet(const uint32_t reg, const int index) {
  const uint32_t mask = 1U << index;
  return (reg & mask) != 0;
}
// Calls CPUID instruction with eax=level and ecx=count and returns the result
// in abcd array where abcd = {eax, ebx, ecx, edx} (hence the name abcd).
void Cpuid(const uint32_t level, const uint32_t count,
           uint32_t* HWY_RESTRICT abcd) {
#if HWY_COMPILER_MSVC
  // MSVC: use the __cpuidex intrinsic, then copy its int[4] into abcd.
  int regs[4];
  __cpuidex(regs, level, count);
  for (int i = 0; i < 4; ++i) {
    abcd[i] = regs[i];
  }
#else   // HWY_COMPILER_MSVC
  // GCC/Clang: __cpuid_count from <cpuid.h> writes the four registers.
  uint32_t a;
  uint32_t b;
  uint32_t c;
  uint32_t d;
  __cpuid_count(level, count, a, b, c, d);
  abcd[0] = a;
  abcd[1] = b;
  abcd[2] = c;
  abcd[3] = d;
#endif  // HWY_COMPILER_MSVC
}
// Returns the lower 32 bits of extended control register 0.
// Requires CPU support for "OSXSAVE" (see below).
uint32_t ReadXCR0() {
#if HWY_COMPILER_MSVC
  return static_cast<uint32_t>(_xgetbv(0));
#else   // HWY_COMPILER_MSVC
  uint32_t xcr0, xcr0_high;
  const uint32_t index = 0;
  // Raw opcode bytes 0F 01 D0 encode XGETBV (avoids requiring compiler/
  // assembler support for the mnemonic). EDX:EAX receive XCR[ECX].
  asm volatile(".byte 0x0F, 0x01, 0xD0"
               : "=a"(xcr0), "=d"(xcr0_high)
               : "c"(index));
  // xcr0_high (upper 32 bits) is intentionally discarded.
  return xcr0;
#endif  // HWY_COMPILER_MSVC
}
#endif // HWY_ARCH_X86
// Not function-local => no compiler-generated locking.
std::atomic<uint32_t> supported_{0}; // Not yet initialized
// When running tests, this value can be set to the mocked supported targets
// mask. Only written to from a single thread before the test starts.
uint32_t supported_targets_for_test_ = 0;
// Mask of targets disabled at runtime with DisableTargets.
uint32_t supported_mask_{std::numeric_limits<uint32_t>::max()};
#if HWY_ARCH_X86
// Bits indicating which instruction set extensions are supported.
constexpr uint32_t kSSE = 1 << 0;
constexpr uint32_t kSSE2 = 1 << 1;
constexpr uint32_t kSSE3 = 1 << 2;
constexpr uint32_t kSSSE3 = 1 << 3;
constexpr uint32_t kSSE41 = 1 << 4;
constexpr uint32_t kSSE42 = 1 << 5;
constexpr uint32_t kGroupSSE4 = kSSE | kSSE2 | kSSE3 | kSSSE3 | kSSE41 | kSSE42;
constexpr uint32_t kAVX = 1u << 6;
constexpr uint32_t kAVX2 = 1u << 7;
constexpr uint32_t kFMA = 1u << 8;
constexpr uint32_t kLZCNT = 1u << 9;
constexpr uint32_t kBMI = 1u << 10;
constexpr uint32_t kBMI2 = 1u << 11;
// We normally assume BMI/BMI2/FMA are available if AVX2 is. This allows us to
// use BZHI and (compiler-generated) MULX. However, VirtualBox lacks them
// [https://www.virtualbox.org/ticket/15471]. Thus we provide the option of
// avoiding using and requiring these so AVX2 can still be used.
#ifdef HWY_DISABLE_BMI2_FMA
constexpr uint32_t kGroupAVX2 = kAVX | kAVX2 | kLZCNT;
#else
constexpr uint32_t kGroupAVX2 = kAVX | kAVX2 | kFMA | kLZCNT | kBMI | kBMI2;
#endif
constexpr uint32_t kAVX512F = 1u << 12;
constexpr uint32_t kAVX512VL = 1u << 13;
constexpr uint32_t kAVX512DQ = 1u << 14;
constexpr uint32_t kAVX512BW = 1u << 15;
constexpr uint32_t kGroupAVX3 = kAVX512F | kAVX512VL | kAVX512DQ | kAVX512BW;
#endif
} // namespace
// Prints a printf-style message with file/line to stderr, then terminates.
// Messages longer than the 2000-byte buffer are truncated by vsnprintf.
HWY_NORETURN void HWY_FORMAT(3, 4)
    Abort(const char* file, int line, const char* format, ...) {
  char buf[2000];
  va_list args;
  va_start(args, format);
  vsnprintf(buf, sizeof(buf), format, args);
  va_end(args);

  fprintf(stderr, "Abort at %s:%d: %s\n", file, line, buf);

#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
    defined(THREAD_SANITIZER)
  // If compiled with any sanitizer print a stack trace. This call doesn't crash
  // the program, instead the trap below will crash it also allowing gdb to
  // break there.
  __sanitizer_print_stack_trace();
#endif  // defined(*_SANITIZER)
  fflush(stderr);

#if HWY_COMPILER_MSVC
  abort();  // Compile error without this due to HWY_NORETURN.
#else
  __builtin_trap();
#endif
}
// Runtime-disables the given target bits. Baseline targets (HWY_ENABLED_BASELINE)
// are masked out of disabled_targets first, so they can never be disabled.
void DisableTargets(uint32_t disabled_targets) {
  supported_mask_ = ~(disabled_targets & ~uint32_t(HWY_ENABLED_BASELINE));
  // We can call Update() here to initialize the mask but that will trigger a
  // call to SupportedTargets() which we use in tests to tell whether any of the
  // highway dynamic dispatch functions were used.
  chosen_target.DeInit();
}
// Test hook: forces SupportedTargets() to report the given mask (pass 0 to
// restore real CPU detection). Must be called from a single thread before use.
void SetSupportedTargetsForTest(uint32_t targets) {
  // Reset the cached supported_ value to 0 to force a re-evaluation in the
  // next call to SupportedTargets() which will use the mocked value set here
  // if not zero.
  supported_.store(0, std::memory_order_release);
  supported_targets_for_test_ = targets;
  chosen_target.DeInit();
}
// Returns whether SupportedTargets() has populated the cache (i.e. was called)
// since the last SetSupportedTargetsForTest() reset it to zero.
bool SupportedTargetsCalledForTest() {
  const uint32_t cached = supported_.load(std::memory_order_acquire);
  return cached != 0;
}
// Returns the cached bitfield of enabled targets supported by this CPU,
// intersected with supported_mask_ (see DisableTargets). Computed once on
// first call: from CPUID/XCR0 on x86, otherwise assumed to be the compile-time
// baseline. A nonzero SetSupportedTargetsForTest() mock takes precedence.
uint32_t SupportedTargets() {
  uint32_t bits = supported_.load(std::memory_order_acquire);
  // Already initialized?
  if (HWY_LIKELY(bits != 0)) {
    return bits & supported_mask_;
  }
  // When running tests, this allows to mock the current supported targets.
  if (HWY_UNLIKELY(supported_targets_for_test_ != 0)) {
    // Store the value to signal that this was used.
    supported_.store(supported_targets_for_test_, std::memory_order_release);
    return supported_targets_for_test_ & supported_mask_;
  }
  // Scalar is unconditionally supported.
  bits = HWY_SCALAR;
#if HWY_ARCH_X86
  uint32_t flags = 0;
  uint32_t abcd[4];
  // Leaf 0: EAX holds the highest supported standard CPUID leaf.
  Cpuid(0, 0, abcd);
  const uint32_t max_level = abcd[0];
  // Standard feature flags (leaf 1: EDX in abcd[3], ECX in abcd[2]).
  Cpuid(1, 0, abcd);
  flags |= IsBitSet(abcd[3], 25) ? kSSE : 0;
  flags |= IsBitSet(abcd[3], 26) ? kSSE2 : 0;
  flags |= IsBitSet(abcd[2], 0) ? kSSE3 : 0;
  flags |= IsBitSet(abcd[2], 9) ? kSSSE3 : 0;
  flags |= IsBitSet(abcd[2], 19) ? kSSE41 : 0;
  flags |= IsBitSet(abcd[2], 20) ? kSSE42 : 0;
  flags |= IsBitSet(abcd[2], 12) ? kFMA : 0;
  flags |= IsBitSet(abcd[2], 28) ? kAVX : 0;
  const bool has_osxsave = IsBitSet(abcd[2], 27);
  // Extended feature flags
  Cpuid(0x80000001U, 0, abcd);
  flags |= IsBitSet(abcd[2], 5) ? kLZCNT : 0;
  // Extended features (leaf 7: EBX in abcd[1]); only valid if max_level >= 7.
  if (max_level >= 7) {
    Cpuid(7, 0, abcd);
    flags |= IsBitSet(abcd[1], 3) ? kBMI : 0;
    flags |= IsBitSet(abcd[1], 5) ? kAVX2 : 0;
    flags |= IsBitSet(abcd[1], 8) ? kBMI2 : 0;
    flags |= IsBitSet(abcd[1], 16) ? kAVX512F : 0;
    flags |= IsBitSet(abcd[1], 17) ? kAVX512DQ : 0;
    flags |= IsBitSet(abcd[1], 30) ? kAVX512BW : 0;
    flags |= IsBitSet(abcd[1], 31) ? kAVX512VL : 0;
  }
  // Verify OS support for XSAVE, without which XMM/YMM registers are not
  // preserved across context switches and are not safe to use.
  if (has_osxsave) {
    const uint32_t xcr0 = ReadXCR0();
    // XMM
    if (!IsBitSet(xcr0, 1)) {
      flags = 0;
    }
    // YMM
    if (!IsBitSet(xcr0, 2)) {
      flags &= ~kGroupAVX2;
    }
    // ZMM + opmask
    if ((xcr0 & 0x70) != 0x70) {
      flags &= ~kGroupAVX3;
    }
  }
  // Set target bit(s) if all their group's flags are all set.
  if ((flags & kGroupAVX3) == kGroupAVX3) {
    bits |= HWY_AVX3;
  }
  if ((flags & kGroupAVX2) == kGroupAVX2) {
    bits |= HWY_AVX2;
  }
  if ((flags & kGroupSSE4) == kGroupSSE4) {
    bits |= HWY_SSE4;
  }
#else
  // TODO(janwas): detect for other platforms
  bits = HWY_ENABLED_BASELINE;
#endif  // HWY_ARCH_X86
  // Warn when the CPU lacks features the static baseline requires; running
  // baseline code on such a CPU would fault.
  if ((bits & HWY_ENABLED_BASELINE) != HWY_ENABLED_BASELINE) {
    fprintf(stderr, "WARNING: CPU supports %zx but software requires %x\n",
            size_t(bits), HWY_ENABLED_BASELINE);
  }
  // Publish the result; subsequent calls take the fast path above.
  supported_.store(bits, std::memory_order_release);
  return bits & supported_mask_;
}
// Declared in targets.h
ChosenTarget chosen_target;
// Recomputes mask_ from the targets the current CPU supports.
void ChosenTarget::Update() {
  // Shift the SupportedTargets() bits into the ChosenTarget mask arrangement.
  // SCALAR is always enabled regardless of whether it was compiled, since it
  // doubles as the fallback mechanism to the baseline target.
  const uint32_t chosen_mask =
      HWY_CHOSEN_TARGET_SHIFT(hwy::SupportedTargets()) |
      HWY_CHOSEN_TARGET_MASK_SCALAR;
  mask_.store(chosen_mask);
}
} // namespace hwy

491
third_party/highway/hwy/targets.h поставляемый Normal file
Просмотреть файл

@ -0,0 +1,491 @@
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef HIGHWAY_HWY_TARGETS_H_
#define HIGHWAY_HWY_TARGETS_H_
#include <vector>
// For SIMD module implementations and their callers. Defines which targets to
// generate and call.
#include "hwy/base.h"
//------------------------------------------------------------------------------
// Optional configuration
// See ../quick_reference.md for documentation of these macros.
// Uncomment to override the default baseline determined from predefined macros:
// #define HWY_BASELINE_TARGETS (HWY_SSE4 | HWY_SCALAR)
// Uncomment to override the default blocklist:
// #define HWY_BROKEN_TARGETS HWY_AVX3
// Uncomment to definitely avoid generating those target(s):
// #define HWY_DISABLED_TARGETS HWY_SSE4
// Uncomment to avoid emitting BMI/BMI2/FMA instructions (allows generating
// AVX2 target for VMs which support AVX2 but not the other instruction sets)
// #define HWY_DISABLE_BMI2_FMA
//------------------------------------------------------------------------------
// Targets
// Unique bit value for each target. A lower value is "better" (e.g. more lanes)
// than a higher value within the same group/platform - see HWY_STATIC_TARGET.
//
// All values are unconditionally defined so we can test HWY_TARGETS without
// first checking the HWY_ARCH_*.
//
// The C99 preprocessor evaluates #if expressions using intmax_t types, so we
// can use 32-bit literals.
// 1,2,4: reserved
#define HWY_AVX3 8
#define HWY_AVX2 16
// 32: reserved for AVX
#define HWY_SSE4 64
// 0x80, 0x100, 0x200: reserved for SSSE3, SSE3, SSE2
// The highest bit in the HWY_TARGETS mask that a x86 target can have. Used for
// dynamic dispatch. All x86 target bits must be lower or equal to
// (1 << HWY_HIGHEST_TARGET_BIT_X86) and they can only use
// HWY_MAX_DYNAMIC_TARGETS in total.
#define HWY_HIGHEST_TARGET_BIT_X86 9
// 0x400, 0x800, 0x1000 reserved for SVE, SVE2, Helium
#define HWY_NEON 0x2000
#define HWY_HIGHEST_TARGET_BIT_ARM 13
// 0x4000, 0x8000 reserved
#define HWY_PPC8 0x10000 // v2.07 or 3
// 0x20000, 0x40000 reserved for prior VSX/AltiVec
#define HWY_HIGHEST_TARGET_BIT_PPC 18
// 0x80000 reserved
#define HWY_WASM 0x100000
#define HWY_HIGHEST_TARGET_BIT_WASM 20
// 0x200000, 0x400000, 0x800000 reserved
#define HWY_RVV 0x1000000
#define HWY_HIGHEST_TARGET_BIT_RVV 24
// 0x2000000, 0x4000000, 0x8000000, 0x10000000 reserved
#define HWY_SCALAR 0x20000000
// Cannot use higher values, otherwise HWY_TARGETS computation might overflow.
//------------------------------------------------------------------------------
// Set default blocklists
// Disabled means excluded from enabled at user's request. A separate config
// macro allows disabling without deactivating the blocklist below.
#ifndef HWY_DISABLED_TARGETS
#define HWY_DISABLED_TARGETS 0
#endif
// Broken means excluded from enabled due to known compiler issues. Allow the
// user to override this blocklist without any guarantee of success.
#ifndef HWY_BROKEN_TARGETS
// x86 clang-6: we saw multiple AVX2/3 compile errors and in one case invalid
// SSE4 codegen (msan failure), so disable all those targets.
#if HWY_ARCH_X86 && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
// TODO: Disable all non-scalar targets for every build target once we have
// clang-7 enabled in our builders.
#ifdef MEMORY_SANITIZER
#define HWY_BROKEN_TARGETS (HWY_SSE4 | HWY_AVX2 | HWY_AVX3)
#else
#define HWY_BROKEN_TARGETS 0
#endif
// This entails a major speed reduction, so warn unless the user explicitly
// opts in to scalar-only.
#if !defined(HWY_COMPILE_ONLY_SCALAR)
#pragma message("x86 Clang <= 6: define HWY_COMPILE_ONLY_SCALAR or upgrade.")
#endif
// MSVC, or 32-bit may fail to compile AVX2/3.
#elif HWY_COMPILER_MSVC != 0 || HWY_ARCH_X86_32
#define HWY_BROKEN_TARGETS (HWY_AVX2 | HWY_AVX3)
#pragma message("Disabling AVX2/3 due to known issues with MSVC/32-bit builds")
#else
#define HWY_BROKEN_TARGETS 0
#endif
#endif // HWY_BROKEN_TARGETS
// Enabled means not disabled nor blocklisted.
#define HWY_ENABLED(targets) \
((targets) & ~((HWY_DISABLED_TARGETS) | (HWY_BROKEN_TARGETS)))
//------------------------------------------------------------------------------
// Detect baseline targets using predefined macros
// Baseline means the targets for which the compiler is allowed to generate
// instructions, implying the target CPU would have to support them. Do not use
// this directly because it does not take the blocklist into account. Allow the
// user to override this without any guarantee of success.
#ifndef HWY_BASELINE_TARGETS
#ifdef __wasm_simd128__
#define HWY_BASELINE_WASM HWY_WASM
#else
#define HWY_BASELINE_WASM 0
#endif
#ifdef __VSX__
#define HWY_BASELINE_PPC8 HWY_PPC8
#else
#define HWY_BASELINE_PPC8 0
#endif
// GCC 4.5.4 only defines the former; 5.4 defines both.
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#define HWY_BASELINE_NEON HWY_NEON
#else
#define HWY_BASELINE_NEON 0
#endif
#ifdef __SSE4_1__
#define HWY_BASELINE_SSE4 HWY_SSE4
#else
#define HWY_BASELINE_SSE4 0
#endif
#ifdef __AVX2__
#define HWY_BASELINE_AVX2 HWY_AVX2
#else
#define HWY_BASELINE_AVX2 0
#endif
#ifdef __AVX512F__
#define HWY_BASELINE_AVX3 HWY_AVX3
#else
#define HWY_BASELINE_AVX3 0
#endif
#ifdef __riscv_vector
#define HWY_BASELINE_RVV HWY_RVV
#else
#define HWY_BASELINE_RVV 0
#endif
#define HWY_BASELINE_TARGETS \
(HWY_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 | HWY_BASELINE_NEON | \
HWY_BASELINE_SSE4 | HWY_BASELINE_AVX2 | HWY_BASELINE_AVX3 | \
HWY_BASELINE_RVV)
#endif // HWY_BASELINE_TARGETS
//------------------------------------------------------------------------------
// Choose target for static dispatch
#define HWY_ENABLED_BASELINE HWY_ENABLED(HWY_BASELINE_TARGETS)
#if HWY_ENABLED_BASELINE == 0
#error "At least one baseline target must be defined and enabled"
#endif
// Best baseline, used for static dispatch. This is the least-significant 1-bit
// within HWY_ENABLED_BASELINE and lower bit values imply "better".
#define HWY_STATIC_TARGET (HWY_ENABLED_BASELINE & -HWY_ENABLED_BASELINE)
// Start by assuming static dispatch. If we later use dynamic dispatch, this
// will be defined to other targets during the multiple-inclusion, and finally
// return to the initial value. Defining this outside begin/end_target ensures
// inl headers successfully compile by themselves (required by Bazel).
#define HWY_TARGET HWY_STATIC_TARGET
//------------------------------------------------------------------------------
// Choose targets for dynamic dispatch according to one of four policies
#if (defined(HWY_COMPILE_ONLY_SCALAR) + defined(HWY_COMPILE_ONLY_STATIC) + \
defined(HWY_COMPILE_ALL_ATTAINABLE)) > 1
#error "Invalid config: can only define a single policy for targets"
#endif
// Attainable means enabled and the compiler allows intrinsics (even when not
// allowed to autovectorize). Used in 3 and 4.
#if HWY_ARCH_X86
#define HWY_ATTAINABLE_TARGETS \
HWY_ENABLED(HWY_SCALAR | HWY_SSE4 | HWY_AVX2 | HWY_AVX3)
#else
#define HWY_ATTAINABLE_TARGETS HWY_ENABLED_BASELINE
#endif
// 1) For older compilers: disable all SIMD (could also set HWY_DISABLED_TARGETS
// to ~HWY_SCALAR, but this is more explicit).
#if defined(HWY_COMPILE_ONLY_SCALAR)
#undef HWY_STATIC_TARGET
#define HWY_STATIC_TARGET HWY_SCALAR // override baseline
#define HWY_TARGETS HWY_SCALAR
// 2) For forcing static dispatch without code changes (removing HWY_EXPORT)
#elif defined(HWY_COMPILE_ONLY_STATIC)
#define HWY_TARGETS HWY_STATIC_TARGET
// 3) For tests: include all attainable targets (in particular: scalar)
#elif defined(HWY_COMPILE_ALL_ATTAINABLE)
#define HWY_TARGETS HWY_ATTAINABLE_TARGETS
// 4) Default: attainable WITHOUT non-best baseline. This reduces code size by
// excluding superseded targets, in particular scalar.
#else
#define HWY_TARGETS (HWY_ATTAINABLE_TARGETS & (2 * HWY_STATIC_TARGET - 1))
#endif // target policy
// HWY_ONCE and the multiple-inclusion mechanism rely on HWY_STATIC_TARGET being
// one of the dynamic targets. This also implies HWY_TARGETS != 0 and
// (HWY_TARGETS & HWY_ENABLED_BASELINE) != 0.
#if (HWY_TARGETS & HWY_STATIC_TARGET) == 0
#error "Logic error: best baseline should be included in dynamic targets"
#endif
//------------------------------------------------------------------------------
namespace hwy {
// Returns (cached) bitfield of enabled targets that are supported on this CPU.
// Implemented in supported_targets.cc; unconditionally compiled to support the
// use case of binary-only distributions. The HWY_SUPPORTED_TARGETS wrapper may
// allow eliding calls to this function.
uint32_t SupportedTargets();
// Disable from runtime dispatch the mask of compiled in targets. Targets that
// were not enabled at compile time are ignored. This function is useful to
// disable a target supported by the CPU that is known to have bugs or when a
// lower target is desired. For this reason, attempts to disable targets which
// are in HWY_ENABLED_BASELINE have no effect so SupportedTargets() always
// returns at least the baseline target.
void DisableTargets(uint32_t disabled_targets);
// Single target: reduce code size by eliding the call and conditional branches
// inside Choose*() functions.
#if (HWY_TARGETS & (HWY_TARGETS - 1)) == 0
#define HWY_SUPPORTED_TARGETS HWY_TARGETS
#else
#define HWY_SUPPORTED_TARGETS hwy::SupportedTargets()
#endif
// Set the mock mask of CPU supported targets instead of the actual CPU
// supported targets computed in SupportedTargets(). The return value of
// SupportedTargets() will still be affected by the DisabledTargets() mask
// regardless of this mock, to prevent accidentally adding targets that are
// known to be buggy in the current CPU. Call with a mask of 0 to disable the
// mock and use the actual CPU supported targets instead.
void SetSupportedTargetsForTest(uint32_t targets);
// Returns whether the SupportedTargets() function was called since the last
// SetSupportedTargetsForTest() call.
bool SupportedTargetsCalledForTest();
// Return the list of targets in HWY_TARGETS supported by the CPU as a list of
// individual HWY_* target macros such as HWY_SCALAR or HWY_NEON. This list
// is affected by the current SetSupportedTargetsForTest() mock if any.
// Returns the targets in HWY_TARGETS supported by the CPU as a list of
// individual HWY_* target bits, best (lowest bit value) first. Affected by the
// SetSupportedTargetsForTest() mock if one is active.
HWY_INLINE std::vector<uint32_t> SupportedAndGeneratedTargets() {
  std::vector<uint32_t> ret;
  uint32_t remaining = SupportedTargets() & HWY_TARGETS;
  while (remaining != 0) {
    const uint32_t lowest_bit = remaining & (0u - remaining);  // isolate LSB
    ret.push_back(lowest_bit);
    remaining &= (remaining - 1);  // clear LSB
  }
  return ret;
}
// Returns a human-readable name for a single HWY_* target bit, or "?" for
// values that are unknown or not part of the current architecture.
static inline HWY_MAYBE_UNUSED const char* TargetName(uint32_t target) {
#if HWY_ARCH_X86
  if (target == HWY_SSE4) return "SSE4";
  if (target == HWY_AVX2) return "AVX2";
  if (target == HWY_AVX3) return "AVX3";
#endif
#if HWY_ARCH_ARM
  if (target == HWY_NEON) return "Neon";
#endif
#if HWY_ARCH_PPC
  if (target == HWY_PPC8) return "Power8";
#endif
#if HWY_ARCH_WASM
  if (target == HWY_WASM) return "Wasm";
#endif
#if HWY_ARCH_RVV
  if (target == HWY_RVV) return "RVV";
#endif
  if (target == HWY_SCALAR) return "Scalar";
  return "?";
}
// The maximum number of dynamic targets on any architecture is defined by
// HWY_MAX_DYNAMIC_TARGETS and depends on the arch.
// For the ChosenTarget mask and index we use a different bit arrangement than
// in the HWY_TARGETS mask. Only the targets involved in the current
// architecture are used in this mask, and therefore only the least significant
// (HWY_MAX_DYNAMIC_TARGETS + 2) bits of the uint32_t mask are used. The least
// significant bit is set when the mask is not initialized, the next
// HWY_MAX_DYNAMIC_TARGETS more significant bits are a range of bits from the
// HWY_TARGETS or SupportedTargets() mask for the given architecture shifted to
// that position and the next more significant bit is used for the scalar
// target. Because of this we need to define equivalent values for HWY_TARGETS
// in this representation.
// This mask representation allows to use ctz() on this mask and obtain a small
// number that's used as an index of the table for dynamic dispatch. In this
// way the first entry is used when the mask is uninitialized, the following
// HWY_MAX_DYNAMIC_TARGETS are for dynamic dispatch and the last one is for
// scalar.
// The HWY_SCALAR bit in the ChosenTarget mask format.
#define HWY_CHOSEN_TARGET_MASK_SCALAR (1u << (HWY_MAX_DYNAMIC_TARGETS + 1))
// Converts from a HWY_TARGETS mask to a ChosenTarget mask format for the
// current architecture.
#define HWY_CHOSEN_TARGET_SHIFT(X) \
((((X) >> (HWY_HIGHEST_TARGET_BIT + 1 - HWY_MAX_DYNAMIC_TARGETS)) & \
((1u << HWY_MAX_DYNAMIC_TARGETS) - 1)) \
<< 1)
// The HWY_TARGETS mask in the ChosenTarget mask format.
#define HWY_CHOSEN_TARGET_MASK_TARGETS \
(HWY_CHOSEN_TARGET_SHIFT(HWY_TARGETS) | HWY_CHOSEN_TARGET_MASK_SCALAR | 1u)
#if HWY_ARCH_X86
// Maximum number of dynamic targets, changing this value is an ABI incompatible
// change
#define HWY_MAX_DYNAMIC_TARGETS 10
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_X86
// These must match the order in which the HWY_TARGETS are defined
// starting by the least significant (HWY_HIGHEST_TARGET_BIT + 1 -
// HWY_MAX_DYNAMIC_TARGETS) bit. This list must contain exactly
// HWY_MAX_DYNAMIC_TARGETS elements and does not include SCALAR. The first entry
// corresponds to the best target. Don't include a "," at the end of the list.
#define HWY_CHOOSE_TARGET_LIST(func_name) \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
HWY_CHOOSE_AVX3(func_name), /* AVX3 */ \
HWY_CHOOSE_AVX2(func_name), /* AVX2 */ \
nullptr, /* AVX */ \
HWY_CHOOSE_SSE4(func_name), /* SSE4 */ \
nullptr, /* SSSE3 */ \
nullptr, /* SSE3 */ \
nullptr /* SSE2 */
#endif // HWY_ARCH_X86
#if HWY_ARCH_ARM
// See HWY_ARCH_X86 above for details.
#define HWY_MAX_DYNAMIC_TARGETS 4
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_ARM
#define HWY_CHOOSE_TARGET_LIST(func_name) \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
HWY_CHOOSE_NEON(func_name) /* NEON */
#endif // HWY_ARCH_ARM
#if HWY_ARCH_PPC
// See HWY_ARCH_X86 above for details.
#define HWY_MAX_DYNAMIC_TARGETS 5
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_PPC
#define HWY_CHOOSE_TARGET_LIST(func_name) \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
HWY_CHOOSE_PPC8(func_name), /* PPC8 */ \
nullptr, /* VSX */ \
nullptr /* AltiVec */
#endif // HWY_ARCH_PPC
#if HWY_ARCH_WASM
// See HWY_ARCH_X86 above for details.
#define HWY_MAX_DYNAMIC_TARGETS 4
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_WASM
#define HWY_CHOOSE_TARGET_LIST(func_name) \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
HWY_CHOOSE_WASM(func_name) /* WASM */
#endif // HWY_ARCH_WASM
#if HWY_ARCH_RVV
// See HWY_ARCH_X86 above for details.
#define HWY_MAX_DYNAMIC_TARGETS 4
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_RVV
#define HWY_CHOOSE_TARGET_LIST(func_name) \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
nullptr, /* reserved */ \
HWY_CHOOSE_RVV(func_name) /* RVV */
#endif // HWY_ARCH_RVV
// Holds the dynamic-dispatch choice for the current CPU, encoded in the
// ChosenTarget mask format described above. A single global instance
// (chosen_target, declared below) is shared across translation units.
struct ChosenTarget {
 public:
  // Update the ChosenTarget mask based on the current CPU supported
  // targets.
  void Update();
  // Reset the ChosenTarget to the uninitialized state.
  void DeInit() { mask_.store(1); }
  // Whether the ChosenTarget was initialized. This is useful to know whether
  // any HWY_DYNAMIC_DISPATCH function was called.
  bool IsInitialized() const { return mask_.load() != 1; }
  // Return the index in the dynamic dispatch table to be used by the current
  // CPU. Note that this method must be in the header file so it uses the value
  // of HWY_CHOSEN_TARGET_MASK_TARGETS defined in the translation unit that
  // calls it, which may be different from others. This allows to only consider
  // those targets that were actually compiled in this module.
  size_t HWY_INLINE GetIndex() const {
    return hwy::Num0BitsBelowLS1Bit_Nonzero32(mask_.load() &
                                              HWY_CHOSEN_TARGET_MASK_TARGETS);
  }
 private:
  // Initialized to 1 so GetChosenTargetIndex() returns 0.
  std::atomic<uint32_t> mask_{1};
};
extern ChosenTarget chosen_target;
} // namespace hwy
#endif // HIGHWAY_HWY_TARGETS_H_

102
third_party/highway/hwy/targets_test.cc поставляемый Normal file
Просмотреть файл

@ -0,0 +1,102 @@
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/targets.h"
#include "hwy/tests/test_util-inl.h"
namespace fake {
#define DECLARE_FUNCTION(TGT) \
namespace N_##TGT { \
uint32_t FakeFunction(int) { return HWY_##TGT; } \
}
DECLARE_FUNCTION(AVX3)
DECLARE_FUNCTION(AVX2)
DECLARE_FUNCTION(SSE4)
DECLARE_FUNCTION(NEON)
DECLARE_FUNCTION(PPC8)
DECLARE_FUNCTION(WASM)
DECLARE_FUNCTION(RVV)
DECLARE_FUNCTION(SCALAR)
HWY_EXPORT(FakeFunction);
// For each target compiled into HWY_TARGETS, mocks that target as the only
// supported one and verifies that HWY_DYNAMIC_DISPATCH(FakeFunction) resolves
// to the implementation for exactly that target (each implementation returns
// its own target bit) via both the cached entry and the initializer path.
void CheckFakeFunction() {
#define CHECK_ARRAY_ENTRY(TGT)                                              \
  if ((HWY_TARGETS & HWY_##TGT) != 0) {                                     \
    hwy::SetSupportedTargetsForTest(HWY_##TGT);                             \
    /* Calling Update() first to make &HWY_DYNAMIC_DISPATCH() return */     \
    /* the pointer to the already cached function. */                       \
    hwy::chosen_target.Update();                                            \
    EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \
    /* Calling DeInit() will test that the initializer function */          \
    /* also calls the right function. */                                    \
    hwy::chosen_target.DeInit();                                            \
    EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \
    /* Second call uses the cached value from the previous call. */         \
    EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \
  }
  CHECK_ARRAY_ENTRY(AVX3)
  CHECK_ARRAY_ENTRY(AVX2)
  CHECK_ARRAY_ENTRY(SSE4)
  CHECK_ARRAY_ENTRY(NEON)
  CHECK_ARRAY_ENTRY(PPC8)
  CHECK_ARRAY_ENTRY(WASM)
  CHECK_ARRAY_ENTRY(RVV)
  CHECK_ARRAY_ENTRY(SCALAR)
#undef CHECK_ARRAY_ENTRY
}
} // namespace fake
namespace hwy {
// Fixture that restores real CPU detection (clears the mock) and re-enables
// all targets after each test so tests cannot leak state into each other.
class HwyTargetsTest : public testing::Test {
 protected:
  void TearDown() override {
    SetSupportedTargetsForTest(0);  // Disable the mock.
    DisableTargets(0);  // Reset the mask.
  }
};
// Test that the order in the HWY_EXPORT static array matches the expected
// value of the target bits. This is only checked for the targets that are
// enabled in the current compilation.
TEST_F(HwyTargetsTest, ChosenTargetOrderTest) { fake::CheckFakeFunction(); }
TEST_F(HwyTargetsTest, DisabledTargetsTest) {
  // Disabling everything must still leave the baseline enabled.
  DisableTargets(~0u);
  // Check that the baseline can't be disabled.
  HWY_ASSERT(HWY_ENABLED_BASELINE == SupportedTargets());
  DisableTargets(0);  // Reset the mask.
  uint32_t current_targets = SupportedTargets();
  if ((current_targets & ~HWY_ENABLED_BASELINE) == 0) {
    // We can't test anything else if the only compiled target is the baseline.
    return;
  }
  // Get the lowest bit in the mask (the best target) and disable that one.
  uint32_t lowest_target = current_targets & (~current_targets + 1);
  // The lowest target shouldn't be one in the baseline.
  HWY_ASSERT((lowest_target & ~HWY_ENABLED_BASELINE) != 0);
  DisableTargets(lowest_target);
  // Check that the other targets are still enabled.
  HWY_ASSERT((lowest_target ^ current_targets) == SupportedTargets());
  DisableTargets(0);  // Reset the mask.
}
} // namespace hwy

1259
third_party/highway/hwy/tests/arithmetic_test.cc поставляемый Normal file

Разница между файлами не показана из-за своего большого размера Загрузить разницу

287
third_party/highway/hwy/tests/combine_test.cc поставляемый Normal file
Просмотреть файл

@ -0,0 +1,287 @@
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stddef.h>
#include <stdint.h>
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/combine_test.cc"
#include "hwy/foreach_target.h"
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
// Not yet implemented
#if HWY_TARGET != HWY_RVV
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Checks that LowerHalf yields the first N/2 lanes and that storing it via the
// half-width descriptor leaves the rest of the destination untouched.
struct TestLowerHalf {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const Half<D> dh;
    const size_t N = Lanes(d);
    auto out = AllocateAligned<T>(N);
    std::fill(out.get(), out.get() + N, T(0));
    Store(LowerHalf(Iota(d, 1)), dh, out.get());
    const size_t half = Lanes(dh);
    for (size_t i = 0; i < N; ++i) {
      // First half holds 1, 2, ...; the remainder still holds the fill value.
      const T expected = (i < half) ? T(1 + i) : T(0);
      HWY_ASSERT_EQ(expected, out[i]);
    }
  }
};
// Checks that applying LowerHalf twice yields the first quarter of the lanes
// and that the upper 3/4 of the destination memory is untouched.
struct TestLowerQuarter {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const Half<Half<D>> dq;
    const size_t N = Lanes(d);
    auto out = AllocateAligned<T>(N);
    std::fill(out.get(), out.get() + N, T(0));
    Store(LowerHalf(LowerHalf(Iota(d, 1))), dq, out.get());
    const size_t quarter = Lanes(dq);
    for (size_t i = 0; i < N; ++i) {
      const T expected = (i < quarter) ? T(i + 1) : T(0);
      HWY_ASSERT_EQ(expected, out[i]);
    }
  }
};
// Runs the half/quarter tests for all types; requires at least 2 (resp. 4)
// lanes so a lower half (resp. quarter) exists.
HWY_NOINLINE void TestAllLowerHalf() {
  constexpr size_t kDiv = 1;
  ForAllTypes(ForPartialVectors<TestLowerHalf, kDiv, /*kMinLanes=*/2>());
  ForAllTypes(ForPartialVectors<TestLowerQuarter, kDiv, /*kMinLanes=*/4>());
}
// Checks that UpperHalf yields the last N/2 lanes of the input; compiled out
// on scalar, which does not define UpperHalf.
struct TestUpperHalf {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    // Scalar does not define UpperHalf.
#if HWY_TARGET != HWY_SCALAR
    const Half<D> d2;
    const auto v = Iota(d, 1);
    const size_t N = Lanes(d);
    auto lanes = AllocateAligned<T>(N);
    std::fill(lanes.get(), lanes.get() + N, T(0));
    Store(UpperHalf(v), d2, lanes.get());
    size_t i = 0;
    // v's upper half holds Lanes(d2)+1, Lanes(d2)+2, ...
    for (; i < Lanes(d2); ++i) {
      HWY_ASSERT_EQ(T(Lanes(d2) + 1 + i), lanes[i]);
    }
    // Other half remains unchanged
    for (; i < N; ++i) {
      HWY_ASSERT_EQ(T(0), lanes[i]);
    }
#else
    (void)d;
#endif
  }
};
// Runs TestUpperHalf for all types on vectors of at least 128 bits.
HWY_NOINLINE void TestAllUpperHalf() {
  ForAllTypes(ForGE128Vectors<TestUpperHalf>());
}
// Checks that ZeroExtendVector keeps the lower half and zeroes the upper half
// of the double-width vector. Only compiled for >= 256-bit targets.
struct TestZeroExtendVector {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
#if HWY_CAP_GE256
    const Twice<D> d2;
    const auto v = Iota(d, 1);
    const size_t N2 = Lanes(d2);
    auto lanes = AllocateAligned<T>(N2);
    // Pre-fill BOTH halves with v so the "upper half is zero" check below
    // proves ZeroExtendVector wrote zeros rather than leaving stale data.
    Store(v, d, &lanes[0]);
    Store(v, d, &lanes[N2 / 2]);
    const auto ext = ZeroExtendVector(v);
    Store(ext, d2, lanes.get());
    size_t i = 0;
    // Lower half is unchanged
    for (; i < N2 / 2; ++i) {
      HWY_ASSERT_EQ(T(1 + i), lanes[i]);
    }
    // Upper half is zero
    for (; i < N2; ++i) {
      HWY_ASSERT_EQ(T(0), lanes[i]);
    }
#else
    (void)d;
#endif
  }
};
// Runs TestZeroExtendVector for all types on vectors that can be widened.
HWY_NOINLINE void TestAllZeroExtendVector() {
  ForAllTypes(ForExtendableVectors<TestZeroExtendVector>());
}
// Checks that Combine(hi, lo) concatenates two half-width vectors into one
// double-width vector. Only compiled for >= 256-bit targets.
struct TestCombine {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
#if HWY_CAP_GE256
    const Twice<D> d2;
    const size_t N2 = Lanes(d2);
    auto lanes = AllocateAligned<T>(N2);
    const auto lo = Iota(d, 1);
    const auto hi = Iota(d, N2 / 2 + 1);  // continues where lo ends
    const auto combined = Combine(hi, lo);
    Store(combined, d2, lanes.get());
    // Concatenation of lo and hi is the contiguous sequence 1..N2.
    const auto expected = Iota(d2, 1);
    HWY_ASSERT_VEC_EQ(d2, expected, combined);
#else
    (void)d;
#endif
  }
};
// Runs TestCombine for all types on vectors that can be widened.
HWY_NOINLINE void TestAllCombine() {
  ForAllTypes(ForExtendableVectors<TestCombine>());
}
// Tests CombineShiftRightBytes<kBytes> against a scalar byte-index model,
// then recurses on kBytes - 1 (the <0> specialization ends the recursion).
template <int kBytes>
struct TestCombineShiftRightBytesR {
  template <class T, class D>
  HWY_NOINLINE void operator()(T t, D d) {
    // Scalar does not define CombineShiftRightBytes.
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
    const Repartition<uint8_t, D> d8;
    const size_t N8 = Lanes(d8);
    // lo holds bytes 1..N8, hi holds N8+1..2*N8.
    const auto lo = BitCast(d, Iota(d8, 1));
    const auto hi = BitCast(d, Iota(d8, 1 + N8));
    auto expected = AllocateAligned<T>(Lanes(d));
    uint8_t* expected_bytes = reinterpret_cast<uint8_t*>(expected.get());
    // Model computes the result per 16-byte block, matching the per-block
    // indexing below.
    const size_t kBlockSize = 16;
    for (size_t i = 0; i < N8; ++i) {
      const size_t block = i / kBlockSize;
      const size_t lane = i % kBlockSize;
      const size_t first_lo = block * kBlockSize;
      const size_t idx = lane + kBytes;
      // Indices past the block boundary come from hi (offset into its bytes).
      const size_t offset = (idx < kBlockSize) ? 0 : N8 - kBlockSize;
      const bool at_end = idx >= 2 * kBlockSize;
      expected_bytes[i] =
          at_end ? 0 : static_cast<uint8_t>(first_lo + idx + 1 + offset);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(),
                      CombineShiftRightBytes<kBytes>(hi, lo));
    // Recurse to cover all smaller byte shifts.
    TestCombineShiftRightBytesR<kBytes - 1>()(t, d);
#else
    (void)t;
    (void)d;
#endif  // #if HWY_TARGET != HWY_SCALAR
  }
};
// Tests CombineShiftRightLanes<kLanes> against a scalar byte-index model,
// then recurses on kLanes - 1 (the <0> specialization ends the recursion).
template <int kLanes>
struct TestCombineShiftRightLanesR {
  template <class T, class D>
  HWY_NOINLINE void operator()(T t, D d) {
    // Scalar does not define CombineShiftRightBytes (needed for *Lanes).
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
    const Repartition<uint8_t, D> d8;
    const size_t N8 = Lanes(d8);
    // lo holds bytes 1..N8, hi holds N8+1..2*N8.
    const auto lo = BitCast(d, Iota(d8, 1));
    const auto hi = BitCast(d, Iota(d8, 1 + N8));
    auto expected = AllocateAligned<T>(Lanes(d));
    uint8_t* expected_bytes = reinterpret_cast<uint8_t*>(expected.get());
    // Model computes the result per 16-byte block; a shift of kLanes lanes is
    // kLanes * sizeof(T) bytes.
    const size_t kBlockSize = 16;
    for (size_t i = 0; i < N8; ++i) {
      const size_t block = i / kBlockSize;
      const size_t lane = i % kBlockSize;
      const size_t first_lo = block * kBlockSize;
      const size_t idx = lane + kLanes * sizeof(T);
      // Indices past the block boundary come from hi (offset into its bytes).
      const size_t offset = (idx < kBlockSize) ? 0 : N8 - kBlockSize;
      const bool at_end = idx >= 2 * kBlockSize;
      expected_bytes[i] =
          at_end ? 0 : static_cast<uint8_t>(first_lo + idx + 1 + offset);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(),
                      CombineShiftRightLanes<kLanes>(hi, lo));
    // BUGFIX: recurse with the *Lanes* tester. The original recursed into
    // TestCombineShiftRightBytesR, so lane shifts smaller than the initial
    // kLanes were never exercised (and byte shifts were redundantly re-run).
    TestCombineShiftRightLanesR<kLanes - 1>()(t, d);
#else
    (void)t;
    (void)d;
#endif  // #if HWY_TARGET != HWY_SCALAR
  }
};
// Recursion terminator: a byte shift of 0 is not tested.
template <>
struct TestCombineShiftRightBytesR<0> {
  template <class T, class D>
  void operator()(T /*unused*/, D /*unused*/) {}
};
// Recursion terminator: a lane shift of 0 is not tested.
template <>
struct TestCombineShiftRightLanesR<0> {
  template <class T, class D>
  void operator()(T /*unused*/, D /*unused*/) {}
};
// Entry point: starts both recursions, covering byte shifts 15..1 and lane
// shifts (16 / sizeof(T)) - 1 .. 1.
struct TestCombineShiftRight {
  template <class T, class D>
  HWY_NOINLINE void operator()(T t, D d) {
    TestCombineShiftRightBytesR<15>()(t, d);
    TestCombineShiftRightLanesR<16 / sizeof(T) - 1>()(t, d);
  }
};
// Runs the CombineShiftRight tests for all types on >= 128-bit vectors.
HWY_NOINLINE void TestAllCombineShiftRight() {
  ForAllTypes(ForGE128Vectors<TestCombineShiftRight>());
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
HWY_BEFORE_TEST(HwyCombineTest);
HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllLowerHalf);
HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllUpperHalf);
HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllZeroExtendVector);
HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllCombine);
HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllCombineShiftRight);
} // namespace hwy
#endif
#else
int main(int, char**) { return 0; }
#endif // HWY_TARGET != HWY_RVV

217
third_party/highway/hwy/tests/compare_test.cc поставляемый Normal file
Просмотреть файл

@ -0,0 +1,217 @@
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stddef.h>
#include <stdint.h>
#include <string.h> // memset
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/compare_test.cc"
#include "hwy/foreach_target.h"
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// All types.
// Verifies MaskFromVec: all-zero lane bits compare equal to MaskFalse and
// all-one lane bits compare equal to MaskTrue.
struct TestMask {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    auto lanes = AllocateAligned<T>(N);
    std::fill(lanes.get(), lanes.get() + N, T(0));
    const auto actual_false = MaskFromVec(Load(d, lanes.get()));
    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), actual_false);
    // 0xFF in every byte => all bits set in every lane.
    memset(lanes.get(), 0xFF, N * sizeof(T));
    const auto actual_true = MaskFromVec(Load(d, lanes.get()));
    HWY_ASSERT_MASK_EQ(d, MaskTrue(d), actual_true);
  }
};
HWY_NOINLINE void TestAllMask() { ForAllTypes(ForPartialVectors<TestMask>()); }
// All types.
// Exercises lane-wise Eq for all lane types: vectors with different lane
// values compare not-equal, and a vector compares equal both to itself and
// to an independently-constructed identical vector.
struct TestEquality {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto iota2 = Iota(d, 2);
    const auto iota2_copy = Iota(d, 2);
    const auto iota3 = Iota(d, 3);
    // Every lane differs -> no lane is equal.
    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), Eq(iota2, iota3));
    // Same vector, and a separately-built identical vector -> all lanes equal.
    HWY_ASSERT_MASK_EQ(d, MaskTrue(d), Eq(iota2, iota2));
    HWY_ASSERT_MASK_EQ(d, MaskTrue(d), Eq(iota2, iota2_copy));
  }
};

// Entry point: all lane types, partial vector widths.
HWY_NOINLINE void TestAllEquality() {
  ForAllTypes(ForPartialVectors<TestEquality>());
}
// a > b should be true, verify that for Gt/Lt and with swapped args.
// Broadcasts the scalars a and b across a vector and checks all four
// combinations of Gt/Lt with both argument orders, plus irreflexivity.
// file/line are forwarded so assertion failures point at the caller.
template <class D>
void EnsureGreater(D d, TFromD<D> a, TFromD<D> b, const char* file, int line) {
const auto mask_false = MaskFalse(d);
const auto mask_true = MaskTrue(d);
const auto va = Set(d, a);
const auto vb = Set(d, b);
AssertMaskEqual(d, mask_true, Gt(va, vb), file, line);
AssertMaskEqual(d, mask_false, Lt(va, vb), file, line);
// Swapped order
AssertMaskEqual(d, mask_false, Gt(vb, va), file, line);
AssertMaskEqual(d, mask_true, Lt(vb, va), file, line);
// Also ensure irreflexive
AssertMaskEqual(d, mask_false, Gt(va, va), file, line);
AssertMaskEqual(d, mask_false, Gt(vb, vb), file, line);
AssertMaskEqual(d, mask_false, Lt(va, va), file, line);
AssertMaskEqual(d, mask_false, Lt(vb, vb), file, line);
}
// Wrapper macro captures the call site's file/line for failure reports.
#define HWY_ENSURE_GREATER(d, a, b) EnsureGreater(d, a, b, __FILE__, __LINE__)
// Exercises the strict orderings Gt/Lt for signed integer types, using both
// scalar probe values (via HWY_ENSURE_GREATER) and per-lane Iota data.
struct TestStrictInt {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const T min = LimitsMin<T>();
const T max = LimitsMax<T>();
const auto v0 = Zero(d);
const auto v2 = And(Iota(d, T(2)), Set(d, 127)); // 0..127
const auto vn = Neg(v2) - Set(d, 1); // -1..-128
const auto mask_false = MaskFalse(d);
const auto mask_true = MaskTrue(d);
// Individual values of interest
HWY_ENSURE_GREATER(d, 2, 1);
HWY_ENSURE_GREATER(d, 1, 0);
HWY_ENSURE_GREATER(d, 0, -1);
HWY_ENSURE_GREATER(d, -1, -2);
HWY_ENSURE_GREATER(d, max, max / 2);
HWY_ENSURE_GREATER(d, max, 1);
HWY_ENSURE_GREATER(d, max, 0);
HWY_ENSURE_GREATER(d, max, -1);
HWY_ENSURE_GREATER(d, max, min);
HWY_ENSURE_GREATER(d, 0, min);
HWY_ENSURE_GREATER(d, min / 2, min);
// Also use Iota to ensure lanes are independent
HWY_ASSERT_MASK_EQ(d, mask_true, Gt(v2, vn));
HWY_ASSERT_MASK_EQ(d, mask_true, Lt(vn, v2));
HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, vn));
HWY_ASSERT_MASK_EQ(d, mask_false, Gt(vn, v2));
// Strict comparisons must be irreflexive: x<x and x>x are always false.
HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v0, v0));
HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, v2));
HWY_ASSERT_MASK_EQ(d, mask_false, Lt(vn, vn));
HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v0, v0));
HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v2, v2));
HWY_ASSERT_MASK_EQ(d, mask_false, Gt(vn, vn));
}
};
// Entry point: signed types only (Gt/Lt on unsigned is a separate concern).
HWY_NOINLINE void TestAllStrictInt() {
ForSignedTypes(ForExtendableVectors<TestStrictInt>());
}
// Exercises the strict orderings Gt/Lt for float types, including values
// near the representable extremes; mirrors TestStrictInt.
struct TestStrictFloat {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const T huge_neg = -1E35;
const T huge_pos = 1E36;
const auto v0 = Zero(d);
const auto v2 = Iota(d, T(2));
const auto vn = Neg(v2);
const auto mask_false = MaskFalse(d);
const auto mask_true = MaskTrue(d);
// Individual values of interest
HWY_ENSURE_GREATER(d, 2, 1);
HWY_ENSURE_GREATER(d, 1, 0);
HWY_ENSURE_GREATER(d, 0, -1);
HWY_ENSURE_GREATER(d, -1, -2);
HWY_ENSURE_GREATER(d, huge_pos, 1);
HWY_ENSURE_GREATER(d, huge_pos, 0);
HWY_ENSURE_GREATER(d, huge_pos, -1);
HWY_ENSURE_GREATER(d, huge_pos, huge_neg);
HWY_ENSURE_GREATER(d, 0, huge_neg);
// Also use Iota to ensure lanes are independent
HWY_ASSERT_MASK_EQ(d, mask_true, Gt(v2, vn));
HWY_ASSERT_MASK_EQ(d, mask_true, Lt(vn, v2));
HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, vn));
HWY_ASSERT_MASK_EQ(d, mask_false, Gt(vn, v2));
// Strict comparisons must be irreflexive: x<x and x>x are always false.
HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v0, v0));
HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, v2));
HWY_ASSERT_MASK_EQ(d, mask_false, Lt(vn, vn));
HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v0, v0));
HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v2, v2));
HWY_ASSERT_MASK_EQ(d, mask_false, Gt(vn, vn));
}
};
// Entry point: all float types, extendable vector widths.
HWY_NOINLINE void TestAllStrictFloat() {
ForFloatTypes(ForExtendableVectors<TestStrictFloat>());
}
// Verifies the weak orderings Ge/Le for float types: reflexive on equal
// inputs and consistent with the strict order on distinct inputs.
struct TestWeakFloat {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto pos = Iota(d, 2);             // 2, 3, ...
    const auto neg = Iota(d, -T(Lanes(d)));  // -N, -N+1, ..., -1
    // Reflexive: x >= x and x <= x hold in every lane.
    HWY_ASSERT_MASK_EQ(d, MaskTrue(d), Ge(pos, pos));
    HWY_ASSERT_MASK_EQ(d, MaskTrue(d), Le(neg, neg));
    // pos > neg lane-wise, so Ge/Le hold in one direction only.
    HWY_ASSERT_MASK_EQ(d, MaskTrue(d), Ge(pos, neg));
    HWY_ASSERT_MASK_EQ(d, MaskTrue(d), Le(neg, pos));
    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), Le(pos, neg));
    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), Ge(neg, pos));
  }
};

// Entry point: all float types, partial vector widths.
HWY_NOINLINE void TestAllWeakFloat() {
  ForFloatTypes(ForPartialVectors<TestWeakFloat>());
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
// Registers each TestAll* entry point as a gtest parameterized over every
// compiled-in SIMD target; compiled exactly once (HWY_ONCE).
HWY_BEFORE_TEST(HwyCompareTest);
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllMask);
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllEquality);
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictInt);
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictFloat);
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllWeakFloat);
} // namespace hwy
#endif

568
third_party/highway/hwy/tests/convert_test.cc поставляемый Normal file
Просмотреть файл

@ -0,0 +1,568 @@
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/convert_test.cc"
#include "hwy/foreach_target.h"
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Cast and ensure bytes are the same. Called directly from TestAllBitCast or
// via TestBitCastFrom. ToT is the destination lane type.
template <typename ToT>
struct TestBitCast {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const Repartition<ToT, D> dto;
// A bit cast must preserve the total vector width in bytes.
HWY_ASSERT_EQ(Lanes(d) * sizeof(T), Lanes(dto) * sizeof(ToT));
const auto vf = Iota(d, 1);
const auto vt = BitCast(dto, vf);
// Must return the same bits
auto from_lanes = AllocateAligned<T>(Lanes(d));
auto to_lanes = AllocateAligned<ToT>(Lanes(dto));
Store(vf, d, from_lanes.get());
Store(vt, dto, to_lanes.get());
// Compare raw memory: the cast may not alter a single byte.
HWY_ASSERT(
BytesEqual(from_lanes.get(), to_lanes.get(), Lanes(d) * sizeof(T)));
}
};
// From D to all types. Fans out TestBitCast to every destination lane type
// that the current target supports (64-bit and double are capability-gated).
struct TestBitCastFrom {
template <typename T, class D>
HWY_NOINLINE void operator()(T t, D d) {
TestBitCast<uint8_t>()(t, d);
TestBitCast<uint16_t>()(t, d);
TestBitCast<uint32_t>()(t, d);
#if HWY_CAP_INTEGER64
TestBitCast<uint64_t>()(t, d);
#endif
TestBitCast<int8_t>()(t, d);
TestBitCast<int16_t>()(t, d);
TestBitCast<int32_t>()(t, d);
#if HWY_CAP_INTEGER64
TestBitCast<int64_t>()(t, d);
#endif
TestBitCast<float>()(t, d);
#if HWY_CAP_FLOAT64
TestBitCast<double>()(t, d);
#endif
}
};
// Entry point: same-size casts for partial vectors, all-to-all casts for
// full (>=128-bit) vectors.
HWY_NOINLINE void TestAllBitCast() {
// For HWY_SCALAR and partial vectors, we can only cast to same-sized types:
// the former can't partition its single lane, and the latter can be smaller
// than a destination type.
const ForPartialVectors<TestBitCast<uint8_t>> to_u8;
to_u8(uint8_t());
to_u8(int8_t());
const ForPartialVectors<TestBitCast<int8_t>> to_i8;
to_i8(uint8_t());
to_i8(int8_t());
const ForPartialVectors<TestBitCast<uint16_t>> to_u16;
to_u16(uint16_t());
to_u16(int16_t());
const ForPartialVectors<TestBitCast<int16_t>> to_i16;
to_i16(uint16_t());
to_i16(int16_t());
const ForPartialVectors<TestBitCast<uint32_t>> to_u32;
to_u32(uint32_t());
to_u32(int32_t());
to_u32(float());
const ForPartialVectors<TestBitCast<int32_t>> to_i32;
to_i32(uint32_t());
to_i32(int32_t());
to_i32(float());
#if HWY_CAP_INTEGER64
const ForPartialVectors<TestBitCast<uint64_t>> to_u64;
to_u64(uint64_t());
to_u64(int64_t());
#if HWY_CAP_FLOAT64
to_u64(double());
#endif
const ForPartialVectors<TestBitCast<int64_t>> to_i64;
to_i64(uint64_t());
to_i64(int64_t());
#if HWY_CAP_FLOAT64
to_i64(double());
#endif
#endif // HWY_CAP_INTEGER64
const ForPartialVectors<TestBitCast<float>> to_float;
to_float(uint32_t());
to_float(int32_t());
to_float(float());
#if HWY_CAP_FLOAT64
const ForPartialVectors<TestBitCast<double>> to_double;
to_double(double());
#if HWY_CAP_INTEGER64
to_double(uint64_t());
to_double(int64_t());
#endif // HWY_CAP_INTEGER64
#endif // HWY_CAP_FLOAT64
// For non-scalar vectors, we can cast all types to all.
ForAllTypes(ForGE128Vectors<TestBitCastFrom>());
}
// Checks that PromoteTo widens each lane to ToT with the same value a scalar
// conversion would produce, over 200 rounds of random bit patterns.
template <typename ToT>
struct TestPromoteTo {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D from_d) {
static_assert(sizeof(T) < sizeof(ToT), "Input type must be narrower");
const Rebind<ToT, D> to_d;
const size_t N = Lanes(from_d);
auto from = AllocateAligned<T>(N);
auto expected = AllocateAligned<ToT>(N);
RandomState rng;
for (size_t rep = 0; rep < 200; ++rep) {
for (size_t i = 0; i < N; ++i) {
const uint64_t bits = rng();
// memcpy is the portable way to fill any lane type from raw bits.
memcpy(&from[i], &bits, sizeof(T));
// Reference result: scalar implicit conversion T -> ToT.
expected[i] = from[i];
}
HWY_ASSERT_VEC_EQ(to_d, expected.get(),
PromoteTo(to_d, Load(from_d, from.get())));
}
}
};
// Entry point: each promotion halves/quarters the lane count, so the source
// vector fraction (2 or 4) matches the width ratio of the type pair.
HWY_NOINLINE void TestAllPromoteTo() {
const ForPartialVectors<TestPromoteTo<uint16_t>, 2> to_u16div2;
to_u16div2(uint8_t());
const ForPartialVectors<TestPromoteTo<uint32_t>, 4> to_u32div4;
to_u32div4(uint8_t());
const ForPartialVectors<TestPromoteTo<uint32_t>, 2> to_u32div2;
to_u32div2(uint16_t());
const ForPartialVectors<TestPromoteTo<int16_t>, 2> to_i16div2;
to_i16div2(uint8_t());
to_i16div2(int8_t());
const ForPartialVectors<TestPromoteTo<int32_t>, 2> to_i32div2;
to_i32div2(uint16_t());
to_i32div2(int16_t());
const ForPartialVectors<TestPromoteTo<int32_t>, 4> to_i32div4;
to_i32div4(uint8_t());
to_i32div4(int8_t());
// Must test f16 separately because we can only load/store/convert them.
#if HWY_CAP_INTEGER64
const ForPartialVectors<TestPromoteTo<uint64_t>, 2> to_u64div2;
to_u64div2(uint32_t());
const ForPartialVectors<TestPromoteTo<int64_t>, 2> to_i64div2;
to_i64div2(int32_t());
#endif
#if HWY_CAP_FLOAT64
const ForPartialVectors<TestPromoteTo<double>, 2> to_f64div2;
to_f64div2(int32_t());
to_f64div2(float());
#endif
}
// Returns whether t is a finite value (not inf/NaN); float overload.
template <typename T, HWY_IF_FLOAT(T)>
bool IsFinite(T t) {
return std::isfinite(t);
}
// Wrapper avoids calling std::isfinite for integer types (ambiguous).
// Every integer bit pattern is a valid finite value.
template <typename T, HWY_IF_NOT_FLOAT(T)>
bool IsFinite(T /*unused*/) {
return true;
}
// Checks that DemoteTo narrows each lane to the integer type ToT with
// saturation, matching a scalar clamp-then-cast reference.
template <typename ToT>
struct TestDemoteTo {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D from_d) {
static_assert(!IsFloat<ToT>(), "Use TestDemoteToFloat for float output");
static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider");
const Rebind<ToT, D> to_d;
const size_t N = Lanes(from_d);
auto from = AllocateAligned<T>(N);
auto expected = AllocateAligned<ToT>(N);
// Narrower range in the wider type, for clamping before we cast
const T min = LimitsMin<ToT>();
const T max = LimitsMax<ToT>();
RandomState rng;
for (size_t rep = 0; rep < 1000; ++rep) {
for (size_t i = 0; i < N; ++i) {
// Draw random bits; retry until finite (relevant for float inputs).
do {
const uint64_t bits = rng();
memcpy(&from[i], &bits, sizeof(T));
} while (!IsFinite(from[i]));
expected[i] = static_cast<ToT>(std::min(std::max(min, from[i]), max));
}
HWY_ASSERT_VEC_EQ(to_d, expected.get(),
DemoteTo(to_d, Load(from_d, from.get())));
}
}
};
// Entry point: integer->narrower-integer demotions. The fraction (2 or 4)
// matches the width ratio between the source and destination lane types.
HWY_NOINLINE void TestAllDemoteToInt() {
  ForDemoteVectors<TestDemoteTo<uint8_t>, 2>()(int16_t());
  ForDemoteVectors<TestDemoteTo<uint8_t>, 4>()(int32_t());
  ForDemoteVectors<TestDemoteTo<int8_t>, 2>()(int16_t());
  ForDemoteVectors<TestDemoteTo<int8_t>, 4>()(int32_t());
  ForDemoteVectors<TestDemoteTo<uint16_t>, 2>()(int32_t());
  ForDemoteVectors<TestDemoteTo<int16_t>, 2>()(int32_t());
}
// Entry point: float->integer demotion (double -> int32), double-capable
// targets only.
HWY_NOINLINE void TestAllDemoteToMixed() {
#if HWY_CAP_FLOAT64
const ForDemoteVectors<TestDemoteTo<int32_t>, 2> to_i32;
to_i32(double());
#endif
}
// Checks that DemoteTo narrows each lane to the float type ToT, matching a
// scalar reference that clamps the magnitude while preserving the sign.
template <typename ToT>
struct TestDemoteToFloat {
template <typename T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D from_d) {
// For floats, we clamp differently and cannot call LimitsMin.
static_assert(IsFloat<ToT>(), "Use TestDemoteTo for integer output");
static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider");
const Rebind<ToT, D> to_d;
const size_t N = Lanes(from_d);
auto from = AllocateAligned<T>(N);
auto expected = AllocateAligned<ToT>(N);
RandomState rng;
for (size_t rep = 0; rep < 1000; ++rep) {
for (size_t i = 0; i < N; ++i) {
// Draw random bits; retry until finite (inf/NaN excluded).
do {
const uint64_t bits = rng();
memcpy(&from[i], &bits, sizeof(T));
} while (!IsFinite(from[i]));
const T magn = std::abs(from[i]);
const T max_abs = HighestValue<ToT>();
// NOTE: std:: version from C++11 cmath is not defined in RVV GCC, see
// https://lists.freebsd.org/pipermail/freebsd-current/2014-January/048130.html
const T clipped = copysign(std::min(magn, max_abs), from[i]);
expected[i] = static_cast<ToT>(clipped);
}
HWY_ASSERT_VEC_EQ(to_d, expected.get(),
DemoteTo(to_d, Load(from_d, from.get())));
}
}
};
// Entry point: double -> float demotion, double-capable targets only.
HWY_NOINLINE void TestAllDemoteToFloat() {
// Must test f16 separately because we can only load/store/convert them.
#if HWY_CAP_FLOAT64
const ForDemoteVectors<TestDemoteToFloat<float>, 2> to_float;
to_float(double());
#endif
}
// Returns an aligned array of float values that are exactly representable in
// float16, so a f32->f16->f32 round trip must be lossless. `padded` receives
// the array length rounded up to a whole number of vectors of d; the tail is
// zero-filled (zero is also exactly representable).
template <class D>
AlignedFreeUniquePtr<float[]> F16TestCases(D d, size_t& padded) {
  const float test_cases[] = {
      // +/- 1
      1.0f, -1.0f,
      // +/- 0
      0.0f, -0.0f,
      // near 0
      0.25f, -0.25f,
      // +/- integer
      4.0f, -32.0f,
      // positive near limit
      65472.0f, 65504.0f,
      // negative near limit
      -65472.0f, -65504.0f,
      // positive +/- delta
      2.00390625f, 3.99609375f,
      // negative +/- delta
      -2.00390625f, -3.99609375f,
      // No infinity/NaN - implementation-defined due to ARM.
  };
  const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
  const size_t N = Lanes(d);
  padded = RoundUpTo(kNumTestCases, N);  // allow loading whole vectors
  // Fix: the original also allocated an `expected` buffer here that was
  // never read or returned; that dead allocation has been removed.
  auto in = AllocateAligned<float>(padded);
  std::copy(test_cases, test_cases + kNumTestCases, in.get());
  std::fill(in.get() + kNumTestCases, in.get() + padded, 0.0f);
  return in;
}
// Round-trips the exactly-representable F16TestCases values through
// f32 -> f16 -> f32 and requires bit-exact recovery.
struct TestF16 {
template <typename TF32, class DF32>
HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) {
size_t padded;
auto in = F16TestCases(d32, padded);
using TF16 = float16_t;
const Rebind<TF16, DF32> d16;
const size_t N = Lanes(d32); // same count for f16
auto temp16 = AllocateAligned<TF16>(N);
// Process one full vector per iteration; padded is a multiple of N.
for (size_t i = 0; i < padded; i += N) {
const auto loaded = Load(d32, &in[i]);
Store(DemoteTo(d16, loaded), d16, temp16.get());
HWY_ASSERT_VEC_EQ(d32, loaded, PromoteTo(d32, Load(d16, temp16.get())));
}
}
};
// Entry point: f16 lives in half the vector, hence the fraction 2.
HWY_NOINLINE void TestAllF16() { ForDemoteVectors<TestF16, 2>()(float()); }
// Checks U8FromU32: truncating u32 lanes to u8 must match a u8 Iota with the
// same start value, both from 0 and from 0x7F.
struct TestConvertU8 {
  template <typename T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, const D du32) {
    const Rebind<uint8_t, D> du8;
    // Fix: the original allocated a lanes8 buffer and stored Iota(du8, 0)
    // into it, but never read it back - dead code, removed.
    HWY_ASSERT_VEC_EQ(du8, Iota(du8, 0), U8FromU32(Iota(du32, 0)));
    HWY_ASSERT_VEC_EQ(du8, Iota(du8, 0x7F), U8FromU32(Iota(du32, 0x7F)));
  }
};
// Entry point: u8 lanes occupy a quarter of the u32 vector, hence fraction 4.
HWY_NOINLINE void TestAllConvertU8() {
  ForDemoteVectors<TestConvertU8, 4>()(uint32_t());
}
// Separate function to attempt to work around a compiler bug on ARM: when this
// is merged with TestIntFromFloat, outputs match a previous Iota(-(N+1)) input.
// Verifies float->int conversion saturates for values far outside the
// integer range; skipped entirely on NEON (see comment below).
struct TestIntFromFloatHuge {
template <typename TF, class DF>
HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
// Still does not work, although ARMv7 manual says that float->int
// saturates, i.e. chooses the nearest representable value.
#if HWY_TARGET != HWY_NEON
using TI = MakeSigned<TF>;
const Rebind<TI, DF> di;
// Huge positive (lvalue works around GCC bug, tested with 10.2.1, where
// the expected i32 value is otherwise 0x80..00).
const auto expected_max = Set(di, LimitsMax<TI>());
HWY_ASSERT_VEC_EQ(di, expected_max, ConvertTo(di, Set(df, TF(1E20))));
// Huge negative (also lvalue for safety, but GCC bug was not triggered)
const auto expected_min = Set(di, LimitsMin<TI>());
HWY_ASSERT_VEC_EQ(di, expected_min, ConvertTo(di, Set(df, TF(-1E20))));
#else
(void)df;
#endif
}
};
// Verifies ConvertTo(float -> same-width signed int): truncation toward zero
// for in-range values and saturation at the integer limits, checked against
// a scalar reference over random finite inputs.
struct TestIntFromFloat {
template <typename TF, class DF>
HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
using TI = MakeSigned<TF>;
const Rebind<TI, DF> di;
const size_t N = Lanes(df);
// Integer positive
HWY_ASSERT_VEC_EQ(di, Iota(di, TI(4)), ConvertTo(di, Iota(df, TF(4.0))));
// Integer negative
HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), ConvertTo(di, Iota(df, -TF(N))));
// Above positive
HWY_ASSERT_VEC_EQ(di, Iota(di, TI(2)), ConvertTo(di, Iota(df, TF(2.001))));
// Below positive
HWY_ASSERT_VEC_EQ(di, Iota(di, TI(3)), ConvertTo(di, Iota(df, TF(3.9999))));
const TF eps = static_cast<TF>(0.0001);
// Above negative
HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)),
ConvertTo(di, Iota(df, -TF(N + 1) + eps)));
// Below negative
HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N + 1)),
ConvertTo(di, Iota(df, -TF(N + 1) - eps)));
// TF does not have enough precision to represent TI.
const double min = static_cast<double>(LimitsMin<TI>());
const double max = static_cast<double>(LimitsMax<TI>());
// Also check random values.
auto from = AllocateAligned<TF>(N);
auto expected = AllocateAligned<TI>(N);
RandomState rng;
for (size_t rep = 0; rep < 1000; ++rep) {
for (size_t i = 0; i < N; ++i) {
// Draw random bits; retry until finite (inf/NaN excluded).
do {
const uint64_t bits = rng();
memcpy(&from[i], &bits, sizeof(TF));
} while (!std::isfinite(from[i]));
// Scalar reference: saturate to the TI limits, else truncate.
if (from[i] >= max) {
expected[i] = LimitsMax<TI>();
} else if (from[i] <= min) {
expected[i] = LimitsMin<TI>();
} else {
expected[i] = static_cast<TI>(from[i]);
}
}
HWY_ASSERT_VEC_EQ(di, expected.get(),
ConvertTo(di, Load(df, from.get())));
}
}
};
// Entry point: both the huge-value and the general float->int tests.
HWY_NOINLINE void TestAllIntFromFloat() {
ForFloatTypes(ForPartialVectors<TestIntFromFloatHuge>());
ForFloatTypes(ForPartialVectors<TestIntFromFloat>());
}
// Verifies ConvertTo(signed int -> same-width float) for small values and at
// the integer limits.
struct TestFloatFromInt {
template <typename TI, class DI>
HWY_NOINLINE void operator()(TI /*unused*/, const DI di) {
using TF = MakeFloat<TI>;
const Rebind<TF, DI> df;
const size_t N = Lanes(df);
// Integer positive
HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4.0)), ConvertTo(df, Iota(di, TI(4))));
// Integer negative
HWY_ASSERT_VEC_EQ(df, Iota(df, -TF(N)), ConvertTo(df, Iota(di, -TI(N))));
// Max positive
HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMax<TI>())),
ConvertTo(df, Set(di, LimitsMax<TI>())));
// Min negative
HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMin<TI>())),
ConvertTo(df, Set(di, LimitsMin<TI>())));
}
};
// Entry point: i32->f32 always; i64->f64 only where both are supported.
HWY_NOINLINE void TestAllFloatFromInt() {
ForPartialVectors<TestFloatFromInt>()(int32_t());
#if HWY_CAP_FLOAT64 && HWY_CAP_INTEGER64
ForPartialVectors<TestFloatFromInt>()(int64_t());
#endif
}
// Verifies the mixed-width pair DemoteTo(f64 -> i32, with saturation) and
// PromoteTo(i32 -> f64, exact since f64 can represent every i32).
struct TestI32F64 {
template <typename TF, class DF>
HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
using TI = int32_t;
const Rebind<TI, DF> di;
const size_t N = Lanes(df);
// Integer positive
HWY_ASSERT_VEC_EQ(di, Iota(di, TI(4)), DemoteTo(di, Iota(df, TF(4.0))));
HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4.0)), PromoteTo(df, Iota(di, TI(4))));
// Integer negative
HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), DemoteTo(di, Iota(df, -TF(N))));
HWY_ASSERT_VEC_EQ(df, Iota(df, -TF(N)), PromoteTo(df, Iota(di, -TI(N))));
// Above positive
HWY_ASSERT_VEC_EQ(di, Iota(di, TI(2)), DemoteTo(di, Iota(df, TF(2.001))));
HWY_ASSERT_VEC_EQ(df, Iota(df, TF(2.0)), PromoteTo(df, Iota(di, TI(2))));
// Below positive
HWY_ASSERT_VEC_EQ(di, Iota(di, TI(3)), DemoteTo(di, Iota(df, TF(3.9999))));
HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4.0)), PromoteTo(df, Iota(di, TI(4))));
const TF eps = static_cast<TF>(0.0001);
// Above negative
HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)),
DemoteTo(di, Iota(df, -TF(N + 1) + eps)));
HWY_ASSERT_VEC_EQ(df, Iota(df, TF(-4.0)), PromoteTo(df, Iota(di, TI(-4))));
// Below negative
HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N + 1)),
DemoteTo(di, Iota(df, -TF(N + 1) - eps)));
HWY_ASSERT_VEC_EQ(df, Iota(df, TF(-2.0)), PromoteTo(df, Iota(di, TI(-2))));
// Huge positive float
HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMax<TI>()),
DemoteTo(di, Set(df, TF(1E12))));
// Huge negative float
HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMin<TI>()),
DemoteTo(di, Set(df, TF(-1E12))));
// Max positive int
HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMax<TI>())),
PromoteTo(df, Set(di, LimitsMax<TI>())));
// Min negative int
HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMin<TI>())),
PromoteTo(df, Set(di, LimitsMin<TI>())));
}
};
// Entry point: double-capable targets only; i32 lanes fill half the vector.
HWY_NOINLINE void TestAllI32F64() {
#if HWY_CAP_FLOAT64
ForDemoteVectors<TestI32F64, 2>()(double());
#endif
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
// Registers each TestAll* entry point as a gtest parameterized over every
// compiled-in SIMD target; compiled exactly once (HWY_ONCE).
HWY_BEFORE_TEST(HwyConvertTest);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllBitCast);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllPromoteTo);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllDemoteToInt);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllDemoteToMixed);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllDemoteToFloat);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllF16);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllConvertU8);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllIntFromFloat);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllFloatFromInt);
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllI32F64);
} // namespace hwy
#endif

34
third_party/highway/hwy/tests/list_targets.cc поставляемый Normal file
Просмотреть файл

@ -0,0 +1,34 @@
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Simple tool to print the list of targets that were compiled in when building
// this tool.
#include <stdio.h>
#include "hwy/highway.h"
// Prints `msg` to stderr followed by the name of every target whose bit is
// set in `targets`, lowest bit first, then a newline.
void PrintTargets(const char* msg, uint32_t targets) {
  fprintf(stderr, "%s", msg);
  unsigned remaining = targets;
  while (remaining != 0) {
    // Isolate the lowest set bit, print its target name, then clear it.
    const unsigned lowest = remaining & (~remaining + 1);
    fprintf(stderr, " %s", hwy::TargetName(lowest));
    remaining &= remaining - 1;
  }
  fprintf(stderr, "\n");
}
// Prints the targets compiled into this binary and the subset that would be
// chosen as baseline, then exits successfully.
int main() {
PrintTargets("Compiled HWY_TARGETS:", HWY_TARGETS);
PrintTargets("HWY_BASELINE_TARGETS:", HWY_BASELINE_TARGETS);
return 0;
}

691
third_party/highway/hwy/tests/logical_test.cc поставляемый Normal file
Просмотреть файл

@ -0,0 +1,691 @@
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stddef.h>
#include <stdint.h>
#include <string.h> // memcmp
#include "hwy/base.h"
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/logical_test.cc"
#include "hwy/foreach_target.h"
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Verifies Not/And/Or/Xor/AndNot identities on integer lanes, including
// chained in-place updates.
struct TestLogicalInteger {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const auto v0 = Zero(d);
const auto vi = Iota(d, 0);
// All-ones vector: x == x is true in every lane.
const auto ones = VecFromMask(d, Eq(v0, v0));
const auto v1 = Set(d, 1);
const auto vnot1 = Set(d, ~T(1));
// Not: complement of all-ones is zero and vice versa; ~1 round-trips.
HWY_ASSERT_VEC_EQ(d, v0, Not(ones));
HWY_ASSERT_VEC_EQ(d, ones, Not(v0));
HWY_ASSERT_VEC_EQ(d, v1, Not(vnot1));
HWY_ASSERT_VEC_EQ(d, vnot1, Not(v1));
// And/Or/Xor identities with zero and with itself.
HWY_ASSERT_VEC_EQ(d, v0, And(v0, vi));
HWY_ASSERT_VEC_EQ(d, v0, And(vi, v0));
HWY_ASSERT_VEC_EQ(d, vi, And(vi, vi));
HWY_ASSERT_VEC_EQ(d, vi, Or(v0, vi));
HWY_ASSERT_VEC_EQ(d, vi, Or(vi, v0));
HWY_ASSERT_VEC_EQ(d, vi, Or(vi, vi));
HWY_ASSERT_VEC_EQ(d, vi, Xor(v0, vi));
HWY_ASSERT_VEC_EQ(d, vi, Xor(vi, v0));
HWY_ASSERT_VEC_EQ(d, v0, Xor(vi, vi));
// AndNot(a, b) = ~a & b.
HWY_ASSERT_VEC_EQ(d, vi, AndNot(v0, vi));
HWY_ASSERT_VEC_EQ(d, v0, AndNot(vi, v0));
HWY_ASSERT_VEC_EQ(d, v0, AndNot(vi, vi));
// Chained updates through a mutable vector.
auto v = vi;
v = And(v, vi);
HWY_ASSERT_VEC_EQ(d, vi, v);
v = And(v, v0);
HWY_ASSERT_VEC_EQ(d, v0, v);
v = Or(v, vi);
HWY_ASSERT_VEC_EQ(d, vi, v);
v = Or(v, v0);
HWY_ASSERT_VEC_EQ(d, vi, v);
v = Xor(v, vi);
HWY_ASSERT_VEC_EQ(d, v0, v);
v = Xor(v, v0);
HWY_ASSERT_VEC_EQ(d, v0, v);
}
};
// Entry point: integer types only (Not of float bits is tested elsewhere).
HWY_NOINLINE void TestAllLogicalInteger() {
ForIntegerTypes(ForPartialVectors<TestLogicalInteger>());
}
// Verifies And/Or/Xor/AndNot identities on float lanes (bitwise ops on the
// float representation); mirrors TestLogicalInteger minus the Not checks.
struct TestLogicalFloat {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const auto v0 = Zero(d);
const auto vi = Iota(d, 0);
HWY_ASSERT_VEC_EQ(d, v0, And(v0, vi));
HWY_ASSERT_VEC_EQ(d, v0, And(vi, v0));
HWY_ASSERT_VEC_EQ(d, vi, And(vi, vi));
HWY_ASSERT_VEC_EQ(d, vi, Or(v0, vi));
HWY_ASSERT_VEC_EQ(d, vi, Or(vi, v0));
HWY_ASSERT_VEC_EQ(d, vi, Or(vi, vi));
HWY_ASSERT_VEC_EQ(d, vi, Xor(v0, vi));
HWY_ASSERT_VEC_EQ(d, vi, Xor(vi, v0));
HWY_ASSERT_VEC_EQ(d, v0, Xor(vi, vi));
// AndNot(a, b) = ~a & b.
HWY_ASSERT_VEC_EQ(d, vi, AndNot(v0, vi));
HWY_ASSERT_VEC_EQ(d, v0, AndNot(vi, v0));
HWY_ASSERT_VEC_EQ(d, v0, AndNot(vi, vi));
// Chained updates through a mutable vector.
auto v = vi;
v = And(v, vi);
HWY_ASSERT_VEC_EQ(d, vi, v);
v = And(v, v0);
HWY_ASSERT_VEC_EQ(d, v0, v);
v = Or(v, vi);
HWY_ASSERT_VEC_EQ(d, vi, v);
v = Or(v, v0);
HWY_ASSERT_VEC_EQ(d, vi, v);
v = Xor(v, vi);
HWY_ASSERT_VEC_EQ(d, v0, v);
v = Xor(v, v0);
HWY_ASSERT_VEC_EQ(d, v0, v);
}
};
// Entry point: all float types, partial vector widths.
HWY_NOINLINE void TestAllLogicalFloat() {
ForFloatTypes(ForPartialVectors<TestLogicalFloat>());
}
// Verifies CopySign (any magnitude input) and CopySignToAbs (input already
// non-negative) for all sign combinations of magnitude and sign sources.
struct TestCopySign {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
const auto v0 = Zero(d);
const auto vp = Iota(d, 1);
const auto vn = Iota(d, T(-1E5)); // assumes N < 10^5
// Zero remains zero regardless of sign
HWY_ASSERT_VEC_EQ(d, v0, CopySign(v0, v0));
HWY_ASSERT_VEC_EQ(d, v0, CopySign(v0, vp));
HWY_ASSERT_VEC_EQ(d, v0, CopySign(v0, vn));
HWY_ASSERT_VEC_EQ(d, v0, CopySignToAbs(v0, v0));
HWY_ASSERT_VEC_EQ(d, v0, CopySignToAbs(v0, vp));
HWY_ASSERT_VEC_EQ(d, v0, CopySignToAbs(v0, vn));
// Positive input, positive sign => unchanged
HWY_ASSERT_VEC_EQ(d, vp, CopySign(vp, vp));
HWY_ASSERT_VEC_EQ(d, vp, CopySignToAbs(vp, vp));
// Positive input, negative sign => negated
HWY_ASSERT_VEC_EQ(d, Neg(vp), CopySign(vp, vn));
HWY_ASSERT_VEC_EQ(d, Neg(vp), CopySignToAbs(vp, vn));
// Negative input, negative sign => unchanged
HWY_ASSERT_VEC_EQ(d, vn, CopySign(vn, vn));
// Negative input, positive sign => negated
HWY_ASSERT_VEC_EQ(d, Neg(vn), CopySign(vn, vp));
}
};
// Entry point: float types only (sign-bit manipulation).
HWY_NOINLINE void TestAllCopySign() {
ForFloatTypes(ForPartialVectors<TestCopySign>());
}
// Verifies IfThenElse / IfThenElseZero / IfThenZeroElse against a scalar
// lane-by-lane reference, over randomized inputs and mask patterns.
struct TestIfThenElse {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
RandomState rng;
const size_t N = Lanes(d);
auto in1 = AllocateAligned<T>(N);
auto in2 = AllocateAligned<T>(N);
auto mask_lanes = AllocateAligned<T>(N);
auto expected = AllocateAligned<T>(N);
// NOTE: reverse polarity (mask is true iff lane == 0) because we cannot
// reliably compare against all bits set (NaN for float types).
const T off = 1;
// Each lane should have a chance of having mask=true.
for (size_t rep = 0; rep < 50; ++rep) {
for (size_t i = 0; i < N; ++i) {
in1[i] = static_cast<T>(Random32(&rng));
in2[i] = static_cast<T>(Random32(&rng));
mask_lanes[i] = (Random32(&rng) & 1024) ? off : T(0);
}
const auto v1 = Load(d, in1.get());
const auto v2 = Load(d, in2.get());
// mask lane is true where mask_lanes[i] == 0 (reverse polarity above).
const auto mask = Eq(Load(d, mask_lanes.get()), Zero(d));
for (size_t i = 0; i < N; ++i) {
expected[i] = (mask_lanes[i] == off) ? in2[i] : in1[i];
}
HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenElse(mask, v1, v2));
for (size_t i = 0; i < N; ++i) {
expected[i] = mask_lanes[i] ? T(0) : in1[i];
}
HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenElseZero(mask, v1));
for (size_t i = 0; i < N; ++i) {
expected[i] = mask_lanes[i] ? in2[i] : T(0);
}
HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenZeroElse(mask, v2));
}
}
};
// Entry point: all lane types, partial vector widths.
HWY_NOINLINE void TestAllIfThenElse() {
ForAllTypes(ForPartialVectors<TestIfThenElse>());
}
// Verifies mask <-> vector round trips: MaskFromVec(VecFromMask(m)) == m
// for randomized masks.
struct TestMaskVec {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
RandomState rng;
const size_t N = Lanes(d);
auto mask_lanes = AllocateAligned<T>(N);
// Each lane should have a chance of having mask=true.
for (size_t rep = 0; rep < 100; ++rep) {
for (size_t i = 0; i < N; ++i) {
mask_lanes[i] = static_cast<T>(Random32(&rng) & 1);
}
// Mask lane is true where the random lane value is 0.
const auto mask = RebindMask(d, Eq(Load(d, mask_lanes.get()), Zero(d)));
HWY_ASSERT_MASK_EQ(d, mask, MaskFromVec(VecFromMask(d, mask)));
}
}
};
// Entry point: explicit per-type calls; 64-bit/double are capability-gated.
HWY_NOINLINE void TestAllMaskVec() {
const ForPartialVectors<TestMaskVec> test;
test(uint16_t());
test(int16_t());
// TODO(janwas): float16_t - cannot compare yet
test(uint32_t());
test(int32_t());
test(float());
#if HWY_CAP_INTEGER64
test(uint64_t());
test(int64_t());
#endif
#if HWY_CAP_FLOAT64
test(double());
#endif
}
// Verifies Compress and CompressStore: lanes whose mask is true are packed
// to the front in order; compared lane-by-lane against a scalar reference.
struct TestCompress {
template <class T, class D>
HWY_NOINLINE void operator()(T /*unused*/, D d) {
RandomState rng;
using TU = MakeUnsigned<T>;
const Rebind<TU, D> du;
const size_t N = Lanes(d);
auto in_lanes = AllocateAligned<T>(N);
auto mask_lanes = AllocateAligned<TU>(N);
auto expected = AllocateAligned<T>(N);
auto actual = AllocateAligned<T>(N);
// Each lane should have a chance of having mask=true.
for (size_t rep = 0; rep < 100; ++rep) {
size_t expected_pos = 0;
for (size_t i = 0; i < N; ++i) {
const uint64_t bits = Random32(&rng);
in_lanes[i] = T(); // cannot initialize float16_t directly.
CopyBytes<sizeof(T)>(&bits, &in_lanes[i]);
mask_lanes[i] = static_cast<TU>(Random32(&rng) & 1);
if (mask_lanes[i] == 0) { // Zero means true (easier to compare)
// Scalar reference: selected lanes are packed to the front, in order.
expected[expected_pos++] = in_lanes[i];
}
}
const auto in = Load(d, in_lanes.get());
const auto mask = RebindMask(d, Eq(Load(du, mask_lanes.get()), Zero(du)));
Store(Compress(in, mask), d, actual.get());
// Upper lanes are undefined.
for (size_t i = 0; i < expected_pos; ++i) {
HWY_ASSERT(memcmp(&actual[i], &expected[i], sizeof(T)) == 0);
}
// Also check CompressStore in the same way.
memset(actual.get(), 0, N * sizeof(T));
const size_t num_written = CompressStore(in, mask, d, actual.get());
HWY_ASSERT_EQ(expected_pos, num_written);
for (size_t i = 0; i < expected_pos; ++i) {
HWY_ASSERT(memcmp(&actual[i], &expected[i], sizeof(T)) == 0);
}
}
}
};
// Disabled (#if 0) generators for the lookup tables used by the per-target
// Compress implementations; enable manually and paste the printed output
// into the ops headers when the tables need regenerating.
#if 0
namespace detail { // for code folding
// One table entry per mask bit pattern; prints doubled lane indices.
void PrintCompress16x8Tables() {
constexpr size_t N = 8; // 128-bit SIMD
for (uint64_t code = 0; code < 1ull << N; ++code) {
std::array<uint8_t, N> indices{0};
size_t pos = 0;
for (size_t i = 0; i < N; ++i) {
if (code & (1ull << i)) {
indices[pos++] = i;
}
}
// Doubled (for converting lane to byte indices)
for (size_t i = 0; i < N; ++i) {
printf("%d,", 2 * indices[i]);
}
}
printf("\n");
}
// Compressed to nibbles
void PrintCompress32x8Tables() {
constexpr size_t N = 8; // AVX2
for (uint64_t code = 0; code < 1ull << N; ++code) {
std::array<uint32_t, N> indices{0};
size_t pos = 0;
for (size_t i = 0; i < N; ++i) {
if (code & (1ull << i)) {
indices[pos++] = i;
}
}
// Convert to nibbles
uint64_t packed = 0;
for (size_t i = 0; i < N; ++i) {
HWY_ASSERT(indices[i] < 16);
packed += indices[i] << (i * 4);
}
HWY_ASSERT(packed < (1ull << 32));
printf("0x%08x,", static_cast<uint32_t>(packed));
}
printf("\n");
}
// Pairs of 32-bit lane indices
void PrintCompress64x4Tables() {
constexpr size_t N = 4; // AVX2
for (uint64_t code = 0; code < 1ull << N; ++code) {
std::array<uint32_t, N> indices{0};
size_t pos = 0;
for (size_t i = 0; i < N; ++i) {
if (code & (1ull << i)) {
indices[pos++] = i;
}
}
for (size_t i = 0; i < N; ++i) {
printf("%d,%d,", 2 * indices[i], 2 * indices[i] + 1);
}
}
printf("\n");
}
// 4-tuple of byte indices
void PrintCompress32x4Tables() {
using T = uint32_t;
constexpr size_t N = 4; // SSE4
for (uint64_t code = 0; code < 1ull << N; ++code) {
std::array<uint32_t, N> indices{0};
size_t pos = 0;
for (size_t i = 0; i < N; ++i) {
if (code & (1ull << i)) {
indices[pos++] = i;
}
}
for (size_t i = 0; i < N; ++i) {
for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
printf("%zu,", sizeof(T) * indices[i] + idx_byte);
}
}
}
printf("\n");
}
// 8-tuple of byte indices
void PrintCompress64x2Tables() {
using T = uint64_t;
constexpr size_t N = 2; // SSE4
for (uint64_t code = 0; code < 1ull << N; ++code) {
std::array<uint32_t, N> indices{0};
size_t pos = 0;
for (size_t i = 0; i < N; ++i) {
if (code & (1ull << i)) {
indices[pos++] = i;
}
}
for (size_t i = 0; i < N; ++i) {
for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
printf("%zu,", sizeof(T) * indices[i] + idx_byte);
}
}
}
printf("\n");
}
} // namespace detail
#endif
// Runs TestCompress for every lane type Compress supports (no 8-bit lanes);
// 64-bit and double variants are gated on target capability macros.
HWY_NOINLINE void TestAllCompress() {
  // Uncomment to regenerate the hard-coded Compress tables (see #if 0 block).
  // detail::PrintCompress32x8Tables();
  // detail::PrintCompress64x4Tables();
  // detail::PrintCompress32x4Tables();
  // detail::PrintCompress64x2Tables();
  // detail::PrintCompress16x8Tables();
  const ForPartialVectors<TestCompress> test;
  test(uint16_t());
  test(int16_t());
  test(float16_t());
  test(uint32_t());
  test(int32_t());
  test(float());
#if HWY_CAP_INTEGER64
  test(uint64_t());
  test(int64_t());
#endif
#if HWY_CAP_FLOAT64
  test(double());
#endif
}
struct TestZeroIfNegative {
  // ZeroIfNegative passes through zero/positive lanes and zeroes out
  // negative lanes.
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto zero = Zero(d);
    const auto positive = Iota(d, 1);
    const auto negative = Iota(d, T(-1E5));  // assumes N < 10^5
    // Zero and positive inputs are unchanged.
    HWY_ASSERT_VEC_EQ(d, zero, ZeroIfNegative(zero));
    HWY_ASSERT_VEC_EQ(d, positive, ZeroIfNegative(positive));
    // All negative lanes become zero.
    HWY_ASSERT_VEC_EQ(d, zero, ZeroIfNegative(negative));
  }
};
HWY_NOINLINE void TestAllZeroIfNegative() {
  // ZeroIfNegative is only defined for floating-point lane types.
  const ForPartialVectors<TestZeroIfNegative> zin_test;
  ForFloatTypes(zin_test);
}
struct TestBroadcastSignBit {
  // BroadcastSignBit yields all-zero lanes for non-negative inputs and
  // all-ones lanes for negative inputs.
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto all_clear = Zero(d);
    const auto all_set = Set(d, -1);  // every bit set
    const auto positive = And(Iota(d, 0), Set(d, LimitsMax<T>()));
    const auto negative = all_set - positive;
    // Non-negative inputs (including the maximum) produce all-zero lanes.
    HWY_ASSERT_VEC_EQ(d, all_clear, BroadcastSignBit(positive));
    HWY_ASSERT_VEC_EQ(d, all_clear, BroadcastSignBit(Set(d, LimitsMax<T>())));
    // Negative inputs (including the minimum) produce all-ones lanes.
    HWY_ASSERT_VEC_EQ(d, all_set, BroadcastSignBit(negative));
    HWY_ASSERT_VEC_EQ(d, all_set, BroadcastSignBit(Set(d, LimitsMin<T>())));
    HWY_ASSERT_VEC_EQ(d, all_set, BroadcastSignBit(Set(d, LimitsMin<T>() / 2)));
  }
};
HWY_NOINLINE void TestAllBroadcastSignBit() {
  // Sign bits only exist for signed lane types.
  const ForPartialVectors<TestBroadcastSignBit> sign_test;
  ForSignedTypes(sign_test);
}
struct TestTestBit {
  // For every bit position, TestBit must report exactly the bits that are
  // present in the vector and none that are absent.
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t num_bits = sizeof(T) * 8;
    for (size_t i = 0; i < num_bits; ++i) {
      // Three distinct single-bit values (neighbors wrap around).
      const auto b1 = Set(d, 1ull << i);
      const auto b2 = Set(d, 1ull << ((i + 1) % num_bits));
      const auto b3 = Set(d, 1ull << ((i + 2) % num_bits));
      const auto b12 = Or(b1, b2);
      const auto b23 = Or(b2, b3);
      // Bits that are present are detected in every lane.
      HWY_ASSERT(AllTrue(TestBit(b1, b1)));
      HWY_ASSERT(AllTrue(TestBit(b12, b1)));
      HWY_ASSERT(AllTrue(TestBit(b12, b2)));
      // Bits that are absent are not detected in any lane.
      HWY_ASSERT(AllFalse(TestBit(b12, b3)));
      HWY_ASSERT(AllFalse(TestBit(b23, b1)));
      HWY_ASSERT(AllFalse(TestBit(b1, b2)));
      HWY_ASSERT(AllFalse(TestBit(b2, b1)));
      HWY_ASSERT(AllFalse(TestBit(b1, b3)));
      HWY_ASSERT(AllFalse(TestBit(b3, b1)));
      HWY_ASSERT(AllFalse(TestBit(b2, b3)));
      HWY_ASSERT(AllFalse(TestBit(b3, b2)));
    }
  }
};
HWY_NOINLINE void TestAllTestBit() {
  // Bit tests are only meaningful for integer lanes.
  const ForFullVectors<TestTestBit> bit_test;
  ForIntegerTypes(bit_test);
}
// Verifies AllTrue/AllFalse on an Eq-with-zero mask while toggling each lane
// between zero and nonzero values. Note: the order of stores/reloads matters;
// each iteration restores the all-zero state before moving on.
struct TestAllTrueFalse {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto zero = Zero(d);
    auto v = zero;
    const size_t N = Lanes(d);
    auto lanes = AllocateAligned<T>(N);
    std::fill(lanes.get(), lanes.get() + N, T(0));
    // All-zero vector: Eq(v, zero) is true in every lane.
    HWY_ASSERT(AllTrue(Eq(v, zero)));
    HWY_ASSERT(!AllFalse(Eq(v, zero)));
    // Single lane implies AllFalse = !AllTrue. Otherwise, there are multiple
    // lanes and one is nonzero.
    const bool expected_all_false = (N != 1);
    // Set each lane to nonzero and back to zero
    for (size_t i = 0; i < N; ++i) {
      lanes[i] = T(1);
      v = Load(d, lanes.get());
      HWY_ASSERT(!AllTrue(Eq(v, zero)));
      HWY_ASSERT(expected_all_false ^ AllFalse(Eq(v, zero)));
      // T(-1) also exercises the all-bits-set pattern (wraps for unsigned T).
      lanes[i] = T(-1);
      v = Load(d, lanes.get());
      HWY_ASSERT(!AllTrue(Eq(v, zero)));
      HWY_ASSERT(expected_all_false ^ AllFalse(Eq(v, zero)));
      // Reset to all zero
      lanes[i] = T(0);
      v = Load(d, lanes.get());
      HWY_ASSERT(AllTrue(Eq(v, zero)));
      HWY_ASSERT(!AllFalse(Eq(v, zero)));
    }
  }
};
HWY_NOINLINE void TestAllAllTrueFalse() {
  // AllTrue/AllFalse are defined for every lane type.
  const ForPartialVectors<TestAllTrueFalse> tf_test;
  ForAllTypes(tf_test);
}
// StoreMaskBits packs a mask into a bit array; verifies the packed bits match
// the mask and that padding bits in the final byte are zero. Uses a fixed-seed
// RandomState, so the per-rep patterns depend on the RNG call order.
class TestStoreMaskBits {
 public:
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*t*/, D d) {
// TODO(janwas): remove once implemented (cast or vse1)
#if HWY_TARGET != HWY_RVV
    RandomState rng;
    const size_t N = Lanes(d);
    auto lanes = AllocateAligned<T>(N);
    // One bit per lane, rounded up to whole bytes.
    const size_t expected_bytes = (N + 7) / 8;
    auto bits = AllocateAligned<uint8_t>(expected_bytes);
    for (size_t rep = 0; rep < 100; ++rep) {
      // Generate random mask pattern.
      for (size_t i = 0; i < N; ++i) {
        lanes[i] = static_cast<T>((rng() & 1024) ? 1 : 0);
      }
      // Mask is true where the lane is zero.
      const auto mask = Load(d, lanes.get()) == Zero(d);
      const size_t bytes_written = StoreMaskBits(mask, bits.get());
      HWY_ASSERT_EQ(expected_bytes, bytes_written);
      size_t i = 0;
      // Stored bits must match original mask
      for (; i < N; ++i) {
        const bool bit = (bits[i / 8] & (1 << (i % 8))) != 0;
        HWY_ASSERT_EQ(bit, lanes[i] == 0);
      }
      // Any partial bits in the last byte must be zero
      for (; i < 8 * bytes_written; ++i) {
        const int bit = (bits[i / 8] & (1 << (i % 8)));
        HWY_ASSERT_EQ(bit, 0);
      }
    }
#else
    (void)d;
#endif
  }
};
HWY_NOINLINE void TestAllStoreMaskBits() {
  // Run for every lane type; the functor skips unsupported targets itself.
  const ForPartialVectors<TestStoreMaskBits> mask_bits_test;
  ForAllTypes(mask_bits_test);
}
struct TestCountTrue {
  // CountTrue(mask) must equal the number of lanes deliberately set to zero
  // (the mask is Eq-with-zero, so zero lanes are the true ones).
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    // Exhaustively enumerate zero/nonzero patterns over up to 10 lanes.
    const size_t max_lanes = std::min(N, size_t(10));
    auto lanes = AllocateAligned<T>(N);
    std::fill(lanes.get(), lanes.get() + N, T(1));
    for (size_t code = 0; code < (1ull << max_lanes); ++code) {
      size_t num_zero = 0;
      for (size_t i = 0; i < max_lanes; ++i) {
        const bool is_zero = (code & (1ull << i)) != 0;
        lanes[i] = is_zero ? T(0) : T(1);
        if (is_zero) ++num_zero;
      }
      const auto mask = Eq(Load(d, lanes.get()), Zero(d));
      const size_t actual_count = CountTrue(mask);
      HWY_ASSERT_EQ(num_zero, actual_count);
    }
  }
};
HWY_NOINLINE void TestAllCountTrue() {
  // CountTrue is defined for every lane type.
  const ForPartialVectors<TestCountTrue> count_test;
  ForAllTypes(count_test);
}
struct TestLogicalMask {
  // Verifies mask logic (Not/And/AndNot/Or/Xor) identities for every
  // zero/nonzero pattern over a small number of lanes.
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto mask_none = MaskFalse(d);
    const auto mask_all = MaskTrue(d);
    const size_t N = Lanes(d);
    auto lanes = AllocateAligned<T>(N);
    std::fill(lanes.get(), lanes.get() + N, T(1));
    // Not() flips the constant masks into each other.
    HWY_ASSERT_MASK_EQ(d, mask_none, Not(mask_all));
    HWY_ASSERT_MASK_EQ(d, mask_all, Not(mask_none));
    // For all combinations of zero/nonzero state of subset of lanes:
    const size_t max_lanes = std::min(N, size_t(6));
    for (size_t code = 0; code < (1ull << max_lanes); ++code) {
      for (size_t i = 0; i < max_lanes; ++i) {
        lanes[i] = (code & (1ull << i)) ? T(0) : T(1);
      }
      // m is true where the lane is zero.
      const auto m = Eq(Load(d, lanes.get()), Zero(d));
      // Annihilators: these all yield the empty mask.
      HWY_ASSERT_MASK_EQ(d, mask_none, Xor(m, m));
      HWY_ASSERT_MASK_EQ(d, mask_none, AndNot(m, m));
      HWY_ASSERT_MASK_EQ(d, mask_none, AndNot(mask_all, m));
      // Identities: these all yield m unchanged.
      HWY_ASSERT_MASK_EQ(d, m, Or(m, m));
      HWY_ASSERT_MASK_EQ(d, m, Or(mask_none, m));
      HWY_ASSERT_MASK_EQ(d, m, Or(m, mask_none));
      HWY_ASSERT_MASK_EQ(d, m, Xor(mask_none, m));
      HWY_ASSERT_MASK_EQ(d, m, Xor(m, mask_none));
      HWY_ASSERT_MASK_EQ(d, m, And(m, m));
      HWY_ASSERT_MASK_EQ(d, m, And(mask_all, m));
      HWY_ASSERT_MASK_EQ(d, m, And(m, mask_all));
      HWY_ASSERT_MASK_EQ(d, m, AndNot(mask_none, m));
    }
  }
};
HWY_NOINLINE void TestAllLogicalMask() {
  // Mask logic is defined for every lane type.
  const ForFullVectors<TestLogicalMask> logic_test;
  ForAllTypes(logic_test);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
// Registers each TestAll* entry point so it runs once per compiled-in target.
HWY_BEFORE_TEST(HwyLogicalTest);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalInteger);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalFloat);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCopySign);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllIfThenElse);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllMaskVec);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCompress);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllZeroIfNegative);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllBroadcastSignBit);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllTestBit);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllAllTrueFalse);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllStoreMaskBits);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCountTrue);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalMask);
}  // namespace hwy
#endif

413
third_party/highway/hwy/tests/memory_test.cc поставляемый Normal file
Просмотреть файл

@ -0,0 +1,413 @@
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stddef.h>
#include <stdint.h>
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/memory_test.cc"
#include "hwy/cache_control.h"
#include "hwy/foreach_target.h"
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Round-trips aligned and unaligned Load/Store through a 2*N-lane buffer.
// The checks depend on the exact store order (hi then lo, then overwrites),
// so statements must not be reordered.
struct TestLoadStore {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    const auto hi = Iota(d, 1 + N);
    const auto lo = Iota(d, 1);
    auto lanes = AllocateAligned<T>(2 * N);
    Store(hi, d, &lanes[N]);
    Store(lo, d, &lanes[0]);
    // Aligned load
    const auto lo2 = Load(d, &lanes[0]);
    HWY_ASSERT_VEC_EQ(d, lo2, lo);
    // Aligned store
    auto lanes2 = AllocateAligned<T>(2 * N);
    Store(lo2, d, &lanes2[0]);
    Store(hi, d, &lanes2[N]);
    for (size_t i = 0; i < 2 * N; ++i) {
      HWY_ASSERT_EQ(lanes[i], lanes2[i]);
    }
    // Unaligned load: starts one lane past the aligned base.
    const auto vu = LoadU(d, &lanes[1]);
    auto lanes3 = AllocateAligned<T>(N);
    Store(vu, d, lanes3.get());
    for (size_t i = 0; i < N; ++i) {
      HWY_ASSERT_EQ(T(i + 2), lanes3[i]);
    }
    // Unaligned store: overwrites the middle N lanes of lanes2.
    StoreU(lo2, d, &lanes2[N / 2]);
    size_t i = 0;
    // Lanes before the store are untouched.
    for (; i < N / 2; ++i) {
      HWY_ASSERT_EQ(lanes[i], lanes2[i]);
    }
    // The stored region holds lo2 = 1..N.
    for (; i < 3 * N / 2; ++i) {
      HWY_ASSERT_EQ(T(i - N / 2 + 1), lanes2[i]);
    }
    // Subsequent values remain unchanged.
    for (; i < 2 * N; ++i) {
      HWY_ASSERT_EQ(T(i + 1), lanes2[i]);
    }
  }
};
HWY_NOINLINE void TestAllLoadStore() {
  // Exercise aligned/unaligned loads and stores for every lane type.
  const ForPartialVectors<TestLoadStore> load_store_test;
  ForAllTypes(load_store_test);
}
// Verifies StoreInterleaved3 against a scalar interleave of three random
// byte vectors; also checks nothing past 3*N bytes is written. Data depends
// on RandomState call order; do not reorder the RNG calls.
struct TestStoreInterleaved3 {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    RandomState rng;
    // Data to be interleaved
    auto bytes = AllocateAligned<uint8_t>(3 * N);
    for (size_t i = 0; i < 3 * N; ++i) {
      bytes[i] = static_cast<uint8_t>(Random32(&rng) & 0xFF);
    }
    const auto in0 = Load(d, &bytes[0 * N]);
    const auto in1 = Load(d, &bytes[1 * N]);
    const auto in2 = Load(d, &bytes[2 * N]);
    // Interleave here, ensure vector results match scalar
    auto expected = AllocateAligned<T>(4 * N);
    // Deliberately misaligned destination (aligned base + 1).
    auto actual_aligned = AllocateAligned<T>(4 * N + 1);
    T* actual = actual_aligned.get() + 1;
    for (size_t rep = 0; rep < 100; ++rep) {
      for (size_t i = 0; i < N; ++i) {
        expected[3 * i + 0] = bytes[0 * N + i];
        expected[3 * i + 1] = bytes[1 * N + i];
        expected[3 * i + 2] = bytes[2 * N + i];
        // Ensure we do not write more than 3*N bytes
        expected[3 * N + i] = actual[3 * N + i] = 0;
      }
      StoreInterleaved3(in0, in1, in2, d, actual);
      size_t pos = 0;
      // On mismatch, print the lanes around the first differing triple.
      if (!BytesEqual(expected.get(), actual, 4 * N, &pos)) {
        Print(d, "in0", in0, pos / 3);
        Print(d, "in1", in1, pos / 3);
        Print(d, "in2", in2, pos / 3);
        const size_t i = pos - pos % 3;
        fprintf(stderr, "interleaved %d %d %d %d %d %d\n", actual[i],
                actual[i + 1], actual[i + 2], actual[i + 3], actual[i + 4],
                actual[i + 5]);
        HWY_ASSERT(false);
      }
    }
  }
};
// StoreInterleaved3 only operates on u8 lanes; RVV needs a narrower LMUL cap.
HWY_NOINLINE void TestAllStoreInterleaved3() {
#if HWY_TARGET == HWY_RVV
  // Segments are limited to 8 registers, so we can only go up to LMUL=2.
  const ForExtendableVectors<TestStoreInterleaved3, 4> test;
#else
  const ForPartialVectors<TestStoreInterleaved3> test;
#endif
  test(uint8_t());
}
// Verifies StoreInterleaved4 against a scalar interleave of four random byte
// vectors; also checks nothing past 4*N bytes is written. Data depends on
// RandomState call order; do not reorder the RNG calls.
struct TestStoreInterleaved4 {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    RandomState rng;
    // Data to be interleaved
    auto bytes = AllocateAligned<uint8_t>(4 * N);
    for (size_t i = 0; i < 4 * N; ++i) {
      bytes[i] = static_cast<uint8_t>(Random32(&rng) & 0xFF);
    }
    const auto in0 = Load(d, &bytes[0 * N]);
    const auto in1 = Load(d, &bytes[1 * N]);
    const auto in2 = Load(d, &bytes[2 * N]);
    const auto in3 = Load(d, &bytes[3 * N]);
    // Interleave here, ensure vector results match scalar
    auto expected = AllocateAligned<T>(5 * N);
    // Deliberately misaligned destination (aligned base + 1).
    auto actual_aligned = AllocateAligned<T>(5 * N + 1);
    T* actual = actual_aligned.get() + 1;
    for (size_t rep = 0; rep < 100; ++rep) {
      for (size_t i = 0; i < N; ++i) {
        expected[4 * i + 0] = bytes[0 * N + i];
        expected[4 * i + 1] = bytes[1 * N + i];
        expected[4 * i + 2] = bytes[2 * N + i];
        expected[4 * i + 3] = bytes[3 * N + i];
        // Ensure we do not write more than 4*N bytes
        expected[4 * N + i] = actual[4 * N + i] = 0;
      }
      StoreInterleaved4(in0, in1, in2, in3, d, actual);
      size_t pos = 0;
      // On mismatch, print the lanes around the first differing quad.
      if (!BytesEqual(expected.get(), actual, 5 * N, &pos)) {
        Print(d, "in0", in0, pos / 4);
        Print(d, "in1", in1, pos / 4);
        Print(d, "in2", in2, pos / 4);
        Print(d, "in3", in3, pos / 4);
        const size_t i = pos;
        fprintf(stderr, "interleaved %d %d %d %d %d %d %d %d\n", actual[i],
                actual[i + 1], actual[i + 2], actual[i + 3], actual[i + 4],
                actual[i + 5], actual[i + 6], actual[i + 7]);
        HWY_ASSERT(false);
      }
    }
  }
};
// StoreInterleaved4 only operates on u8 lanes; RVV needs a narrower LMUL cap.
HWY_NOINLINE void TestAllStoreInterleaved4() {
#if HWY_TARGET == HWY_RVV
  // Segments are limited to 8 registers, so we can only go up to LMUL=2.
  const ForExtendableVectors<TestStoreInterleaved4, 4> test;
#else
  const ForPartialVectors<TestStoreInterleaved4> test;
#endif
  test(uint8_t());
}
// LoadDup128 must replicate a 16-byte source into every 128-bit block of the
// result vector.
struct TestLoadDup128 {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    // Scalar does not define LoadDup128.
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
    constexpr size_t N128 = 16 / sizeof(T);
    alignas(16) T lanes[N128];
    for (size_t i = 0; i < N128; ++i) {
      lanes[i] = static_cast<T>(1 + i);
    }
    const auto v = LoadDup128(d, lanes);
    const size_t N = Lanes(d);
    auto out = AllocateAligned<T>(N);
    Store(v, d, out.get());
    // Lane i must repeat the 16-byte pattern: value (i mod N128) + 1.
    for (size_t i = 0; i < N; ++i) {
      HWY_ASSERT_EQ(T(i % N128 + 1), out[i]);
    }
#else
    (void)d;
#endif
  }
};
HWY_NOINLINE void TestAllLoadDup128() {
  // Requires vectors of at least 128 bits.
  const ForGE128Vectors<TestLoadDup128> dup_test;
  ForAllTypes(dup_test);
}
struct TestStream {
  // Stream (non-temporal store) must write exactly the vector's lanes; the
  // remainder of the HWY_STREAM_MULTIPLE-rounded region must stay zero.
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto v = Iota(d, T(1));
    // Round the stored byte count up to a multiple of HWY_STREAM_MULTIPLE.
    const size_t affected_bytes =
        (Lanes(d) * sizeof(T) + HWY_STREAM_MULTIPLE - 1) &
        ~size_t(HWY_STREAM_MULTIPLE - 1);
    const size_t affected_lanes = affected_bytes / sizeof(T);
    auto output = AllocateAligned<T>(2 * affected_lanes);
    std::fill(output.get(), output.get() + 2 * affected_lanes, T(0));
    Stream(v, d, output.get());
    StoreFence();  // make the non-temporal store visible before reading back
    const auto reloaded = Load(d, output.get());
    HWY_ASSERT_VEC_EQ(d, v, reloaded);
    // Ensure Stream didn't modify more memory than expected.
    for (size_t i = affected_lanes; i < 2 * affected_lanes; ++i) {
      HWY_ASSERT_EQ(T(0), output[i]);
    }
  }
};
HWY_NOINLINE void TestAllStream() {
  const ForPartialVectors<TestStream> stream_test;
  // Stream is not defined for 8/16-bit lanes; run 32/64-bit ints and floats.
  stream_test(uint32_t());
  stream_test(uint64_t());
  stream_test(int32_t());
  stream_test(int64_t());
  ForFloatTypes(stream_test);
}
// Assumes little-endian byte order!
// Verifies ScatterOffset (byte offsets) and ScatterIndex (lane indices)
// against a scalar scatter model. Offsets are random and may collide; the
// model applies them in the same lane order, so it matches only if the
// hardware scatter also resolves collisions in ascending lane order —
// NOTE(review): presumably guaranteed by the Scatter contract; confirm.
// RNG call order determines the data; do not reorder loops.
struct TestScatter {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    using Offset = MakeSigned<T>;
    const size_t N = Lanes(d);
    const size_t range = 4 * N;                  // number of items to scatter
    const size_t max_bytes = range * sizeof(T);  // upper bound on offset
    RandomState rng;
    // Data to be scattered
    auto bytes = AllocateAligned<uint8_t>(max_bytes);
    for (size_t i = 0; i < max_bytes; ++i) {
      bytes[i] = static_cast<uint8_t>(Random32(&rng) & 0xFF);
    }
    const auto data = Load(d, reinterpret_cast<const T*>(bytes.get()));
    // Scatter into these regions, ensure vector results match scalar
    auto expected = AllocateAligned<T>(range);
    auto actual = AllocateAligned<T>(range);
    const Rebind<Offset, D> d_offsets;
    auto offsets = AllocateAligned<Offset>(N);  // or indices
    for (size_t rep = 0; rep < 100; ++rep) {
      // Byte offsets
      std::fill(expected.get(), expected.get() + range, T(0));
      std::fill(actual.get(), actual.get() + range, T(0));
      for (size_t i = 0; i < N; ++i) {
        // Offsets need not be lane-aligned (hence the byte-level model).
        offsets[i] =
            static_cast<Offset>(Random32(&rng) % (max_bytes - sizeof(T)));
        CopyBytes<sizeof(T)>(
            bytes.get() + i * sizeof(T),
            reinterpret_cast<uint8_t*>(expected.get()) + offsets[i]);
      }
      const auto voffsets = Load(d_offsets, offsets.get());
      ScatterOffset(data, d, actual.get(), voffsets);
      if (!BytesEqual(expected.get(), actual.get(), max_bytes)) {
        Print(d, "Data", data);
        Print(d_offsets, "Offsets", voffsets);
        HWY_ASSERT(false);
      }
      // Indices
      std::fill(expected.get(), expected.get() + range, T(0));
      std::fill(actual.get(), actual.get() + range, T(0));
      for (size_t i = 0; i < N; ++i) {
        offsets[i] = static_cast<Offset>(Random32(&rng) % range);
        CopyBytes<sizeof(T)>(bytes.get() + i * sizeof(T),
                             &expected[offsets[i]]);
      }
      const auto vindices = Load(d_offsets, offsets.get());
      ScatterIndex(data, d, actual.get(), vindices);
      if (!BytesEqual(expected.get(), actual.get(), max_bytes)) {
        Print(d, "Data", data);
        Print(d_offsets, "Indices", vindices);
        HWY_ASSERT(false);
      }
    }
  }
};
// Scatter supports only 32/64-bit integer lanes and floats.
HWY_NOINLINE void TestAllScatter() {
  // No u8,u16,i8,i16.
  const ForPartialVectors<TestScatter> test;
  test(uint32_t());
  test(int32_t());
#if HWY_CAP_INTEGER64
  test(uint64_t());
  test(int64_t());
#endif
  ForFloatTypes(test);
}
// Verifies GatherOffset (byte offsets) and GatherIndex (lane indices) against
// a scalar gather from random bytes. RNG call order determines the data.
struct TestGather {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    using Offset = MakeSigned<T>;
    const size_t N = Lanes(d);
    RandomState rng;
    // Data to be gathered from
    const size_t max_bytes = 4 * N * sizeof(T);  // upper bound on offset
    auto bytes = AllocateAligned<uint8_t>(max_bytes);
    for (size_t i = 0; i < max_bytes; ++i) {
      bytes[i] = static_cast<uint8_t>(Random32(&rng) & 0xFF);
    }
    auto expected = AllocateAligned<T>(N);
    auto offsets = AllocateAligned<Offset>(N);
    auto indices = AllocateAligned<Offset>(N);
    for (size_t rep = 0; rep < 100; ++rep) {
      // Offsets: arbitrary byte positions (need not be lane-aligned).
      for (size_t i = 0; i < N; ++i) {
        offsets[i] =
            static_cast<Offset>(Random32(&rng) % (max_bytes - sizeof(T)));
        CopyBytes<sizeof(T)>(bytes.get() + offsets[i], &expected[i]);
      }
      const Rebind<Offset, D> d_offset;
      const T* base = reinterpret_cast<const T*>(bytes.get());
      auto actual = GatherOffset(d, base, Load(d_offset, offsets.get()));
      HWY_ASSERT_VEC_EQ(d, expected.get(), actual);
      // Indices: whole-lane positions relative to base.
      for (size_t i = 0; i < N; ++i) {
        indices[i] =
            static_cast<Offset>(Random32(&rng) % (max_bytes / sizeof(T)));
        CopyBytes<sizeof(T)>(base + indices[i], &expected[i]);
      }
      actual = GatherIndex(d, base, Load(d_offset, indices.get()));
      HWY_ASSERT_VEC_EQ(d, expected.get(), actual);
    }
  }
};
// Gather supports only 32/64-bit integer lanes and floats.
HWY_NOINLINE void TestAllGather() {
  // No u8,u16,i8,i16.
  const ForPartialVectors<TestGather> test;
  test(uint32_t());
  test(int32_t());
#if HWY_CAP_INTEGER64
  test(uint64_t());
  test(int64_t());
#endif
  ForFloatTypes(test);
}
HWY_NOINLINE void TestAllCache() {
  // Smoke test for cache-control ops: they must compile and run without
  // crashing; there is no observable result to assert.
  LoadFence();
  StoreFence();
  int dummy = 0;
  Prefetch(&dummy);
  FlushCacheline(&dummy);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE
namespace hwy {
// Registers each TestAll* entry point so it runs once per compiled-in target.
HWY_BEFORE_TEST(HwyMemoryTest);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadStore);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStoreInterleaved3);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStoreInterleaved4);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadDup128);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStream);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllScatter);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllGather);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllCache);
}  // namespace hwy
#endif

642
third_party/highway/hwy/tests/swizzle_test.cc поставляемый Normal file
Просмотреть файл

@ -0,0 +1,642 @@
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/swizzle_test.cc"
#include "hwy/foreach_target.h"
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Verifies ShiftLeftBytes/ShiftRightBytes against a byte-level memcpy model
// applied independently to each 128-bit block.
struct TestShiftBytes {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    // Scalar does not define Shift*Bytes.
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
    const Repartition<uint8_t, D> du8;
    const size_t N8 = Lanes(du8);
    // Zero remains zero
    const auto v0 = Zero(d);
    HWY_ASSERT_VEC_EQ(d, v0, ShiftLeftBytes<1>(v0));
    HWY_ASSERT_VEC_EQ(d, v0, ShiftRightBytes<1>(v0));
    // Zero after shifting out the high/low byte
    auto bytes = AllocateAligned<uint8_t>(N8);
    std::fill(bytes.get(), bytes.get() + N8, 0);
    bytes[N8 - 1] = 0x7F;
    const auto vhi = BitCast(d, Load(du8, bytes.get()));
    bytes[N8 - 1] = 0;
    bytes[0] = 0x7F;
    const auto vlo = BitCast(d, Load(du8, bytes.get()));
    HWY_ASSERT_VEC_EQ(d, v0, ShiftLeftBytes<1>(vhi));
    HWY_ASSERT_VEC_EQ(d, v0, ShiftRightBytes<1>(vlo));
    // Check expected result with Iota
    const size_t N = Lanes(d);
    auto in = AllocateAligned<T>(N);
    const uint8_t* in_bytes = reinterpret_cast<const uint8_t*>(in.get());
    const auto v = BitCast(d, Iota(du8, 1));
    Store(v, d, in.get());
    auto expected = AllocateAligned<T>(N);
    uint8_t* expected_bytes = reinterpret_cast<uint8_t*>(expected.get());
    // Shifts operate within each 128-bit (16-byte) block.
    const size_t kBlockSize = HWY_MIN(N8, 16);
    for (size_t block = 0; block < N8; block += kBlockSize) {
      expected_bytes[block] = 0;
      memcpy(expected_bytes + block + 1, in_bytes + block, kBlockSize - 1);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftBytes<1>(v));
    for (size_t block = 0; block < N8; block += kBlockSize) {
      memcpy(expected_bytes + block, in_bytes + block + 1, kBlockSize - 1);
      expected_bytes[block + kBlockSize - 1] = 0;
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightBytes<1>(v));
#else
    (void)d;
#endif  // #if HWY_TARGET != HWY_SCALAR
  }
};
HWY_NOINLINE void TestAllShiftBytes() {
  // Byte shifts are only defined for integer lanes and >=128-bit vectors.
  const ForGE128Vectors<TestShiftBytes> shift_test;
  ForIntegerTypes(shift_test);
}
struct TestShiftLanes {
  // ShiftLeftLanes/ShiftRightLanes move whole lanes within each 128-bit
  // block, shifting in zeros at the block boundary.
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    // Scalar does not define Shift*Lanes.
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
    const auto v = Iota(d, T(1));
    const size_t N = Lanes(d);
    auto expected = AllocateAligned<T>(N);
    // Shifting by zero lanes is the identity.
    HWY_ASSERT_VEC_EQ(d, v, ShiftLeftLanes<0>(v));
    HWY_ASSERT_VEC_EQ(d, v, ShiftRightLanes<0>(v));
    constexpr size_t kLanesPerBlock = 16 / sizeof(T);
    // Left by one: the first lane of each block becomes zero.
    for (size_t i = 0; i < N; ++i) {
      const bool first_in_block = (i % kLanesPerBlock) == 0;
      expected[i] = first_in_block ? T(0) : T(i);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftLanes<1>(v));
    // Right by one: the last lane of each block becomes zero.
    for (size_t i = 0; i < N; ++i) {
      const bool last_in_block =
          (i % kLanesPerBlock) == (kLanesPerBlock - 1);
      expected[i] = last_in_block ? T(0) : T(2 + i);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightLanes<1>(v));
#else
    (void)d;
#endif  // #if HWY_TARGET != HWY_SCALAR
  }
};
HWY_NOINLINE void TestAllShiftLanes() {
  // Lane shifts require >=128-bit vectors; defined for every lane type.
  const ForGE128Vectors<TestShiftLanes> shift_test;
  ForAllTypes(shift_test);
}
// Recursively tests Broadcast<kLane> for every lane index down to 0; the
// recursion terminates at the kLane == -1 specialization below.
template <typename D, int kLane>
struct TestBroadcastR {
  HWY_NOINLINE void operator()() const {
// TODO(janwas): fix failure
#if HWY_TARGET != HWY_WASM
    using T = typename D::T;
    const D d;
    const size_t N = Lanes(d);
    auto in_lanes = AllocateAligned<T>(N);
    std::fill(in_lanes.get(), in_lanes.get() + N, T(0));
    const size_t blockN = HWY_MIN(N * sizeof(T), 16) / sizeof(T);
    // Need to set within each 128-bit block
    for (size_t block = 0; block < N; block += blockN) {
      // Distinct marker per block so cross-block leakage is detectable.
      in_lanes[block + kLane] = static_cast<T>(block + 1);
    }
    const auto in = Load(d, in_lanes.get());
    auto expected = AllocateAligned<T>(N);
    // Broadcast replicates lane kLane within each block.
    for (size_t block = 0; block < N; block += blockN) {
      for (size_t i = 0; i < blockN; ++i) {
        expected[block + i] = T(block + 1);
      }
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Broadcast<kLane>(in));
    // Recurse to the next-lower lane index.
    TestBroadcastR<D, kLane - 1>()();
#endif
  }
};
// Recursion base case: terminates TestBroadcastR's descent at lane -1.
template <class D>
struct TestBroadcastR<D, -1> {
  void operator()() const {}
};
// Entry point: tests all broadcastable lane indices within one 128-bit block
// (at most 16/sizeof(T), capped by the actual lane count).
struct TestBroadcast {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    TestBroadcastR<D, HWY_MIN(MaxLanes(d), 16 / sizeof(T)) - 1>()();
  }
};
// Broadcast is not defined for 8-bit lanes; 64-bit lanes need capability.
HWY_NOINLINE void TestAllBroadcast() {
  const ForPartialVectors<TestBroadcast> test;
  // No u8.
  test(uint16_t());
  test(uint32_t());
#if HWY_CAP_INTEGER64
  test(uint64_t());
#endif
  // No i8.
  test(int16_t());
  test(int32_t());
#if HWY_CAP_INTEGER64
  test(int64_t());
#endif
  ForFloatTypes(test);
}
// Verifies TableLookupBytes against a scalar per-16-byte-block byte shuffle
// using random input bytes and a fixed pattern of index bytes.
struct TestTableLookupBytes {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    RandomState rng;
    const size_t N = Lanes(d);
    const size_t N8 = Lanes(Repartition<uint8_t, D>());
    auto in_bytes = AllocateAligned<uint8_t>(N8);
    for (size_t i = 0; i < N8; ++i) {
      in_bytes[i] = Random32(&rng) & 0xFF;
    }
    const auto in =
        BitCast(d, Load(d, reinterpret_cast<const T*>(in_bytes.get())));
    // Enough test data; for larger vectors, upper lanes will be zero.
    // (48 initializers; the remaining 16 array elements are zero.)
    const uint8_t index_bytes_source[64] = {
        // Same index as source, multiple outputs from same input,
        // unused input (9), ascending/descending and nonconsecutive neighbors.
        0,  2,  1, 2, 15, 12, 13, 14, 6,  7,  8,  5,  4,  3,  10, 11,
        11, 10, 3, 4, 5,  8,  7,  6,  14, 13, 12, 15, 2,  1,  2,  0,
        4,  3,  2, 2, 5,  6,  7,  7,  15, 15, 15, 15, 15, 15, 0,  1};
    auto index_bytes = AllocateAligned<uint8_t>(N8);
    for (size_t i = 0; i < N8; ++i) {
      index_bytes[i] = (i < 64) ? index_bytes_source[i] : 0;
      // Avoid undefined results / asan error for scalar by capping indices.
      if (index_bytes[i] >= N * sizeof(T)) {
        index_bytes[i] = static_cast<uint8_t>(N * sizeof(T) - 1);
      }
    }
    const auto indices = Load(d, reinterpret_cast<const T*>(index_bytes.get()));
    auto expected = AllocateAligned<T>(N);
    uint8_t* expected_bytes = reinterpret_cast<uint8_t*>(expected.get());
    // Byte indices wrap around
    const size_t mod = HWY_MIN(N8, 256);
    // Scalar model: each 16-byte block looks up within its own block.
    for (size_t block = 0; block < N8; block += 16) {
      for (size_t i = 0; i < 16 && (block + i) < N8; ++i) {
        const uint8_t index = index_bytes[block + i];
        expected_bytes[block + i] = in_bytes[(block + index) % mod];
      }
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), TableLookupBytes(in, indices));
  }
};
HWY_NOINLINE void TestAllTableLookupBytes() {
  // Byte shuffles are only defined for integer lanes.
  const ForPartialVectors<TestTableLookupBytes> lookup_test;
  ForIntegerTypes(lookup_test);
}
// Verifies TableLookupLanes: for small vectors, exhaustively permutes the
// first four lane indices; otherwise uses one fixed cross-block pattern.
// NOTE(review): the permutation loops write idx[0..3]; this assumes N >= 4,
// which holds for ForFullVectors with 32-bit lanes — confirm for new callers.
struct TestTableLookupLanes {
#if HWY_TARGET == HWY_RVV
  using Index = uint32_t;
#else
  using Index = int32_t;
#endif
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
#if HWY_TARGET != HWY_SCALAR
    const size_t N = Lanes(d);
    auto idx = AllocateAligned<Index>(N);
    std::fill(idx.get(), idx.get() + N, Index(0));
    auto expected = AllocateAligned<T>(N);
    const auto v = Iota(d, 1);
    if (N <= 8) {  // Test all permutations
      for (size_t i0 = 0; i0 < N; ++i0) {
        idx[0] = static_cast<Index>(i0);
        for (size_t i1 = 0; i1 < N; ++i1) {
          idx[1] = static_cast<Index>(i1);
          for (size_t i2 = 0; i2 < N; ++i2) {
            idx[2] = static_cast<Index>(i2);
            for (size_t i3 = 0; i3 < N; ++i3) {
              idx[3] = static_cast<Index>(i3);
              for (size_t i = 0; i < N; ++i) {
                expected[i] = static_cast<T>(idx[i] + 1);  // == v[idx[i]]
              }
              const auto opaque = SetTableIndices(d, idx.get());
              const auto actual = TableLookupLanes(v, opaque);
              HWY_ASSERT_VEC_EQ(d, expected.get(), actual);
            }
          }
        }
      }
    } else {
      // Too many permutations to test exhaustively; choose one with repeated
      // and cross-block indices and ensure indices do not exceed #lanes.
      // For larger vectors, upper lanes will be zero.
      HWY_ALIGN Index idx_source[16] = {1,  3,  2,  2,  8, 1, 7, 6,
                                        15, 14, 14, 15, 4, 9, 8, 5};
      for (size_t i = 0; i < N; ++i) {
        idx[i] = (i < 16) ? idx_source[i] : 0;
        // Avoid undefined results / asan error for scalar by capping indices.
        if (idx[i] >= static_cast<Index>(N)) {
          idx[i] = static_cast<Index>(N - 1);
        }
        expected[i] = static_cast<T>(idx[i] + 1);  // == v[idx[i]]
      }
      const auto opaque = SetTableIndices(d, idx.get());
      const auto actual = TableLookupLanes(v, opaque);
      HWY_ASSERT_VEC_EQ(d, expected.get(), actual);
    }
#else
    (void)d;
#endif
  }
};
HWY_NOINLINE void TestAllTableLookupLanes() {
  // Only 32-bit lane types are exercised here.
  const ForFullVectors<TestTableLookupLanes> lookup_test;
  lookup_test(uint32_t());
  lookup_test(int32_t());
  lookup_test(float());
}
// Verifies InterleaveLower/InterleaveUpper against a scalar model: even and
// odd vectors interleave into an ascending sequence within each 128-bit
// block.
struct TestInterleave {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    using TU = MakeUnsigned<T>;
    const size_t N = Lanes(d);
    auto even_lanes = AllocateAligned<T>(N);
    auto odd_lanes = AllocateAligned<T>(N);
    auto expected = AllocateAligned<T>(N);
    for (size_t i = 0; i < N; ++i) {
      even_lanes[i] = static_cast<T>(2 * i + 0);
      odd_lanes[i] = static_cast<T>(2 * i + 1);
    }
    const auto even = Load(d, even_lanes.get());
    const auto odd = Load(d, odd_lanes.get());
    // Interleaving operates per 128-bit block.
    const size_t blockN = 16 / sizeof(T);
    // Use the cached N (previously re-evaluated Lanes(d) per iteration).
    for (size_t i = 0; i < N; ++i) {
      const size_t block = i / blockN;
      const size_t index = (i % blockN) + block * 2 * blockN;
      expected[i] = static_cast<T>(index & LimitsMax<TU>());
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveLower(even, odd));
    for (size_t i = 0; i < N; ++i) {
      const size_t block = i / blockN;
      const size_t index = (i % blockN) + block * 2 * blockN + blockN;
      // Mask before narrowing, same as the lower half above (the original
      // cast directly via T(), inconsistently with the first loop).
      expected[i] = static_cast<T>(index & LimitsMax<TU>());
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveUpper(even, odd));
  }
};
HWY_NOINLINE void TestAllInterleave() {
  // Not supported by HWY_SCALAR: Interleave(f32, f32) would return f32x2.
  const ForGE128Vectors<TestInterleave> interleave_test;
  ForAllTypes(interleave_test);
}
// Verifies ZipLower: pairs of narrow lanes from the lower half of each
// 128-bit block combine into lanes of twice the width (little-endian: the
// even/"lo" lane occupies the low bits).
struct TestZipLower {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    using WideT = MakeWide<T>;
    static_assert(sizeof(T) * 2 == sizeof(WideT), "Must be double-width");
    static_assert(IsSigned<T>() == IsSigned<WideT>(), "Must have same sign");
    const size_t N = Lanes(d);
    auto even_lanes = AllocateAligned<T>(N);
    auto odd_lanes = AllocateAligned<T>(N);
    for (size_t i = 0; i < N; ++i) {
      even_lanes[i] = static_cast<T>(2 * i + 0);
      odd_lanes[i] = static_cast<T>(2 * i + 1);
    }
    const auto even = Load(d, even_lanes.get());
    const auto odd = Load(d, odd_lanes.get());
    const Repartition<WideT, D> dw;
    // Hoisted: previously Lanes(dw) was evaluated for the allocation and on
    // every loop iteration.
    const size_t NW = Lanes(dw);
    auto expected = AllocateAligned<WideT>(NW);
    // constexpr for consistency with TestZipUpper.
    constexpr WideT blockN = static_cast<WideT>(16 / sizeof(WideT));
    for (size_t i = 0; i < NW; ++i) {
      const size_t block = i / blockN;
      // Value of least-significant lane in lo-vector.
      const WideT lo =
          static_cast<WideT>(2 * (i % blockN) + 4 * block * blockN);
      const WideT kBits = static_cast<WideT>(sizeof(T) * 8);
      expected[i] =
          static_cast<WideT>((static_cast<WideT>(lo + 1) << kBits) + lo);
    }
    HWY_ASSERT_VEC_EQ(dw, expected.get(), ZipLower(even, odd));
  }
};
// Verifies ZipUpper: like ZipLower but combines the upper half of each
// 128-bit block (hence the +2*blockN bias in the model).
struct TestZipUpper {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    using WideT = MakeWide<T>;
    static_assert(sizeof(T) * 2 == sizeof(WideT), "Must be double-width");
    static_assert(IsSigned<T>() == IsSigned<WideT>(), "Must have same sign");
    const size_t N = Lanes(d);
    auto even_lanes = AllocateAligned<T>(N);
    auto odd_lanes = AllocateAligned<T>(N);
    // Use the cached N (the original re-evaluated Lanes(d) here,
    // inconsistently with TestZipLower).
    for (size_t i = 0; i < N; ++i) {
      even_lanes[i] = static_cast<T>(2 * i + 0);
      odd_lanes[i] = static_cast<T>(2 * i + 1);
    }
    const auto even = Load(d, even_lanes.get());
    const auto odd = Load(d, odd_lanes.get());
    const Repartition<WideT, D> dw;
    // Hoisted: previously Lanes(dw) was evaluated for the allocation and on
    // every loop iteration.
    const size_t NW = Lanes(dw);
    auto expected = AllocateAligned<WideT>(NW);
    constexpr WideT blockN = static_cast<WideT>(16 / sizeof(WideT));
    for (size_t i = 0; i < NW; ++i) {
      const size_t block = i / blockN;
      const WideT lo =
          static_cast<WideT>(2 * (i % blockN) + 4 * block * blockN);
      const WideT kBits = static_cast<WideT>(sizeof(T) * 8);
      expected[i] = static_cast<WideT>(
          (static_cast<WideT>(lo + 2 * blockN + 1) << kBits) + lo +
          2 * blockN);
    }
    HWY_ASSERT_VEC_EQ(dw, expected.get(), ZipUpper(even, odd));
  }
};
// Runs ZipLower/ZipUpper for every integer lane type whose double-width
// MakeWide<T> exists on this target.
HWY_NOINLINE void TestAllZip() {
  const ForPartialVectors<TestZipLower, 2> lower_unsigned;
  // TODO(janwas): fix
#if HWY_TARGET != HWY_RVV
  lower_unsigned(uint8_t());
#endif
  lower_unsigned(uint16_t());
#if HWY_CAP_INTEGER64
  lower_unsigned(uint32_t());  // generates u64
#endif

  const ForPartialVectors<TestZipLower, 2> lower_signed;
#if HWY_TARGET != HWY_RVV
  lower_signed(int8_t());
#endif
  lower_signed(int16_t());
#if HWY_CAP_INTEGER64
  lower_signed(int32_t());  // generates i64
#endif

  const ForGE128Vectors<TestZipUpper> upper_unsigned;
#if HWY_TARGET != HWY_RVV
  upper_unsigned(uint8_t());
#endif
  upper_unsigned(uint16_t());
#if HWY_CAP_INTEGER64
  upper_unsigned(uint32_t());  // generates u64
#endif

  const ForGE128Vectors<TestZipUpper> upper_signed;
#if HWY_TARGET != HWY_RVV
  upper_signed(int8_t());
#endif
  upper_signed(int16_t());
#if HWY_CAP_INTEGER64
  upper_signed(int32_t());  // generates i64
#endif

  // No float - concatenating f32 does not result in a f64
}
// Checks the fixed-pattern 32-bit shuffles against an iota input.
class TestSpecialShuffle32 {
 public:
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto v = Iota(d, 0);
    VerifyLanes32(d, Shuffle2301(v), 2, 3, 0, 1, __FILE__, __LINE__);
    VerifyLanes32(d, Shuffle1032(v), 1, 0, 3, 2, __FILE__, __LINE__);
    VerifyLanes32(d, Shuffle0321(v), 0, 3, 2, 1, __FILE__, __LINE__);
    VerifyLanes32(d, Shuffle2103(v), 2, 1, 0, 3, __FILE__, __LINE__);
    VerifyLanes32(d, Shuffle0123(v), 0, 1, 2, 3, __FILE__, __LINE__);
  }

 private:
  // Asserts each 128-bit block of `v` holds lanes [i3 i2 i1 i0] plus the
  // block's base index.
  template <class D, class V>
  HWY_NOINLINE void VerifyLanes32(D d, V v, const int i3, const int i2,
                                  const int i1, const int i0,
                                  const char* filename, const int line) {
    using T = typename D::T;
    const size_t N = Lanes(d);
    auto actual = AllocateAligned<T>(N);
    Store(v, d, actual.get());
    const std::string name = TypeName(actual[0], N);
    constexpr size_t kBlockN = 16 / sizeof(T);
    for (size_t block = 0; block < N; block += kBlockN) {
      const int base = static_cast<int>(block);
      AssertEqual(T(base + i3), actual[block + 3], name, filename, line);
      AssertEqual(T(base + i2), actual[block + 2], name, filename, line);
      AssertEqual(T(base + i1), actual[block + 1], name, filename, line);
      AssertEqual(T(base + i0), actual[block + 0], name, filename, line);
    }
  }
};
// Checks the 64-bit lane-pair swap (Shuffle01) against an iota input.
class TestSpecialShuffle64 {
 public:
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto v = Iota(d, 0);
    VerifyLanes64(d, Shuffle01(v), 0, 1, __FILE__, __LINE__);
  }

 private:
  // Asserts each 128-bit block of `v` holds lanes [i1 i0] plus the block's
  // base index.
  template <class D, class V>
  HWY_NOINLINE void VerifyLanes64(D d, V v, const int i1, const int i0,
                                  const char* filename, const int line) {
    using T = typename D::T;
    const size_t N = Lanes(d);
    auto actual = AllocateAligned<T>(N);
    Store(v, d, actual.get());
    const std::string name = TypeName(actual[0], N);
    constexpr size_t kBlockN = 16 / sizeof(T);
    for (size_t block = 0; block < N; block += kBlockN) {
      const int base = static_cast<int>(block);
      AssertEqual(T(base + i1), actual[block + 1], name, filename, line);
      AssertEqual(T(base + i0), actual[block + 0], name, filename, line);
    }
  }
};
// Runs the fixed-pattern shuffle tests for every supported 32/64-bit type.
HWY_NOINLINE void TestAllSpecialShuffles() {
  const ForGE128Vectors<TestSpecialShuffle32> test32;
  test32(uint32_t());
  test32(int32_t());
  test32(float());

#if HWY_CAP_INTEGER64
  const ForGE128Vectors<TestSpecialShuffle64> test64;
  test64(uint64_t());
  test64(int64_t());
#endif

#if HWY_CAP_FLOAT64
  const ForGE128Vectors<TestSpecialShuffle64> test_d;
  test_d(double());
#endif
}
// ConcatLowerLower / ConcatUpperUpper: builds hi/lo inputs whose selected
// halves concatenate to Iota(1).
struct TestConcatHalves {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    // TODO(janwas): fix
#if HWY_TARGET != HWY_RVV
    // Construct inputs such that interleaved halves == iota.
    const auto expected = Iota(d, 1);
    const size_t N = Lanes(d);
    const size_t half = N / 2;
    auto lo = AllocateAligned<T>(N);
    auto hi = AllocateAligned<T>(N);

    // Payload in the lower halves, zeros in the upper halves.
    for (size_t i = 0; i < N; ++i) {
      lo[i] = (i < half) ? static_cast<T>(1 + i) : T(0);
      hi[i] = (i < half) ? static_cast<T>(lo[i] + T(N) / 2) : T(0);
    }
    HWY_ASSERT_VEC_EQ(d, expected,
                      ConcatLowerLower(Load(d, hi.get()), Load(d, lo.get())));

    // Same for high blocks: payload in the upper halves, zeros below.
    for (size_t i = 0; i < N; ++i) {
      lo[i] = (i < half) ? T(0) : static_cast<T>(1 + i - half);
      hi[i] = (i < half) ? T(0) : static_cast<T>(lo[i] + T(N) / 2);
    }
    HWY_ASSERT_VEC_EQ(d, expected,
                      ConcatUpperUpper(Load(d, hi.get()), Load(d, lo.get())));
#else
    (void)d;
#endif
  }
};
// Runs TestConcatHalves for all lane types on >= 128-bit vectors.
HWY_NOINLINE void TestAllConcatHalves() {
  ForAllTypes(ForGE128Vectors<TestConcatHalves>());
}
// ConcatLowerUpper: selects the upper half of `lo` and the lower half of
// `hi`, i.e. the middle of the concatenation.
struct TestConcatLowerUpper {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    // TODO(janwas): fix
#if HWY_TARGET != HWY_RVV
    const size_t num_lanes = Lanes(d);
    // Middle part of Iota(1) == Iota(1 + N / 2).
    const auto bottom = Iota(d, 1);
    const auto top = Iota(d, 1 + num_lanes);
    HWY_ASSERT_VEC_EQ(d, Iota(d, 1 + num_lanes / 2),
                      ConcatLowerUpper(top, bottom));
#else
    (void)d;
#endif
  }
};
// Runs TestConcatLowerUpper for all lane types on >= 128-bit vectors.
HWY_NOINLINE void TestAllConcatLowerUpper() {
  ForAllTypes(ForGE128Vectors<TestConcatLowerUpper>());
}
// ConcatUpperLower: lower half comes from `lo`, upper half from `hi`.
struct TestConcatUpperLower {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    const auto lo = Iota(d, 1);
    const auto hi = Iota(d, 1 + N);
    auto expected = AllocateAligned<T>(N);
    // Lower half = 1 + i (from lo); upper half = 1 + i + N (from hi).
    for (size_t i = 0; i < N; ++i) {
      expected[i] = static_cast<T>((i < N / 2) ? (1 + i) : (1 + i + N));
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), ConcatUpperLower(hi, lo));
  }
};
// Runs TestConcatUpperLower for all lane types on >= 128-bit vectors.
HWY_NOINLINE void TestAllConcatUpperLower() {
  ForAllTypes(ForGE128Vectors<TestConcatUpperLower>());
}
// OddEven(odd, even): odd-indexed lanes come from `odd`, even-indexed lanes
// from `even`.
struct TestOddEven {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    const auto even = Iota(d, 1);
    const auto odd = Iota(d, 1 + N);
    auto expected = AllocateAligned<T>(N);
    for (size_t i = 0; i < N; ++i) {
      // Lanes from `odd` carry the extra +N offset.
      const size_t offset = (i % 2 == 1) ? N : 0;
      expected[i] = static_cast<T>(1 + i + offset);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), OddEven(odd, even));
  }
};
// Runs TestOddEven for all lane types on >= 128-bit vectors.
HWY_NOINLINE void TestAllOddEven() {
  ForAllTypes(ForGE128Vectors<TestOddEven>());
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE

// Registration (emitted in only one compilation pass): instantiate the gtest
// suite and dispatch each exported test to the selected target.
namespace hwy {
HWY_BEFORE_TEST(HwySwizzleTest);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllShiftBytes);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllShiftLanes);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllBroadcast);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllTableLookupBytes);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllTableLookupLanes);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllInterleave);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllZip);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllSpecialShuffles);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllConcatHalves);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllConcatLowerUpper);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllConcatUpperLower);
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllOddEven);
}  // namespace hwy
#endif

580
third_party/highway/hwy/tests/test_util-inl.h поставляемый Normal file
Просмотреть файл

@ -0,0 +1,580 @@
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Normal include guard for non-SIMD portion of this header.
#ifndef HWY_TESTS_TEST_UTIL_H_
#define HWY_TESTS_TEST_UTIL_H_
// Helper functions for use by *_test.cc.
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <cstddef>
#include <string>
#include <utility> // std::forward
#include "hwy/aligned_allocator.h"
#include "hwy/base.h"
#include "hwy/highway.h"
#include "gtest/gtest.h"
namespace hwy {
// The maximum vector size used in tests when defining test data. DEPRECATED.
constexpr size_t kTestMaxVectorSize = 64;

// googletest before 1.10 didn't define INSTANTIATE_TEST_SUITE_P() but instead
// used INSTANTIATE_TEST_CASE_P which is now deprecated. Both take the same
// arguments, so alias whichever is available.
#ifdef INSTANTIATE_TEST_SUITE_P
#define HWY_GTEST_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_SUITE_P
#else
#define HWY_GTEST_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_CASE_P
#endif
// Helper class to run parametric tests using the hwy target as parameter. To
// use this define the following in your test:
// class MyTestSuite : public TestWithParamTarget {
// ...
// };
// HWY_TARGET_INSTANTIATE_TEST_SUITE_P(MyTestSuite);
// TEST_P(MyTestSuite, MyTest) { ... }
// gtest fixture whose parameter is a HWY_* target bit; each test body runs
// once per target (see usage comment above).
class TestWithParamTarget : public testing::TestWithParam<uint32_t> {
 protected:
  // Restrict dynamic dispatch to the target under test.
  void SetUp() override { SetSupportedTargetsForTest(GetParam()); }
  void TearDown() override {
    // Check that the parametric test calls SupportedTargets() when the source
    // was compiled with more than one target. In the single-target case only
    // static dispatch will be used anyway.
#if (HWY_TARGETS & (HWY_TARGETS - 1)) != 0
    EXPECT_TRUE(SupportedTargetsCalledForTest())
        << "This hwy target parametric test doesn't use dynamic-dispatch and "
           "doesn't need to be parametric.";
#endif
    // 0 restores the default (no per-test restriction).
    SetSupportedTargetsForTest(0);
  }
};
// Function to convert the test parameter of a TestWithParamTarget for
// displaying it in the gtest test name.
static inline std::string TestParamTargetName(
    const testing::TestParamInfo<uint32_t>& info) {
  return TargetName(info.param);
}

// Instantiates `suite` once per target that is both compiled in and supported
// by the current CPU.
#define HWY_TARGET_INSTANTIATE_TEST_SUITE_P(suite)              \
  HWY_GTEST_INSTANTIATE_TEST_SUITE_P(                           \
      suite##Group, suite,                                      \
      testing::ValuesIn(::hwy::SupportedAndGeneratedTargets()), \
      ::hwy::TestParamTargetName)
// Helper class similar to TestWithParamTarget to run parametric tests that
// depend on the target and another parametric test. If you need to use multiple
// extra parameters use a std::tuple<> of them and ::testing::Combine(...) as
// the generator. To use this class define the following in your test:
// class MyTestSuite : public TestWithParamTargetAndT<int> {
// ...
// };
// HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T(MyTestSuite, ::testing::Range(0, 9));
// TEST_P(MyTestSuite, MyTest) { ... GetParam() .... }
// Fixture for tests parameterized by (target, T); see usage comment above.
template <typename T>
class TestWithParamTargetAndT
    : public ::testing::TestWithParam<std::tuple<uint32_t, T>> {
 public:
  // Expose the parametric type here so it can be used by the
  // HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T macro.
  using HwyParamType = T;

 protected:
  void SetUp() override {
    // First tuple element is the target bit; restrict dispatch to it.
    SetSupportedTargetsForTest(std::get<0>(
        ::testing::TestWithParam<std::tuple<uint32_t, T>>::GetParam()));
  }
  void TearDown() override {
    // Check that the parametric test calls SupportedTargets() when the source
    // was compiled with more than one target. In the single-target case only
    // static dispatch will be used anyway.
#if (HWY_TARGETS & (HWY_TARGETS - 1)) != 0
    EXPECT_TRUE(SupportedTargetsCalledForTest())
        << "This hwy target parametric test doesn't use dynamic-dispatch and "
           "doesn't need to be parametric.";
#endif
    SetSupportedTargetsForTest(0);
  }

  // Returns the T part of the (target, T) tuple parameter.
  T GetParam() {
    return std::get<1>(
        ::testing::TestWithParam<std::tuple<uint32_t, T>>::GetParam());
  }
};
// Formats the (target, T) parameter as "TargetName_PrintedValue" for display
// in gtest test names.
template <typename T>
std::string TestParamTargetNameAndT(
    const testing::TestParamInfo<std::tuple<uint32_t, T>>& info) {
  std::string name = TargetName(std::get<0>(info.param));
  name += "_";
  name += ::testing::PrintToString(std::get<1>(info.param));
  return name;
}
#define HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T(suite, generator) \
HWY_GTEST_INSTANTIATE_TEST_SUITE_P( \
suite##Group, suite, \
::testing::Combine( \
testing::ValuesIn(::hwy::SupportedAndGeneratedTargets()), \
generator), \
::hwy::TestParamTargetNameAndT<suite::HwyParamType>)
// Helper macro to export a function and define a test that tests it. This is
// equivalent to do a HWY_EXPORT of a void(void) function and run it in a test:
// class MyTestSuite : public TestWithParamTarget {
// ...
// };
// HWY_TARGET_INSTANTIATE_TEST_SUITE_P(MyTestSuite);
// HWY_EXPORT_AND_TEST_P(MyTestSuite, MyTest);
// Exports func_name and defines a TEST_P that dynamic-dispatches to it.
#define HWY_EXPORT_AND_TEST_P(suite, func_name)                   \
  HWY_EXPORT(func_name);                                          \
  TEST_P(suite, func_name) { HWY_DYNAMIC_DISPATCH(func_name)(); } \
  static_assert(true, "For requiring trailing semicolon")

// As above, for functions taking the extra test parameter (GetParam()).
#define HWY_EXPORT_AND_TEST_P_T(suite, func_name)                            \
  HWY_EXPORT(func_name);                                                     \
  TEST_P(suite, func_name) { HWY_DYNAMIC_DISPATCH(func_name)(GetParam()); } \
  static_assert(true, "For requiring trailing semicolon")

// Declares the suite fixture and instantiates it for all targets.
#define HWY_BEFORE_TEST(suite)                      \
  class suite : public hwy::TestWithParamTarget {}; \
  HWY_TARGET_INSTANTIATE_TEST_SUITE_P(suite);       \
  static_assert(true, "For requiring trailing semicolon")
// 64-bit random generator (Xorshift128+). Much smaller state than
// std::mt19937, which triggers a compiler bug.
class RandomState {
 public:
  explicit RandomState(const uint64_t seed = 0x123456789ull) {
    s0_ = SplitMix64(seed + 0x9E3779B97F4A7C15ull);
    s1_ = SplitMix64(s0_);
  }

  // Returns the next pseudo-random value and advances the state.
  HWY_INLINE uint64_t operator()() {
    const uint64_t result = s0_ + s1_;
    uint64_t x = s0_;
    const uint64_t y = s1_;
    x ^= x << 23;
    x ^= y ^ (x >> 18) ^ (y >> 5);
    s0_ = y;
    s1_ = x;
    return result;
  }

 private:
  // Seed-scrambling step; also initializes s1_ from s0_.
  static uint64_t SplitMix64(uint64_t z) {
    z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull;
    z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
    return z ^ (z >> 31);
  }

  uint64_t s0_;
  uint64_t s1_;
};
// Returns the low 32 bits of the next pseudo-random number from *rng.
static HWY_INLINE uint32_t Random32(RandomState* rng) {
  return static_cast<uint32_t>((*rng)());
}
// Prevents the compiler from eliding the computations that led to "output".
// Works by indicating to the compiler that "output" is being read and modified.
// The +r constraint avoids unnecessary writes to memory, but only works for
// built-in types.
template <class T>
inline void PreventElision(T&& output) {
#if HWY_COMPILER_MSVC
  // MSVC lacks GNU inline asm; simply consuming the value is the fallback.
  (void)output;
#else   // HWY_COMPILER_MSVC
  asm volatile("" : "+r"(output) : : "memory");
#endif  // HWY_COMPILER_MSVC
}
// Returns a name for the vector/part/scalar. The type prefix is u/i/f for
// unsigned/signed/floating point, followed by the number of bits per lane;
// then 'x' followed by the number of lanes. Example: u8x16. This is useful for
// understanding which instantiation of a generic test failed.
template <typename T>
static inline std::string TypeName(T /*unused*/, size_t N) {
  const char prefix = IsFloat<T>() ? 'f' : (IsSigned<T>() ? 'i' : 'u');
  std::string name(1, prefix);
  name += std::to_string(sizeof(T) * 8);
  // Omit the xN suffix for scalars.
  if (N != 1) {
    name += 'x';
    name += std::to_string(N);
  }
  return name;
}
// Byte-wise comparison of p1 and p2. Returns true if the first `size` bytes
// match; otherwise reports the first differing byte on stderr and, if `pos`
// is non-null, stores its index there.
template <typename T1, typename T2>
inline bool BytesEqual(const T1* p1, const T2* p2, const size_t size,
                       size_t* pos = nullptr) {
  const uint8_t* a = reinterpret_cast<const uint8_t*>(p1);
  const uint8_t* b = reinterpret_cast<const uint8_t*>(p2);
  size_t i = 0;
  while (i < size && a[i] == b[i]) ++i;
  if (i == size) return true;

  fprintf(stderr, "Mismatch at byte %zu of %zu: %d != %d (%s, %s)\n", i, size,
          a[i], b[i], TypeName(T1(), 1).c_str(), TypeName(T2(), 1).c_str());
  if (pos != nullptr) {
    *pos = i;
  }
  return false;
}
// Returns true if the NUL-terminated strings s1 and s2 are identical.
inline bool StringsEqual(const char* s1, const char* s2) {
  return strcmp(s1, s2) == 0;
}
} // namespace hwy
#endif // HWY_TESTS_TEST_UTIL_H_
// Per-target include guard
#if defined(HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_) == defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_
#undef HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_
#else
#define HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_
#endif
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Prints lanes around `lane`, in memory order.
// Shows up to 7 lanes starting two positions before `lane`, clamped to the
// vector bounds.
template <class D>
HWY_NOINLINE void Print(const D d, const char* caption, const Vec<D> v,
                        intptr_t lane = 0) {
  using T = TFromD<D>;
  const size_t N = Lanes(d);
  auto lanes = AllocateAligned<T>(N);
  Store(v, d, lanes.get());
  const size_t begin = static_cast<size_t>(std::max<intptr_t>(0, lane - 2));
  const size_t end = std::min(begin + 7, N);
  fprintf(stderr, "%s %s [%zu+ ->]:\n ", TypeName(T(), N).c_str(), caption,
          begin);
  for (size_t i = begin; i < end; ++i) {
    fprintf(stderr, "%g,", double(lanes[i]));
  }
  // begin >= end means `lane` pointed past the last lane.
  if (begin >= end) fprintf(stderr, "(out of bounds)");
  fprintf(stderr, "\n");
}
// Formats a lane-mismatch message (target, type, lane index, expected vs.
// actual) and terminates via hwy::Abort. Does not return.
static HWY_NORETURN HWY_NOINLINE void NotifyFailure(
    const char* filename, const int line, const char* type_name,
    const size_t lane, const char* expected, const char* actual) {
  hwy::Abort(filename, line,
             "%s, %s lane %zu mismatch: expected '%s', got '%s'.\n",
             hwy::TargetName(HWY_TARGET), type_name, lane, expected, actual);
}
// Reinterprets the bits of `in` as type Out; sizes must match exactly.
template <class Out, class In>
inline Out BitCast(const In& in) {
  static_assert(sizeof(Out) == sizeof(In), "");
  Out out;
  CopyBytes<sizeof(out)>(&in, &out);
  return out;
}
// Computes the difference in units of last place between x and y.
template <typename TF>
MakeUnsigned<TF> ComputeUlpDelta(TF x, TF y) {
  static_assert(IsFloat<TF>(), "Only makes sense for floating-point");
  using TU = MakeUnsigned<TF>;

  // Handle -0 == 0 and infinities.
  if (x == y) return 0;

  // Consider "equal" if both are NaN, so we can verify an expected NaN.
  // Needs a special case because there are many possible NaN representations.
  if (std::isnan(x) && std::isnan(y)) return 0;

  // NOTE: no need to check for differing signs; they will result in large
  // differences, which is fine, and we avoid overflow.
  const TU bits_x = BitCast<TU>(x);
  const TU bits_y = BitCast<TU>(y);
  // Avoid unsigned->signed cast: 2's complement is only guaranteed by C++20.
  return (bits_x >= bits_y) ? (bits_x - bits_y) : (bits_y - bits_x);
}
// Integer overload: exact equality.
template <typename T, HWY_IF_NOT_FLOAT(T)>
HWY_NOINLINE bool IsEqual(const T expected, const T actual) {
  return expected == actual;
}

// Float overload: tolerates a 1-ULP difference (see ComputeUlpDelta).
template <typename T, HWY_IF_FLOAT(T)>
HWY_NOINLINE bool IsEqual(const T expected, const T actual) {
  return ComputeUlpDelta(expected, actual) <= 1;
}
// Compare non-vector, non-string T; on mismatch, formats both values with %g
// and aborts via NotifyFailure.
template <typename T>
HWY_NOINLINE void AssertEqual(const T expected, const T actual,
                              const std::string& type_name,
                              const char* filename = "", const int line = -1,
                              const size_t lane = 0) {
  if (IsEqual(expected, actual)) return;

  char expected_str[100];
  snprintf(expected_str, sizeof(expected_str), "%g", double(expected));
  char actual_str[100];
  snprintf(actual_str, sizeof(actual_str), "%g", double(actual));
  NotifyFailure(filename, line, type_name.c_str(), lane, expected_str,
                actual_str);
}
// Compares NUL-terminated strings; reports a failure and aborts if they
// differ.
static HWY_NOINLINE HWY_MAYBE_UNUSED void AssertStringEqual(
    const char* expected, const char* actual, const char* filename = "",
    const int line = -1, const size_t lane = 0) {
  if (!hwy::StringsEqual(expected, actual)) {
    NotifyFailure(filename, line, "string", lane, expected, actual);
  }
}
// Compare expected vector to vector.
// On the first mismatching lane, prints context around that lane for both
// vectors and aborts via NotifyFailure.
template <class D, class V>
HWY_NOINLINE void AssertVecEqual(D d, const V expected, const V actual,
                                 const char* filename, const int line) {
  using T = TFromD<D>;
  const size_t N = Lanes(d);
  auto expected_lanes = AllocateAligned<T>(N);
  auto actual_lanes = AllocateAligned<T>(N);
  Store(expected, d, expected_lanes.get());
  Store(actual, d, actual_lanes.get());
  for (size_t i = 0; i < N; ++i) {
    if (!IsEqual(expected_lanes[i], actual_lanes[i])) {
      fprintf(stderr, "\n\n");
      Print(d, "expect", expected, i);
      Print(d, "actual", actual, i);

      char expected_str[100];
      snprintf(expected_str, sizeof(expected_str), "%g",
               double(expected_lanes[i]));
      char actual_str[100];
      snprintf(actual_str, sizeof(actual_str), "%g", double(actual_lanes[i]));
      // NotifyFailure does not return, so only the first mismatch is shown.
      NotifyFailure(filename, line, hwy::TypeName(T(), N).c_str(), i,
                    expected_str, actual_str);
    }
  }
}
// Compare expected lanes to vector.
// `expected` may be unaligned (loaded via LoadU).
template <class D>
HWY_NOINLINE void AssertVecEqual(D d, const TFromD<D>* expected, Vec<D> actual,
                                 const char* filename, int line) {
  AssertVecEqual(d, LoadU(d, expected), actual, filename, line);
}
// Verifies two masks select the same lanes and agree on the CountTrue /
// AllTrue / AllFalse queries.
template <class D>
HWY_NOINLINE void AssertMaskEqual(D d, Mask<D> a, Mask<D> b,
                                  const char* filename, int line) {
  AssertVecEqual(d, VecFromMask(d, a), VecFromMask(d, b), filename, line);

  const std::string type_name = TypeName(TFromD<D>(), Lanes(d));
  AssertEqual(CountTrue(a), CountTrue(b), type_name, filename, line, 0);
  AssertEqual(AllTrue(a), AllTrue(b), type_name, filename, line, 0);
  AssertEqual(AllFalse(a), AllFalse(b), type_name, filename, line, 0);
  // TODO(janwas): StoreMaskBits
}
// Returns a mask with all lanes true (0 == 0 holds in every lane).
template <class D>
HWY_NOINLINE Mask<D> MaskTrue(const D d) {
  const auto v0 = Zero(d);
  return Eq(v0, v0);
}
// Returns a mask with all lanes false (0 == 1 fails in every lane).
template <class D>
HWY_NOINLINE Mask<D> MaskFalse(const D d) {
  // Lt is only for signed types and we cannot yet cast mask types.
  return Eq(Zero(d), Set(d, 1));
}
#ifndef HWY_ASSERT_EQ

// Convenience wrappers that forward the current file/line to the Assert*
// helpers above.
#define HWY_ASSERT_EQ(expected, actual) \
  AssertEqual(expected, actual, hwy::TypeName(expected, 1), __FILE__, __LINE__)

#define HWY_ASSERT_STRING_EQ(expected, actual) \
  AssertStringEqual(expected, actual, __FILE__, __LINE__)

#define HWY_ASSERT_VEC_EQ(d, expected, actual) \
  AssertVecEqual(d, expected, actual, __FILE__, __LINE__)

#define HWY_ASSERT_MASK_EQ(d, expected, actual) \
  AssertMaskEqual(d, expected, actual, __FILE__, __LINE__)

#endif  // HWY_ASSERT_EQ
// Helpers for instantiating tests with combinations of lane types / counts.

// For all powers of two in [kMinLanes, N * kMinLanes] (so that recursion stops
// at N == 0)
template <typename T, size_t N, size_t kMinLanes, class Test>
struct ForeachSizeR {
  static void Do() {
    static_assert(N != 0, "End of recursion");
    // Run Test for N * kMinLanes lanes, then recurse with N halved.
    Test()(T(), Simd<T, N * kMinLanes>());
    ForeachSizeR<T, N / 2, kMinLanes, Test>::Do();
  }
};

// Base case to stop the recursion.
template <typename T, size_t kMinLanes, class Test>
struct ForeachSizeR<T, 0, kMinLanes, Test> {
  static void Do() {}
};
// These adapters may be called directly, or via For*Types:

// Calls Test for all powers of two in [kMinLanes, HWY_LANES(T) / kDivLanes].
template <class Test, size_t kDivLanes = 1, size_t kMinLanes = 1>
struct ForPartialVectors {
  template <typename T>
  void operator()(T /*unused*/) const {
#if HWY_TARGET == HWY_RVV
    // Only m1..8 for now, can ignore kMaxLanes because HWY_*_LANES are full.
    ForeachSizeR<T, 8 / kDivLanes, HWY_LANES(T), Test>::Do();
#else
    // ForeachSizeR halves the lane count each step down to kMinLanes.
    ForeachSizeR<T, HWY_LANES(T) / kDivLanes / kMinLanes, kMinLanes,
                 Test>::Do();
#endif
  }
};
// Calls Test for all vectors that can be demoted log2(kFactor) times.
template <class Test, size_t kFactor>
struct ForDemoteVectors {
  template <typename T>
  void operator()(T /*unused*/) const {
#if HWY_TARGET == HWY_RVV
    // Only m1..8 for now.
    ForeachSizeR<T, 8 / kFactor, kFactor * HWY_LANES(T), Test>::Do();
#else
    // Non-RVV targets: iterate all power-of-two lane counts down to 1.
    ForeachSizeR<T, HWY_LANES(T), 1, Test>::Do();
#endif
  }
};
// Calls Test for all powers of two in [128 bits, max bits].
template <class Test>
struct ForGE128Vectors {
  template <typename T>
  void operator()(T /*unused*/) const {
#if HWY_TARGET == HWY_RVV
    ForeachSizeR<T, 8, HWY_LANES(T), Test>::Do();
#else
    // 16 / sizeof(T) lanes == one 128-bit block; that is the minimum size.
    ForeachSizeR<T, HWY_LANES(T) / (16 / sizeof(T)), (16 / sizeof(T)),
                 Test>::Do();
#endif
  }
};
// Calls Test for all vectors that can be expanded by kFactor.
template <class Test, size_t kFactor = 2>
struct ForExtendableVectors {
  template <typename T>
  void operator()(T /*unused*/) const {
#if HWY_TARGET == HWY_RVV
    ForeachSizeR<T, 8 / kFactor, HWY_LANES(T), Test>::Do();
#else
    // Leave headroom of kFactor so the widened vector still fits.
    ForeachSizeR<T, HWY_LANES(T) / kFactor / (16 / sizeof(T)), (16 / sizeof(T)),
                 Test>::Do();
#endif
  }
};
// Calls Test for full vectors only.
template <class Test>
struct ForFullVectors {
  template <typename T>
  void operator()(T t) const {
#if HWY_TARGET == HWY_RVV
    ForeachSizeR<T, 8, HWY_LANES(T), Test>::Do();
    (void)t;
#else
    // Single invocation with the target's full-width descriptor.
    Test()(t, HWY_FULL(T)());
#endif
  }
};
// Type lists to shorten call sites:

// Invokes func for each signed integer lane type available on this target.
template <class Func>
void ForSignedTypes(const Func& func) {
  func(int8_t());
  func(int16_t());
  func(int32_t());
#if HWY_CAP_INTEGER64
  func(int64_t());
#endif
}
// Invokes func for each unsigned integer lane type available on this target.
template <class Func>
void ForUnsignedTypes(const Func& func) {
  func(uint8_t());
  func(uint16_t());
  func(uint32_t());
#if HWY_CAP_INTEGER64
  func(uint64_t());
#endif
}
// Invokes func for all signed and unsigned integer lane types.
template <class Func>
void ForIntegerTypes(const Func& func) {
  ForSignedTypes(func);
  ForUnsignedTypes(func);
}
// Invokes func for each floating-point lane type available on this target.
template <class Func>
void ForFloatTypes(const Func& func) {
  func(float());
#if HWY_CAP_FLOAT64
  func(double());
#endif
}
// Invokes func for every integer and floating-point lane type.
template <class Func>
void ForAllTypes(const Func& func) {
  ForIntegerTypes(func);
  ForFloatTypes(func);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#endif // per-target include guard

102
third_party/highway/hwy/tests/test_util_test.cc поставляемый Normal file
Просмотреть файл

@ -0,0 +1,102 @@
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stddef.h>
#include <stdint.h>
#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/test_util_test.cc"
#include "hwy/foreach_target.h"
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
// Verifies TypeName() produces the expected "u8x16"-style string for each
// type / lane-count combination.
struct TestName {
  template <class T, class D>
  HWY_NOINLINE void operator()(T t, D d) {
    const size_t N = Lanes(d);
    // Build the reference string: prefix, bits per lane, optional xN suffix.
    std::string expected = IsFloat<T>() ? "f" : (IsSigned<T>() ? "i" : "u");
    expected += std::to_string(sizeof(T) * 8);
    if (N != 1) {
      expected += 'x';
      expected += std::to_string(N);
    }
    const std::string actual = TypeName(t, N);
    if (expected != actual) {
      NotifyFailure(__FILE__, __LINE__, expected.c_str(), 0, expected.c_str(),
                    actual.c_str());
    }
  }
};
// Runs TestName for all types and partial-vector sizes.
HWY_NOINLINE void TestAllName() { ForAllTypes(ForPartialVectors<TestName>()); }
// Exercises the integer IsEqual overload: exact equality, including limits.
struct TestEqualInteger {
  template <class T>
  HWY_NOINLINE void operator()(T /*t*/) const {
    HWY_ASSERT(IsEqual(T(0), T(0)));
    HWY_ASSERT(IsEqual(T(1), T(1)));
    HWY_ASSERT(IsEqual(T(-1), T(-1)));
    HWY_ASSERT(IsEqual(LimitsMin<T>(), LimitsMin<T>()));

    HWY_ASSERT(!IsEqual(T(0), T(1)));
    HWY_ASSERT(!IsEqual(T(1), T(0)));
    HWY_ASSERT(!IsEqual(T(1), T(-1)));
    HWY_ASSERT(!IsEqual(T(-1), T(1)));
    HWY_ASSERT(!IsEqual(LimitsMin<T>(), LimitsMax<T>()));
    HWY_ASSERT(!IsEqual(LimitsMax<T>(), LimitsMin<T>()));
  }
};
// Exercises the float IsEqual overload (1-ULP tolerance; see ComputeUlpDelta).
struct TestEqualFloat {
  template <class T>
  HWY_NOINLINE void operator()(T /*t*/) const {
    HWY_ASSERT(IsEqual(T(0), T(0)));
    HWY_ASSERT(IsEqual(T(1), T(1)));
    HWY_ASSERT(IsEqual(T(-1), T(-1)));
    HWY_ASSERT(IsEqual(MantissaEnd<T>(), MantissaEnd<T>()));

    HWY_ASSERT(!IsEqual(T(0), T(1)));
    HWY_ASSERT(!IsEqual(T(1), T(0)));
    HWY_ASSERT(!IsEqual(T(1), T(-1)));
    HWY_ASSERT(!IsEqual(T(-1), T(1)));
    HWY_ASSERT(!IsEqual(LowestValue<T>(), HighestValue<T>()));
    HWY_ASSERT(!IsEqual(HighestValue<T>(), LowestValue<T>()));
  }
};
// Runs the IsEqual self-tests for all integer and floating-point types.
HWY_NOINLINE void TestAllEqual() {
  ForIntegerTypes(TestEqualInteger());
  ForFloatTypes(TestEqualFloat());
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
} // namespace HWY_NAMESPACE
} // namespace hwy
HWY_AFTER_NAMESPACE();
#if HWY_ONCE

// Registration (emitted in only one compilation pass): instantiate the gtest
// suite and dispatch each exported test to the selected target.
namespace hwy {
HWY_BEFORE_TEST(TestUtilTest);
HWY_EXPORT_AND_TEST_P(TestUtilTest, TestAllName);
HWY_EXPORT_AND_TEST_P(TestUtilTest, TestAllEqual);
}  // namespace hwy
#endif

8
third_party/highway/libhwy-test.pc.in поставляемый Normal file
Просмотреть файл

@ -0,0 +1,8 @@
prefix=@CMAKE_INSTALL_PREFIX@
includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
Name: libhwy-test
Description: Efficient and performance-portable SIMD wrapper, test helpers.
Requires: gtest
Version: @HWY_LIBRARY_VERSION@
Cflags: -I${includedir}

10
third_party/highway/libhwy.pc.in поставляемый Normal file
Просмотреть файл

@ -0,0 +1,10 @@
prefix=@CMAKE_INSTALL_PREFIX@
exec_prefix=${prefix}
libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@
includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
Name: libhwy
Description: Efficient and performance-portable SIMD wrapper
Version: @HWY_LIBRARY_VERSION@
Libs: -L${libdir} -lhwy
Cflags: -I${includedir}

20
third_party/highway/run_tests.bat поставляемый Normal file
Просмотреть файл

@ -0,0 +1,20 @@
@echo off
REM Configure (CMake + Ninja), build and run the tests via CTest.
REM Exits with code 1 if any step fails.
REM Switch directory of this batch file
cd %~dp0

if not exist build mkdir build
cd build

cmake .. -G Ninja || goto error
ninja || goto error
ctest -j || goto error

cd ..
echo Success
goto end

:error
echo Failure
exit /b 1

:end

15
third_party/highway/run_tests.sh поставляемый Normal file
Просмотреть файл

@ -0,0 +1,15 @@
#!/bin/bash
# Configure (CMake), build (make) and run the tests via CTest.

# Exit if anything fails. Set BEFORE the cd so a failed cd also aborts
# (previously set afterwards, which silently continued in the wrong dir).
set -e

# Switch to directory of this script. The command substitution is quoted so
# paths containing spaces survive word splitting.
MYDIR=$(dirname "$(realpath "$0")")
cd "${MYDIR}"

mkdir -p build
cd build
cmake ..
make -j
ctest -j

echo Success

4
third_party/jpeg-xl/.clang-format поставляемый Normal file
Просмотреть файл

@ -0,0 +1,4 @@
BasedOnStyle: Google
IncludeCategories:
- Regex: '^<hwy/'
Priority: 2

70
third_party/jpeg-xl/.clang-tidy поставляемый Normal file
Просмотреть файл

@ -0,0 +1,70 @@
# Disabled checks:
# - google-readability-todo: We don't use the google TODO format.
#
# - modernize-deprecated-headers: We don't use std:: versions of the standard
# types and functions like size_t or printf, so we should include <stdio.h>
#   instead of <cstdio>.
# - modernize-return-braced-init-list: this often doesn't improve readability.
# - modernize-use-auto: is too aggressive towards using auto.
# - modernize-use-default-member-init: with a mix of constructors and default
# member initialization this can be confusing if enforced.
# - modernize-use-trailing-return-type: does not improve readability when used
# systematically.
# - modernize-use-using: typedefs are ok.
#
# - readability-else-after-return: It doesn't always improve readability.
# - readability-static-accessed-through-instance
# It is often more useful and readable to access a constant of a passed
# variable (like d.N) instead of using the type of the variable that could be
# long and complex.
# - readability-uppercase-literal-suffix: we write 1.0f, not 1.0F.
Checks: >-
bugprone-*,
clang-*,
-clang-diagnostic-unused-command-line-argument,
google-*,
modernize-*,
performance-*,
readability-*,
-google-readability-todo,
-modernize-deprecated-headers,
-modernize-return-braced-init-list,
-modernize-use-auto,
-modernize-use-default-member-init,
-modernize-use-trailing-return-type,
-modernize-use-using,
-readability-else-after-return,
-readability-function-cognitive-complexity,
-readability-static-accessed-through-instance,
-readability-uppercase-literal-suffix,
WarningsAsErrors: >-
bugprone-argument-comment,
bugprone-macro-parentheses,
bugprone-suspicious-string-compare,
bugprone-use-after-move,
clang-*,
clang-analyzer-*,
-clang-diagnostic-unused-command-line-argument,
google-build-using-namespace,
google-explicit-constructor,
google-readability-braces-around-statements,
google-readability-namespace-comments,
modernize-use-override,
readability-inconsistent-declaration-parameter-name
# We are only interested in the headers from this projects, excluding
# third_party/ and build/.
HeaderFilterRegex: '^.*/(lib|tools)/.*\.h$'
CheckOptions:
- key: readability-braces-around-statements.ShortStatementLines
value: '2'
- key: google-readability-braces-around-statements.ShortStatementLines
value: '2'
- key: readability-implicit-bool-conversion.AllowPointerConditions
value: '1'
- key: readability-implicit-bool-conversion.AllowIntegerConditions
value: '1'

334
third_party/jpeg-xl/CMakeLists.txt поставляемый Normal file
Просмотреть файл

@ -0,0 +1,334 @@
# Copyright (c) the JPEG XL Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Ubuntu bionic ships with cmake 3.10.
cmake_minimum_required(VERSION 3.10)
# Honor VISIBILITY_INLINES_HIDDEN on all types of targets.
if(POLICY CMP0063)
cmake_policy(SET CMP0063 NEW)
endif()
# Pass CMAKE_EXE_LINKER_FLAGS to CC and CXX compilers when testing if they work.
if(POLICY CMP0065)
cmake_policy(SET CMP0065 NEW)
endif()
# Set PIE flags for POSITION_INDEPENDENT_CODE targets, added in 3.14.
if(POLICY CMP0083)
cmake_policy(SET CMP0083 NEW)
endif()
project(JPEGXL LANGUAGES C CXX)
# Detect Emscripten by compiling a probe program that only builds when the
# compiler predefines __EMSCRIPTEN__; the result is cached in JPEGXL_EMSCRIPTEN.
include(CheckCXXSourceCompiles)
check_cxx_source_compiles(
"int main() {
#if !defined(__EMSCRIPTEN__)
static_assert(false, \"__EMSCRIPTEN__ is not defined\");
#endif
return 0;
}"
JPEGXL_EMSCRIPTEN
)
message(STATUS "CMAKE_SYSTEM_PROCESSOR is ${CMAKE_SYSTEM_PROCESSOR}")
# Probe compiler capabilities used later:
# - CXX_FUZZERS_SUPPORTED gates the fuzzer targets (Clang-only sanitizer flag).
# - CXX_CONSTRUCTOR_ALIASES_SUPPORTED gates a Clang codegen optimization flag.
include(CheckCXXCompilerFlag)
check_cxx_compiler_flag("-fsanitize=fuzzer-no-link" CXX_FUZZERS_SUPPORTED)
check_cxx_compiler_flag("-Xclang -mconstructor-aliases" CXX_CONSTRUCTOR_ALIASES_SUPPORTED)
# Enabled PIE binaries by default if supported.
# CheckPIESupported is only available in CMake >= 3.14, hence OPTIONAL: with
# older CMake the include is skipped and CHECK_PIE_SUPPORTED stays NOTFOUND.
include(CheckPIESupported OPTIONAL RESULT_VARIABLE CHECK_PIE_SUPPORTED)
if(CHECK_PIE_SUPPORTED)
check_pie_supported(LANGUAGES CXX)
if(CMAKE_CXX_LINK_PIE_SUPPORTED)
set(CMAKE_POSITION_INDEPENDENT_CODE TRUE)
endif()
endif()
### Project build options:

# Fuzzers can only be built when the compiler supports
# -fsanitize=fuzzer-no-link (recent Clang); otherwise the default stays unset.
# Test the result variable directly instead of expanding it: an unquoted empty
# expansion inside if() is a configure-time error.
if(CXX_FUZZERS_SUPPORTED)
  # Enabled by default except on arm64, Windows and Apple builds.
  set(ENABLE_FUZZERS_DEFAULT true)
endif()

# Decide the default for the tcmalloc allocator option. tcmalloc is only
# considered on non-Apple, non-Windows, non-Haiku x86_64 hosts.
find_package(PkgConfig)
# Quote CMAKE_SYSTEM_PROCESSOR: it can be empty (e.g. some cross-compile
# toolchain files), and an unquoted empty expansion breaks the if() parse.
if(NOT APPLE AND NOT WIN32 AND NOT HAIKU AND "${CMAKE_SYSTEM_PROCESSOR}" MATCHES "x86_64")
  pkg_check_modules(TCMallocMinimalVersionCheck QUIET IMPORTED_TARGET
      libtcmalloc_minimal)
  if(TCMallocMinimalVersionCheck_FOUND AND
     NOT TCMallocMinimalVersionCheck_VERSION VERSION_EQUAL 2.8.0)
    # Enabled by default except on Windows and Apple builds for
    # tcmalloc != 2.8.0. tcmalloc 2.8.1 already has a fix for this issue.
    set(ENABLE_TCMALLOC_DEFAULT true)
  else()
    message(STATUS
        "tcmalloc version ${TCMallocMinimalVersionCheck_VERSION} -- "
        "tcmalloc 2.8.0 disabled due to "
        "https://github.com/gperftools/gperftools/issues/1204")
  endif()
endif()
# Default for warnings-as-errors; builds opt in explicitly via
# JPEGXL_WARNINGS_AS_ERRORS below.
set(WARNINGS_AS_ERRORS_DEFAULT false)

# User-configurable build options (cache variables, overridable with -D...).
# Note: ENABLE_FUZZERS_DEFAULT / ENABLE_TCMALLOC_DEFAULT may be unset at this
# point, in which case the corresponding option defaults to empty (false).
set(JPEGXL_ENABLE_FUZZERS ${ENABLE_FUZZERS_DEFAULT} CACHE BOOL
    "Build JPEGXL fuzzer targets.")
set(JPEGXL_ENABLE_DEVTOOLS false CACHE BOOL
    "Build JPEGXL developer tools.")
set(JPEGXL_ENABLE_MANPAGES true CACHE BOOL
    "Build and install man pages for the command-line tools.")
set(JPEGXL_ENABLE_BENCHMARK true CACHE BOOL
    "Build JPEGXL benchmark tools.")
set(JPEGXL_ENABLE_EXAMPLES true CACHE BOOL
    "Build JPEGXL library usage examples.")
set(JPEGXL_ENABLE_SJPEG true CACHE BOOL
    "Build JPEGXL with support for encoding with sjpeg.")
set(JPEGXL_ENABLE_OPENEXR true CACHE BOOL
    "Build JPEGXL with support for OpenEXR if available.")
set(JPEGXL_ENABLE_SKCMS true CACHE BOOL
    "Build with skcms instead of lcms2.")
set(JPEGXL_ENABLE_VIEWERS false CACHE BOOL
    "Build JPEGXL viewer tools for evaluation.")
set(JPEGXL_ENABLE_TCMALLOC ${ENABLE_TCMALLOC_DEFAULT} CACHE BOOL
    "Build JPEGXL using gperftools (tcmalloc) allocator.")
set(JPEGXL_ENABLE_PLUGINS false CACHE BOOL
    "Build third-party plugins to support JPEG XL in other applications.")
set(JPEGXL_ENABLE_COVERAGE false CACHE BOOL
    "Enable code coverage tracking for libjxl. This also enables debug and disables optimizations.")
set(JPEGXL_STATIC false CACHE BOOL
    "Build tools as static binaries.")
set(JPEGXL_WARNINGS_AS_ERRORS ${WARNINGS_AS_ERRORS_DEFAULT} CACHE BOOL
    "Treat warnings as errors during compilation.")
set(JPEGXL_DEP_LICENSE_DIR "" CACHE STRING
    "Directory where to search for system dependencies \"copyright\" files.")
set(JPEGXL_FORCE_NEON false CACHE BOOL
    "Set flags to enable NEON in arm if not enabled by your toolchain.")
# Force system dependencies.
set(JPEGXL_FORCE_SYSTEM_GTEST false CACHE BOOL
    "Force using system installed googletest (gtest/gmock) instead of third_party/googletest source.")
set(JPEGXL_FORCE_SYSTEM_BROTLI false CACHE BOOL
    "Force using system installed brotli instead of third_party/brotli source.")
set(JPEGXL_FORCE_SYSTEM_HWY false CACHE BOOL
    "Force using system installed highway (libhwy-dev) instead of third_party/highway source.")
# Check minimum compiler versions. Older compilers are not supported and fail
# with hard to understand errors.
# All expansions in the comparisons below are quoted so that an empty or
# undefined variable still yields a well-formed if() expression.
if(NOT "${CMAKE_C_COMPILER_ID}" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
  message(FATAL_ERROR "Different C/C++ compilers set: "
      "${CMAKE_C_COMPILER_ID} vs ${CMAKE_CXX_COMPILER_ID}")
endif()
if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
  # Android NDK's toolchain.cmake fakes the clang version in
  # CMAKE_CXX_COMPILER_VERSION with an incorrect number, so ignore this.
  # CMAKE_ANDROID_NDK_TOOLCHAIN_VERSION is undefined outside NDK builds, which
  # is why the quoting above matters: unquoted it expands to nothing and the
  # MATCHES expression becomes malformed.
  if(NOT "${CMAKE_ANDROID_NDK_TOOLCHAIN_VERSION}" MATCHES "clang"
     AND "${CMAKE_CXX_COMPILER_VERSION}" VERSION_LESS 6)
    message(FATAL_ERROR
        "Minimum Clang version required is Clang 6, please update.")
  endif()
elseif("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
  if("${CMAKE_CXX_COMPILER_VERSION}" VERSION_LESS 7)
    message(FATAL_ERROR
        "Minimum GCC version required is 7, please update.")
  endif()
endif()
message(STATUS
    "Compiled IDs C:${CMAKE_C_COMPILER_ID}, C++:${CMAKE_CXX_COMPILER_ID}")

# CMAKE_EXPORT_COMPILE_COMMANDS is used to generate the compilation database
# used by clang-tidy.
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
# Fully static builds of the command-line tools (JPEGXL_STATIC option above).
if(JPEGXL_STATIC)
# Only resolve find_library() calls against static archives.
set(CMAKE_FIND_LIBRARY_SUFFIXES .a)
set(BUILD_SHARED_LIBS 0)
# Statically link executables, including libgcc and libstdc++.
set(CMAKE_EXE_LINKER_FLAGS
"${CMAKE_EXE_LINKER_FLAGS} -static -static-libgcc -static-libstdc++")
if (MINGW)
# In MINGW libstdc++ uses pthreads directly. When building statically a
# program (regardless of whether the source code uses pthread or not) the
# toolchain will add stdc++ and pthread to the linking step but stdc++ will
# be linked statically while pthread will be linked dynamically.
# To avoid this and have pthread statically linked with need to pass it in
# the command line with "-Wl,-Bstatic -lpthread -Wl,-Bdynamic" but the
# linker will discard it if not used by anything else up to that point in
# the linker command line. If the program or any dependency don't use
# pthread directly -lpthread is discarded and libstdc++ (added by the
# toolchain later) will then use the dynamic version. For this we also need
# to pass -lstdc++ explicitly before -lpthread. For pure C programs -lstdc++
# will be discarded anyway.
# This adds these flags as dependencies for *all* targets. Adding this to
# CMAKE_EXE_LINKER_FLAGS instead would cause them to be included before any
# object files and therefore discarded.
link_libraries(-Wl,-Bstatic -lstdc++ -lpthread -Wl,-Bdynamic)
endif() # MINGW
endif() # JPEGXL_STATIC
# Global (directory-scoped) compiler flags applied to every target defined in
# this project and its subdirectories.
if (MSVC)
# TODO(janwas): add flags
else ()
# Global compiler flags for all targets here and in subdirectories.
add_definitions(
# Avoid changing the binary based on the current time and date.
-D__DATE__="redacted"
-D__TIMESTAMP__="redacted"
-D__TIME__="redacted"
)
# Either fuzzing or coverage builds define JXL_ENABLE_FUZZERS for the sources.
if("${JPEGXL_ENABLE_FUZZERS}" OR "${JPEGXL_ENABLE_COVERAGE}")
add_definitions(
-DJXL_ENABLE_FUZZERS
)
endif() # JPEGXL_ENABLE_FUZZERS
# In CMake before 3.12 it is problematic to pass repeated flags like -Xclang.
# For this reason we place them in CMAKE_CXX_FLAGS instead.
# See https://gitlab.kitware.com/cmake/cmake/issues/15826
# Machine flags.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -funwind-tables")
if (${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xclang -mrelax-all")
endif()
# Only added when the earlier check_cxx_compiler_flag() probe succeeded.
if ("${CXX_CONSTRUCTOR_ALIASES_SUPPORTED}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xclang -mconstructor-aliases")
endif()
if(WIN32)
# Not supported by clang-cl, but frame pointers are default on Windows
else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-omit-frame-pointer")
endif()
# CPU flags - remove once we have NEON dynamic dispatch
# TODO(janwas): this also matches M1, but only ARMv7 is intended/needed.
if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm")
if(JPEGXL_FORCE_NEON)
# GCC requires these flags, otherwise __ARM_NEON is undefined.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} \
-mfpu=neon-vfpv4 -mfloat-abi=hard")
endif()
endif()
# Force build with optimizations in release mode.
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O2")
add_compile_options(
# Ignore this to allow redefining __DATE__ and others.
-Wno-builtin-macro-redefined
# Global warning settings.
-Wall
)
if (JPEGXL_WARNINGS_AS_ERRORS)
add_compile_options(-Werror)
endif ()
endif () # !MSVC
# Standard install directory variables (CMAKE_INSTALL_LIBDIR etc.).
include(GNUInstallDirs)
# Project-wide C++11, no compiler extensions (-std=c++11, not gnu++11).
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_CXX_STANDARD_REQUIRED YES)
# Bundled/vendored dependencies (brotli, highway, googletest, ...).
add_subdirectory(third_party)
# Prefer -pthread over -lpthread where both are available.
set(THREADS_PREFER_PTHREAD_FLAG YES)
find_package(Threads REQUIRED)
# Copy the JXL license file to the output build directory.
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/LICENSE"
${PROJECT_BINARY_DIR}/LICENSE.jpeg-xl COPYONLY)
# Enable tests regardless of where are they defined.
enable_testing()
include(CTest)
# Libraries.
add_subdirectory(lib)
# BUILD_TESTING is defined by the CTest module included above.
if(BUILD_TESTING)
# Script to run tests over the source code in bash.
find_program (BASH_PROGRAM bash)
# Only register the test when a bash interpreter was found.
if(BASH_PROGRAM)
add_test(
NAME bash_test
COMMAND ${BASH_PROGRAM} ${CMAKE_CURRENT_SOURCE_DIR}/bash_test.sh)
endif()
endif() # BUILD_TESTING
# Documentation generated by Doxygen
find_package(Doxygen)
if(DOXYGEN_FOUND)
# Doxygen configuration consumed by doxygen_add_docs() below.
set(DOXYGEN_GENERATE_HTML "YES")
set(DOXYGEN_GENERATE_XML "NO")
set(DOXYGEN_STRIP_FROM_PATH "${CMAKE_CURRENT_SOURCE_DIR}/include")
set(DOXYGEN_USE_MDFILE_AS_MAINPAGE "README.md")
set(DOXYGEN_WARN_AS_ERROR "YES")
# "doc" target: builds the public C API documentation on demand.
doxygen_add_docs(doc
"${CMAKE_CURRENT_SOURCE_DIR}/lib/include/jxl"
"${CMAKE_CURRENT_SOURCE_DIR}/doc/api.txt"
WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}"
COMMENT "Generating C API documentation")
else()
# Create a "doc" target for compatibility since "doc" is not otherwise added to
# the build when doxygen is not installed.
# The "false" command makes building "doc" fail with the message below.
add_custom_target(doc false
COMMENT "Error: Can't generate doc since Doxygen not installed.")
endif() # DOXYGEN_FOUND
# Man pages for the cjxl/djxl command-line tools, generated with asciidoc's
# a2x driver (which itself requires a Python interpreter).
if(JPEGXL_ENABLE_MANPAGES)
find_package(Python COMPONENTS Interpreter)
if(Python_Interpreter_FOUND)
find_program(ASCIIDOC a2x)
endif()
if(NOT Python_Interpreter_FOUND OR "${ASCIIDOC}" STREQUAL "ASCIIDOC-NOTFOUND")
message(WARNING "asciidoc was not found, the man pages will not be installed.")
else()
# MANPAGE_FILES: absolute paths for install(); MANPAGES: relative output
# names used as dependencies of the "manpages" target.
set(MANPAGE_FILES "")
set(MANPAGES "")
foreach(PAGE IN ITEMS cjxl djxl)
# Invoking the Python interpreter ourselves instead of running the a2x binary
# directly is necessary on MSYS2, otherwise it is run through cmd.exe which
# does not recognize it.
add_custom_command(
OUTPUT "${PAGE}.1"
COMMAND Python::Interpreter
ARGS "${ASCIIDOC}"
--format manpage --destination-dir="${CMAKE_CURRENT_BINARY_DIR}"
"${CMAKE_CURRENT_SOURCE_DIR}/doc/man/${PAGE}.txt"
MAIN_DEPENDENCY "${CMAKE_CURRENT_SOURCE_DIR}/doc/man/${PAGE}.txt")
list(APPEND MANPAGE_FILES "${CMAKE_CURRENT_BINARY_DIR}/${PAGE}.1")
list(APPEND MANPAGES "${PAGE}.1")
endforeach()
# ALL: man pages are built as part of the default build.
add_custom_target(manpages ALL DEPENDS ${MANPAGES})
install(FILES ${MANPAGE_FILES} DESTINATION share/man/man1)
endif()
endif()
# Example usage code.
# NOTE(review): the if (${VAR}) form relies on these cache variables always
# holding "true"/"false" (as set above); if(VAR) would be the safer idiom.
if (${JPEGXL_ENABLE_EXAMPLES})
add_subdirectory(examples)
endif ()
# Plugins for third-party software
if (${JPEGXL_ENABLE_PLUGINS})
add_subdirectory(plugins)
endif ()
# Binary tools
add_subdirectory(tools)

10
third_party/jpeg-xl/CONTRIBUTING.md поставляемый Normal file
Просмотреть файл

@ -0,0 +1,10 @@
# How to Contribute
We are currently unable to accept patches to this project, but we'd very much
appreciate it if you open a GitLab issue for any bug reports/feature requests.
We will be happy to investigate and hopefully fix any issues.
# Community Guidelines
This project follows
[Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).

23
third_party/jpeg-xl/CONTRIBUTORS поставляемый Normal file
Просмотреть файл

@ -0,0 +1,23 @@
# This file lists individuals who made significant contributions to the JPEG XL
# code base, such as design, adding features, performing experiments, ...
# Small changes such as a small bugfix or fixing spelling errors are not
# included. If you'd like to be included in this file thanks to a significant
# contribution, feel free to send a pull request changing this file.
Alex Deymo
Alexander Rhatushnyak
Evgenii Kliuchnikov
Iulia-Maria Comșa
Jan Wassenberg
Jon Sneyers
Jyrki Alakuijala
Krzysztof Potempa
Lode Vandevenne
Luca Versari
Martin Bruse
Moritz Firsching
Renata Khasanova
Robert Obryk
Sami Boukortt
Sebastian Gomez-Gonzalez
Thomas Fischbacher
Zoltan Szabadka

203
third_party/jpeg-xl/LICENSE поставляемый Normal file
Просмотреть файл

@ -0,0 +1,203 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

20
third_party/jpeg-xl/README.Haiku.md поставляемый Normal file
Просмотреть файл

@ -0,0 +1,20 @@
## Disclaimer
Haiku builds are not officially supported, i.e. the build might not work at all,
some tests may fail and some sub-projects are excluded from build.
This manual outlines Haiku-specific setup. For general building and testing
instructions see "[README](README.md)" and
"[Building and Testing changes](doc/building_and_testing.md)".
## Dependencies
```shell
pkgman install llvm9_clang ninja cmake doxygen libjpeg_turbo_devel
```
## Building
```shell
TEST_STACK_LIMIT=none CMAKE_FLAGS="-I/boot/system/develop/tools/lib/gcc/x86_64-unknown-haiku/8.3.0/include/c++ -I/boot/system/develop/tools/lib/gcc/x86_64-unknown-haiku/8.3.0/include/c++/x86_64-unknown-haiku" CMAKE_SHARED_LINKER_FLAGS="-shared -Xlinker -soname=libjpegxl.so -lpthread" ./ci.sh opt
```

41
third_party/jpeg-xl/README.OSX.md поставляемый Normal file
Просмотреть файл

@ -0,0 +1,41 @@
## Disclaimer
OSX builds have "best effort" support, i.e. build might not work at all, some
tests may fail and some sub-projects are excluded from build.
This manual outlines OSX specific setup. For general building and testing
instructions see "[README](README.md)" and
"[Building and Testing changes](doc/building_and_testing.md)".
[Homebrew](https://brew.sh/) is a popular package manager. JPEG XL library and
binaries could be installed using it:
```bash
brew install jpeg-xl
```
## Dependencies
Make sure that `brew doctor` does not report serious problems and up-to-date
version of XCode is installed.
Installing (actually, building) `clang` might take a couple hours.
```bash
brew install llvm
```
```bash
brew install coreutils cmake giflib jpeg-turbo libpng ninja zlib
```
Before building the project check that `which clang` is
`/usr/local/opt/llvm/bin/clang`, not the one provided by XCode. If not, update
`PATH` environment variable.
Also, setting `CMAKE_PREFIX_PATH` might be necessary for correct include paths
resolving, e.g.:
```bash
export CMAKE_PREFIX_PATH=`brew --prefix giflib`:`brew --prefix jpeg-turbo`:`brew --prefix libpng`:`brew --prefix zlib`
```

224
third_party/jpeg-xl/README.md поставляемый Normal file
Просмотреть файл

@ -0,0 +1,224 @@
# JPEG XL reference implementation
<img src="doc/jxl.svg" width="100" align="right" alt="JXL logo">
This repository contains a reference implementation of JPEG XL (encoder and
decoder), called `libjxl`.
JPEG XL is in the final stages of standardization and its codestream format is
frozen.
The libraries API, command line options and tools in this repository are subject
to change, however files encoded with `cjxl` conform to the JPEG XL format
specification and can be decoded with current and future `djxl` decoders or
`libjxl` decoding library.
## Quick start guide
For more details and other workflows see the "Advanced guide" below.
### Checking out the code
```bash
git clone https://gitlab.com/wg1/jpeg-xl.git --recursive
```
This repository uses git submodules to handle some third party dependencies
under `third_party/`, which is why it is important to pass `--recursive`. If you
didn't check out with `--recursive`, or any submodule has changed, run:
`git submodule update --init --recursive`.
Important: If you downloaded a zip file or tarball from the web interface you
won't get the needed submodules and the code will not compile. You can download
these external dependencies from source running `./deps.sh`. The git workflow
described above is recommended instead.
### Installing dependencies
Required dependencies for compiling the code, in a Debian/Ubuntu based
distribution run:
```bash
sudo apt install cmake pkg-config libbrotli-dev
```
Optional dependencies for supporting other formats in the `cjxl`/`djxl` tools,
in a Debian/Ubuntu based distribution run:
```bash
sudo apt install libgif-dev libjpeg-dev libopenexr-dev libpng-dev libwebp-dev
```
We recommend using a recent Clang compiler (version 7 or newer), for that
install clang and set `CC` and `CXX` variables. For example, with clang-7:
```bash
sudo apt install clang-7
export CC=clang-7 CXX=clang++-7
```
### Building
```bash
cd jpeg-xl
mkdir build
cd build
cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTING=OFF ..
cmake --build . -- -j$(nproc)
```
The encoder/decoder tools will be available in the `build/tools` directory.
### <a name="installing"></a> Installing
```bash
sudo cmake --install .
```
### Basic encoder/decoder
To encode a source image to JPEG XL with default settings:
```bash
build/tools/cjxl input.png output.jxl
```
For more settings run `build/tools/cjxl --help` or for a full list of options
run `build/tools/cjxl -v -v --help`.
To decode a JPEG XL file run:
```bash
build/tools/djxl input.jxl output.png
```
When possible `cjxl`/`djxl` are able to read/write the following
image formats: .exr, .gif, .jpeg/.jpg, .pfm, .pgm/.ppm, .pgx, .png.
### Benchmarking
For speed benchmarks on single images in single or multi-threaded decoding
`djxl` can print decoding speed information. See `djxl --help` for details
on the decoding options and note that the output image is optional for
benchmarking purposes.
For a more comprehensive comparison of compression density between multiple
options see "Benchmarking with benchmark_xl" section below.
## Advanced guide
### Building with Docker
We build a common environment based on Debian/Ubuntu using Docker. Other
systems may have different combinations of versions and dependencies that
have not been tested and may not work. For those cases we recommend using the
Docker environment as explained in the
[step by step guide](doc/developing_in_docker.md).
### Building JPEG XL for developers
For experienced developers, we also provide build instructions for an [up to
date Debian-based Linux](doc/developing_in_debian.md) and [64-bit
Windows](doc/developing_in_windows.md). If you encounter any difficulties,
please use Docker instead.
## Benchmarking with benchmark_xl
We recommend `build/tools/benchmark_xl` as a convenient method for reading
images or image sequences, encoding them using various codecs (jpeg jxl png
webp), decoding the result, and computing objective quality metrics. An example
invocation is:
```bash
build/tools/benchmark_xl --input "/path/*.png" --codec jxl:wombat:d1,jxl:cheetah:d2
```
Multiple comma-separated codecs are allowed. The characters after : are
parameters for the codec, separated by colons, in this case specifying maximum
target psychovisual distances of 1 and 2 (higher implies lower quality) and
the encoder effort (see below). Other common parameters are `r0.5` (target
bitrate 0.5 bits per pixel) and `q92` (quality 92, on a scale of 0-100, where
higher is better). The `jxl` codec supports the following additional parameters:
Speed: `falcon`, `cheetah`, `hare`, `wombat`, `squirrel`, `kitten`, `tortoise`
control the encoder effort in ascending order. This also affects memory usage:
using lower effort will typically reduce memory consumption during encoding.
* `falcon` disables all of the following tools.
* `cheetah` enables coefficient reordering, context clustering, and heuristics
for selecting DCT sizes and quantization steps.
* `hare` enables Gaborish filtering, chroma from luma, and an initial estimate
of quantization steps.
* `wombat` enables error diffusion quantization and full DCT size selection
heuristics.
* `squirrel` (default) enables dots, patches, and spline detection, and full
context clustering.
* `kitten` optimizes the adaptive quantization for a psychovisual metric.
* `tortoise` enables a more thorough adaptive quantization search.
Mode: JPEG XL has two modes. The default is Var-DCT mode, which is suitable for
lossy compression. The other mode is Modular mode, which is suitable for lossless
compression. Modular mode can also do lossy compression (e.g. `jxl:m:q50`).
* `m` activates modular mode.
Other arguments to benchmark_xl include:
* `--save_compressed`: save codestreams to `output_dir`.
* `--save_decompressed`: save decompressed outputs to `output_dir`.
* `--output_extension`: selects the format used to output decoded images.
* `--num_threads`: number of codec instances that will independently
encode/decode images, or 0.
* `--inner_threads`: how many threads each instance should use for parallel
encoding/decoding, or 0.
* `--encode_reps`/`--decode_reps`: how many times to repeat encoding/decoding
each image, for more consistent measurements (we recommend 10).
The benchmark output begins with a header:
```
Compr Input Compr Compr Compr Decomp Butteraugli
Method Pixels Size BPP # MP/s MP/s Distance Error p norm BPP*pnorm Errors
```
`ComprMethod` lists each comma-separated codec. `InputPixels` is the number
of pixels in the input image. `ComprSize` is the codestream size in bytes and
`ComprBPP` the bitrate. `Compr MP/s` and `Decomp MP/s` are the
compress/decompress throughput, in units of Megapixels/second.
`Butteraugli Distance` indicates the maximum psychovisual error in the decoded
image (larger is worse). `Error p norm` is a similar summary of the psychovisual
error, but closer to an average, giving less weight to small low-quality
regions. `BPP*pnorm` is the product of `ComprBPP` and `Error p norm`, which is a
figure of merit for the codec (lower is better). `Errors` is nonzero if errors
occurred while loading or encoding/decoding the image.
## License
This software is available under Apache 2.0 license which can be found in the
[LICENSE](LICENSE) file.
## Additional documentation
### Codec description
* [Introductory paper](https://www.spiedigitallibrary.org/proceedings/Download?fullDOI=10.1117%2F12.2529237) (open-access)
* [XL Overview](doc/xl_overview.md) - a brief introduction to the source code modules
* [JPEG XL white paper](http://ds.jpeg.org/whitepapers/jpeg-xl-whitepaper.pdf)
* [JPEG XL website](https://jpeg.org/jpegxl/)
* [Jon's JXL info page](https://sneyers.info/jxl/)
### Development process
* [Docker setup - **start here**](doc/developing_in_docker.md)
* [Building on Debian](doc/developing_in_debian.md) - for experts only
* [Building on Windows](doc/developing_in_windows.md) - for experts only
* [More information on testing/build options](doc/building_and_testing.md)
* [Git guide for JPEG XL](doc/developing_in_gitlab.md) - for developers only
* [Building Web Assembly artifacts](doc/building_wasm.md)
### Contact
If you encounter a bug or other issue with the software, please open an Issue here.
There is a [subreddit about JPEG XL](https://www.reddit.com/r/jpegxl/), and
informal chatting with developers and early adopters of `libjxl` can be done on the
[JPEG XL Discord server](https://discord.gg/DqkQgDRTFu).

37
third_party/jpeg-xl/SECURITY.md поставляемый Normal file
Просмотреть файл

@ -0,0 +1,37 @@
# Security and Vulnerability Policy for JPEG XL
The current focus of the reference implementation is to provide a vehicle for
evaluating the JPEG XL codec compression density, quality, features and its
actual performance on different platforms. With this focus in mind we provide
source code releases with improvements on performance and quality so developers
can evaluate the codec.
At this time, **we don't provide security and vulnerability support** for any
of these releases. This means that the source code may contain bugs, including
security bugs, that may be added or fixed between releases and will **not** be
individually documented. All of these
[releases](https://gitlab.com/wg1/jpeg-xl/-/releases) include the following
note to that effect:
* Note: This release is for evaluation purposes and may contain bugs, including
security bugs, that will *not* be individually documented when fixed. Always
prefer to use the latest release. Please provide feedback and report bugs
[here](https://gitlab.com/wg1/jpeg-xl/-/issues).
To be clear, the fact that a release doesn't mention any CVE doesn't mean that
no security issues in previous versions were fixed. You should
assume that any previous release contains security issues if that's a concern
for your use case.
This however doesn't impede you from evaluating the codec with your own trusted
inputs, such as `.jxl` you encoded yourself, or when taking appropriate measures
for your application like sandboxing if processing untrusted inputs.
## Future plans
To help our users and developers integrating this implementation into their
software we plan to provide support for security and vulnerability tracking of
this implementation in the future.
When we can provide such support we will update this Policy with the details and
expectations and clearly mention that fact in the release notes.

229
third_party/jpeg-xl/bash_test.sh поставляемый Normal file
Просмотреть файл

@ -0,0 +1,229 @@
#!/bin/bash
# Copyright (c) the JPEG XL Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Tests implemented in bash. These typically will run checks about the source
# code rather than the compiled one.
MYDIR=$(dirname $(realpath "$0"))
set -u
# Check the #include hygiene of every tracked C/C++ source file.
# Two rules are enforced:
#  1. Public headers under lib/include/ must not include each other through
#     the full "lib/include/jxl/..." path.
#  2. First-party code must not include anything through a "third_party/"
#     path prefix (with an exception for third_party/dirent.h).
# Returns 0 when no violations are found, 1 otherwise.
test_includes() {
  local ret=0
  local f
  for f in $(git ls-files | grep -E '(\.cc|\.cpp|\.h)$'); do
    # Check that the public files (in lib/include/ directory) don't use the full
    # path to the public header since users of the library will include the
    # library as: #include "jxl/foobar.h".
    if [[ "${f#lib/include/}" != "${f}" ]]; then
      if grep -i -H -n -E '#include\s*[<"]lib/include/jxl' "$f" >&2; then
        echo "Don't add \"include/\" to the include path of public headers." >&2
        ret=1
      fi
    fi
    if [[ "${f#third_party/}" == "$f" ]]; then
      # $f is not in third_party/
      # Check that local files don't use the full path to third_party/
      # directory since the installed versions will not have that path.
      # Add an exception for third_party/dirent.h.
      # NOTE(review): the trailing "[[ $ret -eq 0 ]]" guard means the matching
      # lines are printed for every offending file, but the explanatory
      # message below is emitted at most once (for the first offender).
      if grep -v -F 'third_party/dirent.h' "$f" | \
          grep -i -H -n -E '#include\s*[<"]third_party/' >&2 &&
          [[ $ret -eq 0 ]]; then
        cat >&2 <<EOF
$f: Don't add third_party/ to the include path of third_party projects. This \
makes it harder to use installed system libraries instead of the third_party/ \
ones.
EOF
        ret=1
      fi
    fi
  done
  return ${ret}
}
# Ensure that no public header under lib/include/ shadows a file of the same
# relative name directly under lib/, which would make include resolution
# ambiguous. Returns 0 if there are no collisions, 1 otherwise.
test_include_collision() {
  local status=0
  local public_header
  while IFS= read -r public_header; do
    local stripped="${public_header#lib/include/}"
    if [[ -e "lib/${stripped}" ]]; then
      echo "${public_header}: Name collision, both ${public_header} and lib/${stripped} exist." >&2
      status=1
    fi
  done < <(git ls-files | grep -E '^lib/include/')
  return ${status}
}
# Check that every first-party source file (C/C++/ObjC/shell/Python) carries
# the project copyright notice within its first ten lines.
# Returns 0 when all files comply, 1 otherwise.
test_copyright() {
  local failures=0
  local src
  for src in $(git ls-files | grep -E '(\.cc|\.cpp|\.h|\.sh|\.m|\.py)$'); do
    # Vendored code under third_party/ is exempt from the notice requirement.
    [[ "${src#third_party/}" != "${src}" ]] && continue
    if ! head -n 10 "${src}" | grep -q -F 'Copyright (c) the JPEG XL Project'; then
      echo "${src}: Missing Copyright blob near the top of the file." >&2
      failures=1
    fi
  done
  return ${failures}
}
# Check that "dec_" code doesn't depend on "enc_" headers.
# Enforce the layering rule that decoder sources ("dec_*") never include
# encoder headers ("enc_*"), keeping the decoder buildable without the
# encoder. Returns 0 when the rule holds, 1 otherwise.
test_dec_enc_deps() {
  local violations=0
  local dec_file
  for dec_file in $(git ls-files | grep -E '/dec_'); do
    # Vendored code under third_party/ is not subject to this rule.
    [[ "${dec_file#third_party/}" != "${dec_file}" ]] && continue
    if grep -n -H -E "#include.*/enc_" "${dec_file}" >&2; then
      echo "${dec_file}: Don't include \"enc_*\" files from \"dec_*\" files." >&2
      violations=1
    fi
  done
  return ${violations}
}
# Check for git merge conflict markers.
# Check for git merge conflict markers.
#
# Scans all tracked text files (sources, scripts, docs, CMake) for lines
# beginning with the "<<<<<<< " marker left by an unresolved merge.
# Returns 0 when the tree is clean, 1 if any marker is found.
test_merge_conflict() {
  local ret=0
  # Declared local (fix): previously TEXT_FILES and f leaked into the global
  # scope; every other test_* helper keeps its variables local.
  local TEXT_FILES='(\.cc|\.cpp|\.h|\.sh|\.m|\.py|\.md|\.txt|\.cmake)$'
  local f
  for f in $(git ls-files | grep -E "${TEXT_FILES}"); do
    # Send matching lines to stderr like the other checks do.
    if grep -E '^<<<<<<< ' "$f" >&2; then
      echo "$f: Found git merge conflict marker. Please resolve." >&2
      ret=1
    fi
  done
  return ${ret}
}
# Check that the library and the package have the same version. This prevents
# accidentally having them out of sync.
# Extract the value of a "set(<varname> <value>)" line from lib/CMakeLists.txt.
#
# $1: name of the CMake variable (e.g. JPEGXL_MAJOR_VERSION).
# Prints the value on stdout. Returns 1 (printing nothing) when the variable
# is not found. Fix: the original bare `[[ -n "${line}" ]]` discarded its
# result -- this script runs with `set -u` only, not `set -e`, so the
# intended assertion was a no-op.
get_version() {
  local varname=$1
  local line=$(grep -F "set(${varname} " lib/CMakeLists.txt | head -n 1)
  [[ -n "${line}" ]] || return 1
  # Strip the leading "set(<varname> " and the trailing ")".
  line="${line#set(${varname} }"
  line="${line%)}"
  echo "${line}"
}
# Check that the library and the package have the same version. This prevents
# accidentally having them out of sync: the version declared in
# lib/CMakeLists.txt must be a prefix of the version at the top of
# debian/changelog (with a trailing ".0" patch component dropped).
test_version() {
  local cmake_major=$(get_version JPEGXL_MAJOR_VERSION)
  local cmake_minor=$(get_version JPEGXL_MINOR_VERSION)
  local cmake_patch=$(get_version JPEGXL_PATCH_VERSION)
  # All three empty means parsing failed entirely.
  if [[ -z "${cmake_major}${cmake_minor}${cmake_patch}" ]]; then
    echo "Couldn't parse version from CMakeLists.txt" >&2
    return 1
  fi
  # Take the part between the first "jpeg-xl (" and the following ")".
  local deb_version=$(head -n 1 debian/changelog)
  deb_version="${deb_version#jpeg-xl (}"
  deb_version="${deb_version%%)*}"
  if [[ -z "${deb_version}" ]]; then
    echo "Couldn't parse version from debian package" >&2
    return 1
  fi
  local expected="${cmake_major}.${cmake_minor}.${cmake_patch}"
  expected="${expected%.0}"
  if [[ "${deb_version}" != "${expected}"* ]]; then
    echo "Debian package version (${deb_version}) doesn't match library" \
      "version (${expected})." >&2
    return 1
  fi
  return 0
}
# Check that the SHA versions in deps.sh matches the git submodules.
# Check that the SHA versions in deps.sh matches the git submodules.
#
# Parses .gitmodules for `[submodule "<path>"]` section headers, maps each
# path to the variable name used in deps.sh (uppercased, "/" -> "_"), and
# compares the SHA pinned there against the SHA the submodule is actually at
# in HEAD. Returns 1 with an explanation on the first mismatch.
test_deps_version() {
  while IFS= read -r line; do
    # Only submodule section headers are interesting.
    if [[ "${line:0:10}" != "[submodule" ]]; then
      continue
    fi
    # Reduce the header to the bare submodule path.
    line="${line#[submodule \"}"
    line="${line%\"]}"
    # deps.sh pins are named like THIRD_PARTY_HIGHWAY: uppercase, "/" -> "_".
    local varname=$(tr '[:lower:]' '[:upper:]' <<< "${line}")
    varname="${varname/\//_}"
    if ! grep -F "${varname}=" deps.sh >/dev/null; then
      # Ignoring submodule not in deps.sh
      continue
    fi
    # SHA as pinned in deps.sh (second double-quote-delimited field).
    local deps_sha=$(grep -F "${varname}=" deps.sh | cut -f 2 -d '"')
    # NOTE(review): this bare test discards its result -- the script runs
    # with `set -u` only, not `set -e` -- so an empty SHA does not actually
    # abort here; confirm whether it should.
    [[ -n "${deps_sha}" ]]
    # SHA the submodule is actually recorded at in HEAD.
    local git_sha=$(git ls-tree -r HEAD "${line}" | cut -f 1 | cut -f 3 -d ' ')
    if [[ "${deps_sha}" != "${git_sha}" ]]; then
      cat >&2 <<EOF
deps.sh: SHA for project ${line} is at ${deps_sha} but the git submodule is at
${git_sha}. Please update deps.sh
EOF
      return 1
    fi
  done < .gitmodules
}
# Make sure that all the Fields objects are fuzzed directly.
# Make sure that all the Fields objects are fuzzed directly.
#
# Collects every class declared as "<ClassName> : public Fields" across the
# tracked .cc/.h files (excluding *test.cc) and verifies each name appears in
# tools/fields_fuzzer.cc. Returns 1 after listing every uncovered class.
test_fuzz_fields() {
  local ret=0
  # List all the classes of the form "ClassName : public Fields".
  # This doesn't catch class names that are too long to fit.
  local field_classes=$( git ls-files |
    grep -E '\.(cc|h)' | grep -v 'test\.cc$' |
    xargs grep -h -o -E '\b[^ ]+ : public Fields' | cut -f 1 -d ' ')
  local classname
  for classname in ${field_classes}; do
    # A word-boundary match anywhere in the fuzzer source counts as coverage.
    if ! grep -E "\\b${classname}\\b" tools/fields_fuzzer.cc >/dev/null; then
      cat >&2 <<EOF
tools/fields_fuzzer.cc: Class ${classname} not found in the fields_fuzzer.
EOF
      ret=1
    fi
  done
  return $ret
}
# Run every test_* function defined in this file and print a per-test
# Start/PASS/FAIL summary. Skips (returning success) when not executed from
# a git checkout. Returns 0 when all tests pass, 1 otherwise.
main() {
  local ret=0
  cd "${MYDIR}"
  if ! git rev-parse >/dev/null 2>/dev/null; then
    echo "Not a git checkout, skipping bash_test"
    return 0
  fi
  # Split only on newlines so each "declare -F" output line stays one item.
  IFS=$'\n'
  # Declared local (fix): the loop variable previously leaked into the
  # global scope.
  local f
  for f in $(declare -F); do
    local test_name=$(echo "$f" | cut -f 3 -d ' ')
    # Runs all the local bash functions that start with "test_".
    if [[ "${test_name}" == test_* ]]; then
      echo "Test ${test_name}: Start"
      if ${test_name}; then
        echo "Test ${test_name}: PASS"
      else
        echo "Test ${test_name}: FAIL"
        ret=1
      fi
    fi
  done
  return ${ret}
}
main "$@"

1330
third_party/jpeg-xl/ci.sh поставляемый Normal file

Разница между файлами не показана из-за своего большого размера Загрузить разницу

86
third_party/jpeg-xl/debian/changelog поставляемый Normal file
Просмотреть файл

@ -0,0 +1,86 @@
jpeg-xl (0.3.7) UNRELEASED; urgency=medium
* Bump JPEG XL version to 0.3.7.
* Fix a rounding issue in 8-bit decoding.
-- Sami Boukortt <sboukortt@google.com> Mon, 29 Mar 2021 12:14:20 +0200
jpeg-xl (0.3.6) UNRELEASED; urgency=medium
* Bump JPEG XL version to 0.3.6.
* Fix a bug that could result in the generation of invalid codestreams as
well as failure to decode valid streams.
-- Sami Boukortt <sboukortt@google.com> Thu, 25 Mar 2021 17:40:58 +0100
jpeg-xl (0.3.5) UNRELEASED; urgency=medium
* Bump JPEG XL version to 0.3.5.
* Memory usage improvements.
* New encode-time options for faster decoding at the cost of quality.
* Faster decoding to 8-bit output with the C API.
* GIMP plugin: avoid the sRGB conversion dialog for sRGB images, do not show
a console window on Windows.
* Various bug fixes.
* Man pages for cjxl and djxl.
-- Sami Boukortt <sboukortt@google.com> Tue, 23 Mar 2021 15:20:44 +0100
jpeg-xl (0.3.4) UNRELEASED; urgency=medium
* Bump JPEG XL version to 0.3.4.
* Improved box parsing.
* Improved metadata handling.
* Performance and memory usage improvements.
-- Sami Boukortt <sboukortt@google.com> Tue, 16 Mar 2021 12:13:59 +0100
jpeg-xl (0.3.3) UNRELEASED; urgency=medium
* Bump JPEG XL version to 0.3.3.
* Performance improvements for small images.
* Add a (flag-protected) non-high-precision mode with better speed.
* Significantly speed up the PQ EOTF.
* Allow optional HDR tone mapping in djxl (--tone_map, --display_nits).
* Change the behavior of djxl -j to make it consistent with cjxl (#153).
* Improve image quality.
* Improve EXIF handling.
-- Sami Boukortt <sboukortt@google.com> Fri, 5 Mar 2021 19:15:26 +0100
jpeg-xl (0.3.2) UNRELEASED; urgency=medium
* Bump JPEG XL version to 0.3.2.
* Fix embedded ICC encoding regression #149.
-- Alex Deymo <deymo@google.com> Fri, 12 Feb 2021 21:00:12 +0100
jpeg-xl (0.3.1) UNRELEASED; urgency=medium
* Bump JPEG XL version to 0.3.1.
-- Alex Deymo <deymo@google.com> Tue, 09 Feb 2021 09:48:43 +0100
jpeg-xl (0.3) UNRELEASED; urgency=medium
* Bump JPEG XL version to 0.3.
-- Alex Deymo <deymo@google.com> Wed, 27 Jan 2021 22:36:32 +0100
jpeg-xl (0.2) UNRELEASED; urgency=medium
* Bump JPEG XL version to 0.2.
-- Alex Deymo <deymo@google.com> Wed, 23 Nov 2020 20:42:10 +0100
jpeg-xl (0.1) UNRELEASED; urgency=medium
* JPEG XL format release candidate.
-- Alex Deymo <deymo@google.com> Fri, 13 Nov 2020 17:42:24 +0100
jpeg-xl (0.0.2-1) UNRELEASED; urgency=medium
* Initial debian package.
-- Alex Deymo <deymo@google.com> Tue, 27 Oct 2020 15:27:59 +0100

1
third_party/jpeg-xl/debian/compat поставляемый Normal file
Просмотреть файл

@ -0,0 +1 @@
10

61
third_party/jpeg-xl/debian/control поставляемый Normal file
Просмотреть файл

@ -0,0 +1,61 @@
Source: jpeg-xl
Maintainer: JPEG XL Maintainers <jpegxl@google.com>
Section: misc
Priority: optional
Standards-Version: 3.9.8
Build-Depends: cmake,
debhelper (>= 9),
libbrotli-dev,
libgif-dev,
libgmock-dev,
libgoogle-perftools-dev,
libgtest-dev,
libhwy-dev,
libjpeg-dev,
libopenexr-dev,
libpng-dev,
libwebp-dev,
pkg-config,
Homepage: https://gitlab.com/wg1/jpeg-xl
Rules-Requires-Root: no
Package: jxl
Architecture: any
Section: utils
Depends: ${misc:Depends}, ${shlibs:Depends}
Description: JPEG XL Image Coding System - "JXL" (command line utility)
The JPEG XL Image Coding System (ISO/IEC 18181) is a lossy and
lossless image compression format. It has a rich feature set and is
particularly optimized for responsive web environments, so that
content renders well on a wide range of devices. Moreover, it includes
several features that help transition from the legacy JPEG format.
.
This package installs the command line utilities.
Package: libjxl-dev
Architecture: any
Section: libdevel
Depends: libjxl (= ${binary:Version}), ${misc:Depends}
Description: JPEG XL Image Coding System - "JXL" (development files)
The JPEG XL Image Coding System (ISO/IEC 18181) is a lossy and
lossless image compression format. It has a rich feature set and is
particularly optimized for responsive web environments, so that
content renders well on a wide range of devices. Moreover, it includes
several features that help transition from the legacy JPEG format.
.
This package installs development files.
Package: libjxl
Architecture: any
Multi-Arch: same
Section: libs
Depends: ${shlibs:Depends}, ${misc:Depends}
Pre-Depends: ${misc:Pre-Depends}
Description: JPEG XL Image Coding System - "JXL" (shared libraries)
The JPEG XL Image Coding System (ISO/IEC 18181) is a lossy and
lossless image compression format. It has a rich feature set and is
particularly optimized for responsive web environments, so that
content renders well on a wide range of devices. Moreover, it includes
several features that help transition from the legacy JPEG format.
.
This package installs shared libraries.

236
third_party/jpeg-xl/debian/copyright поставляемый Normal file
Просмотреть файл

@ -0,0 +1,236 @@
Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
Upstream-Name: jpeg-xl
Files: *
Copyright: 2020 the JPEG XL Project
License: Apache-2.0
Files: third_party/sjpeg/*
Copyright: 2017 Google, Inc
License: Apache-2.0
Files: third_party/lodepng/*
Copyright: 2005-2018 Lode Vandevenne
License: Zlib License
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
.
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
.
3. This notice may not be removed or altered from any source
distribution.
Files: third_party/skcms/*
Copyright: 2018 Google Inc.
License: BSD-3-clause
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
.
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Files: third_party/testdata/imagecompression.info/*
Copyright: their respective owners.
License: License without any prohibitive copyright restrictions.
See https://imagecompression.info/test_images/ for details.
.
These Images are available without any prohibitive copyright restrictions.
.
These images are (c) their respective owners. You are granted full
redistribution and publication rights on these images provided:
.
1. The origin of the pictures must not be misrepresented; you must not claim
that you took the original pictures. If you use, publish or redistribute them,
an acknowledgment would be appreciated but is not required.
2. Altered versions must be plainly marked as such, and must not be
misinterpreted as being the originals.
3. No payment is required for distribution of this material, it must be
available freely under the conditions stated here. That is, it is prohibited to
sell the material.
4. This notice may not be removed or altered from any distribution.
Files: third_party/testdata/pngsuite/*
Copyright: Willem van Schaik, 1996, 2011
License: PngSuite License
See http://www.schaik.com/pngsuite/ for details.
.
Permission to use, copy, modify and distribute these images for any
purpose and without fee is hereby granted.
Files: third_party/testdata/raw.pixls/*
Copyright: their respective owners listed in https://raw.pixls.us/
License: CC0-1.0
Files: third_party/testdata/raw.pixls/*
Copyright: their respective owners listed in https://www.wesaturate.com/
License: CC0-1.0
Files: third_party/testdata/wide-gamut-tests/
Copyright: github.com/codelogic/wide-gamut-tests authors.
License: Apache-2.0
License: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
.
http://www.apache.org/licenses/LICENSE-2.0
.
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
.
On Debian systems, the complete text of the Apache License, Version 2
can be found in "/usr/share/common-licenses/Apache-2.0".
License: CC0
Creative Commons Zero v1.0 Universal
.
CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE LEGAL
SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN ATTORNEY-CLIENT
RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS INFORMATION ON AN "AS-IS"
BASIS. CREATIVE COMMONS MAKES NO WARRANTIES REGARDING THE USE OF THIS
DOCUMENT OR THE INFORMATION OR WORKS PROVIDED HEREUNDER, AND DISCLAIMS
LIABILITY FOR DAMAGES RESULTING FROM THE USE OF THIS DOCUMENT OR THE
INFORMATION OR WORKS PROVIDED HEREUNDER.
.
Statement of Purpose
.
The laws of most jurisdictions throughout the world automatically confer
exclusive Copyright and Related Rights (defined below) upon the creator and
subsequent owner(s) (each and all, an "owner") of an original work of
authorship and/or a database (each, a "Work").
.
Certain owners wish to permanently relinquish those rights to a Work for the
purpose of contributing to a commons of creative, cultural and scientific
works ("Commons") that the public can reliably and without fear of later
claims of infringement build upon, modify, incorporate in other works, reuse
and redistribute as freely as possible in any form whatsoever and for any
purposes, including without limitation commercial purposes. These owners may
contribute to the Commons to promote the ideal of a free culture and the
further production of creative, cultural and scientific works, or to gain
reputation or greater distribution for their Work in part through the use
and efforts of others.
.
For these and/or other purposes and motivations, and without any expectation
of additional consideration or compensation, the person associating CC0 with
a Work (the "Affirmer"), to the extent that he or she is an owner of
Copyright and Related Rights in the Work, voluntarily elects to apply CC0 to
the Work and publicly distribute the Work under its terms, with knowledge of
his or her Copyright and Related Rights in the Work and the meaning and
intended legal effect of CC0 on those rights.
.
1. Copyright and Related Rights. A Work made available under CC0 may be
protected by copyright and related or neighboring rights ("Copyright and
Related Rights"). Copyright and Related Rights include, but are not limited
to, the following:
i. the right to reproduce, adapt, distribute, perform, display,
communicate, and translate a Work;
ii. moral rights retained by the original author(s) and/or performer(s);
iii. publicity and privacy rights pertaining to a person's image or
likeness depicted in a Work;
iv. rights protecting against unfair competition in regards to a Work,
subject to the limitations in paragraph 4(a), below;
v. rights protecting the extraction, dissemination, use and reuse of data
in a Work;
vi. database rights (such as those arising under Directive 96/9/EC of the
European Parliament and of the Council of 11 March 1996 on the legal
protection of databases, and under any national implementation thereof,
including any amended or successor version of such directive); and
vii. other similar, equivalent or corresponding rights throughout the
world based on applicable law or treaty, and any national implementations
thereof.
.
2. Waiver. To the greatest extent permitted by, but not in contravention of,
applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and
unconditionally waives, abandons, and surrenders all of Affirmer's Copyright
and Related Rights and associated claims and causes of action, whether now
known or unknown (including existing as well as future claims and causes of
action), in the Work (i) in all territories worldwide, (ii) for the maximum
duration provided by applicable law or treaty (including future time
extensions), (iii) in any current or future medium and for any number of
copies, and (iv) for any purpose whatsoever, including without limitation
commercial, advertising or promotional purposes (the "Waiver"). Affirmer
makes the Waiver for the benefit of each member of the public at large and
to the detriment of Affirmer's heirs and successors, fully intending that
such Waiver shall not be subject to revocation, rescission, cancellation,
termination, or any other legal or equitable action to disrupt the quiet
enjoyment of the Work by the public as contemplated by Affirmer's express
Statement of Purpose.
.
3. Public License Fallback. Should any part of the Waiver for any reason be
judged legally invalid or ineffective under applicable law, then the Waiver
shall be preserved to the maximum extent permitted taking into account
Affirmer's express Statement of Purpose. In addition, to the extent the
Waiver is so judged Affirmer hereby grants to each affected person a
royalty-free, non transferable, non sublicensable, non exclusive,
irrevocable and unconditional license to exercise Affirmer's Copyright and
Related Rights in the Work (i) in all territories worldwide, (ii) for the
maximum duration provided by applicable law or treaty (including future time
extensions), (iii) in any current or future medium and for any number of
copies, and (iv) for any purpose whatsoever, including without limitation
commercial, advertising or promotional purposes (the "License"). The License
shall be deemed effective as of the date CC0 was applied by Affirmer to the
Work. Should any part of the License for any reason be judged legally
invalid or ineffective under applicable law, such partial invalidity or
ineffectiveness shall not invalidate the remainder of the License, and in
such case Affirmer hereby affirms that he or she will not (i) exercise any
of his or her remaining Copyright and Related Rights in the Work or (ii)
assert any associated claims and causes of action with respect to the Work,
in either case contrary to Affirmer's express Statement of Purpose.
.
4. Limitations and Disclaimers.
a. No trademark or patent rights held by Affirmer are waived, abandoned,
surrendered, licensed or otherwise affected by this document.
b. Affirmer offers the Work as-is and makes no representations or
warranties of any kind concerning the Work, express, implied, statutory or
otherwise, including without limitation warranties of title,
merchantability, fitness for a particular purpose, non infringement, or the
absence of latent or other defects, accuracy, or the present or absence of
errors, whether or not discoverable, all to the greatest extent permissible
under applicable law.
c. Affirmer disclaims responsibility for clearing rights of other persons
that may apply to the Work or any use thereof, including without limitation
any person's Copyright and Related Rights in the Work. Further, Affirmer
disclaims responsibility for obtaining any necessary consents, permissions
or other rights required for any use of the Work.
d. Affirmer understands and acknowledges that Creative Commons is not a
party to this document and has no duty or obligation with respect to this
CC0 or use of the Work.
.
For more information, please see:
<http://creativecommons.org/publicdomain/zero/1.0/>

1
third_party/jpeg-xl/debian/jxl.install поставляемый Normal file
Просмотреть файл

@ -0,0 +1 @@
debian/tmp/usr/bin/*

4
third_party/jpeg-xl/debian/libjxl-dev.install поставляемый Normal file
Просмотреть файл

@ -0,0 +1,4 @@
debian/tmp/usr/include/jxl/*.h
debian/tmp/usr/lib/*/*.a
debian/tmp/usr/lib/*/*.so
debian/tmp/usr/lib/*/pkgconfig/*.pc

1
third_party/jpeg-xl/debian/libjxl.install поставляемый Normal file
Просмотреть файл

@ -0,0 +1 @@
debian/tmp/usr/lib/*/libjxl*.so.*

12
third_party/jpeg-xl/debian/rules поставляемый Normal file
Просмотреть файл

@ -0,0 +1,12 @@
#!/usr/bin/make -f
# debhelper rules for the jpeg-xl Debian packages: everything is delegated to
# dh(1) with the CMake buildsystem, with only the configure step overridden
# to pass project-specific CMake flags.
%:
	dh $@ --buildsystem=cmake
override_dh_auto_configure:
	# TODO(deymo): Remove the DCMAKE_BUILD_TYPE once builds without NDEBUG
	# are as useful as Release builds.
	dh_auto_configure -- \
	  -DCMAKE_BUILD_TYPE=RelWithDebInfo \
	  -DJPEGXL_FORCE_SYSTEM_GTEST=ON \
	  -DJPEGXL_FORCE_SYSTEM_BROTLI=ON \
	  -DJPEGXL_FORCE_SYSTEM_HWY=ON

1
third_party/jpeg-xl/debian/source/format поставляемый Normal file
Просмотреть файл

@ -0,0 +1 @@
3.0 (quilt)

89
third_party/jpeg-xl/deps.sh поставляемый Normal file
Просмотреть файл

@ -0,0 +1,89 @@
#!/usr/bin/env bash
# Copyright (c) the JPEG XL Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file downloads the dependencies needed to build JPEG XL into third_party.
# These dependencies are normally pulled by gtest.
set -eu
MYDIR=$(dirname $(realpath "$0"))
# Git revisions we use for the given submodules. Update these whenever you
# update a git submodule.
THIRD_PARTY_HIGHWAY="ca1a57c342cd815053abfcffa29b44eaead4f20b"
THIRD_PARTY_LODEPNG="48e5364ef48ec2408f44c727657ac1b6703185f8"
THIRD_PARTY_SKCMS="64374756e03700d649f897dbd98c95e78c30c7da"
THIRD_PARTY_SJPEG="868ab558fad70fcbe8863ba4e85179eeb81cc840"
# Download the target revision from GitHub.
# Download the target revision from GitHub.
#
# $1: destination path relative to this script (e.g. third_party/highway);
#     also determines the pin variable name (uppercased, "/" -> "_").
# $2: either a GitHub "<org>/<repo>" slug or a googlesource.com base URL
#     (detected by the "http" prefix).
# The tarball is cached under downloads/<sha>.tar.gz; when both the cached
# file and the destination directory already exist, nothing is downloaded.
download_github() {
  local path="$1"
  local project="$2"
  # Map the path to its pin variable (e.g. THIRD_PARTY_HIGHWAY) and read the
  # pinned SHA from it via indirection.
  local varname="${path^^}"
  varname="${varname/\//_}"
  local sha
  eval "sha=\${${varname}}"
  local down_dir="${MYDIR}/downloads"
  local local_fn="${down_dir}/${sha}.tar.gz"
  if [[ -e "${local_fn}" && -d "${MYDIR}/${path}" ]]; then
    echo "${path} already up to date." >&2
    return 0
  fi
  local url
  local strip_components=0
  if [[ "${project:0:4}" == "http" ]]; then
    # "project" is a googlesource.com base url.
    url="${project}${sha}.tar.gz"
  else
    # GitHub files have a top-level directory
    strip_components=1
    url="https://github.com/${project}/tarball/${sha}"
  fi
  echo "Downloading ${path} version ${sha}..." >&2
  mkdir -p "${down_dir}"
  # NOTE(review): curl is invoked without --fail, so an HTTP error body would
  # be saved to disk and only rejected later by tar; confirm whether --fail
  # should be added.
  curl -L --show-error -o "${local_fn}.tmp" "${url}"
  mkdir -p "${MYDIR}/${path}"
  tar -zxf "${local_fn}.tmp" -C "${MYDIR}/${path}" \
    --strip-components="${strip_components}"
  # The tarball is renamed into place only after a successful extraction, so
  # a failed run does not poison the cache.
  mv "${local_fn}.tmp" "${local_fn}"
}
# Fetch the third-party dependencies: via git submodules when running from a
# git checkout, otherwise by downloading the pinned tarballs declared at the
# top of this script.
main() {
  # NOTE(review): rev-parse stderr is not suppressed here, so a "not a git
  # repository" message may be printed in the tarball case -- confirm
  # whether it should be redirected like in bash_test.sh.
  if git -C "${MYDIR}" rev-parse; then
    # Fix: "Currenty" -> "Current" in the user-facing message.
    cat >&2 <<EOF
Current directory is a git repository, downloading dependencies via git:
git submodule update --init --recursive
EOF
    git -C "${MYDIR}" submodule update --init --recursive
    return 0
  fi
  # Sources downloaded from a tarball.
  download_github third_party/highway google/highway
  download_github third_party/lodepng lvandeve/lodepng
  download_github third_party/sjpeg webmproject/sjpeg
  download_github third_party/skcms \
    "https://skia.googlesource.com/skcms/+archive/"
  echo "Done."
}
main "$@"

30
third_party/jpeg-xl/docker/Dockerfile.jpegxl-builder поставляемый Normal file
Просмотреть файл

@ -0,0 +1,30 @@
# Copyright (c) the JPEG XL Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Build an Ubuntu-based docker image with the installed software needed to
# develop and test JPEG XL.
FROM ubuntu:bionic
# Set a prompt for when using it locally.
ENV PS1="\[\033[01;33m\]\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ "
# Apt configuration drop-in (presumably disables "Recommends" packages,
# going by the file name -- contents not visible here).
COPY scripts/99_norecommends /etc/apt/apt.conf.d/99_norecommends
COPY scripts /jpegxl_scripts
# ARG (not ENV) so the non-interactive frontend only applies at build time.
ARG DEBIAN_FRONTEND=noninteractive
# Run the provisioning script, then delete the scripts to keep the final
# image smaller (single RUN so the removal lands in the same layer).
RUN /jpegxl_scripts/jpegxl_builder.sh && \
  rm -rf /jpegxl_scripts

Просмотреть файл

@ -0,0 +1,46 @@
# Copyright (c) the JPEG XL Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Build an Ubuntu-based docker image for aarch64 with the installed software
# needed to run JPEG XL. This is only useful when running on actual aarch64
# hardware.
FROM arm64v8/ubuntu:bionic
# Apt configuration drop-in (presumably disables "Recommends" packages,
# going by the file name -- contents not visible here).
COPY scripts/99_norecommends /etc/apt/apt.conf.d/99_norecommends
# Set a prompt for when using it locally.
ENV PS1="\[\033[01;33m\]\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ "
# ARG (not ENV) so the non-interactive frontend only applies at build time.
ARG DEBIAN_FRONTEND=noninteractive
# Install runtime dependencies and clear the apt lists in the same layer to
# keep the image small.
RUN set -ex; \
  apt-get update -y; \
  apt-get install -y \
    bsdmainutils \
    cmake \
    curl \
    ca-certificates \
    extra-cmake-modules \
    git \
    imagemagick \
    libjpeg8 \
    libgif7 \
    libgoogle-perftools4 \
    libopenexr22 \
    libpng16-16 \
    libqt5x11extras5 \
    libsdl2-2.0-0 \
    parallel; \
  rm -rf /var/lib/apt/lists/*;

7
third_party/jpeg-xl/docker/README.md поставляемый Normal file
Просмотреть файл

@ -0,0 +1,7 @@
### Docker container infrastructure for JPEG XL
This directory contains the requirements to build a docker image for the
JPEG XL project builder.
Docker images need to be created and uploaded manually. See ./build.sh for
details.

92
third_party/jpeg-xl/docker/build.sh поставляемый Normal file
Просмотреть файл

@ -0,0 +1,92 @@
#!/usr/bin/env bash
# Copyright (c) the JPEG XL Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Abort on errors and on use of unset variables.
set -eu

# Absolute path of the directory containing this script.
MYDIR=$(dirname $(realpath "$0"))

# Target names discovered from Dockerfile.<target> files; filled in by
# load_targets().
declare -a TARGETS
# Populate the global TARGETS array with the target names derived from the
# Dockerfile.<target> files next to this script, in sorted order.
load_targets() {
  # Built-in OSX "find" does not support "-m".
  FIND=$(which "gfind" || which "find")
  # Pass the starting directory explicitly: GNU find defaults to "." when no
  # starting point is given, but BSD/macOS find requires one. Using MYDIR also
  # makes this function independent of the current working directory.
  for f in $(${FIND} "${MYDIR}" -maxdepth 1 -name 'Dockerfile.*' | sort); do
    local target="${f#*Dockerfile.}"
    TARGETS+=("${target}")
  done
}
# Print usage help to stderr.
# $1: name of the script as invoked (typically $0).
# Lists "all" plus every target discovered by load_targets(), so
# load_targets() must have run first for the per-target lines to appear.
usage() {
  cat >&2 <<EOF
Use: $1 [targets]
Available targets:
 * all
EOF
  for target in "${TARGETS[@]}"; do
    echo " * ${target}" >&2
  done
}
# Build the docker image for one target and optionally push it.
# $1: target name; the image is built from Dockerfile.<target>.
# Set JPEGXL_PUSH=1 to also push the image to gcr.io and update the
# image digest referenced by ../.gitlab-ci.yml.
build_target() {
  local target="$1"
  local dockerfile="${MYDIR}/Dockerfile.${target}"
  # JPEG XL builder images are stored in the gcr.io/jpegxl project.
  local tag="gcr.io/jpegxl/${target}"
  echo "Building ${target}"
  # Build without cache so base-image/package updates are always picked up;
  # the full build log goes to <target>.log in the current directory. A failed
  # build is reported but does not abort the script (so other targets in a
  # batch still build).
  if ! sudo docker build --no-cache -t "${tag}" -f "${dockerfile}" "${MYDIR}" \
      >"${target}.log" 2>&1; then
    echo "${target} failed. See ${target}.log" >&2
  else
    echo "Done, to upload image run:" >&2
    echo " sudo docker push ${tag}"
    if [[ "${JPEGXL_PUSH:-}" == "1" ]]; then
      echo "sudo docker push ${tag}" >&2
      sudo docker push "${tag}"
      # The RepoDigest is only created after it is pushed.
      local fulltag=$(sudo docker inspect --format="{{.RepoDigests}}" "${tag}")
      # Strip the surrounding "[" / "]" from the docker inspect list output.
      fulltag="${fulltag#[}"
      fulltag="${fulltag%]}"
      echo "Updating .gitlab-ci.yml to ${fulltag}" >&2
      # Replace the old pinned image digest with the freshly pushed one.
      sed -E "s;${tag}@sha256:[0-9a-f]+;${fulltag};" \
        -i "${MYDIR}/../.gitlab-ci.yml"
    fi
  fi
}
# Entry point: build the Dockerfile targets given on the command line.
# With no arguments, print usage and exit non-zero; the special target "all"
# builds every discovered target.
main() {
  cd "${MYDIR}"
  local target="${1:-}"
  load_targets
  if [[ -z "${target}" ]]; then
    # Quote "$0": unquoted it word-splits if the script path contains spaces.
    usage "$0"
    exit 1
  fi
  if [[ "${target}" == "all" ]]; then
    for target in "${TARGETS[@]}"; do
      build_target "${target}"
    done
  else
    # Build exactly the targets the caller listed, in order.
    for target in "$@"; do
      build_target "${target}"
    done
  fi
}
main "$@"

1
third_party/jpeg-xl/docker/scripts/99_norecommends поставляемый Normal file
Просмотреть файл

@ -0,0 +1 @@
APT::Install-Recommends "false";

28
third_party/jpeg-xl/docker/scripts/binutils_align_fix.patch поставляемый Normal file
Просмотреть файл

@ -0,0 +1,28 @@
Description: fix lack of alignment in relocations (crashes on mingw)
See https://sourceware.org/git/?p=binutils-gdb.git;a=patch;h=73af69e74974eaa155eec89867e3ccc77ab39f6d
From: Marc <marc@groundctl.com>
Date: Fri, 9 Nov 2018 11:13:50 +0000
Subject: [PATCH] Allow for compilers that do not produce aligned .rdat
sections in PE format files.
--- a/upstream/ld/scripttempl/pe.sc 2020-05-12 18:45:12.000000000 +0200
+++ b/upstream/ld/scripttempl/pe.sc 2020-05-12 18:47:12.000000000 +0200
@@ -143,6 +143,7 @@
.rdata ${RELOCATING+BLOCK(__section_alignment__)} :
{
${R_RDATA}
+ . = ALIGN(4);
${RELOCATING+__rt_psrelocs_start = .;}
${RELOCATING+KEEP(*(.rdata_runtime_pseudo_reloc))}
${RELOCATING+__rt_psrelocs_end = .;}
--- a/upstream/ld/scripttempl/pep.sc 2020-05-12 18:45:19.000000000 +0200
+++ b/upstream/ld/scripttempl/pep.sc 2020-05-12 18:47:18.000000000 +0200
@@ -143,6 +143,7 @@
.rdata ${RELOCATING+BLOCK(__section_alignment__)} :
{
${R_RDATA}
+ . = ALIGN(4);
${RELOCATING+__rt_psrelocs_start = .;}
${RELOCATING+KEEP(*(.rdata_runtime_pseudo_reloc))}
${RELOCATING+__rt_psrelocs_end = .;}

46
third_party/jpeg-xl/docker/scripts/emsdk_install.sh поставляемый Normal file
Просмотреть файл

@ -0,0 +1,46 @@
#!/usr/bin/env bash
# Copyright (c) the JPEG XL Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# emsdk installer tarball (master branch snapshot) and install location.
EMSDK_URL="https://github.com/emscripten-core/emsdk/archive/master.tar.gz"
EMSDK_DIR="/opt/emsdk"
# Emscripten SDK release that is installed and activated.
EMSDK_RELEASE="2.0.4"

# Abort on errors/unset variables; trace every command.
set -eu -x

# Temporary files cleanup hooks.
CLEANUP_FILES=()
# Delete every path registered in CLEANUP_FILES; a no-op when none are
# registered (guarding the empty case keeps "rm -fr" from running bare).
cleanup() {
  [[ ${#CLEANUP_FILES[@]} -eq 0 ]] || rm -fr "${CLEANUP_FILES[@]}"
}
trap "{ set +x; } 2>/dev/null; cleanup" INT TERM EXIT
# Download the emsdk tarball into a temporary directory, unpack it into
# EMSDK_DIR and install/activate the pinned EMSDK_RELEASE.
main() {
  local workdir=$(mktemp -d --suffix=emsdk)
  CLEANUP_FILES+=("${workdir}")

  local emsdktar="${workdir}/emsdk.tar.gz"
  curl --output "${emsdktar}" "${EMSDK_URL}" --location
  mkdir -p "${EMSDK_DIR}"
  # Strip the single top-level directory so EMSDK_DIR is the sdk root.
  tar -zxf "${emsdktar}" -C "${EMSDK_DIR}" --strip-components=1
  cd "${EMSDK_DIR}"
  # --shallow avoids a full git checkout; --embedded keeps the emsdk
  # configuration inside EMSDK_DIR rather than the user's home directory.
  ./emsdk install --shallow "${EMSDK_RELEASE}"
  ./emsdk activate --embedded "${EMSDK_RELEASE}"
}
main "$@"

515
third_party/jpeg-xl/docker/scripts/jpegxl_builder.sh поставляемый Normal file
Просмотреть файл

@ -0,0 +1,515 @@
#!/usr/bin/env bash
# Copyright (c) the JPEG XL Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Main entry point for all the Dockerfile for jpegxl-builder. This centralized
# file helps sharing code and configuration between Dockerfiles.
# Abort on errors/unset variables; trace every command.
set -eux

# Absolute path of the directory containing this script.
MYDIR=$(dirname $(realpath "$0"))

# Pinned releases, download URLs, SHA256 checksums and optional cmake flags
# for the libraries built by install_from_source(). For a package FOO the
# variables consumed are FOO_URL, FOO_SHA256 and (optionally) FOO_FLAGS.

# libjpeg-turbo.
JPEG_TURBO_RELEASE="2.0.4"
JPEG_TURBO_URL="https://github.com/libjpeg-turbo/libjpeg-turbo/archive/${JPEG_TURBO_RELEASE}.tar.gz"
JPEG_TURBO_SHA256="7777c3c19762940cff42b3ba4d7cd5c52d1671b39a79532050c85efb99079064"

# zlib (dependency of libpng)
ZLIB_RELEASE="1.2.11"
ZLIB_URL="https://www.zlib.net/zlib-${ZLIB_RELEASE}.tar.gz"
ZLIB_SHA256="c3e5e9fdd5004dcb542feda5ee4f0ff0744628baf8ed2dd5d66f8ca1197cb1a1"
# The name in the .pc and the .dll generated don't match in zlib for Windows
# because they use different .dll names in Windows. We avoid that by defining
# UNIX=1. We also install all the .dll files to ${prefix}/lib instead of the
# default ${prefix}/bin.
ZLIB_FLAGS='-DUNIX=1 -DINSTALL_PKGCONFIG_DIR=/${CMAKE_INSTALL_PREFIX}/lib/pkgconfig -DINSTALL_BIN_DIR=/${CMAKE_INSTALL_PREFIX}/lib'

# libpng
LIBPNG_RELEASE="1.6.37"
LIBPNG_URL="https://github.com/glennrp/libpng/archive/v${LIBPNG_RELEASE}.tar.gz"
LIBPNG_SHA256="ca74a0dace179a8422187671aee97dd3892b53e168627145271cad5b5ac81307"

# giflib
GIFLIB_RELEASE="5.2.1"
GIFLIB_URL="https://netcologne.dl.sourceforge.net/project/giflib/giflib-${GIFLIB_RELEASE}.tar.gz"
GIFLIB_SHA256="31da5562f44c5f15d63340a09a4fd62b48c45620cd302f77a6d9acf0077879bd"
# A patch needed to compile GIFLIB in mingw.
GIFLIB_PATCH_URL="https://github.com/msys2/MINGW-packages/raw/3afde38fcee7b3ba2cafd97d76cca8f06934504f/mingw-w64-giflib/001-mingw-build.patch"
GIFLIB_PATCH_SHA256="2b2262ddea87fc07be82e10aeb39eb699239f883c899aa18a16e4d4e40af8ec8"

# webp
WEBP_RELEASE="1.0.2"
WEBP_URL="https://codeload.github.com/webmproject/libwebp/tar.gz/v${WEBP_RELEASE}"
WEBP_SHA256="347cf85ddc3497832b5fa9eee62164a37b249c83adae0ba583093e039bf4881f"

# Google benchmark
BENCHMARK_RELEASE="1.5.2"
BENCHMARK_URL="https://github.com/google/benchmark/archive/v${BENCHMARK_RELEASE}.tar.gz"
BENCHMARK_SHA256="dccbdab796baa1043f04982147e67bb6e118fe610da2c65f88912d73987e700c"
BENCHMARK_FLAGS="-DGOOGLETEST_PATH=${MYDIR}/../../third_party/googletest"
# attribute(format(__MINGW_PRINTF_FORMAT, ...)) doesn't work in our
# environment, so we disable the warning.
# NOTE(review): this reassignment discards the -DGOOGLETEST_PATH flags set
# just above instead of appending to them — confirm against upstream whether
# that is intentional.
BENCHMARK_FLAGS="-DCMAKE_BUILD_TYPE=Release -DBENCHMARK_ENABLE_TESTING=OFF \
  -DCMAKE_CXX_FLAGS=-Wno-ignored-attributes \
  -DCMAKE_POSITION_INDEPENDENT_CODE=ON"

# V8
V8_VERSION="8.7.230"

# Temporary files cleanup hooks.
CLEANUP_FILES=()
# Delete every path registered in CLEANUP_FILES.
cleanup() {
  if [[ ${#CLEANUP_FILES[@]} -ne 0 ]]; then
    rm -fr "${CLEANUP_FILES[@]}"
  fi
}
# Turn off xtrace before cleaning up so the trap output stays readable.
trap "{ set +x; } 2>/dev/null; cleanup" INT TERM EXIT

# List of Ubuntu arch names supported by the builder (such as "i386").
LIST_ARCHS=(
  amd64
  i386
  arm64
  armhf
)

# List of target triplets supported by the builder.
LIST_TARGETS=(
  x86_64-linux-gnu
  i686-linux-gnu
  arm-linux-gnueabihf
  aarch64-linux-gnu
)
# Windows cross-compilation targets (mingw).
LIST_MINGW_TARGETS=(
  i686-w64-mingw32
  x86_64-w64-mingw32
)
# WebAssembly targets, built through emscripten.
LIST_WASM_TARGETS=(
  wasm32
)
# Setup the apt repositories and supported architectures.
# Adds the NodeSource repo, registers all LIST_ARCHS as dpkg foreign
# architectures, and rewrites /etc/apt/sources.list so amd64/i386 resolve
# against the main mirrors while the other archs use ports.ubuntu.com.
setup_apt() {
  apt-get update -y
  apt-get install -y curl gnupg ca-certificates
  # Key used by the llvm-toolchain repositories.
  apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 1E9377A2BA9EF27F

  # node sources.
  cat >/etc/apt/sources.list.d/nodesource.list <<EOF
deb https://deb.nodesource.com/node_14.x bionic main
deb-src https://deb.nodesource.com/node_14.x bionic main
EOF
  curl -s https://deb.nodesource.com/gpgkey/nodesource.gpg.key | apt-key add -

  # Split LIST_ARCHS into main-mirror archs and ports.ubuntu.com archs.
  local port_list=()
  local main_list=()
  local ubarch
  for ubarch in "${LIST_ARCHS[@]}"; do
    if [[ "${ubarch}" != "amd64" && "${ubarch}" != "i386" ]]; then
      # other archs are not part of the main mirrors, but available in
      # ports.ubuntu.com.
      port_list+=("${ubarch}")
    else
      main_list+=("${ubarch}")
    fi
    # Add the arch to the system.
    if [[ "${ubarch}" != "amd64" ]]; then
      dpkg --add-architecture "${ubarch}"
    fi
  done

  # Update the sources.list with the split of supported architectures.
  # The original list is kept as a backup so re-running stays idempotent.
  local bkplist="/etc/apt/sources.list.bkp"
  [[ -e "${bkplist}" ]] || \
    mv /etc/apt/sources.list "${bkplist}"
  local newlist="/etc/apt/sources.list.tmp"
  rm -f "${newlist}"

  # Join the arch arrays into comma-separated [arch=...] qualifiers.
  port_list=$(echo "${port_list[@]}" | tr ' ' ,)
  if [[ -n "${port_list}" ]]; then
    local port_url="http://ports.ubuntu.com/ubuntu-ports/"
    # Rewrite each non-comment "deb" line to fetch from the ports mirror.
    grep -v -E '^#' "${bkplist}" |
      sed -E "s;^deb (http[^ ]+) (.*)\$;deb [arch=${port_list}] ${port_url} \\2;" \
      >>"${newlist}"
  fi

  main_list=$(echo "${main_list[@]}" | tr ' ' ,)
  # Keep the original mirror for main archs and add matching deb-src lines.
  grep -v -E '^#' "${bkplist}" |
    sed -E "s;^deb (http[^ ]+) (.*)\$;deb [arch=${main_list}] \\1 \\2\ndeb-src [arch=${main_list}] \\1 \\2;" \
    >>"${newlist}"
  mv "${newlist}" /etc/apt/sources.list
}
# Install, via apt, the native toolchain plus the per-architecture library
# packages and per-target cross-compilers needed by the builder image.
# Reads LIST_ARCHS, LIST_TARGETS and UNPACK_PKGS (defined at file scope).
install_pkgs() {
  packages=(
    # Native compilers (minimum for SIMD is clang-7)
    clang-7 clang-format-7 clang-tidy-7
    # TODO: Consider adding clang-8 to every builder:
    # clang-8 clang-format-8 clang-tidy-8

    # For cross-compiling to Windows with mingw.
    mingw-w64
    wine64
    wine-binfmt

    # Native tools.
    bsdmainutils
    cmake
    extra-cmake-modules
    git
    llvm
    nasm
    ninja-build
    parallel
    pkg-config

    # These are used by the ./ci.sh lint in the native builder.
    clang-format-7
    clang-format-8

    # For coverage builds
    gcovr

    # For compiling giflib documentation.
    xmlto

    # Common libraries.
    libstdc++-8-dev

    # We don't use tcmalloc on archs other than amd64. This installs
    # libgoogle-perftools4:amd64.
    google-perftools

    # NodeJS for running WASM tests
    nodejs

    # To generate API documentation.
    doxygen

    # Freezes version that builds (passes tests). Newer version
    # (2.30-21ubuntu1~18.04.4) claims to fix "On Intel Skylake
    # (-march=native) generated avx512 instruction can be wrong",
    # but newly added tests does not pass. Perhaps the problem is
    # that mingw package is not updated.
    binutils-source=2.30-15ubuntu1
  )

  # Install packages that are arch-dependent.
  local ubarch
  for ubarch in "${LIST_ARCHS[@]}"; do
    packages+=(
      # Library dependencies. These normally depend on the target architecture
      # we are compiling for and can't usually be installed for multiple
      # architectures at the same time.
      libgif7:"${ubarch}"
      libjpeg-dev:"${ubarch}"
      libpng-dev:"${ubarch}"
      libqt5x11extras5-dev:"${ubarch}"
      libstdc++-8-dev:"${ubarch}"
      qtbase5-dev:"${ubarch}"

      # For OpenEXR:
      libilmbase12:"${ubarch}"
      libopenexr22:"${ubarch}"

      # TCMalloc dependency
      libunwind-dev:"${ubarch}"

      # Cross-compiling tools per arch.
      libc6-dev-"${ubarch}"-cross
      libstdc++-8-dev-"${ubarch}"-cross
    )
  done

  local target
  for target in "${LIST_TARGETS[@]}"; do
    # Per target cross-compiling tools.
    if [[ "${target}" != "x86_64-linux-gnu" ]]; then
      packages+=(
        binutils-"${target}"
        gcc-"${target}"
      )
    fi
  done

  # Install all the manual packages via "apt install" for the main arch. These
  # will be installed for other archs via manual download and unpack.
  apt install -y "${packages[@]}" "${UNPACK_PKGS[@]}"
}
# binutils <2.32 need a patch.
# Rebuild the binutils-mingw-w64 Ubuntu package with the alignment fix from
# binutils_align_fix.patch applied, then install the resulting .deb files.
install_binutils() {
  local workdir=$(mktemp -d --suffix=_install)
  CLEANUP_FILES+=("${workdir}")
  pushd "${workdir}"
  apt source binutils-mingw-w64
  apt -y build-dep binutils-mingw-w64
  # NOTE(review): the versioned source directory name is hardcoded; this will
  # break if the Ubuntu package version changes — confirm against the pinned
  # base image.
  cd binutils-mingw-w64-8ubuntu1
  cp "${MYDIR}/binutils_align_fix.patch" debian/patches
  echo binutils_align_fix.patch >> debian/patches/series
  dpkg-buildpackage -b
  cd ..
  dpkg -i *deb
  popd
}
# Install a library from the source code for multiple targets.
# Usage: install_from_source <package> <target> [<target...>]
# Reads <package>_URL, <package>_SHA256 and optional <package>_FLAGS (see the
# variables at the top of this file), downloads the tarball once, verifies
# its checksum, then configures (cmake, or make for GIFLIB), builds and
# installs the library into the per-target prefix for each requested target.
install_from_source() {
  local package="$1"
  shift

  # Resolve the package metadata from the <package>_* variables.
  local url
  eval "url=\${${package}_URL}"
  local sha256
  eval "sha256=\${${package}_SHA256}"
  # Optional package flags
  local pkgflags
  eval "pkgflags=\${${package}_FLAGS:-}"

  local workdir=$(mktemp -d --suffix=_install)
  CLEANUP_FILES+=("${workdir}")

  local tarfile="${workdir}"/$(basename "${url}")
  curl -L --output "${tarfile}" "${url}"
  # Refuse to build anything whose checksum doesn't match the pinned value.
  if ! echo "${sha256} ${tarfile}" | sha256sum -c --status -; then
    echo "SHA256 mismatch for ${url}: expected ${sha256} but found:"
    sha256sum "${tarfile}"
    exit 1
  fi

  local target
  for target in "$@"; do
    echo "Installing ${package} for target ${target} from ${url}"

    # Each target gets its own pristine source tree.
    local srcdir="${workdir}/source-${target}"
    mkdir -p "${srcdir}"
    tar -zxf "${tarfile}" -C "${srcdir}" --strip-components=1

    # Native builds install to /usr; cross builds to /usr/<triplet>.
    local prefix="/usr"
    if [[ "${target}" != "x86_64-linux-gnu" ]]; then
      prefix="/usr/${target}"
    fi

    # Apply patches to buildfiles.
    if [[ "${package}" == "GIFLIB" && "${target}" == *mingw32 ]]; then
      # GIFLIB Makefile has several problems so we need to fix them here. We are
      # using a patch from MSYS2 that already fixes the compilation for mingw.
      local make_patch="${srcdir}/libgif.patch"
      curl -L "${GIFLIB_PATCH_URL}" -o "${make_patch}"
      echo "${GIFLIB_PATCH_SHA256} ${make_patch}" | sha256sum -c --status -
      patch "${srcdir}/Makefile" < "${make_patch}"
    elif [[ "${package}" == "LIBPNG" && "${target}" == wasm* ]]; then
      # Cut the dependency to libm; there is pull request to fix it, so this
      # might not be needed in the future.
      sed -i 's/APPLE/EMSCRIPTEN/g' "${srcdir}/CMakeLists.txt"
    fi

    # Per-target build configuration.
    local cmake_args=()
    local export_args=("CC=clang-7" "CXX=clang++-7")
    local cmake="cmake"
    local make="make"
    local system_name="Linux"
    if [[ "${target}" == *mingw32 ]]; then
      system_name="Windows"
      # When compiling with clang, CMake doesn't detect that we are using mingw.
      cmake_args+=(
        -DMINGW=1
        # Googletest needs this when cross-compiling to windows
        -DCMAKE_CROSSCOMPILING=1
        -DHAVE_STD_REGEX=0
        -DHAVE_POSIX_REGEX=0
        -DHAVE_GNU_POSIX_REGEX=0
      )
      local windres=$(which ${target}-windres || true)
      if [[ -n "${windres}" ]]; then
        cmake_args+=(-DCMAKE_RC_COMPILER="${windres}")
      fi
    fi
    if [[ "${target}" == wasm* ]]; then
      system_name="WASM"
      # emscripten wrappers drive the underlying cmake/make invocations.
      cmake="emcmake cmake"
      make="emmake make"
      export_args=()
      cmake_args+=(
        -DCMAKE_FIND_ROOT_PATH="${prefix}"
        -DCMAKE_PREFIX_PATH="${prefix}"
      )
      # Static and shared library link to the same file -> race condition.
      nproc=1
    else
      nproc=`nproc --all`
    fi
    cmake_args+=(-DCMAKE_SYSTEM_NAME="${system_name}")

    if [[ "${target}" != "x86_64-linux-gnu" ]]; then
      # Cross-compiling.
      cmake_args+=(
        -DCMAKE_C_COMPILER_TARGET="${target}"
        -DCMAKE_CXX_COMPILER_TARGET="${target}"
        -DCMAKE_SYSTEM_PROCESSOR="${target%%-*}"
      )
    fi

    if [[ -e "${srcdir}/CMakeLists.txt" ]]; then
      # Most pacakges use cmake for building which is easier to configure for
      # cross-compiling.
      if [[ "${package}" == "JPEG_TURBO" && "${target}" == wasm* ]]; then
        # JT erroneously detects WASM CPU as i386 and tries to use asm.
        # Wasm/Emscripten support for dynamic linking is incomplete; disable
        # to avoid CMake warning.
        cmake_args+=(-DWITH_SIMD=0 -DENABLE_SHARED=OFF)
      fi
      # Configure, build and install in a subshell so cd/export don't leak.
      (
        cd "${srcdir}"
        export ${export_args[@]}
        ${cmake} \
          -DCMAKE_INSTALL_PREFIX="${prefix}" \
          "${cmake_args[@]}" ${pkgflags}
        ${make} -j${nproc}
        ${make} install
      )
    elif [[ "${package}" == "GIFLIB" ]]; then
      # GIFLIB doesn't yet have a cmake build system. There is a pull
      # request in giflib for adding CMakeLists.txt so this might not be
      # needed in the future.
      (
        cd "${srcdir}"
        local giflib_make_flags=(
          CFLAGS="-O2 --target=${target} -std=gnu99"
          PREFIX="${prefix}"
        )
        if [[ "${target}" != wasm* ]]; then
          giflib_make_flags+=(CC=clang-7)
        fi
        # giflib make dependencies are not properly set up so parallel building
        # doesn't work for everything.
        ${make} -j${nproc} libgif.a "${giflib_make_flags[@]}"
        ${make} -j${nproc} all "${giflib_make_flags[@]}"
        ${make} install "${giflib_make_flags[@]}"
      )
    else
      echo "Don't know how to install ${package}"
      exit 1
    fi
  done
}
# Packages that are manually unpacked for each architecture.
# These are apt-installed for amd64 by install_pkgs() and extracted with
# "dpkg -x" for the other archs in main() (they can't coexist multi-arch).
UNPACK_PKGS=(
  libgif-dev
  libclang-common-7-dev

  # For OpenEXR:
  libilmbase-dev
  libopenexr-dev

  # TCMalloc
  libgoogle-perftools-dev
  libtcmalloc-minimal4
  libgoogle-perftools4
)
# Main script entry point.
# Orchestrates the whole builder-image provisioning: apt setup, package
# installation, patched binutils, msan/qemu/emsdk helper scripts, and the
# from-source third-party libraries for every supported target.
main() {
  cd "${MYDIR}"

  # Configure the repositories with the sources for multi-arch cross
  # compilation.
  setup_apt
  apt-get update -y
  apt-get dist-upgrade -y

  install_pkgs
  install_binutils
  apt clean

  # Manually extract packages for the target arch that can't install it directly
  # at the same time as the native ones.
  local ubarch
  for ubarch in "${LIST_ARCHS[@]}"; do
    if [[ "${ubarch}" != "amd64" ]]; then
      local pkg
      for pkg in "${UNPACK_PKGS[@]}"; do
        apt download "${pkg}":"${ubarch}"
        dpkg -x "${pkg}"_*_"${ubarch}".deb /
      done
    fi
  done

  # TODO: Add clang from the llvm repos. This is problematic since we are
  # installing libclang-common-7-dev:"${ubarch}" from the ubuntu ports repos
  # which is not available in the llvm repos so it might have a different
  # version than the ubuntu ones.

  # Remove the win32 libgcc version. The gcc-mingw-w64-x86-64 (and i686)
  # packages install two libgcc versions:
  #   /usr/lib/gcc/x86_64-w64-mingw32/7.3-posix
  #   /usr/lib/gcc/x86_64-w64-mingw32/7.3-win32
  # (exact libgcc version number depends on the package version).
  #
  # Clang will pick the best libgcc, sorting by version, but it doesn't
  # seem to be a way to specify one or the other one, except by passing
  # -nostdlib and setting all the include paths from the command line.
  # To check which one is being used you can run:
  #   clang++-7 --target=x86_64-w64-mingw32 -v -print-libgcc-file-name
  # We need to use the "posix" versions for thread support, so here we
  # just remove the other one.
  local target
  for target in "${LIST_MINGW_TARGETS[@]}"; do
    update-alternatives --set "${target}-gcc" $(which "${target}-gcc-posix")
    local gcc_win32_path=$("${target}-cpp-win32" -print-libgcc-file-name)
    rm -rf $(dirname "${gcc_win32_path}")
  done

  # TODO: Add msan for the target when cross-compiling. This only installs it
  # for amd64.
  ./msan_install.sh

  # Build and install qemu user-linux targets.
  ./qemu_install.sh

  # Install emscripten SDK.
  ./emsdk_install.sh

  # Setup environment for building WASM libraries from sources.
  source /opt/emsdk/emsdk_env.sh

  # Install some dependency libraries manually for the different targets.
  install_from_source JPEG_TURBO "${LIST_MINGW_TARGETS[@]}" "${LIST_WASM_TARGETS[@]}"
  install_from_source ZLIB "${LIST_MINGW_TARGETS[@]}" "${LIST_WASM_TARGETS[@]}"
  install_from_source LIBPNG "${LIST_MINGW_TARGETS[@]}" "${LIST_WASM_TARGETS[@]}"
  install_from_source GIFLIB "${LIST_MINGW_TARGETS[@]}" "${LIST_WASM_TARGETS[@]}"
  # webp in Ubuntu is relatively old so we install it from source for everybody.
  install_from_source WEBP "${LIST_TARGETS[@]}" "${LIST_MINGW_TARGETS[@]}"
  install_from_source BENCHMARK "${LIST_TARGETS[@]}" "${LIST_MINGW_TARGETS[@]}"

  # Install v8. v8 has better WASM SIMD support than NodeJS 14.
  # First we need the installer to install v8.
  npm install jsvu -g
  # install specific version;
  HOME=/opt jsvu --os=linux64 "v8@${V8_VERSION}"
  ln -s "/opt/.jsvu/v8-${V8_VERSION}" "/opt/.jsvu/v8"

  # Cleanup.
  find /var/lib/apt/lists/ -mindepth 1 -delete
}
main "$@"

140
third_party/jpeg-xl/docker/scripts/msan_install.sh поставляемый Normal file
Просмотреть файл

@ -0,0 +1,140 @@
#!/usr/bin/env bash
# Copyright (c) the JPEG XL Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Abort on errors and on use of unset variables.
set -eu

# Absolute path of the directory containing this script.
MYDIR=$(dirname $(realpath "$0"))

# Convenience flag to pass both CMAKE_C_FLAGS and CMAKE_CXX_FLAGS
CMAKE_FLAGS=${CMAKE_FLAGS:-}
# Per-language flags default to the shared CMAKE_FLAGS value.
CMAKE_C_FLAGS=${CMAKE_C_FLAGS:-${CMAKE_FLAGS}}
CMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS:-${CMAKE_FLAGS}}
CMAKE_EXE_LINKER_FLAGS=${CMAKE_EXE_LINKER_FLAGS:-}

# Clang version suffix ("6.0", "7", ...); autodetected when left empty.
CLANG_VERSION="${CLANG_VERSION:-}"
# Detect the clang version suffix and store it in CLANG_VERSION. For example,
# "6.0" for clang 6 or "7" for clang 7. A pre-set CLANG_VERSION is honored
# unchanged; the compiler is taken from $CC (default "clang"). Returns
# non-zero for unsupported versions.
detect_clang_version() {
  if [[ -n "${CLANG_VERSION}" ]]; then
    return 0
  fi
  local clang_version=$("${CC:-clang}" --version | head -n1)
  # Note: the unused "local llvm_tag" declaration was removed; the tag lookup
  # lives in cmd_msan_install, not here.
  case "${clang_version}" in
    "clang version 6."*)
      CLANG_VERSION="6.0"
      ;;
    "clang version 7."*)
      CLANG_VERSION="7"
      ;;
    "clang version 8."*)
      CLANG_VERSION="8"
      ;;
    "clang version 9."*)
      CLANG_VERSION="9"
      ;;
    *)
      echo "Unknown clang version: ${clang_version}" >&2
      return 1
  esac
}
# Temporary files cleanup hooks.
CLEANUP_FILES=()
# Delete every path registered in CLEANUP_FILES.
cleanup() {
  if [[ ${#CLEANUP_FILES[@]} -ne 0 ]]; then
    rm -fr "${CLEANUP_FILES[@]}"
  fi
}
# Turn off xtrace before cleaning up so the trap output stays readable.
trap "{ set +x; } 2>/dev/null; cleanup" INT TERM EXIT
# Install libc++ libraries compiled with msan in the msan_prefix for the current
# compiler version. Downloads the llvm-project release matching CLANG_VERSION,
# builds libc++ with LLVM_USE_SANITIZER=Memory and installs it under
# ${HOME}/.msan/<version>. Returns non-zero for unsupported clang versions.
cmd_msan_install() {
  local tmpdir=$(mktemp -d)
  CLEANUP_FILES+=("${tmpdir}")
  # Detect the llvm to install:
  export CC="${CC:-clang}"
  export CXX="${CXX:-clang++}"
  detect_clang_version
  local llvm_tag
  case "${CLANG_VERSION}" in
    "6.0")
      llvm_tag="llvmorg-6.0.1"
      ;;
    "7")
      llvm_tag="llvmorg-7.0.1"
      ;;
    "8")
      llvm_tag="llvmorg-8.0.0"
      ;;
    *)
      # Fix: report CLANG_VERSION. The previous ${clang_version} is local to
      # detect_clang_version and is an unbound variable here under "set -u",
      # so reaching this branch crashed instead of printing the message.
      echo "Unknown clang version: ${CLANG_VERSION}" >&2
      return 1
  esac
  local llvm_targz="${tmpdir}/${llvm_tag}.tar.gz"
  curl -L --show-error -o "${llvm_targz}" \
    "https://github.com/llvm/llvm-project/archive/${llvm_tag}.tar.gz"
  tar -C "${tmpdir}" -zxf "${llvm_targz}"
  local llvm_root="${tmpdir}/llvm-project-${llvm_tag}"

  # Wipe any previous install of this version before rebuilding.
  local msan_prefix="${HOME}/.msan/${CLANG_VERSION}"
  rm -rf "${msan_prefix}"

  # Extra cmake options per llvm sub-project.
  declare -A CMAKE_EXTRAS
  CMAKE_EXTRAS[libcxx]="\
    -DLIBCXX_CXX_ABI=libstdc++ \
    -DLIBCXX_INSTALL_EXPERIMENTAL_LIBRARY=ON"

  for project in libcxx; do
    local proj_build="${tmpdir}/build-${project}"
    local proj_dir="${llvm_root}/${project}"
    mkdir -p "${proj_build}"
    cmake -B"${proj_build}" -H"${proj_dir}" \
      -G Ninja \
      -DCMAKE_BUILD_TYPE=Release \
      -DLLVM_USE_SANITIZER=Memory \
      -DLLVM_PATH="${llvm_root}/llvm" \
      -DLLVM_CONFIG_PATH="$(which llvm-config llvm-config-7 llvm-config-6.0 | \
        head -n1)" \
      -DCMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS}" \
      -DCMAKE_C_FLAGS="${CMAKE_C_FLAGS}" \
      -DCMAKE_EXE_LINKER_FLAGS="${CMAKE_EXE_LINKER_FLAGS}" \
      -DCMAKE_INSTALL_PREFIX="${msan_prefix}" \
      ${CMAKE_EXTRAS[${project}]}
    cmake --build "${proj_build}"
    ninja -C "${proj_build}" install
  done
}
# Build and install the msan-instrumented libc++ for every clang version
# (6.0, 7, 8) found on PATH, running each install in a parallel subshell.
main() {
  set -x
  for version in 6.0 7 8; do
    if ! which "clang-${version}" >/dev/null; then
      echo "Skipping msan install for clang version ${version}"
      continue
    fi
    # Subshell so per-version env vars don't leak, and re-arm the cleanup
    # trap since traps are not inherited by subshells.
    (
      trap "{ set +x; } 2>/dev/null; cleanup" INT TERM EXIT
      export CLANG_VERSION=${version}
      export CC=clang-${version}
      export CXX=clang++-${version}
      cmd_msan_install
    ) &
  done
  # Wait for all the parallel installs to finish.
  wait
}
main "$@"

92
third_party/jpeg-xl/docker/scripts/qemu_install.sh поставляемый Normal file
Просмотреть файл

@ -0,0 +1,92 @@
#!/usr/bin/env bash
# Copyright (c) the JPEG XL Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# QEMU release to build and where to download it from.
QEMU_RELEASE="4.1.0"
QEMU_URL="https://download.qemu.org/qemu-${QEMU_RELEASE}.tar.xz"
# Guest architectures for which the linux-user emulators are built.
QEMU_ARCHS=(
  aarch64
  arm
  i386
  # TODO: Consider adding these:
  # aarch64_be
  # mips64el
  # mips64
  # mips
  # ppc64
  # ppc
)

# Ubuntu packages not installed that are needed to build qemu.
QEMU_BUILD_DEPS=(
  libglib2.0-dev
  libpixman-1-dev
  flex
  bison
)

# Abort on errors/unset variables; trace every command.
set -eu -x

# Temporary files cleanup hooks.
CLEANUP_FILES=()
# Delete every path registered in CLEANUP_FILES.
cleanup() {
  if [[ ${#CLEANUP_FILES[@]} -ne 0 ]]; then
    rm -fr "${CLEANUP_FILES[@]}"
  fi
}
# Turn off xtrace before cleaning up so the trap output stays readable.
trap "{ set +x; } 2>/dev/null; cleanup" INT TERM EXIT
# Download, configure and build the static qemu linux-user emulators for
# QEMU_ARCHS, install them as /usr/bin/qemu-<arch>-static, then purge the
# build-only dependencies again.
main() {
  local workdir=$(mktemp -d --suffix=qemu)
  CLEANUP_FILES+=("${workdir}")

  apt install -y "${QEMU_BUILD_DEPS[@]}"

  # NOTE(review): the payload is a .tar.xz (see QEMU_URL); the ".tar.gz" name
  # is only cosmetic since "tar -J" decompresses xz regardless of extension.
  local qemutar="${workdir}/qemu.tar.gz"
  curl --output "${qemutar}" "${QEMU_URL}"
  tar -Jxf "${qemutar}" -C "${workdir}"
  local srcdir="${workdir}/qemu-${QEMU_RELEASE}"
  local builddir="${workdir}/build"
  local prefixdir="${workdir}/prefix"
  mkdir -p "${builddir}"

  # List of targets to build.
  local targets=""
  local make_targets=()
  local target
  for target in "${QEMU_ARCHS[@]}"; do
    targets="${targets} ${target}-linux-user"
    # Build just the linux-user targets.
    make_targets+=("${target}-linux-user/all")
  done

  cd "${builddir}"
  "${srcdir}/configure" \
    --prefix="${prefixdir}" \
    --static --disable-system --enable-linux-user \
    --target-list="${targets}"
  # Fall back to a single job if nproc is unavailable.
  make -j $(nproc --all || echo 1) "${make_targets[@]}"

  # Manually install these into the non-standard location. This script runs as
  # root anyway.
  for target in "${QEMU_ARCHS[@]}"; do
    cp "${target}-linux-user/qemu-${target}" "/usr/bin/qemu-${target}-static"
  done

  apt autoremove -y --purge "${QEMU_BUILD_DEPS[@]}"
}
main "$@"

28
third_party/jpeg-xl/examples/CMakeLists.txt поставляемый Normal file
Просмотреть файл

@ -0,0 +1,28 @@
# Copyright (c) the JPEG XL Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Example programs exercising the public libjxl API. The examples link only
# against the library targets; PRIVATE is correct since executables have no
# consumers to propagate usage requirements to.

# Decoding example (decode-only library plus the threads runner).
add_executable(decode_oneshot decode_oneshot.cc)
target_link_libraries(decode_oneshot PRIVATE jxl_dec jxl_threads)

# Encoding example.
add_executable(encode_oneshot encode_oneshot.cc)
target_link_libraries(encode_oneshot PRIVATE jxl jxl_threads)

# Plain-C example printing information about a JPEG XL file.
add_executable(jxlinfo jxlinfo.c)
target_link_libraries(jxlinfo PRIVATE jxl)

# Quote the expansion: the unquoted ${SANITIZER} form expands to an invalid
# if() expression when SANITIZER is empty or undefined.
if(NOT "${SANITIZER}" STREQUAL "none")
  # Linking a C test binary with the C++ JPEG XL implementation when using
  # address sanitizer is not well supported by clang 9, so force using clang++
  # for linking this test if a sanitizer is used.
  set_target_properties(jxlinfo PROPERTIES LINKER_LANGUAGE CXX)
endif() # SANITIZER != "none"

Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше