зеркало из https://github.com/mozilla/gecko-dev.git
Bug 1707590 - Part 1: Add vendored libjxl source r=aosmond
Differential Revision: https://phabricator.services.mozilla.com/D113358
This commit is contained in:
Родитель
8fb52356f4
Коммит
626cb0e6e1
|
@ -55,6 +55,9 @@ if CONFIG["CPU_ARCH"] == "arm":
|
|||
if CONFIG["MOZ_FFVPX"]:
|
||||
external_dirs += ["media/ffvpx"]
|
||||
|
||||
if CONFIG["MOZ_JXL"]:
|
||||
external_dirs += ["media/libjxl", "media/highway"]
|
||||
|
||||
external_dirs += [
|
||||
"media/kiss_fft",
|
||||
"media/libcubeb",
|
||||
|
|
|
@ -0,0 +1,41 @@
|
|||
# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
|
||||
# vim: set filetype=python:
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
# Build definition for the vendored Highway SIMD library.

# Highway's own root is the include path for the sources listed below.
LOCAL_INCLUDES += ["/third_party/highway/"]

# Translation units compiled for Highway.
SOURCES += [
    "/third_party/highway/contrib/image/image.cc",
    "/third_party/highway/hwy/aligned_allocator.cc",
    "/third_party/highway/hwy/targets.cc",
]

# Public headers, exported under the "hwy" namespace.
EXPORTS.hwy += [
    "/third_party/highway/hwy/aligned_allocator.h",
    "/third_party/highway/hwy/base.h",
    "/third_party/highway/hwy/cache_control.h",
    "/third_party/highway/hwy/foreach_target.h",
    "/third_party/highway/hwy/highway.h",
    "/third_party/highway/hwy/targets.h",
]

# Per-target implementation headers, exported under "hwy.ops".
EXPORTS.hwy.ops += [
    "/third_party/highway/hwy/ops/arm_neon-inl.h",
    "/third_party/highway/hwy/ops/rvv-inl.h",
    "/third_party/highway/hwy/ops/scalar-inl.h",
    "/third_party/highway/hwy/ops/set_macros-inl.h",
    "/third_party/highway/hwy/ops/shared-inl.h",
    "/third_party/highway/hwy/ops/wasm_128-inl.h",
    "/third_party/highway/hwy/ops/x86_128-inl.h",
    "/third_party/highway/hwy/ops/x86_256-inl.h",
    "/third_party/highway/hwy/ops/x86_512-inl.h",
]

# The objects are linked into the gkmedias library.
FINAL_LIBRARY = "gkmedias"

# Warnings are tolerated here: this is third-party code that is refreshed
# from upstream rather than edited in-tree.
AllowCompilerWarnings()
|
|
@ -0,0 +1,43 @@
|
|||
# Version of this schema
schema: 1

bugzilla:
  # Bugzilla product and component for this directory and subdirectories
  product: Core
  component: "ImageLib"

# Document the source of externally hosted code
origin:

  # Short name of the package/library
  name: highway

  description: Performance-portable, length-agnostic SIMD with runtime dispatch

  # Full URL for the package's homepage/etc
  # Usually different from repository url
  url: https://github.com/google/highway

  # Human-readable identifier for this version/release
  # Generally "version NNN", "tag SSS", "bookmark SSS"
  release: commit ca1a57c342cd815053abfcffa29b44eaead4f20b (2021-04-15T17:54:35Z).

  # Revision to pull in
  # Must be a long or short commit SHA (long preferred)
  revision: ca1a57c342cd815053abfcffa29b44eaead4f20b

  # The package's license, where possible using the mnemonic from
  # https://spdx.org/licenses/
  # Multiple licenses can be specified (as a YAML list)
  # A "LICENSE" file must exist containing the full license text
  license: Apache-2.0

  license-file: LICENSE

vendoring:
  url: https://github.com/google/highway.git
  source-hosting: github
  # Must match the directory the sibling moz.build compiles and exports from
  # ("/third_party/highway/..."); otherwise a re-vendor lands the sources
  # where the build does not look for them.
  vendor-directory: third_party/highway

  exclude:
    - g3doc/
|
|
@ -0,0 +1,12 @@
|
|||
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
||||
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
|
||||
/* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
#if !defined(JXL_EXPORT_H)
#define JXL_EXPORT_H

/* Symbol-visibility annotation used by the libjxl public headers. In this
 * build it expands to nothing, so annotated declarations get no
 * export/import attribute.
 * NOTE(review): presumably a stand-in for the export header upstream
 * generates at build time -- confirm when updating libjxl. */
#define JXL_EXPORT

#endif /* JXL_EXPORT_H */
|
|
@ -0,0 +1,12 @@
|
|||
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
||||
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
|
||||
/* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
#if !defined(JXL_THREADS_EXPORT_H)
#define JXL_THREADS_EXPORT_H

/* Symbol-visibility annotation used by the libjxl threads headers. In this
 * build it expands to nothing, so annotated declarations get no
 * export/import attribute.
 * NOTE(review): presumably a stand-in for the export header upstream
 * generates at build time -- confirm when updating libjxl. */
#define JXL_THREADS_EXPORT

#endif /* JXL_THREADS_EXPORT_H */
|
|
@ -0,0 +1,114 @@
|
|||
# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*-
|
||||
# vim: set filetype=python:
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
# Build definition for the vendored JPEG XL (libjxl) sources.

# Include roots: the local "include" directory (which provides the
# jxl_export.h / jxl_threads_export.h headers exported below), then the
# vendored tree and its public header directory.
LOCAL_INCLUDES += [
    "./include/",
    "/third_party/jpeg-xl/",
    "/third_party/jpeg-xl/lib/include/",
]

# Core library translation units.
SOURCES += [
    "/third_party/jpeg-xl/lib/jxl/ac_strategy.cc",
    "/third_party/jpeg-xl/lib/jxl/alpha.cc",
    "/third_party/jpeg-xl/lib/jxl/ans_common.cc",
    "/third_party/jpeg-xl/lib/jxl/aux_out.cc",
    "/third_party/jpeg-xl/lib/jxl/base/cache_aligned.cc",
    "/third_party/jpeg-xl/lib/jxl/base/data_parallel.cc",
    "/third_party/jpeg-xl/lib/jxl/base/descriptive_statistics.cc",
    "/third_party/jpeg-xl/lib/jxl/base/padded_bytes.cc",
    "/third_party/jpeg-xl/lib/jxl/base/status.cc",
    "/third_party/jpeg-xl/lib/jxl/base/time.cc",
    "/third_party/jpeg-xl/lib/jxl/blending.cc",
    "/third_party/jpeg-xl/lib/jxl/chroma_from_luma.cc",
    "/third_party/jpeg-xl/lib/jxl/coeff_order.cc",
    "/third_party/jpeg-xl/lib/jxl/color_encoding_internal.cc",
    "/third_party/jpeg-xl/lib/jxl/color_management.cc",
    "/third_party/jpeg-xl/lib/jxl/compressed_dc.cc",
    "/third_party/jpeg-xl/lib/jxl/convolve.cc",
    "/third_party/jpeg-xl/lib/jxl/dct_scales.cc",
    "/third_party/jpeg-xl/lib/jxl/dec_ans.cc",
    "/third_party/jpeg-xl/lib/jxl/dec_context_map.cc",
    "/third_party/jpeg-xl/lib/jxl/dec_external_image.cc",
    "/third_party/jpeg-xl/lib/jxl/dec_frame.cc",
    "/third_party/jpeg-xl/lib/jxl/dec_group.cc",
    "/third_party/jpeg-xl/lib/jxl/dec_group_border.cc",
    "/third_party/jpeg-xl/lib/jxl/dec_huffman.cc",
    "/third_party/jpeg-xl/lib/jxl/dec_modular.cc",
    "/third_party/jpeg-xl/lib/jxl/dec_noise.cc",
    "/third_party/jpeg-xl/lib/jxl/dec_patch_dictionary.cc",
    "/third_party/jpeg-xl/lib/jxl/dec_reconstruct.cc",
    "/third_party/jpeg-xl/lib/jxl/dec_upsample.cc",
    "/third_party/jpeg-xl/lib/jxl/dec_xyb.cc",
    "/third_party/jpeg-xl/lib/jxl/decode.cc",
    "/third_party/jpeg-xl/lib/jxl/enc_bit_writer.cc",
    "/third_party/jpeg-xl/lib/jxl/entropy_coder.cc",
    "/third_party/jpeg-xl/lib/jxl/epf.cc",
    "/third_party/jpeg-xl/lib/jxl/fields.cc",
    "/third_party/jpeg-xl/lib/jxl/filters.cc",
    "/third_party/jpeg-xl/lib/jxl/frame_header.cc",
    "/third_party/jpeg-xl/lib/jxl/gauss_blur.cc",
    "/third_party/jpeg-xl/lib/jxl/headers.cc",
    "/third_party/jpeg-xl/lib/jxl/huffman_table.cc",
    "/third_party/jpeg-xl/lib/jxl/icc_codec.cc",
    "/third_party/jpeg-xl/lib/jxl/icc_codec_common.cc",
    "/third_party/jpeg-xl/lib/jxl/image.cc",
    "/third_party/jpeg-xl/lib/jxl/image_bundle.cc",
    "/third_party/jpeg-xl/lib/jxl/image_metadata.cc",
    "/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data.cc",
    "/third_party/jpeg-xl/lib/jxl/jpeg/dec_jpeg_data_writer.cc",
    "/third_party/jpeg-xl/lib/jxl/jpeg/jpeg_data.cc",
    "/third_party/jpeg-xl/lib/jxl/loop_filter.cc",
    "/third_party/jpeg-xl/lib/jxl/luminance.cc",
    "/third_party/jpeg-xl/lib/jxl/memory_manager_internal.cc",
    "/third_party/jpeg-xl/lib/jxl/modular/encoding/dec_ma.cc",
    "/third_party/jpeg-xl/lib/jxl/modular/encoding/encoding.cc",
    "/third_party/jpeg-xl/lib/jxl/modular/modular_image.cc",
    "/third_party/jpeg-xl/lib/jxl/modular/transform/transform.cc",
    "/third_party/jpeg-xl/lib/jxl/opsin_params.cc",
    "/third_party/jpeg-xl/lib/jxl/passes_state.cc",
    "/third_party/jpeg-xl/lib/jxl/quant_weights.cc",
    "/third_party/jpeg-xl/lib/jxl/quantizer.cc",
    "/third_party/jpeg-xl/lib/jxl/splines.cc",
    "/third_party/jpeg-xl/lib/jxl/toc.cc",
]

# Thread-pool based parallel runner.
SOURCES += [
    "/third_party/jpeg-xl/lib/threads/thread_parallel_runner.cc",
    "/third_party/jpeg-xl/lib/threads/thread_parallel_runner_internal.cc",
]

# Version macros passed to the compiler; all three are set to "0" here.
# NOTE(review): presumably placeholders rather than the real upstream
# version -- confirm before relying on the reported library version.
DEFINES["JPEGXL_MAJOR_VERSION"] = "0"
DEFINES["JPEGXL_MINOR_VERSION"] = "0"
DEFINES["JPEGXL_PATCH_VERSION"] = "0"

# Public headers exported under "jxl": the two local export-macro headers
# first, then the upstream API headers.
EXPORTS.jxl += [
    "./include/jxl/jxl_export.h",
    "./include/jxl/jxl_threads_export.h",
    "/third_party/jpeg-xl/lib/include/jxl/butteraugli.h",
    "/third_party/jpeg-xl/lib/include/jxl/butteraugli_cxx.h",
    "/third_party/jpeg-xl/lib/include/jxl/codestream_header.h",
    "/third_party/jpeg-xl/lib/include/jxl/color_encoding.h",
    "/third_party/jpeg-xl/lib/include/jxl/decode.h",
    "/third_party/jpeg-xl/lib/include/jxl/decode_cxx.h",
    "/third_party/jpeg-xl/lib/include/jxl/encode.h",
    "/third_party/jpeg-xl/lib/include/jxl/encode_cxx.h",
    "/third_party/jpeg-xl/lib/include/jxl/memory_manager.h",
    "/third_party/jpeg-xl/lib/include/jxl/parallel_runner.h",
    "/third_party/jpeg-xl/lib/include/jxl/thread_parallel_runner.h",
    "/third_party/jpeg-xl/lib/include/jxl/thread_parallel_runner_cxx.h",
    "/third_party/jpeg-xl/lib/include/jxl/types.h",
]

# The objects are linked into the gkmedias library.
FINAL_LIBRARY = "gkmedias"

# Warnings are tolerated here: this is third-party code that is refreshed
# from upstream rather than edited in-tree.
AllowCompilerWarnings()

# Clang 5.0 has a compiler bug that breaks the C++17 build of this code
# (https://gitlab.com/wg1/jpeg-xl/-/issues/227), so build as C++11 under
# clang. This should be okay because only the C API is consumed.
if CONFIG["CC_TYPE"] == "clang":
    CXXFLAGS += ["-std=c++11"]
|
|
@ -0,0 +1,53 @@
|
|||
# Version of this schema
schema: 1

bugzilla:
  # Bugzilla product and component for this directory and subdirectories
  product: Core
  component: "ImageLib"

# Document the source of externally hosted code
origin:

  # Short name of the package/library
  name: jpeg-xl

  description: JPEG XL image format reference implementation

  # Full URL for the package's homepage/etc
  # Usually different from repository url
  url: https://gitlab.com/wg1/jpeg-xl

  # Human-readable identifier for this version/release
  # Generally "version NNN", "tag SSS", "bookmark SSS"
  release: commit 9a8f5195e4d1c45112fd65f184ebe115f4163ba2 (2021-05-04T13:15:00.000+02:00).

  # Revision to pull in
  # Must be a long or short commit SHA (long preferred)
  # NOTE(krosylight): Update highway together when updating this!
  revision: 9a8f5195e4d1c45112fd65f184ebe115f4163ba2

  # The package's license, where possible using the mnemonic from
  # https://spdx.org/licenses/
  # Multiple licenses can be specified (as a YAML list)
  # A "LICENSE" file must exist containing the full license text
  license: Apache-2.0

  license-file: LICENSE

# Automated update configuration: who is notified, and which tasks run.
updatebot:
  maintainer-phab: saschanaz
  maintainer-bz: krosylight@mozilla.com
  tasks:
    - type: vendoring
      enabled: True

# Where the code is fetched from and which upstream paths are left out.
vendoring:
  url: https://gitlab.com/wg1/jpeg-xl.git
  source-hosting: gitlab
  vendor-directory: third_party/jpeg-xl

  exclude:
    - doc/
    - third_party/testdata/
    - tools/
|
|
@ -0,0 +1,300 @@
|
|||
# Copyright 2019 Google LLC
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
cmake_minimum_required(VERSION 3.10)

# CMP0083 (introduced in CMake 3.14) makes POSITION_INDEPENDENT_CODE targets
# receive the PIE flags; opt in whenever the running CMake knows the policy.
if(POLICY CMP0083)
  cmake_policy(SET CMP0083 NEW)
endif()

project(hwy VERSION 0.1)

# Strict C++11: the standard is mandatory and compiler extensions are off.
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED YES)
set(CMAKE_CXX_EXTENSIONS OFF)

# Default to position-independent code when the toolchain can link PIE.
# CheckPIESupported is optional, so CMake versions that lack the module
# simply skip this step.
include(CheckPIESupported OPTIONAL RESULT_VARIABLE CHECK_PIE_SUPPORTED)
if(CHECK_PIE_SUPPORTED)
  check_pie_supported(LANGUAGES CXX)
  if(CMAKE_CXX_LINK_PIE_SUPPORTED)
    set(CMAKE_POSITION_INDEPENDENT_CODE TRUE)
  endif()
endif()

# Provides CMAKE_INSTALL_LIBDIR / CMAKE_INSTALL_INCLUDEDIR for the install
# rules further down.
include(GNUInstallDirs)

# Use an optimized build with debug info when no build type was requested.
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE RelWithDebInfo)
endif()
|
||||
|
||||
# Detect an Emscripten toolchain: the probe compiles only when the compiler
# predefines __EMSCRIPTEN__, and the result is stored in HWY_EMSCRIPTEN.
# A bracket argument is used so the C++ snippet needs no quote escaping.
include(CheckCXXSourceCompiles)
check_cxx_source_compiles([[
int main() {
#if !defined(__EMSCRIPTEN__)
  static_assert(false, "__EMSCRIPTEN__ is not defined");
#endif
  return 0;
}]]
  HWY_EMSCRIPTEN
)
|
||||
|
||||
# All sources and headers of the hwy library. The header entries also drive
# the install loop below, which installs every entry ending in ".h", so a
# header missing from this list is never installed.
set(HWY_SOURCES
    contrib/image/image.cc
    contrib/image/image.h
    contrib/math/math-inl.h
    hwy/aligned_allocator.cc
    hwy/aligned_allocator.h
    hwy/base.h
    hwy/cache_control.h
    hwy/foreach_target.h
    hwy/highway.h
    hwy/nanobenchmark.cc
    hwy/nanobenchmark.h
    hwy/ops/arm_neon-inl.h
    hwy/ops/rvv-inl.h
    hwy/ops/scalar-inl.h
    hwy/ops/set_macros-inl.h
    hwy/ops/shared-inl.h
    hwy/ops/wasm_128-inl.h
    hwy/ops/x86_128-inl.h
    hwy/ops/x86_256-inl.h
    hwy/ops/x86_512-inl.h
    hwy/targets.cc
    hwy/targets.h
    hwy/tests/test_util-inl.h
)
|
||||
|
||||
# Compiler flags applied to every target in this file via
# target_compile_options. Nothing is set for MSVC yet.
if(MSVC)
  # TODO(janwas): add flags
else()
  set(HWY_FLAGS
    # Avoid changing binaries based on the current time and date.
    -Wno-builtin-macro-redefined
    -D__DATE__="redacted"
    -D__TIMESTAMP__="redacted"
    -D__TIME__="redacted"

    # Optimizations
    -fmerge-all-constants

    # Warnings
    -Wall
    -Wextra
    -Wformat-security
    -Wno-unused-function
    -Wnon-virtual-dtor
    -Woverloaded-virtual
    -Wvla
  )

  # Pass the variable name, not its expansion: if() dereferences it itself,
  # which stays valid when the ID is empty and cannot be re-dereferenced
  # through a variable that happens to be named after the compiler ID.
  if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
    list(APPEND HWY_FLAGS
      -Wc++2a-extensions
      -Wfloat-overflow-conversion
      -Wfloat-zero-conversion
      -Wfor-loop-analysis
      -Wgnu-redeclared-enum
      -Winfinite-recursion
      -Wself-assign
      -Wstring-conversion
      -Wtautological-overlap-compare
      -Wthread-safety-analysis
      -Wundefined-func-template

      -fno-cxx-exceptions
      -fno-slp-vectorize
      -fno-vectorize

      # Use color in messages
      -fdiagnostics-show-option -fcolor-diagnostics
    )
  endif()

  if(WIN32)
    # Non-MSVC compiler targeting Windows: silence these warnings.
    list(APPEND HWY_FLAGS
      -Wno-c++98-compat-pedantic
      -Wno-cast-align
      -Wno-double-promotion
      -Wno-float-equal
      -Wno-format-nonliteral
      -Wno-global-constructors
      -Wno-language-extension-token
      -Wno-missing-prototypes
      -Wno-shadow
      -Wno-shadow-field-in-constructor
      -Wno-sign-conversion
      -Wno-unused-member-function
      -Wno-unused-template
      -Wno-used-but-marked-unused
      -Wno-zero-as-null-pointer-constant
    )
  else()
    list(APPEND HWY_FLAGS
      -fmath-errno
      -fno-exceptions
    )
  endif()
endif()
|
||||
|
||||
# The hwy static library: built from HWY_SOURCES with the shared flag set,
# always as position-independent code. Its include root is this directory,
# exported PUBLIC so targets linking hwy inherit it.
add_library(hwy STATIC ${HWY_SOURCES})
set_target_properties(hwy PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_include_directories(hwy PUBLIC ${CMAKE_CURRENT_LIST_DIR})
target_compile_options(hwy PRIVATE ${HWY_FLAGS})
|
||||
|
||||
# -------------------------------------------------------- install library
install(TARGETS hwy
  DESTINATION "${CMAKE_INSTALL_LIBDIR}")

# Install every header listed in HWY_SOURCES, keeping its path relative to
# this directory below the include directory.
foreach(source ${HWY_SOURCES})
  # The backslash is doubled because CMake consumes one level of escaping in
  # quoted arguments: "\.h$" would reach the regex engine as ".h$", which
  # matches any character before the final "h" instead of a literal dot.
  if("${source}" MATCHES "\\.h$")
    get_filename_component(dirname "${source}" DIRECTORY)
    install(FILES "${source}"
      DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${dirname}")
  endif()
endforeach()
|
||||
|
||||
# Add a pkg-config file for libhwy and the test library.
# PROJECT_VERSION is the version given to project(hwy ...) in this file.
# CMAKE_PROJECT_VERSION must not be used here: it only exists since CMake
# 3.12 (this file accepts 3.10) and it holds the top-level project's
# version, which is wrong when highway is built as a subproject.
set(HWY_LIBRARY_VERSION "${PROJECT_VERSION}")
foreach(pc libhwy.pc libhwy-test.pc)
  # Each "<name>.pc.in" template is expanded (@VAR@ references only) into
  # the build directory and installed from there.
  configure_file("${CMAKE_CURRENT_SOURCE_DIR}/${pc}.in" "${pc}" @ONLY)
  install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${pc}"
    DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
endforeach()
|
||||
|
||||
# -------------------------------------------------------- hwy_list_targets
# Small tool that prints the targets compiled in under the current flags.
# It shares hwy's flags and include directories and is run as a post-build
# step of hwy (upstream notes that it prints to stderr).
add_executable(hwy_list_targets hwy/tests/list_targets.cc)
target_include_directories(hwy_list_targets PRIVATE
  $<TARGET_PROPERTY:hwy,INCLUDE_DIRECTORIES>)
target_compile_options(hwy_list_targets PRIVATE ${HWY_FLAGS})

# The command uses $<TARGET_FILE:...> because a bare target name is not
# always runnable (no './' prefix), and it is prefixed with the
# cross-compiling emulator when one is configured. The trailing
# "|| (exit 0)" keeps the hwy build from failing if the tool cannot run.
add_custom_command(TARGET hwy POST_BUILD
  COMMAND ${CMAKE_CROSSCOMPILING_EMULATOR} $<TARGET_FILE:hwy_list_targets> || (exit 0))
|
||||
|
||||
# -------------------------------------------------------- Examples

# Avoids mismatch between GTest's static CRT and our dynamic.
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)

# Programming exercise with integrated benchmark
add_executable(hwy_benchmark hwy/examples/benchmark.cc)
target_sources(hwy_benchmark PRIVATE
    hwy/nanobenchmark.cc
    hwy/nanobenchmark.h)
# Try adding either -DHWY_COMPILE_ONLY_SCALAR or -DHWY_COMPILE_ONLY_STATIC to
# observe the difference in targets printed.
target_compile_options(hwy_benchmark PRIVATE ${HWY_FLAGS})
# Explicit visibility keyword: the keyword-less signature has legacy
# semantics and cannot be mixed with keyword calls on the same target.
target_link_libraries(hwy_benchmark PRIVATE hwy)
# The binary is placed in the "examples/" output directory.
set_target_properties(hwy_benchmark
    PROPERTIES RUNTIME_OUTPUT_DIRECTORY "examples/")
|
||||
|
||||
# -------------------------------------------------------- Tests

# CTest defines the BUILD_TESTING option that guards everything below.
include(CTest)

if(BUILD_TESTING)
  enable_testing()
  include(GoogleTest)

  set(HWY_SYSTEM_GTEST OFF CACHE BOOL "Use pre-installed googletest?")
  if(HWY_SYSTEM_GTEST)
    find_package(GTest REQUIRED)
  else()
    # Download and unpack googletest at configure time. CMakeLists.txt.in is
    # expanded into a small download project, which is then configured and
    # built as child processes; either step failing aborts configuration.
    configure_file(CMakeLists.txt.in googletest-download/CMakeLists.txt)
    execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
      RESULT_VARIABLE result
      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download )
    if(result)
      message(FATAL_ERROR "CMake step for googletest failed: ${result}")
    endif()
    execute_process(COMMAND ${CMAKE_COMMAND} --build .
      RESULT_VARIABLE result
      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download )
    if(result)
      message(FATAL_ERROR "Build step for googletest failed: ${result}")
    endif()

    # Prevent overriding the parent project's compiler/linker
    # settings on Windows
    set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)

    # Add googletest directly to our build. This defines
    # the gtest and gtest_main targets, which carry their header search
    # paths as usage requirements. (The former fallback for CMake older than
    # 2.8.11 was unreachable under this file's 3.10 minimum and is gone.)
    add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/googletest-src
                     ${CMAKE_CURRENT_BINARY_DIR}/googletest-build
                     EXCLUDE_FROM_ALL)
  endif()  # HWY_SYSTEM_GTEST

  set(HWY_TEST_FILES
    contrib/image/image_test.cc
    # contrib/math/math_test.cc
    hwy/aligned_allocator_test.cc
    hwy/base_test.cc
    hwy/highway_test.cc
    hwy/targets_test.cc
    hwy/examples/skeleton_test.cc
    hwy/tests/arithmetic_test.cc
    hwy/tests/combine_test.cc
    hwy/tests/compare_test.cc
    hwy/tests/convert_test.cc
    hwy/tests/logical_test.cc
    hwy/tests/memory_test.cc
    hwy/tests/swizzle_test.cc
    hwy/tests/test_util_test.cc
  )

  file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tests)
  foreach(TESTFILE IN LISTS HWY_TEST_FILES)
    # The TESTNAME is the name without the extension or directory.
    get_filename_component(TESTNAME "${TESTFILE}" NAME_WE)
    add_executable(${TESTNAME} ${TESTFILE})
    target_compile_options(${TESTNAME} PRIVATE ${HWY_FLAGS})

    # Link with an explicit visibility keyword; the imported GTest:: targets
    # come from find_package, the plain ones from the bundled googletest.
    if(HWY_SYSTEM_GTEST)
      target_link_libraries(${TESTNAME} PRIVATE hwy GTest::GTest GTest::Main)
    else()
      target_link_libraries(${TESTNAME} PRIVATE hwy gtest gtest_main)
    endif()
    # Output test targets in the test directory.
    set_target_properties(${TESTNAME} PROPERTIES PREFIX "tests/")

    if(HWY_EMSCRIPTEN)
      set_target_properties(${TESTNAME} PROPERTIES LINK_FLAGS "-s SINGLE_FILE=1")
    endif()

    # CMake releases before 3.10.3 spell the discovery timeout option
    # TIMEOUT; later ones call it DISCOVERY_TIMEOUT.
    if(CMAKE_VERSION VERSION_LESS "3.10.3")
      gtest_discover_tests(${TESTNAME} TIMEOUT 60)
    else()
      gtest_discover_tests(${TESTNAME} DISCOVERY_TIMEOUT 60)
    endif()
  endforeach()

  # The skeleton test uses the skeleton library code.
  target_sources(skeleton_test PRIVATE hwy/examples/skeleton.cc)

endif()  # BUILD_TESTING
|
|
@ -0,0 +1,15 @@
|
|||
# Template for a throw-away project that only downloads googletest. The
# parent CMakeLists.txt expands it with configure_file() (so the
# CMAKE_CURRENT_BINARY_DIR references below become the parent's build
# directory) and then configures and builds it at configure time.

# Same minimum as the parent project that drives this file; minimums below
# 3.5 are deprecated and rejected by current CMake releases.
cmake_minimum_required(VERSION 3.10)

project(googletest-download NONE)

include(ExternalProject)
ExternalProject_Add(googletest
  GIT_REPOSITORY    https://github.com/google/googletest.git
  # Pinned to a release tag so builds are reproducible and do not depend on
  # a floating branch name continuing to exist upstream.
  GIT_TAG           release-1.11.0
  SOURCE_DIR        "${CMAKE_CURRENT_BINARY_DIR}/googletest-src"
  BINARY_DIR        "${CMAKE_CURRENT_BINARY_DIR}/googletest-build"
  # Download only: the parent adds googletest-src with add_subdirectory(),
  # so every build/install/test step is disabled here.
  CONFIGURE_COMMAND ""
  BUILD_COMMAND     ""
  INSTALL_COMMAND   ""
  TEST_COMMAND      ""
)
|
|
@ -0,0 +1,33 @@
|
|||
# How to Contribute
|
||||
|
||||
We'd love to accept your patches and contributions to this project. There are
|
||||
just a few small guidelines you need to follow.
|
||||
|
||||
## Contributor License Agreement
|
||||
|
||||
Contributions to this project must be accompanied by a Contributor License
|
||||
Agreement. You (or your employer) retain the copyright to your contribution;
|
||||
this simply gives us permission to use and redistribute your contributions as
|
||||
part of the project. Head over to <https://cla.developers.google.com/> to see
|
||||
your current agreements on file or to sign a new one.
|
||||
|
||||
You generally only need to submit a CLA once, so if you've already submitted one
|
||||
(even if it was for a different project), you probably don't need to do it
|
||||
again.
|
||||
|
||||
## Code reviews
|
||||
|
||||
All submissions, including submissions by project members, require review. We
|
||||
use GitHub pull requests for this purpose. Consult
|
||||
[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
|
||||
information on using pull requests.
|
||||
|
||||
## Testing
|
||||
|
||||
This repository is used by JPEG XL, so major API changes will require
|
||||
coordination. Please get in touch with us beforehand, e.g. by raising an issue.
|
||||
|
||||
## Community Guidelines
|
||||
|
||||
This project follows
|
||||
[Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
|
|
@ -0,0 +1,201 @@
|
|||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
|
@ -0,0 +1,44 @@
|
|||
# Stand-alone GNU Make build for the Highway objects, unit tests and benchmark.

# Delete a target's file when its recipe fails, so that a later run does not
# mistake a partially written output for an up-to-date one.
.DELETE_ON_ERROR:

# Objects compiled from hwy/*.cc; linked into the benchmark and every test.
OBJS := aligned_allocator.o nanobenchmark.o targets.o
# Objects compiled from contrib/image/*.cc.
IMAGE_OBJS := image.o
# Test binaries produced by the bin/% pattern rules below.
TEST_NAMES := arithmetic_test combine_test compare_test convert_test hwy_test logical_test memory_test swizzle_test
TESTS := $(foreach i, $(TEST_NAMES), bin/$(i))
ROOT_TEST_NAMES := aligned_allocator_test nanobenchmark_test
ROOT_TESTS := $(foreach i, $(ROOT_TEST_NAMES), bin/$(i))

# The __DATE__/__TIMESTAMP__/__TIME__ overrides replace the compiler's
# build-time macros (presumably for reproducible output - TODO confirm);
# -Wno-builtin-macro-redefined silences the warning this would otherwise cause.
CXXFLAGS += -I. -fmerge-all-constants -std=c++17 -O2 \
    -Wno-builtin-macro-redefined -D__DATE__="redacted" \
    -D__TIMESTAMP__="redacted" -D__TIME__="redacted" \
    -Wall -Wextra -Wformat-security -Wno-unused-function \
    -Wnon-virtual-dtor -Woverloaded-virtual -Wvla

# Default goal: build all tests and the benchmark, then run the tests.
.PHONY: all
all: $(TESTS) $(ROOT_TESTS) benchmark test

# Also removes $(IMAGE_OBJS), which can be built on request via its rule below.
.PHONY: clean
clean: ; @rm -rf $(OBJS) $(IMAGE_OBJS) bin/ benchmark.o

$(OBJS): %.o: hwy/%.cc
	$(CXX) -c $(CXXFLAGS) $< -o $@

$(IMAGE_OBJS): %.o: contrib/image/%.cc
	$(CXX) -c $(CXXFLAGS) $< -o $@

# The recipe writes bin/benchmark, not a file named "benchmark", so the target
# is declared phony to stay independent of any such file in the directory.
.PHONY: benchmark
benchmark: $(OBJS) hwy/examples/benchmark.cc
	mkdir -p bin && $(CXX) $(CXXFLAGS) $^ -o bin/benchmark

# One pattern rule per directory that may contain the source of a test binary.
bin/%: hwy/%.cc $(OBJS)
	mkdir -p bin && $(CXX) $(CXXFLAGS) $< $(OBJS) -o $@ -lgtest -lgtest_main -lpthread

bin/%: hwy/tests/%.cc $(OBJS)
	mkdir -p bin && $(CXX) $(CXXFLAGS) $< $(OBJS) -o $@ -lgtest -lgtest_main -lpthread

bin/%: contrib/image/%.cc $(OBJS)
	mkdir -p bin && $(CXX) $(CXXFLAGS) $< $(OBJS) -o $@ -lgtest -lgtest_main -lpthread

bin/%: contrib/math/%.cc $(OBJS)
	mkdir -p bin && $(CXX) $(CXXFLAGS) $< $(OBJS) -o $@ -lgtest -lgtest_main -lpthread

# Runs every test binary in order. "|| exit 1" stops at the first failure;
# without it the loop's status is that of the last test only, so an earlier
# failing test would not fail the build.
.PHONY: test
test: $(TESTS) $(ROOT_TESTS)
	for name in $^; do echo ---------------------$$name && $$name || exit 1; done
|
|
@ -0,0 +1,361 @@
|
|||
# Efficient and performance-portable SIMD
|
||||
|
||||
Highway is a C++ library for SIMD (Single Instruction, Multiple Data), i.e.
|
||||
applying the same operation to 'lanes'.
|
||||
|
||||
## Why Highway?
|
||||
|
||||
- more portable (same source code) than platform-specific intrinsics,
|
||||
- works on a wider range of compilers than compiler-specific vector extensions,
|
||||
- more dependable than autovectorization,
|
||||
- easier to write/maintain than assembly language,
|
||||
- supports **runtime dispatch**,
|
||||
- supports **variable-length vector** architectures.
|
||||
|
||||
## Current status
|
||||
|
||||
Supported targets: scalar, SSE4, AVX2, AVX-512, NEON (ARMv7 and v8), WASM SIMD.
|
||||
A port to RVV is in progress.
|
||||
|
||||
Version 0.11 is considered stable enough to use in other projects, and is
|
||||
expected to remain backwards compatible unless serious issues are discovered
|
||||
while implementing SVE/RVV targets. After these targets are added, Highway will
|
||||
reach version 1.0.
|
||||
|
||||
Continuous integration tests build with a recent version of Clang (running on
|
||||
x86 and QEMU for ARM) and MSVC from VS2015 (running on x86). Also periodically
|
||||
tested on x86 with Clang 7-11 and GCC 8, 9 and 10.2.1.
|
||||
|
||||
The `contrib` directory contains SIMD-related utilities: an image class with
|
||||
aligned rows, and a math library (16 functions already implemented, mostly
|
||||
trigonometry).
|
||||
|
||||
## Installation
|
||||
|
||||
This project uses cmake to generate and build. In a Debian-based system you can
|
||||
install it via:
|
||||
|
||||
```bash
|
||||
sudo apt install cmake
|
||||
```
|
||||
|
||||
Highway's unit tests use [googletest](https://github.com/google/googletest).
|
||||
By default, Highway's CMake downloads this dependency at configuration time.
|
||||
You can disable this by setting the `HWY_SYSTEM_GTEST` CMake variable to ON and
|
||||
installing gtest separately:
|
||||
|
||||
```bash
|
||||
sudo apt install libgtest-dev
|
||||
```
|
||||
|
||||
To build and test the library the standard cmake workflow can be used:
|
||||
|
||||
```bash
|
||||
mkdir -p build && cd build
|
||||
cmake ..
|
||||
make -j && make test
|
||||
```
|
||||
|
||||
Or you can run `run_tests.sh` (`run_tests.bat` on Windows).
|
||||
|
||||
To test on all the attainable targets for your platform, use
|
||||
`cmake .. -DCMAKE_CXX_FLAGS="-DHWY_COMPILE_ALL_ATTAINABLE"`. Otherwise, the
|
||||
default configuration skips baseline targets (e.g. scalar) that are superseded
|
||||
by another baseline target.
|
||||
|
||||
## Quick start
|
||||
|
||||
You can use the `benchmark` inside examples/ as a starting point.
|
||||
|
||||
A [quick-reference page](g3doc/quick_reference.md) briefly lists all operations
|
||||
and their parameters, and the [instruction_matrix][instmtx] indicates the
|
||||
number of instructions per operation.
|
||||
|
||||
We recommend using full SIMD vectors whenever possible for maximum performance
|
||||
portability. To obtain them, pass a `HWY_FULL(float)` tag to functions such as
|
||||
`Zero/Set/Load`. There is also the option of a vector of up to `N` (a power of
|
||||
two) lanes: `HWY_CAPPED(T, N)`. 128-bit vectors are guaranteed to be available
|
||||
for lanes of type `T` if `HWY_TARGET != HWY_SCALAR` and `N == 16 / sizeof(T)`.
|
||||
|
||||
Functions using Highway must be inside `namespace HWY_NAMESPACE {`
|
||||
(possibly nested in one or more other namespaces defined by the project), and
|
||||
additionally either prefixed with `HWY_ATTR`, or residing between
|
||||
`HWY_BEFORE_NAMESPACE()` and `HWY_AFTER_NAMESPACE()`.
|
||||
|
||||
* For static dispatch, `HWY_TARGET` will be the best available target among
|
||||
`HWY_BASELINE_TARGETS`, i.e. those allowed for use by the compiler (see
|
||||
[quick-reference](g3doc/quick_reference.md)). Functions inside `HWY_NAMESPACE`
|
||||
can be called using `HWY_STATIC_DISPATCH(func)(args)` within the same module
|
||||
they are defined in. You can call the function from other modules by
|
||||
wrapping it in a regular function and declaring the regular function in a
|
||||
header.
|
||||
|
||||
* For dynamic dispatch, a table of function pointers is generated via the
|
||||
`HWY_EXPORT` macro that is used by `HWY_DYNAMIC_DISPATCH(func)(args)` to
|
||||
call the best function pointer for the current CPU supported targets. A
|
||||
module is automatically compiled for each target in `HWY_TARGETS` (see
|
||||
[quick-reference](g3doc/quick_reference.md)) if `HWY_TARGET_INCLUDE` is
|
||||
defined and foreach_target.h is included.
|
||||
|
||||
## Strip-mining loops
|
||||
|
||||
To vectorize a loop, "strip-mining" transforms it into an outer loop and inner
|
||||
loop with number of iterations matching the preferred vector width.
|
||||
|
||||
In this section, let `T` denote the element type, `d = HWY_FULL(T)`, `count` the
|
||||
number of elements to process, and `N = Lanes(d)` the number of lanes in a full
|
||||
vector. Assume the loop body is given as a function `template<bool partial,
|
||||
class D> void LoopBody(D d, size_t max_n)`.
|
||||
|
||||
Highway offers several ways to express loops where `N` need not divide `count`:
|
||||
|
||||
* Ensure all inputs/outputs are padded. Then the loop is simply
|
||||
|
||||
```
|
||||
for (size_t i = 0; i < count; i += N) LoopBody<false>(d, 0);
|
||||
```
|
||||
Here, the template parameter and second function argument are not needed.
|
||||
|
||||
This is the preferred option, unless `N` is in the thousands and vector
|
||||
operations are pipelined with long latencies. This was the case for
|
||||
supercomputers in the 90s, but nowadays ALUs are cheap and we see most
|
||||
implementations split vectors into 1, 2 or 4 parts, so there is little cost
|
||||
to processing entire vectors even if we do not need all their lanes. Indeed
|
||||
this avoids the (potentially large) cost of predication or partial
|
||||
loads/stores on older targets, and does not duplicate code.
|
||||
|
||||
* Process whole vectors as above, followed by a scalar loop:
|
||||
|
||||
```
|
||||
size_t i = 0;
|
||||
for (; i + N <= count; i += N) LoopBody<false>(d, 0);
|
||||
for (; i < count; ++i) LoopBody<false>(HWY_CAPPED(T, 1)(), 0);
|
||||
```
|
||||
The template parameter and second function argument are again not needed.
|
||||
|
||||
This avoids duplicating code, and is reasonable if `count` is large.
|
||||
Otherwise, multiple iterations may be slower than one `LoopBody` variant
|
||||
with masking, especially because the `HWY_SCALAR` target selected by
|
||||
`HWY_CAPPED(T, 1)` is slower for some operations due to workarounds for
|
||||
undefined behavior in C++.
|
||||
|
||||
* Process whole vectors as above, followed by a single call to a modified
|
||||
`LoopBody` with masking:
|
||||
|
||||
```
|
||||
size_t i = 0;
|
||||
for (; i + N <= count; i += N) {
|
||||
LoopBody<false>(d, 0);
|
||||
}
|
||||
if (i < count) {
|
||||
LoopBody<true>(d, count - i);
|
||||
}
|
||||
```
|
||||
Now the template parameter and second function argument can be used inside
|
||||
`LoopBody` to replace `Load/Store` of full aligned vectors with
|
||||
`LoadN/StoreN(n)` that affect no more than `1 <= n <= N` aligned elements
|
||||
(pending implementation).
|
||||
|
||||
This is a good default when it is infeasible to ensure vectors are padded.
|
||||
In contrast to the scalar loop, only a single final iteration is needed.
|
||||
|
||||
## Design philosophy
|
||||
|
||||
* Performance is important but not the sole consideration. Anyone who goes to
|
||||
the trouble of using SIMD clearly cares about speed. However, portability,
|
||||
maintainability and readability also matter, otherwise we would write in
|
||||
assembly. We aim for performance within 10-20% of a hand-written assembly
|
||||
implementation on the development platform.
|
||||
|
||||
* The guiding principles of C++ are "pay only for what you use" and "leave no
|
||||
room for a lower-level language below C++". We apply these by defining a
|
||||
SIMD API that ensures operation costs are visible, predictable and minimal.
|
||||
|
||||
* Performance portability is important, i.e. the API should be efficient on
|
||||
all target platforms. Unfortunately, common idioms for one platform can be
|
||||
inefficient on others. For example: summing lanes horizontally versus
|
||||
shuffling. Documenting which operations are expensive does not prevent their
|
||||
use, as evidenced by widespread use of `HADDPS`. Performance acceptance
|
||||
tests may detect large regressions, but do not help choose the approach
|
||||
during initial development. Analysis tools can warn about some potential
|
||||
inefficiencies, but likely not all. We instead provide [a carefully chosen
|
||||
set of vector types and operations that are efficient on all target
|
||||
platforms][instmtx] (PPC8, SSE4/AVX2+, ARMv8).
|
||||
|
||||
* Future SIMD hardware features are difficult to predict. For example, AVX2
|
||||
came with surprising semantics (almost no interaction between 128-bit
|
||||
blocks) and AVX-512 added two kinds of predicates (writemask and zeromask).
|
||||
To ensure the API reflects hardware realities, we suggest a flexible
|
||||
approach that adds new operations as they become commonly available, with
|
||||
scalar fallbacks where not supported.
|
||||
|
||||
* Masking is not yet widely supported on current CPUs. It is difficult to
|
||||
define an interface that provides access to all platform features while
|
||||
retaining performance portability. The P0214R5 proposal lacks support for
|
||||
AVX-512/ARM SVE zeromasks. We suggest limiting usage of masks to the
|
||||
`IfThen[Zero]Else[Zero]` functions until the community has gained more
|
||||
experience with them.
|
||||
|
||||
* "Width-agnostic" SIMD is more future-proof than user-specified fixed sizes.
|
||||
For example, valarray-like code can iterate over a 1D array with a
|
||||
library-specified vector width. This will result in better code when vector
|
||||
sizes increase, and matches the direction taken by
|
||||
[ARM SVE](https://alastairreid.github.io/papers/sve-ieee-micro-2017.pdf) and
|
||||
RISC-V V as well as Agner Fog's
|
||||
[ForwardCom instruction set proposal](https://goo.gl/CFizWu). However, some
|
||||
applications may require fixed sizes, so we also guarantee support for
|
||||
128-bit vectors in each instruction set.
|
||||
|
||||
* The API and its implementation should be usable and efficient with commonly
|
||||
used compilers, including MSVC. For example, we write `ShiftLeft<3>(v)`
|
||||
instead of `v << 3` because MSVC 2017 (ARM64) does not propagate the literal
|
||||
(https://godbolt.org/g/rKx5Ga). Highway requires function-specific
|
||||
target attributes, supported by GCC 4.9 / Clang 3.9 / MSVC 2015.
|
||||
|
||||
* Efficient and safe runtime dispatch is important. Modules such as image or
|
||||
video codecs are typically embedded into larger applications such as
|
||||
browsers, so they cannot require separate binaries for each CPU. Libraries
|
||||
also cannot predict whether the application already uses AVX2 (and pays the
|
||||
frequency throttling cost), so this decision must be left to the
|
||||
application. Using only the lowest-common denominator instructions
|
||||
sacrifices too much performance.
|
||||
Therefore, we provide code paths for multiple instruction sets and choose
|
||||
the most suitable at runtime. To reduce overhead, dispatch should be hoisted
|
||||
to higher layers instead of checking inside every low-level function.
|
||||
Highway supports inlining functions in the same file or in *-inl.h headers.
|
||||
We generate all code paths from the same source to reduce implementation-
|
||||
and debugging cost.
|
||||
|
||||
* Not every CPU need be supported. For example, pre-SSE4.1 CPUs are
|
||||
increasingly rare and the AVX instruction set is limited to floating-point
|
||||
operations. To reduce code size and compile time, we provide specializations
|
||||
for SSE4, AVX2 and AVX-512 instruction sets on x86, plus a scalar fallback.
|
||||
|
||||
* Access to platform-specific intrinsics is necessary for acceptance in
|
||||
performance-critical projects. We provide conversions to and from intrinsics
|
||||
to allow utilizing specialized platform-specific functionality, and simplify
|
||||
incremental porting of existing code.
|
||||
|
||||
* The core API should be compact and easy to learn. We provide only the few
|
||||
dozen operations which are necessary and sufficient for most of the 150+
|
||||
SIMD applications we examined.
|
||||
|
||||
## Prior API designs
|
||||
|
||||
The author has been writing SIMD code since 2002: first via assembly language,
|
||||
then intrinsics, later Intel's `F32vec4` wrapper, followed by three generations
|
||||
of custom vector classes. The first used macros to generate the classes, which
|
||||
reduces duplication but also readability. The second used templates instead.
|
||||
The third (used in highwayhash and PIK) added support for AVX2 and runtime
|
||||
dispatch. The current design (used in JPEG XL) enables code generation for
|
||||
multiple platforms and/or instruction sets from the same source, and improves
|
||||
runtime dispatch.
|
||||
|
||||
## Differences versus [P0214R5 proposal](https://goo.gl/zKW4SA)
|
||||
|
||||
1. Adding widely used and portable operations such as `AndNot`, `AverageRound`,
|
||||
bit-shift by immediates and `IfThenElse`.
|
||||
|
||||
1. Designing the API to avoid or minimize overhead on AVX2/AVX-512 caused by
|
||||
crossing 128-bit 'block' boundaries.
|
||||
|
||||
1. Avoiding the need for non-native vectors. By contrast, P0214R5's `simd_cast`
|
||||
returns `fixed_size<>` vectors which are more expensive to access because
|
||||
they reside on the stack. We can avoid this plus additional overhead on
|
||||
ARM/AVX2 by defining width-expanding operations as functions of a vector
|
||||
part, e.g. promoting half a vector of `uint8_t` lanes to one full vector of
|
||||
`uint16_t`, or demoting full vectors to half vectors with half-width lanes.
|
||||
|
||||
1. Guaranteeing access to the underlying intrinsic vector type. This ensures
|
||||
all platform-specific capabilities can be used. P0214R5 instead only
|
||||
'encourages' implementations to provide access.
|
||||
|
||||
1. Enabling safe runtime dispatch and inlining in the same binary. P0214R5 is
|
||||
based on the Vc library, which does not provide assistance for linking
|
||||
multiple instruction sets into the same binary. The Vc documentation
|
||||
suggests compiling separate executables for each instruction set or using
|
||||
GCC's ifunc (indirect functions). The latter is compiler-specific and risks
|
||||
crashes due to ODR violations when compiling the same function with
|
||||
different compiler flags. We solve this problem via target-specific
|
||||
namespaces and attributes (see the "Quick start" section above). We also permit a mix of
|
||||
static target selection and runtime dispatch for hotspots that may benefit
|
||||
from newer instruction sets if available.
|
||||
|
||||
1. Using built-in PPC vector types without a wrapper class. This leads to much
|
||||
better code generation with GCC 6.3: https://godbolt.org/z/pd2PNP.
|
||||
By contrast, P0214R5 requires a wrapper. We avoid this by using only the
|
||||
member operators provided by the PPC vectors; all other functions and
|
||||
typedefs are non-members. 2019-04 update: Clang power64le does not have
|
||||
this issue, so we simplified get_part(d, v) to GetLane(v).
|
||||
|
||||
1. Omitting inefficient or non-performance-portable operations such as `hmax`,
|
||||
`operator[]`, and unsupported integer comparisons. Applications can often
|
||||
replace these operations at lower cost than emulating that exact behavior.
|
||||
|
||||
1. Omitting `long double` types: these are not commonly available in hardware.
|
||||
|
||||
1. Ensuring signed integer overflow has well-defined semantics (wraparound).
|
||||
|
||||
1. Simple header-only implementation and less than a tenth of the size of the
|
||||
Vc library from which P0214 was derived (98,000 lines in
|
||||
https://github.com/VcDevel/Vc according to the gloc Chrome extension).
|
||||
|
||||
1. Avoiding hidden performance costs. P0214R5 allows implicit conversions from
|
||||
integer to float, which costs 3-4 cycles on x86. We make these conversions
|
||||
explicit to ensure their cost is visible.
|
||||
|
||||
## Other related work
|
||||
|
||||
* [Neat SIMD](http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=7568423)
|
||||
adopts a similar approach with interchangeable vector/scalar types and
|
||||
a compact interface. It allows access to the underlying intrinsics, but
|
||||
does not appear to be designed for platforms other than x86.
|
||||
|
||||
* UME::SIMD ([code](https://goo.gl/yPeVZx), [paper](https://goo.gl/2xpZrk))
|
||||
also adopts an explicit vectorization model with vector classes.
|
||||
However, it exposes the union of all platform capabilities, which makes the
|
||||
API harder to learn (209-page spec) and implement (the estimated LOC count
|
||||
is [500K](https://goo.gl/1THFRi)). The API is less performance-portable
|
||||
because it allows applications to use operations that are inefficient on
|
||||
other platforms.
|
||||
|
||||
* Inastemp ([code](https://goo.gl/hg3USM), [paper](https://goo.gl/YcTU7S))
|
||||
is a vector library for scientific computing with some innovative features:
|
||||
automatic FLOPS counting, and "if/else branches" using lambda functions.
|
||||
It supports IBM Power8, but only provides float and double types.
|
||||
|
||||
## Overloaded function API
|
||||
|
||||
Most C++ vector APIs rely on class templates. However, the ARM SVE vector
|
||||
type is sizeless and cannot be wrapped in a class. We instead rely on overloaded
|
||||
functions. Overloading based on vector types is also undesirable because SVE
|
||||
vectors cannot be default-constructed. We instead use a dedicated 'descriptor'
|
||||
type `Simd` for overloading, abbreviated to `D` for template arguments and
|
||||
`d` in lvalues.
|
||||
|
||||
Note that generic function templates are possible (see highway.h).
|
||||
|
||||
## Masks
|
||||
|
||||
AVX-512 introduced a major change to the SIMD interface: special mask registers
|
||||
(one bit per lane) that serve as predicates. It would be expensive to force
|
||||
AVX-512 implementations to conform to the prior model of full vectors with lanes
|
||||
set to all one or all zero bits. We instead provide a Mask type that emulates
|
||||
a subset of this functionality on other platforms at zero cost.
|
||||
|
||||
Masks are returned by comparisons and `TestBit`; they serve as the input to
|
||||
`IfThen*`. We provide conversions between masks and vector lanes. For clarity
|
||||
and safety, we use FF..FF as the definition of true. To also benefit from
|
||||
x86 instructions that only require the sign bit of floating-point inputs to be
|
||||
set, we provide a special `ZeroIfNegative` function.
|
||||
|
||||
## Additional resources
|
||||
|
||||
* [Highway introduction (slides)][intro]
|
||||
* [Overview of instructions per operation on different architectures][instmtx]
|
||||
|
||||
[intro]: g3doc/highway_intro.pdf
|
||||
[instmtx]: g3doc/instruction_matrix.pdf
|
||||
|
||||
This is not an officially supported Google product.
|
||||
Contact: janwas@google.com
|
|
@ -0,0 +1,145 @@
|
|||
// Copyright 2020 Google LLC
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "contrib/image/image.h"
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "contrib/image/image.cc"
|
||||
|
||||
#include <algorithm> // swap
|
||||
|
||||
#include "hwy/foreach_target.h"
|
||||
#include "hwy/highway.h"
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
size_t GetVectorSize() { return Lanes(HWY_FULL(uint8_t)()); }
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
namespace hwy {
|
||||
namespace {
|
||||
HWY_EXPORT(GetVectorSize); // Local function.
|
||||
} // namespace
|
||||
|
||||
size_t ImageBase::VectorSize() {
|
||||
// Do not cache result - must return the current value, which may be greater
|
||||
// than the first call if it was subject to DisableTargets!
|
||||
return HWY_DYNAMIC_DISPATCH(GetVectorSize)();
|
||||
}
|
||||
|
||||
size_t ImageBase::BytesPerRow(const size_t xsize, const size_t sizeof_t) {
|
||||
const size_t vec_size = VectorSize();
|
||||
size_t valid_bytes = xsize * sizeof_t;
|
||||
|
||||
// Allow unaligned accesses starting at the last valid value - this may raise
|
||||
// msan errors unless the user calls InitializePaddingForUnalignedAccesses.
|
||||
// Skip for the scalar case because no extra lanes will be loaded.
|
||||
if (vec_size != 1) {
|
||||
HWY_DASSERT(vec_size >= sizeof_t);
|
||||
valid_bytes += vec_size - sizeof_t;
|
||||
}
|
||||
|
||||
// Round up to vector and cache line size.
|
||||
const size_t align = std::max<size_t>(vec_size, HWY_ALIGNMENT);
|
||||
size_t bytes_per_row = RoundUpTo(valid_bytes, align);
|
||||
|
||||
// During the lengthy window before writes are committed to memory, CPUs
|
||||
// guard against read after write hazards by checking the address, but
|
||||
// only the lower 11 bits. We avoid a false dependency between writes to
|
||||
// consecutive rows by ensuring their sizes are not multiples of 2 KiB.
|
||||
// Avoid2K prevents the same problem for the planes of an Image3.
|
||||
if (bytes_per_row % HWY_ALIGNMENT == 0) {
|
||||
bytes_per_row += align;
|
||||
}
|
||||
|
||||
HWY_DASSERT(bytes_per_row % align == 0);
|
||||
return bytes_per_row;
|
||||
}
|
||||
|
||||
// Allocating constructor: creates an image of xsize x ysize elements of
// `sizeof_t` bytes each (1, 2, 4 or 8). The dimensions are stored as uint32_t.
ImageBase::ImageBase(const size_t xsize, const size_t ysize,
                     const size_t sizeof_t)
    : xsize_(static_cast<uint32_t>(xsize)),
      ysize_(static_cast<uint32_t>(ysize)),
      bytes_(nullptr, AlignedFreer(&AlignedFreer::DoNothing, nullptr)) {
  HWY_ASSERT(sizeof_t == 1 || sizeof_t == 2 || sizeof_t == 4 || sizeof_t == 8);

  bytes_per_row_ = 0;

  // A zero dimension is legal (e.g. lazily-allocated images). Skip the
  // allocation entirely in that case, because even a "zero"-byte allocation
  // carries padding/bookkeeping overhead.
  if (xsize == 0 || ysize == 0) return;

  bytes_per_row_ = BytesPerRow(xsize, sizeof_t);
  bytes_ = AllocateAligned<uint8_t>(bytes_per_row_ * ysize);
  HWY_ASSERT(bytes_.get() != nullptr);
  InitializePadding(sizeof_t, Padding::kRoundUp);
}
|
||||
|
||||
// Wraps caller-provided memory without allocating: `aligned` is stored with a
// deleter constructed from AlignedFreer::DoNothing, and `bytes_per_row` is
// taken as the row stride. The dimensions are stored as uint32_t.
ImageBase::ImageBase(const size_t xsize, const size_t ysize,
|
||||
const size_t bytes_per_row, void* const aligned)
|
||||
: xsize_(static_cast<uint32_t>(xsize)),
|
||||
ysize_(static_cast<uint32_t>(ysize)),
|
||||
bytes_per_row_(bytes_per_row),
|
||||
bytes_(static_cast<uint8_t*>(aligned),
|
||||
AlignedFreer(&AlignedFreer::DoNothing, nullptr)) {
|
||||
// Both the row stride and the base address must be multiples of the current
// vector size in bytes.
const size_t vec_size = VectorSize();
|
||||
HWY_ASSERT(bytes_per_row % vec_size == 0);
|
||||
HWY_ASSERT(reinterpret_cast<uintptr_t>(aligned) % vec_size == 0);
|
||||
}
|
||||
|
||||
// Zero-fills the bytes that follow the valid pixels of every row. This only
// does anything when MEMORY_SANITIZER is defined or HWY_IDE is nonzero;
// otherwise it is a no-op. `padding` selects how far past the valid bytes the
// zero-fill extends.
void ImageBase::InitializePadding(const size_t sizeof_t, Padding padding) {
#if defined(MEMORY_SANITIZER) || HWY_IDE
  // Empty images have no rows to initialize.
  if (xsize_ == 0 || ysize_ == 0) return;

  // Vector size is in bytes and does not depend on sizeof_t.
  const size_t vector_bytes = VectorSize();
  // Scalar mode: no padding needed.
  if (vector_bytes == 1) return;

  const size_t row_payload = xsize_ * sizeof_t;
  size_t row_end;
  if (padding == Padding::kRoundUp) {
    row_end = RoundUpTo(row_payload, vector_bytes);
  } else {
    row_end = row_payload + vector_bytes - sizeof_t;
  }
  // Rows that already end exactly at the target offset need no work.
  if (row_end == row_payload) return;

  for (size_t y = 0; y < ysize_; ++y) {
    uint8_t* HWY_RESTRICT row = static_cast<uint8_t*>(VoidRow(y));
#if defined(__clang__) && (__clang_major__ <= 6)
    // Workaround for an msan bug in clang-6 when handling AVX2 operations:
    // clear the whole row, not just the padding. This lets tests pass under
    // msan, but is slower and hides msan warnings about uninitialized images.
    memset(row, 0, row_end);
#else
    memset(row + row_payload, 0, row_end - row_payload);
#endif  // clang6
  }
#else
  (void)sizeof_t;
  (void)padding;
#endif  // MEMORY_SANITIZER
}
|
||||
|
||||
// Exchanges the complete state of this image with `other`: the pixel storage
// handle, the row stride and both dimensions.
void ImageBase::Swap(ImageBase& other) {
  std::swap(bytes_, other.bytes_);
  std::swap(bytes_per_row_, other.bytes_per_row_);
  std::swap(ysize_, other.ysize_);
  std::swap(xsize_, other.xsize_);
}
|
||||
|
||||
} // namespace hwy
|
||||
#endif // HWY_ONCE
|
|
@ -0,0 +1,468 @@
|
|||
// Copyright 2020 Google LLC
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef HIGHWAY_CONTRIB_IMAGE_IMAGE_H_
|
||||
#define HIGHWAY_CONTRIB_IMAGE_IMAGE_H_
|
||||
|
||||
// SIMD/multicore-friendly planar image representation with row accessors.
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <cstddef>
|
||||
#include <utility> // std::move
|
||||
|
||||
#include "hwy/aligned_allocator.h"
|
||||
#include "hwy/base.h"
|
||||
|
||||
namespace hwy {
|
||||
|
||||
// Type-independent parts of Image<> - reduces code duplication and facilitates
|
||||
// moving member function implementations to cc file.
|
||||
struct ImageBase {
|
||||
// Returns required alignment in bytes for externally allocated memory.
|
||||
static size_t VectorSize();
|
||||
|
||||
// Returns distance [bytes] between the start of two consecutive rows, a
|
||||
// multiple of VectorSize but NOT kAlias (see implementation).
|
||||
static size_t BytesPerRow(const size_t xsize, const size_t sizeof_t);
|
||||
|
||||
// No allocation (for output params or unused images)
|
||||
ImageBase()
|
||||
: xsize_(0),
|
||||
ysize_(0),
|
||||
bytes_per_row_(0),
|
||||
bytes_(nullptr, AlignedFreer(&AlignedFreer::DoNothing, nullptr)) {}
|
||||
|
||||
// Allocates memory (this is the common case)
|
||||
ImageBase(size_t xsize, size_t ysize, size_t sizeof_t);
|
||||
|
||||
// References but does not take ownership of external memory. Useful for
|
||||
// interoperability with other libraries. `aligned` must be aligned to a
|
||||
// multiple of VectorSize() and `bytes_per_row` must also be a multiple of
|
||||
// VectorSize() or preferably equal to BytesPerRow().
|
||||
ImageBase(size_t xsize, size_t ysize, size_t bytes_per_row, void* aligned);
|
||||
|
||||
// Copy construction/assignment is forbidden to avoid inadvertent copies,
|
||||
// which can be very expensive. Use CopyImageTo() instead.
|
||||
ImageBase(const ImageBase& other) = delete;
|
||||
ImageBase& operator=(const ImageBase& other) = delete;
|
||||
|
||||
// Move constructor (required for returning Image from function)
|
||||
ImageBase(ImageBase&& other) noexcept = default;
|
||||
|
||||
// Move assignment (required for std::vector)
|
||||
ImageBase& operator=(ImageBase&& other) noexcept = default;
|
||||
|
||||
void Swap(ImageBase& other);
|
||||
|
||||
// Useful for pre-allocating image with some padding for alignment purposes
|
||||
// and later reporting the actual valid dimensions. Caller is responsible
|
||||
// for ensuring xsize/ysize are <= the original dimensions.
|
||||
void ShrinkTo(const size_t xsize, const size_t ysize) {
|
||||
xsize_ = static_cast<uint32_t>(xsize);
|
||||
ysize_ = static_cast<uint32_t>(ysize);
|
||||
// NOTE: we can't recompute bytes_per_row for more compact storage and
|
||||
// better locality because that would invalidate the image contents.
|
||||
}
|
||||
|
||||
// How many pixels.
|
||||
HWY_INLINE size_t xsize() const { return xsize_; }
|
||||
HWY_INLINE size_t ysize() const { return ysize_; }
|
||||
|
||||
// NOTE: do not use this for copying rows - the valid xsize may be much less.
|
||||
HWY_INLINE size_t bytes_per_row() const { return bytes_per_row_; }
|
||||
|
||||
// Raw access to byte contents, for interfacing with other libraries.
|
||||
// Unsigned char instead of char to avoid surprises (sign extension).
|
||||
HWY_INLINE uint8_t* bytes() {
|
||||
void* p = bytes_.get();
|
||||
return static_cast<uint8_t * HWY_RESTRICT>(HWY_ASSUME_ALIGNED(p, 64));
|
||||
}
|
||||
HWY_INLINE const uint8_t* bytes() const {
|
||||
const void* p = bytes_.get();
|
||||
return static_cast<const uint8_t * HWY_RESTRICT>(HWY_ASSUME_ALIGNED(p, 64));
|
||||
}
|
||||
|
||||
protected:
|
||||
// Returns pointer to the start of a row.
|
||||
HWY_INLINE void* VoidRow(const size_t y) const {
|
||||
#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
|
||||
defined(THREAD_SANITIZER)
|
||||
if (y >= ysize_) {
|
||||
HWY_ABORT("Row(%zu) >= %u\n", y, ysize_);
|
||||
}
|
||||
#endif
|
||||
|
||||
void* row = bytes_.get() + y * bytes_per_row_;
|
||||
return HWY_ASSUME_ALIGNED(row, 64);
|
||||
}
|
||||
|
||||
enum class Padding {
|
||||
// Allow Load(d, row + x) for x = 0; x < xsize(); x += Lanes(d). Default.
|
||||
kRoundUp,
|
||||
// Allow LoadU(d, row + x) for x <= xsize() - 1. This requires an extra
|
||||
// vector to be initialized. If done by default, this would suppress
|
||||
// legitimate msan warnings. We therefore require users to explicitly call
|
||||
// InitializePadding before using unaligned loads (e.g. convolution).
|
||||
kUnaligned
|
||||
};
|
||||
|
||||
// Initializes the minimum bytes required to suppress msan warnings from
|
||||
// legitimate (according to Padding mode) vector loads/stores on the right
|
||||
// border, where some lanes are uninitialized and assumed to be unused.
|
||||
void InitializePadding(size_t sizeof_t, Padding padding);
|
||||
|
||||
// (Members are non-const to enable assignment during move-assignment.)
|
||||
uint32_t xsize_; // In valid pixels, not including any padding.
|
||||
uint32_t ysize_;
|
||||
size_t bytes_per_row_; // Includes padding.
|
||||
AlignedFreeUniquePtr<uint8_t[]> bytes_;
|
||||
};
|
||||
|
||||
// Single channel, aligned rows separated by padding. T must be POD.
|
||||
//
|
||||
// 'Single channel' (one 2D array per channel) simplifies vectorization
|
||||
// (repeating the same operation on multiple adjacent components) without the
|
||||
// complexity of a hybrid layout (8 R, 8 G, 8 B, ...). In particular, clients
|
||||
// can easily iterate over all components in a row and Image requires no
|
||||
// knowledge of the pixel format beyond the component type "T".
|
||||
//
|
||||
// 'Aligned' means each row is aligned to the L1 cache line size. This prevents
|
||||
// false sharing between two threads operating on adjacent rows.
|
||||
//
|
||||
// 'Padding' is still relevant because vectors could potentially be larger than
|
||||
// a cache line. By rounding up row sizes to the vector size, we allow
|
||||
// reading/writing ALIGNED vectors whose first lane is a valid sample. This
|
||||
// avoids needing a separate loop to handle remaining unaligned lanes.
|
||||
//
|
||||
// This image layout could also be achieved with a vector and a row accessor
|
||||
// function, but a class wrapper with support for "deleter" allows wrapping
|
||||
// existing memory allocated by clients without copying the pixels. It also
|
||||
// provides convenient accessors for xsize/ysize, which shortens function
|
||||
// argument lists. Supports move-construction so it can be stored in containers.
|
||||
template <typename ComponentType>
|
||||
class Image : public ImageBase {
|
||||
public:
|
||||
using T = ComponentType;
|
||||
|
||||
Image() = default;
|
||||
Image(const size_t xsize, const size_t ysize)
|
||||
: ImageBase(xsize, ysize, sizeof(T)) {}
|
||||
Image(const size_t xsize, const size_t ysize, size_t bytes_per_row,
|
||||
void* aligned)
|
||||
: ImageBase(xsize, ysize, bytes_per_row, aligned) {}
|
||||
|
||||
void InitializePaddingForUnalignedAccesses() {
|
||||
InitializePadding(sizeof(T), Padding::kUnaligned);
|
||||
}
|
||||
|
||||
HWY_INLINE const T* ConstRow(const size_t y) const {
|
||||
return static_cast<const T*>(VoidRow(y));
|
||||
}
|
||||
HWY_INLINE const T* ConstRow(const size_t y) {
|
||||
return static_cast<const T*>(VoidRow(y));
|
||||
}
|
||||
|
||||
// Returns pointer to non-const. This allows passing const Image* parameters
|
||||
// when the callee is only supposed to fill the pixels, as opposed to
|
||||
// allocating or resizing the image.
|
||||
HWY_INLINE T* MutableRow(const size_t y) const {
|
||||
return static_cast<T*>(VoidRow(y));
|
||||
}
|
||||
HWY_INLINE T* MutableRow(const size_t y) {
|
||||
return static_cast<T*>(VoidRow(y));
|
||||
}
|
||||
|
||||
// Returns number of pixels (some of which are padding) per row. Useful for
|
||||
// computing other rows via pointer arithmetic. WARNING: this must
|
||||
// NOT be used to determine xsize.
|
||||
HWY_INLINE intptr_t PixelsPerRow() const {
|
||||
return static_cast<intptr_t>(bytes_per_row_ / sizeof(T));
|
||||
}
|
||||
};
|
||||
|
||||
using ImageF = Image<float>;
|
||||
|
||||
// A bundle of 3 same-sized images. To fill an existing Image3 using
|
||||
// single-channel producers, we also need access to each const Image*. Const
|
||||
// prevents breaking the same-size invariant, while still allowing pixels to be
|
||||
// changed via MutableRow.
|
||||
template <typename ComponentType>
|
||||
class Image3 {
|
||||
public:
|
||||
using T = ComponentType;
|
||||
using ImageT = Image<T>;
|
||||
static constexpr size_t kNumPlanes = 3;
|
||||
|
||||
Image3() : planes_{ImageT(), ImageT(), ImageT()} {}
|
||||
|
||||
Image3(const size_t xsize, const size_t ysize)
|
||||
: planes_{ImageT(xsize, ysize), ImageT(xsize, ysize),
|
||||
ImageT(xsize, ysize)} {}
|
||||
|
||||
Image3(Image3&& other) noexcept {
|
||||
for (size_t i = 0; i < kNumPlanes; i++) {
|
||||
planes_[i] = std::move(other.planes_[i]);
|
||||
}
|
||||
}
|
||||
|
||||
Image3(ImageT&& plane0, ImageT&& plane1, ImageT&& plane2) {
|
||||
if (!SameSize(plane0, plane1) || !SameSize(plane0, plane2)) {
|
||||
HWY_ABORT("Not same size: %zu x %zu, %zu x %zu, %zu x %zu\n",
|
||||
plane0.xsize(), plane0.ysize(), plane1.xsize(), plane1.ysize(),
|
||||
plane2.xsize(), plane2.ysize());
|
||||
}
|
||||
planes_[0] = std::move(plane0);
|
||||
planes_[1] = std::move(plane1);
|
||||
planes_[2] = std::move(plane2);
|
||||
}
|
||||
|
||||
// Copy construction/assignment is forbidden to avoid inadvertent copies,
|
||||
// which can be very expensive. Use CopyImageTo instead.
|
||||
Image3(const Image3& other) = delete;
|
||||
Image3& operator=(const Image3& other) = delete;
|
||||
|
||||
Image3& operator=(Image3&& other) noexcept {
|
||||
for (size_t i = 0; i < kNumPlanes; i++) {
|
||||
planes_[i] = std::move(other.planes_[i]);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
HWY_INLINE const T* ConstPlaneRow(const size_t c, const size_t y) const {
|
||||
return static_cast<const T*>(VoidPlaneRow(c, y));
|
||||
}
|
||||
HWY_INLINE const T* ConstPlaneRow(const size_t c, const size_t y) {
|
||||
return static_cast<const T*>(VoidPlaneRow(c, y));
|
||||
}
|
||||
|
||||
HWY_INLINE T* MutablePlaneRow(const size_t c, const size_t y) const {
|
||||
return static_cast<T*>(VoidPlaneRow(c, y));
|
||||
}
|
||||
HWY_INLINE T* MutablePlaneRow(const size_t c, const size_t y) {
|
||||
return static_cast<T*>(VoidPlaneRow(c, y));
|
||||
}
|
||||
|
||||
HWY_INLINE const ImageT& Plane(size_t idx) const { return planes_[idx]; }
|
||||
|
||||
void Swap(Image3& other) {
|
||||
for (size_t c = 0; c < 3; ++c) {
|
||||
other.planes_[c].Swap(planes_[c]);
|
||||
}
|
||||
}
|
||||
|
||||
void ShrinkTo(const size_t xsize, const size_t ysize) {
|
||||
for (ImageT& plane : planes_) {
|
||||
plane.ShrinkTo(xsize, ysize);
|
||||
}
|
||||
}
|
||||
|
||||
// Sizes of all three images are guaranteed to be equal.
|
||||
HWY_INLINE size_t xsize() const { return planes_[0].xsize(); }
|
||||
HWY_INLINE size_t ysize() const { return planes_[0].ysize(); }
|
||||
// Returns offset [bytes] from one row to the next row of the same plane.
|
||||
// WARNING: this must NOT be used to determine xsize, nor for copying rows -
|
||||
// the valid xsize may be much less.
|
||||
HWY_INLINE size_t bytes_per_row() const { return planes_[0].bytes_per_row(); }
|
||||
// Returns number of pixels (some of which are padding) per row. Useful for
|
||||
// computing other rows via pointer arithmetic. WARNING: this must NOT be used
|
||||
// to determine xsize.
|
||||
HWY_INLINE intptr_t PixelsPerRow() const { return planes_[0].PixelsPerRow(); }
|
||||
|
||||
private:
|
||||
// Returns pointer to the start of a row.
|
||||
HWY_INLINE void* VoidPlaneRow(const size_t c, const size_t y) const {
|
||||
#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
|
||||
defined(THREAD_SANITIZER)
|
||||
if (c >= kNumPlanes || y >= ysize()) {
|
||||
HWY_ABORT("PlaneRow(%zu, %zu) >= %zu\n", c, y, ysize());
|
||||
}
|
||||
#endif
|
||||
// Use the first plane's stride because the compiler might not realize they
|
||||
// are all equal. Thus we only need a single multiplication for all planes.
|
||||
const size_t row_offset = y * planes_[0].bytes_per_row();
|
||||
const void* row = planes_[c].bytes() + row_offset;
|
||||
return static_cast<const T * HWY_RESTRICT>(
|
||||
HWY_ASSUME_ALIGNED(row, HWY_ALIGNMENT));
|
||||
}
|
||||
|
||||
private:
|
||||
ImageT planes_[kNumPlanes];
|
||||
};
|
||||
|
||||
using Image3F = Image3<float>;
|
||||
|
||||
// Rectangular region in image(s). Factoring this out of Image instead of
|
||||
// shifting the pointer by x0/y0 allows this to apply to multiple images with
|
||||
// different resolutions. Can compare size via SameSize(rect1, rect2).
|
||||
class Rect {
|
||||
public:
|
||||
// Most windows are xsize_max * ysize_max, except those on the borders where
|
||||
// begin + size_max > end.
|
||||
constexpr Rect(size_t xbegin, size_t ybegin, size_t xsize_max,
|
||||
size_t ysize_max, size_t xend, size_t yend)
|
||||
: x0_(xbegin),
|
||||
y0_(ybegin),
|
||||
xsize_(ClampedSize(xbegin, xsize_max, xend)),
|
||||
ysize_(ClampedSize(ybegin, ysize_max, yend)) {}
|
||||
|
||||
// Construct with origin and known size (typically from another Rect).
|
||||
constexpr Rect(size_t xbegin, size_t ybegin, size_t xsize, size_t ysize)
|
||||
: x0_(xbegin), y0_(ybegin), xsize_(xsize), ysize_(ysize) {}
|
||||
|
||||
// Construct a rect that covers a whole image.
|
||||
template <typename Image>
|
||||
explicit Rect(const Image& image)
|
||||
: Rect(0, 0, image.xsize(), image.ysize()) {}
|
||||
|
||||
Rect() : Rect(0, 0, 0, 0) {}
|
||||
|
||||
Rect(const Rect&) = default;
|
||||
Rect& operator=(const Rect&) = default;
|
||||
|
||||
Rect Subrect(size_t xbegin, size_t ybegin, size_t xsize_max,
|
||||
size_t ysize_max) {
|
||||
return Rect(x0_ + xbegin, y0_ + ybegin, xsize_max, ysize_max, x0_ + xsize_,
|
||||
y0_ + ysize_);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
const T* ConstRow(const Image<T>* image, size_t y) const {
|
||||
return image->ConstRow(y + y0_) + x0_;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
T* MutableRow(const Image<T>* image, size_t y) const {
|
||||
return image->MutableRow(y + y0_) + x0_;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
const T* ConstPlaneRow(const Image3<T>& image, size_t c, size_t y) const {
|
||||
return image.ConstPlaneRow(c, y + y0_) + x0_;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
T* MutablePlaneRow(Image3<T>* image, const size_t c, size_t y) const {
|
||||
return image->MutablePlaneRow(c, y + y0_) + x0_;
|
||||
}
|
||||
|
||||
// Returns true if this Rect fully resides in the given image. ImageT could be
|
||||
// Image<T> or Image3<T>; however if ImageT is Rect, results are nonsensical.
|
||||
template <class ImageT>
|
||||
bool IsInside(const ImageT& image) const {
|
||||
return (x0_ + xsize_ <= image.xsize()) && (y0_ + ysize_ <= image.ysize());
|
||||
}
|
||||
|
||||
size_t x0() const { return x0_; }
|
||||
size_t y0() const { return y0_; }
|
||||
size_t xsize() const { return xsize_; }
|
||||
size_t ysize() const { return ysize_; }
|
||||
|
||||
private:
|
||||
// Returns size_max, or whatever is left in [begin, end).
|
||||
static constexpr size_t ClampedSize(size_t begin, size_t size_max,
|
||||
size_t end) {
|
||||
return (begin + size_max <= end) ? size_max
|
||||
: (end > begin ? end - begin : 0);
|
||||
}
|
||||
|
||||
size_t x0_;
|
||||
size_t y0_;
|
||||
|
||||
size_t xsize_;
|
||||
size_t ysize_;
|
||||
};
|
||||
|
||||
// Works for any image-like input type(s).
|
||||
template <class Image1, class Image2>
|
||||
HWY_MAYBE_UNUSED bool SameSize(const Image1& image1, const Image2& image2) {
|
||||
return image1.xsize() == image2.xsize() && image1.ysize() == image2.ysize();
|
||||
}
|
||||
|
||||
// Mirrors out of bounds coordinates and returns valid coordinates unchanged.
|
||||
// We assume the radius (distance outside the image) is small compared to the
|
||||
// image size, otherwise this might not terminate.
|
||||
// The mirror is outside the last column (border pixel is also replicated).
|
||||
static HWY_INLINE HWY_MAYBE_UNUSED size_t Mirror(int64_t x,
                                                 const int64_t xsize) {
  HWY_DASSERT(xsize != 0);

  // TODO(janwas): replace with branchless version
  // Reflect repeatedly until the coordinate lands inside [0, xsize).
  for (;;) {
    if (x < 0) {
      // Left of the image: -1 maps to 0, -2 to 1, ...
      x = -x - 1;
    } else if (x >= xsize) {
      // Right of the image: xsize maps to xsize - 1, xsize + 1 to xsize - 2...
      x = 2 * xsize - 1 - x;
    } else {
      break;
    }
  }
  return static_cast<size_t>(x);
}
|
||||
|
||||
// Wrap modes for ensuring X/Y coordinates are in the valid range [0, size):
|
||||
|
||||
// Mirrors (repeating the edge pixel once). Useful for convolutions.
|
||||
struct WrapMirror {
|
||||
HWY_INLINE size_t operator()(const int64_t coord, const size_t size) const {
|
||||
return Mirror(coord, static_cast<int64_t>(size));
|
||||
}
|
||||
};
|
||||
|
||||
// Returns the same coordinate, for when we know "coord" is already valid (e.g.
|
||||
// interior of an image).
|
||||
struct WrapUnchanged {
|
||||
HWY_INLINE size_t operator()(const int64_t coord, size_t /*size*/) const {
|
||||
return static_cast<size_t>(coord);
|
||||
}
|
||||
};
|
||||
|
||||
// Similar to Wrap* but for row pointers (reduces Row() multiplications).
|
||||
|
||||
class WrapRowMirror {
|
||||
public:
|
||||
template <class View>
|
||||
WrapRowMirror(const View& image, size_t ysize)
|
||||
: first_row_(image.ConstRow(0)), last_row_(image.ConstRow(ysize - 1)) {}
|
||||
|
||||
const float* operator()(const float* const HWY_RESTRICT row,
|
||||
const int64_t stride) const {
|
||||
if (row < first_row_) {
|
||||
const int64_t num_before = first_row_ - row;
|
||||
// Mirrored; one row before => row 0, two before = row 1, ...
|
||||
return first_row_ + num_before - stride;
|
||||
}
|
||||
if (row > last_row_) {
|
||||
const int64_t num_after = row - last_row_;
|
||||
// Mirrored; one row after => last row, two after = last - 1, ...
|
||||
return last_row_ - num_after + stride;
|
||||
}
|
||||
return row;
|
||||
}
|
||||
|
||||
private:
|
||||
const float* const HWY_RESTRICT first_row_;
|
||||
const float* const HWY_RESTRICT last_row_;
|
||||
};
|
||||
|
||||
struct WrapRowUnchanged {
|
||||
HWY_INLINE const float* operator()(const float* const HWY_RESTRICT row,
|
||||
int64_t /*stride*/) const {
|
||||
return row;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace hwy
|
||||
|
||||
#endif // HIGHWAY_CONTRIB_IMAGE_IMAGE_H_
|
|
@ -0,0 +1,151 @@
|
|||
// Copyright (c) the JPEG XL Project
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "contrib/image/image.h"
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
#include "hwy/base.h"
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "contrib/image/image_test.cc"
|
||||
#include "hwy/foreach_target.h"
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <random>
|
||||
#include <utility>
|
||||
|
||||
#include "hwy/highway.h"
|
||||
#include "hwy/tests/test_util-inl.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
// Ensure we can always write full aligned vectors.
|
||||
struct TestAlignedT {
|
||||
template <typename T>
|
||||
void operator()(T /*unused*/) const {
|
||||
std::mt19937 rng(129);
|
||||
std::uniform_int_distribution<int> dist(0, 16);
|
||||
const HWY_FULL(T) d;
|
||||
|
||||
for (size_t ysize = 1; ysize < 4; ++ysize) {
|
||||
for (size_t xsize = 1; xsize < 64; ++xsize) {
|
||||
Image<T> img(xsize, ysize);
|
||||
|
||||
for (size_t y = 0; y < ysize; ++y) {
|
||||
T* HWY_RESTRICT row = img.MutableRow(y);
|
||||
for (size_t x = 0; x < xsize; x += Lanes(d)) {
|
||||
const auto values = Iota(d, dist(rng));
|
||||
Store(values, d, row + x);
|
||||
}
|
||||
}
|
||||
|
||||
// Sanity check to prevent optimizing out the writes
|
||||
const auto x = std::uniform_int_distribution<size_t>(0, xsize - 1)(rng);
|
||||
const auto y = std::uniform_int_distribution<size_t>(0, ysize - 1)(rng);
|
||||
HWY_ASSERT(img.ConstRow(y)[x] < 16 + Lanes(d));
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
void TestAligned() { ForUnsignedTypes(TestAlignedT()); }
|
||||
|
||||
// Ensure we can write an unaligned vector starting at the last valid value.
|
||||
struct TestUnalignedT {
|
||||
template <typename T>
|
||||
void operator()(T /*unused*/) const {
|
||||
std::mt19937 rng(129);
|
||||
std::uniform_int_distribution<int> dist(0, 3);
|
||||
const HWY_FULL(T) d;
|
||||
|
||||
for (size_t ysize = 1; ysize < 4; ++ysize) {
|
||||
for (size_t xsize = 1; xsize < 128; ++xsize) {
|
||||
Image<T> img(xsize, ysize);
|
||||
img.InitializePaddingForUnalignedAccesses();
|
||||
|
||||
// This test reads padding, which only works if it was initialized,
|
||||
// which only happens in MSAN builds.
|
||||
#if defined(MEMORY_SANITIZER) || HWY_IDE
|
||||
// Initialize only the valid samples
|
||||
for (size_t y = 0; y < ysize; ++y) {
|
||||
T* HWY_RESTRICT row = img.MutableRow(y);
|
||||
for (size_t x = 0; x < xsize; ++x) {
|
||||
row[x] = 1 << dist(rng);
|
||||
}
|
||||
}
|
||||
|
||||
// Read padding bits
|
||||
auto accum = Zero(d);
|
||||
for (size_t y = 0; y < ysize; ++y) {
|
||||
T* HWY_RESTRICT row = img.MutableRow(y);
|
||||
for (size_t x = 0; x < xsize; ++x) {
|
||||
accum |= LoadU(d, row + x);
|
||||
}
|
||||
}
|
||||
|
||||
// Ensure padding was zero
|
||||
const size_t N = Lanes(d);
|
||||
auto lanes = AllocateAligned<T>(N);
|
||||
Store(accum, d, lanes.get());
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
HWY_ASSERT(lanes[i] < 16);
|
||||
}
|
||||
#else // Check that writing padding does not overwrite valid samples
|
||||
// Initialize only the valid samples
|
||||
for (size_t y = 0; y < ysize; ++y) {
|
||||
T* HWY_RESTRICT row = img.MutableRow(y);
|
||||
for (size_t x = 0; x < xsize; ++x) {
|
||||
row[x] = static_cast<T>(x);
|
||||
}
|
||||
}
|
||||
|
||||
// Zero padding and rightmost sample
|
||||
for (size_t y = 0; y < ysize; ++y) {
|
||||
T* HWY_RESTRICT row = img.MutableRow(y);
|
||||
StoreU(Zero(d), d, row + xsize - 1);
|
||||
}
|
||||
|
||||
// Ensure no samples except the rightmost were overwritten
|
||||
for (size_t y = 0; y < ysize; ++y) {
|
||||
T* HWY_RESTRICT row = img.MutableRow(y);
|
||||
for (size_t x = 0; x < xsize - 1; ++x) {
|
||||
HWY_ASSERT_EQ(static_cast<T>(x), row[x]);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
void TestUnaligned() { ForUnsignedTypes(TestUnalignedT()); }
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
namespace hwy {
|
||||
HWY_BEFORE_TEST(ImageTest);
|
||||
HWY_EXPORT_AND_TEST_P(ImageTest, TestAligned);
|
||||
HWY_EXPORT_AND_TEST_P(ImageTest, TestUnaligned);
|
||||
} // namespace hwy
|
||||
#endif
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -0,0 +1,188 @@
|
|||
// Copyright 2020 Google LLC
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <cfloat> // FLT_MAX
|
||||
#include <iostream>
|
||||
#include <type_traits>
|
||||
|
||||
// clang-format off
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "contrib/math/math_test.cc"
|
||||
#include "hwy/foreach_target.h"
|
||||
|
||||
#include "contrib/math/math-inl.h"
|
||||
#include "hwy/tests/test_util-inl.h"
|
||||
// clang-format on
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
template <class T, class D>
|
||||
void TestMath(const std::string name, T (*fx1)(T), Vec<D> (*fxN)(D, Vec<D>),
|
||||
D d, T min, T max, uint64_t max_error_ulp) {
|
||||
constexpr bool kIsF32 = (sizeof(T) == 4);
|
||||
using UintT = MakeUnsigned<T>;
|
||||
|
||||
const UintT min_bits = BitCast<UintT>(min);
|
||||
const UintT max_bits = BitCast<UintT>(max);
|
||||
|
||||
// If min is negative and max is positive, the range needs to be broken into
|
||||
// two pieces, [+0, max] and [-0, min], otherwise [min, max].
|
||||
int range_count = 1;
|
||||
UintT ranges[2][2] = {{min_bits, max_bits}, {0, 0}};
|
||||
if ((min < 0.0) && (max > 0.0)) {
|
||||
ranges[0][0] = BitCast<UintT>(static_cast<T>(+0.0));
|
||||
ranges[0][1] = max_bits;
|
||||
ranges[1][0] = BitCast<UintT>(static_cast<T>(-0.0));
|
||||
ranges[1][1] = min_bits;
|
||||
range_count = 2;
|
||||
}
|
||||
|
||||
uint64_t max_ulp = 0;
|
||||
#if HWY_ARCH_ARM
|
||||
// Emulation is slower, so cannot afford as many.
|
||||
constexpr UintT kSamplesPerRange = 25000;
|
||||
#else
|
||||
constexpr UintT kSamplesPerRange = 100000;
|
||||
#endif
|
||||
for (int range_index = 0; range_index < range_count; ++range_index) {
|
||||
const UintT start = ranges[range_index][0];
|
||||
const UintT stop = ranges[range_index][1];
|
||||
const UintT step = std::max<UintT>(1, ((stop - start) / kSamplesPerRange));
|
||||
for (UintT value_bits = start; value_bits <= stop; value_bits += step) {
|
||||
const T value = BitCast<T>(std::min(value_bits, stop));
|
||||
const T actual = GetLane(fxN(d, Set(d, value)));
|
||||
const T expected = fx1(value);
|
||||
|
||||
// Skip small inputs and outputs on armv7, it flushes subnormals to zero.
|
||||
#if HWY_TARGET == HWY_NEON && HWY_ARCH_ARM_V7
|
||||
if ((std::abs(value) < 1e-37f) || (std::abs(expected) < 1e-37f)) {
|
||||
continue;
|
||||
}
|
||||
#endif
|
||||
|
||||
const auto ulp = ComputeUlpDelta(actual, expected);
|
||||
max_ulp = std::max<uint64_t>(max_ulp, ulp);
|
||||
if (ulp > max_error_ulp) {
|
||||
std::cout << name << "<" << (kIsF32 ? "F32x" : "F64x") << Lanes(d)
|
||||
<< ">(" << value << ") expected: " << expected
|
||||
<< " actual: " << actual << std::endl;
|
||||
}
|
||||
HWY_ASSERT(ulp <= max_error_ulp);
|
||||
}
|
||||
}
|
||||
std::cout << (kIsF32 ? "F32x" : "F64x") << Lanes(d)
|
||||
<< ", Max ULP: " << max_ulp << std::endl;
|
||||
}
|
||||
|
||||
// Defines a functor `Test<NAME>` and a driver function `TestAll<NAME>()`.
// The functor's operator() forwards to TestMath: when sizeof(T) == 4 it passes
// the F32 scalar reference, vector function, input range and ULP tolerance;
// otherwise it passes the F64 counterparts. The driver runs the functor via
// ForFloatTypes(ForPartialVectors<Test<NAME>>()).
#define DEFINE_MATH_TEST(NAME, F32x1, F32xN, F32_MIN, F32_MAX, F32_ERROR, \
                         F64x1, F64xN, F64_MIN, F64_MAX, F64_ERROR)       \
  struct Test##NAME {                                                     \
    template <class T, class D>                                           \
    HWY_NOINLINE void operator()(T, D d) {                                \
      if (sizeof(T) == 4) {                                               \
        TestMath<T, D>(HWY_STR(NAME), F32x1, F32xN, d, F32_MIN, F32_MAX,  \
                       F32_ERROR);                                        \
      } else {                                                            \
        TestMath<T, D>(HWY_STR(NAME), F64x1, F64xN, d, F64_MIN, F64_MAX,  \
                       F64_ERROR);                                        \
      }                                                                   \
    }                                                                     \
  };                                                                      \
  HWY_NOINLINE void TestAll##NAME() {                                     \
    ForFloatTypes(ForPartialVectors<Test##NAME>());                       \
  }
|
||||
|
||||
// Floating point values closest to but less than 1.0
|
||||
const float kNearOneF = BitCast<float>(0x3F7FFFFF);
|
||||
const double kNearOneD = BitCast<double>(0x3FEFFFFFFFFFFFFFULL);
|
||||
|
||||
// clang-format off
|
||||
DEFINE_MATH_TEST(Acos,
|
||||
std::acos, CallAcos, -1.0, +1.0, 3, // NEON is 3 instead of 2
|
||||
std::acos, CallAcos, -1.0, +1.0, 2)
|
||||
DEFINE_MATH_TEST(Acosh,
|
||||
std::acosh, CallAcosh, +1.0, +FLT_MAX, 3,
|
||||
std::acosh, CallAcosh, +1.0, +DBL_MAX, 3)
|
||||
DEFINE_MATH_TEST(Asin,
|
||||
std::asin, CallAsin, -1.0, +1.0, 3, // NEON is 3 instead of 2
|
||||
std::asin, CallAsin, -1.0, +1.0, 2)
|
||||
DEFINE_MATH_TEST(Asinh,
|
||||
std::asinh, CallAsinh, -FLT_MAX, +FLT_MAX, 3,
|
||||
std::asinh, CallAsinh, -DBL_MAX, +DBL_MAX, 3)
|
||||
DEFINE_MATH_TEST(Atan,
|
||||
std::atan, CallAtan, -FLT_MAX, +FLT_MAX, 3,
|
||||
std::atan, CallAtan, -DBL_MAX, +DBL_MAX, 3)
|
||||
DEFINE_MATH_TEST(Atanh,
|
||||
std::atanh, CallAtanh, -kNearOneF, +kNearOneF, 4, // NEON is 4 instead of 3
|
||||
std::atanh, CallAtanh, -kNearOneD, +kNearOneD, 3)
|
||||
DEFINE_MATH_TEST(Cos,
|
||||
std::cos, CallCos, -39000.0, +39000.0, 3,
|
||||
std::cos, CallCos, -39000.0, +39000.0, 3)
|
||||
DEFINE_MATH_TEST(Exp,
|
||||
std::exp, CallExp, -FLT_MAX, +104.0, 1,
|
||||
std::exp, CallExp, -DBL_MAX, +104.0, 1)
|
||||
DEFINE_MATH_TEST(Expm1,
|
||||
std::expm1, CallExpm1, -FLT_MAX, +104.0, 4,
|
||||
std::expm1, CallExpm1, -DBL_MAX, +104.0, 4)
|
||||
DEFINE_MATH_TEST(Log,
|
||||
std::log, CallLog, +FLT_MIN, +FLT_MAX, 1,
|
||||
std::log, CallLog, +DBL_MIN, +DBL_MAX, 1)
|
||||
DEFINE_MATH_TEST(Log10,
|
||||
std::log10, CallLog10, +FLT_MIN, +FLT_MAX, 2,
|
||||
std::log10, CallLog10, +DBL_MIN, +DBL_MAX, 2)
|
||||
DEFINE_MATH_TEST(Log1p,
|
||||
std::log1p, CallLog1p, +0.0f, +1e37, 3, // NEON is 3 instead of 2
|
||||
std::log1p, CallLog1p, +0.0, +DBL_MAX, 2)
|
||||
DEFINE_MATH_TEST(Log2,
|
||||
std::log2, CallLog2, +FLT_MIN, +FLT_MAX, 2,
|
||||
std::log2, CallLog2, +DBL_MIN, +DBL_MAX, 2)
|
||||
DEFINE_MATH_TEST(Sin,
|
||||
std::sin, CallSin, -39000.0, +39000.0, 3,
|
||||
std::sin, CallSin, -39000.0, +39000.0, 3)
|
||||
DEFINE_MATH_TEST(Sinh,
|
||||
std::sinh, CallSinh, -80.0f, +80.0f, 4,
|
||||
std::sinh, CallSinh, -709.0, +709.0, 4)
|
||||
DEFINE_MATH_TEST(Tanh,
|
||||
std::tanh, CallTanh, -FLT_MAX, +FLT_MAX, 4,
|
||||
std::tanh, CallTanh, -DBL_MAX, +DBL_MAX, 4)
|
||||
// clang-format on
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
// Guarded by HWY_ONCE. Each TestAll* entry point is handed to
// HWY_EXPORT_AND_TEST_P for the HwyMathTest suite.
namespace hwy {

HWY_BEFORE_TEST(HwyMathTest);

// Inverse trigonometric and hyperbolic functions.
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAcos);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAcosh);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAsin);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAsinh);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAtan);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAtanh);

// Cosine, exponentials and logarithms.
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllCos);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllExp);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllExpm1);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog10);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog1p);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog2);

// Sine and the hyperbolic functions.
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllSin);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllSinh);
HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllTanh);

}  // namespace hwy
#endif  // HWY_ONCE
|
|
@ -0,0 +1,31 @@
|
|||
highway (0.12.0-1) UNRELEASED; urgency=medium
|
||||
|
||||
* Add Shift*8, Compress16, emulated Scatter/Gather, StoreInterleaved3/4
|
||||
* Remove deprecated HWY_*_LANES, deprecate HWY_GATHER_LANES
|
||||
* Proper IEEE rounding, reduce libstdc++ usage, inlined math
|
||||
|
||||
-- Jan Wassenberg <janwas@google.com> Thu, 15 Apr 2021 20:00:00 +0200
|
||||
|
||||
highway (0.11.1-1) UNRELEASED; urgency=medium
|
||||
|
||||
* Fix clang7 asan error, finish f16 conversions and add test
|
||||
|
||||
-- Jan Wassenberg <janwas@google.com> Thu, 25 Feb 2021 16:00:00 +0200
|
||||
|
||||
highway (0.11.0-1) UNRELEASED; urgency=medium
|
||||
|
||||
* Add RVV+mask logical ops, allow Shl/ShiftLeftSame on all targets, more math
|
||||
|
||||
-- Jan Wassenberg <janwas@google.com> Thu, 18 Feb 2021 20:00:00 +0200
|
||||
|
||||
highway (0.7.0-1) UNRELEASED; urgency=medium
|
||||
|
||||
* Added API stability notice, Compress[Store], contrib/, SignBit, CopySign
|
||||
|
||||
-- Jan Wassenberg <janwas@google.com> Tue, 5 Jan 2021 17:00:00 +0200
|
||||
|
||||
highway (0.1-1) UNRELEASED; urgency=medium
|
||||
|
||||
* Initial debian package.
|
||||
|
||||
-- Alex Deymo <deymo@google.com> Mon, 19 Oct 2020 16:48:07 +0200
|
|
@ -0,0 +1 @@
|
|||
10
|
|
@ -0,0 +1,23 @@
|
|||
Source: highway
|
||||
Maintainer: JPEG XL Maintainers <jpegxl@google.com>
|
||||
Section: misc
|
||||
Priority: optional
|
||||
Standards-Version: 3.9.8
|
||||
Build-Depends: cmake,
|
||||
debhelper (>= 9),
|
||||
libgtest-dev
|
||||
Homepage: https://github.com/google/highway
|
||||
|
||||
Package: libhwy-dev
|
||||
Architecture: any
|
||||
Section: libdevel
|
||||
Depends: ${misc:Depends}
|
||||
Description: Efficient and performance-portable SIMD wrapper (developer files)
|
||||
This library provides type-safe and source-code portable wrappers over
|
||||
existing platform-specific intrinsics. Its design aims for simplicity,
|
||||
reliable efficiency across platforms, and immediate usability with current
|
||||
compilers.
|
||||
.
|
||||
This package installs the development files. There's no runtime library
|
||||
since most of Highway is implemented in headers and only a very small
|
||||
static library is needed.
|
|
@ -0,0 +1,20 @@
|
|||
Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
|
||||
Upstream-Name: highway
|
||||
|
||||
Files: *
|
||||
Copyright: 2020 Google LLC
|
||||
License: Apache-2.0
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
.
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
.
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
.
|
||||
On Debian systems, the complete text of the Apache License, Version 2
|
||||
can be found in "/usr/share/common-licenses/Apache-2.0".
|
|
@ -0,0 +1,6 @@
|
|||
#!/usr/bin/make -f
|
||||
%:
|
||||
dh $@ --buildsystem=cmake
|
||||
|
||||
override_dh_auto_configure:
|
||||
dh_auto_configure -- -DHWY_SYSTEM_GTEST=ON
|
|
@ -0,0 +1 @@
|
|||
3.0 (quilt)
|
|
@ -0,0 +1,138 @@
|
|||
// Copyright 2019 Google LLC
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "hwy/aligned_allocator.h"
|
||||
|
||||
#include <stdarg.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h> // malloc
|
||||
|
||||
#include <atomic>
|
||||
#include <limits>
|
||||
|
||||
#include "hwy/base.h"
|
||||
|
||||
namespace hwy {
|
||||
namespace {
|
||||
|
||||
constexpr size_t kAlignment = HWY_MAX(HWY_ALIGNMENT, kMaxVectorSize);
|
||||
// On x86, aliasing can only occur at multiples of 2K, but that's too wasteful
|
||||
// if this is used for single-vector allocations. 256 is more reasonable.
|
||||
constexpr size_t kAlias = kAlignment * 4;
|
||||
|
||||
#pragma pack(push, 1)
|
||||
struct AllocationHeader {
|
||||
void* allocated;
|
||||
size_t payload_size;
|
||||
};
|
||||
#pragma pack(pop)
|
||||
|
||||
// Returns a 'random' (cyclical) offset for AllocateAlignedBytes.
|
||||
size_t NextAlignedOffset() {
|
||||
static std::atomic<uint32_t> next{0};
|
||||
constexpr uint32_t kGroups = kAlias / kAlignment;
|
||||
const uint32_t group = next.fetch_add(1, std::memory_order_relaxed) % kGroups;
|
||||
const size_t offset = kAlignment * group;
|
||||
HWY_DASSERT((offset % kAlignment == 0) && offset <= kAlias);
|
||||
return offset;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
void* AllocateAlignedBytes(const size_t payload_size, AllocPtr alloc_ptr,
|
||||
void* opaque_ptr) {
|
||||
if (payload_size >= std::numeric_limits<size_t>::max() / 2) {
|
||||
HWY_DASSERT(false && "payload_size too large");
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
size_t offset = NextAlignedOffset();
|
||||
|
||||
// What: | misalign | unused | AllocationHeader |payload
|
||||
// Size: |<= kAlias | offset |payload_size
|
||||
// ^allocated.^aligned.^header............^payload
|
||||
// The header must immediately precede payload, which must remain aligned.
|
||||
// To avoid wasting space, the header resides at the end of `unused`,
|
||||
// which therefore cannot be empty (offset == 0).
|
||||
if (offset == 0) {
|
||||
offset = kAlignment; // = RoundUpTo(sizeof(AllocationHeader), kAlignment)
|
||||
static_assert(sizeof(AllocationHeader) <= kAlignment, "Else: round up");
|
||||
}
|
||||
|
||||
const size_t allocated_size = kAlias + offset + payload_size;
|
||||
void* allocated;
|
||||
if (alloc_ptr == nullptr) {
|
||||
allocated = malloc(allocated_size);
|
||||
} else {
|
||||
allocated = (*alloc_ptr)(opaque_ptr, allocated_size);
|
||||
}
|
||||
if (allocated == nullptr) return nullptr;
|
||||
// Always round up even if already aligned - we already asked for kAlias
|
||||
// extra bytes and there's no way to give them back.
|
||||
uintptr_t aligned = reinterpret_cast<uintptr_t>(allocated) + kAlias;
|
||||
static_assert((kAlias & (kAlias - 1)) == 0, "kAlias must be a power of 2");
|
||||
static_assert(kAlias >= kAlignment, "Cannot align to more than kAlias");
|
||||
aligned &= ~(kAlias - 1);
|
||||
|
||||
const uintptr_t payload = aligned + offset; // still aligned
|
||||
|
||||
// Stash `allocated` and payload_size inside header for FreeAlignedBytes().
|
||||
// The allocated_size can be reconstructed from the payload_size.
|
||||
AllocationHeader* header = reinterpret_cast<AllocationHeader*>(payload) - 1;
|
||||
header->allocated = allocated;
|
||||
header->payload_size = payload_size;
|
||||
|
||||
return HWY_ASSUME_ALIGNED(reinterpret_cast<void*>(payload), kMaxVectorSize);
|
||||
}
|
||||
|
||||
void FreeAlignedBytes(const void* aligned_pointer, FreePtr free_ptr,
|
||||
void* opaque_ptr) {
|
||||
if (aligned_pointer == nullptr) return;
|
||||
|
||||
const uintptr_t payload = reinterpret_cast<uintptr_t>(aligned_pointer);
|
||||
HWY_DASSERT(payload % kAlignment == 0);
|
||||
const AllocationHeader* header =
|
||||
reinterpret_cast<const AllocationHeader*>(payload) - 1;
|
||||
|
||||
if (free_ptr == nullptr) {
|
||||
free(header->allocated);
|
||||
} else {
|
||||
(*free_ptr)(opaque_ptr, header->allocated);
|
||||
}
|
||||
}
|
||||
|
||||
// static
|
||||
void AlignedDeleter::DeleteAlignedArray(void* aligned_pointer, FreePtr free_ptr,
|
||||
void* opaque_ptr,
|
||||
ArrayDeleter deleter) {
|
||||
if (aligned_pointer == nullptr) return;
|
||||
|
||||
const uintptr_t payload = reinterpret_cast<uintptr_t>(aligned_pointer);
|
||||
HWY_DASSERT(payload % kAlignment == 0);
|
||||
const AllocationHeader* header =
|
||||
reinterpret_cast<const AllocationHeader*>(payload) - 1;
|
||||
|
||||
if (deleter) {
|
||||
(*deleter)(aligned_pointer, header->payload_size);
|
||||
}
|
||||
|
||||
if (free_ptr == nullptr) {
|
||||
free(header->allocated);
|
||||
} else {
|
||||
(*free_ptr)(opaque_ptr, header->allocated);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace hwy
|
|
@ -0,0 +1,179 @@
|
|||
// Copyright 2020 Google LLC
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_
|
||||
#define HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_
|
||||
|
||||
// Memory allocator with support for alignment and offsets.
|
||||
|
||||
#include <stddef.h>
|
||||
#include <memory>
|
||||
|
||||
namespace hwy {
|
||||
|
||||
// Minimum alignment of allocated memory for use in HWY_ASSUME_ALIGNED, which
|
||||
// requires a literal. This matches typical L1 cache line sizes, which prevents
|
||||
// false sharing.
|
||||
#define HWY_ALIGNMENT 64
|
||||
|
||||
// Pointers to functions equivalent to malloc/free with an opaque void* passed
|
||||
// to them.
|
||||
using AllocPtr = void* (*)(void* opaque, size_t bytes);
|
||||
using FreePtr = void (*)(void* opaque, void* memory);
|
||||
|
||||
// Returns null or a pointer to at least `payload_size` (which can be zero)
|
||||
// bytes of newly allocated memory, aligned to the larger of HWY_ALIGNMENT and
|
||||
// the vector size. Calls `alloc_ptr` with the passed `opaque_ptr` to obtain
|
||||
// memory, or malloc() if `alloc_ptr` is null.
|
||||
void* AllocateAlignedBytes(size_t payload_size, AllocPtr alloc_ptr,
|
||||
void* opaque_ptr);
|
||||
|
||||
// Frees all memory. No effect if `aligned_pointer` == nullptr, otherwise it
|
||||
// must have been returned from a previous call to `AllocateAlignedBytes`.
|
||||
// Calls `free_ptr` with the passed `opaque_ptr` pointer to free the memory; if
|
||||
// `free_ptr` function is null, uses the default free().
|
||||
void FreeAlignedBytes(const void* aligned_pointer, FreePtr free_ptr,
|
||||
void* opaque_ptr);
|
||||
|
||||
// Class that deletes the aligned pointer passed to operator() calling the
|
||||
// destructor before freeing the pointer. This is equivalent to the
|
||||
// std::default_delete but for aligned objects. For a similar deleter equivalent
|
||||
// to free() for aligned memory see AlignedFreer().
|
||||
class AlignedDeleter {
|
||||
public:
|
||||
AlignedDeleter() : free_(nullptr), opaque_ptr_(nullptr) {}
|
||||
AlignedDeleter(FreePtr free_ptr, void* opaque_ptr)
|
||||
: free_(free_ptr), opaque_ptr_(opaque_ptr) {}
|
||||
|
||||
template <typename T>
|
||||
void operator()(T* aligned_pointer) const {
|
||||
return DeleteAlignedArray(aligned_pointer, free_, opaque_ptr_,
|
||||
TypedArrayDeleter<T>);
|
||||
}
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
static void TypedArrayDeleter(void* ptr, size_t size_in_bytes) {
|
||||
size_t elems = size_in_bytes / sizeof(T);
|
||||
for (size_t i = 0; i < elems; i++) {
|
||||
// Explicitly call the destructor on each element.
|
||||
(static_cast<T*>(ptr) + i)->~T();
|
||||
}
|
||||
}
|
||||
|
||||
// Function prototype that calls the destructor for each element in a typed
|
||||
// array. TypeArrayDeleter<T> would match this prototype.
|
||||
using ArrayDeleter = void (*)(void* t_ptr, size_t t_size);
|
||||
|
||||
static void DeleteAlignedArray(void* aligned_pointer, FreePtr free_ptr,
|
||||
void* opaque_ptr, ArrayDeleter deleter);
|
||||
|
||||
FreePtr free_;
|
||||
void* opaque_ptr_;
|
||||
};
|
||||
|
||||
// Unique pointer to T with custom aligned deleter. This can be a single
|
||||
// element U or an array of element if T is a U[]. The custom aligned deleter
|
||||
// will call the destructor on U or each element of a U[] in the array case.
|
||||
template <typename T>
|
||||
using AlignedUniquePtr = std::unique_ptr<T, AlignedDeleter>;
|
||||
|
||||
// Aligned memory equivalent of make_unique<T> using the custom allocators
|
||||
// alloc/free with the passed `opaque` pointer. This function calls the
|
||||
// constructor with the passed Args... and calls the destructor of the object
|
||||
// when the AlignedUniquePtr is destroyed.
|
||||
template <typename T, typename... Args>
|
||||
AlignedUniquePtr<T> MakeUniqueAlignedWithAlloc(AllocPtr alloc, FreePtr free,
|
||||
void* opaque, Args&&... args) {
|
||||
T* ptr = static_cast<T*>(AllocateAlignedBytes(sizeof(T), alloc, opaque));
|
||||
return AlignedUniquePtr<T>(new (ptr) T(std::forward<Args>(args)...),
|
||||
AlignedDeleter(free, opaque));
|
||||
}
|
||||
|
||||
// Similar to MakeUniqueAlignedWithAlloc but using the default alloc/free
|
||||
// functions.
|
||||
template <typename T, typename... Args>
|
||||
AlignedUniquePtr<T> MakeUniqueAligned(Args&&... args) {
|
||||
T* ptr = static_cast<T*>(AllocateAlignedBytes(
|
||||
sizeof(T), /*alloc_ptr=*/nullptr, /*opaque_ptr=*/nullptr));
|
||||
return AlignedUniquePtr<T>(
|
||||
new (ptr) T(std::forward<Args>(args)...), AlignedDeleter());
|
||||
}
|
||||
|
||||
// Aligned memory equivalent of make_unique<T[]> for array types using the
|
||||
// custom allocators alloc/free. This function calls the constructor with the
|
||||
// passed Args... on every created item. The destructor of each element will be
|
||||
// called when the AlignedUniquePtr is destroyed.
|
||||
template <typename T, typename... Args>
|
||||
AlignedUniquePtr<T[]> MakeUniqueAlignedArrayWithAlloc(
|
||||
size_t items, AllocPtr alloc, FreePtr free, void* opaque, Args&&... args) {
|
||||
T* ptr =
|
||||
static_cast<T*>(AllocateAlignedBytes(items * sizeof(T), alloc, opaque));
|
||||
for (size_t i = 0; i < items; i++) {
|
||||
new (ptr + i) T(std::forward<Args>(args)...);
|
||||
}
|
||||
return AlignedUniquePtr<T[]>(ptr, AlignedDeleter(free, opaque));
|
||||
}
|
||||
|
||||
template <typename T, typename... Args>
|
||||
AlignedUniquePtr<T[]> MakeUniqueAlignedArray(size_t items, Args&&... args) {
|
||||
return MakeUniqueAlignedArrayWithAlloc<T, Args...>(
|
||||
items, nullptr, nullptr, nullptr, std::forward<Args>(args)...);
|
||||
}
|
||||
|
||||
// Custom deleter for std::unique_ptr equivalent to using free() as a deleter
|
||||
// but for aligned memory.
|
||||
class AlignedFreer {
|
||||
public:
|
||||
// Pass address of this to ctor to skip deleting externally-owned memory.
|
||||
static void DoNothing(void* /*opaque*/, void* /*aligned_pointer*/) {}
|
||||
|
||||
AlignedFreer() : free_(nullptr), opaque_ptr_(nullptr) {}
|
||||
AlignedFreer(FreePtr free_ptr, void* opaque_ptr)
|
||||
: free_(free_ptr), opaque_ptr_(opaque_ptr) {}
|
||||
|
||||
template <typename T>
|
||||
void operator()(T* aligned_pointer) const {
|
||||
// TODO(deymo): assert that we are using a POD type T.
|
||||
FreeAlignedBytes(aligned_pointer, free_, opaque_ptr_);
|
||||
}
|
||||
|
||||
private:
|
||||
FreePtr free_;
|
||||
void* opaque_ptr_;
|
||||
};
|
||||
|
||||
// Unique pointer to single POD, or (if T is U[]) an array of POD. For non POD
|
||||
// data use AlignedUniquePtr.
|
||||
template <typename T>
|
||||
using AlignedFreeUniquePtr = std::unique_ptr<T, AlignedFreer>;
|
||||
|
||||
// Allocate an aligned and uninitialized array of POD values as a unique_ptr.
|
||||
// Upon destruction of the unique_ptr the aligned array will be freed.
|
||||
template <typename T>
|
||||
AlignedFreeUniquePtr<T[]> AllocateAligned(const size_t items, AllocPtr alloc,
|
||||
FreePtr free, void* opaque) {
|
||||
return AlignedFreeUniquePtr<T[]>(
|
||||
static_cast<T*>(AllocateAlignedBytes(items * sizeof(T), alloc, opaque)),
|
||||
AlignedFreer(free, opaque));
|
||||
}
|
||||
|
||||
// Same as previous AllocateAligned(), using default allocate/free functions.
|
||||
template <typename T>
|
||||
AlignedFreeUniquePtr<T[]> AllocateAligned(const size_t items) {
|
||||
return AllocateAligned<T>(items, nullptr, nullptr, nullptr);
|
||||
}
|
||||
|
||||
} // namespace hwy
|
||||
#endif // HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_
|
|
@ -0,0 +1,250 @@
|
|||
// Copyright 2020 Google LLC
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "hwy/aligned_allocator.h"

#include <stddef.h>
#include <stdint.h>  // uintptr_t
#include <stdlib.h>  // malloc, free

#include <new>
#include <random>
#include <set>
#include <vector>

#include "gtest/gtest.h"
#include "hwy/base.h"
|
||||
|
||||
namespace {
|
||||
|
||||
// Test helper of exactly N bytes. When built with the explicit constructor it
// increments an external counter, and its destructor decrements that counter
// again, so tests can observe constructor/destructor calls.
template <size_t N>
class SampleObject {
 public:
  // Default construction: no counter attached, first data byte set to 'a'.
  SampleObject() { data_[0] = 'a'; }

  // Counting construction: first data byte set to 'b', counter bumped.
  explicit SampleObject(int* counter) : counter_(counter) {
    data_[0] = 'b';
    if (counter != nullptr) {
      ++*counter;
    }
  }

  ~SampleObject() {
    if (counter_ != nullptr) {
      --*counter_;
    }
  }

  static_assert(N > sizeof(int*), "SampleObject size too small.");
  int* counter_ = nullptr;
  // Pads the object to N bytes in total.
  char data_[N - sizeof(int*)];
};
|
||||
|
||||
class FakeAllocator {
|
||||
public:
|
||||
// static AllocPtr and FreePtr member to be used with the alligned
|
||||
// allocator. These functions calls the private non-static members.
|
||||
static void* StaticAlloc(void* opaque, size_t bytes) {
|
||||
return reinterpret_cast<FakeAllocator*>(opaque)->Alloc(bytes);
|
||||
}
|
||||
static void StaticFree(void* opaque, void* memory) {
|
||||
return reinterpret_cast<FakeAllocator*>(opaque)->Free(memory);
|
||||
}
|
||||
|
||||
// Returns the number of pending allocations to be freed.
|
||||
size_t PendingAllocs() { return allocs_.size(); }
|
||||
|
||||
private:
|
||||
void* Alloc(size_t bytes) {
|
||||
void* ret = malloc(bytes);
|
||||
allocs_.insert(ret);
|
||||
return ret;
|
||||
}
|
||||
void Free(void* memory) {
|
||||
if (!memory) return;
|
||||
EXPECT_NE(allocs_.end(), allocs_.find(memory));
|
||||
free(memory);
|
||||
allocs_.erase(memory);
|
||||
}
|
||||
|
||||
std::set<void*> allocs_;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
namespace hwy {
|
||||
|
||||
class AlignedAllocatorTest : public testing::Test {};
|
||||
|
||||
TEST(AlignedAllocatorTest, FreeNullptr) {
  // Freeing a null pointer must always be accepted.
  const void* const nothing = nullptr;
  FreeAlignedBytes(nothing, /*free_ptr=*/nullptr, /*opaque_ptr=*/nullptr);
}
|
||||
|
||||
TEST(AlignedAllocatorTest, AllocDefaultPointers) {
  const size_t kSize = 7777;
  void* ptr = AllocateAlignedBytes(kSize, /*alloc_ptr=*/nullptr,
                                   /*opaque_ptr=*/nullptr);
  ASSERT_NE(nullptr, ptr);
  // The returned memory must be aligned for the largest vector.
  EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr) % kMaxVectorSize);

  // Write to and read back every byte so the accesses cannot be optimized
  // away.
  char* const bytes = static_cast<char*>(ptr);
  size_t checksum = 0;
  for (size_t i = 0; i < kSize; ++i) {
    bytes[i] = static_cast<char>(i & 0x7F);
    if (i != 0) {
      checksum += bytes[i] * bytes[i - 1];
    }
  }
  EXPECT_NE(0U, checksum);

  FreeAlignedBytes(ptr, /*free_ptr=*/nullptr, /*opaque_ptr=*/nullptr);
}
|
||||
|
||||
TEST(AlignedAllocatorTest, EmptyAlignedUniquePtr) {
  // Destroying empty single-object and array pointers must be harmless.
  using Object = SampleObject<32>;
  AlignedUniquePtr<Object> ptr(nullptr, AlignedDeleter());
  AlignedUniquePtr<Object[]> arr(nullptr, AlignedDeleter());
}
|
||||
|
||||
TEST(AlignedAllocatorTest, EmptyAlignedFreeUniquePtr) {
  // Destroying empty single-object and array pointers must be harmless.
  using Object = SampleObject<32>;
  AlignedFreeUniquePtr<Object> ptr(nullptr, AlignedFreer());
  AlignedFreeUniquePtr<Object[]> arr(nullptr, AlignedFreer());
}
|
||||
|
||||
TEST(AlignedAllocatorTest, CustomAlloc) {
  FakeAllocator fake_alloc;
  const size_t kSize = 7777;

  void* const ptr =
      AllocateAlignedBytes(kSize, &FakeAllocator::StaticAlloc, &fake_alloc);
  ASSERT_NE(nullptr, ptr);
  // Exactly one request must have reached the custom allocator.
  EXPECT_EQ(1U, fake_alloc.PendingAllocs());
  // The returned memory must be aligned for the largest vector.
  EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr) % kMaxVectorSize);

  // Freeing must hand the block back to the custom allocator.
  FreeAlignedBytes(ptr, &FakeAllocator::StaticFree, &fake_alloc);
  EXPECT_EQ(0U, fake_alloc.PendingAllocs());
}
|
||||
|
||||
TEST(AlignedAllocatorTest, MakeUniqueAlignedDefaultConstructor) {
  {
    using Object = SampleObject<24>;
    auto ptr = MakeUniqueAligned<Object>();
    // The default constructor stores 'a' in data_[0] and attaches no counter.
    EXPECT_EQ('a', ptr->data_[0]);
    EXPECT_EQ(nullptr, ptr->counter_);
  }
}
|
||||
|
||||
TEST(AlignedAllocatorTest, MakeUniqueAligned) {
  int counter = 0;
  {
    // Allocates the object, runs the counting constructor and returns a
    // unique_ptr owning it.
    using Object = SampleObject<24>;
    auto ptr = MakeUniqueAligned<Object>(&counter);
    EXPECT_EQ(1, counter);
    // The counting constructor stores 'b' in data_[0].
    EXPECT_EQ('b', ptr->data_[0]);
  }
  // Leaving the scope must have run the destructor.
  EXPECT_EQ(0, counter);
}
|
||||
|
||||
TEST(AlignedAllocatorTest, MakeUniqueAlignedArray) {
  int counter = 0;
  {
    // Allocates the array and runs the counting constructor on every element.
    using Object = SampleObject<24>;
    auto arr = MakeUniqueAlignedArray<Object>(7, &counter);
    EXPECT_EQ(7, counter);
    for (size_t i = 0; i < 7; ++i) {
      // The counting constructor stores 'b' in data_[0].
      EXPECT_EQ('b', arr[i].data_[0]) << "Where i = " << i;
    }
  }
  // Leaving the scope must have destroyed all seven elements.
  EXPECT_EQ(0, counter);
}
|
||||
|
||||
TEST(AlignedAllocatorTest, AllocSingleInt) {
  auto ptr = AllocateAligned<uint32_t>(1);
  ASSERT_NE(nullptr, ptr.get());
  EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr.get()) % kMaxVectorSize);
  // Release the memory right away to check that doing so does not crash.
  ptr.reset(nullptr);
  EXPECT_EQ(nullptr, ptr.get());
}
|
||||
|
||||
TEST(AlignedAllocatorTest, AllocMultipleInt) {
  const size_t kSize = 7777;
  auto ptr = AllocateAligned<uint32_t>(kSize);
  ASSERT_NE(nullptr, ptr.get());
  EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr.get()) % kMaxVectorSize);
  // ptr[i] goes through the operator[] of the std::unique_ptr specialization
  // chosen by AllocateAligned(); consecutive elements must be adjacent.
  EXPECT_EQ(&(ptr[0]) + 1, &(ptr[1]));

  // Write to and read back every element so the accesses cannot be optimized
  // away.
  size_t checksum = 0;
  for (size_t i = 0; i < kSize; ++i) {
    ptr[i] = static_cast<uint32_t>(i);
    if (i != 0) {
      checksum += ptr[i] * ptr[i - 1];
    }
  }
  EXPECT_NE(0U, checksum);
}
|
||||
|
||||
TEST(AlignedAllocatorTest, AllocateAlignedObjectWithoutDestructor) {
  int counter = 0;
  {
    // AllocateAligned only provides storage; no constructor is run.
    using Object = SampleObject<24>;
    auto obj = AllocateAligned<Object>(1);
    obj[0].counter_ = &counter;
  }
  // Had the destructor of SampleObject<24> run when the unique_ptr was
  // destroyed, the counter would have been decremented.
  EXPECT_EQ(0, counter);
}
|
||||
|
||||
TEST(AlignedAllocatorTest, MakeUniqueAlignedArrayWithCustomAlloc) {
  FakeAllocator fake_alloc;
  int counter = 0;
  {
    // Allocates the array through the fake allocator and runs the counting
    // constructor on every element.
    using Object = SampleObject<24>;
    auto arr = MakeUniqueAlignedArrayWithAlloc<Object>(
        7, FakeAllocator::StaticAlloc, FakeAllocator::StaticFree, &fake_alloc,
        &counter);
    // An array should still only result in a single allocation.
    EXPECT_EQ(1u, fake_alloc.PendingAllocs());
    EXPECT_EQ(7, counter);
    for (size_t i = 0; i < 7; ++i) {
      // The counting constructor stores 'b' in data_[0].
      EXPECT_EQ('b', arr[i].data_[0]) << "Where i = " << i;
    }
  }
  // Leaving the scope must have destroyed all elements and returned the block.
  EXPECT_EQ(0, counter);
  EXPECT_EQ(0u, fake_alloc.PendingAllocs());
}
|
||||
|
||||
TEST(AlignedAllocatorTest, DefaultInit) {
  // The test is whether this compiles. Default-init is useful for output params
  // and per-thread storage.
  std::vector<AlignedUniquePtr<int[]>> ptrs;
  std::vector<AlignedFreeUniquePtr<double[]>> free_ptrs;
  ptrs.resize(128);
  free_ptrs.resize(128);

  // Fill randomly chosen slots so the pointers cannot be elided.
  std::mt19937 rng(129);  // Emscripten lacks random_device.
  std::uniform_int_distribution<size_t> dist(0, 127);
  const size_t slot1 = dist(rng);
  ptrs[slot1] = MakeUniqueAlignedArray<int>(123);
  const size_t slot2 = dist(rng);
  free_ptrs[slot2] = AllocateAligned<double>(456);

  // "Use" pointer without resorting to printf. 0 == 0. Can't shift by 64.
  const size_t probe1 = dist(rng);
  const auto addr1 = reinterpret_cast<uintptr_t>(ptrs[probe1].get());
  const size_t probe2 = dist(rng);
  const auto addr2 = reinterpret_cast<uintptr_t>(free_ptrs[probe2].get());
  constexpr size_t kBits = sizeof(uintptr_t) * 8;
  EXPECT_EQ((addr1 >> (kBits - 1)) >> (kBits - 1),
            (addr2 >> (kBits - 1)) >> (kBits - 1));
}
|
||||
|
||||
} // namespace hwy
|
|
@ -0,0 +1,635 @@
|
|||
// Copyright 2020 Google LLC
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef HIGHWAY_HWY_BASE_H_
|
||||
#define HIGHWAY_HWY_BASE_H_
|
||||
|
||||
// For SIMD module implementations and their callers, target-independent.
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include <atomic>
|
||||
#include <cfloat>
|
||||
|
||||
// HWY_IDE is 1 when the code is being parsed by one of the listed IDE
// front-ends and 0 otherwise. Add it to #if conditions to prevent the IDE
// from graying out code.
#if !defined(__CDT_PARSER__) && !defined(__INTELLISENSE__) && \
    !defined(Q_CREATOR_RUN) && !defined(__CLANGD__)
#define HWY_IDE 0
#else
#define HWY_IDE 1
#endif
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Detect compiler using predefined macros
|
||||
|
||||
// Each HWY_COMPILER_* macro is 0 when that compiler is not in use and a
// nonzero version number otherwise.

// clang-cl defines _MSC_VER but does not behave like MSVC in other respects,
// such as those used in HWY_DIAGNOSTICS(), so Clang is explicitly excluded.
#if !defined(_MSC_VER) || defined(__clang__)
#define HWY_COMPILER_MSVC 0
#else
#define HWY_COMPILER_MSVC _MSC_VER
#endif

#if !defined(__INTEL_COMPILER)
#define HWY_COMPILER_ICC 0
#else
#define HWY_COMPILER_ICC __INTEL_COMPILER
#endif

// Encoded as major * 100 + minor.
#if !defined(__GNUC__)
#define HWY_COMPILER_GCC 0
#else
#define HWY_COMPILER_GCC (__GNUC__ * 100 + __GNUC_MINOR__)
#endif
|
||||
|
||||
// HWY_COMPILER_CLANG is 0 for non-Clang compilers, otherwise the Clang version
// encoded as major * 100 + minor. Clang can masquerade as MSVC/GCC, in which
// case both macros are set.
#if !defined(__clang__)
#define HWY_COMPILER_CLANG 0
#elif !defined(__APPLE__)
// Non-Apple: normal version.
#define HWY_COMPILER_CLANG (__clang_major__ * 100 + __clang_minor__)
#else
// Apple LLVM version is unrelated to the actual Clang version, which we need
// for enabling workarounds. Use the presence of warning flags to deduce it.
// Adapted from https://github.com/simd-everywhere/simde/ simde-detect-clang.h.
#if __has_warning("-Wformat-insufficient-args")
#define HWY_COMPILER_CLANG 1200
#elif __has_warning("-Wimplicit-const-int-float-conversion")
#define HWY_COMPILER_CLANG 1100
#elif __has_warning("-Wmisleading-indentation")
#define HWY_COMPILER_CLANG 1000
#elif defined(__FILE_NAME__)
#define HWY_COMPILER_CLANG 900
#elif __has_warning("-Wextra-semi-stmt") || \
    __has_builtin(__builtin_rotateleft32)
#define HWY_COMPILER_CLANG 800
#elif __has_warning("-Wc++98-compat-extra-semi")
#define HWY_COMPILER_CLANG 700
#else  // Anything older than 7.0 is not recommended for Highway.
#define HWY_COMPILER_CLANG 600
#endif  // __has_warning chain
#endif  // __clang__ / __APPLE__
|
||||
|
||||
// More than one may be nonzero, but we want at least one.
|
||||
#if !HWY_COMPILER_MSVC && !HWY_COMPILER_ICC && !HWY_COMPILER_GCC && \
|
||||
!HWY_COMPILER_CLANG
|
||||
#error "Unsupported compiler"
|
||||
#endif
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Compiler-specific definitions
|
||||
|
||||
// HWY_STR(x) yields the macro-expanded value of `x` as a string literal.
// The extra level of indirection through HWY_STR_IMPL is what causes `x` to
// be expanded before it is stringified.
#define HWY_STR_IMPL(tokens) #tokens
#define HWY_STR(tokens) HWY_STR_IMPL(tokens)
|
||||
|
||||
// Portable spellings of compiler-specific keywords, attributes and pragmas.
#if !HWY_COMPILER_MSVC

// Function attributes.
#define HWY_INLINE inline __attribute__((always_inline))
#define HWY_NOINLINE __attribute__((noinline))
#define HWY_FLATTEN __attribute__((flatten))
#define HWY_NORETURN __attribute__((noreturn))
#define HWY_MUST_USE_RESULT __attribute__((warn_unused_result))
// Encountered "attribute list cannot appear here" when using the C++17
// [[maybe_unused]], so only use the old style attribute for now.
#define HWY_MAYBE_UNUSED __attribute__((unused))

// Pointer qualifier and branch hints.
#define HWY_RESTRICT __restrict__
#define HWY_LIKELY(expr) __builtin_expect(!!(expr), 1)
#define HWY_UNLIKELY(expr) __builtin_expect(!!(expr), 0)

// Pragmas; HWY_DIAGNOSTICS_OFF takes both spellings and keeps the GCC one.
#define HWY_PRAGMA(tokens) _Pragma(#tokens)
#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(GCC diagnostic tokens)
#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(gcc)

#else  // HWY_COMPILER_MSVC

#include <intrin.h>

// Function attributes; HWY_FLATTEN and HWY_MAYBE_UNUSED expand to nothing.
#define HWY_INLINE __forceinline
#define HWY_NOINLINE __declspec(noinline)
#define HWY_FLATTEN
#define HWY_NORETURN __declspec(noreturn)
#define HWY_MAYBE_UNUSED
#if (_MSC_VER >= 1700)
#define HWY_MUST_USE_RESULT _Check_return_
#else
#define HWY_MUST_USE_RESULT
#endif

// Pointer qualifier; the branch hints simply evaluate the expression.
#define HWY_RESTRICT __restrict
#define HWY_LIKELY(expr) (expr)
#define HWY_UNLIKELY(expr) (expr)

// Pragmas; HWY_DIAGNOSTICS_OFF takes both spellings and keeps the MSVC one.
#define HWY_PRAGMA(tokens) __pragma(tokens)
#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(warning(tokens))
#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(msc)

#define HWY_HAS_ASSUME_ALIGNED 0

#endif  // !HWY_COMPILER_MSVC
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Builtin/attributes
|
||||
|
||||
#ifdef __has_builtin
|
||||
#define HWY_HAS_BUILTIN(name) __has_builtin(name)
|
||||
#else
|
||||
#define HWY_HAS_BUILTIN(name) 0
|
||||
#endif
|
||||
|
||||
#ifdef __has_attribute
|
||||
#define HWY_HAS_ATTRIBUTE(name) __has_attribute(name)
|
||||
#else
|
||||
#define HWY_HAS_ATTRIBUTE(name) 0
|
||||
#endif
|
||||
|
||||
// Enables error-checking of format strings.
|
||||
#if HWY_HAS_ATTRIBUTE(__format__)
|
||||
#define HWY_FORMAT(idx_fmt, idx_arg) \
|
||||
__attribute__((__format__(__printf__, idx_fmt, idx_arg)))
|
||||
#else
|
||||
#define HWY_FORMAT(idx_fmt, idx_arg)
|
||||
#endif
|
||||
|
||||
// Returns a void* pointer which the compiler then assumes is N-byte aligned.
|
||||
// Example: float* HWY_RESTRICT aligned = (float*)HWY_ASSUME_ALIGNED(in, 32);
|
||||
//
|
||||
// The assignment semantics are required by GCC/Clang. ICC provides an in-place
|
||||
// __assume_aligned, whereas MSVC's __assume appears unsuitable.
|
||||
#if HWY_HAS_BUILTIN(__builtin_assume_aligned)
|
||||
#define HWY_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align))
|
||||
#else
|
||||
#define HWY_ASSUME_ALIGNED(ptr, align) (ptr) /* not supported */
|
||||
#endif
|
||||
|
||||
// Clang and GCC require attributes on each function into which SIMD intrinsics
|
||||
// are inlined. Support both per-function annotation (HWY_ATTR) for lambdas and
|
||||
// automatic annotation via pragmas.
|
||||
#if HWY_COMPILER_CLANG
|
||||
#define HWY_PUSH_ATTRIBUTES(targets_str) \
|
||||
HWY_PRAGMA(clang attribute push(__attribute__((target(targets_str))), \
|
||||
apply_to = function))
|
||||
#define HWY_POP_ATTRIBUTES HWY_PRAGMA(clang attribute pop)
|
||||
#elif HWY_COMPILER_GCC
|
||||
#define HWY_PUSH_ATTRIBUTES(targets_str) \
|
||||
HWY_PRAGMA(GCC push_options) HWY_PRAGMA(GCC target targets_str)
|
||||
#define HWY_POP_ATTRIBUTES HWY_PRAGMA(GCC pop_options)
|
||||
#else
|
||||
#define HWY_PUSH_ATTRIBUTES(targets_str)
|
||||
#define HWY_POP_ATTRIBUTES
|
||||
#endif
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Detect architecture using predefined macros
|
||||
|
||||
#if defined(__i386__) || defined(_M_IX86)
|
||||
#define HWY_ARCH_X86_32 1
|
||||
#else
|
||||
#define HWY_ARCH_X86_32 0
|
||||
#endif
|
||||
|
||||
#if defined(__x86_64__) || defined(_M_X64)
|
||||
#define HWY_ARCH_X86_64 1
|
||||
#else
|
||||
#define HWY_ARCH_X86_64 0
|
||||
#endif
|
||||
|
||||
#if HWY_ARCH_X86_32 || HWY_ARCH_X86_64
|
||||
#define HWY_ARCH_X86 1
|
||||
#else
|
||||
#define HWY_ARCH_X86 0
|
||||
#endif
|
||||
|
||||
#if defined(__powerpc64__) || defined(_M_PPC)
|
||||
#define HWY_ARCH_PPC 1
|
||||
#else
|
||||
#define HWY_ARCH_PPC 0
|
||||
#endif
|
||||
|
||||
#if defined(__ARM_ARCH_ISA_A64) || defined(__aarch64__) || defined(_M_ARM64)
|
||||
#define HWY_ARCH_ARM_A64 1
|
||||
#else
|
||||
#define HWY_ARCH_ARM_A64 0
|
||||
#endif
|
||||
|
||||
#if defined(__arm__) || defined(_M_ARM)
|
||||
#define HWY_ARCH_ARM_V7 1
|
||||
#else
|
||||
#define HWY_ARCH_ARM_V7 0
|
||||
#endif
|
||||
|
||||
#if HWY_ARCH_ARM_A64 && HWY_ARCH_ARM_V7
|
||||
#error "Cannot have both A64 and V7"
|
||||
#endif
|
||||
|
||||
#if HWY_ARCH_ARM_A64 || HWY_ARCH_ARM_V7
|
||||
#define HWY_ARCH_ARM 1
|
||||
#else
|
||||
#define HWY_ARCH_ARM 0
|
||||
#endif
|
||||
|
||||
#if defined(__EMSCRIPTEN__) || defined(__wasm__) || defined(__WASM__)
|
||||
#define HWY_ARCH_WASM 1
|
||||
#else
|
||||
#define HWY_ARCH_WASM 0
|
||||
#endif
|
||||
|
||||
#ifdef __riscv
|
||||
#define HWY_ARCH_RVV 1
|
||||
#else
|
||||
#define HWY_ARCH_RVV 0
|
||||
#endif
|
||||
|
||||
#if (HWY_ARCH_X86 + HWY_ARCH_PPC + HWY_ARCH_ARM + HWY_ARCH_WASM + \
|
||||
HWY_ARCH_RVV) != 1
|
||||
#error "Must detect exactly one platform"
|
||||
#endif
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Macros
|
||||
|
||||
#define HWY_API static HWY_INLINE HWY_FLATTEN HWY_MAYBE_UNUSED
|
||||
|
||||
#define HWY_CONCAT_IMPL(a, b) a##b
|
||||
#define HWY_CONCAT(a, b) HWY_CONCAT_IMPL(a, b)
|
||||
|
||||
#define HWY_MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
#define HWY_MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
|
||||
// Compile-time fence to prevent undesirable code reordering. On Clang x86, the
|
||||
// typical asm volatile("" : : : "memory") has no effect, whereas atomic fence
|
||||
// does, without generating code.
|
||||
#if HWY_ARCH_X86
|
||||
#define HWY_FENCE std::atomic_thread_fence(std::memory_order_acq_rel)
|
||||
#else
|
||||
// TODO(janwas): investigate alternatives. On ARM, the above generates barriers.
|
||||
#define HWY_FENCE
|
||||
#endif
|
||||
|
||||
// 4 instances of a given literal value, useful as input to LoadDup128.
|
||||
#define HWY_REP4(literal) literal, literal, literal, literal
|
||||
|
||||
#define HWY_ABORT(format, ...) \
|
||||
::hwy::Abort(__FILE__, __LINE__, format, ##__VA_ARGS__)
|
||||
|
||||
// Always enabled.
|
||||
#define HWY_ASSERT(condition) \
|
||||
do { \
|
||||
if (!(condition)) { \
|
||||
HWY_ABORT("Assert %s", #condition); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
// Only for "debug" builds
|
||||
#if !defined(NDEBUG) || defined(ADDRESS_SANITIZER) || \
|
||||
defined(MEMORY_SANITIZER) || defined(THREAD_SANITIZER)
|
||||
#define HWY_DASSERT(condition) HWY_ASSERT(condition)
|
||||
#else
|
||||
#define HWY_DASSERT(condition) \
|
||||
do { \
|
||||
} while (0)
|
||||
#endif
|
||||
|
||||
|
||||
namespace hwy {
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Alignment
|
||||
|
||||
// Not guaranteed to be an upper bound, but the alignment established by
|
||||
// aligned_allocator is HWY_MAX(HWY_ALIGNMENT, kMaxVectorSize).
|
||||
#if HWY_ARCH_X86
|
||||
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 64; // AVX-512
|
||||
#define HWY_ALIGN_MAX alignas(64)
|
||||
#elif HWY_ARCH_RVV
|
||||
// Not actually an upper bound on the size, but this value prevents crossing a
|
||||
// 4K boundary (relevant on Andes).
|
||||
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 4096;
|
||||
#define HWY_ALIGN_MAX alignas(8) // only elements need be aligned
|
||||
#else
|
||||
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
|
||||
#define HWY_ALIGN_MAX alignas(16)
|
||||
#endif
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Lane types
|
||||
|
||||
// Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
|
||||
// by concatenating base type and bits.
|
||||
|
||||
// RVV already has a builtin type and the GCC intrinsics require it.
|
||||
#if HWY_ARCH_RVV && HWY_COMPILER_GCC
|
||||
using float16_t = __fp16;
|
||||
// Clang does not allow __fp16 arguments, but scalar.h requires LaneType
|
||||
// arguments, so use a wrapper.
|
||||
// TODO(janwas): replace with _Float16 when that is supported?
|
||||
#else
|
||||
#pragma pack(push, 1)
|
||||
struct float16_t {
|
||||
uint16_t bits;
|
||||
};
|
||||
#pragma pack(pop)
|
||||
#endif
|
||||
|
||||
using float32_t = float;
|
||||
using float64_t = double;
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Controlling overload resolution (SFINAE)
|
||||
|
||||
// Minimal SFINAE helper: the nested `type` alias (equal to T) is available
// whenever Condition is true...
template <bool Condition, class T>
struct EnableIfT {
  using type = T;
};
// ...and is absent when Condition is false, so naming `type` becomes a
// substitution failure that removes the candidate from overload resolution.
template <class T>
struct EnableIfT<false, T> {};

// Shorthand for the nested type; T defaults to void.
template <bool Condition, class T = void>
using EnableIf = typename EnableIfT<Condition, T>::type;
|
||||
|
||||
// Insert into template/function arguments to enable this overload only for
|
||||
// vectors of AT MOST this many bits.
|
||||
//
|
||||
// Note that enabling for exactly 128 bits is unnecessary because a function can
|
||||
// simply be overloaded with Vec128<T> and Full128<T> descriptor. Enabling for
|
||||
// other sizes (e.g. 64 bit) can be achieved with Simd<T, 8 / sizeof(T)>.
|
||||
// Each macro expands to an unnamed, defaulted pointer parameter
// (`hwy::EnableIf<...>* = nullptr`) that only exists when its condition holds.
// All helper names are fully qualified with hwy:: so the macros also work when
// expanded outside namespace hwy, and macro arguments are parenthesized so
// that expressions can be passed safely.

// Total vector size (N lanes of T) is at most 16 / 8 / 4 bytes.
#define HWY_IF_LE128(T, N) hwy::EnableIf<(N) * sizeof(T) <= 16>* = nullptr
#define HWY_IF_LE64(T, N) hwy::EnableIf<(N) * sizeof(T) <= 8>* = nullptr
#define HWY_IF_LE32(T, N) hwy::EnableIf<(N) * sizeof(T) <= 4>* = nullptr

// Lane type category. IsSigned is also true for floating-point types, hence
// HWY_IF_SIGNED additionally excludes them.
#define HWY_IF_UNSIGNED(T) hwy::EnableIf<!hwy::IsSigned<T>()>* = nullptr
#define HWY_IF_SIGNED(T) \
  hwy::EnableIf<hwy::IsSigned<T>() && !hwy::IsFloat<T>()>* = nullptr
#define HWY_IF_FLOAT(T) hwy::EnableIf<hwy::IsFloat<T>()>* = nullptr
#define HWY_IF_NOT_FLOAT(T) hwy::EnableIf<!hwy::IsFloat<T>()>* = nullptr

// Lane size in bytes matches / does not match `bytes`.
#define HWY_IF_LANE_SIZE(T, bytes) \
  hwy::EnableIf<sizeof(T) == (bytes)>* = nullptr
#define HWY_IF_NOT_LANE_SIZE(T, bytes) \
  hwy::EnableIf<sizeof(T) != (bytes)>* = nullptr
|
||||
|
||||
// Empty struct used as a size tag type.
|
||||
template <size_t N>
|
||||
struct SizeTag {};
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Type traits
|
||||
|
||||
// True if T keeps a fractional part: converting 1.25 to an integer type
// truncates it to 1, whereas a floating-point type preserves the difference.
template <typename T>
constexpr bool IsFloat() {
  return !(T(1) == T(1.25));
}
|
||||
|
||||
// True if T can hold negative values. For unsigned types, -1 wraps around to
// the largest value and therefore is not below zero. Floating-point types
// also report true.
template <typename T>
constexpr bool IsSigned() {
  return T(-1) < T(0);
}
|
||||
|
||||
// Largest/smallest representable integer values.
|
||||
template <typename T>
|
||||
constexpr T LimitsMax() {
|
||||
static_assert(!IsFloat<T>(), "Only for integer types");
|
||||
return IsSigned<T>() ? T((1ULL << (sizeof(T) * 8 - 1)) - 1)
|
||||
: static_cast<T>(~0ull);
|
||||
}
|
||||
template <typename T>
|
||||
constexpr T LimitsMin() {
|
||||
static_assert(!IsFloat<T>(), "Only for integer types");
|
||||
return IsSigned<T>() ? T(-1) - LimitsMax<T>() : T(0);
|
||||
}
|
||||
|
||||
// Largest/smallest representable value (integer or float). This naming avoids
|
||||
// confusion with numeric_limits<float>::min() (the smallest positive value).
|
||||
template <typename T>
|
||||
constexpr T LowestValue() {
|
||||
return LimitsMin<T>();
|
||||
}
|
||||
template <>
|
||||
constexpr float LowestValue<float>() {
|
||||
return -FLT_MAX;
|
||||
}
|
||||
template <>
|
||||
constexpr double LowestValue<double>() {
|
||||
return -DBL_MAX;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
constexpr T HighestValue() {
|
||||
return LimitsMax<T>();
|
||||
}
|
||||
template <>
|
||||
constexpr float HighestValue<float>() {
|
||||
return FLT_MAX;
|
||||
}
|
||||
template <>
|
||||
constexpr double HighestValue<double>() {
|
||||
return DBL_MAX;
|
||||
}
|
||||
|
||||
// Returns bitmask of the exponent field in IEEE binary32/64.
|
||||
// Primary template: intentionally unusable. Only the uint32_t/uint64_t
// specializations below may be instantiated.
template <typename T>
constexpr T ExponentMask() {
  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
  return 0;
}
// binary32: eight exponent bits located above the 23 mantissa bits.
template <>
constexpr uint32_t ExponentMask<uint32_t>() {
  return uint32_t{0xFF} << 23;
}
// binary64: eleven exponent bits located above the 52 mantissa bits.
template <>
constexpr uint64_t ExponentMask<uint64_t>() {
  return uint64_t{0x7FF} << 52;
}
|
||||
|
||||
// Returns 1 << mantissa_bits as a floating-point number. All integers whose
|
||||
// absolute value are less than this can be represented exactly.
|
||||
// Primary template: intentionally unusable. Only the float/double
// specializations below may be instantiated.
template <typename T>
constexpr T MantissaEnd() {
  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
  return 0;
}
// float: 2^23 = 8388608, exactly representable.
template <>
constexpr float MantissaEnd<float>() {
  return static_cast<float>(1u << 23);
}
// double: 2^52 = 4503599627370496, exactly representable. Built from an
// integer shift because hexadecimal floating-point literals (p52) require
// C++17.
template <>
constexpr double MantissaEnd<double>() {
  return static_cast<double>(1ULL << 52);
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Type relations
|
||||
|
||||
namespace detail {
|
||||
|
||||
template <typename T>
|
||||
struct Relations;
|
||||
template <>
|
||||
struct Relations<uint8_t> {
|
||||
using Unsigned = uint8_t;
|
||||
using Signed = int8_t;
|
||||
using Wide = uint16_t;
|
||||
};
|
||||
template <>
|
||||
struct Relations<int8_t> {
|
||||
using Unsigned = uint8_t;
|
||||
using Signed = int8_t;
|
||||
using Wide = int16_t;
|
||||
};
|
||||
template <>
|
||||
struct Relations<uint16_t> {
|
||||
using Unsigned = uint16_t;
|
||||
using Signed = int16_t;
|
||||
using Wide = uint32_t;
|
||||
using Narrow = uint8_t;
|
||||
};
|
||||
template <>
|
||||
struct Relations<int16_t> {
|
||||
using Unsigned = uint16_t;
|
||||
using Signed = int16_t;
|
||||
using Wide = int32_t;
|
||||
using Narrow = int8_t;
|
||||
};
|
||||
template <>
|
||||
struct Relations<uint32_t> {
|
||||
using Unsigned = uint32_t;
|
||||
using Signed = int32_t;
|
||||
using Float = float;
|
||||
using Wide = uint64_t;
|
||||
using Narrow = uint16_t;
|
||||
};
|
||||
template <>
|
||||
struct Relations<int32_t> {
|
||||
using Unsigned = uint32_t;
|
||||
using Signed = int32_t;
|
||||
using Float = float;
|
||||
using Wide = int64_t;
|
||||
using Narrow = int16_t;
|
||||
};
|
||||
template <>
|
||||
struct Relations<uint64_t> {
|
||||
using Unsigned = uint64_t;
|
||||
using Signed = int64_t;
|
||||
using Float = double;
|
||||
using Narrow = uint32_t;
|
||||
};
|
||||
template <>
|
||||
struct Relations<int64_t> {
|
||||
using Unsigned = uint64_t;
|
||||
using Signed = int64_t;
|
||||
using Float = double;
|
||||
using Narrow = int32_t;
|
||||
};
|
||||
template <>
|
||||
struct Relations<float16_t> {
|
||||
using Unsigned = uint16_t;
|
||||
using Signed = int16_t;
|
||||
using Float = float16_t;
|
||||
using Wide = float;
|
||||
};
|
||||
template <>
|
||||
struct Relations<float> {
|
||||
using Unsigned = uint32_t;
|
||||
using Signed = int32_t;
|
||||
using Float = float;
|
||||
using Wide = double;
|
||||
};
|
||||
template <>
|
||||
struct Relations<double> {
|
||||
using Unsigned = uint64_t;
|
||||
using Signed = int64_t;
|
||||
using Float = double;
|
||||
using Narrow = float;
|
||||
};
|
||||
|
||||
} // namespace detail
|
||||
|
||||
// Aliases for types of a different category, but the same size.
|
||||
template <typename T>
|
||||
using MakeUnsigned = typename detail::Relations<T>::Unsigned;
|
||||
template <typename T>
|
||||
using MakeSigned = typename detail::Relations<T>::Signed;
|
||||
template <typename T>
|
||||
using MakeFloat = typename detail::Relations<T>::Float;
|
||||
|
||||
// Aliases for types of the same category, but different size.
|
||||
template <typename T>
|
||||
using MakeWide = typename detail::Relations<T>::Wide;
|
||||
template <typename T>
|
||||
using MakeNarrow = typename detail::Relations<T>::Narrow;
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Helper functions
|
||||
|
||||
// Integer division that rounds up instead of down, implemented by adding
// b - 1 to the dividend first. The result has the type of `a`. Intended for
// non-negative `a` and positive `b`; note the intermediate sum can overflow
// when `a` is close to the maximum of its type.
template <typename T1, typename T2>
inline constexpr T1 DivCeil(T1 a, T2 b) {
  return ((a + b) - 1) / b;
}
|
||||
|
||||
// Works for any `align`; if a power of two, compiler emits ADD+AND.
|
||||
// Returns the smallest multiple of `align` that is >= `what`: round-up
// division by `align`, then multiplication by `align`.
inline constexpr size_t RoundUpTo(size_t what, size_t align) {
  return ((what + align - 1) / align) * align;
}
|
||||
|
||||
// Undefined results for x == 0.
|
||||
// Returns the number of zero bits below the least-significant 1-bit of x,
// i.e. the index of the lowest set bit. The caller must pass a nonzero x.
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
#if !HWY_COMPILER_MSVC
  return static_cast<size_t>(__builtin_ctz(x));
#else   // HWY_COMPILER_MSVC
  // The intrinsic writes the bit index through an `unsigned long*`.
  unsigned long lowest_set_bit;  // NOLINT
  _BitScanForward(&lowest_set_bit, x);
  return lowest_set_bit;
#endif  // !HWY_COMPILER_MSVC
}
|
||||
|
||||
// Returns the number of 1-bits in the 64-bit value x (0..64).
HWY_API size_t PopCount(uint64_t x) {
#if HWY_COMPILER_CLANG || HWY_COMPILER_GCC
  return static_cast<size_t>(__builtin_popcountll(x));
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
  return _mm_popcnt_u64(x);
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32
  // No 64-bit popcnt on 32-bit x86: count each half separately.
  return _mm_popcnt_u32(uint32_t(x)) + _mm_popcnt_u32(uint32_t(x >> 32));
#else
  // Portable fallback (also used by MSVC on non-x86 targets, where the
  // _mm_popcnt_* intrinsics are unavailable). The masks must span all 64 bits;
  // 32-bit masks would silently discard the upper half of x.
  x -= ((x >> 1) & 0x5555555555555555ULL);  // counts per 2-bit field
  x = (((x >> 2) & 0x3333333333333333ULL) +
       (x & 0x3333333333333333ULL));               // counts per 4-bit field
  x = (((x >> 4) + x) & 0x0F0F0F0F0F0F0F0FULL);  // counts per byte (0..8)
  // Accumulate the eight byte counts into the lowest byte. The total is at
  // most 64, so it cannot carry out of that byte.
  x += (x >> 8);
  x += (x >> 16);
  x += (x >> 32);
  return static_cast<size_t>(x & 0x7Fu);
#endif
}
|
||||
|
||||
// The source/destination must not overlap/alias.
|
||||
template <size_t kBytes, typename From, typename To>
|
||||
HWY_API void CopyBytes(const From* from, To* to) {
|
||||
#if HWY_COMPILER_MSVC
|
||||
const uint8_t* HWY_RESTRICT from_bytes =
|
||||
reinterpret_cast<const uint8_t*>(from);
|
||||
uint8_t* HWY_RESTRICT to_bytes = reinterpret_cast<uint8_t*>(to);
|
||||
for (size_t i = 0; i < kBytes; ++i) {
|
||||
to_bytes[i] = from_bytes[i];
|
||||
}
|
||||
#else
|
||||
// Avoids horrible codegen on Clang (series of PINSRB)
|
||||
__builtin_memcpy(to, from, kBytes);
|
||||
#endif
|
||||
}
|
||||
|
||||
HWY_NORETURN void HWY_FORMAT(3, 4)
|
||||
Abort(const char* file, int line, const char* format, ...);
|
||||
|
||||
} // namespace hwy
|
||||
|
||||
#endif // HIGHWAY_HWY_BASE_H_
|
|
@ -0,0 +1,123 @@
|
|||
// Copyright 2019 Google LLC
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include <limits>
|
||||
|
||||
#include "hwy/base.h"
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "base_test.cc"
|
||||
#include "hwy/foreach_target.h"
|
||||
#include "hwy/highway.h"
|
||||
#include "hwy/tests/test_util-inl.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
HWY_NOINLINE void TestAllLimits() {
|
||||
HWY_ASSERT_EQ(uint8_t(0), LimitsMin<uint8_t>());
|
||||
HWY_ASSERT_EQ(uint16_t(0), LimitsMin<uint16_t>());
|
||||
HWY_ASSERT_EQ(uint32_t(0), LimitsMin<uint32_t>());
|
||||
HWY_ASSERT_EQ(uint64_t(0), LimitsMin<uint64_t>());
|
||||
|
||||
HWY_ASSERT_EQ(int8_t(-128), LimitsMin<int8_t>());
|
||||
HWY_ASSERT_EQ(int16_t(-32768), LimitsMin<int16_t>());
|
||||
HWY_ASSERT_EQ(int32_t(0x80000000u), LimitsMin<int32_t>());
|
||||
HWY_ASSERT_EQ(int64_t(0x8000000000000000ull), LimitsMin<int64_t>());
|
||||
|
||||
HWY_ASSERT_EQ(uint8_t(0xFF), LimitsMax<uint8_t>());
|
||||
HWY_ASSERT_EQ(uint16_t(0xFFFF), LimitsMax<uint16_t>());
|
||||
HWY_ASSERT_EQ(uint32_t(0xFFFFFFFFu), LimitsMax<uint32_t>());
|
||||
HWY_ASSERT_EQ(uint64_t(0xFFFFFFFFFFFFFFFFull), LimitsMax<uint64_t>());
|
||||
|
||||
HWY_ASSERT_EQ(int8_t(0x7F), LimitsMax<int8_t>());
|
||||
HWY_ASSERT_EQ(int16_t(0x7FFF), LimitsMax<int16_t>());
|
||||
HWY_ASSERT_EQ(int32_t(0x7FFFFFFFu), LimitsMax<int32_t>());
|
||||
HWY_ASSERT_EQ(int64_t(0x7FFFFFFFFFFFFFFFull), LimitsMax<int64_t>());
|
||||
}
|
||||
|
||||
struct TestLowestHighest {
|
||||
template <class T>
|
||||
HWY_NOINLINE void operator()(T /*unused*/) const {
|
||||
HWY_ASSERT_EQ(std::numeric_limits<T>::lowest(), LowestValue<T>());
|
||||
HWY_ASSERT_EQ(std::numeric_limits<T>::max(), HighestValue<T>());
|
||||
}
|
||||
};
|
||||
|
||||
HWY_NOINLINE void TestAllLowestHighest() { ForAllTypes(TestLowestHighest()); }
|
||||
struct TestIsUnsigned {
|
||||
template <class T>
|
||||
HWY_NOINLINE void operator()(T /*unused*/) const {
|
||||
static_assert(!IsFloat<T>(), "Expected !IsFloat");
|
||||
static_assert(!IsSigned<T>(), "Expected !IsSigned");
|
||||
}
|
||||
};
|
||||
|
||||
struct TestIsSigned {
|
||||
template <class T>
|
||||
HWY_NOINLINE void operator()(T /*unused*/) const {
|
||||
static_assert(!IsFloat<T>(), "Expected !IsFloat");
|
||||
static_assert(IsSigned<T>(), "Expected IsSigned");
|
||||
}
|
||||
};
|
||||
|
||||
struct TestIsFloat {
|
||||
template <class T>
|
||||
HWY_NOINLINE void operator()(T /*unused*/) const {
|
||||
static_assert(IsFloat<T>(), "Expected IsFloat");
|
||||
static_assert(IsSigned<T>(), "Floats are also considered signed");
|
||||
}
|
||||
};
|
||||
|
||||
HWY_NOINLINE void TestAllType() {
|
||||
ForUnsignedTypes(TestIsUnsigned());
|
||||
ForSignedTypes(TestIsSigned());
|
||||
ForFloatTypes(TestIsFloat());
|
||||
}
|
||||
|
||||
HWY_NOINLINE void TestAllPopCount() {
|
||||
HWY_ASSERT_EQ(size_t(0), PopCount(0u));
|
||||
HWY_ASSERT_EQ(size_t(1), PopCount(1u));
|
||||
HWY_ASSERT_EQ(size_t(1), PopCount(2u));
|
||||
HWY_ASSERT_EQ(size_t(2), PopCount(3u));
|
||||
HWY_ASSERT_EQ(size_t(1), PopCount(0x80000000u));
|
||||
HWY_ASSERT_EQ(size_t(31), PopCount(0x7FFFFFFFu));
|
||||
HWY_ASSERT_EQ(size_t(32), PopCount(0xFFFFFFFFu));
|
||||
|
||||
HWY_ASSERT_EQ(size_t(1), PopCount(0x80000000ull));
|
||||
HWY_ASSERT_EQ(size_t(31), PopCount(0x7FFFFFFFull));
|
||||
HWY_ASSERT_EQ(size_t(32), PopCount(0xFFFFFFFFull));
|
||||
HWY_ASSERT_EQ(size_t(33), PopCount(0x10FFFFFFFFull));
|
||||
HWY_ASSERT_EQ(size_t(63), PopCount(0xFFFEFFFFFFFFFFFFull));
|
||||
HWY_ASSERT_EQ(size_t(64), PopCount(0xFFFFFFFFFFFFFFFFull));
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
namespace hwy {
|
||||
HWY_BEFORE_TEST(BaseTest);
|
||||
HWY_EXPORT_AND_TEST_P(BaseTest, TestAllLimits);
|
||||
HWY_EXPORT_AND_TEST_P(BaseTest, TestAllLowestHighest);
|
||||
HWY_EXPORT_AND_TEST_P(BaseTest, TestAllType);
|
||||
HWY_EXPORT_AND_TEST_P(BaseTest, TestAllPopCount);
|
||||
} // namespace hwy
|
||||
#endif
|
|
@ -0,0 +1,88 @@
|
|||
// Copyright 2020 Google LLC
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef HIGHWAY_HWY_CACHE_CONTROL_H_
|
||||
#define HIGHWAY_HWY_CACHE_CONTROL_H_
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "hwy/base.h"
|
||||
|
||||
// Requires SSE2; fails to compile on 32-bit Clang 7 (see
|
||||
// https://github.com/gperftools/gperftools/issues/946).
|
||||
#if !defined(__SSE2__) || (HWY_COMPILER_CLANG && HWY_ARCH_X86_32)
|
||||
#undef HWY_DISABLE_CACHE_CONTROL
|
||||
#define HWY_DISABLE_CACHE_CONTROL
|
||||
#endif
|
||||
|
||||
// intrin.h is sufficient on MSVC and already included by base.h.
|
||||
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) && !HWY_COMPILER_MSVC
|
||||
#include <emmintrin.h> // SSE2
|
||||
#endif
|
||||
|
||||
namespace hwy {
|
||||
|
||||
// Even if N*sizeof(T) is smaller, Stream may write a multiple of this size.
|
||||
#define HWY_STREAM_MULTIPLE 16
|
||||
|
||||
// The following functions may also require an attribute.
|
||||
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) && !HWY_COMPILER_MSVC
|
||||
#define HWY_ATTR_CACHE __attribute__((target("sse2")))
|
||||
#else
|
||||
#define HWY_ATTR_CACHE
|
||||
#endif
|
||||
|
||||
// Delays subsequent loads until prior loads are visible. On Intel CPUs, also
|
||||
// serves as a full fence (waits for all prior instructions to complete).
|
||||
// No effect on non-x86.
|
||||
HWY_INLINE HWY_ATTR_CACHE void LoadFence() {
#if !HWY_ARCH_X86 || defined(HWY_DISABLE_CACHE_CONTROL)
  // Not x86, or cache control is disabled: nothing to do.
#else
  _mm_lfence();
#endif
}
|
||||
|
||||
// Ensures previous weakly-ordered stores are visible. No effect on non-x86.
|
||||
HWY_INLINE HWY_ATTR_CACHE void StoreFence() {
#if !HWY_ARCH_X86 || defined(HWY_DISABLE_CACHE_CONTROL)
  // Not x86, or cache control is disabled: nothing to do.
#else
  _mm_sfence();
#endif
}
|
||||
|
||||
// Begins loading the cache line containing "p".
|
||||
template <typename T>
|
||||
HWY_INLINE HWY_ATTR_CACHE void Prefetch(const T* p) {
|
||||
#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
|
||||
_mm_prefetch(reinterpret_cast<const char*>(p), _MM_HINT_T0);
|
||||
#elif HWY_COMPILER_GCC || HWY_COMPILER_CLANG
|
||||
// Hint=0 (NTA) behavior differs, but skipping outer caches is probably not
|
||||
// desirable, so use the default 3 (keep in caches).
|
||||
__builtin_prefetch(p, /*write=*/0, /*hint=*/3);
|
||||
#else
|
||||
(void)p;
|
||||
#endif
|
||||
}
|
||||
|
||||
// Invalidates and flushes the cache line containing "p". No effect on non-x86.
|
||||
HWY_INLINE HWY_ATTR_CACHE void FlushCacheline(const void* p) {
#if !HWY_ARCH_X86 || defined(HWY_DISABLE_CACHE_CONTROL)
  (void)p;  // nothing to do; silences the unused-parameter warning
#else
  _mm_clflush(p);
#endif
}
|
||||
|
||||
} // namespace hwy
|
||||
|
||||
#endif // HIGHWAY_HWY_CACHE_CONTROL_H_
|
|
@ -0,0 +1,243 @@
|
|||
// Copyright 2019 Google LLC
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "hwy/examples/benchmark.cc"
|
||||
#include "hwy/foreach_target.h"
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include <memory>
|
||||
#include <numeric> // iota
|
||||
|
||||
#include "hwy/aligned_allocator.h"
|
||||
#include "hwy/highway.h"
|
||||
#include "hwy/nanobenchmark.h"
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
// These templates are not found via ADL.
|
||||
#if HWY_TARGET != HWY_SCALAR
|
||||
using hwy::HWY_NAMESPACE::CombineShiftRightBytes;
|
||||
#endif
|
||||
|
||||
class TwoArray {
|
||||
public:
|
||||
// Passed to ctor as a value NOT known to the compiler. Must be a multiple of
|
||||
// the vector lane count * 8.
|
||||
static size_t NumItems() { return 3456; }
|
||||
|
||||
explicit TwoArray(const size_t num_items)
|
||||
: a_(AllocateAligned<float>(num_items * 2)), b_(a_.get() + num_items) {
|
||||
const float init = num_items / NumItems(); // 1, but compiler doesn't know
|
||||
std::iota(a_.get(), a_.get() + num_items, init);
|
||||
std::iota(b_, b_ + num_items, init);
|
||||
}
|
||||
|
||||
protected:
|
||||
AlignedFreeUniquePtr<float[]> a_;
|
||||
float* b_;
|
||||
};
|
||||
|
||||
// Measures durations, verifies results, prints timings.
|
||||
template <class Benchmark>
|
||||
void RunBenchmark(const char* caption) {
|
||||
printf("%10s: ", caption);
|
||||
const size_t kNumInputs = 1;
|
||||
const size_t num_items = Benchmark::NumItems() * Unpredictable1();
|
||||
const FuncInput inputs[kNumInputs] = {num_items};
|
||||
Result results[kNumInputs];
|
||||
|
||||
Benchmark benchmark(num_items);
|
||||
|
||||
Params p;
|
||||
p.verbose = false;
|
||||
p.max_evals = 7;
|
||||
p.target_rel_mad = 0.002;
|
||||
const size_t num_results = MeasureClosure(
|
||||
[&benchmark](const FuncInput input) { return benchmark(input); }, inputs,
|
||||
kNumInputs, results, p);
|
||||
if (num_results != kNumInputs) {
|
||||
fprintf(stderr, "MeasureClosure failed.\n");
|
||||
}
|
||||
|
||||
benchmark.Verify(num_items);
|
||||
|
||||
for (size_t i = 0; i < num_results; ++i) {
|
||||
const double cycles_per_item = results[i].ticks / results[i].input;
|
||||
const double mad = results[i].variability * cycles_per_item;
|
||||
printf("%6zu: %6.3f (+/- %5.3f)\n", results[i].input, cycles_per_item, mad);
|
||||
}
|
||||
}
|
||||
|
||||
void Intro() {
|
||||
HWY_ALIGN const float in[16] = {1, 2, 3, 4, 5, 6};
|
||||
HWY_ALIGN float out[16];
|
||||
HWY_FULL(float) d; // largest possible vector
|
||||
for (size_t i = 0; i < 16; i += Lanes(d)) {
|
||||
const auto vec = Load(d, in + i); // aligned!
|
||||
auto result = vec * vec;
|
||||
result += result; // can update if not const
|
||||
Store(result, d, out + i);
|
||||
}
|
||||
printf("\nF(x)->2*x^2, F(%.0f) = %.1f\n", in[2], out[2]);
|
||||
}
|
||||
|
||||
// BEGINNER: dot product
|
||||
// 0.4 cyc/float = bronze, 0.25 = silver, 0.15 = gold!
|
||||
class BenchmarkDot : public TwoArray {
|
||||
public:
|
||||
explicit BenchmarkDot(size_t num_items) : TwoArray(num_items), dot_{-1.0f} {}
|
||||
|
||||
FuncOutput operator()(const size_t num_items) {
|
||||
HWY_FULL(float) d;
|
||||
const size_t N = Lanes(d);
|
||||
using V = decltype(Zero(d));
|
||||
constexpr int unroll = 8;
|
||||
// Compiler doesn't make independent sum* accumulators, so unroll manually.
|
||||
// Some older compilers might not be able to fit the 8 arrays in registers,
|
||||
// so manual unrolling can be helpfull if you run into this issue.
|
||||
// 2 FMA ports * 4 cycle latency = 8x unrolled.
|
||||
V sum[unroll];
|
||||
for (int i = 0; i < unroll; ++i) {
|
||||
sum[i] = Zero(d);
|
||||
}
|
||||
const float* const HWY_RESTRICT pa = &a_[0];
|
||||
const float* const HWY_RESTRICT pb = b_;
|
||||
for (size_t i = 0; i < num_items; i += unroll * N) {
|
||||
for (int j = 0; j < unroll; ++j) {
|
||||
const auto a = Load(d, pa + i + j * N);
|
||||
const auto b = Load(d, pb + i + j * N);
|
||||
sum[j] = MulAdd(a, b, sum[j]);
|
||||
}
|
||||
}
|
||||
// Reduction tree: sum of all accumulators by pairs into sum[0], then the
|
||||
// lanes.
|
||||
for (int power = 1; power < unroll; power *= 2) {
|
||||
for (int i = 0; i < unroll; i += 2 * power) {
|
||||
sum[i] += sum[i + power];
|
||||
}
|
||||
}
|
||||
return dot_ = GetLane(SumOfLanes(sum[0]));
|
||||
}
|
||||
void Verify(size_t num_items) {
|
||||
if (dot_ == -1.0f) {
|
||||
fprintf(stderr, "Dot: must call Verify after benchmark");
|
||||
abort();
|
||||
}
|
||||
|
||||
const float expected =
|
||||
std::inner_product(a_.get(), a_.get() + num_items, b_, 0.0f);
|
||||
const float rel_err = std::abs(expected - dot_) / expected;
|
||||
if (rel_err > 1.1E-6f) {
|
||||
fprintf(stderr, "Dot: expected %e actual %e (%e)\n", expected, dot_,
|
||||
rel_err);
|
||||
abort();
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
float dot_; // for Verify
|
||||
};
|
||||
|
||||
// INTERMEDIATE: delta coding
|
||||
// 1.0 cycles/float = bronze, 0.7 = silver, 0.4 = gold!
|
||||
struct BenchmarkDelta : public TwoArray {
|
||||
explicit BenchmarkDelta(size_t num_items) : TwoArray(num_items) {}
|
||||
|
||||
FuncOutput operator()(const size_t num_items) const {
|
||||
#if HWY_TARGET == HWY_SCALAR
|
||||
b_[0] = a_[0];
|
||||
for (size_t i = 1; i < num_items; ++i) {
|
||||
b_[i] = a_[i] - a_[i - 1];
|
||||
}
|
||||
#elif HWY_CAP_GE256
|
||||
// Larger vectors are split into 128-bit blocks, easiest to use the
|
||||
// unaligned load support to shift between them.
|
||||
const HWY_FULL(float) df;
|
||||
const size_t N = Lanes(df);
|
||||
size_t i;
|
||||
b_[0] = a_[0];
|
||||
for (i = 1; i < N; ++i) {
|
||||
b_[i] = a_[i] - a_[i - 1];
|
||||
}
|
||||
for (; i < num_items; i += N) {
|
||||
const auto a = Load(df, &a_[i]);
|
||||
const auto shifted = LoadU(df, &a_[i - 1]);
|
||||
Store(a - shifted, df, &b_[i]);
|
||||
}
|
||||
#else // 128-bit
|
||||
// Slightly better than unaligned loads
|
||||
const HWY_CAPPED(float, 4) df;
|
||||
const size_t N = Lanes(df);
|
||||
size_t i;
|
||||
b_[0] = a_[0];
|
||||
for (i = 1; i < N; ++i) {
|
||||
b_[i] = a_[i] - a_[i - 1];
|
||||
}
|
||||
auto prev = Load(df, &a_[0]);
|
||||
for (; i < num_items; i += Lanes(df)) {
|
||||
const auto a = Load(df, &a_[i]);
|
||||
const auto shifted = CombineShiftRightLanes<3>(a, prev);
|
||||
prev = a;
|
||||
Store(a - shifted, df, &b_[i]);
|
||||
}
|
||||
#endif
|
||||
return b_[num_items - 1];
|
||||
}
|
||||
|
||||
void Verify(size_t num_items) {
|
||||
for (size_t i = 0; i < num_items; ++i) {
|
||||
const float expected = (i == 0) ? a_[0] : a_[i] - a_[i - 1];
|
||||
const float err = std::abs(expected - b_[i]);
|
||||
if (err > 1E-6f) {
|
||||
fprintf(stderr, "Delta: expected %e, actual %e\n", expected, b_[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
void RunBenchmarks() {
|
||||
Intro();
|
||||
printf("------------------------ %s\n", TargetName(HWY_TARGET));
|
||||
RunBenchmark<BenchmarkDot>("dot");
|
||||
RunBenchmark<BenchmarkDelta>("delta");
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
namespace hwy {
|
||||
HWY_EXPORT(RunBenchmarks);
|
||||
|
||||
void Run() {
|
||||
for (uint32_t target : SupportedAndGeneratedTargets()) {
|
||||
SetSupportedTargetsForTest(target);
|
||||
HWY_DYNAMIC_DISPATCH(RunBenchmarks)();
|
||||
}
|
||||
SetSupportedTargetsForTest(0); // Reset the mask afterwards.
|
||||
}
|
||||
|
||||
} // namespace hwy
|
||||
|
||||
// Program entry point: command-line arguments are ignored; runs the benchmarks
// for all targets and reports success.
int main(int, char**) {
  hwy::Run();
  return 0;
}
|
||||
#endif // HWY_ONCE
|
|
@ -0,0 +1,62 @@
|
|||
// Copyright 2020 Google LLC
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Demo of functions that might be called from multiple SIMD modules (either
|
||||
// other -inl.h files, or a .cc file between begin/end_target-inl). This is
|
||||
// optional - all SIMD code can reside in .cc files. However, this allows
|
||||
// splitting code into different files while still inlining instead of requiring
|
||||
// calling through function pointers.
|
||||
|
||||
// Include guard (still compiled once per target)
|
||||
#if defined(HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_) == defined(HWY_TARGET_TOGGLE)
|
||||
#ifdef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
|
||||
#undef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
|
||||
#else
|
||||
#define HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
|
||||
#endif
|
||||
|
||||
// It is fine to #include normal or *-inl headers.
|
||||
#include <stddef.h>
|
||||
|
||||
#include "hwy/highway.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace skeleton {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
using namespace hwy::HWY_NAMESPACE;
|
||||
|
||||
// Example of a type-agnostic (caller-specified lane type) and width-agnostic
|
||||
// (uses best available instruction set) function in a header.
|
||||
//
|
||||
// Computes x[i] = mul_array[i] * x_array[i] + add_array[i] for i < size.
|
||||
template <class D, typename T>
|
||||
HWY_MAYBE_UNUSED void MulAddLoop(const D d, const T* HWY_RESTRICT mul_array,
|
||||
const T* HWY_RESTRICT add_array,
|
||||
const size_t size, T* HWY_RESTRICT x_array) {
|
||||
for (size_t i = 0; i < size; i += Lanes(d)) {
|
||||
const auto mul = Load(d, mul_array + i);
|
||||
const auto add = Load(d, add_array + i);
|
||||
auto x = Load(d, x_array + i);
|
||||
x = MulAdd(mul, x, add);
|
||||
Store(x, d, x_array + i);
|
||||
}
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace skeleton
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#endif // include guard
|
|
@ -0,0 +1,103 @@
|
|||
// Copyright 2020 Google LLC
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "hwy/examples/skeleton.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
|
||||
// First undef to prevent error when re-included.
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
// For runtime dispatch, specify the name of the current file (unfortunately
|
||||
// __FILE__ is not reliable) so that foreach_target.h can re-include it.
|
||||
#define HWY_TARGET_INCLUDE "hwy/examples/skeleton.cc"
|
||||
// Generates code for each enabled target by re-including this source file.
|
||||
#include "hwy/foreach_target.h"
|
||||
|
||||
#include "hwy/highway.h"
|
||||
|
||||
// Optional, can instead add HWY_ATTR to all functions.
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace skeleton {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
// Highway ops reside here; ADL does not find templates nor builtins.
|
||||
using namespace hwy::HWY_NAMESPACE;
|
||||
|
||||
// Computes log2 by converting to a vector of floats. Compiled once per target.
|
||||
template <class DF>
|
||||
HWY_NOINLINE void OneFloorLog2(const DF df, const uint8_t* HWY_RESTRICT values,
|
||||
uint8_t* HWY_RESTRICT log2) {
|
||||
// Type tags for converting to other element types (Rebind = same count).
|
||||
const Rebind<int32_t, DF> d32;
|
||||
const Rebind<uint8_t, DF> d8;
|
||||
|
||||
const auto u8 = Load(d8, values);
|
||||
const auto bits = BitCast(d32, ConvertTo(df, PromoteTo(d32, u8)));
|
||||
const auto exponent = ShiftRight<23>(bits) - Set(d32, 127);
|
||||
Store(DemoteTo(d8, exponent), d8, log2);
|
||||
}
|
||||
|
||||
// Prints the name of the current target and whether HWY_CAP_INTEGER64 is set.
HWY_NOINLINE void CodepathDemo() {
  // Highway defaults to portability, but per-target codepaths may be selected
  // via #if HWY_TARGET == HWY_SSE4 or, as here, by testing capability macros.
  const char* int64_support =
#if HWY_CAP_INTEGER64
      "Has int64";
#else
      "No int64";
#endif
  printf("Target %s: %s\n", hwy::TargetName(HWY_TARGET), int64_support);
}
|
||||
|
||||
// Writes one output byte to `log2` per input byte in `values`, processing
// Lanes(df) elements per call to OneFloorLog2.
HWY_NOINLINE void FloorLog2(const uint8_t* HWY_RESTRICT values, size_t count,
                            uint8_t* HWY_RESTRICT log2) {
  CodepathDemo();

  // The second argument is necessary on RVV until it supports fractional
  // lengths.
  const HWY_FULL(float, 4) df;
  const size_t lanes = Lanes(df);

  // The caller padded memory to a multiple of Lanes(). If that is not
  // possible, blended or masked stores could be used instead, see README.md.
  size_t offset = 0;
  while (offset < count) {
    OneFloorLog2(df, values + offset, log2 + offset);
    offset += lanes;
  }
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace skeleton
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
|
||||
namespace skeleton {
|
||||
|
||||
// This macro declares a static array used for dynamic dispatch; it resides in
|
||||
// the same outer namespace that contains FloorLog2.
|
||||
HWY_EXPORT(FloorLog2);
|
||||
|
||||
// This function is optional and only needed in the case of exposing it in the
|
||||
// header file. Otherwise using HWY_DYNAMIC_DISPATCH(FloorLog2) in this module
|
||||
// is equivalent to inlining this function.
|
||||
void CallFloorLog2(const uint8_t* HWY_RESTRICT in, const size_t count,
|
||||
uint8_t* HWY_RESTRICT out) {
|
||||
return HWY_DYNAMIC_DISPATCH(FloorLog2)(in, count, out);
|
||||
}
|
||||
|
||||
// Optional: anything to compile only once, e.g. non-SIMD implementations of
|
||||
// public functions provided by this module, can go inside #if HWY_ONCE.
|
||||
|
||||
} // namespace skeleton
|
||||
#endif // HWY_ONCE
|
|
@ -0,0 +1,35 @@
|
|||
// Copyright 2020 Google LLC
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Demo interface to target-specific code in skeleton.cc
|
||||
|
||||
// Normal header with include guard and namespace.
|
||||
#ifndef HIGHWAY_HWY_EXAMPLES_SKELETON_H_
|
||||
#define HIGHWAY_HWY_EXAMPLES_SKELETON_H_
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
// Platform-specific definitions used for declaring an interface, independent of
|
||||
// the SIMD instruction set.
|
||||
#include "hwy/base.h" // HWY_RESTRICT
|
||||
|
||||
namespace skeleton {
|
||||
|
||||
// Computes base-2 logarithm by converting to float. Supports dynamic dispatch.
|
||||
void CallFloorLog2(const uint8_t* HWY_RESTRICT in, const size_t count,
|
||||
uint8_t* HWY_RESTRICT out);
|
||||
|
||||
} // namespace skeleton
|
||||
|
||||
#endif // HIGHWAY_HWY_EXAMPLES_SKELETON_H_
|
|
@ -0,0 +1,104 @@
|
|||
// Copyright 2020 Google LLC
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Example of unit test for the "skeleton" library.
|
||||
|
||||
#include "hwy/examples/skeleton.h"
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "examples/skeleton_test.cc"
|
||||
#include "hwy/foreach_target.h"
|
||||
#include "hwy/highway.h"
|
||||
#include "hwy/tests/test_util-inl.h"
|
||||
|
||||
// Optional: factor out parts of the implementation into *-inl.h
|
||||
#include "hwy/examples/skeleton-inl.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace skeleton {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
using namespace hwy::HWY_NAMESPACE;
|
||||
|
||||
// Calls function defined in skeleton.cc.
|
||||
struct TestFloorLog2 {
|
||||
template <class T, class DF>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, DF df) {
|
||||
const size_t count = 5 * Lanes(df);
|
||||
auto in = hwy::AllocateAligned<uint8_t>(count);
|
||||
auto expected = hwy::AllocateAligned<uint8_t>(count);
|
||||
|
||||
hwy::RandomState rng;
|
||||
for (size_t i = 0; i < count; ++i) {
|
||||
expected[i] = Random32(&rng) & 7;
|
||||
in[i] = static_cast<uint8_t>(1u << expected[i]);
|
||||
}
|
||||
auto out = hwy::AllocateAligned<uint8_t>(count);
|
||||
CallFloorLog2(in.get(), count, out.get());
|
||||
int sum = 0;
|
||||
for (size_t i = 0; i < count; ++i) {
|
||||
HWY_ASSERT_EQ(expected[i], out[i]);
|
||||
sum += out[i];
|
||||
}
|
||||
hwy::PreventElision(sum);
|
||||
}
|
||||
};
|
||||
|
||||
// Runs TestFloorLog2 through ForFullVectors with float as the lane type.
HWY_NOINLINE void TestAllFloorLog2() {
  ForFullVectors<TestFloorLog2> run_test;
  run_test(float());
}
|
||||
|
||||
// Calls function defined in skeleton-inl.h.
|
||||
struct TestSumMulAdd {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
hwy::RandomState rng;
|
||||
const size_t count = 4096;
|
||||
EXPECT_TRUE(count % Lanes(d) == 0);
|
||||
auto mul = hwy::AllocateAligned<T>(count);
|
||||
auto x = hwy::AllocateAligned<T>(count);
|
||||
auto add = hwy::AllocateAligned<T>(count);
|
||||
for (size_t i = 0; i < count; ++i) {
|
||||
mul[i] = static_cast<T>(Random32(&rng) & 0xF);
|
||||
x[i] = static_cast<T>(Random32(&rng) & 0xFF);
|
||||
add[i] = static_cast<T>(Random32(&rng) & 0xFF);
|
||||
}
|
||||
double expected_sum = 0.0;
|
||||
for (size_t i = 0; i < count; ++i) {
|
||||
expected_sum += mul[i] * x[i] + add[i];
|
||||
}
|
||||
|
||||
MulAddLoop(d, mul.get(), add.get(), count, x.get());
|
||||
HWY_ASSERT_EQ(4344240.0, expected_sum);
|
||||
}
|
||||
};
|
||||
|
||||
// Runs TestSumMulAdd through ForPartialVectors for the lane types selected by
// ForFloatTypes.
HWY_NOINLINE void TestAllSumMulAdd() {
  ForPartialVectors<TestSumMulAdd> per_vector_test;
  ForFloatTypes(per_vector_test);
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace skeleton
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
namespace skeleton {
|
||||
HWY_BEFORE_TEST(SkeletonTest);
|
||||
HWY_EXPORT_AND_TEST_P(SkeletonTest, TestAllFloorLog2);
|
||||
HWY_EXPORT_AND_TEST_P(SkeletonTest, TestAllSumMulAdd);
|
||||
} // namespace skeleton
|
||||
#endif
|
|
@ -0,0 +1,161 @@
|
|||
// Copyright 2020 Google LLC
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef HIGHWAY_HWY_FOREACH_TARGET_H_
|
||||
#define HIGHWAY_HWY_FOREACH_TARGET_H_
|
||||
|
||||
// Re-includes the translation unit zero or more times to compile for any
|
||||
// targets except HWY_STATIC_TARGET. Defines unique HWY_TARGET each time so that
|
||||
// highway.h defines the corresponding macro/namespace.
|
||||
|
||||
#include "hwy/targets.h"
|
||||
|
||||
// *_inl.h may include other headers, which requires include guards to prevent
|
||||
// repeated inclusion. The guards must be reset after compiling each target, so
|
||||
// the header is again visible. This is done by flipping HWY_TARGET_TOGGLE,
|
||||
// defining it if undefined and vice versa. This macro is initially undefined
|
||||
// so that IDEs don't gray out the contents of each header.
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#error "This macro must not be defined outside foreach_target.h"
|
||||
#endif
|
||||
|
||||
#ifdef HWY_HIGHWAY_INCLUDED // highway.h include guard
|
||||
// Trigger fixup at the bottom of this header.
|
||||
#define HWY_ALREADY_INCLUDED
|
||||
|
||||
// The next highway.h must re-include set_macros-inl.h because the first
|
||||
// highway.h chose the static target instead of what we will set below.
|
||||
#undef HWY_SET_MACROS_PER_TARGET
|
||||
#endif
|
||||
|
||||
// Disable HWY_EXPORT in user code until we have generated all targets. Note
|
||||
// that a subsequent highway.h will not override this definition.
|
||||
#undef HWY_ONCE
|
||||
#define HWY_ONCE (0 || HWY_IDE)
|
||||
|
||||
// Avoid warnings on #include HWY_TARGET_INCLUDE by hiding them from the IDE;
|
||||
// also skip if only 1 target defined (no re-inclusion will be necessary).
|
||||
#if !HWY_IDE && (HWY_TARGETS != HWY_STATIC_TARGET)
|
||||
|
||||
#if !defined(HWY_TARGET_INCLUDE)
|
||||
#error ">1 target enabled => define HWY_TARGET_INCLUDE before foreach_target.h"
|
||||
#endif
|
||||
|
||||
#if (HWY_TARGETS & HWY_SCALAR) && (HWY_STATIC_TARGET != HWY_SCALAR)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_SCALAR
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (HWY_TARGETS & HWY_NEON) && (HWY_STATIC_TARGET != HWY_NEON)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_NEON
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (HWY_TARGETS & HWY_SSE4) && (HWY_STATIC_TARGET != HWY_SSE4)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_SSE4
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (HWY_TARGETS & HWY_AVX2) && (HWY_STATIC_TARGET != HWY_AVX2)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_AVX2
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (HWY_TARGETS & HWY_AVX3) && (HWY_STATIC_TARGET != HWY_AVX3)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_AVX3
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (HWY_TARGETS & HWY_WASM) && (HWY_STATIC_TARGET != HWY_WASM)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_WASM
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (HWY_TARGETS & HWY_PPC8) && (HWY_STATIC_TARGET != HWY_PPC8)
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_PPC8
|
||||
#include HWY_TARGET_INCLUDE
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#endif // !HWY_IDE && (HWY_TARGETS != HWY_STATIC_TARGET)
|
||||
|
||||
// Now that all but the static target have been generated, re-enable HWY_EXPORT.
|
||||
#undef HWY_ONCE
|
||||
#define HWY_ONCE 1
|
||||
|
||||
// If we re-include once per enabled target, the translation unit's
|
||||
// implementation would have to be skipped via #if to avoid redefining symbols.
|
||||
// We instead skip the re-include for HWY_STATIC_TARGET, and generate its
|
||||
// implementation when resuming compilation of the translation unit.
|
||||
#undef HWY_TARGET
|
||||
#define HWY_TARGET HWY_STATIC_TARGET
|
||||
|
||||
#ifdef HWY_ALREADY_INCLUDED
|
||||
// Revert the previous toggle to prevent redefinitions for the static target.
|
||||
#ifdef HWY_TARGET_TOGGLE
|
||||
#undef HWY_TARGET_TOGGLE
|
||||
#else
|
||||
#define HWY_TARGET_TOGGLE
|
||||
#endif
|
||||
|
||||
// Force re-inclusion of set_macros-inl.h now that HWY_TARGET is restored.
|
||||
#ifdef HWY_SET_MACROS_PER_TARGET
|
||||
#undef HWY_SET_MACROS_PER_TARGET
|
||||
#else
|
||||
#define HWY_SET_MACROS_PER_TARGET
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#endif // HIGHWAY_HWY_FOREACH_TARGET_H_
|
|
@ -0,0 +1,337 @@
|
|||
// Copyright 2020 Google LLC
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// This include guard is checked by foreach_target, so avoid the usual _H_
|
||||
// suffix to prevent copybara from renaming it. NOTE: ops/*-inl.h are included
|
||||
// after/outside this include guard.
|
||||
#ifndef HWY_HIGHWAY_INCLUDED
|
||||
#define HWY_HIGHWAY_INCLUDED
|
||||
|
||||
// Main header required before using vector types.
|
||||
|
||||
#include "hwy/base.h"
|
||||
#include "hwy/targets.h"
|
||||
|
||||
namespace hwy {
|
||||
|
||||
// API version (https://semver.org/)
|
||||
#define HWY_MAJOR 0
|
||||
#define HWY_MINOR 12
|
||||
#define HWY_PATCH 0
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Shorthand for descriptors (defined in shared-inl.h) used to select overloads.
|
||||
|
||||
// Because Highway functions take descriptor and/or vector arguments, ADL finds
|
||||
// these functions without requiring users in project::HWY_NAMESPACE to
|
||||
// qualify Highway functions with hwy::HWY_NAMESPACE. However, ADL rules for
|
||||
// templates require `using hwy::HWY_NAMESPACE::ShiftLeft;` etc. declarations.
|
||||
|
||||
// HWY_FULL(T[,LMUL=1]) is a native vector/group. LMUL is the number of
|
||||
// registers in the group, and is ignored on targets that do not support groups.
|
||||
#define HWY_FULL1(T) hwy::HWY_NAMESPACE::Simd<T, HWY_LANES(T)>
|
||||
#define HWY_3TH_ARG(arg1, arg2, arg3, ...) arg3
|
||||
// Workaround for MSVC grouping __VA_ARGS__ into a single argument
|
||||
#define HWY_FULL_RECOMPOSER(args_with_paren) HWY_3TH_ARG args_with_paren
|
||||
// Trailing comma avoids -pedantic false alarm
|
||||
#define HWY_CHOOSE_FULL(...) \
|
||||
HWY_FULL_RECOMPOSER((__VA_ARGS__, HWY_FULL2, HWY_FULL1, ))
|
||||
#define HWY_FULL(...) HWY_CHOOSE_FULL(__VA_ARGS__())(__VA_ARGS__)
|
||||
|
||||
// Vector of up to MAX_N lanes.
|
||||
#define HWY_CAPPED(T, MAX_N) \
|
||||
hwy::HWY_NAMESPACE::Simd<T, HWY_MIN(MAX_N, HWY_LANES(T))>
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Export user functions for static/dynamic dispatch
|
||||
|
||||
// Evaluates to 0 inside a translation unit if it is generating anything but the
|
||||
// static target (the last one if multiple targets are enabled). Used to prevent
|
||||
// redefinitions of HWY_EXPORT. Unless foreach_target.h is included, we only
|
||||
// compile once anyway, so this is 1 unless it is or has been included.
|
||||
#ifndef HWY_ONCE
|
||||
#define HWY_ONCE 1
|
||||
#endif
|
||||
|
||||
// HWY_STATIC_DISPATCH(FUNC_NAME) is the namespace-qualified FUNC_NAME for
|
||||
// HWY_STATIC_TARGET (the only defined namespace unless HWY_TARGET_INCLUDE is
|
||||
// defined), and can be used to deduce the return type of Choose*.
|
||||
#if HWY_STATIC_TARGET == HWY_SCALAR
|
||||
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SCALAR::FUNC_NAME
|
||||
#elif HWY_STATIC_TARGET == HWY_RVV
|
||||
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_RVV::FUNC_NAME
|
||||
#elif HWY_STATIC_TARGET == HWY_WASM
|
||||
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM::FUNC_NAME
|
||||
#elif HWY_STATIC_TARGET == HWY_NEON
|
||||
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_NEON::FUNC_NAME
|
||||
#elif HWY_STATIC_TARGET == HWY_PPC8
|
||||
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC8::FUNC_NAME
|
||||
#elif HWY_STATIC_TARGET == HWY_SSE4
|
||||
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSE4::FUNC_NAME
|
||||
#elif HWY_STATIC_TARGET == HWY_AVX2
|
||||
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX2::FUNC_NAME
|
||||
#elif HWY_STATIC_TARGET == HWY_AVX3
|
||||
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3::FUNC_NAME
|
||||
#endif
|
||||
|
||||
// Dynamic dispatch declarations.
|
||||
|
||||
template <typename RetType, typename... Args>
|
||||
struct FunctionCache {
|
||||
public:
|
||||
typedef RetType(FunctionType)(Args...);
|
||||
|
||||
// A template function that when instantiated has the same signature as the
|
||||
// function being called. This function initializes the global cache of the
|
||||
// current supported targets mask used for dynamic dispatch and calls the
|
||||
// appropriate function. Since this mask used for dynamic dispatch is a
|
||||
// global cache, all the highway exported functions, even those exposed by
|
||||
// different modules, will be initialized after this function runs for any one
|
||||
// of those exported functions.
|
||||
template <FunctionType* const table[]>
|
||||
static RetType ChooseAndCall(Args... args) {
|
||||
// If we are running here it means we need to update the chosen target.
|
||||
chosen_target.Update();
|
||||
return (table[chosen_target.GetIndex()])(args...);
|
||||
}
|
||||
};
|
||||
|
||||
// Factory function only used to infer the template parameters RetType and Args
|
||||
// from a function passed to the factory.
|
||||
template <typename RetType, typename... Args>
|
||||
FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) {
|
||||
return FunctionCache<RetType, Args...>();
|
||||
}
|
||||
|
||||
// HWY_CHOOSE_*(FUNC_NAME) expands to the function pointer for that target or
|
||||
// nullptr if that target was not compiled.
|
||||
#if HWY_TARGETS & HWY_SCALAR
|
||||
#define HWY_CHOOSE_SCALAR(FUNC_NAME) &N_SCALAR::FUNC_NAME
|
||||
#else
|
||||
// When scalar is not present and we try to use scalar because other targets
|
||||
// were disabled at runtime we fall back to the baseline with
|
||||
// HWY_STATIC_DISPATCH()
|
||||
#define HWY_CHOOSE_SCALAR(FUNC_NAME) &HWY_STATIC_DISPATCH(FUNC_NAME)
|
||||
#endif
|
||||
|
||||
#if HWY_TARGETS & HWY_WASM
|
||||
#define HWY_CHOOSE_WASM(FUNC_NAME) &N_WASM::FUNC_NAME
|
||||
#else
|
||||
#define HWY_CHOOSE_WASM(FUNC_NAME) nullptr
|
||||
#endif
|
||||
|
||||
#if HWY_TARGETS & HWY_RVV
|
||||
#define HWY_CHOOSE_RVV(FUNC_NAME) &N_RVV::FUNC_NAME
|
||||
#else
|
||||
#define HWY_CHOOSE_RVV(FUNC_NAME) nullptr
|
||||
#endif
|
||||
|
||||
#if HWY_TARGETS & HWY_NEON
|
||||
#define HWY_CHOOSE_NEON(FUNC_NAME) &N_NEON::FUNC_NAME
|
||||
#else
|
||||
#define HWY_CHOOSE_NEON(FUNC_NAME) nullptr
|
||||
#endif
|
||||
|
||||
#if HWY_TARGETS & HWY_PPC8
|
||||
// Expands to the address of FUNC_NAME in the PPC8 namespace. The macro name
// must be HWY_CHOOSE_PPC8 (same spelling as in the #else branch) so that it is
// defined regardless of whether the PPC8 target is enabled.
#define HWY_CHOOSE_PPC8(FUNC_NAME) &N_PPC8::FUNC_NAME
|
||||
#else
|
||||
#define HWY_CHOOSE_PPC8(FUNC_NAME) nullptr
|
||||
#endif
|
||||
|
||||
#if HWY_TARGETS & HWY_SSE4
|
||||
#define HWY_CHOOSE_SSE4(FUNC_NAME) &N_SSE4::FUNC_NAME
|
||||
#else
|
||||
#define HWY_CHOOSE_SSE4(FUNC_NAME) nullptr
|
||||
#endif
|
||||
|
||||
#if HWY_TARGETS & HWY_AVX2
|
||||
#define HWY_CHOOSE_AVX2(FUNC_NAME) &N_AVX2::FUNC_NAME
|
||||
#else
|
||||
#define HWY_CHOOSE_AVX2(FUNC_NAME) nullptr
|
||||
#endif
|
||||
|
||||
#if HWY_TARGETS & HWY_AVX3
|
||||
#define HWY_CHOOSE_AVX3(FUNC_NAME) &N_AVX3::FUNC_NAME
|
||||
#else
|
||||
#define HWY_CHOOSE_AVX3(FUNC_NAME) nullptr
|
||||
#endif
|
||||
|
||||
#define HWY_DISPATCH_TABLE(FUNC_NAME) \
|
||||
HWY_CONCAT(FUNC_NAME, HighwayDispatchTable)
|
||||
|
||||
// HWY_EXPORT(FUNC_NAME); expands to a static array that is used by
|
||||
// HWY_DYNAMIC_DISPATCH() to call the appropriate function at runtime. This
|
||||
// static array must be defined at the same namespace level as the function
|
||||
// it is exporting.
|
||||
// After being exported, it can be called from other parts of the same source
|
||||
// file using HWY_DYNAMIC_DISPATCH(), in particular from a function wrapper
|
||||
// like in the following example:
|
||||
//
|
||||
// #include "hwy/highway.h"
|
||||
// HWY_BEFORE_NAMESPACE();
|
||||
// namespace skeleton {
|
||||
// namespace HWY_NAMESPACE {
|
||||
//
|
||||
// void MyFunction(int a, char b, const char* c) { ... }
|
||||
//
|
||||
// // NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
// } // namespace HWY_NAMESPACE
|
||||
// } // namespace skeleton
|
||||
// HWY_AFTER_NAMESPACE();
|
||||
//
|
||||
// namespace skeleton {
|
||||
// HWY_EXPORT(MyFunction); // Defines the dispatch table in this scope.
|
||||
//
|
||||
// void MyFunction(int a, char b, const char* c) {
|
||||
// return HWY_DYNAMIC_DISPATCH(MyFunction)(a, b, c);
|
||||
// }
|
||||
// } // namespace skeleton
|
||||
//
|
||||
|
||||
#if HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0)
|
||||
|
||||
// Simplified version for IDE or the dynamic dispatch case with only one target.
|
||||
// This case still uses a table, although of a single element, to provide the
|
||||
// same compile error conditions as with the dynamic dispatch case when multiple
|
||||
// targets are being compiled.
|
||||
#define HWY_EXPORT(FUNC_NAME) \
|
||||
HWY_MAYBE_UNUSED static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) \
|
||||
const HWY_DISPATCH_TABLE(FUNC_NAME)[1] = { \
|
||||
&HWY_STATIC_DISPATCH(FUNC_NAME)}
|
||||
#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) HWY_STATIC_DISPATCH(FUNC_NAME)
|
||||
|
||||
#else
|
||||
|
||||
// Dynamic dispatch case with one entry per dynamic target plus the scalar
|
||||
// mode and the initialization wrapper.
|
||||
#define HWY_EXPORT(FUNC_NAME) \
|
||||
static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) \
|
||||
const HWY_DISPATCH_TABLE(FUNC_NAME)[HWY_MAX_DYNAMIC_TARGETS + 2] = { \
|
||||
/* The first entry in the table initializes the global cache and \
|
||||
* calls the appropriate function. */ \
|
||||
&decltype(hwy::FunctionCacheFactory(&HWY_STATIC_DISPATCH( \
|
||||
FUNC_NAME)))::ChooseAndCall<HWY_DISPATCH_TABLE(FUNC_NAME)>, \
|
||||
HWY_CHOOSE_TARGET_LIST(FUNC_NAME), \
|
||||
HWY_CHOOSE_SCALAR(FUNC_NAME), \
|
||||
}
|
||||
#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) \
|
||||
(*(HWY_DISPATCH_TABLE(FUNC_NAME)[hwy::chosen_target.GetIndex()]))
|
||||
|
||||
#endif // HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0)
|
||||
|
||||
} // namespace hwy
|
||||
|
||||
#endif // HWY_HIGHWAY_INCLUDED
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
// NOTE: the following definitions and ops/*.h depend on HWY_TARGET, so we want
|
||||
// to include them once per target, which is ensured by the toggle check.
|
||||
// Because ops/*.h are included under it, they do not need their own guard.
|
||||
#if defined(HWY_HIGHWAY_PER_TARGET) == defined(HWY_TARGET_TOGGLE)
|
||||
#ifdef HWY_HIGHWAY_PER_TARGET
|
||||
#undef HWY_HIGHWAY_PER_TARGET
|
||||
#else
|
||||
#define HWY_HIGHWAY_PER_TARGET
|
||||
#endif
|
||||
|
||||
#undef HWY_FULL2
|
||||
#if HWY_TARGET == HWY_RVV
|
||||
#define HWY_FULL2(T, LMUL) hwy::HWY_NAMESPACE::Simd<T, HWY_LANES(T) * (LMUL)>
|
||||
#else
|
||||
#define HWY_FULL2(T, LMUL) hwy::HWY_NAMESPACE::Simd<T, HWY_LANES(T)>
|
||||
#endif
|
||||
|
||||
// These define ops inside namespace hwy::HWY_NAMESPACE.
|
||||
#if HWY_TARGET == HWY_SSE4
|
||||
#include "hwy/ops/x86_128-inl.h"
|
||||
#elif HWY_TARGET == HWY_AVX2
|
||||
#include "hwy/ops/x86_256-inl.h"
|
||||
#elif HWY_TARGET == HWY_AVX3
|
||||
#include "hwy/ops/x86_512-inl.h"
|
||||
#elif HWY_TARGET == HWY_PPC8
|
||||
#elif HWY_TARGET == HWY_NEON
|
||||
#include "hwy/ops/arm_neon-inl.h"
|
||||
#elif HWY_TARGET == HWY_WASM
|
||||
#include "hwy/ops/wasm_128-inl.h"
|
||||
#elif HWY_TARGET == HWY_RVV
|
||||
#include "hwy/ops/rvv-inl.h"
|
||||
#elif HWY_TARGET == HWY_SCALAR
|
||||
#include "hwy/ops/scalar-inl.h"
|
||||
#else
|
||||
#pragma message("HWY_TARGET does not match any known target")
|
||||
#endif // HWY_TARGET
|
||||
|
||||
// Commonly used functions/types that must come after ops are defined.
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
// The lane type of a vector type, e.g. float for Vec<Simd<float, 4>>.
|
||||
template <class V>
|
||||
using LaneType = decltype(GetLane(V()));
|
||||
|
||||
// Vector type, e.g. Vec128<float> for Simd<float, 4>. Useful as the return type
|
||||
// of functions that do not take a vector argument, or as an argument type if
|
||||
// the function only has a template argument for D, or for explicit type names
|
||||
// instead of auto. This may be a built-in type.
|
||||
template <class D>
|
||||
using Vec = decltype(Zero(D()));
|
||||
|
||||
// Mask type. Useful as the return type of functions that do not take a mask
|
||||
// argument, or as an argument type if the function only has a template argument
|
||||
// for D, or for explicit type names instead of auto.
|
||||
template <class D>
|
||||
using Mask = decltype(MaskFromVec(Zero(D())));
|
||||
|
||||
// Returns the closest value to v within [lo, hi].
|
||||
template <class V>
|
||||
HWY_API V Clamp(const V v, const V lo, const V hi) {
|
||||
return Min(Max(lo, v), hi);
|
||||
}
|
||||
|
||||
// CombineShiftRightBytes (and ..Lanes) are not available for the scalar target.
|
||||
// TODO(janwas): implement for RVV
|
||||
#if HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_RVV
|
||||
|
||||
template <size_t kLanes, class V>
|
||||
HWY_API V CombineShiftRightLanes(const V hi, const V lo) {
|
||||
return CombineShiftRightBytes<kLanes * sizeof(LaneType<V>)>(hi, lo);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
// Returns lanes with the most significant bit set and all other bits zero.
|
||||
template <class D>
|
||||
HWY_API Vec<D> SignBit(D d) {
|
||||
using Unsigned = MakeUnsigned<TFromD<D>>;
|
||||
const Unsigned bit = Unsigned(1) << (sizeof(Unsigned) * 8 - 1);
|
||||
return BitCast(d, Set(Rebind<Unsigned, D>(), bit));
|
||||
}
|
||||
|
||||
// Returns quiet NaN.
|
||||
template <class D>
|
||||
HWY_API Vec<D> NaN(D d) {
|
||||
const RebindToSigned<D> di;
|
||||
// LimitsMax sets all exponent and mantissa bits to 1. The exponent plus
|
||||
// mantissa MSB (to indicate quiet) would be sufficient.
|
||||
return BitCast(d, Set(di, LimitsMax<TFromD<decltype(di)>>()));
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#endif // HWY_HIGHWAY_PER_TARGET
|
|
@ -0,0 +1,313 @@
|
|||
// Copyright 2019 Google LLC
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "highway_test.cc"
|
||||
#include "hwy/foreach_target.h"
|
||||
#include "hwy/highway.h"
|
||||
#include "hwy/nanobenchmark.h" // Unpredictable1
|
||||
#include "hwy/tests/test_util-inl.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
struct TestSet {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
// Zero
|
||||
const auto v0 = Zero(d);
|
||||
const size_t N = Lanes(d);
|
||||
auto expected = AllocateAligned<T>(N);
|
||||
std::fill(expected.get(), expected.get() + N, T(0));
|
||||
HWY_ASSERT_VEC_EQ(d, expected.get(), v0);
|
||||
|
||||
// Set
|
||||
const auto v2 = Set(d, T(2));
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
expected[i] = 2;
|
||||
}
|
||||
HWY_ASSERT_VEC_EQ(d, expected.get(), v2);
|
||||
|
||||
// Iota
|
||||
const auto vi = Iota(d, T(5));
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
expected[i] = T(5 + i);
|
||||
}
|
||||
HWY_ASSERT_VEC_EQ(d, expected.get(), vi);
|
||||
|
||||
// Undefined
|
||||
const auto vu = Undefined(d);
|
||||
Store(vu, d, expected.get());
|
||||
}
|
||||
};
|
||||
|
||||
// Runs TestSet through ForPartialVectors for all lane types.
HWY_NOINLINE void TestAllSet() {
  ForAllTypes(ForPartialVectors<TestSet>());
}
|
||||
|
||||
// Ensures wraparound (mod 2^bits)
|
||||
struct TestOverflow {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
const auto v1 = Set(d, T(1));
|
||||
const auto vmax = Set(d, LimitsMax<T>());
|
||||
const auto vmin = Set(d, LimitsMin<T>());
|
||||
// Unsigned underflow / negative -> positive
|
||||
HWY_ASSERT_VEC_EQ(d, vmax, vmin - v1);
|
||||
// Unsigned overflow / positive -> negative
|
||||
HWY_ASSERT_VEC_EQ(d, vmin, vmax + v1);
|
||||
}
|
||||
};
|
||||
|
||||
// Runs TestOverflow through ForPartialVectors for the integer lane types.
HWY_NOINLINE void TestAllOverflow() {
  const auto test = ForPartialVectors<TestOverflow>();
  ForIntegerTypes(test);
}
|
||||
|
||||
struct TestSignBitInteger {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
const auto v0 = Zero(d);
|
||||
const auto all = VecFromMask(d, Eq(v0, v0));
|
||||
const auto vs = SignBit(d);
|
||||
const auto other = Sub(vs, Set(d, 1));
|
||||
|
||||
// Shifting left by one => overflow, equal zero
|
||||
HWY_ASSERT_VEC_EQ(d, v0, Add(vs, vs));
|
||||
// Verify the lower bits are zero (only +/- and logical ops are available
|
||||
// for all types)
|
||||
HWY_ASSERT_VEC_EQ(d, all, Add(vs, other));
|
||||
}
|
||||
};
|
||||
|
||||
struct TestSignBitFloat {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
const auto v0 = Zero(d);
|
||||
const auto vs = SignBit(d);
|
||||
const auto vp = Set(d, 2.25);
|
||||
const auto vn = Set(d, -2.25);
|
||||
HWY_ASSERT_VEC_EQ(d, Or(vp, vs), vn);
|
||||
HWY_ASSERT_VEC_EQ(d, AndNot(vs, vn), vp);
|
||||
HWY_ASSERT_VEC_EQ(d, v0, vs);
|
||||
}
|
||||
};
|
||||
|
||||
// Runs the integer and the floating-point SignBit tests, each through
// ForPartialVectors for its own family of lane types.
HWY_NOINLINE void TestAllSignBit() {
  const auto integer_test = ForPartialVectors<TestSignBitInteger>();
  ForIntegerTypes(integer_test);

  const auto float_test = ForPartialVectors<TestSignBitFloat>();
  ForFloatTypes(float_test);
}
|
||||
|
||||
// std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY.
|
||||
template <typename TF>
|
||||
bool IsNaN(TF f) {
|
||||
MakeUnsigned<TF> bits;
|
||||
memcpy(&bits, &f, sizeof(TF));
|
||||
bits += bits;
|
||||
bits >>= 1; // clear sign bit
|
||||
// NaN if all exponent bits are set and the mantissa is not zero.
|
||||
return bits > ExponentMask<decltype(bits)>();
|
||||
}
|
||||
|
||||
template <class D, class V>
|
||||
HWY_NOINLINE void AssertNaN(const D d, const V v, const char* file, int line) {
|
||||
using T = TFromD<D>;
|
||||
const T lane = GetLane(v);
|
||||
if (!IsNaN(lane)) {
|
||||
const std::string type_name = TypeName(T(), Lanes(d));
|
||||
MakeUnsigned<T> bits;
|
||||
memcpy(&bits, &lane, sizeof(T));
|
||||
// RVV lacks PRIu64, so use size_t; double will be truncated on 32-bit.
|
||||
Abort(file, line, "Expected %s NaN, got %E (%zu)", type_name.c_str(), lane,
|
||||
size_t(bits));
|
||||
}
|
||||
}
|
||||
|
||||
#define HWY_ASSERT_NAN(d, v) AssertNaN(d, v, __FILE__, __LINE__)
|
||||
|
||||
struct TestNaN {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
const auto v1 = Set(d, T(Unpredictable1()));
|
||||
const auto nan = IfThenElse(Eq(v1, Set(d, T(1))), NaN(d), v1);
|
||||
HWY_ASSERT_NAN(d, nan);
|
||||
|
||||
// Arithmetic
|
||||
HWY_ASSERT_NAN(d, Add(nan, v1));
|
||||
HWY_ASSERT_NAN(d, Add(v1, nan));
|
||||
HWY_ASSERT_NAN(d, Sub(nan, v1));
|
||||
HWY_ASSERT_NAN(d, Sub(v1, nan));
|
||||
HWY_ASSERT_NAN(d, Mul(nan, v1));
|
||||
HWY_ASSERT_NAN(d, Mul(v1, nan));
|
||||
HWY_ASSERT_NAN(d, Div(nan, v1));
|
||||
HWY_ASSERT_NAN(d, Div(v1, nan));
|
||||
|
||||
// FMA
|
||||
HWY_ASSERT_NAN(d, MulAdd(nan, v1, v1));
|
||||
HWY_ASSERT_NAN(d, MulAdd(v1, nan, v1));
|
||||
HWY_ASSERT_NAN(d, MulAdd(v1, v1, nan));
|
||||
HWY_ASSERT_NAN(d, MulSub(nan, v1, v1));
|
||||
HWY_ASSERT_NAN(d, MulSub(v1, nan, v1));
|
||||
HWY_ASSERT_NAN(d, MulSub(v1, v1, nan));
|
||||
HWY_ASSERT_NAN(d, NegMulAdd(nan, v1, v1));
|
||||
HWY_ASSERT_NAN(d, NegMulAdd(v1, nan, v1));
|
||||
HWY_ASSERT_NAN(d, NegMulAdd(v1, v1, nan));
|
||||
HWY_ASSERT_NAN(d, NegMulSub(nan, v1, v1));
|
||||
HWY_ASSERT_NAN(d, NegMulSub(v1, nan, v1));
|
||||
HWY_ASSERT_NAN(d, NegMulSub(v1, v1, nan));
|
||||
|
||||
// Rcp/Sqrt
|
||||
HWY_ASSERT_NAN(d, Sqrt(nan));
|
||||
|
||||
// Sign manipulation
|
||||
HWY_ASSERT_NAN(d, Abs(nan));
|
||||
HWY_ASSERT_NAN(d, Neg(nan));
|
||||
HWY_ASSERT_NAN(d, CopySign(nan, v1));
|
||||
HWY_ASSERT_NAN(d, CopySignToAbs(nan, v1));
|
||||
|
||||
// Rounding
|
||||
HWY_ASSERT_NAN(d, Ceil(nan));
|
||||
HWY_ASSERT_NAN(d, Floor(nan));
|
||||
HWY_ASSERT_NAN(d, Round(nan));
|
||||
HWY_ASSERT_NAN(d, Trunc(nan));
|
||||
|
||||
// Logical (And/AndNot/Xor will clear NaN!)
|
||||
HWY_ASSERT_NAN(d, Or(nan, v1));
|
||||
|
||||
// Comparison
|
||||
HWY_ASSERT(AllFalse(Eq(nan, v1)));
|
||||
HWY_ASSERT(AllFalse(Gt(nan, v1)));
|
||||
HWY_ASSERT(AllFalse(Lt(nan, v1)));
|
||||
HWY_ASSERT(AllFalse(Ge(nan, v1)));
|
||||
HWY_ASSERT(AllFalse(Le(nan, v1)));
|
||||
}
|
||||
};
|
||||
|
||||
// For functions only available for float32
|
||||
struct TestF32NaN {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
const auto v1 = Set(d, T(Unpredictable1()));
|
||||
const auto nan = IfThenElse(Eq(v1, Set(d, T(1))), NaN(d), v1);
|
||||
HWY_ASSERT_NAN(d, ApproximateReciprocal(nan));
|
||||
HWY_ASSERT_NAN(d, ApproximateReciprocalSqrt(nan));
|
||||
HWY_ASSERT_NAN(d, AbsDiff(nan, v1));
|
||||
HWY_ASSERT_NAN(d, AbsDiff(v1, nan));
|
||||
}
|
||||
};
|
||||
|
||||
// TODO(janwas): move to TestNaN once supported for partial vectors
|
||||
struct TestFullNaN {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
const auto v1 = Set(d, T(Unpredictable1()));
|
||||
const auto nan = IfThenElse(Eq(v1, Set(d, T(1))), NaN(d), v1);
|
||||
|
||||
HWY_ASSERT_NAN(d, SumOfLanes(nan));
|
||||
// Reduction (pending clarification on RVV)
|
||||
#if HWY_TARGET != HWY_RVV
|
||||
HWY_ASSERT_NAN(d, MinOfLanes(nan));
|
||||
HWY_ASSERT_NAN(d, MaxOfLanes(nan));
|
||||
#endif
|
||||
|
||||
#if HWY_ARCH_X86 && HWY_TARGET != HWY_SCALAR
|
||||
// x86 SIMD returns the second operand if any input is NaN.
|
||||
HWY_ASSERT_VEC_EQ(d, v1, Min(nan, v1));
|
||||
HWY_ASSERT_VEC_EQ(d, v1, Max(nan, v1));
|
||||
HWY_ASSERT_NAN(d, Min(v1, nan));
|
||||
HWY_ASSERT_NAN(d, Max(v1, nan));
|
||||
#elif HWY_ARCH_WASM
|
||||
// Should return NaN if any input is NaN, but does not for scalar.
|
||||
// TODO(janwas): remove once this is fixed.
|
||||
#elif HWY_TARGET == HWY_NEON && HWY_ARCH_ARM_V7
|
||||
// ARMv7 NEON returns NaN if any input is NaN.
|
||||
HWY_ASSERT_NAN(d, Min(v1, nan));
|
||||
HWY_ASSERT_NAN(d, Max(v1, nan));
|
||||
HWY_ASSERT_NAN(d, Min(nan, v1));
|
||||
HWY_ASSERT_NAN(d, Max(nan, v1));
|
||||
#else
|
||||
// IEEE 754-2019 minimumNumber is defined as the other argument if exactly
|
||||
// one is NaN, and qNaN if both are.
|
||||
HWY_ASSERT_VEC_EQ(d, v1, Min(nan, v1));
|
||||
HWY_ASSERT_VEC_EQ(d, v1, Max(nan, v1));
|
||||
HWY_ASSERT_VEC_EQ(d, v1, Min(v1, nan));
|
||||
HWY_ASSERT_VEC_EQ(d, v1, Max(v1, nan));
|
||||
#endif
|
||||
HWY_ASSERT_NAN(d, Min(nan, nan));
|
||||
HWY_ASSERT_NAN(d, Max(nan, nan));
|
||||
|
||||
// Comparison
|
||||
HWY_ASSERT(AllFalse(Eq(nan, v1)));
|
||||
HWY_ASSERT(AllFalse(Gt(nan, v1)));
|
||||
HWY_ASSERT(AllFalse(Lt(nan, v1)));
|
||||
HWY_ASSERT(AllFalse(Ge(nan, v1)));
|
||||
HWY_ASSERT(AllFalse(Le(nan, v1)));
|
||||
}
|
||||
};
|
||||
|
||||
// Runs the three NaN test groups: the general one for all float types, the
// float32-only one, and the full-vector one for all float types.
HWY_NOINLINE void TestAllNaN() {
  // General NaN behavior.
  ForFloatTypes(ForPartialVectors<TestNaN>());

  // Ops that only exist for float32: invoke directly with a float tag.
  ForPartialVectors<TestF32NaN>()(float());

  // Reductions and Min/Max.
  ForFloatTypes(ForFullVectors<TestFullNaN>());
}
|
||||
|
||||
struct TestCopyAndAssign {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
// copy V
|
||||
const auto v3 = Iota(d, 3);
|
||||
auto v3b(v3);
|
||||
HWY_ASSERT_VEC_EQ(d, v3, v3b);
|
||||
|
||||
// assign V
|
||||
auto v3c = Undefined(d);
|
||||
v3c = v3;
|
||||
HWY_ASSERT_VEC_EQ(d, v3, v3c);
|
||||
}
|
||||
};
|
||||
|
||||
// Runs TestCopyAndAssign through ForPartialVectors for all lane types.
HWY_NOINLINE void TestAllCopyAndAssign() {
  const auto test = ForPartialVectors<TestCopyAndAssign>();
  ForAllTypes(test);
}
|
||||
|
||||
struct TestGetLane {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
HWY_ASSERT_EQ(T(0), GetLane(Zero(d)));
|
||||
HWY_ASSERT_EQ(T(1), GetLane(Set(d, 1)));
|
||||
}
|
||||
};
|
||||
|
||||
// Runs TestGetLane through ForPartialVectors for all lane types.
HWY_NOINLINE void TestAllGetLane() {
  const auto test = ForPartialVectors<TestGetLane>();
  ForAllTypes(test);
}
|
||||
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
namespace hwy {
|
||||
HWY_BEFORE_TEST(HighwayTest);
|
||||
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllSet);
|
||||
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllOverflow);
|
||||
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllSignBit);
|
||||
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllNaN);
|
||||
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllCopyAndAssign);
|
||||
HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllGetLane);
|
||||
} // namespace hwy
|
||||
#endif
|
|
@ -0,0 +1,722 @@
|
|||
// Copyright 2019 Google LLC
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "hwy/nanobenchmark.h"
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h> // abort
|
||||
#include <string.h> // memcpy
|
||||
#include <time.h> // clock_gettime
|
||||
|
||||
#include <algorithm> // sort
|
||||
#include <array>
|
||||
#include <atomic>
|
||||
#include <limits>
|
||||
#include <numeric> // iota
|
||||
#include <random>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "hwy/base.h"
|
||||
#if HWY_ARCH_PPC
|
||||
#include <sys/platform/ppc.h> // NOLINT __ppc_get_timebase_freq
|
||||
#elif HWY_ARCH_X86
|
||||
|
||||
#if HWY_COMPILER_MSVC
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#include <cpuid.h> // NOLINT
|
||||
#endif // HWY_COMPILER_MSVC
|
||||
|
||||
#endif // HWY_ARCH_X86
|
||||
|
||||
namespace hwy {
|
||||
namespace platform {
|
||||
namespace {
|
||||
|
||||
#if HWY_ARCH_X86
|
||||
|
||||
void Cpuid(const uint32_t level, const uint32_t count,
|
||||
uint32_t* HWY_RESTRICT abcd) {
|
||||
#if HWY_COMPILER_MSVC
|
||||
int regs[4];
|
||||
__cpuidex(regs, level, count);
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
abcd[i] = regs[i];
|
||||
}
|
||||
#else
|
||||
uint32_t a;
|
||||
uint32_t b;
|
||||
uint32_t c;
|
||||
uint32_t d;
|
||||
__cpuid_count(level, count, a, b, c, d);
|
||||
abcd[0] = a;
|
||||
abcd[1] = b;
|
||||
abcd[2] = c;
|
||||
abcd[3] = d;
|
||||
#endif
|
||||
}
|
||||
|
||||
// Returns the processor brand string assembled from CPUID leaves
// 0x80000002..0x80000004 (16 bytes each), or an empty string if the CPU does
// not report support for those leaves.
std::string BrandString() {
  std::array<uint32_t, 4> regs;

  // Leaf 0x80000000 reports the highest supported extended leaf in EAX
  // (it is on all reasonable Intel/AMD).
  Cpuid(0x80000000U, 0, regs.data());
  const bool has_brand_leaves = regs[0] >= 0x80000004U;
  if (!has_brand_leaves) {
    return std::string();
  }

  // 3 leaves x 16 bytes, plus the terminating zero.
  char text[49];
  for (size_t part = 0; part < 3; ++part) {
    const uint32_t leaf = static_cast<uint32_t>(0x80000002U + part);
    Cpuid(leaf, 0, regs.data());
    memcpy(text + part * 16, regs.data(), sizeof(regs));
  }
  text[48] = 0;
  return text;
}
|
||||
|
||||
// Returns the frequency quoted inside the brand string. This does not
|
||||
// account for throttling nor Turbo Boost.
|
||||
double NominalClockRate() {
|
||||
const std::string& brand_string = BrandString();
|
||||
// Brand strings include the maximum configured frequency. These prefixes are
|
||||
// defined by Intel CPUID documentation.
|
||||
const char* prefixes[3] = {"MHz", "GHz", "THz"};
|
||||
const double multipliers[3] = {1E6, 1E9, 1E12};
|
||||
for (size_t i = 0; i < 3; ++i) {
|
||||
const size_t pos_prefix = brand_string.find(prefixes[i]);
|
||||
if (pos_prefix != std::string::npos) {
|
||||
const size_t pos_space = brand_string.rfind(' ', pos_prefix - 1);
|
||||
if (pos_space != std::string::npos) {
|
||||
const std::string digits =
|
||||
brand_string.substr(pos_space + 1, pos_prefix - pos_space - 1);
|
||||
return std::stod(digits) * multipliers[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
#endif // HWY_ARCH_X86
|
||||
|
||||
} // namespace
|
||||
|
||||
// Returns the number of timer ticks per second. "Invariant" means the tick
// counter frequency is independent of CPU throttling or sleep. May be
// expensive, so callers should cache the result.
//
// NOTE(review): timer::Start64/Start32 in this file only read the TSC when
// HWY_ARCH_X86_64 is set or the compiler is MSVC; other x86 builds fall back
// to clock_gettime nanoseconds there, which the x86 branch below does not
// appear to account for - confirm.
double InvariantTicksPerSecond() {
#if HWY_ARCH_PPC
  const double ticks_per_second = __ppc_get_timebase_freq();
#elif HWY_ARCH_X86
  // We assume the TSC is invariant; it is on all recent Intel/AMD CPUs.
  const double ticks_per_second = NominalClockRate();
#else
  // The fallback timer is clock_gettime, which counts nanoseconds.
  const double ticks_per_second = 1E9;
#endif
  return ticks_per_second;
}
|
||||
|
||||
} // namespace platform
|
||||
namespace {
|
||||
|
||||
// Stops the compiler from optimizing away the computations that produced
// "output".
template <class T>
inline void PreventElision(T&& output) {
#if HWY_COMPILER_MSVC
  // MSVC does not support inline assembly anymore (and never supported GCC's
  // RTL constraints). Self-assignment with #pragma optimize("off") might be
  // expected to prevent elision, but it does not with MSVC 2015. Type-punning
  // with volatile pointers generates inefficient code on MSVC 2017.
  // Instead, store the value into a static atomic.
  static std::atomic<T> sink(T{});
  sink.store(output, std::memory_order_relaxed);
#else
  // An empty asm statement that declares "output" as both read and modified.
  // The +r constraint avoids unnecessary writes to memory, but only works for
  // built-in types (typically FuncOutput).
  asm volatile("" : "+r"(output) : : "memory");
#endif
}
|
||||
|
||||
namespace timer {
|
||||
|
||||
// Start/Stop return absolute timestamps and must be placed immediately before
|
||||
// and after the region to measure. We provide separate Start/Stop functions
|
||||
// because they use different fences.
|
||||
//
|
||||
// Background: RDTSC is not 'serializing'; earlier instructions may complete
|
||||
// after it, and/or later instructions may complete before it. 'Fences' ensure
|
||||
// regions' elapsed times are independent of such reordering. The only
|
||||
// documented unprivileged serializing instruction is CPUID, which acts as a
|
||||
// full fence (no reordering across it in either direction). Unfortunately
|
||||
// the latency of CPUID varies wildly (perhaps made worse by not initializing
|
||||
// its EAX input). Because it cannot reliably be deducted from the region's
|
||||
// elapsed time, it must not be included in the region to measure (i.e.
|
||||
// between the two RDTSC).
|
||||
//
|
||||
// The newer RDTSCP is sometimes described as serializing, but it actually
|
||||
// only serves as a half-fence with release semantics. Although all
|
||||
// instructions in the region will complete before the final timestamp is
|
||||
// captured, subsequent instructions may leak into the region and increase the
|
||||
// elapsed time. Inserting another fence after the final RDTSCP would prevent
|
||||
// such reordering without affecting the measured region.
|
||||
//
|
||||
// Fortunately, such a fence exists. The LFENCE instruction is only documented
|
||||
// to delay later loads until earlier loads are visible. However, Intel's
|
||||
// reference manual says it acts as a full fence (waiting until all earlier
|
||||
// instructions have completed, and delaying later instructions until it
|
||||
// completes). AMD assigns the same behavior to MFENCE.
|
||||
//
|
||||
// We need a fence before the initial RDTSC to prevent earlier instructions
|
||||
// from leaking into the region, and arguably another after RDTSC to avoid
|
||||
// region instructions from completing before the timestamp is recorded.
|
||||
// When surrounded by fences, the additional RDTSCP half-fence provides no
|
||||
// benefit, so the initial timestamp can be recorded via RDTSC, which has
|
||||
// lower overhead than RDTSCP because it does not read TSC_AUX. In summary,
|
||||
// we define Start = LFENCE/RDTSC/LFENCE; Stop = RDTSCP/LFENCE.
|
||||
//
|
||||
// Using Start+Start leads to higher variance and overhead than Stop+Stop.
|
||||
// However, Stop+Stop includes an LFENCE in the region measurements, which
|
||||
// adds a delay dependent on earlier loads. The combination of Start+Stop
|
||||
// is faster than Start+Start and more consistent than Stop+Stop because
|
||||
// the first LFENCE already delayed subsequent loads before the measured
|
||||
// region. This combination seems not to have been considered in prior work:
|
||||
// http://akaros.cs.berkeley.edu/lxr/akaros/kern/arch/x86/rdtsc_test.c
|
||||
//
|
||||
// Note: performance counters can measure 'exact' instructions-retired or
|
||||
// (unhalted) cycle counts. The RDPMC instruction is not serializing and also
|
||||
// requires fences. Unfortunately, it is not accessible on all OSes and we
|
||||
// prefer to avoid kernel-mode drivers. Performance counters are also affected
|
||||
// by several under/over-count errata, so we use the TSC instead.
|
||||
|
||||
// Returns a 64-bit timestamp in units of 'ticks', to be taken immediately
// before the region to measure. To convert to seconds, divide by
// InvariantTicksPerSecond.
inline uint64_t Start64() {
  uint64_t ticks;
#if HWY_ARCH_PPC
  asm volatile("mfspr %0, %1" : "=r"(ticks) : "i"(268));
#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
  // LFENCE / RDTSC / LFENCE, each separated by compiler barriers.
  _ReadWriteBarrier();
  _mm_lfence();
  _ReadWriteBarrier();
  ticks = __rdtsc();
  _ReadWriteBarrier();
  _mm_lfence();
  _ReadWriteBarrier();
#elif HWY_ARCH_X86_64
  // LFENCE / RDTSC / LFENCE; the two 32-bit halves are merged into rax.
  asm volatile(
      "lfence\n\t"
      "rdtsc\n\t"
      "shl $32, %%rdx\n\t"
      "or %%rdx, %0\n\t"
      "lfence"
      : "=a"(ticks)
      :
      // "memory" avoids reordering. rdx = TSC >> 32.
      // "cc" = flags modified by SHL.
      : "rdx", "memory", "cc");
#elif HWY_ARCH_RVV
  asm volatile("rdcycle %0" : "=r"(ticks));
#else
  // Fall back to OS - unsure how to reliably query cntvct_el0 frequency.
  // The result is the monotonic clock expressed in nanoseconds.
  timespec now;
  clock_gettime(CLOCK_MONOTONIC, &now);
  ticks = now.tv_sec * 1000000000LL + now.tv_nsec;
#endif
  return ticks;
}
|
||||
|
||||
// Returns a 64-bit timestamp in units of 'ticks', to be taken immediately
// after the region to measure. Counterpart of Start64; on targets without a
// dedicated implementation it simply returns Start64().
inline uint64_t Stop64() {
  uint64_t ticks;
#if HWY_ARCH_PPC
  asm volatile("mfspr %0, %1" : "=r"(ticks) : "i"(268));
#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
  // RDTSCP / LFENCE, separated by compiler barriers.
  _ReadWriteBarrier();
  unsigned tsc_aux;
  ticks = __rdtscp(&tsc_aux);
  _ReadWriteBarrier();
  _mm_lfence();
  _ReadWriteBarrier();
#elif HWY_ARCH_X86_64
  // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
  asm volatile(
      "rdtscp\n\t"
      "shl $32, %%rdx\n\t"
      "or %%rdx, %0\n\t"
      "lfence"
      : "=a"(ticks)
      :
      // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
      // "cc" = flags modified by SHL.
      : "rcx", "rdx", "memory", "cc");
#else
  ticks = Start64();
#endif
  return ticks;
}
|
||||
|
||||
// Returns a 32-bit timestamp with about 4 cycles less overhead than
|
||||
// Start64. Only suitable for measuring very short regions because the
|
||||
// timestamp overflows about once a second.
|
||||
inline uint32_t Start32() {
|
||||
uint32_t t;
|
||||
#if HWY_ARCH_X86 && HWY_COMPILER_MSVC
|
||||
_ReadWriteBarrier();
|
||||
_mm_lfence();
|
||||
_ReadWriteBarrier();
|
||||
t = static_cast<uint32_t>(__rdtsc());
|
||||
_ReadWriteBarrier();
|
||||
_mm_lfence();
|
||||
_ReadWriteBarrier();
|
||||
#elif HWY_ARCH_X86_64
|
||||
asm volatile(
|
||||
"lfence\n\t"
|
||||
"rdtsc\n\t"
|
||||
"lfence"
|
||||
: "=a"(t)
|
||||
:
|
||||
// "memory" avoids reordering. rdx = TSC >> 32.
|
||||
: "rdx", "memory");
|
||||
#elif HWY_ARCH_RVV
|
||||
asm volatile("rdcycle %0" : "=r"(t));
|
||||
#else
|
||||
t = static_cast<uint32_t>(Start64());
|
||||
#endif
|
||||
return t;
|
||||
}
|
||||
|
||||
// 32-bit variant of Stop64; counterpart of Start32. On targets without a
// dedicated implementation it truncates the result of Stop64().
inline uint32_t Stop32() {
  uint32_t ticks;
#if HWY_ARCH_X86 && HWY_COMPILER_MSVC
  // RDTSCP / LFENCE, keeping only the low 32 bits of the TSC.
  _ReadWriteBarrier();
  unsigned tsc_aux;
  ticks = static_cast<uint32_t>(__rdtscp(&tsc_aux));
  _ReadWriteBarrier();
  _mm_lfence();
  _ReadWriteBarrier();
#elif HWY_ARCH_X86_64
  // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
  asm volatile(
      "rdtscp\n\t"
      "lfence"
      : "=a"(ticks)
      :
      // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
      : "rcx", "rdx", "memory");
#else
  ticks = static_cast<uint32_t>(Stop64());
#endif
  return ticks;
}
|
||||
|
||||
} // namespace timer
|
||||
|
||||
namespace robust_statistics {
|
||||
|
||||
// Sorts integral values in ascending order (e.g. for Mode). About 3x faster
|
||||
// than std::sort for input distributions with very few unique values.
|
||||
template <class T>
|
||||
void CountingSort(T* values, size_t num_values) {
|
||||
// Unique values and their frequency (similar to flat_map).
|
||||
using Unique = std::pair<T, int>;
|
||||
std::vector<Unique> unique;
|
||||
for (size_t i = 0; i < num_values; ++i) {
|
||||
const T value = values[i];
|
||||
const auto pos =
|
||||
std::find_if(unique.begin(), unique.end(),
|
||||
[value](const Unique u) { return u.first == value; });
|
||||
if (pos == unique.end()) {
|
||||
unique.push_back(std::make_pair(value, 1));
|
||||
} else {
|
||||
++pos->second;
|
||||
}
|
||||
}
|
||||
|
||||
// Sort in ascending order of value (pair.first).
|
||||
std::sort(unique.begin(), unique.end());
|
||||
|
||||
// Write that many copies of each unique value to the array.
|
||||
T* HWY_RESTRICT p = values;
|
||||
for (const auto& value_count : unique) {
|
||||
std::fill(p, p + value_count.second, value_count.first);
|
||||
p += value_count.second;
|
||||
}
|
||||
NANOBENCHMARK_CHECK(p == values + num_values);
|
||||
}
|
||||
|
||||
// @return i in [idx_begin, idx_begin + half_count) that minimizes
|
||||
// sorted[i + half_count] - sorted[i].
|
||||
template <typename T>
|
||||
size_t MinRange(const T* const HWY_RESTRICT sorted, const size_t idx_begin,
|
||||
const size_t half_count) {
|
||||
T min_range = std::numeric_limits<T>::max();
|
||||
size_t min_idx = 0;
|
||||
|
||||
for (size_t idx = idx_begin; idx < idx_begin + half_count; ++idx) {
|
||||
NANOBENCHMARK_CHECK(sorted[idx] <= sorted[idx + half_count]);
|
||||
const T range = sorted[idx + half_count] - sorted[idx];
|
||||
if (range < min_range) {
|
||||
min_range = range;
|
||||
min_idx = idx;
|
||||
}
|
||||
}
|
||||
|
||||
return min_idx;
|
||||
}
|
||||
|
||||
// Returns an estimate of the mode by calling MinRange on successively
|
||||
// halved intervals. "sorted" must be in ascending order. This is the
|
||||
// Half Sample Mode estimator proposed by Bickel in "On a fast, robust
|
||||
// estimator of the mode", with complexity O(N log N). The mode is less
|
||||
// affected by outliers in highly-skewed distributions than the median.
|
||||
// The averaging operation below assumes "T" is an unsigned integer type.
|
||||
template <typename T>
|
||||
T ModeOfSorted(const T* const HWY_RESTRICT sorted, const size_t num_values) {
|
||||
size_t idx_begin = 0;
|
||||
size_t half_count = num_values / 2;
|
||||
while (half_count > 1) {
|
||||
idx_begin = MinRange(sorted, idx_begin, half_count);
|
||||
half_count >>= 1;
|
||||
}
|
||||
|
||||
const T x = sorted[idx_begin + 0];
|
||||
if (half_count == 0) {
|
||||
return x;
|
||||
}
|
||||
NANOBENCHMARK_CHECK(half_count == 1);
|
||||
const T average = (x + sorted[idx_begin + 1] + 1) / 2;
|
||||
return average;
|
||||
}
|
||||
|
||||
// Returns the estimated mode of values[0, num_values).
// Side effect: "values" is sorted in place (via CountingSort).
template <typename T>
T Mode(T* values, const size_t num_values) {
  CountingSort(values, num_values);
  const T mode = ModeOfSorted(values, num_values);
  return mode;
}
|
||||
|
||||
// Convenience overload for fixed-size arrays; forwards to the pointer/count
// overload and therefore also sorts "values" in place.
template <typename T, size_t N>
T Mode(T (&values)[N]) {
  T* const first = &values[0];
  return Mode(first, N);
}
|
||||
|
||||
// Returns the median of values[0, num_values).
// Side effect: sorts "values" in place.
//
// "values" must be non-null and num_values must be non-zero. For an even
// count, the result is the average of the two middle elements, rounded up.
template <typename T>
T Median(T* values, const size_t num_values) {
  // "values" is a raw pointer, so there is no empty() to call on it; validate
  // the pointer and the explicit element count instead.
  NANOBENCHMARK_CHECK(values != nullptr && num_values != 0);
  std::sort(values, values + num_values);
  const size_t half = num_values / 2;
  // Odd count: return middle
  if (num_values % 2) {
    return values[half];
  }
  // Even count: return average of middle two.
  return (values[half] + values[half - 1] + 1) / 2;
}
|
||||
|
||||
// Returns a robust measure of variability.
|
||||
template <typename T>
|
||||
T MedianAbsoluteDeviation(const T* values, const size_t num_values,
|
||||
const T median) {
|
||||
NANOBENCHMARK_CHECK(num_values != 0);
|
||||
std::vector<T> abs_deviations;
|
||||
abs_deviations.reserve(num_values);
|
||||
for (size_t i = 0; i < num_values; ++i) {
|
||||
const int64_t abs = std::abs(int64_t(values[i]) - int64_t(median));
|
||||
abs_deviations.push_back(static_cast<T>(abs));
|
||||
}
|
||||
return Median(abs_deviations.data(), num_values);
|
||||
}
|
||||
|
||||
} // namespace robust_statistics
|
||||
|
||||
// Ticks := platform-specific timer values (CPU cycles on x86). Must be
|
||||
// unsigned to guarantee wraparound on overflow. 32 bit timers are faster to
|
||||
// read than 64 bit.
|
||||
using Ticks = uint32_t;
|
||||
|
||||
// Returns timer overhead / minimum measurable difference.
|
||||
Ticks TimerResolution() {
|
||||
// Nested loop avoids exceeding stack/L1 capacity.
|
||||
Ticks repetitions[Params::kTimerSamples];
|
||||
for (size_t rep = 0; rep < Params::kTimerSamples; ++rep) {
|
||||
Ticks samples[Params::kTimerSamples];
|
||||
for (size_t i = 0; i < Params::kTimerSamples; ++i) {
|
||||
const Ticks t0 = timer::Start32();
|
||||
const Ticks t1 = timer::Stop32();
|
||||
samples[i] = t1 - t0;
|
||||
}
|
||||
repetitions[rep] = robust_statistics::Mode(samples);
|
||||
}
|
||||
return robust_statistics::Mode(repetitions);
|
||||
}
|
||||
|
||||
static const Ticks timer_resolution = TimerResolution();
|
||||
|
||||
// Estimates the expected value of "lambda" values with a variable number of
|
||||
// samples until the variability "rel_mad" is less than "max_rel_mad".
|
||||
template <class Lambda>
|
||||
Ticks SampleUntilStable(const double max_rel_mad, double* rel_mad,
|
||||
const Params& p, const Lambda& lambda) {
|
||||
// Choose initial samples_per_eval based on a single estimated duration.
|
||||
Ticks t0 = timer::Start32();
|
||||
lambda();
|
||||
Ticks t1 = timer::Stop32();
|
||||
Ticks est = t1 - t0;
|
||||
static const double ticks_per_second = platform::InvariantTicksPerSecond();
|
||||
const size_t ticks_per_eval =
|
||||
static_cast<size_t>(ticks_per_second * p.seconds_per_eval);
|
||||
size_t samples_per_eval =
|
||||
est == 0 ? p.min_samples_per_eval : ticks_per_eval / est;
|
||||
samples_per_eval = std::max(samples_per_eval, p.min_samples_per_eval);
|
||||
|
||||
std::vector<Ticks> samples;
|
||||
samples.reserve(1 + samples_per_eval);
|
||||
samples.push_back(est);
|
||||
|
||||
// Percentage is too strict for tiny differences, so also allow a small
|
||||
// absolute "median absolute deviation".
|
||||
const Ticks max_abs_mad = (timer_resolution + 99) / 100;
|
||||
*rel_mad = 0.0; // ensure initialized
|
||||
|
||||
for (size_t eval = 0; eval < p.max_evals; ++eval, samples_per_eval *= 2) {
|
||||
samples.reserve(samples.size() + samples_per_eval);
|
||||
for (size_t i = 0; i < samples_per_eval; ++i) {
|
||||
t0 = timer::Start32();
|
||||
lambda();
|
||||
t1 = timer::Stop32();
|
||||
samples.push_back(t1 - t0);
|
||||
}
|
||||
|
||||
if (samples.size() >= p.min_mode_samples) {
|
||||
est = robust_statistics::Mode(samples.data(), samples.size());
|
||||
} else {
|
||||
// For "few" (depends also on the variance) samples, Median is safer.
|
||||
est = robust_statistics::Median(samples.data(), samples.size());
|
||||
}
|
||||
NANOBENCHMARK_CHECK(est != 0);
|
||||
|
||||
// Median absolute deviation (mad) is a robust measure of 'variability'.
|
||||
const Ticks abs_mad = robust_statistics::MedianAbsoluteDeviation(
|
||||
samples.data(), samples.size(), est);
|
||||
*rel_mad = static_cast<double>(int(abs_mad)) / est;
|
||||
|
||||
if (*rel_mad <= max_rel_mad || abs_mad <= max_abs_mad) {
|
||||
if (p.verbose) {
|
||||
printf("%6zu samples => %5u (abs_mad=%4u, rel_mad=%4.2f%%)\n",
|
||||
samples.size(), est, abs_mad, *rel_mad * 100.0);
|
||||
}
|
||||
return est;
|
||||
}
|
||||
}
|
||||
|
||||
if (p.verbose) {
|
||||
printf(
|
||||
"WARNING: rel_mad=%4.2f%% still exceeds %4.2f%% after %6zu samples.\n",
|
||||
*rel_mad * 100.0, max_rel_mad * 100.0, samples.size());
|
||||
}
|
||||
return est;
|
||||
}
|
||||
|
||||
using InputVec = std::vector<FuncInput>;
|
||||
|
||||
// Returns vector of unique input values.
|
||||
InputVec UniqueInputs(const FuncInput* inputs, const size_t num_inputs) {
|
||||
InputVec unique(inputs, inputs + num_inputs);
|
||||
std::sort(unique.begin(), unique.end());
|
||||
unique.erase(std::unique(unique.begin(), unique.end()), unique.end());
|
||||
return unique;
|
||||
}
|
||||
|
||||
// Returns how often we need to call func for sufficient precision, or zero
|
||||
// on failure (e.g. the elapsed time is too long for a 32-bit tick count).
|
||||
size_t NumSkip(const Func func, const uint8_t* arg, const InputVec& unique,
|
||||
const Params& p) {
|
||||
// Min elapsed ticks for any input.
|
||||
Ticks min_duration = ~0u;
|
||||
|
||||
for (const FuncInput input : unique) {
|
||||
// Make sure a 32-bit timer is sufficient.
|
||||
const uint64_t t0 = timer::Start64();
|
||||
PreventElision(func(arg, input));
|
||||
const uint64_t t1 = timer::Stop64();
|
||||
const uint64_t elapsed = t1 - t0;
|
||||
if (elapsed >= (1ULL << 30)) {
|
||||
fprintf(stderr, "Measurement failed: need 64-bit timer for input=%zu\n",
|
||||
input);
|
||||
return 0;
|
||||
}
|
||||
|
||||
double rel_mad;
|
||||
const Ticks total = SampleUntilStable(
|
||||
p.target_rel_mad, &rel_mad, p,
|
||||
[func, arg, input]() { PreventElision(func(arg, input)); });
|
||||
min_duration = std::min(min_duration, total - timer_resolution);
|
||||
}
|
||||
|
||||
// Number of repetitions required to reach the target resolution.
|
||||
const size_t max_skip = p.precision_divisor;
|
||||
// Number of repetitions given the estimated duration.
|
||||
const size_t num_skip =
|
||||
min_duration == 0 ? 0 : (max_skip + min_duration - 1) / min_duration;
|
||||
if (p.verbose) {
|
||||
printf("res=%u max_skip=%zu min_dur=%u num_skip=%zu\n", timer_resolution,
|
||||
max_skip, min_duration, num_skip);
|
||||
}
|
||||
return num_skip;
|
||||
}
|
||||
|
||||
// Replicates inputs until we can omit "num_skip" occurrences of an input.
|
||||
InputVec ReplicateInputs(const FuncInput* inputs, const size_t num_inputs,
|
||||
const size_t num_unique, const size_t num_skip,
|
||||
const Params& p) {
|
||||
InputVec full;
|
||||
if (num_unique == 1) {
|
||||
full.assign(p.subset_ratio * num_skip, inputs[0]);
|
||||
return full;
|
||||
}
|
||||
|
||||
full.reserve(p.subset_ratio * num_skip * num_inputs);
|
||||
for (size_t i = 0; i < p.subset_ratio * num_skip; ++i) {
|
||||
full.insert(full.end(), inputs, inputs + num_inputs);
|
||||
}
|
||||
std::mt19937 rng;
|
||||
std::shuffle(full.begin(), full.end(), rng);
|
||||
return full;
|
||||
}
|
||||
|
||||
// Copies the "full" to "subset" in the same order, but with "num_skip"
|
||||
// randomly selected occurrences of "input_to_skip" removed.
|
||||
void FillSubset(const InputVec& full, const FuncInput input_to_skip,
|
||||
const size_t num_skip, InputVec* subset) {
|
||||
const size_t count =
|
||||
static_cast<size_t>(std::count(full.begin(), full.end(), input_to_skip));
|
||||
// Generate num_skip random indices: which occurrence to skip.
|
||||
std::vector<uint32_t> omit(count);
|
||||
std::iota(omit.begin(), omit.end(), 0);
|
||||
// omit[] is the same on every call, but that's OK because they identify the
|
||||
// Nth instance of input_to_skip, so the position within full[] differs.
|
||||
std::mt19937 rng;
|
||||
std::shuffle(omit.begin(), omit.end(), rng);
|
||||
omit.resize(num_skip);
|
||||
std::sort(omit.begin(), omit.end());
|
||||
|
||||
uint32_t occurrence = ~0u; // 0 after preincrement
|
||||
size_t idx_omit = 0; // cursor within omit[]
|
||||
size_t idx_subset = 0; // cursor within *subset
|
||||
for (const FuncInput next : full) {
|
||||
if (next == input_to_skip) {
|
||||
++occurrence;
|
||||
// Haven't removed enough already
|
||||
if (idx_omit < num_skip) {
|
||||
// This one is up for removal
|
||||
if (occurrence == omit[idx_omit]) {
|
||||
++idx_omit;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (idx_subset < subset->size()) {
|
||||
(*subset)[idx_subset++] = next;
|
||||
}
|
||||
}
|
||||
NANOBENCHMARK_CHECK(idx_subset == subset->size());
|
||||
NANOBENCHMARK_CHECK(idx_omit == omit.size());
|
||||
NANOBENCHMARK_CHECK(occurrence == count - 1);
|
||||
}
|
||||
|
||||
// Returns total ticks elapsed for all inputs.
|
||||
Ticks TotalDuration(const Func func, const uint8_t* arg, const InputVec* inputs,
|
||||
const Params& p, double* max_rel_mad) {
|
||||
double rel_mad;
|
||||
const Ticks duration =
|
||||
SampleUntilStable(p.target_rel_mad, &rel_mad, p, [func, arg, inputs]() {
|
||||
for (const FuncInput input : *inputs) {
|
||||
PreventElision(func(arg, input));
|
||||
}
|
||||
});
|
||||
*max_rel_mad = std::max(*max_rel_mad, rel_mad);
|
||||
return duration;
|
||||
}
|
||||
|
||||
// (Nearly) empty Func for measuring timer overhead/resolution.
|
||||
HWY_NOINLINE FuncOutput EmptyFunc(const void* /*arg*/, const FuncInput input) {
|
||||
return input;
|
||||
}
|
||||
|
||||
// Returns overhead of accessing inputs[] and calling a function; this will
|
||||
// be deducted from future TotalDuration return values.
|
||||
Ticks Overhead(const uint8_t* arg, const InputVec* inputs, const Params& p) {
|
||||
double rel_mad;
|
||||
// Zero tolerance because repeatability is crucial and EmptyFunc is fast.
|
||||
return SampleUntilStable(0.0, &rel_mad, p, [arg, inputs]() {
|
||||
for (const FuncInput input : *inputs) {
|
||||
PreventElision(EmptyFunc(arg, input));
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// Returns 1 unless the 64-bit timer reads all-ones; the compiler cannot fold
// the result because it depends on a timer value.
int Unpredictable1() {
  const auto now = timer::Start64();
  return (now != ~0ULL) ? 1 : 0;
}
|
||||
|
||||
size_t Measure(const Func func, const uint8_t* arg, const FuncInput* inputs,
|
||||
const size_t num_inputs, Result* results, const Params& p) {
|
||||
NANOBENCHMARK_CHECK(num_inputs != 0);
|
||||
const InputVec& unique = UniqueInputs(inputs, num_inputs);
|
||||
|
||||
const size_t num_skip = NumSkip(func, arg, unique, p); // never 0
|
||||
if (num_skip == 0) return 0; // NumSkip already printed error message
|
||||
// (slightly less work on x86 to cast from signed integer)
|
||||
const float mul = 1.0f / static_cast<float>(static_cast<int>(num_skip));
|
||||
|
||||
const InputVec& full =
|
||||
ReplicateInputs(inputs, num_inputs, unique.size(), num_skip, p);
|
||||
InputVec subset(full.size() - num_skip);
|
||||
|
||||
const Ticks overhead = Overhead(arg, &full, p);
|
||||
const Ticks overhead_skip = Overhead(arg, &subset, p);
|
||||
if (overhead < overhead_skip) {
|
||||
fprintf(stderr, "Measurement failed: overhead %u < %u\n", overhead,
|
||||
overhead_skip);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (p.verbose) {
|
||||
printf("#inputs=%5zu,%5zu overhead=%5u,%5u\n", full.size(), subset.size(),
|
||||
overhead, overhead_skip);
|
||||
}
|
||||
|
||||
double max_rel_mad = 0.0;
|
||||
const Ticks total = TotalDuration(func, arg, &full, p, &max_rel_mad);
|
||||
|
||||
for (size_t i = 0; i < unique.size(); ++i) {
|
||||
FillSubset(full, unique[i], num_skip, &subset);
|
||||
const Ticks total_skip = TotalDuration(func, arg, &subset, p, &max_rel_mad);
|
||||
|
||||
if (total < total_skip) {
|
||||
fprintf(stderr, "Measurement failed: total %u < %u\n", total, total_skip);
|
||||
return 0;
|
||||
}
|
||||
|
||||
const Ticks duration = (total - overhead) - (total_skip - overhead_skip);
|
||||
results[i].input = unique[i];
|
||||
results[i].ticks = static_cast<float>(duration) * mul;
|
||||
results[i].variability = static_cast<float>(max_rel_mad);
|
||||
}
|
||||
|
||||
return unique.size();
|
||||
}
|
||||
|
||||
} // namespace hwy
|
|
@ -0,0 +1,186 @@
|
|||
// Copyright 2019 Google LLC
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef HIGHWAY_HWY_NANOBENCHMARK_H_
|
||||
#define HIGHWAY_HWY_NANOBENCHMARK_H_
|
||||
|
||||
// Benchmarks functions of a single integer argument with realistic branch
|
||||
// prediction hit rates. Uses a robust estimator to summarize the measurements.
|
||||
// The precision is about 0.2%.
|
||||
//
|
||||
// Examples: see nanobenchmark_test.cc.
|
||||
//
|
||||
// Background: Microbenchmarks such as http://github.com/google/benchmark
|
||||
// can measure elapsed times on the order of a microsecond. Shorter functions
|
||||
// are typically measured by repeating them thousands of times and dividing
|
||||
// the total elapsed time by this count. Unfortunately, repetition (especially
|
||||
// with the same input parameter!) influences the runtime. In time-critical
|
||||
// code, it is reasonable to expect warm instruction/data caches and TLBs,
|
||||
// but a perfect record of which branches will be taken is unrealistic.
|
||||
// Unless the application also repeatedly invokes the measured function with
|
||||
// the same parameter, the benchmark is measuring something very different -
|
||||
// a best-case result, almost as if the parameter were made a compile-time
|
||||
// constant. This may lead to erroneous conclusions about branch-heavy
|
||||
// algorithms outperforming branch-free alternatives.
|
||||
//
|
||||
// Our approach differs in three ways. Adding fences to the timer functions
|
||||
// reduces variability due to instruction reordering, improving the timer
|
||||
// resolution to about 40 CPU cycles. However, shorter functions must still
|
||||
// be invoked repeatedly. For more realistic branch prediction performance,
|
||||
// we vary the input parameter according to a user-specified distribution.
|
||||
// Thus, instead of VaryInputs(Measure(Repeat(func))), we change the
|
||||
// loop nesting to Measure(Repeat(VaryInputs(func))). We also estimate the
|
||||
// central tendency of the measurement samples with the "half sample mode",
|
||||
// which is more robust to outliers and skewed data than the mean or median.
|
||||
|
||||
// WARNING if included from multiple translation units compiled with distinct
|
||||
// flags: this header requires textual inclusion and a predefined NB_NAMESPACE
|
||||
// macro that is unique to the current compile flags. We must also avoid
|
||||
// standard library headers such as vector and functional that define functions.
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
// Set NANOBENCHMARK_ENABLE_CHECKS to a nonzero value to enable sanity checks
// that verify correct operation at the cost of longer benchmark runs.
#ifndef NANOBENCHMARK_ENABLE_CHECKS
#define NANOBENCHMARK_ENABLE_CHECKS 0
#endif

// Aborts with a message on stderr (including the line number) if "condition"
// is false. Requires fprintf/stderr (<stdio.h>) and abort (<stdlib.h>) to be
// visible at the point of use.
//
// Wrapped in do { } while (0) so the expansion is a single statement that
// consumes the caller's trailing semicolon; this keeps unbraced
// `if (x) NANOBENCHMARK_CHECK_ALWAYS(y); else ...` well-formed.
#define NANOBENCHMARK_CHECK_ALWAYS(condition)                               \
  do {                                                                      \
    if (!(condition)) {                                                     \
      fprintf(stderr, "Nanobenchmark check failed at line %d\n", __LINE__); \
      abort();                                                              \
    }                                                                       \
  } while (0)

// Same check, but compiled out (the condition is not evaluated) unless
// NANOBENCHMARK_ENABLE_CHECKS is nonzero.
#if NANOBENCHMARK_ENABLE_CHECKS
#define NANOBENCHMARK_CHECK(condition) NANOBENCHMARK_CHECK_ALWAYS(condition)
#else
#define NANOBENCHMARK_CHECK(condition)
#endif
|
||||
|
||||
namespace hwy {
|
||||
|
||||
namespace platform {
|
||||
|
||||
// Returns tick rate, useful for converting measurements to seconds. Invariant
|
||||
// means the tick counter frequency is independent of CPU throttling or sleep.
|
||||
// This call may be expensive, callers should cache the result.
|
||||
double InvariantTicksPerSecond();
|
||||
|
||||
} // namespace platform
|
||||
|
||||
// Returns 1, but without the compiler knowing what the value is. This prevents
|
||||
// optimizing out code.
|
||||
int Unpredictable1();
|
||||
|
||||
// Input influencing the function being measured (e.g. number of bytes to copy).
|
||||
using FuncInput = size_t;
|
||||
|
||||
// "Proof of work" returned by Func to ensure the compiler does not elide it.
|
||||
using FuncOutput = uint64_t;
|
||||
|
||||
// Function to measure: either 1) a captureless lambda or function with two
|
||||
// arguments or 2) a lambda with capture, in which case the first argument
|
||||
// is reserved for use by MeasureClosure.
|
||||
using Func = FuncOutput (*)(const void*, FuncInput);
|
||||
|
||||
// Tuning knobs that trade measurement time against precision/resolution.
struct Params {
  // Number of samples per level when measuring timer overhead/resolution.
  // It is used in a nested loop, hence quadratic time, which is acceptable
  // because timer overhead is low. constexpr because it sizes arrays.
  static constexpr size_t kTimerSamples = 256;

  // Best-case precision as a divisor of the timer resolution. A larger value
  // means more calls to Func and higher precision.
  size_t precision_divisor{1024};

  // Size ratio between the full and the subset input distributions. Must be
  // at least 2; larger values lengthen the measurement but model the given
  // input distribution more faithfully.
  size_t subset_ratio{2};

  // Time budget that, together with the estimated Func duration, determines
  // how many calls are made before sample variability is checked. Larger
  // values increase measurement time, memory/cache use and precision.
  double seconds_per_eval{4E-3};

  // Lower bound on the number of samples taken before the central tendency
  // is estimated.
  size_t min_samples_per_eval{7};

  // Sample count from which the mode is used instead of the median. The mode
  // handles skewed/fat-tailed distributions better but needs enough samples
  // relative to the width of half-ranges.
  size_t min_mode_samples{64};

  // Largest acceptable variability (median absolute deviation / center).
  double target_rel_mad{0.002};

  // Give up after this many evaluations without reaching target_rel_mad, so
  // that measurement always terminates.
  size_t max_evals{9};

  // Whether additional statistics are printed to stdout.
  bool verbose{true};
};
|
||||
|
||||
// Measurement result for each unique input.
|
||||
struct Result {
|
||||
FuncInput input;
|
||||
|
||||
// Robust estimate (mode or median) of duration.
|
||||
float ticks;
|
||||
|
||||
// Measure of variability (median absolute deviation relative to "ticks").
|
||||
float variability;
|
||||
};
|
||||
|
||||
// Precisely measures the number of ticks elapsed when calling "func" with the
|
||||
// given inputs, shuffled to ensure realistic branch prediction hit rates.
|
||||
//
|
||||
// "func" returns a 'proof of work' to ensure its computations are not elided.
|
||||
// "arg" is passed to Func, or reserved for internal use by MeasureClosure.
|
||||
// "inputs" is an array of "num_inputs" (not necessarily unique) arguments to
|
||||
// "func". The values should be chosen to maximize coverage of "func". This
|
||||
// represents a distribution, so a value's frequency should reflect its
|
||||
// probability in the real application. Order does not matter; for example, a
|
||||
// uniform distribution over [0, 4) could be represented as {3,0,2,1}.
|
||||
// Returns how many Result were written to "results": one per unique input, or
|
||||
// zero if the measurement failed (an error message goes to stderr).
|
||||
size_t Measure(const Func func, const uint8_t* arg, const FuncInput* inputs,
|
||||
const size_t num_inputs, Result* results,
|
||||
const Params& p = Params());
|
||||
|
||||
// Calls operator() of the given closure (lambda function).
|
||||
template <class Closure>
|
||||
static FuncOutput CallClosure(const Closure* f, const FuncInput input) {
|
||||
return (*f)(input);
|
||||
}
|
||||
|
||||
// Same as Measure, except "closure" is typically a lambda function of
|
||||
// FuncInput -> FuncOutput with a capture list.
|
||||
template <class Closure>
|
||||
static inline size_t MeasureClosure(const Closure& closure,
|
||||
const FuncInput* inputs,
|
||||
const size_t num_inputs, Result* results,
|
||||
const Params& p = Params()) {
|
||||
return Measure(reinterpret_cast<Func>(&CallClosure<Closure>),
|
||||
reinterpret_cast<const uint8_t*>(&closure), inputs, num_inputs,
|
||||
results, p);
|
||||
}
|
||||
|
||||
} // namespace hwy
|
||||
|
||||
#endif // HIGHWAY_HWY_NANOBENCHMARK_H_
|
|
@ -0,0 +1,104 @@
|
|||
// Copyright 2019 Google LLC
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "hwy/nanobenchmark.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h> // strtol
|
||||
#include <unistd.h> // sleep
|
||||
|
||||
#include <random>
|
||||
|
||||
namespace hwy {
|
||||
namespace {
|
||||
|
||||
// Benchmarked function: one integer division by the input.
FuncOutput Div(const void*, FuncInput in) {
  // Benchmark invocations are independent of each other, so this measures
  // throughput. The dividend is arbitrary; callers pass a nonzero divisor.
  const FuncInput dividend = 0xFFFFF;
  return dividend / in;
}
|
||||
|
||||
template <size_t N>
|
||||
void MeasureDiv(const FuncInput (&inputs)[N]) {
|
||||
Result results[N];
|
||||
Params params;
|
||||
params.max_evals = 4; // avoid test timeout
|
||||
const size_t num_results = Measure(&Div, nullptr, inputs, N, results, params);
|
||||
for (size_t i = 0; i < num_results; ++i) {
|
||||
printf("%5zu: %6.2f ticks; MAD=%4.2f%%\n", results[i].input,
|
||||
results[i].ticks, results[i].variability * 100.0);
|
||||
}
|
||||
}
|
||||
|
||||
std::mt19937 rng;
|
||||
|
||||
// A function whose runtime depends on rng.
|
||||
FuncOutput Random(const void* /*arg*/, FuncInput in) {
|
||||
const size_t r = rng() & 0xF;
|
||||
uint32_t ret = in;
|
||||
for (size_t i = 0; i < r; ++i) {
|
||||
ret /= ((rng() & 1) + 2);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Ensure the measured variability is high.
|
||||
template <size_t N>
|
||||
void MeasureRandom(const FuncInput (&inputs)[N]) {
|
||||
Result results[N];
|
||||
Params p;
|
||||
p.max_evals = 4; // avoid test timeout
|
||||
p.verbose = false;
|
||||
const size_t num_results = Measure(&Random, nullptr, inputs, N, results, p);
|
||||
for (size_t i = 0; i < num_results; ++i) {
|
||||
NANOBENCHMARK_CHECK(results[i].variability > 1E-3);
|
||||
}
|
||||
}
|
||||
|
||||
template <size_t N>
|
||||
void EnsureLongMeasurementFails(const FuncInput (&inputs)[N]) {
|
||||
printf("Expect a 'measurement failed' below:\n");
|
||||
Result results[N];
|
||||
|
||||
const size_t num_results = Measure(
|
||||
[](const void*, const FuncInput input) -> FuncOutput {
|
||||
// Loop until the sleep succeeds (not interrupted by signal). We assume
|
||||
// >= 512 MHz, so 2 seconds will exceed the 1 << 30 tick safety limit.
|
||||
while (sleep(2) != 0) {
|
||||
}
|
||||
return input;
|
||||
},
|
||||
nullptr, inputs, N, results);
|
||||
NANOBENCHMARK_CHECK(num_results == 0);
|
||||
(void)num_results;
|
||||
}
|
||||
|
||||
void RunAll(const int argc, char** /*argv*/) {
|
||||
// unpredictable == 1 but the compiler doesn't know that.
|
||||
const int unpredictable = argc != 999;
|
||||
static const FuncInput inputs[] = {static_cast<FuncInput>(unpredictable) + 2,
|
||||
static_cast<FuncInput>(unpredictable + 9)};
|
||||
|
||||
MeasureDiv(inputs);
|
||||
MeasureRandom(inputs);
|
||||
EnsureLongMeasurementFails(inputs);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace hwy
|
||||
|
||||
// Test entry point: runs all tests and always exits with status 0.
int main(int argc, char** argv) {
  hwy::RunAll(argc, argv);
  return 0;
}
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -0,0 +1,226 @@
|
|||
// Copyright 2020 Google LLC
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Sets macros based on HWY_TARGET.
|
||||
|
||||
// Toggling include guard: HWY_SET_MACROS_PER_TARGET is flipped whenever its
// defined-ness matches that of HWY_TARGET_TOGGLE, i.e. undefined if both are
// defined and defined if neither is. foreach_target toggles this guard, so the
// name avoids the usual _H_ suffix to keep copybara from renaming it.
#if defined(HWY_SET_MACROS_PER_TARGET) && defined(HWY_TARGET_TOGGLE)
#undef HWY_SET_MACROS_PER_TARGET
#elif !defined(HWY_SET_MACROS_PER_TARGET) && !defined(HWY_TARGET_TOGGLE)
#define HWY_SET_MACROS_PER_TARGET
#endif  // HWY_SET_MACROS_PER_TARGET
|
||||
|
||||
#include "hwy/targets.h"
|
||||
|
||||
// Drop the definitions left by a previous inclusion so that every macro below
// reflects the current HWY_TARGET.
#undef HWY_NAMESPACE
#undef HWY_ALIGN
#undef HWY_LANES
#undef HWY_CAP_INTEGER64
#undef HWY_CAP_FLOAT64
#undef HWY_CAP_GE256
#undef HWY_CAP_GE512
#undef HWY_TARGET_STR

// Each branch defines, in this order:
//   HWY_NAMESPACE      per-target namespace name
//   HWY_ALIGN          alignment specifier for vector-sized data
//   HWY_LANES(T)       lane-count constant for lane type T
//   HWY_CAP_*          0/1 capability flags
//   HWY_TARGET_STR     compiler target feature list (omitted by some targets)

//-----------------------------------------------------------------------------
// SSE4
#if HWY_TARGET == HWY_SSE4

#define HWY_NAMESPACE N_SSE4
#define HWY_ALIGN alignas(16)
#define HWY_LANES(T) (16 / sizeof(T))
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
#define HWY_TARGET_STR "sse2,ssse3,sse4.1"

//-----------------------------------------------------------------------------
// AVX2
#elif HWY_TARGET == HWY_AVX2

#define HWY_NAMESPACE N_AVX2
#define HWY_ALIGN alignas(32)
#define HWY_LANES(T) (32 / sizeof(T))
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 1
#define HWY_CAP_GE512 0
// BMI/BMI2/FMA are left out when HWY_DISABLE_BMI2_FMA is defined.
#if !defined(HWY_DISABLE_BMI2_FMA)
#define HWY_TARGET_STR "avx,avx2,bmi,bmi2,fma,f16c"
#else
#define HWY_TARGET_STR "avx,avx2,f16c"
#endif

//-----------------------------------------------------------------------------
// AVX3
#elif HWY_TARGET == HWY_AVX3

#define HWY_NAMESPACE N_AVX3
#define HWY_ALIGN alignas(64)
#define HWY_LANES(T) (64 / sizeof(T))
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 1
#define HWY_CAP_GE512 1
// Includes the AVX2 features because an AVX3 test may call AVX2 functions
// (e.g. when converting to half-vectors). HWY_DISABLE_BMI2_FMA is not
// consulted: if AVX3 is available, BMI2/FMA should be as well.
#define HWY_TARGET_STR \
  "avx,avx2,bmi,bmi2,fma,f16c,avx512f,avx512vl,avx512dq,avx512bw"

//-----------------------------------------------------------------------------
// PPC8
#elif HWY_TARGET == HWY_PPC8

#define HWY_NAMESPACE N_PPC8
#define HWY_ALIGN alignas(16)
#define HWY_LANES(T) (16 / sizeof(T))
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
#define HWY_TARGET_STR "altivec,vsx"

//-----------------------------------------------------------------------------
// NEON
#elif HWY_TARGET == HWY_NEON

#define HWY_NAMESPACE N_NEON
#define HWY_ALIGN alignas(16)
#define HWY_LANES(T) (16 / sizeof(T))
#define HWY_CAP_INTEGER64 1
// 64-bit float lanes only when HWY_ARCH_ARM_A64 is nonzero.
#if HWY_ARCH_ARM_A64
#define HWY_CAP_FLOAT64 1
#else
#define HWY_CAP_FLOAT64 0
#endif
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
// HWY_TARGET_STR is intentionally left undefined so HWY_ATTR is a no-op.

//-----------------------------------------------------------------------------
// WASM
#elif HWY_TARGET == HWY_WASM

#define HWY_NAMESPACE N_WASM
#define HWY_ALIGN alignas(16)
#define HWY_LANES(T) (16 / sizeof(T))
#define HWY_CAP_INTEGER64 0
#define HWY_CAP_FLOAT64 0
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
#define HWY_TARGET_STR "simd128"

//-----------------------------------------------------------------------------
// RVV
#elif HWY_TARGET == HWY_RVV

#define HWY_NAMESPACE N_RVV
// Empty: RVV only requires lane alignment, not natural alignment of the whole
// vector, and the compiler already aligns builtin types.
#define HWY_ALIGN
// An arbitrary constant, NOT the actual lane count. It is large enough to be
// multiplied/divided by 8 for LMUL, and matches kMaxVectorSize (see base.h).
#define HWY_LANES(T) (4096 / sizeof(T))
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
// HWY_TARGET_STR is intentionally left undefined so HWY_ATTR is a no-op
// (rv64gcv is not a valid target).

//-----------------------------------------------------------------------------
// SCALAR
#elif HWY_TARGET == HWY_SCALAR

#define HWY_NAMESPACE N_SCALAR
#define HWY_ALIGN
#define HWY_LANES(T) 1
#define HWY_CAP_INTEGER64 1
#define HWY_CAP_FLOAT64 1
#define HWY_CAP_GE256 0
#define HWY_CAP_GE512 0
// HWY_TARGET_STR is intentionally left undefined so HWY_ATTR is a no-op.

#else
#pragma message("HWY_TARGET does not match any known target")
#endif  // HWY_TARGET
|
||||
|
||||
// HWY_BEFORE_NAMESPACE(): pushes the target attributes when HWY_TARGET_STR is
// defined. Clang <9 requires this to be invoked at file scope, before any
// namespace. Both variants end in a static_assert so the caller must write a
// trailing semicolon.
#undef HWY_BEFORE_NAMESPACE
#ifndef HWY_TARGET_STR
// Nothing to push; the static_assert alone avoids a compiler warning.
#define HWY_BEFORE_NAMESPACE() \
  static_assert(true, "For requiring trailing semicolon")
#else
#define HWY_BEFORE_NAMESPACE()        \
  HWY_PUSH_ATTRIBUTES(HWY_TARGET_STR) \
  static_assert(true, "For requiring trailing semicolon")
#endif
|
||||
|
||||
// HWY_AFTER_NAMESPACE(): pops the attributes pushed by HWY_BEFORE_NAMESPACE
// when HWY_TARGET_STR is defined. Clang <9 requires all namespaces to be
// closed before this macro. Both variants end in a static_assert so the
// caller must write a trailing semicolon.
#undef HWY_AFTER_NAMESPACE
#ifndef HWY_TARGET_STR
// Nothing to pop; the static_assert alone avoids a compiler warning.
#define HWY_AFTER_NAMESPACE() \
  static_assert(true, "For requiring trailing semicolon")
#else
#define HWY_AFTER_NAMESPACE() \
  HWY_POP_ATTRIBUTES          \
  static_assert(true, "For requiring trailing semicolon")
#endif
|
||||
|
||||
// HWY_ATTR: function attribute selecting the current target's instruction
// set. It expands to nothing when no HWY_TARGET_STR is defined or when
// HWY_HAS_ATTRIBUTE(target) evaluates to zero.
#undef HWY_ATTR
#if !defined(HWY_TARGET_STR) || !HWY_HAS_ATTRIBUTE(target)
#define HWY_ATTR
#else
#define HWY_ATTR __attribute__((target(HWY_TARGET_STR)))
#endif

// DEPRECATED: alias of HWY_LANES(T).
#undef HWY_GATHER_LANES
#define HWY_GATHER_LANES(T) HWY_LANES(T)
|
|
@ -0,0 +1,125 @@
|
|||
// Copyright 2020 Google LLC
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Per-target definitions shared by ops/*.h and user code.
|
||||
|
||||
#include <cmath>
|
||||
|
||||
// Separate header because foreach_target.h re-enables its include guard.
|
||||
#include "hwy/ops/set_macros-inl.h"
|
||||
|
||||
// Relies on the external include guard in highway.h.
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
// Descriptor ("tag") type D := Simd<T, N> used to select overloaded SIMD
// operations. T is the lane type; N is a lane count >= 1 that must be a power
// of two. Users normally do not pick N themselves but use HWY_FULL(T[, LMUL]),
// the largest available size. N is not necessarily the actual number of
// lanes; that is returned by Lanes(D()).
//
// Only HWY_FULL(T) and N <= 16 / sizeof(T) are guaranteed to be available; the
// latter are useful when vectors wider than 128 bits are unnecessary or
// undesirable.
template <typename Lane, size_t N>
struct Simd {
  static_assert(N != 0 && (N & (N - 1)) == 0, "N must be a power of two");

  constexpr Simd() = default;

  // The lane type.
  using T = Lane;

  // Ops that widen or narrow change the lane type and/or the lane count. The
  // aliases below name the descriptors of their results.

  // Same lane type, half as many lanes (for LowerHalf()). Rounds up, so a
  // single-lane descriptor maps to itself.
  using Half = Simd<Lane, (N + 1) / 2>;

  // Same lane type, twice as many lanes (for Combine()).
  using Twice = Simd<Lane, N * 2>;

  // Different lane type, same number of lanes (for PromoteTo/DemoteTo()).
  template <typename NewLane>
  using Rebind = Simd<NewLane, N>;

  // Different lane type, same total size in bytes (for MulEven()). Rounds up,
  // so a single narrow lane still yields one wider lane.
  template <typename NewLane>
  using Repartition =
      Simd<NewLane, (N * sizeof(Lane) + sizeof(NewLane) - 1) / sizeof(NewLane)>;
};
|
||||
|
||||
template <class D>
|
||||
using TFromD = typename D::T;
|
||||
|
||||
// Descriptor for the same number of lanes as D, but with the LaneType T.
|
||||
template <class T, class D>
|
||||
using Rebind = typename D::template Rebind<T>;
|
||||
|
||||
template <class D>
|
||||
using RebindToSigned = Rebind<MakeSigned<TFromD<D>>, D>;
|
||||
template <class D>
|
||||
using RebindToUnsigned = Rebind<MakeUnsigned<TFromD<D>>, D>;
|
||||
template <class D>
|
||||
using RebindToFloat = Rebind<MakeFloat<TFromD<D>>, D>;
|
||||
|
||||
// Descriptor for the same total size as D, but with the LaneType T.
|
||||
template <class T, class D>
|
||||
using Repartition = typename D::template Repartition<T>;
|
||||
|
||||
template <class D>
|
||||
using RepartitionToWide = Repartition<MakeWide<TFromD<D>>, D>;
|
||||
template <class D>
|
||||
using RepartitionToNarrow = Repartition<MakeNarrow<TFromD<D>>, D>;
|
||||
|
||||
// Descriptor for the same lane type as D, but half the lanes.
|
||||
template <class D>
|
||||
using Half = typename D::Half;
|
||||
|
||||
// Descriptor for the same lane type as D, but twice the lanes.
|
||||
template <class D>
|
||||
using Twice = typename D::Twice;
|
||||
|
||||
// Same as base.h macros but with a Simd<T, N> argument instead of T.
|
||||
#define HWY_IF_UNSIGNED_D(D) HWY_IF_UNSIGNED(TFromD<D>)
|
||||
#define HWY_IF_SIGNED_D(D) HWY_IF_SIGNED(TFromD<D>)
|
||||
#define HWY_IF_FLOAT_D(D) HWY_IF_FLOAT(TFromD<D>)
|
||||
#define HWY_IF_NOT_FLOAT_D(D) HWY_IF_NOT_FLOAT(TFromD<D>)
|
||||
#define HWY_IF_LANE_SIZE_D(D, bytes) HWY_IF_LANE_SIZE(TFromD<D>, bytes)
|
||||
#define HWY_IF_NOT_LANE_SIZE_D(D, bytes) HWY_IF_NOT_LANE_SIZE(TFromD<D>, bytes)
|
||||
|
||||
// Compile-time-constant, (typically but not guaranteed) an upper bound on the
|
||||
// number of lanes.
|
||||
// Prefer instead using Lanes() and dynamic allocation, or Rebind, or
|
||||
// `#if HWY_CAP_GE*`.
|
||||
template <typename T, size_t N>
|
||||
HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(Simd<T, N>) {
|
||||
return N;
|
||||
}
|
||||
|
||||
// Targets with non-constexpr Lanes define this themselves.
|
||||
#if HWY_TARGET != HWY_RVV
|
||||
|
||||
// (Potentially) non-constant actual size of the vector at runtime, subject to
|
||||
// the limit imposed by the Simd. Useful for advancing loop counters.
|
||||
template <typename T, size_t N>
|
||||
HWY_INLINE HWY_MAYBE_UNUSED size_t Lanes(Simd<T, N>) {
|
||||
return N;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -0,0 +1,286 @@
|
|||
// Copyright 2019 Google LLC
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "hwy/targets.h"
|
||||
|
||||
#include <stdarg.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include <atomic>
|
||||
#include <limits>
|
||||
|
||||
#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
|
||||
defined(THREAD_SANITIZER)
|
||||
#include "sanitizer/common_interface_defs.h" // __sanitizer_print_stack_trace
|
||||
#endif // defined(*_SANITIZER)
|
||||
|
||||
#if HWY_ARCH_X86
|
||||
#include <xmmintrin.h>
|
||||
#if HWY_COMPILER_MSVC
|
||||
#include <intrin.h>
|
||||
#else // HWY_COMPILER_MSVC
|
||||
#include <cpuid.h>
|
||||
#endif // HWY_COMPILER_MSVC
|
||||
#endif
|
||||
|
||||
namespace hwy {
|
||||
namespace {
|
||||
|
||||
#if HWY_ARCH_X86
|
||||
|
||||
// Returns true if bit number `index` (0 = least significant) of `reg` is set.
// `index` must be in [0, 32).
bool IsBitSet(const uint32_t reg, const int index) {
  const uint32_t bit = (reg >> index) & 1U;
  return bit != 0;
}
|
||||
|
||||
// Calls CPUID instruction with eax=level and ecx=count and returns the result
|
||||
// in abcd array where abcd = {eax, ebx, ecx, edx} (hence the name abcd).
|
||||
void Cpuid(const uint32_t level, const uint32_t count,
|
||||
uint32_t* HWY_RESTRICT abcd) {
|
||||
#if HWY_COMPILER_MSVC
|
||||
int regs[4];
|
||||
__cpuidex(regs, level, count);
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
abcd[i] = regs[i];
|
||||
}
|
||||
#else // HWY_COMPILER_MSVC
|
||||
uint32_t a;
|
||||
uint32_t b;
|
||||
uint32_t c;
|
||||
uint32_t d;
|
||||
__cpuid_count(level, count, a, b, c, d);
|
||||
abcd[0] = a;
|
||||
abcd[1] = b;
|
||||
abcd[2] = c;
|
||||
abcd[3] = d;
|
||||
#endif // HWY_COMPILER_MSVC
|
||||
}
|
||||
|
||||
// Returns the lower 32 bits of extended control register 0.
|
||||
// Requires CPU support for "OSXSAVE" (see below).
|
||||
uint32_t ReadXCR0() {
#if HWY_COMPILER_MSVC
  // The intrinsic returns the full register; only the low half is kept.
  return static_cast<uint32_t>(_xgetbv(0));
#else   // HWY_COMPILER_MSVC
  // Same operation as the MSVC branch, emitted as raw opcode bytes.
  // ECX selects the extended control register; the result arrives in
  // EDX:EAX and only the EAX half is returned.
  uint32_t lo, hi;
  const uint32_t xcr_index = 0;
  asm volatile(".byte 0x0F, 0x01, 0xD0"
               : "=a"(lo), "=d"(hi)
               : "c"(xcr_index));
  return lo;
#endif  // HWY_COMPILER_MSVC
}
|
||||
|
||||
#endif // HWY_ARCH_X86
|
||||
|
||||
// Not function-local => no compiler-generated locking.
// Cached result of CPU detection; zero means "not yet initialized".
std::atomic<uint32_t> supported_{0};

// When running tests, this value can be set to the mocked supported targets
// mask. Only written to from a single thread before the test starts.
uint32_t supported_targets_for_test_ = 0;

// Mask applied to every SupportedTargets() result: a cleared bit means that
// target was disabled at runtime via DisableTargets(); all bits are set
// initially (nothing disabled). Atomic because DisableTargets() writes it
// while SupportedTargets() reads it, and the two can run on different
// threads.
std::atomic<uint32_t> supported_mask_{std::numeric_limits<uint32_t>::max()};
|
||||
|
||||
#if HWY_ARCH_X86
|
||||
// Bits indicating which instruction set extensions are supported.
|
||||
// SSE family; all of these must be present for the SSE4 target.
constexpr uint32_t kSSE = 1u << 0;
constexpr uint32_t kSSE2 = 1u << 1;
constexpr uint32_t kSSE3 = 1u << 2;
constexpr uint32_t kSSSE3 = 1u << 3;
constexpr uint32_t kSSE41 = 1u << 4;
constexpr uint32_t kSSE42 = 1u << 5;
constexpr uint32_t kGroupSSE4 =
    kSSE | kSSE2 | kSSE3 | kSSSE3 | kSSE41 | kSSE42;

// AVX2 and the extensions normally expected alongside it.
constexpr uint32_t kAVX = 1u << 6;
constexpr uint32_t kAVX2 = 1u << 7;
constexpr uint32_t kFMA = 1u << 8;
constexpr uint32_t kLZCNT = 1u << 9;
constexpr uint32_t kBMI = 1u << 10;
constexpr uint32_t kBMI2 = 1u << 11;

// We normally assume BMI/BMI2/FMA are available if AVX2 is. This allows us to
// use BZHI and (compiler-generated) MULX. However, VirtualBox lacks them
// [https://www.virtualbox.org/ticket/15471]. Defining HWY_DISABLE_BMI2_FMA
// drops them from the required set so AVX2 can still be used there.
#ifdef HWY_DISABLE_BMI2_FMA
constexpr uint32_t kGroupAVX2 = kAVX | kAVX2 | kLZCNT;
#else
constexpr uint32_t kGroupAVX2 = kAVX | kAVX2 | kFMA | kLZCNT | kBMI | kBMI2;
#endif

// AVX-512 subsets; all of these must be present for the AVX3 target.
constexpr uint32_t kAVX512F = 1u << 12;
constexpr uint32_t kAVX512VL = 1u << 13;
constexpr uint32_t kAVX512DQ = 1u << 14;
constexpr uint32_t kAVX512BW = 1u << 15;
constexpr uint32_t kGroupAVX3 = kAVX512F | kAVX512VL | kAVX512DQ | kAVX512BW;
|
||||
#endif
|
||||
|
||||
} // namespace
|
||||
|
||||
// Formats a printf-style message, prints it to stderr prefixed with the
// file/line of the failure, then terminates the process. Never returns.
HWY_NORETURN void HWY_FORMAT(3, 4)
    Abort(const char* file, int line, const char* format, ...) {
  // vsnprintf truncates messages that do not fit.
  char message[2000];
  va_list va;
  va_start(va, format);
  vsnprintf(message, sizeof(message), format, va);
  va_end(va);

  fprintf(stderr, "Abort at %s:%d: %s\n", file, line, message);
#if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
    defined(THREAD_SANITIZER)
  // Under a sanitizer, also print a stack trace. This call returns normally;
  // the trap below is what ends the program, which also lets gdb break there.
  __sanitizer_print_stack_trace();
#endif  // defined(*_SANITIZER)
  fflush(stderr);

#if HWY_COMPILER_MSVC
  abort();  // Compile error without this due to HWY_NORETURN.
#else
  __builtin_trap();
#endif
}
|
||||
|
||||
void DisableTargets(uint32_t disabled_targets) {
  // Baseline targets cannot be disabled: strip them from the request before
  // inverting it into the "still allowed" mask.
  const uint32_t baseline = uint32_t(HWY_ENABLED_BASELINE);
  const uint32_t honored = disabled_targets & ~baseline;
  supported_mask_ = ~honored;

  // Only reset the chosen target here. Calling Update() instead would
  // initialize the mask, but it would also call SupportedTargets(), which
  // tests use to tell whether any dynamic dispatch function was used.
  chosen_target.DeInit();
}
|
||||
|
||||
void SetSupportedTargetsForTest(uint32_t targets) {
  // Clearing the cache makes the next SupportedTargets() call re-evaluate;
  // it then picks up the mock stored below, unless the mock is zero.
  supported_.store(0, std::memory_order_release);
  supported_targets_for_test_ = targets;
  // Return the dispatch state to "uninitialized" as well.
  chosen_target.DeInit();
}
|
||||
|
||||
bool SupportedTargetsCalledForTest() {
|
||||
return supported_.load(std::memory_order_acquire) != 0;
|
||||
}
|
||||
|
||||
uint32_t SupportedTargets() {
|
||||
uint32_t bits = supported_.load(std::memory_order_acquire);
|
||||
// Already initialized?
|
||||
if (HWY_LIKELY(bits != 0)) {
|
||||
return bits & supported_mask_;
|
||||
}
|
||||
|
||||
// When running tests, this allows to mock the current supported targets.
|
||||
if (HWY_UNLIKELY(supported_targets_for_test_ != 0)) {
|
||||
// Store the value to signal that this was used.
|
||||
supported_.store(supported_targets_for_test_, std::memory_order_release);
|
||||
return supported_targets_for_test_ & supported_mask_;
|
||||
}
|
||||
|
||||
bits = HWY_SCALAR;
|
||||
|
||||
#if HWY_ARCH_X86
|
||||
uint32_t flags = 0;
|
||||
uint32_t abcd[4];
|
||||
|
||||
Cpuid(0, 0, abcd);
|
||||
const uint32_t max_level = abcd[0];
|
||||
|
||||
// Standard feature flags
|
||||
Cpuid(1, 0, abcd);
|
||||
flags |= IsBitSet(abcd[3], 25) ? kSSE : 0;
|
||||
flags |= IsBitSet(abcd[3], 26) ? kSSE2 : 0;
|
||||
flags |= IsBitSet(abcd[2], 0) ? kSSE3 : 0;
|
||||
flags |= IsBitSet(abcd[2], 9) ? kSSSE3 : 0;
|
||||
flags |= IsBitSet(abcd[2], 19) ? kSSE41 : 0;
|
||||
flags |= IsBitSet(abcd[2], 20) ? kSSE42 : 0;
|
||||
flags |= IsBitSet(abcd[2], 12) ? kFMA : 0;
|
||||
flags |= IsBitSet(abcd[2], 28) ? kAVX : 0;
|
||||
const bool has_osxsave = IsBitSet(abcd[2], 27);
|
||||
|
||||
// Extended feature flags
|
||||
Cpuid(0x80000001U, 0, abcd);
|
||||
flags |= IsBitSet(abcd[2], 5) ? kLZCNT : 0;
|
||||
|
||||
// Extended features
|
||||
if (max_level >= 7) {
|
||||
Cpuid(7, 0, abcd);
|
||||
flags |= IsBitSet(abcd[1], 3) ? kBMI : 0;
|
||||
flags |= IsBitSet(abcd[1], 5) ? kAVX2 : 0;
|
||||
flags |= IsBitSet(abcd[1], 8) ? kBMI2 : 0;
|
||||
|
||||
flags |= IsBitSet(abcd[1], 16) ? kAVX512F : 0;
|
||||
flags |= IsBitSet(abcd[1], 17) ? kAVX512DQ : 0;
|
||||
flags |= IsBitSet(abcd[1], 30) ? kAVX512BW : 0;
|
||||
flags |= IsBitSet(abcd[1], 31) ? kAVX512VL : 0;
|
||||
}
|
||||
|
||||
// Verify OS support for XSAVE, without which XMM/YMM registers are not
|
||||
// preserved across context switches and are not safe to use.
|
||||
if (has_osxsave) {
|
||||
const uint32_t xcr0 = ReadXCR0();
|
||||
// XMM
|
||||
if (!IsBitSet(xcr0, 1)) {
|
||||
flags = 0;
|
||||
}
|
||||
// YMM
|
||||
if (!IsBitSet(xcr0, 2)) {
|
||||
flags &= ~kGroupAVX2;
|
||||
}
|
||||
// ZMM + opmask
|
||||
if ((xcr0 & 0x70) != 0x70) {
|
||||
flags &= ~kGroupAVX3;
|
||||
}
|
||||
}
|
||||
|
||||
// Set target bit(s) if all their group's flags are all set.
|
||||
if ((flags & kGroupAVX3) == kGroupAVX3) {
|
||||
bits |= HWY_AVX3;
|
||||
}
|
||||
if ((flags & kGroupAVX2) == kGroupAVX2) {
|
||||
bits |= HWY_AVX2;
|
||||
}
|
||||
if ((flags & kGroupSSE4) == kGroupSSE4) {
|
||||
bits |= HWY_SSE4;
|
||||
}
|
||||
#else
|
||||
// TODO(janwas): detect for other platforms
|
||||
bits = HWY_ENABLED_BASELINE;
|
||||
#endif // HWY_ARCH_X86
|
||||
|
||||
if ((bits & HWY_ENABLED_BASELINE) != HWY_ENABLED_BASELINE) {
|
||||
fprintf(stderr, "WARNING: CPU supports %zx but software requires %x\n",
|
||||
size_t(bits), HWY_ENABLED_BASELINE);
|
||||
}
|
||||
|
||||
supported_.store(bits, std::memory_order_release);
|
||||
return bits & supported_mask_;
|
||||
}
|
||||
|
||||
// Declared in targets.h
|
||||
ChosenTarget chosen_target;
|
||||
|
||||
void ChosenTarget::Update() {
|
||||
// The supported variable contains the current CPU supported targets shifted
|
||||
// to the location expected by the ChosenTarget mask. We enabled SCALAR
|
||||
// regardless of whether it was compiled since it is also used as the
|
||||
// fallback mechanism to the baseline target.
|
||||
uint32_t supported = HWY_CHOSEN_TARGET_SHIFT(hwy::SupportedTargets()) |
|
||||
HWY_CHOSEN_TARGET_MASK_SCALAR;
|
||||
mask_.store(supported);
|
||||
}
|
||||
|
||||
} // namespace hwy
|
|
@ -0,0 +1,491 @@
|
|||
// Copyright 2020 Google LLC
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef HIGHWAY_HWY_TARGETS_H_
|
||||
#define HIGHWAY_HWY_TARGETS_H_
|
||||
|
||||
#include <vector>
|
||||
|
||||
// For SIMD module implementations and their callers. Defines which targets to
|
||||
// generate and call.
|
||||
|
||||
#include "hwy/base.h"
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Optional configuration
|
||||
|
||||
// See ../quick_reference.md for documentation of these macros.
|
||||
|
||||
// Uncomment to override the default baseline determined from predefined macros:
|
||||
// #define HWY_BASELINE_TARGETS (HWY_SSE4 | HWY_SCALAR)
|
||||
|
||||
// Uncomment to override the default blocklist:
|
||||
// #define HWY_BROKEN_TARGETS HWY_AVX3
|
||||
|
||||
// Uncomment to definitely avoid generating those target(s):
|
||||
// #define HWY_DISABLED_TARGETS HWY_SSE4
|
||||
|
||||
// Uncomment to avoid emitting BMI/BMI2/FMA instructions (allows generating
|
||||
// AVX2 target for VMs which support AVX2 but not the other instruction sets)
|
||||
// #define HWY_DISABLE_BMI2_FMA
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Targets
|
||||
|
||||
// Unique bit value for each target. A lower value is "better" (e.g. more lanes)
|
||||
// than a higher value within the same group/platform - see HWY_STATIC_TARGET.
|
||||
//
|
||||
// All values are unconditionally defined so we can test HWY_TARGETS without
|
||||
// first checking the HWY_ARCH_*.
|
||||
//
|
||||
// The C99 preprocessor evaluates #if expressions using intmax_t types, so we
|
||||
// can use 32-bit literals.
|
||||
|
||||
// 1,2,4: reserved
|
||||
#define HWY_AVX3 8
|
||||
#define HWY_AVX2 16
|
||||
// 32: reserved for AVX
|
||||
#define HWY_SSE4 64
|
||||
// 0x80, 0x100, 0x200: reserved for SSSE3, SSE3, SSE2
|
||||
|
||||
// The highest bit in the HWY_TARGETS mask that a x86 target can have. Used for
|
||||
// dynamic dispatch. All x86 target bits must be lower or equal to
|
||||
// (1 << HWY_HIGHEST_TARGET_BIT_X86) and they can only use
|
||||
// HWY_MAX_DYNAMIC_TARGETS in total.
|
||||
#define HWY_HIGHEST_TARGET_BIT_X86 9
|
||||
|
||||
// 0x400, 0x800, 0x1000 reserved for SVE, SVE2, Helium
|
||||
#define HWY_NEON 0x2000
|
||||
|
||||
#define HWY_HIGHEST_TARGET_BIT_ARM 13
|
||||
|
||||
// 0x4000, 0x8000 reserved
|
||||
#define HWY_PPC8 0x10000 // v2.07 or 3
|
||||
// 0x20000, 0x40000 reserved for prior VSX/AltiVec
|
||||
|
||||
#define HWY_HIGHEST_TARGET_BIT_PPC 18
|
||||
|
||||
// 0x80000 reserved
|
||||
#define HWY_WASM 0x100000
|
||||
|
||||
#define HWY_HIGHEST_TARGET_BIT_WASM 20
|
||||
|
||||
// 0x200000, 0x400000, 0x800000 reserved
|
||||
|
||||
#define HWY_RVV 0x1000000
|
||||
|
||||
#define HWY_HIGHEST_TARGET_BIT_RVV 24
|
||||
|
||||
// 0x2000000, 0x4000000, 0x8000000, 0x10000000 reserved
|
||||
|
||||
#define HWY_SCALAR 0x20000000
|
||||
// Cannot use higher values, otherwise HWY_TARGETS computation might overflow.
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Set default blocklists
|
||||
|
||||
// Disabled means excluded from enabled at user's request. A separate config
|
||||
// macro allows disabling without deactivating the blocklist below.
|
||||
#ifndef HWY_DISABLED_TARGETS
|
||||
#define HWY_DISABLED_TARGETS 0
|
||||
#endif
|
||||
|
||||
// Broken means excluded from enabled due to known compiler issues. Allow the
|
||||
// user to override this blocklist without any guarantee of success.
|
||||
#ifndef HWY_BROKEN_TARGETS
|
||||
|
||||
// x86 clang-6: we saw multiple AVX2/3 compile errors and in one case invalid
|
||||
// SSE4 codegen (msan failure), so disable all those targets.
|
||||
#if HWY_ARCH_X86 && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
|
||||
// TODO: Disable all non-scalar targets for every build target once we have
|
||||
// clang-7 enabled in our builders.
|
||||
#ifdef MEMORY_SANITIZER
|
||||
#define HWY_BROKEN_TARGETS (HWY_SSE4 | HWY_AVX2 | HWY_AVX3)
|
||||
#else
|
||||
#define HWY_BROKEN_TARGETS 0
|
||||
#endif
|
||||
// This entails a major speed reduction, so warn unless the user explicitly
|
||||
// opts in to scalar-only.
|
||||
#if !defined(HWY_COMPILE_ONLY_SCALAR)
|
||||
#pragma message("x86 Clang <= 6: define HWY_COMPILE_ONLY_SCALAR or upgrade.")
|
||||
#endif
|
||||
|
||||
// MSVC, or 32-bit may fail to compile AVX2/3.
|
||||
#elif HWY_COMPILER_MSVC != 0 || HWY_ARCH_X86_32
|
||||
#define HWY_BROKEN_TARGETS (HWY_AVX2 | HWY_AVX3)
|
||||
#pragma message("Disabling AVX2/3 due to known issues with MSVC/32-bit builds")
|
||||
|
||||
#else
|
||||
#define HWY_BROKEN_TARGETS 0
|
||||
#endif
|
||||
|
||||
#endif // HWY_BROKEN_TARGETS
|
||||
|
||||
// Enabled means not disabled nor blocklisted.
|
||||
#define HWY_ENABLED(targets) \
|
||||
((targets) & ~((HWY_DISABLED_TARGETS) | (HWY_BROKEN_TARGETS)))
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Detect baseline targets using predefined macros
|
||||
|
||||
// Baseline means the targets for which the compiler is allowed to generate
|
||||
// instructions, implying the target CPU would have to support them. Do not use
|
||||
// this directly because it does not take the blocklist into account. Allow the
|
||||
// user to override this without any guarantee of success.
|
||||
#ifndef HWY_BASELINE_TARGETS
|
||||
|
||||
#ifdef __wasm_simd128__
|
||||
#define HWY_BASELINE_WASM HWY_WASM
|
||||
#else
|
||||
#define HWY_BASELINE_WASM 0
|
||||
#endif
|
||||
|
||||
#ifdef __VSX__
|
||||
#define HWY_BASELINE_PPC8 HWY_PPC8
|
||||
#else
|
||||
#define HWY_BASELINE_PPC8 0
|
||||
#endif
|
||||
|
||||
// GCC 4.5.4 only defines the former; 5.4 defines both.
|
||||
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
|
||||
#define HWY_BASELINE_NEON HWY_NEON
|
||||
#else
|
||||
#define HWY_BASELINE_NEON 0
|
||||
#endif
|
||||
|
||||
#ifdef __SSE4_1__
|
||||
#define HWY_BASELINE_SSE4 HWY_SSE4
|
||||
#else
|
||||
#define HWY_BASELINE_SSE4 0
|
||||
#endif
|
||||
|
||||
#ifdef __AVX2__
|
||||
#define HWY_BASELINE_AVX2 HWY_AVX2
|
||||
#else
|
||||
#define HWY_BASELINE_AVX2 0
|
||||
#endif
|
||||
|
||||
#ifdef __AVX512F__
|
||||
#define HWY_BASELINE_AVX3 HWY_AVX3
|
||||
#else
|
||||
#define HWY_BASELINE_AVX3 0
|
||||
#endif
|
||||
|
||||
#ifdef __riscv_vector
|
||||
#define HWY_BASELINE_RVV HWY_RVV
|
||||
#else
|
||||
#define HWY_BASELINE_RVV 0
|
||||
#endif
|
||||
|
||||
#define HWY_BASELINE_TARGETS \
|
||||
(HWY_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 | HWY_BASELINE_NEON | \
|
||||
HWY_BASELINE_SSE4 | HWY_BASELINE_AVX2 | HWY_BASELINE_AVX3 | \
|
||||
HWY_BASELINE_RVV)
|
||||
|
||||
#endif // HWY_BASELINE_TARGETS
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Choose target for static dispatch
|
||||
|
||||
#define HWY_ENABLED_BASELINE HWY_ENABLED(HWY_BASELINE_TARGETS)
|
||||
#if HWY_ENABLED_BASELINE == 0
|
||||
#error "At least one baseline target must be defined and enabled"
|
||||
#endif
|
||||
|
||||
// Best baseline, used for static dispatch. This is the least-significant 1-bit
|
||||
// within HWY_ENABLED_BASELINE and lower bit values imply "better".
|
||||
#define HWY_STATIC_TARGET (HWY_ENABLED_BASELINE & -HWY_ENABLED_BASELINE)
|
||||
|
||||
// Start by assuming static dispatch. If we later use dynamic dispatch, this
|
||||
// will be defined to other targets during the multiple-inclusion, and finally
|
||||
// return to the initial value. Defining this outside begin/end_target ensures
|
||||
// inl headers successfully compile by themselves (required by Bazel).
|
||||
#define HWY_TARGET HWY_STATIC_TARGET
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Choose targets for dynamic dispatch according to one of four policies
|
||||
|
||||
#if (defined(HWY_COMPILE_ONLY_SCALAR) + defined(HWY_COMPILE_ONLY_STATIC) + \
|
||||
defined(HWY_COMPILE_ALL_ATTAINABLE)) > 1
|
||||
#error "Invalid config: can only define a single policy for targets"
|
||||
#endif
|
||||
|
||||
// Attainable means enabled and the compiler allows intrinsics (even when not
|
||||
// allowed to autovectorize). Used in 3 and 4.
|
||||
#if HWY_ARCH_X86
|
||||
#define HWY_ATTAINABLE_TARGETS \
|
||||
HWY_ENABLED(HWY_SCALAR | HWY_SSE4 | HWY_AVX2 | HWY_AVX3)
|
||||
#else
|
||||
#define HWY_ATTAINABLE_TARGETS HWY_ENABLED_BASELINE
|
||||
#endif
|
||||
|
||||
// 1) For older compilers: disable all SIMD (could also set HWY_DISABLED_TARGETS
|
||||
// to ~HWY_SCALAR, but this is more explicit).
|
||||
#if defined(HWY_COMPILE_ONLY_SCALAR)
|
||||
#undef HWY_STATIC_TARGET
|
||||
#define HWY_STATIC_TARGET HWY_SCALAR // override baseline
|
||||
#define HWY_TARGETS HWY_SCALAR
|
||||
|
||||
// 2) For forcing static dispatch without code changes (removing HWY_EXPORT)
|
||||
#elif defined(HWY_COMPILE_ONLY_STATIC)
|
||||
#define HWY_TARGETS HWY_STATIC_TARGET
|
||||
|
||||
// 3) For tests: include all attainable targets (in particular: scalar)
|
||||
#elif defined(HWY_COMPILE_ALL_ATTAINABLE)
|
||||
#define HWY_TARGETS HWY_ATTAINABLE_TARGETS
|
||||
|
||||
// 4) Default: attainable WITHOUT non-best baseline. This reduces code size by
|
||||
// excluding superseded targets, in particular scalar.
|
||||
#else
|
||||
|
||||
#define HWY_TARGETS (HWY_ATTAINABLE_TARGETS & (2 * HWY_STATIC_TARGET - 1))
|
||||
|
||||
#endif // target policy
|
||||
|
||||
// HWY_ONCE and the multiple-inclusion mechanism rely on HWY_STATIC_TARGET being
|
||||
// one of the dynamic targets. This also implies HWY_TARGETS != 0 and
|
||||
// (HWY_TARGETS & HWY_ENABLED_BASELINE) != 0.
|
||||
#if (HWY_TARGETS & HWY_STATIC_TARGET) == 0
|
||||
#error "Logic error: best baseline should be included in dynamic targets"
|
||||
#endif
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
namespace hwy {
|
||||
|
||||
// Returns (cached) bitfield of enabled targets that are supported on this CPU.
|
||||
// Implemented in targets.cc; unconditionally compiled to support the
|
||||
// use case of binary-only distributions. The HWY_SUPPORTED_TARGETS wrapper may
|
||||
// allow eliding calls to this function.
|
||||
uint32_t SupportedTargets();
|
||||
|
||||
// Disable from runtime dispatch the mask of compiled in targets. Targets that
|
||||
// were not enabled at compile time are ignored. This function is useful to
|
||||
// disable a target supported by the CPU that is known to have bugs or when a
|
||||
// lower target is desired. For this reason, attempts to disable targets which
|
||||
// are in HWY_ENABLED_BASELINE have no effect so SupportedTargets() always
|
||||
// returns at least the baseline target.
|
||||
void DisableTargets(uint32_t disabled_targets);
|
||||
|
||||
// Single target: reduce code size by eliding the call and conditional branches
|
||||
// inside Choose*() functions.
|
||||
#if (HWY_TARGETS & (HWY_TARGETS - 1)) == 0
|
||||
#define HWY_SUPPORTED_TARGETS HWY_TARGETS
|
||||
#else
|
||||
#define HWY_SUPPORTED_TARGETS hwy::SupportedTargets()
|
||||
#endif
|
||||
|
||||
// Set the mock mask of CPU supported targets instead of the actual CPU
|
||||
// supported targets computed in SupportedTargets(). The return value of
|
||||
// SupportedTargets() will still be affected by the DisableTargets() mask
|
||||
// regardless of this mock, to prevent accidentally adding targets that are
|
||||
// known to be buggy in the current CPU. Call with a mask of 0 to disable the
|
||||
// mock and use the actual CPU supported targets instead.
|
||||
void SetSupportedTargetsForTest(uint32_t targets);
|
||||
|
||||
// Returns whether the SupportedTargets() function was called since the last
|
||||
// SetSupportedTargetsForTest() call.
|
||||
bool SupportedTargetsCalledForTest();
|
||||
|
||||
// Return the list of targets in HWY_TARGETS supported by the CPU as a list of
|
||||
// individual HWY_* target macros such as HWY_SCALAR or HWY_NEON. This list
|
||||
// is affected by the current SetSupportedTargetsForTest() mock if any.
|
||||
HWY_INLINE std::vector<uint32_t> SupportedAndGeneratedTargets() {
  std::vector<uint32_t> ret;
  // Targets that are both supported at runtime and compiled into this module.
  uint32_t remaining = SupportedTargets() & HWY_TARGETS;
  while (remaining != 0) {
    // Isolate the lowest set bit, record it, then clear it.
    const uint32_t lowest = remaining & ~(remaining - 1);
    ret.push_back(lowest);
    remaining &= remaining - 1;
  }
  return ret;
}
|
||||
|
||||
// Returns a short name for a single HWY_* target bit. Only targets of the
// architecture being compiled for (plus scalar) are recognized; any other
// value yields "?".
static inline HWY_MAYBE_UNUSED const char* TargetName(uint32_t target) {
#if HWY_ARCH_X86
  if (target == HWY_SSE4) return "SSE4";
  if (target == HWY_AVX2) return "AVX2";
  if (target == HWY_AVX3) return "AVX3";
#endif

#if HWY_ARCH_ARM
  if (target == HWY_NEON) return "Neon";
#endif

#if HWY_ARCH_PPC
  if (target == HWY_PPC8) return "Power8";
#endif

#if HWY_ARCH_WASM
  if (target == HWY_WASM) return "Wasm";
#endif

#if HWY_ARCH_RVV
  if (target == HWY_RVV) return "RVV";
#endif

  if (target == HWY_SCALAR) return "Scalar";

  return "?";
}
|
||||
|
||||
// The maximum number of dynamic targets on any architecture is defined by
|
||||
// HWY_MAX_DYNAMIC_TARGETS and depends on the arch.
|
||||
|
||||
// For the ChosenTarget mask and index we use a different bit arrangement than
|
||||
// in the HWY_TARGETS mask. Only the targets involved in the current
|
||||
// architecture are used in this mask, and therefore only the least significant
|
||||
// (HWY_MAX_DYNAMIC_TARGETS + 2) bits of the uint32_t mask are used. The least
|
||||
// significant bit is set when the mask is not initialized, the next
|
||||
// HWY_MAX_DYNAMIC_TARGETS more significant bits are a range of bits from the
|
||||
// HWY_TARGETS or SupportedTargets() mask for the given architecture shifted to
|
||||
// that position and the next more significant bit is used for the scalar
|
||||
// target. Because of this we need to define equivalent values for HWY_TARGETS
|
||||
// in this representation.
|
||||
// This mask representation allows to use ctz() on this mask and obtain a small
|
||||
// number that's used as an index of the table for dynamic dispatch. In this
|
||||
// way the first entry is used when the mask is uninitialized, the following
|
||||
// HWY_MAX_DYNAMIC_TARGETS are for dynamic dispatch and the last one is for
|
||||
// scalar.
|
||||
|
||||
// The HWY_SCALAR bit in the ChosenTarget mask format.
|
||||
#define HWY_CHOSEN_TARGET_MASK_SCALAR (1u << (HWY_MAX_DYNAMIC_TARGETS + 1))
|
||||
|
||||
// Converts from a HWY_TARGETS mask to a ChosenTarget mask format for the
|
||||
// current architecture.
|
||||
#define HWY_CHOSEN_TARGET_SHIFT(X) \
|
||||
((((X) >> (HWY_HIGHEST_TARGET_BIT + 1 - HWY_MAX_DYNAMIC_TARGETS)) & \
|
||||
((1u << HWY_MAX_DYNAMIC_TARGETS) - 1)) \
|
||||
<< 1)
|
||||
|
||||
// The HWY_TARGETS mask in the ChosenTarget mask format.
|
||||
#define HWY_CHOSEN_TARGET_MASK_TARGETS \
|
||||
(HWY_CHOSEN_TARGET_SHIFT(HWY_TARGETS) | HWY_CHOSEN_TARGET_MASK_SCALAR | 1u)
|
||||
|
||||
#if HWY_ARCH_X86
|
||||
// Maximum number of dynamic targets, changing this value is an ABI incompatible
|
||||
// change
|
||||
#define HWY_MAX_DYNAMIC_TARGETS 10
|
||||
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_X86
|
||||
// These must match the order in which the HWY_TARGETS are defined
|
||||
// starting by the least significant (HWY_HIGHEST_TARGET_BIT + 1 -
|
||||
// HWY_MAX_DYNAMIC_TARGETS) bit. This list must contain exactly
|
||||
// HWY_MAX_DYNAMIC_TARGETS elements and does not include SCALAR. The first entry
|
||||
// corresponds to the best target. Don't include a "," at the end of the list.
|
||||
#define HWY_CHOOSE_TARGET_LIST(func_name) \
|
||||
nullptr, /* reserved */ \
|
||||
nullptr, /* reserved */ \
|
||||
nullptr, /* reserved */ \
|
||||
HWY_CHOOSE_AVX3(func_name), /* AVX3 */ \
|
||||
HWY_CHOOSE_AVX2(func_name), /* AVX2 */ \
|
||||
nullptr, /* AVX */ \
|
||||
HWY_CHOOSE_SSE4(func_name), /* SSE4 */ \
|
||||
nullptr, /* SSSE3 */ \
|
||||
nullptr, /* SSE3 */ \
|
||||
nullptr /* SSE2 */
|
||||
|
||||
#endif // HWY_ARCH_X86
|
||||
|
||||
#if HWY_ARCH_ARM
|
||||
// See HWY_ARCH_X86 above for details.
|
||||
#define HWY_MAX_DYNAMIC_TARGETS 4
|
||||
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_ARM
|
||||
#define HWY_CHOOSE_TARGET_LIST(func_name) \
|
||||
nullptr, /* reserved */ \
|
||||
nullptr, /* reserved */ \
|
||||
nullptr, /* reserved */ \
|
||||
HWY_CHOOSE_NEON(func_name) /* NEON */
|
||||
|
||||
#endif // HWY_ARCH_ARM
|
||||
|
||||
#if HWY_ARCH_PPC
|
||||
// See HWY_ARCH_X86 above for details.
|
||||
#define HWY_MAX_DYNAMIC_TARGETS 5
|
||||
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_PPC
|
||||
#define HWY_CHOOSE_TARGET_LIST(func_name) \
|
||||
nullptr, /* reserved */ \
|
||||
nullptr, /* reserved */ \
|
||||
HWY_CHOOSE_PPC8(func_name), /* PPC8 */ \
|
||||
nullptr, /* VSX */ \
|
||||
nullptr /* AltiVec */
|
||||
|
||||
#endif // HWY_ARCH_PPC
|
||||
|
||||
#if HWY_ARCH_WASM
|
||||
// See HWY_ARCH_X86 above for details.
// The list below has four entries (three reserved slots followed by WASM),
// which matches HWY_MAX_DYNAMIC_TARGETS for this architecture.
|
||||
#define HWY_MAX_DYNAMIC_TARGETS 4
|
||||
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_WASM
|
||||
#define HWY_CHOOSE_TARGET_LIST(func_name) \
|
||||
nullptr, /* reserved */ \
|
||||
nullptr, /* reserved */ \
|
||||
nullptr, /* reserved */ \
|
||||
HWY_CHOOSE_WASM(func_name) /* WASM */
|
||||
|
||||
#endif // HWY_ARCH_WASM
|
||||
|
||||
#if HWY_ARCH_RVV
|
||||
// See HWY_ARCH_X86 above for details.
// The list below has four entries (three reserved slots followed by RVV),
// which matches HWY_MAX_DYNAMIC_TARGETS for this architecture.
|
||||
#define HWY_MAX_DYNAMIC_TARGETS 4
|
||||
#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_RVV
|
||||
#define HWY_CHOOSE_TARGET_LIST(func_name) \
|
||||
nullptr, /* reserved */ \
|
||||
nullptr, /* reserved */ \
|
||||
nullptr, /* reserved */ \
|
||||
HWY_CHOOSE_RVV(func_name) /* RVV */
|
||||
|
||||
#endif // HWY_ARCH_RVV
|
||||
|
||||
struct ChosenTarget {
|
||||
public:
|
||||
// Update the ChosenTarget mask based on the current CPU supported
|
||||
// targets.
|
||||
void Update();
|
||||
|
||||
// Reset the ChosenTarget to the uninitialized state.
|
||||
void DeInit() { mask_.store(1); }
|
||||
|
||||
// Whether the ChosenTarget was initialized. This is useful to know whether
|
||||
// any HWY_DYNAMIC_DISPATCH function was called.
|
||||
bool IsInitialized() const { return mask_.load() != 1; }
|
||||
|
||||
// Return the index in the dynamic dispatch table to be used by the current
|
||||
// CPU. Note that this method must be in the header file so it uses the value
|
||||
// of HWY_CHOSEN_TARGET_MASK_TARGETS defined in the translation unit that
|
||||
// calls it, which may be different from others. This allows to only consider
|
||||
// those targets that were actually compiled in this module.
|
||||
size_t HWY_INLINE GetIndex() const {
|
||||
return hwy::Num0BitsBelowLS1Bit_Nonzero32(mask_.load() &
|
||||
HWY_CHOSEN_TARGET_MASK_TARGETS);
|
||||
}
|
||||
|
||||
private:
|
||||
// Initialized to 1 so GetChosenTargetIndex() returns 0.
|
||||
std::atomic<uint32_t> mask_{1};
|
||||
};
|
||||
|
||||
extern ChosenTarget chosen_target;
|
||||
|
||||
} // namespace hwy
|
||||
|
||||
#endif // HIGHWAY_HWY_TARGETS_H_
|
|
@ -0,0 +1,102 @@
|
|||
// Copyright 2020 Google LLC
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "hwy/targets.h"
|
||||
|
||||
#include "hwy/tests/test_util-inl.h"
|
||||
|
||||
namespace fake {
|
||||
|
||||
// Defines, inside namespace N_<TGT>, a FakeFunction that ignores its int
// argument and returns the HWY_<TGT> constant.
#define DECLARE_FUNCTION(TGT) \
|
||||
namespace N_##TGT { \
|
||||
uint32_t FakeFunction(int) { return HWY_##TGT; } \
|
||||
}
|
||||
|
||||
DECLARE_FUNCTION(AVX3)
|
||||
DECLARE_FUNCTION(AVX2)
|
||||
DECLARE_FUNCTION(SSE4)
|
||||
DECLARE_FUNCTION(NEON)
|
||||
DECLARE_FUNCTION(PPC8)
|
||||
DECLARE_FUNCTION(WASM)
|
||||
DECLARE_FUNCTION(RVV)
|
||||
DECLARE_FUNCTION(SCALAR)
|
||||
|
||||
HWY_EXPORT(FakeFunction);
|
||||
|
||||
void CheckFakeFunction() {
|
||||
#define CHECK_ARRAY_ENTRY(TGT) \
|
||||
if ((HWY_TARGETS & HWY_##TGT) != 0) { \
|
||||
hwy::SetSupportedTargetsForTest(HWY_##TGT); \
|
||||
/* Calling Update() first to make &HWY_DYNAMIC_DISPATCH() return */ \
|
||||
/* the pointer to the already cached function. */ \
|
||||
hwy::chosen_target.Update(); \
|
||||
EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \
|
||||
/* Calling DeInit() will test that the initializer function */ \
|
||||
/* also calls the right function. */ \
|
||||
hwy::chosen_target.DeInit(); \
|
||||
EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \
|
||||
/* Second call uses the cached value from the previous call. */ \
|
||||
EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \
|
||||
}
|
||||
CHECK_ARRAY_ENTRY(AVX3)
|
||||
CHECK_ARRAY_ENTRY(AVX2)
|
||||
CHECK_ARRAY_ENTRY(SSE4)
|
||||
CHECK_ARRAY_ENTRY(NEON)
|
||||
CHECK_ARRAY_ENTRY(PPC8)
|
||||
CHECK_ARRAY_ENTRY(WASM)
|
||||
CHECK_ARRAY_ENTRY(RVV)
|
||||
CHECK_ARRAY_ENTRY(SCALAR)
|
||||
#undef CHECK_ARRAY_ENTRY
|
||||
}
|
||||
|
||||
} // namespace fake
|
||||
|
||||
namespace hwy {
|
||||
|
||||
// Test fixture. After each test, TearDown passes 0 to both
// SetSupportedTargetsForTest and DisableTargets (presumably clearing any
// override a test installed -- confirm against hwy/targets.h).
class HwyTargetsTest : public testing::Test {
|
||||
protected:
|
||||
void TearDown() override {
|
||||
SetSupportedTargetsForTest(0);
|
||||
DisableTargets(0); // Reset the mask.
|
||||
}
|
||||
};
|
||||
|
||||
// Test that the order in the HWY_EXPORT static array matches the expected
|
||||
// value of the target bits. This is only checked for the targets that are
|
||||
// enabled in the current compilation.
|
||||
TEST_F(HwyTargetsTest, ChosenTargetOrderTest) { fake::CheckFakeFunction(); }
|
||||
|
||||
// Checks that DisableTargets cannot remove the baseline, and that disabling
// the best non-baseline target leaves all other targets enabled.
TEST_F(HwyTargetsTest, DisabledTargetsTest) {
  // Ask for everything to be disabled; the baseline must survive.
  DisableTargets(~0u);
  HWY_ASSERT(HWY_ENABLED_BASELINE == SupportedTargets());

  DisableTargets(0);  // Reset the mask.
  const uint32_t all_targets = SupportedTargets();

  // Nothing more can be tested when only the baseline is compiled in.
  if ((all_targets & ~HWY_ENABLED_BASELINE) != 0) {
    // Isolate the lowest set bit of the mask, i.e. the best target.
    const uint32_t best_target = all_targets & (~all_targets + 1);
    // That bit must not belong to the baseline.
    HWY_ASSERT((best_target & ~HWY_ENABLED_BASELINE) != 0);
    DisableTargets(best_target);

    // Every other target must still be reported as supported.
    HWY_ASSERT((best_target ^ all_targets) == SupportedTargets());
    DisableTargets(0);  // Reset the mask.
  }
}
|
||||
|
||||
} // namespace hwy
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -0,0 +1,287 @@
|
|||
// Copyright 2019 Google LLC
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "tests/combine_test.cc"
|
||||
#include "hwy/foreach_target.h"
|
||||
|
||||
#include "hwy/highway.h"
|
||||
#include "hwy/tests/test_util-inl.h"
|
||||
|
||||
// Not yet implemented
|
||||
#if HWY_TARGET != HWY_RVV
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
struct TestLowerHalf {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
const Half<D> d2;
|
||||
|
||||
const size_t N = Lanes(d);
|
||||
auto lanes = AllocateAligned<T>(N);
|
||||
std::fill(lanes.get(), lanes.get() + N, T(0));
|
||||
const auto v = Iota(d, 1);
|
||||
Store(LowerHalf(v), d2, lanes.get());
|
||||
size_t i = 0;
|
||||
for (; i < Lanes(d2); ++i) {
|
||||
HWY_ASSERT_EQ(T(1 + i), lanes[i]);
|
||||
}
|
||||
// Other half remains unchanged
|
||||
for (; i < N; ++i) {
|
||||
HWY_ASSERT_EQ(T(0), lanes[i]);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
struct TestLowerQuarter {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
const Half<Half<D>> d4;
|
||||
|
||||
const size_t N = Lanes(d);
|
||||
auto lanes = AllocateAligned<T>(N);
|
||||
std::fill(lanes.get(), lanes.get() + N, T(0));
|
||||
const auto v = Iota(d, 1);
|
||||
const auto lo = LowerHalf(LowerHalf(v));
|
||||
Store(lo, d4, lanes.get());
|
||||
size_t i = 0;
|
||||
for (; i < Lanes(d4); ++i) {
|
||||
HWY_ASSERT_EQ(T(i + 1), lanes[i]);
|
||||
}
|
||||
// Upper 3/4 remain unchanged
|
||||
for (; i < N; ++i) {
|
||||
HWY_ASSERT_EQ(T(0), lanes[i]);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Runs TestLowerHalf (kMinLanes = 2) and TestLowerQuarter (kMinLanes = 4)
// through ForPartialVectors for every type visited by ForAllTypes.
HWY_NOINLINE void TestAllLowerHalf() {
  ForAllTypes(ForPartialVectors<TestLowerHalf, /*kDiv=*/1, /*kMinLanes=*/2>());
  ForAllTypes(
      ForPartialVectors<TestLowerQuarter, /*kDiv=*/1, /*kMinLanes=*/4>());
}
|
||||
|
||||
struct TestUpperHalf {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
// Scalar does not define UpperHalf.
|
||||
#if HWY_TARGET != HWY_SCALAR
|
||||
const Half<D> d2;
|
||||
|
||||
const auto v = Iota(d, 1);
|
||||
const size_t N = Lanes(d);
|
||||
auto lanes = AllocateAligned<T>(N);
|
||||
std::fill(lanes.get(), lanes.get() + N, T(0));
|
||||
|
||||
Store(UpperHalf(v), d2, lanes.get());
|
||||
size_t i = 0;
|
||||
for (; i < Lanes(d2); ++i) {
|
||||
HWY_ASSERT_EQ(T(Lanes(d2) + 1 + i), lanes[i]);
|
||||
}
|
||||
// Other half remains unchanged
|
||||
for (; i < N; ++i) {
|
||||
HWY_ASSERT_EQ(T(0), lanes[i]);
|
||||
}
|
||||
#else
|
||||
(void)d;
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
// Runs TestUpperHalf through ForGE128Vectors for every type visited by
// ForAllTypes.
HWY_NOINLINE void TestAllUpperHalf() {
|
||||
ForAllTypes(ForGE128Vectors<TestUpperHalf>());
|
||||
}
|
||||
|
||||
struct TestZeroExtendVector {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
#if HWY_CAP_GE256
|
||||
const Twice<D> d2;
|
||||
|
||||
const auto v = Iota(d, 1);
|
||||
const size_t N2 = Lanes(d2);
|
||||
auto lanes = AllocateAligned<T>(N2);
|
||||
Store(v, d, &lanes[0]);
|
||||
Store(v, d, &lanes[N2 / 2]);
|
||||
|
||||
const auto ext = ZeroExtendVector(v);
|
||||
Store(ext, d2, lanes.get());
|
||||
|
||||
size_t i = 0;
|
||||
// Lower half is unchanged
|
||||
for (; i < N2 / 2; ++i) {
|
||||
HWY_ASSERT_EQ(T(1 + i), lanes[i]);
|
||||
}
|
||||
// Upper half is zero
|
||||
for (; i < N2; ++i) {
|
||||
HWY_ASSERT_EQ(T(0), lanes[i]);
|
||||
}
|
||||
#else
|
||||
(void)d;
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
// Runs TestZeroExtendVector through ForExtendableVectors for every type
// visited by ForAllTypes.
HWY_NOINLINE void TestAllZeroExtendVector() {
|
||||
ForAllTypes(ForExtendableVectors<TestZeroExtendVector>());
|
||||
}
|
||||
|
||||
struct TestCombine {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
#if HWY_CAP_GE256
|
||||
const Twice<D> d2;
|
||||
const size_t N2 = Lanes(d2);
|
||||
auto lanes = AllocateAligned<T>(N2);
|
||||
|
||||
const auto lo = Iota(d, 1);
|
||||
const auto hi = Iota(d, N2 / 2 + 1);
|
||||
const auto combined = Combine(hi, lo);
|
||||
Store(combined, d2, lanes.get());
|
||||
|
||||
const auto expected = Iota(d2, 1);
|
||||
HWY_ASSERT_VEC_EQ(d2, expected, combined);
|
||||
#else
|
||||
(void)d;
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
// Runs TestCombine through ForExtendableVectors for every type visited by
// ForAllTypes.
HWY_NOINLINE void TestAllCombine() {
|
||||
ForAllTypes(ForExtendableVectors<TestCombine>());
|
||||
}
|
||||
|
||||
|
||||
template <int kBytes>
|
||||
struct TestCombineShiftRightBytesR {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T t, D d) {
|
||||
// Scalar does not define CombineShiftRightBytes.
|
||||
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
|
||||
const Repartition<uint8_t, D> d8;
|
||||
const size_t N8 = Lanes(d8);
|
||||
const auto lo = BitCast(d, Iota(d8, 1));
|
||||
const auto hi = BitCast(d, Iota(d8, 1 + N8));
|
||||
|
||||
auto expected = AllocateAligned<T>(Lanes(d));
|
||||
uint8_t* expected_bytes = reinterpret_cast<uint8_t*>(expected.get());
|
||||
|
||||
const size_t kBlockSize = 16;
|
||||
for (size_t i = 0; i < N8; ++i) {
|
||||
const size_t block = i / kBlockSize;
|
||||
const size_t lane = i % kBlockSize;
|
||||
const size_t first_lo = block * kBlockSize;
|
||||
const size_t idx = lane + kBytes;
|
||||
const size_t offset = (idx < kBlockSize) ? 0 : N8 - kBlockSize;
|
||||
const bool at_end = idx >= 2 * kBlockSize;
|
||||
expected_bytes[i] =
|
||||
at_end ? 0 : static_cast<uint8_t>(first_lo + idx + 1 + offset);
|
||||
}
|
||||
HWY_ASSERT_VEC_EQ(d, expected.get(),
|
||||
CombineShiftRightBytes<kBytes>(hi, lo));
|
||||
|
||||
TestCombineShiftRightBytesR<kBytes - 1>()(t, d);
|
||||
#else
|
||||
(void)t;
|
||||
(void)d;
|
||||
#endif // #if HWY_TARGET != HWY_SCALAR
|
||||
}
|
||||
};
|
||||
|
||||
template <int kLanes>
|
||||
struct TestCombineShiftRightLanesR {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T t, D d) {
|
||||
// Scalar does not define CombineShiftRightBytes (needed for *Lanes).
|
||||
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
|
||||
const Repartition<uint8_t, D> d8;
|
||||
const size_t N8 = Lanes(d8);
|
||||
const auto lo = BitCast(d, Iota(d8, 1));
|
||||
const auto hi = BitCast(d, Iota(d8, 1 + N8));
|
||||
|
||||
auto expected = AllocateAligned<T>(Lanes(d));
|
||||
|
||||
uint8_t* expected_bytes = reinterpret_cast<uint8_t*>(expected.get());
|
||||
|
||||
const size_t kBlockSize = 16;
|
||||
for (size_t i = 0; i < N8; ++i) {
|
||||
const size_t block = i / kBlockSize;
|
||||
const size_t lane = i % kBlockSize;
|
||||
const size_t first_lo = block * kBlockSize;
|
||||
const size_t idx = lane + kLanes * sizeof(T);
|
||||
const size_t offset = (idx < kBlockSize) ? 0 : N8 - kBlockSize;
|
||||
const bool at_end = idx >= 2 * kBlockSize;
|
||||
expected_bytes[i] =
|
||||
at_end ? 0 : static_cast<uint8_t>(first_lo + idx + 1 + offset);
|
||||
}
|
||||
HWY_ASSERT_VEC_EQ(d, expected.get(),
|
||||
CombineShiftRightLanes<kLanes>(hi, lo));
|
||||
|
||||
TestCombineShiftRightBytesR<kLanes - 1>()(t, d);
|
||||
#else
|
||||
(void)t;
|
||||
(void)d;
|
||||
#endif // #if HWY_TARGET != HWY_SCALAR
|
||||
}
|
||||
};
|
||||
|
||||
// Recursion terminator: a shift count of zero bytes performs no checks.
template <>
|
||||
struct TestCombineShiftRightBytesR<0> {
|
||||
template <class T, class D>
|
||||
void operator()(T /*unused*/, D /*unused*/) {}
|
||||
};
|
||||
|
||||
// Recursion terminator: a shift count of zero lanes performs no checks.
template <>
|
||||
struct TestCombineShiftRightLanesR<0> {
|
||||
template <class T, class D>
|
||||
void operator()(T /*unused*/, D /*unused*/) {}
|
||||
};
|
||||
|
||||
struct TestCombineShiftRight {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T t, D d) {
|
||||
TestCombineShiftRightBytesR<15>()(t, d);
|
||||
TestCombineShiftRightLanesR<16 / sizeof(T) - 1>()(t, d);
|
||||
}
|
||||
};
|
||||
|
||||
// Runs TestCombineShiftRight through ForGE128Vectors for every type visited
// by ForAllTypes.
HWY_NOINLINE void TestAllCombineShiftRight() {
|
||||
ForAllTypes(ForGE128Vectors<TestCombineShiftRight>());
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
namespace hwy {
|
||||
HWY_BEFORE_TEST(HwyCombineTest);
|
||||
HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllLowerHalf);
|
||||
HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllUpperHalf);
|
||||
HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllZeroExtendVector);
|
||||
HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllCombine);
|
||||
HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllCombineShiftRight);
|
||||
} // namespace hwy
|
||||
#endif
|
||||
|
||||
#else
|
||||
int main(int, char**) { return 0; }
|
||||
#endif // HWY_TARGET != HWY_RVV
|
|
@ -0,0 +1,217 @@
|
|||
// Copyright 2019 Google LLC
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h> // memset
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "tests/compare_test.cc"
|
||||
#include "hwy/foreach_target.h"
|
||||
#include "hwy/highway.h"
|
||||
#include "hwy/tests/test_util-inl.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
// All types.
|
||||
struct TestMask {
|
||||
template <typename T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
const size_t N = Lanes(d);
|
||||
auto lanes = AllocateAligned<T>(N);
|
||||
|
||||
std::fill(lanes.get(), lanes.get() + N, T(0));
|
||||
const auto actual_false = MaskFromVec(Load(d, lanes.get()));
|
||||
HWY_ASSERT_MASK_EQ(d, MaskFalse(d), actual_false);
|
||||
|
||||
memset(lanes.get(), 0xFF, N * sizeof(T));
|
||||
const auto actual_true = MaskFromVec(Load(d, lanes.get()));
|
||||
HWY_ASSERT_MASK_EQ(d, MaskTrue(d), actual_true);
|
||||
}
|
||||
};
|
||||
|
||||
// Runs TestMask through ForPartialVectors for every type visited by
// ForAllTypes.
HWY_NOINLINE void TestAllMask() { ForAllTypes(ForPartialVectors<TestMask>()); }
|
||||
|
||||
// All types.
|
||||
struct TestEquality {
|
||||
template <typename T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
const auto v2 = Iota(d, 2);
|
||||
const auto v2b = Iota(d, 2);
|
||||
const auto v3 = Iota(d, 3);
|
||||
|
||||
const auto mask_false = MaskFalse(d);
|
||||
const auto mask_true = MaskTrue(d);
|
||||
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Eq(v2, v3));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Eq(v2, v2));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Eq(v2, v2b));
|
||||
}
|
||||
};
|
||||
|
||||
// Runs TestEquality through ForPartialVectors for every type visited by
// ForAllTypes.
HWY_NOINLINE void TestAllEquality() {
|
||||
ForAllTypes(ForPartialVectors<TestEquality>());
|
||||
}
|
||||
|
||||
// a > b should be true, verify that for Gt/Lt and with swapped args.
|
||||
template <class D>
|
||||
void EnsureGreater(D d, TFromD<D> a, TFromD<D> b, const char* file, int line) {
|
||||
const auto mask_false = MaskFalse(d);
|
||||
const auto mask_true = MaskTrue(d);
|
||||
|
||||
const auto va = Set(d, a);
|
||||
const auto vb = Set(d, b);
|
||||
AssertMaskEqual(d, mask_true, Gt(va, vb), file, line);
|
||||
AssertMaskEqual(d, mask_false, Lt(va, vb), file, line);
|
||||
|
||||
// Swapped order
|
||||
AssertMaskEqual(d, mask_false, Gt(vb, va), file, line);
|
||||
AssertMaskEqual(d, mask_true, Lt(vb, va), file, line);
|
||||
|
||||
// Also ensure irreflexive
|
||||
AssertMaskEqual(d, mask_false, Gt(va, va), file, line);
|
||||
AssertMaskEqual(d, mask_false, Gt(vb, vb), file, line);
|
||||
AssertMaskEqual(d, mask_false, Lt(va, va), file, line);
|
||||
AssertMaskEqual(d, mask_false, Lt(vb, vb), file, line);
|
||||
}
|
||||
|
||||
#define HWY_ENSURE_GREATER(d, a, b) EnsureGreater(d, a, b, __FILE__, __LINE__)
|
||||
|
||||
struct TestStrictInt {
|
||||
template <typename T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
const T min = LimitsMin<T>();
|
||||
const T max = LimitsMax<T>();
|
||||
const auto v0 = Zero(d);
|
||||
const auto v2 = And(Iota(d, T(2)), Set(d, 127)); // 0..127
|
||||
const auto vn = Neg(v2) - Set(d, 1); // -1..-128
|
||||
|
||||
const auto mask_false = MaskFalse(d);
|
||||
const auto mask_true = MaskTrue(d);
|
||||
|
||||
// Individual values of interest
|
||||
HWY_ENSURE_GREATER(d, 2, 1);
|
||||
HWY_ENSURE_GREATER(d, 1, 0);
|
||||
HWY_ENSURE_GREATER(d, 0, -1);
|
||||
HWY_ENSURE_GREATER(d, -1, -2);
|
||||
HWY_ENSURE_GREATER(d, max, max / 2);
|
||||
HWY_ENSURE_GREATER(d, max, 1);
|
||||
HWY_ENSURE_GREATER(d, max, 0);
|
||||
HWY_ENSURE_GREATER(d, max, -1);
|
||||
HWY_ENSURE_GREATER(d, max, min);
|
||||
HWY_ENSURE_GREATER(d, 0, min);
|
||||
HWY_ENSURE_GREATER(d, min / 2, min);
|
||||
|
||||
// Also use Iota to ensure lanes are independent
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Gt(v2, vn));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Lt(vn, v2));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, vn));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Gt(vn, v2));
|
||||
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v0, v0));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, v2));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Lt(vn, vn));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v0, v0));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v2, v2));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Gt(vn, vn));
|
||||
}
|
||||
};
|
||||
|
||||
// Runs TestStrictInt through ForExtendableVectors for every type visited by
// ForSignedTypes.
HWY_NOINLINE void TestAllStrictInt() {
|
||||
ForSignedTypes(ForExtendableVectors<TestStrictInt>());
|
||||
}
|
||||
|
||||
struct TestStrictFloat {
|
||||
template <typename T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
const T huge_neg = -1E35;
|
||||
const T huge_pos = 1E36;
|
||||
const auto v0 = Zero(d);
|
||||
const auto v2 = Iota(d, T(2));
|
||||
const auto vn = Neg(v2);
|
||||
|
||||
const auto mask_false = MaskFalse(d);
|
||||
const auto mask_true = MaskTrue(d);
|
||||
|
||||
// Individual values of interest
|
||||
HWY_ENSURE_GREATER(d, 2, 1);
|
||||
HWY_ENSURE_GREATER(d, 1, 0);
|
||||
HWY_ENSURE_GREATER(d, 0, -1);
|
||||
HWY_ENSURE_GREATER(d, -1, -2);
|
||||
HWY_ENSURE_GREATER(d, huge_pos, 1);
|
||||
HWY_ENSURE_GREATER(d, huge_pos, 0);
|
||||
HWY_ENSURE_GREATER(d, huge_pos, -1);
|
||||
HWY_ENSURE_GREATER(d, huge_pos, huge_neg);
|
||||
HWY_ENSURE_GREATER(d, 0, huge_neg);
|
||||
|
||||
// Also use Iota to ensure lanes are independent
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Gt(v2, vn));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Lt(vn, v2));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, vn));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Gt(vn, v2));
|
||||
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v0, v0));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, v2));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Lt(vn, vn));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v0, v0));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v2, v2));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Gt(vn, vn));
|
||||
}
|
||||
};
|
||||
|
||||
// Runs TestStrictFloat through ForExtendableVectors for every type visited by
// ForFloatTypes.
HWY_NOINLINE void TestAllStrictFloat() {
|
||||
ForFloatTypes(ForExtendableVectors<TestStrictFloat>());
|
||||
}
|
||||
|
||||
struct TestWeakFloat {
|
||||
template <typename T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
const auto v2 = Iota(d, 2);
|
||||
const auto vn = Iota(d, -T(Lanes(d)));
|
||||
|
||||
const auto mask_false = MaskFalse(d);
|
||||
const auto mask_true = MaskTrue(d);
|
||||
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Ge(v2, v2));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Le(vn, vn));
|
||||
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Ge(v2, vn));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_true, Le(vn, v2));
|
||||
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Le(v2, vn));
|
||||
HWY_ASSERT_MASK_EQ(d, mask_false, Ge(vn, v2));
|
||||
}
|
||||
};
|
||||
|
||||
// Runs TestWeakFloat through ForPartialVectors for every type visited by
// ForFloatTypes.
HWY_NOINLINE void TestAllWeakFloat() {
|
||||
ForFloatTypes(ForPartialVectors<TestWeakFloat>());
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
namespace hwy {
|
||||
HWY_BEFORE_TEST(HwyCompareTest);
|
||||
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllMask);
|
||||
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllEquality);
|
||||
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictInt);
|
||||
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictFloat);
|
||||
HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllWeakFloat);
|
||||
} // namespace hwy
|
||||
#endif
|
|
@ -0,0 +1,568 @@
|
|||
// Copyright 2019 Google LLC
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "tests/convert_test.cc"
|
||||
#include "hwy/foreach_target.h"
|
||||
|
||||
#include "hwy/highway.h"
|
||||
#include "hwy/tests/test_util-inl.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
// Cast and ensure bytes are the same. Called directly from TestAllBitCast or
|
||||
// via TestBitCastFrom.
|
||||
template <typename ToT>
|
||||
struct TestBitCast {
|
||||
template <typename T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
const Repartition<ToT, D> dto;
|
||||
HWY_ASSERT_EQ(Lanes(d) * sizeof(T), Lanes(dto) * sizeof(ToT));
|
||||
const auto vf = Iota(d, 1);
|
||||
const auto vt = BitCast(dto, vf);
|
||||
// Must return the same bits
|
||||
auto from_lanes = AllocateAligned<T>(Lanes(d));
|
||||
auto to_lanes = AllocateAligned<ToT>(Lanes(dto));
|
||||
Store(vf, d, from_lanes.get());
|
||||
Store(vt, dto, to_lanes.get());
|
||||
HWY_ASSERT(
|
||||
BytesEqual(from_lanes.get(), to_lanes.get(), Lanes(d) * sizeof(T)));
|
||||
}
|
||||
};
|
||||
|
||||
// From D to all types.
// Invokes TestBitCast<ToT> for each destination lane type in turn; the 64-bit
// integer and double destinations are only compiled when HWY_CAP_INTEGER64 /
// HWY_CAP_FLOAT64 are set.
|
||||
struct TestBitCastFrom {
|
||||
template <typename T, class D>
|
||||
HWY_NOINLINE void operator()(T t, D d) {
|
||||
TestBitCast<uint8_t>()(t, d);
|
||||
TestBitCast<uint16_t>()(t, d);
|
||||
TestBitCast<uint32_t>()(t, d);
|
||||
#if HWY_CAP_INTEGER64
|
||||
TestBitCast<uint64_t>()(t, d);
|
||||
#endif
|
||||
TestBitCast<int8_t>()(t, d);
|
||||
TestBitCast<int16_t>()(t, d);
|
||||
TestBitCast<int32_t>()(t, d);
|
||||
#if HWY_CAP_INTEGER64
|
||||
TestBitCast<int64_t>()(t, d);
|
||||
#endif
|
||||
TestBitCast<float>()(t, d);
|
||||
#if HWY_CAP_FLOAT64
|
||||
TestBitCast<double>()(t, d);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
// Runs the BitCast tests: first same-sized casts via ForPartialVectors, then
// all-to-all casts via ForGE128Vectors.
HWY_NOINLINE void TestAllBitCast() {
  // For HWY_SCALAR and partial vectors, we can only cast to same-sized types:
  // the former can't partition its single lane, and the latter can be smaller
  // than a destination type.

  // 8-bit destinations.
  const ForPartialVectors<TestBitCast<uint8_t>> as_u8;
  as_u8(uint8_t());
  as_u8(int8_t());

  const ForPartialVectors<TestBitCast<int8_t>> as_i8;
  as_i8(uint8_t());
  as_i8(int8_t());

  // 16-bit destinations.
  const ForPartialVectors<TestBitCast<uint16_t>> as_u16;
  as_u16(uint16_t());
  as_u16(int16_t());

  const ForPartialVectors<TestBitCast<int16_t>> as_i16;
  as_i16(uint16_t());
  as_i16(int16_t());

  // 32-bit integer destinations.
  const ForPartialVectors<TestBitCast<uint32_t>> as_u32;
  as_u32(uint32_t());
  as_u32(int32_t());
  as_u32(float());

  const ForPartialVectors<TestBitCast<int32_t>> as_i32;
  as_i32(uint32_t());
  as_i32(int32_t());
  as_i32(float());

  // 64-bit integer destinations.
#if HWY_CAP_INTEGER64
  const ForPartialVectors<TestBitCast<uint64_t>> as_u64;
  as_u64(uint64_t());
  as_u64(int64_t());
#if HWY_CAP_FLOAT64
  as_u64(double());
#endif

  const ForPartialVectors<TestBitCast<int64_t>> as_i64;
  as_i64(uint64_t());
  as_i64(int64_t());
#if HWY_CAP_FLOAT64
  as_i64(double());
#endif
#endif  // HWY_CAP_INTEGER64

  // Floating-point destinations.
  const ForPartialVectors<TestBitCast<float>> as_f32;
  as_f32(uint32_t());
  as_f32(int32_t());
  as_f32(float());

#if HWY_CAP_FLOAT64
  const ForPartialVectors<TestBitCast<double>> as_f64;
  as_f64(double());
#if HWY_CAP_INTEGER64
  as_f64(uint64_t());
  as_f64(int64_t());
#endif  // HWY_CAP_INTEGER64
#endif  // HWY_CAP_FLOAT64

  // For non-scalar vectors, we can cast all types to all.
  ForAllTypes(ForGE128Vectors<TestBitCastFrom>());
}
|
||||
|
||||
template <typename ToT>
|
||||
struct TestPromoteTo {
|
||||
template <typename T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D from_d) {
|
||||
static_assert(sizeof(T) < sizeof(ToT), "Input type must be narrower");
|
||||
const Rebind<ToT, D> to_d;
|
||||
|
||||
const size_t N = Lanes(from_d);
|
||||
auto from = AllocateAligned<T>(N);
|
||||
auto expected = AllocateAligned<ToT>(N);
|
||||
|
||||
RandomState rng;
|
||||
for (size_t rep = 0; rep < 200; ++rep) {
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
const uint64_t bits = rng();
|
||||
memcpy(&from[i], &bits, sizeof(T));
|
||||
expected[i] = from[i];
|
||||
}
|
||||
|
||||
HWY_ASSERT_VEC_EQ(to_d, expected.get(),
|
||||
PromoteTo(to_d, Load(from_d, from.get())));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Runs TestPromoteTo for each supported (source, destination) pair. The
// second ForPartialVectors argument is the size ratio between destination and
// source type.
HWY_NOINLINE void TestAllPromoteTo() {
  // Unsigned destinations.
  const ForPartialVectors<TestPromoteTo<uint16_t>, 2> u16_from_half;
  u16_from_half(uint8_t());

  const ForPartialVectors<TestPromoteTo<uint32_t>, 4> u32_from_quarter;
  u32_from_quarter(uint8_t());

  const ForPartialVectors<TestPromoteTo<uint32_t>, 2> u32_from_half;
  u32_from_half(uint16_t());

  // Signed destinations.
  const ForPartialVectors<TestPromoteTo<int16_t>, 2> i16_from_half;
  i16_from_half(uint8_t());
  i16_from_half(int8_t());

  const ForPartialVectors<TestPromoteTo<int32_t>, 2> i32_from_half;
  i32_from_half(uint16_t());
  i32_from_half(int16_t());

  const ForPartialVectors<TestPromoteTo<int32_t>, 4> i32_from_quarter;
  i32_from_quarter(uint8_t());
  i32_from_quarter(int8_t());

  // Must test f16 separately because we can only load/store/convert them.

#if HWY_CAP_INTEGER64
  const ForPartialVectors<TestPromoteTo<uint64_t>, 2> u64_from_half;
  u64_from_half(uint32_t());

  const ForPartialVectors<TestPromoteTo<int64_t>, 2> i64_from_half;
  i64_from_half(int32_t());
#endif

#if HWY_CAP_FLOAT64
  const ForPartialVectors<TestPromoteTo<double>, 2> f64_from_half;
  f64_from_half(int32_t());
  f64_from_half(float());
#endif
}
|
||||
|
||||
// Floating-point overload: forwards to std::isfinite.
template <typename T, HWY_IF_FLOAT(T)>
|
||||
bool IsFinite(T t) {
|
||||
return std::isfinite(t);
|
||||
}
|
||||
// Wrapper avoids calling std::isfinite for integer types (ambiguous).
// Non-float overload: always returns true.
|
||||
template <typename T, HWY_IF_NOT_FLOAT(T)>
|
||||
bool IsFinite(T /*unused*/) {
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename ToT>
|
||||
struct TestDemoteTo {
|
||||
template <typename T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D from_d) {
|
||||
static_assert(!IsFloat<ToT>(), "Use TestDemoteToFloat for float output");
|
||||
static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider");
|
||||
const Rebind<ToT, D> to_d;
|
||||
|
||||
const size_t N = Lanes(from_d);
|
||||
auto from = AllocateAligned<T>(N);
|
||||
auto expected = AllocateAligned<ToT>(N);
|
||||
|
||||
// Narrower range in the wider type, for clamping before we cast
|
||||
const T min = LimitsMin<ToT>();
|
||||
const T max = LimitsMax<ToT>();
|
||||
|
||||
RandomState rng;
|
||||
for (size_t rep = 0; rep < 1000; ++rep) {
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
do {
|
||||
const uint64_t bits = rng();
|
||||
memcpy(&from[i], &bits, sizeof(T));
|
||||
} while (!IsFinite(from[i]));
|
||||
expected[i] = static_cast<ToT>(std::min(std::max(min, from[i]), max));
|
||||
}
|
||||
|
||||
HWY_ASSERT_VEC_EQ(to_d, expected.get(),
|
||||
DemoteTo(to_d, Load(from_d, from.get())));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Runs TestDemoteTo for the integer-to-narrower-integer pairs. The second
// ForDemoteVectors argument is the size ratio between source and destination
// type.
HWY_NOINLINE void TestAllDemoteToInt() {
  // 8-bit destinations.
  ForDemoteVectors<TestDemoteTo<uint8_t>, 2>()(int16_t());
  ForDemoteVectors<TestDemoteTo<uint8_t>, 4>()(int32_t());

  ForDemoteVectors<TestDemoteTo<int8_t>, 2>()(int16_t());
  ForDemoteVectors<TestDemoteTo<int8_t>, 4>()(int32_t());

  // 16-bit destinations.
  ForDemoteVectors<TestDemoteTo<uint16_t>, 2>()(int32_t());
  ForDemoteVectors<TestDemoteTo<int16_t>, 2>()(int32_t());
}
|
||||
|
||||
// Runs TestDemoteTo for double -> int32_t; compiled out unless HWY_CAP_FLOAT64.
HWY_NOINLINE void TestAllDemoteToMixed() {
#if HWY_CAP_FLOAT64
  ForDemoteVectors<TestDemoteTo<int32_t>, 2>()(double());
#endif
}
|
||||
|
||||
template <typename ToT>
|
||||
struct TestDemoteToFloat {
|
||||
template <typename T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D from_d) {
|
||||
// For floats, we clamp differently and cannot call LimitsMin.
|
||||
static_assert(IsFloat<ToT>(), "Use TestDemoteTo for integer output");
|
||||
static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider");
|
||||
const Rebind<ToT, D> to_d;
|
||||
|
||||
const size_t N = Lanes(from_d);
|
||||
auto from = AllocateAligned<T>(N);
|
||||
auto expected = AllocateAligned<ToT>(N);
|
||||
|
||||
RandomState rng;
|
||||
for (size_t rep = 0; rep < 1000; ++rep) {
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
do {
|
||||
const uint64_t bits = rng();
|
||||
memcpy(&from[i], &bits, sizeof(T));
|
||||
} while (!IsFinite(from[i]));
|
||||
const T magn = std::abs(from[i]);
|
||||
const T max_abs = HighestValue<ToT>();
|
||||
// NOTE: std:: version from C++11 cmath is not defined in RVV GCC, see
|
||||
// https://lists.freebsd.org/pipermail/freebsd-current/2014-January/048130.html
|
||||
const T clipped = copysign(std::min(magn, max_abs), from[i]);
|
||||
expected[i] = static_cast<ToT>(clipped);
|
||||
}
|
||||
|
||||
HWY_ASSERT_VEC_EQ(to_d, expected.get(),
|
||||
DemoteTo(to_d, Load(from_d, from.get())));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Runs TestDemoteToFloat for double -> float; compiled out unless
// HWY_CAP_FLOAT64.
HWY_NOINLINE void TestAllDemoteToFloat() {
  // Must test f16 separately because we can only load/store/convert them.

#if HWY_CAP_FLOAT64
  ForDemoteVectors<TestDemoteToFloat<float>, 2>()(double());
#endif
}
|
||||
|
||||
template <class D>
|
||||
AlignedFreeUniquePtr<float[]> F16TestCases(D d, size_t& padded) {
|
||||
const float test_cases[] = {
|
||||
// +/- 1
|
||||
1.0f, -1.0f,
|
||||
// +/- 0
|
||||
0.0f, -0.0f,
|
||||
// near 0
|
||||
0.25f, -0.25f,
|
||||
// +/- integer
|
||||
4.0f, -32.0f,
|
||||
// positive near limit
|
||||
65472.0f, 65504.0f,
|
||||
// negative near limit
|
||||
-65472.0f, -65504.0f,
|
||||
// positive +/- delta
|
||||
2.00390625f, 3.99609375f,
|
||||
// negative +/- delta
|
||||
-2.00390625f, -3.99609375f,
|
||||
// No infinity/NaN - implementation-defined due to ARM.
|
||||
};
|
||||
const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
|
||||
const size_t N = Lanes(d);
|
||||
padded = RoundUpTo(kNumTestCases, N); // allow loading whole vectors
|
||||
auto in = AllocateAligned<float>(padded);
|
||||
auto expected = AllocateAligned<float>(padded);
|
||||
std::copy(test_cases, test_cases + kNumTestCases, in.get());
|
||||
std::fill(in.get() + kNumTestCases, in.get() + padded, 0.0f);
|
||||
return in;
|
||||
}
|
||||
|
||||
struct TestF16 {
|
||||
template <typename TF32, class DF32>
|
||||
HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) {
|
||||
size_t padded;
|
||||
auto in = F16TestCases(d32, padded);
|
||||
using TF16 = float16_t;
|
||||
const Rebind<TF16, DF32> d16;
|
||||
const size_t N = Lanes(d32); // same count for f16
|
||||
auto temp16 = AllocateAligned<TF16>(N);
|
||||
|
||||
for (size_t i = 0; i < padded; i += N) {
|
||||
const auto loaded = Load(d32, &in[i]);
|
||||
Store(DemoteTo(d16, loaded), d16, temp16.get());
|
||||
HWY_ASSERT_VEC_EQ(d32, loaded, PromoteTo(d32, Load(d16, temp16.get())));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Runs TestF16 with float as the wide type.
HWY_NOINLINE void TestAllF16() {
  const ForDemoteVectors<TestF16, 2> test;
  test(float());
}
|
||||
|
||||
struct TestConvertU8 {
|
||||
template <typename T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, const D du32) {
|
||||
const Rebind<uint8_t, D> du8;
|
||||
auto lanes8 = AllocateAligned<uint8_t>(Lanes(du8));
|
||||
Store(Iota(du8, 0), du8, lanes8.get());
|
||||
HWY_ASSERT_VEC_EQ(du8, Iota(du8, 0), U8FromU32(Iota(du32, 0)));
|
||||
HWY_ASSERT_VEC_EQ(du8, Iota(du8, 0x7F), U8FromU32(Iota(du32, 0x7F)));
|
||||
}
|
||||
};
|
||||
|
||||
// Runs TestConvertU8 with uint32_t input lanes.
HWY_NOINLINE void TestAllConvertU8() {
  const ForDemoteVectors<TestConvertU8, 4> test;
  test(uint32_t());
}
|
||||
|
||||
// Separate function to attempt to work around a compiler bug on ARM: when this
|
||||
// is merged with TestIntFromFloat, outputs match a previous Iota(-(N+1)) input.
|
||||
struct TestIntFromFloatHuge {
|
||||
template <typename TF, class DF>
|
||||
HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
|
||||
// Still does not work, although ARMv7 manual says that float->int
|
||||
// saturates, i.e. chooses the nearest representable value.
|
||||
#if HWY_TARGET != HWY_NEON
|
||||
using TI = MakeSigned<TF>;
|
||||
const Rebind<TI, DF> di;
|
||||
|
||||
// Huge positive (lvalue works around GCC bug, tested with 10.2.1, where
|
||||
// the expected i32 value is otherwise 0x80..00).
|
||||
const auto expected_max = Set(di, LimitsMax<TI>());
|
||||
HWY_ASSERT_VEC_EQ(di, expected_max, ConvertTo(di, Set(df, TF(1E20))));
|
||||
|
||||
// Huge negative (also lvalue for safety, but GCC bug was not triggered)
|
||||
const auto expected_min = Set(di, LimitsMin<TI>());
|
||||
HWY_ASSERT_VEC_EQ(di, expected_min, ConvertTo(di, Set(df, TF(-1E20))));
|
||||
#else
|
||||
(void)df;
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
struct TestIntFromFloat {
|
||||
template <typename TF, class DF>
|
||||
HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
|
||||
using TI = MakeSigned<TF>;
|
||||
const Rebind<TI, DF> di;
|
||||
const size_t N = Lanes(df);
|
||||
|
||||
// Integer positive
|
||||
HWY_ASSERT_VEC_EQ(di, Iota(di, TI(4)), ConvertTo(di, Iota(df, TF(4.0))));
|
||||
|
||||
// Integer negative
|
||||
HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), ConvertTo(di, Iota(df, -TF(N))));
|
||||
|
||||
// Above positive
|
||||
HWY_ASSERT_VEC_EQ(di, Iota(di, TI(2)), ConvertTo(di, Iota(df, TF(2.001))));
|
||||
|
||||
// Below positive
|
||||
HWY_ASSERT_VEC_EQ(di, Iota(di, TI(3)), ConvertTo(di, Iota(df, TF(3.9999))));
|
||||
|
||||
const TF eps = static_cast<TF>(0.0001);
|
||||
// Above negative
|
||||
HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)),
|
||||
ConvertTo(di, Iota(df, -TF(N + 1) + eps)));
|
||||
|
||||
// Below negative
|
||||
HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N + 1)),
|
||||
ConvertTo(di, Iota(df, -TF(N + 1) - eps)));
|
||||
|
||||
// TF does not have enough precision to represent TI.
|
||||
const double min = static_cast<double>(LimitsMin<TI>());
|
||||
const double max = static_cast<double>(LimitsMax<TI>());
|
||||
|
||||
// Also check random values.
|
||||
auto from = AllocateAligned<TF>(N);
|
||||
auto expected = AllocateAligned<TI>(N);
|
||||
RandomState rng;
|
||||
for (size_t rep = 0; rep < 1000; ++rep) {
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
do {
|
||||
const uint64_t bits = rng();
|
||||
memcpy(&from[i], &bits, sizeof(TF));
|
||||
} while (!std::isfinite(from[i]));
|
||||
if (from[i] >= max) {
|
||||
expected[i] = LimitsMax<TI>();
|
||||
} else if (from[i] <= min) {
|
||||
expected[i] = LimitsMin<TI>();
|
||||
} else {
|
||||
expected[i] = static_cast<TI>(from[i]);
|
||||
}
|
||||
}
|
||||
|
||||
HWY_ASSERT_VEC_EQ(di, expected.get(),
|
||||
ConvertTo(di, Load(df, from.get())));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Runs the huge-value test and then the general test via ForFloatTypes.
HWY_NOINLINE void TestAllIntFromFloat() {
  const ForPartialVectors<TestIntFromFloatHuge> huge_test;
  ForFloatTypes(huge_test);

  const ForPartialVectors<TestIntFromFloat> general_test;
  ForFloatTypes(general_test);
}
|
||||
|
||||
struct TestFloatFromInt {
|
||||
template <typename TI, class DI>
|
||||
HWY_NOINLINE void operator()(TI /*unused*/, const DI di) {
|
||||
using TF = MakeFloat<TI>;
|
||||
const Rebind<TF, DI> df;
|
||||
const size_t N = Lanes(df);
|
||||
|
||||
// Integer positive
|
||||
HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4.0)), ConvertTo(df, Iota(di, TI(4))));
|
||||
|
||||
// Integer negative
|
||||
HWY_ASSERT_VEC_EQ(df, Iota(df, -TF(N)), ConvertTo(df, Iota(di, -TI(N))));
|
||||
|
||||
// Max positive
|
||||
HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMax<TI>())),
|
||||
ConvertTo(df, Set(di, LimitsMax<TI>())));
|
||||
|
||||
// Min negative
|
||||
HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMin<TI>())),
|
||||
ConvertTo(df, Set(di, LimitsMin<TI>())));
|
||||
}
|
||||
};
|
||||
|
||||
// Runs TestFloatFromInt for int32_t, and for int64_t when both 64-bit float
// and 64-bit integer lanes are available.
HWY_NOINLINE void TestAllFloatFromInt() {
  const ForPartialVectors<TestFloatFromInt> test;
  test(int32_t());
#if HWY_CAP_FLOAT64 && HWY_CAP_INTEGER64
  test(int64_t());
#endif
}
|
||||
|
||||
struct TestI32F64 {
|
||||
template <typename TF, class DF>
|
||||
HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
|
||||
using TI = int32_t;
|
||||
const Rebind<TI, DF> di;
|
||||
const size_t N = Lanes(df);
|
||||
|
||||
// Integer positive
|
||||
HWY_ASSERT_VEC_EQ(di, Iota(di, TI(4)), DemoteTo(di, Iota(df, TF(4.0))));
|
||||
HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4.0)), PromoteTo(df, Iota(di, TI(4))));
|
||||
|
||||
// Integer negative
|
||||
HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), DemoteTo(di, Iota(df, -TF(N))));
|
||||
HWY_ASSERT_VEC_EQ(df, Iota(df, -TF(N)), PromoteTo(df, Iota(di, -TI(N))));
|
||||
|
||||
// Above positive
|
||||
HWY_ASSERT_VEC_EQ(di, Iota(di, TI(2)), DemoteTo(di, Iota(df, TF(2.001))));
|
||||
HWY_ASSERT_VEC_EQ(df, Iota(df, TF(2.0)), PromoteTo(df, Iota(di, TI(2))));
|
||||
|
||||
// Below positive
|
||||
HWY_ASSERT_VEC_EQ(di, Iota(di, TI(3)), DemoteTo(di, Iota(df, TF(3.9999))));
|
||||
HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4.0)), PromoteTo(df, Iota(di, TI(4))));
|
||||
|
||||
const TF eps = static_cast<TF>(0.0001);
|
||||
// Above negative
|
||||
HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)),
|
||||
DemoteTo(di, Iota(df, -TF(N + 1) + eps)));
|
||||
HWY_ASSERT_VEC_EQ(df, Iota(df, TF(-4.0)), PromoteTo(df, Iota(di, TI(-4))));
|
||||
|
||||
// Below negative
|
||||
HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N + 1)),
|
||||
DemoteTo(di, Iota(df, -TF(N + 1) - eps)));
|
||||
HWY_ASSERT_VEC_EQ(df, Iota(df, TF(-2.0)), PromoteTo(df, Iota(di, TI(-2))));
|
||||
|
||||
// Huge positive float
|
||||
HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMax<TI>()),
|
||||
DemoteTo(di, Set(df, TF(1E12))));
|
||||
|
||||
// Huge negative float
|
||||
HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMin<TI>()),
|
||||
DemoteTo(di, Set(df, TF(-1E12))));
|
||||
|
||||
// Max positive int
|
||||
HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMax<TI>())),
|
||||
PromoteTo(df, Set(di, LimitsMax<TI>())));
|
||||
|
||||
// Min negative int
|
||||
HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMin<TI>())),
|
||||
PromoteTo(df, Set(di, LimitsMin<TI>())));
|
||||
}
|
||||
};
|
||||
|
||||
// Runs TestI32F64 with double lanes; compiled out unless HWY_CAP_FLOAT64.
HWY_NOINLINE void TestAllI32F64() {
#if HWY_CAP_FLOAT64
  const ForDemoteVectors<TestI32F64, 2> test;
  test(double());
#endif
}
|
||||
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
namespace hwy {
|
||||
HWY_BEFORE_TEST(HwyConvertTest);
|
||||
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllBitCast);
|
||||
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllPromoteTo);
|
||||
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllDemoteToInt);
|
||||
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllDemoteToMixed);
|
||||
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllDemoteToFloat);
|
||||
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllF16);
|
||||
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllConvertU8);
|
||||
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllIntFromFloat);
|
||||
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllFloatFromInt);
|
||||
HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllI32F64);
|
||||
} // namespace hwy
|
||||
#endif
|
|
@ -0,0 +1,34 @@
|
|||
// Copyright 2020 Google LLC
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Simple tool to print the list of targets that were compiled in when building
|
||||
// this tool.
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include "hwy/highway.h"
|
||||
|
||||
void PrintTargets(const char* msg, uint32_t targets) {
|
||||
fprintf(stderr, "%s", msg);
|
||||
for (unsigned x = targets; x != 0; x = x & (x - 1)) {
|
||||
fprintf(stderr, " %s", hwy::TargetName(x & (~x + 1)));
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
// Prints the HWY_TARGETS and HWY_BASELINE_TARGETS bit sets this binary was
// compiled with, one line each, and exits with status 0.
int main() {
  const uint32_t compiled_targets = HWY_TARGETS;
  PrintTargets("Compiled HWY_TARGETS:", compiled_targets);

  const uint32_t baseline_targets = HWY_BASELINE_TARGETS;
  PrintTargets("HWY_BASELINE_TARGETS:", baseline_targets);

  return 0;
}
|
|
@ -0,0 +1,691 @@
|
|||
// Copyright 2019 Google LLC
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h> // memcmp
|
||||
|
||||
#include "hwy/base.h"
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "tests/logical_test.cc"
|
||||
#include "hwy/foreach_target.h"
|
||||
|
||||
#include "hwy/highway.h"
|
||||
#include "hwy/tests/test_util-inl.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
struct TestLogicalInteger {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
const auto v0 = Zero(d);
|
||||
const auto vi = Iota(d, 0);
|
||||
const auto ones = VecFromMask(d, Eq(v0, v0));
|
||||
const auto v1 = Set(d, 1);
|
||||
const auto vnot1 = Set(d, ~T(1));
|
||||
|
||||
HWY_ASSERT_VEC_EQ(d, v0, Not(ones));
|
||||
HWY_ASSERT_VEC_EQ(d, ones, Not(v0));
|
||||
HWY_ASSERT_VEC_EQ(d, v1, Not(vnot1));
|
||||
HWY_ASSERT_VEC_EQ(d, vnot1, Not(v1));
|
||||
|
||||
HWY_ASSERT_VEC_EQ(d, v0, And(v0, vi));
|
||||
HWY_ASSERT_VEC_EQ(d, v0, And(vi, v0));
|
||||
HWY_ASSERT_VEC_EQ(d, vi, And(vi, vi));
|
||||
|
||||
HWY_ASSERT_VEC_EQ(d, vi, Or(v0, vi));
|
||||
HWY_ASSERT_VEC_EQ(d, vi, Or(vi, v0));
|
||||
HWY_ASSERT_VEC_EQ(d, vi, Or(vi, vi));
|
||||
|
||||
HWY_ASSERT_VEC_EQ(d, vi, Xor(v0, vi));
|
||||
HWY_ASSERT_VEC_EQ(d, vi, Xor(vi, v0));
|
||||
HWY_ASSERT_VEC_EQ(d, v0, Xor(vi, vi));
|
||||
|
||||
HWY_ASSERT_VEC_EQ(d, vi, AndNot(v0, vi));
|
||||
HWY_ASSERT_VEC_EQ(d, v0, AndNot(vi, v0));
|
||||
HWY_ASSERT_VEC_EQ(d, v0, AndNot(vi, vi));
|
||||
|
||||
auto v = vi;
|
||||
v = And(v, vi);
|
||||
HWY_ASSERT_VEC_EQ(d, vi, v);
|
||||
v = And(v, v0);
|
||||
HWY_ASSERT_VEC_EQ(d, v0, v);
|
||||
|
||||
v = Or(v, vi);
|
||||
HWY_ASSERT_VEC_EQ(d, vi, v);
|
||||
v = Or(v, v0);
|
||||
HWY_ASSERT_VEC_EQ(d, vi, v);
|
||||
|
||||
v = Xor(v, vi);
|
||||
HWY_ASSERT_VEC_EQ(d, v0, v);
|
||||
v = Xor(v, v0);
|
||||
HWY_ASSERT_VEC_EQ(d, v0, v);
|
||||
}
|
||||
};
|
||||
|
||||
// Runs TestLogicalInteger via ForIntegerTypes.
HWY_NOINLINE void TestAllLogicalInteger() {
  const ForPartialVectors<TestLogicalInteger> test;
  ForIntegerTypes(test);
}
|
||||
|
||||
struct TestLogicalFloat {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
const auto v0 = Zero(d);
|
||||
const auto vi = Iota(d, 0);
|
||||
|
||||
HWY_ASSERT_VEC_EQ(d, v0, And(v0, vi));
|
||||
HWY_ASSERT_VEC_EQ(d, v0, And(vi, v0));
|
||||
HWY_ASSERT_VEC_EQ(d, vi, And(vi, vi));
|
||||
|
||||
HWY_ASSERT_VEC_EQ(d, vi, Or(v0, vi));
|
||||
HWY_ASSERT_VEC_EQ(d, vi, Or(vi, v0));
|
||||
HWY_ASSERT_VEC_EQ(d, vi, Or(vi, vi));
|
||||
|
||||
HWY_ASSERT_VEC_EQ(d, vi, Xor(v0, vi));
|
||||
HWY_ASSERT_VEC_EQ(d, vi, Xor(vi, v0));
|
||||
HWY_ASSERT_VEC_EQ(d, v0, Xor(vi, vi));
|
||||
|
||||
HWY_ASSERT_VEC_EQ(d, vi, AndNot(v0, vi));
|
||||
HWY_ASSERT_VEC_EQ(d, v0, AndNot(vi, v0));
|
||||
HWY_ASSERT_VEC_EQ(d, v0, AndNot(vi, vi));
|
||||
|
||||
auto v = vi;
|
||||
v = And(v, vi);
|
||||
HWY_ASSERT_VEC_EQ(d, vi, v);
|
||||
v = And(v, v0);
|
||||
HWY_ASSERT_VEC_EQ(d, v0, v);
|
||||
|
||||
v = Or(v, vi);
|
||||
HWY_ASSERT_VEC_EQ(d, vi, v);
|
||||
v = Or(v, v0);
|
||||
HWY_ASSERT_VEC_EQ(d, vi, v);
|
||||
|
||||
v = Xor(v, vi);
|
||||
HWY_ASSERT_VEC_EQ(d, v0, v);
|
||||
v = Xor(v, v0);
|
||||
HWY_ASSERT_VEC_EQ(d, v0, v);
|
||||
}
|
||||
};
|
||||
|
||||
// Runs TestLogicalFloat via ForFloatTypes.
HWY_NOINLINE void TestAllLogicalFloat() {
  const ForPartialVectors<TestLogicalFloat> test;
  ForFloatTypes(test);
}
|
||||
|
||||
struct TestCopySign {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
const auto v0 = Zero(d);
|
||||
const auto vp = Iota(d, 1);
|
||||
const auto vn = Iota(d, T(-1E5)); // assumes N < 10^5
|
||||
|
||||
// Zero remains zero regardless of sign
|
||||
HWY_ASSERT_VEC_EQ(d, v0, CopySign(v0, v0));
|
||||
HWY_ASSERT_VEC_EQ(d, v0, CopySign(v0, vp));
|
||||
HWY_ASSERT_VEC_EQ(d, v0, CopySign(v0, vn));
|
||||
HWY_ASSERT_VEC_EQ(d, v0, CopySignToAbs(v0, v0));
|
||||
HWY_ASSERT_VEC_EQ(d, v0, CopySignToAbs(v0, vp));
|
||||
HWY_ASSERT_VEC_EQ(d, v0, CopySignToAbs(v0, vn));
|
||||
|
||||
// Positive input, positive sign => unchanged
|
||||
HWY_ASSERT_VEC_EQ(d, vp, CopySign(vp, vp));
|
||||
HWY_ASSERT_VEC_EQ(d, vp, CopySignToAbs(vp, vp));
|
||||
|
||||
// Positive input, negative sign => negated
|
||||
HWY_ASSERT_VEC_EQ(d, Neg(vp), CopySign(vp, vn));
|
||||
HWY_ASSERT_VEC_EQ(d, Neg(vp), CopySignToAbs(vp, vn));
|
||||
|
||||
// Negative input, negative sign => unchanged
|
||||
HWY_ASSERT_VEC_EQ(d, vn, CopySign(vn, vn));
|
||||
|
||||
// Negative input, positive sign => negated
|
||||
HWY_ASSERT_VEC_EQ(d, Neg(vn), CopySign(vn, vp));
|
||||
}
|
||||
};
|
||||
|
||||
// Runs TestCopySign via ForFloatTypes.
HWY_NOINLINE void TestAllCopySign() {
  const ForPartialVectors<TestCopySign> test;
  ForFloatTypes(test);
}
|
||||
|
||||
struct TestIfThenElse {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
RandomState rng;
|
||||
|
||||
const size_t N = Lanes(d);
|
||||
auto in1 = AllocateAligned<T>(N);
|
||||
auto in2 = AllocateAligned<T>(N);
|
||||
auto mask_lanes = AllocateAligned<T>(N);
|
||||
auto expected = AllocateAligned<T>(N);
|
||||
|
||||
// NOTE: reverse polarity (mask is true iff lane == 0) because we cannot
|
||||
// reliably compare against all bits set (NaN for float types).
|
||||
const T off = 1;
|
||||
|
||||
// Each lane should have a chance of having mask=true.
|
||||
for (size_t rep = 0; rep < 50; ++rep) {
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
in1[i] = static_cast<T>(Random32(&rng));
|
||||
in2[i] = static_cast<T>(Random32(&rng));
|
||||
mask_lanes[i] = (Random32(&rng) & 1024) ? off : T(0);
|
||||
}
|
||||
|
||||
const auto v1 = Load(d, in1.get());
|
||||
const auto v2 = Load(d, in2.get());
|
||||
const auto mask = Eq(Load(d, mask_lanes.get()), Zero(d));
|
||||
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
expected[i] = (mask_lanes[i] == off) ? in2[i] : in1[i];
|
||||
}
|
||||
HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenElse(mask, v1, v2));
|
||||
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
expected[i] = mask_lanes[i] ? T(0) : in1[i];
|
||||
}
|
||||
HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenElseZero(mask, v1));
|
||||
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
expected[i] = mask_lanes[i] ? in2[i] : T(0);
|
||||
}
|
||||
HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenZeroElse(mask, v2));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Runs TestIfThenElse via ForAllTypes.
HWY_NOINLINE void TestAllIfThenElse() {
  const ForPartialVectors<TestIfThenElse> test;
  ForAllTypes(test);
}
|
||||
|
||||
struct TestMaskVec {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
RandomState rng;
|
||||
|
||||
const size_t N = Lanes(d);
|
||||
auto mask_lanes = AllocateAligned<T>(N);
|
||||
|
||||
// Each lane should have a chance of having mask=true.
|
||||
for (size_t rep = 0; rep < 100; ++rep) {
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
mask_lanes[i] = static_cast<T>(Random32(&rng) & 1);
|
||||
}
|
||||
|
||||
const auto mask = RebindMask(d, Eq(Load(d, mask_lanes.get()), Zero(d)));
|
||||
HWY_ASSERT_MASK_EQ(d, mask, MaskFromVec(VecFromMask(d, mask)));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Runs TestMaskVec for 16- and 32-bit lane types, plus 64-bit integer/float
// lanes when the corresponding HWY_CAP_* flag is set.
HWY_NOINLINE void TestAllMaskVec() {
  const ForPartialVectors<TestMaskVec> run;

  // 16-bit lanes.
  run(uint16_t());
  run(int16_t());
  // TODO(janwas): float16_t - cannot compare yet

  // 32-bit lanes.
  run(uint32_t());
  run(int32_t());
  run(float());

  // 64-bit lanes, where supported.
#if HWY_CAP_INTEGER64
  run(uint64_t());
  run(int64_t());
#endif
#if HWY_CAP_FLOAT64
  run(double());
#endif
}
|
||||
|
||||
struct TestCompress {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
RandomState rng;
|
||||
|
||||
using TU = MakeUnsigned<T>;
|
||||
const Rebind<TU, D> du;
|
||||
const size_t N = Lanes(d);
|
||||
auto in_lanes = AllocateAligned<T>(N);
|
||||
auto mask_lanes = AllocateAligned<TU>(N);
|
||||
auto expected = AllocateAligned<T>(N);
|
||||
auto actual = AllocateAligned<T>(N);
|
||||
|
||||
// Each lane should have a chance of having mask=true.
|
||||
for (size_t rep = 0; rep < 100; ++rep) {
|
||||
size_t expected_pos = 0;
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
const uint64_t bits = Random32(&rng);
|
||||
in_lanes[i] = T(); // cannot initialize float16_t directly.
|
||||
CopyBytes<sizeof(T)>(&bits, &in_lanes[i]);
|
||||
mask_lanes[i] = static_cast<TU>(Random32(&rng) & 1);
|
||||
if (mask_lanes[i] == 0) { // Zero means true (easier to compare)
|
||||
expected[expected_pos++] = in_lanes[i];
|
||||
}
|
||||
}
|
||||
|
||||
const auto in = Load(d, in_lanes.get());
|
||||
const auto mask = RebindMask(d, Eq(Load(du, mask_lanes.get()), Zero(du)));
|
||||
|
||||
Store(Compress(in, mask), d, actual.get());
|
||||
// Upper lanes are undefined.
|
||||
for (size_t i = 0; i < expected_pos; ++i) {
|
||||
HWY_ASSERT(memcmp(&actual[i], &expected[i], sizeof(T)) == 0);
|
||||
}
|
||||
|
||||
// Also check CompressStore in the same way.
|
||||
memset(actual.get(), 0, N * sizeof(T));
|
||||
const size_t num_written = CompressStore(in, mask, d, actual.get());
|
||||
HWY_ASSERT_EQ(expected_pos, num_written);
|
||||
for (size_t i = 0; i < expected_pos; ++i) {
|
||||
HWY_ASSERT(memcmp(&actual[i], &expected[i], sizeof(T)) == 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
#if 0
// Disabled helpers that print tables of lane/byte indices for every mask bit
// pattern `code`. NOTE(review): enabling this block needs <array> and
// <stdio.h>, which this file does not include.
namespace detail {  // for code folding

// Returns the positions of the set bits among the low N bits of `code`, in
// ascending order, packed at the front of the array. Unused entries are zero.
template <typename TIndex, size_t N>
std::array<TIndex, N> IndicesFromCode(uint64_t code) {
  std::array<TIndex, N> indices{0};
  size_t pos = 0;
  for (size_t i = 0; i < N; ++i) {
    if (code & (1ull << i)) {
      // i < N <= 8 at every call site, so the narrowing cast is lossless.
      indices[pos++] = static_cast<TIndex>(i);
    }
  }
  return indices;
}

void PrintCompress16x8Tables() {
  constexpr size_t N = 8;  // 128-bit SIMD
  for (uint64_t code = 0; code < 1ull << N; ++code) {
    const auto indices = IndicesFromCode<uint8_t, N>(code);

    // Doubled (for converting lane to byte indices)
    for (size_t i = 0; i < N; ++i) {
      printf("%d,", 2 * indices[i]);  // uint8_t promotes to int
    }
  }
  printf("\n");
}

// Compressed to nibbles
void PrintCompress32x8Tables() {
  constexpr size_t N = 8;  // AVX2
  for (uint64_t code = 0; code < 1ull << N; ++code) {
    const auto indices = IndicesFromCode<uint32_t, N>(code);

    // Convert to nibbles
    uint64_t packed = 0;
    for (size_t i = 0; i < N; ++i) {
      HWY_ASSERT(indices[i] < 16);
      packed += indices[i] << (i * 4);
    }

    HWY_ASSERT(packed < (1ull << 32));
    printf("0x%08x,", static_cast<uint32_t>(packed));
  }
  printf("\n");
}

// Pairs of 32-bit lane indices
void PrintCompress64x4Tables() {
  constexpr size_t N = 4;  // AVX2
  for (uint64_t code = 0; code < 1ull << N; ++code) {
    const auto indices = IndicesFromCode<uint32_t, N>(code);

    for (size_t i = 0; i < N; ++i) {
      // %u, not %d: the arguments are unsigned (uint32_t arithmetic).
      printf("%u,%u,", 2 * indices[i], 2 * indices[i] + 1);
    }
  }
  printf("\n");
}

// 4-tuple of byte indices
void PrintCompress32x4Tables() {
  using T = uint32_t;
  constexpr size_t N = 4;  // SSE4
  for (uint64_t code = 0; code < 1ull << N; ++code) {
    const auto indices = IndicesFromCode<uint32_t, N>(code);

    for (size_t i = 0; i < N; ++i) {
      for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
        printf("%zu,", sizeof(T) * indices[i] + idx_byte);
      }
    }
  }
  printf("\n");
}

// 8-tuple of byte indices
void PrintCompress64x2Tables() {
  using T = uint64_t;
  constexpr size_t N = 2;  // SSE4
  for (uint64_t code = 0; code < 1ull << N; ++code) {
    const auto indices = IndicesFromCode<uint32_t, N>(code);

    for (size_t i = 0; i < N; ++i) {
      for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
        printf("%zu,", sizeof(T) * indices[i] + idx_byte);
      }
    }
  }
  printf("\n");
}

}  // namespace detail
#endif
|
||||
|
||||
// Runs TestCompress for 16- and 32-bit lane types (including float16_t), plus
// 64-bit integer/float lanes when the corresponding HWY_CAP_* flag is set.
HWY_NOINLINE void TestAllCompress() {
  // detail::PrintCompress32x8Tables();
  // detail::PrintCompress64x4Tables();
  // detail::PrintCompress32x4Tables();
  // detail::PrintCompress64x2Tables();
  // detail::PrintCompress16x8Tables();

  const ForPartialVectors<TestCompress> run;

  // 16-bit lanes.
  run(uint16_t());
  run(int16_t());
  run(float16_t());

  // 32-bit lanes.
  run(uint32_t());
  run(int32_t());
  run(float());

  // 64-bit lanes, where supported.
#if HWY_CAP_INTEGER64
  run(uint64_t());
  run(int64_t());
#endif
#if HWY_CAP_FLOAT64
  run(double());
#endif
}
|
||||
|
||||
struct TestZeroIfNegative {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
const auto v0 = Zero(d);
|
||||
const auto vp = Iota(d, 1);
|
||||
const auto vn = Iota(d, T(-1E5)); // assumes N < 10^5
|
||||
|
||||
// Zero and positive remain unchanged
|
||||
HWY_ASSERT_VEC_EQ(d, v0, ZeroIfNegative(v0));
|
||||
HWY_ASSERT_VEC_EQ(d, vp, ZeroIfNegative(vp));
|
||||
|
||||
// Negative are all replaced with zero
|
||||
HWY_ASSERT_VEC_EQ(d, v0, ZeroIfNegative(vn));
|
||||
}
|
||||
};
|
||||
|
||||
// Runs TestZeroIfNegative via ForFloatTypes.
HWY_NOINLINE void TestAllZeroIfNegative() {
  const ForPartialVectors<TestZeroIfNegative> test;
  ForFloatTypes(test);
}
|
||||
|
||||
struct TestBroadcastSignBit {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
const auto s0 = Zero(d);
|
||||
const auto s1 = Set(d, -1); // all bit set
|
||||
const auto vpos = And(Iota(d, 0), Set(d, LimitsMax<T>()));
|
||||
const auto vneg = s1 - vpos;
|
||||
|
||||
HWY_ASSERT_VEC_EQ(d, s0, BroadcastSignBit(vpos));
|
||||
HWY_ASSERT_VEC_EQ(d, s0, BroadcastSignBit(Set(d, LimitsMax<T>())));
|
||||
|
||||
HWY_ASSERT_VEC_EQ(d, s1, BroadcastSignBit(vneg));
|
||||
HWY_ASSERT_VEC_EQ(d, s1, BroadcastSignBit(Set(d, LimitsMin<T>())));
|
||||
HWY_ASSERT_VEC_EQ(d, s1, BroadcastSignBit(Set(d, LimitsMin<T>() / 2)));
|
||||
}
|
||||
};
|
||||
|
||||
// Runs TestBroadcastSignBit via ForSignedTypes.
HWY_NOINLINE void TestAllBroadcastSignBit() {
  const ForPartialVectors<TestBroadcastSignBit> test;
  ForSignedTypes(test);
}
|
||||
|
||||
struct TestTestBit {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
const size_t kNumBits = sizeof(T) * 8;
|
||||
for (size_t i = 0; i < kNumBits; ++i) {
|
||||
const auto bit1 = Set(d, 1ull << i);
|
||||
const auto bit2 = Set(d, 1ull << ((i + 1) % kNumBits));
|
||||
const auto bit3 = Set(d, 1ull << ((i + 2) % kNumBits));
|
||||
const auto bits12 = Or(bit1, bit2);
|
||||
const auto bits23 = Or(bit2, bit3);
|
||||
HWY_ASSERT(AllTrue(TestBit(bit1, bit1)));
|
||||
HWY_ASSERT(AllTrue(TestBit(bits12, bit1)));
|
||||
HWY_ASSERT(AllTrue(TestBit(bits12, bit2)));
|
||||
|
||||
HWY_ASSERT(AllFalse(TestBit(bits12, bit3)));
|
||||
HWY_ASSERT(AllFalse(TestBit(bits23, bit1)));
|
||||
HWY_ASSERT(AllFalse(TestBit(bit1, bit2)));
|
||||
HWY_ASSERT(AllFalse(TestBit(bit2, bit1)));
|
||||
HWY_ASSERT(AllFalse(TestBit(bit1, bit3)));
|
||||
HWY_ASSERT(AllFalse(TestBit(bit3, bit1)));
|
||||
HWY_ASSERT(AllFalse(TestBit(bit2, bit3)));
|
||||
HWY_ASSERT(AllFalse(TestBit(bit3, bit2)));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Runs TestTestBit via ForIntegerTypes, using ForFullVectors.
HWY_NOINLINE void TestAllTestBit() {
  const ForFullVectors<TestTestBit> test;
  ForIntegerTypes(test);
}
|
||||
|
||||
struct TestAllTrueFalse {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
const auto zero = Zero(d);
|
||||
auto v = zero;
|
||||
|
||||
const size_t N = Lanes(d);
|
||||
auto lanes = AllocateAligned<T>(N);
|
||||
std::fill(lanes.get(), lanes.get() + N, T(0));
|
||||
|
||||
HWY_ASSERT(AllTrue(Eq(v, zero)));
|
||||
HWY_ASSERT(!AllFalse(Eq(v, zero)));
|
||||
|
||||
// Single lane implies AllFalse = !AllTrue. Otherwise, there are multiple
|
||||
// lanes and one is nonzero.
|
||||
const bool expected_all_false = (N != 1);
|
||||
|
||||
// Set each lane to nonzero and back to zero
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
lanes[i] = T(1);
|
||||
v = Load(d, lanes.get());
|
||||
HWY_ASSERT(!AllTrue(Eq(v, zero)));
|
||||
HWY_ASSERT(expected_all_false ^ AllFalse(Eq(v, zero)));
|
||||
|
||||
lanes[i] = T(-1);
|
||||
v = Load(d, lanes.get());
|
||||
HWY_ASSERT(!AllTrue(Eq(v, zero)));
|
||||
HWY_ASSERT(expected_all_false ^ AllFalse(Eq(v, zero)));
|
||||
|
||||
// Reset to all zero
|
||||
lanes[i] = T(0);
|
||||
v = Load(d, lanes.get());
|
||||
HWY_ASSERT(AllTrue(Eq(v, zero)));
|
||||
HWY_ASSERT(!AllFalse(Eq(v, zero)));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Runs TestAllTrueFalse via ForAllTypes.
HWY_NOINLINE void TestAllAllTrueFalse() {
  const ForPartialVectors<TestAllTrueFalse> test;
  ForAllTypes(test);
}
|
||||
|
||||
class TestStoreMaskBits {
|
||||
public:
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*t*/, D d) {
|
||||
// TODO(janwas): remove once implemented (cast or vse1)
|
||||
#if HWY_TARGET != HWY_RVV
|
||||
RandomState rng;
|
||||
const size_t N = Lanes(d);
|
||||
auto lanes = AllocateAligned<T>(N);
|
||||
const size_t expected_bytes = (N + 7) / 8;
|
||||
auto bits = AllocateAligned<uint8_t>(expected_bytes);
|
||||
|
||||
for (size_t rep = 0; rep < 100; ++rep) {
|
||||
// Generate random mask pattern.
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
lanes[i] = static_cast<T>((rng() & 1024) ? 1 : 0);
|
||||
}
|
||||
const auto mask = Load(d, lanes.get()) == Zero(d);
|
||||
|
||||
const size_t bytes_written = StoreMaskBits(mask, bits.get());
|
||||
|
||||
HWY_ASSERT_EQ(expected_bytes, bytes_written);
|
||||
size_t i = 0;
|
||||
// Stored bits must match original mask
|
||||
for (; i < N; ++i) {
|
||||
const bool bit = (bits[i / 8] & (1 << (i % 8))) != 0;
|
||||
HWY_ASSERT_EQ(bit, lanes[i] == 0);
|
||||
}
|
||||
// Any partial bits in the last byte must be zero
|
||||
for (; i < 8 * bytes_written; ++i) {
|
||||
const int bit = (bits[i / 8] & (1 << (i % 8)));
|
||||
HWY_ASSERT_EQ(bit, 0);
|
||||
}
|
||||
}
|
||||
#else
|
||||
(void)d;
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
// Driver: hands a ForPartialVectors<TestStoreMaskBits> adapter to ForAllTypes.
HWY_NOINLINE void TestAllStoreMaskBits() {
  const ForPartialVectors<TestStoreMaskBits> test;
  ForAllTypes(test);
}
|
||||
|
||||
struct TestCountTrue {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
const size_t N = Lanes(d);
|
||||
// For all combinations of zero/nonzero state of subset of lanes:
|
||||
const size_t max_lanes = std::min(N, size_t(10));
|
||||
|
||||
auto lanes = AllocateAligned<T>(N);
|
||||
std::fill(lanes.get(), lanes.get() + N, T(1));
|
||||
|
||||
for (size_t code = 0; code < (1ull << max_lanes); ++code) {
|
||||
// Number of zeros written = number of mask lanes that are true.
|
||||
size_t expected = 0;
|
||||
for (size_t i = 0; i < max_lanes; ++i) {
|
||||
lanes[i] = T(1);
|
||||
if (code & (1ull << i)) {
|
||||
++expected;
|
||||
lanes[i] = T(0);
|
||||
}
|
||||
}
|
||||
|
||||
const auto mask = Eq(Load(d, lanes.get()), Zero(d));
|
||||
const size_t actual = CountTrue(mask);
|
||||
HWY_ASSERT_EQ(expected, actual);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Driver: hands a ForPartialVectors<TestCountTrue> adapter to ForAllTypes.
HWY_NOINLINE void TestAllCountTrue() {
  const ForPartialVectors<TestCountTrue> test;
  ForAllTypes(test);
}
|
||||
|
||||
struct TestLogicalMask {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
const auto m0 = MaskFalse(d);
|
||||
const auto m_all = MaskTrue(d);
|
||||
|
||||
const size_t N = Lanes(d);
|
||||
auto lanes = AllocateAligned<T>(N);
|
||||
std::fill(lanes.get(), lanes.get() + N, T(1));
|
||||
|
||||
HWY_ASSERT_MASK_EQ(d, m0, Not(m_all));
|
||||
HWY_ASSERT_MASK_EQ(d, m_all, Not(m0));
|
||||
|
||||
// For all combinations of zero/nonzero state of subset of lanes:
|
||||
const size_t max_lanes = std::min(N, size_t(6));
|
||||
for (size_t code = 0; code < (1ull << max_lanes); ++code) {
|
||||
for (size_t i = 0; i < max_lanes; ++i) {
|
||||
lanes[i] = T(1);
|
||||
if (code & (1ull << i)) {
|
||||
lanes[i] = T(0);
|
||||
}
|
||||
}
|
||||
|
||||
const auto m = Eq(Load(d, lanes.get()), Zero(d));
|
||||
|
||||
HWY_ASSERT_MASK_EQ(d, m0, Xor(m, m));
|
||||
HWY_ASSERT_MASK_EQ(d, m0, AndNot(m, m));
|
||||
HWY_ASSERT_MASK_EQ(d, m0, AndNot(m_all, m));
|
||||
|
||||
HWY_ASSERT_MASK_EQ(d, m, Or(m, m));
|
||||
HWY_ASSERT_MASK_EQ(d, m, Or(m0, m));
|
||||
HWY_ASSERT_MASK_EQ(d, m, Or(m, m0));
|
||||
HWY_ASSERT_MASK_EQ(d, m, Xor(m0, m));
|
||||
HWY_ASSERT_MASK_EQ(d, m, Xor(m, m0));
|
||||
HWY_ASSERT_MASK_EQ(d, m, And(m, m));
|
||||
HWY_ASSERT_MASK_EQ(d, m, And(m_all, m));
|
||||
HWY_ASSERT_MASK_EQ(d, m, And(m, m_all));
|
||||
HWY_ASSERT_MASK_EQ(d, m, AndNot(m0, m));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Driver: hands a ForFullVectors<TestLogicalMask> adapter to ForAllTypes.
HWY_NOINLINE void TestAllLogicalMask() {
  const ForFullVectors<TestLogicalMask> test;
  ForAllTypes(test);
}
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
// Test registration; guarded by HWY_ONCE.
#if HWY_ONCE
namespace hwy {

HWY_BEFORE_TEST(HwyLogicalTest);

// Vector logical operations.
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalInteger);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalFloat);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCopySign);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllIfThenElse);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllMaskVec);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCompress);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllZeroIfNegative);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllBroadcastSignBit);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllTestBit);

// Mask queries and mask logic.
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllAllTrueFalse);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllStoreMaskBits);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCountTrue);
HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalMask);

}  // namespace hwy
#endif
|
|
@ -0,0 +1,413 @@
|
|||
// Copyright 2019 Google LLC
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "tests/memory_test.cc"
|
||||
#include "hwy/cache_control.h"
|
||||
#include "hwy/foreach_target.h"
|
||||
#include "hwy/highway.h"
|
||||
#include "hwy/tests/test_util-inl.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
struct TestLoadStore {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
const size_t N = Lanes(d);
|
||||
const auto hi = Iota(d, 1 + N);
|
||||
const auto lo = Iota(d, 1);
|
||||
auto lanes = AllocateAligned<T>(2 * N);
|
||||
Store(hi, d, &lanes[N]);
|
||||
Store(lo, d, &lanes[0]);
|
||||
|
||||
// Aligned load
|
||||
const auto lo2 = Load(d, &lanes[0]);
|
||||
HWY_ASSERT_VEC_EQ(d, lo2, lo);
|
||||
|
||||
// Aligned store
|
||||
auto lanes2 = AllocateAligned<T>(2 * N);
|
||||
Store(lo2, d, &lanes2[0]);
|
||||
Store(hi, d, &lanes2[N]);
|
||||
for (size_t i = 0; i < 2 * N; ++i) {
|
||||
HWY_ASSERT_EQ(lanes[i], lanes2[i]);
|
||||
}
|
||||
|
||||
// Unaligned load
|
||||
const auto vu = LoadU(d, &lanes[1]);
|
||||
auto lanes3 = AllocateAligned<T>(N);
|
||||
Store(vu, d, lanes3.get());
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
HWY_ASSERT_EQ(T(i + 2), lanes3[i]);
|
||||
}
|
||||
|
||||
// Unaligned store
|
||||
StoreU(lo2, d, &lanes2[N / 2]);
|
||||
size_t i = 0;
|
||||
for (; i < N / 2; ++i) {
|
||||
HWY_ASSERT_EQ(lanes[i], lanes2[i]);
|
||||
}
|
||||
for (; i < 3 * N / 2; ++i) {
|
||||
HWY_ASSERT_EQ(T(i - N / 2 + 1), lanes2[i]);
|
||||
}
|
||||
// Subsequent values remain unchanged.
|
||||
for (; i < 2 * N; ++i) {
|
||||
HWY_ASSERT_EQ(T(i + 1), lanes2[i]);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Driver: hands a ForPartialVectors<TestLoadStore> adapter to ForAllTypes.
HWY_NOINLINE void TestAllLoadStore() {
  const ForPartialVectors<TestLoadStore> test;
  ForAllTypes(test);
}
|
||||
|
||||
struct TestStoreInterleaved3 {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
const size_t N = Lanes(d);
|
||||
|
||||
RandomState rng;
|
||||
|
||||
// Data to be interleaved
|
||||
auto bytes = AllocateAligned<uint8_t>(3 * N);
|
||||
for (size_t i = 0; i < 3 * N; ++i) {
|
||||
bytes[i] = static_cast<uint8_t>(Random32(&rng) & 0xFF);
|
||||
}
|
||||
const auto in0 = Load(d, &bytes[0 * N]);
|
||||
const auto in1 = Load(d, &bytes[1 * N]);
|
||||
const auto in2 = Load(d, &bytes[2 * N]);
|
||||
|
||||
// Interleave here, ensure vector results match scalar
|
||||
auto expected = AllocateAligned<T>(4 * N);
|
||||
auto actual_aligned = AllocateAligned<T>(4 * N + 1);
|
||||
T* actual = actual_aligned.get() + 1;
|
||||
|
||||
for (size_t rep = 0; rep < 100; ++rep) {
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
expected[3 * i + 0] = bytes[0 * N + i];
|
||||
expected[3 * i + 1] = bytes[1 * N + i];
|
||||
expected[3 * i + 2] = bytes[2 * N + i];
|
||||
// Ensure we do not write more than 3*N bytes
|
||||
expected[3 * N + i] = actual[3 * N + i] = 0;
|
||||
}
|
||||
StoreInterleaved3(in0, in1, in2, d, actual);
|
||||
size_t pos = 0;
|
||||
if (!BytesEqual(expected.get(), actual, 4 * N, &pos)) {
|
||||
Print(d, "in0", in0, pos / 3);
|
||||
Print(d, "in1", in1, pos / 3);
|
||||
Print(d, "in2", in2, pos / 3);
|
||||
const size_t i = pos - pos % 3;
|
||||
fprintf(stderr, "interleaved %d %d %d %d %d %d\n", actual[i],
|
||||
actual[i + 1], actual[i + 2], actual[i + 3], actual[i + 4],
|
||||
actual[i + 5]);
|
||||
HWY_ASSERT(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Driver: runs TestStoreInterleaved3 for uint8_t only, using a different
// vector-size adapter on RVV.
HWY_NOINLINE void TestAllStoreInterleaved3() {
#if HWY_TARGET == HWY_RVV
  // Segments are limited to 8 registers, so we can only go up to LMUL=2.
  using Runner = ForExtendableVectors<TestStoreInterleaved3, 4>;
#else
  using Runner = ForPartialVectors<TestStoreInterleaved3>;
#endif
  const Runner test;
  test(uint8_t());
}
|
||||
|
||||
struct TestStoreInterleaved4 {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
const size_t N = Lanes(d);
|
||||
|
||||
RandomState rng;
|
||||
|
||||
// Data to be interleaved
|
||||
auto bytes = AllocateAligned<uint8_t>(4 * N);
|
||||
for (size_t i = 0; i < 4 * N; ++i) {
|
||||
bytes[i] = static_cast<uint8_t>(Random32(&rng) & 0xFF);
|
||||
}
|
||||
const auto in0 = Load(d, &bytes[0 * N]);
|
||||
const auto in1 = Load(d, &bytes[1 * N]);
|
||||
const auto in2 = Load(d, &bytes[2 * N]);
|
||||
const auto in3 = Load(d, &bytes[3 * N]);
|
||||
|
||||
// Interleave here, ensure vector results match scalar
|
||||
auto expected = AllocateAligned<T>(5 * N);
|
||||
auto actual_aligned = AllocateAligned<T>(5 * N + 1);
|
||||
T* actual = actual_aligned.get() + 1;
|
||||
|
||||
for (size_t rep = 0; rep < 100; ++rep) {
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
expected[4 * i + 0] = bytes[0 * N + i];
|
||||
expected[4 * i + 1] = bytes[1 * N + i];
|
||||
expected[4 * i + 2] = bytes[2 * N + i];
|
||||
expected[4 * i + 3] = bytes[3 * N + i];
|
||||
// Ensure we do not write more than 4*N bytes
|
||||
expected[4 * N + i] = actual[4 * N + i] = 0;
|
||||
}
|
||||
StoreInterleaved4(in0, in1, in2, in3, d, actual);
|
||||
size_t pos = 0;
|
||||
if (!BytesEqual(expected.get(), actual, 5 * N, &pos)) {
|
||||
Print(d, "in0", in0, pos / 4);
|
||||
Print(d, "in1", in1, pos / 4);
|
||||
Print(d, "in2", in2, pos / 4);
|
||||
Print(d, "in3", in3, pos / 4);
|
||||
const size_t i = pos;
|
||||
fprintf(stderr, "interleaved %d %d %d %d %d %d %d %d\n", actual[i],
|
||||
actual[i + 1], actual[i + 2], actual[i + 3], actual[i + 4],
|
||||
actual[i + 5], actual[i + 6], actual[i + 7]);
|
||||
HWY_ASSERT(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Driver: runs TestStoreInterleaved4 for uint8_t only, using a different
// vector-size adapter on RVV.
HWY_NOINLINE void TestAllStoreInterleaved4() {
#if HWY_TARGET == HWY_RVV
  // Segments are limited to 8 registers, so we can only go up to LMUL=2.
  using Runner = ForExtendableVectors<TestStoreInterleaved4, 4>;
#else
  using Runner = ForPartialVectors<TestStoreInterleaved4>;
#endif
  const Runner test;
  test(uint8_t());
}
|
||||
|
||||
struct TestLoadDup128 {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
// Scalar does not define LoadDup128.
|
||||
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
|
||||
constexpr size_t N128 = 16 / sizeof(T);
|
||||
alignas(16) T lanes[N128];
|
||||
for (size_t i = 0; i < N128; ++i) {
|
||||
lanes[i] = static_cast<T>(1 + i);
|
||||
}
|
||||
const auto v = LoadDup128(d, lanes);
|
||||
const size_t N = Lanes(d);
|
||||
auto out = AllocateAligned<T>(N);
|
||||
Store(v, d, out.get());
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
HWY_ASSERT_EQ(T(i % N128 + 1), out[i]);
|
||||
}
|
||||
#else
|
||||
(void)d;
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
// Driver: hands a ForGE128Vectors<TestLoadDup128> adapter to ForAllTypes.
HWY_NOINLINE void TestAllLoadDup128() {
  const ForGE128Vectors<TestLoadDup128> test;
  ForAllTypes(test);
}
|
||||
|
||||
struct TestStream {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
const auto v = Iota(d, T(1));
|
||||
const size_t affected_bytes =
|
||||
(Lanes(d) * sizeof(T) + HWY_STREAM_MULTIPLE - 1) &
|
||||
~size_t(HWY_STREAM_MULTIPLE - 1);
|
||||
const size_t affected_lanes = affected_bytes / sizeof(T);
|
||||
auto out = AllocateAligned<T>(2 * affected_lanes);
|
||||
std::fill(out.get(), out.get() + 2 * affected_lanes, T(0));
|
||||
|
||||
Stream(v, d, out.get());
|
||||
StoreFence();
|
||||
const auto actual = Load(d, out.get());
|
||||
HWY_ASSERT_VEC_EQ(d, v, actual);
|
||||
// Ensure Stream didn't modify more memory than expected
|
||||
for (size_t i = affected_lanes; i < 2 * affected_lanes; ++i) {
|
||||
HWY_ASSERT_EQ(T(0), out[i]);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Driver: runs TestStream for 32/64-bit integer lanes and, via
// ForFloatTypes, the float types.
HWY_NOINLINE void TestAllStream() {
  const ForPartialVectors<TestStream> run;

  // No u8,u16.
  run(uint32_t());
  run(uint64_t());

  // No i8,i16.
  run(int32_t());
  run(int64_t());

  ForFloatTypes(run);
}
|
||||
|
||||
// Assumes little-endian byte order!
|
||||
struct TestScatter {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
using Offset = MakeSigned<T>;
|
||||
|
||||
const size_t N = Lanes(d);
|
||||
const size_t range = 4 * N; // number of items to scatter
|
||||
const size_t max_bytes = range * sizeof(T); // upper bound on offset
|
||||
|
||||
RandomState rng;
|
||||
|
||||
// Data to be scattered
|
||||
auto bytes = AllocateAligned<uint8_t>(max_bytes);
|
||||
for (size_t i = 0; i < max_bytes; ++i) {
|
||||
bytes[i] = static_cast<uint8_t>(Random32(&rng) & 0xFF);
|
||||
}
|
||||
const auto data = Load(d, reinterpret_cast<const T*>(bytes.get()));
|
||||
|
||||
// Scatter into these regions, ensure vector results match scalar
|
||||
auto expected = AllocateAligned<T>(range);
|
||||
auto actual = AllocateAligned<T>(range);
|
||||
|
||||
const Rebind<Offset, D> d_offsets;
|
||||
auto offsets = AllocateAligned<Offset>(N); // or indices
|
||||
|
||||
for (size_t rep = 0; rep < 100; ++rep) {
|
||||
// Byte offsets
|
||||
std::fill(expected.get(), expected.get() + range, T(0));
|
||||
std::fill(actual.get(), actual.get() + range, T(0));
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
offsets[i] =
|
||||
static_cast<Offset>(Random32(&rng) % (max_bytes - sizeof(T)));
|
||||
CopyBytes<sizeof(T)>(
|
||||
bytes.get() + i * sizeof(T),
|
||||
reinterpret_cast<uint8_t*>(expected.get()) + offsets[i]);
|
||||
}
|
||||
const auto voffsets = Load(d_offsets, offsets.get());
|
||||
ScatterOffset(data, d, actual.get(), voffsets);
|
||||
if (!BytesEqual(expected.get(), actual.get(), max_bytes)) {
|
||||
Print(d, "Data", data);
|
||||
Print(d_offsets, "Offsets", voffsets);
|
||||
HWY_ASSERT(false);
|
||||
}
|
||||
|
||||
// Indices
|
||||
std::fill(expected.get(), expected.get() + range, T(0));
|
||||
std::fill(actual.get(), actual.get() + range, T(0));
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
offsets[i] = static_cast<Offset>(Random32(&rng) % range);
|
||||
CopyBytes<sizeof(T)>(bytes.get() + i * sizeof(T),
|
||||
&expected[offsets[i]]);
|
||||
}
|
||||
const auto vindices = Load(d_offsets, offsets.get());
|
||||
ScatterIndex(data, d, actual.get(), vindices);
|
||||
if (!BytesEqual(expected.get(), actual.get(), max_bytes)) {
|
||||
Print(d, "Data", data);
|
||||
Print(d_offsets, "Indices", vindices);
|
||||
HWY_ASSERT(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Driver: runs TestScatter for 32-bit lanes, 64-bit lanes when
// HWY_CAP_INTEGER64 is set, and the float types.
HWY_NOINLINE void TestAllScatter() {
  // No u8,u16,i8,i16.
  const ForPartialVectors<TestScatter> run;
  run(uint32_t());
  run(int32_t());

#if HWY_CAP_INTEGER64
  run(uint64_t());
  run(int64_t());
#endif

  ForFloatTypes(run);
}
|
||||
|
||||
struct TestGather {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
using Offset = MakeSigned<T>;
|
||||
|
||||
const size_t N = Lanes(d);
|
||||
|
||||
RandomState rng;
|
||||
|
||||
// Data to be gathered from
|
||||
const size_t max_bytes = 4 * N * sizeof(T); // upper bound on offset
|
||||
auto bytes = AllocateAligned<uint8_t>(max_bytes);
|
||||
for (size_t i = 0; i < max_bytes; ++i) {
|
||||
bytes[i] = static_cast<uint8_t>(Random32(&rng) & 0xFF);
|
||||
}
|
||||
|
||||
auto expected = AllocateAligned<T>(N);
|
||||
auto offsets = AllocateAligned<Offset>(N);
|
||||
auto indices = AllocateAligned<Offset>(N);
|
||||
|
||||
for (size_t rep = 0; rep < 100; ++rep) {
|
||||
// Offsets
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
offsets[i] =
|
||||
static_cast<Offset>(Random32(&rng) % (max_bytes - sizeof(T)));
|
||||
CopyBytes<sizeof(T)>(bytes.get() + offsets[i], &expected[i]);
|
||||
}
|
||||
|
||||
const Rebind<Offset, D> d_offset;
|
||||
const T* base = reinterpret_cast<const T*>(bytes.get());
|
||||
auto actual = GatherOffset(d, base, Load(d_offset, offsets.get()));
|
||||
HWY_ASSERT_VEC_EQ(d, expected.get(), actual);
|
||||
|
||||
// Indices
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
indices[i] =
|
||||
static_cast<Offset>(Random32(&rng) % (max_bytes / sizeof(T)));
|
||||
CopyBytes<sizeof(T)>(base + indices[i], &expected[i]);
|
||||
}
|
||||
actual = GatherIndex(d, base, Load(d_offset, indices.get()));
|
||||
HWY_ASSERT_VEC_EQ(d, expected.get(), actual);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Driver: runs TestGather for 32-bit lanes, 64-bit lanes when
// HWY_CAP_INTEGER64 is set, and the float types.
HWY_NOINLINE void TestAllGather() {
  // No u8,u16,i8,i16.
  const ForPartialVectors<TestGather> run;
  run(uint32_t());
  run(int32_t());

#if HWY_CAP_INTEGER64
  run(uint64_t());
  run(int64_t());
#endif
  ForFloatTypes(run);
}
|
||||
|
||||
// Calls each cache-control function once; there are no assertions, so this
// only verifies that the calls compile and run.
HWY_NOINLINE void TestAllCache() {
  LoadFence();
  StoreFence();

  int target = 0;  // any addressable object to pass to the two calls below
  Prefetch(&target);
  FlushCacheline(&target);
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
// Test registration; guarded by HWY_ONCE.
#if HWY_ONCE
namespace hwy {

HWY_BEFORE_TEST(HwyMemoryTest);

// Contiguous and interleaved loads/stores.
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadStore);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStoreInterleaved3);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStoreInterleaved4);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadDup128);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStream);

// Scatter/gather and cache control.
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllScatter);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllGather);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllCache);

}  // namespace hwy
#endif
|
|
@ -0,0 +1,642 @@
|
|||
// Copyright 2019 Google LLC
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "tests/swizzle_test.cc"
|
||||
#include "hwy/foreach_target.h"
|
||||
#include "hwy/highway.h"
|
||||
#include "hwy/tests/test_util-inl.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
struct TestShiftBytes {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
// Scalar does not define Shift*Bytes.
|
||||
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
|
||||
const Repartition<uint8_t, D> du8;
|
||||
const size_t N8 = Lanes(du8);
|
||||
|
||||
// Zero remains zero
|
||||
const auto v0 = Zero(d);
|
||||
HWY_ASSERT_VEC_EQ(d, v0, ShiftLeftBytes<1>(v0));
|
||||
HWY_ASSERT_VEC_EQ(d, v0, ShiftRightBytes<1>(v0));
|
||||
|
||||
// Zero after shifting out the high/low byte
|
||||
auto bytes = AllocateAligned<uint8_t>(N8);
|
||||
std::fill(bytes.get(), bytes.get() + N8, 0);
|
||||
bytes[N8 - 1] = 0x7F;
|
||||
const auto vhi = BitCast(d, Load(du8, bytes.get()));
|
||||
bytes[N8 - 1] = 0;
|
||||
bytes[0] = 0x7F;
|
||||
const auto vlo = BitCast(d, Load(du8, bytes.get()));
|
||||
HWY_ASSERT_VEC_EQ(d, v0, ShiftLeftBytes<1>(vhi));
|
||||
HWY_ASSERT_VEC_EQ(d, v0, ShiftRightBytes<1>(vlo));
|
||||
|
||||
// Check expected result with Iota
|
||||
const size_t N = Lanes(d);
|
||||
auto in = AllocateAligned<T>(N);
|
||||
const uint8_t* in_bytes = reinterpret_cast<const uint8_t*>(in.get());
|
||||
const auto v = BitCast(d, Iota(du8, 1));
|
||||
Store(v, d, in.get());
|
||||
|
||||
auto expected = AllocateAligned<T>(N);
|
||||
uint8_t* expected_bytes = reinterpret_cast<uint8_t*>(expected.get());
|
||||
|
||||
const size_t kBlockSize = HWY_MIN(N8, 16);
|
||||
for (size_t block = 0; block < N8; block += kBlockSize) {
|
||||
expected_bytes[block] = 0;
|
||||
memcpy(expected_bytes + block + 1, in_bytes + block, kBlockSize - 1);
|
||||
}
|
||||
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftBytes<1>(v));
|
||||
|
||||
for (size_t block = 0; block < N8; block += kBlockSize) {
|
||||
memcpy(expected_bytes + block, in_bytes + block + 1, kBlockSize - 1);
|
||||
expected_bytes[block + kBlockSize - 1] = 0;
|
||||
}
|
||||
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightBytes<1>(v));
|
||||
#else
|
||||
(void)d;
|
||||
#endif // #if HWY_TARGET != HWY_SCALAR
|
||||
}
|
||||
};
|
||||
|
||||
// Driver: hands a ForGE128Vectors<TestShiftBytes> adapter to ForIntegerTypes.
HWY_NOINLINE void TestAllShiftBytes() {
  const ForGE128Vectors<TestShiftBytes> test;
  ForIntegerTypes(test);
}
|
||||
|
||||
struct TestShiftLanes {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
// Scalar does not define Shift*Lanes.
|
||||
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
|
||||
const auto v = Iota(d, T(1));
|
||||
const size_t N = Lanes(d);
|
||||
auto expected = AllocateAligned<T>(N);
|
||||
|
||||
HWY_ASSERT_VEC_EQ(d, v, ShiftLeftLanes<0>(v));
|
||||
HWY_ASSERT_VEC_EQ(d, v, ShiftRightLanes<0>(v));
|
||||
|
||||
constexpr size_t kLanesPerBlock = 16 / sizeof(T);
|
||||
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
expected[i] = (i % kLanesPerBlock) == 0 ? T(0) : T(i);
|
||||
}
|
||||
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftLanes<1>(v));
|
||||
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
expected[i] =
|
||||
(i % kLanesPerBlock) == (kLanesPerBlock - 1) ? T(0) : T(2 + i);
|
||||
}
|
||||
HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightLanes<1>(v));
|
||||
#else
|
||||
(void)d;
|
||||
#endif // #if HWY_TARGET != HWY_SCALAR
|
||||
}
|
||||
};
|
||||
|
||||
// Driver: hands a ForGE128Vectors<TestShiftLanes> adapter to ForAllTypes.
HWY_NOINLINE void TestAllShiftLanes() {
  const ForGE128Vectors<TestShiftLanes> test;
  ForAllTypes(test);
}
|
||||
|
||||
template <typename D, int kLane>
|
||||
struct TestBroadcastR {
|
||||
HWY_NOINLINE void operator()() const {
|
||||
// TODO(janwas): fix failure
|
||||
#if HWY_TARGET != HWY_WASM
|
||||
using T = typename D::T;
|
||||
const D d;
|
||||
const size_t N = Lanes(d);
|
||||
auto in_lanes = AllocateAligned<T>(N);
|
||||
std::fill(in_lanes.get(), in_lanes.get() + N, T(0));
|
||||
const size_t blockN = HWY_MIN(N * sizeof(T), 16) / sizeof(T);
|
||||
// Need to set within each 128-bit block
|
||||
for (size_t block = 0; block < N; block += blockN) {
|
||||
in_lanes[block + kLane] = static_cast<T>(block + 1);
|
||||
}
|
||||
const auto in = Load(d, in_lanes.get());
|
||||
auto expected = AllocateAligned<T>(N);
|
||||
for (size_t block = 0; block < N; block += blockN) {
|
||||
for (size_t i = 0; i < blockN; ++i) {
|
||||
expected[block + i] = T(block + 1);
|
||||
}
|
||||
}
|
||||
HWY_ASSERT_VEC_EQ(d, expected.get(), Broadcast<kLane>(in));
|
||||
|
||||
TestBroadcastR<D, kLane - 1>()();
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
template <class D>
|
||||
struct TestBroadcastR<D, -1> {
|
||||
void operator()() const {}
|
||||
};
|
||||
|
||||
struct TestBroadcast {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
TestBroadcastR<D, HWY_MIN(MaxLanes(d), 16 / sizeof(T)) - 1>()();
|
||||
}
|
||||
};
|
||||
|
||||
// Driver: runs TestBroadcast for 16/32-bit integer lanes, 64-bit lanes when
// HWY_CAP_INTEGER64 is set, and the float types.
HWY_NOINLINE void TestAllBroadcast() {
  const ForPartialVectors<TestBroadcast> run;

  // No u8.
  run(uint16_t());
  run(uint32_t());
#if HWY_CAP_INTEGER64
  run(uint64_t());
#endif

  // No i8.
  run(int16_t());
  run(int32_t());
#if HWY_CAP_INTEGER64
  run(int64_t());
#endif

  ForFloatTypes(run);
}
|
||||
|
||||
struct TestTableLookupBytes {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
RandomState rng;
|
||||
const size_t N = Lanes(d);
|
||||
const size_t N8 = Lanes(Repartition<uint8_t, D>());
|
||||
auto in_bytes = AllocateAligned<uint8_t>(N8);
|
||||
for (size_t i = 0; i < N8; ++i) {
|
||||
in_bytes[i] = Random32(&rng) & 0xFF;
|
||||
}
|
||||
const auto in =
|
||||
BitCast(d, Load(d, reinterpret_cast<const T*>(in_bytes.get())));
|
||||
|
||||
// Enough test data; for larger vectors, upper lanes will be zero.
|
||||
const uint8_t index_bytes_source[64] = {
|
||||
// Same index as source, multiple outputs from same input,
|
||||
// unused input (9), ascending/descending and nonconsecutive neighbors.
|
||||
0, 2, 1, 2, 15, 12, 13, 14, 6, 7, 8, 5, 4, 3, 10, 11,
|
||||
11, 10, 3, 4, 5, 8, 7, 6, 14, 13, 12, 15, 2, 1, 2, 0,
|
||||
4, 3, 2, 2, 5, 6, 7, 7, 15, 15, 15, 15, 15, 15, 0, 1};
|
||||
auto index_bytes = AllocateAligned<uint8_t>(N8);
|
||||
for (size_t i = 0; i < N8; ++i) {
|
||||
index_bytes[i] = (i < 64) ? index_bytes_source[i] : 0;
|
||||
// Avoid undefined results / asan error for scalar by capping indices.
|
||||
if (index_bytes[i] >= N * sizeof(T)) {
|
||||
index_bytes[i] = static_cast<uint8_t>(N * sizeof(T) - 1);
|
||||
}
|
||||
}
|
||||
const auto indices = Load(d, reinterpret_cast<const T*>(index_bytes.get()));
|
||||
auto expected = AllocateAligned<T>(N);
|
||||
uint8_t* expected_bytes = reinterpret_cast<uint8_t*>(expected.get());
|
||||
|
||||
// Byte indices wrap around
|
||||
const size_t mod = HWY_MIN(N8, 256);
|
||||
for (size_t block = 0; block < N8; block += 16) {
|
||||
for (size_t i = 0; i < 16 && (block + i) < N8; ++i) {
|
||||
const uint8_t index = index_bytes[block + i];
|
||||
expected_bytes[block + i] = in_bytes[(block + index) % mod];
|
||||
}
|
||||
}
|
||||
HWY_ASSERT_VEC_EQ(d, expected.get(), TableLookupBytes(in, indices));
|
||||
}
|
||||
};
|
||||
|
||||
// Driver: hands a ForPartialVectors<TestTableLookupBytes> adapter to
// ForIntegerTypes.
HWY_NOINLINE void TestAllTableLookupBytes() {
  const ForPartialVectors<TestTableLookupBytes> test;
  ForIntegerTypes(test);
}
|
||||
struct TestTableLookupLanes {
|
||||
#if HWY_TARGET == HWY_RVV
|
||||
using Index = uint32_t;
|
||||
#else
|
||||
using Index = int32_t;
|
||||
#endif
|
||||
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
#if HWY_TARGET != HWY_SCALAR
|
||||
const size_t N = Lanes(d);
|
||||
auto idx = AllocateAligned<Index>(N);
|
||||
std::fill(idx.get(), idx.get() + N, Index(0));
|
||||
auto expected = AllocateAligned<T>(N);
|
||||
const auto v = Iota(d, 1);
|
||||
|
||||
if (N <= 8) { // Test all permutations
|
||||
for (size_t i0 = 0; i0 < N; ++i0) {
|
||||
idx[0] = static_cast<Index>(i0);
|
||||
for (size_t i1 = 0; i1 < N; ++i1) {
|
||||
idx[1] = static_cast<Index>(i1);
|
||||
for (size_t i2 = 0; i2 < N; ++i2) {
|
||||
idx[2] = static_cast<Index>(i2);
|
||||
for (size_t i3 = 0; i3 < N; ++i3) {
|
||||
idx[3] = static_cast<Index>(i3);
|
||||
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
expected[i] = static_cast<T>(idx[i] + 1); // == v[idx[i]]
|
||||
}
|
||||
|
||||
const auto opaque = SetTableIndices(d, idx.get());
|
||||
const auto actual = TableLookupLanes(v, opaque);
|
||||
HWY_ASSERT_VEC_EQ(d, expected.get(), actual);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Too many permutations to test exhaustively; choose one with repeated
|
||||
// and cross-block indices and ensure indices do not exceed #lanes.
|
||||
// For larger vectors, upper lanes will be zero.
|
||||
HWY_ALIGN Index idx_source[16] = {1, 3, 2, 2, 8, 1, 7, 6,
|
||||
15, 14, 14, 15, 4, 9, 8, 5};
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
idx[i] = (i < 16) ? idx_source[i] : 0;
|
||||
// Avoid undefined results / asan error for scalar by capping indices.
|
||||
if (idx[i] >= static_cast<Index>(N)) {
|
||||
idx[i] = static_cast<Index>(N - 1);
|
||||
}
|
||||
expected[i] = static_cast<T>(idx[i] + 1); // == v[idx[i]]
|
||||
}
|
||||
|
||||
const auto opaque = SetTableIndices(d, idx.get());
|
||||
const auto actual = TableLookupLanes(v, opaque);
|
||||
HWY_ASSERT_VEC_EQ(d, expected.get(), actual);
|
||||
}
|
||||
#else
|
||||
(void)d;
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
// Driver: runs TestTableLookupLanes for the three 32-bit lane types listed.
HWY_NOINLINE void TestAllTableLookupLanes() {
  const ForFullVectors<TestTableLookupLanes> run;
  run(uint32_t());
  run(int32_t());
  run(float());
}
|
||||
|
||||
struct TestInterleave {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
using TU = MakeUnsigned<T>;
|
||||
const size_t N = Lanes(d);
|
||||
auto even_lanes = AllocateAligned<T>(N);
|
||||
auto odd_lanes = AllocateAligned<T>(N);
|
||||
auto expected = AllocateAligned<T>(N);
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
even_lanes[i] = static_cast<T>(2 * i + 0);
|
||||
odd_lanes[i] = static_cast<T>(2 * i + 1);
|
||||
}
|
||||
const auto even = Load(d, even_lanes.get());
|
||||
const auto odd = Load(d, odd_lanes.get());
|
||||
|
||||
const size_t blockN = 16 / sizeof(T);
|
||||
for (size_t i = 0; i < Lanes(d); ++i) {
|
||||
const size_t block = i / blockN;
|
||||
const size_t index = (i % blockN) + block * 2 * blockN;
|
||||
expected[i] = static_cast<T>(index & LimitsMax<TU>());
|
||||
}
|
||||
HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveLower(even, odd));
|
||||
|
||||
for (size_t i = 0; i < Lanes(d); ++i) {
|
||||
const size_t block = i / blockN;
|
||||
expected[i] = T((i % blockN) + block * 2 * blockN + blockN);
|
||||
}
|
||||
HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveUpper(even, odd));
|
||||
}
|
||||
};
|
||||
|
||||
// Driver: hands a ForGE128Vectors<TestInterleave> adapter to ForAllTypes.
HWY_NOINLINE void TestAllInterleave() {
  // Not supported by HWY_SCALAR: Interleave(f32, f32) would return f32x2.
  const ForGE128Vectors<TestInterleave> test;
  ForAllTypes(test);
}
|
||||
|
||||
struct TestZipLower {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
using WideT = MakeWide<T>;
|
||||
static_assert(sizeof(T) * 2 == sizeof(WideT), "Must be double-width");
|
||||
static_assert(IsSigned<T>() == IsSigned<WideT>(), "Must have same sign");
|
||||
const size_t N = Lanes(d);
|
||||
auto even_lanes = AllocateAligned<T>(N);
|
||||
auto odd_lanes = AllocateAligned<T>(N);
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
even_lanes[i] = static_cast<T>(2 * i + 0);
|
||||
odd_lanes[i] = static_cast<T>(2 * i + 1);
|
||||
}
|
||||
const auto even = Load(d, even_lanes.get());
|
||||
const auto odd = Load(d, odd_lanes.get());
|
||||
|
||||
const Repartition<WideT, D> dw;
|
||||
auto expected = AllocateAligned<WideT>(Lanes(dw));
|
||||
const WideT blockN = static_cast<WideT>(16 / sizeof(WideT));
|
||||
for (size_t i = 0; i < Lanes(dw); ++i) {
|
||||
const size_t block = i / blockN;
|
||||
// Value of least-significant lane in lo-vector.
|
||||
const WideT lo =
|
||||
static_cast<WideT>(2 * (i % blockN) + 4 * block * blockN);
|
||||
const WideT kBits = static_cast<WideT>(sizeof(T) * 8);
|
||||
expected[i] =
|
||||
static_cast<WideT>((static_cast<WideT>(lo + 1) << kBits) + lo);
|
||||
}
|
||||
HWY_ASSERT_VEC_EQ(dw, expected.get(), ZipLower(even, odd));
|
||||
}
|
||||
};
|
||||
|
||||
struct TestZipUpper {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
using WideT = MakeWide<T>;
|
||||
static_assert(sizeof(T) * 2 == sizeof(WideT), "Must be double-width");
|
||||
static_assert(IsSigned<T>() == IsSigned<WideT>(), "Must have same sign");
|
||||
const size_t N = Lanes(d);
|
||||
auto even_lanes = AllocateAligned<T>(N);
|
||||
auto odd_lanes = AllocateAligned<T>(N);
|
||||
for (size_t i = 0; i < Lanes(d); ++i) {
|
||||
even_lanes[i] = static_cast<T>(2 * i + 0);
|
||||
odd_lanes[i] = static_cast<T>(2 * i + 1);
|
||||
}
|
||||
const auto even = Load(d, even_lanes.get());
|
||||
const auto odd = Load(d, odd_lanes.get());
|
||||
|
||||
const Repartition<WideT, D> dw;
|
||||
auto expected = AllocateAligned<WideT>(Lanes(dw));
|
||||
|
||||
constexpr WideT blockN = static_cast<WideT>(16 / sizeof(WideT));
|
||||
for (size_t i = 0; i < Lanes(dw); ++i) {
|
||||
const size_t block = i / blockN;
|
||||
const WideT lo =
|
||||
static_cast<WideT>(2 * (i % blockN) + 4 * block * blockN);
|
||||
const WideT kBits = static_cast<WideT>(sizeof(T) * 8);
|
||||
expected[i] = static_cast<WideT>(
|
||||
(static_cast<WideT>(lo + 2 * blockN + 1) << kBits) + lo + 2 * blockN);
|
||||
}
|
||||
HWY_ASSERT_VEC_EQ(dw, expected.get(), ZipUpper(even, odd));
|
||||
}
|
||||
};
|
||||
|
||||
// Runs the ZipLower/ZipUpper tests for all integer lane types that have a
// double-width counterpart. 8-bit lanes are skipped on HWY_RVV, and 32-bit
// lanes are only tested when 64-bit integers are available.
HWY_NOINLINE void TestAllZip() {
  // The adapters are stateless, so one instance serves both signednesses.
  const ForPartialVectors<TestZipLower, 2> zip_lower;
  const ForGE128Vectors<TestZipUpper> zip_upper;

  // ZipLower, unsigned.
  // TODO(janwas): fix
#if HWY_TARGET != HWY_RVV
  zip_lower(uint8_t());
#endif
  zip_lower(uint16_t());
#if HWY_CAP_INTEGER64
  zip_lower(uint32_t());  // generates u64
#endif

  // ZipLower, signed.
#if HWY_TARGET != HWY_RVV
  zip_lower(int8_t());
#endif
  zip_lower(int16_t());
#if HWY_CAP_INTEGER64
  zip_lower(int32_t());  // generates i64
#endif

  // ZipUpper, unsigned.
#if HWY_TARGET != HWY_RVV
  zip_upper(uint8_t());
#endif
  zip_upper(uint16_t());
#if HWY_CAP_INTEGER64
  zip_upper(uint32_t());  // generates u64
#endif

  // ZipUpper, signed.
#if HWY_TARGET != HWY_RVV
  zip_upper(int8_t());
#endif
  zip_upper(int16_t());
#if HWY_CAP_INTEGER64
  zip_upper(int32_t());  // generates i64
#endif

  // No float - concatenating f32 does not result in a f64
}
|
||||
|
||||
class TestSpecialShuffle32 {
|
||||
public:
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
const auto v = Iota(d, 0);
|
||||
|
||||
#define VERIFY_LANES_32(d, v, i3, i2, i1, i0) \
|
||||
VerifyLanes32((d), (v), (i3), (i2), (i1), (i0), __FILE__, __LINE__)
|
||||
|
||||
VERIFY_LANES_32(d, Shuffle2301(v), 2, 3, 0, 1);
|
||||
VERIFY_LANES_32(d, Shuffle1032(v), 1, 0, 3, 2);
|
||||
VERIFY_LANES_32(d, Shuffle0321(v), 0, 3, 2, 1);
|
||||
VERIFY_LANES_32(d, Shuffle2103(v), 2, 1, 0, 3);
|
||||
VERIFY_LANES_32(d, Shuffle0123(v), 0, 1, 2, 3);
|
||||
|
||||
#undef VERIFY_LANES_32
|
||||
}
|
||||
|
||||
private:
|
||||
template <class D, class V>
|
||||
HWY_NOINLINE void VerifyLanes32(D d, V v, const int i3, const int i2,
|
||||
const int i1, const int i0,
|
||||
const char* filename, const int line) {
|
||||
using T = typename D::T;
|
||||
const size_t N = Lanes(d);
|
||||
auto lanes = AllocateAligned<T>(N);
|
||||
Store(v, d, lanes.get());
|
||||
const std::string name = TypeName(lanes[0], N);
|
||||
constexpr size_t kBlockN = 16 / sizeof(T);
|
||||
for (int block = 0; block < static_cast<int>(N); block += kBlockN) {
|
||||
AssertEqual(T(block + i3), lanes[block + 3], name, filename, line);
|
||||
AssertEqual(T(block + i2), lanes[block + 2], name, filename, line);
|
||||
AssertEqual(T(block + i1), lanes[block + 1], name, filename, line);
|
||||
AssertEqual(T(block + i0), lanes[block + 0], name, filename, line);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class TestSpecialShuffle64 {
|
||||
public:
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
const auto v = Iota(d, 0);
|
||||
VerifyLanes64(d, Shuffle01(v), 0, 1, __FILE__, __LINE__);
|
||||
}
|
||||
|
||||
private:
|
||||
template <class D, class V>
|
||||
HWY_NOINLINE void VerifyLanes64(D d, V v, const int i1, const int i0,
|
||||
const char* filename, const int line) {
|
||||
using T = typename D::T;
|
||||
const size_t N = Lanes(d);
|
||||
auto lanes = AllocateAligned<T>(N);
|
||||
Store(v, d, lanes.get());
|
||||
const std::string name = TypeName(lanes[0], N);
|
||||
constexpr size_t kBlockN = 16 / sizeof(T);
|
||||
for (int block = 0; block < static_cast<int>(N); block += kBlockN) {
|
||||
AssertEqual(T(block + i1), lanes[block + 1], name, filename, line);
|
||||
AssertEqual(T(block + i0), lanes[block + 0], name, filename, line);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Runs the 32-bit shuffle test for u32/i32/f32 and the 64-bit shuffle test
// for whichever 64-bit lane types the target supports.
HWY_NOINLINE void TestAllSpecialShuffles() {
  const ForGE128Vectors<TestSpecialShuffle32> shuffle32;
  shuffle32(uint32_t());
  shuffle32(int32_t());
  shuffle32(float());

#if HWY_CAP_INTEGER64
  const ForGE128Vectors<TestSpecialShuffle64> shuffle64_int;
  shuffle64_int(uint64_t());
  shuffle64_int(int64_t());
#endif

#if HWY_CAP_FLOAT64
  const ForGE128Vectors<TestSpecialShuffle64> shuffle64_float;
  shuffle64_float(double());
#endif
}
|
||||
|
||||
struct TestConcatHalves {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
// TODO(janwas): fix
|
||||
#if HWY_TARGET != HWY_RVV
|
||||
// Construct inputs such that interleaved halves == iota.
|
||||
const auto expected = Iota(d, 1);
|
||||
|
||||
const size_t N = Lanes(d);
|
||||
auto lo = AllocateAligned<T>(N);
|
||||
auto hi = AllocateAligned<T>(N);
|
||||
size_t i;
|
||||
for (i = 0; i < N / 2; ++i) {
|
||||
lo[i] = static_cast<T>(1 + i);
|
||||
hi[i] = static_cast<T>(lo[i] + T(N) / 2);
|
||||
}
|
||||
for (; i < N; ++i) {
|
||||
lo[i] = hi[i] = 0;
|
||||
}
|
||||
|
||||
HWY_ASSERT_VEC_EQ(d, expected,
|
||||
ConcatLowerLower(Load(d, hi.get()), Load(d, lo.get())));
|
||||
|
||||
// Same for high blocks.
|
||||
for (i = 0; i < N / 2; ++i) {
|
||||
lo[i] = hi[i] = 0;
|
||||
}
|
||||
for (; i < N; ++i) {
|
||||
lo[i] = static_cast<T>(1 + i - N / 2);
|
||||
hi[i] = static_cast<T>(lo[i] + T(N) / 2);
|
||||
}
|
||||
|
||||
HWY_ASSERT_VEC_EQ(d, expected,
|
||||
ConcatUpperUpper(Load(d, hi.get()), Load(d, lo.get())));
|
||||
#else
|
||||
(void)d;
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
// Runs TestConcatHalves for every lane type, on vectors of at least 128 bits.
HWY_NOINLINE void TestAllConcatHalves() {
  const ForGE128Vectors<TestConcatHalves> test;
  ForAllTypes(test);
}
|
||||
|
||||
struct TestConcatLowerUpper {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
// TODO(janwas): fix
|
||||
#if HWY_TARGET != HWY_RVV
|
||||
const size_t N = Lanes(d);
|
||||
// Middle part of Iota(1) == Iota(1 + N / 2).
|
||||
const auto lo = Iota(d, 1);
|
||||
const auto hi = Iota(d, 1 + N);
|
||||
HWY_ASSERT_VEC_EQ(d, Iota(d, 1 + N / 2), ConcatLowerUpper(hi, lo));
|
||||
#else
|
||||
(void)d;
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
// Runs TestConcatLowerUpper for every lane type, on vectors of >= 128 bits.
HWY_NOINLINE void TestAllConcatLowerUpper() {
  const ForGE128Vectors<TestConcatLowerUpper> test;
  ForAllTypes(test);
}
|
||||
|
||||
struct TestConcatUpperLower {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
const size_t N = Lanes(d);
|
||||
const auto lo = Iota(d, 1);
|
||||
const auto hi = Iota(d, 1 + N);
|
||||
auto expected = AllocateAligned<T>(N);
|
||||
size_t i = 0;
|
||||
for (; i < N / 2; ++i) {
|
||||
expected[i] = static_cast<T>(1 + i);
|
||||
}
|
||||
for (; i < N; ++i) {
|
||||
expected[i] = static_cast<T>(1 + i + N);
|
||||
}
|
||||
HWY_ASSERT_VEC_EQ(d, expected.get(), ConcatUpperLower(hi, lo));
|
||||
}
|
||||
};
|
||||
|
||||
// Runs TestConcatUpperLower for every lane type, on vectors of >= 128 bits.
HWY_NOINLINE void TestAllConcatUpperLower() {
  const ForGE128Vectors<TestConcatUpperLower> test;
  ForAllTypes(test);
}
|
||||
|
||||
struct TestOddEven {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T /*unused*/, D d) {
|
||||
const size_t N = Lanes(d);
|
||||
const auto even = Iota(d, 1);
|
||||
const auto odd = Iota(d, 1 + N);
|
||||
auto expected = AllocateAligned<T>(N);
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
expected[i] = static_cast<T>(1 + i + ((i & 1) ? N : 0));
|
||||
}
|
||||
HWY_ASSERT_VEC_EQ(d, expected.get(), OddEven(odd, even));
|
||||
}
|
||||
};
|
||||
|
||||
// Runs TestOddEven for every lane type, on vectors of at least 128 bits.
HWY_NOINLINE void TestAllOddEven() {
  const ForGE128Vectors<TestOddEven> test;
  ForAllTypes(test);
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
namespace hwy {
|
||||
HWY_BEFORE_TEST(HwySwizzleTest);
|
||||
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllShiftBytes);
|
||||
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllShiftLanes);
|
||||
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllBroadcast);
|
||||
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllTableLookupBytes);
|
||||
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllTableLookupLanes);
|
||||
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllInterleave);
|
||||
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllZip);
|
||||
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllSpecialShuffles);
|
||||
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllConcatHalves);
|
||||
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllConcatLowerUpper);
|
||||
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllConcatUpperLower);
|
||||
HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllOddEven);
|
||||
} // namespace hwy
|
||||
#endif
|
|
@ -0,0 +1,580 @@
|
|||
// Copyright 2019 Google LLC
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Normal include guard for non-SIMD portion of this header.
|
||||
#ifndef HWY_TESTS_TEST_UTIL_H_
|
||||
#define HWY_TESTS_TEST_UTIL_H_
|
||||
|
||||
// Helper functions for use by *_test.cc.
|
||||
|
||||
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#include <algorithm>  // std::max, std::min
#include <cmath>      // std::isnan
#include <cstddef>
#include <string>
#include <tuple>        // std::tuple, std::get
#include <type_traits>  // std::is_floating_point, std::is_signed
#include <utility>  // std::forward
|
||||
|
||||
#include "hwy/aligned_allocator.h"
|
||||
#include "hwy/base.h"
|
||||
#include "hwy/highway.h"
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
namespace hwy {
|
||||
|
||||
// The maximum vector size used in tests when defining test data. DEPRECATED.
|
||||
constexpr size_t kTestMaxVectorSize = 64;
|
||||
|
||||
// googletest before 1.10 didn't define INSTANTIATE_TEST_SUITE_P() but instead
|
||||
// used INSTANTIATE_TEST_CASE_P which is now deprecated.
|
||||
#ifdef INSTANTIATE_TEST_SUITE_P
|
||||
#define HWY_GTEST_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_SUITE_P
|
||||
#else
|
||||
#define HWY_GTEST_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_CASE_P
|
||||
#endif
|
||||
|
||||
// Helper class to run parametric tests using the hwy target as parameter. To
|
||||
// use this define the following in your test:
|
||||
// class MyTestSuite : public TestWithParamTarget {
|
||||
// ...
|
||||
// };
|
||||
// HWY_TARGET_INSTANTIATE_TEST_SUITE_P(MyTestSuite);
|
||||
// TEST_P(MyTestSuite, MyTest) { ... }
|
||||
class TestWithParamTarget : public testing::TestWithParam<uint32_t> {
|
||||
protected:
|
||||
void SetUp() override { SetSupportedTargetsForTest(GetParam()); }
|
||||
|
||||
void TearDown() override {
|
||||
// Check that the parametric test calls SupportedTargets() when the source
|
||||
// was compiled with more than one target. In the single-target case only
|
||||
// static dispatch will be used anyway.
|
||||
#if (HWY_TARGETS & (HWY_TARGETS - 1)) != 0
|
||||
EXPECT_TRUE(SupportedTargetsCalledForTest())
|
||||
<< "This hwy target parametric test doesn't use dynamic-dispatch and "
|
||||
"doesn't need to be parametric.";
|
||||
#endif
|
||||
SetSupportedTargetsForTest(0);
|
||||
}
|
||||
};
|
||||
|
||||
// Function to convert the test parameter of a TestWithParamTarget for
|
||||
// displaying it in the gtest test name.
|
||||
static inline std::string TestParamTargetName(
|
||||
const testing::TestParamInfo<uint32_t>& info) {
|
||||
return TargetName(info.param);
|
||||
}
|
||||
|
||||
#define HWY_TARGET_INSTANTIATE_TEST_SUITE_P(suite) \
|
||||
HWY_GTEST_INSTANTIATE_TEST_SUITE_P( \
|
||||
suite##Group, suite, \
|
||||
testing::ValuesIn(::hwy::SupportedAndGeneratedTargets()), \
|
||||
::hwy::TestParamTargetName)
|
||||
|
||||
// Helper class similar to TestWithParamTarget to run parametric tests that
|
||||
// depend on the target and another parametric test. If you need to use multiple
|
||||
// extra parameters use a std::tuple<> of them and ::testing::Combine(...) as
|
||||
// the generator. To use this class define the following in your test:
|
||||
// class MyTestSuite : public TestWithParamTargetAndT<int> {
|
||||
// ...
|
||||
// };
|
||||
// HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T(MyTestSuite, ::testing::Range(0, 9));
|
||||
// TEST_P(MyTestSuite, MyTest) { ... GetParam() .... }
|
||||
template <typename T>
|
||||
class TestWithParamTargetAndT
|
||||
: public ::testing::TestWithParam<std::tuple<uint32_t, T>> {
|
||||
public:
|
||||
// Expose the parametric type here so it can be used by the
|
||||
// HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T macro.
|
||||
using HwyParamType = T;
|
||||
|
||||
protected:
|
||||
void SetUp() override {
|
||||
SetSupportedTargetsForTest(std::get<0>(
|
||||
::testing::TestWithParam<std::tuple<uint32_t, T>>::GetParam()));
|
||||
}
|
||||
|
||||
void TearDown() override {
|
||||
// Check that the parametric test calls SupportedTargets() when the source
|
||||
// was compiled with more than one target. In the single-target case only
|
||||
// static dispatch will be used anyway.
|
||||
#if (HWY_TARGETS & (HWY_TARGETS - 1)) != 0
|
||||
EXPECT_TRUE(SupportedTargetsCalledForTest())
|
||||
<< "This hwy target parametric test doesn't use dynamic-dispatch and "
|
||||
"doesn't need to be parametric.";
|
||||
#endif
|
||||
SetSupportedTargetsForTest(0);
|
||||
}
|
||||
|
||||
T GetParam() {
|
||||
return std::get<1>(
|
||||
::testing::TestWithParam<std::tuple<uint32_t, T>>::GetParam());
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
std::string TestParamTargetNameAndT(
|
||||
const testing::TestParamInfo<std::tuple<uint32_t, T>>& info) {
|
||||
return std::string(TargetName(std::get<0>(info.param))) + "_" +
|
||||
::testing::PrintToString(std::get<1>(info.param));
|
||||
}
|
||||
|
||||
#define HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T(suite, generator) \
|
||||
HWY_GTEST_INSTANTIATE_TEST_SUITE_P( \
|
||||
suite##Group, suite, \
|
||||
::testing::Combine( \
|
||||
testing::ValuesIn(::hwy::SupportedAndGeneratedTargets()), \
|
||||
generator), \
|
||||
::hwy::TestParamTargetNameAndT<suite::HwyParamType>)
|
||||
|
||||
// Helper macro to export a function and define a test that tests it. This is
|
||||
// equivalent to doing a HWY_EXPORT of a void(void) function and running it in a test:
|
||||
// class MyTestSuite : public TestWithParamTarget {
|
||||
// ...
|
||||
// };
|
||||
// HWY_TARGET_INSTANTIATE_TEST_SUITE_P(MyTestSuite);
|
||||
// HWY_EXPORT_AND_TEST_P(MyTestSuite, MyTest);
|
||||
#define HWY_EXPORT_AND_TEST_P(suite, func_name) \
|
||||
HWY_EXPORT(func_name); \
|
||||
TEST_P(suite, func_name) { HWY_DYNAMIC_DISPATCH(func_name)(); } \
|
||||
static_assert(true, "For requiring trailing semicolon")
|
||||
|
||||
#define HWY_EXPORT_AND_TEST_P_T(suite, func_name) \
|
||||
HWY_EXPORT(func_name); \
|
||||
TEST_P(suite, func_name) { HWY_DYNAMIC_DISPATCH(func_name)(GetParam()); } \
|
||||
static_assert(true, "For requiring trailing semicolon")
|
||||
|
||||
#define HWY_BEFORE_TEST(suite) \
|
||||
class suite : public hwy::TestWithParamTarget {}; \
|
||||
HWY_TARGET_INSTANTIATE_TEST_SUITE_P(suite); \
|
||||
static_assert(true, "For requiring trailing semicolon")
|
||||
|
||||
// 64-bit random generator (Xorshift128+). Much smaller state than std::mt19937,
|
||||
// which triggers a compiler bug.
|
||||
class RandomState {
|
||||
public:
|
||||
explicit RandomState(const uint64_t seed = 0x123456789ull) {
|
||||
s0_ = SplitMix64(seed + 0x9E3779B97F4A7C15ull);
|
||||
s1_ = SplitMix64(s0_);
|
||||
}
|
||||
|
||||
HWY_INLINE uint64_t operator()() {
|
||||
uint64_t s1 = s0_;
|
||||
const uint64_t s0 = s1_;
|
||||
const uint64_t bits = s1 + s0;
|
||||
s0_ = s0;
|
||||
s1 ^= s1 << 23;
|
||||
s1 ^= s0 ^ (s1 >> 18) ^ (s0 >> 5);
|
||||
s1_ = s1;
|
||||
return bits;
|
||||
}
|
||||
|
||||
private:
|
||||
static uint64_t SplitMix64(uint64_t z) {
|
||||
z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull;
|
||||
z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
|
||||
return z ^ (z >> 31);
|
||||
}
|
||||
|
||||
uint64_t s0_;
|
||||
uint64_t s1_;
|
||||
};
|
||||
|
||||
// Draws the next 64-bit value from `rng` and returns its lower 32 bits.
static HWY_INLINE uint32_t Random32(RandomState* rng) {
  const uint64_t bits = (*rng)();
  return static_cast<uint32_t>(bits);
}
|
||||
|
||||
// Prevents the compiler from eliding the computations that led to "output".
// Works by indicating to the compiler that "output" is being read and modified.
// The +r constraint avoids unnecessary writes to memory, but only works for
// built-in types. On MSVC the argument is merely marked as used.
template <class T>
inline void PreventElision(T&& output) {
#if !HWY_COMPILER_MSVC
  asm volatile("" : "+r"(output) : : "memory");
#else   // HWY_COMPILER_MSVC
  (void)output;
#endif  // !HWY_COMPILER_MSVC
}
|
||||
|
||||
// Returns a name for the vector/part/scalar. The type prefix is u/i/f for
|
||||
// unsigned/signed/floating point, followed by the number of bits per lane;
|
||||
// then 'x' followed by the number of lanes. Example: u8x16. This is useful for
|
||||
// understanding which instantiation of a generic test failed.
|
||||
template <typename T>
|
||||
static inline std::string TypeName(T /*unused*/, size_t N) {
|
||||
const char prefix = IsFloat<T>() ? 'f' : (IsSigned<T>() ? 'i' : 'u');
|
||||
char name[64];
|
||||
// Omit the xN suffix for scalars.
|
||||
if (N == 1) {
|
||||
snprintf(name, sizeof(name), "%c%zu", prefix, sizeof(T) * 8);
|
||||
} else {
|
||||
snprintf(name, sizeof(name), "%c%zux%zu", prefix, sizeof(T) * 8, N);
|
||||
}
|
||||
return name;
|
||||
}
|
||||
|
||||
// String comparison
|
||||
|
||||
// Compares `size` bytes at p1 and p2. On the first difference, logs the
// offset, both byte values and both type names to stderr, stores the offset
// in *pos (if non-null) and returns false. Returns true if all bytes match.
template <typename T1, typename T2>
inline bool BytesEqual(const T1* p1, const T2* p2, const size_t size,
                       size_t* pos = nullptr) {
  const uint8_t* lhs = reinterpret_cast<const uint8_t*>(p1);
  const uint8_t* rhs = reinterpret_cast<const uint8_t*>(p2);
  for (size_t i = 0; i < size; ++i) {
    if (lhs[i] == rhs[i]) continue;

    fprintf(stderr, "Mismatch at byte %zu of %zu: %d != %d (%s, %s)\n", i,
            size, lhs[i], rhs[i], TypeName(T1(), 1).c_str(),
            TypeName(T2(), 1).c_str());
    if (pos != nullptr) {
      *pos = i;
    }
    return false;
  }
  return true;
}
|
||||
|
||||
// Returns true if the two NUL-terminated strings have identical contents.
inline bool StringsEqual(const char* s1, const char* s2) {
  for (size_t i = 0;; ++i) {
    // A differing character (including one string ending early) is a mismatch.
    if (s1[i] != s2[i]) return false;
    // Both strings ended at the same position without any mismatch.
    if (s1[i] == '\0') return true;
  }
}
|
||||
|
||||
} // namespace hwy
|
||||
|
||||
#endif // HWY_TESTS_TEST_UTIL_H_
|
||||
|
||||
// Per-target include guard
|
||||
#if defined(HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_) == defined(HWY_TARGET_TOGGLE)
|
||||
#ifdef HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_
|
||||
#undef HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_
|
||||
#else
|
||||
#define HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_
|
||||
#endif
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
// Prints lanes around `lane`, in memory order.
|
||||
template <class D>
|
||||
HWY_NOINLINE void Print(const D d, const char* caption, const Vec<D> v,
|
||||
intptr_t lane = 0) {
|
||||
using T = TFromD<D>;
|
||||
const size_t N = Lanes(d);
|
||||
auto lanes = AllocateAligned<T>(N);
|
||||
Store(v, d, lanes.get());
|
||||
const size_t begin = static_cast<size_t>(std::max<intptr_t>(0, lane - 2));
|
||||
const size_t end = std::min(begin + 7, N);
|
||||
fprintf(stderr, "%s %s [%zu+ ->]:\n ", TypeName(T(), N).c_str(), caption,
|
||||
begin);
|
||||
for (size_t i = begin; i < end; ++i) {
|
||||
fprintf(stderr, "%g,", double(lanes[i]));
|
||||
}
|
||||
if (begin >= end) fprintf(stderr, "(out of bounds)");
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
// Reports a lane mismatch via hwy::Abort, prefixed with the name of the target
// this code was compiled for. Declared HWY_NORETURN.
static HWY_NORETURN HWY_NOINLINE void NotifyFailure(
    const char* filename, const int line, const char* type_name,
    const size_t lane, const char* expected, const char* actual) {
  const char* target_name = hwy::TargetName(HWY_TARGET);
  hwy::Abort(filename, line,
             "%s, %s lane %zu mismatch: expected '%s', got '%s'.\n",
             target_name, type_name, lane, expected, actual);
}
|
||||
|
||||
template <class Out, class In>
|
||||
inline Out BitCast(const In& in) {
|
||||
static_assert(sizeof(Out) == sizeof(In), "");
|
||||
Out out;
|
||||
CopyBytes<sizeof(out)>(&in, &out);
|
||||
return out;
|
||||
}
|
||||
|
||||
// Computes the difference in units of last place between x and y.
|
||||
template <typename TF>
|
||||
MakeUnsigned<TF> ComputeUlpDelta(TF x, TF y) {
|
||||
static_assert(IsFloat<TF>(), "Only makes sense for floating-point");
|
||||
using TU = MakeUnsigned<TF>;
|
||||
|
||||
// Handle -0 == 0 and infinities.
|
||||
if (x == y) return 0;
|
||||
|
||||
// Consider "equal" if both are NaN, so we can verify an expected NaN.
|
||||
// Needs a special case because there are many possible NaN representations.
|
||||
if (std::isnan(x) && std::isnan(y)) return 0;
|
||||
|
||||
// NOTE: no need to check for differing signs; they will result in large
|
||||
// differences, which is fine, and we avoid overflow.
|
||||
|
||||
const TU ux = BitCast<TU>(x);
|
||||
const TU uy = BitCast<TU>(y);
|
||||
// Avoid unsigned->signed cast: 2's complement is only guaranteed by C++20.
|
||||
return std::max(ux, uy) - std::min(ux, uy);
|
||||
}
|
||||
|
||||
template <typename T, HWY_IF_NOT_FLOAT(T)>
|
||||
HWY_NOINLINE bool IsEqual(const T expected, const T actual) {
|
||||
return expected == actual;
|
||||
}
|
||||
|
||||
template <typename T, HWY_IF_FLOAT(T)>
|
||||
HWY_NOINLINE bool IsEqual(const T expected, const T actual) {
|
||||
return ComputeUlpDelta(expected, actual) <= 1;
|
||||
}
|
||||
|
||||
// Compare non-vector, non-string T.
|
||||
template <typename T>
|
||||
HWY_NOINLINE void AssertEqual(const T expected, const T actual,
|
||||
const std::string& type_name,
|
||||
const char* filename = "", const int line = -1,
|
||||
const size_t lane = 0) {
|
||||
if (!IsEqual(expected, actual)) {
|
||||
char expected_str[100];
|
||||
snprintf(expected_str, sizeof(expected_str), "%g", double(expected));
|
||||
char actual_str[100];
|
||||
snprintf(actual_str, sizeof(actual_str), "%g", double(actual));
|
||||
NotifyFailure(filename, line, type_name.c_str(), lane, expected_str,
|
||||
actual_str);
|
||||
}
|
||||
}
|
||||
|
||||
// Compares two NUL-terminated strings; on mismatch reports both via
// NotifyFailure with the type name "string".
static HWY_NOINLINE HWY_MAYBE_UNUSED void AssertStringEqual(
    const char* expected, const char* actual, const char* filename = "",
    const int line = -1, const size_t lane = 0) {
  if (hwy::StringsEqual(expected, actual)) return;
  NotifyFailure(filename, line, "string", lane, expected, actual);
}
|
||||
|
||||
// Compare expected vector to vector.
|
||||
template <class D, class V>
|
||||
HWY_NOINLINE void AssertVecEqual(D d, const V expected, const V actual,
|
||||
const char* filename, const int line) {
|
||||
using T = TFromD<D>;
|
||||
const size_t N = Lanes(d);
|
||||
auto expected_lanes = AllocateAligned<T>(N);
|
||||
auto actual_lanes = AllocateAligned<T>(N);
|
||||
Store(expected, d, expected_lanes.get());
|
||||
Store(actual, d, actual_lanes.get());
|
||||
for (size_t i = 0; i < N; ++i) {
|
||||
if (!IsEqual(expected_lanes[i], actual_lanes[i])) {
|
||||
fprintf(stderr, "\n\n");
|
||||
Print(d, "expect", expected, i);
|
||||
Print(d, "actual", actual, i);
|
||||
|
||||
char expected_str[100];
|
||||
snprintf(expected_str, sizeof(expected_str), "%g",
|
||||
double(expected_lanes[i]));
|
||||
char actual_str[100];
|
||||
snprintf(actual_str, sizeof(actual_str), "%g", double(actual_lanes[i]));
|
||||
|
||||
NotifyFailure(filename, line, hwy::TypeName(T(), N).c_str(), i,
|
||||
expected_str, actual_str);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Compare expected lanes to vector.
|
||||
template <class D>
|
||||
HWY_NOINLINE void AssertVecEqual(D d, const TFromD<D>* expected, Vec<D> actual,
|
||||
const char* filename, int line) {
|
||||
AssertVecEqual(d, LoadU(d, expected), actual, filename, line);
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_NOINLINE void AssertMaskEqual(D d, Mask<D> a, Mask<D> b,
|
||||
const char* filename, int line) {
|
||||
AssertVecEqual(d, VecFromMask(d, a), VecFromMask(d, b), filename, line);
|
||||
|
||||
const std::string type_name = TypeName(TFromD<D>(), Lanes(d));
|
||||
AssertEqual(CountTrue(a), CountTrue(b), type_name, filename, line, 0);
|
||||
AssertEqual(AllTrue(a), AllTrue(b), type_name, filename, line, 0);
|
||||
AssertEqual(AllFalse(a), AllFalse(b), type_name, filename, line, 0);
|
||||
|
||||
// TODO(janwas): StoreMaskBits
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_NOINLINE Mask<D> MaskTrue(const D d) {
|
||||
const auto v0 = Zero(d);
|
||||
return Eq(v0, v0);
|
||||
}
|
||||
|
||||
template <class D>
|
||||
HWY_NOINLINE Mask<D> MaskFalse(const D d) {
|
||||
// Lt is only for signed types and we cannot yet cast mask types.
|
||||
return Eq(Zero(d), Set(d, 1));
|
||||
}
|
||||
|
||||
#ifndef HWY_ASSERT_EQ
|
||||
|
||||
#define HWY_ASSERT_EQ(expected, actual) \
|
||||
AssertEqual(expected, actual, hwy::TypeName(expected, 1), __FILE__, __LINE__)
|
||||
|
||||
#define HWY_ASSERT_STRING_EQ(expected, actual) \
|
||||
AssertStringEqual(expected, actual, __FILE__, __LINE__)
|
||||
|
||||
#define HWY_ASSERT_VEC_EQ(d, expected, actual) \
|
||||
AssertVecEqual(d, expected, actual, __FILE__, __LINE__)
|
||||
|
||||
#define HWY_ASSERT_MASK_EQ(d, expected, actual) \
|
||||
AssertMaskEqual(d, expected, actual, __FILE__, __LINE__)
|
||||
|
||||
#endif // HWY_ASSERT_EQ
|
||||
|
||||
// Helpers for instantiating tests with combinations of lane types / counts.
|
||||
|
||||
// For all powers of two in [kMinLanes, N * kMinLanes] (so that recursion stops
|
||||
// at N == 0)
|
||||
template <typename T, size_t N, size_t kMinLanes, class Test>
|
||||
struct ForeachSizeR {
|
||||
static void Do() {
|
||||
static_assert(N != 0, "End of recursion");
|
||||
Test()(T(), Simd<T, N * kMinLanes>());
|
||||
ForeachSizeR<T, N / 2, kMinLanes, Test>::Do();
|
||||
}
|
||||
};
|
||||
|
||||
// Base case to stop the recursion.
|
||||
template <typename T, size_t kMinLanes, class Test>
|
||||
struct ForeachSizeR<T, 0, kMinLanes, Test> {
|
||||
static void Do() {}
|
||||
};
|
||||
|
||||
// These adapters may be called directly, or via For*Types:
|
||||
|
||||
// Calls Test for all powers of two in [kMinLanes, HWY_LANES(T) / kDivLanes].
|
||||
template <class Test, size_t kDivLanes = 1, size_t kMinLanes = 1>
|
||||
struct ForPartialVectors {
|
||||
template <typename T>
|
||||
void operator()(T /*unused*/) const {
|
||||
#if HWY_TARGET == HWY_RVV
|
||||
// Only m1..8 for now, can ignore kMaxLanes because HWY_*_LANES are full.
|
||||
ForeachSizeR<T, 8 / kDivLanes, HWY_LANES(T), Test>::Do();
|
||||
#else
|
||||
ForeachSizeR<T, HWY_LANES(T) / kDivLanes / kMinLanes, kMinLanes,
|
||||
Test>::Do();
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
// Calls Test for all vectors that can be demoted log2(kFactor) times.
|
||||
template <class Test, size_t kFactor>
|
||||
struct ForDemoteVectors {
|
||||
template <typename T>
|
||||
void operator()(T /*unused*/) const {
|
||||
#if HWY_TARGET == HWY_RVV
|
||||
// Only m1..8 for now.
|
||||
ForeachSizeR<T, 8 / kFactor, kFactor * HWY_LANES(T), Test>::Do();
|
||||
#else
|
||||
ForeachSizeR<T, HWY_LANES(T), 1, Test>::Do();
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
// Calls Test for all powers of two in [128 bits, max bits].
|
||||
template <class Test>
|
||||
struct ForGE128Vectors {
|
||||
template <typename T>
|
||||
void operator()(T /*unused*/) const {
|
||||
#if HWY_TARGET == HWY_RVV
|
||||
ForeachSizeR<T, 8, HWY_LANES(T), Test>::Do();
|
||||
#else
|
||||
ForeachSizeR<T, HWY_LANES(T) / (16 / sizeof(T)), (16 / sizeof(T)),
|
||||
Test>::Do();
|
||||
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
// Calls Test for all vectors that can be expanded by kFactor.
|
||||
template <class Test, size_t kFactor = 2>
|
||||
struct ForExtendableVectors {
|
||||
template <typename T>
|
||||
void operator()(T /*unused*/) const {
|
||||
#if HWY_TARGET == HWY_RVV
|
||||
ForeachSizeR<T, 8 / kFactor, HWY_LANES(T), Test>::Do();
|
||||
#else
|
||||
ForeachSizeR<T, HWY_LANES(T) / kFactor / (16 / sizeof(T)), (16 / sizeof(T)),
|
||||
Test>::Do();
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
// Calls Test for full vectors only.
|
||||
template <class Test>
|
||||
struct ForFullVectors {
|
||||
template <typename T>
|
||||
void operator()(T t) const {
|
||||
#if HWY_TARGET == HWY_RVV
|
||||
ForeachSizeR<T, 8, HWY_LANES(T), Test>::Do();
|
||||
(void)t;
|
||||
#else
|
||||
Test()(t, HWY_FULL(T)());
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
// Type lists to shorten call sites:
|
||||
|
||||
// Calls func with a value-initialized instance of each signed integer type;
// int64_t only if HWY_CAP_INTEGER64.
template <class Func>
void ForSignedTypes(const Func& func) {
  func(int8_t{});
  func(int16_t{});
  func(int32_t{});
#if HWY_CAP_INTEGER64
  func(int64_t{});
#endif
}

// Calls func with a value-initialized instance of each unsigned integer type;
// uint64_t only if HWY_CAP_INTEGER64.
template <class Func>
void ForUnsignedTypes(const Func& func) {
  func(uint8_t{});
  func(uint16_t{});
  func(uint32_t{});
#if HWY_CAP_INTEGER64
  func(uint64_t{});
#endif
}

// Signed types first, then unsigned.
template <class Func>
void ForIntegerTypes(const Func& func) {
  ForSignedTypes(func);
  ForUnsignedTypes(func);
}

// Calls func with float, and with double only if HWY_CAP_FLOAT64.
template <class Func>
void ForFloatTypes(const Func& func) {
  func(float{});
#if HWY_CAP_FLOAT64
  func(double{});
#endif
}

// Integer types first, then floating-point.
template <class Func>
void ForAllTypes(const Func& func) {
  ForIntegerTypes(func);
  ForFloatTypes(func);
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#endif // per-target include guard
|
|
@ -0,0 +1,102 @@
|
|||
// Copyright 2019 Google LLC
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#undef HWY_TARGET_INCLUDE
|
||||
#define HWY_TARGET_INCLUDE "tests/test_util_test.cc"
|
||||
#include "hwy/foreach_target.h"
|
||||
#include "hwy/highway.h"
|
||||
#include "hwy/tests/test_util-inl.h"
|
||||
|
||||
HWY_BEFORE_NAMESPACE();
|
||||
namespace hwy {
|
||||
namespace HWY_NAMESPACE {
|
||||
|
||||
struct TestName {
|
||||
template <class T, class D>
|
||||
HWY_NOINLINE void operator()(T t, D d) {
|
||||
char num[10];
|
||||
std::string expected = IsFloat<T>() ? "f" : (IsSigned<T>() ? "i" : "u");
|
||||
snprintf(num, sizeof(num), "%zu", sizeof(T) * 8);
|
||||
expected += num;
|
||||
|
||||
const size_t N = Lanes(d);
|
||||
if (N != 1) {
|
||||
expected += 'x';
|
||||
snprintf(num, sizeof(num), "%zu", N);
|
||||
expected += num;
|
||||
}
|
||||
const std::string actual = TypeName(t, N);
|
||||
if (expected != actual) {
|
||||
NotifyFailure(__FILE__, __LINE__, expected.c_str(), 0, expected.c_str(),
|
||||
actual.c_str());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
HWY_NOINLINE void TestAllName() { ForAllTypes(ForPartialVectors<TestName>()); }
|
||||
|
||||
struct TestEqualInteger {
|
||||
template <class T>
|
||||
HWY_NOINLINE void operator()(T /*t*/) const {
|
||||
HWY_ASSERT(IsEqual(T(0), T(0)));
|
||||
HWY_ASSERT(IsEqual(T(1), T(1)));
|
||||
HWY_ASSERT(IsEqual(T(-1), T(-1)));
|
||||
HWY_ASSERT(IsEqual(LimitsMin<T>(), LimitsMin<T>()));
|
||||
|
||||
HWY_ASSERT(!IsEqual(T(0), T(1)));
|
||||
HWY_ASSERT(!IsEqual(T(1), T(0)));
|
||||
HWY_ASSERT(!IsEqual(T(1), T(-1)));
|
||||
HWY_ASSERT(!IsEqual(T(-1), T(1)));
|
||||
HWY_ASSERT(!IsEqual(LimitsMin<T>(), LimitsMax<T>()));
|
||||
HWY_ASSERT(!IsEqual(LimitsMax<T>(), LimitsMin<T>()));
|
||||
}
|
||||
};
|
||||
|
||||
struct TestEqualFloat {
|
||||
template <class T>
|
||||
HWY_NOINLINE void operator()(T /*t*/) const {
|
||||
HWY_ASSERT(IsEqual(T(0), T(0)));
|
||||
HWY_ASSERT(IsEqual(T(1), T(1)));
|
||||
HWY_ASSERT(IsEqual(T(-1), T(-1)));
|
||||
HWY_ASSERT(IsEqual(MantissaEnd<T>(), MantissaEnd<T>()));
|
||||
|
||||
HWY_ASSERT(!IsEqual(T(0), T(1)));
|
||||
HWY_ASSERT(!IsEqual(T(1), T(0)));
|
||||
HWY_ASSERT(!IsEqual(T(1), T(-1)));
|
||||
HWY_ASSERT(!IsEqual(T(-1), T(1)));
|
||||
HWY_ASSERT(!IsEqual(LowestValue<T>(), HighestValue<T>()));
|
||||
HWY_ASSERT(!IsEqual(HighestValue<T>(), LowestValue<T>()));
|
||||
}
|
||||
};
|
||||
|
||||
HWY_NOINLINE void TestAllEqual() {
|
||||
ForIntegerTypes(TestEqualInteger());
|
||||
ForFloatTypes(TestEqualFloat());
|
||||
}
|
||||
|
||||
// NOLINTNEXTLINE(google-readability-namespace-comments)
|
||||
} // namespace HWY_NAMESPACE
|
||||
} // namespace hwy
|
||||
HWY_AFTER_NAMESPACE();
|
||||
|
||||
#if HWY_ONCE
|
||||
namespace hwy {
|
||||
HWY_BEFORE_TEST(TestUtilTest);
|
||||
HWY_EXPORT_AND_TEST_P(TestUtilTest, TestAllName);
|
||||
HWY_EXPORT_AND_TEST_P(TestUtilTest, TestAllEqual);
|
||||
} // namespace hwy
|
||||
#endif
|
|
@ -0,0 +1,8 @@
|
|||
prefix=@CMAKE_INSTALL_PREFIX@
|
||||
includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
|
||||
|
||||
Name: libhwy-test
|
||||
Description: Efficient and performance-portable SIMD wrapper, test helpers.
|
||||
Requires: gtest
|
||||
Version: @HWY_LIBRARY_VERSION@
|
||||
Cflags: -I${includedir}
|
|
@ -0,0 +1,10 @@
|
|||
prefix=@CMAKE_INSTALL_PREFIX@
|
||||
exec_prefix=${prefix}
|
||||
libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@
|
||||
includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
|
||||
|
||||
Name: libhwy
|
||||
Description: Efficient and performance-portable SIMD wrapper
|
||||
Version: @HWY_LIBRARY_VERSION@
|
||||
Libs: -L${libdir} -lhwy
|
||||
Cflags: -I${includedir}
|
|
@ -0,0 +1,20 @@
|
|||
@echo off
REM Configure, build and test in the "build" directory next to this script.
REM Prints "Success" and leaves the working directory at the script's folder
REM when every step passes; prints "Failure" and exits with code 1 otherwise.

REM Switch to the directory of this batch file. /d also changes the current
REM drive, and the quotes keep paths containing spaces intact.
cd /d "%~dp0" || goto error

if not exist build mkdir build

cd build || goto error
cmake .. -G Ninja || goto error
ninja || goto error
ctest -j || goto error

cd ..
echo Success
goto end

:error
echo Failure
exit /b 1

:end
|
|
@ -0,0 +1,15 @@
|
|||
#!/bin/bash
# Configure, build and test in the "build" directory next to this script.
# Prints "Success" only if every step succeeded.

# Exit if anything fails. Enabled first so that a failure to resolve or enter
# the script directory also aborts the run.
set -e

# Switch to directory of this script. The inner substitution is quoted so that
# a path containing spaces is passed to dirname as a single argument.
MYDIR=$(dirname "$(realpath "$0")")
cd "${MYDIR}"

mkdir -p build
cd build
cmake ..
make -j
ctest -j
echo Success
|
|
@ -0,0 +1,4 @@
|
|||
BasedOnStyle: Google
|
||||
IncludeCategories:
|
||||
- Regex: '^<hwy/'
|
||||
Priority: 2
|
|
@ -0,0 +1,70 @@
|
|||
# Disabled checks:
|
||||
# - google-readability-todo: We don't use the google TODO format.
|
||||
#
|
||||
# - modernize-deprecated-headers: We don't use std:: versions of the standard
|
||||
# types and functions like size_t or printf, so we should include <stdio.h>
|
||||
# instead of <cstdio>.
|
||||
# - modernize-return-braced-init-list: this often doesn't improve readability.
|
||||
# - modernize-use-auto: is too aggressive towards using auto.
|
||||
# - modernize-use-default-member-init: with a mix of constructors and default
|
||||
# member initialization this can be confusing if enforced.
|
||||
# - modernize-use-trailing-return-type: does not improve readability when used
|
||||
# systematically.
|
||||
# - modernize-use-using: typedefs are ok.
|
||||
#
|
||||
# - readability-else-after-return: It doesn't always improve readability.
|
||||
# - readability-static-accessed-through-instance
|
||||
# It is often more useful and readable to access a constant of a passed
|
||||
# variable (like d.N) instead of using the type of the variable that could be
|
||||
# long and complex.
|
||||
# - readability-uppercase-literal-suffix: we write 1.0f, not 1.0F.
|
||||
|
||||
Checks: >-
|
||||
bugprone-*,
|
||||
clang-*,
|
||||
-clang-diagnostic-unused-command-line-argument,
|
||||
google-*,
|
||||
modernize-*,
|
||||
performance-*,
|
||||
readability-*,
|
||||
-google-readability-todo,
|
||||
-modernize-deprecated-headers,
|
||||
-modernize-return-braced-init-list,
|
||||
-modernize-use-auto,
|
||||
-modernize-use-default-member-init,
|
||||
-modernize-use-trailing-return-type,
|
||||
-modernize-use-using,
|
||||
-readability-else-after-return,
|
||||
-readability-function-cognitive-complexity,
|
||||
-readability-static-accessed-through-instance,
|
||||
-readability-uppercase-literal-suffix,
|
||||
|
||||
|
||||
WarningsAsErrors: >-
|
||||
bugprone-argument-comment,
|
||||
bugprone-macro-parentheses,
|
||||
bugprone-suspicious-string-compare,
|
||||
bugprone-use-after-move,
|
||||
clang-*,
|
||||
clang-analyzer-*,
|
||||
-clang-diagnostic-unused-command-line-argument,
|
||||
google-build-using-namespace,
|
||||
google-explicit-constructor,
|
||||
google-readability-braces-around-statements,
|
||||
google-readability-namespace-comments,
|
||||
modernize-use-override,
|
||||
readability-inconsistent-declaration-parameter-name
|
||||
|
||||
# We are only interested in the headers from this project, excluding
|
||||
# third_party/ and build/.
|
||||
HeaderFilterRegex: '^.*/(lib|tools)/.*\.h$'
|
||||
|
||||
CheckOptions:
|
||||
- key: readability-braces-around-statements.ShortStatementLines
|
||||
value: '2'
|
||||
- key: google-readability-braces-around-statements.ShortStatementLines
|
||||
value: '2'
|
||||
- key: readability-implicit-bool-conversion.AllowPointerConditions
|
||||
value: '1'
|
||||
- key: readability-implicit-bool-conversion.AllowIntegerConditions
|
||||
value: '1'
|
|
@ -0,0 +1,334 @@
|
|||
# Copyright (c) the JPEG XL Project
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Ubuntu bionic ships with cmake 3.10.
|
||||
cmake_minimum_required(VERSION 3.10)
|
||||
|
||||
# Honor VISIBILITY_INLINES_HIDDEN on all types of targets.
|
||||
if(POLICY CMP0063)
|
||||
cmake_policy(SET CMP0063 NEW)
|
||||
endif()
|
||||
# Pass CMAKE_EXE_LINKER_FLAGS to CC and CXX compilers when testing if they work.
|
||||
if(POLICY CMP0065)
|
||||
cmake_policy(SET CMP0065 NEW)
|
||||
endif()
|
||||
|
||||
# Set PIE flags for POSITION_INDEPENDENT_CODE targets, added in 3.14.
|
||||
if(POLICY CMP0083)
|
||||
cmake_policy(SET CMP0083 NEW)
|
||||
endif()
|
||||
|
||||
project(JPEGXL LANGUAGES C CXX)
|
||||
|
||||
include(CheckCXXSourceCompiles)
|
||||
check_cxx_source_compiles(
|
||||
"int main() {
|
||||
#if !defined(__EMSCRIPTEN__)
|
||||
static_assert(false, \"__EMSCRIPTEN__ is not defined\");
|
||||
#endif
|
||||
return 0;
|
||||
}"
|
||||
JPEGXL_EMSCRIPTEN
|
||||
)
|
||||
|
||||
message(STATUS "CMAKE_SYSTEM_PROCESSOR is ${CMAKE_SYSTEM_PROCESSOR}")
|
||||
include(CheckCXXCompilerFlag)
|
||||
check_cxx_compiler_flag("-fsanitize=fuzzer-no-link" CXX_FUZZERS_SUPPORTED)
|
||||
check_cxx_compiler_flag("-Xclang -mconstructor-aliases" CXX_CONSTRUCTOR_ALIASES_SUPPORTED)
|
||||
|
||||
# Enable PIE binaries by default if supported.
|
||||
include(CheckPIESupported OPTIONAL RESULT_VARIABLE CHECK_PIE_SUPPORTED)
|
||||
if(CHECK_PIE_SUPPORTED)
|
||||
check_pie_supported(LANGUAGES CXX)
|
||||
if(CMAKE_CXX_LINK_PIE_SUPPORTED)
|
||||
set(CMAKE_POSITION_INDEPENDENT_CODE TRUE)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
### Project build options:
|
||||
# Fuzzer targets default to on whenever the compiler accepted
# -fsanitize=fuzzer-no-link in the check above.
if(CXX_FUZZERS_SUPPORTED)
  set(ENABLE_FUZZERS_DEFAULT true)
endif()

# tcmalloc is only probed on x86_64 hosts that are not Apple, Windows or
# Haiku. CMAKE_SYSTEM_PROCESSOR is quoted because it may be empty (e.g. when a
# toolchain file does not set it); an empty unquoted expansion would leave
# MATCHES without a left operand and break the condition.
find_package(PkgConfig)
if(NOT APPLE AND NOT WIN32 AND NOT HAIKU
   AND "${CMAKE_SYSTEM_PROCESSOR}" MATCHES "x86_64")
  pkg_check_modules(TCMallocMinimalVersionCheck QUIET IMPORTED_TARGET
      libtcmalloc_minimal)
  if(TCMallocMinimalVersionCheck_FOUND AND
     NOT TCMallocMinimalVersionCheck_VERSION VERSION_EQUAL 2.8.0)
    # Enabled by default for any tcmalloc other than 2.8.0; tcmalloc 2.8.1
    # already has a fix for this issue.
    set(ENABLE_TCMALLOC_DEFAULT true)
  elseif(TCMallocMinimalVersionCheck_FOUND)
    # Only reached for the affected 2.8.0 release. Nothing is printed when
    # tcmalloc is simply not installed, since the message would be misleading.
    message(STATUS
        "tcmalloc version ${TCMallocMinimalVersionCheck_VERSION} -- "
        "tcmalloc 2.8.0 disabled due to "
        "https://github.com/gperftools/gperftools/issues/1204")
  endif()
endif()
|
||||
|
||||
set(WARNINGS_AS_ERRORS_DEFAULT false)
|
||||
|
||||
set(JPEGXL_ENABLE_FUZZERS ${ENABLE_FUZZERS_DEFAULT} CACHE BOOL
|
||||
"Build JPEGXL fuzzer targets.")
|
||||
set(JPEGXL_ENABLE_DEVTOOLS false CACHE BOOL
|
||||
"Build JPEGXL developer tools.")
|
||||
set(JPEGXL_ENABLE_MANPAGES true CACHE BOOL
|
||||
"Build and install man pages for the command-line tools.")
|
||||
set(JPEGXL_ENABLE_BENCHMARK true CACHE BOOL
|
||||
"Build JPEGXL benchmark tools.")
|
||||
set(JPEGXL_ENABLE_EXAMPLES true CACHE BOOL
|
||||
"Build JPEGXL library usage examples.")
|
||||
set(JPEGXL_ENABLE_SJPEG true CACHE BOOL
|
||||
"Build JPEGXL with support for encoding with sjpeg.")
|
||||
set(JPEGXL_ENABLE_OPENEXR true CACHE BOOL
|
||||
"Build JPEGXL with support for OpenEXR if available.")
|
||||
set(JPEGXL_ENABLE_SKCMS true CACHE BOOL
|
||||
"Build with skcms instead of lcms2.")
|
||||
set(JPEGXL_ENABLE_VIEWERS false CACHE BOOL
|
||||
"Build JPEGXL viewer tools for evaluation.")
|
||||
set(JPEGXL_ENABLE_TCMALLOC ${ENABLE_TCMALLOC_DEFAULT} CACHE BOOL
|
||||
"Build JPEGXL using gperftools (tcmalloc) allocator.")
|
||||
set(JPEGXL_ENABLE_PLUGINS false CACHE BOOL
|
||||
"Build third-party plugings to support JPEG XL in other applications.")
|
||||
set(JPEGXL_ENABLE_COVERAGE false CACHE BOOL
|
||||
"Enable code coverage tracking for libjxl. This also enables debug and disables optimizations.")
|
||||
set(JPEGXL_STATIC false CACHE BOOL
|
||||
"Build tools as static binaries.")
|
||||
set(JPEGXL_WARNINGS_AS_ERRORS ${WARNINGS_AS_ERRORS_DEFAULT} CACHE BOOL
|
||||
"Treat warnings as errors during compilation.")
|
||||
set(JPEGXL_DEP_LICENSE_DIR "" CACHE STRING
|
||||
"Directory where to search for system dependencies \"copyright\" files.")
|
||||
set(JPEGXL_FORCE_NEON false CACHE BOOL
|
||||
"Set flags to enable NEON in arm if not enabled by your toolchain.")
|
||||
|
||||
|
||||
# Force system dependencies.
|
||||
set(JPEGXL_FORCE_SYSTEM_GTEST false CACHE BOOL
|
||||
"Force using system installed googletest (gtest/gmock) instead of third_party/googletest source.")
|
||||
set(JPEGXL_FORCE_SYSTEM_BROTLI false CACHE BOOL
|
||||
"Force using system installed brotli instead of third_party/brotli source.")
|
||||
set(JPEGXL_FORCE_SYSTEM_HWY false CACHE BOOL
|
||||
"Force using system installed highway (libhwy-dev) instead of third_party/highway source.")
|
||||
|
||||
# Check minimum compiler versions. Older compilers are not supported and fail
# with hard to understand errors.
#
# Every expansion below is quoted: an empty value (for example
# CMAKE_ANDROID_NDK_TOOLCHAIN_VERSION outside of Android builds) would
# otherwise vanish from the argument list and change what the condition
# compares, so the minimum-version check could be skipped or mis-parsed.
if(NOT "${CMAKE_C_COMPILER_ID}" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
  message(FATAL_ERROR "Different C/C++ compilers set: "
          "${CMAKE_C_COMPILER_ID} vs ${CMAKE_CXX_COMPILER_ID}")
endif()
if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
  # Android NDK's toolchain.cmake fakes the clang version in
  # CMAKE_CXX_COMPILER_VERSION with an incorrect number, so ignore this.
  if(NOT "${CMAKE_ANDROID_NDK_TOOLCHAIN_VERSION}" MATCHES "clang"
     AND "${CMAKE_CXX_COMPILER_VERSION}" VERSION_LESS 6)
    message(FATAL_ERROR
      "Minimum Clang version required is Clang 6, please update.")
  endif()
elseif("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
  if("${CMAKE_CXX_COMPILER_VERSION}" VERSION_LESS 7)
    message(FATAL_ERROR
      "Minimum GCC version required is 7, please update.")
  endif()
endif()
|
||||
|
||||
message(STATUS
|
||||
"Compiled IDs C:${CMAKE_C_COMPILER_ID}, C++:${CMAKE_CXX_COMPILER_ID}")
|
||||
|
||||
# CMAKE_EXPORT_COMPILE_COMMANDS is used to generate the compilation database
|
||||
# used by clang-tidy.
|
||||
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
|
||||
|
||||
if(JPEGXL_STATIC)
|
||||
set(CMAKE_FIND_LIBRARY_SUFFIXES .a)
|
||||
set(BUILD_SHARED_LIBS 0)
|
||||
set(CMAKE_EXE_LINKER_FLAGS
|
||||
"${CMAKE_EXE_LINKER_FLAGS} -static -static-libgcc -static-libstdc++")
|
||||
if (MINGW)
|
||||
# In MINGW libstdc++ uses pthreads directly. When building statically a
|
||||
# program (regardless of whether the source code uses pthread or not) the
|
||||
# toolchain will add stdc++ and pthread to the linking step but stdc++ will
|
||||
# be linked statically while pthread will be linked dynamically.
|
||||
# To avoid this and have pthread statically linked we need to pass it in
|
||||
# the command line with "-Wl,-Bstatic -lpthread -Wl,-Bdynamic" but the
|
||||
# linker will discard it if not used by anything else up to that point in
|
||||
# the linker command line. If the program or any dependency don't use
|
||||
# pthread directly -lpthread is discarded and libstdc++ (added by the
|
||||
# toolchain later) will then use the dynamic version. For this we also need
|
||||
# to pass -lstdc++ explicitly before -lpthread. For pure C programs -lstdc++
|
||||
# will be discarded anyway.
|
||||
# This adds these flags as dependencies for *all* targets. Adding this to
|
||||
# CMAKE_EXE_LINKER_FLAGS instead would cause them to be included before any
|
||||
# object files and therefore discarded.
|
||||
link_libraries(-Wl,-Bstatic -lstdc++ -lpthread -Wl,-Bdynamic)
|
||||
endif() # MINGW
|
||||
endif() # JPEGXL_STATIC
|
||||
|
||||
if (MSVC)
|
||||
# TODO(janwas): add flags
|
||||
else ()
|
||||
|
||||
# Global compiler flags for all targets here and in subdirectories.
|
||||
add_definitions(
|
||||
# Avoid changing the binary based on the current time and date.
|
||||
-D__DATE__="redacted"
|
||||
-D__TIMESTAMP__="redacted"
|
||||
-D__TIME__="redacted"
|
||||
)
|
||||
|
||||
if("${JPEGXL_ENABLE_FUZZERS}" OR "${JPEGXL_ENABLE_COVERAGE}")
|
||||
add_definitions(
|
||||
-DJXL_ENABLE_FUZZERS
|
||||
)
|
||||
endif() # JPEGXL_ENABLE_FUZZERS
|
||||
|
||||
# In CMake before 3.12 it is problematic to pass repeated flags like -Xclang.
|
||||
# For this reason we place them in CMAKE_CXX_FLAGS instead.
|
||||
# See https://gitlab.kitware.com/cmake/cmake/issues/15826
|
||||
|
||||
# Machine flags.
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -funwind-tables")
|
||||
if (${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xclang -mrelax-all")
|
||||
endif()
|
||||
if ("${CXX_CONSTRUCTOR_ALIASES_SUPPORTED}")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xclang -mconstructor-aliases")
|
||||
endif()
|
||||
|
||||
if(WIN32)
|
||||
# Not supported by clang-cl, but frame pointers are default on Windows
|
||||
else()
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-omit-frame-pointer")
|
||||
endif()
|
||||
|
||||
# CPU flags - remove once we have NEON dynamic dispatch
|
||||
|
||||
# TODO(janwas): this also matches M1, but only ARMv7 is intended/needed.
|
||||
# Opt-in NEON flags for ARM targets. CMAKE_SYSTEM_PROCESSOR is quoted so that
# an empty value does not leave MATCHES without a left operand.
if("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "arm")
  if(JPEGXL_FORCE_NEON)
    # GCC requires these flags, otherwise __ARM_NEON is undefined.
    set(CMAKE_CXX_FLAGS
        "${CMAKE_CXX_FLAGS} -mfpu=neon-vfpv4 -mfloat-abi=hard")
  endif()
endif()
|
||||
|
||||
# Force build with optimizations in release mode.
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O2")
|
||||
|
||||
add_compile_options(
|
||||
# Ignore this to allow redefining __DATE__ and others.
|
||||
-Wno-builtin-macro-redefined
|
||||
|
||||
# Global warning settings.
|
||||
-Wall
|
||||
)
|
||||
|
||||
if (JPEGXL_WARNINGS_AS_ERRORS)
|
||||
add_compile_options(-Werror)
|
||||
endif ()
|
||||
endif () # !MSVC
|
||||
|
||||
include(GNUInstallDirs)
|
||||
|
||||
set(CMAKE_CXX_STANDARD 11)
|
||||
set(CMAKE_CXX_EXTENSIONS OFF)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED YES)
|
||||
|
||||
add_subdirectory(third_party)
|
||||
|
||||
set(THREADS_PREFER_PTHREAD_FLAG YES)
|
||||
find_package(Threads REQUIRED)
|
||||
|
||||
# Copy the JXL license file to the output build directory.
|
||||
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/LICENSE"
|
||||
${PROJECT_BINARY_DIR}/LICENSE.jpeg-xl COPYONLY)
|
||||
|
||||
# Enable tests regardless of where they are defined.
|
||||
enable_testing()
|
||||
include(CTest)
|
||||
|
||||
# Libraries.
|
||||
add_subdirectory(lib)
|
||||
|
||||
if(BUILD_TESTING)
|
||||
# Script to run tests over the source code in bash.
|
||||
find_program (BASH_PROGRAM bash)
|
||||
if(BASH_PROGRAM)
|
||||
add_test(
|
||||
NAME bash_test
|
||||
COMMAND ${BASH_PROGRAM} ${CMAKE_CURRENT_SOURCE_DIR}/bash_test.sh)
|
||||
endif()
|
||||
endif() # BUILD_TESTING
|
||||
|
||||
# Documentation generated by Doxygen
|
||||
# Defines the "doc" target. With Doxygen available it generates HTML API
# documentation from the public headers; otherwise it is a placeholder target
# that always fails with an explanatory comment.
find_package(Doxygen)
if(DOXYGEN_FOUND)
  set(DOXYGEN_GENERATE_HTML "YES")
  set(DOXYGEN_GENERATE_XML "NO")
  # The documented headers are read from lib/include/jxl (see the inputs
  # below), so the prefix to strip from displayed paths is lib/include.
  set(DOXYGEN_STRIP_FROM_PATH "${CMAKE_CURRENT_SOURCE_DIR}/lib/include")
  set(DOXYGEN_USE_MDFILE_AS_MAINPAGE "README.md")
  set(DOXYGEN_WARN_AS_ERROR "YES")
  doxygen_add_docs(doc
    "${CMAKE_CURRENT_SOURCE_DIR}/lib/include/jxl"
    "${CMAKE_CURRENT_SOURCE_DIR}/doc/api.txt"
    WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}"
    COMMENT "Generating C API documentation")
else()
  # Create a "doc" target for compatibility since "doc" is not otherwise added
  # to the build when doxygen is not installed.
  add_custom_target(doc false
    COMMENT "Error: Can't generate doc since Doxygen not installed.")
endif() # DOXYGEN_FOUND
|
||||
|
||||
if(JPEGXL_ENABLE_MANPAGES)
|
||||
find_package(Python COMPONENTS Interpreter)
|
||||
if(Python_Interpreter_FOUND)
|
||||
find_program(ASCIIDOC a2x)
|
||||
endif()
|
||||
if(NOT Python_Interpreter_FOUND OR "${ASCIIDOC}" STREQUAL "ASCIIDOC-NOTFOUND")
|
||||
message(WARNING "asciidoc was not found, the man pages will not be installed.")
|
||||
else()
|
||||
set(MANPAGE_FILES "")
|
||||
set(MANPAGES "")
|
||||
foreach(PAGE IN ITEMS cjxl djxl)
|
||||
# Invoking the Python interpreter ourselves instead of running the a2x binary
|
||||
# directly is necessary on MSYS2, otherwise it is run through cmd.exe which
|
||||
# does not recognize it.
|
||||
add_custom_command(
|
||||
OUTPUT "${PAGE}.1"
|
||||
COMMAND Python::Interpreter
|
||||
ARGS "${ASCIIDOC}"
|
||||
--format manpage --destination-dir="${CMAKE_CURRENT_BINARY_DIR}"
|
||||
"${CMAKE_CURRENT_SOURCE_DIR}/doc/man/${PAGE}.txt"
|
||||
MAIN_DEPENDENCY "${CMAKE_CURRENT_SOURCE_DIR}/doc/man/${PAGE}.txt")
|
||||
list(APPEND MANPAGE_FILES "${CMAKE_CURRENT_BINARY_DIR}/${PAGE}.1")
|
||||
list(APPEND MANPAGES "${PAGE}.1")
|
||||
endforeach()
|
||||
add_custom_target(manpages ALL DEPENDS ${MANPAGES})
|
||||
install(FILES ${MANPAGE_FILES} DESTINATION share/man/man1)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# Example usage code.
|
||||
if (${JPEGXL_ENABLE_EXAMPLES})
|
||||
add_subdirectory(examples)
|
||||
endif ()
|
||||
|
||||
# Plugins for third-party software
|
||||
if (${JPEGXL_ENABLE_PLUGINS})
|
||||
add_subdirectory(plugins)
|
||||
endif ()
|
||||
|
||||
# Binary tools
|
||||
add_subdirectory(tools)
|
|
@ -0,0 +1,10 @@
|
|||
# How to Contribute
|
||||
|
||||
We are currently unable to accept patches to this project, but we'd very much
|
||||
appreciate it if you open a GitLab issue for any bug reports/feature requests.
|
||||
We will be happy to investigate and hopefully fix any issues.
|
||||
|
||||
# Community Guidelines
|
||||
|
||||
This project follows
|
||||
[Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
|
|
@ -0,0 +1,23 @@
|
|||
# This file lists individuals who made significant contributions to the JPEG XL
|
||||
# code base, such as design, adding features, performing experiments, ...
|
||||
# Small changes such as a small bugfix or fixing spelling errors are not
|
||||
# included. If you'd like to be included in this file thanks to a significant
|
||||
# contribution, feel free to send a pull request changing this file.
|
||||
Alex Deymo
|
||||
Alexander Rhatushnyak
|
||||
Evgenii Kliuchnikov
|
||||
Iulia-Maria Comșa
|
||||
Jan Wassenberg
|
||||
Jon Sneyers
|
||||
Jyrki Alakuijala
|
||||
Krzysztof Potempa
|
||||
Lode Vandevenne
|
||||
Luca Versari
|
||||
Martin Bruse
|
||||
Moritz Firsching
|
||||
Renata Khasanova
|
||||
Robert Obryk
|
||||
Sami Boukortt
|
||||
Sebastian Gomez-Gonzalez
|
||||
Thomas Fischbacher
|
||||
Zoltan Szabadka
|
|
@ -0,0 +1,203 @@
|
|||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
|
|
@ -0,0 +1,20 @@
|
|||
## Disclaimer
|
||||
|
||||
Haiku builds are not officially supported, i.e. the build might not work at all,
|
||||
some tests may fail and some sub-projects are excluded from build.
|
||||
|
||||
This manual outlines Haiku-specific setup. For general building and testing
|
||||
instructions see "[README](README.md)" and
|
||||
"[Building and Testing changes](doc/building_and_testing.md)".
|
||||
|
||||
## Dependencies
|
||||
|
||||
```shell
|
||||
pkgman install llvm9_clang ninja cmake doxygen libjpeg_turbo_devel
|
||||
```
|
||||
|
||||
## Building
|
||||
|
||||
```shell
|
||||
TEST_STACK_LIMIT=none CMAKE_FLAGS="-I/boot/system/develop/tools/lib/gcc/x86_64-unknown-haiku/8.3.0/include/c++ -I/boot/system/develop/tools/lib/gcc/x86_64-unknown-haiku/8.3.0/include/c++/x86_64-unknown-haiku" CMAKE_SHARED_LINKER_FLAGS="-shared -Xlinker -soname=libjpegxl.so -lpthread" ./ci.sh opt
|
||||
```
|
|
@ -0,0 +1,41 @@
|
|||
## Disclaimer
|
||||
|
||||
OSX builds have "best effort" support, i.e. the build might not work at all, some
|
||||
tests may fail and some sub-projects are excluded from build.
|
||||
|
||||
This manual outlines OSX specific setup. For general building and testing
|
||||
instructions see "[README](README.md)" and
|
||||
"[Building and Testing changes](doc/building_and_testing.md)".
|
||||
|
||||
[Homebrew](https://brew.sh/) is a popular package manager. JPEG XL library and
|
||||
binaries could be installed using it:
|
||||
|
||||
```bash
|
||||
brew install jpeg-xl
|
||||
```
|
||||
|
||||
## Dependencies
|
||||
|
||||
Make sure that `brew doctor` does not report serious problems and that an up-to-date
|
||||
version of XCode is installed.
|
||||
|
||||
Installing (actually, building) `clang` might take a couple hours.
|
||||
|
||||
```bash
|
||||
brew install llvm
|
||||
```
|
||||
|
||||
```bash
|
||||
brew install coreutils cmake giflib jpeg-turbo libpng ninja zlib
|
||||
```
|
||||
|
||||
Before building the project check that `which clang` is
|
||||
`/usr/local/opt/llvm/bin/clang`, not the one provided by XCode. If not, update
|
||||
`PATH` environment variable.
|
||||
|
||||
Also, setting `CMAKE_PREFIX_PATH` might be necessary for correct include paths
|
||||
resolving, e.g.:
|
||||
|
||||
```bash
|
||||
export CMAKE_PREFIX_PATH=`brew --prefix giflib`:`brew --prefix jpeg-turbo`:`brew --prefix libpng`:`brew --prefix zlib`
|
||||
```
|
|
@ -0,0 +1,224 @@
|
|||
# JPEG XL reference implementation
|
||||
|
||||
<img src="doc/jxl.svg" width="100" align="right" alt="JXL logo">
|
||||
|
||||
This repository contains a reference implementation of JPEG XL (encoder and
|
||||
decoder), called `libjxl`.
|
||||
|
||||
JPEG XL is in the final stages of standardization and its codestream format is
|
||||
frozen.
|
||||
|
||||
The library API, command line options and tools in this repository are subject
|
||||
to change, however files encoded with `cjxl` conform to the JPEG XL format
|
||||
specification and can be decoded with current and future `djxl` decoders or
|
||||
`libjxl` decoding library.
|
||||
|
||||
## Quick start guide
|
||||
|
||||
For more details and other workflows see the "Advanced guide" below.
|
||||
|
||||
### Checking out the code
|
||||
|
||||
```bash
|
||||
git clone https://gitlab.com/wg1/jpeg-xl.git --recursive
|
||||
```
|
||||
|
||||
This repository uses git submodules to handle some third party dependencies
|
||||
under `third_party/`, that's why it is important to pass `--recursive`. If you
|
||||
didn't check out with `--recursive`, or any submodule has changed, run:
|
||||
`git submodule update --init --recursive`.
|
||||
|
||||
Important: If you downloaded a zip file or tarball from the web interface you
|
||||
won't get the needed submodules and the code will not compile. You can download
|
||||
these external dependencies from source running `./deps.sh`. The git workflow
|
||||
described above is recommended instead.
|
||||
|
||||
### Installing dependencies
|
||||
|
||||
Required dependencies for compiling the code, in a Debian/Ubuntu based
|
||||
distribution run:
|
||||
|
||||
```bash
|
||||
sudo apt install cmake pkg-config libbrotli-dev
|
||||
```
|
||||
|
||||
Optional dependencies for supporting other formats in the `cjxl`/`djxl` tools,
|
||||
in a Debian/Ubuntu based distribution run:
|
||||
|
||||
```bash
|
||||
sudo apt install libgif-dev libjpeg-dev libopenexr-dev libpng-dev libwebp-dev
|
||||
```
|
||||
|
||||
We recommend using a recent Clang compiler (version 7 or newer), for that
|
||||
install clang and set `CC` and `CXX` variables. For example, with clang-7:
|
||||
|
||||
```bash
|
||||
sudo apt install clang-7
|
||||
export CC=clang-7 CXX=clang++-7
|
||||
```
|
||||
|
||||
### Building
|
||||
|
||||
```bash
|
||||
cd jpeg-xl
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTING=OFF ..
|
||||
cmake --build . -- -j$(nproc)
|
||||
```
|
||||
|
||||
The encoder/decoder tools will be available in the `build/tools` directory.
|
||||
|
||||
### <a name="installing"></a> Installing
|
||||
|
||||
```bash
|
||||
sudo cmake --install .
|
||||
```
|
||||
|
||||
### Basic encoder/decoder
|
||||
|
||||
To encode a source image to JPEG XL with default settings:
|
||||
|
||||
```bash
|
||||
build/tools/cjxl input.png output.jxl
|
||||
```
|
||||
|
||||
For more settings run `build/tools/cjxl --help` or for a full list of options
|
||||
run `build/tools/cjxl -v -v --help`.
|
||||
|
||||
To decode a JPEG XL file run:
|
||||
|
||||
```bash
|
||||
build/tools/djxl input.jxl output.png
|
||||
```
|
||||
|
||||
When possible `cjxl`/`djxl` are able to read/write the following
|
||||
image formats: .exr, .gif, .jpeg/.jpg, .pfm, .pgm/.ppm, .pgx, .png.
|
||||
|
||||
### Benchmarking
|
||||
|
||||
For speed benchmarks on single images in single or multi-threaded decoding
|
||||
`djxl` can print decoding speed information. See `djxl --help` for details
|
||||
on the decoding options and note that the output image is optional for
|
||||
benchmarking purposes.
|
||||
|
||||
For a more comprehensive comparison of compression density between multiple
|
||||
options see "Benchmarking with benchmark_xl" section below.
|
||||
|
||||
## Advanced guide
|
||||
|
||||
### Building with Docker
|
||||
|
||||
We build a common environment based on Debian/Ubuntu using Docker. Other
|
||||
systems may have different combinations of versions and dependencies that
|
||||
have not been tested and may not work. For those cases we recommend using the
|
||||
Docker environment as explained in the
|
||||
[step by step guide](doc/developing_in_docker.md).
|
||||
|
||||
### Building JPEG XL for developers
|
||||
|
||||
For experienced developers, we also provide build instructions for an [up to
|
||||
date Debian-based Linux](doc/developing_in_debian.md) and [64-bit
|
||||
Windows](doc/developing_in_windows.md). If you encounter any difficulties,
|
||||
please use Docker instead.
|
||||
|
||||
## Benchmarking with benchmark_xl
|
||||
|
||||
We recommend `build/tools/benchmark_xl` as a convenient method for reading
|
||||
images or image sequences, encoding them using various codecs (jpeg jxl png
|
||||
webp), decoding the result, and computing objective quality metrics. An example
|
||||
invocation is:
|
||||
|
||||
```bash
|
||||
build/tools/benchmark_xl --input "/path/*.png" --codec jxl:wombat:d1,jxl:cheetah:d2
|
||||
```
|
||||
|
||||
Multiple comma-separated codecs are allowed. The characters after : are
|
||||
parameters for the codec, separated by colons, in this case specifying maximum
|
||||
target psychovisual distances of 1 and 2 (higher implies lower quality) and
|
||||
the encoder effort (see below). Other common parameters are `r0.5` (target
|
||||
bitrate 0.5 bits per pixel) and `q92` (quality 92, on a scale of 0-100, where
|
||||
higher is better). The `jxl` codec supports the following additional parameters:
|
||||
|
||||
Speed: `falcon`, `cheetah`, `hare`, `wombat`, `squirrel`, `kitten`, `tortoise`
|
||||
control the encoder effort in ascending order. This also affects memory usage:
|
||||
using lower effort will typically reduce memory consumption during encoding.
|
||||
|
||||
* `falcon` disables all of the following tools.
|
||||
* `cheetah` enables coefficient reordering, context clustering, and heuristics
|
||||
for selecting DCT sizes and quantization steps.
|
||||
* `hare` enables Gaborish filtering, chroma from luma, and an initial estimate
|
||||
of quantization steps.
|
||||
* `wombat` enables error diffusion quantization and full DCT size selection
|
||||
heuristics.
|
||||
* `squirrel` (default) enables dots, patches, and spline detection, and full
|
||||
context clustering.
|
||||
* `kitten` optimizes the adaptive quantization for a psychovisual metric.
|
||||
* `tortoise` enables a more thorough adaptive quantization search.
|
||||
|
||||
Mode: JPEG XL has two modes. The default is Var-DCT mode, which is suitable for
|
||||
lossy compression. The other mode is Modular mode, which is suitable for lossless
|
||||
compression. Modular mode can also do lossy compression (e.g. `jxl:m:q50`).
|
||||
|
||||
* `m` activates modular mode.
|
||||
|
||||
Other arguments to benchmark_xl include:
|
||||
|
||||
* `--save_compressed`: save codestreams to `output_dir`.
|
||||
* `--save_decompressed`: save decompressed outputs to `output_dir`.
|
||||
* `--output_extension`: selects the format used to output decoded images.
|
||||
* `--num_threads`: number of codec instances that will independently
|
||||
encode/decode images, or 0.
|
||||
* `--inner_threads`: how many threads each instance should use for parallel
|
||||
encoding/decoding, or 0.
|
||||
* `--encode_reps`/`--decode_reps`: how many times to repeat encoding/decoding
|
||||
each image, for more consistent measurements (we recommend 10).
|
||||
|
||||
The benchmark output begins with a header:
|
||||
|
||||
```
|
||||
Compr Input Compr Compr Compr Decomp Butteraugli
|
||||
Method Pixels Size BPP # MP/s MP/s Distance Error p norm BPP*pnorm Errors
|
||||
```
|
||||
|
||||
`ComprMethod` lists each comma-separated codec. `InputPixels` is the number
|
||||
of pixels in the input image. `ComprSize` is the codestream size in bytes and
|
||||
`ComprBPP` the bitrate. `Compr MP/s` and `Decomp MP/s` are the
|
||||
compress/decompress throughput, in units of Megapixels/second.
|
||||
`Butteraugli Distance` indicates the maximum psychovisual error in the decoded
|
||||
image (larger is worse). `Error p norm` is a similar summary of the psychovisual
|
||||
error, but closer to an average, giving less weight to small low-quality
|
||||
regions. `BPP*pnorm` is the product of `ComprBPP` and `Error p norm`, which is a
|
||||
figure of merit for the codec (lower is better). `Errors` is nonzero if errors
|
||||
occurred while loading or encoding/decoding the image.
|
||||
|
||||
## License
|
||||
|
||||
This software is available under Apache 2.0 license which can be found in the
|
||||
[LICENSE](LICENSE) file.
|
||||
|
||||
## Additional documentation
|
||||
|
||||
### Codec description
|
||||
|
||||
* [Introductory paper](https://www.spiedigitallibrary.org/proceedings/Download?fullDOI=10.1117%2F12.2529237) (open-access)
|
||||
* [XL Overview](doc/xl_overview.md) - a brief introduction to the source code modules
|
||||
* [JPEG XL white paper](http://ds.jpeg.org/whitepapers/jpeg-xl-whitepaper.pdf)
|
||||
* [JPEG XL website](https://jpeg.org/jpegxl/)
|
||||
* [Jon's JXL info page](https://sneyers.info/jxl/)
|
||||
|
||||
### Development process
|
||||
* [Docker setup - **start here**](doc/developing_in_docker.md)
|
||||
* [Building on Debian](doc/developing_in_debian.md) - for experts only
|
||||
* [Building on Windows](doc/developing_in_windows.md) - for experts only
|
||||
* [More information on testing/build options](doc/building_and_testing.md)
|
||||
* [Git guide for JPEG XL](doc/developing_in_gitlab.md) - for developers only
|
||||
* [Building Web Assembly artifacts](doc/building_wasm.md)
|
||||
|
||||
### Contact
|
||||
|
||||
If you encounter a bug or other issue with the software, please open an Issue here.
|
||||
|
||||
There is a [subreddit about JPEG XL](https://www.reddit.com/r/jpegxl/), and
|
||||
informal chatting with developers and early adopters of `libjxl` can be done on the
|
||||
[JPEG XL Discord server](https://discord.gg/DqkQgDRTFu).
|
|
@ -0,0 +1,37 @@
|
|||
# Security and Vulnerability Policy for JPEG XL
|
||||
|
||||
The current focus of the reference implementation is to provide a vehicle for
|
||||
evaluating the JPEG XL codec compression density, quality, features and its
|
||||
actual performance on different platforms. With this focus in mind we provide
|
||||
source code releases with improvements on performance and quality so developers
|
||||
can evaluate the codec.
|
||||
|
||||
At this time, **we don't provide security and vulnerability support** for any
|
||||
of these releases. This means that the source code may contain bugs, including
|
||||
security bugs, that may be added or fixed between releases and will **not** be
|
||||
individually documented. All of these
|
||||
[releases](https://gitlab.com/wg1/jpeg-xl/-/releases) include the following
|
||||
note to that effect:
|
||||
|
||||
* Note: This release is for evaluation purposes and may contain bugs, including
|
||||
security bugs, that will *not* be individually documented when fixed. Always
|
||||
prefer to use the latest release. Please provide feedback and report bugs
|
||||
[here](https://gitlab.com/wg1/jpeg-xl/-/issues).
|
||||
|
||||
To be clear, the fact that a release doesn't mention any CVE
|
||||
doesn't mean that no security issues in previous versions were fixed. You should
|
||||
assume that any previous release contains security issues if that's a concern
|
||||
for your use case.
|
||||
|
||||
This however doesn't impede you from evaluating the codec with your own trusted
|
||||
inputs, such as `.jxl` you encoded yourself, or when taking appropriate measures
|
||||
for your application like sandboxing if processing untrusted inputs.
|
||||
|
||||
## Future plans
|
||||
|
||||
To help our users and developers integrating this implementation into their
|
||||
software we plan to provide support for security and vulnerability tracking of
|
||||
this implementation in the future.
|
||||
|
||||
When we can provide such support we will update this Policy with the details and
|
||||
expectations and clearly mention that fact in the release notes.
|
|
@ -0,0 +1,229 @@
|
|||
#!/bin/bash
|
||||
# Copyright (c) the JPEG XL Project
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Tests implemented in bash. These typically will run checks about the source
|
||||
# code rather than the compiled one.
|
||||
|
||||
# Directory that contains this script; main() changes into it before running
# the tests, which all use repository-relative paths.
# The inner command substitution is quoted so that a checkout path containing
# whitespace reaches dirname as a single argument instead of being word-split.
MYDIR=$(dirname "$(realpath "$0")")

# Treat the expansion of an unset variable as an error.
set -u
|
||||
|
||||
# Checks #include hygiene in every tracked .cc/.cpp/.h file:
#  * files under lib/include/ must not spell out the "lib/include/jxl" prefix,
#    because users of the library include them as: #include "jxl/foobar.h";
#  * files outside third_party/ must not include through a "third_party/"
#    path, since installed versions of those projects do not have that path
#    (lines mentioning third_party/dirent.h are exempt).
# Offending lines are written to stderr. Returns 1 if any violation was found
# and 0 otherwise.
test_includes() {
  local status=0
  local src
  for src in $(git ls-files | grep -E '(\.cc|\.cpp|\.h)$'); do
    # Public header check.
    case "${src}" in
      lib/include/*)
        if grep -i -H -n -E '#include\s*[<"]lib/include/jxl' "${src}" >&2; then
          echo "Don't add \"include/\" to the include path of public headers." >&2
          status=1
        fi
        ;;
    esac

    # Nothing below applies to the vendored sources themselves.
    case "${src}" in
      third_party/*) continue ;;
    esac

    # The grep pipeline always runs so the offending lines are listed; the
    # explanatory text is only emitted while no failure has been recorded yet.
    if grep -v -F 'third_party/dirent.h' "${src}" |
        grep -i -H -n -E '#include\s*[<"]third_party/' >&2; then
      if [[ ${status} -eq 0 ]]; then
        cat >&2 <<EOF
${src}: Don't add third_party/ to the include path of third_party projects. This \
makes it harder to use installed system libraries instead of the third_party/ \
ones.
EOF
        status=1
      fi
    fi
  done
  return "${status}"
}
|
||||
|
||||
# Detects public headers that shadow an internal file: for every tracked file
# lib/include/<path>, a file lib/<path> must not exist as well.
# Each collision is reported on stderr. Returns 1 if any collision was found
# and 0 otherwise.
test_include_collision() {
  local status=0
  local header
  local shadow
  for header in $(git ls-files | grep -E '^lib/include/'); do
    # Same relative path, but directly under lib/ instead of lib/include/.
    shadow="lib/${header#lib/include/}"
    [[ -e "${shadow}" ]] || continue
    echo "${header}: Name collision, both ${header} and ${shadow} exist." >&2
    status=1
  done
  return "${status}"
}
|
||||
|
||||
# Verifies that every tracked .cc/.cpp/.h/.sh/.m/.py file outside third_party/
# carries the project copyright line within its first 10 lines.
# Each file lacking it is reported on stderr. Returns 1 if any such file was
# found and 0 otherwise.
test_copyright() {
  local status=0
  local path
  for path in $(git ls-files | grep -E '(\.cc|\.cpp|\.h|\.sh|\.m|\.py)$'); do
    # Files under third_party/ are not checked.
    case "${path}" in
      third_party/*) continue ;;
    esac
    if head -n 10 "${path}" |
        grep -F 'Copyright (c) the JPEG XL Project' >/dev/null; then
      continue
    fi
    echo "${path}: Missing Copyright blob near the top of the file." >&2
    status=1
  done
  return "${status}"
}
|
||||
|
||||
# Decoder sources must not pull in encoder headers: every tracked file whose
# path contains "/dec_" and that lives outside third_party/ is scanned for
# #include lines that reference a "/enc_" path.
# Offending lines and a per-file message go to stderr. Returns 1 if any
# violation was found and 0 otherwise.
test_dec_enc_deps() {
  local status=0
  local dec_file
  for dec_file in $(git ls-files | grep -E '/dec_'); do
    # Files under third_party/ are not checked.
    case "${dec_file}" in
      third_party/*) continue ;;
    esac
    grep -n -H -E "#include.*/enc_" "${dec_file}" >&2 || continue
    echo "${dec_file}: Don't include \"enc_*\" files from \"dec_*\" files." >&2
    status=1
  done
  return "${status}"
}
|
||||
|
||||
# Check for git merge conflict markers.
# Scans every tracked text file (selected by extension) for lines starting
# with "<<<<<<< ". Matching lines are printed by grep and a per-file message
# is written to stderr. Returns 1 if any marker was found and 0 otherwise.
test_merge_conflict() {
  local ret=0
  # Both helpers are function-local, like in the other tests, so that running
  # this test does not leave stray globals behind.
  local f
  local text_files='(\.cc|\.cpp|\.h|\.sh|\.m|\.py|\.md|\.txt|\.cmake)$'
  for f in $(git ls-files | grep -E "${text_files}"); do
    if grep -E '^<<<<<<< ' "$f"; then
      echo "$f: Found git merge conflict marker. Please resolve." >&2
      ret=1
    fi
  done
  return ${ret}
}
|
||||
|
||||
# Check that the library and the package have the same version. This prevents
|
||||
# accidentally having them out of sync.
|
||||
# Prints the value of a CMake variable assigned in lib/CMakeLists.txt.
#   $1: variable name; the first line containing "set(<name> " is used.
# Output: the text between "set(<name> " and the trailing ")".
# Returns 1 and prints nothing when no such line exists.
get_version() {
  local varname=$1
  # Declared separately from the assignment so that "local" does not hide the
  # result of the command substitution.
  local line
  line=$(grep -F "set(${varname} " lib/CMakeLists.txt | head -n 1)
  # A bare "[[ -n ... ]]" would have its status discarded here; fail
  # explicitly so callers can tell a missing variable from an empty one.
  if [[ -z "${line}" ]]; then
    return 1
  fi
  line="${line#set(${varname} }"
  line="${line%)}"
  echo "${line}"
}
|
||||
|
||||
# Verifies that the library version declared in lib/CMakeLists.txt agrees with
# the package version on the first line of debian/changelog.
# Returns 0 when they agree; returns 1 with a diagnostic on stderr when either
# version cannot be parsed or when the two differ.
test_version() {
  local major minor patch
  major=$(get_version JPEGXL_MAJOR_VERSION)
  minor=$(get_version JPEGXL_MINOR_VERSION)
  patch=$(get_version JPEGXL_PATCH_VERSION)
  # Every component is required. Checking only their concatenation would let
  # a partially parsed version through and report it later as a confusing
  # mismatch such as "0..7".
  if [[ -z "${major}" || -z "${minor}" || -z "${patch}" ]]; then
    echo "Couldn't parse version from CMakeLists.txt" >&2
    return 1
  fi

  local pkg_version
  pkg_version=$(head -n 1 debian/changelog)
  # Get only the part between the first "jpeg-xl (" and the following ")".
  pkg_version="${pkg_version#jpeg-xl (}"
  pkg_version="${pkg_version%%)*}"
  if [[ -z "${pkg_version}" ]]; then
    echo "Couldn't parse version from debian package" >&2
    return 1
  fi

  local lib_version="${major}.${minor}.${patch}"
  # A trailing ".0" is dropped, so e.g. 0.3.0 is compared as "0.3".
  lib_version="${lib_version%.0}"
  # Only a prefix match is required, so the package version may carry a
  # suffix after the library version (e.g. a Debian revision like "-1").
  if [[ "${pkg_version}" != "${lib_version}"* ]]; then
    echo "Debian package version (${pkg_version}) doesn't match library" \
      "version (${lib_version})." >&2
    return 1
  fi
  return 0
}
|
||||
|
||||
# Check that the SHA versions in deps.sh match the git submodules.
# For every '[submodule "<path>"]' entry in .gitmodules, the path is
# upper-cased and its first "/" replaced by "_" to form a variable name. If
# deps.sh assigns that variable, its quoted value must equal the SHA that
# HEAD records for the submodule path. Submodules that deps.sh does not
# mention are ignored.
# Returns 1 (with a diagnostic on stderr) on the first unparsable or
# mismatching entry, 0 otherwise.
test_deps_version() {
  # Kept local so that reading .gitmodules does not leak variables into the
  # caller's scope.
  local line
  local varname deps_sha git_sha
  while IFS= read -r line; do
    if [[ "${line:0:10}" != "[submodule" ]]; then
      continue
    fi
    line="${line#[submodule \"}"
    line="${line%\"]}"
    varname=$(tr '[:lower:]' '[:upper:]' <<< "${line}")
    varname="${varname/\//_}"
    if ! grep -F "${varname}=" deps.sh >/dev/null; then
      # Ignoring submodule not in deps.sh
      continue
    fi
    deps_sha=$(grep -F "${varname}=" deps.sh | cut -f 2 -d '"')
    # A bare "[[ -n ... ]]" would have its status discarded; report the
    # unparsable entry explicitly instead.
    if [[ -z "${deps_sha}" ]]; then
      echo "deps.sh: Couldn't parse the SHA for project ${line}." >&2
      return 1
    fi
    git_sha=$(git ls-tree -r HEAD "${line}" | cut -f 1 | cut -f 3 -d ' ')
    if [[ "${deps_sha}" != "${git_sha}" ]]; then
      cat >&2 <<EOF
deps.sh: SHA for project ${line} is at ${deps_sha} but the git submodule is at
${git_sha}. Please update deps.sh
EOF
      return 1
    fi
  done < .gitmodules
  return 0
}
|
||||
|
||||
# Make sure that all the Fields objects are fuzzed directly.
# Collects every class name that appears as "ClassName : public Fields" in the
# tracked files selected below (files ending in "test.cc" are skipped) and
# requires each name to occur as a whole word in tools/fields_fuzzer.cc.
# Missing classes are reported on stderr. Returns 1 if any class is missing
# and 0 otherwise.
test_fuzz_fields() {
  local status=0
  # Only declarations that fit on one line are found, so class names that are
  # too long to fit are not caught.
  local declared=$(
    git ls-files |
      grep -E '\.(cc|h)' |
      grep -v 'test\.cc$' |
      xargs grep -h -o -E '\b[^ ]+ : public Fields' |
      cut -f 1 -d ' '
  )
  local cls
  for cls in ${declared}; do
    if grep -E "\\b${cls}\\b" tools/fields_fuzzer.cc >/dev/null; then
      continue
    fi
    printf '%s\n' \
      "tools/fields_fuzzer.cc: Class ${cls} not found in the fields_fuzzer." >&2
    status=1
  done
  return "${status}"
}
|
||||
|
||||
# Entry point: runs every shell function whose name starts with "test_" from
# the directory containing this script and prints a Start/PASS/FAIL line for
# each one.
# Returns 0 when all tests pass or when the directory is not a git checkout
# (the tests are skipped in that case), and 1 when any test fails or the
# script directory cannot be entered.
main() {
  local ret=0
  # The tests use repository-relative paths; running them from any other
  # directory would produce meaningless results, so bail out if cd fails.
  cd "${MYDIR}" || return 1

  if ! git rev-parse >/dev/null 2>/dev/null; then
    echo "Not a git checkout, skipping bash_test"
    return 0
  fi

  # Split command output on newlines only. This is deliberately left set for
  # the test functions, which iterate over file lists via $(...).
  IFS=$'\n'
  local f
  local test_name
  for f in $(declare -F); do
    # Each "declare -F" line has the form "declare -f <name>".
    test_name=$(echo "$f" | cut -f 3 -d ' ')
    # Runs all the local bash functions that start with "test_".
    if [[ "${test_name}" == test_* ]]; then
      echo "Test ${test_name}: Start"
      if ${test_name}; then
        echo "Test ${test_name}: PASS"
      else
        echo "Test ${test_name}: FAIL"
        ret=1
      fi
    fi
  done
  return ${ret}
}

main "$@"
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -0,0 +1,86 @@
|
|||
jpeg-xl (0.3.7) UNRELEASED; urgency=medium
|
||||
|
||||
* Bump JPEG XL version to 0.3.7.
|
||||
* Fix a rounding issue in 8-bit decoding.
|
||||
|
||||
-- Sami Boukortt <sboukortt@google.com> Mon, 29 Mar 2021 12:14:20 +0200
|
||||
|
||||
jpeg-xl (0.3.6) UNRELEASED; urgency=medium
|
||||
|
||||
* Bump JPEG XL version to 0.3.6.
|
||||
* Fix a bug that could result in the generation of invalid codestreams as
|
||||
well as failure to decode valid streams.
|
||||
|
||||
-- Sami Boukortt <sboukortt@google.com> Thu, 25 Mar 2021 17:40:58 +0100
|
||||
|
||||
jpeg-xl (0.3.5) UNRELEASED; urgency=medium
|
||||
|
||||
* Bump JPEG XL version to 0.3.5.
|
||||
* Memory usage improvements.
|
||||
* New encode-time options for faster decoding at the cost of quality.
|
||||
* Faster decoding to 8-bit output with the C API.
|
||||
* GIMP plugin: avoid the sRGB conversion dialog for sRGB images, do not show
|
||||
a console window on Windows.
|
||||
* Various bug fixes.
|
||||
* Man pages for cjxl and djxl.
|
||||
|
||||
-- Sami Boukortt <sboukortt@google.com> Tue, 23 Mar 2021 15:20:44 +0100
|
||||
|
||||
jpeg-xl (0.3.4) UNRELEASED; urgency=medium
|
||||
|
||||
* Bump JPEG XL version to 0.3.4.
|
||||
* Improved box parsing.
|
||||
* Improved metadata handling.
|
||||
* Performance and memory usage improvements.
|
||||
|
||||
-- Sami Boukortt <sboukortt@google.com> Tue, 16 Mar 2021 12:13:59 +0100
|
||||
|
||||
jpeg-xl (0.3.3) UNRELEASED; urgency=medium
|
||||
|
||||
* Bump JPEG XL version to 0.3.3.
|
||||
* Performance improvements for small images.
|
||||
* Add a (flag-protected) non-high-precision mode with better speed.
|
||||
* Significantly speed up the PQ EOTF.
|
||||
* Allow optional HDR tone mapping in djxl (--tone_map, --display_nits).
|
||||
* Change the behavior of djxl -j to make it consistent with cjxl (#153).
|
||||
* Improve image quality.
|
||||
* Improve EXIF handling.
|
||||
|
||||
-- Sami Boukortt <sboukortt@google.com> Fri, 5 Mar 2021 19:15:26 +0100
|
||||
|
||||
jpeg-xl (0.3.2) UNRELEASED; urgency=medium
|
||||
|
||||
* Bump JPEG XL version to 0.3.2.
|
||||
* Fix embedded ICC encoding regression #149.
|
||||
|
||||
-- Alex Deymo <deymo@google.com> Fri, 12 Feb 2021 21:00:12 +0100
|
||||
|
||||
jpeg-xl (0.3.1) UNRELEASED; urgency=medium
|
||||
|
||||
* Bump JPEG XL version to 0.3.1.
|
||||
|
||||
-- Alex Deymo <deymo@google.com> Tue, 09 Feb 2021 09:48:43 +0100
|
||||
|
||||
jpeg-xl (0.3) UNRELEASED; urgency=medium
|
||||
|
||||
* Bump JPEG XL version to 0.3.
|
||||
|
||||
-- Alex Deymo <deymo@google.com> Wed, 27 Jan 2021 22:36:32 +0100
|
||||
|
||||
jpeg-xl (0.2) UNRELEASED; urgency=medium
|
||||
|
||||
* Bump JPEG XL version to 0.2.
|
||||
|
||||
-- Alex Deymo <deymo@google.com> Wed, 23 Nov 2020 20:42:10 +0100
|
||||
|
||||
jpeg-xl (0.1) UNRELEASED; urgency=medium
|
||||
|
||||
* JPEG XL format release candidate.
|
||||
|
||||
-- Alex Deymo <deymo@google.com> Fri, 13 Nov 2020 17:42:24 +0100
|
||||
|
||||
jpeg-xl (0.0.2-1) UNRELEASED; urgency=medium
|
||||
|
||||
* Initial debian package.
|
||||
|
||||
-- Alex Deymo <deymo@google.com> Tue, 27 Oct 2020 15:27:59 +0100
|
|
@ -0,0 +1 @@
|
|||
10
|
|
@ -0,0 +1,61 @@
|
|||
Source: jpeg-xl
|
||||
Maintainer: JPEG XL Maintainers <jpegxl@google.com>
|
||||
Section: misc
|
||||
Priority: optional
|
||||
Standards-Version: 3.9.8
|
||||
Build-Depends: cmake,
|
||||
debhelper (>= 9),
|
||||
libbrotli-dev,
|
||||
libgif-dev,
|
||||
libgmock-dev,
|
||||
libgoogle-perftools-dev,
|
||||
libgtest-dev,
|
||||
libhwy-dev,
|
||||
libjpeg-dev,
|
||||
libopenexr-dev,
|
||||
libpng-dev,
|
||||
libwebp-dev,
|
||||
pkg-config,
|
||||
Homepage: https://gitlab.com/wg1/jpeg-xl
|
||||
Rules-Requires-Root: no
|
||||
|
||||
Package: jxl
|
||||
Architecture: any
|
||||
Section: utils
|
||||
Depends: ${misc:Depends}, ${shlibs:Depends}
|
||||
Description: JPEG XL Image Coding System - "JXL" (command line utility)
|
||||
The JPEG XL Image Coding System (ISO/IEC 18181) is a lossy and
|
||||
lossless image compression format. It has a rich feature set and is
|
||||
particularly optimized for responsive web environments, so that
|
||||
content renders well on a wide range of devices. Moreover, it includes
|
||||
several features that help transition from the legacy JPEG format.
|
||||
.
|
||||
This package installs the command line utilities.
|
||||
|
||||
Package: libjxl-dev
|
||||
Architecture: any
|
||||
Section: libdevel
|
||||
Depends: libjxl (= ${binary:Version}), ${misc:Depends}
|
||||
Description: JPEG XL Image Coding System - "JXL" (development files)
|
||||
The JPEG XL Image Coding System (ISO/IEC 18181) is a lossy and
|
||||
lossless image compression format. It has a rich feature set and is
|
||||
particularly optimized for responsive web environments, so that
|
||||
content renders well on a wide range of devices. Moreover, it includes
|
||||
several features that help transition from the legacy JPEG format.
|
||||
.
|
||||
This package installs development files.
|
||||
|
||||
Package: libjxl
|
||||
Architecture: any
|
||||
Multi-Arch: same
|
||||
Section: libs
|
||||
Depends: ${shlibs:Depends}, ${misc:Depends}
|
||||
Pre-Depends: ${misc:Pre-Depends}
|
||||
Description: JPEG XL Image Coding System - "JXL" (shared libraries)
|
||||
The JPEG XL Image Coding System (ISO/IEC 18181) is a lossy and
|
||||
lossless image compression format. It has a rich feature set and is
|
||||
particularly optimized for responsive web environments, so that
|
||||
content renders well on a wide range of devices. Moreover, it includes
|
||||
several features that help transition from the legacy JPEG format.
|
||||
.
|
||||
This package installs shared libraries.
|
|
@ -0,0 +1,236 @@
|
|||
Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
|
||||
Upstream-Name: jpeg-xl
|
||||
|
||||
Files: *
|
||||
Copyright: 2020 the JPEG XL Project
|
||||
License: Apache-2.0
|
||||
|
||||
Files: third_party/sjpeg/*
|
||||
Copyright: 2017 Google, Inc
|
||||
License: Apache-2.0
|
||||
|
||||
Files: third_party/lodepng/*
|
||||
Copyright: 2005-2018 Lode Vandevenne
|
||||
License: Zlib License
|
||||
This software is provided 'as-is', without any express or implied
|
||||
warranty. In no event will the authors be held liable for any damages
|
||||
arising from the use of this software.
|
||||
.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it
|
||||
freely, subject to the following restrictions:
|
||||
.
|
||||
1. The origin of this software must not be misrepresented; you must not
|
||||
claim that you wrote the original software. If you use this software
|
||||
in a product, an acknowledgment in the product documentation would be
|
||||
appreciated but is not required.
|
||||
.
|
||||
2. Altered source versions must be plainly marked as such, and must not be
|
||||
misrepresented as being the original software.
|
||||
.
|
||||
3. This notice may not be removed or altered from any source
|
||||
distribution.
|
||||
|
||||
Files: third_party/skcms/*
|
||||
Copyright: 2018 Google Inc.
|
||||
License: BSD-3-clause
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
.
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the following disclaimer
|
||||
in the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Google Inc. nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
Files: third_party/testdata/imagecompression.info/*
|
||||
Copyright: their respective owners.
|
||||
License: License without any prohibitive copyright restrictions.
|
||||
See https://imagecompression.info/test_images/ for details.
|
||||
.
|
||||
These Images are available without any prohibitive copyright restrictions.
|
||||
.
|
||||
These images are (c) there respective owners. You are granted full
|
||||
redistribution and publication rights on these images provided:
|
||||
.
|
||||
1. The origin of the pictures must not be misrepresented; you must not claim
|
||||
that you took the original pictures. If you use, publish or redistribute them,
|
||||
an acknowledgment would be appreciated but is not required.
|
||||
2. Altered versions must be plainly marked as such, and must not be
|
||||
misinterpreted as being the originals.
|
||||
3. No payment is required for distribution of this material, it must be
|
||||
available freely under the conditions stated here. That is, it is prohibited to
|
||||
sell the material.
|
||||
4. This notice may not be removed or altered from any distribution.
|
||||
|
||||
Files: third_party/testdata/pngsuite/*
|
||||
Copyright: Willem van Schaik, 1996, 2011
|
||||
License: PngSuite License
|
||||
See http://www.schaik.com/pngsuite/ for details.
|
||||
.
|
||||
Permission to use, copy, modify and distribute these images for any
|
||||
purpose and without fee is hereby granted.
|
||||
|
||||
Files: third_party/testdata/raw.pixls/*
|
||||
Copyright: their respective owners listed in https://raw.pixls.us/
|
||||
License: CC0-1.0
|
||||
|
||||
Files: third_party/testdata/wesaturate/*
|
||||
Copyright: their respective owners listed in https://www.wesaturate.com/
|
||||
License: CC0-1.0
|
||||
|
||||
Files: third_party/testdata/wide-gamut-tests/*
|
||||
Copyright: github.com/codelogic/wide-gamut-tests authors.
|
||||
License: Apache-2.0
|
||||
|
||||
License: Apache-2.0
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
.
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
.
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
.
|
||||
On Debian systems, the complete text of the Apache License, Version 2
|
||||
can be found in "/usr/share/common-licenses/Apache-2.0".
|
||||
|
||||
License: CC0-1.0
|
||||
Creative Commons Zero v1.0 Universal
|
||||
.
|
||||
CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE LEGAL
|
||||
SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN ATTORNEY-CLIENT
|
||||
RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS INFORMATION ON AN "AS-IS"
|
||||
BASIS. CREATIVE COMMONS MAKES NO WARRANTIES REGARDING THE USE OF THIS
|
||||
DOCUMENT OR THE INFORMATION OR WORKS PROVIDED HEREUNDER, AND DISCLAIMS
|
||||
LIABILITY FOR DAMAGES RESULTING FROM THE USE OF THIS DOCUMENT OR THE
|
||||
INFORMATION OR WORKS PROVIDED HEREUNDER.
|
||||
.
|
||||
Statement of Purpose
|
||||
.
|
||||
The laws of most jurisdictions throughout the world automatically confer
|
||||
exclusive Copyright and Related Rights (defined below) upon the creator and
|
||||
subsequent owner(s) (each and all, an "owner") of an original work of
|
||||
authorship and/or a database (each, a "Work").
|
||||
.
|
||||
Certain owners wish to permanently relinquish those rights to a Work for the
|
||||
purpose of contributing to a commons of creative, cultural and scientific
|
||||
works ("Commons") that the public can reliably and without fear of later
|
||||
claims of infringement build upon, modify, incorporate in other works, reuse
|
||||
and redistribute as freely as possible in any form whatsoever and for any
|
||||
purposes, including without limitation commercial purposes. These owners may
|
||||
contribute to the Commons to promote the ideal of a free culture and the
|
||||
further production of creative, cultural and scientific works, or to gain
|
||||
reputation or greater distribution for their Work in part through the use
|
||||
and efforts of others.
|
||||
.
|
||||
For these and/or other purposes and motivations, and without any expectation
|
||||
of additional consideration or compensation, the person associating CC0 with
|
||||
a Work (the "Affirmer"), to the extent that he or she is an owner of
|
||||
Copyright and Related Rights in the Work, voluntarily elects to apply CC0 to
|
||||
the Work and publicly distribute the Work under its terms, with knowledge of
|
||||
his or her Copyright and Related Rights in the Work and the meaning and
|
||||
intended legal effect of CC0 on those rights.
|
||||
.
|
||||
1. Copyright and Related Rights. A Work made available under CC0 may be
|
||||
protected by copyright and related or neighboring rights ("Copyright and
|
||||
Related Rights"). Copyright and Related Rights include, but are not limited
|
||||
to, the following:
|
||||
i. the right to reproduce, adapt, distribute, perform, display,
|
||||
communicate, and translate a Work;
|
||||
ii. moral rights retained by the original author(s) and/or performer(s);
|
||||
iii. publicity and privacy rights pertaining to a person's image or
|
||||
likeness depicted in a Work;
|
||||
iv. rights protecting against unfair competition in regards to a Work,
|
||||
subject to the limitations in paragraph 4(a), below;
|
||||
v. rights protecting the extraction, dissemination, use and reuse of data
|
||||
in a Work;
|
||||
vi. database rights (such as those arising under Directive 96/9/EC of the
|
||||
European Parliament and of the Council of 11 March 1996 on the legal
|
||||
protection of databases, and under any national implementation thereof,
|
||||
including any amended or successor version of such directive); and
|
||||
vii. other similar, equivalent or corresponding rights throughout the
|
||||
world based on applicable law or treaty, and any national implementations
|
||||
thereof.
|
||||
.
|
||||
2. Waiver. To the greatest extent permitted by, but not in contravention of,
|
||||
applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and
|
||||
unconditionally waives, abandons, and surrenders all of Affirmer's Copyright
|
||||
and Related Rights and associated claims and causes of action, whether now
|
||||
known or unknown (including existing as well as future claims and causes of
|
||||
action), in the Work (i) in all territories worldwide, (ii) for the maximum
|
||||
duration provided by applicable law or treaty (including future time
|
||||
extensions), (iii) in any current or future medium and for any number of
|
||||
copies, and (iv) for any purpose whatsoever, including without limitation
|
||||
commercial, advertising or promotional purposes (the "Waiver"). Affirmer
|
||||
makes the Waiver for the benefit of each member of the public at large and
|
||||
to the detriment of Affirmer's heirs and successors, fully intending that
|
||||
such Waiver shall not be subject to revocation, rescission, cancellation,
|
||||
termination, or any other legal or equitable action to disrupt the quiet
|
||||
enjoyment of the Work by the public as contemplated by Affirmer's express
|
||||
Statement of Purpose.
|
||||
.
|
||||
3. Public License Fallback. Should any part of the Waiver for any reason be
|
||||
judged legally invalid or ineffective under applicable law, then the Waiver
|
||||
shall be preserved to the maximum extent permitted taking into account
|
||||
Affirmer's express Statement of Purpose. In addition, to the extent the
|
||||
Waiver is so judged Affirmer hereby grants to each affected person a
|
||||
royalty-free, non transferable, non sublicensable, non exclusive,
|
||||
irrevocable and unconditional license to exercise Affirmer's Copyright and
|
||||
Related Rights in the Work (i) in all territories worldwide, (ii) for the
|
||||
maximum duration provided by applicable law or treaty (including future time
|
||||
extensions), (iii) in any current or future medium and for any number of
|
||||
copies, and (iv) for any purpose whatsoever, including without limitation
|
||||
commercial, advertising or promotional purposes (the "License"). The License
|
||||
shall be deemed effective as of the date CC0 was applied by Affirmer to the
|
||||
Work. Should any part of the License for any reason be judged legally
|
||||
invalid or ineffective under applicable law, such partial invalidity or
|
||||
ineffectiveness shall not invalidate the remainder of the License, and in
|
||||
such case Affirmer hereby affirms that he or she will not (i) exercise any
|
||||
of his or her remaining Copyright and Related Rights in the Work or (ii)
|
||||
assert any associated claims and causes of action with respect to the Work,
|
||||
in either case contrary to Affirmer's express Statement of Purpose.
|
||||
.
|
||||
4. Limitations and Disclaimers.
|
||||
a. No trademark or patent rights held by Affirmer are waived, abandoned,
|
||||
surrendered, licensed or otherwise affected by this document.
|
||||
b. Affirmer offers the Work as-is and makes no representations or
|
||||
warranties of any kind concerning the Work, express, implied, statutory or
|
||||
otherwise, including without limitation warranties of title,
|
||||
merchantability, fitness for a particular purpose, non infringement, or the
|
||||
absence of latent or other defects, accuracy, or the present or absence of
|
||||
errors, whether or not discoverable, all to the greatest extent permissible
|
||||
under applicable law.
|
||||
c. Affirmer disclaims responsibility for clearing rights of other persons
|
||||
that may apply to the Work or any use thereof, including without limitation
|
||||
any person's Copyright and Related Rights in the Work. Further, Affirmer
|
||||
disclaims responsibility for obtaining any necessary consents, permissions
|
||||
or other rights required for any use of the Work.
|
||||
d. Affirmer understands and acknowledges that Creative Commons is not a
|
||||
party to this document and has no duty or obligation with respect to this
|
||||
CC0 or use of the Work.
|
||||
.
|
||||
For more information, please see:
|
||||
<http://creativecommons.org/publicdomain/zero/1.0/>
|
||||
|
|
@ -0,0 +1 @@
|
|||
debian/tmp/usr/bin/*
|
|
@ -0,0 +1,4 @@
|
|||
debian/tmp/usr/include/jxl/*.h
|
||||
debian/tmp/usr/lib/*/*.a
|
||||
debian/tmp/usr/lib/*/*.so
|
||||
debian/tmp/usr/lib/*/pkgconfig/*.pc
|
|
@ -0,0 +1 @@
|
|||
debian/tmp/usr/lib/*/libjxl*.so.*
|
|
@ -0,0 +1,12 @@
|
|||
#!/usr/bin/make -f
|
||||
%:
|
||||
dh $@ --buildsystem=cmake
|
||||
|
||||
override_dh_auto_configure:
|
||||
# TODO(deymo): Remove the DCMAKE_BUILD_TYPE once builds without NDEBUG
|
||||
# are as useful as Release builds.
|
||||
dh_auto_configure -- \
|
||||
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
|
||||
-DJPEGXL_FORCE_SYSTEM_GTEST=ON \
|
||||
-DJPEGXL_FORCE_SYSTEM_BROTLI=ON \
|
||||
-DJPEGXL_FORCE_SYSTEM_HWY=ON
|
|
@ -0,0 +1 @@
|
|||
3.0 (quilt)
|
|
@ -0,0 +1,89 @@
|
|||
#!/usr/bin/env bash
|
||||
# Copyright (c) the JPEG XL Project
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# This file downloads the dependencies needed to build JPEG XL into third_party.
|
||||
# These dependencies are normally pulled by git.
|
||||
|
||||
set -eu
|
||||
|
||||
# Directory containing this script. The inner substitution is quoted so that a
# script path containing whitespace is not word-split before dirname sees it.
MYDIR=$(dirname "$(realpath "$0")")
|
||||
|
||||
# Git revisions we use for the given submodules. Update these whenever you
|
||||
# update a git submodule.
|
||||
THIRD_PARTY_HIGHWAY="ca1a57c342cd815053abfcffa29b44eaead4f20b"
|
||||
THIRD_PARTY_LODEPNG="48e5364ef48ec2408f44c727657ac1b6703185f8"
|
||||
THIRD_PARTY_SKCMS="64374756e03700d649f897dbd98c95e78c30c7da"
|
||||
THIRD_PARTY_SJPEG="868ab558fad70fcbe8863ba4e85179eeb81cc840"
|
||||
|
||||
# Download and unpack one pinned third-party dependency.
#
# Arguments:
#   $1  Path, relative to ${MYDIR}, to extract into (e.g. third_party/highway).
#       Upper-cased and with every "/" replaced by "_" it names the variable
#       holding the pinned revision (third_party/highway -> THIRD_PARTY_HIGHWAY).
#   $2  Either a GitHub "owner/repo" slug, or a base URL starting with "http"
#       to which "<sha>.tar.gz" is appended.
#
# The tarball is cached as ${MYDIR}/downloads/<sha>.tar.gz. When that file and
# the destination directory both exist the download is skipped.
download_github() {
  local path="$1"
  local project="$2"

  # Derive the name of the revision variable from the path. All slashes are
  # replaced, not only the first one, so nested paths map to a valid name.
  local varname="${path^^}"
  varname="${varname//\//_}"
  # Indirect expansion reads the variable without re-parsing text as code,
  # which is what the eval-based lookup did.
  local sha="${!varname}"

  local down_dir="${MYDIR}/downloads"
  local local_fn="${down_dir}/${sha}.tar.gz"
  if [[ -e "${local_fn}" && -d "${MYDIR}/${path}" ]]; then
    echo "${path} already up to date." >&2
    return 0
  fi

  local url
  local strip_components=0
  if [[ "${project:0:4}" == "http" ]]; then
    # "project" is a googlesource.com base url.
    url="${project}${sha}.tar.gz"
  else
    # GitHub files have a top-level directory
    strip_components=1
    url="https://github.com/${project}/tarball/${sha}"
  fi

  echo "Downloading ${path} version ${sha}..." >&2
  mkdir -p "${down_dir}"
  # --fail makes curl exit non-zero on an HTTP error instead of saving the
  # server's error page, which tar would then be asked to extract.
  curl -L --fail --show-error -o "${local_fn}.tmp" "${url}"
  mkdir -p "${MYDIR}/${path}"
  tar -zxf "${local_fn}.tmp" -C "${MYDIR}/${path}" \
    --strip-components="${strip_components}"
  # Rename only after a successful extraction, so a partial download is never
  # mistaken for a valid cache entry by the up-to-date check above.
  mv "${local_fn}.tmp" "${local_fn}"
}
|
||||
|
||||
|
||||
# Entry point: fetch every third-party dependency.
#
# Inside a git checkout the dependencies are initialised as git submodules.
# Otherwise each one is downloaded as a tarball through download_github.
main() {
  # rev-parse is used only for its exit status. Its output is silenced so the
  # tarball path does not print git's "not a git repository" error.
  if git -C "${MYDIR}" rev-parse >/dev/null 2>&1; then
    cat >&2 <<EOF
Current directory is a git repository, downloading dependencies via git:

  git submodule update --init --recursive

EOF
    git -C "${MYDIR}" submodule update --init --recursive
    return 0
  fi

  # Sources downloaded from a tarball.
  download_github third_party/highway google/highway
  download_github third_party/lodepng lvandeve/lodepng
  download_github third_party/sjpeg webmproject/sjpeg
  download_github third_party/skcms \
    "https://skia.googlesource.com/skcms/+archive/"
  echo "Done."
}
|
||||
|
||||
main "$@"
|
|
@ -0,0 +1,30 @@
|
|||
# Copyright (c) the JPEG XL Project
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Build an Ubuntu-based docker image with the installed software needed to
|
||||
# develop and test JPEG XL.
|
||||
|
||||
FROM ubuntu:bionic
|
||||
|
||||
# Set a prompt for when using it locally.
|
||||
ENV PS1="\[\033[01;33m\]\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ "
|
||||
|
||||
COPY scripts/99_norecommends /etc/apt/apt.conf.d/99_norecommends
|
||||
|
||||
COPY scripts /jpegxl_scripts
|
||||
|
||||
ARG DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
RUN /jpegxl_scripts/jpegxl_builder.sh && \
|
||||
rm -rf /jpegxl_scripts
|
|
@ -0,0 +1,46 @@
|
|||
# Copyright (c) the JPEG XL Project
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Build an Ubuntu-based docker image for aarch64 with the installed software
|
||||
# needed to run JPEG XL. This is only useful when running on actual aarch64
|
||||
# hardware.
|
||||
|
||||
FROM arm64v8/ubuntu:bionic
|
||||
|
||||
COPY scripts/99_norecommends /etc/apt/apt.conf.d/99_norecommends
|
||||
|
||||
# Set a prompt for when using it locally.
|
||||
ENV PS1="\[\033[01;33m\]\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ "
|
||||
|
||||
ARG DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
RUN set -ex; \
|
||||
apt-get update -y; \
|
||||
apt-get install -y \
|
||||
bsdmainutils \
|
||||
cmake \
|
||||
curl \
|
||||
ca-certificates \
|
||||
extra-cmake-modules \
|
||||
git \
|
||||
imagemagick \
|
||||
libjpeg8 \
|
||||
libgif7 \
|
||||
libgoogle-perftools4 \
|
||||
libopenexr22 \
|
||||
libpng16-16 \
|
||||
libqt5x11extras5 \
|
||||
libsdl2-2.0-0 \
|
||||
parallel; \
|
||||
rm -rf /var/lib/apt/lists/*;
|
|
@ -0,0 +1,7 @@
|
|||
### Docker container infrastructure for JPEG XL
|
||||
|
||||
This directory contains the requirements to build a docker image for the
|
||||
JPEG XL project builder.
|
||||
|
||||
Docker images need to be created and uploaded manually. See ./build.sh for
|
||||
details.
|
|
@ -0,0 +1,92 @@
|
|||
#!/usr/bin/env bash
|
||||
# Copyright (c) the JPEG XL Project
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
set -eu
|
||||
|
||||
# Directory containing this script. The inner substitution is quoted so that a
# script path containing whitespace is not word-split before dirname sees it.
MYDIR=$(dirname "$(realpath "$0")")
|
||||
|
||||
declare -a TARGETS
|
||||
|
||||
# Populate the global TARGETS array with the suffix of every Dockerfile.<target>
# file found directly in the current directory, in sorted order.
load_targets() {
  # Prefer GNU find ("gfind") when it is installed, otherwise use plain "find".
  FIND=$(command -v gfind || command -v find)
  # The explicit "." start path is needed by BSD find (the built-in one on
  # OSX), which otherwise rejects "-maxdepth" as an illegal option. GNU find
  # already defaults to ".", so its output is unchanged.
  local f
  # Read line by line so that file names are never word-split or globbed.
  while IFS= read -r f; do
    TARGETS+=("${f#*Dockerfile.}")
  done < <("${FIND}" . -maxdepth 1 -name 'Dockerfile.*' | sort)
}
|
||||
|
||||
# Print the command-line help to stderr: a usage line, the built-in "all"
# target, then one bullet per entry of the global TARGETS array.
#
# Arguments:
#   $1  Program name shown in the "Use:" line.
usage() {
  # A single redirection on the group sends everything below to stderr.
  {
    cat <<EOF
Use: $1 [targets]

Available targets:
* all
EOF
    for target in "${TARGETS[@]}"; do
      echo " * ${target}"
    done
  } >&2
}
|
||||
|
||||
# Build the docker image for one target and optionally push it.
#
# Arguments:
#   $1  Target name. The image is built from ${MYDIR}/Dockerfile.<target> and
#       tagged gcr.io/jpegxl/<target>.
#
# Build output is written to <target>.log in the current directory. A failed
# build is reported on stderr but the function still returns 0, so the caller
# carries on with any remaining targets. When JPEGXL_PUSH=1 the image is pushed
# and the matching digest reference in ${MYDIR}/../.gitlab-ci.yml is rewritten.
build_target() {
  local target="$1"

  local dockerfile="${MYDIR}/Dockerfile.${target}"
  # JPEG XL builder images are stored in the gcr.io/jpegxl project.
  local tag="gcr.io/jpegxl/${target}"

  echo "Building ${target}"
  if ! sudo docker build --no-cache -t "${tag}" -f "${dockerfile}" "${MYDIR}" \
      >"${target}.log" 2>&1; then
    echo "${target} failed. See ${target}.log" >&2
    return 0
  fi

  echo "Done, to upload image run:" >&2
  echo " sudo docker push ${tag}"
  if [[ "${JPEGXL_PUSH:-}" != "1" ]]; then
    return 0
  fi

  echo "sudo docker push ${tag}" >&2
  sudo docker push "${tag}"
  # The RepoDigest is only created after it is pushed.
  # Declared separately from the assignment: "local x=$(cmd)" returns the
  # status of "local", which would hide a failing docker inspect from set -e.
  local fulltag
  fulltag=$(sudo docker inspect --format="{{.RepoDigests}}" "${tag}")
  # Strip the surrounding "[" and "]" of the printed list.
  fulltag="${fulltag#[}"
  fulltag="${fulltag%]}"
  if [[ -z "${fulltag}" ]]; then
    # Substituting an empty digest would erase the image reference from the CI
    # configuration, so stop instead.
    echo "No RepoDigest found for ${tag}; not updating .gitlab-ci.yml" >&2
    return 1
  fi
  echo "Updating .gitlab-ci.yml to ${fulltag}" >&2
  sed -E "s;${tag}@sha256:[0-9a-f]+;${fulltag};" \
    -i "${MYDIR}/../.gitlab-ci.yml"
}
|
||||
|
||||
# Entry point: build the docker images named on the command line.
#
# Arguments:
#   $@  Target names. The single word "all" as first argument builds every
#       entry of TARGETS. With no arguments the usage text is printed and the
#       script exits with status 1.
main() {
  # Work from the script directory: the <target>.log files written by
  # build_target are relative to the current directory.
  cd "${MYDIR}"
  local target="${1:-}"

  load_targets
  if [[ -z "${target}" ]]; then
    # Quoted so a script path containing spaces reaches usage as one argument.
    usage "$0"
    exit 1
  fi

  if [[ "${target}" == "all" ]]; then
    for target in "${TARGETS[@]}"; do
      build_target "${target}"
    done
  else
    for target in "$@"; do
      build_target "${target}"
    done
  fi
}
|
||||
|
||||
main "$@"
|
|
@ -0,0 +1 @@
|
|||
APT::Install-Recommends "false";
|
|
@ -0,0 +1,28 @@
|
|||
Description: fix lack of alignment in relocations (crashes on mingw)
|
||||
See https://sourceware.org/git/?p=binutils-gdb.git;a=patch;h=73af69e74974eaa155eec89867e3ccc77ab39f6d
|
||||
From: Marc <marc@groundctl.com>
|
||||
Date: Fri, 9 Nov 2018 11:13:50 +0000
|
||||
Subject: [PATCH] Allow for compilers that do not produce aligned .rdat
|
||||
sections in PE format files.
|
||||
|
||||
--- a/upstream/ld/scripttempl/pe.sc 2020-05-12 18:45:12.000000000 +0200
|
||||
+++ b/upstream/ld/scripttempl/pe.sc 2020-05-12 18:47:12.000000000 +0200
|
||||
@@ -143,6 +143,7 @@
|
||||
.rdata ${RELOCATING+BLOCK(__section_alignment__)} :
|
||||
{
|
||||
${R_RDATA}
|
||||
+ . = ALIGN(4);
|
||||
${RELOCATING+__rt_psrelocs_start = .;}
|
||||
${RELOCATING+KEEP(*(.rdata_runtime_pseudo_reloc))}
|
||||
${RELOCATING+__rt_psrelocs_end = .;}
|
||||
--- a/upstream/ld/scripttempl/pep.sc 2020-05-12 18:45:19.000000000 +0200
|
||||
+++ b/upstream/ld/scripttempl/pep.sc 2020-05-12 18:47:18.000000000 +0200
|
||||
@@ -143,6 +143,7 @@
|
||||
.rdata ${RELOCATING+BLOCK(__section_alignment__)} :
|
||||
{
|
||||
${R_RDATA}
|
||||
+ . = ALIGN(4);
|
||||
${RELOCATING+__rt_psrelocs_start = .;}
|
||||
${RELOCATING+KEEP(*(.rdata_runtime_pseudo_reloc))}
|
||||
${RELOCATING+__rt_psrelocs_end = .;}
|
||||
|
|
@ -0,0 +1,46 @@
|
|||
#!/usr/bin/env bash
|
||||
# Copyright (c) the JPEG XL Project
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
EMSDK_URL="https://github.com/emscripten-core/emsdk/archive/master.tar.gz"
|
||||
EMSDK_DIR="/opt/emsdk"
|
||||
|
||||
EMSDK_RELEASE="2.0.4"
|
||||
|
||||
set -eu -x
|
||||
|
||||
# Temporary files cleanup hooks.
|
||||
CLEANUP_FILES=()
|
||||
# Remove every path recorded in the global CLEANUP_FILES array.
# Does nothing, and succeeds, when the array is empty.
cleanup() {
  [[ ${#CLEANUP_FILES[@]} -eq 0 ]] && return 0
  rm -fr "${CLEANUP_FILES[@]}"
}
|
||||
trap "{ set +x; } 2>/dev/null; cleanup" INT TERM EXIT
|
||||
|
||||
# Download the emsdk archive from ${EMSDK_URL}, unpack it into ${EMSDK_DIR},
# then install and activate the ${EMSDK_RELEASE} toolchain.
main() {
  # Declared separately from the assignment: "local x=$(cmd)" returns the
  # status of "local", which would hide a failing mktemp from set -e.
  local workdir
  workdir=$(mktemp -d --suffix=emsdk)
  # Registered for removal by the cleanup trap.
  CLEANUP_FILES+=("${workdir}")

  local emsdktar="${workdir}/emsdk.tar.gz"
  # --fail makes curl exit non-zero on an HTTP error instead of saving the
  # server's error page, which tar would then be asked to extract.
  curl --fail --output "${emsdktar}" "${EMSDK_URL}" --location
  mkdir -p "${EMSDK_DIR}"
  # Drop the archive's single top-level directory while extracting.
  tar -zxf "${emsdktar}" -C "${EMSDK_DIR}" --strip-components=1

  cd "${EMSDK_DIR}"
  ./emsdk install --shallow "${EMSDK_RELEASE}"
  ./emsdk activate --embedded "${EMSDK_RELEASE}"
}
|
||||
|
||||
main "$@"
|
|
@ -0,0 +1,515 @@
|
|||
#!/usr/bin/env bash
|
||||
# Copyright (c) the JPEG XL Project
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Main entry point for all the Dockerfile for jpegxl-builder. This centralized
|
||||
# file helps sharing code and configuration between Dockerfiles.
|
||||
|
||||
set -eux
|
||||
|
||||
MYDIR=$(dirname $(realpath "$0"))
|
||||
|
||||
# libjpeg-turbo.
|
||||
JPEG_TURBO_RELEASE="2.0.4"
|
||||
JPEG_TURBO_URL="https://github.com/libjpeg-turbo/libjpeg-turbo/archive/${JPEG_TURBO_RELEASE}.tar.gz"
|
||||
JPEG_TURBO_SHA256="7777c3c19762940cff42b3ba4d7cd5c52d1671b39a79532050c85efb99079064"
|
||||
|
||||
# zlib (dependency of libpng)
|
||||
ZLIB_RELEASE="1.2.11"
|
||||
ZLIB_URL="https://www.zlib.net/zlib-${ZLIB_RELEASE}.tar.gz"
|
||||
ZLIB_SHA256="c3e5e9fdd5004dcb542feda5ee4f0ff0744628baf8ed2dd5d66f8ca1197cb1a1"
|
||||
# The name in the .pc and the .dll generated don't match in zlib for Windows
|
||||
# because they use different .dll names in Windows. We avoid that by defining
|
||||
# UNIX=1. We also install all the .dll files to ${prefix}/lib instead of the
|
||||
# default ${prefix}/bin.
|
||||
ZLIB_FLAGS='-DUNIX=1 -DINSTALL_PKGCONFIG_DIR=/${CMAKE_INSTALL_PREFIX}/lib/pkgconfig -DINSTALL_BIN_DIR=/${CMAKE_INSTALL_PREFIX}/lib'
|
||||
|
||||
# libpng
|
||||
LIBPNG_RELEASE="1.6.37"
|
||||
LIBPNG_URL="https://github.com/glennrp/libpng/archive/v${LIBPNG_RELEASE}.tar.gz"
|
||||
LIBPNG_SHA256="ca74a0dace179a8422187671aee97dd3892b53e168627145271cad5b5ac81307"
|
||||
|
||||
# giflib
|
||||
GIFLIB_RELEASE="5.2.1"
|
||||
GIFLIB_URL="https://netcologne.dl.sourceforge.net/project/giflib/giflib-${GIFLIB_RELEASE}.tar.gz"
|
||||
GIFLIB_SHA256="31da5562f44c5f15d63340a09a4fd62b48c45620cd302f77a6d9acf0077879bd"
|
||||
|
||||
# A patch needed to compile GIFLIB in mingw.
|
||||
GIFLIB_PATCH_URL="https://github.com/msys2/MINGW-packages/raw/3afde38fcee7b3ba2cafd97d76cca8f06934504f/mingw-w64-giflib/001-mingw-build.patch"
|
||||
GIFLIB_PATCH_SHA256="2b2262ddea87fc07be82e10aeb39eb699239f883c899aa18a16e4d4e40af8ec8"
|
||||
|
||||
# webp
|
||||
WEBP_RELEASE="1.0.2"
|
||||
WEBP_URL="https://codeload.github.com/webmproject/libwebp/tar.gz/v${WEBP_RELEASE}"
|
||||
WEBP_SHA256="347cf85ddc3497832b5fa9eee62164a37b249c83adae0ba583093e039bf4881f"
|
||||
|
||||
# Google benchmark
BENCHMARK_RELEASE="1.5.2"
BENCHMARK_URL="https://github.com/google/benchmark/archive/v${BENCHMARK_RELEASE}.tar.gz"
BENCHMARK_SHA256="dccbdab796baa1043f04982147e67bb6e118fe610da2c65f88912d73987e700c"
# Extra cmake flags used when building benchmark.
# attribute(format(__MINGW_PRINTF_FORMAT, ...)) doesn't work in our
# environment, so we disable the warning.
# GOOGLETEST_PATH is not passed: benchmark's own tests are disabled here, and a
# previous assignment that set it was immediately overwritten by this one, so
# it never took effect.
BENCHMARK_FLAGS="-DCMAKE_BUILD_TYPE=Release -DBENCHMARK_ENABLE_TESTING=OFF \
  -DCMAKE_CXX_FLAGS=-Wno-ignored-attributes \
  -DCMAKE_POSITION_INDEPENDENT_CODE=ON"
|
||||
|
||||
# V8
|
||||
V8_VERSION="8.7.230"
|
||||
|
||||
# Temporary files cleanup hooks.
|
||||
CLEANUP_FILES=()
|
||||
# Delete everything queued in CLEANUP_FILES.
cleanup() {
  # Nothing was registered: skip rm entirely.
  if [[ ${#CLEANUP_FILES[@]} -eq 0 ]]; then
    return 0
  fi
  rm -fr "${CLEANUP_FILES[@]}"
}
|
||||
trap "{ set +x; } 2>/dev/null; cleanup" INT TERM EXIT
|
||||
|
||||
# List of Ubuntu arch names supported by the builder (such as "i386").
|
||||
LIST_ARCHS=(
|
||||
amd64
|
||||
i386
|
||||
arm64
|
||||
armhf
|
||||
)
|
||||
|
||||
# List of target triplets supported by the builder.
|
||||
LIST_TARGETS=(
|
||||
x86_64-linux-gnu
|
||||
i686-linux-gnu
|
||||
arm-linux-gnueabihf
|
||||
aarch64-linux-gnu
|
||||
)
|
||||
LIST_MINGW_TARGETS=(
|
||||
i686-w64-mingw32
|
||||
x86_64-w64-mingw32
|
||||
)
|
||||
LIST_WASM_TARGETS=(
|
||||
wasm32
|
||||
)
|
||||
|
||||
# Setup the apt repositories and supported architectures.
|
||||
setup_apt() {
|
||||
apt-get update -y
|
||||
apt-get install -y curl gnupg ca-certificates
|
||||
|
||||
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 1E9377A2BA9EF27F
|
||||
|
||||
# node sources.
|
||||
cat >/etc/apt/sources.list.d/nodesource.list <<EOF
|
||||
deb https://deb.nodesource.com/node_14.x bionic main
|
||||
deb-src https://deb.nodesource.com/node_14.x bionic main
|
||||
EOF
|
||||
curl -s https://deb.nodesource.com/gpgkey/nodesource.gpg.key | apt-key add -
|
||||
|
||||
local port_list=()
|
||||
local main_list=()
|
||||
local ubarch
|
||||
for ubarch in "${LIST_ARCHS[@]}"; do
|
||||
if [[ "${ubarch}" != "amd64" && "${ubarch}" != "i386" ]]; then
|
||||
# other archs are not part of the main mirrors, but available in
|
||||
# ports.ubuntu.com.
|
||||
port_list+=("${ubarch}")
|
||||
else
|
||||
main_list+=("${ubarch}")
|
||||
fi
|
||||
# Add the arch to the system.
|
||||
if [[ "${ubarch}" != "amd64" ]]; then
|
||||
dpkg --add-architecture "${ubarch}"
|
||||
fi
|
||||
done
|
||||
|
||||
# Update the sources.list with the split of supported architectures.
|
||||
local bkplist="/etc/apt/sources.list.bkp"
|
||||
[[ -e "${bkplist}" ]] || \
|
||||
mv /etc/apt/sources.list "${bkplist}"
|
||||
|
||||
local newlist="/etc/apt/sources.list.tmp"
|
||||
rm -f "${newlist}"
|
||||
port_list=$(echo "${port_list[@]}" | tr ' ' ,)
|
||||
if [[ -n "${port_list}" ]]; then
|
||||
local port_url="http://ports.ubuntu.com/ubuntu-ports/"
|
||||
grep -v -E '^#' "${bkplist}" |
|
||||
sed -E "s;^deb (http[^ ]+) (.*)\$;deb [arch=${port_list}] ${port_url} \\2;" \
|
||||
>>"${newlist}"
|
||||
fi
|
||||
|
||||
main_list=$(echo "${main_list[@]}" | tr ' ' ,)
|
||||
grep -v -E '^#' "${bkplist}" |
|
||||
sed -E "s;^deb (http[^ ]+) (.*)\$;deb [arch=${main_list}] \\1 \\2\ndeb-src [arch=${main_list}] \\1 \\2;" \
|
||||
>>"${newlist}"
|
||||
mv "${newlist}" /etc/apt/sources.list
|
||||
}
|
||||
|
||||
install_pkgs() {
|
||||
packages=(
|
||||
# Native compilers (minimum for SIMD is clang-7)
|
||||
clang-7 clang-format-7 clang-tidy-7
|
||||
|
||||
# TODO: Consider adding clang-8 to every builder:
|
||||
# clang-8 clang-format-8 clang-tidy-8
|
||||
|
||||
# For cross-compiling to Windows with mingw.
|
||||
mingw-w64
|
||||
wine64
|
||||
wine-binfmt
|
||||
|
||||
# Native tools.
|
||||
bsdmainutils
|
||||
cmake
|
||||
extra-cmake-modules
|
||||
git
|
||||
llvm
|
||||
nasm
|
||||
ninja-build
|
||||
parallel
|
||||
pkg-config
|
||||
|
||||
# These are used by the ./ci.sh lint in the native builder.
|
||||
clang-format-7
|
||||
clang-format-8
|
||||
|
||||
# For coverage builds
|
||||
gcovr
|
||||
|
||||
# For compiling giflib documentation.
|
||||
xmlto
|
||||
|
||||
# Common libraries.
|
||||
libstdc++-8-dev
|
||||
|
||||
# We don't use tcmalloc on archs other than amd64. This installs
|
||||
# libgoogle-perftools4:amd64.
|
||||
google-perftools
|
||||
|
||||
# NodeJS for running WASM tests
|
||||
nodejs
|
||||
|
||||
# To generate API documentation.
|
||||
doxygen
|
||||
|
||||
# Freezes version that builds (passes tests). Newer version
|
||||
# (2.30-21ubuntu1~18.04.4) claims to fix "On Intel Skylake
|
||||
# (-march=native) generated avx512 instruction can be wrong",
|
||||
# but the newly added tests do not pass. Perhaps the problem is
|
||||
# that mingw package is not updated.
|
||||
binutils-source=2.30-15ubuntu1
|
||||
)
|
||||
|
||||
# Install packages that are arch-dependent.
|
||||
local ubarch
|
||||
for ubarch in "${LIST_ARCHS[@]}"; do
|
||||
packages+=(
|
||||
# Library dependencies. These normally depend on the target architecture
|
||||
# we are compiling for and can't usually be installed for multiple
|
||||
# architectures at the same time.
|
||||
libgif7:"${ubarch}"
|
||||
libjpeg-dev:"${ubarch}"
|
||||
libpng-dev:"${ubarch}"
|
||||
libqt5x11extras5-dev:"${ubarch}"
|
||||
|
||||
libstdc++-8-dev:"${ubarch}"
|
||||
qtbase5-dev:"${ubarch}"
|
||||
|
||||
# For OpenEXR:
|
||||
libilmbase12:"${ubarch}"
|
||||
libopenexr22:"${ubarch}"
|
||||
|
||||
# TCMalloc dependency
|
||||
libunwind-dev:"${ubarch}"
|
||||
|
||||
# Cross-compiling tools per arch.
|
||||
libc6-dev-"${ubarch}"-cross
|
||||
libstdc++-8-dev-"${ubarch}"-cross
|
||||
)
|
||||
done
|
||||
|
||||
local target
|
||||
for target in "${LIST_TARGETS[@]}"; do
|
||||
# Per target cross-compiling tools.
|
||||
if [[ "${target}" != "x86_64-linux-gnu" ]]; then
|
||||
packages+=(
|
||||
binutils-"${target}"
|
||||
gcc-"${target}"
|
||||
)
|
||||
fi
|
||||
done
|
||||
|
||||
# Install all the manual packages via "apt install" for the main arch. These
|
||||
# will be installed for other archs via manual download and unpack.
|
||||
apt install -y "${packages[@]}" "${UNPACK_PKGS[@]}"
|
||||
}
|
||||
|
||||
# binutils <2.32 need a patch.
|
||||
# Rebuild the binutils-mingw-w64 package from source with
# binutils_align_fix.patch appended to its patch series, then install the
# resulting .deb files.
install_binutils() {
  # Declare and assign separately so that a mktemp failure is not hidden
  # behind the exit status of "local".
  local workdir
  workdir=$(mktemp -d --suffix=_install)
  CLEANUP_FILES+=("${workdir}")
  pushd "${workdir}"
  apt source binutils-mingw-w64
  apt -y build-dep binutils-mingw-w64
  cd binutils-mingw-w64-8ubuntu1
  cp "${MYDIR}/binutils_align_fix.patch" debian/patches
  echo binutils_align_fix.patch >> debian/patches/series
  dpkg-buildpackage -b
  cd ..
  # The "./" prefix keeps a file name starting with "-" from being parsed as
  # a dpkg option.
  dpkg -i ./*deb
  popd
}
|
||||
|
||||
# Install a library from the source code for multiple targets.
|
||||
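# Usage: install_from_source <PACKAGE> <target> [<target...>]
# <PACKAGE> is the prefix of the <PACKAGE>_URL, <PACKAGE>_SHA256 and optional
# <PACKAGE>_FLAGS variables defined at the top of this script.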
|
||||
install_from_source() {
|
||||
local package="$1"
|
||||
shift
|
||||
|
||||
local url
|
||||
eval "url=\${${package}_URL}"
|
||||
local sha256
|
||||
eval "sha256=\${${package}_SHA256}"
|
||||
# Optional package flags
|
||||
local pkgflags
|
||||
eval "pkgflags=\${${package}_FLAGS:-}"
|
||||
|
||||
local workdir=$(mktemp -d --suffix=_install)
|
||||
CLEANUP_FILES+=("${workdir}")
|
||||
|
||||
local tarfile="${workdir}"/$(basename "${url}")
|
||||
curl -L --output "${tarfile}" "${url}"
|
||||
if ! echo "${sha256} ${tarfile}" | sha256sum -c --status -; then
|
||||
echo "SHA256 mismatch for ${url}: expected ${sha256} but found:"
|
||||
sha256sum "${tarfile}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
local target
|
||||
for target in "$@"; do
|
||||
echo "Installing ${package} for target ${target} from ${url}"
|
||||
|
||||
local srcdir="${workdir}/source-${target}"
|
||||
mkdir -p "${srcdir}"
|
||||
tar -zxf "${tarfile}" -C "${srcdir}" --strip-components=1
|
||||
|
||||
local prefix="/usr"
|
||||
if [[ "${target}" != "x86_64-linux-gnu" ]]; then
|
||||
prefix="/usr/${target}"
|
||||
fi
|
||||
|
||||
# Apply patches to buildfiles.
|
||||
if [[ "${package}" == "GIFLIB" && "${target}" == *mingw32 ]]; then
|
||||
# GIFLIB Makefile has several problems so we need to fix them here. We are
|
||||
# using a patch from MSYS2 that already fixes the compilation for mingw.
|
||||
local make_patch="${srcdir}/libgif.patch"
|
||||
curl -L "${GIFLIB_PATCH_URL}" -o "${make_patch}"
|
||||
echo "${GIFLIB_PATCH_SHA256} ${make_patch}" | sha256sum -c --status -
|
||||
patch "${srcdir}/Makefile" < "${make_patch}"
|
||||
elif [[ "${package}" == "LIBPNG" && "${target}" == wasm* ]]; then
|
||||
# Cut the dependency to libm; there is pull request to fix it, so this
|
||||
# might not be needed in the future.
|
||||
sed -i 's/APPLE/EMSCRIPTEN/g' "${srcdir}/CMakeLists.txt"
|
||||
fi
|
||||
|
||||
local cmake_args=()
|
||||
local export_args=("CC=clang-7" "CXX=clang++-7")
|
||||
local cmake="cmake"
|
||||
local make="make"
|
||||
local system_name="Linux"
|
||||
if [[ "${target}" == *mingw32 ]]; then
|
||||
system_name="Windows"
|
||||
# When compiling with clang, CMake doesn't detect that we are using mingw.
|
||||
cmake_args+=(
|
||||
-DMINGW=1
|
||||
# Googletest needs this when cross-compiling to windows
|
||||
-DCMAKE_CROSSCOMPILING=1
|
||||
-DHAVE_STD_REGEX=0
|
||||
-DHAVE_POSIX_REGEX=0
|
||||
-DHAVE_GNU_POSIX_REGEX=0
|
||||
)
|
||||
local windres=$(which ${target}-windres || true)
|
||||
if [[ -n "${windres}" ]]; then
|
||||
cmake_args+=(-DCMAKE_RC_COMPILER="${windres}")
|
||||
fi
|
||||
fi
|
||||
if [[ "${target}" == wasm* ]]; then
|
||||
system_name="WASM"
|
||||
cmake="emcmake cmake"
|
||||
make="emmake make"
|
||||
export_args=()
|
||||
cmake_args+=(
|
||||
-DCMAKE_FIND_ROOT_PATH="${prefix}"
|
||||
-DCMAKE_PREFIX_PATH="${prefix}"
|
||||
)
|
||||
# Static and shared library link to the same file -> race condition.
|
||||
nproc=1
|
||||
else
|
||||
nproc=`nproc --all`
|
||||
fi
|
||||
cmake_args+=(-DCMAKE_SYSTEM_NAME="${system_name}")
|
||||
|
||||
if [[ "${target}" != "x86_64-linux-gnu" ]]; then
|
||||
# Cross-compiling.
|
||||
cmake_args+=(
|
||||
-DCMAKE_C_COMPILER_TARGET="${target}"
|
||||
-DCMAKE_CXX_COMPILER_TARGET="${target}"
|
||||
-DCMAKE_SYSTEM_PROCESSOR="${target%%-*}"
|
||||
)
|
||||
fi
|
||||
|
||||
if [[ -e "${srcdir}/CMakeLists.txt" ]]; then
|
||||
# Most packages use cmake for building which is easier to configure for
|
||||
# cross-compiling.
|
||||
if [[ "${package}" == "JPEG_TURBO" && "${target}" == wasm* ]]; then
|
||||
# JT erroneously detects WASM CPU as i386 and tries to use asm.
|
||||
# Wasm/Emscripten support for dynamic linking is incomplete; disable
|
||||
# to avoid CMake warning.
|
||||
cmake_args+=(-DWITH_SIMD=0 -DENABLE_SHARED=OFF)
|
||||
fi
|
||||
(
|
||||
cd "${srcdir}"
|
||||
export ${export_args[@]}
|
||||
${cmake} \
|
||||
-DCMAKE_INSTALL_PREFIX="${prefix}" \
|
||||
"${cmake_args[@]}" ${pkgflags}
|
||||
${make} -j${nproc}
|
||||
${make} install
|
||||
)
|
||||
elif [[ "${package}" == "GIFLIB" ]]; then
|
||||
# GIFLIB doesn't yet have a cmake build system. There is a pull
|
||||
# request in giflib for adding CMakeLists.txt so this might not be
|
||||
# needed in the future.
|
||||
(
|
||||
cd "${srcdir}"
|
||||
local giflib_make_flags=(
|
||||
CFLAGS="-O2 --target=${target} -std=gnu99"
|
||||
PREFIX="${prefix}"
|
||||
)
|
||||
if [[ "${target}" != wasm* ]]; then
|
||||
giflib_make_flags+=(CC=clang-7)
|
||||
fi
|
||||
# giflib make dependencies are not properly set up so parallel building
|
||||
# doesn't work for everything.
|
||||
${make} -j${nproc} libgif.a "${giflib_make_flags[@]}"
|
||||
${make} -j${nproc} all "${giflib_make_flags[@]}"
|
||||
${make} install "${giflib_make_flags[@]}"
|
||||
)
|
||||
else
|
||||
echo "Don't know how to install ${package}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
done
|
||||
}
|
||||
|
||||
# Packages that are manually unpacked for each architecture.
|
||||
UNPACK_PKGS=(
|
||||
libgif-dev
|
||||
libclang-common-7-dev
|
||||
|
||||
# For OpenEXR:
|
||||
libilmbase-dev
|
||||
libopenexr-dev
|
||||
|
||||
# TCMalloc
|
||||
libgoogle-perftools-dev
|
||||
libtcmalloc-minimal4
|
||||
libgoogle-perftools4
|
||||
)
|
||||
|
||||
# Main script entry point.
|
||||
main() {
|
||||
cd "${MYDIR}"
|
||||
|
||||
# Configure the repositories with the sources for multi-arch cross
|
||||
# compilation.
|
||||
setup_apt
|
||||
apt-get update -y
|
||||
apt-get dist-upgrade -y
|
||||
|
||||
install_pkgs
|
||||
install_binutils
|
||||
apt clean
|
||||
|
||||
# Manually extract packages for the target arch that can't install it directly
|
||||
# at the same time as the native ones.
|
||||
local ubarch
|
||||
for ubarch in "${LIST_ARCHS[@]}"; do
|
||||
if [[ "${ubarch}" != "amd64" ]]; then
|
||||
local pkg
|
||||
for pkg in "${UNPACK_PKGS[@]}"; do
|
||||
apt download "${pkg}":"${ubarch}"
|
||||
dpkg -x "${pkg}"_*_"${ubarch}".deb /
|
||||
done
|
||||
fi
|
||||
done
|
||||
# TODO: Add clang from the llvm repos. This is problematic since we are
|
||||
# installing libclang-common-7-dev:"${ubarch}" from the ubuntu ports repos
|
||||
# which is not available in the llvm repos so it might have a different
|
||||
# version than the ubuntu ones.
|
||||
|
||||
# Remove the win32 libgcc version. The gcc-mingw-w64-x86-64 (and i686)
|
||||
# packages install two libgcc versions:
|
||||
# /usr/lib/gcc/x86_64-w64-mingw32/7.3-posix
|
||||
# /usr/lib/gcc/x86_64-w64-mingw32/7.3-win32
|
||||
# (exact libgcc version number depends on the package version).
|
||||
#
|
||||
# Clang will pick the best libgcc, sorting by version, but there doesn't
|
||||
# seem to be a way to specify one or the other one, except by passing
|
||||
# -nostdlib and setting all the include paths from the command line.
|
||||
# To check which one is being used you can run:
|
||||
# clang++-7 --target=x86_64-w64-mingw32 -v -print-libgcc-file-name
|
||||
# We need to use the "posix" versions for thread support, so here we
|
||||
# just remove the other one.
|
||||
local target
|
||||
for target in "${LIST_MINGW_TARGETS[@]}"; do
|
||||
update-alternatives --set "${target}-gcc" $(which "${target}-gcc-posix")
|
||||
local gcc_win32_path=$("${target}-cpp-win32" -print-libgcc-file-name)
|
||||
rm -rf $(dirname "${gcc_win32_path}")
|
||||
done
|
||||
|
||||
# TODO: Add msan for the target when cross-compiling. This only installs it
|
||||
# for amd64.
|
||||
./msan_install.sh
|
||||
|
||||
# Build and install qemu user-linux targets.
|
||||
./qemu_install.sh
|
||||
|
||||
# Install emscripten SDK.
|
||||
./emsdk_install.sh
|
||||
|
||||
# Setup environment for building WASM libraries from sources.
|
||||
source /opt/emsdk/emsdk_env.sh
|
||||
|
||||
# Install some dependency libraries manually for the different targets.
|
||||
|
||||
install_from_source JPEG_TURBO "${LIST_MINGW_TARGETS[@]}" "${LIST_WASM_TARGETS[@]}"
|
||||
install_from_source ZLIB "${LIST_MINGW_TARGETS[@]}" "${LIST_WASM_TARGETS[@]}"
|
||||
install_from_source LIBPNG "${LIST_MINGW_TARGETS[@]}" "${LIST_WASM_TARGETS[@]}"
|
||||
install_from_source GIFLIB "${LIST_MINGW_TARGETS[@]}" "${LIST_WASM_TARGETS[@]}"
|
||||
# webp in Ubuntu is relatively old so we install it from source for everybody.
|
||||
install_from_source WEBP "${LIST_TARGETS[@]}" "${LIST_MINGW_TARGETS[@]}"
|
||||
|
||||
install_from_source BENCHMARK "${LIST_TARGETS[@]}" "${LIST_MINGW_TARGETS[@]}"
|
||||
|
||||
# Install v8. v8 has better WASM SIMD support than NodeJS 14.
|
||||
# First we need the installer to install v8.
|
||||
npm install jsvu -g
|
||||
# install specific version;
|
||||
HOME=/opt jsvu --os=linux64 "v8@${V8_VERSION}"
|
||||
ln -s "/opt/.jsvu/v8-${V8_VERSION}" "/opt/.jsvu/v8"
|
||||
|
||||
# Cleanup.
|
||||
find /var/lib/apt/lists/ -mindepth 1 -delete
|
||||
}
|
||||
|
||||
main "$@"
|
|
@ -0,0 +1,140 @@
|
|||
#!/usr/bin/env bash
|
||||
# Copyright (c) the JPEG XL Project
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
set -eu
|
||||
|
||||
MYDIR=$(dirname $(realpath "$0"))
|
||||
|
||||
# Convenience flag to pass both CMAKE_C_FLAGS and CMAKE_CXX_FLAGS
|
||||
CMAKE_FLAGS=${CMAKE_FLAGS:-}
|
||||
CMAKE_C_FLAGS=${CMAKE_C_FLAGS:-${CMAKE_FLAGS}}
|
||||
CMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS:-${CMAKE_FLAGS}}
|
||||
CMAKE_EXE_LINKER_FLAGS=${CMAKE_EXE_LINKER_FLAGS:-}
|
||||
|
||||
CLANG_VERSION="${CLANG_VERSION:-}"
|
||||
# Detect the clang version suffix and store it in CLANG_VERSION. For example,
# "6.0" for clang 6 or "7" for clang 7.
# The compiler queried is ${CC}, falling back to "clang". Returns 1 (after
# printing a message to stderr) when the first line of its --version output is
# not one of the recognized "clang version N." prefixes.
detect_clang_version() {
  # A CLANG_VERSION that is already set takes precedence over detection.
  if [[ -n "${CLANG_VERSION}" ]]; then
    return 0
  fi
  local clang_version
  clang_version=$("${CC:-clang}" --version | head -n1)
  case "${clang_version}" in
    "clang version 6."*)
      CLANG_VERSION="6.0"
      ;;
    "clang version 7."*)
      CLANG_VERSION="7"
      ;;
    "clang version 8."*)
      CLANG_VERSION="8"
      ;;
    "clang version 9."*)
      CLANG_VERSION="9"
      ;;
    *)
      echo "Unknown clang version: ${clang_version}" >&2
      return 1
      ;;
  esac
}
|
||||
|
||||
# Temporary files cleanup hooks.
|
||||
CLEANUP_FILES=()
|
||||
# Remove every path that was registered in CLEANUP_FILES. Does nothing when
# the list is empty, so rm is never invoked without operands.
cleanup() {
  local pending="${#CLEANUP_FILES[@]}"
  if (( pending > 0 )); then
    rm -fr "${CLEANUP_FILES[@]}"
  fi
}
|
||||
trap "{ set +x; } 2>/dev/null; cleanup" INT TERM EXIT
|
||||
|
||||
# Install libc++ libraries compiled with msan in the msan_prefix for the current
# compiler version.
# The prefix is ${HOME}/.msan/${CLANG_VERSION}; any previous content there is
# removed first. Returns 1 when CLANG_VERSION has no matching llvm tag below.
cmd_msan_install() {
  # Declare and assign separately so that a mktemp failure is not hidden
  # behind the exit status of "local".
  local tmpdir
  tmpdir=$(mktemp -d)
  CLEANUP_FILES+=("${tmpdir}")
  # Detect the llvm to install:
  export CC="${CC:-clang}"
  export CXX="${CXX:-clang++}"
  detect_clang_version
  local llvm_tag
  case "${CLANG_VERSION}" in
    "6.0")
      llvm_tag="llvmorg-6.0.1"
      ;;
    "7")
      llvm_tag="llvmorg-7.0.1"
      ;;
    "8")
      llvm_tag="llvmorg-8.0.0"
      ;;
    *)
      # Report CLANG_VERSION: the raw "clang_version" string is local to
      # detect_clang_version and is unbound here under "set -u".
      echo "Unknown clang version: ${CLANG_VERSION}" >&2
      return 1
      ;;
  esac
  local llvm_targz="${tmpdir}/${llvm_tag}.tar.gz"
  curl -L --show-error -o "${llvm_targz}" \
    "https://github.com/llvm/llvm-project/archive/${llvm_tag}.tar.gz"
  tar -C "${tmpdir}" -zxf "${llvm_targz}"
  local llvm_root="${tmpdir}/llvm-project-${llvm_tag}"

  local msan_prefix="${HOME}/.msan/${CLANG_VERSION}"
  rm -rf "${msan_prefix}"

  # Per-project extra cmake arguments, keyed by project directory name.
  declare -A CMAKE_EXTRAS
  CMAKE_EXTRAS[libcxx]="\
    -DLIBCXX_CXX_ABI=libstdc++ \
    -DLIBCXX_INSTALL_EXPERIMENTAL_LIBRARY=ON"

  local project
  for project in libcxx; do
    local proj_build="${tmpdir}/build-${project}"
    local proj_dir="${llvm_root}/${project}"
    mkdir -p "${proj_build}"
    # CMAKE_EXTRAS is expanded unquoted on purpose so that it splits into
    # separate -D arguments.
    cmake -B"${proj_build}" -H"${proj_dir}" \
      -G Ninja \
      -DCMAKE_BUILD_TYPE=Release \
      -DLLVM_USE_SANITIZER=Memory \
      -DLLVM_PATH="${llvm_root}/llvm" \
      -DLLVM_CONFIG_PATH="$(which llvm-config llvm-config-7 llvm-config-6.0 | \
        head -n1)" \
      -DCMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS}" \
      -DCMAKE_C_FLAGS="${CMAKE_C_FLAGS}" \
      -DCMAKE_EXE_LINKER_FLAGS="${CMAKE_EXE_LINKER_FLAGS}" \
      -DCMAKE_INSTALL_PREFIX="${msan_prefix}" \
      ${CMAKE_EXTRAS[${project}]}
    cmake --build "${proj_build}"
    ninja -C "${proj_build}" install
  done
}
|
||||
|
||||
# Run cmd_msan_install in parallel, one background subshell per clang version
# (6.0, 7, 8) that is found on PATH, and wait for all of them.
# Returns 1 if any of the installs failed.
main() {
  set -x
  local version
  local pids=()
  for version in 6.0 7 8; do
    if ! which "clang-${version}" >/dev/null; then
      echo "Skipping msan install for clang version ${version}"
      continue
    fi
    (
      # Each subshell registers its own temporary files, so it needs its own
      # cleanup trap.
      trap "{ set +x; } 2>/dev/null; cleanup" INT TERM EXIT
      export CLANG_VERSION=${version}
      export CC=clang-${version}
      export CXX=clang++-${version}
      cmd_msan_install
    ) &
    pids+=("$!")
  done

  # A bare "wait" always returns 0; wait on each job individually so that a
  # failed install is reported instead of being silently ignored.
  local failed=0
  local pid
  if [[ ${#pids[@]} -ne 0 ]]; then
    for pid in "${pids[@]}"; do
      if ! wait "${pid}"; then
        failed=1
      fi
    done
  fi
  if [[ ${failed} -ne 0 ]]; then
    echo "msan install failed for at least one clang version" >&2
    return 1
  fi
}
|
||||
|
||||
main "$@"
|
|
@ -0,0 +1,92 @@
|
|||
#!/usr/bin/env bash
|
||||
# Copyright (c) the JPEG XL Project
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
QEMU_RELEASE="4.1.0"
|
||||
QEMU_URL="https://download.qemu.org/qemu-${QEMU_RELEASE}.tar.xz"
|
||||
QEMU_ARCHS=(
|
||||
aarch64
|
||||
arm
|
||||
i386
|
||||
# TODO: Consider adding these:
|
||||
# aarch64_be
|
||||
# mips64el
|
||||
# mips64
|
||||
# mips
|
||||
# ppc64
|
||||
# ppc
|
||||
)
|
||||
|
||||
# Ubuntu packages not installed that are needed to build qemu.
|
||||
QEMU_BUILD_DEPS=(
|
||||
libglib2.0-dev
|
||||
libpixman-1-dev
|
||||
flex
|
||||
bison
|
||||
)
|
||||
|
||||
set -eu -x
|
||||
|
||||
# Temporary files cleanup hooks.
|
||||
CLEANUP_FILES=()
|
||||
# Delete everything queued in CLEANUP_FILES.
cleanup() {
  # Nothing was registered: skip rm entirely.
  if [[ ${#CLEANUP_FILES[@]} -eq 0 ]]; then
    return 0
  fi
  rm -fr "${CLEANUP_FILES[@]}"
}
|
||||
trap "{ set +x; } 2>/dev/null; cleanup" INT TERM EXIT
|
||||
|
||||
# Download the QEMU_RELEASE sources, build the static linux-user emulators for
# every arch in QEMU_ARCHS and copy them to /usr/bin/qemu-<arch>-static.
# The build dependencies in QEMU_BUILD_DEPS are installed first and purged
# again at the end.
main() {
  # Declare and assign separately so that a mktemp failure is not hidden
  # behind the exit status of "local".
  local workdir
  workdir=$(mktemp -d --suffix=qemu)
  CLEANUP_FILES+=("${workdir}")

  apt install -y "${QEMU_BUILD_DEPS[@]}"

  # QEMU_URL points at a .tar.xz archive (it is unpacked with "tar -J").
  local qemutar="${workdir}/qemu.tar.xz"
  # --fail aborts on HTTP errors instead of saving the error page;
  # --location follows redirects.
  curl --fail --location --output "${qemutar}" "${QEMU_URL}"
  tar -Jxf "${qemutar}" -C "${workdir}"
  local srcdir="${workdir}/qemu-${QEMU_RELEASE}"

  local builddir="${workdir}/build"
  local prefixdir="${workdir}/prefix"
  mkdir -p "${builddir}"

  # List of targets to build.
  local targets=""
  local make_targets=()
  local target
  for target in "${QEMU_ARCHS[@]}"; do
    targets="${targets} ${target}-linux-user"
    # Build just the linux-user targets.
    make_targets+=("${target}-linux-user/all")
  done

  cd "${builddir}"
  "${srcdir}/configure" \
    --prefix="${prefixdir}" \
    --static --disable-system --enable-linux-user \
    --target-list="${targets}"

  make -j $(nproc --all || echo 1) "${make_targets[@]}"

  # Manually install these into the non-standard location. This script runs as
  # root anyway.
  for target in "${QEMU_ARCHS[@]}"; do
    cp "${target}-linux-user/qemu-${target}" "/usr/bin/qemu-${target}-static"
  done

  apt autoremove -y --purge "${QEMU_BUILD_DEPS[@]}"
}
|
||||
|
||||
main "$@"
|
|
@ -0,0 +1,28 @@
|
|||
# Copyright (c) the JPEG XL Project
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# One-shot decoding example, linked against the decoder library.
add_executable(decode_oneshot decode_oneshot.cc)
target_link_libraries(decode_oneshot PRIVATE jxl_dec jxl_threads)

# One-shot encoding example, linked against the full library.
add_executable(encode_oneshot encode_oneshot.cc)
target_link_libraries(encode_oneshot PRIVATE jxl jxl_threads)

# jxlinfo is built from a C source file.
add_executable(jxlinfo jxlinfo.c)
target_link_libraries(jxlinfo PRIVATE jxl)

# SANITIZER is quoted so that an empty or undefined value does not collapse the
# condition into a malformed if() expression.
if(NOT "${SANITIZER}" STREQUAL "none")
  # Linking a C test binary with the C++ JPEG XL implementation when using
  # address sanitizer is not well supported by clang 9, so force using clang++
  # for linking this test if a sanitizer is used.
  set_target_properties(jxlinfo PROPERTIES LINKER_LANGUAGE CXX)
endif()  # SANITIZER != "none"
|
Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше
Загрузка…
Ссылка в новой задаче