STL/stl/inc/execution

// execution standard header
// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#pragma once
#ifndef _EXECUTION_
#define _EXECUTION_
#include <yvals.h>
#if _STL_COMPILER_PREPROCESSOR
#if !_HAS_CXX17
#pragma message("The contents of <execution> are available only with C++17 or later.")
#else // ^^^ !_HAS_CXX17 / _HAS_CXX17 vvv
#include <algorithm>
#include <atomic>
#include <memory>
#include <mutex>
#include <numeric>
#include <queue>
#include <vector>
#include <xbit_ops.h>
#pragma pack(push, _CRT_PACKING)
#pragma warning(push, _STL_WARNING_LEVEL)
#pragma warning(disable : _STL_DISABLED_WARNINGS)
_STL_DISABLE_CLANG_WARNINGS
#pragma push_macro("new")
#undef new
_EXTERN_C
// If on Windows XP, returns 1 (disabling parallelism); otherwise, returns the number of hardware threads available.
_NODISCARD unsigned int __stdcall __std_parallel_algorithms_hw_threads() noexcept;
// Windows Vista thread pool interface; __std_parallel_algorithms_hw_threads must be called on the current
// thread before calling any of the below.
#ifdef _M_CEE
using __std_TP_WORK = void;
using __std_TP_CALLBACK_INSTANCE = void;
using __std_TP_CALLBACK_ENVIRON = void;
#else // ^^^ _M_CEE / !_M_CEE vvv
struct __std_TP_WORK; // not defined
struct __std_TP_CALLBACK_INSTANCE; // not defined
struct __std_TP_CALLBACK_ENVIRON; // not defined
#endif // _M_CEE
using __std_PTP_WORK = __std_TP_WORK*;
using __std_PTP_CALLBACK_INSTANCE = __std_TP_CALLBACK_INSTANCE*;
using __std_PTP_CALLBACK_ENVIRON = __std_TP_CALLBACK_ENVIRON*;
using __std_PTP_WORK_CALLBACK = void(__stdcall*)(
_Inout_ __std_PTP_CALLBACK_INSTANCE, _Inout_opt_ void*, _Inout_ __std_PTP_WORK);
_NODISCARD __std_PTP_WORK __stdcall __std_create_threadpool_work(
_In_ __std_PTP_WORK_CALLBACK, _Inout_opt_ void*, _In_opt_ __std_PTP_CALLBACK_ENVIRON) noexcept;
void __stdcall __std_submit_threadpool_work(_Inout_ __std_PTP_WORK) noexcept;
void __stdcall __std_bulk_submit_threadpool_work(_Inout_ __std_PTP_WORK, _In_ size_t) noexcept;
void __stdcall __std_close_threadpool_work(_Inout_ __std_PTP_WORK) noexcept;
void __stdcall __std_wait_for_threadpool_work_callbacks(_Inout_ __std_PTP_WORK, _In_ int) noexcept;
void __stdcall __std_execution_wait_on_uchar(
_In_ const volatile unsigned char* _Address, _In_ unsigned char _Compare) noexcept;
void __stdcall __std_execution_wake_by_address_all(_In_ const volatile void* _Address) noexcept;
_END_EXTERN_C
_STD_BEGIN
constexpr size_t _Oversubscription_multiplier = 32;
constexpr size_t _Oversubmission_multiplier = 4;
constexpr size_t _Still_active = static_cast<size_t>(-1);
// EXECUTION POLICIES
namespace execution {
class sequenced_policy { // request for sequential execution with termination
public:
using _Standard_execution_policy = int;
static constexpr bool _Parallelize = false;
};
inline constexpr sequenced_policy seq{/* unspecified */};
class parallel_policy { // request for parallel execution with termination
public:
using _Standard_execution_policy = int;
static constexpr bool _Parallelize = true;
};
inline constexpr parallel_policy par{/* unspecified */};
class parallel_unsequenced_policy {
// request for parallel execution without thread identity with termination
// (at this time, equivalent to parallel_policy)
public:
using _Standard_execution_policy = int;
static constexpr bool _Parallelize = true;
};
inline constexpr parallel_unsequenced_policy par_unseq{/* unspecified */};
} // namespace execution
template <>
struct is_execution_policy<execution::sequenced_policy> : true_type {}; // sequenced_policy is an execution policy
template <>
struct is_execution_policy<execution::parallel_policy> : true_type {}; // parallel_policy is an execution policy
template <>
struct is_execution_policy<execution::parallel_unsequenced_policy> : true_type {
}; // parallel_unsequenced_policy is an execution policy
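// Illustrative usage sketch (not part of this header; the function, vector name, and lambda are
// hypothetical): caller code selects one of the policy objects above to say how an algorithm may execute.
//
//     #include <algorithm>
//     #include <execution>
//     #include <vector>
//
//     void scale(std::vector<double>& widths) {
//         // seq requests plain sequential execution; par may use the Windows thread pool;
//         // par_unseq additionally allows vectorization (currently treated like par here)
//         std::for_each(std::execution::par, widths.begin(), widths.end(), [](double& w) { w *= 2.0; });
//     }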
// STRUCT _Parallelism_resources_exhausted
struct _Parallelism_resources_exhausted : exception {
_NODISCARD virtual const char* __CLR_OR_THIS_CALL what() const noexcept override {
// return pointer to message string
return "Insufficient resources were available to use additional parallelism.";
}
#if !_HAS_EXCEPTIONS
protected:
virtual void _Doraise() const override { // perform class-specific exception handling
_RAISE(*this);
}
#endif // !_HAS_EXCEPTIONS
};
[[noreturn]] inline void _Throw_parallelism_resources_exhausted() {
_THROW(_Parallelism_resources_exhausted{});
}
// ENUM CLASS _Cancellation_status
enum class _Cancellation_status : bool { _Running, _Canceled };
// STRUCT _Cancellation_token
struct _Cancellation_token {
atomic<_Cancellation_status> _Is_canceled_impl{_Cancellation_status::_Running};
bool _Is_canceled() const {
return _Is_canceled_impl.load() == _Cancellation_status::_Canceled;
}
bool _Is_canceled_relaxed() const {
return _Is_canceled_impl.load(memory_order_relaxed) == _Cancellation_status::_Canceled;
}
void _Cancel() {
_Is_canceled_impl.store(_Cancellation_status::_Canceled);
}
_Cancellation_status _Status() const {
return _Is_canceled_impl.load();
}
};
// CLASS _Work_ptr
class _Work_ptr {
public:
template <class _Work, enable_if_t<!is_same_v<remove_cv_t<_Work>, _Work_ptr>, int> = 0>
explicit _Work_ptr(_Work& _Operation)
: _Ptp_work(__std_create_threadpool_work(&_Work::_Threadpool_callback, _STD addressof(_Operation), nullptr)) {
// register work with the thread pool
// usually, after _Work_ptr is constructed, a parallel algorithm runs to completion or terminates
static_assert(noexcept(_Work::_Threadpool_callback(_STD declval<__std_PTP_CALLBACK_INSTANCE>(),
_STD declval<void*>(), _STD declval<__std_PTP_WORK>())),
"Threadpool callbacks must be noexcept to enforce termination");
if (!_Ptp_work) {
// usually, the last place a bailout to serial execution can occur
_Throw_parallelism_resources_exhausted();
}
}
_Work_ptr(const _Work_ptr&) = delete;
_Work_ptr& operator=(const _Work_ptr&) = delete;
~_Work_ptr() noexcept {
__std_wait_for_threadpool_work_callbacks(_Ptp_work, true);
__std_close_threadpool_work(_Ptp_work);
}
void _Submit() const noexcept {
__std_submit_threadpool_work(_Ptp_work);
}
void _Submit(const size_t _Submissions) const noexcept {
__std_bulk_submit_threadpool_work(_Ptp_work, _Submissions);
}
void _Submit_for_chunks(const size_t _Hw_threads, const size_t _Chunks) const noexcept {
_Submit((_STD min)(_Hw_threads * _Oversubmission_multiplier, _Chunks));
}
private:
__std_PTP_WORK _Ptp_work;
};
// FUNCTION TEMPLATE _Run_available_chunked_work
template <class _Work>
void _Run_available_chunked_work(_Work& _Operation) {
while (_Operation._Process_chunk() == _Cancellation_status::_Running) { // process while there are chunks remaining
}
}
// FUNCTION TEMPLATE _Run_chunked_parallel_work
template <class _Work>
void _Run_chunked_parallel_work(const size_t _Hw_threads, _Work& _Operation) {
// process chunks of _Operation on the thread pool
const _Work_ptr _Work_op{_Operation};
// setup complete, hereafter nothrow or terminate
_Work_op._Submit_for_chunks(_Hw_threads, _Operation._Team._Chunks);
_Run_available_chunked_work(_Operation);
}
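// Execution pattern (descriptive note): _Run_chunked_parallel_work registers _Operation with the
// thread pool via _Work_ptr, submits up to min(_Hw_threads * _Oversubmission_multiplier, chunks)
// callbacks, and then lets the calling thread participate through _Run_available_chunked_work;
// every participating thread repeatedly calls _Operation._Process_chunk() until it reports
// _Canceled, and ~_Work_ptr waits for outstanding callbacks before the algorithm returns.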
// CHUNK CALCULATION FUNCTIONS
// The parallel algorithms library below assumes that distance(first, last) fits into a size_t;
// forward iterators must refer to objects in memory and therefore must meet this requirement.
//
// Unlike the serial algorithms library, which can stay in the difference_type domain, here we need
// to talk with vector (which speaks size_t), and with Windows, which wants to speak unsigned int.
//
// This assumption should be localized to the chunk calculation functions; the rest of
// the library assumes that chunk numbers can be static_cast into the difference_type domain.
// FUNCTION TEMPLATE _Get_chunked_work_chunk_count
template <class _Diff>
constexpr size_t _Get_chunked_work_chunk_count(const size_t _Hw_threads, const _Diff _Count) {
// get the number of chunks to break work into to parallelize
const auto _Size_count = static_cast<size_t>(_Count); // no overflow due to forward iterators
// we assume _Hw_threads * _Oversubscription_multiplier does not overflow
return (_STD min)(_Hw_threads * _Oversubscription_multiplier, _Size_count);
}
// FUNCTION TEMPLATE _Get_least2_chunked_work_chunk_count
template <class _Diff>
constexpr size_t _Get_least2_chunked_work_chunk_count(const size_t _Hw_threads, const _Diff _Count) {
// get the number of chunks to break work into to parallelize, assuming chunks must be of size 2
const auto _Size_count = static_cast<size_t>(_Count); // no overflow due to forward iterators
// we assume _Hw_threads * _Oversubscription_multiplier does not overflow
return _Get_chunked_work_chunk_count(_Hw_threads, _Size_count / 2);
}
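// Worked example (illustrative): with _Hw_threads == 8 and _Oversubscription_multiplier == 32,
// at most 8 * 32 == 256 chunks are requested, so a range of 1000 elements is split into
// min(256, 1000) == 256 chunks, while a range of 100 elements gets min(256, 100) == 100
// single-element chunks. _Get_least2_chunked_work_chunk_count halves the count first, so the
// 100-element range would instead get min(256, 50) == 50 chunks of at least two elements each.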
// STRUCT TEMPLATE _Parallelism_allocator
struct _Parallelism_allocate_traits {
__declspec(allocator) static void* _Allocate(const size_t _Bytes) {
void* _Result = ::operator new(_Bytes, nothrow);
if (!_Result) {
_Throw_parallelism_resources_exhausted();
}
return _Result;
}
#ifdef __cpp_aligned_new
__declspec(allocator) static void* _Allocate_aligned(const size_t _Bytes, const size_t _Align) {
void* _Result = ::operator new (_Bytes, align_val_t{_Align}, nothrow);
if (!_Result) {
_Throw_parallelism_resources_exhausted();
}
return _Result;
}
#endif // __cpp_aligned_new
};
template <class _Ty = void>
struct _Parallelism_allocator {
using value_type = _Ty;
_Parallelism_allocator() = default;
template <class _Other>
constexpr _Parallelism_allocator(const _Parallelism_allocator<_Other>&) noexcept {}
_Ty* allocate(const size_t _Count) {
return static_cast<_Ty*>(
_Allocate<_New_alignof<_Ty>, _Parallelism_allocate_traits>(_Get_size_of_n<sizeof(_Ty)>(_Count)));
}
void deallocate(_Ty* const _Ptr, const size_t _Count) {
// no overflow check on the following multiply; we assume _Allocate did that check
_Deallocate<_New_alignof<_Ty>>(_Ptr, sizeof(_Ty) * _Count);
}
template <class _Other>
bool operator==(const _Parallelism_allocator<_Other>&) const noexcept {
return true;
}
template <class _Other>
bool operator!=(const _Parallelism_allocator<_Other>&) const noexcept {
return false;
}
};
template <class _Ty>
using _Parallel_vector = vector<_Ty, _Parallelism_allocator<_Ty>>;
template <class _Ty>
struct _Generalized_sum_drop { // drop off point for GENERALIZED_SUM intermediate results
_Ty* _Data;
size_t _Slots;
atomic<size_t> _Frontier;
explicit _Generalized_sum_drop(const size_t _Slots)
: _Data(static_cast<_Ty*>(
_Allocate<_New_alignof<_Ty>, _Parallelism_allocate_traits>(_Get_size_of_n<sizeof(_Ty)>(_Slots)))),
_Slots(_Slots), _Frontier(0) {}
~_Generalized_sum_drop() noexcept {
// pre: the caller has synchronized with all threads that modify _Data.
_Destroy_range(begin(), end());
// no overflow check on the following multiply; we assume _Allocate did that check
_Deallocate<_New_alignof<_Ty>>(_Data, sizeof(_Ty) * _Slots);
}
template <class... _Args>
void _Add_result(_Args&&... _Vals) noexcept /* terminates */ {
// constructs a _Ty in place with _Vals parameters perfectly forwarded
// pre: the number of results added is less than the size the drop was constructed with
const size_t _Target = _Frontier++;
_Construct_in_place(_Data[_Target], _STD forward<_Args>(_Vals)...);
}
_Ty* begin() {
return _Data;
}
_Ty* end() {
return _Data + _Frontier.load(memory_order_relaxed);
}
};
// VARIABLE TEMPLATE _Use_atomic_iterator
template <class _Ty>
struct _Atomic_is_usually_lock_free
: bool_constant<atomic<_Ty>::is_always_lock_free> { // deferred evaluation of atomic::is_always_lock_free
};
template <class _FwdIt>
inline constexpr bool _Use_atomic_iterator = conjunction_v<bool_constant<_Is_random_iter_v<_FwdIt>>,
is_trivially_copyable<_FwdIt>, _Atomic_is_usually_lock_free<_FwdIt>>;
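// For example (illustrative): a raw pointer such as int* is a random-access iterator, trivially
// copyable, and small enough that atomic<int*> is lock-free, so _Use_atomic_iterator is true and
// results can be combined with a plain atomic; a list<int>::iterator is not random-access, so the
// mutex-protected "chunk number" collectors below are chosen instead.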
// STRUCT TEMPLATE _Parallel_choose_min_result
template <class _Ty>
struct _Parallel_choose_min_result { // parallel results collector which uses atomic<_Ty> to choose the minimum value
_Ty _Last;
atomic<_Ty> _Result;
explicit _Parallel_choose_min_result(_Ty _Last_) : _Last{_Last_}, _Result{_Last} {}
_Ty _Get_result() const { // load the imbued value
return _Result.load(memory_order_relaxed);
}
bool _Complete() const { // tests whether a result has been found
return _Result.load(memory_order_relaxed) != _Last;
}
void _Imbue(size_t, const _Ty _Local_result) { // atomically sets the result to min(result, _Local_result)
_Ty _Expected{_Last};
while (!_Result.compare_exchange_weak(_Expected, _Local_result) && _Expected > _Local_result) { // keep trying
}
}
};
// STRUCT TEMPLATE _Parallel_choose_max_result
template <class _Ty>
struct _Parallel_choose_max_result { // parallel results collector which uses atomic<_Ty> to choose the maximum value
_Ty _Last;
atomic<_Ty> _Result;
explicit _Parallel_choose_max_result(_Ty _Last_) : _Last{_Last_}, _Result{_Last} {}
_Ty _Get_result() const { // load the imbued value
return _Result.load(memory_order_relaxed);
}
bool _Complete() const { // tests whether a result has been found
return _Result.load(memory_order_relaxed) != _Last;
}
void _Imbue(size_t, const _Ty _Local_result) { // atomically sets the result to max(result, _Local_result)
_Ty _Expected{_Last};
if (_Result.compare_exchange_strong(_Expected, _Local_result)) {
return;
}
while (_Expected < _Local_result && !_Result.compare_exchange_weak(_Expected, _Local_result)) { // keep trying
}
}
};
// STRUCT TEMPLATE _Parallel_choose_min_chunk
template <class _Ty>
struct _Parallel_choose_min_chunk {
// parallel results collector which uses atomic<chunk number> to choose the lowest chunk's result
_Ty _Result;
atomic<size_t> _Selected_chunk;
mutex _Mtx;
explicit _Parallel_choose_min_chunk(_Ty _Last) : _Result(_Last), _Selected_chunk{_Still_active}, _Mtx{} {}
_Ty _Get_result() const { // load the imbued value
return _Result;
}
bool _Complete() const { // tests whether a result has been found
return _Selected_chunk.load(memory_order_relaxed) != _Still_active;
}
void _Imbue(const size_t _Chunk, const _Ty _Local_result) {
// atomically sets the result to the lowest chunk's value
size_t _Expected = _Still_active;
while (!_Selected_chunk.compare_exchange_weak(_Expected, _Chunk)) {
// note: _Still_active is the maximum possible value, so it gets ignored implicitly
if (_Chunk > _Expected) {
return;
}
}
lock_guard<mutex> _Lck(_Mtx); // TRANSITION, VSO-671180
if (_Selected_chunk.load(memory_order_relaxed) == _Chunk) {
_Result = _Local_result;
}
}
};
// STRUCT TEMPLATE _Parallel_choose_max_chunk
template <class _Ty>
struct _Parallel_choose_max_chunk {
// parallel results collector which uses atomic<chunk number> to choose the highest chunk's result
_Ty _Result;
atomic<size_t> _Selected_chunk;
mutex _Mtx;
explicit _Parallel_choose_max_chunk(_Ty _Last) : _Result(_Last), _Selected_chunk{_Still_active}, _Mtx{} {}
_Ty _Get_result() const { // load the imbued value
return _Result;
}
bool _Complete() const { // tests whether a result has been found
return _Selected_chunk.load(memory_order_relaxed) != _Still_active;
}
void _Imbue(const size_t _Chunk, const _Ty _Local_result) {
// atomically sets the result to the highest chunk's value
size_t _Expected = _Still_active;
while (!_Selected_chunk.compare_exchange_weak(_Expected, _Chunk)) {
// wrap _Still_active down to 0 so that only 1 branch is necessary:
if (_Chunk + 1 < _Expected + 1) {
return;
}
}
lock_guard<mutex> _Lck(_Mtx); // TRANSITION, VSO-671180
if (_Selected_chunk.load(memory_order_relaxed) == _Chunk) {
_Result = _Local_result;
}
}
};
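// Illustrative scenario: if a parallel find partitions its range into chunks 0..3 and both chunk 1
// and chunk 2 report a hit, _Parallel_choose_min_chunk keeps chunk 1's iterator (the earlier
// position in the sequence), because _Imbue only installs a result whose chunk number is lower
// than the one already recorded; _Parallel_choose_max_chunk keeps chunk 2's iterator instead,
// which is what the find_end family below wants.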
// CLASS TEMPLATE _Work_stealing_deque
template <class _Ty>
struct alignas(_Ty) alignas(size_t) alignas(_Atomic_counter_t) _Circular_buffer { // work stealing deque extent type
static_assert(is_trivial_v<_Ty>, "Work stealing deques work only with trivial operations");
size_t _Log_size;
_Atomic_counter_t _Ref_count;
void _Release() {
static_assert(is_trivially_destructible_v<_Circular_buffer>, "global delete requires trivial destruction");
if (_MT_DECR(_Ref_count) == 0) {
::operator delete(this);
}
}
static _Circular_buffer* _Allocate_circular_buffer(const size_t _New_log_size) {
// allocate a circular buffer with space for 2^_New_log_size elements
if (_New_log_size >= 32) {
_Throw_parallelism_resources_exhausted();
}
const size_t _Count = static_cast<size_t>(1) << _New_log_size;
constexpr size_t _Max_bytes = static_cast<size_t>(-1) - sizeof(_Circular_buffer);
if (_Max_bytes / sizeof(_Ty) < _Count) {
_Throw_parallelism_resources_exhausted();
}
const size_t _Result_bytes = _Count * sizeof(_Ty) + sizeof(_Circular_buffer);
static_assert(alignof(_Ty) <= alignof(max_align_t), "incapable of supporting the requested alignment");
const auto _Result = static_cast<_Circular_buffer*>(::operator new(_Result_bytes));
_Result->_Log_size = _New_log_size;
_Result->_Ref_count = 1;
return _Result;
}
static _Circular_buffer* _New_circular_buffer() { // allocate a circular buffer with a default number of elements
return _Allocate_circular_buffer(6); // start with 64 elements
}
_Ty* _Get_base() { // get the base address where the _Ty instances are stored
return reinterpret_cast<_Ty*>(this + 1);
}
const _Ty* _Get_base() const { // get the base address where the _Ty instances are stored
return reinterpret_cast<const _Ty*>(this + 1);
}
_Ty& _Subscript(const size_t _Idx) { // get a reference to the _Idxth element
const auto _Mask = (static_cast<size_t>(1) << _Log_size) - static_cast<size_t>(1);
return _Get_base()[_Idx & _Mask];
}
const _Ty& _Subscript(const size_t _Idx) const { // get a reference to the _Idxth element
const auto _Mask = (static_cast<size_t>(1) << _Log_size) - static_cast<size_t>(1);
return _Get_base()[_Idx & _Mask];
}
_Circular_buffer* _Grow(const size_t _Bottom, const size_t _Top) const {
// create a bigger _Circular_buffer suitable for use by a _Work_stealing_deque<_Ty> with bounds _Bottom and _Top
const size_t _New_log_size = _Log_size + 1;
_Circular_buffer* _Result = _Allocate_circular_buffer(_New_log_size);
for (size_t _Idx = _Top; _Idx < _Bottom; ++_Idx) {
_Result->_Subscript(_Idx) = _Subscript(_Idx);
}
return _Result;
}
};
#pragma warning(push)
#pragma warning(disable : 4324) // structure was padded due to alignment specifier
template <class _Ty>
class alignas(hardware_destructive_interference_size) _Work_stealing_deque {
// thread-local work-stealing deque, which allows efficient access from a single owner thread at the "bottom"
// of the queue, and any thread access to the "top" of the queue. Originally described in the paper
// "Dynamic Circular Work-Stealing Deque" by David Chase and Yossi Lev
public:
_Work_stealing_deque() = default;
_Work_stealing_deque(const _Work_stealing_deque&) = delete;
_Work_stealing_deque& operator=(const _Work_stealing_deque&) = delete;
~_Work_stealing_deque() noexcept {
_Segment->_Release();
}
void _Push_bottom(_Ty& _Val) {
// attempts to push _Val onto the bottom of this queue
// may be accessed by owning thread only
const auto _Local_b = _Bottom.load();
if (_Local_b == SIZE_MAX) {
// we assume that any input range won't be divided into more than SIZE_MAX subproblems;
// treat overflow of that kind as OOM
_Throw_parallelism_resources_exhausted();
}
const auto _Local_t = _Top.load();
const auto _Size = _Local_b - _Local_t;
if (_Size >= (static_cast<size_t>(1) << _Segment->_Log_size)) {
auto _New_segment = _Segment->_Grow(_Local_b, _Local_t);
_Circular_buffer<_Ty>* _Detached_segment;
{
lock_guard<mutex> _Lck(_Segment_lock); // TRANSITION, VSO-671180
_Detached_segment = _STD exchange(_Segment, _New_segment);
} // unlock
_Detached_segment->_Release();
}
_Segment->_Subscript(_Local_b) = _Val;
_Bottom.store(_Local_b + 1U);
}
bool _Steal(_Ty& _Val) noexcept {
// attempt to pop an item from the top of this deque
// may be accessed by any thread
// returns false if the deque was empty and _Val is indeterminate; otherwise, returns true and sets _Val to the
// element retrieved from the top of the deque.
auto _Local_t = _Top.load();
size_t _Desired_t;
do {
if (_Bottom.load() <= _Local_t) { // deque was empty
return false;
}
_Circular_buffer<_Ty>* _Stealing_segment;
{
lock_guard<mutex> _Lck(_Segment_lock); // TRANSITION, VSO-671180
_Stealing_segment = _Segment;
_MT_INCR(_Stealing_segment->_Ref_count);
}
_Val = _Stealing_segment->_Subscript(_Local_t); // speculative read/write data race
_Stealing_segment->_Release();
// The above is technically prohibited by the C++ memory model, but happens
// to be well defined on all hardware this implementation targets.
// Hardware with trap representations or similar must not use this implementation.
_Desired_t = _Local_t + 1U;
} while (!_Top.compare_exchange_strong(_Local_t, _Desired_t)); // if a data race occurred, try again
return true;
}
bool _Try_pop_bottom(_Ty& _Val) noexcept {
// attempt to pop an item from the bottom of this deque into _Val
// may be accessed by owning thread only
auto _Local_b = _Bottom.load();
if (_Local_b == 0) { // queue never contained any elements (should never happen)
return false;
}
--_Local_b;
_Bottom.store(_Local_b);
auto _Local_t = _Top.load();
if (_Local_b < _Local_t) { // all elements were stolen before we got here
_Bottom.store(_Local_t);
return false;
}
// memory model says following load is OK, since _Push_bottom can't run concurrently
_Val = _Segment->_Subscript(_Local_b);
if (_Local_b > _Local_t) {
// other threads only look at top, so we get the bottom without synchronization
return true;
}
// We're trying to read the last element that another thread may be trying to steal;
// see who gets to keep the element through _Top (effectively, steal from ourselves)
const auto _Desired_top = _Local_t + 1U;
if (_Top.compare_exchange_strong(_Local_t, _Desired_top)) {
_Bottom.store(_Desired_top);
return true;
} else {
_Bottom.store(_Local_t);
return false;
}
}
private:
atomic<size_t> _Bottom{0}; // modified by only owning thread
atomic<size_t> _Top{0}; // modified by all threads
_Guarded_by_(_Segment_lock) _Circular_buffer<_Ty>* _Segment{_Circular_buffer<_Ty>::_New_circular_buffer()};
mutex _Segment_lock{};
};
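// Minimal usage sketch (illustrative; _Task, _My_queue, and _Other_queue are hypothetical): the
// owning thread pushes and pops at the bottom of its own deque, while idle threads steal from the
// top of other threads' deques.
//
//     _Work_stealing_deque<_Task> _My_queue;
//     _Task _Item{};
//     _My_queue._Push_bottom(_Item);          // owner only
//     if (_My_queue._Try_pop_bottom(_Item)) { // owner only; LIFO for cache locality
//         // run _Item
//     }
//     if (_Other_queue._Steal(_Item)) {       // any thread; takes from the top
//         // run _Item
//     }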
#pragma warning(pop)
// STRUCT TEMPLATE _Work_stealing_membership
enum class _Steal_result { _Success, _Abort, _Done };
template <class _Ty>
struct _Work_stealing_team;
template <class _Ty>
struct _Work_stealing_membership { // thread-local "ticket" that team members use to talk with a _Work_stealing_team
using _Diff = typename _Ty::difference_type;
size_t _Id;
_Work_stealing_team<_Ty>* _Team;
_Diff _Work_complete;
void _Push_bottom(_Ty& _Val) {
_Team->_Queues[_Id]._Push_bottom(_Val);
}
bool _Try_pop_bottom(_Ty& _Val) noexcept {
return _Team->_Queues[_Id]._Try_pop_bottom(_Val);
}
_Steal_result _Steal(_Ty& _Val) noexcept {
_Diff _Remaining;
const auto _Completed_this_time = _STD exchange(_Work_complete, {});
if (_Completed_this_time == 0) {
_Remaining = _Team->_Remaining_work.load();
} else {
_Remaining = _Team->_Remaining_work -= _Completed_this_time;
}
if (_Remaining == 0) {
return _Steal_result::_Done;
}
const size_t _High = _Team->_Queues_used.load() + 1;
size_t _Idx = _Id;
for (;;) {
if (_Idx == 0) {
_Idx = _High;
}
--_Idx;
if (_Idx == _Id) {
return _Steal_result::_Abort;
}
if (_Team->_Queues[_Idx]._Steal(_Val)) {
return _Steal_result::_Success;
}
}
}
void _Leave() noexcept {
_Team->_Leave_team(_Id);
}
};
// STRUCT TEMPLATE _Work_stealing_team
template <class _Ty>
struct _Work_stealing_team { // inter-thread communication for threads working on a single task
using _Diff = typename _Ty::difference_type;
static _Parallel_vector<size_t> _Get_queues(const size_t _Queue_count) {
_Parallel_vector<size_t> _Result(_Queue_count);
_STD iota(_Result.begin(), _Result.end(), size_t{});
return _Result;
}
_Work_stealing_team(size_t _Threads, _Diff _Total_work)
: _Queues(_Threads), _Queues_used(0), _Remaining_work(_Total_work), _Available_mutex(),
_Available_queues(greater<>{}, _Get_queues(_Threads)) { // set up per-thread queues and the pool of available queue ids
}
_Work_stealing_membership<_Ty> _Join_team() noexcept {
size_t _Id;
{
lock_guard<mutex> _Lck(_Available_mutex); // TRANSITION, VSO-671180
_Id = _Available_queues.top();
_Available_queues.pop();
} // unlock
// set _Queues_used to the high water mark of queues used
size_t _High_water = _Queues_used.load();
while (_High_water < _Id && !_Queues_used.compare_exchange_weak(_High_water, _Id)) { // keep trying
}
return _Work_stealing_membership<_Ty>{_Id, this, 0};
}
void _Leave_team(size_t _Id) noexcept {
lock_guard<mutex> _Lck(_Available_mutex); // TRANSITION, VSO-671180
_Available_queues.push(_Id);
}
_Parallel_vector<_Work_stealing_deque<_Ty>> _Queues;
atomic<size_t> _Queues_used;
atomic<_Diff> _Remaining_work;
mutex _Available_mutex;
priority_queue<size_t, _Parallel_vector<size_t>, greater<>> _Available_queues;
};
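// Illustrative flow (hypothetical worker): a thread joining a _Work_stealing_team receives a
// _Work_stealing_membership ticket; it pushes locally generated subproblems with _Push_bottom,
// prefers _Try_pop_bottom on its own queue, and when that queue runs dry calls _Steal, which
// scans the other members' queues and reports _Done once _Remaining_work reaches zero (or _Abort
// if a full scan finds nothing to take).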
// STRUCT TEMPLATE _Static_partition_key
template <class _Diff>
struct _Static_partition_key { // "pointer" identifying a static partition
size_t _Chunk_number; // In range [0, numeric_limits<_Diff>::max()]
_Diff _Start_at;
_Diff _Size;
explicit operator bool() const { // test if this is a valid key
return _Chunk_number != static_cast<size_t>(-1);
}
};
// STRUCT TEMPLATE _Static_partition_team
template <class _Diff>
struct _Static_partition_team { // common data for all static partitioned ops
atomic<size_t> _Consumed_chunks;
size_t _Chunks;
_Diff _Count;
_Diff _Chunk_size;
_Diff _Unchunked_items;
_Static_partition_team(const _Diff _Count_, const size_t _Chunks_)
: _Consumed_chunks{0}, _Chunks{_Chunks_}, _Count{_Count_}, _Chunk_size{static_cast<_Diff>(
_Count_ / static_cast<_Diff>(_Chunks_))},
_Unchunked_items{static_cast<_Diff>(_Count_ % static_cast<_Diff>(_Chunks_))} {
// Calculate common data for statically partitioning iterator ranges.
// pre: _Count_ >= _Chunks_ && _Chunks_ >= 1
}
_Static_partition_key<_Diff> _Get_chunk_key(const size_t _This_chunk) const {
const auto _This_chunk_diff = static_cast<_Diff>(_This_chunk);
auto _This_chunk_size = _Chunk_size;
auto _This_chunk_start_at = static_cast<_Diff>(_This_chunk_diff * _This_chunk_size);
if (_This_chunk_diff < _Unchunked_items) {
// chunks at index lower than _Unchunked_items get an extra item,
// and need to shift forward by all their predecessors' extra items
_This_chunk_start_at += _This_chunk_diff;
++_This_chunk_size;
} else { // chunks without an extra item need to account for all the extra items
_This_chunk_start_at += _Unchunked_items;
}
return {_This_chunk, _This_chunk_start_at, _This_chunk_size};
}
_Diff _Get_chunk_offset(const size_t _This_chunk) const {
const auto _This_chunk_diff = static_cast<_Diff>(_This_chunk);
return _This_chunk_diff * _Chunk_size + (_STD min)(_This_chunk_diff, _Unchunked_items);
}
_Static_partition_key<_Diff> _Get_next_key() {
// retrieves the next static partition key to process, if it exists;
// otherwise, retrieves an invalid partition key
const auto _This_chunk = _Consumed_chunks++;
if (_This_chunk < _Chunks) {
return _Get_chunk_key(_This_chunk);
}
return {static_cast<size_t>(-1), 0, 0};
}
};
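// Worked example (illustrative): a team built with _Count == 10 and _Chunks == 3 has
// _Chunk_size == 3 and _Unchunked_items == 1, so the chunk keys are {0, start 0, size 4},
// {1, start 4, size 3}, and {2, start 7, size 3}: the first _Unchunked_items chunks absorb one
// extra element each, and later chunks shift their start by the extra elements already handed out.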
// STRUCT TEMPLATE _Iterator_range
template <class _FwdIt>
struct _Iterator_range { // record of a partition of work
_FwdIt _First;
_FwdIt _Last;
};
// STRUCT TEMPLATE _Static_partition_range
template <class _FwdIt, class _Diff = _Iter_diff_t<_FwdIt>, bool = _Is_random_iter_v<_FwdIt>>
struct _Static_partition_range;
template <class _RanIt, class _Diff>
struct _Static_partition_range<_RanIt, _Diff, true> {
using _Target_diff = _Iter_diff_t<_RanIt>;
using _URanIt = _Unwrapped_t<const _RanIt&>;
_URanIt _Start_at;
using _Chunk_type = _Iterator_range<_URanIt>;
_RanIt _Populate(const _Static_partition_team<_Diff>& _Team, _RanIt _First) {
// statically partition a random-access iterator range and return next(_First, _Team._Count)
// pre: _Populate hasn't yet been called on this instance
auto _Result = _First + static_cast<_Target_diff>(_Team._Count); // does verification
_Start_at = _Get_unwrapped(_First);
return _Result;
}
bool _Populate(const _Static_partition_team<_Diff>& _Team, _RanIt _First, _RanIt _Last) {
// statically partition a random-access iterator range and check if the range ends at _Last
// pre: _Populate hasn't yet been called on this instance
_Adl_verify_range(_First, _Last);
_Start_at = _Get_unwrapped(_First);
return _Team._Count == _Last - _First;
}
_URanIt _Get_first(size_t /* _Chunk_number */, const _Diff _Offset) {
// get the first iterator for _Chunk _Chunk_number (which is at offset _Offset)
return _Start_at + static_cast<_Target_diff>(_Offset);
}
_Chunk_type _Get_chunk(const _Static_partition_key<_Diff> _Key) const {
// get a static partition chunk from a random-access range
// pre: _Key was generated by the _Static_partition_team instance passed to a previous call to _Populate
const auto _First = _Start_at + static_cast<_Target_diff>(_Key._Start_at);
return {_First, _First + static_cast<_Target_diff>(_Key._Size)};
}
};
template <class _FwdIt, class _Diff>
struct _Static_partition_range<_FwdIt, _Diff, false> {
using _Target_diff = _Iter_diff_t<_FwdIt>;
using _UFwdIt = _Unwrapped_t<const _FwdIt&>;
_Parallel_vector<_UFwdIt> _Division_points;
using _Chunk_type = _Iterator_range<_UFwdIt>;
_FwdIt _Populate(const _Static_partition_team<_Diff>& _Team, _FwdIt _First) {
// statically partition a forward iterator range and return next(_First, _Team._Count)
// pre: _Populate hasn't yet been called on this instance
const auto _Chunks = _Team._Chunks;
_Division_points.resize(_Chunks + 1);
// The following potentially narrowing cast is OK because caller has ensured
// next(_First, _Team._Count) is valid (and _Chunk_size <= _Count)
const auto _Chunk_size = static_cast<_Target_diff>(_Team._Chunk_size);
const auto _Unchunked_items = _Team._Unchunked_items;
auto _Result = _Division_points.begin();
*_Result = _Get_unwrapped(_First);
for (_Diff _Idx{}; _Idx < _Unchunked_items; ++_Idx) { // record bounds of chunks with an extra item
_STD advance(_First, static_cast<_Target_diff>(_Chunk_size + 1));
*++_Result = _Get_unwrapped(_First);
}
const auto _Diff_chunks = static_cast<_Diff>(_Chunks);
for (_Diff _Idx = _Unchunked_items; _Idx < _Diff_chunks; ++_Idx) { // record bounds of chunks with no extra item
_STD advance(_First, _Chunk_size);
*++_Result = _Get_unwrapped(_First);
}
return _First;
}
bool _Populate(const _Static_partition_team<_Diff>& _Team, _FwdIt _First, _FwdIt _Last) {
// statically partition a forward iterator range and check if the range ends at _Last
// pre: _Populate hasn't yet been called on this instance
const auto _Chunks = _Team._Chunks;
_Division_points.resize(_Chunks + 1);
const auto _Chunk_size = _Team._Chunk_size;
const auto _Unchunked_items = _Team._Unchunked_items;
auto _Result = _Division_points.begin();
*_Result = _Get_unwrapped(_First);
for (_Diff _Idx{}; _Idx < _Unchunked_items; ++_Idx) { // record bounds of chunks with an extra item
for (_Diff _This_chunk_size = _Chunk_size; 0 < _This_chunk_size--;) {
if (_First == _Last) {
return false;
}
++_First;
}
*++_Result = _Get_unwrapped(_First);
}
const auto _Diff_chunks = static_cast<_Diff>(_Chunks);
for (_Diff _Idx = _Unchunked_items; _Idx < _Diff_chunks; ++_Idx) { // record bounds of chunks with no extra item
for (_Diff _This_chunk_size = _Chunk_size; 0 < _This_chunk_size--;) {
if (_First == _Last) {
return false;
}
++_First;
}
*++_Result = _Get_unwrapped(_First);
}
return _First == _Last;
}
_UFwdIt _Get_first(const size_t _Chunk_number, _Diff /* _Offset */) {
// get the first iterator for _Chunk _Chunk_number (which is at offset _Offset)
return _Division_points[_Chunk_number];
}
_Chunk_type _Get_chunk(const _Static_partition_key<_Diff> _Key) const {
// get a static partition chunk from a forward range
// pre: _Key was generated by the _Static_partition_team instance passed to a previous call to _Populate
return {_Division_points[_Key._Chunk_number], _Division_points[_Key._Chunk_number + 1]};
}
};
// STRUCT TEMPLATE _Static_partition_range_backward
template <class _BidIt, class _Diff = _Iter_diff_t<_BidIt>, bool = _Is_random_iter_v<_BidIt>>
struct _Static_partition_range_backward;
template <class _RanIt, class _Diff>
struct _Static_partition_range_backward<_RanIt, _Diff, true> {
using _Target_diff = _Iter_diff_t<_RanIt>;
_Unwrapped_t<const _RanIt&> _Start_at;
using _Chunk_type = _Iterator_range<_Unwrapped_t<const _RanIt&>>;
void _Populate(const _Static_partition_team<_Diff>& _Team, _RanIt _Last) {
// statically partition a random-access iterator range ending at _Last
// pre: _Populate hasn't yet been called on this instance
_Start_at = _Get_unwrapped_n(_Last, -static_cast<_Target_diff>(_Team._Count));
}
_Chunk_type _Get_chunk(const _Static_partition_key<_Diff> _Key) const {
// get a static partition chunk from a random-access range
// pre: _Key was generated by the _Static_partition_team instance passed to a previous call to _Populate
const auto _Last = _Start_at - static_cast<_Target_diff>(_Key._Start_at);
return {_Last - static_cast<_Target_diff>(_Key._Size), _Last};
}
};
template <class _BidIt, class _Diff>
struct _Static_partition_range_backward<_BidIt, _Diff, false> {
using _Target_diff = _Iter_diff_t<_BidIt>;
_Parallel_vector<_Unwrapped_t<const _BidIt&>> _Division_points;
using _Chunk_type = _Iterator_range<_Unwrapped_t<const _BidIt&>>;
void _Populate(const _Static_partition_team<_Diff>& _Team, _BidIt _Last) {
// statically partition a bidirectional iterator range ending at _Last
// pre: _Populate hasn't yet been called on this instance
const auto _Chunks = _Team._Chunks;
_Division_points.resize(_Chunks + 1);
const auto _Neg_chunk_size = static_cast<_Target_diff>(-_Team._Chunk_size);
const auto _Unchunked_items = _Team._Unchunked_items;
auto _Result = _Division_points.begin(); // does range checking by incrementing in the checked domain
*_Result = _Get_unwrapped(_Last);
for (_Diff _Idx{}; _Idx < _Unchunked_items; ++_Idx) {
_STD advance(_Last, static_cast<_Target_diff>(_Neg_chunk_size - 1));
*++_Result = _Get_unwrapped(_Last);
}
const auto _Diff_chunks = static_cast<_Diff>(_Chunks);
for (_Diff _Idx = _Unchunked_items; _Idx < _Diff_chunks; ++_Idx) {
_STD advance(_Last, _Neg_chunk_size);
*++_Result = _Get_unwrapped(_Last);
}
}
_Chunk_type _Get_chunk(const _Static_partition_key<_Diff> _Key) const {
// get a static partition chunk from a bidirectional range
// pre: _Key was generated by the _Static_partition_team instance passed to a previous call to _Populate
return {_Division_points[_Key._Chunk_number + 1], _Division_points[_Key._Chunk_number]};
}
};
// FUNCTION TEMPLATE _Distance_any
template <class _InIt1, class _InIt2>
_Common_diff_t<_InIt1, _InIt2> _Distance_any(_InIt1 _First1, _InIt1 _Last1, _InIt2 _First2, _InIt2 _Last2) {
// get the distance from 2 ranges which should have identical lengths
if constexpr (_Is_random_iter_v<_InIt1>) {
return _Last1 - _First1;
} else if constexpr (_Is_random_iter_v<_InIt2>) {
return _Last2 - _First2;
} else {
return _STD distance(_First1, _Last1);
}
}
// FUNCTION TEMPLATE _Distance_min
template <class _InIt1, class _InIt2>
_Common_diff_t<_InIt1, _InIt2> _Distance_min(_InIt1 _First1, const _InIt1 _Last1, _InIt2 _First2, const _InIt2 _Last2) {
// get min(distance(_First1, _Last1), distance(_First2, _Last2))
using _CT = _Common_diff_t<_InIt1, _InIt2>;
_CT _Result{};
if constexpr (_Is_random_iter_v<_InIt1> && _Is_random_iter_v<_InIt2>) {
const _CT _Count1 = _Last1 - _First1;
const _CT _Count2 = _Last2 - _First2;
_Result = (_STD min)(_Count1, _Count2);
} else if constexpr (_Is_random_iter_v<_InIt1>) {
for (auto _Count1 = _Last1 - _First1; 0 < _Count1 && _First2 != _Last2; --_Count1) {
++_First2;
++_Result;
}
} else if constexpr (_Is_random_iter_v<_InIt2>) {
for (auto _Count2 = _Last2 - _First2; 0 < _Count2 && _First1 != _Last1; --_Count2) {
++_First1;
++_Result;
}
} else {
while (_First1 != _Last1 && _First2 != _Last2) {
++_First1;
++_First2;
++_Result;
}
}
return _Result;
}
// PARALLEL FUNCTION TEMPLATE all_of
template <bool _Invert, class _FwdIt, class _Pr>
struct _Static_partitioned_all_of_family2 { // all_of/any_of/none_of task scheduled on the system thread pool
_Static_partition_team<_Iter_diff_t<_FwdIt>> _Team;
_Static_partition_range<_FwdIt> _Basis;
_Pr _Pred;
_Cancellation_token _Cancel_token;
_Static_partitioned_all_of_family2(
_FwdIt _First, const size_t _Hw_threads, const _Iter_diff_t<_FwdIt> _Count, _Pr _Pred_)
: _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Basis{}, _Pred(_Pred_), _Cancel_token{} {
_Basis._Populate(_Team, _First);
}
_Cancellation_status _Process_chunk() {
if (_Cancel_token._Is_canceled()) {
return _Cancellation_status::_Canceled;
}
const auto _Key = _Team._Get_next_key();
if (!_Key) {
return _Cancellation_status::_Canceled;
}
const auto _Range = _Basis._Get_chunk(_Key);
for (auto _First = _Range._First; _First != _Range._Last; ++_First) {
if (_Pred(*_First) ? _Invert : !_Invert) {
_Cancel_token._Cancel();
return _Cancellation_status::_Canceled;
}
}
return _Cancellation_status::_Running;
}
static void __stdcall _Threadpool_callback(
__std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ {
_Run_available_chunked_work(*static_cast<_Static_partitioned_all_of_family2*>(_Context));
}
};
template <bool _Invert, class _FwdIt, class _Pr>
bool _All_of_family_parallel(_FwdIt _First, const _FwdIt _Last, _Pr _Pred) {
// test if all elements in [_First, _Last) satisfy _Pred (or !_Pred if _Invert is true) in parallel
const size_t _Hw_threads = __std_parallel_algorithms_hw_threads();
if (_Hw_threads > 1) { // parallelize on multiprocessor machines...
const auto _Count = _STD distance(_First, _Last);
if (_Count >= 2) { // ... with at least 2 elements
_TRY_BEGIN
_Static_partitioned_all_of_family2<_Invert, _FwdIt, _Pr> _Operation{_First, _Hw_threads, _Count, _Pred};
_Run_chunked_parallel_work(_Hw_threads, _Operation);
return !_Operation._Cancel_token._Is_canceled_relaxed();
_CATCH(const _Parallelism_resources_exhausted&)
// fall through to serial case below
_CATCH_END
}
}
for (; _First != _Last; ++_First) {
if (_Pred(*_First) ? _Invert : !_Invert) {
return false;
}
}
return true;
}
template <class _ExPo, class _FwdIt, class _Pr, _Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_NODISCARD bool all_of(_ExPo&&, _FwdIt _First, _FwdIt _Last, _Pr _Pred) noexcept /* terminates */ {
// test if all elements in [_First, _Last) satisfy _Pred with the indicated execution policy
_REQUIRE_PARALLEL_ITERATOR(_FwdIt);
_Adl_verify_range(_First, _Last);
auto _UFirst = _Get_unwrapped(_First);
const auto _ULast = _Get_unwrapped(_Last);
if constexpr (remove_reference_t<_ExPo>::_Parallelize) {
return _All_of_family_parallel<false>(_UFirst, _ULast, _Pass_fn(_Pred));
} else {
return _STD all_of(_UFirst, _ULast, _Pass_fn(_Pred));
}
}
// PARALLEL FUNCTION TEMPLATE any_of
template <class _ExPo, class _FwdIt, class _Pr, _Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_NODISCARD bool any_of(_ExPo&&, const _FwdIt _First, const _FwdIt _Last, _Pr _Pred) noexcept /* terminates */ {
// test if any element in [_First, _Last) satisfies _Pred with the indicated execution policy
_REQUIRE_PARALLEL_ITERATOR(_FwdIt);
_Adl_verify_range(_First, _Last);
auto _UFirst = _Get_unwrapped(_First);
const auto _ULast = _Get_unwrapped(_Last);
if constexpr (remove_reference_t<_ExPo>::_Parallelize) {
return !_All_of_family_parallel<true>(_UFirst, _ULast, _Pass_fn(_Pred));
} else {
return _STD any_of(_UFirst, _ULast, _Pass_fn(_Pred));
}
}
// PARALLEL FUNCTION TEMPLATE none_of
template <class _ExPo, class _FwdIt, class _Pr, _Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_NODISCARD bool none_of(_ExPo&&, const _FwdIt _First, const _FwdIt _Last, _Pr _Pred) noexcept /* terminates */ {
// test if no element in [_First, _Last) satisfies _Pred with the indicated execution policy
_REQUIRE_PARALLEL_ITERATOR(_FwdIt);
_Adl_verify_range(_First, _Last);
auto _UFirst = _Get_unwrapped(_First);
const auto _ULast = _Get_unwrapped(_Last);
if constexpr (remove_reference_t<_ExPo>::_Parallelize) {
return _All_of_family_parallel<true>(_UFirst, _ULast, _Pass_fn(_Pred));
} else {
return _STD none_of(_UFirst, _ULast, _Pass_fn(_Pred));
}
}
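// Illustrative usage (hypothetical names): the three predicate tests above accept any execution
// policy; with execution::par the chunks share one _Cancellation_token, so processing stops as
// soon as a decisive element is seen.
//
//     std::vector<int> v = {2, 4, 6, 8};
//     bool b1 = std::all_of(std::execution::par, v.begin(), v.end(), [](int x) { return x % 2 == 0; }); // true
//     bool b2 = std::any_of(std::execution::par, v.begin(), v.end(), [](int x) { return x > 7; });      // true
//     bool b3 = std::none_of(std::execution::par, v.begin(), v.end(), [](int x) { return x < 0; });     // true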
// PARALLEL FUNCTION TEMPLATE for_each
template <class _FwdIt, class _Fn>
void _For_each_ivdep(_FwdIt _First, const _FwdIt _Last, _Fn _Func) {
// perform function for each element [_First, _Last) assuming independent loop bodies
#pragma loop(ivdep)
for (; _First != _Last; ++_First) {
_Func(*_First);
}
}
template <class _FwdIt, class _Diff, class _Fn>
struct _Static_partitioned_for_each2 { // for_each task scheduled on the system thread pool
_Static_partition_team<_Diff> _Team;
_Static_partition_range<_FwdIt, _Diff> _Basis;
_Fn _Func;
_Static_partitioned_for_each2(const size_t _Hw_threads, const _Diff _Count, _Fn _Fx)
: _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Basis{}, _Func(_Fx) {}
_Cancellation_status _Process_chunk() {
const auto _Key = _Team._Get_next_key();
if (_Key) {
const auto _Chunk = _Basis._Get_chunk(_Key);
_For_each_ivdep(_Chunk._First, _Chunk._Last, _Func);
return _Cancellation_status::_Running;
}
return _Cancellation_status::_Canceled;
}
static void __stdcall _Threadpool_callback(
__std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ {
_Run_available_chunked_work(*static_cast<_Static_partitioned_for_each2*>(_Context));
}
};
template <class _ExPo, class _FwdIt, class _Fn, _Enable_if_execution_policy_t<_ExPo> /* = 0 */>
void for_each(_ExPo&&, _FwdIt _First, _FwdIt _Last, _Fn _Func) noexcept /* terminates */ {
// perform function for each element [_First, _Last) with the indicated execution policy
_REQUIRE_PARALLEL_ITERATOR(_FwdIt);
_Adl_verify_range(_First, _Last);
auto _UFirst = _Get_unwrapped(_First);
const auto _ULast = _Get_unwrapped(_Last);
if constexpr (remove_reference_t<_ExPo>::_Parallelize) {
const size_t _Hw_threads = __std_parallel_algorithms_hw_threads();
if (_Hw_threads > 1) { // parallelize on multiprocessor machines...
auto _Count = _STD distance(_UFirst, _ULast);
if (_Count >= 2) { // ... with at least 2 elements
_TRY_BEGIN
auto _Passed_fn = _Pass_fn(_Func);
_Static_partitioned_for_each2<decltype(_UFirst), decltype(_Count), decltype(_Passed_fn)> _Operation{
_Hw_threads, _Count, _Passed_fn};
_Operation._Basis._Populate(_Operation._Team, _UFirst);
_Run_chunked_parallel_work(_Hw_threads, _Operation);
return;
_CATCH(const _Parallelism_resources_exhausted&)
// fall through to serial case below
_CATCH_END
}
}
_For_each_ivdep(_UFirst, _ULast, _Pass_fn(_Func));
} else {
for (; _UFirst != _ULast; ++_UFirst) {
_Func(*_UFirst);
}
}
}
// PARALLEL FUNCTION TEMPLATE for_each_n
template <class _FwdIt, class _Diff, class _Fn>
_FwdIt _For_each_n_ivdep(_FwdIt _First, _Diff _Count, _Fn _Func) {
// perform function for each element [_First, _First + _Count) assuming independent loop bodies
#pragma loop(ivdep)
for (; 0 < _Count; --_Count, (void) ++_First) {
_Func(*_First);
}
return _First;
}
template <class _ExPo, class _FwdIt, class _Diff, class _Fn, _Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_FwdIt for_each_n(_ExPo&&, _FwdIt _First, const _Diff _Count_raw, _Fn _Func) noexcept /* terminates */ {
// perform function for each element [_First, _First + _Count)
_REQUIRE_PARALLEL_ITERATOR(_FwdIt);
_Algorithm_int_t<_Diff> _Count = _Count_raw;
if (0 < _Count) {
auto _UFirst = _Get_unwrapped_n(_First, _Count);
if constexpr (remove_reference_t<_ExPo>::_Parallelize) {
const size_t _Hw_threads = __std_parallel_algorithms_hw_threads();
if (_Hw_threads > 1 && _Count >= 2) { // parallelize on multiprocessor machines with at least 2 elements
_TRY_BEGIN
auto _Passed_fn = _Pass_fn(_Func);
_Static_partitioned_for_each2<decltype(_UFirst), decltype(_Count), decltype(_Passed_fn)> _Operation{
_Hw_threads, _Count, _Passed_fn};
_Seek_wrapped(_First, _Operation._Basis._Populate(_Operation._Team, _UFirst));
_Run_chunked_parallel_work(_Hw_threads, _Operation);
return _First;
_CATCH(const _Parallelism_resources_exhausted&)
// fall through to serial case below
_CATCH_END
}
_Seek_wrapped(_First, _For_each_n_ivdep(_UFirst, _Count, _Pass_fn(_Func)));
} else {
for (; 0 < _Count; --_Count, (void) ++_UFirst) {
_Func(*_UFirst);
}
_Seek_wrapped(_First, _UFirst);
}
}
return _First;
}
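// Illustrative usage (hypothetical names): the parallel for_each/for_each_n overloads require the
// function object not to introduce data races between elements; the _For_each_ivdep helpers above
// additionally tell the optimizer the loop bodies are independent.
//
//     std::vector<double> samples(1 << 20, 1.0);
//     std::for_each(std::execution::par, samples.begin(), samples.end(), [](double& d) { d *= 2.0; });
//     std::for_each_n(std::execution::par, samples.begin(), 100, [](double& d) { d += 1.0; });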
// PARALLEL FUNCTION TEMPLATE find
template <class _FwdIt>
using _Parallel_find_results = conditional_t<_Use_atomic_iterator<_FwdIt>, _Parallel_choose_min_result<_FwdIt>,
_Parallel_choose_min_chunk<_FwdIt>>;
template <class _FwdIt, class _Find_fx>
struct _Static_partitioned_find2 {
_Static_partition_team<_Iter_diff_t<_FwdIt>> _Team;
_Static_partition_range<_FwdIt> _Basis;
_Parallel_find_results<_FwdIt> _Results;
_Find_fx _Fx;
_Static_partitioned_find2(
const size_t _Hw_threads, const _Iter_diff_t<_FwdIt> _Count, const _FwdIt _Last, const _Find_fx _Fx_)
: _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Basis{}, _Results(_Last), _Fx(_Fx_) {}
_Cancellation_status _Process_chunk() {
if (_Results._Complete()) {
return _Cancellation_status::_Canceled;
}
const auto _Key = _Team._Get_next_key();
if (!_Key) {
return _Cancellation_status::_Canceled;
}
const auto _Range = _Basis._Get_chunk(_Key);
const auto _This_find = _Fx(_Range._First, _Range._Last);
if (_This_find == _Range._Last) {
return _Cancellation_status::_Running;
}
_Results._Imbue(_Key._Chunk_number, _This_find);
return _Cancellation_status::_Canceled;
}
static void __stdcall _Threadpool_callback(
__std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ {
_Run_available_chunked_work(*static_cast<_Static_partitioned_find2*>(_Context));
}
};
template <class _ExPo, class _FwdIt, class _Find_fx>
_FwdIt _Find_parallel_unchecked(_ExPo&&, const _FwdIt _First, const _FwdIt _Last, const _Find_fx _Fx) {
// find first matching _Val, potentially in parallel
if constexpr (remove_reference_t<_ExPo>::_Parallelize) {
const size_t _Hw_threads = __std_parallel_algorithms_hw_threads();
if (_Hw_threads > 1) {
const auto _Count = _STD distance(_First, _Last);
if (_Count >= 2) {
_TRY_BEGIN
_Static_partitioned_find2 _Operation{_Hw_threads, _Count, _Last, _Fx};
_Operation._Basis._Populate(_Operation._Team, _First);
_Run_chunked_parallel_work(_Hw_threads, _Operation);
return _Operation._Results._Get_result();
_CATCH(const _Parallelism_resources_exhausted&)
// fall through to _Fx below
_CATCH_END
}
}
}
return _Fx(_First, _Last);
}
template <class _ExPo, class _FwdIt, class _Ty, _Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_NODISCARD _FwdIt find(_ExPo&& _Exec, _FwdIt _First, const _FwdIt _Last, const _Ty& _Val) noexcept /* terminates */ {
// find first matching _Val
_REQUIRE_PARALLEL_ITERATOR(_FwdIt);
using _UFwdIt = _Unwrapped_t<const _FwdIt&>;
_Adl_verify_range(_First, _Last);
_Seek_wrapped(_First,
_Find_parallel_unchecked(_STD forward<_ExPo>(_Exec), _Get_unwrapped(_First), _Get_unwrapped(_Last),
[&](const _UFwdIt _LFirst, const _UFwdIt _LLast) { return _Find_unchecked(_LFirst, _LLast, _Val); }));
return _First;
}
// PARALLEL FUNCTION TEMPLATE find_if
template <class _ExPo, class _FwdIt, class _Pr, _Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_NODISCARD _FwdIt find_if(_ExPo&& _Exec, _FwdIt _First, const _FwdIt _Last, _Pr _Pred) noexcept /* terminates */ {
// find first satisfying _Pred
_REQUIRE_PARALLEL_ITERATOR(_FwdIt);
using _UFwdIt = _Unwrapped_t<const _FwdIt&>;
_Adl_verify_range(_First, _Last);
auto _Pass_pred = _Pass_fn(_Pred);
_Seek_wrapped(_First,
_Find_parallel_unchecked(_STD forward<_ExPo>(_Exec), _Get_unwrapped(_First), _Get_unwrapped(_Last),
[=](const _UFwdIt _LFirst, const _UFwdIt _LLast) { return _STD find_if(_LFirst, _LLast, _Pass_pred); }));
return _First;
}
// PARALLEL FUNCTION TEMPLATE find_if_not
template <class _ExPo, class _FwdIt, class _Pr, _Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_NODISCARD _FwdIt find_if_not(_ExPo&& _Exec, _FwdIt _First, const _FwdIt _Last, _Pr _Pred) noexcept /* terminates */ {
// find first satisfying !_Pred
_REQUIRE_PARALLEL_ITERATOR(_FwdIt);
using _UFwdIt = _Unwrapped_t<const _FwdIt&>;
_Adl_verify_range(_First, _Last);
auto _Pass_pred = _Pass_fn(_Pred);
_Seek_wrapped(_First, _Find_parallel_unchecked(_STD forward<_ExPo>(_Exec), _Get_unwrapped(_First),
_Get_unwrapped(_Last), [=](const _UFwdIt _LFirst, const _UFwdIt _LLast) {
return _STD find_if_not(_LFirst, _LLast, _Pass_pred);
}));
return _First;
}
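// Illustrative usage (hypothetical names): all three searches funnel through
// _Find_parallel_unchecked, which runs the supplied serial search over each chunk and keeps the
// hit from the lowest-numbered (earliest) chunk.
//
//     std::vector<int> v = {5, 3, 9, 3};
//     auto it1 = std::find(std::execution::par, v.begin(), v.end(), 3);                                   // index 1
//     auto it2 = std::find_if(std::execution::par, v.begin(), v.end(), [](int x) { return x > 4; });      // index 0
//     auto it3 = std::find_if_not(std::execution::par, v.begin(), v.end(), [](int x) { return x == 5; }); // index 1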
// PARALLEL FUNCTION TEMPLATE find_end
template <class _FwdIt1, class _FwdIt2>
_Iter_diff_t<_FwdIt1> _Get_find_end_forward_partition_size(
_FwdIt1 _First1, const _FwdIt1 _Last1, _FwdIt2 _First2, const _FwdIt2 _Last2) {
// get the count of the range of possible matches in a find_end operation for forward iterators
if (_First2 == _Last2) {
return 0;
}
for (;;) {
if (_First1 == _Last1) { // haystack is shorter than needle
return 0;
}
++_First2;
if (_First2 == _Last2) { // seek complete
return _STD distance(_First1, _Last1);
}
++_First1;
}
}
template <class _FwdIt1, class _FwdIt2, class _Pr>
struct _Static_partitioned_find_end_forward {
_Static_partition_team<_Iter_diff_t<_FwdIt1>> _Team;
_Static_partition_range<_FwdIt1> _Basis;
_Iterator_range<_FwdIt2> _Range2;
_Pr _Pred;
conditional_t<_Use_atomic_iterator<_FwdIt1>, _Parallel_choose_max_result<_FwdIt1>,
_Parallel_choose_max_chunk<_FwdIt1>>
_Results;
_Static_partitioned_find_end_forward(const size_t _Hw_threads, const _Iter_diff_t<_FwdIt1> _Count,
const _FwdIt1 _Last1, const _FwdIt2 _First2, const _FwdIt2 _Last2, const _Pr _Pred_)
: _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Basis{}, _Range2{_First2, _Last2},
_Pred{_Pred_}, _Results(_Last1) {}
_Cancellation_status _Process_chunk() {
const auto _Key = _Team._Get_next_key();
if (!_Key) {
return _Cancellation_status::_Canceled;
}
const auto _Chunk_number = _Key._Chunk_number;
const auto _Range = _Basis._Get_chunk(_Key);
_FwdIt1 _Result = _Range._Last;
for (auto _First1 = _Range._First; _First1 != _Range._Last; ++_First1) {
if (_Equal_rev_pred_unchecked(_First1, _Range2._First, _Range2._Last, _Pred)) {
_Result = _First1;
}
}
if (_Result != _Range._Last) {
_Results._Imbue(_Chunk_number, _Result);
}
return _Cancellation_status::_Running;
}
static void __stdcall _Threadpool_callback(
__std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ {
(void) static_cast<_Static_partitioned_find_end_forward*>(_Context)->_Process_chunk();
}
};
template <class _BidIt1, class _FwdIt2>
_BidIt1 _Get_find_end_backward_partition_start(
const _BidIt1 _First1, _BidIt1 _Last1, _FwdIt2 _First2, const _FwdIt2 _Last2) {
// gets the end of the range of possible matches for a find_end operation
if constexpr (_Is_random_iter_v<_BidIt1> && _Is_random_iter_v<_FwdIt2>) {
using _CT = _Common_diff_t<_BidIt1, _FwdIt2>;
const _CT _Count1 = _Last1 - _First1;
const _CT _Count2 = _Last2 - _First2;
_Iter_diff_t<_BidIt1> _Count{};
if (0 < _Count2 && _Count2 <= _Count1) {
_Count = static_cast<_Iter_diff_t<_BidIt1>>(_Count1 - _Count2 + 1);
}
return _First1 + _Count;
} else {
if (_First2 == _Last2) { // always 0 matches, give up
return _First1;
}
for (;;) {
if (_First1 == _Last1) { // haystack is shorter than needle, give up
return _First1;
}
++_First2;
if (_First2 == _Last2) { // seek complete
return _Last1;
}
--_Last1;
}
}
}
template <class _BidIt1, class _FwdIt2, class _Pr>
struct _Static_partitioned_find_end_backward2 {
_Static_partition_team<_Iter_diff_t<_BidIt1>> _Team;
_Static_partition_range_backward<_BidIt1> _Basis;
conditional_t<_Use_atomic_iterator<_BidIt1>, _Parallel_choose_max_result<_BidIt1>,
_Parallel_choose_min_chunk<_BidIt1>>
_Results;
_Iterator_range<_FwdIt2> _Range2;
_Pr _Pred;
_Static_partitioned_find_end_backward2(const size_t _Hw_threads, const _Iter_diff_t<_BidIt1> _Count,
const _BidIt1 _Last1, const _FwdIt2 _First2, const _FwdIt2 _Last2, const _Pr _Pred_)
: _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Basis{},
_Results(_Last1), _Range2{_First2, _Last2}, _Pred{_Pred_} {}
_Cancellation_status _Process_chunk() {
if (_Results._Complete()) {
return _Cancellation_status::_Canceled;
}
const auto _Key = _Team._Get_next_key();
if (!_Key) {
return _Cancellation_status::_Canceled;
}
const auto _Chunk_number = _Key._Chunk_number;
const auto _Range = _Basis._Get_chunk(_Key);
auto _Last1 = _Range._Last;
do {
--_Last1;
if (_Equal_rev_pred_unchecked(_Last1, _Range2._First, _Range2._Last, _Pred)) {
_Results._Imbue(_Chunk_number, _Last1);
return _Cancellation_status::_Canceled;
}
} while (_Last1 != _Range._First);
return _Cancellation_status::_Running;
}
static void __stdcall _Threadpool_callback(
__std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ {
_Run_available_chunked_work(*static_cast<_Static_partitioned_find_end_backward2*>(_Context));
}
};
template <class _ExPo, class _FwdIt1, class _FwdIt2, class _Pr, _Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_NODISCARD _FwdIt1 find_end(_ExPo&&, _FwdIt1 _First1, const _FwdIt1 _Last1, const _FwdIt2 _First2, const _FwdIt2 _Last2,
_Pr _Pred) noexcept /* terminates */ {
// find last [_First2, _Last2) satisfying _Pred
_Adl_verify_range(_First1, _Last1);
_Adl_verify_range(_First2, _Last2);
const auto _UFirst1 = _Get_unwrapped(_First1);
const auto _ULast1 = _Get_unwrapped(_Last1);
const auto _UFirst2 = _Get_unwrapped(_First2);
const auto _ULast2 = _Get_unwrapped(_Last2);
if constexpr (remove_reference_t<_ExPo>::_Parallelize) {
const size_t _Hw_threads = __std_parallel_algorithms_hw_threads();
if (_Hw_threads > 1) {
if constexpr (_Is_bidi_iter_v<_FwdIt1>) {
const auto _Partition_start =
_Get_find_end_backward_partition_start(_UFirst1, _ULast1, _UFirst2, _ULast2);
if (_UFirst1 == _Partition_start) {
_Seek_wrapped(_First1, _ULast1);
return _First1;
}
const auto _Count = _STD distance(_UFirst1, _Partition_start);
if (_Count >= 2) {
_TRY_BEGIN
_Static_partitioned_find_end_backward2 _Operation{
_Hw_threads, _Count, _ULast1, _UFirst2, _ULast2, _Pass_fn(_Pred)};
_Operation._Basis._Populate(_Operation._Team, _Partition_start);
_Run_chunked_parallel_work(_Hw_threads, _Operation);
_Seek_wrapped(_First1, _Operation._Results._Get_result());
return _First1;
_CATCH(const _Parallelism_resources_exhausted&)
// fall through to serial case below
_CATCH_END
}
} else {
const auto _Count = _Get_find_end_forward_partition_size(_UFirst1, _ULast1, _UFirst2, _ULast2);
if (_Count >= 2) {
_TRY_BEGIN
_Static_partitioned_find_end_forward _Operation{
_Hw_threads, _Count, _ULast1, _UFirst2, _ULast2, _Pass_fn(_Pred)};
_Operation._Basis._Populate(_Operation._Team, _UFirst1);
_Run_chunked_parallel_work(_Hw_threads, _Operation);
_Seek_wrapped(_First1, _Operation._Results._Get_result());
return _First1;
_CATCH(const _Parallelism_resources_exhausted&)
// fall through to serial case below
_CATCH_END
}
}
}
}
_Seek_wrapped(_First1, _STD find_end(_UFirst1, _ULast1, _UFirst2, _ULast2, _Pass_fn(_Pred)));
return _First1;
}
// PARALLEL FUNCTION TEMPLATE find_first_of
template <class _ExPo, class _FwdIt1, class _FwdIt2, class _Pr, _Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_NODISCARD _FwdIt1 find_first_of(_ExPo&& _Exec, const _FwdIt1 _First1, _FwdIt1 _Last1, const _FwdIt2 _First2,
const _FwdIt2 _Last2, _Pr _Pred) noexcept /* terminates */ {
// look for one of [_First2, _Last2) that matches element
_REQUIRE_PARALLEL_ITERATOR(_FwdIt1);
using _UFwdIt1 = _Unwrapped_t<const _FwdIt1&>;
_Adl_verify_range(_First1, _Last1);
_Adl_verify_range(_First2, _Last2);
const auto _UFirst2 = _Get_unwrapped(_First2);
const auto _ULast2 = _Get_unwrapped(_Last2);
if (_UFirst2 == _ULast2) {
return _Last1;
}
auto _Pass_pred = _Pass_fn(_Pred);
_Seek_wrapped(_Last1, _Find_parallel_unchecked(_STD forward<_ExPo>(_Exec), _Get_unwrapped(_First1),
_Get_unwrapped(_Last1), [=](const _UFwdIt1 _LFirst1, const _UFwdIt1 _LLast1) {
return _STD find_first_of(_LFirst1, _LLast1, _UFirst2, _ULast2, _Pass_pred);
}));
return _Last1;
}
// PARALLEL FUNCTION TEMPLATE adjacent_find
template <class _FwdIt, class _Pr>
struct _Static_partitioned_adjacent_find2 {
_Static_partition_team<_Iter_diff_t<_FwdIt>> _Team;
_Static_partition_range<_FwdIt> _Basis;
_Parallel_find_results<_FwdIt> _Results;
_Pr _Pred;
_Static_partitioned_adjacent_find2(
const size_t _Hw_threads, const _Iter_diff_t<_FwdIt> _Count, const _FwdIt _Last, const _Pr _Pred_)
: _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Basis{}, _Results{_Last}, _Pred{_Pred_} {}
_Cancellation_status _Process_chunk() {
if (_Results._Complete()) {
return _Cancellation_status::_Canceled;
}
const auto _Key = _Team._Get_next_key();
if (!_Key) {
return _Cancellation_status::_Canceled;
}
const auto _Chunk_number = _Key._Chunk_number;
const auto _Range = _Basis._Get_chunk(_Key);
// tests [_First, _Last) for _Pred(*_Result, *next(_Result));
// note: intentionally dereferences _Last
auto _First = _Range._First;
for (auto _Next = _First; _First != _Range._Last; _First = _Next) {
++_Next;
if (_Pred(*_First, *_Next)) { // found match
_Results._Imbue(_Chunk_number, _First);
return _Cancellation_status::_Canceled;
}
}
return _Cancellation_status::_Running;
}
static void __stdcall _Threadpool_callback(
__std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ {
_Run_available_chunked_work(*static_cast<_Static_partitioned_adjacent_find2*>(_Context));
}
};
template <class _ExPo, class _FwdIt, class _Pr, _Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_NODISCARD _FwdIt adjacent_find(_ExPo&&, _FwdIt _First, _FwdIt _Last, _Pr _Pred) noexcept /* terminates */ {
// find first satisfying _Pred with successor
_Adl_verify_range(_First, _Last);
auto _UFirst = _Get_unwrapped(_First);
const auto _ULast = _Get_unwrapped(_Last);
if constexpr (remove_reference_t<_ExPo>::_Parallelize) {
const size_t _Hw_threads = __std_parallel_algorithms_hw_threads();
if (_Hw_threads > 1) {
const auto _Count = static_cast<_Iter_diff_t<_FwdIt>>(_STD distance(_UFirst, _ULast) - 1);
if (_Count >= 2) {
_TRY_BEGIN
_Static_partitioned_adjacent_find2 _Operation{_Hw_threads, _Count, _ULast, _Pass_fn(_Pred)};
_Operation._Basis._Populate(_Operation._Team, _UFirst);
_Run_chunked_parallel_work(_Hw_threads, _Operation);
_Seek_wrapped(_Last, _Operation._Results._Get_result());
return _Last;
_CATCH(const _Parallelism_resources_exhausted&)
// fall through to adjacent_find below
_CATCH_END
}
}
}
_Seek_wrapped(_Last, _STD adjacent_find(_UFirst, _ULast, _Pass_fn(_Pred)));
return _Last;
}
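// Caller-side usage sketch (illustrative only, not part of this header):
//   std::vector<int> v{1, 3, 3, 5};
//   auto it = std::adjacent_find(std::execution::par, v.begin(), v.end(),
//       [](int a, int b) { return a == b; });
//   // it points at the first 3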
// PARALLEL FUNCTION TEMPLATES count AND count_if
template <class _FwdIt, class _Pr>
struct _Static_partitioned_count_if2 {
using _Diff = _Iter_diff_t<_FwdIt>;
_Static_partition_team<_Diff> _Team;
_Static_partition_range<_FwdIt> _Basis;
_Pr _Pred;
atomic<_Diff> _Results;
_Static_partitioned_count_if2(const _Diff _Count, const size_t _Chunks, const _FwdIt _First, const _Pr _Pred_)
: _Team{_Count, _Chunks}, _Basis{}, _Pred{_Pred_}, _Results{} {
_Basis._Populate(_Team, _First);
}
_Diff _Process_chunks() {
_Diff _Result{};
while (const auto _Key = _Team._Get_next_key()) {
const auto _Range = _Basis._Get_chunk(_Key);
_Result += _STD count_if(_Range._First, _Range._Last, _Pred);
}
return _Result;
}
static void __stdcall _Threadpool_callback(
__std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ {
const auto _This = static_cast<_Static_partitioned_count_if2*>(_Context);
_This->_Results.fetch_add(_This->_Process_chunks(), memory_order_relaxed);
}
};
template <class _ExPo, class _FwdIt, class _Pr, _Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_NODISCARD _Iter_diff_t<_FwdIt> count_if(_ExPo&&, const _FwdIt _First, const _FwdIt _Last, _Pr _Pred) noexcept
/* terminates */ {
// count elements satisfying _Pred
_REQUIRE_PARALLEL_ITERATOR(_FwdIt);
_Adl_verify_range(_First, _Last);
auto _UFirst = _Get_unwrapped(_First);
const auto _ULast = _Get_unwrapped(_Last);
if constexpr (remove_reference_t<_ExPo>::_Parallelize) {
const size_t _Hw_threads = __std_parallel_algorithms_hw_threads();
if (_Hw_threads > 1) {
const auto _Count = _STD distance(_UFirst, _ULast);
if (_Count >= 2) {
const auto _Chunks = _Get_chunked_work_chunk_count(_Hw_threads, _Count);
_TRY_BEGIN
_Static_partitioned_count_if2 _Operation{_Count, _Chunks, _UFirst, _Pass_fn(_Pred)};
_Iter_diff_t<_FwdIt> _Foreground_count;
{
const _Work_ptr _Work{_Operation};
// setup complete, hereafter nothrow or terminate
_Work._Submit_for_chunks(_Hw_threads, _Chunks);
_Foreground_count = _Operation._Process_chunks();
} // join with _Work_ptr threads
return _Operation._Results.load(memory_order_relaxed) + _Foreground_count;
_CATCH(const _Parallelism_resources_exhausted&)
// fall through to count_if below
_CATCH_END
}
}
}
return _STD count_if(_UFirst, _ULast, _Pass_fn(_Pred));
}
template <class _ExPo, class _FwdIt, class _Ty, _Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_NODISCARD _Iter_diff_t<_FwdIt> count(_ExPo&& _Exec, const _FwdIt _First, const _FwdIt _Last, const _Ty& _Val) noexcept
/* terminates */ {
// count elements that match _Val
_REQUIRE_PARALLEL_ITERATOR(_FwdIt);
_Adl_verify_range(_First, _Last);
return _STD count_if(_STD forward<_ExPo>(_Exec), _Get_unwrapped(_First), _Get_unwrapped(_Last),
[&_Val](auto&& _Iter_val) { return _STD forward<decltype(_Iter_val)>(_Iter_val) == _Val; });
}
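// Caller-side usage sketch (illustrative only, not part of this header):
//   std::vector<int> v{1, 2, 2, 3, 2};
//   auto evens = std::count_if(std::execution::par, v.begin(), v.end(), [](int x) { return x % 2 == 0; }); // 3
//   auto twos  = std::count(std::execution::par, v.begin(), v.end(), 2);                                   // 3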
// PARALLEL FUNCTION TEMPLATE mismatch
template <class _FwdIt1, class _FwdIt2,
bool = _Use_atomic_iterator<_Unwrapped_t<const _FwdIt1&>>&& _Is_random_iter_v<_FwdIt2>,
bool = _Use_atomic_iterator<_Unwrapped_t<const _FwdIt2&>>&& _Is_random_iter_v<_FwdIt1>>
struct _Static_partitioned_mismatch_results;
template <class _FwdIt1, class _FwdIt2, bool _Unused>
struct _Static_partitioned_mismatch_results<_FwdIt1, _FwdIt2, true, _Unused> {
// atomically manipulate atomic<_FwdIt1> and calculate the second iterator by adding distance to it
_Parallel_choose_min_result<_FwdIt1> _Storage;
_Static_partitioned_mismatch_results(const _FwdIt1 _Last1, const _Unwrapped_t<const _FwdIt2&>&)
: _Storage(_Last1) {}
void _Imbue(const size_t _Chunk_number, const _FwdIt1 _First1, const _Unwrapped_t<const _FwdIt2&>&) {
_Storage._Imbue(_Chunk_number, _First1);
}
pair<_FwdIt1, _FwdIt2> _Get_result(const _FwdIt1 _First1, const _FwdIt2 _First2) const {
const _FwdIt1 _Result1 = _Storage._Get_result();
return {_Result1, _First2 + static_cast<_Iter_diff_t<_FwdIt2>>(_Result1 - _First1)};
}
};
template <class _FwdIt1, class _FwdIt2>
struct _Static_partitioned_mismatch_results<_FwdIt1, _FwdIt2, false, true> {
// atomically manipulate atomic<_FwdIt2> and calculate the first iterator by adding distance to it
_Parallel_choose_min_result<_FwdIt2> _Storage;
_Static_partitioned_mismatch_results(const _Unwrapped_t<const _FwdIt1&>&, const _FwdIt2 _Last2)
: _Storage(_Last2) {}
void _Imbue(const size_t _Chunk_number, const _Unwrapped_t<const _FwdIt1&>&, const _FwdIt2 _First2) {
_Storage._Imbue(_Chunk_number, _First2);
}
pair<_FwdIt1, _FwdIt2> _Get_result(const _FwdIt1 _First1, const _FwdIt2 _First2) const {
const _FwdIt2 _Result2 = _Storage._Get_result();
return {_First1 + static_cast<_Iter_diff_t<_FwdIt1>>(_Result2 - _First2), _Result2};
}
};
template <class _FwdIt1, class _FwdIt2>
struct _Static_partitioned_mismatch_results<_FwdIt1, _FwdIt2, false, false> {
// get both iterators by manipulating them under lock
using _UFwdIt1 = _Unwrapped_t<const _FwdIt1&>;
using _UFwdIt2 = _Unwrapped_t<const _FwdIt2&>;
_Parallel_choose_min_chunk<pair<_UFwdIt1, _UFwdIt2>> _Storage;
_Static_partitioned_mismatch_results(const _UFwdIt1 _Last1, const _UFwdIt2 _Last2) : _Storage({_Last1, _Last2}) {}
void _Imbue(const size_t _Chunk_number, const _UFwdIt1 _First1, const _UFwdIt2 _First2) {
_Storage._Imbue(_Chunk_number, {_First1, _First2});
}
pair<_FwdIt1, _FwdIt2> _Get_result(_FwdIt1 _First1, _FwdIt2 _First2) const {
const auto _Result = _Storage._Get_result();
_Seek_wrapped(_First2, _Result.second);
_Seek_wrapped(_First1, _Result.first);
return {_First1, _First2};
}
};
template <class _FwdIt1, class _FwdIt2, class _Pr>
struct _Static_partitioned_mismatch2 {
using _Diff = _Common_diff_t<_FwdIt1, _FwdIt2>;
_Static_partition_team<_Diff> _Team;
_Static_partition_range<_FwdIt1, _Diff> _Basis1;
_Static_partition_range<_FwdIt2, _Diff> _Basis2;
_Static_partitioned_mismatch_results<_FwdIt1, _FwdIt2> _Results;
_Pr _Pred;
_Static_partitioned_mismatch2(
const size_t _Hw_threads, const _Diff _Count, const _FwdIt1 _First1, const _FwdIt2 _First2, const _Pr _Pred_)
: _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Basis1{}, _Basis2{},
_Results(
_Get_unwrapped(_Basis1._Populate(_Team, _First1)), _Get_unwrapped(_Basis2._Populate(_Team, _First2))),
_Pred(_Pred_) {}
_Cancellation_status _Process_chunk() {
if (_Results._Storage._Complete()) {
return _Cancellation_status::_Canceled;
}
const auto _Key = _Team._Get_next_key();
if (!_Key) {
return _Cancellation_status::_Canceled;
}
const auto _Chunk_number = _Key._Chunk_number;
const auto _Range1 = _Basis1._Get_chunk(_Key);
auto _First1 = _Range1._First;
auto _First2 = _Basis2._Get_chunk(_Key)._First;
for (;;) {
if (_First1 == _Range1._Last) {
return _Cancellation_status::_Running;
}
if (!_Pred(*_First1, *_First2)) {
_Results._Imbue(_Chunk_number, _First1, _First2);
return _Cancellation_status::_Canceled;
}
++_First1;
++_First2;
}
}
static void __stdcall _Threadpool_callback(
__std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ {
_Run_available_chunked_work(*static_cast<_Static_partitioned_mismatch2*>(_Context));
}
};
#pragma warning(push)
#pragma warning(disable : 4868) // compiler may not enforce left-to-right evaluation order
// in braced initializer list (/Wall)
template <class _ExPo, class _FwdIt1, class _FwdIt2, class _Pr, _Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_NODISCARD pair<_FwdIt1, _FwdIt2> mismatch(
_ExPo&&, _FwdIt1 _First1, _FwdIt1 _Last1, _FwdIt2 _First2, _Pr _Pred) noexcept /* terminates */ {
// return [_First1, _Last1)/[_First2, ...) mismatch using _Pred
_REQUIRE_PARALLEL_ITERATOR(_FwdIt1);
_REQUIRE_PARALLEL_ITERATOR(_FwdIt2);
_Adl_verify_range(_First1, _Last1);
const auto _UFirst1 = _Get_unwrapped(_First1);
const auto _ULast1 = _Get_unwrapped(_Last1);
if constexpr (remove_reference_t<_ExPo>::_Parallelize) {
const size_t _Hw_threads = __std_parallel_algorithms_hw_threads();
if (_Hw_threads > 1) {
const auto _Count = _STD distance(_UFirst1, _ULast1);
const auto _UFirst2 = _Get_unwrapped_n(_First2, _Count);
if (_Count >= 2) {
_TRY_BEGIN
_Static_partitioned_mismatch2 _Operation{_Hw_threads, _Count, _UFirst1, _UFirst2, _Pass_fn(_Pred)};
_Run_chunked_parallel_work(_Hw_threads, _Operation);
const auto _Result = _Operation._Results._Get_result(_UFirst1, _UFirst2);
_Seek_wrapped(_First2, _Result.second);
_Seek_wrapped(_First1, _Result.first);
return {_First1, _First2};
_CATCH(const _Parallelism_resources_exhausted&)
// fall through to serial case below
_CATCH_END
}
const auto _Result = _STD mismatch(_UFirst1, _ULast1, _UFirst2, _Pass_fn(_Pred));
_Seek_wrapped(_First2, _Result.second);
_Seek_wrapped(_First1, _Result.first);
return {_First1, _First2};
}
}
const auto _Result = _STD mismatch(
_UFirst1, _ULast1, _Get_unwrapped_n(_First2, _Idl_distance<_FwdIt1>(_UFirst1, _ULast1)), _Pass_fn(_Pred));
_Seek_wrapped(_First2, _Result.second);
_Seek_wrapped(_First1, _Result.first);
return {_First1, _First2};
}
#pragma warning(pop)
template <class _ExPo, class _FwdIt1, class _FwdIt2, class _Pr, _Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_NODISCARD pair<_FwdIt1, _FwdIt2> mismatch(
_ExPo&&, _FwdIt1 _First1, _FwdIt1 _Last1, _FwdIt2 _First2, _FwdIt2 _Last2, _Pr _Pred) noexcept /* terminates */ {
// return [_First1, _Last1)/[_First2, _Last2) mismatch using _Pred
_REQUIRE_PARALLEL_ITERATOR(_FwdIt1);
_REQUIRE_PARALLEL_ITERATOR(_FwdIt2);
_Adl_verify_range(_First1, _Last1);
_Adl_verify_range(_First2, _Last2);
const auto _UFirst1 = _Get_unwrapped(_First1);
const auto _ULast1 = _Get_unwrapped(_Last1);
const auto _UFirst2 = _Get_unwrapped(_First2);
const auto _ULast2 = _Get_unwrapped(_Last2);
if constexpr (remove_reference_t<_ExPo>::_Parallelize) {
const size_t _Hw_threads = __std_parallel_algorithms_hw_threads();
if (_Hw_threads > 1) {
const auto _Count = static_cast<_Iter_diff_t<_FwdIt1>>(_Distance_min(_UFirst1, _ULast1, _UFirst2, _ULast2));
if (_Count >= 2) {
_TRY_BEGIN
_Static_partitioned_mismatch2 _Operation{_Hw_threads, _Count, _UFirst1, _UFirst2, _Pass_fn(_Pred)};
_Run_chunked_parallel_work(_Hw_threads, _Operation);
const auto _Result = _Operation._Results._Get_result(_UFirst1, _UFirst2);
_Seek_wrapped(_First2, _Result.second);
_Seek_wrapped(_First1, _Result.first);
return {_First1, _First2};
_CATCH(const _Parallelism_resources_exhausted&)
// fall through to serial below
_CATCH_END
}
}
}
const auto _Result = _STD mismatch(_UFirst1, _ULast1, _UFirst2, _ULast2, _Pass_fn(_Pred));
_Seek_wrapped(_First2, _Result.second);
_Seek_wrapped(_First1, _Result.first);
return {_First1, _First2};
}
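// Caller-side usage sketch (illustrative only, not part of this header):
//   std::vector<int> a{1, 2, 3, 4};
//   std::vector<int> b{1, 2, 9, 4};
//   auto [ita, itb] = std::mismatch(std::execution::par, a.begin(), a.end(), b.begin(), b.end());
//   // *ita == 3 and *itb == 9, the first position where the ranges differ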
// PARALLEL FUNCTION TEMPLATE equal
template <class _FwdIt1, class _FwdIt2, class _Pr>
struct _Static_partitioned_equal2 {
using _Diff = _Common_diff_t<_FwdIt1, _FwdIt2>;
_Static_partition_team<_Diff> _Team;
_Static_partition_range<_FwdIt1, _Diff> _Basis1;
_Static_partition_range<_FwdIt2, _Diff> _Basis2;
_Pr _Pred;
_Cancellation_token _Cancel_token;
_Static_partitioned_equal2(const size_t _Hw_threads, const _Diff _Count, _Pr _Pred_, const _FwdIt1&, const _FwdIt2&)
: _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Basis1{}, _Basis2{},
_Pred(_Pred_), _Cancel_token{} {}
_Cancellation_status _Process_chunk() {
if (_Cancel_token._Is_canceled()) {
return _Cancellation_status::_Canceled;
}
const auto _Key = _Team._Get_next_key();
if (!_Key) {
return _Cancellation_status::_Canceled;
}
const auto _Range1 = _Basis1._Get_chunk(_Key);
const auto _Range2_first = _Basis2._Get_chunk(_Key)._First;
if (_STD equal(_Range1._First, _Range1._Last, _Range2_first, _Pred)) {
return _Cancellation_status::_Running;
}
_Cancel_token._Cancel();
return _Cancellation_status::_Canceled;
}
static void __stdcall _Threadpool_callback(
__std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ {
_Run_available_chunked_work(*static_cast<_Static_partitioned_equal2*>(_Context));
}
};
template <class _ExPo, class _FwdIt1, class _FwdIt2, class _Pr, _Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_NODISCARD bool equal(_ExPo&&, const _FwdIt1 _First1, const _FwdIt1 _Last1, const _FwdIt2 _First2, _Pr _Pred) noexcept
/* terminates */ {
// compare [_First1, _Last1) to [_First2, ...) using _Pred
_REQUIRE_PARALLEL_ITERATOR(_FwdIt1);
_REQUIRE_PARALLEL_ITERATOR(_FwdIt2);
_Adl_verify_range(_First1, _Last1);
const auto _UFirst1 = _Get_unwrapped(_First1);
const auto _ULast1 = _Get_unwrapped(_Last1);
if constexpr (remove_reference_t<_ExPo>::_Parallelize) {
const size_t _Hw_threads = __std_parallel_algorithms_hw_threads();
if (_Hw_threads > 1) {
const auto _Count = _STD distance(_UFirst1, _ULast1);
const auto _UFirst2 = _Get_unwrapped_n(_First2, _Count);
if (_Count >= 2) {
_TRY_BEGIN
_Static_partitioned_equal2 _Operation{_Hw_threads, _Count, _Pass_fn(_Pred), _UFirst1, _UFirst2};
_Operation._Basis1._Populate(_Operation._Team, _UFirst1);
_Operation._Basis2._Populate(_Operation._Team, _UFirst2);
_Run_chunked_parallel_work(_Hw_threads, _Operation);
return !_Operation._Cancel_token._Is_canceled_relaxed();
_CATCH(const _Parallelism_resources_exhausted&)
// fall through to serial case below
_CATCH_END
}
return _STD equal(_UFirst1, _ULast1, _UFirst2, _Pass_fn(_Pred));
}
}
    return _STD equal(
        _UFirst1, _ULast1, _Get_unwrapped_n(_First2, _Idl_distance<_FwdIt1>(_UFirst1, _ULast1)), _Pass_fn(_Pred));
}
template <class _ExPo, class _FwdIt1, class _FwdIt2, class _Pr, _Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_NODISCARD bool equal(_ExPo&&, const _FwdIt1 _First1, const _FwdIt1 _Last1, const _FwdIt2 _First2, const _FwdIt2 _Last2,
_Pr _Pred) noexcept /* terminates */ {
// compare [_First1, _Last1) to [_First2, _Last2) using _Pred
_REQUIRE_PARALLEL_ITERATOR(_FwdIt1);
_REQUIRE_PARALLEL_ITERATOR(_FwdIt2);
_Adl_verify_range(_First1, _Last1);
_Adl_verify_range(_First2, _Last2);
const auto _UFirst1 = _Get_unwrapped(_First1);
const auto _ULast1 = _Get_unwrapped(_Last1);
const auto _UFirst2 = _Get_unwrapped(_First2);
const auto _ULast2 = _Get_unwrapped(_Last2);
if constexpr (remove_reference_t<_ExPo>::_Parallelize) {
const size_t _Hw_threads = __std_parallel_algorithms_hw_threads();
if (_Hw_threads > 1) {
const auto _Count = _Distance_any(_UFirst1, _ULast1, _UFirst2, _ULast2);
if (_Count >= 2) {
_TRY_BEGIN
_Static_partitioned_equal2 _Operation{_Hw_threads, _Count, _Pass_fn(_Pred), _UFirst1, _UFirst2};
if (!_Operation._Basis1._Populate(_Operation._Team, _UFirst1, _ULast1)) {
// left sequence didn't have length _Count
return false;
}
if (!_Operation._Basis2._Populate(_Operation._Team, _UFirst2, _ULast2)) {
// right sequence didn't have length _Count
return false;
}
_Run_chunked_parallel_work(_Hw_threads, _Operation);
return !_Operation._Cancel_token._Is_canceled_relaxed();
_CATCH(const _Parallelism_resources_exhausted&)
// fall through to equal below
_CATCH_END
}
}
}
return _STD equal(_UFirst1, _ULast1, _UFirst2, _ULast2, _Pass_fn(_Pred));
}
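// Caller-side usage sketch (illustrative only, not part of this header):
//   std::vector<int> a{1, 2, 3};
//   std::vector<int> b{1, 2, 3};
//   bool same = std::equal(std::execution::par, a.begin(), a.end(), b.begin(), b.end()); // true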
// PARALLEL FUNCTION TEMPLATE search
template <class _FwdItHaystack, class _FwdItPat, class _Pr>
struct _Static_partitioned_search2 {
_Static_partition_team<_Iter_diff_t<_FwdItHaystack>> _Team;
_Static_partition_range<_FwdItHaystack> _Basis;
_Parallel_find_results<_FwdItHaystack> _Results;
_FwdItPat _First2;
_FwdItPat _Last2;
_Pr _Pred;
_Static_partitioned_search2(const size_t _Hw_threads, const _Iter_diff_t<_FwdItHaystack> _Count,
const _FwdItHaystack _First1, const _FwdItHaystack _Last1, const _FwdItPat _First2_, const _FwdItPat _Last2_,
_Pr _Pred_)
: _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Basis{}, _Results(_Last1),
_First2(_First2_), _Last2(_Last2_), _Pred(_Pred_) {
_Basis._Populate(_Team, _First1);
}
_Cancellation_status _Process_chunk() {
if (_Results._Complete()) {
return _Cancellation_status::_Canceled;
}
const auto _Key = _Team._Get_next_key();
if (!_Key) {
return _Cancellation_status::_Canceled;
}
const auto _Range = _Basis._Get_chunk(_Key);
for (auto _Candidate = _Range._First; _Candidate != _Range._Last; ++_Candidate) {
if (_Equal_rev_pred_unchecked(_Candidate, _First2, _Last2, _Pred)) {
_Results._Imbue(_Key._Chunk_number, _Candidate);
return _Cancellation_status::_Canceled;
}
}
return _Cancellation_status::_Running;
}
static void __stdcall _Threadpool_callback(
__std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ {
_Run_available_chunked_work(*static_cast<_Static_partitioned_search2*>(_Context));
}
};
template <class _ExPo, class _FwdItHaystack, class _FwdItPat, class _Pr, _Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_NODISCARD _FwdItHaystack search(_ExPo&&, const _FwdItHaystack _First1, _FwdItHaystack _Last1, const _FwdItPat _First2,
const _FwdItPat _Last2, _Pr _Pred) noexcept /* terminates */ {
// find first [_First2, _Last2) match
_Adl_verify_range(_First2, _Last2);
const auto _UFirst2 = _Get_unwrapped(_First2);
const auto _ULast2 = _Get_unwrapped(_Last2);
if (_UFirst2 == _ULast2) {
return _First1;
}
_Adl_verify_range(_First1, _Last1);
const auto _UFirst1 = _Get_unwrapped(_First1);
const auto _ULast1 = _Get_unwrapped(_Last1);
if constexpr (remove_reference_t<_ExPo>::_Parallelize) {
const size_t _Hw_threads = __std_parallel_algorithms_hw_threads();
if (_Hw_threads > 1) {
_Iter_diff_t<_FwdItHaystack> _Count;
if constexpr (_Is_random_iter_v<_FwdItHaystack> && _Is_random_iter_v<_FwdItPat>) {
const auto _HaystackDist = _ULast1 - _UFirst1;
const auto _NeedleDist = _ULast2 - _UFirst2;
if (_NeedleDist > _HaystackDist) { // needle is longer than haystack, no match possible
return _Last1;
}
// +1 can't overflow because _NeedleDist > 0 because _First2 != _Last2
_Count = static_cast<_Iter_diff_t<_FwdItHaystack>>(_HaystackDist - _NeedleDist + 1);
} else {
auto _UFirst1c = _UFirst1;
auto _UFirst2c = _UFirst2;
for (;;) {
if (_UFirst1c == _ULast1) {
// either haystack was shorter than needle so no match is possible, or haystack was empty so
// returning _Last1 is the same as returning _First1
return _Last1;
}
++_UFirst2c;
if (_UFirst2c == _ULast2) {
_Count = _STD distance(_UFirst1c, _ULast1);
break;
}
++_UFirst1c;
}
}
if (_Count == 1) {
// 1 match possible == ranges are of equal length -- do better than serial search because we've done the
// work to calculate distance() of the inputs
if (_Equal_rev_pred_unchecked(_UFirst1, _UFirst2, _ULast2, _Pass_fn(_Pred))) {
return _First1;
}
return _Last1;
}
_TRY_BEGIN
_Static_partitioned_search2 _Operation{
_Hw_threads, _Count, _UFirst1, _ULast1, _UFirst2, _ULast2, _Pass_fn(_Pred)};
_Run_chunked_parallel_work(_Hw_threads, _Operation);
_Seek_wrapped(_Last1, _Operation._Results._Get_result());
return _Last1;
_CATCH(const _Parallelism_resources_exhausted&)
// fall through to search, below
_CATCH_END
}
}
_Seek_wrapped(_Last1, _STD search(_UFirst1, _ULast1, _UFirst2, _ULast2, _Pass_fn(_Pred)));
return _Last1;
}
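// Caller-side usage sketch (illustrative only, not part of this header); assumes <execution> and <string> are included:
//   const std::string haystack = "find the needle in the haystack";
//   const std::string needle = "needle";
//   auto it = std::search(std::execution::par, haystack.begin(), haystack.end(), needle.begin(), needle.end());
//   // it points at the 'n' that begins "needle"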
// PARALLEL FUNCTION TEMPLATE search_n
template <class _FwdIt, class _Ty, class _Pr>
struct _Static_partitioned_search_n2 {
_Static_partition_team<_Iter_diff_t<_FwdIt>> _Team;
_Static_partition_range<_FwdIt> _Basis;
_Parallel_find_results<_FwdIt> _Results;
_Iter_diff_t<_FwdIt> _Target_count;
const _Ty& _Val;
_Pr _Pred;
_Static_partitioned_search_n2(const size_t _Hw_threads, const _Iter_diff_t<_FwdIt> _Candidates, const _FwdIt _First,
const _FwdIt _Last, const _Iter_diff_t<_FwdIt> _Target_count_, const _Ty& _Val_, _Pr _Pred_)
: _Team{_Candidates, _Get_chunked_work_chunk_count(_Hw_threads, _Candidates)}, _Basis{}, _Results(_Last),
_Target_count(_Target_count_), _Val(_Val_), _Pred(_Pred_) {
_Basis._Populate(_Team, _First);
}
_Cancellation_status _Process_chunk() {
if (_Results._Complete()) {
return _Cancellation_status::_Canceled;
}
const auto _Key = _Team._Get_next_key();
if (!_Key) {
return _Cancellation_status::_Canceled;
}
const auto _Range = _Basis._Get_chunk(_Key);
// any match in this chunk will have at least 1 element in _Range, so the furthest off the
// end of the candidate range we will need to search is _Target_count - 1
const _Iter_diff_t<_FwdIt> _Overshoot = _Target_count - 1;
const auto _Serial_last = _STD next(_Range._Last, _Overshoot);
        // we delegate to the serial algorithm over [_Range._First, _Range._Last + _Overshoot), which is safe
        // because only the _Candidates possible starting positions were partitioned, so the overshoot stays
        // inside the original input; any match must begin within [_Range._First, _Range._Last), and if there
        // is no match the serial algorithm returns _Serial_last
const auto _Candidate = _STD search_n(_Range._First, _Serial_last, _Target_count, _Val, _Pred);
if (_Candidate == _Serial_last) {
return _Cancellation_status::_Running;
}
_Results._Imbue(_Key._Chunk_number, _Candidate);
return _Cancellation_status::_Canceled;
}
static void __stdcall _Threadpool_callback(
__std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ {
_Run_available_chunked_work(*static_cast<_Static_partitioned_search_n2*>(_Context));
}
};
template <class _ExPo, class _FwdIt, class _Diff, class _Ty, class _Pr, _Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_NODISCARD _FwdIt search_n(_ExPo&&, const _FwdIt _First, _FwdIt _Last, const _Diff _Count_raw, const _Ty& _Val,
_Pr _Pred) noexcept /* terminates */ {
// find first _Count * _Val satisfying _Pred
const _Algorithm_int_t<_Diff> _Count = _Count_raw;
if (_Count <= 0) {
_Last = _First;
return _Last;
}
if (static_cast<uintmax_t>(_Count) > static_cast<uintmax_t>((numeric_limits<_Iter_diff_t<_FwdIt>>::max)())) {
// if the number of _Vals searched for is larger than the longest possible sequence, we can't find it
return _Last;
}
_Adl_verify_range(_First, _Last);
auto _UFirst = _Get_unwrapped(_First);
const auto _ULast = _Get_unwrapped(_Last);
if constexpr (remove_reference_t<_ExPo>::_Parallelize) {
const size_t _Hw_threads = __std_parallel_algorithms_hw_threads();
if (_Hw_threads > 1) {
const auto _Haystack_count = _STD distance(_UFirst, _ULast);
if (_Count > _Haystack_count) {
return _Last;
}
// +1 can't overflow because _Count > 0
const auto _Candidates = static_cast<_Iter_diff_t<_FwdIt>>(_Haystack_count - _Count + 1);
_TRY_BEGIN
_Static_partitioned_search_n2 _Operation{_Hw_threads, _Candidates, _UFirst, _ULast,
static_cast<_Iter_diff_t<_FwdIt>>(_Count), _Val, _Pass_fn(_Pred)};
_Run_chunked_parallel_work(_Hw_threads, _Operation);
_Seek_wrapped(_Last, _Operation._Results._Get_result());
return _Last;
_CATCH(const _Parallelism_resources_exhausted&)
// fall through to search_n, below
_CATCH_END
}
}
_Seek_wrapped(_Last, _STD search_n(_UFirst, _ULast, _Count, _Val, _Pass_fn(_Pred)));
return _Last;
}
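// Caller-side usage sketch (illustrative only, not part of this header):
//   std::vector<int> v{1, 0, 0, 0, 2};
//   auto it = std::search_n(std::execution::par, v.begin(), v.end(), 3, 0);
//   // it points at the first 0, the start of the run of three 0s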
// PARALLEL FUNCTION TEMPLATE transform
template <class _FwdIt1, class _FwdIt2, class _Fn>
_FwdIt2 _Transform_ivdep(_FwdIt1 _First, const _FwdIt1 _Last, _FwdIt2 _Dest, _Fn _Func) {
// unary op transform with independent loop bodies
#pragma loop(ivdep)
for (; _First != _Last; ++_First, (void) ++_Dest) {
*_Dest = _Func(*_First);
}
return _Dest;
}
template <class _FwdIt1, class _FwdIt2, class _Fn>
struct _Static_partitioned_unary_transform2 {
using _Diff = _Common_diff_t<_FwdIt1, _FwdIt2>;
_Static_partition_team<_Diff> _Team;
_Static_partition_range<_FwdIt1, _Diff> _Source_basis;
_Static_partition_range<_FwdIt2, _Diff> _Dest_basis;
_Fn _Func;
_Static_partitioned_unary_transform2(
const size_t _Hw_threads, const _Diff _Count, const _FwdIt1 _First, _Fn _Fx, const _FwdIt2&)
: _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Source_basis{}, _Dest_basis{},
_Func(_Fx) {
_Source_basis._Populate(_Team, _First);
}
_Cancellation_status _Process_chunk() {
const auto _Key = _Team._Get_next_key();
if (_Key) {
const auto _Source = _Source_basis._Get_chunk(_Key);
_Transform_ivdep(_Source._First, _Source._Last, _Dest_basis._Get_chunk(_Key)._First, _Func);
return _Cancellation_status::_Running;
}
return _Cancellation_status::_Canceled;
}
static void __stdcall _Threadpool_callback(
__std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ {
_Run_available_chunked_work(*static_cast<_Static_partitioned_unary_transform2*>(_Context));
}
};
template <class _ExPo, class _FwdIt1, class _FwdIt2, class _Fn, _Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_FwdIt2 transform(_ExPo&&, const _FwdIt1 _First, const _FwdIt1 _Last, _FwdIt2 _Dest, _Fn _Func) noexcept
/* terminates */ {
// transform [_First, _Last) with _Func
_REQUIRE_PARALLEL_ITERATOR(_FwdIt1);
_REQUIRE_PARALLEL_ITERATOR(_FwdIt2);
_Adl_verify_range(_First, _Last);
auto _UFirst = _Get_unwrapped(_First);
const auto _ULast = _Get_unwrapped(_Last);
if constexpr (remove_reference_t<_ExPo>::_Parallelize) {
const size_t _Hw_threads = __std_parallel_algorithms_hw_threads();
if (_Hw_threads > 1) { // parallelize on multiprocessor machines...
const auto _Count = _STD distance(_UFirst, _ULast);
const auto _UDest = _Get_unwrapped_n(_Dest, _Count);
if (_Count >= 2) { // ... with at least 2 elements
_TRY_BEGIN
_Static_partitioned_unary_transform2 _Operation{_Hw_threads, _Count, _UFirst, _Pass_fn(_Func), _UDest};
_Seek_wrapped(_Dest, _Operation._Dest_basis._Populate(_Operation._Team, _UDest));
_Run_chunked_parallel_work(_Hw_threads, _Operation);
return _Dest;
_CATCH(const _Parallelism_resources_exhausted&)
// fall through to serial case below
_CATCH_END
}
_Seek_wrapped(_Dest, _Transform_ivdep(_UFirst, _ULast, _UDest, _Pass_fn(_Func)));
return _Dest;
} else {
_Seek_wrapped(
_Dest, _Transform_ivdep(_UFirst, _ULast,
_Get_unwrapped_n(_Dest, _Idl_distance<_FwdIt1>(_UFirst, _ULast)), _Pass_fn(_Func)));
return _Dest;
}
} else {
_Seek_wrapped(_Dest, _STD transform(_UFirst, _ULast,
_Get_unwrapped_n(_Dest, _Idl_distance<_FwdIt1>(_UFirst, _ULast)), _Pass_fn(_Func)));
return _Dest;
}
}
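// Caller-side usage sketch (illustrative only, not part of this header):
//   std::vector<int> in{1, 2, 3};
//   std::vector<int> out(in.size());
//   std::transform(std::execution::par, in.begin(), in.end(), out.begin(), [](int x) { return x * x; });
//   // out == {1, 4, 9}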
template <class _FwdIt1, class _FwdIt2, class _FwdIt3, class _Fn>
_FwdIt3 _Transform_ivdep(_FwdIt1 _First1, const _FwdIt1 _Last1, _FwdIt2 _First2, _FwdIt3 _Dest, _Fn _Func) {
// binary op transform with independent loop bodies
#pragma loop(ivdep)
for (; _First1 != _Last1; ++_First1, (void) ++_First2, ++_Dest) {
*_Dest = _Func(*_First1, *_First2);
}
return _Dest;
}
template <class _FwdIt1, class _FwdIt2, class _FwdIt3, class _Fn>
struct _Static_partitioned_binary_transform2 {
using _Diff = _Common_diff_t<_FwdIt1, _FwdIt2, _FwdIt3>;
_Static_partition_team<_Diff> _Team;
_Static_partition_range<_FwdIt1, _Diff> _Source1_basis;
_Static_partition_range<_FwdIt2, _Diff> _Source2_basis;
_Static_partition_range<_FwdIt3, _Diff> _Dest_basis;
_Fn _Func;
_Static_partitioned_binary_transform2(
const size_t _Hw_threads, const _Diff _Count, _FwdIt1 _First1, _FwdIt2 _First2, _Fn _Fx, const _FwdIt3&)
: _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Source1_basis{}, _Source2_basis{},
_Dest_basis{}, _Func(_Fx) {
_Source1_basis._Populate(_Team, _First1);
_Source2_basis._Populate(_Team, _First2);
}
_Cancellation_status _Process_chunk() {
const auto _Key = _Team._Get_next_key();
if (_Key) {
const auto _Source1 = _Source1_basis._Get_chunk(_Key);
_Transform_ivdep(_Source1._First, _Source1._Last, _Source2_basis._Get_chunk(_Key)._First,
_Dest_basis._Get_chunk(_Key)._First, _Func);
return _Cancellation_status::_Running;
}
return _Cancellation_status::_Canceled;
}
static void __stdcall _Threadpool_callback(
__std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ {
_Run_available_chunked_work(*static_cast<_Static_partitioned_binary_transform2*>(_Context));
}
};
#pragma warning(push)
#pragma warning(disable : 4868) // compiler may not enforce left-to-right evaluation order
// in braced initializer list (/Wall)
template <class _ExPo, class _FwdIt1, class _FwdIt2, class _FwdIt3, class _Fn,
_Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_FwdIt3 transform(_ExPo&&, const _FwdIt1 _First1, const _FwdIt1 _Last1, const _FwdIt2 _First2, _FwdIt3 _Dest,
_Fn _Func) noexcept /* terminates */ {
// transform [_First1, _Last1) and [_First2, ...) with _Func
_REQUIRE_PARALLEL_ITERATOR(_FwdIt1);
_REQUIRE_PARALLEL_ITERATOR(_FwdIt2);
_REQUIRE_PARALLEL_ITERATOR(_FwdIt3);
_Adl_verify_range(_First1, _Last1);
const auto _UFirst1 = _Get_unwrapped(_First1);
const auto _ULast1 = _Get_unwrapped(_Last1);
if constexpr (remove_reference_t<_ExPo>::_Parallelize) {
const size_t _Hw_threads = __std_parallel_algorithms_hw_threads();
if (_Hw_threads > 1) { // parallelize on multiprocessor machines...
const auto _Count = _STD distance(_UFirst1, _ULast1);
const auto _UFirst2 = _Get_unwrapped_n(_First2, _Count);
const auto _UDest = _Get_unwrapped_n(_Dest, _Count);
if (_Count >= 2) { // ... with at least 2 elements
_TRY_BEGIN
_Static_partitioned_binary_transform2 _Operation{
_Hw_threads, _Count, _UFirst1, _UFirst2, _Pass_fn(_Func), _UDest};
_Seek_wrapped(_Dest, _Operation._Dest_basis._Populate(_Operation._Team, _UDest));
_Run_chunked_parallel_work(_Hw_threads, _Operation);
return _Dest;
_CATCH(const _Parallelism_resources_exhausted&)
// fall through to serial case below
_CATCH_END
}
_Seek_wrapped(_Dest, _Transform_ivdep(_UFirst1, _ULast1, _UFirst2, _UDest, _Pass_fn(_Func)));
return _Dest;
} else {
const auto _Count = _Idl_distance<_FwdIt1>(_UFirst1, _ULast1);
_Seek_wrapped(_Dest, _Transform_ivdep(_UFirst1, _ULast1, _Get_unwrapped_n(_First2, _Count),
_Get_unwrapped_n(_Dest, _Count), _Pass_fn(_Func)));
return _Dest;
}
} else {
const auto _Count = _Idl_distance<_FwdIt1>(_UFirst1, _ULast1);
_Seek_wrapped(_Dest, _STD transform(_UFirst1, _ULast1, _Get_unwrapped_n(_First2, _Count),
_Get_unwrapped_n(_Dest, _Count), _Pass_fn(_Func)));
return _Dest;
}
}
#pragma warning(pop)
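// Caller-side usage sketch (illustrative only, not part of this header):
//   std::vector<int> a{1, 2, 3};
//   std::vector<int> b{10, 20, 30};
//   std::vector<int> sums(a.size());
//   std::transform(std::execution::par, a.begin(), a.end(), b.begin(), sums.begin(),
//       [](int x, int y) { return x + y; });
//   // sums == {11, 22, 33}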
// PARALLEL FUNCTION TEMPLATE replace
template <class _ExPo, class _FwdIt, class _Ty, _Enable_if_execution_policy_t<_ExPo> /* = 0 */>
void replace(_ExPo&& _Exec, const _FwdIt _First, const _FwdIt _Last, const _Ty& _Oldval, const _Ty& _Newval) noexcept
/* terminates */ {
// replace each matching _Oldval with _Newval
_REQUIRE_PARALLEL_ITERATOR(_FwdIt);
_STD for_each(_STD forward<_ExPo>(_Exec), _First, _Last, [&](auto&& _Value) {
if (_STD forward<decltype(_Value)>(_Value) == _Oldval) {
_STD forward<decltype(_Value)>(_Value) = _Newval;
}
});
}
// PARALLEL FUNCTION TEMPLATE replace_if
template <class _ExPo, class _FwdIt, class _Pr, class _Ty, _Enable_if_execution_policy_t<_ExPo> /* = 0 */>
void replace_if(_ExPo&& _Exec, const _FwdIt _First, const _FwdIt _Last, _Pr _Pred, const _Ty& _Val) noexcept
/* terminates */ {
// replace each satisfying _Pred with _Val
_REQUIRE_PARALLEL_ITERATOR(_FwdIt);
_STD for_each(
_STD forward<_ExPo>(_Exec), _First, _Last, [&_Val, _Lambda_pred = _Pass_fn(_Pred)](auto&& _Value) mutable {
if (_Lambda_pred(_STD forward<decltype(_Value)>(_Value))) {
_STD forward<decltype(_Value)>(_Value) = _Val;
}
});
}
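// Caller-side usage sketch (illustrative only, not part of this header):
//   std::vector<int> v{1, -2, 3, -4};
//   std::replace_if(std::execution::par, v.begin(), v.end(), [](int x) { return x < 0; }, 0);
//   // v == {1, 0, 3, 0}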
// PARALLEL FUNCTION TEMPLATES remove AND remove_if
template <class _FwdIt, class _Pr>
_FwdIt _Remove_move_if_unchecked(_FwdIt _First, const _FwdIt _Last, _FwdIt _Dest, _Pr _Pred) {
// move omitting each element satisfying _Pred
for (; _First != _Last; ++_First) {
if (!_Pred(*_First)) {
*_Dest = _STD move(*_First);
++_Dest;
}
}
return _Dest;
}
template <class _FwdIt, class _Pr>
struct _Static_partitioned_remove_if2 {
enum class _Chunk_state : unsigned char {
_Serial, // while a chunk is in the serial state, it is touched only by an owner thread
_Merging, // while a chunk is in the merging state, threads all try to CAS the chunk _Merging -> _Moving
// the thread that succeeds takes responsibility for moving the keepers from that chunk to the
// results
_Moving, // while a chunk is in the moving state, the keepers are being moved to _Results
// only one chunk at a time is ever _Moving; this also serves to synchronize access to _Results
_Done // when a chunk becomes _Done, it is complete / will never need to touch _Results again
// as an optimization, if a thread sees that it has no predecessor (or its predecessor is _Done), it
// may transition from _Serial directly to _Done, doing the moving step implicitly.
};
#pragma warning(push)
#pragma warning(disable : 4324) // structure was padded due to alignment specifier
struct alignas(hardware_destructive_interference_size) alignas(_FwdIt) _Chunk_local_data {
atomic<_Chunk_state> _State;
_FwdIt _New_end;
};
#pragma warning(pop)
_Static_partition_team<_Iter_diff_t<_FwdIt>> _Team;
_Static_partition_range<_FwdIt> _Basis;
_Pr _Pred;
_Parallel_vector<_Chunk_local_data> _Chunk_locals;
_FwdIt _Results;
_Static_partitioned_remove_if2(
const size_t _Hw_threads, const _Iter_diff_t<_FwdIt> _Count, const _FwdIt _First, const _Pr _Pred_)
: _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Basis{}, _Pred{_Pred_},
_Chunk_locals(_Team._Chunks), _Results{_First} {
_Basis._Populate(_Team, _First);
}
_Cancellation_status _Process_chunk() {
const auto _Key = _Team._Get_next_key();
if (!_Key) {
return _Cancellation_status::_Canceled;
}
// remove phase:
auto _Merge_index = _Key._Chunk_number; // merge step will start from this index
{
auto& _Chunk_data = _Chunk_locals[_Merge_index];
const auto _Range = _Basis._Get_chunk(_Key);
if (_Merge_index == 0 || _Chunk_locals[_Merge_index - 1]._State.load() == _Chunk_state::_Done) {
// no predecessor, so run serial algorithm directly into results
if (_Merge_index == 0 || _Results == _Range._First) {
_Results = _STD remove_if(_Range._First, _Range._Last, _Pred);
} else {
_Results = _Remove_move_if_unchecked(_Range._First, _Range._Last, _Results, _Pred);
}
_Chunk_data._State.store(_Chunk_state::_Done);
++_Merge_index; // this chunk is already merged
} else { // predecessor, run serial algorithm in place and attempt to merge later
_Chunk_data._New_end = _STD remove_if(_Range._First, _Range._Last, _Pred);
_Chunk_data._State.store(_Chunk_state::_Merging);
if (_Chunk_locals[_Merge_index - 1]._State.load() != _Chunk_state::_Done) {
// if the predecessor isn't done, whichever thread merges our predecessor will merge us too
return _Cancellation_status::_Running;
}
}
}
// merge phase: at this point, we have observed that our predecessor chunk has been merged to the output,
// attempt to become the new merging thread if the previous merger gave up
// note: it is an invariant when we get here that _Chunk_locals[_Merge_index - 1]._State == _Chunk_state::_Done
for (; _Merge_index != _Team._Chunks; ++_Merge_index) {
auto& _Merge_chunk_data = _Chunk_locals[_Merge_index];
auto _Expected = _Chunk_state::_Merging;
if (!_Merge_chunk_data._State.compare_exchange_strong(_Expected, _Chunk_state::_Moving)) {
// either the _Merge_index chunk isn't ready to merge yet, or another thread will do it
return _Cancellation_status::_Running;
}
const auto _Merge_first = _Basis._Get_first(_Merge_index, _Team._Get_chunk_offset(_Merge_index));
const auto _Merge_new_end = _STD exchange(_Merge_chunk_data._New_end, {});
if (_Results == _Merge_first) { // entire range up to now had no removals, don't bother moving
_Results = _Merge_new_end;
} else {
_Results = _Move_unchecked(_Merge_first, _Merge_new_end, _Results);
}
_Merge_chunk_data._State.store(_Chunk_state::_Done);
}
return _Cancellation_status::_Canceled;
}
static void __stdcall _Threadpool_callback(
__std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ {
_Run_available_chunked_work(*static_cast<_Static_partitioned_remove_if2*>(_Context));
}
};
template <class _ExPo, class _FwdIt, class _Pr, _Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_NODISCARD _FwdIt remove_if(_ExPo&&, _FwdIt _First, const _FwdIt _Last, _Pr _Pred) noexcept /* terminates */ {
// remove each satisfying _Pred
_Adl_verify_range(_First, _Last);
auto _UFirst = _Get_unwrapped(_First);
const auto _ULast = _Get_unwrapped(_Last);
if constexpr (remove_reference_t<_ExPo>::_Parallelize) {
const size_t _Hw_threads = __std_parallel_algorithms_hw_threads();
if (_Hw_threads > 1) {
const auto _Count = _STD distance(_UFirst, _ULast);
if (_Count >= 2) {
_TRY_BEGIN
_Static_partitioned_remove_if2 _Operation{_Hw_threads, _Count, _UFirst, _Pass_fn(_Pred)};
_Run_chunked_parallel_work(_Hw_threads, _Operation);
_Seek_wrapped(_First, _Operation._Results);
return _First;
_CATCH(const _Parallelism_resources_exhausted&)
// fall through to serial case below
_CATCH_END
}
}
}
_Seek_wrapped(_First, _STD remove_if(_UFirst, _ULast, _Pass_fn(_Pred)));
return _First;
}
template <class _ExPo, class _FwdIt, class _Ty, _Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_NODISCARD _FwdIt remove(_ExPo&& _Exec, const _FwdIt _First, const _FwdIt _Last, const _Ty& _Val) noexcept
/* terminates */ {
// remove each matching _Val
return _STD remove_if(_STD forward<_ExPo>(_Exec), _First, _Last,
[&_Val](auto&& _Lhs) { return _STD forward<decltype(_Lhs)>(_Lhs) == _Val; });
}
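// Caller-side usage sketch (illustrative only, not part of this header); as with the serial algorithms,
// the caller still erases the removed tail:
//   std::vector<int> v{1, 2, 3, 4, 5, 6};
//   auto new_end = std::remove_if(std::execution::par, v.begin(), v.end(), [](int x) { return x % 2 == 0; });
//   v.erase(new_end, v.end()); // v == {1, 3, 5}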
// PARALLEL FUNCTION TEMPLATE sort
template <class _Diff>
struct _Sort_work_item_impl { // data describing an individual sort work item
using difference_type = _Diff;
_Diff _Offset;
_Diff _Size;
_Diff _Ideal;
};
template <class _RanIt>
using _Sort_work_item = _Sort_work_item_impl<_Iter_diff_t<_RanIt>>;
template <class _RanIt, class _Pr>
bool _Process_sort_work_item(const _RanIt _Basis, _Pr _Pred, _Sort_work_item<_RanIt>& _Wi,
_Sort_work_item<_RanIt>& _Right_fork_wi, _Iter_diff_t<_RanIt>& _Work_complete) noexcept /* terminates */ {
// processes the sort work item, _Wi, relative to _Basis
// if the sort is divided into quicksort sub-problems:
// the return value is true
// _Wi contains the left sub-problem; the caller should continue with this
// _Right_fork_wi contains the right sub-problem; the caller should allow this to be stolen
// otherwise:
// the return value is false
// _Wi's range is completely sorted
// _Right_fork_wi is unmodified
using _Diff = _Iter_diff_t<_RanIt>;
constexpr auto _Diffsort_max = static_cast<_Diff>(_ISORT_MAX);
const auto _Size = _Wi._Size;
const auto _First = _Basis + _Wi._Offset;
const auto _Last = _First + _Size;
const auto _Ideal = _Wi._Ideal;
if (_Size <= _Diffsort_max) {
_Insertion_sort_unchecked(_First, _Last, _Pred);
_Work_complete += _Size;
return false;
}
if (0 < _Ideal) { // divide and conquer by partitioning (quicksort)
const auto _Mid = _Partition_by_median_guess_unchecked(_First, _Last, _Pred);
const auto _New_ideal = static_cast<_Diff>(_Ideal / 2 + _Ideal / 4); // allow 1.5 log2(N) divisions
_Wi._Size = _Mid.first - _First;
_Wi._Ideal = _New_ideal;
_Right_fork_wi = {_Mid.second - _Basis, _Last - _Mid.second, _New_ideal};
_Work_complete += _Mid.second - _Mid.first;
return true;
}
// too many divisions; heap sort
_Make_heap_unchecked(_First, _Last, _Pred);
_Sort_heap_unchecked(_First, _Last, _Pred);
_Work_complete += _Size;
return false;
}
template <class _RanIt, class _Pr>
void _Process_sort_queue(const _RanIt _Basis, _Pr _Pred, _Work_stealing_membership<_Sort_work_item<_RanIt>>& _My_ticket,
_Sort_work_item<_RanIt>& _Wi) noexcept /* terminates */ {
_Sort_work_item<_RanIt> _Right_fork_wi;
do { // process work items in the local queue
while (_Process_sort_work_item(_Basis, _Pred, _Wi, _Right_fork_wi, _My_ticket._Work_complete)) {
_TRY_BEGIN
_My_ticket._Push_bottom(_Right_fork_wi);
_CATCH(const _Parallelism_resources_exhausted&)
// local queue is full and memory can't be acquired, process _Right_fork_wi serially
const auto _First = _Basis + _Right_fork_wi._Offset;
_Sort_unchecked(_First, _First + _Right_fork_wi._Size, _Right_fork_wi._Ideal, _Pred);
_My_ticket._Work_complete += _Right_fork_wi._Size;
_CATCH_END
}
} while (_My_ticket._Try_pop_bottom(_Wi));
}
template <class _RanIt, class _Pr>
struct _Sort_operation { // context for background threads
_RanIt _Basis;
_Pr _Pred;
_Work_stealing_team<_Sort_work_item<_RanIt>> _Team;
_Sort_operation(_RanIt _First, _Pr _Pred_arg, size_t _Threads, _Iter_diff_t<_RanIt> _Count)
: _Basis(_First), _Pred(_Pred_arg), _Team(_Threads, _Count) {}
static void __stdcall _Threadpool_callback(
__std_PTP_CALLBACK_INSTANCE, void* const _Context, const __std_PTP_WORK _Work) noexcept /* terminates */ {
auto* const _This = static_cast<_Sort_operation*>(_Context);
const auto _Basis = _This->_Basis;
const auto _Pred = _This->_Pred;
auto& _Team = _This->_Team;
auto _My_ticket = _Team._Join_team();
_Sort_work_item<_RanIt> _Wi;
for (;;) {
switch (_My_ticket._Steal(_Wi)) {
case _Steal_result::_Success:
_Process_sort_queue(_Basis, _Pred, _My_ticket, _Wi);
break;
case _Steal_result::_Abort:
_My_ticket._Leave();
__std_submit_threadpool_work(_Work);
return;
case _Steal_result::_Done:
return;
}
}
}
};
template <class _ExPo, class _RanIt, class _Pr, _Enable_if_execution_policy_t<_ExPo> /* = 0 */>
void sort(_ExPo&&, const _RanIt _First, const _RanIt _Last, _Pr _Pred) noexcept /* terminates */ {
// order [_First, _Last), using _Pred
_Adl_verify_range(_First, _Last);
const auto _UFirst = _Get_unwrapped(_First);
const auto _ULast = _Get_unwrapped(_Last);
const _Iter_diff_t<_RanIt> _Ideal = _ULast - _UFirst;
if constexpr (remove_reference_t<_ExPo>::_Parallelize) {
size_t _Threads;
if (_Ideal > _ISORT_MAX && (_Threads = __std_parallel_algorithms_hw_threads()) > 1) {
// parallelize when input is large enough and we aren't on a uniprocessor machine
_TRY_BEGIN
_Sort_operation _Operation(_UFirst, _Pass_fn(_Pred), _Threads, _Ideal); // throws
const _Work_ptr _Work{_Operation}; // throws
auto& _Team = _Operation._Team;
auto _My_ticket = _Team._Join_team();
_Work._Submit(_Threads - 1);
_Sort_work_item<_RanIt> _Wi{0, _Ideal, _Ideal};
_Steal_result _Sr;
do {
_Process_sort_queue(_UFirst, _Pass_fn(_Pred), _My_ticket, _Wi);
do {
_Sr = _My_ticket._Steal(_Wi);
} while (_Sr == _Steal_result::_Abort);
} while (_Sr != _Steal_result::_Done);
return;
_CATCH(const _Parallelism_resources_exhausted&)
// fall through to _Sort_unchecked, below
_CATCH_END
}
}
_Sort_unchecked(_UFirst, _ULast, _Ideal, _Pass_fn(_Pred));
}
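// Caller-side usage sketch (illustrative only, not part of this header); inputs of _ISORT_MAX elements or fewer
// never reach the work-stealing path and use the serial _Sort_unchecked call above:
//   std::vector<int> v{5, 3, 1, 4, 2};
//   std::sort(std::execution::par, v.begin(), v.end()); // v == {1, 2, 3, 4, 5}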
// PARALLEL FUNCTION TEMPLATE stable_sort
template <class _Ty>
struct _Static_partitioned_temporary_buffer2 {
_Optimistic_temporary_buffer<_Ty>& _Temp_buf;
ptrdiff_t _Chunk_size;
ptrdiff_t _Unchunked_items;
template <class _Diff>
explicit _Static_partitioned_temporary_buffer2(
_Optimistic_temporary_buffer<_Ty>& _Temp_buf_raw, _Static_partition_team<_Diff>& _Team)
: _Temp_buf(_Temp_buf_raw), _Chunk_size(static_cast<ptrdiff_t>(_Temp_buf._Capacity / _Team._Chunks)),
_Unchunked_items(static_cast<ptrdiff_t>(_Temp_buf._Capacity % _Team._Chunks)) {}
pair<_Ty*, ptrdiff_t> _Get_temp_buffer_for_chunk(const size_t _Chunk_number) {
// get a view of the region of the temporary buffer allocated to chunk _Chunk_number
return _Get_temp_buffer_for_chunk_range(_Chunk_number, _Chunk_number + 1);
}
pair<_Ty*, ptrdiff_t> _Get_temp_buffer_for_chunk_range(const size_t _Base, const size_t _End) {
// get a view of the region of the temporary buffer allocated to the region of chunks [_Base, _End)
const auto _Offset = _Get_offset(_Base);
        return {_Temp_buf._Data + _Offset, _Get_offset(_End) - _Offset};
}
_Ty* _Get_first(const size_t _Chunk_number) {
// get a pointer to the first element of the temporary buffer allocated to chunk _Chunk_number
return _Temp_buf._Data + _Get_offset(_Chunk_number);
}
ptrdiff_t _Get_offset(const size_t _Chunk_number) {
// get the offset of the first element of the temporary buffer allocated to chunk _Chunk_number
auto _Diff_chunk = static_cast<ptrdiff_t>(_Chunk_number);
return _Diff_chunk * _Chunk_size + (_STD min)(_Diff_chunk, _Unchunked_items);
}
void _Destroy_all() { // destroy each element of the temporary buffer
_Destroy_range(_Temp_buf._Data, _Temp_buf._Data + _Temp_buf._Capacity);
}
};
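// Worked example (illustrative): with _Temp_buf._Capacity == 10 and _Team._Chunks == 4, _Chunk_size == 2 and
// _Unchunked_items == 2, so _Get_offset returns 0, 3, 6, 8, 10 for chunk numbers 0 through 4; the first two
// chunks get 3 elements of scratch space, the last two get 2, and _Get_temp_buffer_for_chunk_range(0, 2)
// yields a 6-element view starting at _Temp_buf._Data.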
inline size_t _Get_stable_sort_tree_height(const size_t _Count, const size_t _Hw_threads) {
// Get height of merge tree for parallel stable_sort, a bottom-up merge sort.
// * each merge takes two chunks from a buffer and copies to the other buffer in sorted order
// * we want the overall result to end up in the input buffer and not into _Temp_buf; each merge
// "level" switches between the input buffer and the temporary buffer; as a result we want
// the number of merge "levels" to be even (and thus chunks must be 2 raised to an even power)
// * the smallest chunk must be at least of size _ISORT_MAX
    // * we want the number of chunks to be as close to _Ideal_chunks as possible to limit scheduling
    //   overhead, though rounding the number of merge levels up to an even count may use more chunks than that
const auto _Count_max_chunks = _Count / _ISORT_MAX;
const size_t _Log_count_max_chunks = _Floor_of_log_2(_Count_max_chunks);
// if _Log_count_max_chunks is odd, that would break our 2 to even power invariant, so
// go to the next smaller power of 2
const auto _Count_max_tree_height = _Log_count_max_chunks & ~static_cast<size_t>(1);
const auto _Ideal_chunks = _Hw_threads * _Oversubscription_multiplier;
const size_t _Log_ideal_chunks = _Floor_of_log_2(_Ideal_chunks);
#ifdef _WIN64
const size_t _Max_tree_height = 62; // to avoid ptrdiff_t overflow
#else // ^^^ _WIN64 ^^^ // vvv !_WIN64 vvv
const size_t _Max_tree_height = 30;
#endif // _WIN64
const size_t _Clamped_ideal_chunks = (_STD min)(_Max_tree_height, _Log_ideal_chunks);
// similarly, if _Clamped_ideal_chunks is odd, that would break our 2 to even power invariant,
// so go to the next higher power of 2
const auto _Ideal_tree_height = _Clamped_ideal_chunks + (_Clamped_ideal_chunks & 0x1U);
return (_STD min)(_Count_max_tree_height, _Ideal_tree_height);
}
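// Worked example (illustrative): with _Hw_threads == 8, _Ideal_chunks == 8 * _Oversubscription_multiplier == 256,
// so _Log_ideal_chunks == 8, which is already even and well under _Max_tree_height, giving _Ideal_tree_height == 8.
// For inputs large enough that _Count_max_tree_height >= 8, the result is a height-8 tree: 2^8 == 256 chunks merged
// over an even number of levels, so the final merge writes back into the input range rather than the temporary buffer.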
struct _Bottom_up_merge_tree {
// merge tree / cells:
// each level of the tree has the next 1 bit turned on:
// 0 == done
// 1 1
// 2 3 1x
// 4 5 6 7 1xx
// chunks: 0 1 2 3 4 5 6 7
// the starting cell for a given chunk is (_Chunk_number >> 1) + (1 << (height - 1))
// (divide the chunk number by 2, and add the starting index of the bottom row of the tree)
// a cell's parent is just left shifting the current cell by 1
size_t _Height;
// each cell's data is stored at _Buckets[_Cell - 1], since the 0th cell is unused / indicates termination
_Parallel_vector<atomic<bool>> _Buckets;
explicit _Bottom_up_merge_tree(const size_t _Height_)
: _Height(_Height_), _Buckets((static_cast<size_t>(1) << _Height_) - 1) {}
};
struct _Bottom_up_tree_visitor {
size_t _Cell;
size_t _Base;
size_t _Shift;
_Bottom_up_tree_visitor(const size_t _Tree_height, const size_t _Initial_chunk)
: _Cell((_Initial_chunk >> 1) + (static_cast<size_t>(1) << (_Tree_height - 1))),
_Base(_Initial_chunk & ~static_cast<size_t>(1)), _Shift(1) {}
bool _Try_give_up_merge_to_peer(_Bottom_up_merge_tree& _Merge_tree) const {
// Attempt to mark that this child is done, to let our peer do the merge.
// Returns whether we successfully gave responsibility for doing the current merge to our peer.
return !_Merge_tree._Buckets[_Cell - 1].exchange(true);
}
bool _Go_to_parent() {
// Attempt to go to the parent in the merge tree; returns whether the move to the parent was successful.
_Shift <<= 1;
_Base -= _Shift * (_Cell & static_cast<size_t>(1));
_Cell >>= 1;
return _Cell != 0;
}
};
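// Worked example (illustrative): with a tree of height 2 there are 4 chunks and cells 1 through 3, the bottom row
// starting at cell 2. A visitor built for chunk 3 starts at _Cell == (3 >> 1) + 2 == 3, _Base == 2, _Shift == 1
// (each child spans one chunk). After _Go_to_parent(), _Shift == 2, _Cell == 1 (the root), and _Base drops to 0
// because cell 3 was a right child, so the root-level merge covers chunks [0, 2) and [2, 4).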
template <class _BidIt, class _Pr>
struct _Static_partitioned_stable_sort3 {
using _Diff = _Iter_diff_t<_BidIt>;
_Static_partition_team<_Diff> _Team;
_Static_partition_range<_BidIt> _Basis;
_Bottom_up_merge_tree _Merge_tree;
_Static_partitioned_temporary_buffer2<_Iter_value_t<_BidIt>> _Temp_buf;
_Pr _Pred;
_Static_partitioned_stable_sort3(_Optimistic_temporary_buffer<_Iter_value_t<_BidIt>>& _Temp_buf_raw,
const _Diff _Count, const size_t _Merge_tree_height_, const _BidIt _First, _Pr _Pred_)
: _Team(_Count, static_cast<size_t>(1) << _Merge_tree_height_), _Basis{}, _Merge_tree(_Merge_tree_height_),
_Temp_buf(_Temp_buf_raw, _Team), _Pred{_Pred_} {
_Basis._Populate(_Team, _First);
}
_Cancellation_status _Process_chunk() {
const auto _Key = _Team._Get_next_key();
if (!_Key) {
return _Cancellation_status::_Canceled;
}
{
const auto _Serial_chunk = _Basis._Get_chunk(_Key);
const auto _Temp_chunk = _Temp_buf._Get_temp_buffer_for_chunk(_Key._Chunk_number);
_Stable_sort_unchecked(
_Serial_chunk._First, _Serial_chunk._Last, _Key._Size, _Temp_chunk.first, _Temp_chunk.second, _Pred);
}
_Bottom_up_tree_visitor _Visitor{_Merge_tree._Height, _Key._Chunk_number};
if (_Visitor._Try_give_up_merge_to_peer(_Merge_tree)) { // peer will do the work to merge
return _Cancellation_status::_Running;
}
if (_Temp_buf._Temp_buf._Capacity >= _Team._Count) { // enough space, combine using merge
{ // first merge is special; constructs the elements in the temporary buffer space
const size_t _Base = _Visitor._Base;
_Uninitialized_merge_move(_Basis._Get_first(_Base, _Team._Get_chunk_offset(_Base)),
_Basis._Get_first(_Base + 1, _Team._Get_chunk_offset(_Base + 1)),
_Basis._Get_first(_Base + 2, _Team._Get_chunk_offset(_Base + 2)), _Temp_buf._Get_first(_Base),
_Pred);
}
for (;;) { // walk remaining merge tree
(void) _Visitor._Go_to_parent(); // can't be complete because the data is in the temporary buffer
if (_Visitor._Try_give_up_merge_to_peer(_Merge_tree)) { // the other child will do the merge
return _Cancellation_status::_Running;
}
{ // merge two sub-ranges from temporary buffer to input
const size_t _Base = _Visitor._Base;
const size_t _Mid = _Base + _Visitor._Shift;
const size_t _End = _Mid + _Visitor._Shift;
_Merge_move(_Temp_buf._Get_first(_Base), _Temp_buf._Get_first(_Mid), _Temp_buf._Get_first(_End),
_Basis._Get_first(_Base, _Team._Get_chunk_offset(_Base)), _Pred);
}
if (!_Visitor._Go_to_parent()) {
// temporary bits have been copied back to the input, no parent, so we're done
_Temp_buf._Destroy_all();
return _Cancellation_status::_Canceled;
}
if (_Visitor._Try_give_up_merge_to_peer(_Merge_tree)) { // the other child will do the merge
return _Cancellation_status::_Running;
}
{ // merge two sub-ranges from input to temporary buffer
const size_t _Base = _Visitor._Base;
const size_t _Mid = _Base + _Visitor._Shift;
const size_t _End = _Mid + _Visitor._Shift;
_Merge_move(_Basis._Get_first(_Base, _Team._Get_chunk_offset(_Base)),
_Basis._Get_first(_Mid, _Team._Get_chunk_offset(_Mid)),
_Basis._Get_first(_End, _Team._Get_chunk_offset(_End)), _Temp_buf._Get_first(_Base), _Pred);
}
}
} else { // not enough space, combine using inplace_merge
for (;;) {
const size_t _Base = _Visitor._Base;
const size_t _Mid = _Base + _Visitor._Shift;
const size_t _End = _Mid + _Visitor._Shift;
const auto _Base_offset = _Team._Get_chunk_offset(_Base);
const auto _Mid_offset = _Team._Get_chunk_offset(_Mid);
const auto _End_offset = _Team._Get_chunk_offset(_End);
const auto _Temp_range = _Temp_buf._Get_temp_buffer_for_chunk_range(_Base, _End);
_Buffered_inplace_merge_unchecked(_Basis._Get_first(_Base, _Base_offset),
_Basis._Get_first(_Mid, _Mid_offset), _Basis._Get_first(_End, _End_offset),
_Mid_offset - _Base_offset, _End_offset - _Mid_offset, _Temp_range.first, _Temp_range.second,
_Pred);
if (!_Visitor._Go_to_parent()) { // no parent, so we're done
return _Cancellation_status::_Canceled;
}
if (_Visitor._Try_give_up_merge_to_peer(_Merge_tree)) { // the other child will do the merge
return _Cancellation_status::_Running;
}
}
}
}
static void __stdcall _Threadpool_callback(
__std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ {
_Run_available_chunked_work(*static_cast<_Static_partitioned_stable_sort3*>(_Context));
}
};
template <class _ExPo, class _BidIt, class _Pr, _Enable_if_execution_policy_t<_ExPo> /* = 0 */>
void stable_sort(_ExPo&&, const _BidIt _First, const _BidIt _Last, _Pr _Pred) noexcept /* terminates */ {
// sort preserving order of equivalents, using _Pred
_Adl_verify_range(_First, _Last);
const auto _UFirst = _Get_unwrapped(_First);
const auto _ULast = _Get_unwrapped(_Last);
const auto _Count = _STD distance(_UFirst, _ULast);
if (_Count <= _ISORT_MAX) {
_Insertion_sort_unchecked(_UFirst, _ULast, _Pass_fn(_Pred));
return;
}
size_t _Hw_threads;
bool _Attempt_parallelism;
if constexpr (remove_reference_t<_ExPo>::_Parallelize) {
_Hw_threads = __std_parallel_algorithms_hw_threads();
_Attempt_parallelism = _Hw_threads > 1;
} else {
_Attempt_parallelism = false;
}
_Optimistic_temporary_buffer<_Iter_value_t<_BidIt>> _Temp_buf{_Attempt_parallelism ? _Count : _Count - _Count / 2};
if constexpr (remove_reference_t<_ExPo>::_Parallelize) {
if (_Attempt_parallelism) {
            // the size_t cast assumes a forward-or-stronger iterator range can't overflow size_t
const auto _Tree_height = _Get_stable_sort_tree_height(static_cast<size_t>(_Count), _Hw_threads);
if (_Tree_height != 0) {
_TRY_BEGIN
_Static_partitioned_stable_sort3 _Operation{_Temp_buf, _Count, _Tree_height, _UFirst, _Pass_fn(_Pred)};
_Run_chunked_parallel_work(_Hw_threads, _Operation);
return;
_CATCH(const _Parallelism_resources_exhausted&)
// fall through to serial case below
_CATCH_END
}
}
}
_Stable_sort_unchecked(_UFirst, _ULast, _Count, _Temp_buf._Data, _Temp_buf._Capacity, _Pass_fn(_Pred));
}
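// Caller-side usage sketch (illustrative only, not part of this header); Record is a hypothetical caller-side type:
//   struct Record { int key; std::string name; };
//   std::vector<Record> v = /* ... */;
//   std::stable_sort(std::execution::par, v.begin(), v.end(),
//       [](const Record& l, const Record& r) { return l.key < r.key; });
//   // records with equal keys keep their original relative order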
// PARALLEL FUNCTION TEMPLATE is_sorted_until
template <class _FwdIt, class _Pr>
struct _Static_partitioned_is_sorted_until {
_Static_partition_team<_Iter_diff_t<_FwdIt>> _Team;
// note offset partitioning:
_Static_partition_range<_FwdIt> _Basis; // contains partition of [_First, _Last - 1)
_Pr _Pred;
_Parallel_find_results<_FwdIt> _Results;
_Static_partitioned_is_sorted_until(
_FwdIt _First, _FwdIt _Last, const size_t _Hw_threads, const _Iter_diff_t<_FwdIt> _Count, _Pr _Pred_)
: _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Basis{}, _Pred(_Pred_), _Results(_Last) {
_Basis._Populate(_Team, _First);
}
_Cancellation_status _Process_chunk() {
if (_Results._Complete()) {
return _Cancellation_status::_Canceled;
}
const auto _Key = _Team._Get_next_key();
if (!_Key) {
return _Cancellation_status::_Canceled;
}
auto _Range = _Basis._Get_chunk(_Key);
auto _Next = _Range._First;
do {
++_Next;
if (_DEBUG_LT_PRED(_Pred, *_Next, *_Range._First)) {
_Results._Imbue(_Key._Chunk_number, _Next);
return _Cancellation_status::_Canceled;
}
_Range._First = _Next;
} while (_Range._First != _Range._Last);
return _Cancellation_status::_Running;
}
static void __stdcall _Threadpool_callback(
__std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ {
_Run_available_chunked_work(*static_cast<_Static_partitioned_is_sorted_until*>(_Context));
}
};
template <class _ExPo, class _FwdIt, class _Pr, _Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_NODISCARD _FwdIt is_sorted_until(_ExPo&&, _FwdIt _First, _FwdIt _Last, _Pr _Pred) noexcept /* terminates */ {
// find extent of range that is ordered by predicate
_REQUIRE_PARALLEL_ITERATOR(_FwdIt);
_Adl_verify_range(_First, _Last);
const auto _UFirst = _Get_unwrapped(_First);
const auto _ULast = _Get_unwrapped(_Last);
if constexpr (remove_reference_t<_ExPo>::_Parallelize) {
const size_t _Hw_threads = __std_parallel_algorithms_hw_threads();
if (_Hw_threads > 1) { // parallelize on multiprocessor machines
auto _Count = _STD distance(_UFirst, _ULast);
if (_Count >= 3) { // ... with at least 3 elements
_TRY_BEGIN
--_Count; // note unusual offset partitioning
_Static_partitioned_is_sorted_until _Operation{_UFirst, _ULast, _Hw_threads, _Count, _Pass_fn(_Pred)};
_Run_chunked_parallel_work(_Hw_threads, _Operation);
_Seek_wrapped(_First, _Operation._Results._Get_result());
return _First;
_CATCH(const _Parallelism_resources_exhausted&)
// fall through to serial case below
_CATCH_END
}
}
}
_Seek_wrapped(_First, _STD is_sorted_until(_UFirst, _ULast, _Pass_fn(_Pred)));
return _First;
}
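// Caller-side usage sketch (illustrative only, not part of this header):
//   std::vector<int> v{1, 2, 3, 2, 5};
//   auto it = std::is_sorted_until(std::execution::par, v.begin(), v.end());
//   // it points at the second 2, the first element that breaks the ascending order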
// PARALLEL FUNCTION TEMPLATE is_partitioned
inline constexpr unsigned char _Contains_counterexample = 0;
inline constexpr unsigned char _Contains_true = 1;
inline constexpr unsigned char _Contains_false = 2;
template <class _FwdIt, class _Pr>
unsigned char _Chunk_is_partitioned_status(_FwdIt _First, _FwdIt _Last, _Pr _Pred) {
// Returns the is_partitioned status for this range.
// pre: Range is non-empty
unsigned char _Result = 0;
if (_Pred(*_First)) { // starts with T
_Result = _Contains_true;
for (;;) { // skip T partition
++_First;
if (_First == _Last) { // all T
return _Contains_true;
}
if (!_Pred(*_First)) { // range contains a switching point from T to F
break;
}
}
}
_Result |= _Contains_false;
while (++_First != _Last) { // verify F partition
if (_Pred(*_First)) { // found an out of place element
return _Contains_counterexample;
}
}
return _Result;
}
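// For example (illustrative), with _Pred(x) == (x < 0):
//   {-3, -1} -> _Contains_true                           (all T)
//   { 2,  4} -> _Contains_false                          (all F)
//   {-3,  2} -> _Contains_true | _Contains_false         (T partition followed by F partition)
//   { 2, -3} -> _Contains_counterexample                 (a T element follows an F element)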
template <class _FwdIt, class _Pr>
struct _Static_partitioned_is_partitioned {
_Static_partition_team<_Iter_diff_t<_FwdIt>> _Team;
_Static_partition_range<_FwdIt> _Basis;
atomic<size_t> _Rightmost_true; // chunk number of the rightmost chunk found so far containing a T element
atomic<size_t> _Leftmost_false; // chunk number of the leftmost chunk found so far containing an F element
_Pr _Pred;
_Static_partitioned_is_partitioned(
const size_t _Hw_threads, const _Iter_diff_t<_FwdIt> _Count, const _FwdIt _First, _Pr _Pred_)
: _Team(_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)), _Basis{}, _Rightmost_true(0),
_Leftmost_false(SIZE_MAX - 1), _Pred(_Pred_) {
_Basis._Populate(_Team, _First);
}
_Cancellation_status _Process_chunk() {
// Note that the cancellation status here is not used in the final returned answer of is_partitioned. Returning
// _Cancellation_status::_Canceled is simply used as an "early fail" mechanism to avoid doing unnecessary work.
// A final comparison of _Rightmost_true and _Leftmost_false is used to determine the final return value from
// a call to is_partitioned.
if (_Rightmost_true.load(memory_order_relaxed) > _Leftmost_false.load(memory_order_relaxed)) {
// we've found a T to the right of an F, so we know the range cannot be partitioned and we can stop
return _Cancellation_status::_Canceled;
}
const auto _Key = _Team._Get_next_key();
if (!_Key) {
return _Cancellation_status::_Canceled;
}
// looking at chunks from either end, moving in towards the middle
auto _Target_chunk_number = _Key._Chunk_number >> 1;
if (!(_Key._Chunk_number & static_cast<size_t>(1))) {
_Target_chunk_number = _Team._Chunks - _Target_chunk_number - 1;
}
const auto _Target_chunk_key = _Team._Get_chunk_key(_Target_chunk_number);
const auto _Chunk = _Basis._Get_chunk(_Target_chunk_key);
const auto _This_chunk_status = _Chunk_is_partitioned_status(_Chunk._First, _Chunk._Last, _Pred);
if (_This_chunk_status == _Contains_counterexample) {
// indicating that this chunk contains a counterexample, so the range is not partitioned
_Rightmost_true.store(SIZE_MAX);
return _Cancellation_status::_Canceled;
}
// after determining the is_partitioned status for this chunk,
// we need to update the chunk numbers for leftmost F and rightmost T
auto _Old_true = _Rightmost_true.load();
if (_This_chunk_status & _Contains_true) {
while (_Target_chunk_number > _Old_true) {
if (_Rightmost_true.compare_exchange_weak(_Old_true, _Target_chunk_number)) {
_Old_true = _Target_chunk_number;
break;
}
}
}
// try to bail before doing more work if possible
auto _Old_false = _Leftmost_false.load();
if (_Old_true > _Old_false) {
return _Cancellation_status::_Canceled;
}
if (_This_chunk_status & _Contains_false) {
while (_Target_chunk_number < _Old_false) {
if (_Leftmost_false.compare_exchange_weak(_Old_false, _Target_chunk_number)) {
_Old_false = _Target_chunk_number;
break;
}
}
}
        // Other loads/stores may have been reordered around the loads into _Old_true and _Old_false, so these values
        // may be stale, but checking them here can still let us avoid further atomic loads. If they are stale, the
        // next call to _Process_chunk will exit via the check on _Rightmost_true and _Leftmost_false at the top of
        // this method.
if (_Old_true > _Old_false) {
return _Cancellation_status::_Canceled;
}
return _Cancellation_status::_Running;
}
static void __stdcall _Threadpool_callback(
__std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ {
_Run_available_chunked_work(*static_cast<_Static_partitioned_is_partitioned*>(_Context));
}
};
template <class _ExPo, class _FwdIt, class _Pr, _Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_NODISCARD bool is_partitioned(_ExPo&&, const _FwdIt _First, const _FwdIt _Last, _Pr _Pred) noexcept /* terminates */ {
// test if [_First, _Last) is partitioned by _Pred
_REQUIRE_PARALLEL_ITERATOR(_FwdIt);
_Adl_verify_range(_First, _Last);
const auto _UFirst = _Get_unwrapped(_First);
const auto _ULast = _Get_unwrapped(_Last);
if constexpr (remove_reference_t<_ExPo>::_Parallelize) {
const size_t _Hw_threads = __std_parallel_algorithms_hw_threads();
if (_Hw_threads > 1) { // parallelize on multiprocessor machines
const auto _Count = _STD distance(_UFirst, _ULast);
if (_Count >= 2) { // ... with at least 2 elements
_TRY_BEGIN
_Static_partitioned_is_partitioned _Operation{_Hw_threads, _Count, _UFirst, _Pass_fn(_Pred)};
_Run_chunked_parallel_work(_Hw_threads, _Operation);
return _Operation._Rightmost_true.load(memory_order_relaxed)
<= _Operation._Leftmost_false.load(memory_order_relaxed);
_CATCH(const _Parallelism_resources_exhausted&)
// fall through to serial case below
_CATCH_END
}
}
}
return _STD is_partitioned(_UFirst, _ULast, _Pass_fn(_Pred));
}
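// Example usage of the parallel is_partitioned overload above (illustrative sketch only; names are hypothetical):
//     std::vector<int> vec{2, 4, 6, 1, 3};
//     const bool ok = std::is_partitioned(std::execution::par, vec.begin(), vec.end(),
//         [](int val) { return val % 2 == 0; }); // true: all evens precede all odds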
// PARALLEL FUNCTION TEMPLATE is_heap_until
template <class _RanIt, class _Pr>
struct _Static_partitioned_is_heap_until {
using _Diff = _Iter_diff_t<_RanIt>;
_Static_partition_team<_Diff> _Team;
_RanIt _Range_first;
_Pr _Pred;
_Parallel_find_results<_RanIt> _Results;
_Static_partitioned_is_heap_until(
_RanIt _First, _RanIt _Last, const size_t _Hw_threads, const _Diff _Count, _Pr _Pred_)
: _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Range_first(_First), _Pred(_Pred_),
_Results(_Last) {}
_Cancellation_status _Process_chunk() {
if (_Results._Complete()) {
return _Cancellation_status::_Canceled;
}
const auto _Key = _Team._Get_next_key();
if (!_Key) {
return _Cancellation_status::_Canceled;
}
const auto _Chunk_range_size = _Key._Size;
const auto _Chunk_offset = _Key._Start_at;
const auto _Last = _Chunk_offset + _Chunk_range_size;
const auto _Initial = (_STD max)(_Chunk_offset, _Diff{1});
for (_Diff _Off = _Initial; _Off < _Last; ++_Off) {
if (_DEBUG_LT_PRED(_Pred, *(_Range_first + ((_Off - 1) >> 1)), *(_Range_first + _Off))) {
_Results._Imbue(_Key._Chunk_number, _Range_first + _Off);
return _Cancellation_status::_Canceled;
}
}
return _Cancellation_status::_Running;
}
static void __stdcall _Threadpool_callback(
__std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ {
_Run_available_chunked_work(*static_cast<_Static_partitioned_is_heap_until*>(_Context));
}
};
template <class _ExPo, class _RanIt, class _Pr, _Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_NODISCARD _RanIt is_heap_until(_ExPo&&, _RanIt _First, _RanIt _Last, _Pr _Pred) noexcept /* terminates */ {
// find extent of range that is a heap ordered by _Pred
_REQUIRE_PARALLEL_ITERATOR(_RanIt);
_Adl_verify_range(_First, _Last);
const auto _UFirst = _Get_unwrapped(_First);
const auto _ULast = _Get_unwrapped(_Last);
if constexpr (remove_reference_t<_ExPo>::_Parallelize) {
const size_t _Hw_threads = __std_parallel_algorithms_hw_threads();
if (_Hw_threads > 1) { // parallelize on multiprocessor machines
const auto _Count = _ULast - _UFirst;
if (_Count >= 3) { // ... with at least 3 elements
_TRY_BEGIN
_Static_partitioned_is_heap_until _Operation{_UFirst, _ULast, _Hw_threads, _Count, _Pass_fn(_Pred)};
_Run_chunked_parallel_work(_Hw_threads, _Operation);
_Seek_wrapped(_First, _Operation._Results._Get_result());
return _First;
_CATCH(const _Parallelism_resources_exhausted&)
// fall through to serial case below
_CATCH_END
}
}
}
_Seek_wrapped(_First, _STD is_heap_until(_UFirst, _ULast, _Pass_fn(_Pred)));
return _First;
}
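// Example usage of the parallel is_heap_until overload above (illustrative sketch only; names are hypothetical):
//     std::vector<int> vec{9, 5, 4, 1, 8};
//     const auto it = std::is_heap_until(std::execution::par, vec.begin(), vec.end(), std::less<>{});
//     // it points at the 8, which violates the max-heap property against its parent 5.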
// PARALLEL FUNCTION TEMPLATE partition
template <class _FwdIt, class _Pr>
pair<_FwdIt, _Iter_diff_t<_FwdIt>> _Partition_with_count_unchecked(_FwdIt _First, _FwdIt _Last, _Pr _Pred) {
// move elements satisfying _Pred to front and track how many elements satisfy _Pred
if constexpr (_Is_random_iter_v<_FwdIt>) {
auto _Mid = _STD partition(_First, _Last, _Pred);
return {_Mid, _Mid - _First};
} else if constexpr (_Is_bidi_iter_v<_FwdIt>) {
_Iter_diff_t<_FwdIt> _Trues{};
for (;;) { // find any out-of-order pair
for (;;) { // skip in-place elements at beginning
if (_First == _Last) {
return {_First, _Trues};
}
if (!_Pred(*_First)) {
break;
}
++_First;
++_Trues;
}
do { // skip in-place elements at end
--_Last;
if (_First == _Last) {
return {_First, _Trues};
}
} while (!_Pred(*_Last));
_STD iter_swap(_First, _Last); // out of place, swap and loop
++_First;
++_Trues;
}
} else {
_Iter_diff_t<_FwdIt> _Trues{};
for (;;) { // skip in-place elements at beginning
if (_First == _Last) {
return {_First, _Trues};
}
if (!_Pred(*_First)) {
break;
}
++_First;
++_Trues;
}
for (_FwdIt _Next = _First; ++_Next != _Last;) {
if (_Pred(*_Next)) {
_STD iter_swap(_First, _Next); // out of place, swap and loop
++_First;
++_Trues;
}
}
return {_First, _Trues};
}
}
template <class _FwdIt, class _Pr>
pair<_FwdIt, _Iter_diff_t<_FwdIt>> _Partition_swap_backward(
_FwdIt _First, _FwdIt _Last, _FwdIt _Beginning_of_falses, _Pr _Pred) {
// Swap elements in [_First, _Last) satisfying _Pred with elements from _Beginning_of_falses.
// Pre: _Beginning_of_falses < _First
_Iter_diff_t<_FwdIt> _Trues{};
if constexpr (_Is_bidi_iter_v<_FwdIt>) {
while (_First != _Last) {
--_Last;
if (_Pred(*_Last)) {
_STD iter_swap(_Beginning_of_falses, _Last);
++_Beginning_of_falses;
++_Trues;
if (_Beginning_of_falses == _First) {
auto _Remain = _Partition_with_count_unchecked(_First, _Last, _Pred);
return {_Remain.first, static_cast<_Iter_diff_t<_FwdIt>>(_Remain.second + _Trues)};
}
}
}
} else {
for (; _First != _Last; ++_First) {
if (_Pred(*_First)) {
_STD iter_swap(_First, _Beginning_of_falses);
++_Beginning_of_falses;
++_Trues;
}
}
}
return {_Beginning_of_falses, _Trues};
}
template <class _FwdIt>
_FwdIt _Partition_merge(const _FwdIt _False_first, const _FwdIt _True_first, const _FwdIt _True_last,
const _Iter_diff_t<_FwdIt> _Count1, const _Iter_diff_t<_FwdIt> _Count2) {
// Merge partition ranges where [_False_first, _True_first) are falses, [_True_first, _True_last) are trues
// pre: _Count1 == distance(_False_first, _True_first) && _Count2 == distance(_True_first, _True_last)
if (_Count1 < _Count2) { // move the false range to the end of the true range
const _Iter_diff_t<_FwdIt> _Offset = _Count2 - _Count1;
auto _Result = _True_first;
if constexpr (_Is_random_iter_v<_FwdIt>) {
_Result += _Offset;
} else if constexpr (_Is_bidi_iter_v<_FwdIt>) {
if (_Count1 < _Offset) {
_Result = _True_last;
_STD advance(_Result, -_Count1);
} else {
_STD advance(_Result, _Offset);
}
} else {
_STD advance(_Result, _Offset);
}
_Swap_ranges_unchecked(_False_first, _True_first, _Result);
return _Result;
}
// move the true range to the beginning of the false range
return _Swap_ranges_unchecked(_True_first, _True_last, _False_first);
}
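// Worked example for _Partition_merge (illustrative): with the subrange F F T T T (_Count1 == 2 falses followed by
// _Count2 == 3 trues), the false block is swapped with the last two trues, producing T T T F F, and the returned
// iterator points at the first F. With F F F T T (_Count1 == 3, _Count2 == 2), the true block is swapped to the
// front instead, producing T T F F F, and the return value again points at the first F.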
template <class _FwdIt, class _Pr>
struct _Static_partitioned_partition2 {
using _Diff = _Iter_diff_t<_FwdIt>;
enum class _Chunk_state : unsigned char {
_Serial, // while a chunk is in the serial state, it is touched only by an owner thread
_Merging, // while a chunk is in the merging state, threads all try to CAS the chunk _Merging -> _Swapping
// the thread that succeeds takes responsibility for swapping the trues from that chunk to the
// results
_Swapping, // while a chunk is in the swapping state, the trues are being merged with _Results
// only one chunk at a time is ever _Swapping; this also serves to synchronize access to
// _Results and _Results_falses
_Done // when a chunk becomes _Done, it is complete / will never need to touch _Results again
};
#pragma warning(push)
#pragma warning(disable : 4324) // structure was padded due to alignment specifier
struct alignas(hardware_destructive_interference_size) alignas(_FwdIt) _Chunk_local_data {
atomic<_Chunk_state> _State;
_FwdIt _Beginning_of_falses;
_Diff _Chunk_trues;
};
#pragma warning(pop)
_Static_partition_team<_Diff> _Team;
_Static_partition_range<_FwdIt> _Basis;
_Pr _Pred;
_Parallel_vector<_Chunk_local_data> _Chunk_locals;
_FwdIt _Results;
_Diff _Results_falses;
_Static_partitioned_partition2(const size_t _Hw_threads, const _Diff _Count, const _FwdIt _First, const _Pr _Pred_)
: _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Basis{}, _Pred{_Pred_},
_Chunk_locals(_Team._Chunks), _Results{_First}, _Results_falses{} {
_Basis._Populate(_Team, _First);
}
_Cancellation_status _Process_chunk() {
const auto _Key = _Team._Get_next_key();
if (!_Key) {
return _Cancellation_status::_Canceled;
}
// serial-on-each-chunk phase:
auto _Merge_index = _Key._Chunk_number; // merge step will start from this index
{
auto& _Chunk_data = _Chunk_locals[_Merge_index];
const auto _Range = _Basis._Get_chunk(_Key);
if (_Merge_index == 0 || _Chunk_locals[_Merge_index - 1]._State.load() == _Chunk_state::_Done) {
// no predecessor, so run serial algorithm directly into results
const auto _Chunk_results =
_Merge_index == 0 || _Results == _Range._First
? _Partition_with_count_unchecked(_Range._First, _Range._Last, _Pred)
: _Partition_swap_backward(_Range._First, _Range._Last, _Results, _Pred);
_Results = _Chunk_results.first;
_Chunk_data._Chunk_trues = _Chunk_results.second;
_Results_falses += _Key._Size - _Chunk_results.second;
_Chunk_data._State.store(_Chunk_state::_Done);
++_Merge_index; // this chunk is already merged
} else {
// predecessor, run serial algorithm in place and attempt to merge later
auto _Chunk_results = _Partition_with_count_unchecked(_Range._First, _Range._Last, _Pred);
_Chunk_data._Beginning_of_falses = _Chunk_results.first;
_Chunk_data._Chunk_trues = _Chunk_results.second;
_Chunk_data._State.store(_Chunk_state::_Merging);
if (_Chunk_locals[_Merge_index - 1]._State.load() != _Chunk_state::_Done) {
// if the predecessor isn't done, whichever thread merges our predecessor will merge us too
return _Cancellation_status::_Running;
}
}
}
// merge phase: at this point, we have observed that our predecessor chunk has been merged to the output,
// attempt to become the new merging thread if the previous merger gave up
// note: it is an invariant when we get here that _Chunk_locals[_Merge_index - 1]._State == _Chunk_state::_Done
for (; _Merge_index != _Team._Chunks; ++_Merge_index) {
auto& _Merge_chunk_data = _Chunk_locals[_Merge_index];
auto _Expected = _Chunk_state::_Merging;
if (!_Merge_chunk_data._State.compare_exchange_strong(_Expected, _Chunk_state::_Swapping)) {
// either the _Merge_index chunk isn't ready to merge yet, or another thread will do it
return _Cancellation_status::_Running;
}
const auto _Merge_key = _Team._Get_chunk_key(_Merge_index);
const auto _Chunk_trues = _Merge_chunk_data._Chunk_trues;
_Results = _Partition_merge(_Results, _Basis._Get_first(_Merge_index, _Merge_key._Start_at),
_STD exchange(_Merge_chunk_data._Beginning_of_falses, {}), _Results_falses, _Chunk_trues);
_Results_falses += _Merge_key._Size - _Chunk_trues;
_Merge_chunk_data._State.store(_Chunk_state::_Done);
}
return _Cancellation_status::_Canceled;
}
static void __stdcall _Threadpool_callback(
__std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ {
_Run_available_chunked_work(*static_cast<_Static_partitioned_partition2*>(_Context));
}
};
template <class _ExPo, class _FwdIt, class _Pr, _Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_FwdIt partition(_ExPo&&, _FwdIt _First, const _FwdIt _Last, _Pr _Pred) noexcept /* terminates */ {
// move elements satisfying _Pred to beginning of sequence
_Adl_verify_range(_First, _Last);
const auto _UFirst = _Get_unwrapped(_First);
const auto _ULast = _Get_unwrapped(_Last);
if constexpr (remove_reference_t<_ExPo>::_Parallelize) {
const size_t _Hw_threads = __std_parallel_algorithms_hw_threads();
if (_Hw_threads > 1) {
const auto _Count = _STD distance(_UFirst, _ULast);
if (_Count >= 2) {
_TRY_BEGIN
_Static_partitioned_partition2 _Operation{_Hw_threads, _Count, _UFirst, _Pass_fn(_Pred)};
_Run_chunked_parallel_work(_Hw_threads, _Operation);
_Seek_wrapped(_First, _Operation._Results);
return _First;
_CATCH(const _Parallelism_resources_exhausted&)
// fall through to serial case below
_CATCH_END
}
}
}
_Seek_wrapped(_First, _STD partition(_UFirst, _ULast, _Pass_fn(_Pred)));
return _First;
}
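// Example usage of the parallel partition overload above (illustrative sketch only; names are hypothetical):
//     std::vector<int> vec{3, 8, 1, 6, 7, 2};
//     const auto mid = std::partition(std::execution::par, vec.begin(), vec.end(),
//         [](int val) { return val % 2 == 0; }); // evens end up in [vec.begin(), mid), odds in [mid, vec.end())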
// PARALLEL FUNCTION TEMPLATE set_intersection
template <class _Ty>
struct _Storage_for {
// uninitialized space to store a _Ty
alignas(_Ty) unsigned char _Storage[sizeof(_Ty)];
_Storage_for() = default;
_Storage_for(const _Storage_for&) = delete;
_Storage_for& operator=(const _Storage_for&) = delete;
_Ty& _Ref() {
return reinterpret_cast<_Ty&>(_Storage);
}
};
inline constexpr unsigned char _Local_available = 1;
inline constexpr unsigned char _Sum_available = 2;
template <class _Ty>
struct _Scan_decoupled_lookback {
// inter-chunk communication block in "Single-pass Parallel Prefix Scan with Decoupled Look-back" by Merrill and
// Garland
using value_type = _Ty;
atomic<unsigned char> _State;
_Storage_for<_Ty> _Local; // owned by thread iff _State < _Local_available; otherwise const and shared
_Storage_for<_Ty> _Sum; // owned by thread iff _State < _Sum_available
unsigned char _Get_available_state() const {
for (;;) {
const unsigned char _Local_state = _State.load();
if (_Local_state != 0) {
return _Local_state;
}
__std_execution_wait_on_uchar(reinterpret_cast<const unsigned char*>(&_State), _Local_state);
}
}
void _Store_available_state(const unsigned char _New_state) {
_State.store(_New_state);
__std_execution_wake_by_address_all(&_State);
}
template <class _FwdIt, class _BinOp>
void _Apply_exclusive_predecessor(_Ty& _Preceding, _FwdIt _First, const _FwdIt _Last, _BinOp _Reduce_op) {
// apply _Preceding to [_First, _Last) and _Sum._Ref(), using _Reduce_op
_Construct_in_place(_Sum._Ref(), _Reduce_op(_Preceding, _Local._Ref()));
_State.store(_Local_available | _Sum_available);
*_First = _Preceding;
#pragma loop(ivdep)
while (++_First != _Last) {
*_First = _Reduce_op(_Preceding, _STD move(*_First));
}
}
template <class _FwdIt, class _BinOp>
void _Apply_inclusive_predecessor(_Ty& _Preceding, _FwdIt _First, const _FwdIt _Last, _BinOp _Reduce_op) {
// apply _Preceding to [_First, _Last) and _Sum._Ref(), using _Reduce_op
_Construct_in_place(_Sum._Ref(), _Reduce_op(_Preceding, _Local._Ref()));
_State.store(_Local_available | _Sum_available);
#pragma loop(ivdep)
for (; _First != _Last; ++_First) {
*_First = _Reduce_op(_Preceding, _STD move(*_First));
}
}
~_Scan_decoupled_lookback() {
const auto _State_bits = _State.load(memory_order_relaxed);
if (_State_bits & _Sum_available) {
_Destroy_in_place(_Sum._Ref());
}
if (_State_bits & _Local_available) {
_Destroy_in_place(_Local._Ref());
}
}
};
template <class _BidIt, class _BinOp>
typename _Iter_value_t<_BidIt>::value_type _Get_lookback_sum(const _BidIt _Current, _BinOp _Reduce_op) {
// Calculate the sum of the _Scan_decoupled_lookback referenced by _Current.
// pre: _Current->_State & _Local_available
    // pre: Some iterator i exists before _Current such that i->_Get_available_state() & _Sum_available
static_assert(_Is_specialization_v<_Iter_value_t<_BidIt>, _Scan_decoupled_lookback>, "Bad _Get_lookback_sum");
auto _Prev = _Current;
--_Prev;
auto _Prev_state = _Prev->_Get_available_state();
typename _Iter_value_t<_BidIt>::value_type _Result(
_Reduce_op(_Prev_state & _Sum_available ? _Prev->_Sum._Ref() : _Prev->_Local._Ref(), _Current->_Local._Ref()));
while (!(_Prev_state & _Sum_available)) {
--_Prev;
_Prev_state = _Prev->_Get_available_state();
_Result =
_Reduce_op(_Prev_state & _Sum_available ? _Prev->_Sum._Ref() : _Prev->_Local._Ref(), _STD move(_Result));
}
return _Result;
}
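// Worked example for the decoupled lookback scheme (illustrative): suppose four chunks have local sums 1, 2, 3, 4.
// If chunk 1 has already published its running total (_Sum == 1 + 2 == 3, _Sum_available set) while chunk 2 has
// published only its local sum, then _Get_lookback_sum called for chunk 3 accumulates
// 3 (chunk 1's total) + 3 (chunk 2's local) + 4 (chunk 3's local) == 10, which chunk 3 then publishes as its total.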
template <class _Ty>
struct _Casty_plus {
// Adds the two arguments together and casts the result back to _Ty.
// pre: the result from adding the two arguments together can fit in _Ty
_NODISCARD constexpr _Ty operator()(_Ty _Val1, _Ty _Val2) const noexcept /* terminates */ {
return static_cast<_Ty>(_Val1 + _Val2);
}
};
template <class _Diff>
void _Surrender_elements_to_next_chunk(const size_t _Chunk_number,
const typename _Parallel_vector<_Scan_decoupled_lookback<_Diff>>::iterator _Chunk_lookback_data) {
// Deals with the case in which all of the elements in the chunk corresponding to _Chunk_lookback_data will be
// handled by the following chunk, so this chunk needs to publish its results accordingly.
if (_Chunk_number == 0) {
// This is the first chunk, so we can immediately publish results. No need to set
// _Chunk_lookback_data->_Local._Ref() to be 0, since chunk 0 has no predecessors and its local and total sums
        // are the same. _Chunk_lookback_data->_Sum is already 0, so we can just publish results immediately.
_Chunk_lookback_data->_Store_available_state(_Sum_available);
return;
}
// We need to pass the previous chunk's sum to the right.
// _Chunk_lookback_data->_Local is already 0, so we can just publish results immediately.
_Chunk_lookback_data->_Store_available_state(_Local_available);
const auto _Prev_chunk_lookback_data = _Prev_iter(_Chunk_lookback_data);
_Diff _Prev_chunk_sum;
if (_Prev_chunk_lookback_data->_Get_available_state() & _Sum_available) {
_Prev_chunk_sum = _Prev_chunk_lookback_data->_Sum._Ref();
} else {
// Note that we can use _Casty_plus because _Diff is defined as _Common_diff<..., _RanIt3> and the maximum value
// that will be placed in _Lookback by adding two of the previous sums together is the total number of elements
// in the result. Assuming that _Dest has enough space for the result, the value produced by adding two previous
// sums should fit inside _Diff.
_Prev_chunk_sum = _Get_lookback_sum(_Prev_chunk_lookback_data, _Casty_plus<_Diff>{});
}
_Chunk_lookback_data->_Sum._Ref() = _Prev_chunk_sum;
_Chunk_lookback_data->_Store_available_state(_Sum_available);
}
template <class _RanIt1, class _RanIt2, class _RanIt3>
void _Place_elements_from_indices(
const _RanIt1 _First, _RanIt2 _Dest, _RanIt3 _Indices_first, const ptrdiff_t _Num_results) {
// Places _Num_results elements at indices in _Indices_first from the range indicated by _First into _Dest.
const auto _Last_index = _Indices_first + _Num_results;
for (; _Indices_first != _Last_index; ++_Indices_first) {
const auto _Curr_index = *_Indices_first;
*_Dest = *(_First + static_cast<_Iter_diff_t<_RanIt1>>(_Curr_index));
++_Dest;
}
}
template <class _RanIt1, class _RanIt2, class _RanIt3, class _Pr, class _SetOper>
struct _Static_partitioned_set_subtraction {
using _Diff = _Common_diff_t<_RanIt1, _RanIt2, _RanIt3>;
_Static_partition_team<_Diff> _Team;
_Static_partition_range<_RanIt1, _Diff> _Basis;
_Parallel_vector<_Diff> _Index_indicator; // buffer used to store information about indices in Range 1
_Iterator_range<_RanIt2> _Range2;
_RanIt3 _Dest;
_Parallel_vector<_Scan_decoupled_lookback<_Diff>> _Lookback; // the "Single-pass Parallel Prefix Scan with
// Decoupled Look-back" is used here to track
// information about how many elements were placed
// in _Dest by preceding chunks
_Pr _Pred;
_SetOper _Set_oper_per_chunk;
_Static_partitioned_set_subtraction(const size_t _Hw_threads, const _Diff _Count, _RanIt1 _First1, _RanIt2 _First2,
const _RanIt2 _Last2, _RanIt3 _Dest_, _Pr _Pred_, _SetOper _Set_oper)
: _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Basis{},
_Index_indicator(static_cast<size_t>(_Count)), _Range2{_First2, _Last2}, _Dest(_Dest_),
_Lookback(_Team._Chunks), _Pred(_Pred_), _Set_oper_per_chunk(_Set_oper) {
_Basis._Populate(_Team, _First1);
}
_Cancellation_status _Process_chunk() {
const auto _Key = _Team._Get_next_key();
if (!_Key) {
return _Cancellation_status::_Canceled;
}
const auto _Chunk_number = _Key._Chunk_number;
const auto _Chunk_lookback_data = _Lookback.begin() + static_cast<ptrdiff_t>(_Chunk_number);
// iterators for the actual beginning of this chunk's range in _Range1 (after adjustments below)
auto [_Range1_chunk_first, _Range1_chunk_last] = _Basis._Get_chunk(_Key);
const bool _Last_chunk = _Chunk_number == _Team._Chunks - 1;
// Get appropriate range for _Range1.
// We don't want any spans of equal elements to reach across chunk boundaries.
if (!_Last_chunk) {
// Slide _Range1_chunk_last to the left so that there are no copies of *_Range1_chunk_last in _Range1_chunk.
// Note that we know that this chunk is not the last, so we can look at the element at _Range1_chunk_last.
_Range1_chunk_last = _STD lower_bound(_Range1_chunk_first, _Range1_chunk_last, *_Range1_chunk_last, _Pred);
if (_Range1_chunk_last <= _Range1_chunk_first) {
// All of the elements in _Range1's chunk are equal to the element at _Range1_chunk_last, so they will
// be handled by the next chunk.
_Surrender_elements_to_next_chunk<_Diff>(_Chunk_number, _Chunk_lookback_data);
return _Cancellation_status::_Running;
}
}
// Slide _Range1_chunk_first to the left so that all copies of *_Range1_chunk_first are in this chunk
// of Range 1.
_Range1_chunk_first = _STD lower_bound(_Basis._Start_at, _Range1_chunk_first, *_Range1_chunk_first, _Pred);
// Get chunk in _Range2 that corresponds to our current chunk from _Range1
auto _Range2_chunk_first = _STD lower_bound(_Range2._First, _Range2._Last, *_Range1_chunk_first, _Pred);
auto _Range2_chunk_last =
_STD upper_bound(_Range2_chunk_first, _Range2._Last, *_Prev_iter(_Range1_chunk_last), _Pred);
// Publish results to rest of chunks.
if (_Chunk_number == 0) {
// Chunk 0 is special as it has no predecessor;
// its local and total sums are the same and we can immediately put its results in _Dest.
const auto _Num_results = _Set_oper_per_chunk._Update_dest(
_Range1_chunk_first, _Range1_chunk_last, _Range2_chunk_first, _Range2_chunk_last, _Dest, _Pred);
_Chunk_lookback_data->_Sum._Ref() = _Num_results;
_Chunk_lookback_data->_Store_available_state(_Sum_available);
return _Cancellation_status::_Running;
}
const auto _Prev_chunk_lookback_data = _Prev_iter(_Chunk_lookback_data);
if (_Prev_chunk_lookback_data->_State.load() & _Sum_available) {
// If the predecessor sum is already complete, we can incorporate its value directly for 1 pass.
const auto _Prev_chunk_sum = _Prev_chunk_lookback_data->_Sum._Ref();
auto _Chunk_specific_dest = _Dest + static_cast<_Iter_diff_t<_RanIt3>>(_Prev_chunk_sum);
const auto _Num_results = _Set_oper_per_chunk._Update_dest(_Range1_chunk_first, _Range1_chunk_last,
_Range2_chunk_first, _Range2_chunk_last, _Chunk_specific_dest, _Pred);
_Chunk_lookback_data->_Sum._Ref() = static_cast<_Diff>(_Num_results + _Prev_chunk_sum);
_Chunk_lookback_data->_Store_available_state(_Sum_available);
return _Cancellation_status::_Running;
}
// Get range we can use for this chunk of range 1 in the buffer.
auto _Index_chunk_first =
_Index_indicator.begin() + static_cast<ptrdiff_t>(_Range1_chunk_first - _Basis._Start_at);
// Determine the indices of elements that should be in the result from this chunk.
const auto _Num_results = _Set_oper_per_chunk._Mark_indices(_Range1_chunk_first, _Range1_chunk_last,
_Range2_chunk_first, _Range2_chunk_last, _Index_chunk_first, _Pred);
_Chunk_lookback_data->_Local._Ref() = _Num_results;
_Chunk_lookback_data->_Store_available_state(_Local_available);
// Apply the predecessor overall sum to current overall sum and elements.
_Diff _Prev_chunk_sum;
if (_Prev_chunk_lookback_data->_Get_available_state() & _Sum_available) {
// Predecessor overall sum is done, use directly.
_Prev_chunk_sum = _Prev_chunk_lookback_data->_Sum._Ref();
} else {
_Prev_chunk_sum = _Get_lookback_sum(_Prev_chunk_lookback_data, _Casty_plus<_Diff>{});
}
_Chunk_lookback_data->_Sum._Ref() = static_cast<_Diff>(_Num_results + _Prev_chunk_sum);
_Chunk_lookback_data->_Store_available_state(_Sum_available);
// Place elements from _Range1 in _Dest according to the offsets previously calculated.
auto _Chunk_specific_dest = _Dest + static_cast<_Iter_diff_t<_RanIt3>>(_Prev_chunk_sum);
_Place_elements_from_indices(
_Range1_chunk_first, _Chunk_specific_dest, _Index_chunk_first, static_cast<ptrdiff_t>(_Num_results));
return _Cancellation_status::_Running;
}
static void __stdcall _Threadpool_callback(
__std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ {
_Run_available_chunked_work(*static_cast<_Static_partitioned_set_subtraction*>(_Context));
}
};
struct _Set_intersection_per_chunk {
template <class _RanIt1, class _RanIt2, class _RanIt3, class _Pr>
_Common_diff_t<_RanIt1, _RanIt2, _RanIt3> _Update_dest(
_RanIt1 _First1, const _RanIt1 _Last1, _RanIt2 _First2, const _RanIt2 _Last2, _RanIt3 _Dest, _Pr _Pred) {
        // Copy elements from [_First1, _Last1) that are also present in [_First2, _Last2) according to _Pred, to
// _Dest. Returns the number of elements stored.
return _STD set_intersection(_First1, _Last1, _First2, _Last2, _Dest, _Pred) - _Dest;
}
template <class _RanIt1, class _RanIt2, class _BidIt, class _Pr>
_Iter_value_t<_BidIt> _Mark_indices(_RanIt1 _First1, const _RanIt1 _Last1, _RanIt2 _First2, const _RanIt2 _Last2,
_BidIt _Index_chunk_first, _Pr _Pred) {
// Stores offsets of elements in [_First1, _Last1) that are also present in [_First2, _Last2) according to
// _Pred, to _Index_chunk_first. Returns the number of offsets stored.
static_assert(is_integral_v<_Iter_value_t<_BidIt>>);
static_assert(
is_same_v<_Iter_value_t<_BidIt>, common_type_t<_Iter_value_t<_BidIt>, _Common_diff_t<_RanIt1, _RanIt2>>>);
_DEBUG_ORDER_SET_UNWRAPPED(_RanIt2, _First1, _Last1, _Pred);
_DEBUG_ORDER_SET_UNWRAPPED(_RanIt1, _First2, _Last2, _Pred);
const auto _Index_chunk_first_save = _Index_chunk_first;
_Iter_diff_t<_RanIt1> _Curr_range1_index = 0;
_Iter_diff_t<_RanIt2> _Curr_range2_index = 0;
const auto _Range1_dist = _Last1 - _First1;
const auto _Range2_dist = _Last2 - _First2;
        while (_Curr_range1_index < _Range1_dist && _Curr_range2_index < _Range2_dist) {
if (_DEBUG_LT_PRED(_Pred, *(_First1 + _Curr_range1_index), *(_First2 + _Curr_range2_index))) {
++_Curr_range1_index;
} else {
if (!_Pred(*(_First2 + _Curr_range2_index), *(_First1 + _Curr_range1_index))) {
*_Index_chunk_first = static_cast<_Iter_value_t<_BidIt>>(_Curr_range1_index);
++_Index_chunk_first;
++_Curr_range1_index;
}
++_Curr_range2_index;
}
}
return static_cast<_Iter_value_t<_BidIt>>(_Index_chunk_first - _Index_chunk_first_save);
}
};
template <class _ExPo, class _FwdIt1, class _FwdIt2, class _FwdIt3, class _Pr,
_Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_FwdIt3 set_intersection(_ExPo&&, _FwdIt1 _First1, _FwdIt1 _Last1, _FwdIt2 _First2, _FwdIt2 _Last2, _FwdIt3 _Dest,
_Pr _Pred) noexcept /* terminates */ {
// AND sets [_First1, _Last1) and [_First2, _Last2), using _Pred
_REQUIRE_PARALLEL_ITERATOR(_FwdIt1);
_REQUIRE_PARALLEL_ITERATOR(_FwdIt2);
_REQUIRE_PARALLEL_ITERATOR(_FwdIt3);
_Adl_verify_range(_First1, _Last1);
_Adl_verify_range(_First2, _Last2);
auto _UFirst1 = _Get_unwrapped(_First1);
const auto _ULast1 = _Get_unwrapped(_Last1);
auto _UFirst2 = _Get_unwrapped(_First2);
const auto _ULast2 = _Get_unwrapped(_Last2);
auto _UDest = _Get_unwrapped_unverified(_Dest);
using _Diff = _Common_diff_t<_FwdIt1, _FwdIt2, _FwdIt3>;
if constexpr (remove_reference_t<_ExPo>::_Parallelize
&& _Is_random_iter_v<_FwdIt1> && _Is_random_iter_v<_FwdIt2> && _Is_random_iter_v<_FwdIt3>) {
// only parallelize if desired, and all of the iterators given are random access
const size_t _Hw_threads = __std_parallel_algorithms_hw_threads();
if (_Hw_threads > 1) { // parallelize on multiprocessor machines
const _Diff _Count1 = _ULast1 - _UFirst1;
const _Diff _Count2 = _ULast2 - _UFirst2;
if (_Count1 >= 2 && _Count2 >= 2) { // ... with each range containing at least 2 elements
_TRY_BEGIN
_Static_partitioned_set_subtraction _Operation(_Hw_threads, _Count1, _UFirst1, _UFirst2, _ULast2,
_UDest, _Pass_fn(_Pred), _Set_intersection_per_chunk());
_Run_chunked_parallel_work(_Hw_threads, _Operation);
_UDest += static_cast<_Iter_diff_t<_FwdIt3>>(_Operation._Lookback.back()._Sum._Ref());
_Seek_wrapped(_Dest, _UDest);
return _Dest;
_CATCH(const _Parallelism_resources_exhausted&)
// fall through to serial case below
_CATCH_END
}
}
}
_Seek_wrapped(_Dest, _STD set_intersection(_UFirst1, _ULast1, _UFirst2, _ULast2, _UDest, _Pass_fn(_Pred)));
return _Dest;
}
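// Example usage of the parallel set_intersection overload above (illustrative sketch only; names are hypothetical):
//     std::vector<int> lhs{1, 2, 4, 6};
//     std::vector<int> rhs{2, 3, 6, 7};
//     std::vector<int> out(lhs.size());
//     const auto end = std::set_intersection(std::execution::par, lhs.begin(), lhs.end(), rhs.begin(), rhs.end(),
//         out.begin(), std::less<>{}); // [out.begin(), end) holds {2, 6}; both inputs must be sorted by the comparator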
// PARALLEL FUNCTION TEMPLATE set_difference
struct _Set_difference_per_chunk {
template <class _RanIt1, class _RanIt2, class _RanIt3, class _Pr>
_Common_diff_t<_RanIt1, _RanIt2, _RanIt3> _Update_dest(
_RanIt1 _First1, const _RanIt1 _Last1, _RanIt2 _First2, _RanIt2 _Last2, _RanIt3 _Dest, _Pr _Pred) {
// Copy elements from [_First1, _Last1), except those in [_First2, _Last2) according to _Pred, to _Dest.
// Returns the number of elements stored.
return _STD set_difference(_First1, _Last1, _First2, _Last2, _Dest, _Pred) - _Dest;
}
template <class _RanIt1, class _RanIt2, class _BidIt, class _Pr>
_Iter_value_t<_BidIt> _Mark_indices(_RanIt1 _First1, const _RanIt1 _Last1, _RanIt2 _First2, const _RanIt2 _Last2,
_BidIt _Index_chunk_first, _Pr _Pred) {
// Stores offsets of elements in [_First1, _Last1), except those in [_First2, _Last2) according to _Pred, to
// _Index_chunk_first. Returns the number of offsets stored.
static_assert(is_integral_v<_Iter_value_t<_BidIt>>);
static_assert(
is_same_v<_Iter_value_t<_BidIt>, common_type_t<_Iter_value_t<_BidIt>, _Common_diff_t<_RanIt1, _RanIt2>>>);
_DEBUG_ORDER_SET_UNWRAPPED(_RanIt2, _First1, _Last1, _Pred);
_DEBUG_ORDER_SET_UNWRAPPED(_RanIt1, _First2, _Last2, _Pred);
const auto _Index_chunk_first_save = _Index_chunk_first;
_Iter_diff_t<_RanIt1> _Curr_range1_index = 0;
_Iter_diff_t<_RanIt2> _Curr_range2_index = 0;
const auto _Range1_dist = _Last1 - _First1;
const auto _Range2_dist = _Last2 - _First2;
while (_Curr_range1_index < _Range1_dist && _Curr_range2_index < _Range2_dist) {
if (_DEBUG_LT_PRED(_Pred, *(_First1 + _Curr_range1_index), *(_First2 + _Curr_range2_index))) {
*_Index_chunk_first = static_cast<_Iter_value_t<_BidIt>>(_Curr_range1_index);
++_Index_chunk_first;
++_Curr_range1_index;
} else {
if (!_Pred(*(_First2 + _Curr_range2_index), *(_First1 + _Curr_range1_index))) {
++_Curr_range1_index;
}
++_Curr_range2_index;
}
}
// If we haven't traversed all of range 1 yet, we want to include the rest of it in the results.
for (; _Curr_range1_index < _Range1_dist; ++_Curr_range1_index) {
*_Index_chunk_first = static_cast<_Iter_value_t<_BidIt>>(_Curr_range1_index);
++_Index_chunk_first;
}
return static_cast<_Iter_value_t<_BidIt>>(_Index_chunk_first - _Index_chunk_first_save);
}
};
template <class _ExPo, class _FwdIt1, class _FwdIt2, class _FwdIt3, class _Pr,
_Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_FwdIt3 set_difference(_ExPo&&, _FwdIt1 _First1, _FwdIt1 _Last1, _FwdIt2 _First2, _FwdIt2 _Last2, _FwdIt3 _Dest,
_Pr _Pred) noexcept /* terminates */ {
// take set [_First2, _Last2) from [_First1, _Last1), using _Pred
_REQUIRE_PARALLEL_ITERATOR(_FwdIt1);
_REQUIRE_PARALLEL_ITERATOR(_FwdIt2);
_REQUIRE_PARALLEL_ITERATOR(_FwdIt3);
_Adl_verify_range(_First1, _Last1);
_Adl_verify_range(_First2, _Last2);
auto _UFirst1 = _Get_unwrapped(_First1);
const auto _ULast1 = _Get_unwrapped(_Last1);
auto _UFirst2 = _Get_unwrapped(_First2);
const auto _ULast2 = _Get_unwrapped(_Last2);
auto _UDest = _Get_unwrapped_unverified(_Dest);
using _Diff = _Common_diff_t<_FwdIt1, _FwdIt2, _FwdIt3>;
if constexpr (remove_reference_t<_ExPo>::_Parallelize
&& _Is_random_iter_v<_FwdIt1> && _Is_random_iter_v<_FwdIt2> && _Is_random_iter_v<_FwdIt3>) {
// only parallelize if desired, and all of the iterators given are random access
const size_t _Hw_threads = __std_parallel_algorithms_hw_threads();
if (_Hw_threads > 1) { // parallelize on multiprocessor machines
const _Diff _Count = _ULast1 - _UFirst1;
if (_Count >= 2) { // ... with at least 2 elements in [_First1, _Last1)
_TRY_BEGIN
_Static_partitioned_set_subtraction _Operation(_Hw_threads, _Count, _UFirst1, _UFirst2, _ULast2, _UDest,
_Pass_fn(_Pred), _Set_difference_per_chunk());
_Run_chunked_parallel_work(_Hw_threads, _Operation);
_UDest += static_cast<_Iter_diff_t<_FwdIt3>>(_Operation._Lookback.back()._Sum._Ref());
_Seek_wrapped(_Dest, _UDest);
return _Dest;
_CATCH(const _Parallelism_resources_exhausted&)
// fall through to serial case below
_CATCH_END
}
}
}
_Seek_wrapped(_Dest, _STD set_difference(_UFirst1, _ULast1, _UFirst2, _ULast2, _UDest, _Pass_fn(_Pred)));
return _Dest;
}
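// Example usage of the parallel set_difference overload above (illustrative sketch only; names are hypothetical):
//     std::vector<int> lhs{1, 2, 4, 6};
//     std::vector<int> rhs{2, 3, 6, 7};
//     std::vector<int> out(lhs.size());
//     const auto end = std::set_difference(std::execution::par, lhs.begin(), lhs.end(), rhs.begin(), rhs.end(),
//         out.begin(), std::less<>{}); // [out.begin(), end) holds {1, 4}; both inputs must be sorted by the comparator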
// PARALLEL FUNCTION TEMPLATE reduce
template <class _InIt, class _Ty, class _BinOp>
_Ty _Reduce_move_unchecked(_InIt _First, const _InIt _Last, _Ty _Val, _BinOp _Reduce_op) {
// return reduction, choose optimization
if constexpr (_Plus_on_arithmetic_ranges_reduction_v<_Unwrapped_t<const _InIt&>, _Ty, _BinOp>) {
return _Reduce_plus_arithmetic_ranges(_First, _Last, _Val);
} else {
for (; _First != _Last; ++_First) {
_Val = _Reduce_op(_STD move(_Val), _STD move(*_First)); // Requirement missing from N4713
}
return _Val;
}
}
template <class _Ty, class _FwdIt, class _BinOp>
_Ty _Reduce_at_least_two(const _FwdIt _First, const _FwdIt _Last, _BinOp _Reduce_op) {
// return reduction with no initial value
// pre: distance(_First, _Last) >= 2
if constexpr (_Plus_on_arithmetic_ranges_reduction_v<_FwdIt, _Ty, _BinOp>) {
return _Reduce_plus_arithmetic_ranges(_First, _Last, _Ty{0});
} else {
auto _Next = _First;
_Ty _Val = _Reduce_op(*_First, *++_Next);
while (++_Next != _Last) {
_Val = _Reduce_op(_STD move(_Val), *_Next); // Requirement missing from N4713
}
return _Val;
}
}
template <class _FwdIt, class _Ty, class _BinOp>
struct _Static_partitioned_reduce2 {
// reduction task scheduled on the system thread pool
_Static_partition_team<_Iter_diff_t<_FwdIt>> _Team;
_Static_partition_range<_FwdIt> _Basis;
_BinOp _Reduce_op;
_Generalized_sum_drop<_Ty> _Results;
_Static_partitioned_reduce2(
const _Iter_diff_t<_FwdIt> _Count, const size_t _Chunks, const _FwdIt _First, _BinOp _Reduce_op_)
: _Team{_Count, _Chunks}, _Basis{}, _Reduce_op(_Reduce_op_), _Results{_Team._Chunks} {
_Basis._Populate(_Team, _First);
}
static void __stdcall _Threadpool_callback(
__std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ {
const auto _This = static_cast<_Static_partitioned_reduce2*>(_Context);
auto _Key = _This->_Team._Get_next_key();
if (_Key) {
auto _Chunk = _This->_Basis._Get_chunk(_Key);
auto _Local_result = _Reduce_at_least_two<_Ty>(_Chunk._First, _Chunk._Last, _This->_Reduce_op);
while ((_Key = _This->_Team._Get_next_key())) {
_Chunk = _This->_Basis._Get_chunk(_Key);
_Local_result = _STD reduce(_Chunk._First, _Chunk._Last, _STD move(_Local_result), _This->_Reduce_op);
}
_This->_Results._Add_result(_STD move(_Local_result));
}
}
};
template <class _ExPo, class _FwdIt, class _Ty, class _BinOp, _Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_NODISCARD _Ty reduce(_ExPo&&, const _FwdIt _First, const _FwdIt _Last, _Ty _Val, _BinOp _Reduce_op) noexcept
/* terminates */ {
// return commutative and associative reduction of _Val and [_First, _Last), using _Reduce_op
_REQUIRE_PARALLEL_ITERATOR(_FwdIt);
_Adl_verify_range(_First, _Last);
auto _UFirst = _Get_unwrapped(_First);
const auto _ULast = _Get_unwrapped(_Last);
if constexpr (remove_reference_t<_ExPo>::_Parallelize) {
const size_t _Hw_threads = __std_parallel_algorithms_hw_threads();
if (_Hw_threads > 1) { // parallelize on multiprocessor machines...
const auto _Count = _STD distance(_UFirst, _ULast);
const auto _Chunks = _Get_least2_chunked_work_chunk_count(_Hw_threads, _Count);
if (_Chunks > 1) {
_TRY_BEGIN
auto _Passed_fn = _Pass_fn(_Reduce_op);
_Static_partitioned_reduce2<decltype(_UFirst), _Ty, decltype(_Passed_fn)> _Operation{
_Count, _Chunks, _UFirst, _Passed_fn};
{
// we don't use _Run_chunked_parallel_work here because the initial value on background threads
// is synthesized from the input, but on this thread the initial value is _Val
const _Work_ptr _Work{_Operation};
// setup complete, hereafter nothrow or terminate
_Work._Submit_for_chunks(_Hw_threads, _Chunks);
while (const auto _Stolen_key = _Operation._Team._Get_next_key()) {
auto _Chunk = _Operation._Basis._Get_chunk(_Stolen_key);
_Val = _STD reduce(_Chunk._First, _Chunk._Last, _STD move(_Val), _Pass_fn(_Reduce_op));
}
} // join with _Work_ptr threads
auto& _Results = _Operation._Results;
return _Reduce_move_unchecked(_Results.begin(), _Results.end(), _STD move(_Val), _Pass_fn(_Reduce_op));
_CATCH(const _Parallelism_resources_exhausted&)
// fall through to serial case below
_CATCH_END
}
}
}
return _STD reduce(_UFirst, _ULast, _STD move(_Val), _Pass_fn(_Reduce_op));
}
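// Example usage of the parallel reduce overload above (illustrative sketch only; names are hypothetical). Note that
// _Reduce_op must be associative and commutative for the parallel overload to produce a well-defined result:
//     std::vector<int> vec(1'000'000, 1);
//     const long long sum = std::reduce(std::execution::par, vec.begin(), vec.end(), 0LL, std::plus<>{});
//     // sum == 1'000'000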
// PARALLEL FUNCTION TEMPLATE transform_reduce
template <class _FwdIt1, class _FwdIt2, class _Ty, class _BinOp1, class _BinOp2>
struct _Static_partitioned_transform_reduce_binary2 { // transform-reduction task scheduled on the system thread pool
using _Diff = _Common_diff_t<_FwdIt1, _FwdIt2>;
_Static_partition_team<_Diff> _Team;
_Static_partition_range<_FwdIt1, _Diff> _Basis1;
_Static_partition_range<_FwdIt2, _Diff> _Basis2;
_BinOp1 _Reduce_op;
_BinOp2 _Transform_op;
_Generalized_sum_drop<_Ty> _Results;
_Static_partitioned_transform_reduce_binary2(const _Diff _Count, const size_t _Chunks, const _FwdIt1 _First1,
const _FwdIt2 _First2, _BinOp1 _Reduce_op_, _BinOp2 _Transform_op_)
: _Team{_Count, _Chunks}, _Basis1{}, _Basis2{}, _Reduce_op(_Reduce_op_),
_Transform_op(_Transform_op_), _Results{_Chunks} {
_Basis1._Populate(_Team, _First1);
_Basis2._Populate(_Team, _First2);
}
static void __stdcall _Threadpool_callback(
__std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ {
const auto _This = static_cast<_Static_partitioned_transform_reduce_binary2*>(_Context);
auto _Key = _This->_Team._Get_next_key();
if (_Key) {
auto _Reduce_op = _This->_Reduce_op;
auto _Transform_op = _This->_Transform_op;
auto _Chunk1 = _This->_Basis1._Get_chunk(_Key);
auto _First2 =
_This->_Basis2._Get_first(_Key._Chunk_number, _This->_Team._Get_chunk_offset(_Key._Chunk_number));
auto _Next1 = _Chunk1._First;
auto _Next2 = _First2;
// Requirement missing from N4713:
_Ty _Val = _Reduce_op(_Transform_op(*_Chunk1._First, *_First2), _Transform_op(*++_Next1, *++_Next2));
while (++_Next1 != _Chunk1._Last) {
// Requirement missing from N4713:
_Val = _Reduce_op(_STD move(_Val), _Transform_op(*_Next1, *++_Next2));
}
while ((_Key = _This->_Team._Get_next_key())) {
_Chunk1 = _This->_Basis1._Get_chunk(_Key);
_First2 =
_This->_Basis2._Get_first(_Key._Chunk_number, _This->_Team._Get_chunk_offset(_Key._Chunk_number));
_Next1 = _Chunk1._First;
_Next2 = _First2;
for (; _Next1 != _Chunk1._Last; ++_Next1, (void) ++_Next2) {
// Requirement missing from N4713:
_Val = _Reduce_op(_STD move(_Val), _Transform_op(*_Next1, *_Next2));
}
}
_This->_Results._Add_result(_STD move(_Val));
}
}
};
#pragma warning(push)
#pragma warning(disable : 4868) // compiler may not enforce left-to-right evaluation order
// in braced initializer list (/Wall)
template <class _ExPo, class _FwdIt1, class _FwdIt2, class _Ty, class _BinOp1, class _BinOp2,
_Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_NODISCARD _Ty transform_reduce(_ExPo&&, _FwdIt1 _First1, _FwdIt1 _Last1, _FwdIt2 _First2, _Ty _Val, _BinOp1 _Reduce_op,
_BinOp2 _Transform_op) noexcept /* terminates */ {
// return commutative and associative transform-reduction of sequences, using _Reduce_op and _Transform_op
_REQUIRE_PARALLEL_ITERATOR(_FwdIt1);
_REQUIRE_PARALLEL_ITERATOR(_FwdIt2);
_Adl_verify_range(_First1, _Last1);
auto _UFirst1 = _Get_unwrapped(_First1);
const auto _ULast1 = _Get_unwrapped(_Last1);
if constexpr (remove_reference_t<_ExPo>::_Parallelize) {
const size_t _Hw_threads = __std_parallel_algorithms_hw_threads();
if (_Hw_threads > 1) { // parallelize on multiprocessor machines...
const auto _Count = _STD distance(_UFirst1, _ULast1);
auto _UFirst2 = _Get_unwrapped_n(_First2, _Count);
const auto _Chunks = _Get_least2_chunked_work_chunk_count(_Hw_threads, _Count);
if (_Chunks > 1) {
_TRY_BEGIN
auto _Passed_reduce = _Pass_fn(_Reduce_op);
auto _Passed_transform = _Pass_fn(_Transform_op);
_Static_partitioned_transform_reduce_binary2<decltype(_UFirst1), decltype(_UFirst2), _Ty,
decltype(_Passed_reduce), decltype(_Passed_transform)>
_Operation{_Count, _Chunks, _UFirst1, _UFirst2, _Passed_reduce, _Passed_transform};
{ // ditto no _Run_chunked_parallel_work for the same reason as reduce
const _Work_ptr _Work{_Operation};
// setup complete, hereafter nothrow or terminate
_Work._Submit_for_chunks(_Hw_threads, _Chunks);
while (const auto _Stolen_key = _Operation._Team._Get_next_key()) {
const auto _Chunk_number = _Stolen_key._Chunk_number;
const auto _Chunk1 = _Operation._Basis1._Get_chunk(_Stolen_key);
_Val = _STD transform_reduce(_Chunk1._First, _Chunk1._Last,
_Operation._Basis2._Get_first(
_Chunk_number, _Operation._Team._Get_chunk_offset(_Chunk_number)),
_STD move(_Val), _Pass_fn(_Reduce_op), _Pass_fn(_Transform_op));
}
} // join with _Work_ptr threads
auto& _Results = _Operation._Results; // note: already transformed
return _Reduce_move_unchecked(_Results.begin(), _Results.end(), _STD move(_Val), _Pass_fn(_Reduce_op));
_CATCH(const _Parallelism_resources_exhausted&)
// fall through to serial case below
_CATCH_END
}
return _STD transform_reduce(
_UFirst1, _ULast1, _UFirst2, _STD move(_Val), _Pass_fn(_Reduce_op), _Pass_fn(_Transform_op));
}
}
return _STD transform_reduce(_UFirst1, _ULast1,
_Get_unwrapped_n(_First2, _Idl_distance<_FwdIt1>(_UFirst1, _ULast1)), _STD move(_Val), _Pass_fn(_Reduce_op),
_Pass_fn(_Transform_op));
}
#pragma warning(pop)
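// Example usage of the binary transform_reduce overload above (illustrative sketch only; names are hypothetical):
//     std::vector<double> lhs{1.0, 2.0, 3.0};
//     std::vector<double> rhs{4.0, 5.0, 6.0};
//     const double dot = std::transform_reduce(std::execution::par, lhs.begin(), lhs.end(), rhs.begin(), 0.0,
//         std::plus<>{}, std::multiplies<>{}); // parallel dot product: 1*4 + 2*5 + 3*6 == 32.0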
template <class _FwdIt, class _Ty, class _BinOp, class _UnaryOp>
struct _Static_partitioned_transform_reduce2 { // transformed reduction task scheduled on the system thread pool
_Static_partition_team<_Iter_diff_t<_FwdIt>> _Team;
_Static_partition_range<_FwdIt> _Basis;
_BinOp _Reduce_op;
_UnaryOp _Transform_op;
_Generalized_sum_drop<_Ty> _Results;
_Static_partitioned_transform_reduce2(const _Iter_diff_t<_FwdIt> _Count, const size_t _Chunks, _FwdIt _First,
_BinOp _Reduce_op_, _UnaryOp _Transform_op_)
: _Team{_Count, _Chunks}, _Basis{}, _Reduce_op(_Reduce_op_), _Transform_op(_Transform_op_), _Results{_Chunks} {
_Basis._Populate(_Team, _First);
}
static void __stdcall _Threadpool_callback(
__std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ {
const auto _This = static_cast<_Static_partitioned_transform_reduce2*>(_Context);
auto _Key = _This->_Team._Get_next_key();
if (_Key) {
auto& _Reduce_op = _This->_Reduce_op;
auto& _Transform_op = _This->_Transform_op;
auto _Chunk = _This->_Basis._Get_chunk(_Key);
auto _Next = _Chunk._First;
_Ty _Val{_Reduce_op(_Transform_op(*_Chunk._First), _Transform_op(*++_Next))};
while (++_Next != _Chunk._Last) {
_Val = _Reduce_op(_STD move(_Val), _Transform_op(*_Next));
}
while ((_Key = _This->_Team._Get_next_key())) {
_Chunk = _This->_Basis._Get_chunk(_Key);
_Next = _Chunk._First;
for (; _Next != _Chunk._Last; ++_Next) {
_Val = _Reduce_op(_STD move(_Val), _Transform_op(*_Next));
}
}
_This->_Results._Add_result(_STD move(_Val));
}
}
};
template <class _ExPo, class _FwdIt, class _Ty, class _BinOp, class _UnaryOp,
_Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_NODISCARD _Ty transform_reduce(_ExPo&&, const _FwdIt _First, const _FwdIt _Last, _Ty _Val, _BinOp _Reduce_op,
_UnaryOp _Transform_op) noexcept /* terminates */ {
// return commutative and associative reduction of transformed sequence, using _Reduce_op and _Transform_op
_REQUIRE_PARALLEL_ITERATOR(_FwdIt);
_Adl_verify_range(_First, _Last);
auto _UFirst = _Get_unwrapped(_First);
const auto _ULast = _Get_unwrapped(_Last);
if constexpr (remove_reference_t<_ExPo>::_Parallelize) {
const size_t _Hw_threads = __std_parallel_algorithms_hw_threads();
if (_Hw_threads > 1) { // parallelize on multiprocessor machines...
const auto _Count = _STD distance(_UFirst, _ULast);
const auto _Chunks = _Get_least2_chunked_work_chunk_count(_Hw_threads, _Count);
if (_Chunks > 1) {
_TRY_BEGIN
auto _Passed_reduce = _Pass_fn(_Reduce_op);
auto _Passed_transform = _Pass_fn(_Transform_op);
_Static_partitioned_transform_reduce2<decltype(_UFirst), _Ty, decltype(_Passed_reduce),
decltype(_Passed_transform)>
_Operation{_Count, _Chunks, _UFirst, _Passed_reduce, _Passed_transform};
{ // ditto no _Run_chunked_parallel_work for the same reason as reduce
const _Work_ptr _Work{_Operation};
// setup complete, hereafter nothrow or terminate
_Work._Submit_for_chunks(_Hw_threads, _Chunks);
while (auto _Stolen_key = _Operation._Team._Get_next_key()) {
// keep processing remaining chunks to comply with N4687 [intro.progress]/14
auto _Chunk = _Operation._Basis._Get_chunk(_Stolen_key);
_Val = _STD transform_reduce(_Chunk._First, _Chunk._Last, _STD move(_Val), _Pass_fn(_Reduce_op),
_Pass_fn(_Transform_op));
}
} // join with _Work_ptr threads
auto& _Results = _Operation._Results; // note: already transformed
return _Reduce_move_unchecked(_Results.begin(), _Results.end(), _STD move(_Val), _Pass_fn(_Reduce_op));
_CATCH(const _Parallelism_resources_exhausted&)
// fall through to serial case below
_CATCH_END
}
}
}
return _STD transform_reduce(_UFirst, _ULast, _STD move(_Val), _Pass_fn(_Reduce_op), _Pass_fn(_Transform_op));
}
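// Example usage of the unary transform_reduce overload above (illustrative sketch only; names are hypothetical):
//     std::vector<int> vec{1, 2, 3, 4};
//     const long long sum_of_squares = std::transform_reduce(std::execution::par, vec.begin(), vec.end(), 0LL,
//         std::plus<>{}, [](int val) { return static_cast<long long>(val) * val; }); // 1 + 4 + 9 + 16 == 30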
// PARALLEL FUNCTION TEMPLATE exclusive_scan
struct _No_init_tag {
explicit _No_init_tag() = default;
}; // tag to indicate that no initial value is to be used
template <class _FwdIt1, class _FwdIt2, class _BinOp, class _Ty>
_FwdIt2 _Exclusive_scan_per_chunk(_FwdIt1 _First, const _FwdIt1 _Last, _FwdIt2 _Dest, _BinOp _Reduce_op, _Ty& _Val) {
// local-sum for parallel exclusive_scan; writes local sums into [_Dest + 1, _Dest + (_Last - _First)) and stores
// successor sum in _Val
// pre: _Val is *uninitialized* && _First != _Last
_Construct_in_place(_Val, *_First);
for (;;) {
++_First;
++_Dest;
if (_First == _Last) {
return _Dest;
}
_Ty _Tmp(_Reduce_op(_Val, *_First)); // temp to enable _First == _Dest
*_Dest = _Val;
_Val = _STD move(_Tmp);
}
}
template <class _FwdIt1, class _FwdIt2, class _BinOp, class _Ty>
void _Exclusive_scan_per_chunk_complete(
_FwdIt1 _First, const _FwdIt1 _Last, _FwdIt2 _Dest, _BinOp _Reduce_op, _Ty& _Val, _Ty& _Init) {
// Sum for parallel exclusive_scan with predecessor available, into [_Dest, _Dest + (_Last - _First)) and stores
// successor sum in _Val.
// Pre: _Val is *uninitialized* && _First != _Last && predecessor sum is in _Init
_Construct_in_place(_Val, _Reduce_op(_Init, *_First));
*_Dest = _Init;
while (++_First != _Last) {
++_Dest;
_Ty _Tmp(_Reduce_op(_Val, *_First)); // temp to enable _First == _Dest
*_Dest = _STD move(_Val);
_Val = _STD move(_Tmp);
}
}
template <class _FwdIt1, class _FwdIt2, class _Ty, class _BinOp>
struct _Static_partitioned_exclusive_scan2 {
using _Diff = _Common_diff_t<_FwdIt1, _FwdIt2>;
_Static_partition_team<_Diff> _Team;
_Static_partition_range<_FwdIt1, _Diff> _Basis1;
_Static_partition_range<_FwdIt2, _Diff> _Basis2;
_Parallel_vector<_Scan_decoupled_lookback<_Ty>> _Lookback;
_Ty& _Initial;
_BinOp _Reduce_op;
_Static_partitioned_exclusive_scan2(const size_t _Hw_threads, const _Diff _Count, const _FwdIt1 _First,
_Ty& _Initial_, _BinOp _Reduce_op_, const _FwdIt2&)
: _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Basis1{}, _Basis2{},
_Lookback(_Team._Chunks), _Initial(_Initial_), _Reduce_op(_Reduce_op_) {
_Basis1._Populate(_Team, _First);
}
_Cancellation_status _Process_chunk() {
const auto _Key = _Team._Get_next_key();
if (!_Key) {
return _Cancellation_status::_Canceled;
}
const auto _Chunk_number = _Key._Chunk_number;
const auto _In_range = _Basis1._Get_chunk(_Key);
const auto _Dest = _Basis2._Get_first(_Chunk_number, _Team._Get_chunk_offset(_Chunk_number));
// Run local exclusive_scan on this chunk
const auto _Chunk = _Lookback.begin() + static_cast<ptrdiff_t>(_Chunk_number);
if (_Chunk_number == 0) { // chunk 0 is special as it has no predecessor; its local and total sums are the same
_Exclusive_scan_per_chunk_complete(
_In_range._First, _In_range._Last, _Dest, _Reduce_op, _Chunk->_Sum._Ref(), _Initial);
_Chunk->_Store_available_state(_Sum_available);
return _Cancellation_status::_Running;
}
const auto _Prev_chunk = _Prev_iter(_Chunk);
if (_Prev_chunk->_State.load() & _Sum_available) {
// if predecessor sum already complete, we can incorporate its value directly for 1 pass
_Exclusive_scan_per_chunk_complete(
_In_range._First, _In_range._Last, _Dest, _Reduce_op, _Chunk->_Sum._Ref(), _Prev_chunk->_Sum._Ref());
_Chunk->_Store_available_state(_Sum_available);
return _Cancellation_status::_Running;
}
// Calculate local sum and publish to other threads
const auto _Last =
_Exclusive_scan_per_chunk(_In_range._First, _In_range._Last, _Dest, _Reduce_op, _Chunk->_Local._Ref());
_Chunk->_Store_available_state(_Local_available);
// Apply the predecessor overall sum to current overall sum and elements
if (_Prev_chunk->_Get_available_state() & _Sum_available) { // predecessor overall sum done, use directly
_Chunk->_Apply_exclusive_predecessor(_Prev_chunk->_Sum._Ref(), _Dest, _Last, _Reduce_op);
} else {
auto _Tmp = _Get_lookback_sum(_Prev_chunk, _Reduce_op);
_Chunk->_Apply_exclusive_predecessor(_Tmp, _Dest, _Last, _Reduce_op);
}
return _Cancellation_status::_Running;
}
static void __stdcall _Threadpool_callback(
__std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ {
_Run_available_chunked_work(*static_cast<_Static_partitioned_exclusive_scan2*>(_Context));
}
};
template <class _ExPo, class _FwdIt1, class _FwdIt2, class _Ty, class _BinOp,
_Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_FwdIt2 exclusive_scan(_ExPo&&, const _FwdIt1 _First, const _FwdIt1 _Last, _FwdIt2 _Dest, _Ty _Val,
_BinOp _Reduce_op) noexcept /* terminates */ {
// set each value in [_Dest, _Dest + (_Last - _First)) to the associative reduction of predecessors and _Val
_REQUIRE_PARALLEL_ITERATOR(_FwdIt1);
_REQUIRE_PARALLEL_ITERATOR(_FwdIt2);
_Adl_verify_range(_First, _Last);
const auto _UFirst = _Get_unwrapped(_First);
const auto _ULast = _Get_unwrapped(_Last);
if constexpr (remove_reference_t<_ExPo>::_Parallelize) {
const size_t _Hw_threads = __std_parallel_algorithms_hw_threads();
if (_Hw_threads > 1) { // parallelize on multiprocessor machines
const auto _Count = _STD distance(_UFirst, _ULast);
const auto _UDest = _Get_unwrapped_n(_Dest, _Count);
if (_Count >= 2) { // ... with at least 2 elements
_TRY_BEGIN
_Static_partitioned_exclusive_scan2 _Operation{
_Hw_threads, _Count, _UFirst, _Val, _Pass_fn(_Reduce_op), _UDest};
_Seek_wrapped(_Dest, _Operation._Basis2._Populate(_Operation._Team, _UDest));
                // Note that _Val is used as temporary storage by whichever thread runs the first chunk.
                // That's OK: if any thread starts any chunk, setup is complete, so we can no longer enter the
                // catch block or the serial fallback below.
_Run_chunked_parallel_work(_Hw_threads, _Operation);
return _Dest;
_CATCH(const _Parallelism_resources_exhausted&)
// fall through to serial case below
_CATCH_END
}
_Seek_wrapped(_Dest, _STD exclusive_scan(_UFirst, _ULast, _UDest, _STD move(_Val), _Pass_fn(_Reduce_op)));
return _Dest;
}
}
_Seek_wrapped(
_Dest, _STD exclusive_scan(_UFirst, _ULast, _Get_unwrapped_n(_Dest, _Idl_distance<_FwdIt1>(_UFirst, _ULast)),
_STD move(_Val), _Pass_fn(_Reduce_op)));
return _Dest;
}
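// Example usage of the parallel exclusive_scan overload above (illustrative sketch only; names are hypothetical):
//     std::vector<int> vec{1, 2, 3, 4};
//     std::vector<int> out(vec.size());
//     std::exclusive_scan(std::execution::par, vec.begin(), vec.end(), out.begin(), 0, std::plus<>{});
//     // out holds {0, 1, 3, 6}: each output excludes the corresponding input element.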
// PARALLEL FUNCTION TEMPLATE inclusive_scan
template <class _FwdIt1, class _FwdIt2, class _BinOp, class _Ty>
_FwdIt2 _Inclusive_scan_per_chunk(
_FwdIt1 _First, const _FwdIt1 _Last, _FwdIt2 _Dest, _BinOp _Reduce_op, _Ty& _Val, _No_init_tag) {
// Local-sum for parallel inclusive_scan; writes local inclusive prefix sums into _Dest and stores overall sum in
// _Val.
// pre: _Val is *uninitialized* && _First != _Last
_Construct_in_place(_Val, *_First);
for (;;) {
*_Dest = _Val;
++_Dest;
++_First;
if (_First == _Last) {
return _Dest;
}
_Val = _Reduce_op(_STD move(_Val), *_First);
}
}
template <class _FwdIt1, class _FwdIt2, class _BinOp, class _Ty, class _Ty_fwd,
enable_if_t<!is_same_v<_No_init_tag, remove_const_t<remove_reference_t<_Ty_fwd>>>, int> = 0>
_FwdIt2 _Inclusive_scan_per_chunk(
_FwdIt1 _First, const _FwdIt1 _Last, _FwdIt2 _Dest, _BinOp _Reduce_op, _Ty& _Val, _Ty_fwd&& _Predecessor) {
// local-sum for parallel inclusive_scan; writes local inclusive prefix sums into _Dest and stores overall sum in
// _Val.
// pre: _Val is *uninitialized* && _First != _Last
_Construct_in_place(_Val, _Reduce_op(_STD forward<_Ty_fwd>(_Predecessor), *_First));
for (;;) {
*_Dest = _Val;
++_Dest;
++_First;
if (_First == _Last) {
return _Dest;
}
_Val = _Reduce_op(_STD move(_Val), *_First);
}
}
template <class _Ty, class _Init_ty, class _FwdIt1, class _FwdIt2, class _BinOp>
struct _Static_partitioned_inclusive_scan2 {
using _Diff = _Common_diff_t<_FwdIt1, _FwdIt2>;
_Static_partition_team<_Diff> _Team;
_Static_partition_range<_FwdIt1, _Diff> _Basis1;
_Static_partition_range<_FwdIt2, _Diff> _Basis2;
_Parallel_vector<_Scan_decoupled_lookback<_Ty>> _Lookback;
_BinOp _Reduce_op;
_Init_ty& _Initial;
_Static_partitioned_inclusive_scan2(
const size_t _Hw_threads, const _Diff _Count, _BinOp _Reduce_op_, _Init_ty& _Initial_)
: _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Basis1{}, _Basis2{},
_Lookback(_Team._Chunks), _Reduce_op(_Reduce_op_), _Initial(_Initial_) {}
_Cancellation_status _Process_chunk() {
const auto _Key = _Team._Get_next_key();
if (!_Key) {
return _Cancellation_status::_Canceled;
}
const auto _Chunk_number = _Key._Chunk_number;
const auto _In_range = _Basis1._Get_chunk(_Key);
const auto _Dest = _Basis2._Get_first(_Chunk_number, _Team._Get_chunk_offset(_Chunk_number));
// Run local inclusive_scan on this chunk
const auto _Chunk = _Lookback.begin() + static_cast<ptrdiff_t>(_Chunk_number);
if (_Chunk_number == 0) { // chunk 0 is special as it has no predecessor; its local and total sums are the same
_Inclusive_scan_per_chunk(
_In_range._First, _In_range._Last, _Dest, _Reduce_op, _Chunk->_Sum._Ref(), _STD move(_Initial));
_Chunk->_Store_available_state(_Sum_available);
return _Cancellation_status::_Running;
}
const auto _Prev_chunk = _Prev_iter(_Chunk);
if (_Prev_chunk->_State.load() & _Sum_available) {
// if predecessor sum already complete, we can incorporate its value directly for 1 pass
_Inclusive_scan_per_chunk(
_In_range._First, _In_range._Last, _Dest, _Reduce_op, _Chunk->_Sum._Ref(), _Prev_chunk->_Sum._Ref());
_Chunk->_Store_available_state(_Sum_available);
return _Cancellation_status::_Running;
}
// Calculate local sum and publish to other threads
const auto _Last = _Inclusive_scan_per_chunk(
_In_range._First, _In_range._Last, _Dest, _Reduce_op, _Chunk->_Local._Ref(), _No_init_tag{});
_Chunk->_Store_available_state(_Local_available);
// Apply the predecessor overall sum to current overall sum and elements
if (_Prev_chunk->_Get_available_state() & _Sum_available) { // predecessor overall sum done, use directly
_Chunk->_Apply_inclusive_predecessor(_Prev_chunk->_Sum._Ref(), _Dest, _Last, _Reduce_op);
} else {
auto _Tmp = _Get_lookback_sum(_Prev_chunk, _Reduce_op);
_Chunk->_Apply_inclusive_predecessor(_Tmp, _Dest, _Last, _Reduce_op);
}
return _Cancellation_status::_Running;
}
static void __stdcall _Threadpool_callback(
__std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ {
_Run_available_chunked_work(*static_cast<_Static_partitioned_inclusive_scan2*>(_Context));
}
};
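// In brief, _Process_chunk above implements a decoupled-lookback scan: chunk 0 seeds its scan from
// _Initial and publishes its overall sum; any other chunk whose predecessor has already published an
// overall sum folds that value in during a single pass; otherwise it scans locally, publishes its
// local sum, and then applies the predecessor prefix obtained via _Get_lookback_sum to its elements.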
template <class _ExPo, class _FwdIt1, class _FwdIt2, class _BinOp, class _Ty,
_Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_FwdIt2 inclusive_scan(_ExPo&&, _FwdIt1 _First, _FwdIt1 _Last, _FwdIt2 _Dest, _BinOp _Reduce_op, _Ty _Val) noexcept
/* terminates */ {
// compute partial noncommutative and associative reductions including _Val into _Dest, using _Reduce_op
_REQUIRE_PARALLEL_ITERATOR(_FwdIt1);
_REQUIRE_PARALLEL_ITERATOR(_FwdIt2);
_Adl_verify_range(_First, _Last);
const auto _UFirst = _Get_unwrapped(_First);
const auto _ULast = _Get_unwrapped(_Last);
if constexpr (remove_reference_t<_ExPo>::_Parallelize) {
const size_t _Hw_threads = __std_parallel_algorithms_hw_threads();
if (_Hw_threads > 1) { // parallelize on multiprocessor machines
const auto _Count = _STD distance(_UFirst, _ULast);
auto _UDest = _Get_unwrapped_n(_Dest, _Count);
if (_Count >= 2) { // ... with at least 2 elements
_TRY_BEGIN
auto _Passed_op = _Pass_fn(_Reduce_op);
_Static_partitioned_inclusive_scan2<_Ty, _Ty, _Unwrapped_t<const _FwdIt1&>, decltype(_UDest),
decltype(_Passed_op)>
_Operation{_Hw_threads, _Count, _Passed_op, _Val};
_Operation._Basis1._Populate(_Operation._Team, _UFirst);
_Seek_wrapped(_Dest, _Operation._Basis2._Populate(_Operation._Team, _UDest));
// Note that _Val is moved from by whichever thread runs the first chunk.
// If any thread starts any chunk, initialization is complete, so we can't enter the
// catch or serial fallback below.
_Run_chunked_parallel_work(_Hw_threads, _Operation);
return _Dest;
_CATCH(const _Parallelism_resources_exhausted&)
// fall through to serial case below
_CATCH_END
}
_Seek_wrapped(_Dest, _STD inclusive_scan(_UFirst, _ULast, _UDest, _Pass_fn(_Reduce_op), _STD move(_Val)));
return _Dest;
}
}
_Seek_wrapped(
_Dest, _STD inclusive_scan(_UFirst, _ULast, _Get_unwrapped_n(_Dest, _Idl_distance<_FwdIt1>(_UFirst, _ULast)),
_Pass_fn(_Reduce_op), _STD move(_Val)));
return _Dest;
}
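// Illustrative usage of the parallel inclusive_scan overload above (a non-normative sketch; same
// includes as the exclusive_scan example earlier in this header):
//   std::vector<int> v{1, 2, 3, 4};
//   std::vector<int> out(v.size());
//   std::inclusive_scan(std::execution::par, v.begin(), v.end(), out.begin(), std::plus<>{}, 10);
//   // out == {11, 13, 16, 20}: each element includes itself, all predecessors, and the initial value 10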
template <class _ExPo, class _FwdIt1, class _FwdIt2, class _BinOp, _Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_FwdIt2 inclusive_scan(_ExPo&&, _FwdIt1 _First, _FwdIt1 _Last, _FwdIt2 _Dest, _BinOp _Reduce_op) noexcept
/* terminates */ {
// compute partial noncommutative and associative reductions into _Dest, using _Reduce_op
_REQUIRE_PARALLEL_ITERATOR(_FwdIt1);
_REQUIRE_PARALLEL_ITERATOR(_FwdIt2);
_Adl_verify_range(_First, _Last);
const auto _UFirst = _Get_unwrapped(_First);
const auto _ULast = _Get_unwrapped(_Last);
if constexpr (remove_reference_t<_ExPo>::_Parallelize) {
const size_t _Hw_threads = __std_parallel_algorithms_hw_threads();
if (_Hw_threads > 1) { // parallelize on multiprocessor machines
const auto _Count = _STD distance(_UFirst, _ULast);
auto _UDest = _Get_unwrapped_n(_Dest, _Count);
if (_Count >= 2) { // ... with at least 2 elements
_TRY_BEGIN
_No_init_tag _Tag;
auto _Passed_op = _Pass_fn(_Reduce_op);
_Static_partitioned_inclusive_scan2<_Iter_value_t<_FwdIt1>, _No_init_tag, _Unwrapped_t<const _FwdIt1&>,
decltype(_UDest), decltype(_Passed_op)>
_Operation{_Hw_threads, _Count, _Passed_op, _Tag};
_Operation._Basis1._Populate(_Operation._Team, _UFirst);
_Seek_wrapped(_Dest, _Operation._Basis2._Populate(_Operation._Team, _UDest));
_Run_chunked_parallel_work(_Hw_threads, _Operation);
return _Dest;
_CATCH(const _Parallelism_resources_exhausted&)
// fall through to serial case below
_CATCH_END
}
_Seek_wrapped(_Dest, _STD inclusive_scan(_UFirst, _ULast, _UDest, _Pass_fn(_Reduce_op)));
return _Dest;
}
}
_Seek_wrapped(_Dest, _STD inclusive_scan(_UFirst, _ULast,
_Get_unwrapped_n(_Dest, _Idl_distance<_FwdIt1>(_UFirst, _ULast)), _Pass_fn(_Reduce_op)));
return _Dest;
}
// PARALLEL FUNCTION TEMPLATE transform_exclusive_scan
template <class _FwdIt1, class _FwdIt2, class _BinOp, class _UnaryOp, class _Ty>
_FwdIt2 _Transform_exclusive_scan_per_chunk(
_FwdIt1 _First, const _FwdIt1 _Last, _FwdIt2 _Dest, _BinOp _Reduce_op, _UnaryOp _Transform_op, _Ty& _Val) {
// Local-sum for parallel transform_exclusive_scan; writes local sums into [_Dest + 1, _Dest + (_Last - _First)) and
// stores successor sum in _Val.
// pre: _Val is *uninitialized* && _First != _Last
_Construct_in_place(_Val, _Transform_op(*_First));
for (;;) {
++_First;
++_Dest;
if (_First == _Last) {
return _Dest;
}
_Ty _Tmp(_Reduce_op(_Val, _Transform_op(*_First))); // temp to enable _First == _Dest
*_Dest = _STD move(_Val);
_Val = _STD move(_Tmp);
}
}
template <class _FwdIt1, class _FwdIt2, class _BinOp, class _UnaryOp, class _Ty>
void _Transform_exclusive_scan_per_chunk_complete(_FwdIt1 _First, const _FwdIt1 _Last, _FwdIt2 _Dest, _BinOp _Reduce_op,
_UnaryOp _Transform_op, _Ty& _Val, _Ty& _Init) {
// Sum for parallel transform_exclusive_scan with predecessor available, into [_Dest, _Dest + (_Last - _First)) and
// stores successor sum in _Val.
// pre: _Val is *uninitialized* && _First != _Last && predecessor sum is in _Init
_Construct_in_place(_Val, _Reduce_op(_Init, _Transform_op(*_First)));
*_Dest = _Init;
while (++_First != _Last) {
++_Dest;
_Ty _Tmp(_Reduce_op(_Val, _Transform_op(*_First))); // temp to enable _First == _Dest
*_Dest = _STD move(_Val);
_Val = _STD move(_Tmp);
}
}
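// Worked example for _Transform_exclusive_scan_per_chunk_complete (assuming plus<> as _Reduce_op,
// negate<> as _Transform_op, a chunk {1, 2, 3}, and a predecessor sum _Init == 10): the chunk
// receives {10, 9, 7} and _Val ends up as 4, i.e. 10 + (-1) + (-2) + (-3).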
template <class _FwdIt1, class _FwdIt2, class _Ty, class _BinOp, class _UnaryOp>
struct _Static_partitioned_transform_exclusive_scan2 {
using _Diff = _Common_diff_t<_FwdIt1, _FwdIt2>;
_Static_partition_team<_Diff> _Team;
_Static_partition_range<_FwdIt1, _Diff> _Basis1;
_Static_partition_range<_FwdIt2, _Diff> _Basis2;
_Parallel_vector<_Scan_decoupled_lookback<_Ty>> _Lookback;
_Ty& _Initial;
_BinOp _Reduce_op;
_UnaryOp _Transform_op;
_Static_partitioned_transform_exclusive_scan2(const size_t _Hw_threads, const _Diff _Count, const _FwdIt1 _First,
_Ty& _Initial_, _BinOp _Reduce_op_, _UnaryOp _Transform_op_, const _FwdIt2&)
: _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Basis1{}, _Basis2{},
_Lookback(_Team._Chunks), _Initial(_Initial_), _Reduce_op(_Reduce_op_), _Transform_op(_Transform_op_) {
_Basis1._Populate(_Team, _First);
}
_Cancellation_status _Process_chunk() {
const auto _Key = _Team._Get_next_key();
if (!_Key) {
return _Cancellation_status::_Canceled;
}
const auto _Chunk_number = _Key._Chunk_number;
const auto _In_range = _Basis1._Get_chunk(_Key);
const auto _Dest = _Basis2._Get_first(_Chunk_number, _Team._Get_chunk_offset(_Chunk_number));
// Run local transform_exclusive_scan on this chunk
const auto _Chunk = _Lookback.begin() + static_cast<ptrdiff_t>(_Chunk_number);
if (_Chunk_number == 0) { // chunk 0 is special as it has no predecessor; its local and total sums are the same
_Transform_exclusive_scan_per_chunk_complete(
_In_range._First, _In_range._Last, _Dest, _Reduce_op, _Transform_op, _Chunk->_Sum._Ref(), _Initial);
_Chunk->_Store_available_state(_Sum_available);
return _Cancellation_status::_Running;
}
const auto _Prev_chunk = _Prev_iter(_Chunk);
if (_Prev_chunk->_State.load() & _Sum_available) {
// if predecessor sum already complete, we can incorporate its value directly for 1 pass
_Transform_exclusive_scan_per_chunk_complete(_In_range._First, _In_range._Last, _Dest, _Reduce_op,
_Transform_op, _Chunk->_Sum._Ref(), _Prev_chunk->_Sum._Ref());
_Chunk->_Store_available_state(_Sum_available);
return _Cancellation_status::_Running;
}
// Calculate local sum and publish to other threads
const auto _Last = _Transform_exclusive_scan_per_chunk(
_In_range._First, _In_range._Last, _Dest, _Reduce_op, _Transform_op, _Chunk->_Local._Ref());
_Chunk->_Store_available_state(_Local_available);
// Apply the predecessor overall sum to current overall sum and elements
if (_Prev_chunk->_Get_available_state() & _Sum_available) { // predecessor overall sum done, use directly
_Chunk->_Apply_exclusive_predecessor(_Prev_chunk->_Sum._Ref(), _Dest, _Last, _Reduce_op);
} else {
auto _Tmp = _Get_lookback_sum(_Prev_chunk, _Reduce_op);
_Chunk->_Apply_exclusive_predecessor(_Tmp, _Dest, _Last, _Reduce_op);
}
return _Cancellation_status::_Running;
}
static void __stdcall _Threadpool_callback(
__std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ {
_Run_available_chunked_work(*static_cast<_Static_partitioned_transform_exclusive_scan2*>(_Context));
}
};
template <class _ExPo, class _FwdIt1, class _FwdIt2, class _Ty, class _BinOp, class _UnaryOp,
_Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_FwdIt2 transform_exclusive_scan(_ExPo&&, const _FwdIt1 _First, const _FwdIt1 _Last, _FwdIt2 _Dest, _Ty _Val,
_BinOp _Reduce_op, _UnaryOp _Transform_op) noexcept /* terminates */ {
// set each value in [_Dest, _Dest + (_Last - _First)) to the associative reduction of transformed predecessors
_REQUIRE_PARALLEL_ITERATOR(_FwdIt1);
_REQUIRE_PARALLEL_ITERATOR(_FwdIt2);
_Adl_verify_range(_First, _Last);
const auto _UFirst = _Get_unwrapped(_First);
const auto _ULast = _Get_unwrapped(_Last);
if constexpr (remove_reference_t<_ExPo>::_Parallelize) {
const size_t _Hw_threads = __std_parallel_algorithms_hw_threads();
if (_Hw_threads > 1) { // parallelize on multiprocessor machines
const auto _Count = _STD distance(_UFirst, _ULast);
const auto _UDest = _Get_unwrapped_n(_Dest, _Count);
if (_Count >= 2) { // ... with at least 2 elements
_TRY_BEGIN
_Static_partitioned_transform_exclusive_scan2 _Operation{
_Hw_threads, _Count, _UFirst, _Val, _Pass_fn(_Reduce_op), _Pass_fn(_Transform_op), _UDest};
_Seek_wrapped(_Dest, _Operation._Basis2._Populate(_Operation._Team, _UDest));
// Note that _Val is used as temporary storage by whichever thread runs the first chunk.
// If any thread has started any chunk, initialization is complete, so the catch and serial
// fallback below can no longer be reached, which makes that use of _Val safe.
_Run_chunked_parallel_work(_Hw_threads, _Operation);
return _Dest;
_CATCH(const _Parallelism_resources_exhausted&)
// fall through to serial case below
_CATCH_END
}
_Seek_wrapped(_Dest, _STD transform_exclusive_scan(_UFirst, _ULast, _UDest, _STD move(_Val),
_Pass_fn(_Reduce_op), _Pass_fn(_Transform_op)));
return _Dest;
}
}
_Seek_wrapped(_Dest,
_STD transform_exclusive_scan(_UFirst, _ULast, _Get_unwrapped_n(_Dest, _Idl_distance<_FwdIt1>(_UFirst, _ULast)),
_STD move(_Val), _Pass_fn(_Reduce_op), _Pass_fn(_Transform_op)));
return _Dest;
}
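// Illustrative usage of the parallel transform_exclusive_scan above (a non-normative sketch; same
// includes as the exclusive_scan example earlier in this header):
//   std::vector<int> v{1, 2, 3, 4};
//   std::vector<int> out(v.size());
//   std::transform_exclusive_scan(std::execution::par, v.begin(), v.end(), out.begin(), 0,
//       std::plus<>{}, [](int x) { return x * x; });
//   // out == {0, 1, 5, 14}: exclusive prefix sums of the squares {1, 4, 9, 16}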
// PARALLEL FUNCTION TEMPLATE transform_inclusive_scan
template <class _FwdIt1, class _FwdIt2, class _BinOp, class _UnaryOp, class _Ty>
_FwdIt2 _Transform_inclusive_scan_per_chunk(_FwdIt1 _First, const _FwdIt1 _Last, _FwdIt2 _Dest, _BinOp _Reduce_op,
_UnaryOp _Transform_op, _Ty& _Val, _No_init_tag) {
// Local-sum for parallel transform_inclusive_scan; writes local inclusive prefix sums into _Dest and stores overall
// sum in _Val.
// pre: _Val is *uninitialized* && _First != _Last
_Construct_in_place(_Val, _Transform_op(*_First));
for (;;) {
*_Dest = _Val;
++_Dest;
++_First;
if (_First == _Last) {
return _Dest;
}
_Val = _Reduce_op(_STD move(_Val), _Transform_op(*_First));
}
}
template <class _FwdIt1, class _FwdIt2, class _BinOp, class _UnaryOp, class _Ty, class _Ty_fwd,
enable_if_t<!is_same_v<_No_init_tag, remove_const_t<remove_reference_t<_Ty_fwd>>>, int> = 0>
_FwdIt2 _Transform_inclusive_scan_per_chunk(_FwdIt1 _First, const _FwdIt1 _Last, _FwdIt2 _Dest, _BinOp _Reduce_op,
_UnaryOp _Transform_op, _Ty& _Val, _Ty_fwd&& _Predecessor) {
// local-sum for parallel transform_inclusive_scan; writes local inclusive prefix sums into _Dest and stores overall
// sum in _Val
// pre: _Val is *uninitialized* && _First != _Last
_Construct_in_place(_Val, _Reduce_op(_STD forward<_Ty_fwd>(_Predecessor), _Transform_op(*_First)));
for (;;) {
*_Dest = _Val;
++_Dest;
++_First;
if (_First == _Last) {
return _Dest;
}
_Val = _Reduce_op(_STD move(_Val), _Transform_op(*_First));
}
}
template <class _Ty, class _Init_ty, class _FwdIt1, class _FwdIt2, class _BinOp, class _UnaryOp>
struct _Static_partitioned_transform_inclusive_scan2 {
using _Diff = _Common_diff_t<_FwdIt1, _FwdIt2>;
_Static_partition_team<_Diff> _Team;
_Static_partition_range<_FwdIt1, _Diff> _Basis1;
_Static_partition_range<_FwdIt2, _Diff> _Basis2;
_Parallel_vector<_Scan_decoupled_lookback<_Ty>> _Lookback;
_BinOp _Reduce_op;
_UnaryOp _Transform_op;
_Init_ty& _Initial;
_Static_partitioned_transform_inclusive_scan2(
const size_t _Hw_threads, const _Diff _Count, _BinOp _Reduce_op_, _UnaryOp _Transform_op_, _Init_ty& _Initial_)
: _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Basis1{}, _Basis2{},
_Lookback(_Team._Chunks), _Reduce_op(_Reduce_op_), _Transform_op(_Transform_op_), _Initial(_Initial_) {}
_Cancellation_status _Process_chunk() {
const auto _Key = _Team._Get_next_key();
if (!_Key) {
return _Cancellation_status::_Canceled;
}
const auto _Chunk_number = _Key._Chunk_number;
const auto _In_range = _Basis1._Get_chunk(_Key);
const auto _Dest = _Basis2._Get_first(_Chunk_number, _Team._Get_chunk_offset(_Chunk_number));
// Run local transform_inclusive_scan on this chunk
const auto _Chunk = _Lookback.begin() + static_cast<ptrdiff_t>(_Chunk_number);
if (_Chunk_number == 0) { // chunk 0 is special as it has no predecessor; its local and total sums are the same
_Transform_inclusive_scan_per_chunk(_In_range._First, _In_range._Last, _Dest, _Reduce_op, _Transform_op,
_Chunk->_Sum._Ref(), _STD move(_Initial));
_Chunk->_Store_available_state(_Sum_available);
return _Cancellation_status::_Running;
}
const auto _Prev_chunk = _Prev_iter(_Chunk);
if (_Prev_chunk->_State.load() & _Sum_available) {
// if predecessor sum already complete, we can incorporate its value directly for 1 pass
_Transform_inclusive_scan_per_chunk(_In_range._First, _In_range._Last, _Dest, _Reduce_op, _Transform_op,
_Chunk->_Sum._Ref(), _Prev_chunk->_Sum._Ref());
_Chunk->_Store_available_state(_Sum_available);
return _Cancellation_status::_Running;
}
// Calculate local sum and publish to other threads
const auto _Last = _Transform_inclusive_scan_per_chunk(
_In_range._First, _In_range._Last, _Dest, _Reduce_op, _Transform_op, _Chunk->_Local._Ref(), _No_init_tag{});
_Chunk->_Store_available_state(_Local_available);
// Apply the predecessor overall sum to current overall sum and elements
if (_Prev_chunk->_Get_available_state() & _Sum_available) { // predecessor overall sum done, use directly
_Chunk->_Apply_inclusive_predecessor(_Prev_chunk->_Sum._Ref(), _Dest, _Last, _Reduce_op);
} else {
auto _Tmp = _Get_lookback_sum(_Prev_chunk, _Reduce_op);
_Chunk->_Apply_inclusive_predecessor(_Tmp, _Dest, _Last, _Reduce_op);
}
return _Cancellation_status::_Running;
}
static void __stdcall _Threadpool_callback(
__std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ {
_Run_available_chunked_work(*static_cast<_Static_partitioned_transform_inclusive_scan2*>(_Context));
}
};
template <class _ExPo, class _FwdIt1, class _FwdIt2, class _Ty, class _BinOp, class _UnaryOp,
_Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_FwdIt2 transform_inclusive_scan(_ExPo&&, const _FwdIt1 _First, const _FwdIt1 _Last, _FwdIt2 _Dest, _BinOp _Reduce_op,
_UnaryOp _Transform_op, _Ty _Val) noexcept /* terminates */ {
// compute partial noncommutative and associative transformed reductions including _Val into _Dest
_REQUIRE_PARALLEL_ITERATOR(_FwdIt1);
_REQUIRE_PARALLEL_ITERATOR(_FwdIt2);
_Adl_verify_range(_First, _Last);
const auto _UFirst = _Get_unwrapped(_First);
const auto _ULast = _Get_unwrapped(_Last);
if constexpr (remove_reference_t<_ExPo>::_Parallelize) {
const size_t _Hw_threads = __std_parallel_algorithms_hw_threads();
if (_Hw_threads > 1) { // parallelize on multiprocessor machines
const auto _Count = _STD distance(_UFirst, _ULast);
auto _UDest = _Get_unwrapped_n(_Dest, _Count);
if (_Count >= 2) { // ... with at least 2 elements
_TRY_BEGIN
auto _Passed_reduce = _Pass_fn(_Reduce_op);
auto _Passed_transform = _Pass_fn(_Transform_op);
_Static_partitioned_transform_inclusive_scan2<_Ty, _Ty, _Unwrapped_t<const _FwdIt1&>, decltype(_UDest),
decltype(_Passed_reduce), decltype(_Passed_transform)>
_Operation{_Hw_threads, _Count, _Passed_reduce, _Passed_transform, _Val};
_Operation._Basis1._Populate(_Operation._Team, _UFirst);
_Seek_wrapped(_Dest, _Operation._Basis2._Populate(_Operation._Team, _UDest));
// Note that _Val is moved from by whichever thread runs the first chunk.
// If any thread starts any chunk, initialization is complete, so we can't enter the
// catch or serial fallback below.
_Run_chunked_parallel_work(_Hw_threads, _Operation);
return _Dest;
_CATCH(const _Parallelism_resources_exhausted&)
// fall through to serial case below
_CATCH_END
}
_Seek_wrapped(_Dest, _STD transform_inclusive_scan(_UFirst, _ULast, _UDest, _Pass_fn(_Reduce_op),
_Pass_fn(_Transform_op), _STD move(_Val)));
return _Dest;
}
}
_Seek_wrapped(_Dest,
_STD transform_inclusive_scan(_UFirst, _ULast, _Get_unwrapped_n(_Dest, _Idl_distance<_FwdIt1>(_UFirst, _ULast)),
_Pass_fn(_Reduce_op), _Pass_fn(_Transform_op), _STD move(_Val)));
return _Dest;
}
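// Illustrative usage of the parallel transform_inclusive_scan overload above (a non-normative
// sketch; same includes as the exclusive_scan example earlier in this header):
//   std::vector<int> v{1, 2, 3, 4};
//   std::vector<int> out(v.size());
//   std::transform_inclusive_scan(std::execution::par, v.begin(), v.end(), out.begin(),
//       std::plus<>{}, [](int x) { return x * x; }, 100);
//   // out == {101, 105, 114, 130}: inclusive prefix sums of the squares, seeded with 100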
template <class _ExPo, class _FwdIt1, class _FwdIt2, class _BinOp, class _UnaryOp,
_Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_FwdIt2 transform_inclusive_scan(_ExPo&&, const _FwdIt1 _First, const _FwdIt1 _Last, _FwdIt2 _Dest, _BinOp _Reduce_op,
_UnaryOp _Transform_op) noexcept /* terminates */ {
// compute partial noncommutative and associative transformed reductions into _Dest
_REQUIRE_PARALLEL_ITERATOR(_FwdIt1);
_REQUIRE_PARALLEL_ITERATOR(_FwdIt2);
_Adl_verify_range(_First, _Last);
const auto _UFirst = _Get_unwrapped(_First);
const auto _ULast = _Get_unwrapped(_Last);
if constexpr (remove_reference_t<_ExPo>::_Parallelize) {
const size_t _Hw_threads = __std_parallel_algorithms_hw_threads();
if (_Hw_threads > 1) { // parallelize on multiprocessor machines
const auto _Count = _STD distance(_UFirst, _ULast);
auto _UDest = _Get_unwrapped_n(_Dest, _Count);
if (_Count >= 2) { // ... with at least 2 elements
_TRY_BEGIN
_No_init_tag _Tag;
auto _Passed_reduce = _Pass_fn(_Reduce_op);
auto _Passed_transform = _Pass_fn(_Transform_op);
using _Intermediate_t = decay_t<decltype(_Transform_op(*_UFirst))>;
_Static_partitioned_transform_inclusive_scan2<_Intermediate_t, _No_init_tag,
_Unwrapped_t<const _FwdIt1&>, decltype(_UDest), decltype(_Passed_reduce),
decltype(_Passed_transform)>
_Operation{_Hw_threads, _Count, _Passed_reduce, _Passed_transform, _Tag};
_Operation._Basis1._Populate(_Operation._Team, _UFirst);
_Seek_wrapped(_Dest, _Operation._Basis2._Populate(_Operation._Team, _UDest));
_Run_chunked_parallel_work(_Hw_threads, _Operation);
return _Dest;
_CATCH(const _Parallelism_resources_exhausted&)
// fall through to serial case below
_CATCH_END
}
_Seek_wrapped(_Dest,
_STD transform_inclusive_scan(_UFirst, _ULast, _UDest, _Pass_fn(_Reduce_op), _Pass_fn(_Transform_op)));
return _Dest;
}
}
_Seek_wrapped(_Dest,
_STD transform_inclusive_scan(_UFirst, _ULast, _Get_unwrapped_n(_Dest, _Idl_distance<_FwdIt1>(_UFirst, _ULast)),
_Pass_fn(_Reduce_op), _Pass_fn(_Transform_op)));
return _Dest;
}
// PARALLEL FUNCTION TEMPLATE adjacent_difference
template <class _FwdIt1, class _FwdIt2, class _BinOp>
struct _Static_partitioned_adjacent_difference2 {
using _Diff = _Common_diff_t<_FwdIt1, _FwdIt2>;
_Static_partition_team<_Diff> _Team;
// note offset partitioning:
_Static_partition_range<_FwdIt1, _Diff> _Basis1; // contains partition of [_First, _Last - 1)
_Static_partition_range<_FwdIt2, _Diff> _Basis2; // contains partition of [_Dest + 1, _Dest + (_Last - _First))
_BinOp _Diff_op;
_Static_partitioned_adjacent_difference2(
const size_t _Hw_threads, const _Diff _Count, const _FwdIt1 _First, _BinOp _Diff_op_, const _FwdIt2&)
: _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Basis1{}, _Basis2{}, _Diff_op(_Diff_op_) {
_Basis1._Populate(_Team, _First);
}
_Cancellation_status _Process_chunk() {
const auto _Key = _Team._Get_next_key();
if (!_Key) {
return _Cancellation_status::_Canceled;
}
const auto _Chunk_number = _Key._Chunk_number;
auto _In_range = _Basis1._Get_chunk(_Key);
auto _Dest = _Basis2._Get_first(_Chunk_number, _Team._Get_chunk_offset(_Chunk_number));
auto _Next = _In_range._First;
do {
++_Next; // note: steps 1 element into the following chunk
*_Dest = _Diff_op(*_Next, *_In_range._First);
++_Dest;
_In_range._First = _Next;
} while (_In_range._First != _In_range._Last);
return _Cancellation_status::_Running;
}
static void __stdcall _Threadpool_callback(
__std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ {
_Run_available_chunked_work(*static_cast<_Static_partitioned_adjacent_difference2*>(_Context));
}
};
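// Worked example of the offset partitioning above: for input {10, 13, 17, 22}, _Basis1 partitions the
// pair starts {10, 13, 17} and _Basis2 partitions the destinations starting at _Dest + 1, so the
// chunks compute _Diff_op(next, current) for the pairs (13, 10), (17, 13), (22, 17); the caller
// separately assigns *_Dest = *_First, yielding {10, 3, 4, 5} when _Diff_op is minus<>.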
template <class _FwdIt1, class _FwdIt2, class _BinOp>
_FwdIt2 _Adjacent_difference_seq(_FwdIt1 _First, const _FwdIt1 _Last, _FwdIt2 _Dest, _BinOp _Diff_op) {
// compute adjacent differences into _Dest, serially
if (_First != _Last) {
*_Dest = *_First;
++_Dest;
for (auto _Next = _First; ++_Next != _Last; _First = _Next) {
*_Dest = _Diff_op(*_Next, *_First);
++_Dest;
}
}
return _Dest;
}
template <class _ExPo, class _FwdIt1, class _FwdIt2, class _BinOp, _Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_FwdIt2 adjacent_difference(_ExPo&&, const _FwdIt1 _First, const _FwdIt1 _Last, _FwdIt2 _Dest, _BinOp _Diff_op) noexcept
/* terminates */ {
// compute adjacent differences into _Dest
_REQUIRE_PARALLEL_ITERATOR(_FwdIt1);
_REQUIRE_PARALLEL_ITERATOR(_FwdIt2);
_Adl_verify_range(_First, _Last);
auto _UFirst = _Get_unwrapped(_First);
const auto _ULast = _Get_unwrapped(_Last);
if constexpr (remove_reference_t<_ExPo>::_Parallelize) {
const size_t _Hw_threads = __std_parallel_algorithms_hw_threads();
if (_Hw_threads > 1) { // parallelize on multiprocessor machines
auto _Count = _STD distance(_UFirst, _ULast);
const auto _UDest = _Get_unwrapped_n(_Dest, _Count);
if (_Count >= 2) { // ... with at least 2 elements
_TRY_BEGIN
--_Count; // note unusual offset partitioning
_Static_partitioned_adjacent_difference2 _Operation{
_Hw_threads, _Count, _UFirst, _Pass_fn(_Diff_op), _UDest};
auto _Result = _Operation._Basis2._Populate(_Operation._Team, _Next_iter(_UDest));
const _Work_ptr _Work_op{_Operation};
// setup complete, hereafter nothrow or terminate
_Work_op._Submit_for_chunks(_Hw_threads, _Operation._Team._Chunks);
// must be done after setup is complete to avoid duplicate assign in serial fallback:
*_UDest = *_UFirst;
_Run_available_chunked_work(_Operation);
_Seek_wrapped(_Dest, _Result);
return _Dest;
_CATCH(const _Parallelism_resources_exhausted&)
// fall through to serial case below
_CATCH_END
}
_Seek_wrapped(_Dest, _Adjacent_difference_seq(_UFirst, _ULast, _UDest, _Pass_fn(_Diff_op)));
return _Dest;
}
}
// Don't call the serial adjacent_difference here; it is specified to copy each element into a
// temporary, which _Adjacent_difference_seq avoids.
_Seek_wrapped(_Dest, _Adjacent_difference_seq(_UFirst, _ULast,
_Get_unwrapped_n(_Dest, _Idl_distance<_FwdIt1>(_UFirst, _ULast)), _Pass_fn(_Diff_op)));
return _Dest;
}
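// Illustrative usage of the parallel adjacent_difference above (a non-normative sketch; same includes
// as the exclusive_scan example earlier in this header):
//   std::vector<int> v{10, 13, 17, 22};
//   std::vector<int> out(v.size());
//   std::adjacent_difference(std::execution::par, v.begin(), v.end(), out.begin(), std::minus<>{});
//   // out == {10, 3, 4, 5}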
_STD_END
#pragma pop_macro("new")
_STL_RESTORE_CLANG_WARNINGS
#pragma warning(pop)
#pragma pack(pop)
#endif // _HAS_CXX17
#endif // _STL_COMPILER_PREPROCESSOR
#endif // _EXECUTION_