// execution standard header

// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#pragma once
#ifndef _EXECUTION_
#define _EXECUTION_
#include <yvals_core.h>
#if _STL_COMPILER_PREPROCESSOR

#if !_HAS_CXX17
#pragma message("The contents of <execution> are available only with C++17 or later.")
#else // ^^^ !_HAS_CXX17 / _HAS_CXX17 vvv
#include <algorithm>
#include <atomic>
#include <memory>
#include <mutex>
#include <numeric>
#include <queue>
#include <vector>

#pragma pack(push, _CRT_PACKING)
#pragma warning(push, _STL_WARNING_LEVEL)
#pragma warning(disable : _STL_DISABLED_WARNINGS)
_STL_DISABLE_CLANG_WARNINGS
#pragma push_macro("new")
#undef new

_EXTERN_C
// If on Windows XP, returns 1 (disabling parallelism); otherwise, returns the number of hardware threads available.
_NODISCARD unsigned int __stdcall __std_parallel_algorithms_hw_threads() noexcept;

// Windows Vista thread pool interface; __std_parallel_algorithms_hw_threads must be called on the current
// thread before calling any of the below.
#ifdef _M_CEE
using __std_TP_WORK              = void;
using __std_TP_CALLBACK_INSTANCE = void;
using __std_TP_CALLBACK_ENVIRON  = void;
#else // ^^^ _M_CEE ^^^ // vvv !_M_CEE vvv
struct __std_TP_WORK; // not defined
struct __std_TP_CALLBACK_INSTANCE; // not defined
struct __std_TP_CALLBACK_ENVIRON; // not defined
#endif // _M_CEE

using __std_PTP_WORK              = __std_TP_WORK*;
using __std_PTP_CALLBACK_INSTANCE = __std_TP_CALLBACK_INSTANCE*;
using __std_PTP_CALLBACK_ENVIRON  = __std_TP_CALLBACK_ENVIRON*;

using __std_PTP_WORK_CALLBACK = void(__stdcall*)(
    _Inout_ __std_PTP_CALLBACK_INSTANCE, _Inout_opt_ void*, _Inout_ __std_PTP_WORK);

_NODISCARD __std_PTP_WORK __stdcall __std_create_threadpool_work(
    _In_ __std_PTP_WORK_CALLBACK, _Inout_opt_ void*, _In_opt_ __std_PTP_CALLBACK_ENVIRON) noexcept;
void __stdcall __std_submit_threadpool_work(_Inout_ __std_PTP_WORK) noexcept;
void __stdcall __std_bulk_submit_threadpool_work(_Inout_ __std_PTP_WORK, _In_ size_t) noexcept;
void __stdcall __std_close_threadpool_work(_Inout_ __std_PTP_WORK) noexcept;
void __stdcall __std_wait_for_threadpool_work_callbacks(_Inout_ __std_PTP_WORK, _In_ int) noexcept;

void __stdcall __std_execution_wait_on_uchar(
    _In_ const volatile unsigned char* _Address, _In_ unsigned char _Compare) noexcept;
void __stdcall __std_execution_wake_by_address_all(_In_ const volatile void* _Address) noexcept;
_END_EXTERN_C

_STD_BEGIN
constexpr size_t _Oversubscription_multiplier = 32;
constexpr size_t _Oversubmission_multiplier   = 4;
constexpr size_t _Still_active                = static_cast<size_t>(-1);

// EXECUTION POLICIES
namespace execution {
    class sequenced_policy { // request for sequential execution with termination
    public:
        using _Standard_execution_policy   = int;
        static constexpr bool _Parallelize = false;
    };

    inline constexpr sequenced_policy seq{/* unspecified */};

    class parallel_policy { // request for parallel execution with termination
    public:
        using _Standard_execution_policy   = int;
        static constexpr bool _Parallelize = true;
    };

    inline constexpr parallel_policy par{/* unspecified */};

    class parallel_unsequenced_policy {
        // request for parallel execution without thread identity with termination
        // (at this time, equivalent to parallel_policy)
    public:
        using _Standard_execution_policy   = int;
        static constexpr bool _Parallelize = true;
    };

    inline constexpr parallel_unsequenced_policy par_unseq{/* unspecified */};
} // namespace execution

template <>
struct is_execution_policy<execution::sequenced_policy> : true_type {}; // sequenced_policy is an execution policy

template <>
struct is_execution_policy<execution::parallel_policy> : true_type {}; // parallel_policy is an execution policy
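// Illustrative usage of the policy objects above (a sketch, not part of this header): in a program that
// includes <execution>, <algorithm>, and <vector>, the parallel algorithm overloads defined later in this
// file can be driven as follows. seq requests serial execution; par and par_unseq may dispatch chunks of
// the input range to the Windows thread pool machinery below.
//
//     #include <algorithm>
//     #include <execution>
//     #include <vector>
//
//     int main() {
//         std::vector<int> values(1'000'000, 1);
//         const bool all_ones = std::all_of(
//             std::execution::par, values.begin(), values.end(), [](const int x) { return x == 1; });
//         std::for_each(std::execution::par_unseq, values.begin(), values.end(), [](int& x) { x *= 2; });
//         return all_ones ? 0 : 1;
//     }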
template <>
struct is_execution_policy<execution::parallel_unsequenced_policy> : true_type {
}; // parallel_unsequenced_policy is an execution policy

// STRUCT _Parallelism_resources_exhausted
struct _Parallelism_resources_exhausted : exception {
    _NODISCARD virtual const char* __CLR_OR_THIS_CALL what() const noexcept override {
        // return pointer to message string
        return "Insufficient resources were available to use additional parallelism.";
    }

#if !_HAS_EXCEPTIONS
protected:
    virtual void _Doraise() const override { // perform class-specific exception handling
        _RAISE(*this);
    }
#endif // !_HAS_EXCEPTIONS
};

[[noreturn]] inline void _Throw_parallelism_resources_exhausted() {
    _THROW(_Parallelism_resources_exhausted{});
}

// ENUM CLASS _Cancellation_status
enum class _Cancellation_status : bool { _Running, _Canceled };

// STRUCT _Cancellation_token
struct _Cancellation_token {
    atomic<_Cancellation_status> _Is_canceled_impl{_Cancellation_status::_Running};

    bool _Is_canceled() const {
        return _Is_canceled_impl.load() == _Cancellation_status::_Canceled;
    }

    bool _Is_canceled_relaxed() const {
        return _Is_canceled_impl.load(memory_order_relaxed) == _Cancellation_status::_Canceled;
    }

    void _Cancel() {
        _Is_canceled_impl.store(_Cancellation_status::_Canceled);
    }

    _Cancellation_status _Status() const {
        return _Is_canceled_impl.load();
    }
};

// CLASS _Work_ptr
class _Work_ptr {
public:
    template <class _Work, enable_if_t<!is_same_v<decay_t<_Work>, _Work_ptr>, int> = 0>
    explicit _Work_ptr(_Work& _Operation)
        : _Ptp_work(__std_create_threadpool_work(
            &_Work::_Threadpool_callback, _STD addressof(_Operation), nullptr)) {
        // register work with the thread pool
        // usually, after _Work_ptr is constructed, a parallel algorithm runs to completion or terminates
        static_assert(noexcept(_Work::_Threadpool_callback(_STD declval<__std_PTP_CALLBACK_INSTANCE>(),
                          _STD declval<void*>(), _STD declval<__std_PTP_WORK>())),
            "Threadpool callbacks must be noexcept to enforce termination");

        if (!_Ptp_work) { // usually, the last place a bailout to serial execution can occur
            _Throw_parallelism_resources_exhausted();
        }
    }

    _Work_ptr(const _Work_ptr&) = delete;
    _Work_ptr& operator=(const _Work_ptr&) = delete;

    ~_Work_ptr() noexcept {
        __std_wait_for_threadpool_work_callbacks(_Ptp_work, true);
        __std_close_threadpool_work(_Ptp_work);
    }

    void _Submit() const noexcept {
        __std_submit_threadpool_work(_Ptp_work);
    }

    void _Submit(const size_t _Submissions) const noexcept {
        __std_bulk_submit_threadpool_work(_Ptp_work, _Submissions);
    }

    void _Submit_for_chunks(const size_t _Hw_threads, const size_t _Chunks) const noexcept {
        _Submit((_STD min)(_Hw_threads * _Oversubmission_multiplier, _Chunks));
    }

private:
    __std_PTP_WORK _Ptp_work;
};

// FUNCTION TEMPLATE _Run_available_chunked_work
template <class _Work>
void _Run_available_chunked_work(_Work& _Operation) {
    while (_Operation._Process_chunk() == _Cancellation_status::_Running) {
        // process while there are chunks remaining
    }
}

// FUNCTION TEMPLATE _Run_chunked_parallel_work
template <class _Work>
void _Run_chunked_parallel_work(const size_t _Hw_threads, _Work& _Operation) {
    // process chunks of _Operation on the thread pool
    const _Work_ptr _Work_op{_Operation};
    // setup complete, hereafter nothrow or terminate
    _Work_op._Submit_for_chunks(_Hw_threads, _Operation._Team._Chunks);
    _Run_available_chunked_work(_Operation);
}

// CHUNK CALCULATION FUNCTIONS
// The parallel algorithms library below assumes that distance(first, last) fits into a size_t;
// forward iterators must refer to objects in memory and therefore must meet this requirement.
// // Unlike the serial algorithms library, which can stay in the difference_type domain, here we need // to talk with vector (which speaks size_t), and with Windows, which wants to speak unsigned int. // // This assumption should be localized to the chunk calculation functions; the rest of // the library assumes that chunk numbers can be static_cast into the difference_type domain. // FUNCTION TEMPLATE _Get_chunked_work_chunk_count template constexpr size_t _Get_chunked_work_chunk_count(const size_t _Hw_threads, const _Diff _Count) { // get the number of chunks to break work into to parallelize const auto _Size_count = static_cast(_Count); // no overflow due to forward iterators // we assume _Hw_threads * _Oversubscription_multiplier does not overflow return (_STD min)(_Hw_threads * _Oversubscription_multiplier, _Size_count); } // FUNCTION TEMPLATE _Get_least2_chunked_work_chunk_count template constexpr size_t _Get_least2_chunked_work_chunk_count(const size_t _Hw_threads, const _Diff _Count) { // get the number of chunks to break work into to parallelize, assuming chunks must be of size 2 const auto _Size_count = static_cast(_Count); // no overflow due to forward iterators // we assume _Hw_threads * _Oversubscription_multiplier does not overflow return _Get_chunked_work_chunk_count(_Hw_threads, _Size_count / 2); } // STRUCT TEMPLATE _Parallelism_allocator struct _Parallelism_allocate_traits { __declspec(allocator) static void* _Allocate(const size_t _Bytes) { void* _Result = ::operator new(_Bytes, nothrow); if (!_Result) { _Throw_parallelism_resources_exhausted(); } return _Result; } #ifdef __cpp_aligned_new __declspec(allocator) static void* _Allocate_aligned(const size_t _Bytes, const size_t _Align) { void* _Result = ::operator new (_Bytes, align_val_t{_Align}, nothrow); if (!_Result) { _Throw_parallelism_resources_exhausted(); } return _Result; } #endif // __cpp_aligned_new }; template struct _Parallelism_allocator { using value_type = _Ty; _Parallelism_allocator() = default; template constexpr _Parallelism_allocator(const _Parallelism_allocator<_Other>&) noexcept {} _Ty* allocate(const size_t _Count) { return static_cast<_Ty*>( _Allocate<_New_alignof<_Ty>, _Parallelism_allocate_traits>(_Get_size_of_n(_Count))); } void deallocate(_Ty* const _Ptr, const size_t _Count) { // no overflow check on the following multiply; we assume _Allocate did that check _Deallocate<_New_alignof<_Ty>>(_Ptr, sizeof(_Ty) * _Count); } template bool operator==(const _Parallelism_allocator<_Other>&) const noexcept { return true; } template bool operator!=(const _Parallelism_allocator<_Other>&) const noexcept { return false; } }; template using _Parallel_vector = vector<_Ty, _Parallelism_allocator<_Ty>>; template struct _Generalized_sum_drop { // drop off point for GENERALIZED_SUM intermediate results _Ty* _Data; size_t _Slots; atomic _Frontier; explicit _Generalized_sum_drop(const size_t _Slots) : _Data(static_cast<_Ty*>( _Allocate<_New_alignof<_Ty>, _Parallelism_allocate_traits>(_Get_size_of_n(_Slots)))), _Slots(_Slots), _Frontier(0) {} ~_Generalized_sum_drop() noexcept { // pre: the caller has synchronized with all threads that modify _Data. _Destroy_range(begin(), end()); // no overflow check on the following multiply; we assume _Allocate did that check _Deallocate<_New_alignof<_Ty>>(_Data, sizeof(_Ty) * _Slots); } template void _Add_result(_Args&&... 
_Vals) noexcept /* terminates */ { // constructs a _Ty in place with _Vals parameters perfectly forwarded // pre: the number of results added is less than the size the drop was constructed with const size_t _Target = _Frontier++; _Construct_in_place(_Data[_Target], _STD forward<_Args>(_Vals)...); } _Ty* begin() { return _Data; } _Ty* end() { return _Data + _Frontier.load(memory_order_relaxed); } }; // VARIABLE TEMPLATE _Use_atomic_iterator template struct _Atomic_is_usually_lock_free : bool_constant::is_always_lock_free> { // deferred evaluation of atomic::is_always_lock_free }; template inline constexpr bool _Use_atomic_iterator = conjunction_v>, is_trivially_copyable<_FwdIt>, _Atomic_is_usually_lock_free<_FwdIt>>; // STRUCT TEMPLATE _Parallel_choose_min_result template struct _Parallel_choose_min_result { // parallel results collector which uses atomic<_Ty> to choose the minimum value _Ty _Last; atomic<_Ty> _Result; explicit _Parallel_choose_min_result(_Ty _Last_) : _Last{_Last_}, _Result{_Last} {} _Ty _Get_result() const { // load the imbued value return _Result.load(memory_order_relaxed); } bool _Complete() const { // tests whether a result has been found return _Result.load(memory_order_relaxed) != _Last; } void _Imbue(size_t, const _Ty _Local_result) { // atomically sets the result to min(result, _Local_result) _Ty _Expected{_Last}; while (!_Result.compare_exchange_weak(_Expected, _Local_result) && _Expected > _Local_result) { // keep trying } } }; // STRUCT TEMPLATE _Parallel_choose_max_result template struct _Parallel_choose_max_result { // parallel results collector which uses atomic<_Ty> to choose the maximum value _Ty _Last; atomic<_Ty> _Result; explicit _Parallel_choose_max_result(_Ty _Last_) : _Last{_Last_}, _Result{_Last} {} _Ty _Get_result() const { // load the imbued value return _Result.load(memory_order_relaxed); } bool _Complete() const { // tests whether a result has been found return _Result.load(memory_order_relaxed) != _Last; } void _Imbue(size_t, const _Ty _Local_result) { // atomically sets the result to max(result, _Local_result) _Ty _Expected{_Last}; if (_Result.compare_exchange_strong(_Expected, _Local_result)) { return; } while (_Expected < _Local_result && !_Result.compare_exchange_weak(_Expected, _Local_result)) { // keep trying } } }; // STRUCT TEMPLATE _Parallel_choose_min_chunk template struct _Parallel_choose_min_chunk { // parallel results collector which uses atomic to choose the lowest chunk's result _Ty _Result; atomic _Selected_chunk; mutex _Mtx; explicit _Parallel_choose_min_chunk(_Ty _Last) : _Result(_Last), _Selected_chunk{_Still_active}, _Mtx{} {} _Ty _Get_result() const { // load the imbued value return _Result; } bool _Complete() const { // tests whether a result has been found return _Selected_chunk.load(memory_order_relaxed) != _Still_active; } void _Imbue(const size_t _Chunk, const _Ty _Local_result) { // atomically sets the result to the lowest chunk's value size_t _Expected = _Still_active; while (!_Selected_chunk.compare_exchange_weak(_Expected, _Chunk)) { // note: _Still_active is the maximum possible value, so it gets ignored implicitly if (_Chunk > _Expected) { return; } } lock_guard _Lck(_Mtx); // TRANSITION, VSO-671180 if (_Selected_chunk.load(memory_order_relaxed) == _Chunk) { _Result = _Local_result; } } }; // STRUCT TEMPLATE _Parallel_choose_max_chunk template struct _Parallel_choose_max_chunk { // parallel results collector which uses atomic to choose the highest chunk's result _Ty _Result; atomic _Selected_chunk; mutex _Mtx; 
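// Note: this collector and the three above share one protocol: a worker publishes a candidate with
// _Imbue(chunk_number, value), other chunks may poll _Complete() to stop early once any result exists, and
// the caller reads the winner via _Get_result() after all thread pool callbacks have been joined. The
// *_result collectors resolve ties directly through atomic<_Ty>; the *_chunk collectors (selected when
// _Use_atomic_iterator is false) pick the winning chunk number atomically and guard the non-atomic
// _Result with _Mtx.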
explicit _Parallel_choose_max_chunk(_Ty _Last) : _Result(_Last), _Selected_chunk{_Still_active}, _Mtx{} {} _Ty _Get_result() const { // load the imbued value return _Result; } bool _Complete() const { // tests whether a result has been found return _Selected_chunk.load(memory_order_relaxed) != _Still_active; } void _Imbue(const size_t _Chunk, const _Ty _Local_result) { // atomically sets the result to the highest chunk's value size_t _Expected = _Still_active; while (!_Selected_chunk.compare_exchange_weak(_Expected, _Chunk)) { // wrap _Still_active down to 0 so that only 1 branch is necessary: if (_Chunk + 1 < _Expected + 1) { return; } } lock_guard _Lck(_Mtx); // TRANSITION, VSO-671180 if (_Selected_chunk.load(memory_order_relaxed) == _Chunk) { _Result = _Local_result; } } }; // CLASS TEMPLATE _Work_stealing_deque template struct alignas(_Ty) alignas(size_t) alignas(_Atomic_counter_t) _Circular_buffer { // work stealing deque extent type static_assert(is_trivial_v<_Ty>, "Work stealing deques work only with trivial operations"); size_t _Log_size; _Atomic_counter_t _Ref_count; void _Release() { static_assert(is_trivially_destructible_v<_Circular_buffer>, "global delete requires trivial destruction"); if (_MT_DECR(_Ref_count) == 0) { ::operator delete(this); } } static _Circular_buffer* _Allocate_circular_buffer(const size_t _New_log_size) { // allocate a circular buffer with space for 2^_New_log_size elements if (_New_log_size >= 32) { _Throw_parallelism_resources_exhausted(); } const size_t _Count = static_cast(1) << _New_log_size; constexpr size_t _Max_bytes = static_cast(-1) - sizeof(_Circular_buffer); if (_Max_bytes / sizeof(_Ty) < _Count) { _Throw_parallelism_resources_exhausted(); } const size_t _Result_bytes = _Count * sizeof(_Ty) + sizeof(_Circular_buffer); static_assert(alignof(_Ty) <= alignof(max_align_t), "incapable of supporting the requested alignment"); const auto _Result = static_cast<_Circular_buffer*>(::operator new(_Result_bytes)); _Result->_Log_size = _New_log_size; _Result->_Ref_count = 1; return _Result; } static _Circular_buffer* _New_circular_buffer() { // allocate a circular buffer with a default number of elements return _Allocate_circular_buffer(6); // start with 64 elements } _Ty* _Get_base() { // get the base address where the _Ty instances are stored return reinterpret_cast<_Ty*>(this + 1); } const _Ty* _Get_base() const { // get the base address where the _Ty instances are stored return reinterpret_cast(this + 1); } _Ty& _Subscript(const size_t _Idx) { // get a reference to the _Idxth element const auto _Mask = (static_cast(1) << _Log_size) - static_cast(1); return _Get_base()[_Idx & _Mask]; } const _Ty& _Subscript(const size_t _Idx) const { // get a reference to the _Idxth element const auto _Mask = (static_cast(1) << _Log_size) - static_cast(1); return _Get_base()[_Idx & _Mask]; } _Circular_buffer* _Grow(const size_t _Bottom, const size_t _Top) const { // create a bigger _Circular_buffer suitable for use by a _Work_stealing_deque<_Ty> with bounds _Bottom and _Top const size_t _New_log_size = _Log_size + 1; _Circular_buffer* _Result = _Allocate_circular_buffer(_New_log_size); for (size_t _Idx = _Top; _Idx < _Bottom; ++_Idx) { _Result->_Subscript(_Idx) = _Subscript(_Idx); } return _Result; } }; #pragma warning(push) #pragma warning(disable : 4324) // structure was padded due to alignment specifier template class alignas(hardware_destructive_interference_size) _Work_stealing_deque { // thread-local work-stealing deque, which allows efficient access from a single 
owner thread at the "bottom" // of the queue, and any thread access to the "top" of the queue. Originally described in the paper // "Dynamic Circular Work-Stealing Deque" by David Chase and Yossi Lev public: _Work_stealing_deque() = default; _Work_stealing_deque(const _Work_stealing_deque&) = delete; _Work_stealing_deque& operator=(const _Work_stealing_deque&) = delete; ~_Work_stealing_deque() noexcept { _Segment->_Release(); } void _Push_bottom(_Ty& _Val) { // attempts to push _Val onto the bottom of this queue // may be accessed by owning thread only const auto _Local_b = _Bottom.load(); if (_Local_b == SIZE_MAX) { // we assume that any input range won't be divided into more than SIZE_MAX subproblems; // treat overflow of that kind as OOM _Throw_parallelism_resources_exhausted(); } const auto _Local_t = _Top.load(); const auto _Size = _Local_b - _Local_t; if (_Size >= (static_cast(1) << _Segment->_Log_size)) { auto _New_segment = _Segment->_Grow(_Local_b, _Local_t); _Circular_buffer<_Ty>* _Detached_segment; { lock_guard _Lck(_Segment_lock); // TRANSITION, VSO-671180 _Detached_segment = _STD exchange(_Segment, _New_segment); } // unlock _Detached_segment->_Release(); } _Segment->_Subscript(_Local_b) = _Val; _Bottom.store(_Local_b + 1U); } bool _Steal(_Ty& _Val) noexcept { // attempt to pop an item from the top of this deque // may be accessed by any thread // returns false if the deque was empty and _Val is indeterminate; otherwise, returns true and sets _Val to the // element retrieved from the top of the deque. auto _Local_t = _Top.load(); size_t _Desired_t; do { if (_Bottom.load() <= _Local_t) { // deque was empty return false; } _Circular_buffer<_Ty>* _Stealing_segment; { lock_guard _Lck(_Segment_lock); // TRANSITION, VSO-671180 _Stealing_segment = _Segment; _MT_INCR(_Stealing_segment->_Ref_count); } _Val = _Stealing_segment->_Subscript(_Local_t); // speculative read/write data race _Stealing_segment->_Release(); // The above is technically prohibited by the C++ memory model, but happens // to be well defined on all hardware this implementation targets. // Hardware with trap representations or similar must not use this implementation. 
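// Note: the value copied into _Val above is only considered stolen if the compare_exchange_strong on
// _Top below succeeds; if another thief (or the owner racing through _Try_pop_bottom) advanced _Top
// first, the CAS fails, _Local_t is refreshed, and the loop either retries or reports an empty deque.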
_Desired_t = _Local_t + 1U; } while (!_Top.compare_exchange_strong(_Local_t, _Desired_t)); // if a data race occurred, try again return true; } bool _Try_pop_bottom(_Ty& _Val) noexcept { // attempt to pop an item from the bottom of this deque into _Val // may be accessed by owning thread only auto _Local_b = _Bottom.load(); if (_Local_b == 0) { // queue never contained any elements (should never happen) return false; } --_Local_b; _Bottom.store(_Local_b); auto _Local_t = _Top.load(); if (_Local_b < _Local_t) { // all elements were stolen before we got here _Bottom.store(_Local_t); return false; } // memory model says following load is OK, since _Push_bottom can't run concurrently _Val = _Segment->_Subscript(_Local_b); if (_Local_b > _Local_t) { // other threads only look at top, so we get the bottom without synchronization return true; } // We're trying to read the last element that another thread may be trying to steal; // see who gets to keep the element through _Top (effectively, steal from ourselves) const auto _Desired_top = _Local_t + 1U; if (_Top.compare_exchange_strong(_Local_t, _Desired_top)) { _Bottom.store(_Desired_top); return true; } else { _Bottom.store(_Local_t); return false; } } private: atomic _Bottom{0}; // modified by only owning thread atomic _Top{0}; // modified by all threads _Guarded_by_(_Segment_lock) _Circular_buffer<_Ty>* _Segment{_Circular_buffer<_Ty>::_New_circular_buffer()}; mutex _Segment_lock{}; }; #pragma warning(pop) // STRUCT TEMPLATE _Work_stealing_membership enum class _Steal_result { _Success, _Abort, _Done }; template struct _Work_stealing_team; template struct _Work_stealing_membership { // thread-local "ticket" that team members use to talk with a _Work_stealing_team using _Diff = typename _Ty::difference_type; size_t _Id; _Work_stealing_team<_Ty>* _Team; _Diff _Work_complete; void _Push_bottom(_Ty& _Val) { _Team->_Queues[_Id]._Push_bottom(_Val); } bool _Try_pop_bottom(_Ty& _Val) noexcept { return _Team->_Queues[_Id]._Try_pop_bottom(_Val); } _Steal_result _Steal(_Ty& _Val) noexcept { _Diff _Remaining; const auto _Completed_this_time = _STD exchange(_Work_complete, {}); if (_Completed_this_time == 0) { _Remaining = _Team->_Remaining_work.load(); } else { _Remaining = _Team->_Remaining_work -= _Completed_this_time; } if (_Remaining == 0) { return _Steal_result::_Done; } const size_t _High = _Team->_Queues_used.load() + 1; size_t _Idx = _Id; for (;;) { if (_Idx == 0) { _Idx = _High; } --_Idx; if (_Idx == _Id) { return _Steal_result::_Abort; } if (_Team->_Queues[_Idx]._Steal(_Val)) { return _Steal_result::_Success; } } } void _Leave() noexcept { _Team->_Leave_team(_Id); } }; // STRUCT TEMPLATE _Work_stealing_team template struct _Work_stealing_team { // inter-thread communication for threads working on a single task using _Diff = typename _Ty::difference_type; static _Parallel_vector _Get_queues(const size_t _Queue_count) { _Parallel_vector _Result(_Queue_count); _STD iota(_Result.begin(), _Result.end(), size_t{}); return _Result; } _Work_stealing_team(size_t _Threads, _Diff _Total_work) : _Queues(_Threads), _Queues_used(0), _Remaining_work(_Total_work), _Available_mutex(), _Available_queues(greater<>{}, _Get_queues(_Threads)) { // register work with the thread pool } _Work_stealing_membership<_Ty> _Join_team() noexcept { size_t _Id; { lock_guard _Lck(_Available_mutex); // TRANSITION, VSO-671180 _Id = _Available_queues.top(); _Available_queues.pop(); } // unlock // set _Queues_used to the high water mark of queues used size_t _High_water = 
_Queues_used.load(); while (_High_water < _Id && !_Queues_used.compare_exchange_weak(_High_water, _Id)) { // keep trying } return _Work_stealing_membership<_Ty>{_Id, this, 0}; } void _Leave_team(size_t _Id) noexcept { lock_guard _Lck(_Available_mutex); // TRANSITION, VSO-671180 _Available_queues.push(_Id); } _Parallel_vector<_Work_stealing_deque<_Ty>> _Queues; atomic _Queues_used; atomic<_Diff> _Remaining_work; mutex _Available_mutex; priority_queue, greater<>> _Available_queues; }; // STRUCT TEMPLATE _Static_partition_key template struct _Static_partition_key { // "pointer" identifying a static partition size_t _Chunk_number; // In range [0, numeric_limits<_Diff>::max()] _Diff _Start_at; _Diff _Size; explicit operator bool() const { // test if this is a valid key return _Chunk_number != static_cast(-1); } }; // STRUCT TEMPLATE _Static_partition_team template struct _Static_partition_team { // common data for all static partitioned ops atomic _Consumed_chunks; size_t _Chunks; _Diff _Count; _Diff _Chunk_size; _Diff _Unchunked_items; _Static_partition_team(const _Diff _Count_, const size_t _Chunks_) : _Consumed_chunks{0}, _Chunks{_Chunks_}, _Count{_Count_}, _Chunk_size{static_cast<_Diff>( _Count_ / static_cast<_Diff>(_Chunks_))}, _Unchunked_items{static_cast<_Diff>(_Count_ % static_cast<_Diff>(_Chunks_))} { // Calculate common data for statically partitioning iterator ranges. // pre: _Count_ >= _Chunks_ && _Chunks_ >= 1 } _Static_partition_key<_Diff> _Get_chunk_key(const size_t _This_chunk) const { const auto _This_chunk_diff = static_cast<_Diff>(_This_chunk); auto _This_chunk_size = _Chunk_size; auto _This_chunk_start_at = static_cast<_Diff>(_This_chunk_diff * _This_chunk_size); if (_This_chunk_diff < _Unchunked_items) { // chunks at index lower than _Unchunked_items get an extra item, // and need to shift forward by all their predecessors' extra items _This_chunk_start_at += _This_chunk_diff; ++_This_chunk_size; } else { // chunks without an extra item need to account for all the extra items _This_chunk_start_at += _Unchunked_items; } return {_This_chunk, _This_chunk_start_at, _This_chunk_size}; } _Diff _Get_chunk_offset(const size_t _This_chunk) const { const auto _This_chunk_diff = static_cast<_Diff>(_This_chunk); return _This_chunk_diff * _Chunk_size + (_STD min)(_This_chunk_diff, _Unchunked_items); } _Static_partition_key<_Diff> _Get_next_key() { // retrieves the next static partition key to process, if it exists; // otherwise, retrieves an invalid partition key const auto _This_chunk = _Consumed_chunks++; if (_This_chunk < _Chunks) { return _Get_chunk_key(_This_chunk); } return {static_cast(-1), 0, 0}; } }; // STRUCT TEMPLATE _Iterator_range template struct _Iterator_range { // record of a partition of work _FwdIt _First; _FwdIt _Last; }; // STRUCT TEMPLATE _Static_partition_range template , bool = _Is_random_iter_v<_FwdIt>> struct _Static_partition_range; template struct _Static_partition_range<_RanIt, _Diff, true> { using _Target_diff = _Iter_diff_t<_RanIt>; using _URanIt = _Unwrapped_t; _URanIt _Start_at; using _Chunk_type = _Iterator_range<_URanIt>; _RanIt _Populate(const _Static_partition_team<_Diff>& _Team, _RanIt _First) { // statically partition a random-access iterator range and return next(_First, _Team._Count) // pre: _Populate hasn't yet been called on this instance auto _Result = _First + static_cast<_Target_diff>(_Team._Count); // does verification _Start_at = _Get_unwrapped(_First); return _Result; } bool _Populate(const _Static_partition_team<_Diff>& _Team, _RanIt 
_First, _RanIt _Last) { // statically partition a random-access iterator range and check if the range ends at _Last // pre: _Populate hasn't yet been called on this instance _Adl_verify_range(_First, _Last); _Start_at = _Get_unwrapped(_First); return _Team._Count == _Last - _First; } _URanIt _Get_first(size_t /* _Chunk_number */, const _Diff _Offset) { // get the first iterator for _Chunk _Chunk_number (which is at offset _Offset) return _Start_at + static_cast<_Target_diff>(_Offset); } _Chunk_type _Get_chunk(const _Static_partition_key<_Diff> _Key) const { // get a static partition chunk from a random-access range // pre: _Key was generated by the _Static_partition_team instance passed to a previous call to _Populate const auto _First = _Start_at + static_cast<_Target_diff>(_Key._Start_at); return {_First, _First + static_cast<_Target_diff>(_Key._Size)}; } }; template struct _Static_partition_range<_FwdIt, _Diff, false> { using _Target_diff = _Iter_diff_t<_FwdIt>; using _UFwdIt = _Unwrapped_t; _Parallel_vector<_UFwdIt> _Division_points; using _Chunk_type = _Iterator_range<_UFwdIt>; _FwdIt _Populate(const _Static_partition_team<_Diff>& _Team, _FwdIt _First) { // statically partition a forward iterator range and return next(_First, _Team._Count) // pre: _Populate hasn't yet been called on this instance const auto _Chunks = _Team._Chunks; _Division_points.resize(_Chunks + 1); // The following potentially narrowing cast is OK because caller has ensured // next(_First, _Team._Count) is valid (and _Count <= _Chunk_size) const auto _Chunk_size = static_cast<_Target_diff>(_Team._Chunk_size); const auto _Unchunked_items = _Team._Unchunked_items; auto _Result = _Division_points.begin(); *_Result = _Get_unwrapped(_First); for (_Diff _Idx{}; _Idx < _Unchunked_items; ++_Idx) { // record bounds of chunks with an extra item _STD advance(_First, static_cast<_Target_diff>(_Chunk_size + 1)); *++_Result = _Get_unwrapped(_First); } const auto _Diff_chunks = static_cast<_Diff>(_Chunks); for (_Diff _Idx = _Unchunked_items; _Idx < _Diff_chunks; ++_Idx) { // record bounds of chunks with no extra item _STD advance(_First, _Chunk_size); *++_Result = _Get_unwrapped(_First); } return _First; } bool _Populate(const _Static_partition_team<_Diff>& _Team, _FwdIt _First, _FwdIt _Last) { // statically partition a forward iterator range and check if the range ends at _Last // pre: _Populate hasn't yet been called on this instance const auto _Chunks = _Team._Chunks; _Division_points.resize(_Chunks + 1); const auto _Chunk_size = _Team._Chunk_size; const auto _Unchunked_items = _Team._Unchunked_items; auto _Result = _Division_points.begin(); *_Result = _Get_unwrapped(_First); for (_Diff _Idx{}; _Idx < _Unchunked_items; ++_Idx) { // record bounds of chunks with an extra item for (_Diff _This_chunk_size = _Chunk_size; 0 < _This_chunk_size--;) { if (_First == _Last) { return false; } ++_First; } *++_Result = _Get_unwrapped(_First); } const auto _Diff_chunks = static_cast<_Diff>(_Chunks); for (_Diff _Idx = _Unchunked_items; _Idx < _Diff_chunks; ++_Idx) { // record bounds of chunks with no extra item for (_Diff _This_chunk_size = _Chunk_size; 0 < _This_chunk_size--;) { if (_First == _Last) { return false; } ++_First; } *++_Result = _Get_unwrapped(_First); } return _First == _Last; } _UFwdIt _Get_first(const size_t _Chunk_number, _Diff /* _Offset */) { // get the first iterator for _Chunk _Chunk_number (which is at offset _Offset) return _Division_points[_Chunk_number]; } _Chunk_type _Get_chunk(const _Static_partition_key<_Diff> 
_Key) const { // get a static partition chunk from a forward range // pre: _Key was generated by the _Static_partition_team instance passed to a previous call to _Populate return {_Division_points[_Key._Chunk_number], _Division_points[_Key._Chunk_number + 1]}; } }; // STRUCT TEMPLATE _Static_partition_range_backward template , bool = _Is_random_iter_v<_BidIt>> struct _Static_partition_range_backward; template struct _Static_partition_range_backward<_RanIt, _Diff, true> { using _Target_diff = _Iter_diff_t<_RanIt>; _Unwrapped_t _Start_at; using _Chunk_type = _Iterator_range<_Unwrapped_t>; void _Populate(const _Static_partition_team<_Diff>& _Team, _RanIt _Last) { // statically partition a random-access iterator range ending at _Last // pre: _Populate hasn't yet been called on this instance _Start_at = _Get_unwrapped_n(_Last, -static_cast<_Target_diff>(_Team._Count)); } _Chunk_type _Get_chunk(const _Static_partition_key<_Diff> _Key) const { // get a static partition chunk from a random-access range // pre: _Key was generated by the _Static_partition_team instance passed to a previous call to _Populate const auto _Last = _Start_at - static_cast<_Target_diff>(_Key._Start_at); return {_Last - static_cast<_Target_diff>(_Key._Size), _Last}; } }; template struct _Static_partition_range_backward<_BidIt, _Diff, false> { using _Target_diff = _Iter_diff_t<_BidIt>; _Parallel_vector<_Unwrapped_t> _Division_points; using _Chunk_type = _Iterator_range<_Unwrapped_t>; void _Populate(const _Static_partition_team<_Diff>& _Team, _BidIt _Last) { // statically partition a bidirectional iterator range ending at _Last // pre: _Populate hasn't yet been called on this instance const auto _Chunks = _Team._Chunks; _Division_points.resize(_Chunks + 1); const auto _Neg_chunk_size = static_cast<_Target_diff>(-_Team._Chunk_size); const auto _Unchunked_items = _Team._Unchunked_items; auto _Result = _Division_points.begin(); // does range checking by incrementing in the checked domain *_Result = _Get_unwrapped(_Last); for (_Diff _Idx{}; _Idx < _Unchunked_items; ++_Idx) { _STD advance(_Last, static_cast<_Target_diff>(_Neg_chunk_size - 1)); *++_Result = _Get_unwrapped(_Last); } const auto _Diff_chunks = static_cast<_Diff>(_Chunks); for (_Diff _Idx = _Unchunked_items; _Idx < _Diff_chunks; ++_Idx) { _STD advance(_Last, _Neg_chunk_size); *++_Result = _Get_unwrapped(_Last); } } _Chunk_type _Get_chunk(const _Static_partition_key<_Diff> _Key) const { // get a static partition chunk from a bidirectional range // pre: _Key was generated by the _Static_partition_team instance passed to a previous call to _Populate return {_Division_points[_Key._Chunk_number + 1], _Division_points[_Key._Chunk_number]}; } }; // FUNCTION TEMPLATE _Distance_any template _Common_diff_t<_InIt1, _InIt2> _Distance_any(_InIt1 _First1, _InIt1 _Last1, _InIt2 _First2, _InIt2 _Last2) { // get the distance from 2 ranges which should have identical lengths if constexpr (_Is_random_iter_v<_InIt1>) { return _Last1 - _First1; } else if constexpr (_Is_random_iter_v<_InIt2>) { return _Last2 - _First2; } else { return _STD distance(_First1, _Last1); } } // FUNCTION TEMPLATE _Distance_min template _Common_diff_t<_InIt1, _InIt2> _Distance_min(_InIt1 _First1, const _InIt1 _Last1, _InIt2 _First2, const _InIt2 _Last2) { // get min(distance(_First1, _Last1), distance(_First2, _Last2)) using _CT = _Common_diff_t<_InIt1, _InIt2>; _CT _Result{}; if constexpr (_Is_random_iter_v<_InIt1> && _Is_random_iter_v<_InIt2>) { const _CT _Count1 = _Last1 - _First1; const _CT _Count2 = _Last2 - 
_First2; _Result = (_STD min)(_Count1, _Count2); } else if constexpr (_Is_random_iter_v<_InIt1>) { for (auto _Count1 = _Last1 - _First1; 0 < _Count1 && _First2 != _Last2; --_Count1) { ++_First2; ++_Result; } } else if constexpr (_Is_random_iter_v<_InIt2>) { for (auto _Count2 = _Last2 - _First2; 0 < _Count2 && _First1 != _Last1; --_Count2) { ++_First1; ++_Result; } } else { while (_First1 != _Last1 && _First2 != _Last2) { ++_First1; ++_First2; ++_Result; } } return _Result; } // PARALLEL FUNCTION TEMPLATE all_of template struct _Static_partitioned_all_of_family2 { // all_of/any_of/none_of task scheduled on the system thread pool _Static_partition_team<_Iter_diff_t<_FwdIt>> _Team; _Static_partition_range<_FwdIt> _Basis; _Pr _Pred; _Cancellation_token _Cancel_token; _Static_partitioned_all_of_family2( _FwdIt _First, const size_t _Hw_threads, const _Iter_diff_t<_FwdIt> _Count, _Pr _Pred_) : _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Basis{}, _Pred(_Pred_), _Cancel_token{} { _Basis._Populate(_Team, _First); } _Cancellation_status _Process_chunk() { if (_Cancel_token._Is_canceled()) { return _Cancellation_status::_Canceled; } const auto _Key = _Team._Get_next_key(); if (!_Key) { return _Cancellation_status::_Canceled; } const auto _Range = _Basis._Get_chunk(_Key); for (auto _First = _Range._First; _First != _Range._Last; ++_First) { if (_Pred(*_First) ? _Invert : !_Invert) { _Cancel_token._Cancel(); return _Cancellation_status::_Canceled; } } return _Cancellation_status::_Running; } static void __stdcall _Threadpool_callback( __std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ { _Run_available_chunked_work(*static_cast<_Static_partitioned_all_of_family2*>(_Context)); } }; template bool _All_of_family_parallel(_FwdIt _First, const _FwdIt _Last, _Pr _Pred) { // test if all elements in [_First, _Last) satisfy _Pred (or !_Pred if _Invert is true) in parallel const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); if (_Hw_threads > 1) { // parallelize on multiprocessor machines... const auto _Count = _STD distance(_First, _Last); if (_Count >= 2) { // ... with at least 2 elements _TRY_BEGIN _Static_partitioned_all_of_family2<_Invert, _FwdIt, _Pr> _Operation{_First, _Hw_threads, _Count, _Pred}; _Run_chunked_parallel_work(_Hw_threads, _Operation); return !_Operation._Cancel_token._Is_canceled_relaxed(); _CATCH(const _Parallelism_resources_exhausted&) // fall through to serial case below _CATCH_END } } for (; _First != _Last; ++_First) { if (_Pred(*_First) ? 
_Invert : !_Invert) { return false; } } return true; } template /* = 0 */> _NODISCARD bool all_of(_ExPo&&, _FwdIt _First, _FwdIt _Last, _Pr _Pred) noexcept /* terminates */ { // test if all elements in [_First, _Last) satisfy _Pred with the indicated execution policy _REQUIRE_PARALLEL_ITERATOR(_FwdIt); _Adl_verify_range(_First, _Last); auto _UFirst = _Get_unwrapped(_First); const auto _ULast = _Get_unwrapped(_Last); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { return _All_of_family_parallel(_UFirst, _ULast, _Pass_fn(_Pred)); } else { return _STD all_of(_UFirst, _ULast, _Pass_fn(_Pred)); } } // PARALLEL FUNCTION TEMPLATE any_of template /* = 0 */> _NODISCARD bool any_of(_ExPo&&, const _FwdIt _First, const _FwdIt _Last, _Pr _Pred) noexcept /* terminates */ { // test if any element in [_First, _Last) satisfies _Pred with the indicated execution policy _REQUIRE_PARALLEL_ITERATOR(_FwdIt); _Adl_verify_range(_First, _Last); auto _UFirst = _Get_unwrapped(_First); const auto _ULast = _Get_unwrapped(_Last); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { return !_All_of_family_parallel(_UFirst, _ULast, _Pass_fn(_Pred)); } else { return _STD any_of(_UFirst, _ULast, _Pass_fn(_Pred)); } } // PARALLEL FUNCTION TEMPLATE none_of template /* = 0 */> _NODISCARD bool none_of(_ExPo&&, const _FwdIt _First, const _FwdIt _Last, _Pr _Pred) noexcept /* terminates */ { // test if no element in [_First, _Last) satisfies _Pred with the indicated execution policy _REQUIRE_PARALLEL_ITERATOR(_FwdIt); _Adl_verify_range(_First, _Last); auto _UFirst = _Get_unwrapped(_First); const auto _ULast = _Get_unwrapped(_Last); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { return _All_of_family_parallel(_UFirst, _ULast, _Pass_fn(_Pred)); } else { return _STD none_of(_UFirst, _ULast, _Pass_fn(_Pred)); } } // PARALLEL FUNCTION TEMPLATE for_each template void _For_each_ivdep(_FwdIt _First, const _FwdIt _Last, _Fn _Func) { // perform function for each element [_First, _Last) assuming independent loop bodies #pragma loop(ivdep) for (; _First != _Last; ++_First) { _Func(*_First); } } template struct _Static_partitioned_for_each2 { // for_each task scheduled on the system thread pool _Static_partition_team<_Diff> _Team; _Static_partition_range<_FwdIt, _Diff> _Basis; _Fn _Func; _Static_partitioned_for_each2(const size_t _Hw_threads, const _Diff _Count, _Fn _Fx) : _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Basis{}, _Func(_Fx) {} _Cancellation_status _Process_chunk() { const auto _Key = _Team._Get_next_key(); if (_Key) { const auto _Chunk = _Basis._Get_chunk(_Key); _For_each_ivdep(_Chunk._First, _Chunk._Last, _Func); return _Cancellation_status::_Running; } return _Cancellation_status::_Canceled; } static void __stdcall _Threadpool_callback( __std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ { _Run_available_chunked_work(*static_cast<_Static_partitioned_for_each2*>(_Context)); } }; template /* = 0 */> void for_each(_ExPo&&, _FwdIt _First, _FwdIt _Last, _Fn _Func) noexcept /* terminates */ { // perform function for each element [_First, _Last) with the indicated execution policy _REQUIRE_PARALLEL_ITERATOR(_FwdIt); _Adl_verify_range(_First, _Last); auto _UFirst = _Get_unwrapped(_First); const auto _ULast = _Get_unwrapped(_Last); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); if (_Hw_threads > 1) { // parallelize on multiprocessor machines... 
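// Note: the team constructed below splits the range into
// min(_Hw_threads * _Oversubscription_multiplier, _Count) chunks; for example, with 8 hardware threads
// and 100000 elements that is min(256, 100000) = 256 chunks of size 100000 / 256 = 390, with the first
// 100000 % 256 = 160 chunks taking one extra element.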
auto _Count = _STD distance(_UFirst, _ULast); if (_Count >= 2) { // ... with at least 2 elements _TRY_BEGIN auto _Passed_fn = _Pass_fn(_Func); _Static_partitioned_for_each2 _Operation{ _Hw_threads, _Count, _Passed_fn}; _Operation._Basis._Populate(_Operation._Team, _UFirst); _Run_chunked_parallel_work(_Hw_threads, _Operation); return; _CATCH(const _Parallelism_resources_exhausted&) // fall through to serial case below _CATCH_END } } _For_each_ivdep(_UFirst, _ULast, _Pass_fn(_Func)); } else { for (; _UFirst != _ULast; ++_UFirst) { _Func(*_UFirst); } } } // PARALLEL FUNCTION TEMPLATE for_each_n template _FwdIt _For_each_n_ivdep(_FwdIt _First, _Diff _Count, _Fn _Func) { // perform function for each element [_First, _First + _Count) assuming independent loop bodies #pragma loop(ivdep) for (; 0 < _Count; --_Count, (void) ++_First) { _Func(*_First); } return _First; } template /* = 0 */> _FwdIt for_each_n(_ExPo&&, _FwdIt _First, const _Diff _Count_raw, _Fn _Func) noexcept /* terminates */ { // perform function for each element [_First, _First + _Count) _REQUIRE_PARALLEL_ITERATOR(_FwdIt); _Algorithm_int_t<_Diff> _Count = _Count_raw; if (0 < _Count) { auto _UFirst = _Get_unwrapped_n(_First, _Count); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); if (_Hw_threads > 1 && _Count >= 2) { // parallelize on multiprocessor machines with at least 2 elements _TRY_BEGIN auto _Passed_fn = _Pass_fn(_Func); _Static_partitioned_for_each2 _Operation{ _Hw_threads, _Count, _Passed_fn}; _Seek_wrapped(_First, _Operation._Basis._Populate(_Operation._Team, _UFirst)); _Run_chunked_parallel_work(_Hw_threads, _Operation); return _First; _CATCH(const _Parallelism_resources_exhausted&) // fall through to serial case below _CATCH_END } _Seek_wrapped(_First, _For_each_n_ivdep(_UFirst, _Count, _Pass_fn(_Func))); } else { for (; 0 < _Count; --_Count, (void) ++_UFirst) { _Func(*_UFirst); } _Seek_wrapped(_First, _UFirst); } } return _First; } // PARALLEL FUNCTION TEMPLATE find template using _Parallel_find_results = conditional_t<_Use_atomic_iterator<_FwdIt>, _Parallel_choose_min_result<_FwdIt>, _Parallel_choose_min_chunk<_FwdIt>>; template struct _Static_partitioned_find2 { _Static_partition_team<_Iter_diff_t<_FwdIt>> _Team; _Static_partition_range<_FwdIt> _Basis; _Parallel_find_results<_FwdIt> _Results; _Find_fx _Fx; _Static_partitioned_find2( const size_t _Hw_threads, const _Iter_diff_t<_FwdIt> _Count, const _FwdIt _Last, const _Find_fx _Fx_) : _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Basis{}, _Results(_Last), _Fx(_Fx_) {} _Cancellation_status _Process_chunk() { if (_Results._Complete()) { return _Cancellation_status::_Canceled; } const auto _Key = _Team._Get_next_key(); if (!_Key) { return _Cancellation_status::_Canceled; } const auto _Range = _Basis._Get_chunk(_Key); const auto _This_find = _Fx(_Range._First, _Range._Last); if (_This_find == _Range._Last) { return _Cancellation_status::_Running; } _Results._Imbue(_Key._Chunk_number, _This_find); return _Cancellation_status::_Canceled; } static void __stdcall _Threadpool_callback( __std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ { _Run_available_chunked_work(*static_cast<_Static_partitioned_find2*>(_Context)); } }; template _FwdIt _Find_parallel_unchecked(_ExPo&&, const _FwdIt _First, const _FwdIt _Last, const _Find_fx _Fx) { // find first matching _Val, potentially in parallel if (remove_reference_t<_ExPo>::_Parallelize) { const 
size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); if (_Hw_threads > 1) { const auto _Count = _STD distance(_First, _Last); if (_Count >= 2) { _TRY_BEGIN _Static_partitioned_find2 _Operation{_Hw_threads, _Count, _Last, _Fx}; _Operation._Basis._Populate(_Operation._Team, _First); _Run_chunked_parallel_work(_Hw_threads, _Operation); return _Operation._Results._Get_result(); _CATCH(const _Parallelism_resources_exhausted&) // fall through to _Fx below _CATCH_END } } } return _Fx(_First, _Last); } template /* = 0 */> _NODISCARD _FwdIt find(_ExPo&& _Exec, _FwdIt _First, const _FwdIt _Last, const _Ty& _Val) noexcept /* terminates */ { // find first matching _Val _REQUIRE_PARALLEL_ITERATOR(_FwdIt); using _UFwdIt = _Unwrapped_t; _Adl_verify_range(_First, _Last); _Seek_wrapped(_First, _Find_parallel_unchecked(_STD forward<_ExPo>(_Exec), _Get_unwrapped(_First), _Get_unwrapped(_Last), [&](const _UFwdIt _LFirst, const _UFwdIt _LLast) { return _Find_unchecked(_LFirst, _LLast, _Val); })); return _First; } // PARALLEL FUNCTION TEMPLATE find_if template /* = 0 */> _NODISCARD _FwdIt find_if(_ExPo&& _Exec, _FwdIt _First, const _FwdIt _Last, _Pr _Pred) noexcept /* terminates */ { // find first satisfying _Pred _REQUIRE_PARALLEL_ITERATOR(_FwdIt); using _UFwdIt = _Unwrapped_t; _Adl_verify_range(_First, _Last); auto _Pass_pred = _Pass_fn(_Pred); _Seek_wrapped(_First, _Find_parallel_unchecked(_STD forward<_ExPo>(_Exec), _Get_unwrapped(_First), _Get_unwrapped(_Last), [=](const _UFwdIt _LFirst, const _UFwdIt _LLast) { return _STD find_if(_LFirst, _LLast, _Pass_pred); })); return _First; } // PARALLEL FUNCTION TEMPLATE find_if_not template /* = 0 */> _NODISCARD _FwdIt find_if_not(_ExPo&& _Exec, _FwdIt _First, const _FwdIt _Last, _Pr _Pred) noexcept /* terminates */ { // find first satisfying !_Pred _REQUIRE_PARALLEL_ITERATOR(_FwdIt); using _UFwdIt = _Unwrapped_t; _Adl_verify_range(_First, _Last); auto _Pass_pred = _Pass_fn(_Pred); _Seek_wrapped(_First, _Find_parallel_unchecked(_STD forward<_ExPo>(_Exec), _Get_unwrapped(_First), _Get_unwrapped(_Last), [=](const _UFwdIt _LFirst, const _UFwdIt _LLast) { return _STD find_if_not(_LFirst, _LLast, _Pass_pred); })); return _First; } // PARALLEL FUNCTION TEMPLATE find_end template _Iter_diff_t<_FwdIt1> _Get_find_end_forward_partition_size( _FwdIt1 _First1, const _FwdIt1 _Last1, _FwdIt2 _First2, const _FwdIt2 _Last2) { // get the count of the range of possible matches in a find_end operation for forward iterators if (_First2 == _Last2) { return 0; } for (;;) { if (_First1 == _Last1) { // haystack is shorter than needle return 0; } ++_First2; if (_First2 == _Last2) { // seek complete return _STD distance(_First1, _Last1); } ++_First1; } } template struct _Static_partitioned_find_end_forward { _Static_partition_team<_Iter_diff_t<_FwdIt1>> _Team; _Static_partition_range<_FwdIt1> _Basis; _Iterator_range<_FwdIt2> _Range2; _Pr _Pred; conditional_t<_Use_atomic_iterator<_FwdIt1>, _Parallel_choose_max_result<_FwdIt1>, _Parallel_choose_max_chunk<_FwdIt1>> _Results; _Static_partitioned_find_end_forward(const size_t _Hw_threads, const _Iter_diff_t<_FwdIt1> _Count, const _FwdIt1 _Last1, const _FwdIt2 _First2, const _FwdIt2 _Last2, const _Pr _Pred_) : _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Basis{}, _Range2{_First2, _Last2}, _Pred{_Pred_}, _Results(_Last1) {} _Cancellation_status _Process_chunk() { const auto _Key = _Team._Get_next_key(); if (!_Key) { return _Cancellation_status::_Canceled; } const auto _Chunk_number = _Key._Chunk_number; const auto 
_Range = _Basis._Get_chunk(_Key); _FwdIt1 _Result = _Range._Last; for (auto _First1 = _Range._First; _First1 != _Range._Last; ++_First1) { if (_Equal_rev_pred_unchecked(_First1, _Range2._First, _Range2._Last, _Pred)) { _Result = _First1; } } if (_Result != _Range._Last) { _Results._Imbue(_Chunk_number, _Result); } return _Cancellation_status::_Running; } static void __stdcall _Threadpool_callback( __std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ { (void) static_cast<_Static_partitioned_find_end_forward*>(_Context)->_Process_chunk(); } }; template _BidIt1 _Get_find_end_backward_partition_start( const _BidIt1 _First1, _BidIt1 _Last1, _FwdIt2 _First2, const _FwdIt2 _Last2) { // gets the end of the range of possible matches for a find_end operation if constexpr (_Is_random_iter_v<_BidIt1> && _Is_random_iter_v<_FwdIt2>) { using _CT = _Common_diff_t<_BidIt1, _FwdIt2>; const _CT _Count1 = _Last1 - _First1; const _CT _Count2 = _Last2 - _First2; _Iter_diff_t<_BidIt1> _Count{}; if (0 < _Count2 && _Count2 <= _Count1) { _Count = static_cast<_Iter_diff_t<_BidIt1>>(_Count1 - _Count2 + 1); } return _First1 + _Count; } else { if (_First2 == _Last2) { // always 0 matches, give up return _First1; } for (;;) { if (_First1 == _Last1) { // haystack is shorter than needle, give up return _First1; } ++_First2; if (_First2 == _Last2) { // seek complete return _Last1; } --_Last1; } } } template struct _Static_partitioned_find_end_backward2 { _Static_partition_team<_Iter_diff_t<_BidIt1>> _Team; _Static_partition_range_backward<_BidIt1> _Basis; conditional_t<_Use_atomic_iterator<_BidIt1>, _Parallel_choose_max_result<_BidIt1>, _Parallel_choose_min_chunk<_BidIt1>> _Results; _Iterator_range<_FwdIt2> _Range2; _Pr _Pred; _Static_partitioned_find_end_backward2(const size_t _Hw_threads, const _Iter_diff_t<_BidIt1> _Count, const _BidIt1 _Last1, const _FwdIt2 _First2, const _FwdIt2 _Last2, const _Pr _Pred_) : _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Basis{}, _Results(_Last1), _Range2{_First2, _Last2}, _Pred{_Pred_} {} _Cancellation_status _Process_chunk() { if (_Results._Complete()) { return _Cancellation_status::_Canceled; } const auto _Key = _Team._Get_next_key(); if (!_Key) { return _Cancellation_status::_Canceled; } const auto _Chunk_number = _Key._Chunk_number; const auto _Range = _Basis._Get_chunk(_Key); auto _Last1 = _Range._Last; do { --_Last1; if (_Equal_rev_pred_unchecked(_Last1, _Range2._First, _Range2._Last, _Pred)) { _Results._Imbue(_Chunk_number, _Last1); return _Cancellation_status::_Canceled; } } while (_Last1 != _Range._First); return _Cancellation_status::_Running; } static void __stdcall _Threadpool_callback( __std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ { _Run_available_chunked_work(*static_cast<_Static_partitioned_find_end_backward2*>(_Context)); } }; template /* = 0 */> _NODISCARD _FwdIt1 find_end(_ExPo&&, _FwdIt1 _First1, const _FwdIt1 _Last1, const _FwdIt2 _First2, const _FwdIt2 _Last2, _Pr _Pred) noexcept /* terminates */ { // find last [_First2, _Last2) satisfying _Pred _Adl_verify_range(_First1, _Last1); _Adl_verify_range(_First2, _Last2); const auto _UFirst1 = _Get_unwrapped(_First1); const auto _ULast1 = _Get_unwrapped(_Last1); const auto _UFirst2 = _Get_unwrapped(_First2); const auto _ULast2 = _Get_unwrapped(_Last2); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); if (_Hw_threads > 1) { if constexpr 
(_Is_bidi_iter_v<_FwdIt1>) { const auto _Partition_start = _Get_find_end_backward_partition_start(_UFirst1, _ULast1, _UFirst2, _ULast2); if (_UFirst1 == _Partition_start) { _Seek_wrapped(_First1, _ULast1); return _First1; } const auto _Count = _STD distance(_UFirst1, _Partition_start); if (_Count >= 2) { _TRY_BEGIN _Static_partitioned_find_end_backward2 _Operation{ _Hw_threads, _Count, _ULast1, _UFirst2, _ULast2, _Pass_fn(_Pred)}; _Operation._Basis._Populate(_Operation._Team, _Partition_start); _Run_chunked_parallel_work(_Hw_threads, _Operation); _Seek_wrapped(_First1, _Operation._Results._Get_result()); return _First1; _CATCH(const _Parallelism_resources_exhausted&) // fall through to serial case below _CATCH_END } } else { const auto _Count = _Get_find_end_forward_partition_size(_UFirst1, _ULast1, _UFirst2, _ULast2); if (_Count >= 2) { _TRY_BEGIN _Static_partitioned_find_end_forward _Operation{ _Hw_threads, _Count, _ULast1, _UFirst2, _ULast2, _Pass_fn(_Pred)}; _Operation._Basis._Populate(_Operation._Team, _UFirst1); _Run_chunked_parallel_work(_Hw_threads, _Operation); _Seek_wrapped(_First1, _Operation._Results._Get_result()); return _First1; _CATCH(const _Parallelism_resources_exhausted&) // fall through to serial case below _CATCH_END } } } } _Seek_wrapped(_First1, _STD find_end(_UFirst1, _ULast1, _UFirst2, _ULast2, _Pass_fn(_Pred))); return _First1; } // PARALLEL FUNCTION TEMPLATE find_first_of template /* = 0 */> _NODISCARD _FwdIt1 find_first_of(_ExPo&& _Exec, const _FwdIt1 _First1, _FwdIt1 _Last1, const _FwdIt2 _First2, const _FwdIt2 _Last2, _Pr _Pred) noexcept /* terminates */ { // look for one of [_First2, _Last2) that matches element _REQUIRE_PARALLEL_ITERATOR(_FwdIt1); using _UFwdIt1 = _Unwrapped_t; _Adl_verify_range(_First1, _Last1); _Adl_verify_range(_First2, _Last2); const auto _UFirst2 = _Get_unwrapped(_First2); const auto _ULast2 = _Get_unwrapped(_Last2); if (_UFirst2 == _ULast2) { return _Last1; } auto _Pass_pred = _Pass_fn(_Pred); _Seek_wrapped(_Last1, _Find_parallel_unchecked(_STD forward<_ExPo>(_Exec), _Get_unwrapped(_First1), _Get_unwrapped(_Last1), [=](const _UFwdIt1 _LFirst1, const _UFwdIt1 _LLast1) { return _STD find_first_of(_LFirst1, _LLast1, _UFirst2, _ULast2, _Pass_pred); })); return _Last1; } // PARALLEL FUNCTION TEMPLATE adjacent_find template struct _Static_partitioned_adjacent_find2 { _Static_partition_team<_Iter_diff_t<_FwdIt>> _Team; _Static_partition_range<_FwdIt> _Basis; _Parallel_find_results<_FwdIt> _Results; _Pr _Pred; _Static_partitioned_adjacent_find2( const size_t _Hw_threads, const _Iter_diff_t<_FwdIt> _Count, const _FwdIt _Last, const _Pr _Pred_) : _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Basis{}, _Results{_Last}, _Pred{_Pred_} {} _Cancellation_status _Process_chunk() { if (_Results._Complete()) { return _Cancellation_status::_Canceled; } const auto _Key = _Team._Get_next_key(); if (!_Key) { return _Cancellation_status::_Canceled; } const auto _Chunk_number = _Key._Chunk_number; const auto _Range = _Basis._Get_chunk(_Key); // tests [_First, _Last) for _Pred(*_Result, *next(_Result)); // note: intentionally dereferences _Last auto _First = _Range._First; for (auto _Next = _First; _First != _Range._Last; _First = _Next) { ++_Next; if (_Pred(*_First, *_Next)) { // found match _Results._Imbue(_Chunk_number, _First); return _Cancellation_status::_Canceled; } } return _Cancellation_status::_Running; } static void __stdcall _Threadpool_callback( __std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* 
terminates */ { _Run_available_chunked_work(*static_cast<_Static_partitioned_adjacent_find2*>(_Context)); } }; template /* = 0 */> _NODISCARD _FwdIt adjacent_find(_ExPo&&, _FwdIt _First, _FwdIt _Last, _Pr _Pred) noexcept /* terminates */ { // find first satisfying _Pred with successor _Adl_verify_range(_First, _Last); auto _UFirst = _Get_unwrapped(_First); const auto _ULast = _Get_unwrapped(_Last); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); if (_Hw_threads > 1) { const auto _Count = static_cast<_Iter_diff_t<_FwdIt>>(_STD distance(_UFirst, _ULast) - 1); if (_Count >= 2) { _TRY_BEGIN _Static_partitioned_adjacent_find2 _Operation{_Hw_threads, _Count, _ULast, _Pass_fn(_Pred)}; _Operation._Basis._Populate(_Operation._Team, _UFirst); _Run_chunked_parallel_work(_Hw_threads, _Operation); _Seek_wrapped(_Last, _Operation._Results._Get_result()); return _Last; _CATCH(const _Parallelism_resources_exhausted&) // fall through to adjacent_find below _CATCH_END } } } _Seek_wrapped(_Last, _STD adjacent_find(_UFirst, _ULast, _Pass_fn(_Pred))); return _Last; } // PARALLEL FUNCTION TEMPLATES count AND count_if template struct _Static_partitioned_count_if2 { using _Diff = _Iter_diff_t<_FwdIt>; _Static_partition_team<_Diff> _Team; _Static_partition_range<_FwdIt> _Basis; _Pr _Pred; atomic<_Diff> _Results; _Static_partitioned_count_if2(const _Diff _Count, const size_t _Chunks, const _FwdIt _First, const _Pr _Pred_) : _Team{_Count, _Chunks}, _Basis{}, _Pred{_Pred_}, _Results{} { _Basis._Populate(_Team, _First); } _Diff _Process_chunks() { _Diff _Result{}; while (const auto _Key = _Team._Get_next_key()) { const auto _Range = _Basis._Get_chunk(_Key); _Result += _STD count_if(_Range._First, _Range._Last, _Pred); } return _Result; } static void __stdcall _Threadpool_callback( __std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ { const auto _This = static_cast<_Static_partitioned_count_if2*>(_Context); _This->_Results.fetch_add(_This->_Process_chunks(), memory_order_relaxed); } }; template /* = 0 */> _NODISCARD _Iter_diff_t<_FwdIt> count_if(_ExPo&&, const _FwdIt _First, const _FwdIt _Last, _Pr _Pred) noexcept /* terminates */ { // count elements satisfying _Pred _REQUIRE_PARALLEL_ITERATOR(_FwdIt); _Adl_verify_range(_First, _Last); auto _UFirst = _Get_unwrapped(_First); const auto _ULast = _Get_unwrapped(_Last); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); if (_Hw_threads > 1) { const auto _Count = _STD distance(_UFirst, _ULast); if (_Count >= 2) { const auto _Chunks = _Get_chunked_work_chunk_count(_Hw_threads, _Count); _TRY_BEGIN _Static_partitioned_count_if2 _Operation{_Count, _Chunks, _UFirst, _Pass_fn(_Pred)}; _Iter_diff_t<_FwdIt> _Foreground_count; { const _Work_ptr _Work{_Operation}; // setup complete, hereafter nothrow or terminate _Work._Submit_for_chunks(_Hw_threads, _Chunks); _Foreground_count = _Operation._Process_chunks(); } // join with _Work_ptr threads return _Operation._Results.load(memory_order_relaxed) + _Foreground_count; _CATCH(const _Parallelism_resources_exhausted&) // fall through to count_if below _CATCH_END } } } return _STD count_if(_UFirst, _ULast, _Pass_fn(_Pred)); } template /* = 0 */> _NODISCARD _Iter_diff_t<_FwdIt> count(_ExPo&& _Exec, const _FwdIt _First, const _FwdIt _Last, const _Ty& _Val) noexcept /* terminates */ { // count elements that match _Val _REQUIRE_PARALLEL_ITERATOR(_FwdIt); 
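// Note: count forwards to the parallel count_if above with a lambda that compares each element against
// _Val, so the chunked thread pool partitioning is implemented in only one place.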
_Adl_verify_range(_First, _Last); return _STD count_if(_STD forward<_ExPo>(_Exec), _Get_unwrapped(_First), _Get_unwrapped(_Last), [&_Val](auto&& _Iter_val) { return _STD forward(_Iter_val) == _Val; }); } // PARALLEL FUNCTION TEMPLATE mismatch template >&& _Is_random_iter_v<_FwdIt2>, bool = _Use_atomic_iterator<_Unwrapped_t>&& _Is_random_iter_v<_FwdIt1>> struct _Static_partitioned_mismatch_results; template struct _Static_partitioned_mismatch_results<_FwdIt1, _FwdIt2, true, _Unused> { // atomically manipulate atomic<_FwdIt1> and calculate the second iterator by adding distance to it _Parallel_choose_min_result<_FwdIt1> _Storage; _Static_partitioned_mismatch_results(const _FwdIt1 _Last1, const _Unwrapped_t&) : _Storage(_Last1) {} void _Imbue(const size_t _Chunk_number, const _FwdIt1 _First1, const _Unwrapped_t&) { _Storage._Imbue(_Chunk_number, _First1); } pair<_FwdIt1, _FwdIt2> _Get_result(const _FwdIt1 _First1, const _FwdIt2 _First2) const { const _FwdIt1 _Result1 = _Storage._Get_result(); return {_Result1, _First2 + static_cast<_Iter_diff_t<_FwdIt2>>(_Result1 - _First1)}; } }; template struct _Static_partitioned_mismatch_results<_FwdIt1, _FwdIt2, false, true> { // atomically manipulate atomic<_FwdIt2> and calculate the first iterator by adding distance to it _Parallel_choose_min_result<_FwdIt2> _Storage; _Static_partitioned_mismatch_results(const _Unwrapped_t&, const _FwdIt2 _Last2) : _Storage(_Last2) {} void _Imbue(const size_t _Chunk_number, const _Unwrapped_t&, const _FwdIt2 _First2) { _Storage._Imbue(_Chunk_number, _First2); } pair<_FwdIt1, _FwdIt2> _Get_result(const _FwdIt1 _First1, const _FwdIt2 _First2) const { const _FwdIt2 _Result2 = _Storage._Get_result(); return {_First1 + static_cast<_Iter_diff_t<_FwdIt1>>(_Result2 - _First2), _Result2}; } }; template struct _Static_partitioned_mismatch_results<_FwdIt1, _FwdIt2, false, false> { // get both iterators by manipulating them under lock using _UFwdIt1 = _Unwrapped_t; using _UFwdIt2 = _Unwrapped_t; _Parallel_choose_min_chunk> _Storage; _Static_partitioned_mismatch_results(const _UFwdIt1 _Last1, const _UFwdIt2 _Last2) : _Storage({_Last1, _Last2}) {} void _Imbue(const size_t _Chunk_number, const _UFwdIt1 _First1, const _UFwdIt2 _First2) { _Storage._Imbue(_Chunk_number, {_First1, _First2}); } pair<_FwdIt1, _FwdIt2> _Get_result(_FwdIt1 _First1, _FwdIt2 _First2) const { const auto _Result = _Storage._Get_result(); _Seek_wrapped(_First2, _Result.second); _Seek_wrapped(_First1, _Result.first); return {_First1, _First2}; } }; template struct _Static_partitioned_mismatch2 { using _Diff = _Common_diff_t<_FwdIt1, _FwdIt2>; _Static_partition_team<_Diff> _Team; _Static_partition_range<_FwdIt1, _Diff> _Basis1; _Static_partition_range<_FwdIt2, _Diff> _Basis2; _Static_partitioned_mismatch_results<_FwdIt1, _FwdIt2> _Results; _Pr _Pred; _Static_partitioned_mismatch2( const size_t _Hw_threads, const _Diff _Count, const _FwdIt1 _First1, const _FwdIt2 _First2, const _Pr _Pred_) : _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Basis1{}, _Basis2{}, _Results( _Get_unwrapped(_Basis1._Populate(_Team, _First1)), _Get_unwrapped(_Basis2._Populate(_Team, _First2))), _Pred(_Pred_) {} _Cancellation_status _Process_chunk() { if (_Results._Storage._Complete()) { return _Cancellation_status::_Canceled; } const auto _Key = _Team._Get_next_key(); if (!_Key) { return _Cancellation_status::_Canceled; } const auto _Chunk_number = _Key._Chunk_number; const auto _Range1 = _Basis1._Get_chunk(_Key); auto _First1 = _Range1._First; auto _First2 = 
_Basis2._Get_chunk(_Key)._First; for (;;) { if (_First1 == _Range1._Last) { return _Cancellation_status::_Running; } if (!_Pred(*_First1, *_First2)) { _Results._Imbue(_Chunk_number, _First1, _First2); return _Cancellation_status::_Canceled; } ++_First1; ++_First2; } } static void __stdcall _Threadpool_callback( __std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ { _Run_available_chunked_work(*static_cast<_Static_partitioned_mismatch2*>(_Context)); } }; #pragma warning(push) #pragma warning(disable : 4868) // compiler may not enforce left-to-right evaluation order // in braced initializer list (/Wall) template /* = 0 */> _NODISCARD pair<_FwdIt1, _FwdIt2> mismatch( _ExPo&&, _FwdIt1 _First1, _FwdIt1 _Last1, _FwdIt2 _First2, _Pr _Pred) noexcept /* terminates */ { // return [_First1, _Last1)/[_First2, ...) mismatch using _Pred _REQUIRE_PARALLEL_ITERATOR(_FwdIt1); _REQUIRE_PARALLEL_ITERATOR(_FwdIt2); _Adl_verify_range(_First1, _Last1); const auto _UFirst1 = _Get_unwrapped(_First1); const auto _ULast1 = _Get_unwrapped(_Last1); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); if (_Hw_threads > 1) { const auto _Count = _STD distance(_UFirst1, _ULast1); const auto _UFirst2 = _Get_unwrapped_n(_First2, _Count); if (_Count >= 2) { _TRY_BEGIN _Static_partitioned_mismatch2 _Operation{_Hw_threads, _Count, _UFirst1, _UFirst2, _Pass_fn(_Pred)}; _Run_chunked_parallel_work(_Hw_threads, _Operation); const auto _Result = _Operation._Results._Get_result(_UFirst1, _UFirst2); _Seek_wrapped(_First2, _Result.second); _Seek_wrapped(_First1, _Result.first); return {_First1, _First2}; _CATCH(const _Parallelism_resources_exhausted&) // fall through to serial case below _CATCH_END } const auto _Result = _STD mismatch(_UFirst1, _ULast1, _UFirst2, _Pass_fn(_Pred)); _Seek_wrapped(_First2, _Result.second); _Seek_wrapped(_First1, _Result.first); return {_First1, _First2}; } } const auto _Result = _STD mismatch( _UFirst1, _ULast1, _Get_unwrapped_n(_First2, _Idl_distance<_FwdIt1>(_UFirst1, _ULast1)), _Pass_fn(_Pred)); _Seek_wrapped(_First2, _Result.second); _Seek_wrapped(_First1, _Result.first); return {_First1, _First2}; } #pragma warning(pop) template /* = 0 */> _NODISCARD pair<_FwdIt1, _FwdIt2> mismatch( _ExPo&&, _FwdIt1 _First1, _FwdIt1 _Last1, _FwdIt2 _First2, _FwdIt2 _Last2, _Pr _Pred) noexcept /* terminates */ { // return [_First1, _Last1)/[_First2, _Last2) mismatch using _Pred _REQUIRE_PARALLEL_ITERATOR(_FwdIt1); _REQUIRE_PARALLEL_ITERATOR(_FwdIt2); _Adl_verify_range(_First1, _Last1); _Adl_verify_range(_First2, _Last2); const auto _UFirst1 = _Get_unwrapped(_First1); const auto _ULast1 = _Get_unwrapped(_Last1); const auto _UFirst2 = _Get_unwrapped(_First2); const auto _ULast2 = _Get_unwrapped(_Last2); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); if (_Hw_threads > 1) { const auto _Count = static_cast<_Iter_diff_t<_FwdIt1>>(_Distance_min(_UFirst1, _ULast1, _UFirst2, _ULast2)); if (_Count >= 2) { _TRY_BEGIN _Static_partitioned_mismatch2 _Operation{_Hw_threads, _Count, _UFirst1, _UFirst2, _Pass_fn(_Pred)}; _Run_chunked_parallel_work(_Hw_threads, _Operation); const auto _Result = _Operation._Results._Get_result(_UFirst1, _UFirst2); _Seek_wrapped(_First2, _Result.second); _Seek_wrapped(_First1, _Result.first); return {_First1, _First2}; _CATCH(const _Parallelism_resources_exhausted&) // fall through to serial below _CATCH_END } } } 
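    // Serial fallback: reached for non-parallel policies, uniprocessor machines, fewer than 2 comparable
    // elements, or when thread pool resources could not be acquired above. _Distance_min above appears to
    // bound the parallel work by the shorter of the two ranges, matching the serial four-iterator mismatch,
    // which stops at whichever sequence is exhausted first.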
const auto _Result = _STD mismatch(_UFirst1, _ULast1, _UFirst2, _ULast2, _Pass_fn(_Pred)); _Seek_wrapped(_First2, _Result.second); _Seek_wrapped(_First1, _Result.first); return {_First1, _First2}; } // PARALLEL FUNCTION TEMPLATE equal template struct _Static_partitioned_equal2 { using _Diff = _Common_diff_t<_FwdIt1, _FwdIt2>; _Static_partition_team<_Diff> _Team; _Static_partition_range<_FwdIt1, _Diff> _Basis1; _Static_partition_range<_FwdIt2, _Diff> _Basis2; _Pr _Pred; _Cancellation_token _Cancel_token; _Static_partitioned_equal2(const size_t _Hw_threads, const _Diff _Count, _Pr _Pred_, const _FwdIt1&, const _FwdIt2&) : _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Basis1{}, _Basis2{}, _Pred(_Pred_), _Cancel_token{} {} _Cancellation_status _Process_chunk() { if (_Cancel_token._Is_canceled()) { return _Cancellation_status::_Canceled; } const auto _Key = _Team._Get_next_key(); if (!_Key) { return _Cancellation_status::_Canceled; } const auto _Range1 = _Basis1._Get_chunk(_Key); const auto _Range2_first = _Basis2._Get_chunk(_Key)._First; if (_STD equal(_Range1._First, _Range1._Last, _Range2_first, _Pred)) { return _Cancellation_status::_Running; } _Cancel_token._Cancel(); return _Cancellation_status::_Canceled; } static void __stdcall _Threadpool_callback( __std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ { _Run_available_chunked_work(*static_cast<_Static_partitioned_equal2*>(_Context)); } }; template /* = 0 */> _NODISCARD bool equal(_ExPo&&, const _FwdIt1 _First1, const _FwdIt1 _Last1, const _FwdIt2 _First2, _Pr _Pred) noexcept /* terminates */ { // compare [_First1, _Last1) to [_First2, ...) using _Pred _REQUIRE_PARALLEL_ITERATOR(_FwdIt1); _REQUIRE_PARALLEL_ITERATOR(_FwdIt2); _Adl_verify_range(_First1, _Last1); const auto _UFirst1 = _Get_unwrapped(_First1); const auto _ULast1 = _Get_unwrapped(_Last1); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); if (_Hw_threads > 1) { const auto _Count = _STD distance(_UFirst1, _ULast1); const auto _UFirst2 = _Get_unwrapped_n(_First2, _Count); if (_Count >= 2) { _TRY_BEGIN _Static_partitioned_equal2 _Operation{_Hw_threads, _Count, _Pass_fn(_Pred), _UFirst1, _UFirst2}; _Operation._Basis1._Populate(_Operation._Team, _UFirst1); _Operation._Basis2._Populate(_Operation._Team, _UFirst2); _Run_chunked_parallel_work(_Hw_threads, _Operation); return !_Operation._Cancel_token._Is_canceled_relaxed(); _CATCH(const _Parallelism_resources_exhausted&) // fall through to serial case below _CATCH_END } return _STD equal(_UFirst1, _ULast1, _UFirst2, _Pass_fn(_Pred)); } } return _STD equal(_UFirst1, _ULast1, _First2, _Pass_fn(_Pred)); } template /* = 0 */> _NODISCARD bool equal(_ExPo&&, const _FwdIt1 _First1, const _FwdIt1 _Last1, const _FwdIt2 _First2, const _FwdIt2 _Last2, _Pr _Pred) noexcept /* terminates */ { // compare [_First1, _Last1) to [_First2, _Last2) using _Pred _REQUIRE_PARALLEL_ITERATOR(_FwdIt1); _REQUIRE_PARALLEL_ITERATOR(_FwdIt2); _Adl_verify_range(_First1, _Last1); _Adl_verify_range(_First2, _Last2); const auto _UFirst1 = _Get_unwrapped(_First1); const auto _ULast1 = _Get_unwrapped(_Last1); const auto _UFirst2 = _Get_unwrapped(_First2); const auto _ULast2 = _Get_unwrapped(_Last2); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); if (_Hw_threads > 1) { const auto _Count = _Distance_any(_UFirst1, _ULast1, _UFirst2, _ULast2); if (_Count >= 2) { 
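                // The two _Populate calls below double as length checks: each reports whether its sequence
                // really contains _Count elements, so ranges of different lengths are declared unequal
                // without comparing any elements.
                // Illustrative caller-side sketch (hypothetical containers a and b):
                //   const bool same = std::equal(std::execution::par, a.begin(), a.end(), b.begin(), b.end());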
_TRY_BEGIN _Static_partitioned_equal2 _Operation{_Hw_threads, _Count, _Pass_fn(_Pred), _UFirst1, _UFirst2}; if (!_Operation._Basis1._Populate(_Operation._Team, _UFirst1, _ULast1)) { // left sequence didn't have length _Count return false; } if (!_Operation._Basis2._Populate(_Operation._Team, _UFirst2, _ULast2)) { // right sequence didn't have length _Count return false; } _Run_chunked_parallel_work(_Hw_threads, _Operation); return !_Operation._Cancel_token._Is_canceled_relaxed(); _CATCH(const _Parallelism_resources_exhausted&) // fall through to equal below _CATCH_END } } } return _STD equal(_UFirst1, _ULast1, _UFirst2, _ULast2, _Pass_fn(_Pred)); } // PARALLEL FUNCTION TEMPLATE search template struct _Static_partitioned_search2 { _Static_partition_team<_Iter_diff_t<_FwdItHaystack>> _Team; _Static_partition_range<_FwdItHaystack> _Basis; _Parallel_find_results<_FwdItHaystack> _Results; _FwdItPat _First2; _FwdItPat _Last2; _Pr _Pred; _Static_partitioned_search2(const size_t _Hw_threads, const _Iter_diff_t<_FwdItHaystack> _Count, const _FwdItHaystack _First1, const _FwdItHaystack _Last1, const _FwdItPat _First2_, const _FwdItPat _Last2_, _Pr _Pred_) : _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Basis{}, _Results(_Last1), _First2(_First2_), _Last2(_Last2_), _Pred(_Pred_) { _Basis._Populate(_Team, _First1); } _Cancellation_status _Process_chunk() { if (_Results._Complete()) { return _Cancellation_status::_Canceled; } const auto _Key = _Team._Get_next_key(); if (!_Key) { return _Cancellation_status::_Canceled; } const auto _Range = _Basis._Get_chunk(_Key); for (auto _Candidate = _Range._First; _Candidate != _Range._Last; ++_Candidate) { if (_Equal_rev_pred_unchecked(_Candidate, _First2, _Last2, _Pred)) { _Results._Imbue(_Key._Chunk_number, _Candidate); return _Cancellation_status::_Canceled; } } return _Cancellation_status::_Running; } static void __stdcall _Threadpool_callback( __std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ { _Run_available_chunked_work(*static_cast<_Static_partitioned_search2*>(_Context)); } }; template /* = 0 */> _NODISCARD _FwdItHaystack search(_ExPo&&, const _FwdItHaystack _First1, _FwdItHaystack _Last1, const _FwdItPat _First2, const _FwdItPat _Last2, _Pr _Pred) noexcept /* terminates */ { // find first [_First2, _Last2) match _Adl_verify_range(_First2, _Last2); const auto _UFirst2 = _Get_unwrapped(_First2); const auto _ULast2 = _Get_unwrapped(_Last2); if (_UFirst2 == _ULast2) { return _First1; } _Adl_verify_range(_First1, _Last1); const auto _UFirst1 = _Get_unwrapped(_First1); const auto _ULast1 = _Get_unwrapped(_Last1); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); if (_Hw_threads > 1) { _Iter_diff_t<_FwdItHaystack> _Count; if constexpr (_Is_random_iter_v<_FwdItHaystack> && _Is_random_iter_v<_FwdItPat>) { const auto _HaystackDist = _ULast1 - _UFirst1; const auto _NeedleDist = _ULast2 - _UFirst2; if (_NeedleDist > _HaystackDist) { // needle is longer than haystack, no match possible return _Last1; } // +1 can't overflow because _NeedleDist > 0 because _First2 != _Last2 _Count = static_cast<_Iter_diff_t<_FwdItHaystack>>(_HaystackDist - _NeedleDist + 1); } else { auto _UFirst1c = _UFirst1; auto _UFirst2c = _UFirst2; for (;;) { if (_UFirst1c == _ULast1) { // either haystack was shorter than needle so no match is possible, or haystack was empty so // returning _Last1 is the same as returning _First1 return _Last1; } ++_UFirst2c; if 
(_UFirst2c == _ULast2) { _Count = _STD distance(_UFirst1c, _ULast1); break; } ++_UFirst1c; } } if (_Count == 1) { // 1 match possible == ranges are of equal length -- do better than serial search because we've done the // work to calculate distance() of the inputs if (_Equal_rev_pred_unchecked(_UFirst1, _UFirst2, _ULast2, _Pass_fn(_Pred))) { return _First1; } return _Last1; } _TRY_BEGIN _Static_partitioned_search2 _Operation{ _Hw_threads, _Count, _UFirst1, _ULast1, _UFirst2, _ULast2, _Pass_fn(_Pred)}; _Run_chunked_parallel_work(_Hw_threads, _Operation); _Seek_wrapped(_Last1, _Operation._Results._Get_result()); return _Last1; _CATCH(const _Parallelism_resources_exhausted&) // fall through to search, below _CATCH_END } } _Seek_wrapped(_Last1, _STD search(_UFirst1, _ULast1, _UFirst2, _ULast2, _Pass_fn(_Pred))); return _Last1; } // PARALLEL FUNCTION TEMPLATE search_n template struct _Static_partitioned_search_n2 { _Static_partition_team<_Iter_diff_t<_FwdIt>> _Team; _Static_partition_range<_FwdIt> _Basis; _Parallel_find_results<_FwdIt> _Results; _Iter_diff_t<_FwdIt> _Target_count; const _Ty& _Val; _Pr _Pred; _Static_partitioned_search_n2(const size_t _Hw_threads, const _Iter_diff_t<_FwdIt> _Candidates, const _FwdIt _First, const _FwdIt _Last, const _Iter_diff_t<_FwdIt> _Target_count_, const _Ty& _Val_, _Pr _Pred_) : _Team{_Candidates, _Get_chunked_work_chunk_count(_Hw_threads, _Candidates)}, _Basis{}, _Results(_Last), _Target_count(_Target_count_), _Val(_Val_), _Pred(_Pred_) { _Basis._Populate(_Team, _First); } _Cancellation_status _Process_chunk() { if (_Results._Complete()) { return _Cancellation_status::_Canceled; } const auto _Key = _Team._Get_next_key(); if (!_Key) { return _Cancellation_status::_Canceled; } const auto _Range = _Basis._Get_chunk(_Key); // any match in this chunk will have at least 1 element in _Range, so the furthest off the // end of the candidate range we will need to search is _Target_count - 1 const _Iter_diff_t<_FwdIt> _Overshoot = _Target_count - 1; const auto _Serial_last = _STD next(_Range._Last, _Overshoot); // we delegate to the serial algorithm targeting [_First, _Last + _Overshoot) // which we know is safe because we only partitioned _First + _Candidates start positions // if we have a match, it will be within [_First, _Last); otherwise the serial algorithm // will tell us _Last + _Overshoot const auto _Candidate = _STD search_n(_Range._First, _Serial_last, _Target_count, _Val, _Pred); if (_Candidate == _Serial_last) { return _Cancellation_status::_Running; } _Results._Imbue(_Key._Chunk_number, _Candidate); return _Cancellation_status::_Canceled; } static void __stdcall _Threadpool_callback( __std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ { _Run_available_chunked_work(*static_cast<_Static_partitioned_search_n2*>(_Context)); } }; template /* = 0 */> _NODISCARD _FwdIt search_n(_ExPo&&, const _FwdIt _First, _FwdIt _Last, const _Diff _Count_raw, const _Ty& _Val, _Pr _Pred) noexcept /* terminates */ { // find first _Count * _Val satisfying _Pred const _Algorithm_int_t<_Diff> _Count = _Count_raw; if (_Count <= 0) { _Last = _First; return _Last; } if (static_cast(_Count) > static_cast((numeric_limits<_Iter_diff_t<_FwdIt>>::max)())) { // if the number of _Vals searched for is larger than the longest possible sequence, we can't find it return _Last; } _Adl_verify_range(_First, _Last); auto _UFirst = _Get_unwrapped(_First); const auto _ULast = _Get_unwrapped(_Last); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { 
const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); if (_Hw_threads > 1) { const auto _Haystack_count = _STD distance(_UFirst, _ULast); if (_Count > _Haystack_count) { return _Last; } // +1 can't overflow because _Count > 0 const auto _Candidates = static_cast<_Iter_diff_t<_FwdIt>>(_Haystack_count - _Count + 1); _TRY_BEGIN _Static_partitioned_search_n2 _Operation{_Hw_threads, _Candidates, _UFirst, _ULast, static_cast<_Iter_diff_t<_FwdIt>>(_Count), _Val, _Pass_fn(_Pred)}; _Run_chunked_parallel_work(_Hw_threads, _Operation); _Seek_wrapped(_Last, _Operation._Results._Get_result()); return _Last; _CATCH(const _Parallelism_resources_exhausted&) // fall through to search_n, below _CATCH_END } } _Seek_wrapped(_Last, _STD search_n(_UFirst, _ULast, _Count, _Val, _Pass_fn(_Pred))); return _Last; } // PARALLEL FUNCTION TEMPLATE transform template _FwdIt2 _Transform_ivdep(_FwdIt1 _First, const _FwdIt1 _Last, _FwdIt2 _Dest, _Fn _Func) { // unary op transform with independent loop bodies #pragma loop(ivdep) for (; _First != _Last; ++_First, (void) ++_Dest) { *_Dest = _Func(*_First); } return _Dest; } template struct _Static_partitioned_unary_transform2 { using _Diff = _Common_diff_t<_FwdIt1, _FwdIt2>; _Static_partition_team<_Diff> _Team; _Static_partition_range<_FwdIt1, _Diff> _Source_basis; _Static_partition_range<_FwdIt2, _Diff> _Dest_basis; _Fn _Func; _Static_partitioned_unary_transform2( const size_t _Hw_threads, const _Diff _Count, const _FwdIt1 _First, _Fn _Fx, const _FwdIt2&) : _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Source_basis{}, _Dest_basis{}, _Func(_Fx) { _Source_basis._Populate(_Team, _First); } _Cancellation_status _Process_chunk() { const auto _Key = _Team._Get_next_key(); if (_Key) { const auto _Source = _Source_basis._Get_chunk(_Key); _Transform_ivdep(_Source._First, _Source._Last, _Dest_basis._Get_chunk(_Key)._First, _Func); return _Cancellation_status::_Running; } return _Cancellation_status::_Canceled; } static void __stdcall _Threadpool_callback( __std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ { _Run_available_chunked_work(*static_cast<_Static_partitioned_unary_transform2*>(_Context)); } }; template /* = 0 */> _FwdIt2 transform(_ExPo&&, const _FwdIt1 _First, const _FwdIt1 _Last, _FwdIt2 _Dest, _Fn _Func) noexcept /* terminates */ { // transform [_First, _Last) with _Func _REQUIRE_PARALLEL_ITERATOR(_FwdIt1); _REQUIRE_PARALLEL_ITERATOR(_FwdIt2); _Adl_verify_range(_First, _Last); auto _UFirst = _Get_unwrapped(_First); const auto _ULast = _Get_unwrapped(_Last); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); if (_Hw_threads > 1) { // parallelize on multiprocessor machines... const auto _Count = _STD distance(_UFirst, _ULast); const auto _UDest = _Get_unwrapped_n(_Dest, _Count); if (_Count >= 2) { // ... 
with at least 2 elements _TRY_BEGIN _Static_partitioned_unary_transform2 _Operation{_Hw_threads, _Count, _UFirst, _Pass_fn(_Func), _UDest}; _Seek_wrapped(_Dest, _Operation._Dest_basis._Populate(_Operation._Team, _UDest)); _Run_chunked_parallel_work(_Hw_threads, _Operation); return _Dest; _CATCH(const _Parallelism_resources_exhausted&) // fall through to serial case below _CATCH_END } _Seek_wrapped(_Dest, _Transform_ivdep(_UFirst, _ULast, _UDest, _Pass_fn(_Func))); return _Dest; } else { _Seek_wrapped( _Dest, _Transform_ivdep(_UFirst, _ULast, _Get_unwrapped_n(_Dest, _Idl_distance<_FwdIt1>(_UFirst, _ULast)), _Pass_fn(_Func))); return _Dest; } } else { _Seek_wrapped(_Dest, _STD transform(_UFirst, _ULast, _Get_unwrapped_n(_Dest, _Idl_distance<_FwdIt1>(_UFirst, _ULast)), _Pass_fn(_Func))); return _Dest; } } template _FwdIt3 _Transform_ivdep(_FwdIt1 _First1, const _FwdIt1 _Last1, _FwdIt2 _First2, _FwdIt3 _Dest, _Fn _Func) { // binary op transform with independent loop bodies #pragma loop(ivdep) for (; _First1 != _Last1; ++_First1, (void) ++_First2, ++_Dest) { *_Dest = _Func(*_First1, *_First2); } return _Dest; } template struct _Static_partitioned_binary_transform2 { using _Diff = _Common_diff_t<_FwdIt1, _FwdIt2, _FwdIt3>; _Static_partition_team<_Diff> _Team; _Static_partition_range<_FwdIt1, _Diff> _Source1_basis; _Static_partition_range<_FwdIt2, _Diff> _Source2_basis; _Static_partition_range<_FwdIt3, _Diff> _Dest_basis; _Fn _Func; _Static_partitioned_binary_transform2( const size_t _Hw_threads, const _Diff _Count, _FwdIt1 _First1, _FwdIt2 _First2, _Fn _Fx, const _FwdIt3&) : _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Source1_basis{}, _Source2_basis{}, _Dest_basis{}, _Func(_Fx) { _Source1_basis._Populate(_Team, _First1); _Source2_basis._Populate(_Team, _First2); } _Cancellation_status _Process_chunk() { const auto _Key = _Team._Get_next_key(); if (_Key) { const auto _Source1 = _Source1_basis._Get_chunk(_Key); _Transform_ivdep(_Source1._First, _Source1._Last, _Source2_basis._Get_chunk(_Key)._First, _Dest_basis._Get_chunk(_Key)._First, _Func); return _Cancellation_status::_Running; } return _Cancellation_status::_Canceled; } static void __stdcall _Threadpool_callback( __std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ { _Run_available_chunked_work(*static_cast<_Static_partitioned_binary_transform2*>(_Context)); } }; #pragma warning(push) #pragma warning(disable : 4868) // compiler may not enforce left-to-right evaluation order // in braced initializer list (/Wall) template /* = 0 */> _FwdIt3 transform(_ExPo&&, const _FwdIt1 _First1, const _FwdIt1 _Last1, const _FwdIt2 _First2, _FwdIt3 _Dest, _Fn _Func) noexcept /* terminates */ { // transform [_First1, _Last1) and [_First2, ...) with _Func _REQUIRE_PARALLEL_ITERATOR(_FwdIt1); _REQUIRE_PARALLEL_ITERATOR(_FwdIt2); _REQUIRE_PARALLEL_ITERATOR(_FwdIt3); _Adl_verify_range(_First1, _Last1); const auto _UFirst1 = _Get_unwrapped(_First1); const auto _ULast1 = _Get_unwrapped(_Last1); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); if (_Hw_threads > 1) { // parallelize on multiprocessor machines... const auto _Count = _STD distance(_UFirst1, _ULast1); const auto _UFirst2 = _Get_unwrapped_n(_First2, _Count); const auto _UDest = _Get_unwrapped_n(_Dest, _Count); if (_Count >= 2) { // ... 
with at least 2 elements _TRY_BEGIN _Static_partitioned_binary_transform2 _Operation{ _Hw_threads, _Count, _UFirst1, _UFirst2, _Pass_fn(_Func), _UDest}; _Seek_wrapped(_Dest, _Operation._Dest_basis._Populate(_Operation._Team, _UDest)); _Run_chunked_parallel_work(_Hw_threads, _Operation); return _Dest; _CATCH(const _Parallelism_resources_exhausted&) // fall through to serial case below _CATCH_END } _Seek_wrapped(_Dest, _Transform_ivdep(_UFirst1, _ULast1, _UFirst2, _UDest, _Pass_fn(_Func))); return _Dest; } else { const auto _Count = _Idl_distance<_FwdIt1>(_UFirst1, _ULast1); _Seek_wrapped(_Dest, _Transform_ivdep(_UFirst1, _ULast1, _Get_unwrapped_n(_First2, _Count), _Get_unwrapped_n(_Dest, _Count), _Pass_fn(_Func))); return _Dest; } } else { const auto _Count = _Idl_distance<_FwdIt1>(_UFirst1, _ULast1); _Seek_wrapped(_Dest, _STD transform(_UFirst1, _ULast1, _Get_unwrapped_n(_First2, _Count), _Get_unwrapped_n(_Dest, _Count), _Pass_fn(_Func))); return _Dest; } } #pragma warning(pop) // PARALLEL FUNCTION TEMPLATE replace template /* = 0 */> void replace(_ExPo&& _Exec, const _FwdIt _First, const _FwdIt _Last, const _Ty& _Oldval, const _Ty& _Newval) noexcept /* terminates */ { // replace each matching _Oldval with _Newval _REQUIRE_PARALLEL_ITERATOR(_FwdIt); _STD for_each(_STD forward<_ExPo>(_Exec), _First, _Last, [&](auto&& _Value) { if (_STD forward(_Value) == _Oldval) { _STD forward(_Value) = _Newval; } }); } // PARALLEL FUNCTION TEMPLATE replace_if template /* = 0 */> void replace_if(_ExPo&& _Exec, const _FwdIt _First, const _FwdIt _Last, _Pr _Pred, const _Ty& _Val) noexcept /* terminates */ { // replace each satisfying _Pred with _Val _REQUIRE_PARALLEL_ITERATOR(_FwdIt); _STD for_each( _STD forward<_ExPo>(_Exec), _First, _Last, [&_Val, _Lambda_pred = _Pass_fn(_Pred)](auto&& _Value) mutable { if (_Lambda_pred(_STD forward(_Value))) { _STD forward(_Value) = _Val; } }); } // PARALLEL FUNCTION TEMPLATES remove AND remove_if template _FwdIt _Remove_move_if_unchecked(_FwdIt _First, const _FwdIt _Last, _FwdIt _Dest, _Pr _Pred) { // move omitting each element satisfying _Pred for (; _First != _Last; ++_First) { if (!_Pred(*_First)) { *_Dest = _STD move(*_First); ++_Dest; } } return _Dest; } template struct _Static_partitioned_remove_if2 { enum class _Chunk_state : unsigned char { _Serial, // while a chunk is in the serial state, it is touched only by an owner thread _Merging, // while a chunk is in the merging state, threads all try to CAS the chunk _Merging -> _Moving // the thread that succeeds takes responsibility for moving the keepers from that chunk to the // results _Moving, // while a chunk is in the moving state, the keepers are being moved to _Results // only one chunk at a time is ever _Moving; this also serves to synchronize access to _Results _Done // when a chunk becomes _Done, it is complete / will never need to touch _Results again // as an optimization, if a thread sees that it has no predecessor (or its predecessor is _Done), it // may transition from _Serial directly to _Done, doing the moving step implicitly. 
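        // Summary of the per-chunk lifecycle described above:
        //   _Serial -> _Merging -> _Moving -> _Done  (predecessor still pending; the CAS winner moves the keepers)
        //   _Serial -> _Done                         (no pending predecessor; keepers go directly into _Results)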
}; #pragma warning(push) #pragma warning(disable : 4324) // structure was padded due to alignment specifier struct alignas(hardware_destructive_interference_size) alignas(_FwdIt) _Chunk_local_data { atomic<_Chunk_state> _State; _FwdIt _New_end; }; #pragma warning(pop) _Static_partition_team<_Iter_diff_t<_FwdIt>> _Team; _Static_partition_range<_FwdIt> _Basis; _Pr _Pred; _Parallel_vector<_Chunk_local_data> _Chunk_locals; _FwdIt _Results; _Static_partitioned_remove_if2( const size_t _Hw_threads, const _Iter_diff_t<_FwdIt> _Count, const _FwdIt _First, const _Pr _Pred_) : _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Basis{}, _Pred{_Pred_}, _Chunk_locals(_Team._Chunks), _Results{_First} { _Basis._Populate(_Team, _First); } _Cancellation_status _Process_chunk() { const auto _Key = _Team._Get_next_key(); if (!_Key) { return _Cancellation_status::_Canceled; } // remove phase: auto _Merge_index = _Key._Chunk_number; // merge step will start from this index { auto& _Chunk_data = _Chunk_locals[_Merge_index]; const auto _Range = _Basis._Get_chunk(_Key); if (_Merge_index == 0 || _Chunk_locals[_Merge_index - 1]._State.load() == _Chunk_state::_Done) { // no predecessor, so run serial algorithm directly into results if (_Merge_index == 0 || _Results == _Range._First) { _Results = _STD remove_if(_Range._First, _Range._Last, _Pred); } else { _Results = _Remove_move_if_unchecked(_Range._First, _Range._Last, _Results, _Pred); } _Chunk_data._State.store(_Chunk_state::_Done); ++_Merge_index; // this chunk is already merged } else { // predecessor, run serial algorithm in place and attempt to merge later _Chunk_data._New_end = _STD remove_if(_Range._First, _Range._Last, _Pred); _Chunk_data._State.store(_Chunk_state::_Merging); if (_Chunk_locals[_Merge_index - 1]._State.load() != _Chunk_state::_Done) { // if the predecessor isn't done, whichever thread merges our predecessor will merge us too return _Cancellation_status::_Running; } } } // merge phase: at this point, we have observed that our predecessor chunk has been merged to the output, // attempt to become the new merging thread if the previous merger gave up // note: it is an invariant when we get here that _Chunk_locals[_Merge_index - 1]._State == _Chunk_state::_Done for (; _Merge_index != _Team._Chunks; ++_Merge_index) { auto& _Merge_chunk_data = _Chunk_locals[_Merge_index]; auto _Expected = _Chunk_state::_Merging; if (!_Merge_chunk_data._State.compare_exchange_strong(_Expected, _Chunk_state::_Moving)) { // either the _Merge_index chunk isn't ready to merge yet, or another thread will do it return _Cancellation_status::_Running; } const auto _Merge_first = _Basis._Get_first(_Merge_index, _Team._Get_chunk_offset(_Merge_index)); const auto _Merge_new_end = _STD exchange(_Merge_chunk_data._New_end, {}); if (_Results == _Merge_first) { // entire range up to now had no removals, don't bother moving _Results = _Merge_new_end; } else { _Results = _Move_unchecked(_Merge_first, _Merge_new_end, _Results); } _Merge_chunk_data._State.store(_Chunk_state::_Done); } return _Cancellation_status::_Canceled; } static void __stdcall _Threadpool_callback( __std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ { _Run_available_chunked_work(*static_cast<_Static_partitioned_remove_if2*>(_Context)); } }; template /* = 0 */> _NODISCARD _FwdIt remove_if(_ExPo&&, _FwdIt _First, const _FwdIt _Last, _Pr _Pred) noexcept /* terminates */ { // remove each satisfying _Pred _Adl_verify_range(_First, _Last); auto _UFirst = 
_Get_unwrapped(_First); const auto _ULast = _Get_unwrapped(_Last); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); if (_Hw_threads > 1) { const auto _Count = _STD distance(_UFirst, _ULast); if (_Count >= 2) { _TRY_BEGIN _Static_partitioned_remove_if2 _Operation{_Hw_threads, _Count, _UFirst, _Pass_fn(_Pred)}; _Run_chunked_parallel_work(_Hw_threads, _Operation); _Seek_wrapped(_First, _Operation._Results); return _First; _CATCH(const _Parallelism_resources_exhausted&) // fall through to serial case below _CATCH_END } } } _Seek_wrapped(_First, _STD remove_if(_UFirst, _ULast, _Pass_fn(_Pred))); return _First; } template /* = 0 */> _NODISCARD _FwdIt remove(_ExPo&& _Exec, const _FwdIt _First, const _FwdIt _Last, const _Ty& _Val) noexcept /* terminates */ { // remove each matching _Val return _STD remove_if(_STD forward<_ExPo>(_Exec), _First, _Last, [&_Val](auto&& _Lhs) { return _STD forward(_Lhs) == _Val; }); } // PARALLEL FUNCTION TEMPLATE sort template struct _Sort_work_item_impl { // data describing an individual sort work item using difference_type = _Diff; _Diff _Offset; _Diff _Size; _Diff _Ideal; }; template using _Sort_work_item = _Sort_work_item_impl<_Iter_diff_t<_RanIt>>; template bool _Process_sort_work_item(const _RanIt _Basis, _Pr _Pred, _Sort_work_item<_RanIt>& _Wi, _Sort_work_item<_RanIt>& _Right_fork_wi, _Iter_diff_t<_RanIt>& _Work_complete) noexcept /* terminates */ { // processes the sort work item, _Wi, relative to _Basis // if the sort is divided into quicksort sub-problems: // the return value is true // _Wi contains the left sub-problem; the caller should continue with this // _Right_fork_wi contains the right sub-problem; the caller should allow this to be stolen // otherwise: // the return value is false // _Wi's range is completely sorted // _Right_fork_wi is unmodified using _Diff = _Iter_diff_t<_RanIt>; constexpr auto _Diffsort_max = static_cast<_Diff>(_ISORT_MAX); const auto _Size = _Wi._Size; const auto _First = _Basis + _Wi._Offset; const auto _Last = _First + _Size; const auto _Ideal = _Wi._Ideal; if (_Size <= _Diffsort_max) { _Insertion_sort_unchecked(_First, _Last, _Pred); _Work_complete += _Size; return false; } if (0 < _Ideal) { // divide and conquer by partitioning (quicksort) const auto _Mid = _Partition_by_median_guess_unchecked(_First, _Last, _Pred); const auto _New_ideal = static_cast<_Diff>(_Ideal / 2 + _Ideal / 4); // allow 1.5 log2(N) divisions _Wi._Size = _Mid.first - _First; _Wi._Ideal = _New_ideal; _Right_fork_wi = {_Mid.second - _Basis, _Last - _Mid.second, _New_ideal}; _Work_complete += _Mid.second - _Mid.first; return true; } // too many divisions; heap sort _Make_heap_unchecked(_First, _Last, _Pred); _Sort_heap_unchecked(_First, _Last, _Pred); _Work_complete += _Size; return false; } template void _Process_sort_queue(const _RanIt _Basis, _Pr _Pred, _Work_stealing_membership<_Sort_work_item<_RanIt>>& _My_ticket, _Sort_work_item<_RanIt>& _Wi) noexcept /* terminates */ { _Sort_work_item<_RanIt> _Right_fork_wi; do { // process work items in the local queue while (_Process_sort_work_item(_Basis, _Pred, _Wi, _Right_fork_wi, _My_ticket._Work_complete)) { _TRY_BEGIN _My_ticket._Push_bottom(_Right_fork_wi); _CATCH(const _Parallelism_resources_exhausted&) // local queue is full and memory can't be acquired, process _Right_fork_wi serially const auto _First = _Basis + _Right_fork_wi._Offset; _Sort_unchecked(_First, _First + _Right_fork_wi._Size, _Right_fork_wi._Ideal, _Pred); 
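            // The serially processed right fork must still be credited below; _Work_complete appears to be
            // how the work-stealing team (constructed with the total element count) detects that the whole
            // sort has finished and lets _Steal report _Steal_result::_Done.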
            _My_ticket._Work_complete += _Right_fork_wi._Size;
            _CATCH_END
        }
    } while (_My_ticket._Try_pop_bottom(_Wi));
}

template <class _RanIt, class _Pr>
struct _Sort_operation { // context for background threads
    _RanIt _Basis;
    _Pr _Pred;
    _Work_stealing_team<_Sort_work_item<_RanIt>> _Team;

    _Sort_operation(_RanIt _First, _Pr _Pred_arg, size_t _Threads, _Iter_diff_t<_RanIt> _Count)
        : _Basis(_First), _Pred(_Pred_arg), _Team(_Threads, _Count) {}

    static void __stdcall _Threadpool_callback(
        __std_PTP_CALLBACK_INSTANCE, void* const _Context, const __std_PTP_WORK _Work) noexcept /* terminates */ {
        auto* const _This = static_cast<_Sort_operation*>(_Context);
        const auto _Basis = _This->_Basis;
        const auto _Pred = _This->_Pred;
        auto& _Team = _This->_Team;
        auto _My_ticket = _Team._Join_team();
        _Sort_work_item<_RanIt> _Wi;
        for (;;) {
            switch (_My_ticket._Steal(_Wi)) {
            case _Steal_result::_Success:
                _Process_sort_queue(_Basis, _Pred, _My_ticket, _Wi);
                break;
            case _Steal_result::_Abort:
                _My_ticket._Leave();
                __std_submit_threadpool_work(_Work);
                return;
            case _Steal_result::_Done:
                return;
            }
        }
    }
};

template <class _ExPo, class _RanIt, class _Pr, _Enable_if_execution_policy_t<_ExPo> /* = 0 */>
void sort(_ExPo&&, const _RanIt _First, const _RanIt _Last, _Pr _Pred) noexcept /* terminates */ {
    // order [_First, _Last), using _Pred
    _Adl_verify_range(_First, _Last);
    const auto _UFirst = _Get_unwrapped(_First);
    const auto _ULast = _Get_unwrapped(_Last);
    const _Iter_diff_t<_RanIt> _Ideal = _ULast - _UFirst;
    if constexpr (remove_reference_t<_ExPo>::_Parallelize) {
        size_t _Threads;
        if (_Ideal > _ISORT_MAX && (_Threads = __std_parallel_algorithms_hw_threads()) > 1) {
            // parallelize when input is large enough and we aren't on a uniprocessor machine
            _TRY_BEGIN
            _Sort_operation _Operation(_UFirst, _Pass_fn(_Pred), _Threads, _Ideal); // throws
            const _Work_ptr _Work{_Operation}; // throws
            auto& _Team = _Operation._Team;
            auto _My_ticket = _Team._Join_team();
            _Work._Submit(_Threads - 1);
            _Sort_work_item<_RanIt> _Wi{0, _Ideal, _Ideal};
            _Steal_result _Sr;
            do {
                _Process_sort_queue(_UFirst, _Pass_fn(_Pred), _My_ticket, _Wi);
                do {
                    _Sr = _My_ticket._Steal(_Wi);
                } while (_Sr == _Steal_result::_Abort);
            } while (_Sr != _Steal_result::_Done);
            return;
            _CATCH(const _Parallelism_resources_exhausted&)
            // fall through to _Sort_unchecked, below
            _CATCH_END
        }
    }

    _Sort_unchecked(_UFirst, _ULast, _Ideal, _Pass_fn(_Pred));
}

// PARALLEL FUNCTION TEMPLATE stable_sort
template <class _Ty>
struct _Static_partitioned_temporary_buffer2 {
    _Optimistic_temporary_buffer<_Ty>& _Temp_buf;
    ptrdiff_t _Chunk_size;
    ptrdiff_t _Unchunked_items;

    template <class _Diff>
    explicit _Static_partitioned_temporary_buffer2(
        _Optimistic_temporary_buffer<_Ty>& _Temp_buf_raw, _Static_partition_team<_Diff>& _Team)
        : _Temp_buf(_Temp_buf_raw), _Chunk_size(static_cast<ptrdiff_t>(_Temp_buf._Capacity / _Team._Chunks)),
          _Unchunked_items(static_cast<ptrdiff_t>(_Temp_buf._Capacity % _Team._Chunks)) {}

    pair<_Ty*, ptrdiff_t> _Get_temp_buffer_for_chunk(const size_t _Chunk_number) {
        // get a view of the region of the temporary buffer allocated to chunk _Chunk_number
        return _Get_temp_buffer_for_chunk_range(_Chunk_number, _Chunk_number + 1);
    }

    pair<_Ty*, ptrdiff_t> _Get_temp_buffer_for_chunk_range(const size_t _Base, const size_t _End) {
        // get a view of the region of the temporary buffer allocated to the region of chunks [_Base, _End)
        const auto _Offset = _Get_offset(_Base);
        return {_Temp_buf._Data + _Offset, _Get_offset(_End) - _Offset};
    }

    _Ty* _Get_first(const size_t _Chunk_number) {
        // get a pointer to the first element of the temporary buffer allocated to chunk _Chunk_number
        return _Temp_buf._Data + _Get_offset(_Chunk_number);
    }

    ptrdiff_t
_Get_offset(const size_t _Chunk_number) { // get the offset of the first element of the temporary buffer allocated to chunk _Chunk_number auto _Diff_chunk = static_cast(_Chunk_number); return _Diff_chunk * _Chunk_size + (_STD min)(_Diff_chunk, _Unchunked_items); } void _Destroy_all() { // destroy each element of the temporary buffer _Destroy_range(_Temp_buf._Data, _Temp_buf._Data + _Temp_buf._Capacity); } }; inline size_t _Get_stable_sort_tree_height(const size_t _Count, const size_t _Hw_threads) { // Get height of merge tree for parallel stable_sort, a bottom-up merge sort. // * each merge takes two chunks from a buffer and copies to the other buffer in sorted order // * we want the overall result to end up in the input buffer and not into _Temp_buf; each merge // "level" switches between the input buffer and the temporary buffer; as a result we want // the number of merge "levels" to be even (and thus chunks must be 2 raised to an even power) // * the smallest chunk must be at least of size _ISORT_MAX // * we want a number of chunks as close to _Ideal_chunks as we can to minimize scheduling // overhead, but can use more chunks than that const auto _Count_max_chunks = _Count / _ISORT_MAX; const size_t _Log_count_max_chunks = _Floor_of_log_2(_Count_max_chunks); // if _Log_count_max_chunks is odd, that would break our 2 to even power invariant, so // go to the next smaller power of 2 const auto _Count_max_tree_height = _Log_count_max_chunks & ~static_cast(1); const auto _Ideal_chunks = _Hw_threads * _Oversubscription_multiplier; const size_t _Log_ideal_chunks = _Floor_of_log_2(_Ideal_chunks); #ifdef _WIN64 const size_t _Max_tree_height = 62; // to avoid ptrdiff_t overflow #else // ^^^ _WIN64 ^^^ // vvv !_WIN64 vvv const size_t _Max_tree_height = 30; #endif // _WIN64 const size_t _Clamped_ideal_chunks = (_STD min)(_Max_tree_height, _Log_ideal_chunks); // similarly, if _Clamped_ideal_chunks is odd, that would break our 2 to even power invariant, // so go to the next higher power of 2 const auto _Ideal_tree_height = _Clamped_ideal_chunks + (_Clamped_ideal_chunks & 0x1U); return (_STD min)(_Count_max_tree_height, _Ideal_tree_height); } struct _Bottom_up_merge_tree { // merge tree / cells: // each level of the tree has the next 1 bit turned on: // 0 == done // 1 1 // 2 3 1x // 4 5 6 7 1xx // chunks: 0 1 2 3 4 5 6 7 // the starting cell for a given chunk is (_Chunk_number >> 1) + (1 << (height - 1)) // (divide the chunk number by 2, and add the starting index of the bottom row of the tree) // a cell's parent is just left shifting the current cell by 1 size_t _Height; // each cell's data is stored at _Buckets[_Cell - 1], since the 0th cell is unused / indicates termination _Parallel_vector> _Buckets; explicit _Bottom_up_merge_tree(const size_t _Height_) : _Height(_Height_), _Buckets((static_cast(1) << _Height_) - 1) {} }; struct _Bottom_up_tree_visitor { size_t _Cell; size_t _Base; size_t _Shift; _Bottom_up_tree_visitor(const size_t _Tree_height, const size_t _Initial_chunk) : _Cell((_Initial_chunk >> 1) + (static_cast(1) << (_Tree_height - 1))), _Base(_Initial_chunk & ~static_cast(1)), _Shift(1) {} bool _Try_give_up_merge_to_peer(_Bottom_up_merge_tree& _Merge_tree) const { // Attempt to mark that this child is done, to let our peer do the merge. // Returns whether we successfully gave responsibility for doing the current merge to our peer. 
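        // Each tree cell holds a flag that both children of a merge eventually touch. The first child to
        // arrive sets it via exchange(true), observes false, and returns true here, leaving the merge to its
        // sibling; the second child observes true and must perform the merge itself. For example, with
        // _Height == 3, chunks 4 and 5 both start at cell 6 (see the diagram above), and whichever of the
        // two finishes its serial sort last merges the pair.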
return !_Merge_tree._Buckets[_Cell - 1].exchange(true); } bool _Go_to_parent() { // Attempt to go to the parent in the merge tree; returns whether the move to the parent was successful. _Shift <<= 1; _Base -= _Shift * (_Cell & static_cast(1)); _Cell >>= 1; return _Cell != 0; } }; template struct _Static_partitioned_stable_sort3 { using _Diff = _Iter_diff_t<_BidIt>; _Static_partition_team<_Diff> _Team; _Static_partition_range<_BidIt> _Basis; _Bottom_up_merge_tree _Merge_tree; _Static_partitioned_temporary_buffer2<_Iter_value_t<_BidIt>> _Temp_buf; _Pr _Pred; _Static_partitioned_stable_sort3(_Optimistic_temporary_buffer<_Iter_value_t<_BidIt>>& _Temp_buf_raw, const _Diff _Count, const size_t _Merge_tree_height_, const _BidIt _First, _Pr _Pred_) : _Team(_Count, static_cast(1) << _Merge_tree_height_), _Basis{}, _Merge_tree(_Merge_tree_height_), _Temp_buf(_Temp_buf_raw, _Team), _Pred{_Pred_} { _Basis._Populate(_Team, _First); } _Cancellation_status _Process_chunk() { const auto _Key = _Team._Get_next_key(); if (!_Key) { return _Cancellation_status::_Canceled; } { const auto _Serial_chunk = _Basis._Get_chunk(_Key); const auto _Temp_chunk = _Temp_buf._Get_temp_buffer_for_chunk(_Key._Chunk_number); _Stable_sort_unchecked( _Serial_chunk._First, _Serial_chunk._Last, _Key._Size, _Temp_chunk.first, _Temp_chunk.second, _Pred); } _Bottom_up_tree_visitor _Visitor{_Merge_tree._Height, _Key._Chunk_number}; if (_Visitor._Try_give_up_merge_to_peer(_Merge_tree)) { // peer will do the work to merge return _Cancellation_status::_Running; } if (_Temp_buf._Temp_buf._Capacity >= _Team._Count) { // enough space, combine using merge { // first merge is special; constructs the elements in the temporary buffer space const size_t _Base = _Visitor._Base; _Uninitialized_merge_move(_Basis._Get_first(_Base, _Team._Get_chunk_offset(_Base)), _Basis._Get_first(_Base + 1, _Team._Get_chunk_offset(_Base + 1)), _Basis._Get_first(_Base + 2, _Team._Get_chunk_offset(_Base + 2)), _Temp_buf._Get_first(_Base), _Pred); } for (;;) { // walk remaining merge tree (void) _Visitor._Go_to_parent(); // can't be complete because the data is in the temporary buffer if (_Visitor._Try_give_up_merge_to_peer(_Merge_tree)) { // the other child will do the merge return _Cancellation_status::_Running; } { // merge two sub-ranges from temporary buffer to input const size_t _Base = _Visitor._Base; const size_t _Mid = _Base + _Visitor._Shift; const size_t _End = _Mid + _Visitor._Shift; _Merge_move(_Temp_buf._Get_first(_Base), _Temp_buf._Get_first(_Mid), _Temp_buf._Get_first(_End), _Basis._Get_first(_Base, _Team._Get_chunk_offset(_Base)), _Pred); } if (!_Visitor._Go_to_parent()) { // temporary bits have been copied back to the input, no parent, so we're done _Temp_buf._Destroy_all(); return _Cancellation_status::_Canceled; } if (_Visitor._Try_give_up_merge_to_peer(_Merge_tree)) { // the other child will do the merge return _Cancellation_status::_Running; } { // merge two sub-ranges from input to temporary buffer const size_t _Base = _Visitor._Base; const size_t _Mid = _Base + _Visitor._Shift; const size_t _End = _Mid + _Visitor._Shift; _Merge_move(_Basis._Get_first(_Base, _Team._Get_chunk_offset(_Base)), _Basis._Get_first(_Mid, _Team._Get_chunk_offset(_Mid)), _Basis._Get_first(_End, _Team._Get_chunk_offset(_End)), _Temp_buf._Get_first(_Base), _Pred); } } } else { // not enough space, combine using inplace_merge for (;;) { const size_t _Base = _Visitor._Base; const size_t _Mid = _Base + _Visitor._Shift; const size_t _End = _Mid + _Visitor._Shift; const auto 
_Base_offset = _Team._Get_chunk_offset(_Base); const auto _Mid_offset = _Team._Get_chunk_offset(_Mid); const auto _End_offset = _Team._Get_chunk_offset(_End); const auto _Temp_range = _Temp_buf._Get_temp_buffer_for_chunk_range(_Base, _End); _Buffered_inplace_merge_unchecked(_Basis._Get_first(_Base, _Base_offset), _Basis._Get_first(_Mid, _Mid_offset), _Basis._Get_first(_End, _End_offset), _Mid_offset - _Base_offset, _End_offset - _Mid_offset, _Temp_range.first, _Temp_range.second, _Pred); if (!_Visitor._Go_to_parent()) { // no parent, so we're done return _Cancellation_status::_Canceled; } if (_Visitor._Try_give_up_merge_to_peer(_Merge_tree)) { // the other child will do the merge return _Cancellation_status::_Running; } } } } static void __stdcall _Threadpool_callback( __std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ { _Run_available_chunked_work(*static_cast<_Static_partitioned_stable_sort3*>(_Context)); } }; template /* = 0 */> void stable_sort(_ExPo&&, const _BidIt _First, const _BidIt _Last, _Pr _Pred) noexcept /* terminates */ { // sort preserving order of equivalents, using _Pred _Adl_verify_range(_First, _Last); const auto _UFirst = _Get_unwrapped(_First); const auto _ULast = _Get_unwrapped(_Last); const auto _Count = _STD distance(_UFirst, _ULast); if (_Count <= _ISORT_MAX) { _Insertion_sort_unchecked(_UFirst, _ULast, _Pass_fn(_Pred)); return; } size_t _Hw_threads; bool _Attempt_parallelism; if constexpr (remove_reference_t<_ExPo>::_Parallelize) { _Hw_threads = __std_parallel_algorithms_hw_threads(); _Attempt_parallelism = _Hw_threads > 1; } else { _Attempt_parallelism = false; } _Optimistic_temporary_buffer<_Iter_value_t<_BidIt>> _Temp_buf{_Attempt_parallelism ? _Count : _Count - _Count / 2}; if constexpr (remove_reference_t<_ExPo>::_Parallelize) { if (_Attempt_parallelism) { // forward+ iterator overflow assumption for size_t cast const auto _Tree_height = _Get_stable_sort_tree_height(static_cast(_Count), _Hw_threads); if (_Tree_height != 0) { _TRY_BEGIN _Static_partitioned_stable_sort3 _Operation{_Temp_buf, _Count, _Tree_height, _UFirst, _Pass_fn(_Pred)}; _Run_chunked_parallel_work(_Hw_threads, _Operation); return; _CATCH(const _Parallelism_resources_exhausted&) // fall through to serial case below _CATCH_END } } } _Stable_sort_unchecked(_UFirst, _ULast, _Count, _Temp_buf._Data, _Temp_buf._Capacity, _Pass_fn(_Pred)); } // PARALLEL FUNCTION TEMPLATE is_sorted_until template struct _Static_partitioned_is_sorted_until { _Static_partition_team<_Iter_diff_t<_FwdIt>> _Team; // note offset partitioning: _Static_partition_range<_FwdIt> _Basis; // contains partition of [_First, _Last - 1) _Pr _Pred; _Parallel_find_results<_FwdIt> _Results; _Static_partitioned_is_sorted_until( _FwdIt _First, _FwdIt _Last, const size_t _Hw_threads, const _Iter_diff_t<_FwdIt> _Count, _Pr _Pred_) : _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Basis{}, _Pred(_Pred_), _Results(_Last) { _Basis._Populate(_Team, _First); } _Cancellation_status _Process_chunk() { if (_Results._Complete()) { return _Cancellation_status::_Canceled; } const auto _Key = _Team._Get_next_key(); if (!_Key) { return _Cancellation_status::_Canceled; } auto _Range = _Basis._Get_chunk(_Key); auto _Next = _Range._First; do { ++_Next; if (_DEBUG_LT_PRED(_Pred, *_Next, *_Range._First)) { _Results._Imbue(_Key._Chunk_number, _Next); return _Cancellation_status::_Canceled; } _Range._First = _Next; } while (_Range._First != _Range._Last); return _Cancellation_status::_Running; } 
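    // Note on the offset partitioning: _Basis covers [_First, _Last - 1) (the caller shrinks _Count by one),
    // and each position is compared with its successor, so every adjacent pair is examined exactly once,
    // including pairs that straddle a chunk boundary; the loop above intentionally dereferences _Range._Last.
    // For example, 9 elements give 8 partitioned positions, and a chunk covering positions [3, 6) checks the
    // pairs (3,4), (4,5), and (5,6).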
static void __stdcall _Threadpool_callback( __std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ { _Run_available_chunked_work(*static_cast<_Static_partitioned_is_sorted_until*>(_Context)); } }; template /* = 0 */> _NODISCARD _FwdIt is_sorted_until(_ExPo&&, _FwdIt _First, _FwdIt _Last, _Pr _Pred) noexcept /* terminates */ { // find extent of range that is ordered by predicate _REQUIRE_PARALLEL_ITERATOR(_FwdIt); _Adl_verify_range(_First, _Last); const auto _UFirst = _Get_unwrapped(_First); const auto _ULast = _Get_unwrapped(_Last); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); if (_Hw_threads > 1) { // parallelize on multiprocessor machines auto _Count = _STD distance(_UFirst, _ULast); if (_Count >= 3) { // ... with at least 3 elements _TRY_BEGIN --_Count; // note unusual offset partitioning _Static_partitioned_is_sorted_until _Operation{_UFirst, _ULast, _Hw_threads, _Count, _Pass_fn(_Pred)}; _Run_chunked_parallel_work(_Hw_threads, _Operation); _Seek_wrapped(_First, _Operation._Results._Get_result()); return _First; _CATCH(const _Parallelism_resources_exhausted&) // fall through to serial case below _CATCH_END } } } _Seek_wrapped(_First, _STD is_sorted_until(_UFirst, _ULast, _Pass_fn(_Pred))); return _First; } // PARALLEL FUNCTION TEMPLATE is_partitioned inline constexpr unsigned char _Contains_counterexample = 0; inline constexpr unsigned char _Contains_true = 1; inline constexpr unsigned char _Contains_false = 2; template unsigned char _Chunk_is_partitioned_status(_FwdIt _First, _FwdIt _Last, _Pr _Pred) { // Returns the is_partitioned status for this range. // pre: Range is non-empty unsigned char _Result = 0; if (_Pred(*_First)) { // starts with T _Result = _Contains_true; for (;;) { // skip T partition ++_First; if (_First == _Last) { // all T return _Contains_true; } if (!_Pred(*_First)) { // range contains a switching point from T to F break; } } } _Result |= _Contains_false; while (++_First != _Last) { // verify F partition if (_Pred(*_First)) { // found an out of place element return _Contains_counterexample; } } return _Result; } template struct _Static_partitioned_is_partitioned { _Static_partition_team<_Iter_diff_t<_FwdIt>> _Team; _Static_partition_range<_FwdIt> _Basis; atomic _Rightmost_true; // chunk number of the rightmost chunk found so far containing a T element atomic _Leftmost_false; // chunk number of the leftmost chunk found so far containing an F element _Pr _Pred; _Static_partitioned_is_partitioned( const size_t _Hw_threads, const _Iter_diff_t<_FwdIt> _Count, const _FwdIt _First, _Pr _Pred_) : _Team(_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)), _Basis{}, _Rightmost_true(0), _Leftmost_false(SIZE_MAX - 1), _Pred(_Pred_) { _Basis._Populate(_Team, _First); } _Cancellation_status _Process_chunk() { // Note that the cancellation status here is not used in the final returned answer of is_partitioned. Returning // _Cancellation_status::_Canceled is simply used as an "early fail" mechanism to avoid doing unnecessary work. // A final comparison of _Rightmost_true and _Leftmost_false is used to determine the final return value from // a call to is_partitioned. 
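        // Decision rule used by is_partitioned below: the range is reported partitioned exactly when
        // _Rightmost_true <= _Leftmost_false once all chunks are processed. For example, with 8 chunks, a T
        // element seen in chunk 5 and an F element seen in chunk 3 give 5 > 3, proving the range is not
        // partitioned; a chunk whose own elements run F-then-T stores SIZE_MAX into _Rightmost_true to force
        // the same outcome.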
if (_Rightmost_true.load(memory_order_relaxed) > _Leftmost_false.load(memory_order_relaxed)) { // we've found a T to the right of an F, so we know the range cannot be partitioned and we can stop return _Cancellation_status::_Canceled; } const auto _Key = _Team._Get_next_key(); if (!_Key) { return _Cancellation_status::_Canceled; } // looking at chunks from either end, moving in towards the middle auto _Target_chunk_number = _Key._Chunk_number >> 1; if (!(_Key._Chunk_number & static_cast(1))) { _Target_chunk_number = _Team._Chunks - _Target_chunk_number - 1; } const auto _Target_chunk_key = _Team._Get_chunk_key(_Target_chunk_number); const auto _Chunk = _Basis._Get_chunk(_Target_chunk_key); const auto _This_chunk_status = _Chunk_is_partitioned_status(_Chunk._First, _Chunk._Last, _Pred); if (_This_chunk_status == _Contains_counterexample) { // indicating that this chunk contains a counterexample, so the range is not partitioned _Rightmost_true.store(SIZE_MAX); return _Cancellation_status::_Canceled; } // after determining the is_partitioned status for this chunk, // we need to update the chunk numbers for leftmost F and rightmost T auto _Old_true = _Rightmost_true.load(); if (_This_chunk_status & _Contains_true) { while (_Target_chunk_number > _Old_true) { if (_Rightmost_true.compare_exchange_weak(_Old_true, _Target_chunk_number)) { _Old_true = _Target_chunk_number; break; } } } // try to bail before doing more work if possible auto _Old_false = _Leftmost_false.load(); if (_Old_true > _Old_false) { return _Cancellation_status::_Canceled; } if (_This_chunk_status & _Contains_false) { while (_Target_chunk_number < _Old_false) { if (_Leftmost_false.compare_exchange_weak(_Old_false, _Target_chunk_number)) { _Old_false = _Target_chunk_number; break; } } } // Other loads/stores may have been reordered around the loads of _Old_false and _Old_true, but this check may // allow us to avoid more atomic loads. // If such loads/stores have been reordered around the loads of _Old_false and _Old_true, then the next call to // _Process_chunk will exit given the check on the _Rightmost_true and _Leftmost_false values at the top of the // method. if (_Old_true > _Old_false) { return _Cancellation_status::_Canceled; } return _Cancellation_status::_Running; } static void __stdcall _Threadpool_callback( __std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ { _Run_available_chunked_work(*static_cast<_Static_partitioned_is_partitioned*>(_Context)); } }; template /* = 0 */> _NODISCARD bool is_partitioned(_ExPo&&, const _FwdIt _First, const _FwdIt _Last, _Pr _Pred) noexcept /* terminates */ { // test if [_First, _Last) is partitioned by _Pred _REQUIRE_PARALLEL_ITERATOR(_FwdIt); _Adl_verify_range(_First, _Last); const auto _UFirst = _Get_unwrapped(_First); const auto _ULast = _Get_unwrapped(_Last); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); if (_Hw_threads > 1) { // parallelize on multiprocessor machines const auto _Count = _STD distance(_UFirst, _ULast); if (_Count >= 2) { // ... 
with at least 2 elements _TRY_BEGIN _Static_partitioned_is_partitioned _Operation{_Hw_threads, _Count, _UFirst, _Pass_fn(_Pred)}; _Run_chunked_parallel_work(_Hw_threads, _Operation); return _Operation._Rightmost_true.load(memory_order_relaxed) <= _Operation._Leftmost_false.load(memory_order_relaxed); _CATCH(const _Parallelism_resources_exhausted&) // fall through to serial case below _CATCH_END } } } return _STD is_partitioned(_UFirst, _ULast, _Pass_fn(_Pred)); } // PARALLEL FUNCTION TEMPLATE is_heap_until template struct _Static_partitioned_is_heap_until { using _Diff = _Iter_diff_t<_RanIt>; _Static_partition_team<_Diff> _Team; _RanIt _Range_first; _Pr _Pred; _Parallel_find_results<_RanIt> _Results; _Static_partitioned_is_heap_until( _RanIt _First, _RanIt _Last, const size_t _Hw_threads, const _Diff _Count, _Pr _Pred_) : _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Range_first(_First), _Pred(_Pred_), _Results(_Last) {} _Cancellation_status _Process_chunk() { if (_Results._Complete()) { return _Cancellation_status::_Canceled; } const auto _Key = _Team._Get_next_key(); if (!_Key) { return _Cancellation_status::_Canceled; } const auto _Chunk_range_size = _Key._Size; const auto _Chunk_offset = _Key._Start_at; const auto _Last = _Chunk_offset + _Chunk_range_size; const auto _Initial = (_STD max)(_Chunk_offset, _Diff{1}); for (_Diff _Off = _Initial; _Off < _Last; ++_Off) { if (_DEBUG_LT_PRED(_Pred, *(_Range_first + ((_Off - 1) >> 1)), *(_Range_first + _Off))) { _Results._Imbue(_Key._Chunk_number, _Range_first + _Off); return _Cancellation_status::_Canceled; } } return _Cancellation_status::_Running; } static void __stdcall _Threadpool_callback( __std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ { _Run_available_chunked_work(*static_cast<_Static_partitioned_is_heap_until*>(_Context)); } }; template /* = 0 */> _NODISCARD _RanIt is_heap_until(_ExPo&&, _RanIt _First, _RanIt _Last, _Pr _Pred) noexcept /* terminates */ { // find extent of range that is a heap ordered by _Pred _REQUIRE_PARALLEL_ITERATOR(_RanIt); _Adl_verify_range(_First, _Last); const auto _UFirst = _Get_unwrapped(_First); const auto _ULast = _Get_unwrapped(_Last); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); if (_Hw_threads > 1) { // parallelize on multiprocessor machines const auto _Count = _ULast - _UFirst; if (_Count >= 3) { // ... 
with at least 3 elements _TRY_BEGIN _Static_partitioned_is_heap_until _Operation{_UFirst, _ULast, _Hw_threads, _Count, _Pass_fn(_Pred)}; _Run_chunked_parallel_work(_Hw_threads, _Operation); _Seek_wrapped(_First, _Operation._Results._Get_result()); return _First; _CATCH(const _Parallelism_resources_exhausted&) // fall through to serial case below _CATCH_END } } } _Seek_wrapped(_First, _STD is_heap_until(_UFirst, _ULast, _Pass_fn(_Pred))); return _First; } // PARALLEL FUNCTION TEMPLATE partition template pair<_FwdIt, _Iter_diff_t<_FwdIt>> _Partition_with_count_unchecked(_FwdIt _First, _FwdIt _Last, _Pr _Pred) { // move elements satisfying _Pred to front and track how many elements satisfy _Pred if constexpr (_Is_random_iter_v<_FwdIt>) { auto _Mid = _STD partition(_First, _Last, _Pred); return {_Mid, _Mid - _First}; } else if constexpr (_Is_bidi_iter_v<_FwdIt>) { _Iter_diff_t<_FwdIt> _Trues{}; for (;;) { // find any out-of-order pair for (;;) { // skip in-place elements at beginning if (_First == _Last) { return {_First, _Trues}; } if (!_Pred(*_First)) { break; } ++_First; ++_Trues; } do { // skip in-place elements at end --_Last; if (_First == _Last) { return {_First, _Trues}; } } while (!_Pred(*_Last)); _STD iter_swap(_First, _Last); // out of place, swap and loop ++_First; ++_Trues; } } else { _Iter_diff_t<_FwdIt> _Trues{}; for (;;) { // skip in-place elements at beginning if (_First == _Last) { return {_First, _Trues}; } if (!_Pred(*_First)) { break; } ++_First; ++_Trues; } for (_FwdIt _Next = _First; ++_Next != _Last;) { if (_Pred(*_Next)) { _STD iter_swap(_First, _Next); // out of place, swap and loop ++_First; ++_Trues; } } return {_First, _Trues}; } } template pair<_FwdIt, _Iter_diff_t<_FwdIt>> _Partition_swap_backward( _FwdIt _First, _FwdIt _Last, _FwdIt _Beginning_of_falses, _Pr _Pred) { // Swap elements in [_First, _Last) satisfying _Pred with elements from _Beginning_of_falses. 
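    // Illustrative sketch with hypothetical values, assuming _Pred is "is even": given falses {1, 3} starting at
    // _Beginning_of_falses and [_First, _Last) == {4, 5, 6}, the evens 4 and 6 are swapped back into the false
    // region (their relative order is unspecified), the odds end up after them, and the function returns the first
    // false position of the combined region paired with _Trues == 2.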
// Pre: _Beginning_of_falses < _First _Iter_diff_t<_FwdIt> _Trues{}; if constexpr (_Is_bidi_iter_v<_FwdIt>) { while (_First != _Last) { --_Last; if (_Pred(*_Last)) { _STD iter_swap(_Beginning_of_falses, _Last); ++_Beginning_of_falses; ++_Trues; if (_Beginning_of_falses == _First) { auto _Remain = _Partition_with_count_unchecked(_First, _Last, _Pred); return {_Remain.first, static_cast<_Iter_diff_t<_FwdIt>>(_Remain.second + _Trues)}; } } } } else { for (; _First != _Last; ++_First) { if (_Pred(*_First)) { _STD iter_swap(_First, _Beginning_of_falses); ++_Beginning_of_falses; ++_Trues; } } } return {_Beginning_of_falses, _Trues}; } template _FwdIt _Partition_merge(const _FwdIt _False_first, const _FwdIt _True_first, const _FwdIt _True_last, const _Iter_diff_t<_FwdIt> _Count1, const _Iter_diff_t<_FwdIt> _Count2) { // Merge partition ranges where [_False_first, _True_first) are falses, [_True_first, _True_last) are trues // pre: _Count1 == distance(_False_first, _True_first) && _Count2 == distance(_True_first, _True_last) if (_Count1 < _Count2) { // move the false range to the end of the true range const _Iter_diff_t<_FwdIt> _Offset = _Count2 - _Count1; auto _Result = _True_first; if constexpr (_Is_random_iter_v<_FwdIt>) { _Result += _Offset; } else if constexpr (_Is_bidi_iter_v<_FwdIt>) { if (_Count1 < _Offset) { _Result = _True_last; _STD advance(_Result, -_Count1); } else { _STD advance(_Result, _Offset); } } else { _STD advance(_Result, _Offset); } _Swap_ranges_unchecked(_False_first, _True_first, _Result); return _Result; } // move the true range to the beginning of the false range return _Swap_ranges_unchecked(_True_first, _True_last, _False_first); } template struct _Static_partitioned_partition2 { using _Diff = _Iter_diff_t<_FwdIt>; enum class _Chunk_state : unsigned char { _Serial, // while a chunk is in the serial state, it is touched only by an owner thread _Merging, // while a chunk is in the merging state, threads all try to CAS the chunk _Merging -> _Swapping // the thread that succeeds takes responsibility for swapping the trues from that chunk to the // results _Swapping, // while a chunk is in the swapping state, the trues are being merged with _Results // only one chunk at a time is ever _Swapping; this also serves to synchronize access to // _Results and _Results_falses _Done // when a chunk becomes _Done, it is complete / will never need to touch _Results again }; #pragma warning(push) #pragma warning(disable : 4324) // structure was padded due to alignment specifier struct alignas(hardware_destructive_interference_size) alignas(_FwdIt) _Chunk_local_data { atomic<_Chunk_state> _State; _FwdIt _Beginning_of_falses; _Diff _Chunk_trues; }; #pragma warning(pop) _Static_partition_team<_Diff> _Team; _Static_partition_range<_FwdIt> _Basis; _Pr _Pred; _Parallel_vector<_Chunk_local_data> _Chunk_locals; _FwdIt _Results; _Diff _Results_falses; _Static_partitioned_partition2(const size_t _Hw_threads, const _Diff _Count, const _FwdIt _First, const _Pr _Pred_) : _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Basis{}, _Pred{_Pred_}, _Chunk_locals(_Team._Chunks), _Results{_First}, _Results_falses{} { _Basis._Populate(_Team, _First); } _Cancellation_status _Process_chunk() { const auto _Key = _Team._Get_next_key(); if (!_Key) { return _Cancellation_status::_Canceled; } // serial-on-each-chunk phase: auto _Merge_index = _Key._Chunk_number; // merge step will start from this index { auto& _Chunk_data = _Chunk_locals[_Merge_index]; const auto _Range = 
_Basis._Get_chunk(_Key); if (_Merge_index == 0 || _Chunk_locals[_Merge_index - 1]._State.load() == _Chunk_state::_Done) { // no predecessor, so run serial algorithm directly into results const auto _Chunk_results = _Merge_index == 0 || _Results == _Range._First ? _Partition_with_count_unchecked(_Range._First, _Range._Last, _Pred) : _Partition_swap_backward(_Range._First, _Range._Last, _Results, _Pred); _Results = _Chunk_results.first; _Chunk_data._Chunk_trues = _Chunk_results.second; _Results_falses += _Key._Size - _Chunk_results.second; _Chunk_data._State.store(_Chunk_state::_Done); ++_Merge_index; // this chunk is already merged } else { // predecessor, run serial algorithm in place and attempt to merge later auto _Chunk_results = _Partition_with_count_unchecked(_Range._First, _Range._Last, _Pred); _Chunk_data._Beginning_of_falses = _Chunk_results.first; _Chunk_data._Chunk_trues = _Chunk_results.second; _Chunk_data._State.store(_Chunk_state::_Merging); if (_Chunk_locals[_Merge_index - 1]._State.load() != _Chunk_state::_Done) { // if the predecessor isn't done, whichever thread merges our predecessor will merge us too return _Cancellation_status::_Running; } } } // merge phase: at this point, we have observed that our predecessor chunk has been merged to the output, // attempt to become the new merging thread if the previous merger gave up // note: it is an invariant when we get here that _Chunk_locals[_Merge_index - 1]._State == _Chunk_state::_Done for (; _Merge_index != _Team._Chunks; ++_Merge_index) { auto& _Merge_chunk_data = _Chunk_locals[_Merge_index]; auto _Expected = _Chunk_state::_Merging; if (!_Merge_chunk_data._State.compare_exchange_strong(_Expected, _Chunk_state::_Swapping)) { // either the _Merge_index chunk isn't ready to merge yet, or another thread will do it return _Cancellation_status::_Running; } const auto _Merge_key = _Team._Get_chunk_key(_Merge_index); const auto _Chunk_trues = _Merge_chunk_data._Chunk_trues; _Results = _Partition_merge(_Results, _Basis._Get_first(_Merge_index, _Merge_key._Start_at), _STD exchange(_Merge_chunk_data._Beginning_of_falses, {}), _Results_falses, _Chunk_trues); _Results_falses += _Merge_key._Size - _Chunk_trues; _Merge_chunk_data._State.store(_Chunk_state::_Done); } return _Cancellation_status::_Canceled; } static void __stdcall _Threadpool_callback( __std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ { _Run_available_chunked_work(*static_cast<_Static_partitioned_partition2*>(_Context)); } }; template /* = 0 */> _FwdIt partition(_ExPo&&, _FwdIt _First, const _FwdIt _Last, _Pr _Pred) noexcept /* terminates */ { // move elements satisfying _Pred to beginning of sequence _Adl_verify_range(_First, _Last); const auto _UFirst = _Get_unwrapped(_First); const auto _ULast = _Get_unwrapped(_Last); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); if (_Hw_threads > 1) { const auto _Count = _STD distance(_UFirst, _ULast); if (_Count >= 2) { _TRY_BEGIN _Static_partitioned_partition2 _Operation{_Hw_threads, _Count, _UFirst, _Pass_fn(_Pred)}; _Run_chunked_parallel_work(_Hw_threads, _Operation); _Seek_wrapped(_First, _Operation._Results); return _First; _CATCH(const _Parallelism_resources_exhausted&) // fall through to serial case below _CATCH_END } } } _Seek_wrapped(_First, _STD partition(_UFirst, _ULast, _Pass_fn(_Pred))); return _First; } // PARALLEL FUNCTION TEMPLATE set_intersection template struct _Storage_for { // uninitialized space 
    // to store a _Ty
    alignas(_Ty) unsigned char _Storage[sizeof(_Ty)];

    _Storage_for() = default;
    _Storage_for(const _Storage_for&) = delete;
    _Storage_for& operator=(const _Storage_for&) = delete;

    _Ty& _Ref() {
        return reinterpret_cast<_Ty&>(_Storage);
    }
};

inline constexpr unsigned char _Local_available = 1;
inline constexpr unsigned char _Sum_available = 2;

template <class _Ty>
struct _Scan_decoupled_lookback {
    // inter-chunk communication block in "Single-pass Parallel Prefix Scan with Decoupled Look-back" by Merrill and
    // Garland
    using value_type = _Ty;
    atomic<unsigned char> _State;
    _Storage_for<_Ty> _Local; // owned by thread iff _State < _Local_available; otherwise const and shared
    _Storage_for<_Ty> _Sum; // owned by thread iff _State < _Sum_available

    unsigned char _Get_available_state() const {
        for (;;) {
            const unsigned char _Local_state = _State.load();
            if (_Local_state != 0) {
                return _Local_state;
            }

            __std_execution_wait_on_uchar(reinterpret_cast<const volatile unsigned char*>(&_State), _Local_state);
        }
    }

    void _Store_available_state(const unsigned char _New_state) {
        _State.store(_New_state);
        __std_execution_wake_by_address_all(&_State);
    }

    template <class _FwdIt, class _BinOp>
    void _Apply_exclusive_predecessor(_Ty& _Preceding, _FwdIt _First, const _FwdIt _Last, _BinOp _Reduce_op) {
        // apply _Preceding to [_First, _Last) and _Sum._Ref(), using _Reduce_op
        _Construct_in_place(_Sum._Ref(), _Reduce_op(_Preceding, _Local._Ref()));
        _State.store(_Local_available | _Sum_available);

        *_First = _Preceding;
#pragma loop(ivdep)
        while (++_First != _Last) {
            *_First = _Reduce_op(_Preceding, _STD move(*_First));
        }
    }

    template <class _FwdIt, class _BinOp>
    void _Apply_inclusive_predecessor(_Ty& _Preceding, _FwdIt _First, const _FwdIt _Last, _BinOp _Reduce_op) {
        // apply _Preceding to [_First, _Last) and _Sum._Ref(), using _Reduce_op
        _Construct_in_place(_Sum._Ref(), _Reduce_op(_Preceding, _Local._Ref()));
        _State.store(_Local_available | _Sum_available);

#pragma loop(ivdep)
        for (; _First != _Last; ++_First) {
            *_First = _Reduce_op(_Preceding, _STD move(*_First));
        }
    }

    ~_Scan_decoupled_lookback() {
        const auto _State_bits = _State.load(memory_order_relaxed);
        if (_State_bits & _Sum_available) {
            _Destroy_in_place(_Sum._Ref());
        }

        if (_State_bits & _Local_available) {
            _Destroy_in_place(_Local._Ref());
        }
    }
};

template <class _BidIt, class _BinOp>
typename _Iter_value_t<_BidIt>::value_type _Get_lookback_sum(const _BidIt _Current, _BinOp _Reduce_op) {
    // Calculate the sum of the _Scan_decoupled_lookback referenced by _Current.
    // pre: _Current->_State & _Local_available
    // pre: Some iterator i exists before _Current such that i->_Get_available_state & _Sum_available
    static_assert(_Is_specialization_v<_Iter_value_t<_BidIt>, _Scan_decoupled_lookback>, "Bad _Get_lookback_sum");
    auto _Prev = _Current;
    --_Prev;
    auto _Prev_state = _Prev->_Get_available_state();
    typename _Iter_value_t<_BidIt>::value_type _Result(
        _Reduce_op(_Prev_state & _Sum_available ? _Prev->_Sum._Ref() : _Prev->_Local._Ref(), _Current->_Local._Ref()));
    while (!(_Prev_state & _Sum_available)) {
        --_Prev;
        _Prev_state = _Prev->_Get_available_state();
        _Result = _Reduce_op(
            _Prev_state & _Sum_available ? _Prev->_Sum._Ref() : _Prev->_Local._Ref(), _STD move(_Result));
    }

    return _Result;
}

template <class _Ty>
struct _Casty_plus { // Adds the two arguments together and casts the result back to _Ty.
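    // Illustrative note: _Get_lookback_sum above builds a _Ty from the result of _Reduce_op, so the addition used
    // for lookback counters must yield exactly _Ty; plain operator+ may promote (e.g. short + short is int), hence
    // the explicit cast back to _Ty below. For example, _Casty_plus<short>{}(2, 3) is static_cast<short>(5).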
    // pre: the result from adding the two arguments together can fit in _Ty
    _NODISCARD constexpr _Ty operator()(_Ty _Val1, _Ty _Val2) const noexcept /* terminates */ {
        return static_cast<_Ty>(_Val1 + _Val2);
    }
};

template <class _Diff>
void _Surrender_elements_to_next_chunk(const size_t _Chunk_number,
    const typename _Parallel_vector<_Scan_decoupled_lookback<_Diff>>::iterator _Chunk_lookback_data) {
    // Deals with the case in which all of the elements in the chunk corresponding to _Chunk_lookback_data will be
    // handled by the following chunk, so this chunk needs to publish its results accordingly.
    if (_Chunk_number == 0) {
        // This is the first chunk, so we can immediately publish results. No need to set
        // _Chunk_lookback_data->_Local._Ref() to be 0, since chunk 0 has no predecessors and its local and total sums
        // are the same. _Chunk_lookback_data->_Sum is already 0, so we can just publish results immediately.
        _Chunk_lookback_data->_Store_available_state(_Sum_available);
        return;
    }

    // We need to pass the previous chunk's sum to the right.
    // _Chunk_lookback_data->_Local is already 0, so we can just publish results immediately.
    _Chunk_lookback_data->_Store_available_state(_Local_available);
    const auto _Prev_chunk_lookback_data = _Prev_iter(_Chunk_lookback_data);
    _Diff _Prev_chunk_sum;
    if (_Prev_chunk_lookback_data->_Get_available_state() & _Sum_available) {
        _Prev_chunk_sum = _Prev_chunk_lookback_data->_Sum._Ref();
    } else {
        // Note that we can use _Casty_plus because _Diff is defined as _Common_diff<..., _RanIt3> and the maximum
        // value that will be placed in _Lookback by adding two of the previous sums together is the total number of
        // elements in the result. Assuming that _Dest has enough space for the result, the value produced by adding
        // two previous sums should fit inside _Diff.
        _Prev_chunk_sum = _Get_lookback_sum(_Prev_chunk_lookback_data, _Casty_plus<_Diff>{});
    }

    _Chunk_lookback_data->_Sum._Ref() = _Prev_chunk_sum;
    _Chunk_lookback_data->_Store_available_state(_Sum_available);
}

template <class _RanIt1, class _RanIt2, class _RanIt3>
void _Place_elements_from_indices(
    const _RanIt1 _First, _RanIt2 _Dest, _RanIt3 _Indices_first, const ptrdiff_t _Num_results) {
    // Places _Num_results elements at indices in _Indices_first from the range indicated by _First into _Dest.
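    // Illustrative sketch with hypothetical values: if the range at _First is {10, 20, 30, 40}, _Indices_first
    // designates the indices {1, 3}, and _Num_results == 2, then {20, 40} is written to _Dest.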
const auto _Last_index = _Indices_first + _Num_results; for (; _Indices_first != _Last_index; ++_Indices_first) { const auto _Curr_index = *_Indices_first; *_Dest = *(_First + static_cast<_Iter_diff_t<_RanIt1>>(_Curr_index)); ++_Dest; } } template struct _Static_partitioned_set_subtraction { using _Diff = _Common_diff_t<_RanIt1, _RanIt2, _RanIt3>; _Static_partition_team<_Diff> _Team; _Static_partition_range<_RanIt1, _Diff> _Basis; _Parallel_vector<_Diff> _Index_indicator; // buffer used to store information about indices in Range 1 _Iterator_range<_RanIt2> _Range2; _RanIt3 _Dest; _Parallel_vector<_Scan_decoupled_lookback<_Diff>> _Lookback; // the "Single-pass Parallel Prefix Scan with // Decoupled Look-back" is used here to track // information about how many elements were placed // in _Dest by preceding chunks _Pr _Pred; _SetOper _Set_oper_per_chunk; _Static_partitioned_set_subtraction(const size_t _Hw_threads, const _Diff _Count, _RanIt1 _First1, _RanIt2 _First2, const _RanIt2 _Last2, _RanIt3 _Dest_, _Pr _Pred_, _SetOper _Set_oper) : _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Basis{}, _Index_indicator(static_cast(_Count)), _Range2{_First2, _Last2}, _Dest(_Dest_), _Lookback(_Team._Chunks), _Pred(_Pred_), _Set_oper_per_chunk(_Set_oper) { _Basis._Populate(_Team, _First1); } _Cancellation_status _Process_chunk() { const auto _Key = _Team._Get_next_key(); if (!_Key) { return _Cancellation_status::_Canceled; } const auto _Chunk_number = _Key._Chunk_number; const auto _Chunk_lookback_data = _Lookback.begin() + static_cast(_Chunk_number); // iterators for the actual beginning of this chunk's range in _Range1 (after adjustments below) auto [_Range1_chunk_first, _Range1_chunk_last] = _Basis._Get_chunk(_Key); const bool _Last_chunk = _Chunk_number == _Team._Chunks - 1; // Get appropriate range for _Range1. // We don't want any spans of equal elements to reach across chunk boundaries. if (!_Last_chunk) { // Slide _Range1_chunk_last to the left so that there are no copies of *_Range1_chunk_last in _Range1_chunk. // Note that we know that this chunk is not the last, so we can look at the element at _Range1_chunk_last. _Range1_chunk_last = _STD lower_bound(_Range1_chunk_first, _Range1_chunk_last, *_Range1_chunk_last, _Pred); if (_Range1_chunk_last <= _Range1_chunk_first) { // All of the elements in _Range1's chunk are equal to the element at _Range1_chunk_last, so they will // be handled by the next chunk. _Surrender_elements_to_next_chunk<_Diff>(_Chunk_number, _Chunk_lookback_data); return _Cancellation_status::_Running; } } // Slide _Range1_chunk_first to the left so that all copies of *_Range1_chunk_first are in this chunk // of Range 1. _Range1_chunk_first = _STD lower_bound(_Basis._Start_at, _Range1_chunk_first, *_Range1_chunk_first, _Pred); // Get chunk in _Range2 that corresponds to our current chunk from _Range1 auto _Range2_chunk_first = _STD lower_bound(_Range2._First, _Range2._Last, *_Range1_chunk_first, _Pred); auto _Range2_chunk_last = _STD upper_bound(_Range2_chunk_first, _Range2._Last, *_Prev_iter(_Range1_chunk_last), _Pred); // Publish results to rest of chunks. if (_Chunk_number == 0) { // Chunk 0 is special as it has no predecessor; // its local and total sums are the same and we can immediately put its results in _Dest. 
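            // Illustrative note: for chunk 0, _Update_dest both computes the set operation and writes straight to
            // _Dest, so only the total (_Sum) needs to be published; later chunks must first learn how many elements
            // their predecessors produced before they know where their own output begins.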
const auto _Num_results = _Set_oper_per_chunk._Update_dest( _Range1_chunk_first, _Range1_chunk_last, _Range2_chunk_first, _Range2_chunk_last, _Dest, _Pred); _Chunk_lookback_data->_Sum._Ref() = _Num_results; _Chunk_lookback_data->_Store_available_state(_Sum_available); return _Cancellation_status::_Running; } const auto _Prev_chunk_lookback_data = _Prev_iter(_Chunk_lookback_data); if (_Prev_chunk_lookback_data->_State.load() & _Sum_available) { // If the predecessor sum is already complete, we can incorporate its value directly for 1 pass. const auto _Prev_chunk_sum = _Prev_chunk_lookback_data->_Sum._Ref(); auto _Chunk_specific_dest = _Dest + static_cast<_Iter_diff_t<_RanIt3>>(_Prev_chunk_sum); const auto _Num_results = _Set_oper_per_chunk._Update_dest(_Range1_chunk_first, _Range1_chunk_last, _Range2_chunk_first, _Range2_chunk_last, _Chunk_specific_dest, _Pred); _Chunk_lookback_data->_Sum._Ref() = static_cast<_Diff>(_Num_results + _Prev_chunk_sum); _Chunk_lookback_data->_Store_available_state(_Sum_available); return _Cancellation_status::_Running; } // Get range we can use for this chunk of range 1 in the buffer. auto _Index_chunk_first = _Index_indicator.begin() + static_cast(_Range1_chunk_first - _Basis._Start_at); // Determine the indices of elements that should be in the result from this chunk. const auto _Num_results = _Set_oper_per_chunk._Mark_indices(_Range1_chunk_first, _Range1_chunk_last, _Range2_chunk_first, _Range2_chunk_last, _Index_chunk_first, _Pred); _Chunk_lookback_data->_Local._Ref() = _Num_results; _Chunk_lookback_data->_Store_available_state(_Local_available); // Apply the predecessor overall sum to current overall sum and elements. _Diff _Prev_chunk_sum; if (_Prev_chunk_lookback_data->_Get_available_state() & _Sum_available) { // Predecessor overall sum is done, use directly. _Prev_chunk_sum = _Prev_chunk_lookback_data->_Sum._Ref(); } else { _Prev_chunk_sum = _Get_lookback_sum(_Prev_chunk_lookback_data, _Casty_plus<_Diff>{}); } _Chunk_lookback_data->_Sum._Ref() = static_cast<_Diff>(_Num_results + _Prev_chunk_sum); _Chunk_lookback_data->_Store_available_state(_Sum_available); // Place elements from _Range1 in _Dest according to the offsets previously calculated. auto _Chunk_specific_dest = _Dest + static_cast<_Iter_diff_t<_RanIt3>>(_Prev_chunk_sum); _Place_elements_from_indices( _Range1_chunk_first, _Chunk_specific_dest, _Index_chunk_first, static_cast(_Num_results)); return _Cancellation_status::_Running; } static void __stdcall _Threadpool_callback( __std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ { _Run_available_chunked_work(*static_cast<_Static_partitioned_set_subtraction*>(_Context)); } }; struct _Set_intersection_per_chunk { template _Common_diff_t<_RanIt1, _RanIt2, _RanIt3> _Update_dest( _RanIt1 _First1, const _RanIt1 _Last1, _RanIt2 _First2, const _RanIt2 _Last2, _RanIt3 _Dest, _Pr _Pred) { // Copy elements from [_First, _Last1) that are also present in [_First2, _Last2) according to _Pred, to // _Dest. Returns the number of elements stored. return _STD set_intersection(_First1, _Last1, _First2, _Last2, _Dest, _Pred) - _Dest; } template _Iter_value_t<_BidIt> _Mark_indices(_RanIt1 _First1, const _RanIt1 _Last1, _RanIt2 _First2, const _RanIt2 _Last2, _BidIt _Index_chunk_first, _Pr _Pred) { // Stores offsets of elements in [_First1, _Last1) that are also present in [_First2, _Last2) according to // _Pred, to _Index_chunk_first. Returns the number of offsets stored. 
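        // Illustrative sketch with hypothetical values: for a chunk of range 1 == {1, 2, 4, 6} and the matching
        // span of range 2 == {2, 3, 6}, the offsets {1, 3} (the positions of 2 and 6 in range 1) are stored and
        // 2 is returned.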
        static_assert(is_integral_v<_Iter_value_t<_BidIt>>);
        static_assert(
            is_same_v<_Iter_value_t<_BidIt>, common_type_t<_Iter_value_t<_BidIt>, _Common_diff_t<_RanIt1, _RanIt2>>>);
        _DEBUG_ORDER_SET_UNWRAPPED(_RanIt2, _First1, _Last1, _Pred);
        _DEBUG_ORDER_SET_UNWRAPPED(_RanIt1, _First2, _Last2, _Pred);
        const auto _Index_chunk_first_save = _Index_chunk_first;
        _Iter_diff_t<_RanIt1> _Curr_range1_index = 0;
        _Iter_diff_t<_RanIt2> _Curr_range2_index = 0;
        const auto _Range1_dist = _Last1 - _First1;
        const auto _Range2_dist = _Last2 - _First2;
        while (_Curr_range1_index < _Range1_dist && _Curr_range2_index < _Range2_dist) {
            if (_DEBUG_LT_PRED(_Pred, *(_First1 + _Curr_range1_index), *(_First2 + _Curr_range2_index))) {
                ++_Curr_range1_index;
            } else {
                if (!_Pred(*(_First2 + _Curr_range2_index), *(_First1 + _Curr_range1_index))) {
                    *_Index_chunk_first = static_cast<_Iter_value_t<_BidIt>>(_Curr_range1_index);
                    ++_Index_chunk_first;
                    ++_Curr_range1_index;
                }

                ++_Curr_range2_index;
            }
        }

        return static_cast<_Iter_value_t<_BidIt>>(_Index_chunk_first - _Index_chunk_first_save);
    }
};

template <class _ExPo, class _FwdIt1, class _FwdIt2, class _FwdIt3, class _Pr,
    _Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_FwdIt3 set_intersection(_ExPo&&, _FwdIt1 _First1, _FwdIt1 _Last1, _FwdIt2 _First2, _FwdIt2 _Last2, _FwdIt3 _Dest,
    _Pr _Pred) noexcept /* terminates */ {
    // AND sets [_First1, _Last1) and [_First2, _Last2), using _Pred
    _REQUIRE_PARALLEL_ITERATOR(_FwdIt1);
    _REQUIRE_PARALLEL_ITERATOR(_FwdIt2);
    _REQUIRE_PARALLEL_ITERATOR(_FwdIt3);
    _Adl_verify_range(_First1, _Last1);
    _Adl_verify_range(_First2, _Last2);
    auto _UFirst1 = _Get_unwrapped(_First1);
    const auto _ULast1 = _Get_unwrapped(_Last1);
    auto _UFirst2 = _Get_unwrapped(_First2);
    const auto _ULast2 = _Get_unwrapped(_Last2);
    auto _UDest = _Get_unwrapped_unverified(_Dest);
    using _Diff = _Common_diff_t<_FwdIt1, _FwdIt2, _FwdIt3>;
    if constexpr (remove_reference_t<_ExPo>::_Parallelize && _Is_random_iter_v<_FwdIt1> && _Is_random_iter_v<_FwdIt2>
                  && _Is_random_iter_v<_FwdIt3>) {
        // only parallelize if desired, and all of the iterators given are random access
        const size_t _Hw_threads = __std_parallel_algorithms_hw_threads();
        if (_Hw_threads > 1) { // parallelize on multiprocessor machines
            const _Diff _Count1 = _ULast1 - _UFirst1;
            const _Diff _Count2 = _ULast2 - _UFirst2;
            if (_Count1 >= 2 && _Count2 >= 2) { // ... with each range containing at least 2 elements
                _TRY_BEGIN
                _Static_partitioned_set_subtraction _Operation(_Hw_threads, _Count1, _UFirst1, _UFirst2, _ULast2,
                    _UDest, _Pass_fn(_Pred), _Set_intersection_per_chunk());
                _Run_chunked_parallel_work(_Hw_threads, _Operation);
                _UDest += static_cast<_Iter_diff_t<_FwdIt3>>(_Operation._Lookback.back()._Sum._Ref());
                _Seek_wrapped(_Dest, _UDest);
                return _Dest;
                _CATCH(const _Parallelism_resources_exhausted&)
                // fall through to serial case below
                _CATCH_END
            }
        }
    }

    _Seek_wrapped(_Dest, _STD set_intersection(_UFirst1, _ULast1, _UFirst2, _ULast2, _UDest, _Pass_fn(_Pred)));
    return _Dest;
}

// PARALLEL FUNCTION TEMPLATE set_difference
struct _Set_difference_per_chunk {
    template <class _RanIt1, class _RanIt2, class _RanIt3, class _Pr>
    _Common_diff_t<_RanIt1, _RanIt2, _RanIt3> _Update_dest(
        _RanIt1 _First1, const _RanIt1 _Last1, _RanIt2 _First2, _RanIt2 _Last2, _RanIt3 _Dest, _Pr _Pred) {
        // Copy elements from [_First1, _Last1), except those in [_First2, _Last2) according to _Pred, to _Dest.
        // Returns the number of elements stored.
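        // Illustrative sketch with hypothetical values: for a chunk of range 1 == {1, 2, 4, 6} and the matching
        // span of range 2 == {2, 3, 6}, the elements {1, 4} are copied to _Dest and 2 is returned.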
return _STD set_difference(_First1, _Last1, _First2, _Last2, _Dest, _Pred) - _Dest; } template _Iter_value_t<_BidIt> _Mark_indices(_RanIt1 _First1, const _RanIt1 _Last1, _RanIt2 _First2, const _RanIt2 _Last2, _BidIt _Index_chunk_first, _Pr _Pred) { // Stores offsets of elements in [_First1, _Last1), except those in [_First2, _Last2) according to _Pred, to // _Index_chunk_first. Returns the number of offsets stored. static_assert(is_integral_v<_Iter_value_t<_BidIt>>); static_assert( is_same_v<_Iter_value_t<_BidIt>, common_type_t<_Iter_value_t<_BidIt>, _Common_diff_t<_RanIt1, _RanIt2>>>); _DEBUG_ORDER_SET_UNWRAPPED(_RanIt2, _First1, _Last1, _Pred); _DEBUG_ORDER_SET_UNWRAPPED(_RanIt1, _First2, _Last2, _Pred); const auto _Index_chunk_first_save = _Index_chunk_first; _Iter_diff_t<_RanIt1> _Curr_range1_index = 0; _Iter_diff_t<_RanIt2> _Curr_range2_index = 0; const auto _Range1_dist = _Last1 - _First1; const auto _Range2_dist = _Last2 - _First2; while (_Curr_range1_index < _Range1_dist && _Curr_range2_index < _Range2_dist) { if (_DEBUG_LT_PRED(_Pred, *(_First1 + _Curr_range1_index), *(_First2 + _Curr_range2_index))) { *_Index_chunk_first = static_cast<_Iter_value_t<_BidIt>>(_Curr_range1_index); ++_Index_chunk_first; ++_Curr_range1_index; } else { if (!_Pred(*(_First2 + _Curr_range2_index), *(_First1 + _Curr_range1_index))) { ++_Curr_range1_index; } ++_Curr_range2_index; } } // If we haven't traversed all of range 1 yet, we want to include the rest of it in the results. for (; _Curr_range1_index < _Range1_dist; ++_Curr_range1_index) { *_Index_chunk_first = static_cast<_Iter_value_t<_BidIt>>(_Curr_range1_index); ++_Index_chunk_first; } return static_cast<_Iter_value_t<_BidIt>>(_Index_chunk_first - _Index_chunk_first_save); } }; template /* = 0 */> _FwdIt3 set_difference(_ExPo&&, _FwdIt1 _First1, _FwdIt1 _Last1, _FwdIt2 _First2, _FwdIt2 _Last2, _FwdIt3 _Dest, _Pr _Pred) noexcept /* terminates */ { // take set [_First2, _Last2) from [_First1, _Last1), using _Pred _REQUIRE_PARALLEL_ITERATOR(_FwdIt1); _REQUIRE_PARALLEL_ITERATOR(_FwdIt2); _REQUIRE_PARALLEL_ITERATOR(_FwdIt3); _Adl_verify_range(_First1, _Last1); _Adl_verify_range(_First2, _Last2); auto _UFirst1 = _Get_unwrapped(_First1); const auto _ULast1 = _Get_unwrapped(_Last1); auto _UFirst2 = _Get_unwrapped(_First2); const auto _ULast2 = _Get_unwrapped(_Last2); auto _UDest = _Get_unwrapped_unverified(_Dest); using _Diff = _Common_diff_t<_FwdIt1, _FwdIt2, _FwdIt3>; if constexpr (remove_reference_t<_ExPo>::_Parallelize && _Is_random_iter_v<_FwdIt1> && _Is_random_iter_v<_FwdIt2> && _Is_random_iter_v<_FwdIt3>) { // only parallelize if desired, and all of the iterators given are random access const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); if (_Hw_threads > 1) { // parallelize on multiprocessor machines const _Diff _Count = _ULast1 - _UFirst1; if (_Count >= 2) { // ... 
with at least 2 elements in [_First1, _Last1) _TRY_BEGIN _Static_partitioned_set_subtraction _Operation(_Hw_threads, _Count, _UFirst1, _UFirst2, _ULast2, _UDest, _Pass_fn(_Pred), _Set_difference_per_chunk()); _Run_chunked_parallel_work(_Hw_threads, _Operation); _UDest += static_cast<_Iter_diff_t<_FwdIt3>>(_Operation._Lookback.back()._Sum._Ref()); _Seek_wrapped(_Dest, _UDest); return _Dest; _CATCH(const _Parallelism_resources_exhausted&) // fall through to serial case below _CATCH_END } } } _Seek_wrapped(_Dest, _STD set_difference(_UFirst1, _ULast1, _UFirst2, _ULast2, _UDest, _Pass_fn(_Pred))); return _Dest; } // PARALLEL FUNCTION TEMPLATE reduce template _Ty _Reduce_move_unchecked(_InIt _First, const _InIt _Last, _Ty _Val, _BinOp _Reduce_op) { // return reduction, choose optimization if constexpr (_Plus_on_arithmetic_ranges_reduction_v<_Unwrapped_t, _Ty, _BinOp>) { return _Reduce_plus_arithmetic_ranges(_First, _Last, _Val); } else { for (; _First != _Last; ++_First) { _Val = _Reduce_op(_STD move(_Val), _STD move(*_First)); // Requirement missing from N4713 } return _Val; } } template _Ty _Reduce_at_least_two(const _FwdIt _First, const _FwdIt _Last, _BinOp _Reduce_op) { // return reduction with no initial value // pre: distance(_First, _Last) >= 2 if constexpr (_Plus_on_arithmetic_ranges_reduction_v<_FwdIt, _Ty, _BinOp>) { return _Reduce_plus_arithmetic_ranges(_First, _Last, _Ty{0}); } else { auto _Next = _First; _Ty _Val = _Reduce_op(*_First, *++_Next); while (++_Next != _Last) { _Val = _Reduce_op(_STD move(_Val), *_Next); // Requirement missing from N4713 } return _Val; } } template struct _Static_partitioned_reduce2 { // reduction task scheduled on the system thread pool _Static_partition_team<_Iter_diff_t<_FwdIt>> _Team; _Static_partition_range<_FwdIt> _Basis; _BinOp _Reduce_op; _Generalized_sum_drop<_Ty> _Results; _Static_partitioned_reduce2( const _Iter_diff_t<_FwdIt> _Count, const size_t _Chunks, const _FwdIt _First, _BinOp _Reduce_op_) : _Team{_Count, _Chunks}, _Basis{}, _Reduce_op(_Reduce_op_), _Results{_Team._Chunks} { _Basis._Populate(_Team, _First); } static void __stdcall _Threadpool_callback( __std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ { const auto _This = static_cast<_Static_partitioned_reduce2*>(_Context); auto _Key = _This->_Team._Get_next_key(); if (_Key) { auto _Chunk = _This->_Basis._Get_chunk(_Key); auto _Local_result = _Reduce_at_least_two<_Ty>(_Chunk._First, _Chunk._Last, _This->_Reduce_op); while ((_Key = _This->_Team._Get_next_key())) { _Chunk = _This->_Basis._Get_chunk(_Key); _Local_result = _STD reduce(_Chunk._First, _Chunk._Last, _STD move(_Local_result), _This->_Reduce_op); } _This->_Results._Add_result(_STD move(_Local_result)); } } }; template /* = 0 */> _NODISCARD _Ty reduce(_ExPo&&, const _FwdIt _First, const _FwdIt _Last, _Ty _Val, _BinOp _Reduce_op) noexcept /* terminates */ { // return commutative and associative reduction of _Val and [_First, _Last), using _Reduce_op _REQUIRE_PARALLEL_ITERATOR(_FwdIt); _Adl_verify_range(_First, _Last); auto _UFirst = _Get_unwrapped(_First); const auto _ULast = _Get_unwrapped(_Last); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); if (_Hw_threads > 1) { // parallelize on multiprocessor machines... 
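            // Illustrative usage sketch (hypothetical caller code):
            //     std::vector<double> _Vals(1'000'000, 1.0);
            //     double _Total = std::reduce(std::execution::par, _Vals.begin(), _Vals.end(), 0.0);
            // Because chunks are combined in an unspecified order, _Reduce_op must be associative and commutative;
            // floating-point results may differ slightly from a serial left-to-right accumulation.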
const auto _Count = _STD distance(_UFirst, _ULast); const auto _Chunks = _Get_least2_chunked_work_chunk_count(_Hw_threads, _Count); if (_Chunks > 1) { _TRY_BEGIN auto _Passed_fn = _Pass_fn(_Reduce_op); _Static_partitioned_reduce2 _Operation{ _Count, _Chunks, _UFirst, _Passed_fn}; { // we don't use _Run_chunked_parallel_work here because the initial value on background threads // is synthesized from the input, but on this thread the initial value is _Val const _Work_ptr _Work{_Operation}; // setup complete, hereafter nothrow or terminate _Work._Submit_for_chunks(_Hw_threads, _Chunks); while (const auto _Stolen_key = _Operation._Team._Get_next_key()) { auto _Chunk = _Operation._Basis._Get_chunk(_Stolen_key); _Val = _STD reduce(_Chunk._First, _Chunk._Last, _STD move(_Val), _Pass_fn(_Reduce_op)); } } // join with _Work_ptr threads auto& _Results = _Operation._Results; return _Reduce_move_unchecked(_Results.begin(), _Results.end(), _STD move(_Val), _Pass_fn(_Reduce_op)); _CATCH(const _Parallelism_resources_exhausted&) // fall through to serial case below _CATCH_END } } } return _STD reduce(_UFirst, _ULast, _STD move(_Val), _Pass_fn(_Reduce_op)); } // PARALLEL FUNCTION TEMPLATE transform_reduce template struct _Static_partitioned_transform_reduce_binary2 { // transform-reduction task scheduled on the system thread pool using _Diff = _Common_diff_t<_FwdIt1, _FwdIt2>; _Static_partition_team<_Diff> _Team; _Static_partition_range<_FwdIt1, _Diff> _Basis1; _Static_partition_range<_FwdIt2, _Diff> _Basis2; _BinOp1 _Reduce_op; _BinOp2 _Transform_op; _Generalized_sum_drop<_Ty> _Results; _Static_partitioned_transform_reduce_binary2(const _Diff _Count, const size_t _Chunks, const _FwdIt1 _First1, const _FwdIt2 _First2, _BinOp1 _Reduce_op_, _BinOp2 _Transform_op_) : _Team{_Count, _Chunks}, _Basis1{}, _Basis2{}, _Reduce_op(_Reduce_op_), _Transform_op(_Transform_op_), _Results{_Chunks} { _Basis1._Populate(_Team, _First1); _Basis2._Populate(_Team, _First2); } static void __stdcall _Threadpool_callback( __std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ { const auto _This = static_cast<_Static_partitioned_transform_reduce_binary2*>(_Context); auto _Key = _This->_Team._Get_next_key(); if (_Key) { auto _Reduce_op = _This->_Reduce_op; auto _Transform_op = _This->_Transform_op; auto _Chunk1 = _This->_Basis1._Get_chunk(_Key); auto _First2 = _This->_Basis2._Get_first(_Key._Chunk_number, _This->_Team._Get_chunk_offset(_Key._Chunk_number)); auto _Next1 = _Chunk1._First; auto _Next2 = _First2; // Requirement missing from N4713: _Ty _Val = _Reduce_op(_Transform_op(*_Chunk1._First, *_First2), _Transform_op(*++_Next1, *++_Next2)); while (++_Next1 != _Chunk1._Last) { // Requirement missing from N4713: _Val = _Reduce_op(_STD move(_Val), _Transform_op(*_Next1, *++_Next2)); } while ((_Key = _This->_Team._Get_next_key())) { _Chunk1 = _This->_Basis1._Get_chunk(_Key); _First2 = _This->_Basis2._Get_first(_Key._Chunk_number, _This->_Team._Get_chunk_offset(_Key._Chunk_number)); _Next1 = _Chunk1._First; _Next2 = _First2; for (; _Next1 != _Chunk1._Last; ++_Next1, (void) ++_Next2) { // Requirement missing from N4713: _Val = _Reduce_op(_STD move(_Val), _Transform_op(*_Next1, *_Next2)); } } _This->_Results._Add_result(_STD move(_Val)); } } }; #pragma warning(push) #pragma warning(disable : 4868) // compiler may not enforce left-to-right evaluation order // in braced initializer list (/Wall) template /* = 0 */> _NODISCARD _Ty transform_reduce(_ExPo&&, _FwdIt1 _First1, _FwdIt1 _Last1, _FwdIt2 
_First2, _Ty _Val, _BinOp1 _Reduce_op, _BinOp2 _Transform_op) noexcept /* terminates */ { // return commutative and associative transform-reduction of sequences, using _Reduce_op and _Transform_op _REQUIRE_PARALLEL_ITERATOR(_FwdIt1); _REQUIRE_PARALLEL_ITERATOR(_FwdIt2); _Adl_verify_range(_First1, _Last1); auto _UFirst1 = _Get_unwrapped(_First1); const auto _ULast1 = _Get_unwrapped(_Last1); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); if (_Hw_threads > 1) { // parallelize on multiprocessor machines... const auto _Count = _STD distance(_UFirst1, _ULast1); auto _UFirst2 = _Get_unwrapped_n(_First2, _Count); const auto _Chunks = _Get_least2_chunked_work_chunk_count(_Hw_threads, _Count); if (_Chunks > 1) { _TRY_BEGIN auto _Passed_reduce = _Pass_fn(_Reduce_op); auto _Passed_transform = _Pass_fn(_Transform_op); _Static_partitioned_transform_reduce_binary2 _Operation{_Count, _Chunks, _UFirst1, _UFirst2, _Passed_reduce, _Passed_transform}; { // ditto no _Run_chunked_parallel_work for the same reason as reduce const _Work_ptr _Work{_Operation}; // setup complete, hereafter nothrow or terminate _Work._Submit_for_chunks(_Hw_threads, _Chunks); while (const auto _Stolen_key = _Operation._Team._Get_next_key()) { const auto _Chunk_number = _Stolen_key._Chunk_number; const auto _Chunk1 = _Operation._Basis1._Get_chunk(_Stolen_key); _Val = _STD transform_reduce(_Chunk1._First, _Chunk1._Last, _Operation._Basis2._Get_first( _Chunk_number, _Operation._Team._Get_chunk_offset(_Chunk_number)), _STD move(_Val), _Pass_fn(_Reduce_op), _Pass_fn(_Transform_op)); } } // join with _Work_ptr threads auto& _Results = _Operation._Results; // note: already transformed return _Reduce_move_unchecked(_Results.begin(), _Results.end(), _STD move(_Val), _Pass_fn(_Reduce_op)); _CATCH(const _Parallelism_resources_exhausted&) // fall through to serial case below _CATCH_END } return _STD transform_reduce( _UFirst1, _ULast1, _UFirst2, _STD move(_Val), _Pass_fn(_Reduce_op), _Pass_fn(_Transform_op)); } } return _STD transform_reduce(_UFirst1, _ULast1, _Get_unwrapped_n(_First2, _Idl_distance<_FwdIt1>(_UFirst1, _ULast1)), _STD move(_Val), _Pass_fn(_Reduce_op), _Pass_fn(_Transform_op)); } #pragma warning(pop) template struct _Static_partitioned_transform_reduce2 { // transformed reduction task scheduled on the system thread pool _Static_partition_team<_Iter_diff_t<_FwdIt>> _Team; _Static_partition_range<_FwdIt> _Basis; _BinOp _Reduce_op; _UnaryOp _Transform_op; _Generalized_sum_drop<_Ty> _Results; _Static_partitioned_transform_reduce2(const _Iter_diff_t<_FwdIt> _Count, const size_t _Chunks, _FwdIt _First, _BinOp _Reduce_op_, _UnaryOp _Transform_op_) : _Team{_Count, _Chunks}, _Basis{}, _Reduce_op(_Reduce_op_), _Transform_op(_Transform_op_), _Results{_Chunks} { _Basis._Populate(_Team, _First); } static void __stdcall _Threadpool_callback( __std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ { const auto _This = static_cast<_Static_partitioned_transform_reduce2*>(_Context); auto _Key = _This->_Team._Get_next_key(); if (_Key) { auto& _Reduce_op = _This->_Reduce_op; auto& _Transform_op = _This->_Transform_op; auto _Chunk = _This->_Basis._Get_chunk(_Key); auto _Next = _Chunk._First; _Ty _Val{_Reduce_op(_Transform_op(*_Chunk._First), _Transform_op(*++_Next))}; while (++_Next != _Chunk._Last) { _Val = _Reduce_op(_STD move(_Val), _Transform_op(*_Next)); } while ((_Key = _This->_Team._Get_next_key())) { _Chunk = 
_This->_Basis._Get_chunk(_Key); _Next = _Chunk._First; for (; _Next != _Chunk._Last; ++_Next) { _Val = _Reduce_op(_STD move(_Val), _Transform_op(*_Next)); } } _This->_Results._Add_result(_STD move(_Val)); } } }; template /* = 0 */> _NODISCARD _Ty transform_reduce(_ExPo&&, const _FwdIt _First, const _FwdIt _Last, _Ty _Val, _BinOp _Reduce_op, _UnaryOp _Transform_op) noexcept /* terminates */ { // return commutative and associative reduction of transformed sequence, using _Reduce_op and _Transform_op _REQUIRE_PARALLEL_ITERATOR(_FwdIt); _Adl_verify_range(_First, _Last); auto _UFirst = _Get_unwrapped(_First); const auto _ULast = _Get_unwrapped(_Last); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); if (_Hw_threads > 1) { // parallelize on multiprocessor machines... const auto _Count = _STD distance(_UFirst, _ULast); const auto _Chunks = _Get_least2_chunked_work_chunk_count(_Hw_threads, _Count); if (_Chunks > 1) { _TRY_BEGIN auto _Passed_reduce = _Pass_fn(_Reduce_op); auto _Passed_transform = _Pass_fn(_Transform_op); _Static_partitioned_transform_reduce2 _Operation{_Count, _Chunks, _UFirst, _Passed_reduce, _Passed_transform}; { // ditto no _Run_chunked_parallel_work for the same reason as reduce const _Work_ptr _Work{_Operation}; // setup complete, hereafter nothrow or terminate _Work._Submit_for_chunks(_Hw_threads, _Chunks); while (auto _Stolen_key = _Operation._Team._Get_next_key()) { // keep processing remaining chunks to comply with N4687 [intro.progress]/14 auto _Chunk = _Operation._Basis._Get_chunk(_Stolen_key); _Val = _STD transform_reduce(_Chunk._First, _Chunk._Last, _STD move(_Val), _Pass_fn(_Reduce_op), _Pass_fn(_Transform_op)); } } // join with _Work_ptr threads auto& _Results = _Operation._Results; // note: already transformed return _Reduce_move_unchecked(_Results.begin(), _Results.end(), _STD move(_Val), _Pass_fn(_Reduce_op)); _CATCH(const _Parallelism_resources_exhausted&) // fall through to serial case below _CATCH_END } } } return _STD transform_reduce(_UFirst, _ULast, _STD move(_Val), _Pass_fn(_Reduce_op), _Pass_fn(_Transform_op)); } // PARALLEL FUNCTION TEMPLATE exclusive_scan struct _No_init_tag { explicit _No_init_tag() = default; }; // tag to indicate that no initial value is to be used template _FwdIt2 _Exclusive_scan_per_chunk(_FwdIt1 _First, const _FwdIt1 _Last, _FwdIt2 _Dest, _BinOp _Reduce_op, _Ty& _Val) { // local-sum for parallel exclusive_scan; writes local sums into [_Dest + 1, _Dest + (_Last - _First)) and stores // successor sum in _Val // pre: _Val is *uninitialized* && _First != _Last _Construct_in_place(_Val, *_First); for (;;) { ++_First; ++_Dest; if (_First == _Last) { return _Dest; } _Ty _Tmp(_Reduce_op(_Val, *_First)); // temp to enable _First == _Dest *_Dest = _Val; _Val = _STD move(_Tmp); } } template void _Exclusive_scan_per_chunk_complete( _FwdIt1 _First, const _FwdIt1 _Last, _FwdIt2 _Dest, _BinOp _Reduce_op, _Ty& _Val, _Ty& _Init) { // Sum for parallel exclusive_scan with predecessor available, into [_Dest, _Dest + (_Last - _First)) and stores // successor sum in _Val. 
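    // Illustrative sketch with hypothetical values: with _Init == 10, _Reduce_op == plus<>{}, and a chunk of
    // {1, 2, 3}, the values written are {10, 11, 13} and _Val is left holding 16, the sum handed to the next chunk.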
// Pre: _Val is *uninitialized* && _First != _Last && predecessor sum is in _Init _Construct_in_place(_Val, _Reduce_op(_Init, *_First)); *_Dest = _Init; while (++_First != _Last) { ++_Dest; _Ty _Tmp(_Reduce_op(_Val, *_First)); // temp to enable _First == _Dest *_Dest = _STD move(_Val); _Val = _STD move(_Tmp); } } template struct _Static_partitioned_exclusive_scan2 { using _Diff = _Common_diff_t<_FwdIt1, _FwdIt2>; _Static_partition_team<_Diff> _Team; _Static_partition_range<_FwdIt1, _Diff> _Basis1; _Static_partition_range<_FwdIt2, _Diff> _Basis2; _Parallel_vector<_Scan_decoupled_lookback<_Ty>> _Lookback; _Ty& _Initial; _BinOp _Reduce_op; _Static_partitioned_exclusive_scan2(const size_t _Hw_threads, const _Diff _Count, const _FwdIt1 _First, _Ty& _Initial_, _BinOp _Reduce_op_, const _FwdIt2&) : _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Basis1{}, _Basis2{}, _Lookback(_Team._Chunks), _Initial(_Initial_), _Reduce_op(_Reduce_op_) { _Basis1._Populate(_Team, _First); } _Cancellation_status _Process_chunk() { const auto _Key = _Team._Get_next_key(); if (!_Key) { return _Cancellation_status::_Canceled; } const auto _Chunk_number = _Key._Chunk_number; const auto _In_range = _Basis1._Get_chunk(_Key); const auto _Dest = _Basis2._Get_first(_Chunk_number, _Team._Get_chunk_offset(_Chunk_number)); // Run local exclusive_scan on this chunk const auto _Chunk = _Lookback.begin() + static_cast(_Chunk_number); if (_Chunk_number == 0) { // chunk 0 is special as it has no predecessor; its local and total sums are the same _Exclusive_scan_per_chunk_complete( _In_range._First, _In_range._Last, _Dest, _Reduce_op, _Chunk->_Sum._Ref(), _Initial); _Chunk->_Store_available_state(_Sum_available); return _Cancellation_status::_Running; } const auto _Prev_chunk = _Prev_iter(_Chunk); if (_Prev_chunk->_State.load() & _Sum_available) { // if predecessor sum already complete, we can incorporate its value directly for 1 pass _Exclusive_scan_per_chunk_complete( _In_range._First, _In_range._Last, _Dest, _Reduce_op, _Chunk->_Sum._Ref(), _Prev_chunk->_Sum._Ref()); _Chunk->_Store_available_state(_Sum_available); return _Cancellation_status::_Running; } // Calculate local sum and publish to other threads const auto _Last = _Exclusive_scan_per_chunk(_In_range._First, _In_range._Last, _Dest, _Reduce_op, _Chunk->_Local._Ref()); _Chunk->_Store_available_state(_Local_available); // Apply the predecessor overall sum to current overall sum and elements if (_Prev_chunk->_Get_available_state() & _Sum_available) { // predecessor overall sum done, use directly _Chunk->_Apply_exclusive_predecessor(_Prev_chunk->_Sum._Ref(), _Dest, _Last, _Reduce_op); } else { auto _Tmp = _Get_lookback_sum(_Prev_chunk, _Reduce_op); _Chunk->_Apply_exclusive_predecessor(_Tmp, _Dest, _Last, _Reduce_op); } return _Cancellation_status::_Running; } static void __stdcall _Threadpool_callback( __std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ { _Run_available_chunked_work(*static_cast<_Static_partitioned_exclusive_scan2*>(_Context)); } }; template /* = 0 */> _FwdIt2 exclusive_scan(_ExPo&&, const _FwdIt1 _First, const _FwdIt1 _Last, _FwdIt2 _Dest, _Ty _Val, _BinOp _Reduce_op) noexcept /* terminates */ { // set each value in [_Dest, _Dest + (_Last - _First)) to the associative reduction of predecessors and _Val _REQUIRE_PARALLEL_ITERATOR(_FwdIt1); _REQUIRE_PARALLEL_ITERATOR(_FwdIt2); _Adl_verify_range(_First, _Last); const auto _UFirst = _Get_unwrapped(_First); const auto _ULast = _Get_unwrapped(_Last); if 
constexpr (remove_reference_t<_ExPo>::_Parallelize) { const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); if (_Hw_threads > 1) { // parallelize on multiprocessor machines const auto _Count = _STD distance(_UFirst, _ULast); const auto _UDest = _Get_unwrapped_n(_Dest, _Count); if (_Count >= 2) { // ... with at least 2 elements _TRY_BEGIN _Static_partitioned_exclusive_scan2 _Operation{ _Hw_threads, _Count, _UFirst, _Val, _Pass_fn(_Reduce_op), _UDest}; _Seek_wrapped(_Dest, _Operation._Basis2._Populate(_Operation._Team, _UDest)); // Note that _Val is used as temporary storage by whichever thread runs the first chunk. // If any thread starts any chunk, initialization is complete, so we can't enter the // catch or serial fallback below, so that's OK. _Run_chunked_parallel_work(_Hw_threads, _Operation); return _Dest; _CATCH(const _Parallelism_resources_exhausted&) // fall through to serial case below _CATCH_END } _Seek_wrapped(_Dest, _STD exclusive_scan(_UFirst, _ULast, _UDest, _STD move(_Val), _Pass_fn(_Reduce_op))); return _Dest; } } _Seek_wrapped( _Dest, _STD exclusive_scan(_UFirst, _ULast, _Get_unwrapped_n(_Dest, _Idl_distance<_FwdIt1>(_UFirst, _ULast)), _STD move(_Val), _Pass_fn(_Reduce_op))); return _Dest; } // PARALLEL FUNCTION TEMPLATE inclusive_scan template _FwdIt2 _Inclusive_scan_per_chunk( _FwdIt1 _First, const _FwdIt1 _Last, _FwdIt2 _Dest, _BinOp _Reduce_op, _Ty& _Val, _No_init_tag) { // Local-sum for parallel inclusive_scan; writes local inclusive prefix sums into _Dest and stores overall sum in // _Val. // pre: _Val is *uninitialized* && _First != _Last _Construct_in_place(_Val, *_First); for (;;) { *_Dest = _Val; ++_Dest; ++_First; if (_First == _Last) { return _Dest; } _Val = _Reduce_op(_STD move(_Val), *_First); } } template >>, int> = 0> _FwdIt2 _Inclusive_scan_per_chunk( _FwdIt1 _First, const _FwdIt1 _Last, _FwdIt2 _Dest, _BinOp _Reduce_op, _Ty& _Val, _Ty_fwd&& _Predecessor) { // local-sum for parallel inclusive_scan; writes local inclusive prefix sums into _Dest and stores overall sum in // _Val. 
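    // Illustrative sketch with hypothetical values: with _Predecessor == 10, _Reduce_op == plus<>{}, and a chunk of
    // {1, 2, 3}, the values written are {11, 13, 16} and _Val is left holding 16.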
// pre: _Val is *uninitialized* && _First != _Last _Construct_in_place(_Val, _Reduce_op(_STD forward<_Ty_fwd>(_Predecessor), *_First)); for (;;) { *_Dest = _Val; ++_Dest; ++_First; if (_First == _Last) { return _Dest; } _Val = _Reduce_op(_STD move(_Val), *_First); } } template struct _Static_partitioned_inclusive_scan2 { using _Diff = _Common_diff_t<_FwdIt1, _FwdIt2>; _Static_partition_team<_Diff> _Team; _Static_partition_range<_FwdIt1, _Diff> _Basis1; _Static_partition_range<_FwdIt2, _Diff> _Basis2; _Parallel_vector<_Scan_decoupled_lookback<_Ty>> _Lookback; _BinOp _Reduce_op; _Init_ty& _Initial; _Static_partitioned_inclusive_scan2( const size_t _Hw_threads, const _Diff _Count, _BinOp _Reduce_op_, _Init_ty& _Initial_) : _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Basis1{}, _Basis2{}, _Lookback(_Team._Chunks), _Reduce_op(_Reduce_op_), _Initial(_Initial_) {} _Cancellation_status _Process_chunk() { const auto _Key = _Team._Get_next_key(); if (!_Key) { return _Cancellation_status::_Canceled; } const auto _Chunk_number = _Key._Chunk_number; const auto _In_range = _Basis1._Get_chunk(_Key); const auto _Dest = _Basis2._Get_first(_Chunk_number, _Team._Get_chunk_offset(_Chunk_number)); // Run local inclusive_scan on this chunk const auto _Chunk = _Lookback.begin() + static_cast(_Chunk_number); if (_Chunk_number == 0) { // chunk 0 is special as it has no predecessor; its local and total sums are the same _Inclusive_scan_per_chunk( _In_range._First, _In_range._Last, _Dest, _Reduce_op, _Chunk->_Sum._Ref(), _STD move(_Initial)); _Chunk->_Store_available_state(_Sum_available); return _Cancellation_status::_Running; } const auto _Prev_chunk = _Prev_iter(_Chunk); if (_Prev_chunk->_State.load() & _Sum_available) { // if predecessor sum already complete, we can incorporate its value directly for 1 pass _Inclusive_scan_per_chunk( _In_range._First, _In_range._Last, _Dest, _Reduce_op, _Chunk->_Sum._Ref(), _Prev_chunk->_Sum._Ref()); _Chunk->_Store_available_state(_Sum_available); return _Cancellation_status::_Running; } // Calculate local sum and publish to other threads const auto _Last = _Inclusive_scan_per_chunk( _In_range._First, _In_range._Last, _Dest, _Reduce_op, _Chunk->_Local._Ref(), _No_init_tag{}); _Chunk->_Store_available_state(_Local_available); // Apply the predecessor overall sum to current overall sum and elements if (_Prev_chunk->_Get_available_state() & _Sum_available) { // predecessor overall sum done, use directly _Chunk->_Apply_inclusive_predecessor(_Prev_chunk->_Sum._Ref(), _Dest, _Last, _Reduce_op); } else { auto _Tmp = _Get_lookback_sum(_Prev_chunk, _Reduce_op); _Chunk->_Apply_inclusive_predecessor(_Tmp, _Dest, _Last, _Reduce_op); } return _Cancellation_status::_Running; } static void __stdcall _Threadpool_callback( __std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ { _Run_available_chunked_work(*static_cast<_Static_partitioned_inclusive_scan2*>(_Context)); } }; template /* = 0 */> _FwdIt2 inclusive_scan(_ExPo&&, _FwdIt1 _First, _FwdIt1 _Last, _FwdIt2 _Dest, _BinOp _Reduce_op, _Ty _Val) noexcept /* terminates */ { // compute partial noncommutative and associative reductions including _Val into _Dest, using _Reduce_op _REQUIRE_PARALLEL_ITERATOR(_FwdIt1); _REQUIRE_PARALLEL_ITERATOR(_FwdIt2); _Adl_verify_range(_First, _Last); const auto _UFirst = _Get_unwrapped(_First); const auto _ULast = _Get_unwrapped(_Last); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { const size_t _Hw_threads = 
__std_parallel_algorithms_hw_threads(); if (_Hw_threads > 1) { // parallelize on multiprocessor machines const auto _Count = _STD distance(_First, _Last); auto _UDest = _Get_unwrapped_n(_Dest, _Count); if (_Count >= 2) { // ... with at least 2 elements _TRY_BEGIN auto _Passed_op = _Pass_fn(_Reduce_op); _Static_partitioned_inclusive_scan2<_Ty, _Ty, _Unwrapped_t, decltype(_UDest), decltype(_Passed_op)> _Operation{_Hw_threads, _Count, _Passed_op, _Val}; _Operation._Basis1._Populate(_Operation._Team, _UFirst); _Seek_wrapped(_Dest, _Operation._Basis2._Populate(_Operation._Team, _UDest)); // Note that _Val is moved from by whichever thread runs the first chunk. // If any thread starts any chunk, initialization is complete, so we can't enter the // catch or serial fallback below. _Run_chunked_parallel_work(_Hw_threads, _Operation); return _Dest; _CATCH(const _Parallelism_resources_exhausted&) // fall through to serial case below _CATCH_END } _Seek_wrapped(_Dest, _STD inclusive_scan(_UFirst, _ULast, _UDest, _Pass_fn(_Reduce_op), _STD move(_Val))); return _Dest; } } _Seek_wrapped( _Dest, _STD inclusive_scan(_UFirst, _ULast, _Get_unwrapped_n(_Dest, _Idl_distance<_FwdIt1>(_UFirst, _ULast)), _Pass_fn(_Reduce_op), _STD move(_Val))); return _Dest; } template /* = 0 */> _FwdIt2 inclusive_scan(_ExPo&&, _FwdIt1 _First, _FwdIt1 _Last, _FwdIt2 _Dest, _BinOp _Reduce_op) noexcept /* terminates */ { // compute partial noncommutative and associative reductions into _Dest, using _Reduce_op _REQUIRE_PARALLEL_ITERATOR(_FwdIt1); _REQUIRE_PARALLEL_ITERATOR(_FwdIt2); _Adl_verify_range(_First, _Last); const auto _UFirst = _Get_unwrapped(_First); const auto _ULast = _Get_unwrapped(_Last); if constexpr (remove_reference_t<_ExPo>::_Parallelize) { const size_t _Hw_threads = __std_parallel_algorithms_hw_threads(); if (_Hw_threads > 1) { // parallelize on multiprocessor machines const auto _Count = _STD distance(_UFirst, _ULast); auto _UDest = _Get_unwrapped_n(_Dest, _Count); if (_Count >= 2) { // ... with at least 2 elements _TRY_BEGIN _No_init_tag _Tag; auto _Passed_op = _Pass_fn(_Reduce_op); _Static_partitioned_inclusive_scan2<_Iter_value_t<_FwdIt1>, _No_init_tag, _Unwrapped_t, decltype(_UDest), decltype(_Passed_op)> _Operation{_Hw_threads, _Count, _Passed_op, _Tag}; _Operation._Basis1._Populate(_Operation._Team, _UFirst); _Seek_wrapped(_Dest, _Operation._Basis2._Populate(_Operation._Team, _UDest)); _Run_chunked_parallel_work(_Hw_threads, _Operation); return _Dest; _CATCH(const _Parallelism_resources_exhausted&) // fall through to serial case below _CATCH_END } _Seek_wrapped(_Dest, _STD inclusive_scan(_UFirst, _ULast, _UDest, _Pass_fn(_Reduce_op))); return _Dest; } } _Seek_wrapped(_Dest, _STD inclusive_scan(_UFirst, _ULast, _Get_unwrapped_n(_Dest, _Idl_distance<_FwdIt1>(_UFirst, _ULast)), _Pass_fn(_Reduce_op))); return _Dest; } // PARALLEL FUNCTION TEMPLATE transform_exclusive_scan template _FwdIt2 _Transform_exclusive_scan_per_chunk( _FwdIt1 _First, const _FwdIt1 _Last, _FwdIt2 _Dest, _BinOp _Reduce_op, _UnaryOp _Transform_op, _Ty& _Val) { // Local-sum for parallel transform_exclusive_scan; writes local sums into [_Dest + 1, _Dest + (_Last - _First)) and // stores successor sum in _Val. 
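    // Illustrative sketch with hypothetical values: with _Transform_op == negate<>{}, _Reduce_op == plus<>{}, and a
    // chunk of {1, 2, 3}, the values {-1, -3} are written to [_Dest + 1, _Dest + 3) and _Val is left holding -6;
    // *_Dest itself is filled in later, once the predecessor sum is known.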
// pre: _Val is *uninitialized* && _First != _Last _Construct_in_place(_Val, _Transform_op(*_First)); for (;;) { ++_First; ++_Dest; if (_First == _Last) { return _Dest; } _Ty _Tmp(_Reduce_op(_Val, _Transform_op(*_First))); // temp to enable _First == _Dest *_Dest = _Val; _Val = _STD move(_Tmp); } } template void _Transform_exclusive_scan_per_chunk_complete(_FwdIt1 _First, const _FwdIt1 _Last, _FwdIt2 _Dest, _BinOp _Reduce_op, _UnaryOp _Transform_op, _Ty& _Val, _Ty& _Init) { // Sum for parallel transform_exclusive_scan with predecessor available, into [_Dest, _Dest + (_Last - _First)) and // stores successor sum in _Val. // pre: _Val is *uninitialized* && _First != _Last && predecessor sum is in _Init _Construct_in_place(_Val, _Reduce_op(_Init, _Transform_op(*_First))); *_Dest = _Init; while (++_First != _Last) { ++_Dest; _Ty _Tmp(_Reduce_op(_Val, _Transform_op(*_First))); // temp to enable _First == _Dest *_Dest = _STD move(_Val); _Val = _STD move(_Tmp); } } template struct _Static_partitioned_transform_exclusive_scan2 { using _Diff = _Common_diff_t<_FwdIt1, _FwdIt2>; _Static_partition_team<_Diff> _Team; _Static_partition_range<_FwdIt1, _Diff> _Basis1; _Static_partition_range<_FwdIt2, _Diff> _Basis2; _Parallel_vector<_Scan_decoupled_lookback<_Ty>> _Lookback; _Ty& _Initial; _BinOp _Reduce_op; _UnaryOp _Transform_op; _Static_partitioned_transform_exclusive_scan2(const size_t _Hw_threads, const _Diff _Count, const _FwdIt1 _First, _Ty& _Initial_, _BinOp _Reduce_op_, _UnaryOp _Transform_op_, const _FwdIt2&) : _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Basis1{}, _Basis2{}, _Lookback(_Team._Chunks), _Initial(_Initial_), _Reduce_op(_Reduce_op_), _Transform_op(_Transform_op_) { _Basis1._Populate(_Team, _First); } _Cancellation_status _Process_chunk() { const auto _Key = _Team._Get_next_key(); if (!_Key) { return _Cancellation_status::_Canceled; } const auto _Chunk_number = _Key._Chunk_number; const auto _In_range = _Basis1._Get_chunk(_Key); const auto _Dest = _Basis2._Get_first(_Chunk_number, _Team._Get_chunk_offset(_Chunk_number)); // Run local transform_exclusive_scan on this chunk const auto _Chunk = _Lookback.begin() + static_cast(_Chunk_number); if (_Chunk_number == 0) { // chunk 0 is special as it has no predecessor; its local and total sums are the same _Transform_exclusive_scan_per_chunk_complete( _In_range._First, _In_range._Last, _Dest, _Reduce_op, _Transform_op, _Chunk->_Sum._Ref(), _Initial); _Chunk->_Store_available_state(_Sum_available); return _Cancellation_status::_Running; } const auto _Prev_chunk = _Prev_iter(_Chunk); if (_Prev_chunk->_State.load() & _Sum_available) { // if predecessor sum already complete, we can incorporate its value directly for 1 pass _Transform_exclusive_scan_per_chunk_complete(_In_range._First, _In_range._Last, _Dest, _Reduce_op, _Transform_op, _Chunk->_Sum._Ref(), _Prev_chunk->_Sum._Ref()); _Chunk->_Store_available_state(_Sum_available); return _Cancellation_status::_Running; } // Calculate local sum and publish to other threads const auto _Last = _Transform_exclusive_scan_per_chunk( _In_range._First, _In_range._Last, _Dest, _Reduce_op, _Transform_op, _Chunk->_Local._Ref()); _Chunk->_Store_available_state(_Local_available); // Apply the predecessor overall sum to current overall sum and elements if (_Prev_chunk->_Get_available_state() & _Sum_available) { // predecessor overall sum done, use directly _Chunk->_Apply_exclusive_predecessor(_Prev_chunk->_Sum._Ref(), _Dest, _Last, _Reduce_op); } else { auto _Tmp = 
template <class _Ty, class _FwdIt1, class _FwdIt2, class _BinOp, class _UnaryOp>
struct _Static_partitioned_transform_exclusive_scan2 {
    using _Diff = _Common_diff_t<_FwdIt1, _FwdIt2>;
    _Static_partition_team<_Diff> _Team;
    _Static_partition_range<_FwdIt1, _Diff> _Basis1;
    _Static_partition_range<_FwdIt2, _Diff> _Basis2;
    _Parallel_vector<_Scan_decoupled_lookback<_Ty>> _Lookback;
    _Ty& _Initial;
    _BinOp _Reduce_op;
    _UnaryOp _Transform_op;

    _Static_partitioned_transform_exclusive_scan2(const size_t _Hw_threads, const _Diff _Count, const _FwdIt1 _First,
        _Ty& _Initial_, _BinOp _Reduce_op_, _UnaryOp _Transform_op_, const _FwdIt2&)
        : _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Basis1{}, _Basis2{},
          _Lookback(_Team._Chunks), _Initial(_Initial_), _Reduce_op(_Reduce_op_), _Transform_op(_Transform_op_) {
        _Basis1._Populate(_Team, _First);
    }

    _Cancellation_status _Process_chunk() {
        const auto _Key = _Team._Get_next_key();
        if (!_Key) {
            return _Cancellation_status::_Canceled;
        }

        const auto _Chunk_number = _Key._Chunk_number;
        const auto _In_range = _Basis1._Get_chunk(_Key);
        const auto _Dest = _Basis2._Get_first(_Chunk_number, _Team._Get_chunk_offset(_Chunk_number));

        // Run local transform_exclusive_scan on this chunk
        const auto _Chunk = _Lookback.begin() + static_cast<ptrdiff_t>(_Chunk_number);
        if (_Chunk_number == 0) {
            // chunk 0 is special as it has no predecessor; its local and total sums are the same
            _Transform_exclusive_scan_per_chunk_complete(
                _In_range._First, _In_range._Last, _Dest, _Reduce_op, _Transform_op, _Chunk->_Sum._Ref(), _Initial);
            _Chunk->_Store_available_state(_Sum_available);
            return _Cancellation_status::_Running;
        }

        const auto _Prev_chunk = _Prev_iter(_Chunk);
        if (_Prev_chunk->_State.load() & _Sum_available) {
            // if predecessor sum already complete, we can incorporate its value directly for 1 pass
            _Transform_exclusive_scan_per_chunk_complete(_In_range._First, _In_range._Last, _Dest, _Reduce_op,
                _Transform_op, _Chunk->_Sum._Ref(), _Prev_chunk->_Sum._Ref());
            _Chunk->_Store_available_state(_Sum_available);
            return _Cancellation_status::_Running;
        }

        // Calculate local sum and publish to other threads
        const auto _Last = _Transform_exclusive_scan_per_chunk(
            _In_range._First, _In_range._Last, _Dest, _Reduce_op, _Transform_op, _Chunk->_Local._Ref());
        _Chunk->_Store_available_state(_Local_available);

        // Apply the predecessor overall sum to current overall sum and elements
        if (_Prev_chunk->_Get_available_state() & _Sum_available) { // predecessor overall sum done, use directly
            _Chunk->_Apply_exclusive_predecessor(_Prev_chunk->_Sum._Ref(), _Dest, _Last, _Reduce_op);
        } else {
            auto _Tmp = _Get_lookback_sum(_Prev_chunk, _Reduce_op);
            _Chunk->_Apply_exclusive_predecessor(_Tmp, _Dest, _Last, _Reduce_op);
        }

        return _Cancellation_status::_Running;
    }

    static void __stdcall _Threadpool_callback(
        __std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ {
        _Run_available_chunked_work(*static_cast<_Static_partitioned_transform_exclusive_scan2*>(_Context));
    }
};

template <class _ExPo, class _FwdIt1, class _FwdIt2, class _Ty, class _BinOp, class _UnaryOp,
    _Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_FwdIt2 transform_exclusive_scan(_ExPo&&, const _FwdIt1 _First, const _FwdIt1 _Last, _FwdIt2 _Dest, _Ty _Val,
    _BinOp _Reduce_op, _UnaryOp _Transform_op) noexcept /* terminates */ {
    // set each value in [_Dest, _Dest + (_Last - _First)) to the associative reduction of transformed predecessors
    _REQUIRE_PARALLEL_ITERATOR(_FwdIt1);
    _REQUIRE_PARALLEL_ITERATOR(_FwdIt2);
    _Adl_verify_range(_First, _Last);
    const auto _UFirst = _Get_unwrapped(_First);
    const auto _ULast = _Get_unwrapped(_Last);
    if constexpr (remove_reference_t<_ExPo>::_Parallelize) {
        const size_t _Hw_threads = __std_parallel_algorithms_hw_threads();
        if (_Hw_threads > 1) { // parallelize on multiprocessor machines
            const auto _Count = _STD distance(_UFirst, _ULast);
            const auto _UDest = _Get_unwrapped_n(_Dest, _Count);
            if (_Count >= 2) { // ... with at least 2 elements
                _TRY_BEGIN
                _Static_partitioned_transform_exclusive_scan2 _Operation{
                    _Hw_threads, _Count, _UFirst, _Val, _Pass_fn(_Reduce_op), _Pass_fn(_Transform_op), _UDest};
                _Seek_wrapped(_Dest, _Operation._Basis2._Populate(_Operation._Team, _UDest));
                // Note that _Val is used as temporary storage by whichever thread runs the first chunk.
                // If any thread starts any chunk, initialization is complete, so we can't enter the
                // catch or serial fallback below, so that's OK.
                _Run_chunked_parallel_work(_Hw_threads, _Operation);
                return _Dest;
                _CATCH(const _Parallelism_resources_exhausted&)
                // fall through to serial case below
                _CATCH_END
            }

            _Seek_wrapped(_Dest, _STD transform_exclusive_scan(_UFirst, _ULast, _UDest, _STD move(_Val),
                                     _Pass_fn(_Reduce_op), _Pass_fn(_Transform_op)));
            return _Dest;
        }
    }

    _Seek_wrapped(_Dest,
        _STD transform_exclusive_scan(_UFirst, _ULast,
            _Get_unwrapped_n(_Dest, _Idl_distance<_FwdIt1>(_UFirst, _ULast)), _STD move(_Val), _Pass_fn(_Reduce_op),
            _Pass_fn(_Transform_op)));
    return _Dest;
}
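// Illustrative usage sketch for the overload above (not part of this header's implementation; assumes <vector>
// and <functional>):
//     std::vector<int> _In{1, 2, 3, 4};
//     std::vector<int> _Out(_In.size());
//     std::transform_exclusive_scan(std::execution::par, _In.begin(), _In.end(), _Out.begin(), 0, std::plus<>{},
//         [](int _X) { return _X * _X; });
//     // _Out == {0, 1, 5, 14}: each element is the sum of the squares of its predecessors, seeded with 0.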
// PARALLEL FUNCTION TEMPLATE transform_inclusive_scan
template <class _FwdIt1, class _FwdIt2, class _BinOp, class _UnaryOp, class _Ty>
_FwdIt2 _Transform_inclusive_scan_per_chunk(_FwdIt1 _First, const _FwdIt1 _Last, _FwdIt2 _Dest, _BinOp _Reduce_op,
    _UnaryOp _Transform_op, _Ty& _Val, _No_init_tag) {
    // Local-sum for parallel transform_inclusive_scan; writes local inclusive prefix sums into _Dest and stores
    // overall sum in _Val.
    // pre: _Val is *uninitialized* && _First != _Last
    _Construct_in_place(_Val, _Transform_op(*_First));
    for (;;) {
        *_Dest = _Val;
        ++_Dest;
        ++_First;
        if (_First == _Last) {
            return _Dest;
        }

        _Val = _Reduce_op(_STD move(_Val), _Transform_op(*_First));
    }
}

template <class _FwdIt1, class _FwdIt2, class _BinOp, class _UnaryOp, class _Ty, class _Ty_fwd,
    enable_if_t<!is_same_v<_No_init_tag, remove_const_t<remove_reference_t<_Ty_fwd>>>, int> = 0>
_FwdIt2 _Transform_inclusive_scan_per_chunk(_FwdIt1 _First, const _FwdIt1 _Last, _FwdIt2 _Dest, _BinOp _Reduce_op,
    _UnaryOp _Transform_op, _Ty& _Val, _Ty_fwd&& _Predecessor) {
    // local-sum for parallel transform_inclusive_scan; writes local inclusive prefix sums into _Dest and stores
    // overall sum in _Val
    // pre: _Val is *uninitialized* && _First != _Last
    _Construct_in_place(_Val, _Reduce_op(_STD forward<_Ty_fwd>(_Predecessor), _Transform_op(*_First)));
    for (;;) {
        *_Dest = _Val;
        ++_Dest;
        ++_First;
        if (_First == _Last) {
            return _Dest;
        }

        _Val = _Reduce_op(_STD move(_Val), _Transform_op(*_First));
    }
}
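// Illustration (informal): for a chunk whose transformed elements are t0, t1, t2, the overload taking
// _No_init_tag writes the local prefix sums {t0, t0+t1, t0+t1+t2} and publishes t0+t1+t2 as the chunk's local
// sum; once the predecessor chunk's overall sum S becomes available, _Apply_inclusive_predecessor combines S
// into each written element and into the published sum. The overload taking a predecessor value folds S in up
// front, producing {S+t0, S+t0+t1, S+t0+t1+t2} in a single pass, so chunks whose predecessors finish first
// never need a second pass over their output.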
template <class _Ty, class _Init_ty, class _FwdIt1, class _FwdIt2, class _BinOp, class _UnaryOp>
struct _Static_partitioned_transform_inclusive_scan2 {
    using _Diff = _Common_diff_t<_FwdIt1, _FwdIt2>;
    _Static_partition_team<_Diff> _Team;
    _Static_partition_range<_FwdIt1, _Diff> _Basis1;
    _Static_partition_range<_FwdIt2, _Diff> _Basis2;
    _Parallel_vector<_Scan_decoupled_lookback<_Ty>> _Lookback;
    _BinOp _Reduce_op;
    _UnaryOp _Transform_op;
    _Init_ty& _Initial;

    _Static_partitioned_transform_inclusive_scan2(
        const size_t _Hw_threads, const _Diff _Count, _BinOp _Reduce_op_, _UnaryOp _Transform_op_, _Init_ty& _Initial_)
        : _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Basis1{}, _Basis2{},
          _Lookback(_Team._Chunks), _Reduce_op(_Reduce_op_), _Transform_op(_Transform_op_), _Initial(_Initial_) {}

    _Cancellation_status _Process_chunk() {
        const auto _Key = _Team._Get_next_key();
        if (!_Key) {
            return _Cancellation_status::_Canceled;
        }

        const auto _Chunk_number = _Key._Chunk_number;
        const auto _In_range = _Basis1._Get_chunk(_Key);
        const auto _Dest = _Basis2._Get_first(_Chunk_number, _Team._Get_chunk_offset(_Chunk_number));

        // Run local transform_inclusive_scan on this chunk
        const auto _Chunk = _Lookback.begin() + static_cast<ptrdiff_t>(_Chunk_number);
        if (_Chunk_number == 0) {
            // chunk 0 is special as it has no predecessor; its local and total sums are the same
            _Transform_inclusive_scan_per_chunk(_In_range._First, _In_range._Last, _Dest, _Reduce_op, _Transform_op,
                _Chunk->_Sum._Ref(), _STD move(_Initial));
            _Chunk->_Store_available_state(_Sum_available);
            return _Cancellation_status::_Running;
        }

        const auto _Prev_chunk = _Prev_iter(_Chunk);
        if (_Prev_chunk->_State.load() & _Sum_available) {
            // if predecessor sum already complete, we can incorporate its value directly for 1 pass
            _Transform_inclusive_scan_per_chunk(_In_range._First, _In_range._Last, _Dest, _Reduce_op, _Transform_op,
                _Chunk->_Sum._Ref(), _Prev_chunk->_Sum._Ref());
            _Chunk->_Store_available_state(_Sum_available);
            return _Cancellation_status::_Running;
        }

        // Calculate local sum and publish to other threads
        const auto _Last = _Transform_inclusive_scan_per_chunk(_In_range._First, _In_range._Last, _Dest, _Reduce_op,
            _Transform_op, _Chunk->_Local._Ref(), _No_init_tag{});
        _Chunk->_Store_available_state(_Local_available);

        // Apply the predecessor overall sum to current overall sum and elements
        if (_Prev_chunk->_Get_available_state() & _Sum_available) { // predecessor overall sum done, use directly
            _Chunk->_Apply_inclusive_predecessor(_Prev_chunk->_Sum._Ref(), _Dest, _Last, _Reduce_op);
        } else {
            auto _Tmp = _Get_lookback_sum(_Prev_chunk, _Reduce_op);
            _Chunk->_Apply_inclusive_predecessor(_Tmp, _Dest, _Last, _Reduce_op);
        }

        return _Cancellation_status::_Running;
    }

    static void __stdcall _Threadpool_callback(
        __std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ {
        _Run_available_chunked_work(*static_cast<_Static_partitioned_transform_inclusive_scan2*>(_Context));
    }
};

template <class _ExPo, class _FwdIt1, class _FwdIt2, class _BinOp, class _UnaryOp, class _Ty,
    _Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_FwdIt2 transform_inclusive_scan(_ExPo&&, const _FwdIt1 _First, const _FwdIt1 _Last, _FwdIt2 _Dest, _BinOp _Reduce_op,
    _UnaryOp _Transform_op, _Ty _Val) noexcept /* terminates */ {
    // compute partial noncommutative and associative transformed reductions including _Val into _Dest
    _REQUIRE_PARALLEL_ITERATOR(_FwdIt1);
    _REQUIRE_PARALLEL_ITERATOR(_FwdIt2);
    _Adl_verify_range(_First, _Last);
    const auto _UFirst = _Get_unwrapped(_First);
    const auto _ULast = _Get_unwrapped(_Last);
    if constexpr (remove_reference_t<_ExPo>::_Parallelize) {
        const size_t _Hw_threads = __std_parallel_algorithms_hw_threads();
        if (_Hw_threads > 1) { // parallelize on multiprocessor machines
            const auto _Count = _STD distance(_UFirst, _ULast);
            auto _UDest = _Get_unwrapped_n(_Dest, _Count);
            if (_Count >= 2) { // ... with at least 2 elements
                _TRY_BEGIN
                auto _Passed_reduce = _Pass_fn(_Reduce_op);
                auto _Passed_transform = _Pass_fn(_Transform_op);
                _Static_partitioned_transform_inclusive_scan2<_Ty, _Ty, _Unwrapped_t<_FwdIt1>, decltype(_UDest),
                    decltype(_Passed_reduce), decltype(_Passed_transform)>
                    _Operation{_Hw_threads, _Count, _Passed_reduce, _Passed_transform, _Val};
                _Operation._Basis1._Populate(_Operation._Team, _UFirst);
                _Seek_wrapped(_Dest, _Operation._Basis2._Populate(_Operation._Team, _UDest));
                // Note that _Val is moved from by whichever thread runs the first chunk.
                // If any thread starts any chunk, initialization is complete, so we can't enter the
                // catch or serial fallback below.
                _Run_chunked_parallel_work(_Hw_threads, _Operation);
                return _Dest;
                _CATCH(const _Parallelism_resources_exhausted&)
                // fall through to serial case below
                _CATCH_END
            }

            _Seek_wrapped(_Dest, _STD transform_inclusive_scan(_UFirst, _ULast, _UDest, _Pass_fn(_Reduce_op),
                                     _Pass_fn(_Transform_op), _STD move(_Val)));
            return _Dest;
        }
    }

    _Seek_wrapped(_Dest,
        _STD transform_inclusive_scan(_UFirst, _ULast,
            _Get_unwrapped_n(_Dest, _Idl_distance<_FwdIt1>(_UFirst, _ULast)), _Pass_fn(_Reduce_op),
            _Pass_fn(_Transform_op), _STD move(_Val)));
    return _Dest;
}

template <class _ExPo, class _FwdIt1, class _FwdIt2, class _BinOp, class _UnaryOp,
    _Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_FwdIt2 transform_inclusive_scan(_ExPo&&, const _FwdIt1 _First, const _FwdIt1 _Last, _FwdIt2 _Dest, _BinOp _Reduce_op,
    _UnaryOp _Transform_op) noexcept /* terminates */ {
    // compute partial noncommutative and associative transformed reductions into _Dest
    _REQUIRE_PARALLEL_ITERATOR(_FwdIt1);
    _REQUIRE_PARALLEL_ITERATOR(_FwdIt2);
    _Adl_verify_range(_First, _Last);
    const auto _UFirst = _Get_unwrapped(_First);
    const auto _ULast = _Get_unwrapped(_Last);
    if constexpr (remove_reference_t<_ExPo>::_Parallelize) {
        const size_t _Hw_threads = __std_parallel_algorithms_hw_threads();
        if (_Hw_threads > 1) { // parallelize on multiprocessor machines
            const auto _Count = _STD distance(_UFirst, _ULast);
            auto _UDest = _Get_unwrapped_n(_Dest, _Count);
            if (_Count >= 2) { // ... with at least 2 elements
                _TRY_BEGIN
                _No_init_tag _Tag;
                auto _Passed_reduce = _Pass_fn(_Reduce_op);
                auto _Passed_transform = _Pass_fn(_Transform_op);
                using _Intermediate_t = decay_t<decltype(_Transform_op(*_UFirst))>;
                _Static_partitioned_transform_inclusive_scan2<_Intermediate_t, _No_init_tag, _Unwrapped_t<_FwdIt1>,
                    decltype(_UDest), decltype(_Passed_reduce), decltype(_Passed_transform)>
                    _Operation{_Hw_threads, _Count, _Passed_reduce, _Passed_transform, _Tag};
                _Operation._Basis1._Populate(_Operation._Team, _UFirst);
                _Seek_wrapped(_Dest, _Operation._Basis2._Populate(_Operation._Team, _UDest));
                _Run_chunked_parallel_work(_Hw_threads, _Operation);
                return _Dest;
                _CATCH(const _Parallelism_resources_exhausted&)
                // fall through to serial case below
                _CATCH_END
            }

            _Seek_wrapped(_Dest,
                _STD transform_inclusive_scan(_UFirst, _ULast, _UDest, _Pass_fn(_Reduce_op), _Pass_fn(_Transform_op)));
            return _Dest;
        }
    }

    _Seek_wrapped(_Dest,
        _STD transform_inclusive_scan(_UFirst, _ULast,
            _Get_unwrapped_n(_Dest, _Idl_distance<_FwdIt1>(_UFirst, _ULast)), _Pass_fn(_Reduce_op),
            _Pass_fn(_Transform_op)));
    return _Dest;
}
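// Illustrative usage sketch for the overloads above (not part of this header's implementation; assumes <vector>
// and <functional>):
//     std::vector<int> _In{1, 2, 3};
//     std::vector<int> _Out(_In.size());
//     std::transform_inclusive_scan(std::execution::par, _In.begin(), _In.end(), _Out.begin(), std::plus<>{},
//         [](int _X) { return 2 * _X; }, 10);
//     // _Out == {12, 16, 22}; omitting the trailing initial value selects the overload without _Val.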
// PARALLEL FUNCTION TEMPLATE adjacent_difference
template <class _FwdIt1, class _FwdIt2, class _BinOp>
struct _Static_partitioned_adjacent_difference2 {
    using _Diff = _Common_diff_t<_FwdIt1, _FwdIt2>;
    _Static_partition_team<_Diff> _Team;
    // note offset partitioning:
    _Static_partition_range<_FwdIt1, _Diff> _Basis1; // contains partition of [_First, _Last - 1)
    _Static_partition_range<_FwdIt2, _Diff> _Basis2; // contains partition of [_Dest + 1, _Dest + (_Last - _First))
    _BinOp _Diff_op;

    _Static_partitioned_adjacent_difference2(
        const size_t _Hw_threads, const _Diff _Count, const _FwdIt1 _First, _BinOp _Diff_op_, const _FwdIt2&)
        : _Team{_Count, _Get_chunked_work_chunk_count(_Hw_threads, _Count)}, _Basis1{}, _Basis2{},
          _Diff_op(_Diff_op_) {
        _Basis1._Populate(_Team, _First);
    }

    _Cancellation_status _Process_chunk() {
        const auto _Key = _Team._Get_next_key();
        if (!_Key) {
            return _Cancellation_status::_Canceled;
        }

        const auto _Chunk_number = _Key._Chunk_number;
        auto _In_range = _Basis1._Get_chunk(_Key);
        auto _Dest = _Basis2._Get_first(_Chunk_number, _Team._Get_chunk_offset(_Chunk_number));
        auto _Next = _In_range._First;
        do {
            ++_Next; // note: steps 1 element into the following chunk
            *_Dest = _Diff_op(*_Next, *_In_range._First);
            ++_Dest;
            _In_range._First = _Next;
        } while (_In_range._First != _In_range._Last);

        return _Cancellation_status::_Running;
    }

    static void __stdcall _Threadpool_callback(
        __std_PTP_CALLBACK_INSTANCE, void* const _Context, __std_PTP_WORK) noexcept /* terminates */ {
        _Run_available_chunked_work(*static_cast<_Static_partitioned_adjacent_difference2*>(_Context));
    }
};

template <class _FwdIt1, class _FwdIt2, class _BinOp>
_FwdIt2 _Adjacent_difference_seq(_FwdIt1 _First, const _FwdIt1 _Last, _FwdIt2 _Dest, _BinOp _Diff_op) {
    // compute adjacent differences into _Dest, serially
    if (_First != _Last) {
        *_Dest = *_First;
        ++_Dest;
        for (auto _Next = _First; ++_Next != _Last; _First = _Next) {
            *_Dest = _Diff_op(*_Next, *_First);
            ++_Dest;
        }
    }

    return _Dest;
}
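// Illustrative usage sketch for the overload below (not part of this header's implementation; assumes <vector>
// and <functional>): the parallel path partitions the input over [_First, _Last - 1) and the output over
// [_Dest + 1, ...), so every chunk may read one element past its own input range without waiting on any other
// chunk; the first output element is assigned separately by the submitting thread.
//     std::vector<int> _In{2, 4, 7, 11};
//     std::vector<int> _Out(_In.size());
//     std::adjacent_difference(std::execution::par, _In.begin(), _In.end(), _Out.begin(), std::minus<>{});
//     // _Out == {2, 2, 3, 4}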
template <class _ExPo, class _FwdIt1, class _FwdIt2, class _BinOp, _Enable_if_execution_policy_t<_ExPo> /* = 0 */>
_FwdIt2 adjacent_difference(_ExPo&&, const _FwdIt1 _First, const _FwdIt1 _Last, _FwdIt2 _Dest,
    _BinOp _Diff_op) noexcept /* terminates */ {
    // compute adjacent differences into _Dest
    _REQUIRE_PARALLEL_ITERATOR(_FwdIt1);
    _REQUIRE_PARALLEL_ITERATOR(_FwdIt2);
    _Adl_verify_range(_First, _Last);
    auto _UFirst = _Get_unwrapped(_First);
    const auto _ULast = _Get_unwrapped(_Last);
    if constexpr (remove_reference_t<_ExPo>::_Parallelize) {
        const size_t _Hw_threads = __std_parallel_algorithms_hw_threads();
        if (_Hw_threads > 1) { // parallelize on multiprocessor machines
            auto _Count = _STD distance(_UFirst, _ULast);
            const auto _UDest = _Get_unwrapped_n(_Dest, _Count);
            if (_Count >= 2) { // ... with at least 2 elements
                _TRY_BEGIN
                --_Count; // note unusual offset partitioning
                _Static_partitioned_adjacent_difference2 _Operation{
                    _Hw_threads, _Count, _UFirst, _Pass_fn(_Diff_op), _UDest};
                auto _Result = _Operation._Basis2._Populate(_Operation._Team, _Next_iter(_UDest));
                const _Work_ptr _Work_op{_Operation}; // setup complete, hereafter nothrow or terminate
                _Work_op._Submit_for_chunks(_Hw_threads, _Operation._Team._Chunks);
                // must be done after setup is complete to avoid duplicate assign in serial fallback:
                *_UDest = *_UFirst;
                _Run_available_chunked_work(_Operation);
                _Seek_wrapped(_Dest, _Result);
                return _Dest;
                _CATCH(const _Parallelism_resources_exhausted&)
                // fall through to serial case below
                _CATCH_END
            }

            _Seek_wrapped(_Dest, _Adjacent_difference_seq(_UFirst, _ULast, _UDest, _Pass_fn(_Diff_op)));
            return _Dest;
        }
    }

    // Don't call serial adjacent_difference because it's described as creating a temporary we can avoid
    _Seek_wrapped(_Dest,
        _Adjacent_difference_seq(_UFirst, _ULast, _Get_unwrapped_n(_Dest, _Idl_distance<_FwdIt1>(_UFirst, _ULast)),
            _Pass_fn(_Diff_op)));
    return _Dest;
}
_STD_END

#pragma pop_macro("new")
_STL_RESTORE_CLANG_WARNINGS
#pragma warning(pop)
#pragma pack(pop)
#endif // _HAS_CXX17
#endif // _STL_COMPILER_PREPROCESSOR
#endif // _EXECUTION_