Skip to content

[libc++][PSTL] Introduce cpu traits #88134

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Apr 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions libcxx/include/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -591,6 +591,7 @@ set(files
__numeric/transform_exclusive_scan.h
__numeric/transform_inclusive_scan.h
__numeric/transform_reduce.h
__pstl/cpu_algos/cpu_traits.h
__random/bernoulli_distribution.h
__random/binomial_distribution.h
__random/cauchy_distribution.h
Expand Down
47 changes: 1 addition & 46 deletions libcxx/include/__algorithm/pstl_backends/cpu_backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,52 +9,6 @@
#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_H
#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_H

#include <__config>

/*

// _Functor takes a subrange for [__first, __last) that should be executed in serial
template <class _RandomAccessIterator, class _Functor>
optional<__empty> __parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func);

template <class _Iterator, class _UnaryOp, class _Tp, class _BinaryOp, class _Reduction>
optional<_Tp>
__parallel_transform_reduce(_Iterator __first, _Iterator __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduction);

// Cancel the execution of other jobs - they aren't needed anymore
void __cancel_execution();

template <class _RandomAccessIterator1,
class _RandomAccessIterator2,
class _RandomAccessIterator3,
class _Compare,
class _LeafMerge>
optional<void> __parallel_merge(
_RandomAccessIterator1 __first1,
_RandomAccessIterator1 __last1,
_RandomAccessIterator2 __first2,
_RandomAccessIterator2 __last2,
_RandomAccessIterator3 __outit,
_Compare __comp,
_LeafMerge __leaf_merge);

template <class _RandomAccessIterator, class _Comp, class _LeafSort>
void __parallel_stable_sort(_RandomAccessIterator __first,
_RandomAccessIterator __last,
_Comp __comp,
_LeafSort __leaf_sort);

TODO: Document the parallel backend

Exception handling
==================

CPU backends are expected to report errors (i.e. failure to allocate) by returning a disengaged `optional` from their
implementation. Exceptions shouldn't be used to report an internal failure-to-allocate, since all exceptions are turned
into a program termination at the front-end level. When a backend returns a disengaged `optional` to the frontend, the
frontend will turn that into a call to `std::__throw_bad_alloc();` to report the internal failure to the user.
*/

#include <__algorithm/pstl_backends/cpu_backends/any_of.h>
#include <__algorithm/pstl_backends/cpu_backends/backend.h>
#include <__algorithm/pstl_backends/cpu_backends/fill.h>
Expand All @@ -64,5 +18,6 @@ frontend will turn that into a call to `std::__throw_bad_alloc();` to report the
#include <__algorithm/pstl_backends/cpu_backends/stable_sort.h>
#include <__algorithm/pstl_backends/cpu_backends/transform.h>
#include <__algorithm/pstl_backends/cpu_backends/transform_reduce.h>
#include <__config>

#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_H
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include <__config>
#include <__functional/operations.h>
#include <__iterator/concepts.h>
#include <__pstl/cpu_algos/cpu_traits.h>
#include <__type_traits/is_execution_policy.h>
#include <__utility/move.h>
#include <__utility/pair.h>
Expand All @@ -30,13 +31,13 @@ _LIBCPP_PUSH_MACROS

_LIBCPP_BEGIN_NAMESPACE_STD

template <class _Index, class _Brick>
template <class _Backend, class _Index, class _Brick>
_LIBCPP_HIDE_FROM_ABI optional<bool> __parallel_or(_Index __first, _Index __last, _Brick __f) {
std::atomic<bool> __found(false);
auto __ret = __par_backend::__parallel_for(__first, __last, [__f, &__found](_Index __i, _Index __j) {
auto __ret = __pstl::__cpu_traits<_Backend>::__parallel_for(__first, __last, [__f, &__found](_Index __i, _Index __j) {
if (!__found.load(std::memory_order_relaxed) && __f(__i, __j)) {
__found.store(true, std::memory_order_relaxed);
__par_backend::__cancel_execution();
__pstl::__cpu_traits<_Backend>::__cancel_execution();
}
});
if (!__ret)
Expand Down Expand Up @@ -74,7 +75,7 @@ _LIBCPP_HIDE_FROM_ABI optional<bool>
__pstl_any_of(__cpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
__has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
return std::__parallel_or(
return std::__parallel_or<__cpu_backend_tag>(
__first, __last, [&__pred](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
auto __res = std::__pstl_any_of<__remove_parallel_policy_t<_ExecutionPolicy>>(
__cpu_backend_tag{}, __brick_first, __brick_last, __pred);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,20 @@
# pragma GCC system_header
#endif

#if _LIBCPP_STD_VER >= 17
#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17

_LIBCPP_BEGIN_NAMESPACE_STD

struct __cpu_backend_tag {};

inline constexpr size_t __lane_size = 64;
# if defined(_LIBCPP_PSTL_CPU_BACKEND_SERIAL)
using __cpu_backend_tag = __pstl::__serial_backend_tag;
# elif defined(_LIBCPP_PSTL_CPU_BACKEND_THREAD)
using __cpu_backend_tag = __pstl::__std_thread_backend_tag;
# elif defined(_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
using __cpu_backend_tag = __pstl::__libdispatch_backend_tag;
# endif

_LIBCPP_END_NAMESPACE_STD

#endif // _LIBCPP_STD_VER >= 17
#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && && _LIBCPP_STD_VER >= 17

#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_BACKEND_H
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <__algorithm/pstl_backends/cpu_backends/backend.h>
#include <__config>
#include <__iterator/concepts.h>
#include <__pstl/cpu_algos/cpu_traits.h>
#include <__type_traits/is_execution_policy.h>
#include <__utility/empty.h>
#include <optional>
Expand All @@ -39,7 +40,7 @@ _LIBCPP_HIDE_FROM_ABI optional<__empty>
__pstl_fill(__cpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
__has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
return __par_backend::__parallel_for(
return __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_for(
__first, __last, [&__value](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
[[maybe_unused]] auto __res = std::__pstl_fill<__remove_parallel_policy_t<_ExecutionPolicy>>(
__cpu_backend_tag{}, __brick_first, __brick_last, __value);
Expand Down
22 changes: 12 additions & 10 deletions libcxx/include/__algorithm/pstl_backends/cpu_backends/find_if.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include <__functional/operations.h>
#include <__iterator/concepts.h>
#include <__iterator/iterator_traits.h>
#include <__pstl/cpu_algos/cpu_traits.h>
#include <__type_traits/is_execution_policy.h>
#include <__utility/move.h>
#include <__utility/pair.h>
Expand All @@ -33,16 +34,16 @@ _LIBCPP_PUSH_MACROS

_LIBCPP_BEGIN_NAMESPACE_STD

template <class _Index, class _Brick, class _Compare>
template <class _Backend, class _Index, class _Brick, class _Compare>
_LIBCPP_HIDE_FROM_ABI optional<_Index>
__parallel_find(_Index __first, _Index __last, _Brick __f, _Compare __comp, bool __b_first) {
typedef typename std::iterator_traits<_Index>::difference_type _DifferenceType;
const _DifferenceType __n = __last - __first;
_DifferenceType __initial_dist = __b_first ? __n : -1;
std::atomic<_DifferenceType> __extremum(__initial_dist);
// TODO: find out what is better here: parallel_for or parallel_reduce
auto __res =
__par_backend::__parallel_for(__first, __last, [__comp, __f, __first, &__extremum](_Index __i, _Index __j) {
auto __res = __pstl::__cpu_traits<_Backend>::__parallel_for(
__first, __last, [__comp, __f, __first, &__extremum](_Index __i, _Index __j) {
// See "Reducing Contention Through Priority Updates", PPoPP '13, for discussion of
// why using a shared variable scales fairly well in this situation.
if (__comp(__i - __first, __extremum)) {
Expand All @@ -61,12 +62,12 @@ __parallel_find(_Index __first, _Index __last, _Brick __f, _Compare __comp, bool
return __extremum.load() != __initial_dist ? __first + __extremum.load() : __last;
}

template <class _Index, class _DifferenceType, class _Compare>
template <class _Backend, class _Index, class _DifferenceType, class _Compare>
_LIBCPP_HIDE_FROM_ABI _Index
__simd_first(_Index __first, _DifferenceType __begin, _DifferenceType __end, _Compare __comp) noexcept {
// Experiments show good block sizes like this
const _DifferenceType __block_size = 8;
alignas(__lane_size) _DifferenceType __lane[__block_size] = {0};
const _DifferenceType __block_size = 8;
alignas(__pstl::__cpu_traits<_Backend>::__lane_size) _DifferenceType __lane[__block_size] = {0};
while (__end - __begin >= __block_size) {
_DifferenceType __found = 0;
_PSTL_PRAGMA_SIMD_REDUCTION(| : __found) for (_DifferenceType __i = __begin; __i < __begin + __block_size; ++__i) {
Expand Down Expand Up @@ -102,7 +103,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardIterator>
__pstl_find_if(__cpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
__has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
return std::__parallel_find(
return std::__parallel_find<__cpu_backend_tag>(
__first,
__last,
[&__pred](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
Expand All @@ -116,9 +117,10 @@ __pstl_find_if(__cpu_backend_tag, _ForwardIterator __first, _ForwardIterator __l
} else if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
__has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
using __diff_t = __iter_diff_t<_ForwardIterator>;
return std::__simd_first(__first, __diff_t(0), __last - __first, [&__pred](_ForwardIterator __iter, __diff_t __i) {
return __pred(__iter[__i]);
});
return std::__simd_first<__cpu_backend_tag>(
__first, __diff_t(0), __last - __first, [&__pred](_ForwardIterator __iter, __diff_t __i) {
return __pred(__iter[__i]);
});
} else {
return std::find_if(__first, __last, __pred);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <__algorithm/pstl_backends/cpu_backends/backend.h>
#include <__config>
#include <__iterator/concepts.h>
#include <__pstl/cpu_algos/cpu_traits.h>
#include <__type_traits/is_execution_policy.h>
#include <__utility/empty.h>
#include <optional>
Expand All @@ -39,7 +40,7 @@ _LIBCPP_HIDE_FROM_ABI optional<__empty>
__pstl_for_each(__cpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) {
if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
__has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
return std::__par_backend::__parallel_for(
return __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_for(
__first, __last, [__func](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
[[maybe_unused]] auto __res = std::__pstl_for_each<__remove_parallel_policy_t<_ExecutionPolicy>>(
__cpu_backend_tag{}, __brick_first, __brick_last, __func);
Expand Down
Loading