Skip to content

Commit a3ce29f

Browse files
authored
[libc++][PSTL] Introduce cpu traits (#88134)
Currently, CPU backends in the PSTL are created by defining functions in the __par_backend namespace. Then, the PSTL includes the CPU backend that gets configured via CMake and gets those definitions. This prevents CPU backends from easily co-existing and is a bit confusing. To solve this problem, this patch introduces the notion of __cpu_traits, which is a cheap encapsulation of the basic operations required to implement a CPU-based PSTL. Different backends can now define their own tag and coexist, and the CPU-based PSTL will simply use __cpu_traits to dispatch to the right implementation of e.g. __for_each. Note that this patch doesn't change the actual implementation of the backends in any way, it only modifies how that implementation is accessed to implement PSTL algorithms. This patch is a step towards #88131.
1 parent b68ff06 commit a3ce29f

File tree

19 files changed

+476
-406
lines changed

19 files changed

+476
-406
lines changed

libcxx/include/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -594,6 +594,7 @@ set(files
594594
__numeric/transform_exclusive_scan.h
595595
__numeric/transform_inclusive_scan.h
596596
__numeric/transform_reduce.h
597+
__pstl/cpu_algos/cpu_traits.h
597598
__random/bernoulli_distribution.h
598599
__random/binomial_distribution.h
599600
__random/cauchy_distribution.h

libcxx/include/__algorithm/pstl_backends/cpu_backend.h

Lines changed: 1 addition & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -9,52 +9,6 @@
99
#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_H
1010
#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_H
1111

12-
#include <__config>
13-
14-
/*
15-
16-
// _Functor takes a subrange for [__first, __last) that should be executed in serial
17-
template <class _RandomAccessIterator, class _Functor>
18-
optional<__empty> __parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func);
19-
20-
template <class _Iterator, class _UnaryOp, class _Tp, class _BinaryOp, class _Reduction>
21-
optional<_Tp>
22-
__parallel_transform_reduce(_Iterator __first, _Iterator __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduction);
23-
24-
// Cancel the execution of other jobs - they aren't needed anymore
25-
void __cancel_execution();
26-
27-
template <class _RandomAccessIterator1,
28-
class _RandomAccessIterator2,
29-
class _RandomAccessIterator3,
30-
class _Compare,
31-
class _LeafMerge>
32-
optional<void> __parallel_merge(
33-
_RandomAccessIterator1 __first1,
34-
_RandomAccessIterator1 __last1,
35-
_RandomAccessIterator2 __first2,
36-
_RandomAccessIterator2 __last2,
37-
_RandomAccessIterator3 __outit,
38-
_Compare __comp,
39-
_LeafMerge __leaf_merge);
40-
41-
template <class _RandomAccessIterator, class _Comp, class _LeafSort>
42-
void __parallel_stable_sort(_RandomAccessIterator __first,
43-
_RandomAccessIterator __last,
44-
_Comp __comp,
45-
_LeafSort __leaf_sort);
46-
47-
TODO: Document the parallel backend
48-
49-
Exception handling
50-
==================
51-
52-
CPU backends are expected to report errors (i.e. failure to allocate) by returning a disengaged `optional` from their
53-
implementation. Exceptions shouldn't be used to report an internal failure-to-allocate, since all exceptions are turned
54-
into a program termination at the front-end level. When a backend returns a disengaged `optional` to the frontend, the
55-
frontend will turn that into a call to `std::__throw_bad_alloc();` to report the internal failure to the user.
56-
*/
57-
5812
#include <__algorithm/pstl_backends/cpu_backends/any_of.h>
5913
#include <__algorithm/pstl_backends/cpu_backends/backend.h>
6014
#include <__algorithm/pstl_backends/cpu_backends/fill.h>
@@ -64,5 +18,6 @@ frontend will turn that into a call to `std::__throw_bad_alloc();` to report the
6418
#include <__algorithm/pstl_backends/cpu_backends/stable_sort.h>
6519
#include <__algorithm/pstl_backends/cpu_backends/transform.h>
6620
#include <__algorithm/pstl_backends/cpu_backends/transform_reduce.h>
21+
#include <__config>
6722

6823
#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_H

libcxx/include/__algorithm/pstl_backends/cpu_backends/any_of.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include <__config>
1818
#include <__functional/operations.h>
1919
#include <__iterator/concepts.h>
20+
#include <__pstl/cpu_algos/cpu_traits.h>
2021
#include <__type_traits/is_execution_policy.h>
2122
#include <__utility/move.h>
2223
#include <__utility/pair.h>
@@ -30,13 +31,13 @@ _LIBCPP_PUSH_MACROS
3031

3132
_LIBCPP_BEGIN_NAMESPACE_STD
3233

33-
template <class _Index, class _Brick>
34+
template <class _Backend, class _Index, class _Brick>
3435
_LIBCPP_HIDE_FROM_ABI optional<bool> __parallel_or(_Index __first, _Index __last, _Brick __f) {
3536
std::atomic<bool> __found(false);
36-
auto __ret = __par_backend::__parallel_for(__first, __last, [__f, &__found](_Index __i, _Index __j) {
37+
auto __ret = __pstl::__cpu_traits<_Backend>::__parallel_for(__first, __last, [__f, &__found](_Index __i, _Index __j) {
3738
if (!__found.load(std::memory_order_relaxed) && __f(__i, __j)) {
3839
__found.store(true, std::memory_order_relaxed);
39-
__par_backend::__cancel_execution();
40+
__pstl::__cpu_traits<_Backend>::__cancel_execution();
4041
}
4142
});
4243
if (!__ret)
@@ -74,7 +75,7 @@ _LIBCPP_HIDE_FROM_ABI optional<bool>
7475
__pstl_any_of(__cpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
7576
if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
7677
__has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
77-
return std::__parallel_or(
78+
return std::__parallel_or<__cpu_backend_tag>(
7879
__first, __last, [&__pred](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
7980
auto __res = std::__pstl_any_of<__remove_parallel_policy_t<_ExecutionPolicy>>(
8081
__cpu_backend_tag{}, __brick_first, __brick_last, __pred);

libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,16 +26,20 @@
2626
# pragma GCC system_header
2727
#endif
2828

29-
#if _LIBCPP_STD_VER >= 17
29+
#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
3030

3131
_LIBCPP_BEGIN_NAMESPACE_STD
3232

33-
struct __cpu_backend_tag {};
34-
35-
inline constexpr size_t __lane_size = 64;
33+
# if defined(_LIBCPP_PSTL_CPU_BACKEND_SERIAL)
34+
using __cpu_backend_tag = __pstl::__serial_backend_tag;
35+
# elif defined(_LIBCPP_PSTL_CPU_BACKEND_THREAD)
36+
using __cpu_backend_tag = __pstl::__std_thread_backend_tag;
37+
# elif defined(_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH)
38+
using __cpu_backend_tag = __pstl::__libdispatch_backend_tag;
39+
# endif
3640

3741
_LIBCPP_END_NAMESPACE_STD
3842

39-
#endif // _LIBCPP_STD_VER >= 17
43+
#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
4044

4145
#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_BACKEND_H

libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include <__algorithm/pstl_backends/cpu_backends/backend.h>
1414
#include <__config>
1515
#include <__iterator/concepts.h>
16+
#include <__pstl/cpu_algos/cpu_traits.h>
1617
#include <__type_traits/is_execution_policy.h>
1718
#include <__utility/empty.h>
1819
#include <optional>
@@ -39,7 +40,7 @@ _LIBCPP_HIDE_FROM_ABI optional<__empty>
3940
__pstl_fill(__cpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
4041
if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
4142
__has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
42-
return __par_backend::__parallel_for(
43+
return __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_for(
4344
__first, __last, [&__value](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
4445
[[maybe_unused]] auto __res = std::__pstl_fill<__remove_parallel_policy_t<_ExecutionPolicy>>(
4546
__cpu_backend_tag{}, __brick_first, __brick_last, __value);

libcxx/include/__algorithm/pstl_backends/cpu_backends/find_if.h

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include <__functional/operations.h>
1717
#include <__iterator/concepts.h>
1818
#include <__iterator/iterator_traits.h>
19+
#include <__pstl/cpu_algos/cpu_traits.h>
1920
#include <__type_traits/is_execution_policy.h>
2021
#include <__utility/move.h>
2122
#include <__utility/pair.h>
@@ -33,16 +34,16 @@ _LIBCPP_PUSH_MACROS
3334

3435
_LIBCPP_BEGIN_NAMESPACE_STD
3536

36-
template <class _Index, class _Brick, class _Compare>
37+
template <class _Backend, class _Index, class _Brick, class _Compare>
3738
_LIBCPP_HIDE_FROM_ABI optional<_Index>
3839
__parallel_find(_Index __first, _Index __last, _Brick __f, _Compare __comp, bool __b_first) {
3940
typedef typename std::iterator_traits<_Index>::difference_type _DifferenceType;
4041
const _DifferenceType __n = __last - __first;
4142
_DifferenceType __initial_dist = __b_first ? __n : -1;
4243
std::atomic<_DifferenceType> __extremum(__initial_dist);
4344
// TODO: find out what is better here: parallel_for or parallel_reduce
44-
auto __res =
45-
__par_backend::__parallel_for(__first, __last, [__comp, __f, __first, &__extremum](_Index __i, _Index __j) {
45+
auto __res = __pstl::__cpu_traits<_Backend>::__parallel_for(
46+
__first, __last, [__comp, __f, __first, &__extremum](_Index __i, _Index __j) {
4647
// See "Reducing Contention Through Priority Updates", PPoPP '13, for discussion of
4748
// why using a shared variable scales fairly well in this situation.
4849
if (__comp(__i - __first, __extremum)) {
@@ -61,12 +62,12 @@ __parallel_find(_Index __first, _Index __last, _Brick __f, _Compare __comp, bool
6162
return __extremum.load() != __initial_dist ? __first + __extremum.load() : __last;
6263
}
6364

64-
template <class _Index, class _DifferenceType, class _Compare>
65+
template <class _Backend, class _Index, class _DifferenceType, class _Compare>
6566
_LIBCPP_HIDE_FROM_ABI _Index
6667
__simd_first(_Index __first, _DifferenceType __begin, _DifferenceType __end, _Compare __comp) noexcept {
6768
// Experiments show good block sizes like this
68-
const _DifferenceType __block_size = 8;
69-
alignas(__lane_size) _DifferenceType __lane[__block_size] = {0};
69+
const _DifferenceType __block_size = 8;
70+
alignas(__pstl::__cpu_traits<_Backend>::__lane_size) _DifferenceType __lane[__block_size] = {0};
7071
while (__end - __begin >= __block_size) {
7172
_DifferenceType __found = 0;
7273
_PSTL_PRAGMA_SIMD_REDUCTION(| : __found) for (_DifferenceType __i = __begin; __i < __begin + __block_size; ++__i) {
@@ -102,7 +103,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardIterator>
102103
__pstl_find_if(__cpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) {
103104
if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
104105
__has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
105-
return std::__parallel_find(
106+
return std::__parallel_find<__cpu_backend_tag>(
106107
__first,
107108
__last,
108109
[&__pred](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
@@ -116,9 +117,10 @@ __pstl_find_if(__cpu_backend_tag, _ForwardIterator __first, _ForwardIterator __l
116117
} else if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
117118
__has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
118119
using __diff_t = __iter_diff_t<_ForwardIterator>;
119-
return std::__simd_first(__first, __diff_t(0), __last - __first, [&__pred](_ForwardIterator __iter, __diff_t __i) {
120-
return __pred(__iter[__i]);
121-
});
120+
return std::__simd_first<__cpu_backend_tag>(
121+
__first, __diff_t(0), __last - __first, [&__pred](_ForwardIterator __iter, __diff_t __i) {
122+
return __pred(__iter[__i]);
123+
});
122124
} else {
123125
return std::find_if(__first, __last, __pred);
124126
}

libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include <__algorithm/pstl_backends/cpu_backends/backend.h>
1414
#include <__config>
1515
#include <__iterator/concepts.h>
16+
#include <__pstl/cpu_algos/cpu_traits.h>
1617
#include <__type_traits/is_execution_policy.h>
1718
#include <__utility/empty.h>
1819
#include <optional>
@@ -39,7 +40,7 @@ _LIBCPP_HIDE_FROM_ABI optional<__empty>
3940
__pstl_for_each(__cpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) {
4041
if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
4142
__has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
42-
return std::__par_backend::__parallel_for(
43+
return __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_for(
4344
__first, __last, [__func](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
4445
[[maybe_unused]] auto __res = std::__pstl_for_each<__remove_parallel_policy_t<_ExecutionPolicy>>(
4546
__cpu_backend_tag{}, __brick_first, __brick_last, __func);

0 commit comments

Comments
 (0)