Skip to content

Commit 04f386d

Browse files
committed
Optimize ranges::copy_backward for vector<bool>::iterator
1 parent e5b2be3 commit 04f386d

File tree

8 files changed

+428
-71
lines changed

8 files changed

+428
-71
lines changed

libcxx/docs/ReleaseNotes/21.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ Implemented Papers
4444
Improvements and New Features
4545
-----------------------------
4646

47-
- The ``std::ranges::copy`` and ``std::ranges::copy_n`` algorithms have been optimized for ``std::vector<bool>::iterator``\s,
47+
- The ``std::ranges::{copy, copy_n, copy_backward}`` algorithms have been optimized for ``std::vector<bool>::iterator``\s,
4848
resulting in a performance improvement of up to 2000x.
4949

5050

libcxx/include/__algorithm/copy_backward.h

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,14 @@
1010
#define _LIBCPP___ALGORITHM_COPY_BACKWARD_H
1111

1212
#include <__algorithm/copy_move_common.h>
13+
#include <__algorithm/copy_n.h>
1314
#include <__algorithm/iterator_operations.h>
1415
#include <__algorithm/min.h>
1516
#include <__config>
17+
#include <__fwd/bit_reference.h>
1618
#include <__iterator/iterator_traits.h>
1719
#include <__iterator/segmented_iterator.h>
20+
#include <__memory/pointer_traits.h>
1821
#include <__type_traits/common_type.h>
1922
#include <__type_traits/enable_if.h>
2023
#include <__type_traits/is_constructible.h>
@@ -34,6 +37,124 @@ template <class _AlgPolicy, class _InIter, class _Sent, class _OutIter>
3437
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_InIter, _OutIter>
3538
__copy_backward(_InIter __first, _Sent __last, _OutIter __result);
3639

40+
// Copies the bits of [__first, __last) backward so that the copy ends at __result, and returns __result moved
// back to the first bit written (or __result unchanged when the range is empty).
// The masks below are computed from __last.__ctz_ and applied unshifted to the destination word, so __last and
// __result must share the same bit offset; the __bit_iterator overload of __copy_backward_impl::operator() only
// calls this function when __last.__ctz_ == __result.__ctz_.
// __first is only used to compute the number of bits to copy.
template <class _Cp, bool _IsConst>
41+
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_backward_aligned(
42+
__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
43+
using _In = __bit_iterator<_Cp, _IsConst>;
44+
using difference_type = typename _In::difference_type;
45+
using __storage_type = typename _In::__storage_type;
46+
47+
const int __bits_per_word = _In::__bits_per_word;
48+
difference_type __n = __last - __first; // number of bits still to be copied
49+
if (__n > 0) {
50+
// do first word
51+
if (__last.__ctz_ != 0) { // __last is not on a word boundary: handle the partial word it ends in
52+
difference_type __dn = std::min(static_cast<difference_type>(__last.__ctz_), __n); // bits taken from this word
53+
__n -= __dn;
54+
unsigned __clz = __bits_per_word - __last.__ctz_;
55+
// __m selects bits [__last.__ctz_ - __dn, __last.__ctz_) of the word
__storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz);
56+
__storage_type __b = *__last.__seg_ & __m;
57+
*__result.__seg_ &= ~__m; // clear the destination bits that are about to be overwritten
58+
*__result.__seg_ |= __b;
59+
// step __result back by __dn bits, modulo the word size (__result.__seg_ is not changed here)
__result.__ctz_ = static_cast<unsigned>(((-__dn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word);
60+
// __last.__ctz_ = 0
61+
}
62+
// __last.__ctz_ == 0 || __n == 0
63+
// __result.__ctz_ == 0 || __n == 0
64+
// do middle words
65+
__storage_type __nw = __n / __bits_per_word; // number of whole words left to copy
66+
__result.__seg_ -= __nw;
67+
__last.__seg_ -= __nw;
68+
// both word pointers now address the lowest whole word; copy the __nw words in one call
std::copy_n(std::__to_address(__last.__seg_), __nw, std::__to_address(__result.__seg_));
69+
__n -= __nw * __bits_per_word;
70+
// do last word
71+
if (__n > 0) {
72+
__storage_type __m = ~__storage_type(0) << (__bits_per_word - __n); // the top __n bits of a word
73+
__storage_type __b = *--__last.__seg_ & __m;
74+
*--__result.__seg_ &= ~__m;
75+
*__result.__seg_ |= __b;
76+
__result.__ctz_ = static_cast<unsigned>(-__n & (__bits_per_word - 1)); // i.e. __bits_per_word - __n
77+
}
78+
}
79+
return __result;
80+
}
81+
82+
// Copies the bits of [__first, __last) backward so that the copy ends at __result, for the case where __last
// and __result sit at different bit offsets within their words (the __bit_iterator overload of
// __copy_backward_impl::operator() calls this function when __last.__ctz_ != __result.__ctz_).
// Each chunk of source bits is shifted to the destination offset and, when it does not fit below
// __result.__ctz_ in the current destination word, the remainder is written to the top of the previous word.
// Returns __result after it has been stepped back over the bits written; __first is only used to compute the
// number of bits to copy.
template <class _Cp, bool _IsConst>
83+
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_backward_unaligned(
84+
__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
85+
using _In = __bit_iterator<_Cp, _IsConst>;
86+
using difference_type = typename _In::difference_type;
87+
using __storage_type = typename _In::__storage_type;
88+
89+
const int __bits_per_word = _In::__bits_per_word;
90+
difference_type __n = __last - __first; // number of bits still to be copied
91+
if (__n > 0) {
92+
// do first word
93+
if (__last.__ctz_ != 0) { // __last is not on a word boundary: handle the partial word it ends in
94+
difference_type __dn = std::min(static_cast<difference_type>(__last.__ctz_), __n); // bits taken from this word
95+
__n -= __dn;
96+
unsigned __clz_l = __bits_per_word - __last.__ctz_;
97+
// __m selects bits [__last.__ctz_ - __dn, __last.__ctz_) of the source word
__storage_type __m = (~__storage_type(0) << (__last.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_l);
98+
__storage_type __b = *__last.__seg_ & __m;
99+
unsigned __clz_r = __bits_per_word - __result.__ctz_;
100+
// __ddn: how many of the __dn bits fit below __result.__ctz_ in the current destination word
__storage_type __ddn = std::min(__dn, static_cast<difference_type>(__result.__ctz_));
101+
if (__ddn > 0) {
102+
// __m now selects bits [__result.__ctz_ - __ddn, __result.__ctz_) of the destination word
__m = (~__storage_type(0) << (__result.__ctz_ - __ddn)) & (~__storage_type(0) >> __clz_r);
103+
*__result.__seg_ &= ~__m;
104+
// shift the source bits so that source position __last.__ctz_ lines up with __result.__ctz_
if (__result.__ctz_ > __last.__ctz_)
105+
*__result.__seg_ |= __b << (__result.__ctz_ - __last.__ctz_);
106+
else
107+
*__result.__seg_ |= __b >> (__last.__ctz_ - __result.__ctz_);
108+
// step __result back by __ddn bits, modulo the word size
__result.__ctz_ = static_cast<unsigned>(((-__ddn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word);
109+
__dn -= __ddn;
110+
}
111+
if (__dn > 0) { // the remaining __dn bits go to the top of the previous destination word
112+
// __result.__ctz_ == 0
113+
--__result.__seg_;
114+
__result.__ctz_ = static_cast<unsigned>(-__dn & (__bits_per_word - 1)); // i.e. __bits_per_word - __dn
115+
__m = ~__storage_type(0) << __result.__ctz_; // bits from __result.__ctz_ up to the top of the word
116+
*__result.__seg_ &= ~__m;
117+
__last.__ctz_ -= __dn + __ddn; // now the position of the lowest source bit taken from this word
118+
*__result.__seg_ |= __b << (__result.__ctz_ - __last.__ctz_);
119+
}
120+
// __last.__ctz_ = 0
121+
}
122+
// __last.__ctz_ == 0 || __n == 0
123+
// __result.__ctz_ != 0 || __n == 0
124+
// do middle words
125+
// NOTE(review): when __n == 0 and __result.__ctz_ == 0 at this point, __clz_r equals __bits_per_word and the
// shift on the next statement is by the full word width; the value is then unused (the loop and the last-word
// step are skipped), but confirm this is acceptable, in particular during constant evaluation.
unsigned __clz_r = __bits_per_word - __result.__ctz_;
126+
__storage_type __m = ~__storage_type(0) >> __clz_r; // the low __result.__ctz_ bits of a word
127+
for (; __n >= __bits_per_word; __n -= __bits_per_word) {
128+
__storage_type __b = *--__last.__seg_; // one whole source word, split across two destination words
129+
*__result.__seg_ &= ~__m;
130+
*__result.__seg_ |= __b >> __clz_r; // upper part of __b fills the low bits of the current destination word
131+
*--__result.__seg_ &= __m; // keep the low bits of the previous destination word, clear the rest
132+
*__result.__seg_ |= __b << __result.__ctz_; // lower part of __b fills the bits from __result.__ctz_ upward
133+
}
134+
// do last word
135+
if (__n > 0) {
136+
__m = ~__storage_type(0) << (__bits_per_word - __n); // the top __n bits of the source word
137+
__storage_type __b = *--__last.__seg_ & __m;
138+
__clz_r = __bits_per_word - __result.__ctz_;
139+
// __dn: how many of the __n bits fit below __result.__ctz_ in the current destination word
__storage_type __dn = std::min(__n, static_cast<difference_type>(__result.__ctz_));
140+
__m = (~__storage_type(0) << (__result.__ctz_ - __dn)) & (~__storage_type(0) >> __clz_r);
141+
*__result.__seg_ &= ~__m;
142+
*__result.__seg_ |= __b >> (__bits_per_word - __result.__ctz_); // top of the source word lines up with __result.__ctz_
143+
// step __result back by __dn bits, modulo the word size
__result.__ctz_ = static_cast<unsigned>(((-__dn & (__bits_per_word - 1)) + __result.__ctz_) % __bits_per_word);
144+
__n -= __dn;
145+
if (__n > 0) { // the remaining __n bits go to the top of the previous destination word
146+
// __result.__ctz_ == 0
147+
--__result.__seg_;
148+
__result.__ctz_ = static_cast<unsigned>(-__n & (__bits_per_word - 1)); // i.e. __bits_per_word - __n
149+
__m = ~__storage_type(0) << __result.__ctz_;
150+
*__result.__seg_ &= ~__m;
151+
// the lowest source bit taken sits at __bits_per_word - __n - __dn; move it to __result.__ctz_
*__result.__seg_ |= __b << (__result.__ctz_ - (__bits_per_word - __n - __dn));
152+
}
153+
}
154+
}
155+
return __result;
156+
}
157+
37158
template <class _AlgPolicy>
38159
struct __copy_backward_impl {
39160
template <class _InIter, class _Sent, class _OutIter>
@@ -107,6 +228,16 @@ struct __copy_backward_impl {
107228
}
108229
}
109230

231+
// Overload taken when the source range and the destination are all __bit_iterators over the same _Cp.
// Dispatches on the bit offsets: if __last and __result have the same offset within their words the aligned
// routine is used, otherwise the unaligned one.
// Returns a pair of __last and the iterator returned by the chosen routine.
template <class _Cp, bool _IsConst>
232+
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<__bit_iterator<_Cp, _IsConst>, __bit_iterator<_Cp, false> >
233+
operator()(__bit_iterator<_Cp, _IsConst> __first,
234+
__bit_iterator<_Cp, _IsConst> __last,
235+
__bit_iterator<_Cp, false> __result) {
236+
if (__last.__ctz_ == __result.__ctz_) // same bit offset: source and destination words line up
237+
return std::make_pair(__last, std::__copy_backward_aligned(__first, __last, __result));
238+
return std::make_pair(__last, std::__copy_backward_unaligned(__first, __last, __result));
239+
}
240+
110241
// At this point, the iterators have been unwrapped so any `contiguous_iterator` has been unwrapped to a pointer.
111242
template <class _In, class _Out, __enable_if_t<__can_lower_copy_assignment_to_memmove<_In, _Out>::value, int> = 0>
112243
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_In*, _Out*>

libcxx/include/__bit_reference

Lines changed: 111 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#define _LIBCPP___BIT_REFERENCE
1212

1313
#include <__algorithm/copy.h>
14+
#include <__algorithm/copy_backward.h>
1415
#include <__algorithm/copy_n.h>
1516
#include <__algorithm/min.h>
1617
#include <__bit/countr.h>
@@ -185,10 +186,17 @@ private:
185186
__mask_(__m) {}
186187
};
187188

189+
<<<<<<< HEAD
188190
// copy_backward
189191

190192
template <class _Cp, bool _IsConst>
191193
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_backward_aligned(
194+
=======
195+
// copy
196+
197+
template <class _Cp, bool _IsConst>
198+
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_aligned(
199+
>>>>>>> 03028d1b8610 (Optimize ranges::copy_backward for vector<bool>::iterator)
192200
__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
193201
using _In = __bit_iterator<_Cp, _IsConst>;
194202
using difference_type = typename _In::difference_type;
@@ -198,6 +206,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> _
198206
difference_type __n = __last - __first;
199207
if (__n > 0) {
200208
// do first word
209+
<<<<<<< HEAD
201210
if (__last.__ctz_ != 0) {
202211
difference_type __dn = std::min(static_cast<difference_type>(__last.__ctz_), __n);
203212
__n -= __dn;
@@ -224,13 +233,46 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> _
224233
*--__result.__seg_ &= ~__m;
225234
*__result.__seg_ |= __b;
226235
__result.__ctz_ = static_cast<unsigned>(-__n & (__bits_per_word - 1));
236+
=======
237+
if (__first.__ctz_ != 0) {
238+
unsigned __clz = __bits_per_word - __first.__ctz_;
239+
difference_type __dn = std::min(static_cast<difference_type>(__clz), __n);
240+
__n -= __dn;
241+
__storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn));
242+
__storage_type __b = *__first.__seg_ & __m;
243+
*__result.__seg_ &= ~__m;
244+
*__result.__seg_ |= __b;
245+
__result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word;
246+
__result.__ctz_ = static_cast<unsigned>((__dn + __result.__ctz_) % __bits_per_word);
247+
++__first.__seg_;
248+
// __first.__ctz_ = 0;
249+
}
250+
// __first.__ctz_ == 0;
251+
// do middle words
252+
__storage_type __nw = __n / __bits_per_word;
253+
std::copy_n(std::__to_address(__first.__seg_), __nw, std::__to_address(__result.__seg_));
254+
__n -= __nw * __bits_per_word;
255+
__result.__seg_ += __nw;
256+
// do last word
257+
if (__n > 0) {
258+
__first.__seg_ += __nw;
259+
__storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n);
260+
__storage_type __b = *__first.__seg_ & __m;
261+
*__result.__seg_ &= ~__m;
262+
*__result.__seg_ |= __b;
263+
__result.__ctz_ = static_cast<unsigned>(__n);
264+
>>>>>>> 03028d1b8610 (Optimize ranges::copy_backward for vector<bool>::iterator)
227265
}
228266
}
229267
return __result;
230268
}
231269

232270
template <class _Cp, bool _IsConst>
271+
<<<<<<< HEAD
233272
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_backward_unaligned(
273+
=======
274+
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_unaligned(
275+
>>>>>>> 03028d1b8610 (Optimize ranges::copy_backward for vector<bool>::iterator)
234276
__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
235277
using _In = __bit_iterator<_Cp, _IsConst>;
236278
using difference_type = typename _In::difference_type;
@@ -240,6 +282,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> _
240282
difference_type __n = __last - __first;
241283
if (__n > 0) {
242284
// do first word
285+
<<<<<<< HEAD
243286
if (__last.__ctz_ != 0) {
244287
difference_type __dn = std::min(static_cast<difference_type>(__last.__ctz_), __n);
245288
__n -= __dn;
@@ -299,18 +342,82 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> _
299342
__m = ~__storage_type(0) << __result.__ctz_;
300343
*__result.__seg_ &= ~__m;
301344
*__result.__seg_ |= __b << (__result.__ctz_ - (__bits_per_word - __n - __dn));
345+
=======
346+
if (__first.__ctz_ != 0) {
347+
unsigned __clz_f = __bits_per_word - __first.__ctz_;
348+
difference_type __dn = std::min(static_cast<difference_type>(__clz_f), __n);
349+
__n -= __dn;
350+
__storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
351+
__storage_type __b = *__first.__seg_ & __m;
352+
unsigned __clz_r = __bits_per_word - __result.__ctz_;
353+
__storage_type __ddn = std::min<__storage_type>(__dn, __clz_r);
354+
__m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __ddn));
355+
*__result.__seg_ &= ~__m;
356+
if (__result.__ctz_ > __first.__ctz_)
357+
*__result.__seg_ |= __b << (__result.__ctz_ - __first.__ctz_);
358+
else
359+
*__result.__seg_ |= __b >> (__first.__ctz_ - __result.__ctz_);
360+
__result.__seg_ += (__ddn + __result.__ctz_) / __bits_per_word;
361+
__result.__ctz_ = static_cast<unsigned>((__ddn + __result.__ctz_) % __bits_per_word);
362+
__dn -= __ddn;
363+
if (__dn > 0) {
364+
__m = ~__storage_type(0) >> (__bits_per_word - __dn);
365+
*__result.__seg_ &= ~__m;
366+
*__result.__seg_ |= __b >> (__first.__ctz_ + __ddn);
367+
__result.__ctz_ = static_cast<unsigned>(__dn);
368+
}
369+
++__first.__seg_;
370+
// __first.__ctz_ = 0;
371+
}
372+
// __first.__ctz_ == 0;
373+
// do middle words
374+
unsigned __clz_r = __bits_per_word - __result.__ctz_;
375+
__storage_type __m = ~__storage_type(0) << __result.__ctz_;
376+
for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first.__seg_) {
377+
__storage_type __b = *__first.__seg_;
378+
*__result.__seg_ &= ~__m;
379+
*__result.__seg_ |= __b << __result.__ctz_;
380+
++__result.__seg_;
381+
*__result.__seg_ &= __m;
382+
*__result.__seg_ |= __b >> __clz_r;
383+
}
384+
// do last word
385+
if (__n > 0) {
386+
__m = ~__storage_type(0) >> (__bits_per_word - __n);
387+
__storage_type __b = *__first.__seg_ & __m;
388+
__storage_type __dn = std::min(__n, static_cast<difference_type>(__clz_r));
389+
__m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __dn));
390+
*__result.__seg_ &= ~__m;
391+
*__result.__seg_ |= __b << __result.__ctz_;
392+
__result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word;
393+
__result.__ctz_ = static_cast<unsigned>((__dn + __result.__ctz_) % __bits_per_word);
394+
__n -= __dn;
395+
if (__n > 0) {
396+
__m = ~__storage_type(0) >> (__bits_per_word - __n);
397+
*__result.__seg_ &= ~__m;
398+
*__result.__seg_ |= __b >> __dn;
399+
__result.__ctz_ = static_cast<unsigned>(__n);
400+
>>>>>>> 03028d1b8610 (Optimize ranges::copy_backward for vector<bool>::iterator)
302401
}
303402
}
304403
}
305404
return __result;
306405
}
307406

308407
template <class _Cp, bool _IsConst>
408+
<<<<<<< HEAD
309409
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator<_Cp, false> copy_backward(
310410
__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
311411
if (__last.__ctz_ == __result.__ctz_)
312412
return std::__copy_backward_aligned(__first, __last, __result);
313413
return std::__copy_backward_unaligned(__first, __last, __result);
414+
=======
415+
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __bit_iterator<_Cp, false>
416+
copy(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
417+
if (__first.__ctz_ == __result.__ctz_)
418+
return std::__copy_aligned(__first, __last, __result);
419+
return std::__copy_unaligned(__first, __last, __result);
420+
>>>>>>> 03028d1b8610 (Optimize ranges::copy_backward for vector<bool>::iterator)
314421
}
315422

316423
// move
@@ -876,9 +983,10 @@ private:
876983
template <class _Dp, bool _IC>
877984
_LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false> __copy_backward_unaligned(
878985
__bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result);
879-
template <class _Dp, bool _IC>
880-
_LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false>
881-
copy_backward(__bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result);
986+
// Note: dependent nested name specifier __copy_backward_impl<_AlgPolicy>::operator() for friend declaration
987+
// is not supported in clang. Thus, we use a friend declaration for the entire class.
988+
template <class _AlgPolicy>
989+
friend struct __copy_backward_impl;
882990
template <class _Cl, class _Cr>
883991
friend __bit_iterator<_Cr, false>
884992
__swap_ranges_aligned(__bit_iterator<_Cl, false>, __bit_iterator<_Cl, false>, __bit_iterator<_Cr, false>);

libcxx/include/__vector/vector_bool.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#define _LIBCPP___VECTOR_VECTOR_BOOL_H
1111

1212
#include <__algorithm/copy.h>
13+
#include <__algorithm/copy_backward.h>
1314
#include <__algorithm/fill_n.h>
1415
#include <__algorithm/iterator_operations.h>
1516
#include <__algorithm/max.h>

libcxx/include/bitset

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,10 @@ template <size_t N> struct hash<std::bitset<N>>;
130130
# include <__cxx03/bitset>
131131
#else
132132
# include <__algorithm/copy.h>
133+
<<<<<<< HEAD
134+
=======
135+
# include <__algorithm/copy_backward.h>
136+
>>>>>>> 03028d1b8610 (Optimize ranges::copy_backward for vector<bool>::iterator)
133137
# include <__algorithm/count.h>
134138
# include <__algorithm/fill.h>
135139
# include <__algorithm/fill_n.h>

0 commit comments

Comments
 (0)