-
Notifications
You must be signed in to change notification settings - Fork 13.3k
[libc++] Optimize std::for_each_n for segmented iterators #135468
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
9b95e81
to
2c5d9e3
Compare
2c5d9e3
to
5b6a846
Compare
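@llvm/pr-subscribers-libcxx
Author: Peng Liu (winner245)
Changes: This patch enhances the performance of std::for_each_n when used with segmented iterators, leading to significant performance improvements, summarized in the tables below (the first table is for std::deque iterators).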
|
Benchmark | deque<char> | deque<short> | deque<int> |
---|---|---|---|
std::for_each_n | 13.1x | 17.7x | 3.7x |
std::join_view
iterators
Benchmark | vector<vector<char>> |
vector<vector<short>> |
vector<vector<int>> |
---|---|---|---|
std::for_each_n | 11.8x | 13.9x | 3.1x |
which results in performance improvements up to 17.7x for std::deque<short>
iterators, and up to 13.9x for join_view<vector<vector<short>>>
iterators.
Detailed Benchmarks
std::deque
iterators
--------------------------------------------------------------------------
Benchmark Before After Speedup
--------------------------------------------------------------------------
std::for_each_n(vector<char>)/8 4.26 ns 4.23 ns 1.0x
std::for_each_n(vector<char>)/32 2.68 ns 2.67 ns 1.0x
std::for_each_n(vector<char>)/50 9.49 ns 9.36 ns 1.0x
std::for_each_n(vector<char>)/1024 42.3 ns 40.1 ns 1.1x
std::for_each_n(vector<char>)/4096 163 ns 151 ns 1.1x
std::for_each_n(vector<char>)/8192 308 ns 294 ns 1.0x
std::for_each_n(vector<char>)/16384 608 ns 593 ns 1.0x
std::for_each_n(vector<char>)/65536 2435 ns 2464 ns 1.0x
std::for_each_n(vector<char>)/262144 10029 ns 10190 ns 1.0x
std::for_each_n(deque<char>)/8 6.57 ns 2.43 ns 2.7x
std::for_each_n(deque<char>)/32 24.0 ns 2.73 ns 8.8x
std::for_each_n(deque<char>)/50 33.2 ns 4.53 ns 7.3x
std::for_each_n(deque<char>)/1024 541 ns 44.9 ns 12.0x
std::for_each_n(deque<char>)/4096 2067 ns 169 ns 12.2x
std::for_each_n(deque<char>)/8192 4005 ns 305 ns 13.1x
std::for_each_n(deque<char>)/16384 7831 ns 639 ns 12.3x
std::for_each_n(deque<char>)/65536 31819 ns 2717 ns 11.7x
std::for_each_n(deque<char>)/262144 120801 ns 10674 ns 11.3x
std::for_each_n(list<char>)/8 4.97 ns 5.16 ns 1.0x
std::for_each_n(list<char>)/32 19.9 ns 20.6 ns 1.0x
std::for_each_n(list<char>)/50 40.6 ns 42.7 ns 1.0x
std::for_each_n(list<char>)/1024 996 ns 1038 ns 1.0x
std::for_each_n(list<char>)/4096 6186 ns 6341 ns 1.0x
std::for_each_n(list<char>)/8192 12522 ns 12391 ns 1.0x
std::for_each_n(list<char>)/16384 26158 ns 25739 ns 1.0x
std::for_each_n(list<char>)/65536 106410 ns 105299 ns 1.0x
std::for_each_n(list<char>)/262144 621473 ns 625741 ns 1.0x
std::for_each_n(vector<short>)/8 4.42 ns 3.92 ns 1.1x
std::for_each_n(vector<short>)/32 1.62 ns 1.64 ns 1.0x
std::for_each_n(vector<short>)/50 2.74 ns 2.75 ns 1.0x
std::for_each_n(vector<short>)/1024 34.0 ns 33.6 ns 1.0x
std::for_each_n(vector<short>)/4096 120 ns 117 ns 1.0x
std::for_each_n(vector<short>)/8192 229 ns 267 ns 0.9x
std::for_each_n(vector<short>)/16384 452 ns 469 ns 1.0x
std::for_each_n(vector<short>)/65536 2262 ns 2265 ns 1.0x
std::for_each_n(vector<short>)/262144 9129 ns 9140 ns 1.0x
std::for_each_n(deque<short>)/8 5.28 ns 1.78 ns 3.0x
std::for_each_n(deque<short>)/32 22.8 ns 2.08 ns 11.0x
std::for_each_n(deque<short>)/50 32.3 ns 4.46 ns 7.2x
std::for_each_n(deque<short>)/1024 545 ns 35.2 ns 15.5x
std::for_each_n(deque<short>)/4096 2158 ns 128 ns 16.9x
std::for_each_n(deque<short>)/8192 4303 ns 243 ns 17.7x
std::for_each_n(deque<short>)/16384 8624 ns 516 ns 16.7x
std::for_each_n(deque<short>)/65536 34569 ns 2336 ns 14.8x
std::for_each_n(deque<short>)/262144 137820 ns 9319 ns 14.8x
std::for_each_n(list<short>)/8 4.66 ns 4.95 ns 0.9x
std::for_each_n(list<short>)/32 19.9 ns 20.4 ns 1.0x
std::for_each_n(list<short>)/50 41.3 ns 41.1 ns 1.0x
std::for_each_n(list<short>)/1024 1018 ns 1021 ns 1.0x
std::for_each_n(list<short>)/4096 6110 ns 6294 ns 1.0x
std::for_each_n(list<short>)/8192 12433 ns 12692 ns 1.0x
std::for_each_n(list<short>)/16384 24739 ns 24820 ns 1.0x
std::for_each_n(list<short>)/65536 103376 ns 102812 ns 1.0x
std::for_each_n(list<short>)/262144 538314 ns 555664 ns 1.0x
std::for_each_n(vector<int>)/8 2.78 ns 2.73 ns 1.0x
std::for_each_n(vector<int>)/32 5.22 ns 5.26 ns 1.0x
std::for_each_n(vector<int>)/50 8.20 ns 8.65 ns 0.9x
std::for_each_n(vector<int>)/1024 156 ns 175 ns 0.9x
std::for_each_n(vector<int>)/4096 602 ns 758 ns 0.8x
std::for_each_n(vector<int>)/8192 1214 ns 1393 ns 0.9x
std::for_each_n(vector<int>)/16384 2417 ns 2690 ns 0.9x
std::for_each_n(vector<int>)/65536 9989 ns 10703 ns 0.9x
std::for_each_n(vector<int>)/262144 41512 ns 43798 ns 0.9x
std::for_each_n(deque<int>)/8 5.04 ns 2.75 ns 1.8x
std::for_each_n(deque<int>)/32 19.1 ns 5.56 ns 3.4x
std::for_each_n(deque<int>)/50 30.6 ns 8.55 ns 3.6x
std::for_each_n(deque<int>)/1024 567 ns 152 ns 3.7x
std::for_each_n(deque<int>)/4096 2241 ns 657 ns 3.4x
std::for_each_n(deque<int>)/8192 4512 ns 1334 ns 3.4x
std::for_each_n(deque<int>)/16384 9066 ns 2701 ns 3.4x
std::for_each_n(deque<int>)/65536 35955 ns 10887 ns 3.3x
std::for_each_n(deque<int>)/262144 146489 ns 44361 ns 3.3x
std::for_each_n(list<int>)/8 4.68 ns 6.05 ns 0.8x
std::for_each_n(list<int>)/32 21.0 ns 21.9 ns 1.0x
std::for_each_n(list<int>)/50 43.0 ns 42.2 ns 1.0x
std::for_each_n(list<int>)/1024 1015 ns 1035 ns 1.0x
std::for_each_n(list<int>)/4096 6373 ns 6331 ns 1.0x
std::for_each_n(list<int>)/8192 12757 ns 12836 ns 1.0x
std::for_each_n(list<int>)/16384 24879 ns 25035 ns 1.0x
std::for_each_n(list<int>)/65536 103931 ns 103773 ns 1.0x
std::for_each_n(list<int>)/262144 536841 ns 555330 ns 1.0x
--------------------------------------------------------------------------
std::join_view
iterators
-----------------------------------------------------------------------------------------------------
Benchmark Before After Speedup
-----------------------------------------------------------------------------------------------------
std::for_each_n(join_view(vector<vector<char>>))/8 5.83 ns 2.56 ns 2.3x
std::for_each_n(join_view(vector<vector<char>>))/32 22.8 ns 3.12 ns 7.3x
std::for_each_n(join_view(vector<vector<char>>))/50 32.3 ns 5.37 ns 6.0x
std::for_each_n(join_view(vector<vector<char>>))/1024 477 ns 45.1 ns 10.6x
std::for_each_n(join_view(vector<vector<char>>))/4096 1898 ns 161 ns 11.8x
std::for_each_n(join_view(vector<vector<char>>))/8192 3785 ns 332 ns 11.4x
std::for_each_n(join_view(vector<vector<char>>))/16384 7530 ns 646 ns 11.7x
std::for_each_n(join_view(vector<vector<char>>))/65536 30685 ns 2670 ns 11.5x
std::for_each_n(join_view(vector<vector<char>>))/262144 122600 ns 10539 ns 11.6x
std::for_each_n(join_view(vector<vector<short>>))/8 6.14 ns 2.83 ns 2.2x
std::for_each_n(join_view(vector<vector<short>>))/32 25.2 ns 3.28 ns 7.7x
std::for_each_n(join_view(vector<vector<short>>))/50 33.7 ns 5.02 ns 6.7x
std::for_each_n(join_view(vector<vector<short>>))/1024 487 ns 38.6 ns 12.6x
std::for_each_n(join_view(vector<vector<short>>))/4096 1925 ns 150 ns 12.8x
std::for_each_n(join_view(vector<vector<short>>))/8192 3863 ns 291 ns 13.3x
std::for_each_n(join_view(vector<vector<short>>))/16384 7779 ns 558 ns 13.9x
std::for_each_n(join_view(vector<vector<short>>))/65536 30656 ns 2634 ns 11.6x
std::for_each_n(join_view(vector<vector<short>>))/262144 124937 ns 10667 ns 11.7x
std::for_each_n(join_view(vector<vector<int>>))/8 5.97 ns 3.45 ns 1.7x
std::for_each_n(join_view(vector<vector<int>>))/32 22.5 ns 7.20 ns 3.1x
std::for_each_n(join_view(vector<vector<int>>))/50 30.7 ns 10.7 ns 2.9x
std::for_each_n(join_view(vector<vector<int>>))/1024 491 ns 191 ns 2.6x
std::for_each_n(join_view(vector<vector<int>>))/4096 1928 ns 731 ns 2.6x
std::for_each_n(join_view(vector<vector<int>>))/8192 3874 ns 1402 ns 2.8x
std::for_each_n(join_view(vector<vector<int>>))/16384 7818 ns 2852 ns 2.7x
std::for_each_n(join_view(vector<vector<int>>))/65536 31101 ns 11439 ns 2.7x
std::for_each_n(join_view(vector<vector<int>>))/262144 126378 ns 45348 ns 2.8x
-----------------------------------------------------------------------------------------------------
Full diff: https://github.com/llvm/llvm-project/pull/135468.diff
8 Files Affected:
- (modified) libcxx/docs/ReleaseNotes/21.rst (+3)
- (modified) libcxx/include/CMakeLists.txt (+1)
- (modified) libcxx/include/__algorithm/for_each.h (-1)
- (modified) libcxx/include/__algorithm/for_each_n.h (+63-7)
- (added) libcxx/include/__algorithm/for_each_n_segment.h (+63)
- (modified) libcxx/include/module.modulemap.in (+1)
- (added) libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp (+98)
- (modified) libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/for_each_n.pass.cpp (+89-40)
diff --git a/libcxx/docs/ReleaseNotes/21.rst b/libcxx/docs/ReleaseNotes/21.rst
index a7382c5222d08..3ea9f17418447 100644
--- a/libcxx/docs/ReleaseNotes/21.rst
+++ b/libcxx/docs/ReleaseNotes/21.rst
@@ -70,6 +70,9 @@ Improvements and New Features
- The segmented iterator optimization for ``std::for_each`` has been backported to C++11. Previously it was only available
in C++23 and later.
+- The ``std::for_each_n`` algorithm has been optimized for segmented iterators, resulting in a performance improvement of
+ up to 17.7x for ``std::deque<short>`` iterators, and up to 13.9x for ``std::ranges::join_view<vector<vector<short>>>`` iterators.
+
Deprecations and Removals
-------------------------
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index f1bdf684a8549..b6de4b1800dff 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -25,6 +25,7 @@ set(files
__algorithm/find_segment_if.h
__algorithm/for_each.h
__algorithm/for_each_n.h
+ __algorithm/for_each_n_segment.h
__algorithm/for_each_segment.h
__algorithm/generate.h
__algorithm/generate_n.h
diff --git a/libcxx/include/__algorithm/for_each.h b/libcxx/include/__algorithm/for_each.h
index b6c2c7c056edd..0b14d8c219931 100644
--- a/libcxx/include/__algorithm/for_each.h
+++ b/libcxx/include/__algorithm/for_each.h
@@ -9,7 +9,6 @@
#ifndef _LIBCPP___ALGORITHM_FOR_EACH_H
#define _LIBCPP___ALGORITHM_FOR_EACH_H
-
#include <__algorithm/for_each_segment.h>
#include <__config>
#include <__iterator/segmented_iterator.h>
diff --git a/libcxx/include/__algorithm/for_each_n.h b/libcxx/include/__algorithm/for_each_n.h
index fce380b49df3e..12b8d1810685d 100644
--- a/libcxx/include/__algorithm/for_each_n.h
+++ b/libcxx/include/__algorithm/for_each_n.h
@@ -10,20 +10,36 @@
#ifndef _LIBCPP___ALGORITHM_FOR_EACH_N_H
#define _LIBCPP___ALGORITHM_FOR_EACH_N_H
+#include <__algorithm/for_each.h>
+#include <__algorithm/for_each_n_segment.h>
#include <__config>
+#include <__iterator/iterator_traits.h>
+#include <__iterator/segmented_iterator.h>
+#include <__type_traits/enable_if.h>
#include <__utility/convert_to_integral.h>
+#include <__utility/move.h>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
#endif
-_LIBCPP_BEGIN_NAMESPACE_STD
+_LIBCPP_PUSH_MACROS
+#include <__undef_macros>
-#if _LIBCPP_STD_VER >= 17
+_LIBCPP_BEGIN_NAMESPACE_STD
-template <class _InputIterator, class _Size, class _Function>
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
-for_each_n(_InputIterator __first, _Size __orig_n, _Function __f) {
+template <class _InputIterator,
+ class _Size,
+ class _Func,
+ __enable_if_t<!__has_random_access_iterator_category<_InputIterator>::value &&
+ (!__is_segmented_iterator<_InputIterator>::value
+ // || !__has_random_access_iterator_category<
+ // typename __segmented_iterator_traits<_InputIterator>::__local_iterator>::value
+ ), // TODO: __segmented_iterator_traits<_InputIterator> results in template instantiation
+ // during SFINAE, which is a hard error to be fixed. Once fixed, we should uncomment.
+ int> = 0>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
+__for_each_n(_InputIterator __first, _Size __orig_n, _Func& __f) {
typedef decltype(std::__convert_to_integral(__orig_n)) _IntegralSize;
_IntegralSize __n = __orig_n;
while (__n > 0) {
@@ -31,11 +47,51 @@ for_each_n(_InputIterator __first, _Size __orig_n, _Function __f) {
++__first;
--__n;
}
- return __first;
+ return std::move(__first);
}
-#endif
+template <class _RandIter, // Overload for random-access iterators: the end of the range is computed directly.
+ class _Size,
+ class _Func,
+ __enable_if_t<__has_random_access_iterator_category<_RandIter>::value, int> = 0>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _RandIter
+__for_each_n(_RandIter __first, _Size __orig_n, _Func& __f) { // __f is taken by reference and passed on as-is
+ typename std::iterator_traits<_RandIter>::difference_type __n = __orig_n; // convert the count to the iterator's difference_type
+ auto __last = __first + __n; // NOTE(review): no check for a negative __orig_n here, unlike the `__n > 0` loop of the input-iterator overload
+ std::__for_each(__first, __last, __f); // delegate the traversal of [__first, __last) to std::__for_each
+ return std::move(__last); // iterator one past the last element visited
+}
+
+#ifndef _LIBCPP_CXX03_LANG
+template <class _SegmentedIterator, // Overload for segmented iterators that are not themselves random-access...
+ class _Size,
+ class _Func,
+ __enable_if_t<!__has_random_access_iterator_category<_SegmentedIterator>::value &&
+ __is_segmented_iterator<_SegmentedIterator>::value &&
+ __has_random_access_iterator_category< // ...and whose local (per-segment) iterator is random-access
+ typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator>::value,
+ int> = 0>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _SegmentedIterator
+__for_each_n(_SegmentedIterator __first, _Size __orig_n, _Func& __f) {
+ using __local_iterator_t = typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator;
+ return std::__for_each_n_segment(__first, __orig_n, [&](__local_iterator_t __lfirst, __local_iterator_t __llast) { // callback receives one local sub-range per call
+ std::__for_each(__lfirst, __llast, __f); // __f is captured by reference, so the same functor object is used for every sub-range
+ });
+}
+#endif // !_LIBCPP_CXX03_LANG
+
+#if _LIBCPP_STD_VER >= 17
+
+template <class _InputIterator, class _Size, class _Function> // Public std::for_each_n; the implementation is chosen by overload resolution on std::__for_each_n.
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _InputIterator
+for_each_n(_InputIterator __first, _Size __orig_n, _Function __f) { // __f is taken by value here...
+ return std::__for_each_n(__first, __orig_n, __f); // ...and passed on as an lvalue, matching the _Func& parameter of the helpers
+}
+
+#endif // _LIBCPP_STD_VER >= 17
_LIBCPP_END_NAMESPACE_STD
+_LIBCPP_POP_MACROS
+
#endif // _LIBCPP___ALGORITHM_FOR_EACH_N_H
diff --git a/libcxx/include/__algorithm/for_each_n_segment.h b/libcxx/include/__algorithm/for_each_n_segment.h
new file mode 100644
index 0000000000000..1b522fb373eee
--- /dev/null
+++ b/libcxx/include/__algorithm/for_each_n_segment.h
@@ -0,0 +1,63 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_FOR_EACH_N_SEGMENT_H
+#define _LIBCPP___ALGORITHM_FOR_EACH_N_SEGMENT_H
+
+#include <__config>
+#include <__iterator/iterator_traits.h>
+#include <__iterator/segmented_iterator.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+# pragma GCC system_header
+#endif
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+// __for_each_n_segment walks the first __orig_n elements of a segmented range starting at __first, one segment at a
+// time. For every segment it touches, it calls __func(__lfirst, __llast) once with the local-iterator sub-range that
+// lies inside [__first, __first + __orig_n); the return value of __func is ignored. If __orig_n <= 0, __first is
+// returned unchanged; otherwise the result is composed from the last segment visited and the end of its sub-range.
+
+template <class _SegmentedIterator, class _Size, class _Functor>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _SegmentedIterator
+__for_each_n_segment(_SegmentedIterator __first, _Size __orig_n, _Functor __func) {
+ static_assert(__is_segmented_iterator<_SegmentedIterator>::value &&
+ __has_random_access_iterator_category<
+ typename __segmented_iterator_traits<_SegmentedIterator>::__local_iterator>::value,
+ "__for_each_n_segment only works with segmented iterators with random-access local iterators");
+ if (__orig_n <= 0) // nothing to visit
+ return __first;
+
+ using _Traits = __segmented_iterator_traits<_SegmentedIterator>;
+ using __local_iter_t = typename _Traits::__local_iterator;
+ using __difference_t = typename std::iterator_traits<__local_iter_t>::difference_type;
+ __difference_t __n = __orig_n; // number of elements still to visit
+ auto __seg = _Traits::__segment(__first);
+ auto __local_first = _Traits::__local(__first); // position of __first inside its segment
+ __local_iter_t __local_last; // assigned at the top of every loop iteration before it is read
+
+ while (__n > 0) {
+ __local_last = _Traits::__end(__seg);
+ auto __seg_size = __local_last - __local_first; // elements from the current position to the end of this segment
+ if (__n <= __seg_size) { // the remaining elements all lie in the current segment
+ __local_last = __local_first + __n; // stop after exactly __n elements
+ __func(__local_first, __local_last);
+ break;
+ }
+ __func(__local_first, __local_last); // consume the rest of this segment
+ __n -= __seg_size;
+ __local_first = _Traits::__begin(++__seg); // continue at the start of the next segment
+ }
+
+ return _Traits::__compose(__seg, __local_last); // rebuild a segmented iterator from the final segment and local position
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP___ALGORITHM_FOR_EACH_N_SEGMENT_H
diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in
index af928a63f2315..8e8b7a6f400d9 100644
--- a/libcxx/include/module.modulemap.in
+++ b/libcxx/include/module.modulemap.in
@@ -437,6 +437,7 @@ module std [system] {
module find_segment_if { header "__algorithm/find_segment_if.h" }
module find { header "__algorithm/find.h" }
module for_each_n { header "__algorithm/for_each_n.h" }
+ module for_each_n_segment { header "__algorithm/for_each_n_segment.h" }
module for_each_segment { header "__algorithm/for_each_segment.h" }
module for_each { header "__algorithm/for_each.h" }
module generate_n { header "__algorithm/generate_n.h" }
diff --git a/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp b/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp
new file mode 100644
index 0000000000000..784708c7e01eb
--- /dev/null
+++ b/libcxx/test/benchmarks/algorithms/nonmodifying/for_each_n.bench.cpp
@@ -0,0 +1,98 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+#include <algorithm>
+#include <cstddef>
+#include <deque>
+#include <list>
+#include <ranges>
+#include <string>
+#include <vector>
+
+#include <benchmark/benchmark.h>
+
+int main(int argc, char** argv) {
+ auto std_for_each_n = [](auto first, auto n, auto f) { return std::for_each_n(first, n, f); }; // generic wrapper so the algorithm can be passed around as a value
+
+ // std::for_each_n over a single container; only the int instantiations are registered below
+ {
+ auto bm = []<class Container>(std::string name, auto for_each_n) {
+ using ElemType = typename Container::value_type;
+ benchmark::RegisterBenchmark(
+ name,
+ [for_each_n](auto& st) {
+ std::size_t const n = st.range(0); // element count for this run, taken from the registered ->Arg(...) values
+ Container c(n, 1); // n elements, each initialized to 1
+ auto first = c.begin();
+
+ for ([[maybe_unused]] auto _ : st) {
+ benchmark::DoNotOptimize(c);
+ auto result = for_each_n(first, n, [](ElemType& x) { x = std::clamp<ElemType>(x, 10, 100); }); // clamps every element into [10, 100]
+ benchmark::DoNotOptimize(result);
+ }
+ })
+ ->Arg(8)
+ ->Arg(32)
+ ->Arg(50) // non power-of-two
+ ->Arg(1024)
+ ->Arg(4096)
+ ->Arg(8192)
+ ->Arg(1 << 14)
+ ->Arg(1 << 16)
+ ->Arg(1 << 18);
+ };
+ bm.operator()<std::vector<int>>("std::for_each_n(vector<int>)", std_for_each_n);
+ bm.operator()<std::deque<int>>("std::for_each_n(deque<int>)", std_for_each_n);
+ bm.operator()<std::list<int>>("std::for_each_n(list<int>)", std_for_each_n);
+ }
+
+ // std::for_each_n over a join_view built from a container of containers
+ {
+ auto bm = []<class Container>(std::string name, auto for_each_n) {
+ using C1 = typename Container::value_type; // inner container type
+ using ElemType = typename C1::value_type;
+ benchmark::RegisterBenchmark(
+ name,
+ [for_each_n](auto& st) {
+ std::size_t const size = st.range(0); // total number of elements across all inner containers
+ std::size_t const seg_size = 256; // maximum number of elements per inner container
+ std::size_t const segments = (size + seg_size - 1) / seg_size; // size / seg_size, rounded up
+ Container c(segments);
+ for (std::size_t i = 0, n = size; i < segments; ++i, n -= seg_size) { // n may wrap around after the final pass, but it is not read again
+ c[i].resize(std::min(seg_size, n), ElemType(1)); // full segments first, then a possibly shorter last one
+ }
+
+ auto view = c | std::views::join;
+ auto first = view.begin();
+
+ for ([[maybe_unused]] auto _ : st) {
+ benchmark::DoNotOptimize(c);
+ auto result = for_each_n(first, size, [](ElemType& x) { x = std::clamp<ElemType>(x, 10, 100); }); // same per-element work as the single-container benchmark
+ benchmark::DoNotOptimize(result);
+ }
+ })
+ ->Arg(8)
+ ->Arg(32)
+ ->Arg(50) // non power-of-two
+ ->Arg(1024)
+ ->Arg(4096)
+ ->Arg(8192)
+ ->Arg(1 << 14)
+ ->Arg(1 << 16)
+ ->Arg(1 << 18);
+ };
+ bm.operator()<std::vector<std::vector<int>>>("std::for_each_n(join_view(vector<vector<int>>))", std_for_each_n);
+ }
+
+ benchmark::Initialize(&argc, argv);
+ benchmark::RunSpecifiedBenchmarks();
+ benchmark::Shutdown();
+ return 0;
+}
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/for_each_n.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/for_each_n.pass.cpp
index 371f6c92f1ed1..39c1174dcec8b 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/for_each_n.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/for_each_n.pass.cpp
@@ -13,69 +13,118 @@
// constexpr InputIterator // constexpr after C++17
// for_each_n(InputIterator first, Size n, Function f);
-
#include <algorithm>
#include <cassert>
+#include <deque>
#include <functional>
+#include <iterator>
+#include <ranges>
+#include <vector>
#include "test_macros.h"
#include "test_iterators.h"
-#if TEST_STD_VER > 17
-TEST_CONSTEXPR bool test_constexpr() {
- int ia[] = {1, 3, 6, 7};
- int expected[] = {3, 5, 8, 9};
- const std::size_t N = 4;
+struct for_each_test {
+ TEST_CONSTEXPR for_each_test(int c) : count(c) {}
+ int count;
+ TEST_CONSTEXPR_CXX14 void operator()(int& i) {
+ ++i;
+ ++count;
+ }
+};
- auto it = std::for_each_n(std::begin(ia), N, [](int &a) { a += 2; });
- return it == (std::begin(ia) + N)
- && std::equal(std::begin(ia), std::end(ia), std::begin(expected))
- ;
- }
-#endif
+struct deque_test {
+ std::deque<int>* d_;
+ int* i_;
+
+ deque_test(std::deque<int>& d, int& i) : d_(&d), i_(&i) {}
-struct for_each_test
-{
- for_each_test(int c) : count(c) {}
- int count;
- void operator()(int& i) {++i; ++count;}
+ void operator()(int& v) {
+ assert(&(*d_)[*i_] == &v);
+ ++*i_;
+ }
};
-int main(int, char**)
-{
+/*TEST_CONSTEXPR_CXX26*/
+void test_segmented_deque_iterator() { // TODO: Mark as TEST_CONSTEXPR_CXX26 once std::deque is constexpr
+ // Runs std::for_each_n over whole deques of several sizes; deque_test asserts that elements arrive in index order.
+ int sizes[] = {0, 1, 2, 1023, 1024, 1025, 2047, 2048, 2049}; // presumably chosen to straddle deque block boundaries — TODO confirm against the block size
+ for (const int size : sizes) {
+ std::deque<int> d(size);
+ int index = 0; // advanced by deque_test on every call
+
+ std::for_each_n(d.begin(), d.size(), deque_test(d, index)); // NOTE(review): neither the returned iterator nor the final value of index is asserted
+ }
+}
+
+TEST_CONSTEXPR_CXX20 bool test() {
+ {
typedef cpp17_input_iterator<int*> Iter;
- int ia[] = {0, 1, 2, 3, 4, 5};
- const unsigned s = sizeof(ia)/sizeof(ia[0]);
+ int ia[] = {0, 1, 2, 3, 4, 5};
+ const unsigned s = sizeof(ia) / sizeof(ia[0]);
{
- auto f = for_each_test(0);
- Iter it = std::for_each_n(Iter(ia), 0, std::ref(f));
- assert(it == Iter(ia));
- assert(f.count == 0);
+ unsigned count = 0;
+ Iter it = std::for_each_n(Iter(ia), 0, [&count](int& i) {
+ ++i;
+ ++count;
+ });
+ assert(it == Iter(ia));
+ assert(count == 0);
}
{
- auto f = for_each_test(0);
- Iter it = std::for_each_n(Iter(ia), s, std::ref(f));
-
- assert(it == Iter(ia+s));
- assert(f.count == s);
- for (unsigned i = 0; i < s; ++i)
- assert(ia[i] == static_cast<int>(i+1));
+ unsigned count = 0;
+ Iter it = std::for_each_n(Iter(ia), s, [&count](int& i) {
+ ++i;
+ ++count;
+ });
+ assert(it == Iter(ia + s));
+ assert(count == s);
+ for (unsigned i = 0; i < s; ++i)
+ assert(ia[i] == static_cast<int>(i + 1));
}
{
- auto f = for_each_test(0);
- Iter it = std::for_each_n(Iter(ia), 1, std::ref(f));
-
- assert(it == Iter(ia+1));
- assert(f.count == 1);
- for (unsigned i = 0; i < 1; ++i)
- assert(ia[i] == static_cast<int>(i+2));
+ unsigned count = 0;
+ Iter it = std::for_each_n(Iter(ia), 1, [&count](int& i) {
+ ++i;
+ ++count;
+ });
+ assert(it == Iter(ia + 1));
+ assert(count == 1);
+ for (unsigned i = 0; i < 1; ++i)
+ assert(ia[i] == static_cast<int>(i + 2));
}
+ }
+
+ {
+ int ia[] = {1, 3, 6, 7};
+ int expected[] = {3, 5, 8, 9};
+ const std::size_t N = 4;
+
+ auto it = std::for_each_n(std::begin(ia), N, [](int& a) { a += 2; });
+ assert(it == (std::begin(ia) + N) && std::equal(std::begin(ia), std::end(ia), std::begin(expected)));
+ }
+
+ if (!TEST_IS_CONSTANT_EVALUATED) // TODO: Use TEST_STD_AT_LEAST_26_OR_RUNTIME_EVALUATED when std::deque is made constexpr
+ test_segmented_deque_iterator();
+
+#if TEST_STD_VER >= 20
+ {
+ std::vector<std::vector<int>> vec = {{0}, {1, 2}, {3, 4, 5}, {6, 7, 8, 9}, {10}, {11, 12, 13}};
+ auto v = vec | std::views::join;
+ std::for_each_n(v.begin(), std::ranges::distance(v), [i = 0](int& a) mutable { assert(a == i++); });
+ }
+#endif
+
+ return true;
+}
+int main(int, char**) {
+ assert(test());
#if TEST_STD_VER > 17
- static_assert(test_constexpr());
+ static_assert(test());
#endif
return 0;
This patch enhances the performance of std::for_each_n when used with segmented iterators, leading to significant performance improvements, summarized in the tables below. This addresses a subtask of #102817.

std::deque iterators

std::join_view iterators (vector<vector<char>>, vector<vector<short>>, vector<vector<int>>)

which results in performance improvements up to 17.7x for std::deque<short> iterators, and up to 13.9x for join_view<vector<vector<short>>> iterators.

Detailed Benchmarks

std::deque iterators

std::join_view iterators