diff --git a/benchmark/multidimensional_memset_benchmark.cpp b/benchmark/multidimensional_memset_benchmark.cpp index cee49514..5174bc22 100644 --- a/benchmark/multidimensional_memset_benchmark.cpp +++ b/benchmark/multidimensional_memset_benchmark.cpp @@ -10,38 +10,84 @@ #include #include +#include +#include namespace an = ankerl::nanobench; -extern void memset_2d_reference(double* A, std::size_t N, std::size_t M); - -extern void memset_2d_flux_cartesian_product_iota(double* A, std::size_t N, std::size_t M); +// Kernels are placed in a separate translation unit to prevent compilers from +// optimizing them based on the input that we'll be giving them and to make it +// easier to study their compiled assembly. +extern void memset_2d_reference(double* A, flux::distance_t N, flux::distance_t M); +extern void memset_2d_std_cartesian_product_iota(double* A, flux::distance_t N, flux::distance_t M); +extern void memset_2d_flux_cartesian_product_iota(double* A, flux::distance_t N, flux::distance_t M); +extern void memset_diagonal_2d_reference(double* A, flux::distance_t N, flux::distance_t M); +extern void memset_diagonal_2d_std_cartesian_product_iota_filter(double* A, flux::distance_t N, flux::distance_t M); +extern void memset_diagonal_2d_flux_cartesian_product_iota_filter(double* A, flux::distance_t N, flux::distance_t M); int main(int argc, char** argv) { int const n_iters = argc > 1 ? std::atoi(argv[1]) : 40; - constexpr std::size_t N = 1024; - constexpr std::size_t M = 2048; + constexpr flux::distance_t N = 1024; + constexpr flux::distance_t M = 2048; std::vector A(N * M); - { - auto bench = an::Bench().minEpochIterations(n_iters).relative(true); - - std::iota(A.begin(), A.end(), 0); - - bench.run("memset_2d_handwritten", - [&] { memset_2d_reference(A.data(), N, M); }); - - if (auto it = std::ranges::find_if_not(A, [&] (auto e) { return e == 0; }); it != A.end()) - throw false; - + const auto run_benchmark = + [] (auto& bench, auto& A, auto N, auto M, auto name, auto func, auto check) { std::iota(A.begin(), A.end(), 0); + bench.run(name, [&] { func(A.data(), N, M); }); + check(A, N, M); + }; - bench.run("memset_2d_flux_cartesian_product_iota", - [&] { memset_2d_flux_cartesian_product_iota(A.data(), N, M); }); + { + const auto check_2d = [] (auto& A, auto N, auto M) { + const auto it = std::ranges::find_if_not(A, [&] (auto e) { return e == 0.0; }); + if (it != A.end()) + throw false; + }; + + auto bench = an::Bench() + .minEpochIterations(n_iters) + .relative(true) + .performanceCounters(false); + + const auto run_2d_benchmark_impl = [&] (auto name, auto func) { + run_benchmark(bench, A, N, M, name, func, check_2d); + }; + + #define run_2d_benchmark(func) run_2d_benchmark_impl(#func, func) + + run_2d_benchmark(memset_2d_reference); + run_2d_benchmark(memset_2d_std_cartesian_product_iota); + run_2d_benchmark(memset_2d_flux_cartesian_product_iota); + } - if (auto it = std::ranges::find_if_not(A, [&] (auto e) { return e == 0; }); it != A.end()) - throw false; + { + const auto check_diagonal_2d = [] (auto& A, auto N, auto M) { + for (auto i : std::views::iota(0, N)) + for (auto j : std::views::iota(0, M)) { + if (i == j) { + if (A[i * M + j] != 0.0) throw false; + } else { + if (A[i * M + j] != i * M + j) throw false; + } + } + }; + + auto bench = an::Bench() + .minEpochIterations(n_iters) + .relative(true) + .performanceCounters(false); + + const auto run_diagonal_2d_benchmark_impl = [&] (auto name, auto func) { + run_benchmark(bench, A, N, M, name, func, check_diagonal_2d); + }; + + #define run_diagonal_2d_benchmark(func) run_diagonal_2d_benchmark_impl(#func, func) + + run_diagonal_2d_benchmark(memset_diagonal_2d_reference); + run_diagonal_2d_benchmark(memset_diagonal_2d_std_cartesian_product_iota_filter); + run_diagonal_2d_benchmark(memset_diagonal_2d_flux_cartesian_product_iota_filter); } } diff --git a/benchmark/multidimensional_memset_benchmark_kernels.cpp b/benchmark/multidimensional_memset_benchmark_kernels.cpp index cdfc817b..5399ab91 100644 --- a/benchmark/multidimensional_memset_benchmark_kernels.cpp +++ b/benchmark/multidimensional_memset_benchmark_kernels.cpp @@ -7,18 +7,62 @@ #include #include #include +#include -void memset_2d_reference(double* A, std::size_t N, std::size_t M) +#include "ranges_cartesian_product.hpp" + +#include +#include + +void memset_2d_reference(double* A, flux::distance_t N, flux::distance_t M) +{ + for (flux::distance_t i = 0; i != N; ++i) + for (flux::distance_t j = 0; j != M; ++j) + A[i * M + j] = 0.0; +} + +void memset_2d_std_cartesian_product_iota(double* A, flux::distance_t N, flux::distance_t M) +{ + std::ranges::for_each( + std::views::cartesian_product(std::views::iota(0, N), std::views::iota(0, M)), + flux::unpack([&] (auto i, auto j) { + A[i * M + j] = 0.0; + })); +} + +void memset_2d_flux_cartesian_product_iota(double* A, flux::distance_t N, flux::distance_t M) { - for (std::size_t j = 0; j != M; ++j) - for (std::size_t i = 0; i != N; ++i) - A[i + j * N] = 0.0; + flux::for_each( + flux::cartesian_product(flux::ints(0, N), flux::ints(0, M)), + flux::unpack([&] (auto i, auto j) { + A[i * M + j] = 0.0; + })); } -void memset_2d_flux_cartesian_product_iota(double* A, std::size_t N, std::size_t M) +void memset_diagonal_2d_reference(double* A, flux::distance_t N, flux::distance_t M) { - flux::cartesian_product(flux::iota(0LU, N), flux::iota(0LU, M)) - .for_each(flux::unpack([&] (auto i, auto j) { - A[i + j * N] = 0.0; + for (flux::distance_t i = 0; i != N; ++i) + for (flux::distance_t j = 0; j != M; ++j) + if (i == j) A[i * M + j] = 0.0; +} + +void memset_diagonal_2d_std_cartesian_product_iota_filter(double* A, flux::distance_t N, flux::distance_t M) +{ + std::ranges::for_each( + std::views::cartesian_product(std::views::iota(0, N), std::views::iota(0, M)) + | std::views::filter(flux::unpack([] (auto i, auto j) { return i == j; })), + flux::unpack([&] (auto i, auto j) { + A[i * M + j] = 0.0; })); } + +void memset_diagonal_2d_flux_cartesian_product_iota_filter(double* A, flux::distance_t N, flux::distance_t M) +{ + flux::for_each( + flux::cartesian_product(flux::ints(0, N), flux::ints(0, M)) + .filter(flux::unpack([] (auto i, auto j) { return i == j; })), + flux::unpack([&] (auto i, auto j) { + A[i * M + j] = 0.0; + })); +} + diff --git a/benchmark/ranges_cartesian_product.hpp b/benchmark/ranges_cartesian_product.hpp new file mode 100644 index 00000000..430bf529 --- /dev/null +++ b/benchmark/ranges_cartesian_product.hpp @@ -0,0 +1,423 @@ + +// Copyright (c) 2015-2017 Bryce Adelstein Lelbach +// Copyright (c) 2020-2023 Corentin Jabot +// Copyright (c) 2017-2023 NVIDIA Corporation (reply-to: brycelelbach@gmail.com) +// +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#pragma once + +#include +#include + +namespace std::ranges { + +namespace detail { + +template +constexpr bool valid_cartesian_product_pack + = std::ranges::input_range && (std::ranges::forward_range && ...); + +template +concept cartesian_product_simple_view + = std::ranges::view &&std::ranges::range + && std::same_as, std::ranges::iterator_t> + && std::same_as, std::ranges::sentinel_t>; + +} // namespace detail + +template + requires(sizeof...(Ranges) == 0) || detail::valid_cartesian_product_pack +struct cartesian_product_view + : std::ranges::view_interface> +{ +private: + std::tuple bases; + +public: + constexpr cartesian_product_view() = default; + + constexpr cartesian_product_view(Ranges... base_) + : bases(std::move(base_)...) {} + + template + struct sentinel; + + template + struct iterator + { + private: + using parent = std::conditional_t< + IsConst, cartesian_product_view const, cartesian_product_view + >; + + parent* view = nullptr; + std::tuple...> its; + + template + friend struct cartesian_product_view::sentinel; + + static constexpr auto iterator_category_impl() { + if constexpr ((std::ranges::random_access_range && ...)) + return std::random_access_iterator_tag{}; + else if constexpr ((std::ranges::bidirectional_range && ...)) + return std::bidirectional_iterator_tag{}; + else if constexpr ((std::ranges::forward_range && ...)) + return std::forward_iterator_tag{}; + else if constexpr ((std::ranges::input_range && ...)) + return std::input_iterator_tag{}; + else + return std::output_iterator_tag{}; + } + + public: + using iterator_category = decltype(iterator_category_impl()); + using reference = std::tuple...>; + using value_type = std::tuple...>; + using difference_type = std::common_type_t...>; + + constexpr iterator() = default; + constexpr explicit iterator( + parent* view_, std::ranges::iterator_t... its_ + ) + : view(view_), its(std::move(its_)...) {} + + constexpr auto operator*() const + { + return std::apply( + [&](auto const&... args) { return reference{*(args)...}; } + , its + ); + } + + constexpr iterator operator++(int) + { + if constexpr ((std::ranges::forward_range && ...)) { + auto tmp = *this; + ++*this; + return tmp; + } + ++*this; + } + + constexpr iterator& operator++() + { + next(); + return *this; + } + + constexpr iterator& operator--() + requires(std::ranges::bidirectional_range && ...) + { + prev(); + return *this; + } + + constexpr iterator operator--(int) + requires(std::ranges::bidirectional_range && ...) + { + auto tmp = *this; + --*this; + return tmp; + } + + constexpr iterator& operator+=(difference_type n) + requires(std::ranges::random_access_range && ...) + { + advance(n); + return *this; + } + + constexpr iterator &operator-=(difference_type n) + requires(std::ranges::random_access_range && ...) + { + advance(-n); + return *this; + } + + friend constexpr iterator operator+(iterator i, difference_type n) + requires(std::ranges::random_access_range && ...) + { + return {i + n}; + } + + friend constexpr iterator operator+(difference_type n, iterator i) + requires(std::ranges::random_access_range && ...) + { + return {i + n}; + } + + friend constexpr iterator operator-(iterator i, difference_type n) + requires(std::ranges::random_access_range && ...) + { + return {i - n}; + } + + friend constexpr difference_type operator-( + iterator const& x, iterator const& y + ) + requires(std::ranges::random_access_range && ...) + { + return y.distance(x); + } + + constexpr decltype(auto) operator[](difference_type n) const + requires(std::ranges::random_access_range && ...) + { + return *iterator{*this + n}; + } + + constexpr bool operator==(iterator const& other) const + { + if (at_end() && other.at_end()) + return true; + return eq(*this, other); + } + + friend constexpr auto operator<=>(iterator const& x, iterator const& y) + requires( + (std::ranges::random_access_range && ...) && + (std::three_way_comparable> && ...) + ) + { + return compare(x, y); + } + + friend constexpr bool operator==(const iterator &i, sentinel const&) + { + return i.at_end(); + } + friend constexpr bool operator==(const iterator &i, sentinel const&) + { + return i.at_end(); + } + + private: + constexpr bool at_end() const + { + auto const& v = std::get<0>(view->bases); + return std::end(v) == std::get<0>(its); + } + + template + constexpr static auto compare(iterator const& a, iterator const& b) + -> std::strong_ordering + { + auto cmp = std::get(a.its) <=> std::get(b.its); + if constexpr (N + 1 < sizeof...(Ranges)) { + if (cmp == 0) + return compare(a, b); + } + return cmp; + } + + template + constexpr static bool eq(iterator const& a, iterator const& b) + { + if (std::get(a.its) != std::get(b.its)) + return false; + if constexpr (N > 0) + return eq(a, b); + return true; + } + + template + constexpr void next() + { + const auto &v = std::get(view->bases); + auto &it = std::get(its); + if (++it == std::end(v)) { + if constexpr (N != 0) { + it = std::ranges::begin(v); + next(); + } + } + } + + template + constexpr void prev() + { + const auto &v = std::get(view->bases); + auto &it = std::get(its); + if (it == std::ranges::begin(v)) + { + std::ranges::advance(it, std::ranges::end(v)); + if constexpr (N > 0) + prev(); + } + --it; + } + + template + constexpr difference_type distance(iterator const& other) const + { + if constexpr (N == 0) { + return std::get<0>(other.its) - std::get<0>(its); + } else { + const auto d = this->distance(other); + auto const scale = std::ranges::distance(std::get(view->bases)); + auto const increment = std::get(other.its) - std::get(its); + + return difference_type{(d * scale) + increment}; + } + } + + template + void advance(difference_type n) + { + if (n == 0) + return; + + auto &i = std::get(its); + auto const size = static_cast( + std::ranges::size(std::get(view->bases)) + ); + auto const first = std::ranges::begin(std::get(view->bases)); + + auto const idx = static_cast(i - first); + n += idx; + + auto div = size ? n / size : 0; + auto mod = size ? n % size : 0; + + if constexpr (N != 0) { + if (mod < 0) { + mod += size; + div--; + } + advance(div); + } else { + if (div > 0) { + mod = size; + } + } + using D = std::iter_difference_t; + i = first + static_cast(mod); + } + }; + + template + struct sentinel + { + private: + friend iterator; + friend iterator; + + using parent = std::conditional_t< + IsConst, cartesian_product_view const, cartesian_product_view + >; + + parent* view = nullptr; + std::tuple...> end; + + public: + sentinel() = default; + + constexpr explicit sentinel( + parent* view_, std::ranges::sentinel_t... end_ + ) + : view(view_), end(std::move(end_)...) { + } + + constexpr sentinel(sentinel other) + requires( + IsConst + && (std::convertible_to< + std::ranges::sentinel_t + , std::ranges::sentinel_t + > && ...) + ) + : view(other.view_), end(other.end_) {} + }; + +public: + constexpr auto size() const + requires(std::ranges::sized_range && ...) + { + return std::apply( + [] (Args const&... args) + { + using Size = std::common_type_t...>; + return (Size(std::ranges::size(args)) * ...); + } + , bases); + } + + constexpr auto begin() + requires(!detail::cartesian_product_simple_view || ...) + { + return std::apply( + [&] (auto&... args) + { + using std::ranges::begin; + return iterator{this, begin(args)...}; + } + , bases + ); + } + + constexpr auto begin() const + requires(detail::cartesian_product_simple_view && ...) + { + return std::apply( + [&] (auto const&... args) + { + using std::ranges::begin; + return iterator{this, begin(args)...}; + } + , bases + ); + } + + constexpr auto end() const + requires(std::ranges::common_range && ...) + { + return std::apply( + [&] (auto const& first, auto const&... args) + { + using std::ranges::end; + using std::ranges::begin; + return iterator(this, end(first), begin(args)...); + }, + bases + ); + } + + constexpr auto end() const + requires(!std::ranges::common_range || ...) + { + return std::apply( + [&] (auto const&... args) { + return sentinel(this, std::end(args)...); + } + , bases + ); + } +}; + +template +cartesian_product_view(Ranges&&...) -> + cartesian_product_view...>; + +namespace detail { + +struct cartesian_product_fn +{ + template + constexpr auto operator()(Ranges&&... ranges) const + { + return cartesian_product_view((Ranges&&)ranges...); + } +}; + +} // namespace detail + +namespace views { + +inline constexpr detail::cartesian_product_fn cartesian_product{}; + +} // namespace views + +} // namespace std::ranges +