Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add groupby_max multi-threaded benchmark #16154

Merged
4 changes: 2 additions & 2 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -226,8 +226,8 @@ ConfigureBench(
)

ConfigureNVBench(
GROUPBY_NVBENCH groupby/group_max.cpp groupby/group_nunique.cpp groupby/group_rank.cpp
groupby/group_struct_keys.cpp
GROUPBY_NVBENCH groupby/group_max.cpp groupby/group_max_multistream.cpp groupby/group_max_multithreaded.cpp
groupby/group_nunique.cpp groupby/group_rank.cpp groupby/group_struct_keys.cpp
)

# ##################################################################################################
Expand Down
103 changes: 103 additions & 0 deletions cpp/benchmarks/groupby/group_max_multistream.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>

#include <cudf/detail/utilities/stream_pool.hpp>
#include <cudf/groupby.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/pinned_memory.hpp>
#include <cudf/utilities/thread_pool.hpp>

#include <nvbench/nvbench.cuh>

/**
 * @brief Helper that launches the same groupby-max aggregation on several CUDA
 * streams concurrently and reports the wall time for all of them to finish.
 *
 * @tparam Type Value-column element type being aggregated
 * @param state nvbench state; provides the `num_streams` axis and collects results
 * @param num_rows Number of rows in the generated key and value columns
 * @param cardinality Requested key cardinality (0 = uncapped random keys)
 * @param null_probability Probability of a null in the value column (0 = no null mask)
 */
template <typename Type>
void groupby_max_multistream_helper(nvbench::state& state,
                                    cudf::size_type num_rows,
                                    cudf::size_type cardinality,
                                    double null_probability)
{
  // Random int32 keys with the requested cardinality and no null mask.
  auto const keys = [&] {
    data_profile const profile =
      data_profile_builder()
        .cardinality(cardinality)
        .no_validity()
        .distribution(cudf::type_to_id<int32_t>(), distribution_id::UNIFORM, 0, num_rows);
    return create_random_column(cudf::type_to_id<int32_t>(), row_count{num_rows}, profile);
  }();

  // Random values; attach a null mask only when nulls were requested.
  auto const vals = [&] {
    auto builder = data_profile_builder().cardinality(0).distribution(
      cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, num_rows);
    if (null_probability > 0) {
      builder.null_probability(null_probability);
    } else {
      builder.no_validity();
    }
    return create_random_column(cudf::type_to_id<Type>(), row_count{num_rows}, data_profile{builder});
  }();

  auto keys_view = keys->view();
  auto gb_obj    = cudf::groupby::groupby(cudf::table_view({keys_view, keys_view, keys_view}));

  auto const num_streams = state.get_int64("num_streams");

  auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_streams);

  // One independent aggregation request per stream, all reading the same values column.
  std::vector<std::vector<cudf::groupby::aggregation_request>> requests(num_streams);
  for (int64_t i = 0; i < num_streams; i++) {
    requests[i].emplace_back(cudf::groupby::aggregation_request());
    requests[i][0].values = vals->view();
    requests[i][0].aggregations.push_back(cudf::make_max_aggregation<cudf::groupby_aggregation>());
  }

  auto const mem_stats_logger = cudf::memory_stats_logger();
  state.exec(
    nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) {
      auto perform_agg = [&](int64_t index) { gb_obj.aggregate(requests[index], streams[index]); };
      timer.start();
      // Issue one aggregation per stream; work on distinct streams may overlap on the GPU.
      // Use int64_t to match num_streams' type (the original mixed int with int64_t here).
      for (int64_t i = 0; i < num_streams; i++) {
        perform_agg(i);
      }
      // Block until every forked stream has drained before stopping the timer.
      cudf::detail::join_streams(streams, cudf::get_default_stream());
      timer.stop();
    });

  auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
  // NOTE(review): throughput counts a single aggregation's rows; with num_streams
  // concurrent aggregations the total work is num_rows * num_streams — confirm
  // which convention is intended before comparing across num_streams values.
  state.add_element_count(static_cast<double>(num_rows) / elapsed_time / 1'000'000., "Mrows/s");
  state.add_buffer_size(
    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
}

// nvbench entry point for the multi-stream groupby-max benchmark: pulls the
// axis values out of the state and forwards them to the shared helper.
template <typename Type>
void bench_groupby_max_multistream(nvbench::state& state, nvbench::type_list<Type>)
{
  auto const null_probability = state.get_float64("null_probability");

  groupby_max_multistream_helper<Type>(state,
                                       static_cast<cudf::size_type>(state.get_int64("num_rows")),
                                       static_cast<cudf::size_type>(state.get_int64("cardinality")),
                                       null_probability);
}
PointKernel marked this conversation as resolved.
Show resolved Hide resolved

// Register the multi-stream benchmark over four value types. Axes sweep the
// row count, key cardinality, null density, and number of concurrent streams.
NVBENCH_BENCH_TYPES(bench_groupby_max_multistream,
                    NVBENCH_TYPE_AXES(nvbench::type_list<int32_t, int64_t, float, double>))
  .set_name("groupby_max_multistream")
  .add_int64_axis("cardinality", {0})
  .add_int64_power_of_two_axis("num_rows", {12, 18})
  .add_float64_axis("null_probability", {0, 0.1, 0.9})
  .add_int64_axis("num_streams", {1, 2, 4, 8});
117 changes: 117 additions & 0 deletions cpp/benchmarks/groupby/group_max_multithreaded.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>

#include <cudf/detail/utilities/stream_pool.hpp>
#include <cudf/groupby.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/pinned_memory.hpp>
#include <cudf/utilities/thread_pool.hpp>

#include <nvbench/nvbench.cuh>

/**
 * @brief Helper that runs one groupby-max aggregation per host thread, each on
 * its own CUDA stream, and times how long the whole batch takes to complete.
 *
 * @tparam Type Value-column element type being aggregated
 * @param state nvbench state; provides the `num_threads` axis and collects results
 * @param num_rows Number of rows in the generated key and value columns
 * @param cardinality Requested key cardinality (0 = uncapped random keys)
 * @param null_probability Probability of a null in the value column (0 = no null mask)
 */
template <typename Type>
void groupby_max_multithreaded_helper(nvbench::state& state,
                                      cudf::size_type num_rows,
                                      cudf::size_type cardinality,
                                      double null_probability)
{
  // Random int32 keys with the requested cardinality and no null mask.
  data_profile const key_profile =
    data_profile_builder()
      .cardinality(cardinality)
      .no_validity()
      .distribution(cudf::type_to_id<int32_t>(), distribution_id::UNIFORM, 0, num_rows);
  auto const keys =
    create_random_column(cudf::type_to_id<int32_t>(), row_count{num_rows}, key_profile);

  // Random values; attach a null mask only when nulls were requested.
  auto val_builder = data_profile_builder().cardinality(0).distribution(
    cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, num_rows);
  if (null_probability > 0) {
    val_builder.null_probability(null_probability);
  } else {
    val_builder.no_validity();
  }
  auto const vals =
    create_random_column(cudf::type_to_id<Type>(), row_count{num_rows}, data_profile{val_builder});

  auto keys_view = keys->view();
  auto gb_obj    = cudf::groupby::groupby(cudf::table_view({keys_view, keys_view, keys_view}));

  auto const num_threads = state.get_int64("num_threads");

  // One stream per worker thread, all forked from (and later joined back to)
  // the default stream.
  auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads);
  cudf::detail::thread_pool threads(num_threads);

  // Identical max-aggregation request for every thread, sharing the values column.
  std::vector<std::vector<cudf::groupby::aggregation_request>> requests(num_threads);
  for (int64_t tid = 0; tid < num_threads; ++tid) {
    auto& request  = requests[tid].emplace_back();
    request.values = vals->view();
    request.aggregations.push_back(cudf::make_max_aggregation<cudf::groupby_aggregation>());
  }

  auto const mem_stats_logger = cudf::memory_stats_logger();
  state.exec(
    nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) {
      auto perform_agg = [&](int index) { gb_obj.aggregate(requests[index], streams[index]); };
      // Queue every task while the pool is paused so all threads start together,
      // keeping task-submission overhead out of the timed region.
      threads.paused = true;
      for (int64_t tid = 0; tid < num_threads; ++tid) {
        threads.submit(perform_agg, tid);
      }
      timer.start();
      threads.paused = false;
      threads.wait_for_tasks();
      cudf::detail::join_streams(streams, cudf::get_default_stream());
      timer.stop();
    });

  auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
  state.add_element_count(static_cast<double>(num_rows) / elapsed_time / 1'000'000., "Mrows/s");
  state.add_buffer_size(
    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
}

// nvbench entry point for the multi-threaded groupby-max benchmark: reads the
// axis values from the state and delegates to the shared helper.
template <typename Type>
void bench_groupby_max_multithreaded(nvbench::state& state, nvbench::type_list<Type>)
{
  auto const null_probability = state.get_float64("null_probability");

  groupby_max_multithreaded_helper<Type>(
    state,
    static_cast<cudf::size_type>(state.get_int64("num_rows")),
    static_cast<cudf::size_type>(state.get_int64("cardinality")),
    null_probability);
}

// Cardinality-sweep variant: fixes the row count and null density so that only
// key cardinality (and thread count, read by the helper) vary across runs.
template <typename Type>
void bench_groupby_max_multithreaded_cardinality(nvbench::state& state, nvbench::type_list<Type>)
{
  constexpr cudf::size_type num_rows         = 20'000'000;
  constexpr double          null_probability = 0.;

  groupby_max_multithreaded_helper<Type>(
    state,
    num_rows,
    static_cast<cudf::size_type>(state.get_int64("cardinality")),
    null_probability);
}
PointKernel marked this conversation as resolved.
Show resolved Hide resolved

// Register the multi-threaded benchmark over four value types. Axes sweep the
// row count, key cardinality, null density, and number of worker threads.
NVBENCH_BENCH_TYPES(bench_groupby_max_multithreaded,
                    NVBENCH_TYPE_AXES(nvbench::type_list<int32_t, int64_t, float, double>))
  .set_name("groupby_max_multithreaded")
  .add_int64_axis("cardinality", {0})
  .add_int64_power_of_two_axis("num_rows", {12, 18})
  .add_float64_axis("null_probability", {0, 0.1, 0.9})
  .add_int64_axis("num_threads", {1, 2, 4, 8});

// Register the cardinality-sweep variant, which was previously defined but
// never registered (dead code). It reads only the "cardinality" and
// "num_threads" axes; row count and null probability are fixed in the
// benchmark function itself.
NVBENCH_BENCH_TYPES(bench_groupby_max_multithreaded_cardinality,
                    NVBENCH_TYPE_AXES(nvbench::type_list<int64_t>))
  .set_name("groupby_max_multithreaded_cardinality")
  .add_int64_axis("cardinality",
                  {10, 20, 50, 100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000})
  .add_int64_axis("num_threads", {1, 2, 4, 8});