Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix empty cluster handling in tdigest merge #16675

Merged
Merged
Prev Previous commit
Next Next commit
Improve docs; add consts
  • Loading branch information
jihoonson committed Sep 3, 2024
commit 47fcf0dd9acfa73a119bffd6d2bcf13a16f4f4c9
2 changes: 1 addition & 1 deletion cpp/include/cudf/detail/tdigest/tdigest.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ std::unique_ptr<column> make_tdigest_column(size_type num_rows,
/**
* @brief Create an empty tdigest column.
*
* An empty tdigest column contains rows of length 0.
* An empty tdigest column contains specified number of rows of length 0.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
* An empty tdigest column contains specified number of rows of length 0.
* An empty tdigest column contains the specified number of rows of length 0.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks. I fixed this doc to further clarify it.

*
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned column's device memory.
Expand Down
12 changes: 6 additions & 6 deletions cpp/src/quantiles/tdigest/tdigest_aggregation.cu
Original file line number Diff line number Diff line change
Expand Up @@ -366,8 +366,8 @@ std::unique_ptr<scalar> to_tdigest_scalar(std::unique_ptr<column>&& tdigest,
* @param group_cluster_wl Output. The set of cluster weight limits for each group.
* @param group_num_clusters Output. The number of output clusters for each input group.
* @param group_cluster_offsets Offsets per-group to the start of it's clusters
* @param may_have_empty_clusters Whether or not there could be empty clusters. False if there
* should be no empty clusters.
* @param may_have_empty_clusters Whether or not there could be empty clusters. It should be
* set to false only when there is no empty cluster.
mhaseeb123 marked this conversation as resolved.
Show resolved Hide resolved
*/

template <typename GroupInfo, typename NearestWeightFunc, typename CumulativeWeight>
Expand Down Expand Up @@ -502,8 +502,8 @@ CUDF_KERNEL void generate_cluster_limits_kernel(int delta,
* stream that falls before our current cluster limit
* @param group_info A functor which returns the info for the specified group (total weight,
* size and start offset)
* @param may_have_empty_clusters Whether or not there could be empty clusters. False if there
* should be no empty clusters.
* @param may_have_empty_clusters Whether or not there could be empty clusters. It should be
* set to false only when there is no empty cluster.
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned column's device memory
*
Expand Down Expand Up @@ -720,8 +720,8 @@ struct compute_tdigests_keys_fn {
* @param group_cluster_wl Cluster weight limits for each group.
* @param group_cluster_offsets R-value reference of offsets into the cluster weight limits.
* @param total_clusters Total number of clusters in all groups.
* @param may_have_empty_clusters Whether or not there could be empty clusters. False if there
* should be no empty clusters.
* @param may_have_empty_clusters Whether or not there could be empty clusters. It should be
* set to false only when there is no empty cluster.
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned column's device memory
*
Expand Down
27 changes: 14 additions & 13 deletions cpp/tests/groupby/tdigest_tests.cu
Original file line number Diff line number Diff line change
Expand Up @@ -514,7 +514,7 @@ std::unique_ptr<cudf::table> do_agg(
{
std::vector<cudf::column_view> keys;
keys.push_back(key);
cudf::table_view key_table(keys);
cudf::table_view const key_table(keys);

cudf::groupby::groupby gb(key_table);
std::vector<cudf::groupby::aggregation_request> requests;
Expand All @@ -523,7 +523,7 @@ std::unique_ptr<cudf::table> do_agg(
req.aggregations.push_back(make_agg());
requests.push_back(std::move(req));

auto result = gb.aggregate(requests);
auto result = gb.aggregate(std::move(requests));

std::vector<std::unique_ptr<cudf::column>> result_columns;
for (auto&& c : result.first->release()) {
Expand All @@ -541,25 +541,26 @@ TEST_F(TDigestMergeTest, AllGroupsHaveEmptyClusters)
{
// The input must be sorted by the key.
// See `aggregate_result_functor::operator()<aggregation::TDIGEST>` for details.
jihoonson marked this conversation as resolved.
Show resolved Hide resolved
auto keys = cudf::test::fixed_width_column_wrapper<int32_t>{{0, 0, 1, 1, 2}};
auto const keys = cudf::test::fixed_width_column_wrapper<int32_t>{{0, 0, 1, 1, 2}};
auto const keys_view = cudf::column_view(keys);
auto val_elems = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; });
auto val_valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) {
// All values are null
return false;
});
auto vals = cudf::test::fixed_width_column_wrapper<int32_t>{
val_elems, val_elems + cudf::column_view(keys).size(), val_valids};
auto const vals = cudf::test::fixed_width_column_wrapper<int32_t>{
val_elems, val_elems + keys_view.size(), val_valids};

auto const delta = 10000;

// Compute tdigest. The result should have 3 empty clusters, one per group.
auto compute_result = do_agg(cudf::column_view(keys), cudf::column_view(vals), [&delta]() {
auto const compute_result = do_agg(keys_view, cudf::column_view(vals), [&delta]() {
return cudf::make_tdigest_aggregation<cudf::groupby_aggregation>(delta);
});

auto expected_computed_keys = cudf::test::fixed_width_column_wrapper<int32_t>{{0, 1, 2}};
cudf::column_view expected_computed_keys_view{expected_computed_keys};
auto expected_computed_vals = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
auto const expected_computed_keys = cudf::test::fixed_width_column_wrapper<int32_t>{{0, 1, 2}};
cudf::column_view const expected_computed_keys_view{expected_computed_keys};
auto const expected_computed_vals = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
expected_computed_keys_view.size(),
cudf::get_default_stream(),
rmm::mr::get_current_device_resource());
Expand All @@ -569,14 +570,14 @@ TEST_F(TDigestMergeTest, AllGroupsHaveEmptyClusters)
compute_result->get_column(1).view());

// Merge tdigest. The result should have 3 empty clusters, one per group.
auto merge_result =
auto const merge_result =
do_agg(compute_result->get_column(0).view(), compute_result->get_column(1).view(), [&delta]() {
return cudf::make_merge_tdigest_aggregation<cudf::groupby_aggregation>(delta);
});

auto expected_merged_keys = cudf::test::fixed_width_column_wrapper<int32_t>{{0, 1, 2}};
cudf::column_view expected_merged_keys_view{expected_merged_keys};
auto expected_merged_vals = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
auto const expected_merged_keys = cudf::test::fixed_width_column_wrapper<int32_t>{{0, 1, 2}};
cudf::column_view const expected_merged_keys_view{expected_merged_keys};
auto const expected_merged_vals = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
expected_merged_keys_view.size(),
cudf::get_default_stream(),
rmm::mr::get_current_device_resource());
Expand Down