Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix hash join when the input tables have nulls on only one side #13120

Merged
merged 22 commits into from
Apr 13, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Add overload constructor and add docs
  • Loading branch information
ttnghia committed Apr 12, 2023
commit b09a3916f4641d07fec0e7ebd6a0201669b40067
4 changes: 2 additions & 2 deletions cpp/include/cudf/detail/join.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,12 +90,12 @@ struct hash_join {
*
* @param build The build table, from which the hash table is built.
* @param has_nulls Flag to indicate if there exist any nulls in the `build` table or
* any table that will be used later for join.
* any probe table that will be used later for join.
ttnghia marked this conversation as resolved.
Show resolved Hide resolved
* @param compare_nulls Controls whether null join-key values should match or not.
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
hash_join(cudf::table_view const& build,
std::optional<bool> has_nulls,
bool has_nulls,
cudf::null_equality compare_nulls,
rmm::cuda_stream_view stream);

Expand Down
31 changes: 28 additions & 3 deletions cpp/include/cudf/join.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -282,13 +282,20 @@ class hash_join {
* undefined.
*
* @param build The build table, from which the hash table is built
* @param has_nulls Flag to indicate if there exist any nulls in the `build` table or
* any table that will be used later for join
* @param compare_nulls Controls whether null join-key values should match or not
* @param stream CUDA stream used for device memory operations and kernel launches
*/
hash_join(cudf::table_view const& build,
std::optional<bool> has_nulls,
null_equality compare_nulls,
rmm::cuda_stream_view stream = cudf::get_default_stream());
/**
* @copydoc hash_join
*
* @param has_nulls Flag to indicate if there exist any nulls in the `build` table or
* any probe table that will be used later for join
*/
hash_join(cudf::table_view const& build,
bool has_nulls,
null_equality compare_nulls,
rmm::cuda_stream_view stream = cudf::get_default_stream());

Expand All @@ -303,6 +310,9 @@ class hash_join {
* @param mr Device memory resource used to allocate the returned table and columns' device
* memory.
*
* @throw cudf::logic_error If the input probe table has nulls while this hash_join object was not
* constructed with null check.
*
* @return A pair of columns [`left_indices`, `right_indices`] that can be used to construct
* the result of performing an inner join between two tables with `build` and `probe`
* as the join keys.
Expand All @@ -325,6 +335,9 @@ class hash_join {
* @param mr Device memory resource used to allocate the returned table and columns' device
* memory.
*
* @throw cudf::logic_error If the input probe table has nulls while this hash_join object was not
* constructed with null check.
*
* @return A pair of columns [`left_indices`, `right_indices`] that can be used to construct
* the result of performing a left join between two tables with `build` and `probe`
* as the join keys.
Expand All @@ -347,6 +360,9 @@ class hash_join {
* @param mr Device memory resource used to allocate the returned table and columns' device
* memory.
*
* @throw cudf::logic_error If the input probe table has nulls while this hash_join object was not
* constructed with null check.
*
* @return A pair of columns [`left_indices`, `right_indices`] that can be used to construct
* the result of performing a full join between two tables with `build` and `probe`
* as the join keys.
Expand All @@ -365,6 +381,9 @@ class hash_join {
* @param probe The probe table, from which the tuples are probed
* @param stream CUDA stream used for device memory operations and kernel launches
*
* @throw cudf::logic_error If the input probe table has nulls while this hash_join object was not
* constructed with null check.
*
* @return The exact number of output rows when performing an inner join between two tables with
* `build` and `probe` as the join keys.
*/
Expand All @@ -378,6 +397,9 @@ class hash_join {
* @param probe The probe table, from which the tuples are probed
* @param stream CUDA stream used for device memory operations and kernel launches
*
* @throw cudf::logic_error If the input probe table has nulls while this hash_join object was not
* constructed with null check.
*
* @return The exact number of output rows when performing a left join between two tables with `build`
* and `probe` as the join keys.
*/
Expand All @@ -393,6 +415,9 @@ class hash_join {
* @param mr Device memory resource used to allocate the intermediate table and columns' device
* memory.
*
* @throw cudf::logic_error If the input probe table has nulls while this hash_join object was not
* constructed with null check.
*
* @return The exact number of output rows when performing a full join between two tables with `build`
* and `probe` as the join keys.
*/
Expand Down
18 changes: 13 additions & 5 deletions cpp/src/join/hash_join.cu
Original file line number Diff line number Diff line change
Expand Up @@ -359,10 +359,10 @@ std::size_t get_full_join_size(

template <typename Hasher>
hash_join<Hasher>::hash_join(cudf::table_view const& build,
std::optional<bool> has_nulls,
bool has_nulls,
cudf::null_equality compare_nulls,
rmm::cuda_stream_view stream)
: _has_nulls(has_nulls.value_or(true)),
: _has_nulls(has_nulls),
_is_empty{build.num_rows() == 0},
_nulls_equal{compare_nulls},
_hash_table{compute_hash_table_size(build.num_rows()),
Expand All @@ -381,8 +381,8 @@ hash_join<Hasher>::hash_join(cudf::table_view const& build,

if (_is_empty) { return; }

auto const row_bitmask = std::move(
cudf::detail::bitmask_and(build, stream, rmm::mr::get_current_device_resource()).first);
auto const row_bitmask =
cudf::detail::bitmask_and(build, stream, rmm::mr::get_current_device_resource()).first;
cudf::detail::build_join_hash_table(_build,
_preprocessed_build,
_hash_table,
Expand Down Expand Up @@ -586,7 +586,15 @@ hash_join<Hasher>::compute_hash_join(cudf::table_view const& probe,
hash_join::~hash_join() = default;

hash_join::hash_join(cudf::table_view const& build,
std::optional<bool> has_nulls,
null_equality compare_nulls,
rmm::cuda_stream_view stream)
// If we cannot know beforehand about null existence then let's assume that there are nulls.
: hash_join(build, true /*has_nulls*/, compare_nulls, stream)
divyegala marked this conversation as resolved.
Show resolved Hide resolved
{
}

hash_join::hash_join(cudf::table_view const& build,
bool has_nulls,
null_equality compare_nulls,
rmm::cuda_stream_view stream)
: _impl{std::make_unique<const impl_type>(build, has_nulls, compare_nulls, stream)}
Expand Down