Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Avoid inserting null elements into join hash table when nulls are treated as unequal #6943

Merged
merged 19 commits into from
Dec 11, 2020
Merged
Changes from 1 commit
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
1e1086e
Initial pass at fixing nulls not equal performance with joins
hyperbolic2346 Dec 8, 2020
2020b4c
Merge remote-tracking branch 'upstream/branch-0.18' into mwilson/join…
hyperbolic2346 Dec 8, 2020
95f791f
adding changelog
hyperbolic2346 Dec 8, 2020
7db4f1b
Apply suggestions from code review
hyperbolic2346 Dec 8, 2020
539f461
removing stale comments
hyperbolic2346 Dec 9, 2020
cb88696
Merge branch 'mwilson/join_nulls' of github.com:hyperbolic2346/cudf i…
hyperbolic2346 Dec 9, 2020
8af46b2
attempting to clarify parameter comments
hyperbolic2346 Dec 9, 2020
e1ee434
moving device view creation up into build_join_hash_table and only pa…
hyperbolic2346 Dec 10, 2020
1e6461d
Merge remote-tracking branch 'upstream/branch-0.18' into mwilson/join…
hyperbolic2346 Dec 10, 2020
3d77c73
updating based on review
hyperbolic2346 Dec 10, 2020
e2f2810
Apply suggestions from code review
hyperbolic2346 Dec 10, 2020
9ab1e19
Adding srand per review comments
hyperbolic2346 Dec 10, 2020
6b0e5d4
linting
hyperbolic2346 Dec 10, 2020
2e233da
review comment changes
hyperbolic2346 Dec 10, 2020
5b66b19
Merge remote-tracking branch 'upstream/branch-0.18' into mwilson/join…
hyperbolic2346 Dec 10, 2020
e57855a
Apply suggestions from code review
hyperbolic2346 Dec 11, 2020
d300ad6
Merge remote-tracking branch 'upstream/branch-0.18' into mwilson/join…
hyperbolic2346 Dec 11, 2020
a458a84
cleanup and fixes for UniformRandomGenerator
hyperbolic2346 Dec 11, 2020
39403b2
linting
hyperbolic2346 Dec 11, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
moving device view creation up into build_join_hash_table and only pa…
…ssing one table
  • Loading branch information
hyperbolic2346 committed Dec 10, 2020
commit e1ee434dc542dcb827fd228322dafe7779c89d1a
17 changes: 8 additions & 9 deletions cpp/src/join/hash_join.cu
Original file line number Diff line number Diff line change
Expand Up @@ -196,23 +196,23 @@ get_left_join_indices_complement(rmm::device_vector<size_type> &right_indices,
* @throw std::out_of_range if elements of `build_on` exceed the number of columns in the `build`
* table.
*
* @param build_table Device view of table of columns used to build join hash.
* @param build Table of columns used to build join hash.
* @param compare_nulls Controls whether null join-key values should match or not.
* @param stream CUDA stream used for device memory operations and kernel launches.
*
* @return Built hash table.
*/
std::unique_ptr<multimap_type, std::function<void(multimap_type *)>> build_join_hash_table(
cudf::table_device_view build_table,
cudf::table_view const &build,
hyperbolic2346 marked this conversation as resolved.
Show resolved Hide resolved
null_equality compare_nulls,
rmm::cuda_stream_view stream)
{
CUDF_EXPECTS(0 != build_table.num_columns(), "Selected build dataset is empty");
CUDF_EXPECTS(0 != build_table.num_rows(), "Build side table has no rows");
auto build_table = cudf::table_device_view::create(build, stream);
hyperbolic2346 marked this conversation as resolved.
Show resolved Hide resolved

CUDF_EXPECTS(0 != build_table->num_columns(), "Selected build dataset is empty");
CUDF_EXPECTS(0 != build_table->num_rows(), "Build side table has no rows");

const size_type build_table_num_rows{build_table.num_rows()};
const size_type build_table_num_rows{build_table->num_rows()};
size_t const hash_table_size = compute_hash_table_size(build_table_num_rows);

auto hash_table = multimap_type::create(hash_table_size,
Expand All @@ -222,7 +222,7 @@ std::unique_ptr<multimap_type, std::function<void(multimap_type *)>> build_join_
multimap_type::key_equal(),
multimap_type::allocator_type());

row_hash hash_build{build_table};
row_hash hash_build{*build_table};
rmm::device_scalar<int> failure(0, stream);
constexpr int block_size{DEFAULT_JOIN_BLOCK_SIZE};
detail::grid_1d config(build_table_num_rows, block_size);
Expand All @@ -236,7 +236,7 @@ std::unique_ptr<multimap_type, std::function<void(multimap_type *)>> build_join_
*hash_table,
hash_build,
build_table_num_rows,
static_cast<bitmask_type *>(row_bitmask.data()),
static_cast<bitmask_type const *>(row_bitmask.data()),
failure.data());
// Check error code from the kernel
if (failure.value(stream) == 1) { CUDF_FAIL("Hash Table insert failure."); }
Expand Down Expand Up @@ -515,8 +515,7 @@ hash_join::hash_join_impl::hash_join_impl(cudf::table_view const &build,

if (_build_on.empty() || 0 == build.num_rows()) { return; }

auto build_table = cudf::table_device_view::create(_build_selected, stream);
_hash_table = build_join_hash_table(*build_table, build, compare_nulls, stream);
_hash_table = build_join_hash_table(_build_selected, compare_nulls, stream);
}

std::pair<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::table>>
Expand Down