Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add XXHash_32 hasher #17533

Merged
merged 36 commits into from
Jan 7, 2025
Merged
Show file tree
Hide file tree
Changes from 31 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
bff159d
Add xxhash_32
PointKernel Dec 5, 2024
8d5c2dd
Change default hash to xxhash_32
PointKernel Dec 5, 2024
24c5a76
Add missing headers
PointKernel Dec 5, 2024
1597e1f
Merge branch 'branch-25.02' into add-xxhash32
PointKernel Dec 6, 2024
d2635a9
Apply suggestions from code review
PointKernel Dec 6, 2024
e25e0e1
Merge branch 'branch-25.02' into add-xxhash32
PointKernel Dec 6, 2024
e6f204c
Merge remote-tracking branch 'upstream/branch-25.02' into add-xxhash32
PointKernel Dec 9, 2024
200d557
Add xxhash_32 column hash API
PointKernel Dec 9, 2024
50ac47f
Merge remote-tracking branch 'upstream/branch-25.02' into add-xxhash32
PointKernel Dec 10, 2024
4d69dc9
Add xxhash32 tests
PointKernel Dec 10, 2024
03c3c81
Add xxhash in cmake
PointKernel Dec 10, 2024
c660740
Update reference values
PointKernel Dec 10, 2024
4277feb
Minor
PointKernel Dec 10, 2024
8070750
Update expected hash results
PointKernel Dec 11, 2024
7ca8b2b
Merge remote-tracking branch 'upstream/branch-25.02' into add-xxhash32
PointKernel Dec 11, 2024
fe88247
Merge remote-tracking branch 'origin/add-xxhash32' into add-xxhash32
PointKernel Dec 11, 2024
1b486ba
Merge branch 'branch-25.02' into add-xxhash32
bdice Dec 11, 2024
f93bde9
Merge remote-tracking branch 'upstream/branch-25.02' into add-xxhash32
PointKernel Dec 16, 2024
71df59e
Merge remote-tracking branch 'origin/add-xxhash32' into add-xxhash32
PointKernel Dec 16, 2024
93e6af0
Revert default hash changes
PointKernel Dec 16, 2024
b26f0c7
Add Python bindings for xxhash32.
bdice Dec 16, 2024
67d9157
Remove unused helper function.
bdice Dec 16, 2024
71bf77b
Update tests.
bdice Dec 17, 2024
1508aad
Make tests pass.
bdice Dec 17, 2024
1010775
Fix xxhash32 implementation to avoid hash_combine steps.
bdice Dec 17, 2024
6fbd870
Fix typo in result value.
bdice Dec 17, 2024
fc35cc5
Merge branch 'branch-25.02' into add-xxhash32
bdice Dec 17, 2024
538a416
Merge branch 'branch-25.02' into add-xxhash32
PointKernel Dec 20, 2024
1f79ad0
Merge branch 'branch-25.02' into add-xxhash32
PointKernel Dec 30, 2024
aa4bf60
Merge remote-tracking branch 'upstream/branch-25.02' into add-xxhash32
PointKernel Jan 6, 2025
6440b54
Update copyright years
PointKernel Jan 6, 2025
5c068bd
Merge remote-tracking branch 'upstream/branch-25.02' into add-xxhash32
PointKernel Jan 6, 2025
a33a319
Remove redundant hash_value_type
PointKernel Jan 6, 2025
83f0b54
Fix copyright years
PointKernel Jan 6, 2025
bc102bf
Fix copy-paste leftovers
PointKernel Jan 6, 2025
1c06eaa
Merge remote-tracking branch 'upstream/branch-25.02' into add-xxhash32
PointKernel Jan 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,7 @@ add_library(
src/hash/sha256_hash.cu
src/hash/sha384_hash.cu
src/hash/sha512_hash.cu
src/hash/xxhash_32.cu
src/hash/xxhash_64.cu
src/interop/dlpack.cpp
src/interop/arrow_utilities.cpp
Expand Down
22 changes: 21 additions & 1 deletion cpp/include/cudf/hashing.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
* Copyright (c) 2019-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -166,6 +166,26 @@ std::unique_ptr<column> sha512(
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Computes the XXHash_32 hash value of each row in the given table
*
* This function computes the hash of each column using the `seed` for the first column
* and the resulting hash as a seed for the next column and so on.
* The result is a uint32 value for each row.
*
* @param input The table of columns to hash
* @param seed Optional seed value to use for the hash function
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
*
* @returns A column where each row is the hash of a row from the input
*/
std::unique_ptr<column> xxhash_32(
table_view const& input,
uint32_t seed = DEFAULT_HASH_SEED,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Computes the XXHash_64 hash value of each row in the given table
*
Expand Down
7 changes: 6 additions & 1 deletion cpp/include/cudf/hashing/detail/hashing.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
* Copyright (c) 2019-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -61,6 +61,11 @@ std::unique_ptr<column> sha512(table_view const& input,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);

std::unique_ptr<column> xxhash_32(table_view const& input,
uint64_t seed,
rmm::cuda_stream_view,
rmm::device_async_resource_ref mr);

std::unique_ptr<column> xxhash_64(table_view const& input,
uint64_t seed,
rmm::cuda_stream_view,
Expand Down
118 changes: 118 additions & 0 deletions cpp/include/cudf/hashing/detail/xxhash_32.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
/*
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cudf/fixed_point/fixed_point.hpp>
#include <cudf/hashing.hpp>
#include <cudf/hashing/detail/hash_functions.cuh>
#include <cudf/lists/list_view.hpp>
#include <cudf/strings/string_view.cuh>
#include <cudf/structs/struct_view.hpp>
#include <cudf/types.hpp>

#include <cuco/hash_functions.cuh>
#include <cuda/std/cstddef>

namespace cudf::hashing::detail {

template <typename Key>
struct XXHash_32 {
using result_type = std::uint32_t;

CUDF_HOST_DEVICE constexpr XXHash_32(uint32_t seed = cudf::DEFAULT_HASH_SEED) : _impl{seed} {}

__device__ constexpr result_type operator()(Key const& key) const { return this->_impl(key); }

__device__ constexpr result_type compute_bytes(cuda::std::byte const* bytes,
std::uint64_t size) const
{
return this->_impl.compute_hash(bytes, size);
}

private:
template <typename T>
__device__ constexpr result_type compute(T const& key) const
{
return this->compute_bytes(reinterpret_cast<cuda::std::byte const*>(&key), sizeof(T));
}

cuco::xxhash_32<Key> _impl;
};

template <>
XXHash_32<bool>::result_type __device__ inline XXHash_32<bool>::operator()(bool const& key) const
{
return this->compute(static_cast<uint8_t>(key));
}

template <>
XXHash_32<float>::result_type __device__ inline XXHash_32<float>::operator()(float const& key) const
{
return this->compute(normalize_nans_and_zeros(key));
}

template <>
XXHash_32<double>::result_type __device__ inline XXHash_32<double>::operator()(
double const& key) const
{
return this->compute(normalize_nans_and_zeros(key));
}

template <>
XXHash_32<cudf::string_view>::result_type
__device__ inline XXHash_32<cudf::string_view>::operator()(cudf::string_view const& key) const
{
return this->compute_bytes(reinterpret_cast<cuda::std::byte const*>(key.data()),
key.size_bytes());
}

template <>
XXHash_32<numeric::decimal32>::result_type
__device__ inline XXHash_32<numeric::decimal32>::operator()(numeric::decimal32 const& key) const
{
return this->compute(key.value());
}

template <>
XXHash_32<numeric::decimal64>::result_type
__device__ inline XXHash_32<numeric::decimal64>::operator()(numeric::decimal64 const& key) const
{
return this->compute(key.value());
}

template <>
XXHash_32<numeric::decimal128>::result_type
__device__ inline XXHash_32<numeric::decimal128>::operator()(numeric::decimal128 const& key) const
{
return this->compute(key.value());
}

template <>
hash_value_type __device__ inline XXHash_32<cudf::list_view>::operator()(
PointKernel marked this conversation as resolved.
Show resolved Hide resolved
cudf::list_view const& key) const
{
CUDF_UNREACHABLE("List column hashing is not supported");
}

template <>
hash_value_type __device__ inline XXHash_32<cudf::struct_view>::operator()(
cudf::struct_view const& key) const
{
CUDF_UNREACHABLE("Direct hashing of struct_view is not supported");
}

} // namespace cudf::hashing::detail
138 changes: 138 additions & 0 deletions cpp/src/hash/xxhash_32.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
/*
* Copyright (c) 2023-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/utilities/algorithm.cuh>
#include <cudf/hashing/detail/hashing.hpp>
#include <cudf/hashing/detail/xxhash_32.cuh>
#include <cudf/table/table_device_view.cuh>
#include <cudf/utilities/memory_resource.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>

#include <cuda/std/limits>
#include <thrust/tabulate.h>

namespace cudf {
namespace hashing {
namespace detail {

namespace {

using hash_value_type = uint32_t;
PointKernel marked this conversation as resolved.
Show resolved Hide resolved

/**
* @brief Computes the hash value of a row in the given table.
*
* @tparam Nullate A cudf::nullate type describing whether to check for nulls.
*/
template <typename Nullate>
class device_row_hasher {
public:
device_row_hasher(Nullate nulls, table_device_view const& t, hash_value_type seed)
: _check_nulls(nulls), _table(t), _seed(seed)
{
}

__device__ auto operator()(size_type row_index) const noexcept
{
return cudf::detail::accumulate(
_table.begin(),
_table.end(),
_seed,
[row_index, nulls = _check_nulls] __device__(auto hash, auto column) {
return cudf::type_dispatcher(
column.type(), element_hasher_adapter{}, column, row_index, nulls, hash);
});
}

/**
* @brief Computes the hash value of an element in the given column.
*/
class element_hasher_adapter {
public:
template <typename T, CUDF_ENABLE_IF(column_device_view::has_element_accessor<T>())>
__device__ hash_value_type operator()(column_device_view const& col,
size_type const row_index,
Nullate const _check_nulls,
hash_value_type const _seed) const noexcept
{
if (_check_nulls && col.is_null(row_index)) {
return cuda::std::numeric_limits<hash_value_type>::max();
}
auto const hasher = XXHash_32<T>{_seed};
return hasher(col.element<T>(row_index));
}

template <typename T, CUDF_ENABLE_IF(not column_device_view::has_element_accessor<T>())>
__device__ hash_value_type operator()(column_device_view const&,
size_type const,
Nullate const,
hash_value_type const) const noexcept
{
CUDF_UNREACHABLE("Unsupported type for XXHash_32");
}
};

Nullate const _check_nulls;
table_device_view const _table;
hash_value_type const _seed;
};

} // namespace

std::unique_ptr<column> xxhash_32(table_view const& input,
uint32_t seed,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
auto output = make_numeric_column(data_type(type_to_id<hash_value_type>()),
input.num_rows(),
mask_state::UNALLOCATED,
stream,
mr);

// Return early if there's nothing to hash
if (input.num_columns() == 0 || input.num_rows() == 0) { return output; }

bool const nullable = has_nulls(input);
auto const input_view = table_device_view::create(input, stream);
auto output_view = output->mutable_view();

// Compute the hash value for each row
thrust::tabulate(rmm::exec_policy(stream),
output_view.begin<hash_value_type>(),
output_view.end<hash_value_type>(),
device_row_hasher(nullable, *input_view, seed));

return output;
}

} // namespace detail

std::unique_ptr<column> xxhash_32(table_view const& input,
uint32_t seed,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
return detail::xxhash_32(input, seed, stream, mr);
}

} // namespace hashing
} // namespace cudf
3 changes: 2 additions & 1 deletion cpp/src/io/orc/dict_enc.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
* Copyright (c) 2019-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -18,6 +18,7 @@

#include <cudf/detail/offsets_iterator.cuh>
#include <cudf/detail/utilities/integer_utils.hpp>
#include <cudf/hashing/detail/murmurhash3_x86_32.cuh>
#include <cudf/io/orc_types.hpp>
#include <cudf/table/experimental/row_operators.cuh>

Expand Down
3 changes: 2 additions & 1 deletion cpp/src/io/parquet/chunk_dict.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
* Copyright (c) 2021-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -18,6 +18,7 @@

#include <cudf/detail/iterator.cuh>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/hashing/detail/murmurhash3_x86_32.cuh>
#include <cudf/table/experimental/row_operators.cuh>

#include <rmm/exec_policy.hpp>
Expand Down
3 changes: 2 additions & 1 deletion cpp/src/join/join_common_utils.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
* Copyright (c) 2021-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -20,6 +20,7 @@
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/hashing/detail/murmurhash3_x86_32.cuh>
#include <cudf/table/experimental/row_operators.cuh>
#include <cudf/utilities/memory_resource.hpp>

Expand Down
3 changes: 2 additions & 1 deletion cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# =============================================================================
# Copyright (c) 2018-2024, NVIDIA CORPORATION.
# Copyright (c) 2018-2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
Expand Down Expand Up @@ -192,6 +192,7 @@ ConfigureTest(
hashing/sha256_test.cpp
hashing/sha384_test.cpp
hashing/sha512_test.cpp
hashing/xxhash_32_test.cpp
hashing/xxhash_64_test.cpp
)

Expand Down
Loading
Loading