Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Expose search function with pre-filter for ANN #302

Merged
merged 54 commits into from
Oct 2, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
54 commits
Select commit Hold shift + click to select a range
0dbe5b2
[WIP] CAGRA - separable compilation for distance computation
achirkin Aug 16, 2024
93b0439
Merge branch 'branch-24.10' into enh-cagra-separable-compilation
achirkin Aug 16, 2024
ba52b13
Fix style
achirkin Aug 16, 2024
434e50a
Add missing multi-kernel implementation
achirkin Aug 19, 2024
6352550
Move common code out of virtual functions scope (aiming for more inli…
achirkin Aug 19, 2024
d161f79
Make small descriptor functions into fields
achirkin Aug 20, 2024
35c3813
Minor updates to improve reg count
achirkin Aug 20, 2024
4b5dcd3
Refactor distance_core -> compute_distance, and update the instance list
achirkin Aug 21, 2024
e5878db
Merge remote-tracking branch 'rapidsai/branch-24.10' into enh-cagra-s…
achirkin Aug 21, 2024
385a8c4
Make the compute_distance instances controlled from a single place
achirkin Aug 21, 2024
3f77cda
Refactor usage of init_kernel to make sure it instantiated in the sam…
achirkin Aug 22, 2024
ddb0488
Reduce the register usage in distance functions
achirkin Aug 22, 2024
c244ead
Partially implemented manual dispatch
achirkin Aug 23, 2024
7eb6a27
Merge branch 'branch-24.10' into enh-cagra-separable-compilation
achirkin Aug 23, 2024
ff2fdbe
Finish manual dispatch
achirkin Aug 23, 2024
78a9809
Change instance generator to have blockdim/team_size ratio 16
achirkin Aug 23, 2024
6082bf7
Trying various minor things to reduce register spilling
achirkin Aug 23, 2024
fc7d832
Move the metric parameter to the compute_distance template
achirkin Aug 26, 2024
6763bf7
Expose search() with optional filter for ANN
lowener Aug 26, 2024
118808e
Further reduce register pressure by moving code out of the non-inlina…
achirkin Aug 27, 2024
4e254fc
Merge branch 'branch-24.10' into 24.10-search-filter
lowener Aug 27, 2024
abec125
Manually unroll device::team_sum
achirkin Aug 27, 2024
cf0101c
Remove the test of a compute_distance instance that is not compiled (…
achirkin Aug 28, 2024
b3e6d26
Hide previously not hidden kernels
achirkin Aug 28, 2024
f231828
Merge branch 'branch-24.10' into enh-cagra-separable-compilation
achirkin Aug 28, 2024
e4cb424
Fix CAGRA filter test
lowener Aug 29, 2024
dc75f7a
Reduce register usage by minimizing the part of descriptor struct pas…
achirkin Sep 2, 2024
6630a99
Further reduce the size size of the dataset descriptor and add explic…
achirkin Sep 2, 2024
790e79c
Cache dataset descriptors to recover small batch performance
achirkin Sep 2, 2024
7599331
Reduce the register usage in compute_distance_standard further
achirkin Sep 3, 2024
4d9241e
Reduce the generated code volume
achirkin Sep 3, 2024
5fdcdd0
More explicit ldg cache behavior and a few smaller things
achirkin Sep 4, 2024
5984596
Simplify vpq indexing arithmetics a bit
achirkin Sep 4, 2024
337d990
Fix style
lowener Sep 4, 2024
6eb34be
Merge branch 'branch-24.10' into 24.10-search-filter
lowener Sep 4, 2024
af0cc12
Bring back the fatbin.ld link option
achirkin Sep 5, 2024
9023e68
relax the config for checking the raft_cutlass symbol exclusion (see …
achirkin Sep 5, 2024
99d2bd3
Merge branch 'branch-24.10' into enh-cagra-separable-compilation
achirkin Sep 6, 2024
75a2dac
Merge branch 'branch-24.10' into enh-cagra-separable-compilation
achirkin Sep 9, 2024
6a1b898
Merge branch 'branch-24.10' into enh-cagra-separable-compilation
achirkin Sep 10, 2024
c1eed0e
Merge branch 'branch-24.10' into enh-cagra-separable-compilation
achirkin Sep 10, 2024
d4673cf
Merge branch 'branch-24.10' into enh-cagra-separable-compilation
achirkin Sep 11, 2024
a78797f
Merge branch 'branch-24.10' into 24.10-search-filter
lowener Sep 11, 2024
0046a73
Merge branch 'branch-24.10' into enh-cagra-separable-compilation
achirkin Sep 11, 2024
267902e
Merge branch 'branch-24.10' into enh-cagra-separable-compilation
achirkin Sep 16, 2024
4ae3fa5
Merge remote-tracking branch 'achirkin/enh-cagra-separable-compilatio…
lowener Sep 16, 2024
b145d6d
Add base_filter to ANN API
lowener Sep 23, 2024
d357b99
Merge branch 'branch-24.10' into 24.10-search-filter
lowener Sep 25, 2024
66f633c
Fix details, finalize merge
lowener Sep 25, 2024
9e3a4ca
Fix documentation and parameter names
lowener Sep 26, 2024
e6e9f3b
Use references in public API of pre-filtering
lowener Sep 30, 2024
9bdd8f6
Merge branch 'branch-24.10' into 24.10-search-filter
lowener Sep 30, 2024
9386a9e
Unify none_ivf_sample_filter with none_cagra_sample_filter
lowener Oct 1, 2024
e258866
Add Bruteforce Prefilter API
lowener Oct 2, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Reduce the register usage in distance functions
  • Loading branch information
achirkin committed Aug 22, 2024
commit ddb048808c6d4179c80e965c8c47fad5dfb1ae62
2 changes: 1 addition & 1 deletion cpp/src/neighbors/detail/cagra/compute_distance.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ struct dataset_descriptor_base_t {
* This covers all standard and VPQ descriptors; we need this to copy the descriptor from global
* memory. Increase this if new fields are needed (but try to keep the descriptors small really).
*/
static constexpr size_t kMaxStructSize = 128;
static constexpr size_t kMaxStructSize = 64;

template <size_t ActualSize, size_t MaximumSize = kMaxStructSize>
static inline constexpr void assert_struct_size()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,9 @@
}} // namespace cuvs::neighbors::cagra::detail
"""

#mxdim_team = [(128, 8), (256, 16), (512, 32)]
mxdim_team = [(64, 8), (128, 16), (256, 32)]
# block = [(64, 16), (128, 8), (256, 4), (512, 2), (1024, 1)]
# itopk_candidates = [64, 128, 256]
# itopk_size = [64, 128, 256, 512]
# mxelem = [64, 128, 256]
#mxdim_team = [(32, 8), (64, 16), (128, 32)]

pq_bits = [8]
pq_lens = [2, 4]
Expand Down
52 changes: 15 additions & 37 deletions cpp/src/neighbors/detail/cagra/compute_distance_standard.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include <cuvs/neighbors/common.hpp>
#include <raft/core/logger-macros.hpp>
#include <raft/core/operators.hpp>
#include <raft/util/device_loads_stores.cuh>

#include <raft/util/vectorized.cuh>

Expand Down Expand Up @@ -79,41 +80,9 @@ struct standard_dataset_descriptor_t : public dataset_descriptor_base_t<DataT, I
INDEX_T dataset_index,
cuvs::distance::DistanceType metric,
bool valid) const -> DISTANCE_T
{
switch (metric) {
case cuvs::distance::DistanceType::L2Expanded:
return compute_similarity<cuvs::distance::DistanceType::L2Expanded>(
smem_workspace, dataset_index, valid);
case cuvs::distance::DistanceType::InnerProduct:
return compute_similarity<cuvs::distance::DistanceType::InnerProduct>(
smem_workspace, dataset_index, valid);
default: return 0;
}
}

private:
template <typename T, cuvs::distance::DistanceType METRIC>
RAFT_DEVICE_INLINE_FUNCTION constexpr static auto dist_op(T a, T b)
-> std::enable_if_t<METRIC == cuvs::distance::DistanceType::L2Expanded, T>
{
T diff = a - b;
return diff * diff;
}

template <typename T, cuvs::distance::DistanceType METRIC>
RAFT_DEVICE_INLINE_FUNCTION constexpr static auto dist_op(T a, T b)
-> std::enable_if_t<METRIC == cuvs::distance::DistanceType::InnerProduct, T>
{
return -a * b;
}

template <cuvs::distance::DistanceType METRIC>
RAFT_DEVICE_INLINE_FUNCTION auto compute_similarity(ws_handle smem_workspace,
const INDEX_T dataset_i,
const bool valid) const -> DISTANCE_T
{
auto query_ptr = smem_query_buffer(smem_workspace);
const auto dataset_ptr = ptr + dataset_i * ld;
const auto dataset_ptr = ptr + dataset_index * ld;
const unsigned lane_id = threadIdx.x % TeamSize;

DISTANCE_T norm2 = 0;
Expand All @@ -134,14 +103,22 @@ struct standard_dataset_descriptor_t : public dataset_descriptor_base_t<DataT, I
if (k >= dim) break;
#pragma unroll
for (uint32_t v = 0; v < vlen; v++) {
const uint32_t kv = k + v;
// Note this loop can go above the dataset_dim for padded arrays. This is not a problem
// because:
// - Above the last element (dataset_dim-1), the query array is filled with zeros.
// - The data buffer has to be also padded with zeros.
DISTANCE_T d = query_ptr[device::swizzling(kv)];
norm2 += dist_op<DISTANCE_T, METRIC>(
d, cuvs::spatial::knn::detail::utils::mapping<float>{}(dl_buff[e].val.data[v]));
DISTANCE_T d;
raft::lds(d, query_ptr + device::swizzling(k + v));
constexpr cuvs::spatial::knn::detail::utils::mapping<float> mapping{};
switch (metric) {
case cuvs::distance::DistanceType::L2Expanded:
d -= mapping(dl_buff[e].val.data[v]);
norm2 += d * d;
break;
case cuvs::distance::DistanceType::InnerProduct:
norm2 -= d * mapping(dl_buff[e].val.data[v]);
break;
}
}
}
}
Expand All @@ -153,6 +130,7 @@ struct standard_dataset_descriptor_t : public dataset_descriptor_base_t<DataT, I
return norm2;
}

private:
RAFT_DEVICE_INLINE_FUNCTION constexpr auto smem_query_buffer(ws_handle smem_workspace) const
-> QUERY_T*
{
Expand Down
75 changes: 20 additions & 55 deletions cpp/src/neighbors/detail/cagra/compute_distance_vpq.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ template <uint32_t TeamSize,
typename CodeBookT,
typename DataT,
typename IndexT,
typename DistanceT = float>
typename DistanceT>
struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t<DataT, IndexT, DistanceT> {
using base_type = dataset_descriptor_base_t<DataT, IndexT, DistanceT>;
using CODE_BOOK_T = CodeBookT;
Expand All @@ -43,18 +43,13 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t<DataT, In
using typename base_type::INDEX_T;
using typename base_type::ws_handle;

static_assert(std::is_same_v<CODE_BOOK_T, half>,
"Only CODE_BOOK_T = "
"`half` is supported "
"now");
static_assert(std::is_same_v<CODE_BOOK_T, half>, "Only CODE_BOOK_T = `half` is supported now");

const std::uint8_t* encoded_dataset_ptr;
const CODE_BOOK_T* vq_code_book_ptr;
const CODE_BOOK_T* pq_code_book_ptr;
std::uint32_t encoded_dataset_dim;
std::uint32_t n_subspace;
float vq_scale;
float pq_scale;

static constexpr std::uint32_t kSMemCodeBookSizeInBytes =
(1 << PQ_BITS) * PQ_LEN * utils::size_of<CODE_BOOK_T>();
Expand All @@ -63,19 +58,15 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t<DataT, In
std::uint32_t encoded_dataset_dim,
std::uint32_t n_subspace,
const CODE_BOOK_T* vq_code_book_ptr,
float vq_scale,
const CODE_BOOK_T* pq_code_book_ptr,
float pq_scale,
std::size_t size,
std::uint32_t dim)
: base_type(size, dim, TeamSize, get_smem_ws_size_in_bytes(dim)),
encoded_dataset_ptr(encoded_dataset_ptr),
encoded_dataset_dim(encoded_dataset_dim),
n_subspace(n_subspace),
vq_code_book_ptr(vq_code_book_ptr),
vq_scale(vq_scale),
pq_code_book_ptr(pq_code_book_ptr),
pq_scale(pq_scale)
pq_code_book_ptr(pq_code_book_ptr)
{
base_type::template assert_struct_size<sizeof(*this)>();
}
Expand Down Expand Up @@ -110,7 +101,7 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t<DataT, In
half2 buf2{0, 0};
if (i < dim) { buf2.x = mapping(query_ptr[i]); }
if (i + 1 < dim) { buf2.y = mapping(query_ptr[i + 1]); }
if ((PQ_BITS == 8) && (PQ_LEN % 2 == 0)) {
if constexpr ((PQ_BITS == 8) && (PQ_LEN % 2 == 0)) {
// Use swizzling in the condition to reduce bank conflicts in shared
// memory, which are likely to occur when pq_code_book_dim is large.
((half2*)smem_query_ptr)[device::swizzling<std::uint32_t, DatasetBlockDim / 2>(i / 2)] =
Expand All @@ -121,36 +112,21 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t<DataT, In
}
}

_RAFT_DEVICE auto compute_distance(ws_handle smem_workspace,
INDEX_T dataset_index,
cuvs::distance::DistanceType metric,
bool valid) const -> DISTANCE_T
_RAFT_DEVICE auto compute_distance(
ws_handle smem_workspace,
INDEX_T dataset_index,
cuvs::distance::DistanceType /* only L2 metric is implemented */,
bool valid) const -> DISTANCE_T
{
switch (metric) {
case cuvs::distance::DistanceType::L2Expanded:
return compute_similarity<cuvs::distance::DistanceType::L2Expanded>(
smem_workspace, dataset_index, valid);
case cuvs::distance::DistanceType::InnerProduct:
return compute_similarity<cuvs::distance::DistanceType::InnerProduct>(
smem_workspace, dataset_index, valid);
default: return 0;
}
}

private:
template <cuvs::distance::DistanceType METRIC>
RAFT_DEVICE_INLINE_FUNCTION DISTANCE_T compute_similarity(ws_handle smem_workspace,
const INDEX_T node_id,
const bool valid) const
{
auto codebook_ptr = smem_pq_code_book_ptr(smem_workspace);
auto query_ptr = smem_query_buffer(smem_workspace);
float norm = 0;
auto* __restrict__ codebook_ptr = smem_pq_code_book_ptr(smem_workspace);
auto* __restrict__ query_ptr = smem_query_buffer(smem_workspace);
auto* __restrict__ node_ptr =
encoded_dataset_ptr + (static_cast<std::uint64_t>(encoded_dataset_dim) * dataset_index);
float norm = 0;
if (valid) {
const unsigned lane_id = threadIdx.x % TeamSize;
const uint32_t vq_code = *(reinterpret_cast<const std::uint32_t*>(
encoded_dataset_ptr + (static_cast<std::uint64_t>(encoded_dataset_dim) * node_id)));
if (PQ_BITS == 8) {
const uint32_t vq_code = *reinterpret_cast<const std::uint32_t*>(node_ptr);
if constexpr (PQ_BITS == 8) {
for (uint32_t elem_offset = 0; elem_offset < dim; elem_offset += DatasetBlockDim) {
constexpr unsigned vlen = 4; // **** DO NOT CHANGE ****
constexpr unsigned nelem =
Expand All @@ -162,9 +138,7 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t<DataT, In
const std::uint32_t k = (lane_id + (TeamSize * e)) * vlen + elem_offset / PQ_LEN;
if (k >= n_subspace) break;
// Loading 4 x 8-bit PQ-codes using 32-bit load ops (from device memory)
pq_codes[e] = *(reinterpret_cast<const std::uint32_t*>(
encoded_dataset_ptr + (static_cast<std::uint64_t>(encoded_dataset_dim) * node_id) +
4 + k));
pq_codes[e] = *(reinterpret_cast<const std::uint32_t*>(node_ptr + 4 + k));
}
//
if constexpr (PQ_LEN % 2 == 0) {
Expand Down Expand Up @@ -237,8 +211,8 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t<DataT, In
const std::uint32_t d = d1 + (PQ_LEN * k);
// if (d >= dataset_dim) break;
DISTANCE_T diff = query_ptr[d]; // (from smem)
diff -= pq_scale * static_cast<float>(pq_vals.data[m]);
diff -= vq_scale * static_cast<float>(vq_vals[d1 / vlen].val.data[d1 % vlen]);
diff -= static_cast<float>(pq_vals.data[m]);
diff -= static_cast<float>(vq_vals[d1 / vlen].val.data[d1 % vlen]);
norm += diff * diff;
}
pq_code >>= 8;
Expand All @@ -255,6 +229,7 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t<DataT, In
return norm;
}

private:
RAFT_DEVICE_INLINE_FUNCTION constexpr auto smem_pq_code_book_ptr(ws_handle smem_workspace) const
-> CODE_BOOK_T*
{
Expand Down Expand Up @@ -293,9 +268,7 @@ __launch_bounds__(1, 1) __global__
std::uint32_t encoded_dataset_dim,
std::uint32_t n_subspace,
const CodeBookT* vq_code_book_ptr,
float vq_scale,
const CodeBookT* pq_code_book_ptr,
float pq_scale,
std::size_t size,
std::uint32_t dim)
{
Expand All @@ -310,9 +283,7 @@ __launch_bounds__(1, 1) __global__
encoded_dataset_dim,
n_subspace,
vq_code_book_ptr,
vq_scale,
pq_code_book_ptr,
pq_scale,
size,
dim);
}
Expand Down Expand Up @@ -361,15 +332,11 @@ struct vpq_descriptor_spec : public instance_spec<DataT, IndexT, DistanceT> {
const DatasetT& dataset,
rmm::cuda_stream_view stream) -> host_type
{
const float vq_scale = 1.0f;
const float pq_scale = 1.0f;
descriptor_type dd_host{dataset.data.data_handle(),
dataset.encoded_row_length(),
dataset.pq_dim(),
dataset.vq_code_book.data_handle(),
vq_scale,
dataset.pq_code_book.data_handle(),
pq_scale,
IndexT(dataset.n_rows()),
dataset.dim()};
host_type result{dd_host, stream, DatasetBlockDim};
Expand All @@ -379,9 +346,7 @@ struct vpq_descriptor_spec : public instance_spec<DataT, IndexT, DistanceT> {
&dd_host.encoded_dataset_dim,
&dd_host.n_subspace,
&dd_host.vq_code_book_ptr,
&dd_host.vq_scale,
&dd_host.pq_code_book_ptr,
&dd_host.pq_scale,
&dd_host.size,
&dd_host.dim};
RAFT_CUDA_TRY(cudaLaunchKernel(init_kernel, 1, 1, args, 0, stream));
Expand Down
Loading