Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make IVF-PQ build index in batches when necessary #1056

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
4d4512f
Make ivf-pq build index in batches when necessary
achirkin Nov 30, 2022
b42b848
Merge branch 'branch-23.02' into enh-ivf-pq-batched-building
achirkin Dec 15, 2022
6c5d024
Adjust the logic for choosing the batch sizes and memory type to avoi…
achirkin Dec 15, 2022
e977f55
Merge remote-tracking branch 'rapidsai/branch-23.02' into enh-ivf-pq-…
achirkin Dec 15, 2022
9645b11
Fix integer comparison with different signedness
achirkin Dec 16, 2022
3e8cc03
Merge remote-tracking branch 'rapidsai/branch-23.02' into enh-ivf-pq-…
achirkin Dec 16, 2022
7155eef
Use raft operators in place of thrust
achirkin Dec 16, 2022
66e58db
Explain the in-place transform of the trainset in comments
achirkin Dec 16, 2022
b5491f6
Add docs to process_and_fill_codes
achirkin Dec 16, 2022
232b772
extend-fill codes and indices in a single kernel, exec time scaling l…
achirkin Dec 19, 2022
1c3800c
Update documentation
achirkin Dec 19, 2022
9a6c6f7
Update documentation
achirkin Dec 19, 2022
63a6caa
Wrap the allocation error with the raft logic error in case if the in…
achirkin Dec 19, 2022
e28ca3d
Merge remote-tracking branch 'rapidsai/branch-23.02' into enh-ivf-pq-…
achirkin Dec 19, 2022
81250e8
Avoid narrowing conversion error for long->size_t in the error reporting
achirkin Dec 20, 2022
45920ec
Split out flat_compute_residuals code from process_and_fill_codes
achirkin Dec 20, 2022
0c69e89
Merge branch 'branch-23.02' into enh-ivf-pq-batched-building
tfeher Jan 4, 2023
c66cc02
Fix somehow incorrect inclusive scan / padded cumulative sum
achirkin Jan 6, 2023
7fe84a9
Merge remote-tracking branch 'rapidsai/branch-23.02' into enh-ivf-pq-…
achirkin Jan 6, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 11 additions & 3 deletions cpp/include/raft/neighbors/ivf_pq_types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -316,8 +316,16 @@ struct index : ann::index {
*/
void allocate(const handle_t& handle, IdxT index_size)
{
pq_dataset_ = make_device_mdarray<uint8_t>(handle, make_pq_dataset_extents(index_size));
indices_ = make_device_mdarray<IdxT>(handle, make_extents<IdxT>(index_size));
try {
pq_dataset_ = make_device_mdarray<uint8_t>(handle, make_pq_dataset_extents(index_size));
indices_ = make_device_mdarray<IdxT>(handle, make_extents<IdxT>(index_size));
} catch (std::bad_alloc& e) {
RAFT_FAIL(
"ivf-pq: failed to allocate a big enough index to hold all data (size: %zu). "
"Allocator exception: %s",
size_t(index_size),
e.what());
}
if (index_size > 0) {
thrust::fill_n(
handle.get_thrust_policy(), indices_.data_handle(), index_size, kInvalidRecord);
Expand Down Expand Up @@ -434,7 +442,7 @@ struct index : ann::index {

/** A helper function to determine the extents of an array enough to hold a given amount of data.
*/
auto make_pq_dataset_extents(IdxT n_rows) -> pq_dataset_extents
auto make_pq_dataset_extents(IdxT n_rows) const -> pq_dataset_extents
{
// how many elems of pq_dim fit into one kIndexGroupVecLen-byte chunk
auto pq_chunk = (kIndexGroupVecLen * 8u) / pq_bits();
Expand Down
205 changes: 205 additions & 0 deletions cpp/include/raft/spatial/knn/detail/ann_utils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,19 @@

#pragma once

#include <raft/core/logger.hpp>
#include <raft/distance/distance.cuh>
#include <raft/distance/distance_types.hpp>
#include <raft/util/cuda_utils.cuh>
#include <raft/util/cudart_utils.hpp>
#include <raft/util/integer_utils.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_scalar.hpp>
#include <rmm/device_uvector.hpp>

#include <memory>
#include <optional>

namespace raft::spatial::knn::detail::utils {

Expand Down Expand Up @@ -359,4 +366,202 @@ void copy_selected(IdxT n_rows,
}
}

/**
* A batch input iterator over the data source.
* Given an input pointer, it decides whether the current device has the access to the data and
* gives it back to the user in batches. Three scenarios are possible:
*
* 1. if `source == nullptr`: then `batch.data() == nullptr`
* 2. if `source` is accessible from the device, `batch.data()` points directly at the source at
* the proper offsets on each iteration.
* 3. if `source` is not accessible from the device, `batch.data()` points to an intermediate
* buffer; the corresponding data is copied in the given `stream` on every iterator dereference
* (i.e. batches can be skipped). Dereferencing the same batch two times in a row does not force
* the copy.
*
* In all three scenarios, the number of iterations, batch offsets and sizes are the same.
*
* The iterator can be reused. If the number of iterations is one, at most one copy will ever be
* invoked (i.e. small datasets are not reloaded multiple times).
*/
template <typename T>
struct batch_load_iterator {
  using size_type = size_t;

  /** A single batch of data residing in device memory. */
  struct batch {
    /** Logical width of a single row in a batch, in elements of type `T`. */
    [[nodiscard]] auto row_width() const -> size_type { return row_width_; }
    /** Logical offset of the batch, in rows (`row_width()`) */
    [[nodiscard]] auto offset() const -> size_type { return pos_.value_or(0) * batch_size_; }
    /** Logical size of the batch, in rows (`row_width()`) */
    [[nodiscard]] auto size() const -> size_type { return batch_len_; }
    /** Device-accessible pointer to the batch data; `nullptr` iff the source is `nullptr`. */
    [[nodiscard]] auto data() const -> const T* { return dev_ptr_; }
    /** Whether this batch copies the data (i.e. the source is inaccessible from the device). */
    [[nodiscard]] auto does_copy() const -> bool { return needs_copy_; }

   private:
    /**
     * Construct a batch descriptor over `source` ([n_rows, row_width] row-major).
     * Determines whether `source` is device-accessible; if not, allocates a staging
     * buffer of one batch (`batch_size * row_width` elements) from `mr`.
     *
     * NB: members are listed in the initializer list in declaration order, so the
     * textual order matches the actual initialization order.
     */
    batch(const T* source,
          size_type n_rows,
          size_type row_width,
          size_type batch_size,
          rmm::cuda_stream_view stream,
          rmm::mr::device_memory_resource* mr)
      : stream_(stream),
        buf_(0, stream, mr),
        source_(source),
        n_rows_(n_rows),
        row_width_(row_width),
        batch_size_(std::min(batch_size, n_rows)),
        n_iters_(raft::div_rounding_up_safe(n_rows, batch_size)),
        needs_copy_(false),
        pos_(std::nullopt),
        batch_len_(0),
        dev_ptr_(nullptr)
    {
      if (source_ == nullptr) { return; }
      cudaPointerAttributes attr;
      RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, source_));
      // devicePointer is non-null when the memory is accessible from the current device
      // (device, managed, or host-registered memory).
      dev_ptr_ = reinterpret_cast<T*>(attr.devicePointer);
      if (dev_ptr_ == nullptr) {
        // The source is not device-accessible: stage batches through an owned device buffer.
        buf_.resize(row_width_ * batch_size_, stream);
        dev_ptr_     = buf_.data();
        needs_copy_  = true;
      }
    }
    rmm::cuda_stream_view stream_;
    rmm::device_uvector<T> buf_;
    const T* source_;
    size_type n_rows_;
    size_type row_width_;
    size_type batch_size_;
    size_type n_iters_;
    bool needs_copy_;

    std::optional<size_type> pos_;  // current batch index; nullopt until first load
    size_type batch_len_;           // rows in the current batch (0 until first load)
    T* dev_ptr_;

    friend class batch_load_iterator<T>;

    /**
     * Changes the state of the batch to point at the `pos` index.
     * If necessary, copies the data from the source in the registered stream.
     */
    void load(const size_type& pos)
    {
      // No-op if the data is already loaded, or it's the end of the input.
      if (pos == pos_ || pos >= n_iters_) { return; }
      pos_.emplace(pos);
      // The last batch may be shorter than batch_size_; clamp against n_rows_.
      batch_len_ = std::min(batch_size_, n_rows_ - std::min(offset(), n_rows_));
      if (source_ == nullptr) { return; }
      if (needs_copy_) {
        if (size() > 0) {
          RAFT_LOG_DEBUG("batch_load_iterator::copy(offset = %zu, size = %zu, row_width = %zu)",
                         size_t(offset()),
                         size_t(size()),
                         size_t(row_width()));
          copy(dev_ptr_, source_ + offset() * row_width(), size() * row_width(), stream_);
        }
      } else {
        // Device-accessible source: point straight into it at the batch offset (no copy).
        dev_ptr_ = const_cast<T*>(source_) + offset() * row_width();
      }
    }
  };

  using value_type = batch;
  using reference  = const value_type&;
  using pointer    = const value_type*;

  /**
   * Create a batch iterator over the data `source`.
   *
   * For convenience, the data `source` is read in logical units of size `row_width`; batch sizes
   * and offsets are calculated in logical rows. Hence, can interpret the data as a contiguous
   * row-major matrix of size [n_rows, row_width], and the batches are the sub-matrices of size
   * [x<=batch_size, row_width].
   *
   * @param source the input data -- host, device, or nullptr.
   * @param n_rows the size of the input in logical rows.
   * @param row_width the size of the logical row in the elements of type `T`.
   * @param batch_size the desired size of the batch.
   * @param stream the ordering for the host->device copies, if applicable.
   * @param mr a custom memory resource for the intermediate buffer, if applicable.
   */
  batch_load_iterator(const T* source,
                      size_type n_rows,
                      size_type row_width,
                      size_type batch_size,
                      rmm::cuda_stream_view stream,
                      rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
    : cur_batch_(new batch(source, n_rows, row_width, batch_size, stream, mr)), cur_pos_(0)
  {
  }
  /**
   * Whether this iterator copies the data on every iteration
   * (i.e. the source is inaccessible from the device).
   */
  [[nodiscard]] auto does_copy() const -> bool { return cur_batch_->does_copy(); }
  /** Reset the iterator position to `begin()` */
  void reset() { cur_pos_ = 0; }
  /** Reset the iterator position to `end()` */
  void reset_to_end() { cur_pos_ = cur_batch_->n_iters_; }
  [[nodiscard]] auto begin() const -> const batch_load_iterator<T>
  {
    batch_load_iterator<T> x(*this);
    x.reset();
    return x;
  }
  [[nodiscard]] auto end() const -> const batch_load_iterator<T>
  {
    batch_load_iterator<T> x(*this);
    x.reset_to_end();
    return x;
  }
  /** Dereference triggers the (lazy) load of the batch at the current position. */
  [[nodiscard]] auto operator*() const -> reference
  {
    cur_batch_->load(cur_pos_);
    return *cur_batch_;
  }
  [[nodiscard]] auto operator->() const -> pointer
  {
    cur_batch_->load(cur_pos_);
    return cur_batch_.get();
  }
  // Iterators compare equal only when they share the same batch state and position
  // (i.e. they were copied from one another, as `begin()`/`end()` do).
  friend auto operator==(const batch_load_iterator<T>& x, const batch_load_iterator<T>& y) -> bool
  {
    return x.cur_batch_ == y.cur_batch_ && x.cur_pos_ == y.cur_pos_;
  };
  friend auto operator!=(const batch_load_iterator<T>& x, const batch_load_iterator<T>& y) -> bool
  {
    return x.cur_batch_ != y.cur_batch_ || x.cur_pos_ != y.cur_pos_;
  };
  auto operator++() -> batch_load_iterator<T>&
  {
    ++cur_pos_;
    return *this;
  }
  auto operator++(int) -> batch_load_iterator<T>
  {
    batch_load_iterator<T> x(*this);
    ++cur_pos_;
    return x;
  }
  auto operator--() -> batch_load_iterator<T>&
  {
    --cur_pos_;
    return *this;
  }
  auto operator--(int) -> batch_load_iterator<T>
  {
    batch_load_iterator<T> x(*this);
    --cur_pos_;
    return x;
  }

 private:
  // Shared so that copies made by begin()/end() reuse the same staging buffer and
  // dereferencing the same position twice does not re-copy the data.
  std::shared_ptr<value_type> cur_batch_;
  size_type cur_pos_;
};

} // namespace raft::spatial::knn::detail::utils
Loading