[Sampler] Avoid unnecessary sync in GPU verifier #2260

Merged (1 commit, May 1, 2024)
161 changes: 105 additions & 56 deletions cpp/serve/sampler/gpu_sampler.cc
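In the diff below, the verifier no longer issues an unconditional TVMSynchronize on the compute stream: the token_tree_parent_ptr result is copied back on the copy stream, the extra token for each sequence is sampled on the GPU using token_tree_parent_ptr_device directly as the sample indices, and the explicit wait on the copy stream happens only when some request needs logprobs (to build top_prob_offset_indptr on the host). Uniform-sample generation and sample-index upload are also split into GenerateUniformSamples and CopySampleIndicesToGPU helpers, and result collection moves into a shared CollectSampleResult. The standalone C++ sketch below uses mock types and invented helper names rather than the real TVM/MLC-LLM APIs; it only illustrates the fast-path/slow-path synchronization idea, not the actual sampler.

// Standalone C++17 sketch. FakeStream, DeviceVec, RunVerifyKernel and
// VerifyAndSampleExtra are simplified stand-ins invented for illustration.
#include <cstdio>
#include <vector>

struct FakeStream {                       // stands in for a device stream
  int sync_count = 0;
  void Synchronize() { ++sync_count; }    // host blocks on the stream
};

using DeviceVec = std::vector<int>;       // pretend this lives on the GPU

// Stand-in for the verification kernel: it leaves, per sequence, the index to
// sample the extra token from in the returned "device" array.
DeviceVec RunVerifyKernel(int num_sequence) {
  DeviceVec parent_ptr(num_sequence);
  for (int i = 0; i < num_sequence; ++i) parent_ptr[i] = i;  // dummy result
  return parent_ptr;
}

// Fast path: the next sampling kernel consumes the device array directly, so
// the host never blocks. Slow path (prob values requested): wait on the copy
// stream, then read the indices on the host.
void VerifyAndSampleExtra(int num_sequence, bool need_prob_values,
                          FakeStream& copy_stream) {
  DeviceVec parent_ptr_device = RunVerifyKernel(num_sequence);
  DeviceVec parent_ptr_host = parent_ptr_device;  // async D2H copy on the copy stream
  if (!need_prob_values) {
    // Fast path: sample indices stay on the device; nothing to wait for here.
  } else {
    copy_stream.Synchronize();                    // only now does the host block
    for (int idx : parent_ptr_host) (void)idx;    // build top-prob offsets on the host
  }
}

int main() {
  FakeStream copy_stream;
  VerifyAndSampleExtra(4, /*need_prob_values=*/false, copy_stream);
  std::printf("syncs after fast path: %d\n", copy_stream.sync_count);  // 0
  VerifyAndSampleExtra(4, /*need_prob_values=*/true, copy_stream);
  std::printf("syncs after slow path: %d\n", copy_stream.sync_count);  // 1
  return 0;
}

In the sketch, the fast path performs zero host synchronizations and the slow path exactly one, on the copy stream; the real change in the diff follows the same branch structure around need_prob_values.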
@@ -179,9 +179,9 @@ class GPUSampler : public SamplerObj {
sample_results.resize(num_sequence);

int num_nodes = cum_verify_lengths.back();
ICHECK(num_nodes <= max_num_sample_);
CHECK_EQ(draft_probs_on_device->shape[0], num_nodes);
NDArray uniform_samples_host = uniform_samples_host_.CreateView({num_nodes}, dtype_f32_);
NDArray uniform_samples_device = uniform_samples_device_.CreateView({num_nodes}, dtype_f32_);
NDArray uniform_samples_device = GenerateUniformSamples(rngs, cum_verify_lengths);
NDArray draft_tokens_host = draft_tokens_host_.CreateView({num_nodes}, dtype_i32_);
NDArray draft_tokens_device = draft_tokens_device_.CreateView({num_nodes}, dtype_i32_);

@@ -201,16 +201,6 @@ class GPUSampler : public SamplerObj {
}
CopyArray(draft_tokens_host, draft_tokens_device, copy_stream_);

float* p_uniform_samples = static_cast<float*>(uniform_samples_host->data);
for (int i = 0; i < num_sequence; ++i) {
int start = cum_verify_lengths[i];
int end = cum_verify_lengths[i + 1];
for (int j = start; j < end; j++) {
p_uniform_samples[j] = rngs[i]->GetRandomNumber();
}
}
CopyArray(uniform_samples_host, uniform_samples_device, copy_stream_);

NDArray token_tree_first_child_host =
token_tree_first_child_host_.CreateView({num_nodes}, dtype_i32_);
NDArray token_tree_first_child_device =
@@ -254,10 +244,44 @@ class GPUSampler : public SamplerObj {
token_tree_first_child_device, token_tree_next_sibling_device,
uniform_samples_device, token_tree_parent_ptr_device);

CopyArray(token_tree_parent_ptr_device, token_tree_parent_ptr_host, compute_stream_);
TVMSynchronize(device_.device_type, device_.device_id, compute_stream_);
CopyArray(token_tree_parent_ptr_device, token_tree_parent_ptr_host, copy_stream_);

std::vector<int> sample_indices;
std::vector<SampleResult> additional_sample_result;
{
additional_sample_result.reserve(num_sequence);
// Sample one additional token for each sequence using the probability at the last
// accepted token.
uniform_samples_device = GenerateUniformSamples(rngs, num_sequence);
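// The verification kernel above leaves, for each sequence, the index to sample the
// extra token from in token_tree_parent_ptr_device, so it is reused directly as the
// device-side sample indices (no copy back to the host on the fast path).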
const NDArray& sample_indices_device = token_tree_parent_ptr_device;
// Check whether any generation config requires probability values.
bool need_prob_values = false;
for (int i = 0; i < num_sequence; i++) {
need_prob_values |= generation_cfg[i]->logprobs;
}
std::vector<int> top_prob_offset_indptr;
if (!need_prob_values) {
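// Fast path: no request needs prob values, so the offsets below stay all zero and
// no host-side read of the sample indices (hence no synchronization) is needed here.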
top_prob_offset_indptr.resize(num_sequence + 1, 0);
} else {
// Slow path: if any of the generation configs requires prob values, we need to copy
// the sample indices to the host to compute top_prob_offset_indptr.
TVMSynchronize(device_.device_type, device_.device_id, copy_stream_);
std::vector<int> sample_indices;
sample_indices.reserve(num_sequence);
const int* p_token_tree_parent_ptr = static_cast<int*>(token_tree_parent_ptr_host->data);
for (int i = 0; i < num_sequence; i++) {
sample_indices.push_back(p_token_tree_parent_ptr[i]);
}
CheckProbValues(generation_cfg, sample_indices, num_nodes, num_sequence, vocab_size_,
&top_prob_offset_indptr);
}
auto device_arrays =
SampleOnGPU(probs_on_device, uniform_samples_device, sample_indices_device,
/*need_top_p=*/false, need_prob_values, num_nodes, top_prob_offset_indptr);
auto host_arrays = CopyArraysToCPU(device_arrays, num_sequence, need_prob_values,
top_prob_offset_indptr.back());
additional_sample_result =
CollectSampleResult(host_arrays, num_sequence, need_prob_values, top_prob_offset_indptr);
}

for (int i = 0; i < num_sequence; i++) {
int start = cum_verify_lengths[i];
@@ -270,11 +294,9 @@ class GPUSampler : public SamplerObj {
num_accepted++;
}
std::reverse(sample_results[i].rbegin(), sample_results[i].rbegin() + num_accepted);
sample_indices.push_back(last_accepted);
}
std::vector<SampleResult> additional_sample_result;
additional_sample_result = this->BatchSampleTokensWithProbAfterTopP(
probs_on_device, sample_indices, request_ids, generation_cfg, rngs);

// Append the additional sample result to the sample_results
ICHECK_EQ(additional_sample_result.size(), num_sequence);
for (int i = 0; i < num_sequence; i++) {
sample_results[i].push_back(additional_sample_result[i]);
@@ -347,6 +369,36 @@ class GPUSampler : public SamplerObj {
return sample_results;
}

/*! \brief Collect the sampling results from the NDArrays copied back to the host. */
std::vector<SampleResult> CollectSampleResult(const std::vector<NDArray>& host_arrays,
int num_samples, bool need_prob_values,
const std::vector<int> top_prob_offset_indptr) {
const int* p_sampled_token_ids = static_cast<const int*>(host_arrays[0]->data);
const float* p_sampled_probs = nullptr;
const float* p_top_prob_probs = nullptr;
const int* p_top_prob_indices = nullptr;
if (need_prob_values) {
p_sampled_probs = static_cast<const float*>(host_arrays[1]->data);
p_top_prob_probs = static_cast<const float*>(host_arrays[2]->data);
p_top_prob_indices = static_cast<const int*>(host_arrays[3]->data);
}
std::vector<SampleResult> sample_results;
sample_results.reserve(num_samples);
ICHECK_EQ(top_prob_offset_indptr.size(), num_samples + 1);
for (int i = 0; i < num_samples; ++i) {
// Note: the probability in SampleResult is set to 1.0 when prob values are not needed.
float sampled_prob = need_prob_values ? p_sampled_probs[i] : 1.0;
std::vector<TokenProbPair> top_prob_tokens;
top_prob_tokens.reserve(top_prob_offset_indptr[i + 1] - top_prob_offset_indptr[i]);
for (int j = top_prob_offset_indptr[i]; j < top_prob_offset_indptr[i + 1]; ++j) {
top_prob_tokens.emplace_back(p_top_prob_indices[j], p_top_prob_probs[j]);
}
sample_results.push_back(
SampleResult{{p_sampled_token_ids[i], sampled_prob}, top_prob_tokens});
}
return sample_results;
}

std::vector<SampleResult> ChunkSampleTokensImpl(NDArray probs_on_device, //
const std::vector<int>& sample_indices, //
const Array<GenerationConfig>& generation_cfg, //
@@ -359,8 +411,8 @@ class GPUSampler : public SamplerObj {

// - Generate random numbers.
// Copy the random numbers and sample indices.
auto [uniform_samples_device, sample_indices_device] =
CopySamplesAndIndicesToGPU(sample_indices, rngs, num_samples);
auto uniform_samples_device = GenerateUniformSamples(rngs, num_samples);
auto sample_indices_device = CopySampleIndicesToGPU(sample_indices);

// - Check if there is need for applying top p or prob values,
// so that argsort is needed.
@@ -383,52 +435,49 @@ class GPUSampler : public SamplerObj {
top_prob_offset_indptr.back());

// - Collect the sampling results.
const int* p_sampled_token_ids = static_cast<const int*>(host_arrays[0]->data);
const float* p_sampled_probs = nullptr;
const float* p_top_prob_probs = nullptr;
const int* p_top_prob_indices = nullptr;
if (need_prob_values) {
p_sampled_probs = static_cast<const float*>(host_arrays[1]->data);
p_top_prob_probs = static_cast<const float*>(host_arrays[2]->data);
p_top_prob_indices = static_cast<const int*>(host_arrays[3]->data);
}
std::vector<SampleResult> sample_results;
sample_results.reserve(num_samples);
ICHECK_EQ(top_prob_offset_indptr.size(), num_samples + 1);
for (int i = 0; i < num_samples; ++i) {
// Note: we set the probability in SampleResult to 1.0 since prob value is not needed.
float sampled_prob = need_prob_values ? p_sampled_probs[i] : 1.0;
std::vector<TokenProbPair> top_prob_tokens;
top_prob_tokens.reserve(top_prob_offset_indptr[i + 1] - top_prob_offset_indptr[i]);
for (int j = top_prob_offset_indptr[i]; j < top_prob_offset_indptr[i + 1]; ++j) {
top_prob_tokens.emplace_back(p_top_prob_indices[j], p_top_prob_probs[j]);
}
sample_results.push_back(
SampleResult{{p_sampled_token_ids[i], sampled_prob}, top_prob_tokens});
}

return sample_results;
return CollectSampleResult(host_arrays, num_samples, need_prob_values, top_prob_offset_indptr);
}

/*! \brief Generate uniform random numbers, and copy the numbers and sample indices to GPU. */
std::pair<NDArray, NDArray> CopySamplesAndIndicesToGPU(const std::vector<int>& sample_indices,
const std::vector<RandomGenerator*>& rngs,
int num_samples) {
// Generate random numbers.
/*! \brief Generate num_samples uniform random numbers, and copy them to GPU. */
NDArray GenerateUniformSamples(const std::vector<RandomGenerator*>& rngs, int num_samples) {
float* p_uniform_samples = static_cast<float*>(uniform_samples_host_->data);
int* p_sample_indices = static_cast<int*>(sample_indices_host_->data);
for (int i = 0; i < num_samples; ++i) {
p_uniform_samples[i] = rngs[i]->GetRandomNumber();
p_sample_indices[i] = sample_indices[i];
}
// Copy the random numbers and sample indices to GPU.
NDArray uniform_samples_host = uniform_samples_host_.CreateView({num_samples}, dtype_f32_);
NDArray uniform_samples_device = uniform_samples_device_.CreateView({num_samples}, dtype_f32_);
CopyArray(/*src=*/uniform_samples_host, /*dst=*/uniform_samples_device, copy_stream_);
return uniform_samples_device;
}

/*! \brief Generate uniform random numbers and copy them to GPU. The number of samples
 * for each random generator is given by `cum_num_samples`. */
NDArray GenerateUniformSamples(const std::vector<RandomGenerator*>& rngs,
const std::vector<int>& cum_num_samples) {
float* p_uniform_samples = static_cast<float*>(uniform_samples_host_->data);
int total_samples = cum_num_samples.back();
for (int i = 0; i + 1 < static_cast<int>(cum_num_samples.size()); ++i) {
for (int j = cum_num_samples[i]; j < cum_num_samples[i + 1]; ++j) {
p_uniform_samples[j] = rngs[i]->GetRandomNumber();
}
}
NDArray uniform_samples_host = uniform_samples_host_.CreateView({total_samples}, dtype_f32_);
NDArray uniform_samples_device =
uniform_samples_device_.CreateView({total_samples}, dtype_f32_);
CopyArray(/*src=*/uniform_samples_host, /*dst=*/uniform_samples_device, copy_stream_);
return uniform_samples_device;
}

/*! \brief Copy the sample indices to GPU. */
NDArray CopySampleIndicesToGPU(const std::vector<int>& sample_indices) {
int* p_sample_indices = static_cast<int*>(sample_indices_host_->data);
std::copy(sample_indices.begin(), sample_indices.end(), p_sample_indices);
// Copy the sample indices to GPU.
int num_samples = static_cast<int>(sample_indices.size());
NDArray sample_indices_host = sample_indices_host_.CreateView({num_samples}, dtype_i32_);
NDArray sample_indices_device = sample_indices_device_.CreateView({num_samples}, dtype_i32_);
CopyArray(/*src=*/uniform_samples_host, /*dst=*/uniform_samples_device, copy_stream_);
CopyArray(/*src=*/sample_indices_host, /*dst=*/sample_indices_device, copy_stream_);
return {uniform_samples_device, sample_indices_device};
return sample_indices_device;
}

/*! \brief Check if top p is needed. Update host top p array in place. */