Skip to content

Commit

Permalink
Fix error handling for GPU tensors (triton-inference-server#249)
Browse files Browse the repository at this point in the history
* Fix error handling for GPU tensors

* Fix GPU buffer handling

* Review edit

* Fix for dynamically batched responses with GPU tensor

* Review edits

* Fix unused i variable for GPU=OFF

* Review comments

* Review edit
  • Loading branch information
Tabrizian authored Jun 6, 2023
1 parent 637c7e3 commit 0a54e59
Show file tree
Hide file tree
Showing 10 changed files with 280 additions and 172 deletions.
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,8 @@ set(
src/metric.cc
src/metric_family.h
src/metric_family.cc
src/gpu_buffers.cc
src/gpu_buffers.h
)

set(
Expand Down
88 changes: 88 additions & 0 deletions src/gpu_buffers.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "gpu_buffers.h"
#include "pb_string.h"

namespace triton { namespace backend { namespace python {
GPUBuffersHelper::GPUBuffersHelper()
{
completed_ = false;
}

void
GPUBuffersHelper::AddBuffer(const bi::managed_external_buffer::handle_t& handle)
{
if (completed_) {
throw PythonBackendException(
"It is not possible to add buffers after 'Complete' has been called on "
"a GPUBuffersHelper.");
}

buffers_.emplace_back(handle);
}

void
GPUBuffersHelper::SetError(
std::unique_ptr<SharedMemoryManager>& shm_pool, const std::string& error)
{
error_shm_ = PbString::Create(shm_pool, error);
}

void
GPUBuffersHelper::Complete(std::unique_ptr<SharedMemoryManager>& shm_pool)
{
if (completed_) {
throw PythonBackendException(
"Complete has already been called. Complete should only be called "
"once.");
}
gpu_buffers_shm_ = shm_pool->Construct<GPUBuffersShm>();
if (!error_shm_) {
buffers_handle_shm_ =
shm_pool->Construct<bi::managed_external_buffer::handle_t>(
buffers_.size());
gpu_buffers_shm_.data_->buffer_count = buffers_.size();
gpu_buffers_shm_.data_->success = true;
gpu_buffers_shm_.data_->buffers = buffers_handle_shm_.handle_;
for (size_t i = 0; i < buffers_.size(); ++i) {
buffers_handle_shm_.data_.get()[i] = buffers_[i];
}
} else {
gpu_buffers_shm_.data_->success = false;
gpu_buffers_shm_.data_->error = error_shm_->ShmHandle();
}
completed_ = true;
}


bi::managed_external_buffer::handle_t
GPUBuffersHelper::ShmHandle()
{
return gpu_buffers_shm_.handle_;
}

}}} // namespace triton::backend::python
67 changes: 67 additions & 0 deletions src/gpu_buffers.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#pragma once

#include "pb_string.h"
#include "pb_utils.h"
#include "scoped_defer.h"

namespace triton { namespace backend { namespace python {

/// \param success indicating whether the process of fetching the GPU buffers
/// was successful.
/// \param error if success is equal to false, the error object will be set.
/// \param buffers list of buffers elements.
/// \param buffer_count the number of buffers.
struct GPUBuffersShm {
bool success;
bi::managed_external_buffer::handle_t error;
bi::managed_external_buffer::handle_t buffers;
uint32_t buffer_count;
};

/// Helper class to facilitate transfer of metadata associated
/// the GPU buffers in shared memory.
class GPUBuffersHelper {
public:
GPUBuffersHelper();
void AddBuffer(const bi::managed_external_buffer::handle_t& handle);
void Complete(std::unique_ptr<SharedMemoryManager>& shm_pool);
void SetError(
std::unique_ptr<SharedMemoryManager>& shm_pool, const std::string& error);
bi::managed_external_buffer::handle_t ShmHandle();

private:
AllocatedSharedMemory<GPUBuffersShm> gpu_buffers_shm_;
std::vector<bi::managed_external_buffer::handle_t> buffers_;
AllocatedSharedMemory<bi::managed_external_buffer::handle_t>
buffers_handle_shm_;
std::unique_ptr<PbString> error_shm_;
bool completed_;
};

}}}; // namespace triton::backend::python
11 changes: 10 additions & 1 deletion src/infer_request.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

#include <boost/interprocess/sync/scoped_lock.hpp>

#include "gpu_buffers.h"
#include "pb_utils.h"
#include "scoped_defer.h"
#ifdef TRITON_PB_STUB
Expand Down Expand Up @@ -481,11 +482,19 @@ InferRequest::Exec(const bool is_decoupled)
// Additional round trip required for asking the stub process
// to fill in the GPU tensor buffers
if (has_gpu_tensor) {
AllocatedSharedMemory<GPUBuffersShm> gpu_buffers_shm =
shm_pool->Load<GPUBuffersShm>(
request_batch_shm_ptr->gpu_buffers_handle);
AllocatedSharedMemory<bi::managed_external_buffer::handle_t>
gpu_buffers_handle =
shm_pool->Load<bi::managed_external_buffer::handle_t>(
request_batch_shm_ptr->gpu_buffers_handle);
gpu_buffers_shm.data_->buffers);
try {
if (!gpu_buffers_shm.data_->success) {
std::unique_ptr<PbString> error = PbString::LoadFromSharedMemory(
shm_pool, gpu_buffers_shm.data_->error);
throw PythonBackendException(error->String());
}
#ifdef TRITON_ENABLE_GPU
size_t i = 0;
for (auto& input_tensor : this->Inputs()) {
Expand Down
54 changes: 20 additions & 34 deletions src/infer_response.cc
Original file line number Diff line number Diff line change
Expand Up @@ -201,64 +201,50 @@ InferResponse::IsLastResponse()
}

#ifndef TRITON_PB_STUB
std::shared_ptr<TRITONSERVER_Error*>
void
InferResponse::Send(
TRITONBACKEND_ResponseFactory* response_factory, void* cuda_stream,
TRITONBACKEND_Response* response, void* cuda_stream,
bool& requires_deferred_callback, const uint32_t flags,
std::unique_ptr<SharedMemoryManager>& shm_pool,
GPUBuffersHelper& gpu_buffer_helper,
std::vector<std::pair<std::unique_ptr<PbMemory>, void*>>& output_buffers,
const std::set<std::string>& requested_output_names,
TRITONBACKEND_Response* response)
const std::set<std::string>& requested_output_names)
{
std::shared_ptr<TRITONSERVER_Error*> response_error =
WrapTritonErrorInSharedPtr(nullptr);
std::unique_ptr<ScopedDefer> response_error_handling;
requires_deferred_callback = false;

// Should only destruct the response factory whenever a response factory is
// being created.
bool destruct_response_factor = (response == nullptr);

if (response == nullptr) {
SET_ERROR_AND_RETURN(
response_error,
TRITONBACKEND_ResponseNewFromFactory(&response, response_factory));
}

// This lambda expression will be called when this function exits, if the
// inference response doesn't have any GPU tensors. Otherwise, it will be
// called when the object is destructed or DeferredSendCallback is called.
response_error_handling = std::make_unique<ScopedDefer>(
[response, response_error, flags, response_factory,
destruct_response_factor] {
response_error_handling =
std::make_unique<ScopedDefer>([response, response_error, flags] {
if (response != nullptr) {
LOG_IF_ERROR(
TRITONBACKEND_ResponseSend(response, flags, *response_error),
"failed to send the response.");
if (flags == TRITONSERVER_RESPONSE_COMPLETE_FINAL &&
destruct_response_factor) {
std::unique_ptr<
TRITONBACKEND_ResponseFactory, backend::ResponseFactoryDeleter>
response_factory_ptr(
reinterpret_cast<TRITONBACKEND_ResponseFactory*>(
response_factory));
}
}
});

// Moves the response sending callback so that it is not called until the stub
// process fills in the GPU buffers.
ScopedDefer deferred_task(
[this, &requires_deferred_callback, &response_error_handling] {
if (requires_deferred_callback) {
deferred_send_callback_ = std::move(response_error_handling);
}
});
ScopedDefer deferred_task([this, &requires_deferred_callback,
&response_error_handling, &gpu_buffer_helper,
response_error, &shm_pool] {
if (*response_error != nullptr) {
gpu_buffer_helper.SetError(
shm_pool, TRITONSERVER_ErrorMessage(*response_error));
}
if (requires_deferred_callback) {
deferred_send_callback_ = std::move(response_error_handling);
}
});

if (HasError()) {
*response_error = TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL, Error()->Message().c_str());
return nullptr;
return;
}

bool cuda_copy = false;
Expand Down Expand Up @@ -322,6 +308,7 @@ InferResponse::Send(
output_tensor->ByteSize(), reinterpret_cast<char*>(buffer),
true /* copy_gpu */));
}
gpu_buffer_helper.AddBuffer(output_buffer->ShmHandle());
output_buffers.push_back({std::move(output_buffer), buffer});
#endif
}
Expand All @@ -336,6 +323,7 @@ InferResponse::Send(
shm_pool, actual_memory_type, actual_memory_type_id,
output_tensor->ByteSize(), nullptr /* data ptr */));

gpu_buffer_helper.AddBuffer(output_buffer->ShmHandle());
output_buffers.push_back({std::move(output_buffer), buffer});
}

Expand All @@ -357,8 +345,6 @@ InferResponse::Send(
cudaStreamSynchronize(reinterpret_cast<cudaStream_t>(cuda_stream));
}
#endif // TRITON_ENABLE_GPU

return response_error;
}
#endif

Expand Down
13 changes: 7 additions & 6 deletions src/infer_response.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#pragma once

#include <future>
#include "gpu_buffers.h"
#include "pb_error.h"
#include "pb_tensor.h"
#include "pb_utils.h"
Expand All @@ -49,7 +50,7 @@ struct ResponseShm {
TRITONSERVER_Error* raasnie_err__ = (X); \
if (raasnie_err__ != nullptr) { \
*E = raasnie_err__; \
return E; \
return; \
} \
} while (false)

Expand All @@ -62,7 +63,7 @@ struct ResponseShm {
TRITONSERVER_Error* rarie_err__ = TRITONSERVER_ErrorNew( \
TRITONSERVER_ERROR_INTERNAL, pb_exception.what()); \
*E = rarie_err__; \
return E; \
return; \
} \
} while (false)

Expand Down Expand Up @@ -96,13 +97,13 @@ class InferResponse {
/// response needs to be done in two step. The boolean
/// 'requires_deferred_callback' indicates whether DeferredSendCallback method
/// should be called or not.
std::shared_ptr<TRITONSERVER_Error*> Send(
TRITONBACKEND_ResponseFactory* response_factory, void* cuda_stream,
void Send(
TRITONBACKEND_Response* response, void* cuda_stream,
bool& requires_deferred_callback, const uint32_t flags,
std::unique_ptr<SharedMemoryManager>& shm_pool,
GPUBuffersHelper& gpu_buffer_helper,
std::vector<std::pair<std::unique_ptr<PbMemory>, void*>>& output_buffers,
const std::set<std::string>& requested_output_names = {},
TRITONBACKEND_Response* response = nullptr);
const std::set<std::string>& requested_output_names = {});

void DeferredSendCallback();
#endif
Expand Down
Loading

0 comments on commit 0a54e59

Please sign in to comment.