Add mem_type parameter in SetFixedSizeOutputBuffer. Modify contexts to use SetFixedSizeOutputBuffer (triton-inference-server#559)

* Add 'src_memory_type' parameter in SetFixedSizeOutputBuffer

* Apply the change to all backend contexts

* Fix provider's AllocateOutputBuffer to use mem_type param

* temp

* refactor backend context

* fix provider AllocateOutputBuffer logic to be the same as alloc_fn

* backend_context temp

* Fix provider to return error on class output and 0 byte size

* Update SetFixedSizeOutputBuffer

* Create CUDA stream for all backend contexts if GPU is supported

* Address comment on synchronizing CUDA stream once for all outputs

* Fix no-GPU build failure

* Fix GPU supported build on CPU-only machine

* temp input

* Fix rebase artifact. Work on TODOs related to this PR

* Fix AllocatedSystemMemory size to be 0 if allocated buffer is nullptr

* Fix ensemble ResponseAlloc function

* Remove plan backend copy check as cuda_copy can be false if output is not required
GuanLuo authored and deadeyegoodwin committed Aug 21, 2019
1 parent 0092e07 commit ba755b4
Showing 13 changed files with 400 additions and 300 deletions.
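
For orientation, the change applies one pattern across the backend contexts, visible in the hunks below: SetFixedSizeOutputBuffer now takes the source memory type and returns whether it issued an asynchronous CUDA copy, and the context synchronizes its stream once per run instead of once per output. The following is a hedged sketch assembled from the diff, not code copied from the repository; the surrounding loop and the name outputs_to_read are illustrative only, while SetFixedSizeOutputBuffer, TRTSERVER_MEMORY_CPU, stream_, and the TRTIS_ENABLE_GPU guard come from the hunks.

// Sketch only: accumulate whether any output copy was issued asynchronously
// on the context's CUDA stream, then synchronize once after all outputs.
bool cuda_copy = false;
for (const auto& output : outputs_to_read) {  // 'outputs_to_read' is illustrative
  // The new src_memory_type argument tells the helper where 'content' lives.
  cuda_copy |= SetFixedSizeOutputBuffer(
      name, batch1_byte_size, content, content_shape,
      TRTSERVER_MEMORY_CPU /* src_memory_type */, payloads);
}
#ifdef TRTIS_ENABLE_GPU
if (cuda_copy) {
  cudaStreamSynchronize(stream_);
}
#endif  // TRTIS_ENABLE_GPU
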
68 changes: 32 additions & 36 deletions src/backends/caffe2/netdef_backend.cc
@@ -217,6 +217,8 @@ NetDefBackend::CreateExecutionContext(
contexts_.emplace_back(new Context(instance_name, gpu_device, mbs));
const std::unique_ptr<Context>& context = contexts_.back();

RETURN_IF_ERROR(context->CreateCudaStream());

// Extract input and output names from the config...
std::vector<std::string> input_names;
for (const auto& io : Config().input()) {
@@ -362,7 +364,7 @@ NetDefBackend::Context::SetFixedSizedInputTensor(
const std::string& name, const std::vector<int64_t>& shape,
const Caffe2Workspace::DataType dtype, const size_t batch1_byte_size,
const size_t total_byte_size, std::vector<Scheduler::Payload>* payloads,
std::vector<std::unique_ptr<char[]>>* input_buffers)
std::vector<std::unique_ptr<char[]>>* input_buffers, bool* cuda_copy)
{
// The entire input tensor must be delivered as a single
// contiguous chunk so create a buffer large enough to hold the
@@ -380,7 +382,7 @@ NetDefBackend::Context::SetFixedSizedInputTensor(
request_header.batch_size() * batch1_byte_size);
}

SetInputBuffer(
*cuda_copy |= SetInputBuffer(
name, expected_byte_sizes, payloads, TRTSERVER_MEMORY_CPU, buffer);

Caffe2Workspace::Error err = workspace_->SetInputTensor(
@@ -396,7 +398,8 @@ Status
NetDefBackend::Context::ReadFixedSizedOutputTensor(
const std::string& name, const Caffe2Workspace::DataType dtype,
const size_t dtype_byte_size, const size_t total_batch_size,
std::vector<Scheduler::Payload>* payloads, const DimsList& dims)
const DimsList& dims, std::vector<Scheduler::Payload>* payloads,
bool* cuda_copy)
{
std::vector<int64_t> content_shape;
const char* content = nullptr;
Expand Down Expand Up @@ -436,42 +439,23 @@ NetDefBackend::Context::ReadFixedSizedOutputTensor(
std::to_string(batch1_byte_size));
}

size_t content_offset = 0;

for (auto& payload : *payloads) {
const InferRequestHeader& request_header =
payload.request_provider_->RequestHeader();
const size_t expected_byte_size =
request_header.batch_size() * batch1_byte_size;

// If 'payload' requested this output then copy it from
// 'content'. If it did not request this output then just
// skip it in the 'content'.
if ((payload.response_provider_ != nullptr) &&
payload.response_provider_->RequiresOutput(name)) {
void* buffer;
Status status = payload.response_provider_->AllocateOutputBuffer(
name, &buffer, expected_byte_size, content_shape);
if (status.IsOk()) {
memcpy(buffer, content + content_offset, expected_byte_size);
}

if (!status.IsOk()) {
payload.status_ = status;
}
}

content_offset += expected_byte_size;
}

// [TODO] use the following statement. Right now we always create
// netdef workspace with inputs / outputs on CPU node
// auto content_memory_type = (gpu_device_ == NO_GPU_DEVICE)
// ? TRTSERVER_MEMORY_CPU
// : TRTSERVER_MEMORY_GPU;
auto content_memory_type = TRTSERVER_MEMORY_CPU;
*cuda_copy |= SetFixedSizeOutputBuffer(
name, batch1_byte_size, content, content_shape, content_memory_type,
payloads);
return Status::Success;
}

Status
NetDefBackend::Context::SetInput(
const std::string& name, const DataType datatype, const DimsList& dims,
const size_t total_batch_size, std::vector<Scheduler::Payload>* payloads,
std::vector<std::unique_ptr<char[]>>* input_buffers)
std::vector<std::unique_ptr<char[]>>* input_buffers, bool* cuda_copy)
{
// Get the shape of the input. The provider has already checked that
// the request shape is valid so don't need to do it here.
@@ -498,7 +482,7 @@ NetDefBackend::Context::SetInput(

return SetFixedSizedInputTensor(
name, shape, dtype, batch1_byte_size, total_byte_size, payloads,
input_buffers);
input_buffers, cuda_copy);
}

Status
@@ -555,6 +539,7 @@ NetDefBackend::Context::Run(
// into the corresponding tensor.

// Inputs from the request...
bool cuda_copy = false;
for (const auto& input : input_request_provider->RequestHeader().input()) {
const std::string& name = input.name();

@@ -563,7 +548,7 @@

RETURN_IF_ERROR(SetInput(
name, input_config->data_type(), input.dims(), total_batch_size,
payloads, &input_buffers));
payloads, &input_buffers, &cuda_copy));
}

// Additional inputs added to the provider...
@@ -576,9 +561,14 @@
pr.second;
RETURN_IF_ERROR(SetInput(
name, override->datatype_, override->dims_, total_batch_size,
payloads, &input_buffers));
payloads, &input_buffers, &cuda_copy));
}
}
#ifdef TRTIS_ENABLE_GPU
if (cuda_copy) {
cudaStreamSynchronize(stream_);
}
#endif // TRTIS_ENABLE_GPU

// Run...
Caffe2Workspace::Error err = workspace_->Run();
@@ -588,6 +578,7 @@

// Make sure each output is of the expected size and copy it into
// the payload responses.
cuda_copy = false;
for (const auto& output : base->Config().output()) {
const std::string& name = output.name();

@@ -605,8 +596,13 @@

RETURN_IF_ERROR(ReadFixedSizedOutputTensor(
name, dtype, GetDataTypeByteSize(output_config->data_type()),
total_batch_size, payloads, output_dims));
total_batch_size, output_dims, payloads, &cuda_copy));
}
#ifdef TRTIS_ENABLE_GPU
if (cuda_copy) {
cudaStreamSynchronize(stream_);
}
#endif // TRTIS_ENABLE_GPU

return Status::Success;
}
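
The per-payload copy loop deleted above is the logic the shared SetFixedSizeOutputBuffer helper now takes over. Below is a minimal sketch of such a helper, reconstructed from the removed code under stated assumptions: the cudaMemcpyAsync branch, the cudaMemcpyDefault copy kind, the zero-byte-size guard, and the TRTSERVER_Memory_Type parameter type are assumptions (only the TRTSERVER_MEMORY_CPU constant appears in the diff), and the real helper presumably also negotiates the destination memory type with the response provider.

bool
BackendContext::SetFixedSizeOutputBuffer(
    const std::string& name, const size_t batch1_byte_size,
    const char* content, const std::vector<int64_t>& content_shape,
    const TRTSERVER_Memory_Type src_memory_type,
    std::vector<Scheduler::Payload>* payloads)
{
  bool cuda_copy = false;
  size_t content_offset = 0;
  for (auto& payload : *payloads) {
    const InferRequestHeader& request_header =
        payload.request_provider_->RequestHeader();
    const size_t expected_byte_size =
        request_header.batch_size() * batch1_byte_size;

    // Only payloads that requested this output receive a copy; all others
    // just advance the read offset into 'content'.
    if ((payload.response_provider_ != nullptr) &&
        payload.response_provider_->RequiresOutput(name)) {
      void* buffer;
      Status status = payload.response_provider_->AllocateOutputBuffer(
          name, &buffer, expected_byte_size, content_shape);
      if (status.IsOk() && (expected_byte_size != 0)) {
        if (src_memory_type == TRTSERVER_MEMORY_CPU) {
          // Source already on the host: a plain synchronous copy suffices.
          memcpy(buffer, content + content_offset, expected_byte_size);
        } else {
#ifdef TRTIS_ENABLE_GPU
          // Source on the device: copy asynchronously on the context stream
          // and report it so the caller can synchronize once for all outputs.
          cudaMemcpyAsync(
              buffer, content + content_offset, expected_byte_size,
              cudaMemcpyDefault, stream_);
          cuda_copy = true;
#endif  // TRTIS_ENABLE_GPU
        }
      }
      if (!status.IsOk()) {
        payload.status_ = status;
      }
    }
    content_offset += expected_byte_size;
  }
  return cuda_copy;
}
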
7 changes: 4 additions & 3 deletions src/backends/caffe2/netdef_backend.h
@@ -84,7 +84,7 @@ class NetDefBackend : public InferenceBackend {
const std::string& name, const DataType datatype, const DimsList& dims,
const size_t total_batch_size,
std::vector<Scheduler::Payload>* payloads,
std::vector<std::unique_ptr<char[]>>* input_buffers);
std::vector<std::unique_ptr<char[]>>* input_buffers, bool* cuda_copy);

// Run model to execute for one or more requests. This function
// assumes that it is only called by the single runner thread that
@@ -100,13 +100,14 @@
const std::string& input_name, const std::vector<int64_t>& shape,
const Caffe2Workspace::DataType dtype, const size_t batch1_byte_size,
const size_t total_byte_size, std::vector<Scheduler::Payload>* payloads,
std::vector<std::unique_ptr<char[]>>* input_buffers);
std::vector<std::unique_ptr<char[]>>* input_buffers, bool* cuda_copy);

// Read an output tensor into one or more payloads.
Status ReadFixedSizedOutputTensor(
const std::string& name, const Caffe2Workspace::DataType dtype,
const size_t dtype_byte_size, const size_t total_batch_size,
std::vector<Scheduler::Payload>* payloads, const DimsList& dims);
const DimsList& dims, std::vector<Scheduler::Payload>* payloads,
bool* cuda_copy);

// Caffe2 workspace.
std::unique_ptr<Caffe2Workspace> workspace_;
18 changes: 16 additions & 2 deletions src/backends/onnx/onnx_backend.cc
@@ -196,6 +196,8 @@ OnnxBackend::CreateExecutionContext(
contexts_.emplace_back(new Context(instance_name, gpu_device, mbs));
Context* context = contexts_.back().get();

RETURN_IF_ERROR(context->CreateCudaStream());

// Set Onnx session option with proper device
OrtSessionOptions* session_options;
RETURN_IF_ORT_ERROR(
@@ -685,6 +687,7 @@ OnnxBackend::Context::ReadOutputTensors(
const std::vector<const char*>& output_names,
std::vector<Scheduler::Payload>* payloads)
{
bool cuda_copy = false;
for (size_t idx = 0; idx < output_names.size(); idx++) {
std::string name = std::string(output_names[idx]);

@@ -755,10 +758,21 @@
RETURN_IF_ORT_ERROR(
OrtGetTensorMutableData(output_tensor, (void**)&content));

SetFixedSizeOutputBuffer(
name, batch1_byte_size, content, content_shape, payloads);
// [TODO] currently ONNX output data are always on CPU
// https://github.com/microsoft/onnxruntime/issues/1621
auto content_memory_type = TRTSERVER_MEMORY_CPU;
cuda_copy |= SetFixedSizeOutputBuffer(
name, batch1_byte_size, content, content_shape, content_memory_type,
payloads);
}
}

#ifdef TRTIS_ENABLE_GPU
if (cuda_copy) {
cudaStreamSynchronize(stream_);
}
#endif // TRTIS_ENABLE_GPU

return Status::Success;
}

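
Both contexts above now call CreateCudaStream() during CreateExecutionContext. The helper itself is not shown in this commit; the sketch below is a guess at what it presumably does when GPU support is compiled in. The cudaStreamCreateWithFlags call, the gpu_device_ check, and the RequestStatusCode::INTERNAL error code are assumptions, while NO_GPU_DEVICE and the TRTIS_ENABLE_GPU guard come from the diff.

Status
BackendContext::CreateCudaStream()
{
#ifdef TRTIS_ENABLE_GPU
  // Only create a stream when the context is bound to a GPU; CPU-only
  // contexts leave 'stream_' unset so no synchronization is ever needed.
  if (gpu_device_ != NO_GPU_DEVICE) {
    cudaError_t err =
        cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking);
    if (err != cudaSuccess) {
      return Status(
          RequestStatusCode::INTERNAL,
          "unable to create CUDA stream: " +
              std::string(cudaGetErrorString(err)));
    }
  }
#endif  // TRTIS_ENABLE_GPU
  return Status::Success;
}
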