Add mem_type parameter in SetFixedSizeOutputBuffer. Modify contexts to use SetFixedSizeOutputBuffer (triton-inference-server#559)

* Add 'src_memory_type' parameter in SetFixedSizeOutputBuffer

* Apply the change to all backend contexts

* Fix provider's AllocateOutputBuffer to use mem_type param

* temp

* refactor backend context

* fix provider AllocateOutputBuffer logic to be the same as alloc_fn

* backend_context temp

* Fix provider to return error on class output and 0 byte size

* Update SetFixedSizeOutputBuffer

* Create CUDA stream for all backend contexts if GPU is supported

* Address comment on synchronizing CUDA stream once for all outputs

* Fix no-GPU build failure

* Fix GPU supported build on CPU-only machine

* temp input

* Fix rebase artifact. Work on TODOs related to this PR

* Fix AllocatedSystemMemory size to be 0 if allocated buffer is nullptr

* Fix ensemble ResponseAlloc function

* Remove plan backend copy check as cuda_copy can be false if output is not required
GuanLuo authored and deadeyegoodwin committed Aug 21, 2019
1 parent 0092e07 commit ba755b4
Showing 13 changed files with 400 additions and 300 deletions.
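
For orientation, the change applies one pattern across the backend contexts, visible in the hunks below: SetFixedSizeOutputBuffer now takes the source memory type and returns whether it issued an asynchronous CUDA copy, and the context synchronizes its stream once per run instead of once per output. The following is a hedged sketch assembled from the diff, not code copied from the repository; the surrounding loop and the name outputs_to_read are illustrative only, while SetFixedSizeOutputBuffer, TRTSERVER_MEMORY_CPU, stream_, and the TRTIS_ENABLE_GPU guard come from the hunks.

// Sketch only: accumulate whether any output copy was issued asynchronously
// on the context's CUDA stream, then synchronize once after all outputs.
bool cuda_copy = false;
for (const auto& output : outputs_to_read) {  // 'outputs_to_read' is illustrative
  // The new src_memory_type argument tells the helper where 'content' lives.
  cuda_copy |= SetFixedSizeOutputBuffer(
      name, batch1_byte_size, content, content_shape,
      TRTSERVER_MEMORY_CPU /* src_memory_type */, payloads);
}
#ifdef TRTIS_ENABLE_GPU
if (cuda_copy) {
  cudaStreamSynchronize(stream_);
}
#endif  // TRTIS_ENABLE_GPU
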
68 changes: 32 additions & 36 deletions src/backends/caffe2/netdef_backend.cc
@@ -217,6 +217,8 @@ NetDefBackend::CreateExecutionContext(
contexts_.emplace_back(new Context(instance_name, gpu_device, mbs));
const std::unique_ptr<Context>& context = contexts_.back();

RETURN_IF_ERROR(context->CreateCudaStream());

// Extract input and output names from the config...
std::vector<std::string> input_names;
for (const auto& io : Config().input()) {
@@ -362,7 +364,7 @@ NetDefBackend::Context::SetFixedSizedInputTensor(
const std::string& name, const std::vector<int64_t>& shape,
const Caffe2Workspace::DataType dtype, const size_t batch1_byte_size,
const size_t total_byte_size, std::vector<Scheduler::Payload>* payloads,
std::vector<std::unique_ptr<char[]>>* input_buffers)
std::vector<std::unique_ptr<char[]>>* input_buffers, bool* cuda_copy)
{
// The entire input tensor must be delivered as a single
// contiguous chunk so create a buffer large enough to hold the
@@ -380,7 +382,7 @@ NetDefBackend::Context::SetFixedSizedInputTensor(
request_header.batch_size() * batch1_byte_size);
}

SetInputBuffer(
*cuda_copy |= SetInputBuffer(
name, expected_byte_sizes, payloads, TRTSERVER_MEMORY_CPU, buffer);

Caffe2Workspace::Error err = workspace_->SetInputTensor(
@@ -396,7 +398,8 @@ Status
NetDefBackend::Context::ReadFixedSizedOutputTensor(
const std::string& name, const Caffe2Workspace::DataType dtype,
const size_t dtype_byte_size, const size_t total_batch_size,
std::vector<Scheduler::Payload>* payloads, const DimsList& dims)
const DimsList& dims, std::vector<Scheduler::Payload>* payloads,
bool* cuda_copy)
{
std::vector<int64_t> content_shape;
const char* content = nullptr;
Expand Down Expand Up @@ -436,42 +439,23 @@ NetDefBackend::Context::ReadFixedSizedOutputTensor(
std::to_string(batch1_byte_size));
}

size_t content_offset = 0;

for (auto& payload : *payloads) {
const InferRequestHeader& request_header =
payload.request_provider_->RequestHeader();
const size_t expected_byte_size =
request_header.batch_size() * batch1_byte_size;

// If 'payload' requested this output then copy it from
// 'content'. If it did not request this output then just
// skip it in the 'content'.
if ((payload.response_provider_ != nullptr) &&
payload.response_provider_->RequiresOutput(name)) {
void* buffer;
Status status = payload.response_provider_->AllocateOutputBuffer(
name, &buffer, expected_byte_size, content_shape);
if (status.IsOk()) {
memcpy(buffer, content + content_offset, expected_byte_size);
}

if (!status.IsOk()) {
payload.status_ = status;
}
}

content_offset += expected_byte_size;
}

// [TODO] use the following statement. Right now we always create
// netdef workspace with inputs / outputs on CPU node
// auto content_memory_type = (gpu_device_ == NO_GPU_DEVICE)
// ? TRTSERVER_MEMORY_CPU
// : TRTSERVER_MEMORY_GPU;
auto content_memory_type = TRTSERVER_MEMORY_CPU;
*cuda_copy |= SetFixedSizeOutputBuffer(
name, batch1_byte_size, content, content_shape, content_memory_type,
payloads);
return Status::Success;
}

Status
NetDefBackend::Context::SetInput(
const std::string& name, const DataType datatype, const DimsList& dims,
const size_t total_batch_size, std::vector<Scheduler::Payload>* payloads,
std::vector<std::unique_ptr<char[]>>* input_buffers)
std::vector<std::unique_ptr<char[]>>* input_buffers, bool* cuda_copy)
{
// Get the shape of the input. The provider has already checked that
// the request shape is valid so don't need to do it here.
@@ -498,7 +482,7 @@ NetDefBackend::Context::SetInput(

return SetFixedSizedInputTensor(
name, shape, dtype, batch1_byte_size, total_byte_size, payloads,
input_buffers);
input_buffers, cuda_copy);
}

Status
@@ -555,6 +539,7 @@ NetDefBackend::Context::Run(
// into the corresponding tensor.

// Inputs from the request...
bool cuda_copy = false;
for (const auto& input : input_request_provider->RequestHeader().input()) {
const std::string& name = input.name();

@@ -563,7 +548,7 @@

RETURN_IF_ERROR(SetInput(
name, input_config->data_type(), input.dims(), total_batch_size,
payloads, &input_buffers));
payloads, &input_buffers, &cuda_copy));
}

// Additional inputs added to the provider...
@@ -576,9 +561,14 @@
pr.second;
RETURN_IF_ERROR(SetInput(
name, override->datatype_, override->dims_, total_batch_size,
payloads, &input_buffers));
payloads, &input_buffers, &cuda_copy));
}
}
#ifdef TRTIS_ENABLE_GPU
if (cuda_copy) {
cudaStreamSynchronize(stream_);
}
#endif // TRTIS_ENABLE_GPU

// Run...
Caffe2Workspace::Error err = workspace_->Run();
@@ -588,6 +578,7 @@

// Make sure each output is of the expected size and copy it into
// the payload responses.
cuda_copy = false;
for (const auto& output : base->Config().output()) {
const std::string& name = output.name();

@@ -605,8 +596,13 @@

RETURN_IF_ERROR(ReadFixedSizedOutputTensor(
name, dtype, GetDataTypeByteSize(output_config->data_type()),
total_batch_size, payloads, output_dims));
total_batch_size, output_dims, payloads, &cuda_copy));
}
#ifdef TRTIS_ENABLE_GPU
if (cuda_copy) {
cudaStreamSynchronize(stream_);
}
#endif // TRTIS_ENABLE_GPU

return Status::Success;
}
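
The per-payload copy loop deleted above is the logic the shared SetFixedSizeOutputBuffer helper now takes over. Below is a minimal sketch of such a helper, reconstructed from the removed code under stated assumptions: the cudaMemcpyAsync branch, the cudaMemcpyDefault copy kind, the zero-byte-size guard, and the TRTSERVER_Memory_Type parameter type are assumptions (only the TRTSERVER_MEMORY_CPU constant appears in the diff), and the real helper presumably also negotiates the destination memory type with the response provider.

bool
BackendContext::SetFixedSizeOutputBuffer(
    const std::string& name, const size_t batch1_byte_size,
    const char* content, const std::vector<int64_t>& content_shape,
    const TRTSERVER_Memory_Type src_memory_type,
    std::vector<Scheduler::Payload>* payloads)
{
  bool cuda_copy = false;
  size_t content_offset = 0;
  for (auto& payload : *payloads) {
    const InferRequestHeader& request_header =
        payload.request_provider_->RequestHeader();
    const size_t expected_byte_size =
        request_header.batch_size() * batch1_byte_size;

    // Only payloads that requested this output receive a copy; all others
    // just advance the read offset into 'content'.
    if ((payload.response_provider_ != nullptr) &&
        payload.response_provider_->RequiresOutput(name)) {
      void* buffer;
      Status status = payload.response_provider_->AllocateOutputBuffer(
          name, &buffer, expected_byte_size, content_shape);
      if (status.IsOk() && (expected_byte_size != 0)) {
        if (src_memory_type == TRTSERVER_MEMORY_CPU) {
          // Source already on the host: a plain synchronous copy suffices.
          memcpy(buffer, content + content_offset, expected_byte_size);
        } else {
#ifdef TRTIS_ENABLE_GPU
          // Source on the device: copy asynchronously on the context stream
          // and report it so the caller can synchronize once for all outputs.
          cudaMemcpyAsync(
              buffer, content + content_offset, expected_byte_size,
              cudaMemcpyDefault, stream_);
          cuda_copy = true;
#endif  // TRTIS_ENABLE_GPU
        }
      }
      if (!status.IsOk()) {
        payload.status_ = status;
      }
    }
    content_offset += expected_byte_size;
  }
  return cuda_copy;
}
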
7 changes: 4 additions & 3 deletions src/backends/caffe2/netdef_backend.h
@@ -84,7 +84,7 @@ class NetDefBackend : public InferenceBackend {
const std::string& name, const DataType datatype, const DimsList& dims,
const size_t total_batch_size,
std::vector<Scheduler::Payload>* payloads,
std::vector<std::unique_ptr<char[]>>* input_buffers);
std::vector<std::unique_ptr<char[]>>* input_buffers, bool* cuda_copy);

// Run model to execute for one or more requests. This function
// assumes that it is only called by the single runner thread that
@@ -100,13 +100,14 @@
const std::string& input_name, const std::vector<int64_t>& shape,
const Caffe2Workspace::DataType dtype, const size_t batch1_byte_size,
const size_t total_byte_size, std::vector<Scheduler::Payload>* payloads,
std::vector<std::unique_ptr<char[]>>* input_buffers);
std::vector<std::unique_ptr<char[]>>* input_buffers, bool* cuda_copy);

// Read an output tensor into one or more payloads.
Status ReadFixedSizedOutputTensor(
const std::string& name, const Caffe2Workspace::DataType dtype,
const size_t dtype_byte_size, const size_t total_batch_size,
std::vector<Scheduler::Payload>* payloads, const DimsList& dims);
const DimsList& dims, std::vector<Scheduler::Payload>* payloads,
bool* cuda_copy);

// Caffe2 workspace.
std::unique_ptr<Caffe2Workspace> workspace_;
18 changes: 16 additions & 2 deletions src/backends/onnx/onnx_backend.cc
@@ -196,6 +196,8 @@ OnnxBackend::CreateExecutionContext(
contexts_.emplace_back(new Context(instance_name, gpu_device, mbs));
Context* context = contexts_.back().get();

RETURN_IF_ERROR(context->CreateCudaStream());

// Set Onnx session option with proper device
OrtSessionOptions* session_options;
RETURN_IF_ORT_ERROR(
@@ -685,6 +687,7 @@ OnnxBackend::Context::ReadOutputTensors(
const std::vector<const char*>& output_names,
std::vector<Scheduler::Payload>* payloads)
{
bool cuda_copy = false;
for (size_t idx = 0; idx < output_names.size(); idx++) {
std::string name = std::string(output_names[idx]);

@@ -755,10 +758,21 @@
RETURN_IF_ORT_ERROR(
OrtGetTensorMutableData(output_tensor, (void**)&content));

SetFixedSizeOutputBuffer(
name, batch1_byte_size, content, content_shape, payloads);
// [TODO] currently ONNX output data are always on CPU
// https://github.com/microsoft/onnxruntime/issues/1621
auto content_memory_type = TRTSERVER_MEMORY_CPU;
cuda_copy |= SetFixedSizeOutputBuffer(
name, batch1_byte_size, content, content_shape, content_memory_type,
payloads);
}
}

#ifdef TRTIS_ENABLE_GPU
if (cuda_copy) {
cudaStreamSynchronize(stream_);
}
#endif // TRTIS_ENABLE_GPU

return Status::Success;
}

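
Both contexts above now call CreateCudaStream() during CreateExecutionContext. The helper itself is not shown in this commit; the sketch below is a guess at what it presumably does when GPU support is compiled in. The cudaStreamCreateWithFlags call, the gpu_device_ check, and the RequestStatusCode::INTERNAL error code are assumptions, while NO_GPU_DEVICE and the TRTIS_ENABLE_GPU guard come from the diff.

Status
BackendContext::CreateCudaStream()
{
#ifdef TRTIS_ENABLE_GPU
  // Only create a stream when the context is bound to a GPU; CPU-only
  // contexts leave 'stream_' unset so no synchronization is ever needed.
  if (gpu_device_ != NO_GPU_DEVICE) {
    cudaError_t err =
        cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking);
    if (err != cudaSuccess) {
      return Status(
          RequestStatusCode::INTERNAL,
          "unable to create CUDA stream: " +
              std::string(cudaGetErrorString(err)));
    }
  }
#endif  // TRTIS_ENABLE_GPU
  return Status::Success;
}
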