Add support for DLA in TensorRT backend (triton-inference-server#2912)
* Add support for DLA to TRT

* Add checks and return an error if DLA is not supported
  - Or if the secondary device ID is not valid

* Add L0_trt_dla

* Update device_id in ValidateModelConfigInt64

* Clean up to use a single device_engines_ map for TRT
  - Use -1 for the DLA core when DLA is not enabled, so earlier behavior is unaffected

* Additional cleanup and fixes

* Pick up the DLA resnet50 model from trt_dla_model_store

* Don't modify the DLA core if a runtime is provided
CoderHam authored May 26, 2021
1 parent 6438fce commit 1f333c6
Showing 8 changed files with 184 additions and 25 deletions.
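
For reference, a model configuration that exercises the new secondary device support might look like the sketch below. The instance_group, secondary_devices, KIND_NVDLA, and device_id field names come from the diff in this commit; the model name, GPU ID, and DLA core ID values are illustrative only.

    # config.pbtxt sketch (illustrative): one TensorRT instance on GPU 0
    # whose runtime is bound to DLA core 0. The PLAN itself must have been
    # built for DLA.
    name: "resnet50_plan"
    platform: "tensorrt_plan"
    instance_group [
      {
        count: 1
        kind: KIND_GPU
        gpus: [ 0 ]
        secondary_devices [
          {
            kind: KIND_NVDLA
            device_id: 0
          }
        ]
      }
    ]
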
102 changes: 102 additions & 0 deletions qa/L0_trt_dla/test.sh
@@ -0,0 +1,102 @@
#!/bin/bash
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION}
if [ "$#" -ge 1 ]; then
REPO_VERSION=$1
fi
if [ -z "$REPO_VERSION" ]; then
echo -e "Repository version must be specified"
echo -e "\n***\n*** Test Failed\n***"
exit 1
fi

# Need to run on only one device since we are only creating a single
# PLAN. Without this the test will fail on a heterogeneous system.
export CUDA_VISIBLE_DEVICES=0

IMAGE_CLIENT=../clients/image_client
IMAGE=../images/vulture.jpeg

CAFFE2PLAN=../common/caffe2plan

DATADIR=${DATADIR:="/data/inferenceserver/${REPO_VERSION}"}
OPTDIR=${OPTDIR:="/opt"}
SERVER=${OPTDIR}/tritonserver/bin/tritonserver
BACKEND_DIR=${OPTDIR}/tritonserver/backends

SERVER_ARGS="--model-repository=`pwd`/models --exit-timeout-secs=120 --backend-directory=${BACKEND_DIR}"
SERVER_LOG="./inference_server.log"
source ../common/util.sh

rm -fr models && mkdir models
cp -r $DATADIR/trt_dla_model_store/resnet50_plan models/.
rm -f *.log

set +e

run_server
if [ "$SERVER_PID" == "0" ]; then
echo -e "\n***\n*** Failed to start $SERVER\n***"
cat $SERVER_LOG
exit 1
fi

RET=0

set +e

CLIENT_LOG=${IMAGE_CLIENT##*/}.log

echo "Model: resnet50_plan" >> $CLIENT_LOG
$IMAGE_CLIENT $EXTRA_ARGS -m resnet50_plan -s VGG -c 1 -b 1 $IMAGE >> $CLIENT_LOG 2>&1
if [ $? -ne 0 ]; then
cat $CLIENT_LOG
RET=1
fi

if [ `grep -c VULTURE $CLIENT_LOG` != "1" ]; then
echo -e "\n***\n*** Failed. Expected 1 VULTURE results\n***"
RET=1
fi

set -e

kill $SERVER_PID
wait $SERVER_PID

rm -rf models

if [ $RET -eq 0 ]; then
echo -e "\n***\n*** Test Passed\n***"
else
echo -e "\n***\n*** Test FAILED\n***"
fi

exit $RET
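
The test above expects a prebuilt DLA-capable resnet50 PLAN in trt_dla_model_store; the commit does not show how that PLAN is produced. As an assumption (not part of this change), such an engine can be built offline with trtexec, roughly as in the sketch below; the model file name and output path are placeholders.

    # Hypothetical sketch: build a DLA-enabled PLAN offline. DLA requires
    # reduced precision (hence --fp16), and --allowGPUFallback lets layers
    # that DLA cannot run fall back to the GPU.
    trtexec --onnx=resnet50.onnx \
            --useDLACore=0 \
            --fp16 \
            --allowGPUFallback \
            --saveEngine=models/resnet50_plan/1/model.plan
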
2 changes: 1 addition & 1 deletion src/backends/tensorrt/autofill.cc
@@ -471,7 +471,7 @@ AutoFillPlan::Create(
}
std::vector<char> plan_data(plan_data_str.begin(), plan_data_str.end());

if (!LoadPlan(plan_data, &runtime, &engine).IsOk()) {
if (!LoadPlan(plan_data, -1 /* dla_core_id */, &runtime, &engine).IsOk()) {
if (engine != nullptr) {
engine->destroy();
engine = nullptr;
17 changes: 15 additions & 2 deletions src/backends/tensorrt/loader.cc
@@ -35,8 +35,8 @@ namespace nvidia { namespace inferenceserver {

Status
LoadPlan(
const std::vector<char>& model_data, nvinfer1::IRuntime** runtime,
nvinfer1::ICudaEngine** engine)
const std::vector<char>& model_data, int64_t dla_core_id,
nvinfer1::IRuntime** runtime, nvinfer1::ICudaEngine** engine)
{
// Create runtime only if it is not provided
if (*runtime == nullptr) {
@@ -45,6 +45,19 @@ LoadPlan(
return Status(
Status::Code::INTERNAL, "unable to create TensorRT runtime");
}

// Report error if 'dla_core_id' >= number of DLA cores
if (dla_core_id != -1) {
if (dla_core_id < (*runtime)->getNbDLACores()) {
(*runtime)->setDLACore(dla_core_id);
} else {
return Status(
Status::Code::INVALID_ARG,
("unable to create TensorRT runtime with DLA Core ID: " +
std::to_string(dla_core_id))
.c_str());
}
}
}

*engine =
12 changes: 7 additions & 5 deletions src/backends/tensorrt/loader.h
@@ -36,14 +36,16 @@ namespace nvidia { namespace inferenceserver {
/// responsibility to destroy any returned runtime or engine object
/// even if an error is returned.
///
/// \param model_data The binary blob of the plan data
/// \param model_data The binary blob of the plan data.
/// \param dla_core_id The DLA core to use for this runtime. Does not
/// use DLA when set to -1.
/// \param runtime Returns the IRuntime object, or nullptr if failed
/// to create
/// to create.
/// \param engine Returns the ICudaEngine object, or nullptr if failed
/// to create
/// to create.
/// \return Error status.
Status LoadPlan(
const std::vector<char>& model_data, nvinfer1::IRuntime** runtime,
nvinfer1::ICudaEngine** engine);
const std::vector<char>& model_data, int64_t dla_core_id,
nvinfer1::IRuntime** runtime, nvinfer1::ICudaEngine** engine);

}} // namespace nvidia::inferenceserver
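
A minimal sketch of calling the updated LoadPlan() with the signature above: passing -1 keeps the previous no-DLA behavior, while a valid core ID binds the runtime to that DLA core before the engine is deserialized. The helper name LoadOnDevice and the cleanup shown are illustrative, following the header comment that the caller must destroy any returned objects even on error.

    // Illustrative sketch, not part of the commit.
    #include <vector>

    #include "src/backends/tensorrt/loader.h"

    namespace ni = nvidia::inferenceserver;

    ni::Status
    LoadOnDevice(const std::vector<char>& plan_bytes, int64_t dla_core_id)
    {
      nvinfer1::IRuntime* runtime = nullptr;   // created by LoadPlan when null
      nvinfer1::ICudaEngine* engine = nullptr;
      // dla_core_id == -1 means no DLA; otherwise the runtime is bound to that
      // core, and LoadPlan returns INVALID_ARG if the core does not exist.
      ni::Status status =
          ni::LoadPlan(plan_bytes, dla_core_id, &runtime, &engine);
      if (!status.IsOk()) {
        // Per loader.h, the caller owns runtime/engine even when an error is
        // returned.
        if (engine != nullptr) {
          engine->destroy();
        }
        if (runtime != nullptr) {
          runtime->destroy();
        }
      }
      return status;
    }
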
59 changes: 48 additions & 11 deletions src/backends/tensorrt/plan_backend.cc
@@ -290,6 +290,28 @@ PlanBackend::CreateExecutionContexts(
" must be KIND_GPU and must specify at least one GPU id");
}

// Use DLA core id or GPU id from config based on instance group type
int64_t dla_core_id = -1;
uint32_t secondary_device_count = group.secondary_devices().size();
if (secondary_device_count != 0) {
if (secondary_device_count != 1) {
return Status(
Status::Code::INVALID_ARG,
group.name() + " of model " + Name() +
" must have either zero or or one secondary devices");
}

auto secondary_device = group.secondary_devices().at(0);
if (secondary_device.kind() !=
inference::ModelInstanceGroup::SecondaryDevice::KIND_NVDLA) {
return Status(
Status::Code::INVALID_ARG, "secondary device " + group.name() +
" of model " + Name() +
" must be KIND_NVDLA");
}
dla_core_id = secondary_device.device_id();
}

for (int c = 0; c < group.count(); c++) {
for (int gpu_device : group.gpus()) {
size_t runner_idx = 0;
@@ -312,13 +334,20 @@
}
runner_idx = it->second;
}

// The last entry in contexts_ is the newly created context
auto& queue = available_context_queue_[runner_idx];
queue->Put(contexts_.size());

const std::string instance_name = group.name() + "_" +
std::to_string(c) + "_gpu" +
std::to_string(gpu_device);
std::string instance_name;
if (dla_core_id != -1) {
instance_name = group.name() + "_" + std::to_string(c) + "_gpu" +
std::to_string(gpu_device) + "_dla" +
std::to_string(dla_core_id);
} else {
instance_name = group.name() + "_" + std::to_string(c) + "_gpu" +
std::to_string(gpu_device);
}

// Determine the model file to use for device compute capability
cudaDeviceProp cuprops;
@@ -347,10 +376,11 @@
}

// Create shared engine for the device if haven't tried so.
auto eit = device_engines_.find(gpu_device);
auto device_pair = std::make_pair(gpu_device, dla_core_id);
auto eit = device_engines_.find(device_pair);
if (eit == device_engines_.end()) {
eit = device_engines_
.emplace(gpu_device, std::make_pair(nullptr, nullptr))
.emplace(device_pair, std::make_pair(nullptr, nullptr))
.first;

// Create a CUDA engine shared by all contexts
@@ -362,17 +392,21 @@
}

RETURN_IF_ERROR(LoadPlan(
mn_itr->second, &eit->second.first, &eit->second.second));
mn_itr->second, dla_core_id, &eit->second.first,
&eit->second.second));

// Validate whether the engine can be shared
bool is_dynamic = false;
for (int idx = 0; idx < eit->second.second->getNbBindings(); idx++) {
auto dims = eit->second.second->getBindingDimensions(idx);

// Detect whether dynamic or not
if (ContainsWildcard(dims)) {
is_dynamic = true;
break;
}
}

// Model with dynamic shapes can't share engine, set engine to
// 'nullptr' as hint, but keeping runtime as it can be used repeatedly
if (is_dynamic) {
@@ -386,7 +420,8 @@
LOG_INFO << "Creating instance " << instance_name << " on GPU "
<< gpu_device << " (" << cc << ") using " << cc_model_filename;
RETURN_IF_ERROR(CreateExecutionContext(
instance_name, gpu_device, mn_itr->second, group.profile(), queue));
instance_name, gpu_device, dla_core_id, mn_itr->second,
group.profile(), queue));
}
}
}
@@ -551,7 +586,7 @@ PlanBackend::Context::InitOptimizationProfiles(
Status
PlanBackend::CreateExecutionContext(
const std::string& instance_name, const int gpu_device,
const std::vector<char>& model,
const int64_t dla_core_id, const std::vector<char>& model,
const ::google::protobuf::RepeatedPtrField<std::string>& profile_names,
const std::shared_ptr<triton::common::SyncQueue<size_t>>& context_queue)
{
@@ -611,10 +646,12 @@ PlanBackend::CreateExecutionContext(
RETURN_IF_ERROR(
context->InitEventSet(Config().optimization().cuda().busy_wait_events()));

auto eit = device_engines_.find(gpu_device);
auto device_pair = std::make_pair(gpu_device, dla_core_id);
auto eit = device_engines_.find(device_pair);
if (eit->second.second == nullptr) {
context->is_shared_engine_ = false;
RETURN_IF_ERROR(LoadPlan(model, &eit->second.first, &context->engine_));
RETURN_IF_ERROR(
LoadPlan(model, dla_core_id, &eit->second.first, &context->engine_));
} else {
context->engine_ = eit->second.second;
}
@@ -2213,7 +2250,7 @@ PlanBackend::~PlanBackend()
contexts_.clear();

for (auto& device_engine : device_engines_) {
cudaSetDevice(device_engine.first);
cudaSetDevice(device_engine.first.first);
auto& runtime = device_engine.second.first;
auto& engine = device_engine.second.second;
if (engine != nullptr) {
12 changes: 8 additions & 4 deletions src/backends/tensorrt/plan_backend.h
@@ -28,10 +28,10 @@
#include <NvInfer.h>
#include <cuda_runtime_api.h>
#include <thread>
#include "model_config.pb.h"
#include "src/core/backend.h"
#include "src/core/backend_context.h"
#include "src/core/metric_model_reporter.h"
#include "model_config.pb.h"
#include "src/core/scheduler.h"
#include "src/core/status.h"
#include "triton/common/sync_queue.h"
@@ -60,7 +60,7 @@ class PlanBackend : public InferenceBackend {
const std::unordered_map<std::string, std::vector<char>>& models);
Status CreateExecutionContext(
const std::string& instance_name, const int gpu_device,
const std::vector<char>& models,
const int64_t dla_core_id, const std::vector<char>& models,
const ::google::protobuf::RepeatedPtrField<std::string>& profile_names,
const std::shared_ptr<triton::common::SyncQueue<size_t>>& context_queue);

@@ -430,8 +430,12 @@ class PlanBackend : public InferenceBackend {
bool eager_batching_;
};

// CUDA engine shared across all model instances on the same device.
std::map<int, std::pair<nvinfer1::IRuntime*, nvinfer1::ICudaEngine*>>
// CUDA engine shared across all model instances using the same (or no) DLA
// core on the same GPU. The first element in the key pair is the GPU ID, the
// second is the DLA core ID.
std::map<
std::pair<int, int64_t>,
std::pair<nvinfer1::IRuntime*, nvinfer1::ICudaEngine*>>
device_engines_;

// vector for storing available context queue associated with a runner
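
To make the new key concrete, here is an illustrative fragment (not part of the commit) showing why the composite (GPU ID, DLA core ID) key matters: the same GPU with and without a DLA core maps to two distinct entries, so a DLA-bound engine is never shared with a GPU-only instance.

    // Illustrative only; a placeholder int stands in for the runtime/engine pair.
    #include <cstdint>
    #include <map>
    #include <utility>

    int main()
    {
      std::map<std::pair<int, int64_t>, int> device_engines;
      device_engines.emplace(std::make_pair(0, int64_t{-1}), 0);  // GPU 0, no DLA
      device_engines.emplace(std::make_pair(0, int64_t{0}), 1);   // GPU 0, DLA core 0
      // Two distinct entries: the DLA instance gets its own runtime/engine.
      return device_engines.size() == 2 ? 0 : 1;
    }
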
2 changes: 1 addition & 1 deletion src/backends/tensorrt/plan_backend_factory.cc
@@ -31,11 +31,11 @@
#include <vector>

#include <NvInferPlugin.h>
#include "model_config.pb.h"
#include "src/backends/tensorrt/logging.h"
#include "src/core/constants.h"
#include "src/core/filesystem.h"
#include "src/core/logging.h"
#include "model_config.pb.h"
#include "src/core/model_config_utils.h"

namespace nvidia { namespace inferenceserver {
3 changes: 2 additions & 1 deletion src/core/model_config_utils.cc
@@ -1544,7 +1544,8 @@ ValidateModelConfigInt64()
"ModelConfig::model_warmup::inputs::value::dims",
"ModelConfig::optimization::cuda::graph_spec::input::value::dim",
"ModelConfig::optimization::cuda::graph_spec::graph_lower_bound::input::"
"value::dim"};
"value::dim",
"ModelConfig::instance_group::secondary_devices::device_id"};

if (int64_fields != expected) {
return Status(
