Add support for DLA in TensorRT backend (triton-inference-server#2912)
* Add support for DLA to TRT

* Add checks and return an error if DLA is not supported
  - Or if the secondary device ID is not valid

* Add L0_trt_dla

* Update device_id in ValidateModelConfigInt64

* Clean up to use a single device_engines_ map for TRT
  - Use -1 for the DLA core when DLA is not enabled, so earlier behavior is unaffected

* Additional cleanup and fixes

* Pick up the DLA resnet50 model from trt_dla_model_store

* Don't modify the DLA core if a runtime is provided
CoderHam authored May 26, 2021
1 parent 6438fce commit 1f333c6
Showing 8 changed files with 184 additions and 25 deletions.
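
For reference, a model configuration that exercises the new secondary device support might look like the sketch below. The instance_group, secondary_devices, KIND_NVDLA, and device_id field names come from the diff in this commit; the model name, GPU ID, and DLA core ID values are illustrative only.

    # config.pbtxt sketch (illustrative): one TensorRT instance on GPU 0
    # whose runtime is bound to DLA core 0. The PLAN itself must have been
    # built for DLA.
    name: "resnet50_plan"
    platform: "tensorrt_plan"
    instance_group [
      {
        count: 1
        kind: KIND_GPU
        gpus: [ 0 ]
        secondary_devices [
          {
            kind: KIND_NVDLA
            device_id: 0
          }
        ]
      }
    ]
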
102 changes: 102 additions & 0 deletions qa/L0_trt_dla/test.sh
@@ -0,0 +1,102 @@
#!/bin/bash
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

REPO_VERSION=${NVIDIA_TRITON_SERVER_VERSION}
if [ "$#" -ge 1 ]; then
REPO_VERSION=$1
fi
if [ -z "$REPO_VERSION" ]; then
echo -e "Repository version must be specified"
echo -e "\n***\n*** Test Failed\n***"
exit 1
fi

# Need to run on only one device since we are only creating a single
# PLAN. Without this the test will fail on a heterogeneous system.
export CUDA_VISIBLE_DEVICES=0

IMAGE_CLIENT=../clients/image_client
IMAGE=../images/vulture.jpeg

CAFFE2PLAN=../common/caffe2plan

DATADIR=${DATADIR:="/data/inferenceserver/${REPO_VERSION}"}
OPTDIR=${OPTDIR:="/opt"}
SERVER=${OPTDIR}/tritonserver/bin/tritonserver
BACKEND_DIR=${OPTDIR}/tritonserver/backends

SERVER_ARGS="--model-repository=`pwd`/models --exit-timeout-secs=120 --backend-directory=${BACKEND_DIR}"
SERVER_LOG="./inference_server.log"
source ../common/util.sh

rm -fr models && mkdir models
cp -r $DATADIR/trt_dla_model_store/resnet50_plan models/.
rm -f *.log

set +e

run_server
if [ "$SERVER_PID" == "0" ]; then
echo -e "\n***\n*** Failed to start $SERVER\n***"
cat $SERVER_LOG
exit 1
fi

RET=0

set +e

CLIENT_LOG=${IMAGE_CLIENT##*/}.log

echo "Model: resnet50_plan" >> $CLIENT_LOG
$IMAGE_CLIENT $EXTRA_ARGS -m resnet50_plan -s VGG -c 1 -b 1 $IMAGE >> $CLIENT_LOG 2>&1
if [ $? -ne 0 ]; then
cat $CLIENT_LOG
RET=1
fi

if [ `grep -c VULTURE $CLIENT_LOG` != "1" ]; then
echo -e "\n***\n*** Failed. Expected 1 VULTURE results\n***"
RET=1
fi

set -e

kill $SERVER_PID
wait $SERVER_PID

rm -rf models

if [ $RET -eq 0 ]; then
echo -e "\n***\n*** Test Passed\n***"
else
echo -e "\n***\n*** Test FAILED\n***"
fi

exit $RET
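
The test above expects a prebuilt DLA-capable resnet50 PLAN in trt_dla_model_store; the commit does not show how that PLAN is produced. As an assumption (not part of this change), such an engine can be built offline with trtexec, roughly as in the sketch below; the model file name and output path are placeholders.

    # Hypothetical sketch: build a DLA-enabled PLAN offline. DLA requires
    # reduced precision (hence --fp16), and --allowGPUFallback lets layers
    # that DLA cannot run fall back to the GPU.
    trtexec --onnx=resnet50.onnx \
            --useDLACore=0 \
            --fp16 \
            --allowGPUFallback \
            --saveEngine=models/resnet50_plan/1/model.plan
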
2 changes: 1 addition & 1 deletion src/backends/tensorrt/autofill.cc
@@ -471,7 +471,7 @@ AutoFillPlan::Create(
}
std::vector<char> plan_data(plan_data_str.begin(), plan_data_str.end());

if (!LoadPlan(plan_data, &runtime, &engine).IsOk()) {
if (!LoadPlan(plan_data, -1 /* dla_core_id */, &runtime, &engine).IsOk()) {
if (engine != nullptr) {
engine->destroy();
engine = nullptr;
17 changes: 15 additions & 2 deletions src/backends/tensorrt/loader.cc
@@ -35,8 +35,8 @@ namespace nvidia { namespace inferenceserver {

Status
LoadPlan(
const std::vector<char>& model_data, nvinfer1::IRuntime** runtime,
nvinfer1::ICudaEngine** engine)
const std::vector<char>& model_data, int64_t dla_core_id,
nvinfer1::IRuntime** runtime, nvinfer1::ICudaEngine** engine)
{
// Create runtime only if it is not provided
if (*runtime == nullptr) {
@@ -45,6 +45,19 @@ LoadPlan(
return Status(
Status::Code::INTERNAL, "unable to create TensorRT runtime");
}

// Report error if 'dla_core_id' >= number of DLA cores
if (dla_core_id != -1) {
if (dla_core_id < (*runtime)->getNbDLACores()) {
(*runtime)->setDLACore(dla_core_id);
} else {
return Status(
Status::Code::INVALID_ARG,
("unable to create TensorRT runtime with DLA Core ID: " +
std::to_string(dla_core_id))
.c_str());
}
}
}

*engine =
12 changes: 7 additions & 5 deletions src/backends/tensorrt/loader.h
@@ -36,14 +36,16 @@ namespace nvidia { namespace inferenceserver {
/// responsibility to destroy any returned runtime or engine object
/// even if an error is returned.
///
/// \param model_data The binary blob of the plan data
/// \param model_data The binary blob of the plan data.
/// \param dla_core_id The DLA core to use for this runtime. Does not
/// use DLA when set to -1.
/// \param runtime Returns the IRuntime object, or nullptr if failed
/// to create
/// to create.
/// \param engine Returns the ICudaEngine object, or nullptr if failed
/// to create
/// to create.
/// \return Error status.
Status LoadPlan(
const std::vector<char>& model_data, nvinfer1::IRuntime** runtime,
nvinfer1::ICudaEngine** engine);
const std::vector<char>& model_data, int64_t dla_core_id,
nvinfer1::IRuntime** runtime, nvinfer1::ICudaEngine** engine);

}} // namespace nvidia::inferenceserver
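
A minimal sketch of calling the updated LoadPlan() with the signature above: passing -1 keeps the previous no-DLA behavior, while a valid core ID binds the runtime to that DLA core before the engine is deserialized. The helper name LoadOnDevice and the cleanup shown are illustrative, following the header comment that the caller must destroy any returned objects even on error.

    // Illustrative sketch, not part of the commit.
    #include <vector>

    #include "src/backends/tensorrt/loader.h"

    namespace ni = nvidia::inferenceserver;

    ni::Status
    LoadOnDevice(const std::vector<char>& plan_bytes, int64_t dla_core_id)
    {
      nvinfer1::IRuntime* runtime = nullptr;   // created by LoadPlan when null
      nvinfer1::ICudaEngine* engine = nullptr;
      // dla_core_id == -1 means no DLA; otherwise the runtime is bound to that
      // core, and LoadPlan returns INVALID_ARG if the core does not exist.
      ni::Status status =
          ni::LoadPlan(plan_bytes, dla_core_id, &runtime, &engine);
      if (!status.IsOk()) {
        // Per loader.h, the caller owns runtime/engine even when an error is
        // returned.
        if (engine != nullptr) {
          engine->destroy();
        }
        if (runtime != nullptr) {
          runtime->destroy();
        }
      }
      return status;
    }
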
59 changes: 48 additions & 11 deletions src/backends/tensorrt/plan_backend.cc
@@ -290,6 +290,28 @@ PlanBackend::CreateExecutionContexts(
" must be KIND_GPU and must specify at least one GPU id");
}

// Use DLA core id or GPU id from config based on instance group type
int64_t dla_core_id = -1;
uint32_t secondary_device_count = group.secondary_devices().size();
if (secondary_device_count != 0) {
if (secondary_device_count != 1) {
return Status(
Status::Code::INVALID_ARG,
group.name() + " of model " + Name() +
" must have either zero or or one secondary devices");
}

auto secondary_device = group.secondary_devices().at(0);
if (secondary_device.kind() !=
inference::ModelInstanceGroup::SecondaryDevice::KIND_NVDLA) {
return Status(
Status::Code::INVALID_ARG, "secondary device " + group.name() +
" of model " + Name() +
" must be KIND_NVDLA");
}
dla_core_id = secondary_device.device_id();
}

for (int c = 0; c < group.count(); c++) {
for (int gpu_device : group.gpus()) {
size_t runner_idx = 0;
@@ -312,13 +334,20 @@
}
runner_idx = it->second;
}

// The last entry in contexts_ is the newly created context
auto& queue = available_context_queue_[runner_idx];
queue->Put(contexts_.size());

const std::string instance_name = group.name() + "_" +
std::to_string(c) + "_gpu" +
std::to_string(gpu_device);
std::string instance_name;
if (dla_core_id != -1) {
instance_name = group.name() + "_" + std::to_string(c) + "_gpu" +
std::to_string(gpu_device) + "_dla" +
std::to_string(dla_core_id);
} else {
instance_name = group.name() + "_" + std::to_string(c) + "_gpu" +
std::to_string(gpu_device);
}

// Determine the model file to use for device compute capability
cudaDeviceProp cuprops;
@@ -347,10 +376,11 @@
}

// Create shared engine for the device if haven't tried so.
auto eit = device_engines_.find(gpu_device);
auto device_pair = std::make_pair(gpu_device, dla_core_id);
auto eit = device_engines_.find(device_pair);
if (eit == device_engines_.end()) {
eit = device_engines_
.emplace(gpu_device, std::make_pair(nullptr, nullptr))
.emplace(device_pair, std::make_pair(nullptr, nullptr))
.first;

// Create a CUDA engine shared by all contexts
@@ -362,17 +392,21 @@
}

RETURN_IF_ERROR(LoadPlan(
mn_itr->second, &eit->second.first, &eit->second.second));
mn_itr->second, dla_core_id, &eit->second.first,
&eit->second.second));

// Validate whether the engine can be shared
bool is_dynamic = false;
for (int idx = 0; idx < eit->second.second->getNbBindings(); idx++) {
auto dims = eit->second.second->getBindingDimensions(idx);

// Detect whether dynamic or not
if (ContainsWildcard(dims)) {
is_dynamic = true;
break;
}
}

// Model with dynamic shapes can't share engine, set engine to
// 'nullptr' as hint, but keeping runtime as it can be used repeatedly
if (is_dynamic) {
@@ -386,7 +420,8 @@
LOG_INFO << "Creating instance " << instance_name << " on GPU "
<< gpu_device << " (" << cc << ") using " << cc_model_filename;
RETURN_IF_ERROR(CreateExecutionContext(
instance_name, gpu_device, mn_itr->second, group.profile(), queue));
instance_name, gpu_device, dla_core_id, mn_itr->second,
group.profile(), queue));
}
}
}
@@ -551,7 +586,7 @@ PlanBackend::Context::InitOptimizationProfiles(
Status
PlanBackend::CreateExecutionContext(
const std::string& instance_name, const int gpu_device,
const std::vector<char>& model,
const int64_t dla_core_id, const std::vector<char>& model,
const ::google::protobuf::RepeatedPtrField<std::string>& profile_names,
const std::shared_ptr<triton::common::SyncQueue<size_t>>& context_queue)
{
@@ -611,10 +646,12 @@ PlanBackend::CreateExecutionContext(
RETURN_IF_ERROR(
context->InitEventSet(Config().optimization().cuda().busy_wait_events()));

auto eit = device_engines_.find(gpu_device);
auto device_pair = std::make_pair(gpu_device, dla_core_id);
auto eit = device_engines_.find(device_pair);
if (eit->second.second == nullptr) {
context->is_shared_engine_ = false;
RETURN_IF_ERROR(LoadPlan(model, &eit->second.first, &context->engine_));
RETURN_IF_ERROR(
LoadPlan(model, dla_core_id, &eit->second.first, &context->engine_));
} else {
context->engine_ = eit->second.second;
}
@@ -2213,7 +2250,7 @@ PlanBackend::~PlanBackend()
contexts_.clear();

for (auto& device_engine : device_engines_) {
cudaSetDevice(device_engine.first);
cudaSetDevice(device_engine.first.first);
auto& runtime = device_engine.second.first;
auto& engine = device_engine.second.second;
if (engine != nullptr) {
12 changes: 8 additions & 4 deletions src/backends/tensorrt/plan_backend.h
@@ -28,10 +28,10 @@
#include <NvInfer.h>
#include <cuda_runtime_api.h>
#include <thread>
#include "model_config.pb.h"
#include "src/core/backend.h"
#include "src/core/backend_context.h"
#include "src/core/metric_model_reporter.h"
#include "model_config.pb.h"
#include "src/core/scheduler.h"
#include "src/core/status.h"
#include "triton/common/sync_queue.h"
@@ -60,7 +60,7 @@ class PlanBackend : public InferenceBackend {
const std::unordered_map<std::string, std::vector<char>>& models);
Status CreateExecutionContext(
const std::string& instance_name, const int gpu_device,
const std::vector<char>& models,
const int64_t dla_core_id, const std::vector<char>& models,
const ::google::protobuf::RepeatedPtrField<std::string>& profile_names,
const std::shared_ptr<triton::common::SyncQueue<size_t>>& context_queue);

@@ -430,8 +430,12 @@ class PlanBackend : public InferenceBackend {
bool eager_batching_;
};

// CUDA engine shared across all model instances on the same device.
std::map<int, std::pair<nvinfer1::IRuntime*, nvinfer1::ICudaEngine*>>
// CUDA engine shared across all model instances using the same (or no) DLA
// core on the same GPU. The first element in the key pair is the GPU ID, the
// second is the DLA core ID.
std::map<
std::pair<int, int64_t>,
std::pair<nvinfer1::IRuntime*, nvinfer1::ICudaEngine*>>
device_engines_;

// vector for storing available context queue associated with a runner
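
To make the new key concrete, here is an illustrative fragment (not part of the commit) showing why the composite (GPU ID, DLA core ID) key matters: the same GPU with and without a DLA core maps to two distinct entries, so a DLA-bound engine is never shared with a GPU-only instance.

    // Illustrative only; a placeholder int stands in for the runtime/engine pair.
    #include <cstdint>
    #include <map>
    #include <utility>

    int main()
    {
      std::map<std::pair<int, int64_t>, int> device_engines;
      device_engines.emplace(std::make_pair(0, int64_t{-1}), 0);  // GPU 0, no DLA
      device_engines.emplace(std::make_pair(0, int64_t{0}), 1);   // GPU 0, DLA core 0
      // Two distinct entries: the DLA instance gets its own runtime/engine.
      return device_engines.size() == 2 ? 0 : 1;
    }
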
2 changes: 1 addition & 1 deletion src/backends/tensorrt/plan_backend_factory.cc
@@ -31,11 +31,11 @@
#include <vector>

#include <NvInferPlugin.h>
#include "model_config.pb.h"
#include "src/backends/tensorrt/logging.h"
#include "src/core/constants.h"
#include "src/core/filesystem.h"
#include "src/core/logging.h"
#include "model_config.pb.h"
#include "src/core/model_config_utils.h"

namespace nvidia { namespace inferenceserver {
3 changes: 2 additions & 1 deletion src/core/model_config_utils.cc
@@ -1544,7 +1544,8 @@ ValidateModelConfigInt64()
"ModelConfig::model_warmup::inputs::value::dims",
"ModelConfig::optimization::cuda::graph_spec::input::value::dim",
"ModelConfig::optimization::cuda::graph_spec::graph_lower_bound::input::"
"value::dim"};
"value::dim",
"ModelConfig::instance_group::secondary_devices::device_id"};

if (int64_fields != expected) {
return Status(
