Commit c0e6229

Add multithreading test and put a lock on nvinfer1::createInferRuntime() for TRT EP (#10714)
* Add multithread unit test and put lock on library call
* update code
* remove debug code
* add comment
* add one session multi-threads inference
* Put lock for build engine all the time
* Update naming and comment
* remove unnecessary lock
* Revert "remove unnecessary lock"

  This reverts commit 9c2317b.
1 parent 93d0f8a commit c0e6229

File tree: 3 files changed, +200 -14 lines


onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc

Lines changed: 11 additions & 10 deletions
@@ -251,6 +251,11 @@ TensorrtLogger& GetTensorrtLogger() {
   return trt_logger;
 }
 
+std::unique_lock<OrtMutex> TensorrtExecutionProvider::GetApiLock() const {
+  static OrtMutex singleton;
+  return std::unique_lock<OrtMutex>(singleton);
+}
+
 TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProviderInfo& info)
     : IExecutionProvider{onnxruntime::kTensorrtExecutionProvider, true}, info_(info), device_id_(info.device_id) {
   CUDA_CALL_THROW(cudaSetDevice(device_id_));

@@ -396,7 +401,10 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
         throw std::runtime_error("Failed to create directory " + cache_path_);
       }
     }
-    runtime_ = tensorrt_ptr::unique_pointer<nvinfer1::IRuntime>(nvinfer1::createInferRuntime(GetTensorrtLogger()));
+    {
+      auto lock = GetApiLock();
+      runtime_ = tensorrt_ptr::unique_pointer<nvinfer1::IRuntime>(nvinfer1::createInferRuntime(GetTensorrtLogger()));
+    }
   }
 
   if (engine_decryption_enable_) {

@@ -1001,13 +1009,6 @@ TensorrtExecutionProvider::GetCapability(const GraphViewer& graph,
   return result;
 }
 
-std::unique_lock<OrtMutex> TensorrtExecutionProvider::GetEngineBuildLock() const {
-  static OrtMutex singleton;
-
-  // Acquire a lock only when force_sequential_engine_build_ is true;
-  return force_sequential_engine_build_ ? std::unique_lock<OrtMutex>(singleton) : std::unique_lock<OrtMutex>();
-}
-
 common::Status TensorrtExecutionProvider::Compile(const std::vector<Node*>& fused_nodes,
                                                   std::vector<NodeComputeInfo>& node_compute_funcs) {
   for (const auto* fused_node : fused_nodes) {

@@ -1197,7 +1198,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<Node*>& fuse
 
     // Build engine
     {
-      auto lock = GetEngineBuildLock();
+      auto lock = GetApiLock();
      trt_engine = tensorrt_ptr::unique_pointer<nvinfer1::ICudaEngine>(trt_builder->buildEngineWithConfig(*trt_network, *trt_config));
     }
     if (trt_engine == nullptr) {

@@ -1538,7 +1539,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<Node*>& fuse
 
         // Build engine
         {
-          auto lock = GetEngineBuildLock();
+          auto lock = GetApiLock();
          *(trt_state->engine) = tensorrt_ptr::unique_pointer<nvinfer1::ICudaEngine>(
              trt_builder->buildEngineWithConfig(*trt_state->network->get(), *trt_config));
        }
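
The new GetApiLock() helper pairs a function-local static OrtMutex with a std::unique_lock returned by value, so the mutex stays locked for exactly as long as the caller keeps the returned object alive. Below is a minimal standalone sketch of the same pattern; it substitutes std::mutex for ORT's OrtMutex and a hypothetical create_runtime() for nvinfer1::createInferRuntime(), both of which are stand-ins for illustration only.

#include <iostream>
#include <mutex>
#include <thread>
#include <vector>

// Stand-in for a TensorRT entry point that is not documented as thread-safe.
int create_runtime() {
  static int next_id = 0;  // deliberately unsynchronized: callers must hold the lock
  return ++next_id;
}

// Same shape as TensorrtExecutionProvider::GetApiLock(): one process-wide mutex
// behind a function-local static, handed out as a scoped RAII lock.
std::unique_lock<std::mutex> GetApiLock() {
  static std::mutex singleton;
  return std::unique_lock<std::mutex>(singleton);
}

int main() {
  std::vector<std::thread> threads;
  for (int i = 0; i < 4; ++i) {
    threads.emplace_back([] {
      auto lock = GetApiLock();  // serializes the non-thread-safe call below
      std::cout << "created runtime #" << create_runtime() << '\n';
    });
  }
  for (auto& t : threads) t.join();
  return 0;
}

Because the unique_lock is returned by value (moved to the caller), the mutex is released automatically when the caller's lock variable goes out of scope, which is why the diff wraps each guarded call in its own small brace block.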

onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h

Lines changed: 4 additions & 4 deletions
@@ -194,10 +194,10 @@ class TensorrtExecutionProvider : public IExecutionProvider {
   void RemoveTensorRTGraphCycles(SubGraphCollection_t& supported_nodes_vector, const GraphViewer& graph) const;
 
   /**
-  Get a unique_lock object to control the concurrency behavior of TensorRT engine building. When force_sequential_engine_build
-  is set to true, the lock object is associated with a mutex shared across all providers to enforce sequential engine build.
-  Otherwise, the constructed unique_lock is not associated with any mutex therefore no locking/unlocking will happen.
+  Get a unique_lock object to control the concurrency behavior.
+  Every api call not in the thread-safe operations (https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading)
+  should be protected by a lock when invoked by multiple threads concurrently.
   */
-  std::unique_lock<OrtMutex> GetEngineBuildLock() const;
+  std::unique_lock<OrtMutex> GetApiLock() const;
 };
 }  // namespace onnxruntime
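
The updated comment states the contract: any TensorRT API call outside the documented thread-safe set must be made under the lock, while thread-safe work can proceed concurrently. A caller-side sketch of that contract follows, again using std::mutex in place of OrtMutex and hypothetical build_engine()/run_inference() placeholders for the real TensorRT calls; the point is how the extra braces bound the critical section so the mutex is released before the lock-free work continues.

#include <chrono>
#include <mutex>
#include <thread>

// Hypothetical stand-ins: build_engine() models a call TensorRT does not document
// as thread-safe; run_inference() models per-thread work that needs no lock.
void build_engine()  { std::this_thread::sleep_for(std::chrono::milliseconds(10)); }
void run_inference() { std::this_thread::sleep_for(std::chrono::milliseconds(1)); }

std::unique_lock<std::mutex> GetApiLock() {
  static std::mutex singleton;
  return std::unique_lock<std::mutex>(singleton);  // ownership moves to the caller
}

void worker() {
  {
    // Critical section mirrors the "// Build engine" blocks in the diff:
    // only the non-thread-safe call sits inside the braces.
    auto lock = GetApiLock();
    build_engine();
  }  // ~unique_lock() releases the mutex here
  run_inference();  // may run concurrently across threads, no lock required
}

int main() {
  std::thread t1(worker), t2(worker);
  t1.join();
  t2.join();
  return 0;
}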

onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc

Lines changed: 185 additions & 0 deletions
@@ -10,6 +10,7 @@
 #include "core/providers/tensorrt/tensorrt_provider_options.h"
 #include "core/providers/tensorrt/tensorrt_execution_provider_utils.h"
 #include <string>
+#include <thread>
 
 using namespace std;
 using namespace ONNX_NAMESPACE;

@@ -87,6 +88,190 @@ void CreateBaseModel(std::string model_name, std::string graph_name, std::vector
   status = onnxruntime::Model::Save(model, model_name);
 }
 
+void RunSession(InferenceSession& session_object,
+                RunOptions& run_options,
+                NameMLValMap& feeds,
+                std::vector<std::string> output_names,
+                std::vector<int64_t> expected_dims,
+                std::vector<float> expected_values) {
+  std::vector<OrtValue> fetches;
+  auto status = session_object.Run(run_options, feeds, output_names, &fetches);
+  ASSERT_TRUE(status.IsOK());
+  VerifyOutputs(fetches, expected_dims, expected_values);
+}
+
+void RunWithOneSessionSingleThreadInference(std::string model_name, std::string sess_log_id) {
+  SessionOptions so;
+  so.session_logid = sess_log_id;
+  RunOptions run_options;
+  run_options.run_tag = so.session_logid;
+  InferenceSession session_object{so, GetEnvironment()};
+  auto allocator_manager = session_object.GetAllocatorManager();
+  auto cuda_provider = DefaultCudaExecutionProvider();
+  cuda_provider->RegisterAllocator(allocator_manager);
+  auto cpu_allocator = cuda_provider->GetAllocator(0, OrtMemTypeCPU);
+  std::vector<int64_t> dims_mul_x = {1, 3, 2};
+  std::vector<float> values_mul_x = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+  OrtValue ml_value_x;
+  CreateMLValue<float>(cpu_allocator, dims_mul_x, values_mul_x, &ml_value_x);
+  OrtValue ml_value_y;
+  CreateMLValue<float>(cpu_allocator, dims_mul_x, values_mul_x, &ml_value_y);
+  OrtValue ml_value_z;
+  CreateMLValue<float>(cpu_allocator, dims_mul_x, values_mul_x, &ml_value_z);
+  NameMLValMap feeds;
+  feeds.insert(std::make_pair("X", ml_value_x));
+  feeds.insert(std::make_pair("Y", ml_value_y));
+  feeds.insert(std::make_pair("Z", ml_value_z));
+
+  // prepare outputs
+  std::vector<std::string> output_names;
+  output_names.push_back("M");
+
+  // prepare expected inputs and outputs
+  std::vector<int64_t> expected_dims_mul_m = {1, 3, 2};
+  std::vector<float> expected_values_mul_m = {3.0f, 6.0f, 9.0f, 12.0f, 15.0f, 18.0f};
+
+  OrtTensorRTProviderOptionsV2 params{
+      0,
+      0,
+      nullptr,
+      1000,
+      1,
+      1 << 30,
+      0,
+      0,
+      nullptr,
+      0,
+      0,
+      0,
+      0,
+      0,
+      nullptr,
+      0,
+      nullptr,
+      0};
+
+  params.trt_engine_cache_enable = 1;
+  std::unique_ptr<IExecutionProvider> execution_provider = TensorrtExecutionProviderWithOptions(&params);
+  EXPECT_TRUE(session_object.RegisterExecutionProvider(std::move(execution_provider)).IsOK());
+  auto status = session_object.Load(model_name);
+  ASSERT_TRUE(status.IsOK());
+  status = session_object.Initialize();
+  ASSERT_TRUE(status.IsOK());
+
+  // run inference
+  // TRT engine will be created and cached
+  // TRT profile will be created and cached only for dynamic input shape
+  // Data in profile,
+  // X: 1, 3, 3, 2, 2, 2
+  // Y: 1, 3, 3, 2, 2, 2
+  // Z: 1, 3, 3, 2, 2, 2
+  RunSession(session_object, run_options, feeds, output_names, expected_dims_mul_m, expected_values_mul_m);
+}
+
+void RunWithOneSessionMultiThreadsInference(std::string model_name, std::string sess_log_id) {
+  SessionOptions so;
+  so.session_logid = sess_log_id;
+  RunOptions run_options;
+  run_options.run_tag = so.session_logid;
+  InferenceSession session_object{so, GetEnvironment()};
+  auto allocator_manager = session_object.GetAllocatorManager();
+  auto cuda_provider = DefaultCudaExecutionProvider();
+  cuda_provider->RegisterAllocator(allocator_manager);
+  auto cpu_allocator = cuda_provider->GetAllocator(0, OrtMemTypeCPU);
+  std::vector<int64_t> dims_mul_x = {1, 3, 2};
+  std::vector<float> values_mul_x = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+  OrtValue ml_value_x;
+  CreateMLValue<float>(cpu_allocator, dims_mul_x, values_mul_x, &ml_value_x);
+  OrtValue ml_value_y;
+  CreateMLValue<float>(cpu_allocator, dims_mul_x, values_mul_x, &ml_value_y);
+  OrtValue ml_value_z;
+  CreateMLValue<float>(cpu_allocator, dims_mul_x, values_mul_x, &ml_value_z);
+  NameMLValMap feeds;
+  feeds.insert(std::make_pair("X", ml_value_x));
+  feeds.insert(std::make_pair("Y", ml_value_y));
+  feeds.insert(std::make_pair("Z", ml_value_z));
+
+  // prepare outputs
+  std::vector<std::string> output_names;
+  output_names.push_back("M");
+
+  // prepare expected inputs and outputs
+  std::vector<int64_t> expected_dims_mul_m = {1, 3, 2};
+  std::vector<float> expected_values_mul_m = {3.0f, 6.0f, 9.0f, 12.0f, 15.0f, 18.0f};
+
+  OrtTensorRTProviderOptionsV2 params{
+      0,
+      0,
+      nullptr,
+      1000,
+      1,
+      1 << 30,
+      0,
+      0,
+      nullptr,
+      0,
+      0,
+      0,
+      0,
+      0,
+      nullptr,
+      0,
+      nullptr,
+      0};
+
+  params.trt_engine_cache_enable = 1;
+  std::unique_ptr<IExecutionProvider> execution_provider = TensorrtExecutionProviderWithOptions(&params);
+  EXPECT_TRUE(session_object.RegisterExecutionProvider(std::move(execution_provider)).IsOK());
+  auto status = session_object.Load(model_name);
+  ASSERT_TRUE(status.IsOK());
+  status = session_object.Initialize();
+  ASSERT_TRUE(status.IsOK());
+
+  // run inference with multi-threads
+  // TRT engine will be created and cached
+  // TRT profile will be created and cached only for dynamic input shape
+  // Data in profile,
+  // X: 1, 3, 3, 2, 2, 2
+  // Y: 1, 3, 3, 2, 2, 2
+  // Z: 1, 3, 3, 2, 2, 2
+
+  std::vector<std::thread> threads;
+  int num_thread = 5;
+  for (int i = 0; i < num_thread; ++i)
+    threads.push_back(std::thread(RunSession, std::ref(session_object), std::ref(run_options), std::ref(feeds), std::ref(output_names), std::ref(expected_dims_mul_m), std::ref(expected_values_mul_m)));
+
+  for (auto& th : threads)
+    th.join();
+}
+
+TEST(TensorrtExecutionProviderTest, MultiThreadsTestWithOneSessionSingleThreadInference) {
+  std::vector<std::thread> threads;
+  std::string model_name = "trt_execution_provider_multithreading_test.onnx";
+  std::string graph_name = "multithreading_test";
+  std::string sess_log_id = "TRTEPMultiThreadingTestWithOneSessionSingleThread";
+  std::vector<int> dims = {1, 3, 2};
+  int num_thread = 5;
+
+  CreateBaseModel(model_name, graph_name, dims);
+
+  for (int i = 0; i < num_thread; ++i)
+    threads.push_back(std::thread(RunWithOneSessionSingleThreadInference, model_name, sess_log_id));
+
+  for (auto& th : threads)
+    th.join();
+}
+
+TEST(TensorrtExecutionProviderTest, MultiThreadsTestWithOneSessionMultiThreadsInference) {
+  std::string model_name = "trt_execution_provider_multithreading_test.onnx";
+  std::string graph_name = "multithreading_test";
+  std::string sess_log_id = "TRTEPMultiThreadingTestWithOneSessionMultiThreads";
+  std::vector<int> dims = {1, 3, 2};
+
+  CreateBaseModel(model_name, graph_name, dims);
+  RunWithOneSessionMultiThreadsInference(model_name, sess_log_id);
+}
+
 TEST_P(TensorrtExecutionProviderCacheTest, Run) {
   // GetParam() returns the parameter of following format:
   // ##cache type##_##input shape type##
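
The two new tests cover two concurrency shapes: several threads that each create and run their own InferenceSession, and several threads that share a single session and call Run() concurrently. The sketch below strips that shared-session harness down to its skeleton, with a hypothetical fake_session_run() standing in for RunSession()/InferenceSession::Run(); it only illustrates the thread launch/join pattern, not ONNX Runtime itself.

#include <atomic>
#include <cassert>
#include <thread>
#include <vector>

// Hypothetical stand-in for RunSession(): runs one "inference" and checks the output.
std::atomic<int> completed_runs{0};

void fake_session_run(int expected_output) {
  assert(expected_output == 42);  // mirrors the VerifyOutputs() check in the test
  ++completed_runs;
}

int main() {
  const int num_thread = 5;  // same thread count the test uses
  std::vector<std::thread> threads;
  threads.reserve(num_thread);
  for (int i = 0; i < num_thread; ++i)
    threads.emplace_back(fake_session_run, 42);  // the real test passes the session, feeds, etc. via std::ref
  for (auto& th : threads)
    th.join();
  assert(completed_runs == num_thread);
  return 0;
}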
