Add multithreading test and put a lock on nvinfer1::createInferRuntime() for TRT EP #10714
Changes from all commits
@@ -250,6 +250,11 @@ TensorrtLogger& GetTensorrtLogger() {
   return trt_logger;
 }

+std::unique_lock<OrtMutex> TensorrtExecutionProvider::GetApiLock() const {
+  static OrtMutex singleton;
+  return std::unique_lock<OrtMutex>(singleton);
+}
+
 TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProviderInfo& info)
     : IExecutionProvider{onnxruntime::kTensorrtExecutionProvider, true}, info_(info), device_id_(info.device_id) {
   CUDA_CALL_THROW(cudaSetDevice(device_id_));
@@ -395,7 +400,10 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
         throw std::runtime_error("Failed to create directory " + cache_path_);
       }
     }
-    runtime_ = tensorrt_ptr::unique_pointer<nvinfer1::IRuntime>(nvinfer1::createInferRuntime(GetTensorrtLogger()));
+    {
+      auto lock = GetApiLock();
+      runtime_ = tensorrt_ptr::unique_pointer<nvinfer1::IRuntime>(nvinfer1::createInferRuntime(GetTensorrtLogger()));
+    }
   }

   if (engine_decryption_enable_) {
@@ -1000,13 +1008,6 @@ TensorrtExecutionProvider::GetCapability(const GraphViewer& graph,
   return result;
 }

-std::unique_lock<OrtMutex> TensorrtExecutionProvider::GetEngineBuildLock() const {
-  static OrtMutex singleton;
-
-  // Acquire a lock only when force_sequential_engine_build_ is true;
-  return force_sequential_engine_build_ ? std::unique_lock<OrtMutex>(singleton) : std::unique_lock<OrtMutex>();
-}
-
 common::Status TensorrtExecutionProvider::Compile(const std::vector<Node*>& fused_nodes,
                                                   std::vector<NodeComputeInfo>& node_compute_funcs) {
   for (const auto* fused_node : fused_nodes) {
@@ -1196,7 +1197,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<Node*>& fuse

       // Build engine
       {
-        auto lock = GetEngineBuildLock();
+        auto lock = GetApiLock();
         trt_engine = tensorrt_ptr::unique_pointer<nvinfer1::ICudaEngine>(trt_builder->buildEngineWithConfig(*trt_network, *trt_config));
       }
       if (trt_engine == nullptr) {
@@ -1537,7 +1538,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<Node*>& fuse

         // Build engine
         {
-          auto lock = GetEngineBuildLock();
+          auto lock = GetApiLock();
           *(trt_state->engine) = tensorrt_ptr::unique_pointer<nvinfer1::ICudaEngine>(
               trt_builder->buildEngineWithConfig(*trt_state->network->get(), *trt_config));
         }

Review comments on this hunk:
- On the removed auto lock = GetEngineBuildLock(); line: "Is the lock supposed to be removed here?" Reply: "Thanks for the reminder and the explanation. Yes, we shouldn't remove this lock here."
- On the new auto lock = GetApiLock(); line: "I was wondering, do we need a lock here, since the whole inference has already been protected by a lock?"
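For reference, here is a minimal standalone sketch of the locking pattern GetApiLock() introduces, written against plain std::mutex rather than ORT's OrtMutex; the names SerializedApiCall and CreateRuntimeExample are hypothetical stand-ins for illustration, not code from this PR.

    #include <mutex>

    // Return an RAII guard over a single function-local static mutex.
    // Every caller holding the returned lock is serialized against all
    // other callers, across every provider instance in the process.
    std::unique_lock<std::mutex> SerializedApiCall() {
      static std::mutex singleton;                     // one mutex shared by all callers
      return std::unique_lock<std::mutex>(singleton);  // locked here; released when the guard is destroyed
    }

    void CreateRuntimeExample() {
      // Scope the guard so the mutex is released as soon as the
      // non-thread-safe call (e.g. createInferRuntime or buildEngineWithConfig)
      // returns, instead of being held for the rest of the function.
      {
        auto lock = SerializedApiCall();
        // ... call the API that must not run concurrently ...
      }
      // work that does not need serialization continues without the lock
    }

Because the mutex is a function-local static, the same lock is shared by every session in the process, which is the point raised in the review discussion at the end of this page.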
@@ -10,6 +10,7 @@
 #include "core/providers/tensorrt/tensorrt_provider_options.h"
 #include "core/providers/tensorrt/tensorrt_execution_provider_utils.h"
 #include <string>
+#include <thread>

 using namespace std;
 using namespace ONNX_NAMESPACE;
@@ -87,6 +88,190 @@ void CreateBaseModel(std::string model_name, std::string graph_name, std::vector
   status = onnxruntime::Model::Save(model, model_name);
 }

+void RunSession(InferenceSession& session_object,
+                RunOptions& run_options,
+                NameMLValMap& feeds,
+                std::vector<std::string> output_names,
+                std::vector<int64_t> expected_dims,
+                std::vector<float> expected_values) {
+  std::vector<OrtValue> fetches;
+  auto status = session_object.Run(run_options, feeds, output_names, &fetches);
+  ASSERT_TRUE(status.IsOK());
+  VerifyOutputs(fetches, expected_dims, expected_values);
+}
+

Review comment on RunSession: "We might consider migrating these to inference_session_test in the future?"

+void RunWithOneSessionSingleThreadInference(std::string model_name, std::string sess_log_id) {
+  SessionOptions so;
+  so.session_logid = sess_log_id;
+  RunOptions run_options;
+  run_options.run_tag = so.session_logid;
+  InferenceSession session_object{so, GetEnvironment()};
+  auto allocator_manager = session_object.GetAllocatorManager();
+  auto cuda_provider = DefaultCudaExecutionProvider();
+  cuda_provider->RegisterAllocator(allocator_manager);
+  auto cpu_allocator = cuda_provider->GetAllocator(0, OrtMemTypeCPU);
+  std::vector<int64_t> dims_mul_x = {1, 3, 2};
+  std::vector<float> values_mul_x = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+  OrtValue ml_value_x;
+  CreateMLValue<float>(cpu_allocator, dims_mul_x, values_mul_x, &ml_value_x);
+  OrtValue ml_value_y;
+  CreateMLValue<float>(cpu_allocator, dims_mul_x, values_mul_x, &ml_value_y);
+  OrtValue ml_value_z;
+  CreateMLValue<float>(cpu_allocator, dims_mul_x, values_mul_x, &ml_value_z);
+  NameMLValMap feeds;
+  feeds.insert(std::make_pair("X", ml_value_x));
+  feeds.insert(std::make_pair("Y", ml_value_y));
+  feeds.insert(std::make_pair("Z", ml_value_z));
+
+  // prepare outputs
+  std::vector<std::string> output_names;
+  output_names.push_back("M");
+
+  // prepare expected inputs and outputs
+  std::vector<int64_t> expected_dims_mul_m = {1, 3, 2};
+  std::vector<float> expected_values_mul_m = {3.0f, 6.0f, 9.0f, 12.0f, 15.0f, 18.0f};
+
+  OrtTensorRTProviderOptionsV2 params{
+      0,
+      0,
+      nullptr,
+      1000,
+      1,
+      1 << 30,
+      0,
+      0,
+      nullptr,
+      0,
+      0,
+      0,
+      0,
+      0,
+      nullptr,
+      0,
+      nullptr,
+      0};
+
+  params.trt_engine_cache_enable = 1;
+  std::unique_ptr<IExecutionProvider> execution_provider = TensorrtExecutionProviderWithOptions(&params);
+  EXPECT_TRUE(session_object.RegisterExecutionProvider(std::move(execution_provider)).IsOK());
+  auto status = session_object.Load(model_name);
+  ASSERT_TRUE(status.IsOK());
+  status = session_object.Initialize();
+  ASSERT_TRUE(status.IsOK());
+
+  // run inference
+  // TRT engine will be created and cached
+  // TRT profile will be created and cached only for dynamic input shape
+  // Data in profile:
+  // X: 1, 3, 3, 2, 2, 2
+  // Y: 1, 3, 3, 2, 2, 2
+  // Z: 1, 3, 3, 2, 2, 2
+  RunSession(session_object, run_options, feeds, output_names, expected_dims_mul_m, expected_values_mul_m);
+}
+
+void RunWithOneSessionMultiThreadsInference(std::string model_name, std::string sess_log_id) {
+  SessionOptions so;
+  so.session_logid = sess_log_id;
+  RunOptions run_options;
+  run_options.run_tag = so.session_logid;
+  InferenceSession session_object{so, GetEnvironment()};
+  auto allocator_manager = session_object.GetAllocatorManager();
+  auto cuda_provider = DefaultCudaExecutionProvider();
+  cuda_provider->RegisterAllocator(allocator_manager);
+  auto cpu_allocator = cuda_provider->GetAllocator(0, OrtMemTypeCPU);
+  std::vector<int64_t> dims_mul_x = {1, 3, 2};
+  std::vector<float> values_mul_x = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+  OrtValue ml_value_x;
+  CreateMLValue<float>(cpu_allocator, dims_mul_x, values_mul_x, &ml_value_x);
+  OrtValue ml_value_y;
+  CreateMLValue<float>(cpu_allocator, dims_mul_x, values_mul_x, &ml_value_y);
+  OrtValue ml_value_z;
+  CreateMLValue<float>(cpu_allocator, dims_mul_x, values_mul_x, &ml_value_z);
+  NameMLValMap feeds;
+  feeds.insert(std::make_pair("X", ml_value_x));
+  feeds.insert(std::make_pair("Y", ml_value_y));
+  feeds.insert(std::make_pair("Z", ml_value_z));
+
+  // prepare outputs
+  std::vector<std::string> output_names;
+  output_names.push_back("M");
+
+  // prepare expected inputs and outputs
+  std::vector<int64_t> expected_dims_mul_m = {1, 3, 2};
+  std::vector<float> expected_values_mul_m = {3.0f, 6.0f, 9.0f, 12.0f, 15.0f, 18.0f};
+
+  OrtTensorRTProviderOptionsV2 params{
+      0,
+      0,
+      nullptr,
+      1000,
+      1,
+      1 << 30,
+      0,
+      0,
+      nullptr,
+      0,
+      0,
+      0,
+      0,
+      0,
+      nullptr,
+      0,
+      nullptr,
+      0};
+
+  params.trt_engine_cache_enable = 1;
+  std::unique_ptr<IExecutionProvider> execution_provider = TensorrtExecutionProviderWithOptions(&params);
+  EXPECT_TRUE(session_object.RegisterExecutionProvider(std::move(execution_provider)).IsOK());
+  auto status = session_object.Load(model_name);
+  ASSERT_TRUE(status.IsOK());
+  status = session_object.Initialize();
+  ASSERT_TRUE(status.IsOK());
+
+  // run inference with multiple threads
+  // TRT engine will be created and cached
+  // TRT profile will be created and cached only for dynamic input shape
+  // Data in profile:
+  // X: 1, 3, 3, 2, 2, 2
+  // Y: 1, 3, 3, 2, 2, 2
+  // Z: 1, 3, 3, 2, 2, 2
+
+  std::vector<std::thread> threads;
+  int num_thread = 5;
+  for (int i = 0; i < num_thread; ++i)
+    threads.push_back(std::thread(RunSession, std::ref(session_object), std::ref(run_options), std::ref(feeds), std::ref(output_names), std::ref(expected_dims_mul_m), std::ref(expected_values_mul_m)));
+
+  for (auto& th : threads)
+    th.join();
+}
+
+TEST(TensorrtExecutionProviderTest, MultiThreadsTestWithOneSessionSingleThreadInference) {
+  std::vector<std::thread> threads;
+  std::string model_name = "trt_execution_provider_multithreading_test.onnx";
+  std::string graph_name = "multithreading_test";
+  std::string sess_log_id = "TRTEPMultiThreadingTestWithOneSessionSingleThread";
+  std::vector<int> dims = {1, 3, 2};
+  int num_thread = 5;
+
+  CreateBaseModel(model_name, graph_name, dims);
+
+  for (int i = 0; i < num_thread; ++i)
+    threads.push_back(std::thread(RunWithOneSessionSingleThreadInference, model_name, sess_log_id));
+
+  for (auto& th : threads)
+    th.join();
+}
+
+TEST(TensorrtExecutionProviderTest, MultiThreadsTestWithOneSessionMultiThreadsInference) {
+  std::string model_name = "trt_execution_provider_multithreading_test.onnx";
+  std::string graph_name = "multithreading_test";
+  std::string sess_log_id = "TRTEPMultiThreadingTestWithOneSessionMultiThreads";
+  std::vector<int> dims = {1, 3, 2};
+
+  CreateBaseModel(model_name, graph_name, dims);
+  RunWithOneSessionMultiThreadsInference(model_name, sess_log_id);
+}
+
 TEST_P(TensorrtExecutionProviderCacheTest, Run) {
   // GetParam() returns the parameter of following format:
   // ##cache type##_##input shape type##
Review discussion:
- "We need to be careful if we expand usage of this beyond the current few locking calls, since it's a singleton and shared across all sessions."
- "If we keep the lock on all the time, we should remove the TRT option force_sequential_engine_build, which previously provided a way to turn the lock off."
- "Be careful when removing options from structs and APIs, though."
- "For the legacy OrtTensorRTProviderOptions struct, I don't think we can remove this field from the struct and build ORT directly; a user application built against the old struct may fail to access the force_sequential_engine_build field. Could we disable the field instead? For the new OrtTensorRTProviderOptionsV2, I think we can remove the field since it's an opaque struct; we probably need to emit a warning when the user sets force_sequential_engine_build through CreateTensorRTProviderOptions()."
- "The option has become a no-op, so it's not as critical, I think. We should remove it from our documentation, though."
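To illustrate the no-op point above, here is a hypothetical sketch (not code from this PR or from ONNX Runtime) of keeping a legacy struct field for binary compatibility while warning that it no longer has any effect; LegacyTrtOptions and ApplyOptions are made-up names used only for this example.

    #include <iostream>

    // Simplified stand-in for a legacy options struct; only the field under
    // discussion is shown. The field stays so applications compiled against
    // the old struct layout keep working.
    struct LegacyTrtOptions {
      int force_sequential_engine_build;  // deprecated: accepted but ignored
    };

    void ApplyOptions(const LegacyTrtOptions& opts) {
      if (opts.force_sequential_engine_build != 0) {
        // The lock is now always taken, so the flag no longer changes
        // behavior; warn rather than fail or silently behave differently.
        std::cerr << "Warning: force_sequential_engine_build is a no-op; "
                     "engine builds are always serialized.\n";
      }
      // ... apply the remaining options ...
    }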