
Commit b6dd22b

Merge pull request #2521 from pytorch/cherry_pick_safe_mode_build_args

cherry-pick: Safe mode and Build Arguments PRs

2 parents fdd6bad + 80743b0

22 files changed: +624, -69 lines

core/runtime/TRTEngine.cpp (+1)

@@ -52,6 +52,7 @@ TRTEngine::TRTEngine(
   auto most_compatible_device = get_most_compatible_device(cuda_device);
   TORCHTRT_CHECK(most_compatible_device, "No compatible device was found for instantiating TensorRT engine");
   device_info = most_compatible_device.value();
+  multi_gpu_device_check();
   set_rt_device(device_info);

   rt = make_trt(nvinfer1::createInferRuntime(util::logging::get_logger()));

core/runtime/execute_engine.cpp (+1, -1)

@@ -74,7 +74,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
     LOG_INFO("" << log_info);
   }

-  {
+  if (MULTI_DEVICE_SAFE_MODE) {
     std::unique_ptr<torch::autograd::profiler::RecordProfile> device_profiler_guard;
     if (compiled_engine->profile_execution) {
       device_profiler_guard =

core/runtime/register_jit_hooks.cpp (+4)

@@ -114,6 +114,10 @@ TORCH_LIBRARY(tensorrt, m) {
   m.def("execute_engine", execute_engine);
   m.def("SERIALIZED_ENGINE_BINDING_DELIM", []() -> std::string { return std::string(1, TRTEngine::BINDING_DELIM); });
   m.def("ABI_VERSION", []() -> std::string { return ABI_VERSION; });
+  m.def("get_multi_device_safe_mode", []() -> bool { return MULTI_DEVICE_SAFE_MODE; });
+  m.def("set_multi_device_safe_mode", [](bool multi_device_safe_mode) -> void {
+    MULTI_DEVICE_SAFE_MODE = multi_device_safe_mode;
+  });
 }

 } // namespace
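Because these definitions live under ``TORCH_LIBRARY(tensorrt, m)``, the new getter and setter are exposed as custom ops in the ``torch.ops.tensorrt`` namespace. A minimal sketch of toggling the flag at that level, assuming the Torch-TensorRT runtime library is already loaded (importing ``torch_tensorrt`` does this); the ``torch_tensorrt.runtime.set_multi_device_safe_mode`` wrapper documented below is the intended user-facing route:

.. code-block:: python

    import torch
    import torch_tensorrt  # loads the C++ runtime that registers the "tensorrt" op namespace

    # Read the current flag (defaults to False, per runtime.cpp below)
    prior = torch.ops.tensorrt.get_multi_device_safe_mode()

    # Enable safe mode for a stretch of multi-GPU work, then restore the prior setting
    torch.ops.tensorrt.set_multi_device_safe_mode(True)
    try:
        ...  # run inference across devices
    finally:
        torch.ops.tensorrt.set_multi_device_safe_mode(prior)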

core/runtime/runtime.cpp (+15, -2)

@@ -7,6 +7,8 @@ namespace torch_tensorrt {
 namespace core {
 namespace runtime {

+bool MULTI_DEVICE_SAFE_MODE = false;
+
 c10::optional<RTDevice> get_most_compatible_device(const RTDevice& target_device, const RTDevice& curr_device) {
   LOG_DEBUG("Target Device: " << target_device);
   auto device_options = find_compatible_devices(target_device);
@@ -31,13 +33,13 @@ c10::optional<RTDevice> get_most_compatible_device(const RTDevice& target_device
     if (device.device_name == target_device.device_name) {
       // First priority is selecting a candidate which agrees with the current device ID
       // If such a device is found, we can select it and break out of the loop
-      if (device.id == current_device.id && best_match.id != current_device.id) {
+      if (device.id == current_device.id) {
        best_match = device;
        break;
      }
      // Second priority is selecting a candidate which agrees with the target device ID
      // At deserialization time, the current device and target device may not agree
-      else if (device.id == target_device.id && best_match.id != target_device.id) {
+      else if (device.id == target_device.id) {
        best_match = device;
      }
      // If no such GPU ID is found, select the first available candidate GPU
@@ -103,6 +105,17 @@ RTDevice get_current_device() {
   return RTDevice(device_id, nvinfer1::DeviceType::kGPU);
 }

+void multi_gpu_device_check() {
+  // If multi-device safe mode is disabled and more than 1 device is registered on the machine, warn the user
+  if (!(MULTI_DEVICE_SAFE_MODE) && get_available_device_list().get_devices().size() > 1) {
+    LOG_WARNING(
+        "Detected this engine is being instantiated in a multi-GPU system with "
+        << "multi-device safe mode disabled. For more on the implications of this "
+        << "as well as workarounds, see the linked documentation "
+        << "(https://pytorch.org/TensorRT/user_guide/runtime.html#multi-device-safe-mode)");
+  }
+}
+
 namespace {
 static DeviceList cuda_device_list;
 }

core/runtime/runtime.h (+3)

@@ -16,6 +16,7 @@ namespace runtime {

 using EngineID = int64_t;
 const std::string ABI_VERSION = "4";
+extern bool MULTI_DEVICE_SAFE_MODE;
 typedef enum {
   ABI_TARGET_IDX = 0,
   NAME_IDX,
@@ -33,6 +34,8 @@ std::vector<RTDevice> find_compatible_devices(const RTDevice& target_device);

 std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intrusive_ptr<TRTEngine> compiled_engine);

+void multi_gpu_device_check();
+
 class DeviceList {
   using DeviceMap = std::unordered_map<int, RTDevice>;
   DeviceMap device_list;

docsrc/user_guide/runtime.rst (+34)

@@ -34,3 +34,37 @@ Plugin Library
 In the case you use Torch-TensorRT as a converter to a TensorRT engine and your engine uses plugins provided by Torch-TensorRT, Torch-TensorRT
 ships the library ``libtorchtrt_plugins.so`` which contains the implementation of the TensorRT plugins used by Torch-TensorRT during
 compilation. This library can be ``DL_OPEN`` or ``LD_PRELOAD`` similar to other TensorRT plugin libraries.
+
+Multi Device Safe Mode
+----------------------
+
+Multi-device safe mode is a setting in Torch-TensorRT which allows the user to determine whether
+the runtime checks for device consistency prior to every inference call.
+
+There is a non-negligible, fixed cost per inference call when multi-device safe mode is enabled, which is why
+it is now disabled by default. It can be controlled via the following convenience function, which
+doubles as a context manager.
+
+.. code-block:: python
+
+    # Enables Multi Device Safe Mode
+    torch_tensorrt.runtime.set_multi_device_safe_mode(True)
+
+    # Disables Multi Device Safe Mode [Default Behavior]
+    torch_tensorrt.runtime.set_multi_device_safe_mode(False)
+
+    # Enables Multi Device Safe Mode, then resets the safe mode to its prior setting
+    with torch_tensorrt.runtime.set_multi_device_safe_mode(True):
+        ...
+
+TensorRT requires that each engine be associated with the CUDA context in the active thread from which it is invoked.
+Therefore, if the device changes in the active thread, as may happen when invoking
+engines on multiple GPUs from the same Python process, safe mode causes Torch-TensorRT to display
+an alert and switch GPUs accordingly. If safe mode is not enabled, the engine device and the
+CUDA context device could fall out of agreement, which could cause the program to crash.
+
+One technique for managing multiple TRT engines on different GPUs without sacrificing performance to
+multi-device safe mode is to use Python threads. Each thread is responsible for all of the TRT engines
+on a single GPU, and the default CUDA device on each thread corresponds to the GPU for which it is
+responsible (this can be set via ``torch.cuda.set_device(...)``). In this way, multiple threads can be used in the same
+Python script without needing to switch CUDA contexts and incur performance overhead.
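A sketch of the thread-per-GPU pattern described in that last paragraph; the engine and input names (``trt_module_0``, ``inputs_0``, ...) are placeholders for modules and data already prepared for each GPU:

.. code-block:: python

    import threading

    import torch

    def worker(device_id, trt_module, inputs):
        # Pin this thread's default CUDA device to the GPU that owns the engine
        torch.cuda.set_device(device_id)
        trt_module(*[t.to(f"cuda:{device_id}") for t in inputs])

    # One thread per GPU, each responsible for all TRT engines on its device
    threads = [
        threading.Thread(target=worker, args=(0, trt_module_0, inputs_0)),
        threading.Thread(target=worker, args=(1, trt_module_1, inputs_1)),
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()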

py/torch_tensorrt/__init__.py (+5, -3)

@@ -85,15 +85,17 @@ def _find_lib(name: str, paths: List[str]) -> str:
 from torch_tensorrt._Device import Device  # noqa: F401
 from torch_tensorrt._enums import *  # noqa: F403
 from torch_tensorrt._Input import Input  # noqa: F401
-from torch_tensorrt.logging import *
-from torch_tensorrt.ptq import *
 from torch_tensorrt._utils import *  # noqa: F403
 from torch_tensorrt._utils import sanitized_torch_version
+from torch_tensorrt.logging import *
+from torch_tensorrt.ptq import *
+from torch_tensorrt.runtime import *  # noqa: F403

 if version.parse(sanitized_torch_version()) >= version.parse("2.1.dev"):
-    from torch_tensorrt import dynamo  # noqa: F401
     from torch_tensorrt.dynamo import backend  # noqa: F401

+    from torch_tensorrt import dynamo  # noqa: F401
+

 def _register_with_torch() -> None:
     trtorch_dir = os.path.dirname(__file__)

py/torch_tensorrt/dynamo/_compiler.py (+23, -7)

@@ -16,13 +16,21 @@
 from torch_tensorrt.dynamo._defaults import (
     DEBUG,
     DEVICE,
+    DISABLE_TF32,
+    DLA_GLOBAL_DRAM_SIZE,
+    DLA_LOCAL_DRAM_SIZE,
+    DLA_SRAM_SIZE,
     ENABLE_EXPERIMENTAL_DECOMPOSITIONS,
+    ENGINE_CAPABILITY,
     MAX_AUX_STREAMS,
     MIN_BLOCK_SIZE,
+    NUM_AVG_TIMING_ITERS,
     OPTIMIZATION_LEVEL,
     PASS_THROUGH_BUILD_FAILURES,
     PRECISION,
+    REFIT,
     REQUIRE_FULL_COMPILATION,
+    SPARSE_WEIGHTS,
     TRUNCATE_LONG_AND_DOUBLE,
     USE_FAST_PARTITIONER,
     USE_PYTHON_RUNTIME,
@@ -51,17 +59,18 @@ def compile(
     inputs: Any,
     *,
     device: Optional[Union[Device, torch.device, str]] = DEVICE,
-    disable_tf32: bool = False,
-    sparse_weights: bool = False,
+    disable_tf32: bool = DISABLE_TF32,
+    sparse_weights: bool = SPARSE_WEIGHTS,
     enabled_precisions: Set[torch.dtype] | Tuple[torch.dtype] = (torch.float32,),
-    refit: bool = False,
+    engine_capability: EngineCapability = ENGINE_CAPABILITY,
+    refit: bool = REFIT,
     debug: bool = DEBUG,
     capability: EngineCapability = EngineCapability.default,
-    num_avg_timing_iters: int = 1,
+    num_avg_timing_iters: int = NUM_AVG_TIMING_ITERS,
     workspace_size: int = WORKSPACE_SIZE,
-    dla_sram_size: int = 1048576,
-    dla_local_dram_size: int = 1073741824,
-    dla_global_dram_size: int = 536870912,
+    dla_sram_size: int = DLA_SRAM_SIZE,
+    dla_local_dram_size: int = DLA_LOCAL_DRAM_SIZE,
+    dla_global_dram_size: int = DLA_GLOBAL_DRAM_SIZE,
     calibrator: object = None,
     truncate_long_and_double: bool = TRUNCATE_LONG_AND_DOUBLE,
     require_full_compilation: bool = REQUIRE_FULL_COMPILATION,
@@ -200,6 +209,13 @@ def compile(
         "use_fast_partitioner": use_fast_partitioner,
         "enable_experimental_decompositions": enable_experimental_decompositions,
         "require_full_compilation": require_full_compilation,
+        "disable_tf32": disable_tf32,
+        "sparse_weights": sparse_weights,
+        "refit": refit,
+        "engine_capability": engine_capability,
+        "dla_sram_size": dla_sram_size,
+        "dla_local_dram_size": dla_local_dram_size,
+        "dla_global_dram_size": dla_global_dram_size,
     }

     settings = CompilationSettings(**compilation_options)
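With this change the new build arguments flow from ``compile(...)`` through ``compilation_options`` into ``CompilationSettings``. A sketch of a call exercising a few of them, assuming a CUDA-capable environment (the model and shapes are illustrative):

.. code-block:: python

    import torch
    import torch_tensorrt

    model = torch.nn.Linear(128, 64).eval().cuda()
    inputs = [torch.randn(1, 128).cuda()]

    # Each keyword below is forwarded into CompilationSettings via compilation_options
    trt_model = torch_tensorrt.compile(
        model,
        ir="dynamo",
        inputs=inputs,
        disable_tf32=True,       # default False (DISABLE_TF32)
        sparse_weights=False,    # default False (SPARSE_WEIGHTS)
        refit=False,             # default False (REFIT)
        num_avg_timing_iters=2,  # default 1 (NUM_AVG_TIMING_ITERS)
    )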

py/torch_tensorrt/dynamo/_defaults.py (+9)

@@ -1,19 +1,28 @@
 import torch
+from tensorrt import EngineCapability
 from torch_tensorrt._Device import Device

 PRECISION = torch.float32
 DEBUG = False
 DEVICE = None
+DISABLE_TF32 = False
+DLA_LOCAL_DRAM_SIZE = 1073741824
+DLA_GLOBAL_DRAM_SIZE = 536870912
+DLA_SRAM_SIZE = 1048576
+ENGINE_CAPABILITY = EngineCapability.STANDARD
 WORKSPACE_SIZE = 0
 MIN_BLOCK_SIZE = 5
 PASS_THROUGH_BUILD_FAILURES = False
 MAX_AUX_STREAMS = None
+NUM_AVG_TIMING_ITERS = 1
 VERSION_COMPATIBLE = False
 OPTIMIZATION_LEVEL = None
+SPARSE_WEIGHTS = False
 TRUNCATE_LONG_AND_DOUBLE = False
 USE_PYTHON_RUNTIME = False
 USE_FAST_PARTITIONER = True
 ENABLE_EXPERIMENTAL_DECOMPOSITIONS = False
+REFIT = False
 REQUIRE_FULL_COMPILATION = False

py/torch_tensorrt/dynamo/_settings.py (+25)

@@ -2,16 +2,25 @@
 from typing import Optional, Set

 import torch
+from tensorrt import EngineCapability
 from torch_tensorrt._Device import Device
 from torch_tensorrt.dynamo._defaults import (
     DEBUG,
+    DISABLE_TF32,
+    DLA_GLOBAL_DRAM_SIZE,
+    DLA_LOCAL_DRAM_SIZE,
+    DLA_SRAM_SIZE,
     ENABLE_EXPERIMENTAL_DECOMPOSITIONS,
+    ENGINE_CAPABILITY,
     MAX_AUX_STREAMS,
     MIN_BLOCK_SIZE,
+    NUM_AVG_TIMING_ITERS,
     OPTIMIZATION_LEVEL,
     PASS_THROUGH_BUILD_FAILURES,
     PRECISION,
+    REFIT,
     REQUIRE_FULL_COMPILATION,
+    SPARSE_WEIGHTS,
     TRUNCATE_LONG_AND_DOUBLE,
     USE_FAST_PARTITIONER,
     USE_PYTHON_RUNTIME,
@@ -46,6 +55,14 @@ class CompilationSettings:
         device (Device): GPU to compile the model on
         require_full_compilation (bool): Whether to require the graph is fully compiled in TensorRT.
             Only applicable for `ir="dynamo"`; has no effect for `torch.compile` path
+        disable_tf32 (bool): Whether to disable TF32 computation for TRT layers
+        sparse_weights (bool): Whether to allow the builder to use sparse weights
+        refit (bool): Whether to build a refittable engine
+        engine_capability (trt.EngineCapability): Restrict kernel selection to safe gpu kernels or safe dla kernels
+        num_avg_timing_iters (int): Number of averaging timing iterations used to select kernels
+        dla_sram_size (int): Fast software managed RAM used by DLA to communicate within a layer
+        dla_local_dram_size (int): Host RAM used by DLA to share intermediate tensor data across operations
+        dla_global_dram_size (int): Host RAM used by DLA to store weights and metadata for execution
     """

@@ -63,3 +80,11 @@ class CompilationSettings:
     enable_experimental_decompositions: bool = ENABLE_EXPERIMENTAL_DECOMPOSITIONS
     device: Device = field(default_factory=default_device)
     require_full_compilation: bool = REQUIRE_FULL_COMPILATION
+    disable_tf32: bool = DISABLE_TF32
+    sparse_weights: bool = SPARSE_WEIGHTS
+    refit: bool = REFIT
+    engine_capability: EngineCapability = ENGINE_CAPABILITY
+    num_avg_timing_iters: int = NUM_AVG_TIMING_ITERS
+    dla_sram_size: int = DLA_SRAM_SIZE
+    dla_local_dram_size: int = DLA_LOCAL_DRAM_SIZE
+    dla_global_dram_size: int = DLA_GLOBAL_DRAM_SIZE
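For completeness, a sketch of constructing the expanded settings object directly; the values shown are the new ``_defaults.py`` constants, so this is equivalent to ``CompilationSettings()``:

.. code-block:: python

    from torch_tensorrt.dynamo._settings import CompilationSettings

    # Omitted fields fall back to the _defaults.py constants
    settings = CompilationSettings(
        disable_tf32=False,              # DISABLE_TF32
        sparse_weights=False,            # SPARSE_WEIGHTS
        refit=False,                     # REFIT
        num_avg_timing_iters=1,          # NUM_AVG_TIMING_ITERS
        dla_sram_size=1048576,           # DLA_SRAM_SIZE (1 MiB)
        dla_local_dram_size=1073741824,  # DLA_LOCAL_DRAM_SIZE (1 GiB)
        dla_global_dram_size=536870912,  # DLA_GLOBAL_DRAM_SIZE (512 MiB)
    )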
