Skip to content

fix: Address multi-GPU issue in engine deserialize #2325

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Sep 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions core/runtime/execute_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ bool is_switch_required(const RTDevice& curr_device, const RTDevice& engine_devi
return false;
}

RTDevice select_rt_device(const RTDevice& engine_device) {
auto new_target_device_opt = get_most_compatible_device(engine_device);
RTDevice select_rt_device(const RTDevice& engine_device, const RTDevice& curr_device) {
auto new_target_device_opt = get_most_compatible_device(engine_device, curr_device);

// REVIEW: THIS DOES NOT LIST DLA PROBABLY, WHICH WE SHOULD
// TODO: I think this logic could be way simpler at execution time since if the tensors aren't on the right
Expand Down Expand Up @@ -89,7 +89,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr

if (is_switch_required(curr_device, compiled_engine->device_info)) {
// Scan through available CUDA devices and set the CUDA device context correctly
RTDevice device = select_rt_device(compiled_engine->device_info);
RTDevice device = select_rt_device(compiled_engine->device_info, curr_device);
set_rt_device(device);

// Target device is new device
Expand Down
27 changes: 22 additions & 5 deletions core/runtime/runtime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,16 @@ namespace torch_tensorrt {
namespace core {
namespace runtime {

c10::optional<RTDevice> get_most_compatible_device(const RTDevice& target_device) {
c10::optional<RTDevice> get_most_compatible_device(const RTDevice& target_device, const RTDevice& curr_device) {
LOG_DEBUG("Target Device: " << target_device);
auto device_options = find_compatible_devices(target_device);
RTDevice current_device;
if (current_device.id == -1) {
current_device = get_current_device();
} else {
current_device = curr_device;
}

if (device_options.size() == 0) {
return {};
} else if (device_options.size() == 1) {
Expand All @@ -21,10 +28,20 @@ c10::optional<RTDevice> get_most_compatible_device(const RTDevice& target_device
dev_list << "[" << std::endl;
for (auto device : device_options) {
dev_list << " " << device << ',' << std::endl;
if (device.device_name == target_device.device_name && best_match.device_name != target_device.device_name) {
best_match = device;
} else if (device.device_name == target_device.device_name && best_match.device_name == target_device.device_name) {
if (device.id == target_device.id && best_match.id != target_device.id) {
if (device.device_name == target_device.device_name) {
// First priority is selecting a candidate which agrees with the current device ID
// If such a device is found, we can select it and break out of the loop
if (device.id == current_device.id && best_match.id != current_device.id) {
best_match = device;
break;
}
// Second priority is selecting a candidate which agrees with the target device ID
// At deserialization time, the current device and target device may not agree
else if (device.id == target_device.id && best_match.id != target_device.id) {
best_match = device;
}
// If no such GPU ID is found, select the first available candidate GPU
else if (best_match.device_name != target_device.device_name) {
best_match = device;
}
}
Expand Down
4 changes: 3 additions & 1 deletion core/runtime/runtime.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@ typedef enum {
SERIALIZATION_LEN, // NEVER USED FOR DATA, USED TO DETERMINE LENGTH OF SERIALIZED INFO
} SerializedInfoIndex;

c10::optional<RTDevice> get_most_compatible_device(const RTDevice& target_device);
c10::optional<RTDevice> get_most_compatible_device(
const RTDevice& target_device,
const RTDevice& curr_device = RTDevice());
std::vector<RTDevice> find_compatible_devices(const RTDevice& target_device);

std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intrusive_ptr<TRTEngine> compiled_engine);
Expand Down