Commit 03a4c02

Fix tp error when torch distributed is already initialized (#38294)
fix tp error
1 parent dcaf47d commit 03a4c02

File tree

1 file changed: +4 −0 lines changed


src/transformers/integrations/tensor_parallel.py

Lines changed: 4 additions & 0 deletions
@@ -52,6 +52,7 @@ def initialize_tensor_parallelism(tp_plan, tp_size=None):
 
     # Detect the accelerator on the machine. If no accelerator is available, it returns CPU.
     device_type = torch._C._get_accelerator().type
+    current_device = getattr(torch, device_type)
     if not torch.distributed.is_initialized():
         try:
             rank = int(os.environ["RANK"])
@@ -73,6 +74,9 @@ def initialize_tensor_parallelism(tp_plan, tp_size=None):
                 "We tried to initialize torch.distributed for you, but it failed. Make "
                 "sure you init torch distributed in your script to use `tp_plan='auto'`."
             ) from e
+
+    if device_type != "cpu":
+        current_device.set_device(int(os.environ["LOCAL_RANK"]))
     index = current_device.current_device() if device_type != "cpu" else None
     tp_device = torch.device(device_type, index)
 
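For context on the two added hunks: `torch._C._get_accelerator().type` returns the accelerator's type string (for example "cuda" or "cpu"), and `getattr(torch, device_type)` resolves the matching device namespace such as `torch.cuda`, which exposes `set_device` and `current_device`. A minimal standalone sketch of that resolution (not part of the commit; the `"0"` fallback for `LOCAL_RANK` is an assumption so the snippet also runs outside a torchrun launch):

```python
import os

import torch

# Resolve the accelerator the same way the patched function does.
device_type = torch._C._get_accelerator().type  # e.g. "cuda", "xpu", or "cpu"
current_device = getattr(torch, device_type)    # e.g. torch.cuda or torch.xpu

if device_type != "cpu":
    # Bind this process to its local accelerator; torchrun sets LOCAL_RANK.
    # The commit reads os.environ["LOCAL_RANK"] directly; the "0" fallback
    # here only lets the sketch run outside a distributed launch.
    current_device.set_device(int(os.environ.get("LOCAL_RANK", "0")))
    index = current_device.current_device()
else:
    index = None

tp_device = torch.device(device_type, index)
print(f"rank-local tensor-parallel device: {tp_device}")
```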

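The commit title describes the failure mode where the process group already exists before the model is loaded. A hedged repro sketch of that situation (the script name, checkpoint, and NCCL backend are illustrative assumptions, not taken from the commit): the caller initializes torch.distributed first, so `initialize_tensor_parallelism` skips its own setup branch and still has to define `current_device` and pin each rank to its local accelerator.

```python
# repro_tp.py -- illustrative only; launch with:
#   torchrun --nproc_per_node=2 repro_tp.py
import torch
import torch.distributed as dist
from transformers import AutoModelForCausalLM

# torch.distributed is initialized by the caller *before* loading the model,
# which is exactly the already-initialized case this commit fixes.
dist.init_process_group(backend="nccl")

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-1B",  # any tensor-parallel-capable checkpoint; name is illustrative
    tp_plan="auto",             # routes through initialize_tensor_parallelism()
    torch_dtype=torch.bfloat16,
)

dist.destroy_process_group()
```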