[torch.compile] directly register custom op #9896

Merged (25 commits) on Nov 1, 2024
Changes from 1 commit
fix parallel state
Signed-off-by: youkaichao <youkaichao@gmail.com>
youkaichao committed Oct 31, 2024
commit a09f3cb9c52afdecb40d6657d8cadc59d75dfe88
8 changes: 5 additions & 3 deletions vllm/distributed/parallel_state.py
@@ -349,6 +349,11 @@ def all_reduce(self, input_: torch.Tensor) -> torch.Tensor:
         if self.world_size == 1:
             return input_
 
+        if input_.is_cpu:
+            import intel_extension_for_pytorch as ipex
+            ipex.distributed.all_reduce(input_, group=self.device_group)
+            return input_
+
         if not supports_custom_op():
             self._all_reduce_in_place(input_)
             return input_
@@ -380,9 +385,6 @@ def _all_reduce_in_place(self, input_: torch.Tensor) -> None:
         pynccl_comm = self.pynccl_comm
         if (pynccl_comm is not None and not pynccl_comm.disabled):
             pynccl_comm.all_reduce(input_)
-        elif input_.is_cpu:
-            import intel_extension_for_pytorch as ipex
-            ipex.distributed.all_reduce(input_, group=self.device_group)
         else:
             torch.distributed.all_reduce(input_, group=self.device_group)

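The net effect of this commit is to move the CPU (ipex) branch out of `_all_reduce_in_place` and handle it at the top of `all_reduce`, before the custom-op check, so CPU tensors never reach the custom-op or pynccl paths. Below is a minimal sketch of that dispatch order, assembled only from the visible hunks; it is not the actual vLLM implementation. The injected parameters (`device_group`, `supports_custom_op`, `custom_op_all_reduce`, `all_reduce_in_place`) are stand-ins for state and helpers that live in `parallel_state.py`, and the final dispatch to a registered custom op is assumed from the PR title rather than shown in this diff.

```python
from typing import Callable

import torch


def all_reduce_dispatch(
    input_: torch.Tensor,
    world_size: int,
    device_group,                                   # stand-in for self.device_group
    supports_custom_op: Callable[[], bool],         # stand-in for the module-level helper
    custom_op_all_reduce: Callable[[torch.Tensor], torch.Tensor],
    all_reduce_in_place: Callable[[torch.Tensor], None],
) -> torch.Tensor:
    # Single-rank groups are a no-op.
    if world_size == 1:
        return input_

    # New in this commit: CPU tensors are reduced via intel_extension_for_pytorch
    # and returned early, ahead of any custom-op handling.
    if input_.is_cpu:
        import intel_extension_for_pytorch as ipex
        ipex.distributed.all_reduce(input_, group=device_group)
        return input_

    # Without custom-op support, fall back to the in-place implementation,
    # which after this commit only chooses between pynccl and torch.distributed.
    if not supports_custom_op():
        all_reduce_in_place(input_)
        return input_

    # Otherwise go through the directly registered custom op (assumed path).
    return custom_op_all_reduce(input_)
```

A degenerate single-process call illustrates the control flow without touching any distributed backend:

```python
# With world_size == 1 the tensor is returned untouched, so the ipex, pynccl,
# and custom-op branches are never exercised.
x = torch.ones(4)
y = all_reduce_dispatch(x, world_size=1, device_group=None,
                        supports_custom_op=lambda: False,
                        custom_op_all_reduce=lambda t: t,
                        all_reduce_in_place=lambda t: None)
```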