diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py index 75b7c374c8e6e..e0641a54c4194 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py +++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py @@ -71,6 +71,7 @@ def consumer(batch_tgt: Sequence[int], if open_success: # modify the memory lib.cudaMemset(pointer, 2, 1024) + lib.cudaDeviceSynchronize() # use two queues to simulate barrier producer_queue.get() consumer_queue.put(0) @@ -142,8 +143,13 @@ def can_actually_p2p( for src, tgt in zip(batch_src, batch_tgt): a = result_queue.get() b = result_queue.get() - assert a == b - result.append(a) + if a != b: + logger.warning( + "Two processes do not agree on the P2P access" + " status on %d -> %d, treat as disabled.", src, tgt) + result.append(False) + else: + result.append(a) return result