5 changes: 4 additions & 1 deletion examples/offline_inference/rlhf.py
@@ -126,7 +126,10 @@ def __init__(self, *args, **kwargs):

# Synchronize the updated weights to the inference engine.
for name, p in train_model.named_parameters():
handle = llm.collective_rpc.remote("update_weight", args=(name, p.dtype, p.shape))
dtype_name = str(p.dtype).split(".")[-1]
handle = llm.collective_rpc.remote(
"update_weight", args=(name, dtype_name, p.shape)
)
model_update_group.broadcast(p, src=0, stream=torch.cuda.current_stream())
ray.get(handle)

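The trainer side now passes the dtype by attribute name instead of as a torch.dtype object. A quick standalone check of that round trip (plain Python, not part of the diff; the bfloat16 parameter is just an illustrative stand-in for one of train_model's weights):

import torch

# "torch.bfloat16" -> "bfloat16"; getattr(torch, name) recovers the same dtype object.
p = torch.nn.Parameter(torch.zeros(2, 2, dtype=torch.bfloat16))
dtype_name = str(p.dtype).split(".")[-1]
assert dtype_name == "bfloat16"
assert getattr(torch, dtype_name) is p.dtype
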
3 changes: 2 additions & 1 deletion examples/offline_inference/rlhf_utils.py
@@ -45,7 +45,8 @@ def init_weight_update_group(
self.device,
)

def update_weight(self, name, dtype, shape):
def update_weight(self, name, dtype_name, shape):
dtype = getattr(torch, dtype_name)
weight = torch.empty(shape, dtype=dtype, device="cuda")
self.model_update_group.broadcast(
weight, src=0, stream=torch.cuda.current_stream()
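On the worker side the pattern is: rebuild the dtype from its name, allocate an uninitialized buffer of the advertised shape, and let the broadcast fill it in place. A minimal sketch of just the allocation step (illustrative values; the CPU fallback only exists so the snippet runs without a GPU, and the group broadcast itself is the call shown in the diff above):

import torch

dtype_name, shape = "float16", (1024, 1024)  # illustrative values
device = "cuda" if torch.cuda.is_available() else "cpu"
# torch.empty allocates without initializing; the collective broadcast overwrites it.
weight = torch.empty(shape, dtype=getattr(torch, dtype_name), device=device)
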
6 changes: 6 additions & 0 deletions vllm/distributed/device_communicators/pynccl.py
@@ -47,6 +47,9 @@
else:
self.rank = group.rank
self.world_size = group.world_size
logger.warning(
    "PyNcclCommunicator init: rank %s world_size %s, group %s",
    self.rank, self.world_size, group)

self.group = group

@@ -265,6 +268,9 @@
self.comm, cudaStream_t(stream.cuda_stream))

def broadcast(self, tensor: torch.Tensor, src: int, stream=None):
logger.warning(
    "broadcast: rank %s/%s, group %s, src %s, shape %s",
    self.rank, self.world_size, self.group, src, tensor.shape)
if self.disabled:
return
assert tensor.device == self.device, (
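The logger calls above use lazy %-formatting, which is what Ruff's G004 rule asks for: an f-string builds the message even when the record is filtered out, while %-style arguments are only formatted if the record is actually emitted. A small standalone illustration (standard-library logging only, unrelated to NCCL):

import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("lazy-logging-demo")

class Expensive:
    def __str__(self) -> str:
        print("formatting happened")  # visible side effect when %s is evaluated
        return "<expensive repr>"

logger.debug("debug: %s", Expensive())      # filtered out, never formatted
logger.warning("warning: %s", Expensive())  # emitted, formatted exactly once
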
4 changes: 3 additions & 1 deletion vllm/distributed/utils.py
@@ -389,6 +389,8 @@ def create(
can call `StatelessProcessGroup.create` to form a group, and then process A, B,
C, and D can call `StatelessProcessGroup.create` to form another group.
""" # noqa
print("StatelessProcessGroup.create with rank ", rank, ", world_size ",
world_size)
launch_server = rank == 0
if launch_server:
# listen on the specified interface (instead of 0.0.0.0)
@@ -423,7 +425,7 @@ def init_gloo_process_group(backend: Backend, prefix_store: PrefixStore,
group_rank: int, group_size: int,
timeout: timedelta) -> ProcessGroup:
"""
Stateless init ProcessGroup with gloo backend compatible with
Stateless init ProcessGroup with gloo backend compatible with
different torch versions.
"""
if is_torch_equal_or_newer("2.6"):
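init_gloo_process_group gates on is_torch_equal_or_newer("2.6") so the gloo ProcessGroup is constructed with whichever API the installed torch expects. A minimal sketch of that version-gating idea (this is not vLLM's helper; it assumes the packaging library is installed):

import torch
from packaging import version

def torch_at_least(minimum: str) -> bool:
    # Compare against the base release so dev/nightly suffixes do not skew the check.
    installed = version.parse(version.parse(torch.__version__).base_version)
    return installed >= version.parse(minimum)

if torch_at_least("2.6"):
    pass  # build the ProcessGroup with the newer constructor
else:
    pass  # fall back to the older constructor signature
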
1 change: 0 additions & 1 deletion vllm/entrypoints/llm.py
@@ -197,7 +197,6 @@ def __init__(
**kwargs,
) -> None:
"""LLM constructor."""

if "disable_log_stats" not in kwargs:
kwargs["disable_log_stats"] = True

1 change: 1 addition & 0 deletions vllm/executor/ray_utils.py
@@ -40,6 +40,7 @@
lazily initialized after Ray sets CUDA_VISIBLE_DEVICES."""

def __init__(self, *args, **kwargs) -> None:
logger.warning(f"===quinnzhu Initializing RayWorkerWrapper with {kwargs["rpc_rank"]=}.")

Check failure on line 43 in vllm/executor/ray_utils.py

View workflow job for this annotation

GitHub Actions / pre-commit

Ruff (E501)

vllm/executor/ray_utils.py:43:81: E501 Line too long (100 > 80)

Check failure on line 43 in vllm/executor/ray_utils.py

View workflow job for this annotation

GitHub Actions / pre-commit

Ruff (G004)

vllm/executor/ray_utils.py:43:28: G004 Logging statement uses f-string
super().__init__(*args, **kwargs)
# Since the compiled DAG runs a main execution
# in a different thread that calls cuda.set_device.
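Quote reuse is the subtle point in this debug line: before Python 3.12 (PEP 701), an f-string such as f"{kwargs["rpc_rank"]}" that repeats the outer quote inside the replacement field is a SyntaxError, so portable code either switches the inner quotes or passes the value as a lazy logging argument as done above. A standalone illustration with a stand-in dict:

kwargs = {"rpc_rank": 3}  # stand-in for the wrapper's constructor kwargs

print(f"rpc_rank={kwargs['rpc_rank']}")  # portable: inner quotes differ from the outer ones
# print(f"{kwargs["rpc_rank"]=}")        # quote reuse: valid only on Python >= 3.12
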
26 changes: 16 additions & 10 deletions vllm/model_executor/layers/linear.py
@@ -90,8 +90,8 @@
def adjust_scalar_to_fused_array(param, loaded_weight, shard_id):
"""For fused modules (QKV and MLP) we have an array of length
N that holds 1 scale for each "logical" matrix. So the param
is an array of length N. The loaded_weight corresponds to
one of the shards on disk. Here, we slice the param based on
is an array of length N. The loaded_weight corresponds to
one of the shards on disk. Here, we slice the param based on
the shard_id for loading.
"""
qkv_idxs = {"q": 0, "k": 1, "v": 2}
@@ -118,13 +118,13 @@

For example, given bnb weight attributes as below:
{
'bnb_shard_offsets': array([0, 4, 8, 16]),
'bnb_shard_offsets': array([0, 4, 8, 16]),
'bnb_quant_state': {0: ..., 1: ..., 2: ...},
}

The function will return:
{
'bnb_shard_offsets': array([0, 4]),
'bnb_shard_offsets': array([0, 4]),
'bnb_quant_state': {0: ...},
}
and
@@ -156,13 +156,13 @@
output_partition_sizes: list[int], input_size: int,
output_size: int, params_dtype: torch.dtype,
**extra_weight_attrs):
"""Create weights for a linear layer.
"""Create weights for a linear layer.
The weights will be set as attributes of the layer.

Args:
layer: The layer that is using the LinearMethodBase factory.
input_size_per_partition: Size of the weight input dim on rank X.
output_partition_sizes: Sizes of the output dim of each logical
output_partition_sizes: Sizes of the output dim of each logical
weight on rank X. E.g., output_partition_sizes for QKVLinear
is a list contains the width of Wq, Wk, Wv on rank X.
input_size: Size of the input dim of the weight across all ranks.
@@ -464,7 +464,7 @@
output_sizes: list of output sizes packed into one output, like for QKV
the list would be size 3.
prefix: The name of the layer in the state dict, including all parents
(e.g. model.layers.0.qkv_proj)
(e.g. model.layers.0.qkv_proj)
"""

def __init__(
@@ -559,11 +559,13 @@
if output_dim is not None and not is_sharded_weight:
shard_size = param_data.shape[output_dim]
start_idx = self.tp_rank * shard_size
logger.info(f"ColumnParallel loaded_weight before narrow : {loaded_weight.shape}")
loaded_weight = loaded_weight.narrow(output_dim, start_idx,

Check failure on line 563 in vllm/model_executor/layers/linear.py

View workflow job for this annotation

GitHub Actions / pre-commit

Ruff (E501)

vllm/model_executor/layers/linear.py:563:81: E501 Line too long (85 > 80)

Check failure on line 563 in vllm/model_executor/layers/linear.py

View workflow job for this annotation

GitHub Actions / pre-commit

Ruff (G004)

vllm/model_executor/layers/linear.py:563:17: G004 Logging statement uses f-string
shard_size)
logger.info(f"ColumnParallel loaded_weight after narrow : {loaded_weight.shape}")

# Special case for loading scales off disk, which often do not
# have a shape (such as in the case of AutoFP8).

if len(loaded_weight.shape) == 0:
loaded_weight = loaded_weight.reshape(1)

@@ -946,7 +948,7 @@
self.output_sizes = [
self.num_heads * self.head_size * tp_size, # q_proj
self.num_kv_heads * self.head_size * tp_size, # k_proj
self.num_kv_heads * self.head_size * tp_size, # v_proj
self.num_kv_heads * self.head_size * tp_size, # v_proj
]

super().__init__(input_size=input_size,
@@ -979,7 +981,7 @@
def _load_fused_module_from_checkpoint(self, param: BasevLLMParameter,
loaded_weight: torch.Tensor):
"""
Handle special case for models where QKV layers are already
Handle special case for models where QKV layers are already
fused on disk. In this case, we have no shard id. This function
determines the shard id by splitting these layers and then calls
the weight loader using the shard id.
@@ -1203,8 +1205,10 @@
start_idx = shard_id * shard_size

if not is_sharded_weight:
logger.info(f"QKV loaded_weight before narrow : {loaded_weight.shape}")
loaded_weight = loaded_weight.narrow(output_dim, start_idx,
shard_size)
logger.info(f"QKV loaded_weight after narrow : {loaded_weight.shape}")

# Special case for AQLM codebooks.
elif is_metadata:
@@ -1344,8 +1348,10 @@
if input_dim is not None and not is_sharded_weight:
shard_size = param_data.shape[input_dim]
start_idx = self.tp_rank * shard_size
logger.info(f"RowParallel loaded_weight before narrow : {loaded_weight.shape}")
loaded_weight = loaded_weight.narrow(input_dim, start_idx,
shard_size)
logger.info(f"RowParallel loaded_weight after narrow : {loaded_weight.shape}")

# Special case for loading scales off disk, which often do not
# have a shape (such as in the case of AutoFP8).
@@ -1569,7 +1575,7 @@
param: nn.Parameter,
) -> nn.Parameter:
"""
Given the placeholder param,
Given the placeholder param,
return the corresponding param in the proj layers.
"""
target_param_list = [
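Every loader instrumented above shards the full checkpoint tensor with Tensor.narrow(dim, start, length), where start comes from the tensor-parallel rank. A toy standalone version of that slice (illustrative sizes, not vLLM code):

import torch

tp_size, tp_rank, output_dim = 4, 1, 0          # illustrative values
full_weight = torch.arange(32.0).reshape(8, 4)  # pretend checkpoint tensor

shard_size = full_weight.shape[output_dim] // tp_size
start_idx = tp_rank * shard_size
shard = full_weight.narrow(output_dim, start_idx, shard_size)  # rank 1 keeps rows [2, 4)
assert shard.shape == (2, 4)
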
8 changes: 5 additions & 3 deletions vllm/v1/engine/core_client.py
@@ -47,7 +47,7 @@

class EngineCoreClient(ABC):
"""
EngineCoreClient: subclasses handle different methods for pushing
EngineCoreClient: subclasses handle different methods for pushing
and pulling from the EngineCore for asyncio / multiprocessing.

Subclasses:
@@ -232,7 +232,7 @@ async def collective_rpc_async(

class InprocClient(EngineCoreClient):
"""
InprocClient: client for in-process EngineCore. Intended
InprocClient: client for in-process EngineCore. Intended
for use in LLMEngine for V0-style add_request() and step()
EngineCore setup in this process (no busy loop).

@@ -377,7 +377,7 @@ class MPClient(EngineCoreClient):

* pushes EngineCoreRequests via input_socket
* pulls EngineCoreOutputs via output_socket

* AsyncMPClient subclass for AsyncLLM usage
* SyncMPClient subclass for LLM usage
"""
@@ -563,6 +563,7 @@ class SyncMPClient(MPClient):

def __init__(self, vllm_config: VllmConfig, executor_class: type[Executor],
log_stats: bool):
logger.info("===quinnzhu Initializing SyncMPClient")
super().__init__(
asyncio_mode=False,
vllm_config=vllm_config,
@@ -710,6 +711,7 @@ def collective_rpc(self,
timeout: Optional[float] = None,
args: tuple = (),
kwargs: Optional[dict[str, Any]] = None) -> list[_R]:
logger.info(f"===quinnzhu collective_rpc {method=}")
return self.call_utility("collective_rpc", method, timeout, args,
kwargs)

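The docstrings above describe the split between an in-process client and multiprocessing clients that push requests and pull outputs over sockets. A stripped-down sketch of that shape (the *Sketch classes are hypothetical, not vLLM's implementation, and the socket methods assume a pyzmq-style API):

from abc import ABC, abstractmethod
from typing import Any

class EngineCoreClientSketch(ABC):
    """Common interface; subclasses differ in where the engine core runs."""

    @abstractmethod
    def add_request(self, request: Any) -> None: ...

    @abstractmethod
    def get_output(self) -> Any: ...

class InprocClientSketch(EngineCoreClientSketch):
    """Engine core lives in this process: call it directly, no busy loop."""

    def __init__(self, core: Any) -> None:
        self.core = core

    def add_request(self, request: Any) -> None:
        self.core.add_request(request)

    def get_output(self) -> Any:
        return self.core.step()

class MPClientSketch(EngineCoreClientSketch):
    """Engine core in another process: push via the input socket, pull via the output socket."""

    def __init__(self, input_socket: Any, output_socket: Any) -> None:
        self.input_socket = input_socket
        self.output_socket = output_socket

    def add_request(self, request: Any) -> None:
        self.input_socket.send_pyobj(request)

    def get_output(self) -> Any:
        return self.output_socket.recv_pyobj()
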
4 changes: 3 additions & 1 deletion vllm/v1/worker/gpu_worker.py
@@ -50,7 +50,9 @@ def __init__(
distributed_init_method: str,
is_driver_worker: bool = False,
):

logger.warning(
    "===quinnzhu Worker init with local_rank=%s, rank=%s, "
    "distributed_init_method=%s", local_rank, rank, distributed_init_method)
super().__init__(vllm_config=vllm_config,
local_rank=local_rank,
rank=rank,