[V1][Bugfix] Fix data item ordering in mixed-modality inference #12259

Merged · 4 commits · Jan 21, 2025
Changes from 1 commit
simplify
Signed-off-by: Roger Wang <ywang@roblox.com>
ywang96 committed Jan 21, 2025
commit e523caaaab21db490cb5a130bcf551c3d3ca4b74
43 changes: 16 additions & 27 deletions vllm/v1/worker/gpu_model_runner.py
@@ -631,44 +631,33 @@ def _execute_encoder(self, scheduler_output: "SchedulerOutput"):
                 mm_inputs.append(req_state.mm_inputs[input_id])
                 req_input_ids.append((req_id, input_id))

-        # Batch mm inputs as much as we can: if a request has multiple or
-        # a different modality than the previous one, we process it
-        # separately to preserve item order.
+        # Batch mm inputs as much as we can: if a request in the batch has
+        # multiple modalities or a different modality than the previous one,
+        # we process it separately to preserve item order.
         # FIXME(ywang96): This is a hacky way to deal with multiple modalities
         # in the same batch while still being able to benefit from batching
         # multimodal inputs. The proper solution should be reordering the
         # encoder outputs.
         grouped_mm_inputs_list = group_mm_inputs_by_modality(mm_inputs)

-        # If there is only one group (single modality), we can return the
-        # result directly.
-        if len(grouped_mm_inputs_list) == 1:
-            batched_mm_inputs = MultiModalKwargs.batch(
-                grouped_mm_inputs_list[0])
+        encoder_outputs = []
+        for grouped_mm_inputs in grouped_mm_inputs_list:
+            batched_mm_inputs = MultiModalKwargs.batch(grouped_mm_inputs)
             batched_mm_inputs = MultiModalKwargs.as_kwargs(batched_mm_inputs,
                                                            device=self.device)

             # Run the encoder.
-            # `encoder_outputs` is either of the following:
-            # 1. A tensor of shape [num_items, feature_size, hidden_size]
-            # in case when feature_size is fixed across all multimodal items.
-            # 2. A list of tuple (length: num_items) of tensors, each of shape
-            # (feature_size, hidden_size) in case when the feature size is
-            # dynamic depending on input multimodal items.
-            encoder_outputs = self.model.get_multimodal_embeddings(
+            # `curr_group_outputs` is either of the following:
+            # 1. A tensor of shape (num_items, feature_size, hidden_size)
+            # in case feature_size is fixed across all multimodal items.
+            # 2. A list or tuple (length: num_items) of tensors, each of shape
+            # (feature_size, hidden_size) in case the feature size is dynamic
+            # depending on the input multimodal items.
+            curr_group_outputs = self.model.get_multimodal_embeddings(
                 **batched_mm_inputs)

-        # If there are multiple groups, we process them one by one
-        # and concatenate the results.
-        else:
-            encoder_outputs = []
-            for grouped_mm_inputs in grouped_mm_inputs_list:
-                batched_mm_inputs = MultiModalKwargs.batch(grouped_mm_inputs)
-                batched_mm_inputs = MultiModalKwargs.as_kwargs(
-                    batched_mm_inputs, device=self.device)
-                curr_group_outputs = self.model.get_multimodal_embeddings(
-                    **batched_mm_inputs)
-                for output in curr_group_outputs:
-                    encoder_outputs.append(output)
+            for output in curr_group_outputs:
+                encoder_outputs.append(output)

         # Cache the encoder outputs.
         for (req_id, input_id), output in zip(req_input_ids, encoder_outputs):
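
Note on ordering: the grouping step only merges consecutive items that share a modality, so concatenating the per-group outputs reproduces the submission order that req_input_ids was built in. A minimal sketch of that idea, using dict-based stand-in items and a hypothetical get_modality accessor (this is not vLLM's actual group_mm_inputs_by_modality implementation):

from itertools import groupby

def group_consecutive_by_modality(mm_inputs, get_modality):
    # Split an ordered list of multimodal inputs into consecutive runs that
    # share one modality, without ever reordering the items themselves.
    return [list(run) for _, run in groupby(mm_inputs, key=get_modality)]

# An interleaved image/audio batch stays in submission order.
items = [{"modality": "image", "id": 0},
         {"modality": "image", "id": 1},
         {"modality": "audio", "id": 2},
         {"modality": "image", "id": 3}]
groups = group_consecutive_by_modality(items, lambda x: x["modality"])
# -> [[id 0, id 1], [id 2], [id 3]]
# Flattening the groups back gives [0, 1, 2, 3], which is what the
# encoder-cache loop above relies on when it zips against req_input_ids.
assert [it["id"] for g in groups for it in g] == [0, 1, 2, 3]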
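
The per-group append loop works for both return forms documented in the comment because Python iteration treats them the same way: iterating a (num_items, feature_size, hidden_size) tensor over dim 0 yields per-item (feature_size, hidden_size) tensors, while iterating a list or tuple yields its elements directly. A small self-contained illustration, with plain torch tensors standing in for real encoder outputs:

import torch

def flatten_group_outputs(curr_group_outputs):
    # Mirrors the `for output in curr_group_outputs: encoder_outputs.append(output)`
    # pattern: one per-item tensor out, regardless of which form came in.
    return [output for output in curr_group_outputs]

# Case 1: fixed feature size -> one stacked tensor for the whole group.
stacked = torch.randn(3, 16, 8)            # 3 items, 16 features, hidden size 8
per_item = flatten_group_outputs(stacked)
assert len(per_item) == 3 and per_item[0].shape == (16, 8)

# Case 2: dynamic feature size -> a list of per-item tensors.
ragged = [torch.randn(16, 8), torch.randn(24, 8)]
per_item = flatten_group_outputs(ragged)
assert len(per_item) == 2 and per_item[1].shape == (24, 8)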