Fix

DarkLight1337 · DarkLight1337 · commit 44f58444d827 · 2025-02-10T17:42:57.000Z
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
@@ -83,7 +83,7 @@ class MolmoImageInputs(TypedDict):
     """
     Starting and ending index of placeholder tokens.
 
-    Shape: `(2,)`
+    Shape: `(batch_size, 2)`
     """
 
 
@@ -1151,13 +1151,15 @@ def __call__(
         idxs = inv_idxs.diff(prepend=torch.tensor([-1])).nonzero().squeeze(1)
         assert len(is_image_ids) == len(idxs) == len(counts)
 
-        image_start_end = list[tuple[int, int]]()
+        image_start_end_lst = list[tuple[int, int]]()
         for is_image_id, idx, count in zip(is_image_ids, idxs, counts):
             if is_image_id:
                 assert input_ids[idx] in image_ids
-                image_start_end.append((idx, idx + count))
+                image_start_end_lst.append((idx, idx + count))
 
-        outputs["image_start_end"] = torch.tensor(image_start_end)
+        image_start_end = torch.tensor(image_start_end_lst)
+        assert len(image_start_end) <= 1, "Multi-image input not supported yet"
+        outputs["image_start_end"] = image_start_end.squeeze(0)
 
         return BatchFeature(outputs, tensor_type=return_tensors)