[Bugfix] Fix embedding to support 2D inputs (vllm-project#5829)

WoosukKwon · Alvant · commit a7f70118f693 · 2024-10-26T09:38:30.000+03:00
Signed-off-by: Alvant <alvasian@yandex.ru>
diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -306,11 +306,11 @@ def forward(self, input_):
                 self.shard_indices.added_vocab_end_index)
         else:
             masked_input = input_
-            # Get the embeddings.
+        # Get the embeddings.
         output_parallel = F.embedding(masked_input.long(), self.weight)
         # Mask the output embedding.
         if self.tp_size > 1:
-            output_parallel.masked_fill_(input_mask.unsqueeze(1), 0)
+            output_parallel.masked_fill_(input_mask.unsqueeze(-1), 0)
         # Reduce across all the model parallel GPUs.
         output = tensor_model_parallel_all_reduce(output_parallel)
         return output