[Model] Add user-configurable task for models that support both generation and embedding #9424

Merged (20 commits) on Oct 18, 2024
Changes from 8 commits
12 changes: 10 additions & 2 deletions docs/source/models/supported_models.rst
@@ -424,7 +424,7 @@ Text Generation
- :code:`google/paligemma-3b-pt-224`, :code:`google/paligemma-3b-mix-224`, etc.
-
- ✅︎
* - :code:`Phi3VForCausalLM`
* - :code:`Phi3VForCausalLM` (see note)
- Phi-3-Vision, Phi-3.5-Vision
- T + I\ :sup:`E+`
- :code:`microsoft/Phi-3-vision-128k-instruct`, :code:`microsoft/Phi-3.5-vision-instruct` etc.
@@ -462,6 +462,10 @@ Text Generation
For :code:`openbmb/MiniCPM-V-2`, the official repo doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now.
For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630

.. note::
The :code:`Phi3VForCausalLM` architecture supports both generation and embedding tasks.
For text generation, please pass :code:`--task generate` to run the model in generation mode.

Multimodal Embedding
--------------------

@@ -475,13 +479,17 @@ Multimodal Embedding
- Example HF Models
- :ref:`LoRA <lora>`
- :ref:`PP <distributed_serving>`
* - :code:`Phi3VForCausalLM`
* - :code:`Phi3VForCausalLM` (see note)
- Phi-3-Vision-based
- T + I
- :code:`TIGER-Lab/VLM2Vec-Full`
- 🚧
- ✅︎

.. note::
The :code:`Phi3VForCausalLM` architecture supports both generation and embedding tasks.
For embedding, please pass :code:`--task embed` to run the model in embedding mode.

----

If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.
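Taken together, the two notes mean the same Phi3VForCausalLM architecture is now disambiguated by the new task option rather than by a separate model class; the --task flag in the notes is the CLI spelling of the same option for vllm serve. A minimal offline sketch of both modes, using the checkpoints listed in the tables above (keyword arguments beyond model and task are illustrative, not required):

from vllm import LLM

# Generation: Phi-3.5-vision served as a text + image generator.
generator = LLM(
    model="microsoft/Phi-3.5-vision-instruct",
    task="generate",
    trust_remote_code=True,
    max_model_len=4096,
)

# Embedding: the VLM2Vec fine-tune of the same architecture served as an embedder.
embedder = LLM(
    model="TIGER-Lab/VLM2Vec-Full",
    task="embed",
    trust_remote_code=True,
    max_model_len=4096,
)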
4 changes: 2 additions & 2 deletions docs/source/models/vlm.rst
@@ -181,8 +181,8 @@ Below is an example on how to launch the same ``microsoft/Phi-3.5-vision-instruc

.. code-block:: bash

vllm serve microsoft/Phi-3.5-vision-instruct --max-model-len 4096 \
--trust-remote-code --limit-mm-per-prompt image=2
vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
--trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2

.. important::
Since OpenAI Vision API is based on `Chat Completions <https://platform.openai.com/docs/api-reference/chat>`_ API,
1 change: 1 addition & 0 deletions examples/offline_inference_vision_language_embedding.py
@@ -7,6 +7,7 @@
# Create an LLM.
llm = LLM(
model="TIGER-Lab/VLM2Vec-Full",
task="embed",
trust_remote_code=True,
max_model_len=4096,
max_num_seqs=2,
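The hunk only shows the constructor change; for context, a hedged sketch of how the rest of this example consumes the model, assuming the embedding path goes through LLM.encode and that each EmbeddingRequestOutput exposes outputs.embedding (names from the existing 0.6-era API, not introduced by this PR):

# Sketch only: `image` stands in for the PIL image loaded earlier in the
# example, and the prompt follows the VLM2Vec "<|image_1|> ..." convention.
outputs = llm.encode({
    "prompt": "<|image_1|> Represent the given image.",
    "multi_modal_data": {"image": image},
})

for output in outputs:
    embedding = output.outputs.embedding  # list of floats
    print(f"embedding dim: {len(embedding)}")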
1 change: 1 addition & 0 deletions examples/offline_inference_vision_language_multi_image.py
@@ -87,6 +87,7 @@ def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData:
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
llm = LLM(
model="microsoft/Phi-3.5-vision-instruct",
task="generate",
trust_remote_code=True,
max_model_len=4096,
max_num_seqs=2,
4 changes: 2 additions & 2 deletions examples/openai_api_client_for_multimodal.py
@@ -7,8 +7,8 @@
vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja

(multi-image inference with Phi-3.5-vision-instruct)
vllm serve microsoft/Phi-3.5-vision-instruct --max-model-len 4096 \
--trust-remote-code --limit-mm-per-prompt image=2
vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
--trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2

(audio inference with Ultravox)
vllm serve fixie-ai/ultravox-v0_3 --max-model-len 4096
4 changes: 3 additions & 1 deletion tests/conftest.py
@@ -25,7 +25,7 @@
from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.config import TokenizerPoolConfig
from vllm.config import TaskOption, TokenizerPoolConfig
from vllm.connections import global_http_connection
from vllm.distributed import (destroy_distributed_environment,
destroy_model_parallel,
@@ -619,6 +619,7 @@ class VllmRunner:
def __init__(
self,
model_name: str,
task: TaskOption = "auto",
tokenizer_name: Optional[str] = None,
# Use smaller max model length, otherwise bigger model cannot run due
# to kv cache size limit.
@@ -634,6 +635,7 @@ def __init__(
) -> None:
self.model = LLM(
model=model_name,
task=task,
tokenizer=tokenizer_name,
trust_remote_code=True,
dtype=dtype,
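With the new parameter, individual tests can pin the task instead of relying on "auto" inference. A hedged sketch of how a caller might use the updated runner (the context-manager style and the encode helper mirror existing tests in this suite; the model and lengths are illustrative):

# Illustrative only: pin the task explicitly instead of relying on "auto".
with VllmRunner("TIGER-Lab/VLM2Vec-Full",
                task="embed",
                max_model_len=4096,
                dtype="half") as vllm_model:
    # encode() is the runner's embedding helper used by existing tests.
    embeddings = vllm_model.encode(["a photo of a cat"])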
13 changes: 12 additions & 1 deletion tests/core/test_chunked_prefill_scheduler.py
@@ -42,6 +42,7 @@ def test_simple(use_v2_block_manager: bool):
max_model_len = 16
max_num_batched_tokens = 64
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens,
num_seq_group,
max_model_len,
@@ -89,6 +90,7 @@ def test_chunk(use_v2_block_manager: bool):
max_model_len = 80
max_num_batched_tokens = 64
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens,
max_seqs,
max_model_len,
@@ -138,6 +140,7 @@ def test_complex(use_v2_block_manager: bool):
max_model_len = 80
max_num_batched_tokens = 64
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens,
max_seqs,
max_model_len,
@@ -209,6 +212,7 @@ def test_maximal_decoding(use_v2_block_manager: bool):
max_model_len = 8
max_num_batched_tokens = 2
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens,
max_seqs,
max_model_len,
@@ -303,6 +307,7 @@ def test_prompt_limit(use_v2_block_manager: bool):
max_model_len = 64
max_num_batched_tokens = 32
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens,
max_seqs,
max_model_len,
@@ -336,7 +341,8 @@ def test_prompt_limit_exceed(use_v2_block_manager: bool):
max_seqs = 64
max_model_len = 32
max_num_batched_tokens = 64
scheduler_config = SchedulerConfig(max_num_batched_tokens,
scheduler_config = SchedulerConfig("generate",
max_num_batched_tokens,
max_seqs,
max_model_len,
enable_chunked_prefill=True)
@@ -364,6 +370,7 @@ def test_swap(use_v2_block_manager: bool):
max_model_len = 200
max_num_batched_tokens = 30
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens,
max_seqs,
max_model_len,
@@ -421,6 +428,7 @@ def test_running_prefill_prioritized_over_swap(use_v2_block_manager: bool):
max_model_len = 200
max_num_batched_tokens = 30
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens,
max_seqs,
max_model_len,
@@ -516,6 +524,7 @@ def test_chunked_prefill_preempt(use_v2_block_manager: bool):
max_model_len = 200
max_num_batched_tokens = 30
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens,
max_seqs,
max_model_len,
@@ -582,6 +591,7 @@ def test_chunked_prefill_max_seqs(use_v2_block_manager: bool):
max_model_len = 80
max_num_batched_tokens = 64
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens,
max_seqs,
max_model_len,
@@ -637,6 +647,7 @@ def test_perfix_caching(use_v2_block_manager: bool):
max_model_len = 80
max_num_batched_tokens = 64
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens,
max_seqs,
max_model_len,
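Every call site in this file now passes the task as the first positional argument, so constructing a SchedulerConfig without it no longer matches these tests. A hedged sketch of the updated call shape (keyword names follow the later tests in this PR; the values are illustrative):

from vllm.config import SchedulerConfig

# Shape implied by the updated call sites in this file; values are illustrative.
scheduler_config = SchedulerConfig(
    "generate",                  # task is now the leading positional argument
    max_num_batched_tokens=64,
    max_num_seqs=8,
    max_model_len=80,
    enable_chunked_prefill=True,
)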
71 changes: 48 additions & 23 deletions tests/core/test_scheduler.py
@@ -28,7 +28,12 @@ def check_deprecated_block_manager():
def test_scheduler_add_seq_group(use_v2_block_manager: bool):
block_size = 4
scheduler_config = SchedulerConfig(
100, 64, 1, use_v2_block_manager=use_v2_block_manager)
"generate",
max_num_batched_tokens=100,
max_num_seqs=64,
max_model_len=1,
use_v2_block_manager=use_v2_block_manager,
)
cache_config = CacheConfig(block_size, 1.0, 1, cache_dtype="auto")
cache_config.num_cpu_blocks = 4
cache_config.num_gpu_blocks = 4
@@ -48,7 +53,12 @@ def test_scheduler_abort_seq_group(use_v2_block_manager: bool):
def test_scheduler_abort_seq_group(use_v2_block_manager: bool):
block_size = 4
scheduler_config = SchedulerConfig(
100, 64, 1, use_v2_block_manager=use_v2_block_manager)
"generate",
max_num_batched_tokens=100,
max_num_seqs=64,
max_model_len=1,
use_v2_block_manager=use_v2_block_manager,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 4
cache_config.num_gpu_blocks = 4
Expand All @@ -74,10 +84,12 @@ def test_scheduler_schedule_simple(use_v2_block_manager: bool):
num_seq_group = 4
max_model_len = 16
scheduler_config = SchedulerConfig(
64,
num_seq_group,
max_model_len,
use_v2_block_manager=use_v2_block_manager)
"generate",
max_num_batched_tokens=64,
max_num_seqs=num_seq_group,
max_model_len=max_model_len,
use_v2_block_manager=use_v2_block_manager,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8
cache_config.num_gpu_blocks = 8
@@ -119,10 +131,12 @@ def test_scheduler_prefill_prioritized(use_v2_block_manager: bool):
max_model_len = 30
max_batched_num_tokens = 30
scheduler_config = SchedulerConfig(
max_batched_num_tokens,
2,
max_model_len,
use_v2_block_manager=use_v2_block_manager)
"generate",
max_num_batched_tokens=max_batched_num_tokens,
max_num_seqs=2,
max_model_len=max_model_len,
use_v2_block_manager=use_v2_block_manager,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 16
cache_config.num_gpu_blocks = 16
@@ -151,7 +165,12 @@ def test_scheduler_schedule_preempt_abort(use_v2_block_manager: bool):
block_size = 4
max_model_len = 16
scheduler_config = SchedulerConfig(
64, 2, max_model_len, use_v2_block_manager=use_v2_block_manager)
"generate",
max_num_batched_tokens=64,
max_num_seqs=2,
max_model_len=max_model_len,
use_v2_block_manager=use_v2_block_manager,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 2
cache_config.num_gpu_blocks = 2
@@ -208,10 +227,12 @@ def test_scheduler_max_seqs(use_v2_block_manager: bool):
max_seq_group = 2
max_model_len = 16
scheduler_config = SchedulerConfig(
64,
max_seq_group,
max_model_len,
use_v2_block_manager=use_v2_block_manager)
"generate",
max_num_batched_tokens=64,
max_num_seqs=max_seq_group,
max_model_len=max_model_len,
use_v2_block_manager=use_v2_block_manager,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8
cache_config.num_gpu_blocks = 8
@@ -253,11 +274,13 @@ def test_scheduler_max_seqs(use_v2_block_manager: bool):
def test_scheduler_delay_factor(use_v2_block_manager: bool):
block_size = 4
scheduler_config = SchedulerConfig(
100,
64,
16,
"generate",
max_num_batched_tokens=100,
max_num_seqs=64,
max_model_len=16,
delay_factor=0.5,
use_v2_block_manager=use_v2_block_manager)
use_v2_block_manager=use_v2_block_manager,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8
cache_config.num_gpu_blocks = 8
@@ -358,10 +381,12 @@ def initialize_scheduler(
):
block_size = block_size
scheduler_config = SchedulerConfig(
max_token_budget,
max_num_seqs,
max_model_len,
use_v2_block_manager=use_v2_block_manager)
"generate",
max_num_batched_tokens=max_token_budget,
max_num_seqs=max_num_seqs,
max_model_len=max_model_len,
use_v2_block_manager=use_v2_block_manager,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = num_cpu_blocks
cache_config.num_gpu_blocks = num_gpu_blocks
7 changes: 6 additions & 1 deletion tests/core/test_scheduler_encoder_decoder.py
@@ -36,7 +36,12 @@ def test_scheduler_schedule_simple_encoder_decoder():
block_size = 4
num_seq_group = 4
max_model_len = 16
scheduler_config = SchedulerConfig(64, num_seq_group, max_model_len)
scheduler_config = SchedulerConfig(
task="generate",
max_num_batched_tokens=64,
max_num_seqs=num_seq_group,
max_model_len=max_model_len,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 16 # enc and dec prompts per seq_group
cache_config.num_gpu_blocks = 16 # enc and dec prompts per seq_group
1 change: 1 addition & 0 deletions tests/entrypoints/llm/test_generate.py
@@ -165,6 +165,7 @@ def test_multi_chat():
def test_chat_multi_image(image_urls: List[str]):
llm = LLM(
model="microsoft/Phi-3.5-vision-instruct",
task="generate",
dtype="bfloat16",
max_model_len=4096,
max_num_seqs=5,
2 changes: 1 addition & 1 deletion tests/entrypoints/openai/test_serving_chat.py
@@ -22,12 +22,12 @@ class MockHFConfig:

@dataclass
class MockModelConfig:
task = "generate"
tokenizer = MODEL_NAME
trust_remote_code = False
tokenizer_mode = "auto"
max_model_len = 100
tokenizer_revision = None
embedding_mode = False
multimodal_config = MultiModalConfig()
hf_config = MockHFConfig()

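Dropping embedding_mode = False from the mock while adding task = "generate" suggests the serving layer now derives the mode from the task string rather than a dedicated flag. A hedged sketch of that kind of check (the helper name is hypothetical, not vLLM API):

def runs_embedding(model_config) -> bool:
    # Hypothetical helper, not vLLM API: with this change the task string on
    # the model config replaces the old embedding_mode flag as the switch.
    return model_config.task == "embed"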
2 changes: 2 additions & 0 deletions tests/entrypoints/openai/test_vision.py
@@ -23,6 +23,8 @@
@pytest.fixture(scope="module")
def server():
args = [
"--task",
"generate",
"--dtype",
"bfloat16",
"--max-model-len",
3 changes: 2 additions & 1 deletion tests/entrypoints/test_chat_utils.py
@@ -18,7 +18,8 @@
@pytest.fixture(scope="module")
def phi3v_model_config():
return ModelConfig(PHI3V_MODEL_ID,
PHI3V_MODEL_ID,
task="generate",
tokenizer=PHI3V_MODEL_ID,
tokenizer_mode="auto",
trust_remote_code=True,
dtype="bfloat16",
5 changes: 3 additions & 2 deletions tests/lora/test_worker.py
@@ -15,7 +15,8 @@ def test_worker_apply_lora(sql_lora_files):
worker = Worker(
model_config=ModelConfig(
"meta-llama/Llama-2-7b-hf",
"meta-llama/Llama-2-7b-hf",
task="auto",
tokenizer="meta-llama/Llama-2-7b-hf",
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
@@ -27,7 +28,7 @@ def test_worker_apply_lora(sql_lora_files):
load_format="dummy",
),
parallel_config=ParallelConfig(1, 1, False),
scheduler_config=SchedulerConfig(32, 32, 32),
scheduler_config=SchedulerConfig("generate", 32, 32, 32),
device_config=DeviceConfig("cuda"),
cache_config=CacheConfig(block_size=16,
gpu_memory_utilization=1.,