
Commit

Merge remote-tracking branch 'origin/HEAD' into llava_devel
AzureSilent committed Dec 17, 2023
2 parents (3af7b34 + f8c688d), commit 3a38396
Showing 8 changed files with 17 additions and 15 deletions.
README.md: 2 changes (1 addition, 1 deletion)
@@ -65,7 +65,7 @@ vLLM seamlessly supports many Hugging Face models, including the following archi
 - Mixtral (`mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, etc.)
 - MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.)
 - OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.)
-- Phi-1.5 (`microsoft/phi-1_5`, etc.)
+- Phi (`microsoft/phi-1_5`, `microsoft/phi-2`, etc.)
 - Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.)
 - Yi (`01-ai/Yi-6B`, `01-ai/Yi-34B`, etc.)

docs/source/models/supported_models.rst: 4 changes (2 additions, 2 deletions)
@@ -60,8 +60,8 @@ Alongside each architecture, we include some popular models that use it.
     - OPT, OPT-IML
     - :code:`facebook/opt-66b`, :code:`facebook/opt-iml-max-30b`, etc.
   * - :code:`PhiForCausalLM`
-    - Phi-1.5
-    - :code:`microsoft/phi-1_5`, etc.
+    - Phi
+    - :code:`microsoft/phi-1_5`, :code:`microsoft/phi-2`, etc.
   * - :code:`QWenLMHeadModel`
     - Qwen
     - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc.
requirements.txt: 2 changes (1 addition, 1 deletion)
@@ -7,7 +7,7 @@ sentencepiece # Required for LLaMA tokenizer.
 numpy
 torch == 2.1.2
 transformers >= 4.36.0 # Required for Mixtral.
-xformers == 0.0.23 # Required for CUDA 12.1.
+xformers == 0.0.23.post1 # Required for CUDA 12.1.
 fastapi
 uvicorn[standard]
 pydantic == 1.10.13 # Required for OpenAI server.
tests/models/test_models.py: 4 changes (2 additions, 2 deletions)
@@ -15,12 +15,12 @@
     "EleutherAI/pythia-70m",
     "bigscience/bloom-560m",
     "mosaicml/mpt-7b",
-    "microsoft/phi-1_5",
+    "microsoft/phi-2",
 ]


 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("dtype", ["float"])
 @pytest.mark.parametrize("max_tokens", [128])
 def test_models(
     hf_runner,
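The stacked parametrize decorators form a Cartesian product, so this hunk changes both which checkpoints are exercised and the dtype every case runs in. A minimal, hypothetical sketch of that expansion (the test body below is a placeholder, not the real test_models):

# Hypothetical illustration of how the decorators above expand; pytest runs
# the test once per (max_tokens, dtype, model) combination.
import pytest

MODELS = [
    "EleutherAI/pythia-70m",
    "microsoft/phi-2",
]


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [128])
def test_parametrize_expansion(model: str, dtype: str, max_tokens: int) -> None:
    # Two models x one dtype x one max_tokens value = two collected cases.
    assert model in MODELS
    assert dtype == "float"
    assert max_tokens == 128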
vllm/engine/async_llava_engine.py: 13 changes (7 additions, 6 deletions)
@@ -93,12 +93,13 @@ async def add_request(
         return stream

     async def generate(
-        self,
-        prompt: Optional[str],
-        sampling_params: SamplingParams,
-        request_id: str,
-        prompt_token_ids: Optional[List[int]] = None,
-        images: Optional[List[Image.Image]] = None) -> AsyncIterator[RequestOutput]:
+        self,
+        prompt: Optional[str],
+        sampling_params: SamplingParams,
+        request_id: str,
+        prompt_token_ids: Optional[List[int]] = None,
+        images: Optional[List[Image.Image]] = None
+    ) -> AsyncIterator[RequestOutput]:
         """Generate outputs for a request.
         Generate outputs for a request. This method is a coroutine. It adds the
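For readability only, a sketch of how the reformatted coroutine might be consumed; the engine instance, prompt template, and sampling values are assumptions, and only the keyword arguments come from the signature in the hunk above:

# Hypothetical caller for the generate() coroutine shown above. The engine
# object, prompt template, and sampling values are assumptions; only the
# keyword arguments mirror the signature in the diff.
from PIL import Image

from vllm import SamplingParams


async def stream_caption(engine, image_path: str, request_id: str) -> str:
    image = Image.open(image_path)
    params = SamplingParams(temperature=0.2, max_tokens=64)
    text = ""
    # Each yielded RequestOutput carries the generation produced so far,
    # so the last value seen is the finished completion.
    async for request_output in engine.generate(
            prompt="USER: <image>\nDescribe this picture. ASSISTANT:",
            sampling_params=params,
            request_id=request_id,
            images=[image]):
        text = request_output.outputs[0].text
    return text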
vllm/engine/llava_engine.py: 2 changes (1 addition, 1 deletion)
@@ -66,7 +66,7 @@ def add_request(
             execute_model_methord = partial(worker.execute_method.remote,
                                             'execute_model_methord')
         else:
-            execute_model_methord = getattr(worker, 'execute_model_methord')
+            execute_model_methord = worker.execute_model_methord
         outputs = execute_model_methord('prepare_promt', prompt_token_ids,
                                         pixel_values)
         if self.parallel_config.worker_use_ray:
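The hunk above swaps getattr(worker, 'execute_model_methord') for plain attribute access; both branches then expose the same callable at the call site. A standalone sketch of that dispatch pattern, using a stand-in worker class rather than vLLM's:

# Stand-in sketch of the dispatch pattern in the hunk above: choose a callable
# that either routes through a Ray actor's execute_method or calls the local
# worker directly. LocalWorker is illustrative, not vLLM's Worker class.
from functools import partial


class LocalWorker:
    def execute_model_methord(self, method_name: str, *args):
        return f"{method_name} called with {len(args)} positional args"


def make_executor(worker, worker_use_ray: bool):
    if worker_use_ray:
        # Ray actor handle: fix the method name so both branches present the
        # same call signature to the caller.
        return partial(worker.execute_method.remote, 'execute_model_methord')
    # Direct attribute access; getattr() with a constant string adds nothing.
    return worker.execute_model_methord


executor = make_executor(LocalWorker(), worker_use_ray=False)
print(executor('prepare_promt', [1, 2, 3], None))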
vllm/model_executor/models/__init__.py: 3 changes (2 additions, 1 deletion)
@@ -24,7 +24,8 @@
     "GPTNeoXForCausalLM": ("gpt_neox", "GPTNeoXForCausalLM"),
     "InternLMForCausalLM": ("internlm", "InternLMForCausalLM"),
     "LlamaForCausalLM": ("llama", "LlamaForCausalLM"),
-    "LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"),
+    "LlavaForConditionalGeneration":
+    ("llava", "LlavaForConditionalGeneration"),
     # For decapoda-research/llama-*
     "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"),
     "MistralForCausalLM": ("mistral", "MistralForCausalLM"),
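The entry being re-wrapped above lives in a registry that maps an architecture name to a (module, class) pair. A hedged sketch of how such a lazy mapping is typically resolved; the loader function and the exact package path are assumptions, not vLLM's actual loader:

# Illustrative lazy resolution for a registry shaped like the one above:
# the class is imported only when its architecture is requested.
import importlib
from typing import Optional, Type

_MODELS = {
    "LlamaForCausalLM": ("llama", "LlamaForCausalLM"),
    "LlavaForConditionalGeneration":
    ("llava", "LlavaForConditionalGeneration"),
}


def load_model_class(architecture: str) -> Optional[Type]:
    if architecture not in _MODELS:
        return None
    module_name, class_name = _MODELS[architecture]
    # Assumed location of the per-model modules.
    module = importlib.import_module(
        f"vllm.model_executor.models.{module_name}")
    return getattr(module, class_name)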
vllm/worker/worker.py: 2 changes (1 addition, 1 deletion)
@@ -98,7 +98,7 @@ def profile_num_available_blocks(
         # profiled peak memory.
         torch.cuda.synchronize()
         free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
-        # peak_memory = total_gpu_memory - free_gpu_memory #
+        # peak_memory = total_gpu_memory - free_gpu_memory #
         # if the GPU memory is consumed by others before, the peak_memory is the total consumed memory, that is not what we want.
         peak_memory = torch.cuda.max_memory_allocated()
         cache_block_size = CacheEngine.get_cache_block_size(
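The comment kept in the hunk above explains the choice: total minus free counts memory held by other processes on the GPU, while torch.cuda.max_memory_allocated() reflects only this process's profiled peak. A rough sketch contrasting the two estimates; the block-count formula is illustrative, not the exact vLLM computation:

# Contrast of the two peak-memory estimates discussed in the hunk above.
# naive_peak is shown only for comparison; the block-count arithmetic is an
# approximation for the sketch.
import torch


def estimate_num_gpu_blocks(cache_block_size: int,
                            gpu_memory_utilization: float = 0.9) -> int:
    torch.cuda.synchronize()
    free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()

    # Over-counts if other processes already hold GPU memory.
    naive_peak = total_gpu_memory - free_gpu_memory  # noqa: F841

    # Only memory allocated by this process's tensors during profiling.
    peak_memory = torch.cuda.max_memory_allocated()

    usable = total_gpu_memory * gpu_memory_utilization - peak_memory
    return max(int(usable // cache_block_size), 0)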
