diff --git a/README.md b/README.md
index d9cc6d26119c1..c6e6a3c7379db 100644
--- a/README.md
+++ b/README.md
@@ -65,7 +65,7 @@ vLLM seamlessly supports many Hugging Face models, including the following archi
 - Mixtral (`mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, etc.)
 - MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.)
 - OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.)
-- Phi-1.5 (`microsoft/phi-1_5`, etc.)
+- Phi (`microsoft/phi-1_5`, `microsoft/phi-2`, etc.)
 - Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.)
 - Yi (`01-ai/Yi-6B`, `01-ai/Yi-34B`, etc.)
diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
index 44e4fe5ead988..c95b158e871fe 100644
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -60,8 +60,8 @@ Alongside each architecture, we include some popular models that use it.
     - OPT, OPT-IML
     - :code:`facebook/opt-66b`, :code:`facebook/opt-iml-max-30b`, etc.
   * - :code:`PhiForCausalLM`
-    - Phi-1.5
-    - :code:`microsoft/phi-1_5`, etc.
+    - Phi
+    - :code:`microsoft/phi-1_5`, :code:`microsoft/phi-2`, etc.
   * - :code:`QWenLMHeadModel`
     - Qwen
     - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc.
diff --git a/requirements.txt b/requirements.txt
index 81d5839487f3f..b441783a5dfcc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,7 +7,7 @@ sentencepiece  # Required for LLaMA tokenizer.
 numpy
 torch == 2.1.2
 transformers >= 4.36.0  # Required for Mixtral.
-xformers == 0.0.23  # Required for CUDA 12.1.
+xformers == 0.0.23.post1  # Required for CUDA 12.1.
 fastapi
 uvicorn[standard]
 pydantic == 1.10.13  # Required for OpenAI server.
diff --git a/tests/models/test_models.py b/tests/models/test_models.py
index 95eabaafec811..e65c424c601a2 100644
--- a/tests/models/test_models.py
+++ b/tests/models/test_models.py
@@ -15,12 +15,12 @@
     "EleutherAI/pythia-70m",
     "bigscience/bloom-560m",
     "mosaicml/mpt-7b",
-    "microsoft/phi-1_5",
+    "microsoft/phi-2",
 ]


 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("dtype", ["float"])
 @pytest.mark.parametrize("max_tokens", [128])
 def test_models(
     hf_runner,
diff --git a/vllm/engine/async_llava_engine.py b/vllm/engine/async_llava_engine.py
index 2ebbd0380b169..a2b7c1d5b3e31 100644
--- a/vllm/engine/async_llava_engine.py
+++ b/vllm/engine/async_llava_engine.py
@@ -93,12 +93,13 @@ async def add_request(
         return stream

     async def generate(
-            self,
-            prompt: Optional[str],
-            sampling_params: SamplingParams,
-            request_id: str,
-            prompt_token_ids: Optional[List[int]] = None,
-            images: Optional[List[Image.Image]] = None) -> AsyncIterator[RequestOutput]:
+        self,
+        prompt: Optional[str],
+        sampling_params: SamplingParams,
+        request_id: str,
+        prompt_token_ids: Optional[List[int]] = None,
+        images: Optional[List[Image.Image]] = None
+    ) -> AsyncIterator[RequestOutput]:
         """Generate outputs for a request.

         Generate outputs for a request. This method is a coroutine.
diff --git a/vllm/engine/llava_engine.py b/vllm/engine/llava_engine.py
index 8fc9cd00cd4f1..032583f96e9ca 100644
--- a/vllm/engine/llava_engine.py
+++ b/vllm/engine/llava_engine.py
@@ -66,7 +66,7 @@ def add_request(
             execute_model_methord = partial(worker.execute_method.remote,
                                             'execute_model_methord')
         else:
-            execute_model_methord = getattr(worker, 'execute_model_methord')
+            execute_model_methord = worker.execute_model_methord
         outputs = execute_model_methord('prepare_promt', prompt_token_ids,
                                         pixel_values)
         if self.parallel_config.worker_use_ray:
diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py
index 42a3e1ed3957b..708a25454ff81 100644
--- a/vllm/model_executor/models/__init__.py
+++ b/vllm/model_executor/models/__init__.py
@@ -24,7 +24,8 @@
     "GPTNeoXForCausalLM": ("gpt_neox", "GPTNeoXForCausalLM"),
     "InternLMForCausalLM": ("internlm", "InternLMForCausalLM"),
     "LlamaForCausalLM": ("llama", "LlamaForCausalLM"),
-    "LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"),
+    "LlavaForConditionalGeneration":
+    ("llava", "LlavaForConditionalGeneration"),
     # For decapoda-research/llama-*
     "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"),
     "MistralForCausalLM": ("mistral", "MistralForCausalLM"),
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index bdf26a1e21c5b..b40ef6617761a 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -98,7 +98,7 @@ def profile_num_available_blocks(
         # profiled peak memory.
         torch.cuda.synchronize()
         free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
-        # peak_memory = total_gpu_memory - free_gpu_memory #
+        # if the GPU memory is consumed by others before, the peak_memory is the total consumed memory, that is not what we want.
         peak_memory = torch.cuda.max_memory_allocated()

         cache_block_size = CacheEngine.get_cache_block_size(
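For context on the `vllm/worker/worker.py` hunk: the patch stops deriving peak usage from `total_gpu_memory - free_gpu_memory` (which also counts memory held by other processes on the same GPU) and reads `torch.cuda.max_memory_allocated()` instead, which only tracks this process's PyTorch allocations. The sketch below is not part of the patch; it is a minimal, hypothetical illustration of the two measurements on a CUDA-capable machine, with an invented helper name and a stand-in workload.

```python
# Hypothetical sketch (not from the patch): contrast the two ways of measuring
# "peak memory" that the worker.py change above is concerned with.
import torch


def profile_peak_memory() -> int:
    """Return this process's peak allocated GPU memory during a dummy workload."""
    torch.cuda.reset_peak_memory_stats()

    # Stand-in for the real profiling forward pass.
    x = torch.empty(1024, 1024, device="cuda")
    y = x @ x

    torch.cuda.synchronize()

    # Device-wide view: total minus free also includes memory held by other
    # processes on the same GPU, so it can overestimate this worker's footprint.
    free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
    peak_memory_device_wide = total_gpu_memory - free_gpu_memory

    # Process-local view: only memory allocated through this process's PyTorch
    # caching allocator, which is what the patch uses for its estimate.
    peak_memory_local = torch.cuda.max_memory_allocated()

    del x, y
    return peak_memory_local
```

Under this reading, basing the cache-block estimate on `max_memory_allocated()` ties it to what the worker itself used during profiling rather than to whatever else happens to occupy the GPU, which is the point made by the comment the patch adds.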