
Commit

Merge remote-tracking branch 'origin/HEAD' into llava_devel
AzureSilent committed Dec 17, 2023
2 parents (3af7b34 + f8c688d), commit 3a38396
Showing 8 changed files with 17 additions and 15 deletions.
README.md: 2 changes (1 addition, 1 deletion)
@@ -65,7 +65,7 @@ vLLM seamlessly supports many Hugging Face models, including the following archi
 - Mixtral (`mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, etc.)
 - MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.)
 - OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.)
-- Phi-1.5 (`microsoft/phi-1_5`, etc.)
+- Phi (`microsoft/phi-1_5`, `microsoft/phi-2`, etc.)
 - Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.)
 - Yi (`01-ai/Yi-6B`, `01-ai/Yi-34B`, etc.)

docs/source/models/supported_models.rst: 4 changes (2 additions, 2 deletions)
@@ -60,8 +60,8 @@ Alongside each architecture, we include some popular models that use it.
     - OPT, OPT-IML
     - :code:`facebook/opt-66b`, :code:`facebook/opt-iml-max-30b`, etc.
   * - :code:`PhiForCausalLM`
-    - Phi-1.5
-    - :code:`microsoft/phi-1_5`, etc.
+    - Phi
+    - :code:`microsoft/phi-1_5`, :code:`microsoft/phi-2`, etc.
   * - :code:`QWenLMHeadModel`
     - Qwen
     - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc.
requirements.txt: 2 changes (1 addition, 1 deletion)
@@ -7,7 +7,7 @@ sentencepiece # Required for LLaMA tokenizer.
 numpy
 torch == 2.1.2
 transformers >= 4.36.0 # Required for Mixtral.
-xformers == 0.0.23 # Required for CUDA 12.1.
+xformers == 0.0.23.post1 # Required for CUDA 12.1.
 fastapi
 uvicorn[standard]
 pydantic == 1.10.13 # Required for OpenAI server.
tests/models/test_models.py: 4 changes (2 additions, 2 deletions)
@@ -15,12 +15,12 @@
     "EleutherAI/pythia-70m",
     "bigscience/bloom-560m",
     "mosaicml/mpt-7b",
-    "microsoft/phi-1_5",
+    "microsoft/phi-2",
 ]


 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("dtype", ["float"])
 @pytest.mark.parametrize("max_tokens", [128])
 def test_models(
     hf_runner,
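The stacked parametrize decorators form a Cartesian product, so this hunk changes both which checkpoints are exercised and the dtype every case runs in. A minimal, hypothetical sketch of that expansion (the test body below is a placeholder, not the real test_models):

# Hypothetical illustration of how the decorators above expand; pytest runs
# the test once per (max_tokens, dtype, model) combination.
import pytest

MODELS = [
    "EleutherAI/pythia-70m",
    "microsoft/phi-2",
]


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("max_tokens", [128])
def test_parametrize_expansion(model: str, dtype: str, max_tokens: int) -> None:
    # Two models x one dtype x one max_tokens value = two collected cases.
    assert model in MODELS
    assert dtype == "float"
    assert max_tokens == 128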
vllm/engine/async_llava_engine.py: 13 changes (7 additions, 6 deletions)
@@ -93,12 +93,13 @@ async def add_request(
         return stream

     async def generate(
-        self,
-        prompt: Optional[str],
-        sampling_params: SamplingParams,
-        request_id: str,
-        prompt_token_ids: Optional[List[int]] = None,
-        images: Optional[List[Image.Image]] = None) -> AsyncIterator[RequestOutput]:
+        self,
+        prompt: Optional[str],
+        sampling_params: SamplingParams,
+        request_id: str,
+        prompt_token_ids: Optional[List[int]] = None,
+        images: Optional[List[Image.Image]] = None
+    ) -> AsyncIterator[RequestOutput]:
         """Generate outputs for a request.
         Generate outputs for a request. This method is a coroutine. It adds the
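For readability only, a sketch of how the reformatted coroutine might be consumed; the engine instance, prompt template, and sampling values are assumptions, and only the keyword arguments come from the signature in the hunk above:

# Hypothetical caller for the generate() coroutine shown above. The engine
# object, prompt template, and sampling values are assumptions; only the
# keyword arguments mirror the signature in the diff.
from PIL import Image

from vllm import SamplingParams


async def stream_caption(engine, image_path: str, request_id: str) -> str:
    image = Image.open(image_path)
    params = SamplingParams(temperature=0.2, max_tokens=64)
    text = ""
    # Each yielded RequestOutput carries the generation produced so far,
    # so the last value seen is the finished completion.
    async for request_output in engine.generate(
            prompt="USER: <image>\nDescribe this picture. ASSISTANT:",
            sampling_params=params,
            request_id=request_id,
            images=[image]):
        text = request_output.outputs[0].text
    return text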
vllm/engine/llava_engine.py: 2 changes (1 addition, 1 deletion)
@@ -66,7 +66,7 @@ def add_request(
             execute_model_methord = partial(worker.execute_method.remote,
                                             'execute_model_methord')
         else:
-            execute_model_methord = getattr(worker, 'execute_model_methord')
+            execute_model_methord = worker.execute_model_methord
         outputs = execute_model_methord('prepare_promt', prompt_token_ids,
                                         pixel_values)
         if self.parallel_config.worker_use_ray:
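The hunk above swaps getattr(worker, 'execute_model_methord') for plain attribute access; both branches then expose the same callable at the call site. A standalone sketch of that dispatch pattern, using a stand-in worker class rather than vLLM's:

# Stand-in sketch of the dispatch pattern in the hunk above: choose a callable
# that either routes through a Ray actor's execute_method or calls the local
# worker directly. LocalWorker is illustrative, not vLLM's Worker class.
from functools import partial


class LocalWorker:
    def execute_model_methord(self, method_name: str, *args):
        return f"{method_name} called with {len(args)} positional args"


def make_executor(worker, worker_use_ray: bool):
    if worker_use_ray:
        # Ray actor handle: fix the method name so both branches present the
        # same call signature to the caller.
        return partial(worker.execute_method.remote, 'execute_model_methord')
    # Direct attribute access; getattr() with a constant string adds nothing.
    return worker.execute_model_methord


executor = make_executor(LocalWorker(), worker_use_ray=False)
print(executor('prepare_promt', [1, 2, 3], None))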
vllm/model_executor/models/__init__.py: 3 changes (2 additions, 1 deletion)
@@ -24,7 +24,8 @@
     "GPTNeoXForCausalLM": ("gpt_neox", "GPTNeoXForCausalLM"),
     "InternLMForCausalLM": ("internlm", "InternLMForCausalLM"),
     "LlamaForCausalLM": ("llama", "LlamaForCausalLM"),
-    "LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"),
+    "LlavaForConditionalGeneration":
+    ("llava", "LlavaForConditionalGeneration"),
     # For decapoda-research/llama-*
     "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"),
     "MistralForCausalLM": ("mistral", "MistralForCausalLM"),
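The entry being re-wrapped above lives in a registry that maps an architecture name to a (module, class) pair. A hedged sketch of how such a lazy mapping is typically resolved; the loader function and the exact package path are assumptions, not vLLM's actual loader:

# Illustrative lazy resolution for a registry shaped like the one above:
# the class is imported only when its architecture is requested.
import importlib
from typing import Optional, Type

_MODELS = {
    "LlamaForCausalLM": ("llama", "LlamaForCausalLM"),
    "LlavaForConditionalGeneration":
    ("llava", "LlavaForConditionalGeneration"),
}


def load_model_class(architecture: str) -> Optional[Type]:
    if architecture not in _MODELS:
        return None
    module_name, class_name = _MODELS[architecture]
    # Assumed location of the per-model modules.
    module = importlib.import_module(
        f"vllm.model_executor.models.{module_name}")
    return getattr(module, class_name)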
vllm/worker/worker.py: 2 changes (1 addition, 1 deletion)
@@ -98,7 +98,7 @@ def profile_num_available_blocks(
         # profiled peak memory.
         torch.cuda.synchronize()
         free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
-        # peak_memory = total_gpu_memory - free_gpu_memory #
+        # peak_memory = total_gpu_memory - free_gpu_memory #
         # if the GPU memory is consumed by others before, the peak_memory is the total consumed memory, that is not what we want.
         peak_memory = torch.cuda.max_memory_allocated()
         cache_block_size = CacheEngine.get_cache_block_size(
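The comment kept in the hunk above explains the choice: total minus free counts memory held by other processes on the GPU, while torch.cuda.max_memory_allocated() reflects only this process's profiled peak. A rough sketch contrasting the two estimates; the block-count formula is illustrative, not the exact vLLM computation:

# Contrast of the two peak-memory estimates discussed in the hunk above.
# naive_peak is shown only for comparison; the block-count arithmetic is an
# approximation for the sketch.
import torch


def estimate_num_gpu_blocks(cache_block_size: int,
                            gpu_memory_utilization: float = 0.9) -> int:
    torch.cuda.synchronize()
    free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()

    # Over-counts if other processes already hold GPU memory.
    naive_peak = total_gpu_memory - free_gpu_memory  # noqa: F841

    # Only memory allocated by this process's tensors during profiling.
    peak_memory = torch.cuda.max_memory_allocated()

    usable = total_gpu_memory * gpu_memory_utilization - peak_memory
    return max(int(usable // cache_block_size), 0)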
