Your current environment
vllm@cf069aa
🐛 Describe the bug
Running models using the transformers fallback fails if vllm_config.model_config.hf_config does not contain head_dim. For example, using Qwen/Qwen2.5-0.5B-Instruct:
from vllm import LLM, SamplingParams

prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct", model_impl="transformers")
outputs = llm.generate(prompts, sampling_params)

# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
Error:
[rank0]: Traceback (most recent call last):
[rank0]: File "/scratch/test_qwen.py", line 20, in <module>
[rank0]: llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct", enable_prefix_caching=False, compilation_config=3, model_impl="transformers")
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/scratch/vllm/vllm/utils.py", line 1045, in inner
[rank0]: return fn(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^
[rank0]: File "/scratch/vllm/vllm/entrypoints/llm.py", line 243, in __init__
[rank0]: self.llm_engine = self.engine_class.from_engine_args(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/scratch/vllm/vllm/engine/llm_engine.py", line 494, in from_engine_args
[rank0]: engine = cls(
[rank0]: ^^^^
[rank0]: File "/scratch/vllm/vllm/engine/llm_engine.py", line 274, in __init__
[rank0]: self.model_executor = executor_class(vllm_config=vllm_config, )
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/scratch/vllm/vllm/executor/executor_base.py", line 52, in __init__
[rank0]: self._init_executor()
[rank0]: File "/scratch/vllm/vllm/executor/uniproc_executor.py", line 47, in _init_executor
[rank0]: self.collective_rpc("load_model")
[rank0]: File "/scratch/vllm/vllm/executor/uniproc_executor.py", line 56, in collective_rpc
[rank0]: answer = run_method(self.driver_worker, method, args, kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/scratch/vllm/vllm/utils.py", line 2232, in run_method
[rank0]: return func(*args, **kwargs)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/scratch/vllm/vllm/worker/worker.py", line 183, in load_model
[rank0]: self.model_runner.load_model()
[rank0]: File "/scratch/vllm/vllm/worker/model_runner.py", line 1113, in load_model
[rank0]: self.model = get_model(vllm_config=self.vllm_config)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/scratch/vllm/vllm/model_executor/model_loader/__init__.py", line 14, in get_model
[rank0]: return loader.load_model(vllm_config=vllm_config)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/scratch/vllm/vllm/model_executor/model_loader/loader.py", line 416, in load_model
[rank0]: model = _initialize_model(vllm_config=vllm_config)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/scratch/vllm/vllm/model_executor/model_loader/loader.py", line 126, in _initialize_model
[rank0]: return model_class(vllm_config=vllm_config, prefix=prefix)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/scratch/vllm/vllm/model_executor/models/transformers.py", line 152, in __init__
[rank0]: head_size=config.head_dim,
[rank0]: ^^^^^^^^^^^^^^^
[rank0]: File "/scratch/vllm/transformers/configuration_utils.py", line 214, in __getattribute__
[rank0]: return super().__getattribute__(key)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: AttributeError: 'Qwen2Config' object has no attribute 'head_dim'
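For reference, the missing attribute can be confirmed directly from the Hugging Face config (a quick sketch, assuming transformers is installed; the hasattr result reflects the transformers version pinned in my environment):

from transformers import AutoConfig

# With the transformers version in my environment, Qwen2Config does not
# define head_dim, so the attribute access in transformers.py raises.
config = AutoConfig.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
print(hasattr(config, "head_dim"))
# The head size can still be derived the usual way:
print(config.hidden_size // config.num_attention_heads)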
I think it's fine to calculate the head size if head_dim does not exist. Using this diff, the model works:
--- a/vllm/model_executor/models/transformers.py
+++ b/vllm/model_executor/models/transformers.py
@@ -146,13 +146,14 @@ class TransformersModel(nn.Module, SupportsQuant, SupportsLoRA):
         # Attention modifications (assumes 1 attention op per hidden layer)
         tp_size = get_tensor_model_parallel_world_size()
+        head_size = vllm_config.model_config.get_head_size()
         self.attention_instances = [
             Attention(
                 num_heads=divide(config.num_attention_heads, tp_size),
-                head_size=config.head_dim,
+                head_size=head_size,
                 # NOTE: We use Llama scale as default, if it's set by
                 # Transformers, it's updated in vllm_flash_attention_forward
-                scale=config.head_dim**-0.5,
+                scale=head_size**-0.5,
                 num_kv_heads=divide(config.num_key_value_heads, tp_size),
                 cache_config=cache_config,
                 quant_config=self.quant_config,
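If keeping the lookup local to transformers.py is preferred, the fallback could also be written roughly like the helper below (a sketch; the helper name is my own, not existing vLLM code):

def _derive_head_size(config) -> int:
    # Hypothetical helper: prefer an explicit head_dim if the HF config
    # provides one, otherwise derive it from the model dimensions.
    head_dim = getattr(config, "head_dim", None)
    if head_dim is not None:
        return head_dim
    # Standard derivation for most decoder-only models.
    return config.hidden_size // config.num_attention_heads

As far as I can tell, vllm_config.model_config.get_head_size() already applies essentially this fallback, which is why the diff above works.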
Before submitting a new issue...
- Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the documentation page, which can answer lots of frequently asked questions.