You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
Same params for Mistral-7b-Instruct-v0.2 work well, no errors.
/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py:1132: FutureWarning: resume_download is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use force_download=True.
warnings.warn(
INFO 05-24 08:02:13 llm_engine.py:100] Initializing an LLM engine (v0.4.2) with config: model='mistralai/Mistral-7B-Instruct-v0.3', speculative_config=None, tokenizer='mistralai/Mistral-7B-Instruct-v0.3', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=16000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=mistralai/Mistral-7B-Instruct-v0.3)
You set add_prefix_space. The tokenizer needs to be converted from the slow tokenizers
INFO 05-24 08:02:15 utils.py:660] Found nccl from library /root/.config/vllm/nccl/cu12/libnccl.so.2.18.1
INFO 05-24 08:02:16 selector.py:27] Using FlashAttention-2 backend.
INFO 05-24 08:02:17 weight_utils.py:199] Using model weights format ['*.safetensors']
[rank0]: Traceback (most recent call last):
[rank0]: File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
[rank0]: return _run_code(code, main_globals, None,
[rank0]: File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
[rank0]: exec(code, run_globals)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/entrypoints/openai/api_server.py", line 168, in
[rank0]: engine = AsyncLLMEngine.from_engine_args(
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 366, in from_engine_args
[rank0]: engine = cls(
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 324, in init
[rank0]: self.engine = self._init_engine(*args, **kwargs)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 442, in _init_engine
[rank0]: return engine_class(*args, **kwargs)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py", line 160, in init
[rank0]: self.model_executor = executor_class(
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/executor/executor_base.py", line 41, in init
[rank0]: self._init_executor()
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/executor/gpu_executor.py", line 23, in _init_executor
[rank0]: self._init_non_spec_worker()
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/executor/gpu_executor.py", line 69, in _init_non_spec_worker
[rank0]: self.driver_worker.load_model()
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/worker/worker.py", line 118, in load_model
[rank0]: self.model_runner.load_model()
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/worker/model_runner.py", line 164, in load_model
[rank0]: self.model = get_model(
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/model_loader/init.py", line 19, in get_model
[rank0]: return loader.load_model(model_config=model_config,
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/model_loader/loader.py", line 224, in load_model
[rank0]: model.load_weights(
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/llama.py", line 415, in load_weights
[rank0]: param = params_dict[name]
[rank0]: KeyError: 'layers.0.attention.wk.weight'
The text was updated successfully, but these errors were encountered:
Your current environment
Using official Docker image.
🐛 Describe the bug
Using Docker image: vllm/vllm-openai:latest
Params:
Same params for Mistral-7b-Instruct-v0.2 work well, no errors.
/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py:1132: FutureWarning:
resume_download
is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, useforce_download=True
.warnings.warn(
INFO 05-24 08:02:13 llm_engine.py:100] Initializing an LLM engine (v0.4.2) with config: model='mistralai/Mistral-7B-Instruct-v0.3', speculative_config=None, tokenizer='mistralai/Mistral-7B-Instruct-v0.3', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=16000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=mistralai/Mistral-7B-Instruct-v0.3)
You set
add_prefix_space
. The tokenizer needs to be converted from the slow tokenizersINFO 05-24 08:02:15 utils.py:660] Found nccl from library /root/.config/vllm/nccl/cu12/libnccl.so.2.18.1
INFO 05-24 08:02:16 selector.py:27] Using FlashAttention-2 backend.
INFO 05-24 08:02:17 weight_utils.py:199] Using model weights format ['*.safetensors']
[rank0]: Traceback (most recent call last):
[rank0]: File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
[rank0]: return _run_code(code, main_globals, None,
[rank0]: File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
[rank0]: exec(code, run_globals)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/entrypoints/openai/api_server.py", line 168, in
[rank0]: engine = AsyncLLMEngine.from_engine_args(
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 366, in from_engine_args
[rank0]: engine = cls(
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 324, in init
[rank0]: self.engine = self._init_engine(*args, **kwargs)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 442, in _init_engine
[rank0]: return engine_class(*args, **kwargs)
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py", line 160, in init
[rank0]: self.model_executor = executor_class(
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/executor/executor_base.py", line 41, in init
[rank0]: self._init_executor()
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/executor/gpu_executor.py", line 23, in _init_executor
[rank0]: self._init_non_spec_worker()
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/executor/gpu_executor.py", line 69, in _init_non_spec_worker
[rank0]: self.driver_worker.load_model()
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/worker/worker.py", line 118, in load_model
[rank0]: self.model_runner.load_model()
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/worker/model_runner.py", line 164, in load_model
[rank0]: self.model = get_model(
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/model_loader/init.py", line 19, in get_model
[rank0]: return loader.load_model(model_config=model_config,
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/model_loader/loader.py", line 224, in load_model
[rank0]: model.load_weights(
[rank0]: File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/llama.py", line 415, in load_weights
[rank0]: param = params_dict[name]
[rank0]: KeyError: 'layers.0.attention.wk.weight'
The text was updated successfully, but these errors were encountered: