Your current environment
I am trying to run unsloth/Qwen2.5-VL-72B-Instruct-bnb-4bit with 2× A100-80GB GPUs on Modal, launched as follows:
import subprocess

# MODELS_DIR and MODEL_NAME are defined elsewhere in the Modal app; the
# resolved path is /pointer/unsloth/Qwen2.5-VL-72B-Instruct-bnb-4bit (see logs below).
subprocess.Popen([
    "vllm", "serve",
    f"{MODELS_DIR}/{MODEL_NAME}",
    "--host", "127.0.0.1",
    "--port", "8000",
    "--max-model-len", "32767",
    "--tensor-parallel-size", "1",
    "--gpu-memory-utilization", "0.90",
    "--trust-remote-code",
    "--quantization", "bitsandbytes",
    "--load-format", "bitsandbytes",
])
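For context, once the server came up it would be consumed through vLLM's OpenAI-compatible endpoint; a minimal client sketch under that assumption (the model name is the served path from the logs below, the API key placeholder stands in for the redacted --api-key value, and the prompt is purely illustrative):

from openai import OpenAI

# Sketch only, not part of the failing run: the endpoint mirrors the launch
# flags above and the model name is the served path shown in the logs.
client = OpenAI(base_url="http://127.0.0.1:8000/v1", api_key="[API_KEY]")
resp = client.chat.completions.create(
    model="/pointer/unsloth/Qwen2.5-VL-72B-Instruct-bnb-4bit",
    messages=[{"role": "user", "content": "Hello"}],  # illustrative prompt
)
print(resp.choices[0].message.content)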
🐛 Describe the bug
Feb 07 21:57:25.839 | INFO 02-07 13:57:25 api_server.py:841] args: Namespace(subparser='serve', model_tag='/pointer/unsloth/Qwen2.5-VL-72B-Instruct-bnb-4bit', config='', host='127.0.0.1', port=8000, uvicorn_log_level='info', allow_credentials=False, allowed_origins=[''], allowed_methods=[''], allowed_headers=['*'], api_key='[API_KEY]', lora_modules=None, prompt_adapters=None, chat_template=None, chat_template_content_format='auto', response_role='assistant', ssl_keyfile=None, ssl_certfile=None, ssl_ca_certs=None, ssl_cert_reqs=0, root_path=None, middleware=[], return_tokens_as_token_ids=False, disable_frontend_multiprocessing=False, enable_request_id_headers=False, enable_auto_tool_choice=False, enable_reasoning=False, reasoning_parser=None, tool_call_parser=None, tool_parser_plugin='', model='/pointer/unsloth/Qwen2.5-VL-72B-Instruct-bnb-4bit', task='auto', tokenizer=None, skip_tokenizer_init=False, revision=None, code_revision=None, tokenizer_revision=None, tokenizer_mode='auto', trust_remote_code=True, allowed_local_media_path=None, download_dir=None, load_format='bitsandbytes', config_format=<ConfigFormat.AUTO: 'auto'>, dtype='auto', kv_cache_dtype='auto', max_model_len=32767, guided_decoding_backend='xgrammar', logits_processor_pattern=None, model_impl='auto', distributed_executor_backend=None, pipeline_parallel_size=1, tensor_parallel_size=1, max_parallel_loading_workers=None, ray_workers_use_nsight=False, block_size=None, enable_prefix_caching=None, disable_sliding_window=False, use_v2_block_manager=True, num_lookahead_slots=0, seed=0, swap_space=4, cpu_offload_gb=0, gpu_memory_utilization=0.9, num_gpu_blocks_override=None, max_num_batched_tokens=None, max_num_seqs=None, max_logprobs=20, disable_log_stats=False, quantization='bitsandbytes', rope_scaling=None, rope_theta=None, hf_overrides=None, enforce_eager=False, max_seq_len_to_capture=8192, disable_custom_all_reduce=False, tokenizer_pool_size=0, tokenizer_pool_type='ray', tokenizer_pool_extra_config=None, limit_mm_per_prompt=None, mm_processor_kwargs=None, disable_mm_preprocessor_cache=False, enable_lora=False, enable_lora_bias=False, max_loras=1, max_lora_rank=16, lora_extra_vocab_size=256, lora_dtype='auto', long_lora_scaling_factors=None, max_cpu_loras=None, fully_sharded_loras=False, enable_prompt_adapter=False, max_prompt_adapters=1, max_prompt_adapter_token=0, device='auto', num_scheduler_steps=1, multi_step_stream_outputs=True, scheduler_delay_factor=0.0, enable_chunked_prefill=None, speculative_model=None, speculative_model_quantization=None, num_speculative_tokens=None, speculative_disable_mqa_scorer=False, speculative_draft_tensor_parallel_size=None, speculative_max_model_len=None, speculative_disable_by_batch_size=None, ngram_prompt_lookup_max=None, ngram_prompt_lookup_min=None, spec_decoding_acceptance_method='rejection_sampler', typical_acceptance_sampler_posterior_threshold=None, typical_acceptance_sampler_posterior_alpha=None, disable_logprobs_during_spec_decoding=None, model_loader_extra_config=None, ignore_patterns=[], preemption_mode=None, served_model_name=None, qlora_adapter_name_or_path=None, otlp_traces_endpoint=None, collect_detailed_traces=None, disable_async_output_proc=False, scheduling_policy='fcfs', override_neuron_config=None, override_pooler_config=None, compilation_config=None, kv_transfer_config=None, worker_cls='auto', generation_config=None, override_generation_config=None, enable_sleep_mode=False, calculate_kv_scales=False, disable_log_requests=False, max_log_len=None, disable_fastapi_docs=False, 
enable_prompt_tokens_details=False, dispatch_function=<function serve at 0x7eaf0cf73ec0>) |
Feb 07 21:57:25.872 | INFO 02-07 13:57:25 api_server.py:206] Started engine process with PID 12 |
Feb 07 21:57:32.946 | INFO 02-07 13:57:32 __init__.py:190] Automatically detected platform cuda. |
Feb 07 21:57:37.478 | INFO 02-07 13:57:37 config.py:542] This model supports multiple tasks: {'reward', 'classify', 'generate', 'score', 'embed'}. Defaulting to 'generate'. |
Feb 07 21:57:39.391 | WARNING 02-07 13:57:39 config.py:621] bitsandbytes quantization is not fully optimized yet. The speed can be slower than non-quantized models. |
Feb 07 21:57:46.573 | INFO 02-07 13:57:46 config.py:542] This model supports multiple tasks: {'reward', 'classify', 'generate', 'score', 'embed'}. Defaulting to 'generate'. |
Feb 07 21:57:48.293 | WARNING 02-07 13:57:48 config.py:621] bitsandbytes quantization is not fully optimized yet. The speed can be slower than non-quantized models. |
Feb 07 21:57:48.381 | INFO 02-07 13:57:48 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.2) with config: model='/pointer/unsloth/Qwen2.5-VL-72B-Instruct-bnb-4bit', speculative_config=None, tokenizer='/pointer/unsloth/Qwen2.5-VL-72B-Instruct-bnb-4bit', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=32767, download_dir=None, load_format=LoadFormat.BITSANDBYTES, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=bitsandbytes, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=/pointer/unsloth/Qwen2.5-VL-72B-Instruct-bnb-4bit, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=False, chunked_prefill_enabled=False, use_async_output_proc=True, disable_mm_preprocessor_cache=False, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":[],"compile_sizes":[],"cudagraph_capture_sizes":[256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"max_capture_size":256}, use_cached_outputs=True, |
Feb 07 21:57:49.580 | INFO 02-07 13:57:49 cuda.py:230] Using Flash Attention backend. |
Feb 07 21:57:50.825 | INFO 02-07 13:57:50 model_runner.py:1110] Starting to load model /pointer/unsloth/Qwen2.5-VL-72B-Instruct-bnb-4bit... |
Feb 07 21:57:51.164 | WARNING 02-07 13:57:51 vision.py:94] Current vllm-flash-attn has a bug inside vision module, so we use xformers backend instead. You can run pip install flash-attn to use flash-attention backend. |
Feb 07 21:57:51.225 | INFO 02-07 13:57:51 config.py:2992] cudagraph sizes specified by model runner [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256] is overridden by config [256, 128, 2, 1, 4, 136, 8, 144, 16, 152, 24, 160, 32, 168, 40, 176, 48, 184, 56, 192, 64, 200, 72, 208, 80, 216, 88, 120, 224, 96, 232, 104, 240, 112, 248] |
Feb 07 21:57:51.568 | INFO 02-07 13:57:51 loader.py:1102] Loading weights with BitsAndBytes quantization. May take a while ... |
Feb 07 21:57:51.581 | Loading safetensors checkpoint shards: 0% Completed | 0/9 [00:00<?, ?it/s] |
Feb 07 21:57:56.132 | Loading safetensors checkpoint shards: 11% Completed | 1/9 [00:04<00:36, 4.55s/it] |
Feb 07 21:58:00.891 | Loading safetensors checkpoint shards: 22% Completed | 2/9 [00:09<00:32, 4.67s/it] |
Feb 07 21:58:05.548 | Loading safetensors checkpoint shards: 33% Completed | 3/9 [00:13<00:27, 4.67s/it] |
Feb 07 21:58:10.079 | Loading safetensors checkpoint shards: 44% Completed | 4/9 [00:18<00:23, 4.61s/it] |
Feb 07 21:58:14.845 | Loading safetensors checkpoint shards: 56% Completed | 5/9 [00:23<00:18, 4.67s/it] |
Feb 07 21:58:19.530 | Loading safetensors checkpoint shards: 67% Completed | 6/9 [00:27<00:14, 4.67s/it] |
Feb 07 21:58:24.404 | Loading safetensors checkpoint shards: 78% Completed | 7/9 [00:32<00:09, 4.74s/it] |
Feb 07 21:58:28.560 | Loading safetensors checkpoint shards: 89% Completed | 8/9 [00:36<00:04, 4.55s/it] |
Feb 07 21:58:31.041 | Loading safetensors checkpoint shards: 100% Completed | 9/9 [00:39<00:00, 3.91s/it] |
Feb 07 21:58:31.046 | Loading safetensors checkpoint shards: 0% Completed | 0/9 [00:00<?, ?it/s] |
Feb 07 21:58:32.650 | ERROR 02-07 13:58:32 engine.py:389] RuntimeError: shape '[3, 16, 80, 1280]' is invalid for input of size 2457600 |
Feb 07 21:58:32.651 | Process SpawnProcess-1: |
Feb 07 21:58:32.656 | Traceback (most recent call last):
  File "/usr/local/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/local/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.12/site-packages/vllm/engine/multiprocessing/engine.py", line 391, in run_mp_engine
    raise e
  File "/usr/local/lib/python3.12/site-packages/vllm/engine/multiprocessing/engine.py", line 380, in run_mp_engine
    engine = MQLLMEngine.from_engine_args(engine_args=engine_args,
  File "/usr/local/lib/python3.12/site-packages/vllm/engine/multiprocessing/engine.py", line 123, in from_engine_args
    return cls(ipc_path=ipc_path,
  File "/usr/local/lib/python3.12/site-packages/vllm/engine/multiprocessing/engine.py", line 75, in __init__
    self.engine = LLMEngine(*args, **kwargs)
  File "/usr/local/lib/python3.12/site-packages/vllm/engine/llm_engine.py", line 273, in __init__
    self.model_executor = executor_class(vllm_config=vllm_config, )
  File "/usr/local/lib/python3.12/site-packages/vllm/executor/executor_base.py", line 51, in __init__
    self._init_executor()
  File "/usr/local/lib/python3.12/site-packages/vllm/executor/uniproc_executor.py", line 42, in _init_executor
    self.collective_rpc("load_model")
  File "/usr/local/lib/python3.12/site-packages/vllm/executor/uniproc_executor.py", line 51, in collective_rpc
    answer = run_method(self.driver_worker, method, args, kwargs)
  File "/usr/local/lib/python3.12/site-packages/vllm/utils.py", line 2220, in run_method
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.12/site-packages/vllm/worker/worker.py", line 183, in load_model
    self.model_runner.load_model()
  File "/usr/local/lib/python3.12/site-packages/vllm/worker/model_runner.py", line 1112, in load_model
    self.model = get_model(vllm_config=self.vllm_config)
  File "/usr/local/lib/python3.12/site-packages/vllm/model_executor/model_loader/__init__.py", line 14, in get_model
    return loader.load_model(vllm_config=vllm_config)
  File "/usr/local/lib/python3.12/site-packages/vllm/model_executor/model_loader/loader.py", line 1225, in load_model
    self._load_weights(model_config, model)
  File "/usr/local/lib/python3.12/site-packages/vllm/model_executor/model_loader/loader.py", line 1135, in _load_weights
    loaded_weights = model.load_weights(qweight_iterator)
  File "/usr/local/lib/python3.12/site-packages/vllm/model_executor/models/qwen2_5_vl.py", line 1124, in load_weights
    return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
  File "/usr/local/lib/python3.12/site-packages/vllm/model_executor/models/utils.py", line 235, in load_weights
    autoloaded_weights = set(self._load_module("", self.module, weights))
  File "/usr/local/lib/python3.12/site-packages/vllm/model_executor/models/utils.py", line 196, in _load_module
    yield from self._load_module(prefix,
  File "/usr/local/lib/python3.12/site-packages/vllm/model_executor/models/utils.py", line 173, in _load_module
    loaded_params = module_load_weights(weights)
  File "/usr/local/lib/python3.12/site-packages/vllm/model_executor/models/qwen2_5_vl.py", line 672, in load_weights
    loaded_weight = loaded_weight.view(3, visual_num_heads,
  File "/usr/local/lib/python3.12/site-packages/torch/utils/_device.py", line 106, in __torch_function__
    return func(*args, **kwargs)
RuntimeError: shape '[3, 16, 80, 1280]' is invalid for input of size 2457600
Feb 07 21:58:32.766 | Loading safetensors checkpoint shards: 0% Completed | 0/9 [00:01<?, ?it/s] |
Feb 07 21:58:33.380 | [rank0]:[W207 13:58:33.655823362 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator()) |
Feb 07 21:58:41.418 | Traceback (most recent call last):
  File "/usr/local/bin/vllm", line 8, in <module>
    sys.exit(main())
  File "/usr/local/lib/python3.12/site-packages/vllm/scripts.py", line 204, in main
    args.dispatch_function(args)
  File "/usr/local/lib/python3.12/site-packages/vllm/scripts.py", line 44, in serve
    uvloop.run(run_server(args))
  File "/usr/local/lib/python3.12/site-packages/uvloop/__init__.py", line 109, in run
    return __asyncio.run(
  File "/usr/local/lib/python3.12/asyncio/runners.py", line 194, in run
    return runner.run(main)
  File "/usr/local/lib/python3.12/asyncio/runners.py", line 118, in run
    return self._loop.run_until_complete(task)
  File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
  File "/usr/local/lib/python3.12/site-packages/uvloop/__init__.py", line 61, in wrapper
    return await main
  File "/usr/local/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 875, in run_server
    async with build_async_engine_client(args) as engine_client:
  File "/usr/local/lib/python3.12/contextlib.py", line 210, in __aenter__
    return await anext(self.gen)
  File "/usr/local/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 136, in build_async_engine_client
    async with build_async_engine_client_from_engine_args(
  File "/usr/local/lib/python3.12/contextlib.py", line 210, in __aenter__
    return await anext(self.gen)
  File "/usr/local/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 230, in build_async_engine_client_from_engine_args
    raise RuntimeError(
RuntimeError: Engine process failed to start. See stack trace for the root cause.
Before submitting a new issue...
- Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the documentation page, which can answer lots of frequently asked questions.