Open
Description
#!/bin/bash
#
# Launch the ipex-llm vLLM OpenAI-compatible API server on an XPU device.
#
# Every setting below can be overridden from the environment; the value after
# ":-" is used when the variable is unset or empty.
#   MODEL_PATH              passed to --model
#   SERVED_MODEL_NAME       passed to --served-model-name
#   TENSOR_PARALLEL_SIZE    passed to --tensor-parallel-size
#   MAX_NUM_SEQS            passed to --max-num-seqs
#   MAX_NUM_BATCHED_TOKENS  passed to --max-num-batched-tokens
#   MAX_MODEL_LEN           passed to --max-model-len
#   LOAD_IN_LOW_BIT         passed to --load-in-low-bit and exported as IPEX_LLM_LOWBIT
#   PORT                    passed to --port
MODEL_PATH=${MODEL_PATH:-"/llm/models/Qwen2-VL-7B-Instruct"}
SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen2-VL-7B-Instruct"}
TENSOR_PARALLEL_SIZE=${TENSOR_PARALLEL_SIZE:-1}
MAX_NUM_SEQS=${MAX_NUM_SEQS:-256}
MAX_NUM_BATCHED_TOKENS=${MAX_NUM_BATCHED_TOKENS:-2048}
MAX_MODEL_LEN=${MAX_MODEL_LEN:-16384}
LOAD_IN_LOW_BIT=${LOAD_IN_LOW_BIT:-"fp8"}
PORT=${PORT:-8000}

# Print the effective configuration so it is captured at the top of the service log.
printf '%s\n' \
  "Starting service with model: $MODEL_PATH" \
  "Served model name: $SERVED_MODEL_NAME" \
  "Tensor parallel size: $TENSOR_PARALLEL_SIZE" \
  "Max num sequences: $MAX_NUM_SEQS" \
  "Max num batched tokens: $MAX_NUM_BATCHED_TOKENS" \
  "Max model length: $MAX_MODEL_LEN" \
  "Load in low bit: $LOAD_IN_LOW_BIT" \
  "Port: $PORT"

# Runtime environment for the SYCL / Level Zero / oneCCL stack.
export USE_XETLA=OFF
export SYCL_CACHE_PERSISTENT=1
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
export FI_PROVIDER=shm
export TORCH_LLM_ALLREDUCE=0
export CCL_WORKER_COUNT=2 # On BMG, set CCL_WORKER_COUNT=1; otherwise, internal-oneccl will not function properly
export CCL_ATL_TRANSPORT=ofi
export CCL_ZE_IPC_EXCHANGE=sockets
export CCL_ATL_SHM=1
export CCL_SAME_STREAM=1
export CCL_BLOCKING_WAIT=0
# export CCL_DG2_USM=1 # Needed on Core to enable USM (Shared Memory GPUDirect). Xeon supports P2P and doesn't need this.
export VLLM_USE_V1=0 # Used to select between V0 and V1 engine
export IPEX_LLM_LOWBIT="$LOAD_IN_LOW_BIT" # Ensures low-bit info is used for MoE; otherwise, IPEX's default MoE will be used

# Load the oneCCL environment before starting Python.
# shellcheck source=/dev/null
source /opt/intel/1ccl-wks/setvars.sh

# Build the argument list as an array so each value reaches the server as a
# single argument even if it contains whitespace or glob characters.
#
# NOTE(review): the error reported with this configuration ("Attempted to
# assign 1989 ... multimodal tokens to 1926 placeholders") looks like an image's
# placeholder tokens being split across prefill chunks. Presumably dropping
# --enable-chunked-prefill or raising MAX_NUM_BATCHED_TOKENS avoids it -
# TODO confirm against the vLLM version in use.
server_args=(
  --served-model-name "$SERVED_MODEL_NAME"
  --port "$PORT"
  --model "$MODEL_PATH"
  --trust-remote-code
  --block-size 8
  --gpu-memory-utilization 0.95
  --device xpu
  --dtype float16
  --enforce-eager
  --enable-chunked-prefill
  --load-in-low-bit "$LOAD_IN_LOW_BIT"
  --max-model-len "$MAX_MODEL_LEN"
  --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS"
  --max-num-seqs "$MAX_NUM_SEQS"
  --tensor-parallel-size "$TENSOR_PARALLEL_SIZE"
  --disable-async-output-proc
  --distributed-executor-backend ray
)

python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server "${server_args[@]}"
测试脚本:
python vllm_online_benchmark_multimodal.py --model-name Qwen2-VL-7B-Instruct --image-url 40_20240903093021_78830772282673.jpg
报错日志
INFO 06-09 13:50:30 [engine.py:310] Added request chatcmpl-fcc96ec1870e430998e0a7dfb64271a0.
ERROR 06-09 13:50:31 [worker_base.py:620] Error executing method 'execute_model'. This might cause deadlock in distributed execution.
ERROR 06-09 13:50:31 [worker_base.py:620] Traceback (most recent call last):
ERROR 06-09 13:50:31 [worker_base.py:620] File "/usr/local/lib/python3.11/dist-packages/vllm-0.8.3+ipexllm.xpu-py3.11-linux-x86_64.egg/vllm/worker/worker_base.py", line 612, in execute_method
ERROR 06-09 13:50:31 [worker_base.py:620] return run_method(self, method, args, kwargs)
ERROR 06-09 13:50:31 [worker_base.py:620] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 06-09 13:50:31 [worker_base.py:620] File "/usr/local/lib/python3.11/dist-packages/vllm-0.8.3+ipexllm.xpu-py3.11-linux-x86_64.egg/vllm/utils.py", line 2349, in run_method
ERROR 06-09 13:50:31 [worker_base.py:620] return func(*args, **kwargs)
ERROR 06-09 13:50:31 [worker_base.py:620] ^^^^^^^^^^^^^^^^^^^^^
ERROR 06-09 13:50:31 [worker_base.py:620] File "/usr/local/lib/python3.11/dist-packages/vllm-0.8.3+ipexllm.xpu-py3.11-linux-x86_64.egg/vllm/worker/worker_base.py", line 420, in execute_model
ERROR 06-09 13:50:31 [worker_base.py:620] output = self.model_runner.execute_model(
ERROR 06-09 13:50:31 [worker_base.py:620] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 06-09 13:50:31 [worker_base.py:620] File "/usr/local/lib/python3.11/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
ERROR 06-09 13:50:31 [worker_base.py:620] return func(*args, **kwargs)
ERROR 06-09 13:50:31 [worker_base.py:620] ^^^^^^^^^^^^^^^^^^^^^
ERROR 06-09 13:50:31 [worker_base.py:620] File "/usr/local/lib/python3.11/dist-packages/vllm-0.8.3+ipexllm.xpu-py3.11-linux-x86_64.egg/vllm/worker/xpu_model_runner.py", line 973, in execute_model
ERROR 06-09 13:50:31 [worker_base.py:620] hidden_or_intermediate_states = model_executable(
ERROR 06-09 13:50:31 [worker_base.py:620] ^^^^^^^^^^^^^^^^^
ERROR 06-09 13:50:31 [worker_base.py:620] File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
ERROR 06-09 13:50:31 [worker_base.py:620] return self._call_impl(*args, **kwargs)
ERROR 06-09 13:50:31 [worker_base.py:620] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 06-09 13:50:31 [worker_base.py:620] File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
ERROR 06-09 13:50:31 [worker_base.py:620] return forward_call(*args, **kwargs)
ERROR 06-09 13:50:31 [worker_base.py:620] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 06-09 13:50:31 [worker_base.py:620] File "/usr/local/lib/python3.11/dist-packages/vllm-0.8.3+ipexllm.xpu-py3.11-linux-x86_64.egg/vllm/model_executor/models/qwen2_vl.py", line 1414, in forward
ERROR 06-09 13:50:31 [worker_base.py:620] inputs_embeds = self.get_input_embeddings_v0(
ERROR 06-09 13:50:31 [worker_base.py:620] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 06-09 13:50:31 [worker_base.py:620] File "/usr/local/lib/python3.11/dist-packages/vllm-0.8.3+ipexllm.xpu-py3.11-linux-x86_64.egg/vllm/model_executor/models/qwen2_vl.py", line 1352, in get_input_embeddings_v0
ERROR 06-09 13:50:31 [worker_base.py:620] inputs_embeds = merge_multimodal_embeddings(
ERROR 06-09 13:50:31 [worker_base.py:620] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 06-09 13:50:31 [worker_base.py:620] File "/usr/local/lib/python3.11/dist-packages/vllm-0.8.3+ipexllm.xpu-py3.11-linux-x86_64.egg/vllm/model_executor/models/utils.py", line 481, in merge_multimodal_embeddings
ERROR 06-09 13:50:31 [worker_base.py:620] return _merge_multimodal_embeddings(
ERROR 06-09 13:50:31 [worker_base.py:620] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 06-09 13:50:31 [worker_base.py:620] File "/usr/local/lib/python3.11/dist-packages/vllm-0.8.3+ipexllm.xpu-py3.11-linux-x86_64.egg/vllm/model_executor/models/utils.py", line 397, in _merge_multimodal_embeddings
ERROR 06-09 13:50:31 [worker_base.py:620] raise ValueError(
ERROR 06-09 13:50:31 [worker_base.py:620] ValueError: Attempted to assign 1989 = 1989 multimodal tokens to 1926 placeholders
ERROR 06-09 13:50:31 [engine.py:160] ValueError('Attempted to assign 1989 = 1989 multimodal tokens to 1926 placeholders')
ERROR 06-09 13:50:31 [engine.py:160] Traceback (most recent call last):
ERROR 06-09 13:50:31 [engine.py:160] File "/usr/local/lib/python3.11/dist-packages/vllm-0.8.3+ipexllm.xpu-py3.11-linux-x86_64.egg/vllm/engine/multiprocessing/engine.py", line 158, in start
ERROR 06-09 13:50:31 [engine.py:160] self.run_engine_loop()
ERROR 06-09 13:50:31 [engine.py:160] File "/usr/local/lib/python3.11/dist-packages/vllm-0.8.3+ipexllm.xpu-py3.11-linux-x86_64.egg/vllm/engine/multiprocessing/engine.py", line 221, in run_engine_loop
ERROR 06-09 13:50:31 [engine.py:160] request_outputs = self.engine_step()
ERROR 06-09 13:50:31 [engine.py:160] ^^^^^^^^^^^^^^^^^^
ERROR 06-09 13:50:31 [engine.py:160] File "/usr/local/lib/python3.11/dist-packages/vllm-0.8.3+ipexllm.xpu-py3.11-linux-x86_64.egg/vllm/engine/multiprocessing/engine.py", line 247, in engine_step
ERROR 06-09 13:50:31 [engine.py:160] raise e
ERROR 06-09 13:50:31 [engine.py:160] File "/usr/local/lib/python3.11/dist-packages/vllm-0.8.3+ipexllm.xpu-py3.11-linux-x86_64.egg/vllm/engine/multiprocessing/engine.py", line 230, in engine_step
ERROR 06-09 13:50:31 [engine.py:160] return self.engine.step()
ERROR 06-09 13:50:31 [engine.py:160] ^^^^^^^^^^^^^^^^^^
ERROR 06-09 13:50:31 [engine.py:160] File "/usr/local/lib/python3.11/dist-packages/vllm-0.8.3+ipexllm.xpu-py3.11-linux-x86_64.egg/vllm/engine/llm_engine.py", line 1430, in step
ERROR 06-09 13:50:31 [engine.py:160] outputs = self.model_executor.execute_model(
ERROR 06-09 13:50:31 [engine.py:160] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 06-09 13:50:31 [engine.py:160] File "/usr/local/lib/python3.11/dist-packages/vllm-0.8.3+ipexllm.xpu-py3.11-linux-x86_64.egg/vllm/executor/ray_distributed_executor.py", line 451, in execute_model
ERROR 06-09 13:50:31 [engine.py:160] return super().execute_model(execute_model_req)
ERROR 06-09 13:50:31 [engine.py:160] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 06-09 13:50:31 [engine.py:160] File "/usr/local/lib/python3.11/dist-packages/vllm-0.8.3+ipexllm.xpu-py3.11-linux-x86_64.egg/vllm/executor/executor_base.py", line 299, in execute_model
ERROR 06-09 13:50:31 [engine.py:160] driver_outputs = self._driver_execute_model(execute_model_req)
ERROR 06-09 13:50:31 [engine.py:160] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 06-09 13:50:31 [engine.py:160] File "/usr/local/lib/python3.11/dist-packages/vllm-0.8.3+ipexllm.xpu-py3.11-linux-x86_64.egg/vllm/executor/ray_distributed_executor.py", line 444, in _driver_execute_model
ERROR 06-09 13:50:31 [engine.py:160] return self.driver_worker.execute_method("execute_model",
ERROR 06-09 13:50:31 [engine.py:160] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 06-09 13:50:31 [engine.py:160] File "/usr/local/lib/python3.11/dist-packages/vllm-0.8.3+ipexllm.xpu-py3.11-linux-x86_64.egg/vllm/worker/worker_base.py", line 621, in execute_method
ERROR 06-09 13:50:31 [engine.py:160] raise e
ERROR 06-09 13:50:31 [engine.py:160] File "/usr/local/lib/python3.11/dist-packages/vllm-0.8.3+ipexllm.xpu-py3.11-linux-x86_64.egg/vllm/worker/worker_base.py", line 612, in execute_method
ERROR 06-09 13:50:31 [engine.py:160] return run_method(self, method, args, kwargs)
ERROR 06-09 13:50:31 [engine.py:160] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 06-09 13:50:31 [engine.py:160] File "/usr/local/lib/python3.11/dist-packages/vllm-0.8.3+ipexllm.xpu-py3.11-linux-x86_64.egg/vllm/utils.py", line 2349, in run_method
ERROR 06-09 13:50:31 [engine.py:160] return func(*args, **kwargs)
ERROR 06-09 13:50:31 [engine.py:160] ^^^^^^^^^^^^^^^^^^^^^
ERROR 06-09 13:50:31 [engine.py:160] File "/usr/local/lib/python3.11/dist-packages/vllm-0.8.3+ipexllm.xpu-py3.11-linux-x86_64.egg/vllm/worker/worker_base.py", line 420, in execute_model
ERROR 06-09 13:50:31 [engine.py:160] output = self.model_runner.execute_model(
ERROR 06-09 13:50:31 [engine.py:160] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 06-09 13:50:31 [engine.py:160] File "/usr/local/lib/python3.11/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
ERROR 06-09 13:50:31 [engine.py:160] return func(*args, **kwargs)
ERROR 06-09 13:50:31 [engine.py:160] ^^^^^^^^^^^^^^^^^^^^^
ERROR 06-09 13:50:31 [engine.py:160] File "/usr/local/lib/python3.11/dist-packages/vllm-0.8.3+ipexllm.xpu-py3.11-linux-x86_64.egg/vllm/worker/xpu_model_runner.py", line 973, in execute_model
ERROR 06-09 13:50:31 [engine.py:160] hidden_or_intermediate_states = model_executable(
ERROR 06-09 13:50:31 [engine.py:160] ^^^^^^^^^^^^^^^^^
ERROR 06-09 13:50:31 [engine.py:160] File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
ERROR 06-09 13:50:31 [engine.py:160] return self._call_impl(*args, **kwargs)
ERROR 06-09 13:50:31 [engine.py:160] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 06-09 13:50:31 [engine.py:160] File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
ERROR 06-09 13:50:31 [engine.py:160] return forward_call(*args, **kwargs)
ERROR 06-09 13:50:31 [engine.py:160] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 06-09 13:50:31 [engine.py:160] File "/usr/local/lib/python3.11/dist-packages/vllm-0.8.3+ipexllm.xpu-py3.11-linux-x86_64.egg/vllm/model_executor/models/qwen2_vl.py", line 1414, in forward
ERROR 06-09 13:50:31 [engine.py:160] inputs_embeds = self.get_input_embeddings_v0(
ERROR 06-09 13:50:31 [engine.py:160] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 06-09 13:50:31 [engine.py:160] File "/usr/local/lib/python3.11/dist-packages/vllm-0.8.3+ipexllm.xpu-py3.11-linux-x86_64.egg/vllm/model_executor/models/qwen2_vl.py", line 1352, in get_input_embeddings_v0
ERROR 06-09 13:50:31 [engine.py:160] inputs_embeds = merge_multimodal_embeddings(
ERROR 06-09 13:50:31 [engine.py:160] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 06-09 13:50:31 [engine.py:160] File "/usr/local/lib/python3.11/dist-packages/vllm-0.8.3+ipexllm.xpu-py3.11-linux-x86_64.egg/vllm/model_executor/models/utils.py", line 481, in merge_multimodal_embeddings
ERROR 06-09 13:50:31 [engine.py:160] return _merge_multimodal_embeddings(
ERROR 06-09 13:50:31 [engine.py:160] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 06-09 13:50:31 [engine.py:160] File "/usr/local/lib/python3.11/dist-packages/vllm-0.8.3+ipexllm.xpu-py3.11-linux-x86_64.egg/vllm/model_executor/models/utils.py", line 397, in _merge_multimodal_embeddings
ERROR 06-09 13:50:31 [engine.py:160] raise ValueError(
ERROR 06-09 13:50:31 [engine.py:160] ValueError: Attempted to assign 1989 = 1989 multimodal tokens to 1926 placeholders
ERROR 06-09 13:50:31 [serving_chat.py:883] Error in chat completion stream generator.
ERROR 06-09 13:50:31 [serving_chat.py:883] Traceback (most recent call last):
ERROR 06-09 13:50:31 [serving_chat.py:883] File "/usr/local/lib/python3.11/dist-packages/vllm-0.8.3+ipexllm.xpu-py3.11-linux-x86_64.egg/vllm/entrypoints/openai/serving_chat.py", line 485, in chat_completion_stream_generator
ERROR 06-09 13:50:31 [serving_chat.py:883] async for res in result_generator:
ERROR 06-09 13:50:31 [serving_chat.py:883] File "/usr/local/lib/python3.11/dist-packages/vllm-0.8.3+ipexllm.xpu-py3.11-linux-x86_64.egg/vllm/engine/multiprocessing/client.py", line 664, in _process_request
ERROR 06-09 13:50:31 [serving_chat.py:883] raise request_output
ERROR 06-09 13:50:31 [serving_chat.py:883] vllm.engine.multiprocessing.MQEngineDeadError: Engine loop is not running. Inspect the stacktrace to find the original error: ValueError('Attempted to assign 1989 = 1989 multimodal tokens to 1926 placeholders').
CRITICAL 06-09 13:50:31 [launcher.py:116] MQLLMEngine is already dead, terminating server process
INFO: 127.0.0.1:55246 - "POST /v1/chat/completions HTTP/1.1" 500 Internal Server Error
CRITICAL 06-09 13:50:31 [launcher.py:116] MQLLMEngine is already dead, terminating server process
INFO: 127.0.0.1:55252 - "POST /v1/chat/completions HTTP/1.1" 500 Internal Server Error
CRITICAL 06-09 13:50:31 [launcher.py:116] MQLLMEngine is already dead, terminating server process
INFO: 127.0.0.1:55268 - "POST /v1/chat/completions HTTP/1.1" 500 Internal Server Error
CRITICAL 06-09 13:50:31 [launcher.py:116] MQLLMEngine is already dead, terminating server process
INFO: 127.0.0.1:55284 - "POST /v1/chat/completions HTTP/1.1" 500 Internal Server Error
CRITICAL 06-09 13:50:31 [launcher.py:116] MQLLMEngine is already dead, terminating server process
INFO: 127.0.0.1:55286 - "POST /v1/chat/completions HTTP/1.1" 500 Internal Server Error
INFO: Shutting down
INFO: Waiting for application shutdown.
INFO: Application shutdown complete.