Closed
Description
[hw]
4x Arc A770 + 1x XeonW
[model]
DeepSeek-R1-Distill-Qwen-32B
[dtype/load-in-low-bit]
float16/fp8
[convert code]
from vllm import SamplingParams
from ipex_llm.vllm.xpu.engine import IPEXLLMClass as LLM

# Load the unquantized checkpoint, quantize it to fp8 on the fly, and
# persist the low-bit weights to `low_bit_save_path` for later serving.
engine_kwargs = dict(
    model="/llm/models/DeepSeek-R1-Distill-Qwen-32B",  # unquantized model path on disk
    device="xpu",
    dtype="float16",
    enforce_eager=True,
    load_in_low_bit="fp8",  # the low-bit format to quantize to
    tensor_parallel_size=4,  # must match the tp-size used later to serve the low-bit model
    disable_async_output_proc=True,
    distributed_executor_backend="ray",
    max_model_len=10000,
    trust_remote_code=True,
    block_size=8,
    max_num_batched_tokens=10000,
    low_bit_save_path="/llm/models/DeepSeek-R1-Distill-Qwen-32B-fp8",  # where quantized weights are saved
)

# Create the LLM; constructing it triggers quantization and the save.
llm = LLM(**engine_kwargs)
[vllm start code]
#!/bin/bash
# Serve a pre-quantized (fp8) DeepSeek-R1-Distill-Qwen-32B model via the
# ipex-llm vLLM OpenAI-compatible API server on Intel XPU (tensor parallel 4).
# Every knob below can be overridden through the environment variable of the
# same name, e.g.:  PORT=9000 MAX_NUM_SEQS=64 ./start.sh
#
# NOTE: -u is deliberately omitted — Intel's setvars.sh may reference unset
# variables and would abort under 'set -u'.
set -eo pipefail

# --- configurable parameters (env-overridable) -------------------------------
MODEL_PATH=${MODEL_PATH:-"/llm/models/DeepSeek-R1-Distill-Qwen-32B-fp8"}
SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"/llm/models/DeepSeek-R1-Distill-Qwen-32B-fp8"}
TENSOR_PARALLEL_SIZE=${TENSOR_PARALLEL_SIZE:-4}
MAX_NUM_SEQS=${MAX_NUM_SEQS:-256}
MAX_NUM_BATCHED_TOKENS=${MAX_NUM_BATCHED_TOKENS:-10000}
MAX_MODEL_LEN=${MAX_MODEL_LEN:-10000}
LOAD_IN_LOW_BIT=${LOAD_IN_LOW_BIT:-"fp8"}
PORT=${PORT:-8008}

echo "Starting service with model: $MODEL_PATH"
echo "Served model name: $SERVED_MODEL_NAME"
echo "Tensor parallel size: $TENSOR_PARALLEL_SIZE"
echo "Max num sequences: $MAX_NUM_SEQS"
echo "Max num batched tokens: $MAX_NUM_BATCHED_TOKENS"
echo "Max model length: $MAX_MODEL_LEN"
echo "Load in low bit: $LOAD_IN_LOW_BIT"
echo "Port: $PORT"

# --- oneCCL / SYCL runtime tuning for multi-GPU Intel XPU --------------------
export CCL_WORKER_COUNT=4
export SYCL_CACHE_PERSISTENT=1
export FI_PROVIDER=shm
export CCL_ATL_TRANSPORT=ofi
export CCL_ZE_IPC_EXCHANGE=sockets
export CCL_ATL_SHM=1
export USE_XETLA=OFF
export SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=2
export TORCH_LLM_ALLREDUCE=0
export CCL_SAME_STREAM=1
export CCL_BLOCKING_WAIT=0
export VLLM_USE_V1=0
export IPEX_LLM_LOWBIT=$LOAD_IN_LOW_BIT

# Bring the oneCCL toolchain into the environment (fails the script if absent).
source /opt/intel/1ccl-wks/setvars.sh

# --model points at the ORIGINAL (unquantized) checkpoint; the saved low-bit
# weights are loaded from --low-bit-model-path. All expansions are quoted so
# paths containing spaces cannot word-split (SC2086).
python -m ipex_llm.vllm.xpu.entrypoints.openai.api_server \
  --served-model-name "$SERVED_MODEL_NAME" \
  --port "$PORT" \
  --model "/llm/models/DeepSeek-R1-Distill-Qwen-32B" \
  --low-bit-model-path "$MODEL_PATH" \
  --trust-remote-code \
  --block-size 8 \
  --gpu-memory-utilization 0.95 \
  --device xpu \
  --dtype float16 \
  --enforce-eager \
  --load-in-low-bit "$LOAD_IN_LOW_BIT" \
  --max-model-len "$MAX_MODEL_LEN" \
  --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \
  --max-num-seqs "$MAX_NUM_SEQS" \
  --tensor-parallel-size "$TENSOR_PARALLEL_SIZE" \
  --disable-async-output-proc \
  --distributed-executor-backend ray
Metadata
Assignees
Labels
No labels