Commit fcf6ec3

yiliu30 and Yi4Liu authored
Refine start bench (#66)
* remove torch from requirements
* update bench code
* update
* update model path

Signed-off-by: Yi Liu <yiliu4@habana.ai>
Co-authored-by: Yi Liu <yiliu4@habana.ai>
1 parent c9c5d12 commit fcf6ec3

File tree

1 file changed: +76 −29 lines changed


examples/offline_inference/basic/start_vllm.sh

Lines changed: 76 additions & 29 deletions
@@ -2,31 +2,32 @@
 
 # VLLM_HPU_LOG_HPU_GRAPH=1 VLLM_DISABLE_INPUT_QDQ=0 bash start_vllm.sh --dummy-run
 # VLLM_HPU_LOG_HPU_GRAPH=1 VLLM_DISABLE_INPUT_QDQ=0 bash start_vllm.sh --skip-warmup
-# bash start_vllm.sh --skip-warmup --ds-nvfp4
+# bash start_vllm.sh --skip-warmup --ds-nvfp4
+# bash start_vllm.sh --skip-warmup --ds-nvfp4 --dummy-run
+# bash start_vllm.sh --skip-warmup --ds-nvfp4 --dummy-run --skip-warmup --next_token
 # bash start_vllm.sh --skip-warmup --ds-nvfp4 --skip-warmup --next_token
+# bash start_vllm.sh --skip-warmup --ds-mxfp4 --skip-warmup --next_token
 
 model_path=/mnt/disk3/yiliu4/DeepSeek-R1-G2-INC-424-Converter207/
 model_path=/software/users/yiliu4/deepseek-ai/DeepSeek-R1-MXFP8-OFFLINE/
 model_path=/software/users/yiliu4/HF_HOME/weiweiz1/DeepSeek-R1-MXFP8-RTN
 v2_model_path=/software/users/yiliu4/HF_HOME/Yi30/Yi30/DeepSeek-V2-Lite-MXFP8-llmc
-mxfp4_model_path=/software/users/yiliu4/HF_HOME/weiweiz1/DeepSeek-R1-MXFP4-RTN
 mxfp4_model_path=/software/users/yiliu4/HF_HOME/weiweiz1/DeepSeek-R1-bf16-MXFP4-autoround
 
-nvfp4_model_path=/software/users/yiliu4/HF_HOME/weiweiz1/DeepSeek-R1-NVFP4-autoround/
-nvfp4_model_path="/software/users/yiliu4/deepseek-ai/DeepSeek-R1-nvfp4-fix-723"
-nvfp4_model_path="/software/users/yiliu4/deepseek-ai/DeepSeek-R1-nvfp4-fix-723-skip-atten"
+mxfp4_model_path=/software/users/yiliu4/HF_HOME/weiweiz1/DeepSeek-R1-MXFP8-RTN-RCEIL
+mxfp4_model_path=/software/users/yiliu4/HF_HOME/weiweiz1/DeepSeek-R1-MXFP4-RTN
+mxfp4_model_path=/software/users/yiliu4/HF_HOME/Yi30/Qwen3-235B-A22B-MXFP8-RTN-AR-LLMC
 nvfp4_model_path=/software/users/yiliu4/deepseek-ai/DeepSeek-R1-NVFP4-OFFLINE
+nvfp4_model_path=/software/users/yiliu4/HF_HOME/weiweiz1/DeepSeek-R1-NVFP4-autoround/
 nvfp4_model_path="/software/users/yiliu4/HF_HOME/weiweiz1/DeepSeek-R1-NVFP4-RTN"
-nvfp4_model_path="/software/users/yiliu4/HF_HOME/weiweiz1/DeepSeek-R1-NVFP4-RTN"
-nvfp4_model_path="/software/users/yiliu4/HF_HOME/weiweiz1/DeepSeek-R1-NVFP4-autoround"
 tp_size=8
 
 num_samples=128
 task_name="mmlu_pro_math,mmlu_pro_biology"
 task_name="humaneval"
 task_name="gsm8k"
 
-batch_size=32
+batch_size=64
 
 
 # set -x
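
Note on the hunk above: bash keeps only the last assignment, so within each group of repeated model_path= / mxfp4_model_path= / nvfp4_model_path= lines, the final uncommented path is the one that takes effect. A minimal sketch of making that selection overridable from the caller's environment (the MODEL_PATH_OVERRIDE name is hypothetical, not part of this script):

# Hypothetical override hook: keep the in-script defaults, but let the caller
# point at a different checkpoint without editing the file.
model_path=${MODEL_PATH_OVERRIDE:-$model_path}
echo "Using model_path=$model_path"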
@@ -61,6 +62,7 @@ for arg in "$@"; do
         export VLLM_USE_MXFP4_CT_EMULATIONS=1
         export VLLM_INPUT_QUICK_QDQ=1
         export USE_CT_UNPACK=1
+        export VLLM_MXFP4_EVEN_ROUNDING=1
         ;;
     --ds-nvfp4)
         model_path=$nvfp4_model_path
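
This hunk adds VLLM_MXFP4_EVEN_ROUNDING=1 to the flags already exported in the --ds-mxfp4 branch. A small pre-launch sanity check, sketched here as an assumption rather than part of the script, that prints whether each MXFP4-related flag actually reached the environment:

# Hypothetical sanity check (not in the script): print each MXFP4 flag,
# or "<unset>" if it was never exported. ${!v} is bash indirect expansion.
for v in VLLM_USE_MXFP4_CT_EMULATIONS VLLM_INPUT_QUICK_QDQ USE_CT_UNPACK VLLM_MXFP4_EVEN_ROUNDING; do
    echo "$v=${!v:-<unset>}"
done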
@@ -84,9 +86,8 @@ done
 # Debugging: Print the values of the variables
 echo "USE_FP8_KV=$USE_FP8_KV"
 echo "USE_NATIVE_SCALING=$USE_NATIVE_SCALING"
-echo "model_path=$model_path"
 echo "NEXT_TOKEN=$NEXT_TOKEN"
-
+echo "model_path=$model_path"
 
 BASH_DIR=$(dirname "${BASH_SOURCE[0]}")
 # source "$BASH_DIR"/utils.sh
@@ -106,41 +107,49 @@ export VLLM_LOGGING_LEVEL=DEBUG
 # export VLLM_MOE_N_SLICE=8
 export VLLM_EP_SIZE=$tp_size
 
+
 block_size=128
 # DO NOT change ends...
 
 # memory footprint tunning params
-export VLLM_GPU_MEMORY_UTILIZATION=0.45
+export VLLM_GPU_MEMORY_UTILIZATION=0.25
 export VLLM_GRAPH_RESERVED_MEM=0.4
 export VLLM_GRAPH_PROMPT_RATIO=0
 export VLLM_MLA_DISABLE_REQUANTIZATION=0
 export VLLM_DELAYED_SAMPLING="true"
-#export VLLM_MOE_SLICE_LENGTH=20480
 
+export VLLM_PROMPT_BS_BUCKET_STEP=512
+export VLLM_PROMPT_SEQ_BUCKET_MIN=512
+export VLLM_PROMPT_SEQ_BUCKET_STEP=512
 
 if [ "$NEXT_TOKEN" = true ]; then
     echo "Enabling next token prediction"
     export VLLM_DELAYED_SAMPLING="false"
-    task_name="mmlu"
+    export VLLM_EXPONENTIAL_BUCKETING=false
+    export VLLM_PROMPT_BS_BUCKET_STEP=1
+    export VLLM_PROMPT_SEQ_BUCKET_MIN=512
+    export VLLM_PROMPT_SEQ_BUCKET_STEP=512
 else
     echo "Disabling next token prediction"
     export VLLM_DELAYED_SAMPLING="true"
 fi
 #export VLLM_MOE_SLICE_LENGTH=20480
 
 # params
-CONST_LEN=16384
+CONST_LEN=8192
 max_model_len=$CONST_LEN
 max_num_batched_tokens=$CONST_LEN
 max_num_seqs=32
 input_min=1
 input_max=$CONST_LEN
 output_max=$CONST_LEN
 
-unset VLLM_PROMPT_BS_BUCKET_MIN VLLM_PROMPT_BS_BUCKET_STEP VLLM_PROMPT_BS_BUCKET_MAX
-unset VLLM_PROMPT_SEQ_BUCKET_MIN VLLM_PROMPT_SEQ_BUCKET_STEP VLLM_PROMPT_SEQ_BUCKET_MAX
-unset VLLM_DECODE_BS_BUCKET_MIN VLLM_DECODE_BS_BUCKET_STEP VLLM_DECODE_BS_BUCKET_MAX
-unset VLLM_DECODE_BLOCK_BUCKET_MIN VLLM_DECODE_BLOCK_BUCKET_STEP VLLM_DECODE_BLOCK_BUCKET_MAX
+#unset VLLM_PROMPT_BS_BUCKET_MIN VLLM_PROMPT_BS_BUCKET_STEP VLLM_PROMPT_BS_BUCKET_MAX
+#unset VLLM_PROMPT_SEQ_BUCKET_MIN VLLM_PROMPT_SEQ_BUCKET_STEP VLLM_PROMPT_SEQ_BUCKET_MAX
+#unset VLLM_DECODE_BS_BUCKET_MIN VLLM_DECODE_BS_BUCKET_STEP VLLM_DECODE_BS_BUCKET_MAX
+#unset VLLM_DECODE_BLOCK_BUCKET_MIN VLLM_DECODE_BLOCK_BUCKET_STEP VLLM_DECODE_BLOCK_BUCKET_MAX
+
+
 
 
 # export PT_HPU_RECIPE_CACHE_CONFIG=/data/16k_cache,false,16384
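
The NEXT_TOKEN branch above turns off exponential bucketing and sets a prompt batch-size step of 1 with 512-token sequence buckets. Assuming linear bucketing enumerates sizes from MIN upward in STEP increments until the max model length (CONST_LEN=8192 in this revision), the resulting prompt sequence buckets can be previewed with a sketch like:

# Sketch under the linear-bucketing assumption: with the values exported
# above this prints 512 1024 1536 ... 8192.
seq_min=${VLLM_PROMPT_SEQ_BUCKET_MIN:-512}
seq_step=${VLLM_PROMPT_SEQ_BUCKET_STEP:-512}
for ((b = seq_min; b <= 8192; b += seq_step)); do
    printf '%d ' "$b"
done
printf '\n'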
@@ -259,7 +268,7 @@ fi
 # add --max-num-prefill-seqs for next token prediction
 if [ "$NEXT_TOKEN" = true ]; then
     echo "Enabling next token prediction"
-    #CMD="$CMD --max-num-prefill-seqs 2"
+    CMD="$CMD --max-num-prefill-seqs ${batch_size}"
     CMD="$CMD --enforce-eager "
 else
     echo "Disabling next token prediction"
@@ -286,35 +295,73 @@ echo "Server started with PID: ${pid}"
 
 #===========================================================
 # RUN BENCHMARK
-#===============================a============================
+#===========================================================
 export no_proxy=localhost,127.0.0.1
 
 
 model_base_name=$(basename $model_path)
 
-EVAL_LOG_NAME="mxfp8_${model_base_name}_lm_eval_output__bs${batch_size}__${timestamp}"
 
 echo "Running lm_eval with model: ${model_path}, task: ${task_name}, batch size: ${batch_size}, num samples: ${num_samples}"
 
 start_time=$(date +%s)
 
+task_name="piqa"
+EVAL_LOG_NAME="mxfp4_${model_base_name}_lm_eval_output_${task_name}_bs${batch_size}__${timestamp}"
+
+HF_ALLOW_CODE_EVAL=1 \
+lm_eval --model local-completions \
+    --tasks $task_name \
+    --model_args model=${model_path},base_url=http://127.0.0.1:8688/v1/completions,max_concurrent=1 \
+    --batch_size ${batch_size} \
+    --confirm_run_unsafe_code \
+    --log_samples \
+    --output_path "benchmark_logs/$EVAL_LOG_NAME" \
+    2>&1 | tee "benchmark_logs/${EVAL_LOG_NAME}.log"
+
+
+
+sleep 10
+echo "hellaswag evaluation starts"
+task_name="hellaswag"
+
+EVAL_LOG_NAME="mxfp4_${model_base_name}_lm_eval_output_${task_name}_bs${batch_size}__${timestamp}"
 HF_ALLOW_CODE_EVAL=1 \
 lm_eval --model local-completions \
-    --tasks "$task_name" \
+    --tasks $task_name \
     --model_args model=${model_path},base_url=http://127.0.0.1:8688/v1/completions,max_concurrent=1 \
-    --batch_size 32 \
+    --batch_size ${batch_size} \
    --confirm_run_unsafe_code \
     --log_samples \
     --output_path "benchmark_logs/$EVAL_LOG_NAME" \
     2>&1 | tee "benchmark_logs/${EVAL_LOG_NAME}.log"
 
 
+sleep 10
+echo "mmlu evaluation starts"
+task_name="mmlu"
+EVAL_LOG_NAME="mxfp4_${model_base_name}_lm_eval_output_${task_name}_bs${batch_size}__${timestamp}"
+
+HF_ALLOW_CODE_EVAL=1 \
+lm_eval --model local-completions \
+    --tasks $task_name \
+    --model_args model=${model_path},base_url=http://127.0.0.1:8688/v1/completions,max_concurrent=1 \
+    --batch_size ${batch_size} \
+    --confirm_run_unsafe_code \
+    --log_samples \
+    --output_path "benchmark_logs/$EVAL_LOG_NAME" \
+    2>&1 | tee "benchmark_logs/${EVAL_LOG_NAME}.log"
+
 
-end_time=$(date +%s)
-echo "Benchmark completed in $((end_time - start_time)) seconds"
 
-# Clean up
+
+
+# end_time=$(date +%s)
+# echo "Benchmark completed in $((end_time - start_time)) seconds"
+
+# # Clean up
 # echo "Stopping vLLM server"
-#kill ${pid}
-#echo "Script execution completed"
-#sleep 10
+# kill ${pid}
+# echo "Script execution completed"
+# sleep 10
+
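The three lm_eval invocations added in the last hunk differ only in task_name, so a follow-up change could collapse them into a single loop. A sketch of that refactor, reusing exactly the flags and endpoint from the diff:

# Sketch only: equivalent to the three back-to-back lm_eval runs above,
# looping over the task list instead of repeating the command.
for task_name in piqa hellaswag mmlu; do
    EVAL_LOG_NAME="mxfp4_${model_base_name}_lm_eval_output_${task_name}_bs${batch_size}__${timestamp}"
    HF_ALLOW_CODE_EVAL=1 \
    lm_eval --model local-completions \
        --tasks "$task_name" \
        --model_args "model=${model_path},base_url=http://127.0.0.1:8688/v1/completions,max_concurrent=1" \
        --batch_size "${batch_size}" \
        --confirm_run_unsafe_code \
        --log_samples \
        --output_path "benchmark_logs/${EVAL_LOG_NAME}" \
        2>&1 | tee "benchmark_logs/${EVAL_LOG_NAME}.log"
    sleep 10
done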