22
# Usage examples:
# VLLM_HPU_LOG_HPU_GRAPH=1 VLLM_DISABLE_INPUT_QDQ=0 bash start_vllm.sh --dummy-run
# VLLM_HPU_LOG_HPU_GRAPH=1 VLLM_DISABLE_INPUT_QDQ=0 bash start_vllm.sh --skip-warmup
# bash start_vllm.sh --skip-warmup --ds-nvfp4
# bash start_vllm.sh --skip-warmup --ds-nvfp4 --dummy-run
# bash start_vllm.sh --skip-warmup --ds-nvfp4 --dummy-run --skip-warmup --next_token
# bash start_vllm.sh --skip-warmup --ds-nvfp4 --skip-warmup --next_token
# bash start_vllm.sh --skip-warmup --ds-mxfp4 --skip-warmup --next_token

# ---------------------------------------------------------------------------
# Model checkpoints.
# Each variable below is assigned several times on purpose: the list doubles
# as a quick switchboard of known checkpoints. Only the LAST assignment of a
# given name takes effect, so reorder (or comment out) lines to pick another.
# ---------------------------------------------------------------------------
model_path=/mnt/disk3/yiliu4/DeepSeek-R1-G2-INC-424-Converter207/
model_path=/software/users/yiliu4/deepseek-ai/DeepSeek-R1-MXFP8-OFFLINE/
model_path=/software/users/yiliu4/HF_HOME/weiweiz1/DeepSeek-R1-MXFP8-RTN
v2_model_path=/software/users/yiliu4/HF_HOME/Yi30/Yi30/DeepSeek-V2-Lite-MXFP8-llmc
mxfp4_model_path=/software/users/yiliu4/HF_HOME/weiweiz1/DeepSeek-R1-bf16-MXFP4-autoround

mxfp4_model_path=/software/users/yiliu4/HF_HOME/weiweiz1/DeepSeek-R1-MXFP8-RTN-RCEIL
mxfp4_model_path=/software/users/yiliu4/HF_HOME/weiweiz1/DeepSeek-R1-MXFP4-RTN
mxfp4_model_path=/software/users/yiliu4/HF_HOME/Yi30/Qwen3-235B-A22B-MXFP8-RTN-AR-LLMC
nvfp4_model_path=/software/users/yiliu4/deepseek-ai/DeepSeek-R1-NVFP4-OFFLINE
nvfp4_model_path=/software/users/yiliu4/HF_HOME/weiweiz1/DeepSeek-R1-NVFP4-autoround/
nvfp4_model_path="/software/users/yiliu4/HF_HOME/weiweiz1/DeepSeek-R1-NVFP4-RTN"
tp_size=8

# Evaluation defaults. As with the paths above, the last task_name wins.
num_samples=128
task_name="mmlu_pro_math,mmlu_pro_biology"
task_name="humaneval"
task_name="gsm8k"

batch_size=64


# set -x
@@ -61,6 +62,7 @@ for arg in "$@"; do
6162 export VLLM_USE_MXFP4_CT_EMULATIONS=1
6263 export VLLM_INPUT_QUICK_QDQ=1
6364 export USE_CT_UNPACK=1
65+ export VLLM_MXFP4_EVEN_ROUNDING=1
6466 ;;
6567 --ds-nvfp4)
6668 model_path=$nvfp4_model_path
# Debugging: print the effective values of the run-time switches.
# USE_FP8_KV, USE_NATIVE_SCALING and NEXT_TOKEN are not assigned in this
# section (presumably by the argument-parsing loop earlier in the script —
# confirm); an unset variable simply prints as empty.
echo "USE_FP8_KV=$USE_FP8_KV"
echo "USE_NATIVE_SCALING=$USE_NATIVE_SCALING"
echo "NEXT_TOKEN=$NEXT_TOKEN"
echo "model_path=$model_path"

# Directory containing this script, as reported by BASH_SOURCE.
BASH_DIR=$(dirname "${BASH_SOURCE[0]}")
# source "$BASH_DIR"/utils.sh
@@ -106,41 +107,49 @@ export VLLM_LOGGING_LEVEL=DEBUG
# export VLLM_MOE_N_SLICE=8
# VLLM_EP_SIZE is set to the same value as tp_size.
export VLLM_EP_SIZE=$tp_size


block_size=128
# DO NOT change ends...

# memory footprint tuning params
export VLLM_GPU_MEMORY_UTILIZATION=0.25
export VLLM_GRAPH_RESERVED_MEM=0.4
export VLLM_GRAPH_PROMPT_RATIO=0
export VLLM_MLA_DISABLE_REQUANTIZATION=0
export VLLM_DELAYED_SAMPLING="true"

# Default prompt bucketing; the next-token branch below overrides the
# batch-size step.
export VLLM_PROMPT_BS_BUCKET_STEP=512
export VLLM_PROMPT_SEQ_BUCKET_MIN=512
export VLLM_PROMPT_SEQ_BUCKET_STEP=512

# NEXT_TOKEN is not assigned in this section (presumably set by the
# argument parser — confirm). Anything other than the literal string
# "true", including unset, selects the else branch.
if [[ "${NEXT_TOKEN:-}" == true ]]; then
    echo "Enabling next token prediction"
    export VLLM_DELAYED_SAMPLING="false"
    export VLLM_EXPONENTIAL_BUCKETING=false
    export VLLM_PROMPT_BS_BUCKET_STEP=1
    export VLLM_PROMPT_SEQ_BUCKET_MIN=512
    export VLLM_PROMPT_SEQ_BUCKET_STEP=512
else
    echo "Disabling next token prediction"
    export VLLM_DELAYED_SAMPLING="true"
fi
# export VLLM_MOE_SLICE_LENGTH=20480

# params: every length limit below is derived from CONST_LEN.
CONST_LEN=8192
max_model_len=$CONST_LEN
max_num_batched_tokens=$CONST_LEN
max_num_seqs=32
input_min=1
input_max=$CONST_LEN
output_max=$CONST_LEN

# The bucket variables are intentionally no longer unset here, so the
# VLLM_PROMPT_* exports above (and any values inherited from the calling
# environment) stay in effect.
# unset VLLM_PROMPT_BS_BUCKET_MIN VLLM_PROMPT_BS_BUCKET_STEP VLLM_PROMPT_BS_BUCKET_MAX
# unset VLLM_PROMPT_SEQ_BUCKET_MIN VLLM_PROMPT_SEQ_BUCKET_STEP VLLM_PROMPT_SEQ_BUCKET_MAX
# unset VLLM_DECODE_BS_BUCKET_MIN VLLM_DECODE_BS_BUCKET_STEP VLLM_DECODE_BS_BUCKET_MAX
# unset VLLM_DECODE_BLOCK_BUCKET_MIN VLLM_DECODE_BLOCK_BUCKET_STEP VLLM_DECODE_BLOCK_BUCKET_MAX




# export PT_HPU_RECIPE_CACHE_CONFIG=/data/16k_cache,false,16384
259268# add --max-num-prefill-seqs for next token prediction
260269if [ " $NEXT_TOKEN " = true ]; then
261270 echo " Enabling next token prediction"
262- # CMD="$CMD --max-num-prefill-seqs 2 "
271+ CMD=" $CMD --max-num-prefill-seqs ${batch_size} "
263272 CMD=" $CMD --enforce-eager "
264273else
265274 echo " Disabling next token prediction"
@@ -286,35 +295,73 @@ echo "Server started with PID: ${pid}"
286295
# ===========================================================
# RUN BENCHMARK
# ===========================================================
export no_proxy=localhost,127.0.0.1


model_base_name=$(basename -- "$model_path")


# NOTE: task_name printed here is the value configured earlier in the
# script; the runs below each set their own task.
echo "Running lm_eval with model: ${model_path}, task: ${task_name}, batch size: ${batch_size}, num samples: ${num_samples}"

start_time=$(date +%s)

# tee writes its log into this directory, so make sure it exists.
mkdir -p benchmark_logs

#######################################
# Run one lm_eval task against the local completions endpoint and capture
# its combined stdout/stderr in benchmark_logs/.
# Globals:
#   model_path, model_base_name, batch_size, timestamp (read; timestamp is
#   not assigned in this section)
#   task_name, EVAL_LOG_NAME (written, so later code sees the last run)
# Arguments:
#   $1 - lm_eval task name
# Outputs:
#   lm_eval output on stdout, mirrored to benchmark_logs/${EVAL_LOG_NAME}.log
#######################################
run_lm_eval() {
    task_name=$1
    EVAL_LOG_NAME="mxfp4_${model_base_name}_lm_eval_output_${task_name}_bs${batch_size}__${timestamp}"

    HF_ALLOW_CODE_EVAL=1 \
    lm_eval --model local-completions \
        --tasks "$task_name" \
        --model_args "model=${model_path},base_url=http://127.0.0.1:8688/v1/completions,max_concurrent=1" \
        --batch_size "${batch_size}" \
        --confirm_run_unsafe_code \
        --log_samples \
        --output_path "benchmark_logs/$EVAL_LOG_NAME" \
        2>&1 | tee "benchmark_logs/${EVAL_LOG_NAME}.log"
}

run_lm_eval "piqa"

sleep 10
echo "hellaswag evaluation starts"
run_lm_eval "hellaswag"

sleep 10
echo "mmlu evaluation starts"
run_lm_eval "mmlu"




# end_time=$(date +%s)
# echo "Benchmark completed in $((end_time - start_time)) seconds"

# # Clean up
# echo "Stopping vLLM server"
# kill ${pid}
# echo "Script execution completed"
# sleep 10

0 commit comments