5 changes: 3 additions & 2 deletions examples/offline_inference/basic/basic_hpu.py
@@ -40,9 +40,10 @@
model_path = "/software/users/yiliu4/HF_HOME/weiweiz1/DeepSeek-V2-Lite-NVFP4-autoround"
# model_path = "/software/users/yiliu4/deepseek-ai/DeepSeek-R1-NVFP4-OFFLINE"
model_path = "/software/users/yiliu4/HF_HOME/weiweiz1/DeepSeek-R1-NVFP4-RTN"

model_path = "/software/users/yiliu4/HF_HOME/weiweiz1/DeepSeek-R1-NVFP4-autoround"
model_path = "/software/users/yiliu4/HF_HOME/Yi30/Llama-3.2-1B-Instruct-MXFP4-llmc"
# model_path = "/software/users/yiliu4/HF_HOME/Yi30/DeepSeek-V2-Lite-NVFP4-W4A4-RTN-GLOBAL-SCALE-WW"

model_path = "/software/users/yiliu4/HF_HOME/weiweiz1/DeepSeek-R1-MXFP4-RTN"
Comment on lines 40 to +46

medium

This block contains multiple assignments to model_path, many of which are immediately overwritten. This appears to be for local testing and should be cleaned up. Please consolidate this to a single default model_path and rely on the command-line argument --model_path to specify different models for testing.
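A minimal sketch of the suggested usage, assuming basic_hpu.py wires --model_path through argparse as described above (the path shown is simply the last one assigned in this block):

# keep one default in the script; select other checkpoints from the CLI
python examples/offline_inference/basic/basic_hpu.py \
    --model_path /software/users/yiliu4/HF_HOME/weiweiz1/DeepSeek-R1-MXFP4-RTN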


import os

47 changes: 21 additions & 26 deletions examples/offline_inference/basic/start_vllm.sh
@@ -2,9 +2,7 @@

# VLLM_HPU_LOG_HPU_GRAPH=1 VLLM_DISABLE_INPUT_QDQ=0 bash start_vllm.sh --dummy-run
# VLLM_HPU_LOG_HPU_GRAPH=1 VLLM_DISABLE_INPUT_QDQ=0 bash start_vllm.sh --skip-warmup
# bash start_vllm.sh --skip-warmup --ds-nvfp4
# bash start_vllm.sh --skip-warmup --ds-nvfp4 --dummy-run
# bash start_vllm.sh --skip-warmup --ds-nvfp4 --dummy-run --skip-warmup --next_token
# bash start_vllm.sh --skip-warmup --ds-nvfp4
# bash start_vllm.sh --skip-warmup --ds-nvfp4 --skip-warmup --next_token

model_path=/mnt/disk3/yiliu4/DeepSeek-R1-G2-INC-424-Converter207/
@@ -13,9 +11,14 @@ model_path=/software/users/yiliu4/HF_HOME/weiweiz1/DeepSeek-R1-MXFP8-RTN
v2_model_path=/software/users/yiliu4/HF_HOME/Yi30/Yi30/DeepSeek-V2-Lite-MXFP8-llmc
mxfp4_model_path=/software/users/yiliu4/HF_HOME/weiweiz1/DeepSeek-R1-MXFP4-RTN
mxfp4_model_path=/software/users/yiliu4/HF_HOME/weiweiz1/DeepSeek-R1-bf16-MXFP4-autoround
nvfp4_model_path=/software/users/yiliu4/deepseek-ai/DeepSeek-R1-NVFP4-OFFLINE

nvfp4_model_path=/software/users/yiliu4/HF_HOME/weiweiz1/DeepSeek-R1-NVFP4-autoround/
nvfp4_model_path="/software/users/yiliu4/deepseek-ai/DeepSeek-R1-nvfp4-fix-723"
nvfp4_model_path="/software/users/yiliu4/deepseek-ai/DeepSeek-R1-nvfp4-fix-723-skip-atten"
nvfp4_model_path=/software/users/yiliu4/deepseek-ai/DeepSeek-R1-NVFP4-OFFLINE
nvfp4_model_path="/software/users/yiliu4/HF_HOME/weiweiz1/DeepSeek-R1-NVFP4-RTN"
nvfp4_model_path="/software/users/yiliu4/HF_HOME/weiweiz1/DeepSeek-R1-NVFP4-RTN"
nvfp4_model_path="/software/users/yiliu4/HF_HOME/weiweiz1/DeepSeek-R1-NVFP4-autoround"
Comment on lines 15 to +21

medium

There are multiple assignments to nvfp4_model_path, including a duplicate. This appears to be for local testing and should be cleaned up to avoid confusion. Please retain only the necessary model path assignments.
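A minimal consolidation sketch, keeping the last assignment as the default and allowing an override via a hypothetical NVFP4_MODEL_PATH environment variable instead of stacking reassignments that shadow one another:

# single default; override with NVFP4_MODEL_PATH=... when testing other checkpoints
nvfp4_model_path="${NVFP4_MODEL_PATH:-/software/users/yiliu4/HF_HOME/weiweiz1/DeepSeek-R1-NVFP4-autoround}"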

tp_size=8

num_samples=128
@@ -81,6 +84,7 @@ done
# Debugging: Print the values of the variables
echo "USE_FP8_KV=$USE_FP8_KV"
echo "USE_NATIVE_SCALING=$USE_NATIVE_SCALING"
echo "model_path=$model_path"
echo "NEXT_TOKEN=$NEXT_TOKEN"


@@ -106,23 +110,26 @@ block_size=128
# DO NOT change ends...

# memory footprint tunning params
export VLLM_GPU_MEMORY_UTILIZATION=0.65
export VLLM_GPU_MEMORY_UTILIZATION=0.45
export VLLM_GRAPH_RESERVED_MEM=0.4
export VLLM_GRAPH_PROMPT_RATIO=0
export VLLM_MLA_DISABLE_REQUANTIZATION=0
export VLLM_DELAYED_SAMPLING="true"
#export VLLM_MOE_SLICE_LENGTH=20480


if [ "$NEXT_TOKEN" = true ]; then
echo "Enabling next token prediction"
export VLLM_DELAYED_SAMPLING="false"
task_name="mmlu"
else
echo "Disabling next token prediction"
export VLLM_DELAYED_SAMPLING="true"
fi
#export VLLM_MOE_SLICE_LENGTH=20480

# params
CONST_LEN=4096
CONST_LEN=16384
max_model_len=$CONST_LEN
max_num_batched_tokens=$CONST_LEN
max_num_seqs=32
@@ -252,7 +259,8 @@ fi
# add --max-num-prefill-seqs for next token prediction
if [ "$NEXT_TOKEN" = true ]; then
echo "Enabling next token prediction"
CMD="$CMD --max-num-prefill-seqs 1"
#CMD="$CMD --max-num-prefill-seqs 2"
CMD="$CMD --enforce-eager "
else
echo "Disabling next token prediction"
fi
@@ -278,13 +286,13 @@ echo "Server started with PID: ${pid}"

#===========================================================
# RUN BENCHMARK
#===========================================================
#===============================a============================
export no_proxy=localhost,127.0.0.1


model_base_name=$(basename $model_path)

EVAL_LOG_NAME="mxfp8_${model_base_name}_lm_eval_output_${task_name}_bs${batch_size}__${timestamp}"
EVAL_LOG_NAME="mxfp8_${model_base_name}_lm_eval_output__bs${batch_size}__${timestamp}"

medium

The task_name variable is no longer included in EVAL_LOG_NAME, but it is still used in the echo command on line 297 and the lm_eval command on line 303. This creates a mismatch between the log message and the actual log file name. For better traceability, consider adding task_name back to the log file name.

Suggested change
EVAL_LOG_NAME="mxfp8_${model_base_name}_lm_eval_output__bs${batch_size}__${timestamp}"
EVAL_LOG_NAME="mxfp8_${model_base_name}_lm_eval_output_${task_name}_bs${batch_size}__${timestamp}"


echo "Running lm_eval with model: ${model_path}, task: ${task_name}, batch size: ${batch_size}, num samples: ${num_samples}"

@@ -296,30 +304,17 @@ lm_eval --model local-completions \
--model_args model=${model_path},base_url=http://127.0.0.1:8688/v1/completions,max_concurrent=1 \
--batch_size 32 \
--confirm_run_unsafe_code \
--limit $num_samples \
--log_samples \
--output_path "benchmark_logs/$EVAL_LOG_NAME" \
2>&1 | tee "benchmark_logs/${EVAL_LOG_NAME}.log"





end_time=$(date +%s)
echo "Benchmark completed in $((end_time - start_time)) seconds"

# Clean up
echo "Stopping vLLM server"
kill ${pid}
echo "Script execution completed"
sleep 10



# lm_eval --model local-completions \
# --tasks "$task_name" \
# --model_args model=${model_path},base_url=http://127.0.0.1:8688/v1/completions,max_concurrent=1 \
# --batch_size 32 \
# --confirm_run_unsafe_code \
# --limit $num_samples \
# --log_samples
# echo "Stopping vLLM server"
#kill ${pid}
#echo "Script execution completed"
#sleep 10
Comment on lines +317 to +320

high

The server cleanup logic (kill ${pid}) is commented out. This will leave the vLLM server process running after the script completes, which can consume resources unnecessarily. If this was commented out for debugging, please re-enable the cleanup.

Suggested change
# echo "Stopping vLLM server"
#kill ${pid}
#echo "Script execution completed"
#sleep 10
echo "Stopping vLLM server"
kill ${pid}
echo "Script execution completed"
sleep 10
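Beyond restoring these lines, an alternative worth considering (a sketch, not part of this PR) is an exit trap registered once ${pid} is set, so the server is killed even if lm_eval exits early:

# register right after the server launch, once ${pid} is set
trap 'echo "Stopping vLLM server"; kill ${pid}' EXIT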

1 change: 0 additions & 1 deletion pyproject.toml
@@ -6,7 +6,6 @@ requires = [
"packaging>=24.2",
"setuptools>=77.0.3,<80.0.0",
"setuptools-scm>=8.0",
"torch == 2.7.0",
"wheel",
"jinja2",
]
1 change: 0 additions & 1 deletion requirements/build.txt
@@ -4,6 +4,5 @@ ninja
packaging>=24.2
setuptools>=77.0.3,<80.0.0
setuptools-scm>=8
torch==2.7.0
wheel
jinja2>=3.1.6
3 changes: 2 additions & 1 deletion requirements/common.txt
@@ -6,6 +6,7 @@ requests >= 2.26.0
tqdm
blake3
py-cpuinfo
datasets == 3.6.0
transformers == 4.53.2
huggingface-hub[hf_xet] >= 0.30.0 # Required for Xet downloads.
tokenizers >= 0.21.1 # Required for fast incremental detokenization.
@@ -37,7 +38,6 @@ six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that need
setuptools>=77.0.3,<80; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
einops # Required for Qwen2-VL.
# compressed-tensors == 0.10.2 # required for compressed-tensors
torchao @ git+https://github.com/yiliu30/torchao-fork.git@mxfp8
compressed-tensors @ git+https://github.com/yiliu30/compressed-tensors-fork.git@mxfp4
depyf==0.18.0 # required for profiling and debugging with compilation config
cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
@@ -49,3 +49,4 @@ opentelemetry-sdk>=1.26.0,<1.27.0 # vllm.tracing
opentelemetry-api>=1.26.0,<1.27.0 # vllm.tracing
opentelemetry-exporter-otlp>=1.26.0,<1.27.0 # vllm.tracing
opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0 # vllm.tracing
torchao @ git+https://github.com/yiliu30/torchao-fork.git@mxfp8
11 changes: 0 additions & 11 deletions requirements/cpu.txt
@@ -2,19 +2,8 @@
-r common.txt

# Dependencies for CPUs
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.7.0+cpu; platform_machine == "x86_64"
torch==2.7.0; platform_system == "Darwin"
torch==2.7.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
torch==2.7.0.dev20250304; platform_machine == "s390x"

# required for the image processor of minicpm-o-2_6, this must be updated alongside torch
torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x"
torchaudio==2.7.0; platform_machine == "ppc64le"

# required for the image processor of phi3v, this must be updated alongside torch
torchvision; platform_machine != "ppc64le" and platform_machine != "s390x"
torchvision==0.22.0; platform_machine == "ppc64le"
datasets # for benchmark scripts

# cpu cannot use triton 3.3.0
5 changes: 0 additions & 5 deletions requirements/cuda.txt
@@ -6,9 +6,4 @@ numba == 0.61.2; python_version > '3.9'

# Dependencies for NVIDIA GPUs
ray[cgraph]>=2.43.0, !=2.44.* # Ray Compiled Graph, required for pipeline parallelism in V1.
torch==2.7.0
torchaudio==2.7.0
# These must be updated alongside torch
torchvision==0.22.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
# https://github.com/facebookresearch/xformers/releases/tag/v0.0.30
xformers==0.0.30; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.7
2 changes: 0 additions & 2 deletions requirements/docs.txt
@@ -11,5 +11,3 @@ commonmark # Required by sphinx-argparse when using :markdownhelp:

# packages to install to build the documentation
cachetools
-f https://download.pytorch.org/whl/cpu
torch
1 change: 0 additions & 1 deletion requirements/neuron.txt
@@ -4,5 +4,4 @@
# Dependencies for Neuron devices
packaging>=24.2
setuptools>=77.0.3,<80.0.0
torch-neuronx >= 2.5.0
neuronx-cc
1 change: 0 additions & 1 deletion requirements/nightly_torch_test.txt
@@ -29,5 +29,4 @@ lm-eval[api]==0.4.8 # required for model evaluation test
bitsandbytes>=0.45.3

# required for minicpmo_26 test
vector_quantize_pytorch
vocos
4 changes: 0 additions & 4 deletions requirements/rocm-build.txt
@@ -1,10 +1,6 @@
# Common dependencies
-r common.txt

--extra-index-url https://download.pytorch.org/whl/rocm6.2.4
torch==2.7.0
torchvision==0.22.0
torchaudio==2.7.0

triton==3.2
cmake>=3.26,<4