**`README.md`**

@@ -88,6 +88,8 @@ Notes:

### Llama3 Quantization Recipes

Here we provide several recipes for Llama3 models. The relative accuracy loss of the quantized models should be less than 1%.

#### Llama 3.1 8B MXFP8

AutoRound tuning helps improve accuracy; `iters` and `nsamples` are set higher than their defaults.
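
As an illustrative sketch only — the flag values below are placeholders rather than the recipe's actual settings, and `--nsamples` is presumed to be exposed by `quantize.py` alongside the `--iters` flag used in `run_quant.sh`:

```bash
# Illustrative values only; the recipe's real iters/nsamples are set by run_quant.sh.
python quantize.py \
    --model_name_or_path /models/Llama-3.1-8B-Instruct/ \
    --dtype MXFP8 \
    --iters 1000 \
    --nsamples 512 \
    --export_path Llama-3.1-8B-MXFP8
```
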
@@ -131,6 +133,8 @@ RTN (Round-to-Nearest) is enough to keep accuracy.
CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=Llama-3.3-70B --dtype=mxfp8 --input_model=/models/Llama-3.3-70B-Instruct/ --output_model=Llama-3.3-70B-MXFP8
```

> Note: Quantizing `lm_head` stays within the accuracy threshold, but it is not enabled here so that the exported model can be served with vLLM.
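
If vLLM serving is not needed, `lm_head` quantization can be re-enabled by passing `--quant_lm_head` to `quantize.py`. A minimal sketch (paths are placeholders; `$COMMON_ARGS` as defined in `run_quant.sh`):

```bash
# Sketch: also quantize lm_head when the model will not be served with vLLM.
python quantize.py \
    --model_name_or_path /models/Llama-3.3-70B-Instruct/ \
    $COMMON_ARGS \
    --dtype MXFP8 \
    --quant_lm_head \
    --iters 0 \
    --export_path Llama-3.3-70B-MXFP8
```
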

#### Llama 3.3 70B MXFP4 (Mixed with MXFP8, Target_bits=5.8)

`Target_bits=5.8` is an empirical value.
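
One way to read that number: with weights split between MXFP4 (4-bit) and MXFP8 (8-bit) layers, an average of 5.8 bits per weight implies that roughly 45% of the quantized weights stay in MXFP8. This is only a back-of-the-envelope check, assuming `Target_bits` approximates the average weight bit-width and ignoring scale/metadata overhead:

```bash
# 8*x + 4*(1-x) = 5.8  =>  x = (5.8 - 4) / (8 - 4) = 0.45
python -c 'x = (5.8 - 4) / (8 - 4); print(f"MXFP8 fraction ~{x:.0%}, MXFP4 fraction ~{1 - x:.0%}")'
```
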
@@ -147,6 +151,8 @@ RTN (Round-to-Nearest) is enough to keep accuracy.
CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=Llama-3.1-70B --dtype=mxfp8 --input_model=/models/Llama-3.1-70B-Instruct/ --output_model=Llama-3.1-70B-MXFP8
```

> Note: Quantizing `lm_head` stays within the accuracy threshold, but it is not enabled here so that the exported model can be served with vLLM.

#### Llama 3.1 70B NVFP4

RTN (Round-to-Nearest) is enough to keep accuracy.
@@ -155,6 +161,8 @@ RTN (Round-to-Nearest) is enough to keep accuracy.
CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_quant.sh --topology=Llama-3.1-70B --dtype=nvfp4 --input_model=/models/Llama-3.1-70B-Instruct/ --output_model=Llama-3.1-70B-NVFP4
```

> Note: Quantizing `lm_head` stays within the accuracy threshold, but it is not enabled here so that the exported model can be served with vLLM.

#### Llama 3.1 70B uNVFP4

RTN (Round-to-Nearest) is enough to keep accuracy.
@@ -186,27 +194,27 @@ For convenience, we provide a benchmark script that automatically handles GPU de

1. **Llama 3.1 8B MXFP8** (1 GPU):
```bash
CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh --model_path=Llama-3.1-8B-MXFP8
CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh --model_path=Llama-3.1-8B-MXFP8 --gpu_memory_utilization=0.8
```

2. **Llama 3.1 8B MXFP4 Mixed** (1 GPU):
```bash
CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh --model_path=Llama-3.1-8B-MXFP4-MXFP8
CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh --model_path=Llama-3.1-8B-MXFP4-MXFP8 --gpu_memory_utilization=0.6
```

3. **Llama 3.3 70B MXFP8** (4 GPU):
3. **Llama 3.3 70B MXFP8** (2 GPU):
```bash
CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh --model_path=Llama-3.3-70B-MXFP8
CUDA_VISIBLE_DEVICES=0,1 bash run_benchmark.sh --model_path=Llama-3.3-70B-MXFP8 --gpu_memory_utilization=0.8
```

4. **Llama 3.3 70B MXFP4 Mixed** (4 GPU):
4. **Llama 3.3 70B MXFP4 Mixed** (2 GPU):
```bash
CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh --model_path=Llama-3.3-70B-MXFP4-MXFP8
CUDA_VISIBLE_DEVICES=0,1 bash run_benchmark.sh --model_path=Llama-3.3-70B-MXFP4-MXFP8 --gpu_memory_utilization=0.6
```

5. **Llama 3.1 70B MXFP8** (4 GPU):
5. **Llama 3.1 70B MXFP8** (2 GPU):
```bash
CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh --model_path=Llama-3.1-70B-MXFP8
CUDA_VISIBLE_DEVICES=0,1 bash run_benchmark.sh --model_path=Llama-3.1-70B-MXFP8 --gpu_memory_utilization=0.8
```
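
As noted above, `run_benchmark.sh` works out the tensor-parallel size from the GPUs you expose. A minimal sketch of one way to do it (an assumption; the script's actual logic may differ):

```bash
# Sketch: use the number of devices listed in CUDA_VISIBLE_DEVICES as the
# tensor-parallel size, e.g. "0,1" -> 2. The real run_benchmark.sh may differ.
IFS=',' read -ra GPUS <<< "${CUDA_VISIBLE_DEVICES:-0}"
TENSOR_PARALLEL_SIZE=${#GPUS[@]}
echo "Tensor Parallel Size: $TENSOR_PARALLEL_SIZE"
```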

The script automatically:
**`run_benchmark.sh`**

@@ -4,7 +4,8 @@

# Parse command line arguments
TASKS="piqa,hellaswag,mmlu,gsm8k"
BATCH_SIZE=8
BATCH_SIZE=64
GPU_MEMORY_UTILIZATION=0.8

while [[ $# -gt 0 ]]; do
case $1 in
@@ -20,6 +21,10 @@ while [[ $# -gt 0 ]]; do
BATCH_SIZE="${1#*=}"
shift
;;
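# Fraction of each GPU's memory that vLLM may allocate; forwarded to lm_eval's vllm model_args below.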
--gpu_memory_utilization=*)
GPU_MEMORY_UTILIZATION="${1#*=}"
shift
;;
*)
echo "Unknown parameter: $1"
exit 1
@@ -48,6 +53,7 @@ echo " Model Path: $MODEL_PATH"
echo " Tasks: $TASKS"
echo " Batch Size: $BATCH_SIZE"
echo " Tensor Parallel Size: $TENSOR_PARALLEL_SIZE"
echo " GPU Memory Utilization: $GPU_MEMORY_UTILIZATION"
echo " CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"

# Check if the model exists
@@ -68,11 +74,11 @@ run_evaluation() {
echo "Running evaluation for tasks: $tasks (add_bos_token=$add_bos_token)"

# Print the command being executed
local cmd="lm_eval --model vllm --model_args pretrained=\"$MODEL_PATH\",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,data_parallel_size=1 --tasks $tasks --batch_size $BATCH_SIZE"
local cmd="lm_eval --model vllm --model_args pretrained=\"$MODEL_PATH\",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,data_parallel_size=1,max_model_len=8192 --tasks $tasks --batch_size $BATCH_SIZE"
echo "Executing command: $cmd"

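# max_model_len=8192 caps the context length; Llama 3.x otherwise defaults to a much longer
# context whose KV cache may not fit within the gpu_memory_utilization budget.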
lm_eval --model vllm \
--model_args pretrained="$MODEL_PATH",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,data_parallel_size=1 \
--model_args pretrained="$MODEL_PATH",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,data_parallel_size=1,max_model_len=8192 \
--tasks $tasks \
--batch_size $BATCH_SIZE

**`run_quant.sh`**

@@ -95,13 +95,12 @@ case "$TOPOLOGY" in
case "$DTYPE" in
"mxfp8")
echo "Running Llama 3.3 70B MXFP8 quantization..."
CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype MXFP8 --quant_lm_head --iters 0 --export_path \"$OUTPUT_MODEL\""
CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype MXFP8 --iters 0 --export_path \"$OUTPUT_MODEL\""
echo "Executing command: $CMD"
python quantize.py \
--model_name_or_path "$INPUT_MODEL" \
$COMMON_ARGS \
--dtype MXFP8 \
--quant_lm_head \
--iters 0 \
--export_path "$OUTPUT_MODEL"
;;
@@ -140,25 +139,23 @@ case "$TOPOLOGY" in
case "$DTYPE" in
"mxfp8")
echo "Running Llama 3.1 70B MXFP8 quantization..."
CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype MXFP8 --quant_lm_head --iters 0 --export_path \"$OUTPUT_MODEL\""
CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype MXFP8 --iters 0 --export_path \"$OUTPUT_MODEL\""
echo "Executing command: $CMD"
python quantize.py \
--model_name_or_path "$INPUT_MODEL" \
$COMMON_ARGS \
--dtype MXFP8 \
--quant_lm_head \
--iters 0 \
--export_path "$OUTPUT_MODEL"
;;
"nvfp4")
echo "Running Llama 3.1 70B NVFP4 quantization..."
CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype NVFP4 --quant_lm_head --iters 0 --export_format llm_compressor --export_path \"$OUTPUT_MODEL\""
CMD="python quantize.py --model_name_or_path \"$INPUT_MODEL\" $COMMON_ARGS --dtype NVFP4 --iters 0 --export_format llm_compressor --export_path \"$OUTPUT_MODEL\""
echo "Executing command: $CMD"
python quantize.py \
--model_name_or_path "$INPUT_MODEL" \
$COMMON_ARGS \
--dtype NVFP4 \
--quant_lm_head \
--iters 0 \
--export_format llm_compressor \
--export_path "$OUTPUT_MODEL"