18 changes: 17 additions & 1 deletion auto_round/eval/eval_cli.py
@@ -13,6 +13,7 @@
# limitations under the License.
import argparse
import os
import time

from auto_round.utils import (
clear_memory,
@@ -66,6 +67,13 @@ def __init__(self, *args, **kwargs):
help="Limit the number of examples per task. "
"If <1, limit is a percentage of the total number of examples.",
)
self.add_argument(
"--eval_backend",
default="hf",
type=str,
choices=["hf", "vllm"],
help="Use hf backend for evaluation by default.",
)
# vllm related arguments
self.add_argument("--revision", default=None, type=str, help="model revision for vllm")
self.add_argument("--tokenizer", default=None, type=str, help="tokenizer to use with vllm")
@@ -104,7 +112,15 @@ def _eval_init(tasks, model_path, device, disable_trust_remote_code=False, dtype


def eval(args):
import time
if args.eval_backend == "vllm":
try:
assert isinstance(args.model, str), "vllm evaluation only supports model name or path."
eval_with_vllm(args)
return
except Exception as e: # pragma: no cover
print(f"vllm evaluation failed: {e}, fallback to default hf backend evaluation.")
args.eval_backend = "hf"
clear_memory()

tasks, model_args, device_str = _eval_init(
args.tasks, args.model, args.device_map, args.disable_trust_remote_code, args.eval_model_dtype
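For reference, the new flag is exercised from the CLI the same way the updated test below does it. A minimal standalone sketch, assuming the auto-round console script is installed; the model id is a placeholder (any HF model name or local path), and on a vllm failure eval() falls back to the hf backend automatically:

import os
import subprocess

env = os.environ.copy()
env["VLLM_SKIP_WARMUP"] = "true"

cmd = [
    "auto-round",
    "--model", "<model-name-or-path>",  # placeholder: any HF model id or local path
    "--eval",
    "--tasks", "lambada_openai",
    "--eval_bs", "8",
    "--eval_backend", "vllm",  # new flag added by this PR; defaults to "hf"
    "--limit", "10",
]

# Run the evaluation; a non-zero return code indicates failure.
proc = subprocess.run(cmd, env=env)
print(proc.returncode)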
107 changes: 54 additions & 53 deletions test/test_cuda/test_vllm.py
@@ -21,56 +21,57 @@
]


# @pytest.mark.skipif(
# not current_platform.is_cpu() and not current_platform.is_xpu() and not current_platform.is_cuda(),
# reason="only supports CPU/XPU/CUDA backend.",
# )
# @pytest.mark.parametrize("model", MODELS)
# def test_auto_round(model):
# # Sample prompts.
# prompts = [
# "The capital of France is",
# "The future of AI is",
# ]
# # Create a sampling params object.
# sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# # Create an LLM.
# QUANTIZATION = "auto-round"
# llm = LLM(model=model, quantization=QUANTIZATION, trust_remote_code=True, tensor_parallel_size=1)
# # Generate texts from the prompts.
# # The output is a list of RequestOutput objects
# # that contain the prompt, generated text, and other information.
# outputs = llm.generate(prompts, sampling_params)
# # Print the outputs.
# for output in outputs:
# prompt = output.prompt
# generated_text = output.outputs[0].text
# if "France" in prompt:
# assert "Paris" in generated_text
#
#
# @pytest.mark.parametrize("model", MODELS)
# def test_vllm_lm_eval(model):
# if shutil.which("auto-round") is None:
# pytest.skip("auto-round CLI not available")
#
# env = os.environ.copy()
# env["VLLM_SKIP_WARMUP"] = "true"
# env["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
#
# cmd = [
# "auto-round",
# "--model",
# model,
# "--eval",
# "--tasks",
# "lambada_openai",
# "--eval_bs",
# "8",
# "--limit",
# "10",
# "--vllm",
# ]
#
# proc = subprocess.run(cmd, env=env, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
# assert proc.returncode == 0, f"auto-round failed (rc={proc.returncode}):\n{proc.stdout}"
@pytest.mark.skipif(
not current_platform.is_cpu() and not current_platform.is_xpu() and not current_platform.is_cuda(),
reason="only supports CPU/XPU/CUDA backend.",
)
@pytest.mark.parametrize("model", MODELS)
def test_auto_round(model):
# Sample prompts.
prompts = [
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create an LLM.
QUANTIZATION = "auto-round"
llm = LLM(model=model, quantization=QUANTIZATION, trust_remote_code=True, tensor_parallel_size=1)
# Generate texts from the prompts.
# The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
if "France" in prompt:
assert "Paris" in generated_text


@pytest.mark.parametrize("model", MODELS)
def test_vllm_lm_eval(model):
if shutil.which("auto-round") is None:
pytest.skip("auto-round CLI not available")

env = os.environ.copy()
env["VLLM_SKIP_WARMUP"] = "true"
env["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

cmd = [
"auto-round",
"--model",
model,
"--eval",
"--tasks",
"lambada_openai",
"--eval_bs",
"8",
"--eval_backend",
"vllm",
"--limit",
"10",
]

proc = subprocess.run(cmd, env=env, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
assert proc.returncode == 0, f"auto-round failed (rc={proc.returncode}):\n{proc.stdout}"