18 changes: 17 additions & 1 deletion auto_round/eval/eval_cli.py
@@ -13,6 +13,7 @@
# limitations under the License.
import argparse
import os
import time

from auto_round.utils import (
clear_memory,
@@ -66,6 +67,13 @@ def __init__(self, *args, **kwargs):
help="Limit the number of examples per task. "
"If <1, limit is a percentage of the total number of examples.",
)
self.add_argument(
"--eval_backend",
default="hf",
type=str,
choices=["hf", "vllm"],
help="Use hf backend for evaluation by default.",
)
# vllm related arguments
self.add_argument("--revision", default=None, type=str, help="model revision for vllm")
self.add_argument("--tokenizer", default=None, type=str, help="tokenizer to use with vllm")
@@ -104,7 +112,15 @@ def _eval_init(tasks, model_path, device, disable_trust_remote_code=False, dtype


def eval(args):
import time
if args.eval_backend == "vllm":
try:
assert isinstance(args.model, str), "vllm evaluation only supports model name or path."
eval_with_vllm(args)
return
except Exception as e: # pragma: no cover
print(f"vllm evaluation failed: {e}, fallback to default hf backend evaluation.")
args.eval_backend = "hf"
clear_memory()

tasks, model_args, device_str = _eval_init(
args.tasks, args.model, args.device_map, args.disable_trust_remote_code, args.eval_model_dtype
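For reference, the new flag is exercised from the CLI the same way the updated test below does it. A minimal standalone sketch, assuming the auto-round console script is installed; the model id is a placeholder (any HF model name or local path), and on a vllm failure eval() falls back to the hf backend automatically:

import os
import subprocess

env = os.environ.copy()
env["VLLM_SKIP_WARMUP"] = "true"

cmd = [
    "auto-round",
    "--model", "<model-name-or-path>",  # placeholder: any HF model id or local path
    "--eval",
    "--tasks", "lambada_openai",
    "--eval_bs", "8",
    "--eval_backend", "vllm",  # new flag added by this PR; defaults to "hf"
    "--limit", "10",
]

# Run the evaluation; a non-zero return code indicates failure.
proc = subprocess.run(cmd, env=env)
print(proc.returncode)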
107 changes: 54 additions & 53 deletions test/test_cuda/test_vllm.py
@@ -21,56 +21,57 @@
]


# @pytest.mark.skipif(
# not current_platform.is_cpu() and not current_platform.is_xpu() and not current_platform.is_cuda(),
# reason="only supports CPU/XPU/CUDA backend.",
# )
# @pytest.mark.parametrize("model", MODELS)
# def test_auto_round(model):
# # Sample prompts.
# prompts = [
# "The capital of France is",
# "The future of AI is",
# ]
# # Create a sampling params object.
# sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# # Create an LLM.
# QUANTIZATION = "auto-round"
# llm = LLM(model=model, quantization=QUANTIZATION, trust_remote_code=True, tensor_parallel_size=1)
# # Generate texts from the prompts.
# # The output is a list of RequestOutput objects
# # that contain the prompt, generated text, and other information.
# outputs = llm.generate(prompts, sampling_params)
# # Print the outputs.
# for output in outputs:
# prompt = output.prompt
# generated_text = output.outputs[0].text
# if "France" in prompt:
# assert "Paris" in generated_text
#
#
# @pytest.mark.parametrize("model", MODELS)
# def test_vllm_lm_eval(model):
# if shutil.which("auto-round") is None:
# pytest.skip("auto-round CLI not available")
#
# env = os.environ.copy()
# env["VLLM_SKIP_WARMUP"] = "true"
# env["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
#
# cmd = [
# "auto-round",
# "--model",
# model,
# "--eval",
# "--tasks",
# "lambada_openai",
# "--eval_bs",
# "8",
# "--limit",
# "10",
# "--vllm",
# ]
#
# proc = subprocess.run(cmd, env=env, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
# assert proc.returncode == 0, f"auto-round failed (rc={proc.returncode}):\n{proc.stdout}"
@pytest.mark.skipif(
not current_platform.is_cpu() and not current_platform.is_xpu() and not current_platform.is_cuda(),
reason="only supports CPU/XPU/CUDA backend.",
)
@pytest.mark.parametrize("model", MODELS)
def test_auto_round(model):
# Sample prompts.
prompts = [
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create an LLM.
QUANTIZATION = "auto-round"
llm = LLM(model=model, quantization=QUANTIZATION, trust_remote_code=True, tensor_parallel_size=1)
# Generate texts from the prompts.
# The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
if "France" in prompt:
assert "Paris" in generated_text


@pytest.mark.parametrize("model", MODELS)
def test_vllm_lm_eval(model):
if shutil.which("auto-round") is None:
pytest.skip("auto-round CLI not available")

env = os.environ.copy()
env["VLLM_SKIP_WARMUP"] = "true"
env["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

cmd = [
"auto-round",
"--model",
model,
"--eval",
"--tasks",
"lambada_openai",
"--eval_bs",
"8",
"--eval_backend",
"vllm",
"--limit",
"10",
]

proc = subprocess.run(cmd, env=env, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
assert proc.returncode == 0, f"auto-round failed (rc={proc.returncode}):\n{proc.stdout}"