add pad_to_buckets in evaluation for hpu performance #2011


Merged 3 commits on Sep 27, 2024
@@ -12,6 +12,7 @@
from torch.nn.functional import pad
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoConfig, AutoTokenizer
+from neural_compressor.torch.utils import is_hpex_available

parser = argparse.ArgumentParser()
parser.add_argument(
@@ -324,22 +325,26 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args):
user_model, _ = get_user_model()
tokenizer = AutoTokenizer.from_pretrained(args.model)
config = AutoConfig.from_pretrained(args.model)
-user_model = load(os.path.abspath(os.path.expanduser(args.output_dir)), user_model)
+user_model = load(
+    os.path.abspath(os.path.expanduser(args.output_dir)),
+    user_model,
+    device="hpu" if is_hpex_available() else "cpu",
+)
setattr(user_model, "config", config)
else:
user_model, tokenizer = get_user_model()


if args.accuracy:
user_model.eval()
-from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+from neural_compressor.evaluation.lm_eval import evaluate, LMEvalParser
eval_args = LMEvalParser(
model="hf",
user_model=user_model,
tokenizer=tokenizer,
batch_size=args.batch_size,
tasks=args.tasks,
device="cpu",
device="hpu" if is_hpex_available() else "cpu",
)
results = evaluate(eval_args)
for task_name in args.tasks.split(","):
@@ -352,7 +357,7 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args):

if args.performance:
user_model.eval()
-from intel_extension_for_transformers.transformers.llm.evaluation.lm_eval import evaluate, LMEvalParser
+from neural_compressor.evaluation.lm_eval import evaluate, LMEvalParser
import time

samples = args.iters * args.batch_size
@@ -363,7 +368,7 @@ def run_fn_for_gptq(model, dataloader_for_calibration, *args):
batch_size=args.batch_size,
tasks=args.tasks,
limit=samples,
device="cpu",
device="hpu" if is_hpex_available() else "cpu",
)
start = time.time()
results = evaluate(eval_args)
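
The hunks above make the same change in three places in the example script: the evaluation device is picked at runtime with is_hpex_available(), so the run targets Gaudi (HPU) when the Habana PyTorch extension is installed and falls back to CPU otherwise. A minimal sketch of driving the updated API this way; the model name, task, and batch size are illustrative placeholders, not values from this PR:

from transformers import AutoModelForCausalLM, AutoTokenizer

from neural_compressor.evaluation.lm_eval import LMEvalParser, evaluate
from neural_compressor.torch.utils import is_hpex_available

# Pick the device once; "hpu" only when the Habana extension is available.
device = "hpu" if is_hpex_available() else "cpu"

# Placeholder model and task, purely for illustration.
model_name = "facebook/opt-125m"
user_model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

eval_args = LMEvalParser(
    model="hf",
    user_model=user_model,   # evaluate the in-memory model directly
    tokenizer=tokenizer,
    batch_size=8,
    tasks="lambada_openai",
    device=device,           # "hpu" on Gaudi, "cpu" everywhere else
)
results = evaluate(eval_args)

Because the fallback is computed at runtime, the same script remains runnable on machines with no HPU stack installed.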
neural_compressor/evaluation/lm_eval/accuracy.py (62 additions, 7 deletions)
@@ -36,18 +36,26 @@
from pathlib import Path
from typing import Union

+import lm_eval
import numpy as np
-from lm_eval import utils
+from lm_eval import evaluator, utils
from lm_eval.loggers import WandbLogger
from lm_eval.tasks import TaskManager
from lm_eval.utils import make_table, simple_parse_args_string

-from neural_compressor.evaluation.lm_eval import evaluator
-from neural_compressor.evaluation.lm_eval.evaluator import request_caching_arg_to_dict

DEFAULT_RESULTS_FILE = "results.json"


+def request_caching_arg_to_dict(cache_requests: str) -> dict:
+    request_caching_args = {
+        "cache_requests": cache_requests in {"true", "refresh"},
+        "rewrite_requests_cache": cache_requests == "refresh",
+        "delete_requests_cache": cache_requests == "delete",
+    }
+
+    return request_caching_args


def _handle_non_serializable(o):
if isinstance(o, np.int64) or isinstance(o, np.int32):
return int(o)
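
The request_caching_arg_to_dict helper, previously imported from neural_compressor.evaluation.lm_eval.evaluator and now defined inline, maps the --cache_requests string onto the three boolean flags lm-eval's evaluator expects. The mapping below follows directly from the function body; the import path assumes the helper stays public in accuracy.py:

from neural_compressor.evaluation.lm_eval.accuracy import request_caching_arg_to_dict

# "true"    -> reuse the existing request cache
# "refresh" -> use the cache but rewrite it
# "delete"  -> drop the cache entirely
print(request_caching_arg_to_dict("refresh"))
# {'cache_requests': True, 'rewrite_requests_cache': True, 'delete_requests_cache': False}
print(request_caching_arg_to_dict("delete"))
# {'cache_requests': False, 'rewrite_requests_cache': False, 'delete_requests_cache': True}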
@@ -143,8 +151,57 @@ def cli_evaluate(args) -> None:

request_caching_args = request_caching_arg_to_dict(cache_requests=args.cache_requests)

+### update model with user_model ###
+if args.model_args is None:
+    args.model_args = ""
+
+# Replace lm-eval's registered HFLM with the patched version.
+from .models.huggingface import HFLM
+
+lm_eval.api.registry.MODEL_REGISTRY["hf-auto"] = HFLM
+lm_eval.api.registry.MODEL_REGISTRY["hf"] = HFLM
+lm_eval.api.registry.MODEL_REGISTRY["huggingface"] = HFLM
+
+if args.user_model is not None:
+    # Use a tiny checkpoint to build the LM wrapper; the model that is
+    # actually evaluated is the user_model passed in memory.
+    print(
+        "We use 'pretrained=Muennighoff/tiny-random-bert' "
+        + "to build the `LM` instance; the model actually run is the user_model you passed."
+    )
+    lm = lm_eval.api.registry.get_model(args.model).create_from_arg_string(
+        "pretrained=Muennighoff/tiny-random-bert",
+        {
+            "batch_size": args.batch_size,
+            "max_batch_size": args.max_batch_size,
+            "device": args.device,
+        },
+    )
+    lm._model = args.user_model
+    if args.tokenizer is not None:
+        lm.tokenizer = args.tokenizer
+    else:
+        assert False, "Please provide a tokenizer in the evaluation function"
+elif isinstance(args.model_args, dict):
+    lm = lm_eval.api.registry.get_model(args.model).create_from_arg_obj(
+        args.model_args,
+        {
+            "batch_size": args.batch_size,
+            "max_batch_size": args.max_batch_size,
+            "device": args.device,
+        },
+    )
+else:
+    lm = lm_eval.api.registry.get_model(args.model).create_from_arg_string(
+        args.model_args,
+        {
+            "batch_size": args.batch_size,
+            "max_batch_size": args.max_batch_size,
+            "device": args.device,
+        },
+    )
+lm.pad_to_buckets = args.pad_to_buckets

results = evaluator.simple_evaluate(
-model=args.model,
+model=lm,
model_args=args.model_args,
tasks=task_names,
num_fewshot=args.num_fewshot,
@@ -163,8 +220,6 @@ def cli_evaluate(args) -> None:
random_seed=args.seed[0],
numpy_random_seed=args.seed[1],
torch_random_seed=args.seed[2],
-user_model=args.user_model, # to validate the model in memory,
-tokenizer=args.tokenizer, # to use tokenizer in mem,
**request_caching_args,
)
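
The block above re-points lm-eval's Hugging Face model aliases at the HFLM subclass shipped in neural_compressor, builds the LM wrapper from a tiny throwaway checkpoint when a user_model is supplied in memory, injects the real model and tokenizer, and tags the wrapper with pad_to_buckets before simple_evaluate(model=lm, ...) runs; that is also why the user_model and tokenizer keyword arguments are dropped from the simple_evaluate call. A condensed sketch of the same injection pattern outside cli_evaluate, using a small public checkpoint and task name as stand-ins for an already-quantized model (the pad_to_buckets comment states the assumed intent; its actual effect lives in the patched HFLM, which is not part of this hunk):

import lm_eval
from lm_eval import evaluator
from transformers import AutoModelForCausalLM, AutoTokenizer

from neural_compressor.evaluation.lm_eval.models.huggingface import HFLM

# Route every Hugging Face alias to the patched HFLM so HPU-specific
# handling such as pad_to_buckets is honoured.
for alias in ("hf-auto", "hf", "huggingface"):
    lm_eval.api.registry.MODEL_REGISTRY[alias] = HFLM

# Stand-ins for a model that already lives in memory (placeholders, not PR values).
user_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")

# Build the wrapper from a tiny checkpoint, then swap in the real model
# and tokenizer, mirroring what accuracy.py does above.
lm = lm_eval.api.registry.get_model("hf").create_from_arg_string(
    "pretrained=Muennighoff/tiny-random-bert",
    {"batch_size": 8, "max_batch_size": None, "device": "cpu"},
)
lm._model = user_model
lm.tokenizer = tokenizer
lm.pad_to_buckets = True  # assumed: pad batches to bucketed lengths so HPU graphs are reused

results = evaluator.simple_evaluate(model=lm, tasks=["lambada_openai"])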
