42 | 42 |     set_cuda_visible_devices,
43 | 43 | )
44 | 44 |
| 45 | +os.environ["TOKENIZERS_PARALLELISM"] = "false" |
| 46 | + |
45 | 47 |
46 | 48 | class BasicArgumentParser(argparse.ArgumentParser): |
47 | 49 |
@@ -322,6 +324,27 @@ def __init__(self, *args, **kwargs): |
322 | 324 | help="Limit the number of examples per task. " |
323 | 325 | "If <1, limit is a percentage of the total number of examples.", |
324 | 326 | ) |
 | 327 | +        # vllm related arguments
 | 328 | +        self.add_argument("--revision", default=None, type=str, help="model revision for vllm")
 | 329 | +        self.add_argument("--tokenizer", default=None, type=str, help="tokenizer to use with vllm")
 | 330 | +        self.add_argument(
 | 331 | +            "--tokenizer_mode", default="auto", type=str, help="tokenizer mode for vllm (auto or slow)"
 | 332 | +        )
 | 333 | +        self.add_argument("--tokenizer_revision", default=None, type=str, help="tokenizer revision for vllm")
 | 334 | +        self.add_argument("--add_bos_token", action="store_true", help="add BOS token when using vllm")
 | 335 | +        self.add_argument("--prefix_token_id", default=None, type=int, help="prefix token id for vllm")
 | 336 | +        self.add_argument("--tensor_parallel_size", default=1, type=int, help="tensor parallel size for vllm")
 | 337 | +        self.add_argument("--data_parallel_size", default=1, type=int, help="data parallel size for vllm")
 | 338 | +        self.add_argument("--quantization", default=None, type=str, help="quantization setting for vllm")
 | 339 | +        self.add_argument("--max_gen_toks", default=256, type=int, help="max generation tokens for vllm")
 | 340 | +        self.add_argument("--swap_space", default=4, type=float, help="swap space (GiB) for vllm")
 | 341 | +        self.add_argument("--max_batch_size", default=None, type=int, help="max batch size for vllm")
 | 342 | +        self.add_argument("--max_length", default=None, type=int, help="max sequence length for vllm (overridden by --max_model_len)")
 | 343 | +        self.add_argument("--max_model_len", default=None, type=int, help="maximum model sequence length for vllm")
 | 344 | +        self.add_argument(
 | 345 | +            "--gpu_memory_utilization", default=0.9, type=float, help="target GPU memory utilization for vllm"
 | 346 | +        )
 | 347 | +        self.add_argument("--lora_local_path", default=None, type=str, help="local LoRA path for vllm")
325 | 348 |
326 | 349 |
327 | 350 | def setup_parser(): |
@@ -786,15 +809,16 @@ def eval(args): |
786 | 809 |     if (batch_size := args.eval_bs) is None:
787 | 810 |         batch_size = "auto:8"
788 | 811 |     is_gguf_file = False
789 | | -    if os.path.isfile(args.model) and args.model.endswith(".gguf"):
790 | | -        is_gguf_file = True
791 | | -        gguf_file = os.path.basename(args.model)
792 | | -        model = os.path.dirname(args.model)
793 | | -    else:
794 | | -        for file in os.listdir(args.model):
795 | | -            if file.endswith(".gguf"):
796 | | -                is_gguf_file = True
797 | | -                gguf_file = file
 | 812 | +    if os.path.exists(args.model):
 | 813 | +        if os.path.isfile(args.model) and args.model.endswith(".gguf"):
 | 814 | +            is_gguf_file = True
 | 815 | +            gguf_file = os.path.basename(args.model)
 | 816 | +            model = os.path.dirname(args.model)
 | 817 | +        elif os.path.isdir(args.model):  # avoid calling os.listdir() on a non-gguf file path
 | 818 | +            for file in os.listdir(args.model):
 | 819 | +                if file.endswith(".gguf"):
 | 820 | +                    is_gguf_file = True
 | 821 | +                    gguf_file = file
798 | 822 |     eval_model_dtype = get_model_dtype(args.eval_model_dtype)
799 | 823 |     if is_gguf_file:
800 | 824 |         import torch
@@ -949,3 +973,51 @@ def eval_task_by_task( |
949 | 973 |     print(make_table(res_all))
950 | 974 |
951 | 975 | print("total eval time:", time.time() - st) |
| 976 | + |
| 977 | + |
| 978 | +def eval_with_vllm(args): |
 | 979 | +    import time
 | 980 | +
 | 981 | +    st = time.time()
 | 982 | +
 | 983 | +    from lm_eval import evaluator  # pylint: disable=E0401
 | 984 | +    from lm_eval.models.vllm_causallms import VLLM  # pylint: disable=E0401
 | 985 | +    from lm_eval.utils import make_table  # pylint: disable=E0401
 | 986 | +
 | 987 | +    device_str, _ = get_device_and_parallelism(args.device)
 | 988 | +    eval_model_dtype = get_model_dtype(args.eval_model_dtype, "auto")
 | 989 | +    if (batch_size := args.eval_bs) is None:
 | 990 | +        batch_size = "auto:8"
 | 991 | +
 | 992 | +    vllm_lm = VLLM(
 | 993 | +        pretrained=args.model,
 | 994 | +        dtype=eval_model_dtype,
 | 995 | +        revision=args.revision,
 | 996 | +        trust_remote_code=not args.disable_trust_remote_code,
 | 997 | +        tokenizer=args.tokenizer,
 | 998 | +        tokenizer_mode=args.tokenizer_mode,
 | 999 | +        tokenizer_revision=args.tokenizer_revision,
 | 1000 | +        add_bos_token=args.add_bos_token,
 | 1001 | +        prefix_token_id=args.prefix_token_id,
 | 1002 | +        tensor_parallel_size=args.tensor_parallel_size,
 | 1003 | +        quantization=args.quantization,
 | 1004 | +        max_gen_toks=args.max_gen_toks,
 | 1005 | +        swap_space=args.swap_space,
 | 1006 | +        batch_size=batch_size,
 | 1007 | +        max_batch_size=args.max_batch_size,
 | 1008 | +        max_length=args.max_length,
 | 1009 | +        max_model_len=args.max_model_len,
 | 1010 | +        seed=args.seed,
 | 1011 | +        gpu_memory_utilization=args.gpu_memory_utilization,
 | 1012 | +        device=device_str,
 | 1013 | +        data_parallel_size=args.data_parallel_size,
 | 1014 | +        lora_local_path=args.lora_local_path,
 | 1015 | +    )
 | 1016 | +    res = evaluator.simple_evaluate(
 | 1017 | +        model=vllm_lm,
 | 1018 | +        tasks=args.tasks.split(",") if isinstance(args.tasks, str) else args.tasks,  # lm-eval expects a list of task names
 | 1019 | +        limit=args.limit,
 | 1020 | +    )
 | 1021 | +
 | 1022 | +    print(make_table(res))
 | 1023 | +    print("total eval time:", time.time() - st)