diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml
new file mode 100644
index 0000000000000..fa6ea236ef04f
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml
@@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
+model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.892
+  - name: "exact_match,flexible-extract"
+    value: 0.892
+limit: 250
+num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
new file mode 100644
index 0000000000000..02668702b83af
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
@@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
+model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.756
+  - name: "exact_match,flexible-extract"
+    value: 0.752
+limit: 250
+num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml
new file mode 100644
index 0000000000000..fb4b4915ab955
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml
@@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5
+model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.756
+  - name: "exact_match,flexible-extract"
+    value: 0.752
+limit: 250
+num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml
new file mode 100644
index 0000000000000..dec9164d1b84e
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml
@@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m mistralai/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5
+model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.616
+  - name: "exact_match,flexible-extract"
+    value: 0.632
+limit: 250
+num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/configs/models-large.txt b/.buildkite/lm-eval-harness/configs/models-large.txt
new file mode 100644
index 0000000000000..127ec5d97bcff
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/models-large.txt
@@ -0,0 +1,2 @@
+Meta-Llama-3-70B-Instruct.yaml
+Mixtral-8x7B-Instruct-v0.1.yaml
diff --git a/.buildkite/lm-eval-harness/configs/models-small.txt b/.buildkite/lm-eval-harness/configs/models-small.txt
new file mode 100644
index 0000000000000..273c5482db264
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/models-small.txt
@@ -0,0 +1,2 @@
+Meta-Llama-3-8B-Instruct.yaml
+Meta-Llama-3-8B-Instruct-FP8.yaml
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
new file mode 100644
index 0000000000000..fdb8ec5393b36
--- /dev/null
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+# We can use this script to compute baseline accuracy on GSM for transformers.
+#
+# Make sure you have lm-eval-harness installed:
+#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10
+
+usage() {
+    echo
+    echo "Runs lm eval harness on GSM8k using huggingface transformers."
+    echo "This pathway is intended to be used to create baselines for"
+    echo "our automated nm-test-accuracy workflow."
+    echo
+    echo "usage: ${0} <options>"
+    echo
+    echo "  -m    - huggingface stub or local directory of the model"
+    echo "  -b    - batch size to run the evaluation at"
+    echo "  -l    - limit number of samples to run"
+    echo "  -f    - number of fewshot samples to use"
+    echo
+}
+
+while getopts "m:b:l:f:" OPT; do
+  case ${OPT} in
+    m )
+        MODEL="$OPTARG"
+        ;;
+    b )
+        BATCH_SIZE="$OPTARG"
+        ;;
+    l )
+        LIMIT="$OPTARG"
+        ;;
+    f )
+        FEWSHOT="$OPTARG"
+        ;;
+    \? )
+        usage
+        exit 1
+        ;;
+  esac
+done
+
+lm_eval --model hf \
+    --model_args "pretrained=$MODEL,parallelize=True" \
+    --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
+    --batch_size "$BATCH_SIZE"
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
new file mode 100644
index 0000000000000..a2876bade8893
--- /dev/null
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+# We can use this script to compute baseline accuracy on GSM for vllm.
+# We use this for fp8, which HF does not support.
+#
+# Make sure you have lm-eval-harness installed:
+#   pip install lm-eval==0.4.2
+
+usage() {
+    echo
+    echo "Runs lm eval harness on GSM8k using vllm."
+    echo "This pathway is intended to be used to create baselines for"
+    echo "our automated nm-test-accuracy workflow."
+    echo
+    echo "usage: ${0} <options>"
+    echo
+    echo "  -m    - huggingface stub or local directory of the model"
+    echo "  -b    - batch size to run the evaluation at"
+    echo "  -l    - limit number of samples to run"
+    echo "  -f    - number of fewshot samples to use"
+    echo "  -t    - tensor parallel size to run at"
+    echo
+}
+
+while getopts "m:b:l:f:t:" OPT; do
+  case ${OPT} in
+    m )
+        MODEL="$OPTARG"
+        ;;
+    b )
+        BATCH_SIZE="$OPTARG"
+        ;;
+    l )
+        LIMIT="$OPTARG"
+        ;;
+    f )
+        FEWSHOT="$OPTARG"
+        ;;
+    t )
+        TP_SIZE="$OPTARG"
+        ;;
+    \? )
+        usage
+        exit 1
+        ;;
+  esac
+done
+
+lm_eval --model vllm \
+    --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \
+    --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
+    --batch_size "$BATCH_SIZE"
diff --git a/.buildkite/lm-eval-harness/run-tests.sh b/.buildkite/lm-eval-harness/run-tests.sh
new file mode 100644
index 0000000000000..b4fdde6dab425
--- /dev/null
+++ b/.buildkite/lm-eval-harness/run-tests.sh
@@ -0,0 +1,59 @@
+#!/bin/bash
+
+usage() {
+    echo
+    echo "Runs lm eval harness on GSM8k using vllm and compares to a"
+    echo "precomputed baseline (measured by HF transformers)."
+    echo
+    echo "usage: ${0} <options>"
+    echo
+    echo "  -c    - path to the test data config (e.g. configs/models-small.txt)"
+    echo "  -t    - tensor parallel size"
+    echo
+}
+
+SUCCESS=0
+
+while getopts "c:t:" OPT; do
+  case ${OPT} in
+    c )
+        CONFIG="$OPTARG"
+        ;;
+    t )
+        TP_SIZE="$OPTARG"
+        ;;
+    \? )
+        usage
+        exit 1
+        ;;
+  esac
+done
+
+# Parse list of configs.
+IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"
+
+for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
+do
+    LOCAL_SUCCESS=0
+
+    echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE ==="
+
+    export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG}
+    export LM_EVAL_TP_SIZE=$TP_SIZE
+    pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$?
+
+    if [[ $LOCAL_SUCCESS == 0 ]]; then
+        echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
+    else
+        echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
+    fi
+
+    SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
+
+done
+
+if [ "${SUCCESS}" -eq "0" ]; then
+    exit 0
+else
+    exit 1
+fi
diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
new file mode 100644
index 0000000000000..975841dad1c29
--- /dev/null
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -0,0 +1,54 @@
+"""
+Run the LM eval harness on a model and compare the results against an
+HF baseline computed offline. Configs are found in configs/$MODEL.yaml.
+
+* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
+* export LM_EVAL_TP_SIZE=4
+* pytest -s test_lm_eval_correctness.py
+"""
+
+import os
+from pathlib import Path
+
+import lm_eval
+import numpy
+import yaml
+
+RTOL = 0.02
+TEST_DATA_FILE = os.environ.get(
+    "LM_EVAL_TEST_DATA_FILE",
+    ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
+
+TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)
+
+
+def launch_lm_eval(eval_config):
+    model_args = f"pretrained={eval_config['model_name']}," \
+                 f"tensor_parallel_size={TP_SIZE}"
+
+    results = lm_eval.simple_evaluate(
+        model="vllm",
+        model_args=model_args,
+        tasks=[task["name"] for task in eval_config["tasks"]],
+        num_fewshot=eval_config["num_fewshot"],
+        limit=eval_config["limit"],
+        batch_size="auto")
+
+    return results
+
+
+def test_lm_eval_correctness():
+    eval_config = yaml.safe_load(
+        Path(TEST_DATA_FILE).read_text(encoding="utf-8"))
+
+    # Launch eval requests.
+    results = launch_lm_eval(eval_config)
+
+    # Confirm scores match ground truth.
+    for task in eval_config["tasks"]:
+        for metric in task["metrics"]:
+            ground_truth = metric["value"]
+            measured_value = results["results"][task["name"]][metric["name"]]
+            print(f'{task["name"]} | {metric["name"]}: '
+                  f'ground_truth={ground_truth} | measured={measured_value}')
+            assert numpy.isclose(ground_truth, measured_value, rtol=RTOL)
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 0a0bb55675458..6ed3b5be54d6a 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -197,6 +197,22 @@ steps:
   - pip install aiohttp
   - bash run-benchmarks.sh
 
+- label: LM Eval Small Models
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  commands:
+  - pip install lm-eval
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - bash ./run-tests.sh -c configs/models-small.txt -t 1
+
+- label: LM Eval Large Models
+  gpu: a100
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  commands:
+  - pip install lm-eval
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - bash ./run-tests.sh -c configs/models-large.txt -t 4
+
 - label: Documentation Build
   working_dir: "/vllm-workspace/test_docs/docs"
   no_gpu: True
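
Taken together, the pieces above define a three-step workflow for covering a new model. A minimal sketch, assuming a hypothetical model stub my-org/My-Model and config file configs/My-Model.yaml (neither exists in this change), and that lm-eval is installed:

    # 1. Compute the GSM8k baseline offline with HF transformers.
    #    (For quantized models HF cannot run, e.g. FP8, use
    #    run-lm-eval-gsm-vllm-baseline.sh with an explicit -t instead.)
    bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh \
        -m my-org/My-Model -b 32 -l 250 -f 5    # my-org/My-Model is hypothetical

    # 2. Record the reported exact_match scores in configs/My-Model.yaml
    #    (same shape as Meta-Llama-3-8B-Instruct.yaml above) and append the
    #    filename to configs/models-small.txt or configs/models-large.txt.

    # 3. Check that vllm reproduces the baseline within RTOL (0.02).
    #    run-tests.sh resolves configs relative to $PWD, so run it from
    #    the lm-eval-harness directory, as the pipeline's working_dir does.
    cd .buildkite/lm-eval-harness
    bash run-tests.sh -c configs/models-small.txt -t 1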
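
To reproduce a single model's check locally without the wrapper script, the pytest can also be driven directly through the two environment variables it reads, following the docstring in test_lm_eval_correctness.py:

    cd .buildkite/lm-eval-harness
    export LM_EVAL_TEST_DATA_FILE=$PWD/configs/Meta-Llama-3-8B-Instruct.yaml
    export LM_EVAL_TP_SIZE=1
    pytest -s test_lm_eval_correctness.py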