Skip to content

Commit

Permalink
Add lm-eval correctness test (vllm-project#210)
Browse files — browse the repository at this point in the history
  • Loading branch information
dbarbuzzi authored May 10, 2024
1 parent 3a31485 commit affd4f4
Show / hide file tree
Showing 21 changed files with 531 additions and 259 deletions.
6 changes: 5 additions & 1 deletion .github/actions/nm-lm-eval-accuracy/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@ runs:
steps:
- id: lm-eval
run: |
# move source directories
mv vllm vllm-ignore || echo "no 'vllm' folder to move"
mv csrc csrc-ignore || echo "no 'csrc' folder to move"
COMMIT=${{ github.sha }}
VENV="${{ inputs.venv }}-${COMMIT:0:7}"
source $(pyenv root)/versions/${{ inputs.python }}/envs/${VENV}/bin/activate
Expand All @@ -20,7 +24,7 @@ runs:
pip3 install pytest openai==1.3.9
SUCCESS=0
pytest .github/scripts/test_lm_eval_sweep.py -s -v || SUCCESS=$?
pytest -v tests/accuracy/test_lm_eval_correctness.py || SUCCESS=$?
echo "test=${SUCCESS}" >> "$GITHUB_OUTPUT"
exit ${SUCCESS}
shell: bash
4 changes: 4 additions & 0 deletions .github/actions/nm-lm-eval-smoke/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@ runs:
steps:
- id: lm-eval
run: |
# move source directories
mv vllm vllm-ignore || echo "no 'vllm' folder to move"
mv csrc csrc-ignore || echo "no 'csrc' folder to move"
COMMIT=${{ github.sha }}
VENV="${{ inputs.venv }}-${COMMIT:0:7}"
source $(pyenv root)/versions/${{ inputs.python }}/envs/${VENV}/bin/activate
Expand Down
5 changes: 5 additions & 0 deletions .github/data/nm_benchmark_weekly_configs_list.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
neuralmagic/benchmarks/configs/benchmark_serving.json
neuralmagic/benchmarks/configs/benchmark_throughput.json
neuralmagic/benchmarks/configs/benchmark_throughput_decode.json
neuralmagic/benchmarks/configs/benchmark_throughput_prefill.json
neuralmagic/benchmarks/configs/benchmark_remote_push.json
4 changes: 2 additions & 2 deletions .github/scripts/lm_eval_compare_hf_vs_vllm.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def print_results(data_to_print: List = None,
def check_passing_score(results_dict: Dict = None,
alpha: float = None) -> bool:
for task in results_dict:
p_value = task["p_value"]
p_value = results_dict[task]["p_value"]
if p_value <= alpha:
return False
return True
Expand Down Expand Up @@ -120,6 +120,6 @@ def parse_args():
all_res[task1[0]] = {"z": z, "p_value": p_value}
print_results([results_hf["results"], results_vllm["results"]], all_res,
args.alpha)
if not check_passing_score:
if not check_passing_score(all_res, args.alpha):
print("Accuracy test failed!")
exit(1)
223 changes: 0 additions & 223 deletions .github/scripts/test_lm_eval_sweep.py

This file was deleted.

43 changes: 28 additions & 15 deletions .github/workflows/build-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ on:
workflow_call:
inputs:
wf_category:
description: "categories: REMOTE, NIGHTLY, RELEASE"
description: "categories: REMOTE, NIGHTLY, WEEKLY, RELEASE"
type: string
default: "REMOTE"
python:
Expand Down Expand Up @@ -177,17 +177,30 @@ jobs:
push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}"
secrets: inherit

# TODO: decide if this should build or use the whl
# single gpu
# TODO: this should only run if doing a NIGHTLY or RELEASE
# Accuracy-Smoke-AWS-AVX2-32G-A10G-24G:
# if: ${{ inputs.wf_category == 'NIGHTLY' || inputs.wf_category == 'RELEASE' }}
# uses: ./.github/workflows/nm-lm-eval-smoke.yml
# with:
# label: ${{ inputs.test_label_solo }}
# timeout: ${{ inputs.benchmark_timeout }}
# gitref: ${{ github.ref }}
# Gi_per_thread: ${{ inputs.Gi_per_thread }}
# nvcc_threads: ${{ inputs.nvcc_threads }}
# python: ${{ inputs.python }}
# secrets: inherit
TEST-ACCURACY-SMOKE:
needs: [BUILD]
if: inputs.wf_category == 'NIGHTLY'
uses: ./.github/workflows/nm-lm-eval-smoke.yml
with:
label: ${{ inputs.test_label_solo }}
timeout: ${{ inputs.benchmark_timeout }}
gitref: ${{ inputs.gitref }}
Gi_per_thread: ${{ inputs.Gi_per_thread }}
nvcc_threads: ${{ inputs.nvcc_threads }}
python: ${{ inputs.python }}
whl: ${{ needs.BUILD.outputs.whl }}
secrets: inherit

TEST-ACCURACY-FULL:
needs: [BUILD]
if: ${{ inputs.wf_category == 'WEEKLY' || inputs.wf_category == 'RELEASE' }}
uses: ./.github/workflows/nm-lm-eval-accuracy.yml
with:
label: ${{ inputs.test_label_multi }}
timeout: ${{ inputs.benchmark_timeout }}
gitref: ${{ inputs.gitref }}
Gi_per_thread: ${{ inputs.Gi_per_thread }}
nvcc_threads: ${{ inputs.nvcc_threads }}
python: ${{ inputs.python }}
whl: ${{ needs.BUILD.outputs.whl }}
secrets: inherit
4 changes: 2 additions & 2 deletions .github/workflows/nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ run-name: ${{ github.actor }} triggered nightly on ${{ github.ref }}
on:
schedule:
# * is a special character in YAML so you have to quote this string
- cron: '0 1 * * *'
- cron: '0 1 * * 1-6' # nightly run (Mon-Sat)

workflow_dispatch:
inputs:
Expand All @@ -27,7 +27,7 @@ jobs:
test_label_solo: aws-avx2-32G-a10g-24G
test_label_multi: aws-avx2-192G-4-a10g-96G
test_timeout: 480
test_skip_list:
test_skip_list: neuralmagic/tests/skip-for-nightly.txt

benchmark_label: aws-avx2-32G-a10g-24G
benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt
Expand Down
Loading

0 comments on commit affd4f4

Please sign in to comment.