neuralmagic · robertgshaw2-neuralmagic · Jun 21, 2024 · Jun 8, 2024 · Jun 8, 2024 · Jun 8, 2024
diff --git a/.github/actions/nm-lm-eval-smoke/action.yml b/.github/actions/nm-lm-eval-smoke/action.yml
diff --git a/...ub/actions/nm-lm-eval-accuracy/action.yml → .github/actions/nm-lm-eval/action.yml b/...ub/actions/nm-lm-eval-accuracy/action.yml → .github/actions/nm-lm-eval/action.yml
@@ -1,21 +1,20 @@
-name: run lm-eval full accuracy test
-description: 'run lm-eval full accuracy test'
+name: run lm-eval accuracy test
+description: 'run lm-eval accuracy test'
 inputs:
   python:
     description: 'python version, e.g. 3.10.12'
     required: true
   venv:
     description: 'name for python virtual environment'
     required: true
+  lm_eval_configuration:
+    description: 'file containing test configuration'
+    required: true
 runs:
   using: composite
   steps:
   - id: lm-eval
     run: |
-      # move source directories
-      mv vllm vllm-ignore || echo "no 'vllm' folder to move"
-      mv csrc csrc-ignore || echo "no 'csrc' folder to move"
-
       if [ -n "${{ inputs.venv }}" ]; then
         COMMIT=${{ github.sha }}
         VENV="${{ inputs.venv }}-${COMMIT:0:7}"
@@ -26,7 +25,9 @@ runs:
       pip3 install pytest openai==1.3.9
 
       SUCCESS=0
-      pytest -v tests/accuracy/test_lm_eval_correctness.py || SUCCESS=$?
-      echo "test=${SUCCESS}" >> "$GITHUB_OUTPUT"
+      # run the lm-eval wrapper script; the configuration path is quoted so a
+      # value containing spaces or glob characters reaches -c as one argument
+      ./.github/scripts/nm-run-lm-eval-vllm -c "${{ inputs.lm_eval_configuration }}" || SUCCESS=$?
+      echo "lm_eval=${SUCCESS}" >> "$GITHUB_OUTPUT"
       exit ${SUCCESS}
     shell: bash
diff --git a/.github/scripts/lm_eval_compare_hf_vs_vllm.py b/.github/scripts/lm_eval_compare_hf_vs_vllm.py
diff --git a/.github/scripts/nm-run-lm-eval-gsm-hf-baseline b/.github/scripts/nm-run-lm-eval-gsm-hf-baseline
@@ -0,0 +1,63 @@
+#!/bin/bash
+# We can use this script to compute baseline accuracy on GSM for transformers.
+#
+# Make sure you have lm-eval-harness installed:
+#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10
+
+# Print the command-line help text to stdout.
+usage() {
+    echo
+    echo "Runs lm eval harness on GSM8k using huggingface transformers."
+    echo "This pathway is intended to be used to create baselines for "
+    echo "our automated nm-test-accuracy workflow"
+    echo
+    echo "usage: ${0} <options>"
+    echo
+    echo "  -m    - huggingface stub or local directory of the model"
+    echo "  -b    - batch size to run the evaluation at"
+    echo "  -d    - device to use (e.g. cuda, cuda:0, auto, cpu)"
+    echo "  -l    - limit number of samples to run"
+    echo "  -f    - number of fewshot samples to use"
+    echo
+}
+
+# Parse the command-line flags into variables; an unknown flag or a flag
+# missing its argument prints the usage text and exits with status 1.
+while getopts "m:b:d:l:f:" OPT; do
+  case ${OPT} in
+    m )
+        MODEL="$OPTARG"
+        ;;
+    b )
+        BATCH_SIZE="$OPTARG"
+        ;;
+    d )
+        DEVICE="$OPTARG"
+        ;;
+    l )
+        LIMIT="$OPTARG"
+        ;;
+    f )
+        FEWSHOT="$OPTARG"
+        ;;
+    \? )
+        usage
+        exit 1
+        ;;
+  esac
+done
+
+# Every option feeds the lm_eval command line below, so fail early with the
+# usage text instead of handing lm_eval a command with missing values.
+if [ -z "$MODEL" ] || [ -z "$BATCH_SIZE" ] || [ -z "$DEVICE" ] || \
+   [ -z "$LIMIT" ] || [ -z "$FEWSHOT" ]; then
+  echo "error: -m, -b, -d, -l and -f are all required" >&2
+  usage
+  exit 1
+fi
+
+# Expansions are quoted so a value containing spaces stays a single argument.
+lm_eval --model hf \
+  --model_args "pretrained=$MODEL" \
+  --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
+  --batch_size "$BATCH_SIZE" --device "$DEVICE"
diff --git a/.github/scripts/nm-run-lm-eval-vllm b/.github/scripts/nm-run-lm-eval-vllm
@@ -0,0 +1,42 @@
+#!/bin/bash
+# We can use this script to run lm-eval-harness on GSM8k against vllm and
+# compare the result to a precomputed HF transformers baseline.
+#
+# Make sure you have lm-eval-harness installed:
+#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10
+
+# Print the command-line help text to stdout.
+usage() {
+    echo
+    echo "Runs lm eval harness on GSM8k using vllm server and compares to "
+    echo "precomputed baseline (measured by HF transformers)."
+    echo
+    echo "usage: ${0} <options>"
+    echo
+    echo "  -c    - path to the test data config (e.g. neuralmagic/lm-eval/YOUR_CONFIG.yaml)"
+    echo
+}
+
+# Parse the command-line flags; an unknown flag or a flag missing its
+# argument prints the usage text and exits with status 1.
+while getopts "c:" OPT; do
+  case ${OPT} in
+    c )
+        CONFIG="$OPTARG"
+        ;;
+    \? )
+        usage
+        exit 1
+        ;;
+  esac
+done
+
+# The config path is handed to pytest below, so fail early when it is empty.
+if [ -z "$CONFIG" ]; then
+  echo "error: -c <config> is required" >&2
+  usage
+  exit 1
+fi
+
+# Pass the config path to the test via the LM_EVAL_TEST_DATA_FILE variable.
+LM_EVAL_TEST_DATA_FILE="$CONFIG" pytest -v tests/accuracy/test_lm_eval_correctness.py
diff --git a/.github/workflows/nm-build-test.yml b/.github/workflows/nm-build-test.yml
@@ -46,7 +46,7 @@ on:
         type: string
         required: true
       test_skip_list:
-        description: 'file containing tests to skip'
+        description: 'file containing tests to skip (see neuralmagic/tests)'
         type: string
         required: true
       # benchmark related parameters
@@ -66,6 +66,19 @@ on:
         description: "When set to true, the workflow pushes all benchmarking results to gh-pages UI"
         type: string
         default: "false"
+      # lm-eval related parameters
+      lm_eval_label:
+        description: "requested runner label (specifies instance)"
+        type: string
+        default: ""
+      lm_eval_timeout:
+        description: "time limit for lm_eval in minutes"
+        type: string
+        default: "60"
+      lm_eval_configuration:
+        description: "configuration for lm-eval test (see neuralmagic/lm-eval)"
+        type: string
+        default: ""
 
 jobs:
 
@@ -134,16 +147,14 @@ jobs:
             push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}"
         secrets: inherit
 
-    TEST-ACCURACY-FULL:
+    LM-EVAL-SOLO:
       needs: [BUILD]
-      if: ${{ inputs.wf_category == 'WEEKLY' || inputs.wf_category == 'RELEASE' }}
-      uses: ./.github/workflows/nm-test-accuracy-full.yml
+      uses: ./.github/workflows/nm-lm-eval.yml
       with:
-        label: ${{ inputs.test_label_multi }}
-        timeout: ${{ inputs.benchmark_timeout }}
+        label: ${{ inputs.test_label_solo }}
+        timeout: ${{ inputs.lm_eval_timeout }}
         gitref: ${{ inputs.gitref }}
-        Gi_per_thread: ${{ inputs.Gi_per_thread }}
-        nvcc_threads: ${{ inputs.nvcc_threads }}
         python: ${{ inputs.python }}
         whl: ${{ needs.BUILD.outputs.whl }}
+        lm_eval_configuration: ${{ inputs.lm_eval_configuration }}
       secrets: inherit