[ci] fix sagemaker test suite

deepjavalibrary · May 21, 2024 · 73c967a · 73c967a
1 parent f8d162b
commit 73c967a
Show file tree

Hide file tree

Showing 3 changed files with 171 additions and 210 deletions.
diff --git a/.github/workflows/sagemaker-integration.yml b/.github/workflows/sagemaker-integration.yml
@@ -4,7 +4,7 @@ on:
   workflow_dispatch:
     inputs:
       mode:
-        description: "release/nightly containers to test. Default is nightly"
+        description: "candidate release version, or nightly. Default is nightly"
         required: false
         default: 'nightly'
       sagemaker-repository:
@@ -48,14 +48,15 @@ jobs:
       cpu_instance_id1: ${{ steps.create_cpu1.outputs.action_cpu_instance_id }}
       cpu_instance_id2: ${{ steps.create_cpu2.outputs.action_cpu_instance_id }}
 
-  # These tests are SLOW, and we only have 2 ml.g5.12xlarge instances available for testing
-  # We parallelize the tests into two separate tracks, each using 1 instance for a few tests.
-  # There's probably a better way to do this, but for now this works.
-  # If you add a test, please try to keep the groups balanced
-  endpoint-tests-group-1:
+
+  endpoint-tests:
     runs-on: [ self-hosted, cpu ]
     timeout-minutes: 120
     needs: create-runners
+    strategy:
+      fail-fast: false
+      matrix:
+        container: [lmi, tensorrt-llm]
     env:
       run_benchmark: ${{ github.event.inputs.run_benchmark || 'true' }}
       image_type: ${{ github.event.inputs.mode || 'nightly' }}
@@ -70,60 +71,43 @@ jobs:
       - name: Install SageMaker Python SDK
         working-directory: tests/integration
         run: |
-          ./install_sagemaker_pysdk.sh ${{ github.event.inputs.sagemaker-repository }} $ {{ github.event.inputs.repository-branch }}
+          ./install_sagemaker_pysdk.sh ${{ github.event.inputs.sagemaker-repository }} ${{ github.event.inputs.repository-branch }}
       - name: Configure AWS Credentials
         uses: aws-actions/configure-aws-credentials@v4
         with:
           aws-region: us-west-2
-      - name: Test gpt2xl
+      - name: Test llama3-8b
         working-directory: tests/integration
         run: |
-          python3 llm/sagemaker-endpoint-tests.py gpt2-xl djl ${image_type} ${run_benchmark}
+          python3 llm/sagemaker-endpoint-tests.py llama3-8b sme ${image_type} ${{ matrix.container}} ${run_benchmark}
           echo "sleep 30 seconds to allow endpoint deletion"
           sleep 30
-      - name: Test opt-1.3b
+      - name: Test mistral-7b
         if: success() || failure()
         working-directory: tests/integration
         run: |
-          python3 llm/sagemaker-endpoint-tests.py opt-1-3-b djl ${image_type} ${run_benchmark}
+          python3 llm/sagemaker-endpoint-tests.py mistral-7b sme ${image_type} ${{ matrix.container}} ${run_benchmark}
           echo "sleep 30 seconds to allow endpoint deletion"
           sleep 30
-        
-  endpoint-tests-group-2:
-    runs-on: [ self-hosted, cpu ]
-    timeout-minutes: 120
-    needs: create-runners
-    env:
-      run_benchmark: ${{ github.event.inputs.run_benchmark || 'true' }}
-      image_type: ${{ github.event.inputs.mode || 'nightly' }}
-    steps:
-      - uses: actions/checkout@v4
-      - name: Set up Python3
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.10.x'
-      - name: Install pip dependencies
-        run: pip3 install -U boto3 awscli
-      - name: Install SageMaker Python SDK
+      - name: Test phi-2
+        if: success() || failure()
         working-directory: tests/integration
         run: |
-          ./install_sagemaker_pysdk.sh ${{ github.event.inputs.sagemaker-repository }} $ {{ github.event.inputs.repository-branch }}
-      - name: Configure AWS Credentials
-        uses: aws-actions/configure-aws-credentials@v4
-        with:
-          aws-region: us-west-2
-      - name: Test gpt-j-6b
+          python3 llm/sagemaker-endpoint-tests.py phi-2 sme ${image_type} ${{ matrix.container}} ${run_benchmark}
+          echo "sleep 30 seconds to allow endpoint deletion"
+          sleep 30
+      - name: Test Multi Model Endpoint
         if: success() || failure()
         working-directory: tests/integration
         run: |
-          python3 llm/sagemaker-endpoint-tests.py gpt-j-6b djl ${image_type} ${run_benchmark}
+          python3 llm/sagemaker-endpoint-tests.py mme_common mme ${image_type} ${{ matrix.container}} ${run_benchmark}
           echo "sleep 30 seconds to allow endpoint deletion"
           sleep 30
 
   stop-runners:
     if: always()
     runs-on: [ self-hosted, scheduler ]
-    needs: [ create-runners, endpoint-tests-group-1, endpoint-tests-group-2 ]
+    needs: [ create-runners, endpoint-tests ]
     steps:
       - name: Cleanup dangling SageMaker resources
         run: |

diff --git a/tests/integration/install_sagemaker_pysdk.sh b/tests/integration/install_sagemaker_pysdk.sh
@@ -3,10 +3,10 @@
 github_repository=$1
 repository_branch=$2
 
-if [[ -z "$github_repository" ]]; then
+if [[ -n "$github_repository" ]]; then
   git clone $github_repository
   cd sagemaker-python-sdk
-  if [[ -z "$repository_branch" ]]; then
+  if [[ -n "$repository_branch" ]]; then
     git checkout $repository_branch
   fi
   pip install .