Skip to content

Commit

Permalink
[ci] fix sagemaker test suite
Browse files Browse the repository at this point in the history
  • Loading branch information
siddvenk committed May 21, 2024
1 parent f8d162b commit 73c967a
Show file tree
Hide file tree
Showing 3 changed files with 171 additions and 210 deletions.
56 changes: 20 additions & 36 deletions .github/workflows/sagemaker-integration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ on:
workflow_dispatch:
inputs:
mode:
description: "release/nightly containers to test. Default is nightly"
description: "candidate release version, or nightly. Default is nightly"
required: false
default: 'nightly'
sagemaker-repository:
Expand Down Expand Up @@ -48,14 +48,15 @@ jobs:
cpu_instance_id1: ${{ steps.create_cpu1.outputs.action_cpu_instance_id }}
cpu_instance_id2: ${{ steps.create_cpu2.outputs.action_cpu_instance_id }}

# These tests are SLOW, and we only have 2 ml.g5.12xlarge instances available for testing
# We parallelize the tests into two separate tracks, each using 1 instance for a few tests.
# There's probably a better way to do this, but for now this works.
# If you add a test, please try to keep the groups balanced
endpoint-tests-group-1:

endpoint-tests:
runs-on: [ self-hosted, cpu ]
timeout-minutes: 120
needs: create-runners
strategy:
fail-fast: false
matrix:
container: [lmi, tensorrt-llm]
env:
run_benchmark: ${{ github.event.inputs.run_benchmark || 'true' }}
image_type: ${{ github.event.inputs.mode || 'nightly' }}
Expand All @@ -70,60 +71,43 @@ jobs:
- name: Install SageMaker Python SDK
working-directory: tests/integration
run: |
./install_sagemaker_pysdk.sh ${{ github.event.inputs.sagemaker-repository }} $ {{ github.event.inputs.repository-branch }}
./install_sagemaker_pysdk.sh ${{ github.event.inputs.sagemaker-repository }} ${{ github.event.inputs.repository-branch }}
- name: Configure AWS Credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: us-west-2
- name: Test gpt2xl
- name: Test llama3-8b
working-directory: tests/integration
run: |
python3 llm/sagemaker-endpoint-tests.py gpt2-xl djl ${image_type} ${run_benchmark}
python3 llm/sagemaker-endpoint-tests.py llama3-8b sme ${image_type} ${{ matrix.container}} ${run_benchmark}
echo "sleep 30 seconds to allow endpoint deletion"
sleep 30
- name: Test opt-1.3b
- name: Test mistral-7b
if: success() || failure()
working-directory: tests/integration
run: |
python3 llm/sagemaker-endpoint-tests.py opt-1-3-b djl ${image_type} ${run_benchmark}
python3 llm/sagemaker-endpoint-tests.py mistral-7b sme ${image_type} ${{ matrix.container}} ${run_benchmark}
echo "sleep 30 seconds to allow endpoint deletion"
sleep 30
endpoint-tests-group-2:
runs-on: [ self-hosted, cpu ]
timeout-minutes: 120
needs: create-runners
env:
run_benchmark: ${{ github.event.inputs.run_benchmark || 'true' }}
image_type: ${{ github.event.inputs.mode || 'nightly' }}
steps:
- uses: actions/checkout@v4
- name: Set up Python3
uses: actions/setup-python@v5
with:
python-version: '3.10.x'
- name: Install pip dependencies
run: pip3 install -U boto3 awscli
- name: Install SageMaker Python SDK
- name: Test phi-2
if: success() || failure()
working-directory: tests/integration
run: |
./install_sagemaker_pysdk.sh ${{ github.event.inputs.sagemaker-repository }} $ {{ github.event.inputs.repository-branch }}
- name: Configure AWS Credentials
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: us-west-2
- name: Test gpt-j-6b
python3 llm/sagemaker-endpoint-tests.py phi-2 sme ${image_type} ${{ matrix.container}} ${run_benchmark}
echo "sleep 30 seconds to allow endpoint deletion"
sleep 30
- name: Test Multi Model Endpoint
if: success() || failure()
working-directory: tests/integration
run: |
python3 llm/sagemaker-endpoint-tests.py gpt-j-6b djl ${image_type} ${run_benchmark}
python3 llm/sagemaker-endpoint-tests.py mme_common mme ${image_type} ${{ matrix.container}} ${run_benchmark}
echo "sleep 30 seconds to allow endpoint deletion"
sleep 30
stop-runners:
if: always()
runs-on: [ self-hosted, scheduler ]
needs: [ create-runners, endpoint-tests-group-1, endpoint-tests-group-2 ]
needs: [ create-runners, endpoint-tests ]
steps:
- name: Cleanup dangling SageMaker resources
run: |
Expand Down
4 changes: 2 additions & 2 deletions tests/integration/install_sagemaker_pysdk.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
github_repository=$1
repository_branch=$2

if [[ -z "$github_repository" ]]; then
if [[ -n "$github_repository" ]]; then
git clone $github_repository
cd sagemaker-python-sdk
if [[ -z "$repository_branch" ]]; then
if [[ -n "$repository_branch" ]]; then
git checkout $repository_branch
fi
pip install .
Expand Down
Loading

0 comments on commit 73c967a

Please sign in to comment.