# Nightly Throughput Stress #44
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
name: Nightly Throughput Stress

# Triggers: one daily scheduled run, plus manual dispatch with optional
# overrides for the durations and the job timeout.
on:
  schedule:
    # Run at 3 AM PST (11:00 UTC) - offset from existing nightly
    - cron: '00 11 * * *'
  workflow_dispatch:
    inputs:
      # All inputs are optional; each one carries its own default.
      duration:
        description: 'Test duration (e.g., 6h, 1h)'
        required: false
        default: '5h'
        type: string
      timeout:
        description: 'Scenario timeout (should always be more than duration)'
        required: false
        default: '5h30m'
        type: string
      job_timeout_minutes:
        description: 'GitHub Actions job timeout in minutes'
        required: false
        default: 360
        type: number
# Workflow-level token permissions: read-only access to repository contents.
permissions:
  contents: read

env:
  # Workflow configuration.
  # Each value is resolved left to right: the manual-dispatch input, then the
  # configuration variable, then the literal default.
  TEST_DURATION: ${{ inputs.duration || vars.NIGHTLY_TEST_DURATION || '5h' }}
  TEST_TIMEOUT: ${{ inputs.timeout || vars.NIGHTLY_TEST_TIMEOUT || '5h30m' }}

  # Logging and artifacts
  WORKER_LOG_DIR: /tmp/throughput-stress-logs

  # AWS S3 metrics upload ARN
  AWS_S3_METRICS_UPLOAD_ROLE_ARN: ${{ vars.AWS_S3_METRICS_UPLOAD_ROLE_ARN }}

  # Omes configuration: repository and ref to check out, and the run identifier
  # derived from the GitHub run id.
  OMES_REPO: temporalio/omes
  OMES_REF: main
  RUN_ID: ${{ github.run_id }}-throughput-stress
jobs:
  # Builds the SDK from this checkout, runs the OMES throughput_stress scenario
  # against it with a local Temporal dev server, then uploads the exported
  # metrics to S3. Logs and a Slack alert are produced on failure/cancellation.
  throughput-stress:
    runs-on: ubuntu-latest-4-cores
    # Resolved left to right: manual input, configuration variable, then 360.
    # fromJSON parses the selected value as JSON, so a numeric string yields a number.
    timeout-minutes: ${{ fromJSON(inputs.job_timeout_minutes || vars.NIGHTLY_JOB_TIMEOUT_MINUTES || 360) }}
    permissions:
      contents: read
      actions: write
      # NOTE(review): presumably needed by the AWS credentials step to request
      # an OIDC token for role assumption - confirm.
      id-token: write
    steps:
      - name: Print test configuration
        run: |
          echo "=== Throughput Stress Test Configuration ==="
          echo "Duration: $TEST_DURATION"
          echo "Timeout: $TEST_TIMEOUT"
          echo "Run ID: $RUN_ID"
          echo "=========================================="

      - name: Checkout SDK
        uses: actions/checkout@v5
        with:
          submodules: recursive
          fetch-depth: 0

      # OMES is checked out into ./omes, next to the SDK sources.
      - name: Checkout OMES
        uses: actions/checkout@v5
        with:
          repository: ${{ env.OMES_REPO }}
          ref: ${{ env.OMES_REF }}
          path: omes

      # Go version and dependency cache key both come from the OMES checkout.
      - name: Setup Go
        uses: actions/setup-go@v5
        with:
          go-version-file: omes/go.mod
          cache-dependency-path: omes/go.sum

      - name: Set up Java
        uses: actions/setup-java@v5
        with:
          java-version: "11"
          distribution: "temurin"

      - name: Set up Gradle
        uses: gradle/actions/setup-gradle@v4

      # Build only; the test and virtualThreadTests tasks are excluded.
      - name: Build SDK
        run: ./gradlew build -x test -x virtualThreadTests

      - name: Install Temporal CLI
        uses: temporalio/setup-temporal@v0

      - name: Install Prometheus
        run: |
          PROM_VERSION="3.8.0"
          PROM_DIST="prometheus-${PROM_VERSION}.linux-amd64"
          wget -q "https://github.com/prometheus/prometheus/releases/download/v${PROM_VERSION}/${PROM_DIST}.tar.gz"
          tar xzf "${PROM_DIST}.tar.gz"
          sudo mv "${PROM_DIST}/prometheus" /usr/local/bin/
          prometheus --version

      - name: Setup log directory
        run: mkdir -p "$WORKER_LOG_DIR"

      # The server is started in the background; stdout and stderr go to the
      # log directory so they are included in the failure artifact.
      - name: Start Temporal Server
        run: |
          temporal server start-dev \
            --db-filename temporal-throughput-stress.sqlite \
            --sqlite-pragma journal_mode=WAL \
            --sqlite-pragma synchronous=OFF \
            --headless &> "$WORKER_LOG_DIR/temporal-server.log" &

      - name: Run throughput stress scenario with local SDK
        working-directory: omes
        run: |
          # This makes the pipeline return the exit code of the first failing command
          # Otherwise the output of the `tee` command will be used
          # (which is troublesome when the scenario fails but the `tee` command succeeds)
          set -o pipefail
          # Use run-scenario-with-worker to build and run in one step
          # Pass the SDK directory as --version for local testing
          # Note: The hardcoded values below match OMES defaults, except:
          # - visibility-count-timeout: 5m (vs 3m default)
          # to give CI a bit more time for visibility consistency
          # Expansions are quoted so input-provided values are passed as single arguments.
          go run ./cmd run-scenario-with-worker \
            --scenario throughput_stress \
            --language java \
            --version "$(pwd)/.." \
            --run-id "$RUN_ID" \
            --duration "$TEST_DURATION" \
            --timeout "$TEST_TIMEOUT" \
            --max-concurrent 10 \
            --prom-listen-address 127.0.0.1:9091 \
            --worker-prom-listen-address 127.0.0.1:9092 \
            --prom-instance-addr 127.0.0.1:9090 \
            --prom-instance-config \
            --prom-export-worker-metrics "$RUN_ID.parquet" \
            --option internal-iterations=10 \
            --option continue-as-new-after-iterations=3 \
            --option sleep-time=1s \
            --option visibility-count-timeout=5m \
            --option min-throughput-per-hour=1000 \
            2>&1 | tee "$WORKER_LOG_DIR/scenario.log"

      # Runs even when earlier steps failed, so the upload below is still attempted.
      - name: Configure AWS credentials
        if: always()
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ env.AWS_S3_METRICS_UPLOAD_ROLE_ARN }}
          aws-region: us-west-2

      # The branch ref is read from the runner-provided GITHUB_REF environment
      # variable instead of being templated into the script text, so the ref
      # value is never interpreted as shell code.
      - name: Upload metrics to S3
        if: always()
        run: |
          DATE=$(date +%Y-%m-%d)
          # Use test/ prefix on non-main branches
          PREFIX="language=java/date=$DATE"
          if [[ "$GITHUB_REF" != "refs/heads/main" ]]; then
            PREFIX="test/$PREFIX"
          fi
          aws s3 cp "omes/$RUN_ID.parquet" \
            "s3://cloud-data-ingest-prod/github/sdk_load_test/$PREFIX/$RUN_ID.parquet"

      - name: Upload logs on failure
        if: failure() || cancelled()
        uses: actions/upload-artifact@v4
        with:
          name: throughput-stress-logs
          path: ${{ env.WORKER_LOG_DIR }}
          retention-days: 30

      # Sent for both failed and cancelled runs.
      - name: Notify Slack on failure
        if: failure() || cancelled()
        uses: slackapi/slack-github-action@v2
        with:
          webhook-type: incoming-webhook
          payload: |
            {
              "text": "Nightly Java throughput stress test failed",
              "blocks": [
                {
                  "type": "section",
                  "text": {
                    "type": "mrkdwn",
                    "text": "*Nightly Throughput Stress Failed* :x:\n\n*Repository:* ${{ github.repository }}\n*Duration:* ${{ env.TEST_DURATION }}\n*Run:* <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View Logs>\n*Triggered by:* ${{ github.event_name == 'schedule' && 'Scheduled' || github.actor }}"
                  }
                }
              ]
            }
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_SDK_ALERTS_WEBHOOK }}