Closed

Commits (35)
fd96bfa
added dirs
THardy98 Dec 10, 2025
e55b89c
removed containers dir - can reuse existing worker dockerfiles
THardy98 Dec 13, 2025
edd1434
add dockerfile for omes-client that includes prometheus sidecar (sepa…
THardy98 Dec 13, 2025
7aa8fd6
add ecs tasks
THardy98 Dec 13, 2025
4d35a1e
add github action to run omes scenarios on ECS
THardy98 Dec 13, 2025
3f049aa
fix --prom-instance-config flag, was not reading arg correctly before
THardy98 Dec 16, 2025
26ce86b
bump prom version in cli-prometheus.Dockerfile
THardy98 Dec 16, 2025
e7df26f
Add workflow to run worker perf tests on ECS. Add logic in cli-promet…
THardy98 Dec 17, 2025
cc1e7ad
Add secret variable (correct naming)
THardy98 Dec 17, 2025
8707222
Allow for push/nightly triggers by making input not required. Provide…
THardy98 Dec 17, 2025
0c075df
use ubuntu-latest
THardy98 Dec 17, 2025
f0d9992
checkout sdk debugging
THardy98 Dec 17, 2025
e6cd8f3
use inputs directly in Checkout SDK
THardy98 Dec 17, 2025
a900c02
disable recursive
THardy98 Dec 17, 2025
ec71fea
dynamically determine SDK_GITREF (necessary because the default for g…
THardy98 Dec 17, 2025
564722d
add gh token
THardy98 Dec 17, 2025
cf52739
use ecs-perf-test for testing (has cli-prometheus Dockerfile)
THardy98 Dec 17, 2025
dfce1bf
checkout Omes recursive (forgot upstream proto submodule)
THardy98 Dec 17, 2025
99ee33c
add missing brackets around string concat in jq
THardy98 Dec 17, 2025
fd382f3
generate task definitions per worker image
THardy98 Dec 18, 2025
0f7674d
handle authHeader as apikey for python worker
THardy98 Dec 18, 2025
482db36
add task role arn to task definition, misc fixes
THardy98 Dec 18, 2025
3f1e89b
test non-Go workers, add fixes to read --auth-header flag as api key
THardy98 Dec 18, 2025
688c3d5
fixes for TS worker, included a PREBUILD_STEP due to avoid downstream…
THardy98 Dec 18, 2025
1fc55ad
long-running test on Go SDK v1.38.0
THardy98 Dec 18, 2025
c11fd24
revert changes for testing
THardy98 Dec 18, 2025
cf7b6eb
fix for semgrep scan
THardy98 Dec 18, 2025
3aab2d6
(python) trigger long-running test
THardy98 Dec 18, 2025
95b1332
(ts) trigger long-running test
THardy98 Dec 18, 2025
1161858
(dotnet) trigger long-running test
THardy98 Dec 18, 2025
6d67a08
rebase on main
THardy98 Dec 19, 2025
4c42cf3
copy ./metrics for cli-prometheus dockerfile
THardy98 Dec 19, 2025
43bdeee
linting / formatting
THardy98 Dec 19, 2025
52270af
remove redundant flag set overriding flags
THardy98 Dec 19, 2025
1f0b2c7
rebase overwrote fix for prom-config.yml path
THardy98 Dec 19, 2025
266 changes: 266 additions & 0 deletions .github/workflows/run-scenario-ecs.yml
@@ -0,0 +1,266 @@
name: Run Omes Scenario on ECS

on:
  push:
    branches: [per-version-taskdefs]
  workflow_dispatch:
    inputs:
      language:
        description: SDK language (go, java, python, dotnet, typescript)
        default: go
        required: false
        type: string
      sdk-repo:
        description: SDK repository (e.g., temporalio/sdk-go)
        default: temporalio/sdk-go
        required: false
        type: string
      sdk-gitref:
        description: SDK git ref (commit, tag, branch)
        default: main
        required: false
        type: string
      is-experiment:
        description: 'Mark this run as an experiment (excluded from nightly dashboards)'
        default: true
        required: false
        type: boolean

      scenario:
        description: Omes scenario to run
        default: throughput_stress
        required: false
        type: string
      duration:
        description: Test duration
        default: 5h
        required: false
        type: string
      timeout:
        description: Scenario timeout
        default: 5h30m
        required: false
        type: string
      max-concurrent:
        description: Max concurrent workflows
        default: "10"
        required: false
        type: string
      scenario-options:
        description: JSON array of --option values
        default: >
          ["internal-iterations=10",
          "continue-as-new-after-iterations=3",
          "sleep-time=1s",
          "visibility-count-timeout=5m",
          "min-throughput-per-hour=1000"]
        required: false
        type: string

jobs:
  run-omes:
    runs-on: ubuntu-latest
    permissions:
      id-token: write
      contents: read

    env:
      AWS_REGION: us-west-2
      ECS_CLUSTER: worker-perf-test-cluster
      SECURITY_GROUP: ${{ secrets.AWS_ECS_SECURITY_GROUP }}
      SUBNET: ${{ secrets.AWS_ECS_SUBNET }}
      TEMPORAL_ADDRESS: ${{ secrets.TEMPORAL_CLOUD_ADDRESS }}
      TEMPORAL_NAMESPACE: ${{ secrets.TEMPORAL_CLOUD_NAMESPACE }}
      TEMPORAL_API_KEY: ${{ secrets.TEMPORAL_CLOUD_API_KEY }}

      # Default values for push/nightly runs
      INPUT_SDK_GITREF: ${{ github.event.inputs.sdk-gitref }}
      LANGUAGE: ${{ github.event.inputs.language || 'go' }}
      SDK_REPO: ${{ github.event.inputs.sdk-repo || 'temporalio/sdk-go' }}
      IS_EXPERIMENT: ${{ github.event.inputs.is-experiment || true }}
      SCENARIO: ${{ github.event.inputs.scenario || 'throughput_stress' }}
      DURATION: ${{ github.event.inputs.duration || '10m' }}
      TIMEOUT: ${{ github.event.inputs.timeout || '20m' }}
      MAX_CONCURRENT: ${{ github.event.inputs['max-concurrent'] || '10' }}
      SCENARIO_OPTIONS: ${{ github.event.inputs['scenario-options'] || '["internal-iterations=10","continue-as-new-after-iterations=3","sleep-time=1s","visibility-count-timeout=5m","min-throughput-per-hour=1000"]' }}
      RUN_ID: ${{ github.run_id }}

    steps:
      - name: Checkout Omes
        uses: actions/checkout@v4
        with:
          repository: temporalio/omes
          ref: per-version-taskdefs
          path: omes
          submodules: recursive

      - name: Install jq
        run: sudo apt-get update && sudo apt-get install -y jq

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.AWS_OMES_SDK_PERF_AWS_ROLE }}
          aws-region: ${{ env.AWS_REGION }}

      - name: Login to Amazon ECR
        uses: aws-actions/amazon-ecr-login@v2

      - name: Determine SDK_GITREF
        id: sdk-gitref
        env:
          GH_TOKEN: ${{ github.token }}
        run: |
          if [ -n "$INPUT_SDK_GITREF" ]; then
            echo "SDK_GITREF=$INPUT_SDK_GITREF" >> $GITHUB_ENV
          else
            BRANCH=$(gh repo view "${{ env.SDK_REPO }}" --json defaultBranchRef -q '.defaultBranchRef.name')
            echo "SDK_GITREF=$BRANCH" >> $GITHUB_ENV
          fi

      - name: Checkout SDK
        uses: actions/checkout@v4
        with:
          repository: ${{ env.SDK_REPO }}
          ref: ${{ env.SDK_GITREF }}
          path: omes/sdk
          submodules: recursive

      # TODO(thomas) We need to explicitly build .NET for release mode
      # (will need to modify features to allow for --configuration Release flag)
      # Python is built in release mode by default by sdkbuild in features
      # Typescript just needs BUILD_CORE_RELEASE env var set (we set as an image build-arg)

      - name: Build and push worker image
        working-directory: omes
        env:
          ECR_REPO: ${{ secrets.AWS_ECR_URI }}
          GITREF: ${{ env.SDK_GITREF }}
        run: |
          set -e
          IMAGE_TAG="${LANGUAGE}-worker-${GITREF}"

          # BUILD_CORE_RELEASE is for TypeScript core-bridge release build (ignored by other languages)
          # PREBUILD_WORKER is also for Typescript, see typescript.Dockerfile for reasoning
          docker build \
            -f dockerfiles/${LANGUAGE}.Dockerfile \
            --build-arg SDK_DIR=./sdk \
            --build-arg SDK_VERSION=./repo \
            --build-arg BUILD_CORE_RELEASE=true \
            --build-arg PREBUILD_WORKER=true \
            -t $ECR_REPO:$IMAGE_TAG \
            .

          docker push $ECR_REPO:$IMAGE_TAG
Member:
We probably also need to clean up old images at some point. We get charged for those I'd assume?

Contributor Author (@THardy98, Dec 18, 2025):
We have lifecycle policies in our ECR instance, they are:

  • Delete untagged after 1 day (failed/partial pushes)
  • Keep only the 2 most recent python-worker-* images
  • Keep only the 2 most recent java-worker-* images
  • Keep only the 2 most recent go-worker-* images
  • Keep only the 2 most recent typescript-worker-* images
  • Keep only the 2 most recent dotnet-worker-* images
  • Keep only the 2 most recent omes-client* images

In principle, we could be really aggressive and delete all images after a day, since we're pushing them on each run anyway (I'm open to changing it if we want). The cost is pretty cheap regardless (100GB of storage is $10).
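
For reference, one retention rule of that kind could be expressed roughly as the following ECR lifecycle policy sketch (the repository name and exact rule wording here are assumptions, not copied from the actual ECR configuration):

# Hedged sketch: one untagged-image rule plus one "keep the 2 most recent
# go-worker-* images" rule; "omes-workers" is a placeholder repository name.
cat > lifecycle-policy.json << 'EOF'
{
  "rules": [
    {
      "rulePriority": 1,
      "description": "Delete untagged images after 1 day",
      "selection": {
        "tagStatus": "untagged",
        "countType": "sinceImagePushed",
        "countUnit": "days",
        "countNumber": 1
      },
      "action": { "type": "expire" }
    },
    {
      "rulePriority": 2,
      "description": "Keep only the 2 most recent go-worker-* images",
      "selection": {
        "tagStatus": "tagged",
        "tagPrefixList": ["go-worker-"],
        "countType": "imageCountMoreThan",
        "countNumber": 2
      },
      "action": { "type": "expire" }
    }
  ]
}
EOF

aws ecr put-lifecycle-policy \
  --repository-name omes-workers \
  --lifecycle-policy-text file://lifecycle-policy.json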

Member:
Dope. That works.

echo "WORKER_IMAGE=$ECR_REPO:$IMAGE_TAG" >> $GITHUB_ENV

      - name: Register worker task definition
Member:
Are we going to leave a bajillion "task definitions" lying around?

I'm no AWS expert but I do know one thing that's always a problem is cleaning stuff up.

Contributor Author (@THardy98, Dec 18, 2025):
Task definitions belong to a "task definition family". When you register new task definitions under a family, they are called "task definition revisions". A family is like a namespace for a bunch of different versions ("revisions") of a task definition. For example:

Task definition family: go-omes-worker

Task definition revisions:
go-omes-worker:1 (initial)
go-omes-worker:2 (registered a new task definition under the "go-omes-worker" family)
go-omes-worker:3 (...)

As I understand it, there's no monetary cost to having many task definition revisions (they are free). AWS imposes a hard limit of 1M per task definition family. So there's no immediate concern here.

We could likely add a hook to clean up the task definitions in the same trap we use to stop the worker ECS task.
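
A minimal sketch of that hook, reusing the cleanup trap from dockerfiles/cli-prometheus-entrypoint.sh below and assuming the registered ARN is passed to the client as a TASK_DEF_ARN environment variable (that variable is an assumption; the workflow does not set it today):

# Extend the existing cleanup() trap to also deregister this run's revision.
cleanup() {
  if [ -n "$WORKER_TASK_ARN" ] && [ -n "$ECS_CLUSTER" ]; then
    echo "Stopping worker task: $WORKER_TASK_ARN"
    aws ecs stop-task --cluster "$ECS_CLUSTER" --task "$WORKER_TASK_ARN" \
      --region "${AWS_REGION:-us-west-2}" --reason "Client exited" || true
  fi
  if [ -n "$TASK_DEF_ARN" ]; then
    # Deregistering marks the revision INACTIVE so it stops accumulating in listings.
    aws ecs deregister-task-definition \
      --task-definition "$TASK_DEF_ARN" \
      --region "${AWS_REGION:-us-west-2}" || true
  fi
}
trap cleanup EXIT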

Member:
Works for me.

        id: worker-task-def
        working-directory: omes
        env:
          WORKER_IMAGE: ${{ env.WORKER_IMAGE }}
          EXECUTION_ROLE_ARN: ${{ secrets.AWS_OMES_SDK_PERF_TASK_AWS_ROLE }}
        run: |
          set -e
          TASK_DEF=$(envsubst < ecs/templates/worker-task-definition.json)

          TASK_DEF_ARN=$(aws ecs register-task-definition \
            --cli-input-json "$TASK_DEF" \
            --query 'taskDefinition.taskDefinitionArn' \
            --output text)

          echo "arn=$TASK_DEF_ARN" >> "$GITHUB_OUTPUT"
          echo "Registered task definition: $TASK_DEF_ARN"

      - name: Build and push client image
        working-directory: omes
        env:
          ECR_REPO: ${{ secrets.AWS_ECR_URI }}
        run: |
          set -e
          IMAGE_TAG="omes-client"

          docker build \
            -f dockerfiles/cli-prometheus.Dockerfile \
            -t $ECR_REPO:$IMAGE_TAG \
            .

          docker push $ECR_REPO:$IMAGE_TAG
Member:
Is the fact that there's only one client image we re-write potentially going to cause issues? Should we tag this with omes' git hash?

Contributor Author (@THardy98):
No, the existing image will still exist, but only the latest omes-client image will be tagged

echo "CLIENT_IMAGE=$ECR_REPO:$IMAGE_TAG" >> $GITHUB_ENV

      - name: Start worker task
        working-directory: omes
        id: worker
        run: |
          set -e
          OVERRIDES=$(envsubst < ecs/templates/worker-overrides.json)

          TASK_ARN=$(aws ecs run-task \
            --cluster "$ECS_CLUSTER" \
            --launch-type FARGATE \
            --task-definition "${{ steps.worker-task-def.outputs.arn }}" \
            --network-configuration "awsvpcConfiguration={subnets=[$SUBNET],securityGroups=[$SECURITY_GROUP],assignPublicIp=ENABLED}" \
            --overrides "$OVERRIDES" \
            --query 'tasks[0].taskArn' \
            --output text
          )
          echo "task=$TASK_ARN" >> "$GITHUB_OUTPUT"

      - name: Wait for worker & get IP
        id: worker-ip
        run: |
          set -e
          aws ecs wait tasks-running \
            --cluster $ECS_CLUSTER \
            --tasks ${{ steps.worker.outputs.task }}

          IP=$(aws ecs describe-tasks \
            --cluster $ECS_CLUSTER \
            --tasks ${{ steps.worker.outputs.task }} \
            --query 'tasks[0].attachments[0].details[?name==`privateIPv4Address`].value' \
            --output text)

          echo "ip=$IP" >> $GITHUB_OUTPUT
          echo "Worker IP: $IP"

      - name: Start client task
        working-directory: omes
        id: client
        env:
          WORKER_IP: ${{ steps.worker-ip.outputs.ip }}
          WORKER_TASK_ARN: ${{ steps.worker.outputs.task }}
          S3_BUCKET_URI: ${{ secrets.AWS_SDK_PERF_TEST_S3_URI }}
        run: |
          set -e

          # Substitute static variables
          BASE_OVERRIDES=$(envsubst < ecs/templates/client-overrides.json)

          # Append dynamic --option flags
          OVERRIDES=$(echo "$BASE_OVERRIDES" | jq \
            --argjson options '${{ env.SCENARIO_OPTIONS }}' \
            '.containerOverrides[0].command += ($options | map(["--option", .]) | flatten)'
          )

          TASK_ARN=$(aws ecs run-task \
            --cluster "$ECS_CLUSTER" \
            --launch-type FARGATE \
            --task-definition omes-client \
            --network-configuration "awsvpcConfiguration={subnets=[$SUBNET],securityGroups=[$SECURITY_GROUP],assignPublicIp=ENABLED}" \
            --overrides "$OVERRIDES" \
            --query 'tasks[0].taskArn' \
            --output text)

          echo "task=$TASK_ARN" >> "$GITHUB_OUTPUT"
          echo "Started client: $TASK_ARN"

      - name: Cleanup on failure
        if: failure()
        continue-on-error: true
        run: |
          if [ -n "${{ steps.worker.outputs.task }}" ]; then
            aws ecs stop-task --cluster $ECS_CLUSTER --task ${{ steps.worker.outputs.task }} || true
          fi
          if [ -n "${{ steps.client.outputs.task }}" ]; then
            aws ecs stop-task --cluster $ECS_CLUSTER --task ${{ steps.client.outputs.task }} || true
          fi
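
A side note on the jq invocation in the "Start client task" step above: it expands the SCENARIO_OPTIONS array into interleaved --option flags on the client command. A standalone illustration, using a made-up containerOverrides document rather than the real ecs/templates/client-overrides.json:

# Hypothetical base overrides; only the shape matters here.
BASE='{"containerOverrides":[{"name":"omes-client","command":["run-scenario"]}]}'

echo "$BASE" | jq \
  --argjson options '["internal-iterations=10","sleep-time=1s"]' \
  '.containerOverrides[0].command += ($options | map(["--option", .]) | flatten)'
# command becomes:
#   ["run-scenario", "--option", "internal-iterations=10", "--option", "sleep-time=1s"]
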
3 changes: 1 addition & 2 deletions cmd/clioptions/metrics.go
@@ -105,8 +105,7 @@ func (p *PrometheusInstanceFlags) FlagSet(prefix string) *pflag.FlagSet {
 	}
 	p.fs = pflag.NewFlagSet(prefix+"prom_instance_options", pflag.ExitOnError)
 	p.fs.StringVar(&p.Address, prefix+"prom-instance-addr", "", "Prometheus instance address")
-	p.fs.StringVar(&p.ConfigPath, prefix+"prom-instance-config", "", "Start a local Prometheus instance with the specified config file (default: prom-config.yml)")
-	p.fs.Lookup(prefix + "prom-instance-config").NoOptDefVal = "prom-config.yml"
+	p.fs.StringVar(&p.ConfigPath, prefix+"prom-instance-config", "prom-config.yml", "Start a local Prometheus instance with the specified config file (default: prom-config.yml)")
 	p.fs.BoolVar(&p.Snapshot, "prom-snapshot", false, "Create a TSDB snapshot on shutdown")
 	p.fs.StringVar(&p.ExportWorkerMetricsPath, prefix+"prom-export-worker-metrics", "", "Export worker process metrics to the specified file on shutdown")
 	p.fs.StringVar(&p.ExportWorkerMetricsJob, prefix+"prom-export-worker-job", "omes-worker", "Name of the worker job to export metrics for")
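
The hunk above drops the NoOptDefVal in favor of a plain default for --prom-instance-config. A hedged illustration of the behavior difference, inferred from pflag's documented NoOptDefVal handling and the "was not reading arg correctly" commit message (the run-scenario invocation is only an example):

# Before: default "" with NoOptDefVal = "prom-config.yml". pflag only reads an
# explicit value for NoOptDefVal flags in the --flag=value form, so a
# space-separated value is not consumed as the flag's argument.
temporal-omes run-scenario --prom-instance-config my-config.yml
#   -> ConfigPath ends up as "prom-config.yml"; "my-config.yml" is left over
#      as a stray positional argument.

# After: plain default of "prom-config.yml".
temporal-omes run-scenario --prom-instance-config my-config.yml
#   -> ConfigPath is "my-config.yml"; omitting the flag entirely still falls
#      back to prom-config.yml.
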
51 changes: 51 additions & 0 deletions dockerfiles/cli-prometheus-entrypoint.sh
@@ -0,0 +1,51 @@
#!/bin/bash
set -e

# Cleanup worker on any exit (success, failure, or signal)
cleanup() {
if [ -n "$WORKER_TASK_ARN" ] && [ -n "$ECS_CLUSTER" ]; then
echo "Stopping worker task: $WORKER_TASK_ARN"
aws ecs stop-task --cluster "$ECS_CLUSTER" --task "$WORKER_TASK_ARN" --region "${AWS_REGION:-us-west-2}" --reason "Client exited" || true
fi
}
Review comment on lines +5 to +10 (Member):
This file name is about prometheus, but it's also just generally running omes and cleaning up the worker. Probably want a more generic file name

trap cleanup EXIT

# Generate prom-config.yml if WORKER_METRICS_HOST is set
# This enables scraping metrics from a remote worker (e.g., separate ECS task)
if [ -n "$WORKER_METRICS_HOST" ]; then
  cat > /app/prom-config.yml << PROMCFG
global:
  scrape_interval: 1s
scrape_configs:
  - job_name: 'omes-client'
    static_configs:
      - targets: ['localhost:9091']
  - job_name: 'omes-worker'
    static_configs:
      - targets: ['${WORKER_METRICS_HOST}:${WORKER_METRICS_PORT:-9092}']
PROMCFG
  echo "Generated prom-config.yml with worker target: ${WORKER_METRICS_HOST}:${WORKER_METRICS_PORT:-9092}"
fi

# Make dir to store metrics
mkdir -p /app/metrics

set +e
# Pass through all arguments to temporal-omes
/app/temporal-omes "$@"
EXIT_CODE=$?
set -e

if [ -n "$S3_BUCKET_URI" ] && [ $EXIT_CODE -eq 0 ]; then
  DATE=$(date +%Y-%m-%d)
  S3_KEY="${S3_BUCKET_URI}/is_experiment=${IS_EXPERIMENT:-true}/date=${DATE}/"

  echo "Uploading metrics to S3: ${S3_KEY}"
  aws s3 cp /app/metrics/ "${S3_KEY}" --recursive
else
  if [ $EXIT_CODE -ne 0 ]; then
    echo "Scenario failed. Skipping S3 upload."
  fi
fi

exit $EXIT_CODE