Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions .github/actions/upload-benchmark-results/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ inputs:
required: True
dry-run:
default: 'true'
# TODO (huydhn): Use this to gate the migration to oss_ci_benchmark_v3 on S3
schema-version:
default: 'v2'

runs:
using: composite
Expand All @@ -16,21 +19,22 @@ runs:
set -eux
python3 -mpip install boto3==1.35.33

# TODO (huydhn): Once the generic benchmark database is ready, this will be
# uploaded to S3 instead
- name: Upload benchmark results to DynamoDB
- name: Upload benchmark results
shell: bash
env:
BENCHMARK_RESULTS_DIR: ${{ inputs.benchmark-results-dir }}
DRY_RUN: ${{ inputs.dry-run }}
SCHEMA_VERSION: ${{ inputs.schema-version }}
run: |
set -eux

if [[ "${DRY_RUN}" == "true" ]]; then
python3 "${GITHUB_ACTION_PATH}/../../scripts/upload_benchmark_results.py" \
--benchmark-results-dir "${BENCHMARK_RESULTS_DIR}" \
--schema-version "${SCHEMA_VERSION}" \
--dry-run
else
python3 "${GITHUB_ACTION_PATH}/../../scripts/upload_benchmark_results.py" \
--benchmark-results-dir "${BENCHMARK_RESULTS_DIR}"
--benchmark-results-dir "${BENCHMARK_RESULTS_DIR}" \
--schema-version "${SCHEMA_VERSION}"
fi
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[{}, {"repo": "pytorch/pytorch"}, {"repo": "pytorch/pytorch", "workflow_id": 1}, {"repo": "pytorch/pytorch", "workflow_id": 1, "job_id": 1}]
101 changes: 91 additions & 10 deletions .github/scripts/upload_benchmark_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import gzip
import hashlib
import json
import logging
Expand All @@ -21,6 +22,9 @@
logging.basicConfig(level=logging.INFO)


OSSCI_BENCHMARKS_BUCKET = "ossci-benchmarks"


class ValidateDir(Action):
def __call__(
self,
Expand Down Expand Up @@ -57,6 +61,13 @@ def parse_args() -> Any:
default="torchci-oss-ci-benchmark",
help="the name of the DynamoDB table to upload to",
)
# v3 is defined at torchci/clickhouse_queries/oss_ci_benchmark_v3/query.sql
parser.add_argument(
"--schema-version",
choices=["v2", "v3"],
required=True,
help="the database schema to use",
)

return parser.parse_args()

Expand All @@ -69,7 +80,6 @@ def default(self, o: Any) -> Any:
return super().default(o)


# TODO (huydhn): This can be replaced by S3 path once we move to S3
def generate_partition_key(doc: Dict[str, Any]) -> str:
"""
Generate an unique partition key for the document on DynamoDB
Expand Down Expand Up @@ -106,24 +116,95 @@ def upload_to_dynamodb(
batch.put_item(Item=doc)


def generate_s3_path(filepath: str, schema_version: str) -> str:
    """
    Generate the S3 path to upload the benchmark results file to.

    The path has the shape ``{schema_version}/{repo}/{workflow_id}/{job_id}/{filename}``
    built from the first record in the JSON file that carries enough metadata.

    Returns an empty string when the file is empty or when no record carries
    a repo, workflow id, and job id (callers treat "" as "skip this file").
    """
    with open(filepath) as f:
        docs = json.load(f)

    if not docs:
        info(f"{filepath} is empty")
        return ""

    repo, workflow_id, job_id = "", 0, 0
    for doc in docs:
        repo = doc.get("repo", "")
        workflow_id = doc.get("workflow_id", 0)
        job_id = doc.get("job_id", 0)

        # Also handle ServiceLab records here: they carry the same concepts
        # under different field names
        workflow_id = workflow_id if workflow_id else doc.get("servicelab_experiment_id", 0)
        job_id = job_id if job_id else doc.get("servicelab_trial_id", 0)

        # We just need one record here to get some metadata to generate the s3 path
        if repo and workflow_id and job_id:
            break

    if not repo or not workflow_id or not job_id:
        info(
            f"{filepath} is without any information about the repo, workflow, or job id"
        )
        return ""

    filename = os.path.basename(filepath)
    # Fix: the computed filename was previously dropped from the returned key,
    # which would make every file in a job overwrite the same S3 object
    return f"{schema_version}/{repo}/{workflow_id}/{job_id}/{filename}"


def upload_to_s3(
    s3_bucket: str,
    filepath: str,
    schema_version: str,
    dry_run: bool = True,
) -> None:
    """
    Upload the benchmark results to S3
    """
    s3_path = generate_s3_path(filepath, schema_version)
    if not s3_path:
        info(f"Could not generate an S3 path for {filepath}, skipping...")
        return

    info(f"Upload {filepath} to s3://{s3_bucket}/{s3_path}")
    if dry_run:
        return

    # Copied from upload stats script
    with open(filepath) as f:
        payload = f.read()

    s3_object = boto3.resource("s3").Object(f"{s3_bucket}", f"{s3_path}")
    s3_object.put(
        Body=gzip.compress(payload.encode()),
        ContentEncoding="gzip",
        ContentType="application/json",
    )


def main() -> None:
    """
    Upload every JSON benchmark results file found in the input directory.

    For the legacy v2 schema the results still go to DynamoDB; all results
    additionally go to S3 (the v3 path). The scraped diff left both the old
    unconditional DynamoDB upload and the new conditional one in place, so
    this keeps only the post-change logic.
    """
    args = parse_args()
    schema_version = args.schema_version

    for file in os.listdir(args.benchmark_results_dir):
        # Only JSON result files are uploadable
        if not file.endswith(".json"):
            continue

        filepath = os.path.join(args.benchmark_results_dir, file)
        info(f"Loading {filepath}")

        # NB: This is for backward compatibility before we move to schema v3
        if schema_version == "v2":
            with open(filepath) as f:
                info(f"Uploading {filepath} to dynamoDB ({schema_version})")
                upload_to_dynamodb(
                    dynamodb_table=args.dynamodb_table,
                    # NB: DynamoDB only accepts decimal number, not float
                    docs=json.load(f, parse_float=Decimal),
                    generate_partition_key=generate_partition_key,
                    dry_run=args.dry_run,
                )

        upload_to_s3(
            s3_bucket=OSSCI_BENCHMARKS_BUCKET,
            filepath=filepath,
            schema_version=schema_version,
            dry_run=args.dry_run,
        )


if __name__ == "__main__":
Expand Down
12 changes: 10 additions & 2 deletions .github/workflows/test_upload_benchmark_results.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,16 @@ jobs:
steps:
- uses: actions/checkout@v3

- name: Test upload the benchmark results
- name: Test upload the benchmark results (v2)
uses: ./.github/actions/upload-benchmark-results
with:
benchmark-results-dir: .github/scripts/benchmark-results-dir-for-testing
benchmark-results-dir: .github/scripts/benchmark-results-dir-for-testing/v2
schema-version: v2
dry-run: true

- name: Test upload the benchmark results (v3)
uses: ./.github/actions/upload-benchmark-results
with:
benchmark-results-dir: .github/scripts/benchmark-results-dir-for-testing/v3
schema-version: v3
dry-run: true
Loading