Merged
106 commits
de42748
added checksums
anandhu-eng Sep 21, 2024
7249b73
corrected pre download clean
anandhu-eng Sep 21, 2024
b2b7dfc
Disabled check for condition
anandhu-eng Sep 21, 2024
a6224e3
Merge branch 'mlperf-inference' into checksum-branch-1
anandhu-eng Sep 21, 2024
67dc948
Proper exit for unhandled md5sum errors
anandhu-eng Sep 23, 2024
44076fb
Merge branch 'mlperf-inference' into checksum-branch-1
anandhu-eng Sep 23, 2024
729a65e
sdxl scc commit - WIP
anandhu-eng Sep 23, 2024
0343327
Merge branch 'GATEOverflow:mlperf-inference' into nvidia-sdxl-v4.1
anandhu-eng Sep 23, 2024
4986d1f
Restrict the self-hosted runs to the runner repo
arjunsuresh Sep 23, 2024
39a2d8e
Merge pull request #128 from anandhu-eng/nvidia-sdxl-v4.1
arjunsuresh Sep 23, 2024
e963f00
Merge branch 'mlcommons:mlperf-inference' into mlperf-inference
arjunsuresh Sep 23, 2024
1284f49
Merge pull request #285 from GATEOverflow/mlperf-inference
arjunsuresh Sep 23, 2024
30d90d2
Fix rocm pytorch install
arjunsuresh Sep 23, 2024
6373f62
Fixes for SCC24
arjunsuresh Sep 23, 2024
8ceb313
Update torchvision for rocm
arjunsuresh Sep 23, 2024
0d133c9
Update sut config name for SCC24
arjunsuresh Sep 23, 2024
14a6a66
Fix starting weights for nvidia mlperf inference sdxl
arjunsuresh Sep 23, 2024
a470621
Fix torchaudio installation for rocm
arjunsuresh Sep 23, 2024
af58441
Merge branch 'mlperf-inference' into checksum-branch-1
anandhu-eng Sep 24, 2024
d12083e
preclean fixed
anandhu-eng Sep 24, 2024
74030b2
deleted checksum for url -> cloud.*
anandhu-eng Sep 24, 2024
3566ac1
proper handling of pre_clean
anandhu-eng Sep 24, 2024
27861a5
reverted pre clean change
anandhu-eng Sep 24, 2024
54ae911
changes for custom sample id generation - SDXL
anandhu-eng Sep 24, 2024
dc9bf99
changes for custom sample id generation - SDXL
anandhu-eng Sep 24, 2024
0c98cbd
code clean
anandhu-eng Sep 24, 2024
d8a33bc
fixed bug
anandhu-eng Sep 24, 2024
b40ea46
fix pre download clean
anandhu-eng Sep 24, 2024
7c8984f
added gh action workflow for sdxl reference and nvidia
anandhu-eng Sep 24, 2024
6490673
Merge pull request #129 from anandhu-eng/checksum-branch-2
arjunsuresh Sep 24, 2024
a9bfa16
Merge branch 'mlperf-inference' into mlperf-inference
arjunsuresh Sep 24, 2024
f77dee2
Merge pull request #287 from GATEOverflow/mlperf-inference
arjunsuresh Sep 24, 2024
735b581
Fixes for coco2014 sample ids
arjunsuresh Sep 24, 2024
0784740
removed beam size
anandhu-eng Sep 24, 2024
efee47c
Merge branch 'mlperf-inference' into nvidia-sdxl-v4.1
anandhu-eng Sep 24, 2024
95c4132
Merge pull request #130 from anandhu-eng/nvidia-sdxl-v4.1
arjunsuresh Sep 24, 2024
29bed25
handled false condition in download-file
anandhu-eng Sep 24, 2024
2b33996
Merge pull request #125 from anandhu-eng/checksum-branch-1
arjunsuresh Sep 24, 2024
b89de1d
Cleanup of download-file run.sh
arjunsuresh Sep 24, 2024
f1ca1ee
Create github action for scc24 sdxl
anandhu-eng Sep 24, 2024
088a8d4
added checksum and clean code
anandhu-eng Sep 24, 2024
6fa8c6a
clean code
anandhu-eng Sep 24, 2024
39a3684
Update test-mlperf-inference-sdxl.yaml | Changed conflicting schedule…
arjunsuresh Sep 25, 2024
59a7e3d
Merge pull request #131 from anandhu-eng/nvidia-sdxl-v4.1
arjunsuresh Sep 25, 2024
2bccd71
Merge pull request #132 from anandhu-eng/checksum-branch-1
arjunsuresh Sep 25, 2024
8530385
Cleanups
arjunsuresh Sep 25, 2024
d1957bf
Fix precision for gptj test
arjunsuresh Sep 25, 2024
21c8170
Fix precision for gptj fp16
arjunsuresh Sep 25, 2024
70c1f9f
Fix precision for gptj fp16
arjunsuresh Sep 25, 2024
8c7a2c6
Fix precision for gptj fp16
arjunsuresh Sep 25, 2024
68c883c
Merge branch 'mlcommons:mlperf-inference' into mlperf-inference
arjunsuresh Sep 25, 2024
bc10367
Added support for cuda 12.6.1
arjunsuresh Sep 25, 2024
a877d21
Support install prefix for cuda install
arjunsuresh Sep 25, 2024
1171a54
Create code-review.yml
arjunsuresh Sep 25, 2024
253b052
Update code-review.yml
arjunsuresh Sep 25, 2024
545ddeb
Update code-review.yml
arjunsuresh Sep 25, 2024
b0a02ae
Support --install_prefix for cuda installation
arjunsuresh Sep 25, 2024
d57bd30
Support --install_prefix for cuda installation
arjunsuresh Sep 25, 2024
c1f2139
Support --extra_install_args for cuda installation
arjunsuresh Sep 25, 2024
30c817c
Merge pull request #290 from GATEOverflow/mlperf-inference
arjunsuresh Sep 25, 2024
5ccf5f7
Improve download-file run.sh
arjunsuresh Sep 25, 2024
a0775dd
Update code-review.yml
arjunsuresh Sep 25, 2024
3bbe8b7
Merge branch 'mlperf-inference' into mlperf-inference
arjunsuresh Sep 25, 2024
8484a75
Update code-review.yml
arjunsuresh Sep 25, 2024
69e95bb
Merge branch 'mlperf-inference' into mlperf-inference
arjunsuresh Sep 25, 2024
a6bad02
Update code-review.yml
arjunsuresh Sep 25, 2024
06539c5
Merge branch 'mlperf-inference' into mlperf-inference
arjunsuresh Sep 25, 2024
1f9c4bb
Update code-review.yml
arjunsuresh Sep 25, 2024
e1274e2
Merge branch 'mlperf-inference' into mlperf-inference
arjunsuresh Sep 25, 2024
35c9a87
Update code-review.yml
arjunsuresh Sep 25, 2024
f7620ea
Merge branch 'mlperf-inference' into mlperf-inference
arjunsuresh Sep 25, 2024
1c0f623
Update code-review.yml
arjunsuresh Sep 25, 2024
ac0bc37
Merge branch 'mlperf-inference' into mlperf-inference
arjunsuresh Sep 25, 2024
4f44126
Update code-review.yml
arjunsuresh Sep 25, 2024
a648268
Merge branch 'mlperf-inference' into mlperf-inference
arjunsuresh Sep 25, 2024
053682d
Update code-review.yml
arjunsuresh Sep 25, 2024
e2ec95f
Merge branch 'mlperf-inference' into mlperf-inference
arjunsuresh Sep 25, 2024
43f35a0
Update code-review.yml
arjunsuresh Sep 25, 2024
f19a182
Merge branch 'mlperf-inference' into mlperf-inference
arjunsuresh Sep 25, 2024
1debe15
Update code-review.yml
arjunsuresh Sep 25, 2024
4a7d477
Merge branch 'mlperf-inference' into mlperf-inference
arjunsuresh Sep 25, 2024
42db1f7
Update code-review.yml
arjunsuresh Sep 25, 2024
e5cc9ce
Update code-review.yml
arjunsuresh Sep 25, 2024
516aff1
Merge branch 'mlperf-inference' into mlperf-inference
arjunsuresh Sep 25, 2024
42ed9d3
Update code-review.yml
arjunsuresh Sep 25, 2024
04fc9f8
Merge branch 'mlperf-inference' into mlperf-inference
arjunsuresh Sep 25, 2024
76c9de9
Update code-review.yml
arjunsuresh Sep 25, 2024
468a9bd
Merge branch 'mlperf-inference' into mlperf-inference
arjunsuresh Sep 25, 2024
33597bf
Merge pull request #133 from arjunsuresh/mlperf-inference
arjunsuresh Sep 25, 2024
219ff8c
Update code-review.yml
arjunsuresh Sep 25, 2024
4d6fd03
Merge pull request #292 from mlcommons/main
arjunsuresh Sep 25, 2024
2033889
Merge pull request #293 from mlcommons/main
arjunsuresh Sep 25, 2024
0a9238f
Merge pull request #294 from mlcommons/mlperf-inference
arjunsuresh Sep 25, 2024
c05f1f7
Merge branch 'mlcommons:mlperf-inference' into mlperf-inference
arjunsuresh Sep 25, 2024
0e1489a
Improve get-cuda-devices to handle multiple GPUs, fixes #288
arjunsuresh Sep 25, 2024
f4a1ad2
Use updated get-cuda-devices in mlperf-inference
arjunsuresh Sep 25, 2024
eb8910b
Improved meta for app-mlperf-inference
arjunsuresh Sep 25, 2024
8d7d254
Stop the remaining mlperf runs for docker detached mode
arjunsuresh Sep 25, 2024
042079b
Fix number of accelerators (GPUs) for mlperf-inference
arjunsuresh Sep 25, 2024
cd24064
Update test-mlperf-inference-sdxl.yaml
arjunsuresh Sep 25, 2024
3dbea4a
Update test-scc24-sdxl.yaml
arjunsuresh Sep 25, 2024
12c779c
Dont use venv for nvidia mlperf inference docker
arjunsuresh Sep 26, 2024
bfdfe7f
Merge pull request #295 from GATEOverflow/mlperf-inference
arjunsuresh Sep 26, 2024
03a740f
Merge branch 'dev' into mlperf-inference
arjunsuresh Sep 26, 2024
db60dad
Merge pull request #298 from mlcommons/mlperf-inference
arjunsuresh Sep 26, 2024
28b817e
Merge pull request #299 from mlcommons/dev
gfursin Sep 26, 2024
34 changes: 34 additions & 0 deletions .github/workflows/code-review.yml
@@ -0,0 +1,34 @@
name: OpenAI Code Review

on:
pull_request_target:
types: [opened, synchronize]
paths:
- 'automation/**'
- 'script/**'
- '!**.md'

permissions:
issues: write
pull-requests: write

jobs:
code_review:
runs-on: ubuntu-latest
if: github.repository_owner == 'gateoverflow' && github.event.pull_request.changed_files > 0
steps:
# Run code review via OpenAI
# Step to run the OpenAI Code Review using the GATEOverflow action
- name: Run OpenAI Code Review
uses: GATEOverflow/genai-code-review@v1
with:
github_token: ${{ secrets.GITHUB_TOKEN }} # GitHub token for authentication
openai_api_key: ${{ secrets.OPENAI_API_KEY }} # OpenAI API key for accessing the GPT model
github_pr_id: ${{ github.event.pull_request.number }} # ID of the pull request to review
openai_model: "gpt-4o" # Model to use for the code review
openai_temperature: 0.5 # Temperature setting for the model's output
openai_max_tokens: 2048 # Maximum number of tokens for the model's response
mode: "files" # Mode of review, can be "files" or "diff"
language: "en" # Language for the review output
custom_prompt: "" # Optional custom prompt for the model
continue-on-error: true # Allow the workflow to continue even if this step fails
11 changes: 4 additions & 7 deletions .github/workflows/test-mlperf-inference-gptj.yml
@@ -4,22 +4,19 @@
name: MLPerf inference GPT-J

on:
push:
branches: [ "main", "dev", "mlperf-inference" ]
paths:
- '.github/workflows/test-mlperf-inference-gptj.yml'
- '**'
- '!**.md'
schedule:
- cron: "1 1 * * */3"

jobs:
build:
if: github.repository_owner == 'gateoverflow'
runs-on: [ self-hosted, linux, x64 ]
strategy:
fail-fast: false
matrix:
python-version: [ "3.12" ]
backend: [ "pytorch" ]
precision: [ "bfloat16" ]
precision: [ "float16" ]

steps:
- name: Install dependencies
47 changes: 47 additions & 0 deletions .github/workflows/test-mlperf-inference-sdxl.yaml
@@ -0,0 +1,47 @@
name: MLPerf inference SDXL

on:
schedule:
- cron: "1 2 * * *"

jobs:
build_reference:
if: github.repository_owner == 'gateoverflow'
runs-on: [ self-hosted, linux, x64 ]
strategy:
fail-fast: false
matrix:
python-version: [ "3.12" ]
backend: [ "pytorch" ]
precision: [ "float16" ]
steps:
- name: Install dependencies
run: |
source gh_action/bin/deactivate || python3 -m venv gh_action
source gh_action/bin/activate
export CM_REPOS=$HOME/GH_CM
cm pull repo --url=${{ github.event.pull_request.head.repo.html_url }} --checkout=${{ github.event.pull_request.head.ref }}
- name: Test MLPerf Inference SDXL
run: |
cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --docker --model=sdxl --backend=${{ matrix.backend }} --device=cuda --scenario=Offline --test_query_count=1 --precision=${{ matrix.precision }} --target_qps=1 --quiet --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean

build_nvidia:
if: github.repository_owner == 'gateoverflow'
runs-on: [ self-hosted, linux, x64 ]
strategy:
fail-fast: false
matrix:
python-version: [ "3.12" ]
backend: [ "tensorrt" ]
precision: [ "float16" ]
implementation: [ "nvidia" ]
steps:
- name: Install dependencies
run: |
source gh_action/bin/deactivate || python3 -m venv gh_action
source gh_action/bin/activate
export CM_REPOS=$HOME/GH_CM
cm pull repo --url=${{ github.event.pull_request.head.repo.html_url }} --checkout=${{ github.event.pull_request.head.ref }}
- name: Test MLPerf Inference SDXL
run: |
cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --docker --model=sdxl --implementation=${{ matrix.implementation }} --backend=${{ matrix.backend }} --device=cuda --scenario=Offline --test_query_count=1 --precision=${{ matrix.precision }} --target_qps=1 --quiet --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean
56 changes: 56 additions & 0 deletions .github/workflows/test-scc24-sdxl.yaml
@@ -0,0 +1,56 @@
name: MLPerf inference SDXL

on:
schedule:
- cron: "43 1 * * *"

jobs:
build_reference:
if: github.repository_owner == 'gateoverflow'
runs-on: [ self-hosted, linux, x64 ]
strategy:
fail-fast: false
matrix:
python-version: [ "3.12" ]
backend: [ "pytorch" ]
precision: [ "float16" ]
device: [ "cuda" ]
steps:
- name: Install dependencies
run: |
source gh_action/bin/deactivate || python3 -m venv gh_action
source gh_action/bin/activate
export CM_REPOS=$HOME/GH_CM
cm pull repo --url=${{ github.event.pull_request.head.repo.html_url }} --checkout=${{ github.event.pull_request.head.ref }}
- name: Test MLPerf Inference reference SDXL SCC
env:
GITHUB_TOKEN: ${{ secrets.GH_TOKEN }}
run: |
cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=reference --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --quiet --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --precision=float16 --clean |
cm run script --tags=generate,inference,submission --clean --preprocess_submission=yes --run-checker --tar=yes --env.CM_TAR_OUTFILE=submission.tar.gz --division=open --category=datacenter --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes --run_style=test --adr.submission-checker.tags=_short-run --quiet --submitter=MLCommons |
cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/cm4mlperf-inference --repo_branch=mlperf-inference-results-scc24 --commit_message="Results from self hosted Github actions - NVIDIARTX4090" --quiet

build_nvidia:
if: github.repository_owner == 'gateoverflow'
runs-on: [ self-hosted, linux, x64 ]
strategy:
fail-fast: false
matrix:
python-version: [ "3.12" ]
backend: [ "tensorrt" ]
precision: [ "float16" ]
implementation: [ "nvidia" ]
steps:
- name: Install dependencies
run: |
source gh_action/bin/deactivate || python3 -m venv gh_action
source gh_action/bin/activate
export CM_REPOS=$HOME/GH_CM
cm pull repo --url=${{ github.event.pull_request.head.repo.html_url }} --checkout=${{ github.event.pull_request.head.ref }}
- name: Test MLPerf Inference NVIDIA SDXL SCC
env:
GITHUB_TOKEN: ${{ secrets.GH_TOKEN }}
run: |
cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=nvidia --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --quiet --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --precision=float16 --clean |
cm run script --tags=generate,inference,submission --clean --preprocess_submission=yes --run-checker --tar=yes --env.CM_TAR_OUTFILE=submission.tar.gz --division=open --category=datacenter --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes --run_style=test --adr.submission-checker.tags=_short-run --quiet --submitter=MLCommons |
cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/cm4mlperf-inference --repo_branch=mlperf-inference-results-scc24 --commit_message="Results from self hosted Github actions - NVIDIARTX4090" --quiet
21 changes: 16 additions & 5 deletions script/app-mlperf-inference-mlcommons-python/_cm.yaml
@@ -185,8 +185,9 @@ deps:
## Pytorch (CPU)
- tags: get,generic-python-lib,_torch
names:
- ml-engine-pytorch
- pytorch
- torch
- ml-engine-pytorch
- pytorch
skip_if_env:
CM_MODEL:
- dlrm-v2-99
@@ -216,6 +217,7 @@
- tags: get,generic-python-lib,_torchvision
names:
- ml-engine-torchvision
- torchvision
skip_if_env:
CM_MODEL:
- dlrm-v2-99
@@ -231,6 +233,7 @@
- tags: get,generic-python-lib,_torchvision_cuda
names:
- ml-engine-torchvision
- torchvision
enable_if_env:
CM_MLPERF_BACKEND:
- pytorch
@@ -695,6 +698,8 @@ variations:
add_deps_recursive:
pytorch:
tags: _rocm
torchvision:
tags: _rocm

rocm,sdxl:
add_deps:
@@ -834,6 +839,9 @@ variations:
MLPERF_TVM_TORCH_QUANTIZED_ENGINE: qnnpack
deps:
- tags: get,generic-python-lib,_torch
names:
- torch
- pytorch
- tags: get,tvm
names:
- tvm
@@ -861,7 +869,6 @@ variations:

gptj_:
deps:
- tags: get,generic-python-lib,_torch
- tags: get,generic-python-lib,_package.datasets
- tags: get,generic-python-lib,_package.attrs
- tags: get,generic-python-lib,_package.accelerate
@@ -1095,6 +1102,10 @@ variations:
- dlrm-src
# to force the version
- tags: get,generic-python-lib,_torch
names:
- torch
- pytorch
- ml-engine-pytorch
version: "1.13.1"
- tags: get,generic-python-lib,_mlperf_logging
- tags: get,generic-python-lib,_opencv-python
@@ -1247,9 +1258,9 @@ variations:
bfloat16:
group: precision
add_deps_recursive:
ml-model-bfloat16:
ml-model-float16:
tags:
_fp32
_fp16
env:
CM_MLPERF_QUANTIZATION: off
CM_MLPERF_MODEL_PRECISION: bfloat16
2 changes: 2 additions & 0 deletions script/app-mlperf-inference-mlcommons-python/customize.py
@@ -296,6 +296,8 @@ def get_run_cmd_reference(os_info, env, scenario_extra_options, mode_extra_optio
scenario_extra_options + mode_extra_options + \
" --output " + env['CM_MLPERF_OUTPUT_DIR'] + \
" --model-path " + env['CM_ML_MODEL_PATH']
if env.get('CM_COCO2014_SAMPLE_ID_PATH','') != '':
cmd += " --ids-path " + env['CM_COCO2014_SAMPLE_ID_PATH']

elif "llama2-70b" in env['CM_MODEL']:
env['RUN_DIR'] = os.path.join(env['CM_MLPERF_INFERENCE_SOURCE'], "language", "llama2-70b")
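For context, a minimal sketch of what the new branch above does to the reference SDXL run command. Only the conditional `--ids-path` append is taken from the diff; the env value and base command string here are hypothetical placeholders:

```python
# Hypothetical env and base command; the conditional append mirrors customize.py above.
env = {'CM_COCO2014_SAMPLE_ID_PATH': '/path/to/coco2014/sample_ids.txt'}  # set by the coco2014 dataset script
cmd = "python3 main.py --scenario Offline --output /tmp/results"          # placeholder reference command

if env.get('CM_COCO2014_SAMPLE_ID_PATH', '') != '':
    cmd += " --ids-path " + env['CM_COCO2014_SAMPLE_ID_PATH']

print(cmd)  # the SDXL harness only receives --ids-path when the env var is set
```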
6 changes: 5 additions & 1 deletion script/app-mlperf-inference-nvidia/_cm.yaml
@@ -423,7 +423,7 @@ variations:
group: model
env:
CM_MODEL: stable-diffusion-xl
CM_NOT_ML_MODEL_STARTING_WEIGHTS_FILENAME: "https://github.com/mlcommons/cm4mlops/blob/main/script/get-ml-model-stable-diffusion/_cm.json#L174"
CM_ML_MODEL_STARTING_WEIGHTS_FILENAME: "https://github.com/mlcommons/cm4mlops/blob/main/script/get-ml-model-stable-diffusion/_cm.json#L174"
CM_ML_MODEL_WEIGHT_TRANSFORMATIONS: "quantization, affine fusion"
CM_ML_MODEL_INPUTS_DATA_TYPE: int32
CM_ML_MODEL_WEIGHTS_DATA_TYPE: int8
@@ -878,6 +878,8 @@ variations:
tags: build,nvidia,inference,server

- tags: reproduce,mlperf,inference,nvidia,harness,_preprocess_data
names:
- nvidia-preprocess-data
inherit_variation_tags: true
force_cache: true
skip_inherit_variation_groups:
@@ -988,6 +990,8 @@ variations:

- tags: reproduce,mlperf,inference,nvidia,harness,_preprocess_data
inherit_variation_tags: true
names:
- nvidia-preprocess-data
skip_inherit_variation_groups:
- run-mode
- loadgen-scenario
7 changes: 4 additions & 3 deletions script/app-mlperf-inference-nvidia/customize.py
@@ -73,10 +73,11 @@ def preprocess(i):
elif "stable-diffusion" in env["CM_MODEL"]:
target_data_path = os.path.join(env['MLPERF_SCRATCH_PATH'], 'data', 'coco', 'SDXL')
if not os.path.exists(target_data_path):
cmds.append("make download_data BENCHMARKS='stable-diffusion-xl'")
os.makedirs(target_data_path)
#cmds.append("make download_data BENCHMARKS='stable-diffusion-xl'")
env['CM_REQUIRE_COCO2014_DOWNLOAD'] = 'yes'
cmds.append(f"cp -r \${CM_DATASET_PATH_ROOT}/captions/captions.tsv {target_data_path}/captions_5k_final.tsv" )
cmds.append(f"cp -r \${CM_DATASET_PATH_ROOT}/latents/latents.pt {target_data_path}/latents.pt" )
cmds.append(f"cp -r \$CM_DATASET_PATH_ROOT/captions/captions.tsv {target_data_path}/captions_5k_final.tsv" )
cmds.append(f"cp -r \$CM_DATASET_PATH_ROOT/latents/latents.pt {target_data_path}/latents.pt" )
fp16_model_path = os.path.join(env['MLPERF_SCRATCH_PATH'], 'models', 'SDXL', 'official_pytorch', 'fp16', 'stable_diffusion_fp16')

if not os.path.exists(os.path.dirname(fp16_model_path)):
3 changes: 2 additions & 1 deletion script/app-mlperf-inference/_cm.yaml
@@ -385,6 +385,7 @@ variations:
CM_IMAGENET_ACCURACY_DTYPE: int32
CM_CNNDM_ACCURACY_DTYPE: int32
CM_LIBRISPEECH_ACCURACY_DTYPE: int8
CM_DOCKER_USE_VIRTUAL_PYTHON: no
prehook_deps:
- names:
- nvidia-original-mlperf-inference
@@ -1162,7 +1163,7 @@ variations:
mlperf-inference-implementation:
tags: _cuda
deps:
- tags: get,cuda-devices
- tags: get,cuda-devices,_with-pycuda
skip_if_env:
CM_CUDA_DEVICE_PROP_GLOBAL_MEMORY:
- "yes"
7 changes: 5 additions & 2 deletions script/build-dockerfile/customize.py
@@ -180,8 +180,11 @@ def preprocess(i):

f.write(EOL+'# Install python packages' + EOL)
python = get_value(env, config, 'PYTHON', 'CM_DOCKERFILE_PYTHON')
f.write('RUN {} -m venv /home/cmuser/venv/cm'.format(python) + " " + EOL)
f.write('ENV PATH="/home/cmuser/venv/cm/bin:$PATH"' + EOL)

docker_use_virtual_python = env.get('CM_DOCKER_USE_VIRTUAL_PYTHON', "yes")
if str(docker_use_virtual_python).lower() not in [ "no", "0", "false"]:
f.write('RUN {} -m venv /home/cmuser/venv/cm'.format(python) + " " + EOL)
f.write('ENV PATH="/home/cmuser/venv/cm/bin:$PATH"' + EOL)
#f.write('RUN . /opt/venv/cm/bin/activate' + EOL)
f.write('RUN {} -m pip install '.format(python) + " ".join(get_value(env, config, 'python-packages')) + ' ' + pip_extra_flags + ' ' + EOL)

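A minimal sketch of how the new CM_DOCKER_USE_VIRTUAL_PYTHON flag is interpreted when the Dockerfile is generated. The helper function, defaults, and placeholder package list are illustrative; the truthiness check and emitted Dockerfile lines are taken from the diff above:

```python
def dockerfile_python_lines(env, python="python3"):
    """Return the Dockerfile lines emitted for the Python setup (illustrative helper)."""
    lines = []
    # Defaults to "yes"; only an explicit "no"/"0"/"false" skips the virtual environment.
    use_venv = str(env.get('CM_DOCKER_USE_VIRTUAL_PYTHON', "yes")).lower() not in ["no", "0", "false"]
    if use_venv:
        lines.append(f"RUN {python} -m venv /home/cmuser/venv/cm")
        lines.append('ENV PATH="/home/cmuser/venv/cm/bin:$PATH"')
    lines.append(f"RUN {python} -m pip install <python-packages>")  # placeholder package list
    return lines

print("\n".join(dockerfile_python_lines({})))                                       # venv enabled (default)
print("\n".join(dockerfile_python_lines({'CM_DOCKER_USE_VIRTUAL_PYTHON': 'no'})))   # venv skipped
```

This is why the Nvidia MLPerf inference docker variation can now set CM_DOCKER_USE_VIRTUAL_PYTHON: no (see the app-mlperf-inference/_cm.yaml change above) to run directly against the system Python.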
2 changes: 2 additions & 0 deletions script/clean-nvidia-mlperf-inference-scratch-space/_cm.yaml
@@ -10,6 +10,8 @@ tags:
- mlperf
- inference
uid: bb41f6e3608e4e8a
input_mapping:
extra_cache_rm_tags: CM_CLEAN_EXTRA_CACHE_RM_TAGS
deps:
# Get Nvidia scratch space where data and models get downloaded
- tags: get,mlperf,inference,nvidia,scratch,space
17 changes: 12 additions & 5 deletions script/clean-nvidia-mlperf-inference-scratch-space/customize.py
@@ -16,22 +16,29 @@ def preprocess(i):

clean_cmd = ''
cache_rm_tags = ''
extra_cache_rm_tags = env.get('CM_CLEAN_EXTRA_CACHE_RM_TAGS', '')

if env.get('CM_MODEL', '') == 'sdxl':
if env.get('CM_CLEAN_ARTIFACT_NAME', '') == 'downloaded_data':
clean_cmd = f"""rm -rf {os.path.join(env['CM_NVIDIA_MLPERF_SCRATCH_PATH'], "data", "coco", "SDXL")} """
cache_rm_tags = "nvidia-harness,_preprocessed_data,_sdxl"
cache_rm_tags = "nvidia-harness,_preprocess_data,_sdxl"
if env.get('CM_CLEAN_ARTIFACT_NAME', '') == 'preprocessed_data':
clean_cmd = f"""rm -rf {os.path.join(env['CM_NVIDIA_MLPERF_SCRATCH_PATH'], "preprocessed_data", "coco2014-tokenized-sdxl")} """
cache_rm_tags = "nvidia-harness,_preprocessed_data,_sdxl"
cache_rm_tags = "nvidia-harness,_preprocess_data,_sdxl"

if clean_cmd != '':
env['CM_RUN_CMD'] = clean_cmd
cache_rm_tags = cache_rm_tags + extra_cache_rm_tags

if cache_rm_tags:
r = cm.access({'action': 'rm', 'automation': 'cache', 'tags': cache_rm_tags})
r = cm.access({'action': 'rm', 'automation': 'cache', 'tags': cache_rm_tags, 'f': True})
print(r)
if r['return'] != 0 and r['return'] != 16: ## ignore missing ones
return r
if r['return'] == 0: # cache entry found
if clean_cmd != '':
env['CM_RUN_CMD'] = clean_cmd
else:
if clean_cmd != '':
env['CM_RUN_CMD'] = clean_cmd

return {'return':0}

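A minimal sketch of the forced cache-removal pattern used above, assuming `cm` is the CM automation API already imported by customize.py. The call arguments and the "return code 16 means no cache entry found" handling come from the diff; the wrapper function and its return shape are illustrative:

```python
def remove_cache_entries(cm, cache_rm_tags):
    """Force-remove matching CM cache entries; treat 'not found' (return code 16) as non-fatal."""
    r = cm.access({'action': 'rm', 'automation': 'cache', 'tags': cache_rm_tags, 'f': True})
    if r['return'] not in (0, 16):    # any other code is an unexpected failure
        return r                      # propagate the error dict, as customize.py does
    return {'return': 0, 'found': r['return'] == 0}

# e.g. remove_cache_entries(cm, "nvidia-harness,_preprocess_data,_sdxl")
# with --extra_cache_rm_tags, the extra tags are appended to cache_rm_tags before this call
```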
2 changes: 2 additions & 0 deletions script/download-file/customize.py
@@ -108,6 +108,8 @@ def preprocess(i):
elif "no such file" in checksum_result.stderr.lower():
#print(f"No file {env['CM_DOWNLOAD_FILENAME']}. Downloading through cmutil.")
cmutil_require_download = 1
elif checksum_result.returncode == 1:
return {"return":1, "error":f"Error while checking checksum: {checksum_result.stderr}"}
else:
print(f"File {env['CM_DOWNLOAD_FILENAME']} already present, original checksum and computed checksum matches! Skipping Download..")
else:
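A minimal sketch of the checksum decision that the diff above completes. The `md5sum -c` invocation and subprocess wrapper are assumptions; the three outcomes mirror the branches in download-file/customize.py:

```python
import subprocess

def check_existing_download(checksum_file, filename):
    """Return 'skip' or 'download', or raise on an unhandled md5sum error (illustrative)."""
    result = subprocess.run(["md5sum", "-c", checksum_file], capture_output=True, text=True)
    if "no such file" in result.stderr.lower():
        return "download"            # file not present yet: download through cmutil
    if result.returncode == 1:
        # Unhandled md5sum failure (e.g. checksum mismatch): fail instead of silently re-downloading
        raise RuntimeError(f"Error while checking checksum: {result.stderr}")
    return "skip"                    # file already present and checksum matches
```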