Merged
41 commits
ba21701
added checksum for openimages dataset
anandhu-eng Sep 20, 2024
4aa769c
fix numpy version for nvidia-mlperf-inference sdxl
arjunsuresh Sep 20, 2024
f2c08cd
Fix performance_sample_count for nvidia mlperf inference
arjunsuresh Sep 20, 2024
bc8e070
Merge pull request #124 from anandhu-eng/checksum-branch-2
arjunsuresh Sep 20, 2024
3a29e0c
Support short run for coco2014
arjunsuresh Sep 20, 2024
cdfd675
Script automation update: Support enable/skip in adr/ad, update of dy…
arjunsuresh Sep 21, 2024
ef9990a
Merge branch 'mlcommons:mlperf-inference' into mlperf-inference
arjunsuresh Sep 21, 2024
72cf709
Added a CM script to clean Nvidia scratch space (WIP), #274
arjunsuresh Sep 21, 2024
a5d4702
Update test-mlperf-inference-gptj.yml
arjunsuresh Sep 21, 2024
1febfeb
Update test-mlperf-inference-gptj.yml
arjunsuresh Sep 21, 2024
0e2592e
Update test-mlperf-inference-gptj.yml
arjunsuresh Sep 21, 2024
65569d3
Update test-mlperf-inference-gptj.yml
arjunsuresh Sep 21, 2024
8c0d062
Update test-mlperf-inference-gptj.yml
arjunsuresh Sep 21, 2024
4287465
Remove _size. variation for short run - user can explicitly give it
arjunsuresh Sep 21, 2024
cddd33a
Update test-mlperf-inference-gptj.yml
arjunsuresh Sep 21, 2024
aa15c9b
Update test-mlperf-inference-gptj.yml
arjunsuresh Sep 21, 2024
d1efc0e
fixes for docker_it
arjunsuresh Sep 21, 2024
d33918e
fixes for docker_it
arjunsuresh Sep 21, 2024
a868415
fixes for docker_it
arjunsuresh Sep 21, 2024
1ff33ed
MLperf docker fixes and nltf version fix for gptj accuracy check
arjunsuresh Sep 21, 2024
d509716
docker_run=false for generate-mlperf-inference-user-conf
arjunsuresh Sep 22, 2024
d448342
docker_run=false for generate-mlperf-inference-user-conf
arjunsuresh Sep 22, 2024
9789593
added checksum
anandhu-eng Sep 22, 2024
364c5f6
Support detached docker mode for mlperf inference, fixes #276
arjunsuresh Sep 22, 2024
95e38ae
Fix dockerbuild if dockerfile not exists, venv activation inside docker
arjunsuresh Sep 22, 2024
af7330f
Merge pull request #275 from GATEOverflow/mlperf-inference
arjunsuresh Sep 22, 2024
fd32ff6
Merge pull request #277 from mlcommons/mlperf-inference
arjunsuresh Sep 22, 2024
fbc6ef3
Merge branch 'main' into dev
arjunsuresh Sep 22, 2024
0e49cba
POC - handling backup checksums
anandhu-eng Sep 23, 2024
8edf48b
link correction
anandhu-eng Sep 23, 2024
4ac9638
corrected the checksum value
anandhu-eng Sep 23, 2024
62b36b9
Merge branch 'GATEOverflow:mlperf-inference' into nvidia-sdxl-v4.1
anandhu-eng Sep 23, 2024
b26b272
Merge pull request #127 from anandhu-eng/nvidia-sdxl-v4.1
arjunsuresh Sep 23, 2024
f62721e
Merge pull request #126 from anandhu-eng/checksum-branch-2
arjunsuresh Sep 23, 2024
20bca7f
Fix data download path for coco dataset, fixes #281
arjunsuresh Sep 23, 2024
cbb6376
Merge branch 'mlcommons:mlperf-inference' into mlperf-inference
arjunsuresh Sep 23, 2024
4a954a3
Merge pull request #282 from GATEOverflow/mlperf-inference
arjunsuresh Sep 23, 2024
4490abf
Add CM download for nvidia-mlperf-inference-sdxl
arjunsuresh Sep 23, 2024
27cc6ba
Added support for clean-preprocessed-dataset nvidia mlperf inference …
arjunsuresh Sep 23, 2024
b9e411c
Merge pull request #284 from mlcommons/mlperf-inference
arjunsuresh Sep 23, 2024
4e1b1a3
Merge branch 'main' into dev
gfursin Sep 24, 2024
18 changes: 7 additions & 11 deletions .github/workflows/test-mlperf-inference-gptj.yml
@@ -4,17 +4,16 @@
name: MLPerf inference GPT-J

on:
pull_request:
branches: [ "main1", "dev1" ]
push:
branches: [ "main", "dev", "mlperf-inference" ]
paths:
- '.github/workflows/test-mlperf-inference-gptj.yml'
- '**'
- '!**.md'

jobs:
build:

runs-on: ubuntu-latest
runs-on: [ self-hosted, linux, x64 ]
strategy:
fail-fast: false
matrix:
@@ -23,15 +22,12 @@ jobs:
precision: [ "bfloat16" ]

steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v3
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python3 -m pip install cmind
source gh_action/bin/deactivate || python3 -m venv gh_action
source gh_action/bin/activate
export CM_REPOS=$HOME/GH_CM
cm pull repo --url=${{ github.event.pull_request.head.repo.html_url }} --checkout=${{ github.event.pull_request.head.ref }}
- name: Test MLPerf Inference GPTJ
run: |
cm run script --tags=run,mlperf,inference,generate-run-cmds,_submission,_short --submitter="cTuning" --model=gptj --backend=${{ matrix.backend }} --device=cpu --scenario=Offline --test_query_count=1 --precision=${{ matrix.precision }} --target_qps=1 --quiet
cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --docker --model=gptj-99 --backend=${{ matrix.backend }} --device=cuda --scenario=Offline --test_query_count=1 --precision=${{ matrix.precision }} --target_qps=1 --quiet --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --beam_size=1 --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean
397 changes: 230 additions & 167 deletions automation/script/module.py

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions automation/script/module_misc.py
@@ -1524,7 +1524,7 @@ def dockerfile(i):
if i.get('print_deps'):
cm_input = {'action': 'run',
'automation': 'script',
'tags': f'{tag_string}',
'tags': f"""{i.get('tags')}""",
'print_deps': True,
'quiet': True,
'silent': True,
@@ -1557,7 +1557,7 @@ def dockerfile(i):
'fake_run_option': fake_run_option,
'comments': comments,
'run_cmd': f'{run_cmd} --quiet',
'script_tags': f'{tag_string}',
'script_tags': f"""{i.get('tags')}""",
'copy_files': docker_copy_files,
'quiet': True,
'env': env,
@@ -2037,11 +2037,11 @@ def docker(i):
'image_repo': image_repo,
'interactive': interactive,
'mounts': mounts,
'image_name': 'cm-script-'+script_alias,
'image_name': i.get('docker_image_name', ''),
# 'image_tag': script_alias,
'image_tag_extra': image_tag_extra,
'detached': detached,
'script_tags': f'{tag_string}',
'script_tags': f"""{i.get('tags')}""",
'run_cmd': final_run_cmd,
'v': i.get('v', False),
'quiet': True,
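Each hunk above makes the same swap: an f-string over a precomputed `tag_string` variable is replaced by one that reads `i.get('tags')` at the point of use. A minimal sketch of the new pattern (assumption: `i` is the CM input dictionary and its `tags` entry holds the comma-separated script tags), which avoids depending on a `tag_string` that may be stale or unset on some code paths:

```python
def script_tags_field(i):
    # Read the tags straight from the input dict; a missing key
    # interpolates as the string "None" rather than raising.
    return f"""{i.get('tags')}"""

print(script_tags_field({'tags': 'run-mlperf,inference,_submission'}))
```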
10 changes: 7 additions & 3 deletions script/app-mlperf-inference-mlcommons-python/_cm.yaml
@@ -533,6 +533,7 @@ deps:
- tags: get,dataset,coco2014,_validation
names:
- coco2014-preprocessed
- coco2014-dataset
enable_if_env:
CM_MODEL:
- stable-diffusion-xl
@@ -590,6 +591,9 @@ deps:
- tags: generate,user-conf,mlperf,inference
names:
- user-conf-generator
skip_if_env:
CM_RUN_STATE_DOCKER:
- 'yes'

# Install MLPerf loadgen
- tags: get,loadgen
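The new `skip_if_env` entry turns the user-conf-generator dependency into a no-op when the script already runs inside Docker (`CM_RUN_STATE_DOCKER: 'yes'`). A rough sketch of how such a rule could be evaluated — a hypothetical helper, not CM's actual implementation:

```python
def should_skip(skip_if_env, env):
    # Skip the dependency when every listed variable is set to one of
    # its matching values (an assumed reading of skip_if_env semantics).
    return all(str(env.get(var, '')) in [str(v) for v in values]
               for var, values in skip_if_env.items())

rule = {'CM_RUN_STATE_DOCKER': ['yes']}
print(should_skip(rule, {'CM_RUN_STATE_DOCKER': 'yes'}))  # inside the container: True
print(should_skip(rule, {}))                              # on the host: False
```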
@@ -1232,7 +1236,7 @@ variations:
float16:
group: precision
add_deps_recursive:
ml-model:
ml-model-float16:
tags:
_fp16
env:
@@ -1243,9 +1247,9 @@
bfloat16:
group: precision
add_deps_recursive:
ml-model:
ml-model-bfloat16:
tags:
_fp16
_fp32
env:
CM_MLPERF_QUANTIZATION: off
CM_MLPERF_MODEL_PRECISION: bfloat16
17 changes: 15 additions & 2 deletions script/app-mlperf-inference-nvidia/_cm.yaml
@@ -42,7 +42,6 @@ input_mapping:
mlperf_conf: CM_MLPERF_CONF
mode: CM_MLPERF_LOADGEN_MODE
output_dir: CM_MLPERF_OUTPUT_DIR
performance_sample_count: CM_MLPERF_PERFORMANCE_SAMPLE_COUNT
scenario: CM_MLPERF_LOADGEN_SCENARIO
user_conf: CM_MLPERF_USER_CONF
devices: CM_MLPERF_NVIDIA_HARNESS_DEVICES
@@ -301,6 +300,16 @@ prehook_deps:
CM_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST:
- 'yes'

# Install coco2014 dataset
- enable_if_env:
CM_REQUIRE_COCO2014_DOWNLOAD:
- 'yes'
CM_MLPERF_NVIDIA_HARNESS_RUN_MODE:
- preprocess_data
names:
- coco2014-dataset
tags: get,dataset,coco2014,_validation

# Post dependencies to run this app including for power measurement
post_deps:

@@ -477,6 +486,11 @@ variations:
names:
- scipy
version: 1.10.1
- tags: get,generic-python-lib,_package.numpy
names:
- numpy
version_max: 1.22.99
version_max_usable: "1.22"
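The added dependency caps numpy at 1.22.x for the Nvidia harness, with `version_max_usable` apparently naming the concrete version to install when the detected one exceeds the cap. A hedged sketch of that resolution logic (an assumption about CM's semantics, not its actual code):

```python
def pick_version(detected, version_max="1.22.99", version_max_usable="1.22"):
    # Compare dotted versions numerically, so "1.9" < "1.22".
    key = lambda v: [int(p) for p in v.split('.')]
    if key(detected) <= key(version_max):
        return detected            # already satisfies the cap
    return version_max_usable      # too new: fall back to the usable pin

print(pick_version("1.21.6"))  # → 1.21.6
print(pick_version("1.24.0"))  # → 1.22
```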

sdxl,v4.1:
deps:
@@ -989,7 +1003,6 @@ variations:
CM_MODEL:
- dlrm-v2-99
- dlrm-v2-99.9
- stable-diffusion-xl

- tags: reproduce,mlperf,inference,nvidia,harness,_download_model
inherit_variation_tags: true
5 changes: 4 additions & 1 deletion script/app-mlperf-inference-nvidia/customize.py
@@ -74,6 +74,9 @@ def preprocess(i):
target_data_path = os.path.join(env['MLPERF_SCRATCH_PATH'], 'data', 'coco', 'SDXL')
if not os.path.exists(target_data_path):
cmds.append("make download_data BENCHMARKS='stable-diffusion-xl'")
env['CM_REQUIRE_COCO2014_DOWNLOAD'] = 'yes'
cmds.append(f"cp -r \${CM_DATASET_PATH_ROOT}/captions/captions.tsv {target_data_path}/captions_5k_final.tsv" )
cmds.append(f"cp -r \${CM_DATASET_PATH_ROOT}/latents/latents.pt {target_data_path}/latents.pt" )
fp16_model_path = os.path.join(env['MLPERF_SCRATCH_PATH'], 'models', 'SDXL', 'official_pytorch', 'fp16', 'stable_diffusion_fp16')

if not os.path.exists(os.path.dirname(fp16_model_path)):
@@ -361,7 +364,7 @@ def preprocess(i):
if input_format:
run_config += f" --input_format={input_format}"

performance_sample_count = env.get('CM_MLPERF_PERFORMANCE_SAMPLE_COUNT')
performance_sample_count = env.get('CM_MLPERF_LOADGEN_PERFORMANCE_SAMPLE_COUNT')
if performance_sample_count:
run_config += f" --performance_sample_count={performance_sample_count}"

6 changes: 6 additions & 0 deletions script/app-mlperf-inference/_cm.yaml
@@ -1565,15 +1565,21 @@ docker:
- tags: get,mlperf,inference,results,dir,local
names:
- get-mlperf-inference-results-dir
skip_if_env:
OUTPUT_BASE_DIR: [ on ]
- tags: get,mlperf,inference,submission,dir,local
names:
- get-mlperf-inference-submission-dir
skip_if_env:
CM_MLPERF_INFERENCE_SUBMISSION_DIR: [ on ]

pre_run_cmds:
#- cm pull repo && cm run script --tags=get,git,repo,_repo.https://github.com/GATEOverflow/inference_results_v4.0.git --update
- cm pull repo
mounts:
- "${{ CM_DATASET_IMAGENET_PATH }}:${{ CM_DATASET_IMAGENET_PATH }}"
- "${{ CM_MLPERF_INFERENCE_RESULTS_DIR }}:${{ CM_MLPERF_INFERENCE_RESULTS_DIR }}"
- "${{ OUTPUT_BASE_DIR }}:${{ OUTPUT_BASE_DIR }}"
- "${{ CM_MLPERF_INFERENCE_SUBMISSION_DIR }}:${{ CM_MLPERF_INFERENCE_SUBMISSION_DIR }}"
- "${{ GPTJ_CHECKPOINT_PATH }}:${{ GPTJ_CHECKPOINT_PATH }}"
- "${{ CM_CRITEO_PREPROCESSED_PATH }}:${{ CM_CRITEO_PREPROCESSED_PATH }}"
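The mounts added above use `${{ VAR }}` placeholders resolved from the environment at docker-run time. A rough sketch of that expansion, under the assumption (not confirmed by this diff) that a mount whose variables are unset is dropped rather than mounted as an empty path:

```python
import re

def expand_mounts(mounts, env):
    out = []
    for m in mounts:
        # Substitute each "${{ NAME }}" placeholder from env; unset names become ''.
        expanded = re.sub(r'\$\{\{ (\w+) \}\}',
                          lambda match: env.get(match.group(1), ''), m)
        host, _, container = expanded.partition(':')
        if host and container:  # keep only fully-resolved mounts
            out.append(expanded)
    return out

env = {'OUTPUT_BASE_DIR': '/home/user/results'}
print(expand_mounts(['${{ OUTPUT_BASE_DIR }}:${{ OUTPUT_BASE_DIR }}',
                     '${{ GPTJ_CHECKPOINT_PATH }}:${{ GPTJ_CHECKPOINT_PATH }}'], env))
```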
1 change: 0 additions & 1 deletion script/build-docker-image/_cm.yaml
@@ -42,5 +42,4 @@ prehook_deps:
- enable_if_env:
CM_BUILD_DOCKERFILE:
- 'yes'
- '1'
tags: build,dockerfile
58 changes: 29 additions & 29 deletions script/build-docker-image/customize.py
@@ -15,6 +15,7 @@ def preprocess(i):
else:
build_dockerfile = True
env['CM_BUILD_DOCKERFILE'] = "yes"
env['CM_DOCKERFILE_BUILD_FROM_IMAGE_SCRIPT'] = "yes"


CM_DOCKER_BUILD_ARGS = env.get('+ CM_DOCKER_BUILD_ARGS', [])
@@ -34,55 +35,54 @@ def preprocess(i):
# else:
# env['CM_BUILD_DOCKERFILE'] = "no"
#
if "CM_DOCKER_IMAGE_REPO" not in env:
if env.get("CM_DOCKER_IMAGE_REPO", "") == '':
env['CM_DOCKER_IMAGE_REPO'] = "local"

docker_image_name = env.get('CM_DOCKER_IMAGE_NAME', '')
if docker_image_name == '':
docker_image_name = env.get('CM_DOCKER_RUN_SCRIPT_TAGS','').replace(',', '-').replace('_','')
if docker_image_name == '':
docker_image_name = 'cm'

env['CM_DOCKER_IMAGE_NAME'] = docker_image_name
docker_image_name = "cm-script-" +env.get('CM_DOCKER_RUN_SCRIPT_TAGS','').replace(',', '-').replace('_','-')
env['CM_DOCKER_IMAGE_NAME'] = docker_image_name

if env.get("CM_DOCKER_IMAGE_TAG", "") == '':
env['CM_DOCKER_IMAGE_TAG'] = "latest"

if env.get("CM_DOCKER_CACHE", "yes") in ["no", "False", False]:
if str(env.get("CM_DOCKER_CACHE", "yes")).lower() in ["no", "false", "0"]:
env["CM_DOCKER_CACHE_ARG"] = " --no-cache"

CMD = ''

image_name = get_image_name(env)

if not build_dockerfile:
# Write .dockerignore
with open('.dockerignore', 'w') as f:
f.write('.git\n')
if build_dockerfile:
dockerfile_path = "\${CM_DOCKERFILE_WITH_PATH}"

# Prepare CMD to build image
XCMD = [
'docker build ' + env.get('CM_DOCKER_CACHE_ARG',''),
' ' + build_args,
' -f "' + dockerfile_path + '"',
' -t "' + image_name,
' .'
]
# Write .dockerignore
with open('.dockerignore', 'w') as f:
f.write('.git\n')

with open(dockerfile_path + '.build.sh', 'w') as f:
f.write(' \\\n'.join(XCMD) + '\n')
# Prepare CMD to build image
XCMD = [
'docker build ' + env.get('CM_DOCKER_CACHE_ARG',''),
' ' + build_args,
' -f "' + dockerfile_path + '"',
' -t "' + image_name,
' .'
]

with open(dockerfile_path + '.build.bat', 'w') as f:
f.write(' ^\n'.join(XCMD) + '\n')
with open(dockerfile_path + '.build.sh', 'w') as f:
f.write(' \\\n'.join(XCMD) + '\n')

CMD = ''.join(XCMD)
with open(dockerfile_path + '.build.bat', 'w') as f:
f.write(' ^\n'.join(XCMD) + '\n')

print ('================================================')
print ('CM generated the following Docker build command:')
print ('')
print (CMD)
CMD = ''.join(XCMD)

print ('')
print ('================================================')
print ('CM generated the following Docker build command:')
print ('')
print (CMD)

print ('')

env['CM_DOCKER_BUILD_CMD'] = CMD

1 change: 0 additions & 1 deletion script/build-dockerfile/_cm.yaml
@@ -56,7 +56,6 @@ post_deps:
- enable_if_env:
CM_BUILD_DOCKER_IMAGE:
- 'yes'
- '1'
names:
- build-docker-image
tags: build,docker,image
13 changes: 8 additions & 5 deletions script/build-dockerfile/customize.py
@@ -21,7 +21,7 @@ def preprocess(i):
input_args = []
copy_files = []

if 'CM_DOCKER_RUN_SCRIPT_TAGS' in env:
if env.get('CM_DOCKER_RUN_SCRIPT_TAGS', '') != '':
script_tags=env['CM_DOCKER_RUN_SCRIPT_TAGS']
found_scripts = cm.access({'action': 'search', 'automation': 'script', 'tags': script_tags})
scripts_list = found_scripts['list']
@@ -62,7 +62,7 @@ def preprocess(i):
else:
cm_mlops_repo_branch_string = ""

if 'CM_DOCKERFILE_WITH_PATH' not in env:
if env.get('CM_DOCKERFILE_WITH_PATH', '') == '':
env['CM_DOCKERFILE_WITH_PATH'] = os.path.join(os.getcwd(), "Dockerfile")

dockerfile_with_path = env['CM_DOCKERFILE_WITH_PATH']
@@ -180,9 +180,10 @@ def preprocess(i):

f.write(EOL+'# Install python packages' + EOL)
python = get_value(env, config, 'PYTHON', 'CM_DOCKERFILE_PYTHON')
f.write('RUN {} -m venv cm-venv'.format(python) + " " + EOL)
f.write('RUN . cm-venv/bin/activate' + EOL)
f.write('RUN {} -m pip install --user '.format(python) + " ".join(get_value(env, config, 'python-packages')) + ' ' + pip_extra_flags + ' ' + EOL)
f.write('RUN {} -m venv /home/cmuser/venv/cm'.format(python) + " " + EOL)
f.write('ENV PATH="/home/cmuser/venv/cm/bin:$PATH"' + EOL)
#f.write('RUN . /opt/venv/cm/bin/activate' + EOL)
f.write('RUN {} -m pip install '.format(python) + " ".join(get_value(env, config, 'python-packages')) + ' ' + pip_extra_flags + ' ' + EOL)
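The rewritten Dockerfile generation replaces `RUN . cm-venv/bin/activate` — which has no effect on later layers, since each `RUN` starts a fresh shell — with an `ENV PATH` line that keeps the venv active for every subsequent instruction. A small sketch of the fragment now emitted (paths per the diff; the python binary name is illustrative):

```python
python = "python3"  # illustrative; the real value comes from the CM config
EOL = "\n"
# Create the venv once, then put its bin directory first on PATH so all
# later RUN instructions resolve python/pip inside it.
fragment = (
    f"RUN {python} -m venv /home/cmuser/venv/cm" + EOL
    + 'ENV PATH="/home/cmuser/venv/cm/bin:$PATH"' + EOL
)
print(fragment)
```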

f.write(EOL+'# Download CM repo for scripts' + EOL)

@@ -260,6 +261,8 @@ def preprocess(i):
s = r['string']
f.write(s + EOL)

print(f"""Dockerfile written at {dockerfile_with_path}""")

f.close()

#f = open(env['CM_DOCKERFILE_WITH_PATH'], "r")
43 changes: 43 additions & 0 deletions script/clean-nvidia-mlperf-inference-scratch-space/_cm.yaml
@@ -0,0 +1,43 @@
alias: clean-nvidia-mlperf-inference-scratch-space
automation_alias: script
automation_uid: 5b4e0237da074764
cache: false
tags:
- clean
- nvidia
- scratch
- space
- mlperf
- inference
uid: bb41f6e3608e4e8a
deps:
# Get Nvidia scratch space where data and models get downloaded
- tags: get,mlperf,inference,nvidia,scratch,space
names:
- nvidia-scratch-space

variations:
sdxl:
group: model
env:
CM_MODEL: sdxl
downloaded-data:
group: artifact
env:
CM_CLEAN_ARTIFACT_NAME: downloaded_data
preprocessed-data:
group: artifact
env:
CM_CLEAN_ARTIFACT_NAME: preprocessed_data
downloaded-model:
group: artifact
env:
CM_CLEAN_ARTIFACT_NAME: downloaded_model
v4.1:
group: version
env:
CM_NVIDIA_MLPERF_INFERENCE_CODE_VERSION: v4.1
v4.0:
group: version
env:
CM_NVIDIA_MLPERF_INFERENCE_CODE_VERSION: v4.0