Merged
Changes from all commits
78 commits
c59608e
enable docker-submission generation
anandhu-eng Oct 31, 2024
28ce998
real run set to false
anandhu-eng Oct 31, 2024
2b35b79
docker run set to false
anandhu-eng Oct 31, 2024
3309a35
Update results dir env variable
anandhu-eng Oct 31, 2024
8ffa471
updated docker permissions
anandhu-eng Oct 31, 2024
5ced2d2
update docker permissions
anandhu-eng Oct 31, 2024
3cabfa1
Add docker input mapping
anandhu-eng Oct 31, 2024
5149a0b
enabled modifying input mapping env inside docker
anandhu-eng Oct 31, 2024
44be9bd
update the env key to run command
anandhu-eng Oct 31, 2024
dc2d75c
pass uid and gid args to docker run
anandhu-eng Nov 1, 2024
97ccbc9
added docker run args
anandhu-eng Nov 1, 2024
1634d68
Updated build to run args
anandhu-eng Nov 1, 2024
5795ff3
Update module_misc.py
anandhu-eng Nov 1, 2024
7d6ee2b
clean the contents rather than the main submission folder
anandhu-eng Nov 1, 2024
cb81816
update-prevent deletion of entire directory
anandhu-eng Nov 1, 2024
fc4fea3
test commit
anandhu-eng Nov 1, 2024
fd70fcf
bug fix
anandhu-eng Nov 1, 2024
8cf8d56
bug fix
anandhu-eng Nov 1, 2024
941efef
added mount for submission processed
anandhu-eng Nov 1, 2024
d5554b0
Update docker repo and branch in test
arjunsuresh Nov 4, 2024
893530e
code clean
anandhu-eng Nov 4, 2024
de29f11
code clean
anandhu-eng Nov 4, 2024
434b855
Merge branch 'mlcommons:mlperf-inference' into mlperf-inference
arjunsuresh Nov 4, 2024
356387b
Update test-nvidia-mlperf-implementation.yml
arjunsuresh Nov 4, 2024
a45cca3
Fix nvidia mlperf inference gptj detached mode
arjunsuresh Nov 4, 2024
36e9bad
Update test-nvidia-mlperf-implementation.yml
arjunsuresh Nov 4, 2024
ed2b48a
Update pythonpath for coco2014 accuracy check
arjunsuresh Nov 4, 2024
77c8d25
Update test-scc24-sdxl.yaml
arjunsuresh Nov 4, 2024
ecac7e9
Added docker detached option
anandhu-eng Nov 4, 2024
468264d
Update test-mlperf-inference-abtf-poc.yml | Enable docker on macos an…
arjunsuresh Nov 4, 2024
ed2d9bf
Merge branch 'mlperf-inference' into anandhu-eng-patch-4
arjunsuresh Nov 4, 2024
a9375a4
Update test-mlperf-inference-abtf-poc.yml
arjunsuresh Nov 4, 2024
e3e4f9f
Update test-mlperf-inference-abtf-poc.yml
arjunsuresh Nov 4, 2024
0a4fbdc
Update test-mlperf-inference-abtf-poc.yml
arjunsuresh Nov 4, 2024
8127ab5
Update test-mlperf-inference-abtf-poc.yml
arjunsuresh Nov 4, 2024
5aa7fb5
Update test-mlperf-inference-abtf-poc.yml | Turn off docker runs on macos
arjunsuresh Nov 4, 2024
18f46a8
Update test-nvidia-mlperf-implementation.yml
arjunsuresh Nov 4, 2024
82c6af2
Update PYTHONPATH for mlperf inference stable diffusion
arjunsuresh Nov 5, 2024
324df6c
code clean
anandhu-eng Nov 5, 2024
569dc12
Handle dynamic env variables
anandhu-eng Nov 5, 2024
18c284c
update state from meta
anandhu-eng Nov 5, 2024
39bedf2
Update PYTHONPATH for mlperf inference stable diffusion
arjunsuresh Nov 5, 2024
659aecc
Added submission base directory
anandhu-eng Nov 5, 2024
536b9ff
separated inp through cm.access
anandhu-eng Nov 5, 2024
3f521c6
fix typo
anandhu-eng Nov 5, 2024
2377572
Supports submission base dir
anandhu-eng Nov 5, 2024
5fabd94
code clean
anandhu-eng Nov 5, 2024
4c4bec4
Merge pull request #481 from GATEOverflow/mlperf-inference
arjunsuresh Nov 5, 2024
c95819c
consider base submission dir as output dir
anandhu-eng Nov 5, 2024
60c02fa
Added google dns to mlperf-inference docker
arjunsuresh Nov 5, 2024
c3a111d
Merge branch 'mlperf-inference' into mlperf-inference
arjunsuresh Nov 5, 2024
d99cfe3
Merge pull request #484 from GATEOverflow/mlperf-inference
arjunsuresh Nov 5, 2024
4ee3733
fix typo
anandhu-eng Nov 5, 2024
1df3eb1
bug fix
anandhu-eng Nov 5, 2024
a96d211
fix typo
anandhu-eng Nov 5, 2024
5a26cd5
enable test globally
anandhu-eng Nov 5, 2024
d7499a3
Merge pull request #487 from mlcommons/anandhu-eng-patch-5
arjunsuresh Nov 5, 2024
90b4094
Merge branch 'mlperf-inference' into anandhu-eng-patch-3
anandhu-eng Nov 5, 2024
87ccbe0
submission base dir -> tar output
anandhu-eng Nov 5, 2024
152712f
revert commit https://github.com/mlcommons/cm4mlops/pull/486/commits/…
anandhu-eng Nov 5, 2024
a89fa43
get submission directory if base dir not specified - docker
anandhu-eng Nov 5, 2024
bfe6cb6
add tar output dir
anandhu-eng Nov 5, 2024
6e48bdd
Update _cm.json
anandhu-eng Nov 5, 2024
94ef819
Update test-scc24-sdxl.yaml
arjunsuresh Nov 5, 2024
50e7ab1
Merge branch 'mlperf-inference' into mlperf-inference
arjunsuresh Nov 5, 2024
bba6f04
Merge pull request #488 from GATEOverflow/mlperf-inference
arjunsuresh Nov 5, 2024
ab2f528
code clean
anandhu-eng Nov 5, 2024
e0ea139
Merge branch 'mlperf-inference' into anandhu-eng-patch-3
arjunsuresh Nov 5, 2024
d23e18d
Merge pull request #486 from mlcommons/anandhu-eng-patch-3
arjunsuresh Nov 5, 2024
e83e72c
test commit for macos
anandhu-eng Nov 6, 2024
903cd2a
test commit - mac
anandhu-eng Nov 6, 2024
6861de2
check docker info
anandhu-eng Nov 6, 2024
afc3a02
test commit - mac
anandhu-eng Nov 6, 2024
a82bfd9
test commit
anandhu-eng Nov 6, 2024
b9eefb1
deactivated runs for windows and mac os docker
anandhu-eng Nov 6, 2024
717281b
code clean
anandhu-eng Nov 6, 2024
c39d8e0
Merge pull request #477 from mlcommons/anandhu-eng-patch-4
arjunsuresh Nov 6, 2024
2a37d11
Merge branch 'main' into mlperf-inference
arjunsuresh Nov 6, 2024
4 changes: 2 additions & 2 deletions .github/workflows/test-cm-based-submission-generation.yml
@@ -7,8 +7,8 @@ on:
branches: [ "main", "dev", "mlperf-inference" ]
paths:
- '.github/workflows/test-cm-based-submission-generation.yml'
# - '**' # kept on for all the path instead of submission generation CM script so that this could help in trapping any bugs in any recent submission checker modification also
# - '!**.md'
- '**'
- '!**.md'
jobs:
submission_generation:
runs-on: ${{ matrix.os }}
71 changes: 65 additions & 6 deletions .github/workflows/test-mlperf-inference-abtf-poc.yml
@@ -20,21 +20,26 @@ jobs:
python-version: [ "3.8", "3.12" ]
backend: [ "pytorch" ]
implementation: [ "python" ]
extra-args: [ "--adr.compiler.tags=gcc" ]
docker: [ "", " --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes" ]
extra-args: [ "--adr.compiler.tags=gcc", "--env.CM_MLPERF_LOADGEN_BUILD_FROM_SRC=off" ]
exclude:
- os: ubuntu-24.04
python-version: "3.8"
- os: windows-latest
python-version: "3.8"
- os: windows-latest
extra-args: "--adr.compiler.tags=gcc"
extra-args: "--adr.compiler.tags=gcc"
- os: windows-latest
docker: " --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes"
# windows docker image is not supported in CM yet
- os: macos-latest
python-version: "3.8"
- os: macos-13
python-version: "3.8"
include:
- os: windows-latest
extra-args: "--env.CM_MLPERF_LOADGEN_BUILD_FROM_SRC=off"
- os: macos-latest
docker: " --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes"
- os: macos-13
docker: " --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes"

steps:
- uses: actions/checkout@v3
@@ -47,6 +52,60 @@ jobs:
pip install cmind
cm pull repo --url=${{ github.event.pull_request.head.repo.html_url }} --checkout=${{ github.event.pull_request.head.ref }}
cm pull repo mlcommons@cm4abtf --branch=poc

- name: Install Docker on macos
if: runner.os == 'macOS-deactivated'
run: |
brew update
brew install --cask docker

- name: Start Docker Daemon on macos
if: runner.os == 'macOS-deactivated'
run: |
open /Applications/Docker.app
echo "Starting Docker, this may take a while..."

# Set max attempts and initial wait time
MAX_ATTEMPTS=20
WAIT_TIME=5

# Loop until Docker daemon is up or max attempts reached
attempt=1
while ! docker info > /dev/null 2>&1; do
echo "Attempt $attempt: Waiting for Docker to start..."
sleep $WAIT_TIME
attempt=$((attempt + 1))
WAIT_TIME=$((WAIT_TIME * 2)) # Exponential backoff

if [ $attempt -gt $MAX_ATTEMPTS ]; then
echo "Docker failed to start within the timeout period"
exit 1
fi
done

echo "Docker is up and running"

- name: Install Docker Desktop on Windows
if: runner.os == 'Windows-deactivated'
run: |
choco install docker-desktop --no-progress -y

- name: Start Docker Desktop on Windows
if: runner.os == 'Windows-deactivated'
run: |
Start-Process 'C:\Program Files\Docker\Docker\Docker Desktop.exe'
# Wait until Docker daemon is running
$retryCount = 0
while (!(docker info) -and ($retryCount -lt 10)) {
Write-Output "Waiting for Docker to start..."
Start-Sleep -Seconds 10
$retryCount++
}
if ($retryCount -ge 10) {
throw "Docker failed to start"
}
Write-Output "Docker is up and running"

- name: Test MLPerf Inference ABTF POC using ${{ matrix.backend }} on ${{ matrix.os }}
run: |
cm run script --tags=run-abtf,inference,_poc-demo --test_query_count=2 --adr.cocoeval.version_max=1.5.7 --adr.cocoeval.version_max_usable=1.5.7 --quiet ${{ matrix.extra-args }} -v
cm run script --tags=run-abtf,inference,_poc-demo --test_query_count=2 --adr.cocoeval.version_max=1.5.7 --adr.cocoeval.version_max_usable=1.5.7 --quiet ${{ matrix.extra-args }} ${{ matrix.docker }} -v
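
A note on the deactivated macOS and Windows steps above: both poll "docker info" until the daemon answers, the macOS variant with exponential backoff. A minimal Python sketch of the same wait pattern, assuming only that "docker info" exits non-zero until the daemon is ready:

import subprocess
import time

def wait_for_docker(max_attempts=20, wait_time=5):
    # Poll "docker info" with exponential backoff until the daemon is up.
    for attempt in range(1, max_attempts + 1):
        if subprocess.run(["docker", "info"], capture_output=True).returncode == 0:
            print("Docker is up and running")
            return True
        print(f"Attempt {attempt}: waiting {wait_time}s for Docker to start...")
        time.sleep(wait_time)
        wait_time *= 2  # exponential backoff, as in the shell step above
    print("Docker failed to start within the timeout period")
    return False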
4 changes: 2 additions & 2 deletions .github/workflows/test-nvidia-mlperf-implementation.yml
@@ -2,7 +2,7 @@ name: MLPerf Inference Nvidia implementations

on:
schedule:
- cron: "31 2 * * *" #to be adjusted
- cron: "29 20 * * *" #to be adjusted

jobs:
build_nvidia:
@@ -21,5 +21,5 @@ jobs:
source gh_action/bin/activate
export CM_REPOS=$HOME/GH_CM
pip install --upgrade cm4mlops
cm run script --tags=run-mlperf,inference,_all-scenarios,_submission,_full,_r4.1-dev --execution_mode=valid --gpu_name=rtx_4090 --model=${{ matrix.model }} --submitter="MLCommons" --hw_name=RTX4090x2 --implementation=nvidia --backend=tensorrt --category=datacenter,edge --division=closed --docker_dt=yes --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --device=cuda --use_dataset_from_host=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean --docker --quiet
cm run script --tags=run-mlperf,inference,_all-scenarios,_submission,_full,_r4.1-dev --preprocess_submission=yes --execution_mode=valid --gpu_name=rtx_4090 --pull_changes=yes --model=${{ matrix.model }} --submitter="MLCommons" --hw_name=RTX4090x2 --implementation=nvidia --backend=tensorrt --category=datacenter,edge --division=closed --docker_dt=yes --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --device=cuda --use_dataset_from_host=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean --docker --quiet
cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/mlperf_inference_unofficial_submissions_v5.0 --repo_branch=main --commit_message="Results from GH action on NVIDIA_RTX4090x2" --quiet --submission_dir=$HOME/gh_action_submissions --hw_name=RTX4090x2
6 changes: 3 additions & 3 deletions .github/workflows/test-scc24-sdxl.yaml
@@ -2,7 +2,7 @@ name: MLPerf inference SDXL (SCC)

on:
schedule:
- cron: "5 2 * * *"
- cron: "20 14 * * *"

jobs:
build_reference:
@@ -27,7 +27,7 @@ jobs:
pip install --upgrade cm4mlops
pip install tabulate
cm pull repo
cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=reference --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --precision=float16 --env.CM_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST=yes --clean
cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --pull_changes=yes --model=sdxl --implementation=reference --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --precision=float16 --env.CM_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST=yes --clean
cm run script --tags=run-mlperf,inference,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=reference --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --precision=float16 --env.CM_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST=yes --clean
cm run script --tags=generate,inference,submission --clean --preprocess_submission=yes --run-checker --tar=yes --env.CM_TAR_OUTFILE=submission.tar.gz --division=open --category=datacenter --run_style=test --adr.submission-checker.tags=_short-run --quiet --submitter=MLCommons --submission_dir=$HOME/scc_gh_action_submissions --results_dir=$HOME/scc_gh_action_results/test_results
cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/cm4mlperf-inference --repo_branch=mlperf-inference-results-scc24 --commit_message="Results from self hosted Github actions - NVIDIARTX4090" --quiet --submission_dir=$HOME/scc_gh_action_submissions
@@ -52,7 +52,7 @@ jobs:
pip install --upgrade cm4mlops
pip install tabulate
cm pull repo
cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=nvidia --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --pull_changes --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --env.CM_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST=yes --hw_name=go-spr --custom_system_nvidia=yes --clean
cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --pull_changes=yes --model=sdxl --implementation=nvidia --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --pull_changes --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --env.CM_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST=yes --hw_name=go-spr --custom_system_nvidia=yes --clean
cm run script --tags=run-mlperf,inference,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=nvidia --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --quiet --results_dir=$HOME/scc_gh_action_results --submission_dir=$HOME/scc_gh_action_submissions --precision=float16 --env.CM_MLPERF_MODEL_SDXL_DOWNLOAD_TO_HOST=yes --clean
cm run script --tags=generate,inference,submission --clean --preprocess_submission=yes --run-checker --tar=yes --env.CM_TAR_OUTFILE=submission.tar.gz --division=open --category=datacenter --run_style=test --adr.submission-checker.tags=_short-run --quiet --submitter=MLCommons --submission_dir=$HOME/scc_gh_action_submissions --results_dir=$HOME/scc_gh_action_results/test_results
cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/cm4mlperf-inference --repo_branch=mlperf-inference-results-scc24 --commit_message="Results from self hosted Github actions - NVIDIARTX4090" --quiet --submission_dir=$HOME/scc_gh_action_submissions
23 changes: 22 additions & 1 deletion automation/script/module.py
@@ -4010,6 +4010,19 @@ def update_deps(self, i):

return {'return':0}

##############################################################################
def update_state_from_meta(self, meta, env, state, const, const_state, deps, post_deps, prehook_deps, posthook_deps, new_env_keys, new_state_keys, i):
"""
Updates state and env from meta
Args:
"""

r = update_state_from_meta(meta, env, state, const, const_state, deps, post_deps, prehook_deps, posthook_deps, new_env_keys, new_state_keys, i)
if r['return']>0:
return r

return {'return':0}

##############################################################################
def get_default_path_list(self, i):
default_path_env_key = i.get('default_path_env_key', '')
@@ -5259,9 +5272,17 @@ def update_state_from_meta(meta, env, state, const, const_state, deps, post_deps
r4 = update_deps(posthook_deps, add_deps_info, True, env)
if r1['return']>0 and r2['return']>0 and r3['return'] > 0 and r4['return'] > 0: return r1

# i would have 'input' when called through cm.access
input_update_env = i.get('input', i)

input_mapping = meta.get('input_mapping', {})
if input_mapping:
update_env_from_input_mapping(env, i['input'], input_mapping)
update_env_from_input_mapping(env, input_update_env, input_mapping)

# handle dynamic env values
r = update_env_with_values(env)
if r['return']>0:
return r

# Possibly restrict this to within docker environment
add_deps_info = meta.get('ad', i.get('ad', {})) #we need to see input here
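
For context on the hunk above: when update_state_from_meta is reached through cm.access, the user-facing inputs sit under i['input'], hence the input_update_env fallback; update_env_from_input_mapping then copies each recognized input into its mapped CM_* env key. A hedged sketch of that mapping step (illustrative, not the actual CM source):

def update_env_from_input_mapping(env, inp, input_mapping):
    # Copy each provided script input (e.g. results_dir) into the
    # environment key it is mapped to (e.g. CM_MLPERF_INFERENCE_RESULTS_DIR_).
    for input_key, env_key in input_mapping.items():
        if inp.get(input_key) is not None:
            env[env_key] = inp[input_key]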
13 changes: 9 additions & 4 deletions automation/script/module_misc.py
@@ -1802,6 +1802,10 @@ def docker(i):
state['docker'] = docker_settings
add_deps_recursive = i.get('add_deps_recursive', {})

r = script_automation.update_state_from_meta(meta, env, state, const, const_state, deps = [], post_deps = [], prehook_deps = [], posthook_deps = [], new_env_keys = [], new_state_keys = [], i = i)
if r['return'] > 0:
return r

r = script_automation._update_state_from_variations(i, meta, variation_tags, variations, env, state, const, const_state, deps = [], post_deps = [], prehook_deps = [], posthook_deps = [], new_env_keys_from_meta = [], new_state_keys_from_meta = [], add_deps_recursive = add_deps_recursive, run_state = {}, recursion_spaces='', verbose = False)
if r['return'] > 0:
return r
@@ -1916,11 +1920,12 @@ def docker(i):
mounts[index] = new_host_mount+":"+new_container_mount
if host_env_key:
container_env_string += " --env.{}={} ".format(host_env_key, container_env_key)
# check if the below lines are needed when inputs are mapped to container paths
'''for v in docker_input_mapping:

for v in docker_input_mapping:
if docker_input_mapping[v] == host_env_key:
i[v] = container_env_key
'''
i[v] = container_env_key
i_run_cmd[v] = container_env_key

mounts = list(filter(lambda item: item is not None, mounts))

mount_string = "" if len(mounts)==0 else ",".join(mounts)
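
The re-enabled loop in the hunk above rewrites any script input whose mapped env key was just bound to a container mount, so the command rebuilt inside the container refers to container-side paths. An illustrative trace with hypothetical values:

# hypothetical values, for illustration only
docker_input_mapping = {"results_dir": "CM_MLPERF_INFERENCE_RESULTS_DIR_"}
host_env_key = "CM_MLPERF_INFERENCE_RESULTS_DIR_"
container_env_key = "/cm-mount/home/user/gh_action_results"
i = {"results_dir": "/home/user/gh_action_results"}
i_run_cmd = dict(i)

for v in docker_input_mapping:
    if docker_input_mapping[v] == host_env_key:
        i[v] = container_env_key          # input now names the mounted path
        i_run_cmd[v] = container_env_key  # the rebuilt run command uses it too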
4 changes: 4 additions & 0 deletions script/app-mlperf-inference-mlcommons-python/customize.py
@@ -290,6 +290,10 @@ def get_run_cmd_reference(os_info, env, scenario_extra_options, mode_extra_optio

elif "stable-diffusion-xl" in env['CM_MODEL']:
env['RUN_DIR'] = os.path.join(env['CM_MLPERF_INFERENCE_SOURCE'], "text_to_image")
if env.get('+PYTHONPATH', '') == '':
env['+PYTHONPATH'] = []
env['+PYTHONPATH'].append(os.path.join(env['CM_MLPERF_INFERENCE_SOURCE'], "text_to_image", "tools", "fid"))

backend = env['CM_MLPERF_BACKEND']
device = env['CM_MLPERF_DEVICE'] if env['CM_MLPERF_DEVICE'] not in [ "gpu", "rocm" ] else "cuda"
max_batchsize = env.get('CM_MLPERF_LOADGEN_MAX_BATCHSIZE', '1')
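
In CM scripts, env keys beginning with '+' conventionally collect list entries that are later joined into the corresponding variable (here PYTHONPATH). A hedged sketch of that flattening, assuming the usual path-separator join; the real logic lives in CM core:

import os

def flatten_plus_env(env):
    # Join list-valued '+VAR' keys into 'VAR' using the OS path separator.
    for key in [k for k in list(env) if k.startswith('+')]:
        var = key[1:]
        joined = os.pathsep.join(env[key])
        env[var] = joined + ((os.pathsep + env[var]) if env.get(var) else '')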
2 changes: 1 addition & 1 deletion script/app-mlperf-inference/_cm.yaml
@@ -1675,7 +1675,7 @@ docker:
skip_run_cmd: 'no'
shm_size: '32gb'
interactive: True
extra_run_args: ' --ulimit memlock=-1 --cap-add SYS_ADMIN --cap-add SYS_TIME --security-opt apparmor=unconfined --security-opt seccomp=unconfined'
extra_run_args: ' --dns 8.8.8.8 --dns 8.8.4.4 --ulimit memlock=-1 --cap-add SYS_ADMIN --cap-add SYS_TIME --security-opt apparmor=unconfined --security-opt seccomp=unconfined'
os: ubuntu
cm_repo: mlcommons@cm4mlops
cm_repo_branch: mlperf-inference
44 changes: 43 additions & 1 deletion script/generate-mlperf-inference-submission/_cm.json
@@ -53,6 +53,7 @@
"run_style": "CM_MLPERF_RUN_STYLE",
"skip_truncation": "CM_SKIP_TRUNCATE_ACCURACY",
"submission_dir": "CM_MLPERF_INFERENCE_SUBMISSION_DIR",
"submission_base_dir": "CM_MLPERF_INFERENCE_SUBMISSION_BASE_DIR",
"clean": "CM_MLPERF_CLEAN_SUBMISSION_DIR",
"hw_name": "CM_HW_NAME",
"sw_notes_extra": "CM_MLPERF_SUT_SW_NOTES_EXTRA",
@@ -120,5 +121,46 @@
"mlperf-inference-submission",
"mlcommons-inference-submission"
],
"uid": "5f8ab2d0b5874d53"
"uid": "5f8ab2d0b5874d53",
"docker": {
"use_host_group_id": true,
"use_host_user_id": true,
"real_run": false,
"deps": [
{
"tags": "get,mlperf,inference,results,dir,local",
"names": "get-mlperf-inference-results-dir",
"skip_if_env": {
"CM_MLPERF_INFERENCE_RESULTS_DIR_": [
"on"
]
}
},
{
"tags": "get,mlperf,inference,submission,dir,local",
"names": "get-mlperf-inference-submission-dir",
"skip_if_any_env": {
"CM_MLPERF_INFERENCE_SUBMISSION_BASE_DIR": [
"on"
]
}
}
],
"pre_run_cmds": [
"cm pull repo"
],
"mounts": [
"${{ CM_MLPERF_INFERENCE_SUBMISSION_BASE_DIR }}:${{ CM_MLPERF_INFERENCE_SUBMISSION_BASE_DIR }}",
"${{ CM_MLPERF_INFERENCE_RESULTS_DIR_ }}:${{ CM_MLPERF_INFERENCE_RESULTS_DIR_ }}"
],
"extra_run_args": " --cap-add SYS_ADMIN",
"os": "ubuntu",
"cm_repo": "mlcommons@cm4mlops",
"cm_repo_branch": "mlperf-inference",
"os_version": "22.04",
"docker_input_mapping": {
"submission_base_dir": "CM_MLPERF_INFERENCE_SUBMISSION_BASE_DIR",
"results_dir": "CM_MLPERF_INFERENCE_RESULTS_DIR_"
}
}
}
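
The two docker deps above are gated by skip_if_env and skip_if_any_env. A hedged reconstruction of how such gates are commonly evaluated (helper name and exact semantics are assumptions, not the CM source):

def dep_is_skipped(dep, env):
    # skip_if_env: skip when every listed key holds one of its listed values;
    # skip_if_any_env: skip when any listed key does.
    skip_all = dep.get('skip_if_env', {})
    if skip_all and all(env.get(k) in vals for k, vals in skip_all.items()):
        return True
    skip_any = dep.get('skip_if_any_env', {})
    if skip_any and any(env.get(k) in vals for k, vals in skip_any.items()):
        return True
    return False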
11 changes: 9 additions & 2 deletions script/generate-mlperf-inference-submission/customize.py
@@ -65,12 +65,19 @@ def generate_submission(i):
env['CM_MLPERF_INFERENCE_SUBMISSION_DIR'] = os.path.join(user_home, "mlperf_submission")

submission_dir = env.get('CM_MLPERF_INFERENCE_SUBMISSION_DIR', '')
if submission_dir == '':
submission_base_dir = env.get('CM_MLPERF_INFERENCE_SUBMISSION_BASE_DIR', '')
if submission_base_dir == '':
return {'return':1, 'error':f"Both CM_MLPERF_INFERENCE_SUBMISSION_DIR and CM_MLPERF_INFERENCE_SUBMISSION_BASE_DIR can not be empty!"}
else:
submission_dir = os.path.join(submission_base_dir, "mlperf_inference_submission")
env['CM_MLPERF_INFERENCE_SUBMISSION_DIR'] = submission_dir

if env.get('CM_MLPERF_CLEAN_SUBMISSION_DIR','')!='':
print ('=================================================')
print ('Cleaning {} ...'.format(env['CM_MLPERF_INFERENCE_SUBMISSION_DIR']))
if os.path.exists(env['CM_MLPERF_INFERENCE_SUBMISSION_DIR']):
shutil.rmtree(env['CM_MLPERF_INFERENCE_SUBMISSION_DIR'])
if os.path.exists(submission_dir):
shutil.rmtree(submission_dir)
print ('=================================================')

if not os.path.isdir(submission_dir):
3 changes: 3 additions & 0 deletions script/get-mlperf-inference-sut-description/_cm.json
@@ -4,6 +4,9 @@
"automation_uid": "5b4e0237da074764",
"cache": false,
"category": "MLPerf benchmark support",
"docker": {
"run": false
},
"deps": [
{
"tags": "detect,os"
@@ -39,5 +39,5 @@ def postprocess(i):
shutil.copytree(submission_dir, submission_backup)
shutil.rmtree(submission_dir)
os.rename(submission_processed, submission_dir)

return {'return':0}
2 changes: 1 addition & 1 deletion script/process-mlperf-accuracy/customize.py
@@ -92,7 +92,7 @@ def preprocess(i):


elif dataset == "coco2014":
env['+PYTHONPATH'] = [ os.path.join(env['CM_MLPERF_INFERENCE_SOURCE'], "text_to_image", "tools") ]
env['+PYTHONPATH'] = [ os.path.join(env['CM_MLPERF_INFERENCE_SOURCE'], "text_to_image", "tools") , os.path.join(env['CM_MLPERF_INFERENCE_SOURCE'], "text_to_image", "tools", "fid") ]
extra_options = ""

if env.get('CM_SDXL_STATISTICS_FILE_PATH', '') != '':
1 change: 1 addition & 0 deletions script/run-mlperf-inference-app/customize.py
@@ -203,6 +203,7 @@ def preprocess(i):
inp = {}
if str(docker_dt).lower() in ["yes", "true", "1"]:
env['CM_DOCKER_REUSE_EXISTING_CONTAINER'] = 'no' # turning it off for the first run and after that we turn it on
env['CM_DOCKER_DETACHED_MODE'] = 'yes'

if env.get('CM_DOCKER_IMAGE_NAME', '') != '':
docker_extra_input['docker_image_name'] = env['CM_DOCKER_IMAGE_NAME']
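
On the detached-mode default above: once --docker_dt=yes implies CM_DOCKER_DETACHED_MODE=yes, the container can outlive the first command and be reused. A hedged sketch of how such env toggles might map onto docker run flags (the mapping is assumed, not taken from the CM source):

def docker_run_mode_flags(env):
    # '-d' starts the container detached so later runs can reuse it;
    # otherwise attach an interactive terminal with '-it'.
    if str(env.get('CM_DOCKER_DETACHED_MODE', '')).lower() in ['yes', 'true', '1']:
        return ['-d']
    return ['-it']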