From 4c5417f7bbfd17357e261e1bfa98f1ad7fa29c2d Mon Sep 17 00:00:00 2001
From: Yifan Xiong <yifan.xiong@microsoft.com>
Date: Wed, 12 Apr 2023 11:02:49 +0800
Subject: [PATCH 1/4] Add num_workers argument in model benchmark (#511)

Change num_workers to configurable in model benchmark data loader.
---
 superbench/benchmarks/model_benchmarks/model_base.py   | 7 +++++++
 superbench/benchmarks/model_benchmarks/pytorch_base.py | 2 +-
 tests/benchmarks/model_benchmarks/test_model_base.py   | 2 ++
 3 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/superbench/benchmarks/model_benchmarks/model_base.py b/superbench/benchmarks/model_benchmarks/model_base.py
index 133ee76f4..a51c05850 100644
--- a/superbench/benchmarks/model_benchmarks/model_base.py
+++ b/superbench/benchmarks/model_benchmarks/model_base.py
@@ -78,6 +78,13 @@ def add_parser_arguments(self):
             required=False,
             help='The number of batch size.',
         )
+        self._parser.add_argument(
+            '--num_workers',
+            type=int,
+            default=8,
+            required=False,
+            help='Number of subprocesses to use for data loading.',
+        )
         self._parser.add_argument(
             '--precision',
             type=Precision,
diff --git a/superbench/benchmarks/model_benchmarks/pytorch_base.py b/superbench/benchmarks/model_benchmarks/pytorch_base.py
index ce1cca93b..f0cb52319 100644
--- a/superbench/benchmarks/model_benchmarks/pytorch_base.py
+++ b/superbench/benchmarks/model_benchmarks/pytorch_base.py
@@ -181,7 +181,7 @@ def _init_dataloader(self):
             dataset=self._dataset,
             batch_size=self._args.batch_size,
             shuffle=False,
-            num_workers=8,
+            num_workers=self._args.num_workers,
             sampler=train_sampler,
             drop_last=True,
             pin_memory=self._args.pin_memory
diff --git a/tests/benchmarks/model_benchmarks/test_model_base.py b/tests/benchmarks/model_benchmarks/test_model_base.py
index 926088aea..deba3a438 100644
--- a/tests/benchmarks/model_benchmarks/test_model_base.py
+++ b/tests/benchmarks/model_benchmarks/test_model_base.py
@@ -167,6 +167,7 @@ def test_arguments_related_interfaces():
   --no_gpu              Disable GPU training.
   --num_steps int       The number of test step.
   --num_warmup int      The number of warmup step.
+  --num_workers int     Number of subprocesses to use for data loading.
   --pin_memory          Enable option to pin memory in data loader.
   --precision Precision [Precision ...]
                         Model precision. E.g. fp8_hybrid fp8_e4m3 fp8_e5m2
@@ -206,6 +207,7 @@ def test_preprocess():
   --no_gpu              Disable GPU training.
   --num_steps int       The number of test step.
   --num_warmup int      The number of warmup step.
+  --num_workers int     Number of subprocesses to use for data loading.
   --pin_memory          Enable option to pin memory in data loader.
   --precision Precision [Precision ...]
                         Model precision. E.g. fp8_hybrid fp8_e4m3 fp8_e5m2

From 5a2adddc51f07290fd0a85b00c645de888485f1f Mon Sep 17 00:00:00 2001
From: Yifan Xiong <yifan.xiong@microsoft.com>
Date: Wed, 12 Apr 2023 11:55:02 +0800
Subject: [PATCH 2/4] Remove unreachable condition when write host list (#512)

Remove unreachable condition when write host list in mpi mode.
---
 .../utils/gen_traffic_pattern_config.py       | 21 +++++++++----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/superbench/common/utils/gen_traffic_pattern_config.py b/superbench/common/utils/gen_traffic_pattern_config.py
index 97864784c..84a2e65d0 100644
--- a/superbench/common/utils/gen_traffic_pattern_config.py
+++ b/superbench/common/utils/gen_traffic_pattern_config.py
@@ -182,15 +182,14 @@ def gen_traffic_pattern_host_groups(host_list, pattern, mpi_pattern_path, benchm
         logger.error('Unsupported traffic pattern: {}'.format(pattern.type))
     host_groups = __convert_config_to_host_group(config, host_list)
     # write traffic pattern host groups to specified path
-    if pattern.mpi_pattern:
-        with open(mpi_pattern_path, 'a') as f:
-            f.write('benchmark_name: {} pattern_type: {}'.format(benchmark_name, pattern.type) + '\n')
-            for host_group in host_groups:
-                row = []
-                for host_list in host_group:
-                    group = ','.join(host_list)
-                    row.append(group)
-                group = ';'.join(row)
-                f.write(group + '\n')
-            f.write('\n')
+    with open(mpi_pattern_path, 'a') as f:
+        f.write('benchmark_name: {} pattern_type: {}'.format(benchmark_name, pattern.type) + '\n')
+        for host_group in host_groups:
+            row = []
+            for host_list in host_group:
+                group = ','.join(host_list)
+                row.append(group)
+            group = ';'.join(row)
+            f.write(group + '\n')
+        f.write('\n')
     return host_groups

From 17c01d8452af0c3776cebd43b49c2ddd537e3d14 Mon Sep 17 00:00:00 2001
From: Yifan Xiong <yifan.xiong@microsoft.com>
Date: Wed, 12 Apr 2023 16:01:25 +0800
Subject: [PATCH 3/4] Update cuda11.8 image to cuda12.1 based on nvcr23.03
 (#513)

Update cuda11.8 image to cuda12.1 based on nvcr23.03 and related versions in the image:
* cuda 11.8 -> 12.1
* nccl 2.15.5 -> 2.17.1
* hpcx: 2.8 -> 2.14
* mlc: 3.9a -> 3.10
---
 .github/workflows/build-image.yml             |  6 +--
 ...uda11.8.dockerfile => cuda12.1.dockerfile} | 50 ++++++++++---------
 2 files changed, 29 insertions(+), 27 deletions(-)
 rename dockerfile/{cuda11.8.dockerfile => cuda12.1.dockerfile} (81%)

diff --git a/.github/workflows/build-image.yml b/.github/workflows/build-image.yml
index 6e599e9c7..824418a6f 100644
--- a/.github/workflows/build-image.yml
+++ b/.github/workflows/build-image.yml
@@ -24,9 +24,9 @@ jobs:
     strategy:
       matrix:
         include:
-        - name: cuda11.8
-          dockerfile: cuda11.8
-          tags: superbench/main:cuda11.8
+        - name: cuda12.1
+          dockerfile: cuda12.1
+          tags: superbench/main:cuda12.1
         - name: cuda11.1.1
           dockerfile: cuda11.1.1
           tags: superbench/main:cuda11.1.1,superbench/superbench:latest
diff --git a/dockerfile/cuda11.8.dockerfile b/dockerfile/cuda12.1.dockerfile
similarity index 81%
rename from dockerfile/cuda11.8.dockerfile
rename to dockerfile/cuda12.1.dockerfile
index 7615b60e3..4a257bf43 100644
--- a/dockerfile/cuda11.8.dockerfile
+++ b/dockerfile/cuda12.1.dockerfile
@@ -1,18 +1,18 @@
-FROM nvcr.io/nvidia/pytorch:22.12-py3
+FROM nvcr.io/nvidia/pytorch:23.03-py3
 
 # OS:
 #   - Ubuntu: 20.04
 #   - OpenMPI: 4.1.5a1
 #   - Docker Client: 20.10.8
 # NVIDIA:
-#   - CUDA: 11.8.0
-#   - cuDNN: 8.7.0.84
-#   - NCCL: v2.15.5-1
+#   - CUDA: 12.1.0
+#   - cuDNN: 8.8.1.3
+#   - NCCL: v2.17.1-1
 # Mellanox:
-#   - OFED: 5.2-2.2.3.0
-#   - HPC-X: v2.8.3
+#   - OFED: 5.2-2.2.3.0 # TODO
+#   - HPC-X: v2.14
 # Intel:
-#   - mlc: v3.9a
+#   - mlc: v3.10
 
 LABEL maintainer="SuperBench"
 
@@ -71,37 +71,27 @@ RUN mkdir -p /root/.ssh && \
 # Install OFED
 ENV OFED_VERSION=5.2-2.2.3.0
 RUN cd /tmp && \
-    wget -q http://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \
+    wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \
     tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \
     MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force --all && \
     rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}*
 
 # Install HPC-X
+ENV HPCX_VERSION=v2.14
 RUN cd /opt && \
     rm -rf hpcx && \
-    wget -q https://azhpcstor.blob.core.windows.net/azhpc-images-store/hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tbz && \
-    tar xf hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tbz && \
-    ln -s hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64 hpcx && \
-    rm hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tbz
+    wget -q https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/hpcx-${HPCX_VERSION}-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda12-gdrcopy2-nccl2.17-x86_64.tbz -O hpcx.tbz && \
+    tar xf hpcx.tbz && \
+    mv hpcx-${HPCX_VERSION}-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda12-gdrcopy2-nccl2.17-x86_64 hpcx && \
+    rm hpcx.tbz
 
 # Install Intel MLC
 RUN cd /tmp && \
-    wget -q https://downloadmirror.intel.com/736634/mlc_v3.9a.tgz -O mlc.tgz && \
+    wget -q https://downloadmirror.intel.com/763324/mlc_v3.10.tgz -O mlc.tgz && \
     tar xzf mlc.tgz Linux/mlc && \
     cp ./Linux/mlc /usr/local/bin/ && \
     rm -rf ./Linux mlc.tgz
 
-ENV PATH="${PATH}" \
-    LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" \
-    SB_HOME=/opt/superbench \
-    SB_MICRO_PATH=/opt/superbench \
-    ANSIBLE_DEPRECATION_WARNINGS=FALSE \
-    ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections
-
-RUN echo PATH="$PATH" > /etc/environment && \
-    echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \
-    echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment
-
 # Install AOCC compiler
 RUN cd /tmp && \
     wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-4.0.0_1_amd64.deb && \
@@ -115,6 +105,18 @@ RUN cd /tmp && \
     mv amd-blis /opt/AMD && \
     rm -rf aocl-blis-linux-aocc-4.0.tar.gz
 
+
+ENV PATH="${PATH}" \
+    LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" \
+    SB_HOME=/opt/superbench \
+    SB_MICRO_PATH=/opt/superbench \
+    ANSIBLE_DEPRECATION_WARNINGS=FALSE \
+    ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections
+
+RUN echo PATH="$PATH" > /etc/environment && \
+    echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \
+    echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment
+
 # Add config files
 ADD dockerfile/etc /opt/microsoft/
 

From 7a162020a8342c81e2574937c84185a90869ace1 Mon Sep 17 00:00:00 2001
From: Yuting Jiang <yutingjiang@microsoft.com>
Date: Thu, 13 Apr 2023 12:26:23 +0800
Subject: [PATCH 4/4] Doc - Fix wrong unit of cpu-memory-bw-latency in doc
 (#515)

**Description**
Fix wrong unit of cpu-memory-bw-latency in doc.
---
 .../benchmarks/micro-benchmarks.md            | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/docs/user-tutorial/benchmarks/micro-benchmarks.md b/docs/user-tutorial/benchmarks/micro-benchmarks.md
index 2788a2815..b2e43db3f 100644
--- a/docs/user-tutorial/benchmarks/micro-benchmarks.md
+++ b/docs/user-tutorial/benchmarks/micro-benchmarks.md
@@ -180,11 +180,11 @@ Performed by [High-Performance Linpack Benchmark for Distributed-Memory Computer
 
 #### Metrics
 
-| Name                | Unit               | Description                                                                |
-|---------------------|--------------------|----------------------------------------------------------------------------|
-| cpu-hpl/tests_pass  |                    | HPL completed running and correctness test has passed (1: pass, 0: fail).  |
-| cpu-hpl/throughput  | bandwidth (GFlops) | Compute bandwidth.                                                         |
-| cpu-hpl/time        | time (s)           | Time elapsed during HPL run.                                               |
+| Name               | Unit               | Description                                                               |
+|--------------------|--------------------|---------------------------------------------------------------------------|
+| cpu-hpl/tests_pass |                    | HPL completed running and correctness test has passed (1: pass, 0: fail). |
+| cpu-hpl/throughput | bandwidth (GFlops) | Compute bandwidth.                                                        |
+| cpu-hpl/time       | time (s)           | Time elapsed during HPL run.                                              |
 
 ### `cpu-stream`
 
@@ -216,13 +216,13 @@ performed by [Intel MLC Tool](https://www.intel.com/content/www/us/en/developer/
 
 | Name                                                                    | Unit             | Description                                                         |
 |-------------------------------------------------------------------------|------------------|---------------------------------------------------------------------|
-| cpu-memory-bw-latency/mem\_bandwidth\_matrix\_numa\_[0-9]+\_[0-9]+\_bw  | bandwidth (GB/s) | Former NUMA to latter NUMA memory bandwidth.                        |
-| cpu-memory-bw-latency/mem\_bandwidth\_matrix\_numa\_[0-9]+\_[0-9]+\_lat | time (us)        | Former NUMA to latter NUMA memory latency.                          |
-| cpu-memory-bw-latency/mem\_max\_bandwidth\_all\_reads\_bw               | bandwidth (GB/s) | Whole-CPU maximum memory bandwidth, full read.                      |
-| cpu-memory-bw-latency/mem\_max\_bandwidth\_3_1\_reads-writes\_bw        | bandwidth (GB/s) | Whole-CPU maximum memory bandwidth, read : write = 3 : 1.           |
-| cpu-memory-bw-latency/mem\_max\_bandwidth\_2_1\_reads-writes\_bw        | bandwidth (GB/s) | Whole-CPU maximum memory bandwidth, read : write = 2 : 1.           |
-| cpu-memory-bw-latency/mem\_max\_bandwidth\_1_1\_reads-writes\_bw        | bandwidth (GB/s) | Whole-CPU maximum memory bandwidth, read : write = 1 : 1.           |
-| cpu-memory-bw-latency/mem\_max\_bandwidth\_stream-triad\_like\_bw       | bandwidth (GB/s) | Whole-CPU maximum memory bandwidth, with stream-triad like pattern. |
+| cpu-memory-bw-latency/mem\_bandwidth\_matrix\_numa\_[0-9]+\_[0-9]+\_bw  | bandwidth (MB/s) | Former NUMA to latter NUMA memory bandwidth.                        |
+| cpu-memory-bw-latency/mem\_bandwidth\_matrix\_numa\_[0-9]+\_[0-9]+\_lat | time (ns)        | Former NUMA to latter NUMA memory latency.                          |
+| cpu-memory-bw-latency/mem\_max\_bandwidth\_all\_reads\_bw               | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, full read.                      |
+| cpu-memory-bw-latency/mem\_max\_bandwidth\_3_1\_reads-writes\_bw        | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, read : write = 3 : 1.           |
+| cpu-memory-bw-latency/mem\_max\_bandwidth\_2_1\_reads-writes\_bw        | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, read : write = 2 : 1.           |
+| cpu-memory-bw-latency/mem\_max\_bandwidth\_1_1\_reads-writes\_bw        | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, read : write = 1 : 1.           |
+| cpu-memory-bw-latency/mem\_max\_bandwidth\_stream-triad\_like\_bw       | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, with stream-triad like pattern. |
 
 ### `mem-bw`