From 4c5417f7bbfd17357e261e1bfa98f1ad7fa29c2d Mon Sep 17 00:00:00 2001 From: Yifan Xiong Date: Wed, 12 Apr 2023 11:02:49 +0800 Subject: [PATCH 1/4] Add num_workers argument in model benchmark (#511) Change num_workers to configurable in model benchmark data loader. --- superbench/benchmarks/model_benchmarks/model_base.py | 7 +++++++ superbench/benchmarks/model_benchmarks/pytorch_base.py | 2 +- tests/benchmarks/model_benchmarks/test_model_base.py | 2 ++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/superbench/benchmarks/model_benchmarks/model_base.py b/superbench/benchmarks/model_benchmarks/model_base.py index 133ee76f4..a51c05850 100644 --- a/superbench/benchmarks/model_benchmarks/model_base.py +++ b/superbench/benchmarks/model_benchmarks/model_base.py @@ -78,6 +78,13 @@ def add_parser_arguments(self): required=False, help='The number of batch size.', ) + self._parser.add_argument( + '--num_workers', + type=int, + default=8, + required=False, + help='Number of subprocesses to use for data loading.', + ) self._parser.add_argument( '--precision', type=Precision, diff --git a/superbench/benchmarks/model_benchmarks/pytorch_base.py b/superbench/benchmarks/model_benchmarks/pytorch_base.py index ce1cca93b..f0cb52319 100644 --- a/superbench/benchmarks/model_benchmarks/pytorch_base.py +++ b/superbench/benchmarks/model_benchmarks/pytorch_base.py @@ -181,7 +181,7 @@ def _init_dataloader(self): dataset=self._dataset, batch_size=self._args.batch_size, shuffle=False, - num_workers=8, + num_workers=self._args.num_workers, sampler=train_sampler, drop_last=True, pin_memory=self._args.pin_memory diff --git a/tests/benchmarks/model_benchmarks/test_model_base.py b/tests/benchmarks/model_benchmarks/test_model_base.py index 926088aea..deba3a438 100644 --- a/tests/benchmarks/model_benchmarks/test_model_base.py +++ b/tests/benchmarks/model_benchmarks/test_model_base.py @@ -167,6 +167,7 @@ def test_arguments_related_interfaces(): --no_gpu Disable GPU training. --num_steps int The number of test step. --num_warmup int The number of warmup step. + --num_workers int Number of subprocesses to use for data loading. --pin_memory Enable option to pin memory in data loader. --precision Precision [Precision ...] Model precision. E.g. fp8_hybrid fp8_e4m3 fp8_e5m2 @@ -206,6 +207,7 @@ def test_preprocess(): --no_gpu Disable GPU training. --num_steps int The number of test step. --num_warmup int The number of warmup step. + --num_workers int Number of subprocesses to use for data loading. --pin_memory Enable option to pin memory in data loader. --precision Precision [Precision ...] Model precision. E.g. fp8_hybrid fp8_e4m3 fp8_e5m2 From 5a2adddc51f07290fd0a85b00c645de888485f1f Mon Sep 17 00:00:00 2001 From: Yifan Xiong Date: Wed, 12 Apr 2023 11:55:02 +0800 Subject: [PATCH 2/4] Remove unreachable condition when write host list (#512) Remove unreachable condition when write host list in mpi mode. --- .../utils/gen_traffic_pattern_config.py | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/superbench/common/utils/gen_traffic_pattern_config.py b/superbench/common/utils/gen_traffic_pattern_config.py index 97864784c..84a2e65d0 100644 --- a/superbench/common/utils/gen_traffic_pattern_config.py +++ b/superbench/common/utils/gen_traffic_pattern_config.py @@ -182,15 +182,14 @@ def gen_traffic_pattern_host_groups(host_list, pattern, mpi_pattern_path, benchm logger.error('Unsupported traffic pattern: {}'.format(pattern.type)) host_groups = __convert_config_to_host_group(config, host_list) # write traffic pattern host groups to specified path - if pattern.mpi_pattern: - with open(mpi_pattern_path, 'a') as f: - f.write('benchmark_name: {} pattern_type: {}'.format(benchmark_name, pattern.type) + '\n') - for host_group in host_groups: - row = [] - for host_list in host_group: - group = ','.join(host_list) - row.append(group) - group = ';'.join(row) - f.write(group + '\n') - f.write('\n') + with open(mpi_pattern_path, 'a') as f: + f.write('benchmark_name: {} pattern_type: {}'.format(benchmark_name, pattern.type) + '\n') + for host_group in host_groups: + row = [] + for host_list in host_group: + group = ','.join(host_list) + row.append(group) + group = ';'.join(row) + f.write(group + '\n') + f.write('\n') return host_groups From 17c01d8452af0c3776cebd43b49c2ddd537e3d14 Mon Sep 17 00:00:00 2001 From: Yifan Xiong Date: Wed, 12 Apr 2023 16:01:25 +0800 Subject: [PATCH 3/4] Update cuda11.8 image to cuda12.1 based on nvcr23.03 (#513) Update cuda11.8 image to cuda12.1 based on nvcr23.03 and related versions in the image: * cuda 11.8 -> 12.1 * nccl 2.15.5 -> 2.17.1 * hpcx: 2.8 -> 2.14 * mlc: 3.9a -> 3.10 --- .github/workflows/build-image.yml | 6 +-- ...uda11.8.dockerfile => cuda12.1.dockerfile} | 50 ++++++++++--------- 2 files changed, 29 insertions(+), 27 deletions(-) rename dockerfile/{cuda11.8.dockerfile => cuda12.1.dockerfile} (81%) diff --git a/.github/workflows/build-image.yml b/.github/workflows/build-image.yml index 6e599e9c7..824418a6f 100644 --- a/.github/workflows/build-image.yml +++ b/.github/workflows/build-image.yml @@ -24,9 +24,9 @@ jobs: strategy: matrix: include: - - name: cuda11.8 - dockerfile: cuda11.8 - tags: superbench/main:cuda11.8 + - name: cuda12.1 + dockerfile: cuda12.1 + tags: superbench/main:cuda12.1 - name: cuda11.1.1 dockerfile: cuda11.1.1 tags: superbench/main:cuda11.1.1,superbench/superbench:latest diff --git a/dockerfile/cuda11.8.dockerfile b/dockerfile/cuda12.1.dockerfile similarity index 81% rename from dockerfile/cuda11.8.dockerfile rename to dockerfile/cuda12.1.dockerfile index 7615b60e3..4a257bf43 100644 --- a/dockerfile/cuda11.8.dockerfile +++ b/dockerfile/cuda12.1.dockerfile @@ -1,18 +1,18 @@ -FROM nvcr.io/nvidia/pytorch:22.12-py3 +FROM nvcr.io/nvidia/pytorch:23.03-py3 # OS: # - Ubuntu: 20.04 # - OpenMPI: 4.1.5a1 # - Docker Client: 20.10.8 # NVIDIA: -# - CUDA: 11.8.0 -# - cuDNN: 8.7.0.84 -# - NCCL: v2.15.5-1 +# - CUDA: 12.1.0 +# - cuDNN: 8.8.1.3 +# - NCCL: v2.17.1-1 # Mellanox: -# - OFED: 5.2-2.2.3.0 -# - HPC-X: v2.8.3 +# - OFED: 5.2-2.2.3.0 # TODO +# - HPC-X: v2.14 # Intel: -# - mlc: v3.9a +# - mlc: v3.10 LABEL maintainer="SuperBench" @@ -71,37 +71,27 @@ RUN mkdir -p /root/.ssh && \ # Install OFED ENV OFED_VERSION=5.2-2.2.3.0 RUN cd /tmp && \ - wget -q http://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \ + wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \ tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \ MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force --all && \ rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}* # Install HPC-X +ENV HPCX_VERSION=v2.14 RUN cd /opt && \ rm -rf hpcx && \ - wget -q https://azhpcstor.blob.core.windows.net/azhpc-images-store/hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tbz && \ - tar xf hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tbz && \ - ln -s hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64 hpcx && \ - rm hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tbz + wget -q https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/hpcx-${HPCX_VERSION}-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda12-gdrcopy2-nccl2.17-x86_64.tbz -O hpcx.tbz && \ + tar xf hpcx.tbz && \ + mv hpcx-${HPCX_VERSION}-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda12-gdrcopy2-nccl2.17-x86_64 hpcx && \ + rm hpcx.tbz # Install Intel MLC RUN cd /tmp && \ - wget -q https://downloadmirror.intel.com/736634/mlc_v3.9a.tgz -O mlc.tgz && \ + wget -q https://downloadmirror.intel.com/763324/mlc_v3.10.tgz -O mlc.tgz && \ tar xzf mlc.tgz Linux/mlc && \ cp ./Linux/mlc /usr/local/bin/ && \ rm -rf ./Linux mlc.tgz -ENV PATH="${PATH}" \ - LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" \ - SB_HOME=/opt/superbench \ - SB_MICRO_PATH=/opt/superbench \ - ANSIBLE_DEPRECATION_WARNINGS=FALSE \ - ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections - -RUN echo PATH="$PATH" > /etc/environment && \ - echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \ - echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment - # Install AOCC compiler RUN cd /tmp && \ wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-4.0.0_1_amd64.deb && \ @@ -115,6 +105,18 @@ RUN cd /tmp && \ mv amd-blis /opt/AMD && \ rm -rf aocl-blis-linux-aocc-4.0.tar.gz + +ENV PATH="${PATH}" \ + LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" \ + SB_HOME=/opt/superbench \ + SB_MICRO_PATH=/opt/superbench \ + ANSIBLE_DEPRECATION_WARNINGS=FALSE \ + ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections + +RUN echo PATH="$PATH" > /etc/environment && \ + echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \ + echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment + # Add config files ADD dockerfile/etc /opt/microsoft/ From 7a162020a8342c81e2574937c84185a90869ace1 Mon Sep 17 00:00:00 2001 From: Yuting Jiang Date: Thu, 13 Apr 2023 12:26:23 +0800 Subject: [PATCH 4/4] Doc - Fix wrong unit of cpu-memory-bw-latency in doc (#515) **Description** Fix wrong unit of cpu-memory-bw-latency in doc. --- .../benchmarks/micro-benchmarks.md | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/docs/user-tutorial/benchmarks/micro-benchmarks.md b/docs/user-tutorial/benchmarks/micro-benchmarks.md index 2788a2815..b2e43db3f 100644 --- a/docs/user-tutorial/benchmarks/micro-benchmarks.md +++ b/docs/user-tutorial/benchmarks/micro-benchmarks.md @@ -180,11 +180,11 @@ Performed by [High-Performance Linpack Benchmark for Distributed-Memory Computer #### Metrics -| Name | Unit | Description | -|---------------------|--------------------|----------------------------------------------------------------------------| -| cpu-hpl/tests_pass | | HPL completed running and correctness test has passed (1: pass, 0: fail). | -| cpu-hpl/throughput | bandwidth (GFlops) | Compute bandwidth. | -| cpu-hpl/time | time (s) | Time elapsed during HPL run. | +| Name | Unit | Description | +|--------------------|--------------------|---------------------------------------------------------------------------| +| cpu-hpl/tests_pass | | HPL completed running and correctness test has passed (1: pass, 0: fail). | +| cpu-hpl/throughput | bandwidth (GFlops) | Compute bandwidth. | +| cpu-hpl/time | time (s) | Time elapsed during HPL run. | ### `cpu-stream` @@ -216,13 +216,13 @@ performed by [Intel MLC Tool](https://www.intel.com/content/www/us/en/developer/ | Name | Unit | Description | |-------------------------------------------------------------------------|------------------|---------------------------------------------------------------------| -| cpu-memory-bw-latency/mem\_bandwidth\_matrix\_numa\_[0-9]+\_[0-9]+\_bw | bandwidth (GB/s) | Former NUMA to latter NUMA memory bandwidth. | -| cpu-memory-bw-latency/mem\_bandwidth\_matrix\_numa\_[0-9]+\_[0-9]+\_lat | time (us) | Former NUMA to latter NUMA memory latency. | -| cpu-memory-bw-latency/mem\_max\_bandwidth\_all\_reads\_bw | bandwidth (GB/s) | Whole-CPU maximum memory bandwidth, full read. | -| cpu-memory-bw-latency/mem\_max\_bandwidth\_3_1\_reads-writes\_bw | bandwidth (GB/s) | Whole-CPU maximum memory bandwidth, read : write = 3 : 1. | -| cpu-memory-bw-latency/mem\_max\_bandwidth\_2_1\_reads-writes\_bw | bandwidth (GB/s) | Whole-CPU maximum memory bandwidth, read : write = 2 : 1. | -| cpu-memory-bw-latency/mem\_max\_bandwidth\_1_1\_reads-writes\_bw | bandwidth (GB/s) | Whole-CPU maximum memory bandwidth, read : write = 1 : 1. | -| cpu-memory-bw-latency/mem\_max\_bandwidth\_stream-triad\_like\_bw | bandwidth (GB/s) | Whole-CPU maximum memory bandwidth, with stream-triad like pattern. | +| cpu-memory-bw-latency/mem\_bandwidth\_matrix\_numa\_[0-9]+\_[0-9]+\_bw | bandwidth (MB/s) | Former NUMA to latter NUMA memory bandwidth. | +| cpu-memory-bw-latency/mem\_bandwidth\_matrix\_numa\_[0-9]+\_[0-9]+\_lat | time (ns) | Former NUMA to latter NUMA memory latency. | +| cpu-memory-bw-latency/mem\_max\_bandwidth\_all\_reads\_bw | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, full read. | +| cpu-memory-bw-latency/mem\_max\_bandwidth\_3_1\_reads-writes\_bw | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, read : write = 3 : 1. | +| cpu-memory-bw-latency/mem\_max\_bandwidth\_2_1\_reads-writes\_bw | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, read : write = 2 : 1. | +| cpu-memory-bw-latency/mem\_max\_bandwidth\_1_1\_reads-writes\_bw | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, read : write = 1 : 1. | +| cpu-memory-bw-latency/mem\_max\_bandwidth\_stream-triad\_like\_bw | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, with stream-triad like pattern. | ### `mem-bw`