Merge branch 'release/0.8' into xiongyf/add-release-note-0.8
abuccts authored Apr 14, 2023
2 parents 2dcf60d + 7a16202 commit a65b73e
Showing 7 changed files with 61 additions and 51 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/build-image.yml
@@ -24,9 +24,9 @@ jobs:
     strategy:
       matrix:
         include:
-          - name: cuda11.8
-            dockerfile: cuda11.8
-            tags: superbench/main:cuda11.8
+          - name: cuda12.1
+            dockerfile: cuda12.1
+            tags: superbench/main:cuda12.1
           - name: cuda11.1.1
            dockerfile: cuda11.1.1
            tags: superbench/main:cuda11.1.1,superbench/superbench:latest
50 changes: 26 additions & 24 deletions dockerfile/cuda11.8.dockerfile → dockerfile/cuda12.1.dockerfile
@@ -1,18 +1,18 @@
-FROM nvcr.io/nvidia/pytorch:22.12-py3
+FROM nvcr.io/nvidia/pytorch:23.03-py3
 
 # OS:
 #   - Ubuntu: 20.04
 #   - OpenMPI: 4.1.5a1
 #   - Docker Client: 20.10.8
 # NVIDIA:
-#   - CUDA: 11.8.0
-#   - cuDNN: 8.7.0.84
-#   - NCCL: v2.15.5-1
+#   - CUDA: 12.1.0
+#   - cuDNN: 8.8.1.3
+#   - NCCL: v2.17.1-1
 # Mellanox:
-#   - OFED: 5.2-2.2.3.0
-#   - HPC-X: v2.8.3
+#   - OFED: 5.2-2.2.3.0 # TODO
+#   - HPC-X: v2.14
 # Intel:
-#   - mlc: v3.9a
+#   - mlc: v3.10
 
 LABEL maintainer="SuperBench"
@@ -71,37 +71,27 @@ RUN mkdir -p /root/.ssh && \
 # Install OFED
 ENV OFED_VERSION=5.2-2.2.3.0
 RUN cd /tmp && \
-    wget -q http://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \
+    wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \
     tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \
     MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force --all && \
     rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}*
 
 # Install HPC-X
+ENV HPCX_VERSION=v2.14
 RUN cd /opt && \
     rm -rf hpcx && \
-    wget -q https://azhpcstor.blob.core.windows.net/azhpc-images-store/hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tbz && \
-    tar xf hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tbz && \
-    ln -s hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64 hpcx && \
-    rm hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tbz
+    wget -q https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/hpcx-${HPCX_VERSION}-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda12-gdrcopy2-nccl2.17-x86_64.tbz -O hpcx.tbz && \
+    tar xf hpcx.tbz && \
+    mv hpcx-${HPCX_VERSION}-gcc-MLNX_OFED_LINUX-5-ubuntu20.04-cuda12-gdrcopy2-nccl2.17-x86_64 hpcx && \
+    rm hpcx.tbz
 
 # Install Intel MLC
 RUN cd /tmp && \
-    wget -q https://downloadmirror.intel.com/736634/mlc_v3.9a.tgz -O mlc.tgz && \
+    wget -q https://downloadmirror.intel.com/763324/mlc_v3.10.tgz -O mlc.tgz && \
     tar xzf mlc.tgz Linux/mlc && \
     cp ./Linux/mlc /usr/local/bin/ && \
     rm -rf ./Linux mlc.tgz
 
-ENV PATH="${PATH}" \
-    LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" \
-    SB_HOME=/opt/superbench \
-    SB_MICRO_PATH=/opt/superbench \
-    ANSIBLE_DEPRECATION_WARNINGS=FALSE \
-    ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections
-
-RUN echo PATH="$PATH" > /etc/environment && \
-    echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \
-    echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment
-
 # Install AOCC compiler
 RUN cd /tmp && \
     wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-4.0.0_1_amd64.deb && \
@@ -115,6 +105,18 @@ RUN cd /tmp && \
     mv amd-blis /opt/AMD && \
     rm -rf aocl-blis-linux-aocc-4.0.tar.gz
 
+
+ENV PATH="${PATH}" \
+    LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" \
+    SB_HOME=/opt/superbench \
+    SB_MICRO_PATH=/opt/superbench \
+    ANSIBLE_DEPRECATION_WARNINGS=FALSE \
+    ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections
+
+RUN echo PATH="$PATH" > /etc/environment && \
+    echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \
+    echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment
+
 # Add config files
 ADD dockerfile/etc /opt/microsoft/
24 changes: 12 additions & 12 deletions docs/user-tutorial/benchmarks/micro-benchmarks.md
@@ -180,11 +180,11 @@ Performed by [High-Performance Linpack Benchmark for Distributed-Memory Computer
 
 #### Metrics
 
-| Name                | Unit               | Description                                                                  |
-|---------------------|--------------------|------------------------------------------------------------------------------|
-| cpu-hpl/tests_pass  |                    | HPL completed running and correctness test has passed (1: pass, 0: fail).   |
-| cpu-hpl/throughput  | bandwidth (GFlops) | Compute bandwidth.                                                           |
-| cpu-hpl/time        | time (s)           | Time elapsed during HPL run.                                                 |
+| Name               | Unit               | Description                                                                 |
+|--------------------|--------------------|-----------------------------------------------------------------------------|
+| cpu-hpl/tests_pass |                    | HPL completed running and correctness test has passed (1: pass, 0: fail).  |
+| cpu-hpl/throughput | bandwidth (GFlops) | Compute bandwidth.                                                          |
+| cpu-hpl/time       | time (s)           | Time elapsed during HPL run.                                                |
 
 ### `cpu-stream`
 
@@ -216,13 +216,13 @@ performed by [Intel MLC Tool](https://www.intel.com/content/www/us/en/developer/
 
 | Name                                                                     | Unit             | Description                                                           |
 |--------------------------------------------------------------------------|------------------|-----------------------------------------------------------------------|
-| cpu-memory-bw-latency/mem\_bandwidth\_matrix\_numa\_[0-9]+\_[0-9]+\_bw   | bandwidth (GB/s) | Former NUMA to latter NUMA memory bandwidth.                          |
-| cpu-memory-bw-latency/mem\_bandwidth\_matrix\_numa\_[0-9]+\_[0-9]+\_lat  | time (us)        | Former NUMA to latter NUMA memory latency.                            |
-| cpu-memory-bw-latency/mem\_max\_bandwidth\_all\_reads\_bw                | bandwidth (GB/s) | Whole-CPU maximum memory bandwidth, full read.                        |
-| cpu-memory-bw-latency/mem\_max\_bandwidth\_3_1\_reads-writes\_bw         | bandwidth (GB/s) | Whole-CPU maximum memory bandwidth, read : write = 3 : 1.             |
-| cpu-memory-bw-latency/mem\_max\_bandwidth\_2_1\_reads-writes\_bw         | bandwidth (GB/s) | Whole-CPU maximum memory bandwidth, read : write = 2 : 1.             |
-| cpu-memory-bw-latency/mem\_max\_bandwidth\_1_1\_reads-writes\_bw         | bandwidth (GB/s) | Whole-CPU maximum memory bandwidth, read : write = 1 : 1.             |
-| cpu-memory-bw-latency/mem\_max\_bandwidth\_stream-triad\_like\_bw        | bandwidth (GB/s) | Whole-CPU maximum memory bandwidth, with stream-triad like pattern.   |
+| cpu-memory-bw-latency/mem\_bandwidth\_matrix\_numa\_[0-9]+\_[0-9]+\_bw   | bandwidth (MB/s) | Former NUMA to latter NUMA memory bandwidth.                          |
+| cpu-memory-bw-latency/mem\_bandwidth\_matrix\_numa\_[0-9]+\_[0-9]+\_lat  | time (ns)        | Former NUMA to latter NUMA memory latency.                            |
+| cpu-memory-bw-latency/mem\_max\_bandwidth\_all\_reads\_bw                | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, full read.                        |
+| cpu-memory-bw-latency/mem\_max\_bandwidth\_3_1\_reads-writes\_bw         | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, read : write = 3 : 1.             |
+| cpu-memory-bw-latency/mem\_max\_bandwidth\_2_1\_reads-writes\_bw         | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, read : write = 2 : 1.             |
+| cpu-memory-bw-latency/mem\_max\_bandwidth\_1_1\_reads-writes\_bw         | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, read : write = 1 : 1.             |
+| cpu-memory-bw-latency/mem\_max\_bandwidth\_stream-triad\_like\_bw        | bandwidth (MB/s) | Whole-CPU maximum memory bandwidth, with stream-triad like pattern.   |
 
 ### `mem-bw`
 
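Note on the unit fix above: Intel MLC reports bandwidth in MB/s and latency in ns, so the previous GB/s and us labels were wrong. The [0-9]+ placeholders in the matrix metric names expand to source and destination NUMA node indices; a minimal Python sketch of parsing such names (the metric values here are invented for illustration):

import re

# Hypothetical samples of metrics as reported after this change;
# only the naming scheme and units come from the table above.
metrics = {
    'cpu-memory-bw-latency/mem_bandwidth_matrix_numa_0_1_bw': 28934.2,  # MB/s
    'cpu-memory-bw-latency/mem_bandwidth_matrix_numa_0_1_lat': 139.8,   # ns
}

pattern = re.compile(r'mem_bandwidth_matrix_numa_(\d+)_(\d+)_(bw|lat)$')
for name, value in metrics.items():
    match = pattern.search(name)
    if match:
        src, dst, kind = match.groups()
        unit = 'MB/s' if kind == 'bw' else 'ns'
        print('NUMA {} -> NUMA {}: {} {}'.format(src, dst, value, unit))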
7 changes: 7 additions & 0 deletions superbench/benchmarks/model_benchmarks/model_base.py
@@ -78,6 +78,13 @@ def add_parser_arguments(self):
             required=False,
             help='The number of batch size.',
         )
+        self._parser.add_argument(
+            '--num_workers',
+            type=int,
+            default=8,
+            required=False,
+            help='Number of subprocesses to use for data loading.',
+        )
         self._parser.add_argument(
             '--precision',
             type=Precision,
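The new option defaults to 8, the value previously hard-coded in pytorch_base.py, so existing configurations keep their behavior. A standalone sketch of the same argparse pattern (plain ArgumentParser here, not SuperBench's own parser wrapper):

import argparse

# Reproduction of the option added above; the parsing semantics are plain argparse.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--num_workers',
    type=int,
    default=8,  # matches the value previously hard-coded in the data loader
    required=False,
    help='Number of subprocesses to use for data loading.',
)

print(parser.parse_args([]).num_workers)                      # 8 (default keeps old behavior)
print(parser.parse_args(['--num_workers', '4']).num_workers)  # 4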
2 changes: 1 addition & 1 deletion superbench/benchmarks/model_benchmarks/pytorch_base.py
@@ -181,7 +181,7 @@ def _init_dataloader(self):
             dataset=self._dataset,
             batch_size=self._args.batch_size,
             shuffle=False,
-            num_workers=8,
+            num_workers=self._args.num_workers,
             sampler=train_sampler,
             drop_last=True,
             pin_memory=self._args.pin_memory
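Together with the model_base.py change above, --num_workers now flows through to the PyTorch DataLoader. A self-contained sketch of the wiring (the dataset shape and batch size are invented, and the distributed train_sampler from the original code is omitted to keep it runnable standalone):

import torch
from torch.utils.data import DataLoader, TensorDataset

# Stand-in dataset; SuperBench builds a per-model dataset instead.
dataset = TensorDataset(torch.randn(256, 16), torch.randint(0, 2, (256,)))

def init_dataloader(batch_size=32, num_workers=8, pin_memory=False):
    # num_workers is now caller-controlled instead of hard-coded to 8.
    return DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        drop_last=True,
        pin_memory=pin_memory,
    )

loader = init_dataloader(num_workers=4)
print(len(loader))  # 8 full batches of 32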
21 changes: 10 additions & 11 deletions superbench/common/utils/gen_traffic_pattern_config.py
@@ -182,15 +182,14 @@ def gen_traffic_pattern_host_groups(host_list, pattern, mpi_pattern_path, benchmark_name):
         logger.error('Unsupported traffic pattern: {}'.format(pattern.type))
     host_groups = __convert_config_to_host_group(config, host_list)
     # write traffic pattern host groups to specified path
-    if pattern.mpi_pattern:
-        with open(mpi_pattern_path, 'a') as f:
-            f.write('benchmark_name: {} pattern_type: {}'.format(benchmark_name, pattern.type) + '\n')
-            for host_group in host_groups:
-                row = []
-                for host_list in host_group:
-                    group = ','.join(host_list)
-                    row.append(group)
-                group = ';'.join(row)
-                f.write(group + '\n')
-            f.write('\n')
+    with open(mpi_pattern_path, 'a') as f:
+        f.write('benchmark_name: {} pattern_type: {}'.format(benchmark_name, pattern.type) + '\n')
+        for host_group in host_groups:
+            row = []
+            for host_list in host_group:
+                group = ','.join(host_list)
+                row.append(group)
+            group = ';'.join(row)
+            f.write(group + '\n')
+        f.write('\n')
     return host_groups
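Behavior change: with the if pattern.mpi_pattern: guard removed, the host groups are now appended to the pattern file unconditionally. The format itself is unchanged: one header line per benchmark, then one line per host group, with ',' separating hosts within a list and ';' separating lists. A small sketch of one entry (the host names and pattern type are hypothetical):

# Hypothetical inputs mirroring the serialization loop above.
benchmark_name = 'nccl-bw'
pattern_type = 'all-nodes'
host_groups = [[['node0', 'node1'], ['node2', 'node3']]]

lines = ['benchmark_name: {} pattern_type: {}'.format(benchmark_name, pattern_type)]
for host_group in host_groups:
    lines.append(';'.join(','.join(hosts) for hosts in host_group))
lines.append('')
print('\n'.join(lines))
# benchmark_name: nccl-bw pattern_type: all-nodes
# node0,node1;node2,node3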
2 changes: 2 additions & 0 deletions tests/benchmarks/model_benchmarks/test_model_base.py
@@ -167,6 +167,7 @@ def test_arguments_related_interfaces():
   --no_gpu              Disable GPU training.
   --num_steps int       The number of test step.
   --num_warmup int      The number of warmup step.
+  --num_workers int     Number of subprocesses to use for data loading.
   --pin_memory          Enable option to pin memory in data loader.
   --precision Precision [Precision ...]
                         Model precision. E.g. fp8_hybrid fp8_e4m3 fp8_e5m2
@@ -206,6 +207,7 @@ def test_preprocess():
   --no_gpu              Disable GPU training.
   --num_steps int       The number of test step.
   --num_warmup int      The number of warmup step.
+  --num_workers int     Number of subprocesses to use for data loading.
   --pin_memory          Enable option to pin memory in data loader.
   --precision Precision [Precision ...]
                         Model precision. E.g. fp8_hybrid fp8_e4m3 fp8_e5m2
