1 change: 1 addition & 0 deletions dockerfile/cuda11.1.1.dockerfile
@@ -41,6 +41,7 @@ RUN apt-get update && \
libtinfo5 \
libtool \
lshw \
python3-mpi4py \
net-tools \
openssh-client \
openssh-server \
1 change: 1 addition & 0 deletions dockerfile/cuda12.2.dockerfile
@@ -41,6 +41,7 @@ RUN apt-get update && \
libtinfo5 \
libtool \
lshw \
python3-mpi4py \
net-tools \
openssh-client \
openssh-server \
3 changes: 2 additions & 1 deletion dockerfile/rocm5.0.x.dockerfile
@@ -41,6 +41,7 @@ RUN apt-get update && \
libtinfo5 \
libtool \
lshw \
python3-mpi4py \
net-tools \
numactl \
openssh-client \
@@ -136,7 +137,7 @@ RUN echo PATH="$PATH" > /etc/environment && \
WORKDIR ${SB_HOME}

ADD third_party third_party
RUN make -C third_party rocm -o rocm_hipblaslt
RUN make -C third_party rocm -o rocm_hipblaslt -o megatron_deepspeed -o megatron_lm

ADD . .
RUN python3 -m pip install --upgrade setuptools==65.7 && \
3 changes: 2 additions & 1 deletion dockerfile/rocm5.1.x.dockerfile
@@ -41,6 +41,7 @@ RUN apt-get update && \
libtinfo5 \
libtool \
lshw \
python3-mpi4py \
net-tools \
numactl \
openssh-client \
@@ -141,7 +142,7 @@ RUN echo PATH="$PATH" > /etc/environment && \
WORKDIR ${SB_HOME}

ADD third_party third_party
RUN make ROCBLAS_BRANCH=release/rocm-rel-5.1 -C third_party rocm -o rocm_hipblaslt
RUN make ROCBLAS_BRANCH=release/rocm-rel-5.1 -C third_party rocm -o rocm_hipblaslt -o megatron_deepspeed -o megatron_lm

ADD . .
RUN python3 -m pip install --no-cache-dir .[amdworker] && \
23 changes: 22 additions & 1 deletion docs/user-tutorial/benchmarks/model-benchmarks.md
@@ -37,8 +37,29 @@ For inference, supported percentiles include
| Name | Unit | Description |
|-----------------------------------------------------------------------------------------|------------------------|------------------------------------------------------------------------------|
| model-benchmarks/pytorch-${model_name}/${precision}_train_step_time | time (ms) | The average training step time with fp32/fp16 precision. |
| model-benchmarks/pytorch-${model_name}/${precision}_train_throughput | throughput (samples/s) | The average training throughput with fp32/fp16 precision. |
| model-benchmarks/pytorch-${model_name}/${precision}_train_throughput | throughput (samples/s) | The average training throughput per GPU with fp32/fp16 precision. |
| model-benchmarks/pytorch-${model_name}/${precision}_inference_step_time | time (ms) | The average inference step time with fp32/fp16 precision. |
| model-benchmarks/pytorch-${model_name}/${precision}_inference_throughput | throughput (samples/s) | The average inference throughput with fp32/fp16 precision. |
| model-benchmarks/pytorch-${model_name}/${precision}_inference_step_time\_${percentile} | time (ms) | The n<sup>th</sup> percentile inference step time with fp32/fp16 precision. |
| model-benchmarks/pytorch-${model_name}/${precision}_inference_throughput\_${percentile} | throughput (samples/s) | The n<sup>th</sup> percentile inference throughput with fp32/fp16 precision. |
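
For illustration, a minimal Python sketch of how the placeholders in the table above expand into concrete metric names; the helper function and the example model/precision are assumptions, not benchmark code.

```python
# Minimal sketch: expand the ${model_name}/${precision} placeholders from the
# table above into concrete metric names. The helper and example values are
# hypothetical; only the naming pattern follows this documentation.
def metric_name(model_name: str, precision: str, metric: str) -> str:
    return f'model-benchmarks/pytorch-{model_name}/{precision}_{metric}'


print(metric_name('gpt2-large', 'fp16', 'train_step_time'))   # average step time in ms
print(metric_name('gpt2-large', 'fp16', 'train_throughput'))  # samples/s, reported per GPU
```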


## Megatron Model Benchmarks

### `megatron-gpt`

#### Introduction

Run GPT pretraining tasks in float32, float16, or bfloat16 precision with [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) or [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed).

Tip: `batch_size` in this benchmark is the global batch size; the batch size on each GPU instance is `micro_batch_size`.
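
For example, under a typical Megatron-style data-parallel setup, the two batch sizes relate as shown in the sketch below; the numeric values are illustrative, not defaults of this benchmark.

```python
# Sketch of the relationship implied by the tip above (an assumption based on
# the usual Megatron convention, not a statement about SuperBench internals):
# global batch = per-GPU micro batch * data-parallel replicas * accumulation steps.
micro_batch_size = 2              # batch size processed by each GPU per micro-step (example)
data_parallel_size = 8            # number of data-parallel GPU instances (example)
gradient_accumulation_steps = 16  # micro-steps accumulated per optimizer step (example)

batch_size = micro_batch_size * data_parallel_size * gradient_accumulation_steps
print(batch_size)  # 256 -> the global batch size, i.e. `batch_size` in this benchmark
```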

#### Metrics
| Name | Unit | Description |
|---------------------------------------------------|------------------------|---------------------------------------------------------|
| megatron-gpt/${precision}_train_step_time | time (ms) | The average training step time per iteration. |
| megatron-gpt/${precision}_train_throughput | throughput (samples/s) | The average training throughput per iteration. |
| megatron-gpt/${precision}_train_tflops            | tflops/s               | The average training compute throughput in TFLOPS per iteration. |
| megatron-gpt/${precision}_train_mem_allocated | GB | The average GPU memory allocated per iteration. |
| megatron-gpt/${precision}_train_max_mem_allocated | GB | The average maximum GPU memory allocated per iteration. |
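
As a rough sanity check, the first two metrics relate approximately as in the sketch below, assuming throughput is derived from the global batch size and the step time; the benchmark's internal computation may differ, and the numbers are illustrative.

```python
# Hedged sketch relating the step-time and throughput metrics above; the
# values are examples, not measured results.
batch_size = 256            # global batch size per iteration (example)
train_step_time_ms = 520.0  # average training step time in ms (example)

train_throughput = batch_size / (train_step_time_ms / 1000.0)
print(f'{train_throughput:.1f} samples/s')  # ~492.3 samples/s
```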

1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -177,6 +177,7 @@ def run(self):
'xlrd>=2.0.1',
'xlsxwriter>=1.3.8',
'xmltodict>=0.12.0',
'types-requests',
],
extras_require=(
lambda x: {
3 changes: 2 additions & 1 deletion superbench/benchmarks/model_benchmarks/__init__.py
@@ -8,5 +8,6 @@
from superbench.benchmarks.model_benchmarks.pytorch_gpt2 import PytorchGPT2
from superbench.benchmarks.model_benchmarks.pytorch_cnn import PytorchCNN
from superbench.benchmarks.model_benchmarks.pytorch_lstm import PytorchLSTM
from superbench.benchmarks.model_benchmarks.megatron_gpt3 import MegatronGPT

__all__ = ['ModelBenchmark', 'PytorchBERT', 'PytorchGPT2', 'PytorchCNN', 'PytorchLSTM']
__all__ = ['ModelBenchmark', 'PytorchBERT', 'PytorchGPT2', 'PytorchCNN', 'PytorchLSTM', 'MegatronGPT']