microsoft · rafsalas19 · Feb 13, 2023 · Jan 27, 2023 · Jan 27, 2023 · Jan 30, 2023
@@ -122,6 +122,18 @@ RUN echo PATH="$PATH" > /etc/environment && \
     echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \
     echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment
 
+# Install STREAM: install the AOCC compiler, build stream.c three times (generic x86, znver3, znver4),
+# copy the resulting binaries to /usr/local/bin and remove the downloaded files from /tmp.
+# apt-get is used instead of apt because apt's command line interface is not stable for scripted use.
+RUN cd /tmp && \
+    wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-4.0.0_1_amd64.deb && \
+    apt-get install -y ./aocc-compiler-4.0.0_1_amd64.deb && \
+    wget https://www.cs.virginia.edu/stream/FTP/Code/stream.c && \
+    /opt/AMD/aocc-compiler-4.0.0/bin/clang -Ofast -mcmodel=large -mavx2 -ffp-contract=fast -lomp -fopenmp -fnt-store=aggressive -DSTREAM_ARRAY_SIZE=400000000 -DNTIMES=10 stream.c -o streamx86.exe && \
+    /opt/AMD/aocc-compiler-4.0.0/bin/clang -Ofast -mcmodel=large -mavx2 -ffp-contract=fast -march=znver3 -lomp -fopenmp -fnt-store=aggressive -DSTREAM_ARRAY_SIZE=400000000 -DNTIMES=10 stream.c -o streamZen3.exe && \
+    /opt/AMD/aocc-compiler-4.0.0/bin/clang -Ofast -mcmodel=large -mavx2 -ffp-contract=fast -march=znver4 -lomp -fopenmp -fnt-store=aggressive -DSTREAM_ARRAY_SIZE=800000000 -DNTIMES=10 stream.c -o streamZen4.exe && \
+    cp stream*.exe /usr/local/bin/ && \
+    rm -rf aocc-compiler-4.0.0_1_amd64.deb && \
+    rm -rf stream*
+
 # Add config files
 ADD dockerfile/etc /opt/microsoft/
 

@@ -102,6 +102,18 @@ RUN echo PATH="$PATH" > /etc/environment && \
     echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \
     echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment
 
+# Install STREAM: install the AOCC compiler, build stream.c three times (generic x86, znver3, znver4),
+# copy the resulting binaries to /usr/local/bin and remove the downloaded files from /tmp.
+# apt-get is used instead of apt because apt's command line interface is not stable for scripted use.
+RUN cd /tmp && \
+    wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-4.0.0_1_amd64.deb && \
+    apt-get install -y ./aocc-compiler-4.0.0_1_amd64.deb && \
+    wget https://www.cs.virginia.edu/stream/FTP/Code/stream.c && \
+    /opt/AMD/aocc-compiler-4.0.0/bin/clang -Ofast -mcmodel=large -mavx2 -ffp-contract=fast -lomp -fopenmp -fnt-store=aggressive -DSTREAM_ARRAY_SIZE=400000000 -DNTIMES=10 stream.c -o streamx86.exe && \
+    /opt/AMD/aocc-compiler-4.0.0/bin/clang -Ofast -mcmodel=large -mavx2 -ffp-contract=fast -march=znver3 -lomp -fopenmp -fnt-store=aggressive -DSTREAM_ARRAY_SIZE=400000000 -DNTIMES=10 stream.c -o streamZen3.exe && \
+    /opt/AMD/aocc-compiler-4.0.0/bin/clang -Ofast -mcmodel=large -mavx2 -ffp-contract=fast -march=znver4 -lomp -fopenmp -fnt-store=aggressive -DSTREAM_ARRAY_SIZE=800000000 -DNTIMES=10 stream.c -o streamZen4.exe && \
+    cp stream*.exe /usr/local/bin/ && \
+    rm -rf aocc-compiler-4.0.0_1_amd64.deb && \
+    rm -rf stream*
+
 # Add config files
 ADD dockerfile/etc /opt/microsoft/
 

@@ -102,6 +102,18 @@ RUN cd /tmp && \
     cp ./Linux/mlc /usr/local/bin/ && \
     rm -rf ./Linux mlc.tgz
 
+# Install STREAM: install the AOCC compiler, build stream.c three times (generic x86, znver3, znver4),
+# copy the resulting binaries to /usr/local/bin and remove the downloaded files from /tmp.
+# apt-get is used instead of apt because apt's command line interface is not stable for scripted use.
+RUN cd /tmp && \
+    wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-4.0.0_1_amd64.deb && \
+    apt-get install -y ./aocc-compiler-4.0.0_1_amd64.deb && \
+    wget https://www.cs.virginia.edu/stream/FTP/Code/stream.c && \
+    /opt/AMD/aocc-compiler-4.0.0/bin/clang -Ofast -mcmodel=large -mavx2 -ffp-contract=fast -lomp -fopenmp -fnt-store=aggressive -DSTREAM_ARRAY_SIZE=400000000 -DNTIMES=10 stream.c -o streamx86.exe && \
+    /opt/AMD/aocc-compiler-4.0.0/bin/clang -Ofast -mcmodel=large -mavx2 -ffp-contract=fast -march=znver3 -lomp -fopenmp -fnt-store=aggressive -DSTREAM_ARRAY_SIZE=400000000 -DNTIMES=10 stream.c -o streamZen3.exe && \
+    /opt/AMD/aocc-compiler-4.0.0/bin/clang -Ofast -mcmodel=large -mavx2 -ffp-contract=fast -march=znver4 -lomp -fopenmp -fnt-store=aggressive -DSTREAM_ARRAY_SIZE=800000000 -DNTIMES=10 stream.c -o streamZen4.exe && \
+    cp stream*.exe /usr/local/bin/ && \
+    rm -rf aocc-compiler-4.0.0_1_amd64.deb && \
+    rm -rf stream*
+
 # Install rccl-rdma-sharp-plugins
 ENV SHARP_VERSION=5.0
 RUN cd /opt/rocm && \

@@ -114,6 +114,18 @@ RUN cd /tmp && \
     cp ./Linux/mlc /usr/local/bin/ && \
     rm -rf ./Linux mlc.tgz
 
+# Install STREAM: install the AOCC compiler, build stream.c three times (generic x86, znver3, znver4),
+# copy the resulting binaries to /usr/local/bin and remove the downloaded files from /tmp.
+# apt-get is used instead of apt because apt's command line interface is not stable for scripted use.
+RUN cd /tmp && \
+    wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-4.0.0_1_amd64.deb && \
+    apt-get install -y ./aocc-compiler-4.0.0_1_amd64.deb && \
+    wget https://www.cs.virginia.edu/stream/FTP/Code/stream.c && \
+    /opt/AMD/aocc-compiler-4.0.0/bin/clang -Ofast -mcmodel=large -mavx2 -ffp-contract=fast -lomp -fopenmp -fnt-store=aggressive -DSTREAM_ARRAY_SIZE=400000000 -DNTIMES=10 stream.c -o streamx86.exe && \
+    /opt/AMD/aocc-compiler-4.0.0/bin/clang -Ofast -mcmodel=large -mavx2 -ffp-contract=fast -march=znver3 -lomp -fopenmp -fnt-store=aggressive -DSTREAM_ARRAY_SIZE=400000000 -DNTIMES=10 stream.c -o streamZen3.exe && \
+    /opt/AMD/aocc-compiler-4.0.0/bin/clang -Ofast -mcmodel=large -mavx2 -ffp-contract=fast -march=znver4 -lomp -fopenmp -fnt-store=aggressive -DSTREAM_ARRAY_SIZE=800000000 -DNTIMES=10 stream.c -o streamZen4.exe && \
+    cp stream*.exe /usr/local/bin/ && \
+    rm -rf aocc-compiler-4.0.0_1_amd64.deb && \
+    rm -rf stream*
+
 # Install rccl with commitid 6707a27
 RUN cd /tmp && \
     git clone https://github.com/ROCmSoftwarePlatform/rccl.git && \

@@ -171,6 +171,23 @@ Supports the use of double unit types and the use of tensor cores.
 | gpu-burn/gpu_[0-9]_pass | yes/no   | The result of the gpu-burn test for each GPU (1: yes, 0: no).                      |
 | gpu-burn/abort          | yes/no   | Whether or not GPU-burn test aborted before returning GPU results (1: yes, 0: no). |
 
+### `cpu-stream`
+
+#### Introduction
+
+Measure memory bandwidth and computation rate for simple vector kernels,
+performed by the [University of Virginia STREAM benchmark](https://www.cs.virginia.edu/stream/ref.html).
+
+#### Metrics
+
+| Name                                                     | Unit             | Description                                                         |
+|----------------------------------------------------------|------------------|---------------------------------------------------------------------|
+| cpu-stream/threads                                       |                  | Number of threads used for the test. Determined by core count.      |
+| cpu-stream/['copy', 'scale', 'add', 'triad']\_throughput | bandwidth (MB/s) | Memory throughput of designated kernel operation.                   |
+| cpu-stream/['copy', 'scale', 'add', 'triad']\_time_avg   | time (s)         | Average elapsed times over all iterations.                          |
+| cpu-stream/['copy', 'scale', 'add', 'triad']\_time_min   | time (s)         | Minimum elapsed times over all iterations.                          |
+| cpu-stream/['copy', 'scale', 'add', 'triad']\_time_max   | time (s)         | Maximum elapsed times over all iterations.                          |
+
 ## Communication Benchmarks
 
 ### `cpu-memory-bw-latency`

@@ -0,0 +1,27 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Micro benchmark example for CPU Stream performance.
+
+Commands to run:
+  python3 examples/benchmarks/cpu_stream_performance.py
+"""
+
+from superbench.benchmarks import BenchmarkRegistry
+from superbench.common.utils import logger
+
+
+if __name__ == '__main__':
+    context = BenchmarkRegistry.create_benchmark_context(
+        'cpu-stream',
+        parameters='--cpu_arch zen3 \
+        --cores 0 4 8 12 16 20 24 28 30 34 38 42 46 50 54 58 60 64 68 72 76 80 84 88 90 94 98 102 106 110 114 118'
+    )
+
+    benchmark = BenchmarkRegistry.launch_benchmark(context)
+    if benchmark:
+        logger.info(
+            'benchmark: {}, return code: {}, result: {}'.format(
+                benchmark.name, benchmark.return_code, benchmark.result
+            )
+        )
@@ -16,6 +16,7 @@
 from superbench.benchmarks.micro_benchmarks.cudnn_function import CudnnBenchmark
 from superbench.benchmarks.micro_benchmarks.disk_performance import DiskBenchmark
 from superbench.benchmarks.micro_benchmarks.cpu_memory_bw_latency_performance import CpuMemBwLatencyBenchmark
+from superbench.benchmarks.micro_benchmarks.cpu_stream_performance import CpuStreamBenchmark
 from superbench.benchmarks.micro_benchmarks.gpcnet_performance import GPCNetBenchmark
 from superbench.benchmarks.micro_benchmarks.gpu_copy_bw_performance import GpuCopyBwBenchmark
 from superbench.benchmarks.micro_benchmarks.gpu_burn_test import GpuBurnBenchmark
@@ -32,6 +33,7 @@
 __all__ = [
     'ComputationCommunicationOverlap',
     'CpuMemBwLatencyBenchmark',
+    'CpuStreamBenchmark',
     'CublasBenchmark',
     'CublasLtBenchmark',
     'CudaGemmFlopsBenchmark',

@@ -0,0 +1,132 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Module for running the University of Virginia STREAM tool. It measures sustainable main memory
+    bandwidth in MB/s and the corresponding computation rate for simple vector kernels."""
+
+import os
+
+from superbench.common.utils import logger
+from superbench.benchmarks import BenchmarkRegistry
+from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
+
+
+class CpuStreamBenchmark(MicroBenchmarkWithInvoke):
+    """The Stream benchmark class."""
+
+    def __init__(self, name, parameters=''):
+        """Constructor.
+
+        Args:
+            name (str): benchmark name.
+            parameters (str): benchmark parameters.
+        """
+        super().__init__(name, parameters)
+
+        self._bin_name = 'streamZen3.exe'
+        self.__cpu_arch = ['other', 'zen3', 'zen4']
+
+    def add_parser_arguments(self):
+        """Add the specified arguments."""
+        super().add_parser_arguments()
+
+        self._parser.add_argument(
+            '--cpu_arch',
+            type=str,
+            default='zen4',
+            required=False,
+            help='The targeted cpu architectures to run \
+                STREAM. Possible values are {}.'.format(' '.join(self.__cpu_arch))
+        )
+        self._parser.add_argument(
+            '--cores',
+            nargs='+',
+            type=int,
+            default=[0, 8, 16, 24, 32, 38, 44, 52, 60, 68, 76, 82, 88, 96,
+                     104, 112, 120, 126, 132, 140, 148, 156, 164, 170],
+            required=True,
+            help='List of cores to perform test'
+        )
+
+    def _preprocess(self):
+        """Preprocess/preparation operations before the benchmarking.
+
+        Return:
+            True if _preprocess() succeed.
+        """
+        if not super()._preprocess():
+            return False
+
+        # zen3
+        # cores=[0, 4, 8, 12, 16, 20, 24, 28, 30, 34, 38, 42, 46, 50,
+        # 54, 58, 60, 64, 68, 72, 76, 80, 84, 88, 90, 94, 98, 102, 106, 110, 114, 118]
+        # zen4
+        # cores=[0, 8, 16, 24, 32, 38, 44, 52, 60, 68, 76, 82, 88, 96, 104, 112, 120,
+        # 126, 132, 140, 148, 156, 164, 170]
+
+        # parse cores argument
+        omp_places = ''
+        for core in self._args.cores:
+            omp_places += '{' + '{}:1'.format(core) + '}'
+
+        envar = 'OMP_SCHEDULE=static && OMP_DYNAMIC=false && OMP_MAX_ACTIVE_LEVELS=1 && OMP_STACKSIZE=256M && \
+            OMP_PROC_BIND=true && OMP_NUM_THREADS={} && OMP_PLACES={}'.format(len(self._args.cores), omp_places)
+
+        if self._args.cpu_arch == 'zen3':
+            exe = 'streamZen3.exe'
+        elif self._args.cpu_arch == 'zen4':
+            exe = 'streamZen4.exe'
+        else:
+            exe = 'streamx86.exe'
+
+        command = envar + " " + os.path.join(self._args.bin_dir, exe)
+        self._bin_name = exe
+
+        if not self._set_binary_path():
+            logger.error(
+                'Executable {} not found in {} or it is not executable'.format(self._bin_name, self._args.bin_dir)
+            )
+            return False
+
+        self._commands.append(command)
+        return True
+
+    def _process_raw_result(self, cmd_idx, raw_output):
+        """Function to parse raw results and save the summarized results.
+
+          self._result.add_raw_data() and self._result.add_result() need to be called to save the results.
+
+        Args:
+            cmd_idx (int): the index of command corresponding with the raw_output.
+            raw_output (str): raw output string of the micro-benchmark.
+
+        Return:
+            True if the raw output string is valid and result can be extracted.
+        """
+        functions = ['Copy', 'Scale', 'Add', 'Triad']
+        records = []
+        content = raw_output.splitlines()
+        for line in content:
+            if "Number of Threads counted" in line:
+                line.split("= ")[1]
+                self._result.add_result("threads", int(line.split("= ")[1]))
+            for function in functions:
+                if function in line:
+                    records.append(line)
+
+        # individual results
+        for record in records:
+            entries = record.split()
+            metric = entries[0].strip().replace(':', '')
+            self._result.add_result(metric.lower() + "_throughput", float(entries[1].strip()))
+            self._result.add_result(metric.lower() + "_time_avg", float(entries[2].strip()))
+            self._result.add_result(metric.lower() + "_time_min", float(entries[3].strip()))
+            self._result.add_result(metric.lower() + "_time_max", float(entries[4].strip()))
+
+        # raw output
+        self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output, self._args.log_raw_data)
+
+        return True
+
+
+BenchmarkRegistry.register_benchmark('cpu-stream', CpuStreamBenchmark)
@@ -0,0 +1,67 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Tests for STREAM benchmark."""
+
+import unittest
+
+from tests.helper import decorator
+from tests.helper.testcase import BenchmarkTestCase
+from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform
+
+
+class CpuStreamBenchmarkTest(BenchmarkTestCase, unittest.TestCase):
+    """Test class for STREAM benchmark."""
+    @classmethod
+    def setUpClass(cls):
+        """Hook method for setting up class fixture before running tests in the class."""
+        super().setUpClass()
+        cls.createMockEnvs(cls)
+        cls.createMockFiles(cls, ['bin/streamZen3.exe'])
+        return True
+
+    @decorator.load_data('tests/data/streamResult.log')
+    def test_stream(self, results):
+        """Test STREAM benchmark command generation."""
+        benchmark_name = 'cpu-stream'
+        (benchmark_class,
+            predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CPU)
+        assert (benchmark_class)
+
+        cores = '0 4 8 12 16 20 24 28 30 34 38 42 46 50 54 58 60 64 68 72 76 80 84 88 90 94 98 102 106 110 114 118'
+        coresList = [0, 4, 8, 12, 16, 20, 24, 28, 30, 34, 38, 42, 46, 50, 54, 58, 60, 64, 68, 72,
+                     76, 80, 84, 88, 90, 94, 98, 102, 106, 110, 114, 118]
+        arch = 'zen3'
+        parameters = '--cpu_arch ' + arch + ' --cores ' + cores
+        benchmark = benchmark_class(benchmark_name, parameters=parameters)
+
+        # Check basic information
+        assert (benchmark)
+        ret = benchmark._preprocess()
+        assert (ret is True)
+        assert (benchmark.return_code == ReturnCode.SUCCESS)
+        assert (benchmark.name == benchmark_name)
+        assert (benchmark.type == BenchmarkType.MICRO)
+
+        # Check parameters specified in BenchmarkContext.
+        assert (benchmark._args.cores == coresList)
+        assert (benchmark._args.cpu_arch == arch)
+
+        # Check command
+        assert (1 == len(benchmark._commands))
+        assert ('OMP_PLACES' in benchmark._commands[0])
+
+        # Check results
+        assert (benchmark._process_raw_result(0, results))
+        assert (benchmark.result['return_code'][0] == 0)
+        functions = ['copy', 'scale', 'add', 'triad']
+        values = [342008.3, 342409.6, 343827.7, 363208.7]
+        for index in range(0, 4):
+            result = float(benchmark.result[functions[index] + '_throughput'][0])
+            print(result, values[index])
+            assert (result == values[index])
+        assert (int(benchmark.result['threads'][0]) == 32)
+
+
+# Run the tests in this module through unittest when the file is executed directly.
+if __name__ == '__main__':
+    unittest.main()
@@ -0,0 +1,33 @@
+-------------------------------------------------------------
+STREAM version $Revision: 5.10 $
+-------------------------------------------------------------
+This system uses 8 bytes per array element.
+-------------------------------------------------------------
+Array size = 400000000 (elements), Offset = 0 (elements)
+Memory per array = 3051.8 MiB (= 3.0 GiB).
+Total memory required = 9155.3 MiB (= 8.9 GiB).
+Each kernel will be executed 10 times.
+ The *best* time for each kernel (excluding the first iteration)
+ will be used to compute the reported bandwidth.
+-------------------------------------------------------------
+Number of Threads requested = 32
+Number of Threads counted = 32
+-------------------------------------------------------------
+Your clock granularity/precision appears to be 1 microseconds.
+Each test below will take on the order of 19105 microseconds.
+   (= 19105 clock ticks)
+Increase the size of the arrays if this shows that
+you are not getting at least 20 clock ticks per test.
+-------------------------------------------------------------
+WARNING -- The above is only a rough guideline.
+For best results, please be sure you know the
+precision of your system timer.
+-------------------------------------------------------------
+Function    Best Rate MB/s  Avg time     Min time     Max time
+Copy:          342008.3     0.018755     0.018713     0.018895
+Scale:         342409.6     0.018737     0.018691     0.018802
+Add:           343827.7     0.028050     0.027921     0.028269
+Triad:         363208.7     0.026599     0.026431     0.026855
+-------------------------------------------------------------
+Solution Validates: avg error less than 1.000000e-13 on all three arrays
+-------------------------------------------------------------