Benchmarks: Add Benchmark - Add mlc benchmark to superbench (#216)

**Description**
Add the Intel MLC memory bandwidth and latency micro-benchmark to SuperBench.

**Major Revision**
- Add mlc benchmark with test and example files
Hossein Pourreza 2021-12-12 21:47:42 -08:00 committed by GitHub
Parent c403b1ca76
Commit b590409e0f
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
12 changed files: 428 additions and 0 deletions

View file

@@ -12,6 +12,8 @@ FROM nvcr.io/nvidia/pytorch:20.12-py3
# - OFED: 5.2-2.2.3.0
# - HPC-X: v2.8.3
# - NCCL RDMA SHARP plugins: 7cccbc1
# Intel:
# - mlc: v3.9a
LABEL maintainer="SuperBench"
@@ -97,6 +99,16 @@ RUN cd /tmp && \
    cd /tmp && \
    rm -rf nccl
# Install Intel MLC
RUN cd /tmp && \
    mkdir -p mlc && \
    cd mlc && \
    wget --user-agent="Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0" https://www.intel.com/content/dam/develop/external/us/en/documents/mlc_v3.9a.tgz && \
    tar xvf mlc_v3.9a.tgz && \
    cp ./Linux/mlc /usr/local/bin/ && \
    cd /tmp && \
    rm -rf mlc
ENV PATH="${PATH}" \
    LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" \
    SB_HOME="/opt/superbench" \

View file

@@ -11,6 +11,8 @@ FROM rocm/pytorch:rocm4.0_ubuntu18.04_py3.6_pytorch_1.7.0
# - RCCL: 2.7.8
# Mellanox:
# - OFED: 5.2-2.2.3.0
# Intel:
# - mlc: v3.9a
LABEL maintainer="SuperBench"
@@ -88,6 +90,16 @@ RUN cd /opt && \
    ln -s hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu18.04-x86_64 hpcx && \
    rm hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu18.04-x86_64.tbz
# Install Intel MLC
RUN cd /tmp && \
    mkdir -p mlc && \
    cd mlc && \
    wget --user-agent="Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0" https://www.intel.com/content/dam/develop/external/us/en/documents/mlc_v3.9a.tgz && \
    tar xvf mlc_v3.9a.tgz && \
    cp ./Linux/mlc /usr/local/bin/ && \
    cd /tmp && \
    rm -rf mlc
ENV PATH="${PATH}" \
    LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" \
    SB_HOME="/opt/superbench" \

View file

@@ -11,6 +11,8 @@ FROM rocm/pytorch:rocm4.2_ubuntu18.04_py3.6_pytorch_1.7.0
# - RCCL: 2.8.4
# Mellanox:
# - OFED: 5.2-2.2.3.0
# Intel:
# - mlc: v3.9a
LABEL maintainer="SuperBench"
@@ -88,6 +90,16 @@ RUN cd /opt && \
    ln -s hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu18.04-x86_64 hpcx && \
    rm hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu18.04-x86_64.tbz
# Install Intel MLC
RUN cd /tmp && \
    mkdir -p mlc && \
    cd mlc && \
    wget --user-agent="Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0" https://www.intel.com/content/dam/develop/external/us/en/documents/mlc_v3.9a.tgz && \
    tar xvf mlc_v3.9a.tgz && \
    cp ./Linux/mlc /usr/local/bin/ && \
    cd /tmp && \
    rm -rf mlc
ENV PATH="${PATH}" \
    LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" \
    SB_HOME="/opt/superbench" \

View file

@@ -0,0 +1,26 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""Micro benchmark example for CPU memory bandwidth and latency performance.

Commands to run:
  python3 examples/benchmarks/memory_bw_latency_performance.py
"""

from superbench.benchmarks import BenchmarkRegistry, Platform
from superbench.common.utils import logger

if __name__ == '__main__':
    context = BenchmarkRegistry.create_benchmark_context(
        'cpu-memory-bw-latency',
        platform=Platform.CPU,
        parameters='--tests bandwidth_matrix latency_matrix max_bandwidth'
    )

    benchmark = BenchmarkRegistry.launch_benchmark(context)
    if benchmark:
        logger.info(
            'benchmark: {}, return code: {}, result: {}'.format(
                benchmark.name, benchmark.return_code, benchmark.result
            )
        )
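For reference, a hedged sketch of inspecting the parsed metrics after the launch above; each value in `benchmark.result` is a list with one entry per run, and the metric names (e.g. `Mem_bandwidth_matrix_numa_0_0_BW`) follow the scheme used by the benchmark's result parser and its unit tests in this PR:

```python
# Hedged sketch: dump every parsed metric after launch_benchmark().
# Assumes `benchmark` is the object returned above.
if benchmark:
    for metric, values in benchmark.result.items():
        logger.info('metric: {}, values: {}'.format(metric, values))
```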

View file

@@ -14,6 +14,7 @@ from superbench.benchmarks.micro_benchmarks.cuda_memory_bw_performance import Cu
from superbench.benchmarks.micro_benchmarks.cuda_nccl_bw_performance import CudaNcclBwBenchmark
from superbench.benchmarks.micro_benchmarks.cudnn_function import CudnnBenchmark
from superbench.benchmarks.micro_benchmarks.disk_performance import DiskBenchmark
from superbench.benchmarks.micro_benchmarks.cpu_memory_bw_latency_performance import CpuMemBwLatencyBenchmark
from superbench.benchmarks.micro_benchmarks.gpcnet_performance import GPCNetBenchmark
from superbench.benchmarks.micro_benchmarks.gpu_copy_bw_performance import GpuCopyBwBenchmark
from superbench.benchmarks.micro_benchmarks.ib_loopback_performance import IBLoopbackBenchmark
@@ -34,6 +35,7 @@ __all__ = [
    'CudaNcclBwBenchmark',
    'CudnnBenchmark',
    'DiskBenchmark',
    'CpuMemBwLatencyBenchmark',
    'GPCNetBenchmark',
    'GemmFlopsBenchmark',
    'GpuCopyBwBenchmark',

View file

@@ -0,0 +1,150 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""Module for running the Intel MLC tool to measure memory bandwidth and latency."""

import os

from superbench.common.utils import logger
from superbench.benchmarks import BenchmarkRegistry, ReturnCode
from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke


class CpuMemBwLatencyBenchmark(MicroBenchmarkWithInvoke):
    """The Memory bandwidth and latency benchmark class."""
    def __init__(self, name, parameters=''):
        """Constructor.

        Args:
            name (str): benchmark name.
            parameters (str): benchmark parameters.
        """
        super().__init__(name, parameters)

        self._bin_name = 'mlc'
        self.__support_mlc_commands = ['bandwidth_matrix', 'latency_matrix', 'max_bandwidth']

    def add_parser_arguments(self):
        """Add the specified arguments."""
        super().add_parser_arguments()

        self._parser.add_argument(
            '--tests',
            type=str,
            nargs='+',
            default=['bandwidth_matrix'],
            required=False,
            help='The modes to run mlc with. Possible values are {}.'.format(' '.join(self.__support_mlc_commands))
        )

    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.

        Return:
            True if _preprocess() succeed.
        """
        if not super()._preprocess():
            return False

        mlc_path = os.path.join(self._args.bin_dir, self._bin_name)
        ret_val = os.access(mlc_path, os.X_OK | os.F_OK)
        if not ret_val:
            logger.error(
                'Executable {} not found in {} or it is not executable'.format(self._bin_name, self._args.bin_dir)
            )
            return False

        # the mlc command requires hugepages to be enabled
        mlc_wrapper = ' '.join(
            [
                'nr_hugepages=`cat /proc/sys/vm/nr_hugepages`;', 'echo 4000 > /proc/sys/vm/nr_hugepages;', '%s;',
                'err=$?;', 'echo ${nr_hugepages} > /proc/sys/vm/nr_hugepages;', '(exit $err)'
            ]
        )
        for test in self._args.tests:
            command = mlc_path + ' --%s' % test
            self._commands.append(mlc_wrapper % command)
        return True
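To make the hugepages wrapper above concrete, here is a minimal standalone sketch (not part of the PR) of how the `%s` placeholder in `mlc_wrapper` is filled in for each selected test; `mlc_path` below is an assumed install location, matching the dockerfiles in this PR:

```python
# Standalone sketch of the command construction in _preprocess() above.
mlc_path = '/usr/local/bin/mlc'  # assumed install location, per the dockerfiles
mlc_wrapper = ' '.join(
    [
        'nr_hugepages=`cat /proc/sys/vm/nr_hugepages`;', 'echo 4000 > /proc/sys/vm/nr_hugepages;', '%s;',
        'err=$?;', 'echo ${nr_hugepages} > /proc/sys/vm/nr_hugepages;', '(exit $err)'
    ]
)
command = mlc_path + ' --%s' % 'bandwidth_matrix'
print(mlc_wrapper % command)
# The wrapper saves the current nr_hugepages value, raises it to 4000 for the
# run, restores the saved value afterwards, and propagates mlc's exit code
# via the trailing (exit $err).
```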
    def _process_raw_result(self, cmd_idx, raw_output):
        """Function to parse raw results and save the summarized results.

        self._result.add_raw_data() and self._result.add_result() need to be called to save the results.

        Args:
            cmd_idx (int): the index of command corresponding with the raw_output.
            raw_output (str): raw output string of the micro-benchmark.

        Return:
            True if the raw output string is valid and result can be extracted.
        """
        self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output)

        # Parse the command to determine which mlc test this output belongs to.
        # The command has the form '...; mlc --option; ...', so the option needs to be extracted.
        if '--' in self._commands[cmd_idx]:
            mlc_test = self._commands[cmd_idx].split('--')[1]
        else:
            logger.error('The command {} is not well formed and is missing --'.format(self._commands[cmd_idx]))
            return False
        mlc_test = mlc_test.split(';')[0]
        if 'max_bandwidth' in mlc_test:
            measure = 'BW'
            out_table = self._parse_max_bw(raw_output)
        elif 'bandwidth_matrix' in mlc_test:
            measure = 'BW'
            out_table = self._parse_bw_latency(raw_output)
        elif 'latency_matrix' in mlc_test:
            measure = 'Latency'
            out_table = self._parse_bw_latency(raw_output)
        else:
            logger.error('Invalid option {} to run the {} command'.format(mlc_test, self._commands[cmd_idx]))
            return False
        if len(out_table) == 0:
            self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'.format(
                    self._curr_run_index, self._name, raw_output
                )
            )
            return False
        for key in out_table.keys():
            for index in range(len(out_table[key])):
                if 'max_bandwidth' in mlc_test:
                    metric = 'Mem_{}_{}_{}'.format(mlc_test, key, measure)
                else:
                    metric = 'Mem_{}_{}_{}_{}'.format(mlc_test, key, str(index), measure)
                self._result.add_result(metric, float(out_table[key][index]))
        return True
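A quick worked example of the metric-naming loop above, using values that appear in this PR's unit tests:

```python
# Worked example of the metric names emitted above (values from the tests).
# A bandwidth_matrix run yields a NUMA matrix:
out_table = {'numa_0': ['82542.2', '76679.9'], 'numa_1': ['76536.0', '82986.5']}
# -> Mem_bandwidth_matrix_numa_0_0_BW = 82542.2
#    Mem_bandwidth_matrix_numa_0_1_BW = 76679.9, and so on per column.
# A max_bandwidth run drops the column index, e.g.:
#    Mem_max_bandwidth_ALL_Reads_BW = 165400.60
```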
    def _parse_bw_latency(self, raw_output):
        """Parse the NUMA bandwidth/latency matrix from mlc's raw output."""
        out_table = dict()
        for line in raw_output.splitlines():
            if line.strip() == '':
                continue
            # only lines starting with a digit are of interest
            if line.lstrip()[0].isdigit():
                vals = line.split()
                if len(vals) < 2:
                    continue
                numa_index = 'numa_%s' % vals[0]
                out_table[numa_index] = vals[1:]
        return out_table
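A minimal sketch of the row-parsing logic above, fed the bandwidth-matrix lines used in the unit tests (header lines are skipped because they do not start with a digit):

```python
# Standalone sketch of the parsing loop in _parse_bw_latency() above.
sample_lines = [
    'Numa node',
    'Numa node 0 1',
    '0 82542.2 76679.9',
    '1 76536.0 82986.5',
]
out_table = {}
for line in sample_lines:
    if line.strip() and line.lstrip()[0].isdigit():
        vals = line.split()
        if len(vals) >= 2:
            out_table['numa_%s' % vals[0]] = vals[1:]
print(out_table)
# -> {'numa_0': ['82542.2', '76679.9'], 'numa_1': ['76536.0', '82986.5']}
```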
    def _parse_max_bw(self, raw_output):
        """Parse the per-ratio maximum bandwidth table from mlc's raw output."""
        out_table = dict()
        # the very last line is empty and only the last 5 lines of the output are of interest
        for line in raw_output.splitlines()[-6:]:
            if line.strip() == '':
                continue
            vals = line.split()
            if len(vals) < 2:
                continue
            key = '_'.join(vals[0:2]).rstrip(':').replace(':', '_')
            # making a list to be consistent with the _parse_bw_latency output
            out_table[key] = [vals[-1]]
        return out_table
BenchmarkRegistry.register_benchmark('cpu-memory-bw-latency', CpuMemBwLatencyBenchmark)
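Similarly, a standalone sketch of the key construction in `_parse_max_bw`, fed the last five lines of the max-bandwidth sample from this PR's unit tests:

```python
# Standalone sketch of the key construction in _parse_max_bw() above.
sample_lines = [
    'ALL Reads : 165400.60',
    '3:1 Reads-Writes : 154975.19',
    '2:1 Reads-Writes : 158433.32',
    '1:1 Reads-Writes : 157352.05',
    'Stream-triad like: 157878.32',
]
for line in sample_lines:
    vals = line.split()
    key = '_'.join(vals[0:2]).rstrip(':').replace(':', '_')
    print(key, [vals[-1]])
# -> ALL_Reads ['165400.60'], 3_1_Reads-Writes ['154975.19'], ...,
#    Stream-triad_like ['157878.32']
```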

View file

@@ -47,6 +47,17 @@ superbench:
        maxbytes: 8G
        ngpus: 8
        operation: allreduce
    cpu-memory-bw-latency:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: no
      parameters:
        tests:
          - bandwidth_matrix
          - latency_matrix
          - max_bandwidth
    mem-bw:
      enable: true
      modes:
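For context, the `tests:` list in the stanza above maps one-to-one onto the benchmark's `--tests` argument; a minimal sketch of the equivalent programmatic launch, mirroring the example file earlier in this PR:

```python
# Minimal sketch: the YAML stanza above is equivalent to this context,
# as in the memory_bw_latency_performance.py example in this PR.
from superbench.benchmarks import BenchmarkRegistry, Platform

context = BenchmarkRegistry.create_benchmark_context(
    'cpu-memory-bw-latency',
    platform=Platform.CPU,
    parameters='--tests bandwidth_matrix latency_matrix max_bandwidth'
)
```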

View file

@@ -48,6 +48,17 @@ superbench:
        maxbytes: 8G
        ngpus: 8
        operation: allreduce
    cpu-memory-bw-latency:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: no
      parameters:
        tests:
          - bandwidth_matrix
          - latency_matrix
          - max_bandwidth
    mem-bw:
      enable: true
      modes:

View file

@@ -58,6 +58,17 @@ superbench:
          proc_num: 4
          prefix: PROC_RANK={proc_rank} IB_DEVICES=1,3,5,7 NUMA_NODES=1,0,3,2
          parallel: yes
    cpu-memory-bw-latency:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: no
      parameters:
        tests:
          - bandwidth_matrix
          - latency_matrix
          - max_bandwidth
    mem-bw:
      enable: true
      modes:

View file

@@ -58,6 +58,17 @@ superbench:
          proc_num: 4
          prefix: PROC_RANK={proc_rank} IB_DEVICES=1,3,5,7 NUMA_NODES=1,0,3,2
          parallel: yes
    cpu-memory-bw-latency:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: no
      parameters:
        tests:
          - bandwidth_matrix
          - latency_matrix
          - max_bandwidth
    mem-bw:
      enable: true
      modes:

View file

@@ -60,6 +60,17 @@ superbench:
      parameters:
        block_devices:
          - /dev/nvme0n1
    cpu-memory-bw-latency:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: no
      parameters:
        tests:
          - bandwidth_matrix
          - latency_matrix
          - max_bandwidth
    mem-bw:
      enable: true
      modes:

View file

@@ -0,0 +1,159 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Tests for cpu-memory-bw-latency benchmark."""

from pathlib import Path
import os
import unittest

from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform


class CpuMemBwLatencyBenchmarkTest(unittest.TestCase):
    """Test class for cpu-memory-bw-latency benchmark."""
    def setUp(self):
        """Method called to prepare the test fixture."""
        # Create fake binary file just for testing.
        self.__curr_micro_path = os.environ.get('SB_MICRO_PATH', '')
        os.environ['SB_MICRO_PATH'] = '/tmp/superbench/'
        binary_path = Path(os.getenv('SB_MICRO_PATH'), 'bin')
        binary_path.mkdir(parents=True, exist_ok=True)
        self.__binary_file = binary_path / 'mlc'
        self.__binary_file.touch(mode=0o755, exist_ok=True)

    def tearDown(self):
        """Method called after the test method has been called and the result recorded."""
        self.__binary_file.unlink()
        os.environ['SB_MICRO_PATH'] = self.__curr_micro_path

    def test_cpu_mem_bw_latency_benchmark_empty_param(self):
        """Test cpu-memory-bw-latency benchmark command generation with empty parameter."""
        benchmark_name = 'cpu-memory-bw-latency'
        (benchmark_class,
         predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CPU)
        assert (benchmark_class)

        default_mlc_test = 'bandwidth_matrix'
        benchmark = benchmark_class(benchmark_name, parameters='')

        # Check basic information
        assert (benchmark)
        ret = benchmark._preprocess()
        assert (ret is True)
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        assert (benchmark.name == 'cpu-memory-bw-latency')
        assert (benchmark.type == BenchmarkType.MICRO)

        # Check commands
        assert (1 == len(benchmark._commands))
        assert ('mlc --%s;' % default_mlc_test in benchmark._commands[0])
    def test_cpu_mem_bw_latency_benchmark_result_parsing(self):
        """Test cpu-memory-bw-latency benchmark result parsing."""
        benchmark_name = 'cpu-memory-bw-latency'
        (benchmark_class,
         predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CPU)
        assert (benchmark_class)
        all_mlc_tests = ['bandwidth_matrix', 'latency_matrix', 'max_bandwidth']
        param_str = '--tests %s' % ' '.join(all_mlc_tests)
        benchmark = benchmark_class(benchmark_name, parameters=param_str)

        # Check basic information
        assert (benchmark)
        ret = benchmark._preprocess()
        assert (ret is True)
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        assert (benchmark.name == 'cpu-memory-bw-latency')
        assert (benchmark.type == BenchmarkType.MICRO)

        # Check commands
        assert (len(all_mlc_tests) == len(benchmark._commands))
        for mlc_test, command in zip(all_mlc_tests, benchmark._commands):
            assert ('mlc --%s;' % mlc_test in command)

        # Positive case - valid bandwidth matrix output.
        test_raw_output = """
Intel(R) Memory Latency Checker - v3.9a
Command line parameters: --bandwidth_matrix
Using buffer size of 100.000MiB/thread for reads and an additional 100.000MiB/thread for writes
*** Unable to modify prefetchers (try executing 'modprobe msr')
*** So, enabling random access for latency measurements
Measuring Memory Bandwidths between nodes within system
Bandwidths are in MB/sec (1 MB/sec = 1,000,000 Bytes/sec)
Using all the threads from each core if Hyper-threading is enabled
Using Read-only traffic type
Numa node
Numa node 0 1
0 82542.2 76679.9
1 76536.0 82986.5
"""
        assert (benchmark._process_raw_result(0, test_raw_output))
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        assert ('raw_output_0' in benchmark.raw_data)
        assert ([test_raw_output] == benchmark.raw_data['raw_output_0'])
        assert ([82542.2] == benchmark.result['Mem_bandwidth_matrix_numa_0_0_BW'])
        assert ([76679.9] == benchmark.result['Mem_bandwidth_matrix_numa_0_1_BW'])
        assert ([76536.0] == benchmark.result['Mem_bandwidth_matrix_numa_1_0_BW'])
        assert ([82986.5] == benchmark.result['Mem_bandwidth_matrix_numa_1_1_BW'])

        # Positive case - valid latency matrix output.
        test_raw_output = """
Intel(R) Memory Latency Checker - v3.9a
Command line parameters: --latency_matrix
Using buffer size of 600.000MiB
*** Unable to modify prefetchers (try executing 'modprobe msr')
*** So, enabling random access for latency measurements
Measuring idle latencies (in ns)...
Numa node
Numa node 0 1
0 87.0 101.0
1 101.9 86.9
"""
        assert (benchmark._process_raw_result(1, test_raw_output))
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        assert ('raw_output_1' in benchmark.raw_data)
        assert ([test_raw_output] == benchmark.raw_data['raw_output_1'])
        assert ([87.0] == benchmark.result['Mem_latency_matrix_numa_0_0_Latency'])
        assert ([101.0] == benchmark.result['Mem_latency_matrix_numa_0_1_Latency'])
        assert ([101.9] == benchmark.result['Mem_latency_matrix_numa_1_0_Latency'])
        assert ([86.9] == benchmark.result['Mem_latency_matrix_numa_1_1_Latency'])

        # Positive case - valid max bandwidth output.
        test_raw_output = """
Intel(R) Memory Latency Checker - v3.9a
Command line parameters: --max_bandwidth
Using buffer size of 100.000MiB/thread for reads and an additional 100.000MiB/thread for writes
*** Unable to modify prefetchers (try executing 'modprobe msr')
*** So, enabling random access for latency measurements
Measuring Maximum Memory Bandwidths for the system
Will take several minutes to complete as multiple injection rates will be tried to get the best bandwidth
Bandwidths are in MB/sec (1 MB/sec = 1,000,000 Bytes/sec)
Using all the threads from each core if Hyper-threading is enabled
Using traffic with the following read-write ratios
ALL Reads : 165400.60
3:1 Reads-Writes : 154975.19
2:1 Reads-Writes : 158433.32
1:1 Reads-Writes : 157352.05
Stream-triad like: 157878.32
"""
        assert (benchmark._process_raw_result(2, test_raw_output))
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        assert ('raw_output_2' in benchmark.raw_data)
        assert ([test_raw_output] == benchmark.raw_data['raw_output_2'])
        assert ([165400.60] == benchmark.result['Mem_max_bandwidth_ALL_Reads_BW'])
        assert ([154975.19] == benchmark.result['Mem_max_bandwidth_3_1_Reads-Writes_BW'])
        assert ([158433.32] == benchmark.result['Mem_max_bandwidth_2_1_Reads-Writes_BW'])
        assert ([157352.05] == benchmark.result['Mem_max_bandwidth_1_1_Reads-Writes_BW'])
        assert ([157878.32] == benchmark.result['Mem_max_bandwidth_Stream-triad_like_BW'])

        # Negative case - invalid raw output.
        assert (benchmark._process_raw_result(0, 'Invalid raw output') is False)
        assert (benchmark.return_code == ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
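Test modules in this style are normally picked up by pytest/unittest discovery; a common tail, if the file is also meant to be run directly (hedged: the upstream file may omit this), would be:

```python
# Hypothetical addition: allow running this test module directly.
if __name__ == '__main__':
    unittest.main()
```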