Benchmarks: Add Benchmark - Add GPU SM copy benchmark (#169)

**Description** This commit adds the gpu_sm_copy benchmark and its related tests.
2021-08-30 18:54:26 +08:00 · 2021-08-30 18:54:26 +08:00 · b97197f08e
--- a/examples/benchmarks/gpu_sm_copy_bw_performance.py
+++ b/examples/benchmarks/gpu_sm_copy_bw_performance.py
@ -0,0 +1,28 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Micro benchmark example for GPU SM copy bandwidth performance.
+
+Commands to run:
+  python3 examples/benchmarks/gpu_sm_copy_bw_performance.py
+"""
+
+from superbench.benchmarks import BenchmarkRegistry, Platform
+from superbench.common.utils import logger
+
+if __name__ == '__main__':
+    # Build a CUDA context that benchmarks the dtoh and htod memory types.
+    bench_context = BenchmarkRegistry.create_benchmark_context(
+        'gpu-sm-copy-bw', platform=Platform.CUDA, parameters='--mem_type dtoh htod'
+    )
+    # On ROCm, keep the benchmark name and switch the platform, e.g.:
+    # bench_context = BenchmarkRegistry.create_benchmark_context(
+    #     'gpu-sm-copy-bw', platform=Platform.ROCM, parameters='--mem_type dtoh htod'
+    # )
+
+    launched = BenchmarkRegistry.launch_benchmark(bench_context)
+    if launched:
+        summary = 'benchmark: {}, return code: {}, result: {}'.format(
+            launched.name, launched.return_code, launched.result
+        )
+        logger.info(summary)
--- a/superbench/benchmarks/micro_benchmarks/init.py
+++ b/superbench/benchmarks/micro_benchmarks/init.py
@ -18,10 +18,11 @@ from superbench.benchmarks.micro_benchmarks.ib_loopback_performance import IBLoo
 from superbench.benchmarks.micro_benchmarks.cuda_nccl_bw_performance import CudaNcclBwBenchmark
 from superbench.benchmarks.micro_benchmarks.rocm_memory_bw_performance import RocmMemBwBenchmark
 from superbench.benchmarks.micro_benchmarks.rocm_gemm_flops_performance import RocmGemmFlopsBenchmark
+from superbench.benchmarks.micro_benchmarks.gpu_sm_copy_bw_performance import GpuSmCopyBwBenchmark

 __all__ = [
    'MicroBenchmark', 'MicroBenchmarkWithInvoke', 'ShardingMatmul', 'ComputationCommunicationOverlap', 'KernelLaunch',
    'CublasBenchmark', 'CudnnBenchmark', 'GemmFlopsBenchmark', 'CudaGemmFlopsBenchmark', 'MemBwBenchmark',
    'CudaMemBwBenchmark', 'DiskBenchmark', 'IBLoopbackBenchmark', 'CudaNcclBwBenchmark', 'RocmMemBwBenchmark',
-    'RocmGemmFlopsBenchmark'
+    'RocmGemmFlopsBenchmark', 'GpuSmCopyBwBenchmark'
 ]
--- a/superbench/benchmarks/micro_benchmarks/gpu_sm_copy_bw_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/gpu_sm_copy_bw_performance.py
@ -0,0 +1,103 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Module of the GPU SM Copy Bandwidth Performance benchmark."""
+
+import os
+
+from superbench.common.utils import logger
+from superbench.benchmarks import BenchmarkRegistry, ReturnCode
+from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
+
+
+class GpuSmCopyBwBenchmark(MicroBenchmarkWithInvoke):
+    """Benchmark that builds one gpu_sm_copy command per memory type and records the bandwidth each one prints."""
+    def __init__(self, name, parameters=''):
+        """Constructor.
+
+        Args:
+            name (str): benchmark name.
+            parameters (str): benchmark parameters.
+        """
+        super().__init__(name, parameters)
+
+        self._bin_name = 'gpu_sm_copy'
+        self._mem_types = ['htod', 'dtoh']
+
+    def add_parser_arguments(self):
+        """Add the --mem_type, --size and --num_loops arguments on top of the inherited ones."""
+        super().add_parser_arguments()
+
+        self._parser.add_argument(
+            '--mem_type',
+            type=str,
+            nargs='+',
+            default=self._mem_types,
+            help='Memory types for benchmark. E.g. {}.'.format(' '.join(self._mem_types)),
+        )
+
+        self._parser.add_argument(
+            '--size',
+            type=int,
+            default=64 * 1024**2,
+            help='Size of data buffer in bytes.',
+        )
+
+        self._parser.add_argument(
+            '--num_loops',
+            type=int,
+            default=100,
+            help='Number of data buffer copies performed.',
+        )
+
+    def _preprocess(self):
+        """Resolve the binary path and queue one command per requested memory type.
+
+        Return:
+            True if _preprocess() succeed.
+        """
+        if not super()._preprocess():
+            return False
+
+        self.__bin_path = os.path.join(self._args.bin_dir, self._bin_name)
+
+        for mem_type in self._args.mem_type:
+            # NOTE(review): the literal 0 is presumably the device index expected by gpu_sm_copy - confirm.
+            command = '%s 0 %s %d %d' % (self.__bin_path, mem_type, self._args.size, self._args.num_loops)
+            self._commands.append(command)
+
+        return True
+
+    def _process_raw_result(self, cmd_idx, raw_output):
+        """Function to parse raw results and save the summarized results.
+
+          self._result.add_raw_data() and self._result.add_result() need to be called to save the results.
+
+        Args:
+            cmd_idx (int): the index of command corresponding with the raw_output.
+            raw_output (str): raw output string of the micro-benchmark.
+
+        Return:
+            True if the raw output is valid and the result was saved, False (with the return code set) otherwise.
+        """
+        self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output)
+
+        try:
+            output_prefix = 'Bandwidth (GB/s): '
+            # Raise explicitly: an `assert` would be stripped under `python -O` and skip the format check.
+            if not raw_output.startswith(output_prefix):
+                raise ValueError('raw output does not start with "{}"'.format(output_prefix))
+            self._result.add_result(self._args.mem_type[cmd_idx], float(raw_output[len(output_prefix):]))
+        except Exception as e:    # not BaseException: let KeyboardInterrupt/SystemExit propagate
+            self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
+            logger.error(
+                'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'.format(
+                    self._curr_run_index, self._name, raw_output, str(e)
+                )
+            )
+            return False
+
+        return True
+
+
+BenchmarkRegistry.register_benchmark('gpu-sm-copy-bw', GpuSmCopyBwBenchmark)  # makes the benchmark selectable by name
--- a/superbench/config/default.yaml
+++ b/superbench/config/default.yaml
@ -59,6 +59,16 @@ superbench:
          proc_num: 8
          prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -c $(({proc_rank}/2))
          parallel: yes
+    gpu-sm-copy-bw:
+      enable: false
+      modes:
+        - name: local
+          proc_num: 32
+          prefix: CUDA_VISIBLE_DEVICES=$(({proc_rank}%8)) numactl -N $(({proc_rank}%4)) -m $(({proc_rank}%4))
+          parallel: no
+      parameters:
+        # mem_type mirrors the benchmark's --mem_type argument; dtoh/htod are its values, not separate flags.
+        mem_type: [dtoh, htod]
    kernel-launch:
      <<: *default_local_mode
    gemm-flops:
--- a/tests/benchmarks/micro_benchmarks/test_gpu_sm_copy_bw_performance.py
+++ b/tests/benchmarks/micro_benchmarks/test_gpu_sm_copy_bw_performance.py
@ -0,0 +1,75 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Tests for gpu-sm-copy-bw benchmark."""
+
+import numbers
+
+from tests.helper import decorator
+from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform
+
+
+def _test_gpu_sm_copy_bw_performance_impl(platform):
+    """Run gpu-sm-copy-bw on the given platform and verify its arguments, commands and results."""
+    benchmark_name = 'gpu-sm-copy-bw'
+    selected = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, platform)
+    benchmark_class, _predefine_params = selected
+    assert benchmark_class
+
+    size = 1048576
+    num_loops = 10000
+    mem_types = ['dtoh', 'htod']
+
+    parameters = '--mem_type {} --size {:d} --num_loops {:d}'.format(' '.join(mem_types), size, num_loops)
+    benchmark = benchmark_class(benchmark_name, parameters=parameters)
+
+    # Check basic information after preprocessing.
+    assert benchmark
+    assert benchmark._preprocess() is True
+    assert benchmark.return_code == ReturnCode.SUCCESS
+    assert benchmark.name == benchmark_name
+    assert benchmark.type == BenchmarkType.MICRO
+
+    # The parsed arguments must match the values passed in the parameter string.
+    parsed = benchmark._args
+    assert parsed.mem_type == mem_types
+    assert parsed.size == size
+    assert parsed.num_loops == num_loops
+
+    # One command per memory type, in the order the types were given.
+    assert len(mem_types) == len(benchmark._commands)
+    bin_path = benchmark._GpuSmCopyBwBenchmark__bin_path
+    expected_commands = ['%s 0 %s %d %d' % (bin_path, mem_type, size, num_loops) for mem_type in mem_types]
+    for actual, expected in zip(benchmark._commands, expected_commands):
+        assert actual == expected
+
+    # Run benchmark
+    assert benchmark._benchmark()
+
+    # A single run is expected, with one raw output and one metric value per memory type.
+    assert benchmark.run_count == 1
+    assert benchmark.return_code == ReturnCode.SUCCESS
+    for idx, mem_type in enumerate(mem_types):
+        raw_output_key = 'raw_output_{}'.format(idx)
+        assert raw_output_key in benchmark.raw_data
+        raw_outputs = benchmark.raw_data[raw_output_key]
+        assert len(raw_outputs) == 1
+        assert isinstance(raw_outputs[0], str)
+
+        # Metrics are keyed by memory type.
+        assert mem_type in benchmark.result
+        metric_values = benchmark.result[mem_type]
+        assert len(metric_values) == 1
+        assert isinstance(metric_values[0], numbers.Number)
+
+
+@decorator.cuda_test
+def test_gpu_sm_copy_bw_performance_cuda():
+    """Exercise the shared gpu-sm-copy-bw test body on the CUDA platform."""
+    _test_gpu_sm_copy_bw_performance_impl(platform=Platform.CUDA)
+
+
+@decorator.rocm_test
+def test_gpu_sm_copy_bw_performance_rocm():
+    """Exercise the shared gpu-sm-copy-bw test body on the ROCm platform."""
+    _test_gpu_sm_copy_bw_performance_impl(platform=Platform.ROCM)