Benchmarks: Add Benchmark - Add GPU SM copy benchmark (#169)

**Description**
This commit adds the gpu_sm_copy benchmark and related tests.
This commit is contained in:
Ziyue Yang 2021-08-30 18:54:26 +08:00 коммит произвёл GitHub
Родитель de481cb0e8
Коммит b97197f08e
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
5 изменённых файлов: 218 добавлений и 1 удалений

Просмотреть файл

@ -0,0 +1,28 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""Micro benchmark example for GPU SM copy bandwidth performance.
Commands to run:
python3 examples/benchmarks/gpu_sm_copy_bw_performance.py
"""
from superbench.benchmarks import BenchmarkRegistry, Platform
from superbench.common.utils import logger
if __name__ == '__main__':
    # Benchmark selection and its command-line style parameters.
    benchmark_name = 'gpu-sm-copy-bw'
    benchmark_params = '--mem_type dtoh htod'

    context = BenchmarkRegistry.create_benchmark_context(
        benchmark_name, platform=Platform.CUDA, parameters=benchmark_params
    )
    # For ROCm environment, please specify the benchmark name and the platform as the following.
    # context = BenchmarkRegistry.create_benchmark_context(
    #     'gpu-sm-copy-bw', platform=Platform.ROCM, parameters='--mem_type dtoh htod'
    # )

    benchmark = BenchmarkRegistry.launch_benchmark(context)
    # launch_benchmark() may hand back a falsy value; only report when a benchmark object came back.
    if benchmark:
        summary = 'benchmark: {}, return code: {}, result: {}'.format(
            benchmark.name, benchmark.return_code, benchmark.result
        )
        logger.info(summary)

Просмотреть файл

@ -18,10 +18,11 @@ from superbench.benchmarks.micro_benchmarks.ib_loopback_performance import IBLoo
from superbench.benchmarks.micro_benchmarks.cuda_nccl_bw_performance import CudaNcclBwBenchmark
from superbench.benchmarks.micro_benchmarks.rocm_memory_bw_performance import RocmMemBwBenchmark
from superbench.benchmarks.micro_benchmarks.rocm_gemm_flops_performance import RocmGemmFlopsBenchmark
from superbench.benchmarks.micro_benchmarks.gpu_sm_copy_bw_performance import GpuSmCopyBwBenchmark
# Public names re-exported by the micro_benchmarks package.
# NOTE: every entry must be followed by a comma; two adjacent string literals
# are silently concatenated into a single (non-existent) name.
__all__ = [
    'MicroBenchmark', 'MicroBenchmarkWithInvoke', 'ShardingMatmul', 'ComputationCommunicationOverlap', 'KernelLaunch',
    'CublasBenchmark', 'CudnnBenchmark', 'GemmFlopsBenchmark', 'CudaGemmFlopsBenchmark', 'MemBwBenchmark',
    'CudaMemBwBenchmark', 'DiskBenchmark', 'IBLoopbackBenchmark', 'CudaNcclBwBenchmark', 'RocmMemBwBenchmark',
    'RocmGemmFlopsBenchmark', 'GpuSmCopyBwBenchmark'
]

Просмотреть файл

@ -0,0 +1,103 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""Module of the GPU SM Copy Bandwidth Performance benchmark."""
import os
from superbench.common.utils import logger
from superbench.benchmarks import BenchmarkRegistry, ReturnCode
from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
class GpuSmCopyBwBenchmark(MicroBenchmarkWithInvoke):
    """The GPU SM copy bandwidth performance benchmark class.

    Builds one ``gpu_sm_copy`` command line per requested memory type and
    parses the bandwidth value each command prints.
    """
    def __init__(self, name, parameters=''):
        """Constructor.

        Args:
            name (str): benchmark name.
            parameters (str): benchmark parameters.
        """
        super().__init__(name, parameters)

        # Name of the executable, resolved against --bin_dir in _preprocess().
        self._bin_name = 'gpu_sm_copy'
        # Memory types used as the default of --mem_type and shown in its help text.
        self._mem_types = ['htod', 'dtoh']

    def add_parser_arguments(self):
        """Add the specified arguments.

        Registers --mem_type (one or more values), --size and --num_loops on
        top of the arguments added by the parent class.
        """
        super().add_parser_arguments()

        self._parser.add_argument(
            '--mem_type',
            type=str,
            nargs='+',
            default=self._mem_types,
            help='Memory types for benchmark. E.g. {}.'.format(' '.join(self._mem_types)),
        )

        self._parser.add_argument(
            '--size',
            type=int,
            default=64 * 1024**2,
            required=False,
            help='Size of data buffer in bytes.',
        )

        self._parser.add_argument(
            '--num_loops',
            type=int,
            default=100,
            required=False,
            help='Number of data buffer copies performed.',
        )

    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.

        Appends one command to self._commands for every entry of --mem_type,
        in the order given, so that command index i corresponds to
        self._args.mem_type[i].

        Return:
            True if _preprocess() succeed.
        """
        if not super()._preprocess():
            return False

        self.__bin_path = os.path.join(self._args.bin_dir, self._bin_name)

        for mem_type in self._args.mem_type:
            # NOTE(review): the literal '0' is the binary's first positional argument -
            # presumably a device index; confirm against gpu_sm_copy's usage.
            command = '%s 0 %s %d %d' % \
                (self.__bin_path, mem_type, self._args.size, self._args.num_loops)
            self._commands.append(command)

        return True

    def _process_raw_result(self, cmd_idx, raw_output):
        """Function to parse raw results and save the summarized results.

        self._result.add_raw_data() and self._result.add_result() need to be called to save the results.

        Args:
            cmd_idx (int): the index of command corresponding with the raw_output.
            raw_output (str): raw output string of the micro-benchmark.

        Return:
            True if the raw output string is valid and result can be extracted.
        """
        self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output)

        output_prefix = 'Bandwidth (GB/s): '
        try:
            # Validate explicitly instead of using `assert`, which is removed under `python -O`.
            if not raw_output.startswith(output_prefix):
                raise ValueError('output does not start with "{}"'.format(output_prefix))
            self._result.add_result(self._args.mem_type[cmd_idx], float(raw_output[len(output_prefix):]))
        except Exception as e:
            # Catch Exception rather than BaseException so KeyboardInterrupt/SystemExit
            # are not reported as a parsing failure.
            self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'.format(
                    self._curr_run_index, self._name, raw_output, str(e)
                )
            )
            return False

        return True


BenchmarkRegistry.register_benchmark('gpu-sm-copy-bw', GpuSmCopyBwBenchmark)

Просмотреть файл

@ -59,6 +59,16 @@ superbench:
proc_num: 8
prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -c $(({proc_rank}/2))
parallel: yes
gpu-sm-copy-bw:
  enable: false
  modes:
    - name: local
      proc_num: 32
      prefix: CUDA_VISIBLE_DEVICES=$(({proc_rank}%8)) numactl -N $(({proc_rank}%4)) -m $(({proc_rank}%4))
      parallel: no
  parameters:
    # The benchmark's argument parser defines --mem_type (a list of memory types);
    # it has no --dtoh / --htod options.
    mem_type:
      - dtoh
      - htod
kernel-launch:
<<: *default_local_mode
gemm-flops:

Просмотреть файл

@ -0,0 +1,75 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""Tests for gpu-sm-copy-bw benchmark."""
import numbers
from tests.helper import decorator
from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform
def _test_gpu_sm_copy_bw_performance_impl(platform):
    """Run the gpu-sm-copy-bw benchmark on the given platform and verify its commands and results."""
    bench_name = 'gpu-sm-copy-bw'
    selection = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(bench_name, platform)
    bench_cls, _predefined = selection
    assert bench_cls

    buffer_size = 1048576
    loop_count = 10000
    directions = ['dtoh', 'htod']
    direction_arg = ' '.join(directions)
    cli_args = '--mem_type %s --size %d --num_loops %d' % (direction_arg, buffer_size, loop_count)
    benchmark = bench_cls(bench_name, parameters=cli_args)

    # Basic information is available once preprocessing succeeded.
    assert benchmark
    assert benchmark._preprocess() is True
    assert benchmark.return_code == ReturnCode.SUCCESS
    assert benchmark.name == bench_name
    assert benchmark.type == BenchmarkType.MICRO

    # The parsed arguments mirror the parameter string.
    parsed = benchmark._args
    assert parsed.mem_type == directions
    assert parsed.size == buffer_size
    assert parsed.num_loops == loop_count

    # One command per memory type, in the order requested.
    assert len(directions) == len(benchmark._commands)
    binary = benchmark._GpuSmCopyBwBenchmark__bin_path
    for actual_command, direction in zip(benchmark._commands, directions):
        expected_command = '%s 0 %s %d %d' % (binary, direction, buffer_size, loop_count)
        assert actual_command == expected_command

    # Execute the benchmark itself.
    assert benchmark._benchmark()

    # A single run must leave one raw output and one numeric metric per memory type.
    assert benchmark.run_count == 1
    assert benchmark.return_code == ReturnCode.SUCCESS
    for position, direction in enumerate(directions):
        raw_key = 'raw_output_%d' % position
        assert raw_key in benchmark.raw_data
        assert len(benchmark.raw_data[raw_key]) == 1
        assert isinstance(benchmark.raw_data[raw_key][0], str)

        assert direction in benchmark.result
        assert len(benchmark.result[direction]) == 1
        assert isinstance(benchmark.result[direction][0], numbers.Number)
@decorator.cuda_test
def test_gpu_sm_copy_bw_performance_cuda():
    """Run the shared gpu-sm-copy-bw test body against the CUDA platform."""
    _test_gpu_sm_copy_bw_performance_impl(platform=Platform.CUDA)
@decorator.rocm_test
def test_gpu_sm_copy_bw_performance_rocm():
    """Run the shared gpu-sm-copy-bw test body against the ROCm platform."""
    _test_gpu_sm_copy_bw_performance_impl(platform=Platform.ROCM)