Benchmarks: Add Benchmark - Add GPU SM copy benchmark (#169)
**Description** This commit adds gpu_sm_copy benchmark and related tests.
This commit is contained in:
Родитель
de481cb0e8
Коммит
b97197f08e
|
@ -0,0 +1,28 @@
|
|||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT license.
|
||||
|
||||
"""Micro benchmark example for GPU SM copy bandwidth performance.
|
||||
|
||||
Commands to run:
|
||||
python3 examples/benchmarks/gpu_sm_copy_bw_performance.py
|
||||
"""
|
||||
|
||||
from superbench.benchmarks import BenchmarkRegistry, Platform
|
||||
from superbench.common.utils import logger
|
||||
|
||||
if __name__ == '__main__':
    # Build the context for the CUDA platform, measuring both copy directions.
    benchmark_parameters = '--mem_type dtoh htod'
    context = BenchmarkRegistry.create_benchmark_context(
        'gpu-sm-copy-bw', platform=Platform.CUDA, parameters=benchmark_parameters
    )
    # For ROCm environment, please specify the benchmark name and the platform as the following.
    # context = BenchmarkRegistry.create_benchmark_context(
    #     'gpu-sm-copy-bw', platform=Platform.ROCM, parameters='--mem_type dtoh htod'
    # )

    # Launch the benchmark and log its outcome when the launch produced a benchmark object.
    benchmark = BenchmarkRegistry.launch_benchmark(context)
    if benchmark:
        summary = 'benchmark: {}, return code: {}, result: {}'.format(
            benchmark.name, benchmark.return_code, benchmark.result
        )
        logger.info(summary)
|
|
@ -18,10 +18,11 @@ from superbench.benchmarks.micro_benchmarks.ib_loopback_performance import IBLoo
|
|||
from superbench.benchmarks.micro_benchmarks.cuda_nccl_bw_performance import CudaNcclBwBenchmark
|
||||
from superbench.benchmarks.micro_benchmarks.rocm_memory_bw_performance import RocmMemBwBenchmark
|
||||
from superbench.benchmarks.micro_benchmarks.rocm_gemm_flops_performance import RocmGemmFlopsBenchmark
|
||||
from superbench.benchmarks.micro_benchmarks.gpu_sm_copy_bw_performance import GpuSmCopyBwBenchmark
|
||||
|
||||
# Public names exported by the micro_benchmarks package.
# Every entry must be followed by a comma (or be the last one): two adjacent string literals
# without a comma are silently concatenated by Python into a single, non-existent name.
__all__ = [
    'MicroBenchmark', 'MicroBenchmarkWithInvoke', 'ShardingMatmul', 'ComputationCommunicationOverlap', 'KernelLaunch',
    'CublasBenchmark', 'CudnnBenchmark', 'GemmFlopsBenchmark', 'CudaGemmFlopsBenchmark', 'MemBwBenchmark',
    'CudaMemBwBenchmark', 'DiskBenchmark', 'IBLoopbackBenchmark', 'CudaNcclBwBenchmark', 'RocmMemBwBenchmark',
    'RocmGemmFlopsBenchmark', 'GpuSmCopyBwBenchmark'
]
|
||||
|
|
|
@ -0,0 +1,103 @@
|
|||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT license.
|
||||
|
||||
"""Module of the GPU SM Copy Bandwidth Performance benchmark."""
|
||||
|
||||
import os
|
||||
|
||||
from superbench.common.utils import logger
|
||||
from superbench.benchmarks import BenchmarkRegistry, ReturnCode
|
||||
from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
|
||||
|
||||
|
||||
class GpuSmCopyBwBenchmark(MicroBenchmarkWithInvoke):
    """The GPU SM copy bandwidth performance benchmark class.

    Builds one command line of the `gpu_sm_copy` binary per requested memory type and
    parses the bandwidth reported by each command into the benchmark result.
    """
    def __init__(self, name, parameters=''):
        """Constructor.

        Args:
            name (str): benchmark name.
            parameters (str): benchmark parameters.
        """
        super().__init__(name, parameters)

        self._bin_name = 'gpu_sm_copy'
        # Default copy directions; also used to build the --mem_type help text.
        self._mem_types = ['htod', 'dtoh']

    def add_parser_arguments(self):
        """Add the specified arguments."""
        super().add_parser_arguments()

        self._parser.add_argument(
            '--mem_type',
            type=str,
            nargs='+',
            default=self._mem_types,
            help='Memory types for benchmark. E.g. {}.'.format(' '.join(self._mem_types)),
        )

        self._parser.add_argument(
            '--size',
            type=int,
            default=64 * 1024**2,
            required=False,
            help='Size of data buffer in bytes.',
        )

        self._parser.add_argument(
            '--num_loops',
            type=int,
            default=100,
            required=False,
            help='Number of data buffer copies performed.',
        )

    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.

        Appends one command to self._commands for every entry of --mem_type, in the
        order given, so that command index i corresponds to self._args.mem_type[i].

        Return:
            True if _preprocess() succeed.
        """
        if not super()._preprocess():
            return False

        self.__bin_path = os.path.join(self._args.bin_dir, self._bin_name)

        for mem_type in self._args.mem_type:
            # NOTE(review): the literal 0 is the first positional argument of the binary,
            # presumably a device index - confirm against the gpu_sm_copy usage.
            command = '%s 0 %s %d %d' % \
                (self.__bin_path, mem_type, self._args.size, self._args.num_loops)
            self._commands.append(command)

        return True

    def _process_raw_result(self, cmd_idx, raw_output):
        """Function to parse raw results and save the summarized results.

        self._result.add_raw_data() and self._result.add_result() need to be called to save the results.

        Args:
            cmd_idx (int): the index of command corresponding with the raw_output.
            raw_output (str): raw output string of the micro-benchmark.

        Return:
            True if the raw output string is valid and result can be extracted.
        """
        self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output)

        output_prefix = 'Bandwidth (GB/s): '
        try:
            # Explicit check instead of `assert`: assertions are removed when Python runs
            # with -O, which would let malformed output slip through to float().
            if not raw_output.startswith(output_prefix):
                raise ValueError("raw output does not start with '{}'".format(output_prefix))
            self._result.add_result(self._args.mem_type[cmd_idx], float(raw_output[len(output_prefix):]))
        except Exception as e:
            # Catch Exception rather than BaseException so that KeyboardInterrupt and
            # SystemExit propagate instead of being reported as a parsing failure.
            self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'.format(
                    self._curr_run_index, self._name, raw_output, str(e)
                )
            )
            return False

        return True
|
||||
|
||||
|
||||
# Register the class under the name 'gpu-sm-copy-bw' so it can be looked up through BenchmarkRegistry.
BenchmarkRegistry.register_benchmark('gpu-sm-copy-bw', GpuSmCopyBwBenchmark)
|
|
@ -59,6 +59,16 @@ superbench:
|
|||
proc_num: 8
|
||||
prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -c $(({proc_rank}/2))
|
||||
parallel: yes
|
||||
gpu-sm-copy-bw:
  enable: false
  modes:
    - name: local
      proc_num: 32
      prefix: CUDA_VISIBLE_DEVICES=$(({proc_rank}%8)) numactl -N $(({proc_rank}%4)) -m $(({proc_rank}%4))
      parallel: no
  parameters:
    # The benchmark declares --mem_type (one or more values), --size and --num_loops;
    # it has no dtoh/htod flags, so the copy directions are listed under mem_type.
    mem_type:
      - dtoh
      - htod
|
||||
kernel-launch:
|
||||
<<: *default_local_mode
|
||||
gemm-flops:
|
||||
|
|
|
@ -0,0 +1,75 @@
|
|||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT license.
|
||||
|
||||
"""Tests for gpu-sm-copy-bw benchmark."""
|
||||
|
||||
import numbers
|
||||
|
||||
from tests.helper import decorator
|
||||
from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform
|
||||
|
||||
|
||||
def _test_gpu_sm_copy_bw_performance_impl(platform):
    """Run the gpu-sm-copy-bw benchmark end to end on the given platform and validate it.

    Args:
        platform (Platform): platform used to select the benchmark class from the registry.
    """
    benchmark_name = 'gpu-sm-copy-bw'
    benchmark_class, _ = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, platform)
    assert benchmark_class

    # Parameters under test: 1 MiB buffer, 10000 copies, both copy directions.
    buffer_size = 1024**2
    loop_count = 10000
    directions = ['dtoh', 'htod']

    parameters = '--mem_type {} --size {:d} --num_loops {:d}'.format(' '.join(directions), buffer_size, loop_count)
    benchmark = benchmark_class(benchmark_name, parameters=parameters)

    # Check basic information
    assert benchmark
    assert benchmark._preprocess() is True
    assert benchmark.return_code == ReturnCode.SUCCESS
    assert benchmark.name == benchmark_name
    assert benchmark.type == BenchmarkType.MICRO

    # Check parameters specified in BenchmarkContext.
    assert benchmark._args.mem_type == directions
    assert benchmark._args.size == buffer_size
    assert benchmark._args.num_loops == loop_count

    # Check command list: one command per memory type, in the order requested.
    assert len(directions) == len(benchmark._commands)
    bin_path = benchmark._GpuSmCopyBwBenchmark__bin_path
    for actual_command, direction in zip(benchmark._commands, directions):
        expected_command = '{} 0 {} {:d} {:d}'.format(bin_path, direction, buffer_size, loop_count)
        assert actual_command == expected_command

    # Run benchmark
    assert benchmark._benchmark()

    # Check results and metrics.
    assert benchmark.run_count == 1
    assert benchmark.return_code == ReturnCode.SUCCESS
    for position, direction in enumerate(directions):
        # Raw output is stored per command index.
        raw_key = 'raw_output_{:d}'.format(position)
        assert raw_key in benchmark.raw_data
        raw_entries = benchmark.raw_data[raw_key]
        assert len(raw_entries) == 1
        assert isinstance(raw_entries[0], str)

        # The parsed bandwidth is stored under the memory type name.
        assert direction in benchmark.result
        metric_values = benchmark.result[direction]
        assert len(metric_values) == 1
        assert isinstance(metric_values[0], numbers.Number)
|
||||
|
||||
|
||||
@decorator.cuda_test
def test_gpu_sm_copy_bw_performance_cuda():
    """Run the shared gpu-sm-copy-bw test body against the CUDA platform."""
    _test_gpu_sm_copy_bw_performance_impl(platform=Platform.CUDA)
|
||||
|
||||
|
||||
@decorator.rocm_test
def test_gpu_sm_copy_bw_performance_rocm():
    """Run the shared gpu-sm-copy-bw test body against the ROCm platform."""
    _test_gpu_sm_copy_bw_performance_impl(platform=Platform.ROCM)
|
Загрузка…
Ссылка в новой задаче