Benchmarks: Add Benchmark - Add memory bus bandwidth performance microbenchmark for amd (#153)
**Description** Add memory bus bandwidth performance microbenchmark for amd. **Major Revision** - Add memory bus bandwidth performance microbenchmark for amd. - Add related example and test file.
This commit is contained in:
Родитель
2880f71ef0
Коммит
666e3a9471
|
@ -0,0 +1,22 @@
|
|||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT license.
|
||||
|
||||
"""Micro benchmark example for device memory bandwidth performance.
|
||||
|
||||
Commands to run:
|
||||
python3 examples/benchmarks/rocm_memory_bw_performance.py
|
||||
"""
|
||||
|
||||
from superbench.benchmarks import BenchmarkRegistry, Platform
|
||||
from superbench.common.utils import logger
|
||||
|
||||
if __name__ == '__main__':
|
||||
context = BenchmarkRegistry.create_benchmark_context('mem-bw', platform=Platform.ROCM)
|
||||
|
||||
benchmark = BenchmarkRegistry.launch_benchmark(context)
|
||||
if benchmark:
|
||||
logger.info(
|
||||
'benchmark: {}, return code: {}, result: {}'.format(
|
||||
benchmark.name, benchmark.return_code, benchmark.result
|
||||
)
|
||||
)
|
|
@ -15,9 +15,10 @@ from superbench.benchmarks.micro_benchmarks.cuda_memory_bw_performance import Cu
|
|||
from superbench.benchmarks.micro_benchmarks.disk_performance import DiskBenchmark
|
||||
from superbench.benchmarks.micro_benchmarks.ib_loopback_performance import IBLoopbackBenchmark
|
||||
from superbench.benchmarks.micro_benchmarks.cuda_nccl_bw_performance import CudaNcclBwBenchmark
|
||||
from superbench.benchmarks.micro_benchmarks.rocm_memory_bw_performance import RocmMemBwBenchmark
|
||||
|
||||
__all__ = [
|
||||
'MicroBenchmark', 'MicroBenchmarkWithInvoke', 'ShardingMatmul', 'ComputationCommunicationOverlap', 'KernelLaunch',
|
||||
'CublasBenchmark', 'CudnnBenchmark', 'GemmFlopsCuda', 'MemBwBenchmark', 'CudaMemBwBenchmark', 'DiskBenchmark',
|
||||
'IBLoopbackBenchmark', 'CudaNcclBwBenchmark'
|
||||
'IBLoopbackBenchmark', 'CudaNcclBwBenchmark', 'RocmMemBwBenchmark'
|
||||
]
|
||||
|
|
|
@ -0,0 +1,96 @@
|
|||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT license.
|
||||
|
||||
"""Module of the ROCm memory performance benchmarks."""
|
||||
|
||||
import os
|
||||
|
||||
from superbench.common.utils import logger
|
||||
from superbench.benchmarks import BenchmarkRegistry, Platform
|
||||
from superbench.benchmarks.micro_benchmarks import MemBwBenchmark
|
||||
|
||||
|
||||
class RocmMemBwBenchmark(MemBwBenchmark):
|
||||
"""The ROCm memory performance benchmark class."""
|
||||
def __init__(self, name, parameters=''):
|
||||
"""Constructor.
|
||||
|
||||
Args:
|
||||
name (str): benchmark name.
|
||||
parameters (str): benchmark parameters.
|
||||
"""
|
||||
super().__init__(name, parameters)
|
||||
|
||||
self._bin_name = 'hipBusBandwidth'
|
||||
self._mem_types = ['htod', 'dtoh']
|
||||
self._parse_logline_map = {'htod': 'H2D_Bandwidth', 'dtoh': 'D2H_Bandwidth'}
|
||||
|
||||
def add_parser_arguments(self):
|
||||
"""Add the specified arguments."""
|
||||
super().add_parser_arguments()
|
||||
|
||||
def _preprocess(self):
|
||||
"""Preprocess/preparation operations before the benchmarking.
|
||||
|
||||
Return:
|
||||
True if _preprocess() succeed.
|
||||
"""
|
||||
if not super()._preprocess():
|
||||
return False
|
||||
|
||||
# Check the arguments and generate the commands
|
||||
for mem_type in self._args.mem_type:
|
||||
command = os.path.join(self._args.bin_dir, self._bin_name)
|
||||
command += ' --' + mem_type.replace('to', '2')
|
||||
if self._args.memory == 'unpinned':
|
||||
command += ' --unpinned'
|
||||
self._commands.append(command)
|
||||
|
||||
return True
|
||||
|
||||
def _process_raw_result(self, cmd_idx, raw_output):
|
||||
"""Function to parse raw results and save the summarized results.
|
||||
|
||||
self._result.add_raw_data() and self._result.add_result() need to be called to save the results.
|
||||
|
||||
Args:
|
||||
cmd_idx (int): the index of command corresponding with the raw_output.
|
||||
raw_output (str): raw output string of the micro-benchmark.
|
||||
|
||||
Return:
|
||||
True if the raw output string is valid and result can be extracted.
|
||||
"""
|
||||
self._result.add_raw_data('raw_output_' + self._args.mem_type[cmd_idx], raw_output)
|
||||
|
||||
mem_bw = -1
|
||||
value_index = -1
|
||||
size_index = -1
|
||||
valid = True
|
||||
content = raw_output.splitlines()
|
||||
try:
|
||||
parse_logline = self._parse_logline_map[self._args.mem_type[cmd_idx]]
|
||||
for line in content:
|
||||
if parse_logline in line and value_index != -1:
|
||||
line = line.split()
|
||||
mem_bw = float(line[value_index])
|
||||
metric = self._args.mem_type[cmd_idx] + '_' + line[size_index]
|
||||
self._result.add_result(metric, mem_bw)
|
||||
elif 'mean' in line:
|
||||
line = line.split()
|
||||
value_index = line.index('mean')
|
||||
size_index = line.index('atts')
|
||||
except BaseException:
|
||||
valid = False
|
||||
finally:
|
||||
if valid is False or mem_bw == -1:
|
||||
logger.error(
|
||||
'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'.format(
|
||||
self._curr_run_index, self._name, raw_output
|
||||
)
|
||||
)
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
BenchmarkRegistry.register_benchmark('mem-bw', RocmMemBwBenchmark, platform=Platform.ROCM)
|
|
@ -0,0 +1,169 @@
|
|||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
"""Tests for mem-bw benchmark."""
|
||||
|
||||
import numbers
|
||||
from pathlib import Path
|
||||
import os
|
||||
import unittest
|
||||
|
||||
from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform
|
||||
|
||||
|
||||
class RocmMemBwTest(unittest.TestCase):
|
||||
"""Test class for rocm mem-bw benchmark."""
|
||||
def setUp(self):
|
||||
"""Method called to prepare the test fixture."""
|
||||
# Create fake binary file just for testing.
|
||||
os.environ['SB_MICRO_PATH'] = '/tmp/superbench/'
|
||||
binary_path = os.path.join(os.getenv('SB_MICRO_PATH'), 'bin')
|
||||
Path(os.getenv('SB_MICRO_PATH'), 'bin').mkdir(parents=True, exist_ok=True)
|
||||
self.__binary_file = Path(binary_path, 'hipBusBandwidth')
|
||||
self.__binary_file.touch(mode=0o755, exist_ok=True)
|
||||
|
||||
def tearDown(self):
|
||||
"""Method called after the test method has been called and the result recorded."""
|
||||
self.__binary_file.unlink()
|
||||
|
||||
def test_rocm_memory_bw_performance(self):
|
||||
"""Test rocm mem-bw benchmark."""
|
||||
benchmark_name = 'mem-bw'
|
||||
(benchmark_class,
|
||||
predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.ROCM)
|
||||
assert (benchmark_class)
|
||||
|
||||
benchmark = benchmark_class(benchmark_name)
|
||||
|
||||
ret = benchmark._preprocess()
|
||||
assert (ret is True)
|
||||
assert (benchmark.return_code == ReturnCode.SUCCESS)
|
||||
|
||||
# Check basic information.
|
||||
assert (benchmark)
|
||||
assert (benchmark.name == 'mem-bw')
|
||||
assert (benchmark.type == BenchmarkType.MICRO)
|
||||
|
||||
# Check command list
|
||||
expected_command = ['hipBusBandwidth --h2d', 'hipBusBandwidth --d2h']
|
||||
for i in range(len(expected_command)):
|
||||
commnad = benchmark._bin_name + benchmark._commands[i].split(benchmark._bin_name)[1]
|
||||
assert (commnad == expected_command[i])
|
||||
|
||||
# Check results and metrics.
|
||||
raw_output = {}
|
||||
raw_output[0] = """
|
||||
Device:Device 738c Mem=32.0GB #CUs=120 Freq=1502Mhz MallocMode=pinned
|
||||
test atts units median mean stddev min max
|
||||
H2D_Bandwidth_pinned +064By GB/sec 0.0000 0.0000 0.0000 0.0000 0.0000
|
||||
H2D_Bandwidth_pinned +256By GB/sec 0.0000 0.0000 0.0000 0.0000 0.0000
|
||||
H2D_Bandwidth_pinned +512By GB/sec 0.0000 0.0000 0.0000 0.0000 0.0000
|
||||
H2D_Bandwidth_pinned 1kB GB/sec 0.0414 0.0411 0.0017 0.0189 0.0434
|
||||
H2D_Bandwidth_pinned 2kB GB/sec 0.0828 0.0824 0.0018 0.0683 0.0862
|
||||
H2D_Bandwidth_pinned 4kB GB/sec 0.1656 0.1652 0.0032 0.1374 0.1724
|
||||
H2D_Bandwidth_pinned 8kB GB/sec 0.3268 0.3251 0.0117 0.1880 0.3425
|
||||
H2D_Bandwidth_pinned 16kB GB/sec 0.6410 0.6365 0.0259 0.3597 0.6757
|
||||
H2D_Bandwidth_pinned 32kB GB/sec 1.2422 1.2432 0.0278 0.9346 1.2987
|
||||
H2D_Bandwidth_pinned 64kB GB/sec 2.3968 2.4161 0.1486 0.7242 2.6042
|
||||
H2D_Bandwidth_pinned 128kB GB/sec 4.6786 4.6339 0.1310 4.1143 4.8162
|
||||
H2D_Bandwidth_pinned 256kB GB/sec 7.8349 7.8369 0.1150 6.9093 8.0270
|
||||
H2D_Bandwidth_pinned 512kB GB/sec 11.9963 11.9828 0.1287 11.2158 12.2201
|
||||
H2D_Bandwidth_pinned 1024kB GB/sec 16.3342 16.3315 0.0956 16.0147 16.5823
|
||||
H2D_Bandwidth_pinned 2048kB GB/sec 19.9790 19.9770 0.0853 19.7681 20.1635
|
||||
H2D_Bandwidth_pinned 4096kB GB/sec 22.2706 22.2642 0.0552 22.0644 22.3847
|
||||
H2D_Bandwidth_pinned 8192kB GB/sec 22.8232 22.7881 0.1669 21.3196 22.8930
|
||||
H2D_Bandwidth_pinned 16384kB GB/sec 24.1521 24.1411 0.0429 24.0165 24.2162
|
||||
H2D_Bandwidth_pinned 32768kB GB/sec 24.8695 24.7086 0.7491 20.6288 24.9035
|
||||
H2D_Bandwidth_pinned 65536kB GB/sec 24.4840 24.0101 2.5769 6.1754 24.5292
|
||||
H2D_Bandwidth_pinned 131072kB GB/sec 25.0487 24.9593 0.2601 24.1286 25.0711
|
||||
H2D_Bandwidth_pinned 262144kB GB/sec 25.3280 25.2351 0.1788 24.8746 25.3498
|
||||
H2D_Bandwidth_pinned 524288kB GB/sec 24.7523 24.6708 0.1586 24.3154 24.7880
|
||||
H2D_Timepinned +064By ms 0.0245 0.0253 0.0240 0.0232 0.7821
|
||||
H2D_Timepinned +256By ms 0.0243 0.0244 0.0013 0.0232 0.0546
|
||||
H2D_Timepinned +512By ms 0.0243 0.0244 0.0014 0.0230 0.0566
|
||||
H2D_Timepinned 1kB ms 0.0242 0.0244 0.0016 0.0230 0.0530
|
||||
H2D_Timepinned 2kB ms 0.0242 0.0243 0.0005 0.0232 0.0293
|
||||
H2D_Timepinned 4kB ms 0.0242 0.0242 0.0005 0.0232 0.0291
|
||||
H2D_Timepinned 8kB ms 0.0245 0.0247 0.0013 0.0234 0.0426
|
||||
H2D_Timepinned 16kB ms 0.0250 0.0252 0.0015 0.0237 0.0445
|
||||
H2D_Timepinned 32kB ms 0.0258 0.0258 0.0006 0.0246 0.0342
|
||||
H2D_Timepinned 64kB ms 0.0271 0.0272 0.0045 0.0250 0.0898
|
||||
H2D_Timepinned 128kB ms 0.0280 0.0283 0.0008 0.0272 0.0318
|
||||
H2D_Timepinned 256kB ms 0.0334 0.0334 0.0005 0.0326 0.0379
|
||||
H2D_Timepinned 512kB ms 0.0437 0.0437 0.0005 0.0429 0.0467
|
||||
H2D_Timepinned 1024kB ms 0.0642 0.0642 0.0004 0.0632 0.0654
|
||||
H2D_Timepinned 2048kB ms 0.1050 0.1050 0.0004 0.1040 0.1061
|
||||
H2D_Timepinned 4096kB ms 0.1883 0.1884 0.0005 0.1874 0.1901
|
||||
H2D_Timepinned 8192kB ms 0.3675 0.3681 0.0028 0.3664 0.3934
|
||||
H2D_Timepinned 16384kB ms 0.6946 0.6950 0.0012 0.6928 0.6986
|
||||
H2D_Timepinned 32768kB ms 1.3492 1.3595 0.0482 1.3474 1.6266
|
||||
H2D_Timepinned 65536kB ms 2.7409 2.9163 1.1368 2.7358 10.8670
|
||||
H2D_Timepinned 131072kB ms 5.3582 5.3780 0.0576 5.3534 5.5626
|
||||
H2D_Timepinned 262144kB ms 10.5983 10.6379 0.0761 10.5892 10.7915
|
||||
H2D_Timepinned 524288kB ms 21.6897 21.7622 0.1411 21.6585 22.0794
|
||||
|
||||
Note: results marked with (*) had missing values such as
|
||||
might occur with a mixture of architectural capabilities.
|
||||
"""
|
||||
raw_output[1] = """
|
||||
Device:Device 738c Mem=32.0GB #CUs=120 Freq=1502Mhz MallocMode=pinned
|
||||
test atts units median mean stddev min max
|
||||
D2H_Bandwidth_pinned +064By GB/sec 0.0000 0.0000 0.0000 0.0000 0.0000
|
||||
D2H_Bandwidth_pinned +256By GB/sec 0.0000 0.0000 0.0000 0.0000 0.0000
|
||||
D2H_Bandwidth_pinned +512By GB/sec 0.0000 0.0000 0.0000 0.0000 0.0000
|
||||
D2H_Bandwidth_pinned 1kB GB/sec 0.0428 0.0426 0.0019 0.0114 0.0446
|
||||
D2H_Bandwidth_pinned 2kB GB/sec 0.0850 0.0844 0.0034 0.0415 0.0893
|
||||
D2H_Bandwidth_pinned 4kB GB/sec 0.1701 0.1687 0.0084 0.0504 0.1773
|
||||
D2H_Bandwidth_pinned 8kB GB/sec 0.3378 0.3348 0.0168 0.1085 0.3546
|
||||
D2H_Bandwidth_pinned 16kB GB/sec 0.6667 0.6606 0.0218 0.5618 0.6897
|
||||
D2H_Bandwidth_pinned 32kB GB/sec 1.3072 1.2954 0.0663 0.5682 1.3605
|
||||
D2H_Bandwidth_pinned 64kB GB/sec 2.5550 2.5339 0.0955 2.1382 2.6904
|
||||
D2H_Bandwidth_pinned 128kB GB/sec 4.8162 4.7807 0.2331 2.0940 4.9621
|
||||
D2H_Bandwidth_pinned 256kB GB/sec 8.2286 8.2192 0.1671 7.2456 8.5286
|
||||
D2H_Bandwidth_pinned 512kB GB/sec 12.7930 12.7062 0.4407 7.1196 13.0478
|
||||
D2H_Bandwidth_pinned 1024kB GB/sec 17.5603 17.4938 0.3921 12.7184 17.7989
|
||||
D2H_Bandwidth_pinned 2048kB GB/sec 21.6275 21.5591 0.2233 20.6073 21.8076
|
||||
D2H_Bandwidth_pinned 4096kB GB/sec 24.2708 24.2556 0.0942 23.5724 24.4292
|
||||
D2H_Bandwidth_pinned 8192kB GB/sec 24.9287 24.9093 0.0733 24.7171 25.0359
|
||||
D2H_Bandwidth_pinned 16384kB GB/sec 26.4588 26.1976 2.4387 1.9387 26.5191
|
||||
D2H_Bandwidth_pinned 32768kB GB/sec 27.2939 27.1202 0.7941 23.2086 27.3277
|
||||
D2H_Bandwidth_pinned 65536kB GB/sec 26.8278 26.7238 0.3894 24.7946 26.9000
|
||||
D2H_Bandwidth_pinned 131072kB GB/sec 27.4751 27.3457 0.3968 25.4168 27.5098
|
||||
D2H_Bandwidth_pinned 262144kB GB/sec 27.8236 27.7173 0.3072 26.7977 27.8525
|
||||
D2H_Bandwidth_pinned 524288kB GB/sec 28.0193 27.9348 0.1912 27.4707 28.0314
|
||||
D2H_Time_pinned +064By ms 0.0229 0.0246 0.0457 0.0216 1.4690
|
||||
D2H_Time_pinned +256By ms 0.0232 0.0234 0.0013 0.0221 0.0378
|
||||
D2H_Time_pinned +512By ms 0.0234 0.0238 0.0063 0.0224 0.2091
|
||||
D2H_Time_pinned 1kB ms 0.0234 0.0236 0.0028 0.0224 0.0875
|
||||
D2H_Time_pinned 2kB ms 0.0235 0.0237 0.0014 0.0224 0.0482
|
||||
D2H_Time_pinned 4kB ms 0.0235 0.0239 0.0031 0.0226 0.0794
|
||||
D2H_Time_pinned 8kB ms 0.0237 0.0240 0.0027 0.0226 0.0738
|
||||
D2H_Time_pinned 16kB ms 0.0240 0.0242 0.0009 0.0232 0.0285
|
||||
D2H_Time_pinned 32kB ms 0.0245 0.0248 0.0021 0.0235 0.0563
|
||||
D2H_Time_pinned 64kB ms 0.0254 0.0257 0.0011 0.0242 0.0304
|
||||
D2H_Time_pinned 128kB ms 0.0272 0.0275 0.0026 0.0264 0.0626
|
||||
D2H_Time_pinned 256kB ms 0.0318 0.0319 0.0007 0.0307 0.0362
|
||||
D2H_Time_pinned 512kB ms 0.0410 0.0413 0.0024 0.0402 0.0736
|
||||
D2H_Time_pinned 1024kB ms 0.0597 0.0599 0.0017 0.0589 0.0824
|
||||
D2H_Time_pinned 2048kB ms 0.0970 0.0973 0.0010 0.0962 0.1018
|
||||
D2H_Time_pinned 4096kB ms 0.1728 0.1729 0.0007 0.1717 0.1779
|
||||
D2H_Time_pinned 8192kB ms 0.3365 0.3367 0.0010 0.3350 0.3394
|
||||
D2H_Time_pinned 16384kB ms 0.6341 0.7147 0.7979 0.6326 8.6538
|
||||
D2H_Time_pinned 32768kB ms 1.2294 1.2385 0.0420 1.2278 1.4458
|
||||
D2H_Time_pinned 65536kB ms 2.5014 2.5117 0.0391 2.4947 2.7066
|
||||
D2H_Time_pinned 131072kB ms 4.8850 4.9092 0.0748 4.8789 5.2806
|
||||
D2H_Time_pinned 262144kB ms 9.6478 9.6860 0.1106 9.6377 10.0171
|
||||
D2H_Time_pinned 524288kB ms 19.1607 19.2196 0.1333 19.1525 19.5434
|
||||
|
||||
Note: results marked with (*) had missing values such as
|
||||
might occur with a mixture of architectural capabilities.
|
||||
"""
|
||||
|
||||
for i, metric in enumerate(['htod_524288kB', 'htod_524288kB']):
|
||||
assert (benchmark._process_raw_result(i, raw_output[i]))
|
||||
assert (metric in benchmark.result)
|
||||
assert (len(benchmark.result[metric]) == 1)
|
||||
assert (isinstance(benchmark.result[metric][0], numbers.Number))
|
||||
|
||||
assert (benchmark.result['htod_524288kB'][0] == 24.6708)
|
||||
assert (benchmark.result['dtoh_524288kB'][0] == 27.9348)
|
Загрузка…
Ссылка в новой задаче