Benchmarks: Add Benchmark - Add gpcnet microbenchmark (#229)

**Description**
Add gpcnet microbenchmark

**Major Revision**
- add 2 microbenchmarks for gpcnet: gpcnet-network-test and gpcnet-network-load-test
- add related tests and an example file
This commit is contained in:
Yuting Jiang 2021-10-22 16:40:01 +08:00 коммит произвёл GitHub
Родитель f841c8f466
Коммит 6003f2c2a2
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
4 изменённых файлов: 449 добавлений и 1 удалений

Просмотреть файл

@ -0,0 +1,23 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""Micro benchmark example for gpcnet performance.
Commands to run:
mpirun --allow-run-as-root -np 2 -H node0:1,node1:1 examples/benchmarks/gpcnet_performance.py
"""
from superbench.benchmarks import BenchmarkRegistry
from superbench.common.utils import logger
if __name__ == '__main__':
    # Select the GPCNet benchmark to run; swap in the commented-out line below
    # to launch the network load test instead of the network test.
    context = BenchmarkRegistry.create_benchmark_context('gpcnet-network-test')
    # context = BenchmarkRegistry.create_benchmark_context('gpcnet-network-load-test')

    benchmark = BenchmarkRegistry.launch_benchmark(context)

    # Only report when launch_benchmark() handed back a truthy benchmark object.
    if benchmark:
        summary = 'benchmark: {}, return code: {}, result: {}'.format(
            benchmark.name, benchmark.return_code, benchmark.result
        )
        logger.info(summary)

Просмотреть файл

@ -20,10 +20,11 @@ from superbench.benchmarks.micro_benchmarks.rocm_memory_bw_performance import Ro
from superbench.benchmarks.micro_benchmarks.rocm_gemm_flops_performance import RocmGemmFlopsBenchmark
from superbench.benchmarks.micro_benchmarks.gpu_sm_copy_bw_performance import GpuSmCopyBwBenchmark
from superbench.benchmarks.micro_benchmarks.tcp_connectivity import TCPConnectivityBenchmark
from superbench.benchmarks.micro_benchmarks.gpcnet_performance import GPCNetBenchmark
# Names exported by `from <package> import *`. Every entry must be followed by a
# comma: adjacent string literals without one are silently concatenated into a
# single (non-existent) name.
__all__ = [
    'MicroBenchmark', 'MicroBenchmarkWithInvoke', 'ShardingMatmul', 'ComputationCommunicationOverlap', 'KernelLaunch',
    'CublasBenchmark', 'CudnnBenchmark', 'GemmFlopsBenchmark', 'CudaGemmFlopsBenchmark', 'MemBwBenchmark',
    'CudaMemBwBenchmark', 'DiskBenchmark', 'IBLoopbackBenchmark', 'CudaNcclBwBenchmark', 'RocmMemBwBenchmark',
    'RocmGemmFlopsBenchmark', 'GpuSmCopyBwBenchmark', 'TCPConnectivityBenchmark', 'GPCNetBenchmark'
]

Просмотреть файл

@ -0,0 +1,106 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""Module of the GPCNet benchmarks."""
import os
from superbench.common.utils import logger
from superbench.benchmarks import BenchmarkRegistry
from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
class GPCNetBenchmark(MicroBenchmarkWithInvoke):
    """The GPCNet performance benchmark class.

    The same class backs both 'gpcnet-network-test' and 'gpcnet-network-load-test';
    the constructor chooses which binary to invoke from the benchmark name.
    """
    def __init__(self, name, parameters=''):
        """Constructor.

        Args:
            name (str): benchmark name.
            parameters (str): benchmark parameters.
        """
        super().__init__(name, parameters)

        # Map the benchmark name to its binary. For any other name, _bin_name is
        # left as whatever the base class set.
        if self._name == 'gpcnet-network-test':
            self._bin_name = 'network_test'
        elif self._name == 'gpcnet-network-load-test':
            self._bin_name = 'network_load_test'

    def add_parser_arguments(self):
        """Add the specified arguments.

        No GPCNet-specific arguments are defined; only the base class arguments are added.
        """
        super().add_parser_arguments()

    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.

        Appends the path of the selected binary (joined onto self._args.bin_dir)
        to self._commands.

        Return:
            True if _preprocess() succeed.
        """
        if not super()._preprocess():
            return False

        command = os.path.join(self._args.bin_dir, self._bin_name)
        self._commands.append(command)

        return True

    def _process_raw_result(self, idx, raw_output):    # noqa: C901
        """Function to process raw results and save the summarized results.

        The raw text is always stored through self._result.add_raw_data(). Only
        lines starting with '|' are parsed:
          - a single-cell row containing 'Tests' sets the current table name;
          - a row containing 'Avg' or 'Name' sets the current column labels;
          - any other row is a value row. Each cell except the one labelled 'Units'
            is saved through self._result.add_result() as
            '<table name>_<row name>_<label>', with spaces removed from the table
            and row names and surrounding 'X' characters stripped from the value
            (e.g. '0.0X').

        Args:
            idx (int): the index corresponding with the raw_output.
            raw_output (str): raw output string of the micro-benchmark.

        Return:
            True if the raw output was parsed, or if it only reports that the binary
            'must be run on at least' a given number of nodes (nothing is recorded
            in that case). False for any other output containing 'ERROR' or when
            parsing raises.
        """
        self._result.add_raw_data('raw_output_' + str(idx), raw_output)

        try:
            if 'ERROR' in raw_output:
                # Too few nodes to run the binary: nothing to record, but not a failure.
                if 'ERROR: this application must be run on at least' in raw_output:
                    return True
                logger.error(
                    'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'.format(
                        self._curr_run_index, self._name, raw_output
                    )
                )
                return False

            labels = None
            test_name = ''
            # raw_output is deliberately not rebound to the list of lines, so the
            # error log in the except clause still shows the original text.
            for line in raw_output.splitlines():
                if not line.startswith('|'):
                    continue
                items = [item.strip() for item in line.split('|')]
                # Get table name
                if len(items) == 3 and 'Tests' in items[1]:
                    test_name = items[1].replace(' ', '')
                # Get the line of the table labels
                elif 'Avg' in line or 'Name' in line:
                    labels = items
                # Get values related to the labels
                else:
                    name_prefix = items[1].replace(' ', '')
                    # items[0] and items[-1] are the empty strings outside the outer '|'.
                    for i in range(2, len(items) - 1):
                        if labels[i] != 'Units':
                            self._result.add_result(
                                test_name + '_' + name_prefix + '_' + labels[i], float(items[i].strip('X'))
                            )
        except Exception as e:
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'.format(
                    self._curr_run_index, self._name, raw_output, str(e)
                )
            )
            return False

        return True
# Register the same class under both benchmark names; GPCNetBenchmark.__init__
# selects the binary ('network_test' or 'network_load_test') from the name.
BenchmarkRegistry.register_benchmark('gpcnet-network-test', GPCNetBenchmark)
BenchmarkRegistry.register_benchmark('gpcnet-network-load-test', GPCNetBenchmark)

Просмотреть файл

@ -0,0 +1,318 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""Tests for GPCNet benchmark."""
import os
import numbers
import unittest
from pathlib import Path
from superbench.benchmarks import BenchmarkRegistry, Platform, BenchmarkType
class GPCNetBenchmarkTest(unittest.TestCase):    # noqa: E501
    """Tests for GPCNetBenchmark benchmark."""
    def setUp(self):
        """Create placeholder GPCNet binaries so that benchmark preprocessing can be exercised."""
        os.environ['SB_MICRO_PATH'] = '/tmp/superbench'
        bin_dir = Path(os.getenv('SB_MICRO_PATH'), 'bin')
        bin_dir.mkdir(parents=True, exist_ok=True)
        self.__binary_files = [bin_dir / bin_name for bin_name in ('network_test', 'network_load_test')]
        for fake_binary in self.__binary_files:
            fake_binary.touch(mode=0o755, exist_ok=True)

    def tearDown(self):
        """Delete the placeholder binaries created by setUp."""
        for fake_binary in self.__binary_files:
            fake_binary.unlink()

    def _prepare_benchmark(self, benchmark_name, expect_command):
        """Look up the benchmark in the registry, preprocess it and check the generated command.

        Args:
            benchmark_name (str): registered benchmark name.
            expect_command (str): expected command, starting from the binary name.

        Return:
            The preprocessed benchmark instance.
        """
        # Check registry.
        (benchmark_class,
         _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CPU)
        assert (benchmark_class)

        # Check preprocess
        benchmark = benchmark_class(benchmark_name)
        assert (benchmark._preprocess())

        # Drop the directory prefix so that only '<binary name><arguments>' is compared.
        bin_name = benchmark._bin_name
        command = bin_name + benchmark._commands[0].split(bin_name)[1]
        assert (command == expect_command)
        return benchmark

    def _assert_metrics(self, benchmark, test_name, row_names, suffixes):
        """Assert that every '<test_name>_<row name>_<suffix>' metric holds exactly one number.

        Args:
            benchmark: benchmark instance whose result is inspected.
            test_name (str): table name with spaces removed.
            row_names (list): row names with spaces removed.
            suffixes (list): column labels expected for each row.
        """
        for row_name in row_names:
            for suffix in suffixes:
                metric = '_'.join((test_name, row_name, suffix))
                assert (metric in benchmark.result)
                assert (len(benchmark.result[metric]) == 1)
                assert (isinstance(benchmark.result[metric][0], numbers.Number))

    def test_gpcnet_network_test(self):
        """Test gpcnet-network-test benchmark."""
        raw_output = """# noqa: E501
Network Tests v1.3
Test with 2 MPI ranks (2 nodes)
Legend
RR = random ring communication pattern
Nat = natural ring communication pattern
Lat = latency
BW = bandwidth
BW+Sync = bandwidth with barrier
+------------------------------------------------------------------------------+
| Isolated Network Tests |
+---------------------------------+--------------+--------------+--------------+
| Name | Avg | 99% | Units |
+---------------------------------+--------------+--------------+--------------+
| RR Two-sided Lat (8 B) | 10000.0 | 10000.0 | usec |
+---------------------------------+--------------+--------------+--------------+
| RR Get Lat (8 B) | 10000.0 | 10000.0 | usec |
+---------------------------------+--------------+--------------+--------------+
| RR Two-sided BW (131072 B) | 10000.0 | 10000.0 | MiB/s/rank |
+---------------------------------+--------------+--------------+--------------+
| RR Put BW (131072 B) | 10000.0 | 10000.0 | MiB/s/rank |
+---------------------------------+--------------+--------------+--------------+
| RR Two-sided BW+Sync (131072 B) | 10000.0 | 10000.0 | MiB/s/rank |
+---------------------------------+--------------+--------------+--------------+
| Nat Two-sided BW (131072 B) | 10000.0 | 10000.0 | MiB/s/rank |
+---------------------------------+--------------+--------------+--------------+
| Multiple Allreduce (8 B) | 10000.0 | 10000.0 | usec |
+---------------------------------+--------------+--------------+--------------+
| Multiple Alltoall (4096 B) | 10000.0 | 10000.0 | MiB/s/rank |
+---------------------------------+--------------+--------------+--------------+
"""
        raw_output_no_execution = """
ERROR: this application must be run on at least 2 nodes
--------------------------------------------------------------------------
Primary job terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
--------------------------------------------------------------------------
--------------------------------------------------------------------------
mpirun detected that one or more processes exited with non-zero status, thus causing
the job to be terminated. The first process to do so was:
Process name: [[63697,1],0]
Exit code: 1
--------------------------------------------------------------------------
"""
        benchmark = self._prepare_benchmark('gpcnet-network-test', 'network_test')

        # Output of a run on too few nodes: accepted, but nothing is recorded.
        assert (benchmark._process_raw_result(0, raw_output_no_execution))
        assert (len(benchmark.result) == 0)

        # Check function process_raw_data.
        # Positive case - valid raw output.
        assert (benchmark._process_raw_result(0, raw_output))
        self._assert_metrics(
            benchmark, 'IsolatedNetworkTests', [
                'RRTwo-sidedLat(8B)', 'RRGetLat(8B)', 'RRTwo-sidedBW(131072B)', 'RRPutBW(131072B)',
                'RRTwo-sidedBW+Sync(131072B)', 'NatTwo-sidedBW(131072B)', 'MultipleAllreduce(8B)',
                'MultipleAlltoall(4096B)'
            ], ['Avg', '99%']
        )

        # Negative case - Add invalid raw output.
        assert (benchmark._process_raw_result(0, 'ERROR') is False)

        # Check basic information.
        assert (benchmark.name == 'gpcnet-network-test')
        assert (benchmark.type == BenchmarkType.MICRO)
        assert (benchmark._bin_name == 'network_test')

    def test_gpcnet_network_load(self):    # noqa: C901
        """Test gpcnet-network-load-test benchmark."""
        raw_output = """# noqa: E501
NetworkLoad Tests v1.3
Test with 10 MPI ranks (10 nodes)
2 nodes running Network Tests
8 nodes running Congestion Tests (min 100 nodes per congestor)
Legend
RR = random ring communication pattern
Lat = latency
BW = bandwidth
BW+Sync = bandwidth with barrier
+------------------------------------------------------------------------------------------------------------------------------------------+
| Isolated Network Tests |
+---------------------------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+
| Name | Min | Max | Avg | Avg(Worst) | 99% | 99.9% | Units |
+---------------------------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+
| RR Two-sided Lat (8 B) | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | usec |
+---------------------------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+
| RR Two-sided BW+Sync (131072 B) | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | MiB/s/rank |
+---------------------------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+
| Multiple Allreduce (8 B) | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | usec |
+---------------------------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+
+------------------------------------------------------------------------------------------------------------------------------------------+
| Isolated Congestion Tests |
+---------------------------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+
| Name | Min | Max | Avg | Avg(Worst) | 99% | 99.9% | Units |
+---------------------------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+
| Alltoall (4096 B) | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | MiB/s/rank |
+---------------------------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+
| Two-sided Incast (4096 B) | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | MiB/s/rank |
+---------------------------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+
| Put Incast (4096 B) | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | MiB/s/rank |
+---------------------------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+
| Get Bcast (4096 B) | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | MiB/s/rank |
+---------------------------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+
+------------------------------------------------------------------------------------------------------------------------------------------+
| Network Tests running with Congestion Tests ( RR Two-sided Lat Network Test) |
+---------------------------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+
| Name | Min | Max | Avg | Avg(Worst) | 99% | 99.9% | Units |
+---------------------------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+
| RR Two-sided Lat (8 B) | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | usec |
+---------------------------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+
| Alltoall (4096 B) | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | MiB/s/rank |
+---------------------------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+
| Two-sided Incast (4096 B) | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | MiB/s/rank |
+---------------------------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+
| Put Incast (4096 B) | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | MiB/s/rank |
+---------------------------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+
| Get Bcast (4096 B) | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | MiB/s/rank |
+---------------------------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+
+------------------------------------------------------------------------------------------------------------------------------------------+
| Network Tests running with Congestion Tests (RR Two-sided BW+Sync Network Test) |
+---------------------------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+
| Name | Min | Max | Avg | Avg(Worst) | 99% | 99.9% | Units |
+---------------------------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+
| RR Two-sided BW+Sync (131072 B) | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | MiB/s/rank |
+---------------------------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+
| Alltoall (4096 B) | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | MiB/s/rank |
+---------------------------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+
| Two-sided Incast (4096 B) | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | MiB/s/rank |
+---------------------------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+
| Put Incast (4096 B) | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | MiB/s/rank |
+---------------------------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+
| Get Bcast (4096 B) | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | MiB/s/rank |
+---------------------------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+
+------------------------------------------------------------------------------------------------------------------------------------------+
| Network Tests running with Congestion Tests ( Multiple Allreduce Network Test) |
+---------------------------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+
| Name | Min | Max | Avg | Avg(Worst) | 99% | 99.9% | Units |
+---------------------------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+
| Multiple Allreduce (8 B) | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | usec |
+---------------------------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+
| Alltoall (4096 B) | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | MiB/s/rank |
+---------------------------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+
| Two-sided Incast (4096 B) | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | MiB/s/rank |
+---------------------------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+
| Put Incast (4096 B) | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | MiB/s/rank |
+---------------------------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+
| Get Bcast (4096 B) | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | 10000.0 | MiB/s/rank |
+---------------------------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+
+------------------------------------------------------------------------------+
| Network Tests running with Congestion Tests - Key Results |
+---------------------------------+--------------------------------------------+
| Name | Congestion Impact Factor |
+---------------------------------+----------------------+---------------------+
| | Avg | 99% |
+---------------------------------+----------------------+---------------------+
| RR Two-sided Lat (8 B) | 0.0X | 0.0X |
+---------------------------------+----------------------+---------------------+
| RR Two-sided BW+Sync (131072 B) | 0.0X | 0.0X |
+---------------------------------+----------------------+---------------------+
| Multiple Allreduce (8 B) | 0.0X | 0.0X |
+---------------------------------+----------------------+---------------------+
"""
        raw_output_no_execution = """
ERROR: this application must be run on at least 10 nodes
--------------------------------------------------------------------------
Primary job terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
--------------------------------------------------------------------------
--------------------------------------------------------------------------
mpirun detected that one or more processes exited with non-zero status, thus causing
the job to be terminated. The first process to do so was:
Process name: [[63697,1],0]
Exit code: 1
--------------------------------------------------------------------------
"""
        benchmark = self._prepare_benchmark('gpcnet-network-load-test', 'network_load_test')

        # Check function process_raw_data.
        # Output of a run on too few nodes: accepted, but nothing is recorded.
        assert (benchmark._process_raw_result(0, raw_output_no_execution))
        assert (len(benchmark.result) == 0)

        # Positive case - valid raw output.
        assert (benchmark._process_raw_result(0, raw_output))

        network_rows = ['RRTwo-sidedLat(8B)', 'RRTwo-sidedBW+Sync(131072B)', 'MultipleAllreduce(8B)']
        congestor_rows = ['GetBcast(4096B)', 'PutIncast(4096B)', 'Two-sidedIncast(4096B)', 'Alltoall(4096B)']
        load_suffixes = ['Max', 'Min', 'Avg', '99.9%']
        # (table name, row names, column labels), checked in the order the tables are printed.
        expectations = [
            ('IsolatedNetworkTests', network_rows, load_suffixes),
            ('IsolatedCongestionTests', congestor_rows, load_suffixes),
            (
                'NetworkTestsrunningwithCongestionTests(RRTwo-sidedLatNetworkTest)',
                congestor_rows + ['RRTwo-sidedLat(8B)'], load_suffixes
            ),
            (
                'NetworkTestsrunningwithCongestionTests(RRTwo-sidedBW+SyncNetworkTest)',
                congestor_rows + ['RRTwo-sidedBW+Sync(131072B)'], load_suffixes
            ),
            (
                'NetworkTestsrunningwithCongestionTests(MultipleAllreduceNetworkTest)',
                congestor_rows + ['MultipleAllreduce(8B)'], load_suffixes
            ),
            ('NetworkTestsrunningwithCongestionTests-KeyResults', network_rows, ['Avg', '99%']),
        ]
        for test_name, row_names, suffixes in expectations:
            self._assert_metrics(benchmark, test_name, row_names, suffixes)

        # Negative case - Add invalid raw output.
        assert (benchmark._process_raw_result(0, 'ERROR') is False)

        # Check basic information.
        assert (benchmark.name == 'gpcnet-network-load-test')
        assert (benchmark.type == BenchmarkType.MICRO)
        assert (benchmark._bin_name == 'network_load_test')