Benchmarks: Add Benchmark - Add mlc benchmark to superbench (#216)

**Description**
Add the Intel MLC memory bandwidth and latency micro-benchmark to SuperBench.

**Major Revision**
- Add mlc benchmark with test and example files
Hossein Pourreza 2021-12-12 21:47:42 -08:00 committed by GitHub
Parent c403b1ca76
Commit b590409e0f
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
12 changed files: 428 additions and 0 deletions

View file

@@ -12,6 +12,8 @@ FROM nvcr.io/nvidia/pytorch:20.12-py3
# - OFED: 5.2-2.2.3.0
# - HPC-X: v2.8.3
# - NCCL RDMA SHARP plugins: 7cccbc1
# Intel:
# - mlc: v3.9a
LABEL maintainer="SuperBench"
@@ -97,6 +99,16 @@ RUN cd /tmp && \
    cd /tmp && \
    rm -rf nccl
# Install Intel MLC
RUN cd /tmp && \
    mkdir -p mlc && \
    cd mlc && \
    wget --user-agent="Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0" https://www.intel.com/content/dam/develop/external/us/en/documents/mlc_v3.9a.tgz && \
    tar xvf mlc_v3.9a.tgz && \
    cp ./Linux/mlc /usr/local/bin/ && \
    cd /tmp && \
    rm -rf mlc
ENV PATH="${PATH}" \
    LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" \
    SB_HOME="/opt/superbench" \

View file

@@ -11,6 +11,8 @@ FROM rocm/pytorch:rocm4.0_ubuntu18.04_py3.6_pytorch_1.7.0
# - RCCL: 2.7.8
# Mellanox:
# - OFED: 5.2-2.2.3.0
# Intel:
# - mlc: v3.9a
LABEL maintainer="SuperBench"
@@ -88,6 +90,16 @@ RUN cd /opt && \
    ln -s hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu18.04-x86_64 hpcx && \
    rm hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu18.04-x86_64.tbz
# Install Intel MLC
RUN cd /tmp && \
    mkdir -p mlc && \
    cd mlc && \
    wget --user-agent="Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0" https://www.intel.com/content/dam/develop/external/us/en/documents/mlc_v3.9a.tgz && \
    tar xvf mlc_v3.9a.tgz && \
    cp ./Linux/mlc /usr/local/bin/ && \
    cd /tmp && \
    rm -rf mlc
ENV PATH="${PATH}" \
    LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" \
    SB_HOME="/opt/superbench" \

View file

@@ -11,6 +11,8 @@ FROM rocm/pytorch:rocm4.2_ubuntu18.04_py3.6_pytorch_1.7.0
# - RCCL: 2.8.4
# Mellanox:
# - OFED: 5.2-2.2.3.0
# Intel:
# - mlc: v3.9a
LABEL maintainer="SuperBench"
@@ -88,6 +90,16 @@ RUN cd /opt && \
    ln -s hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu18.04-x86_64 hpcx && \
    rm hpcx-v2.8.3-gcc-MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu18.04-x86_64.tbz
# Install Intel MLC
RUN cd /tmp && \
    mkdir -p mlc && \
    cd mlc && \
    wget --user-agent="Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0" https://www.intel.com/content/dam/develop/external/us/en/documents/mlc_v3.9a.tgz && \
    tar xvf mlc_v3.9a.tgz && \
    cp ./Linux/mlc /usr/local/bin/ && \
    cd /tmp && \
    rm -rf mlc
ENV PATH="${PATH}" \
    LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" \
    SB_HOME="/opt/superbench" \

View file

@@ -0,0 +1,26 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""Micro benchmark example for CPU memory bandwidth and latency performance.

Commands to run:
  python3 examples/benchmarks/memory_bw_latency_performance.py
"""

from superbench.benchmarks import BenchmarkRegistry, Platform
from superbench.common.utils import logger

if __name__ == '__main__':
    context = BenchmarkRegistry.create_benchmark_context(
        'cpu-memory-bw-latency',
        platform=Platform.CPU,
        parameters='--tests bandwidth_matrix latency_matrix max_bandwidth'
    )

    benchmark = BenchmarkRegistry.launch_benchmark(context)
    if benchmark:
        logger.info(
            'benchmark: {}, return code: {}, result: {}'.format(
                benchmark.name, benchmark.return_code, benchmark.result
            )
        )
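For reference, a hedged sketch of inspecting the parsed metrics after the launch above; each value in `benchmark.result` is a list with one entry per run, and the metric names (e.g. `Mem_bandwidth_matrix_numa_0_0_BW`) follow the scheme used by the benchmark's result parser and its unit tests in this PR:

```python
# Hedged sketch: dump every parsed metric after launch_benchmark().
# Assumes `benchmark` is the object returned above.
if benchmark:
    for metric, values in benchmark.result.items():
        logger.info('metric: {}, values: {}'.format(metric, values))
```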

View file

@@ -14,6 +14,7 @@ from superbench.benchmarks.micro_benchmarks.cuda_memory_bw_performance import Cu
from superbench.benchmarks.micro_benchmarks.cuda_nccl_bw_performance import CudaNcclBwBenchmark
from superbench.benchmarks.micro_benchmarks.cudnn_function import CudnnBenchmark
from superbench.benchmarks.micro_benchmarks.disk_performance import DiskBenchmark
from superbench.benchmarks.micro_benchmarks.cpu_memory_bw_latency_performance import CpuMemBwLatencyBenchmark
from superbench.benchmarks.micro_benchmarks.gpcnet_performance import GPCNetBenchmark
from superbench.benchmarks.micro_benchmarks.gpu_copy_bw_performance import GpuCopyBwBenchmark
from superbench.benchmarks.micro_benchmarks.ib_loopback_performance import IBLoopbackBenchmark
@@ -34,6 +35,7 @@ __all__ = [
    'CudaNcclBwBenchmark',
    'CudnnBenchmark',
    'DiskBenchmark',
    'CpuMemBwLatencyBenchmark',
    'GPCNetBenchmark',
    'GemmFlopsBenchmark',
    'GpuCopyBwBenchmark',

View file

@@ -0,0 +1,150 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""Module for running the Intel MLC tool to measure memory bandwidth and latency."""

import os

from superbench.common.utils import logger
from superbench.benchmarks import BenchmarkRegistry, ReturnCode
from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke


class CpuMemBwLatencyBenchmark(MicroBenchmarkWithInvoke):
    """The Memory bandwidth and latency benchmark class."""
    def __init__(self, name, parameters=''):
        """Constructor.

        Args:
            name (str): benchmark name.
            parameters (str): benchmark parameters.
        """
        super().__init__(name, parameters)

        self._bin_name = 'mlc'
        self.__support_mlc_commands = ['bandwidth_matrix', 'latency_matrix', 'max_bandwidth']

    def add_parser_arguments(self):
        """Add the specified arguments."""
        super().add_parser_arguments()

        self._parser.add_argument(
            '--tests',
            type=str,
            nargs='+',
            default=['bandwidth_matrix'],
            required=False,
            help='The modes to run mlc with. Possible values are {}.'.format(' '.join(self.__support_mlc_commands))
        )

    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.

        Return:
            True if _preprocess() succeed.
        """
        if not super()._preprocess():
            return False

        mlc_path = os.path.join(self._args.bin_dir, self._bin_name)
        ret_val = os.access(mlc_path, os.X_OK | os.F_OK)
        if not ret_val:
            logger.error(
                'Executable {} not found in {} or it is not executable'.format(self._bin_name, self._args.bin_dir)
            )
            return False

        # the mlc command requires hugepages to be enabled
        mlc_wrapper = ' '.join(
            [
                'nr_hugepages=`cat /proc/sys/vm/nr_hugepages`;', 'echo 4000 > /proc/sys/vm/nr_hugepages;', '%s;',
                'err=$?;', 'echo ${nr_hugepages} > /proc/sys/vm/nr_hugepages;', '(exit $err)'
            ]
        )
        for test in self._args.tests:
            command = mlc_path + ' --%s' % test
            self._commands.append(mlc_wrapper % command)
        return True
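To make the hugepages wrapper above concrete, here is a minimal standalone sketch (not part of the PR) of how the `%s` placeholder in `mlc_wrapper` is filled in for each selected test; `mlc_path` below is an assumed install location, matching the dockerfiles in this PR:

```python
# Standalone sketch of the command construction in _preprocess() above.
mlc_path = '/usr/local/bin/mlc'  # assumed install location, per the dockerfiles
mlc_wrapper = ' '.join(
    [
        'nr_hugepages=`cat /proc/sys/vm/nr_hugepages`;', 'echo 4000 > /proc/sys/vm/nr_hugepages;', '%s;',
        'err=$?;', 'echo ${nr_hugepages} > /proc/sys/vm/nr_hugepages;', '(exit $err)'
    ]
)
command = mlc_path + ' --%s' % 'bandwidth_matrix'
print(mlc_wrapper % command)
# The wrapper saves the current nr_hugepages value, raises it to 4000 for the
# run, restores the saved value afterwards, and propagates mlc's exit code
# via the trailing (exit $err).
```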
    def _process_raw_result(self, cmd_idx, raw_output):
        """Function to parse raw results and save the summarized results.

        self._result.add_raw_data() and self._result.add_result() need to be called to save the results.

        Args:
            cmd_idx (int): the index of command corresponding with the raw_output.
            raw_output (str): raw output string of the micro-benchmark.

        Return:
            True if the raw output string is valid and result can be extracted.
        """
        self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output)

        # Parse the command to determine which mlc test this output belongs to.
        # The command has the form '...; mlc --option; ...', so the option needs to be extracted.
        if '--' in self._commands[cmd_idx]:
            mlc_test = self._commands[cmd_idx].split('--')[1]
        else:
            logger.error('The command {} is not well formed and is missing --'.format(self._commands[cmd_idx]))
            return False
        mlc_test = mlc_test.split(';')[0]
        if 'max_bandwidth' in mlc_test:
            measure = 'BW'
            out_table = self._parse_max_bw(raw_output)
        elif 'bandwidth_matrix' in mlc_test:
            measure = 'BW'
            out_table = self._parse_bw_latency(raw_output)
        elif 'latency_matrix' in mlc_test:
            measure = 'Latency'
            out_table = self._parse_bw_latency(raw_output)
        else:
            logger.error('Invalid option {} to run the {} command'.format(mlc_test, self._commands[cmd_idx]))
            return False
        if len(out_table) == 0:
            self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'.format(
                    self._curr_run_index, self._name, raw_output
                )
            )
            return False
        for key in out_table.keys():
            for index in range(len(out_table[key])):
                if 'max_bandwidth' in mlc_test:
                    metric = 'Mem_{}_{}_{}'.format(mlc_test, key, measure)
                else:
                    metric = 'Mem_{}_{}_{}_{}'.format(mlc_test, key, str(index), measure)
                self._result.add_result(metric, float(out_table[key][index]))
        return True
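A quick worked example of the metric-naming loop above, using values that appear in this PR's unit tests:

```python
# Worked example of the metric names emitted above (values from the tests).
# A bandwidth_matrix run yields a NUMA matrix:
out_table = {'numa_0': ['82542.2', '76679.9'], 'numa_1': ['76536.0', '82986.5']}
# -> Mem_bandwidth_matrix_numa_0_0_BW = 82542.2
#    Mem_bandwidth_matrix_numa_0_1_BW = 76679.9, and so on per column.
# A max_bandwidth run drops the column index, e.g.:
#    Mem_max_bandwidth_ALL_Reads_BW = 165400.60
```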
    def _parse_bw_latency(self, raw_output):
        """Parse the NUMA bandwidth/latency matrix from mlc's raw output."""
        out_table = dict()
        for line in raw_output.splitlines():
            if line.strip() == '':
                continue
            # only lines starting with a digit are of interest
            if line.lstrip()[0].isdigit():
                vals = line.split()
                if len(vals) < 2:
                    continue
                numa_index = 'numa_%s' % vals[0]
                out_table[numa_index] = vals[1:]
        return out_table
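A minimal sketch of the row-parsing logic above, fed the bandwidth-matrix lines used in the unit tests (header lines are skipped because they do not start with a digit):

```python
# Standalone sketch of the parsing loop in _parse_bw_latency() above.
sample_lines = [
    'Numa node',
    'Numa node 0 1',
    '0 82542.2 76679.9',
    '1 76536.0 82986.5',
]
out_table = {}
for line in sample_lines:
    if line.strip() and line.lstrip()[0].isdigit():
        vals = line.split()
        if len(vals) >= 2:
            out_table['numa_%s' % vals[0]] = vals[1:]
print(out_table)
# -> {'numa_0': ['82542.2', '76679.9'], 'numa_1': ['76536.0', '82986.5']}
```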
    def _parse_max_bw(self, raw_output):
        """Parse the per-ratio maximum bandwidth table from mlc's raw output."""
        out_table = dict()
        # the very last line is empty and only the last 5 lines of the output are of interest
        for line in raw_output.splitlines()[-6:]:
            if line.strip() == '':
                continue
            vals = line.split()
            if len(vals) < 2:
                continue
            key = '_'.join(vals[0:2]).rstrip(':').replace(':', '_')
            # making a list to be consistent with the _parse_bw_latency output
            out_table[key] = [vals[-1]]
        return out_table
BenchmarkRegistry.register_benchmark('cpu-memory-bw-latency', CpuMemBwLatencyBenchmark)
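Similarly, a standalone sketch of the key construction in `_parse_max_bw`, fed the last five lines of the max-bandwidth sample from this PR's unit tests:

```python
# Standalone sketch of the key construction in _parse_max_bw() above.
sample_lines = [
    'ALL Reads : 165400.60',
    '3:1 Reads-Writes : 154975.19',
    '2:1 Reads-Writes : 158433.32',
    '1:1 Reads-Writes : 157352.05',
    'Stream-triad like: 157878.32',
]
for line in sample_lines:
    vals = line.split()
    key = '_'.join(vals[0:2]).rstrip(':').replace(':', '_')
    print(key, [vals[-1]])
# -> ALL_Reads ['165400.60'], 3_1_Reads-Writes ['154975.19'], ...,
#    Stream-triad_like ['157878.32']
```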

View file

@@ -47,6 +47,17 @@ superbench:
        maxbytes: 8G
        ngpus: 8
        operation: allreduce
    cpu-memory-bw-latency:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: no
      parameters:
        tests:
          - bandwidth_matrix
          - latency_matrix
          - max_bandwidth
    mem-bw:
      enable: true
      modes:
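For context, the `tests:` list in the stanza above maps one-to-one onto the benchmark's `--tests` argument; a minimal sketch of the equivalent programmatic launch, mirroring the example file earlier in this PR:

```python
# Minimal sketch: the YAML stanza above is equivalent to this context,
# as in the memory_bw_latency_performance.py example in this PR.
from superbench.benchmarks import BenchmarkRegistry, Platform

context = BenchmarkRegistry.create_benchmark_context(
    'cpu-memory-bw-latency',
    platform=Platform.CPU,
    parameters='--tests bandwidth_matrix latency_matrix max_bandwidth'
)
```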

View file

@@ -48,6 +48,17 @@ superbench:
        maxbytes: 8G
        ngpus: 8
        operation: allreduce
    cpu-memory-bw-latency:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: no
      parameters:
        tests:
          - bandwidth_matrix
          - latency_matrix
          - max_bandwidth
    mem-bw:
      enable: true
      modes:

View file

@@ -58,6 +58,17 @@ superbench:
          proc_num: 4
          prefix: PROC_RANK={proc_rank} IB_DEVICES=1,3,5,7 NUMA_NODES=1,0,3,2
          parallel: yes
    cpu-memory-bw-latency:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: no
      parameters:
        tests:
          - bandwidth_matrix
          - latency_matrix
          - max_bandwidth
    mem-bw:
      enable: true
      modes:

View file

@@ -58,6 +58,17 @@ superbench:
          proc_num: 4
          prefix: PROC_RANK={proc_rank} IB_DEVICES=1,3,5,7 NUMA_NODES=1,0,3,2
          parallel: yes
    cpu-memory-bw-latency:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: no
      parameters:
        tests:
          - bandwidth_matrix
          - latency_matrix
          - max_bandwidth
    mem-bw:
      enable: true
      modes:

View file

@@ -60,6 +60,17 @@ superbench:
      parameters:
        block_devices:
          - /dev/nvme0n1
    cpu-memory-bw-latency:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: no
      parameters:
        tests:
          - bandwidth_matrix
          - latency_matrix
          - max_bandwidth
    mem-bw:
      enable: true
      modes:

View file

@@ -0,0 +1,159 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Tests for cpu-memory-bw-latency benchmark."""

from pathlib import Path
import os
import unittest

from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform


class CpuMemBwLatencyBenchmarkTest(unittest.TestCase):
    """Test class for cpu-memory-bw-latency benchmark."""
    def setUp(self):
        """Method called to prepare the test fixture."""
        # Create fake binary file just for testing.
        self.__curr_micro_path = os.environ.get('SB_MICRO_PATH', '')
        os.environ['SB_MICRO_PATH'] = '/tmp/superbench/'
        binary_path = Path(os.getenv('SB_MICRO_PATH'), 'bin')
        binary_path.mkdir(parents=True, exist_ok=True)
        self.__binary_file = binary_path / 'mlc'
        self.__binary_file.touch(mode=0o755, exist_ok=True)

    def tearDown(self):
        """Method called after the test method has been called and the result recorded."""
        self.__binary_file.unlink()
        os.environ['SB_MICRO_PATH'] = self.__curr_micro_path

    def test_cpu_mem_bw_latency_benchmark_empty_param(self):
        """Test cpu-memory-bw-latency benchmark command generation with empty parameter."""
        benchmark_name = 'cpu-memory-bw-latency'
        (benchmark_class,
         predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CPU)
        assert (benchmark_class)

        default_mlc_test = 'bandwidth_matrix'
        benchmark = benchmark_class(benchmark_name, parameters='')

        # Check basic information
        assert (benchmark)
        ret = benchmark._preprocess()
        assert (ret is True)
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        assert (benchmark.name == 'cpu-memory-bw-latency')
        assert (benchmark.type == BenchmarkType.MICRO)

        # Check commands
        assert (1 == len(benchmark._commands))
        assert ('mlc --%s;' % default_mlc_test in benchmark._commands[0])
    def test_cpu_mem_bw_latency_benchmark_result_parsing(self):
        """Test cpu-memory-bw-latency benchmark result parsing."""
        benchmark_name = 'cpu-memory-bw-latency'
        (benchmark_class,
         predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CPU)
        assert (benchmark_class)
        all_mlc_tests = ['bandwidth_matrix', 'latency_matrix', 'max_bandwidth']
        param_str = '--tests %s' % ' '.join(all_mlc_tests)
        benchmark = benchmark_class(benchmark_name, parameters=param_str)

        # Check basic information
        assert (benchmark)
        ret = benchmark._preprocess()
        assert (ret is True)
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        assert (benchmark.name == 'cpu-memory-bw-latency')
        assert (benchmark.type == BenchmarkType.MICRO)

        # Check commands
        assert (len(all_mlc_tests) == len(benchmark._commands))
        for mlc_test, command in zip(all_mlc_tests, benchmark._commands):
            assert ('mlc --%s;' % mlc_test in command)

        # Positive case - valid bandwidth matrix output.
        test_raw_output = """
Intel(R) Memory Latency Checker - v3.9a
Command line parameters: --bandwidth_matrix
Using buffer size of 100.000MiB/thread for reads and an additional 100.000MiB/thread for writes
*** Unable to modify prefetchers (try executing 'modprobe msr')
*** So, enabling random access for latency measurements
Measuring Memory Bandwidths between nodes within system
Bandwidths are in MB/sec (1 MB/sec = 1,000,000 Bytes/sec)
Using all the threads from each core if Hyper-threading is enabled
Using Read-only traffic type
Numa node
Numa node 0 1
0 82542.2 76679.9
1 76536.0 82986.5
"""
        assert (benchmark._process_raw_result(0, test_raw_output))
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        assert ('raw_output_0' in benchmark.raw_data)
        assert ([test_raw_output] == benchmark.raw_data['raw_output_0'])
        assert ([82542.2] == benchmark.result['Mem_bandwidth_matrix_numa_0_0_BW'])
        assert ([76679.9] == benchmark.result['Mem_bandwidth_matrix_numa_0_1_BW'])
        assert ([76536.0] == benchmark.result['Mem_bandwidth_matrix_numa_1_0_BW'])
        assert ([82986.5] == benchmark.result['Mem_bandwidth_matrix_numa_1_1_BW'])

        # Positive case - valid latency matrix output.
        test_raw_output = """
Intel(R) Memory Latency Checker - v3.9a
Command line parameters: --latency_matrix
Using buffer size of 600.000MiB
*** Unable to modify prefetchers (try executing 'modprobe msr')
*** So, enabling random access for latency measurements
Measuring idle latencies (in ns)...
Numa node
Numa node 0 1
0 87.0 101.0
1 101.9 86.9
"""
        assert (benchmark._process_raw_result(1, test_raw_output))
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        assert ('raw_output_1' in benchmark.raw_data)
        assert ([test_raw_output] == benchmark.raw_data['raw_output_1'])
        assert ([87.0] == benchmark.result['Mem_latency_matrix_numa_0_0_Latency'])
        assert ([101.0] == benchmark.result['Mem_latency_matrix_numa_0_1_Latency'])
        assert ([101.9] == benchmark.result['Mem_latency_matrix_numa_1_0_Latency'])
        assert ([86.9] == benchmark.result['Mem_latency_matrix_numa_1_1_Latency'])

        # Positive case - valid max bandwidth output.
        test_raw_output = """
Intel(R) Memory Latency Checker - v3.9a
Command line parameters: --max_bandwidth
Using buffer size of 100.000MiB/thread for reads and an additional 100.000MiB/thread for writes
*** Unable to modify prefetchers (try executing 'modprobe msr')
*** So, enabling random access for latency measurements
Measuring Maximum Memory Bandwidths for the system
Will take several minutes to complete as multiple injection rates will be tried to get the best bandwidth
Bandwidths are in MB/sec (1 MB/sec = 1,000,000 Bytes/sec)
Using all the threads from each core if Hyper-threading is enabled
Using traffic with the following read-write ratios
ALL Reads : 165400.60
3:1 Reads-Writes : 154975.19
2:1 Reads-Writes : 158433.32
1:1 Reads-Writes : 157352.05
Stream-triad like: 157878.32
"""
        assert (benchmark._process_raw_result(2, test_raw_output))
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        assert ('raw_output_2' in benchmark.raw_data)
        assert ([test_raw_output] == benchmark.raw_data['raw_output_2'])
        assert ([165400.60] == benchmark.result['Mem_max_bandwidth_ALL_Reads_BW'])
        assert ([154975.19] == benchmark.result['Mem_max_bandwidth_3_1_Reads-Writes_BW'])
        assert ([158433.32] == benchmark.result['Mem_max_bandwidth_2_1_Reads-Writes_BW'])
        assert ([157352.05] == benchmark.result['Mem_max_bandwidth_1_1_Reads-Writes_BW'])
        assert ([157878.32] == benchmark.result['Mem_max_bandwidth_Stream-triad_like_BW'])

        # Negative case - invalid raw output.
        assert (benchmark._process_raw_result(0, 'Invalid raw output') is False)
        assert (benchmark.return_code == ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
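Test modules in this style are normally picked up by pytest/unittest discovery; a common tail, if the file is also meant to be run directly (hedged: the upstream file may omit this), would be:

```python
# Hypothetical addition: allow running this test module directly.
if __name__ == '__main__':
    unittest.main()
```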