Benchmarks: micro benchmarks - add python code for DirectXGPUCoreFlops (#542)
**Description**
Add Python code for the DirectX GPU core FLOPS benchmark and initialize the DirectX test pipeline.

**Major Revision**
- Add Python code for the DirectX GPU core FLOPS benchmark (`DirectXGPUCoreFlops`).
- Initialize the DirectX test pipeline.

**Minor Revision**
- Add a unit test for the DirectX GPU core FLOPS benchmark.
Parent: 3704a432b9
Commit: f1d608aef7
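As a quick orientation before the per-file diffs, this is roughly how the new benchmark is meant to be driven through the SuperBench registry (mirroring the unit test added at the end of this change). The registered name `directx-gpu-core-flops` and `Platform.DIRECTX` come from the new module; the parameter string and environment assumptions here are illustrative only.

```python
# Minimal sketch, assuming a Windows host with a DirectX-capable GPU and the
# DirectXGPUCoreFlops.exe binary already built and on the benchmark bin path.
from superbench.benchmarks import BenchmarkRegistry, Platform

context = BenchmarkRegistry.create_benchmark_context(
    'directx-gpu-core-flops',              # name registered by the new module
    platform=Platform.DIRECTX,
    parameters='--num_loops 10 --precision fp32'   # illustrative values
)
if BenchmarkRegistry.is_benchmark_context_valid(context):
    benchmark = BenchmarkRegistry.launch_benchmark(context)
    print(benchmark.result)                # expected to contain an 'fp32_flops' entry
```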
Code coverage configuration (codecov) — add the new CI flag:

@@ -17,6 +17,7 @@ coverage:
   - cpu-python3.6-unit-test
   - cpu-python3.7-unit-test
   - cuda-unit-test
+  - directx-unit-test
 patch:
   default:
     target: 80%

@@ -25,3 +26,4 @@ coverage:
   - cpu-python3.6-unit-test
   - cpu-python3.7-unit-test
   - cuda-unit-test
+  - directx-unit-test
GitHub Actions workflow for the Windows build — run the DirectX unit test in the same job:

@@ -1,4 +1,4 @@
-name: Build on Windows
+name: Build on Windows and run directx unit test

 on:
   push:

@@ -19,6 +19,10 @@ jobs:
   uses: actions/checkout@v2
   with:
     submodules: true
+- name: Cleanup docker data
+  run: |
+    docker system prune -a -f
+    docker volume prune -a -f
 - name: Build Docker image
   working-directory: .
   shell: pwsh

@@ -44,11 +48,6 @@ jobs:
 TAG: superbench/main:win2004
 USER: ${{ secrets.DOCKERHUB_USERNAME }}
 PASS: ${{ secrets.DOCKERHUB_TOKEN }}
-directx-unit-test:
-  name: DirectX unit test
-  needs: docker
-  runs-on: [self-hosted, windows, x64, win2004]
-  steps:
 - name: Add bash to PATH
   shell: pwsh
   run: |
Micro-benchmark package `__init__` — export the new class:

@@ -31,6 +31,7 @@ from superbench.benchmarks.micro_benchmarks.rocm_memory_bw_performance import Ro
 from superbench.benchmarks.micro_benchmarks.sharding_matmul import ShardingMatmul
 from superbench.benchmarks.micro_benchmarks.tcp_connectivity import TCPConnectivityBenchmark
 from superbench.benchmarks.micro_benchmarks.tensorrt_inference_performance import TensorRTInferenceBenchmark
+from superbench.benchmarks.micro_benchmarks.directx_gemm_flops_performance import DirectXGPUCoreFlops

 __all__ = [
     'ComputationCommunicationOverlap',

@@ -61,4 +62,5 @@ __all__ = [
     'ShardingMatmul',
     'TCPConnectivityBenchmark',
     'TensorRTInferenceBenchmark',
+    'DirectXGPUCoreFlops',
 ]
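Re-exporting the class in `__all__` makes it importable from the package namespace, and importing the package also executes the module-level `BenchmarkRegistry.register_benchmark(...)` call at the bottom of the new file, which is what makes the benchmark discoverable. A small sketch of that effect:

```python
# Sketch: after this change the class can be imported directly from the
# micro_benchmarks package; importing the package also triggers the
# module-level registration of 'directx-gpu-core-flops' for DIRECTX.
from superbench.benchmarks.micro_benchmarks import DirectXGPUCoreFlops

print(DirectXGPUCoreFlops.__name__)   # 'DirectXGPUCoreFlops'
```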
New module `superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance.py` (new file, 145 lines added):

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""Module of the DirectXGPUCoreFlops performance benchmarks."""

import os

from superbench.common.utils import logger
from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode
from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke


class DirectXGPUCoreFlops(MicroBenchmarkWithInvoke):
    """The DirectXGPUCoreFlops benchmark class."""
    def __init__(self, name, parameters=''):
        """Constructor.

        Args:
            name (str): benchmark name.
            parameters (str): benchmark parameters.
        """
        super().__init__(name, parameters)
        self._bin_name = 'DirectXGPUCoreFlops.exe'
        self._support_precisions = ['fp16', 'fp32']
        self._precision_need_to_run = list()

    def add_parser_arguments(self):
        """Add the specified arguments."""
        super().add_parser_arguments()
        self._parser.add_argument(
            '--num_loops',
            type=int,
            default=10,
            required=False,
            help='The number of benchmark runs.',
        )
        self._parser.add_argument(
            '--num_warm_up',
            type=int,
            default=2,
            required=False,
            help='The number of warm up runs.',
        )
        self._parser.add_argument(
            '--n',
            type=int,
            default=16 * 256,
            required=False,
            help='The N dim of matmul (N, K) * (K, M).',
        )
        self._parser.add_argument(
            '--k',
            type=int,
            default=16 * 256,
            required=False,
            help='The K dim of matmul (N, K) * (K, M).',
        )
        self._parser.add_argument(
            '--m',
            type=int,
            default=16 * 256,
            required=False,
            help='The M dim of matmul (N, K) * (K, M).',
        )
        self._parser.add_argument(
            '--precision',
            type=str,
            nargs='+',
            default=list(),
            help='Precision for benchmarking. E.g. {}.'.format(' '.join(self._support_precisions)),
        )

    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.

        Return:
            True if _preprocess() succeed.
        """
        if not super()._preprocess():
            return False

        if len(self._args.precision) == 0:
            self._precision_need_to_run = self._support_precisions
        else:
            self._args.precision = [p.lower() for p in self._args.precision]
            for p in self._args.precision:
                if p not in self._support_precisions:
                    logger.warning(
                        'Unsupported precision - benchmark: {}, precision: {}, expected: {}.'.format(
                            self._name, p, self._support_precisions
                        )
                    )
                else:
                    self._precision_need_to_run.append(p)

        if len(self._precision_need_to_run) == 0:
            self._result.set_return_code(ReturnCode.NO_SUPPORTED_PRECISION)
            return False

        for p in self._precision_need_to_run:
            command = os.path.join(self._args.bin_dir, self._bin_name)
            command += (' --num_loops ' + str(self._args.num_loops))
            command += (' --num_warm_up ' + str(self._args.num_warm_up))
            command += (' --n ' + str(self._args.n))
            command += (' --k ' + str(self._args.k))
            command += (' --m ' + str(self._args.m))
            command += (' --' + p)
            self._commands.append(command)
        return True

    def _process_raw_result(self, cmd_idx, raw_output):
        """Function to process raw results and save the summarized results.

        self._result.add_raw_data() and self._result.add_result() need to be called to save the results.

        Args:
            cmd_idx (int): the index of command corresponding with the raw_output.
            raw_output (str): raw output string of the micro-benchmark.

        Return:
            True if the raw output string is valid and result can be extracted.
        """
        precision = self._precision_need_to_run[cmd_idx]
        self._result.add_raw_data('raw_output_' + precision, raw_output, self._args.log_raw_data)

        valid = True
        flops = list()
        content = raw_output.splitlines()
        try:
            for line in content:
                if 'TFLOPs' in line:
                    flops.append(float(line.split()[0]))
        except BaseException:
            valid = False
        finally:
            if valid is False or len(flops) == 0:
                logger.error(
                    'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'.format(
                        self._curr_run_index, self._name, raw_output
                    )
                )
                return False
        self._result.add_result(precision + '_flops', max(flops))
        return True


BenchmarkRegistry.register_benchmark('directx-gpu-core-flops', DirectXGPUCoreFlops, platform=Platform.DIRECTX)
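On the parsing side, `_process_raw_result()` keeps every stdout line containing `TFLOPs`, converts the first whitespace-separated token to a float, and reports the maximum as `<precision>_flops`. A self-contained sketch of that logic against a made-up output; the exact stdout format of the binary is not shown in this diff, so the sample value lines are an assumption, apart from the `GPUCoreFLOPs` banner printed by the C++ code further down.

```python
# Sketch of the parsing in _process_raw_result(), run against a hypothetical
# raw output; only the "<value> TFLOPs" line shape is assumed here.
raw_output = """GPUCoreFLOPs
12.91 TFLOPs
13.02 TFLOPs
12.87 TFLOPs
"""

flops = []
for line in raw_output.splitlines():
    if 'TFLOPs' in line:
        flops.append(float(line.split()[0]))

print(max(flops))   # 13.02 -> stored as the '<precision>_flops' result
```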
C++ benchmark option parsing (`BenchmarkOptions`) — rename the precision flags to match the Python wrapper:

@@ -42,10 +42,10 @@ class BenchmarkOptions : public Options {
     m = get_cmd_line_argument_int("--m", 16 * 256);
     n = get_cmd_line_argument_int("--n", 16 * 256);
     k = get_cmd_line_argument_int("--k", 16 * 256);
-    if (get_cmd_line_argument_bool("--f16")) {
+    if (get_cmd_line_argument_bool("--fp16")) {
         mode_precision = Option::F16;
     }
-    if (get_cmd_line_argument_bool("--f32")) {
+    if (get_cmd_line_argument_bool("--fp32")) {
         mode_precision = Option::F32;
     }
 }
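The flag rename from `--f16`/`--f32` to `--fp16`/`--fp32` lines up with the command line that the Python wrapper's `_preprocess()` composes. A sketch of the resulting invocation for `fp16` with the default arguments (16 * 256 = 4096 for each matrix dimension); the binary directory below is a placeholder, since `bin_dir` is resolved by the framework at run time.

```python
# Hypothetical command string as assembled by DirectXGPUCoreFlops._preprocess()
# with default arguments; 'bin_dir' is a placeholder path, not a real install.
bin_dir = r'C:\superbench\bin'
command = (
    bin_dir + r'\DirectXGPUCoreFlops.exe'
    ' --num_loops 10 --num_warm_up 2'
    ' --n 4096 --k 4096 --m 4096'
    ' --fp16'
)
print(command)
```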
C++ benchmark driver (`GPUCore::Run()`) — initialize the GPU timer once up front instead of inside every timing loop iteration:

@@ -25,7 +25,7 @@ void GPUCore::Run() {

     int loops = opts->num_loops;
     std::cout << "GPUCoreFLOPs" << std::endl;
+    gpuTimer.init(m_device.Get(), m_commandQueue.Get(), 1, D3D12::QueueType::compute);
     switch (opts->mode_precision) {
     case Option::F32: {
         // Prepare input and output data and buffers.

@@ -37,7 +37,6 @@ void GPUCore::Run() {
         ExecuteComputeOp();
     }
     for (int i = 0; i < loops; ++i) {
-        gpuTimer.init(m_device.Get(), m_commandQueue.Get(), 1, D3D12::QueueType::compute);
         // Do FLOPs job.
         double timeInMs = ExecuteComputeOp();
         auto flops = (int64_t(m) * n * k + m * n) * 2 * 1e-9 / timeInMs;

@@ -55,7 +54,6 @@ void GPUCore::Run() {
         ExecuteComputeOp();
     }
     for (int i = 0; i < loops; ++i) {
-        gpuTimer.init(m_device.Get(), m_commandQueue.Get(), 1, D3D12::QueueType::compute);
         // Do FLOPs job.
         double timeInMs = ExecuteComputeOp();
         auto flops = (int64_t(m) * n * k + m * n) * 2 * 1e-9 / timeInMs;
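For reference, the throughput expression inside the timing loop counts 2 floating-point operations per multiply-add of the GEMM plus the m*n accumulation term, and scales by the kernel time in milliseconds so the result comes out in TFLOPS. A quick sanity check in Python with the default 4096 sizes and a hypothetical 10 ms kernel time:

```python
# Reproduces the C++ expression: (m * n * k + m * n) * 2 * 1e-9 / timeInMs.
m = n = k = 4096        # default sizes (16 * 256)
time_in_ms = 10.0       # hypothetical kernel time, for illustration only

ops = (m * n * k + m * n) * 2       # multiplies and adds counted separately
tflops = ops * 1e-9 / time_in_ms    # ops / (ms * 1e9)  ==  (ops per second) / 1e12
print(round(tflops, 2))             # ~13.75 TFLOPS for this made-up timing
```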
Visual Studio project file for the DirectX benchmark — set the output target name so the built binary matches `DirectXGPUCoreFlops.exe`:

@@ -20,12 +20,14 @@
   </PropertyGroup>
   <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <TargetName>DirectXGPUCoreFlops</TargetName>
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>true</UseDebugLibraries>
     <PlatformToolset>v143</PlatformToolset>
     <CharacterSet>Unicode</CharacterSet>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <TargetName>DirectXGPUCoreFlops</TargetName>
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
     <PlatformToolset>v143</PlatformToolset>
New unit test for the DirectX GPU core FLOPS benchmark (new file, 47 lines added):

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Tests for DirectXGPUCoreFlops benchmark."""

import numbers

from tests.helper import decorator
from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform


@decorator.directx_test
def test_directx_gpucoreflops():
    """Test DirectXGPUCoreFlops benchmark."""
    # Test for default configuration
    context = BenchmarkRegistry.create_benchmark_context(
        'directx-gpu-core-flops',
        platform=Platform.DIRECTX,
        parameters=r'--num_loops 10 --n 16384 --k 16384 --m 16384 --precision fp32'
    )

    assert (BenchmarkRegistry.is_benchmark_context_valid(context))

    benchmark = BenchmarkRegistry.launch_benchmark(context)

    # Check basic information.
    assert (benchmark)
    assert (benchmark.name == 'directx-gpu-core-flops')
    assert (benchmark.type == BenchmarkType.MICRO)

    # Check parameters specified in BenchmarkContext.
    assert (benchmark._args.num_loops == 10)
    assert (benchmark._args.n == 16384)
    assert (benchmark._args.k == 16384)
    assert (benchmark._args.m == 16384)
    assert (sorted(benchmark._args.precision) == ['fp32'])

    # Check results and metrics.
    assert (benchmark.run_count == 1)
    assert (benchmark.return_code == ReturnCode.SUCCESS)
    assert ('raw_output_fp32' in benchmark.raw_data)
    assert (len(benchmark.raw_data['raw_output_fp32']) == 1)
    assert (isinstance(benchmark.raw_data['raw_output_fp32'][0], str))

    assert ('fp32_flops' in benchmark.result)
    assert (len(benchmark.result['fp32_flops']) == 1)
    assert (isinstance(benchmark.result['fp32_flops'][0], numbers.Number))