diff --git a/.codecov.yml b/.codecov.yml
index 3f36d561..81d50f8b 100644
--- a/.codecov.yml
+++ b/.codecov.yml
@@ -17,6 +17,7 @@ coverage:
- cpu-python3.6-unit-test
- cpu-python3.7-unit-test
- cuda-unit-test
+ - directx-unit-test
patch:
default:
target: 80%
@@ -25,3 +26,4 @@ coverage:
- cpu-python3.6-unit-test
- cpu-python3.7-unit-test
- cuda-unit-test
+ - directx-unit-test
diff --git a/.github/workflows/build-win.yml b/.github/workflows/build-win.yml
index 24ed3d12..6283544b 100644
--- a/.github/workflows/build-win.yml
+++ b/.github/workflows/build-win.yml
@@ -1,4 +1,4 @@
-name: Build on Windows
+name: Build on Windows and run directx unit test
on:
push:
@@ -19,6 +19,10 @@ jobs:
uses: actions/checkout@v2
with:
submodules: true
+ - name: Cleanup docker data
+ run: |
+ docker system prune -a -f
+ docker volume prune -a -f
- name: Build Docker image
working-directory: .
shell: pwsh
@@ -44,11 +48,6 @@ jobs:
TAG: superbench/main:win2004
USER: ${{ secrets.DOCKERHUB_USERNAME }}
PASS: ${{ secrets.DOCKERHUB_TOKEN }}
- directx-unit-test:
- name: DirectX unit test
- needs: docker
- runs-on: [self-hosted, windows, x64, win2004]
- steps:
- name: Add bash to PATH
shell: pwsh
run: |
diff --git a/superbench/benchmarks/micro_benchmarks/__init__.py b/superbench/benchmarks/micro_benchmarks/__init__.py
index c1cb3a1b..57304bc4 100644
--- a/superbench/benchmarks/micro_benchmarks/__init__.py
+++ b/superbench/benchmarks/micro_benchmarks/__init__.py
@@ -31,6 +31,7 @@ from superbench.benchmarks.micro_benchmarks.rocm_memory_bw_performance import Ro
from superbench.benchmarks.micro_benchmarks.sharding_matmul import ShardingMatmul
from superbench.benchmarks.micro_benchmarks.tcp_connectivity import TCPConnectivityBenchmark
from superbench.benchmarks.micro_benchmarks.tensorrt_inference_performance import TensorRTInferenceBenchmark
+from superbench.benchmarks.micro_benchmarks.directx_gemm_flops_performance import DirectXGPUCoreFlops
__all__ = [
'ComputationCommunicationOverlap',
@@ -61,4 +62,5 @@ __all__ = [
'ShardingMatmul',
'TCPConnectivityBenchmark',
'TensorRTInferenceBenchmark',
+ 'DirectXGPUCoreFlops',
]
diff --git a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance.py b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance.py
new file mode 100644
index 00000000..86236754
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance.py
@@ -0,0 +1,145 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Module of the DirectXGPUCoreFlops performance benchmarks."""
+
+import os
+from superbench.common.utils import logger
+from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode
+from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
+
+
+class DirectXGPUCoreFlops(MicroBenchmarkWithInvoke):
+ """The DirectXGPUCoreFlops benchmark class."""
+ def __init__(self, name, parameters=''):
+ """Constructor.
+
+ Args:
+ name (str): benchmark name.
+ parameters (str): benchmark parameters.
+ """
+ super().__init__(name, parameters)
+ self._bin_name = 'DirectXGPUCoreFlops.exe'
+ self._support_precisions = ['fp16', 'fp32']
+ self._precision_need_to_run = list()
+
+ def add_parser_arguments(self):
+ """Add the specified arguments."""
+ super().add_parser_arguments()
+ self._parser.add_argument(
+ '--num_loops',
+ type=int,
+ default=10,
+ required=False,
+ help='The number of benchmark runs.',
+ )
+ self._parser.add_argument(
+ '--num_warm_up',
+ type=int,
+ default=2,
+ required=False,
+ help='The number of warm up runs.',
+ )
+ self._parser.add_argument(
+ '--n',
+ type=int,
+ default=16 * 256,
+ required=False,
+ help='The N dim of matmul (N, K) * (K, M).',
+ )
+ self._parser.add_argument(
+ '--k',
+ type=int,
+ default=16 * 256,
+ required=False,
+ help='The K dim of matmul (N, K) * (K, M).',
+ )
+ self._parser.add_argument(
+ '--m',
+ type=int,
+ default=16 * 256,
+ required=False,
+ help='The M dim of matmul (N, K) * (K, M).',
+ )
+ self._parser.add_argument(
+ '--precision',
+ type=str,
+ nargs='+',
+ default=list(),
+ help='Precision for benchmarking. E.g. {}.'.format(' '.join(self._support_precisions)),
+ )
+
+ def _preprocess(self):
+ """Preprocess/preparation operations before the benchmarking.
+
+ Return:
+ True if _preprocess() succeed.
+ """
+ if not super()._preprocess():
+ return False
+
+ if len(self._args.precision) == 0:
+ self._precision_need_to_run = self._support_precisions
+ else:
+ self._args.precision = [p.lower() for p in self._args.precision]
+ for p in self._args.precision:
+ if p not in self._support_precisions:
+ logger.warning(
+ 'Unsupported precision - benchmark: {}, precision: {}, expected: {}.'.format(
+ self._name, p, self._support_precisions
+ )
+ )
+ else:
+ self._precision_need_to_run.append(p)
+
+ if len(self._precision_need_to_run) == 0:
+ self._result.set_return_code(ReturnCode.NO_SUPPORTED_PRECISION)
+ return False
+
+ for p in self._precision_need_to_run:
+ command = os.path.join(self._args.bin_dir, self._bin_name)
+ command += (' --num_loops ' + str(self._args.num_loops))
+ command += (' --num_warm_up ' + str(self._args.num_warm_up))
+ command += (' --n ' + str(self._args.n))
+ command += (' --k ' + str(self._args.k))
+ command += (' --m ' + str(self._args.m))
+ command += (' --' + p)
+ self._commands.append(command)
+ return True
+
+ def _process_raw_result(self, cmd_idx, raw_output):
+ """Function to process raw results and save the summarized results.
+
+ self._result.add_raw_data() and self._result.add_result() need to be called to save the results.
+
+ Args:
+ cmd_idx (int): the index of command corresponding with the raw_output.
+ raw_output (str): raw output string of the micro-benchmark.
+
+ Return:
+ True if the raw output string is valid and result can be extracted.
+ """
+ precision = self._precision_need_to_run[cmd_idx]
+ self._result.add_raw_data('raw_output_' + precision, raw_output, self._args.log_raw_data)
+ valid = True
+ flops = list()
+ content = raw_output.splitlines()
+ try:
+ for line in content:
+ if 'TFLOPs' in line:
+ flops.append(float(line.split()[0]))
+ except BaseException:
+ valid = False
+ finally:
+ if valid is False or len(flops) == 0:
+ logger.error(
+ 'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'.format(
+ self._curr_run_index, self._name, raw_output
+ )
+ )
+ return False
+ self._result.add_result(precision + '_flops', max(flops))
+ return True
+
+
+BenchmarkRegistry.register_benchmark('directx-gpu-core-flops', DirectXGPUCoreFlops, platform=Platform.DIRECTX)
diff --git a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/BenchmarkOptions.h b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/BenchmarkOptions.h
index 8ba9fb91..0a244e5d 100644
--- a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/BenchmarkOptions.h
+++ b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/BenchmarkOptions.h
@@ -42,10 +42,10 @@ class BenchmarkOptions : public Options {
m = get_cmd_line_argument_int("--m", 16 * 256);
n = get_cmd_line_argument_int("--n", 16 * 256);
k = get_cmd_line_argument_int("--k", 16 * 256);
- if (get_cmd_line_argument_bool("--f16")) {
+ if (get_cmd_line_argument_bool("--fp16")) {
mode_precision = Option::F16;
}
- if (get_cmd_line_argument_bool("--f32")) {
+ if (get_cmd_line_argument_bool("--fp32")) {
mode_precision = Option::F32;
}
}
diff --git a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.cpp b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.cpp
index 206c49f9..d41316a0 100644
--- a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.cpp
+++ b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.cpp
@@ -25,7 +25,7 @@ void GPUCore::Run() {
int loops = opts->num_loops;
std::cout << "GPUCoreFLOPs" << std::endl;
-
+ gpuTimer.init(m_device.Get(), m_commandQueue.Get(), 1, D3D12::QueueType::compute);
switch (opts->mode_precision) {
case Option::F32: {
// Prepare input and output data and buffers.
@@ -37,7 +37,6 @@ void GPUCore::Run() {
ExecuteComputeOp();
}
for (int i = 0; i < loops; ++i) {
- gpuTimer.init(m_device.Get(), m_commandQueue.Get(), 1, D3D12::QueueType::compute);
// Do FLOPs job.
double timeInMs = ExecuteComputeOp();
auto flops = (int64_t(m) * n * k + m * n) * 2 * 1e-9 / timeInMs;
@@ -55,7 +54,6 @@ void GPUCore::Run() {
ExecuteComputeOp();
}
for (int i = 0; i < loops; ++i) {
- gpuTimer.init(m_device.Get(), m_commandQueue.Get(), 1, D3D12::QueueType::compute);
// Do FLOPs job.
double timeInMs = ExecuteComputeOp();
auto flops = (int64_t(m) * n * k + m * n) * 2 * 1e-9 / timeInMs;
diff --git a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.vcxproj b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.vcxproj
index 109d3930..f70749b4 100644
--- a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.vcxproj
+++ b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.vcxproj
@@ -20,12 +20,14 @@
+ DirectXGPUCoreFlops
Application
true
v143
Unicode
+ DirectXGPUCoreFlops
Application
false
v143
diff --git a/tests/benchmarks/micro_benchmarks/test_directx_gemm_flops_performance.py b/tests/benchmarks/micro_benchmarks/test_directx_gemm_flops_performance.py
new file mode 100644
index 00000000..7571df75
--- /dev/null
+++ b/tests/benchmarks/micro_benchmarks/test_directx_gemm_flops_performance.py
@@ -0,0 +1,47 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Tests for DirectXGPUCoreFlops benchmark."""
+
+import numbers
+
+from tests.helper import decorator
+from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform
+
+
+@decorator.directx_test
+def test_directx_gpucoreflops():
+ """Test DirectXGPUCoreFlops benchmark."""
+ # Test for default configuration
+ context = BenchmarkRegistry.create_benchmark_context(
+ 'directx-gpu-core-flops',
+ platform=Platform.DIRECTX,
+ parameters=r'--num_loops 10 --n 16384 --k 16384 --m 16384 --precision fp32'
+ )
+
+ assert (BenchmarkRegistry.is_benchmark_context_valid(context))
+
+ benchmark = BenchmarkRegistry.launch_benchmark(context)
+
+ # Check basic information.
+ assert (benchmark)
+ assert (benchmark.name == 'directx-gpu-core-flops')
+ assert (benchmark.type == BenchmarkType.MICRO)
+
+ # Check parameters specified in BenchmarkContext.
+ assert (benchmark._args.num_loops == 10)
+ assert (benchmark._args.n == 16384)
+ assert (benchmark._args.k == 16384)
+ assert (benchmark._args.m == 16384)
+ assert (sorted(benchmark._args.precision) == ['fp32'])
+
+ # Check results and metrics.
+ assert (benchmark.run_count == 1)
+ assert (benchmark.return_code == ReturnCode.SUCCESS)
+ assert ('raw_output_fp32' in benchmark.raw_data)
+ assert (len(benchmark.raw_data['raw_output_fp32']) == 1)
+ assert (isinstance(benchmark.raw_data['raw_output_fp32'][0], str))
+
+ assert ('fp32_flops' in benchmark.result)
+ assert (len(benchmark.result['fp32_flops']) == 1)
+ assert (isinstance(benchmark.result['fp32_flops'][0], numbers.Number))