diff --git a/.codecov.yml b/.codecov.yml index 3f36d561..81d50f8b 100644 --- a/.codecov.yml +++ b/.codecov.yml @@ -17,6 +17,7 @@ coverage: - cpu-python3.6-unit-test - cpu-python3.7-unit-test - cuda-unit-test + - directx-unit-test patch: default: target: 80% @@ -25,3 +26,4 @@ coverage: - cpu-python3.6-unit-test - cpu-python3.7-unit-test - cuda-unit-test + - directx-unit-test diff --git a/.github/workflows/build-win.yml b/.github/workflows/build-win.yml index 24ed3d12..6283544b 100644 --- a/.github/workflows/build-win.yml +++ b/.github/workflows/build-win.yml @@ -1,4 +1,4 @@ -name: Build on Windows +name: Build on Windows and run directx unit test on: push: @@ -19,6 +19,10 @@ jobs: uses: actions/checkout@v2 with: submodules: true + - name: Cleanup docker data + run: | + docker system prune -a -f + docker volume prune -a -f - name: Build Docker image working-directory: . shell: pwsh @@ -44,11 +48,6 @@ jobs: TAG: superbench/main:win2004 USER: ${{ secrets.DOCKERHUB_USERNAME }} PASS: ${{ secrets.DOCKERHUB_TOKEN }} - directx-unit-test: - name: DirectX unit test - needs: docker - runs-on: [self-hosted, windows, x64, win2004] - steps: - name: Add bash to PATH shell: pwsh run: | diff --git a/superbench/benchmarks/micro_benchmarks/__init__.py b/superbench/benchmarks/micro_benchmarks/__init__.py index c1cb3a1b..57304bc4 100644 --- a/superbench/benchmarks/micro_benchmarks/__init__.py +++ b/superbench/benchmarks/micro_benchmarks/__init__.py @@ -31,6 +31,7 @@ from superbench.benchmarks.micro_benchmarks.rocm_memory_bw_performance import Ro from superbench.benchmarks.micro_benchmarks.sharding_matmul import ShardingMatmul from superbench.benchmarks.micro_benchmarks.tcp_connectivity import TCPConnectivityBenchmark from superbench.benchmarks.micro_benchmarks.tensorrt_inference_performance import TensorRTInferenceBenchmark +from superbench.benchmarks.micro_benchmarks.directx_gemm_flops_performance import DirectXGPUCoreFlops __all__ = [ 'ComputationCommunicationOverlap', @@ -61,4 +62,5 
@@ __all__ = [ 'ShardingMatmul', 'TCPConnectivityBenchmark', 'TensorRTInferenceBenchmark', + 'DirectXGPUCoreFlops', ] diff --git a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance.py b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance.py new file mode 100644 index 00000000..86236754 --- /dev/null +++ b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance.py @@ -0,0 +1,145 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +"""Module of the DirectXGPUCoreFlops performance benchmarks.""" + +import os +from superbench.common.utils import logger +from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode +from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke + + +class DirectXGPUCoreFlops(MicroBenchmarkWithInvoke): + """The DirectXGPUCoreFlops benchmark class.""" + def __init__(self, name, parameters=''): + """Constructor. + + Args: + name (str): benchmark name. + parameters (str): benchmark parameters. 
+ """ + super().__init__(name, parameters) + self._bin_name = 'DirectXGPUCoreFlops.exe' + self._support_precisions = ['fp16', 'fp32'] + self._precision_need_to_run = list() + + def add_parser_arguments(self): + """Add the specified arguments.""" + super().add_parser_arguments() + self._parser.add_argument( + '--num_loops', + type=int, + default=10, + required=False, + help='The number of benchmark runs.', + ) + self._parser.add_argument( + '--num_warm_up', + type=int, + default=2, + required=False, + help='The number of warm up runs.', + ) + self._parser.add_argument( + '--n', + type=int, + default=16 * 256, + required=False, + help='The N dim of matmul (N, K) * (K, M).', + ) + self._parser.add_argument( + '--k', + type=int, + default=16 * 256, + required=False, + help='The K dim of matmul (N, K) * (K, M).', + ) + self._parser.add_argument( + '--m', + type=int, + default=16 * 256, + required=False, + help='The M dim of matmul (N, K) * (K, M).', + ) + self._parser.add_argument( + '--precision', + type=str, + nargs='+', + default=list(), + help='Precision for benchmarking. E.g. {}.'.format(' '.join(self._support_precisions)), + ) + + def _preprocess(self): + """Preprocess/preparation operations before the benchmarking. + + Return: + True if _preprocess() succeed. 
+ """ + if not super()._preprocess(): + return False + + if len(self._args.precision) == 0: + self._precision_need_to_run = self._support_precisions + else: + self._args.precision = [p.lower() for p in self._args.precision] + for p in self._args.precision: + if p not in self._support_precisions: + logger.warning( + 'Unsupported precision - benchmark: {}, precision: {}, expected: {}.'.format( + self._name, p, self._support_precisions + ) + ) + else: + self._precision_need_to_run.append(p) + + if len(self._precision_need_to_run) == 0: + self._result.set_return_code(ReturnCode.NO_SUPPORTED_PRECISION) + return False + + for p in self._precision_need_to_run: + command = os.path.join(self._args.bin_dir, self._bin_name) + command += (' --num_loops ' + str(self._args.num_loops)) + command += (' --num_warm_up ' + str(self._args.num_warm_up)) + command += (' --n ' + str(self._args.n)) + command += (' --k ' + str(self._args.k)) + command += (' --m ' + str(self._args.m)) + command += (' --' + p) + self._commands.append(command) + return True + + def _process_raw_result(self, cmd_idx, raw_output): + """Function to process raw results and save the summarized results. + + self._result.add_raw_data() and self._result.add_result() need to be called to save the results. + + Args: + cmd_idx (int): the index of command corresponding with the raw_output. + raw_output (str): raw output string of the micro-benchmark. + + Return: + True if the raw output string is valid and result can be extracted. 
+ """ + precision = self._precision_need_to_run[cmd_idx] + self._result.add_raw_data('raw_output_' + precision, raw_output, self._args.log_raw_data) + valid = True + flops = list() + content = raw_output.splitlines() + try: + for line in content: + if 'TFLOPs' in line: + flops.append(float(line.split()[0])) + except BaseException: + valid = False + finally: + if valid is False or len(flops) == 0: + logger.error( + 'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'.format( + self._curr_run_index, self._name, raw_output + ) + ) + return False + self._result.add_result(precision + '_flops', max(flops)) + return True + + +BenchmarkRegistry.register_benchmark('directx-gpu-core-flops', DirectXGPUCoreFlops, platform=Platform.DIRECTX) diff --git a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/BenchmarkOptions.h b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/BenchmarkOptions.h index 8ba9fb91..0a244e5d 100644 --- a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/BenchmarkOptions.h +++ b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/BenchmarkOptions.h @@ -42,10 +42,10 @@ class BenchmarkOptions : public Options { m = get_cmd_line_argument_int("--m", 16 * 256); n = get_cmd_line_argument_int("--n", 16 * 256); k = get_cmd_line_argument_int("--k", 16 * 256); - if (get_cmd_line_argument_bool("--f16")) { + if (get_cmd_line_argument_bool("--fp16")) { mode_precision = Option::F16; } - if (get_cmd_line_argument_bool("--f32")) { + if (get_cmd_line_argument_bool("--fp32")) { mode_precision = Option::F32; } } diff --git a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.cpp b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.cpp index 206c49f9..d41316a0 100644 --- a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.cpp +++ 
b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.cpp @@ -25,7 +25,7 @@ void GPUCore::Run() { int loops = opts->num_loops; std::cout << "GPUCoreFLOPs" << std::endl; - + gpuTimer.init(m_device.Get(), m_commandQueue.Get(), 1, D3D12::QueueType::compute); switch (opts->mode_precision) { case Option::F32: { // Prepare input and output data and buffers. @@ -37,7 +37,6 @@ void GPUCore::Run() { ExecuteComputeOp(); } for (int i = 0; i < loops; ++i) { - gpuTimer.init(m_device.Get(), m_commandQueue.Get(), 1, D3D12::QueueType::compute); // Do FLOPs job. double timeInMs = ExecuteComputeOp(); auto flops = (int64_t(m) * n * k + m * n) * 2 * 1e-9 / timeInMs; @@ -55,7 +54,6 @@ void GPUCore::Run() { ExecuteComputeOp(); } for (int i = 0; i < loops; ++i) { - gpuTimer.init(m_device.Get(), m_commandQueue.Get(), 1, D3D12::QueueType::compute); // Do FLOPs job. double timeInMs = ExecuteComputeOp(); auto flops = (int64_t(m) * n * k + m * n) * 2 * 1e-9 / timeInMs; diff --git a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.vcxproj b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.vcxproj index 109d3930..f70749b4 100644 --- a/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.vcxproj +++ b/superbench/benchmarks/micro_benchmarks/directx_gemm_flops_performance/GPUCore.vcxproj @@ -20,12 +20,14 @@ + DirectXGPUCoreFlops Application true v143 Unicode + DirectXGPUCoreFlops Application false v143 diff --git a/tests/benchmarks/micro_benchmarks/test_directx_gemm_flops_performance.py b/tests/benchmarks/micro_benchmarks/test_directx_gemm_flops_performance.py new file mode 100644 index 00000000..7571df75 --- /dev/null +++ b/tests/benchmarks/micro_benchmarks/test_directx_gemm_flops_performance.py @@ -0,0 +1,47 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +"""Tests for DirectXGPUCoreFlops benchmark.""" + +import numbers + +from tests.helper import decorator +from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform + + +@decorator.directx_test +def test_directx_gpucoreflops(): + """Test DirectXGPUCoreFlops benchmark.""" + # Test for default configuration + context = BenchmarkRegistry.create_benchmark_context( + 'directx-gpu-core-flops', + platform=Platform.DIRECTX, + parameters=r'--num_loops 10 --n 16384 --k 16384 --m 16384 --precision fp32' + ) + + assert (BenchmarkRegistry.is_benchmark_context_valid(context)) + + benchmark = BenchmarkRegistry.launch_benchmark(context) + + # Check basic information. + assert (benchmark) + assert (benchmark.name == 'directx-gpu-core-flops') + assert (benchmark.type == BenchmarkType.MICRO) + + # Check parameters specified in BenchmarkContext. + assert (benchmark._args.num_loops == 10) + assert (benchmark._args.n == 16384) + assert (benchmark._args.k == 16384) + assert (benchmark._args.m == 16384) + assert (sorted(benchmark._args.precision) == ['fp32']) + + # Check results and metrics. + assert (benchmark.run_count == 1) + assert (benchmark.return_code == ReturnCode.SUCCESS) + assert ('raw_output_fp32' in benchmark.raw_data) + assert (len(benchmark.raw_data['raw_output_fp32']) == 1) + assert (isinstance(benchmark.raw_data['raw_output_fp32'][0], str)) + + assert ('fp32_flops' in benchmark.result) + assert (len(benchmark.result['fp32_flops']) == 1) + assert (isinstance(benchmark.result['fp32_flops'][0], numbers.Number))