Benchmarks: micro benchmarks - add python code for DirectXGPUCoreFlops (#542)

**Description**
add python code for DirectX core flops and init DirectX test pipeline.

**Major Revision**
- add python code for DirectX core flops 
- init DirectX test pipeline


**Minor Revision**
- add test for DirectX core flops
This commit is contained in:
Yuting Jiang 2023-07-05 16:56:21 +08:00 committed by GitHub
Parent 3704a432b9
Commit f1d608aef7
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 206 additions and 11 deletions

Просмотреть файл

@ -17,6 +17,7 @@ coverage:
- cpu-python3.6-unit-test
- cpu-python3.7-unit-test
- cuda-unit-test
- directx-unit-test
patch:
default:
target: 80%
@ -25,3 +26,4 @@ coverage:
- cpu-python3.6-unit-test
- cpu-python3.7-unit-test
- cuda-unit-test
- directx-unit-test

11
.github/workflows/build-win.yml поставляемый
Просмотреть файл

@ -1,4 +1,4 @@
name: Build on Windows
name: Build on Windows and run directx unit test
on:
push:
@ -19,6 +19,10 @@ jobs:
uses: actions/checkout@v2
with:
submodules: true
- name: Clearnup docker data
run: |
docker system prune -a -f
docker volume prune -a -f
- name: Build Docker image
working-directory: .
shell: pwsh
@ -44,11 +48,6 @@ jobs:
TAG: superbench/main:win2004
USER: ${{ secrets.DOCKERHUB_USERNAME }}
PASS: ${{ secrets.DOCKERHUB_TOKEN }}
directx-unit-test:
name: DirectX unit test
needs: docker
runs-on: [self-hosted, windows, x64, win2004]
steps:
- name: Add bash to PATH
shell: pwsh
run: |

Просмотреть файл

@ -31,6 +31,7 @@ from superbench.benchmarks.micro_benchmarks.rocm_memory_bw_performance import Ro
from superbench.benchmarks.micro_benchmarks.sharding_matmul import ShardingMatmul
from superbench.benchmarks.micro_benchmarks.tcp_connectivity import TCPConnectivityBenchmark
from superbench.benchmarks.micro_benchmarks.tensorrt_inference_performance import TensorRTInferenceBenchmark
from superbench.benchmarks.micro_benchmarks.directx_gemm_flops_performance import DirectXGPUCoreFlops
__all__ = [
'ComputationCommunicationOverlap',
@ -61,4 +62,5 @@ __all__ = [
'ShardingMatmul',
'TCPConnectivityBenchmark',
'TensorRTInferenceBenchmark',
'DirectXGPUCoreFlops',
]

Просмотреть файл

@ -0,0 +1,145 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""Module of the DirectXGPUCoreFlops performance benchmarks."""
import os
from superbench.common.utils import logger
from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode
from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
class DirectXGPUCoreFlops(MicroBenchmarkWithInvoke):
"""The DirectXGPUCoreFlops benchmark class."""
def __init__(self, name, parameters=''):
"""Constructor.
Args:
name (str): benchmark name.
parameters (str): benchmark parameters.
"""
super().__init__(name, parameters)
self._bin_name = 'DirectXGPUCoreFlops.exe'
self._support_precisions = ['fp16', 'fp32']
self._precision_need_to_run = list()
def add_parser_arguments(self):
"""Add the specified arguments."""
super().add_parser_arguments()
self._parser.add_argument(
'--num_loops',
type=int,
default=10,
required=False,
help='The number of benchmark runs.',
)
self._parser.add_argument(
'--num_warm_up',
type=int,
default=2,
required=False,
help='The number of warm up runs.',
)
self._parser.add_argument(
'--n',
type=int,
default=16 * 256,
required=False,
help='The N dim of matmul (N, K) * (K, M).',
)
self._parser.add_argument(
'--k',
type=int,
default=16 * 256,
required=False,
help='The K dim of matmul (N, K) * (K, M).',
)
self._parser.add_argument(
'--m',
type=int,
default=16 * 256,
required=False,
help='The M dim of matmul (N, K) * (K, M).',
)
self._parser.add_argument(
'--precision',
type=str,
nargs='+',
default=list(),
help='Precision for benchmarking. E.g. {}.'.format(' '.join(self._support_precisions)),
)
def _preprocess(self):
"""Preprocess/preparation operations before the benchmarking.
Return:
True if _preprocess() succeed.
"""
if not super()._preprocess():
return False
if len(self._args.precision) == 0:
self._precision_need_to_run = self._support_precisions
else:
self._args.precision = [p.lower() for p in self._args.precision]
for p in self._args.precision:
if p not in self._support_precisions:
logger.warning(
'Unsupported precision - benchmark: {}, precision: {}, expected: {}.'.format(
self._name, p, self._support_precisions
)
)
else:
self._precision_need_to_run.append(p)
if len(self._precision_need_to_run) == 0:
self._result.set_return_code(ReturnCode.NO_SUPPORTED_PRECISION)
return False
for p in self._precision_need_to_run:
command = os.path.join(self._args.bin_dir, self._bin_name)
command += (' --num_loops ' + str(self._args.num_loops))
command += (' --num_warm_up ' + str(self._args.num_warm_up))
command += (' --n ' + str(self._args.n))
command += (' --k ' + str(self._args.k))
command += (' --m ' + str(self._args.m))
command += (' --' + p)
self._commands.append(command)
return True
def _process_raw_result(self, cmd_idx, raw_output):
"""Function to process raw results and save the summarized results.
self._result.add_raw_data() and self._result.add_result() need to be called to save the results.
Args:
cmd_idx (int): the index of command corresponding with the raw_output.
raw_output (str): raw output string of the micro-benchmark.
Return:
True if the raw output string is valid and result can be extracted.
"""
precision = self._precision_need_to_run[cmd_idx]
self._result.add_raw_data('raw_output_' + precision, raw_output, self._args.log_raw_data)
valid = True
flops = list()
content = raw_output.splitlines()
try:
for line in content:
if 'TFLOPs' in line:
flops.append(float(line.split()[0]))
except BaseException:
valid = False
finally:
if valid is False or len(flops) == 0:
logger.error(
'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'.format(
self._curr_run_index, self._name, raw_output
)
)
return False
self._result.add_result(precision + '_flops', max(flops))
return True
BenchmarkRegistry.register_benchmark('directx-gpu-core-flops', DirectXGPUCoreFlops, platform=Platform.DIRECTX)

Просмотреть файл

@ -42,10 +42,10 @@ class BenchmarkOptions : public Options {
m = get_cmd_line_argument_int("--m", 16 * 256);
n = get_cmd_line_argument_int("--n", 16 * 256);
k = get_cmd_line_argument_int("--k", 16 * 256);
if (get_cmd_line_argument_bool("--f16")) {
if (get_cmd_line_argument_bool("--fp16")) {
mode_precision = Option::F16;
}
if (get_cmd_line_argument_bool("--f32")) {
if (get_cmd_line_argument_bool("--fp32")) {
mode_precision = Option::F32;
}
}

Просмотреть файл

@ -25,7 +25,7 @@ void GPUCore::Run() {
int loops = opts->num_loops;
std::cout << "GPUCoreFLOPs" << std::endl;
gpuTimer.init(m_device.Get(), m_commandQueue.Get(), 1, D3D12::QueueType::compute);
switch (opts->mode_precision) {
case Option::F32: {
// Prepare input and output data and buffers.
@ -37,7 +37,6 @@ void GPUCore::Run() {
ExecuteComputeOp();
}
for (int i = 0; i < loops; ++i) {
gpuTimer.init(m_device.Get(), m_commandQueue.Get(), 1, D3D12::QueueType::compute);
// Do FLOPs job.
double timeInMs = ExecuteComputeOp();
auto flops = (int64_t(m) * n * k + m * n) * 2 * 1e-9 / timeInMs;
@ -55,7 +54,6 @@ void GPUCore::Run() {
ExecuteComputeOp();
}
for (int i = 0; i < loops; ++i) {
gpuTimer.init(m_device.Get(), m_commandQueue.Get(), 1, D3D12::QueueType::compute);
// Do FLOPs job.
double timeInMs = ExecuteComputeOp();
auto flops = (int64_t(m) * n * k + m * n) * 2 * 1e-9 / timeInMs;

Просмотреть файл

@ -20,12 +20,14 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<TargetName>DirectXGPUCoreFlops</TargetName>
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v143</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<TargetName>DirectXGPUCoreFlops</TargetName>
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v143</PlatformToolset>

Просмотреть файл

@ -0,0 +1,47 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""Tests for DirectXGPUCorefloops benchmark."""
import numbers
from tests.helper import decorator
from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform
@decorator.directx_test
def test_directx_gpucoreflops():
    """Test DirectXGPUCoreFlops benchmark."""
    # Launch the benchmark with an explicit fp32 configuration.
    context = BenchmarkRegistry.create_benchmark_context(
        'directx-gpu-core-flops',
        platform=Platform.DIRECTX,
        parameters=r'--num_loops 10 --n 16384 --k 16384 --m 16384 --precision fp32'
    )
    assert (BenchmarkRegistry.is_benchmark_context_valid(context))
    benchmark = BenchmarkRegistry.launch_benchmark(context)

    # Basic identity checks.
    assert (benchmark)
    assert (benchmark.name == 'directx-gpu-core-flops')
    assert (benchmark.type == BenchmarkType.MICRO)

    # Parameters parsed from the BenchmarkContext command line.
    for attr, expected in (('num_loops', 10), ('n', 16384), ('k', 16384), ('m', 16384)):
        assert (getattr(benchmark._args, attr) == expected)
    assert (sorted(benchmark._args.precision) == ['fp32'])

    # Run status, raw data and extracted metrics.
    assert (benchmark.run_count == 1)
    assert (benchmark.return_code == ReturnCode.SUCCESS)
    assert ('raw_output_fp32' in benchmark.raw_data)
    assert (len(benchmark.raw_data['raw_output_fp32']) == 1)
    assert (isinstance(benchmark.raw_data['raw_output_fp32'][0], str))
    assert ('fp32_flops' in benchmark.result)
    assert (len(benchmark.result['fp32_flops']) == 1)
    assert (isinstance(benchmark.result['fp32_flops'][0], numbers.Number))