From af4cfd5bbfe989b212d5311656be0cbe7cd5ae35 Mon Sep 17 00:00:00 2001
From: Yuting Jiang
Date: Wed, 5 Jul 2023 22:07:13 +0800
Subject: [PATCH] Benchmarks: micro benchmarks - add python code for DirectXGPUMemBw (#547)

**Description**

Add Python code for DirectXGPUMemBw: register the new `directx-gpu-mem-bw`
micro-benchmark, add a `cwd` argument to `run_command` so the benchmark binary
is executed from its own directory, rename the executable's `--check` option
to `--check_data`, and add unit tests.
---
 .github/workflows/build-win.yml               |   1 +
 .../benchmarks/micro_benchmarks/__init__.py   |   2 +
 .../directx_mem_bw_performance.py             | 149 ++++++++++++++++++
 .../BenchmarkOptions.h                        |   2 +-
 .../GPUMemRwBw.vcxproj                        |   2 +
 .../benchmarks/micro_benchmarks/micro_base.py |   2 +-
 superbench/common/utils/process.py            |  17 +-
 .../test_directx_mem_bw_performance.py        |  52 ++++++
 8 files changed, 222 insertions(+), 5 deletions(-)
 create mode 100644 superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance.py
 create mode 100644 tests/benchmarks/micro_benchmarks/test_directx_mem_bw_performance.py

diff --git a/.github/workflows/build-win.yml b/.github/workflows/build-win.yml
index 6283544b..d1b9a1c8 100644
--- a/.github/workflows/build-win.yml
+++ b/.github/workflows/build-win.yml
@@ -23,6 +23,7 @@ jobs:
         run: |
           docker system prune -a -f
           docker volume prune -a -f
+        shell: pwsh
       - name: Build Docker image
         working-directory: .
         shell: pwsh
diff --git a/superbench/benchmarks/micro_benchmarks/__init__.py b/superbench/benchmarks/micro_benchmarks/__init__.py
index 57304bc4..9fe14336 100644
--- a/superbench/benchmarks/micro_benchmarks/__init__.py
+++ b/superbench/benchmarks/micro_benchmarks/__init__.py
@@ -31,6 +31,7 @@ from superbench.benchmarks.micro_benchmarks.rocm_memory_bw_performance import Ro
 from superbench.benchmarks.micro_benchmarks.sharding_matmul import ShardingMatmul
 from superbench.benchmarks.micro_benchmarks.tcp_connectivity import TCPConnectivityBenchmark
 from superbench.benchmarks.micro_benchmarks.tensorrt_inference_performance import TensorRTInferenceBenchmark
+from superbench.benchmarks.micro_benchmarks.directx_mem_bw_performance import DirectXGPUMemBw
 from superbench.benchmarks.micro_benchmarks.directx_gemm_flops_performance import DirectXGPUCoreFlops
 
 __all__ = [
@@ -62,5 +63,6 @@ __all__ = [
     'ShardingMatmul',
     'TCPConnectivityBenchmark',
     'TensorRTInferenceBenchmark',
+    'DirectXGPUMemBw',
     'DirectXGPUCoreFlops',
 ]
diff --git a/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance.py b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance.py
new file mode 100644
index 00000000..ff9d9d23
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance.py
@@ -0,0 +1,149 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Module of the DirectXGPUMemBw performance benchmarks."""
+
+import os
+
+from superbench.common.utils import logger
+from superbench.benchmarks import BenchmarkRegistry, Platform
+from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
+
+
+class DirectXGPUMemBw(MicroBenchmarkWithInvoke):
+    """The DirectXGPUMemBw benchmark class."""
+    def __init__(self, name, parameters=''):
+        """Constructor.
+
+        Args:
+            name (str): benchmark name.
+            parameters (str): benchmark parameters.
+        """
+        super().__init__(name, parameters)
+        self._bin_name = 'DirectXGPUMemRwBw.exe'
+        self._modes = ['read', 'write', 'readwrite']
+
+    def add_parser_arguments(self):
+        """Add the specified arguments."""
+        super().add_parser_arguments()
+        self._parser.add_argument(
+            '--num_warm_up',
+            type=int,
+            default=0,
+            required=False,
+            help='Number of warm up rounds.',
+        )
+        self._parser.add_argument(
+            '--num_loop',
+            type=int,
+            default=100,
+            required=False,
+            help='Number of loops to measure the performance.',
+        )
+        self._parser.add_argument(
+            '--size',
+            type=int,
+            default=None,
+            required=False,
+            help='Size of data for GPU copy.',
+        )
+        self._parser.add_argument(
+            '--minbytes',
+            type=int,
+            default=4096,
+            required=False,
+            help='Lower data size bound to test.',
+        )
+        self._parser.add_argument(
+            '--maxbytes',
+            type=int,
+            default=1024 * 1024 * 1024,
+            required=False,
+            help='Upper data size bound to test.',
+        )
+        self._parser.add_argument(
+            '--check_data',
+            action='store_true',
+            required=False,
+            help='Whether to check data correctness.',
+        )
+        self._parser.add_argument(
+            '--mode',
+            type=str,
+            nargs='+',
+            default=list(),
+            help='Memory operation mode. E.g. {}.'.format(' '.join(self._modes)),
+        )
+
+    def _preprocess(self):
+        """Preprocess/preparation operations before the benchmarking."""
+        if not super()._preprocess():
+            return False
+
+        self._args.mode = [m.lower() for m in self._args.mode]
+        for mode in list(self._args.mode):
+            if mode not in self._modes:
+                logger.warning(
+                    'Unsupported mode - benchmark: {}, mode: {}, expected: {}.'.format(self._name, mode, self._modes)
+                )
+                self._args.mode.remove(mode)
+
+        if len(self._args.mode) == 0:
+            logger.error('No valid operation modes are provided.')
+            return False
+
+        for mode in self._args.mode:
+            command = os.path.join(self._args.bin_dir, self._bin_name)
+            command += (' --num_warm_up ' + str(self._args.num_warm_up))
+            command += (' --num_loop ' + str(self._args.num_loop))
+            if self._args.size is not None:
+                command += (' --size ' + str(self._args.size))
+            else:
+                command += (' --minbytes ' + str(self._args.minbytes))
+                command += (' --maxbytes ' + str(self._args.maxbytes))
+            if self._args.check_data:
+                command += (' --check_data')
+            command += (' --' + mode)
+            self._commands.append(command)
+
+        return True
+
+    def _process_raw_result(self, cmd_idx, raw_output):
+        """Function to process raw results and save the summarized results.
+
+        self._result.add_raw_data() and self._result.add_result() need to be called to save the results.
+
+        Args:
+            cmd_idx (int): the index of command corresponding with the raw_output.
+            raw_output (str): raw output string of the micro-benchmark.
+
+        Return:
+            True if the raw output string is valid and result can be extracted.
+        """
+        mode = self._args.mode[cmd_idx]
+        self._result.add_raw_data('raw_output_' + mode, raw_output, self._args.log_raw_data)
+
+        valid = True
+
+        content = raw_output.splitlines()
+        try:
+            for line in content:
+                if 'GPUMemBw:' in line:
+                    size = int(line.split()[-3])
+                    bw = float(line.split()[-2])
+                    self._result.add_result(f'{mode}_{size}_bw', bw)
+                if 'error' in line.lower():
+                    valid = False
+        except BaseException:
+            valid = False
+        finally:
+            if not valid:
+                logger.error(
+                    'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'.format(
+                        self._curr_run_index, self._name, raw_output
+                    )
+                )
+                return False
+        return True
+
+
+BenchmarkRegistry.register_benchmark('directx-gpu-mem-bw', DirectXGPUMemBw, platform=Platform.DIRECTX)
diff --git a/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/BenchmarkOptions.h b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/BenchmarkOptions.h
index 7893fe8a..c9d7507a 100644
--- a/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/BenchmarkOptions.h
+++ b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/BenchmarkOptions.h
@@ -68,7 +68,7 @@ class BenchmarkOptions : public Options {
         min_size = get_cmd_line_argument_int("--minbytes", 4 * 1024);
         max_size =
             get_cmd_line_argument_ulonglong("--maxbytes", static_cast<unsigned long long>(1LL * 1024 * 1024 * 1024));
-        check_data = get_cmd_line_argument_bool("--check");
+        check_data = get_cmd_line_argument_bool("--check_data");
         if (get_cmd_line_argument_bool("--read")) {
             mem_type = Memtype::Read;
         }
diff --git a/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/GPUMemRwBw.vcxproj b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/GPUMemRwBw.vcxproj
index 80ab02e3..b575f804 100644
--- a/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/GPUMemRwBw.vcxproj
+++ b/superbench/benchmarks/micro_benchmarks/directx_mem_bw_performance/GPUMemRwBw.vcxproj
@@ -19,12 +19,14 @@
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <TargetName>DirectXGPUMemRwBw</TargetName>
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>true</UseDebugLibraries>
     <PlatformToolset>v143</PlatformToolset>
     <CharacterSet>Unicode</CharacterSet>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <TargetName>DirectXGPUMemRwBw</TargetName>
     <ConfigurationType>Application</ConfigurationType>
     <UseDebugLibraries>false</UseDebugLibraries>
     <PlatformToolset>v143</PlatformToolset>
diff --git a/superbench/benchmarks/micro_benchmarks/micro_base.py b/superbench/benchmarks/micro_benchmarks/micro_base.py
index 7a2d3602..e1e85405 100644
--- a/superbench/benchmarks/micro_benchmarks/micro_base.py
+++ b/superbench/benchmarks/micro_benchmarks/micro_base.py
@@ -180,7 +180,7 @@ class MicroBenchmarkWithInvoke(MicroBenchmark):
                 )
             )
 
-            output = run_command(self._commands[cmd_idx], flush_output=self._args.log_flushing)
+            output = run_command(self._commands[cmd_idx], flush_output=self._args.log_flushing, cwd=self._args.bin_dir)
             if output.returncode != 0:
                 self._result.set_return_code(ReturnCode.MICROBENCHMARK_EXECUTION_FAILURE)
                 logger.error(
diff --git a/superbench/common/utils/process.py b/superbench/common/utils/process.py
index 334bf766..75767ead 100644
--- a/superbench/common/utils/process.py
+++ b/superbench/common/utils/process.py
@@ -10,13 +10,14 @@ import shlex
 from superbench.common.utils import stdout_logger
 
 
-def run_command(command, quiet=False, flush_output=False):
+def run_command(command, quiet=False, flush_output=False, cwd=None):
     """Run command in string format, return the result with stdout and stderr.
 
     Args:
         command (str): command to run.
         quiet (bool): no stdout display of the command if quiet is True.
         flush_output (bool): enable real-time output flush or not when running the command.
+        cwd (str): working directory to run the command.
 
     Return:
         result (subprocess.CompletedProcess): The return value from subprocess.run().
@@ -26,7 +27,11 @@
         try:
             args = shlex.split(command)
             process = subprocess.Popen(
-                args, cwd=os.getcwd(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True
+                args,
+                cwd=os.getcwd() if cwd is None else cwd,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.STDOUT,
+                universal_newlines=True
             )
             output = ''
             for line in process.stdout:
@@ -43,7 +48,13 @@
             return subprocess.CompletedProcess(args=args, returncode=-1, stdout=str(e))
     else:
         result = subprocess.run(
-            command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, check=False, universal_newlines=True
+            command,
+            cwd=os.getcwd() if cwd is None else cwd,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            shell=True,
+            check=False,
+            universal_newlines=True
         )
         if not quiet:
             stdout_logger.log(result.stdout)
diff --git a/tests/benchmarks/micro_benchmarks/test_directx_mem_bw_performance.py b/tests/benchmarks/micro_benchmarks/test_directx_mem_bw_performance.py
new file mode 100644
index 00000000..baeed54a
--- /dev/null
+++ b/tests/benchmarks/micro_benchmarks/test_directx_mem_bw_performance.py
@@ -0,0 +1,52 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Tests for DirectXGPUMemBw benchmark."""
+
+import numbers
+
+from tests.helper import decorator
+from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform
+
+
+@decorator.directx_test
+def test_directx_gpu_mem_bw():
+    """Test DirectXGPUMemBw benchmark."""
+    # Test for default configuration
+    context = BenchmarkRegistry.create_benchmark_context(
+        'directx-gpu-mem-bw',
+        platform=Platform.DIRECTX,
+        parameters=r'--num_warm_up 0 --num_loop 100 --size 1073741824 --mode read write'
+    )
+
+    assert (BenchmarkRegistry.is_benchmark_context_valid(context))
+
+    benchmark = BenchmarkRegistry.launch_benchmark(context)
+
+    # Check basic information.
+    assert (benchmark)
+    assert (benchmark.name == 'directx-gpu-mem-bw')
+    assert (benchmark.type == BenchmarkType.MICRO)
+
+    # Check parameters specified in BenchmarkContext.
+    assert (benchmark._args.num_warm_up == 0)
+    assert (benchmark._args.num_loop == 100)
+    assert (benchmark._args.size == 1073741824)
+    assert (sorted(benchmark._args.mode) == ['read', 'write'])
+
+    # Check results and metrics.
+    assert (benchmark.run_count == 1)
+    assert (benchmark.return_code == ReturnCode.SUCCESS)
+    assert ('raw_output_read' in benchmark.raw_data)
+    assert ('raw_output_write' in benchmark.raw_data)
+    assert (len(benchmark.raw_data['raw_output_read']) == 1)
+    assert (len(benchmark.raw_data['raw_output_write']) == 1)
+    assert (isinstance(benchmark.raw_data['raw_output_read'][0], str))
+    assert (isinstance(benchmark.raw_data['raw_output_write'][0], str))
+
+    assert ('read_1073741824_bw' in benchmark.result)
+    assert ('write_1073741824_bw' in benchmark.result)
+    assert (len(benchmark.result['read_1073741824_bw']) == 1)
+    assert (len(benchmark.result['write_1073741824_bw']) == 1)
+    assert (isinstance(benchmark.result['read_1073741824_bw'][0], numbers.Number))
+    assert (isinstance(benchmark.result['write_1073741824_bw'][0], numbers.Number))
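
Usage sketch (illustrative, not part of the patch): the newly registered 'directx-gpu-mem-bw' benchmark is launched through BenchmarkRegistry the same way the unit test above does; the parameter values below are examples only, and running it requires the DirectXGPUMemRwBw.exe binary on a DirectX-capable machine.

    from superbench.benchmarks import BenchmarkRegistry, Platform

    # The benchmark name matches the BenchmarkRegistry.register_benchmark call in this patch.
    context = BenchmarkRegistry.create_benchmark_context(
        'directx-gpu-mem-bw',
        platform=Platform.DIRECTX,
        parameters='--num_warm_up 0 --num_loop 100 --size 1073741824 --mode read write'
    )
    benchmark = BenchmarkRegistry.launch_benchmark(context)

    # Metrics are reported per mode and size, e.g. 'read_1073741824_bw' and 'write_1073741824_bw'.
    print(benchmark.result)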