From 6c6f5269371d83d3d5d519b18c660e9be10947ef Mon Sep 17 00:00:00 2001
From: guoshzhao
Date: Wed, 2 Jun 2021 09:15:58 +0800
Subject: [PATCH] Benchmarks: Add Benchmark - Add FLOPs performance benchmark for cuda. (#87)

* add cuda flops performance benchmark.
---
 .../benchmarks/gemm_flops_cuda_performance.py |  23 +++
 .../benchmarks/micro_benchmarks/__init__.py   |   3 +-
 .../gemm_flops_performance.py                 | 171 ++++++++++++++++++
 superbench/benchmarks/return_code.py          |   1 +
 .../test_gemm_flops_performance.py            | 110 +++++++++++
 5 files changed, 307 insertions(+), 1 deletion(-)
 create mode 100644 examples/benchmarks/gemm_flops_cuda_performance.py
 create mode 100644 superbench/benchmarks/micro_benchmarks/gemm_flops_performance.py
 create mode 100644 tests/benchmarks/micro_benchmarks/test_gemm_flops_performance.py

diff --git a/examples/benchmarks/gemm_flops_cuda_performance.py b/examples/benchmarks/gemm_flops_cuda_performance.py
new file mode 100644
index 00000000..057dfa06
--- /dev/null
+++ b/examples/benchmarks/gemm_flops_cuda_performance.py
@@ -0,0 +1,23 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Micro benchmark example for Cutlass GEMM FLOPs performance.
+
+Commands to run:
+  python3 examples/benchmarks/gemm_flops_cuda_performance.py
+"""
+
+from superbench.benchmarks import BenchmarkRegistry, Platform
+from superbench.common.utils import logger
+
+if __name__ == '__main__':
+    parameters = '--n 16384 --k 16384 --m 16384'
+    context = BenchmarkRegistry.create_benchmark_context('gemm-flops', platform=Platform.CUDA, parameters=parameters)
+
+    benchmark = BenchmarkRegistry.launch_benchmark(context)
+    if benchmark:
+        logger.info(
+            'benchmark: {}, return code: {}, result: {}'.format(
+                benchmark.name, benchmark.return_code, benchmark.result
+            )
+        )
diff --git a/superbench/benchmarks/micro_benchmarks/__init__.py b/superbench/benchmarks/micro_benchmarks/__init__.py
index 50edaf14..3b1b820f 100644
--- a/superbench/benchmarks/micro_benchmarks/__init__.py
+++ b/superbench/benchmarks/micro_benchmarks/__init__.py
@@ -9,8 +9,9 @@ from superbench.benchmarks.micro_benchmarks.computation_communication_overlap im
 from superbench.benchmarks.micro_benchmarks.kernel_launch_overhead import KernelLaunch
 from superbench.benchmarks.micro_benchmarks.cublas_function import CublasBenchmark
 from superbench.benchmarks.micro_benchmarks.cudnn_function import CudnnBenchmark
+from superbench.benchmarks.micro_benchmarks.gemm_flops_performance import GemmFlopsCuda
 
 __all__ = [
     'MicroBenchmark', 'MicroBenchmarkWithInvoke', 'ShardingMatmul', 'ComputationCommunicationOverlap', 'KernelLaunch',
-    'CublasBenchmark', 'CudnnBenchmark'
+    'CublasBenchmark', 'CudnnBenchmark', 'GemmFlopsCuda'
 ]
diff --git a/superbench/benchmarks/micro_benchmarks/gemm_flops_performance.py b/superbench/benchmarks/micro_benchmarks/gemm_flops_performance.py
new file mode 100644
index 00000000..a53a15c4
--- /dev/null
+++ b/superbench/benchmarks/micro_benchmarks/gemm_flops_performance.py
@@ -0,0 +1,171 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""Module of the FLOPs performance benchmarks."""
+
+import os
+
+from superbench.common.utils import logger
+from superbench.common.utils import nv_helper
+from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode
+from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
+
+
+class GemmFlopsCuda(MicroBenchmarkWithInvoke):
+    """The GEMM FLOPs performance benchmark class."""
+    def __init__(self, name, parameters=''):
+        """Constructor.
+
+        Args:
+            name (str): benchmark name.
+            parameters (str): benchmark parameters.
+        """
+        super().__init__(name, parameters)
+
+        self._bin_name = 'cutlass_profiler'
+
+        self.__kernel_map = {
+            'FP64': 'cutlass_simt_dgemm_128x128_8x2_*',
+            'FP32': 'cutlass_simt_sgemm_128x128_8x2_*',
+            'FP16': 'cutlass_simt_hgemm_256x128_8x2_*',
+            'FP64_TC': 'cutlass_tensorop_d884gemm_128x128_16x3_*',
+            'TF32_TC': 'cutlass_tensorop_tf32_s1688gemm_tf32_256x128_16x3_*',
+            'BF16_TC': 'cutlass_tensorop_bf16_s16816gemm_bf16_256x128_32x3_*',
+            'FP16_TC': 'cutlass_tensorop_h16816gemm_256x128_32x3_*',
+            'INT8_TC': 'cutlass_tensorop_s8_i16832gemm_s8_256x128_64x3_*',
+            'INT4_TC': 'cutlass_tensorop_s4_i16864gemm_s4_256x128_128x3_*',
+        }
+
+    def add_parser_arguments(self):
+        """Add the specified arguments."""
+        super().add_parser_arguments()
+
+        self._parser.add_argument(
+            '--num_warmup',
+            type=int,
+            default=5,
+            required=False,
+            help='The number of warmup steps.',
+        )
+        self._parser.add_argument(
+            '--n',
+            type=int,
+            default=16384,
+            required=False,
+            help='The N dim of matmul (N, K) * (K, M).',
+        )
+        self._parser.add_argument(
+            '--k',
+            type=int,
+            default=16384,
+            required=False,
+            help='The K dim of matmul (N, K) * (K, M).',
+        )
+        self._parser.add_argument(
+            '--m',
+            type=int,
+            default=16384,
+            required=False,
+            help='The M dim of matmul (N, K) * (K, M).',
+        )
+        self._parser.add_argument(
+            '--precision',
+            type=str,
+            nargs='+',
+            default=list(self.__kernel_map.keys()),
+            help='Precision for benchmarking. E.g. {}.'.format(' '.join(list(self.__kernel_map.keys()))),
+        )
+
+    def _preprocess(self):
+        """Preprocess/preparation operations before the benchmarking.
+
+        Return:
+            True if _preprocess() succeeds.
+        """
+        if not super()._preprocess():
+            return False
+
+        self._args.precision = [p.upper() for p in self._args.precision]
+        for p in self._args.precision:
+            if p not in list(self.__kernel_map.keys()):
+                self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
+                logger.error(
+                    'Unsupported precision - benchmark: {}, precision: {}, expected: {}.'.format(
+                        self._name, p, list(self.__kernel_map.keys())
+                    )
+                )
+                return False
+            else:
+                command = os.path.join(self._args.bin_dir, self._bin_name)
+                command += (' --warmup-iterations=' + str(self._args.num_warmup))
+                command += (' --operation=gemm')
+                command += (' --n=' + str(self._args.n))
+                command += (' --k=' + str(self._args.k))
+                command += (' --m=' + str(self._args.m))
+                command += (' --kernels=' + self.__kernel_map[p])
+                self._commands.append(command)
+
+        # TODO - Support more architectures; currently only compute capability 7.0 or 8.0 is supported.
+        capability = nv_helper.get_device_compute_capability()
+        if capability == 7.0:
+            self.__kernel_map['FP16_TC'] = 'cutlass_tensorop_h884gemm_256x128_32x2_*'
+
+        if capability not in [7.0, 8.0]:
+            self._result.set_return_code(ReturnCode.MICROBENCHMARK_UNSUPPORTED_ARCHITECTURE)
+            logger.error(
+                'Unsupported architecture - benchmark: {}, compute capability: {}, expected: 7.0 or 8.0'.format(
+                    self._name, capability
+                )
+            )
+            return False
+
+        return True
+
+    def _process_raw_result(self, cmd_idx, raw_output):
+        """Function to parse raw results and save the summarized results.
+
+        self._result.add_raw_data() and self._result.add_result() need to be called to save the results.
+
+        Args:
+            cmd_idx (int): the index of the command corresponding to the raw_output.
+            raw_output (str): raw output string of the micro-benchmark.
+
+        Return:
+            True if the raw output string is valid and the result can be extracted.
+        """
+        precision = self._args.precision[cmd_idx]
+        self._result.add_raw_data('raw_output_' + precision, raw_output)
+
+        valid = True
+        flops = list()
+        content = raw_output.splitlines()
+        try:
+            for line in content:
+                if 'gemm,cutlass_simt_dgemm_128x128_8x2' in line or \
+                   'gemm,cutlass_simt_sgemm_128x128_8x2' in line or \
+                   'gemm,cutlass_simt_hgemm_256x128_8x2' in line or \
+                   'gemm,cutlass_tensorop_d884gemm_128x128_16x3' in line or \
+                   'gemm,cutlass_tensorop_tf32_s1688gemm_tf32_256x128_16x3' in line or \
+                   'gemm,cutlass_tensorop_bf16_s16816gemm_bf16_256x128_32x3' in line or \
+                   'gemm,cutlass_tensorop_h16816gemm_256x128_32x3' in line or \
+                   'gemm,cutlass_tensorop_h884gemm_256x128_32x2' in line or \
+                   'gemm,cutlass_tensorop_s8_i16832gemm_s8_256x128_64x3' in line or \
+                   'gemm,cutlass_tensorop_s4_i16864gemm_s4_256x128_128x3' in line:
+                    flops.append(float(line.split(',')[-1]))
+        except BaseException:
+            valid = False
+        finally:
+            if valid is False or len(flops) == 0:
+                logger.error(
+                    'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'.format(
+                        self._curr_run_index, self._name, raw_output
+                    )
+                )
+                return False
+
+        self._result.add_result(precision, max(flops))
+
+        return True
+
+
+BenchmarkRegistry.register_benchmark('gemm-flops', GemmFlopsCuda, platform=Platform.CUDA)
diff --git a/superbench/benchmarks/return_code.py b/superbench/benchmarks/return_code.py
index 3c688f41..b200c086 100644
--- a/superbench/benchmarks/return_code.py
+++ b/superbench/benchmarks/return_code.py
@@ -28,3 +28,4 @@ class ReturnCode(Enum):
     MICROBENCHMARK_BINARY_NOT_EXIST = 31
     MICROBENCHMARK_EXECUTION_FAILURE = 32
     MICROBENCHMARK_RESULT_PARSING_FAILURE = 33
+    MICROBENCHMARK_UNSUPPORTED_ARCHITECTURE = 34
diff --git a/tests/benchmarks/micro_benchmarks/test_gemm_flops_performance.py b/tests/benchmarks/micro_benchmarks/test_gemm_flops_performance.py
new file mode 100644
index 00000000..70c28917
--- /dev/null
+++ b/tests/benchmarks/micro_benchmarks/test_gemm_flops_performance.py
@@ -0,0 +1,110 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Tests for gemm-flops benchmark."""
+
+import os
+import unittest
+from pathlib import Path
+
+from tests.helper import decorator
+from superbench.common.utils import nv_helper
+from superbench.benchmarks import BenchmarkRegistry, ReturnCode, Platform, BenchmarkType
+
+
+class GemmFlopsCudaTest(unittest.TestCase):
+    """Tests for GemmFlopsCuda benchmark."""
+    def setUp(self):
+        """Method called to prepare the test fixture."""
+        # Create fake binary file just for testing.
+        os.environ['SB_MICRO_PATH'] = '/tmp/superbench/'
+        binary_path = os.path.join(os.getenv('SB_MICRO_PATH'), 'bin')
+        Path(binary_path).mkdir(parents=True, exist_ok=True)
+        self.__binary_file = Path(os.path.join(binary_path, 'cutlass_profiler'))
+        self.__binary_file.touch(mode=0o755, exist_ok=True)
+
+    def tearDown(self):
+        """Method called after the test method has been called and the result recorded."""
+        self.__binary_file.unlink()
+
+    @decorator.cuda_test
+    def test_flops_performance_cuda(self):
+        """Test gemm-flops benchmark."""
+        benchmark_name = 'gemm-flops'
+        (benchmark_class,
+         predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
+        assert (benchmark_class)
+
+        # Negative case - MICROBENCHMARK_UNSUPPORTED_ARCHITECTURE.
+        benchmark = benchmark_class(
+            benchmark_name,
+            parameters='--num_warmup 200 --n 1024 --k 512 --m 2048 --precision FP32 TF32_TC FP16_TC INT8_TC'
+        )
+
+        ret = benchmark._preprocess()
+        if nv_helper.get_device_compute_capability() not in [7.0, 8.0]:
+            assert (ret is False)
+            assert (benchmark.return_code == ReturnCode.MICROBENCHMARK_UNSUPPORTED_ARCHITECTURE)
+        else:
+            assert (ret is True)
+            assert (benchmark.return_code == ReturnCode.SUCCESS)
+
+        # Check basic information.
+        assert (benchmark.name == 'gemm-flops')
+        assert (benchmark.type == BenchmarkType.MICRO)
+        assert (benchmark._bin_name == 'cutlass_profiler')
+
+        # Check parameters specified in BenchmarkContext.
+        assert (benchmark._args.num_warmup == 200)
+        assert (benchmark._args.n == 1024)
+        assert (benchmark._args.k == 512)
+        assert (benchmark._args.m == 2048)
+        assert (benchmark._args.precision == ['FP32', 'TF32_TC', 'FP16_TC', 'INT8_TC'])
+
+        # Check the command list.
+        for i in range(len(benchmark._args.precision)):
+            command = '{} --warmup-iterations={} --operation=gemm --n={} --k={} --m={} --kernels={}'.format(
+                benchmark._bin_name, benchmark._args.num_warmup, benchmark._args.n, benchmark._args.k,
+                benchmark._args.m, benchmark._GemmFlopsCuda__kernel_map[benchmark._args.precision[i]]
+            )
+            expected_cmd = benchmark._bin_name + benchmark._commands[i].split(benchmark._bin_name)[1]
+            assert (command == expected_cmd)
+
+        # Check results and metrics.
+        raw_output_FP32 = """
+CSV Results:
+
+Problem,Provider,OperationKind,Operation,Disposition,Status,gemm_kind,m,n,k,A,B,C,alpha,beta,split_k_slices,batch_count,op_class,accum,cta_m,cta_n,cta_k,stages,warps_m,warps_n,warps_k,inst_m,inst_n,inst_k,min_cc,max_cc,Bytes,Flops,Runtime,GB/s,GFLOPs
+1,CUTLASS,gemm,cutlass_simt_sgemm_128x128_8x2_nn_align1,passed,success,universal,16384,16384,16384,f32:column,f32:column,f32:column,1,0,1,1,simt,f32,128,128,8,2,4,2,1,1,1,1,50,1024,3221225472,8796629893120,481.022,6.23672,18287.4
+1,CUTLASS,gemm,cutlass_simt_sgemm_128x128_8x2_nt_align1,passed,success,universal,16384,16384,16384,f32:column,f32:row,f32:column,1,0,1,1,simt,f32,128,128,8,2,4,2,1,1,1,1,50,1024,3221225472,8796629893120,478.866,6.2648,18369.7
+1,CUTLASS,gemm,cutlass_simt_sgemm_128x128_8x2_tn_align1,passed,success,universal,16384,16384,16384,f32:row,f32:column,f32:column,1,0,1,1,simt,f32,128,128,8,2,4,2,1,1,1,1,50,1024,3221225472,8796629893120,482.034,6.22363,18249
+1,CUTLASS,gemm,cutlass_simt_sgemm_128x128_8x2_tt_align1,passed,success,universal,16384,16384,16384,f32:row,f32:row,f32:column,1,0,1,1,simt,f32,128,128,8,2,4,2,1,1,1,1,50,1024,3221225472,8796629893120,481.838,6.22616,18256.4
+"""
+        raw_output_TF32_TC = """
+CSV Results:
+
+Problem,Provider,OperationKind,Operation,Disposition,Status,gemm_kind,m,n,k,A,B,C,alpha,beta,split_k_slices,batch_count,op_class,accum,cta_m,cta_n,cta_k,stages,warps_m,warps_n,warps_k,inst_m,inst_n,inst_k,min_cc,max_cc,Bytes,Flops,Runtime,GB/s,GFLOPs
+1,CUTLASS,gemm,cutlass_tensorop_tf32_s1688gemm_tf32_256x128_16x3_nn_align4,passed,success,universal,16384,16384,16384,tf32:column,tf32:column,tf32:column,1,0,1,1,tensorop,f32,256,128,16,3,4,2,1,16,8,8,80,1024,3221225472,8796629893120,88.5764,33.8691,99311.2
+1,CUTLASS,gemm,cutlass_tensorop_tf32_s1688gemm_tf32_256x128_16x3_nt_align4,passed,success,universal,16384,16384,16384,tf32:column,tf32:row,tf32:column,1,0,1,1,tensorop,f32,256,128,16,3,4,2,1,16,8,8,80,1024,3221225472,8796629893120,70.3503,42.6438,125040
+1,CUTLASS,gemm,cutlass_tensorop_tf32_s1688gemm_tf32_256x128_16x3_tn_align4,passed,success,universal,16384,16384,16384,tf32:row,tf32:column,tf32:column,1,0,1,1,tensorop,f32,256,128,16,3,4,2,1,16,8,8,80,1024,3221225472,8796629893120,86.5167,34.6754,101676
+1,CUTLASS,gemm,cutlass_tensorop_tf32_s1688gemm_tf32_256x128_16x3_tt_align4,passed,success,universal,16384,16384,16384,tf32:row,tf32:row,tf32:column,1,0,1,1,tensorop,f32,256,128,16,3,4,2,1,16,8,8,80,1024,3221225472,8796629893120,68.3621,43.884,128677
+"""
+        raw_output_FP16_TC = """
+CSV Results:
+
+Problem,Provider,OperationKind,Operation,Disposition,Status,gemm_kind,m,n,k,A,B,C,alpha,beta,split_k_slices,batch_count,op_class,accum,cta_m,cta_n,cta_k,stages,warps_m,warps_n,warps_k,inst_m,inst_n,inst_k,min_cc,max_cc,Bytes,Flops,Runtime,GB/s,GFLOPs
+1,CUTLASS,gemm,cutlass_tensorop_h16816gemm_256x128_32x3_nn_align8,incorrect,success,universal,16384,16384,16384,f16:column,f16:column,f16:column,1,0,1,1,tensorop,f16,256,128,32,3,4,2,1,16,8,16,80,1024,1610612736,8796629893120,34.1575,43.9142,257531
+1,CUTLASS,gemm,cutlass_tensorop_h16816gemm_256x128_32x3_nt_align8,incorrect,success,universal,16384,16384,16384,f16:column,f16:row,f16:column,1,0,1,1,tensorop,f16,256,128,32,3,4,2,1,16,8,16,80,1024,1610612736,8796629893120,34.6153,43.3334,254126
+1,CUTLASS,gemm,cutlass_tensorop_h16816gemm_256x128_32x3_tn_align8,incorrect,success,universal,16384,16384,16384,f16:row,f16:column,f16:column,1,0,1,1,tensorop,f16,256,128,32,3,4,2,1,16,8,16,80,1024,1610612736,8796629893120,39.0413,38.4209,225316
+1,CUTLASS,gemm,cutlass_tensorop_h16816gemm_256x128_32x3_tt_align8,incorrect,success,universal,16384,16384,16384,f16:row,f16:row,f16:column,1,0,1,1,tensorop,f16,256,128,32,3,4,2,1,16,8,16,80,1024,1610612736,8796629893120,31.2994,47.9243,281048
+"""
+        assert (benchmark._process_raw_result(0, raw_output_FP32))
+        assert (benchmark._process_raw_result(1, raw_output_TF32_TC))
+        assert (benchmark._process_raw_result(2, raw_output_FP16_TC))
+
+        assert (benchmark.result['FP32'][0] == 18369.7)
+        assert (benchmark.result['TF32_TC'][0] == 128677)
+        assert (benchmark.result['FP16_TC'][0] == 281048)
+
+        # Negative case - Add invalid raw output.
+        assert (benchmark._process_raw_result(3, 'Invalid raw output') is False)