diff --git a/superbench/benchmarks/micro_benchmarks/hipblaslt_function.py b/superbench/benchmarks/micro_benchmarks/hipblaslt_function.py
index 3feb582d..cca70e08 100644
--- a/superbench/benchmarks/micro_benchmarks/hipblaslt_function.py
+++ b/superbench/benchmarks/micro_benchmarks/hipblaslt_function.py
@@ -66,6 +66,21 @@ class HipBlasLtBenchmark(BlasLtBaseBenchmark):
             required=False,
             help='Transpose matrix B.',
         )
+        self._parser.add_argument(
+            '--algo_method',
+            type=str,
+            default='heuristic',
+            choices=['heuristic', 'all', 'index'],
+            required=False,
+            help='Use different algorithm search API. Options: heuristic, all, index.',
+        )
+        self._parser.add_argument(
+            '--solution_index',
+            type=int,
+            default=None,
+            required=False,
+            help='Used with --algo_method index. Specify solution index to use in benchmark. '
+        )
 
     def _preprocess(self):
         """Preprocess/preparation operations before the benchmarking.
@@ -85,7 +100,15 @@ class HipBlasLtBenchmark(BlasLtBaseBenchmark):
                 f' -i {self._args.num_steps} {self._in_type_map[_in_type]}' + \
                 f' --transA {self._args.transA} --transB {self._args.transB}' + \
                 f' --initialization {self._args.initialization}'
-            command = command + f' -b {str(_b)}' if _b > 0 else command
+            command = command + f' --batch_count {str(_b)}' if _b > 0 else command
+            if self._args.algo_method != 'heuristic':
+                command += f' --algo_method {self._args.algo_method}'
+                if self._args.algo_method == 'index':
+                    # Compare against None: an explicit index of 0 is falsy but was still provided.
+                    if self._args.solution_index is None:
+                        logger.error('Solution index must be specified when algo_method is "index".')
+                        return False
+                    command += f' --solution_index {self._args.solution_index}'
             logger.info(command)
             self._commands.append(command)
             self._precision_in_commands.append(_in_type)
@@ -109,27 +132,24 @@ class HipBlasLtBenchmark(BlasLtBaseBenchmark):
         try:
             lines = raw_output.splitlines()
             index = None
+            tflops = -1
+            metric = None
 
             # Find the line containing 'hipblaslt-Gflops'
             for i, line in enumerate(lines):
                 if 'hipblaslt-Gflops' in line:
                     index = i
-                    break
-
+                    # Split the line into fields using a comma as the delimiter
+                    fields = lines[index + 1].strip().split(',')
+                    # Check the number of fields and the format of the first two fields
+                    if len(fields) < 23:
+                        raise ValueError('Invalid result')
+                    metric = f'{self._precision_in_commands[cmd_idx]}_{fields[3]}_{"_".join(fields[4:7])}'
+                    tflops = max(tflops, float(fields[21])/1000)
             if index is None:
                 raise ValueError('Line with "hipblaslt-Gflops" not found in the log.')
+            self._result.add_result(f'{metric}_tflops', tflops)
 
-            # Split the line into fields using a comma as the delimiter
-            fields = lines[index + 1].strip().split(',')
-
-            # Check the number of fields and the format of the first two fields
-            if len(fields) != 23:
-                raise ValueError('Invalid result')
-
-            self._result.add_result(
-                f'{self._precision_in_commands[cmd_idx]}_{fields[3]}_{"_".join(fields[4:7])}_flops',
-                float(fields[-2]) / 1000
-            )
         except BaseException as e:
             self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
             logger.error(
diff --git a/superbench/benchmarks/micro_benchmarks/rocm_composable_kernel_performance.py b/superbench/benchmarks/micro_benchmarks/rocm_composable_kernel_performance.py
index 83080526..9e573c00 100644
--- a/superbench/benchmarks/micro_benchmarks/rocm_composable_kernel_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/rocm_composable_kernel_performance.py
@@ -125,15 +125,15 @@ class RocmComposableKernelBenchmark(BlasLtBaseBenchmark):
             command = f'{self.__bin_path} gemm {params} {self._args.num_warmup} {self._args.num_steps}'
             self._commands.append(command)
             logger.info(command)
-            if self._args.splitk:
+            if self._args.splitk and _in_type not in ['fp8']:
                 if not isinstance(self._args.splitk, list):
                     self._args.splitk = [self._args.splitk]
                 for splitk in self._args.splitk:
                     command = f'{self.__bin_path} gemm_splitk {params} {splitk}' + \
                         f' {self._args.num_warmup} {self._args.num_steps}'
                     self._commands.append(command)
-                logger.info(command)
-            if self._args.streamk:
+                    logger.info(command)
+            if self._args.streamk and _in_type not in ['fp8']:
                 if not isinstance(self._args.streamk, list):
                     self._args.streamk = [self._args.streamk]
                 for streamk in self._args.streamk:
@@ -203,8 +203,8 @@ class RocmComposableKernelBenchmark(BlasLtBaseBenchmark):
             return False
         finally:
             if cmd_idx == len(self._commands) - 1:
-                for metric in self.results:
-                    self.results[metric] = [max(self.results[metric])]
+                for metric in self._result.result:
+                    self._result.result[metric] = [max(self._result.result[metric])]
 
         return True
 
diff --git a/superbench/config/default.yaml b/superbench/config/default.yaml
index 9533806c..275ea935 100644
--- a/superbench/config/default.yaml
+++ b/superbench/config/default.yaml
@@ -150,6 +150,39 @@ superbench:
       <<: *default_pytorch_mode
     computation-communication-overlap:
       <<: *default_pytorch_mode
+    composable-kernel-gemm:
+      <<: *default_local_mode
+      parameters:
+        in_types:
+          - fp16
+          - bf16
+          - fp32
+          - fp8
+        shapes:
+          - 8192,8192,8192
+          - 4096,4096,4096
+        splitk: [2, 4]
+        streamk: -1
+        tolerant_fail: yes
+        num_warmup: 10
+        num_steps: 100
+    hipblaslt-gemm:
+      modes:
+        - name: local
+          proc_num: 8
+          prefix: CUDA_VISIBLE_DEVICES={proc_rank}
+          parallel: yes
+          env:
+            HIP_FORCE_DEV_KERNARG: '1'
+      parameters:
+        algo_method: all
+        shapes:
+          - 8192,8192,8192
+          - 4096,4096,4096
+        in_types: ["fp32", "fp16", "bf16", 'fp8']
+        tolerant_fail: yes
+        num_warmup: 10
+        num_steps: 100
     ib-traffic:
       enable: false
       modes:
diff --git a/superbench/executor/executor.py b/superbench/executor/executor.py
index bfff5cb7..1f6c50e4 100644
--- a/superbench/executor/executor.py
+++ b/superbench/executor/executor.py
@@ -114,6 +114,7 @@ class SuperBenchExecutor():
             elif isinstance(val, (str, int, float)):
                 argv.append('--{} {}'.format(name, val))
             elif isinstance(val, (list, ListConfig)):
+                val = [str(v) for v in val]
                 argv.append('--{} {}'.format(name, ' '.join(val)))
         return ' '.join(argv)
 
diff --git a/third_party/Makefile b/third_party/Makefile
index 847ae450..bd0cec71 100755
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -128,8 +128,9 @@ rocm_composable_kernel: sb_micro_path
 		if [ -d composable_kernel ]; then rm -rf composable_kernel; fi; \
 		git clone -b ${COMPOSABLEKERNEL_BRANCH} https://github.com/ROCm/composable_kernel; \
 		cd composable_kernel && mkdir build && cd build; \
-		cmake -D CMAKE_PREFIX_PATH=/opt/rocm -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -D CMAKE_BUILD_TYPE=Release -D DTYPES="fp64;fp32;fp16;fp8;bf16;int8" ..; \
-		make -j ckProfiler install; \
+		cmake -D CMAKE_PREFIX_PATH=/opt/rocm -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -D CMAKE_BUILD_TYPE=Release -D DTYPES="fp64;fp32;fp16;fp8;bf16;int8" -D GPU_TARGETS="gfx941;gfx942;gfx90a;gfx908" ..; \
+		make -j ckProfiler; \
+		cp -v ./bin/ckProfiler $(SB_MICRO_PATH)/bin/; \
 	fi
 
 # Build hipBusBandwidth.