This commit is contained in:
yukirora 2024-04-29 15:06:10 +00:00
Родитель ca5a5b6f94
Коммит 93eaae32a2
5 изменённых файлов: 75 добавлений и 21 удалений

Просмотреть файл

@ -66,6 +66,21 @@ class HipBlasLtBenchmark(BlasLtBaseBenchmark):
required=False,
help='Transpose matrix B.',
)
# Register the `--algo_method` option: an optional string restricted to
# 'heuristic', 'all' or 'index', defaulting to 'heuristic'.
# The command-building code in this file forwards the value to the benchmark
# command line only when it differs from 'heuristic'.
self._parser.add_argument(
'--algo_method',
type=str,
default='heuristic',
choices=['heuristic', 'all', 'index'],
required=False,
help='Use different algorithm search API. Options: heuristic, all, index.',
)
# Register the `--solution_index` option: an optional integer whose default is
# None, i.e. "not provided".
# It is only consulted when `--algo_method index` is selected; in that mode the
# command-building code in this file requires it and appends it to the command.
self._parser.add_argument(
'--solution_index',
type=int,
default=None,
required=False,
help='Used with --algo_method index. Specify solution index to use in benchmark. '
)
def _preprocess(self):
"""Preprocess/preparation operations before the benchmarking.
@ -85,7 +100,14 @@ class HipBlasLtBenchmark(BlasLtBaseBenchmark):
f' -i {self._args.num_steps} {self._in_type_map[_in_type]}' + \
f' --transA {self._args.transA} --transB {self._args.transB}' + \
f' --initialization {self._args.initialization}'
command = command + f' -b {str(_b)}' if _b > 0 else command
command = command + f' --batch_count {str(_b)}' if _b > 0 else command
if self._args.algo_method != 'heuristic':
command += f' --algo_method {self._args.algo_method}'
if self._args.algo_method == 'index':
if not self._args.solution_index:
logger.error('Solution index must be specified when algo_method is "index".')
return False
command += f' --solution_index {self._args.solution_index}'
logger.info(command)
self._commands.append(command)
self._precision_in_commands.append(_in_type)
@ -109,27 +131,24 @@ class HipBlasLtBenchmark(BlasLtBaseBenchmark):
try:
lines = raw_output.splitlines()
index = None
tflops = -1
metric = None
# Find the line containing 'hipblaslt-Gflops'
for i, line in enumerate(lines):
if 'hipblaslt-Gflops' in line:
index = i
break
if index is None:
raise ValueError('Line with "hipblaslt-Gflops" not found in the log.')
# Split the line into fields using a comma as the delimiter
fields = lines[index + 1].strip().split(',')
# Check the number of fields and the format of the first two fields
if len(fields) != 23:
if len(fields) < 23:
raise ValueError('Invalid result')
metric = f'{self._precision_in_commands[cmd_idx]}_{fields[3]}_{"_".join(fields[4:7])}'
tflops = max(tflops, float(fields[21])/1000)
if index is None:
raise ValueError('Line with "hipblaslt-Gflops" not found in the log.')
self._result.add_result(f'{metric}_tflops', tflops)
self._result.add_result(
f'{self._precision_in_commands[cmd_idx]}_{fields[3]}_{"_".join(fields[4:7])}_flops',
float(fields[-2]) / 1000
)
except BaseException as e:
self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
logger.error(

Просмотреть файл

@ -125,7 +125,7 @@ class RocmComposableKernelBenchmark(BlasLtBaseBenchmark):
command = f'{self.__bin_path} gemm {params} {self._args.num_warmup} {self._args.num_steps}'
self._commands.append(command)
logger.info(command)
if self._args.splitk:
if self._args.splitk and _in_type not in ['fp8']:
if not isinstance(self._args.splitk, list):
self._args.splitk = [self._args.splitk]
for splitk in self._args.splitk:
@ -133,7 +133,7 @@ class RocmComposableKernelBenchmark(BlasLtBaseBenchmark):
f' {self._args.num_warmup} {self._args.num_steps}'
self._commands.append(command)
logger.info(command)
if self._args.streamk:
if self._args.streamk and _in_type not in ['fp8']:
if not isinstance(self._args.streamk, list):
self._args.streamk = [self._args.streamk]
for streamk in self._args.streamk:
@ -203,8 +203,8 @@ class RocmComposableKernelBenchmark(BlasLtBaseBenchmark):
return False
finally:
if cmd_idx == len(self._commands) - 1:
for metric in self.results:
self.results[metric] = [max(self.results[metric])]
for metric in self._result.result:
self._result.result[metric] = [max(self._result.result[metric])]
return True

Просмотреть файл

@ -150,6 +150,39 @@ superbench:
<<: *default_pytorch_mode
computation-communication-overlap:
<<: *default_pytorch_mode
# composable-kernel-gemm: benchmark entry that reuses the `default_local_mode`
# anchor (merged via `<<`) and overrides only `parameters`.
# It sweeps four input types over two shapes, with 10 warmup and 100 measured
# steps per run.
# NOTE(review): shapes are presumably "m,n,k" triples — confirm against the
# benchmark's argument parser.
# NOTE(review): the RocmComposableKernelBenchmark change in this commit skips the
# splitk/streamk variants when the input type is fp8; presumably that is the
# benchmark configured here, so `splitk`/`streamk` below would not apply to the
# fp8 rows — confirm.
composable-kernel-gemm:
<<: *default_local_mode
parameters:
in_types:
- fp16
- bf16
- fp32
- fp8
shapes:
- 8192,8192,8192
- 4096,4096,4096
splitk: [2, 4]
streamk: -1
tolerant_fail: yes
num_warmup: 10
num_steps: 100
# hipblaslt-gemm: benchmark entry with its own mode definition instead of a
# shared anchor: a `local` mode with 8 processes run in parallel, each prefixed
# with CUDA_VISIBLE_DEVICES set to its `{proc_rank}`, and HIP_FORCE_DEV_KERNARG
# set to '1' in the environment.
# NOTE(review): presumably this pins one process per GPU on an 8-GPU node —
# confirm against the runner's handling of `prefix` and `proc_num`.
# `algo_method: all` is one of the values accepted by the `--algo_method`
# option added in this commit (heuristic, all, index).
# The sweep covers two shapes and four input types, with 10 warmup and
# 100 measured steps per run.
hipblaslt-gemm:
modes:
- name: local
proc_num: 8
prefix: CUDA_VISIBLE_DEVICES={proc_rank}
parallel: yes
env:
HIP_FORCE_DEV_KERNARG: '1'
parameters:
algo_method: all
shapes:
- 8192,8192,8192
- 4096,4096,4096
in_types: ["fp32", "fp16", "bf16", 'fp8']
tolerant_fail: yes
num_warmup: 10
num_steps: 100
ib-traffic:
enable: false
modes:

Просмотреть файл

@ -114,6 +114,7 @@ class SuperBenchExecutor():
elif isinstance(val, (str, int, float)):
argv.append('--{} {}'.format(name, val))
elif isinstance(val, (list, ListConfig)):
val = [str(v) for v in val]
argv.append('--{} {}'.format(name, ' '.join(val)))
return ' '.join(argv)

5
third_party/Makefile поставляемый
Просмотреть файл

@ -128,8 +128,9 @@ rocm_composable_kernel: sb_micro_path
if [ -d composable_kernel ]; then rm -rf composable_kernel; fi; \
git clone -b ${COMPOSABLEKERNEL_BRANCH} https://github.com/ROCm/composable_kernel; \
cd composable_kernel && mkdir build && cd build; \
cmake -D CMAKE_PREFIX_PATH=/opt/rocm -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -D CMAKE_BUILD_TYPE=Release -D DTYPES="fp64;fp32;fp16;fp8;bf16;int8" ..; \
make -j ckProfiler install; \
cmake -D CMAKE_PREFIX_PATH=/opt/rocm -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -D CMAKE_BUILD_TYPE=Release -D DTYPES="fp64;fp32;fp16;fp8;bf16;int8" -D GPU_TARGETS="gfx941;gfx942;gfx90a;gfx908" ..; \
make -j ckProfiler; \
cp -v ./bin/ckProfiler $(SB_MICRO_PATH)/bin/; \
fi
# Build hipBusBandwidth.