Bug - Fix issues for distributed runs (#258)
Fix issues for distributed runs:
* Fix the config for memory bandwidth benchmarks.
* Add throttling for high-concurrency docker pull.
* Update the rsync path and exclude directories.
* Handle exceptions when creating the summary.
* Tune logging levels.
This commit is contained in:
Родитель
44f0270ec4
Коммит
213ab14bea
|
@ -48,7 +48,12 @@ superbench:
|
|||
ngpus: 8
|
||||
operation: allreduce
|
||||
mem-bw:
|
||||
<<: *default_local_mode
|
||||
enable: true
|
||||
modes:
|
||||
- name: local
|
||||
proc_num: 8
|
||||
prefix: HIP_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
|
||||
parallel: no
|
||||
gemm-flops:
|
||||
<<: *default_local_mode
|
||||
parameters:
|
||||
|
|
|
@ -49,7 +49,12 @@ superbench:
|
|||
ngpus: 8
|
||||
operation: allreduce
|
||||
mem-bw:
|
||||
<<: *default_local_mode
|
||||
enable: true
|
||||
modes:
|
||||
- name: local
|
||||
proc_num: 8
|
||||
prefix: HIP_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
|
||||
parallel: no
|
||||
gemm-flops:
|
||||
<<: *default_local_mode
|
||||
parameters:
|
||||
|
|
|
@ -64,7 +64,7 @@ superbench:
|
|||
- name: local
|
||||
proc_num: 8
|
||||
prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
|
||||
parallel: yes
|
||||
parallel: no
|
||||
disk-benchmark:
|
||||
enable: true
|
||||
modes:
|
||||
|
|
|
@ -60,7 +60,7 @@ superbench:
|
|||
- name: local
|
||||
proc_num: 8
|
||||
prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
|
||||
parallel: yes
|
||||
parallel: no
|
||||
disk-benchmark:
|
||||
enable: false
|
||||
modes:
|
||||
|
|
|
@ -62,7 +62,7 @@ superbench:
|
|||
- name: local
|
||||
proc_num: 8
|
||||
prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
|
||||
parallel: yes
|
||||
parallel: no
|
||||
gpu-copy-bw:
|
||||
enable: true
|
||||
modes:
|
||||
|
|
|
@ -28,13 +28,13 @@ class SuperBenchExecutor():
|
|||
self._output_path = Path(sb_output_dir).expanduser().resolve()
|
||||
|
||||
self.__set_logger('sb-exec.log')
|
||||
logger.info('Executor uses config: %s.', self._sb_config)
|
||||
logger.info('Executor writes to: %s.', str(self._output_path))
|
||||
logger.debug('Executor uses config: %s.', self._sb_config)
|
||||
logger.debug('Executor writes to: %s.', str(self._output_path))
|
||||
|
||||
self.__validate_sb_config()
|
||||
self._sb_benchmarks = self._sb_config.superbench.benchmarks
|
||||
self._sb_enabled = self.__get_enabled_benchmarks()
|
||||
logger.info('Executor will execute: %s', self._sb_enabled)
|
||||
logger.debug('Executor will execute: %s', self._sb_enabled)
|
||||
|
||||
def __set_logger(self, filename):
|
||||
"""Set logger and add file handler.
|
||||
|
|
|
@ -74,7 +74,7 @@ class AnsibleClient():
|
|||
logger.info('Run succeed, return code {}.'.format(r.rc))
|
||||
else:
|
||||
logger.warning('Run failed, return code {}.'.format(r.rc))
|
||||
logger.info(r.stats)
|
||||
logger.debug(r.stats)
|
||||
return r.rc
|
||||
|
||||
def update_mpi_config(self, ansible_config):
|
||||
|
|
|
@ -92,6 +92,7 @@
|
|||
shell: |
|
||||
docker pull {{ docker_image }}
|
||||
become: yes
|
||||
throttle: 32
|
||||
- name: Starting Container
|
||||
shell: |
|
||||
docker rm --force {{ container }} ||: && \
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
- name: Fetch Results
|
||||
hosts: all
|
||||
gather_facts: true
|
||||
vars:
|
||||
workspace: '{{ ansible_user_dir }}/sb-workspace'
|
||||
tasks:
|
||||
- name: Synchronize Output Directory
|
||||
ansible.posix.synchronize:
|
||||
mode: pull
|
||||
src: '{{ workspace }}/{{ sb_output_dir }}/'
|
||||
src: '{{ sb_output_dir }}/'
|
||||
dest: '{{ absolute_output_dir }}/nodes/{{ ansible_hostname }}'
|
||||
rsync_opts:
|
||||
- --exclude=nodes
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
import json
|
||||
import random
|
||||
from pathlib import Path
|
||||
from pprint import pformat
|
||||
from collections import defaultdict
|
||||
|
||||
from natsort import natsorted
|
||||
|
@ -36,7 +37,7 @@ class SuperBenchRunner():
|
|||
self._ansible_client = AnsibleClient(ansible_config)
|
||||
|
||||
self.__set_logger('sb-run.log')
|
||||
logger.info('Runner uses config: %s.', self._sb_config)
|
||||
logger.info('Runner uses config: %s.', pformat(self._sb_config))
|
||||
logger.info('Runner writes to: %s.', str(self._output_path))
|
||||
|
||||
self._sb_benchmarks = self._sb_config.superbench.benchmarks
|
||||
|
@ -214,7 +215,7 @@ class SuperBenchRunner():
|
|||
json.dump(result, f)
|
||||
f.write('\n')
|
||||
|
||||
def __create_single_node_summary(self, node_path): # pragma: no cover
|
||||
def __create_single_node_summary(self, node_path): # pragma: no cover # noqa: C901
|
||||
"""Create the result summary file of single node.
|
||||
|
||||
Args:
|
||||
|
@ -235,7 +236,11 @@ class SuperBenchRunner():
|
|||
continue
|
||||
|
||||
for result in results:
|
||||
benchmark_name = result['name']
|
||||
try:
|
||||
benchmark_name = result['name']
|
||||
except Exception:
|
||||
logger.error('Invalid content in JSON file: {}'.format(results_file))
|
||||
continue
|
||||
if results_file.parts[-3].endswith('_models'):
|
||||
benchmark_name = '{}/{}'.format(results_file.parts[-3], result['name'])
|
||||
if benchmark_name not in results_summary:
|
||||
|
|
Загрузка…
Ссылка в новой задаче