Bug - Fix issues for distributed runs (#258)
Fix issues for distributed runs:
* fix config for memory bandwidth benchmarks
* add throttling for high-concurrency docker pull
* update rsync path and exclude directories
* handle exceptions when creating summary
* tune logging
This commit is contained in:
Родитель
44f0270ec4
Коммит
213ab14bea
|
@ -48,7 +48,12 @@ superbench:
|
||||||
ngpus: 8
|
ngpus: 8
|
||||||
operation: allreduce
|
operation: allreduce
|
||||||
mem-bw:
|
mem-bw:
|
||||||
<<: *default_local_mode
|
enable: true
|
||||||
|
modes:
|
||||||
|
- name: local
|
||||||
|
proc_num: 8
|
||||||
|
prefix: HIP_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
|
||||||
|
parallel: no
|
||||||
gemm-flops:
|
gemm-flops:
|
||||||
<<: *default_local_mode
|
<<: *default_local_mode
|
||||||
parameters:
|
parameters:
|
||||||
|
|
|
@ -49,7 +49,12 @@ superbench:
|
||||||
ngpus: 8
|
ngpus: 8
|
||||||
operation: allreduce
|
operation: allreduce
|
||||||
mem-bw:
|
mem-bw:
|
||||||
<<: *default_local_mode
|
enable: true
|
||||||
|
modes:
|
||||||
|
- name: local
|
||||||
|
proc_num: 8
|
||||||
|
prefix: HIP_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
|
||||||
|
parallel: no
|
||||||
gemm-flops:
|
gemm-flops:
|
||||||
<<: *default_local_mode
|
<<: *default_local_mode
|
||||||
parameters:
|
parameters:
|
||||||
|
|
|
@ -64,7 +64,7 @@ superbench:
|
||||||
- name: local
|
- name: local
|
||||||
proc_num: 8
|
proc_num: 8
|
||||||
prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
|
prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
|
||||||
parallel: yes
|
parallel: no
|
||||||
disk-benchmark:
|
disk-benchmark:
|
||||||
enable: true
|
enable: true
|
||||||
modes:
|
modes:
|
||||||
|
|
|
@ -60,7 +60,7 @@ superbench:
|
||||||
- name: local
|
- name: local
|
||||||
proc_num: 8
|
proc_num: 8
|
||||||
prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
|
prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
|
||||||
parallel: yes
|
parallel: no
|
||||||
disk-benchmark:
|
disk-benchmark:
|
||||||
enable: false
|
enable: false
|
||||||
modes:
|
modes:
|
||||||
|
|
|
@ -62,7 +62,7 @@ superbench:
|
||||||
- name: local
|
- name: local
|
||||||
proc_num: 8
|
proc_num: 8
|
||||||
prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
|
prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
|
||||||
parallel: yes
|
parallel: no
|
||||||
gpu-copy-bw:
|
gpu-copy-bw:
|
||||||
enable: true
|
enable: true
|
||||||
modes:
|
modes:
|
||||||
|
|
|
@ -28,13 +28,13 @@ class SuperBenchExecutor():
|
||||||
self._output_path = Path(sb_output_dir).expanduser().resolve()
|
self._output_path = Path(sb_output_dir).expanduser().resolve()
|
||||||
|
|
||||||
self.__set_logger('sb-exec.log')
|
self.__set_logger('sb-exec.log')
|
||||||
logger.info('Executor uses config: %s.', self._sb_config)
|
logger.debug('Executor uses config: %s.', self._sb_config)
|
||||||
logger.info('Executor writes to: %s.', str(self._output_path))
|
logger.debug('Executor writes to: %s.', str(self._output_path))
|
||||||
|
|
||||||
self.__validate_sb_config()
|
self.__validate_sb_config()
|
||||||
self._sb_benchmarks = self._sb_config.superbench.benchmarks
|
self._sb_benchmarks = self._sb_config.superbench.benchmarks
|
||||||
self._sb_enabled = self.__get_enabled_benchmarks()
|
self._sb_enabled = self.__get_enabled_benchmarks()
|
||||||
logger.info('Executor will execute: %s', self._sb_enabled)
|
logger.debug('Executor will execute: %s', self._sb_enabled)
|
||||||
|
|
||||||
def __set_logger(self, filename):
|
def __set_logger(self, filename):
|
||||||
"""Set logger and add file handler.
|
"""Set logger and add file handler.
|
||||||
|
|
|
@ -74,7 +74,7 @@ class AnsibleClient():
|
||||||
logger.info('Run succeed, return code {}.'.format(r.rc))
|
logger.info('Run succeed, return code {}.'.format(r.rc))
|
||||||
else:
|
else:
|
||||||
logger.warning('Run failed, return code {}.'.format(r.rc))
|
logger.warning('Run failed, return code {}.'.format(r.rc))
|
||||||
logger.info(r.stats)
|
logger.debug(r.stats)
|
||||||
return r.rc
|
return r.rc
|
||||||
|
|
||||||
def update_mpi_config(self, ansible_config):
|
def update_mpi_config(self, ansible_config):
|
||||||
|
|
|
@ -92,6 +92,7 @@
|
||||||
shell: |
|
shell: |
|
||||||
docker pull {{ docker_image }}
|
docker pull {{ docker_image }}
|
||||||
become: yes
|
become: yes
|
||||||
|
throttle: 32
|
||||||
- name: Starting Container
|
- name: Starting Container
|
||||||
shell: |
|
shell: |
|
||||||
docker rm --force {{ container }} ||: && \
|
docker rm --force {{ container }} ||: && \
|
||||||
|
|
|
@ -1,11 +1,11 @@
|
||||||
- name: Fetch Results
|
- name: Fetch Results
|
||||||
hosts: all
|
hosts: all
|
||||||
gather_facts: true
|
gather_facts: true
|
||||||
vars:
|
|
||||||
workspace: '{{ ansible_user_dir }}/sb-workspace'
|
|
||||||
tasks:
|
tasks:
|
||||||
- name: Synchronize Output Directory
|
- name: Synchronize Output Directory
|
||||||
ansible.posix.synchronize:
|
ansible.posix.synchronize:
|
||||||
mode: pull
|
mode: pull
|
||||||
src: '{{ workspace }}/{{ sb_output_dir }}/'
|
src: '{{ sb_output_dir }}/'
|
||||||
dest: '{{ absolute_output_dir }}/nodes/{{ ansible_hostname }}'
|
dest: '{{ absolute_output_dir }}/nodes/{{ ansible_hostname }}'
|
||||||
|
rsync_opts:
|
||||||
|
- --exclude=nodes
|
||||||
|
|
|
@ -6,6 +6,7 @@
|
||||||
import json
|
import json
|
||||||
import random
|
import random
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from pprint import pformat
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
|
||||||
from natsort import natsorted
|
from natsort import natsorted
|
||||||
|
@ -36,7 +37,7 @@ class SuperBenchRunner():
|
||||||
self._ansible_client = AnsibleClient(ansible_config)
|
self._ansible_client = AnsibleClient(ansible_config)
|
||||||
|
|
||||||
self.__set_logger('sb-run.log')
|
self.__set_logger('sb-run.log')
|
||||||
logger.info('Runner uses config: %s.', self._sb_config)
|
logger.info('Runner uses config: %s.', pformat(self._sb_config))
|
||||||
logger.info('Runner writes to: %s.', str(self._output_path))
|
logger.info('Runner writes to: %s.', str(self._output_path))
|
||||||
|
|
||||||
self._sb_benchmarks = self._sb_config.superbench.benchmarks
|
self._sb_benchmarks = self._sb_config.superbench.benchmarks
|
||||||
|
@ -214,7 +215,7 @@ class SuperBenchRunner():
|
||||||
json.dump(result, f)
|
json.dump(result, f)
|
||||||
f.write('\n')
|
f.write('\n')
|
||||||
|
|
||||||
def __create_single_node_summary(self, node_path): # pragma: no cover
|
def __create_single_node_summary(self, node_path): # pragma: no cover # noqa: C901
|
||||||
"""Create the result summary file of single node.
|
"""Create the result summary file of single node.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
@ -235,7 +236,11 @@ class SuperBenchRunner():
|
||||||
continue
|
continue
|
||||||
|
|
||||||
for result in results:
|
for result in results:
|
||||||
benchmark_name = result['name']
|
try:
|
||||||
|
benchmark_name = result['name']
|
||||||
|
except Exception:
|
||||||
|
logger.error('Invalid content in JSON file: {}'.format(results_file))
|
||||||
|
continue
|
||||||
if results_file.parts[-3].endswith('_models'):
|
if results_file.parts[-3].endswith('_models'):
|
||||||
benchmark_name = '{}/{}'.format(results_file.parts[-3], result['name'])
|
benchmark_name = '{}/{}'.format(results_file.parts[-3], result['name'])
|
||||||
if benchmark_name not in results_summary:
|
if benchmark_name not in results_summary:
|
||||||
|
|
Загрузка…
Ссылка в новой задаче