Bug - Fix issues for distributed runs (#258)

Fix issues for distributed runs:
* fix config for memory bandwidth benchmarks
* add throttling for high-concurrency docker pull
* update rsync path and exclude directories
* handle exceptions when creating summary
* tune logging verbosity
This commit is contained in:
Yifan Xiong 2021-12-08 14:55:13 +08:00 коммит произвёл GitHub
Родитель 44f0270ec4
Коммит 213ab14bea
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
10 изменённых файлов: 31 добавлений и 15 удалений

Просмотреть файл

@ -48,7 +48,12 @@ superbench:
ngpus: 8 ngpus: 8
operation: allreduce operation: allreduce
mem-bw: mem-bw:
<<: *default_local_mode enable: true
modes:
- name: local
proc_num: 8
prefix: HIP_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
parallel: no
gemm-flops: gemm-flops:
<<: *default_local_mode <<: *default_local_mode
parameters: parameters:

Просмотреть файл

@ -49,7 +49,12 @@ superbench:
ngpus: 8 ngpus: 8
operation: allreduce operation: allreduce
mem-bw: mem-bw:
<<: *default_local_mode enable: true
modes:
- name: local
proc_num: 8
prefix: HIP_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
parallel: no
gemm-flops: gemm-flops:
<<: *default_local_mode <<: *default_local_mode
parameters: parameters:

Просмотреть файл

@ -64,7 +64,7 @@ superbench:
- name: local - name: local
proc_num: 8 proc_num: 8
prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2)) prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
parallel: yes parallel: no
disk-benchmark: disk-benchmark:
enable: true enable: true
modes: modes:

Просмотреть файл

@ -60,7 +60,7 @@ superbench:
- name: local - name: local
proc_num: 8 proc_num: 8
prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2)) prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
parallel: yes parallel: no
disk-benchmark: disk-benchmark:
enable: false enable: false
modes: modes:

Просмотреть файл

@ -62,7 +62,7 @@ superbench:
- name: local - name: local
proc_num: 8 proc_num: 8
prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2)) prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
parallel: yes parallel: no
gpu-copy-bw: gpu-copy-bw:
enable: true enable: true
modes: modes:

Просмотреть файл

@ -28,13 +28,13 @@ class SuperBenchExecutor():
self._output_path = Path(sb_output_dir).expanduser().resolve() self._output_path = Path(sb_output_dir).expanduser().resolve()
self.__set_logger('sb-exec.log') self.__set_logger('sb-exec.log')
logger.info('Executor uses config: %s.', self._sb_config) logger.debug('Executor uses config: %s.', self._sb_config)
logger.info('Executor writes to: %s.', str(self._output_path)) logger.debug('Executor writes to: %s.', str(self._output_path))
self.__validate_sb_config() self.__validate_sb_config()
self._sb_benchmarks = self._sb_config.superbench.benchmarks self._sb_benchmarks = self._sb_config.superbench.benchmarks
self._sb_enabled = self.__get_enabled_benchmarks() self._sb_enabled = self.__get_enabled_benchmarks()
logger.info('Executor will execute: %s', self._sb_enabled) logger.debug('Executor will execute: %s', self._sb_enabled)
def __set_logger(self, filename): def __set_logger(self, filename):
"""Set logger and add file handler. """Set logger and add file handler.

Просмотреть файл

@ -74,7 +74,7 @@ class AnsibleClient():
logger.info('Run succeed, return code {}.'.format(r.rc)) logger.info('Run succeed, return code {}.'.format(r.rc))
else: else:
logger.warning('Run failed, return code {}.'.format(r.rc)) logger.warning('Run failed, return code {}.'.format(r.rc))
logger.info(r.stats) logger.debug(r.stats)
return r.rc return r.rc
def update_mpi_config(self, ansible_config): def update_mpi_config(self, ansible_config):

Просмотреть файл

@ -92,6 +92,7 @@
shell: | shell: |
docker pull {{ docker_image }} docker pull {{ docker_image }}
become: yes become: yes
throttle: 32
- name: Starting Container - name: Starting Container
shell: | shell: |
docker rm --force {{ container }} ||: && \ docker rm --force {{ container }} ||: && \

Просмотреть файл

@ -1,11 +1,11 @@
- name: Fetch Results - name: Fetch Results
hosts: all hosts: all
gather_facts: true gather_facts: true
vars:
workspace: '{{ ansible_user_dir }}/sb-workspace'
tasks: tasks:
- name: Synchronize Output Directory - name: Synchronize Output Directory
ansible.posix.synchronize: ansible.posix.synchronize:
mode: pull mode: pull
src: '{{ workspace }}/{{ sb_output_dir }}/' src: '{{ sb_output_dir }}/'
dest: '{{ absolute_output_dir }}/nodes/{{ ansible_hostname }}' dest: '{{ absolute_output_dir }}/nodes/{{ ansible_hostname }}'
rsync_opts:
- --exclude=nodes

Просмотреть файл

@ -6,6 +6,7 @@
import json import json
import random import random
from pathlib import Path from pathlib import Path
from pprint import pformat
from collections import defaultdict from collections import defaultdict
from natsort import natsorted from natsort import natsorted
@ -36,7 +37,7 @@ class SuperBenchRunner():
self._ansible_client = AnsibleClient(ansible_config) self._ansible_client = AnsibleClient(ansible_config)
self.__set_logger('sb-run.log') self.__set_logger('sb-run.log')
logger.info('Runner uses config: %s.', self._sb_config) logger.info('Runner uses config: %s.', pformat(self._sb_config))
logger.info('Runner writes to: %s.', str(self._output_path)) logger.info('Runner writes to: %s.', str(self._output_path))
self._sb_benchmarks = self._sb_config.superbench.benchmarks self._sb_benchmarks = self._sb_config.superbench.benchmarks
@ -214,7 +215,7 @@ class SuperBenchRunner():
json.dump(result, f) json.dump(result, f)
f.write('\n') f.write('\n')
def __create_single_node_summary(self, node_path): # pragma: no cover def __create_single_node_summary(self, node_path): # pragma: no cover # noqa: C901
"""Create the result summary file of single node. """Create the result summary file of single node.
Args: Args:
@ -235,7 +236,11 @@ class SuperBenchRunner():
continue continue
for result in results: for result in results:
benchmark_name = result['name'] try:
benchmark_name = result['name']
except Exception:
logger.error('Invalid content in JSON file: {}'.format(results_file))
continue
if results_file.parts[-3].endswith('_models'): if results_file.parts[-3].endswith('_models'):
benchmark_name = '{}/{}'.format(results_file.parts[-3], result['name']) benchmark_name = '{}/{}'.format(results_file.parts[-3], result['name'])
if benchmark_name not in results_summary: if benchmark_name not in results_summary: