From 213ab14bead96bd1c1cc91980aeac5adb573d051 Mon Sep 17 00:00:00 2001 From: Yifan Xiong Date: Wed, 8 Dec 2021 14:55:13 +0800 Subject: [PATCH] Bug - Fix issues for distributed runs (#258) Fix issues for distributed runs: * fix config for memory bandwidth benchmarks * add throttling for high concurrency docker pull * update rsync path and exclude directories * handle exceptions when creating summary * tune for logging --- superbench/config/amd_mi100_hpe.yaml | 7 ++++++- superbench/config/amd_mi100_z53.yaml | 7 ++++++- superbench/config/azure_ndmv4.yaml | 2 +- superbench/config/azure_ndv4.yaml | 2 +- superbench/config/default.yaml | 2 +- superbench/executor/executor.py | 6 +++--- superbench/runner/ansible.py | 2 +- superbench/runner/playbooks/deploy.yaml | 1 + superbench/runner/playbooks/fetch_results.yaml | 6 +++--- superbench/runner/runner.py | 11 ++++++++--- 10 files changed, 31 insertions(+), 15 deletions(-) diff --git a/superbench/config/amd_mi100_hpe.yaml b/superbench/config/amd_mi100_hpe.yaml index 9148f6b2..948cd1de 100644 --- a/superbench/config/amd_mi100_hpe.yaml +++ b/superbench/config/amd_mi100_hpe.yaml @@ -48,7 +48,12 @@ superbench: ngpus: 8 operation: allreduce mem-bw: - <<: *default_local_mode + enable: true + modes: + - name: local + proc_num: 8 + prefix: HIP_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2)) + parallel: no gemm-flops: <<: *default_local_mode parameters: diff --git a/superbench/config/amd_mi100_z53.yaml b/superbench/config/amd_mi100_z53.yaml index 0fb45197..8a8d6309 100644 --- a/superbench/config/amd_mi100_z53.yaml +++ b/superbench/config/amd_mi100_z53.yaml @@ -49,7 +49,12 @@ superbench: ngpus: 8 operation: allreduce mem-bw: - <<: *default_local_mode + enable: true + modes: + - name: local + proc_num: 8 + prefix: HIP_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2)) + parallel: no gemm-flops: <<: *default_local_mode parameters: diff --git a/superbench/config/azure_ndmv4.yaml b/superbench/config/azure_ndmv4.yaml index fcd07cef..799cc947 100644 --- a/superbench/config/azure_ndmv4.yaml +++ b/superbench/config/azure_ndmv4.yaml @@ -64,7 +64,7 @@ superbench: - name: local proc_num: 8 prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2)) - parallel: yes + parallel: no disk-benchmark: enable: true modes: diff --git a/superbench/config/azure_ndv4.yaml b/superbench/config/azure_ndv4.yaml index fb05ad23..9eabdfa3 100644 --- a/superbench/config/azure_ndv4.yaml +++ b/superbench/config/azure_ndv4.yaml @@ -60,7 +60,7 @@ superbench: - name: local proc_num: 8 prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2)) - parallel: yes + parallel: no disk-benchmark: enable: false modes: diff --git a/superbench/config/default.yaml b/superbench/config/default.yaml index 5823e92c..8c5d9e98 100644 --- a/superbench/config/default.yaml +++ b/superbench/config/default.yaml @@ -62,7 +62,7 @@ superbench: - name: local proc_num: 8 prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2)) - parallel: yes + parallel: no gpu-copy-bw: enable: true modes: diff --git a/superbench/executor/executor.py b/superbench/executor/executor.py index 7780356d..9d7b4655 100644 --- a/superbench/executor/executor.py +++ b/superbench/executor/executor.py @@ -28,13 +28,13 @@ class SuperBenchExecutor(): self._output_path = Path(sb_output_dir).expanduser().resolve() self.__set_logger('sb-exec.log') - logger.info('Executor uses config: %s.', self._sb_config) - logger.info('Executor writes to: %s.', str(self._output_path)) + logger.debug('Executor uses config: %s.', self._sb_config) + logger.debug('Executor writes to: %s.', str(self._output_path)) self.__validate_sb_config() self._sb_benchmarks = self._sb_config.superbench.benchmarks self._sb_enabled = self.__get_enabled_benchmarks() - logger.info('Executor will execute: %s', self._sb_enabled) + logger.debug('Executor will execute: %s', self._sb_enabled) def __set_logger(self, filename): """Set logger and add file handler. diff --git a/superbench/runner/ansible.py b/superbench/runner/ansible.py index e5660007..15ddb48d 100644 --- a/superbench/runner/ansible.py +++ b/superbench/runner/ansible.py @@ -74,7 +74,7 @@ class AnsibleClient(): logger.info('Run succeed, return code {}.'.format(r.rc)) else: logger.warning('Run failed, return code {}.'.format(r.rc)) - logger.info(r.stats) + logger.debug(r.stats) return r.rc def update_mpi_config(self, ansible_config): diff --git a/superbench/runner/playbooks/deploy.yaml b/superbench/runner/playbooks/deploy.yaml index 3437b936..aea8a490 100644 --- a/superbench/runner/playbooks/deploy.yaml +++ b/superbench/runner/playbooks/deploy.yaml @@ -92,6 +92,7 @@ shell: | docker pull {{ docker_image }} become: yes + throttle: 32 - name: Starting Container shell: | docker rm --force {{ container }} ||: && \ diff --git a/superbench/runner/playbooks/fetch_results.yaml b/superbench/runner/playbooks/fetch_results.yaml index 343d54a0..d1661fab 100644 --- a/superbench/runner/playbooks/fetch_results.yaml +++ b/superbench/runner/playbooks/fetch_results.yaml @@ -1,11 +1,11 @@ - name: Fetch Results hosts: all gather_facts: true - vars: - workspace: '{{ ansible_user_dir }}/sb-workspace' tasks: - name: Synchronize Output Directory ansible.posix.synchronize: mode: pull - src: '{{ workspace }}/{{ sb_output_dir }}/' + src: '{{ sb_output_dir }}/' dest: '{{ absolute_output_dir }}/nodes/{{ ansible_hostname }}' + rsync_opts: + - --exclude=nodes diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py index fcaf3f70..f45eabb6 100644 --- a/superbench/runner/runner.py +++ b/superbench/runner/runner.py @@ -6,6 +6,7 @@ import json import random from pathlib import Path +from pprint import pformat from collections import defaultdict from natsort import natsorted @@ -36,7 +37,7 @@ class SuperBenchRunner(): self._ansible_client = AnsibleClient(ansible_config) self.__set_logger('sb-run.log') - logger.info('Runner uses config: %s.', self._sb_config) + logger.info('Runner uses config: %s.', pformat(self._sb_config)) logger.info('Runner writes to: %s.', str(self._output_path)) self._sb_benchmarks = self._sb_config.superbench.benchmarks @@ -214,7 +215,7 @@ class SuperBenchRunner(): json.dump(result, f) f.write('\n') - def __create_single_node_summary(self, node_path): # pragma: no cover + def __create_single_node_summary(self, node_path): # pragma: no cover # noqa: C901 """Create the result summary file of single node. Args: @@ -235,7 +236,11 @@ class SuperBenchRunner(): continue for result in results: - benchmark_name = result['name'] + try: + benchmark_name = result['name'] + except Exception: + logger.error('Invalid content in JSON file: {}'.format(results_file)) + continue if results_file.parts[-3].endswith('_models'): benchmark_name = '{}/{}'.format(results_file.parts[-3], result['name']) if benchmark_name not in results_summary: