Bug - Fix issues for distributed runs (#258)

Fix issues for distributed runs:

* fix config for memory bandwidth benchmarks
* add throttling for high-concurrency docker pull
* update rsync path and exclude directories
* handle exceptions when creating summary
* tune logging levels
2021-12-08 14:55:13 +08:00 · 2021-12-08 14:55:13 +08:00 · 213ab14bea
--- a/superbench/config/amd_mi100_hpe.yaml
+++ b/superbench/config/amd_mi100_hpe.yaml
@ -48,7 +48,12 @@ superbench:
        ngpus: 8
        operation: allreduce
    mem-bw:
-      <<: *default_local_mode
+      enable: true
+      modes:
+        - name: local
+          proc_num: 8
+          prefix: HIP_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
+          parallel: no
    gemm-flops:
      <<: *default_local_mode
      parameters:
--- a/superbench/config/amd_mi100_z53.yaml
+++ b/superbench/config/amd_mi100_z53.yaml
@ -49,7 +49,12 @@ superbench:
        ngpus: 8
        operation: allreduce
    mem-bw:
-      <<: *default_local_mode
+      enable: true
+      modes:
+        - name: local
+          proc_num: 8
+          prefix: HIP_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
+          parallel: no
    gemm-flops:
      <<: *default_local_mode
      parameters:
--- a/superbench/config/azure_ndmv4.yaml
+++ b/superbench/config/azure_ndmv4.yaml
@ -64,7 +64,7 @@ superbench:
        - name: local
          proc_num: 8
          prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
-          parallel: yes
+          parallel: no
    disk-benchmark:
      enable: true
      modes:
--- a/superbench/config/azure_ndv4.yaml
+++ b/superbench/config/azure_ndv4.yaml
@ -60,7 +60,7 @@ superbench:
        - name: local
          proc_num: 8
          prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
-          parallel: yes
+          parallel: no
    disk-benchmark:
      enable: false
      modes:
--- a/superbench/config/default.yaml
+++ b/superbench/config/default.yaml
@ -62,7 +62,7 @@ superbench:
        - name: local
          proc_num: 8
          prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
-          parallel: yes
+          parallel: no
    gpu-copy-bw:
      enable: true
      modes:
--- a/superbench/executor/executor.py
+++ b/superbench/executor/executor.py
@ -28,13 +28,13 @@ class SuperBenchExecutor():
        self._output_path = Path(sb_output_dir).expanduser().resolve()

        self.__set_logger('sb-exec.log')
-        logger.info('Executor uses config: %s.', self._sb_config)
-        logger.info('Executor writes to: %s.', str(self._output_path))
+        logger.debug('Executor uses config: %s.', self._sb_config)
+        logger.debug('Executor writes to: %s.', str(self._output_path))

        self.__validate_sb_config()
        self._sb_benchmarks = self._sb_config.superbench.benchmarks
        self._sb_enabled = self.__get_enabled_benchmarks()
-        logger.info('Executor will execute: %s', self._sb_enabled)
+        logger.debug('Executor will execute: %s', self._sb_enabled)

    def __set_logger(self, filename):
        """Set logger and add file handler.
--- a/superbench/runner/ansible.py
+++ b/superbench/runner/ansible.py
@ -74,7 +74,7 @@ class AnsibleClient():
            logger.info('Run succeed, return code {}.'.format(r.rc))
        else:
            logger.warning('Run failed, return code {}.'.format(r.rc))
-        logger.info(r.stats)
+        logger.debug(r.stats)
        return r.rc

    def update_mpi_config(self, ansible_config):
--- a/superbench/runner/playbooks/deploy.yaml
+++ b/superbench/runner/playbooks/deploy.yaml
@ -92,6 +92,7 @@
      shell: |
        docker pull {{ docker_image }}
      become: yes
+      throttle: 32
    - name: Starting Container
      shell: |
        docker rm --force {{ container }} ||: && \
--- a/superbench/runner/playbooks/fetch_results.yaml
+++ b/superbench/runner/playbooks/fetch_results.yaml
@ -1,11 +1,11 @@
 - name: Fetch Results
  hosts: all
  gather_facts: true
-  vars:
-    workspace: '{{ ansible_user_dir }}/sb-workspace'
  tasks:
    - name: Synchronize Output Directory
      ansible.posix.synchronize:
        mode: pull
-        src: '{{ workspace }}/{{ sb_output_dir }}/'
+        src: '{{ sb_output_dir }}/'
        dest: '{{ absolute_output_dir }}/nodes/{{ ansible_hostname }}'
+        rsync_opts:
+          - --exclude=nodes
--- a/superbench/runner/runner.py
+++ b/superbench/runner/runner.py
@ -6,6 +6,7 @@
 import json
 import random
 from pathlib import Path
+from pprint import pformat
 from collections import defaultdict

 from natsort import natsorted
@ -36,7 +37,7 @@ class SuperBenchRunner():
        self._ansible_client = AnsibleClient(ansible_config)

        self.__set_logger('sb-run.log')
-        logger.info('Runner uses config: %s.', self._sb_config)
+        logger.info('Runner uses config: %s.', pformat(self._sb_config))
        logger.info('Runner writes to: %s.', str(self._output_path))

        self._sb_benchmarks = self._sb_config.superbench.benchmarks
@ -214,7 +215,7 @@ class SuperBenchRunner():
                json.dump(result, f)
                f.write('\n')

-    def __create_single_node_summary(self, node_path):    # pragma: no cover
+    def __create_single_node_summary(self, node_path):    # pragma: no cover # noqa: C901
        """Create the result summary file of single node.

        Args:
@ -235,7 +236,11 @@ class SuperBenchRunner():
                    continue

                for result in results:
-                    benchmark_name = result['name']
+                    try:
+                        benchmark_name = result['name']
+                    except Exception:
+                        logger.error('Invalid content in JSON file: {}'.format(results_file))
+                        continue
                    if results_file.parts[-3].endswith('_models'):
                        benchmark_name = '{}/{}'.format(results_file.parts[-3], result['name'])
                    if benchmark_name not in results_summary: