From 213ab14bead96bd1c1cc91980aeac5adb573d051 Mon Sep 17 00:00:00 2001
From: Yifan Xiong <yifan.xiong@microsoft.com>
Date: Wed, 8 Dec 2021 14:55:13 +0800
Subject: [PATCH] Bug - Fix issues for distributed runs (#258)

Fix issues for distributed runs:
* fix config for memory bandwidth benchmarks
* add throttling for high-concurrency docker pull
* update rsync path and exclude directories
* handle exceptions when creating summary
* tune logging (demote verbose info messages to debug, pretty-print runner config)
---
 superbench/config/amd_mi100_hpe.yaml           |  7 ++++++-
 superbench/config/amd_mi100_z53.yaml           |  7 ++++++-
 superbench/config/azure_ndmv4.yaml             |  2 +-
 superbench/config/azure_ndv4.yaml              |  2 +-
 superbench/config/default.yaml                 |  2 +-
 superbench/executor/executor.py                |  6 +++---
 superbench/runner/ansible.py                   |  2 +-
 superbench/runner/playbooks/deploy.yaml        |  1 +
 superbench/runner/playbooks/fetch_results.yaml |  6 +++---
 superbench/runner/runner.py                    | 11 ++++++++---
 10 files changed, 31 insertions(+), 15 deletions(-)

diff --git a/superbench/config/amd_mi100_hpe.yaml b/superbench/config/amd_mi100_hpe.yaml
index 9148f6b2..948cd1de 100644
--- a/superbench/config/amd_mi100_hpe.yaml
+++ b/superbench/config/amd_mi100_hpe.yaml
@@ -48,7 +48,12 @@ superbench:
         ngpus: 8
         operation: allreduce
     mem-bw:
-      <<: *default_local_mode
+      enable: true
+      modes:
+        - name: local
+          proc_num: 8
+          prefix: HIP_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
+          parallel: no
     gemm-flops:
       <<: *default_local_mode
       parameters:
diff --git a/superbench/config/amd_mi100_z53.yaml b/superbench/config/amd_mi100_z53.yaml
index 0fb45197..8a8d6309 100644
--- a/superbench/config/amd_mi100_z53.yaml
+++ b/superbench/config/amd_mi100_z53.yaml
@@ -49,7 +49,12 @@ superbench:
         ngpus: 8
         operation: allreduce
     mem-bw:
-      <<: *default_local_mode
+      enable: true
+      modes:
+        - name: local
+          proc_num: 8
+          prefix: HIP_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
+          parallel: no
     gemm-flops:
       <<: *default_local_mode
       parameters:
diff --git a/superbench/config/azure_ndmv4.yaml b/superbench/config/azure_ndmv4.yaml
index fcd07cef..799cc947 100644
--- a/superbench/config/azure_ndmv4.yaml
+++ b/superbench/config/azure_ndmv4.yaml
@@ -64,7 +64,7 @@ superbench:
         - name: local
           proc_num: 8
           prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
-          parallel: yes
+          parallel: no
     disk-benchmark:
       enable: true
       modes:
diff --git a/superbench/config/azure_ndv4.yaml b/superbench/config/azure_ndv4.yaml
index fb05ad23..9eabdfa3 100644
--- a/superbench/config/azure_ndv4.yaml
+++ b/superbench/config/azure_ndv4.yaml
@@ -60,7 +60,7 @@ superbench:
         - name: local
           proc_num: 8
           prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
-          parallel: yes
+          parallel: no
     disk-benchmark:
       enable: false
       modes:
diff --git a/superbench/config/default.yaml b/superbench/config/default.yaml
index 5823e92c..8c5d9e98 100644
--- a/superbench/config/default.yaml
+++ b/superbench/config/default.yaml
@@ -62,7 +62,7 @@ superbench:
         - name: local
           proc_num: 8
           prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
-          parallel: yes
+          parallel: no
     gpu-copy-bw:
       enable: true
       modes:
diff --git a/superbench/executor/executor.py b/superbench/executor/executor.py
index 7780356d..9d7b4655 100644
--- a/superbench/executor/executor.py
+++ b/superbench/executor/executor.py
@@ -28,13 +28,13 @@ class SuperBenchExecutor():
         self._output_path = Path(sb_output_dir).expanduser().resolve()
 
         self.__set_logger('sb-exec.log')
-        logger.info('Executor uses config: %s.', self._sb_config)
-        logger.info('Executor writes to: %s.', str(self._output_path))
+        logger.debug('Executor uses config: %s.', self._sb_config)
+        logger.debug('Executor writes to: %s.', str(self._output_path))
 
         self.__validate_sb_config()
         self._sb_benchmarks = self._sb_config.superbench.benchmarks
         self._sb_enabled = self.__get_enabled_benchmarks()
-        logger.info('Executor will execute: %s', self._sb_enabled)
+        logger.debug('Executor will execute: %s', self._sb_enabled)
 
     def __set_logger(self, filename):
         """Set logger and add file handler.
diff --git a/superbench/runner/ansible.py b/superbench/runner/ansible.py
index e5660007..15ddb48d 100644
--- a/superbench/runner/ansible.py
+++ b/superbench/runner/ansible.py
@@ -74,7 +74,7 @@ class AnsibleClient():
             logger.info('Run succeed, return code {}.'.format(r.rc))
         else:
             logger.warning('Run failed, return code {}.'.format(r.rc))
-        logger.info(r.stats)
+        logger.debug(r.stats)
         return r.rc
 
     def update_mpi_config(self, ansible_config):
diff --git a/superbench/runner/playbooks/deploy.yaml b/superbench/runner/playbooks/deploy.yaml
index 3437b936..aea8a490 100644
--- a/superbench/runner/playbooks/deploy.yaml
+++ b/superbench/runner/playbooks/deploy.yaml
@@ -92,6 +92,7 @@
       shell: |
         docker pull {{ docker_image }}
       become: yes
+      throttle: 32
     - name: Starting Container
       shell: |
         docker rm --force {{ container }} ||: && \
diff --git a/superbench/runner/playbooks/fetch_results.yaml b/superbench/runner/playbooks/fetch_results.yaml
index 343d54a0..d1661fab 100644
--- a/superbench/runner/playbooks/fetch_results.yaml
+++ b/superbench/runner/playbooks/fetch_results.yaml
@@ -1,11 +1,11 @@
 - name: Fetch Results
   hosts: all
   gather_facts: true
-  vars:
-    workspace: '{{ ansible_user_dir }}/sb-workspace'
   tasks:
     - name: Synchronize Output Directory
       ansible.posix.synchronize:
         mode: pull
-        src: '{{ workspace }}/{{ sb_output_dir }}/'
+        src: '{{ sb_output_dir }}/'
         dest: '{{ absolute_output_dir }}/nodes/{{ ansible_hostname }}'
+        rsync_opts:
+          - --exclude=nodes
diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py
index fcaf3f70..f45eabb6 100644
--- a/superbench/runner/runner.py
+++ b/superbench/runner/runner.py
@@ -6,6 +6,7 @@
 import json
 import random
 from pathlib import Path
+from pprint import pformat
 from collections import defaultdict
 
 from natsort import natsorted
@@ -36,7 +37,7 @@ class SuperBenchRunner():
         self._ansible_client = AnsibleClient(ansible_config)
 
         self.__set_logger('sb-run.log')
-        logger.info('Runner uses config: %s.', self._sb_config)
+        logger.info('Runner uses config: %s.', pformat(self._sb_config))
         logger.info('Runner writes to: %s.', str(self._output_path))
 
         self._sb_benchmarks = self._sb_config.superbench.benchmarks
@@ -214,7 +215,7 @@ class SuperBenchRunner():
                 json.dump(result, f)
                 f.write('\n')
 
-    def __create_single_node_summary(self, node_path):    # pragma: no cover
+    def __create_single_node_summary(self, node_path):    # pragma: no cover # noqa: C901
         """Create the result summary file of single node.
 
         Args:
@@ -235,7 +236,11 @@ class SuperBenchRunner():
                     continue
 
                 for result in results:
-                    benchmark_name = result['name']
+                    try:
+                        benchmark_name = result['name']
+                    except Exception:
+                        logger.error('Invalid content in JSON file: {}'.format(results_file))
+                        continue
                     if results_file.parts[-3].endswith('_models'):
                         benchmark_name = '{}/{}'.format(results_file.parts[-3], result['name'])
                     if benchmark_name not in results_summary: