Bug bash - Fix bugs in multi GPU benchmarks (#98)

* Add `sb deploy` command content.
* Fix inline if-expression syntax in playbook.
* Fix quote escape issue in bash command.
* Add custom env in config.
* Update default config for multi GPU benchmarks.
* Update MANIFEST.in to include jinja2 template.
* Require jinja2 minimum version.
* Fix occasional duplicate output in Ansible runner.
* Fix mixed color from Ansible and Python colorlog.
* Update according to comments.
* Change superbench.env from list to dict in config file.
Parent: 216c5b5c71
Commit: c0c43b8f81
MANIFEST.in: package the Jinja2 templates alongside the Python and YAML files:

```diff
@@ -1,5 +1,3 @@
 include LICENSE README.md
-recursive-include superbench *.py
-recursive-include superbench *.yaml
-global-exclude *.pyc
-global-exclude __pycache__
+recursive-include superbench *.py *.j2 *.yaml
+global-exclude *.py[cod] __pycache__
```
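An illustrative sanity check that the new `recursive-include` rule actually ships the templates (assumes `superbench` is installed from the built package):

```python
# List the Jinja2 templates that the new recursive-include rule packages.
from pathlib import Path

import superbench

print(sorted(Path(superbench.__file__).parent.rglob('*.j2')))
```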
setup.py: bump `ansible_runner`, add `ansible_base` (POSIX only) and a minimum `jinja2` version:

```diff
@@ -134,11 +134,13 @@ setup(
     python_requires='>=3.6, <4',
     install_requires=[
-        'ansible_runner>=1.4.7',
+        'ansible_base>=2.10.9;os_name=="posix"',
+        'ansible_runner>=2.0.0rc1',
         'colorlog>=4.7.2',
+        'jinja2>=2.10.1',
         'joblib>=1.0.1',
         'knack>=0.7.2',
         'omegaconf==2.0.6',
         'pyyaml>=5.3',
     ],
     extras_require={
         'dev': ['pre-commit>=2.10.0'],
```
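The `ansible_base` pin uses a PEP 508 environment marker so the dependency is only pulled in on POSIX systems. A minimal sketch of how such markers evaluate, using the `packaging` library (an assumption here; it is not a dependency of this project):

```python
# Parse the requirement string and evaluate its environment marker against
# the current interpreter: True on Linux/macOS, False on Windows, so pip
# skips installing it there.
from packaging.requirements import Requirement

req = Requirement('ansible_base>=2.10.9; os_name == "posix"')
print(req.name, req.specifier, req.marker.evaluate())
```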
CLI `deploy` handler: `sb deploy` now constructs a runner and deploys instead of raising:

```diff
@@ -227,8 +227,8 @@ def deploy_command_handler(
         private_key=private_key,
     )
 
-    SuperBenchRunner(sb_config, docker_config, ansible_config, output_dir)
-    raise NotImplementedError
+    runner = SuperBenchRunner(sb_config, docker_config, ansible_config, output_dir)
+    runner.deploy()
 
 
 def run_command_handler(
```
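With this wired up, `sb deploy --host-list localhost` performs an actual deployment rather than raising; the CLI scenario tests near the end of this diff are updated to match.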
Config loading: parse YAML with `yaml.SafeLoader` before handing the result to OmegaConf:

```diff
@@ -6,6 +6,7 @@
 from pathlib import Path
 from datetime import datetime
 
+import yaml
 from omegaconf import OmegaConf
 
 
@@ -38,4 +39,5 @@ def get_sb_config(config_file):
     p = Path(config_file) if config_file else default_config_file
     if not p.is_file():
        return None
-    return OmegaConf.load(str(p))
+    with p.open() as fp:
+        return OmegaConf.create(yaml.load(fp, Loader=yaml.SafeLoader))
```
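A minimal sketch of why the loader switch matters: PyYAML's `SafeLoader` expands anchors and merge keys (`<<: *anchor`), which the restructured default config below relies on. The YAML snippet is illustrative, not the real config:

```python
# SafeLoader flattens merge keys before OmegaConf ever sees the data, so
# anchored defaults show up as plain values in the resulting config.
import yaml
from omegaconf import OmegaConf

text = """
var:
  base: &base
    enable: true
benchmarks:
  kernel-launch:
    <<: *base
"""
config = OmegaConf.create(yaml.load(text, Loader=yaml.SafeLoader))
print(config.benchmarks['kernel-launch'].enable)  # True
```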
`SuperBenchLogger`: prepend `%(reset)s` so colors leaked from Ansible output do not bleed into Python colorlog records:

```diff
@@ -41,6 +41,7 @@ class SuperBenchLogger:
             )
         if color:
             formatter = colorlog.ColoredFormatter(
+                '%(reset)s'
                 '[%(cyan)s%(asctime)s %(hostname)s:%(process)d%(reset)s]'
                 '[%(blue)s%(filename)s:%(lineno)s%(reset)s]'
                 '[%(log_color)s%(levelname)s%(reset)s] %(message)s'
```
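A minimal sketch of the idea, assuming nothing beyond `colorlog` itself: the leading `%(reset)s` clears any ANSI color state a previous writer (such as Ansible) left on the terminal before each record prints:

```python
# Each record now starts with an ANSI reset escape, so leftover colors from
# other processes writing to the same terminal cannot bleed into it.
import colorlog

handler = colorlog.StreamHandler()
handler.setFormatter(
    colorlog.ColoredFormatter('%(reset)s[%(log_color)s%(levelname)s%(reset)s] %(message)s')
)
logger = colorlog.getLogger('demo')
logger.addHandler(handler)
logger.warning('starts from a clean color state')
```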
superbench/config/default.yaml: restructure with YAML anchors and merge keys. Shared mode and parameter settings move into a `var:` section; `cnn_models` is split into `resnet_models`, `densenet_models`, and `vgg_models`; `node_num` changes from `all` to `1`; the default `model_action` drops `inference`:

```diff
@@ -1,115 +1,95 @@
 # SuperBench Config
 superbench:
   enable: null
-  benchmarks:
-    kernel-launch:
-      enable: true
-    gemm-flops:
-      enable: true
-    cudnn-function:
-      enable: true
-    cublas-function:
-      enable: true
-    matmul:
-      enable: true
-      frameworks:
-        - pytorch
-    sharding-matmul:
-      enable: true
-    computation-communication-overlap:
-      enable: true
-    gpt_models:
-      enable: true
-      modes:
-        - name: torch.distributed
-          proc_num: 8
-          node_num: all
-          parallel: yes
-      frameworks:
-        - pytorch
-      models:
-        - gpt2-small
-        - gpt2-large
-      parameters:
-        duration: 0
-        num_warmup: 16
-        num_steps: 128
-        batch_size: 4
-        precision:
-          - float32
-          - float16
-        model_action:
-          - train
-          - inference
-    bert_models:
-      enable: true
-      modes:
-        - name: torch.distributed
-          proc_num: 8
-          node_num: all
-      frameworks:
-        - pytorch
-      models:
-        - bert-base
-        - bert-large
-      parameters:
-        duration: 0
-        num_warmup: 16
-        num_steps: 128
-        batch_size: 16
-        precision:
-          - float32
-          - float16
-        model_action:
-          - train
-          - inference
-    lstm_models:
-      enable: true
-      modes:
-        - name: torch.distributed
-          proc_num: 8
-          node_num: all
-      frameworks:
-        - pytorch
-      models:
-        - lstm
-      parameters:
-        duration: 0
-        num_warmup: 16
-        num_steps: 128
-        batch_size: 128
-        precision:
-          - float32
-          - float16
-        model_action:
-          - train
-          - inference
-    cnn_models:
-      enable: true
-      modes:
-        - name: torch.distributed
-          proc_num: 8
-          node_num: all
-      frameworks:
-        - pytorch
-      models:
-        - resnet50
-        - resnet101
-        - resnet152
-        - densenet169
-        - densenet201
-        - vgg11
-        - vgg13
-        - vgg16
-        - vgg19
-      parameters:
-        duration: 0
-        num_warmup: 16
-        num_steps: 128
-        batch_size: 128
-        precision:
-          - float32
-          - float16
-        model_action:
-          - train
-          - inference
+  var:
+    default_local_mode: &default_local_mode
+      enable: true
+      modes:
+        - name: local
+          proc_num: 8
+          prefix: CUDA_VISIBLE_DEVICES={proc_rank}
+          parallel: no
+    default_pytorch_mode: &default_pytorch_mode
+      enable: true
+      modes:
+        - name: torch.distributed
+          proc_num: 8
+          node_num: 1
+      frameworks:
+        - pytorch
+    common_model_config: &common_model_config
+      duration: 0
+      num_warmup: 16
+      num_steps: 128
+      precision:
+        - float32
+        - float16
+      model_action:
+        - train
+  benchmarks:
+    kernel-launch:
+      <<: *default_local_mode
+    gemm-flops:
+      <<: *default_local_mode
+    cudnn-function:
+      <<: *default_local_mode
+    cublas-function:
+      <<: *default_local_mode
+    matmul:
+      <<: *default_local_mode
+      frameworks:
+        - pytorch
+    sharding-matmul:
+      <<: *default_pytorch_mode
+    computation-communication-overlap:
+      <<: *default_pytorch_mode
+    gpt_models:
+      <<: *default_pytorch_mode
+      models:
+        - gpt2-small
+        - gpt2-large
+      parameters:
+        <<: *common_model_config
+        batch_size: 4
+    bert_models:
+      <<: *default_pytorch_mode
+      models:
+        - bert-base
+        - bert-large
+      parameters:
+        <<: *common_model_config
+        batch_size: 8
+    lstm_models:
+      <<: *default_pytorch_mode
+      models:
+        - lstm
+      parameters:
+        <<: *common_model_config
+        batch_size: 128
+    resnet_models:
+      <<: *default_pytorch_mode
+      models:
+        - resnet50
+        - resnet101
+        - resnet152
+      parameters:
+        <<: *common_model_config
+        batch_size: 128
+    densenet_models:
+      <<: *default_pytorch_mode
+      models:
+        - densenet169
+        - densenet201
+      parameters:
+        <<: *common_model_config
+        batch_size: 128
+    vgg_models:
+      <<: *default_pytorch_mode
+      models:
+        - vgg11
+        - vgg13
+        - vgg16
+        - vgg19
+      parameters:
+        <<: *common_model_config
+        batch_size: 128
```
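A sketch of the anchor/merge-key semantics the new config leans on: `&` defines an anchor, `<<: *anchor` merges it into a mapping, and keys written out explicitly override merged ones. The values here are illustrative, not from the real config:

```python
# Explicit sibling keys take precedence over keys pulled in via '<<',
# which is what lets each benchmark override only its batch_size.
import yaml

text = """
var:
  common: &common
    num_steps: 128
    batch_size: 128
benchmarks:
  vgg_models:
    <<: *common
    batch_size: 32
"""
data = yaml.load(text, Loader=yaml.SafeLoader)
vgg = data['benchmarks']['vgg_models']
print(vgg['num_steps'], vgg['batch_size'])  # 128 32  (explicit key wins)
```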
Environment-check playbook (`check_env.yaml`, which receives the `env` extravar from the runner change below): define `sb_nodes` from sorted hostnames and append the user-defined env from the config:

```diff
@@ -22,10 +22,14 @@
     container: sb-workspace
+    sb_nodes: '{{ hostvars.values() | map(attribute="ansible_hostname") | sort }}'
     sb_env: |
+      # pytorch env
       NNODES={{ sb_nodes | length }}
       NODE_RANK={{ lookup('ansible.utils.index_of', sb_nodes, 'eq', ansible_hostname) }}
       MASTER_ADDR={{ sb_nodes | first }}
       MASTER_PORT=29500
       OMP_NUM_THREADS=1
+      # config env
+      {{ env | default('') }}
   tasks:
     - name: Updating Config
       copy:
```
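Expressed in plain Python (with illustrative hostnames), this is what the `sb_env` template computes per node; sorting the hostnames makes every node derive the same torch.distributed rendezvous settings independently:

```python
# Each node renders the same sorted node list, so ranks and the master
# address agree across the whole cluster without coordination.
sb_nodes = sorted(['node2', 'node0', 'node1'])

def render_env(me):
    return {
        'NNODES': len(sb_nodes),          # {{ sb_nodes | length }}
        'NODE_RANK': sb_nodes.index(me),  # the ansible.utils.index_of lookup
        'MASTER_ADDR': sb_nodes[0],       # {{ sb_nodes | first }}
        'MASTER_PORT': 29500,
    }

print(render_env('node1'))
# {'NNODES': 3, 'NODE_RANK': 1, 'MASTER_ADDR': 'node0', 'MASTER_PORT': 29500}
```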
Docker deployment tasks in the playbook: Jinja2 inline if-expressions need an explicit `else`:

```diff
@@ -65,8 +65,8 @@
           docker rm --force {{ container }} ||: && \
           docker run -itd --name={{ container }} \
             --privileged --net=host --ipc=host \
-            {{ '--gpus=all' if gpu_vendor == 'nvidia' }} \
-            {{ '--security-opt seccomp=unconfined --group-add video' if gpu_vendor == 'amd' }} \
+            {{ '--gpus=all' if gpu_vendor == 'nvidia' else '' }} \
+            {{ '--security-opt seccomp=unconfined --group-add video' if gpu_vendor == 'amd' else '' }} \
             -w /root -v {{ workspace }}:/root -v /mnt:/mnt \
             {{ docker_image }} bash && \
           docker exec {{ container }} bash -c \
```
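Why the `else` matters: Ansible renders templates strictly, so an inline if-expression whose condition is false and which has no `else` yields an undefined value. Plain Jinja2 with `StrictUndefined` reproduces the failure in a minimal sketch:

```python
# With StrictUndefined (mirroring Ansible's strict templating), the inline
# if without an else raises when the condition is false; the fixed form
# simply renders an empty string so the flag is omitted.
from jinja2 import Environment, StrictUndefined, UndefinedError

env = Environment(undefined=StrictUndefined)
broken = env.from_string("{{ '--gpus=all' if gpu_vendor == 'nvidia' }}")
fixed = env.from_string("{{ '--gpus=all' if gpu_vendor == 'nvidia' else '' }}")

print(repr(fixed.render(gpu_vendor='amd')))  # ''
try:
    broken.render(gpu_vendor='amd')
except UndefinedError as err:
    print('broken template fails:', err)
```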
`SuperBenchRunner`: default `superbench.env` to an empty dict (it is now a dict, not a list), so the later `.items()` call is always safe:

```diff
@@ -54,6 +54,8 @@ class SuperBenchRunner():
             InvalidConfigError: If input config is invalid.
         """
         # TODO: add validation and defaulting
+        if not self._sb_config.superbench.env:
+            self._sb_config.superbench.env = {}
         for name in self._sb_benchmarks:
             if not self._sb_benchmarks[name].modes:
                 self._sb_benchmarks[name].modes = []
```
Pass the flattened env to the `check_env.yaml` playbook:

```diff
@@ -141,7 +143,13 @@ class SuperBenchRunner():
         logger.info('Checking SuperBench environment.')
         OmegaConf.save(config=self._sb_config, f=str(Path(self._output_dir) / 'sb.config.yaml'))
         self._ansible_client.run(
-            self._ansible_client.get_playbook_config('check_env.yaml', extravars={'output_dir': self._output_dir})
+            self._ansible_client.get_playbook_config(
+                'check_env.yaml',
+                extravars={
+                    'output_dir': self._output_dir,
+                    'env': '\n'.join(f'{k}={v}' for k, v in self._sb_config.superbench.env.items()),
+                }
+            )
         )
 
     def _run_proc(self, benchmark_name, mode, vars):
```
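Since `superbench.env` is now a dict, it is flattened into the `KEY=VALUE` lines that `sb.env` expects; the variable names and values below are illustrative:

```python
# One KEY=VALUE pair per line, ready to be sourced by the shell inside the
# container via 'set -o allexport && source sb.env'.
env = {'NCCL_IB_DISABLE': '0', 'OMP_NUM_THREADS': '1'}
print('\n'.join(f'{k}={v}' for k, v in env.items()))
# NCCL_IB_DISABLE=0
# OMP_NUM_THREADS=1
```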
Quote the `bash -c` payload with single quotes to fix the escape issue:

```diff
@@ -161,7 +169,7 @@ class SuperBenchRunner():
             self._ansible_client.get_shell_config(
                 (
                     'docker exec sb-workspace bash -c '
-                    '"set -o allexport && source sb.env && set +o allexport && {command}"'
+                    "'set -o allexport && source sb.env && set +o allexport && {command}'"
                 ).format(command=self.__get_mode_command(benchmark_name, mode), )
             ),
             sudo=True
```
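As we read the fix, single-quoting keeps the whole payload as one `bash -c` argument once the command passes through the outer shell layer. A small `shlex` sketch with a hypothetical `echo ok` payload:

```python
# shlex tokenizes the command the way a POSIX shell would: the single-quoted
# payload survives as a single argv element for bash -c.
import shlex

payload = 'set -o allexport && source sb.env && set +o allexport && echo ok'
cmd = "docker exec sb-workspace bash -c '{}'".format(payload)
print(shlex.split(cmd)[-1])  # the whole payload, as one argument
```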
CLI scenario tests: `sb deploy` with a host list is now expected to succeed, and a new case covers the missing-host failure:

```diff
@@ -53,7 +53,11 @@ class SuperBenchCLIScenarioTest(ScenarioTest):
 
     def test_sb_deploy(self):
         """Test sb deploy."""
-        self.cmd('sb deploy --host-list localhost', expect_failure=True)
+        self.cmd('sb deploy --host-list localhost', checks=[NoneCheck()])
+
+    def test_sb_deploy_no_host(self):
+        """Test sb deploy, no host_file or host_list provided, should fail."""
+        self.cmd('sb deploy', expect_failure=True)
 
     def test_sb_exec(self):
         """Test sb exec."""
```
Executor tests: load the default config through `yaml.SafeLoader`, matching `get_sb_config`:

```diff
@@ -10,6 +10,7 @@ import tempfile
 from pathlib import Path
 from unittest import mock
 
+import yaml
 from omegaconf import OmegaConf
 
 from superbench.executor import SuperBenchExecutor
@@ -24,7 +25,8 @@ class ExecutorTestCase(unittest.TestCase):
     def setUp(self):
         """Hook method for setting up the test fixture before exercising it."""
         default_config_file = Path(__file__).parent / '../../superbench/config/default.yaml'
-        self.default_config = OmegaConf.load(str(default_config_file))
+        with default_config_file.open() as fp:
+            self.default_config = OmegaConf.create(yaml.load(fp, Loader=yaml.SafeLoader))
         self.output_dir = tempfile.mkdtemp()
 
         self.executor = SuperBenchExecutor(self.default_config, self.output_dir)
```
`test_get_arguments` becomes table-driven sub-tests over explicit parameter dicts instead of reading from the default config:

```diff
@@ -61,19 +63,31 @@ class ExecutorTestCase(unittest.TestCase):
 
     def test_get_arguments(self):
         """Test benchmarks arguments."""
-        expected_matmul_args = ''
-        self.assertEqual(
-            self.executor._SuperBenchExecutor__get_arguments(
-                self.default_config.superbench.benchmarks.matmul.parameters
-            ), expected_matmul_args
-        )
-        expected_bert_models_args = \
-            '--duration 0 --num_warmup 16 --num_steps 128 --batch_size 16 ' \
-            '--precision float32 float16 --model_action train inference'
-        self.assertEqual(
-            self.executor._SuperBenchExecutor__get_arguments(
-                self.default_config.superbench.benchmarks.bert_models.parameters
-            ), expected_bert_models_args
-        )
+        test_cases = [
+            {
+                'parameters': None,
+                'expected_args': '',
+            },
+            {
+                'parameters': {
+                    'duration': 0,
+                    'num_warmup': 16,
+                    'num_steps': 128,
+                    'batch_size': 16,
+                    'precision': ['float32', 'float16'],
+                    'model_action': ['train', 'inference'],
+                },
+                'expected_args': (
+                    '--duration 0 --num_warmup 16 --num_steps 128 --batch_size 16 '
+                    '--precision float32 float16 --model_action train inference'
+                ),
+            },
+        ]
+        for test_case in test_cases:
+            with self.subTest(msg='Testing with case', test_case=test_case):
+                self.assertEqual(
+                    self.executor._SuperBenchExecutor__get_arguments(test_case['parameters']),
+                    test_case['expected_args']
+                )
 
     def test_create_benchmark_dir(self):
```
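One detail worth noting in these tests: `__get_arguments` has two leading underscores, so Python name-mangles it, hence the `_SuperBenchExecutor__get_arguments` spelling. A toy illustration:

```python
# Double-underscore attributes are rewritten to _ClassName__attr, which is
# the only way to reach them from outside the class, as the tests do.
class Demo:
    def __hidden(self):
        return 'ok'

print(Demo()._Demo__hidden())  # 'ok'
```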
Runner tests: same loader change as the executor tests:

```diff
@@ -9,6 +9,7 @@ import tempfile
 from pathlib import Path
 from unittest import mock
 
+import yaml
 from omegaconf import OmegaConf
 
 from superbench.runner import SuperBenchRunner
@@ -19,7 +20,8 @@ class RunnerTestCase(unittest.TestCase):
     def setUp(self):
         """Hook method for setting up the test fixture before exercising it."""
         default_config_file = Path(__file__).parent / '../../superbench/config/default.yaml'
-        self.default_config = OmegaConf.load(str(default_config_file))
+        with default_config_file.open() as fp:
+            self.default_config = OmegaConf.create(yaml.load(fp, Loader=yaml.SafeLoader))
         self.output_dir = tempfile.mkdtemp()
 
         self.runner = SuperBenchRunner(self.default_config, None, None, self.output_dir)
```