Bug bash - Fix bugs in multi-GPU benchmarks (#98)
* Add `sb deploy` command content.
* Fix inline if-expression syntax in playbook.
* Fix quote escape issue in bash command.
* Add custom env in config.
* Update default config for multi-GPU benchmarks.
* Update MANIFEST.in to include jinja2 template.
* Require jinja2 minimum version.
* Fix occasional duplicate output in Ansible runner.
* Fix mixed color from Ansible and Python colorlog.
* Update according to comments.
* Change superbench.env from list to dict in config file.
Parent: 216c5b5c71
Commit: c0c43b8f81
MANIFEST.in

@@ -1,5 +1,3 @@
 include LICENSE README.md
-recursive-include superbench *.py
-recursive-include superbench *.yaml
-global-exclude *.pyc
-global-exclude __pycache__
+recursive-include superbench *.py *.j2 *.yaml
+global-exclude *.py[cod] __pycache__

setup.py
@@ -134,11 +134,13 @@ setup(
     python_requires='>=3.6, <4',
     install_requires=[
         'ansible_base>=2.10.9;os_name=="posix"',
-        'ansible_runner>=1.4.7',
+        'ansible_runner>=2.0.0rc1',
         'colorlog>=4.7.2',
+        'jinja2>=2.10.1',
         'joblib>=1.0.1',
         'knack>=0.7.2',
         'omegaconf==2.0.6',
+        'pyyaml>=5.3',
     ],
     extras_require={
         'dev': ['pre-commit>=2.10.0'],

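Note on the `ansible_base` line above: the `;os_name=="posix"` suffix is a PEP 508 environment marker, so the requirement is skipped on platforms where Ansible is unavailable (e.g. Windows). A minimal sketch of how such a marker evaluates; the use of the `packaging` library here is illustrative, not part of this commit:

    # Evaluate a PEP 508 environment marker against the current interpreter.
    from packaging.markers import Marker

    marker = Marker('os_name == "posix"')
    print(marker.evaluate())  # True on Linux/macOS, False on Windows
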
@@ -227,8 +227,8 @@ def deploy_command_handler(
         private_key=private_key,
     )
 
-    SuperBenchRunner(sb_config, docker_config, ansible_config, output_dir)
-    raise NotImplementedError
+    runner = SuperBenchRunner(sb_config, docker_config, ansible_config, output_dir)
+    runner.deploy()
 
 
 def run_command_handler(

@@ -6,6 +6,7 @@
 from pathlib import Path
 from datetime import datetime
 
+import yaml
 from omegaconf import OmegaConf
 
 

@@ -38,4 +39,5 @@ def get_sb_config(config_file):
     p = Path(config_file) if config_file else default_config_file
     if not p.is_file():
         return None
-    return OmegaConf.load(str(p))
+    with p.open() as fp:
+        return OmegaConf.create(yaml.load(fp, Loader=yaml.SafeLoader))

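Config loading now goes through PyYAML's `SafeLoader` plus `OmegaConf.create` instead of `OmegaConf.load`. A plausible motivation (an assumption on my part, consistent with the anchor-based default config later in this commit): `SafeLoader` resolves YAML anchors and `<<:` merge keys before OmegaConf ever sees the data. A minimal sketch:

    import textwrap
    import yaml
    from omegaconf import OmegaConf

    text = textwrap.dedent('''
        var:
          base: &base
            enable: true
        job:
          <<: *base
          batch_size: 4
    ''')
    data = yaml.load(text, Loader=yaml.SafeLoader)  # "<<:" merge key resolved here
    config = OmegaConf.create(data)                 # node already holds merged values
    assert config.job.enable is True
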
@@ -41,6 +41,7 @@ class SuperBenchLogger:
             )
         if color:
             formatter = colorlog.ColoredFormatter(
+                '%(reset)s'
                 '[%(cyan)s%(asctime)s %(hostname)s:%(process)d%(reset)s]'
                 '[%(blue)s%(filename)s:%(lineno)s%(reset)s]'
                 '[%(log_color)s%(levelname)s%(reset)s] %(message)s'

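The leading `%(reset)s` is the "mixed color" fix from the commit message: colorlog substitutes an ANSI reset escape for it, so each Python log line first clears whatever color state interleaved Ansible output left on the terminal. A minimal standalone sketch; the handler setup below is illustrative:

    import logging
    import colorlog

    handler = logging.StreamHandler()
    handler.setFormatter(
        colorlog.ColoredFormatter(
            '%(reset)s'  # clear any leftover ANSI color state first
            '[%(log_color)s%(levelname)s%(reset)s] %(message)s'
        )
    )
    logger = logging.getLogger('sb-demo')
    logger.addHandler(handler)
    logger.warning('color starts from a clean state')
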
superbench/config/default.yaml

@@ -1,115 +1,95 @@
 # SuperBench Config
 superbench:
   enable: null
-  benchmarks:
-    kernel-launch:
-      enable: true
-    gemm-flops:
-      enable: true
-    cudnn-function:
-      enable: true
-    cublas-function:
-      enable: true
-    matmul:
+  var:
+    default_local_mode: &default_local_mode
       enable: true
       modes:
         - name: local
           proc_num: 8
           prefix: CUDA_VISIBLE_DEVICES={proc_rank}
-          parallel: no
-      frameworks:
-        - pytorch
-    gpt_models:
+          parallel: yes
+    default_pytorch_mode: &default_pytorch_mode
       enable: true
       modes:
         - name: torch.distributed
           proc_num: 8
-          node_num: all
+          node_num: 1
       frameworks:
         - pytorch
+    common_model_config: &common_model_config
+      duration: 0
+      num_warmup: 16
+      num_steps: 128
+      precision:
+        - float32
+        - float16
+      model_action:
+        - train
+  benchmarks:
+    kernel-launch:
+      <<: *default_local_mode
+    gemm-flops:
+      <<: *default_local_mode
+    cudnn-function:
+      <<: *default_local_mode
+    cublas-function:
+      <<: *default_local_mode
+    matmul:
+      <<: *default_local_mode
+      frameworks:
+        - pytorch
+    sharding-matmul:
+      <<: *default_pytorch_mode
+    computation-communication-overlap:
+      <<: *default_pytorch_mode
+    gpt_models:
+      <<: *default_pytorch_mode
       models:
         - gpt2-small
         - gpt2-large
       parameters:
-        duration: 0
-        num_warmup: 16
-        num_steps: 128
+        <<: *common_model_config
         batch_size: 4
-        precision:
-          - float32
-          - float16
-        model_action:
-          - train
-          - inference
     bert_models:
-      enable: true
-      modes:
-        - name: torch.distributed
-          proc_num: 8
-          node_num: all
-      frameworks:
-        - pytorch
+      <<: *default_pytorch_mode
       models:
         - bert-base
         - bert-large
       parameters:
-        duration: 0
-        num_warmup: 16
-        num_steps: 128
-        batch_size: 16
-        precision:
-          - float32
-          - float16
-        model_action:
-          - train
-          - inference
+        <<: *common_model_config
+        batch_size: 8
     lstm_models:
-      enable: true
-      modes:
-        - name: torch.distributed
-          proc_num: 8
-          node_num: all
-      frameworks:
-        - pytorch
+      <<: *default_pytorch_mode
       models:
         - lstm
       parameters:
-        duration: 0
-        num_warmup: 16
-        num_steps: 128
+        <<: *common_model_config
         batch_size: 128
-        precision:
-          - float32
-          - float16
-        model_action:
-          - train
-          - inference
-    cnn_models:
-      enable: true
-      modes:
-        - name: torch.distributed
-          proc_num: 8
-          node_num: all
-      frameworks:
-        - pytorch
+    resnet_models:
+      <<: *default_pytorch_mode
       models:
         - resnet50
         - resnet101
         - resnet152
+      parameters:
+        <<: *common_model_config
+        batch_size: 128
+    densenet_models:
+      <<: *default_pytorch_mode
+      models:
         - densenet169
         - densenet201
+      parameters:
+        <<: *common_model_config
+        batch_size: 128
+    vgg_models:
+      <<: *default_pytorch_mode
+      models:
         - vgg11
         - vgg13
         - vgg16
         - vgg19
       parameters:
-        duration: 0
-        num_warmup: 16
-        num_steps: 128
+        <<: *common_model_config
         batch_size: 128
-        precision:
-          - float32
-          - float16
-        model_action:
-          - train
-          - inference

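The rewritten config relies on standard YAML anchors (`&name`), aliases (`*name`), and merge keys (`<<:`): shared mode and model settings are defined once under `superbench.var` and merged into each benchmark, which is how the file shrinks from 115 to 95 lines while adding benchmarks. A small Python check of the merge-key behavior, using names taken from the hunk above:

    import textwrap
    import yaml

    snippet = textwrap.dedent('''
        var:
          default_local_mode: &default_local_mode
            enable: true
        benchmarks:
          kernel-launch:
            <<: *default_local_mode
          matmul:
            <<: *default_local_mode
            frameworks:
              - pytorch
    ''')
    data = yaml.safe_load(snippet)
    # every benchmark inherits the anchored defaults; local keys win on conflict
    assert data['benchmarks']['kernel-launch']['enable'] is True
    assert data['benchmarks']['matmul']['frameworks'] == ['pytorch']
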
@@ -22,10 +22,14 @@
     container: sb-workspace
     sb_nodes: '{{ hostvars.values() | map(attribute="ansible_hostname") | sort }}'
     sb_env: |
+      # pytorch env
       NNODES={{ sb_nodes | length }}
       NODE_RANK={{ lookup('ansible.utils.index_of', sb_nodes, 'eq', ansible_hostname) }}
       MASTER_ADDR={{ sb_nodes | first }}
       MASTER_PORT=29500
+      OMP_NUM_THREADS=1
+      # config env
+      {{ env | default('') }}
   tasks:
     - name: Updating Config
       copy:

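The variables written into `sb_env` (`NNODES`, `NODE_RANK`, `MASTER_ADDR`, `MASTER_PORT`) match the rendezvous options of PyTorch's distributed launcher, and `OMP_NUM_THREADS=1` is the usual guard against CPU oversubscription. A hedged sketch of how a command could consume them once `sb.env` has been sourced; the launch invocation is an assumption, not taken from this commit:

    import os

    # after "set -o allexport && source sb.env", these are real env vars
    launch = (
        'python -m torch.distributed.launch '
        '--nnodes={NNODES} --node_rank={NODE_RANK} '
        '--master_addr={MASTER_ADDR} --master_port={MASTER_PORT}'
    ).format(**os.environ)
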
@@ -65,8 +65,8 @@
       docker rm --force {{ container }} ||: && \
       docker run -itd --name={{ container }} \
         --privileged --net=host --ipc=host \
-        {{ '--gpus=all' if gpu_vendor == 'nvidia' }} \
-        {{ '--security-opt seccomp=unconfined --group-add video' if gpu_vendor == 'amd' }} \
+        {{ '--gpus=all' if gpu_vendor == 'nvidia' else '' }} \
+        {{ '--security-opt seccomp=unconfined --group-add video' if gpu_vendor == 'amd' else '' }} \
        -w /root -v {{ workspace }}:/root -v /mnt:/mnt \
         {{ docker_image }} bash && \
       docker exec {{ container }} bash -c \

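The `else ''` additions are the "inline if-expression syntax" fix: in Jinja2, a conditional expression without an `else` branch evaluates to undefined when the condition is false, which renders as nothing under the default `Undefined` but errors under strict templating such as Ansible's default behavior. A minimal sketch:

    from jinja2 import Environment, StrictUndefined

    env = Environment(undefined=StrictUndefined)
    fixed = "{{ '--gpus=all' if gpu_vendor == 'nvidia' else '' }}"
    print(env.from_string(fixed).render(gpu_vendor='amd'))  # renders empty string

    broken = "{{ '--gpus=all' if gpu_vendor == 'nvidia' }}"
    # env.from_string(broken).render(gpu_vendor='amd')
    # -> UndefinedError: the inline if-expression evaluated to false
    #    and no else section was defined
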
@@ -54,6 +54,8 @@ class SuperBenchRunner():
             InvalidConfigError: If input config is invalid.
         """
         # TODO: add validation and defaulting
+        if not self._sb_config.superbench.env:
+            self._sb_config.superbench.env = {}
         for name in self._sb_benchmarks:
             if not self._sb_benchmarks[name].modes:
                 self._sb_benchmarks[name].modes = []

@@ -141,7 +143,13 @@ class SuperBenchRunner():
         logger.info('Checking SuperBench environment.')
         OmegaConf.save(config=self._sb_config, f=str(Path(self._output_dir) / 'sb.config.yaml'))
         self._ansible_client.run(
-            self._ansible_client.get_playbook_config('check_env.yaml', extravars={'output_dir': self._output_dir})
+            self._ansible_client.get_playbook_config(
+                'check_env.yaml',
+                extravars={
+                    'output_dir': self._output_dir,
+                    'env': '\n'.join(f'{k}={v}' for k, v in self._sb_config.superbench.env.items()),
+                }
+            )
         )
 
     def _run_proc(self, benchmark_name, mode, vars):

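This pairs with the "superbench.env from list to dict" bullet: the runner flattens the dict into `KEY=value` lines and hands it to the playbook as the `env` extravar, which `sb_env` appends via `{{ env | default('') }}`. A standalone sketch; the sample values are hypothetical:

    # hypothetical contents of superbench.env from a user's config
    env = {'NCCL_IB_DISABLE': '1', 'NCCL_DEBUG': 'INFO'}
    env_text = '\n'.join(f'{k}={v}' for k, v in env.items())
    print(env_text)
    # NCCL_IB_DISABLE=1
    # NCCL_DEBUG=INFO
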
@@ -161,7 +169,7 @@ class SuperBenchRunner():
             self._ansible_client.get_shell_config(
                 (
                     'docker exec sb-workspace bash -c '
-                    '"set -o allexport && source sb.env && set +o allexport && {command}"'
+                    "'set -o allexport && source sb.env && set +o allexport && {command}'"
                 ).format(command=self.__get_mode_command(benchmark_name, mode), )
             ),
             sudo=True

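The "quote escape issue" bullet refers to this line: the full string is executed by a shell on the host, so wrapping the inner command in double quotes would let the host shell expand any `$VARIABLES` before `bash -c` ever runs inside the container; single quotes keep the command literal until the container's shell interprets it, after `sb.env` has been sourced. A small sketch of the string being built; the `{command}` content is hypothetical:

    command = 'echo $NODE_RANK'  # hypothetical mode command using an sb.env variable
    line = (
        'docker exec sb-workspace bash -c '
        "'set -o allexport && source sb.env && set +o allexport && {command}'"
    ).format(command=command)
    # the single quotes survive into the final command line, so $NODE_RANK is
    # expanded by the container shell, not by the host:
    print(line)
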
@@ -53,7 +53,11 @@ class SuperBenchCLIScenarioTest(ScenarioTest):
 
     def test_sb_deploy(self):
         """Test sb deploy."""
-        self.cmd('sb deploy --host-list localhost', expect_failure=True)
+        self.cmd('sb deploy --host-list localhost', checks=[NoneCheck()])
+
+    def test_sb_deploy_no_host(self):
+        """Test sb deploy, no host_file or host_list provided, should fail."""
+        self.cmd('sb deploy', expect_failure=True)
 
     def test_sb_exec(self):
         """Test sb exec."""

@@ -10,6 +10,7 @@ import tempfile
 from pathlib import Path
 from unittest import mock
 
+import yaml
 from omegaconf import OmegaConf
 
 from superbench.executor import SuperBenchExecutor

@@ -24,7 +25,8 @@ class ExecutorTestCase(unittest.TestCase):
     def setUp(self):
         """Hook method for setting up the test fixture before exercising it."""
         default_config_file = Path(__file__).parent / '../../superbench/config/default.yaml'
-        self.default_config = OmegaConf.load(str(default_config_file))
+        with default_config_file.open() as fp:
+            self.default_config = OmegaConf.create(yaml.load(fp, Loader=yaml.SafeLoader))
         self.output_dir = tempfile.mkdtemp()
 
         self.executor = SuperBenchExecutor(self.default_config, self.output_dir)

@@ -61,19 +63,31 @@ class ExecutorTestCase(unittest.TestCase):
 
     def test_get_arguments(self):
         """Test benchmarks arguments."""
-        expected_matmul_args = ''
-        self.assertEqual(
-            self.executor._SuperBenchExecutor__get_arguments(
-                self.default_config.superbench.benchmarks.matmul.parameters
-            ), expected_matmul_args
-        )
-        expected_bert_models_args = \
-            '--duration 0 --num_warmup 16 --num_steps 128 --batch_size 16 ' \
-            '--precision float32 float16 --model_action train inference'
-        self.assertEqual(
-            self.executor._SuperBenchExecutor__get_arguments(
-                self.default_config.superbench.benchmarks.bert_models.parameters
-            ), expected_bert_models_args
-        )
+        test_cases = [
+            {
+                'parameters': None,
+                'expected_args': '',
+            },
+            {
+                'parameters': {
+                    'duration': 0,
+                    'num_warmup': 16,
+                    'num_steps': 128,
+                    'batch_size': 16,
+                    'precision': ['float32', 'float16'],
+                    'model_action': ['train', 'inference'],
+                },
+                'expected_args': (
+                    '--duration 0 --num_warmup 16 --num_steps 128 --batch_size 16 '
+                    '--precision float32 float16 --model_action train inference'
+                ),
+            },
+        ]
+        for test_case in test_cases:
+            with self.subTest(msg='Testing with case', test_case=test_case):
+                self.assertEqual(
+                    self.executor._SuperBenchExecutor__get_arguments(test_case['parameters']),
+                    test_case['expected_args']
+                )
 
     def test_create_benchmark_dir(self):

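A side note on `_SuperBenchExecutor__get_arguments`: the test reaches a double-underscore ("private") method through its name-mangled form, which is how Python exposes such attributes outside the defining class. A minimal illustration; the class below is a stand-in, not SuperBench code:

    class Executor:
        def __get_arguments(self, parameters):
            # stored on the class as _Executor__get_arguments
            return '' if not parameters else str(parameters)

    assert Executor()._Executor__get_arguments(None) == ''
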
@@ -9,6 +9,7 @@ import tempfile
 from pathlib import Path
 from unittest import mock
 
+import yaml
 from omegaconf import OmegaConf
 
 from superbench.runner import SuperBenchRunner

@@ -19,7 +20,8 @@ class RunnerTestCase(unittest.TestCase):
     def setUp(self):
         """Hook method for setting up the test fixture before exercising it."""
         default_config_file = Path(__file__).parent / '../../superbench/config/default.yaml'
-        self.default_config = OmegaConf.load(str(default_config_file))
+        with default_config_file.open() as fp:
+            self.default_config = OmegaConf.create(yaml.load(fp, Loader=yaml.SafeLoader))
         self.output_dir = tempfile.mkdtemp()
 
         self.runner = SuperBenchRunner(self.default_config, None, None, self.output_dir)