From c0c43b8f812d6a100d3b494f5d31cebcdf043fad Mon Sep 17 00:00:00 2001
From: Yifan Xiong
Date: Wed, 23 Jun 2021 18:16:43 +0800
Subject: [PATCH] Bug bash - Fix bugs in multi GPU benchmarks (#98)

* Add `sb deploy` command content.
* Fix inline if-expression syntax in playbook.
* Fix quote escape issue in bash command.
* Add custom env in config.
* Update default config for multi GPU benchmarks.
* Update MANIFEST.in to include jinja2 template.
* Require jinja2 minimum version.
* Fix occasional duplicate output in Ansible runner.
* Fix mixed color from Ansible and Python colorlog.
* Update according to comments.
* Change superbench.env from list to dict in config file.
---
 MANIFEST.in                                |   6 +-
 setup.py                                   |   4 +-
 superbench/cli/_handler.py                 |   4 +-
 superbench/common/utils/file_handler.py    |   4 +-
 superbench/common/utils/logging.py         |   1 +
 superbench/config/default.yaml             | 128 +++++++++------------
 superbench/runner/playbooks/check_env.yaml |   4 +
 superbench/runner/playbooks/deploy.yaml    |   4 +-
 superbench/runner/runner.py                |  12 +-
 tests/cli/test_sb.py                       |   6 +-
 tests/executor/test_executor.py            |  44 ++++---
 tests/runner/test_runner.py                |   4 +-
 12 files changed, 118 insertions(+), 103 deletions(-)

diff --git a/MANIFEST.in b/MANIFEST.in
index 5d18fd6e..34d9d027 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,5 +1,3 @@
 include LICENSE README.md
-recursive-include superbench *.py
-recursive-include superbench *.yaml
-global-exclude *.pyc
-global-exclude __pycache__
+recursive-include superbench *.py *.j2 *.yaml
+global-exclude *.py[cod] __pycache__
diff --git a/setup.py b/setup.py
index 35501065..6788df70 100644
--- a/setup.py
+++ b/setup.py
@@ -134,11 +134,13 @@ setup(
     python_requires='>=3.6, <4',
     install_requires=[
         'ansible_base>=2.10.9;os_name=="posix"',
-        'ansible_runner>=1.4.7',
+        'ansible_runner>=2.0.0rc1',
         'colorlog>=4.7.2',
+        'jinja2>=2.10.1',
         'joblib>=1.0.1',
         'knack>=0.7.2',
         'omegaconf==2.0.6',
+        'pyyaml>=5.3',
     ],
     extras_require={
         'dev': ['pre-commit>=2.10.0'],
diff --git a/superbench/cli/_handler.py b/superbench/cli/_handler.py
index 3fe7962c..7d3ff4c1 100644
--- a/superbench/cli/_handler.py
+++ b/superbench/cli/_handler.py
@@ -227,8 +227,8 @@ def deploy_command_handler(
         private_key=private_key,
     )

-    SuperBenchRunner(sb_config, docker_config, ansible_config, output_dir)
-    raise NotImplementedError
+    runner = SuperBenchRunner(sb_config, docker_config, ansible_config, output_dir)
+    runner.deploy()


 def run_command_handler(
diff --git a/superbench/common/utils/file_handler.py b/superbench/common/utils/file_handler.py
index f74a3893..6eacefbb 100644
--- a/superbench/common/utils/file_handler.py
+++ b/superbench/common/utils/file_handler.py
@@ -6,6 +6,7 @@
 from pathlib import Path
 from datetime import datetime

+import yaml
 from omegaconf import OmegaConf


@@ -38,4 +39,5 @@ def get_sb_config(config_file):
     p = Path(config_file) if config_file else default_config_file
     if not p.is_file():
         return None
-    return OmegaConf.load(str(p))
+    with p.open() as fp:
+        return OmegaConf.create(yaml.load(fp, Loader=yaml.SafeLoader))
diff --git a/superbench/common/utils/logging.py b/superbench/common/utils/logging.py
index f38069be..b9e8aed9 100644
--- a/superbench/common/utils/logging.py
+++ b/superbench/common/utils/logging.py
@@ -41,6 +41,7 @@ class SuperBenchLogger:
         )
         if color:
             formatter = colorlog.ColoredFormatter(
+                '%(reset)s'
                 '[%(cyan)s%(asctime)s %(hostname)s:%(process)d%(reset)s]'
                 '[%(blue)s%(filename)s:%(lineno)s%(reset)s]'
                 '[%(log_color)s%(levelname)s%(reset)s] %(message)s'
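The default.yaml rewrite below deduplicates the per-benchmark settings with YAML anchors (&name) and merge keys (<<: *name), which is presumably also why this patch swaps OmegaConf.load for PyYAML loading in file_handler.py and in the tests: PyYAML's SafeLoader expands anchors and merge keys before OmegaConf ever sees the data. A minimal sketch of the new loading path; the config path and the printed key are illustrative, not taken from the patch:

    import yaml
    from omegaconf import OmegaConf

    # SafeLoader resolves YAML anchors/aliases and `<<:` merge keys, so every
    # `<<: *default_local_mode` below is already flattened into a plain mapping
    # by the time OmegaConf.create receives it.
    with open('superbench/config/default.yaml') as fp:  # illustrative path
        config = OmegaConf.create(yaml.load(fp, Loader=yaml.SafeLoader))
    print(config.superbench.benchmarks['kernel-launch'].enable)  # True
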
diff --git a/superbench/config/default.yaml b/superbench/config/default.yaml
index 9fd3e6e0..491cad2f 100644
--- a/superbench/config/default.yaml
+++ b/superbench/config/default.yaml
@@ -1,115 +1,95 @@
 # SuperBench Config
 superbench:
   enable: null
-  benchmarks:
-    kernel-launch:
-      enable: true
-    gemm-flops:
-      enable: true
-    cudnn-function:
-      enable: true
-    cublas-function:
-      enable: true
-    matmul:
+  var:
+    default_local_mode: &default_local_mode
       enable: true
       modes:
         - name: local
           proc_num: 8
           prefix: CUDA_VISIBLE_DEVICES={proc_rank}
-          parallel: no
-      frameworks:
-        - pytorch
-    gpt_models:
+          parallel: yes
+    default_pytorch_mode: &default_pytorch_mode
       enable: true
       modes:
         - name: torch.distributed
           proc_num: 8
-          node_num: all
+          node_num: 1
       frameworks:
         - pytorch
+    common_model_config: &common_model_config
+      duration: 0
+      num_warmup: 16
+      num_steps: 128
+      precision:
+        - float32
+        - float16
+      model_action:
+        - train
+  benchmarks:
+    kernel-launch:
+      <<: *default_local_mode
+    gemm-flops:
+      <<: *default_local_mode
+    cudnn-function:
+      <<: *default_local_mode
+    cublas-function:
+      <<: *default_local_mode
+    matmul:
+      <<: *default_local_mode
+      frameworks:
+        - pytorch
+    sharding-matmul:
+      <<: *default_pytorch_mode
+    computation-communication-overlap:
+      <<: *default_pytorch_mode
+    gpt_models:
+      <<: *default_pytorch_mode
       models:
         - gpt2-small
         - gpt2-large
       parameters:
-        duration: 0
-        num_warmup: 16
-        num_steps: 128
+        <<: *common_model_config
         batch_size: 4
-        precision:
-          - float32
-          - float16
-        model_action:
-          - train
-          - inference
     bert_models:
-      enable: true
-      modes:
-        - name: torch.distributed
-          proc_num: 8
-          node_num: all
-      frameworks:
-        - pytorch
+      <<: *default_pytorch_mode
       models:
         - bert-base
         - bert-large
       parameters:
-        duration: 0
-        num_warmup: 16
-        num_steps: 128
-        batch_size: 16
-        precision:
-          - float32
-          - float16
-        model_action:
-          - train
-          - inference
+        <<: *common_model_config
+        batch_size: 8
     lstm_models:
-      enable: true
-      modes:
-        - name: torch.distributed
-          proc_num: 8
-          node_num: all
-      frameworks:
-        - pytorch
+      <<: *default_pytorch_mode
       models:
         - lstm
       parameters:
-        duration: 0
-        num_warmup: 16
-        num_steps: 128
+        <<: *common_model_config
         batch_size: 128
-        precision:
-          - float32
-          - float16
-        model_action:
-          - train
-          - inference
-    cnn_models:
-      enable: true
-      modes:
-        - name: torch.distributed
-          proc_num: 8
-          node_num: all
-      frameworks:
-        - pytorch
+    resnet_models:
+      <<: *default_pytorch_mode
       models:
         - resnet50
         - resnet101
         - resnet152
+      parameters:
+        <<: *common_model_config
+        batch_size: 128
+    densenet_models:
+      <<: *default_pytorch_mode
+      models:
         - densenet169
         - densenet201
+      parameters:
+        <<: *common_model_config
+        batch_size: 128
+    vgg_models:
+      <<: *default_pytorch_mode
+      models:
         - vgg11
         - vgg13
         - vgg16
         - vgg19
       parameters:
-        duration: 0
-        num_warmup: 16
-        num_steps: 128
+        <<: *common_model_config
         batch_size: 128
-        precision:
-          - float32
-          - float16
-        model_action:
-          - train
-          - inference
diff --git a/superbench/runner/playbooks/check_env.yaml b/superbench/runner/playbooks/check_env.yaml
index cbe0a7d0..c11e7710 100644
--- a/superbench/runner/playbooks/check_env.yaml
+++ b/superbench/runner/playbooks/check_env.yaml
@@ -22,10 +22,14 @@
     container: sb-workspace
     sb_nodes: '{{ hostvars.values() | map(attribute="ansible_hostname") | sort }}'
     sb_env: |
+      # pytorch env
      NNODES={{ sb_nodes | length }}
       NODE_RANK={{ lookup('ansible.utils.index_of', sb_nodes, 'eq', ansible_hostname) }}
       MASTER_ADDR={{ sb_nodes | first }}
       MASTER_PORT=29500
+      OMP_NUM_THREADS=1
+      # config env
+      {{ env | default('') }}
   tasks:
     - name: Updating Config
       copy:
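The {{ env | default('') }} placeholder added to sb_env above is filled in by the runner (see the runner.py hunk below), which flattens the new dict-style superbench.env into KEY=value lines appended to sb.env. A sketch of that flattening, with hypothetical env values that are not part of this patch:

    # hypothetical user-provided superbench.env entries, for illustration only
    env = {'NCCL_DEBUG': 'INFO', 'NCCL_IB_DISABLE': '0'}
    # mirrors the runner.py change below: dict -> newline-joined KEY=value lines
    extravars = {'env': '\n'.join(f'{k}={v}' for k, v in env.items())}
    assert extravars['env'] == 'NCCL_DEBUG=INFO\nNCCL_IB_DISABLE=0'
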
diff --git a/superbench/runner/playbooks/deploy.yaml b/superbench/runner/playbooks/deploy.yaml
index ab17bb2b..bce1fe52 100644
--- a/superbench/runner/playbooks/deploy.yaml
+++ b/superbench/runner/playbooks/deploy.yaml
@@ -65,8 +65,8 @@
         docker rm --force {{ container }} ||: && \
         docker run -itd --name={{ container }} \
           --privileged --net=host --ipc=host \
-          {{ '--gpus=all' if gpu_vendor == 'nvidia' }} \
-          {{ '--security-opt seccomp=unconfined --group-add video' if gpu_vendor == 'amd' }} \
+          {{ '--gpus=all' if gpu_vendor == 'nvidia' else '' }} \
+          {{ '--security-opt seccomp=unconfined --group-add video' if gpu_vendor == 'amd' else '' }} \
           -w /root -v {{ workspace }}:/root -v /mnt:/mnt \
           {{ docker_image }} bash && \
         docker exec {{ container }} bash -c \
diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py
index 7c886d1d..a9e288d2 100644
--- a/superbench/runner/runner.py
+++ b/superbench/runner/runner.py
@@ -54,6 +54,8 @@
             InvalidConfigError: If input config is invalid.
         """
         # TODO: add validation and defaulting
+        if not self._sb_config.superbench.env:
+            self._sb_config.superbench.env = {}
         for name in self._sb_benchmarks:
             if not self._sb_benchmarks[name].modes:
                 self._sb_benchmarks[name].modes = []
@@ -141,7 +143,13 @@
         logger.info('Checking SuperBench environment.')
         OmegaConf.save(config=self._sb_config, f=str(Path(self._output_dir) / 'sb.config.yaml'))
         self._ansible_client.run(
-            self._ansible_client.get_playbook_config('check_env.yaml', extravars={'output_dir': self._output_dir})
+            self._ansible_client.get_playbook_config(
+                'check_env.yaml',
+                extravars={
+                    'output_dir': self._output_dir,
+                    'env': '\n'.join(f'{k}={v}' for k, v in self._sb_config.superbench.env.items()),
+                }
+            )
         )

     def _run_proc(self, benchmark_name, mode, vars):
@@ -161,7 +169,7 @@
                 self._ansible_client.get_shell_config(
                     (
                         'docker exec sb-workspace bash -c '
-                        '"set -o allexport && source sb.env && set +o allexport && {command}"'
+                        "'set -o allexport && source sb.env && set +o allexport && {command}'"
                     ).format(command=self.__get_mode_command(benchmark_name, mode), )
                 ),
                 sudo=True
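The quote swap in _run_proc above appears to be the "quote escape issue" named in the changelog: with double quotes, the shell on the host expands any $VARIABLE in the benchmark command before docker exec runs, i.e. before sb.env has been sourced inside the container; single quotes pass the command through literally. A sketch of the two wrappers; the echo command is a hypothetical stand-in for a real benchmark command:

    command = 'echo $MASTER_ADDR'  # hypothetical benchmark command
    # old: the host shell expands $MASTER_ADDR (unset on the host) too early
    old = 'docker exec sb-workspace bash -c "set -o allexport && source sb.env && set +o allexport && {}"'.format(command)
    # new: single quotes defer expansion to the bash inside the container,
    # which runs after sb.env has been sourced
    new = "docker exec sb-workspace bash -c 'set -o allexport && source sb.env && set +o allexport && {}'".format(command)
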
diff --git a/tests/cli/test_sb.py b/tests/cli/test_sb.py
index 275ede84..327f0c3d 100644
--- a/tests/cli/test_sb.py
+++ b/tests/cli/test_sb.py
@@ -53,7 +53,11 @@

     def test_sb_deploy(self):
         """Test sb deploy."""
-        self.cmd('sb deploy --host-list localhost', expect_failure=True)
+        self.cmd('sb deploy --host-list localhost', checks=[NoneCheck()])
+
+    def test_sb_deploy_no_host(self):
+        """Test sb deploy, no host_file or host_list provided, should fail."""
+        self.cmd('sb deploy', expect_failure=True)

     def test_sb_exec(self):
         """Test sb exec."""
diff --git a/tests/executor/test_executor.py b/tests/executor/test_executor.py
index 1b53d378..2124150f 100644
--- a/tests/executor/test_executor.py
+++ b/tests/executor/test_executor.py
@@ -10,6 +10,7 @@ import tempfile
 from pathlib import Path
 from unittest import mock

+import yaml
 from omegaconf import OmegaConf

 from superbench.executor import SuperBenchExecutor
@@ -24,7 +25,8 @@ class ExecutorTestCase(unittest.TestCase):
     def setUp(self):
         """Hook method for setting up the test fixture before exercising it."""
         default_config_file = Path(__file__).parent / '../../superbench/config/default.yaml'
-        self.default_config = OmegaConf.load(str(default_config_file))
+        with default_config_file.open() as fp:
+            self.default_config = OmegaConf.create(yaml.load(fp, Loader=yaml.SafeLoader))
         self.output_dir = tempfile.mkdtemp()

         self.executor = SuperBenchExecutor(self.default_config, self.output_dir)
@@ -61,20 +63,32 @@

     def test_get_arguments(self):
         """Test benchmarks arguments."""
-        expected_matmul_args = ''
-        self.assertEqual(
-            self.executor._SuperBenchExecutor__get_arguments(
-                self.default_config.superbench.benchmarks.matmul.parameters
-            ), expected_matmul_args
-        )
-        expected_bert_models_args = \
-            '--duration 0 --num_warmup 16 --num_steps 128 --batch_size 16 ' \
-            '--precision float32 float16 --model_action train inference'
-        self.assertEqual(
-            self.executor._SuperBenchExecutor__get_arguments(
-                self.default_config.superbench.benchmarks.bert_models.parameters
-            ), expected_bert_models_args
-        )
+        test_cases = [
+            {
+                'parameters': None,
+                'expected_args': '',
+            },
+            {
+                'parameters': {
+                    'duration': 0,
+                    'num_warmup': 16,
+                    'num_steps': 128,
+                    'batch_size': 16,
+                    'precision': ['float32', 'float16'],
+                    'model_action': ['train', 'inference'],
+                },
+                'expected_args': (
+                    '--duration 0 --num_warmup 16 --num_steps 128 --batch_size 16 '
+                    '--precision float32 float16 --model_action train inference'
+                ),
+            },
+        ]
+        for test_case in test_cases:
+            with self.subTest(msg='Testing with case', test_case=test_case):
+                self.assertEqual(
+                    self.executor._SuperBenchExecutor__get_arguments(test_case['parameters']),
+                    test_case['expected_args']
+                )

     def test_create_benchmark_dir(self):
         """Test __create_benchmark_dir."""
diff --git a/tests/runner/test_runner.py b/tests/runner/test_runner.py
index 2cb26f84..6fa9a63f 100644
--- a/tests/runner/test_runner.py
+++ b/tests/runner/test_runner.py
@@ -9,6 +9,7 @@ import tempfile
 from pathlib import Path
 from unittest import mock

+import yaml
 from omegaconf import OmegaConf

 from superbench.runner import SuperBenchRunner
@@ -19,7 +20,8 @@ class RunnerTestCase(unittest.TestCase):
     def setUp(self):
         """Hook method for setting up the test fixture before exercising it."""
         default_config_file = Path(__file__).parent / '../../superbench/config/default.yaml'
-        self.default_config = OmegaConf.load(str(default_config_file))
+        with default_config_file.open() as fp:
+            self.default_config = OmegaConf.create(yaml.load(fp, Loader=yaml.SafeLoader))
         self.output_dir = tempfile.mkdtemp()

         self.runner = SuperBenchRunner(self.default_config, None, None, self.output_dir)
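The rewritten test_get_arguments above pins the contract of __get_arguments inline instead of reading parameters from default.yaml, which now hides most values behind merge keys: an empty or missing parameters mapping yields an empty string, and list values are joined by spaces after their flag. A minimal sketch of equivalent logic, not the executor's actual implementation:

    # Sketch only: flatten a parameters mapping into a CLI argument string,
    # matching the behavior the tests above assert.
    def get_arguments(parameters):
        if not parameters:
            return ''
        args = []
        for name, value in parameters.items():
            if isinstance(value, (list, tuple)):
                value = ' '.join(str(v) for v in value)
            args.append('--{} {}'.format(name, value))
        return ' '.join(args)

    assert get_arguments(None) == ''
    assert get_arguments({'precision': ['float32', 'float16']}) == '--precision float32 float16'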